Lucene library를 이용한 간단한 색인/검색(루씬 라이브러리이용)

2021. 4. 18. 23:59 Big Data/Lucene
package com.lucene.study;
 
import java.io.File;
import java.io.FileFilter;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
 
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
 
//yeoseong_yoon,2019/1/2->txt파일을 색인하는 유틸클래스
public class Indexer {
    
    private IndexWriter writer;
    
    //루씬의 IndexWriter 생성
    public Indexer(String indexDir) throws IOException {
        Directory dir = FSDirectory.open(new File(indexDir).toPath());
        
        writer = new IndexWriter(dir,new IndexWriterConfig(new StandardAnalyzer()));
    }
    
    public void close() throws IOException {
        writer.close();
    }
    
    public int index(String dataDir,FileFilter filter) throws IOException {
        
        File[] files = new File(dataDir).listFiles();
        
        for(File f : files) {
            if(!f.isDirectory() &&
                    !f.isHidden() &&
                    f.exists() &&
                    f.canRead() &&
                    (filter == null || filter.accept(f))) {
                
                indexFile(f);
            }
        }
        
        //색인된 문서건수 리턴
        return writer.numDocs();
    }
    
    private void indexFile(File file) throws IOException {
        System.out.println("Indexing "+file.getCanonicalPath());
        Document doc = getDocument(file);
        writer.addDocument(doc);
    }
    
    @SuppressWarnings("deprecation")
    protected Document getDocument(File file) throws IOException {
        Document doc = new Document();
        //파일의 내용추가
        doc.add(new Field("content", new FileReader(file)));
        //파일 이름추가
        doc.add(new Field("filename",file.getName(),Field.Store.YES,Field.Index.NOT_ANALYZED));
        //파일 전체경로 추가
        doc.add(new Field("fullpath",file.getCanonicalPath(),Field.Store.YES,Field.Index.NOT_ANALYZED));
        
        return doc;
    }
    
    //FileFilter를 이용하여 해당 확장자만 걸러낸다.
    private static class TextFilesFilter implements FileFilter{
        public boolean accept(File path) {
            return path.getName().toLowerCase().endsWith(".txt");
        }
    }
    public static void main(String[] args) throws IOException {
        // TODO Auto-generated method stub
        if(args.length!=2) {
            throw new IllegalArgumentException("please input index dir & data dir");
        }
        
        String indexDir = args[0];
        String dataDir = args[1];
        
        long start = System.currentTimeMillis();
        Indexer indexer = new Indexer(indexDir);
        int numIndexed = 0;
        
        try {
            numIndexed = indexer.index(dataDir, new TextFilesFilter());
        }catch (Exception e) {
            // TODO: handle exception
            System.out.println(e.getMessage());
        }finally {
            indexer.close();
        }
        
        long end = System.currentTimeMillis();
        
        System.out.println("Indexing " + numIndexed + " files took " + (end-start) + " milliseconds");
    }
 
}

 

=> 데이터 디렉토리에 들어 있는 .txt 파일을 읽어, 지정한 색인 디렉토리에 색인하는 코드이다. (실행 시 인자 2개를 순서대로 전달해야 한다: 첫 번째는 색인 디렉토리, 두 번째는 데이터 디렉토리.)

package com.lucene.study;
 
import java.io.File;
import java.io.IOException;
 
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
 
public class Searcher {
    
    public static void search(String indexDir, String query) throws IOException, ParseException {
        
        Directory dir = FSDirectory.open(new File(indexDir).toPath());
        
        //색인을 연다.
        IndexSearcher indexSearcher = new IndexSearcher(DirectoryReader.open(dir));
        
        //질의를 분석한다.
        QueryParser parser = new QueryParser("content",
                                            new StandardAnalyzer());
        
        Query q = parser.parse(query);
        
        long start = System.currentTimeMillis();
        
        TopDocs hits = indexSearcher.search(q, 10);
        
        long end = System.currentTimeMillis();
        
        System.out.println("Found "+ hits.totalHits +" document(s) (in "+(end-start) 
                +" millisecond) that matched query '" + q + "' :");
        
        for(ScoreDoc scoreDoc : hits.scoreDocs) {
            Document doc = indexSearcher.doc(scoreDoc.doc);
            System.out.println(String.format("content - %s\nfullpath = %s"
                                            ,doc.get("content") 
                                            ,doc.get("fullpath")));
        }
        
    }
 
    public static void main(String[] args) throws IOException, ParseException {
        // TODO Auto-generated method stub
        if(args.length !=2) {
            throw new IllegalArgumentException("please input index dir & search query String");
        }
        
        String indexDir = args[0];
        String query = args[1];
        search(indexDir, query);
    }
 
}

 

=> 위에서 만든 색인에서 검색어에 해당하는 문서를 찾는 코드이다. 이 프로그램도 마찬가지로 인자 2개를 전달해야 한다: 첫 번째는 색인 디렉토리, 두 번째는 검색 질의 문자열이다.

 

간단한 색인/검색 예제 두 개를 작성해 보았다. 이 코드를 작성하기 전에, 먼저 아래와 같이 루씬 라이브러리를 Maven 의존성(dependency)으로 추가해야 한다.

<!-- START - Lucene Dependencies -->
<dependency>
      <groupId>org.apache.lucene</groupId>
      <artifactId>lucene-core</artifactId>
      <version>5.3.0</version>
</dependency>
<dependency>
      <groupId>org.apache.lucene</groupId>
      <artifactId>lucene-analyzers-common</artifactId>
      <version>5.3.0</version>
</dependency>
<dependency>
      <groupId>org.apache.lucene</groupId>
      <artifactId>lucene-queryparser</artifactId>
      <version>5.3.0</version>
</dependency>
<!-- END - Lucene Dependencies -->


출처: https://coding-start.tistory.com/67?category=784008 [코딩스타트]