Java — problem with incremental updates in Lucene
I am creating a program that can index many text files in different folders; that is, every folder containing text files gets indexed, and the index is stored in a single folder. That folder acts as a universal index of the files on the computer. I am using Lucene to achieve this because Lucene supports incremental updates. Here is the source code I use for indexing:
public class simplefileindexer { public static void main(string[] args) throws exception { int i=0; while(i<2) { file indexdir = new file("c:/users/raden/documents/myindex"); file datadir = new file("c:/users/raden/documents/indexthis"); string suffix = "txt"; simplefileindexer indexer = new simplefileindexer(); int numindex = indexer.index(indexdir, datadir, suffix); system.out.println("total files indexed " + numindex); i++; thread.sleep(1000); } } private int index(file indexdir, file datadir, string suffix) throws exception { ramdirectory ramdir = new ramdirectory(); // 1 @suppresswarnings("deprecation") indexwriter indexwriter = new indexwriter( ramdir, // 2 new standardanalyzer(version.lucene_current), true, indexwriter.maxfieldlength.unlimited); indexwriter.setusecompoundfile(false); indexdirectory(indexwriter, datadir, suffix); int numindexed = indexwriter.maxdoc(); indexwriter.optimize(); indexwriter.close(); directory.copy(ramdir, fsdirectory.open(indexdir), false); // 3 return numindexed; } private void indexdirectory(indexwriter indexwriter, file datadir, string suffix) throws ioexception { file[] files = datadir.listfiles(); (int = 0; < files.length; i++) { file f = files[i]; if (f.isdirectory()) { indexdirectory(indexwriter, f, suffix); } else { indexfilewithindexwriter(indexwriter, f, suffix); } } } private void indexfilewithindexwriter(indexwriter indexwriter, file f, string suffix) throws ioexception { if (f.ishidden() || f.isdirectory() || !f.canread() || !f.exists()) { return; } if (suffix!=null && !f.getname().endswith(suffix)) { return; } system.out.println("indexing file " + f.getcanonicalpath()); document doc = new document(); doc.add(new field("contents", new filereader(f))); doc.add(new field("filename", f.getcanonicalpath(), field.store.yes, field.index.analyzed)); indexwriter.adddocument(doc); } }
And here is the source code I use for searching the Lucene-created index:
public class simplesearcher { public static void main(string[] args) throws exception { file indexdir = new file("c:/users/raden/documents/myindex"); string query = "revolution"; int hits = 100; simplesearcher searcher = new simplesearcher(); searcher.searchindex(indexdir, query, hits); } private void searchindex(file indexdir, string querystr, int maxhits) throws exception { directory directory = fsdirectory.open(indexdir); indexsearcher searcher = new indexsearcher(directory); @suppresswarnings("deprecation") queryparser parser = new queryparser(version.lucene_30, "contents", new standardanalyzer(version.lucene_current)); query query = parser.parse(querystr); topdocs topdocs = searcher.search(query, maxhits); scoredoc[] hits = topdocs.scoredocs; (int = 0; < hits.length; i++) { int docid = hits[i].doc; document d = searcher.doc(docid); system.out.println(d.get("filename")); } system.out.println("found " + hits.length); } }
The problem I am having is that the indexing program above does not seem to do an incremental update. I mean, I can search for text files that exist in the last folder indexed, but files from previously indexed folders seem to be missing from the search results and are not displayed. Can you tell me what went wrong in the code? I want the program to have an incremental-update feature. In essence, the program seems to be overwriting the existing index with the new one instead of merging them.
Thanks.
Directory.copy() overwrites the destination directory; you need to use IndexWriter.addIndexes() to merge the new directory's index into the main one.
Alternatively, you can re-open the main index and add the documents directly. A RAMDirectory isn't necessarily more efficient than well-tuned RAM-buffer and merge-factor settings (see the IndexWriter docs).
Update: instead of Directory.copy(), you need to open ramDir for reading and indexDir for writing, then call .addIndexes on the indexDir writer and pass it the ramDir reader. Alternatively, you can use .addIndexesNoOptimize and pass ramDir directly (without opening a reader), then optimize the index before closing.
But really, it's easier to skip the RAMDirectory and open the writer on indexDir in the first place. That will also make it easier to update changed files later.
Example:
/**
 * Indexes dataDir into a temporary in-memory index, then merges that batch
 * into the main on-disk index at indexDir via addIndexesNoOptimize.
 *
 * FIX: the main-index writer must NOT be opened with create=true — that
 * wipes the existing index before the merge, which is exactly the overwrite
 * bug being fixed. Create only when no index exists yet.
 *
 * @param indexDir main on-disk index directory
 * @param dataDir  root folder whose files are indexed
 * @param suffix   required file-name suffix, or null for all files
 * @return number of documents indexed in this batch
 * @throws Exception on indexing/IO failure
 */
private int index(File indexDir, File dataDir, String suffix) throws Exception {
    // 1) Build this run's batch in memory.
    RAMDirectory ramDir = new RAMDirectory();
    @SuppressWarnings("deprecation")
    IndexWriter ramWriter = new IndexWriter(
            ramDir,
            new StandardAnalyzer(Version.LUCENE_CURRENT),
            true,  // fresh in-memory index each run
            IndexWriter.MaxFieldLength.UNLIMITED);
    ramWriter.setUseCompoundFile(false);
    indexDirectory(ramWriter, dataDir, suffix);
    int numIndexed = ramWriter.maxDoc();
    ramWriter.optimize();
    ramWriter.close();

    // 2) Merge the batch into the main index, appending when it exists.
    Directory fsDir = FSDirectory.open(indexDir);
    @SuppressWarnings("deprecation")
    IndexWriter mainWriter = new IndexWriter(
            fsDir,
            new StandardAnalyzer(Version.LUCENE_CURRENT),
            !IndexReader.indexExists(fsDir),  // append if an index is present
            IndexWriter.MaxFieldLength.UNLIMITED);
    mainWriter.addIndexesNoOptimize(ramDir);
    mainWriter.optimize();
    mainWriter.close();
    return numIndexed;
}
But this would be fine too:
private int index(file indexdir, file datadir, string suffix) throws exception { indexwriter index = new indexwriter(fsdirectory.open(indexdir), new standardanalyzer(version.lucene_current), true, indexwriter.maxfieldlength.unlimited); // tweak settings hardware index.setusecompoundfile(false); index.setrambuffersizemb(256); index.setmergefactor(30); indexdirectory(index, datadir, suffix); index.optimize(); int numindexed = index.maxdoc(); index.close(); // you'll need update indexdirectory() keep track of indexed files return numindexed; }
Comments
Post a Comment