Updating Lucene test with Faceted search.

Knut Forkalsrud 2013-11-07 21:11:55 -08:00
parent 193c51900f
commit 0ea01af309
2 changed files with 114 additions and 140 deletions

pom.xml

@@ -172,8 +172,25 @@
     <dependency>
       <groupId>org.apache.lucene</groupId>
       <artifactId>lucene-core</artifactId>
-      <version>3.0.0</version>
+      <version>4.4.0</version>
     </dependency>
+    <dependency>
+      <groupId>org.apache.lucene</groupId>
+      <artifactId>lucene-queryparser</artifactId>
+      <version>4.4.0</version>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.lucene</groupId>
+      <artifactId>lucene-facet</artifactId>
+      <version>4.4.0</version>
+    </dependency>
+    <!--
+    <dependency>
+      <groupId>org.apache.lucene</groupId>
+      <artifactId>lucene-bdb-je</artifactId>
+      <version>3.1.0</version>
+    </dependency>
+    -->
     <dependency>
       <groupId>org.slf4j</groupId>
       <artifactId>slf4j-api</artifactId>

LuceneTest.java

@@ -3,26 +3,39 @@
  */
 package org.forkalsrud.album.index;
 
-import java.io.File;
 import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
 
-import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
-import org.apache.lucene.document.DateTools;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
+import org.apache.lucene.document.StringField;
+import org.apache.lucene.document.TextField;
+import org.apache.lucene.facet.index.FacetFields;
+import org.apache.lucene.facet.params.FacetIndexingParams;
+import org.apache.lucene.facet.params.FacetSearchParams;
+import org.apache.lucene.facet.search.CountFacetRequest;
+import org.apache.lucene.facet.search.DrillDownQuery;
+import org.apache.lucene.facet.search.FacetResult;
+import org.apache.lucene.facet.search.FacetResultNode;
+import org.apache.lucene.facet.search.FacetsCollector;
+import org.apache.lucene.facet.taxonomy.CategoryPath;
+import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyReader;
+import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter;
+import org.apache.lucene.index.DirectoryReader;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.IndexWriter;
-import org.apache.lucene.queryParser.QueryParser;
-import org.apache.lucene.search.Collector;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.queryparser.classic.QueryParser;
 import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.MatchAllDocsQuery;
 import org.apache.lucene.search.Query;
-import org.apache.lucene.search.Scorer;
-import org.apache.lucene.search.Searcher;
+import org.apache.lucene.search.ScoreDoc;
+import org.apache.lucene.search.TopScoreDocCollector;
 import org.apache.lucene.store.Directory;
-import org.apache.lucene.store.FSDirectory;
+import org.apache.lucene.store.RAMDirectory;
 import org.apache.lucene.util.Version;
 
+import org.junit.Ignore;
 import org.junit.Test;
 
 /**
@@ -31,159 +44,103 @@ import org.junit.Test;
  */
 public class LuceneTest {
 
-    Version version = Version.LUCENE_29;
+    Version version = Version.LUCENE_44;
 
+    @Ignore
     @Test
     public void testIndexCreation() throws Exception {
-        File index = new File("/Users/knut/Desktop/albumidx");
-        recursiveDelete(index);
-        index.mkdirs();
-        Directory dir = FSDirectory.open(index);
-
-        IndexWriter writer = new IndexWriter(dir, new StandardAnalyzer(
-                Version.LUCENE_CURRENT), true,
-                IndexWriter.MaxFieldLength.LIMITED);
-        // System.out.println("Indexing to directory '" + index + "'...");
-        indexDocs(writer, new File("photos"));
-        System.out.println("Optimizing...");
-        writer.optimize();
-        writer.close();
-
-        IndexReader reader = IndexReader.open(dir, true);
-        // only searching, so read-only=true
-
-        Searcher searcher = new IndexSearcher(reader);
-        Analyzer analyzer = new StandardAnalyzer(version);
-
-        String field = "path";
-        QueryParser parser = new QueryParser(version, field, analyzer);
-        Query query = parser.parse("geiranger");
-        System.out.println("Searching for: " + query.toString(field));
-
-        Collector streamingHitCollector = new Collector() {
-
-            private Scorer scorer;
-            private int docBase;
-            private IndexReader reader;
-
-            // simply print docId and score of every matching document
-            @Override
-            public void collect(int docNo) throws IOException {
-                int docId = docBase + docNo;
-                Document doc = reader.document(docId);
-
-                System.out.println("docId=" + docId + " score="
-                        + scorer.score() + " path=" + doc.get("path"));
-            }
-
-            @Override
-            public boolean acceptsDocsOutOfOrder() {
-                return true;
-            }
-
-            @Override
-            public void setNextReader(IndexReader reader, int docBase)
-                    throws IOException {
-                this.docBase = docBase;
-                this.reader = reader;
-            }
-
-            @Override
-            public void setScorer(Scorer scorer) throws IOException {
-                this.scorer = scorer;
-            }
-        };
-
-        searcher.search(query, streamingHitCollector);
-
-        searcher.close();
+        StandardAnalyzer analyzer = new StandardAnalyzer(version);
+        Directory index = new RAMDirectory();
+        Directory taxoDir = new RAMDirectory();
+        IndexWriterConfig config = new IndexWriterConfig(version, analyzer);
+        IndexWriter w = new IndexWriter(index, config);
+        DirectoryTaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir);
+        FacetFields facetFields = new FacetFields(taxoWriter);
+
+        List<CategoryPath> book1 = new ArrayList<CategoryPath>();
+        book1.add(new CategoryPath("Author", "Erik Hatcher"));
+        book1.add(new CategoryPath("Author", "Otis Gospodnetić"));
+        book1.add(new CategoryPath("Pub Date", "2004", "December", "1"));
+
+        List<CategoryPath> book2 = new ArrayList<CategoryPath>();
+        book2.add(new CategoryPath("Author", "Michael McCandless"));
+        book2.add(new CategoryPath("Author", "Erik Hatcher"));
+        book2.add(new CategoryPath("Author", "Otis Gospodnetić"));
+        book2.add(new CategoryPath("Pub Date", "2010", "July", "28"));
+
+        addDoc(w, facetFields, "Lucene in Action", "193398817", book1);
+        addDoc(w, facetFields, "Lucene for Dummies", "55320055Z", book2);
+        addDoc(w, facetFields, "Managing Gigabytes", "55063554A", null);
+        addDoc(w, facetFields, "The Art of Computer Science", "9900333X", null);
+        w.close();
+        taxoWriter.close();
+
+        IndexReader reader = DirectoryReader.open(index);
+        IndexSearcher searcher = new IndexSearcher(reader);
+        DirectoryTaxonomyReader taxor = new DirectoryTaxonomyReader(taxoDir);
+
+        FacetSearchParams fsp = new FacetSearchParams(new CountFacetRequest(new CategoryPath("Author"), 10), new CountFacetRequest(new CategoryPath("Pub Date"), 10));
+        FacetsCollector facetsCollector = FacetsCollector.create(fsp, reader, taxor);
+        searcher.search(new MatchAllDocsQuery(), facetsCollector);
+        for (FacetResult fres : facetsCollector.getFacetResults()) {
+            FacetResultNode root = fres.getFacetResultNode();
+            root.value = fres.getNumValidDescendants();
+            System.out.println(root.toString());
+        }
+
+        String querystr = "lucene";
+        Query q = new QueryParser(version, "title", analyzer).parse(querystr);
+
+        showResultsForQuery(searcher, q);
+
+        DrillDownQuery ddq = new DrillDownQuery(FacetIndexingParams.DEFAULT, q);
+        ddq.add(new CategoryPath("Author", "Michael McCandless"));
+        showResultsForQuery(searcher, ddq);
+
+        // reader can only be closed when there
+        // is no need to access the documents any more.
         reader.close();
-
-        dir.close();
-        // recursiveDelete(index);
+        taxor.close();
+        index.close();
+        taxoDir.close();
     }
 
-    void indexDocs(IndexWriter writer, File file) throws IOException {
-        // do not try to index files that cannot be read
-        if (file.canRead()) {
-            if (file.isDirectory()) {
-                String[] files = file.list();
-                // an IO error could occur
-                if (files != null) {
-                    for (int i = 0; i < files.length; i++) {
-                        indexDocs(writer, new File(file, files[i]));
-                    }
-                }
-            } else {
-                System.out.println("adding " + file);
-                writer.addDocument(FileDocument.Document(file));
-            }
-        }
-    }
-
-    static class FileDocument {
-
-        /**
-         * Makes a document for a File.
-         * <p>
-         * The document has three fields:
-         * <ul>
-         * <li><code>path</code>--containing the pathname of the file, as a
-         * stored, untokenized field;
-         * <li><code>modified</code>--containing the last modified date of the
-         * file as a field as created by <a
-         * href="lucene.document.DateTools.html">DateTools</a>; and
-         * <li><code>contents</code>--containing the full contents of the file,
-         * as a Reader field;
-         */
-        public static Document Document(File f) {
-            // make a new, empty document
-            Document doc = new Document();
-
-            // Add the path of the file as a field named "path". Use a
-            // field that is indexed (i.e. searchable), but don't
-            // tokenize the field into words.
-            doc.add(new Field("path", f.getPath()/*.replaceAll(File.separator, " ")*/, Field.Store.YES, Field.Index.ANALYZED));
-
-            // Add the last modified date of the file a field named
-            // "modified". Use a field that is indexed
-            // (i.e. searchable), but don't tokenize the field into
-            // words.
-            doc.add(new Field("modified", DateTools.timeToString(f
-                    .lastModified(), DateTools.Resolution.MINUTE),
-                    Field.Store.YES, Field.Index.NOT_ANALYZED));
-
-            // Add the contents of the file to a field named
-            // "contents". Specify a Reader, so that the text of the
-            // file is tokenized and indexed, but not stored. Note
-            // that FileReader expects the file to be in the system's
-            // default encoding. If that's not the case searching for
-            // special characters will fail.
-            // doc.add(new Field("contents", new FileReader(f)));
-
-            // return the document
-            return doc;
-        }
-
-        private FileDocument() {
-            // not to be instantiated
-        }
-    }
-
-    void recursiveDelete(File f) {
-        if (f.isDirectory()) {
-            for (File e : f.listFiles()) {
-                recursiveDelete(e);
-            }
-        }
-        f.delete();
-    }
+    /**
+     * @param hitsPerPage
+     * @param searcher
+     * @param q
+     * @throws IOException
+     */
+    public void showResultsForQuery(IndexSearcher searcher,
+            Query q) throws IOException {
+        int hitsPerPage = 10;
+
+        TopScoreDocCollector collector = TopScoreDocCollector.create(hitsPerPage, true);
+        searcher.search(q, collector);
+        ScoreDoc[] hits = collector.topDocs().scoreDocs;
+
+        System.out.println("Found " + hits.length + " hits.");
+        for(int i=0;i<hits.length;++i) {
+            int docId = hits[i].doc;
+            Document d = searcher.doc(docId);
+            System.out.println((i + 1) + ". " + d.get("isbn") + "\t" + d.get("title"));
+        }
+    }
+
+    private void addDoc(IndexWriter w, FacetFields facetFields, String title, String isbn, List<CategoryPath> categories) throws IOException {
+        Document doc = new Document();
+        doc.add(new TextField("title", title, Field.Store.YES));
+        doc.add(new StringField("isbn", isbn, Field.Store.YES));
+        if (categories != null) {
+            facetFields.addFields(doc, categories);
+        }
+        w.addDocument(doc);
+    }
 }
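
Usage note (not part of the commit): assuming the project's standard Maven Surefire setup, and after removing the @Ignore annotation the commit adds to the test, the updated test could be run on its own with a command like:

    mvn test -Dtest=LuceneTest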