Updating Lucene test with Faceted search.
parent 193c51900f
commit 0ea01af309
2 changed files with 114 additions and 140 deletions
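For orientation before the diff: Lucene 4.4 faceting writes to two parallel indexes, a main index for the documents and a taxonomy index for their category paths, and a FacetsCollector joins the two at search time. What follows is a minimal, self-contained sketch of that round-trip using only calls that appear in the changed test below; the class name FacetSketch and the single sample document are illustrative, not part of the commit.

import java.util.Arrays;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.facet.index.FacetFields;
import org.apache.lucene.facet.params.FacetSearchParams;
import org.apache.lucene.facet.search.CountFacetRequest;
import org.apache.lucene.facet.search.FacetResult;
import org.apache.lucene.facet.search.FacetsCollector;
import org.apache.lucene.facet.taxonomy.CategoryPath;
import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyReader;
import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;

public class FacetSketch {
    public static void main(String[] args) throws Exception {
        Directory index = new RAMDirectory();
        Directory taxoDir = new RAMDirectory();

        // Documents go to the main index; their category paths go to the taxonomy index.
        IndexWriter w = new IndexWriter(index, new IndexWriterConfig(
                Version.LUCENE_44, new StandardAnalyzer(Version.LUCENE_44)));
        DirectoryTaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir);
        FacetFields facetFields = new FacetFields(taxoWriter);

        Document doc = new Document();
        doc.add(new TextField("title", "Lucene in Action", Field.Store.YES));
        facetFields.addFields(doc, Arrays.asList(new CategoryPath("Author", "Erik Hatcher")));
        w.addDocument(doc);
        w.close();
        taxoWriter.close();

        // Count matching documents under the "Author" dimension (top 10 children).
        IndexReader reader = DirectoryReader.open(index);
        DirectoryTaxonomyReader taxoReader = new DirectoryTaxonomyReader(taxoDir);
        FacetSearchParams fsp = new FacetSearchParams(
                new CountFacetRequest(new CategoryPath("Author"), 10));
        FacetsCollector fc = FacetsCollector.create(fsp, reader, taxoReader);
        new IndexSearcher(reader).search(new MatchAllDocsQuery(), fc);

        for (FacetResult result : fc.getFacetResults()) {
            System.out.println(result.getFacetResultNode());
        }
        reader.close();
        taxoReader.close();
    }
}

DrillDownQuery, used at the end of the updated test, then narrows the same query to documents tagged with one specific category value.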
pom.xml (19 changes)

@@ -172,8 +172,25 @@
     <dependency>
       <groupId>org.apache.lucene</groupId>
       <artifactId>lucene-core</artifactId>
-      <version>3.0.0</version>
+      <version>4.4.0</version>
     </dependency>
+    <dependency>
+      <groupId>org.apache.lucene</groupId>
+      <artifactId>lucene-queryparser</artifactId>
+      <version>4.4.0</version>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.lucene</groupId>
+      <artifactId>lucene-facet</artifactId>
+      <version>4.4.0</version>
+    </dependency>
+    <!--
+    <dependency>
+      <groupId>org.apache.lucene</groupId>
+      <artifactId>lucene-bdb-je</artifactId>
+      <version>3.1.0</version>
+    </dependency>
+    -->
     <dependency>
       <groupId>org.slf4j</groupId>
       <artifactId>slf4j-api</artifactId>
LuceneTest.java (235 changes)

@@ -3,26 +3,39 @@
  */
 package org.forkalsrud.album.index;
 
-import java.io.File;
 import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
 
-import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
-import org.apache.lucene.document.DateTools;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
+import org.apache.lucene.document.StringField;
+import org.apache.lucene.document.TextField;
+import org.apache.lucene.facet.index.FacetFields;
+import org.apache.lucene.facet.params.FacetIndexingParams;
+import org.apache.lucene.facet.params.FacetSearchParams;
+import org.apache.lucene.facet.search.CountFacetRequest;
+import org.apache.lucene.facet.search.DrillDownQuery;
+import org.apache.lucene.facet.search.FacetResult;
+import org.apache.lucene.facet.search.FacetResultNode;
+import org.apache.lucene.facet.search.FacetsCollector;
+import org.apache.lucene.facet.taxonomy.CategoryPath;
+import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyReader;
+import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter;
+import org.apache.lucene.index.DirectoryReader;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.IndexWriter;
-import org.apache.lucene.queryParser.QueryParser;
-import org.apache.lucene.search.Collector;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.queryparser.classic.QueryParser;
 import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.MatchAllDocsQuery;
 import org.apache.lucene.search.Query;
-import org.apache.lucene.search.Scorer;
-import org.apache.lucene.search.Searcher;
+import org.apache.lucene.search.ScoreDoc;
+import org.apache.lucene.search.TopScoreDocCollector;
 import org.apache.lucene.store.Directory;
-import org.apache.lucene.store.FSDirectory;
+import org.apache.lucene.store.RAMDirectory;
 import org.apache.lucene.util.Version;
 import org.junit.Ignore;
 import org.junit.Test;
 
 /**
@@ -31,159 +44,103 @@ import org.junit.Test;
  */
 public class LuceneTest {
 
-    Version version = Version.LUCENE_29;
+    Version version = Version.LUCENE_44;
 
     @Ignore
     @Test
     public void testIndexCreation() throws Exception {
 
-        File index = new File("/Users/knut/Desktop/albumidx");
-        recursiveDelete(index);
-        index.mkdirs();
+        StandardAnalyzer analyzer = new StandardAnalyzer(version);
+        Directory index = new RAMDirectory();
+        Directory taxoDir = new RAMDirectory();
 
-        Directory dir = FSDirectory.open(index);
+        IndexWriterConfig config = new IndexWriterConfig(version, analyzer);
+        IndexWriter w = new IndexWriter(index, config);
 
-        IndexWriter writer = new IndexWriter(dir, new StandardAnalyzer(
-                Version.LUCENE_CURRENT), true,
-                IndexWriter.MaxFieldLength.LIMITED);
+        DirectoryTaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir);
+        FacetFields facetFields = new FacetFields(taxoWriter);
 
-        // System.out.println("Indexing to directory '" + index + "'...");
-        indexDocs(writer, new File("photos"));
-        System.out.println("Optimizing...");
-        writer.optimize();
-        writer.close();
+        List<CategoryPath> book1 = new ArrayList<CategoryPath>();
+        book1.add(new CategoryPath("Author", "Erik Hatcher"));
+        book1.add(new CategoryPath("Author", "Otis Gospodnetić"));
+        book1.add(new CategoryPath("Pub Date", "2004", "December", "1"));
 
-        IndexReader reader = IndexReader.open(dir, true);
-        // only searching, so read-only=true
+        List<CategoryPath> book2 = new ArrayList<CategoryPath>();
+        book2.add(new CategoryPath("Author", "Michael McCandless"));
+        book2.add(new CategoryPath("Author", "Erik Hatcher"));
+        book2.add(new CategoryPath("Author", "Otis Gospodnetić"));
+        book2.add(new CategoryPath("Pub Date", "2010", "July", "28"));
 
+        addDoc(w, facetFields, "Lucene in Action", "193398817", book1);
+        addDoc(w, facetFields, "Lucene for Dummies", "55320055Z", book2);
+        addDoc(w, facetFields, "Managing Gigabytes", "55063554A", null);
+        addDoc(w, facetFields, "The Art of Computer Science", "9900333X", null);
+        w.close();
+        taxoWriter.close();
 
-        Searcher searcher = new IndexSearcher(reader);
-        Analyzer analyzer = new StandardAnalyzer(version);
-
-        String field = "path";
-        QueryParser parser = new QueryParser(version, field, analyzer);
-        Query query = parser.parse("geiranger");
-        System.out.println("Searching for: " + query.toString(field));
+        IndexReader reader = DirectoryReader.open(index);
+        IndexSearcher searcher = new IndexSearcher(reader);
 
-        Collector streamingHitCollector = new Collector() {
+        DirectoryTaxonomyReader taxor = new DirectoryTaxonomyReader(taxoDir);
 
-            private Scorer scorer;
-            private int docBase;
-            private IndexReader reader;
-
-            // simply print docId and score of every matching document
-            @Override
-            public void collect(int docNo) throws IOException {
+        FacetSearchParams fsp = new FacetSearchParams(new CountFacetRequest(new CategoryPath("Author"), 10), new CountFacetRequest(new CategoryPath("Pub Date"), 10));
+        FacetsCollector facetsCollector = FacetsCollector.create(fsp, reader, taxor);
+        searcher.search(new MatchAllDocsQuery(), facetsCollector);
 
-                int docId = docBase + docNo;
-                Document doc = reader.document(docId);
+        for (FacetResult fres : facetsCollector.getFacetResults()) {
+            FacetResultNode root = fres.getFacetResultNode();
+            root.value = fres.getNumValidDescendants();
+            System.out.println(root.toString());
+        }
 
-                System.out.println("docId=" + docId + " score="
-                        + scorer.score() + " path=" + doc.get("path"));
-            }
+        String querystr = "lucene";
+        Query q = new QueryParser(version, "title", analyzer).parse(querystr);
 
+        showResultsForQuery(searcher, q);
 
-            @Override
-            public boolean acceptsDocsOutOfOrder() {
-                return true;
-            }
+        DrillDownQuery ddq = new DrillDownQuery(FacetIndexingParams.DEFAULT, q);
+        ddq.add(new CategoryPath("Author", "Michael McCandless"));
+        showResultsForQuery(searcher, ddq);
 
-            @Override
-            public void setNextReader(IndexReader reader, int docBase)
-                    throws IOException {
-                this.docBase = docBase;
-                this.reader = reader;
-            }
-
-            @Override
-            public void setScorer(Scorer scorer) throws IOException {
-                this.scorer = scorer;
-            }
-
-        };
-
-        searcher.search(query, streamingHitCollector);
-
-        searcher.close();
-
-        // reader can only be closed when there
-        // is no need to access the documents any more.
         reader.close();
 
-        dir.close();
-        // recursiveDelete(index);
+        taxor.close();
+        index.close();
+        taxoDir.close();
     }
 
-    void indexDocs(IndexWriter writer, File file) throws IOException {
-        // do not try to index files that cannot be read
-        if (file.canRead()) {
-            if (file.isDirectory()) {
-                String[] files = file.list();
-                // an IO error could occur
-                if (files != null) {
-                    for (int i = 0; i < files.length; i++) {
-                        indexDocs(writer, new File(file, files[i]));
-                    }
-                }
-            } else {
-                System.out.println("adding " + file);
-                writer.addDocument(FileDocument.Document(file));
-            }
+    /**
+     * @param hitsPerPage
+     * @param searcher
+     * @param q
+     * @throws IOException
+     */
+    public void showResultsForQuery(IndexSearcher searcher,
+            Query q) throws IOException {
+        int hitsPerPage = 10;
+
+        TopScoreDocCollector collector = TopScoreDocCollector.create(hitsPerPage, true);
+        searcher.search(q, collector);
+        ScoreDoc[] hits = collector.topDocs().scoreDocs;
+
+        System.out.println("Found " + hits.length + " hits.");
+        for(int i=0;i<hits.length;++i) {
+            int docId = hits[i].doc;
+            Document d = searcher.doc(docId);
+            System.out.println((i + 1) + ". " + d.get("isbn") + "\t" + d.get("title"));
+        }
     }
 
-    static class FileDocument {
-
-        /**
-         * Makes a document for a File.
-         * <p>
-         * The document has three fields:
-         * <ul>
-         * <li><code>path</code>--containing the pathname of the file, as a
-         * stored, untokenized field;
-         * <li><code>modified</code>--containing the last modified date of the
-         * file as a field as created by <a
-         * href="lucene.document.DateTools.html">DateTools</a>; and
-         * <li><code>contents</code>--containing the full contents of the file,
-         * as a Reader field;
-         */
-        public static Document Document(File f) {
-
-            // make a new, empty document
-            Document doc = new Document();
-
-            // Add the path of the file as a field named "path". Use a
-            // field that is indexed (i.e. searchable), but don't
-            // tokenize the field into words.
-            doc.add(new Field("path", f.getPath()/*.replaceAll(File.separator, " ")*/, Field.Store.YES, Field.Index.ANALYZED));
-
-            // Add the last modified date of the file a field named
-            // "modified". Use a field that is indexed
-            // (i.e. searchable), but don't tokenize the field into
-            // words.
-            doc.add(new Field("modified", DateTools.timeToString(f
-                    .lastModified(), DateTools.Resolution.MINUTE),
-                    Field.Store.YES, Field.Index.NOT_ANALYZED));
-
-            // Add the contents of the file to a field named
-            // "contents". Specify a Reader, so that the text of the
-            // file is tokenized and indexed, but not stored. Note
-            // that FileReader expects the file to be in the system's
-            // default encoding. If that's not the case searching for
-            // special characters will fail.
-
-            // doc.add(new Field("contents", new FileReader(f)));
-
-            // return the document
-            return doc;
+    private void addDoc(IndexWriter w, FacetFields facetFields, String title, String isbn, List<CategoryPath> categories) throws IOException {
+        Document doc = new Document();
+        doc.add(new TextField("title", title, Field.Store.YES));
+        doc.add(new StringField("isbn", isbn, Field.Store.YES));
+        if (categories != null) {
+            facetFields.addFields(doc, categories);
         }
-
-        private FileDocument() {
-            // not to be instantiated
-        }
-    }
-
-    void recursiveDelete(File f) {
-        if (f.isDirectory()) {
-            for (File e : f.listFiles()) {
-                recursiveDelete(e);
-            }
-        }
-        f.delete();
+        w.addDocument(doc);
     }
 }