Update the Lucene test to exercise faceted search (Lucene 3.0 → 4.4).
parent 193c51900f
commit 0ea01af309

2 changed files with 114 additions and 140 deletions:
pom.xml (19 lines changed)
LuceneTest.java
pom.xml
@@ -172,8 +172,25 @@
         <dependency>
             <groupId>org.apache.lucene</groupId>
             <artifactId>lucene-core</artifactId>
-            <version>3.0.0</version>
+            <version>4.4.0</version>
         </dependency>
+        <dependency>
+            <groupId>org.apache.lucene</groupId>
+            <artifactId>lucene-queryparser</artifactId>
+            <version>4.4.0</version>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.lucene</groupId>
+            <artifactId>lucene-facet</artifactId>
+            <version>4.4.0</version>
+        </dependency>
+        <!--
+        <dependency>
+            <groupId>org.apache.lucene</groupId>
+            <artifactId>lucene-bdb-je</artifactId>
+            <version>3.1.0</version>
+        </dependency>
+        -->
         <dependency>
             <groupId>org.slf4j</groupId>
             <artifactId>slf4j-api</artifactId>
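The version bump is not a drop-in change: with Lucene 4.x the classic query parser moved out of lucene-core into the separate lucene-queryparser artifact, and faceting ships in lucene-facet, which is why the two new dependencies appear alongside the 4.4.0 core. A minimal smoke-test sketch of what the new coordinates provide, reusing only classes the updated test itself imports (the class name Lucene44Smoke is illustrative, not part of the commit):

    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.facet.taxonomy.CategoryPath;      // from lucene-facet
    import org.apache.lucene.queryparser.classic.QueryParser;  // from lucene-queryparser
    import org.apache.lucene.search.Query;
    import org.apache.lucene.util.Version;

    public class Lucene44Smoke {
        public static void main(String[] args) throws Exception {
            // 3.x bundled QueryParser in lucene-core as org.apache.lucene.queryParser;
            // 4.x moves it to org.apache.lucene.queryparser.classic.
            StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_44);
            Query q = new QueryParser(Version.LUCENE_44, "title", analyzer).parse("lucene");
            System.out.println(q);

            // Facet categories are hierarchical paths: a dimension plus components.
            System.out.println(new CategoryPath("Pub Date", "2004", "December", "1"));
        }
    }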
LuceneTest.java
@@ -3,26 +3,39 @@
  */
 package org.forkalsrud.album.index;
 
-import java.io.File;
 import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
 
-import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
-import org.apache.lucene.document.DateTools;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
+import org.apache.lucene.document.StringField;
+import org.apache.lucene.document.TextField;
+import org.apache.lucene.facet.index.FacetFields;
+import org.apache.lucene.facet.params.FacetIndexingParams;
+import org.apache.lucene.facet.params.FacetSearchParams;
+import org.apache.lucene.facet.search.CountFacetRequest;
+import org.apache.lucene.facet.search.DrillDownQuery;
+import org.apache.lucene.facet.search.FacetResult;
+import org.apache.lucene.facet.search.FacetResultNode;
+import org.apache.lucene.facet.search.FacetsCollector;
+import org.apache.lucene.facet.taxonomy.CategoryPath;
+import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyReader;
+import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter;
+import org.apache.lucene.index.DirectoryReader;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.IndexWriter;
-import org.apache.lucene.queryParser.QueryParser;
-import org.apache.lucene.search.Collector;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.queryparser.classic.QueryParser;
 import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.MatchAllDocsQuery;
 import org.apache.lucene.search.Query;
-import org.apache.lucene.search.Scorer;
-import org.apache.lucene.search.Searcher;
+import org.apache.lucene.search.ScoreDoc;
+import org.apache.lucene.search.TopScoreDocCollector;
 import org.apache.lucene.store.Directory;
-import org.apache.lucene.store.FSDirectory;
+import org.apache.lucene.store.RAMDirectory;
 import org.apache.lucene.util.Version;
-import org.junit.Ignore;
 import org.junit.Test;
 
 /**
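The import churn tracks the Lucene 4 API surface: Searcher is gone (IndexSearcher is used directly), the hand-rolled streaming Collector gives way to TopScoreDocCollector, IndexReader.open is replaced by DirectoryReader.open, and the taxonomy/facet classes come from the new lucene-facet module. A short sketch of the replacement hit-collection idiom, mirroring the showResultsForQuery method in the hunk below (TopHitsSketch and printTopHits are illustrative names, not the test's):

    import java.io.IOException;

    import org.apache.lucene.document.Document;
    import org.apache.lucene.search.IndexSearcher;
    import org.apache.lucene.search.Query;
    import org.apache.lucene.search.ScoreDoc;
    import org.apache.lucene.search.TopScoreDocCollector;

    class TopHitsSketch {
        // Lucene 4 supplies ready-made collectors; no Scorer/docBase bookkeeping needed.
        static void printTopHits(IndexSearcher searcher, Query q) throws IOException {
            TopScoreDocCollector collector = TopScoreDocCollector.create(10, true); // true: docs scored in order
            searcher.search(q, collector);
            for (ScoreDoc hit : collector.topDocs().scoreDocs) {
                Document d = searcher.doc(hit.doc);
                System.out.println(hit.score + "\t" + d.get("title"));
            }
        }
    }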
@@ -31,159 +44,103 @@ import org.junit.Test;
  */
 public class LuceneTest {
 
-    Version version = Version.LUCENE_29;
+    Version version = Version.LUCENE_44;
 
-    @Ignore
     @Test
     public void testIndexCreation() throws Exception {
 
-        File index = new File("/Users/knut/Desktop/albumidx");
-        recursiveDelete(index);
-        index.mkdirs();
+        StandardAnalyzer analyzer = new StandardAnalyzer(version);
+        Directory index = new RAMDirectory();
+        Directory taxoDir = new RAMDirectory();
 
-        Directory dir = FSDirectory.open(index);
+        IndexWriterConfig config = new IndexWriterConfig(version, analyzer);
+        IndexWriter w = new IndexWriter(index, config);
 
-        IndexWriter writer = new IndexWriter(dir, new StandardAnalyzer(
-                Version.LUCENE_CURRENT), true,
-                IndexWriter.MaxFieldLength.LIMITED);
+        DirectoryTaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir);
+        FacetFields facetFields = new FacetFields(taxoWriter);
 
-        // System.out.println("Indexing to directory '" + index + "'...");
-        indexDocs(writer, new File("photos"));
-        System.out.println("Optimizing...");
-        writer.optimize();
-        writer.close();
+        List<CategoryPath> book1 = new ArrayList<CategoryPath>();
+        book1.add(new CategoryPath("Author", "Erik Hatcher"));
+        book1.add(new CategoryPath("Author", "Otis Gospodnetić"));
+        book1.add(new CategoryPath("Pub Date", "2004", "December", "1"));
 
-        IndexReader reader = IndexReader.open(dir, true);
-        // only searching, so read-only=true
+        List<CategoryPath> book2 = new ArrayList<CategoryPath>();
+        book2.add(new CategoryPath("Author", "Michael McCandless"));
+        book2.add(new CategoryPath("Author", "Erik Hatcher"));
+        book2.add(new CategoryPath("Author", "Otis Gospodnetić"));
+        book2.add(new CategoryPath("Pub Date", "2010", "July", "28"));
 
-        Searcher searcher = new IndexSearcher(reader);
-        Analyzer analyzer = new StandardAnalyzer(version);
+        addDoc(w, facetFields, "Lucene in Action", "193398817", book1);
+        addDoc(w, facetFields, "Lucene for Dummies", "55320055Z", book2);
+        addDoc(w, facetFields, "Managing Gigabytes", "55063554A", null);
+        addDoc(w, facetFields, "The Art of Computer Science", "9900333X", null);
+        w.close();
+        taxoWriter.close();
 
-        String field = "path";
-        QueryParser parser = new QueryParser(version, field, analyzer);
-        Query query = parser.parse("geiranger");
-        System.out.println("Searching for: " + query.toString(field));
+        IndexReader reader = DirectoryReader.open(index);
+        IndexSearcher searcher = new IndexSearcher(reader);
 
-        Collector streamingHitCollector = new Collector() {
+        DirectoryTaxonomyReader taxor = new DirectoryTaxonomyReader(taxoDir);
 
-            private Scorer scorer;
-            private int docBase;
-            private IndexReader reader;
-
-            // simply print docId and score of every matching document
-            @Override
-            public void collect(int docNo) throws IOException {
+        FacetSearchParams fsp = new FacetSearchParams(new CountFacetRequest(new CategoryPath("Author"), 10), new CountFacetRequest(new CategoryPath("Pub Date"), 10));
+        FacetsCollector facetsCollector = FacetsCollector.create(fsp, reader, taxor);
+        searcher.search(new MatchAllDocsQuery(), facetsCollector);
 
-                int docId = docBase + docNo;
-                Document doc = reader.document(docId);
+        for (FacetResult fres : facetsCollector.getFacetResults()) {
+            FacetResultNode root = fres.getFacetResultNode();
+            root.value = fres.getNumValidDescendants();
+            System.out.println(root.toString());
+        }
 
-                System.out.println("docId=" + docId + " score="
-                        + scorer.score() + " path=" + doc.get("path"));
-            }
+        String querystr = "lucene";
+        Query q = new QueryParser(version, "title", analyzer).parse(querystr);
+        showResultsForQuery(searcher, q);
 
-            @Override
-            public boolean acceptsDocsOutOfOrder() {
-                return true;
-            }
+        DrillDownQuery ddq = new DrillDownQuery(FacetIndexingParams.DEFAULT, q);
+        ddq.add(new CategoryPath("Author", "Michael McCandless"));
+        showResultsForQuery(searcher, ddq);
 
-            @Override
-            public void setNextReader(IndexReader reader, int docBase)
-                    throws IOException {
-                this.docBase = docBase;
-                this.reader = reader;
-            }
-
-            @Override
-            public void setScorer(Scorer scorer) throws IOException {
-                this.scorer = scorer;
-            }
-
-        };
-
-        searcher.search(query, streamingHitCollector);
-
-        searcher.close();
+        // reader can only be closed when there
+        // is no need to access the documents any more.
         reader.close();
-        dir.close();
-        // recursiveDelete(index);
+        taxor.close();
+        index.close();
+        taxoDir.close();
     }
 
-    void indexDocs(IndexWriter writer, File file) throws IOException {
-        // do not try to index files that cannot be read
-        if (file.canRead()) {
-            if (file.isDirectory()) {
-                String[] files = file.list();
-                // an IO error could occur
-                if (files != null) {
-                    for (int i = 0; i < files.length; i++) {
-                        indexDocs(writer, new File(file, files[i]));
-                    }
-                }
-            } else {
-                System.out.println("adding " + file);
-                writer.addDocument(FileDocument.Document(file));
-            }
-        }
-    }
+    /**
+     * @param searcher
+     * @param q
+     * @throws IOException
+     */
+    public void showResultsForQuery(IndexSearcher searcher,
+            Query q) throws IOException {
+        int hitsPerPage = 10;
+        TopScoreDocCollector collector = TopScoreDocCollector.create(hitsPerPage, true);
+        searcher.search(q, collector);
+        ScoreDoc[] hits = collector.topDocs().scoreDocs;
+
+        System.out.println("Found " + hits.length + " hits.");
+        for (int i = 0; i < hits.length; ++i) {
+            int docId = hits[i].doc;
+            Document d = searcher.doc(docId);
+            System.out.println((i + 1) + ". " + d.get("isbn") + "\t" + d.get("title"));
+        }
+    }
 
-    static class FileDocument {
-
-        /**
-         * Makes a document for a File.
-         * <p>
-         * The document has three fields:
-         * <ul>
-         * <li><code>path</code>--containing the pathname of the file, as a
-         * stored, untokenized field;
-         * <li><code>modified</code>--containing the last modified date of the
-         * file as a field as created by <a
-         * href="lucene.document.DateTools.html">DateTools</a>; and
-         * <li><code>contents</code>--containing the full contents of the file,
-         * as a Reader field;
-         */
-        public static Document Document(File f) {
-
-            // make a new, empty document
-            Document doc = new Document();
-
-            // Add the path of the file as a field named "path". Use a
-            // field that is indexed (i.e. searchable), but don't
-            // tokenize the field into words.
-            doc.add(new Field("path", f.getPath()/*.replaceAll(File.separator, " ")*/, Field.Store.YES, Field.Index.ANALYZED));
-
-            // Add the last modified date of the file a field named
-            // "modified". Use a field that is indexed
-            // (i.e. searchable), but don't tokenize the field into
-            // words.
-            doc.add(new Field("modified", DateTools.timeToString(f
-                    .lastModified(), DateTools.Resolution.MINUTE),
-                    Field.Store.YES, Field.Index.NOT_ANALYZED));
-
-            // Add the contents of the file to a field named
-            // "contents". Specify a Reader, so that the text of the
-            // file is tokenized and indexed, but not stored. Note
-            // that FileReader expects the file to be in the system's
-            // default encoding. If that's not the case searching for
-            // special characters will fail.
-
-            // doc.add(new Field("contents", new FileReader(f)));
-
-            // return the document
-            return doc;
-        }
-
-        private FileDocument() {
-            // not to be instantiated
-        }
-    }
-
-    void recursiveDelete(File f) {
-        if (f.isDirectory()) {
-            for (File e : f.listFiles()) {
-                recursiveDelete(e);
-            }
-        }
-        f.delete();
-    }
+    private void addDoc(IndexWriter w, FacetFields facetFields, String title, String isbn, List<CategoryPath> categories) throws IOException {
+        Document doc = new Document();
+        doc.add(new TextField("title", title, Field.Store.YES));
+        doc.add(new StringField("isbn", isbn, Field.Store.YES));
+        if (categories != null) {
+            facetFields.addFields(doc, categories);
+        }
+        w.addDocument(doc);
+    }
 }
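Pulled out of the diff, the new test is the standard Lucene 4.4 taxonomy-facet round trip: FacetFields records category paths at index time against a DirectoryTaxonomyWriter, FacetsCollector counts them at search time, and DrillDownQuery narrows a base query to a single category value. A condensed, self-contained sketch of that flow (FacetFlowSketch and its variable names are illustrative; the API calls are the ones the test uses):

    import java.util.Arrays;

    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field;
    import org.apache.lucene.document.TextField;
    import org.apache.lucene.facet.index.FacetFields;
    import org.apache.lucene.facet.params.FacetIndexingParams;
    import org.apache.lucene.facet.params.FacetSearchParams;
    import org.apache.lucene.facet.search.CountFacetRequest;
    import org.apache.lucene.facet.search.DrillDownQuery;
    import org.apache.lucene.facet.search.FacetResult;
    import org.apache.lucene.facet.search.FacetsCollector;
    import org.apache.lucene.facet.taxonomy.CategoryPath;
    import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyReader;
    import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter;
    import org.apache.lucene.index.DirectoryReader;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.index.IndexWriterConfig;
    import org.apache.lucene.search.IndexSearcher;
    import org.apache.lucene.search.MatchAllDocsQuery;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.RAMDirectory;
    import org.apache.lucene.util.Version;

    public class FacetFlowSketch {
        public static void main(String[] args) throws Exception {
            Directory index = new RAMDirectory();
            Directory taxoDir = new RAMDirectory();

            // Index time: the taxonomy writer assigns ordinals to category paths,
            // and FacetFields serializes them into each document.
            IndexWriter w = new IndexWriter(index,
                    new IndexWriterConfig(Version.LUCENE_44, new StandardAnalyzer(Version.LUCENE_44)));
            DirectoryTaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir);
            FacetFields facetFields = new FacetFields(taxoWriter);

            Document doc = new Document();
            doc.add(new TextField("title", "Lucene in Action", Field.Store.YES));
            facetFields.addFields(doc, Arrays.asList(new CategoryPath("Author", "Erik Hatcher")));
            w.addDocument(doc);
            w.close();
            taxoWriter.close();

            // Search time: FacetsCollector counts categories over the matching docs.
            DirectoryReader reader = DirectoryReader.open(index);
            DirectoryTaxonomyReader taxoReader = new DirectoryTaxonomyReader(taxoDir);
            IndexSearcher searcher = new IndexSearcher(reader);

            FacetSearchParams fsp = new FacetSearchParams(
                    new CountFacetRequest(new CategoryPath("Author"), 10));
            FacetsCollector fc = FacetsCollector.create(fsp, reader, taxoReader);
            searcher.search(new MatchAllDocsQuery(), fc);
            for (FacetResult res : fc.getFacetResults()) {
                System.out.println(res.getFacetResultNode());
            }

            // Drill-down: restrict any base query to one category value.
            DrillDownQuery ddq = new DrillDownQuery(FacetIndexingParams.DEFAULT, new MatchAllDocsQuery());
            ddq.add(new CategoryPath("Author", "Erik Hatcher"));
            System.out.println("drill-down hits: " + searcher.search(ddq, 10).totalHits);

            reader.close();
            taxoReader.close();
            index.close();
            taxoDir.close();
        }
    }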