lucene的多种搜索2-SpanQuery
导读:
SpanQuery是按照词在文章中的位置距离进行查询,或者查询几个相邻词的一类查询。
SpanQuery包括以下几种:
SpanTermQuery:词距查询的基础,结果和TermQuery相似,只不过是增加了查询结果中单词的距离信息。
SpanFirstQuery:在指定距离可以找到第一个单词的查询。
SpanNearQuery:查询的几个语句之间保持着一定的距离。
SpanOrQuery:同时查询几个词距查询,并合并它们的结果。
SpanNotQuery:从一个词距查询结果中,去除一个词距查询。
下面一个简单例子介绍
package com;
//SpanQuery:跨度查询。此类为抽象类。
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.spans.SpanFirstQuery;
import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanNotQuery;
import org.apache.lucene.search.spans.SpanOrQuery;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.search.spans.Spans;
import org.apache.lucene.store.RAMDirectory;
public class SpanQueryTest {
// In-memory index directory; created and populated in index().
private RAMDirectory directory;
// Searcher over the directory; dumpSpans uses it to obtain per-document scores.
private IndexSearcher indexSearcher;
// Reader over the directory; dumpSpans passes it to getSpans and loads documents from it.
private IndexReader reader;
// One SpanTermQuery per sample term, all built in index() against the field named "field".
private SpanTermQuery quick;
private SpanTermQuery brown;
private SpanTermQuery red;
private SpanTermQuery fox;
private SpanTermQuery lazy;
private SpanTermQuery sleepy;
private SpanTermQuery dog;
private SpanTermQuery cat;
// Analyzer used when indexing and when dumpSpans re-tokenizes the stored text.
private Analyzer analyzer;
// 索引及初始化
/**
 * Builds the in-memory sample index and initializes the shared fixtures.
 *
 * <p>Two documents are written to a fresh {@link RAMDirectory}, each with a
 * single stored, tokenized field named "field". Afterwards the per-term
 * {@link SpanTermQuery} fields, the searcher and the reader are created.
 *
 * @throws IOException if writing to or opening the index fails
 */
public void index() throws IOException {
    directory = new RAMDirectory();
    analyzer = new WhitespaceAnalyzer();

    IndexWriter writer = new IndexWriter(directory, analyzer, true);
    try {
        Document doc1 = new Document();
        doc1.add(new Field("field",
                "the quick brown fox jumps over the lazy dog", Store.YES,
                Index.TOKENIZED));
        Document doc2 = new Document();
        doc2.add(new Field("field",
                "the quick red fox jumps over the sleepy cat", Store.YES,
                Index.TOKENIZED));
        writer.addDocument(doc1);
        writer.addDocument(doc2);
        writer.optimize();
    } finally {
        // Always close the writer, even if adding a document or optimizing throws.
        writer.close();
    }

    // Term-level span queries reused by the individual test methods.
    quick = new SpanTermQuery(new Term("field", "quick"));
    brown = new SpanTermQuery(new Term("field", "brown"));
    red = new SpanTermQuery(new Term("field", "red"));
    fox = new SpanTermQuery(new Term("field", "fox"));
    lazy = new SpanTermQuery(new Term("field", "lazy"));
    sleepy = new SpanTermQuery(new Term("field", "sleepy"));
    dog = new SpanTermQuery(new Term("field", "dog"));
    cat = new SpanTermQuery(new Term("field", "cat"));

    // Open the searcher and reader only after the writer has been closed.
    indexSearcher = new IndexSearcher(directory);
    reader = IndexReader.open(directory);
}
private void dumpSpans(SpanQuery query) throws IOException {
// 检索效果和TermQuery一样,可以把他当成TermQuery
Hits hits = indexSearcher.search(query);
for (int i = 0; i // System.out.println(hits.doc(i).get("field"));
}
// 但内部会记录一些位置信息,供SpanQuery的其它API使用,是其它属于SpanQuery的Query的基础。
Spans spans = query.getSpans(reader);
int numSpans = 0;
float[] scores = new float[2];
for (int i = 0; i scores[hits.id(i)] = hits.score(i);
}
while (spans.next()) {
numSpans++;
int id = spans.doc();
Document doc = reader.document(id);
Token[] tokens = AnalyzerUtils.tokensFromAnalysis(analyzer, doc
.get("field"));
StringBuffer buffer = new StringBuffer();
for (int i = 0; i // the quick brown fox jumps over the lazy dog
// spans记录了位置信息,比如搜索brown,brown在这句话中位于第三个位置,所以spans.start()=2,spans.end()=3
// 在第二项的位置后加<,第三项后加> 返回
if (i == spans.start()) {
buffer.append("<");
}
buffer.append(tokens[i].termText());
if (i + 1 == spans.end()) {
buffer.append(">");
}
buffer.append(" ");
}
buffer.append("(" + scores[id] + ") ");
System.out.println(buffer);
}
// indexSearcher.close();
}
// SpanTermQuery:检索效果完全同TermQuery,但内部会记录一些位置信息,供SpanQuery的其它API使用,是其它属于SpanQuery的Query的基础。
/**
 * Dumps the spans of the plain term query for "brown".
 *
 * @throws IOException if searching or reading the index fails
 */
public void spanTermQueryTest() throws IOException {
    final SpanQuery termSpans = brown;
    // Expected to print the first sample sentence with "brown" wrapped in
    // angle brackets, followed by the score in parentheses.
    dumpSpans(termSpans);
}
// SpanFirstQuery:查找方式为从Field的内容起始位置开始,在一个固定的宽度内查找所指定的词条。
/**
 * Dumps the spans of a SpanFirstQuery that looks for "brown" within the first
 * three positions of the field.
 *
 * @throws IOException if searching or reading the index fails
 */
public void spanFirstQueryTest() throws IOException {
    // Sample text: "the quick brown fox jumps over the lazy dog".
    // "brown" is the third token (span start 2, end 3), so the end bound
    // passed to SpanFirstQuery has to be at least 3 for it to match.
    // Expected to print that sentence with "brown" wrapped in angle brackets,
    // followed by the score in parentheses.
    dumpSpans(new SpanFirstQuery(brown, 3));
}
// SpanNearQuery:功能类似PhraseQuery。SpanNearQuery查找所匹配的不一定是短语,还有可能是另一个SpanQuery的查询结果作为整体考虑,进行嵌套查询。
推荐文章 |
