网站首页 新闻首页 网页设计图形动画软件编程网站开发办公软件操作系统数据库网络技术认证考试范文资料黑客攻防 书籍教程 进入论坛

lucene的多种搜索2-SpanQuery

http://www.diybl.com/ 2008-1-11  网络 点击:  [ 评论 ]
文章搜索:    【点击打包该文章】

导读:
  SpanQuery(词距查询)根据词在文章中的位置和相互距离进行查询,可用于查找几个相邻或相隔一定距离的词。
  
  SpanQuery包括以下几种:
  SpanTermQuery:词距查询的基础,结果和TermQuery相似,只不过是增加了查询结果中单词的距离信息。
  SpanFirstQuery:在指定距离可以找到第一个单词的查询。
  SpanNearQuery:查询的几个语句之间保持着一定的距离。
  SpanOrQuery:同时查询几个词距查询,结果为这些查询结果的并集。
  SpanNotQuery:从一个词距查询结果中,去除一个词距查询。
  下面一个简单例子介绍
  package com;
  //SpanQuery:跨度查询。此类为抽象类。
  import java.io.IOException;
  import java.io.StringReader;
  import java.util.ArrayList;
  import java.util.List;
  import org.apache.lucene.analysis.Analyzer;
  import org.apache.lucene.analysis.Token;
  import org.apache.lucene.analysis.TokenStream;
  import org.apache.lucene.analysis.WhitespaceAnalyzer;
  import org.apache.lucene.document.Document;
  import org.apache.lucene.document.Field;
  import org.apache.lucene.document.Field.Index;
  import org.apache.lucene.document.Field.Store;
  import org.apache.lucene.index.IndexReader;
  import org.apache.lucene.index.IndexWriter;
  import org.apache.lucene.index.Term;
  import org.apache.lucene.search.Hits;
  import org.apache.lucene.search.IndexSearcher;
  import org.apache.lucene.search.spans.SpanFirstQuery;
  import org.apache.lucene.search.spans.SpanNearQuery;
  import org.apache.lucene.search.spans.SpanNotQuery;
  import org.apache.lucene.search.spans.SpanOrQuery;
  import org.apache.lucene.search.spans.SpanQuery;
  import org.apache.lucene.search.spans.SpanTermQuery;
  import org.apache.lucene.search.spans.Spans;
  import org.apache.lucene.store.RAMDirectory;
  public class SpanQueryTest {
  // Directory holding the example index; assigned in index().
  private RAMDirectory directory;
  // Searcher over `directory`, used by dumpSpans() to obtain hits and scores.
  private IndexSearcher indexSearcher;
  // Reader over `directory`, used by dumpSpans() to enumerate spans and load documents.
  private IndexReader reader;
  // One SpanTermQuery per word of interest, each built in index() against the
  // field named "field" with the term equal to the variable's name.
  private SpanTermQuery quick;
  private SpanTermQuery brown;
  private SpanTermQuery red;
  private SpanTermQuery fox;
  private SpanTermQuery lazy;
  private SpanTermQuery sleepy;
  private SpanTermQuery dog;
  private SpanTermQuery cat;
  // Analyzer used both for indexing (index()) and for re-tokenizing stored text (dumpSpans()).
  private Analyzer analyzer;
  
  // 索引及初始化
  /**
   * Builds the two-document example index and initializes every field of this class.
   *
   * <p>Both documents store and tokenize a single field named {@code "field"}:
   * <ul>
   *   <li>{@code "the quick brown fox jumps over the lazy dog"}</li>
   *   <li>{@code "the quick red fox jumps over the sleepy cat"}</li>
   * </ul>
   * After indexing, one {@link SpanTermQuery} per word of interest is created and
   * a searcher and a reader are opened on the index.
   *
   * @throws IOException if writing the index or opening the searcher/reader fails
   */
  public void index() throws IOException {
      directory = new RAMDirectory();
      analyzer = new WhitespaceAnalyzer();

      IndexWriter writer = new IndexWriter(directory, analyzer, true);
      try {
          Document doc1 = new Document();
          doc1.add(new Field("field",
                  "the quick brown fox jumps over the lazy dog", Store.YES,
                  Index.TOKENIZED));

          Document doc2 = new Document();
          doc2.add(new Field("field",
                  "the quick red fox jumps over the sleepy cat", Store.YES,
                  Index.TOKENIZED));

          writer.addDocument(doc1);
          writer.addDocument(doc2);
          writer.optimize();
      } finally {
          // Always release the writer, even when adding or optimizing fails.
          writer.close();
      }

      quick = new SpanTermQuery(new Term("field", "quick"));
      brown = new SpanTermQuery(new Term("field", "brown"));
      red = new SpanTermQuery(new Term("field", "red"));
      fox = new SpanTermQuery(new Term("field", "fox"));
      lazy = new SpanTermQuery(new Term("field", "lazy"));
      sleepy = new SpanTermQuery(new Term("field", "sleepy"));
      dog = new SpanTermQuery(new Term("field", "dog"));
      cat = new SpanTermQuery(new Term("field", "cat"));

      // Open the searcher and the reader only after the writer has been closed.
      indexSearcher = new IndexSearcher(directory);
      reader = IndexReader.open(directory);
  }
  private void dumpSpans(SpanQuery query) throws IOException {
  // 检索效果和TermQuery一样,可以把他当成TermQuery
  Hits hits = indexSearcher.search(query);
  for (int i = 0; i   // System.out.println(hits.doc(i).get("field"));
  }
  // 但内部会记录一些位置信息,供SpanQuery的其它API使用,是其它属于SpanQuery的Query的基础。
  Spans spans = query.getSpans(reader);
  int numSpans = 0;
  float[] scores = new float[2];
  for (int i = 0; i   scores[hits.id(i)] = hits.score(i);
  }
  while (spans.next()) {
  numSpans++;
  int id = spans.doc();
  Document doc = reader.document(id);
  Token[] tokens = AnalyzerUtils.tokensFromAnalysis(analyzer, doc
  .get("field"));
  StringBuffer buffer = new StringBuffer();
  for (int i = 0; i   // the quick brown fox jumps over the lazy dog
  // spans记录了位置信息,比如搜索brown,brown在这句话中位于第三个位置,所以spans.start()=2,spans.end()=3
  // 在第二项的位置后加<,第三项后加> 返回
  if (i == spans.start()) {
  buffer.append("<");
  }
  buffer.append(tokens[i].termText());
  if (i + 1 == spans.end()) {
  buffer.append(">");
  }
  buffer.append(" ");
  }
  buffer.append("(" + scores[id] + ") ");
  System.out.println(buffer);
  }
  // indexSearcher.close();
  }
  // SpanTermQuery:检索效果完全同TermQuery,但内部会记录一些位置信息,供SpanQuery的其它API使用,是其它属于SpanQuery的Query的基础。
  /**
   * Dumps the spans of the plain term query for "brown".
   *
   * <p>Output recorded by the original author:
   * <pre>
   * the quick &lt;brown&gt; fox jumps over the lazy dog (0.22097087)
   * </pre>
   *
   * @throws IOException if searching or reading the index fails
   */
  public void spanTermQueryTest() throws IOException {
      final SpanQuery termQuery = brown;
      dumpSpans(termQuery);
  }
  // SpanFirstQuery:查找方式为从Field的内容起始位置开始,在一个固定的宽度内查找所指定的词条。
  /**
   * Dumps the spans of "brown" restricted to the first three positions of the field.
   *
   * <p>In "the quick brown fox jumps over the lazy dog" the first two tokens are
   * "the quick" and "brown" is the third, so the end limit passed to
   * {@link SpanFirstQuery} must be at least 3 for the term to be found.
   *
   * <p>Output recorded by the original author:
   * <pre>
   * the quick &lt;brown&gt; fox jumps over the lazy dog (0.22097087)
   * </pre>
   *
   * @throws IOException if searching or reading the index fails
   */
  public void spanFirstQueryTest() throws IOException {
      dumpSpans(new SpanFirstQuery(brown, 3));
  }
  // SpanNearQuery:功能类似PhraseQuery。SpanNearQuery查找所匹配的不一定是短语,还有可能是另一个SpanQuery的查询结果作为整体考虑,进行嵌套查询。

文章整理:DIY部落 http://www.diybl.com (本站)   【点击打包该文章】
[1] [2]
如果图片或页面不能正常显示请点击这里 站内搜索:   

文章评论

请您留言