Lucene+Sen组合实现简单的查询引擎(支持日语)
http://www.diybl.com/ 2008-1-29 网络 点击:
[ 评论 ]
文章搜索:
【点击打包该文章】
package demo;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.ja.JapaneseAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.RAMDirectory;


public class HelloLucene ...{

private static final String FIELD_CONTENT = "content";
private static final String SEN_XML_PATH = "D:\Java\sen-1.2.2.1\conf\sen.xml";
private static final Directory directory = new RAMDirectory();

private static final Analyzer analyzer = new JapaneseAnalyzer(SEN_XML_PATH);

private static final QueryParser qp = new QueryParser( FIELD_CONTENT, analyzer );




private static final String[] contents = ...{

"カツオはサザエの弟", "サザエはワカメの姉", "ワカメはカツオの妹",

"カツオは長男", "サザエは長女", "ワカメは次女",

"マスオはサザエの夫", "波平は舟の夫", "タラちゃんのパパはマスオ",

"サザエとマスオは夫婦", "波平はタラちゃんの祖父", "舟はカツオの母",

"マスオはカツオの義兄", "カツオはタラちゃんの叔父", "舟はワカメの母"

};




public static void main( String[] args ) throws IOException, ParseException ...{

makeIndex();

backupIndex();

BufferedReader br = new BufferedReader( new InputStreamReader( System.in ) );

String q = null;


while( q == null || !q.equals( "q" ) )...{

System.out.print( " 検索質問(qで終了)> " );

System.out.flush();

q = br.readLine();

if( !q.equals( "q" ) )

searchIndex( q );

}

br.close();

if( directory != null )

directory.close();

}




private static void makeIndex() throws IOException ...{
IndexWriter writer = new IndexWriter( directory, analyzer, true );


for( int i = 0; i < contents.length; i++ )...{

Document doc = new Document();

doc.add( new Field( FIELD_CONTENT, contents[i], Field.Store.YES, Field.Index.TOKENIZED ) );

writer.addDocument( doc );

}

writer.close();

}




private static void searchIndex( final String q ) throws IOException, ParseException ...{

IndexSearcher searcher = new IndexSearcher( directory );

Query query = qp.parse( q );

Hits hits = searcher.search( query );

int length = hits.length();

System.out.println( Integer.toString( length ) + "件ヒットしました。" );


for( int i = 0; i < length; i++ )...{

Document doc = hits.doc( i );

System.out.println( " " + doc.get( FIELD_CONTENT ) );

}

searcher.close();

}



private static void backupIndex() throws IOException...{

Directory persistent = FSDirectory.getDirectory( "index" );

Directory.copy( directory, persistent, false );

persistent.close();

}
}
相关资料:http://ultimania.org/sen/
Perl 5.6 及以上版本:http://www.activestate.com/store/download.aspx?prdGUID=81fbce82-6bd5-49bc-a915-08d58c2648ca
Ant 1.5 及以上版本:http://ant.apache.org/bindownload.cgi
注意:运行 ant 时需用 -Dperl.bin 指定 perl 的路径,例如 ant -Dperl.bin=D:\Java\Perl\bin\perl.exe
下载地址:https://sen.dev.java.net/servlets/ProjectDocumentList?folderID=755&expandFolder=755&folderID=0
代码如下:
package demo;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.ja.JapaneseAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.RAMDirectory;

public class HelloLucene ...{
private static final String FIELD_CONTENT = "content";
private static final String SEN_XML_PATH = "D:\Java\sen-1.2.2.1\conf\sen.xml";
private static final Directory directory = new RAMDirectory();
private static final Analyzer analyzer = new JapaneseAnalyzer(SEN_XML_PATH);
private static final QueryParser qp = new QueryParser( FIELD_CONTENT, analyzer );



private static final String[] contents = ...{
"カツオはサザエの弟", "サザエはワカメの姉", "ワカメはカツオの妹",
"カツオは長男", "サザエは長女", "ワカメは次女",
"マスオはサザエの夫", "波平は舟の夫", "タラちゃんのパパはマスオ",
"サザエとマスオは夫婦", "波平はタラちゃんの祖父", "舟はカツオの母",
"マスオはカツオの義兄", "カツオはタラちゃんの叔父", "舟はワカメの母"
};



public static void main( String[] args ) throws IOException, ParseException ...{
makeIndex();
backupIndex();
BufferedReader br = new BufferedReader( new InputStreamReader( System.in ) );
String q = null;

while( q == null || !q.equals( "q" ) )...{
System.out.print( " 検索質問(qで終了)> " );
System.out.flush();
q = br.readLine();
if( !q.equals( "q" ) )
searchIndex( q );
}
br.close();
if( directory != null )
directory.close();
}



private static void makeIndex() throws IOException ...{
IndexWriter writer = new IndexWriter( directory, analyzer, true );

for( int i = 0; i < contents.length; i++ )...{
Document doc = new Document();
doc.add( new Field( FIELD_CONTENT, contents[i], Field.Store.YES, Field.Index.TOKENIZED ) );
writer.addDocument( doc );
}
writer.close();
}



private static void searchIndex( final String q ) throws IOException, ParseException ...{
IndexSearcher searcher = new IndexSearcher( directory );
Query query = qp.parse( q );
Hits hits = searcher.search( query );
int length = hits.length();
System.out.println( Integer.toString( length ) + "件ヒットしました。" );

for( int i = 0; i < length; i++ )...{
Document doc = hits.doc( i );
System.out.println( " " + doc.get( FIELD_CONTENT ) );
}
searcher.close();
}


private static void backupIndex() throws IOException...{
Directory persistent = FSDirectory.getDirectory( "index" );
Directory.copy( directory, persistent, false );
persistent.close();
}
}参考资料:
http://lucene.jugem.jp/?month=200703
http://www.geocities.jp/another4000yrs/lucene/
http://lucene.jugem.jp/?eid=97
http://www.getopt.org/luke/
中文相关:
http://www.javaeye.com/topic/49441?page=6
http://blog.cnblog.org/archives/2005/07/luceneaecee.html
http://blog.csdn.net/ugg/archive/2007/03/06/1522624.aspx
http://www.ideagrace.com/html/doc/2007/02/07/08630.html
如果图片或页面不能正常显示请点击这里 站内搜索:
推荐文章 |
