另外需要说明一点,是关于索引数据分布和更新的问题。基于上述随机选择索引目录的方式,在一定程度上能够将数据均匀地分布到不同的目录中;但是在更新的时候,由于目录是随机选择的,如果处理不当,同一文档可能被再次写入另一个目录,从而造成数据的重复。解决重复的方法就是在外部增加重复检测工作,限制将重复(或非常相似)的文档再次进行索引。
下面我们看一下索引的测试用例,代码如下所示:
- package org.shirdrn.lucene;
- import java.util.HashMap;
- import java.util.Map;
- import junit.framework.TestCase;
- import org.apache.lucene.analysis.Analyzer;
- import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
- import org.apache.lucene.index.IndexWriterConfig;
- import org.apache.lucene.index.IndexWriterConfig.OpenMode;
- import org.apache.lucene.util.Version;
- import org.shirdrn.lucene.MultipleIndexing.MongoConfig;
- public class TestMultipleIndexing extends TestCase {
- MultipleIndexing indexer;
- @Override
- protected void setUp() throws Exception {
- MongoConfig mongoConfig = new MongoConfig("192.168.0.184", 27017, "page", "Article");
- String indexRoot = "E:\\Store\\indexes";
- int maxIndexCommitCount = 200;
- Analyzer a = new SmartChineseAnalyzer(Version.LUCENE_35, true);
- IndexWriterConfig indexConfig = new IndexWriterConfig(Version.LUCENE_35, a);
- indexConfig.setOpenMode(OpenMode.CREATE);
- indexer = new MultipleIndexing(indexRoot, maxIndexCommitCount, mongoConfig, indexConfig);
- }
- @Override
- protected void tearDown() throws Exception {
- super.tearDown();
- }
- public void testIndexing() {
- Map<String, Object> conditions = new HashMap<String, Object>();
- conditions.put("spiderName", "sinaSpider");
- indexer.index(conditions);
- }
- }
搜索实现
在搜索的时候,你可以选择ParallelMultiSearcher或MultiSearcher中的任意一个:MultiSearcher在搜索时通过一个循环依次遍历多个索引来获取检索结果,而ParallelMultiSearcher则是启动多个线程并行执行搜索。二者的效率在不同配置的机器上表现不同,在实际使用的时候可以根据你的需要来决定。我简单地使用了MultiSearcher来构建搜索,实现代码如下所示:
- package org.shirdrn.lucene;
- import java.io.IOException;
- import org.apache.lucene.analysis.Analyzer;
- import org.apache.lucene.index.CorruptIndexException;
- import org.apache.lucene.index.IndexWriterConfig;
- import org.apache.lucene.queryParser.ParseException;
- import org.apache.lucene.queryParser.QueryParser;
- import org.apache.lucene.search.MultiSearcher;
- import org.apache.lucene.search.Query;
- import org.apache.lucene.search.ScoreDoc;
- import org.apache.lucene.search.Searcher;
- import org.apache.lucene.search.TopScoreDocCollector;
- import org.apache.lucene.util.Version;
- import org.shirdrn.lucene.IndexHelper.SearcherHelper;
- import org.slf4j.Logger;
- import org.slf4j.LoggerFactory;
- /**
- * Searching accross multiple Lucene indexes.
- *
- * @author shirdrn
- * @date 2011-12-12
- */
- public class MultipleSearching {
- private static Logger LOG = LoggerFactory.getLogger(MultipleSearching.class);
- private SearcherHelper searcherHelper;
- private Searcher searcher;
- private QueryParser queryParser;
- private IndexWriterConfig indexConfig;
- private Query query;
- private ScoreDoc[] scoreDocs;
- public MultipleSearching(String indexRoot, IndexWriterConfig indexConfig) {
- searcherHelper = IndexHelper.newSearcherHelper(indexRoot, indexConfig);
- this.indexConfig = indexConfig;
- try {
- searcher = new MultiSearcher(searcherHelper.getSearchers());
- searcher.setSimilarity(indexConfig.getSimilarity());
- queryParser = new QueryParser(Version.LUCENE_35, "content", indexConfig.getAnalyzer());
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
- public void search(String queries) {
- try {
- query = queryParser.parse(queries);
- TopScoreDocCollector collector = TopScoreDocCollector.create(100000, true);
- searcher.search(query, collector);
- scoreDocs = collector.topDocs().scoreDocs;
- } catch (ParseException e) {
- e.printStackTrace();
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
- public void iterateDocs(int start, int end) {
- for (int i = start; i < Math.min(scoreDocs.length, end); i++) {
- try {
- LOG.info(searcher.doc(scoreDocs[i].doc).toString());
- } catch (CorruptIndexException e) {
- e.printStackTrace();
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
- }
- public void explain(int start, int end) {
- for (int i = start; i < Math.min(scoreDocs.length, end); i++) {
- try {
- System.out.println(searcher.explain(query, scoreDocs[i].doc));
- } catch (CorruptIndexException e) {
- e.printStackTrace();
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
- }
- public void close() {
- searcherHelper.closeAll();
- try {
- searcher.close();
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
- }
下面给出搜索的测试用例,代码如下所示:
- package org.shirdrn.lucene;
- import org.apache.lucene.analysis.Analyzer;
- import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
- import org.apache.lucene.index.IndexWriterConfig;
- import org.apache.lucene.util.Version;
- import junit.framework.TestCase;
- public class TestMultipleSearching extends TestCase {
- MultipleSearching searcher;
- @Override
- protected void setUp() throws Exception {
- String indexRoot = "E:\\Store\\indexes";
- Analyzer a = new SmartChineseAnalyzer(Version.LUCENE_35, true);
- IndexWriterConfig indexConfig = new IndexWriterConfig(Version.LUCENE_35, a);
- searcher = new MultipleSearching(indexRoot, indexConfig);
- }
- @Override
- protected void tearDown() throws Exception {
- searcher.close();
- }
- public void testSearching() {
- searcher.search("+title:拉斯维加斯^1.25 (+content:美国^1.50 +content:拉斯维加斯)");
- searcher.iterateDocs(0, 10);
- searcher.explain(0, 5);
- }
- }