Indexing web pages with WebSphinx and Lucene

Index.java
import java.net.*;
import java.io.*;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
/**
 * Builds an in-memory Lucene index by crawling a site with {@link IndexingCrawler},
 * then runs a query against it and prints the top hits.
 *
 * Usage: java Index [query]
 * If no query argument is given, a default query is used.
 */
public class Index {
private static StandardAnalyzer analyzer;
private static Directory index;
public static void main(String[] args) {
// Setup an Analyzer and an in-memory index shared by the write and search phases.
analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);
index = new RAMDirectory();
try {
IndexWriter writer = new IndexWriter(index, analyzer, true,
IndexWriter.MaxFieldLength.UNLIMITED);
// The mergeFactor value tells Lucene how many segments of equal size to build
// before merging them into a single segment
writer.setMergeFactor(20);
// Crawl the site; the crawler adds one Lucene Document per visited page.
IndexingCrawler c = new IndexingCrawler(writer, "http://www.ceecs.fau.edu/directory");
c.run();
writer.optimize();
// Close the writer when done so all segments are flushed to the Directory.
writer.close();
} catch (IOException e) {
// MalformedURLException is a subclass of IOException, so one catch covers both.
e.printStackTrace(System.out);
// The index was never built; searching it would be meaningless.
return;
}
try {
// Query the created Index
String querystr = args.length > 0 ? args[0] : "Publications OR Research OR Papers";
// The "contents" arg specifies the default field to use
// when no field is explicitly specified in the query.
Query q = new QueryParser(Version.LUCENE_CURRENT, "contents", analyzer)
.parse(querystr);
// Do the actual search
int hitsPerPage = 10;
IndexSearcher searcher = new IndexSearcher(index, true);
TopScoreDocCollector collector = TopScoreDocCollector.create(
hitsPerPage, true);
searcher.search(q, collector);
ScoreDoc[] hits = collector.topDocs().scoreDocs;
// Display the results
System.out.println("\nFound " + hits.length + " hits.");
for (int i = 0; i < hits.length; ++i) {
int docId = hits[i].doc;
Document d = searcher.doc(docId);
System.out.println((i + 1) + ". " + d.get("title"));
System.out.println( "\t"+ d.get("path"));
}
// Searcher can only be closed when there
// is no need to access the documents any more.
searcher.close();
} catch (Exception e) {
// Was an empty catch block — a parse error or I/O failure would vanish silently.
e.printStackTrace(System.out);
}
}
}
IndexingCrawler.java
import websphinx.*;
import java.io.*;
import java.net.*;
import java.util.regex.*;
import java.util.regex.Pattern;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
/**
 * A WebSphinx {@link Crawler} that feeds every visited page into a Lucene
 * {@link IndexWriter}. Pages without a title (e.g. PDFs) are skipped.
 * Crawling is restricted to the server of the given document root.
 */
public class IndexingCrawler extends Crawler {
private IndexWriter writer;
/**
 * @param writer  open IndexWriter that receives one Document per indexed page
 * @param docroot start URL for the crawl
 */
public IndexingCrawler(IndexWriter writer, String docroot) {
super();
try {
this.setRoot(new Link(docroot));
} catch (MalformedURLException e) {
// Deliberate best-effort: a bad root URL yields an empty crawl rather than a crash.
this.setRoot(null);
}
this.writer = writer;
this.setSynchronous(true);
this.setDomain(Crawler.SERVER);
}
/** Called by the crawler for each fetched page; dispatches to index() or noindex(). */
public void visit(Page p) {
System.out.println("Visiting [" + p.getURL() + "]");
if (p.getTitle() == null) {
noindex(p); // skip pdf files
} else {
index(p); // process text
}
System.out.println(" Done.");
}
/** Extracts path, title, meta tags, and word content from the page and adds it to the index. */
public void index(Page p) {
StringBuilder contents = new StringBuilder();
Document doc = new Document();
doc.add(new Field("path", p.getURL().toString(), Field.Store.YES, Field.Index.ANALYZED));
if (p.getTitle() != null) {
doc.add(new Field("title", p.getTitle(), Field.Store.YES, Field.Index.ANALYZED));
}
System.out.println(" Indexing...");
System.out.println(" depth [" + p.getDepth() + "]");
System.out.println(" title [" + p.getTitle() + "]");
System.out.println(" modified [" + p.getLastModified() + "]");
// Promote each <meta name=... content=...> tag to its own searchable field.
Element[] elements = p.getElements();
for (int i = 0; i < elements.length; i++) {
if (elements[i].getTagName().equalsIgnoreCase("meta")) {
String name = elements[i].getHTMLAttribute("name", "");
String content = elements[i].getHTMLAttribute("content", "");
if (!name.equals("")) {
doc.add(new Field(name, content, Field.Store.YES, Field.Index.ANALYZED));
System.out.println(" meta [" + name + ":" + content + "]");
}
}
}
// Concatenate every word on the page into the default "contents" search field.
Text[] texts = p.getWords();
for (int i = 0; i < texts.length; i++) {
contents.append(texts[i].toText());
contents.append(" ");
}
doc.add(new Field("contents", contents.toString(), Field.Store.YES, Field.Index.ANALYZED));
try {
writer.addDocument(doc);
} catch (IOException e) {
// Preserve the cause chain instead of flattening it to a String.
throw new RuntimeException(e);
}
}
/** Called for pages that cannot be indexed (no title, e.g. binary documents). */
public void noindex(Page p) {
System.out.println(" Skipping...");
}
}
I love you!
ReplyDelete