WebSphinx
Lucene
Index.java
import java.net.*; import java.io.*; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.queryParser.QueryParser; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TopScoreDocCollector; import org.apache.lucene.store.Directory; import org.apache.lucene.store.RAMDirectory; import org.apache.lucene.util.Version; public class Index { private static StandardAnalyzer analyzer; private static Directory index; public static void main(String[] args) { try { // Setup an Analyzer and prepare a new IndexWriter analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT); index = new RAMDirectory(); IndexWriter writer = new IndexWriter(index, analyzer, true, IndexWriter.MaxFieldLength.UNLIMITED); // The mergeFactor value tells Lucene how many segments of equal size to build // before merging them into a single segment writer.setMergeFactor(20); // Setup a new IndexCrawler instance IndexingCrawler c = new IndexingCrawler(writer, "http://www.ceecs.fau.edu/directory"); c.run(); writer.optimize(); // Close the writer when done writer.close(); } catch (MalformedURLException e) { e.printStackTrace(System.out); } catch (IOException e) { e.printStackTrace(System.out); } try { // Query the created Index String querystr = args.length > 0 ? args[0] : "Publications OR Research OR Papers"; // The "content" arg specifies the default field to use // when no field is explicitly specified in the query. 
Query q = new QueryParser(Version.LUCENE_CURRENT, "contents", analyzer) .parse(querystr); // Do the actual search int hitsPerPage = 10; IndexSearcher searcher = new IndexSearcher(index, true); TopScoreDocCollector collector = TopScoreDocCollector.create( hitsPerPage, true); searcher.search(q, collector); ScoreDoc[] hits = collector.topDocs().scoreDocs; // Display the results System.out.println("\nFound " + hits.length + " hits."); for (int i = 0; i < hits.length; ++i) { int docId = hits[i].doc; Document d = searcher.doc(docId); System.out.println((i + 1) + ". " + d.get("title")); System.out.println( "\t"+ d.get("path")); } // Searcher can only be closed when there // is no need to access the documents any more. searcher.close(); } catch (Exception e) { // TODO: handle exception } } }
IndexingCrawler.java
import websphinx.*; import java.io.*; import java.net.*; import java.util.regex.*; import java.util.regex.Pattern; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexWriter; public class IndexingCrawler extends Crawler { private IndexWriter writer; public IndexingCrawler(IndexWriter writer, String docroot) { super(); try { this.setRoot(new Link(docroot)); } catch (MalformedURLException e) { this.setRoot(null); } this.writer = writer; this.setSynchronous(true); this.setDomain(Crawler.SERVER); } public void visit(Page p) { boolean index = false; System.out.println("Visiting [" + p.getURL() + "]"); if(p.getTitle() == null){ noindex(p);// skip pdf files }else{ index(p);// process text } System.out.println(" Done."); } public void index(Page p) { StringBuffer contents = new StringBuffer(); Document doc = new Document(); doc.add(new Field("path", p.getURL().toString(), Field.Store.YES, Field.Index.ANALYZED)); //doc.add(new Field("title", value, Field.Store.YES, Field.Index.ANALYZED)); //doc.add(Field.Keyword("modified",DateField.timeToString(p.getLastModified()))); if (p.getTitle() != null) { doc.add(new Field("title", p.getTitle(), Field.Store.YES, Field.Index.ANALYZED)); } System.out.println(" Indexing..."); System.out.println(" depth [" + p.getDepth() + "]"); System.out.println(" title [" + p.getTitle() + "]"); System.out.println(" modified [" + p.getLastModified() + "]"); Element[] elements = p.getElements(); for (int i = 0; i < elements.length; i++) { if (elements[i].getTagName().equalsIgnoreCase("meta")) { String name = elements[i].getHTMLAttribute("name", ""); String content = elements[i].getHTMLAttribute("content", ""); if (!name.equals("")) { doc.add(new Field(name, content, Field.Store.YES, Field.Index.ANALYZED)); System.out.println(" meta [" + name + ":" + content + "]"); } } } Text[] texts = p.getWords(); for (int i = 0; i < texts.length; i++) { contents.append(texts[i].toText()); 
contents.append(" "); } doc.add(new Field("contents", contents.toString(), Field.Store.YES, Field.Index.ANALYZED)); try { writer.addDocument(doc); } catch (IOException e) { throw new RuntimeException(e.toString()); } } public void noindex(Page p) { System.out.println(" Skipping..."); } }
I love you!
ReplyDelete