Monday, October 31, 2011

WebSphinx and Lucene Example

Here is a short example of how to crawl and index a site with WebSphinx (a Java-based web crawler) and query the resulting index with Lucene. You will need the following Java libraries:

WebSphinx
Lucene
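
A quick note on versions: the code below was written against the Lucene 3.x API. Later Lucene releases renamed the org.apache.lucene.queryParser package and removed IndexWriter.MaxFieldLength, optimize(), and setMergeFactor(), so you will want a 3.x jar to compile the example as-is.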

Index.java
import java.net.*;
import java.io.*;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
 
public class Index {
    private static StandardAnalyzer analyzer;
    private static Directory index;

    public static void main(String[] args) {
        try {
            // Set up an Analyzer and prepare a new IndexWriter
            analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);
            index = new RAMDirectory();
            IndexWriter writer = new IndexWriter(index, analyzer, true,
                    IndexWriter.MaxFieldLength.UNLIMITED);
            // The mergeFactor value tells Lucene how many segments of equal
            // size to build before merging them into a single segment
            writer.setMergeFactor(20);
            // Set up a new IndexingCrawler instance and crawl from the root URL
            IndexingCrawler c = new IndexingCrawler(writer, "http://www.ceecs.fau.edu/directory");
            c.run();
            // Merge the index into a single segment, then close the writer
            writer.optimize();
            writer.close();
        } catch (MalformedURLException e) {
            e.printStackTrace(System.out);
        } catch (IOException e) {
            e.printStackTrace(System.out);
        }
        
        try {
            // Query the created index
            String querystr = args.length > 0 ? args[0] : "Publications OR Research OR Papers";
            // The "contents" arg specifies the default field to use
            // when no field is explicitly specified in the query.
            Query q = new QueryParser(Version.LUCENE_CURRENT, "contents", analyzer)
                    .parse(querystr);

            // Do the actual search, collecting the top hits
            int hitsPerPage = 10;
            IndexSearcher searcher = new IndexSearcher(index, true);
            TopScoreDocCollector collector = TopScoreDocCollector.create(
                    hitsPerPage, true);
            searcher.search(q, collector);
            ScoreDoc[] hits = collector.topDocs().scoreDocs;

            // Display the results
            System.out.println("\nFound " + hits.length + " hits.");
            for (int i = 0; i < hits.length; ++i) {
                int docId = hits[i].doc;
                Document d = searcher.doc(docId);
                System.out.println((i + 1) + ". " + d.get("title"));
                System.out.println("\t" + d.get("path"));
            }

            // The searcher can only be closed when the documents
            // no longer need to be accessed.
            searcher.close();
        } catch (Exception e) {
            e.printStackTrace(System.out);
        }
    }
}
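
Since the index lives in a RAMDirectory, it disappears as soon as the JVM exits. If you want to keep it around between runs, you can swap in a disk-backed directory instead. Here is a minimal sketch, assuming the Lucene 3.x FSDirectory.open(File) call; the DiskIndex class name and the "crawl-index" path are just placeholders:

import java.io.File;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

public class DiskIndex {
    public static void main(String[] args) throws Exception {
        // Open (or create) an index directory on disk instead of in RAM
        Directory index = FSDirectory.open(new File("crawl-index"));
        StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);
        IndexWriter writer = new IndexWriter(index, analyzer, true,
                IndexWriter.MaxFieldLength.UNLIMITED);

        // Crawl exactly as in Index.java; only the Directory has changed
        IndexingCrawler c = new IndexingCrawler(writer, "http://www.ceecs.fau.edu/directory");
        c.run();
        writer.optimize();
        writer.close();
        index.close();
    }
}

A later run can then open the same directory with new IndexSearcher(FSDirectory.open(new File("crawl-index")), true) and query it without re-crawling.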


IndexingCrawler.java
import websphinx.*;
import java.io.*;
import java.net.*;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;

 
public class IndexingCrawler extends Crawler {

    private IndexWriter writer;

    public IndexingCrawler(IndexWriter writer, String docroot) {
        super();
        try {
            this.setRoot(new Link(docroot));
        } catch (MalformedURLException e) {
            this.setRoot(null);
        }
        this.writer = writer;
        // Visit pages one at a time so the single IndexWriter is
        // never accessed from multiple threads at once
        this.setSynchronous(true);
        // Restrict the crawl to the same server as the root URL
        this.setDomain(Crawler.SERVER);
    }
 
    public void visit(Page p) {
        System.out.println("Visiting [" + p.getURL() + "]");

        if (p.getTitle() == null) {
            noindex(p); // skip non-HTML content such as PDF files
        } else {
            index(p);   // extract and index the page text
        }

        System.out.println("    Done.");
    }
 
    public void index(Page p) {
        StringBuilder contents = new StringBuilder();
        Document doc = new Document();
        // Store the page URL so search results can link back to it
        doc.add(new Field("path", p.getURL().toString(), Field.Store.YES, Field.Index.ANALYZED));

        if (p.getTitle() != null) {
            doc.add(new Field("title", p.getTitle(), Field.Store.YES, Field.Index.ANALYZED));
        }
 
        System.out.println("    Indexing...");
        System.out.println("        depth [" + p.getDepth() + "]");
        System.out.println("        title [" + p.getTitle() + "]");
        System.out.println("        modified [" + p.getLastModified() + "]");
        // Copy each <meta name="..." content="..."> tag into a same-named field
        Element[] elements = p.getElements();
        for (int i = 0; i < elements.length; i++) {
            if (elements[i].getTagName().equalsIgnoreCase("meta")) {
                String name = elements[i].getHTMLAttribute("name", "");
                String content = elements[i].getHTMLAttribute("content", "");
                if (!name.equals("")) {
                    doc.add(new Field(name, content, Field.Store.YES, Field.Index.ANALYZED));
                    System.out.println("        meta [" + name + ":" + content + "]");
                }
            }
        }
        // Concatenate the visible words on the page into the "contents" field
        Text[] texts = p.getWords();
        for (int i = 0; i < texts.length; i++) {
            contents.append(texts[i].toText());
            contents.append(" ");
        }
        doc.add(new Field("contents", contents.toString(), Field.Store.YES, Field.Index.ANALYZED));
        try {
            writer.addDocument(doc);
        } catch (IOException e) {
            // visit() cannot throw checked exceptions, so rethrow unchecked
            throw new RuntimeException(e.toString());
        }
    }
 
    public void noindex(Page p) {
        // Pages without a title (e.g. binary files) are left out of the index
        System.out.println("    Skipping...");
    }
    
}
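
One caveat when testing: with the domain set to Crawler.SERVER, the crawler will walk every page it can reach on the server. While experimenting it helps to bound the crawl; WebSphinx's Crawler exposes a maximum link depth for this (setMaxDepth() below is, as far as I can tell, the stock WebSphinx setter, but double-check it against your copy of the javadoc). A small sketch:

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;

public class DepthLimitedCrawl {
    public static void main(String[] args) throws Exception {
        StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);
        IndexWriter writer = new IndexWriter(new RAMDirectory(), analyzer, true,
                IndexWriter.MaxFieldLength.UNLIMITED);

        IndexingCrawler c = new IndexingCrawler(writer, "http://www.ceecs.fau.edu/directory");
        c.setMaxDepth(2); // only follow links up to two hops from the root
        c.run();
        writer.close();
    }
}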