Tuesday, December 6, 2011

Semantic Web Project Presentation

Here is my Semantic Web Project Presentation for Fall 2011.

Monday, October 31, 2011

WebSphinx and Lucene Example

Here is a short example of how to crawl and index with WebSphinx (a Java-based web crawler) and query the created index with Lucene.  You will need the following Java libraries:

WebSphinx
Lucene

Index.java
import java.net.*;
import java.io.*;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
 
public class Index {
    private static StandardAnalyzer analyzer;
    private static Directory index;

    public static void main(String[] args) {
        try {
            // Set up an Analyzer and prepare a new IndexWriter
            analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);
            index = new RAMDirectory();
            IndexWriter writer = new IndexWriter(index, analyzer, true,
                    IndexWriter.MaxFieldLength.UNLIMITED);
            // The mergeFactor value tells Lucene how many segments of equal size to build
            // before merging them into a single segment
            writer.setMergeFactor(20);
            // Set up a new IndexingCrawler instance and crawl from the root URL
            IndexingCrawler c = new IndexingCrawler(writer, "http://www.ceecs.fau.edu/directory");
            c.run();
            writer.optimize();
            // Close the writer when done
            writer.close();
        } catch (MalformedURLException e) {
            e.printStackTrace(System.out);
        } catch (IOException e) {
            e.printStackTrace(System.out);
        }

        try {
            // Query the created index
            String querystr = args.length > 0 ? args[0] : "Publications OR Research OR Papers";
            // The "contents" arg specifies the default field to use
            // when no field is explicitly specified in the query.
            Query q = new QueryParser(Version.LUCENE_CURRENT, "contents", analyzer)
                    .parse(querystr);

            // Do the actual search
            int hitsPerPage = 10;

            IndexSearcher searcher = new IndexSearcher(index, true);
            TopScoreDocCollector collector = TopScoreDocCollector.create(
                    hitsPerPage, true);
            searcher.search(q, collector);
            ScoreDoc[] hits = collector.topDocs().scoreDocs;

            // Display the results
            System.out.println("\nFound " + hits.length + " hits.");
            for (int i = 0; i < hits.length; ++i) {
                int docId = hits[i].doc;
                Document d = searcher.doc(docId);
                System.out.println((i + 1) + ". " + d.get("title"));
                System.out.println("\t" + d.get("path"));
            }

            // The searcher can only be closed once the documents
            // no longer need to be accessed.
            searcher.close();
        } catch (Exception e) {
            e.printStackTrace(System.out);
        }
    }
}


IndexingCrawler.java
import websphinx.*;
import java.io.*;
import java.net.*;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;

 
public class IndexingCrawler extends Crawler {
 
    private IndexWriter writer;
    public IndexingCrawler(IndexWriter writer, String docroot) {
        super();
        try {
            this.setRoot(new Link(docroot));
        } catch (MalformedURLException e) {
            this.setRoot(null);
        }
        this.writer = writer;
        // Visit pages synchronously and keep the crawl on the starting server
        this.setSynchronous(true);
        this.setDomain(Crawler.SERVER);
    }
 
    // Called by the crawler for every page it downloads
    public void visit(Page p) {
        System.out.println("Visiting [" + p.getURL() + "]");

        if (p.getTitle() == null) {
            // Skip pages without a title (e.g. PDF files)
            noindex(p);
        } else {
            // Index the text of HTML pages
            index(p);
        }

        System.out.println("    Done.");
    }
 
    public void index(Page p) {
        StringBuffer contents = new StringBuffer();
        Document doc = new Document();
        doc.add(new Field("path", p.getURL().toString(), Field.Store.YES, Field.Index.ANALYZED));
        //doc.add(new Field("title", value, Field.Store.YES, Field.Index.ANALYZED));
        //doc.add(Field.Keyword("modified",DateField.timeToString(p.getLastModified())));
 
        if (p.getTitle() != null) {
         doc.add(new Field("title", p.getTitle(), Field.Store.YES, Field.Index.ANALYZED));
        }
 
        System.out.println("    Indexing...");
        System.out.println("        depth [" + p.getDepth() + "]");
        System.out.println("        title [" + p.getTitle() + "]");
        System.out.println("        modified [" + p.getLastModified() + "]");
        Element[] elements = p.getElements();
        for (int i = 0; i < elements.length; i++) {
            if (elements[i].getTagName().equalsIgnoreCase("meta")) {
                String name = elements[i].getHTMLAttribute("name", "");
                String content = elements[i].getHTMLAttribute("content", "");
                if (!name.equals("")) {
                 doc.add(new Field(name, content, Field.Store.YES, Field.Index.ANALYZED));
                    System.out.println("        meta [" + name + ":" + content + "]");
                }
            }
        }
        Text[] texts = p.getWords();
        for (int i = 0; i < texts.length; i++) {
            contents.append(texts[i].toText());
            contents.append(" ");
        }
        doc.add(new Field("contents", contents.toString(), Field.Store.YES, Field.Index.ANALYZED));
        try {
            writer.addDocument(doc);
        } catch (IOException e) {
            throw new RuntimeException(e.toString());
        }
    }
 
    public void noindex(Page p) {
        System.out.println("    Skipping...");
    }
    
}

Sunday, September 18, 2011

Protege Statements

I have used Protege to create a Person class with 10 literals.  Here is the resulting OWL file:

[OWL/XML listing not preserved; only the ten literal values survived: "Neverland, USA", "(555) 555-5551", "peter.pan@disney.com", "Peter", "Peter Pan", "(555) 555-5552", "Pan", "M", "(555) 555-5555", "(555) 555-5553"]

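Since the markup above did not survive, here is a rough Jena sketch that recreates the same kind of individual. The namespace and the property names (fullName, firstName, address, and so on) are placeholders; only the literal values are taken from the original file.

import com.hp.hpl.jena.rdf.model.Model;
import com.hp.hpl.jena.rdf.model.ModelFactory;
import com.hp.hpl.jena.vocabulary.RDF;

public class PersonSketch {

    // Placeholder namespace standing in for the Protege ontology URI
    static final String NS = "http://example.com/person#";

    public static void main(String[] args) {
        Model m = ModelFactory.createDefaultModel();
        m.setNsPrefix("p", NS);

        // One individual of the Person class with ten literal values
        m.createResource(NS + "PeterPan")
                .addProperty(RDF.type, m.createResource(NS + "Person"))
                .addProperty(m.createProperty(NS + "fullName"), "Peter Pan")
                .addProperty(m.createProperty(NS + "firstName"), "Peter")
                .addProperty(m.createProperty(NS + "lastName"), "Pan")
                .addProperty(m.createProperty(NS + "gender"), "M")
                .addProperty(m.createProperty(NS + "address"), "Neverland, USA")
                .addProperty(m.createProperty(NS + "email"), "peter.pan@disney.com")
                .addProperty(m.createProperty(NS + "homePhone"), "(555) 555-5551")
                .addProperty(m.createProperty(NS + "cellPhone"), "(555) 555-5552")
                .addProperty(m.createProperty(NS + "workPhone"), "(555) 555-5553")
                .addProperty(m.createProperty(NS + "fax"), "(555) 555-5555");

        // Print the statements as Turtle
        m.write(System.out, "Turtle");
    }
}
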
Here is a screenshot of a SPARQL query taken in Protege 4.1:
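For reference, an equivalent query can also be run outside of Protege with Jena's ARQ engine. Here is a minimal sketch, assuming the ontology was saved as person.owl and uses the same placeholder namespace and property names as the sketch above:

import com.hp.hpl.jena.query.Query;
import com.hp.hpl.jena.query.QueryExecution;
import com.hp.hpl.jena.query.QueryExecutionFactory;
import com.hp.hpl.jena.query.QueryFactory;
import com.hp.hpl.jena.query.ResultSet;
import com.hp.hpl.jena.query.ResultSetFormatter;
import com.hp.hpl.jena.rdf.model.Model;
import com.hp.hpl.jena.util.FileManager;

public class PersonQuery {

    public static void main(String[] args) {
        // Load the OWL file exported from Protege (the file name is a placeholder)
        Model m = FileManager.get().loadModel("person.owl");

        // Select every Person and its email address; the class and
        // property URIs follow the placeholder namespace used above
        String queryString =
                "PREFIX p: <http://example.com/person#> " +
                "SELECT ?person ?email WHERE { " +
                "  ?person a p:Person . " +
                "  ?person p:email ?email . " +
                "}";

        Query query = QueryFactory.create(queryString);
        QueryExecution qe = QueryExecutionFactory.create(query, m);
        try {
            // Run the SELECT query and print the result table
            ResultSet results = qe.execSelect();
            ResultSetFormatter.out(System.out, results, query);
        } finally {
            qe.close();
        }
    }
}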


Friday, September 2, 2011

Geonames and FOAF RDF

I have edited the map in geonames.org to include my favorite restaurant: Five Guys.

Here is a screenshot:


Secondly, I created a FOAF file from http://www.foaf-project.org/.  Here is a screenshot:


The resulting RDF file is the following:

[RDF/XML listing not preserved; only these values survived: "Hello Foaf", "Mr", "Hello", "Foaf", "Hey", "7cf7f585c62389f71b0a99487c523e183ed93857", "Alex", "c040dca66fbcd9569f4184a7d4b8d2150973744a"]
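Since the generated RDF/XML did not survive either, here is a small Jena sketch that rebuilds a comparable FOAF description from those values. The exact structure of the original file (blank nodes, the foaf:knows link to Alex) is an assumption; only the values come from the listing above.

import com.hp.hpl.jena.rdf.model.Model;
import com.hp.hpl.jena.rdf.model.ModelFactory;
import com.hp.hpl.jena.rdf.model.Property;
import com.hp.hpl.jena.rdf.model.Resource;
import com.hp.hpl.jena.vocabulary.RDF;

public class FoafSketch {

    static final String FOAF = "http://xmlns.com/foaf/0.1/";

    public static void main(String[] args) {
        Model m = ModelFactory.createDefaultModel();
        m.setNsPrefix("foaf", FOAF);

        Property name = m.createProperty(FOAF + "name");
        Property sha1 = m.createProperty(FOAF + "mbox_sha1sum");

        // The friend listed in the FOAF file
        Resource alex = m.createResource()
                .addProperty(RDF.type, m.createResource(FOAF + "Person"))
                .addProperty(name, "Alex")
                .addProperty(sha1, "c040dca66fbcd9569f4184a7d4b8d2150973744a");

        // The person the FOAF file describes
        m.createResource()
                .addProperty(RDF.type, m.createResource(FOAF + "Person"))
                .addProperty(name, "Hello Foaf")
                .addProperty(m.createProperty(FOAF + "title"), "Mr")
                .addProperty(m.createProperty(FOAF + "givenname"), "Hello")
                .addProperty(m.createProperty(FOAF + "family_name"), "Foaf")
                .addProperty(m.createProperty(FOAF + "nick"), "Hey")
                .addProperty(sha1, "7cf7f585c62389f71b0a99487c523e183ed93857")
                .addProperty(m.createProperty(FOAF + "knows"), alex);

        // Write the description back out as RDF/XML
        m.write(System.out, "RDF/XML-ABBREV");
    }
}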


Saturday, August 27, 2011

Setup Semantic Web Development Environment

What you need:
The following links will take you to the download area for each tool.

Java JDK
Eclipse
Jena
Protege
Pellet

Jena Tutorial
The following website will give you a step-by-step tutorial for the Jena library in Eclipse.

http://www.iandickinson.me.uk/articles/jena-eclipse-helloworld/

Note: The following is the completed HelloRDFWorld.java example from the JenaTutorial project.


package tutorial;

import com.hp.hpl.jena.datatypes.xsd.XSDDatatype;
import com.hp.hpl.jena.rdf.model.Model;
import com.hp.hpl.jena.rdf.model.ModelFactory;
import com.hp.hpl.jena.rdf.model.Property;
import com.hp.hpl.jena.rdf.model.Resource;

public class HelloRDFWorld {

	/**
	 * @param args
	 */
	public static void main(String[] args) {
		
		Model m = ModelFactory.createDefaultModel();
		String NS = "http://example.com/test/";
		
		Resource r = m.createResource(NS + "r");
		Property p = m.createProperty(NS + "p");
		
		r.addProperty(p,"hello world", XSDDatatype.XSDstring);
		
		m.write(System.out, "Turtle");
	}

}

The following is a screenshot of the completed HelloSemanticWeb example from Chapter 2 of the book.




What is the Semantic Web?

The Semantic Web was proposed by Tim Berners-Lee, the man behind the World Wide Web.  It is an extension of the web that allows machines to search, aggregate, and combine the information stored on the web.  Today that information is stored in web pages designed to be human readable, so new technologies are required for machine readability.  RDF and XML are used to turn basic web data into data that can be processed by machines.