Tuesday, December 6, 2011

Semantic Web Project Presentation

Here is my Semantic Web Project Presentation for Fall 2011.

Monday, October 31, 2011

WebSphinx and Lucene Example

Here is a short example of how to crawl and index with WebSphinx (a Java-based web crawler) and query the created index with Lucene.  You will need the following Java libraries:

WebSphinx
Lucene

Index.java
import java.net.*;
import java.io.*;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
 
public class Index {
    private static StandardAnalyzer analyzer;
    private static Directory index;

    public static void main(String[] args) {
        try {
            // Set up an Analyzer and prepare a new IndexWriter
            analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);
            index = new RAMDirectory();
            IndexWriter writer = new IndexWriter(index, analyzer, true,
                    IndexWriter.MaxFieldLength.UNLIMITED);
            // The mergeFactor value tells Lucene how many segments of equal size to build
            // before merging them into a single segment
            writer.setMergeFactor(20);
            // Set up a new IndexingCrawler instance and crawl from the root URL
            IndexingCrawler c = new IndexingCrawler(writer, "http://www.ceecs.fau.edu/directory");
            c.run();
            writer.optimize();
            // Close the writer when done
            writer.close();
        } catch (MalformedURLException e) {
            e.printStackTrace(System.out);
        } catch (IOException e) {
            e.printStackTrace(System.out);
        }

        try {
            // Query the created index
            String querystr = args.length > 0 ? args[0] : "Publications OR Research OR Papers";
            // The "contents" arg specifies the default field to use
            // when no field is explicitly specified in the query.
            Query q = new QueryParser(Version.LUCENE_CURRENT, "contents", analyzer)
                    .parse(querystr);

            // Do the actual search
            int hitsPerPage = 10;

            IndexSearcher searcher = new IndexSearcher(index, true);
            TopScoreDocCollector collector = TopScoreDocCollector.create(
                    hitsPerPage, true);
            searcher.search(q, collector);
            ScoreDoc[] hits = collector.topDocs().scoreDocs;

            // Display the results
            System.out.println("\nFound " + hits.length + " hits.");
            for (int i = 0; i < hits.length; ++i) {
                int docId = hits[i].doc;
                Document d = searcher.doc(docId);
                System.out.println((i + 1) + ". " + d.get("title"));
                System.out.println("\t" + d.get("path"));
            }

            // The searcher can only be closed once the documents
            // no longer need to be accessed.
            searcher.close();
        } catch (Exception e) {
            e.printStackTrace(System.out);
        }
    }
}


IndexingCrawler.java
import websphinx.*;
import java.io.*;
import java.net.*;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;

 
public class IndexingCrawler extends Crawler {
 
    private IndexWriter writer;
    public IndexingCrawler(IndexWriter writer, String docroot) {
        super();
        try {
            this.setRoot(new Link(docroot));
        } catch (MalformedURLException e) {
            this.setRoot(null);
        }
        this.writer = writer;
        // Visit pages synchronously and keep the crawl on the starting server
        this.setSynchronous(true);
        this.setDomain(Crawler.SERVER);
    }
 
    // Called by the crawler for every page it downloads
    public void visit(Page p) {
        System.out.println("Visiting [" + p.getURL() + "]");

        if (p.getTitle() == null) {
            // Skip pages without a title (e.g. PDF files)
            noindex(p);
        } else {
            // Index the text of HTML pages
            index(p);
        }

        System.out.println("    Done.");
    }
 
    public void index(Page p) {
        StringBuffer contents = new StringBuffer();
        Document doc = new Document();
        doc.add(new Field("path", p.getURL().toString(), Field.Store.YES, Field.Index.ANALYZED));
        //doc.add(new Field("title", value, Field.Store.YES, Field.Index.ANALYZED));
        //doc.add(Field.Keyword("modified",DateField.timeToString(p.getLastModified())));
 
        if (p.getTitle() != null) {
         doc.add(new Field("title", p.getTitle(), Field.Store.YES, Field.Index.ANALYZED));
        }
 
        System.out.println("    Indexing...");
        System.out.println("        depth [" + p.getDepth() + "]");
        System.out.println("        title [" + p.getTitle() + "]");
        System.out.println("        modified [" + p.getLastModified() + "]");
        Element[] elements = p.getElements();
        for (int i = 0; i < elements.length; i++) {
            if (elements[i].getTagName().equalsIgnoreCase("meta")) {
                String name = elements[i].getHTMLAttribute("name", "");
                String content = elements[i].getHTMLAttribute("content", "");
                if (!name.equals("")) {
                 doc.add(new Field(name, content, Field.Store.YES, Field.Index.ANALYZED));
                    System.out.println("        meta [" + name + ":" + content + "]");
                }
            }
        }
        Text[] texts = p.getWords();
        for (int i = 0; i < texts.length; i++) {
            contents.append(texts[i].toText());
            contents.append(" ");
        }
        doc.add(new Field("contents", contents.toString(), Field.Store.YES, Field.Index.ANALYZED));
        try {
            writer.addDocument(doc);
        } catch (IOException e) {
            throw new RuntimeException(e.toString());
        }
    }
 
    public void noindex(Page p) {
        System.out.println("    Skipping...");
    }
    
}

Sunday, September 18, 2011

Protege Statements

I have used Protege to create a Person class with 10 literals.  Here is the resulting OWL file:

[OWL/XML listing not preserved; only the ten literal values survived: "Neverland, USA", "(555) 555-5551", "peter.pan@disney.com", "Peter", "Peter Pan", "(555) 555-5552", "Pan", "M", "(555) 555-5555", "(555) 555-5553"]

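Since the markup above did not survive, here is a rough Jena sketch that recreates the same kind of individual. The namespace and the property names (fullName, firstName, address, and so on) are placeholders; only the literal values are taken from the original file.

import com.hp.hpl.jena.rdf.model.Model;
import com.hp.hpl.jena.rdf.model.ModelFactory;
import com.hp.hpl.jena.vocabulary.RDF;

public class PersonSketch {

    // Placeholder namespace standing in for the Protege ontology URI
    static final String NS = "http://example.com/person#";

    public static void main(String[] args) {
        Model m = ModelFactory.createDefaultModel();
        m.setNsPrefix("p", NS);

        // One individual of the Person class with ten literal values
        m.createResource(NS + "PeterPan")
                .addProperty(RDF.type, m.createResource(NS + "Person"))
                .addProperty(m.createProperty(NS + "fullName"), "Peter Pan")
                .addProperty(m.createProperty(NS + "firstName"), "Peter")
                .addProperty(m.createProperty(NS + "lastName"), "Pan")
                .addProperty(m.createProperty(NS + "gender"), "M")
                .addProperty(m.createProperty(NS + "address"), "Neverland, USA")
                .addProperty(m.createProperty(NS + "email"), "peter.pan@disney.com")
                .addProperty(m.createProperty(NS + "homePhone"), "(555) 555-5551")
                .addProperty(m.createProperty(NS + "cellPhone"), "(555) 555-5552")
                .addProperty(m.createProperty(NS + "workPhone"), "(555) 555-5553")
                .addProperty(m.createProperty(NS + "fax"), "(555) 555-5555");

        // Print the statements as Turtle
        m.write(System.out, "Turtle");
    }
}
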
Here is a screenshot of a SPARQL query taken in Protege 4.1:
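For reference, an equivalent query can also be run outside of Protege with Jena's ARQ engine. Here is a minimal sketch, assuming the ontology was saved as person.owl and uses the same placeholder namespace and property names as the sketch above:

import com.hp.hpl.jena.query.Query;
import com.hp.hpl.jena.query.QueryExecution;
import com.hp.hpl.jena.query.QueryExecutionFactory;
import com.hp.hpl.jena.query.QueryFactory;
import com.hp.hpl.jena.query.ResultSet;
import com.hp.hpl.jena.query.ResultSetFormatter;
import com.hp.hpl.jena.rdf.model.Model;
import com.hp.hpl.jena.util.FileManager;

public class PersonQuery {

    public static void main(String[] args) {
        // Load the OWL file exported from Protege (the file name is a placeholder)
        Model m = FileManager.get().loadModel("person.owl");

        // Select every Person and its email address; the class and
        // property URIs follow the placeholder namespace used above
        String queryString =
                "PREFIX p: <http://example.com/person#> " +
                "SELECT ?person ?email WHERE { " +
                "  ?person a p:Person . " +
                "  ?person p:email ?email . " +
                "}";

        Query query = QueryFactory.create(queryString);
        QueryExecution qe = QueryExecutionFactory.create(query, m);
        try {
            // Run the SELECT query and print the result table
            ResultSet results = qe.execSelect();
            ResultSetFormatter.out(System.out, results, query);
        } finally {
            qe.close();
        }
    }
}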


Friday, September 2, 2011

Geonames and FOAF RDF

I have edited the map in geonames.org to include my favorite restaurant: Five Guys.

Here is a screenshot:


Secondly, I created a FOAF file from http://www.foaf-project.org/.  Here is a screenshot:


The resulting RDF file is the following:

[RDF/XML listing not preserved; only these values survived: "Hello Foaf", "Mr", "Hello", "Foaf", "Hey", "7cf7f585c62389f71b0a99487c523e183ed93857", "Alex", "c040dca66fbcd9569f4184a7d4b8d2150973744a"]
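Since the generated RDF/XML did not survive either, here is a small Jena sketch that rebuilds a comparable FOAF description from those values. The exact structure of the original file (blank nodes, the foaf:knows link to Alex) is an assumption; only the values come from the listing above.

import com.hp.hpl.jena.rdf.model.Model;
import com.hp.hpl.jena.rdf.model.ModelFactory;
import com.hp.hpl.jena.rdf.model.Property;
import com.hp.hpl.jena.rdf.model.Resource;
import com.hp.hpl.jena.vocabulary.RDF;

public class FoafSketch {

    static final String FOAF = "http://xmlns.com/foaf/0.1/";

    public static void main(String[] args) {
        Model m = ModelFactory.createDefaultModel();
        m.setNsPrefix("foaf", FOAF);

        Property name = m.createProperty(FOAF + "name");
        Property sha1 = m.createProperty(FOAF + "mbox_sha1sum");

        // The friend listed in the FOAF file
        Resource alex = m.createResource()
                .addProperty(RDF.type, m.createResource(FOAF + "Person"))
                .addProperty(name, "Alex")
                .addProperty(sha1, "c040dca66fbcd9569f4184a7d4b8d2150973744a");

        // The person the FOAF file describes
        m.createResource()
                .addProperty(RDF.type, m.createResource(FOAF + "Person"))
                .addProperty(name, "Hello Foaf")
                .addProperty(m.createProperty(FOAF + "title"), "Mr")
                .addProperty(m.createProperty(FOAF + "givenname"), "Hello")
                .addProperty(m.createProperty(FOAF + "family_name"), "Foaf")
                .addProperty(m.createProperty(FOAF + "nick"), "Hey")
                .addProperty(sha1, "7cf7f585c62389f71b0a99487c523e183ed93857")
                .addProperty(m.createProperty(FOAF + "knows"), alex);

        // Write the description back out as RDF/XML
        m.write(System.out, "RDF/XML-ABBREV");
    }
}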


Saturday, August 27, 2011

Setup Semantic Web Development Environment

What you need:
The following links will take you to the download area for each tool.

Java JDK
Eclipse
Jena
Protege
Pellet

Jena Tutorial
The following website will give you a step-by-step tutorial for the Jena library in Eclipse.

http://www.iandickinson.me.uk/articles/jena-eclipse-helloworld/

Note: The following is the completed HelloRDFWorld.java example from the JenaTutorial project.


package tutorial;

import com.hp.hpl.jena.datatypes.xsd.XSDDatatype;
import com.hp.hpl.jena.rdf.model.Model;
import com.hp.hpl.jena.rdf.model.ModelFactory;
import com.hp.hpl.jena.rdf.model.Property;
import com.hp.hpl.jena.rdf.model.Resource;

public class HelloRDFWorld {

	/**
	 * @param args
	 */
	public static void main(String[] args) {
		
		Model m = ModelFactory.createDefaultModel();
		String NS = "http://example.com/test/";
		
		Resource r = m.createResource(NS + "r");
		Property p = m.createProperty(NS + "p");
		
		r.addProperty(p,"hello world", XSDDatatype.XSDstring);
		
		m.write(System.out, "Turtle");
	}

}

The following is a screenshot of the completed HelloSemanticWeb example from Chapter 2 of the book.




What is the Semantic Web?

The Semantic Web was proposed by Tim Berners-Lee, the man behind the World Wide Web.  It is an extension of the web that allows machines to search, aggregate, and combine the information stored on the web.  Today that information is stored in web pages designed to be human readable, so new technologies are required for machine readability.  RDF and XML are used to turn basic web data into data that can be processed by machines.