Apache Lucene is already widely known for its indexing and full-text search capabilities. But when we add Apache Tika, a parsing API from Apache, to the picture, the combination becomes even more powerful: it is easy to parse many document formats such as .doc, .pdf, .zip, .tar, etc., and consequently to create indexes on the contents of those documents through Lucene.
The following example demonstrates how to create Lucene documents from different file formats using the Apache Tika API.
/**
 * Builds a Lucene {@link Document} from a file by extracting its text and
 * metadata with Apache Tika.
 *
 * <p>The parser is chosen from the file name's extension: {@code .pdf} files
 * go through {@code PDFParser}, {@code .zip} files through
 * {@code ZipParser.parseArchive}, and everything else through
 * {@code AutoDetectParser}. The resulting document carries up to three
 * fields: {@code author} (from the "Author" metadata entry), {@code title}
 * (the file's absolute path) and {@code content} (the extracted text).
 *
 * <p>Extraction is best-effort: I/O and parse failures are printed to the
 * standard error stream and the document is still built from whatever was
 * collected before the failure.
 *
 * @param f the file to parse and convert
 * @return the document holding every field that could be created, or
 *         {@code null} if none of the fields could be created
 */
public Document getDocument(File f) {
    ContentHandler textHandler = new BodyContentHandler();
    Metadata metadata = new Metadata();
    FileInputStream fi = null;
    try {
        fi = new FileInputStream(f);
        // Upper-case with a fixed locale: with the default locale, a Turkish
        // environment maps 'i' to a dotted capital I and ".zip" would no
        // longer match ".ZIP".
        String upperName = f.getName().toUpperCase(java.util.Locale.ENGLISH);
        if (upperName.endsWith(".ZIP")) {
            new ZipParser().parseArchive(fi, textHandler, metadata, new ParseContext());
        } else {
            Parser parser;
            if (upperName.endsWith(".PDF")) {
                parser = new PDFParser();
            } else {
                parser = new AutoDetectParser();
            }
            parser.parse(fi, textHandler, metadata, new ParseContext());
        }
    } catch (FileNotFoundException e1) {
        // Best-effort: report and fall through with no extracted content.
        e1.printStackTrace();
    } catch (IOException e1) {
        e1.printStackTrace();
    } catch (SAXException e1) {
        e1.printStackTrace();
    } catch (TikaException e1) {
        e1.printStackTrace();
    } finally {
        // The stream is still null when the file could not be opened; closing
        // in finally also releases it when a parser throws unexpectedly.
        if (fi != null) {
            try {
                fi.close();
            } catch (IOException e1) {
                e1.printStackTrace();
            }
        }
    }

    Field authorfield = getField("author", metadata.get("Author"));
    Field titlefield = getField("title", f.getAbsolutePath());
    Field contentfield = getField("content", textHandler.toString());
    System.err.println("textHandler "+textHandler);

    if (authorfield == null && titlefield == null
            && contentfield == null) {
        return null;
    }

    // getField returns null for missing data, so only add the fields that exist.
    Field[] fields = new Field[] { authorfield, titlefield, contentfield };
    Document doc = new Document();
    for (Field field : fields) {
        if (field != null) {
            doc.add(field);
        }
    }
    return doc;
}
/**
 * Creates a stored, analyzed Lucene field for the given name and value.
 *
 * <p>The field named {@code "content"} is additionally created with term
 * vectors that include positions and offsets; every other field is created
 * without a term-vector argument.
 *
 * @param fname the field name
 * @param data  the field value
 * @return the new field, or {@code null} when either argument is {@code null}
 */
private Field getField(String fname, String data) {
    // Nothing to index without both a name and a value.
    if (fname == null || data == null) {
        return null;
    }

    boolean isContent = "content".equals(fname);
    if (!isContent) {
        return new Field(fname, data, Store.YES, Index.ANALYZED);
    }
    return new Field(fname, data, Field.Store.YES, Index.ANALYZED,
            Field.TermVector.WITH_POSITIONS_OFFSETS);
}
}