Forum:

Other Open Source Projects

PDF file indexing and Searching using lucene

Ranch Hand

Posts: 97

posted 16 years ago

Number of slices to send:

Optional 'thank-you' note:

Send

Hi,

this is how i index PDF files...

import java.io.File;
import java.io.FileReader;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.pdfbox.searchengine.lucene.IndexFiles;

public class PDFBoxIndexFiles {
public static void main(String[] args) throws Exception {
IndexFiles indexFiles = new IndexFiles();
indexFiles.index(new File("D:\\testpdf"), true, "D:/pdfindex");

}
}

The below program is to Search the String in the PDF

import java.io.File;
import java.util.Date;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class PDFSearch {
public static void main(String[] args) throws Exception {

File indexDir = new File("d:\\pdfindex");
String q = "Blood Banks";
if (!indexDir.exists() || !indexDir.isDirectory()) {
throw new Exception(indexDir
+ " does not exist or is not a directory.");
}
search(indexDir, q);
}

public static void search(File indexDir, String q) throws Exception {
Directory fsDir = FSDirectory.getDirectory(indexDir, false);
IndexSearcher is = new IndexSearcher(fsDir);
Query query = QueryParser.parse(q, "contents", new StandardAnalyzer());
long start = new Date().getTime();
Hits hits = is.search(query);
long end = new Date().getTime();
System.err.println("Found " + hits.length() + " document(s) (in "
+ (end - start) + " milliseconds) that matched query '" + q
+ "':");
for (int i = 0; i < hits.length(); i++) {
Document doc = hits.doc(i);
System.out.println(doc.get("filename"));

}
}
}

The Output is:

Found 1 document(s) (in 32 milliseconds) that matched query 'Blood Banks':
null //

The name of the File is showing null..... Pls solve the problem

-Regards,
Rodricks

Paul Sturrock

Bartender

Posts: 10336

I like...

posted 16 years ago

Number of slices to send:

Optional 'thank-you' note:

Send

Where do you parse your PDFs before indexing them? IndexFiles is a convenience class (part of the lucene demo) to index text files. PDFs are not texts files.

JavaRanch FAQ HowToAskQuestionsOnJavaRanch

Rodricks george

Ranch Hand

Posts: 97

posted 16 years ago

Number of slices to send:

Optional 'thank-you' note:

Send

Dear Sir,

As you told i have parsed the pdf file using this code....

import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.util.Date;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.pdfbox.cos.COSDocument;
import org.pdfbox.pdfparser.PDFParser;
import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.pdmodel.PDDocumentInformation;
import org.pdfbox.searchengine.lucene.LucenePDFDocument;
import org.pdfbox.util.PDFTextStripper;

public class PDFIndexTest implements DocumentHandler {
//public static String password = "-password";
public PDFIndexTest() {
}

public Document getDocument(InputStream is)
throws DocumentHandlerException {
COSDocument cosDoc = null;
try {
cosDoc = parseDocument(is);
}
catch (IOException e) {
closeCOSDocument(cosDoc);
throw new DocumentHandlerException(
"Cannot parse PDF document", e);
}

// extract PDF document's textual content
String docText = null;
try {
PDFTextStripper stripper = new PDFTextStripper();
docText = stripper.getText(new PDDocument(cosDoc));
}
catch (IOException e) {
closeCOSDocument(cosDoc);
throw new DocumentHandlerException(
"Cannot parse PDF document", e);
}
Document doc = new Document();
if (docText != null) {
doc.add(Field.UnStored("body", docText));
}
// extract PDF document's meta-data
PDDocument pdDoc = null;
try {
PDDocumentInformation docInfo =
pdDoc.getDocumentInformation();
String author = docInfo.getAuthor();
String title = docInfo.getTitle();
String keywords = docInfo.getKeywords();
String summary = docInfo.getSubject();
if ((author != null) && !author.equals("")) {
doc.add(Field.Text("author", author));
}
if ((title != null) && !title.equals("")) {
doc.add(Field.Text("title", title));
}
if ((keywords != null) && !keywords.equals("")) {
doc.add(Field.Text("keywords", keywords));
}
if ((summary != null) && !summary.equals("")) {
doc.add(Field.Text("summary", summary));
}
}
catch (Exception e) {
closeCOSDocument(cosDoc);
closePDDocument(pdDoc);
System.err.println("Cannot get PDF document meta-data: "
+ e.getMessage());
}
return doc;
}
private static COSDocument parseDocument(InputStream is)
throws IOException {
PDFParser parser = new PDFParser(is);
parser.parse();
return parser.getDocument();
}
private void closeCOSDocument(COSDocument cosDoc) {
if (cosDoc != null) {
try {
cosDoc.close();
}
catch (IOException e) {
// eat it, what else can we do?
}
}
}
private void closePDDocument(PDDocument pdDoc) {
if (pdDoc != null) {
try {
pdDoc.close();
}
catch (IOException e) {
// eat it, what else can we do?
}
}
}

//recursive method that calls itself when it finds a directory
private static void indexDirectory(IndexWriter writer, File dir)
throws IOException, DocumentHandlerException {
File[] files = dir.listFiles();
for (int i = 0; i < files.length; i++) {
File f = files[i];
if (f.isDirectory()) {
indexDirectory(writer, f);
} else if (f.getName().endsWith(".pdf")) {
indexFile(writer, f);
}
}
}

private static void indexFile(IndexWriter writer, File f)
throws IOException, DocumentHandlerException {
if (f.isHidden() || !f.exists() || !f.canRead()) {
return;
}
System.out.println("Indexing " + f.getCanonicalPath());
PDFBoxPDFHandler handler = new PDFBoxPDFHandler();
//Document doc = handler.getDocument(new FileInputStream(f));
Document doc = LucenePDFDocument.getDocument(new FileInputStream(f));
writer.addDocument(doc);

//System.out.println(doc.get("filename"));
}

//open an index and start file directory traversal
public static int index(File indexDir, File dataDir)
throws IOException, DocumentHandlerException {
if (!dataDir.exists() || !dataDir.isDirectory()) {
throw new IOException(dataDir
+ " does not exist or is not a directory");
}
IndexWriter writer = new IndexWriter(indexDir,
new StandardAnalyzer(), true);
writer.setUseCompoundFile(false);
indexDirectory(writer, dataDir);
int numIndexed = writer.docCount();
writer.optimize();
writer.close();
return numIndexed;
}

public static void main(String[] args) throws Exception {

File indexDir = new File("d:\\index");
File dataDir = new File("D:\\testpdf");
long start = new Date().getTime();

int numIndexed = index(indexDir, dataDir);
long end = new Date().getTime();
System.out.println("Indexing " + numIndexed + " files took "
+ (end - start) + " milliseconds");

}
}

Still i am not able to see the name of the PDF File..... but searching is done successfully....

Found 2 document(s) (in 31 milliseconds) that matched query 'SQL is a Standard':
null
null

name of the PDF file is coming null

Ulf Dittmer

Rancher

Posts: 43081

posted 16 years ago

Number of slices to send:

Optional 'thank-you' note:

Send

I see no code in the indexer that adds a field called "filename" (which is what you are retrieving from the search result).

Rodricks george

Ranch Hand

Posts: 97

posted 16 years ago

Number of slices to send:

Optional 'thank-you' note:

Send

Sir,

Thank you so much.. i got the result.... i want to get the path of the file and add that while parsing and indexing it .how can i do it....

-Rodricks

Ulf Dittmer

Rancher

Posts: 43081

posted 16 years ago

Number of slices to send:

Optional 'thank-you' note:

Send

i want to get the path of the file and add that while parsing and indexing it .how can i do it....

You need to add a field called "filename" with the file name during indexing. Are you familiar with the Lucene API? If you look at the indexing code you're already using, it should be pretty obvious how to add fields.

The version of the API in that code is a bit dated, though; read up on the various Field.XYZ references - you should use the one called UNTOKENIZED (or something similar).

Rodricks george

Ranch Hand

Posts: 97

posted 16 years ago

Number of slices to send:

Optional 'thank-you' note:

Send

Thank you sir... I will follow

Henry Wong

author

Posts: 23951

142

I like...

posted 16 years ago

Number of slices to send:

Optional 'thank-you' note:

Send

Transfering this topic to the open source forum, of which Lucene is such a project...

Henry

Books: Java Threads, 3rd Edition, Jini in a Nutshell, and Java Gems (contributor)

PramilaT Thakur

Greenhorn

Posts: 8

posted 15 years ago

Number of slices to send:

Optional 'thank-you' note:

Send

Hi Everyone,

I am new to lucene. I need to index some pdf files. I tried using PDFBox and lucene document. But when I try to run the programme it does not run.

I have no idea. I also tried to use the code given @ https://coderanch.com/t/424178/open-source/PDF-file-indexing-Searching-lucene posting even this does not work.
Can anyone help me.

I think it is some version issues. My code is

package org.apache.solr.pdf.test;

import java.io.File;

import org.apache.lucene.index.IndexWriter;
import org.pdfbox.searchengine.lucene.IndexFiles;

public class PDFBoxIndexFiles {

/**
* @param args
*/
public static void main(String[] args)throws Exception {
IndexFiles indexFiles = new IndexFiles();
indexFiles.index(new File("who.pdf"), true, "C:/temp");
}

}

After running I get Exception in thread "main" java.lang.IllegalAccessError: tried to access field org.apache.lucene.index.IndexWriter.maxFieldLength from class org.pdfbox.searchengine.lucene.IndexFiles
at org.pdfbox.searchengine.lucene.IndexFiles.index(IndexFiles.java:158)
at org.apache.solr.pdf.test.PDFBoxIndexFiles.main(PDFBoxIndexFiles.java:15).

I need some pointers please.

thanks

Ulf Dittmer

Rancher

Posts: 43081

posted 15 years ago

Number of slices to send:

Optional 'thank-you' note:

Send

Pramila, can you get the example integration from the PDFBox web site to work? That's where I'd start.

PramilaT Thakur

Greenhorn

Posts: 8

posted 15 years ago

Number of slices to send:

Optional 'thank-you' note:

Send

Hi,

I tried it and after some trial and error I got it working on my local machine as a standalone application.

Now I need to intergrate it with Solr, so that Solr server can do the search from the index files.

I had been reading about solr a lot but it is confusing to me. specially with the SOLR_HOM, solr.solr.home.

If any one has any pointers please help me. Or any mini tutorials.

thanks in advance.

niju shrestha

Greenhorn

Posts: 3

posted 15 years ago

Number of slices to send:

Optional 'thank-you' note:

Send

[Thread hijack removed. Please do not hijack threads, and do not post your question more than once.]
[ December 28, 2008: Message edited by: Bear Bibeault ]

Don't get me started about those stupid light bulbs.