Parse a PDF document - Apache PDFBox example

Parse a PDF document - Apache PDFBox example

import java.awt.geom.Rectangle2D;
import java.io.File;
import java.io.IOException;
import java.util.List;

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.interactive.form.PDAcroForm;
import org.apache.pdfbox.pdmodel.interactive.form.PDField;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.PDFTextStripperByArea;


/**
 * Apache PDFBox example that reads a PDF form and prints, to standard output:
 * <ol>
 *   <li>the AcroForm fields (name, value and field type),</li>
 *   <li>the text of page 1, sorted by position,</li>
 *   <li>the text found inside a fixed rectangular region of the first page.</li>
 * </ol>
 */
public class ParsePDF {

	/** Location of the sample PDF, resolved against the working directory. */
	private static final String PDF_PATH = "docs/my_form.pdf";

	/** Name under which the extraction rectangle is registered with the area stripper. */
	private static final String REGION_NAME = "region1";

	/**
	 * Loads the sample PDF and runs the three extraction steps in order.
	 *
	 * @param args ignored
	 */
	public static void main(String[] args) {

		// try-with-resources guarantees the document is closed even when one of
		// the extraction steps throws; the original only closed it on success.
		try (PDDocument doc = PDDocument.load(new File(PDF_PATH))) {
			printFormFields(doc);
			printFirstPageText(doc);
			printRegionText(doc);
		}
		catch (IOException e) {
			e.printStackTrace();
		}

	}

	/**
	 * Prints one line per field returned by the document's AcroForm, in the form
	 * {@code name = value, of type T}. Only the header line is printed when the
	 * document has no AcroForm.
	 *
	 * @param doc the open document
	 */
	private static void printFormFields(PDDocument doc) {
		PDDocumentCatalog catalog = doc.getDocumentCatalog();
		PDAcroForm form = catalog.getAcroForm();

		System.out.println("Form fields --->");
		if (form == null) {
			return;
		}

		// The element type must be spelled out: iterating a raw List as PDField
		// does not compile.
		// NOTE(review): per the PDFBox docs getFields() yields the root fields only;
		// nested child fields would need getFieldTree() - confirm if that matters here.
		List<PDField> fields = form.getFields();
		for (PDField field : fields) {
			String name = field.getFullyQualifiedName();
			String value = field.getValueAsString();
			String type = field.getFieldType();
			System.out.println(name + " = " + value + ", of type " + type);
		}
	}

	/**
	 * Extracts the text of page 1 (start and end page are both set to 1),
	 * sorted by position, and prints it under an "All texts" header.
	 *
	 * @param doc the open document
	 * @throws IOException if text extraction fails
	 */
	private static void printFirstPageText(PDDocument doc) throws IOException {
		PDFTextStripper pdfTextStripper = new PDFTextStripper();
		pdfTextStripper.setSortByPosition(true);
		pdfTextStripper.setStartPage(1);
		pdfTextStripper.setEndPage(1);
		String text = pdfTextStripper.getText(doc);

		System.out.println("");
		System.out.println("All texts --->");
		System.out.println(text);
	}

	/**
	 * Extracts the text inside the rectangle x=0, y=20, width=500, height=100
	 * on the page at index 0 and prints the rectangle followed by that text.
	 *
	 * @param doc the open document
	 * @throws IOException if region extraction fails
	 */
	private static void printRegionText(PDDocument doc) throws IOException {
		System.out.println("");

		PDFTextStripperByArea stripper = new PDFTextStripperByArea();
		stripper.setSortByPosition(true);
		Rectangle2D myArea = new Rectangle2D.Double(0, 20, 500, 100);
		stripper.addRegion(REGION_NAME, myArea);

		PDPage firstPage = doc.getPage(0);
		stripper.extractRegions(firstPage);

		System.out.println("Text in the area:" + myArea);
		System.out.println(stripper.getTextForRegion(REGION_NAME));
	}

}

PDF document parsing result

Form fields --->
name = Your Name, of type Tx
address = , of type Tx
phone =  , of type Tx
sports = Off, of type Btn
cooking = Off, of type Btn
music = Yes, of type Btn
travel = Off, of type Btn
MyRadioButton = male, of type Btn

All texts --->
Registration Form
Name
Address
Phone No
Hobbies
Sports
Cooking
Music
Travelling
Gender
Male
Female
Other


Text in the area:java.awt.geom.Rectangle2D$Double[x=0.0,y=20.0,w=500.0,h=100.0]
Registration Form
Name
Address
Phone No


No comments:

Post a Comment

No junk, please. Try to keep this clean and related to the topic at hand.
Comments are for users to ask questions, collaborate, or improve on the existing content.