import java.awt.geom.Rectangle2D;
import java.io.File;
import java.io.IOException;
import java.util.List;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.interactive.form.PDAcroForm;
import org.apache.pdfbox.pdmodel.interactive.form.PDField;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.PDFTextStripperByArea;
/**
 * Small Apache PDFBox demo that opens a PDF and prints three things to
 * standard output:
 * <ol>
 *   <li>every AcroForm field returned by the form (name, value, field type),</li>
 *   <li>the text of the first page, sorted by position,</li>
 *   <li>the text found inside a fixed rectangle on the first page.</li>
 * </ol>
 */
public class ParsePDF {

    /** PDF that is opened when no path is supplied on the command line. */
    private static final String DEFAULT_PDF_PATH = "docs/my_form.pdf";

    /** Name under which the sample rectangle is registered with the area stripper. */
    private static final String REGION_NAME = "region1";

    /**
     * Entry point.
     *
     * @param args optional; {@code args[0]} is the path of the PDF to parse.
     *             When absent, {@link #DEFAULT_PDF_PATH} is used.
     */
    public static void main(String[] args) {
        String path = (args != null && args.length > 0) ? args[0] : DEFAULT_PDF_PATH;

        // try-with-resources closes the document even when one of the
        // extraction steps throws, which the previous explicit close() at the
        // end of the try body did not do.
        try (PDDocument doc = PDDocument.load(new File(path))) {
            printFormFields(doc);
            printFirstPageText(doc);
            printRegionText(doc);
        }
        catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Prints one line per form field: {@code name = value, of type T}.
     * The header line is printed even when the document has no AcroForm.
     *
     * @param doc the open document
     * @throws IOException if PDFBox fails while reading the form
     */
    private static void printFormFields(PDDocument doc) throws IOException {
        PDDocumentCatalog catalog = doc.getDocumentCatalog();
        PDAcroForm form = catalog.getAcroForm();
        System.out.println("Form fields --->");
        if (form == null) {
            return;
        }
        // Parameterised list: iterating a raw List as PDField does not compile.
        List<PDField> fields = form.getFields();
        for (PDField field : fields) {
            String name = field.getFullyQualifiedName();
            String value = field.getValueAsString();
            String type = field.getFieldType();
            System.out.println(name + " = " + value + ", of type " + type);
        }
    }

    /**
     * Prints the text of page 1 only (start and end page are both set to 1),
     * sorted by position on the page.
     *
     * @param doc the open document
     * @throws IOException if text extraction fails
     */
    private static void printFirstPageText(PDDocument doc) throws IOException {
        PDFTextStripper pdfTextStripper = new PDFTextStripper();
        pdfTextStripper.setSortByPosition(true);
        pdfTextStripper.setStartPage(1);
        pdfTextStripper.setEndPage(1);
        String text = pdfTextStripper.getText(doc);
        System.out.println("");
        System.out.println("All texts --->");
        System.out.println(text);
    }

    /**
     * Prints the text that falls inside the rectangle x=0, y=20, w=500, h=100
     * on the first page of the document.
     *
     * @param doc the open document
     * @throws IOException if region extraction fails
     */
    private static void printRegionText(PDDocument doc) throws IOException {
        System.out.println("");
        PDFTextStripperByArea stripper = new PDFTextStripperByArea();
        stripper.setSortByPosition(true);
        Rectangle2D myArea = new Rectangle2D.Double(0, 20, 500, 100);
        stripper.addRegion(REGION_NAME, myArea);
        PDPage firstPage = doc.getPage(0);
        stripper.extractRegions(firstPage);
        System.out.println("Text in the area:" + myArea);
        System.out.println(stripper.getTextForRegion(REGION_NAME));
    }
}
PDF document parsing result
Form fields --->
name = Your Name, of type Tx
address = , of type Tx
phone = , of type Tx
sports = Off, of type Btn
cooking = Off, of type Btn
music = Yes, of type Btn
travel = Off, of type Btn
MyRadioButton = male, of type Btn
All texts --->
Registration Form
Name
Address
Phone No
Hobbies
Sports
Cooking
Music
Travelling
Gender
Male
Female
Other
Text in the area:java.awt.geom.Rectangle2D$Double[x=0.0,y=20.0,w=500.0,h=100.0]
Registration Form
Name
Address
Phone No
No comments:
Post a Comment
NO JUNK, Please try to keep this clean and related to the topic at hand.
Comments are for users to ask questions, collaborate, or improve on the existing content.