Automate text field extraction from PDF into an XML file

Use this Custom Job as an easy solution to extract text fields from incoming PDF documents (such as invoices, statements, financial applications, medical records, etc….) and save them in a structured format, for instance an XML file.

Note: This custom job will work with any PDF document (assuming the document contains text, otherwise the PDF will need to be ran through OCR first), it does not require for the document to be an interactive PDF form or contain interactive form fields.

Watching Incoming PDF Files

This Custom Job works with Qoppa’s PDF Automation Server(PAS), an affordable, self-contained solution to automate document workflows. PAS runs on your server and does not require a monthly subscription or per-document fee.

PDF Automation Server is set to watch a folder and process PDF documents as they arrive.

Definition of the Custom Job

Simply add graphical rectangle annotations to a PDF template in order to visually identify the locations of the fields to be extracted.

**Rectangle annotations identify fields to be extracted on a page**

The Custom Job finds the text located in the bounds of each graphical annotation and saved it to an output xml file. For each field, the annotation subject is used as the field name. The output xml is saved to an output folder but could also be emailed, saved to a database, etc…
Fields are saved in a structured format in an xml file
Need to change anything? The beauty of this custom project is that it can be very easily customized to fit your own needs. Contact us if you’d like us to provide you code samples or consulting services to help you with your project.

import com.qoppa.pas.api.*;
import com.qoppa.pdf.*;
import com.qoppa.pdf.errors.*;
import com.qoppa.pdf.dom.*;
import com.qoppa.pdf.form.*;
import com.qoppa.pdf.source.*;
import com.qoppa.pdf.permissions.*;
import com.qoppa.pdfProcess.*;
import com.qoppa.pdf.annotations.*;
import com.qoppa.pdfViewer.actions.*;
 
import java.util.List;
import java.io.PrintWriter;
import java.io.File;
 
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
 
import org.w3c.dom.Document;
import org.w3c.dom.Element;
 
public class Job_0 implements CustomProcess
{
public boolean process(ServerContext serverContext, JobStatus jobStatus, WorkingBundle bundle)
{
// Get the PDFDocument from the WorkingBundle
PDFDocument pdfDoc = bundle.getPDFDocument();
 
try
{
// Get annotations from template PDF
PDFDocument pdfTemplate = new PDFDocument("c:/qoppa/pas/template.pdf", null);
 
// Build the XML DOM
DocumentBuilderFactory docFactory = DocumentBuilderFactory.newInstance();
DocumentBuilder docBuilder = docFactory.newDocumentBuilder();
 
// root elements
Document xmlDocument = docBuilder.newDocument();
Element rootNode = xmlDocument.createElement("invoice");
xmlDocument.appendChild(rootNode);
 
// Loop through pages to extract data
for (int pageIx = 0; pageIx &lt; pdfTemplate.getPageCount() &amp;&amp; pageIx &lt; pdfDoc.getPageCount(); ++pageIx)
{
// Get the annotations for the page
List annots = pdfTemplate.getPage(pageIx).getAnnotations();
 
// Get the input document page
PDFPage docPage = pdfDoc.getPage(pageIx);
 
// Create XML element for this page
Element pageNode = xmlDocument.createElement("page");
 
// Loop through annotations
for (int annotIx = 0; annotIx &lt; annots.size(); ++annotIx)
{
Annotation a = (Annotation)annots.get(annotIx);
String content = docPage.getTextInArea(a.getRectangle()).getText();
 
// Create field element
Element field = xmlDocument.createElement(a.getSubject());
field.appendChild(xmlDocument.createTextNode(content));
pageNode.appendChild(field);
}
 
rootNode.appendChild(pageNode);
}
 
// write the content into xml file
TransformerFactory transformerFactory = TransformerFactory.newInstance();
Transformer transformer = transformerFactory.newTransformer();
DOMSource source = new DOMSource(xmlDocument);
StreamResult outputFile = new StreamResult(File.createTempFile("out", ".xml", new File("c:/qoppa/pas/output")));
transformer.transform(source, outputFile);
}
catch(Throwable t)
{
// nothing for now
}
 
// See the full jPDFProcess API at
// http://www.qoppa.com/files/pdfprocess/guide/javadoc/current/index.html
return true;
}
}

import com.qoppa.pas.api.*; import com.qoppa.pdf.*; import com.qoppa.pdf.errors.*; import com.qoppa.pdf.dom.*; import com.qoppa.pdf.form.*; import com.qoppa.pdf.source.*; import com.qoppa.pdf.permissions.*; import com.qoppa.pdfProcess.*; import com.qoppa.pdf.annotations.*; import com.qoppa.pdfViewer.actions.*; import java.util.List; import java.io.PrintWriter; import java.io.File; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.ParserConfigurationException; import javax.xml.transform.Transformer; import javax.xml.transform.TransformerException; import javax.xml.transform.TransformerFactory; import javax.xml.transform.dom.DOMSource; import javax.xml.transform.stream.StreamResult; import org.w3c.dom.Document; import org.w3c.dom.Element; public class Job_0 implements CustomProcess { public boolean process(ServerContext serverContext, JobStatus jobStatus, WorkingBundle bundle) { // Get the PDFDocument from the WorkingBundle PDFDocument pdfDoc = bundle.getPDFDocument(); try { // Get annotations from template PDF PDFDocument pdfTemplate = new PDFDocument("c:/qoppa/pas/template.pdf", null); // Build the XML DOM DocumentBuilderFactory docFactory = DocumentBuilderFactory.newInstance(); DocumentBuilder docBuilder = docFactory.newDocumentBuilder(); // root elements Document xmlDocument = docBuilder.newDocument(); Element rootNode = xmlDocument.createElement("invoice"); xmlDocument.appendChild(rootNode); // Loop through pages to extract data for (int pageIx = 0; pageIx < pdfTemplate.getPageCount() && pageIx < pdfDoc.getPageCount(); ++pageIx) { // Get the annotations for the page List annots = pdfTemplate.getPage(pageIx).getAnnotations(); // Get the input document page PDFPage docPage = pdfDoc.getPage(pageIx); // Create XML element for this page Element pageNode = xmlDocument.createElement("page"); // Loop through annotations for (int annotIx = 0; annotIx < annots.size(); ++annotIx) { Annotation a = (Annotation)annots.get(annotIx); String content = docPage.getTextInArea(a.getRectangle()).getText(); // Create field element Element field = xmlDocument.createElement(a.getSubject()); field.appendChild(xmlDocument.createTextNode(content)); pageNode.appendChild(field); } rootNode.appendChild(pageNode); } // write the content into xml file TransformerFactory transformerFactory = TransformerFactory.newInstance(); Transformer transformer = transformerFactory.newTransformer(); DOMSource source = new DOMSource(xmlDocument); StreamResult outputFile = new StreamResult(File.createTempFile("out", ".xml", new File("c:/qoppa/pas/output"))); transformer.transform(source, outputFile); } catch(Throwable t) { // nothing for now } // See the full jPDFProcess API at // http://www.qoppa.com/files/pdfprocess/guide/javadoc/current/index.html return true; } }

See our PDF technology in action!

Privacy Policy

Links to Qoppa’s Main Website

Contact Support

Follow Us

Suggested Articles

How to implement your own PDF workflow using custom processes

Related Articles

Custom Job: Clear Digital Signatures in PDF

Split a PDF and email based on text contained in the PDF

Route a PDF document based on file name