<< back
Mass-converting of HTML documents to a PDF with PD4ML
HTML as a source file format for PDF generation has its specifics. The entire document must be completely read and rendered in RAM before
a PDF output begins.
This is not a problem when you convert regular 5-, 10-, or 100-page PDFs. But if a source HTML document is
voluminous and more than 1000 PDF pages are expected,
converting the document all at once is probably not a good idea from a
RAM-utilization/performance perspective. If the HTML document's structure and layout
allow it, try to represent the HTML as a set of smaller independent parts,
convert them separately to PDFs and merge the resulting PDFs to a single
document.
A typical use case of the approach is the conversion of bank or telecom account
statements to PDF. That type of document can be extremely voluminous, but its
structure is, as a rule, repetitive (a sequence of content portions corresponds to
single pages) and easy to split.
The example below requests a new HTML document with each getNextDocument() call.
It collects a chunk of such documents, converts them to a PDF and keeps the result in a list.
After all the HTMLs are converted, it merges the kept PDFs into a single document.
getNextDocument() in the sample is meant to be overridden with more
practical code.
For example, it could request a new XML data portion from a database, transform it to HTML and return as a StringReader object.
Because the approach deals with relatively independent HTML documents,
parallelizing the conversion and merging procedures is possible and promises performance benefits on multi-CPU workstations/servers.
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Iterator;
import org.zefer.pd4ml.PD4Document;
import org.zefer.pd4ml.PD4ML;
/**
 * Mass-converts a large number of HTML documents to a single PDF with PD4ML.
 *
 * <p>Phase 1 renders the source documents in chunks of {@link #DOCS_PER_CHUNK}
 * to intermediate PDFs; phase 2 pairwise-merges those PDFs until one document
 * remains, writes it to a temp file and opens it in Acrobat Reader.
 */
public class MassConvert {

    // Number of source HTML documents rendered into each intermediate PDF chunk.
    private final static int DOCS_PER_CHUNK = 20;

    // Intermediate per-chunk PDFs (raw PDF bytes) awaiting the merge phase.
    private static ArrayList chunks = new ArrayList();

    /**
     * Returns the URL of the next source HTML document.
     *
     * <p>To be overridden. If source HTML documents are generated on-the-fly,
     * it is probably a good idea to change the signature to
     * {@code public StringReader getNextDocument()} (e.g. request a new XML
     * data portion from a database, transform it to HTML and return it as a
     * StringReader) and to adjust the rest of the class code correspondingly.
     *
     * @throws MalformedURLException if the document URL cannot be built
     */
    public URL getNextDocument() throws MalformedURLException {
        return new URL("file:/O:/work/testarea/AccountStatement.htm");
    }

    /**
     * Returns the total number of source HTML documents to convert.
     * To be overridden.
     */
    public int getDocumentNumber() {
        return 1000;
    }

    public static void main(String[] args) {
        new MassConvert().convert();
    }

    /**
     * Drives the full conversion: chunk rendering, pairwise merging, writing
     * the final PDF to a temp file and launching a viewer on it.
     */
    public void convert() {
        long start = System.currentTimeMillis();
        int docNumber = getDocumentNumber();
        try {
            if (docNumber <= 0) {
                // nothing to render; avoid the misleading "ERROR?" path below
                System.out.println("Nothing to convert");
                return;
            }

            convertChunks(docNumber, start);
            byte[] pdf = mergeChunks(start);

            if (pdf == null) {
                // not likely: merge phase failed to reduce to a single document
                System.out.println("\nERROR?");
                return;
            }

            System.out.println("\ndone in " +
                    (System.currentTimeMillis() - start) / 1000 + "sec");
            System.out.println("Resulting PDF size: " + pdf.length + "bytes");

            File pdfFile = File.createTempFile("merge", ".pdf");
            FileOutputStream fos = new FileOutputStream(pdfFile);
            try {
                fos.write(pdf);
            } finally {
                fos.close(); // always release the file handle, even on write failure
            }

            // Pass the command as an array: both the Reader path ("Program
            // Files (x86)") and the temp path may contain spaces, which the
            // single-string exec() overload would tokenize incorrectly.
            String[] cmd = {
                "C:\\Program Files (x86)\\Adobe\\Reader 11.0\\Reader\\AcroRD32.exe",
                pdfFile.getAbsolutePath()
            };
            Runtime.getRuntime().exec(cmd);
        } catch (Exception ex) {
            ex.printStackTrace();
        }
    }

    /**
     * Phase 1: renders the source documents chunk-by-chunk and stores each
     * chunk's PDF bytes in {@link #chunks}. Prints rough progress every 10%.
     *
     * @param docNumber total number of documents to pull via getNextDocument()
     * @param start     conversion start time, for progress reporting
     */
    private void convertChunks(int docNumber, long start) throws Exception {
        System.out.println("Converting chunks");
        int oldperc = 0;
        // Size the URL buffer to the documents actually available, so a short
        // run (docNumber < DOCS_PER_CHUNK) leaves no null slots for render().
        URL[] urls = new URL[Math.min(DOCS_PER_CHUNK, docNumber)];
        for (int i = 0; i < docNumber; i++) {
            urls[i % DOCS_PER_CHUNK] = getNextDocument();
            int perc = i * 100 / docNumber;
            // Flush the chunk when it is full or this is the last document.
            if ((i + 1) % DOCS_PER_CHUNK == 0 || i == docNumber - 1) {
                PD4ML pd4ml = new PD4ML();
                pd4ml.setHtmlWidth(800);
                ByteArrayOutputStream baos = new ByteArrayOutputStream();
                pd4ml.render(urls, baos);
                chunks.add(baos.toByteArray());
                if (perc / 10 > oldperc / 10) {
                    oldperc = perc;
                    System.out.print((perc / 10) * 10 + "%");
                    System.out.println(" " +
                            (System.currentTimeMillis() - start) / 1000 + "sec");
                }
                // Re-size the buffer to the number of documents remaining, so
                // the final (partial) chunk contains no null URL entries.
                // (The original "DOCS_PER_CHUNK > docNumber - i" test was off
                // by one and could leave a trailing null in the last chunk.)
                int remaining = docNumber - 1 - i;
                urls = new URL[Math.min(DOCS_PER_CHUNK, remaining)];
            }
        }
    }

    /**
     * Phase 2: pairwise-merges the PDFs in {@link #chunks} round after round
     * until a single document remains.
     *
     * @param start conversion start time, for progress reporting
     * @return the merged PDF bytes, or {@code null} if merging did not reduce
     *         the list to exactly one document
     */
    private byte[] mergeChunks(long start) throws Exception {
        System.out.println("Merging " + chunks.size() + " chunks");
        int i = 0;
        ArrayList buf = new ArrayList();
        while (chunks.size() > 1) {
            Iterator ii = chunks.iterator();
            while (ii.hasNext()) {
                byte[] pdf = (byte[]) ii.next();
                ii.remove();
                if (ii.hasNext()) {
                    byte[] pdf2 = (byte[]) ii.next();
                    ii.remove();
                    InputStream is1 = new ByteArrayInputStream(pdf);
                    InputStream is2 = new ByteArrayInputStream(pdf2);
                    ByteArrayOutputStream osMerge = new ByteArrayOutputStream();
                    PD4Document.mergePDFs(is1, is2, osMerge);
                    buf.add(osMerge.toByteArray());
                } else {
                    // Odd chunk out: carry it unmerged into the next round.
                    buf.add(pdf);
                    break;
                }
                System.out.print('.');
                if ((i + 1) % DOCS_PER_CHUNK == 0) {
                    System.out.println(" " +
                            (System.currentTimeMillis() - start) / 1000 + "sec");
                }
                i++;
            }
            chunks = buf;
            buf = new ArrayList();
        }
        return chunks.size() == 1 ? (byte[]) chunks.get(0) : null;
    }
}
|