Extract Text from Scanned Invoices for Automated Data Entry

Every accounts payable department deals with the same problem: vendors send PDF invoices, but half of them are scanned images. Your accounting software can't read them, so someone has to manually type in the vendor name, invoice number, line items, and total.

This manual data entry is slow, expensive, and error-prone. A single typo in an invoice number can cause payment delays and reconciliation headaches.

The solution is OCR (Optical Character Recognition). The aPDF.io OCR Read API extracts the text from scanned PDFs and returns it page by page as clean JSON, which you can parse and feed directly into your accounting system.

Quick Example

Extract text from a scanned invoice with Node.js:
const axios = require('axios');

// Prefer a token supplied through the APDF_API_TOKEN environment variable so
// real credentials are never committed; the placeholder is only a fallback.
const API_TOKEN = process.env.APDF_API_TOKEN || 'YOUR_API_TOKEN';
// Endpoint that receives the PDF URL and starts the OCR read job.
const API_URL = 'https://apdf.io/api/pdf/ocr/read';
// Endpoint queried with a job id to check whether that job has finished.
const STATUS_URL = 'https://apdf.io/api/job/status/check';

/**
 * Polls the job status endpoint until the job reaches a terminal state.
 *
 * @param {*} jobId - Job identifier, sent as `id` in every status request.
 * @param {number} [maxAttempts=1200] - Maximum number of status requests;
 *   each non-terminal reply is followed by a 2 second pause.
 * @returns {Promise<*>} The `result` field of the first 'successful' reply.
 * @throws {Error} If a reply reports 'failed', or the attempts are used up.
 */
async function waitForJob(jobId, maxAttempts = 1200) {
    const pause = (ms) => new Promise(resolve => setTimeout(resolve, ms));
    let attempt = 0;

    while (attempt < maxAttempts) {
        const { data: job } = await axios.post(
            STATUS_URL,
            { id: jobId },
            { headers: { 'Authorization': `Bearer ${API_TOKEN}` } }
        );

        switch (job.status) {
            case 'successful':
                return job.result;
            case 'failed':
                throw new Error(job.error || 'Job failed');
            default:
                // Any other status means the job is still running.
                await pause(2000);
        }

        attempt += 1;
    }

    throw new Error('Job did not finish in time');
}

/**
 * Starts an OCR read job for the PDF at the given URL and waits for it.
 *
 * @param {string} pdfUrl - URL of the PDF, sent as the `file` field.
 * @returns {Promise<*>} Whatever `waitForJob` resolves with for the job.
 */
async function extractText(pdfUrl) {
    const payload = { file: pdfUrl };
    const requestHeaders = {
        'Authorization': `Bearer ${API_TOKEN}`,
        'Content-Type': 'application/json',
        'Accept': 'application/json'
    };

    // The submit call only returns a job id; the text arrives via polling.
    const { data: submission } = await axios.post(API_URL, payload, { headers: requestHeaders });
    const jobResult = await waitForJob(submission.job_id);
    return jobResult;
}

// Extract text from a scanned invoice and print it page by page.
extractText('https://example.com/scanned-invoice.pdf')
    .then(result => {
        console.log(`Pages: ${result.pages_total}`);
        console.log(`Total characters: ${result.characters_total}`);
        // '\n' is a real newline; the doubled backslash would print a literal "\n".
        console.log('\nExtracted text:');
        for (const page of result.pages) {
            console.log(`\n--- Page ${page.page} ---`);
            console.log(page.content);
        }
    })
    .catch(error => {
        // Without a handler a failed request or job becomes an unhandled rejection.
        console.error(`OCR extraction failed: ${error.message}`);
        process.exitCode = 1;
    });

Understanding the Response

The API returns text organized by page:

{
  "pages_total": 1,
  "characters_total": 847,
  "pages": [
    {
      "page": 1,
      "characters": 847,
      "content": "INVOICE\n\nFrom: Acme Supplies Inc.\n123 Business Street\nNew York, NY 10001\n\nInvoice #: INV-2024-0892\nDate: January 15, 2024\nDue Date: February 15, 2024\n\nBill To:\nTech Solutions Corp\n456 Corporate Ave\nSan Francisco, CA 94102\n\nDescription                    Qty    Price      Total\n---------------------------------------------------------\nOffice Supplies               10    $25.00    $250.00\nPrinter Paper (Box)            5    $45.00    $225.00\nInk Cartridges                 3    $89.00    $267.00\n\n                              Subtotal:        $742.00\n                              Tax (8%):         $59.36\n                              TOTAL:           $801.36"
    }
  ]
}

Real-World Scenario: Invoice Processing Pipeline

You're building an invoice processing system. When scanned invoices arrive, you need to extract key fields and insert them into your database. Here's how:

const axios = require('axios');

// Prefer a token supplied through the APDF_API_TOKEN environment variable so
// real credentials are never committed; the placeholder is only a fallback.
const API_TOKEN = process.env.APDF_API_TOKEN || 'YOUR_API_TOKEN';
// Endpoint that receives the PDF URL and starts the OCR read job.
const API_URL = 'https://apdf.io/api/pdf/ocr/read';
// Endpoint queried with a job id to check whether that job has finished.
const STATUS_URL = 'https://apdf.io/api/job/status/check';

/**
 * Polls the job status endpoint until the job reaches a terminal state.
 *
 * @param {*} jobId - Job identifier, sent as `id` in every status request.
 * @param {number} [maxAttempts=1200] - Maximum number of status requests;
 *   each non-terminal reply is followed by a 2 second pause.
 * @returns {Promise<*>} The `result` field of the first 'successful' reply.
 * @throws {Error} If a reply reports 'failed', or the attempts are used up.
 */
async function waitForJob(jobId, maxAttempts = 1200) {
    const pause = (ms) => new Promise(resolve => setTimeout(resolve, ms));
    let attempt = 0;

    while (attempt < maxAttempts) {
        const { data: job } = await axios.post(
            STATUS_URL,
            { id: jobId },
            { headers: { 'Authorization': `Bearer ${API_TOKEN}` } }
        );

        switch (job.status) {
            case 'successful':
                return job.result;
            case 'failed':
                throw new Error(job.error || 'Job failed');
            default:
                // Any other status means the job is still running.
                await pause(2000);
        }

        attempt += 1;
    }

    throw new Error('Job did not finish in time');
}

/**
 * Starts an OCR read job for the PDF at the given URL and waits for it.
 *
 * @param {string} pdfUrl - URL of the PDF, sent as the `file` field.
 * @returns {Promise<*>} Whatever `waitForJob` resolves with for the job.
 */
async function extractText(pdfUrl) {
    const payload = { file: pdfUrl };
    const requestHeaders = {
        'Authorization': `Bearer ${API_TOKEN}`,
        'Content-Type': 'application/json',
        'Accept': 'application/json'
    };

    // The submit call only returns a job id; the text arrives via polling.
    const { data: submission } = await axios.post(API_URL, payload, { headers: requestHeaders });
    const jobResult = await waitForJob(submission.job_id);
    return jobResult;
}

/**
 * Pulls the key header fields out of OCR'd invoice text using regular
 * expressions. Every field is best-effort and is null when not found.
 *
 * @param {string} text - Full invoice text with real newline separators.
 * @returns {{invoiceNumber: ?string, invoiceDate: ?string, totalAmount: ?number,
 *   vendorName: ?string, rawText: string}} The extracted fields plus the input text.
 */
function parseInvoiceData(text) {
    // Invoice number: optional "#", "No." or "Number" label, then a token on the
    // SAME line that contains at least one digit. This stops a bare "INVOICE"
    // heading from capturing the first word of the following line.
    const invoiceMatch = text.match(
        /Invoice[ \t]*(?:#|No\b\.?|Number\b)?[ \t]*:?[ \t]*([A-Z0-9-]*\d[A-Z0-9-]*)/i
    );
    const invoiceNumber = invoiceMatch ? invoiceMatch[1] : null;

    // Invoice date in "Month D, YYYY" form; the lookbehind skips "Due Date".
    const dateMatch = text.match(/(?<!Due\s+)Date:?\s*(\w+\s+\d{1,2},?\s+\d{4})/i);
    const invoiceDate = dateMatch ? dateMatch[1] : null;

    // Total amount: word boundaries keep "Subtotal" from matching, and every
    // thousands separator is removed before the number is parsed.
    const totalMatch = text.match(/\bTOTAL\b:?\s*\$?(\d[\d,]*(?:\.\d+)?)/i);
    const totalAmount = totalMatch ? parseFloat(totalMatch[1].replace(/,/g, '')) : null;

    // Vendor name: prefer an explicit "From:" line; otherwise fall back to the
    // second non-empty line, which usually sits right under the heading.
    const fromMatch = text.match(/^[ \t]*From:[ \t]*(\S.*)$/im);
    let vendorName = null;
    if (fromMatch) {
        vendorName = fromMatch[1].trim();
    } else {
        const lines = text.split(/\r?\n/).filter(line => line.trim());
        vendorName = lines.length > 1 ? lines[1].trim() : null;
    }

    return {
        invoiceNumber,
        invoiceDate,
        totalAmount,
        vendorName,
        rawText: text
    };
}

/**
 * Runs one scanned invoice through OCR and field extraction.
 *
 * Failures are logged and reported as null rather than thrown, so one bad
 * invoice does not abort a batch.
 *
 * @param {string} pdfUrl - URL of the scanned invoice PDF.
 * @returns {Promise<?Object>} The object returned by `parseInvoiceData`, or
 *   null if any step threw.
 */
async function processInvoice(pdfUrl) {
    console.log(`Processing: ${pdfUrl}`);

    try {
        // Step 1: Extract text via OCR
        const ocrResult = await extractText(pdfUrl);
        // Join pages with a real newline; a literal "\n" would glue the pages
        // together and break line-based parsing.
        const fullText = ocrResult.pages.map(p => p.content).join('\n');

        // Step 2: Parse structured data
        const invoiceData = parseInvoiceData(fullText);

        console.log('\nExtracted Invoice Data:');
        console.log(`  Invoice #: ${invoiceData.invoiceNumber}`);
        console.log(`  Date: ${invoiceData.invoiceDate}`);
        console.log(`  Vendor: ${invoiceData.vendorName}`);
        console.log(`  Total: \$${invoiceData.totalAmount}`);

        // Step 3: In production, save to database
        // await db.invoices.insert(invoiceData);

        return invoiceData;

    } catch (error) {
        console.error(`Failed to process invoice: ${error.message}`);
        return null;
    }
}

/**
 * Processes a batch of scanned invoices one after another.
 *
 * @param {Iterable<string>} invoiceUrls - URLs of the invoice PDFs.
 * @returns {Promise<Object[]>} Data for every invoice that `processInvoice`
 *   handled without returning null, in input order.
 */
async function processBatch(invoiceUrls) {
    const results = [];
    let isFirst = true;

    for (const url of invoiceUrls) {
        // Rate limiting: wait 1 s between invoices, but not after the last one.
        if (!isFirst) {
            await new Promise(resolve => setTimeout(resolve, 1000));
        }
        isFirst = false;

        const data = await processInvoice(url);
        if (data) {
            results.push(data);
        }
    }

    console.log(`\nProcessed ${results.length} invoices successfully`);
    return results;
}

// Example usage
const invoices = [
    'https://your-storage.com/invoices/scan-001.pdf',
    'https://your-storage.com/invoices/scan-002.pdf'
];

// Attach a rejection handler so an unexpected failure is reported instead of
// surfacing as an unhandled promise rejection.
processBatch(invoices).catch(error => {
    console.error(`Batch processing failed: ${error.message}`);
    process.exitCode = 1;
});

Handling Multi-Page Invoices

Some invoices span multiple pages. The API returns text for each page separately, so you can process them individually or combine them:

/**
 * Extracts a multi-page invoice, logs what each page appears to contain, and
 * returns both the per-page data and the combined text.
 *
 * @param {string} pdfUrl - URL of the scanned invoice PDF.
 * @returns {Promise<{pageCount: number, totalChars: number, fullText: string,
 *   pages: Object[]}>} Page count, character count, all page contents joined
 *   with a page-break marker, and the per-page entries from the OCR result.
 */
async function extractMultiPageInvoice(pdfUrl) {
    const result = await extractText(pdfUrl);

    console.log(`Invoice has ${result.pages_total} page(s)`);

    // Option 1: Process each page separately
    for (const page of result.pages) {
        console.log(`\nPage ${page.page} (${page.characters} chars):`);

        // Case-insensitive so "Subtotal", "Total" and "TOTAL" are all detected.
        if (/total/i.test(page.content)) {
            console.log('  -> Contains totals');
        }
        // A line item looks like: quantity, unit price, line total. Commas are
        // allowed so amounts such as $1,250.00 are recognised too.
        if (/\d+\s+\$[\d,.]+\s+\$[\d,.]+/.test(page.content)) {
            console.log('  -> Contains line items');
        }
    }

    // Option 2: Combine all pages into one text block, separated by real
    // newlines around the marker (not literal "\n" sequences).
    const fullText = result.pages
        .map(p => p.content)
        .join('\n\n--- PAGE BREAK ---\n\n');

    return {
        pageCount: result.pages_total,
        totalChars: result.characters_total,
        fullText: fullText,
        pages: result.pages
    };
}

Skip Polling with Webhooks

The OCR endpoint always runs in the background, so the calls above poll the job status until it finishes. If you'd rather not poll, pass a webhook_url in the original request and aPDF.io will POST the same result payload to that URL once the job is done:

// Submit the OCR job with a webhook_url so the result is delivered to that URL
// when the job is done, instead of polling the status endpoint.
// NOTE(review): top-level `await` only works inside an async function or an ES
// module; `pdfUrl` must be defined by the surrounding code.
await axios.post(API_URL, {
    file: pdfUrl,
    // URL that receives the POSTed result payload once the job finishes.
    webhook_url: 'https://your-app.com/webhooks/ocr-complete'
}, {
    headers: {
        'Authorization': `Bearer ${API_TOKEN}`,
        'Content-Type': 'application/json',
        'Accept': 'application/json'
    }
});

Next Steps

Once you're extracting text from scanned documents, consider these enhancements:

  • Search specific terms: Use the OCR Search endpoint to find specific values (like PO numbers) without extracting all text.
  • Create searchable archives: Use the OCR Convert endpoint to add a text layer to scanned PDFs for permanent searchability.
Ready to build?
Get Started for Free