Extract Text from Scanned Invoices for Automated Data Entry

Every accounts payable department deals with the same problem: vendors send PDF invoices, but half of them are scanned images. Your accounting software can't read them, so someone has to manually type in the vendor name, invoice number, line items, and total.

This manual data entry is slow, expensive, and error-prone. A single typo in an invoice number can cause payment delays and reconciliation headaches.

The solution is OCR (Optical Character Recognition). The aPDF.io OCR Read API extracts the text from scanned PDFs and returns it page by page as JSON, which you can parse into structured fields and feed directly into your accounting system.

Quick Example

Extract text from a scanned invoice with Node.js:
const axios = require('axios');

// Placeholder API token; it is sent as a Bearer token in the Authorization header.
const API_TOKEN = 'YOUR_API_TOKEN';
// Endpoint that extractText() posts the PDF URL to.
const API_URL = 'https://apdf.io/api/pdf/ocr/read';

/**
 * Sends a PDF URL to the OCR endpoint and resolves with the response body.
 *
 * @param {string} pdfUrl - URL of the PDF, posted as the `file` field.
 * @returns {Promise<*>} The `data` property of the axios response.
 */
async function extractText(pdfUrl) {
    const payload = { file: pdfUrl };
    const requestConfig = {
        headers: {
            'Authorization': `Bearer ${API_TOKEN}`,
            'Content-Type': 'application/json',
            'Accept': 'application/json'
        }
    };

    const { data } = await axios.post(API_URL, payload, requestConfig);
    return data;
}

// Extract text from a scanned invoice
extractText('https://example.com/scanned-invoice.pdf')
    .then(result => {
        console.log(`Pages: ${result.pages_total}`);
        console.log(`Total characters: ${result.characters_total}`);
        // '\n' (single backslash) is a real newline, so each heading is
        // preceded by a blank line instead of a literal "\n".
        console.log('\nExtracted text:');
        result.pages.forEach(page => {
            console.log(`\n--- Page ${page.page} ---`);
            console.log(page.content);
        });
    })
    .catch(error => {
        // Report request failures instead of leaving an unhandled rejection.
        console.error(`OCR request failed: ${error.message}`);
    });

Understanding the Response

The API returns text organized by page:

{
  "pages_total": 1,
  "characters_total": 847,
  "pages": [
    {
      "page": 1,
      "characters": 847,
      "content": "INVOICE\n\nFrom: Acme Supplies Inc.\n123 Business Street\nNew York, NY 10001\n\nInvoice #: INV-2024-0892\nDate: January 15, 2024\nDue Date: February 15, 2024\n\nBill To:\nTech Solutions Corp\n456 Corporate Ave\nSan Francisco, CA 94102\n\nDescription                    Qty    Price      Total\n---------------------------------------------------------\nOffice Supplies               10    $25.00    $250.00\nPrinter Paper (Box)            5    $45.00    $225.00\nInk Cartridges                 3    $89.00    $267.00\n\n                              Subtotal:        $742.00\n                              Tax (8%):         $59.36\n                              TOTAL:           $801.36"
    }
  ]
}

Real-World Scenario: Invoice Processing Pipeline

You're building an invoice processing system. When scanned invoices arrive, you need to extract key fields and insert them into your database. Here's how:

const axios = require('axios');

// Placeholder API token; it is sent as a Bearer token in the Authorization header.
const API_TOKEN = 'YOUR_API_TOKEN';
// Endpoint that extractText() posts the PDF URL to.
const API_URL = 'https://apdf.io/api/pdf/ocr/read';

/**
 * Posts the given PDF URL to the OCR endpoint and hands back the parsed body.
 *
 * @param {string} pdfUrl - URL of the PDF, sent as the `file` field of the request.
 * @returns {Promise<*>} Whatever the axios response carries in `data`.
 */
async function extractText(pdfUrl) {
    const headers = {
        'Authorization': `Bearer ${API_TOKEN}`,
        'Content-Type': 'application/json',
        'Accept': 'application/json'
    };

    const ocrResponse = await axios.post(API_URL, { file: pdfUrl }, { headers: headers });
    const body = ocrResponse.data;
    return body;
}

/**
 * Pulls the key fields out of OCR'd invoice text with regular expressions.
 *
 * @param {string} text - Invoice text with real newline characters between lines.
 * @returns {{invoiceNumber: ?string, invoiceDate: ?string, totalAmount: ?number, vendorName: ?string, rawText: string}}
 *   Parsed fields; each one is null when its pattern is not found.
 */
function parseInvoiceData(text) {
    // Invoice number: the value must sit on the same line as the label and
    // contain at least one digit, so a bare "INVOICE" heading followed by
    // "From: ..." on a later line is not captured as the number.
    const invoiceMatch = text.match(
        /Invoice[ \t]*(?:#|No\.?|Number)?[ \t]*:?[ \t]*([A-Z0-9-]*\d[A-Z0-9-]*)/i
    );
    const invoiceNumber = invoiceMatch ? invoiceMatch[1] : null;

    // Date in "Month D, YYYY" form; the first "Date" label in the text wins.
    const dateMatch = text.match(/Date:?\s*(\w+\s+\d{1,2},?\s+\d{4})/i);
    const invoiceDate = dateMatch ? dateMatch[1] : null;

    // Total amount: \b stops the case-insensitive pattern from matching the
    // "total" inside "Subtotal".
    const totalMatch = text.match(/\bTOTAL:?\s*\$?\s*(\d[\d,]*(?:\.\d+)?)/i);
    // Strip every thousands separator, not only the first one.
    const totalAmount = totalMatch ? parseFloat(totalMatch[1].replace(/,/g, '')) : null;

    // Vendor name: second non-blank line (the first is usually the "INVOICE"
    // heading). Split on real line breaks and drop a leading "From:" label.
    const lines = text.split(/\r?\n/).filter(line => line.trim());
    const vendorName = lines.length > 1
        ? lines[1].trim().replace(/^From:\s*/i, '')
        : null;

    return {
        invoiceNumber,
        invoiceDate,
        totalAmount,
        vendorName,
        rawText: text
    };
}

/**
 * Runs OCR on one invoice PDF, parses the combined text and logs the result.
 *
 * @param {string} pdfUrl - URL of the scanned invoice, passed to extractText().
 * @returns {Promise<?Object>} The object returned by parseInvoiceData(), or
 *   null when any step throws (the error message is logged).
 */
async function processInvoice(pdfUrl) {
    console.log(`Processing: ${pdfUrl}`);

    try {
        // Step 1: Extract text via OCR
        const ocrResult = await extractText(pdfUrl);
        // Join pages with a real newline so line-based parsing still works
        // across page boundaries.
        const fullText = ocrResult.pages.map(p => p.content).join('\n');

        // Step 2: Parse structured data
        const invoiceData = parseInvoiceData(fullText);

        console.log('\nExtracted Invoice Data:');
        console.log(`  Invoice #: ${invoiceData.invoiceNumber}`);
        console.log(`  Date: ${invoiceData.invoiceDate}`);
        console.log(`  Vendor: ${invoiceData.vendorName}`);
        console.log(`  Total: \$${invoiceData.totalAmount}`);

        // Step 3: In production, save to database
        // await db.invoices.insert(invoiceData);

        return invoiceData;

    } catch (error) {
        // Best effort: report the failure and let the caller skip this invoice.
        console.error(`Failed to process invoice: ${error.message}`);
        return null;
    }
}

/**
 * Processes a batch of scanned invoices one at a time.
 *
 * @param {Iterable<string>} invoiceUrls - URLs passed to processInvoice() in order.
 * @param {number} [delayMs=1000] - Pause between consecutive invoices, used as
 *   simple rate limiting. No pause is added after the last invoice.
 * @returns {Promise<Object[]>} Results of the invoices that processed
 *   successfully (processInvoice() returns null for failures, which are skipped).
 */
async function processBatch(invoiceUrls, delayMs = 1000) {
    const results = [];
    let isFirst = true;

    for (const url of invoiceUrls) {
        // Rate limiting: wait between requests only, so the batch does not
        // stall for a full interval after the final invoice.
        if (!isFirst && delayMs > 0) {
            await new Promise(resolve => setTimeout(resolve, delayMs));
        }
        isFirst = false;

        const data = await processInvoice(url);
        if (data) {
            results.push(data);
        }
    }

    console.log(`\nProcessed ${results.length} invoices successfully`);
    return results;
}

// Example usage
const invoices = [
    'https://your-storage.com/invoices/scan-001.pdf',
    'https://your-storage.com/invoices/scan-002.pdf'
];

// Attach a rejection handler so an unexpected failure is reported instead of
// surfacing as an unhandled promise rejection.
processBatch(invoices).catch(error => {
    console.error(`Batch processing failed: ${error.message}`);
});

Handling Multi-Page Invoices

Some invoices span multiple pages. The API returns text for each page separately, so you can process them individually or combine them:

/**
 * Runs OCR on a multi-page invoice, logs what each page appears to contain,
 * and returns both the per-page results and one combined text block.
 *
 * @param {string} pdfUrl - URL of the scanned invoice, passed to extractText().
 * @returns {Promise<{pageCount: number, totalChars: number, fullText: string, pages: Object[]}>}
 *   `pageCount`/`totalChars` mirror `pages_total`/`characters_total` from the
 *   OCR result; `fullText` is every page's content joined by a page-break marker.
 */
async function extractMultiPageInvoice(pdfUrl) {
    const result = await extractText(pdfUrl);

    console.log(`Invoice has ${result.pages_total} page(s)`);

    // Option 1: Process each page separately
    result.pages.forEach(page => {
        console.log(`\nPage ${page.page} (${page.characters} chars):`);

        // Look for totals on this page
        if (page.content.includes('Subtotal') || page.content.includes('Total')) {
            console.log('  -> Contains totals');
        }
        // Look for line items: quantity, unit price, line total. Commas are
        // allowed so prices such as $1,250.00 are recognized too.
        if (/\d+\s+\$[\d,.]+\s+\$[\d,.]+/.test(page.content)) {
            console.log('  -> Contains line items');
        }
    });

    // Option 2: Combine all pages into one text block, separated by a marker
    // surrounded by real blank lines.
    const fullText = result.pages
        .map(p => p.content)
        .join('\n\n--- PAGE BREAK ---\n\n');

    return {
        pageCount: result.pages_total,
        totalChars: result.characters_total,
        fullText: fullText,
        pages: result.pages
    };
}

Async Processing for Large Documents

For large scanned documents, use async mode to avoid timeouts:

// Endpoint that extractTextAsync() posts to (with async: 1) to start the OCR job.
const OCR_URL = 'https://apdf.io/api/pdf/ocr/read';
// Endpoint that extractTextAsync() polls with the job_id to check the job's status.
const STATUS_URL = 'https://apdf.io/api/job/status/check';

/**
 * Starts an asynchronous OCR job and polls the status endpoint until it ends.
 *
 * @param {string} pdfUrl - URL of the PDF, posted as the `file` field.
 * @param {Object} [options] - Optional polling settings.
 * @param {number} [options.pollIntervalMs=3000] - Wait between status checks.
 * @param {number} [options.maxAttempts=200] - Maximum number of status checks
 *   before giving up (about 10 minutes at the default interval), so a job
 *   that never reports `completed` or `failed` cannot poll forever.
 * @returns {Promise<*>} The `result` field of the status response once the
 *   status is `completed`.
 * @throws {Error} When no job_id is returned, the job reports `failed`, or
 *   the job is still unfinished after `maxAttempts` checks.
 */
async function extractTextAsync(pdfUrl, options = {}) {
    const { pollIntervalMs = 3000, maxAttempts = 200 } = options;

    // Both requests use the same auth and content headers.
    const requestConfig = {
        headers: {
            'Authorization': `Bearer ${API_TOKEN}`,
            'Content-Type': 'application/json',
            'Accept': 'application/json'
        }
    };

    // Start async job
    const startResponse = await axios.post(OCR_URL, {
        file: pdfUrl,
        async: 1
    }, requestConfig);

    const jobId = startResponse.data.job_id;
    if (jobId === undefined || jobId === null) {
        throw new Error('OCR job could not be started: response contained no job_id');
    }
    console.log(`Job started: ${jobId}`);

    // Poll for completion
    for (let attempt = 1; attempt <= maxAttempts; attempt++) {
        const statusResponse = await axios.post(STATUS_URL, {
            job_id: jobId
        }, requestConfig);

        const status = statusResponse.data;

        if (status.status === 'completed') {
            console.log('OCR completed!');
            return status.result;
        }
        if (status.status === 'failed') {
            throw new Error(`OCR failed: ${status.error}`);
        }

        console.log('Still processing...');
        // No point in waiting after the final allowed check.
        if (attempt < maxAttempts) {
            await new Promise(resolve => setTimeout(resolve, pollIntervalMs));
        }
    }

    throw new Error(`OCR job ${jobId} did not finish after ${maxAttempts} status checks`);
}

Next Steps

Once you're extracting text from scanned documents, consider these enhancements:

  • Search specific terms: Use the OCR Search endpoint to find specific values (like PO numbers) without extracting all text.
  • Create searchable archives: Use the OCR Convert endpoint to add a text layer to scanned PDFs for permanent searchability.
Ready to build?
Get Started for Free