ERP-DocIQ / scripts /generate_complex_invoice.py
kenmandal's picture
Add complex invoice (MiniCPM vision OCR) + complex multi-step web automation tab
ae053b7 verified
Raw
History Blame Contribute Delete
7.49 kB
#!/usr/bin/env python3
"""Generate a DELIBERATELY HARD invoice image to stress the OCR backends:
• rotated vertical "INVOICE" banner + sideways Terms block (orientation problems)
• a diagonal translucent "ORIGINAL COPY" watermark overlapping text
• scattered header fields (invoice #, dates, vendor, bill-to in offset boxes)
• a misaligned line-item table (inconsistent column alignment)
• totals scattered across corners (subtotal / tax / shipping / grand total / balance due)
This is the kind of layout where classic OCR (Tesseract) struggles but a vision
LLM (MiniCPM-V) reads it correctly. Writes:
backend/evals/datasets/complex_invoice_messy.png
backend/evals/datasets/complex_invoice_messy.gt.json (skip_eval — showcase only)
No .txt sidecar on purpose — this document REQUIRES a real OCR engine.
"""
from __future__ import annotations
import json
from pathlib import Path
from PIL import Image, ImageDraw, ImageFont
# Directory two levels above this file; the dataset path below is built from it.
ROOT = Path(__file__).resolve().parent.parent

# Folder that receives the generated PNG and its ground-truth JSON.
OUT = ROOT.joinpath("backend", "evals", "datasets")

# Canvas size in pixels.
W = 1240
H = 1600
# Ground truth for the rendered invoice. main() both draws these values onto
# the image and dumps this dict verbatim as the .gt.json sidecar, so key order
# here is the key order of the JSON file.
GT = {
    "doc_type": "invoice",
    "invoice_number": "INV-9X-44821",
    "issue_date": "2026-04-18",
    "due_date": "2026-05-18",
    "vendor_name": "Meridian Industrial Components Ltd",
    "bill_to_name": "Aperture Retail Group",
    "currency": "USD",
    "subtotal": 9111.50,
    "tax_amount": 751.70,
    "total": 10043.20,
    # One dict per table row, built from (description, qty, unit price, total).
    "line_items": [
        {
            "description": label,
            "quantity": qty,
            "unit_price": unit_price,
            "line_total": line_total,
        }
        for label, qty, unit_price, line_total in (
            ("Hydraulic pump HX-220", 4, 1250.00, 5000.00),
            ("Seal kit (set of 12)", 10, 85.50, 855.00),
            ("Pressure gauge 0-300psi", 6, 142.75, 856.50),
            ("Installation labor", 1, 2400.00, 2400.00),
        )
    ],
    "_meta": {
        "doc_type": "invoice",
        "channel": "scanned",
        "difficulty": "complex_layout",
        "skip_eval": True,
    },
}
def font(sz, bold=False):
    """Return a Pillow font of size *sz*, trying a short list of system fonts.

    The candidates are tried in order and the first one that loads wins:
    Arial (or Arial Bold when *bold* is true), then Helvetica, then a second
    Arial location. Only the first candidate depends on *bold*; the later
    ones are the same regular faces either way.

    If none of the files can be loaded, Pillow's built-in default font is
    returned. The requested size is passed along when the installed Pillow
    accepts it, so the fallback is not stuck at the tiny default size.

    Args:
        sz: Font size handed to ``ImageFont.truetype``.
        bold: Prefer the bold Arial file as the first candidate.

    Returns:
        A font object usable with ``ImageDraw`` text calls.
    """
    candidates = (
        "/System/Library/Fonts/Supplemental/Arial Bold.ttf"
        if bold
        else "/System/Library/Fonts/Supplemental/Arial.ttf",
        "/System/Library/Fonts/Helvetica.ttc",
        "/Library/Fonts/Arial.ttf",
    )
    for path in candidates:
        try:
            return ImageFont.truetype(path, sz)
        except (OSError, ImportError):
            # OSError: file missing/unreadable. ImportError: Pillow built
            # without FreeType. Anything else is a real bug and should surface.
            continue
    try:
        # Newer Pillow releases accept a size for the built-in font.
        return ImageFont.load_default(sz)
    except (TypeError, ImportError):
        # Older Pillow (no size argument) or no FreeType: fixed-size bitmap font.
        return ImageFont.load_default()
def rotated(base, text, xy, angle, fnt, fill=(20, 20, 20)):
    """Draw *text* rotated by *angle* degrees onto *base*, in place.

    The text is rendered on a transparent RGBA layer, the layer is rotated
    with ``expand=True`` (Pillow rotates counter-clockwise), and the result is
    pasted onto *base* using its own alpha channel as the mask.

    The layer is sized from the measured text box rather than from a
    ``len(text) * fnt.size`` guess. The guess was much wider than the real
    text, and after a 90-degree rotation that surplus became blank space
    *above* the text, pushing it far below *xy* (long strings ran off the
    bottom of the canvas). Glyphs keep the same 2px offset inside the layer,
    so only the unused right/bottom margin is trimmed.

    Args:
        base: Image that receives the text; modified in place.
        text: String to draw.
        xy: Top-left corner on *base* where the rotated layer is pasted.
        angle: Rotation in degrees, passed to ``Image.rotate``.
        fnt: Font used for both measuring and drawing.
        fill: Text colour.
    """
    pad = 2
    # A 1x1 scratch image is enough: measuring does not draw anything.
    probe = ImageDraw.Draw(Image.new("RGBA", (1, 1)))
    _, _, right, bottom = probe.textbbox((pad, pad), text, font=fnt)
    # Round up (+1) and keep a small margin; never create a zero-sized layer.
    size = (max(1, int(right) + 1 + pad), max(1, int(bottom) + 1 + pad))
    tmp = Image.new("RGBA", size, (0, 0, 0, 0))
    ImageDraw.Draw(tmp).text((pad, pad), text, font=fnt, fill=fill)
    tmp = tmp.rotate(angle, expand=True)
    base.paste(tmp, xy, tmp)
def main():
    """Render the deliberately messy invoice and write it with its ground truth.

    Creates ``OUT`` if needed, draws the invoice onto a white ``W`` x ``H``
    canvas, saves it as ``complex_invoice_messy.png`` and dumps ``GT`` next to
    it as ``complex_invoice_messy.gt.json``. Prints both paths when done.

    Every amount printed on the image is formatted from ``GT`` so the picture
    and the ground-truth JSON cannot drift apart. Drawing order matters where
    elements overlap (watermark first, text on top), so it is kept as is.
    """
    OUT.mkdir(parents=True, exist_ok=True)
    img = Image.new("RGB", (W, H), "white")
    d = ImageDraw.Draw(img)

    # Load each font once instead of re-reading the font file for every label.
    f18, f18b = font(18), font(18, True)
    f20, f20b = font(20), font(20, True)
    f22b = font(22, True)
    f26b = font(26, True)

    # Amounts shown on the page, derived from the ground truth.
    subtotal_txt = f"${GT['subtotal']:,.2f}"
    tax_txt = f"${GT['tax_amount']:,.2f}"
    total_txt = f"${GT['total']:,.2f}"
    # GT has no shipping field; it is whatever the total leaves after
    # subtotal and tax (rounded to cents to absorb float noise).
    shipping = round(GT["total"] - GT["subtotal"] - GT["tax_amount"], 2)
    shipping_txt = f"${shipping:,.2f}"

    # --- diagonal translucent watermark overlapping content ---
    wm = Image.new("RGBA", (W, H), (0, 0, 0, 0))
    wd = ImageDraw.Draw(wm)
    wd.text((140, 120), "ORIGINAL COPY", font=font(120, True), fill=(150, 150, 150, 60))
    wm = wm.rotate(28, center=(W // 2, H // 2))
    img.paste(wm, (0, 0), wm)

    # --- left vertical INVOICE banner (rotated 90) ---
    d.rectangle([18, 60, 96, 760], fill=(28, 40, 80))
    rotated(img, "INVOICE", (24, 470), 90, font(54, True), fill=(255, 255, 255))

    # --- scattered header fields (offset boxes, inconsistent placement) ---
    d.text((900, 70), "Invoice No.", font=f20b, fill=(90, 90, 90))
    d.text((900, 98), GT["invoice_number"], font=f26b, fill=(10, 10, 10))
    d.text((690, 150), f"Issued: {GT['issue_date']}", font=f20, fill=(10, 10, 10))
    d.text((980, 200), f"Due {GT['due_date']}", font=f20, fill=(160, 30, 30))  # offset, different spot
    # Balance due repeated near top in red (inconsistent)
    d.text((620, 60), f"BALANCE DUE {total_txt}", font=f26b, fill=(190, 20, 20))

    # vendor block (top-left, after banner) and bill-to (offset right-middle)
    d.rectangle([130, 90, 560, 220], outline=(120, 120, 120), width=2)
    d.text((145, 100), "FROM / Remit to:", font=f18b, fill=(70, 70, 70))
    d.text((145, 128), GT["vendor_name"], font=f22b, fill=(10, 10, 10))
    d.text((145, 160), "Unit 7, Kvaerner Estate", font=f18, fill=(40, 40, 40))
    d.text((145, 184), "VAT GB-882-114", font=f18, fill=(40, 40, 40))

    d.rectangle([640, 300, 1080, 420], outline=(120, 120, 120), width=2)
    d.text((655, 310), "Bill To", font=f18b, fill=(70, 70, 70))
    d.text((655, 338), GT["bill_to_name"], font=f22b, fill=(10, 10, 10))
    d.text((655, 372), "Accounts Payable, Floor 3", font=f18, fill=(40, 40, 40))
    d.text((655, 396), "PO ref: PO-77-3391", font=f18, fill=(40, 40, 40))

    # --- misaligned line-item table ---
    ty = 470
    d.line([130, ty - 10, 1110, ty - 10], fill=(40, 40, 40), width=2)
    # headers placed inconsistently (not above their columns)
    d.text((150, ty), "Item / Description", font=f20b, fill=(20, 20, 20))
    d.text((720, ty), "Unit", font=f20b, fill=(20, 20, 20))
    d.text((640, ty), "Qty", font=f20b, fill=(20, 20, 20))
    d.text((980, ty), "Amount", font=f20b, fill=(20, 20, 20))

    ry = ty + 44
    for i, it in enumerate(GT["line_items"], 1):
        d.text((150, ry), f"{i}. {it['description']}", font=f20, fill=(15, 15, 15))
        # qty/unit/amount with inconsistent alignment (left vs right vs centered)
        d.text((648, ry), str(int(it["quantity"])), font=f20, fill=(15, 15, 15))
        d.text((700, ry), f"${it['unit_price']:,.2f}", font=f20, fill=(15, 15, 15))
        amt = f"${it['line_total']:,.2f}"
        # Amount column is right-aligned to x=1090.
        d.text((1090 - d.textlength(amt, font=f20), ry), amt, font=f20, fill=(15, 15, 15))
        ry += 46
    d.line([130, ry + 6, 1110, ry + 6], fill=(40, 40, 40), width=1)

    # --- totals scattered across corners ---
    d.text((180, ry + 60), "Tax @ 8.25%", font=f20, fill=(15, 15, 15))  # bottom-left
    d.text((320, ry + 60), tax_txt, font=f20, fill=(15, 15, 15))
    d.text((640, ry + 30), "Sub-total", font=f20, fill=(15, 15, 15))  # middle
    d.text((780, ry + 30), subtotal_txt, font=f20, fill=(15, 15, 15))
    d.text((640, ry + 90), "Freight/Shipping", font=f20, fill=(15, 15, 15))
    d.text((820, ry + 90), shipping_txt, font=f20, fill=(15, 15, 15))
    # grand total in a bold box, bottom-right
    d.rectangle([800, ry + 140, 1110, ry + 210], fill=(28, 40, 80))
    d.text((820, ry + 150), "GRAND TOTAL", font=f20b, fill=(255, 255, 255))
    d.text((820, ry + 178), f"{total_txt} {GT['currency']}", font=font(24, True), fill=(255, 255, 255))

    # --- sideways Terms & Conditions block (rotated 90) on the right edge ---
    rotated(
        img,
        "Terms: Net 30 days. Late fee 1.5%/mo. Goods remain property of seller until paid.",
        (1150, 360),
        90,
        f18,
        fill=(90, 90, 90),
    )

    # footer note (two-column-ish)
    d.text((150, H - 120), "Notes: Partial back-order on item 3 may apply.", font=f18, fill=(60, 60, 60))
    d.text((150, H - 92), "Remittance: SWIFT MERIGB2L / IBAN GB22 MERI 0099 8812", font=f18, fill=(60, 60, 60))

    out_png = OUT / "complex_invoice_messy.png"
    out_gt = OUT / "complex_invoice_messy.gt.json"
    img.save(out_png)
    out_gt.write_text(json.dumps(GT, indent=2), encoding="utf-8")
    print(f"✓ wrote {out_png} ({W}x{H})")
    print(f"✓ wrote {out_gt}")
if __name__ == "__main__":
main()