#!/usr/bin/env python3
"""Generate a DELIBERATELY HARD invoice image to stress the OCR backends:
• rotated vertical "INVOICE" banner + sideways Terms block (orientation problems)
• a diagonal translucent "ORIGINAL COPY" watermark overlapping text
• scattered header fields (invoice #, dates, vendor, bill-to in offset boxes)
• a misaligned line-item table (inconsistent column alignment)
• totals scattered across corners (subtotal / tax / shipping / grand total / balance due)
This is the kind of layout where classic OCR (Tesseract) struggles but a vision
LLM (MiniCPM-V) reads it correctly. Writes:
backend/evals/datasets/complex_invoice_messy.png
backend/evals/datasets/complex_invoice_messy.gt.json (skip_eval — showcase only)
No .txt sidecar on purpose — this document REQUIRES a real OCR engine.
"""
from __future__ import annotations
import json
from pathlib import Path
from PIL import Image, ImageDraw, ImageFont
# Directory one level above the folder that contains this script.
ROOT = Path(__file__).resolve().parent.parent
# Destination folder for the generated image and its ground-truth JSON.
OUT = ROOT.joinpath("backend", "evals", "datasets")

# Canvas size in pixels (width, height).
W = 1240
H = 1600

# Line items as (description, quantity, unit_price, line_total) rows; the four
# line totals add up to the invoice subtotal below.
_LINE_ITEM_FIELDS = ("description", "quantity", "unit_price", "line_total")
_LINE_ITEM_ROWS = (
    ("Hydraulic pump HX-220", 4, 1250.00, 5000.00),
    ("Seal kit (set of 12)", 10, 85.50, 855.00),
    ("Pressure gauge 0-300psi", 6, 142.75, 856.50),
    ("Installation labor", 1, 2400.00, 2400.00),
)

# Ground truth for the rendered invoice. It is both the source of the field
# values drawn onto the image and the payload dumped to the .gt.json sidecar.
# Key order is significant because it is preserved in the JSON output.
GT = dict(
    doc_type="invoice",
    invoice_number="INV-9X-44821",
    issue_date="2026-04-18",
    due_date="2026-05-18",
    vendor_name="Meridian Industrial Components Ltd",
    bill_to_name="Aperture Retail Group",
    currency="USD",
    subtotal=9111.50,
    tax_amount=751.70,
    # subtotal + tax + the 180.00 freight charge printed on the invoice
    total=10043.20,
    line_items=[dict(zip(_LINE_ITEM_FIELDS, row)) for row in _LINE_ITEM_ROWS],
    _meta=dict(
        doc_type="invoice",
        channel="scanned",
        difficulty="complex_layout",
        skip_eval=True,
    ),
)
def font(sz, bold=False):
    """Return a font of size *sz*, preferring system TrueType faces.

    The candidate files are tried in order and the first one that loads wins.
    ``bold`` only selects between Arial Bold and Arial for the first candidate;
    the remaining candidates are the same regardless of ``bold``.

    When none of the files can be loaded, Pillow's built-in default font is
    returned. The requested size is forwarded to it where the installed Pillow
    supports that, so the fallback text is not stuck at the default's tiny
    fixed size; otherwise the unsized default is used.
    """
    preferred = (
        "/System/Library/Fonts/Supplemental/Arial Bold.ttf"
        if bold
        else "/System/Library/Fonts/Supplemental/Arial.ttf"
    )
    candidates = (
        preferred,
        "/System/Library/Fonts/Helvetica.ttc",
        "/Library/Fonts/Arial.ttf",
    )
    for path in candidates:
        try:
            return ImageFont.truetype(path, sz)
        except (OSError, ImportError, ValueError):
            # Missing/unreadable file, FreeType support unavailable, or a size
            # the loader rejects: best-effort, move on to the next candidate.
            continue
    try:
        # Newer Pillow releases accept a size for the built-in font.
        return ImageFont.load_default(sz)
    except (TypeError, OSError, ImportError, ValueError):
        # Older Pillow (no size argument) or no FreeType: unsized bitmap font.
        return ImageFont.load_default()
def rotated(base, text, xy, angle, fnt, fill=(20, 20, 20)):
    """Draw *text* rotated by *angle* degrees and paste it onto *base* at *xy*.

    The text is rendered on a transparent RGBA scratch canvas, the canvas is
    rotated with ``expand=True`` so nothing is clipped, and the result is
    pasted onto ``base`` using its own alpha channel as the mask. ``base`` is
    modified in place; nothing is returned.

    Args:
        base: Destination image.
        text: String to render.
        xy: Top-left corner on ``base`` where the rotated canvas is pasted.
        angle: Rotation in degrees, passed straight to ``Image.rotate``.
        fnt: Font used to render the text.
        fill: Text colour.
    """
    # Bitmap fonts (e.g. Pillow's classic built-in default) expose no ``size``
    # attribute; fall back to a nominal 10 px so the canvas can still be sized.
    glyph_px = getattr(fnt, "size", 10)
    # Generous over-estimate of the text extent: one full em per character.
    canvas_size = (max(20, len(text) * glyph_px), glyph_px + 16)
    canvas = Image.new("RGBA", canvas_size, (0, 0, 0, 0))
    ImageDraw.Draw(canvas).text((2, 2), text, font=fnt, fill=fill)
    turned = canvas.rotate(angle, expand=True)
    base.paste(turned, xy, turned)
def _money(value):
    """Format *value* as a dollar amount with thousands separators and two decimals."""
    return f"${value:,.2f}"


def main():
    """Render the deliberately messy invoice and write it with its ground truth.

    Draws, in order: the diagonal watermark, the vertical "INVOICE" banner, the
    scattered header fields, the vendor and bill-to boxes, the misaligned
    line-item table, the scattered totals, the sideways terms block and the
    footer. The image is saved to ``OUT / "complex_invoice_messy.png"`` and
    ``GT`` is dumped to ``OUT / "complex_invoice_messy.gt.json"``. ``OUT`` is
    created if it does not exist.
    """
    OUT.mkdir(parents=True, exist_ok=True)
    img = Image.new("RGB", (W, H), "white")
    d = ImageDraw.Draw(img)

    # Load each face once instead of re-opening the font file for every draw call.
    f18, f18b = font(18), font(18, True)
    f20, f20b = font(20), font(20, True)
    f22b, f24b, f26b = font(22, True), font(24, True), font(26, True)

    # Printed amounts come from GT so the image cannot drift from the ground truth.
    subtotal_txt = _money(GT["subtotal"])
    tax_txt = _money(GT["tax_amount"])
    total_txt = _money(GT["total"])

    navy = (28, 40, 80)
    white = (255, 255, 255)
    ink = (10, 10, 10)
    row_ink = (15, 15, 15)

    # --- diagonal translucent watermark overlapping content ---
    wm = Image.new("RGBA", (W, H), (0, 0, 0, 0))
    wd = ImageDraw.Draw(wm)
    wd.text((140, 120), "ORIGINAL COPY", font=font(120, True), fill=(150, 150, 150, 60))
    wm = wm.rotate(28, center=(W // 2, H // 2))
    img.paste(wm, (0, 0), wm)

    # --- left vertical INVOICE banner (rotated 90) ---
    d.rectangle([18, 60, 96, 760], fill=navy)
    rotated(img, "INVOICE", (24, 470), 90, font(54, True), fill=white)

    # --- scattered header fields (offset boxes, inconsistent placement) ---
    d.text((900, 70), "Invoice No.", font=f20b, fill=(90, 90, 90))
    d.text((900, 98), GT["invoice_number"], font=f26b, fill=ink)
    d.text((690, 150), f"Issued: {GT['issue_date']}", font=f20, fill=ink)
    # offset, different spot
    d.text((980, 200), f"Due {GT['due_date']}", font=f20, fill=(160, 30, 30))
    # Balance due repeated near top in red (inconsistent)
    d.text((620, 60), f"BALANCE DUE {total_txt}", font=f26b, fill=(190, 20, 20))

    # vendor block (top-left, after banner) and bill-to (offset right-middle)
    d.rectangle([130, 90, 560, 220], outline=(120, 120, 120), width=2)
    d.text((145, 100), "FROM / Remit to:", font=f18b, fill=(70, 70, 70))
    d.text((145, 128), GT["vendor_name"], font=f22b, fill=ink)
    d.text((145, 160), "Unit 7, Kvaerner Estate", font=f18, fill=(40, 40, 40))
    d.text((145, 184), "VAT GB-882-114", font=f18, fill=(40, 40, 40))

    d.rectangle([640, 300, 1080, 420], outline=(120, 120, 120), width=2)
    d.text((655, 310), "Bill To", font=f18b, fill=(70, 70, 70))
    d.text((655, 338), GT["bill_to_name"], font=f22b, fill=ink)
    d.text((655, 372), "Accounts Payable, Floor 3", font=f18, fill=(40, 40, 40))
    d.text((655, 396), "PO ref: PO-77-3391", font=f18, fill=(40, 40, 40))

    # --- misaligned line-item table ---
    ty = 470
    d.line([130, ty - 10, 1110, ty - 10], fill=(40, 40, 40), width=2)
    # headers placed inconsistently (not above their columns)
    d.text((150, ty), "Item / Description", font=f20b, fill=(20, 20, 20))
    d.text((720, ty), "Unit", font=f20b, fill=(20, 20, 20))
    d.text((640, ty), "Qty", font=f20b, fill=(20, 20, 20))
    d.text((980, ty), "Amount", font=f20b, fill=(20, 20, 20))

    ry = ty + 44
    for i, it in enumerate(GT["line_items"], 1):
        d.text((150, ry), f"{i}. {it['description']}", font=f20, fill=row_ink)
        # qty/unit/amount with inconsistent alignment (left vs right vs centered)
        d.text((648, ry), str(int(it["quantity"])), font=f20, fill=row_ink)
        d.text((700, ry), _money(it["unit_price"]), font=f20, fill=row_ink)
        amt = _money(it["line_total"])
        # Amount column is right-aligned to x=1090.
        d.text((1090 - d.textlength(amt, font=f20), ry), amt, font=f20, fill=row_ink)
        ry += 46
    d.line([130, ry + 6, 1110, ry + 6], fill=(40, 40, 40), width=1)

    # --- totals scattered across corners ---
    d.text((180, ry + 60), "Tax @ 8.25%", font=f20, fill=row_ink)  # bottom-left
    d.text((320, ry + 60), tax_txt, font=f20, fill=row_ink)
    d.text((640, ry + 30), "Sub-total", font=f20, fill=row_ink)  # middle
    d.text((780, ry + 30), subtotal_txt, font=f20, fill=row_ink)
    # Freight has no field in GT; it is the gap between total and subtotal + tax.
    d.text((640, ry + 90), "Freight/Shipping", font=f20, fill=row_ink)
    d.text((820, ry + 90), "$180.00", font=f20, fill=row_ink)

    # grand total in a bold box, bottom-right
    d.rectangle([800, ry + 140, 1110, ry + 210], fill=navy)
    d.text((820, ry + 150), "GRAND TOTAL", font=f20b, fill=white)
    d.text((820, ry + 178), f"{total_txt} {GT['currency']}", font=f24b, fill=white)

    # --- sideways Terms & Conditions block (rotated 90) on the right edge ---
    rotated(
        img,
        "Terms: Net 30 days. Late fee 1.5%/mo. Goods remain property of seller until paid.",
        (1150, 360),
        90,
        f18,
        fill=(90, 90, 90),
    )

    # footer note (two-column-ish)
    d.text((150, H - 120), "Notes: Partial back-order on item 3 may apply.", font=f18, fill=(60, 60, 60))
    d.text((150, H - 92), "Remittance: SWIFT MERIGB2L / IBAN GB22 MERI 0099 8812", font=f18, fill=(60, 60, 60))

    out_png = OUT / "complex_invoice_messy.png"
    img.save(out_png)
    (OUT / "complex_invoice_messy.gt.json").write_text(json.dumps(GT, indent=2), encoding="utf-8")
    print(f"✓ wrote {out_png} ({W}x{H})")
    print(f"✓ wrote {OUT/'complex_invoice_messy.gt.json'}")


if __name__ == "__main__":
    main()
|