| |
"""Generate EXTREMELY hard OCR documents — embedded images + heavy degradation:

  1. extreme_receipt_photo — thermal receipt PHOTOGRAPHED on a desk: perspective
     warp, uneven lighting, shadow, crinkle lines, faded thermal band, printed logo.
  2. extreme_po_collage — image-heavy purchase order: product THUMBNAIL IMAGES
     in table rows, QR code, barcode, rotated APPROVED stamp over the table,
     signature scribble, misaligned columns.
  3. extreme_contract_fax — dense two-column contract received BY FAX: low
     contrast, salt-and-pepper noise, skew, scanline streaks, punch-hole shadows,
     handwritten blue margin note, red RECEIVED stamp.

Each writes <id>.png + <id>.gt.json + <id>.txt (sidecar reference text, drawn from
the SAME strings as the image so CER/WER is fair). All are tagged skip_eval so the
main eval harness is unchanged; the OCR quality benchmark picks them up.
"""
| from __future__ import annotations |
|
|
| import json |
| import math |
| import random |
| from pathlib import Path |
|
|
| import numpy as np |
| from PIL import Image, ImageDraw, ImageFilter, ImageFont |
|
|
# Two directory levels above this script.
_SCRIPT = Path(__file__).resolve()
ROOT = _SCRIPT.parent.parent
# Every generated sample (.png / .gt.json / .txt) is written here.
OUT = ROOT.joinpath("backend", "evals", "datasets")
# Shared, fixed-seed generator: the generators draw from it in call order, so
# output is reproducible only when they run in the same sequence.
rng = random.Random(42)
|
|
|
|
def font(sz, bold=False, mono=False):
    """Return the first loadable TrueType face at size ``sz``.

    Candidate files are tried in order; any failure to load one moves on to
    the next.  With ``mono`` the Courier New faces are tried first (bold face
    before regular, irrespective of ``bold``).  ``bold`` only selects which
    Arial file is tried.  If nothing loads, Pillow's built-in default font is
    returned.
    """
    candidates = []
    if mono:
        candidates.append("/System/Library/Fonts/Supplemental/Courier New Bold.ttf")
        candidates.append("/System/Library/Fonts/Supplemental/Courier New.ttf")
    if bold:
        candidates.append("/System/Library/Fonts/Supplemental/Arial Bold.ttf")
    else:
        candidates.append("/System/Library/Fonts/Supplemental/Arial.ttf")
    candidates.append("/System/Library/Fonts/Helvetica.ttc")
    candidates.append("/Library/Fonts/Arial.ttf")

    for candidate in candidates:
        try:
            face = ImageFont.truetype(candidate, sz)
        except Exception:
            # Best effort: a missing or unreadable file just means "try the next one".
            continue
        else:
            return face
    return ImageFont.load_default()
|
|
|
|
| def _find_coeffs(dst, src): |
| """Perspective coefficients so that src corners land on dst corners.""" |
| A, B = [], [] |
| for (x, y), (u, v) in zip(dst, src): |
| A.append([x, y, 1, 0, 0, 0, -u * x, -u * y]) |
| A.append([0, 0, 0, x, y, 1, -v * x, -v * y]) |
| B.extend([u, v]) |
| res, *_ = np.linalg.lstsq(np.array(A, float), np.array(B, float), rcond=None) |
| return res.tolist() |
|
|
|
|
def stamp(text, color, angle, size=(360, 120), fsz=34):
    """Build a transparent rubber-stamp overlay and return it rotated.

    ``color`` is an RGB tuple; an alpha of 190 is appended for both the
    rounded border and the bold, horizontally centred ``text``.  The canvas is
    ``size`` pixels before rotation; rotating by ``angle`` degrees expands it
    so nothing is clipped.
    """
    box_w, box_h = size[0], size[1]
    ink = color + (190,)
    layer = Image.new("RGBA", size, (0, 0, 0, 0))
    pen = ImageDraw.Draw(layer)
    pen.rounded_rectangle([4, 4, box_w - 4, box_h - 4], radius=16, outline=ink, width=5)
    face = font(fsz, bold=True)
    text_w = pen.textlength(text, font=face)
    # Vertical placement uses the nominal font size, nudged up by 6 px.
    origin = ((box_w - text_w) / 2, (box_h - fsz) / 2 - 6)
    pen.text(origin, text, font=face, fill=ink)
    return layer.rotate(angle, expand=True, resample=Image.BICUBIC)
|
|
|
|
def signature(w=220, h=60, color=(25, 30, 120)):
    """Return a transparent ``w`` x ``h`` RGBA image holding a fake signature.

    The stroke is a 60-point damped sine wave running left to right, with a
    small per-point vertical jitter drawn from the module-level ``rng`` (one
    draw per point, in order).
    """
    canvas = Image.new("RGBA", (w, h), (0, 0, 0, 0))
    stroke = []
    for step in range(60):
        t = step / 59
        # Amplitude decays to half its starting value by the right-hand end.
        wave = math.sin(t * 14 + 1.3) * (h / 3) * (1 - 0.5 * t)
        stroke.append((8 + t * (w - 16), h / 2 + wave + rng.uniform(-2, 2)))
    ImageDraw.Draw(canvas).line(stroke, fill=color + (230,), width=3, joint="curve")
    return canvas
|
|
|
|
| |
# Receipt text exactly as printed, top to bottom.  The same list is drawn onto
# the image and written to the .txt sidecar.
_R_RULE = "--------------------------------"
R_LINES = [
    "BREW & BEAN COFFEE Co.",
    "412 Harbor Lane, Portland OR",
    "Receipt #R-88341 Reg 02",
    "Date: 2026-06-02 14:37",
    "Currency: USD",
    _R_RULE,
    "Flat White 2 x 4.75 9.50",
    "Butter Croissant 3 x 3.25 9.75",
    "Cold Brew Growler 1 x 14.00 14.00",
    "Loyalty discount -2.50",
    _R_RULE,
    "Subtotal 30.75",
    "Tax 8.8% 2.71",
    "TOTAL 33.46",
    "Payment: VISA ****4421",
    _R_RULE,
    "Thank you! brewandbean.example",
]

# (description, quantity, unit price, line total) for each purchased item.
_R_ITEMS = (
    ("Flat White", 2, 4.75, 9.50),
    ("Butter Croissant", 3, 3.25, 9.75),
    ("Cold Brew Growler", 1, 14.00, 14.00),
)

# Structured ground truth for the receipt; ``_meta.skip_eval`` tags the sample.
R_GT = {
    "doc_type": "receipt",
    "merchant": "BREW & BEAN COFFEE Co.",
    "date": "2026-06-02",
    "currency": "USD",
    "subtotal": 30.75,
    "tax_amount": 2.71,
    "total": 33.46,
    "payment_method": "VISA ****4421",
    "line_items": [
        dict(description=name, quantity=qty, unit_price=price, line_total=amount)
        for name, qty, price, amount in _R_ITEMS
    ],
    "_meta": dict(doc_type="receipt", channel="photo", difficulty="extreme", skip_eval=True),
}
|
|
|
|
def gen_receipt():
    """Render the photographed thermal receipt and write its dataset files.

    Draws ``R_LINES`` on a paper strip (logo, crinkle lines, faded band),
    warps the strip onto a textured desk with a soft shadow, applies uneven
    lighting, sensor noise and a slight blur, then writes
    ``extreme_receipt_photo.png``, ``.txt`` (the drawn lines) and ``.gt.json``
    (``R_GT``) into ``OUT``.
    """
    pw, ph = 560, 1010
    logo_ink = (60, 50, 45)
    paper = Image.new("RGBA", (pw, ph), (250, 248, 242, 255))
    pen = ImageDraw.Draw(paper)

    # Printed logo: a ring containing a mug body and a handle arc.
    cx = pw // 2
    cy = 64
    pen.ellipse([cx - 44, cy - 44, cx + 44, cy + 44], outline=logo_ink, width=4)
    pen.rounded_rectangle([cx - 20, cy - 14, cx + 14, cy + 22], radius=5, fill=logo_ink)
    pen.arc([cx + 8, cy - 8, cx + 30, cy + 14], 270, 90, fill=logo_ink, width=4)

    # Item / totals / payment rows hug the left margin; everything else is centred.
    mono = font(24, mono=True)
    left_prefixes = ("Flat", "Butter", "Cold", "Loyal", "Subt", "Tax", "TOTAL", "Paym")
    for row, ln in enumerate(R_LINES):
        if ln.startswith(left_prefixes):
            x = 28
        else:
            x = (pw - pen.textlength(ln, font=mono)) / 2
        pen.text((x, 130 + 36 * row), ln, font=mono, fill=(40, 38, 36))
    # Fully transparent slanted line near the bottom edge of the strip.
    pen.line([(0, ph - 14), (pw, ph - 6)], fill=(250, 248, 242, 0))

    # Crinkles: seven near-vertical lines whose pixels are written with alpha 90.
    for _ in range(7):
        top_x = rng.randint(0, pw)
        bottom_x = top_x + rng.randint(-90, 90)
        pen.line([(top_x, 0), (bottom_x, ph)], fill=(208, 204, 196, 90), width=2)

    # Faded thermal band: pull RGB 55% of the way to white on rows 430..559.
    pixels = np.asarray(paper).astype(np.float32)
    band_top, band_bottom = 430, 560
    band = pixels[band_top:band_bottom, :, :3]
    pixels[band_top:band_bottom, :, :3] = band + (255 - band) * 0.55
    paper = Image.fromarray(pixels.astype(np.uint8))

    # Desk background with jittered horizontal grain lines.
    W, H = 1000, 1400
    desk = Image.new("RGB", (W, H), (96, 74, 54))
    grain = ImageDraw.Draw(desk)
    for gy in range(0, H, 7):
        end_y = gy + rng.randint(-3, 3)
        red = 96 + rng.randint(-10, 8)
        green = 74 + rng.randint(-8, 6)
        blue = 54 + rng.randint(-6, 6)
        grain.line([(0, gy), (W, end_y)], fill=(red, green, blue), width=3)

    # Blurred drop shadow, offset slightly from where the paper will land.
    shadow = Image.new("RGBA", (W, H), (0, 0, 0, 0))
    shadow_quad = [(232, 152), (798, 198), (742, 1292), (172, 1232)]
    ImageDraw.Draw(shadow).polygon(shadow_quad, fill=(0, 0, 0, 110))
    black = Image.new("RGB", (W, H), 0)
    desk.paste(black, (0, 0), shadow.filter(ImageFilter.GaussianBlur(18)))

    # Perspective-warp the strip onto the desk.
    paper_quad = [(248, 138), (786, 186), (730, 1276), (188, 1218)]
    strip_corners = [(0, 0), (pw, 0), (pw, ph), (0, ph)]
    coeffs = _find_coeffs(paper_quad, strip_corners)
    warped = paper.transform((W, H), Image.PERSPECTIVE, coeffs, Image.BICUBIC)
    desk.paste(warped, (0, 0), warped)

    # Uneven lighting (diagonal falloff plus corner vignette), noise, soft blur.
    shot = np.asarray(desk).astype(np.float32)
    yy, xx = np.mgrid[0:H, 0:W]
    light = 1.12 - 0.32 * ((xx / W) * 0.6 + (yy / H) * 0.4)
    r2 = ((xx - W / 2) / (W / 2)) ** 2 + ((yy - H / 2) / (H / 2)) ** 2
    light *= 1 - 0.18 * np.clip(r2 - 0.45, 0, 1)
    shot *= light[..., None]
    shot += np.random.default_rng(7).normal(0, 4.5, shot.shape)
    photo = Image.fromarray(np.clip(shot, 0, 255).astype(np.uint8))
    photo = photo.filter(ImageFilter.GaussianBlur(0.6))

    photo.save(OUT / "extreme_receipt_photo.png")
    (OUT / "extreme_receipt_photo.txt").write_text("\n".join(R_LINES) + "\n")
    (OUT / "extreme_receipt_photo.gt.json").write_text(json.dumps(R_GT, indent=2))
|
|
|
|
| |
# Purchase-order rows: (description, quantity, unit price, line total).
PO_ITEMS = [
    ("SHELF UNIT S-200 heavy gauge", 24, 189.00, 4536.00),
    ("LED STRIP 2m retail white", 60, 22.40, 1344.00),
    ("ENDCAP DISPLAY birch finish", 12, 310.00, 3720.00),
]

# Field names for one ground-truth line item, in PO_ITEMS column order.
_PO_ITEM_FIELDS = ("description", "quantity", "unit_price", "line_total")

# Structured ground truth for the purchase order; line items mirror PO_ITEMS.
PO_GT = dict(
    doc_type="purchase_order",
    order_number="PO-77RX-3309",
    order_date="2026-05-21",
    delivery_date="2026-06-15",
    vendor_name="Nordic Fixture Works AB",
    buyer_name="Aperture Retail Group",
    ship_to="DC-7, 4420 Logistics Pkwy, Columbus OH",
    currency="USD",
    payment_terms="Net 45",
    subtotal=9600.00,
    tax_amount=792.00,
    total=10392.00,
    line_items=[dict(zip(_PO_ITEM_FIELDS, item)) for item in PO_ITEMS],
    _meta=dict(doc_type="purchase_order", channel="scanned", difficulty="extreme", skip_eval=True),
)
|
|
|
|
def _thumb(kind):
    """Return a 76x76 RGB product thumbnail.

    ``kind`` 0 draws a shelf unit, 1 an LED strip, and any other value an
    endcap display.  Every thumbnail gets a thin grey border.
    """
    tile = Image.new("RGB", (76, 76), (235, 238, 242))
    pen = ImageDraw.Draw(tile)
    if kind == 0:
        # Four shelves, then the two uprights drawn over them.
        for top in range(12, 72, 15):
            pen.rectangle([10, top, 66, top + 6], fill=(120, 128, 140))
        for post_x in (12, 64):
            pen.line([(post_x, 12), (post_x, 66)], fill=(80, 86, 96), width=3)
    elif kind == 1:
        # Dark strip carrying a row of pale-yellow LEDs.
        pen.rounded_rectangle([8, 30, 68, 46], radius=8, fill=(60, 64, 70))
        for led_x in range(14, 66, 9):
            pen.ellipse([led_x, 34, led_x + 6, 42], fill=(255, 240, 160))
    else:
        # Tapered stand with two darker shelf bands.
        band = (160, 128, 88)
        pen.polygon([(14, 64), (26, 14), (50, 14), (62, 64)], fill=(196, 164, 120))
        pen.rectangle([20, 40, 56, 46], fill=band)
        pen.rectangle([24, 26, 52, 32], fill=band)
    pen.rectangle([0, 0, 75, 75], outline=(150, 150, 150))
    return tile
|
|
|
|
| def _qr(d, x, y, n=21, cell=5): |
| g = random.Random(9) |
| for r in range(n): |
| for c in range(n): |
| if g.random() < 0.45: |
| d.rectangle([x + c * cell, y + r * cell, x + c * cell + cell - 1, y + r * cell + cell - 1], fill=0) |
| for fx, fy in [(0, 0), (n - 7, 0), (0, n - 7)]: |
| d.rectangle([x + fx * cell, y + fy * cell, x + (fx + 7) * cell, y + (fy + 7) * cell], outline=0, width=3) |
| d.rectangle([x + (fx + 2) * cell, y + (fy + 2) * cell, x + (fx + 5) * cell, y + (fy + 5) * cell], fill=0) |
|
|
|
|
def gen_po():
    """Render the image-heavy purchase order and write its dataset files.

    Writes ``extreme_po_collage.png``, ``.txt`` and ``.gt.json`` into ``OUT``.

    Every printed string is defined once below and used for BOTH the drawing
    and the ``.txt`` sidecar, so the reference text cannot drift from what is
    rendered.  Values that also appear in the ground truth (order number,
    dates, parties, totals) are read from ``PO_GT``.

    The sidecar now carries the full address line, including the
    "· VAT SE5566778899" part that is drawn on the page, and is written as
    UTF-8 explicitly because it contains non-ASCII punctuation.
    """
    W, H = 1240, 1600
    paper, brand = (252, 252, 250), (30, 90, 160)
    ink, row_ink = (20, 20, 30), (25, 25, 30)
    muted, grey = (90, 90, 100), (60, 60, 60)

    # --- single source of truth for all rendered text ----------------------
    title = "PURCHASE ORDER"
    vendor = PO_GT["vendor_name"]
    address = "Industrigatan 14, Malmo SE · VAT SE5566778899"
    meta = [
        ("PO Number:", PO_GT["order_number"]),
        ("Order Date:", PO_GT["order_date"]),
        ("Delivery Date:", PO_GT["delivery_date"]),
        ("Payment Terms:", PO_GT["payment_terms"]),
        ("Currency:", PO_GT["currency"]),
    ]
    buyer = f"Buyer: {PO_GT['buyer_name']}"
    ship_to = f"Ship To: {PO_GT['ship_to']}"
    columns = [(56, "IMG"), (160, "DESCRIPTION"), (700, "QTY"), (840, "UNIT USD"), (1040, "AMOUNT")]
    totals = [
        ("Subtotal:", f"{PO_GT['subtotal']:,.2f}"),
        ("Tax 8.25%:", f"{PO_GT['tax_amount']:,.2f}"),
        ("TOTAL:", f"{PO_GT['total']:,.2f} {PO_GT['currency']}"),
    ]
    barcode_label = "*" + PO_GT["order_number"].replace("-", "") + "*"
    stamp_text = "APPROVED · OPS DESK"
    signer = "Authorized — K. Lindqvist, Procurement"

    im = Image.new("RGB", (W, H), paper)
    d = ImageDraw.Draw(im)
    h1, h2, h3, body, small = font(40, True), font(22, True), font(18, True), font(19), font(15)

    # --- header: logo tile, vendor block, title, QR, metadata box ----------
    d.rectangle([40, 40, 120, 120], fill=brand)
    d.polygon([(52, 108), (80, 52), (108, 108)], fill=paper)
    d.text((136, 48), vendor, font=h2, fill=ink)
    d.text((136, 80), address, font=small, fill=muted)
    d.text((40, 150), title, font=h1, fill=brand)
    _qr(d, 1060, 40)
    d.rounded_rectangle([720, 150, 1200, 320], radius=10, outline=brand, width=2)
    for i, (key, value) in enumerate(meta):
        d.text((740, 165 + i * 30), key, font=h3, fill=muted)
        d.text((920, 165 + i * 30), value, font=body, fill=ink)
    d.text((40, 230), buyer, font=body, fill=ink)
    d.text((40, 260), ship_to, font=body, fill=ink)

    # --- item table with thumbnails and deliberately misaligned columns ----
    d.rectangle([40, 360, 1200, 404], fill=brand)
    for x, label in columns:
        d.text((x, 370), label, font=h3, fill=(255, 255, 255))
    # Per-row horizontal jitter; cycled so extra PO_ITEMS rows cannot index
    # past the end of the table.
    offsets = (-14, 22, 6)
    y = 420
    for i, (desc, qty, unit, tot) in enumerate(PO_ITEMS):
        off = offsets[i % len(offsets)]
        im.paste(_thumb(i), (52, y))
        d.text((160 + off, y + 24), desc, font=body, fill=row_ink)
        d.text((706 + off // 2, y + 24), str(qty), font=body, fill=row_ink)
        d.text((846 - off, y + 24), f"{unit:,.2f}", font=body, fill=row_ink)
        d.text((1042 + off, y + 24), f"{tot:,.2f}", font=body, fill=row_ink)
        d.line([(40, y + 88), (1200, y + 88)], fill=(210, 210, 215))
        y += 96

    # --- totals -------------------------------------------------------------
    (sub_key, sub_val), (tax_key, tax_val), (total_key, total_val) = totals
    d.text((840, y + 24), sub_key, font=h3, fill=muted)
    d.text((1042, y + 24), sub_val, font=body, fill=ink)
    d.text((840, y + 58), tax_key, font=h3, fill=muted)
    d.text((1042, y + 58), tax_val, font=body, fill=ink)
    d.rectangle([820, y + 92, 1200, y + 134], fill=(240, 244, 250))
    d.text((840, y + 100), total_key, font=h2, fill=brand)
    d.text((1042, y + 100), total_val, font=h2, fill=brand)

    # --- fake barcode (fixed seed) with its human-readable label -----------
    bx = 40
    bars = random.Random(5)
    for _ in range(60):
        wbar = bars.choice((2, 2, 3, 5))
        d.rectangle([bx, y + 40, bx + wbar, y + 110], fill=0)
        bx += wbar + bars.choice((2, 3))
    d.text((40, y + 116), barcode_label, font=small, fill=grey)

    # --- signature scribble and caption -------------------------------------
    sig = signature()
    im.paste(sig, (760, H - 220), sig)
    d.line([(740, H - 160), (1010, H - 160)], fill=grey, width=2)
    d.text((740, H - 150), signer, font=small, fill=grey)

    # --- rotated approval stamp over the table ------------------------------
    st = stamp(stamp_text, (20, 130, 60), 12)
    im.paste(st, (430, 560), st)

    # --- scan degradation: gaussian noise plus a slight skew ----------------
    a = np.asarray(im).astype(np.float32) + np.random.default_rng(3).normal(0, 5, (H, W, 3))
    im = Image.fromarray(np.clip(a, 0, 255).astype(np.uint8)).rotate(-0.7, expand=False, fillcolor=paper)
    im.save(OUT / "extreme_po_collage.png")

    # --- sidecar reference text, built from the same strings drawn above ----
    txt = [title, vendor, address]
    txt += [f"{key} {value}" for key, value in meta]
    txt += [buyer, ship_to, " ".join(label for _, label in columns)]
    txt += [f"{desc} {q} {u:,.2f} {t:,.2f}" for desc, q, u, t in PO_ITEMS]
    txt += [f"{key} {value}" for key, value in totals]
    txt += [barcode_label, stamp_text, signer]
    (OUT / "extreme_po_collage.txt").write_text("\n".join(txt) + "\n", encoding="utf-8")
    (OUT / "extreme_po_collage.gt.json").write_text(json.dumps(PO_GT, indent=2))
|
|
|
|
| |
# Structured ground truth for the faxed contract.
C_GT = dict(
    doc_type="contract",
    contract_number="MSA-2026-0481",
    title="Master Services Agreement - Store Fit-Out Program",
    party_a="Aperture Retail Group",
    party_b="Halcyon Build Partners LLC",
    effective_date="2026-03-01",
    expiration_date="2029-02-28",
    contract_value=1250000.00,
    currency="USD",
    governing_law="State of Ohio",
    auto_renew=False,
    termination_notice_days=60,
    _meta=dict(doc_type="contract", channel="fax", difficulty="extreme", skip_eval=True),
)

# Title line followed by the key-terms lines printed under it.
C_HEAD = [
    "MASTER SERVICES AGREEMENT - STORE FIT-OUT PROGRAM",
    "Contract No: MSA-2026-0481",
    "Party A: Aperture Retail Group Party B: Halcyon Build Partners LLC",
    "Effective Date: 2026-03-01 Expiration Date: 2029-02-28",
    "Total Contract Value: USD 1,250,000.00 Governing Law: State of Ohio",
    "Auto-Renewal: NO Termination Notice: 60 days written notice",
]

# Numbered clauses, one tuple of printed lines per clause.
_C_CLAUSES = (
    (
        "1. SCOPE. Contractor shall furnish all labor, materials, supervision and",
        "equipment required for the fit-out of retail premises identified in each",
        "Statement of Work executed under this Agreement.",
    ),
    (
        "2. TERM. This Agreement commences on the Effective Date and continues",
        "until the Expiration Date unless terminated earlier per Section 9.",
    ),
    (
        "3. COMPENSATION. Client shall pay Contractor fees not to exceed the",
        "Total Contract Value, payable per approved milestone invoices Net 30.",
    ),
    (
        "4. CHANGE ORDERS. No variation is binding unless documented in a",
        "written change order signed by both parties' authorized representatives.",
    ),
    (
        "5. WARRANTIES. Contractor warrants workmanship free of defects for",
        "twenty-four (24) months following practical completion of each site.",
    ),
    (
        "6. INSURANCE. Contractor shall maintain commercial general liability",
        "coverage of not less than USD 5,000,000 per occurrence.",
    ),
    (
        "7. CONFIDENTIALITY. Each party shall protect Confidential Information",
        "with no less than reasonable care and use it solely for this Agreement.",
    ),
    (
        "8. LIABILITY. Neither party is liable for indirect or consequential",
        "damages; aggregate liability is capped at the Total Contract Value.",
    ),
    (
        "9. TERMINATION. Either party may terminate for convenience upon sixty",
        "(60) days written notice, or immediately for uncured material breach.",
    ),
    (
        "10. GOVERNING LAW. This Agreement is governed by the laws of the",
        "State of Ohio, excluding its conflict of law provisions.",
    ),
)

# Flat list of body lines in printed order.
C_BODY = [line for clause in _C_CLAUSES for line in clause]
|
|
|
|
def gen_contract():
    """Render the faxed two-column contract and write its dataset files.

    Writes ``extreme_contract_fax.png``, ``.txt`` and ``.gt.json`` into ``OUT``.

    Page content: fax transmission header, ``C_HEAD``, ``C_BODY`` split over
    two columns (each padded with 14 truncated recital lines), two signature
    blocks, a rotated margin note and a RECEIVED stamp.  The page is then
    converted to greyscale and degraded (reduced contrast, gaussian and
    salt-and-pepper noise, periodic bright scanlines, skew, punch holes).

    The ``.txt`` sidecar lists EVERY string drawn on the page, in page order,
    not only ``C_HEAD + C_BODY``.  Leaving out the header, filler lines,
    signer captions, note and stamp would count correctly recognised text as
    insertion errors.  It is written as UTF-8 because the signer captions
    contain an em dash.
    """
    W, H = 1240, 1600

    # --- strings shared by the drawing and the sidecar ----------------------
    fax_header = "FAX TX 06/12/2026 14:22 FROM HALCYON BUILD +1 614 555 0188 P.01/07"
    recital = "WHEREAS the parties acknowledge the recitals set forth herein;"
    # 14 filler lines per column, cut to 58 / 54 / 50 characters in rotation.
    filler = [recital[: 58 - (k % 3) * 4] for k in range(14)]
    signers = [
        ("M. Okafor — Aperture Retail Group", "Chief Procurement Officer"),
        ("D. Reyes — Halcyon Build Partners LLC", "Managing Partner"),
    ]
    note_text = "legal OK -> route to CFO (June 5)"
    stamp_text = "RECEIVED JUN 05 2026"

    im = Image.new("RGB", (W, H), (255, 255, 255))
    d = ImageDraw.Draw(im)
    fh, fb, fs = font(26, True), font(17), font(14)

    # --- fax header strip, centred title, key-terms lines -------------------
    d.text((30, 18), fax_header, font=fs, fill=(60, 60, 60))
    d.line([(30, 44), (1210, 44)], fill=(60, 60, 60), width=2)
    tw = d.textlength(C_HEAD[0], font=fh)
    d.text(((W - tw) / 2, 70), C_HEAD[0], font=fh, fill=(15, 15, 15))
    y = 130
    for ln in C_HEAD[1:]:
        d.text((80, y), ln, font=fb, fill=(20, 20, 20))
        y += 30
    d.line([(60, y + 8), (1180, y + 8)], fill=(120, 120, 120), width=2)

    # --- two body columns, each followed by the filler recitals -------------
    half = (len(C_BODY) + 1) // 2
    columns = [C_BODY[:half], C_BODY[half:]]
    for col, lines in enumerate(columns):
        x = 70 + col * 590
        yy = y + 34
        for ln in lines:
            d.text((x, yy), ln, font=fs, fill=(25, 25, 25))
            yy += 24
        for ln in filler:
            d.text((x, yy), ln, font=fs, fill=(45, 45, 45))
            yy += 24

    # --- signature blocks ----------------------------------------------------
    sy = H - 300
    for col, (name, role) in enumerate(signers):
        x = 90 + col * 600
        sig = signature(color=(20, 20, 20))
        im.paste(sig, (x, sy), sig)
        d.line([(x, sy + 70), (x + 420, sy + 70)], fill=(40, 40, 40), width=2)
        d.text((x, sy + 80), name, font=fs, fill=(30, 30, 30))
        d.text((x, sy + 102), role, font=fs, fill=(90, 90, 90))

    # --- margin note and RECEIVED stamp, pasted over the body text ----------
    note = Image.new("RGBA", (430, 60), (0, 0, 0, 0))
    ImageDraw.Draw(note).text((0, 8), note_text, font=font(24), fill=(28, 40, 160, 220))
    note = note.rotate(-3, expand=True, resample=Image.BICUBIC)
    im.paste(note, (700, 360), note)
    st = stamp(stamp_text, (180, 30, 30), -14, size=(420, 110), fsz=30)
    im.paste(st, (90, 430), st)

    # --- fax degradation ------------------------------------------------------
    gray = im.convert("L")
    a = np.asarray(gray).astype(np.float32)
    a = 255 - (255 - a) * 0.62  # compress contrast towards white
    nz = np.random.default_rng(11)
    a += nz.normal(0, 9, a.shape)
    pepper = nz.random(a.shape)
    a[pepper < 0.004] = 30   # dark specks
    a[pepper > 0.997] = 245  # light specks
    for yy in range(0, H, 90):
        # Two-pixel bright scanline every 90 rows.
        a[yy:yy + 2, :] = np.clip(a[yy:yy + 2, :] * 1.25, 0, 255)
    img = Image.fromarray(np.clip(a, 0, 255).astype(np.uint8)).rotate(1.3, expand=False, fillcolor=235)
    d2 = ImageDraw.Draw(img)
    for hy in (H // 4, 3 * H // 4):
        # Punch holes on the left margin.
        d2.ellipse([18, hy - 22, 62, hy + 22], fill=246, outline=140, width=3)
    img.convert("RGB").save(OUT / "extreme_contract_fax.png")

    # --- sidecar reference text: everything drawn above, in page order ------
    reference = [fax_header, *C_HEAD]
    for lines in columns:
        reference += lines + filler
    for name, role in signers:
        reference += [name, role]
    reference += [note_text, stamp_text]
    (OUT / "extreme_contract_fax.txt").write_text("\n".join(reference) + "\n", encoding="utf-8")
    (OUT / "extreme_contract_fax.gt.json").write_text(json.dumps(C_GT, indent=2))
|
|
|
|
if __name__ == "__main__":
    # Make sure the dataset directory exists, then build the three samples.
    # The order matters: the generators share the module-level seeded ``rng``.
    OUT.mkdir(parents=True, exist_ok=True)
    for build in (gen_receipt, gen_po, gen_contract):
        build()
    sample_ids = ("extreme_receipt_photo", "extreme_po_collage", "extreme_contract_fax")
    for sid in sample_ids:
        print(f" wrote {OUT / sid}.png (+ .gt.json + .txt)")
|
|