#!/usr/bin/env python3
"""Generate a synthetic-but-realistic document corpus with paired ground truth.

Covers the axes the reference docs care about:
  • type:       invoice · purchase_order · contract · receipt ·
                subscription_memo · complex_form
  • channel:    digital (PDF text layer) · scanned (PNG, no text layer + .txt sidecar)
  • difficulty: standard · multicurrency · dense_table · missing_fields · complex
                (no multipage document is generated by the current corpus)

Outputs go to backend/evals/datasets/:
  {id}.pdf | {id}.png   the document
  {id}.txt              sidecar OCR text (scanned only, drives the OCR fallback)
  {id}.gt.json          ground-truth fields + _meta

Two hero samples are additionally copied to samples/ for the README/demo.

Run:  python scripts/generate_samples.py   (from repo root)
"""
from __future__ import annotations

import json
import sys
from pathlib import Path

REPO_ROOT = Path(__file__).resolve().parent.parent
BACKEND = REPO_ROOT / "backend"
sys.path.insert(0, str(BACKEND))

DATASET_DIR = BACKEND / "evals" / "datasets"
SHOWCASE_DIR = REPO_ROOT / "samples"

# Single source of truth for the currency symbols printed in rendered documents.
_CURRENCY_SYMBOLS = {"USD": "$", "EUR": "€", "GBP": "£"}

# Monospace fonts tried in order when rasterising "scanned" documents.  Menlo
# stays first so output on macOS is unchanged; the remaining entries keep the
# table columns aligned (and "€" renderable) on other platforms.
_MONO_FONT_CANDIDATES = (
    "/System/Library/Fonts/Menlo.ttc",
    "/usr/share/fonts/truetype/dejavu/DejaVuSansMono.ttf",
    "/usr/share/fonts/dejavu/DejaVuSansMono.ttf",
    "/usr/share/fonts/TTF/DejaVuSansMono.ttf",
    "C:/Windows/Fonts/consola.ttf",
    "DejaVuSansMono.ttf",
)


def _currency_symbol(code: str) -> str:
    """Return the printed symbol for an ISO currency code ("$" if unknown)."""
    return _CURRENCY_SYMBOLS.get(code, "$")


def _format_percent(fraction: float) -> str:
    """Format a fractional rate (0.19) as a percent number without the sign ("19").

    Rounding to two decimals absorbs binary float error (0.29 * 100 is
    28.999999999999996) and keeps fractional rates such as 8.25 intact.
    """
    return f"{round(fraction * 100, 2):g}"


# --- document specifications --------------------------------------------------

def _items(rows):
    """Expand ``(description, quantity, unit_price)`` tuples into line-item dicts.

    ``line_total`` is ``quantity * unit_price`` rounded to two decimals.
    """
    line_items = []
    for description, quantity, unit_price in rows:
        line_items.append({
            "description": description,
            "quantity": quantity,
            "unit_price": unit_price,
            "line_total": round(quantity * unit_price, 2),
        })
    return line_items


def corpus() -> list[dict]:
    """Return the specifications of every synthetic document.

    Each spec is a dict with ``id``, ``type``, ``channel``, ``difficulty`` and
    ``fields``.  Optional keys: ``omit`` (field names left out of both the
    rendered text and the ground truth) and ``skip_eval`` (copied into the
    ground-truth ``_meta``).
    """

    def _cfi(code, desc, cat, qty, up):
        # Line item of a complex-form vendor quote (adds item code + category).
        return {"item_code": code, "description": desc, "category": cat,
                "quantity": qty, "unit_price": up, "line_total": round(qty * up, 2)}

    docs: list[dict] = []

    # ---- invoices ----
    docs.append({
        "id": "invoice_acme_digital",
        "type": "invoice",
        "channel": "digital",
        "difficulty": "standard",
        "fields": {
            "invoice_number": "INV-1001",
            "issue_date": "2026-07-15",
            "due_date": "2026-08-14",
            "vendor_name": "Acme Industrial Supplies",
            "bill_to_name": "Globex Corporation",
            "currency": "USD",
            "subtotal": 300.00,
            "tax_amount": 30.00,
            "total": 330.00,
            "line_items": _items([("Steel Bolts M8", 100, 1.20),
                                  ("Hex Nuts M8", 200, 0.40),
                                  ("Washers", 100, 1.00)]),
        },
    })
    docs.append({
        "id": "invoice_globalparts_eur",
        "type": "invoice",
        "channel": "digital",
        "difficulty": "multicurrency",
        "fields": {
            "invoice_number": "GP-2026-558",
            "issue_date": "2026-05-03",
            "due_date": "2026-06-02",
            "vendor_name": "GlobalParts GmbH",
            "bill_to_name": "Initech LLC",
            "currency": "EUR",
            "subtotal": 1840.00,
            "tax_amount": 349.60,
            "total": 2189.60,
            "line_items": _items([("Bearing assembly", 8, 180.00),
                                  ("Drive belt", 20, 20.00)]),
        },
    })
    docs.append({
        "id": "invoice_scanned_basic",
        "type": "invoice",
        "channel": "scanned",
        "difficulty": "standard",
        "fields": {
            "invoice_number": "INV-7741",
            "issue_date": "2026-03-22",
            "due_date": "2026-04-21",
            "vendor_name": "Northwind Traders",
            "bill_to_name": "Contoso Ltd",
            "currency": "USD",
            "subtotal": 540.00,
            "tax_amount": 43.20,
            "total": 583.20,
            "line_items": _items([("Office chairs", 6, 90.00)]),
        },
    })
    docs.append({
        "id": "invoice_missing_total",
        "type": "invoice",
        "channel": "digital",
        "difficulty": "missing_fields",
        "fields": {
            # total intentionally absent → should be flagged + routed to HITL
            "invoice_number": "INV-3300",
            "issue_date": "2026-02-10",
            "vendor_name": "Stark Components",
            "currency": "USD",
            "subtotal": 1200.00,
            "tax_amount": 96.00,
            "line_items": _items([("Servo motor", 2, 600.00)]),
        },
        "omit": ["total", "due_date", "bill_to_name"],
    })
    docs.append({
        "id": "invoice_dense_table",
        "type": "invoice",
        "channel": "digital",
        "difficulty": "dense_table",
        "fields": {
            "invoice_number": "INV-9120",
            "issue_date": "2026-06-01",
            "due_date": "2026-07-01",
            "vendor_name": "Wayne Enterprises",
            "bill_to_name": "Oscorp",
            "currency": "USD",
            "subtotal": 2140.00,
            "tax_amount": 214.00,
            "total": 2354.00,
            "line_items": _items([
                ("Aluminium sheet 2mm", 30, 24.00),
                ("Copper wire 10m", 40, 8.00),
                ("Circuit board v2", 50, 12.00),
                ("Capacitor pack", 100, 2.20),
                ("Resistor pack", 100, 1.00),
                ("LED array", 60, 3.00),
            ]),
        },
    })

    # ---- purchase orders ----
    docs.append({
        "id": "po_acme_digital",
        "type": "purchase_order",
        "channel": "digital",
        "difficulty": "standard",
        "fields": {
            "order_number": "PO-100481",
            "order_date": "2026-07-02",
            "delivery_date": "2026-07-20",
            "vendor_name": "Acme Industrial",
            "buyer_name": "Globex Procurement",
            "ship_to": "12 Industrial Way, Springfield",
            "currency": "USD",
            "subtotal": 12000.00,
            "tax_amount": 450.00,
            "total": 12450.00,
            "payment_terms": "Net 30",
            "line_items": _items([("CNC spindle", 2, 5000.00),
                                  ("Tool set", 4, 500.00)]),
        },
    })
    docs.append({
        "id": "po_scanned",
        "type": "purchase_order",
        "channel": "scanned",
        "difficulty": "standard",
        "fields": {
            "order_number": "PO-100483",
            "order_date": "2026-04-11",
            "delivery_date": "2026-05-01",
            "vendor_name": "Initech Supplies",
            "buyer_name": "Contoso Ops",
            "ship_to": "9 Market St, Metropolis",
            "currency": "USD",
            "subtotal": 900.00,
            "tax_amount": 80.00,
            "total": 980.00,
            "payment_terms": "Net 15",
            "line_items": _items([("Printer paper (box)", 20, 45.00)]),
        },
    })

    # ---- contracts ----
    docs.append({
        "id": "contract_msa_digital",
        "type": "contract",
        "channel": "digital",
        "difficulty": "standard",
        "fields": {
            "contract_number": "MSA-2026-014",
            "title": "Master Services Agreement",
            "party_a": "Acme Industrial Supplies",
            "party_b": "Globex Corporation",
            "effective_date": "2026-01-01",
            "expiration_date": "2027-12-31",
            "contract_value": 250000.00,
            "currency": "USD",
            "governing_law": "Delaware",
            "auto_renew": True,
            "termination_notice_days": 60,
        },
    })
    docs.append({
        "id": "contract_scanned",
        "type": "contract",
        "channel": "scanned",
        "difficulty": "standard",
        "fields": {
            "contract_number": "NDA-7781",
            "title": "Mutual Non-Disclosure Agreement",
            "party_a": "Stark Components",
            "party_b": "Wayne Enterprises",
            "effective_date": "2026-03-15",
            "expiration_date": "2029-03-14",
            "contract_value": 0.0,
            "currency": "USD",
            "governing_law": "New York",
            "auto_renew": False,
            "termination_notice_days": 30,
        },
    })

    # ---- receipts ----
    docs.append({
        "id": "receipt_digital",
        "type": "receipt",
        "channel": "digital",
        "difficulty": "standard",
        "fields": {
            "merchant": "City Hardware",
            "date": "2026-06-18",
            "currency": "USD",
            "subtotal": 47.00,
            "tax_amount": 3.76,
            "total": 50.76,
            "payment_method": "Visa card ending 4242",
            "line_items": _items([("Paint 1L", 2, 18.00),
                                  ("Brush set", 1, 11.00)]),
        },
    })
    docs.append({
        "id": "receipt_scanned",
        "type": "receipt",
        "channel": "scanned",
        "difficulty": "standard",
        "fields": {
            "merchant": "QuickMart",
            "date": "2026-05-30",
            "currency": "USD",
            "subtotal": 23.50,
            "tax_amount": 1.88,
            "total": 25.38,
            "payment_method": "Cash",
            "line_items": _items([("Coffee", 5, 3.50), ("Snacks", 2, 3.00)]),
        },
    })

    # ---- subscription memos ----
    docs.append({
        "id": "subscription_memo_pos",
        "type": "subscription_memo",
        "channel": "digital",
        "difficulty": "standard",
        "fields": {
            "memo_number": "SUB-2026-0091",
            "subscription_name": "POS Cloud Platform",
            "vendor_name": "Initech Supplies",
            "account_id": "ACC-55821",
            "plan": "Enterprise (500 lanes)",
            "billing_cycle": "annual",
            "start_date": "2025-08-01",
            "renewal_date": "2026-08-01",
            "amount": 84000.00,
            "currency": "USD",
            "auto_renew": True,
            "status": "active",
            "notes": "Price locked for 2 years; 60-day cancellation notice required.",
        },
    })
    docs.append({
        "id": "subscription_memo_scanned",
        "type": "subscription_memo",
        "channel": "scanned",
        "difficulty": "standard",
        "fields": {
            "memo_number": "SUB-2026-0145",
            "subscription_name": "Store Wi-Fi & Analytics",
            "vendor_name": "GlobalParts GmbH",
            "account_id": "ACC-77310",
            "plan": "Standard",
            "billing_cycle": "monthly",
            "start_date": "2026-01-15",
            "renewal_date": "2026-07-15",
            "amount": 2500.00,
            "currency": "EUR",
            "auto_renew": False,
            "status": "pending",
            "notes": "Evaluate before renewal; usage below tier.",
        },
    })

    # ---- complex multi-layer form (hybrid offline+online demo) ----
    docs.append({
        "id": "complex_capex_requisition",
        "type": "complex_form",
        "channel": "digital",
        "difficulty": "complex",
        "skip_eval": True,
        "fields": {
            "form_title": "CAPITAL EXPENDITURE REQUEST & VENDOR QUOTE COMPARISON",
            "form_number": "CAPEX-2026-0457",
            "request_date": "2026-05-12",
            "department": "Manufacturing Operations",
            "cost_center": "CC-4400",
            "priority": "High",
            "requestor": {"name": "Dana Whitfield", "employee_id": "E-20455",
                          "email": "dana.whitfield@globex.example",
                          "manager": "Priya Anand"},
            "project": {"name": "Line 3 Robotics Upgrade", "code": "PRJ-LINE3",
                        "budget_code": "BUD-2026-CAP-11",
                        "justification": "Replace end-of-life pick-and-place cells to raise throughput 18%."},
            "vendor_quotes": [
                {"vendor_name": "Acme Industrial Supplies", "vendor_id": "V-ACME-001",
                 "quote_number": "Q-AC-9981", "valid_until": "2026-06-30",
                 "currency": "USD",
                 "line_items": [_cfi("RB-500", "6-axis robot arm", "equipment", 2, 38000.00),
                                _cfi("EOAT-12", "End-of-arm tooling", "tooling", 2, 4500.00),
                                _cfi("INST-01", "Installation and calibration", "service", 1, 12000.00)],
                 "subtotal": 97000.00, "tax_rate": 0.08, "tax_amount": 7760.00,
                 "shipping": 1500.00, "total": 106260.00, "recommended": False},
                {"vendor_name": "GlobalParts GmbH", "vendor_id": "V-GLOB-014",
                 "quote_number": "Q-GP-4471", "valid_until": "2026-06-15",
                 "currency": "EUR",
                 "line_items": [_cfi("RB-EU", "6-axis robot arm", "equipment", 2, 34000.00),
                                _cfi("TOOL-9", "Tooling set", "tooling", 2, 5000.00),
                                _cfi("SVC-2", "Commissioning", "service", 1, 15000.00)],
                 "subtotal": 93000.00, "tax_rate": 0.19, "tax_amount": 17670.00,
                 "shipping": 2000.00, "total": 112670.00, "recommended": False},
                {"vendor_name": "Initech Supplies", "vendor_id": "V-INIT-007",
                 "quote_number": "Q-IN-3320", "valid_until": "2026-07-10",
                 "currency": "USD",
                 "line_items": [_cfi("RB-INI", "Robot cell", "equipment", 2, 36000.00),
                                _cfi("TLG-3", "Gripper kit", "tooling", 2, 4000.00),
                                _cfi("SET-1", "Setup", "service", 1, 9000.00)],
                 "subtotal": 89000.00, "tax_rate": 0.08, "tax_amount": 7120.00,
                 "shipping": 1200.00, "total": 97320.00, "recommended": True},
            ],
            "selected_vendor": "Initech Supplies",
            "selected_total": 97320.00,
            "currency": "USD",
            "grand_total": 97320.00,
            "approvals": [
                {"role": "Department Head", "name": "Priya Anand", "decision": "Approved", "date": "2026-05-14"},
                {"role": "Finance", "name": "Marcus Lee", "decision": "Approved", "date": "2026-05-15"},
                {"role": "Procurement", "name": "Sofia Reyes", "decision": "Pending", "date": "-"},
            ],
            "compliance": {"three_quotes_obtained": True, "budget_approved": True,
                           "sole_source_justified": False, "vendor_vetted": True},
            "notes": "Delivery required before Q3 production ramp.",
        },
    })
    return docs


# --- text layout rendering ----------------------------------------------------

def render_lines(doc: dict) -> list[str]:
    """Render a document spec into the text lines printed on the page.

    Fields named in ``doc["omit"]`` are left out (invoice ``due_date``,
    ``bill_to_name`` and ``total`` honour this).  An unrecognised ``type``
    yields an empty list.
    """
    f = doc["fields"]
    t = doc["type"]
    omit = set(doc.get("omit", []))
    cur_sym = _currency_symbol(f.get("currency", "USD"))

    def money(v):
        return f"{cur_sym}{v:,.2f}"

    def item_table():
        # Blank separator, header row, then one aligned row per line item.
        table = ["", _row("Description", "Qty", "Unit Price", "Amount")]
        for it in f.get("line_items", []):
            table.append(_row(it["description"], str(int(it["quantity"])),
                              money(it["unit_price"]), money(it["line_total"])))
        return table

    if t == "invoice":
        L = ["INVOICE", "",
             f"Invoice Number: {f['invoice_number']}",
             f"Invoice Date: {f['issue_date']}"]
        if "due_date" not in omit and f.get("due_date"):
            L.append(f"Due Date: {f['due_date']}")
        L.append(f"From: {f['vendor_name']}")
        if "bill_to_name" not in omit and f.get("bill_to_name"):
            L.append(f"Bill To: {f['bill_to_name']}")
        L.append(f"Currency: {f['currency']}")
        L += item_table()
        L += ["", f"Subtotal: {money(f['subtotal'])}"]
        # Derive the printed rate from the amounts so the label never
        # contradicts the figures (the corpus mixes 8%, 10% and 19% invoices).
        if f.get("subtotal"):
            rate = _format_percent(f["tax_amount"] / f["subtotal"])
            L.append(f"Tax ({rate}%): {money(f['tax_amount'])}")
        else:
            L.append(f"Tax: {money(f['tax_amount'])}")
        if "total" not in omit and f.get("total") is not None:
            L.append(f"Total: {money(f['total'])}")
        return L

    if t == "purchase_order":
        L = ["PURCHASE ORDER", "",
             f"Purchase Order Number: {f['order_number']}",
             f"Order Date: {f['order_date']}",
             f"Delivery Date: {f['delivery_date']}",
             f"Vendor: {f['vendor_name']}",
             f"Buyer: {f['buyer_name']}",
             f"Ship To: {f['ship_to']}",
             f"Payment Terms: {f['payment_terms']}",
             f"Currency: {f['currency']}"]
        L += item_table()
        L += ["",
              f"Subtotal: {money(f['subtotal'])}",
              f"Tax: {money(f['tax_amount'])}",
              f"Total: {money(f['total'])}"]
        return L

    if t == "contract":
        L = [f["title"].upper(), "",
             f"Contract Number: {f['contract_number']}",
             f"This Agreement is entered into between {f['party_a']} and {f['party_b']}.",
             f"Effective Date: {f['effective_date']}",
             f"Expiration Date: {f['expiration_date']}"]
        # A zero/absent value prints no line at all (e.g. the NDA sample).
        if f.get("contract_value"):
            L.append(f"Contract Value: {money(f['contract_value'])}")
        L.append(f"Governing Law: {f['governing_law']}")
        if f.get("auto_renew"):
            L.append("This agreement shall automatically renew.")
        else:
            L.append("This agreement has no automatic renewal.")
        L.append(f"Either party may terminate upon {f['termination_notice_days']} days written notice.")
        return L

    if t == "receipt":
        L = ["RECEIPT", "",
             f"Merchant: {f['merchant']}",
             f"Date: {f['date']}",
             f"Currency: {f['currency']}"]
        L += item_table()
        L += ["",
              f"Subtotal: {money(f['subtotal'])}",
              f"Tax: {money(f['tax_amount'])}",
              f"Total: {money(f['total'])}",
              f"Paid by: {f['payment_method']}"]
        return L

    if t == "subscription_memo":
        return ["SUBSCRIPTION MEMO", "",
                f"Memo Number: {f['memo_number']}",
                f"Subscription: {f['subscription_name']}",
                f"Vendor: {f['vendor_name']}",
                f"Account ID: {f['account_id']}",
                f"Plan: {f['plan']}",
                f"Billing Cycle: {f['billing_cycle']}",
                f"Start Date: {f['start_date']}",
                f"Renewal Date: {f['renewal_date']}",
                f"Amount: {money(f['amount'])}",
                f"Currency: {f['currency']}",
                "Auto-Renew: " + ("Yes" if f["auto_renew"] else "No"),
                f"Status: {f['status']}",
                f"Notes: {f['notes']}"]

    if t == "complex_form":
        return list(render_complex_form(f))

    return []


def _cf_row(code, desc, cat, qty, price, amt):
    """Format one fixed-width row of a complex-form quote table."""
    left = f"{code:<10}{desc:<30}{cat:<14}{qty:>4}"
    return f"{left} {price:>12} {amt:>12}"


def render_complex_form(f: dict) -> list[str]:
    """Render the fields of a ``complex_form`` document into text lines.

    Sections, in order: header, requestor, project, one block per vendor
    quote (with its own currency symbol), approvals table, compliance
    checklist, notes.  Amounts here are printed without thousands separators.
    """
    # Print the form's own title; fall back to the historical literal.
    title = f.get("form_title", "CAPITAL EXPENDITURE REQUEST & VENDOR QUOTE COMPARISON")
    L = [title, "",
         f"Form Number: {f['form_number']}",
         f"Request Date: {f['request_date']}",
         f"Department: {f['department']}",
         f"Cost Center: {f['cost_center']}",
         f"Priority: {f['priority']}",
         ""]

    requestor = f["requestor"]
    L += ["REQUESTOR",
          f"Name: {requestor['name']}",
          f"Employee ID: {requestor['employee_id']}",
          f"Email: {requestor['email']}",
          f"Manager: {requestor['manager']}",
          ""]

    project = f["project"]
    L += ["PROJECT",
          f"Name: {project['name']}",
          f"Code: {project['code']}",
          f"Budget Code: {project['budget_code']}",
          f"Justification: {project['justification']}",
          ""]

    for number, quote in enumerate(f["vendor_quotes"], start=1):
        sym = _currency_symbol(quote["currency"])
        L += [f"VENDOR QUOTE {number}",
              f"Vendor: {quote['vendor_name']}",
              f"Vendor ID: {quote['vendor_id']}",
              f"Quote Number: {quote['quote_number']}",
              f"Valid Until: {quote['valid_until']}",
              f"Currency: {quote['currency']}",
              _cf_row("Item Code", "Description", "Category", "Qty", "Unit Price", "Amount")]
        for it in quote["line_items"]:
            L.append(_cf_row(it["item_code"], it["description"], it["category"],
                             str(int(it["quantity"])),
                             f"{it['unit_price']:.2f}", f"{it['line_total']:.2f}"))
        # Round rather than truncate: int(0.29 * 100) would print 28%.
        rate = _format_percent(quote["tax_rate"])
        L += [f"Subtotal: {sym}{quote['subtotal']:.2f}",
              f"Tax ({rate}%): {sym}{quote['tax_amount']:.2f}",
              f"Shipping: {sym}{quote['shipping']:.2f}",
              f"Total: {sym}{quote['total']:.2f}",
              ""]

    L += ["APPROVALS", f"{'Role':<20}{'Name':<18}{'Decision':<12}Date"]
    for approval in f["approvals"]:
        L.append(f"{approval['role']:<20}{approval['name']:<18}"
                 f"{approval['decision']:<12}{approval['date']}")

    L += ["", "COMPLIANCE CHECKLIST"]
    checklist = f["compliance"]
    for key, label in (
        ("three_quotes_obtained", "Three competitive quotes obtained"),
        ("budget_approved", "Budget approved"),
        ("sole_source_justified", "Sole-source justification attached"),
        ("vendor_vetted", "Vendor vetted / on approved list"),
    ):
        mark = "X" if checklist[key] else " "
        L.append(f"[{mark}] {label}")
    L.append("")

    L.append(f"Notes: {f['notes']}")
    return L


def _row(desc: str, qty: str, price: str, amount: str) -> str:
    """Format one fixed-width row of an invoice/PO/receipt line-item table."""
    left = f"{desc:<28}{qty:>5}"
    return f"{left} {price:>12} {amount:>12}"


# --- ground truth -------------------------------------------------------------

def ground_truth(doc: dict) -> dict:
    """Build the ``.gt.json`` payload for a document spec.

    Returns a shallow copy of ``doc["fields"]`` without the ``omit`` keys,
    plus ``doc_type`` and a ``_meta`` dict (``doc_type``, ``channel``,
    ``difficulty`` and, when set on the spec, ``skip_eval``).  Nested values
    are shared with the spec, not copied.
    """
    omitted = doc.get("omit", [])
    truth = {key: value for key, value in doc["fields"].items() if key not in omitted}
    truth["doc_type"] = doc["type"]
    meta = {"doc_type": doc["type"], "channel": doc["channel"],
            "difficulty": doc["difficulty"]}
    if doc.get("skip_eval"):
        meta["skip_eval"] = True
    truth["_meta"] = meta
    return truth


# --- renderers ----------------------------------------------------------------

def write_pdf(path: Path, lines: list[str]) -> bool:
    """Write ``lines`` as a Courier 10pt letter-size PDF with a text layer.

    Starts a new page whenever the cursor drops below the bottom margin.
    Returns ``False`` without writing anything when reportlab is missing.
    """
    try:
        from reportlab.lib.pagesizes import letter
        from reportlab.pdfgen import canvas
    except ImportError:
        return False

    pdf = canvas.Canvas(str(path), pagesize=letter)
    pdf.setFont("Courier", 10)
    _, page_height = letter
    top = page_height - 60
    y = top
    for line in lines:
        if y < 60:
            pdf.showPage()
            # Re-select the font after the page break, as the original code did.
            pdf.setFont("Courier", 10)
            y = top
        pdf.drawString(50, y, line)
        y -= 15
    pdf.save()
    return True


def write_image(path: Path, lines: list[str]) -> bool:
    """Rasterise ``lines`` to a white 850px-wide image (no text layer).

    Tries the monospace fonts in ``_MONO_FONT_CANDIDATES`` and only then falls
    back to Pillow's built-in default font.  Returns ``False`` without writing
    anything when Pillow is missing.
    """
    try:
        from PIL import Image, ImageDraw, ImageFont
    except ImportError:
        return False

    width, height = 850, max(400, 60 + 20 * len(lines))
    img = Image.new("RGB", (width, height), "white")
    draw = ImageDraw.Draw(img)

    font = None
    for candidate in _MONO_FONT_CANDIDATES:
        try:
            font = ImageFont.truetype(candidate, 13)
        except (OSError, ImportError):
            # Font file absent (OSError) or FreeType support not built in.
            continue
        break
    if font is None:
        # NOTE(review): the built-in font is not monospaced and may not cover
        # "€" on every Pillow version; verify if this branch is ever hit.
        font = ImageFont.load_default()

    y = 30
    for line in lines:
        draw.text((40, y), line, fill="black", font=font)
        y += 20
    img.save(str(path))
    return True


def main() -> None:
    """Generate every corpus document, its ground truth, and the hero copies."""
    for directory in (DATASET_DIR, SHOWCASE_DIR):
        directory.mkdir(parents=True, exist_ok=True)

    made = 0
    for doc in corpus():
        lines = render_lines(doc)
        stem = DATASET_DIR / doc["id"]

        if doc["channel"] == "digital":
            ok = write_pdf(stem.with_suffix(".pdf"), lines)
            kind = "PDF"
        else:
            ok = write_image(stem.with_suffix(".png"), lines)
            # sidecar text = what OCR would read (drives the offline OCR fallback)
            stem.with_suffix(".txt").write_text("\n".join(lines), encoding="utf-8")
            kind = "PNG+OCR-sidecar"

        # Ground truth is written even when the render library is missing.
        gt = ground_truth(doc)
        stem.with_suffix(".gt.json").write_text(json.dumps(gt, indent=2), encoding="utf-8")
        made += 1
        print(f" [{doc['channel']:<8}] {doc['id']:<28} {kind}{'' if ok else ' (render lib missing!)'}")

    # copy two hero samples to samples/ for the README/demo
    for hero in ("invoice_acme_digital", "po_acme_digital"):
        for ext in (".pdf", ".png", ".gt.json", ".txt"):
            src = DATASET_DIR / f"{hero}{ext}"
            if src.exists():
                (SHOWCASE_DIR / src.name).write_bytes(src.read_bytes())

    print(f"\nGenerated {made} documents → {DATASET_DIR}")


if __name__ == "__main__":
    main()