| |
| """Generate a synthetic-but-realistic document corpus with paired ground truth. |
| |
| Covers the axes the reference docs care about: |
| • type: invoice · purchase_order · contract · receipt · subscription_memo · complex_form |
| • channel: digital (PDF text layer) · scanned (PNG, no text layer + .txt sidecar) |
| • difficulty: standard · multicurrency · dense_table · missing_fields · multipage · complex |
| |
| Outputs go to backend/evals/datasets/: |
| <id>.pdf | <id>.png the document |
| <id>.txt sidecar OCR text (for scanned/photo, drives the OCR fallback) |
| <id>.gt.json ground-truth fields + _meta |
| |
| Run: python scripts/generate_samples.py (from repo root) |
| """ |
| from __future__ import annotations |
|
|
| import json |
| import sys |
| from pathlib import Path |
|
|
# All locations are derived from this file's own path, so the current
# working directory does not influence where output is written.
_SCRIPT_DIR = Path(__file__).resolve().parent
REPO_ROOT = _SCRIPT_DIR.parent
BACKEND = REPO_ROOT.joinpath("backend")

# Put the backend directory first on the import search path.
sys.path.insert(0, str(BACKEND))

# Where the generated corpus is written, and where showcase copies go.
DATASET_DIR = BACKEND.joinpath("evals", "datasets")
SHOWCASE_DIR = REPO_ROOT.joinpath("samples")
|
|
|
|
| |
|
|
def _items(rows):
    """Expand ``(description, quantity, unit_price)`` tuples into line-item dicts.

    Every dict also carries ``line_total``: quantity times unit price,
    rounded to two decimals.
    """
    line_items = []
    for description, quantity, unit_price in rows:
        line_items.append({
            "description": description,
            "quantity": quantity,
            "unit_price": unit_price,
            "line_total": round(quantity * unit_price, 2),
        })
    return line_items
|
|
|
|
def corpus() -> list[dict]:
    """Return the specification of every synthetic document, in generation order.

    Each spec is a dict with ``id``, ``type``, ``channel``, ``difficulty`` and
    ``fields`` (the ground-truth values). A spec may additionally carry
    ``omit`` (a list of field names) or ``skip_eval`` (a flag).
    """

    def _cfi(code, desc, cat, qty, up):
        # One line item of a complex-form vendor quote; line_total is derived.
        return {
            "item_code": code,
            "description": desc,
            "category": cat,
            "quantity": qty,
            "unit_price": up,
            "line_total": round(qty * up, 2),
        }

    invoices = [
        {
            "id": "invoice_acme_digital",
            "type": "invoice",
            "channel": "digital",
            "difficulty": "standard",
            "fields": {
                "invoice_number": "INV-1001",
                "issue_date": "2026-07-15",
                "due_date": "2026-08-14",
                "vendor_name": "Acme Industrial Supplies",
                "bill_to_name": "Globex Corporation",
                "currency": "USD",
                "subtotal": 300.00,
                "tax_amount": 30.00,
                "total": 330.00,
                "line_items": _items([
                    ("Steel Bolts M8", 100, 1.20),
                    ("Hex Nuts M8", 200, 0.40),
                    ("Washers", 100, 1.00),
                ]),
            },
        },
        {
            "id": "invoice_globalparts_eur",
            "type": "invoice",
            "channel": "digital",
            "difficulty": "multicurrency",
            "fields": {
                "invoice_number": "GP-2026-558",
                "issue_date": "2026-05-03",
                "due_date": "2026-06-02",
                "vendor_name": "GlobalParts GmbH",
                "bill_to_name": "Initech LLC",
                "currency": "EUR",
                "subtotal": 1840.00,
                "tax_amount": 349.60,
                "total": 2189.60,
                "line_items": _items([
                    ("Bearing assembly", 8, 180.00),
                    ("Drive belt", 20, 20.00),
                ]),
            },
        },
        {
            "id": "invoice_scanned_basic",
            "type": "invoice",
            "channel": "scanned",
            "difficulty": "standard",
            "fields": {
                "invoice_number": "INV-7741",
                "issue_date": "2026-03-22",
                "due_date": "2026-04-21",
                "vendor_name": "Northwind Traders",
                "bill_to_name": "Contoso Ltd",
                "currency": "USD",
                "subtotal": 540.00,
                "tax_amount": 43.20,
                "total": 583.20,
                "line_items": _items([("Office chairs", 6, 90.00)]),
            },
        },
        {
            "id": "invoice_missing_total",
            "type": "invoice",
            "channel": "digital",
            "difficulty": "missing_fields",
            "fields": {
                "invoice_number": "INV-3300",
                "issue_date": "2026-02-10",
                "vendor_name": "Stark Components",
                "currency": "USD",
                "subtotal": 1200.00,
                "tax_amount": 96.00,
                "line_items": _items([("Servo motor", 2, 600.00)]),
            },
            "omit": ["total", "due_date", "bill_to_name"],
        },
        {
            "id": "invoice_dense_table",
            "type": "invoice",
            "channel": "digital",
            "difficulty": "dense_table",
            "fields": {
                "invoice_number": "INV-9120",
                "issue_date": "2026-06-01",
                "due_date": "2026-07-01",
                "vendor_name": "Wayne Enterprises",
                "bill_to_name": "Oscorp",
                "currency": "USD",
                "subtotal": 2140.00,
                "tax_amount": 214.00,
                "total": 2354.00,
                "line_items": _items([
                    ("Aluminium sheet 2mm", 30, 24.00),
                    ("Copper wire 10m", 40, 8.00),
                    ("Circuit board v2", 50, 12.00),
                    ("Capacitor pack", 100, 2.20),
                    ("Resistor pack", 100, 1.00),
                    ("LED array", 60, 3.00),
                ]),
            },
        },
    ]

    purchase_orders = [
        {
            "id": "po_acme_digital",
            "type": "purchase_order",
            "channel": "digital",
            "difficulty": "standard",
            "fields": {
                "order_number": "PO-100481",
                "order_date": "2026-07-02",
                "delivery_date": "2026-07-20",
                "vendor_name": "Acme Industrial",
                "buyer_name": "Globex Procurement",
                "ship_to": "12 Industrial Way, Springfield",
                "currency": "USD",
                "subtotal": 12000.00,
                "tax_amount": 450.00,
                "total": 12450.00,
                "payment_terms": "Net 30",
                "line_items": _items([
                    ("CNC spindle", 2, 5000.00),
                    ("Tool set", 4, 500.00),
                ]),
            },
        },
        {
            "id": "po_scanned",
            "type": "purchase_order",
            "channel": "scanned",
            "difficulty": "standard",
            "fields": {
                "order_number": "PO-100483",
                "order_date": "2026-04-11",
                "delivery_date": "2026-05-01",
                "vendor_name": "Initech Supplies",
                "buyer_name": "Contoso Ops",
                "ship_to": "9 Market St, Metropolis",
                "currency": "USD",
                "subtotal": 900.00,
                "tax_amount": 80.00,
                "total": 980.00,
                "payment_terms": "Net 15",
                "line_items": _items([("Printer paper (box)", 20, 45.00)]),
            },
        },
    ]

    contracts = [
        {
            "id": "contract_msa_digital",
            "type": "contract",
            "channel": "digital",
            "difficulty": "standard",
            "fields": {
                "contract_number": "MSA-2026-014",
                "title": "Master Services Agreement",
                "party_a": "Acme Industrial Supplies",
                "party_b": "Globex Corporation",
                "effective_date": "2026-01-01",
                "expiration_date": "2027-12-31",
                "contract_value": 250000.00,
                "currency": "USD",
                "governing_law": "Delaware",
                "auto_renew": True,
                "termination_notice_days": 60,
            },
        },
        {
            "id": "contract_scanned",
            "type": "contract",
            "channel": "scanned",
            "difficulty": "standard",
            "fields": {
                "contract_number": "NDA-7781",
                "title": "Mutual Non-Disclosure Agreement",
                "party_a": "Stark Components",
                "party_b": "Wayne Enterprises",
                "effective_date": "2026-03-15",
                "expiration_date": "2029-03-14",
                "contract_value": 0.0,
                "currency": "USD",
                "governing_law": "New York",
                "auto_renew": False,
                "termination_notice_days": 30,
            },
        },
    ]

    receipts = [
        {
            "id": "receipt_digital",
            "type": "receipt",
            "channel": "digital",
            "difficulty": "standard",
            "fields": {
                "merchant": "City Hardware",
                "date": "2026-06-18",
                "currency": "USD",
                "subtotal": 47.00,
                "tax_amount": 3.76,
                "total": 50.76,
                "payment_method": "Visa card ending 4242",
                "line_items": _items([
                    ("Paint 1L", 2, 18.00),
                    ("Brush set", 1, 11.00),
                ]),
            },
        },
        {
            "id": "receipt_scanned",
            "type": "receipt",
            "channel": "scanned",
            "difficulty": "standard",
            "fields": {
                "merchant": "QuickMart",
                "date": "2026-05-30",
                "currency": "USD",
                "subtotal": 23.50,
                "tax_amount": 1.88,
                "total": 25.38,
                "payment_method": "Cash",
                "line_items": _items([
                    ("Coffee", 5, 3.50),
                    ("Snacks", 2, 3.00),
                ]),
            },
        },
    ]

    subscription_memos = [
        {
            "id": "subscription_memo_pos",
            "type": "subscription_memo",
            "channel": "digital",
            "difficulty": "standard",
            "fields": {
                "memo_number": "SUB-2026-0091",
                "subscription_name": "POS Cloud Platform",
                "vendor_name": "Initech Supplies",
                "account_id": "ACC-55821",
                "plan": "Enterprise (500 lanes)",
                "billing_cycle": "annual",
                "start_date": "2025-08-01",
                "renewal_date": "2026-08-01",
                "amount": 84000.00,
                "currency": "USD",
                "auto_renew": True,
                "status": "active",
                "notes": "Price locked for 2 years; 60-day cancellation notice required.",
            },
        },
        {
            "id": "subscription_memo_scanned",
            "type": "subscription_memo",
            "channel": "scanned",
            "difficulty": "standard",
            "fields": {
                "memo_number": "SUB-2026-0145",
                "subscription_name": "Store Wi-Fi & Analytics",
                "vendor_name": "GlobalParts GmbH",
                "account_id": "ACC-77310",
                "plan": "Standard",
                "billing_cycle": "monthly",
                "start_date": "2026-01-15",
                "renewal_date": "2026-07-15",
                "amount": 2500.00,
                "currency": "EUR",
                "auto_renew": False,
                "status": "pending",
                "notes": "Evaluate before renewal; usage below tier.",
            },
        },
    ]

    complex_forms = [
        {
            "id": "complex_capex_requisition",
            "type": "complex_form",
            "channel": "digital",
            "difficulty": "complex",
            "skip_eval": True,
            "fields": {
                "form_title": "CAPITAL EXPENDITURE REQUEST & VENDOR QUOTE COMPARISON",
                "form_number": "CAPEX-2026-0457",
                "request_date": "2026-05-12",
                "department": "Manufacturing Operations",
                "cost_center": "CC-4400",
                "priority": "High",
                "requestor": {
                    "name": "Dana Whitfield",
                    "employee_id": "E-20455",
                    "email": "dana.whitfield@globex.example",
                    "manager": "Priya Anand",
                },
                "project": {
                    "name": "Line 3 Robotics Upgrade",
                    "code": "PRJ-LINE3",
                    "budget_code": "BUD-2026-CAP-11",
                    "justification": "Replace end-of-life pick-and-place cells to raise throughput 18%.",
                },
                "vendor_quotes": [
                    {
                        "vendor_name": "Acme Industrial Supplies",
                        "vendor_id": "V-ACME-001",
                        "quote_number": "Q-AC-9981",
                        "valid_until": "2026-06-30",
                        "currency": "USD",
                        "line_items": [
                            _cfi("RB-500", "6-axis robot arm", "equipment", 2, 38000.00),
                            _cfi("EOAT-12", "End-of-arm tooling", "tooling", 2, 4500.00),
                            _cfi("INST-01", "Installation and calibration", "service", 1, 12000.00),
                        ],
                        "subtotal": 97000.00,
                        "tax_rate": 0.08,
                        "tax_amount": 7760.00,
                        "shipping": 1500.00,
                        "total": 106260.00,
                        "recommended": False,
                    },
                    {
                        "vendor_name": "GlobalParts GmbH",
                        "vendor_id": "V-GLOB-014",
                        "quote_number": "Q-GP-4471",
                        "valid_until": "2026-06-15",
                        "currency": "EUR",
                        "line_items": [
                            _cfi("RB-EU", "6-axis robot arm", "equipment", 2, 34000.00),
                            _cfi("TOOL-9", "Tooling set", "tooling", 2, 5000.00),
                            _cfi("SVC-2", "Commissioning", "service", 1, 15000.00),
                        ],
                        "subtotal": 93000.00,
                        "tax_rate": 0.19,
                        "tax_amount": 17670.00,
                        "shipping": 2000.00,
                        "total": 112670.00,
                        "recommended": False,
                    },
                    {
                        "vendor_name": "Initech Supplies",
                        "vendor_id": "V-INIT-007",
                        "quote_number": "Q-IN-3320",
                        "valid_until": "2026-07-10",
                        "currency": "USD",
                        "line_items": [
                            _cfi("RB-INI", "Robot cell", "equipment", 2, 36000.00),
                            _cfi("TLG-3", "Gripper kit", "tooling", 2, 4000.00),
                            _cfi("SET-1", "Setup", "service", 1, 9000.00),
                        ],
                        "subtotal": 89000.00,
                        "tax_rate": 0.08,
                        "tax_amount": 7120.00,
                        "shipping": 1200.00,
                        "total": 97320.00,
                        "recommended": True,
                    },
                ],
                "selected_vendor": "Initech Supplies",
                "selected_total": 97320.00,
                "currency": "USD",
                "grand_total": 97320.00,
                "approvals": [
                    {"role": "Department Head", "name": "Priya Anand",
                     "decision": "Approved", "date": "2026-05-14"},
                    {"role": "Finance", "name": "Marcus Lee",
                     "decision": "Approved", "date": "2026-05-15"},
                    {"role": "Procurement", "name": "Sofia Reyes",
                     "decision": "Pending", "date": "-"},
                ],
                "compliance": {
                    "three_quotes_obtained": True,
                    "budget_approved": True,
                    "sole_source_justified": False,
                    "vendor_vetted": True,
                },
                "notes": "Delivery required before Q3 production ramp.",
            },
        },
    ]

    # Concatenate in the same order the documents have always been generated.
    return [
        *invoices,
        *purchase_orders,
        *contracts,
        *receipts,
        *subscription_memos,
        *complex_forms,
    ]
|
|
|
|
| |
|
|
def render_lines(doc: dict) -> list[str]:
    """Render a document spec into the plain-text lines drawn onto its page.

    Args:
        doc: A spec with ``type`` and ``fields``, plus an optional ``omit``
            list naming fields that must not appear in the rendered text.

    Returns:
        The document's lines from top to bottom. A ``type`` that is not
        recognised yields an empty list.

    Raises:
        KeyError: If a field that the selected layout always prints is
            missing from ``fields``.
    """
    f = doc["fields"]
    t = doc["type"]
    omit = set(doc.get("omit", []))
    lines: list[str] = []
    cur_sym = {"USD": "$", "EUR": "€", "GBP": "£"}.get(f.get("currency", "USD"), "$")

    def money(v):
        # Currency symbol, thousands separators, two decimals.
        return f"{cur_sym}{v:,.2f}"

    def item_table():
        # Header row followed by one row per line item (shared by three layouts).
        table = [_row("Description", "Qty", "Unit Price", "Amount")]
        table += [
            _row(it["description"], str(int(it["quantity"])),
                 money(it["unit_price"]), money(it["line_total"]))
            for it in f.get("line_items", [])
        ]
        return table

    if t == "invoice":
        lines += ["INVOICE", ""]
        lines.append(f"Invoice Number: {f['invoice_number']}")
        lines.append(f"Invoice Date: {f['issue_date']}")
        if "due_date" not in omit and f.get("due_date"):
            lines.append(f"Due Date: {f['due_date']}")
        lines.append(f"From: {f['vendor_name']}")
        if "bill_to_name" not in omit and f.get("bill_to_name"):
            lines.append(f"Bill To: {f['bill_to_name']}")
        lines.append(f"Currency: {f['currency']}")
        lines += ["", *item_table(), ""]
        subtotal = f["subtotal"]
        lines.append(f"Subtotal: {money(subtotal)}")
        # Derive the printed rate from the amounts instead of hard-coding 10%,
        # so the label agrees with the figures on invoices taxed at other rates.
        if subtotal:
            rate = round(f["tax_amount"] * 100 / subtotal, 2)
            tax_label = f"Tax ({rate:g}%):"
        else:
            tax_label = "Tax:"
        lines.append(f"{tax_label} {money(f['tax_amount'])}")
        if "total" not in omit and f.get("total") is not None:
            lines.append(f"Total: {money(f['total'])}")
    elif t == "purchase_order":
        lines += ["PURCHASE ORDER", ""]
        lines.append(f"Purchase Order Number: {f['order_number']}")
        lines.append(f"Order Date: {f['order_date']}")
        lines.append(f"Delivery Date: {f['delivery_date']}")
        lines.append(f"Vendor: {f['vendor_name']}")
        lines.append(f"Buyer: {f['buyer_name']}")
        lines.append(f"Ship To: {f['ship_to']}")
        lines.append(f"Payment Terms: {f['payment_terms']}")
        lines.append(f"Currency: {f['currency']}")
        lines += ["", *item_table()]
        lines += ["", f"Subtotal: {money(f['subtotal'])}", f"Tax: {money(f['tax_amount'])}",
                  f"Total: {money(f['total'])}"]
    elif t == "contract":
        lines += [f["title"].upper(), ""]
        lines.append(f"Contract Number: {f['contract_number']}")
        lines.append(f"This Agreement is entered into between {f['party_a']} and {f['party_b']}.")
        lines.append(f"Effective Date: {f['effective_date']}")
        lines.append(f"Expiration Date: {f['expiration_date']}")
        # A missing or zero contract value is left off the page entirely.
        if f.get("contract_value"):
            lines.append(f"Contract Value: {money(f['contract_value'])}")
        lines.append(f"Governing Law: {f['governing_law']}")
        lines.append("This agreement shall automatically renew."
                     if f.get("auto_renew") else "This agreement has no automatic renewal.")
        lines.append(f"Either party may terminate upon {f['termination_notice_days']} days written notice.")
    elif t == "receipt":
        lines += ["RECEIPT", ""]
        lines.append(f"Merchant: {f['merchant']}")
        lines.append(f"Date: {f['date']}")
        lines.append(f"Currency: {f['currency']}")
        lines += ["", *item_table()]
        lines += ["", f"Subtotal: {money(f['subtotal'])}", f"Tax: {money(f['tax_amount'])}",
                  f"Total: {money(f['total'])}", f"Paid by: {f['payment_method']}"]
    elif t == "subscription_memo":
        lines += ["SUBSCRIPTION MEMO", ""]
        lines.append(f"Memo Number: {f['memo_number']}")
        lines.append(f"Subscription: {f['subscription_name']}")
        lines.append(f"Vendor: {f['vendor_name']}")
        lines.append(f"Account ID: {f['account_id']}")
        lines.append(f"Plan: {f['plan']}")
        lines.append(f"Billing Cycle: {f['billing_cycle']}")
        lines.append(f"Start Date: {f['start_date']}")
        lines.append(f"Renewal Date: {f['renewal_date']}")
        # money() already carries this document's currency symbol.
        lines.append(f"Amount: {money(f['amount'])}")
        lines.append(f"Currency: {f['currency']}")
        lines.append("Auto-Renew: " + ("Yes" if f["auto_renew"] else "No"))
        lines.append(f"Status: {f['status']}")
        lines.append(f"Notes: {f['notes']}")
    elif t == "complex_form":
        lines += render_complex_form(f)
    return lines
|
|
|
|
def _cf_row(code, desc, cat, qty, price, amt):
    """Format one fixed-width row of a vendor-quote item table."""
    return f"{code:<10}{desc:<30}{cat:<14}{qty:>4} {price:>12} {amt:>12}"


def render_complex_form(f: dict) -> list[str]:
    """Render the capital-expenditure form fields into plain-text lines.

    Args:
        f: The form's ``fields`` dict: header values, ``requestor``,
            ``project``, ``vendor_quotes``, ``approvals``, ``compliance``
            and ``notes``.

    Returns:
        The form's lines from top to bottom: header, requestor, project, one
        section per vendor quote, the approvals table, the compliance
        checklist and the notes line.
    """
    # Print the title recorded in the fields so page and ground truth agree;
    # fall back to the standard heading when none is supplied.
    title = f.get("form_title") or "CAPITAL EXPENDITURE REQUEST & VENDOR QUOTE COMPARISON"
    lines = [title, ""]
    lines += [f"Form Number: {f['form_number']}", f"Request Date: {f['request_date']}",
              f"Department: {f['department']}", f"Cost Center: {f['cost_center']}",
              f"Priority: {f['priority']}", ""]
    requestor = f["requestor"]
    lines += ["REQUESTOR", f"Name: {requestor['name']}",
              f"Employee ID: {requestor['employee_id']}",
              f"Email: {requestor['email']}", f"Manager: {requestor['manager']}", ""]
    project = f["project"]
    lines += ["PROJECT", f"Name: {project['name']}", f"Code: {project['code']}",
              f"Budget Code: {project['budget_code']}",
              f"Justification: {project['justification']}", ""]
    for i, q in enumerate(f["vendor_quotes"], start=1):
        # Each quote is priced in its own currency.
        sym = {"USD": "$", "EUR": "€", "GBP": "£"}.get(q["currency"], "$")
        lines += [f"VENDOR QUOTE {i}", f"Vendor: {q['vendor_name']}",
                  f"Vendor ID: {q['vendor_id']}",
                  f"Quote Number: {q['quote_number']}", f"Valid Until: {q['valid_until']}",
                  f"Currency: {q['currency']}",
                  _cf_row("Item Code", "Description", "Category", "Qty", "Unit Price", "Amount")]
        for it in q["line_items"]:
            lines.append(_cf_row(it["item_code"], it["description"], it["category"],
                                 str(int(it["quantity"])), f"{it['unit_price']:.2f}",
                                 f"{it['line_total']:.2f}"))
        # ``:g`` rather than int(): int() truncates float products such as
        # 0.29 * 100 == 28.999999999999996 down to 28.
        tax_pct = f"{q['tax_rate'] * 100:g}"
        lines += [f"Subtotal: {sym}{q['subtotal']:.2f}",
                  f"Tax ({tax_pct}%): {sym}{q['tax_amount']:.2f}",
                  f"Shipping: {sym}{q['shipping']:.2f}", f"Total: {sym}{q['total']:.2f}", ""]
    lines += ["APPROVALS", f"{'Role':<20}{'Name':<18}{'Decision':<12}Date"]
    for a in f["approvals"]:
        lines.append(f"{a['role']:<20}{a['name']:<18}{a['decision']:<12}{a['date']}")
    lines += ["", "COMPLIANCE CHECKLIST"]
    cm = f["compliance"]
    lines += [f"[{'X' if cm['three_quotes_obtained'] else ' '}] Three competitive quotes obtained",
              f"[{'X' if cm['budget_approved'] else ' '}] Budget approved",
              f"[{'X' if cm['sole_source_justified'] else ' '}] Sole-source justification attached",
              f"[{'X' if cm['vendor_vetted'] else ' '}] Vendor vetted / on approved list", ""]
    lines += [f"Notes: {f['notes']}"]
    return lines
|
|
|
|
def _row(desc: str, qty: str, price: str, amount: str) -> str:
    """Format one fixed-width row of a line-item table.

    The description is left-aligned in 28 columns; quantity (5 wide), price
    and amount (12 wide each) are right-aligned. Over-long values are not
    truncated.
    """
    leading = f"{desc:<28}{qty:>5}"
    return " ".join([leading, f"{price:>12}", f"{amount:>12}"])
|
|
|
|
| |
|
|
def ground_truth(doc: dict) -> dict:
    """Build the ground-truth record stored next to a generated document.

    The result is a shallow copy of ``doc["fields"]`` without any field named
    in ``doc["omit"]``, extended with ``doc_type`` and a ``_meta`` block
    (document type, channel, difficulty and, when set, ``skip_eval``).
    """
    omitted = set(doc.get("omit", []))
    truth = {name: value for name, value in doc["fields"].items() if name not in omitted}
    truth["doc_type"] = doc["type"]

    meta = {
        "doc_type": doc["type"],
        "channel": doc["channel"],
        "difficulty": doc["difficulty"],
    }
    if doc.get("skip_eval"):
        meta["skip_eval"] = True
    truth["_meta"] = meta
    return truth
|
|
|
|
| |
|
|
def write_pdf(path: Path, lines: list[str]) -> bool:
    """Write ``lines`` to a letter-size PDF at ``path`` in 10pt Courier.

    A new page is started whenever the next line would fall below the bottom
    margin.

    Returns:
        ``False`` when reportlab cannot be imported (nothing is written),
        ``True`` once the PDF has been saved.
    """
    try:
        from reportlab.lib.pagesizes import letter
        from reportlab.pdfgen import canvas
    except Exception:
        return False

    font_name, font_size = "Courier", 10
    left, margin, leading = 50, 60, 15
    _, page_height = letter
    top = page_height - margin

    pdf = canvas.Canvas(str(path), pagesize=letter)
    pdf.setFont(font_name, font_size)
    cursor = top
    for text in lines:
        if cursor < margin:
            # Page is full: start a new one and re-apply the font.
            pdf.showPage()
            pdf.setFont(font_name, font_size)
            cursor = top
        pdf.drawString(left, cursor, text)
        cursor -= leading
    pdf.save()
    return True
|
|
|
|
def write_image(path: Path, lines: list[str]) -> bool:
    """Draw ``lines`` as black text on a white image and save it to ``path``.

    The image is 850 pixels wide and tall enough for every line (at least
    400 pixels), with 20 pixels between lines.

    Returns:
        ``False`` when Pillow cannot be imported (nothing is written),
        ``True`` once the image has been saved.
    """
    try:
        from PIL import Image, ImageDraw, ImageFont
    except Exception:
        return False

    width, height = 850, max(400, 60 + 20 * len(lines))
    img = Image.new("RGB", (width, height), "white")
    draw = ImageDraw.Draw(img)

    # Try a monospace TrueType font on each major platform (the original
    # macOS path first) so the fixed-width table columns stay aligned off
    # macOS too; the built-in default font is only the last resort.
    font_candidates = (
        "/System/Library/Fonts/Menlo.ttc",
        "DejaVuSansMono.ttf",
        "LiberationMono-Regular.ttf",
        "consola.ttf",
        "cour.ttf",
    )
    font = None
    for candidate in font_candidates:
        try:
            font = ImageFont.truetype(candidate, 13)
        except (OSError, ImportError):
            # Font file not found, or FreeType support is unavailable.
            continue
        break
    if font is None:
        font = ImageFont.load_default()

    y = 30
    for line in lines:
        draw.text((40, y), line, fill="black", font=font)
        y += 20
    img.save(str(path))
    return True
|
|
|
|
def main() -> None:
    """Generate every corpus document with its ground truth, then copy the showcase files.

    Digital documents become PDFs. All other channels become PNGs plus a
    ``.txt`` sidecar holding the same text. A ``.gt.json`` file is written for
    every document, and one progress line is printed per document.
    """
    for directory in (DATASET_DIR, SHOWCASE_DIR):
        directory.mkdir(parents=True, exist_ok=True)

    made = 0
    for doc in corpus():
        lines = render_lines(doc)
        truth = ground_truth(doc)
        stem = DATASET_DIR / doc["id"]
        if doc["channel"] == "digital":
            kind = "PDF"
            ok = write_pdf(stem.with_suffix(".pdf"), lines)
        else:
            kind = "PNG+OCR-sidecar"
            ok = write_image(stem.with_suffix(".png"), lines)
            # The sidecar text is written even when the image could not be rendered.
            stem.with_suffix(".txt").write_text("\n".join(lines), encoding="utf-8")
        stem.with_suffix(".gt.json").write_text(json.dumps(truth, indent=2), encoding="utf-8")
        made += 1
        warning = "" if ok else " (render lib missing!)"
        print(f" [{doc['channel']:<8}] {doc['id']:<28} {kind}{warning}")

    # Copy whichever files exist for the two hero documents into the showcase folder.
    heroes = ("invoice_acme_digital", "po_acme_digital")
    extensions = (".pdf", ".png", ".gt.json", ".txt")
    for hero in heroes:
        for ext in extensions:
            src = DATASET_DIR / f"{hero}{ext}"
            if not src.exists():
                continue
            (SHOWCASE_DIR / src.name).write_bytes(src.read_bytes())
    print(f"\nGenerated {made} documents → {DATASET_DIR}")
|
|
|
|
# Generate the corpus only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
|