ERP-DocIQ / scripts /generate_samples.py
kenmandal's picture
Deploy ERP-DocIQ: agentic OCR + IDP (MiniCPM-V 8B, Tesseract)
32b00ed verified
Raw
History Blame Contribute Delete
22.7 kB
#!/usr/bin/env python3
"""Generate a synthetic-but-realistic document corpus with paired ground truth.
Covers the axes the reference docs care about:
• type: invoice · purchase_order · contract · receipt · subscription_memo · complex_form
• channel: digital (PDF text layer) · scanned (PNG, no text layer + .txt sidecar)
• difficulty: standard · multicurrency · dense_table · missing_fields · multipage · complex
Outputs go to backend/evals/datasets/:
<id>.pdf | <id>.png the document
<id>.txt sidecar OCR text (for scanned/photo, drives the OCR fallback)
<id>.gt.json ground-truth fields + _meta
Run: python scripts/generate_samples.py (from repo root)
"""
from __future__ import annotations
import json
import sys
from pathlib import Path
# Repository root: the parent of the directory that contains this script.
REPO_ROOT = Path(__file__).resolve().parent.parent
BACKEND = REPO_ROOT / "backend"
# Put backend/ at the front of the import search path.
# NOTE(review): nothing visible in this file imports from backend/ —
# presumably kept so backend modules can be imported; confirm it is still needed.
sys.path.insert(0, str(BACKEND))
# Directory main() writes every document, OCR sidecar and ground-truth file to.
DATASET_DIR = BACKEND / "evals" / "datasets"
# Directory main() copies the two "hero" samples into.
SHOWCASE_DIR = REPO_ROOT / "samples"
# --- document specifications --------------------------------------------------
def _items(rows):
    """Expand ``(description, quantity, unit_price)`` tuples into line-item dicts.

    Every returned dict also carries ``line_total``: quantity times unit
    price, rounded to two decimals.
    """
    line_items = []
    for description, quantity, unit_price in rows:
        line_items.append({
            "description": description,
            "quantity": quantity,
            "unit_price": unit_price,
            "line_total": round(quantity * unit_price, 2),
        })
    return line_items
def corpus() -> list[dict]:
    """Return the hard-coded specifications for every synthetic document.

    Each entry is a dict with ``id``, ``type``, ``channel``, ``difficulty``
    and ``fields`` keys. Two optional keys also appear:

    * ``omit`` — field names that ``ground_truth`` drops and that the invoice
      renderer leaves off the page.
    * ``skip_eval`` — copied into the ground-truth ``_meta`` by ``ground_truth``.

    Line-item totals are computed by ``_items`` / ``_cfi``; subtotals, tax
    amounts and totals are literal values typed in below.
    """
    docs: list[dict] = []
    # ---- invoices ----
    docs.append({
        "id": "invoice_acme_digital", "type": "invoice", "channel": "digital",
        "difficulty": "standard",
        "fields": {
            "invoice_number": "INV-1001", "issue_date": "2026-07-15", "due_date": "2026-08-14",
            "vendor_name": "Acme Industrial Supplies", "bill_to_name": "Globex Corporation",
            "currency": "USD", "subtotal": 300.00, "tax_amount": 30.00, "total": 330.00,
            "line_items": _items([("Steel Bolts M8", 100, 1.20),
                                  ("Hex Nuts M8", 200, 0.40),
                                  ("Washers", 100, 1.00)]),
        },
    })
    docs.append({
        "id": "invoice_globalparts_eur", "type": "invoice", "channel": "digital",
        "difficulty": "multicurrency",
        "fields": {
            "invoice_number": "GP-2026-558", "issue_date": "2026-05-03", "due_date": "2026-06-02",
            "vendor_name": "GlobalParts GmbH", "bill_to_name": "Initech LLC",
            "currency": "EUR", "subtotal": 1840.00, "tax_amount": 349.60, "total": 2189.60,
            "line_items": _items([("Bearing assembly", 8, 180.00),
                                  ("Drive belt", 20, 20.00)]),
        },
    })
    docs.append({
        "id": "invoice_scanned_basic", "type": "invoice", "channel": "scanned",
        "difficulty": "standard",
        "fields": {
            "invoice_number": "INV-7741", "issue_date": "2026-03-22", "due_date": "2026-04-21",
            "vendor_name": "Northwind Traders", "bill_to_name": "Contoso Ltd",
            "currency": "USD", "subtotal": 540.00, "tax_amount": 43.20, "total": 583.20,
            "line_items": _items([("Office chairs", 6, 90.00)]),
        },
    })
    docs.append({
        "id": "invoice_missing_total", "type": "invoice", "channel": "digital",
        "difficulty": "missing_fields",
        "fields": {  # total (plus due_date and bill_to_name) intentionally absent → should be flagged + routed to HITL
            "invoice_number": "INV-3300", "issue_date": "2026-02-10",
            "vendor_name": "Stark Components", "currency": "USD",
            "subtotal": 1200.00, "tax_amount": 96.00,
            "line_items": _items([("Servo motor", 2, 600.00)]),
        },
        # Names listed here are never rendered and are removed from the ground truth.
        "omit": ["total", "due_date", "bill_to_name"],
    })
    docs.append({
        "id": "invoice_dense_table", "type": "invoice", "channel": "digital",
        "difficulty": "dense_table",
        "fields": {
            "invoice_number": "INV-9120", "issue_date": "2026-06-01", "due_date": "2026-07-01",
            "vendor_name": "Wayne Enterprises", "bill_to_name": "Oscorp",
            "currency": "USD", "subtotal": 2140.00, "tax_amount": 214.00, "total": 2354.00,
            "line_items": _items([
                ("Aluminium sheet 2mm", 30, 24.00), ("Copper wire 10m", 40, 8.00),
                ("Circuit board v2", 50, 12.00), ("Capacitor pack", 100, 2.20),
                ("Resistor pack", 100, 1.00), ("LED array", 60, 3.00),
            ]),
        },
    })
    # ---- purchase orders ----
    docs.append({
        "id": "po_acme_digital", "type": "purchase_order", "channel": "digital",
        "difficulty": "standard",
        "fields": {
            "order_number": "PO-100481", "order_date": "2026-07-02", "delivery_date": "2026-07-20",
            "vendor_name": "Acme Industrial", "buyer_name": "Globex Procurement",
            "ship_to": "12 Industrial Way, Springfield", "currency": "USD",
            "subtotal": 12000.00, "tax_amount": 450.00, "total": 12450.00,
            "payment_terms": "Net 30",
            "line_items": _items([("CNC spindle", 2, 5000.00), ("Tool set", 4, 500.00)]),
        },
    })
    docs.append({
        "id": "po_scanned", "type": "purchase_order", "channel": "scanned",
        "difficulty": "standard",
        "fields": {
            "order_number": "PO-100483", "order_date": "2026-04-11", "delivery_date": "2026-05-01",
            "vendor_name": "Initech Supplies", "buyer_name": "Contoso Ops",
            "ship_to": "9 Market St, Metropolis", "currency": "USD",
            "subtotal": 900.00, "tax_amount": 80.00, "total": 980.00, "payment_terms": "Net 15",
            "line_items": _items([("Printer paper (box)", 20, 45.00)]),
        },
    })
    # ---- contracts ----
    docs.append({
        "id": "contract_msa_digital", "type": "contract", "channel": "digital",
        "difficulty": "standard",
        "fields": {
            "contract_number": "MSA-2026-014", "title": "Master Services Agreement",
            "party_a": "Acme Industrial Supplies", "party_b": "Globex Corporation",
            "effective_date": "2026-01-01", "expiration_date": "2027-12-31",
            "contract_value": 250000.00, "currency": "USD",
            "governing_law": "Delaware", "auto_renew": True, "termination_notice_days": 60,
        },
    })
    docs.append({
        "id": "contract_scanned", "type": "contract", "channel": "scanned",
        "difficulty": "standard",
        "fields": {
            "contract_number": "NDA-7781", "title": "Mutual Non-Disclosure Agreement",
            "party_a": "Stark Components", "party_b": "Wayne Enterprises",
            "effective_date": "2026-03-15", "expiration_date": "2029-03-14",
            # A zero value is falsy, so render_lines prints no "Contract Value" line for it.
            "contract_value": 0.0, "currency": "USD",
            "governing_law": "New York", "auto_renew": False, "termination_notice_days": 30,
        },
    })
    # ---- receipts ----
    docs.append({
        "id": "receipt_digital", "type": "receipt", "channel": "digital",
        "difficulty": "standard",
        "fields": {
            "merchant": "City Hardware", "date": "2026-06-18", "currency": "USD",
            "subtotal": 47.00, "tax_amount": 3.76, "total": 50.76,
            "payment_method": "Visa card ending 4242",
            "line_items": _items([("Paint 1L", 2, 18.00), ("Brush set", 1, 11.00)]),
        },
    })
    docs.append({
        "id": "receipt_scanned", "type": "receipt", "channel": "scanned",
        "difficulty": "standard",
        "fields": {
            "merchant": "QuickMart", "date": "2026-05-30", "currency": "USD",
            "subtotal": 23.50, "tax_amount": 1.88, "total": 25.38,
            "payment_method": "Cash",
            "line_items": _items([("Coffee", 5, 3.50), ("Snacks", 2, 3.00)]),
        },
    })
    # ---- subscription memos ----
    docs.append({
        "id": "subscription_memo_pos", "type": "subscription_memo", "channel": "digital",
        "difficulty": "standard",
        "fields": {
            "memo_number": "SUB-2026-0091", "subscription_name": "POS Cloud Platform",
            "vendor_name": "Initech Supplies", "account_id": "ACC-55821",
            "plan": "Enterprise (500 lanes)", "billing_cycle": "annual",
            "start_date": "2025-08-01", "renewal_date": "2026-08-01",
            "amount": 84000.00, "currency": "USD", "auto_renew": True, "status": "active",
            "notes": "Price locked for 2 years; 60-day cancellation notice required.",
        },
    })
    docs.append({
        "id": "subscription_memo_scanned", "type": "subscription_memo", "channel": "scanned",
        "difficulty": "standard",
        "fields": {
            "memo_number": "SUB-2026-0145", "subscription_name": "Store Wi-Fi & Analytics",
            "vendor_name": "GlobalParts GmbH", "account_id": "ACC-77310",
            "plan": "Standard", "billing_cycle": "monthly",
            "start_date": "2026-01-15", "renewal_date": "2026-07-15",
            "amount": 2500.00, "currency": "EUR", "auto_renew": False, "status": "pending",
            "notes": "Evaluate before renewal; usage below tier.",
        },
    })

    # ---- complex multi-layer form (hybrid offline+online demo) ----
    def _cfi(code, desc, cat, qty, up):
        """Build one complex-form line item; ``line_total`` is qty * unit price, rounded to 2 dp."""
        return {"item_code": code, "description": desc, "category": cat,
                "quantity": qty, "unit_price": up, "line_total": round(qty * up, 2)}

    docs.append({
        "id": "complex_capex_requisition", "type": "complex_form", "channel": "digital",
        # skip_eval is propagated into the ground truth's _meta by ground_truth().
        "difficulty": "complex", "skip_eval": True,
        "fields": {
            "form_title": "CAPITAL EXPENDITURE REQUEST & VENDOR QUOTE COMPARISON",
            "form_number": "CAPEX-2026-0457", "request_date": "2026-05-12",
            "department": "Manufacturing Operations", "cost_center": "CC-4400", "priority": "High",
            "requestor": {"name": "Dana Whitfield", "employee_id": "E-20455",
                          "email": "dana.whitfield@globex.example", "manager": "Priya Anand"},
            "project": {"name": "Line 3 Robotics Upgrade", "code": "PRJ-LINE3",
                        "budget_code": "BUD-2026-CAP-11",
                        "justification": "Replace end-of-life pick-and-place cells to raise throughput 18%."},
            # Each quote's total is subtotal + tax_amount + shipping.
            "vendor_quotes": [
                {"vendor_name": "Acme Industrial Supplies", "vendor_id": "V-ACME-001",
                 "quote_number": "Q-AC-9981", "valid_until": "2026-06-30", "currency": "USD",
                 "line_items": [_cfi("RB-500", "6-axis robot arm", "equipment", 2, 38000.00),
                                _cfi("EOAT-12", "End-of-arm tooling", "tooling", 2, 4500.00),
                                _cfi("INST-01", "Installation and calibration", "service", 1, 12000.00)],
                 "subtotal": 97000.00, "tax_rate": 0.08, "tax_amount": 7760.00, "shipping": 1500.00,
                 "total": 106260.00, "recommended": False},
                {"vendor_name": "GlobalParts GmbH", "vendor_id": "V-GLOB-014",
                 "quote_number": "Q-GP-4471", "valid_until": "2026-06-15", "currency": "EUR",
                 "line_items": [_cfi("RB-EU", "6-axis robot arm", "equipment", 2, 34000.00),
                                _cfi("TOOL-9", "Tooling set", "tooling", 2, 5000.00),
                                _cfi("SVC-2", "Commissioning", "service", 1, 15000.00)],
                 "subtotal": 93000.00, "tax_rate": 0.19, "tax_amount": 17670.00, "shipping": 2000.00,
                 "total": 112670.00, "recommended": False},
                {"vendor_name": "Initech Supplies", "vendor_id": "V-INIT-007",
                 "quote_number": "Q-IN-3320", "valid_until": "2026-07-10", "currency": "USD",
                 "line_items": [_cfi("RB-INI", "Robot cell", "equipment", 2, 36000.00),
                                _cfi("TLG-3", "Gripper kit", "tooling", 2, 4000.00),
                                _cfi("SET-1", "Setup", "service", 1, 9000.00)],
                 "subtotal": 89000.00, "tax_rate": 0.08, "tax_amount": 7120.00, "shipping": 1200.00,
                 "total": 97320.00, "recommended": True},
            ],
            # The selected vendor/total mirror the quote flagged "recommended" above.
            "selected_vendor": "Initech Supplies", "selected_total": 97320.00,
            "currency": "USD", "grand_total": 97320.00,
            "approvals": [
                {"role": "Department Head", "name": "Priya Anand", "decision": "Approved", "date": "2026-05-14"},
                {"role": "Finance", "name": "Marcus Lee", "decision": "Approved", "date": "2026-05-15"},
                {"role": "Procurement", "name": "Sofia Reyes", "decision": "Pending", "date": "-"},
            ],
            "compliance": {"three_quotes_obtained": True, "budget_approved": True,
                           "sole_source_justified": False, "vendor_vetted": True},
            "notes": "Delivery required before Q3 production ramp.",
        },
    })
    return docs
# --- text layout rendering ----------------------------------------------------
def render_lines(doc: dict) -> list[str]:
    """Render one document spec as the list of text lines printed on the page.

    Args:
        doc: A spec with ``type`` and ``fields`` keys and an optional ``omit``
            list, in the shape produced by ``corpus()``.

    Returns:
        The lines for the document, in top-to-bottom order. A ``type`` that is
        not one of the handled kinds yields an empty list.

    Raises:
        KeyError: If a field required by the document type is missing.
    """
    f = doc["fields"]
    t = doc["type"]
    omit = set(doc.get("omit", []))
    L: list[str] = []
    # Unknown or missing currencies fall back to the dollar sign.
    cur_sym = {"USD": "$", "EUR": "€", "GBP": "£"}.get(f.get("currency", "USD"), "$")

    def money(v):
        """Format *v* with the currency symbol, thousands separators and 2 decimals."""
        return f"{cur_sym}{v:,.2f}"

    def item_table():
        """Return the line-item table: blank line, header, one row per item, blank line."""
        table = ["", _row("Description", "Qty", "Unit Price", "Amount")]
        table += [
            _row(it["description"], str(int(it["quantity"])),
                 money(it["unit_price"]), money(it["line_total"]))
            for it in f.get("line_items", [])
        ]
        table.append("")
        return table

    if t == "invoice":
        L += ["INVOICE", ""]
        L.append(f"Invoice Number: {f['invoice_number']}")
        L.append(f"Invoice Date: {f['issue_date']}")
        if "due_date" not in omit and f.get("due_date"):
            L.append(f"Due Date: {f['due_date']}")
        L.append(f"From: {f['vendor_name']}")
        if "bill_to_name" not in omit and f.get("bill_to_name"):
            L.append(f"Bill To: {f['bill_to_name']}")
        L.append(f"Currency: {f['currency']}")
        L += item_table()
        L.append(f"Subtotal: {money(f['subtotal'])}")
        # Derive the printed rate from the amounts instead of hard-coding 10%:
        # several invoices carry 8% or 19% tax, and a fixed label would make the
        # page contradict its own figures. Rounding hides float noise, and "g"
        # drops a trailing ".0" so 10% invoices still read "Tax (10%):".
        subtotal = f["subtotal"]
        if subtotal:
            rate = round(f["tax_amount"] / subtotal * 100, 2)
            tax_label = f"Tax ({rate:g}%):"
        else:
            tax_label = "Tax:"  # no meaningful rate on a zero subtotal
        L.append(f"{tax_label} {money(f['tax_amount'])}")
        if "total" not in omit and f.get("total") is not None:
            L.append(f"Total: {money(f['total'])}")
    elif t == "purchase_order":
        L += ["PURCHASE ORDER", ""]
        L.append(f"Purchase Order Number: {f['order_number']}")
        L.append(f"Order Date: {f['order_date']}")
        L.append(f"Delivery Date: {f['delivery_date']}")
        L.append(f"Vendor: {f['vendor_name']}")
        L.append(f"Buyer: {f['buyer_name']}")
        L.append(f"Ship To: {f['ship_to']}")
        L.append(f"Payment Terms: {f['payment_terms']}")
        L.append(f"Currency: {f['currency']}")
        L += item_table()
        L += [f"Subtotal: {money(f['subtotal'])}", f"Tax: {money(f['tax_amount'])}",
              f"Total: {money(f['total'])}"]
    elif t == "contract":
        L += [f["title"].upper(), ""]
        L.append(f"Contract Number: {f['contract_number']}")
        L.append(f"This Agreement is entered into between {f['party_a']} and {f['party_b']}.")
        L.append(f"Effective Date: {f['effective_date']}")
        L.append(f"Expiration Date: {f['expiration_date']}")
        # A missing or zero contract value prints no value line at all.
        if f.get("contract_value"):
            L.append(f"Contract Value: {money(f['contract_value'])}")
        L.append(f"Governing Law: {f['governing_law']}")
        L.append("This agreement shall automatically renew."
                 if f.get("auto_renew") else "This agreement has no automatic renewal.")
        L.append(f"Either party may terminate upon {f['termination_notice_days']} days written notice.")
    elif t == "receipt":
        L += ["RECEIPT", ""]
        L.append(f"Merchant: {f['merchant']}")
        L.append(f"Date: {f['date']}")
        L.append(f"Currency: {f['currency']}")
        L += item_table()
        L += [f"Subtotal: {money(f['subtotal'])}", f"Tax: {money(f['tax_amount'])}",
              f"Total: {money(f['total'])}", f"Paid by: {f['payment_method']}"]
    elif t == "subscription_memo":
        L += ["SUBSCRIPTION MEMO", ""]
        L.append(f"Memo Number: {f['memo_number']}")
        L.append(f"Subscription: {f['subscription_name']}")
        L.append(f"Vendor: {f['vendor_name']}")
        L.append(f"Account ID: {f['account_id']}")
        L.append(f"Plan: {f['plan']}")
        L.append(f"Billing Cycle: {f['billing_cycle']}")
        L.append(f"Start Date: {f['start_date']}")
        L.append(f"Renewal Date: {f['renewal_date']}")
        L.append(f"Amount: {money(f['amount'])}")
        L.append(f"Currency: {f['currency']}")
        L.append("Auto-Renew: " + ("Yes" if f["auto_renew"] else "No"))
        L.append(f"Status: {f['status']}")
        L.append(f"Notes: {f['notes']}")
    elif t == "complex_form":
        L += render_complex_form(f)
    return L
def _cf_row(code, desc, cat, qty, price, amt):
    """Lay out one row of a vendor-quote item table in fixed-width columns.

    Code, description and category are left-aligned (10/30/14 wide); quantity,
    price and amount are right-aligned (4/12/12 wide). Over-long values are
    not truncated.
    """
    return f"{code:<10}{desc:<30}{cat:<14}{qty:>4} {price:>12} {amt:>12}"


def render_complex_form(f: dict) -> list[str]:
    """Render the capital-expenditure form's fields as text lines.

    Args:
        f: The form's ``fields`` dict: header values, ``requestor`` and
            ``project`` sub-dicts, a ``vendor_quotes`` list, an ``approvals``
            list, a ``compliance`` dict and ``notes``.

    Returns:
        The lines of the form, top to bottom: header, requestor, project, one
        section per vendor quote, approvals table, compliance checklist, notes.

    Raises:
        KeyError: If a required field is missing.
    """
    symbols = {"USD": "$", "EUR": "€", "GBP": "£"}
    # Print the spec's own title so the page cannot drift from the ground
    # truth; fall back to the historical literal when none is supplied.
    title = f.get("form_title") or "CAPITAL EXPENDITURE REQUEST & VENDOR QUOTE COMPARISON"
    L = [title, ""]
    L += [f"Form Number: {f['form_number']}", f"Request Date: {f['request_date']}",
          f"Department: {f['department']}", f"Cost Center: {f['cost_center']}",
          f"Priority: {f['priority']}", ""]
    r = f["requestor"]
    L += ["REQUESTOR", f"Name: {r['name']}", f"Employee ID: {r['employee_id']}",
          f"Email: {r['email']}", f"Manager: {r['manager']}", ""]
    pr = f["project"]
    L += ["PROJECT", f"Name: {pr['name']}", f"Code: {pr['code']}",
          f"Budget Code: {pr['budget_code']}", f"Justification: {pr['justification']}", ""]
    for i, q in enumerate(f["vendor_quotes"], start=1):
        sym = symbols.get(q["currency"], "$")  # unknown currencies print as dollars
        L += [f"VENDOR QUOTE {i}", f"Vendor: {q['vendor_name']}", f"Vendor ID: {q['vendor_id']}",
              f"Quote Number: {q['quote_number']}", f"Valid Until: {q['valid_until']}",
              f"Currency: {q['currency']}",
              _cf_row("Item Code", "Description", "Category", "Qty", "Unit Price", "Amount")]
        for it in q["line_items"]:
            L.append(_cf_row(it["item_code"], it["description"], it["category"],
                             str(int(it["quantity"])), f"{it['unit_price']:.2f}",
                             f"{it['line_total']:.2f}"))
        # Round rather than truncate: 0.29 * 100 is 28.999999999999996 in binary
        # floating point, so int() would print "28%". "g" drops a trailing ".0".
        tax_pct = round(q["tax_rate"] * 100, 2)
        L += [f"Subtotal: {sym}{q['subtotal']:.2f}",
              f"Tax ({tax_pct:g}%): {sym}{q['tax_amount']:.2f}",
              f"Shipping: {sym}{q['shipping']:.2f}", f"Total: {sym}{q['total']:.2f}", ""]
    L += ["APPROVALS", f"{'Role':<20}{'Name':<18}{'Decision':<12}Date"]
    for a in f["approvals"]:
        L.append(f"{a['role']:<20}{a['name']:<18}{a['decision']:<12}{a['date']}")
    L += ["", "COMPLIANCE CHECKLIST"]
    cm = f["compliance"]
    L += [f"[{'X' if cm['three_quotes_obtained'] else ' '}] Three competitive quotes obtained",
          f"[{'X' if cm['budget_approved'] else ' '}] Budget approved",
          f"[{'X' if cm['sole_source_justified'] else ' '}] Sole-source justification attached",
          f"[{'X' if cm['vendor_vetted'] else ' '}] Vendor vetted / on approved list", ""]
    L += [f"Notes: {f['notes']}"]
    return L
def _row(desc: str, qty: str, price: str, amount: str) -> str:
    """Lay out one line-item table row in fixed-width columns.

    The description is left-aligned in 28 characters; quantity, price and
    amount are right-aligned in 5, 12 and 12 characters. Over-long values are
    not truncated.
    """
    template = "{:<28}{:>5} {:>12} {:>12}"
    return template.format(desc, qty, price, amount)
# --- ground truth -------------------------------------------------------------
def ground_truth(doc: dict) -> dict:
    """Build the ground-truth record stored next to a generated document.

    The record is a copy of the spec's fields without any names listed under
    ``omit``, followed by ``doc_type`` and a ``_meta`` dict holding the type,
    channel and difficulty. ``_meta`` gains ``skip_eval: True`` when the spec
    sets a truthy ``skip_eval``. The spec itself is left unmodified.
    """
    dropped = set(doc.get("omit", []))
    truth = {
        name: value
        for name, value in dict(doc["fields"]).items()
        if name not in dropped
    }

    meta = {
        "doc_type": doc["type"],
        "channel": doc["channel"],
        "difficulty": doc["difficulty"],
    }
    if doc.get("skip_eval"):
        meta["skip_eval"] = True

    truth["doc_type"] = doc["type"]
    truth["_meta"] = meta
    return truth
# --- renderers ----------------------------------------------------------------
def write_pdf(path: Path, lines: list[str]) -> bool:
    """Write *lines* to a letter-size PDF at *path* in 10-point Courier.

    Text starts 60 points below the top edge and advances 15 points per line;
    a new page is started once the cursor drops below 60 points.

    Returns:
        ``True`` after the PDF is saved, ``False`` if reportlab cannot be
        imported (nothing is written in that case).
    """
    try:
        from reportlab.lib.pagesizes import letter
        from reportlab.pdfgen import canvas
    except Exception:
        return False

    margin = 60
    _page_width, page_height = letter
    top_of_page = page_height - margin

    pdf = canvas.Canvas(str(path), pagesize=letter)
    pdf.setFont("Courier", 10)
    cursor = top_of_page
    for text in lines:
        if cursor < margin:
            # Out of room: start a fresh page and select the font again.
            pdf.showPage()
            pdf.setFont("Courier", 10)
            cursor = top_of_page
        pdf.drawString(50, cursor, text)
        cursor -= 15
    pdf.save()
    return True
def write_image(path: Path, lines: list[str]) -> bool:
    """Draw *lines* as black text on a white PNG to imitate a scanned page.

    The image is 850 pixels wide and tall enough for 20 pixels per line
    (minimum 400). A monospace TrueType font is picked from a short
    cross-platform list, with Pillow's built-in default font as last resort.

    Returns:
        ``True`` after the image is saved, ``False`` if Pillow cannot be
        imported (nothing is written in that case).
    """
    try:
        from PIL import Image, ImageDraw, ImageFont
    except Exception:
        return False

    # The macOS path is tried first so existing output there is unchanged; the
    # bare file names let Pillow search the platform's font directories.
    font_candidates = (
        "/System/Library/Fonts/Menlo.ttc",  # macOS
        "DejaVuSansMono.ttf",               # most Linux distributions
        "LiberationMono-Regular.ttf",       # Linux alternative
        "consola.ttf",                      # Windows (Consolas)
        "cour.ttf",                         # Windows (Courier New)
    )
    font = None
    for candidate in font_candidates:
        try:
            font = ImageFont.truetype(candidate, 13)
            break
        except Exception:
            # Best effort: any failure to load just means "try the next one".
            continue
    if font is None:
        font = ImageFont.load_default()

    W, H = 850, max(400, 60 + 20 * len(lines))
    img = Image.new("RGB", (W, H), "white")
    d = ImageDraw.Draw(img)
    y = 30
    for line in lines:
        try:
            d.text((40, y), line, fill="black", font=font)
        except UnicodeEncodeError:
            # Older Pillow default fonts are Latin-1 bitmaps and cannot draw
            # characters such as the euro sign. Substitute "?" so the image is
            # still produced; the .txt sidecar keeps the exact text.
            printable = line.encode("latin-1", "replace").decode("latin-1")
            d.text((40, y), printable, fill="black", font=font)
        y += 20
    img.save(str(path))
    return True
def main() -> None:
    """Generate every corpus document, its ground truth, and the showcase copies.

    Digital documents are written as PDFs. All other channels are written as
    PNGs together with a ``.txt`` sidecar holding the rendered text. Every
    document gets a ``.gt.json`` file, and one progress line is printed per
    document.
    """
    for out_dir in (DATASET_DIR, SHOWCASE_DIR):
        out_dir.mkdir(parents=True, exist_ok=True)

    specs = corpus()
    for spec in specs:
        page_lines = render_lines(spec)
        truth = ground_truth(spec)
        base = DATASET_DIR / spec["id"]
        if spec["channel"] != "digital":
            rendered = write_image(base.with_suffix(".png"), page_lines)
            # sidecar text = what OCR would read (drives the offline OCR fallback)
            base.with_suffix(".txt").write_text("\n".join(page_lines), encoding="utf-8")
            label = "PNG+OCR-sidecar"
        else:
            rendered = write_pdf(base.with_suffix(".pdf"), page_lines)
            label = "PDF"
        base.with_suffix(".gt.json").write_text(json.dumps(truth, indent=2), encoding="utf-8")
        # The ground truth is written even when the renderer is unavailable;
        # the progress line flags that case.
        warning = "" if rendered else " (render lib missing!)"
        print(f" [{spec['channel']:<8}] {spec['id']:<28} {label}{warning}")

    # copy two hero samples to samples/ for the README/demo
    hero_files = [
        DATASET_DIR / f"{hero}{ext}"
        for hero in ("invoice_acme_digital", "po_acme_digital")
        for ext in (".pdf", ".png", ".gt.json", ".txt")
    ]
    for src in hero_files:
        if not src.exists():
            continue  # e.g. no .png/.txt for a digital document
        (SHOWCASE_DIR / src.name).write_bytes(src.read_bytes())

    print(f"\nGenerated {len(specs)} documents → {DATASET_DIR}")


if __name__ == "__main__":
    main()