Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python3 | |
| """Generate a synthetic-but-realistic document corpus with paired ground truth. | |
| Covers the axes the reference docs care about: | |
| • type: invoice · purchase_order · contract · receipt · subscription_memo · complex_form | |
| • channel: digital (PDF text layer) · scanned (PNG, no text layer + .txt sidecar) | |
| • difficulty: standard · multicurrency · dense_table · missing_fields · multipage · complex | |
| Outputs go to backend/evals/datasets/: | |
| <id>.pdf | <id>.png the document | |
| <id>.txt sidecar OCR text (for scanned/photo, drives the OCR fallback) | |
| <id>.gt.json ground-truth fields + _meta | |
| Run: python scripts/generate_samples.py (from repo root) | |
| """ | |
| from __future__ import annotations | |
| import json | |
| import sys | |
| from pathlib import Path | |
# Repository layout, resolved from this script's own location: the repo root
# is the directory two levels above this file.
REPO_ROOT = Path(__file__).resolve().parent.parent
BACKEND = REPO_ROOT.joinpath("backend")
# Put backend/ at the front of the module search path.
sys.path.insert(0, str(BACKEND))
# Where the generated documents, sidecars and ground truth are written.
DATASET_DIR = BACKEND.joinpath("evals", "datasets")
# Where the hero samples are copied for the README/demo.
SHOWCASE_DIR = REPO_ROOT.joinpath("samples")
| # --- document specifications -------------------------------------------------- | |
def _items(rows):
    """Expand ``(description, quantity, unit_price)`` tuples into line-item dicts.

    Every resulting dict also carries ``line_total``: quantity times unit
    price, rounded to two decimals.
    """
    line_items = []
    for description, quantity, unit_price in rows:
        line_items.append({
            "description": description,
            "quantity": quantity,
            "unit_price": unit_price,
            "line_total": round(quantity * unit_price, 2),
        })
    return line_items
def corpus() -> list[dict]:
    """Return the specification of every synthetic document in the corpus.

    Each spec is a dict with ``id``, ``type``, ``channel``, ``difficulty`` and
    a ``fields`` mapping holding the values the document states. A spec may
    additionally carry ``omit`` (names of fields deliberately left out) and
    ``skip_eval``.
    """

    def _quote_line(code, desc, cat, qty, up):
        # One line item of a vendor quote on the complex form; the line total
        # is derived from quantity and unit price.
        return {
            "item_code": code,
            "description": desc,
            "category": cat,
            "quantity": qty,
            "unit_price": up,
            "line_total": round(qty * up, 2),
        }

    # ---- invoices ----
    invoices = [
        {
            "id": "invoice_acme_digital",
            "type": "invoice",
            "channel": "digital",
            "difficulty": "standard",
            "fields": {
                "invoice_number": "INV-1001",
                "issue_date": "2026-07-15",
                "due_date": "2026-08-14",
                "vendor_name": "Acme Industrial Supplies",
                "bill_to_name": "Globex Corporation",
                "currency": "USD",
                "subtotal": 300.00,
                "tax_amount": 30.00,
                "total": 330.00,
                "line_items": _items([
                    ("Steel Bolts M8", 100, 1.20),
                    ("Hex Nuts M8", 200, 0.40),
                    ("Washers", 100, 1.00),
                ]),
            },
        },
        {
            "id": "invoice_globalparts_eur",
            "type": "invoice",
            "channel": "digital",
            "difficulty": "multicurrency",
            "fields": {
                "invoice_number": "GP-2026-558",
                "issue_date": "2026-05-03",
                "due_date": "2026-06-02",
                "vendor_name": "GlobalParts GmbH",
                "bill_to_name": "Initech LLC",
                "currency": "EUR",
                "subtotal": 1840.00,
                "tax_amount": 349.60,
                "total": 2189.60,
                "line_items": _items([
                    ("Bearing assembly", 8, 180.00),
                    ("Drive belt", 20, 20.00),
                ]),
            },
        },
        {
            "id": "invoice_scanned_basic",
            "type": "invoice",
            "channel": "scanned",
            "difficulty": "standard",
            "fields": {
                "invoice_number": "INV-7741",
                "issue_date": "2026-03-22",
                "due_date": "2026-04-21",
                "vendor_name": "Northwind Traders",
                "bill_to_name": "Contoso Ltd",
                "currency": "USD",
                "subtotal": 540.00,
                "tax_amount": 43.20,
                "total": 583.20,
                "line_items": _items([("Office chairs", 6, 90.00)]),
            },
        },
        {
            "id": "invoice_missing_total",
            "type": "invoice",
            "channel": "digital",
            "difficulty": "missing_fields",
            # total intentionally absent → should be flagged + routed to HITL
            "fields": {
                "invoice_number": "INV-3300",
                "issue_date": "2026-02-10",
                "vendor_name": "Stark Components",
                "currency": "USD",
                "subtotal": 1200.00,
                "tax_amount": 96.00,
                "line_items": _items([("Servo motor", 2, 600.00)]),
            },
            "omit": ["total", "due_date", "bill_to_name"],
        },
        {
            "id": "invoice_dense_table",
            "type": "invoice",
            "channel": "digital",
            "difficulty": "dense_table",
            "fields": {
                "invoice_number": "INV-9120",
                "issue_date": "2026-06-01",
                "due_date": "2026-07-01",
                "vendor_name": "Wayne Enterprises",
                "bill_to_name": "Oscorp",
                "currency": "USD",
                "subtotal": 2140.00,
                "tax_amount": 214.00,
                "total": 2354.00,
                "line_items": _items([
                    ("Aluminium sheet 2mm", 30, 24.00),
                    ("Copper wire 10m", 40, 8.00),
                    ("Circuit board v2", 50, 12.00),
                    ("Capacitor pack", 100, 2.20),
                    ("Resistor pack", 100, 1.00),
                    ("LED array", 60, 3.00),
                ]),
            },
        },
    ]

    # ---- purchase orders ----
    purchase_orders = [
        {
            "id": "po_acme_digital",
            "type": "purchase_order",
            "channel": "digital",
            "difficulty": "standard",
            "fields": {
                "order_number": "PO-100481",
                "order_date": "2026-07-02",
                "delivery_date": "2026-07-20",
                "vendor_name": "Acme Industrial",
                "buyer_name": "Globex Procurement",
                "ship_to": "12 Industrial Way, Springfield",
                "currency": "USD",
                "subtotal": 12000.00,
                "tax_amount": 450.00,
                "total": 12450.00,
                "payment_terms": "Net 30",
                "line_items": _items([
                    ("CNC spindle", 2, 5000.00),
                    ("Tool set", 4, 500.00),
                ]),
            },
        },
        {
            "id": "po_scanned",
            "type": "purchase_order",
            "channel": "scanned",
            "difficulty": "standard",
            "fields": {
                "order_number": "PO-100483",
                "order_date": "2026-04-11",
                "delivery_date": "2026-05-01",
                "vendor_name": "Initech Supplies",
                "buyer_name": "Contoso Ops",
                "ship_to": "9 Market St, Metropolis",
                "currency": "USD",
                "subtotal": 900.00,
                "tax_amount": 80.00,
                "total": 980.00,
                "payment_terms": "Net 15",
                "line_items": _items([("Printer paper (box)", 20, 45.00)]),
            },
        },
    ]

    # ---- contracts ----
    contracts = [
        {
            "id": "contract_msa_digital",
            "type": "contract",
            "channel": "digital",
            "difficulty": "standard",
            "fields": {
                "contract_number": "MSA-2026-014",
                "title": "Master Services Agreement",
                "party_a": "Acme Industrial Supplies",
                "party_b": "Globex Corporation",
                "effective_date": "2026-01-01",
                "expiration_date": "2027-12-31",
                "contract_value": 250000.00,
                "currency": "USD",
                "governing_law": "Delaware",
                "auto_renew": True,
                "termination_notice_days": 60,
            },
        },
        {
            "id": "contract_scanned",
            "type": "contract",
            "channel": "scanned",
            "difficulty": "standard",
            "fields": {
                "contract_number": "NDA-7781",
                "title": "Mutual Non-Disclosure Agreement",
                "party_a": "Stark Components",
                "party_b": "Wayne Enterprises",
                "effective_date": "2026-03-15",
                "expiration_date": "2029-03-14",
                "contract_value": 0.0,
                "currency": "USD",
                "governing_law": "New York",
                "auto_renew": False,
                "termination_notice_days": 30,
            },
        },
    ]

    # ---- receipts ----
    receipts = [
        {
            "id": "receipt_digital",
            "type": "receipt",
            "channel": "digital",
            "difficulty": "standard",
            "fields": {
                "merchant": "City Hardware",
                "date": "2026-06-18",
                "currency": "USD",
                "subtotal": 47.00,
                "tax_amount": 3.76,
                "total": 50.76,
                "payment_method": "Visa card ending 4242",
                "line_items": _items([
                    ("Paint 1L", 2, 18.00),
                    ("Brush set", 1, 11.00),
                ]),
            },
        },
        {
            "id": "receipt_scanned",
            "type": "receipt",
            "channel": "scanned",
            "difficulty": "standard",
            "fields": {
                "merchant": "QuickMart",
                "date": "2026-05-30",
                "currency": "USD",
                "subtotal": 23.50,
                "tax_amount": 1.88,
                "total": 25.38,
                "payment_method": "Cash",
                "line_items": _items([
                    ("Coffee", 5, 3.50),
                    ("Snacks", 2, 3.00),
                ]),
            },
        },
    ]

    # ---- subscription memos ----
    subscription_memos = [
        {
            "id": "subscription_memo_pos",
            "type": "subscription_memo",
            "channel": "digital",
            "difficulty": "standard",
            "fields": {
                "memo_number": "SUB-2026-0091",
                "subscription_name": "POS Cloud Platform",
                "vendor_name": "Initech Supplies",
                "account_id": "ACC-55821",
                "plan": "Enterprise (500 lanes)",
                "billing_cycle": "annual",
                "start_date": "2025-08-01",
                "renewal_date": "2026-08-01",
                "amount": 84000.00,
                "currency": "USD",
                "auto_renew": True,
                "status": "active",
                "notes": "Price locked for 2 years; 60-day cancellation notice required.",
            },
        },
        {
            "id": "subscription_memo_scanned",
            "type": "subscription_memo",
            "channel": "scanned",
            "difficulty": "standard",
            "fields": {
                "memo_number": "SUB-2026-0145",
                "subscription_name": "Store Wi-Fi & Analytics",
                "vendor_name": "GlobalParts GmbH",
                "account_id": "ACC-77310",
                "plan": "Standard",
                "billing_cycle": "monthly",
                "start_date": "2026-01-15",
                "renewal_date": "2026-07-15",
                "amount": 2500.00,
                "currency": "EUR",
                "auto_renew": False,
                "status": "pending",
                "notes": "Evaluate before renewal; usage below tier.",
            },
        },
    ]

    # ---- complex multi-layer form (hybrid offline+online demo) ----
    complex_forms = [
        {
            "id": "complex_capex_requisition",
            "type": "complex_form",
            "channel": "digital",
            "difficulty": "complex",
            "skip_eval": True,
            "fields": {
                "form_title": "CAPITAL EXPENDITURE REQUEST & VENDOR QUOTE COMPARISON",
                "form_number": "CAPEX-2026-0457",
                "request_date": "2026-05-12",
                "department": "Manufacturing Operations",
                "cost_center": "CC-4400",
                "priority": "High",
                "requestor": {
                    "name": "Dana Whitfield",
                    "employee_id": "E-20455",
                    "email": "dana.whitfield@globex.example",
                    "manager": "Priya Anand",
                },
                "project": {
                    "name": "Line 3 Robotics Upgrade",
                    "code": "PRJ-LINE3",
                    "budget_code": "BUD-2026-CAP-11",
                    "justification": "Replace end-of-life pick-and-place cells to raise throughput 18%.",
                },
                "vendor_quotes": [
                    {
                        "vendor_name": "Acme Industrial Supplies",
                        "vendor_id": "V-ACME-001",
                        "quote_number": "Q-AC-9981",
                        "valid_until": "2026-06-30",
                        "currency": "USD",
                        "line_items": [
                            _quote_line("RB-500", "6-axis robot arm", "equipment", 2, 38000.00),
                            _quote_line("EOAT-12", "End-of-arm tooling", "tooling", 2, 4500.00),
                            _quote_line("INST-01", "Installation and calibration", "service", 1, 12000.00),
                        ],
                        "subtotal": 97000.00,
                        "tax_rate": 0.08,
                        "tax_amount": 7760.00,
                        "shipping": 1500.00,
                        "total": 106260.00,
                        "recommended": False,
                    },
                    {
                        "vendor_name": "GlobalParts GmbH",
                        "vendor_id": "V-GLOB-014",
                        "quote_number": "Q-GP-4471",
                        "valid_until": "2026-06-15",
                        "currency": "EUR",
                        "line_items": [
                            _quote_line("RB-EU", "6-axis robot arm", "equipment", 2, 34000.00),
                            _quote_line("TOOL-9", "Tooling set", "tooling", 2, 5000.00),
                            _quote_line("SVC-2", "Commissioning", "service", 1, 15000.00),
                        ],
                        "subtotal": 93000.00,
                        "tax_rate": 0.19,
                        "tax_amount": 17670.00,
                        "shipping": 2000.00,
                        "total": 112670.00,
                        "recommended": False,
                    },
                    {
                        "vendor_name": "Initech Supplies",
                        "vendor_id": "V-INIT-007",
                        "quote_number": "Q-IN-3320",
                        "valid_until": "2026-07-10",
                        "currency": "USD",
                        "line_items": [
                            _quote_line("RB-INI", "Robot cell", "equipment", 2, 36000.00),
                            _quote_line("TLG-3", "Gripper kit", "tooling", 2, 4000.00),
                            _quote_line("SET-1", "Setup", "service", 1, 9000.00),
                        ],
                        "subtotal": 89000.00,
                        "tax_rate": 0.08,
                        "tax_amount": 7120.00,
                        "shipping": 1200.00,
                        "total": 97320.00,
                        "recommended": True,
                    },
                ],
                "selected_vendor": "Initech Supplies",
                "selected_total": 97320.00,
                "currency": "USD",
                "grand_total": 97320.00,
                "approvals": [
                    {"role": "Department Head", "name": "Priya Anand",
                     "decision": "Approved", "date": "2026-05-14"},
                    {"role": "Finance", "name": "Marcus Lee",
                     "decision": "Approved", "date": "2026-05-15"},
                    {"role": "Procurement", "name": "Sofia Reyes",
                     "decision": "Pending", "date": "-"},
                ],
                "compliance": {
                    "three_quotes_obtained": True,
                    "budget_approved": True,
                    "sole_source_justified": False,
                    "vendor_vetted": True,
                },
                "notes": "Delivery required before Q3 production ramp.",
            },
        },
    ]

    return (
        invoices
        + purchase_orders
        + contracts
        + receipts
        + subscription_memos
        + complex_forms
    )
| # --- text layout rendering ---------------------------------------------------- | |
def render_lines(doc: dict) -> list[str]:
    """Lay out a document spec as the plain-text lines printed on the document.

    Args:
        doc: A spec with ``type`` and ``fields`` keys plus an optional ``omit``
            list. Only the invoice layout consults ``omit`` (for ``due_date``,
            ``bill_to_name`` and ``total``).

    Returns:
        The lines in print order. A ``type`` without a layout yields an empty
        list.
    """
    f = doc["fields"]
    t = doc["type"]
    omit = set(doc.get("omit", []))
    L: list[str] = []
    # Unknown or missing currency codes fall back to the dollar sign.
    cur_sym = {"USD": "$", "EUR": "€", "GBP": "£"}.get(f.get("currency", "USD"), "$")

    def money(v):
        # Currency symbol, thousands separators, two decimals.
        return f"{cur_sym}{v:,.2f}"

    def item_table():
        # Header row followed by one row per line item (shared by the
        # invoice, purchase-order and receipt layouts).
        rows = [_row("Description", "Qty", "Unit Price", "Amount")]
        for it in f.get("line_items", []):
            rows.append(_row(it["description"], str(int(it["quantity"])),
                             money(it["unit_price"]), money(it["line_total"])))
        return rows

    if t == "invoice":
        L += ["INVOICE", ""]
        L.append(f"Invoice Number: {f['invoice_number']}")
        L.append(f"Invoice Date: {f['issue_date']}")
        if "due_date" not in omit and f.get("due_date"):
            L.append(f"Due Date: {f['due_date']}")
        L.append(f"From: {f['vendor_name']}")
        if "bill_to_name" not in omit and f.get("bill_to_name"):
            L.append(f"Bill To: {f['bill_to_name']}")
        L.append(f"Currency: {f['currency']}")
        L += ["", *item_table(), ""]
        L.append(f"Subtotal: {money(f['subtotal'])}")
        # Derive the percentage from the amounts so the printed rate always
        # agrees with the figures; a fixed "10%" would contradict invoices
        # taxed at another rate.
        if f["subtotal"]:
            rate = round(f["tax_amount"] / f["subtotal"] * 100, 2)
            tax_label = f"Tax ({rate:g}%)"
        else:
            tax_label = "Tax"
        L.append(f"{tax_label}: {money(f['tax_amount'])}")
        if "total" not in omit and f.get("total") is not None:
            L.append(f"Total: {money(f['total'])}")
    elif t == "purchase_order":
        L += ["PURCHASE ORDER", ""]
        L.append(f"Purchase Order Number: {f['order_number']}")
        L.append(f"Order Date: {f['order_date']}")
        L.append(f"Delivery Date: {f['delivery_date']}")
        L.append(f"Vendor: {f['vendor_name']}")
        L.append(f"Buyer: {f['buyer_name']}")
        L.append(f"Ship To: {f['ship_to']}")
        L.append(f"Payment Terms: {f['payment_terms']}")
        L.append(f"Currency: {f['currency']}")
        L += ["", *item_table()]
        L += ["", f"Subtotal: {money(f['subtotal'])}", f"Tax: {money(f['tax_amount'])}",
              f"Total: {money(f['total'])}"]
    elif t == "contract":
        L += [f["title"].upper(), ""]
        L.append(f"Contract Number: {f['contract_number']}")
        L.append(f"This Agreement is entered into between {f['party_a']} and {f['party_b']}.")
        L.append(f"Effective Date: {f['effective_date']}")
        L.append(f"Expiration Date: {f['expiration_date']}")
        # A zero or missing contract value prints no value line.
        if f.get("contract_value"):
            L.append(f"Contract Value: {money(f['contract_value'])}")
        L.append(f"Governing Law: {f['governing_law']}")
        L.append("This agreement shall automatically renew."
                 if f.get("auto_renew") else "This agreement has no automatic renewal.")
        L.append(f"Either party may terminate upon {f['termination_notice_days']} days written notice.")
    elif t == "receipt":
        L += ["RECEIPT", ""]
        L.append(f"Merchant: {f['merchant']}")
        L.append(f"Date: {f['date']}")
        L.append(f"Currency: {f['currency']}")
        L += ["", *item_table()]
        L += ["", f"Subtotal: {money(f['subtotal'])}", f"Tax: {money(f['tax_amount'])}",
              f"Total: {money(f['total'])}", f"Paid by: {f['payment_method']}"]
    elif t == "subscription_memo":
        L += ["SUBSCRIPTION MEMO", ""]
        L.append(f"Memo Number: {f['memo_number']}")
        L.append(f"Subscription: {f['subscription_name']}")
        L.append(f"Vendor: {f['vendor_name']}")
        L.append(f"Account ID: {f['account_id']}")
        L.append(f"Plan: {f['plan']}")
        L.append(f"Billing Cycle: {f['billing_cycle']}")
        L.append(f"Start Date: {f['start_date']}")
        L.append(f"Renewal Date: {f['renewal_date']}")
        L.append(f"Amount: {money(f['amount'])}")
        L.append(f"Currency: {f['currency']}")
        L.append("Auto-Renew: " + ("Yes" if f["auto_renew"] else "No"))
        L.append(f"Status: {f['status']}")
        L.append(f"Notes: {f['notes']}")
    elif t == "complex_form":
        L += render_complex_form(f)
    return L
def _cf_row(code, desc, cat, qty, price, amt):
    """Format one row of a vendor-quote table on the complex form.

    Code, description and category are left-aligned in 10/30/14-character
    columns; quantity, price and amount are right-aligned in 4/12/12-character
    columns, with a single space before each of the last two.
    """
    cells = (
        format(code, "<10"),
        format(desc, "<30"),
        format(cat, "<14"),
        format(qty, ">4"),
        " ",
        format(price, ">12"),
        " ",
        format(amt, ">12"),
    )
    return "".join(cells)
def render_complex_form(f: dict) -> list[str]:
    """Lay out the capital-expenditure form as plain-text lines.

    Sections appear in this order: header fields, requestor, project, one
    block per vendor quote (item table plus totals), approvals table,
    compliance checklist, notes.

    Args:
        f: The form's ``fields`` mapping. ``form_title`` is optional; the
            standard title is printed when it is missing or empty.

    Returns:
        The lines in print order.
    """
    # Print the title carried by the spec so the document and its ground
    # truth cannot drift apart.
    title = f.get("form_title") or "CAPITAL EXPENDITURE REQUEST & VENDOR QUOTE COMPARISON"
    L = [title, ""]
    L += [f"Form Number: {f['form_number']}", f"Request Date: {f['request_date']}",
          f"Department: {f['department']}", f"Cost Center: {f['cost_center']}",
          f"Priority: {f['priority']}", ""]
    r = f["requestor"]
    L += ["REQUESTOR", f"Name: {r['name']}", f"Employee ID: {r['employee_id']}",
          f"Email: {r['email']}", f"Manager: {r['manager']}", ""]
    pr = f["project"]
    L += ["PROJECT", f"Name: {pr['name']}", f"Code: {pr['code']}",
          f"Budget Code: {pr['budget_code']}", f"Justification: {pr['justification']}", ""]
    for i, q in enumerate(f["vendor_quotes"], start=1):
        # Each quote prints in its own currency; unknown codes fall back to "$".
        sym = {"USD": "$", "EUR": "€", "GBP": "£"}.get(q["currency"], "$")
        L += [f"VENDOR QUOTE {i}", f"Vendor: {q['vendor_name']}", f"Vendor ID: {q['vendor_id']}",
              f"Quote Number: {q['quote_number']}", f"Valid Until: {q['valid_until']}",
              f"Currency: {q['currency']}",
              _cf_row("Item Code", "Description", "Category", "Qty", "Unit Price", "Amount")]
        for it in q["line_items"]:
            L.append(_cf_row(it["item_code"], it["description"], it["category"],
                             str(int(it["quantity"])), f"{it['unit_price']:.2f}",
                             f"{it['line_total']:.2f}"))
        # Round instead of truncating: 0.29 * 100 is 28.999... in binary
        # floating point, which int() would print as 28%.
        tax_pct = round(q["tax_rate"] * 100, 2)
        L += [f"Subtotal: {sym}{q['subtotal']:.2f}",
              f"Tax ({tax_pct:g}%): {sym}{q['tax_amount']:.2f}",
              f"Shipping: {sym}{q['shipping']:.2f}", f"Total: {sym}{q['total']:.2f}", ""]
    L += ["APPROVALS", f"{'Role':<20}{'Name':<18}{'Decision':<12}Date"]
    for a in f["approvals"]:
        L.append(f"{a['role']:<20}{a['name']:<18}{a['decision']:<12}{a['date']}")
    L += ["", "COMPLIANCE CHECKLIST"]
    cm = f["compliance"]
    L += [f"[{'X' if cm['three_quotes_obtained'] else ' '}] Three competitive quotes obtained",
          f"[{'X' if cm['budget_approved'] else ' '}] Budget approved",
          f"[{'X' if cm['sole_source_justified'] else ' '}] Sole-source justification attached",
          f"[{'X' if cm['vendor_vetted'] else ' '}] Vendor vetted / on approved list", ""]
    L += [f"Notes: {f['notes']}"]
    return L
def _row(desc: str, qty: str, price: str, amount: str) -> str:
    """Format one row of a line-item table.

    The description is left-aligned in 28 characters; quantity, price and
    amount are right-aligned in 5/12/12 characters, with a single space before
    each of the last two. Over-long values are not truncated.
    """
    cells = (
        format(desc, "<28"),
        format(qty, ">5"),
        " ",
        format(price, ">12"),
        " ",
        format(amount, ">12"),
    )
    return "".join(cells)
| # --- ground truth ------------------------------------------------------------- | |
def ground_truth(doc: dict) -> dict:
    """Build the ground-truth record for a document spec.

    The record is a shallow copy of ``doc["fields"]`` without the names listed
    in ``doc["omit"]``, plus ``doc_type`` and a ``_meta`` entry holding the
    type, channel and difficulty. ``_meta["skip_eval"]`` is set only when the
    spec carries a truthy ``skip_eval``. The spec itself is left unmodified.
    """
    skipped = set(doc.get("omit", []))
    truth = {name: value for name, value in doc["fields"].items() if name not in skipped}

    truth["doc_type"] = doc["type"]
    meta = {
        "doc_type": doc["type"],
        "channel": doc["channel"],
        "difficulty": doc["difficulty"],
    }
    truth["_meta"] = meta
    if doc.get("skip_eval"):
        meta["skip_eval"] = True
    return truth
| # --- renderers ---------------------------------------------------------------- | |
def write_pdf(path: Path, lines: list[str]) -> bool:
    """Write *lines* as 10pt Courier text into a letter-size PDF at *path*.

    Text starts 60pt below the top edge and advances 15pt per line; once the
    cursor drops below 60pt from the bottom, a new page is started.

    Returns:
        False (writing nothing) when reportlab cannot be imported, True once
        the PDF has been saved.
    """
    try:
        from reportlab.lib.pagesizes import letter
        from reportlab.pdfgen import canvas
    except Exception:
        return False

    left_edge, margin, leading = 50, 60, 15
    font_name, font_size = "Courier", 10
    _page_width, page_height = letter
    first_baseline = page_height - margin

    pdf = canvas.Canvas(str(path), pagesize=letter)
    pdf.setFont(font_name, font_size)
    cursor = first_baseline
    for text in lines:
        if cursor < margin:
            # Out of room: start a new page and set the font again for it.
            pdf.showPage()
            pdf.setFont(font_name, font_size)
            cursor = first_baseline
        pdf.drawString(left_edge, cursor, text)
        cursor -= leading
    pdf.save()
    return True
def write_image(path: Path, lines: list[str]) -> bool:
    """Draw *lines* as black text on a white image and save it to *path*.

    The image is 850px wide and at least 400px tall, growing by 20px per line.
    Several common monospace fonts are tried so the table columns stay aligned
    on macOS, Linux and Windows; Pillow's built-in default font is the last
    resort.

    Returns:
        False (writing nothing) when Pillow cannot be imported, True once the
        image has been saved.
    """
    try:
        from PIL import Image, ImageDraw, ImageFont
    except Exception:
        return False
    W, H = 850, max(400, 60 + 20 * len(lines))
    img = Image.new("RGB", (W, H), "white")
    d = ImageDraw.Draw(img)
    # The macOS path stays first so output is unchanged where it already
    # worked. Bare file names are looked up by Pillow in the platform's font
    # directories.
    font_candidates = (
        "/System/Library/Fonts/Menlo.ttc",  # macOS
        "DejaVuSansMono.ttf",               # common on Linux
        "LiberationMono-Regular.ttf",       # common on Linux
        "consola.ttf",                      # Windows (Consolas)
        "cour.ttf",                         # Windows (Courier New)
    )
    font = None
    for candidate in font_candidates:
        try:
            font = ImageFont.truetype(candidate, 13)
        except Exception:
            # Best effort: any load failure just moves on to the next font.
            continue
        break
    if font is None:
        font = ImageFont.load_default()
    y = 30
    for line in lines:
        d.text((40, y), line, fill="black", font=font)
        y += 20
    img.save(str(path))
    return True
def main() -> None:
    """Generate the whole corpus and copy the hero samples for the demo.

    For every spec from ``corpus()`` this writes the rendered document (PDF
    for the digital channel, PNG plus a ``.txt`` sidecar otherwise) and a
    ``.gt.json`` ground-truth file into ``DATASET_DIR``. The sidecar and the
    ground truth are written even if the renderer reports failure. The
    summary counts only documents that were actually rendered; failures are
    listed on stderr.
    """
    DATASET_DIR.mkdir(parents=True, exist_ok=True)
    SHOWCASE_DIR.mkdir(parents=True, exist_ok=True)
    docs = corpus()
    made = 0
    failed: list[str] = []
    for doc in docs:
        lines = render_lines(doc)
        gt = ground_truth(doc)
        stem = DATASET_DIR / doc["id"]
        if doc["channel"] == "digital":
            ok = write_pdf(stem.with_suffix(".pdf"), lines)
            kind = "PDF"
        else:
            ok = write_image(stem.with_suffix(".png"), lines)
            # sidecar text = what OCR would read (drives the offline OCR fallback)
            stem.with_suffix(".txt").write_text("\n".join(lines), encoding="utf-8")
            kind = "PNG+OCR-sidecar"
        stem.with_suffix(".gt.json").write_text(json.dumps(gt, indent=2), encoding="utf-8")
        # Only count documents whose file was really produced, so the summary
        # does not claim output that is missing from disk.
        if ok:
            made += 1
        else:
            failed.append(doc["id"])
        print(f" [{doc['channel']:<8}] {doc['id']:<28} {kind}{'' if ok else ' (render lib missing!)'}")
    # copy two hero samples to samples/ for the README/demo
    for hero in ("invoice_acme_digital", "po_acme_digital"):
        for ext in (".pdf", ".png", ".gt.json", ".txt"):
            src = DATASET_DIR / f"{hero}{ext}"
            if src.exists():
                (SHOWCASE_DIR / src.name).write_bytes(src.read_bytes())
    print(f"\nGenerated {made} documents → {DATASET_DIR}")
    if failed:
        print(
            f"WARNING: {len(failed)} document(s) not rendered (render lib missing): "
            + ", ".join(failed),
            file=sys.stderr,
        )


# Run the generator only when this file is executed as a script.
if __name__ == "__main__":
    main()