| from __future__ import annotations |
|
|
| import argparse |
| import json |
| import random |
| from datetime import date, timedelta |
| from pathlib import Path |
| from typing import Any |
|
|
|
|
# Synthetic supplier catalogue.
#
# Each entry carries:
#   "name"          - supplier name printed as the first line of the receipt,
#   "document_type" - layout key that _render_input() dispatches on,
#   "products"      - (raw product label, unit cost) tuples sampled by
#                     _sample_items().
SUPPLIERS: list[dict[str, Any]] = [
    # Rendered with the handwritten bill layout.
    {
        "name": "Mahalakshmi Marketing",
        "document_type": "handwritten supplier bill",
        "products": [
            ("Parle bulk", 2450.0),
            ("Bingo(C)", 870.0),
            ("Lays Classic", 480.0),
            ("Parle-G", 50.0),
            ("Happy Happy", 960.0),
            ("Kurkure", 540.0),
        ],
    },
    # Rendered with the printed tax-invoice layout (GST lines included).
    {
        "name": "Sri Venkateshwara Marketing",
        "document_type": "printed tax invoice",
        "products": [
            ("PARLE-G 60GM RS.72P", 8.625),
            ("HAPPY HAPPY 27.5G(24P)*13", 4.464),
            ("LAYS CLASSIC 52G", 12.5),
            ("BINGO MAD ANGLES", 9.75),
            ("KURKURE PUFFCORN 52G", 12.0),
            ("BINGO TEDHE MEDHE 16G(20P)", 6.9),
        ],
    },
    # Rendered with the tally-note layout (no invoice number in the label).
    {
        "name": "Brundavan Buns",
        "document_type": "handwritten tally note",
        "products": [
            ("Bun", 10.0),
            ("OBM", 9.5),
            ("pav", 8.0),
            ("brd", 32.0),
            ("cake slc", 45.0),
            ("Milk bread", 35.0),
        ],
    },
    # Rendered with the tabular invoice layout (discount applied).
    {
        "name": "Vikram Agencies",
        "document_type": "tabular invoice",
        "products": [
            ("Parle-G 250g", 180.0),
            ("Bourbon Biscuit", 105.0),
            ("Monaco Salted", 220.0),
            ("Krack Jack", 150.0),
            ("Hide & Seek", 195.0),
            ("Sunfeast Dark Fantasy", 230.0),
        ],
    },
    # Rendered with the retail purchase-note layout (discount applied).
    {
        "name": "Krishna General Stores",
        "document_type": "retail purchase note",
        "products": [
            ("Parle Monaco 200g", 75.0),
            ("Krack Jack Biscuit", 60.0),
            ("Hide & Seek Choco", 95.0),
            ("Sunfeast YiPPee", 40.0),
            ("Haldiram Bhujia", 120.0),
            ("Lijjat Papad", 85.0),
        ],
    },
]
|
|
|
|
| def _money(value: float) -> float: |
| return round(float(value), 2) |
|
|
|
|
| def _receipt_date(rng: random.Random) -> date: |
| return date(2026, 5, 20) + timedelta(days=rng.randint(0, 20)) |
|
|
|
|
def _sample_items(rng: random.Random, supplier: dict[str, Any]) -> list[dict[str, Any]]:
    """Draw a random set of line items from *supplier*'s product list.

    Between two and four distinct products are sampled (capped by the number
    of products available). Each gets a quantity from a fixed menu, which is
    then split into case/unit counts according to the document type:

    * ``"printed tax invoice"``: the quantity is randomly recorded either as
      cases or as units, never both.
    * ``"handwritten tally note"``: units only, cases are zero.
    * anything else: both fields carry the same quantity.

    Returns:
        A list of dicts with keys ``product_raw``, ``qty_cases``,
        ``qty_units``, ``unit_cost`` and ``total``.
    """
    catalogue = supplier["products"]
    doc_type = supplier["document_type"]
    how_many = rng.randint(2, min(4, len(catalogue)))
    picks = rng.sample(catalogue, how_many)

    line_items: list[dict[str, Any]] = []
    for label, rate in picks:
        quantity = rng.choice([1, 2, 3, 4, 5, 8, 10, 12, 24])
        if doc_type == "printed tax invoice":
            # Second draw decides whether the quantity is booked as cases.
            cases = rng.choice([0, quantity])
            units = quantity if not cases else 0
        elif doc_type == "handwritten tally note":
            cases, units = 0, quantity
        else:
            cases, units = quantity, quantity

        # The total is computed from the unrounded rate; the stored unit
        # cost is rounded separately.
        line_total = _money(quantity * rate)
        line_items.append(
            {
                "product_raw": label,
                "qty_cases": int(cases),
                "qty_units": int(units),
                "unit_cost": _money(rate),
                "total": line_total,
            }
        )

    return line_items
|
|
|
|
def _render_handwritten(
    supplier: dict[str, Any],
    invoice_no: str,
    bill_date: date,
    items: list[dict[str, Any]],
    rng: random.Random,
) -> str:
    """Render the receipt text in the handwritten supplier-bill layout.

    The header holds the supplier name, the bill number with a ``dd/mm/yy``
    date, and the customer line. Each item becomes
    ``<product> <qty><sep><unit cost> = <total>``, where the multiplication
    separator is drawn from *rng* once per item. A blank line and the grand
    total close the text.
    """
    header = [
        supplier["name"],
        f"No. {invoice_no} Date: {bill_date:%d/%m/%y}",
        "M/s. Veerabhadra Stores",
        "",
    ]

    body: list[str] = []
    for entry in items:
        # Prefer the unit count; fall back to cases when units are zero.
        count = entry["qty_units"] or entry["qty_cases"]
        times = rng.choice([" X ", " x ", "X"])
        body.append(
            f"{entry['product_raw']} {count}{times}{entry['unit_cost']} = {entry['total']}"
        )

    grand_total = _money(sum(entry["total"] for entry in items))
    return "\n".join([*header, *body, "", f"Total {grand_total}"])
|
|
|
|
def _render_printed(
    supplier: dict[str, Any],
    invoice_no: str,
    bill_date: date,
    items: list[dict[str, Any]],
    rng: random.Random,
) -> str:
    """Render the receipt text in the printed tax-invoice layout.

    Item rows are numbered from 1 and show quantity, rate and net amount.
    The quantity is randomly printed either as ``<qty>/0`` or as a bare
    number (one *rng* draw per item). The footer lists gross sales, GST at
    5% of the gross, and the net amount (gross plus GST).
    """
    out = [
        supplier["name"],
        "GSTIN: 36AZLIPV6442K12M",
        "CUSTOMER: VEERA BHADRA WS",
        f"Invoice No: {invoice_no}",
        f"Bill Date: {bill_date:%d/%m/%Y}",
        "",
    ]

    for serial, entry in enumerate(items, start=1):
        # Prefer the case count; fall back to units when cases are zero.
        count = entry["qty_cases"] or entry["qty_units"]
        shown_qty = rng.choice([f"{count}/0", str(count)])
        out.append(
            f"{serial} {entry['product_raw']} | QTY {shown_qty} | RATE {entry['unit_cost']} | NET {entry['total']}"
        )

    gross = _money(sum(entry["total"] for entry in items))
    tax = _money(gross * 0.05)
    payable = _money(gross + tax)
    out += [
        "",
        f"GROSS SALES: {gross}",
        f"GST: {tax}",
        f"NET AMOUNT: {payable}",
    ]
    return "\n".join(out)
|
|
|
|
def _render_tally(
    supplier: dict[str, Any],
    invoice_no: str,
    bill_date: date,
    items: list[dict[str, Any]],
    rng: random.Random,
) -> str:
    """Render the receipt text in the handwritten tally-note layout.

    The note shows the supplier name, a ``dd/mm`` date without a year, one
    ``<product> <qty>X<unit cost> <total>`` row per item, and a total line.
    *invoice_no* and *rng* are accepted for signature parity with the other
    renderers but are not used.
    """
    out = [supplier["name"], f"Date: {bill_date:%d/%m}", ""]

    for entry in items:
        # Prefer the unit count; fall back to cases when units are zero.
        count = entry["qty_units"] or entry["qty_cases"]
        out.append(f"{entry['product_raw']} {count}X{entry['unit_cost']} {entry['total']}")

    grand_total = _money(sum(entry["total"] for entry in items))
    out += ["", f"Total: {grand_total}"]
    return "\n".join(out)
|
|
|
|
def _render_tabular(
    supplier: dict[str, Any],
    invoice_no: str,
    bill_date: date,
    items: list[dict[str, Any]],
    discount: float,
) -> str:
    """Render the receipt text in the tabular invoice layout.

    The supplier name is upper-cased, the date is printed as ``dd-mm-YYYY``,
    and item rows are numbered from 1. The footer shows the gross amount,
    the *discount* as given, and the net amount (gross minus discount).
    """
    gross = _money(sum(entry["total"] for entry in items))
    payable = _money(gross - discount)

    out = [
        supplier["name"].upper(),
        f"Invoice: {invoice_no}",
        f"Date: {bill_date:%d-%m-%Y}",
        "PARTY: VEERABHADRA STORES",
        "",
    ]

    for serial, entry in enumerate(items, start=1):
        # Prefer the unit count; fall back to cases when units are zero.
        count = entry["qty_units"] or entry["qty_cases"]
        out.append(
            f"{serial} {entry['product_raw']} QTY {count} RATE {entry['unit_cost']} AMT {entry['total']}"
        )

    out += ["", f"Gross: {gross}", f"Disc: {discount}", f"Net: {payable}"]
    return "\n".join(out)
|
|
|
|
def _render_retail(
    supplier: dict[str, Any],
    invoice_no: str,
    bill_date: date,
    items: list[dict[str, Any]],
    discount: float,
    rng: random.Random,
) -> str:
    """Render the receipt text in the retail purchase-note layout.

    A single unit label (``pkt``, ``pc``, ``nos`` or ``units``) is drawn
    from *rng* once and reused on every item row. The footer shows the
    subtotal, the discount with its whole-number percentage of the subtotal
    (0 when the subtotal is zero), and the net payable amount.
    """
    gross = _money(sum(entry["total"] for entry in items))
    if gross:
        percent_off = round(discount / gross * 100)
    else:
        # Avoid dividing by zero when there is nothing to discount.
        percent_off = 0
    payable = _money(gross - discount)

    out = [
        supplier["name"].upper(),
        f"Bill No: {invoice_no}",
        f"Date: {bill_date:%d/%m/%Y}",
        "Customer: Veerabhadra",
        "",
    ]

    label = rng.choice(["pkt", "pc", "nos", "units"])
    for entry in items:
        # Prefer the unit count; fall back to cases when units are zero.
        count = entry["qty_units"] or entry["qty_cases"]
        out.append(
            f"{entry['product_raw']} {count} {label} @{entry['unit_cost']} {entry['total']}"
        )

    out += [
        "",
        f"Sub Total: {gross}",
        f"Disc @ {percent_off}%: {discount}",
        f"Net Payable: {payable}",
    ]
    return "\n".join(out)
|
|
|
|
def _compute_discount(doc_type: str, subtotal: float, rng: random.Random) -> float:
    """Return the discount amount for a receipt of the given document type.

    Tabular invoices and retail purchase notes get a randomly chosen 5%, 8%
    or 10% of *subtotal*, rounded to two decimals. Every other document type
    gets ``0.0`` and consumes no randomness.
    """
    if doc_type not in ("tabular invoice", "retail purchase note"):
        return 0.0
    rate = rng.choice([0.05, 0.08, 0.10])
    return _money(subtotal * rate)
|
|
|
|
def _render_input(
    supplier: dict[str, Any],
    invoice_no: str,
    bill_date: date,
    items: list[dict[str, Any]],
    discount: float,
    rng: random.Random,
) -> str:
    """Render the receipt text using the layout for the supplier's document type.

    Tabular invoices and retail purchase notes receive *discount*. Printed
    tax invoices and tally notes use their own renderers. Any other document
    type falls back to the handwritten bill layout.
    """
    kind = supplier["document_type"]
    shared = (supplier, invoice_no, bill_date, items)

    # The two discount-aware layouts have their own call signatures.
    if kind == "tabular invoice":
        return _render_tabular(*shared, discount)
    if kind == "retail purchase note":
        return _render_retail(*shared, discount, rng)

    # The remaining layouts all take (…shared, rng).
    if kind == "printed tax invoice":
        renderer = _render_printed
    elif kind == "handwritten tally note":
        renderer = _render_tally
    else:
        renderer = _render_handwritten
    return renderer(*shared, rng)
|
|
|
|
def generate_examples(count: int, seed: int) -> list[dict[str, str]]:
    """Generate *count* synthetic receipt examples from a seeded RNG.

    Each example is a dict with two string fields:

    * ``"input"``: the rendered receipt text.
    * ``"output"``: a JSON string with supplier, invoice number (``null``
      for tally notes), ISO date, items, subtotal, discount, GST and net
      total.

    The same *seed* always yields the same examples, because every random
    draw comes from one ``random.Random(seed)`` in a fixed order.
    """
    rng = random.Random(seed)
    results: list[dict[str, str]] = []

    for offset in range(count):
        # Draw order matters for reproducibility: supplier, date, invoice
        # number, items, discount, then the renderer's own draws.
        supplier = rng.choice(SUPPLIERS)
        bill_date = _receipt_date(rng)
        invoice_no = str(3000 + offset + rng.randint(0, 500))
        items = _sample_items(rng, supplier)

        kind = supplier["document_type"]
        subtotal = _money(sum(entry["total"] for entry in items))
        if kind == "printed tax invoice":
            gst = _money(subtotal * 0.05)
        else:
            gst = 0.0
        discount = _compute_discount(kind, subtotal, rng)

        parsed = {
            "supplier": supplier["name"],
            # Tally notes are rendered without an invoice number.
            "invoice_no": None if kind == "handwritten tally note" else invoice_no,
            "date": bill_date.isoformat(),
            "items": items,
            "subtotal": subtotal,
            "discount": discount,
            "gst": gst,
            "net_total": _money(subtotal - discount + gst),
        }

        receipt_text = _render_input(supplier, invoice_no, bill_date, items, discount, rng)
        label_json = json.dumps(parsed, ensure_ascii=False)
        results.append({"input": receipt_text, "output": label_json})

    return results
|
|
|
|
| def _load_base_examples(path: Path | None) -> list[dict[str, str]]: |
| if path is None: |
| return [] |
| return [ |
| json.loads(line) |
| for line in path.read_text().splitlines() |
| if line.strip() |
| ] |
|
|
|
|
| def _validate_example(example: dict[str, str]) -> None: |
| if not isinstance(example.get("input"), str) or not example["input"].strip(): |
| raise ValueError("Example input must be non-empty text") |
| output = json.loads(example["output"]) |
| if not isinstance(output.get("items"), list) or not output["items"]: |
| raise ValueError("Example output must contain at least one item") |
| for item in output["items"]: |
| for field in ("product_raw", "qty_cases", "qty_units", "unit_cost", "total"): |
| if field not in item: |
| raise ValueError(f"Example item missing field: {field}") |
|
|
|
|
def write_examples(examples: list[dict[str, str]], output_path: Path) -> None:
    """Validate *examples* and write them to *output_path* as JSON Lines.

    Every example is validated before anything touches the filesystem, so a
    bad record leaves no directories or partial file behind. Missing parent
    directories are created. The file is written as UTF-8 with one JSON
    object per line; an empty *examples* list produces an empty file.

    Raises:
        ValueError: If any example fails validation.
        OSError: If the directory or file cannot be written.
    """
    for example in examples:
        _validate_example(example)

    output_path.parent.mkdir(parents=True, exist_ok=True)
    # Terminate each record with its own newline (rather than join + "\n") so
    # that zero examples yields an empty file, not a lone blank line.
    payload = "".join(
        json.dumps(example, ensure_ascii=False) + "\n" for example in examples
    )
    # ensure_ascii=False emits raw non-ASCII characters, so the encoding must
    # be pinned instead of inherited from the platform locale.
    output_path.write_text(payload, encoding="utf-8")
|
|
|
|
def main() -> None:
    """Command-line entry point: build the example set and write it to disk.

    Options:
        --count   number of synthetic examples to generate (default 48)
        --seed    RNG seed for the generator (default 7)
        --base    optional JSONL file of base examples placed before the
                  synthetic ones
        --output  destination JSONL path
    """
    cli = argparse.ArgumentParser(description="Generate synthetic receipt fine-tuning examples.")
    cli.add_argument("--count", type=int, default=48)
    cli.add_argument("--seed", type=int, default=7)
    cli.add_argument("--base", type=Path, default=None)
    cli.add_argument(
        "--output",
        type=Path,
        default=Path("data/finetune/generated/receipt_examples_synthetic.jsonl"),
    )
    options = cli.parse_args()

    # Base examples come first, followed by the freshly generated ones.
    base_examples = _load_base_examples(options.base)
    synthetic_examples = generate_examples(options.count, options.seed)
    combined = base_examples + synthetic_examples

    write_examples(combined, options.output)
    print(f"Wrote {len(combined)} examples to {options.output}")


if __name__ == "__main__":
    main()
|
|