"""Generate synthetic bank statements + ground-truth JSON.

Produces:
  samples/sample_digital.pdf        -- real reportlab table (exercises pdfplumber)
  samples/sample_scan.png           -- rendered + rotated + noised (exercises MiniCPM-V)
  samples/sample_digital_truth.json -- ground-truth transactions (test fixture)
  samples/sample_scan_truth.json    -- same data, used for vision-accuracy check

No real bank data is used (privacy + licensing). Narrations are deliberately
messy to mimic real Indian bank statements.

Run: python samples/generate_samples.py
"""

import json
import os
import random
from xml.sax.saxutils import escape

HERE = os.path.dirname(os.path.abspath(__file__))

OPENING_BALANCE = 152340.50

# (narration, debit, credit) -- one tuple per transaction row.
# Exactly one of debit/credit is set on each row; the other is None.
# Narrations are intentionally messy and cover most ledger heads so the
# rules layer + reconciliation both get a real workout.
RAW = [
    ("UPI/CR/512345678901/RAMESH TRADERS/PAYTM/Payment for goods", None, 45000.00),
    ("NEFT-HDFC0001234-ACME PVT LTD-INV 4521", None, 128500.00),
    ("UPI/DR/423451234567/SHARMA RENT/GPAY/April office rent", 35000.00, None),
    ("ACH/NACH/SBILIFE INSURANCE/PREMIUM/POL 9921", 12500.00, None),
    ("SALARY APR 2026 - STAFF PAYROLL BATCH 04", 86000.00, None),
    ("ATM CSH WDL/AXIS/KANPUR MALL ROAD/04APR", 10000.00, None),
    ("UPI/CR/612345098765/SUNIL ENTERPRISES/PHONEPE/sale", None, 22750.00),
    ("GST PMT-CBIC-26AAAPL1234C1Z5-GSTR3B APR", 18900.00, None),
    ("NEFT-ICIC0004567-MEGHA SUPPLIERS-PO 7781", 64200.00, None),
    ("INT.PD-SAVINGS A/C-QTR ENDING", None, 1842.00),
    ("BANK CHRG-SMS CHGS APR 2026", 23.60, None),
    ("UPI/DR/734512345098/BSNL BROADBAND/INTERNET BILL", 1499.00, None),
    ("RTGS-UTIB0000123-DELTA EXPORTS-SETTLEMENT", None, 215000.00),
    ("ACH/NACH/HDFC HOME LOAN/EMI/LN44219", 28750.00, None),
    ("UPI/DR/845612340987/KESHAV ELECTRICALS/utility KESCO bill", 6540.00, None),
    ("CSH DEP/BRANCH/KANPUR/CASH DEPOSIT BY CLIENT", None, 50000.00),
    ("TDS PMT-CPC-TDS Q4 26AAAPL1234C", 9400.00, None),
    ("UPI/CR/923451234560/VIKAS AGENCY/sale receipt", None, 17800.00),
    ("NEFT-PUNB0123456-RAVI CONSULTING-PROF FEES", 25000.00, None),
    ("IRCTC TRAVEL/RAIL TICKET/PNR 8841200012", 3450.00, None),
    ("UPI/DR/103451234561/SWIGGY/staff lunch conveyance", 1280.00, None),
    ("ELECTRICITY KESCO BILL PMT/CONS 4412", 8920.00, None),
    ("UPI/CR/213451234562/ANJALI STORES/PHONEPE/sale", None, 9650.00),
    ("AMC CHGS-DEBIT CARD ANNUAL FEE", 590.00, None),
    ("NEFT-SBIN0009988-OWN ACCOUNT TRANSFER-SELF", 40000.00, None),
    ("UPI/DR/313451234563/GARG REPAIRS/AC servicing maintenance", 4200.00, None),
    ("INCOME TAX PMT-CBDT-ADVANCE TAX 26AAAPL", 35000.00, None),
    ("UPI/CR/413451234564/MOHIT TRADING/sale", None, 31200.00),
    ("LIC OF INDIA/ACH/PREMIUM/POLICY 5512", 9800.00, None),
    ("UPI/DR/513451234565/JIO RECHARGE/mobile postpaid", 799.00, None),
    ("NEFT-YESB0001122-GLOBE IMPORTS-PURCHASE", 52400.00, None),
    ("DRAWINGS-PROPRIETOR-PERSONAL WITHDRAWAL", 30000.00, None),
    ("UPI/CR/613451234566/CAPITAL INTRODUCED BY PARTNER", None, 200000.00),
    ("INT CREDIT-FIXED DEPOSIT INTEREST", None, 4521.00),
    ("UPI/DR/713451234567/PETROL HPCL/fuel conveyance", 2500.00, None),
    ("NEFT-KKBK0007788-ZENITH PVT LTD-INV 9921", None, 76300.00),
    ("ATM CSH WDL/HDFC/CIVIL LINES/22APR", 15000.00, None),
    ("UPI/DR/813451234568/PRINTWELL/office stationery purchase", 3120.00, None),
    ("INTEREST PAID ON OVERDRAFT-OD A/C", 5640.00, None),
    ("RTGS-HDFC0009090-NEXUS SOLUTIONS-FINAL PMT", None, 98000.00),
]

# Not read anywhere in this file; kept so the module-level name stays available.
DATES = None


def build_transactions():
    """Return the ground-truth transaction list with running balances.

    Each entry is a dict with the keys ``date`` (ISO ``YYYY-MM-DD``),
    ``narration``, ``ref_no`` (first six digits found in the narration, or
    ``None`` when it has no digits), ``debit``, ``credit`` and ``balance``
    (running balance after the row, rounded to 2 decimals).
    """
    txns = []
    balance = OPENING_BALANCE
    # Dates: spread across April 2026, monotonically non-decreasing.
    day = 1
    for i, (narration, debit, credit) in enumerate(RAW):
        # Advance the day on every second row, capped at the 30th so the
        # date always stays within April.
        if i > 0 and i % 2 == 0:
            day = min(day + 1, 30)
        date_iso = f"2026-04-{day:02d}"
        # Round after every row so float error never accumulates in the
        # printed/ground-truth balance.
        balance = round(balance - (debit or 0) + (credit or 0), 2)
        # Synthetic reference number: the first six digits appearing anywhere
        # in the narration (possibly drawn from more than one digit group).
        digits = [c for c in narration if c.isdigit()]
        ref = "".join(digits[:6]) or None
        txns.append({
            "date": date_iso,
            "narration": narration,
            "ref_no": ref,
            "debit": debit,
            "credit": credit,
            "balance": balance,
        })
    return txns


def _printed_date(iso):
    """ISO -> DD/MM/YYYY as printed on the statement."""
    y, m, d = iso.split("-")
    return f"{d}/{m}/{y}"


def _fmt_amt(v):
    """Format an amount with thousands commas and 2 decimals; None -> ''."""
    if v is None:
        return ""
    # Indian-style grouping is hard to parse; keep it plain with thousands commas
    return f"{v:,.2f}"


def render_pdf(txns, path):
    """Write the transactions to ``path`` as a digital (text-layer) PDF table.

    ``txns`` is a list of dicts shaped like the output of
    ``build_transactions``. reportlab is imported lazily so the rest of the
    module can be imported without it installed.
    """
    from reportlab.lib import colors
    from reportlab.lib.pagesizes import A4
    from reportlab.lib.units import mm
    from reportlab.platypus import (SimpleDocTemplate, Table, TableStyle,
                                    Paragraph, Spacer)
    from reportlab.lib.styles import ParagraphStyle, getSampleStyleSheet

    styles = getSampleStyleSheet()
    doc = SimpleDocTemplate(path, pagesize=A4,
                            leftMargin=10 * mm, rightMargin=10 * mm,
                            topMargin=12 * mm, bottomMargin=12 * mm)

    story = []
    story.append(Paragraph("STATE BANK STYLE DEMO — Account Statement",
                           styles["Title"]))
    # The gap between the two fields is written as explicit non-breaking
    # spaces so it is visible in the source and is not merged into a single
    # space the way a run of ordinary spaces would be.
    story.append(Paragraph(
        "Account: XXXXXX4412\u00a0\u00a0\u00a0\u00a0"
        "Period: 01/04/2026 - 30/04/2026",
        styles["Normal"]))
    story.append(Paragraph(f"Opening Balance: {_fmt_amt(OPENING_BALANCE)}",
                           styles["Normal"]))
    story.append(Spacer(1, 6 * mm))

    header = ["Date", "Narration", "Ref No", "Debit", "Credit", "Balance"]
    data = [header]

    # Derive a dedicated small style instead of mutating the stylesheet's
    # shared "BodyText" entry in place.
    narr_style = ParagraphStyle("Narration", parent=styles["BodyText"],
                                fontSize=7, leading=8)

    for t in txns:
        data.append([
            _printed_date(t["date"]),
            # Paragraph parses its text as XML-like markup, so escape
            # '&', '<' and '>' to keep arbitrary narrations literal.
            Paragraph(escape(t["narration"]), narr_style),
            t["ref_no"] or "",
            _fmt_amt(t["debit"]),
            _fmt_amt(t["credit"]),
            _fmt_amt(t["balance"]),
        ])

    col_widths = [20 * mm, 78 * mm, 16 * mm, 22 * mm, 22 * mm, 24 * mm]
    # repeatRows=1 repeats the header row on every page the table spans.
    table = Table(data, colWidths=col_widths, repeatRows=1)
    table.setStyle(TableStyle([
        ("BACKGROUND", (0, 0), (-1, 0), colors.HexColor("#1a3c6e")),
        ("TEXTCOLOR", (0, 0), (-1, 0), colors.white),
        ("FONTSIZE", (0, 0), (-1, -1), 7),
        ("FONTNAME", (0, 0), (-1, 0), "Helvetica-Bold"),
        ("ALIGN", (3, 0), (5, -1), "RIGHT"),
        ("GRID", (0, 0), (-1, -1), 0.4, colors.grey),
        ("ROWBACKGROUNDS", (0, 1), (-1, -1),
         [colors.white, colors.HexColor("#eef3fa")]),
        ("VALIGN", (0, 0), (-1, -1), "MIDDLE"),
    ]))
    story.append(table)
    doc.build(story)


def render_scan(txns, path):
    """Render a statement page to an image, then rotate + add noise to mimic a scan.

    The page is a single fixed-size canvas with one text row every 30 px and
    no pagination, so rows that do not fit the page height are drawn off the
    canvas. Narrations longer than 70 characters are truncated in the image.
    The noise uses a fixed seed, so the output is reproducible for a given
    font set.
    """
    from PIL import Image, ImageDraw, ImageFont

    W, H = 1240, 1754  # ~A4 at 150dpi
    img = Image.new("RGB", (W, H), "white")
    draw = ImageDraw.Draw(img)
    try:
        font = ImageFont.truetype("arial.ttf", 16)
        font_b = ImageFont.truetype("arialbd.ttf", 20)
        font_s = ImageFont.truetype("arial.ttf", 13)
    except (OSError, ImportError):
        # Font file not found, or Pillow built without FreeType support:
        # fall back to the built-in default font for all three sizes.
        font = ImageFont.load_default()
        font_b = font
        font_s = font

    # NOTE(review): the title contains an em dash; the bitmap default font of
    # older Pillow releases may not handle non-Latin-1 text -- confirm on the
    # target Pillow version if the Arial fonts are unavailable.
    draw.text((40, 30), "DEMO BANK — Account Statement (scanned)",
              fill="black", font=font_b)
    draw.text((40, 64), "Account: XXXXXX4412 Period: 01/04/2026 - 30/04/2026",
              fill="black", font=font_s)
    draw.text((40, 84), f"Opening Balance: {_fmt_amt(OPENING_BALANCE)}",
              fill="black", font=font_s)

    # column x positions
    cols = [40, 140, 660, 760, 920, 1080]
    y = 120
    headers = ["Date", "Narration", "Ref", "Debit", "Credit", "Balance"]
    for x, h in zip(cols, headers):
        draw.text((x, y), h, fill="black", font=font)
    y += 26
    draw.line((40, y, W - 40, y), fill="black", width=1)
    y += 6

    for t in txns:
        cells = [
            _printed_date(t["date"]),
            t["narration"][:70],  # truncate long narrations to 70 characters
            t["ref_no"] or "",
            _fmt_amt(t["debit"]),
            _fmt_amt(t["credit"]),
            _fmt_amt(t["balance"]),
        ]
        for x, text in zip(cols, cells):
            draw.text((x, y), text, fill="black", font=font_s)
        y += 30

    # Rotate 1.5 deg and add mild gaussian-ish noise to simulate a scan.
    img = img.rotate(1.5, expand=False, fillcolor="white")
    px = img.load()
    # Fixed seed keeps the noise pattern identical from run to run.
    rnd = random.Random(42)
    for _ in range(45000):
        x = rnd.randint(0, W - 1)
        yy = rnd.randint(0, H - 1)
        v = rnd.randint(-25, 25)
        r, g, b = px[x, yy]
        # Same offset on every channel, clamped to the 0..255 range.
        px[x, yy] = (max(0, min(255, r + v)),
                     max(0, min(255, g + v)),
                     max(0, min(255, b + v)))
    img.save(path, "PNG")


def main():
    """Generate the PDF, the scan image and both ground-truth JSON files."""
    txns = build_transactions()

    pdf_path = os.path.join(HERE, "sample_digital.pdf")
    scan_path = os.path.join(HERE, "sample_scan.png")
    truth_digital = os.path.join(HERE, "sample_digital_truth.json")
    truth_scan = os.path.join(HERE, "sample_scan_truth.json")

    render_pdf(txns, pdf_path)
    render_scan(txns, scan_path)

    # Both fixtures carry the same data; the scan copy is the reference for
    # the vision-accuracy check.
    for truth_path in (truth_digital, truth_scan):
        with open(truth_path, "w", encoding="utf-8") as f:
            json.dump(txns, f, indent=2)

    print(f"Wrote {len(txns)} transactions")
    for out_path in (pdf_path, scan_path, truth_digital, truth_scan):
        print(f"  {out_path}")


if __name__ == "__main__":
    main()