Spaces:
Sleeping
Sleeping
| """Generate synthetic bank statements + ground-truth JSON. | |
| Produces: | |
| samples/sample_digital.pdf -- real reportlab table (exercises pdfplumber) | |
| samples/sample_scan.png -- rendered + rotated + noised (exercises MiniCPM-V) | |
| samples/sample_digital_truth.json -- ground-truth transactions (test fixture) | |
| samples/sample_scan_truth.json -- same data, used for vision-accuracy check | |
| No real bank data is used (privacy + licensing). Narrations are deliberately | |
| messy to mimic real Indian bank statements. | |
| Run: python samples/generate_samples.py | |
| """ | |
| import json | |
| import os | |
| import random | |
# Directory containing this script; main() writes every generated artifact here.
HERE = os.path.dirname(os.path.abspath(__file__))

# Balance before the first row; build_transactions() starts its running
# balance from this value and both renderers print it in the page header.
OPENING_BALANCE = 152340.50

# (narration, debit, credit) -- one tuple per transaction row.
# In every row below exactly one of debit/credit is a number and the other
# is None. Amounts are plain floats with two decimals.
# Narrations are intentionally messy and cover most ledger heads so the
# rules layer + reconciliation both get a real workout.
# Row order matters: build_transactions() derives dates and running balances
# from the position of each tuple in this list.
RAW = [
    ("UPI/CR/512345678901/RAMESH TRADERS/PAYTM/Payment for goods", None, 45000.00),
    ("NEFT-HDFC0001234-ACME PVT LTD-INV 4521", None, 128500.00),
    ("UPI/DR/423451234567/SHARMA RENT/GPAY/April office rent", 35000.00, None),
    ("ACH/NACH/SBILIFE INSURANCE/PREMIUM/POL 9921", 12500.00, None),
    ("SALARY APR 2026 - STAFF PAYROLL BATCH 04", 86000.00, None),
    ("ATM CSH WDL/AXIS/KANPUR MALL ROAD/04APR", 10000.00, None),
    ("UPI/CR/612345098765/SUNIL ENTERPRISES/PHONEPE/sale", None, 22750.00),
    ("GST PMT-CBIC-26AAAPL1234C1Z5-GSTR3B APR", 18900.00, None),
    ("NEFT-ICIC0004567-MEGHA SUPPLIERS-PO 7781", 64200.00, None),
    ("INT.PD-SAVINGS A/C-QTR ENDING", None, 1842.00),
    ("BANK CHRG-SMS CHGS APR 2026", 23.60, None),
    ("UPI/DR/734512345098/BSNL BROADBAND/INTERNET BILL", 1499.00, None),
    ("RTGS-UTIB0000123-DELTA EXPORTS-SETTLEMENT", None, 215000.00),
    ("ACH/NACH/HDFC HOME LOAN/EMI/LN44219", 28750.00, None),
    ("UPI/DR/845612340987/KESHAV ELECTRICALS/utility KESCO bill", 6540.00, None),
    ("CSH DEP/BRANCH/KANPUR/CASH DEPOSIT BY CLIENT", None, 50000.00),
    ("TDS PMT-CPC-TDS Q4 26AAAPL1234C", 9400.00, None),
    ("UPI/CR/923451234560/VIKAS AGENCY/sale receipt", None, 17800.00),
    ("NEFT-PUNB0123456-RAVI CONSULTING-PROF FEES", 25000.00, None),
    ("IRCTC TRAVEL/RAIL TICKET/PNR 8841200012", 3450.00, None),
    ("UPI/DR/103451234561/SWIGGY/staff lunch conveyance", 1280.00, None),
    ("ELECTRICITY KESCO BILL PMT/CONS 4412", 8920.00, None),
    ("UPI/CR/213451234562/ANJALI STORES/PHONEPE/sale", None, 9650.00),
    ("AMC CHGS-DEBIT CARD ANNUAL FEE", 590.00, None),
    ("NEFT-SBIN0009988-OWN ACCOUNT TRANSFER-SELF", 40000.00, None),
    ("UPI/DR/313451234563/GARG REPAIRS/AC servicing maintenance", 4200.00, None),
    ("INCOME TAX PMT-CBDT-ADVANCE TAX 26AAAPL", 35000.00, None),
    ("UPI/CR/413451234564/MOHIT TRADING/sale", None, 31200.00),
    ("LIC OF INDIA/ACH/PREMIUM/POLICY 5512", 9800.00, None),
    ("UPI/DR/513451234565/JIO RECHARGE/mobile postpaid", 799.00, None),
    ("NEFT-YESB0001122-GLOBE IMPORTS-PURCHASE", 52400.00, None),
    ("DRAWINGS-PROPRIETOR-PERSONAL WITHDRAWAL", 30000.00, None),
    ("UPI/CR/613451234566/CAPITAL INTRODUCED BY PARTNER", None, 200000.00),
    ("INT CREDIT-FIXED DEPOSIT INTEREST", None, 4521.00),
    ("UPI/DR/713451234567/PETROL HPCL/fuel conveyance", 2500.00, None),
    ("NEFT-KKBK0007788-ZENITH PVT LTD-INV 9921", None, 76300.00),
    ("ATM CSH WDL/HDFC/CIVIL LINES/22APR", 15000.00, None),
    ("UPI/DR/813451234568/PRINTWELL/office stationery purchase", 3120.00, None),
    ("INTEREST PAID ON OVERDRAFT-OD A/C", 5640.00, None),
    ("RTGS-HDFC0009090-NEXUS SOLUTIONS-FINAL PMT", None, 98000.00),
]

# NOTE(review): never read in the visible part of this file -- dates are
# generated inside build_transactions(). Kept in case another module imports it.
DATES = None
def build_transactions(raw=None, opening_balance=None):
    """Return the ground-truth transaction list with running balances.

    Args:
        raw: Iterable of ``(narration, debit, credit)`` tuples. ``debit`` and
            ``credit`` are numbers or ``None`` (``None`` counts as 0).
            Defaults to the module-level ``RAW`` table.
        opening_balance: Balance before the first row. Defaults to the
            module-level ``OPENING_BALANCE``.

    Returns:
        A list with one dict per input row, in input order, with the keys
        ``date`` (ISO ``2026-04-DD``), ``narration``, ``ref_no`` (up to the
        first six digits found in the narration, or ``None`` when it has no
        digits), ``debit``, ``credit`` and ``balance`` (running balance after
        the row, rounded to 2 decimals).
    """
    # Resolve defaults at call time so the zero-argument call keeps using
    # whatever the module constants currently hold.
    rows = RAW if raw is None else raw
    balance = OPENING_BALANCE if opening_balance is None else opening_balance

    txns = []
    # Dates: spread across April 2026, monotonically non-decreasing.
    day = 1
    for i, (narration, debit, credit) in enumerate(rows):
        # Two rows share each date: the day advances on every even-indexed
        # row after the first, and is capped at the 30th to stay in April.
        if i > 0 and i % 2 == 0:
            day = min(day + 1, 30)

        # Round at every step so float error never accumulates past a paisa.
        balance = round(balance - (debit or 0) + (credit or 0), 2)

        # Synthetic reference number: the first six digits that appear
        # anywhere in the narration; an empty result becomes None.
        digits = [ch for ch in narration if ch.isdigit()]
        ref = "".join(digits[:6]) or None

        txns.append({
            "date": f"2026-04-{day:02d}",
            "narration": narration,
            "ref_no": ref,
            "debit": debit,
            "credit": credit,
            "balance": balance,
        })
    return txns
| def _printed_date(iso): | |
| """ISO -> DD/MM/YYYY as printed on the statement.""" | |
| y, m, d = iso.split("-") | |
| return f"{d}/{m}/{y}" | |
| def _fmt_amt(v): | |
| if v is None: | |
| return "" | |
| # Indian-style grouping is hard to parse; keep it plain with thousands commas | |
| return f"{v:,.2f}" | |
def render_pdf(txns, path):
    """Write the transactions as a text-based (digital) PDF statement.

    Builds a single reportlab table with a repeating header row, preceded by
    a title, an account/period line and the opening balance.

    Args:
        txns: Iterable of transaction dicts with the keys ``date`` (ISO
            string), ``narration``, ``ref_no``, ``debit``, ``credit`` and
            ``balance``, as produced by ``build_transactions()``.
        path: Destination file path for the PDF.
    """
    # reportlab is imported lazily so the module can be imported without it.
    from xml.sax.saxutils import escape

    from reportlab.lib import colors
    from reportlab.lib.pagesizes import A4
    from reportlab.lib.styles import ParagraphStyle, getSampleStyleSheet
    from reportlab.lib.units import mm
    from reportlab.platypus import (SimpleDocTemplate, Table, TableStyle,
                                    Paragraph, Spacer)

    styles = getSampleStyleSheet()
    doc = SimpleDocTemplate(path, pagesize=A4,
                            leftMargin=10 * mm, rightMargin=10 * mm,
                            topMargin=12 * mm, bottomMargin=12 * mm)

    story = [
        Paragraph("STATE BANK STYLE DEMO — Account Statement", styles["Title"]),
        Paragraph("Account: XXXXXX4412 Period: 01/04/2026 - 30/04/2026",
                  styles["Normal"]),
        Paragraph(f"Opening Balance: {_fmt_amt(OPENING_BALANCE)}", styles["Normal"]),
        Spacer(1, 6 * mm),
    ]

    # Derive a dedicated small style for the narration column instead of
    # mutating the stylesheet's shared "BodyText" entry in place.
    narr_style = ParagraphStyle("Narration", parent=styles["BodyText"],
                                fontSize=7, leading=8)

    header = ["Date", "Narration", "Ref No", "Debit", "Credit", "Balance"]
    data = [header]
    for t in txns:
        data.append([
            _printed_date(t["date"]),
            # Paragraph parses its text as XML-like markup, so a literal
            # "&", "<" or ">" in a narration must be escaped to print as-is.
            Paragraph(escape(t["narration"]), narr_style),
            t["ref_no"] or "",
            _fmt_amt(t["debit"]),
            _fmt_amt(t["credit"]),
            _fmt_amt(t["balance"]),
        ])

    col_widths = [20 * mm, 78 * mm, 16 * mm, 22 * mm, 22 * mm, 24 * mm]
    # repeatRows=1 re-prints the header row on every page the table spans.
    table = Table(data, colWidths=col_widths, repeatRows=1)
    table.setStyle(TableStyle([
        ("BACKGROUND", (0, 0), (-1, 0), colors.HexColor("#1a3c6e")),
        ("TEXTCOLOR", (0, 0), (-1, 0), colors.white),
        ("FONTSIZE", (0, 0), (-1, -1), 7),
        ("FONTNAME", (0, 0), (-1, 0), "Helvetica-Bold"),
        # Right-align the three amount columns (Debit, Credit, Balance).
        ("ALIGN", (3, 0), (5, -1), "RIGHT"),
        ("GRID", (0, 0), (-1, -1), 0.4, colors.grey),
        ("ROWBACKGROUNDS", (0, 1), (-1, -1), [colors.white, colors.HexColor("#eef3fa")]),
        ("VALIGN", (0, 0), (-1, -1), "MIDDLE"),
    ]))
    story.append(table)
    doc.build(story)
def render_scan(txns, path):
    """Render a statement page to an image, then rotate + add noise to mimic a scan.

    Args:
        txns: Iterable of transaction dicts with the keys ``date`` (ISO
            string), ``narration``, ``ref_no``, ``debit``, ``credit`` and
            ``balance``, as produced by ``build_transactions()``.
        path: Destination file path; the image is saved as PNG.
    """
    # Pillow is imported lazily so the module can be imported without it.
    from PIL import Image, ImageDraw, ImageFont

    # Materialize once: the row count is needed up front to size the canvas,
    # and a one-shot iterable would otherwise be consumed by that count.
    txns = list(txns)

    header_y = 120      # y of the column-header text
    row_height = 30     # vertical pitch of transaction rows
    # Rows start below the header text (26px) and the rule + gap (6px).
    first_row_y = header_y + 26 + 6

    W = 1240  # ~A4 width at 150dpi
    # ~A4 height at 150dpi, grown when needed so that no row is silently
    # drawn past the bottom edge (the truth JSON would then list rows the
    # image does not contain).
    H = max(1754, first_row_y + row_height * len(txns) + 40)

    img = Image.new("RGB", (W, H), "white")
    draw = ImageDraw.Draw(img)

    try:
        font = ImageFont.truetype("arial.ttf", 16)
        font_b = ImageFont.truetype("arialbd.ttf", 20)
        font_s = ImageFont.truetype("arial.ttf", 13)
    except (OSError, ImportError):
        # Arial is not installed (OSError) or this Pillow build has no
        # FreeType support (ImportError): fall back to the built-in font.
        # NOTE(review): if load_default() yields a legacy bitmap font, the
        # em dash in the title below may not be encodable -- confirm on the
        # oldest Pillow version this script must support.
        font = ImageFont.load_default()
        font_b = font
        font_s = font

    draw.text((40, 30), "DEMO BANK — Account Statement (scanned)", fill="black", font=font_b)
    draw.text((40, 64), "Account: XXXXXX4412 Period: 01/04/2026 - 30/04/2026",
              fill="black", font=font_s)
    draw.text((40, 84), f"Opening Balance: {_fmt_amt(OPENING_BALANCE)}", fill="black", font=font_s)

    # column x positions
    cols = [40, 140, 660, 760, 920, 1080]
    headers = ["Date", "Narration", "Ref", "Debit", "Credit", "Balance"]
    for x, h in zip(cols, headers):
        draw.text((x, header_y), h, fill="black", font=font)
    rule_y = header_y + 26
    draw.line((40, rule_y, W - 40, rule_y), fill="black", width=1)

    y = first_row_y
    for t in txns:
        cells = (
            _printed_date(t["date"]),
            # Clip long narrations to 70 characters so they stay inside the
            # narration column instead of running into "Ref".
            t["narration"][:70],
            t["ref_no"] or "",
            _fmt_amt(t["debit"]),
            _fmt_amt(t["credit"]),
            _fmt_amt(t["balance"]),
        )
        for x, text in zip(cols, cells):
            draw.text((x, y), text, fill="black", font=font_s)
        y += row_height

    # Rotate 1.5 deg and add mild noise to simulate a scan. expand=False keeps
    # the canvas size; corners exposed by the rotation are filled white.
    img = img.rotate(1.5, expand=False, fillcolor="white")
    px = img.load()
    # Fixed seed: the generated fixture is reproducible from run to run.
    rnd = random.Random(42)
    for _ in range(45000):
        x = rnd.randint(0, W - 1)
        yy = rnd.randint(0, H - 1)
        # The same offset is applied to all three channels (grey noise),
        # clamped to the valid 0..255 range.
        v = rnd.randint(-25, 25)
        r, g, b = px[x, yy]
        px[x, yy] = (max(0, min(255, r + v)),
                     max(0, min(255, g + v)),
                     max(0, min(255, b + v)))
    img.save(path, "PNG")
def main():
    """Generate both sample statements and their ground-truth JSON files.

    Everything is written next to this script (``HERE``). The two truth
    files receive the same transaction list. A summary of the written paths
    is printed at the end.
    """
    def beside_script(filename):
        return os.path.join(HERE, filename)

    pdf_path = beside_script("sample_digital.pdf")
    scan_path = beside_script("sample_scan.png")
    truth_digital = beside_script("sample_digital_truth.json")
    truth_scan = beside_script("sample_scan_truth.json")

    txns = build_transactions()
    render_pdf(txns, pdf_path)
    render_scan(txns, scan_path)

    # Both fixtures describe the same data; only the rendering differs.
    for truth_path in (truth_digital, truth_scan):
        with open(truth_path, "w", encoding="utf-8") as fh:
            json.dump(txns, fh, indent=2)

    print(f"Wrote {len(txns)} transactions")
    for written in (pdf_path, scan_path, truth_digital, truth_scan):
        print(f" {written}")


if __name__ == "__main__":
    main()