# statementsetu/samples/generate_samples.py
# (Hugging Face file-viewer header: uploaded by perceptron01, "Upload 5 files",
# commit 7fe2877, 9.84 kB.)
"""Generate synthetic bank statements + ground-truth JSON.
Produces:
samples/sample_digital.pdf -- real reportlab table (exercises pdfplumber)
samples/sample_scan.png -- rendered + rotated + noised (exercises MiniCPM-V)
samples/sample_digital_truth.json -- ground-truth transactions (test fixture)
samples/sample_scan_truth.json -- same data, used for vision-accuracy check
No real bank data is used (privacy + licensing). Narrations are deliberately
messy to mimic real Indian bank statements.
Run: python samples/generate_samples.py
"""
import json
import os
import random
# Directory containing this script; main() writes every generated sample file here.
HERE = os.path.dirname(os.path.abspath(__file__))
# Balance the running-balance computation starts from; it is also printed in the
# header of both the PDF and the scanned-image rendering.
OPENING_BALANCE = 152340.50
# (narration, debit, credit) -- one tuple per transaction row.
# Each row sets exactly one of debit/credit; the other side is None.
# Rows are listed in statement order; build_transactions() assigns dates,
# reference numbers and running balances from this order.
# Narrations are intentionally messy and cover most ledger heads so the
# rules layer + reconciliation both get a real workout.
RAW = [
    ("UPI/CR/512345678901/RAMESH TRADERS/PAYTM/Payment for goods", None, 45000.00),
    ("NEFT-HDFC0001234-ACME PVT LTD-INV 4521", None, 128500.00),
    ("UPI/DR/423451234567/SHARMA RENT/GPAY/April office rent", 35000.00, None),
    ("ACH/NACH/SBILIFE INSURANCE/PREMIUM/POL 9921", 12500.00, None),
    ("SALARY APR 2026 - STAFF PAYROLL BATCH 04", 86000.00, None),
    ("ATM CSH WDL/AXIS/KANPUR MALL ROAD/04APR", 10000.00, None),
    ("UPI/CR/612345098765/SUNIL ENTERPRISES/PHONEPE/sale", None, 22750.00),
    ("GST PMT-CBIC-26AAAPL1234C1Z5-GSTR3B APR", 18900.00, None),
    ("NEFT-ICIC0004567-MEGHA SUPPLIERS-PO 7781", 64200.00, None),
    ("INT.PD-SAVINGS A/C-QTR ENDING", None, 1842.00),
    ("BANK CHRG-SMS CHGS APR 2026", 23.60, None),
    ("UPI/DR/734512345098/BSNL BROADBAND/INTERNET BILL", 1499.00, None),
    ("RTGS-UTIB0000123-DELTA EXPORTS-SETTLEMENT", None, 215000.00),
    ("ACH/NACH/HDFC HOME LOAN/EMI/LN44219", 28750.00, None),
    ("UPI/DR/845612340987/KESHAV ELECTRICALS/utility KESCO bill", 6540.00, None),
    ("CSH DEP/BRANCH/KANPUR/CASH DEPOSIT BY CLIENT", None, 50000.00),
    ("TDS PMT-CPC-TDS Q4 26AAAPL1234C", 9400.00, None),
    ("UPI/CR/923451234560/VIKAS AGENCY/sale receipt", None, 17800.00),
    ("NEFT-PUNB0123456-RAVI CONSULTING-PROF FEES", 25000.00, None),
    ("IRCTC TRAVEL/RAIL TICKET/PNR 8841200012", 3450.00, None),
    ("UPI/DR/103451234561/SWIGGY/staff lunch conveyance", 1280.00, None),
    ("ELECTRICITY KESCO BILL PMT/CONS 4412", 8920.00, None),
    ("UPI/CR/213451234562/ANJALI STORES/PHONEPE/sale", None, 9650.00),
    ("AMC CHGS-DEBIT CARD ANNUAL FEE", 590.00, None),
    ("NEFT-SBIN0009988-OWN ACCOUNT TRANSFER-SELF", 40000.00, None),
    ("UPI/DR/313451234563/GARG REPAIRS/AC servicing maintenance", 4200.00, None),
    ("INCOME TAX PMT-CBDT-ADVANCE TAX 26AAAPL", 35000.00, None),
    ("UPI/CR/413451234564/MOHIT TRADING/sale", None, 31200.00),
    ("LIC OF INDIA/ACH/PREMIUM/POLICY 5512", 9800.00, None),
    ("UPI/DR/513451234565/JIO RECHARGE/mobile postpaid", 799.00, None),
    ("NEFT-YESB0001122-GLOBE IMPORTS-PURCHASE", 52400.00, None),
    ("DRAWINGS-PROPRIETOR-PERSONAL WITHDRAWAL", 30000.00, None),
    ("UPI/CR/613451234566/CAPITAL INTRODUCED BY PARTNER", None, 200000.00),
    ("INT CREDIT-FIXED DEPOSIT INTEREST", None, 4521.00),
    ("UPI/DR/713451234567/PETROL HPCL/fuel conveyance", 2500.00, None),
    ("NEFT-KKBK0007788-ZENITH PVT LTD-INV 9921", None, 76300.00),
    ("ATM CSH WDL/HDFC/CIVIL LINES/22APR", 15000.00, None),
    ("UPI/DR/813451234568/PRINTWELL/office stationery purchase", 3120.00, None),
    ("INTEREST PAID ON OVERDRAFT-OD A/C", 5640.00, None),
    ("RTGS-HDFC0009090-NEXUS SOLUTIONS-FINAL PMT", None, 98000.00),
]
# NOTE(review): DATES is never read in this file as shown -- dates are derived
# inside build_transactions(). Presumably a leftover placeholder; confirm that
# nothing else imports it before removing.
DATES = None
def build_transactions(rows=None, opening_balance=None):
    """Return the ground-truth transaction list with running balances.

    Args:
        rows: Iterable of ``(narration, debit, credit)`` tuples in statement
            order, where the unused side of each row is ``None``. When
            omitted, the module-level ``RAW`` table is used.
        opening_balance: Balance before the first row. When omitted, the
            module-level ``OPENING_BALANCE`` is used.

    Returns:
        A list with one dict per row, with keys ``date`` (ISO
        ``2026-04-DD``), ``narration``, ``ref_no``, ``debit``, ``credit`` and
        ``balance``. ``ref_no`` is the first (up to) six digit characters of
        the narration, or ``None`` when the narration contains no digits.
        ``balance`` is the running balance after the row, rounded to 2
        decimals at every step.
    """
    # Compare against None (not truthiness) so an empty row list or a zero
    # opening balance supplied by the caller is honoured.
    if rows is None:
        rows = RAW
    if opening_balance is None:
        opening_balance = OPENING_BALANCE

    txns = []
    balance = opening_balance
    for i, (narration, debit, credit) in enumerate(rows):
        # Two consecutive rows share each date, starting on the 1st; the day
        # is capped at 30 so dates stay within April and never decrease.
        day = min(1 + i // 2, 30)

        # Round after every row so float error does not accumulate in the
        # balance column.
        balance = round(balance - (debit or 0) + (credit or 0), 2)

        digits = [ch for ch in narration if ch.isdigit()]
        txns.append({
            "date": f"2026-04-{day:02d}",
            "narration": narration,
            "ref_no": "".join(digits[:6]) or None,
            "debit": debit,
            "credit": credit,
            "balance": balance,
        })
    return txns
def _printed_date(iso):
"""ISO -> DD/MM/YYYY as printed on the statement."""
y, m, d = iso.split("-")
return f"{d}/{m}/{y}"
def _fmt_amt(v):
if v is None:
return ""
# Indian-style grouping is hard to parse; keep it plain with thousands commas
return f"{v:,.2f}"
def render_pdf(txns, path):
    """Render the transactions as an A4 PDF statement with a real reportlab table.

    Args:
        txns: Transaction dicts with keys ``date``, ``narration``, ``ref_no``,
            ``debit``, ``credit`` and ``balance`` (as built by
            build_transactions()).
        path: Destination path of the PDF file.
    """
    from xml.sax.saxutils import escape

    from reportlab.lib import colors
    from reportlab.lib.pagesizes import A4
    from reportlab.lib.styles import ParagraphStyle, getSampleStyleSheet
    from reportlab.lib.units import mm
    from reportlab.platypus import (Paragraph, SimpleDocTemplate, Spacer,
                                    Table, TableStyle)

    styles = getSampleStyleSheet()
    doc = SimpleDocTemplate(path, pagesize=A4,
                            leftMargin=10 * mm, rightMargin=10 * mm,
                            topMargin=12 * mm, bottomMargin=12 * mm)

    # Statement header: title, account/period line, opening balance.
    story = [
        Paragraph("STATE BANK STYLE DEMO — Account Statement", styles["Title"]),
        Paragraph("Account: XXXXXX4412    Period: 01/04/2026 - 30/04/2026",
                  styles["Normal"]),
        Paragraph(f"Opening Balance: {_fmt_amt(OPENING_BALANCE)}", styles["Normal"]),
        Spacer(1, 6 * mm),
    ]

    # Small dedicated style for the narration cells, derived from BodyText
    # rather than mutating the stylesheet's own BodyText entry in place.
    narr_style = ParagraphStyle("Narration", parent=styles["BodyText"],
                                fontSize=7, leading=8)

    data = [["Date", "Narration", "Ref No", "Debit", "Credit", "Balance"]]
    for t in txns:
        data.append([
            _printed_date(t["date"]),
            # Paragraph parses XML-like markup, so escape &, < and > to keep
            # arbitrary narration text literal.
            Paragraph(escape(t["narration"]), narr_style),
            t["ref_no"] or "",
            _fmt_amt(t["debit"]),
            _fmt_amt(t["credit"]),
            _fmt_amt(t["balance"]),
        ])

    col_widths = [20 * mm, 78 * mm, 16 * mm, 22 * mm, 22 * mm, 24 * mm]
    # repeatRows=1 repeats the header row if the table splits across pages.
    table = Table(data, colWidths=col_widths, repeatRows=1)
    table.setStyle(TableStyle([
        ("BACKGROUND", (0, 0), (-1, 0), colors.HexColor("#1a3c6e")),
        ("TEXTCOLOR", (0, 0), (-1, 0), colors.white),
        ("FONTSIZE", (0, 0), (-1, -1), 7),
        ("FONTNAME", (0, 0), (-1, 0), "Helvetica-Bold"),
        # Right-align the three amount columns (Debit, Credit, Balance).
        ("ALIGN", (3, 0), (5, -1), "RIGHT"),
        ("GRID", (0, 0), (-1, -1), 0.4, colors.grey),
        ("ROWBACKGROUNDS", (0, 1), (-1, -1), [colors.white, colors.HexColor("#eef3fa")]),
        ("VALIGN", (0, 0), (-1, -1), "MIDDLE"),
    ]))
    story.append(table)
    doc.build(story)
def render_scan(txns, path):
    """Render a statement page to an image, then rotate + add noise to mimic a scan.

    Args:
        txns: Transaction dicts with keys ``date``, ``narration``, ``ref_no``,
            ``debit``, ``credit`` and ``balance`` (as built by
            build_transactions()).
        path: Destination path of the PNG file.

    The noise uses a fixed seed, so repeated runs perturb the same pixels by
    the same amounts.
    """
    from PIL import Image, ImageDraw, ImageFont

    W, H = 1240, 1754  # ~A4 at 150dpi
    img = Image.new("RGB", (W, H), "white")
    draw = ImageDraw.Draw(img)

    try:
        font = ImageFont.truetype("arial.ttf", 16)
        font_b = ImageFont.truetype("arialbd.ttf", 20)
        font_s = ImageFont.truetype("arial.ttf", 13)
    except (OSError, ImportError):
        # Font file not found/readable, or FreeType support unavailable:
        # fall back to Pillow's built-in font for all three roles.
        # NOTE(review): if load_default() returns the legacy bitmap font, the
        # em dash in the title below may not be drawable -- confirm on the
        # target Pillow version.
        font = font_b = font_s = ImageFont.load_default()

    draw.text((40, 30), "DEMO BANK — Account Statement (scanned)", fill="black", font=font_b)
    draw.text((40, 64), "Account: XXXXXX4412 Period: 01/04/2026 - 30/04/2026",
              fill="black", font=font_s)
    draw.text((40, 84), f"Opening Balance: {_fmt_amt(OPENING_BALANCE)}", fill="black", font=font_s)

    # Column x positions, in the same order as the headers below.
    cols = [40, 140, 660, 760, 920, 1080]
    headers = ["Date", "Narration", "Ref", "Debit", "Credit", "Balance"]
    y = 120
    for x, h in zip(cols, headers):
        draw.text((x, y), h, fill="black", font=font)
    y += 26
    draw.line((40, y, W - 40, y), fill="black", width=1)
    y += 6

    # One text row per transaction, 30 px apart.
    for t in txns:
        cells = (
            _printed_date(t["date"]),
            t["narration"][:70],  # cap narration length at 70 characters
            t["ref_no"] or "",
            _fmt_amt(t["debit"]),
            _fmt_amt(t["credit"]),
            _fmt_amt(t["balance"]),
        )
        for x, text in zip(cols, cells):
            draw.text((x, y), text, fill="black", font=font_s)
        y += 30

    # Rotate 1.5 deg, then perturb 45000 randomly chosen pixels by a uniform
    # offset in [-25, 25] (same offset on all three channels, clamped to 0..255).
    img = img.rotate(1.5, expand=False, fillcolor="white")
    px = img.load()
    rnd = random.Random(42)
    for _ in range(45000):
        x = rnd.randint(0, W - 1)
        yy = rnd.randint(0, H - 1)
        v = rnd.randint(-25, 25)
        r, g, b = px[x, yy]
        px[x, yy] = (max(0, min(255, r + v)),
                     max(0, min(255, g + v)),
                     max(0, min(255, b + v)))
    img.save(path, "PNG")
def main():
    """Build the transactions, write both renderings and both truth JSON files, and report the paths."""
    txns = build_transactions()

    def _beside_script(filename):
        # All outputs are written into the directory holding this script.
        return os.path.join(HERE, filename)

    pdf_path = _beside_script("sample_digital.pdf")
    scan_path = _beside_script("sample_scan.png")
    truth_digital = _beside_script("sample_digital_truth.json")
    truth_scan = _beside_script("sample_scan_truth.json")

    render_pdf(txns, pdf_path)
    render_scan(txns, scan_path)

    # Both truth files carry the same transaction list.
    for truth_path in (truth_digital, truth_scan):
        with open(truth_path, "w", encoding="utf-8") as fh:
            json.dump(txns, fh, indent=2)

    print(f"Wrote {len(txns)} transactions")
    for written in (pdf_path, scan_path, truth_digital, truth_scan):
        print(f" {written}")


if __name__ == "__main__":
    main()