"""Generate synthetic bank statements + ground-truth JSON.

Produces:
    samples/sample_digital.pdf         -- real reportlab table (exercises pdfplumber)
    samples/sample_scan.png            -- rendered + rotated + noised (exercises MiniCPM-V)
    samples/sample_digital_truth.json  -- ground-truth transactions (test fixture)
    samples/sample_scan_truth.json     -- same data, used for vision-accuracy check

No real bank data is used (privacy + licensing). Narrations are deliberately
messy to mimic real Indian bank statements.

Run: python samples/generate_samples.py
"""
import json
import os
import random
# Absolute directory containing this script; main() writes every generated
# sample file into it.
HERE = os.path.dirname(os.path.abspath(__file__))
# Balance printed in both statement headers and used as the starting point of
# the running balance in build_transactions().
OPENING_BALANCE = 152340.50
# (narration, debit, credit) -- one tuple per transaction row, in statement
# order. In every row below exactly one of debit/credit holds an amount and
# the other is None (rendered as an empty cell).
# Narrations are intentionally messy and cover most ledger heads so the
# rules layer + reconciliation both get a real workout.
RAW = [
    ("UPI/CR/512345678901/RAMESH TRADERS/PAYTM/Payment for goods", None, 45000.00),
    ("NEFT-HDFC0001234-ACME PVT LTD-INV 4521", None, 128500.00),
    ("UPI/DR/423451234567/SHARMA RENT/GPAY/April office rent", 35000.00, None),
    ("ACH/NACH/SBILIFE INSURANCE/PREMIUM/POL 9921", 12500.00, None),
    ("SALARY APR 2026 - STAFF PAYROLL BATCH 04", 86000.00, None),
    ("ATM CSH WDL/AXIS/KANPUR MALL ROAD/04APR", 10000.00, None),
    ("UPI/CR/612345098765/SUNIL ENTERPRISES/PHONEPE/sale", None, 22750.00),
    ("GST PMT-CBIC-26AAAPL1234C1Z5-GSTR3B APR", 18900.00, None),
    ("NEFT-ICIC0004567-MEGHA SUPPLIERS-PO 7781", 64200.00, None),
    ("INT.PD-SAVINGS A/C-QTR ENDING", None, 1842.00),
    ("BANK CHRG-SMS CHGS APR 2026", 23.60, None),
    ("UPI/DR/734512345098/BSNL BROADBAND/INTERNET BILL", 1499.00, None),
    ("RTGS-UTIB0000123-DELTA EXPORTS-SETTLEMENT", None, 215000.00),
    ("ACH/NACH/HDFC HOME LOAN/EMI/LN44219", 28750.00, None),
    ("UPI/DR/845612340987/KESHAV ELECTRICALS/utility KESCO bill", 6540.00, None),
    ("CSH DEP/BRANCH/KANPUR/CASH DEPOSIT BY CLIENT", None, 50000.00),
    ("TDS PMT-CPC-TDS Q4 26AAAPL1234C", 9400.00, None),
    ("UPI/CR/923451234560/VIKAS AGENCY/sale receipt", None, 17800.00),
    ("NEFT-PUNB0123456-RAVI CONSULTING-PROF FEES", 25000.00, None),
    ("IRCTC TRAVEL/RAIL TICKET/PNR 8841200012", 3450.00, None),
    ("UPI/DR/103451234561/SWIGGY/staff lunch conveyance", 1280.00, None),
    ("ELECTRICITY KESCO BILL PMT/CONS 4412", 8920.00, None),
    ("UPI/CR/213451234562/ANJALI STORES/PHONEPE/sale", None, 9650.00),
    ("AMC CHGS-DEBIT CARD ANNUAL FEE", 590.00, None),
    ("NEFT-SBIN0009988-OWN ACCOUNT TRANSFER-SELF", 40000.00, None),
    ("UPI/DR/313451234563/GARG REPAIRS/AC servicing maintenance", 4200.00, None),
    ("INCOME TAX PMT-CBDT-ADVANCE TAX 26AAAPL", 35000.00, None),
    ("UPI/CR/413451234564/MOHIT TRADING/sale", None, 31200.00),
    ("LIC OF INDIA/ACH/PREMIUM/POLICY 5512", 9800.00, None),
    ("UPI/DR/513451234565/JIO RECHARGE/mobile postpaid", 799.00, None),
    ("NEFT-YESB0001122-GLOBE IMPORTS-PURCHASE", 52400.00, None),
    ("DRAWINGS-PROPRIETOR-PERSONAL WITHDRAWAL", 30000.00, None),
    ("UPI/CR/613451234566/CAPITAL INTRODUCED BY PARTNER", None, 200000.00),
    ("INT CREDIT-FIXED DEPOSIT INTEREST", None, 4521.00),
    ("UPI/DR/713451234567/PETROL HPCL/fuel conveyance", 2500.00, None),
    ("NEFT-KKBK0007788-ZENITH PVT LTD-INV 9921", None, 76300.00),
    ("ATM CSH WDL/HDFC/CIVIL LINES/22APR", 15000.00, None),
    ("UPI/DR/813451234568/PRINTWELL/office stationery purchase", 3120.00, None),
    ("INTEREST PAID ON OVERDRAFT-OD A/C", 5640.00, None),
    ("RTGS-HDFC0009090-NEXUS SOLUTIONS-FINAL PMT", None, 98000.00),
]
# NOTE(review): DATES is not referenced anywhere in the visible code -- the
# transaction dates are computed inside build_transactions(). Looks like a
# leftover placeholder; confirm nothing imports it before removing.
DATES = None
def build_transactions(raw=None, opening_balance=None):
    """Return the ground-truth transaction list with running balances.

    Args:
        raw: iterable of ``(narration, debit, credit)`` tuples, in statement
            order. ``None`` (the default) uses the module-level ``RAW``.
        opening_balance: balance before the first row. ``None`` (the
            default) uses the module-level ``OPENING_BALANCE``.

    Returns:
        A list with one dict per input row, with keys ``date`` (ISO
        ``2026-04-DD``), ``narration``, ``ref_no`` (first six digit
        characters of the narration, or ``None`` when it has no digits),
        ``debit``, ``credit`` (passed through unchanged) and ``balance``
        (running balance rounded to 2 decimals).
    """
    rows = RAW if raw is None else raw
    balance = OPENING_BALANCE if opening_balance is None else opening_balance

    txns = []
    # Dates: spread across April 2026, monotonically non-decreasing.
    day = 1
    for i, (narration, debit, credit) in enumerate(rows):
        # Advance the day on every second row (two rows share a date),
        # clamped so long inputs never run past 30 April.
        if i > 0 and i % 2 == 0:
            day = min(day + 1, 30)
        # A None amount means "nothing on that side" and counts as zero.
        # Rounding at each step keeps float drift out of the fixture.
        balance = round(balance - (debit or 0) + (credit or 0), 2)
        digits = "".join(c for c in narration if c.isdigit())
        txns.append({
            "date": f"2026-04-{day:02d}",
            "narration": narration,
            "ref_no": digits[:6] or None,
            "debit": debit,
            "credit": credit,
            "balance": balance,
        })
    return txns
def _printed_date(iso):
"""ISO -> DD/MM/YYYY as printed on the statement."""
y, m, d = iso.split("-")
return f"{d}/{m}/{y}"
def _fmt_amt(v):
if v is None:
return ""
# Indian-style grouping is hard to parse; keep it plain with thousands commas
return f"{v:,.2f}"
def render_pdf(txns, path):
    """Write the transactions to *path* as an A4 PDF statement.

    The page carries a title, an account/period line, the opening balance
    and a six-column table (Date, Narration, Ref No, Debit, Credit,
    Balance) built with reportlab platypus.

    Args:
        txns: list of transaction dicts with the keys ``date`` (ISO),
            ``narration``, ``ref_no``, ``debit``, ``credit``, ``balance``.
        path: destination file path for the PDF.
    """
    from xml.sax.saxutils import escape

    from reportlab.lib import colors
    from reportlab.lib.pagesizes import A4
    from reportlab.lib.units import mm
    from reportlab.platypus import (SimpleDocTemplate, Table, TableStyle,
                                    Paragraph, Spacer)
    from reportlab.lib.styles import getSampleStyleSheet

    styles = getSampleStyleSheet()
    doc = SimpleDocTemplate(path, pagesize=A4,
                            leftMargin=10 * mm, rightMargin=10 * mm,
                            topMargin=12 * mm, bottomMargin=12 * mm)
    story = []
    story.append(Paragraph("STATE BANK STYLE DEMO — Account Statement", styles["Title"]))
    story.append(Paragraph("Account: XXXXXX4412 Period: 01/04/2026 - 30/04/2026",
                           styles["Normal"]))
    story.append(Paragraph(f"Opening Balance: {_fmt_amt(OPENING_BALANCE)}", styles["Normal"]))
    story.append(Spacer(1, 6 * mm))

    header = ["Date", "Narration", "Ref No", "Debit", "Credit", "Balance"]
    data = [header]
    # The stylesheet was created fresh above, so shrinking BodyText in place
    # only affects this document.
    narr_style = styles["BodyText"]
    narr_style.fontSize = 7
    narr_style.leading = 8
    for t in txns:
        data.append([
            _printed_date(t["date"]),
            # Paragraph (used so long narrations wrap inside the cell)
            # parses its text as XML-like markup; escape &, < and > so a
            # narration containing them is printed literally.
            Paragraph(escape(t["narration"]), narr_style),
            t["ref_no"] or "",
            _fmt_amt(t["debit"]),
            _fmt_amt(t["credit"]),
            _fmt_amt(t["balance"]),
        ])

    col_widths = [20 * mm, 78 * mm, 16 * mm, 22 * mm, 22 * mm, 24 * mm]
    # repeatRows=1 marks the first row as the header to repeat when the
    # table splits across pages.
    table = Table(data, colWidths=col_widths, repeatRows=1)
    table.setStyle(TableStyle([
        ("BACKGROUND", (0, 0), (-1, 0), colors.HexColor("#1a3c6e")),
        ("TEXTCOLOR", (0, 0), (-1, 0), colors.white),
        ("FONTSIZE", (0, 0), (-1, -1), 7),
        ("FONTNAME", (0, 0), (-1, 0), "Helvetica-Bold"),
        # Right-align the three amount columns (Debit, Credit, Balance).
        ("ALIGN", (3, 0), (5, -1), "RIGHT"),
        ("GRID", (0, 0), (-1, -1), 0.4, colors.grey),
        ("ROWBACKGROUNDS", (0, 1), (-1, -1), [colors.white, colors.HexColor("#eef3fa")]),
        ("VALIGN", (0, 0), (-1, -1), "MIDDLE"),
    ]))
    story.append(table)
    doc.build(story)
def render_scan(txns, path):
    """Draw the statement onto a white page image, then tilt it and sprinkle
    pixel noise so it resembles a scanned document; save the result as PNG.

    Args:
        txns: list of transaction dicts with the keys ``date`` (ISO),
            ``narration``, ``ref_no``, ``debit``, ``credit``, ``balance``.
        path: destination file path for the PNG.
    """
    from PIL import Image, ImageDraw, ImageFont

    width, height = 1240, 1754  # ~A4 at 150dpi
    page = Image.new("RGB", (width, height), "white")
    canvas = ImageDraw.Draw(page)

    # Best-effort fonts: fall back to Pillow's built-in font for all three
    # roles when the Arial files cannot be loaded.
    try:
        font = ImageFont.truetype("arial.ttf", 16)
        font_b = ImageFont.truetype("arialbd.ttf", 20)
        font_s = ImageFont.truetype("arial.ttf", 13)
    except Exception:
        font = font_b = font_s = ImageFont.load_default()

    # Statement header block.
    canvas.text((40, 30), "DEMO BANK — Account Statement (scanned)", fill="black", font=font_b)
    canvas.text((40, 64), "Account: XXXXXX4412 Period: 01/04/2026 - 30/04/2026",
                fill="black", font=font_s)
    canvas.text((40, 84), f"Opening Balance: {_fmt_amt(OPENING_BALANCE)}",
                fill="black", font=font_s)

    # Left edge (x, in pixels) of each of the six columns.
    col_xs = [40, 140, 660, 760, 920, 1080]
    row_y = 120
    for col_x, title in zip(col_xs, ("Date", "Narration", "Ref", "Debit", "Credit", "Balance")):
        canvas.text((col_x, row_y), title, fill="black", font=font)
    row_y += 26
    canvas.line((40, row_y, width - 40, row_y), fill="black", width=1)
    row_y += 6

    # One text row per transaction, 30px apart.
    for txn in txns:
        cells = (
            _printed_date(txn["date"]),
            txn["narration"][:70],  # cap narration at 70 characters
            txn["ref_no"] or "",
            _fmt_amt(txn["debit"]),
            _fmt_amt(txn["credit"]),
            _fmt_amt(txn["balance"]),
        )
        for col_x, text in zip(col_xs, cells):
            canvas.text((col_x, row_y), text, fill="black", font=font_s)
        row_y += 30

    # Rotate 1.5 deg and add mild noise to simulate a scan. The RNG is
    # seeded so the generated image is reproducible.
    page = page.rotate(1.5, expand=False, fillcolor="white")
    pixels = page.load()
    rng = random.Random(42)
    for _ in range(45000):
        px_x = rng.randint(0, width - 1)
        px_y = rng.randint(0, height - 1)
        delta = rng.randint(-25, 25)
        # Apply the same brightness shift to all channels, clamped to 0..255.
        pixels[px_x, px_y] = tuple(
            max(0, min(255, channel + delta)) for channel in pixels[px_x, px_y]
        )
    page.save(path, "PNG")
def main():
    """Build the transactions, write the PDF, the scan image and both
    ground-truth JSON files next to this script, then list what was written."""
    txns = build_transactions()

    def _beside_script(filename):
        # All outputs live in the same directory as this script.
        return os.path.join(HERE, filename)

    pdf_path = _beside_script("sample_digital.pdf")
    scan_path = _beside_script("sample_scan.png")
    truth_digital = _beside_script("sample_digital_truth.json")
    truth_scan = _beside_script("sample_scan_truth.json")

    render_pdf(txns, pdf_path)
    render_scan(txns, scan_path)

    # Both truth files carry the same transaction list.
    for truth_path in (truth_digital, truth_scan):
        with open(truth_path, "w", encoding="utf-8") as fh:
            json.dump(txns, fh, indent=2)

    print(f"Wrote {len(txns)} transactions")
    for written in (pdf_path, scan_path, truth_digital, truth_scan):
        print(f" {written}")
# Generate the samples only when executed as a script, not when imported.
if __name__ == "__main__":
    main()