"""StatementSetu -- Bank Statement -> Tally for CA firms. Gradio app (HF ZeroGPU Space). Pipeline: Upload -> extract -> reconcile -> categorize -> review (editable) -> export. Built for the Backyard AI track. Models are OpenBMB MiniCPM-first (MiniCPM-V for vision, MiniCPM3-4B for text categorization) -- 12B total, well under the 32B hackathon cap. The digital-PDF path uses no GPU at all. """ import os import tempfile import time import gradio as gr import pandas as pd from categorize import categorize from constants import CATEGORIES, CATEGORY_SET, SUSPENSE, voucher_type_for from excel_export import write_excel from extraction import extract from tally_export import write_tally_xml from validate import flags_text, reconcile, summary_stats HERE = os.path.dirname(os.path.abspath(__file__)) SAMPLES = os.path.join(HERE, "samples") CA_NAME = os.environ.get("CA_NAME", "Anil") # Backyard-track framing CA_CITY = os.environ.get("CA_CITY", "Kanpur") SAMPLE_FILES = { "Sample 1 — digital PDF (40 txns)": os.path.join(SAMPLES, "sample_digital.pdf"), "Sample 2 — scanned image (40 txns)": os.path.join(SAMPLES, "sample_scan.png"), } DISPLAY_COLS = ["Date", "Narration", "Debit", "Credit", "Balance", "Category", "Confidence", "Flags"] # --------------------------------------------------------------------------- # # Conversions between the txn-dict list and the display DataFrame # --------------------------------------------------------------------------- # def txns_to_df(transactions): rows = [] for t in transactions: rows.append([ t.get("date", ""), t.get("narration", ""), t.get("debit"), t.get("credit"), t.get("balance"), t.get("category", ""), round(float(t.get("confidence", 0) or 0), 2), flags_text(t), ]) return pd.DataFrame(rows, columns=DISPLAY_COLS) def apply_edits(transactions, df): """Merge user category edits (by row order) back into the txn list.""" if df is None: return transactions if hasattr(df, "values"): records = df.values.tolist() cols = list(df.columns) else: records = df cols = DISPLAY_COLS try: cat_idx = cols.index("Category") except ValueError: cat_idx = 5 for t, row in zip(transactions, records): cat = str(row[cat_idx]).strip() if cat_idx < len(row) else "" if cat not in CATEGORY_SET: cat = SUSPENSE t["category"] = cat t["voucher_type"] = voucher_type_for(t.get("debit"), t.get("credit"), cat) return transactions # --------------------------------------------------------------------------- # # Core processing # --------------------------------------------------------------------------- # def process_file(file_path, sample_label, progress=gr.Progress()): """Run the full pipeline on an uploaded file or a chosen bundled sample. An uploaded file takes priority; otherwise fall back to the sample dropdown. """ if not file_path: file_path = SAMPLE_FILES.get(sample_label) if not file_path: return (None, gr.update(), "Please upload a file or pick a sample first.", "", "") if not os.path.exists(file_path): return (None, gr.update(), f"❌ Sample file missing on the server: `{file_path}`. " "Make sure the `samples/` folder was uploaded to the Space.", "", "") t0 = time.time() progress(0.1, desc="Extracting transactions…") try: transactions, meta = extract(file_path) except Exception as e: msg = str(e) hint = "" if "scan" in msg.lower() or "vision" in msg.lower(): hint = ("\n\nThis file needs the MiniCPM-V vision path, which runs on " "the ZeroGPU Space (GPU). Try the digital-PDF sample here.") return (None, gr.update(), f"❌ Extraction failed: {msg}{hint}", "", "") if not transactions: return (None, gr.update(), "❌ No transactions found in this file.", "", "") progress(0.4, desc="Reconciling balances…") recon = reconcile(transactions) progress(0.6, desc="Categorizing…") transactions, cat_stats = categorize(transactions, use_model=True) # refresh flag text after categorization (reconcile set flags already) df = txns_to_df(transactions) stats = summary_stats(transactions) elapsed = time.time() - t0 path_label = ("text-layer extraction — no GPU used" if meta.get("path") == "text-layer" else f"MiniCPM-V vision extraction ({meta.get('engine')})") model_label = ("MiniCPM categorization" if cat_stats["model_used"] else "rules-only categorization (model not loaded)") status = (f"✅ Done in {elapsed:.1f}s · {path_label} · {model_label} · " f"{cat_stats['rule_hits']}/{cat_stats['total']} rule-classified, " f"{cat_stats['suspense']} in Suspense") chips = (f"### {stats['count']} transactions · " f"Debits ₹{stats['total_debit']:,.2f} · " f"Credits ₹{stats['total_credit']:,.2f} · " f"Net ₹{stats['net']:,.2f}") return transactions, df, status, recon["banner"], chips def make_excel(transactions, df): if not transactions: raise gr.Error("Process a statement first.") transactions = apply_edits(transactions, df) out = os.path.join(tempfile.gettempdir(), "statementsetu_export.xlsx") write_excel(transactions, out) return out def make_xml(transactions, df, bank_ledger): if not transactions: raise gr.Error("Process a statement first.") transactions = apply_edits(transactions, df) out = os.path.join(tempfile.gettempdir(), "statementsetu_tally.xml") write_tally_xml(transactions, out, bank_ledger=(bank_ledger or "Bank Account")) return out # --------------------------------------------------------------------------- # # UI # --------------------------------------------------------------------------- # def build_ui(): with gr.Blocks(title="StatementSetu") as demo: state = gr.State([]) gr.Markdown( f"# 🧾 StatementSetu — Bank Statement → Tally in minutes\n" f"**Built for {CA_NAME}, a CA in {CA_CITY} 🇮🇳** — turn a 2-hour " f"re-typing job into 5 minutes. Upload a statement, review the " f"auto-categorized & balance-reconciled table, download Tally XML + Excel." ) # STEP 1 gr.Markdown("### Step 1 — Upload a statement") with gr.Row(): with gr.Column(scale=2): file_in = gr.File(label="Bank statement (PDF / JPG / PNG)", file_types=[".pdf", ".png", ".jpg", ".jpeg"], type="filepath") with gr.Column(scale=1): sample_dd = gr.Dropdown( choices=list(SAMPLE_FILES.keys()), label="…or try a bundled sample", value=None) bank_ledger = gr.Textbox( label="Bank ledger name (for Tally)", value="Bank Account") process_btn = gr.Button("⚙️ Process statement", variant="primary") status = gr.Markdown("") # STEP 2 gr.Markdown("### Step 2 — Review & fix categories") banner = gr.Markdown("") chips = gr.Markdown("") table = gr.Dataframe( headers=DISPLAY_COLS, datatype=["str", "str", "number", "number", "number", "str", "number", "str"], interactive=True, wrap=True, label="Transactions (edit the Category column; off-list values → Suspense on export)", ) gr.Markdown( "**Valid categories:** " + " · ".join(f"`{c}`" for c in CATEGORIES), elem_classes="categories-help") # STEP 3 gr.Markdown("### Step 3 — Download") with gr.Row(): xlsx_btn = gr.Button("⬇️ Download Excel (.xlsx)", variant="primary") xml_btn = gr.Button("⬇️ Download Tally XML (beta)") xlsx_out = gr.File(label="Excel export") xml_out = gr.File(label="Tally XML export") with gr.Accordion("How to import into Tally", open=False): gr.Markdown( "1. In Tally: **Gateway of Tally → Import Data → Vouchers**.\n" "2. Select the downloaded `.xml` file and confirm the company.\n" "3. Verify ledger names exist (create `Bank Account` + category " "ledgers first, or map on import).\n\n" "*Tally XML sign conventions are fiddly — the XML is marked beta. " "The Excel export is the zero-risk path and works with Excel-to-Tally " "utilities too.*") # Wiring process_btn.click( process_file, inputs=[file_in, sample_dd], outputs=[state, table, status, banner, chips]) xlsx_btn.click(make_excel, inputs=[state, table], outputs=xlsx_out) xml_btn.click(make_xml, inputs=[state, table, bank_ledger], outputs=xml_out) return demo # Module-level `demo` so `gradio app.py` / HF Spaces can discover it. demo = build_ui() if __name__ == "__main__": demo.launch(theme=gr.themes.Soft())