Spaces:
Sleeping
Sleeping
| """StatementSetu -- Bank Statement -> Tally for CA firms. | |
| Gradio app (HF ZeroGPU Space). Pipeline: | |
| Upload -> extract -> reconcile -> categorize -> review (editable) -> export. | |
| Built for the Backyard AI track. Models are OpenBMB MiniCPM-first | |
| (MiniCPM-V for vision, MiniCPM3-4B for text categorization) -- 12B total, well | |
| under the 32B hackathon cap. The digital-PDF path uses no GPU at all. | |
| """ | |
| import os | |
| import tempfile | |
| import time | |
| import gradio as gr | |
| import pandas as pd | |
| from categorize import categorize | |
| from constants import CATEGORIES, CATEGORY_SET, SUSPENSE, voucher_type_for | |
| from excel_export import write_excel | |
| from extraction import extract | |
| from tally_export import write_tally_xml | |
| from validate import flags_text, reconcile, summary_stats | |
| HERE = os.path.dirname(os.path.abspath(__file__)) | |
| SAMPLES = os.path.join(HERE, "samples") | |
| CA_NAME = os.environ.get("CA_NAME", "Anil") # Backyard-track framing | |
| CA_CITY = os.environ.get("CA_CITY", "Kanpur") | |
| SAMPLE_FILES = { | |
| "Sample 1 — digital PDF (40 txns)": os.path.join(SAMPLES, "sample_digital.pdf"), | |
| "Sample 2 — scanned image (40 txns)": os.path.join(SAMPLES, "sample_scan.png"), | |
| } | |
| DISPLAY_COLS = ["Date", "Narration", "Debit", "Credit", "Balance", | |
| "Category", "Confidence", "Flags"] | |
| # --------------------------------------------------------------------------- # | |
| # Conversions between the txn-dict list and the display DataFrame | |
| # --------------------------------------------------------------------------- # | |
| def txns_to_df(transactions): | |
| rows = [] | |
| for t in transactions: | |
| rows.append([ | |
| t.get("date", ""), | |
| t.get("narration", ""), | |
| t.get("debit"), | |
| t.get("credit"), | |
| t.get("balance"), | |
| t.get("category", ""), | |
| round(float(t.get("confidence", 0) or 0), 2), | |
| flags_text(t), | |
| ]) | |
| return pd.DataFrame(rows, columns=DISPLAY_COLS) | |
| def apply_edits(transactions, df): | |
| """Merge user category edits (by row order) back into the txn list.""" | |
| if df is None: | |
| return transactions | |
| if hasattr(df, "values"): | |
| records = df.values.tolist() | |
| cols = list(df.columns) | |
| else: | |
| records = df | |
| cols = DISPLAY_COLS | |
| try: | |
| cat_idx = cols.index("Category") | |
| except ValueError: | |
| cat_idx = 5 | |
| for t, row in zip(transactions, records): | |
| cat = str(row[cat_idx]).strip() if cat_idx < len(row) else "" | |
| if cat not in CATEGORY_SET: | |
| cat = SUSPENSE | |
| t["category"] = cat | |
| t["voucher_type"] = voucher_type_for(t.get("debit"), t.get("credit"), cat) | |
| return transactions | |
| # --------------------------------------------------------------------------- # | |
| # Core processing | |
| # --------------------------------------------------------------------------- # | |
| def process_file(file_path, sample_label, progress=gr.Progress()): | |
| """Run the full pipeline on an uploaded file or a chosen bundled sample. | |
| An uploaded file takes priority; otherwise fall back to the sample dropdown. | |
| """ | |
| if not file_path: | |
| file_path = SAMPLE_FILES.get(sample_label) | |
| if not file_path: | |
| return (None, gr.update(), "Please upload a file or pick a sample first.", | |
| "", "") | |
| if not os.path.exists(file_path): | |
| return (None, gr.update(), | |
| f"❌ Sample file missing on the server: `{file_path}`. " | |
| "Make sure the `samples/` folder was uploaded to the Space.", | |
| "", "") | |
| t0 = time.time() | |
| progress(0.1, desc="Extracting transactions…") | |
| try: | |
| transactions, meta = extract(file_path) | |
| except Exception as e: | |
| msg = str(e) | |
| hint = "" | |
| if "scan" in msg.lower() or "vision" in msg.lower(): | |
| hint = ("\n\nThis file needs the MiniCPM-V vision path, which runs on " | |
| "the ZeroGPU Space (GPU). Try the digital-PDF sample here.") | |
| return (None, gr.update(), f"❌ Extraction failed: {msg}{hint}", "", "") | |
| if not transactions: | |
| return (None, gr.update(), "❌ No transactions found in this file.", "", "") | |
| progress(0.4, desc="Reconciling balances…") | |
| recon = reconcile(transactions) | |
| progress(0.6, desc="Categorizing…") | |
| transactions, cat_stats = categorize(transactions, use_model=True) | |
| # refresh flag text after categorization (reconcile set flags already) | |
| df = txns_to_df(transactions) | |
| stats = summary_stats(transactions) | |
| elapsed = time.time() - t0 | |
| path_label = ("text-layer extraction — no GPU used" | |
| if meta.get("path") == "text-layer" | |
| else f"MiniCPM-V vision extraction ({meta.get('engine')})") | |
| model_label = ("MiniCPM categorization" if cat_stats["model_used"] | |
| else "rules-only categorization (model not loaded)") | |
| status = (f"✅ Done in {elapsed:.1f}s · {path_label} · {model_label} · " | |
| f"{cat_stats['rule_hits']}/{cat_stats['total']} rule-classified, " | |
| f"{cat_stats['suspense']} in Suspense") | |
| chips = (f"### {stats['count']} transactions · " | |
| f"Debits ₹{stats['total_debit']:,.2f} · " | |
| f"Credits ₹{stats['total_credit']:,.2f} · " | |
| f"Net ₹{stats['net']:,.2f}") | |
| return transactions, df, status, recon["banner"], chips | |
| def make_excel(transactions, df): | |
| if not transactions: | |
| raise gr.Error("Process a statement first.") | |
| transactions = apply_edits(transactions, df) | |
| out = os.path.join(tempfile.gettempdir(), "statementsetu_export.xlsx") | |
| write_excel(transactions, out) | |
| return out | |
| def make_xml(transactions, df, bank_ledger): | |
| if not transactions: | |
| raise gr.Error("Process a statement first.") | |
| transactions = apply_edits(transactions, df) | |
| out = os.path.join(tempfile.gettempdir(), "statementsetu_tally.xml") | |
| write_tally_xml(transactions, out, bank_ledger=(bank_ledger or "Bank Account")) | |
| return out | |
| # --------------------------------------------------------------------------- # | |
| # UI | |
| # --------------------------------------------------------------------------- # | |
| def build_ui(): | |
| with gr.Blocks(title="StatementSetu") as demo: | |
| state = gr.State([]) | |
| gr.Markdown( | |
| f"# 🧾 StatementSetu — Bank Statement → Tally in minutes\n" | |
| f"**Built for {CA_NAME}, a CA in {CA_CITY} 🇮🇳** — turn a 2-hour " | |
| f"re-typing job into 5 minutes. Upload a statement, review the " | |
| f"auto-categorized & balance-reconciled table, download Tally XML + Excel." | |
| ) | |
| # STEP 1 | |
| gr.Markdown("### Step 1 — Upload a statement") | |
| with gr.Row(): | |
| with gr.Column(scale=2): | |
| file_in = gr.File(label="Bank statement (PDF / JPG / PNG)", | |
| file_types=[".pdf", ".png", ".jpg", ".jpeg"], | |
| type="filepath") | |
| with gr.Column(scale=1): | |
| sample_dd = gr.Dropdown( | |
| choices=list(SAMPLE_FILES.keys()), | |
| label="…or try a bundled sample", value=None) | |
| bank_ledger = gr.Textbox( | |
| label="Bank ledger name (for Tally)", value="Bank Account") | |
| process_btn = gr.Button("⚙️ Process statement", variant="primary") | |
| status = gr.Markdown("") | |
| # STEP 2 | |
| gr.Markdown("### Step 2 — Review & fix categories") | |
| banner = gr.Markdown("") | |
| chips = gr.Markdown("") | |
| table = gr.Dataframe( | |
| headers=DISPLAY_COLS, | |
| datatype=["str", "str", "number", "number", "number", | |
| "str", "number", "str"], | |
| interactive=True, | |
| wrap=True, | |
| label="Transactions (edit the Category column; off-list values → Suspense on export)", | |
| ) | |
| gr.Markdown( | |
| "**Valid categories:** " + " · ".join(f"`{c}`" for c in CATEGORIES), | |
| elem_classes="categories-help") | |
| # STEP 3 | |
| gr.Markdown("### Step 3 — Download") | |
| with gr.Row(): | |
| xlsx_btn = gr.Button("⬇️ Download Excel (.xlsx)", variant="primary") | |
| xml_btn = gr.Button("⬇️ Download Tally XML (beta)") | |
| xlsx_out = gr.File(label="Excel export") | |
| xml_out = gr.File(label="Tally XML export") | |
| with gr.Accordion("How to import into Tally", open=False): | |
| gr.Markdown( | |
| "1. In Tally: **Gateway of Tally → Import Data → Vouchers**.\n" | |
| "2. Select the downloaded `.xml` file and confirm the company.\n" | |
| "3. Verify ledger names exist (create `Bank Account` + category " | |
| "ledgers first, or map on import).\n\n" | |
| "*Tally XML sign conventions are fiddly — the XML is marked beta. " | |
| "The Excel export is the zero-risk path and works with Excel-to-Tally " | |
| "utilities too.*") | |
| # Wiring | |
| process_btn.click( | |
| process_file, inputs=[file_in, sample_dd], | |
| outputs=[state, table, status, banner, chips]) | |
| xlsx_btn.click(make_excel, inputs=[state, table], outputs=xlsx_out) | |
| xml_btn.click(make_xml, inputs=[state, table, bank_ledger], outputs=xml_out) | |
| return demo | |
| # Module-level `demo` so `gradio app.py` / HF Spaces can discover it. | |
| demo = build_ui() | |
| if __name__ == "__main__": | |
| demo.launch(theme=gr.themes.Soft()) | |