"""Gradio demo for the multi-format finance document parser.

Sends pasted document text to a hosted text-generation model, parses the
model's JSON answer and renders a short Markdown summary next to the raw JSON.
"""

import ast
import json
import os
import re

import gradio as gr
import requests

# ── Config ─────────────────────────────────────────────────────
MODEL_ID = "ratulsur/multi-format-finance-parser"
API_URL = f"https://api-inference.huggingface.co/models/{MODEL_ID}"
HF_TOKEN = os.environ.get("HF_TOKEN", "")

# Seconds to wait for the inference endpoint before giving up.
REQUEST_TIMEOUT_S = 120

# NOTE(review): the line breaks inside this prompt were reconstructed; it
# presumably has to match the system prompt used at fine-tuning time — confirm.
SYSTEM_PROMPT = """You are a production financial document parser.
Given raw text from any financial document, output ONLY a single valid JSON object.
Schema: {document_type, vendor, client, date (YYYY-MM-DD), due_date, document_id, currency, subtotal, tax_amount, tax_rate_pct, total_amount, line_items:[{description,quantity,unit_price,amount}], payment_terms, notes, metadata}.
All monetary values must be floats. Unknown fields → null. No explanation."""

LOADING_MESSAGE = "⚠️ Model is loading, please wait 20 seconds and try again."

# ── Output parsing ─────────────────────────────────────────────
_CODE_FENCE_RE = re.compile(r"```json\s*|```\s*")
_JSON_OBJECT_RE = re.compile(r"\{.*\}", re.DOTALL)
_TRAILING_COMMA_RE = re.compile(r",\s*([}\]])")
_PYTHON_LITERALS = (
    (re.compile(r"\bNone\b"), "null"),
    (re.compile(r"\bTrue\b"), "true"),
    (re.compile(r"\bFalse\b"), "false"),
)


def _load_json_object(text: str):
    """Return *text* parsed as a JSON object, or None if it is not one."""
    try:
        value = json.loads(text)
    except (json.JSONDecodeError, RecursionError):
        return None
    # Callers use dict methods on the result, so lists/scalars are rejected.
    return value if isinstance(value, dict) else None


def _load_without_trailing_commas(text: str):
    """Parse *text* as JSON after dropping commas that precede `}` or `]`."""
    return _load_json_object(_TRAILING_COMMA_RE.sub(r"\1", text))


def _load_python_dict(text: str):
    """Parse *text* as a Python dict literal and normalise it to JSON types."""
    try:
        value = ast.literal_eval(text)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return None
    if not isinstance(value, dict):
        return None
    try:
        # Round-trip so the result only contains JSON-serialisable values.
        return json.loads(json.dumps(value))
    except (TypeError, ValueError):
        return None


def _load_with_legacy_repair(text: str):
    """Last-resort parse: blunt quote/literal/trailing-comma substitution."""
    repaired = text.replace("'", '"')
    for pattern, replacement in _PYTHON_LITERALS:
        repaired = pattern.sub(replacement, repaired)
    return _load_json_object(_TRAILING_COMMA_RE.sub(r"\1", repaired))


def _parse_model_output(raw: str) -> dict:
    """Turn the model's generated text into a dict.

    Strategies run from least to most destructive so that already-valid JSON
    is never altered by the repair heuristics. Each strategy is tried on the
    whole text and on the span from the first `{` to the last `}`.

    Returns:
        The parsed object, or `{"error": ..., "raw": ...}` when nothing works.
    """
    cleaned = _CODE_FENCE_RE.sub("", raw).strip()
    candidates = [cleaned]
    match = _JSON_OBJECT_RE.search(cleaned)
    if match and match.group() != cleaned:
        candidates.append(match.group())

    strategies = (
        _load_json_object,
        _load_without_trailing_commas,
        _load_python_dict,
        _load_with_legacy_repair,
    )
    for strategy in strategies:
        for candidate in candidates:
            parsed = strategy(candidate)
            if parsed is not None:
                return parsed
    return {"error": "Could not parse model output", "raw": cleaned}


def _extract_generated_text(body):
    """Return the `generated_text` string from an API response, or None."""
    if isinstance(body, list) and body:
        body = body[0]
    if isinstance(body, dict):
        text = body.get("generated_text")
        if isinstance(text, str):
            return text
    return None


# ── Inference ──────────────────────────────────────────────────
def call_api(text: str) -> dict:
    """Send *text* to the hosted model and return the parsed result.

    Args:
        text: Document text, optionally prefixed with a document-type hint.

    Returns:
        The parsed JSON object, or a dict with an `"error"` key (and `"raw"`
        payload) when the response cannot be used.

    Raises:
        requests.exceptions.RequestException: On timeout, connection failure
            or a non-2xx status.
    """
    prompt = (
        f"<|im_start|>system\n{SYSTEM_PROMPT}<|im_end|>\n"
        f"<|im_start|>user\nParse this financial document:\n\n{text}<|im_end|>\n"
        f"<|im_start|>assistant\n"
    )
    headers = {"Content-Type": "application/json"}
    if HF_TOKEN:
        # An empty bearer token is rejected, so only send the header when set.
        headers["Authorization"] = f"Bearer {HF_TOKEN}"
    payload = {
        "inputs": prompt,
        "parameters": {
            "max_new_tokens": 512,
            # NOTE(review): temperature likely has no effect while
            # do_sample is False — confirm before relying on it.
            "temperature": 0.05,
            "return_full_text": False,
            "do_sample": False,
        },
    }
    resp = requests.post(
        API_URL, headers=headers, json=payload, timeout=REQUEST_TIMEOUT_S
    )
    resp.raise_for_status()

    try:
        body = resp.json()
    except ValueError:
        return {"error": "API returned a non-JSON response", "raw": resp.text}

    generated = _extract_generated_text(body)
    if generated is None:
        if isinstance(body, dict) and body.get("error"):
            return {"error": str(body["error"]), "raw": body}
        return {"error": "Unexpected API response format", "raw": body}
    return _parse_model_output(generated)


# ── Summary helpers ────────────────────────────────────────────
_SUMMARY_FIELDS = (
    ("document_type", "Type"),
    ("vendor", "Vendor"),
    ("client", "Client"),
    ("date", "Date"),
    ("due_date", "Due date"),
    ("document_id", "Document ID"),
)


def _format_amount(value) -> str:
    """Format a monetary value as `1,234.56`, tolerating string amounts."""
    if isinstance(value, (int, float)) and not isinstance(value, bool):
        return f"{value:,.2f}"
    if isinstance(value, str):
        try:
            return f"{float(value.replace(',', '')):,.2f}"
        except ValueError:
            return value
    return str(value)


def _money_line(label: str, currency, amount) -> str:
    """Build a `**Label:** CUR 1,234.56` line, omitting a missing currency."""
    parts = [f"**{label}:**"]
    if currency:
        parts.append(str(currency))
    parts.append(_format_amount(amount))
    return " ".join(parts)


def _build_summary(result: dict) -> str:
    """Render the headline fields of *result* as Markdown paragraphs."""
    lines = [
        f"**{label}:** {result[key]}"
        for key, label in _SUMMARY_FIELDS
        if result.get(key)
    ]
    currency = result.get("currency")
    if result.get("total_amount") is not None:
        lines.append(_money_line("Total", currency, result["total_amount"]))
    if result.get("tax_amount") is not None:
        lines.append(_money_line("Tax", currency, result["tax_amount"]))
    line_items = result.get("line_items")
    if isinstance(line_items, list) and line_items:
        lines.append(f"**Line items:** {len(line_items)}")
    if result.get("payment_terms"):
        lines.append(f"**Payment terms:** {result['payment_terms']}")
    return "\n\n".join(lines)


# ── Main processing function ───────────────────────────────────
def process(text_input: str, doc_hint: str):
    """Parse pasted document text and return `(summary_markdown, json_text)`.

    Args:
        text_input: Raw document text from the textbox.
        doc_hint: Selected document type; `"Auto-detect"` or empty adds no hint.

    Returns:
        A pair of strings for the summary and JSON panes. Failures are
        reported in the summary string instead of being raised.
    """
    if not text_input or not text_input.strip():
        return "⚠️ Please paste some document text.", ""

    text = text_input.strip()
    if doc_hint and doc_hint != "Auto-detect":
        text = f"[Document type: {doc_hint}]\n\n{text}"

    try:
        result = call_api(text)
        if result.get("error"):
            return (
                f"❌ Error: {result['error']}",
                json.dumps(result, indent=2, ensure_ascii=False),
            )
        return (
            _build_summary(result),
            json.dumps(result, indent=2, ensure_ascii=False),
        )
    except requests.exceptions.Timeout:
        return LOADING_MESSAGE, ""
    except requests.exceptions.HTTPError as e:
        status = e.response.status_code if e.response is not None else None
        if status == 503:
            # The hosted endpoint answers 503 while the model is starting up.
            return LOADING_MESSAGE, ""
        return f"❌ API Error: {e}", ""
    except Exception as e:  # UI boundary: show the failure instead of crashing.
        return f"❌ Error: {e}", ""


# ── Examples ───────────────────────────────────────────────────
EXAMPLES = [
    [
        """INVOICE
Vendor: Tata Consultancy Services Ltd.
Invoice No: TCS-2024-8821
Date: 2024-11-15
Due Date: 2024-12-15
Bill To: Reliance Industries Ltd.

Service: Cloud Infrastructure Management (Oct 2024) INR 42,500.00
Service: SAP Integration Support INR 18,000.00
GST @ 18%: INR 10,890.00
TOTAL DUE: INR 71,390.00
Payment Terms: Net 30""",
        "Invoice",
    ],
    [
        """SAP FI - VENDOR PAYMENT REPORT
Company Code: 1000 | Fiscal Year: 2024
Run Date: 2024-09-30

|DocNo     |Vendor              |Amount        |Curr|Status  |
|----------|--------------------|--------------|----|--------|
|1900045621|Wipro Limited       |  4,25,000.00 |INR |Open    |
|1900045622|HCL Technologies    |  2,10,500.00 |INR |Cleared |
|1900045623|Infosys BPO         |  8,75,200.00 |INR |Open    |

Total: 15,10,700.00 INR""",
        "SAP Report",
    ],
    [
        """INCOME STATEMENT
Reliance Industries Ltd.
Period ending: 2024-09-30 (in INR)

Revenue: 50,000,000.00
Cost of Revenue: (22,000,000.00)
Gross Profit: 28,000,000.00
Operating Expenses: (12,000,000.00)
EBIT: 16,000,000.00
Income Tax 25%: (4,000,000.00)
Net Income: 12,000,000.00""",
        "Income Statement",
    ],
    [
        """PURCHASE ORDER
PO Number: PO-2024-00456
Date: 2024-10-01
Vendor: Amazon Web Services India
Ship To: HDFC Bank Ltd., Mumbai

Item 1: EC2 Reserved Instances (1yr) USD 12,000.00
Item 2: S3 Storage 50TB USD 1,800.00
Item 3: RDS Multi-AZ USD 4,200.00
Subtotal: USD 18,000.00
Tax: USD 0.00
Total: USD 18,000.00
Payment Terms: Net 45""",
        "Purchase Order",
    ],
]

# ── UI text ────────────────────────────────────────────────────
# Kept at column 0 so Markdown never treats the text as an indented code block.
INTRO_MD = """
# 🏦 Multi-Format Finance Document Parser

**Production-grade** financial document extraction → structured JSON.

Supports: **Invoice · SAP Report · Income Statement · Bank Statement · Purchase Order · SQL results**

*Fine-tuned Qwen2.5-7B-Instruct · QLoRA 4-bit NF4 · Trained on CORD-v2 + synthetic finance data*
"""

FOOTER_MD = """
---

**Model:** [ratulsur/multi-format-finance-parser](https://huggingface.co/ratulsur/multi-format-finance-parser)

**Training:** QLoRA (4-bit NF4 double quantization) on Qwen2.5-7B-Instruct

**Dataset:** CORD-v2 receipts + synthetic invoices, SAP reports, income statements
"""

# ── UI ─────────────────────────────────────────────────────────
with gr.Blocks(
    title="Multi-Format Finance Parser",
    theme=gr.themes.Soft(),
    css=".json-output { font-family: monospace; font-size: 13px; }",
) as demo:
    gr.Markdown(INTRO_MD)

    with gr.Row():
        with gr.Column(scale=1):
            text_in = gr.Textbox(
                label="Paste document text",
                lines=16,
                placeholder="Paste your invoice, SAP export, income statement, or any financial document here...",
            )
            hint_in = gr.Dropdown(
                choices=[
                    "Auto-detect",
                    "Invoice",
                    "SAP Report",
                    "Balance Sheet",
                    "Income Statement",
                    "Bank Statement",
                    "Purchase Order",
                    "SQL Result",
                ],
                value="Auto-detect",
                label="Document type hint (optional)",
            )
            parse_btn = gr.Button("Parse Document", variant="primary", size="lg")

        with gr.Column(scale=1):
            summary_out = gr.Markdown(label="Summary")
            json_out = gr.Code(
                label="Structured JSON output",
                language="json",
                lines=18,
                # Attach the class the `css=` rule above targets.
                elem_classes=["json-output"],
            )

    gr.Markdown("### Try an example")
    gr.Examples(
        examples=EXAMPLES,
        inputs=[text_in, hint_in],
        label="Click any example to load it",
    )

    gr.Markdown(FOOTER_MD)

    parse_btn.click(
        fn=process,
        inputs=[text_in, hint_in],
        outputs=[summary_out, json_out],
    )

if __name__ == "__main__":
    demo.launch()