| import gradio as gr |
| import requests |
| import json |
| import re |
| import os |
|
|
| |
# Hugging Face Hub repository that hosts the fine-tuned parser model.
MODEL_ID = "ratulsur/multi-format-finance-parser"

# Inference API endpoint for MODEL_ID.
API_URL = "https://api-inference.huggingface.co/models/" + MODEL_ID

# Access token taken from the environment; empty string when HF_TOKEN is unset.
HF_TOKEN = os.getenv("HF_TOKEN", "")
|
|
# System turn sent with every request: it fixes the output schema and tells the
# model to answer with a single JSON object and nothing else.
# The arrow in the last line was stored as a mis-decoded character; it is
# restored so the prompt reads "Unknown fields -> null" as intended.
SYSTEM_PROMPT = """You are a production financial document parser.
Given raw text from any financial document, output ONLY a single valid JSON object.
Schema: {document_type, vendor, client, date (YYYY-MM-DD), due_date, document_id,
currency, subtotal, tax_amount, tax_rate_pct, total_amount,
line_items:[{description,quantity,unit_price,amount}], payment_terms, notes, metadata}.
All monetary values must be floats. Unknown fields → null. No explanation."""
|
|
| |
def _repair_json_text(text: str) -> str:
    """Apply best-effort textual fixes to Python-flavoured near-JSON.

    Single quotes become double quotes, the whole words ``None``/``True``/
    ``False`` become their JSON spellings, and a comma directly before a
    closing brace or bracket is dropped. The rewrite is purely textual, so it
    can also touch characters inside string values; use it only after strict
    parsing has already failed.
    """
    text = text.replace("'", '"')
    for python_word, json_word in (
        ("None", "null"),
        ("True", "true"),
        ("False", "false"),
    ):
        # Word boundaries keep words such as "Nonexistent" intact.
        text = re.sub(rf"\b{python_word}\b", json_word, text)
    return re.sub(r",\s*([}\]])", r"\1", text)


def _loads_object(candidate: str):
    """Return *candidate* decoded as a JSON object, or None if it is not one."""
    try:
        parsed = json.loads(candidate)
    except json.JSONDecodeError:
        return None
    return parsed if isinstance(parsed, dict) else None


def call_api(text: str) -> dict:
    """Send document text to the hosted model and return the parsed JSON object.

    Args:
        text: Raw document text; it is placed in the user turn of the
            ``<|im_start|>``/``<|im_end|>`` chat prompt after SYSTEM_PROMPT.

    Returns:
        The JSON object decoded from the model output. When the API body or
        the model output cannot be interpreted, a dict with an ``"error"``
        message (plus the offending ``"raw"`` text when available) is
        returned instead.

    Raises:
        requests.exceptions.RequestException: On timeout (120 s), connection
            failure, or a non-success HTTP status (via ``raise_for_status``).
    """
    prompt = (
        f"<|im_start|>system\n{SYSTEM_PROMPT}<|im_end|>\n"
        f"<|im_start|>user\nParse this financial document:\n\n{text}<|im_end|>\n"
        f"<|im_start|>assistant\n"
    )
    headers = {"Content-Type": "application/json"}
    if HF_TOKEN:
        # An empty token would produce a malformed "Bearer " header, so the
        # Authorization header is only sent when a token is configured.
        headers["Authorization"] = f"Bearer {HF_TOKEN}"
    payload = {
        "inputs": prompt,
        "parameters": {
            "max_new_tokens": 512,
            "temperature": 0.05,
            "return_full_text": False,
            "do_sample": False,
        },
    }
    resp = requests.post(API_URL, headers=headers, json=payload, timeout=120)
    resp.raise_for_status()

    body = resp.json()
    if isinstance(body, dict) and body.get("error"):
        return {"error": str(body["error"])}
    try:
        raw = body[0]["generated_text"].strip()
    except (KeyError, IndexError, TypeError, AttributeError):
        return {
            "error": "Unexpected API response",
            "raw": json.dumps(body, ensure_ascii=False),
        }

    # Drop Markdown code fences the model may wrap around its answer.
    raw = re.sub(r"```json\s*|```\s*", "", raw).strip()

    # Try the least invasive interpretation first: the text as-is, then the
    # outermost {...} span (tolerates prose around valid JSON), and only then
    # the textually repaired span.
    match = re.search(r"\{.*\}", raw, re.DOTALL)
    extracted = match.group() if match else raw
    for candidate in (raw, extracted, _repair_json_text(extracted)):
        parsed = _loads_object(candidate)
        if parsed is not None:
            return parsed
    return {"error": "Could not parse model output", "raw": raw}
|
|
|
|
| |
def _format_amount(value) -> str:
    """Render a monetary value with thousands separators and two decimals.

    Numbers are formatted directly. Numeric strings such as "71,390.00" are
    converted after removing commas. Anything else is shown verbatim rather
    than raising, so one odd field cannot discard the whole result.
    """
    if isinstance(value, bool):
        return str(value)
    if isinstance(value, (int, float)):
        return f"{value:,.2f}"
    try:
        return f"{float(str(value).replace(',', '')):,.2f}"
    except ValueError:
        return str(value)


def _build_summary(result: dict) -> str:
    """Build the Markdown summary shown next to the JSON output."""
    lines = []
    for label, key in (
        ("Type", "document_type"),
        ("Vendor", "vendor"),
        ("Client", "client"),
        ("Date", "date"),
        ("Due date", "due_date"),
        ("Document ID", "document_id"),
    ):
        if result.get(key):
            lines.append(f"**{label}:** {result[key]}")

    # A null currency must not be printed as "None"; amounts are still shown.
    currency = result.get("currency") or ""
    for label, key in (("Total", "total_amount"), ("Tax", "tax_amount")):
        if result.get(key) is not None:
            amount = f"{currency} {_format_amount(result[key])}".strip()
            lines.append(f"**{label}:** {amount}")

    items = result.get("line_items")
    if isinstance(items, (list, tuple)) and items:
        lines.append(f"**Line items:** {len(items)}")
    if result.get("payment_terms"):
        lines.append(f"**Payment terms:** {result['payment_terms']}")
    return "\n\n".join(lines)


def process(text_input: str, doc_hint: str):
    """Parse pasted document text and return ``(summary_markdown, json_text)``.

    Args:
        text_input: Document text from the UI; empty, blank or None input is
            rejected with a warning message.
        doc_hint: Optional document type. Any value other than "Auto-detect"
            is prepended to the text as a ``[Document type: ...]`` line.

    Returns:
        A 2-tuple of strings: a Markdown summary (or a warning/error message)
        and the pretty-printed JSON result (empty when the request failed).
        Exceptions are reported in the first element instead of propagating.
    """
    if not text_input or not text_input.strip():
        return "⚠️ Please paste some document text.", ""

    text = text_input.strip()
    if doc_hint and doc_hint != "Auto-detect":
        text = f"[Document type: {doc_hint}]\n\n{text}"

    try:
        result = call_api(text)
        pretty = json.dumps(result, indent=2, ensure_ascii=False)
        if not isinstance(result, dict):
            return "❌ Error: Model output is not a JSON object", pretty
        if result.get("error"):
            return f"❌ Error: {result['error']}", pretty
        return _build_summary(result), pretty
    except requests.exceptions.Timeout:
        return "⚠️ Model is loading, please wait 20 seconds and try again.", ""
    except requests.exceptions.HTTPError as e:
        # The Inference API answers 503 while the model is still being loaded.
        if e.response is not None and e.response.status_code == 503:
            return "⚠️ Model is loading, please wait 20 seconds and try again.", ""
        return f"❌ API Error: {e}", ""
    except Exception as e:
        # UI callback boundary: report the failure instead of crashing the app.
        return f"❌ Error: {e}", ""
|
|
|
|
| |
# Sample documents for the "Try an example" section. Each row is
# [document text, document-type hint], matching the (text, hint) inputs.
EXAMPLES = [
    [
        "\n".join((
            "INVOICE",
            "Vendor: Tata Consultancy Services Ltd.",
            "Invoice No: TCS-2024-8821",
            "Date: 2024-11-15",
            "Due Date: 2024-12-15",
            "Bill To: Reliance Industries Ltd.",
            "",
            "Service: Cloud Infrastructure Management (Oct 2024) INR 42,500.00",
            "Service: SAP Integration Support INR 18,000.00",
            "GST @ 18%: INR 10,890.00",
            "TOTAL DUE: INR 71,390.00",
            "Payment Terms: Net 30",
        )),
        "Invoice",
    ],
    [
        "\n".join((
            "SAP FI - VENDOR PAYMENT REPORT",
            "Company Code: 1000 | Fiscal Year: 2024",
            "Run Date: 2024-09-30",
            "",
            "|DocNo |Vendor |Amount |Curr|Status |",
            "|----------|--------------------|--------------|----|--------|",
            "|1900045621|Wipro Limited | 4,25,000.00 |INR |Open |",
            "|1900045622|HCL Technologies | 2,10,500.00 |INR |Cleared |",
            "|1900045623|Infosys BPO | 8,75,200.00 |INR |Open |",
            "",
            "Total: 15,10,700.00 INR",
        )),
        "SAP Report",
    ],
    [
        "\n".join((
            "INCOME STATEMENT",
            "Reliance Industries Ltd.",
            "Period ending: 2024-09-30",
            "(in INR)",
            "",
            "Revenue: 50,000,000.00",
            "Cost of Revenue: (22,000,000.00)",
            "Gross Profit: 28,000,000.00",
            "Operating Expenses: (12,000,000.00)",
            "EBIT: 16,000,000.00",
            "Income Tax 25%: (4,000,000.00)",
            "Net Income: 12,000,000.00",
        )),
        "Income Statement",
    ],
    [
        "\n".join((
            "PURCHASE ORDER",
            "PO Number: PO-2024-00456",
            "Date: 2024-10-01",
            "Vendor: Amazon Web Services India",
            "Ship To: HDFC Bank Ltd., Mumbai",
            "",
            "Item 1: EC2 Reserved Instances (1yr) USD 12,000.00",
            "Item 2: S3 Storage 50TB USD 1,800.00",
            "Item 3: RDS Multi-AZ USD 4,200.00",
            "Subtotal: USD 18,000.00",
            "Tax: USD 0.00",
            "Total: USD 18,000.00",
            "Payment Terms: Net 45",
        )),
        "Purchase Order",
    ],
]
|
|
|
|
| |
# Gradio UI: document text and type hint on the left, Markdown summary and
# JSON output on the right, clickable examples and a model footer below.
with gr.Blocks(
    title="Multi-Format Finance Parser",
    theme=gr.themes.Soft(),
    css=".json-output { font-family: monospace; font-size: 13px; }",
) as demo:

    gr.Markdown("""
    # 🏦 Multi-Format Finance Document Parser
    **Production-grade** financial document extraction → structured JSON.

    Supports: **Invoice · SAP Report · Income Statement · Bank Statement · Purchase Order · SQL results**

    *Fine-tuned Qwen2.5-7B-Instruct · QLoRA 4-bit NF4 · Trained on CORD-v2 + synthetic finance data*
    """)

    with gr.Row():
        with gr.Column(scale=1):
            text_in = gr.Textbox(
                label="Paste document text",
                lines=16,
                placeholder="Paste your invoice, SAP export, income statement, or any financial document here...",
            )
            hint_in = gr.Dropdown(
                choices=[
                    "Auto-detect",
                    "Invoice",
                    "SAP Report",
                    "Balance Sheet",
                    "Income Statement",
                    "Bank Statement",
                    "Purchase Order",
                    "SQL Result",
                ],
                value="Auto-detect",
                label="Document type hint (optional)",
            )
            parse_btn = gr.Button("Parse Document", variant="primary", size="lg")

        with gr.Column(scale=1):
            summary_out = gr.Markdown(label="Summary")
            json_out = gr.Code(
                label="Structured JSON output",
                language="json",
                lines=18,
                # Attach the class targeted by the `css` rule above; without
                # it the ".json-output" styling is never applied.
                elem_classes="json-output",
            )

    gr.Markdown("### Try an example")
    gr.Examples(
        examples=EXAMPLES,
        inputs=[text_in, hint_in],
        label="Click any example to load it",
    )

    gr.Markdown("""
    ---
    **Model:** [ratulsur/multi-format-finance-parser](https://huggingface.co/ratulsur/multi-format-finance-parser)
    **Training:** QLoRA (4-bit NF4 double quantization) on Qwen2.5-7B-Instruct
    **Dataset:** CORD-v2 receipts + synthetic invoices, SAP reports, income statements
    """)

    # Run the parser on click and fill both output panes.
    parse_btn.click(
        fn=process,
        inputs=[text_in, hint_in],
        outputs=[summary_out, json_out],
    )


# Start the web server only when the file is executed as a script.
if __name__ == "__main__":
    demo.launch()
|
|