# app.py — uploaded to Hugging Face by ratulsur (commit 8789d31, verified).
# File size at upload: 9.04 kB.
import gradio as gr
import requests
import json
import re
import os
# ── Config ─────────────────────────────────────────────────────
MODEL_ID = "ratulsur/multi-format-finance-parser"
API_URL = f"https://api-inference.huggingface.co/models/{MODEL_ID}"
HF_TOKEN = os.environ.get("HF_TOKEN", "")
SYSTEM_PROMPT = """You are a production financial document parser.
Given raw text from any financial document, output ONLY a single valid JSON object.
Schema: {document_type, vendor, client, date (YYYY-MM-DD), due_date, document_id,
currency, subtotal, tax_amount, tax_rate_pct, total_amount,
line_items:[{description,quantity,unit_price,amount}], payment_terms, notes, metadata}.
All monetary values must be floats. Unknown fields β†’ null. No explanation."""
# ── Inference ──────────────────────────────────────────────────
def call_api(text: str) -> dict:
prompt = (
f"<|im_start|>system\n{SYSTEM_PROMPT}<|im_end|>\n"
f"<|im_start|>user\nParse this financial document:\n\n{text}<|im_end|>\n"
f"<|im_start|>assistant\n"
)
headers = {
"Authorization": f"Bearer {HF_TOKEN}",
"Content-Type": "application/json",
}
payload = {
"inputs": prompt,
"parameters": {
"max_new_tokens": 512,
"temperature": 0.05,
"return_full_text": False,
"do_sample": False,
},
}
resp = requests.post(API_URL, headers=headers, json=payload, timeout=120)
resp.raise_for_status()
raw = resp.json()[0]["generated_text"].strip()
raw = re.sub(r"```json\s*|```\s*", "", raw).strip()
# JSON repair heuristics
try:
return json.loads(raw)
except json.JSONDecodeError:
raw = (raw
.replace("'", '"')
.replace("None", "null")
.replace("True", "true")
.replace("False", "false")
.replace(",\n}", "\n}")
.replace(",\n]", "\n]"))
match = re.search(r"\{.*\}", raw, re.DOTALL)
try:
return json.loads(match.group() if match else raw)
except Exception:
return {"error": "Could not parse model output", "raw": raw}
# ── Main processing function ───────────────────────────────────
def process(text_input: str, doc_hint: str):
if not text_input.strip():
return "⚠️ Please paste some document text.", ""
try:
text = text_input.strip()
if doc_hint and doc_hint != "Auto-detect":
text = f"[Document type: {doc_hint}]\n\n{text}"
result = call_api(text)
# Build summary
summary = []
if result.get("error"):
return f"❌ Error: {result['error']}", json.dumps(result, indent=2)
if result.get("document_type"):
summary.append(f"**Type:** {result['document_type']}")
if result.get("vendor"):
summary.append(f"**Vendor:** {result['vendor']}")
if result.get("client"):
summary.append(f"**Client:** {result['client']}")
if result.get("date"):
summary.append(f"**Date:** {result['date']}")
if result.get("due_date"):
summary.append(f"**Due date:** {result['due_date']}")
if result.get("document_id"):
summary.append(f"**Document ID:** {result['document_id']}")
if result.get("currency") and result.get("total_amount") is not None:
summary.append(f"**Total:** {result['currency']} {result['total_amount']:,.2f}")
if result.get("tax_amount") is not None:
summary.append(f"**Tax:** {result.get('currency','')} {result['tax_amount']:,.2f}")
if result.get("line_items"):
summary.append(f"**Line items:** {len(result['line_items'])}")
if result.get("payment_terms"):
summary.append(f"**Payment terms:** {result['payment_terms']}")
return "\n\n".join(summary), json.dumps(result, indent=2, ensure_ascii=False)
except requests.exceptions.Timeout:
return "⚠️ Model is loading, please wait 20 seconds and try again.", ""
except requests.exceptions.HTTPError as e:
return f"❌ API Error: {e}", ""
except Exception as e:
return f"❌ Error: {e}", ""
# ── Examples ───────────────────────────────────────────────────
EXAMPLES = [
["""INVOICE
Vendor: Tata Consultancy Services Ltd.
Invoice No: TCS-2024-8821
Date: 2024-11-15
Due Date: 2024-12-15
Bill To: Reliance Industries Ltd.
Service: Cloud Infrastructure Management (Oct 2024) INR 42,500.00
Service: SAP Integration Support INR 18,000.00
GST @ 18%: INR 10,890.00
TOTAL DUE: INR 71,390.00
Payment Terms: Net 30""", "Invoice"],
["""SAP FI - VENDOR PAYMENT REPORT
Company Code: 1000 | Fiscal Year: 2024
Run Date: 2024-09-30
|DocNo |Vendor |Amount |Curr|Status |
|----------|--------------------|--------------|----|--------|
|1900045621|Wipro Limited | 4,25,000.00 |INR |Open |
|1900045622|HCL Technologies | 2,10,500.00 |INR |Cleared |
|1900045623|Infosys BPO | 8,75,200.00 |INR |Open |
Total: 15,10,700.00 INR""", "SAP Report"],
["""INCOME STATEMENT
Reliance Industries Ltd.
Period ending: 2024-09-30
(in INR)
Revenue: 50,000,000.00
Cost of Revenue: (22,000,000.00)
Gross Profit: 28,000,000.00
Operating Expenses: (12,000,000.00)
EBIT: 16,000,000.00
Income Tax 25%: (4,000,000.00)
Net Income: 12,000,000.00""", "Income Statement"],
["""PURCHASE ORDER
PO Number: PO-2024-00456
Date: 2024-10-01
Vendor: Amazon Web Services India
Ship To: HDFC Bank Ltd., Mumbai
Item 1: EC2 Reserved Instances (1yr) USD 12,000.00
Item 2: S3 Storage 50TB USD 1,800.00
Item 3: RDS Multi-AZ USD 4,200.00
Subtotal: USD 18,000.00
Tax: USD 0.00
Total: USD 18,000.00
Payment Terms: Net 45""", "Purchase Order"],
]
# ── UI ─────────────────────────────────────────────────────────
with gr.Blocks(
title="Multi-Format Finance Parser",
theme=gr.themes.Soft(),
css=".json-output { font-family: monospace; font-size: 13px; }"
) as demo:
gr.Markdown("""
# 🏦 Multi-Format Finance Document Parser
**Production-grade** financial document extraction β†’ structured JSON.
Supports: **Invoice Β· SAP Report Β· Income Statement Β· Bank Statement Β· Purchase Order Β· SQL results**
*Fine-tuned Qwen2.5-7B-Instruct Β· QLoRA 4-bit NF4 Β· Trained on CORD-v2 + synthetic finance data*
""")
with gr.Row():
with gr.Column(scale=1):
text_in = gr.Textbox(
label="Paste document text",
lines=16,
placeholder="Paste your invoice, SAP export, income statement, or any financial document here...",
)
hint_in = gr.Dropdown(
choices=[
"Auto-detect",
"Invoice",
"SAP Report",
"Balance Sheet",
"Income Statement",
"Bank Statement",
"Purchase Order",
"SQL Result",
],
value="Auto-detect",
label="Document type hint (optional)",
)
parse_btn = gr.Button("Parse Document", variant="primary", size="lg")
with gr.Column(scale=1):
summary_out = gr.Markdown(label="Summary")
json_out = gr.Code(
label="Structured JSON output",
language="json",
lines=18,
)
gr.Markdown("### Try an example")
gr.Examples(
examples=EXAMPLES,
inputs=[text_in, hint_in],
label="Click any example to load it",
)
gr.Markdown("""
---
**Model:** [ratulsur/multi-format-finance-parser](https://huggingface.co/ratulsur/multi-format-finance-parser)
**Training:** QLoRA (4-bit NF4 double quantization) on Qwen2.5-7B-Instruct
**Dataset:** CORD-v2 receipts + synthetic invoices, SAP reports, income statements
""")
parse_btn.click(
fn=process,
inputs=[text_in, hint_in],
outputs=[summary_out, json_out],
)
# Start the Gradio server only when this file is run as a script, not when it
# is imported.
if __name__ == "__main__":
    demo.launch()