File size: 9,041 Bytes
8789d31
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
import gradio as gr
import requests
import json
import re
import os

# ── Config ─────────────────────────────────────────────────────
# Hugging Face model repository that this app queries.
MODEL_ID  = "ratulsur/multi-format-finance-parser"
# Hosted inference endpoint, derived from MODEL_ID.
API_URL   = f"https://api-inference.huggingface.co/models/{MODEL_ID}"
# API token read from the HF_TOKEN environment variable; an empty string when unset.
HF_TOKEN  = os.environ.get("HF_TOKEN", "")

# System message embedded in the chat prompt that call_api sends to the model.
# It describes the JSON schema the model is asked to emit.
# NOTE(review): the "β†’" on the last line looks like a mis-encoded "→" (UTF-8
# bytes decoded with a Greek code page). It is left untouched here because the
# text is sent verbatim to the model — confirm against the prompt used during
# fine-tuning before changing it.
SYSTEM_PROMPT = """You are a production financial document parser.
Given raw text from any financial document, output ONLY a single valid JSON object.
Schema: {document_type, vendor, client, date (YYYY-MM-DD), due_date, document_id,
currency, subtotal, tax_amount, tax_rate_pct, total_amount,
line_items:[{description,quantity,unit_price,amount}], payment_terms, notes, metadata}.
All monetary values must be floats. Unknown fields β†’ null. No explanation."""

# ── Inference ──────────────────────────────────────────────────
def _loads_dict(candidate: str):
    """Return *candidate* parsed as a JSON object, or ``None`` if it is not one."""
    try:
        parsed = json.loads(candidate)
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        return None
    return parsed if isinstance(parsed, dict) else None


def _repair_json_text(text: str) -> str:
    """Apply best-effort fixes for Python-flavoured pseudo-JSON."""
    repaired = text.replace("'", '"')
    # Word boundaries keep identifiers such as "NoneSuch" intact.
    for py_name, json_name in (("None", "null"), ("True", "true"), ("False", "false")):
        repaired = re.sub(rf"\b{py_name}\b", json_name, repaired)
    # Drop trailing commas before a closing brace/bracket, whatever the whitespace.
    return re.sub(r",\s*([}\]])", r"\1", repaired)


def _parse_model_json(raw: str) -> dict:
    """Turn raw model output into a dict, or an error dict if that is impossible.

    Strict parsing is always attempted before any repair heuristic, both on
    the whole text and on the outermost ``{...}`` span, so valid JSON that is
    merely surrounded by extra text is never damaged by the heuristics.
    """
    cleaned = re.sub(r"```json\s*|```\s*", "", raw).strip()

    candidates = [cleaned]
    match = re.search(r"\{.*\}", cleaned, re.DOTALL)
    if match and match.group() != cleaned:
        candidates.append(match.group())

    # Pass 1: untouched text. Pass 2: text after the repair heuristics.
    for transform in (lambda s: s, _repair_json_text):
        for candidate in candidates:
            parsed = _loads_dict(transform(candidate))
            if parsed is not None:
                return parsed

    return {"error": "Could not parse model output", "raw": cleaned}


def call_api(text: str) -> dict:
    """Send *text* to the hosted model and return the parsed JSON object.

    Args:
        text: Document text to parse (optionally prefixed with a type hint).

    Returns:
        The JSON object produced by the model. When the response cannot be
        interpreted, a dict with an ``"error"`` message and the offending
        ``"raw"`` text is returned instead.

    Raises:
        requests.exceptions.RequestException: on timeouts, connection
            failures, or a non-2xx status (via ``raise_for_status``).
    """
    prompt = (
        f"<|im_start|>system\n{SYSTEM_PROMPT}<|im_end|>\n"
        f"<|im_start|>user\nParse this financial document:\n\n{text}<|im_end|>\n"
        f"<|im_start|>assistant\n"
    )
    headers = {"Content-Type": "application/json"}
    # Sending "Bearer " with an empty token is an invalid credential; omit the
    # header entirely so the request is made anonymously instead.
    if HF_TOKEN:
        headers["Authorization"] = f"Bearer {HF_TOKEN}"
    payload = {
        "inputs": prompt,
        "parameters": {
            "max_new_tokens": 512,
            "temperature": 0.05,
            "return_full_text": False,
            "do_sample": False,
        },
    }
    resp = requests.post(API_URL, headers=headers, json=payload, timeout=120)
    resp.raise_for_status()

    body = resp.json()
    try:
        generated = body[0]["generated_text"]
    except (KeyError, IndexError, TypeError):
        generated = None
    if not isinstance(generated, str):
        # Report the unexpected payload instead of failing with a bare KeyError.
        return {
            "error": "Unexpected API response format",
            "raw": json.dumps(body, ensure_ascii=False),
        }

    return _parse_model_json(generated)


# ── Main processing function ───────────────────────────────────
def _format_money(value) -> str:
    """Format a monetary value with thousands separators and two decimals.

    Falls back to the value's plain text when it is not numeric, so one
    malformed field cannot abort the whole summary.
    """
    if isinstance(value, bool):
        return str(value)
    if isinstance(value, (int, float)):
        return f"{value:,.2f}"
    if isinstance(value, str):
        try:
            return f"{float(value.replace(',', '')):,.2f}"
        except ValueError:
            return value
    return str(value)


def _build_summary(result: dict) -> str:
    """Render the headline fields of a parsed document as Markdown."""
    text_fields = (
        ("Type", "document_type"),
        ("Vendor", "vendor"),
        ("Client", "client"),
        ("Date", "date"),
        ("Due date", "due_date"),
        ("Document ID", "document_id"),
    )
    lines = [
        f"**{label}:** {result[key]}"
        for label, key in text_fields
        if result.get(key)
    ]

    # A null currency must not be rendered as the text "None".
    currency = result.get("currency") or ""
    prefix = f"{currency} " if currency else ""
    for label, key in (("Total", "total_amount"), ("Tax", "tax_amount")):
        # Explicit None check: an amount of 0 is still worth showing.
        if result.get(key) is not None:
            lines.append(f"**{label}:** {prefix}{_format_money(result[key])}")

    items = result.get("line_items")
    if isinstance(items, (list, tuple)) and items:
        lines.append(f"**Line items:** {len(items)}")
    if result.get("payment_terms"):
        lines.append(f"**Payment terms:** {result['payment_terms']}")

    return "\n\n".join(lines)


def process(text_input: str, doc_hint: str):
    """Parse pasted document text and return the two UI outputs.

    Args:
        text_input: Raw document text from the textbox.
        doc_hint: Selected document type; "Auto-detect" or an empty value
            means no hint is prepended to the text.

    Returns:
        A ``(summary_markdown, json_text)`` tuple. On failure the first item
        is a user-facing message and the second is either an empty string or
        the error payload as JSON.
    """
    if not text_input or not text_input.strip():
        return "⚠️ Please paste some document text.", ""

    try:
        text = text_input.strip()
        if doc_hint and doc_hint != "Auto-detect":
            text = f"[Document type: {doc_hint}]\n\n{text}"

        result = call_api(text)

        if not isinstance(result, dict):
            return (
                "❌ Error: Model output was not a JSON object",
                json.dumps(result, indent=2, ensure_ascii=False),
            )
        if result.get("error"):
            return (
                f"❌ Error: {result['error']}",
                json.dumps(result, indent=2, ensure_ascii=False),
            )

        return _build_summary(result), json.dumps(result, indent=2, ensure_ascii=False)

    except requests.exceptions.Timeout:
        return "⚠️ Model is loading, please wait 20 seconds and try again.", ""
    except requests.exceptions.HTTPError as e:
        return f"❌ API Error: {e}", ""
    except Exception as e:
        # UI boundary: show any other failure to the user instead of crashing.
        return f"❌ Error: {e}", ""


# ── Examples ───────────────────────────────────────────────────
# Sample inputs offered in the UI. Each entry is [document text, document-type
# hint], in the same order as the [text_in, hint_in] inputs given to gr.Examples.
EXAMPLES = [
    # Invoice with two service lines, GST and a total.
    ["""INVOICE
Vendor:       Tata Consultancy Services Ltd.
Invoice No:   TCS-2024-8821
Date:         2024-11-15
Due Date:     2024-12-15
Bill To:      Reliance Industries Ltd.

Service: Cloud Infrastructure Management (Oct 2024)   INR 42,500.00
Service: SAP Integration Support                       INR 18,000.00
GST @ 18%:                                             INR 10,890.00
TOTAL DUE:                                             INR 71,390.00
Payment Terms: Net 30""", "Invoice"],

    # Pipe-delimited SAP vendor payment table using Indian digit grouping.
    ["""SAP FI - VENDOR PAYMENT REPORT
Company Code: 1000 | Fiscal Year: 2024
Run Date: 2024-09-30

|DocNo     |Vendor              |Amount        |Curr|Status  |
|----------|--------------------|--------------|----|--------|
|1900045621|Wipro Limited       | 4,25,000.00  |INR |Open    |
|1900045622|HCL Technologies    | 2,10,500.00  |INR |Cleared |
|1900045623|Infosys BPO         | 8,75,200.00  |INR |Open    |

Total: 15,10,700.00 INR""", "SAP Report"],

    # Income statement; parenthesised amounts are the deducted lines.
    ["""INCOME STATEMENT
Reliance Industries Ltd.
Period ending: 2024-09-30
(in INR)

Revenue:                    50,000,000.00
Cost of Revenue:           (22,000,000.00)
Gross Profit:               28,000,000.00
Operating Expenses:        (12,000,000.00)
EBIT:                       16,000,000.00
Income Tax 25%:             (4,000,000.00)
Net Income:                 12,000,000.00""", "Income Statement"],

    # Purchase order with three items priced in USD.
    ["""PURCHASE ORDER
PO Number:    PO-2024-00456
Date:         2024-10-01
Vendor:       Amazon Web Services India
Ship To:      HDFC Bank Ltd., Mumbai

Item 1: EC2 Reserved Instances (1yr)    USD 12,000.00
Item 2: S3 Storage 50TB                 USD  1,800.00
Item 3: RDS Multi-AZ                    USD  4,200.00
Subtotal:                               USD 18,000.00
Tax:                                    USD      0.00
Total:                                  USD 18,000.00
Payment Terms: Net 45""", "Purchase Order"],
]


# ── UI ─────────────────────────────────────────────────────────
# Build the Gradio page: inputs on the left, parsed output on the right,
# clickable examples and a model footer underneath.
with gr.Blocks(
    title="Multi-Format Finance Parser",
    theme=gr.themes.Soft(),
    css=".json-output { font-family: monospace; font-size: 13px; }"
) as demo:

    # Page header. The arrow and middle-dot separators were previously stored
    # as mis-encoded text ("β†’", "Β·") and rendered as garbage in the UI.
    gr.Markdown("""
# 🏦 Multi-Format Finance Document Parser
**Production-grade** financial document extraction → structured JSON.

Supports: **Invoice · SAP Report · Income Statement · Bank Statement · Purchase Order · SQL results**

*Fine-tuned Qwen2.5-7B-Instruct · QLoRA 4-bit NF4 · Trained on CORD-v2 + synthetic finance data*
    """)

    with gr.Row():
        # Left column: document text, optional type hint and the submit button.
        with gr.Column(scale=1):
            text_in = gr.Textbox(
                label="Paste document text",
                lines=16,
                placeholder="Paste your invoice, SAP export, income statement, or any financial document here...",
            )
            hint_in = gr.Dropdown(
                choices=[
                    "Auto-detect",
                    "Invoice",
                    "SAP Report",
                    "Balance Sheet",
                    "Income Statement",
                    "Bank Statement",
                    "Purchase Order",
                    "SQL Result",
                ],
                value="Auto-detect",
                label="Document type hint (optional)",
            )
            parse_btn = gr.Button("Parse Document", variant="primary", size="lg")

        # Right column: Markdown summary above the full JSON result.
        with gr.Column(scale=1):
            summary_out = gr.Markdown(label="Summary")
            json_out = gr.Code(
                label="Structured JSON output",
                language="json",
                lines=18,
            )

    gr.Markdown("### Try an example")
    # Clicking an example fills the textbox and the hint dropdown.
    gr.Examples(
        examples=EXAMPLES,
        inputs=[text_in, hint_in],
        label="Click any example to load it",
    )

    gr.Markdown("""
---
**Model:** [ratulsur/multi-format-finance-parser](https://huggingface.co/ratulsur/multi-format-finance-parser)
**Training:** QLoRA (4-bit NF4 double quantization) on Qwen2.5-7B-Instruct
**Dataset:** CORD-v2 receipts + synthetic invoices, SAP reports, income statements
    """)

    # process returns (summary_markdown, json_text), matching the two outputs.
    parse_btn.click(
        fn=process,
        inputs=[text_in, hint_in],
        outputs=[summary_out, json_out],
    )

if __name__ == "__main__":
    demo.launch()