Qwen-UI

Running

App Files Files Community

Ankushbl6 committed on Dec 12, 2025

Commit

efa6392

verified ·

1 Parent(s): c5b9438

Update src/streamlit_app.py

Browse files

Files changed (1) hide show

src/streamlit_app.py +383 -92

src/streamlit_app.py CHANGED Viewed

@@ -1,7 +1,6 @@
 # =========================
 # Invoice Extractor (Qwen3-VL via RunPod vLLM) - Batch Mode with Tax Validation
-# UPDATED: Comprehensive date parsing (50+ formats) + Hybrid date display
-# FIX: Tax calculation skips both empty ("") and explicit zero (0.00) values
 # =========================
 import os
 from pathlib import Path
@@ -90,27 +89,118 @@ def ensure_state(k: str, default):
         st.session_state[k] = default
 def clean_float(x) -> float:
-    import re
     if x is None:
         return 0.0
     if isinstance(x, (int, float)):
         return float(x)
     s = str(x).strip()
     if s == "":
         return 0.0
-    s = re.sub(r"[,\s]", "", s)
-    s = re.sub(r"[^\d\.\-]", "", s)
-    if s in ("", ".", "-", "-."):
         return 0.0
     try:
-        return float(s)
-    except Exception:
         return 0.0
 def normalize_date(date_str) -> str:
     """
-    Normalize various date formats to dd-MMM-yyyy format (e.g., 01-Jan-2025)
-    Handles: ISO, US, EU, Asian, two-digit years, and 50+ worldwide date formats
     Returns empty string if date cannot be parsed
     """
     if not date_str or date_str == "":
@@ -121,8 +211,8 @@ def normalize_date(date_str) -> str:
         if date_str == "":
             return ""
-    # Comprehensive list of date formats to try (order matters - most specific first)
-    formats = [
         # ISO formats (4-digit year)
         "%Y-%m-%d",           # 2025-01-15
         "%Y/%m/%d",           # 2025/01/15
@@ -162,7 +252,7 @@ def normalize_date(date_str) -> str:
         # European formats with 2-digit year - Day first
         "%d-%m-%y",           # 15-01-25
-        "%d/%m/%y",           # 15/01/25 or 25/09/25 ← FIXES YOUR ISSUE!
         "%d.%m.%y",           # 15.01.25
         "%d %m %y",           # 15 01 25
@@ -205,35 +295,72 @@ def normalize_date(date_str) -> str:
         "%Y%d%m",             # 20251501
     ]
-    parsed_date = None
-    # Try parsing with each format
-    for fmt in formats:
         try:
             parsed_date = datetime.strptime(str(date_str), fmt)
-            break
         except (ValueError, TypeError):
             continue
-    # If still not parsed, try removing ordinal suffixes (st, nd, rd, th)
-    if parsed_date is None and isinstance(date_str, str):
-        import re
         cleaned = re.sub(r'(\d+)(st|nd|rd|th)\b', r'\1', date_str, flags=re.IGNORECASE)
         if cleaned != date_str:
-            for fmt in formats:
                 try:
                     parsed_date = datetime.strptime(cleaned, fmt)
-                    break
                 except (ValueError, TypeError):
                     continue
-    # If no format matched, return empty string
-    if parsed_date is None:
-        return ""
-    # Format as dd-MMM-yyyy (e.g., 01-Jan-2025)
-    return parsed_date.strftime("%d-%b-%Y")
 def parse_date_to_object(date_str):
     """
@@ -331,6 +458,41 @@ def parse_date_to_object(date_str):
         "%d%m%Y",             # 15012025
         "%m%d%Y",             # 01152025
         "%Y%d%m",             # 20251501
     ]
     # Try parsing with each format
@@ -356,22 +518,6 @@ def parse_date_to_object(date_str):
     return None
-# -----------------------------
-# HF login flow (REMOVED - No longer needed for vLLM API)
-# -----------------------------
-# Authentication is now handled via POD_URL and VLLM_API_KEY instead
-# -----------------------------
-# Model config
-# -----------------------------
-# OLD DONUT CODE (COMMENTED OUT - Now using vLLM API)
-# -----------------------------
-# HF_MODEL_ID = "Bhuvi13/model-V7"
-# TASK_PROMPT = "<s_cord-v2>"
-#
-# @st.cache_resource(show_spinner=False)
-# def load_model_and_processor(hf_model_id: str, task_prompt: str):
-#     ...
 # -----------------------------
 # vLLM Inference Function (RunPod API)
@@ -407,7 +553,8 @@ Extract the data into this exact JSON structure:
       "quantity": "Quantity of items",
       "unit_price": "Price per unit",
       "amount": "Total amount for this line item",
-      "tax": "Tax amount for this item",
       "Line_total": "Total amount including tax for this line"
     }
   ],
@@ -434,7 +581,7 @@ IMPORTANT GUIDELINES:
 - Extract text exactly as it appears, including special characters and formatting
 - For dates, preserve the original format shown in the invoice
 - If both sender and receiver addresses are in the United States, extract ACH; otherwise extract Wire transfer (WT).
-- If payment terms specify a number of days (e.g., “payment terms 30 days”, “payable within 15 days”, “terms 45 days”, “Net 30”, or any similar phrase), compute: due_date = invoice_date + N days. If the invoice states “due on receipt”, “due upon receipt” ,"Immediate" or any similar phrase meaning immediate payment, then: due_date = invoice_date. Use the same date format as the invoice. Output only the computed due_date.
 - if tax_rate is not given in invoice but tax_amount is given, calculate the tax_rate using tax_amount and subtotal.
 - line-item wise tax calculation has to be done properly based ONLY on the tax_rate given in the summary, and the same tax_rate must be used for every line item in that invoice.
 - If currency symbols are present, note them appropriately
@@ -519,12 +666,98 @@ Return only the JSON object with the extracted information"""
 def parse_vllm_json(raw_json_text):
     """Parse vLLM JSON output into structured format"""
     try:
-        data = json.loads(raw_json_text)
         def clean_amount(value):
-            if not value or value == "":
                 return 0.0
-            return float(re.sub(r'[^\d\.-]', '', str(value)))
         header = data.get("header", {})
         summary = data.get("summary", {})
@@ -792,6 +1025,7 @@ def map_prediction_to_ui(pred):
         return None
     def clean_number(x):
         if x is None:
             return 0.0
         if isinstance(x, (int, float)):
@@ -799,13 +1033,71 @@ def map_prediction_to_ui(pred):
         s = str(x).strip()
         if s == "":
             return 0.0
-        s = re.sub(r"[,\s]", "", s)
-        s = re.sub(r"[^\d\.\-]", "", s)
-        if s in ("", ".", "-", "-."):
             return 0.0
         try:
-            return float(s)
-        except Exception:
             return 0.0
     def collect_keys(obj, out):
@@ -1076,16 +1368,6 @@ def flatten_invoice_to_rows(invoice_data) -> list:
         rows.append(row)
     return rows
-# -----------------------------
-# Load model (COMMENTED OUT - Now using vLLM API)
-# -----------------------------
-# try:
-#     with st.spinner("Loading model & processor (cached) ..."):
-#         processor, model, device, decoder_input_ids = load_model_and_processor(HF_MODEL_ID, TASK_PROMPT)
-# except Exception as e:
-#     st.error("Could not load model automatically. See details below.")
-#     st.exception(e)
-#     st.stop()
 # -----------------------------
 # Session scaffolding
@@ -1156,6 +1438,8 @@ if not st.session_state.is_processing_batch and len(st.session_state.batch_resul
                     continue
                 # vLLM Inference + parsing + tax validation
                 try:
                     # Call vLLM API
                     raw_json = run_inference_vllm(image)
@@ -1174,18 +1458,18 @@ if not st.session_state.is_processing_batch and len(st.session_state.batch_resul
                         st.warning(f"No response from vLLM for {uploaded_file.name}")
                         mapped = {}
-                    pred = raw_json  # Store raw JSON for debugging
                 except Exception as e:
                     st.warning(f"Error processing {uploaded_file.name}: {str(e)}")
-                    pred = None
                     mapped = {}
                 safe_mapped = mapped if isinstance(mapped, dict) else {}
                 st.session_state.batch_results[file_hash] = {
                     "file_name": uploaded_file.name,
                     "image": image,
-                    "raw_pred": pred,
                     "mapped_data": safe_mapped,
                     "edited_data": safe_mapped.copy()
                 }
@@ -1320,9 +1604,17 @@ elif len(st.session_state.batch_results) > 0:
     with frame_left:
         st.image(image, caption=current["file_name"], width=FIXED_IMG_WIDTH)
         st.write(f"**File Hash:** {selected_hash[:8]}...")
-        if current.get('raw_pred') is not None:
-            with st.expander("🔍 Show raw model output"):
-                st.json(current['raw_pred'])
         if st.button("🔁 Re-Run Inference", key=f"rerun_{selected_hash}"):
             with st.spinner("Re-running inference..."):
@@ -1345,10 +1637,9 @@ elif len(st.session_state.batch_results) > 0:
                         mapped = {}
                     safe_mapped = mapped if isinstance(mapped, dict) else {}
-                    pred = raw_json  # Store raw JSON
                     # Update stored results
-                    st.session_state.batch_results[selected_hash]["raw_pred"] = pred
                     st.session_state.batch_results[selected_hash]["mapped_data"] = mapped
                     st.session_state.batch_results[selected_hash]["edited_data"] = safe_mapped.copy()
@@ -1396,10 +1687,10 @@ elif len(st.session_state.batch_results) > 0:
                 if st.session_state.get(f"Currency_{selected_hash}") == 'Other':
                     st.text_input("Specify Currency", key=f"Currency_Custom_{selected_hash}")
-                st.number_input("Subtotal", key=f"Subtotal_{selected_hash}")
-                st.number_input("Tax %", key=f"Tax Percentage_{selected_hash}")
-                st.number_input("Total Tax", key=f"Total Tax_{selected_hash}")
-                st.number_input("Total Amount", key=f"Total Amount_{selected_hash}")
             with tabs[1]:
                 st.text_input("Sender Name", key=f"Sender Name_{selected_hash}")
@@ -1512,7 +1803,7 @@ elif len(st.session_state.batch_results) > 0:
                     st.dataframe(
                         totals_df,
-                        width="stretch",          # <- see note below
                         hide_index=True,
                         height=38
                     )
@@ -1562,16 +1853,16 @@ elif len(st.session_state.batch_results) > 0:
             calculated_tax_pct = round((calculated_total_tax / calculated_subtotal) * 100, 4)
         if saved:
-            # Build updated data structure
             updated = {
                 'Invoice Number': st.session_state.get(f"Invoice Number_{selected_hash}", ''),
                 'Invoice Date': invoice_date_str,
                 'Due Date': due_date_str,
                 'Currency': currency,
-                'Subtotal': calculated_subtotal,  # Auto-calculated from line items
-                'Tax Percentage': calculated_tax_pct,  # Auto-calculated
-                'Total Tax': calculated_total_tax,  # Auto-calculated from line items
-                'Total Amount': calculated_total,  # Auto-calculated from line items
                 'Sender Name': st.session_state.get(f"Sender Name_{selected_hash}", ''),
                 'Sender Address': st.session_state.get(f"Sender Address_{selected_hash}", ''),
                 'Recipient Name': st.session_state.get(f"Recipient Name_{selected_hash}", ''),
@@ -1595,10 +1886,10 @@ elif len(st.session_state.batch_results) > 0:
             # Save to batch_results (this persists the data)
             st.session_state.batch_results[selected_hash]["edited_data"] = updated
-            # CRITICAL: Clear items_df from session state so it rebuilds from saved data on next rerun
-            items_state_key = f"items_df_{selected_hash}"
-            if items_state_key in st.session_state:
-                del st.session_state[items_state_key]
             # Show success message
             st.success("✅ Saved")
@@ -1606,16 +1897,16 @@ elif len(st.session_state.batch_results) > 0:
             # Rerun to reload the form with saved data
             st.rerun()
-        # Per-file CSV download (ALWAYS visible, uses current edited values)
         download_data = {
             'Invoice Number': st.session_state.get(f"Invoice Number_{selected_hash}", ''),
             'Invoice Date': invoice_date_str,
             'Due Date': due_date_str,
             'Currency': currency,
-            'Subtotal': calculated_subtotal,  # Use calculated value
-            'Tax Percentage': calculated_tax_pct,  # Use calculated value
-            'Total Tax': calculated_total_tax,  # Use calculated value
-            'Total Amount': calculated_total,  # Use calculated value
             'Sender Name': st.session_state.get(f"Sender Name_{selected_hash}", ''),
             'Sender Address': st.session_state.get(f"Sender Address_{selected_hash}", ''),
             'Recipient Name': st.session_state.get(f"Recipient Name_{selected_hash}", ''),

 # =========================
 # Invoice Extractor (Qwen3-VL via RunPod vLLM) - Batch Mode with Tax Validation
+# UPDATED: Fixed raw model output display
 # =========================
 import os
 from pathlib import Path
         st.session_state[k] = default
 def clean_float(x) -> float:
     """
     Parse a number written in US or European notation into a float.
     US format:       1,234,567.89  (comma = thousands, period = decimal)
     European format: 1.234.567,89  (period = thousands, comma = decimal)
     Args:
         x: None, an int/float, or anything whose str() looks like an amount.
            Currency symbols (€ $ £ ¥ ₹) and whitespace are ignored. A leading
            minus, a trailing minus, or surrounding parentheses mark a
            negative value.
     Returns:
         The parsed value, or 0.0 when x is None, empty, or not parseable.
     Examples:
         "1,234.56"    → 1234.56   (US)
         "1.234,56"    → 1234.56   (European)
         "3.000,2234"  → 3000.2234 (European with 4 decimal places)
         "261,49"      → 261.49    (European decimal only)
         "39,22-"      → -39.22    (trailing minus)
         "$-5.00"      → -5.0      (sign placed after the currency symbol)
         "1,234"       → 1234.0    (single thousands group)
         "1.000"       → 1000.0    (single thousands group, European)
         "0.125"       → 0.125     (a leading zero can never be a thousands group)
     """
     if x is None:
         return 0.0
     if isinstance(x, (int, float)):
         return float(x)
     # Strip currency symbols and whitespace BEFORE looking for the sign, so a
     # minus hidden behind a symbol ("$-5.00", "€ -39,22") is not lost later.
     s = re.sub(r'[€$£¥₹\s]', '', str(x))
     if s == "":
         return 0.0
     # Detect the sign: leading minus, trailing minus, or accounting parentheses.
     is_negative = False
     if s.startswith('-'):
         is_negative = True
         s = s[1:]
     elif s.endswith('-'):
         is_negative = True
         s = s[:-1]
     elif s.startswith('(') and s.endswith(')'):
         is_negative = True
         s = s[1:-1]
     if s == "":
         return 0.0
     def is_thousands_group(head: str, tail: str) -> bool:
         # "1,234" / "12.000": 1-3 leading digits (not starting with 0)
         # followed by exactly 3 digits is read as a grouped integer.
         return (
             len(tail) == 3
             and tail.isdigit()
             and head.isdigit()
             and len(head) <= 3
             and not head.startswith('0')
         )
     comma_count = s.count(',')
     period_count = s.count('.')
     if comma_count and period_count:
         # Both separators present: whichever comes LAST is the decimal mark.
         if s.rfind(',') > s.rfind('.'):
             # European: 1.234,56
             s = s.replace('.', '').replace(',', '.')
         else:
             # US: 1,234.56
             s = s.replace(',', '')
     elif comma_count:
         head, _, tail = s.rpartition(',')
         if comma_count > 1 or is_thousands_group(head, tail):
             # "1,234,567" or "1,234" → grouping commas
             s = s.replace(',', '')
         elif tail.isdigit() and len(tail) <= 4:
             # "261,49", "0,125", "1234,5678" → European decimal comma
             s = s.replace(',', '.')
         else:
             # Dangling or malformed comma ("5,") → drop it
             s = s.replace(',', '')
     elif period_count:
         head, _, tail = s.rpartition('.')
         if period_count > 1 or is_thousands_group(head, tail):
             # "1.234.567" or "1.000" → European grouping periods
             s = s.replace('.', '')
         # Otherwise keep as a standard decimal ("1.50", "0.125", "1234.567")
     # Drop anything that is still not a digit or the decimal point.
     s = re.sub(r'[^\d.]', '', s)
     if s in ("", "."):
         return 0.0
     try:
         result = float(s)
     except ValueError:
         return 0.0
     return -result if is_negative else result
 def normalize_date(date_str) -> str:
     """
+    Normalize various date formats:
+    - Full dates (day-month-year) → dd-MMM-yyyy (e.g., 01-Jan-2025)
+    - Month-year only → MMM-yyyy (e.g., Aug-2025)
     Returns empty string if date cannot be parsed
     """
     if not date_str or date_str == "":
         if date_str == "":
             return ""
+    # FULL DATE FORMATS (day-month-year) - try these first
+    full_date_formats = [
         # ISO formats (4-digit year)
         "%Y-%m-%d",           # 2025-01-15
         "%Y/%m/%d",           # 2025/01/15
         # European formats with 2-digit year - Day first
         "%d-%m-%y",           # 15-01-25
+        "%d/%m/%y",           # 15/01/25
         "%d.%m.%y",           # 15.01.25
         "%d %m %y",           # 15 01 25
         "%Y%d%m",             # 20251501
     ]
+    # Try full date formats first → output as dd-MMM-yyyy
+    for fmt in full_date_formats:
         try:
             parsed_date = datetime.strptime(str(date_str), fmt)
+            return parsed_date.strftime("%d-%b-%Y")
         except (ValueError, TypeError):
             continue
+    # Try with ordinal suffixes removed (1st, 2nd, 3rd, etc.)
+    if isinstance(date_str, str):
         cleaned = re.sub(r'(\d+)(st|nd|rd|th)\b', r'\1', date_str, flags=re.IGNORECASE)
         if cleaned != date_str:
+            for fmt in full_date_formats:
                 try:
                     parsed_date = datetime.strptime(cleaned, fmt)
+                    return parsed_date.strftime("%d-%b-%Y")
                 except (ValueError, TypeError):
                     continue
+    # MONTH-YEAR ONLY FORMATS - output as MMM-yyyy
+    month_year_formats = [
+        # Full month name with year
+        "%B %Y",              # August 2025
+        "%b %Y",              # Aug 2025
+        "%B, %Y",             # August, 2025
+        "%b, %Y",             # Aug, 2025
+        "%B-%Y",              # August-2025
+        "%b-%Y",              # Aug-2025
+        "%B/%Y",              # August/2025
+        "%b/%Y",              # Aug/2025
+        # Numeric month-year (4-digit year)
+        "%m/%Y",              # 08/2025
+        "%m-%Y",              # 08-2025
+        "%m.%Y",              # 08.2025
+        "%m %Y",              # 08 2025
+        "%Y-%m",              # 2025-08
+        "%Y/%m",              # 2025/08
+        "%Y.%m",              # 2025.08
+        "%Y %m",              # 2025 08
+        # Numeric month-year (2-digit year)
+        "%m/%y",              # 08/25
+        "%m-%y",              # 08-25
+        "%m.%y",              # 08.25
+        "%m %y",              # 08 25
+        "%y-%m",              # 25-08
+        "%y/%m",              # 25/08
+        # Full month name with 2-digit year
+        "%B %y",              # August 25
+        "%b %y",              # Aug 25
+        "%B-%y",              # August-25
+        "%b-%y",              # Aug-25
+    ]
+    # Try month-year formats → output as MMM-yyyy (no day)
+    for fmt in month_year_formats:
+        try:
+            parsed_date = datetime.strptime(str(date_str), fmt)
+            return parsed_date.strftime("%b-%Y")  # Aug-2025 format
+        except (ValueError, TypeError):
+            continue
+    # If no format matched, return empty string
+    return ""
 def parse_date_to_object(date_str):
     """
         "%d%m%Y",             # 15012025
         "%m%d%Y",             # 01152025
         "%Y%d%m",             # 20251501
+        # ========== MONTH-YEAR ONLY FORMATS (defaults to 1st of month) ==========
+        # Full month name with year
+        "%B %Y",              # August 2025
+        "%b %Y",              # Aug 2025
+        "%B, %Y",             # August, 2025
+        "%b, %Y",             # Aug, 2025
+        "%B-%Y",              # August-2025
+        "%b-%Y",              # Aug-2025
+        "%B/%Y",              # August/2025
+        "%b/%Y",              # Aug/2025
+        # Numeric month-year (4-digit year)
+        "%m/%Y",              # 08/2025
+        "%m-%Y",              # 08-2025
+        "%m.%Y",              # 08.2025
+        "%m %Y",              # 08 2025
+        "%Y-%m",              # 2025-08
+        "%Y/%m",              # 2025/08
+        "%Y.%m",              # 2025.08
+        "%Y %m",              # 2025 08
+        # Numeric month-year (2-digit year)
+        "%m/%y",              # 08/25
+        "%m-%y",              # 08-25
+        "%m.%y",              # 08.25
+        "%m %y",              # 08 25
+        "%y-%m",              # 25-08
+        "%y/%m",              # 25/08
+        # Full month name with 2-digit year
+        "%B %y",              # August 25
+        "%b %y",              # Aug 25
+        "%B-%y",              # August-25
+        "%b-%y",              # Aug-25
     ]
     # Try parsing with each format
     return None
 # -----------------------------
 # vLLM Inference Function (RunPod API)
       "quantity": "Quantity of items",
       "unit_price": "Price per unit",
       "amount": "Total amount for this line item",
+      "t_rate": "tax_rate",
+      "tax": "amount*t_rate/100",
       "Line_total": "Total amount including tax for this line"
     }
   ],
 - Extract text exactly as it appears, including special characters and formatting
 - For dates, preserve the original format shown in the invoice
 - If both sender and receiver addresses are in the United States, extract ACH; otherwise extract Wire transfer (WT).
+- If payment terms specify a number of days (e.g., "payment terms 30 days", "payable within 15 days", "terms 45 days", "Net 30", or any similar phrase), compute: due_date = invoice_date + N days. If the invoice states "due on receipt", "due upon receipt" ,"Immediate" or any similar phrase meaning immediate payment, then: due_date = invoice_date. Use the same date format as the invoice. Output only the computed due_date.
 - if tax_rate is not given in invoice but tax_amount is given, calculate the tax_rate using tax_amount and subtotal.
 - line-item wise tax calculation has to be done properly based ONLY on the tax_rate given in the summary, and the same tax_rate must be used for every line item in that invoice.
 - If currency symbols are present, note them appropriately
 def parse_vllm_json(raw_json_text):
     """Parse vLLM JSON output into structured format"""
     try:
+        # Try to parse the JSON - handle potential markdown code blocks
+        text_to_parse = raw_json_text.strip()
+        # Remove markdown code fences if present
+        if text_to_parse.startswith("```json"):
+            text_to_parse = text_to_parse[7:]
+        elif text_to_parse.startswith("```"):
+            text_to_parse = text_to_parse[3:]
+        if text_to_parse.endswith("```"):
+            text_to_parse = text_to_parse[:-3]
+        text_to_parse = text_to_parse.strip()
+        data = json.loads(text_to_parse)
         def clean_amount(value):
             """
             Parse a US- or European-formatted amount from the model output.
             Delegates to the module-level clean_float() so that this parser and
             the shared helper cannot drift apart; the previous body was a
             line-for-line copy of the same logic.
             Returns 0.0 for None, empty, or unparseable values.
             """
             return clean_float(value)
         header = data.get("header", {})
         summary = data.get("summary", {})
         return None
     def clean_number(x):
+        """Parse number handling both US and European formats."""
         if x is None:
             return 0.0
         if isinstance(x, (int, float)):
         s = str(x).strip()
         if s == "":
             return 0.0
+        # Handle negative signs (leading or trailing)
+        is_negative = False
+        if s.startswith('-'):
+            is_negative = True
+            s = s[1:].strip()
+        elif s.endswith('-'):
+            is_negative = True
+            s = s[:-1].strip()
+        elif s.startswith('(') and s.endswith(')'):
+            is_negative = True
+            s = s[1:-1].strip()
+        # Remove currency symbols and spaces
+        s = re.sub(r'[€$£¥₹\s]', '', s)
+        if s == "":
             return 0.0
+        comma_count = s.count(',')
+        period_count = s.count('.')
+        last_comma = s.rfind(',')
+        last_period = s.rfind('.')
+        if comma_count > 0 and period_count > 0:
+            # Both present - LAST one is decimal separator
+            if last_comma > last_period:
+                # European: 1.234,56 or 3.000,2234
+                s = s.replace('.', '').replace(',', '.')
+            else:
+                # US: 1,234.56
+                s = s.replace(',', '')
+        elif comma_count > 0:
+            # Only commas
+            after_last_comma = s[last_comma + 1:] if last_comma < len(s) - 1 else ""
+            if comma_count == 1 and len(after_last_comma) <= 4 and after_last_comma.isdigit():
+                # European decimal: "261,49" or "9,60"
+                s = s.replace(',', '.')
+            elif len(after_last_comma) == 3:
+                # 3 digits after comma → thousands: "1,234"
+                s = s.replace(',', '')
+            else:
+                # Multiple commas = US thousands: "1,234,567"
+                s = s.replace(',', '')
+        elif period_count > 0:
+            # Only periods
+            after_last_period = s[last_period + 1:] if last_period < len(s) - 1 else ""
+            if period_count > 1:
+                # Multiple periods = European thousands: "1.234.567"
+                s = s.replace('.', '')
+            elif len(after_last_period) == 3 and after_last_period.isdigit():
+                # Single period with exactly 3 digits → European thousands: "1.000"
+                before_period = s[:last_period]
+                if before_period.isdigit() and len(before_period) <= 3:
+                    s = s.replace('.', '')
+        s = re.sub(r'[^\d.]', '', s)
+        if s == "" or s == ".":
+            return 0.0
         try:
+            result = float(s)
+            return -result if is_negative else result
+        except ValueError:
             return 0.0
     def collect_keys(obj, out):
         rows.append(row)
     return rows
 # -----------------------------
 # Session scaffolding
                     continue
                 # vLLM Inference + parsing + tax validation
+                raw_json = None
+                mapped = {}
                 try:
                     # Call vLLM API
                     raw_json = run_inference_vllm(image)
                         st.warning(f"No response from vLLM for {uploaded_file.name}")
                         mapped = {}
                 except Exception as e:
                     st.warning(f"Error processing {uploaded_file.name}: {str(e)}")
+                    raw_json = None
                     mapped = {}
                 safe_mapped = mapped if isinstance(mapped, dict) else {}
+                # Store BOTH raw string AND parsed dict for display
                 st.session_state.batch_results[file_hash] = {
                     "file_name": uploaded_file.name,
                     "image": image,
+                    "raw_pred": raw_json,  # Original string from API (untouched)
                     "mapped_data": safe_mapped,
                     "edited_data": safe_mapped.copy()
                 }
     with frame_left:
         st.image(image, caption=current["file_name"], width=FIXED_IMG_WIDTH)
         st.write(f"**File Hash:** {selected_hash[:8]}...")
+        # ============ RAW MODEL OUTPUT DISPLAY (UNTOUCHED) ============
+        with st.expander("🔍 Show raw model output"):
+            raw_pred = current.get('raw_pred')
+            if raw_pred is None:
+                st.warning("No raw output available (API may have returned None)")
+            else:
+                # Show raw output exactly as received from the model - UNTOUCHED
+                st.code(str(raw_pred), language='json')
+        # ==============================================================
         if st.button("🔁 Re-Run Inference", key=f"rerun_{selected_hash}"):
             with st.spinner("Re-running inference..."):
                         mapped = {}
                     safe_mapped = mapped if isinstance(mapped, dict) else {}
                     # Update stored results
+                    st.session_state.batch_results[selected_hash]["raw_pred"] = raw_json
                     st.session_state.batch_results[selected_hash]["mapped_data"] = mapped
                     st.session_state.batch_results[selected_hash]["edited_data"] = safe_mapped.copy()
                 if st.session_state.get(f"Currency_{selected_hash}") == 'Other':
                     st.text_input("Specify Currency", key=f"Currency_Custom_{selected_hash}")
+                st.number_input("Subtotal", key=f"Subtotal_{selected_hash}", format="%.2f")
+                st.number_input("Tax %", key=f"Tax Percentage_{selected_hash}", format="%.4f")
+                st.number_input("Total Tax", key=f"Total Tax_{selected_hash}", format="%.2f")
+                st.number_input("Total Amount", key=f"Total Amount_{selected_hash}", format="%.2f")
             with tabs[1]:
                 st.text_input("Sender Name", key=f"Sender Name_{selected_hash}")
                     st.dataframe(
                         totals_df,
+                        use_container_width=True,
                         hide_index=True,
                         height=38
                     )
             calculated_tax_pct = round((calculated_total_tax / calculated_subtotal) * 100, 4)
         if saved:
+            # Build updated data structure using ACTUAL user-entered values from form
             updated = {
                 'Invoice Number': st.session_state.get(f"Invoice Number_{selected_hash}", ''),
                 'Invoice Date': invoice_date_str,
                 'Due Date': due_date_str,
                 'Currency': currency,
+                'Subtotal': st.session_state.get(f"Subtotal_{selected_hash}", 0.0),
+                'Tax Percentage': st.session_state.get(f"Tax Percentage_{selected_hash}", 0.0),
+                'Total Tax': st.session_state.get(f"Total Tax_{selected_hash}", 0.0),
+                'Total Amount': st.session_state.get(f"Total Amount_{selected_hash}", 0.0),
                 'Sender Name': st.session_state.get(f"Sender Name_{selected_hash}", ''),
                 'Sender Address': st.session_state.get(f"Sender Address_{selected_hash}", ''),
                 'Recipient Name': st.session_state.get(f"Recipient Name_{selected_hash}", ''),
             # Save to batch_results (this persists the data)
             st.session_state.batch_results[selected_hash]["edited_data"] = updated
+            # CRITICAL: Clear ALL session state keys for this file so they reload from saved edited_data
+            keys_to_delete = [k for k in list(st.session_state.keys()) if k.endswith(f"_{selected_hash}")]
+            for key in keys_to_delete:
+                del st.session_state[key]
             # Show success message
             st.success("✅ Saved")
             # Rerun to reload the form with saved data
             st.rerun()
+        # Per-file CSV download (ALWAYS visible, uses current form values)
         download_data = {
             'Invoice Number': st.session_state.get(f"Invoice Number_{selected_hash}", ''),
             'Invoice Date': invoice_date_str,
             'Due Date': due_date_str,
             'Currency': currency,
+            'Subtotal': st.session_state.get(f"Subtotal_{selected_hash}", 0.0),
+            'Tax Percentage': st.session_state.get(f"Tax Percentage_{selected_hash}", 0.0),
+            'Total Tax': st.session_state.get(f"Total Tax_{selected_hash}", 0.0),
+            'Total Amount': st.session_state.get(f"Total Amount_{selected_hash}", 0.0),
             'Sender Name': st.session_state.get(f"Sender Name_{selected_hash}", ''),
             'Sender Address': st.session_state.get(f"Sender Address_{selected_hash}", ''),
             'Recipient Name': st.session_state.get(f"Recipient Name_{selected_hash}", ''),