Qwen-UI

Running

App Files Files Community

Ankushbl6 committed on Dec 8, 2025

Commit

40bde2e

verified ·

1 Parent(s): d82e82b

Update src/streamlit_app.py

Browse files

Files changed (1) hide show

src/streamlit_app.py +303 -83

src/streamlit_app.py CHANGED Viewed

@@ -1,5 +1,7 @@
 # =========================
 # Invoice Extractor (Qwen3-VL via RunPod vLLM) - Batch Mode with Tax Validation
 # =========================
 import os
 from pathlib import Path
@@ -108,7 +110,7 @@ def clean_float(x) -> float:
 def normalize_date(date_str) -> str:
     """
     Normalize various date formats to dd-MMM-yyyy format (e.g., 01-Jan-2025)
-    Handles: ISO, US, EU, and various other common date formats
     Returns empty string if date cannot be parsed
     """
     if not date_str or date_str == "":
@@ -119,23 +121,88 @@ def normalize_date(date_str) -> str:
         if date_str == "":
             return ""
-    # Common date formats to try
     formats = [
-        "%Y-%m-%d",           # 2025-01-15 (ISO)
-        "%d-%m-%Y",           # 15-01-2025 (EU)
-        "%m-%d-%Y",           # 01-15-2025 (US)
         "%Y/%m/%d",           # 2025/01/15
-        "%d/%m/%Y",           # 15/01/2025
-        "%m/%d/%Y",           # 01/15/2025
-        "%d.%m.%Y",           # 15.01.2025
         "%Y.%m.%d",           # 2025.01.15
         "%d %B %Y",           # 15 January 2025
         "%d %b %Y",           # 15 Jan 2025
         "%B %d, %Y",          # January 15, 2025
         "%b %d, %Y",          # Jan 15, 2025
-        "%d-%b-%Y",           # 15-Jan-2025
-        "%d-%B-%Y",           # 15-January-2025
-        "%Y%m%d",             # 20250115
     ]
     parsed_date = None
@@ -148,6 +215,19 @@ def normalize_date(date_str) -> str:
         except (ValueError, TypeError):
             continue
     # If no format matched, return empty string
     if parsed_date is None:
         return ""
@@ -158,6 +238,7 @@ def normalize_date(date_str) -> str:
 def parse_date_to_object(date_str):
     """
     Parse a date string to a datetime.date object for date_input widget
     Returns None if date cannot be parsed
     """
     if not date_str or date_str == "":
@@ -168,23 +249,88 @@ def parse_date_to_object(date_str):
         if date_str == "":
             return None
-    # Common date formats to try
     formats = [
-        "%Y-%m-%d",           # 2025-01-15 (ISO)
-        "%d-%m-%Y",           # 15-01-2025 (EU)
-        "%m-%d-%Y",           # 01-15-2025 (US)
         "%Y/%m/%d",           # 2025/01/15
-        "%d/%m/%Y",           # 15/01/2025
-        "%m/%d/%Y",           # 01/15/2025
-        "%d.%m.%Y",           # 15.01.2025
         "%Y.%m.%d",           # 2025.01.15
         "%d %B %Y",           # 15 January 2025
         "%d %b %Y",           # 15 Jan 2025
         "%B %d, %Y",          # January 15, 2025
         "%b %d, %Y",          # Jan 15, 2025
-        "%d-%b-%Y",           # 15-Jan-2025
-        "%d-%B-%Y",           # 15-January-2025
-        "%Y%m%d",             # 20250115
     ]
     # Try parsing with each format
@@ -195,6 +341,19 @@ def parse_date_to_object(date_str):
         except (ValueError, TypeError):
             continue
     return None
 # -----------------------------
@@ -449,10 +608,11 @@ def parse_vllm_json(raw_json_text):
 def validate_and_calculate_taxes(structured_data):
     """
     Enhanced tax validation with smart line-item calculation:
-    1. Calculate line-item tax ONLY when line item tax is empty/missing
-    2. Skip line items with explicit 0.00 tax (tax-exempt)
-    3. Skip validation if tax_amount is 0 but tax_rate exists
-    4. Ensure both Tax Percentage and Total Tax are properly filled
     """
     subtotal = structured_data.get("Subtotal", 0.0)
@@ -473,81 +633,128 @@ def validate_and_calculate_taxes(structured_data):
         structured_data["tax_skip_reason"] = "Tax rate exists but tax amount is 0"
         return structured_data
-    # Determine authoritative tax rate from available sources
-    authoritative_rate = None
-    authority_source = None
-    # TEST SOURCE A: tax_rate
-    if model_tax_rate > 0:
-        expected_tax_from_rate = subtotal * (model_tax_rate / 100)
-        expected_total_from_rate = subtotal + expected_tax_from_rate
-        error_from_rate = abs(expected_total_from_rate - total_amount)
-    else:
-        error_from_rate = float('inf')
-    # TEST SOURCE B: tax_amount
-    if model_tax_amount > 0:
-        calculated_rate_from_amount = (model_tax_amount / subtotal) * 100
-        expected_total_from_amount = subtotal + model_tax_amount
-        error_from_amount = abs(expected_total_from_amount - total_amount)
-    else:
-        error_from_amount = float('inf')
-    # PICK WINNER (or use whichever is available)
-    if model_tax_rate > 0 or model_tax_amount > 0:
-        if error_from_rate < error_from_amount:
-            authoritative_rate = round(model_tax_rate, 4)
-            authority_source = "tax_rate"
-        else:
-            authoritative_rate = round(calculated_rate_from_amount, 4)
-            authority_source = "tax_amount"
-    else:
-        # No tax information available
-        structured_data["tax_validated"] = False
-        structured_data["tax_skip_reason"] = "No tax rate or amount provided"
-        return structured_data
-    # APPLY to line items - BUT respect explicit 0.00 values
-    calculated_total_tax = 0.0
     for item in items:
         amount = item.get("Amount", 0.0)
-        original_tax = item.get("Tax", 0.0)
         raw_tax_value = item.get("Tax_Raw", "")  # Original string value from JSON
-        # If item amount is 0, keep tax at 0
         if amount == 0.0:
             item["Tax"] = 0.0
             item["Line Total"] = 0.0
             continue
-        # Distinguish between empty ("") and explicit "0" or "0.00"
-        # Empty string means tax was not provided - we should calculate it
-        # "0", "0.0", "0.00" means explicitly tax-exempt - skip calculation
         is_explicitly_zero = False
         if isinstance(raw_tax_value, str):
             cleaned = raw_tax_value.strip()
-            # Check if it's explicitly set to some form of zero
-            if cleaned != "" and float(re.sub(r'[^\d\.-]', '', cleaned) or '0') == 0.0:
-                is_explicitly_zero = True
         elif raw_tax_value == 0 or raw_tax_value == 0.0:
-            # If it's a number 0, treat as explicit
             is_explicitly_zero = True
-        # If explicitly 0.00 - tax-exempt item, don't calculate
-        if is_explicitly_zero and original_tax == 0.0:
             item["Tax"] = 0.0
             item["Line Total"] = amount
-            calculated_total_tax += 0.0
             continue
-        # Calculate tax for this line item
-        # Either: (1) Tax was empty/missing, or (2) Tax has a value that needs recalculation
-        corrected_tax = round(amount * (authoritative_rate / 100), 2)
-        item["Tax"] = corrected_tax
-        calculated_total_tax += corrected_tax
-        item["Line Total"] = round(amount + corrected_tax, 2)
     # Update summary - ENSURE BOTH FIELDS ARE FILLED
     structured_data["Tax Percentage"] = authoritative_rate
@@ -1174,16 +1381,29 @@ elif len(st.session_state.batch_results) > 0:
     with frame_right:
         st.subheader(f"Editable Invoice: {current['file_name']}")
-        # SWAP BUTTON REMOVED COMPLETELY
         # ----------------- FORM START -----------------
         with st.form(key=f"edit_form_{selected_hash}", clear_on_submit=False):
             tabs = st.tabs(["Invoice Details", "Sender/Recipient", "Bank Details", "Line Items"])
             with tabs[0]:
                 st.text_input("Invoice Number", key=f"Invoice Number_{selected_hash}")
-                st.date_input("Invoice Date", key=f"Invoice Date_{selected_hash}", format="DD/MM/YYYY")
-                st.date_input("Due Date", key=f"Due Date_{selected_hash}", format="DD/MM/YYYY")
                 curr_options = ['USD', 'EUR', 'GBP', 'INR', 'Other']
                 if st.session_state[f"Currency_{selected_hash}"] not in curr_options:

 # =========================
 # Invoice Extractor (Qwen3-VL via RunPod vLLM) - Batch Mode with Tax Validation
+# UPDATED: Comprehensive date parsing (50+ formats) + Hybrid date display
+# FIX: Tax calculation skips both empty ("") and explicit zero (0.00) values
 # =========================
 import os
 from pathlib import Path
 def normalize_date(date_str) -> str:
     """
     Normalize various date formats to dd-MMM-yyyy format (e.g., 01-Jan-2025)
+    Handles: ISO, US, EU, Asian, two-digit years, and 50+ worldwide date formats
     Returns empty string if date cannot be parsed
     """
     if not date_str or date_str == "":
         if date_str == "":
             return ""
+    # Comprehensive list of date formats to try (order matters - most specific first)
     formats = [
+        # ISO formats (4-digit year)
+        "%Y-%m-%d",           # 2025-01-15
         "%Y/%m/%d",           # 2025/01/15
         "%Y.%m.%d",           # 2025.01.15
+        "%Y %m %d",           # 2025 01 15
+        "%Y%m%d",             # 20250115 (compact)
+        # European formats with full month names (4-digit year)
         "%d %B %Y",           # 15 January 2025
         "%d %b %Y",           # 15 Jan 2025
+        "%d-%B-%Y",           # 15-January-2025
+        "%d-%b-%Y",           # 15-Jan-2025
+        "%d.%B.%Y",           # 15.January.2025
+        "%d.%b.%Y",           # 15.Jan.2025
+        "%d/%B/%Y",           # 15/January/2025
+        "%d/%b/%Y",           # 15/Jan/2025
+        # US formats with full month names (4-digit year)
         "%B %d, %Y",          # January 15, 2025
         "%b %d, %Y",          # Jan 15, 2025
+        "%B %d %Y",           # January 15 2025
+        "%b %d %Y",           # Jan 15 2025
+        "%B-%d-%Y",           # January-15-2025
+        "%b-%d-%Y",           # Jan-15-2025
+        # European formats - Day first (4-digit year)
+        "%d-%m-%Y",           # 15-01-2025
+        "%d/%m/%Y",           # 15/01/2025
+        "%d.%m.%Y",           # 15.01.2025
+        "%d %m %Y",           # 15 01 2025
+        # US formats - Month first (4-digit year)
+        "%m-%d-%Y",           # 01-15-2025
+        "%m/%d/%Y",           # 01/15/2025
+        "%m.%d.%Y",           # 01.15.2025
+        "%m %d %Y",           # 01 15 2025
+        # European formats with 2-digit year - Day first
+        "%d-%m-%y",           # 15-01-25
+        "%d/%m/%y",           # 15/01/25 or 25/09/25 (day-first, 2-digit year)
+        "%d.%m.%y",           # 15.01.25
+        "%d %m %y",           # 15 01 25
+        # US formats with 2-digit year - Month first
+        "%m-%d-%y",           # 01-15-25
+        "%m/%d/%y",           # 01/15/25
+        "%m.%d.%y",           # 01.15.25
+        "%m %d %y",           # 01 15 25
+        # ISO with 2-digit year
+        "%y-%m-%d",           # 25-01-15
+        "%y/%m/%d",           # 25/01/15
+        "%y.%m.%d",           # 25.01.15
+        "%y %m %d",           # 25 01 15
+        # Compact formats with 2-digit year
+        "%y%m%d",             # 250115
+        "%d%m%y",             # 150125
+        "%m%d%y",             # 011525
+        # European formats with abbreviated month (2-digit year)
+        "%d-%b-%y",           # 15-Jan-25
+        "%d/%b/%y",           # 15/Jan/25
+        "%d.%b.%y",           # 15.Jan.25
+        "%d %b %y",           # 15 Jan 25
+        "%d-%B-%y",           # 15-January-25
+        "%d/%B/%y",           # 15/January/25
+        # US formats with abbreviated month (2-digit year)
+        "%b %d, %y",          # Jan 15, 25
+        "%b %d %y",           # Jan 15 25
+        "%B %d, %y",          # January 15, 25
+        "%B %d %y",           # January 15 25
+        "%b-%d-%y",           # Jan-15-25
+        "%B-%d-%y",           # January-15-25
+        # Compact 8-digit formats
+        "%d%m%Y",             # 15012025
+        "%m%d%Y",             # 01152025
+        "%Y%d%m",             # 20251501
     ]
     parsed_date = None
         except (ValueError, TypeError):
             continue
+    # If still not parsed, try removing ordinal suffixes (st, nd, rd, th)
+    if parsed_date is None and isinstance(date_str, str):
+        import re
+        cleaned = re.sub(r'(\d+)(st|nd|rd|th)\b', r'\1', date_str, flags=re.IGNORECASE)
+        if cleaned != date_str:
+            for fmt in formats:
+                try:
+                    parsed_date = datetime.strptime(cleaned, fmt)
+                    break
+                except (ValueError, TypeError):
+                    continue
     # If no format matched, return empty string
     if parsed_date is None:
         return ""
 def parse_date_to_object(date_str):
     """
     Parse a date string to a datetime.date object for date_input widget
+    Handles: ISO, US, EU, Asian, two-digit years, and 50+ worldwide date formats
     Returns None if date cannot be parsed
     """
     if not date_str or date_str == "":
         if date_str == "":
             return None
+    # Comprehensive list of date formats to try (same as normalize_date)
     formats = [
+        # ISO formats (4-digit year)
+        "%Y-%m-%d",           # 2025-01-15
         "%Y/%m/%d",           # 2025/01/15
         "%Y.%m.%d",           # 2025.01.15
+        "%Y %m %d",           # 2025 01 15
+        "%Y%m%d",             # 20250115 (compact)
+        # European formats with full month names (4-digit year)
         "%d %B %Y",           # 15 January 2025
         "%d %b %Y",           # 15 Jan 2025
+        "%d-%B-%Y",           # 15-January-2025
+        "%d-%b-%Y",           # 15-Jan-2025
+        "%d.%B.%Y",           # 15.January.2025
+        "%d.%b.%Y",           # 15.Jan.2025
+        "%d/%B/%Y",           # 15/January/2025
+        "%d/%b/%Y",           # 15/Jan/2025
+        # US formats with full month names (4-digit year)
         "%B %d, %Y",          # January 15, 2025
         "%b %d, %Y",          # Jan 15, 2025
+        "%B %d %Y",           # January 15 2025
+        "%b %d %Y",           # Jan 15 2025
+        "%B-%d-%Y",           # January-15-2025
+        "%b-%d-%Y",           # Jan-15-2025
+        # European formats - Day first (4-digit year)
+        "%d-%m-%Y",           # 15-01-2025
+        "%d/%m/%Y",           # 15/01/2025
+        "%d.%m.%Y",           # 15.01.2025
+        "%d %m %Y",           # 15 01 2025
+        # US formats - Month first (4-digit year)
+        "%m-%d-%Y",           # 01-15-2025
+        "%m/%d/%Y",           # 01/15/2025
+        "%m.%d.%Y",           # 01.15.2025
+        "%m %d %Y",           # 01 15 2025
+        # European formats with 2-digit year - Day first
+        "%d-%m-%y",           # 15-01-25
+        "%d/%m/%y",           # 15/01/25 or 25/09/25 (day-first, 2-digit year)
+        "%d.%m.%y",           # 15.01.25
+        "%d %m %y",           # 15 01 25
+        # US formats with 2-digit year - Month first
+        "%m-%d-%y",           # 01-15-25
+        "%m/%d/%y",           # 01/15/25
+        "%m.%d.%y",           # 01.15.25
+        "%m %d %y",           # 01 15 25
+        # ISO with 2-digit year
+        "%y-%m-%d",           # 25-01-15
+        "%y/%m/%d",           # 25/01/15
+        "%y.%m.%d",           # 25.01.15
+        "%y %m %d",           # 25 01 15
+        # Compact formats with 2-digit year
+        "%y%m%d",             # 250115
+        "%d%m%y",             # 150125
+        "%m%d%y",             # 011525
+        # European formats with abbreviated month (2-digit year)
+        "%d-%b-%y",           # 15-Jan-25
+        "%d/%b/%y",           # 15/Jan/25
+        "%d.%b.%y",           # 15.Jan.25
+        "%d %b %y",           # 15 Jan 25
+        "%d-%B-%y",           # 15-January-25
+        "%d/%B/%y",           # 15/January/25
+        # US formats with abbreviated month (2-digit year)
+        "%b %d, %y",          # Jan 15, 25
+        "%b %d %y",           # Jan 15 25
+        "%B %d, %y",          # January 15, 25
+        "%B %d %y",           # January 15 25
+        "%b-%d-%y",           # Jan-15-25
+        "%B-%d-%y",           # January-15-25
+        # Compact 8-digit formats
+        "%d%m%Y",             # 15012025
+        "%m%d%Y",             # 01152025
+        "%Y%d%m",             # 20251501
     ]
     # Try parsing with each format
         except (ValueError, TypeError):
             continue
+    # If still not parsed, try removing ordinal suffixes
+    if isinstance(date_str, str):
+        import re
+        cleaned = re.sub(r'(\d+)(st|nd|rd|th)\b', r'\1', date_str, flags=re.IGNORECASE)
+        if cleaned != date_str:
+            for fmt in formats:
+                try:
+                    parsed_date = datetime.strptime(cleaned, fmt)
+                    return parsed_date.date()
+                except (ValueError, TypeError):
+                    continue
     return None
 # -----------------------------
 def validate_and_calculate_taxes(structured_data):
     """
     Enhanced tax validation with smart line-item calculation:
+    1. Skip calculation if tax is empty ("") - tax not provided
+    2. Skip calculation if tax is explicitly 0.00 - tax-exempt item
+    3. Calculate tax ONLY when line item has a non-zero tax value
+    4. Skip validation if tax_amount is 0 but tax_rate exists
+    5. Ensure both Tax Percentage and Total Tax are properly filled
     """
     subtotal = structured_data.get("Subtotal", 0.0)
         structured_data["tax_skip_reason"] = "Tax rate exists but tax amount is 0"
         return structured_data
+    # FIRST PASS: Identify which items are taxable (BEFORE determining authoritative rate)
+    # This is critical because we need to know the taxable subtotal to calculate the correct rate
+    taxable_items = []
+    non_taxable_items = []
     for item in items:
         amount = item.get("Amount", 0.0)
         raw_tax_value = item.get("Tax_Raw", "")  # Original string value from JSON
+        # If item amount is 0, it's non-taxable
         if amount == 0.0:
             item["Tax"] = 0.0
             item["Line Total"] = 0.0
+            non_taxable_items.append(item)
             continue
+        # Distinguish between:
+        # 1. Empty ("") = tax not provided → NON-TAXABLE
+        # 2. Explicit "0", "0.0", "0.00" = tax-exempt → NON-TAXABLE
+        # 3. Non-zero value = TAXABLE (calculate tax for this item)
+        is_empty = False
         is_explicitly_zero = False
         if isinstance(raw_tax_value, str):
             cleaned = raw_tax_value.strip()
+            if cleaned == "":
+                # Empty string means tax was not provided
+                is_empty = True
+            else:
+                # Check if it's explicitly set to some form of zero
+                try:
+                    cleaned_value = float(re.sub(r'[^\d\.-]', '', cleaned) or '0')
+                    if cleaned_value == 0.0:
+                        is_explicitly_zero = True
+                except (ValueError, TypeError):
+                    pass
+        elif raw_tax_value is None or raw_tax_value == "":
+            is_empty = True
         elif raw_tax_value == 0 or raw_tax_value == 0.0:
+            # If it's a number 0, treat as explicit zero
             is_explicitly_zero = True
+        # If empty - tax not provided, NON-TAXABLE
+        if is_empty:
+            item["Tax"] = 0.0
+            item["Line Total"] = amount
+            non_taxable_items.append(item)
+            continue
+        # If explicitly 0.00 - tax-exempt item, NON-TAXABLE
+        if is_explicitly_zero:
             item["Tax"] = 0.0
             item["Line Total"] = amount
+            non_taxable_items.append(item)
             continue
+        # This item is TAXABLE
+        taxable_items.append(item)
+    # SECOND PASS: Determine authoritative tax rate from available sources
+    # NOW we calculate based on TAXABLE items only (not all items)
+    authoritative_rate = None
+    authority_source = None
+    if taxable_items:
+        # Calculate total taxable amount (sum of amounts for taxable items only)
+        total_taxable_amount = sum(item.get("Amount", 0.0) for item in taxable_items)
+        if total_taxable_amount > 0:
+            # TEST SOURCE A: tax_rate (test against taxable subtotal, not total subtotal)
+            if model_tax_rate > 0:
+                expected_tax_from_rate = total_taxable_amount * (model_tax_rate / 100)
+                expected_total_from_rate = subtotal + expected_tax_from_rate
+                error_from_rate = abs(expected_total_from_rate - total_amount)
+            else:
+                error_from_rate = float('inf')
+            # TEST SOURCE B: tax_amount (calculate rate based on taxable subtotal only)
+            if model_tax_amount > 0:
+                calculated_rate_from_amount = (model_tax_amount / total_taxable_amount) * 100
+                expected_total_from_amount = subtotal + model_tax_amount
+                error_from_amount = abs(expected_total_from_amount - total_amount)
+            else:
+                error_from_amount = float('inf')
+            # PICK WINNER (or use whichever is available)
+            if model_tax_rate > 0 or model_tax_amount > 0:
+                if error_from_rate < error_from_amount:
+                    authoritative_rate = round(model_tax_rate, 4)
+                    authority_source = "tax_rate"
+                else:
+                    authoritative_rate = round(calculated_rate_from_amount, 4)
+                    authority_source = "tax_amount"
+            else:
+                # No tax information available
+                structured_data["tax_validated"] = False
+                structured_data["tax_skip_reason"] = "No tax rate or amount provided"
+                return structured_data
+        else:
+            # No taxable items with amount > 0
+            structured_data["tax_validated"] = False
+            structured_data["tax_skip_reason"] = "No taxable items with valid amounts"
+            return structured_data
+    else:
+        # No taxable items found
+        structured_data["tax_validated"] = False
+        structured_data["tax_skip_reason"] = "No taxable items found"
+        return structured_data
+    # THIRD PASS: Calculate tax for taxable items using authoritative rate
+    calculated_total_tax = 0.0
+    if taxable_items and authoritative_rate is not None:
+        # Calculate tax for each taxable item
+        for item in taxable_items:
+            amount = item.get("Amount", 0.0)
+            # Calculate tax based on authoritative rate
+            corrected_tax = round(amount * (authoritative_rate / 100), 2)
+            item["Tax"] = corrected_tax
+            calculated_total_tax += corrected_tax
+            item["Line Total"] = round(amount + corrected_tax, 2)
     # Update summary - ENSURE BOTH FIELDS ARE FILLED
     structured_data["Tax Percentage"] = authoritative_rate
     with frame_right:
         st.subheader(f"Editable Invoice: {current['file_name']}")
         # ----------------- FORM START -----------------
         with st.form(key=f"edit_form_{selected_hash}", clear_on_submit=False):
             tabs = st.tabs(["Invoice Details", "Sender/Recipient", "Bank Details", "Line Items"])
             with tabs[0]:
                 st.text_input("Invoice Number", key=f"Invoice Number_{selected_hash}")
+                # HYBRID DATE DISPLAY: Formatted display + Date picker
+                st.write("**Invoice Date:**")
+                invoice_date_obj = st.session_state.get(f"Invoice Date_{selected_hash}", None)
+                if invoice_date_obj:
+                    formatted_invoice = invoice_date_obj.strftime("%d-%b-%Y")
+                    st.info(f"📅 {formatted_invoice}")  # Shows: 📅 25-Sep-2025
+                st.date_input("Select date:", key=f"Invoice Date_{selected_hash}",
+                              format="DD/MM/YYYY", label_visibility="collapsed")
+                st.write("**Due Date:**")
+                due_date_obj = st.session_state.get(f"Due Date_{selected_hash}", None)
+                if due_date_obj:
+                    formatted_due = due_date_obj.strftime("%d-%b-%Y")
+                    st.info(f"📅 {formatted_due}")  # Shows: 📅 30-Sep-2025
+                st.date_input("Select date:", key=f"Due Date_{selected_hash}",
+                              format="DD/MM/YYYY", label_visibility="collapsed")
                 curr_options = ['USD', 'EUR', 'GBP', 'INR', 'Other']
                 if st.session_state[f"Currency_{selected_hash}"] not in curr_options: