Spaces:
Running
Running
Update src/streamlit_app.py
Browse files- src/streamlit_app.py +33 -20
src/streamlit_app.py
CHANGED
|
@@ -229,14 +229,18 @@ def clean_tax_percentage(x) -> float:
|
|
| 229 |
except ValueError:
|
| 230 |
return 0.0
|
| 231 |
|
| 232 |
-
def clean_float(x) -> float:
|
| 233 |
"""
|
| 234 |
Parse a number string handling both US and European formats.
|
| 235 |
Use this for MONETARY AMOUNTS only, NOT for tax percentages.
|
| 236 |
-
|
| 237 |
US Format: 1,234,567.89 (comma = thousands, period = decimal)
|
| 238 |
European: 1.234.567,89 (period = thousands, comma = decimal)
|
| 239 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 240 |
Examples:
|
| 241 |
"1,234.56" → 1234.56 (US)
|
| 242 |
"1.234,56" → 1234.56 (European)
|
|
@@ -296,14 +300,26 @@ def clean_float(x) -> float:
|
|
| 296 |
# Only commas present
|
| 297 |
# Check what comes after the LAST comma
|
| 298 |
after_last_comma = s[last_comma + 1:] if last_comma < len(s) - 1 else ""
|
| 299 |
-
|
| 300 |
-
if comma_count == 1 and len(after_last_comma)
|
| 301 |
-
# Single comma with
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 302 |
# "261,49" → 261.49, "1234,5678" → 1234.5678
|
| 303 |
s = s.replace(',', '.')
|
| 304 |
elif len(after_last_comma) == 3 and comma_count >= 1:
|
| 305 |
-
# 3 digits after
|
| 306 |
-
# "1,234
|
| 307 |
s = s.replace(',', '')
|
| 308 |
else:
|
| 309 |
# Multiple commas → thousands separator
|
|
@@ -611,7 +627,7 @@ def parse_date_to_object(date_str, currency=None):
|
|
| 611 |
"%m-%d-%y", "%m/%d/%y", "%m.%d.%y", "%m %d %y",
|
| 612 |
|
| 613 |
# ISO with 2-digit year
|
| 614 |
-
"%y-%m-%d", "%y/%m/%d", "%y.%m.%d", "%y %m %
|
| 615 |
|
| 616 |
# Compact formats
|
| 617 |
"%y%m%d", "%d%m%y", "%m%d%y", "%d%m%Y", "%m%d%Y", "%Y%d%m",
|
|
@@ -652,7 +668,6 @@ def run_inference_vllm(image: Image.Image):
|
|
| 652 |
# Extraction prompt (JSON format)
|
| 653 |
EXTRACTION_PROMPT = """Please carefully examine this invoice image and extract all the information into the following structured JSON format. Pay close attention to details and ensure accuracy in number formatting and text extraction.
|
| 654 |
Extract the data into this exact JSON structure:
|
| 655 |
-
|
| 656 |
{
|
| 657 |
"header": {
|
| 658 |
"invoice_no": "Invoice number or reference ID",
|
|
@@ -689,8 +704,6 @@ Extract the data into this exact JSON structure:
|
|
| 689 |
"currency": "Currency code (USD, EUR, etc.)"
|
| 690 |
}
|
| 691 |
}
|
| 692 |
-
|
| 693 |
-
|
| 694 |
IMPORTANT GUIDELINES:
|
| 695 |
- Extract only the bank account details matching the invoice currency.
|
| 696 |
Example:
|
|
@@ -808,17 +821,17 @@ def parse_vllm_json(raw_json_text):
|
|
| 808 |
|
| 809 |
data = json.loads(text_to_parse)
|
| 810 |
|
| 811 |
-
def clean_amount(value):
|
| 812 |
-
"""Parse monetary amounts using clean_float"""
|
| 813 |
-
return clean_float(value)
|
| 814 |
-
|
| 815 |
header = data.get("header", {})
|
| 816 |
summary = data.get("summary", {})
|
| 817 |
items = data.get("items", [])
|
| 818 |
-
|
| 819 |
-
# Get currency first for date parsing
|
| 820 |
currency = summary.get("currency", "")
|
| 821 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 822 |
result = {
|
| 823 |
"Invoice Number": header.get("invoice_no", ""),
|
| 824 |
"Invoice Date": normalize_date(header.get("invoice_date", ""), currency),
|
|
@@ -1081,8 +1094,8 @@ def map_prediction_to_ui(pred):
|
|
| 1081 |
return None
|
| 1082 |
|
| 1083 |
def clean_number(x):
|
| 1084 |
-
"""Parse monetary amounts - use clean_float"""
|
| 1085 |
-
return clean_float(x)
|
| 1086 |
|
| 1087 |
def collect_keys(obj, out):
|
| 1088 |
if isinstance(obj, dict):
|
|
|
|
| 229 |
except ValueError:
|
| 230 |
return 0.0
|
| 231 |
|
| 232 |
+
def clean_float(x, currency=None) -> float:
|
| 233 |
"""
|
| 234 |
Parse a number string handling both US and European formats.
|
| 235 |
Use this for MONETARY AMOUNTS only, NOT for tax percentages.
|
| 236 |
+
|
| 237 |
US Format: 1,234,567.89 (comma = thousands, period = decimal)
|
| 238 |
European: 1.234.567,89 (period = thousands, comma = decimal)
|
| 239 |
+
|
| 240 |
+
Currency-aware for ambiguous cases (3 digits after comma):
|
| 241 |
+
- EUR: "10,000" → 10.0 (European decimal)
|
| 242 |
+
- USD/other: "10,000" → 10000 (thousands separator)
|
| 243 |
+
|
| 244 |
Examples:
|
| 245 |
"1,234.56" → 1234.56 (US)
|
| 246 |
"1.234,56" → 1234.56 (European)
|
|
|
|
| 300 |
# Only commas present
|
| 301 |
# Check what comes after the LAST comma
|
| 302 |
after_last_comma = s[last_comma + 1:] if last_comma < len(s) - 1 else ""
|
| 303 |
+
|
| 304 |
+
if comma_count == 1 and len(after_last_comma) == 3 and after_last_comma.isdigit():
|
| 305 |
+
# AMBIGUOUS CASE: Single comma with exactly 3 digits after
|
| 306 |
+
# Could be US thousands ("10,000" = 10000) or European decimal ("10,000" = 10.0)
|
| 307 |
+
# Use currency to decide:
|
| 308 |
+
if currency and currency.upper() == 'EUR':
|
| 309 |
+
# European: treat comma as decimal separator
|
| 310 |
+
# "10,000" → 10.000 → 10.0
|
| 311 |
+
s = s.replace(',', '.')
|
| 312 |
+
else:
|
| 313 |
+
# USD/other: treat comma as thousands separator
|
| 314 |
+
# "10,000" → 10000
|
| 315 |
+
s = s.replace(',', '')
|
| 316 |
+
elif comma_count == 1 and len(after_last_comma) <= 4 and after_last_comma.isdigit():
|
| 317 |
+
# Single comma with 1, 2, or 4 digits after → European decimal
|
| 318 |
# "261,49" → 261.49, "1234,5678" → 1234.5678
|
| 319 |
s = s.replace(',', '.')
|
| 320 |
elif len(after_last_comma) == 3 and comma_count >= 1:
|
| 321 |
+
# Multiple commas with 3 digits after last → thousands separator
|
| 322 |
+
# "1,234,567" → 1234567
|
| 323 |
s = s.replace(',', '')
|
| 324 |
else:
|
| 325 |
# Multiple commas → thousands separator
|
|
|
|
| 627 |
"%m-%d-%y", "%m/%d/%y", "%m.%d.%y", "%m %d %y",
|
| 628 |
|
| 629 |
# ISO with 2-digit year
|
| 630 |
+
"%y-%m-%d", "%y/%m/%d", "%y.%m.%d", "%y %m %d",
|
| 631 |
|
| 632 |
# Compact formats
|
| 633 |
"%y%m%d", "%d%m%y", "%m%d%y", "%d%m%Y", "%m%d%Y", "%Y%d%m",
|
|
|
|
| 668 |
# Extraction prompt (JSON format)
|
| 669 |
EXTRACTION_PROMPT = """Please carefully examine this invoice image and extract all the information into the following structured JSON format. Pay close attention to details and ensure accuracy in number formatting and text extraction.
|
| 670 |
Extract the data into this exact JSON structure:
|
|
|
|
| 671 |
{
|
| 672 |
"header": {
|
| 673 |
"invoice_no": "Invoice number or reference ID",
|
|
|
|
| 704 |
"currency": "Currency code (USD, EUR, etc.)"
|
| 705 |
}
|
| 706 |
}
|
|
|
|
|
|
|
| 707 |
IMPORTANT GUIDELINES:
|
| 708 |
- Extract only the bank account details matching the invoice currency.
|
| 709 |
Example:
|
|
|
|
| 821 |
|
| 822 |
data = json.loads(text_to_parse)
|
| 823 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 824 |
header = data.get("header", {})
|
| 825 |
summary = data.get("summary", {})
|
| 826 |
items = data.get("items", [])
|
| 827 |
+
|
| 828 |
+
# Get currency first for date parsing and amount parsing
|
| 829 |
currency = summary.get("currency", "")
|
| 830 |
|
| 831 |
+
def clean_amount(value):
|
| 832 |
+
"""Parse monetary amounts using clean_float with currency awareness"""
|
| 833 |
+
return clean_float(value, currency)
|
| 834 |
+
|
| 835 |
result = {
|
| 836 |
"Invoice Number": header.get("invoice_no", ""),
|
| 837 |
"Invoice Date": normalize_date(header.get("invoice_date", ""), currency),
|
|
|
|
| 1094 |
return None
|
| 1095 |
|
| 1096 |
def clean_number(x):
|
| 1097 |
+
"""Parse monetary amounts - use clean_float with currency awareness"""
|
| 1098 |
+
return clean_float(x, currency)
|
| 1099 |
|
| 1100 |
def collect_keys(obj, out):
|
| 1101 |
if isinstance(obj, dict):
|