Ankushbl6 committed on
Commit
8d68d78
·
verified ·
1 Parent(s): 3b295dc

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +33 -20
src/streamlit_app.py CHANGED
@@ -229,14 +229,18 @@ def clean_tax_percentage(x) -> float:
229
  except ValueError:
230
  return 0.0
231
 
232
- def clean_float(x) -> float:
233
  """
234
  Parse a number string handling both US and European formats.
235
  Use this for MONETARY AMOUNTS only, NOT for tax percentages.
236
-
237
  US Format: 1,234,567.89 (comma = thousands, period = decimal)
238
  European: 1.234.567,89 (period = thousands, comma = decimal)
239
-
 
 
 
 
240
  Examples:
241
  "1,234.56" → 1234.56 (US)
242
  "1.234,56" → 1234.56 (European)
@@ -296,14 +300,26 @@ def clean_float(x) -> float:
296
  # Only commas present
297
  # Check what comes after the LAST comma
298
  after_last_comma = s[last_comma + 1:] if last_comma < len(s) - 1 else ""
299
-
300
- if comma_count == 1 and len(after_last_comma) <= 4 and after_last_comma.isdigit():
301
- # Single comma with 1-4 digits after β†’ European decimal
 
 
 
 
 
 
 
 
 
 
 
 
302
  # "261,49" → 261.49, "1234,5678" → 1234.5678
303
  s = s.replace(',', '.')
304
  elif len(after_last_comma) == 3 and comma_count >= 1:
305
- # 3 digits after comma(s) β†’ likely thousands separator
306
- # "1,234" → 1234, "1,234,567" → 1234567
307
  s = s.replace(',', '')
308
  else:
309
  # Multiple commas β†’ thousands separator
@@ -611,7 +627,7 @@ def parse_date_to_object(date_str, currency=None):
611
  "%m-%d-%y", "%m/%d/%y", "%m.%d.%y", "%m %d %y",
612
 
613
  # ISO with 2-digit year
614
- "%y-%m-%d", "%y/%m/%d", "%y.%m.%d", "%y %m %y",
615
 
616
  # Compact formats
617
  "%y%m%d", "%d%m%y", "%m%d%y", "%d%m%Y", "%m%d%Y", "%Y%d%m",
@@ -652,7 +668,6 @@ def run_inference_vllm(image: Image.Image):
652
  # Extraction prompt (JSON format)
653
  EXTRACTION_PROMPT = """Please carefully examine this invoice image and extract all the information into the following structured JSON format. Pay close attention to details and ensure accuracy in number formatting and text extraction.
654
  Extract the data into this exact JSON structure:
655
-
656
  {
657
  "header": {
658
  "invoice_no": "Invoice number or reference ID",
@@ -689,8 +704,6 @@ Extract the data into this exact JSON structure:
689
  "currency": "Currency code (USD, EUR, etc.)"
690
  }
691
  }
692
-
693
-
694
  IMPORTANT GUIDELINES:
695
  - Extract only the bank account details matching the invoice currency.
696
  Example:
@@ -808,17 +821,17 @@ def parse_vllm_json(raw_json_text):
808
 
809
  data = json.loads(text_to_parse)
810
 
811
- def clean_amount(value):
812
- """Parse monetary amounts using clean_float"""
813
- return clean_float(value)
814
-
815
  header = data.get("header", {})
816
  summary = data.get("summary", {})
817
  items = data.get("items", [])
818
-
819
- # Get currency first for date parsing (USD uses MM/DD/YYYY for numeric dates)
820
  currency = summary.get("currency", "")
821
 
 
 
 
 
822
  result = {
823
  "Invoice Number": header.get("invoice_no", ""),
824
  "Invoice Date": normalize_date(header.get("invoice_date", ""), currency),
@@ -1081,8 +1094,8 @@ def map_prediction_to_ui(pred):
1081
  return None
1082
 
1083
  def clean_number(x):
1084
- """Parse monetary amounts - use clean_float"""
1085
- return clean_float(x)
1086
 
1087
  def collect_keys(obj, out):
1088
  if isinstance(obj, dict):
 
229
  except ValueError:
230
  return 0.0
231
 
232
+ def clean_float(x, currency=None) -> float:
233
  """
234
  Parse a number string handling both US and European formats.
235
  Use this for MONETARY AMOUNTS only, NOT for tax percentages.
236
+
237
  US Format: 1,234,567.89 (comma = thousands, period = decimal)
238
  European: 1.234.567,89 (period = thousands, comma = decimal)
239
+
240
+ Currency-aware for ambiguous cases (3 digits after comma):
241
+ - EUR: "10,000" → 10.0 (European decimal)
242
+ - USD/other: "10,000" → 10000 (thousands separator)
243
+
244
  Examples:
245
  "1,234.56" → 1234.56 (US)
246
  "1.234,56" → 1234.56 (European)
 
300
  # Only commas present
301
  # Check what comes after the LAST comma
302
  after_last_comma = s[last_comma + 1:] if last_comma < len(s) - 1 else ""
303
+
304
+ if comma_count == 1 and len(after_last_comma) == 3 and after_last_comma.isdigit():
305
+ # AMBIGUOUS CASE: Single comma with exactly 3 digits after
306
+ # Could be US thousands ("10,000" = 10000) or European decimal ("10,000" = 10.0)
307
+ # Use currency to decide:
308
+ if currency and currency.upper() == 'EUR':
309
+ # European: treat comma as decimal separator
310
+ # "10,000" → 10.000 → 10.0
311
+ s = s.replace(',', '.')
312
+ else:
313
+ # USD/other: treat comma as thousands separator
314
+ # "10,000" → 10000
315
+ s = s.replace(',', '')
316
+ elif comma_count == 1 and len(after_last_comma) <= 4 and after_last_comma.isdigit():
317
+ # Single comma with 1, 2, or 4 digits after β†’ European decimal
318
  # "261,49" → 261.49, "1234,5678" → 1234.5678
319
  s = s.replace(',', '.')
320
  elif len(after_last_comma) == 3 and comma_count >= 1:
321
+ # Multiple commas with 3 digits after last β†’ thousands separator
322
+ # "1,234,567" → 1234567
323
  s = s.replace(',', '')
324
  else:
325
  # Multiple commas β†’ thousands separator
 
627
  "%m-%d-%y", "%m/%d/%y", "%m.%d.%y", "%m %d %y",
628
 
629
  # ISO with 2-digit year
630
+ "%y-%m-%d", "%y/%m/%d", "%y.%m.%d", "%y %m %d",
631
 
632
  # Compact formats
633
  "%y%m%d", "%d%m%y", "%m%d%y", "%d%m%Y", "%m%d%Y", "%Y%d%m",
 
668
  # Extraction prompt (JSON format)
669
  EXTRACTION_PROMPT = """Please carefully examine this invoice image and extract all the information into the following structured JSON format. Pay close attention to details and ensure accuracy in number formatting and text extraction.
670
  Extract the data into this exact JSON structure:
 
671
  {
672
  "header": {
673
  "invoice_no": "Invoice number or reference ID",
 
704
  "currency": "Currency code (USD, EUR, etc.)"
705
  }
706
  }
 
 
707
  IMPORTANT GUIDELINES:
708
  - Extract only the bank account details matching the invoice currency.
709
  Example:
 
821
 
822
  data = json.loads(text_to_parse)
823
 
 
 
 
 
824
  header = data.get("header", {})
825
  summary = data.get("summary", {})
826
  items = data.get("items", [])
827
+
828
+ # Get currency first for date parsing and amount parsing
829
  currency = summary.get("currency", "")
830
 
831
+ def clean_amount(value):
832
+ """Parse monetary amounts using clean_float with currency awareness"""
833
+ return clean_float(value, currency)
834
+
835
  result = {
836
  "Invoice Number": header.get("invoice_no", ""),
837
  "Invoice Date": normalize_date(header.get("invoice_date", ""), currency),
 
1094
  return None
1095
 
1096
  def clean_number(x):
1097
+ """Parse monetary amounts - use clean_float with currency awareness"""
1098
+ return clean_float(x, currency)
1099
 
1100
  def collect_keys(obj, out):
1101
  if isinstance(obj, dict):