DOC_VALID_AGENT

Sleeping

Seth0330 committed on Jun 17, 2025

Commit

bb4d429

verified ·

1 Parent(s): 788110c

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -291,29 +291,31 @@ def extract_text_from_unstract(uploaded_file):
 def clean_num(val):
     """
-    Extracts and converts a numeric value from a string.
-    Handles:
-    - Commas (e.g., "9,070.26")
-    - Currency symbols (e.g., "USD", "$")
-    - Words in the string (e.g., "Invoice Total USD 9,070.26")
-    - Returns None if not found.
     """
     if val is None:
         return None
     if isinstance(val, (int, float)):
         return float(val)
-    # Look for the last valid number in the string
     matches = re.findall(r"[-+]?\d[\d,]*\.?\d*", str(val))
     if matches:
-        # Remove commas and convert to float
-        num = matches[-1].replace(",", "")
-        try:
-            return float(num)
-        except Exception:
-            return None
     return None
 def normalize(s):
     if not s: return ""
     return re.sub(r"\W+", "", str(s).lower().strip())

 def clean_num(val):
     """
+    Extract the most relevant numeric value from a string (currency, label, commas, etc.).
+    Examples:
+    - 'Invoice Total USD 9,070.26' -> 9070.26
+    - '$194.41' -> 194.41
+    - 194.41 -> 194.41
     """
     if val is None:
         return None
     if isinstance(val, (int, float)):
         return float(val)
+    # Find *all* numbers in the string (with commas, decimals, etc.)
     matches = re.findall(r"[-+]?\d[\d,]*\.?\d*", str(val))
     if matches:
+        # Pick the number with the most digits after removing commas
+        cleaned = [m.replace(',', '') for m in matches if m]
+        if cleaned:
+            # Return the largest float (usually the total)
+            as_floats = [float(c) for c in cleaned if c.replace('.', '', 1).isdigit()]
+            if as_floats:
+                # Pick the biggest one (most likely to be the invoice total)
+                return max(as_floats)
     return None
 def normalize(s):
     if not s: return ""
     return re.sub(r"\W+", "", str(s).lower().strip())