Seth0330 committed on
Commit
bb4d429
·
verified ·
1 Parent(s): 788110c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +15 -13
app.py CHANGED
@@ -291,29 +291,31 @@ def extract_text_from_unstract(uploaded_file):
291
 
292
  def clean_num(val):
293
  """
294
- Extracts and converts a numeric value from a string.
295
- Handles:
296
- - Commas (e.g., "9,070.26")
297
- - Currency symbols (e.g., "USD", "$")
298
- - Words in the string (e.g., "Invoice Total USD 9,070.26")
299
- - Returns None if not found.
300
  """
301
  if val is None:
302
  return None
303
  if isinstance(val, (int, float)):
304
  return float(val)
305
- # Look for the last valid number in the string
306
  matches = re.findall(r"[-+]?\d[\d,]*\.?\d*", str(val))
307
  if matches:
308
- # Remove commas and convert to float
309
- num = matches[-1].replace(",", "")
310
- try:
311
- return float(num)
312
- except Exception:
313
- return None
 
 
314
  return None
315
 
316
 
 
317
  def normalize(s):
318
  if not s: return ""
319
  return re.sub(r"\W+", "", str(s).lower().strip())
 
291
 
292
  def clean_num(val):
293
  """
294
+ Extract the most relevant numeric value from a string (currency, label, commas, etc.).
295
+ Examples:
296
+ - 'Invoice Total USD 9,070.26' -> 9070.26
297
+ - '$194.41' -> 194.41
298
+ - 194.41 -> 194.41
 
299
  """
300
  if val is None:
301
  return None
302
  if isinstance(val, (int, float)):
303
  return float(val)
304
+ # Find *all* numbers in the string (with commas, decimals, etc.)
305
  matches = re.findall(r"[-+]?\d[\d,]*\.?\d*", str(val))
306
  if matches:
307
+ # Pick the number with the most digits after removing commas
308
+ cleaned = [m.replace(',', '') for m in matches if m]
309
+ if cleaned:
310
+ # Return the largest float (usually the total)
311
+ as_floats = [float(c) for c in cleaned if c.replace('.', '', 1).isdigit()]
312
+ if as_floats:
313
+ # Pick the biggest one (most likely to be the invoice total)
314
+ return max(as_floats)
315
  return None
316
 
317
 
318
+
319
  def normalize(s):
320
  if not s: return ""
321
  return re.sub(r"\W+", "", str(s).lower().strip())