Seth0330 committed on
Commit
2c7ba82
·
verified ·
1 Parent(s): bb4d429

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +135 -169
app.py CHANGED
@@ -195,7 +195,6 @@ def get_extraction_prompt(model_choice, txt):
195
  )
196
 
197
  def ensure_total_due(invoice_header):
198
- # If total_due is missing, try to find a close equivalent
199
  if invoice_header.get("total_due") in [None, ""]:
200
  for field in ["invoice_total", "invoice_value", "total_before_tax", "balance_due", "amount_paid"]:
201
  if field in invoice_header and invoice_header[field]:
@@ -203,33 +202,6 @@ def ensure_total_due(invoice_header):
203
  break
204
  return invoice_header
205
 
206
- def extract_invoice_info(model_choice, text):
207
- prompt = get_extraction_prompt(model_choice, text)
208
- raw = query_llm(model_choice, prompt)
209
- if not raw:
210
- return None
211
- data = clean_json_response(raw)
212
- if not data:
213
- return None
214
- hdr = data.get("invoice_header", {})
215
- if not hdr and any(k in data for k in ("invoice_number","supplier_name","customer_name")):
216
- hdr = data
217
- for k in ("invoice_number","invoice_date","po_number","invoice_value","supplier_name","customer_name"):
218
- hdr.setdefault(k, None)
219
- if not hdr.get("supplier_name"):
220
- hdr["supplier_name"] = fallback_supplier(text)
221
- # Guarantee total_due is always present (if at all possible)
222
- hdr = ensure_total_due(hdr)
223
- items = data.get("line_items", [])
224
- if not isinstance(items, list):
225
- items = []
226
- for itm in items:
227
- if not isinstance(itm, dict):
228
- continue
229
- for k in ("item_number","description","quantity","unit_price","total_price"):
230
- itm.setdefault(k, None)
231
- return {"invoice_header": hdr, "line_items": items}
232
-
233
  def get_content_type(filename):
234
  mime, _ = mimetypes.guess_type(filename)
235
  ext = filename.lower().split('.')[-1]
@@ -289,36 +261,96 @@ def extract_text_from_unstract(uploaded_file):
289
  except Exception:
290
  return r.text
291
 
292
- def clean_num(val):
293
- """
294
- Extract the most relevant numeric value from a string (currency, label, commas, etc.).
295
- Examples:
296
- - 'Invoice Total USD 9,070.26' -> 9070.26
297
- - '$194.41' -> 194.41
298
- - 194.41 -> 194.41
299
- """
300
- if val is None:
301
- return None
302
- if isinstance(val, (int, float)):
303
- return float(val)
304
- # Find *all* numbers in the string (with commas, decimals, etc.)
305
- matches = re.findall(r"[-+]?\d[\d,]*\.?\d*", str(val))
306
- if matches:
307
- # Pick the number with the most digits after removing commas
308
- cleaned = [m.replace(',', '') for m in matches if m]
309
- if cleaned:
310
- # Return the largest float (usually the total)
311
- as_floats = [float(c) for c in cleaned if c.replace('.', '', 1).isdigit()]
312
- if as_floats:
313
- # Pick the biggest one (most likely to be the invoice total)
314
- return max(as_floats)
315
- return None
316
-
317
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
318
 
319
- def normalize(s):
320
- if not s: return ""
321
- return re.sub(r"\W+", "", str(s).lower().strip())
 
 
 
322
 
323
  st.sidebar.header("Step 1: Upload Active Purchase Orders (POs)")
324
  po_file = st.sidebar.file_uploader(
@@ -344,6 +376,7 @@ if st.button("Extract") and inv_file:
344
  with st.spinner("Extracting text from document using Unstract..."):
345
  text = extract_text_from_unstract(inv_file)
346
  if text:
 
347
  extracted_info = extract_invoice_info(mdl, text)
348
  if extracted_info:
349
  if "invoice_header" in extracted_info:
@@ -355,126 +388,60 @@ if st.button("Extract") and inv_file:
355
  st.table(extracted_info["line_items"])
356
  st.session_state['last_extracted_info'] = extracted_info
357
 
358
- # Always retrieve latest extracted info and PO df from session state!
359
  extracted_info = st.session_state.get('last_extracted_info', None)
360
  po_df = st.session_state.get('last_po_df', None)
361
 
362
  def po_match_tool_func(input_text):
363
  invoice = st.session_state.get("last_extracted_info")
364
  po_df = st.session_state.get("last_po_df")
365
- debug = {}
366
  if invoice is None or po_df is None:
367
  return json.dumps({
368
  "decision": "REJECTED",
369
  "reason": "Invoice or PO data not found.",
370
- "debug": debug,
371
  })
372
 
373
- inv_hdr = invoice["invoice_header"]
374
- inv_po_number = (inv_hdr.get("purchase_order_number") or
375
- inv_hdr.get("order_number") or
376
- inv_hdr.get("our_order_number") or "")
377
- inv_supplier = inv_hdr.get("supplier_name") or ""
378
- inv_total = inv_hdr.get("total_due") # <<--- ALWAYS USE total_due
379
- inv_total = clean_num(inv_total)
380
- inv_line_items = invoice.get("line_items", [])
381
-
382
- debug["inv_po_number"] = inv_po_number
383
- debug["inv_supplier"] = inv_supplier
384
- debug["inv_total"] = inv_total
385
-
386
- explanation = []
387
- best_match = None
388
- best_match_type = None
389
- match_row_debug = None
390
-
391
- for idx, row in po_df.iterrows():
392
- po_number = str(row.get("PO Number", ""))
393
- po_number_clean = normalize(po_number)
394
- inv_po_number_clean = normalize(inv_po_number)
395
- supplier = str(row.get("Supplier Name", ""))
396
- supplier_clean = normalize(supplier)
397
- inv_supplier_clean = normalize(inv_supplier)
398
- po_total = clean_num(row.get("Total PO Value", ""))
399
- po_desc = str(row.get("Description", "")).lower()
400
-
401
- po_match = (po_number_clean in inv_po_number_clean or inv_po_number_clean in po_number_clean) and po_number_clean
402
- supplier_score = fuzz.token_set_ratio(supplier, inv_supplier)
403
- supplier_match = supplier_score >= 90
404
- total_match = False
405
- if po_total is not None and inv_total is not None:
406
- total_match = abs(po_total - inv_total) < 1 # $1 tolerance
407
-
408
- debug_row = {
409
- "row_po_number": po_number,
410
- "row_supplier": supplier,
411
- "row_total": po_total,
412
- "po_match": po_match,
413
- "supplier_score": supplier_score,
414
- "supplier_match": supplier_match,
415
- "total_match": total_match,
416
- "row_desc": po_desc,
417
- }
418
-
419
- if po_match and supplier_match and total_match:
420
- best_match = row
421
- best_match_type = "APPROVED"
422
- explanation.append(f"PO Number, Supplier Name, and Total Due all matched. PO: {row.to_dict()}")
423
- match_row_debug = debug_row
424
- break
425
- elif (po_match or supplier_match) and not total_match:
426
- best_match = row
427
- best_match_type = "PARTIALLY APPROVED"
428
- fields = []
429
- if po_match:
430
- fields.append("PO Number matched")
431
- if supplier_match:
432
- fields.append("Supplier Name matched (fuzzy)")
433
- explanation.append(f"{' and '.join(fields)}, but Total Due did not match. PO: {row.to_dict()}")
434
- match_row_debug = debug_row
435
- break
436
 
437
- # If no direct match, try line item fuzzy matching
438
- if best_match is None and len(inv_line_items) > 0:
439
- for idx, row in po_df.iterrows():
440
- po_desc = str(row.get("Description", "")).lower()
441
- po_total = clean_num(row.get("Total PO Value", ""))
442
- line_item_matched = False
443
- for line in inv_line_items:
444
- desc = (line.get("description") or "").lower()
445
- if not desc: continue
446
- score = fuzz.token_set_ratio(desc, po_desc)
447
- if (desc and po_desc and score >= 80):
448
- line_item_matched = True
449
- explanation.append(f"Line item '{desc}' matched PO description '{po_desc}' with score {score}. PO: {row.to_dict()}")
450
- break
451
- if line_item_matched and po_total is not None and inv_total is not None and abs(po_total - inv_total) < 1:
452
- best_match = row
453
- best_match_type = "APPROVED"
454
- match_row_debug = {
455
- "row_desc": po_desc,
456
- "line_item_desc": desc,
457
- "fuzzy_score": score,
458
- "po_total": po_total,
459
- "inv_total": inv_total,
460
- "total_match": abs(po_total - inv_total) < 1,
461
- }
462
- break
463
 
464
- debug["matched_po_row"] = match_row_debug
 
 
 
 
 
465
 
466
- if best_match is not None:
467
- return json.dumps({
468
- "decision": best_match_type,
469
- "reason": " | ".join(explanation),
470
- "debug": debug
471
- })
472
- else:
473
- return json.dumps({
474
- "decision": "REJECTED",
475
- "reason": "No match found on PO Number, Supplier Name, Total Due, or any line item (including fuzzy match).",
476
- "debug": debug
477
- })
 
 
 
 
 
 
 
 
 
 
 
 
 
478
 
479
  if po_df is not None:
480
  st.session_state["last_po_df"] = po_df
@@ -492,7 +459,7 @@ if extracted_info is not None and po_df is not None:
492
  Tool(
493
  name="po_match_tool",
494
  func=po_match_tool_func,
495
- description="Check if the invoice matches any PO (headers or fuzzy line items).",
496
  )
497
  ]
498
  decision_llm = ChatOpenAI(
@@ -509,12 +476,9 @@ if extracted_info is not None and po_df is not None:
509
  )
510
  prompt = (
511
  "You are an expert accounts payable agent. "
512
- "Use po_match_tool to check matches based on the following business rules:\n"
513
- "- If PO Number AND Supplier Name AND Total Value all match, the invoice is APPROVED.\n"
514
- "- If PO Number OR Supplier Name match, but Total Value does not, the invoice is PARTIALLY APPROVED.\n"
515
- "- If neither, try matching at least one line item (by fuzzy description, quantity, or price) and require total to match for APPROVED.\n"
516
- "- Otherwise, REJECTED.\n"
517
- "Call the tool and return its result as-is. Do not invent or guess the answer, do not add any comments outside the JSON.\n"
518
  f"Invoice JSON:\n{json.dumps(extracted_info, indent=2)}"
519
  )
520
  with st.spinner("AI is reasoning and making a decision..."):
@@ -523,10 +487,12 @@ if extracted_info is not None and po_df is not None:
523
  result_json = json.loads(result)
524
  st.write(f"**Decision:** {result_json.get('decision', 'N/A')}")
525
  st.write(f"**Reason:** {result_json.get('reason', 'N/A')}")
526
- with st.expander("Debug"):
527
  st.json(result_json.get('debug'))
528
  st.subheader("Extracted Invoice JSON")
529
  st.json(extracted_info)
 
 
530
  except Exception:
531
  st.subheader("AI Decision & Reason")
532
  st.write(result)
 
195
  )
196
 
197
  def ensure_total_due(invoice_header):
 
198
  if invoice_header.get("total_due") in [None, ""]:
199
  for field in ["invoice_total", "invoice_value", "total_before_tax", "balance_due", "amount_paid"]:
200
  if field in invoice_header and invoice_header[field]:
 
202
  break
203
  return invoice_header
204
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
205
  def get_content_type(filename):
206
  mime, _ = mimetypes.guess_type(filename)
207
  ext = filename.lower().split('.')[-1]
 
261
  except Exception:
262
  return r.text
263
 
264
def weighted_fuzzy_score(s1, s2):
    """Return a 0-100 similarity score for two field values.

    Both values are lower-cased and compared with ``fuzz.token_set_ratio``.
    Missing values are handled before the fuzzy comparison:

    * both missing  -> 100 (nothing contradicts a match)
    * one missing   -> 0   (a missing value cannot match a present one)

    ``None``, empty/whitespace-only strings and float NaN all count as
    missing. Without this, ``str(None)`` / ``str(nan)`` would be compared
    as the literal words "none" / "nan".

    Args:
        s1: First value (any type; converted with ``str``).
        s2: Second value (any type; converted with ``str``).

    Returns:
        A score between 0 and 100.
    """
    import math  # local import: the file's import block is outside this view

    def _as_text(value):
        # Normalise "no value" markers to the empty string.
        if value is None:
            return ""
        if isinstance(value, float) and math.isnan(value):
            return ""
        return str(value).strip().lower()

    left = _as_text(s1)
    right = _as_text(s2)
    if not left and not right:
        return 100
    if not left or not right:
        return 0
    return fuzz.token_set_ratio(left, right)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
268
 
269
def find_best_po_match(inv, po_df):
    """Score every PO row against an extracted invoice and return the best.

    Header fields are compared with ``weighted_fuzzy_score``; the invoice
    total is compared numerically against the row's "PO Total Value" with a
    tolerance of 2. Each invoice line item is scored against the row's
    single item columns and the best line contributes to the row score.

    Row weights: supplier 0.25, ship-to 0.10, bill-to 0.10, payment terms
    0.10, currency 0.05, total 0.20, best line item 0.20.
    Line weights: description 0.50, quantity 0.20, unit price 0.15,
    line amount 0.15.

    Args:
        inv: Mapping with an "invoice_header" dict and a "line_items" list.
        po_df: DataFrame of purchase orders. Columns read: "Supplier Name",
            "Ship To", "Bill To", "Payment Terms", "Currency",
            "PO Total Value", "Item Description", "Item Quantity",
            "Item Unit Price", "Line Item Total".

    Returns:
        ``(best_row, best_score, reason, debug)``. When ``po_df`` has no
        rows the result is ``(None, 0, "No POs found.", {})``. On ties the
        earliest row wins.
    """
    import numbers
    import re

    def _to_number(value):
        """Return the largest numeric token in *value* as a float, else None."""
        # Self-contained parser: the module-level numeric helper this code
        # used to rely on is not guaranteed to exist in the file.
        if value is None:
            return None
        if isinstance(value, numbers.Real):
            number = float(value)
            return None if number != number else number  # NaN -> missing
        tokens = re.findall(r"\d[\d,]*\.?\d*", str(value))
        parsed = [float(token.replace(",", "")) for token in tokens]
        return max(parsed) if parsed else None

    def _same_amount(left, right):
        """True only when both amounts are present and equal to the cent."""
        # Two missing values must not count as a match.
        if left is None or right is None:
            return False
        return abs(left - right) < 0.005

    def _first_present(mapping, *keys):
        """Return the first non-None value found under *keys*."""
        for key in keys:
            value = mapping.get(key)
            if value is not None:
                return value
        return None

    inv_hdr = inv.get("invoice_header") or {}
    inv_supplier = inv_hdr.get("supplier_name") or ""
    inv_ship_to = inv_hdr.get("ship_to_name") or ""
    inv_bill_to = inv_hdr.get("bill_to_name") or ""
    inv_payment_terms = inv_hdr.get("payment_terms") or ""
    inv_currency = inv_hdr.get("currency") or ""
    inv_total_due = _to_number(inv_hdr.get("total_due"))
    # Only dict entries can be scored; anything else is ignored.
    inv_line_items = [
        line for line in (inv.get("line_items") or []) if isinstance(line, dict)
    ]

    best = None
    for idx, row in po_df.iterrows():
        po_supplier = row.get("Supplier Name", "")
        po_ship_to = row.get("Ship To", "")
        po_bill_to = row.get("Bill To", "")
        po_payment_terms = row.get("Payment Terms", "")
        po_currency = row.get("Currency", "")
        po_total = _to_number(row.get("PO Total Value", ""))
        po_desc = row.get("Item Description", "")
        # Parsed once per row rather than once per invoice line.
        po_qty = _to_number(row.get("Item Quantity", ""))
        po_unit = _to_number(row.get("Item Unit Price", ""))
        po_line_total = _to_number(row.get("Line Item Total", ""))

        s_supplier = weighted_fuzzy_score(inv_supplier, po_supplier)
        s_ship_to = weighted_fuzzy_score(inv_ship_to, po_ship_to)
        s_bill_to = weighted_fuzzy_score(inv_bill_to, po_bill_to)
        s_terms = weighted_fuzzy_score(inv_payment_terms, po_payment_terms)
        s_currency = weighted_fuzzy_score(inv_currency, po_currency)
        totals_known = inv_total_due is not None and po_total is not None
        s_total = 100 if totals_known and abs(inv_total_due - po_total) < 2 else 0

        # Keep the best-scoring invoice line for this PO row.
        line_item_score = 0
        line_reason = ""
        for line in inv_line_items:
            desc_score = weighted_fuzzy_score(line.get("description", ""), po_desc)
            line_qty = _to_number(line.get("quantity"))
            # The extraction step populates "unit_price"/"total_price";
            # "price"/"amount" are kept as fallbacks for older payloads.
            line_unit = _to_number(_first_present(line, "unit_price", "price"))
            line_amount = _to_number(_first_present(line, "total_price", "amount"))
            qty_score = 100 if _same_amount(line_qty, po_qty) else 0
            unit_score = 100 if _same_amount(line_unit, po_unit) else 0
            amount_score = 100 if _same_amount(line_amount, po_line_total) else 0
            total = desc_score * 0.5 + qty_score * 0.2 + unit_score * 0.15 + amount_score * 0.15
            if total > line_item_score:
                line_item_score = total
                line_reason = (f"Best line item: desc_score={desc_score}, qty_score={qty_score}, "
                               f"unit_score={unit_score}, amount_score={amount_score}")

        # Score weights (tune as needed)
        total_score = (
            s_supplier * 0.25 +
            s_ship_to * 0.1 +
            s_bill_to * 0.1 +
            s_terms * 0.1 +
            s_currency * 0.05 +
            s_total * 0.2 +
            line_item_score * 0.2
        )
        reason = (
            f"Supplier match: {s_supplier}/100, Ship To: {s_ship_to}/100, "
            f"Bill To: {s_bill_to}/100, Payment Terms: {s_terms}/100, Currency: {s_currency}/100, "
            f"Total Due: {'match' if s_total else 'no match'}, "
            f"Line item best match: {int(line_item_score)}/100. {line_reason}"
        )
        debug = {
            "po_idx": idx,
            "po_supplier": po_supplier,
            "po_ship_to": po_ship_to,
            "po_bill_to": po_bill_to,
            "po_total": po_total,
            "s_supplier": s_supplier,
            "s_ship_to": s_ship_to,
            "s_bill_to": s_bill_to,
            "s_terms": s_terms,
            "s_currency": s_currency,
            "s_total": s_total,
            "line_item_score": line_item_score,
            "total_score": total_score,
            "line_reason": line_reason,
            "inv_total_due": inv_total_due,
        }
        # Strict ">" keeps the earliest row on ties.
        if best is None or total_score > best[1]:
            best = (row, total_score, reason, debug)

    if best is None:
        return None, 0, "No POs found.", {}
    return best
354
 
355
  st.sidebar.header("Step 1: Upload Active Purchase Orders (POs)")
356
  po_file = st.sidebar.file_uploader(
 
376
  with st.spinner("Extracting text from document using Unstract..."):
377
  text = extract_text_from_unstract(inv_file)
378
  if text:
379
+ prompt = get_extraction_prompt(mdl, text)
380
  extracted_info = extract_invoice_info(mdl, text)
381
  if extracted_info:
382
  if "invoice_header" in extracted_info:
 
388
  st.table(extracted_info["line_items"])
389
  st.session_state['last_extracted_info'] = extracted_info
390
 
 
391
  extracted_info = st.session_state.get('last_extracted_info', None)
392
  po_df = st.session_state.get('last_po_df', None)
393
 
394
  def po_match_tool_func(input_text):
395
  invoice = st.session_state.get("last_extracted_info")
396
  po_df = st.session_state.get("last_po_df")
 
397
  if invoice is None or po_df is None:
398
  return json.dumps({
399
  "decision": "REJECTED",
400
  "reason": "Invoice or PO data not found.",
401
+ "debug": {},
402
  })
403
 
404
+ best_row, best_score, reason, debug = find_best_po_match(invoice, po_df)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
405
 
406
+ if best_score > 85:
407
+ status = "APPROVED"
408
+ elif best_score > 70:
409
+ status = "PARTIALLY APPROVED"
410
+ else:
411
+ status = "REJECTED"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
412
 
413
+ return json.dumps({
414
+ "decision": status,
415
+ "reason": f"Best match score: {int(best_score)}/100. {reason}",
416
+ "debug": debug,
417
+ "po_row": best_row.to_dict() if best_row is not None else None
418
+ })
419
 
420
def extract_invoice_info(model_choice, text):
    """Extract a normalised invoice header and line items via the LLM.

    Builds the prompt with ``get_extraction_prompt``, sends it through
    ``query_llm`` and parses the reply with ``clean_json_response``.

    Args:
        model_choice: Model selector passed through to the prompt/LLM helpers.
        text: Raw invoice text.

    Returns:
        ``{"invoice_header": dict, "line_items": list[dict]}``, or ``None``
        when the LLM returns nothing or the parsed reply is not a non-empty
        dict. The header always has the keys invoice_number, invoice_date,
        po_number, invoice_value, supplier_name and customer_name; every
        line item always has item_number, description, quantity, unit_price
        and total_price (missing ones are set to ``None``).

    NOTE(review): in the post-commit file this definition sits below the
    ``st.button("Extract")`` handler that calls it. Top-level statements run
    in order, so the name appears to be unbound when that handler runs —
    confirm, and move this definition above the handler.
    """
    header_keys = ("invoice_number", "invoice_date", "po_number",
                   "invoice_value", "supplier_name", "customer_name")
    item_keys = ("item_number", "description", "quantity", "unit_price", "total_price")

    prompt = get_extraction_prompt(model_choice, text)
    raw = query_llm(model_choice, prompt)
    if not raw:
        return None
    data = clean_json_response(raw)
    # A list/str payload cannot be read as header + items; treat as a failure.
    if not data or not isinstance(data, dict):
        return None

    hdr = data.get("invoice_header")
    if not isinstance(hdr, dict):
        hdr = {}
    if not hdr and any(k in data for k in ("invoice_number", "supplier_name", "customer_name")):
        # Flat reply: header fields live at the top level. Copy them so the
        # header neither aliases the payload nor swallows the line items.
        hdr = {k: v for k, v in data.items() if k not in ("invoice_header", "line_items")}
    for k in header_keys:
        hdr.setdefault(k, None)
    if not hdr.get("supplier_name"):
        hdr["supplier_name"] = fallback_supplier(text)
    hdr = ensure_total_due(hdr)

    raw_items = data.get("line_items", [])
    if not isinstance(raw_items, list):
        raw_items = []
    # Drop malformed entries so downstream code can rely on dict line items.
    items = [itm for itm in raw_items if isinstance(itm, dict)]
    for itm in items:
        for k in item_keys:
            itm.setdefault(k, None)
    return {"invoice_header": hdr, "line_items": items}
445
 
446
  if po_df is not None:
447
  st.session_state["last_po_df"] = po_df
 
459
  Tool(
460
  name="po_match_tool",
461
  func=po_match_tool_func,
462
+ description="Smartly match invoice to PO using all possible fields.",
463
  )
464
  ]
465
  decision_llm = ChatOpenAI(
 
476
  )
477
  prompt = (
478
  "You are an expert accounts payable agent. "
479
+ "Use po_match_tool to check for the best possible match using supplier, ship to, bill to, payment terms, currency, line items, and total value."
480
+ "Weigh the importance of each field as an expert would."
481
+ "Return a JSON with decision (APPROVED, PARTIALLY APPROVED, REJECTED), reason (include field scores and reasoning), debug, and the best matched PO row.\n"
 
 
 
482
  f"Invoice JSON:\n{json.dumps(extracted_info, indent=2)}"
483
  )
484
  with st.spinner("AI is reasoning and making a decision..."):
 
487
  result_json = json.loads(result)
488
  st.write(f"**Decision:** {result_json.get('decision', 'N/A')}")
489
  st.write(f"**Reason:** {result_json.get('reason', 'N/A')}")
490
+ with st.expander("Debug & Matching Details"):
491
  st.json(result_json.get('debug'))
492
  st.subheader("Extracted Invoice JSON")
493
  st.json(extracted_info)
494
+ st.subheader("Matched PO Row")
495
+ st.json(result_json.get('po_row'))
496
  except Exception:
497
  st.subheader("AI Decision & Reason")
498
  st.write(result)