DOC_VALID_AGENT

Sleeping

App Files Files Community

Seth0330 committed on Jun 17, 2025

Commit

2c7ba82

verified ·

1 Parent(s): bb4d429

Update app.py

Browse files

Files changed (1) hide show

app.py +135 -169

app.py CHANGED Viewed

@@ -195,7 +195,6 @@ def get_extraction_prompt(model_choice, txt):
     )
 def ensure_total_due(invoice_header):
-    # If total_due is missing, try to find a close equivalent
     if invoice_header.get("total_due") in [None, ""]:
         for field in ["invoice_total", "invoice_value", "total_before_tax", "balance_due", "amount_paid"]:
             if field in invoice_header and invoice_header[field]:
@@ -203,33 +202,6 @@ def ensure_total_due(invoice_header):
                 break
     return invoice_header
-def extract_invoice_info(model_choice, text):
-    prompt = get_extraction_prompt(model_choice, text)
-    raw = query_llm(model_choice, prompt)
-    if not raw:
-        return None
-    data = clean_json_response(raw)
-    if not data:
-        return None
-    hdr = data.get("invoice_header", {})
-    if not hdr and any(k in data for k in ("invoice_number","supplier_name","customer_name")):
-        hdr = data
-    for k in ("invoice_number","invoice_date","po_number","invoice_value","supplier_name","customer_name"):
-        hdr.setdefault(k, None)
-    if not hdr.get("supplier_name"):
-        hdr["supplier_name"] = fallback_supplier(text)
-    # Guarantee total_due is always present (if at all possible)
-    hdr = ensure_total_due(hdr)
-    items = data.get("line_items", [])
-    if not isinstance(items, list):
-        items = []
-    for itm in items:
-        if not isinstance(itm, dict):
-            continue
-        for k in ("item_number","description","quantity","unit_price","total_price"):
-            itm.setdefault(k, None)
-    return {"invoice_header": hdr, "line_items": items}
 def get_content_type(filename):
     mime, _ = mimetypes.guess_type(filename)
     ext = filename.lower().split('.')[-1]
@@ -289,36 +261,96 @@ def extract_text_from_unstract(uploaded_file):
     except Exception:
         return r.text
-def clean_num(val):
-    """
-    Extract the most relevant numeric value from a string (currency, label, commas, etc.).
-    Examples:
-    - 'Invoice Total USD 9,070.26' -> 9070.26
-    - '$194.41' -> 194.41
-    - 194.41 -> 194.41
-    """
-    if val is None:
-        return None
-    if isinstance(val, (int, float)):
-        return float(val)
-    # Find *all* numbers in the string (with commas, decimals, etc.)
-    matches = re.findall(r"[-+]?\d[\d,]*\.?\d*", str(val))
-    if matches:
-        # Pick the number with the most digits after removing commas
-        cleaned = [m.replace(',', '') for m in matches if m]
-        if cleaned:
-            # Return the largest float (usually the total)
-            as_floats = [float(c) for c in cleaned if c.replace('.', '', 1).isdigit()]
-            if as_floats:
-                # Pick the biggest one (most likely to be the invoice total)
-                return max(as_floats)
-    return None
-def normalize(s):
-    if not s: return ""
-    return re.sub(r"\W+", "", str(s).lower().strip())
 st.sidebar.header("Step 1: Upload Active Purchase Orders (POs)")
 po_file = st.sidebar.file_uploader(
@@ -344,6 +376,7 @@ if st.button("Extract") and inv_file:
     with st.spinner("Extracting text from document using Unstract..."):
         text = extract_text_from_unstract(inv_file)
     if text:
         extracted_info = extract_invoice_info(mdl, text)
         if extracted_info:
             if "invoice_header" in extracted_info:
@@ -355,126 +388,60 @@ if st.button("Extract") and inv_file:
             st.table(extracted_info["line_items"])
             st.session_state['last_extracted_info'] = extracted_info
-# Always retrieve latest extracted info and PO df from session state!
 extracted_info = st.session_state.get('last_extracted_info', None)
 po_df = st.session_state.get('last_po_df', None)
 def po_match_tool_func(input_text):
     invoice = st.session_state.get("last_extracted_info")
     po_df = st.session_state.get("last_po_df")
-    debug = {}
     if invoice is None or po_df is None:
         return json.dumps({
             "decision": "REJECTED",
             "reason": "Invoice or PO data not found.",
-            "debug": debug,
         })
-    inv_hdr = invoice["invoice_header"]
-    inv_po_number = (inv_hdr.get("purchase_order_number") or
-                     inv_hdr.get("order_number") or
-                     inv_hdr.get("our_order_number") or "")
-    inv_supplier = inv_hdr.get("supplier_name") or ""
-    inv_total = inv_hdr.get("total_due")   # <<--- ALWAYS USE total_due
-    inv_total = clean_num(inv_total)
-    inv_line_items = invoice.get("line_items", [])
-    debug["inv_po_number"] = inv_po_number
-    debug["inv_supplier"] = inv_supplier
-    debug["inv_total"] = inv_total
-    explanation = []
-    best_match = None
-    best_match_type = None
-    match_row_debug = None
-    for idx, row in po_df.iterrows():
-        po_number = str(row.get("PO Number", ""))
-        po_number_clean = normalize(po_number)
-        inv_po_number_clean = normalize(inv_po_number)
-        supplier = str(row.get("Supplier Name", ""))
-        supplier_clean = normalize(supplier)
-        inv_supplier_clean = normalize(inv_supplier)
-        po_total = clean_num(row.get("Total PO Value", ""))
-        po_desc = str(row.get("Description", "")).lower()
-        po_match = (po_number_clean in inv_po_number_clean or inv_po_number_clean in po_number_clean) and po_number_clean
-        supplier_score = fuzz.token_set_ratio(supplier, inv_supplier)
-        supplier_match = supplier_score >= 90
-        total_match = False
-        if po_total is not None and inv_total is not None:
-            total_match = abs(po_total - inv_total) < 1  # $1 tolerance
-        debug_row = {
-            "row_po_number": po_number,
-            "row_supplier": supplier,
-            "row_total": po_total,
-            "po_match": po_match,
-            "supplier_score": supplier_score,
-            "supplier_match": supplier_match,
-            "total_match": total_match,
-            "row_desc": po_desc,
-        }
-        if po_match and supplier_match and total_match:
-            best_match = row
-            best_match_type = "APPROVED"
-            explanation.append(f"PO Number, Supplier Name, and Total Due all matched. PO: {row.to_dict()}")
-            match_row_debug = debug_row
-            break
-        elif (po_match or supplier_match) and not total_match:
-            best_match = row
-            best_match_type = "PARTIALLY APPROVED"
-            fields = []
-            if po_match:
-                fields.append("PO Number matched")
-            if supplier_match:
-                fields.append("Supplier Name matched (fuzzy)")
-            explanation.append(f"{' and '.join(fields)}, but Total Due did not match. PO: {row.to_dict()}")
-            match_row_debug = debug_row
-            break
-    # If no direct match, try line item fuzzy matching
-    if best_match is None and len(inv_line_items) > 0:
-        for idx, row in po_df.iterrows():
-            po_desc = str(row.get("Description", "")).lower()
-            po_total = clean_num(row.get("Total PO Value", ""))
-            line_item_matched = False
-            for line in inv_line_items:
-                desc = (line.get("description") or "").lower()
-                if not desc: continue
-                score = fuzz.token_set_ratio(desc, po_desc)
-                if (desc and po_desc and score >= 80):
-                    line_item_matched = True
-                    explanation.append(f"Line item '{desc}' matched PO description '{po_desc}' with score {score}. PO: {row.to_dict()}")
-                    break
-            if line_item_matched and po_total is not None and inv_total is not None and abs(po_total - inv_total) < 1:
-                best_match = row
-                best_match_type = "APPROVED"
-                match_row_debug = {
-                    "row_desc": po_desc,
-                    "line_item_desc": desc,
-                    "fuzzy_score": score,
-                    "po_total": po_total,
-                    "inv_total": inv_total,
-                    "total_match": abs(po_total - inv_total) < 1,
-                }
-                break
-    debug["matched_po_row"] = match_row_debug
-    if best_match is not None:
-        return json.dumps({
-            "decision": best_match_type,
-            "reason": " | ".join(explanation),
-            "debug": debug
-        })
-    else:
-        return json.dumps({
-            "decision": "REJECTED",
-            "reason": "No match found on PO Number, Supplier Name, Total Due, or any line item (including fuzzy match).",
-            "debug": debug
-        })
 if po_df is not None:
     st.session_state["last_po_df"] = po_df
@@ -492,7 +459,7 @@ if extracted_info is not None and po_df is not None:
             Tool(
                 name="po_match_tool",
                 func=po_match_tool_func,
-                description="Check if the invoice matches any PO (headers or fuzzy line items).",
             )
         ]
         decision_llm = ChatOpenAI(
@@ -509,12 +476,9 @@ if extracted_info is not None and po_df is not None:
         )
         prompt = (
             "You are an expert accounts payable agent. "
-            "Use po_match_tool to check matches based on the following business rules:\n"
-            "- If PO Number AND Supplier Name AND Total Value all match, the invoice is APPROVED.\n"
-            "- If PO Number OR Supplier Name match, but Total Value does not, the invoice is PARTIALLY APPROVED.\n"
-            "- If neither, try matching at least one line item (by fuzzy description, quantity, or price) and require total to match for APPROVED.\n"
-            "- Otherwise, REJECTED.\n"
-            "Call the tool and return its result as-is. Do not invent or guess the answer, do not add any comments outside the JSON.\n"
             f"Invoice JSON:\n{json.dumps(extracted_info, indent=2)}"
         )
         with st.spinner("AI is reasoning and making a decision..."):
@@ -523,10 +487,12 @@ if extracted_info is not None and po_df is not None:
             result_json = json.loads(result)
             st.write(f"**Decision:** {result_json.get('decision', 'N/A')}")
             st.write(f"**Reason:** {result_json.get('reason', 'N/A')}")
-            with st.expander("Debug"):
                 st.json(result_json.get('debug'))
                 st.subheader("Extracted Invoice JSON")
                 st.json(extracted_info)
         except Exception:
             st.subheader("AI Decision & Reason")
             st.write(result)

     )
 def ensure_total_due(invoice_header):
     if invoice_header.get("total_due") in [None, ""]:
         for field in ["invoice_total", "invoice_value", "total_before_tax", "balance_due", "amount_paid"]:
             if field in invoice_header and invoice_header[field]:
                 break
     return invoice_header
 def get_content_type(filename):
     mime, _ = mimetypes.guess_type(filename)
     ext = filename.lower().split('.')[-1]
     except Exception:
         return r.text
+def weighted_fuzzy_score(s1, s2):
+    """Return a fuzzy similarity score between two field values.
+
+    Each value is converted to lower-case text and the pair is compared with
+    ``fuzz.token_set_ratio``. Missing values -- anything falsy (``None``,
+    ``""``, ``0``) or a float NaN such as pandas produces for a blank cell --
+    are treated as empty text, so they are never compared as the literal
+    words "none" or "nan".
+
+    Args:
+        s1: First value; any type, converted with ``str``.
+        s2: Second value; any type, converted with ``str``.
+
+    Returns:
+        100 when both values are missing, otherwise whatever
+        ``fuzz.token_set_ratio`` returns for the two lower-cased texts.
+    """
+    def _text(value):
+        # NaN is the only value that is not equal to itself.
+        if not value or value != value:
+            return ""
+        return str(value).lower()
+
+    left, right = _text(s1), _text(s2)
+    if not left and not right:
+        return 100
+    return fuzz.token_set_ratio(left, right)
+def _parse_amount(val):
+    """Return the largest number found in *val* as a float, or None.
+
+    Numeric inputs are returned as floats (NaN counts as missing). Anything
+    else is converted with ``str`` and scanned for digit groups, so labels,
+    currency symbols and thousands separators are tolerated, e.g.
+    'Invoice Total USD 9,070.26' -> 9070.26. Signs are ignored.
+    """
+    # Imported locally so the helper does not rely on a module-level import.
+    import re
+
+    if val is None:
+        return None
+    if isinstance(val, (int, float)):
+        # NaN is the only value that differs from itself.
+        return None if val != val else float(val)
+    numbers = [
+        float(token.replace(",", ""))
+        for token in re.findall(r"\d[\d,]*(?:\.\d+)?", str(val))
+    ]
+    return max(numbers) if numbers else None
+
+
+def _same_number(a, b):
+    """Return True only when both numbers are present and equal to the cent."""
+    return a is not None and b is not None and abs(a - b) < 0.005
+
+
+def _first_present(mapping, *keys):
+    """Return the first non-None value stored under *keys*, or None."""
+    for key in keys:
+        value = mapping.get(key)
+        if value is not None:
+            return value
+    return None
+
+
+def find_best_po_match(inv, po_df):
+    """Score every PO row against the invoice and return the best one.
+
+    Header fields (supplier, ship-to, bill-to, payment terms, currency) are
+    compared with ``weighted_fuzzy_score``; the invoice ``total_due`` must be
+    within 2 of the row's "PO Total Value" to count as a match; and the best
+    scoring invoice line item is compared with the row's item columns. The
+    header weights (0.25, 0.1, 0.1, 0.1, 0.05, 0.2, 0.2) and the line-item
+    weights (0.5, 0.2, 0.15, 0.15) each sum to 1, so scores stay on a 0-100
+    scale.
+
+    Args:
+        inv: Mapping with an ``"invoice_header"`` mapping and an optional
+            ``"line_items"`` list of mappings.
+        po_df: Table exposing ``iterrows()`` whose rows support ``.get``;
+            presumably a pandas DataFrame of purchase orders -- confirm the
+            column names used below against the uploaded PO file.
+
+    Returns:
+        ``(row, score, reason, debug)`` for the highest scoring row (the
+        first one wins ties), or ``(None, 0, "No POs found.", {})`` when the
+        table has no rows.
+
+    Raises:
+        KeyError: If *inv* has no ``"invoice_header"`` entry.
+    """
+    # NOTE(review): the module-level ``clean_num`` helper appears to be
+    # removed in this revision, so amounts are parsed with the private
+    # ``_parse_amount`` helper above instead of depending on it.
+    inv_hdr = inv["invoice_header"]
+    inv_supplier = inv_hdr.get("supplier_name") or ""
+    inv_ship_to = inv_hdr.get("ship_to_name") or ""
+    inv_bill_to = inv_hdr.get("bill_to_name") or ""
+    inv_payment_terms = inv_hdr.get("payment_terms") or ""
+    inv_currency = inv_hdr.get("currency") or ""
+    inv_total_due = _parse_amount(inv_hdr.get("total_due"))
+    # Entries that are not mappings cannot be scored, so they are ignored.
+    inv_line_items = [
+        item for item in (inv.get("line_items") or [])
+        if isinstance(item, dict)
+    ]
+
+    best = None
+    for idx, row in po_df.iterrows():
+        po_supplier = row.get("Supplier Name", "")
+        po_ship_to = row.get("Ship To", "")
+        po_bill_to = row.get("Bill To", "")
+        po_payment_terms = row.get("Payment Terms", "")
+        po_currency = row.get("Currency", "")
+        po_total = _parse_amount(row.get("PO Total Value", ""))
+        po_desc = row.get("Item Description", "")
+        po_qty = _parse_amount(row.get("Item Quantity", ""))
+        po_unit = _parse_amount(row.get("Item Unit Price", ""))
+        po_line_total = _parse_amount(row.get("Line Item Total", ""))
+        # Weighted fuzzy scores
+        s_supplier = weighted_fuzzy_score(inv_supplier, po_supplier)
+        s_ship_to = weighted_fuzzy_score(inv_ship_to, po_ship_to)
+        s_bill_to = weighted_fuzzy_score(inv_bill_to, po_bill_to)
+        s_terms = weighted_fuzzy_score(inv_payment_terms, po_payment_terms)
+        s_currency = weighted_fuzzy_score(inv_currency, po_currency)
+        totals_known = inv_total_due is not None and po_total is not None
+        s_total = 100 if totals_known and abs(inv_total_due - po_total) < 2 else 0
+        # Check for at least one line item strong match
+        line_item_score = 0
+        line_reason = ""
+        for line in inv_line_items:
+            desc_score = weighted_fuzzy_score(line.get("description") or "", po_desc)
+            # A value missing on both sides is not a match, so each numeric
+            # comparison requires both numbers to be present.
+            line_qty = _parse_amount(line.get("quantity"))
+            qty_score = 100 if _same_number(line_qty, po_qty) else 0
+            # Prefer the keys the extractor guarantees ("unit_price",
+            # "total_price"); fall back to the shorter spellings.
+            line_unit = _parse_amount(_first_present(line, "unit_price", "price"))
+            unit_score = 100 if _same_number(line_unit, po_unit) else 0
+            line_amount = _parse_amount(_first_present(line, "total_price", "amount"))
+            amount_score = 100 if _same_number(line_amount, po_line_total) else 0
+            candidate = (desc_score * 0.5 + qty_score * 0.2
+                         + unit_score * 0.15 + amount_score * 0.15)
+            if candidate > line_item_score:
+                line_item_score = candidate
+                line_reason = (f"Best line item: desc_score={desc_score}, qty_score={qty_score}, "
+                               f"unit_score={unit_score}, amount_score={amount_score}")
+        # Score weights (tune as needed)
+        total_score = (
+            s_supplier * 0.25 +
+            s_ship_to * 0.1 +
+            s_bill_to * 0.1 +
+            s_terms * 0.1 +
+            s_currency * 0.05 +
+            s_total * 0.2 +
+            line_item_score * 0.2
+        )
+        # Strictly-greater keeps the earliest row on ties, matching a stable
+        # descending sort, without building and sorting a full list.
+        if best is not None and total_score <= best[1]:
+            continue
+        reason = (
+            f"Supplier match: {s_supplier}/100, Ship To: {s_ship_to}/100, "
+            f"Bill To: {s_bill_to}/100, Payment Terms: {s_terms}/100, Currency: {s_currency}/100, "
+            f"Total Due: {'match' if s_total else 'no match'}, "
+            f"Line item best match: {int(line_item_score)}/100. {line_reason}"
+        )
+        debug = {
+            "po_idx": idx,
+            "po_supplier": po_supplier,
+            "po_ship_to": po_ship_to,
+            "po_bill_to": po_bill_to,
+            "po_total": po_total,
+            "s_supplier": s_supplier,
+            "s_ship_to": s_ship_to,
+            "s_bill_to": s_bill_to,
+            "s_terms": s_terms,
+            "s_currency": s_currency,
+            "s_total": s_total,
+            "line_item_score": line_item_score,
+            "total_score": total_score,
+            "line_reason": line_reason,
+            "inv_total_due": inv_total_due
+        }
+        best = (row, total_score, reason, debug)
+    if best is None:
+        return None, 0, "No POs found.", {}
+    return best
 st.sidebar.header("Step 1: Upload Active Purchase Orders (POs)")
 po_file = st.sidebar.file_uploader(
     with st.spinner("Extracting text from document using Unstract..."):
         text = extract_text_from_unstract(inv_file)
     if text:
+        prompt = get_extraction_prompt(mdl, text)
         extracted_info = extract_invoice_info(mdl, text)
         if extracted_info:
             if "invoice_header" in extracted_info:
             st.table(extracted_info["line_items"])
             st.session_state['last_extracted_info'] = extracted_info
 extracted_info = st.session_state.get('last_extracted_info', None)
 po_df = st.session_state.get('last_po_df', None)
 def po_match_tool_func(input_text):
+    """Tool callback: match the stored invoice against the stored PO table.
+
+    Reads ``last_extracted_info`` and ``last_po_df`` from
+    ``st.session_state``; ``input_text`` is accepted but not used.
+
+    Returns:
+        A JSON string with ``decision``, ``reason`` and ``debug`` keys, plus
+        ``po_row`` once a match has been attempted.
+    """
     invoice = st.session_state.get("last_extracted_info")
     po_df = st.session_state.get("last_po_df")
     if invoice is None or po_df is None:
+        # Nothing to compare yet: reject with an empty debug payload.
         return json.dumps({
             "decision": "REJECTED",
             "reason": "Invoice or PO data not found.",
+            "debug": {},
         })
+    best_row, best_score, reason, debug = find_best_po_match(invoice, po_df)
+    # Score bands: above 85 approves, above 70 partially approves,
+    # anything else rejects.
+    if best_score > 85:
+        status = "APPROVED"
+    elif best_score > 70:
+        status = "PARTIALLY APPROVED"
+    else:
+        status = "REJECTED"
+    return json.dumps({
+        "decision": status,
+        # int() truncates, so a score of 85.6 is reported as 85.
+        "reason": f"Best match score: {int(best_score)}/100. {reason}",
+        "debug": debug,
+        "po_row": best_row.to_dict() if best_row is not None else None
+    })
+def extract_invoice_info(model_choice, text):
+    """Extract invoice data from *text* with the chosen model and normalise it.
+
+    Builds a prompt with ``get_extraction_prompt``, sends it through
+    ``query_llm`` and parses the reply with ``clean_json_response``. The
+    header is taken from ``"invoice_header"``, or from the top level of the
+    reply when the header is missing or unusable and the reply itself carries
+    header-like keys. Expected header and line-item keys are filled with
+    ``None`` when absent.
+
+    Args:
+        model_choice: Model selector passed through to the prompt builder and
+            the LLM call.
+        text: Raw invoice text.
+
+    Returns:
+        ``{"invoice_header": dict, "line_items": list}``, or ``None`` when
+        the model returns nothing or the reply does not parse to a JSON
+        object.
+    """
+    # NOTE(review): in this revision the definition sits below the
+    # ``st.button("Extract")`` block that calls it. A script executed top to
+    # bottom would hit that call before this ``def`` has run -- confirm the
+    # ordering in the full file.
+    prompt = get_extraction_prompt(model_choice, text)
+    raw = query_llm(model_choice, prompt)
+    if not raw:
+        return None
+    data = clean_json_response(raw)
+    # A reply that parses to a list or scalar has no header to read.
+    if not data or not isinstance(data, dict):
+        return None
+    hdr = data.get("invoice_header")
+    if not isinstance(hdr, dict) or not hdr:
+        # Covers a missing, null, empty or non-object header: fall back to
+        # the flat reply when it looks like a header, else start empty.
+        looks_flat = any(
+            k in data for k in ("invoice_number", "supplier_name", "customer_name")
+        )
+        hdr = data if looks_flat else {}
+    for k in ("invoice_number","invoice_date","po_number","invoice_value","supplier_name","customer_name"):
+        hdr.setdefault(k, None)
+    if not hdr.get("supplier_name"):
+        hdr["supplier_name"] = fallback_supplier(text)
+    hdr = ensure_total_due(hdr)
+    items = data.get("line_items", [])
+    if not isinstance(items, list):
+        items = []
+    for itm in items:
+        # Non-dict entries are left in place untouched.
+        if not isinstance(itm, dict):
+            continue
+        for k in ("item_number","description","quantity","unit_price","total_price"):
+            itm.setdefault(k, None)
+    return {"invoice_header": hdr, "line_items": items}
 if po_df is not None:
     st.session_state["last_po_df"] = po_df
             Tool(
                 name="po_match_tool",
                 func=po_match_tool_func,
+                description="Smartly match invoice to PO using all possible fields.",
             )
         ]
         decision_llm = ChatOpenAI(
         )
         prompt = (
+            # Adjacent string literals are joined with no separator, so each
+            # sentence carries its own trailing space.
             "You are an expert accounts payable agent. "
+            "Use po_match_tool to check for the best possible match using supplier, ship to, bill to, payment terms, currency, line items, and total value. "
+            "Weigh the importance of each field as an expert would. "
+            "Return a JSON with decision (APPROVED, PARTIALLY APPROVED, REJECTED), reason (include field scores and reasoning), debug, and the best matched PO row.\n"
             f"Invoice JSON:\n{json.dumps(extracted_info, indent=2)}"
         )
         with st.spinner("AI is reasoning and making a decision..."):
             result_json = json.loads(result)
             st.write(f"**Decision:** {result_json.get('decision', 'N/A')}")
             st.write(f"**Reason:** {result_json.get('reason', 'N/A')}")
+            with st.expander("Debug & Matching Details"):
                 st.json(result_json.get('debug'))
                 st.subheader("Extracted Invoice JSON")
                 st.json(extracted_info)
+                st.subheader("Matched PO Row")
+                st.json(result_json.get('po_row'))
         except Exception:
             st.subheader("AI Decision & Reason")
             st.write(result)