DOC_VALID_AGENT

Sleeping

App Files Files Community

Seth0330 committed on Jun 17, 2025

Commit

dc0c728

verified ·

1 Parent(s): 2c7ba82

Update app.py

Browse files

Files changed (1) hide show

app.py +101 -84

app.py CHANGED Viewed

@@ -11,6 +11,7 @@ from langchain_community.chat_models import ChatOpenAI
 from langchain.agents import initialize_agent, Tool, AgentType
 from fuzzywuzzy import fuzz
 st.set_page_config(page_title="Accounts Payable AI Agent", layout="wide")
 MODELS = {
@@ -45,6 +46,8 @@ MODELS = {
     },
 }
 def get_api_key(model_choice):
     key = os.getenv(MODELS[model_choice]["key_env"])
     if not key:
@@ -202,64 +205,18 @@ def ensure_total_due(invoice_header):
                 break
     return invoice_header
-def get_content_type(filename):
-    mime, _ = mimetypes.guess_type(filename)
-    ext = filename.lower().split('.')[-1]
-    if ext == "pdf":
-        return "text/plain"
-    if mime is None:
-        return "application/octet-stream"
-    return mime
-UNSTRACT_BASE = "https://llmwhisperer-api.us-central.unstract.com/api/v2"
-UNSTRACT_API_KEY = os.getenv("UNSTRACT_API_KEY")
-def extract_text_from_unstract(uploaded_file):
-    filename = getattr(uploaded_file, "name", "uploaded_file")
-    file_bytes = uploaded_file.read()
-    content_type = get_content_type(filename)
-    headers = {
-        "unstract-key": UNSTRACT_API_KEY,
-        "Content-Type": content_type,
-    }
-    url = f"{UNSTRACT_BASE}/whisper"
-    with st.spinner("Uploading and processing document with Unstract..."):
-        r = requests.post(url, headers=headers, data=file_bytes)
-        if r.status_code != 202:
-            st.error(f"Unstract: Error uploading file: {r.status_code} - {r.text}")
-            return None
-        whisper_hash = r.json().get("whisper_hash")
-        if not whisper_hash:
-            st.error("Unstract: No whisper_hash received.")
-            return None
-    status_url = f"{UNSTRACT_BASE}/whisper-status?whisper_hash={whisper_hash}"
-    status_placeholder = st.empty()
-    for i in range(30):
-        status_r = requests.get(status_url, headers={"unstract-key": UNSTRACT_API_KEY})
-        if status_r.status_code != 200:
-            st.error(f"Unstract: Error checking status: {status_r.status_code} - {status_r.text}")
-            return None
-        status = status_r.json().get("status")
-        if status == "processed":
-            status_placeholder.info("Unstract status: processed! 🎉")
-            break
-        status_placeholder.info(f"Unstract status: {status or 'waiting'}... ({i+1})")
-        time.sleep(2)
-    else:
-        status_placeholder.error("Unstract: Timeout waiting for OCR to finish.")
-        return None
-    retrieve_url = f"{UNSTRACT_BASE}/whisper-retrieve?whisper_hash={whisper_hash}&text_only=true"
-    r = requests.get(retrieve_url, headers={"unstract-key": UNSTRACT_API_KEY})
-    if r.status_code != 200:
-        st.error(f"Unstract: Error retrieving extracted text: {r.status_code} - {r.text}")
         return None
-    try:
-        data = r.json()
-        return data.get("result_text") or r.text
-    except Exception:
-        return r.text
 def weighted_fuzzy_score(s1, s2):
     if not s1 and not s2:
@@ -352,6 +309,93 @@ def find_best_po_match(inv, po_df):
     best_row, best_score, reason, debug = scores[0]
     return best_row, best_score, reason, debug
 st.sidebar.header("Step 1: Upload Active Purchase Orders (POs)")
 po_file = st.sidebar.file_uploader(
     "Upload POs CSV (must include PO number, Supplier, Items, etc.)",
@@ -376,7 +420,6 @@ if st.button("Extract") and inv_file:
     with st.spinner("Extracting text from document using Unstract..."):
         text = extract_text_from_unstract(inv_file)
     if text:
-        prompt = get_extraction_prompt(mdl, text)
         extracted_info = extract_invoice_info(mdl, text)
         if extracted_info:
             if "invoice_header" in extracted_info:
@@ -417,32 +460,6 @@ def po_match_tool_func(input_text):
         "po_row": best_row.to_dict() if best_row is not None else None
     })
-def extract_invoice_info(model_choice, text):
-    prompt = get_extraction_prompt(model_choice, text)
-    raw = query_llm(model_choice, prompt)
-    if not raw:
-        return None
-    data = clean_json_response(raw)
-    if not data:
-        return None
-    hdr = data.get("invoice_header", {})
-    if not hdr and any(k in data for k in ("invoice_number","supplier_name","customer_name")):
-        hdr = data
-    for k in ("invoice_number","invoice_date","po_number","invoice_value","supplier_name","customer_name"):
-        hdr.setdefault(k, None)
-    if not hdr.get("supplier_name"):
-        hdr["supplier_name"] = fallback_supplier(text)
-    hdr = ensure_total_due(hdr)
-    items = data.get("line_items", [])
-    if not isinstance(items, list):
-        items = []
-    for itm in items:
-        if not isinstance(itm, dict):
-            continue
-        for k in ("item_number","description","quantity","unit_price","total_price"):
-            itm.setdefault(k, None)
-    return {"invoice_header": hdr, "line_items": items}
 if po_df is not None:
     st.session_state["last_po_df"] = po_df

 from langchain.agents import initialize_agent, Tool, AgentType
 from fuzzywuzzy import fuzz
+# --- CONFIGURATION ---
 st.set_page_config(page_title="Accounts Payable AI Agent", layout="wide")
 MODELS = {
     },
 }
+# --- UTILITY FUNCTIONS ---
 def get_api_key(model_choice):
     key = os.getenv(MODELS[model_choice]["key_env"])
     if not key:
                 break
     return invoice_header
def clean_num(val):
    """Extract a numeric value from *val* and return it as a float.

    Args:
        val: ``None``, a number, or any object whose string form may
            contain one or more numbers (e.g. ``"$1,234.56"``).

    Returns:
        ``None`` when *val* is ``None`` or contains no number; ``float(val)``
        when *val* is already an ``int``/``float``; otherwise the largest
        number found in ``str(val)``, with thousands separators removed.
    """
    if val is None:
        return None
    if isinstance(val, (int, float)):
        return float(val)

    # The pattern accepts an optional sign, so signed tokens must be kept.
    # The previous ``isdigit()`` filter silently dropped every "+"/"-" token,
    # turning inputs such as "-5.00" into None.
    tokens = re.findall(r"[-+]?\d[\d,]*\.?\d*", str(val))
    numbers = []
    for token in tokens:
        try:
            numbers.append(float(token.replace(",", "")))
        except ValueError:
            # Defensive only: the pattern should always yield a valid float.
            continue
    return max(numbers) if numbers else None
 def weighted_fuzzy_score(s1, s2):
     if not s1 and not s2:
     best_row, best_score, reason, debug = scores[0]
     return best_row, best_score, reason, debug
def extract_invoice_info(model_choice, text):
    """Ask the selected model to extract invoice data from *text*.

    Builds the extraction prompt, queries the model, parses the reply with
    ``clean_json_response`` and normalises the result so that every expected
    key is present.

    Args:
        model_choice: Model identifier forwarded to ``get_extraction_prompt``
            and ``query_llm``.
        text: Raw document text to extract from.

    Returns:
        ``{"invoice_header": dict, "line_items": list}``, or ``None`` when
        the model returns nothing or the reply cannot be parsed into a dict.
    """
    header_keys = (
        "invoice_number", "invoice_date", "po_number",
        "invoice_value", "supplier_name", "customer_name",
    )
    item_keys = ("item_number", "description", "quantity", "unit_price", "total_price")

    prompt = get_extraction_prompt(model_choice, text)
    raw = query_llm(model_choice, prompt)
    if not raw:
        return None

    data = clean_json_response(raw)
    # A non-dict payload (e.g. a bare JSON list) cannot be normalised below.
    if not data or not isinstance(data, dict):
        return None

    hdr = data.get("invoice_header")
    # Guard against "invoice_header": null (or any non-dict), which would
    # otherwise crash on ``setdefault``.
    if not isinstance(hdr, dict):
        hdr = {}
    # Some replies are flat: header fields sit at the top level of the payload.
    if not hdr and any(k in data for k in ("invoice_number", "supplier_name", "customer_name")):
        hdr = data

    for key in header_keys:
        hdr.setdefault(key, None)
    if not hdr.get("supplier_name"):
        hdr["supplier_name"] = fallback_supplier(text)
    hdr = ensure_total_due(hdr)

    items = data.get("line_items", [])
    if not isinstance(items, list):
        items = []
    for itm in items:
        if not isinstance(itm, dict):
            continue
        for key in item_keys:
            itm.setdefault(key, None)

    return {"invoice_header": hdr, "line_items": items}
def get_content_type(filename):
    """Return the Content-Type header value to send for *filename*.

    Args:
        filename: Name of the uploaded file; may be empty or ``None``.

    Returns:
        ``"text/plain"`` for names ending in ``.pdf`` (case-insensitive),
        the MIME type guessed from the name otherwise, or
        ``"application/octet-stream"`` when no type can be guessed.
    """
    name = filename or ""
    # Test the real suffix: splitting on "." and taking the last piece would
    # treat an extensionless file literally named "pdf" as a PDF.
    if name.lower().endswith(".pdf"):
        # NOTE(review): PDFs are reported as text/plain rather than
        # application/pdf; kept as-is — confirm this is what the OCR
        # endpoint expects.
        return "text/plain"
    mime, _ = mimetypes.guess_type(name)
    return mime if mime is not None else "application/octet-stream"
+UNSTRACT_BASE = "https://llmwhisperer-api.us-central.unstract.com/api/v2"
+UNSTRACT_API_KEY = os.getenv("UNSTRACT_API_KEY")
def extract_text_from_unstract(
    uploaded_file,
    *,
    max_polls=30,
    poll_interval=2,
    request_timeout=60,
):
    """Upload a document to the Unstract OCR endpoint and return its text.

    The file is posted to ``/whisper``, the job is polled via
    ``/whisper-status`` until it reports ``"processed"``, and the text is
    fetched from ``/whisper-retrieve``. Failures are reported through
    ``st.error`` instead of being raised.

    Args:
        uploaded_file: File-like object with ``read()`` and, optionally,
            a ``name`` attribute.
        max_polls: Maximum number of status checks before giving up.
        poll_interval: Seconds to sleep between status checks.
        request_timeout: Per-request timeout in seconds; without it a
            stalled connection would block the app indefinitely.

    Returns:
        The extracted text, or ``None`` if any step fails or times out.
    """
    filename = getattr(uploaded_file, "name", "uploaded_file")
    # Rewind first so a previously consumed upload is not sent as empty bytes.
    if hasattr(uploaded_file, "seek"):
        uploaded_file.seek(0)
    file_bytes = uploaded_file.read()

    auth_headers = {"unstract-key": UNSTRACT_API_KEY}
    upload_headers = {**auth_headers, "Content-Type": get_content_type(filename)}

    with st.spinner("Uploading and processing document with Unstract..."):
        try:
            r = requests.post(
                f"{UNSTRACT_BASE}/whisper",
                headers=upload_headers,
                data=file_bytes,
                timeout=request_timeout,
            )
        except requests.RequestException as exc:
            st.error(f"Unstract: Error uploading file: {exc}")
            return None
        if r.status_code != 202:
            st.error(f"Unstract: Error uploading file: {r.status_code} - {r.text}")
            return None
        try:
            whisper_hash = r.json().get("whisper_hash")
        except (ValueError, AttributeError):
            # Body was not JSON, or not a JSON object.
            whisper_hash = None
        if not whisper_hash:
            st.error("Unstract: No whisper_hash received.")
            return None

    status_placeholder = st.empty()
    for attempt in range(1, max_polls + 1):
        try:
            status_r = requests.get(
                f"{UNSTRACT_BASE}/whisper-status",
                headers=auth_headers,
                params={"whisper_hash": whisper_hash},
                timeout=request_timeout,
            )
        except requests.RequestException as exc:
            st.error(f"Unstract: Error checking status: {exc}")
            return None
        if status_r.status_code != 200:
            st.error(f"Unstract: Error checking status: {status_r.status_code} - {status_r.text}")
            return None
        try:
            status = status_r.json().get("status")
        except (ValueError, AttributeError):
            status = None
        if status == "processed":
            status_placeholder.info("Unstract status: processed! 🎉")
            break
        status_placeholder.info(f"Unstract status: {status or 'waiting'}... ({attempt})")
        time.sleep(poll_interval)
    else:
        # Loop ran out without ever seeing "processed".
        status_placeholder.error("Unstract: Timeout waiting for OCR to finish.")
        return None

    try:
        r = requests.get(
            f"{UNSTRACT_BASE}/whisper-retrieve",
            headers=auth_headers,
            params={"whisper_hash": whisper_hash, "text_only": "true"},
            timeout=request_timeout,
        )
    except requests.RequestException as exc:
        st.error(f"Unstract: Error retrieving extracted text: {exc}")
        return None
    if r.status_code != 200:
        st.error(f"Unstract: Error retrieving extracted text: {r.status_code} - {r.text}")
        return None

    # The body may be a JSON object carrying "result_text" or plain text;
    # fall back to the raw body in every other case.
    try:
        data = r.json()
    except ValueError:
        return r.text
    if isinstance(data, dict):
        return data.get("result_text") or r.text
    return r.text
+# --- UI/LOGIC ---
 st.sidebar.header("Step 1: Upload Active Purchase Orders (POs)")
 po_file = st.sidebar.file_uploader(
     "Upload POs CSV (must include PO number, Supplier, Items, etc.)",
     with st.spinner("Extracting text from document using Unstract..."):
         text = extract_text_from_unstract(inv_file)
     if text:
         extracted_info = extract_invoice_info(mdl, text)
         if extracted_info:
             if "invoice_header" in extracted_info:
         "po_row": best_row.to_dict() if best_row is not None else None
     })
 if po_df is not None:
     st.session_state["last_po_df"] = po_df