PDF_Upload

Sleeping

App Files Community

Seth0330 committed on May 30, 2025

Commit

08560ee

verified ·

1 Parent(s): 8c52b14

Update app.py

Browse files

Files changed (1) hide show

app.py +16 -38

app.py CHANGED Viewed

@@ -116,7 +116,6 @@ def fallback_supplier(text):
     return None
 def get_extraction_prompt(model_choice, txt):
-    # Example output shows both header & line items
     return (
         "Extract every possible piece of metadata and detail from the following invoice text—including all header information, supplier details, customer details, addresses, invoice numbers, dates, tax information, payment terms, references, summary totals, and a full list of line items with as many columns as possible. "
         "Return a structured JSON with two keys: 'invoice_header' (an object with all header fields found) and 'line_items' (an array of all detected line items and their attributes). "
@@ -158,7 +157,6 @@ def get_extraction_prompt(model_choice, txt):
         f"{txt}"
     )
 def extract_invoice_info(model_choice, text):
     prompt = get_extraction_prompt(model_choice, text)
     raw = query_llm(model_choice, prompt)
@@ -168,18 +166,20 @@ def extract_invoice_info(model_choice, text):
     if not data:
         return None
-    # DeepSeek models: flat format
     if model_choice.startswith("DeepSeek"):
-        # Dynamically handle flat or semi-structured output (may contain any fields)
-        data.setdefault("line_items", [])
-        for itm in data["line_items"]:
             if not isinstance(itm, dict):
                 continue
             for k in ("description","quantity","unit_price","total_price"):
                 itm.setdefault(k, None)
-        return data
-    # Other models (OpenAI GPT-4.1, Mistral): usually nested under invoice_header, but now prompt is broader, so handle flexibly
-    # Accepts a flexible schema as model may include extra keys
     hdr = data.get("invoice_header", {})
     if not hdr and any(k in data for k in ("invoice_number","supplier_name","customer_name")):
         # If model returned flat, treat top-level keys as header
@@ -189,15 +189,14 @@ def extract_invoice_info(model_choice, text):
     if not hdr.get("supplier_name"):
         hdr["supplier_name"] = fallback_supplier(text)
     items = data.get("line_items", [])
     for itm in items:
         if not isinstance(itm, dict):
             continue
         for k in ("item_number","description","quantity","unit_price","total_price"):
             itm.setdefault(k, None)
-    data["invoice_header"] = hdr
-    data["line_items"] = items
-    return data
 # ---- UI ----
 tab1, tab2 = st.tabs(["PDF Summarizer","Invoice Extractor"])
@@ -222,31 +221,10 @@ with tab2:
         info = extract_invoice_info(mdl, txt)
         if info:
             st.success("Extraction Complete")
-            # For DeepSeek, output may be flat; for others, prefer "invoice_header" nesting
-            if mdl.startswith("DeepSeek"):
-                # Show all keys except line_items
-                non_items = {k: v for k, v in info.items() if k != "line_items"}
-                c1, c2 = st.columns(2)
-                for i, (k, v) in enumerate(non_items.items()):
-                    (c1 if i % 2 == 0 else c2).metric(k.replace("_", " ").title(), v)
-                st.subheader("Line Items")
-                st.table(info.get("line_items", []))
-            else:
-                h = info.get("invoice_header", {})
-                c1, c2, c3 = st.columns(3)
-                c1.metric("Invoice #", h.get("invoice_number"))
-                c1.metric("Supplier", h.get("supplier_name"))
-                c2.metric("Date", h.get("invoice_date"))
-                c2.metric("Customer", h.get("customer_name"))
-                c3.metric("PO #", h.get("po_number"))
-                c3.metric("Total", h.get("invoice_value"))
-                # Show any additional header fields detected
-                extra_fields = {k: v for k, v in h.items() if k not in ("invoice_number", "supplier_name", "customer_name", "invoice_date", "po_number", "invoice_value")}
-                if extra_fields:
-                    st.subheader("Additional Header Metadata")
-                    st.json(extra_fields)
-                st.subheader("Line Items")
-                st.table(info.get("line_items", []))
     if "last_api" in st.session_state:
         with st.expander("Debug"):

     return None
 def get_extraction_prompt(model_choice, txt):
     return (
         "Extract every possible piece of metadata and detail from the following invoice text—including all header information, supplier details, customer details, addresses, invoice numbers, dates, tax information, payment terms, references, summary totals, and a full list of line items with as many columns as possible. "
         "Return a structured JSON with two keys: 'invoice_header' (an object with all header fields found) and 'line_items' (an array of all detected line items and their attributes). "
         f"{txt}"
     )
 def extract_invoice_info(model_choice, text):
     prompt = get_extraction_prompt(model_choice, text)
     raw = query_llm(model_choice, prompt)
     if not data:
         return None
+    # DeepSeek models: flat format, but we standardize to always return "invoice_header" and "line_items"
     if model_choice.startswith("DeepSeek"):
+        # Put all keys except "line_items" into invoice_header
+        header = {k: v for k, v in data.items() if k != "line_items"}
+        items = data.get("line_items", [])
+        if not isinstance(items, list):
+            items = []
+        for itm in items:
             if not isinstance(itm, dict):
                 continue
             for k in ("description","quantity","unit_price","total_price"):
                 itm.setdefault(k, None)
+        return {"invoice_header": header, "line_items": items}
+    # Other models (OpenAI GPT-4.1, Mistral): expect proper structure
     hdr = data.get("invoice_header", {})
     if not hdr and any(k in data for k in ("invoice_number","supplier_name","customer_name")):
         # If model returned flat, treat top-level keys as header
     if not hdr.get("supplier_name"):
         hdr["supplier_name"] = fallback_supplier(text)
     items = data.get("line_items", [])
+    if not isinstance(items, list):
+        items = []
     for itm in items:
         if not isinstance(itm, dict):
             continue
         for k in ("item_number","description","quantity","unit_price","total_price"):
             itm.setdefault(k, None)
+    return {"invoice_header": hdr, "line_items": items}
 # ---- UI ----
 tab1, tab2 = st.tabs(["PDF Summarizer","Invoice Extractor"])
         info = extract_invoice_info(mdl, txt)
         if info:
             st.success("Extraction Complete")
+            st.subheader("Invoice Metadata")
+            st.table([{k.replace("_", " ").title(): v for k, v in info["invoice_header"].items()}])
+            st.subheader("Line Items")
+            st.table(info["line_items"])
     if "last_api" in st.session_state:
         with st.expander("Debug"):