Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -116,7 +116,6 @@ def fallback_supplier(text):
|
|
| 116 |
return None
|
| 117 |
|
| 118 |
def get_extraction_prompt(model_choice, txt):
|
| 119 |
-
# Example output shows both header & line items
|
| 120 |
return (
|
| 121 |
"Extract every possible piece of metadata and detail from the following invoice text—including all header information, supplier details, customer details, addresses, invoice numbers, dates, tax information, payment terms, references, summary totals, and a full list of line items with as many columns as possible. "
|
| 122 |
"Return a structured JSON with two keys: 'invoice_header' (an object with all header fields found) and 'line_items' (an array of all detected line items and their attributes). "
|
|
@@ -158,7 +157,6 @@ def get_extraction_prompt(model_choice, txt):
|
|
| 158 |
f"{txt}"
|
| 159 |
)
|
| 160 |
|
| 161 |
-
|
| 162 |
def extract_invoice_info(model_choice, text):
|
| 163 |
prompt = get_extraction_prompt(model_choice, text)
|
| 164 |
raw = query_llm(model_choice, prompt)
|
|
@@ -168,18 +166,20 @@ def extract_invoice_info(model_choice, text):
|
|
| 168 |
if not data:
|
| 169 |
return None
|
| 170 |
|
| 171 |
-
# DeepSeek models: flat format
|
| 172 |
if model_choice.startswith("DeepSeek"):
|
| 173 |
-
#
|
| 174 |
-
data.
|
| 175 |
-
|
|
|
|
|
|
|
|
|
|
| 176 |
if not isinstance(itm, dict):
|
| 177 |
continue
|
| 178 |
for k in ("description","quantity","unit_price","total_price"):
|
| 179 |
itm.setdefault(k, None)
|
| 180 |
-
return
|
| 181 |
-
# Other models (OpenAI GPT-4.1, Mistral):
|
| 182 |
-
# Accepts a flexible schema as model may include extra keys
|
| 183 |
hdr = data.get("invoice_header", {})
|
| 184 |
if not hdr and any(k in data for k in ("invoice_number","supplier_name","customer_name")):
|
| 185 |
# If model returned flat, treat top-level keys as header
|
|
@@ -189,15 +189,14 @@ def extract_invoice_info(model_choice, text):
|
|
| 189 |
if not hdr.get("supplier_name"):
|
| 190 |
hdr["supplier_name"] = fallback_supplier(text)
|
| 191 |
items = data.get("line_items", [])
|
|
|
|
|
|
|
| 192 |
for itm in items:
|
| 193 |
if not isinstance(itm, dict):
|
| 194 |
continue
|
| 195 |
for k in ("item_number","description","quantity","unit_price","total_price"):
|
| 196 |
itm.setdefault(k, None)
|
| 197 |
-
|
| 198 |
-
data["line_items"] = items
|
| 199 |
-
|
| 200 |
-
return data
|
| 201 |
|
| 202 |
# ---- UI ----
|
| 203 |
tab1, tab2 = st.tabs(["PDF Summarizer","Invoice Extractor"])
|
|
@@ -222,31 +221,10 @@ with tab2:
|
|
| 222 |
info = extract_invoice_info(mdl, txt)
|
| 223 |
if info:
|
| 224 |
st.success("Extraction Complete")
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
c1, c2 = st.columns(2)
|
| 230 |
-
for i, (k, v) in enumerate(non_items.items()):
|
| 231 |
-
(c1 if i % 2 == 0 else c2).metric(k.replace("_", " ").title(), v)
|
| 232 |
-
st.subheader("Line Items")
|
| 233 |
-
st.table(info.get("line_items", []))
|
| 234 |
-
else:
|
| 235 |
-
h = info.get("invoice_header", {})
|
| 236 |
-
c1, c2, c3 = st.columns(3)
|
| 237 |
-
c1.metric("Invoice #", h.get("invoice_number"))
|
| 238 |
-
c1.metric("Supplier", h.get("supplier_name"))
|
| 239 |
-
c2.metric("Date", h.get("invoice_date"))
|
| 240 |
-
c2.metric("Customer", h.get("customer_name"))
|
| 241 |
-
c3.metric("PO #", h.get("po_number"))
|
| 242 |
-
c3.metric("Total", h.get("invoice_value"))
|
| 243 |
-
# Show any additional header fields detected
|
| 244 |
-
extra_fields = {k: v for k, v in h.items() if k not in ("invoice_number", "supplier_name", "customer_name", "invoice_date", "po_number", "invoice_value")}
|
| 245 |
-
if extra_fields:
|
| 246 |
-
st.subheader("Additional Header Metadata")
|
| 247 |
-
st.json(extra_fields)
|
| 248 |
-
st.subheader("Line Items")
|
| 249 |
-
st.table(info.get("line_items", []))
|
| 250 |
|
| 251 |
if "last_api" in st.session_state:
|
| 252 |
with st.expander("Debug"):
|
|
|
|
| 116 |
return None
|
| 117 |
|
| 118 |
def get_extraction_prompt(model_choice, txt):
|
|
|
|
| 119 |
return (
|
| 120 |
"Extract every possible piece of metadata and detail from the following invoice text—including all header information, supplier details, customer details, addresses, invoice numbers, dates, tax information, payment terms, references, summary totals, and a full list of line items with as many columns as possible. "
|
| 121 |
"Return a structured JSON with two keys: 'invoice_header' (an object with all header fields found) and 'line_items' (an array of all detected line items and their attributes). "
|
|
|
|
| 157 |
f"{txt}"
|
| 158 |
)
|
| 159 |
|
|
|
|
| 160 |
def extract_invoice_info(model_choice, text):
|
| 161 |
prompt = get_extraction_prompt(model_choice, text)
|
| 162 |
raw = query_llm(model_choice, prompt)
|
|
|
|
| 166 |
if not data:
|
| 167 |
return None
|
| 168 |
|
| 169 |
+
# DeepSeek models: flat format, but we standardize to always return "invoice_header" and "line_items"
|
| 170 |
if model_choice.startswith("DeepSeek"):
|
| 171 |
+
# Put all keys except "line_items" into invoice_header
|
| 172 |
+
header = {k: v for k, v in data.items() if k != "line_items"}
|
| 173 |
+
items = data.get("line_items", [])
|
| 174 |
+
if not isinstance(items, list):
|
| 175 |
+
items = []
|
| 176 |
+
for itm in items:
|
| 177 |
if not isinstance(itm, dict):
|
| 178 |
continue
|
| 179 |
for k in ("description","quantity","unit_price","total_price"):
|
| 180 |
itm.setdefault(k, None)
|
| 181 |
+
return {"invoice_header": header, "line_items": items}
|
| 182 |
+
# Other models (OpenAI GPT-4.1, Mistral): expect proper structure
|
|
|
|
| 183 |
hdr = data.get("invoice_header", {})
|
| 184 |
if not hdr and any(k in data for k in ("invoice_number","supplier_name","customer_name")):
|
| 185 |
# If model returned flat, treat top-level keys as header
|
|
|
|
| 189 |
if not hdr.get("supplier_name"):
|
| 190 |
hdr["supplier_name"] = fallback_supplier(text)
|
| 191 |
items = data.get("line_items", [])
|
| 192 |
+
if not isinstance(items, list):
|
| 193 |
+
items = []
|
| 194 |
for itm in items:
|
| 195 |
if not isinstance(itm, dict):
|
| 196 |
continue
|
| 197 |
for k in ("item_number","description","quantity","unit_price","total_price"):
|
| 198 |
itm.setdefault(k, None)
|
| 199 |
+
return {"invoice_header": hdr, "line_items": items}
|
|
|
|
|
|
|
|
|
|
| 200 |
|
| 201 |
# ---- UI ----
|
| 202 |
tab1, tab2 = st.tabs(["PDF Summarizer","Invoice Extractor"])
|
|
|
|
| 221 |
info = extract_invoice_info(mdl, txt)
|
| 222 |
if info:
|
| 223 |
st.success("Extraction Complete")
|
| 224 |
+
st.subheader("Invoice Metadata")
|
| 225 |
+
st.table([{k.replace("_", " ").title(): v for k, v in info["invoice_header"].items()}])
|
| 226 |
+
st.subheader("Line Items")
|
| 227 |
+
st.table(info["line_items"])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 228 |
|
| 229 |
if "last_api" in st.session_state:
|
| 230 |
with st.expander("Debug"):
|