Seth0330 committed on
Commit
08560ee
·
verified ·
1 Parent(s): 8c52b14

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +16 -38
app.py CHANGED
@@ -116,7 +116,6 @@ def fallback_supplier(text):
116
  return None
117
 
118
  def get_extraction_prompt(model_choice, txt):
119
- # Example output shows both header & line items
120
  return (
121
  "Extract every possible piece of metadata and detail from the following invoice text—including all header information, supplier details, customer details, addresses, invoice numbers, dates, tax information, payment terms, references, summary totals, and a full list of line items with as many columns as possible. "
122
  "Return a structured JSON with two keys: 'invoice_header' (an object with all header fields found) and 'line_items' (an array of all detected line items and their attributes). "
@@ -158,7 +157,6 @@ def get_extraction_prompt(model_choice, txt):
158
  f"{txt}"
159
  )
160
 
161
-
162
  def extract_invoice_info(model_choice, text):
163
  prompt = get_extraction_prompt(model_choice, text)
164
  raw = query_llm(model_choice, prompt)
@@ -168,18 +166,20 @@ def extract_invoice_info(model_choice, text):
168
  if not data:
169
  return None
170
 
171
- # DeepSeek models: flat format
172
  if model_choice.startswith("DeepSeek"):
173
- # Dynamically handle flat or semi-structured output (may contain any fields)
174
- data.setdefault("line_items", [])
175
- for itm in data["line_items"]:
 
 
 
176
  if not isinstance(itm, dict):
177
  continue
178
  for k in ("description","quantity","unit_price","total_price"):
179
  itm.setdefault(k, None)
180
- return data
181
- # Other models (OpenAI GPT-4.1, Mistral): usually nested under invoice_header, but now prompt is broader, so handle flexibly
182
- # Accepts a flexible schema as model may include extra keys
183
  hdr = data.get("invoice_header", {})
184
  if not hdr and any(k in data for k in ("invoice_number","supplier_name","customer_name")):
185
  # If model returned flat, treat top-level keys as header
@@ -189,15 +189,14 @@ def extract_invoice_info(model_choice, text):
189
  if not hdr.get("supplier_name"):
190
  hdr["supplier_name"] = fallback_supplier(text)
191
  items = data.get("line_items", [])
 
 
192
  for itm in items:
193
  if not isinstance(itm, dict):
194
  continue
195
  for k in ("item_number","description","quantity","unit_price","total_price"):
196
  itm.setdefault(k, None)
197
- data["invoice_header"] = hdr
198
- data["line_items"] = items
199
-
200
- return data
201
 
202
  # ---- UI ----
203
  tab1, tab2 = st.tabs(["PDF Summarizer","Invoice Extractor"])
@@ -222,31 +221,10 @@ with tab2:
222
  info = extract_invoice_info(mdl, txt)
223
  if info:
224
  st.success("Extraction Complete")
225
- # For DeepSeek, output may be flat; for others, prefer "invoice_header" nesting
226
- if mdl.startswith("DeepSeek"):
227
- # Show all keys except line_items
228
- non_items = {k: v for k, v in info.items() if k != "line_items"}
229
- c1, c2 = st.columns(2)
230
- for i, (k, v) in enumerate(non_items.items()):
231
- (c1 if i % 2 == 0 else c2).metric(k.replace("_", " ").title(), v)
232
- st.subheader("Line Items")
233
- st.table(info.get("line_items", []))
234
- else:
235
- h = info.get("invoice_header", {})
236
- c1, c2, c3 = st.columns(3)
237
- c1.metric("Invoice #", h.get("invoice_number"))
238
- c1.metric("Supplier", h.get("supplier_name"))
239
- c2.metric("Date", h.get("invoice_date"))
240
- c2.metric("Customer", h.get("customer_name"))
241
- c3.metric("PO #", h.get("po_number"))
242
- c3.metric("Total", h.get("invoice_value"))
243
- # Show any additional header fields detected
244
- extra_fields = {k: v for k, v in h.items() if k not in ("invoice_number", "supplier_name", "customer_name", "invoice_date", "po_number", "invoice_value")}
245
- if extra_fields:
246
- st.subheader("Additional Header Metadata")
247
- st.json(extra_fields)
248
- st.subheader("Line Items")
249
- st.table(info.get("line_items", []))
250
 
251
  if "last_api" in st.session_state:
252
  with st.expander("Debug"):
 
116
  return None
117
 
118
  def get_extraction_prompt(model_choice, txt):
 
119
  return (
120
  "Extract every possible piece of metadata and detail from the following invoice text—including all header information, supplier details, customer details, addresses, invoice numbers, dates, tax information, payment terms, references, summary totals, and a full list of line items with as many columns as possible. "
121
  "Return a structured JSON with two keys: 'invoice_header' (an object with all header fields found) and 'line_items' (an array of all detected line items and their attributes). "
 
157
  f"{txt}"
158
  )
159
 
 
160
  def extract_invoice_info(model_choice, text):
161
  prompt = get_extraction_prompt(model_choice, text)
162
  raw = query_llm(model_choice, prompt)
 
166
  if not data:
167
  return None
168
 
169
+ # DeepSeek models: flat format, but we standardize to always return "invoice_header" and "line_items"
170
  if model_choice.startswith("DeepSeek"):
171
+ # Put all keys except "line_items" into invoice_header
172
+ header = {k: v for k, v in data.items() if k != "line_items"}
173
+ items = data.get("line_items", [])
174
+ if not isinstance(items, list):
175
+ items = []
176
+ for itm in items:
177
  if not isinstance(itm, dict):
178
  continue
179
  for k in ("description","quantity","unit_price","total_price"):
180
  itm.setdefault(k, None)
181
+ return {"invoice_header": header, "line_items": items}
182
+ # Other models (OpenAI GPT-4.1, Mistral): expect proper structure
 
183
  hdr = data.get("invoice_header", {})
184
  if not hdr and any(k in data for k in ("invoice_number","supplier_name","customer_name")):
185
  # If model returned flat, treat top-level keys as header
 
189
  if not hdr.get("supplier_name"):
190
  hdr["supplier_name"] = fallback_supplier(text)
191
  items = data.get("line_items", [])
192
+ if not isinstance(items, list):
193
+ items = []
194
  for itm in items:
195
  if not isinstance(itm, dict):
196
  continue
197
  for k in ("item_number","description","quantity","unit_price","total_price"):
198
  itm.setdefault(k, None)
199
+ return {"invoice_header": hdr, "line_items": items}
 
 
 
200
 
201
  # ---- UI ----
202
  tab1, tab2 = st.tabs(["PDF Summarizer","Invoice Extractor"])
 
221
  info = extract_invoice_info(mdl, txt)
222
  if info:
223
  st.success("Extraction Complete")
224
+ st.subheader("Invoice Metadata")
225
+ st.table([{k.replace("_", " ").title(): v for k, v in info["invoice_header"].items()}])
226
+ st.subheader("Line Items")
227
+ st.table(info["line_items"])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
228
 
229
  if "last_api" in st.session_state:
230
  with st.expander("Debug"):