Seth0330 committed on
Commit
1d73f48
·
verified ·
1 Parent(s): 042447e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +89 -15
app.py CHANGED
@@ -121,15 +121,88 @@ def get_extraction_prompt(model_choice, txt):
121
  "Shipment/invoice-level fields such as CAR NUMBER, SHIPPING POINT, SHIPMENT NUMBER, CURRENCY, etc., must go ONLY into the 'invoice_header', not as line item fields.\n"
122
  "Use this schema:\n"
123
  '{\n'
124
- ' "invoice_header": {...},\n'
125
- ' "line_items": [ {...} ]\n'
126
- '}\n'
127
- "If a field is missing for a line item or header, use null. "
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
128
  "Do not invent fields. Do not add any header or shipment data to any line item. Return ONLY the JSON object, no explanation.\n"
129
  "\nInvoice Text:\n"
130
  f"{txt}"
131
  )
132
 
 
 
 
 
 
 
 
 
 
133
  def extract_invoice_info(model_choice, text):
134
  prompt = get_extraction_prompt(model_choice, text)
135
  raw = query_llm(model_choice, prompt)
@@ -145,6 +218,8 @@ def extract_invoice_info(model_choice, text):
145
  hdr.setdefault(k, None)
146
  if not hdr.get("supplier_name"):
147
  hdr["supplier_name"] = fallback_supplier(text)
 
 
148
  items = data.get("line_items", [])
149
  if not isinstance(items, list):
150
  items = []
@@ -188,7 +263,7 @@ def extract_text_from_unstract(uploaded_file):
188
 
189
  status_url = f"{UNSTRACT_BASE}/whisper-status?whisper_hash={whisper_hash}"
190
  status_placeholder = st.empty()
191
- for i in range(30): # Wait up to 60s (2s x 30)
192
  status_r = requests.get(status_url, headers={"unstract-key": UNSTRACT_API_KEY})
193
  if status_r.status_code != 200:
194
  st.error(f"Unstract: Error checking status: {status_r.status_code} - {status_r.text}")
@@ -214,8 +289,6 @@ def extract_text_from_unstract(uploaded_file):
214
  except Exception:
215
  return r.text
216
 
217
-
218
- # --- Utility functions for fuzzy and normalized matching ---
219
  def clean_num(val):
220
  if not val: return None
221
  val = re.sub(r"[^0-9.\-]", "", str(val))
@@ -228,7 +301,6 @@ def normalize(s):
228
  if not s: return ""
229
  return re.sub(r"\W+", "", str(s).lower().strip())
230
 
231
- # --------- Upload PO CSV ---------
232
  st.sidebar.header("Step 1: Upload Active Purchase Orders (POs)")
233
  po_file = st.sidebar.file_uploader(
234
  "Upload POs CSV (must include PO number, Supplier, Items, etc.)",
@@ -240,6 +312,7 @@ if po_file:
240
  po_df = pd.read_csv(po_file)
241
  st.sidebar.success(f"Loaded {len(po_df)} Purchase Orders.")
242
  st.sidebar.dataframe(po_df.head())
 
243
 
244
  st.title("Invoice/Document Extractor")
245
  mdl = st.selectbox("Model for Extraction", list(MODELS.keys()), key="extract_model")
@@ -247,7 +320,6 @@ inv_file = st.file_uploader(
247
  "Step 2: Upload Invoice or Document File",
248
  type=["pdf", "docx", "xlsx", "xls", "png", "jpg", "jpeg", "tiff"]
249
  )
250
- extracted_info = None
251
 
252
  if st.button("Extract") and inv_file:
253
  with st.spinner("Extracting text from document using Unstract..."):
@@ -255,19 +327,22 @@ if st.button("Extract") and inv_file:
255
  if text:
256
  extracted_info = extract_invoice_info(mdl, text)
257
  if extracted_info:
 
 
258
  st.success("Extraction Complete")
259
  st.subheader("Invoice Metadata")
260
  st.table([{k.replace("_", " ").title(): v for k, v in extracted_info["invoice_header"].items()}])
261
  st.subheader("Line Items")
262
  st.table(extracted_info["line_items"])
263
- st.session_state["last_extracted_info"] = extracted_info # store in session
264
 
265
- extracted_info = extracted_info or st.session_state.get("last_extracted_info", None)
 
 
266
 
267
- # --------- Classic ReAct AGENT ---------
268
  def po_match_tool_func(input_text):
269
  invoice = st.session_state.get("last_extracted_info")
270
- po_df = st.session_state.get("po_df")
271
  debug = {}
272
  if invoice is None or po_df is None:
273
  return json.dumps({
@@ -383,9 +458,8 @@ def po_match_tool_func(input_text):
383
  })
384
 
385
  if po_df is not None:
386
- st.session_state["po_df"] = po_df
387
 
388
- # -------------- DECISION SECTION --------------
389
  if extracted_info is not None and po_df is not None:
390
  st.markdown("---")
391
  st.subheader("AI Agent Decision")
 
121
  "Shipment/invoice-level fields such as CAR NUMBER, SHIPPING POINT, SHIPMENT NUMBER, CURRENCY, etc., must go ONLY into the 'invoice_header', not as line item fields.\n"
122
  "Use this schema:\n"
123
  '{\n'
124
+ ' "invoice_header": {\n'
125
+ ' "car_number": "string or null",\n'
126
+ ' "shipment_number": "string or null",\n'
127
+ ' "shipping_point": "string or null",\n'
128
+ ' "currency": "string or null",\n'
129
+ ' "invoice_number": "string or null",\n'
130
+ ' "invoice_date": "string or null",\n'
131
+ ' "order_number": "string or null",\n'
132
+ ' "customer_order_number": "string or null",\n'
133
+ ' "our_order_number": "string or null",\n'
134
+ ' "sales_order_number": "string or null",\n'
135
+ ' "purchase_order_number": "string or null",\n'
136
+ ' "order_date": "string or null",\n'
137
+ ' "supplier_name": "string or null",\n'
138
+ ' "supplier_address": "string or null",\n'
139
+ ' "supplier_phone": "string or null",\n'
140
+ ' "supplier_email": "string or null",\n'
141
+ ' "supplier_tax_id": "string or null",\n'
142
+ ' "customer_name": "string or null",\n'
143
+ ' "customer_address": "string or null",\n'
144
+ ' "customer_phone": "string or null",\n'
145
+ ' "customer_email": "string or null",\n'
146
+ ' "customer_tax_id": "string or null",\n'
147
+ ' "ship_to_name": "string or null",\n'
148
+ ' "ship_to_address": "string or null",\n'
149
+ ' "bill_to_name": "string or null",\n'
150
+ ' "bill_to_address": "string or null",\n'
151
+ ' "remit_to_name": "string or null",\n'
152
+ ' "remit_to_address": "string or null",\n'
153
+ ' "tax_id": "string or null",\n'
154
+ ' "tax_registration_number": "string or null",\n'
155
+ ' "vat_number": "string or null",\n'
156
+ ' "payment_terms": "string or null",\n'
157
+ ' "payment_method": "string or null",\n'
158
+ ' "payment_reference": "string or null",\n'
159
+ ' "bank_account_number": "string or null",\n'
160
+ ' "iban": "string or null",\n'
161
+ ' "swift_code": "string or null",\n'
162
+ ' "total_before_tax": "string or null",\n'
163
+ ' "tax_amount": "string or null",\n'
164
+ ' "tax_rate": "string or null",\n'
165
+ ' "shipping_charges": "string or null",\n'
166
+ ' "discount": "string or null",\n'
167
+ ' "total_due": "string or null",\n'
168
+ ' "amount_paid": "string or null",\n'
169
+ ' "balance_due": "string or null",\n'
170
+ ' "due_date": "string or null",\n'
171
+ ' "invoice_status": "string or null",\n'
172
+ ' "reference_number": "string or null",\n'
173
+ ' "project_code": "string or null",\n'
174
+ ' "department": "string or null",\n'
175
+ ' "contact_person": "string or null",\n'
176
+ ' "notes": "string or null",\n'
177
+ ' "additional_info": "string or null"\n'
178
+ ' },\n'
179
+ ' "line_items": [\n'
180
+ ' {\n'
181
+ ' "quantity": "string or null",\n'
182
+ ' "units": "string or null",\n'
183
+ ' "description": "string or null",\n'
184
+ ' "footage": "string or null",\n'
185
+ ' "price": "string or null",\n'
186
+ ' "amount": "string or null",\n'
187
+ ' "notes": "string or null"\n'
188
+ ' }\n'
189
+ ' ]\n'
190
+ '}'
191
+ "\nIf a field is missing for a line item or header, use null. "
192
  "Do not invent fields. Do not add any header or shipment data to any line item. Return ONLY the JSON object, no explanation.\n"
193
  "\nInvoice Text:\n"
194
  f"{txt}"
195
  )
196
 
197
+ def ensure_total_due(invoice_header):
198
+ # If total_due is missing, try to find a close equivalent
199
+ if invoice_header.get("total_due") in [None, ""]:
200
+ for field in ["invoice_total", "invoice_value", "total_before_tax", "balance_due", "amount_paid"]:
201
+ if field in invoice_header and invoice_header[field]:
202
+ invoice_header["total_due"] = invoice_header[field]
203
+ break
204
+ return invoice_header
205
+
206
  def extract_invoice_info(model_choice, text):
207
  prompt = get_extraction_prompt(model_choice, text)
208
  raw = query_llm(model_choice, prompt)
 
218
  hdr.setdefault(k, None)
219
  if not hdr.get("supplier_name"):
220
  hdr["supplier_name"] = fallback_supplier(text)
221
+ # Guarantee total_due is always present (if at all possible)
222
+ hdr = ensure_total_due(hdr)
223
  items = data.get("line_items", [])
224
  if not isinstance(items, list):
225
  items = []
 
263
 
264
  status_url = f"{UNSTRACT_BASE}/whisper-status?whisper_hash={whisper_hash}"
265
  status_placeholder = st.empty()
266
+ for i in range(30):
267
  status_r = requests.get(status_url, headers={"unstract-key": UNSTRACT_API_KEY})
268
  if status_r.status_code != 200:
269
  st.error(f"Unstract: Error checking status: {status_r.status_code} - {status_r.text}")
 
289
  except Exception:
290
  return r.text
291
 
 
 
292
  def clean_num(val):
293
  if not val: return None
294
  val = re.sub(r"[^0-9.\-]", "", str(val))
 
301
  if not s: return ""
302
  return re.sub(r"\W+", "", str(s).lower().strip())
303
 
 
304
  st.sidebar.header("Step 1: Upload Active Purchase Orders (POs)")
305
  po_file = st.sidebar.file_uploader(
306
  "Upload POs CSV (must include PO number, Supplier, Items, etc.)",
 
312
  po_df = pd.read_csv(po_file)
313
  st.sidebar.success(f"Loaded {len(po_df)} Purchase Orders.")
314
  st.sidebar.dataframe(po_df.head())
315
+ st.session_state['last_po_df'] = po_df # Save PO to session
316
 
317
  st.title("Invoice/Document Extractor")
318
  mdl = st.selectbox("Model for Extraction", list(MODELS.keys()), key="extract_model")
 
320
  "Step 2: Upload Invoice or Document File",
321
  type=["pdf", "docx", "xlsx", "xls", "png", "jpg", "jpeg", "tiff"]
322
  )
 
323
 
324
  if st.button("Extract") and inv_file:
325
  with st.spinner("Extracting text from document using Unstract..."):
 
327
  if text:
328
  extracted_info = extract_invoice_info(mdl, text)
329
  if extracted_info:
330
+ if "invoice_header" in extracted_info:
331
+ extracted_info["invoice_header"] = ensure_total_due(extracted_info["invoice_header"])
332
  st.success("Extraction Complete")
333
  st.subheader("Invoice Metadata")
334
  st.table([{k.replace("_", " ").title(): v for k, v in extracted_info["invoice_header"].items()}])
335
  st.subheader("Line Items")
336
  st.table(extracted_info["line_items"])
337
+ st.session_state['last_extracted_info'] = extracted_info
338
 
339
+ # Always retrieve latest extracted info and PO df from session state!
340
+ extracted_info = st.session_state.get('last_extracted_info', None)
341
+ po_df = st.session_state.get('last_po_df', None)
342
 
 
343
  def po_match_tool_func(input_text):
344
  invoice = st.session_state.get("last_extracted_info")
345
+ po_df = st.session_state.get("last_po_df")
346
  debug = {}
347
  if invoice is None or po_df is None:
348
  return json.dumps({
 
458
  })
459
 
460
  if po_df is not None:
461
+ st.session_state["last_po_df"] = po_df
462
 
 
463
  if extracted_info is not None and po_df is not None:
464
  st.markdown("---")
465
  st.subheader("AI Agent Decision")