Seth0330 committed on
Commit
dc0c728
·
verified ·
1 Parent(s): 2c7ba82

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +101 -84
app.py CHANGED
@@ -11,6 +11,7 @@ from langchain_community.chat_models import ChatOpenAI
11
  from langchain.agents import initialize_agent, Tool, AgentType
12
  from fuzzywuzzy import fuzz
13
 
 
14
  st.set_page_config(page_title="Accounts Payable AI Agent", layout="wide")
15
 
16
  MODELS = {
@@ -45,6 +46,8 @@ MODELS = {
45
  },
46
  }
47
 
 
 
48
  def get_api_key(model_choice):
49
  key = os.getenv(MODELS[model_choice]["key_env"])
50
  if not key:
@@ -202,64 +205,18 @@ def ensure_total_due(invoice_header):
202
  break
203
  return invoice_header
204
 
205
- def get_content_type(filename):
206
- mime, _ = mimetypes.guess_type(filename)
207
- ext = filename.lower().split('.')[-1]
208
- if ext == "pdf":
209
- return "text/plain"
210
- if mime is None:
211
- return "application/octet-stream"
212
- return mime
213
-
214
- UNSTRACT_BASE = "https://llmwhisperer-api.us-central.unstract.com/api/v2"
215
- UNSTRACT_API_KEY = os.getenv("UNSTRACT_API_KEY")
216
-
217
- def extract_text_from_unstract(uploaded_file):
218
- filename = getattr(uploaded_file, "name", "uploaded_file")
219
- file_bytes = uploaded_file.read()
220
- content_type = get_content_type(filename)
221
- headers = {
222
- "unstract-key": UNSTRACT_API_KEY,
223
- "Content-Type": content_type,
224
- }
225
- url = f"{UNSTRACT_BASE}/whisper"
226
- with st.spinner("Uploading and processing document with Unstract..."):
227
- r = requests.post(url, headers=headers, data=file_bytes)
228
- if r.status_code != 202:
229
- st.error(f"Unstract: Error uploading file: {r.status_code} - {r.text}")
230
- return None
231
- whisper_hash = r.json().get("whisper_hash")
232
- if not whisper_hash:
233
- st.error("Unstract: No whisper_hash received.")
234
- return None
235
-
236
- status_url = f"{UNSTRACT_BASE}/whisper-status?whisper_hash={whisper_hash}"
237
- status_placeholder = st.empty()
238
- for i in range(30):
239
- status_r = requests.get(status_url, headers={"unstract-key": UNSTRACT_API_KEY})
240
- if status_r.status_code != 200:
241
- st.error(f"Unstract: Error checking status: {status_r.status_code} - {status_r.text}")
242
- return None
243
- status = status_r.json().get("status")
244
- if status == "processed":
245
- status_placeholder.info("Unstract status: processed! 🎉")
246
- break
247
- status_placeholder.info(f"Unstract status: {status or 'waiting'}... ({i+1})")
248
- time.sleep(2)
249
- else:
250
- status_placeholder.error("Unstract: Timeout waiting for OCR to finish.")
251
- return None
252
-
253
- retrieve_url = f"{UNSTRACT_BASE}/whisper-retrieve?whisper_hash={whisper_hash}&text_only=true"
254
- r = requests.get(retrieve_url, headers={"unstract-key": UNSTRACT_API_KEY})
255
- if r.status_code != 200:
256
- st.error(f"Unstract: Error retrieving extracted text: {r.status_code} - {r.text}")
257
  return None
258
- try:
259
- data = r.json()
260
- return data.get("result_text") or r.text
261
- except Exception:
262
- return r.text
 
 
 
 
263
 
264
  def weighted_fuzzy_score(s1, s2):
265
  if not s1 and not s2:
@@ -352,6 +309,93 @@ def find_best_po_match(inv, po_df):
352
  best_row, best_score, reason, debug = scores[0]
353
  return best_row, best_score, reason, debug
354
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
355
  st.sidebar.header("Step 1: Upload Active Purchase Orders (POs)")
356
  po_file = st.sidebar.file_uploader(
357
  "Upload POs CSV (must include PO number, Supplier, Items, etc.)",
@@ -376,7 +420,6 @@ if st.button("Extract") and inv_file:
376
  with st.spinner("Extracting text from document using Unstract..."):
377
  text = extract_text_from_unstract(inv_file)
378
  if text:
379
- prompt = get_extraction_prompt(mdl, text)
380
  extracted_info = extract_invoice_info(mdl, text)
381
  if extracted_info:
382
  if "invoice_header" in extracted_info:
@@ -417,32 +460,6 @@ def po_match_tool_func(input_text):
417
  "po_row": best_row.to_dict() if best_row is not None else None
418
  })
419
 
420
- def extract_invoice_info(model_choice, text):
421
- prompt = get_extraction_prompt(model_choice, text)
422
- raw = query_llm(model_choice, prompt)
423
- if not raw:
424
- return None
425
- data = clean_json_response(raw)
426
- if not data:
427
- return None
428
- hdr = data.get("invoice_header", {})
429
- if not hdr and any(k in data for k in ("invoice_number","supplier_name","customer_name")):
430
- hdr = data
431
- for k in ("invoice_number","invoice_date","po_number","invoice_value","supplier_name","customer_name"):
432
- hdr.setdefault(k, None)
433
- if not hdr.get("supplier_name"):
434
- hdr["supplier_name"] = fallback_supplier(text)
435
- hdr = ensure_total_due(hdr)
436
- items = data.get("line_items", [])
437
- if not isinstance(items, list):
438
- items = []
439
- for itm in items:
440
- if not isinstance(itm, dict):
441
- continue
442
- for k in ("item_number","description","quantity","unit_price","total_price"):
443
- itm.setdefault(k, None)
444
- return {"invoice_header": hdr, "line_items": items}
445
-
446
  if po_df is not None:
447
  st.session_state["last_po_df"] = po_df
448
 
 
11
  from langchain.agents import initialize_agent, Tool, AgentType
12
  from fuzzywuzzy import fuzz
13
 
14
+ # --- CONFIGURATION ---
15
  st.set_page_config(page_title="Accounts Payable AI Agent", layout="wide")
16
 
17
  MODELS = {
 
46
  },
47
  }
48
 
49
+ # --- UTILITY FUNCTIONS ---
50
+
51
  def get_api_key(model_choice):
52
  key = os.getenv(MODELS[model_choice]["key_env"])
53
  if not key:
 
205
  break
206
  return invoice_header
207
 
208
+ def clean_num(val):
209
+ if val is None:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
210
  return None
211
+ if isinstance(val, (int, float)):
212
+ return float(val)
213
+ matches = re.findall(r"[-+]?\d[\d,]*\.?\d*", str(val))
214
+ if matches:
215
+ cleaned = [m.replace(',', '') for m in matches if m]
216
+ as_floats = [float(c) for c in cleaned if c.replace('.', '', 1).isdigit()]
217
+ if as_floats:
218
+ return max(as_floats)
219
+ return None
220
 
221
  def weighted_fuzzy_score(s1, s2):
222
  if not s1 and not s2:
 
309
  best_row, best_score, reason, debug = scores[0]
310
  return best_row, best_score, reason, debug
311
 
312
+ def extract_invoice_info(model_choice, text):
313
+ prompt = get_extraction_prompt(model_choice, text)
314
+ raw = query_llm(model_choice, prompt)
315
+ if not raw:
316
+ return None
317
+ data = clean_json_response(raw)
318
+ if not data:
319
+ return None
320
+ hdr = data.get("invoice_header", {})
321
+ if not hdr and any(k in data for k in ("invoice_number","supplier_name","customer_name")):
322
+ hdr = data
323
+ for k in ("invoice_number","invoice_date","po_number","invoice_value","supplier_name","customer_name"):
324
+ hdr.setdefault(k, None)
325
+ if not hdr.get("supplier_name"):
326
+ hdr["supplier_name"] = fallback_supplier(text)
327
+ hdr = ensure_total_due(hdr)
328
+ items = data.get("line_items", [])
329
+ if not isinstance(items, list):
330
+ items = []
331
+ for itm in items:
332
+ if not isinstance(itm, dict):
333
+ continue
334
+ for k in ("item_number","description","quantity","unit_price","total_price"):
335
+ itm.setdefault(k, None)
336
+ return {"invoice_header": hdr, "line_items": items}
337
+
338
+ def get_content_type(filename):
339
+ mime, _ = mimetypes.guess_type(filename)
340
+ ext = filename.lower().split('.')[-1]
341
+ if ext == "pdf":
342
+ return "text/plain"
343
+ if mime is None:
344
+ return "application/octet-stream"
345
+ return mime
346
+
347
+ UNSTRACT_BASE = "https://llmwhisperer-api.us-central.unstract.com/api/v2"
348
+ UNSTRACT_API_KEY = os.getenv("UNSTRACT_API_KEY")
349
+
350
+ def extract_text_from_unstract(uploaded_file):
351
+ filename = getattr(uploaded_file, "name", "uploaded_file")
352
+ file_bytes = uploaded_file.read()
353
+ content_type = get_content_type(filename)
354
+ headers = {
355
+ "unstract-key": UNSTRACT_API_KEY,
356
+ "Content-Type": content_type,
357
+ }
358
+ url = f"{UNSTRACT_BASE}/whisper"
359
+ with st.spinner("Uploading and processing document with Unstract..."):
360
+ r = requests.post(url, headers=headers, data=file_bytes)
361
+ if r.status_code != 202:
362
+ st.error(f"Unstract: Error uploading file: {r.status_code} - {r.text}")
363
+ return None
364
+ whisper_hash = r.json().get("whisper_hash")
365
+ if not whisper_hash:
366
+ st.error("Unstract: No whisper_hash received.")
367
+ return None
368
+
369
+ status_url = f"{UNSTRACT_BASE}/whisper-status?whisper_hash={whisper_hash}"
370
+ status_placeholder = st.empty()
371
+ for i in range(30):
372
+ status_r = requests.get(status_url, headers={"unstract-key": UNSTRACT_API_KEY})
373
+ if status_r.status_code != 200:
374
+ st.error(f"Unstract: Error checking status: {status_r.status_code} - {status_r.text}")
375
+ return None
376
+ status = status_r.json().get("status")
377
+ if status == "processed":
378
+ status_placeholder.info("Unstract status: processed! 🎉")
379
+ break
380
+ status_placeholder.info(f"Unstract status: {status or 'waiting'}... ({i+1})")
381
+ time.sleep(2)
382
+ else:
383
+ status_placeholder.error("Unstract: Timeout waiting for OCR to finish.")
384
+ return None
385
+
386
+ retrieve_url = f"{UNSTRACT_BASE}/whisper-retrieve?whisper_hash={whisper_hash}&text_only=true"
387
+ r = requests.get(retrieve_url, headers={"unstract-key": UNSTRACT_API_KEY})
388
+ if r.status_code != 200:
389
+ st.error(f"Unstract: Error retrieving extracted text: {r.status_code} - {r.text}")
390
+ return None
391
+ try:
392
+ data = r.json()
393
+ return data.get("result_text") or r.text
394
+ except Exception:
395
+ return r.text
396
+
397
+ # --- UI/LOGIC ---
398
+
399
  st.sidebar.header("Step 1: Upload Active Purchase Orders (POs)")
400
  po_file = st.sidebar.file_uploader(
401
  "Upload POs CSV (must include PO number, Supplier, Items, etc.)",
 
420
  with st.spinner("Extracting text from document using Unstract..."):
421
  text = extract_text_from_unstract(inv_file)
422
  if text:
 
423
  extracted_info = extract_invoice_info(mdl, text)
424
  if extracted_info:
425
  if "invoice_header" in extracted_info:
 
460
  "po_row": best_row.to_dict() if best_row is not None else None
461
  })
462
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
463
  if po_df is not None:
464
  st.session_state["last_po_df"] = po_df
465