Seth0330 committed on
Commit
f192959
·
verified ·
1 Parent(s): 498d8b2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +183 -561
app.py CHANGED
@@ -5,422 +5,83 @@ import re
5
  import os
6
  import time
7
  import mimetypes
8
- import pandas as pd
9
- from langchain_community.chat_models import ChatOpenAI
10
- from langchain.agents import initialize_agent, Tool, AgentType
11
  from fuzzywuzzy import fuzz
 
12
 
13
- # --- Streamlit Page Settings ---
14
- st.set_page_config(page_title="EZOFIS Accounts Payable Agent", layout="wide")
15
-
16
- # --- Styles for SaaS Feel ---
17
  st.markdown("""
18
  <style>
19
  .block-card {
20
- background: #fff;
21
- border-radius: 20px;
22
- box-shadow: 0 2px 16px rgba(25,39,64,0.05);
23
- padding: 32px 26px 24px 26px;
24
- margin-bottom: 24px;
25
- }
26
- .step-num {
27
- background: #A020F0;
28
- color: #fff;
29
- border-radius: 999px;
30
- padding: 6px 13px;
31
- font-weight: 700;
32
- margin-right: 14px;
33
- font-size: 20px;
34
- display: inline-block;
35
- vertical-align: middle;
36
  }
 
 
 
37
  .stButton>button {
38
- background: #A020F0 !important;
39
- color: white !important;
40
- border-radius: 12px !important;
41
- padding: 10px 32px !important;
42
- font-weight: 700;
43
- border: none !important;
44
- font-size: 18px !important;
45
  margin-top: 12px !important;
46
  }
47
- .stSlider>div>div>div>div {
48
- background: #F3F6FB !important;
49
- border-radius: 999px;
50
- }
51
- .css-12w0qpk {padding-top: 0rem;}
52
- .css-1kyxreq {padding-top: 0rem;}
53
  </style>
54
  """, unsafe_allow_html=True)
55
 
56
- MODELS = {
57
- "OpenAI GPT-4.1": {
58
- "api_url": "https://api.openai.com/v1/chat/completions",
59
- "model": "gpt-4-1106-preview",
60
- "key_env": "OPENAI_API_KEY",
61
- "response_format": None,
62
- "extra_headers": {},
63
- },
64
- }
65
-
66
- def get_api_key(model_choice):
67
- key = os.getenv(MODELS[model_choice]["key_env"])
68
- if not key:
69
- st.error(f"❌ {MODELS[model_choice]['key_env']} not set")
70
- st.stop()
71
- return key
72
-
73
- def query_llm(model_choice, prompt):
74
- cfg = MODELS[model_choice]
75
- headers = {
76
- "Authorization": f"Bearer {get_api_key(model_choice)}",
77
- "Content-Type": "application/json",
78
- }
79
- if cfg.get("extra_headers"):
80
- headers.update(cfg["extra_headers"])
81
- payload = {
82
- "model": cfg["model"],
83
- "messages": [{"role": "user", "content": prompt}],
84
- "temperature": 0.1,
85
- "max_tokens": 2000,
86
- }
87
- if cfg.get("response_format"):
88
- payload["response_format"] = cfg["response_format"]
89
- try:
90
- with st.spinner(f"🔍 Fine Tuning The Extracted Data..."):
91
- r = requests.post(cfg["api_url"], headers=headers, json=payload, timeout=90)
92
- if r.status_code != 200:
93
- st.error(f"🚨 API Error {r.status_code}: {r.text}")
94
- return None
95
- content = r.json()["choices"][0]["message"]["content"]
96
- st.session_state.last_api = content
97
- st.session_state.last_raw = r.text
98
- return content
99
- except Exception as e:
100
- st.error(f"Connection error: {e}")
101
- return None
102
-
103
- def clean_json_response(text):
104
- if not text:
105
- return None
106
- orig = text
107
- text = re.sub(r'```(?:json)?', '', text).strip()
108
- start, end = text.find('{'), text.rfind('}') + 1
109
- if start < 0 or end < 1:
110
- st.error("Couldn't locate JSON in response.")
111
- st.code(orig)
112
- return None
113
- frag = text[start:end]
114
- frag = re.sub(r',\s*([}\]])', r'\1', frag)
115
- try:
116
- return json.loads(frag)
117
- except json.JSONDecodeError as e:
118
- repaired = re.sub(r'"\s*"\s*(?="[^"]+"\s*:)', '","', frag)
119
- try:
120
- return json.loads(repaired)
121
- except json.JSONDecodeError:
122
- st.error(f"JSON parse error: {e}")
123
- st.code(frag)
124
- return None
125
-
126
- def fallback_supplier(text):
127
- for line in text.splitlines():
128
- line = line.strip()
129
- if line:
130
- return line
131
- return None
132
-
133
- def get_extraction_prompt(model_choice, txt):
134
- return (
135
- "You are an expert invoice parser. "
136
- "Extract data according to the visible table structure and column headers in the invoice. "
137
- "For every line item, only extract fields that correspond to the table columns for that row (do not include header/shipment fields in line items). "
138
- "Merge all multi-line content within a single cell into that field (especially for the 'description' and 'notes'). "
139
- "Shipment/invoice-level fields such as CAR NUMBER, SHIPPING POINT, SHIPMENT NUMBER, CURRENCY, etc., must go ONLY into the 'invoice_header', not as line item fields.\n"
140
- "Use this schema:\n"
141
- '{\n'
142
- ' "invoice_header": {\n'
143
- ' "car_number": "string or null",\n'
144
- ' "shipment_number": "string or null",\n'
145
- ' "shipping_point": "string or null",\n'
146
- ' "currency": "string or null",\n'
147
- ' "invoice_number": "string or null",\n'
148
- ' "invoice_date": "string or null",\n'
149
- ' "order_number": "string or null",\n'
150
- ' "customer_order_number": "string or null",\n'
151
- ' "our_order_number": "string or null",\n'
152
- ' "sales_order_number": "string or null",\n'
153
- ' "purchase_order_number": "string or null",\n'
154
- ' "order_date": "string or null",\n'
155
- ' "supplier_name": "string or null",\n'
156
- ' "supplier_address": "string or null",\n'
157
- ' "supplier_phone": "string or null",\n'
158
- ' "supplier_email": "string or null",\n'
159
- ' "supplier_tax_id": "string or null",\n'
160
- ' "customer_name": "string or null",\n'
161
- ' "customer_address": "string or null",\n'
162
- ' "customer_phone": "string or null",\n'
163
- ' "customer_email": "string or null",\n'
164
- ' "customer_tax_id": "string or null",\n'
165
- ' "ship_to_name": "string or null",\n'
166
- ' "ship_to_address": "string or null",\n'
167
- ' "bill_to_name": "string or null",\n'
168
- ' "bill_to_address": "string or null",\n'
169
- ' "remit_to_name": "string or null",\n'
170
- ' "remit_to_address": "string or null",\n'
171
- ' "tax_id": "string or null",\n'
172
- ' "tax_registration_number": "string or null",\n'
173
- ' "vat_number": "string or null",\n'
174
- ' "payment_terms": "string or null",\n'
175
- ' "payment_method": "string or null",\n'
176
- ' "payment_reference": "string or null",\n'
177
- ' "bank_account_number": "string or null",\n'
178
- ' "iban": "string or null",\n'
179
- ' "swift_code": "string or null",\n'
180
- ' "total_before_tax": "string or null",\n'
181
- ' "tax_amount": "string or null",\n'
182
- ' "tax_rate": "string or null",\n'
183
- ' "shipping_charges": "string or null",\n'
184
- ' "discount": "string or null",\n'
185
- ' "total_due": "string or null",\n'
186
- ' "amount_paid": "string or null",\n'
187
- ' "balance_due": "string or null",\n'
188
- ' "due_date": "string or null",\n'
189
- ' "invoice_status": "string or null",\n'
190
- ' "reference_number": "string or null",\n'
191
- ' "project_code": "string or null",\n'
192
- ' "department": "string or null",\n'
193
- ' "contact_person": "string or null",\n'
194
- ' "notes": "string or null",\n'
195
- ' "additional_info": "string or null"\n'
196
- ' },\n'
197
- ' "line_items": [\n'
198
- ' {\n'
199
- ' "quantity": "string or null",\n'
200
- ' "units": "string or null",\n'
201
- ' "description": "string or null",\n'
202
- ' "footage": "string or null",\n'
203
- ' "price": "string or null",\n'
204
- ' "amount": "string or null",\n'
205
- ' "notes": "string or null"\n'
206
- ' }\n'
207
- ' ]\n'
208
- '}'
209
- "\nIf a field is missing for a line item or header, use null. "
210
- "Do not invent fields. Do not add any header or shipment data to any line item. Return ONLY the JSON object, no explanation.\n"
211
- "\nInvoice Text:\n"
212
- f"{txt}"
213
- )
214
-
215
- def ensure_total_due(invoice_header):
216
- if invoice_header.get("total_due") in [None, ""]:
217
- for field in ["invoice_total", "invoice_value", "total_before_tax", "balance_due", "amount_paid"]:
218
- if field in invoice_header and invoice_header[field]:
219
- invoice_header["total_due"] = invoice_header[field]
220
- break
221
- return invoice_header
222
-
223
- def clean_num(val):
224
- if val is None:
225
- return None
226
- if isinstance(val, (int, float)):
227
- return float(val)
228
- matches = re.findall(r"[-+]?\d[\d,]*\.?\d*", str(val))
229
- if matches:
230
- cleaned = [m.replace(',', '') for m in matches if m]
231
- as_floats = [float(c) for c in cleaned if c.replace('.', '', 1).isdigit()]
232
- if as_floats:
233
- return max(as_floats)
234
- return None
235
-
236
- def weighted_fuzzy_score(s1, s2):
237
- if not s1 and not s2:
238
- return 100
239
- return fuzz.token_set_ratio(str(s1).lower(), str(s2).lower())
240
-
241
- def find_po_number_in_json(po_number, invoice_json):
242
- def _flatten(obj):
243
- fields = []
244
- if isinstance(obj, dict):
245
- for v in obj.values():
246
- fields.extend(_flatten(v))
247
- elif isinstance(obj, list):
248
- for item in obj:
249
- fields.extend(_flatten(item))
250
- elif obj is not None:
251
- fields.append(str(obj))
252
- return fields
253
-
254
- po_str = str(po_number).strip().replace(" ", "").replace(".0", "")
255
- try:
256
- po_int = str(int(float(po_number)))
257
- except:
258
- po_int = po_str
259
-
260
- all_strs = [str(s).strip().replace(" ", "").replace(".0", "") for s in _flatten(invoice_json)]
261
- for s in all_strs:
262
- if not s:
263
- continue
264
- if po_str and (po_str in s or s in po_str):
265
- return True
266
- if po_int and (po_int in s or s in po_int):
267
- return True
268
- return False
269
-
270
- def find_best_po_match(inv, po_df, weight_supplier, weight_po_number, weight_currency, weight_total_due, weight_line_item):
271
- inv_hdr = inv["invoice_header"]
272
- inv_supplier = inv_hdr.get("supplier_name") or ""
273
- inv_po_number = inv_hdr.get("purchase_order_number") or inv_hdr.get("po_number") or inv_hdr.get("order_number") or ""
274
- inv_currency = inv_hdr.get("currency") or ""
275
- inv_total_due = clean_num(inv_hdr.get("total_due"))
276
- inv_line_items = inv.get("line_items", [])
277
-
278
- scores = []
279
- for idx, row in po_df.iterrows():
280
- po_supplier = row.get("Supplier Name", "")
281
- po_po_number = str(row.get("PO Number", ""))
282
- po_currency = row.get("Currency", "")
283
- po_total = clean_num(row.get("PO Total Value", ""))
284
- po_desc = row.get("Item Description", "")
285
- po_qty = str(row.get("Item Quantity", ""))
286
- po_unit = str(row.get("Item Unit Price", ""))
287
- po_line_total = clean_num(row.get("Line Item Total", ""))
288
-
289
- field_details = []
290
-
291
- s_supplier = weighted_fuzzy_score(inv_supplier, po_supplier)
292
- field_details.append({
293
- "field": "Supplier Name",
294
- "invoice": inv_supplier,
295
- "po": po_supplier,
296
- "score": s_supplier
297
- })
298
-
299
- s_po_number = 100 if find_po_number_in_json(po_po_number, inv) else 0
300
- field_details.append({
301
- "field": "PO Number (anywhere in JSON)",
302
- "invoice": "found" if s_po_number else "not found",
303
- "po": po_po_number,
304
- "score": s_po_number
305
- })
306
-
307
- s_currency = weighted_fuzzy_score(inv_currency, po_currency)
308
- field_details.append({
309
- "field": "Currency",
310
- "invoice": inv_currency,
311
- "po": po_currency,
312
- "score": s_currency
313
- })
314
-
315
- s_total = 100 if inv_total_due is not None and po_total is not None and abs(inv_total_due - po_total) < 2 else 0
316
- field_details.append({
317
- "field": "Total Due",
318
- "invoice": inv_total_due,
319
- "po": po_total,
320
- "score": s_total
321
- })
322
-
323
- # Line item logic as before
324
- line_item_score = 0
325
- line_reason = ""
326
- best_line_detail = None
327
- for line in inv_line_items:
328
- desc_score = weighted_fuzzy_score(line.get("description", ""), po_desc)
329
- qty_score = 100 if clean_num(line.get("quantity")) == clean_num(po_qty) else 0
330
- unit_score = 100 if clean_num(line.get("price")) == clean_num(po_unit) else 0
331
- amount_score = 100 if clean_num(line.get("amount")) == po_line_total else 0
332
- total = desc_score * 0.5 + qty_score * 0.2 + unit_score * 0.15 + amount_score * 0.15
333
- detail = {
334
- "field": "Line Item",
335
- "invoice": {
336
- "description": line.get("description", ""),
337
- "quantity": line.get("quantity", ""),
338
- "price": line.get("price", ""),
339
- "amount": line.get("amount", ""),
340
- },
341
- "po": {
342
- "description": po_desc,
343
- "quantity": po_qty,
344
- "price": po_unit,
345
- "amount": po_line_total,
346
- },
347
- "desc_score": desc_score,
348
- "qty_score": qty_score,
349
- "unit_score": unit_score,
350
- "amount_score": amount_score,
351
- "line_item_score": total
352
- }
353
- if total > line_item_score:
354
- line_item_score = total
355
- best_line_detail = detail
356
- line_reason = (
357
- f"Best line item: desc_score={desc_score}, qty_score={qty_score}, "
358
- f"unit_score={unit_score}, amount_score={amount_score}"
359
- )
360
-
361
- wsum = weight_supplier + weight_po_number + weight_currency + weight_total_due + weight_line_item
362
- total_score = (
363
- s_supplier * weight_supplier/100 +
364
- s_po_number * weight_po_number/100 +
365
- s_currency * weight_currency/100 +
366
- s_total * weight_total_due/100 +
367
- line_item_score * weight_line_item/100
368
- ) if wsum == 100 else 0
369
-
370
- reason = (
371
- f"Supplier match: {s_supplier}/100 (invoice: '{inv_supplier}' vs PO: '{po_supplier}'), "
372
- f"PO Number: {s_po_number}/100 ({'found anywhere in JSON' if s_po_number else 'not found'}), "
373
- f"Currency: {s_currency}/100 (invoice: '{inv_currency}' vs PO: '{po_currency}'), "
374
- f"Total Due: {'match' if s_total else 'no match'} (invoice: {inv_total_due} vs PO: {po_total}), "
375
- f"Line item best match: {int(line_item_score)}/100. {line_reason}"
376
- )
377
 
378
- debug = {
379
- "po_idx": idx,
380
- "po_supplier": po_supplier,
381
- "po_po_number": po_po_number,
382
- "po_total": po_total,
383
- "scores": field_details,
384
- "line_item_score": line_item_score,
385
- "best_line_detail": best_line_detail,
386
- "total_score": total_score,
387
- "line_reason": line_reason,
388
- "inv_total_due": inv_total_due
389
- }
390
- scores.append((row, total_score, reason, debug))
391
 
392
- scores.sort(key=lambda tup: tup[1], reverse=True)
393
- if not scores:
394
- return None, 0, "No POs found.", {}
395
- best_row, best_score, reason, debug = scores[0]
396
- return best_row, best_score, reason, debug
 
 
 
 
397
 
398
- def extract_invoice_info(model_choice, text):
399
- prompt = get_extraction_prompt(model_choice, text)
400
- raw = query_llm(model_choice, prompt)
401
- if not raw:
402
- return None
403
- data = clean_json_response(raw)
404
- if not data:
405
- return None
406
- hdr = data.get("invoice_header", {})
407
- if not hdr and any(k in data for k in ("invoice_number","supplier_name","customer_name")):
408
- hdr = data
409
- for k in ("invoice_number","invoice_date","po_number","invoice_value","supplier_name","customer_name"):
410
- hdr.setdefault(k, None)
411
- if not hdr.get("supplier_name"):
412
- hdr["supplier_name"] = fallback_supplier(text)
413
- hdr = ensure_total_due(hdr)
414
- items = data.get("line_items", [])
415
- if not isinstance(items, list):
416
- items = []
417
- for itm in items:
418
- if not isinstance(itm, dict):
419
- continue
420
- for k in ("item_number","description","quantity","unit_price","total_price"):
421
- itm.setdefault(k, None)
422
- return {"invoice_header": hdr, "line_items": items}
 
 
 
 
 
 
 
 
 
 
423
 
 
424
  def get_content_type(filename):
425
  mime, _ = mimetypes.guess_type(filename)
426
  ext = filename.lower().split('.')[-1]
@@ -430,9 +91,6 @@ def get_content_type(filename):
430
  return "application/octet-stream"
431
  return mime
432
 
433
- UNSTRACT_BASE = "https://llmwhisperer-api.us-central.unstract.com/api/v2"
434
- UNSTRACT_API_KEY = os.getenv("UNSTRACT_API_KEY")
435
-
436
  def extract_text_from_unstract(uploaded_file):
437
  filename = getattr(uploaded_file, "name", "uploaded_file")
438
  file_bytes = uploaded_file.read()
@@ -442,37 +100,35 @@ def extract_text_from_unstract(uploaded_file):
442
  "Content-Type": content_type,
443
  }
444
  url = f"{UNSTRACT_BASE}/whisper"
445
- with st.spinner("Uploading and processing document with EZOFIS AI OCR AGENT..."):
446
  r = requests.post(url, headers=headers, data=file_bytes)
447
  if r.status_code != 202:
448
- st.error(f"Unstract: Error uploading file: {r.status_code} - {r.text}")
449
  return None
450
  whisper_hash = r.json().get("whisper_hash")
451
  if not whisper_hash:
452
  st.error("Unstract: No whisper_hash received.")
453
  return None
454
 
 
455
  status_url = f"{UNSTRACT_BASE}/whisper-status?whisper_hash={whisper_hash}"
456
- status_placeholder = st.empty()
457
  for i in range(30):
458
  status_r = requests.get(status_url, headers={"unstract-key": UNSTRACT_API_KEY})
459
  if status_r.status_code != 200:
460
- st.error(f"Unstract: Error checking status: {status_r.status_code} - {status_r.text}")
461
  return None
462
  status = status_r.json().get("status")
463
  if status == "processed":
464
- status_placeholder.info("EZOFIS AI OCR AGENT STATUS: processed! 🎉")
465
  break
466
- status_placeholder.info(f"EZOFIS AI OCR AGENT STATUS: {status or 'waiting'}... ({i+1})")
467
  time.sleep(2)
468
  else:
469
- status_placeholder.error("Unstract: Timeout waiting for OCR to finish.")
470
  return None
471
 
472
  retrieve_url = f"{UNSTRACT_BASE}/whisper-retrieve?whisper_hash={whisper_hash}&text_only=true"
473
  r = requests.get(retrieve_url, headers={"unstract-key": UNSTRACT_API_KEY})
474
  if r.status_code != 200:
475
- st.error(f"Unstract: Error retrieving extracted text: {r.status_code} - {r.text}")
476
  return None
477
  try:
478
  data = r.json()
@@ -480,158 +136,124 @@ def extract_text_from_unstract(uploaded_file):
480
  except Exception:
481
  return r.text
482
 
483
- # ---------------- UI LAYOUT ----------------------
484
- st.markdown(
485
- "<h1 style='font-weight:800; margin-bottom:8px;'>EZOFIS Accounts Payable Agent</h1>",
486
- unsafe_allow_html=True
487
- )
488
- st.markdown(
489
- "<div style='font-size:20px; margin-bottom:28px; color:#24345C;'>Modern workflow automation for finance teams</div>",
490
- unsafe_allow_html=True
491
- )
492
-
493
- # ---- Three columns layout for horizontal flow
494
- col1, col2, col3 = st.columns([2,2,3])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
495
 
496
- # ---- Step 1: Upload POs (col1) ----
497
- with col1:
498
- st.markdown("<span class='step-num'>1</span> <b>Upload Active Purchase Orders (POs)</b>", unsafe_allow_html=True)
499
- po_file = st.file_uploader(
500
- "CSV with PO number, Supplier, Items, etc.",
501
- type=["csv"],
502
- key="po_csv",
503
- label_visibility="collapsed"
504
- )
505
- po_df = None
506
- if po_file:
507
- po_df = pd.read_csv(po_file)
508
- st.success(f"Loaded {len(po_df)} records from uploaded CSV.")
509
- st.session_state['last_po_df'] = po_df
510
-
511
- # ---- Step 2: Scoring Weights (col1) ----
512
- with col1:
513
- st.markdown("<span class='step-num'>2</span> <b>Configure Scoring Weights</b>", unsafe_allow_html=True)
514
- st.markdown("Set weights for matching. Total must equal 100%.", unsafe_allow_html=True)
515
- def int_slider(label, value, key):
516
- return st.slider(label, 0, 100, value, 1, key=key, format="%d")
517
- weight_supplier = int_slider("Supplier Name (%)", 25, "w_supplier")
518
- weight_po_number = int_slider("PO Number (%)", 25, "w_po")
519
- weight_currency = int_slider("Currency (%)", 10, "w_curr")
520
- weight_total_due = int_slider("Total Due (%)", 20, "w_due")
521
- weight_line_item = int_slider("Line Item (%)", 20, "w_line")
522
- weight_sum = weight_supplier + weight_po_number + weight_currency + weight_total_due + weight_line_item
523
- if weight_sum != 100:
524
- st.warning(f"Sum of weights is {weight_sum}%. Adjust so it equals 100%.")
525
-
526
- st.markdown("<span class='step-num'>3</span> <b>Set Decision Thresholds</b>", unsafe_allow_html=True)
527
- approved_threshold = st.slider("Threshold for 'APPROVED'", min_value=0, max_value=100, value=85, format="%d")
528
- partial_threshold = st.slider("Threshold for 'PARTIALLY APPROVED'", min_value=0, max_value=approved_threshold-1, value=70, format="%d")
529
-
530
- # ---- Step 4: Upload Invoice (col2) ----
531
- with col2:
532
- st.markdown("<span class='step-num'>4</span> <b>Upload Invoice/Document</b>", unsafe_allow_html=True)
533
- inv_file = st.file_uploader(
534
- "Upload PDF, DOCX, XLSX, PNG, JPG, TIFF",
535
- type=["pdf", "docx", "xlsx", "xls", "png", "jpg", "jpeg", "tiff"],
536
- key="invoice_file",
537
- label_visibility="collapsed"
538
- )
539
-
540
- # ---- Step 5: Extract Data (col2) ----
541
- with col2:
542
- st.markdown("<span class='step-num'>5</span> <b>Extract Data</b>", unsafe_allow_html=True)
543
- if st.button("Extract"):
544
- if inv_file:
545
- with st.spinner("Extracting text from document..."):
546
- text = extract_text_from_unstract(inv_file)
547
- if text:
548
- mdl = "OpenAI GPT-4.1"
549
- extracted_info = extract_invoice_info(mdl, text)
550
- if extracted_info:
551
- if "invoice_header" in extracted_info:
552
- extracted_info["invoice_header"] = ensure_total_due(extracted_info["invoice_header"])
553
- st.success("Extraction Complete")
554
- st.session_state['last_extracted_info'] = extracted_info
555
- else:
556
- st.warning("Please upload an invoice/document first.")
557
 
558
- # ---- Step 6: AP Agent Decision (col3) ----
559
- with col3:
560
- st.markdown("<span class='step-num'>6</span> <b>AP Agent Decision</b>", unsafe_allow_html=True)
561
- if st.button("Make a decision (EZOFIS AP AGENT)"):
562
- extracted_info = st.session_state.get('last_extracted_info', None)
563
- po_df = st.session_state.get('last_po_df', None)
564
- if extracted_info is not None and po_df is not None:
565
- def po_match_tool_func(input_text):
566
- invoice = st.session_state.get("last_extracted_info")
567
- po_df = st.session_state.get("last_po_df")
568
- if invoice is None or po_df is None:
569
- return json.dumps({
570
- "decision": "REJECTED",
571
- "reason": "Invoice or PO data not found.",
572
- "debug": {},
573
- })
574
- best_row, best_score, reason, debug = find_best_po_match(
575
- invoice, po_df, weight_supplier, weight_po_number, weight_currency, weight_total_due, weight_line_item
576
- )
577
- if best_score > approved_threshold:
578
- status = "APPROVED"
579
- elif best_score > partial_threshold:
580
- status = "PARTIALLY APPROVED"
581
- else:
582
- status = "REJECTED"
583
- return json.dumps({
584
- "decision": status,
585
- "reason": f"Best match score: {int(best_score)}/100. {reason}",
586
- "debug": debug,
587
- "po_row": best_row.to_dict() if best_row is not None else None
588
- })
589
- tools = [
590
- Tool(
591
- name="po_match_tool",
592
- func=po_match_tool_func,
593
- description="Smartly match invoice to PO using all possible fields.",
594
- )
595
- ]
596
- decision_llm = ChatOpenAI(
597
- openai_api_key=get_api_key("OpenAI GPT-4.1"),
598
- model=MODELS["OpenAI GPT-4.1"]["model"],
599
- temperature=0,
600
- streaming=False,
601
- )
602
- agent = initialize_agent(
603
- tools,
604
- decision_llm,
605
- agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
606
- verbose=True,
607
- )
608
- prompt = (
609
- "You are an expert accounts payable agent. "
610
- "Use po_match_tool to check for the best possible match using supplier, PO number (which may appear anywhere in the invoice JSON, even within other fields), currency, line items, and total value. "
611
- "Weigh the importance of each field as an expert would, according to the user-configured weights. "
612
- "Return a JSON with decision (APPROVED, PARTIALLY APPROVED, REJECTED), reason (include field scores and reasoning), debug, and the best matched PO row.\n"
613
- f"Invoice JSON:\n{json.dumps(extracted_info, indent=2)}"
614
- )
615
- with st.spinner("AI is reasoning and making a decision..."):
616
- result = agent.run(prompt)
617
- # Always display debug/info
618
- st.markdown("<h3 style='margin-top:18px;'>AI Decision & Reason</h3>", unsafe_allow_html=True)
619
- try:
620
- result_json = json.loads(result)
621
- st.write(f"**Decision:** {result_json.get('decision', 'N/A')}")
622
- st.write(f"**Reason:** {result_json.get('reason', 'N/A')}")
623
- st.markdown("##### Debug & Matching Details")
624
- st.json(result_json.get('debug'))
625
- st.markdown("##### Extracted Invoice JSON")
626
- st.json(extracted_info)
627
- st.markdown("##### Matched PO Row")
628
- st.json(result_json.get('po_row'))
629
- except Exception:
630
- st.subheader("AI Decision & Reason")
631
- st.write(result)
632
 
633
- # Always show extraction/decision debug in full for troubleshooting
634
  if "last_api" in st.session_state:
635
- with st.expander("Debug"):
636
  st.code(st.session_state.last_api)
637
- st.code(st.session_state.last_raw)
 
5
  import os
6
  import time
7
  import mimetypes
 
 
 
8
  from fuzzywuzzy import fuzz
9
+ import pandas as pd
10
 
11
+ # ----- Styling -----
12
+ st.set_page_config(page_title="EZOFIS Document Validation Agent", layout="wide")
 
 
13
  st.markdown("""
14
  <style>
15
  .block-card {
16
+ background: #fff; border-radius: 20px; box-shadow: 0 2px 16px rgba(25,39,64,0.05);
17
+ padding: 32px 26px 24px 26px; margin-bottom: 24px;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  }
19
+ .step-num {background: #A020F0; color: #fff; border-radius: 999px;
20
+ padding: 6px 13px; font-weight: 700; margin-right: 14px; font-size: 20px;
21
+ display: inline-block; vertical-align: middle;}
22
  .stButton>button {
23
+ background: #A020F0 !important; color: white !important; border-radius: 12px !important;
24
+ padding: 10px 32px !important; font-weight: 700; border: none !important; font-size: 18px !important;
 
 
 
 
 
25
  margin-top: 12px !important;
26
  }
 
 
 
 
 
 
27
  </style>
28
  """, unsafe_allow_html=True)
29
 
30
+ # ----- API Config -----
31
+ UNSTRACT_BASE = "https://llmwhisperer-api.us-central.unstract.com/api/v2"
32
+ UNSTRACT_API_KEY = os.getenv("UNSTRACT_API_KEY") # Set in environment
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
 
34
+ OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY") # Set in environment
35
+ OPENROUTER_URL = "https://openrouter.ai/api/v1/chat/completions"
36
+ GEMMA_MODEL = "google/gemma-3-4b-it:free"
 
 
 
 
 
 
 
 
 
 
37
 
38
+ # =========== UI ===========
39
+ st.markdown(
40
+ "<h1 style='font-weight:800; margin-bottom:8px;'>EZOFIS Document Validation Agent</h1>",
41
+ unsafe_allow_html=True
42
+ )
43
+ st.markdown(
44
+ "<div style='font-size:20px; margin-bottom:28px; color:#24345C;'>Check document submissions against mortgage checklist with AI.</div>",
45
+ unsafe_allow_html=True
46
+ )
47
 
48
+ # ===== Step 1: Checklist JSON input =====
49
+ st.markdown("<span class='step-num'>1</span> <b>Paste Mortgage Checklist (JSON)</b>", unsafe_allow_html=True)
50
+ sample_checklist = '''{
51
+ "required_documents": [
52
+ {"type": "Driver's License", "description": "Government-issued photo ID"},
53
+ {"type": "Passport", "description": "Valid passport"},
54
+ {"type": "SIN Card", "description": "Social Insurance Number document"},
55
+ {"type": "Bank Statement", "description": "Last 3 months bank statement"},
56
+ {"type": "Employment Letter", "description": "Signed letter from employer"},
57
+ {"type": "Pay Stub", "description": "Most recent pay stub"},
58
+ {"type": "Proof of Address", "description": "Utility bill or lease"}
59
+ ]
60
+ }'''
61
+ checklist_text = st.text_area(
62
+ "Paste or edit your mortgage checklist JSON below:",
63
+ value=sample_checklist,
64
+ height=200,
65
+ key="doc_checklist_json"
66
+ )
67
+ # Parse checklist
68
+ try:
69
+ checklist = json.loads(checklist_text)
70
+ required_types = [doc["type"] for doc in checklist["required_documents"]]
71
+ except Exception as e:
72
+ st.error("Invalid checklist JSON.")
73
+ st.stop()
74
+
75
+ # ===== Step 2: Document upload =====
76
+ st.markdown("<span class='step-num'>2</span> <b>Upload Document(s) to Validate</b>", unsafe_allow_html=True)
77
+ uploaded_files = st.file_uploader(
78
+ "Upload PDF, DOCX, XLSX, PNG, JPG, TIFF, etc.",
79
+ type=["pdf", "docx", "xlsx", "xls", "png", "jpg", "jpeg", "tiff"],
80
+ key="mortgage_files",
81
+ accept_multiple_files=True
82
+ )
83
 
84
+ # ===== Utilities =====
85
  def get_content_type(filename):
86
  mime, _ = mimetypes.guess_type(filename)
87
  ext = filename.lower().split('.')[-1]
 
91
  return "application/octet-stream"
92
  return mime
93
 
 
 
 
94
  def extract_text_from_unstract(uploaded_file):
95
  filename = getattr(uploaded_file, "name", "uploaded_file")
96
  file_bytes = uploaded_file.read()
 
100
  "Content-Type": content_type,
101
  }
102
  url = f"{UNSTRACT_BASE}/whisper"
103
+ with st.spinner("Uploading and extracting with Unstract..."):
104
  r = requests.post(url, headers=headers, data=file_bytes)
105
  if r.status_code != 202:
106
+ st.error(f"Unstract error: {r.status_code} - {r.text}")
107
  return None
108
  whisper_hash = r.json().get("whisper_hash")
109
  if not whisper_hash:
110
  st.error("Unstract: No whisper_hash received.")
111
  return None
112
 
113
+ # Poll for status
114
  status_url = f"{UNSTRACT_BASE}/whisper-status?whisper_hash={whisper_hash}"
 
115
  for i in range(30):
116
  status_r = requests.get(status_url, headers={"unstract-key": UNSTRACT_API_KEY})
117
  if status_r.status_code != 200:
118
+ st.error(f"Unstract status error: {status_r.status_code} - {status_r.text}")
119
  return None
120
  status = status_r.json().get("status")
121
  if status == "processed":
 
122
  break
 
123
  time.sleep(2)
124
  else:
125
+ st.error("Unstract: Timeout waiting for OCR.")
126
  return None
127
 
128
  retrieve_url = f"{UNSTRACT_BASE}/whisper-retrieve?whisper_hash={whisper_hash}&text_only=true"
129
  r = requests.get(retrieve_url, headers={"unstract-key": UNSTRACT_API_KEY})
130
  if r.status_code != 200:
131
+ st.error(f"Unstract: Error retrieving text: {r.status_code} - {r.text}")
132
  return None
133
  try:
134
  data = r.json()
 
136
  except Exception:
137
  return r.text
138
 
139
+ def fuzzy_match_type(detected_type, checklist_types):
140
+ # Returns best match and score
141
+ best_type = None
142
+ best_score = 0
143
+ for t in checklist_types:
144
+ score = fuzz.token_set_ratio(str(detected_type), str(t))
145
+ if score > best_score:
146
+ best_type = t
147
+ best_score = score
148
+ return best_type, best_score
149
+
150
+ def query_gemma_llm(doc_text, checklist_json):
151
+ prompt = f"""
152
+ Read the following extracted document text and analyze according to this checklist JSON:
153
+ {json.dumps(checklist_json)}
154
+
155
+ Can you read from this text, what type of document it is such as Certificate, License, Passport, etc and Also find the expiry date of it from the text, If you don't find the expiry date text but if you found any other code such as MRZ then find the expiry date from that. Also by the look of it give your verdict whether this is genuine with a confidence score. Also if the current date is 21st June 2025 then check whether the document is already expired or valid.
156
+
157
+ Return your output as a JSON like:
158
+ {{
159
+ "document_type": "...",
160
+ "expiry_date": "...",
161
+ "is_expired": true/false,
162
+ "looks_genuine": true/false,
163
+ "confidence": <score 0-100>,
164
+ "verdict": "...reasoned verdict..."
165
+ }}
166
+ Document Text:
167
+ {doc_text[:4000]}
168
+ """.strip()
169
 
170
+ headers = {
171
+ "Authorization": f"Bearer {OPENROUTER_API_KEY}",
172
+ "HTTP-Referer": "https://chat.openai.com", # Some openrouter models require this
173
+ "X-Title": "EZOFIS-Doc-Validator",
174
+ "Content-Type": "application/json",
175
+ }
176
+ data = {
177
+ "model": GEMMA_MODEL,
178
+ "messages": [{"role": "user", "content": prompt}],
179
+ "temperature": 0.1,
180
+ "max_tokens": 1024
181
+ }
182
+ with st.spinner("Gemma LLM is validating the document..."):
183
+ resp = requests.post(OPENROUTER_URL, headers=headers, json=data, timeout=90)
184
+ if resp.status_code != 200:
185
+ st.error(f"OpenRouter error: {resp.status_code}: {resp.text}")
186
+ return None
187
+ result = resp.json()["choices"][0]["message"]["content"]
188
+ # Extract only JSON
189
+ start = result.find("{")
190
+ end = result.rfind("}") + 1
191
+ if start == -1 or end == 0:
192
+ st.error("Gemma did not return JSON.")
193
+ st.code(result)
194
+ return None
195
+ try:
196
+ return json.loads(result[start:end])
197
+ except Exception as e:
198
+ st.error("Error parsing LLM response.")
199
+ st.code(result)
200
+ return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
201
 
202
+ # ========== Step 3: Run Validation ==========
203
+ if st.button("Run Document Validation", type="primary") and uploaded_files:
204
+ results = []
205
+ for uploaded_file in uploaded_files:
206
+ st.subheader(f"Validating: {uploaded_file.name}")
207
+ # Extract text
208
+ doc_text = extract_text_from_unstract(uploaded_file)
209
+ if not doc_text:
210
+ st.warning("Skipping due to extraction error.")
211
+ continue
212
+ # Query LLM
213
+ llm_json = query_gemma_llm(doc_text, checklist)
214
+ if not llm_json:
215
+ st.warning("Skipping due to LLM error.")
216
+ continue
217
+ # Fuzzy match doc type with checklist
218
+ detected_type = llm_json.get("document_type", "")
219
+ matched_type, match_score = fuzzy_match_type(detected_type, required_types)
220
+ # Acceptance logic
221
+ accepted = (
222
+ matched_type is not None and match_score >= 70 and
223
+ llm_json.get("looks_genuine", False) and
224
+ not llm_json.get("is_expired", False)
225
+ )
226
+ reason = []
227
+ reason.append(
228
+ f"Document type '{detected_type}' matched checklist '{matched_type}' with score {match_score}/100." if matched_type else
229
+ f"Document type '{detected_type}' did not match any required type."
230
+ )
231
+ reason.append(
232
+ f"Genuineness confidence: {llm_json.get('confidence', 0)}."
233
+ )
234
+ reason.append(
235
+ "Document is not expired." if not llm_json.get("is_expired", False) else "Document is expired."
236
+ )
237
+ reason.append(llm_json.get("verdict", ""))
238
+ results.append({
239
+ "File": uploaded_file.name,
240
+ "Detected Type": detected_type,
241
+ "Checklist Match": matched_type or "-",
242
+ "Type Score": match_score,
243
+ "Expiry Date": llm_json.get("expiry_date", "-"),
244
+ "Expired": "Yes" if llm_json.get("is_expired", False) else "No",
245
+ "Genuine": "Yes" if llm_json.get("looks_genuine", False) else "No",
246
+ "Confidence": llm_json.get("confidence", "-"),
247
+ "Accepted": "Yes" if accepted else "No",
248
+ "Reason": " ".join(reason)
249
+ })
250
+ if results:
251
+ st.success("Validation Complete.")
252
+ st.dataframe(pd.DataFrame(results))
253
+ else:
254
+ st.warning("No valid results.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
255
 
256
+ # Debugging
257
  if "last_api" in st.session_state:
258
+ with st.expander("Debug (LLM raw output)"):
259
  st.code(st.session_state.last_api)