Seth0330 committed on
Commit
46902a8
·
verified ·
1 Parent(s): 2cf6487

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +421 -195
app.py CHANGED
@@ -6,122 +6,58 @@ import os
6
  import time
7
  import mimetypes
8
  import pandas as pd
9
-
10
  from langchain_community.chat_models import ChatOpenAI
11
  from langchain.agents import initialize_agent, Tool, AgentType
12
  from fuzzywuzzy import fuzz
13
 
14
- # ---- Custom CSS to hide status and streamline look ----
15
- st.markdown("""
16
- <style>
17
- header[data-testid="stHeader"] {visibility: hidden;}
18
- #MainMenu, .stDeployButton {visibility: hidden;}
19
- .st-bb, .st-c6, .stDataFrameContainer, .stDataFrame {background: transparent !important;}
20
- .stButton>button {
21
- background: linear-gradient(90deg, #1e88e5 0%, #0057b8 100%);
22
- color: #fff !important;
23
- border-radius: 8px !important;
24
- font-weight: 600;
25
- border: none;
26
- box-shadow: 0 2px 8px rgba(30,136,229,0.15);
27
  }
28
- .stButton>button:hover {background: #1565c0;}
29
- </style>
30
- """, unsafe_allow_html=True)
31
-
32
- # ---- Sidebar ----
33
- with st.sidebar:
34
- st.markdown("<div style='font-size:1.25em; font-weight:700; margin-bottom:0.2em; margin-top:0.7em;'>Step 1: Upload Active Purchase Orders (POs)</div>", unsafe_allow_html=True)
35
- st.markdown("<div style='color:#eee; margin-bottom:1.1em; font-size:1em;'>Upload a POs CSV (must include PO number, Supplier, Items, etc.)</div>", unsafe_allow_html=True)
36
- po_file = st.file_uploader(
37
- "", type=["csv"], key="po_csv", label_visibility="collapsed"
38
- )
39
- po_df = None
40
- if po_file:
41
- po_df = pd.read_csv(po_file)
42
- st.session_state['last_po_df'] = po_df
43
- st.success(f"{len(po_df)} rows uploaded and active.", icon="✅")
44
- else:
45
- st.markdown("<span style='color:#bbc2cf; font-size:0.9em'>No PO file uploaded yet.</span>", unsafe_allow_html=True)
46
-
47
- st.markdown("<hr style='border:0.5px solid #324259; margin:2em 0 1em 0;'/>")
48
- st.markdown("<span style='color:#b6b8bc; font-size:0.93em;'>Need help? <b>Contact your admin</b></span>", unsafe_allow_html=True)
49
-
50
- # ---- Scoring Weights Section ----
51
- st.markdown('<div style="font-size:2rem;font-weight:700;color:#1e2a3a;margin-bottom:0.2em;margin-top:0.5em;">Invoice/Document Extractor</div>', unsafe_allow_html=True)
52
- st.markdown('<div style="color:#6073a3; margin-bottom:1.3em;">Digitally process and approve invoices with AI-powered PO matching.</div>', unsafe_allow_html=True)
53
-
54
- with st.container():
55
- st.markdown('<div style="background:#fff;border-radius:14px;box-shadow:0 4px 32px rgba(34,48,90,0.09),0 1.5px 3.5px rgba(30,136,229,0.07);padding:2rem 2.5rem 1.5rem 2.5rem;margin-bottom:1.5em;">', unsafe_allow_html=True)
56
- st.markdown("<h3>Set Scoring Weights (Total = 100%)</h3>", unsafe_allow_html=True)
57
- if "scoring_weights" not in st.session_state:
58
- st.session_state.scoring_weights = {
59
- "Supplier": 20,
60
- "PO Number": 25,
61
- "Currency": 10,
62
- "Total Due": 25,
63
- "Line Item": 20,
64
- }
65
- scoring_weights = st.session_state.scoring_weights
66
- total_weight = 0
67
- cols = st.columns(len(scoring_weights))
68
- field_keys = list(scoring_weights.keys())
69
- for i, field in enumerate(field_keys):
70
- val = cols[i].number_input(
71
- f"{field} (%)",
72
- min_value=0, max_value=100,
73
- value=int(scoring_weights[field]),
74
- key=f"scoring_{field}",
75
- step=1,
76
- format="%d"
77
- )
78
- scoring_weights[field] = val
79
- total_weight += val
80
- st.markdown(
81
- f"<span style='font-size:1em; color:{'#E53935' if total_weight != 100 else '#3BB273'}; font-weight:600;'>"
82
- f"Total = {total_weight}/100</span>",
83
- unsafe_allow_html=True
84
- )
85
- if total_weight != 100:
86
- st.warning("Scoring weights must sum to 100!", icon="⚠️")
87
- st.markdown("</div>", unsafe_allow_html=True)
88
-
89
- # ---- Upload Invoice/Document ----
90
- with st.container():
91
- st.markdown('<div style="background:#fff;border-radius:14px;box-shadow:0 4px 32px rgba(34,48,90,0.09),0 1.5px 3.5px rgba(30,136,229,0.07);padding:2rem 2.5rem 1.5rem 2.5rem;margin-bottom:1.5em;">', unsafe_allow_html=True)
92
- st.markdown("<h3>Step 2: Upload Invoice or Document</h3>", unsafe_allow_html=True)
93
- # -- NO model dropdown! --
94
- inv_file = st.file_uploader(
95
- "", type=["pdf", "docx", "xlsx", "xls", "png", "jpg", "jpeg", "tiff"], label_visibility="collapsed"
96
- )
97
- st.markdown("</div>", unsafe_allow_html=True)
98
 
99
- # ========== BUSINESS LOGIC FUNCTIONS (INSERTS) ==========
100
- def get_api_key():
101
- key = os.getenv("OPENAI_API_KEY")
102
  if not key:
103
- st.error("❌ OPENAI_API_KEY not set")
104
  st.stop()
105
  return key
106
 
107
- def query_llm(prompt):
108
- api_url = "https://api.openai.com/v1/chat/completions"
109
  headers = {
110
- "Authorization": f"Bearer {get_api_key()}",
111
  "Content-Type": "application/json",
112
  }
 
 
113
  payload = {
114
- "model": "gpt-4-1106-preview",
115
  "messages": [{"role": "user", "content": prompt}],
116
  "temperature": 0.1,
117
  "max_tokens": 2000,
118
  }
119
- with st.spinner(f"🔍 Fine Tuning The Extracted Data..."):
120
- r = requests.post(api_url, headers=headers, json=payload, timeout=90)
121
- if r.status_code != 200:
122
- st.error(f"🚨 API Error {r.status_code}: {r.text}")
 
 
 
 
 
 
 
 
 
 
123
  return None
124
- return r.json()["choices"][0]["message"]["content"]
125
 
126
  def clean_json_response(text):
127
  if not text:
@@ -146,28 +82,102 @@ def clean_json_response(text):
146
  st.code(frag)
147
  return None
148
 
149
- def get_extraction_prompt(txt):
 
 
 
 
 
 
 
150
  return (
151
- "You are an expert invoice parser. Extract data according to the visible table structure and column headers in the invoice. "
152
- "For every line item, only extract fields that correspond to the table columns for that row. "
 
 
 
153
  "Use this schema:\n"
154
- '{ "invoice_header": {"supplier_name":"string", "po_number":"string", "currency":"string", "total_due":"string"}, "line_items": [{"description":"string", "quantity":"string", "price":"string", "amount":"string"}] }'
155
- "\nIf a field is missing, use null. Return ONLY the JSON object, no explanation.\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
156
  "\nInvoice Text:\n"
157
  f"{txt}"
158
  )
159
 
160
- def extract_invoice_info(text):
161
- prompt = get_extraction_prompt(text)
162
- raw = query_llm(prompt)
163
- if not raw:
164
- return None
165
- data = clean_json_response(raw)
166
- if not data:
167
- return None
168
- hdr = data.get("invoice_header", {})
169
- items = data.get("line_items", [])
170
- return {"invoice_header": hdr, "line_items": items}
171
 
172
  def clean_num(val):
173
  if val is None:
@@ -187,85 +197,246 @@ def weighted_fuzzy_score(s1, s2):
187
  return 100
188
  return fuzz.token_set_ratio(str(s1).lower(), str(s2).lower())
189
 
190
- def find_po_number_anywhere(inv_json, po_number):
191
- if not po_number or not inv_json:
192
- return False
193
- po_str = str(po_number).replace(",", "").replace(".0", "")
194
- flat = json.dumps(inv_json)
195
- return po_str in flat.replace(",", "").replace(".0", "")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
196
 
197
- def find_best_po_match(inv, po_df, weights):
 
 
 
 
 
198
  inv_hdr = inv["invoice_header"]
199
  inv_supplier = inv_hdr.get("supplier_name") or ""
 
200
  inv_currency = inv_hdr.get("currency") or ""
201
  inv_total_due = clean_num(inv_hdr.get("total_due"))
202
  inv_line_items = inv.get("line_items", [])
 
203
  scores = []
204
  for idx, row in po_df.iterrows():
205
  po_supplier = row.get("Supplier Name", "")
206
  po_po_number = str(row.get("PO Number", ""))
207
  po_currency = row.get("Currency", "")
208
  po_total = clean_num(row.get("PO Total Value", ""))
209
- # --- SCORING FIELDS ---
 
 
 
 
 
 
210
  s_supplier = weighted_fuzzy_score(inv_supplier, po_supplier)
211
- po_number_in_json = find_po_number_anywhere(inv, po_po_number)
212
- s_po_number = 100 if po_number_in_json else 0
 
 
 
 
 
 
 
 
 
 
 
 
 
213
  s_currency = weighted_fuzzy_score(inv_currency, po_currency)
 
 
 
 
 
 
 
214
  s_total = 100 if inv_total_due is not None and po_total is not None and abs(inv_total_due - po_total) < 2 else 0
215
- # --- LINE ITEM MATCH (basic) ---
 
 
 
 
 
 
 
216
  line_item_score = 0
 
 
217
  for line in inv_line_items:
218
- desc_score = weighted_fuzzy_score(line.get("description", ""), row.get("Item Description", ""))
219
- qty_score = 100 if clean_num(line.get("quantity")) == clean_num(row.get("Item Quantity", "")) else 0
220
- unit_score = 100 if clean_num(line.get("price")) == clean_num(row.get("Item Unit Price", "")) else 0
221
- amount_score = 100 if clean_num(line.get("amount")) == clean_num(row.get("Line Item Total", "")) else 0
222
  total = desc_score * 0.5 + qty_score * 0.2 + unit_score * 0.15 + amount_score * 0.15
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
223
  if total > line_item_score:
224
  line_item_score = total
225
- # -- WEIGHTED FINAL SCORE --
 
 
 
 
 
 
226
  total_score = (
227
- s_supplier * (weights["Supplier"]/100) +
228
- s_po_number * (weights["PO Number"]/100) +
229
- s_currency * (weights["Currency"]/100) +
230
- s_total * (weights["Total Due"]/100) +
231
- line_item_score * (weights["Line Item"]/100)
232
- )
 
233
  reason = (
234
- f"Supplier match: {s_supplier}/100, "
235
- f"PO Number: {s_po_number}/100, "
236
- f"Currency: {s_currency}/100, "
237
- f"Total Due: {'match' if s_total else 'no match'}, "
238
- f"Line item best match: {int(line_item_score)}/100."
239
  )
 
240
  debug = {
241
  "po_idx": idx,
242
- "scores": [
243
- {"field":"Supplier","score":s_supplier},
244
- {"field":"PO Number (anywhere in JSON)","score":s_po_number},
245
- {"field":"Currency","score":s_currency},
246
- {"field":"Total Due","score":s_total},
247
- {"field":"Line Item","score":line_item_score}
248
- ],
249
- "total_score": total_score
 
250
  }
251
  scores.append((row, total_score, reason, debug))
 
252
  scores.sort(key=lambda tup: tup[1], reverse=True)
253
  if not scores:
254
  return None, 0, "No POs found.", {}
255
  best_row, best_score, reason, debug = scores[0]
256
  return best_row, best_score, reason, debug
257
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
258
  def extract_text_from_unstract(uploaded_file):
259
  filename = getattr(uploaded_file, "name", "uploaded_file")
260
  file_bytes = uploaded_file.read()
261
- content_type = "application/octet-stream"
262
- if filename.lower().endswith(".pdf"):
263
- content_type = "text/plain"
264
  headers = {
265
- "unstract-key": os.getenv("UNSTRACT_API_KEY"),
266
  "Content-Type": content_type,
267
  }
268
- url = "https://llmwhisperer-api.us-central.unstract.com/api/v2/whisper"
269
  with st.spinner("Uploading and processing document with EZOFIS AI OCR AGENT..."):
270
  r = requests.post(url, headers=headers, data=file_bytes)
271
  if r.status_code != 202:
@@ -275,10 +446,11 @@ def extract_text_from_unstract(uploaded_file):
275
  if not whisper_hash:
276
  st.error("Unstract: No whisper_hash received.")
277
  return None
278
- status_url = f"https://llmwhisperer-api.us-central.unstract.com/api/v2/whisper-status?whisper_hash={whisper_hash}"
 
279
  status_placeholder = st.empty()
280
  for i in range(30):
281
- status_r = requests.get(status_url, headers={"unstract-key": os.getenv("UNSTRACT_API_KEY")})
282
  if status_r.status_code != 200:
283
  st.error(f"Unstract: Error checking status: {status_r.status_code} - {status_r.text}")
284
  return None
@@ -291,8 +463,9 @@ def extract_text_from_unstract(uploaded_file):
291
  else:
292
  status_placeholder.error("Unstract: Timeout waiting for OCR to finish.")
293
  return None
294
- retrieve_url = f"https://llmwhisperer-api.us-central.unstract.com/api/v2/whisper-retrieve?whisper_hash={whisper_hash}&text_only=true"
295
- r = requests.get(retrieve_url, headers={"unstract-key": os.getenv("UNSTRACT_API_KEY")})
 
296
  if r.status_code != 200:
297
  st.error(f"Unstract: Error retrieving extracted text: {r.status_code} - {r.text}")
298
  return None
@@ -302,52 +475,105 @@ def extract_text_from_unstract(uploaded_file):
302
  except Exception:
303
  return r.text
304
 
305
- # ========== END BUSINESS LOGIC ==========
 
 
 
 
 
 
306
 
307
- # ---- Extraction/Decision Main UI ----
308
  if st.button("Extract") and inv_file:
309
  with st.spinner("Extracting text from document using Unstract..."):
310
  text = extract_text_from_unstract(inv_file)
311
  if text:
312
- extracted_info = extract_invoice_info(text)
313
  if extracted_info:
314
- st.success("Extraction Complete!", icon="✅")
315
- st.markdown('<div style="background:#fff;border-radius:14px;box-shadow:0 4px 32px rgba(34,48,90,0.09),0 1.5px 3.5px rgba(30,136,229,0.07);padding:2rem 2.5rem 1.5rem 2.5rem;margin-bottom:1.5em;">', unsafe_allow_html=True)
316
- st.markdown("#### Invoice Metadata")
317
- st.json(extracted_info["invoice_header"])
318
- st.markdown("#### Line Items")
319
- st.json(extracted_info["line_items"])
 
320
  st.session_state['last_extracted_info'] = extracted_info
321
- st.markdown("</div>", unsafe_allow_html=True)
322
 
323
  extracted_info = st.session_state.get('last_extracted_info', None)
324
  po_df = st.session_state.get('last_po_df', None)
325
- scoring_weights = st.session_state.get("scoring_weights", {
326
- "Supplier": 20,
327
- "PO Number": 25,
328
- "Currency": 10,
329
- "Total Due": 25,
330
- "Line Item": 20,
331
- })
332
-
333
- if extracted_info is not None and po_df is not None and sum(scoring_weights.values()) == 100:
334
- st.markdown('<div style="background:#fff;border-radius:14px;box-shadow:0 4px 32px rgba(34,48,90,0.09),0 1.5px 3.5px rgba(30,136,229,0.07);padding:2rem 2.5rem 1.5rem 2.5rem;margin-bottom:1.5em;">', unsafe_allow_html=True)
335
- st.markdown("<h3>EZOFIS AP AGENT Decision</h3>", unsafe_allow_html=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
336
  if st.button("Make a decision (EZOFIS AP AGENT)"):
337
- # Smart PO matching
338
- best_row, best_score, reason, debug = find_best_po_match(extracted_info, po_df, scoring_weights)
339
- if best_score > 85:
340
- status = "APPROVED"
341
- elif best_score > 70:
342
- status = "PARTIALLY APPROVED"
343
- else:
344
- status = "REJECTED"
345
- st.write(f"**Decision:** {status}")
346
- st.write(f"**Reason:** Best match score: {int(best_score)}/100. {reason}")
347
- with st.expander("Debug & Matching Details"):
348
- st.json(debug)
349
- st.subheader("Extracted Invoice JSON")
350
- st.json(extracted_info)
351
- st.subheader("Matched PO Row")
352
- st.json(best_row.to_dict() if best_row is not None else None)
353
- st.markdown("</div>", unsafe_allow_html=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  import time
7
  import mimetypes
8
  import pandas as pd
 
9
  from langchain_community.chat_models import ChatOpenAI
10
  from langchain.agents import initialize_agent, Tool, AgentType
11
  from fuzzywuzzy import fuzz
12
 
13
st.set_page_config(page_title="Accounts Payable AI Agent", layout="wide")

# Registry of chat-completion backends selectable in the UI.
# Each entry provides: the endpoint URL, the model id sent in the payload,
# the environment variable holding the API key, an optional
# ``response_format`` payload field, and extra HTTP headers to merge in.
# NOTE(review): the label reads "GPT-4.1" while the model id is
# "gpt-4-1106-preview" -- confirm which model is actually intended.
MODELS = {
    "OpenAI GPT-4.1": {
        "api_url": "https://api.openai.com/v1/chat/completions",
        "model": "gpt-4-1106-preview",
        "key_env": "OPENAI_API_KEY",
        "response_format": None,
        "extra_headers": {},
    },
}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
 
25
def get_api_key(model_choice):
    """Return the API key configured for *model_choice*.

    The environment variable name comes from ``MODELS[model_choice]["key_env"]``.
    If that variable is unset or empty, an error is shown in the UI and
    ``st.stop()`` is called to end the current script run.
    """
    env_name = MODELS[model_choice]["key_env"]
    api_key = os.getenv(env_name)
    if api_key:
        return api_key
    st.error(f"❌ {env_name} not set")
    st.stop()
    return api_key
31
 
32
def query_llm(model_choice, prompt):
    """Send *prompt* to the chat-completions endpoint of *model_choice*.

    Args:
        model_choice: Key into ``MODELS`` selecting endpoint, model id and key.
        prompt: User message content sent as the single chat message.

    Returns:
        The assistant message content as a string, or ``None`` when the
        request fails, the API answers with a non-200 status, or the
        response body does not have the expected shape. Every failure is
        reported through ``st.error``.

    Side effects:
        On success stores the content in ``st.session_state.last_api`` and
        the raw response text in ``st.session_state.last_raw``.
    """
    cfg = MODELS[model_choice]
    headers = {
        "Authorization": f"Bearer {get_api_key(model_choice)}",
        "Content-Type": "application/json",
    }
    headers.update(cfg.get("extra_headers") or {})

    payload = {
        "model": cfg["model"],
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0.1,
        "max_tokens": 2000,
    }
    if cfg.get("response_format"):
        payload["response_format"] = cfg["response_format"]

    # Only the network call is guarded as a connection problem, so parsing
    # failures below are not mislabelled as "Connection error".
    try:
        with st.spinner("🔍 Fine Tuning The Extracted Data..."):
            r = requests.post(cfg["api_url"], headers=headers, json=payload, timeout=90)
    except requests.RequestException as e:
        st.error(f"Connection error: {e}")
        return None

    if r.status_code != 200:
        st.error(f"🚨 API Error {r.status_code}: {r.text}")
        return None

    try:
        content = r.json()["choices"][0]["message"]["content"]
    except (ValueError, KeyError, IndexError, TypeError) as e:
        # Body was not JSON, or lacked choices[0].message.content.
        st.error(f"🚨 Unexpected API response format: {e!r}")
        return None

    st.session_state.last_api = content
    st.session_state.last_raw = r.text
    return content
 
61
 
62
  def clean_json_response(text):
63
  if not text:
 
82
  st.code(frag)
83
  return None
84
 
85
def fallback_supplier(text):
    """Return the first non-blank line of *text*, stripped, as a supplier guess.

    Args:
        text: Raw document text. Anything that is not a ``str`` (for example
            ``None``) is treated as having no lines.

    Returns:
        The first line that is non-empty after stripping, or ``None`` when
        there is no such line.
    """
    if not isinstance(text, str):
        return None
    stripped_lines = (raw_line.strip() for raw_line in text.splitlines())
    return next((candidate for candidate in stripped_lines if candidate), None)
91
+
92
# Header-level fields requested from the model, in prompt order.
_PROMPT_HEADER_FIELDS = (
    "car_number", "shipment_number", "shipping_point", "currency",
    "invoice_number", "invoice_date", "order_number", "customer_order_number",
    "our_order_number", "sales_order_number", "purchase_order_number",
    "order_date", "supplier_name", "supplier_address", "supplier_phone",
    "supplier_email", "supplier_tax_id", "customer_name", "customer_address",
    "customer_phone", "customer_email", "customer_tax_id", "ship_to_name",
    "ship_to_address", "bill_to_name", "bill_to_address", "remit_to_name",
    "remit_to_address", "tax_id", "tax_registration_number", "vat_number",
    "payment_terms", "payment_method", "payment_reference",
    "bank_account_number", "iban", "swift_code", "total_before_tax",
    "tax_amount", "tax_rate", "shipping_charges", "discount", "total_due",
    "amount_paid", "balance_due", "due_date", "invoice_status",
    "reference_number", "project_code", "department", "contact_person",
    "notes", "additional_info",
)

# Per-line-item fields requested from the model, in prompt order.
_PROMPT_LINE_ITEM_FIELDS = (
    "quantity", "units", "description", "footage", "price", "amount", "notes",
)


def _prompt_schema_rows(field_names):
    """Render one ``"name": "string or null"`` row per field, comma separated."""
    # NOTE(review): each row starts with a single space, as the text appears
    # in this view; confirm the intended indentation inside the schema text.
    rows = [f' "{field_name}": "string or null"' for field_name in field_names]
    return ",\n".join(rows) + "\n"


def get_extraction_prompt(model_choice, txt):
    """Build the invoice-extraction prompt sent to the LLM.

    Args:
        model_choice: Accepted for call-site symmetry; not used when building
            the prompt.
        txt: Invoice text appended verbatim after the instructions.

    Returns:
        A single prompt string: parsing instructions, the JSON schema with an
        ``invoice_header`` object and a ``line_items`` array, the
        null-handling rules, and finally the invoice text.
    """
    instructions = (
        "You are an expert invoice parser. "
        "Extract data according to the visible table structure and column headers in the invoice. "
        "For every line item, only extract fields that correspond to the table columns for that row (do not include header/shipment fields in line items). "
        "Merge all multi-line content within a single cell into that field (especially for the 'description' and 'notes'). "
        "Shipment/invoice-level fields such as CAR NUMBER, SHIPPING POINT, SHIPMENT NUMBER, CURRENCY, etc., must go ONLY into the 'invoice_header', not as line item fields.\n"
        "Use this schema:\n"
    )
    schema = "".join([
        '{\n',
        ' "invoice_header": {\n',
        _prompt_schema_rows(_PROMPT_HEADER_FIELDS),
        ' },\n',
        ' "line_items": [\n',
        ' {\n',
        _prompt_schema_rows(_PROMPT_LINE_ITEM_FIELDS),
        ' }\n',
        ' ]\n',
        '}',
    ])
    rules = (
        "\nIf a field is missing for a line item or header, use null. "
        "Do not invent fields. Do not add any header or shipment data to any line item. Return ONLY the JSON object, no explanation.\n"
        "\nInvoice Text:\n"
    )
    return instructions + schema + rules + f"{txt}"
173
 
174
def ensure_total_due(invoice_header):
    """Fill a missing ``total_due`` from the first available fallback field.

    ``total_due`` counts as missing when it is absent, ``None``, empty, or a
    whitespace-only string. Fallbacks are tried in this order:
    ``invoice_total``, ``invoice_value``, ``total_before_tax``,
    ``balance_due``, ``amount_paid``; the first truthy, non-blank value wins.

    Args:
        invoice_header: Header dict; it is modified in place. Any non-dict
            value is returned unchanged.

    Returns:
        The same ``invoice_header`` object that was passed in.
    """

    def _is_blank(value):
        # Whitespace-only strings carry no amount, so treat them as missing.
        return value is None or (isinstance(value, str) and not value.strip())

    if not isinstance(invoice_header, dict):
        return invoice_header

    if _is_blank(invoice_header.get("total_due")):
        fallback_fields = (
            "invoice_total",
            "invoice_value",
            "total_before_tax",
            "balance_due",
            "amount_paid",
        )
        for field in fallback_fields:
            candidate = invoice_header.get(field)
            if candidate and not _is_blank(candidate):
                invoice_header["total_due"] = candidate
                break
    return invoice_header
 
 
 
 
181
 
182
  def clean_num(val):
183
  if val is None:
 
197
  return 100
198
  return fuzz.token_set_ratio(str(s1).lower(), str(s2).lower())
199
 
200
def find_po_number_in_json(po_number, invoice_json):
    """Report whether *po_number* appears in any leaf value of *invoice_json*.

    The PO number is normalised (whitespace removed; a purely numeric value
    such as ``"45001.0"`` loses its zero decimal part) and then searched for,
    case-insensitively, as a delimited token inside every non-null leaf of
    the nested dict/list structure. A leaf also matches when it equals the
    PO number once its own whitespace is removed (e.g. ``"45 001"``).

    Token boundaries prevent the false positives of a plain substring test:
    the PO number must not be glued to further digits (``"123"`` does not
    match ``"91234"``), must not be the integer part of a different decimal
    (``"45001"`` does not match ``"45001.50"``), and short invoice values
    (such as a quantity of ``"1"``) never match merely because they occur
    inside the PO number.

    Args:
        po_number: PO identifier from the PO table (string or number).
        invoice_json: Extracted invoice structure (nested dicts/lists).

    Returns:
        ``True`` if the PO number is found, otherwise ``False``. Empty or
        placeholder PO values (``""``, ``"nan"``, ``"none"``, ``"null"``)
        never match.
    """
    import re  # local import: keeps this block independent of the file header

    def _leaf_strings(obj):
        # Depth-first walk yielding str() of every non-None leaf.
        if isinstance(obj, dict):
            for value in obj.values():
                yield from _leaf_strings(value)
        elif isinstance(obj, list):
            for item in obj:
                yield from _leaf_strings(item)
        elif obj is not None:
            yield str(obj)

    def _compact(value):
        # Drop all whitespace; collapse "123.0"/"123.00" to "123".
        squeezed = re.sub(r"\s+", "", value)
        if re.fullmatch(r"\d+\.0+", squeezed):
            squeezed = squeezed.split(".", 1)[0]
        return squeezed

    if po_number is None:
        return False
    po_str = _compact(str(po_number))
    if not po_str or po_str.lower() in {"nan", "none", "null"}:
        return False

    # A digit edge must not touch more digits or a decimal part; any other
    # edge must not touch further letters or digits.
    if po_str[0].isdigit():
        left_guard = r"(?<!\d)(?<!\d[.,])"
    else:
        left_guard = r"(?<![A-Za-z0-9])"
    if po_str[-1].isdigit():
        right_guard = r"(?:\.0+)?(?![.,]?\d)"
    else:
        right_guard = r"(?![A-Za-z0-9])"
    pattern = re.compile(left_guard + re.escape(po_str) + right_guard, re.IGNORECASE)

    po_folded = po_str.lower()
    for leaf in _leaf_strings(invoice_json):
        if pattern.search(leaf):
            return True
        if _compact(leaf).lower() == po_folded:
            return True
    return False
228
+
229
# --- Step 1: Upload POs CSV (very top) ---
st.sidebar.header("Step 1: Upload Active Purchase Orders (POs)")
po_file = st.sidebar.file_uploader(
    "Upload POs CSV (must include PO number, Supplier, Items, etc.)",
    type=["csv"],
    key="po_csv",
)
po_df = None
if po_file:
    po_df = pd.read_csv(po_file)
    st.sidebar.success(f"Loaded {len(po_df)} rows from uploaded CSV.")
    st.sidebar.dataframe(po_df.head())
    st.session_state['last_po_df'] = po_df  # Save PO to session

# --- Set Scoring Weights (Total = 100%) ---
st.sidebar.header("Set Scoring Weights (Total = 100%)")


def int_slider(label, value, key):
    """Render a 0-100 integer sidebar slider (step 1) and return its value."""
    return st.sidebar.slider(label, 0, 100, value, 1, key=key, format="%d")


weight_supplier = int_slider("Supplier Name Weight (%)", 25, "w_supplier")
weight_po_number = int_slider("PO Number Weight (%)", 25, "w_po")
weight_currency = int_slider("Currency Weight (%)", 10, "w_curr")
weight_total_due = int_slider("Total Due Weight (%)", 20, "w_due")
weight_line_item = int_slider("Line Item Weight (%)", 20, "w_line")
weight_sum = weight_supplier + weight_po_number + weight_currency + weight_total_due + weight_line_item
if weight_sum != 100:
    st.sidebar.warning(f"Sum of weights is {weight_sum}%. Adjust so it equals 100%.")

# --- Thresholds for decision ---
st.sidebar.header("Set Decision Thresholds")
approved_threshold = st.sidebar.slider("Threshold for 'APPROVED'", min_value=0, max_value=100, value=85, format="%d")
if approved_threshold > 1:
    # The partial threshold must stay below the approved one. Clamp the
    # default so it never exceeds the slider's maximum when the approved
    # threshold is lowered to 70 or less.
    partial_max = approved_threshold - 1
    partial_threshold = st.sidebar.slider(
        "Threshold for 'PARTIALLY APPROVED'",
        min_value=0,
        max_value=partial_max,
        value=min(70, partial_max),
        format="%d",
    )
else:
    # No room left for a slider range below the approved threshold.
    partial_threshold = 0
262
+
263
def find_best_po_match(inv, po_df):
    """Score every PO row against the extracted invoice and return the best.

    For each row of *po_df* five component scores (0-100) are computed:
    supplier name (fuzzy), PO number (found anywhere in the invoice JSON),
    currency (fuzzy), total due (within 2 of the PO total) and the best
    matching invoice line item (description 50%, quantity 20%, unit price
    15%, amount 15%). The components are combined with the sidebar weights
    ``weight_supplier``, ``weight_po_number``, ``weight_currency``,
    ``weight_total_due`` and ``weight_line_item``.

    Args:
        inv: Extracted invoice dict with ``invoice_header`` and ``line_items``.
        po_df: DataFrame of purchase orders. Columns read: "Supplier Name",
            "PO Number", "Currency", "PO Total Value", "Item Description",
            "Item Quantity", "Item Unit Price", "Line Item Total".

    Returns:
        ``(best_row, best_score, reason, debug)`` for the highest-scoring
        row (the first one on ties), or ``(None, 0, "No POs found.", {})``
        when *po_df* has no rows. If the weights do not sum to 100 the total
        score is 0 and the reason says so explicitly.
    """

    def _numbers_match(left, right):
        # Two missing values must not be scored as agreement.
        return left is not None and right is not None and left == right

    inv_hdr = inv.get("invoice_header") or {}
    inv_supplier = inv_hdr.get("supplier_name") or ""
    inv_currency = inv_hdr.get("currency") or ""
    inv_total_due = clean_num(inv_hdr.get("total_due"))
    # Ignore malformed (non-dict) entries the extractor may have produced.
    inv_line_items = [
        entry for entry in (inv.get("line_items") or []) if isinstance(entry, dict)
    ]

    # Weights are the same for every row, so validate them once.
    wsum = weight_supplier + weight_po_number + weight_currency + weight_total_due + weight_line_item
    weights_valid = wsum == 100

    scores = []
    for idx, row in po_df.iterrows():
        po_supplier = row.get("Supplier Name", "")
        po_po_number = str(row.get("PO Number", ""))
        po_currency = row.get("Currency", "")
        po_total = clean_num(row.get("PO Total Value", ""))
        po_desc = row.get("Item Description", "")
        po_qty = str(row.get("Item Quantity", ""))
        po_unit = str(row.get("Item Unit Price", ""))
        po_line_total = clean_num(row.get("Line Item Total", ""))
        po_qty_num = clean_num(po_qty)
        po_unit_num = clean_num(po_unit)

        s_supplier = weighted_fuzzy_score(inv_supplier, po_supplier)
        s_po_number = 100 if find_po_number_in_json(po_po_number, inv) else 0
        s_currency = weighted_fuzzy_score(inv_currency, po_currency)
        totals_close = (
            inv_total_due is not None
            and po_total is not None
            and abs(inv_total_due - po_total) < 2
        )
        s_total = 100 if totals_close else 0

        field_details = [
            {
                "field": "Supplier Name",
                "invoice": inv_supplier,
                "po": po_supplier,
                "score": s_supplier,
            },
            {
                "field": "PO Number (anywhere in JSON)",
                "invoice": "found" if s_po_number else "not found",
                "po": po_po_number,
                "score": s_po_number,
            },
            {
                "field": "Currency",
                "invoice": inv_currency,
                "po": po_currency,
                "score": s_currency,
            },
            {
                "field": "Total Due",
                "invoice": inv_total_due,
                "po": po_total,
                "score": s_total,
            },
        ]

        # Keep the single best-scoring invoice line for this PO row.
        line_item_score = 0
        line_reason = ""
        best_line_detail = None
        for line in inv_line_items:
            desc_score = weighted_fuzzy_score(line.get("description", ""), po_desc)
            qty_score = 100 if _numbers_match(clean_num(line.get("quantity")), po_qty_num) else 0
            unit_score = 100 if _numbers_match(clean_num(line.get("price")), po_unit_num) else 0
            amount_score = 100 if _numbers_match(clean_num(line.get("amount")), po_line_total) else 0
            total = desc_score * 0.5 + qty_score * 0.2 + unit_score * 0.15 + amount_score * 0.15
            if total > line_item_score:
                line_item_score = total
                best_line_detail = {
                    "field": "Line Item",
                    "invoice": {
                        "description": line.get("description", ""),
                        "quantity": line.get("quantity", ""),
                        "price": line.get("price", ""),
                        "amount": line.get("amount", ""),
                    },
                    "po": {
                        "description": po_desc,
                        "quantity": po_qty,
                        "price": po_unit,
                        "amount": po_line_total,
                    },
                    "desc_score": desc_score,
                    "qty_score": qty_score,
                    "unit_score": unit_score,
                    "amount_score": amount_score,
                    "line_item_score": total,
                }
                line_reason = (
                    f"Best line item: desc_score={desc_score}, qty_score={qty_score}, "
                    f"unit_score={unit_score}, amount_score={amount_score}"
                )

        if weights_valid:
            total_score = (
                s_supplier * weight_supplier / 100
                + s_po_number * weight_po_number / 100
                + s_currency * weight_currency / 100
                + s_total * weight_total_due / 100
                + line_item_score * weight_line_item / 100
            )
        else:
            total_score = 0

        reason = (
            f"Supplier match: {s_supplier}/100 (invoice: '{inv_supplier}' vs PO: '{po_supplier}'), "
            f"PO Number: {s_po_number}/100 ({'found anywhere in JSON' if s_po_number else 'not found'}), "
            f"Currency: {s_currency}/100 (invoice: '{inv_currency}' vs PO: '{po_currency}'), "
            f"Total Due: {'match' if s_total else 'no match'} (invoice: {inv_total_due} vs PO: {po_total}), "
            f"Line item best match: {int(line_item_score)}/100. {line_reason}"
        )
        if not weights_valid:
            # Make the forced zero visible instead of an unexplained rejection.
            reason += f" Scoring weights sum to {wsum}%, not 100%, so the total score was forced to 0."

        debug = {
            "po_idx": idx,
            "po_supplier": po_supplier,
            "po_po_number": po_po_number,
            "po_total": po_total,
            "scores": field_details,
            "line_item_score": line_item_score,
            "best_line_detail": best_line_detail,
            "total_score": total_score,
            "line_reason": line_reason,
            "inv_total_due": inv_total_due,
        }
        scores.append((row, total_score, reason, debug))

    if not scores:
        return None, 0, "No POs found.", {}
    # max() returns the first of equally scored rows, matching a stable
    # descending sort followed by taking element 0.
    best_row, best_score, reason, debug = max(scores, key=lambda entry: entry[1])
    return best_row, best_score, reason, debug
390
 
391
+ # --- Extraction, decision, and UI logic below is unchanged ---
392
+
393
def extract_invoice_info(model_choice, text):
    """Run LLM extraction on *text* and normalise the parsed result.

    Args:
        model_choice: Key into ``MODELS`` used for the LLM call.
        text: Document text to extract from.

    Returns:
        ``{"invoice_header": dict, "line_items": list[dict]}``, or ``None``
        when the LLM call fails or its answer cannot be parsed into a JSON
        object.

    Normalisation applied:
        * A flat response (header fields at the top level) is accepted as
          the header, without the ``line_items`` entry.
        * Core header and line-item keys are defaulted to ``None``.
        * A missing supplier name falls back to the first non-blank line of
          *text*; a missing ``total_due`` is filled by ``ensure_total_due``.
        * Line items that are not dicts are dropped.
        * ``price`` / ``amount`` are filled from ``unit_price`` /
          ``total_price`` when the model used those names instead.
    """
    prompt = get_extraction_prompt(model_choice, text)
    raw = query_llm(model_choice, prompt)
    if not raw:
        return None
    data = clean_json_response(raw)
    # A JSON array or scalar at the root cannot be interpreted as an invoice.
    if not data or not isinstance(data, dict):
        return None

    hdr = data.get("invoice_header")
    if not isinstance(hdr, dict):
        hdr = {}
    if not hdr and any(k in data for k in ("invoice_number", "supplier_name", "customer_name")):
        # Flat response: use the top-level fields as the header.
        hdr = {k: v for k, v in data.items() if k != "line_items"}
    for k in ("invoice_number", "invoice_date", "po_number", "invoice_value", "supplier_name", "customer_name"):
        hdr.setdefault(k, None)
    if not hdr.get("supplier_name"):
        hdr["supplier_name"] = fallback_supplier(text)
    hdr = ensure_total_due(hdr)

    raw_items = data.get("line_items", [])
    if not isinstance(raw_items, list):
        raw_items = []
    items = [itm for itm in raw_items if isinstance(itm, dict)]
    for itm in items:
        for k in ("item_number", "description", "quantity", "unit_price", "total_price"):
            itm.setdefault(k, None)
        # The prompt schema names these "price"/"amount"; bridge the older
        # names so both spellings carry the value.
        if itm.get("price") is None and itm.get("unit_price") is not None:
            itm["price"] = itm["unit_price"]
        if itm.get("amount") is None and itm.get("total_price") is not None:
            itm["amount"] = itm["total_price"]
    return {"invoice_header": hdr, "line_items": items}
418
+
419
def get_content_type(filename):
    """Pick the Content-Type header value for an uploaded file name.

    Names whose last dot-separated segment is ``pdf`` (case-insensitive)
    map to ``"text/plain"``. Otherwise the type guessed by ``mimetypes`` is
    used, falling back to ``"application/octet-stream"`` when there is no
    guess.
    """
    guessed_type, _encoding = mimetypes.guess_type(filename)
    last_segment = filename.lower().rsplit(".", 1)[-1]
    if last_segment == "pdf":
        return "text/plain"
    return guessed_type if guessed_type is not None else "application/octet-stream"
427
+
428
+ UNSTRACT_BASE = "https://llmwhisperer-api.us-central.unstract.com/api/v2"
429
+ UNSTRACT_API_KEY = os.getenv("UNSTRACT_API_KEY")
430
+
431
  def extract_text_from_unstract(uploaded_file):
432
  filename = getattr(uploaded_file, "name", "uploaded_file")
433
  file_bytes = uploaded_file.read()
434
+ content_type = get_content_type(filename)
 
 
435
  headers = {
436
+ "unstract-key": UNSTRACT_API_KEY,
437
  "Content-Type": content_type,
438
  }
439
+ url = f"{UNSTRACT_BASE}/whisper"
440
  with st.spinner("Uploading and processing document with EZOFIS AI OCR AGENT..."):
441
  r = requests.post(url, headers=headers, data=file_bytes)
442
  if r.status_code != 202:
 
446
  if not whisper_hash:
447
  st.error("Unstract: No whisper_hash received.")
448
  return None
449
+
450
+ status_url = f"{UNSTRACT_BASE}/whisper-status?whisper_hash={whisper_hash}"
451
  status_placeholder = st.empty()
452
  for i in range(30):
453
+ status_r = requests.get(status_url, headers={"unstract-key": UNSTRACT_API_KEY})
454
  if status_r.status_code != 200:
455
  st.error(f"Unstract: Error checking status: {status_r.status_code} - {status_r.text}")
456
  return None
 
463
  else:
464
  status_placeholder.error("Unstract: Timeout waiting for OCR to finish.")
465
  return None
466
+
467
+ retrieve_url = f"{UNSTRACT_BASE}/whisper-retrieve?whisper_hash={whisper_hash}&text_only=true"
468
+ r = requests.get(retrieve_url, headers={"unstract-key": UNSTRACT_API_KEY})
469
  if r.status_code != 200:
470
  st.error(f"Unstract: Error retrieving extracted text: {r.status_code} - {r.text}")
471
  return None
 
475
  except Exception:
476
  return r.text
477
 
478
# --- Main page
st.title("Invoice/Document Extractor")
mdl = st.selectbox("Model for Extraction", list(MODELS.keys()), key="extract_model")
inv_file = st.file_uploader(
    "Step 2: Upload Invoice or Document File",
    type=["pdf", "docx", "xlsx", "xls", "png", "jpg", "jpeg", "tiff"]
)

if st.button("Extract"):
    if not inv_file:
        # Tell the user why nothing happened instead of silently ignoring
        # the click.
        st.warning("Please upload an invoice or document before extracting.")
    else:
        with st.spinner("Extracting text from document using Unstract..."):
            text = extract_text_from_unstract(inv_file)
        if text:
            # extract_invoice_info already fills in total_due, so the header
            # is displayed as returned.
            extracted_info = extract_invoice_info(mdl, text)
            if extracted_info:
                st.success("Extraction Complete")
                st.subheader("Invoice Metadata")
                st.table([{k.replace("_", " ").title(): v for k, v in extracted_info["invoice_header"].items()}])
                st.subheader("Line Items")
                st.table(extracted_info["line_items"])
                # Persist across reruns so the decision step can read it.
                st.session_state['last_extracted_info'] = extracted_info

# Reload the latest results from the session on every rerun.
extracted_info = st.session_state.get('last_extracted_info', None)
po_df = st.session_state.get('last_po_df', None)
503
+
504
def _json_fallback(value):
    """Convert a value ``json.dumps`` cannot encode natively.

    Objects exposing a callable ``item()`` (e.g. numpy scalars) are unwrapped
    through it; anything else is rendered with ``str``.
    """
    item = getattr(value, "item", None)
    if callable(item):
        try:
            return item()
        except (TypeError, ValueError):
            pass
    return str(value)


def po_match_tool_func(input_text):
    """Agent tool: match the last extracted invoice against the uploaded POs.

    Reads ``last_extracted_info`` and ``last_po_df`` from
    ``st.session_state``, scores them with ``find_best_po_match`` and maps
    the best score onto a decision using the module-level
    ``approved_threshold`` / ``partial_threshold`` values (defined outside
    this block).

    Args:
        input_text: Text passed in by the agent framework; not used, since
            all inputs come from the session state.

    Returns:
        A JSON string with the keys ``decision`` (``"APPROVED"``,
        ``"PARTIALLY APPROVED"`` or ``"REJECTED"``), ``reason``, ``debug``
        and ``po_row``. ``po_row`` is ``null`` when no row matched or when
        the invoice / PO data is missing.
    """
    invoice = st.session_state.get("last_extracted_info")
    po_df = st.session_state.get("last_po_df")
    if invoice is None or po_df is None:
        # Same key set as the normal path, so callers can always read
        # "po_row" without special-casing the rejection.
        return json.dumps({
            "decision": "REJECTED",
            "reason": "Invoice or PO data not found.",
            "debug": {},
            "po_row": None,
        })

    best_row, best_score, reason, debug = find_best_po_match(invoice, po_df)
    if best_score > approved_threshold:
        status = "APPROVED"
    elif best_score > partial_threshold:
        status = "PARTIALLY APPROVED"
    else:
        status = "REJECTED"

    payload = {
        "decision": status,
        "reason": f"Best match score: {int(best_score)}/100. {reason}",
        "debug": debug,
        # best_row is presumably a pandas Series taken from po_df -- verify
        # against find_best_po_match.
        "po_row": best_row.to_dict() if best_row is not None else None,
    }
    # The PO row and debug payload may hold values json cannot encode
    # natively; fall back instead of raising TypeError inside the agent
    # tool call.
    return json.dumps(payload, default=_json_fallback)
526
+
527
# Write the PO table back to session state (a no-op when po_df was just read
# from the same key).
if po_df is not None:
    st.session_state["last_po_df"] = po_df

# Decision step: only offered once both an extracted invoice and a PO table
# are available.
if extracted_info is not None and po_df is not None:
    st.markdown("---")
    st.subheader("EZOFIS AP AGENT Decision (OpenAI Only)")
    if st.button("Make a decision (EZOFIS AP AGENT)"):
        tools = [
            Tool(
                name="po_match_tool",
                func=po_match_tool_func,
                description="Smartly match invoice to PO using all possible fields.",
            )
        ]
        decision_llm = ChatOpenAI(
            openai_api_key=get_api_key("OpenAI GPT-4.1"),
            model=MODELS["OpenAI GPT-4.1"]["model"],
            temperature=0,
            streaming=False,
        )
        agent = initialize_agent(
            tools,
            decision_llm,
            agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
            verbose=True,
        )
        prompt = (
            "You are an expert accounts payable agent. "
            "Use po_match_tool to check for the best possible match using supplier, PO number (which may appear anywhere in the invoice JSON, even within other fields), currency, line items, and total value. "
            "Weigh the importance of each field as an expert would, according to the user-configured weights. "
            "Return a JSON with decision (APPROVED, PARTIALLY APPROVED, REJECTED), reason (include field scores and reasoning), debug, and the best matched PO row.\n"
            f"Invoice JSON:\n{json.dumps(extracted_info, indent=2)}"
        )
        with st.spinner("AI is reasoning and making a decision..."):
            result = agent.run(prompt)

        # Guard only the parsing step: the agent may answer in free text
        # instead of JSON. Rendering errors are no longer swallowed by a
        # blanket `except Exception`.
        try:
            result_json = json.loads(result)
        except (TypeError, ValueError):
            result_json = None

        if isinstance(result_json, dict):
            st.write(f"**Decision:** {result_json.get('decision', 'N/A')}")
            st.write(f"**Reason:** {result_json.get('reason', 'N/A')}")
            with st.expander("Debug & Matching Details"):
                st.json(result_json.get('debug'))
            st.subheader("Extracted Invoice JSON")
            st.json(extracted_info)
            st.subheader("Matched PO Row")
            st.json(result_json.get('po_row'))
        else:
            # Not a JSON object (plain text, or a JSON list/scalar): show
            # the agent's answer verbatim.
            st.subheader("AI Decision & Reason")
            st.write(result)
575
+
576
# Debug panel: shows the values stored under "last_api" / "last_raw" in the
# session state.
if "last_api" in st.session_state:
    with st.expander("Debug"):
        st.code(st.session_state.last_api)
        # Only "last_api" is checked above; guard "last_raw" separately so a
        # missing key does not raise when the panel is rendered.
        if "last_raw" in st.session_state:
            st.code(st.session_state.last_raw)