Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -5,422 +5,83 @@ import re
|
|
| 5 |
import os
|
| 6 |
import time
|
| 7 |
import mimetypes
|
| 8 |
-
import pandas as pd
|
| 9 |
-
from langchain_community.chat_models import ChatOpenAI
|
| 10 |
-
from langchain.agents import initialize_agent, Tool, AgentType
|
| 11 |
from fuzzywuzzy import fuzz
|
|
|
|
| 12 |
|
| 13 |
-
#
|
| 14 |
-
st.set_page_config(page_title="EZOFIS
|
| 15 |
-
|
| 16 |
-
# --- Styles for SaaS Feel ---
|
| 17 |
st.markdown("""
|
| 18 |
<style>
|
| 19 |
.block-card {
|
| 20 |
-
background: #fff;
|
| 21 |
-
|
| 22 |
-
box-shadow: 0 2px 16px rgba(25,39,64,0.05);
|
| 23 |
-
padding: 32px 26px 24px 26px;
|
| 24 |
-
margin-bottom: 24px;
|
| 25 |
-
}
|
| 26 |
-
.step-num {
|
| 27 |
-
background: #A020F0;
|
| 28 |
-
color: #fff;
|
| 29 |
-
border-radius: 999px;
|
| 30 |
-
padding: 6px 13px;
|
| 31 |
-
font-weight: 700;
|
| 32 |
-
margin-right: 14px;
|
| 33 |
-
font-size: 20px;
|
| 34 |
-
display: inline-block;
|
| 35 |
-
vertical-align: middle;
|
| 36 |
}
|
|
|
|
|
|
|
|
|
|
| 37 |
.stButton>button {
|
| 38 |
-
background: #A020F0 !important;
|
| 39 |
-
|
| 40 |
-
border-radius: 12px !important;
|
| 41 |
-
padding: 10px 32px !important;
|
| 42 |
-
font-weight: 700;
|
| 43 |
-
border: none !important;
|
| 44 |
-
font-size: 18px !important;
|
| 45 |
margin-top: 12px !important;
|
| 46 |
}
|
| 47 |
-
.stSlider>div>div>div>div {
|
| 48 |
-
background: #F3F6FB !important;
|
| 49 |
-
border-radius: 999px;
|
| 50 |
-
}
|
| 51 |
-
.css-12w0qpk {padding-top: 0rem;}
|
| 52 |
-
.css-1kyxreq {padding-top: 0rem;}
|
| 53 |
</style>
|
| 54 |
""", unsafe_allow_html=True)
|
| 55 |
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
"model": "gpt-4-1106-preview",
|
| 60 |
-
"key_env": "OPENAI_API_KEY",
|
| 61 |
-
"response_format": None,
|
| 62 |
-
"extra_headers": {},
|
| 63 |
-
},
|
| 64 |
-
}
|
| 65 |
-
|
| 66 |
-
def get_api_key(model_choice):
|
| 67 |
-
key = os.getenv(MODELS[model_choice]["key_env"])
|
| 68 |
-
if not key:
|
| 69 |
-
st.error(f"❌ {MODELS[model_choice]['key_env']} not set")
|
| 70 |
-
st.stop()
|
| 71 |
-
return key
|
| 72 |
-
|
| 73 |
-
def query_llm(model_choice, prompt):
|
| 74 |
-
cfg = MODELS[model_choice]
|
| 75 |
-
headers = {
|
| 76 |
-
"Authorization": f"Bearer {get_api_key(model_choice)}",
|
| 77 |
-
"Content-Type": "application/json",
|
| 78 |
-
}
|
| 79 |
-
if cfg.get("extra_headers"):
|
| 80 |
-
headers.update(cfg["extra_headers"])
|
| 81 |
-
payload = {
|
| 82 |
-
"model": cfg["model"],
|
| 83 |
-
"messages": [{"role": "user", "content": prompt}],
|
| 84 |
-
"temperature": 0.1,
|
| 85 |
-
"max_tokens": 2000,
|
| 86 |
-
}
|
| 87 |
-
if cfg.get("response_format"):
|
| 88 |
-
payload["response_format"] = cfg["response_format"]
|
| 89 |
-
try:
|
| 90 |
-
with st.spinner(f"🔍 Fine Tuning The Extracted Data..."):
|
| 91 |
-
r = requests.post(cfg["api_url"], headers=headers, json=payload, timeout=90)
|
| 92 |
-
if r.status_code != 200:
|
| 93 |
-
st.error(f"🚨 API Error {r.status_code}: {r.text}")
|
| 94 |
-
return None
|
| 95 |
-
content = r.json()["choices"][0]["message"]["content"]
|
| 96 |
-
st.session_state.last_api = content
|
| 97 |
-
st.session_state.last_raw = r.text
|
| 98 |
-
return content
|
| 99 |
-
except Exception as e:
|
| 100 |
-
st.error(f"Connection error: {e}")
|
| 101 |
-
return None
|
| 102 |
-
|
| 103 |
-
def clean_json_response(text):
|
| 104 |
-
if not text:
|
| 105 |
-
return None
|
| 106 |
-
orig = text
|
| 107 |
-
text = re.sub(r'```(?:json)?', '', text).strip()
|
| 108 |
-
start, end = text.find('{'), text.rfind('}') + 1
|
| 109 |
-
if start < 0 or end < 1:
|
| 110 |
-
st.error("Couldn't locate JSON in response.")
|
| 111 |
-
st.code(orig)
|
| 112 |
-
return None
|
| 113 |
-
frag = text[start:end]
|
| 114 |
-
frag = re.sub(r',\s*([}\]])', r'\1', frag)
|
| 115 |
-
try:
|
| 116 |
-
return json.loads(frag)
|
| 117 |
-
except json.JSONDecodeError as e:
|
| 118 |
-
repaired = re.sub(r'"\s*"\s*(?="[^"]+"\s*:)', '","', frag)
|
| 119 |
-
try:
|
| 120 |
-
return json.loads(repaired)
|
| 121 |
-
except json.JSONDecodeError:
|
| 122 |
-
st.error(f"JSON parse error: {e}")
|
| 123 |
-
st.code(frag)
|
| 124 |
-
return None
|
| 125 |
-
|
| 126 |
-
def fallback_supplier(text):
|
| 127 |
-
for line in text.splitlines():
|
| 128 |
-
line = line.strip()
|
| 129 |
-
if line:
|
| 130 |
-
return line
|
| 131 |
-
return None
|
| 132 |
-
|
| 133 |
-
def get_extraction_prompt(model_choice, txt):
|
| 134 |
-
return (
|
| 135 |
-
"You are an expert invoice parser. "
|
| 136 |
-
"Extract data according to the visible table structure and column headers in the invoice. "
|
| 137 |
-
"For every line item, only extract fields that correspond to the table columns for that row (do not include header/shipment fields in line items). "
|
| 138 |
-
"Merge all multi-line content within a single cell into that field (especially for the 'description' and 'notes'). "
|
| 139 |
-
"Shipment/invoice-level fields such as CAR NUMBER, SHIPPING POINT, SHIPMENT NUMBER, CURRENCY, etc., must go ONLY into the 'invoice_header', not as line item fields.\n"
|
| 140 |
-
"Use this schema:\n"
|
| 141 |
-
'{\n'
|
| 142 |
-
' "invoice_header": {\n'
|
| 143 |
-
' "car_number": "string or null",\n'
|
| 144 |
-
' "shipment_number": "string or null",\n'
|
| 145 |
-
' "shipping_point": "string or null",\n'
|
| 146 |
-
' "currency": "string or null",\n'
|
| 147 |
-
' "invoice_number": "string or null",\n'
|
| 148 |
-
' "invoice_date": "string or null",\n'
|
| 149 |
-
' "order_number": "string or null",\n'
|
| 150 |
-
' "customer_order_number": "string or null",\n'
|
| 151 |
-
' "our_order_number": "string or null",\n'
|
| 152 |
-
' "sales_order_number": "string or null",\n'
|
| 153 |
-
' "purchase_order_number": "string or null",\n'
|
| 154 |
-
' "order_date": "string or null",\n'
|
| 155 |
-
' "supplier_name": "string or null",\n'
|
| 156 |
-
' "supplier_address": "string or null",\n'
|
| 157 |
-
' "supplier_phone": "string or null",\n'
|
| 158 |
-
' "supplier_email": "string or null",\n'
|
| 159 |
-
' "supplier_tax_id": "string or null",\n'
|
| 160 |
-
' "customer_name": "string or null",\n'
|
| 161 |
-
' "customer_address": "string or null",\n'
|
| 162 |
-
' "customer_phone": "string or null",\n'
|
| 163 |
-
' "customer_email": "string or null",\n'
|
| 164 |
-
' "customer_tax_id": "string or null",\n'
|
| 165 |
-
' "ship_to_name": "string or null",\n'
|
| 166 |
-
' "ship_to_address": "string or null",\n'
|
| 167 |
-
' "bill_to_name": "string or null",\n'
|
| 168 |
-
' "bill_to_address": "string or null",\n'
|
| 169 |
-
' "remit_to_name": "string or null",\n'
|
| 170 |
-
' "remit_to_address": "string or null",\n'
|
| 171 |
-
' "tax_id": "string or null",\n'
|
| 172 |
-
' "tax_registration_number": "string or null",\n'
|
| 173 |
-
' "vat_number": "string or null",\n'
|
| 174 |
-
' "payment_terms": "string or null",\n'
|
| 175 |
-
' "payment_method": "string or null",\n'
|
| 176 |
-
' "payment_reference": "string or null",\n'
|
| 177 |
-
' "bank_account_number": "string or null",\n'
|
| 178 |
-
' "iban": "string or null",\n'
|
| 179 |
-
' "swift_code": "string or null",\n'
|
| 180 |
-
' "total_before_tax": "string or null",\n'
|
| 181 |
-
' "tax_amount": "string or null",\n'
|
| 182 |
-
' "tax_rate": "string or null",\n'
|
| 183 |
-
' "shipping_charges": "string or null",\n'
|
| 184 |
-
' "discount": "string or null",\n'
|
| 185 |
-
' "total_due": "string or null",\n'
|
| 186 |
-
' "amount_paid": "string or null",\n'
|
| 187 |
-
' "balance_due": "string or null",\n'
|
| 188 |
-
' "due_date": "string or null",\n'
|
| 189 |
-
' "invoice_status": "string or null",\n'
|
| 190 |
-
' "reference_number": "string or null",\n'
|
| 191 |
-
' "project_code": "string or null",\n'
|
| 192 |
-
' "department": "string or null",\n'
|
| 193 |
-
' "contact_person": "string or null",\n'
|
| 194 |
-
' "notes": "string or null",\n'
|
| 195 |
-
' "additional_info": "string or null"\n'
|
| 196 |
-
' },\n'
|
| 197 |
-
' "line_items": [\n'
|
| 198 |
-
' {\n'
|
| 199 |
-
' "quantity": "string or null",\n'
|
| 200 |
-
' "units": "string or null",\n'
|
| 201 |
-
' "description": "string or null",\n'
|
| 202 |
-
' "footage": "string or null",\n'
|
| 203 |
-
' "price": "string or null",\n'
|
| 204 |
-
' "amount": "string or null",\n'
|
| 205 |
-
' "notes": "string or null"\n'
|
| 206 |
-
' }\n'
|
| 207 |
-
' ]\n'
|
| 208 |
-
'}'
|
| 209 |
-
"\nIf a field is missing for a line item or header, use null. "
|
| 210 |
-
"Do not invent fields. Do not add any header or shipment data to any line item. Return ONLY the JSON object, no explanation.\n"
|
| 211 |
-
"\nInvoice Text:\n"
|
| 212 |
-
f"{txt}"
|
| 213 |
-
)
|
| 214 |
-
|
| 215 |
-
def ensure_total_due(invoice_header):
|
| 216 |
-
if invoice_header.get("total_due") in [None, ""]:
|
| 217 |
-
for field in ["invoice_total", "invoice_value", "total_before_tax", "balance_due", "amount_paid"]:
|
| 218 |
-
if field in invoice_header and invoice_header[field]:
|
| 219 |
-
invoice_header["total_due"] = invoice_header[field]
|
| 220 |
-
break
|
| 221 |
-
return invoice_header
|
| 222 |
-
|
| 223 |
-
def clean_num(val):
|
| 224 |
-
if val is None:
|
| 225 |
-
return None
|
| 226 |
-
if isinstance(val, (int, float)):
|
| 227 |
-
return float(val)
|
| 228 |
-
matches = re.findall(r"[-+]?\d[\d,]*\.?\d*", str(val))
|
| 229 |
-
if matches:
|
| 230 |
-
cleaned = [m.replace(',', '') for m in matches if m]
|
| 231 |
-
as_floats = [float(c) for c in cleaned if c.replace('.', '', 1).isdigit()]
|
| 232 |
-
if as_floats:
|
| 233 |
-
return max(as_floats)
|
| 234 |
-
return None
|
| 235 |
-
|
| 236 |
-
def weighted_fuzzy_score(s1, s2):
|
| 237 |
-
if not s1 and not s2:
|
| 238 |
-
return 100
|
| 239 |
-
return fuzz.token_set_ratio(str(s1).lower(), str(s2).lower())
|
| 240 |
-
|
| 241 |
-
def find_po_number_in_json(po_number, invoice_json):
|
| 242 |
-
def _flatten(obj):
|
| 243 |
-
fields = []
|
| 244 |
-
if isinstance(obj, dict):
|
| 245 |
-
for v in obj.values():
|
| 246 |
-
fields.extend(_flatten(v))
|
| 247 |
-
elif isinstance(obj, list):
|
| 248 |
-
for item in obj:
|
| 249 |
-
fields.extend(_flatten(item))
|
| 250 |
-
elif obj is not None:
|
| 251 |
-
fields.append(str(obj))
|
| 252 |
-
return fields
|
| 253 |
-
|
| 254 |
-
po_str = str(po_number).strip().replace(" ", "").replace(".0", "")
|
| 255 |
-
try:
|
| 256 |
-
po_int = str(int(float(po_number)))
|
| 257 |
-
except:
|
| 258 |
-
po_int = po_str
|
| 259 |
-
|
| 260 |
-
all_strs = [str(s).strip().replace(" ", "").replace(".0", "") for s in _flatten(invoice_json)]
|
| 261 |
-
for s in all_strs:
|
| 262 |
-
if not s:
|
| 263 |
-
continue
|
| 264 |
-
if po_str and (po_str in s or s in po_str):
|
| 265 |
-
return True
|
| 266 |
-
if po_int and (po_int in s or s in po_int):
|
| 267 |
-
return True
|
| 268 |
-
return False
|
| 269 |
-
|
| 270 |
-
def find_best_po_match(inv, po_df, weight_supplier, weight_po_number, weight_currency, weight_total_due, weight_line_item):
|
| 271 |
-
inv_hdr = inv["invoice_header"]
|
| 272 |
-
inv_supplier = inv_hdr.get("supplier_name") or ""
|
| 273 |
-
inv_po_number = inv_hdr.get("purchase_order_number") or inv_hdr.get("po_number") or inv_hdr.get("order_number") or ""
|
| 274 |
-
inv_currency = inv_hdr.get("currency") or ""
|
| 275 |
-
inv_total_due = clean_num(inv_hdr.get("total_due"))
|
| 276 |
-
inv_line_items = inv.get("line_items", [])
|
| 277 |
-
|
| 278 |
-
scores = []
|
| 279 |
-
for idx, row in po_df.iterrows():
|
| 280 |
-
po_supplier = row.get("Supplier Name", "")
|
| 281 |
-
po_po_number = str(row.get("PO Number", ""))
|
| 282 |
-
po_currency = row.get("Currency", "")
|
| 283 |
-
po_total = clean_num(row.get("PO Total Value", ""))
|
| 284 |
-
po_desc = row.get("Item Description", "")
|
| 285 |
-
po_qty = str(row.get("Item Quantity", ""))
|
| 286 |
-
po_unit = str(row.get("Item Unit Price", ""))
|
| 287 |
-
po_line_total = clean_num(row.get("Line Item Total", ""))
|
| 288 |
-
|
| 289 |
-
field_details = []
|
| 290 |
-
|
| 291 |
-
s_supplier = weighted_fuzzy_score(inv_supplier, po_supplier)
|
| 292 |
-
field_details.append({
|
| 293 |
-
"field": "Supplier Name",
|
| 294 |
-
"invoice": inv_supplier,
|
| 295 |
-
"po": po_supplier,
|
| 296 |
-
"score": s_supplier
|
| 297 |
-
})
|
| 298 |
-
|
| 299 |
-
s_po_number = 100 if find_po_number_in_json(po_po_number, inv) else 0
|
| 300 |
-
field_details.append({
|
| 301 |
-
"field": "PO Number (anywhere in JSON)",
|
| 302 |
-
"invoice": "found" if s_po_number else "not found",
|
| 303 |
-
"po": po_po_number,
|
| 304 |
-
"score": s_po_number
|
| 305 |
-
})
|
| 306 |
-
|
| 307 |
-
s_currency = weighted_fuzzy_score(inv_currency, po_currency)
|
| 308 |
-
field_details.append({
|
| 309 |
-
"field": "Currency",
|
| 310 |
-
"invoice": inv_currency,
|
| 311 |
-
"po": po_currency,
|
| 312 |
-
"score": s_currency
|
| 313 |
-
})
|
| 314 |
-
|
| 315 |
-
s_total = 100 if inv_total_due is not None and po_total is not None and abs(inv_total_due - po_total) < 2 else 0
|
| 316 |
-
field_details.append({
|
| 317 |
-
"field": "Total Due",
|
| 318 |
-
"invoice": inv_total_due,
|
| 319 |
-
"po": po_total,
|
| 320 |
-
"score": s_total
|
| 321 |
-
})
|
| 322 |
-
|
| 323 |
-
# Line item logic as before
|
| 324 |
-
line_item_score = 0
|
| 325 |
-
line_reason = ""
|
| 326 |
-
best_line_detail = None
|
| 327 |
-
for line in inv_line_items:
|
| 328 |
-
desc_score = weighted_fuzzy_score(line.get("description", ""), po_desc)
|
| 329 |
-
qty_score = 100 if clean_num(line.get("quantity")) == clean_num(po_qty) else 0
|
| 330 |
-
unit_score = 100 if clean_num(line.get("price")) == clean_num(po_unit) else 0
|
| 331 |
-
amount_score = 100 if clean_num(line.get("amount")) == po_line_total else 0
|
| 332 |
-
total = desc_score * 0.5 + qty_score * 0.2 + unit_score * 0.15 + amount_score * 0.15
|
| 333 |
-
detail = {
|
| 334 |
-
"field": "Line Item",
|
| 335 |
-
"invoice": {
|
| 336 |
-
"description": line.get("description", ""),
|
| 337 |
-
"quantity": line.get("quantity", ""),
|
| 338 |
-
"price": line.get("price", ""),
|
| 339 |
-
"amount": line.get("amount", ""),
|
| 340 |
-
},
|
| 341 |
-
"po": {
|
| 342 |
-
"description": po_desc,
|
| 343 |
-
"quantity": po_qty,
|
| 344 |
-
"price": po_unit,
|
| 345 |
-
"amount": po_line_total,
|
| 346 |
-
},
|
| 347 |
-
"desc_score": desc_score,
|
| 348 |
-
"qty_score": qty_score,
|
| 349 |
-
"unit_score": unit_score,
|
| 350 |
-
"amount_score": amount_score,
|
| 351 |
-
"line_item_score": total
|
| 352 |
-
}
|
| 353 |
-
if total > line_item_score:
|
| 354 |
-
line_item_score = total
|
| 355 |
-
best_line_detail = detail
|
| 356 |
-
line_reason = (
|
| 357 |
-
f"Best line item: desc_score={desc_score}, qty_score={qty_score}, "
|
| 358 |
-
f"unit_score={unit_score}, amount_score={amount_score}"
|
| 359 |
-
)
|
| 360 |
-
|
| 361 |
-
wsum = weight_supplier + weight_po_number + weight_currency + weight_total_due + weight_line_item
|
| 362 |
-
total_score = (
|
| 363 |
-
s_supplier * weight_supplier/100 +
|
| 364 |
-
s_po_number * weight_po_number/100 +
|
| 365 |
-
s_currency * weight_currency/100 +
|
| 366 |
-
s_total * weight_total_due/100 +
|
| 367 |
-
line_item_score * weight_line_item/100
|
| 368 |
-
) if wsum == 100 else 0
|
| 369 |
-
|
| 370 |
-
reason = (
|
| 371 |
-
f"Supplier match: {s_supplier}/100 (invoice: '{inv_supplier}' vs PO: '{po_supplier}'), "
|
| 372 |
-
f"PO Number: {s_po_number}/100 ({'found anywhere in JSON' if s_po_number else 'not found'}), "
|
| 373 |
-
f"Currency: {s_currency}/100 (invoice: '{inv_currency}' vs PO: '{po_currency}'), "
|
| 374 |
-
f"Total Due: {'match' if s_total else 'no match'} (invoice: {inv_total_due} vs PO: {po_total}), "
|
| 375 |
-
f"Line item best match: {int(line_item_score)}/100. {line_reason}"
|
| 376 |
-
)
|
| 377 |
|
| 378 |
-
|
| 379 |
-
|
| 380 |
-
|
| 381 |
-
"po_po_number": po_po_number,
|
| 382 |
-
"po_total": po_total,
|
| 383 |
-
"scores": field_details,
|
| 384 |
-
"line_item_score": line_item_score,
|
| 385 |
-
"best_line_detail": best_line_detail,
|
| 386 |
-
"total_score": total_score,
|
| 387 |
-
"line_reason": line_reason,
|
| 388 |
-
"inv_total_due": inv_total_due
|
| 389 |
-
}
|
| 390 |
-
scores.append((row, total_score, reason, debug))
|
| 391 |
|
| 392 |
-
|
| 393 |
-
|
| 394 |
-
|
| 395 |
-
|
| 396 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 397 |
|
| 398 |
-
|
| 399 |
-
|
| 400 |
-
|
| 401 |
-
|
| 402 |
-
|
| 403 |
-
|
| 404 |
-
|
| 405 |
-
|
| 406 |
-
|
| 407 |
-
|
| 408 |
-
|
| 409 |
-
|
| 410 |
-
|
| 411 |
-
|
| 412 |
-
|
| 413 |
-
|
| 414 |
-
|
| 415 |
-
|
| 416 |
-
|
| 417 |
-
|
| 418 |
-
|
| 419 |
-
|
| 420 |
-
|
| 421 |
-
|
| 422 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 423 |
|
|
|
|
| 424 |
def get_content_type(filename):
|
| 425 |
mime, _ = mimetypes.guess_type(filename)
|
| 426 |
ext = filename.lower().split('.')[-1]
|
|
@@ -430,9 +91,6 @@ def get_content_type(filename):
|
|
| 430 |
return "application/octet-stream"
|
| 431 |
return mime
|
| 432 |
|
| 433 |
-
UNSTRACT_BASE = "https://llmwhisperer-api.us-central.unstract.com/api/v2"
|
| 434 |
-
UNSTRACT_API_KEY = os.getenv("UNSTRACT_API_KEY")
|
| 435 |
-
|
| 436 |
def extract_text_from_unstract(uploaded_file):
|
| 437 |
filename = getattr(uploaded_file, "name", "uploaded_file")
|
| 438 |
file_bytes = uploaded_file.read()
|
|
@@ -442,37 +100,35 @@ def extract_text_from_unstract(uploaded_file):
|
|
| 442 |
"Content-Type": content_type,
|
| 443 |
}
|
| 444 |
url = f"{UNSTRACT_BASE}/whisper"
|
| 445 |
-
with st.spinner("Uploading and
|
| 446 |
r = requests.post(url, headers=headers, data=file_bytes)
|
| 447 |
if r.status_code != 202:
|
| 448 |
-
st.error(f"Unstract
|
| 449 |
return None
|
| 450 |
whisper_hash = r.json().get("whisper_hash")
|
| 451 |
if not whisper_hash:
|
| 452 |
st.error("Unstract: No whisper_hash received.")
|
| 453 |
return None
|
| 454 |
|
|
|
|
| 455 |
status_url = f"{UNSTRACT_BASE}/whisper-status?whisper_hash={whisper_hash}"
|
| 456 |
-
status_placeholder = st.empty()
|
| 457 |
for i in range(30):
|
| 458 |
status_r = requests.get(status_url, headers={"unstract-key": UNSTRACT_API_KEY})
|
| 459 |
if status_r.status_code != 200:
|
| 460 |
-
st.error(f"Unstract
|
| 461 |
return None
|
| 462 |
status = status_r.json().get("status")
|
| 463 |
if status == "processed":
|
| 464 |
-
status_placeholder.info("EZOFIS AI OCR AGENT STATUS: processed! 🎉")
|
| 465 |
break
|
| 466 |
-
status_placeholder.info(f"EZOFIS AI OCR AGENT STATUS: {status or 'waiting'}... ({i+1})")
|
| 467 |
time.sleep(2)
|
| 468 |
else:
|
| 469 |
-
|
| 470 |
return None
|
| 471 |
|
| 472 |
retrieve_url = f"{UNSTRACT_BASE}/whisper-retrieve?whisper_hash={whisper_hash}&text_only=true"
|
| 473 |
r = requests.get(retrieve_url, headers={"unstract-key": UNSTRACT_API_KEY})
|
| 474 |
if r.status_code != 200:
|
| 475 |
-
st.error(f"Unstract: Error retrieving
|
| 476 |
return None
|
| 477 |
try:
|
| 478 |
data = r.json()
|
|
@@ -480,158 +136,124 @@ def extract_text_from_unstract(uploaded_file):
|
|
| 480 |
except Exception:
|
| 481 |
return r.text
|
| 482 |
|
| 483 |
-
|
| 484 |
-
|
| 485 |
-
|
| 486 |
-
|
| 487 |
-
|
| 488 |
-
|
| 489 |
-
|
| 490 |
-
|
| 491 |
-
|
| 492 |
-
|
| 493 |
-
|
| 494 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 495 |
|
| 496 |
-
|
| 497 |
-
|
| 498 |
-
|
| 499 |
-
|
| 500 |
-
"
|
| 501 |
-
|
| 502 |
-
|
| 503 |
-
|
| 504 |
-
|
| 505 |
-
|
| 506 |
-
|
| 507 |
-
|
| 508 |
-
|
| 509 |
-
|
| 510 |
-
|
| 511 |
-
|
| 512 |
-
|
| 513 |
-
|
| 514 |
-
|
| 515 |
-
|
| 516 |
-
|
| 517 |
-
|
| 518 |
-
|
| 519 |
-
|
| 520 |
-
|
| 521 |
-
|
| 522 |
-
|
| 523 |
-
|
| 524 |
-
st.
|
| 525 |
-
|
| 526 |
-
|
| 527 |
-
approved_threshold = st.slider("Threshold for 'APPROVED'", min_value=0, max_value=100, value=85, format="%d")
|
| 528 |
-
partial_threshold = st.slider("Threshold for 'PARTIALLY APPROVED'", min_value=0, max_value=approved_threshold-1, value=70, format="%d")
|
| 529 |
-
|
| 530 |
-
# ---- Step 4: Upload Invoice (col2) ----
|
| 531 |
-
with col2:
|
| 532 |
-
st.markdown("<span class='step-num'>4</span> <b>Upload Invoice/Document</b>", unsafe_allow_html=True)
|
| 533 |
-
inv_file = st.file_uploader(
|
| 534 |
-
"Upload PDF, DOCX, XLSX, PNG, JPG, TIFF",
|
| 535 |
-
type=["pdf", "docx", "xlsx", "xls", "png", "jpg", "jpeg", "tiff"],
|
| 536 |
-
key="invoice_file",
|
| 537 |
-
label_visibility="collapsed"
|
| 538 |
-
)
|
| 539 |
-
|
| 540 |
-
# ---- Step 5: Extract Data (col2) ----
|
| 541 |
-
with col2:
|
| 542 |
-
st.markdown("<span class='step-num'>5</span> <b>Extract Data</b>", unsafe_allow_html=True)
|
| 543 |
-
if st.button("Extract"):
|
| 544 |
-
if inv_file:
|
| 545 |
-
with st.spinner("Extracting text from document..."):
|
| 546 |
-
text = extract_text_from_unstract(inv_file)
|
| 547 |
-
if text:
|
| 548 |
-
mdl = "OpenAI GPT-4.1"
|
| 549 |
-
extracted_info = extract_invoice_info(mdl, text)
|
| 550 |
-
if extracted_info:
|
| 551 |
-
if "invoice_header" in extracted_info:
|
| 552 |
-
extracted_info["invoice_header"] = ensure_total_due(extracted_info["invoice_header"])
|
| 553 |
-
st.success("Extraction Complete")
|
| 554 |
-
st.session_state['last_extracted_info'] = extracted_info
|
| 555 |
-
else:
|
| 556 |
-
st.warning("Please upload an invoice/document first.")
|
| 557 |
|
| 558 |
-
#
|
| 559 |
-
|
| 560 |
-
|
| 561 |
-
|
| 562 |
-
|
| 563 |
-
|
| 564 |
-
|
| 565 |
-
|
| 566 |
-
|
| 567 |
-
|
| 568 |
-
|
| 569 |
-
|
| 570 |
-
|
| 571 |
-
|
| 572 |
-
|
| 573 |
-
|
| 574 |
-
|
| 575 |
-
|
| 576 |
-
|
| 577 |
-
|
| 578 |
-
|
| 579 |
-
|
| 580 |
-
|
| 581 |
-
|
| 582 |
-
|
| 583 |
-
|
| 584 |
-
|
| 585 |
-
|
| 586 |
-
|
| 587 |
-
|
| 588 |
-
|
| 589 |
-
|
| 590 |
-
|
| 591 |
-
|
| 592 |
-
|
| 593 |
-
|
| 594 |
-
|
| 595 |
-
|
| 596 |
-
|
| 597 |
-
|
| 598 |
-
|
| 599 |
-
|
| 600 |
-
|
| 601 |
-
)
|
| 602 |
-
|
| 603 |
-
|
| 604 |
-
|
| 605 |
-
|
| 606 |
-
|
| 607 |
-
|
| 608 |
-
|
| 609 |
-
|
| 610 |
-
|
| 611 |
-
"Weigh the importance of each field as an expert would, according to the user-configured weights. "
|
| 612 |
-
"Return a JSON with decision (APPROVED, PARTIALLY APPROVED, REJECTED), reason (include field scores and reasoning), debug, and the best matched PO row.\n"
|
| 613 |
-
f"Invoice JSON:\n{json.dumps(extracted_info, indent=2)}"
|
| 614 |
-
)
|
| 615 |
-
with st.spinner("AI is reasoning and making a decision..."):
|
| 616 |
-
result = agent.run(prompt)
|
| 617 |
-
# Always display debug/info
|
| 618 |
-
st.markdown("<h3 style='margin-top:18px;'>AI Decision & Reason</h3>", unsafe_allow_html=True)
|
| 619 |
-
try:
|
| 620 |
-
result_json = json.loads(result)
|
| 621 |
-
st.write(f"**Decision:** {result_json.get('decision', 'N/A')}")
|
| 622 |
-
st.write(f"**Reason:** {result_json.get('reason', 'N/A')}")
|
| 623 |
-
st.markdown("##### Debug & Matching Details")
|
| 624 |
-
st.json(result_json.get('debug'))
|
| 625 |
-
st.markdown("##### Extracted Invoice JSON")
|
| 626 |
-
st.json(extracted_info)
|
| 627 |
-
st.markdown("##### Matched PO Row")
|
| 628 |
-
st.json(result_json.get('po_row'))
|
| 629 |
-
except Exception:
|
| 630 |
-
st.subheader("AI Decision & Reason")
|
| 631 |
-
st.write(result)
|
| 632 |
|
| 633 |
-
#
|
| 634 |
if "last_api" in st.session_state:
|
| 635 |
-
with st.expander("Debug"):
|
| 636 |
st.code(st.session_state.last_api)
|
| 637 |
-
st.code(st.session_state.last_raw)
|
|
|
|
| 5 |
import os
|
| 6 |
import time
|
| 7 |
import mimetypes
|
|
|
|
|
|
|
|
|
|
| 8 |
from fuzzywuzzy import fuzz
|
| 9 |
+
import pandas as pd
|
| 10 |
|
| 11 |
+
# ----- Styling -----
|
| 12 |
+
st.set_page_config(page_title="EZOFIS Document Validation Agent", layout="wide")
|
|
|
|
|
|
|
| 13 |
st.markdown("""
|
| 14 |
<style>
|
| 15 |
.block-card {
|
| 16 |
+
background: #fff; border-radius: 20px; box-shadow: 0 2px 16px rgba(25,39,64,0.05);
|
| 17 |
+
padding: 32px 26px 24px 26px; margin-bottom: 24px;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
}
|
| 19 |
+
.step-num {background: #A020F0; color: #fff; border-radius: 999px;
|
| 20 |
+
padding: 6px 13px; font-weight: 700; margin-right: 14px; font-size: 20px;
|
| 21 |
+
display: inline-block; vertical-align: middle;}
|
| 22 |
.stButton>button {
|
| 23 |
+
background: #A020F0 !important; color: white !important; border-radius: 12px !important;
|
| 24 |
+
padding: 10px 32px !important; font-weight: 700; border: none !important; font-size: 18px !important;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
margin-top: 12px !important;
|
| 26 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
</style>
|
| 28 |
""", unsafe_allow_html=True)
|
| 29 |
|
| 30 |
+
# ----- API Config -----
|
| 31 |
+
UNSTRACT_BASE = "https://llmwhisperer-api.us-central.unstract.com/api/v2"
|
| 32 |
+
UNSTRACT_API_KEY = os.getenv("UNSTRACT_API_KEY") # Set in environment
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
|
| 34 |
+
OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY") # Set in environment
|
| 35 |
+
OPENROUTER_URL = "https://openrouter.ai/api/v1/chat/completions"
|
| 36 |
+
GEMMA_MODEL = "google/gemma-3-4b-it:free"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
|
| 38 |
+
# =========== UI ===========
|
| 39 |
+
st.markdown(
|
| 40 |
+
"<h1 style='font-weight:800; margin-bottom:8px;'>EZOFIS Document Validation Agent</h1>",
|
| 41 |
+
unsafe_allow_html=True
|
| 42 |
+
)
|
| 43 |
+
st.markdown(
|
| 44 |
+
"<div style='font-size:20px; margin-bottom:28px; color:#24345C;'>Check document submissions against mortgage checklist with AI.</div>",
|
| 45 |
+
unsafe_allow_html=True
|
| 46 |
+
)
|
| 47 |
|
| 48 |
+
# ===== Step 1: Checklist JSON input =====
|
| 49 |
+
st.markdown("<span class='step-num'>1</span> <b>Paste Mortgage Checklist (JSON)</b>", unsafe_allow_html=True)
|
| 50 |
+
sample_checklist = '''{
|
| 51 |
+
"required_documents": [
|
| 52 |
+
{"type": "Driver's License", "description": "Government-issued photo ID"},
|
| 53 |
+
{"type": "Passport", "description": "Valid passport"},
|
| 54 |
+
{"type": "SIN Card", "description": "Social Insurance Number document"},
|
| 55 |
+
{"type": "Bank Statement", "description": "Last 3 months bank statement"},
|
| 56 |
+
{"type": "Employment Letter", "description": "Signed letter from employer"},
|
| 57 |
+
{"type": "Pay Stub", "description": "Most recent pay stub"},
|
| 58 |
+
{"type": "Proof of Address", "description": "Utility bill or lease"}
|
| 59 |
+
]
|
| 60 |
+
}'''
|
| 61 |
+
checklist_text = st.text_area(
|
| 62 |
+
"Paste or edit your mortgage checklist JSON below:",
|
| 63 |
+
value=sample_checklist,
|
| 64 |
+
height=200,
|
| 65 |
+
key="doc_checklist_json"
|
| 66 |
+
)
|
| 67 |
+
# Parse checklist
|
| 68 |
+
try:
|
| 69 |
+
checklist = json.loads(checklist_text)
|
| 70 |
+
required_types = [doc["type"] for doc in checklist["required_documents"]]
|
| 71 |
+
except Exception as e:
|
| 72 |
+
st.error("Invalid checklist JSON.")
|
| 73 |
+
st.stop()
|
| 74 |
+
|
| 75 |
+
# ===== Step 2: Document upload =====
|
| 76 |
+
st.markdown("<span class='step-num'>2</span> <b>Upload Document(s) to Validate</b>", unsafe_allow_html=True)
|
| 77 |
+
uploaded_files = st.file_uploader(
|
| 78 |
+
"Upload PDF, DOCX, XLSX, PNG, JPG, TIFF, etc.",
|
| 79 |
+
type=["pdf", "docx", "xlsx", "xls", "png", "jpg", "jpeg", "tiff"],
|
| 80 |
+
key="mortgage_files",
|
| 81 |
+
accept_multiple_files=True
|
| 82 |
+
)
|
| 83 |
|
| 84 |
+
# ===== Utilities =====
|
| 85 |
def get_content_type(filename):
|
| 86 |
mime, _ = mimetypes.guess_type(filename)
|
| 87 |
ext = filename.lower().split('.')[-1]
|
|
|
|
| 91 |
return "application/octet-stream"
|
| 92 |
return mime
|
| 93 |
|
|
|
|
|
|
|
|
|
|
| 94 |
def extract_text_from_unstract(uploaded_file):
|
| 95 |
filename = getattr(uploaded_file, "name", "uploaded_file")
|
| 96 |
file_bytes = uploaded_file.read()
|
|
|
|
| 100 |
"Content-Type": content_type,
|
| 101 |
}
|
| 102 |
url = f"{UNSTRACT_BASE}/whisper"
|
| 103 |
+
with st.spinner("Uploading and extracting with Unstract..."):
|
| 104 |
r = requests.post(url, headers=headers, data=file_bytes)
|
| 105 |
if r.status_code != 202:
|
| 106 |
+
st.error(f"Unstract error: {r.status_code} - {r.text}")
|
| 107 |
return None
|
| 108 |
whisper_hash = r.json().get("whisper_hash")
|
| 109 |
if not whisper_hash:
|
| 110 |
st.error("Unstract: No whisper_hash received.")
|
| 111 |
return None
|
| 112 |
|
| 113 |
+
# Poll for status
|
| 114 |
status_url = f"{UNSTRACT_BASE}/whisper-status?whisper_hash={whisper_hash}"
|
|
|
|
| 115 |
for i in range(30):
|
| 116 |
status_r = requests.get(status_url, headers={"unstract-key": UNSTRACT_API_KEY})
|
| 117 |
if status_r.status_code != 200:
|
| 118 |
+
st.error(f"Unstract status error: {status_r.status_code} - {status_r.text}")
|
| 119 |
return None
|
| 120 |
status = status_r.json().get("status")
|
| 121 |
if status == "processed":
|
|
|
|
| 122 |
break
|
|
|
|
| 123 |
time.sleep(2)
|
| 124 |
else:
|
| 125 |
+
st.error("Unstract: Timeout waiting for OCR.")
|
| 126 |
return None
|
| 127 |
|
| 128 |
retrieve_url = f"{UNSTRACT_BASE}/whisper-retrieve?whisper_hash={whisper_hash}&text_only=true"
|
| 129 |
r = requests.get(retrieve_url, headers={"unstract-key": UNSTRACT_API_KEY})
|
| 130 |
if r.status_code != 200:
|
| 131 |
+
st.error(f"Unstract: Error retrieving text: {r.status_code} - {r.text}")
|
| 132 |
return None
|
| 133 |
try:
|
| 134 |
data = r.json()
|
|
|
|
| 136 |
except Exception:
|
| 137 |
return r.text
|
| 138 |
|
| 139 |
+
def fuzzy_match_type(detected_type, checklist_types):
|
| 140 |
+
# Returns best match and score
|
| 141 |
+
best_type = None
|
| 142 |
+
best_score = 0
|
| 143 |
+
for t in checklist_types:
|
| 144 |
+
score = fuzz.token_set_ratio(str(detected_type), str(t))
|
| 145 |
+
if score > best_score:
|
| 146 |
+
best_type = t
|
| 147 |
+
best_score = score
|
| 148 |
+
return best_type, best_score
|
| 149 |
+
|
| 150 |
+
def query_gemma_llm(doc_text, checklist_json):
    """Validate an extracted document with the Gemma LLM via OpenRouter.

    Args:
        doc_text: OCR-extracted text of the document; only the first
            4000 characters are sent to keep the prompt bounded.
        checklist_json: checklist structure serialized into the prompt.

    Returns:
        dict parsed from the model's JSON answer (document_type,
        expiry_date, is_expired, looks_genuine, confidence, verdict),
        or None on any HTTP or parse failure. Errors are surfaced to
        the UI via st.error / st.code.
    """
    from datetime import date  # local import: the top-of-file import block is not editable from here

    # Bug fix: the prompt previously hard-coded "21st June 2025" as the
    # current date, so expiry verdicts went stale after that day. Use the
    # real current date instead.
    current_date = date.today().strftime("%d %B %Y")

    prompt = f"""
Read the following extracted document text and analyze according to this checklist JSON:
{json.dumps(checklist_json)}

Can you read from this text, what type of document it is such as Certificate, License, Passport, etc and Also find the expiry date of it from the text, If you don't find the expiry date text but if you found any other code such as MRZ then find the expiry date from that. Also by the look of it give your verdict whether this is genuine with a confidence score. Also if the current date is {current_date} then check whether the document is already expired or valid.

Return your output as a JSON like:
{{
"document_type": "...",
"expiry_date": "...",
"is_expired": true/false,
"looks_genuine": true/false,
"confidence": <score 0-100>,
"verdict": "...reasoned verdict..."
}}
Document Text:
{doc_text[:4000]}
""".strip()

    headers = {
        "Authorization": f"Bearer {OPENROUTER_API_KEY}",
        "HTTP-Referer": "https://chat.openai.com",  # Some openrouter models require this
        "X-Title": "EZOFIS-Doc-Validator",
        "Content-Type": "application/json",
    }
    data = {
        "model": GEMMA_MODEL,
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0.1,  # near-deterministic output for structured extraction
        "max_tokens": 1024,
    }
    with st.spinner("Gemma LLM is validating the document..."):
        resp = requests.post(OPENROUTER_URL, headers=headers, json=data, timeout=90)
    if resp.status_code != 200:
        st.error(f"OpenRouter error: {resp.status_code}: {resp.text}")
        return None
    result = resp.json()["choices"][0]["message"]["content"]
    # Keep the raw model output for the debug expander at the bottom of the
    # page, which reads st.session_state.last_api ("LLM raw output").
    st.session_state.last_api = result
    # The model may wrap its answer in prose; extract the outermost {...} span.
    start = result.find("{")
    end = result.rfind("}") + 1
    if start == -1 or end == 0:
        st.error("Gemma did not return JSON.")
        st.code(result)
        return None
    try:
        return json.loads(result[start:end])
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        st.error("Error parsing LLM response.")
        st.code(result)
        return None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 201 |
|
| 202 |
+
# ========== Step 3: Run Validation ==========
# For each uploaded file: OCR-extract text, ask the LLM to classify and
# validate it, fuzzy-match the detected type against the checklist, then
# render an acceptance table.
if st.button("Run Document Validation", type="primary") and uploaded_files:
    results = []
    for uploaded_file in uploaded_files:
        st.subheader(f"Validating: {uploaded_file.name}")

        # OCR extraction; skip the file on failure.
        doc_text = extract_text_from_unstract(uploaded_file)
        if not doc_text:
            st.warning("Skipping due to extraction error.")
            continue

        # LLM analysis of the extracted text against the checklist.
        llm_json = query_gemma_llm(doc_text, checklist)
        if not llm_json:
            st.warning("Skipping due to LLM error.")
            continue

        # Fuzzy-match the detected type against the required checklist types.
        detected_type = llm_json.get("document_type", "")
        matched_type, match_score = fuzzy_match_type(detected_type, required_types)

        is_expired = llm_json.get("is_expired", False)
        looks_genuine = llm_json.get("looks_genuine", False)

        # Accept only when the type matches well enough (score >= 70), the
        # LLM judges the document genuine, and it has not expired.
        accepted = (
            matched_type is not None
            and match_score >= 70
            and looks_genuine
            and not is_expired
        )

        if matched_type:
            type_reason = f"Document type '{detected_type}' matched checklist '{matched_type}' with score {match_score}/100."
        else:
            type_reason = f"Document type '{detected_type}' did not match any required type."
        reason = [
            type_reason,
            f"Genuineness confidence: {llm_json.get('confidence', 0)}.",
            "Document is not expired." if not is_expired else "Document is expired.",
            llm_json.get("verdict", ""),
        ]

        results.append({
            "File": uploaded_file.name,
            "Detected Type": detected_type,
            "Checklist Match": matched_type or "-",
            "Type Score": match_score,
            "Expiry Date": llm_json.get("expiry_date", "-"),
            "Expired": "Yes" if is_expired else "No",
            "Genuine": "Yes" if looks_genuine else "No",
            "Confidence": llm_json.get("confidence", "-"),
            "Accepted": "Yes" if accepted else "No",
            "Reason": " ".join(reason),
        })

    if results:
        st.success("Validation Complete.")
        st.dataframe(pd.DataFrame(results))
    else:
        st.warning("No valid results.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 255 |
|
| 256 |
+
# Debugging: expose the raw LLM output captured in session state, if any.
if "last_api" in st.session_state:
    debug_box = st.expander("Debug (LLM raw output)")
    with debug_box:
        st.code(st.session_state.last_api)
|
|
|