Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -11,6 +11,7 @@ from langchain_community.chat_models import ChatOpenAI
|
|
| 11 |
from langchain.agents import initialize_agent, Tool, AgentType
|
| 12 |
from fuzzywuzzy import fuzz
|
| 13 |
|
|
|
|
| 14 |
st.set_page_config(page_title="Accounts Payable AI Agent", layout="wide")
|
| 15 |
|
| 16 |
MODELS = {
|
|
@@ -45,6 +46,8 @@ MODELS = {
|
|
| 45 |
},
|
| 46 |
}
|
| 47 |
|
|
|
|
|
|
|
| 48 |
def get_api_key(model_choice):
|
| 49 |
key = os.getenv(MODELS[model_choice]["key_env"])
|
| 50 |
if not key:
|
|
@@ -202,64 +205,18 @@ def ensure_total_due(invoice_header):
|
|
| 202 |
break
|
| 203 |
return invoice_header
|
| 204 |
|
| 205 |
-
def
|
| 206 |
-
|
| 207 |
-
ext = filename.lower().split('.')[-1]
|
| 208 |
-
if ext == "pdf":
|
| 209 |
-
return "text/plain"
|
| 210 |
-
if mime is None:
|
| 211 |
-
return "application/octet-stream"
|
| 212 |
-
return mime
|
| 213 |
-
|
| 214 |
-
UNSTRACT_BASE = "https://llmwhisperer-api.us-central.unstract.com/api/v2"
|
| 215 |
-
UNSTRACT_API_KEY = os.getenv("UNSTRACT_API_KEY")
|
| 216 |
-
|
| 217 |
-
def extract_text_from_unstract(uploaded_file):
|
| 218 |
-
filename = getattr(uploaded_file, "name", "uploaded_file")
|
| 219 |
-
file_bytes = uploaded_file.read()
|
| 220 |
-
content_type = get_content_type(filename)
|
| 221 |
-
headers = {
|
| 222 |
-
"unstract-key": UNSTRACT_API_KEY,
|
| 223 |
-
"Content-Type": content_type,
|
| 224 |
-
}
|
| 225 |
-
url = f"{UNSTRACT_BASE}/whisper"
|
| 226 |
-
with st.spinner("Uploading and processing document with Unstract..."):
|
| 227 |
-
r = requests.post(url, headers=headers, data=file_bytes)
|
| 228 |
-
if r.status_code != 202:
|
| 229 |
-
st.error(f"Unstract: Error uploading file: {r.status_code} - {r.text}")
|
| 230 |
-
return None
|
| 231 |
-
whisper_hash = r.json().get("whisper_hash")
|
| 232 |
-
if not whisper_hash:
|
| 233 |
-
st.error("Unstract: No whisper_hash received.")
|
| 234 |
-
return None
|
| 235 |
-
|
| 236 |
-
status_url = f"{UNSTRACT_BASE}/whisper-status?whisper_hash={whisper_hash}"
|
| 237 |
-
status_placeholder = st.empty()
|
| 238 |
-
for i in range(30):
|
| 239 |
-
status_r = requests.get(status_url, headers={"unstract-key": UNSTRACT_API_KEY})
|
| 240 |
-
if status_r.status_code != 200:
|
| 241 |
-
st.error(f"Unstract: Error checking status: {status_r.status_code} - {status_r.text}")
|
| 242 |
-
return None
|
| 243 |
-
status = status_r.json().get("status")
|
| 244 |
-
if status == "processed":
|
| 245 |
-
status_placeholder.info("Unstract status: processed! 🎉")
|
| 246 |
-
break
|
| 247 |
-
status_placeholder.info(f"Unstract status: {status or 'waiting'}... ({i+1})")
|
| 248 |
-
time.sleep(2)
|
| 249 |
-
else:
|
| 250 |
-
status_placeholder.error("Unstract: Timeout waiting for OCR to finish.")
|
| 251 |
-
return None
|
| 252 |
-
|
| 253 |
-
retrieve_url = f"{UNSTRACT_BASE}/whisper-retrieve?whisper_hash={whisper_hash}&text_only=true"
|
| 254 |
-
r = requests.get(retrieve_url, headers={"unstract-key": UNSTRACT_API_KEY})
|
| 255 |
-
if r.status_code != 200:
|
| 256 |
-
st.error(f"Unstract: Error retrieving extracted text: {r.status_code} - {r.text}")
|
| 257 |
return None
|
| 258 |
-
|
| 259 |
-
|
| 260 |
-
|
| 261 |
-
|
| 262 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 263 |
|
| 264 |
def weighted_fuzzy_score(s1, s2):
|
| 265 |
if not s1 and not s2:
|
|
@@ -352,6 +309,93 @@ def find_best_po_match(inv, po_df):
|
|
| 352 |
best_row, best_score, reason, debug = scores[0]
|
| 353 |
return best_row, best_score, reason, debug
|
| 354 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 355 |
st.sidebar.header("Step 1: Upload Active Purchase Orders (POs)")
|
| 356 |
po_file = st.sidebar.file_uploader(
|
| 357 |
"Upload POs CSV (must include PO number, Supplier, Items, etc.)",
|
|
@@ -376,7 +420,6 @@ if st.button("Extract") and inv_file:
|
|
| 376 |
with st.spinner("Extracting text from document using Unstract..."):
|
| 377 |
text = extract_text_from_unstract(inv_file)
|
| 378 |
if text:
|
| 379 |
-
prompt = get_extraction_prompt(mdl, text)
|
| 380 |
extracted_info = extract_invoice_info(mdl, text)
|
| 381 |
if extracted_info:
|
| 382 |
if "invoice_header" in extracted_info:
|
|
@@ -417,32 +460,6 @@ def po_match_tool_func(input_text):
|
|
| 417 |
"po_row": best_row.to_dict() if best_row is not None else None
|
| 418 |
})
|
| 419 |
|
| 420 |
-
def extract_invoice_info(model_choice, text):
    """Extract a normalized invoice header and line items from document text.

    Builds the extraction prompt for ``model_choice``, queries the LLM and
    parses its reply with ``clean_json_response``.

    Args:
        model_choice: Model identifier passed to ``get_extraction_prompt``
            and ``query_llm``.
        text: Raw text of the invoice document.

    Returns:
        ``{"invoice_header": dict, "line_items": list}``, or ``None`` when
        the LLM returns nothing or the reply cannot be parsed into a JSON
        object.
    """
    prompt = get_extraction_prompt(model_choice, text)
    raw = query_llm(model_choice, prompt)
    if not raw:
        return None
    data = clean_json_response(raw)
    # A non-dict payload (e.g. a bare JSON list) has no ``.get`` and cannot
    # be interpreted as an invoice.
    if not data or not isinstance(data, dict):
        return None

    # ``"invoice_header": null`` (or any non-dict value) must not reach the
    # ``setdefault`` calls below, so treat it as a missing header.
    hdr = data.get("invoice_header")
    if not isinstance(hdr, dict):
        hdr = {}
    # Some replies are flat: the header fields sit at the top level.
    if not hdr and any(k in data for k in ("invoice_number", "supplier_name", "customer_name")):
        hdr = data
    for k in ("invoice_number", "invoice_date", "po_number", "invoice_value", "supplier_name", "customer_name"):
        hdr.setdefault(k, None)
    if not hdr.get("supplier_name"):
        hdr["supplier_name"] = fallback_supplier(text)
    hdr = ensure_total_due(hdr)

    items = data.get("line_items", [])
    if not isinstance(items, list):
        items = []
    for itm in items:
        # Non-dict entries are left in the list untouched.
        if not isinstance(itm, dict):
            continue
        for k in ("item_number", "description", "quantity", "unit_price", "total_price"):
            itm.setdefault(k, None)
    return {"invoice_header": hdr, "line_items": items}
|
| 445 |
-
|
| 446 |
if po_df is not None:
|
| 447 |
st.session_state["last_po_df"] = po_df
|
| 448 |
|
|
|
|
| 11 |
from langchain.agents import initialize_agent, Tool, AgentType
|
| 12 |
from fuzzywuzzy import fuzz
|
| 13 |
|
| 14 |
+
# --- CONFIGURATION ---
|
| 15 |
st.set_page_config(page_title="Accounts Payable AI Agent", layout="wide")
|
| 16 |
|
| 17 |
MODELS = {
|
|
|
|
| 46 |
},
|
| 47 |
}
|
| 48 |
|
| 49 |
+
# --- UTILITY FUNCTIONS ---
|
| 50 |
+
|
| 51 |
def get_api_key(model_choice):
|
| 52 |
key = os.getenv(MODELS[model_choice]["key_env"])
|
| 53 |
if not key:
|
|
|
|
| 205 |
break
|
| 206 |
return invoice_header
|
| 207 |
|
| 208 |
+
def clean_num(val):
    """Parse a number out of *val* and return it as a float.

    Args:
        val: ``None``, an ``int``/``float``, or any object whose ``str()``
            may contain numbers (currency symbols, thousands separators and
            surrounding text are tolerated).

    Returns:
        ``None`` when *val* is ``None`` or contains no number; ``float(val)``
        for numeric input; otherwise the largest number found in the text.
    """
    if val is None:
        return None
    if isinstance(val, (int, float)):
        return float(val)
    tokens = re.findall(r"[-+]?\d[\d,]*\.?\d*", str(val))
    # Every token is an optional sign, digits/commas and an optional
    # fraction, so it is a valid float literal once the commas are removed.
    # Signed tokens are kept: a digits-only filter would silently discard
    # negative amounts such as "-12.50".
    numbers = [float(token.replace(",", "")) for token in tokens]
    return max(numbers) if numbers else None
|
| 220 |
|
| 221 |
def weighted_fuzzy_score(s1, s2):
|
| 222 |
if not s1 and not s2:
|
|
|
|
| 309 |
best_row, best_score, reason, debug = scores[0]
|
| 310 |
return best_row, best_score, reason, debug
|
| 311 |
|
| 312 |
+
def extract_invoice_info(model_choice, text):
    """Extract a normalized invoice header and line items from document text.

    Builds the extraction prompt for ``model_choice``, queries the LLM and
    parses its reply with ``clean_json_response``.

    Args:
        model_choice: Model identifier passed to ``get_extraction_prompt``
            and ``query_llm``.
        text: Raw text of the invoice document.

    Returns:
        ``{"invoice_header": dict, "line_items": list}``, or ``None`` when
        the LLM returns nothing or the reply cannot be parsed into a JSON
        object.
    """
    prompt = get_extraction_prompt(model_choice, text)
    raw = query_llm(model_choice, prompt)
    if not raw:
        return None
    data = clean_json_response(raw)
    # A non-dict payload (e.g. a bare JSON list) has no ``.get`` and cannot
    # be interpreted as an invoice.
    if not data or not isinstance(data, dict):
        return None

    # ``"invoice_header": null`` (or any non-dict value) must not reach the
    # ``setdefault`` calls below, so treat it as a missing header.
    hdr = data.get("invoice_header")
    if not isinstance(hdr, dict):
        hdr = {}
    # Some replies are flat: the header fields sit at the top level.
    if not hdr and any(k in data for k in ("invoice_number", "supplier_name", "customer_name")):
        hdr = data
    for k in ("invoice_number", "invoice_date", "po_number", "invoice_value", "supplier_name", "customer_name"):
        hdr.setdefault(k, None)
    if not hdr.get("supplier_name"):
        hdr["supplier_name"] = fallback_supplier(text)
    hdr = ensure_total_due(hdr)

    items = data.get("line_items", [])
    if not isinstance(items, list):
        items = []
    for itm in items:
        # Non-dict entries are left in the list untouched.
        if not isinstance(itm, dict):
            continue
        for k in ("item_number", "description", "quantity", "unit_price", "total_price"):
            itm.setdefault(k, None)
    return {"invoice_header": hdr, "line_items": items}
|
| 337 |
+
|
| 338 |
+
def get_content_type(filename):
    """Return the ``Content-Type`` header value to send for *filename*.

    Args:
        filename: Name of the uploaded file; only its suffix is inspected.

    Returns:
        ``"text/plain"`` for names ending in ``.pdf`` (case-insensitive),
        otherwise the type guessed by ``mimetypes``, falling back to
        ``"application/octet-stream"`` when the type is unknown.
    """
    mime, _ = mimetypes.guess_type(filename)
    # Match on the suffix including its dot, so that a dot-less file that is
    # literally named "pdf" is not mistaken for a PDF document.
    if filename.lower().endswith(".pdf"):
        # NOTE(review): PDFs are sent as text/plain rather than
        # application/pdf; this looks deliberate for the upload endpoint.
        # Confirm against the API documentation before changing it.
        return "text/plain"
    if mime is None:
        return "application/octet-stream"
    return mime
|
| 346 |
+
|
| 347 |
+
UNSTRACT_BASE = "https://llmwhisperer-api.us-central.unstract.com/api/v2"
|
| 348 |
+
UNSTRACT_API_KEY = os.getenv("UNSTRACT_API_KEY")
|
| 349 |
+
|
| 350 |
+
def extract_text_from_unstract(uploaded_file):
    """Send an uploaded document to the Unstract API and return its text.

    The function uploads the file to ``{UNSTRACT_BASE}/whisper``, polls
    ``/whisper-status`` until the job reports ``"processed"`` (at most 30
    polls, 2 seconds apart), then fetches the result from
    ``/whisper-retrieve``. Progress and errors are reported through
    Streamlit widgets.

    Args:
        uploaded_file: File-like object with ``read()`` and, optionally, a
            ``name`` attribute used to choose the upload Content-Type.

    Returns:
        The extracted text, or ``None`` on any failure (missing API key,
        network error, unexpected HTTP status, missing hash, timeout).
    """
    if not UNSTRACT_API_KEY:
        st.error("Unstract: UNSTRACT_API_KEY is not set.")
        return None

    request_timeout = 60  # seconds per HTTP call, so a stalled call cannot hang the app
    max_polls = 30
    poll_interval = 2  # seconds between status checks

    filename = getattr(uploaded_file, "name", "uploaded_file")
    # Rewind first: a file object that was already read once would otherwise
    # yield empty bytes.
    if hasattr(uploaded_file, "seek"):
        uploaded_file.seek(0)
    file_bytes = uploaded_file.read()

    auth_headers = {"unstract-key": UNSTRACT_API_KEY}
    upload_headers = {**auth_headers, "Content-Type": get_content_type(filename)}

    try:
        with st.spinner("Uploading and processing document with Unstract..."):
            r = requests.post(
                f"{UNSTRACT_BASE}/whisper",
                headers=upload_headers,
                data=file_bytes,
                timeout=request_timeout,
            )
        if r.status_code != 202:
            st.error(f"Unstract: Error uploading file: {r.status_code} - {r.text}")
            return None
        whisper_hash = r.json().get("whisper_hash")
        if not whisper_hash:
            st.error("Unstract: No whisper_hash received.")
            return None

        status_placeholder = st.empty()
        for attempt in range(1, max_polls + 1):
            # ``params`` URL-encodes the hash instead of splicing it into
            # the query string by hand.
            status_r = requests.get(
                f"{UNSTRACT_BASE}/whisper-status",
                params={"whisper_hash": whisper_hash},
                headers=auth_headers,
                timeout=request_timeout,
            )
            if status_r.status_code != 200:
                st.error(f"Unstract: Error checking status: {status_r.status_code} - {status_r.text}")
                return None
            status = status_r.json().get("status")
            if status == "processed":
                status_placeholder.info("Unstract status: processed! 🎉")
                break
            # NOTE(review): terminal failure status names are assumed here;
            # confirm them against the API documentation.
            if status in ("error", "failed"):
                status_placeholder.error(f"Unstract: Processing failed (status: {status}).")
                return None
            status_placeholder.info(f"Unstract status: {status or 'waiting'}... ({attempt})")
            if attempt < max_polls:
                # No point sleeping after the final poll.
                time.sleep(poll_interval)
        else:
            status_placeholder.error("Unstract: Timeout waiting for OCR to finish.")
            return None

        r = requests.get(
            f"{UNSTRACT_BASE}/whisper-retrieve",
            params={"whisper_hash": whisper_hash, "text_only": "true"},
            headers=auth_headers,
            timeout=request_timeout,
        )
    except (requests.RequestException, ValueError) as exc:
        # Connection problems, timeouts and malformed JSON from the upload or
        # status calls are reported instead of crashing the app.
        st.error(f"Unstract: Request failed: {exc}")
        return None

    if r.status_code != 200:
        st.error(f"Unstract: Error retrieving extracted text: {r.status_code} - {r.text}")
        return None

    # The body may be a JSON object carrying ``result_text`` or plain text;
    # fall back to the raw body whenever it is not such an object.
    try:
        payload = r.json()
    except ValueError:
        return r.text
    if isinstance(payload, dict):
        return payload.get("result_text") or r.text
    return r.text
|
| 396 |
+
|
| 397 |
+
# --- UI/LOGIC ---
|
| 398 |
+
|
| 399 |
st.sidebar.header("Step 1: Upload Active Purchase Orders (POs)")
|
| 400 |
po_file = st.sidebar.file_uploader(
|
| 401 |
"Upload POs CSV (must include PO number, Supplier, Items, etc.)",
|
|
|
|
| 420 |
with st.spinner("Extracting text from document using Unstract..."):
|
| 421 |
text = extract_text_from_unstract(inv_file)
|
| 422 |
if text:
|
|
|
|
| 423 |
extracted_info = extract_invoice_info(mdl, text)
|
| 424 |
if extracted_info:
|
| 425 |
if "invoice_header" in extracted_info:
|
|
|
|
| 460 |
"po_row": best_row.to_dict() if best_row is not None else None
|
| 461 |
})
|
| 462 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 463 |
if po_df is not None:
|
| 464 |
st.session_state["last_po_df"] = po_df
|
| 465 |
|