Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -27,7 +27,6 @@ st.markdown("""
|
|
| 27 |
padding: 10px 32px !important; font-weight: 700; border: none !important; font-size: 18px !important;
|
| 28 |
margin-top: 12px !important;
|
| 29 |
}
|
| 30 |
-
/* Style result table headers */
|
| 31 |
.styled-table th {
|
| 32 |
background: #f3ecff !important;
|
| 33 |
color: #42318d !important;
|
|
@@ -42,12 +41,6 @@ st.markdown("""
|
|
| 42 |
word-break: break-word;
|
| 43 |
max-width: 220px;
|
| 44 |
}
|
| 45 |
-
.accepted-row {
|
| 46 |
-
background: #e7ffe7 !important;
|
| 47 |
-
}
|
| 48 |
-
.rejected-row {
|
| 49 |
-
background: #fff1f0 !important;
|
| 50 |
-
}
|
| 51 |
</style>
|
| 52 |
""", unsafe_allow_html=True)
|
| 53 |
|
|
@@ -57,7 +50,6 @@ st.markdown(
|
|
| 57 |
)
|
| 58 |
|
| 59 |
# ====== SIDE-BY-SIDE LAYOUT ======
|
| 60 |
-
|
| 61 |
col_left, col_right = st.columns([1.35, 1.05])
|
| 62 |
|
| 63 |
with col_left:
|
|
@@ -97,7 +89,7 @@ with col_left:
|
|
| 97 |
accept_multiple_files=True
|
| 98 |
)
|
| 99 |
|
| 100 |
-
# Step 3: Thresholds (SLIDERS
|
| 101 |
st.markdown("<span class='step-num'>3</span> <b>Configure Acceptance Thresholds</b>", unsafe_allow_html=True)
|
| 102 |
min_match_score = st.slider("Minimum Type Match Score (0-100)", 50, 100, 70, 1)
|
| 103 |
min_confidence = st.slider("Minimum LLM Confidence (0-100)", 50, 100, 70, 1)
|
|
@@ -132,7 +124,6 @@ Checklist for precision:
|
|
| 132 |
# Step 6: Run button
|
| 133 |
run_btn = st.button("Run Document Validation", type="primary")
|
| 134 |
|
| 135 |
-
|
| 136 |
# ========== FUNCTIONS ==========
|
| 137 |
|
| 138 |
def get_content_type(filename):
|
|
@@ -224,7 +215,7 @@ def query_gemma_llm(doc_text, checklist, agent_instruction, current_date, status
|
|
| 224 |
prompt = build_prompt(doc_text, checklist, agent_instruction, current_date)
|
| 225 |
headers = {
|
| 226 |
"Authorization": f"Bearer {OPENROUTER_API_KEY}",
|
| 227 |
-
"HTTP-Referer": "https://chat.openai.com",
|
| 228 |
"X-Title": "EZOFIS-Doc-Validator",
|
| 229 |
"Content-Type": "application/json",
|
| 230 |
}
|
|
@@ -242,7 +233,6 @@ def query_gemma_llm(doc_text, checklist, agent_instruction, current_date, status
|
|
| 242 |
status_box.error(f"OpenRouter error: {resp.status_code}: {resp.text}")
|
| 243 |
return None, None, prompt
|
| 244 |
result = resp.json()["choices"][0]["message"]["content"]
|
| 245 |
-
# Extract only JSON
|
| 246 |
start = result.find("{")
|
| 247 |
end = result.rfind("}") + 1
|
| 248 |
if start == -1 or end == 0:
|
|
@@ -321,118 +311,118 @@ if run_btn and uploaded_files:
|
|
| 321 |
results = []
|
| 322 |
debug_data = []
|
| 323 |
|
| 324 |
-
|
| 325 |
-
|
| 326 |
-
|
| 327 |
-
|
| 328 |
-
|
| 329 |
-
|
| 330 |
-
|
| 331 |
-
|
| 332 |
-
|
| 333 |
-
|
| 334 |
-
|
| 335 |
-
|
| 336 |
-
|
| 337 |
-
|
| 338 |
-
|
| 339 |
-
|
| 340 |
-
|
| 341 |
-
|
| 342 |
-
|
| 343 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 344 |
|
| 345 |
-
|
| 346 |
-
|
| 347 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 348 |
debug_data.append({uploaded_file.name: debug})
|
| 349 |
-
|
| 350 |
-
|
| 351 |
-
|
| 352 |
-
|
| 353 |
-
|
| 354 |
-
|
| 355 |
-
|
| 356 |
-
|
| 357 |
-
|
| 358 |
-
|
| 359 |
-
|
| 360 |
-
|
| 361 |
-
|
| 362 |
-
|
| 363 |
-
|
| 364 |
-
|
| 365 |
-
|
| 366 |
-
|
| 367 |
-
if not checklist_matched:
|
| 368 |
-
reason.append("No matching checklist item found. Document rejected.")
|
| 369 |
else:
|
| 370 |
-
|
| 371 |
-
f"Document type '{detected_type}' matched checklist '{matched_type}' with score {match_score}/100."
|
| 372 |
-
)
|
| 373 |
-
if not llm_json.get("looks_genuine", False):
|
| 374 |
-
reason.append("Document does not look genuine.")
|
| 375 |
-
if llm_json.get("is_expired", False):
|
| 376 |
-
reason.append("Document is expired.")
|
| 377 |
-
|
| 378 |
-
reason.append(f"Genuineness confidence: {llm_conf}.")
|
| 379 |
-
reason.append(llm_json.get("verdict", ""))
|
| 380 |
-
|
| 381 |
-
# Advanced agent: If confidence is in a "gray zone", ask the LLM for a final self-verdict
|
| 382 |
-
verdict_json, verdict_raw, verdict_prompt = advanced_llm_verdict(llm_json, min_confidence, status_box)
|
| 383 |
-
debug['LLM_self_verdict_prompt'] = verdict_prompt
|
| 384 |
-
debug['LLM_self_verdict_raw'] = verdict_raw
|
| 385 |
-
debug['LLM_self_verdict_json'] = verdict_json
|
| 386 |
-
|
| 387 |
-
if verdict_json:
|
| 388 |
-
accepted = verdict_json.get("accepted", False)
|
| 389 |
-
reason.append(f"LLM Self-verdict: {verdict_json.get('reason','')}")
|
| 390 |
-
status_box.info("Final decision (gray zone) taken by LLM self-verdict.")
|
| 391 |
-
|
| 392 |
-
results.append({
|
| 393 |
-
"File": uploaded_file.name,
|
| 394 |
-
"Detected Type": detected_type,
|
| 395 |
-
"Checklist Match": matched_type if checklist_matched else "-",
|
| 396 |
-
"Type Score": match_score,
|
| 397 |
-
"Expiry Date": llm_json.get("expiry_date", "-"),
|
| 398 |
-
"Expired": "Yes" if llm_json.get("is_expired", False) else "No",
|
| 399 |
-
"Genuine": "Yes" if llm_json.get("looks_genuine", False) else "No",
|
| 400 |
-
"Confidence": llm_conf,
|
| 401 |
-
"Accepted": "Yes" if accepted else "No",
|
| 402 |
-
"Reason": " ".join(reason)
|
| 403 |
-
})
|
| 404 |
-
debug['Checklist_match_details'] = {
|
| 405 |
-
"detected_type": detected_type,
|
| 406 |
-
"matched_type": matched_type,
|
| 407 |
-
"match_score": match_score,
|
| 408 |
-
"checklist_matched": checklist_matched,
|
| 409 |
-
"accepted": accepted
|
| 410 |
-
}
|
| 411 |
-
debug_data.append({uploaded_file.name: debug})
|
| 412 |
-
status_box.success("Validation complete. See result below.")
|
| 413 |
-
|
| 414 |
-
# ==== Results table with custom styling ====
|
| 415 |
-
if results:
|
| 416 |
-
st.success("All validations complete.")
|
| 417 |
-
df = pd.DataFrame(results)
|
| 418 |
-
# Convert to HTML with classes for styling
|
| 419 |
-
def style_row(row):
|
| 420 |
-
color = "#e7ffe7" if row["Accepted"] == "Yes" else "#fff1f0"
|
| 421 |
-
return [f"background-color: {color}"]*len(row)
|
| 422 |
-
styled_df = df.style.apply(style_row, axis=1)\
|
| 423 |
-
.set_table_attributes('class="styled-table"')\
|
| 424 |
-
.set_properties(**{
|
| 425 |
-
'font-size': '15px',
|
| 426 |
-
'word-break': 'break-word',
|
| 427 |
-
'border': '1px solid #ddd'
|
| 428 |
-
})
|
| 429 |
-
st.markdown('<h4 style="margin-top:28px;">Validation Results</h4>', unsafe_allow_html=True)
|
| 430 |
-
st.write(styled_df.to_html(escape=False), unsafe_allow_html=True)
|
| 431 |
-
else:
|
| 432 |
-
st.warning("No valid results.")
|
| 433 |
|
| 434 |
-
|
| 435 |
-
|
| 436 |
-
|
| 437 |
-
|
| 438 |
-
|
|
|
|
| 27 |
padding: 10px 32px !important; font-weight: 700; border: none !important; font-size: 18px !important;
|
| 28 |
margin-top: 12px !important;
|
| 29 |
}
|
|
|
|
| 30 |
.styled-table th {
|
| 31 |
background: #f3ecff !important;
|
| 32 |
color: #42318d !important;
|
|
|
|
| 41 |
word-break: break-word;
|
| 42 |
max-width: 220px;
|
| 43 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
</style>
|
| 45 |
""", unsafe_allow_html=True)
|
| 46 |
|
|
|
|
| 50 |
)
|
| 51 |
|
| 52 |
# ====== SIDE-BY-SIDE LAYOUT ======
|
|
|
|
| 53 |
col_left, col_right = st.columns([1.35, 1.05])
|
| 54 |
|
| 55 |
with col_left:
|
|
|
|
| 89 |
accept_multiple_files=True
|
| 90 |
)
|
| 91 |
|
| 92 |
+
# Step 3: Thresholds (SLIDERS HERE)
|
| 93 |
st.markdown("<span class='step-num'>3</span> <b>Configure Acceptance Thresholds</b>", unsafe_allow_html=True)
|
| 94 |
min_match_score = st.slider("Minimum Type Match Score (0-100)", 50, 100, 70, 1)
|
| 95 |
min_confidence = st.slider("Minimum LLM Confidence (0-100)", 50, 100, 70, 1)
|
|
|
|
| 124 |
# Step 6: Run button
|
| 125 |
run_btn = st.button("Run Document Validation", type="primary")
|
| 126 |
|
|
|
|
| 127 |
# ========== FUNCTIONS ==========
|
| 128 |
|
| 129 |
def get_content_type(filename):
|
|
|
|
| 215 |
prompt = build_prompt(doc_text, checklist, agent_instruction, current_date)
|
| 216 |
headers = {
|
| 217 |
"Authorization": f"Bearer {OPENROUTER_API_KEY}",
|
| 218 |
+
"HTTP-Referer": "https://chat.openai.com",
|
| 219 |
"X-Title": "EZOFIS-Doc-Validator",
|
| 220 |
"Content-Type": "application/json",
|
| 221 |
}
|
|
|
|
| 233 |
status_box.error(f"OpenRouter error: {resp.status_code}: {resp.text}")
|
| 234 |
return None, None, prompt
|
| 235 |
result = resp.json()["choices"][0]["message"]["content"]
|
|
|
|
| 236 |
start = result.find("{")
|
| 237 |
end = result.rfind("}") + 1
|
| 238 |
if start == -1 or end == 0:
|
|
|
|
| 311 |
results = []
|
| 312 |
debug_data = []
|
| 313 |
|
| 314 |
+
with col_right:
|
| 315 |
+
for uploaded_file in uploaded_files:
|
| 316 |
+
st.subheader(f"Validating: {uploaded_file.name}")
|
| 317 |
+
status_box = st.empty()
|
| 318 |
+
debug = {}
|
| 319 |
+
|
| 320 |
+
# Step 1: OCR
|
| 321 |
+
doc_text = extract_text_from_unstract(uploaded_file, status_box)
|
| 322 |
+
debug['OCR_extracted_text'] = doc_text
|
| 323 |
+
|
| 324 |
+
if not doc_text:
|
| 325 |
+
status_box.error("Skipping due to OCR extraction error.")
|
| 326 |
+
debug['error'] = "OCR extraction error"
|
| 327 |
+
debug_data.append({uploaded_file.name: debug})
|
| 328 |
+
continue
|
| 329 |
+
|
| 330 |
+
# Step 2: LLM Validation
|
| 331 |
+
llm_json, llm_raw, llm_prompt = query_gemma_llm(doc_text, checklist, agent_instruction, date_str, status_box)
|
| 332 |
+
debug['LLM_prompt'] = llm_prompt
|
| 333 |
+
debug['LLM_raw_response'] = llm_raw
|
| 334 |
+
debug['LLM_parsed_json'] = llm_json
|
| 335 |
+
|
| 336 |
+
if not llm_json:
|
| 337 |
+
status_box.error("Skipping due to LLM error.")
|
| 338 |
+
debug['error'] = "LLM processing error"
|
| 339 |
+
debug_data.append({uploaded_file.name: debug})
|
| 340 |
+
continue
|
| 341 |
+
|
| 342 |
+
detected_type = llm_json.get("document_type", "")
|
| 343 |
+
matched_type, match_score = fuzzy_match_type(detected_type, required_types)
|
| 344 |
+
|
| 345 |
+
checklist_matched = llm_json.get("checklist_matched", False)
|
| 346 |
+
if checklist_matched and match_score < min_match_score:
|
| 347 |
+
checklist_matched = False
|
| 348 |
+
|
| 349 |
+
llm_conf = llm_json.get("confidence", 0)
|
| 350 |
+
accepted = (
|
| 351 |
+
checklist_matched and
|
| 352 |
+
llm_json.get("looks_genuine", False) and
|
| 353 |
+
not llm_json.get("is_expired", False) and
|
| 354 |
+
(llm_conf >= min_confidence)
|
| 355 |
+
)
|
| 356 |
|
| 357 |
+
reason = []
|
| 358 |
+
if not checklist_matched:
|
| 359 |
+
reason.append("No matching checklist item found. Document rejected.")
|
| 360 |
+
else:
|
| 361 |
+
reason.append(
|
| 362 |
+
f"Document type '{detected_type}' matched checklist '{matched_type}' with score {match_score}/100."
|
| 363 |
+
)
|
| 364 |
+
if not llm_json.get("looks_genuine", False):
|
| 365 |
+
reason.append("Document does not look genuine.")
|
| 366 |
+
if llm_json.get("is_expired", False):
|
| 367 |
+
reason.append("Document is expired.")
|
| 368 |
+
|
| 369 |
+
reason.append(f"Genuineness confidence: {llm_conf}.")
|
| 370 |
+
reason.append(llm_json.get("verdict", ""))
|
| 371 |
+
|
| 372 |
+
# Advanced agent: If confidence is in a "gray zone", ask the LLM for a final self-verdict
|
| 373 |
+
verdict_json, verdict_raw, verdict_prompt = advanced_llm_verdict(llm_json, min_confidence, status_box)
|
| 374 |
+
debug['LLM_self_verdict_prompt'] = verdict_prompt
|
| 375 |
+
debug['LLM_self_verdict_raw'] = verdict_raw
|
| 376 |
+
debug['LLM_self_verdict_json'] = verdict_json
|
| 377 |
+
|
| 378 |
+
if verdict_json:
|
| 379 |
+
accepted = verdict_json.get("accepted", False)
|
| 380 |
+
reason.append(f"LLM Self-verdict: {verdict_json.get('reason','')}")
|
| 381 |
+
status_box.info("Final decision (gray zone) taken by LLM self-verdict.")
|
| 382 |
+
|
| 383 |
+
results.append({
|
| 384 |
+
"File": uploaded_file.name,
|
| 385 |
+
"Detected Type": detected_type,
|
| 386 |
+
"Checklist Match": matched_type if checklist_matched else "-",
|
| 387 |
+
"Type Score": match_score,
|
| 388 |
+
"Expiry Date": llm_json.get("expiry_date", "-"),
|
| 389 |
+
"Expired": "Yes" if llm_json.get("is_expired", False) else "No",
|
| 390 |
+
"Genuine": "Yes" if llm_json.get("looks_genuine", False) else "No",
|
| 391 |
+
"Confidence": llm_conf,
|
| 392 |
+
"Accepted": "Yes" if accepted else "No",
|
| 393 |
+
"Reason": " ".join(reason)
|
| 394 |
+
})
|
| 395 |
+
debug['Checklist_match_details'] = {
|
| 396 |
+
"detected_type": detected_type,
|
| 397 |
+
"matched_type": matched_type,
|
| 398 |
+
"match_score": match_score,
|
| 399 |
+
"checklist_matched": checklist_matched,
|
| 400 |
+
"accepted": accepted
|
| 401 |
+
}
|
| 402 |
debug_data.append({uploaded_file.name: debug})
|
| 403 |
+
status_box.success("Validation complete. See result below.")
|
| 404 |
+
|
| 405 |
+
# ==== Results table with custom styling ====
|
| 406 |
+
if results:
|
| 407 |
+
st.success("All validations complete.")
|
| 408 |
+
df = pd.DataFrame(results)
|
| 409 |
+
def style_row(row):
|
| 410 |
+
color = "#e7ffe7" if row["Accepted"] == "Yes" else "#fff1f0"
|
| 411 |
+
return [f"background-color: {color}"]*len(row)
|
| 412 |
+
styled_df = df.style.apply(style_row, axis=1)\
|
| 413 |
+
.set_table_attributes('class="styled-table"')\
|
| 414 |
+
.set_properties(**{
|
| 415 |
+
'font-size': '15px',
|
| 416 |
+
'word-break': 'break-word',
|
| 417 |
+
'border': '1px solid #ddd'
|
| 418 |
+
})
|
| 419 |
+
st.markdown('<h4 style="margin-top:28px;">Validation Results</h4>', unsafe_allow_html=True)
|
| 420 |
+
st.write(styled_df.to_html(escape=False), unsafe_allow_html=True)
|
|
|
|
|
|
|
| 421 |
else:
|
| 422 |
+
st.warning("No valid results.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 423 |
|
| 424 |
+
with st.expander("Debug Panel (per document)"):
|
| 425 |
+
for doc_debug in debug_data:
|
| 426 |
+
for fname, dbg in doc_debug.items():
|
| 427 |
+
st.markdown(f"**{fname}**")
|
| 428 |
+
st.json(dbg)
|