Seth0330 committed on
Commit
1c49f02
·
verified ·
1 Parent(s): bd13fee

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +113 -123
app.py CHANGED
@@ -27,7 +27,6 @@ st.markdown("""
27
  padding: 10px 32px !important; font-weight: 700; border: none !important; font-size: 18px !important;
28
  margin-top: 12px !important;
29
  }
30
- /* Style result table headers */
31
  .styled-table th {
32
  background: #f3ecff !important;
33
  color: #42318d !important;
@@ -42,12 +41,6 @@ st.markdown("""
42
  word-break: break-word;
43
  max-width: 220px;
44
  }
45
- .accepted-row {
46
- background: #e7ffe7 !important;
47
- }
48
- .rejected-row {
49
- background: #fff1f0 !important;
50
- }
51
  </style>
52
  """, unsafe_allow_html=True)
53
 
@@ -57,7 +50,6 @@ st.markdown(
57
  )
58
 
59
  # ====== SIDE-BY-SIDE LAYOUT ======
60
-
61
  col_left, col_right = st.columns([1.35, 1.05])
62
 
63
  with col_left:
@@ -97,7 +89,7 @@ with col_left:
97
  accept_multiple_files=True
98
  )
99
 
100
- # Step 3: Thresholds (SLIDERS MOVED HERE)
101
  st.markdown("<span class='step-num'>3</span> <b>Configure Acceptance Thresholds</b>", unsafe_allow_html=True)
102
  min_match_score = st.slider("Minimum Type Match Score (0-100)", 50, 100, 70, 1)
103
  min_confidence = st.slider("Minimum LLM Confidence (0-100)", 50, 100, 70, 1)
@@ -132,7 +124,6 @@ Checklist for precision:
132
  # Step 6: Run button
133
  run_btn = st.button("Run Document Validation", type="primary")
134
 
135
-
136
  # ========== FUNCTIONS ==========
137
 
138
  def get_content_type(filename):
@@ -224,7 +215,7 @@ def query_gemma_llm(doc_text, checklist, agent_instruction, current_date, status
224
  prompt = build_prompt(doc_text, checklist, agent_instruction, current_date)
225
  headers = {
226
  "Authorization": f"Bearer {OPENROUTER_API_KEY}",
227
- "HTTP-Referer": "https://chat.openai.com", # for OpenRouter
228
  "X-Title": "EZOFIS-Doc-Validator",
229
  "Content-Type": "application/json",
230
  }
@@ -242,7 +233,6 @@ def query_gemma_llm(doc_text, checklist, agent_instruction, current_date, status
242
  status_box.error(f"OpenRouter error: {resp.status_code}: {resp.text}")
243
  return None, None, prompt
244
  result = resp.json()["choices"][0]["message"]["content"]
245
- # Extract only JSON
246
  start = result.find("{")
247
  end = result.rfind("}") + 1
248
  if start == -1 or end == 0:
@@ -321,118 +311,118 @@ if run_btn and uploaded_files:
321
  results = []
322
  debug_data = []
323
 
324
- for uploaded_file in uploaded_files:
325
- st.subheader(f"Validating: {uploaded_file.name}")
326
- status_box = st.empty()
327
- debug = {}
328
-
329
- # Step 1: OCR
330
- doc_text = extract_text_from_unstract(uploaded_file, status_box)
331
- debug['OCR_extracted_text'] = doc_text
332
-
333
- if not doc_text:
334
- status_box.error("Skipping due to OCR extraction error.")
335
- debug['error'] = "OCR extraction error"
336
- debug_data.append({uploaded_file.name: debug})
337
- continue
338
-
339
- # Step 2: LLM Validation
340
- llm_json, llm_raw, llm_prompt = query_gemma_llm(doc_text, checklist, agent_instruction, date_str, status_box)
341
- debug['LLM_prompt'] = llm_prompt
342
- debug['LLM_raw_response'] = llm_raw
343
- debug['LLM_parsed_json'] = llm_json
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
344
 
345
- if not llm_json:
346
- status_box.error("Skipping due to LLM error.")
347
- debug['error'] = "LLM processing error"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
348
  debug_data.append({uploaded_file.name: debug})
349
- continue
350
-
351
- detected_type = llm_json.get("document_type", "")
352
- matched_type, match_score = fuzzy_match_type(detected_type, required_types)
353
-
354
- checklist_matched = llm_json.get("checklist_matched", False)
355
- if checklist_matched and match_score < min_match_score:
356
- checklist_matched = False
357
-
358
- llm_conf = llm_json.get("confidence", 0)
359
- accepted = (
360
- checklist_matched and
361
- llm_json.get("looks_genuine", False) and
362
- not llm_json.get("is_expired", False) and
363
- (llm_conf >= min_confidence)
364
- )
365
-
366
- reason = []
367
- if not checklist_matched:
368
- reason.append("No matching checklist item found. Document rejected.")
369
  else:
370
- reason.append(
371
- f"Document type '{detected_type}' matched checklist '{matched_type}' with score {match_score}/100."
372
- )
373
- if not llm_json.get("looks_genuine", False):
374
- reason.append("Document does not look genuine.")
375
- if llm_json.get("is_expired", False):
376
- reason.append("Document is expired.")
377
-
378
- reason.append(f"Genuineness confidence: {llm_conf}.")
379
- reason.append(llm_json.get("verdict", ""))
380
-
381
- # Advanced agent: If confidence is in a "gray zone", ask the LLM for a final self-verdict
382
- verdict_json, verdict_raw, verdict_prompt = advanced_llm_verdict(llm_json, min_confidence, status_box)
383
- debug['LLM_self_verdict_prompt'] = verdict_prompt
384
- debug['LLM_self_verdict_raw'] = verdict_raw
385
- debug['LLM_self_verdict_json'] = verdict_json
386
-
387
- if verdict_json:
388
- accepted = verdict_json.get("accepted", False)
389
- reason.append(f"LLM Self-verdict: {verdict_json.get('reason','')}")
390
- status_box.info("Final decision (gray zone) taken by LLM self-verdict.")
391
-
392
- results.append({
393
- "File": uploaded_file.name,
394
- "Detected Type": detected_type,
395
- "Checklist Match": matched_type if checklist_matched else "-",
396
- "Type Score": match_score,
397
- "Expiry Date": llm_json.get("expiry_date", "-"),
398
- "Expired": "Yes" if llm_json.get("is_expired", False) else "No",
399
- "Genuine": "Yes" if llm_json.get("looks_genuine", False) else "No",
400
- "Confidence": llm_conf,
401
- "Accepted": "Yes" if accepted else "No",
402
- "Reason": " ".join(reason)
403
- })
404
- debug['Checklist_match_details'] = {
405
- "detected_type": detected_type,
406
- "matched_type": matched_type,
407
- "match_score": match_score,
408
- "checklist_matched": checklist_matched,
409
- "accepted": accepted
410
- }
411
- debug_data.append({uploaded_file.name: debug})
412
- status_box.success("Validation complete. See result below.")
413
-
414
- # ==== Results table with custom styling ====
415
- if results:
416
- st.success("All validations complete.")
417
- df = pd.DataFrame(results)
418
- # Convert to HTML with classes for styling
419
- def style_row(row):
420
- color = "#e7ffe7" if row["Accepted"] == "Yes" else "#fff1f0"
421
- return [f"background-color: {color}"]*len(row)
422
- styled_df = df.style.apply(style_row, axis=1)\
423
- .set_table_attributes('class="styled-table"')\
424
- .set_properties(**{
425
- 'font-size': '15px',
426
- 'word-break': 'break-word',
427
- 'border': '1px solid #ddd'
428
- })
429
- st.markdown('<h4 style="margin-top:28px;">Validation Results</h4>', unsafe_allow_html=True)
430
- st.write(styled_df.to_html(escape=False), unsafe_allow_html=True)
431
- else:
432
- st.warning("No valid results.")
433
 
434
- with st.expander("Debug Panel (per document)"):
435
- for doc_debug in debug_data:
436
- for fname, dbg in doc_debug.items():
437
- st.markdown(f"**{fname}**")
438
- st.json(dbg)
 
27
  padding: 10px 32px !important; font-weight: 700; border: none !important; font-size: 18px !important;
28
  margin-top: 12px !important;
29
  }
 
30
  .styled-table th {
31
  background: #f3ecff !important;
32
  color: #42318d !important;
 
41
  word-break: break-word;
42
  max-width: 220px;
43
  }
 
 
 
 
 
 
44
  </style>
45
  """, unsafe_allow_html=True)
46
 
 
50
  )
51
 
52
  # ====== SIDE-BY-SIDE LAYOUT ======
 
53
  col_left, col_right = st.columns([1.35, 1.05])
54
 
55
  with col_left:
 
89
  accept_multiple_files=True
90
  )
91
 
92
+ # Step 3: Thresholds (SLIDERS HERE)
93
  st.markdown("<span class='step-num'>3</span> <b>Configure Acceptance Thresholds</b>", unsafe_allow_html=True)
94
  min_match_score = st.slider("Minimum Type Match Score (0-100)", 50, 100, 70, 1)
95
  min_confidence = st.slider("Minimum LLM Confidence (0-100)", 50, 100, 70, 1)
 
124
  # Step 6: Run button
125
  run_btn = st.button("Run Document Validation", type="primary")
126
 
 
127
  # ========== FUNCTIONS ==========
128
 
129
  def get_content_type(filename):
 
215
  prompt = build_prompt(doc_text, checklist, agent_instruction, current_date)
216
  headers = {
217
  "Authorization": f"Bearer {OPENROUTER_API_KEY}",
218
+ "HTTP-Referer": "https://chat.openai.com",
219
  "X-Title": "EZOFIS-Doc-Validator",
220
  "Content-Type": "application/json",
221
  }
 
233
  status_box.error(f"OpenRouter error: {resp.status_code}: {resp.text}")
234
  return None, None, prompt
235
  result = resp.json()["choices"][0]["message"]["content"]
 
236
  start = result.find("{")
237
  end = result.rfind("}") + 1
238
  if start == -1 or end == 0:
 
311
  results = []
312
  debug_data = []
313
 
314
+ with col_right:
315
+ for uploaded_file in uploaded_files:
316
+ st.subheader(f"Validating: {uploaded_file.name}")
317
+ status_box = st.empty()
318
+ debug = {}
319
+
320
+ # Step 1: OCR
321
+ doc_text = extract_text_from_unstract(uploaded_file, status_box)
322
+ debug['OCR_extracted_text'] = doc_text
323
+
324
+ if not doc_text:
325
+ status_box.error("Skipping due to OCR extraction error.")
326
+ debug['error'] = "OCR extraction error"
327
+ debug_data.append({uploaded_file.name: debug})
328
+ continue
329
+
330
+ # Step 2: LLM Validation
331
+ llm_json, llm_raw, llm_prompt = query_gemma_llm(doc_text, checklist, agent_instruction, date_str, status_box)
332
+ debug['LLM_prompt'] = llm_prompt
333
+ debug['LLM_raw_response'] = llm_raw
334
+ debug['LLM_parsed_json'] = llm_json
335
+
336
+ if not llm_json:
337
+ status_box.error("Skipping due to LLM error.")
338
+ debug['error'] = "LLM processing error"
339
+ debug_data.append({uploaded_file.name: debug})
340
+ continue
341
+
342
+ detected_type = llm_json.get("document_type", "")
343
+ matched_type, match_score = fuzzy_match_type(detected_type, required_types)
344
+
345
+ checklist_matched = llm_json.get("checklist_matched", False)
346
+ if checklist_matched and match_score < min_match_score:
347
+ checklist_matched = False
348
+
349
+ llm_conf = llm_json.get("confidence", 0)
350
+ accepted = (
351
+ checklist_matched and
352
+ llm_json.get("looks_genuine", False) and
353
+ not llm_json.get("is_expired", False) and
354
+ (llm_conf >= min_confidence)
355
+ )
356
 
357
+ reason = []
358
+ if not checklist_matched:
359
+ reason.append("No matching checklist item found. Document rejected.")
360
+ else:
361
+ reason.append(
362
+ f"Document type '{detected_type}' matched checklist '{matched_type}' with score {match_score}/100."
363
+ )
364
+ if not llm_json.get("looks_genuine", False):
365
+ reason.append("Document does not look genuine.")
366
+ if llm_json.get("is_expired", False):
367
+ reason.append("Document is expired.")
368
+
369
+ reason.append(f"Genuineness confidence: {llm_conf}.")
370
+ reason.append(llm_json.get("verdict", ""))
371
+
372
+ # Advanced agent: If confidence is in a "gray zone", ask the LLM for a final self-verdict
373
+ verdict_json, verdict_raw, verdict_prompt = advanced_llm_verdict(llm_json, min_confidence, status_box)
374
+ debug['LLM_self_verdict_prompt'] = verdict_prompt
375
+ debug['LLM_self_verdict_raw'] = verdict_raw
376
+ debug['LLM_self_verdict_json'] = verdict_json
377
+
378
+ if verdict_json:
379
+ accepted = verdict_json.get("accepted", False)
380
+ reason.append(f"LLM Self-verdict: {verdict_json.get('reason','')}")
381
+ status_box.info("Final decision (gray zone) taken by LLM self-verdict.")
382
+
383
+ results.append({
384
+ "File": uploaded_file.name,
385
+ "Detected Type": detected_type,
386
+ "Checklist Match": matched_type if checklist_matched else "-",
387
+ "Type Score": match_score,
388
+ "Expiry Date": llm_json.get("expiry_date", "-"),
389
+ "Expired": "Yes" if llm_json.get("is_expired", False) else "No",
390
+ "Genuine": "Yes" if llm_json.get("looks_genuine", False) else "No",
391
+ "Confidence": llm_conf,
392
+ "Accepted": "Yes" if accepted else "No",
393
+ "Reason": " ".join(reason)
394
+ })
395
+ debug['Checklist_match_details'] = {
396
+ "detected_type": detected_type,
397
+ "matched_type": matched_type,
398
+ "match_score": match_score,
399
+ "checklist_matched": checklist_matched,
400
+ "accepted": accepted
401
+ }
402
  debug_data.append({uploaded_file.name: debug})
403
+ status_box.success("Validation complete. See result below.")
404
+
405
+ # ==== Results table with custom styling ====
406
+ if results:
407
+ st.success("All validations complete.")
408
+ df = pd.DataFrame(results)
409
+ def style_row(row):
410
+ color = "#e7ffe7" if row["Accepted"] == "Yes" else "#fff1f0"
411
+ return [f"background-color: {color}"]*len(row)
412
+ styled_df = df.style.apply(style_row, axis=1)\
413
+ .set_table_attributes('class="styled-table"')\
414
+ .set_properties(**{
415
+ 'font-size': '15px',
416
+ 'word-break': 'break-word',
417
+ 'border': '1px solid #ddd'
418
+ })
419
+ st.markdown('<h4 style="margin-top:28px;">Validation Results</h4>', unsafe_allow_html=True)
420
+ st.write(styled_df.to_html(escape=False), unsafe_allow_html=True)
 
 
421
  else:
422
+ st.warning("No valid results.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
423
 
424
+ with st.expander("Debug Panel (per document)"):
425
+ for doc_debug in debug_data:
426
+ for fname, dbg in doc_debug.items():
427
+ st.markdown(f"**{fname}**")
428
+ st.json(dbg)