Seth0330 committed on
Commit
a1fcd1d
·
verified ·
1 Parent(s): 0521a05

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +178 -120
app.py CHANGED
@@ -8,8 +8,15 @@ import mimetypes
8
  from fuzzywuzzy import fuzz
9
  import pandas as pd
10
 
11
- # ----- Styling -----
 
 
 
 
 
 
12
  st.set_page_config(page_title="EZOFIS Document Validation Agent", layout="wide")
 
13
  st.markdown("""
14
  <style>
15
  .block-card {
@@ -27,61 +34,17 @@ st.markdown("""
27
  </style>
28
  """, unsafe_allow_html=True)
29
 
30
- # ----- API Config -----
31
- UNSTRACT_BASE = "https://llmwhisperer-api.us-central.unstract.com/api/v2"
32
- UNSTRACT_API_KEY = os.getenv("UNSTRACT_API_KEY") # Set in environment
33
-
34
- OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY") # Set in environment
35
- OPENROUTER_URL = "https://openrouter.ai/api/v1/chat/completions"
36
- GEMMA_MODEL = "google/gemma-3-4b-it:free"
37
-
38
- # =========== UI ===========
39
  st.markdown(
40
  "<h1 style='font-weight:800; margin-bottom:8px;'>EZOFIS Document Validation Agent</h1>",
41
  unsafe_allow_html=True
42
  )
43
  st.markdown(
44
- "<div style='font-size:20px; margin-bottom:28px; color:#24345C;'>Check document submissions against mortgage checklist with AI.</div>",
45
  unsafe_allow_html=True
46
  )
47
 
48
- # ===== Step 1: Checklist JSON input =====
49
- st.markdown("<span class='step-num'>1</span> <b>Paste Mortgage Checklist (JSON)</b>", unsafe_allow_html=True)
50
- sample_checklist = '''{
51
- "required_documents": [
52
- {"type": "Driver's License", "description": "Government-issued photo ID"},
53
- {"type": "Passport", "description": "Valid passport"},
54
- {"type": "SIN Card", "description": "Social Insurance Number document"},
55
- {"type": "Bank Statement", "description": "Last 3 months bank statement"},
56
- {"type": "Employment Letter", "description": "Signed letter from employer"},
57
- {"type": "Pay Stub", "description": "Most recent pay stub"},
58
- {"type": "Proof of Address", "description": "Utility bill or lease"}
59
- ]
60
- }'''
61
- checklist_text = st.text_area(
62
- "Paste or edit your mortgage checklist JSON below:",
63
- value=sample_checklist,
64
- height=200,
65
- key="doc_checklist_json"
66
- )
67
- # Parse checklist
68
- try:
69
- checklist = json.loads(checklist_text)
70
- required_types = [doc["type"] for doc in checklist["required_documents"]]
71
- except Exception as e:
72
- st.error("Invalid checklist JSON.")
73
- st.stop()
74
 
75
- # ===== Step 2: Document upload =====
76
- st.markdown("<span class='step-num'>2</span> <b>Upload Document(s) to Validate</b>", unsafe_allow_html=True)
77
- uploaded_files = st.file_uploader(
78
- "Upload PDF, DOCX, XLSX, PNG, JPG, TIFF, etc.",
79
- type=["pdf", "docx", "xlsx", "xls", "png", "jpg", "jpeg", "tiff"],
80
- key="mortgage_files",
81
- accept_multiple_files=True
82
- )
83
-
84
- # ===== Utilities =====
85
  def get_content_type(filename):
86
  mime, _ = mimetypes.guess_type(filename)
87
  ext = filename.lower().split('.')[-1]
@@ -91,7 +54,7 @@ def get_content_type(filename):
91
  return "application/octet-stream"
92
  return mime
93
 
94
- def extract_text_from_unstract(uploaded_file):
95
  filename = getattr(uploaded_file, "name", "uploaded_file")
96
  file_bytes = uploaded_file.read()
97
  content_type = get_content_type(filename)
@@ -100,35 +63,42 @@ def extract_text_from_unstract(uploaded_file):
100
  "Content-Type": content_type,
101
  }
102
  url = f"{UNSTRACT_BASE}/whisper"
103
- with st.spinner("Uploading and extracting with Unstract..."):
104
- r = requests.post(url, headers=headers, data=file_bytes)
105
- if r.status_code != 202:
106
- st.error(f"Unstract error: {r.status_code} - {r.text}")
107
- return None
108
- whisper_hash = r.json().get("whisper_hash")
109
- if not whisper_hash:
110
- st.error("Unstract: No whisper_hash received.")
111
- return None
112
-
113
- # Poll for status
 
 
114
  status_url = f"{UNSTRACT_BASE}/whisper-status?whisper_hash={whisper_hash}"
115
  for i in range(30):
116
  status_r = requests.get(status_url, headers={"unstract-key": UNSTRACT_API_KEY})
117
  if status_r.status_code != 200:
118
- st.error(f"Unstract status error: {status_r.status_code} - {status_r.text}")
 
119
  return None
120
  status = status_r.json().get("status")
121
  if status == "processed":
122
  break
 
 
123
  time.sleep(2)
124
  else:
125
- st.error("Unstract: Timeout waiting for OCR.")
 
126
  return None
127
 
128
  retrieve_url = f"{UNSTRACT_BASE}/whisper-retrieve?whisper_hash={whisper_hash}&text_only=true"
129
  r = requests.get(retrieve_url, headers={"unstract-key": UNSTRACT_API_KEY})
130
  if r.status_code != 200:
131
- st.error(f"Unstract: Error retrieving text: {r.status_code} - {r.text}")
 
132
  return None
133
  try:
134
  data = r.json()
@@ -136,40 +106,41 @@ def extract_text_from_unstract(uploaded_file):
136
  except Exception:
137
  return r.text
138
 
139
- def fuzzy_match_type(detected_type, checklist_types):
140
- # Returns best match and score
141
- best_type = None
142
- best_score = 0
143
- for t in checklist_types:
144
- score = fuzz.token_set_ratio(str(detected_type), str(t))
145
- if score > best_score:
146
- best_type = t
147
- best_score = score
148
- return best_type, best_score
149
 
150
- def query_gemma_llm(doc_text, checklist_json):
151
- prompt = f"""
152
- Read the following extracted document text and analyze according to this checklist JSON:
153
- {json.dumps(checklist_json)}
154
 
155
- Can you read from this text, what type of document it is such as Certificate, License, Passport, etc and Also find the expiry date of it from the text, If you don't find the expiry date text but if you found any other code such as MRZ then find the expiry date from that. Also by the look of it give your verdict whether this is genuine with a confidence score. Also if the current date is 21st June 2025 then check whether the document is already expired or valid.
156
 
157
- Return your output as a JSON like:
 
 
158
  {{
159
- "document_type": "...",
160
- "expiry_date": "...",
161
  "is_expired": true/false,
162
  "looks_genuine": true/false,
163
  "confidence": <score 0-100>,
164
- "verdict": "...reasoned verdict..."
 
165
  }}
 
166
  Document Text:
167
  {doc_text[:4000]}
168
  """.strip()
169
 
 
 
170
  headers = {
171
  "Authorization": f"Bearer {OPENROUTER_API_KEY}",
172
- "HTTP-Referer": "https://chat.openai.com", # Some openrouter models require this
173
  "X-Title": "EZOFIS-Doc-Validator",
174
  "Content-Type": "application/json",
175
  }
@@ -179,66 +150,142 @@ Document Text:
179
  "temperature": 0.1,
180
  "max_tokens": 1024
181
  }
182
- with st.spinner("Gemma LLM is validating the document..."):
183
- resp = requests.post(OPENROUTER_URL, headers=headers, json=data, timeout=90)
 
184
  if resp.status_code != 200:
185
- st.error(f"OpenRouter error: {resp.status_code}: {resp.text}")
186
- return None
 
187
  result = resp.json()["choices"][0]["message"]["content"]
188
  # Extract only JSON
189
  start = result.find("{")
190
  end = result.rfind("}") + 1
191
  if start == -1 or end == 0:
192
- st.error("Gemma did not return JSON.")
193
- st.code(result)
194
- return None
 
195
  try:
196
- return json.loads(result[start:end])
197
  except Exception as e:
198
- st.error("Error parsing LLM response.")
199
- st.code(result)
200
- return None
 
 
 
 
 
 
 
 
 
 
 
201
 
202
- # ========== Step 3: Run Validation ==========
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
203
  if st.button("Run Document Validation", type="primary") and uploaded_files:
204
  results = []
 
 
205
  for uploaded_file in uploaded_files:
206
  st.subheader(f"Validating: {uploaded_file.name}")
207
- # Extract text
208
- doc_text = extract_text_from_unstract(uploaded_file)
 
 
 
 
 
209
  if not doc_text:
210
- st.warning("Skipping due to extraction error.")
 
 
211
  continue
212
- # Query LLM
213
- llm_json = query_gemma_llm(doc_text, checklist)
 
 
 
 
 
214
  if not llm_json:
215
- st.warning("Skipping due to LLM error.")
 
 
216
  continue
217
- # Fuzzy match doc type with checklist
218
  detected_type = llm_json.get("document_type", "")
219
  matched_type, match_score = fuzzy_match_type(detected_type, required_types)
220
- # Acceptance logic
 
 
 
 
 
 
 
221
  accepted = (
222
- matched_type is not None and match_score >= 70 and
223
  llm_json.get("looks_genuine", False) and
224
  not llm_json.get("is_expired", False)
225
  )
 
226
  reason = []
227
- reason.append(
228
- f"Document type '{detected_type}' matched checklist '{matched_type}' with score {match_score}/100." if matched_type else
229
- f"Document type '{detected_type}' did not match any required type."
230
- )
231
- reason.append(
232
- f"Genuineness confidence: {llm_json.get('confidence', 0)}."
233
- )
234
- reason.append(
235
- "Document is not expired." if not llm_json.get("is_expired", False) else "Document is expired."
236
- )
 
 
237
  reason.append(llm_json.get("verdict", ""))
 
238
  results.append({
239
  "File": uploaded_file.name,
240
  "Detected Type": detected_type,
241
- "Checklist Match": matched_type or "-",
242
  "Type Score": match_score,
243
  "Expiry Date": llm_json.get("expiry_date", "-"),
244
  "Expired": "Yes" if llm_json.get("is_expired", False) else "No",
@@ -247,13 +294,24 @@ if st.button("Run Document Validation", type="primary") and uploaded_files:
247
  "Accepted": "Yes" if accepted else "No",
248
  "Reason": " ".join(reason)
249
  })
 
 
 
 
 
 
 
 
 
 
250
  if results:
251
- st.success("Validation Complete.")
252
- st.dataframe(pd.DataFrame(results))
253
  else:
254
  st.warning("No valid results.")
255
 
256
- # Debugging
257
- if "last_api" in st.session_state:
258
- with st.expander("Debug (LLM raw output)"):
259
- st.code(st.session_state.last_api)
 
 
8
  from fuzzywuzzy import fuzz
9
  import pandas as pd
10
 
11
+ # ========== CONFIG ==========
12
+ UNSTRACT_BASE = "https://llmwhisperer-api.us-central.unstract.com/api/v2"
13
+ UNSTRACT_API_KEY = os.getenv("UNSTRACT_API_KEY")
14
+ OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")
15
+ OPENROUTER_URL = "https://openrouter.ai/api/v1/chat/completions"
16
+ GEMMA_MODEL = "google/gemma-3-4b-it:free"
17
+
18
  st.set_page_config(page_title="EZOFIS Document Validation Agent", layout="wide")
19
+
20
  st.markdown("""
21
  <style>
22
  .block-card {
 
34
  </style>
35
  """, unsafe_allow_html=True)
36
 
 
 
 
 
 
 
 
 
 
37
  st.markdown(
38
  "<h1 style='font-weight:800; margin-bottom:8px;'>EZOFIS Document Validation Agent</h1>",
39
  unsafe_allow_html=True
40
  )
41
  st.markdown(
42
+ "<div style='font-size:20px; margin-bottom:28px; color:#24345C;'>AI-driven checklist-based document acceptance for mortgage applications.</div>",
43
  unsafe_allow_html=True
44
  )
45
 
46
+ # ========== FUNCTIONS ==========
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47
 
 
 
 
 
 
 
 
 
 
 
48
  def get_content_type(filename):
49
  mime, _ = mimetypes.guess_type(filename)
50
  ext = filename.lower().split('.')[-1]
 
54
  return "application/octet-stream"
55
  return mime
56
 
57
+ def extract_text_from_unstract(uploaded_file, status_box=None):
58
  filename = getattr(uploaded_file, "name", "uploaded_file")
59
  file_bytes = uploaded_file.read()
60
  content_type = get_content_type(filename)
 
63
  "Content-Type": content_type,
64
  }
65
  url = f"{UNSTRACT_BASE}/whisper"
66
+ if status_box:
67
+ status_box.info("Step 1: Uploading and extracting text (OCR)...")
68
+ r = requests.post(url, headers=headers, data=file_bytes)
69
+ if r.status_code != 202:
70
+ if status_box:
71
+ status_box.error(f"Unstract error: {r.status_code} - {r.text}")
72
+ return None
73
+ whisper_hash = r.json().get("whisper_hash")
74
+ if not whisper_hash:
75
+ if status_box:
76
+ status_box.error("Unstract: No whisper_hash received.")
77
+ return None
78
+ # Poll status
79
  status_url = f"{UNSTRACT_BASE}/whisper-status?whisper_hash={whisper_hash}"
80
  for i in range(30):
81
  status_r = requests.get(status_url, headers={"unstract-key": UNSTRACT_API_KEY})
82
  if status_r.status_code != 200:
83
+ if status_box:
84
+ status_box.error(f"Unstract status error: {status_r.status_code} - {status_r.text}")
85
  return None
86
  status = status_r.json().get("status")
87
  if status == "processed":
88
  break
89
+ if status_box:
90
+ status_box.info(f"OCR in progress... ({i+1}/30)")
91
  time.sleep(2)
92
  else:
93
+ if status_box:
94
+ status_box.error("Unstract: Timeout waiting for OCR.")
95
  return None
96
 
97
  retrieve_url = f"{UNSTRACT_BASE}/whisper-retrieve?whisper_hash={whisper_hash}&text_only=true"
98
  r = requests.get(retrieve_url, headers={"unstract-key": UNSTRACT_API_KEY})
99
  if r.status_code != 200:
100
+ if status_box:
101
+ status_box.error(f"Unstract: Error retrieving text: {r.status_code} - {r.text}")
102
  return None
103
  try:
104
  data = r.json()
 
106
  except Exception:
107
  return r.text
108
 
109
+ def build_prompt(doc_text, checklist):
110
+ return f"""
111
+ You are a careful, expert document validation agent for mortgage workflows.
112
+
113
+ Analyze the following extracted document text and this checklist JSON:
114
+ {json.dumps(checklist)}
115
+
116
+ First, **determine what document you are reading** (e.g., Driver's License, Passport, Bank Statement, etc.) as precisely as possible, based on content, layout, and terms.
 
 
117
 
118
+ **DO NOT** attempt to "force match" or guess a checklist match if you are not sure. If the detected document type does NOT correspond (even loosely) to any checklist item, set "checklist_matched": false and recommend rejection. If it matches, set "checklist_matched": true.
 
 
 
119
 
120
+ Extract the expiry date if found (or set as null/empty), and if present, check if it is expired compared to the current date: 21st June 2025.
121
 
122
+ Assess if the document looks genuine (as much as possible from the text), and provide a confidence score (0-100).
123
+
124
+ Respond with this JSON:
125
  {{
126
+ "document_type": "...", // Your best judgment (e.g. Driver's License)
127
+ "expiry_date": "...", // ISO format if possible
128
  "is_expired": true/false,
129
  "looks_genuine": true/false,
130
  "confidence": <score 0-100>,
131
+ "checklist_matched": true/false,
132
+ "verdict": "..." // One-sentence reason
133
  }}
134
+
135
  Document Text:
136
  {doc_text[:4000]}
137
  """.strip()
138
 
139
+ def query_gemma_llm(doc_text, checklist, status_box=None):
140
+ prompt = build_prompt(doc_text, checklist)
141
  headers = {
142
  "Authorization": f"Bearer {OPENROUTER_API_KEY}",
143
+ "HTTP-Referer": "https://chat.openai.com", # for OpenRouter
144
  "X-Title": "EZOFIS-Doc-Validator",
145
  "Content-Type": "application/json",
146
  }
 
150
  "temperature": 0.1,
151
  "max_tokens": 1024
152
  }
153
+ if status_box:
154
+ status_box.info("Step 2: Validating document with Gemma LLM...")
155
+ resp = requests.post(OPENROUTER_URL, headers=headers, json=data, timeout=90)
156
  if resp.status_code != 200:
157
+ if status_box:
158
+ status_box.error(f"OpenRouter error: {resp.status_code}: {resp.text}")
159
+ return None, None, prompt
160
  result = resp.json()["choices"][0]["message"]["content"]
161
  # Extract only JSON
162
  start = result.find("{")
163
  end = result.rfind("}") + 1
164
  if start == -1 or end == 0:
165
+ if status_box:
166
+ status_box.error("Gemma did not return JSON.")
167
+ status_box.write(result)
168
+ return None, result, prompt
169
  try:
170
+ return json.loads(result[start:end]), result, prompt
171
  except Exception as e:
172
+ if status_box:
173
+ status_box.error("Error parsing LLM response.")
174
+ status_box.write(result)
175
+ return None, result, prompt
176
+
177
def fuzzy_match_type(detected_type, checklist_types):
    """Fuzzy-match a detected document type against the checklist.

    Scores *detected_type* against every entry of *checklist_types*
    using fuzzywuzzy's ``token_set_ratio`` and returns the pair
    ``(best_type, best_score)``.  An empty checklist yields
    ``(None, 0)``; on ties the earliest checklist entry wins, matching
    the original strictly-greater-than scan.
    """
    ranked = [
        (fuzz.token_set_ratio(str(detected_type), str(candidate)), candidate)
        for candidate in checklist_types
    ]
    if not ranked:
        return None, 0
    best_score, best_type = max(ranked, key=lambda pair: pair[0])
    return best_type, best_score
186
 
187
# ========== UI ==========
# Default checklist shown in the editor; users may paste their own JSON.
sample_checklist = '''{
"required_documents": [
{"type": "Driver's License", "description": "Government-issued photo ID"},
{"type": "Passport", "description": "Valid passport"},
{"type": "SIN Card", "description": "Social Insurance Number document"},
{"type": "Bank Statement", "description": "Last 3 months bank statement"},
{"type": "Employment Letter", "description": "Signed letter from employer"},
{"type": "Pay Stub", "description": "Most recent pay stub"},
{"type": "Proof of Address", "description": "Utility bill or lease"}
]
}'''

# Step 1: checklist input.
st.markdown("<span class='step-num'>1</span> <b>Paste Mortgage Checklist (JSON)</b>", unsafe_allow_html=True)
checklist_text = st.text_area(
    "Paste or edit your mortgage checklist JSON below:",
    value=sample_checklist,
    height=200,
    key="doc_checklist_json"
)

# Parse the checklist up front and halt the app on malformed input so
# later steps can rely on `checklist` / `required_types` existing.
# Narrowed from a bare `except Exception as e` (which also bound an
# unused variable) so unrelated programming errors are not silently
# reported as "invalid JSON"; json.JSONDecodeError subclasses ValueError.
try:
    checklist = json.loads(checklist_text)
    required_types = [doc["type"] for doc in checklist["required_documents"]]
except (ValueError, KeyError, TypeError):
    st.error("Invalid checklist JSON.")
    st.stop()

# Step 2: file upload (multiple files allowed).
st.markdown("<span class='step-num'>2</span> <b>Upload Document(s) to Validate</b>", unsafe_allow_html=True)
uploaded_files = st.file_uploader(
    "Upload PDF, DOCX, XLSX, PNG, JPG, TIFF, etc.",
    type=["pdf", "docx", "xlsx", "xls", "png", "jpg", "jpeg", "tiff"],
    key="mortgage_files",
    accept_multiple_files=True
)
+ )
221
+
222
+ # ========== PROCESSING ==========
223
  if st.button("Run Document Validation", type="primary") and uploaded_files:
224
  results = []
225
+ debug_data = []
226
+
227
  for uploaded_file in uploaded_files:
228
  st.subheader(f"Validating: {uploaded_file.name}")
229
+ status_box = st.empty()
230
+ debug = {}
231
+
232
+ # Step 1: OCR
233
+ doc_text = extract_text_from_unstract(uploaded_file, status_box)
234
+ debug['OCR_extracted_text'] = doc_text
235
+
236
  if not doc_text:
237
+ status_box.error("Skipping due to OCR extraction error.")
238
+ debug['error'] = "OCR extraction error"
239
+ debug_data.append({uploaded_file.name: debug})
240
  continue
241
+
242
+ # Step 2: LLM Validation
243
+ llm_json, llm_raw, llm_prompt = query_gemma_llm(doc_text, checklist, status_box)
244
+ debug['LLM_prompt'] = llm_prompt
245
+ debug['LLM_raw_response'] = llm_raw
246
+ debug['LLM_parsed_json'] = llm_json
247
+
248
  if not llm_json:
249
+ status_box.error("Skipping due to LLM error.")
250
+ debug['error'] = "LLM processing error"
251
+ debug_data.append({uploaded_file.name: debug})
252
  continue
253
+
254
  detected_type = llm_json.get("document_type", "")
255
  matched_type, match_score = fuzzy_match_type(detected_type, required_types)
256
+
257
+ # Accept only if LLM states checklist_matched, looks genuine, and not expired
258
+ checklist_matched = llm_json.get("checklist_matched", False)
259
+ if checklist_matched:
260
+ # Double check: If match_score < 65, override to not matched
261
+ if match_score < 65:
262
+ checklist_matched = False
263
+
264
  accepted = (
265
+ checklist_matched and
266
  llm_json.get("looks_genuine", False) and
267
  not llm_json.get("is_expired", False)
268
  )
269
+
270
  reason = []
271
+ if not checklist_matched:
272
+ reason.append("No matching checklist item found. Document rejected.")
273
+ else:
274
+ reason.append(
275
+ f"Document type '{detected_type}' matched checklist '{matched_type}' with score {match_score}/100."
276
+ )
277
+ if not llm_json.get("looks_genuine", False):
278
+ reason.append("Document does not look genuine.")
279
+ if llm_json.get("is_expired", False):
280
+ reason.append("Document is expired.")
281
+
282
+ reason.append(f"Genuineness confidence: {llm_json.get('confidence', 0)}.")
283
  reason.append(llm_json.get("verdict", ""))
284
+
285
  results.append({
286
  "File": uploaded_file.name,
287
  "Detected Type": detected_type,
288
+ "Checklist Match": matched_type if checklist_matched else "-",
289
  "Type Score": match_score,
290
  "Expiry Date": llm_json.get("expiry_date", "-"),
291
  "Expired": "Yes" if llm_json.get("is_expired", False) else "No",
 
294
  "Accepted": "Yes" if accepted else "No",
295
  "Reason": " ".join(reason)
296
  })
297
+ debug['Checklist_match_details'] = {
298
+ "detected_type": detected_type,
299
+ "matched_type": matched_type,
300
+ "match_score": match_score,
301
+ "checklist_matched": checklist_matched,
302
+ "accepted": accepted
303
+ }
304
+ debug_data.append({uploaded_file.name: debug})
305
+ status_box.success("Validation complete. See result below.")
306
+
307
  if results:
308
+ st.success("All validations complete.")
309
+ st.dataframe(pd.DataFrame(results), use_container_width=True)
310
  else:
311
  st.warning("No valid results.")
312
 
313
+ with st.expander("Debug Panel (per document)"):
314
+ for doc_debug in debug_data:
315
+ for fname, dbg in doc_debug.items():
316
+ st.markdown(f"**{fname}**")
317
+ st.json(dbg)