File size: 19,338 Bytes
c289504
cdae312
0592d14
cdae312
0592d14
2682cc6
21e2212
d2967d8
 
3a41351
a1fcd1d
 
 
 
60c686c
a1fcd1d
f192959
e40b807
 
4ea086a
 
 
f192959
f52c5eb
f192959
 
f52c5eb
 
e40b807
 
 
f192959
 
 
 
21e2212
3a41351
4ea086a
 
 
 
60c686c
4ea086a
 
 
 
 
 
 
 
 
 
 
 
 
60c686c
4ea086a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3a41351
bd13fee
 
 
 
4ea086a
bd13fee
 
4ea086a
21e2212
 
 
 
 
 
 
4ea086a
 
 
 
 
 
 
bd13fee
 
4ea086a
 
 
 
 
 
 
 
 
21e2212
3a41351
f192959
46902a8
 
 
 
 
 
 
 
 
a1fcd1d
dc0c728
 
46902a8
dc0c728
46902a8
dc0c728
 
46902a8
a1fcd1d
 
 
 
 
 
 
 
 
 
 
 
 
46902a8
dc0c728
46902a8
dc0c728
a1fcd1d
 
dc0c728
 
 
 
a1fcd1d
c572e2b
dc0c728
 
a1fcd1d
 
dc0c728
46902a8
 
 
dc0c728
a1fcd1d
 
dc0c728
 
 
 
 
 
 
3a41351
a1fcd1d
21e2212
a1fcd1d
3a41351
f192959
3a41351
0b2c1fd
f192959
3a41351
f192959
0b2c1fd
a1fcd1d
3a41351
f192959
 
a1fcd1d
 
f192959
a1fcd1d
f192959
 
 
e40b807
3a41351
 
f192959
 
1c49f02
f192959
 
 
 
3a41351
f192959
 
 
 
a1fcd1d
c572e2b
a1fcd1d
f192959
a1fcd1d
 
 
f192959
 
 
 
a1fcd1d
c572e2b
a1fcd1d
 
f192959
a1fcd1d
f192959
a1fcd1d
 
 
 
 
e47273e
 
 
 
 
 
 
 
60c686c
e47273e
 
 
 
 
 
 
 
 
3a41351
e47273e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a1fcd1d
 
 
 
 
 
 
 
 
f52c5eb
3a41351
ace9734
 
 
 
 
 
 
 
 
 
 
 
 
3a41351
 
 
ace9734
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4ea086a
3a41351
4ea086a
 
 
 
f192959
a1fcd1d
 
1c49f02
 
3a41351
 
 
 
 
 
1c49f02
 
 
 
 
 
 
 
 
 
 
 
 
 
3a41351
1c49f02
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3a41351
 
 
 
1c49f02
 
 
3a41351
1c49f02
 
a1fcd1d
1c49f02
 
 
 
 
 
 
 
 
3a41351
1c49f02
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3a41351
1c49f02
 
 
 
 
 
 
 
 
 
 
 
a1fcd1d
1c49f02
 
ace9734
1c49f02
 
ace9734
 
a1fcd1d
1c49f02
f52c5eb
1c49f02
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
import html
import json
import mimetypes
import os
import time
from datetime import datetime

import requests
import streamlit as st
from fuzzywuzzy import fuzz

# ====== CONFIG ======
# Base URL of the Unstract LLMWhisperer v2 API; the endpoint paths used by
# extract_text_from_unstract are appended to it.
UNSTRACT_BASE = "https://llmwhisperer-api.us-central.unstract.com/api/v2"
# API keys come from the environment; each is None when its variable is unset.
UNSTRACT_API_KEY = os.getenv("UNSTRACT_API_KEY")
OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")
# OpenRouter chat-completions endpoint and the model identifier sent with every request.
OPENROUTER_URL = "https://openrouter.ai/api/v1/chat/completions"
MISTRAL_MODEL = "mistralai/ministral-8b"

# Page-wide Streamlit settings: browser tab title and the wide layout.
st.set_page_config(page_title="EZOFIS Document Validation Agent", layout="wide")
# Inject custom CSS. `.step-num` styles the numbered badges used in the step
# headings of the UI; `.stButton>button` restyles Streamlit buttons in the same purple.
st.markdown("""
    <style>
    .step-num {
        background: #A020F0; color: #fff; border-radius: 999px;
        padding: 6px 16px; font-weight: 700; margin-right: 14px; font-size: 18px;
        display: inline-block; vertical-align: middle;}
    .stButton>button {
        background: #A020F0 !important; color: white !important; border-radius: 12px !important;
        padding: 10px 32px !important; font-weight: 700; border: none !important; font-size: 18px !important;
        margin-top: 12px !important;
    }
    </style>
""", unsafe_allow_html=True)

# Page heading, emitted as raw HTML so the inline font styling is applied.
st.markdown(
    "<h1 style='font-weight:800; margin-bottom:8px;'>EZOFIS Document Validation Agent</h1>",
    unsafe_allow_html=True
)

# ====== UI LAYOUT ======
# Two side-by-side columns; the numbers are relative width weights.
col_left, col_right = st.columns([1.35, 1.05])

with col_left:
    # Step 1: Checklist
    st.markdown("<span class='step-num'>1</span> <b>Your Document Checklist (JSON)</b>", unsafe_allow_html=True)
    # Default checklist shown in the editor; the user may replace it entirely.
    sample_checklist = '''{
      "required_documents": [
        {"type": "Driver's License", "description": "Government-issued photo ID"},
        {"type": "Passport", "description": "Valid passport"},
        {"type": "SIN Card", "description": "Social Insurance Number document"},
        {"type": "Bank Statement", "description": "Last 3 months bank statement"},
        {"type": "Employment Letter", "description": "Signed letter from employer"},
        {"type": "Pay Stub", "description": "Most recent pay stub"},
        {"type": "Proof of Address", "description": "Utility bill or lease"},
        {"type": "Ontario Health Card", "description": "Provincial health insurance card"}
      ]
    }'''
    checklist_text = st.text_area(
        "Paste or edit your checklist JSON below:",
        value=sample_checklist,
        height=220,
        key="doc_checklist_json"
    )
    try:
        checklist = json.loads(checklist_text)
        # Only the "type" of each entry is used for fuzzy matching later on.
        required_types = [doc["type"] for doc in checklist["required_documents"]]
    except (ValueError, KeyError, TypeError) as e:
        # ValueError covers malformed JSON (json.JSONDecodeError); KeyError and
        # TypeError cover valid JSON that lacks "required_documents"/"type" or
        # has the wrong shape. Show the cause so the user can correct the input.
        st.error(f"Invalid checklist JSON. ({type(e).__name__}: {e})")
        st.stop()

    # Step 2: Document upload
    st.markdown("<span class='step-num'>2</span> <b>Upload Document(s) to Validate</b>", unsafe_allow_html=True)
    uploaded_files = st.file_uploader(
        "Upload PDF, DOCX, XLSX, PNG, JPG, TIFF, etc.",
        type=["pdf", "docx", "xlsx", "xls", "png", "jpg", "jpeg", "tiff"],
        key="mortgage_files",
        accept_multiple_files=True
    )

    # Step 3: Thresholds
    st.markdown("<span class='step-num'>3</span> <b>Configure Acceptance Thresholds</b>", unsafe_allow_html=True)
    # Slider arguments are (label, min, max, default, step).
    min_match_score = st.slider("Minimum Type Match Score (0-100)", 50, 100, 70, 1)
    min_confidence = st.slider("Minimum LLM Confidence (0-100)", 50, 100, 70, 1)

with col_right:
    # Step 4: Agent instructions
    st.markdown("<span class='step-num'>4</span> <b>Instruct Agent</b>", unsafe_allow_html=True)
    # Default instructions shown in the editor. Whatever the user leaves in the
    # text area is passed to query_mistral_llm and becomes the start of the prompt.
    sample_instruction = """You are a careful, expert document validation agent for mortgage and finance workflows.

Before you answer, do this: Carefully scan the document for ANY evidence of regional/provincial or country-specific card types (such as "Ontario Health Card", "Medicare Card", "Insurance Card", "SIN", "Driver's License", "Passport", etc.)—be as specific as possible using visible card titles, authority names, or issuer logos.

Checklist for precision:
- Prefer the **most specific** document type (e.g. "Ontario Health Card" over just "Identification Card" or "Provincial ID").
- If there is any ambiguity, include relevant keywords from the card (like "Health", "Medicare", "OHIP", "SIN", "Social Insurance", "Driver", etc.) in the output type.
- If still not sure, show your best guess but include all possible hints from the document text."""
    agent_instruction = st.text_area(
        "Instructions for the Document Validation Agent (edit as needed):",
        value=sample_instruction,
        height=240,
        key="agent_instruction"
    )

    # Step 5: Current date
    st.markdown("<span class='step-num'>5</span> <b>Set Current Date for Expiry Validation</b>", unsafe_allow_html=True)
    # Defaults to today's date but can be overridden by the user.
    current_date = st.date_input(
        "Current date to be used by the agent for expiry checking",
        value=datetime.now().date(),
        key="current_date"
    )
    # String form of the chosen date; this is what gets embedded in the LLM prompt.
    date_str = str(current_date)

    # Step 6: Run button
    # The returned flag gates the main processing loop further down the script.
    run_btn = st.button("Run Document Validation", type="primary")

# ====== HELPER FUNCTIONS ======

def get_content_type(filename):
    """Return the Content-Type header value to send for *filename*.

    A name whose text after the last dot is ``pdf`` (case-insensitive) always
    yields ``text/plain``. Any other name yields the MIME type guessed from
    the name, or ``application/octet-stream`` when no guess is available.
    """
    guessed_mime = mimetypes.guess_type(filename)[0]
    # Text after the last dot, lower-cased (the whole name when there is no dot).
    suffix = filename.lower().rsplit(".", 1)[-1]
    if suffix == "pdf":
        return "text/plain"
    return guessed_mime if guessed_mime is not None else "application/octet-stream"

def extract_text_from_unstract(uploaded_file, status_box=None, timeout=60):
    """Extract text from an uploaded file via the Unstract whisper endpoints.

    The file bytes are POSTed to ``/whisper``; the returned ``whisper_hash`` is
    polled on ``/whisper-status`` (up to 30 attempts, 2 seconds apart) until the
    status is ``"processed"``; the text is then fetched from ``/whisper-retrieve``.

    Args:
        uploaded_file: Object with a ``read()`` method and, optionally, a ``name``.
        status_box: Optional object with ``info``/``error`` methods used for
            progress and error messages.
        timeout: Per-request timeout in seconds, so a stalled connection cannot
            block the app indefinitely.

    Returns:
        The extracted text, or ``None`` when any step fails (unexpected HTTP
        status, missing hash, polling timeout, or a network error).
    """
    def notify(method_name, message):
        # Progress reporting is optional; do nothing when no status box was given.
        if status_box:
            getattr(status_box, method_name)(message)

    filename = getattr(uploaded_file, "name", "uploaded_file")
    file_bytes = uploaded_file.read()
    auth_headers = {"unstract-key": UNSTRACT_API_KEY}
    upload_headers = {
        "unstract-key": UNSTRACT_API_KEY,
        "Content-Type": get_content_type(filename),
    }

    try:
        notify("info", "Step 1: Uploading and extracting text (OCR)...")
        r = requests.post(
            f"{UNSTRACT_BASE}/whisper",
            headers=upload_headers,
            data=file_bytes,
            timeout=timeout,
        )
        if r.status_code != 202:
            notify("error", f"Unstract error: {r.status_code} - {r.text}")
            return None
        try:
            whisper_hash = r.json().get("whisper_hash")
        except (ValueError, AttributeError):
            # Body was not JSON, or not a JSON object: treat as "no hash".
            whisper_hash = None
        if not whisper_hash:
            notify("error", "Unstract: No whisper_hash received.")
            return None

        # Poll status
        status_url = f"{UNSTRACT_BASE}/whisper-status?whisper_hash={whisper_hash}"
        for i in range(30):
            status_r = requests.get(status_url, headers=auth_headers, timeout=timeout)
            if status_r.status_code != 200:
                notify("error", f"Unstract status error: {status_r.status_code} - {status_r.text}")
                return None
            try:
                status = status_r.json().get("status")
            except (ValueError, AttributeError):
                # An unreadable status body counts as "not processed yet".
                status = None
            if status == "processed":
                break
            notify("info", f"EZOFIS AI OCR AGENT in progress... ({i+1}/30)")
            time.sleep(2)
        else:
            # Loop finished without ever seeing "processed".
            notify("error", "Unstract: Timeout waiting for OCR.")
            return None

        retrieve_url = f"{UNSTRACT_BASE}/whisper-retrieve?whisper_hash={whisper_hash}&text_only=true"
        r = requests.get(retrieve_url, headers=auth_headers, timeout=timeout)
        if r.status_code != 200:
            notify("error", f"Unstract: Error retrieving text: {r.status_code} - {r.text}")
            return None
    except requests.RequestException as exc:
        # Connection failures and timeouts become a reported failure rather
        # than an exception that aborts the whole Streamlit run.
        notify("error", f"Unstract request failed: {exc}")
        return None

    # Prefer the "result_text" field of a JSON body; otherwise use the raw body.
    try:
        data = r.json()
        return data.get("result_text") or r.text
    except (ValueError, AttributeError):
        return r.text

def build_mistral_prompt(doc_text, checklist, agent_instruction, current_date):
    """Assemble the validation prompt sent to the LLM.

    The prompt consists of, in order: the agent instructions, a notice fixing
    the reference date, the checklist serialised as JSON, the JSON schema the
    model must answer with, and the first 4000 characters of the document
    text. Leading and trailing whitespace of the whole prompt is stripped.
    """
    checklist_json = json.dumps(checklist)
    excerpt = doc_text[:4000]  # cap the amount of document text sent

    response_schema = f"""{{
  "document_type": "...",          // e.g. Ontario Health Card, BC Services Card
  "expiry_date": "...",            // ISO format if possible
  "is_expired": true/false,        // must be true if expiry_date is before {current_date}
  "looks_genuine": true/false,
  "confidence": <score 0-100>,
  "checklist_matched": true/false,
  "verdict": "..."                 // One-sentence reason
}}"""

    sections = [
        f"{agent_instruction}",
        f"IMPORTANT: Today's date for validation is: {current_date}. You MUST use this exact date, NOT today's system date, when checking if a document is expired.",
        f"Analyze the following extracted document text and the checklist JSON:\n{checklist_json}",
        f"Respond with this JSON (your response will be evaluated automatically):\n{response_schema}",
        f"Document Text:\n{excerpt}",
    ]
    # Sections are separated by one blank line.
    return "\n\n".join(sections).strip()

def query_mistral_llm(doc_text, checklist, agent_instruction, current_date, status_box=None):
    """Ask the LLM to classify and validate a document and parse its JSON answer.

    Args:
        doc_text: Extracted document text.
        checklist: Parsed checklist object embedded in the prompt.
        agent_instruction: Instruction text placed at the start of the prompt.
        current_date: Reference date string used for expiry checks.
        status_box: Optional object with ``info``/``error``/``write`` methods
            used for progress and error messages.

    Returns:
        A ``(parsed_json, raw_response, prompt)`` tuple. ``parsed_json`` is
        ``None`` whenever the request fails or the reply holds no parseable
        JSON object; ``raw_response`` is ``None`` when no reply body was obtained.
    """
    prompt = build_mistral_prompt(doc_text, checklist, agent_instruction, current_date)
    headers = {
        "Authorization": f"Bearer {OPENROUTER_API_KEY}",
        "HTTP-Referer": "https://chat.openai.com",
        "X-Title": "EZOFIS-Doc-Validator",
        "Content-Type": "application/json",
    }
    data = {
        "model": MISTRAL_MODEL,
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0.1,
        "max_tokens": 1024
    }
    if status_box:
        status_box.info("Step 2: Validating document with EZOFIS DOC VALIDATION AGENT...")
    try:
        resp = requests.post(OPENROUTER_URL, headers=headers, json=data, timeout=90)
    except requests.RequestException as exc:
        # Timeouts and connection errors are reported as a failed validation
        # instead of aborting the whole run.
        if status_box:
            status_box.error(f"OpenRouter request failed: {exc}")
        return None, None, prompt
    if resp.status_code != 200:
        if status_box:
            status_box.error(f"OpenRouter error: {resp.status_code}: {resp.text}")
        return None, None, prompt

    try:
        result = resp.json()["choices"][0]["message"]["content"]
    except (ValueError, KeyError, IndexError, TypeError):
        # A 200 response whose body is not the expected completion structure.
        if status_box:
            status_box.error("Unexpected OpenRouter response format.")
            status_box.write(resp.text)
        return None, resp.text, prompt
    if not isinstance(result, str):
        # The content field may be null; treat that as an empty reply.
        result = "" if result is None else str(result)

    # The model may wrap the JSON in prose, so take the outermost {...} span.
    start = result.find("{")
    end = result.rfind("}") + 1
    if start == -1 or end == 0:
        if status_box:
            status_box.error("Agent did not return JSON.")
            status_box.write(result)
        return None, result, prompt
    try:
        return json.loads(result[start:end]), result, prompt
    except ValueError:
        if status_box:
            status_box.error("Error parsing LLM response.")
            status_box.write(result)
        return None, result, prompt

def advanced_llm_verdict(llm_json, min_confidence, status_box=None):
    """Ask the LLM for an accept/reject verdict when confidence is borderline.

    The second query is only made when the reported confidence lies in
    ``[min_confidence, min_confidence + 15)``. Outside that range, or when the
    confidence is not a number, no request is sent.

    Args:
        llm_json: Parsed result of the first validation query.
        min_confidence: Minimum confidence threshold.
        status_box: Optional object with an ``info`` method for progress messages.

    Returns:
        A ``(verdict_json, raw_content, verdict_prompt)`` tuple. All three are
        ``None`` when no query was made; ``verdict_json`` is ``None`` when the
        request failed or the reply could not be parsed as a JSON object.
    """
    try:
        conf = float(llm_json.get("confidence", 0))
    except (TypeError, ValueError):
        # Confidence missing a numeric value (e.g. null or free text): it cannot
        # be placed in the gray zone, so skip the second query.
        return None, None, None
    if conf < min_confidence or conf >= min_confidence + 15:
        return None, None, None
    verdict_prompt = f"""
Here is the extracted document information and prior validation result:
{json.dumps(llm_json)}

The minimum required confidence is {min_confidence}. Should this document be accepted or rejected for an application, based on all available information?
Respond ONLY as: {{ "accepted": true/false, "reason": "..." }}
"""
    headers = {
        "Authorization": f"Bearer {OPENROUTER_API_KEY}",
        "HTTP-Referer": "https://chat.openai.com",
        "X-Title": "EZOFIS-Doc-Validator",
        "Content-Type": "application/json",
    }
    data = {
        "model": MISTRAL_MODEL,
        "messages": [{"role": "user", "content": verdict_prompt}],
        "temperature": 0.1,
        "max_tokens": 256
    }
    if status_box:
        status_box.info("Step 3: LLM self-verdict (gray zone confidence)...")
    try:
        resp = requests.post(OPENROUTER_URL, headers=headers, json=data, timeout=60)
    except requests.RequestException:
        # Network failure: report "no verdict" rather than raising.
        return None, None, verdict_prompt
    if resp.status_code != 200:
        return None, None, verdict_prompt

    # Bound before the try block so the handler can always return it.
    content = None
    try:
        content = resp.json()["choices"][0]["message"]["content"]
        # Take the outermost {...} span in case the model added prose around it.
        vstart = content.find("{")
        vend = content.rfind("}") + 1
        verdict_json = json.loads(content[vstart:vend])
    except (ValueError, KeyError, IndexError, TypeError, AttributeError):
        return None, content, verdict_prompt
    if not isinstance(verdict_json, dict):
        # Callers read the verdict with .get(); anything else is unusable.
        return None, content, verdict_prompt
    return verdict_json, content, verdict_prompt

def fuzzy_match_type(detected_type, checklist_types):
    """Find the checklist type most similar to *detected_type*.

    Every candidate is scored with ``fuzz.token_set_ratio`` against the string
    form of *detected_type*. Returns ``(best_type, best_score)``; on equal
    scores the earliest candidate wins. Returns ``(None, 0)`` when there are
    no candidates or no candidate scores above zero.
    """
    scored = [
        (fuzz.token_set_ratio(str(detected_type), str(option)), option)
        for option in checklist_types
    ]
    if not scored:
        return None, 0
    # max() returns the first maximal element, so ties keep the earliest option.
    top_score, top_option = max(scored, key=lambda pair: pair[0])
    if top_score <= 0:
        return None, 0
    return top_option, top_score

# ====== CARD RENDERING FUNCTION ======

def show_validation_card(result):
    """Render one validation result as a styled HTML card.

    Args:
        result: Mapping with the keys ``File``, ``Accepted``, ``Expired``,
            ``Genuine``, ``Confidence``, ``Reason``, ``Detected Type``,
            ``Checklist Match`` and ``Expiry Date``. The three flag fields are
            compared against the string ``"Yes"``.

    Every value interpolated into the markup is HTML-escaped first: the file
    name comes from the upload and the remaining text comes from the LLM, and
    the card is rendered with ``unsafe_allow_html=True``.
    """
    def esc(value):
        # str() first so numeric values such as the confidence can be escaped too.
        return html.escape(str(value))

    accepted = result["Accepted"] == "Yes"
    expired = result["Expired"] == "Yes"
    genuine = result["Genuine"] == "Yes"

    decision_color = "#d32f2f" if not accepted else "#388e3c"
    yes_color = "#388e3c"
    no_color = "#d32f2f"
    bg_reason = "#ffeaea" if not accepted else "#eafbe8"

    decision_label = "Accepted" if accepted else "Rejected"
    # Type cells are green only for accepted documents, otherwise near-black.
    type_color = yes_color if accepted else "#222"
    genuine_color = yes_color if genuine else no_color
    # "Not expired" is the good outcome, so it gets the green colour.
    expired_color = yes_color if not expired else no_color
    genuine_label = "Yes" if genuine else "No"
    expired_label = "Yes" if expired else "No"

    file_label = esc(result['File'])
    confidence_label = esc(result['Confidence'])
    reason_label = esc(result['Reason'])
    detected_label = esc(result['Detected Type'])
    matched_label = esc(result['Checklist Match'])
    expiry_label = esc(result["Expiry Date"])

    st.markdown(f"""
    <div style="border-radius:16px;border:2px solid #A020F0; margin-bottom:32px; background:#f9f7ff;padding:18px 22px 22px 22px;box-shadow:0 3px 16px #0001;">
      <div style="font-size:14px;font-weight:600;letter-spacing:0.3px;margin-bottom:10px;color:#333;">
        {file_label}
      </div>
      <table style="width:100%;border:none;margin-bottom:12px;">
        <tr>
          <td style="width:40%;font-size:17px;font-weight:700;">Decision:</td>
          <td style="width:60%;font-size:17px;font-weight:700;color:{decision_color};">{decision_label}</td>
        </tr>
        <tr>
          <td style="font-size:17px;font-weight:700;">Confidence:</td>
          <td style="font-size:17px;">{confidence_label}%</td>
        </tr>
      </table>
      <div style="border-radius:8px;background:{bg_reason};padding:11px 14px 11px 14px;color:#720000;font-size:15.5px;margin-bottom:17px;">
        <span style="font-weight:bold;">Reason:</span><br>{reason_label}
      </div>
      <table style="width:100%;margin-top:10px;margin-bottom:5px;">
        <tr>
          <td style="font-weight:600;font-size:15px;border-bottom:1px solid #ddd;padding-bottom:3px;">Detected Document:</td>
          <td style="font-weight:600;font-size:15px;border-bottom:1px solid #ddd;padding-bottom:3px;">Matched with Checklist:</td>
        </tr>
        <tr>
          <td style="color:{type_color};font-weight:600;font-size:15px;">{detected_label}</td>
          <td style="color:{type_color};font-weight:600;font-size:15px;">{matched_label}</td>
        </tr>
        <tr>
          <td style="font-weight:600;font-size:15px;border-bottom:1px solid #ddd;padding-bottom:3px;">Genuine:</td>
          <td style="font-weight:600;font-size:15px;border-bottom:1px solid #ddd;padding-bottom:3px;">Expired:</td>
        </tr>
        <tr>
          <td style="color:{genuine_color};font-weight:600;font-size:15px;">{genuine_label}</td>
          <td style="color:{expired_color};font-weight:600;font-size:15px;">{expired_label}</td>
        </tr>
      </table>
      <div style="color:#555;font-size:14px;margin-top:7px;">
        <b>Expiry Date:</b> {expiry_label}
      </div>
    </div>
    """, unsafe_allow_html=True)

# ====== MAIN PROCESSING LOOP ======
def _as_bool(value):
    """Interpret an LLM-supplied flag that may be a real bool or text such as "true"."""
    if isinstance(value, str):
        return value.strip().lower() == "true"
    return bool(value)


def _as_number(value, default=0):
    """Coerce an LLM-supplied score to a number; non-numeric values yield *default*."""
    if isinstance(value, bool):
        return default
    if isinstance(value, (int, float)):
        return value
    try:
        parsed = float(str(value).strip().rstrip("%"))
    except ValueError:
        return default
    # Keep whole numbers as ints so they display as "85" rather than "85.0".
    return int(parsed) if parsed.is_integer() else parsed


# Fall back to False if run_btn was never assigned, so the check below cannot raise.
if 'run_btn' not in locals():
    run_btn = False

if run_btn and uploaded_files:
    results = []
    debug_data = []

    with col_right:
        for uploaded_file in uploaded_files:
            # The file name is user-controlled, so escape it before embedding in HTML.
            st.markdown(
                f"<div style='font-size:15.5px;font-weight:500;color:#424242;margin:14px 0 2px 0;'>"
                f"Validating: <span style='color:#A020F0'>{html.escape(uploaded_file.name)}</span>"
                f"</div>",
                unsafe_allow_html=True
            )
            status_box = st.empty()
            debug = {}

            # Step 1: OCR
            doc_text = extract_text_from_unstract(uploaded_file, status_box)
            debug['OCR_extracted_text'] = doc_text

            if not doc_text:
                status_box.error("Skipping due to OCR extraction error.")
                debug['error'] = "OCR extraction error"
                debug_data.append({uploaded_file.name: debug})
                continue

            # Step 2: LLM Validation
            llm_json, llm_raw, llm_prompt = query_mistral_llm(doc_text, checklist, agent_instruction, date_str, status_box)
            debug['LLM_prompt'] = llm_prompt
            debug['LLM_raw_response'] = llm_raw
            debug['LLM_parsed_json'] = llm_json

            if not llm_json:
                status_box.error("Skipping due to LLM error.")
                debug['error'] = "LLM processing error"
                debug_data.append({uploaded_file.name: debug})
                continue

            detected_type = llm_json.get("document_type", "")
            matched_type, match_score = fuzzy_match_type(detected_type, required_types)

            # The model's own "matched" claim is only trusted when the fuzzy
            # score against the checklist also clears the configured threshold.
            checklist_matched = _as_bool(llm_json.get("checklist_matched", False))
            if checklist_matched and match_score < min_match_score:
                checklist_matched = False

            # LLM output is not guaranteed to be well-typed: flags may arrive as
            # the strings "true"/"false" and the score as text, so normalise
            # every field before it takes part in the decision.
            llm_conf = _as_number(llm_json.get("confidence", 0))
            is_expired = _as_bool(llm_json.get("is_expired", False))
            looks_genuine = _as_bool(llm_json.get("looks_genuine", False))
            accepted = (
                checklist_matched and
                looks_genuine and
                not is_expired and
                (llm_conf >= min_confidence)
            )

            reason = []
            if not checklist_matched:
                reason.append("No matching checklist item found. Document rejected.")
            else:
                reason.append(
                    f"Document type '{detected_type}' matched checklist '{matched_type}' with score {match_score}/100."
                )
                if not looks_genuine:
                    reason.append("Document does not look genuine.")
                if is_expired:
                    reason.append("Document is expired.")

            reason.append(f"Genuineness confidence: {llm_conf}.")
            verdict_text = llm_json.get("verdict", "")
            if verdict_text:
                # str() guards the join below against a non-string verdict.
                reason.append(str(verdict_text))

            # Pass the normalised confidence so the gray-zone check works on a number.
            verdict_json, verdict_raw, verdict_prompt = advanced_llm_verdict(
                {**llm_json, "confidence": llm_conf}, min_confidence, status_box
            )
            debug['LLM_self_verdict_prompt'] = verdict_prompt
            debug['LLM_self_verdict_raw'] = verdict_raw
            debug['LLM_self_verdict_json'] = verdict_json

            if verdict_json:
                # NOTE(review): the self-verdict replaces the rule-based decision,
                # so it can accept a document flagged as expired, not genuine or
                # unmatched above. Confirm this override is intended.
                accepted = _as_bool(verdict_json.get("accepted", False))
                reason.append(f"LLM Self-verdict: {verdict_json.get('reason','')}")
                status_box.info("Final decision (gray zone) taken by LLM self-verdict.")

            results.append({
                "File": uploaded_file.name,
                "Detected Type": detected_type,
                "Checklist Match": matched_type if checklist_matched else "-",
                "Type Score": match_score,
                "Expiry Date": llm_json.get("expiry_date") or "-",
                "Expired": "Yes" if is_expired else "No",
                "Genuine": "Yes" if looks_genuine else "No",
                "Confidence": llm_conf,
                "Accepted": "Yes" if accepted else "No",
                "Reason": " ".join(reason)
            })
            debug['Checklist_match_details'] = {
                "detected_type": detected_type,
                "matched_type": matched_type,
                "match_score": match_score,
                "checklist_matched": checklist_matched,
                "accepted": accepted
            }
            debug_data.append({uploaded_file.name: debug})
            status_box.success("Validation complete. See result below.")

        # ==== Card-style results ====
        if results:
            st.success("All validations complete.")
            for result in results:
                show_validation_card(result)
        else:
            st.warning("No valid results.")

        with st.expander("Debug Panel (per document)"):
            for doc_debug in debug_data:
                for fname, dbg in doc_debug.items():
                    st.markdown(f"**{fname}**")
                    st.json(dbg)