Seth0330 committed on
Commit
a1fcd1d
·
verified ·
1 Parent(s): 0521a05

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +178 -120
app.py CHANGED
@@ -8,8 +8,15 @@ import mimetypes
8
  from fuzzywuzzy import fuzz
9
  import pandas as pd
10
 
11
- # ----- Styling -----
 
 
 
 
 
 
12
  st.set_page_config(page_title="EZOFIS Document Validation Agent", layout="wide")
 
13
  st.markdown("""
14
  <style>
15
  .block-card {
@@ -27,61 +34,17 @@ st.markdown("""
27
  </style>
28
  """, unsafe_allow_html=True)
29
 
30
- # ----- API Config -----
31
- UNSTRACT_BASE = "https://llmwhisperer-api.us-central.unstract.com/api/v2"
32
- UNSTRACT_API_KEY = os.getenv("UNSTRACT_API_KEY") # Set in environment
33
-
34
- OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY") # Set in environment
35
- OPENROUTER_URL = "https://openrouter.ai/api/v1/chat/completions"
36
- GEMMA_MODEL = "google/gemma-3-4b-it:free"
37
-
38
- # =========== UI ===========
39
  st.markdown(
40
  "<h1 style='font-weight:800; margin-bottom:8px;'>EZOFIS Document Validation Agent</h1>",
41
  unsafe_allow_html=True
42
  )
43
  st.markdown(
44
- "<div style='font-size:20px; margin-bottom:28px; color:#24345C;'>Check document submissions against mortgage checklist with AI.</div>",
45
  unsafe_allow_html=True
46
  )
47
 
48
- # ===== Step 1: Checklist JSON input =====
49
- st.markdown("<span class='step-num'>1</span> <b>Paste Mortgage Checklist (JSON)</b>", unsafe_allow_html=True)
50
- sample_checklist = '''{
51
- "required_documents": [
52
- {"type": "Driver's License", "description": "Government-issued photo ID"},
53
- {"type": "Passport", "description": "Valid passport"},
54
- {"type": "SIN Card", "description": "Social Insurance Number document"},
55
- {"type": "Bank Statement", "description": "Last 3 months bank statement"},
56
- {"type": "Employment Letter", "description": "Signed letter from employer"},
57
- {"type": "Pay Stub", "description": "Most recent pay stub"},
58
- {"type": "Proof of Address", "description": "Utility bill or lease"}
59
- ]
60
- }'''
61
- checklist_text = st.text_area(
62
- "Paste or edit your mortgage checklist JSON below:",
63
- value=sample_checklist,
64
- height=200,
65
- key="doc_checklist_json"
66
- )
67
- # Parse checklist
68
- try:
69
- checklist = json.loads(checklist_text)
70
- required_types = [doc["type"] for doc in checklist["required_documents"]]
71
- except Exception as e:
72
- st.error("Invalid checklist JSON.")
73
- st.stop()
74
 
75
- # ===== Step 2: Document upload =====
76
- st.markdown("<span class='step-num'>2</span> <b>Upload Document(s) to Validate</b>", unsafe_allow_html=True)
77
- uploaded_files = st.file_uploader(
78
- "Upload PDF, DOCX, XLSX, PNG, JPG, TIFF, etc.",
79
- type=["pdf", "docx", "xlsx", "xls", "png", "jpg", "jpeg", "tiff"],
80
- key="mortgage_files",
81
- accept_multiple_files=True
82
- )
83
-
84
- # ===== Utilities =====
85
  def get_content_type(filename):
86
  mime, _ = mimetypes.guess_type(filename)
87
  ext = filename.lower().split('.')[-1]
@@ -91,7 +54,7 @@ def get_content_type(filename):
91
  return "application/octet-stream"
92
  return mime
93
 
94
- def extract_text_from_unstract(uploaded_file):
95
  filename = getattr(uploaded_file, "name", "uploaded_file")
96
  file_bytes = uploaded_file.read()
97
  content_type = get_content_type(filename)
@@ -100,35 +63,42 @@ def extract_text_from_unstract(uploaded_file):
100
  "Content-Type": content_type,
101
  }
102
  url = f"{UNSTRACT_BASE}/whisper"
103
- with st.spinner("Uploading and extracting with Unstract..."):
104
- r = requests.post(url, headers=headers, data=file_bytes)
105
- if r.status_code != 202:
106
- st.error(f"Unstract error: {r.status_code} - {r.text}")
107
- return None
108
- whisper_hash = r.json().get("whisper_hash")
109
- if not whisper_hash:
110
- st.error("Unstract: No whisper_hash received.")
111
- return None
112
-
113
- # Poll for status
 
 
114
  status_url = f"{UNSTRACT_BASE}/whisper-status?whisper_hash={whisper_hash}"
115
  for i in range(30):
116
  status_r = requests.get(status_url, headers={"unstract-key": UNSTRACT_API_KEY})
117
  if status_r.status_code != 200:
118
- st.error(f"Unstract status error: {status_r.status_code} - {status_r.text}")
 
119
  return None
120
  status = status_r.json().get("status")
121
  if status == "processed":
122
  break
 
 
123
  time.sleep(2)
124
  else:
125
- st.error("Unstract: Timeout waiting for OCR.")
 
126
  return None
127
 
128
  retrieve_url = f"{UNSTRACT_BASE}/whisper-retrieve?whisper_hash={whisper_hash}&text_only=true"
129
  r = requests.get(retrieve_url, headers={"unstract-key": UNSTRACT_API_KEY})
130
  if r.status_code != 200:
131
- st.error(f"Unstract: Error retrieving text: {r.status_code} - {r.text}")
 
132
  return None
133
  try:
134
  data = r.json()
@@ -136,40 +106,41 @@ def extract_text_from_unstract(uploaded_file):
136
  except Exception:
137
  return r.text
138
 
139
- def fuzzy_match_type(detected_type, checklist_types):
140
- # Returns best match and score
141
- best_type = None
142
- best_score = 0
143
- for t in checklist_types:
144
- score = fuzz.token_set_ratio(str(detected_type), str(t))
145
- if score > best_score:
146
- best_type = t
147
- best_score = score
148
- return best_type, best_score
149
 
150
- def query_gemma_llm(doc_text, checklist_json):
151
- prompt = f"""
152
- Read the following extracted document text and analyze according to this checklist JSON:
153
- {json.dumps(checklist_json)}
154
 
155
- Can you read from this text, what type of document it is such as Certificate, License, Passport, etc and Also find the expiry date of it from the text, If you don't find the expiry date text but if you found any other code such as MRZ then find the expiry date from that. Also by the look of it give your verdict whether this is genuine with a confidence score. Also if the current date is 21st June 2025 then check whether the document is already expired or valid.
156
 
157
- Return your output as a JSON like:
 
 
158
  {{
159
- "document_type": "...",
160
- "expiry_date": "...",
161
  "is_expired": true/false,
162
  "looks_genuine": true/false,
163
  "confidence": <score 0-100>,
164
- "verdict": "...reasoned verdict..."
 
165
  }}
 
166
  Document Text:
167
  {doc_text[:4000]}
168
  """.strip()
169
 
 
 
170
  headers = {
171
  "Authorization": f"Bearer {OPENROUTER_API_KEY}",
172
- "HTTP-Referer": "https://chat.openai.com", # Some openrouter models require this
173
  "X-Title": "EZOFIS-Doc-Validator",
174
  "Content-Type": "application/json",
175
  }
@@ -179,66 +150,142 @@ Document Text:
179
  "temperature": 0.1,
180
  "max_tokens": 1024
181
  }
182
- with st.spinner("Gemma LLM is validating the document..."):
183
- resp = requests.post(OPENROUTER_URL, headers=headers, json=data, timeout=90)
 
184
  if resp.status_code != 200:
185
- st.error(f"OpenRouter error: {resp.status_code}: {resp.text}")
186
- return None
 
187
  result = resp.json()["choices"][0]["message"]["content"]
188
  # Extract only JSON
189
  start = result.find("{")
190
  end = result.rfind("}") + 1
191
  if start == -1 or end == 0:
192
- st.error("Gemma did not return JSON.")
193
- st.code(result)
194
- return None
 
195
  try:
196
- return json.loads(result[start:end])
197
  except Exception as e:
198
- st.error("Error parsing LLM response.")
199
- st.code(result)
200
- return None
 
 
 
 
 
 
 
 
 
 
 
201
 
202
- # ========== Step 3: Run Validation ==========
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
203
  if st.button("Run Document Validation", type="primary") and uploaded_files:
204
  results = []
 
 
205
  for uploaded_file in uploaded_files:
206
  st.subheader(f"Validating: {uploaded_file.name}")
207
- # Extract text
208
- doc_text = extract_text_from_unstract(uploaded_file)
 
 
 
 
 
209
  if not doc_text:
210
- st.warning("Skipping due to extraction error.")
 
 
211
  continue
212
- # Query LLM
213
- llm_json = query_gemma_llm(doc_text, checklist)
 
 
 
 
 
214
  if not llm_json:
215
- st.warning("Skipping due to LLM error.")
 
 
216
  continue
217
- # Fuzzy match doc type with checklist
218
  detected_type = llm_json.get("document_type", "")
219
  matched_type, match_score = fuzzy_match_type(detected_type, required_types)
220
- # Acceptance logic
 
 
 
 
 
 
 
221
  accepted = (
222
- matched_type is not None and match_score >= 70 and
223
  llm_json.get("looks_genuine", False) and
224
  not llm_json.get("is_expired", False)
225
  )
 
226
  reason = []
227
- reason.append(
228
- f"Document type '{detected_type}' matched checklist '{matched_type}' with score {match_score}/100." if matched_type else
229
- f"Document type '{detected_type}' did not match any required type."
230
- )
231
- reason.append(
232
- f"Genuineness confidence: {llm_json.get('confidence', 0)}."
233
- )
234
- reason.append(
235
- "Document is not expired." if not llm_json.get("is_expired", False) else "Document is expired."
236
- )
 
 
237
  reason.append(llm_json.get("verdict", ""))
 
238
  results.append({
239
  "File": uploaded_file.name,
240
  "Detected Type": detected_type,
241
- "Checklist Match": matched_type or "-",
242
  "Type Score": match_score,
243
  "Expiry Date": llm_json.get("expiry_date", "-"),
244
  "Expired": "Yes" if llm_json.get("is_expired", False) else "No",
@@ -247,13 +294,24 @@ if st.button("Run Document Validation", type="primary") and uploaded_files:
247
  "Accepted": "Yes" if accepted else "No",
248
  "Reason": " ".join(reason)
249
  })
 
 
 
 
 
 
 
 
 
 
250
  if results:
251
- st.success("Validation Complete.")
252
- st.dataframe(pd.DataFrame(results))
253
  else:
254
  st.warning("No valid results.")
255
 
256
- # Debugging
257
- if "last_api" in st.session_state:
258
- with st.expander("Debug (LLM raw output)"):
259
- st.code(st.session_state.last_api)
 
 
8
  from fuzzywuzzy import fuzz
9
  import pandas as pd
10
 
11
+ # ========== CONFIG ==========
12
+ UNSTRACT_BASE = "https://llmwhisperer-api.us-central.unstract.com/api/v2"
13
+ UNSTRACT_API_KEY = os.getenv("UNSTRACT_API_KEY")
14
+ OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")
15
+ OPENROUTER_URL = "https://openrouter.ai/api/v1/chat/completions"
16
+ GEMMA_MODEL = "google/gemma-3-4b-it:free"
17
+
18
  st.set_page_config(page_title="EZOFIS Document Validation Agent", layout="wide")
19
+
20
  st.markdown("""
21
  <style>
22
  .block-card {
 
34
  </style>
35
  """, unsafe_allow_html=True)
36
 
 
 
 
 
 
 
 
 
 
37
  st.markdown(
38
  "<h1 style='font-weight:800; margin-bottom:8px;'>EZOFIS Document Validation Agent</h1>",
39
  unsafe_allow_html=True
40
  )
41
  st.markdown(
42
+ "<div style='font-size:20px; margin-bottom:28px; color:#24345C;'>AI-driven checklist-based document acceptance for mortgage applications.</div>",
43
  unsafe_allow_html=True
44
  )
45
 
46
+ # ========== FUNCTIONS ==========
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47
 
 
 
 
 
 
 
 
 
 
 
48
  def get_content_type(filename):
49
  mime, _ = mimetypes.guess_type(filename)
50
  ext = filename.lower().split('.')[-1]
 
54
  return "application/octet-stream"
55
  return mime
56
 
57
+ def extract_text_from_unstract(uploaded_file, status_box=None):
58
  filename = getattr(uploaded_file, "name", "uploaded_file")
59
  file_bytes = uploaded_file.read()
60
  content_type = get_content_type(filename)
 
63
  "Content-Type": content_type,
64
  }
65
  url = f"{UNSTRACT_BASE}/whisper"
66
+ if status_box:
67
+ status_box.info("Step 1: Uploading and extracting text (OCR)...")
68
+ r = requests.post(url, headers=headers, data=file_bytes)
69
+ if r.status_code != 202:
70
+ if status_box:
71
+ status_box.error(f"Unstract error: {r.status_code} - {r.text}")
72
+ return None
73
+ whisper_hash = r.json().get("whisper_hash")
74
+ if not whisper_hash:
75
+ if status_box:
76
+ status_box.error("Unstract: No whisper_hash received.")
77
+ return None
78
+ # Poll status
79
  status_url = f"{UNSTRACT_BASE}/whisper-status?whisper_hash={whisper_hash}"
80
  for i in range(30):
81
  status_r = requests.get(status_url, headers={"unstract-key": UNSTRACT_API_KEY})
82
  if status_r.status_code != 200:
83
+ if status_box:
84
+ status_box.error(f"Unstract status error: {status_r.status_code} - {status_r.text}")
85
  return None
86
  status = status_r.json().get("status")
87
  if status == "processed":
88
  break
89
+ if status_box:
90
+ status_box.info(f"OCR in progress... ({i+1}/30)")
91
  time.sleep(2)
92
  else:
93
+ if status_box:
94
+ status_box.error("Unstract: Timeout waiting for OCR.")
95
  return None
96
 
97
  retrieve_url = f"{UNSTRACT_BASE}/whisper-retrieve?whisper_hash={whisper_hash}&text_only=true"
98
  r = requests.get(retrieve_url, headers={"unstract-key": UNSTRACT_API_KEY})
99
  if r.status_code != 200:
100
+ if status_box:
101
+ status_box.error(f"Unstract: Error retrieving text: {r.status_code} - {r.text}")
102
  return None
103
  try:
104
  data = r.json()
 
106
  except Exception:
107
  return r.text
108
 
109
+ def build_prompt(doc_text, checklist):
110
+ return f"""
111
+ You are a careful, expert document validation agent for mortgage workflows.
112
+
113
+ Analyze the following extracted document text and this checklist JSON:
114
+ {json.dumps(checklist)}
115
+
116
+ First, **determine what document you are reading** (e.g., Driver's License, Passport, Bank Statement, etc.) as precisely as possible, based on content, layout, and terms.
 
 
117
 
118
+ **DO NOT** attempt to "force match" or guess a checklist match if you are not sure. If the detected document type does NOT correspond (even loosely) to any checklist item, set "checklist_matched": false and recommend rejection. If it matches, set "checklist_matched": true.
 
 
 
119
 
120
+ Extract the expiry date if found (or set as null/empty), and if present, check if it is expired compared to the current date: 21st June 2025.
121
 
122
+ Assess if the document looks genuine (as much as possible from the text), and provide a confidence score (0-100).
123
+
124
+ Respond with this JSON:
125
  {{
126
+ "document_type": "...", // Your best judgment (e.g. Driver's License)
127
+ "expiry_date": "...", // ISO format if possible
128
  "is_expired": true/false,
129
  "looks_genuine": true/false,
130
  "confidence": <score 0-100>,
131
+ "checklist_matched": true/false,
132
+ "verdict": "..." // One-sentence reason
133
  }}
134
+
135
  Document Text:
136
  {doc_text[:4000]}
137
  """.strip()
138
 
139
+ def query_gemma_llm(doc_text, checklist, status_box=None):
140
+ prompt = build_prompt(doc_text, checklist)
141
  headers = {
142
  "Authorization": f"Bearer {OPENROUTER_API_KEY}",
143
+ "HTTP-Referer": "https://chat.openai.com", # for OpenRouter
144
  "X-Title": "EZOFIS-Doc-Validator",
145
  "Content-Type": "application/json",
146
  }
 
150
  "temperature": 0.1,
151
  "max_tokens": 1024
152
  }
153
+ if status_box:
154
+ status_box.info("Step 2: Validating document with Gemma LLM...")
155
+ resp = requests.post(OPENROUTER_URL, headers=headers, json=data, timeout=90)
156
  if resp.status_code != 200:
157
+ if status_box:
158
+ status_box.error(f"OpenRouter error: {resp.status_code}: {resp.text}")
159
+ return None, None, prompt
160
  result = resp.json()["choices"][0]["message"]["content"]
161
  # Extract only JSON
162
  start = result.find("{")
163
  end = result.rfind("}") + 1
164
  if start == -1 or end == 0:
165
+ if status_box:
166
+ status_box.error("Gemma did not return JSON.")
167
+ status_box.write(result)
168
+ return None, result, prompt
169
  try:
170
+ return json.loads(result[start:end]), result, prompt
171
  except Exception as e:
172
+ if status_box:
173
+ status_box.error("Error parsing LLM response.")
174
+ status_box.write(result)
175
+ return None, result, prompt
176
+
177
def fuzzy_match_type(detected_type, checklist_types):
    """Fuzzy-match a detected document type against the checklist.

    Scores *detected_type* against every entry of *checklist_types*
    using fuzzywuzzy's ``token_set_ratio`` and returns the pair
    ``(best_type, best_score)``.  An empty checklist yields
    ``(None, 0)``; on ties the earliest checklist entry wins, matching
    the original strictly-greater-than scan.
    """
    ranked = [
        (fuzz.token_set_ratio(str(detected_type), str(candidate)), candidate)
        for candidate in checklist_types
    ]
    if not ranked:
        return None, 0
    best_score, best_type = max(ranked, key=lambda pair: pair[0])
    return best_type, best_score
186
 
187
# ========== UI ==========
# Default checklist shown in the editor; users may paste their own JSON.
sample_checklist = '''{
"required_documents": [
{"type": "Driver's License", "description": "Government-issued photo ID"},
{"type": "Passport", "description": "Valid passport"},
{"type": "SIN Card", "description": "Social Insurance Number document"},
{"type": "Bank Statement", "description": "Last 3 months bank statement"},
{"type": "Employment Letter", "description": "Signed letter from employer"},
{"type": "Pay Stub", "description": "Most recent pay stub"},
{"type": "Proof of Address", "description": "Utility bill or lease"}
]
}'''

# Step 1: checklist input.
st.markdown("<span class='step-num'>1</span> <b>Paste Mortgage Checklist (JSON)</b>", unsafe_allow_html=True)
checklist_text = st.text_area(
    "Paste or edit your mortgage checklist JSON below:",
    value=sample_checklist,
    height=200,
    key="doc_checklist_json"
)

# Parse the checklist up front and halt the app on malformed input so
# later steps can rely on `checklist` / `required_types` existing.
# Narrowed from a bare `except Exception as e` (which also bound an
# unused variable) so unrelated programming errors are not silently
# reported as "invalid JSON"; json.JSONDecodeError subclasses ValueError.
try:
    checklist = json.loads(checklist_text)
    required_types = [doc["type"] for doc in checklist["required_documents"]]
except (ValueError, KeyError, TypeError):
    st.error("Invalid checklist JSON.")
    st.stop()

# Step 2: file upload (multiple files allowed).
st.markdown("<span class='step-num'>2</span> <b>Upload Document(s) to Validate</b>", unsafe_allow_html=True)
uploaded_files = st.file_uploader(
    "Upload PDF, DOCX, XLSX, PNG, JPG, TIFF, etc.",
    type=["pdf", "docx", "xlsx", "xls", "png", "jpg", "jpeg", "tiff"],
    key="mortgage_files",
    accept_multiple_files=True
)
+ )
221
+
222
+ # ========== PROCESSING ==========
223
  if st.button("Run Document Validation", type="primary") and uploaded_files:
224
  results = []
225
+ debug_data = []
226
+
227
  for uploaded_file in uploaded_files:
228
  st.subheader(f"Validating: {uploaded_file.name}")
229
+ status_box = st.empty()
230
+ debug = {}
231
+
232
+ # Step 1: OCR
233
+ doc_text = extract_text_from_unstract(uploaded_file, status_box)
234
+ debug['OCR_extracted_text'] = doc_text
235
+
236
  if not doc_text:
237
+ status_box.error("Skipping due to OCR extraction error.")
238
+ debug['error'] = "OCR extraction error"
239
+ debug_data.append({uploaded_file.name: debug})
240
  continue
241
+
242
+ # Step 2: LLM Validation
243
+ llm_json, llm_raw, llm_prompt = query_gemma_llm(doc_text, checklist, status_box)
244
+ debug['LLM_prompt'] = llm_prompt
245
+ debug['LLM_raw_response'] = llm_raw
246
+ debug['LLM_parsed_json'] = llm_json
247
+
248
  if not llm_json:
249
+ status_box.error("Skipping due to LLM error.")
250
+ debug['error'] = "LLM processing error"
251
+ debug_data.append({uploaded_file.name: debug})
252
  continue
253
+
254
  detected_type = llm_json.get("document_type", "")
255
  matched_type, match_score = fuzzy_match_type(detected_type, required_types)
256
+
257
+ # Accept only if LLM states checklist_matched, looks genuine, and not expired
258
+ checklist_matched = llm_json.get("checklist_matched", False)
259
+ if checklist_matched:
260
+ # Double check: If match_score < 65, override to not matched
261
+ if match_score < 65:
262
+ checklist_matched = False
263
+
264
  accepted = (
265
+ checklist_matched and
266
  llm_json.get("looks_genuine", False) and
267
  not llm_json.get("is_expired", False)
268
  )
269
+
270
  reason = []
271
+ if not checklist_matched:
272
+ reason.append("No matching checklist item found. Document rejected.")
273
+ else:
274
+ reason.append(
275
+ f"Document type '{detected_type}' matched checklist '{matched_type}' with score {match_score}/100."
276
+ )
277
+ if not llm_json.get("looks_genuine", False):
278
+ reason.append("Document does not look genuine.")
279
+ if llm_json.get("is_expired", False):
280
+ reason.append("Document is expired.")
281
+
282
+ reason.append(f"Genuineness confidence: {llm_json.get('confidence', 0)}.")
283
  reason.append(llm_json.get("verdict", ""))
284
+
285
  results.append({
286
  "File": uploaded_file.name,
287
  "Detected Type": detected_type,
288
+ "Checklist Match": matched_type if checklist_matched else "-",
289
  "Type Score": match_score,
290
  "Expiry Date": llm_json.get("expiry_date", "-"),
291
  "Expired": "Yes" if llm_json.get("is_expired", False) else "No",
 
294
  "Accepted": "Yes" if accepted else "No",
295
  "Reason": " ".join(reason)
296
  })
297
+ debug['Checklist_match_details'] = {
298
+ "detected_type": detected_type,
299
+ "matched_type": matched_type,
300
+ "match_score": match_score,
301
+ "checklist_matched": checklist_matched,
302
+ "accepted": accepted
303
+ }
304
+ debug_data.append({uploaded_file.name: debug})
305
+ status_box.success("Validation complete. See result below.")
306
+
307
  if results:
308
+ st.success("All validations complete.")
309
+ st.dataframe(pd.DataFrame(results), use_container_width=True)
310
  else:
311
  st.warning("No valid results.")
312
 
313
+ with st.expander("Debug Panel (per document)"):
314
+ for doc_debug in debug_data:
315
+ for fname, dbg in doc_debug.items():
316
+ st.markdown(f"**{fname}**")
317
+ st.json(dbg)