# Regulatory-Bot / app.py
# (Hugging Face Space page residue removed: "Akash-Dragon's picture /
#  Upload 5 files / 0000719 verified" — not part of the program.)
#!/usr/bin/env python
# coding: utf-8
# =========================================================
# 1. IMPORTS & ENV
# =========================================================
import os
import json
import re
import hashlib
from dotenv import load_dotenv
from PIL import Image
import gradio as gr
import pytesseract
from pdf2image import convert_from_path
from groq import Groq
load_dotenv()
# =========================================================
# 2. LOAD FDIC SECTION 3.2 ONCE (GLOBAL)
# =========================================================
# Loaded once at import time. Each chunk is expected to carry a "chunk_id"
# plus a "subtopic" or "title" key (see process_request below).
# Explicit encoding: the default is platform-dependent and would misread a
# UTF-8 JSON file on e.g. Windows (cp1252).
with open("data/fdic_section_3_2_chunks_refined.json", encoding="utf-8") as f:
    FDIC_CHUNKS = json.load(f)
# =========================================================
# 3. GROQ CLIENT & MODELS
# =========================================================
# API key comes from the environment (.env via load_dotenv above).
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
MODEL_LLM1 = "llama-3.1-8b-instant"  # OCR -> loan summary extraction
MODEL_LLM2 = "llama-3.1-8b-instant"  # FDIC topic indexing
MODEL_LLM4 = "meta-llama/llama-4-scout-17b-16e-instruct"  # final reasoning
# =========================================================
# 4. SESSION STATE
# =========================================================
# Most recent OCR text and extracted loan summary, reused when a follow-up
# question arrives without new uploads (see process_request).
# NOTE(review): module-level state is shared by every request served by this
# process — fine for a single-user demo, not safe for concurrent users; confirm.
SESSION_STATE = {
    "ocr_text": "",
    "loan_summary": None
}
# OCR results keyed by a hash of file content, so re-uploading the same
# document skips the expensive OCR pass.
OCR_CACHE = {}
# =========================================================
# 5. GUARDRAILS
# =========================================================
# Denylist of obviously off-topic subjects. Matched on word boundaries so
# substrings inside unrelated words (e.g. "weathered", "scold") no longer
# trigger a false refusal, which plain `k in text` matching did.
NON_LOAN_KEYWORDS = [
    "movie", "music", "sports", "weather", "joke", "recipe",
    "health", "cold", "fever", "doctor", "medicine",
    "politics", "election"
]

# Compiled once at import time; case-insensitive whole-word alternation.
_NON_LOAN_PATTERN = re.compile(
    r"\b(?:" + "|".join(map(re.escape, NON_LOAN_KEYWORDS)) + r")\b",
    re.IGNORECASE,
)


def sanitize_user_input(text):
    """Trim surrounding whitespace and cap input at 5000 chars; '' if falsy."""
    return text.strip()[:5000] if text else ""


def is_non_loan_question(text):
    """Return True when the question contains a whole-word off-topic keyword."""
    return _NON_LOAN_PATTERN.search(text or "") is not None
# =========================================================
# 6. SAFE JSON PARSER
# =========================================================
def safe_json_loads(text, stage):
    """Parse the first JSON object embedded in an LLM response.

    Args:
        text: Raw model output, possibly wrapped in ``` fences or prose.
        stage: Pipeline stage label (e.g. "LLM-1") used in error messages.

    Returns:
        The parsed object (a dict for the prompts used in this app).

    Raises:
        ValueError: on an empty response, when no ``{...}`` span is found,
            or when the span is not valid JSON (decode errors are re-raised
            as ValueError with the stage label, so all three failure modes
            carry consistent context for callers).
    """
    if not text:
        raise ValueError(f"{stage} returned empty response")
    # Strip markdown code fences the models often emit despite instructions.
    text = re.sub(r"```json|```", "", text).strip()
    # Greedy first-{ to last-} span; sufficient for single-object responses.
    match = re.search(r"\{.*\}", text, re.DOTALL)
    if not match:
        raise ValueError(f"{stage} returned no JSON:\n{text}")
    try:
        return json.loads(match.group())
    except json.JSONDecodeError as err:
        # Previously this escaped without the stage label, unlike the
        # other error paths above.
        raise ValueError(f"{stage} returned invalid JSON: {err}\n{text}") from err
# =========================================================
# 7. OCR HELPERS
# =========================================================
MAX_PAGES = 5


def file_hash(path, max_bytes=1024 * 1024):
    """Return the MD5 hex digest of the first `max_bytes` of a file.

    Hashing only a prefix keeps cache-key computation cheap for large
    uploads; MD5 is used purely as a cache key, not for security.
    """
    with open(path, "rb") as fh:
        prefix = fh.read(max_bytes)
    return hashlib.md5(prefix).hexdigest()
def ocr_file(path):
    """OCR a single file to plain text.

    PDFs are rasterized at 200 dpi and capped at MAX_PAGES pages; images
    are processed directly. Pages/images are converted to grayscale ("L")
    before Tesseract runs. Returns the stripped extracted text.
    """
    if not path.lower().endswith(".pdf"):
        grayscale = Image.open(path).convert("L")
        return pytesseract.image_to_string(grayscale).strip()

    rendered = convert_from_path(path, dpi=200)[:MAX_PAGES]
    page_texts = [pytesseract.image_to_string(page.convert("L")) for page in rendered]
    # join + strip yields the same result as appending "\n" per page and
    # stripping the total: the two only differ in a trailing newline.
    return "\n".join(page_texts).strip()
def run_ocr_pipeline(uploaded_files):
    """OCR every uploaded file, memoizing results in OCR_CACHE.

    Files are keyed by a content-prefix hash (file_hash), so identical
    re-uploads skip OCR. Returns all texts joined with newlines.
    """
    collected = []
    for uploaded in uploaded_files:
        path = str(uploaded)
        digest = file_hash(path)
        cached = OCR_CACHE.get(digest)
        if cached is None:
            cached = OCR_CACHE[digest] = ocr_file(path)
        collected.append(cached)
    return "\n".join(collected)
# =========================================================
# 8. LOAN SCHEMA
# =========================================================
# NOTE(review): this is a placeholder left when the file was shared — the
# literal text "<same as your original schema>" is interpolated into the
# LLM-1 and NO-DOC prompts below, so the real JSON schema must be restored
# before the extraction prompts are meaningful.
LOAN_SCHEMA = """<same as your original schema>"""
# =========================================================
# 9. SYSTEM PROMPTS
# =========================================================
# LLM-1: strict extraction-only prompt; f-string embeds LOAN_SCHEMA.
LLM1_SYSTEM_PROMPT = f"""
You are an information extraction engine for bank loan documents.
Task:
- Extract ONLY facts that are explicitly stated in the text.
- Do NOT infer, assume, normalize, or calculate anything.
- If a value is missing or unclear, use null or "unknown".
Rules:
- Use ONLY the provided OCR text.
- Do NOT add explanations.
- Do NOT reference regulations.
- Output MUST strictly match the schema below.
- Return ONLY valid JSON.
Schema:
{LOAN_SCHEMA}
"""
# LLM-2: heading-only chunk selection; the model must answer with
# {"selected_chunk_ids": [...]} (parsed by safe_json_loads in process_request).
LLM2_SYSTEM_PROMPT = """
You are a regulatory topic indexing assistant.
Inputs:
- A user question
- A list of FDIC RMS Manual Section 3.2 headings with chunk_ids
Task:
- Select ONLY the chunk_ids whose headings are directly relevant
to answering the user question.
- Base your decision ONLY on the heading titles.
- Do NOT interpret or summarize policy text.
Rules:
- Select between 1 and 6 chunk_ids.
- If no headings are relevant, return an empty list.
- Do NOT explain your reasoning.
- Return ONLY valid JSON.
Output format:
{
"selected_chunk_ids": ["string"]
}
"""
# LLM-4: final grounded-answer prompt separating loan facts (SOURCE A) from
# regulation (SOURCE B).
# NOTE(review): the "β€”"/"β€’"/"β†’" sequences below look like UTF-8
# em-dash/bullet/arrow characters mis-decoded as Latin-1 (mojibake). They are
# kept byte-identical here since they are runtime prompt text; confirm and
# re-encode against the original source.
LLM4_SYSTEM_PROMPT = """
You are a regulatory-aligned loan evaluation assistant.
You are given TWO authoritative sources:
SOURCE A β€” Loan Summary
β€’ Structured facts extracted from uploaded loan documents
β€’ This is the ONLY source for borrower name, loan type, interest rate,
amounts, collateral, and other loan-specific details
SOURCE B β€” FDIC RMS Manual Section 3.2 (Loans)
β€’ This is the ONLY source for regulatory objectives, examiner expectations,
loan review systems, risk management, and policy intent
RULES (STRICT):
1. If the user asks for loan details β†’ answer ONLY from SOURCE A
2. If the user asks regulatory or examiner questions β†’ answer ONLY from SOURCE B
3. If the user asks a mixed question β†’ clearly separate:
β€’ factual loan details (SOURCE A)
β€’ regulatory interpretation (SOURCE B)
4. Do NOT infer or assume missing facts
5. Do NOT use general banking knowledge
6. Do NOT approve, reject, or predict loan outcomes
7. If required information is missing, explicitly state that it is not available
Tone:
Professional, neutral, examiner-style.
No markdown. No speculation.
"""
# NO-DOC fallback extraction prompt. Not referenced by process_request in
# this file (which substitutes a plain placeholder dict instead); kept for
# completeness. Also embeds LOAN_SCHEMA.
NO_DOC_PROMPT = f"""
You are creating a placeholder loan summary.
Rules:
- Use ONLY the schema provided.
- Do NOT infer or fabricate details.
- Populate fields only if explicitly stated in the user input.
- Otherwise, use null or "unknown".
- Return ONLY valid JSON.
Schema:
{LOAN_SCHEMA}
"""
# =========================================================
# 10. LLM CALL
# =========================================================
def call_llm(system_prompt, user_prompt, model, temperature=0):
    """Send one system+user exchange to Groq and return the stripped reply.

    Args:
        system_prompt: System-role instruction text.
        user_prompt: User-role content (plain text or serialized JSON).
        model: Groq model identifier.
        temperature: Sampling temperature (0 = deterministic extraction).
    """
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    response = client.chat.completions.create(
        model=model,
        temperature=temperature,
        messages=conversation,
    )
    return response.choices[0].message.content.strip()
# =========================================================
# 11. MAIN LOGIC (FINAL)
# =========================================================
def process_request(user_text, uploaded_files):
    """Run the full guardrail -> OCR -> indexing -> answer pipeline.

    Args:
        user_text: The user's question (sanitized/truncated here).
        uploaded_files: Optional iterable of uploaded file paths/objects.

    Returns:
        (ocr_text, answer) — the OCR text used (possibly "" or a prior
        session's text) and the final LLM-4 answer string.

    Raises:
        ValueError: propagated from safe_json_loads when an LLM stage
            returns unusable JSON.
    """
    user_text = sanitize_user_input(user_text)

    # Guardrail: refuse clearly off-topic (non-loan) questions outright.
    if is_non_loan_question(user_text):
        return "", "⚠️ Only FDIC Section 3.2 loan and regulatory questions are supported."

    # ------------------------------------------------------
    # LLM-1: OCR -> structured loan summary (only when files were uploaded).
    # ------------------------------------------------------
    if uploaded_files:
        ocr_text = run_ocr_pipeline(uploaded_files)
        loan_summary = safe_json_loads(
            call_llm(LLM1_SYSTEM_PROMPT, ocr_text, MODEL_LLM1),
            "LLM-1",
        )
        # Persist for follow-up questions in the same session.
        SESSION_STATE["ocr_text"] = ocr_text
        SESSION_STATE["loan_summary"] = loan_summary
    else:
        # Follow-up or regulatory-only question: reuse prior session data.
        ocr_text = SESSION_STATE.get("ocr_text", "")
        loan_summary = SESSION_STATE.get("loan_summary")
        # Deliberately no NO-DOC extraction here; a plain placeholder keeps
        # regulatory-only questions cheap and unambiguous.
        if loan_summary is None:
            loan_summary = {
                "note": "No loan documents uploaded. Loan-specific facts unavailable."
            }

    # ------------------------------------------------------
    # LLM-2: FDIC Section 3.2 topic indexing (headings only).
    # ------------------------------------------------------
    headings_payload = {
        "user_question": user_text,
        "fdic_headings": [
            {
                "chunk_id": c["chunk_id"],
                "heading": c.get("subtopic") or c.get("title"),
            }
            for c in FDIC_CHUNKS
        ],
    }
    # Hoist the id set once — previously `set(selected_ids)` was rebuilt
    # for every chunk inside the filtering comprehension below.
    selected_ids = set(
        safe_json_loads(
            call_llm(LLM2_SYSTEM_PROMPT, json.dumps(headings_payload), MODEL_LLM2),
            "LLM-2",
        ).get("selected_chunk_ids", [])
    )
    # Hard cap of 6 chunks keeps the LLM-4 prompt bounded even if the
    # indexing model over-selects.
    selected_chunks = [
        {
            "chunk_id": c["chunk_id"],
            "heading": c.get("subtopic") or c.get("title"),
        }
        for c in FDIC_CHUNKS
        if c["chunk_id"] in selected_ids
    ][:6]

    # ------------------------------------------------------
    # LLM-4: final regulatory + factual answer.
    # ------------------------------------------------------
    llm4_payload = {
        "loan_summary": loan_summary,
        "fdic_section_3_2": selected_chunks,
        "user_question": user_text,
    }
    answer = call_llm(
        LLM4_SYSTEM_PROMPT,
        json.dumps(llm4_payload),
        MODEL_LLM4,
        temperature=0.2,
    )
    return ocr_text, answer
# =========================================================
# 12. GRADIO UI
# =========================================================
def chat_handler(user_text, uploaded_files, chat_history):
    """Gradio callback: run the pipeline and append both turns to the chat.

    Returns the (mutated) history as a list of {"role", "content"} dicts;
    the OCR text from process_request is intentionally discarded here.
    """
    history = chat_history or []
    _, answer = process_request(user_text, uploaded_files)
    for role, content in (("user", user_text), ("assistant", answer)):
        history.append({"role": role, "content": content})
    return history
# Build and launch the single-page Gradio UI.
with gr.Blocks(title="Regulatory Loan Evaluation Assistant") as demo:
    gr.Markdown("## πŸ“„ Regulatory Loan Evaluation Assistant")
    # NOTE(review): chat_handler returns OpenAI-style {"role","content"}
    # dicts, which gr.Chatbot only renders when constructed with
    # type="messages" (Gradio 4+); with the default tuple format this may
    # display incorrectly — confirm against the installed Gradio version.
    chat = gr.Chatbot(height=450)
    # Uploads are optional; without them, process_request falls back to the
    # previous session's OCR text / loan summary.
    files = gr.File(
        label="Upload Loan Documents (Optional)",
        file_types=[".pdf", ".png", ".jpg", ".jpeg"],
        file_count="multiple"
    )
    user_input = gr.Textbox(placeholder="Ask a regulatory or loan question")
    # Single action wires question + files + current history into the pipeline.
    gr.Button("Send").click(
        fn=chat_handler,
        inputs=[user_input, files, chat],
        outputs=[chat]
    )
demo.launch()