# Regulatory-Bot / app.py
# (Hugging Face Space page residue removed: "Akash-Dragon's picture /
#  Upload 5 files / 0000719 verified" — not part of the program.)
#!/usr/bin/env python
# coding: utf-8
# =========================================================
# 1. IMPORTS & ENV
# =========================================================
import os
import json
import re
import hashlib
from dotenv import load_dotenv
from PIL import Image
import gradio as gr
import pytesseract
from pdf2image import convert_from_path
from groq import Groq
load_dotenv()
# =========================================================
# 2. LOAD FDIC SECTION 3.2 ONCE (GLOBAL)
# =========================================================
# Loaded once at import time. Each chunk is expected to carry a "chunk_id"
# plus a "subtopic" or "title" key (see process_request below).
# Explicit encoding: the default is platform-dependent and would misread a
# UTF-8 JSON file on e.g. Windows (cp1252).
with open("data/fdic_section_3_2_chunks_refined.json", encoding="utf-8") as f:
    FDIC_CHUNKS = json.load(f)
# =========================================================
# 3. GROQ CLIENT & MODELS
# =========================================================
# API key comes from the environment (.env via load_dotenv above).
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
MODEL_LLM1 = "llama-3.1-8b-instant"  # OCR -> loan summary extraction
MODEL_LLM2 = "llama-3.1-8b-instant"  # FDIC topic indexing
MODEL_LLM4 = "meta-llama/llama-4-scout-17b-16e-instruct"  # final reasoning
# =========================================================
# 4. SESSION STATE
# =========================================================
# Most recent OCR text and extracted loan summary, reused when a follow-up
# question arrives without new uploads (see process_request).
# NOTE(review): module-level state is shared by every request served by this
# process — fine for a single-user demo, not safe for concurrent users; confirm.
SESSION_STATE = {
    "ocr_text": "",
    "loan_summary": None
}
# OCR results keyed by a hash of file content, so re-uploading the same
# document skips the expensive OCR pass.
OCR_CACHE = {}
# =========================================================
# 5. GUARDRAILS
# =========================================================
# Denylist of obviously off-topic subjects. Matched on word boundaries so
# substrings inside unrelated words (e.g. "weathered", "scold") no longer
# trigger a false refusal, which plain `k in text` matching did.
NON_LOAN_KEYWORDS = [
    "movie", "music", "sports", "weather", "joke", "recipe",
    "health", "cold", "fever", "doctor", "medicine",
    "politics", "election"
]

# Compiled once at import time; case-insensitive whole-word alternation.
_NON_LOAN_PATTERN = re.compile(
    r"\b(?:" + "|".join(map(re.escape, NON_LOAN_KEYWORDS)) + r")\b",
    re.IGNORECASE,
)


def sanitize_user_input(text):
    """Trim surrounding whitespace and cap input at 5000 chars; '' if falsy."""
    return text.strip()[:5000] if text else ""


def is_non_loan_question(text):
    """Return True when the question contains a whole-word off-topic keyword."""
    return _NON_LOAN_PATTERN.search(text or "") is not None
# =========================================================
# 6. SAFE JSON PARSER
# =========================================================
def safe_json_loads(text, stage):
    """Parse the first JSON object embedded in an LLM response.

    Args:
        text: Raw model output, possibly wrapped in ``` fences or prose.
        stage: Pipeline stage label (e.g. "LLM-1") used in error messages.

    Returns:
        The parsed object (a dict for the prompts used in this app).

    Raises:
        ValueError: on an empty response, when no ``{...}`` span is found,
            or when the span is not valid JSON (decode errors are re-raised
            as ValueError with the stage label, so all three failure modes
            carry consistent context for callers).
    """
    if not text:
        raise ValueError(f"{stage} returned empty response")
    # Strip markdown code fences the models often emit despite instructions.
    text = re.sub(r"```json|```", "", text).strip()
    # Greedy first-{ to last-} span; sufficient for single-object responses.
    match = re.search(r"\{.*\}", text, re.DOTALL)
    if not match:
        raise ValueError(f"{stage} returned no JSON:\n{text}")
    try:
        return json.loads(match.group())
    except json.JSONDecodeError as err:
        # Previously this escaped without the stage label, unlike the
        # other error paths above.
        raise ValueError(f"{stage} returned invalid JSON: {err}\n{text}") from err
# =========================================================
# 7. OCR HELPERS
# =========================================================
MAX_PAGES = 5


def file_hash(path, max_bytes=1024 * 1024):
    """Return the MD5 hex digest of the first `max_bytes` of a file.

    Hashing only a prefix keeps cache-key computation cheap for large
    uploads; MD5 is used purely as a cache key, not for security.
    """
    with open(path, "rb") as fh:
        prefix = fh.read(max_bytes)
    return hashlib.md5(prefix).hexdigest()
def ocr_file(path):
    """OCR a single file to plain text.

    PDFs are rasterized at 200 dpi and capped at MAX_PAGES pages; images
    are processed directly. Pages/images are converted to grayscale ("L")
    before Tesseract runs. Returns the stripped extracted text.
    """
    if not path.lower().endswith(".pdf"):
        grayscale = Image.open(path).convert("L")
        return pytesseract.image_to_string(grayscale).strip()

    rendered = convert_from_path(path, dpi=200)[:MAX_PAGES]
    page_texts = [pytesseract.image_to_string(page.convert("L")) for page in rendered]
    # join + strip yields the same result as appending "\n" per page and
    # stripping the total: the two only differ in a trailing newline.
    return "\n".join(page_texts).strip()
def run_ocr_pipeline(uploaded_files):
    """OCR every uploaded file, memoizing results in OCR_CACHE.

    Files are keyed by a content-prefix hash (file_hash), so identical
    re-uploads skip OCR. Returns all texts joined with newlines.
    """
    collected = []
    for uploaded in uploaded_files:
        path = str(uploaded)
        digest = file_hash(path)
        cached = OCR_CACHE.get(digest)
        if cached is None:
            cached = OCR_CACHE[digest] = ocr_file(path)
        collected.append(cached)
    return "\n".join(collected)
# =========================================================
# 8. LOAN SCHEMA
# =========================================================
# NOTE(review): this is a placeholder left when the file was shared — the
# literal text "<same as your original schema>" is interpolated into the
# LLM-1 and NO-DOC prompts below, so the real JSON schema must be restored
# before the extraction prompts are meaningful.
LOAN_SCHEMA = """<same as your original schema>"""
# =========================================================
# 9. SYSTEM PROMPTS
# =========================================================
# LLM-1: strict extraction-only prompt; f-string embeds LOAN_SCHEMA.
LLM1_SYSTEM_PROMPT = f"""
You are an information extraction engine for bank loan documents.
Task:
- Extract ONLY facts that are explicitly stated in the text.
- Do NOT infer, assume, normalize, or calculate anything.
- If a value is missing or unclear, use null or "unknown".
Rules:
- Use ONLY the provided OCR text.
- Do NOT add explanations.
- Do NOT reference regulations.
- Output MUST strictly match the schema below.
- Return ONLY valid JSON.
Schema:
{LOAN_SCHEMA}
"""
# LLM-2: heading-only chunk selection; the model must answer with
# {"selected_chunk_ids": [...]} (parsed by safe_json_loads in process_request).
LLM2_SYSTEM_PROMPT = """
You are a regulatory topic indexing assistant.
Inputs:
- A user question
- A list of FDIC RMS Manual Section 3.2 headings with chunk_ids
Task:
- Select ONLY the chunk_ids whose headings are directly relevant
to answering the user question.
- Base your decision ONLY on the heading titles.
- Do NOT interpret or summarize policy text.
Rules:
- Select between 1 and 6 chunk_ids.
- If no headings are relevant, return an empty list.
- Do NOT explain your reasoning.
- Return ONLY valid JSON.
Output format:
{
"selected_chunk_ids": ["string"]
}
"""
# LLM-4: final grounded-answer prompt separating loan facts (SOURCE A) from
# regulation (SOURCE B).
# NOTE(review): the "β€”"/"β€’"/"β†’" sequences below look like UTF-8
# em-dash/bullet/arrow characters mis-decoded as Latin-1 (mojibake). They are
# kept byte-identical here since they are runtime prompt text; confirm and
# re-encode against the original source.
LLM4_SYSTEM_PROMPT = """
You are a regulatory-aligned loan evaluation assistant.
You are given TWO authoritative sources:
SOURCE A β€” Loan Summary
β€’ Structured facts extracted from uploaded loan documents
β€’ This is the ONLY source for borrower name, loan type, interest rate,
amounts, collateral, and other loan-specific details
SOURCE B β€” FDIC RMS Manual Section 3.2 (Loans)
β€’ This is the ONLY source for regulatory objectives, examiner expectations,
loan review systems, risk management, and policy intent
RULES (STRICT):
1. If the user asks for loan details β†’ answer ONLY from SOURCE A
2. If the user asks regulatory or examiner questions β†’ answer ONLY from SOURCE B
3. If the user asks a mixed question β†’ clearly separate:
β€’ factual loan details (SOURCE A)
β€’ regulatory interpretation (SOURCE B)
4. Do NOT infer or assume missing facts
5. Do NOT use general banking knowledge
6. Do NOT approve, reject, or predict loan outcomes
7. If required information is missing, explicitly state that it is not available
Tone:
Professional, neutral, examiner-style.
No markdown. No speculation.
"""
# NO-DOC fallback extraction prompt. Not referenced by process_request in
# this file (which substitutes a plain placeholder dict instead); kept for
# completeness. Also embeds LOAN_SCHEMA.
NO_DOC_PROMPT = f"""
You are creating a placeholder loan summary.
Rules:
- Use ONLY the schema provided.
- Do NOT infer or fabricate details.
- Populate fields only if explicitly stated in the user input.
- Otherwise, use null or "unknown".
- Return ONLY valid JSON.
Schema:
{LOAN_SCHEMA}
"""
# =========================================================
# 10. LLM CALL
# =========================================================
def call_llm(system_prompt, user_prompt, model, temperature=0):
    """Send one system+user exchange to Groq and return the stripped reply.

    Args:
        system_prompt: System-role instruction text.
        user_prompt: User-role content (plain text or serialized JSON).
        model: Groq model identifier.
        temperature: Sampling temperature (0 = deterministic extraction).
    """
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    response = client.chat.completions.create(
        model=model,
        temperature=temperature,
        messages=conversation,
    )
    return response.choices[0].message.content.strip()
# =========================================================
# 11. MAIN LOGIC (FINAL)
# =========================================================
def process_request(user_text, uploaded_files):
    """Run the full guardrail -> OCR -> indexing -> answer pipeline.

    Args:
        user_text: The user's question (sanitized/truncated here).
        uploaded_files: Optional iterable of uploaded file paths/objects.

    Returns:
        (ocr_text, answer) — the OCR text used (possibly "" or a prior
        session's text) and the final LLM-4 answer string.

    Raises:
        ValueError: propagated from safe_json_loads when an LLM stage
            returns unusable JSON.
    """
    user_text = sanitize_user_input(user_text)

    # Guardrail: refuse clearly off-topic (non-loan) questions outright.
    if is_non_loan_question(user_text):
        return "", "⚠️ Only FDIC Section 3.2 loan and regulatory questions are supported."

    # ------------------------------------------------------
    # LLM-1: OCR -> structured loan summary (only when files were uploaded).
    # ------------------------------------------------------
    if uploaded_files:
        ocr_text = run_ocr_pipeline(uploaded_files)
        loan_summary = safe_json_loads(
            call_llm(LLM1_SYSTEM_PROMPT, ocr_text, MODEL_LLM1),
            "LLM-1",
        )
        # Persist for follow-up questions in the same session.
        SESSION_STATE["ocr_text"] = ocr_text
        SESSION_STATE["loan_summary"] = loan_summary
    else:
        # Follow-up or regulatory-only question: reuse prior session data.
        ocr_text = SESSION_STATE.get("ocr_text", "")
        loan_summary = SESSION_STATE.get("loan_summary")
        # Deliberately no NO-DOC extraction here; a plain placeholder keeps
        # regulatory-only questions cheap and unambiguous.
        if loan_summary is None:
            loan_summary = {
                "note": "No loan documents uploaded. Loan-specific facts unavailable."
            }

    # ------------------------------------------------------
    # LLM-2: FDIC Section 3.2 topic indexing (headings only).
    # ------------------------------------------------------
    headings_payload = {
        "user_question": user_text,
        "fdic_headings": [
            {
                "chunk_id": c["chunk_id"],
                "heading": c.get("subtopic") or c.get("title"),
            }
            for c in FDIC_CHUNKS
        ],
    }
    # Hoist the id set once — previously `set(selected_ids)` was rebuilt
    # for every chunk inside the filtering comprehension below.
    selected_ids = set(
        safe_json_loads(
            call_llm(LLM2_SYSTEM_PROMPT, json.dumps(headings_payload), MODEL_LLM2),
            "LLM-2",
        ).get("selected_chunk_ids", [])
    )
    # Hard cap of 6 chunks keeps the LLM-4 prompt bounded even if the
    # indexing model over-selects.
    selected_chunks = [
        {
            "chunk_id": c["chunk_id"],
            "heading": c.get("subtopic") or c.get("title"),
        }
        for c in FDIC_CHUNKS
        if c["chunk_id"] in selected_ids
    ][:6]

    # ------------------------------------------------------
    # LLM-4: final regulatory + factual answer.
    # ------------------------------------------------------
    llm4_payload = {
        "loan_summary": loan_summary,
        "fdic_section_3_2": selected_chunks,
        "user_question": user_text,
    }
    answer = call_llm(
        LLM4_SYSTEM_PROMPT,
        json.dumps(llm4_payload),
        MODEL_LLM4,
        temperature=0.2,
    )
    return ocr_text, answer
# =========================================================
# 12. GRADIO UI
# =========================================================
def chat_handler(user_text, uploaded_files, chat_history):
    """Gradio callback: run the pipeline and append both turns to the chat.

    Returns the (mutated) history as a list of {"role", "content"} dicts;
    the OCR text from process_request is intentionally discarded here.
    """
    history = chat_history or []
    _, answer = process_request(user_text, uploaded_files)
    for role, content in (("user", user_text), ("assistant", answer)):
        history.append({"role": role, "content": content})
    return history
# Build and launch the single-page Gradio UI.
with gr.Blocks(title="Regulatory Loan Evaluation Assistant") as demo:
    gr.Markdown("## πŸ“„ Regulatory Loan Evaluation Assistant")
    # NOTE(review): chat_handler returns OpenAI-style {"role","content"}
    # dicts, which gr.Chatbot only renders when constructed with
    # type="messages" (Gradio 4+); with the default tuple format this may
    # display incorrectly — confirm against the installed Gradio version.
    chat = gr.Chatbot(height=450)
    # Uploads are optional; without them, process_request falls back to the
    # previous session's OCR text / loan summary.
    files = gr.File(
        label="Upload Loan Documents (Optional)",
        file_types=[".pdf", ".png", ".jpg", ".jpeg"],
        file_count="multiple"
    )
    user_input = gr.Textbox(placeholder="Ask a regulatory or loan question")
    # Single action wires question + files + current history into the pipeline.
    gr.Button("Send").click(
        fn=chat_handler,
        inputs=[user_input, files, chat],
        outputs=[chat]
    )
demo.launch()