Spaces:

Akash-Dragon
/

Regulatory-Bot

Configuration error

App Files Files Community

Akash-Dragon committed on Jan 14

Commit

0000719

verified ·

1 Parent(s): 3579588

Upload 5 files

Browse files

Files changed (5) hide show

README.md +34 -6
app.py +334 -0
fdic_section_3_2_chunks_refined.json +0 -0
packages.txt +2 -0
requirements.txt +7 -0

README.md CHANGED Viewed

@@ -1,13 +1,41 @@
 ---
-title: Regulatory Bot
-emoji: 🏢
 colorFrom: blue
-colorTo: gray
 sdk: gradio
-sdk_version: 6.3.0
 app_file: app.py
 pinned: false
-short_description: Prompt Engineering Regulatory bot based on section 3.2
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: Regulatory Loan Evaluation Assistant
+emoji: 🏦
 colorFrom: blue
+colorTo: green
 sdk: gradio
+sdk_version: "4.0"
 app_file: app.py
 pinned: false
 ---
+## 📄 Regulatory Loan Evaluation Assistant
+This application is a **prompt-engineered regulatory reasoning system** designed for
+loan evaluation in accordance with the **FDIC RMS Manual of Examination Policies – Section 3.2 (Loans)**.
+### 🔍 What this system does
+- Extracts **structured loan facts** from uploaded loan documents using OCR
+- Answers user questions using **only**:
+  - Extracted loan facts, and
+  - FDIC Section 3.2 regulatory guidance
+- Refuses non-loan or out-of-scope questions
+- Avoids approvals, rejections, or predictions
+### 🧠 Key Design Principles
+- **Prompt engineering only** (no model training or fine-tuning)
+- **Single source of truth** for regulatory reasoning
+- **Audit-ready**, document-grounded responses
+- **Regulatory tone** aligned with examiner expectations
+### 📌 Inputs
+- Optional loan documents (PDF / image)
+- User regulatory or loan-related questions
+### 🚫 Explicitly excluded
+- Credit scoring
+- Automated decisions
+- OCR beyond basic text extraction
+- External data sources
+This project is intended for **educational and regulatory analysis purposes only**.

app.py ADDED Viewed

	@@ -0,0 +1,334 @@

+#!/usr/bin/env python
+# coding: utf-8
+# =========================================================
+# 1. IMPORTS & ENV
+# =========================================================
+import os
+import json
+import re
+import hashlib
+from dotenv import load_dotenv
+from PIL import Image
+import gradio as gr
+import pytesseract
+from pdf2image import convert_from_path
+from groq import Groq
+load_dotenv()
+# =========================================================
+# 2. LOAD FDIC SECTION 3.2 ONCE (GLOBAL)
+# =========================================================
+# Locate the FDIC Section 3.2 chunk file relative to this script rather than the
+# process working directory. It is looked up next to app.py first, then under a
+# data/ folder beside app.py, and finally at the original cwd-relative
+# "data/..." path (kept as the fallback so a missing file reports that path).
+_APP_DIR = os.path.dirname(os.path.abspath(__file__))
+_FDIC_CHUNKS_NAME = "fdic_section_3_2_chunks_refined.json"
+_FDIC_CHUNKS_CANDIDATES = [
+    os.path.join(_APP_DIR, _FDIC_CHUNKS_NAME),
+    os.path.join(_APP_DIR, "data", _FDIC_CHUNKS_NAME),
+    os.path.join("data", _FDIC_CHUNKS_NAME),
+]
+_FDIC_CHUNKS_PATH = next(
+    (p for p in _FDIC_CHUNKS_CANDIDATES if os.path.isfile(p)),
+    _FDIC_CHUNKS_CANDIDATES[-1],
+)
+# Explicit encoding: JSON is UTF-8 regardless of the host locale.
+with open(_FDIC_CHUNKS_PATH, encoding="utf-8") as f:
+    FDIC_CHUNKS = json.load(f)
+# =========================================================
+# 3. GROQ CLIENT & MODELS
+# =========================================================
+# Groq API client. The key is read from the GROQ_API_KEY environment variable;
+# os.environ.get returns None when it is unset.
+client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
+# Model identifiers, one per pipeline stage. LLM1 and LLM2 use the same model.
+MODEL_LLM1 = "llama-3.1-8b-instant"   # OCR → loan summary
+MODEL_LLM2 = "llama-3.1-8b-instant"   # topic indexing
+MODEL_LLM4 = "meta-llama/llama-4-scout-17b-16e-instruct"  # reasoning
+# =========================================================
+# 4. SESSION STATE
+# =========================================================
+# Most recent OCR text and extracted loan summary, reused by process_request
+# when a later question arrives without new uploads.
+# NOTE(review): this is a module-level dict, so it looks like it is shared by
+# every visitor of the app rather than kept per browser session — confirm
+# whether per-user state is required.
+SESSION_STATE = {
+    "ocr_text": "",
+    "loan_summary": None
+}
+# OCR output keyed by the file_hash() digest; entries are never evicted.
+OCR_CACHE = {}
+# =========================================================
+# 5. GUARDRAILS
+# =========================================================
+NON_LOAN_KEYWORDS = [
+    "movie", "music", "sports", "weather", "joke", "recipe",
+    "health", "cold", "fever", "doctor", "medicine",
+    "politics", "election"
+]
+# Whole-word, case-insensitive matcher built once from NON_LOAN_KEYWORDS.
+# Word boundaries stop legitimate loan questions from being rejected because a
+# keyword happens to sit inside another word ("selection" contains "election").
+# The optional trailing "s" keeps simple plurals ("movies", "jokes") matching.
+_NON_LOAN_PATTERN = re.compile(
+    r"\b(?:" + "|".join(re.escape(k) for k in NON_LOAN_KEYWORDS) + r")s?\b",
+    re.IGNORECASE,
+)
+def sanitize_user_input(text):
+    """Return *text* stripped of surrounding whitespace and capped at 5000 chars.
+
+    Falsy input (None or "") yields "".
+    """
+    return text.strip()[:5000] if text else ""
+def is_non_loan_question(text):
+    """Return True when *text* contains an off-topic keyword as a whole word.
+
+    Falsy input (None or "") is treated as not off-topic.
+    """
+    return bool(text) and _NON_LOAN_PATTERN.search(text) is not None
+# =========================================================
+# 6. SAFE JSON PARSER
+# =========================================================
+def safe_json_loads(text, stage):
+    """Parse the first JSON object embedded in an LLM reply.
+
+    Markdown code fences are removed first. Parsing then starts at each "{" in
+    turn and stops at the end of the first complete object, so commentary after
+    the JSON (even commentary containing braces) does not break decoding.
+
+    Args:
+        text: Raw model output; may be None or empty.
+        stage: Label of the calling pipeline stage, used in error messages.
+
+    Returns:
+        The decoded JSON object (a dict).
+
+    Raises:
+        ValueError: If *text* is empty or contains no decodable JSON object.
+    """
+    if not text:
+        raise ValueError(f"{stage} returned empty response")
+    text = re.sub(r"```json|```", "", text).strip()
+    decoder = json.JSONDecoder()
+    start = text.find("{")
+    while start != -1:
+        try:
+            # raw_decode parses one value beginning at `start` and ignores
+            # whatever follows it.
+            parsed, _ = decoder.raw_decode(text, start)
+        except json.JSONDecodeError:
+            # Not a valid object at this brace; try the next one.
+            start = text.find("{", start + 1)
+            continue
+        return parsed
+    raise ValueError(f"{stage} returned no JSON:\n{text}")
+# =========================================================
+# 7. OCR HELPERS
+# =========================================================
+MAX_PAGES = 5
+def file_hash(path, max_bytes=1024 * 1024):
+    """Return an MD5 hex digest used as the OCR cache key for *path*.
+
+    The digest covers the file size plus the first *max_bytes* bytes. Including
+    the size keeps two different files that share the same leading bytes from
+    collapsing onto one cache entry. The digest is a cache key only, not a
+    security measure.
+
+    Args:
+        path: Path of the file to fingerprint.
+        max_bytes: Number of leading bytes to read into the digest.
+
+    Returns:
+        A 32-character hexadecimal string.
+    """
+    h = hashlib.md5()
+    h.update(b"%d:" % os.path.getsize(path))
+    with open(path, "rb") as f:
+        h.update(f.read(max_bytes))
+    return h.hexdigest()
+def ocr_file(path):
+    """Run Tesseract OCR on a PDF or image file and return the extracted text.
+
+    PDFs are rasterised at 200 dpi and only the first MAX_PAGES pages are
+    processed. Any other path is opened as a single image. Pages are converted
+    to greyscale before OCR.
+
+    Args:
+        path: Filesystem path; a case-insensitive ".pdf" suffix selects the PDF
+            branch.
+
+    Returns:
+        The recognised text with leading and trailing whitespace removed.
+    """
+    if path.lower().endswith(".pdf"):
+        # Let pdf2image stop at MAX_PAGES instead of rendering the whole
+        # document and discarding the rest.
+        pages = convert_from_path(
+            path, dpi=200, first_page=1, last_page=MAX_PAGES
+        )
+        return "\n".join(
+            pytesseract.image_to_string(p.convert("L")) for p in pages
+        ).strip()
+    # Context manager releases the underlying file handle once OCR is done.
+    with Image.open(path) as img:
+        return pytesseract.image_to_string(img.convert("L")).strip()
+def run_ocr_pipeline(uploaded_files):
+    """OCR every uploaded file, reusing cached results, and join the texts.
+
+    Each upload is converted to a path with str(), fingerprinted with
+    file_hash(), and OCR'd through ocr_file() only when its digest is not yet
+    in OCR_CACHE. The per-file texts are joined with newlines in upload order.
+    """
+    collected = []
+    for upload in uploaded_files:
+        file_path = str(upload)
+        digest = file_hash(file_path)
+        try:
+            cached_text = OCR_CACHE[digest]
+        except KeyError:
+            cached_text = OCR_CACHE[digest] = ocr_file(file_path)
+        collected.append(cached_text)
+    return "\n".join(collected)
+# =========================================================
+# 8. LOAN SCHEMA
+# =========================================================
+# Schema text interpolated verbatim into LLM1_SYSTEM_PROMPT and NO_DOC_PROMPT.
+# NOTE(review): the value looks like an unfilled placeholder rather than an
+# actual schema — confirm and replace with the real field list.
+LOAN_SCHEMA = """<same as your original schema>"""
+# =========================================================
+# 9. SYSTEM PROMPTS
+# =========================================================
+# System prompt for LLM-1: extract explicitly stated loan facts from OCR text
+# as JSON. LOAN_SCHEMA is substituted into the f-string at import time.
+LLM1_SYSTEM_PROMPT = f"""
+You are an information extraction engine for bank loan documents.
+Task:
+- Extract ONLY facts that are explicitly stated in the text.
+- Do NOT infer, assume, normalize, or calculate anything.
+- If a value is missing or unclear, use null or "unknown".
+Rules:
+- Use ONLY the provided OCR text.
+- Do NOT add explanations.
+- Do NOT reference regulations.
+- Output MUST strictly match the schema below.
+- Return ONLY valid JSON.
+Schema:
+{LOAN_SCHEMA}
+"""
+# System prompt for LLM-2: choose relevant FDIC Section 3.2 chunk_ids from the
+# heading list. The reply is expected to be JSON with a "selected_chunk_ids"
+# list, which process_request reads.
+LLM2_SYSTEM_PROMPT = """
+You are a regulatory topic indexing assistant.
+Inputs:
+- A user question
+- A list of FDIC RMS Manual Section 3.2 headings with chunk_ids
+Task:
+- Select ONLY the chunk_ids whose headings are directly relevant
+  to answering the user question.
+- Base your decision ONLY on the heading titles.
+- Do NOT interpret or summarize policy text.
+Rules:
+- Select between 1 and 6 chunk_ids.
+- If no headings are relevant, return an empty list.
+- Do NOT explain your reasoning.
+- Return ONLY valid JSON.
+Output format:
+{
+  "selected_chunk_ids": ["string"]
+}
+"""
+# System prompt for LLM-4: produce the final answer from the loan summary
+# (SOURCE A) and the selected FDIC Section 3.2 material (SOURCE B).
+LLM4_SYSTEM_PROMPT = """
+You are a regulatory-aligned loan evaluation assistant.
+You are given TWO authoritative sources:
+SOURCE A — Loan Summary
+• Structured facts extracted from uploaded loan documents
+• This is the ONLY source for borrower name, loan type, interest rate,
+  amounts, collateral, and other loan-specific details
+SOURCE B — FDIC RMS Manual Section 3.2 (Loans)
+• This is the ONLY source for regulatory objectives, examiner expectations,
+  loan review systems, risk management, and policy intent
+RULES (STRICT):
+1. If the user asks for loan details → answer ONLY from SOURCE A
+2. If the user asks regulatory or examiner questions → answer ONLY from SOURCE B
+3. If the user asks a mixed question → clearly separate:
+   • factual loan details (SOURCE A)
+   • regulatory interpretation (SOURCE B)
+4. Do NOT infer or assume missing facts
+5. Do NOT use general banking knowledge
+6. Do NOT approve, reject, or predict loan outcomes
+7. If required information is missing, explicitly state that it is not available
+Tone:
+Professional, neutral, examiner-style.
+No markdown. No speculation.
+"""
+# System prompt for building a placeholder loan summary when no document is
+# uploaded. LOAN_SCHEMA is substituted into the f-string at import time.
+# NOTE(review): not referenced anywhere else in the visible code — confirm
+# whether it is still needed.
+NO_DOC_PROMPT = f"""
+You are creating a placeholder loan summary.
+Rules:
+- Use ONLY the schema provided.
+- Do NOT infer or fabricate details.
+- Populate fields only if explicitly stated in the user input.
+- Otherwise, use null or "unknown".
+- Return ONLY valid JSON.
+Schema:
+{LOAN_SCHEMA}
+"""
+# =========================================================
+# 10. LLM CALL
+# =========================================================
+def call_llm(system_prompt, user_prompt, model, temperature=0):
+    """Send one system + user message pair to the Groq chat API.
+
+    Args:
+        system_prompt: Instructions placed in the "system" message.
+        user_prompt: Payload placed in the "user" message.
+        model: Groq model identifier.
+        temperature: Sampling temperature passed through to the API.
+
+    Returns:
+        The first choice's message text with surrounding whitespace removed, or
+        "" when the API returns no text content.
+    """
+    r = client.chat.completions.create(
+        model=model,
+        temperature=temperature,
+        messages=[
+            {"role": "system", "content": system_prompt},
+            {"role": "user", "content": user_prompt}
+        ]
+    )
+    # message.content is optional in the response type; calling .strip() on None
+    # would raise AttributeError, so an absent reply is normalised to "".
+    content = r.choices[0].message.content
+    return content.strip() if content else ""
+# =========================================================
+# 11. MAIN LOGIC (FINAL)
+# =========================================================
+def process_request(user_text, uploaded_files):
+    """Answer one user question from loan facts and FDIC Section 3.2 headings.
+
+    Steps: sanitise the input, apply the keyword guardrail, optionally OCR the
+    uploads and extract a loan summary (LLM-1), select relevant headings
+    (LLM-2), then produce the final answer (LLM-4).
+
+    Args:
+        user_text: Raw question text from the UI; may be None or empty.
+        uploaded_files: Iterable of uploaded file paths, or a falsy value when
+            nothing was uploaded with this request.
+
+    Returns:
+        Tuple ``(ocr_text, answer)``. ``ocr_text`` is "" when the guardrail
+        rejects the question.
+
+    Raises:
+        ValueError: If an LLM reply cannot be parsed by safe_json_loads.
+    """
+    user_text = sanitize_user_input(user_text)
+    # Guardrail: refuse questions that hit an off-topic keyword.
+    if is_non_loan_question(user_text):
+        return "", "⚠️ Only FDIC Section 3.2 loan and regulatory questions are supported."
+    # ======================================================
+    # LLM-1: OCR → Loan Summary (ONLY if files exist)
+    # ======================================================
+    if uploaded_files:
+        ocr_text = run_ocr_pipeline(uploaded_files)
+        loan_summary = safe_json_loads(
+            call_llm(LLM1_SYSTEM_PROMPT, ocr_text, MODEL_LLM1),
+            "LLM-1"
+        )
+        # Remember the result so follow-up questions can omit the upload.
+        SESSION_STATE["ocr_text"] = ocr_text
+        SESSION_STATE["loan_summary"] = loan_summary
+    else:
+        # Follow-up or regulatory-only question: reuse the stored state.
+        ocr_text = SESSION_STATE.get("ocr_text", "")
+        loan_summary = SESSION_STATE.get("loan_summary")
+        # Regulatory questions do not need a loan summary; pass an explicit
+        # note instead of forcing a no-document extraction.
+        if loan_summary is None:
+            loan_summary = {
+                "note": "No loan documents uploaded. Loan-specific facts unavailable."
+            }
+    # ======================================================
+    # LLM-2: FDIC Section 3.2 Topic Indexing (HEADINGS ONLY)
+    # ======================================================
+    headings = [
+        {
+            "chunk_id": c["chunk_id"],
+            "heading": c.get("subtopic") or c.get("title")
+        }
+        for c in FDIC_CHUNKS
+    ]
+    headings_payload = {
+        "user_question": user_text,
+        "fdic_headings": headings
+    }
+    selected_ids = safe_json_loads(
+        call_llm(
+            LLM2_SYSTEM_PROMPT,
+            json.dumps(headings_payload),
+            MODEL_LLM2
+        ),
+        "LLM-2"
+    ).get("selected_chunk_ids", [])
+    # The model may return null, a bare string, or nested values; keep only
+    # hashable scalar ids so the set below cannot raise.
+    if not isinstance(selected_ids, list):
+        selected_ids = []
+    # Built once rather than once per chunk.
+    wanted_ids = {i for i in selected_ids if isinstance(i, (str, int))}
+    # NOTE(review): only the heading of each selected chunk is forwarded to
+    # LLM-4, not the chunk's body text — confirm whether the policy text should
+    # be included so the model can answer from it.
+    selected_chunks = [
+        h for h in headings if h["chunk_id"] in wanted_ids
+    ][:6]  # hard cap on how many chunks reach LLM-4
+    # ======================================================
+    # LLM-4: FINAL REGULATORY + FACTUAL ANSWER
+    # ======================================================
+    llm4_payload = {
+        "loan_summary": loan_summary,
+        "fdic_section_3_2": selected_chunks,
+        "user_question": user_text
+    }
+    answer = call_llm(
+        LLM4_SYSTEM_PROMPT,
+        json.dumps(llm4_payload),
+        MODEL_LLM4,
+        temperature=0.2
+    )
+    return ocr_text, answer
+# =========================================================
+# 12. GRADIO UI
+# =========================================================
+def chat_handler(user_text, uploaded_files, chat_history):
+    """Gradio click handler: answer the question and extend the chat history.
+
+    Args:
+        user_text: Question typed by the user.
+        uploaded_files: Files attached to this request, if any.
+        chat_history: Existing list of role/content message dicts, or None.
+
+    Returns:
+        The chat history with the user message and the assistant reply appended.
+    """
+    chat_history = chat_history or []
+    try:
+        _, answer = process_request(user_text, uploaded_files)
+    except ValueError:
+        # safe_json_loads raises ValueError when a model reply has no usable
+        # JSON; report that in the chat instead of failing the whole event.
+        answer = "⚠️ The model returned an unreadable response. Please try again."
+    chat_history.append({"role": "user", "content": user_text})
+    chat_history.append({"role": "assistant", "content": answer})
+    return chat_history
+# Build the UI: a chat pane, an optional multi-file upload, a question box and
+# a Send button wired to chat_handler.
+with gr.Blocks(title="Regulatory Loan Evaluation Assistant") as demo:
+    gr.Markdown("## 📄 Regulatory Loan Evaluation Assistant")
+    chat = gr.Chatbot(height=450)
+    files = gr.File(
+        label="Upload Loan Documents (Optional)",
+        file_types=[".pdf", ".png", ".jpg", ".jpeg"],
+        file_count="multiple"
+    )
+    user_input = gr.Textbox(placeholder="Ask a regulatory or loan question")
+    # The chat component is both an input (current history) and the output.
+    # NOTE(review): chat_handler appends role/content dicts; confirm the pinned
+    # Gradio version's Chatbot accepts that message format.
+    gr.Button("Send").click(
+        fn=chat_handler,
+        inputs=[user_input, files, chat],
+        outputs=[chat]
+    )
+# Starts the server as soon as the module is executed.
+demo.launch()

fdic_section_3_2_chunks_refined.json ADDED Viewed

The diff for this file is too large to render. See raw diff

packages.txt ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ tesseract-ocr
2	+ poppler-utils

requirements.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+gradio>=4.0
+pytesseract
+pdf2image
+pillow
+python-dotenv
+groq
+opencv-python-headless