# NOTE: the three banner lines that appeared here ("Spaces:" and two
# "Configuration error" lines) were extraction artifacts, not program code.
| #!/usr/bin/env python | |
| # coding: utf-8 | |
| # ========================================================= | |
| # 1. IMPORTS & ENV | |
| # ========================================================= | |
| import os | |
| import json | |
| import re | |
| import hashlib | |
| from dotenv import load_dotenv | |
| from PIL import Image | |
| import gradio as gr | |
| import pytesseract | |
| from pdf2image import convert_from_path | |
| from groq import Groq | |
| load_dotenv() | |
# =========================================================
# 2. LOAD FDIC SECTION 3.2 ONCE (GLOBAL)
# =========================================================
# Loaded once at import time. Each chunk is expected to carry a
# "chunk_id" plus a "subtopic"/"title" heading (see usage in
# process_request below). Raises at startup if the file is missing.
with open("data/fdic_section_3_2_chunks_refined.json") as f:
    FDIC_CHUNKS = json.load(f)
# =========================================================
# 3. GROQ CLIENT & MODELS
# =========================================================
# Groq API client; key comes from the environment (loaded via dotenv above).
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
MODEL_LLM1 = "llama-3.1-8b-instant"  # OCR -> loan-summary extraction
MODEL_LLM2 = "llama-3.1-8b-instant"  # FDIC heading/topic indexing
MODEL_LLM4 = "meta-llama/llama-4-scout-17b-16e-instruct"  # final reasoning
# =========================================================
# 4. SESSION STATE
# =========================================================
# Module-level session memory: the most recent OCR text and extracted
# loan summary, so follow-up questions can be answered without re-upload.
# NOTE(review): this is shared process-wide, so concurrent users of the
# Gradio app would overwrite each other's state — confirm single-user
# deployment is intended.
SESSION_STATE = {
    "ocr_text": "",
    "loan_summary": None
}
# Cache of OCR output keyed by file_hash() (MD5 of a file's first 1 MB),
# so re-uploading the same document skips the expensive OCR pass.
OCR_CACHE = {}
# =========================================================
# 5. GUARDRAILS
# =========================================================
# Keyword blocklist used to refuse clearly off-topic questions.
NON_LOAN_KEYWORDS = [
    "movie", "music", "sports", "weather", "joke", "recipe",
    "health", "cold", "fever", "doctor", "medicine",
    "politics", "election"
]


def sanitize_user_input(text):
    """Trim surrounding whitespace and cap the question at 5000 chars.

    Falsy input (None, "") yields an empty string.
    """
    if not text:
        return ""
    return text.strip()[:5000]


def is_non_loan_question(text):
    """Return True when the question contains any blocklisted keyword."""
    lowered = text.lower()
    for keyword in NON_LOAN_KEYWORDS:
        if keyword in lowered:
            return True
    return False
# =========================================================
# 6. SAFE JSON PARSER
# =========================================================
def safe_json_loads(text, stage):
    """Extract and parse the first JSON object from an LLM response.

    Strips markdown code fences, then decodes the first ``{...}`` object
    using a raw decoder so trailing prose (even prose containing ``}``)
    after the object does not break parsing. The previous greedy regex
    ``\\{.*\\}`` captured from the first ``{`` to the LAST ``}``, which
    failed whenever the model appended text after the JSON.

    Args:
        text: Raw LLM output, possibly fenced or surrounded by prose.
        stage: Label (e.g. "LLM-1") used in error messages.

    Returns:
        The parsed JSON object.

    Raises:
        ValueError: If *text* is empty, contains no JSON object, or the
            object is malformed (invalid JSON is re-raised as ValueError
            with the original decode error chained).
    """
    if not text:
        raise ValueError(f"{stage} returned empty response")
    text = re.sub(r"```json|```", "", text).strip()
    start = text.find("{")
    if start == -1:
        raise ValueError(f"{stage} returned no JSON:\n{text}")
    try:
        obj, _end = json.JSONDecoder().raw_decode(text, start)
    except json.JSONDecodeError as err:
        raise ValueError(f"{stage} returned invalid JSON:\n{text}") from err
    return obj
# =========================================================
# 7. OCR HELPERS
# =========================================================
MAX_PAGES = 5  # cap on how many PDF pages are rasterized and OCR'd


def file_hash(path, max_bytes=1024 * 1024):
    """Return the MD5 hex digest of the first ``max_bytes`` of a file.

    Used purely as an OCR-cache key, not for any security purpose.
    """
    with open(path, "rb") as fh:
        prefix = fh.read(max_bytes)
    return hashlib.md5(prefix).hexdigest()
def ocr_file(path):
    """OCR one PDF or image file and return the extracted text.

    PDFs are rasterized at 200 dpi (first MAX_PAGES pages only); every
    page/image is converted to grayscale ("L") before being passed to
    Tesseract.
    """
    if not path.lower().endswith(".pdf"):
        image = Image.open(path).convert("L")
        return pytesseract.image_to_string(image).strip()
    page_texts = []
    for page in convert_from_path(path, dpi=200)[:MAX_PAGES]:
        page_texts.append(pytesseract.image_to_string(page.convert("L")))
    return "\n".join(page_texts).strip()
def run_ocr_pipeline(uploaded_files):
    """OCR every uploaded file and join the texts with newlines.

    Results are memoized in the module-level OCR_CACHE, keyed by
    file_hash(), so re-uploads of an identical document are free.
    """
    collected = []
    for uploaded in uploaded_files:
        file_path = str(uploaded)
        cache_key = file_hash(file_path)
        if cache_key not in OCR_CACHE:
            OCR_CACHE[cache_key] = ocr_file(file_path)
        collected.append(OCR_CACHE[cache_key])
    return "\n".join(collected)
# =========================================================
# 8. LOAN SCHEMA
# =========================================================
# Placeholder: the real JSON schema describing the extracted loan summary
# was elided from this copy. Both LLM1_SYSTEM_PROMPT and NO_DOC_PROMPT
# interpolate this string, so restore the actual schema before deploying.
LOAN_SCHEMA = """<same as your original schema>"""
# =========================================================
# 9. SYSTEM PROMPTS
# =========================================================
# LLM-1: strict information extraction — OCR text in, schema-conforming
# JSON out. Built as an f-string so LOAN_SCHEMA is baked in at import time.
LLM1_SYSTEM_PROMPT = f"""
You are an information extraction engine for bank loan documents.
Task:
- Extract ONLY facts that are explicitly stated in the text.
- Do NOT infer, assume, normalize, or calculate anything.
- If a value is missing or unclear, use null or "unknown".
Rules:
- Use ONLY the provided OCR text.
- Do NOT add explanations.
- Do NOT reference regulations.
- Output MUST strictly match the schema below.
- Return ONLY valid JSON.
Schema:
{LOAN_SCHEMA}
"""
# LLM-2: heading-only topic indexing — picks relevant FDIC chunk_ids from
# headings alone. Plain (non-f) string, so the literal {...} example below
# is safe from interpolation.
LLM2_SYSTEM_PROMPT = """
You are a regulatory topic indexing assistant.
Inputs:
- A user question
- A list of FDIC RMS Manual Section 3.2 headings with chunk_ids
Task:
- Select ONLY the chunk_ids whose headings are directly relevant
to answering the user question.
- Base your decision ONLY on the heading titles.
- Do NOT interpret or summarize policy text.
Rules:
- Select between 1 and 6 chunk_ids.
- If no headings are relevant, return an empty list.
- Do NOT explain your reasoning.
- Return ONLY valid JSON.
Output format:
{
"selected_chunk_ids": ["string"]
}
"""
# LLM-4: final answering prompt enforcing strict separation between loan
# facts (SOURCE A) and regulatory guidance (SOURCE B).
# NOTE(review): the "β" / "β’" characters inside the string look like
# mojibake for arrows/bullets — confirm the intended glyphs before
# changing them; left byte-identical here.
LLM4_SYSTEM_PROMPT = """
You are a regulatory-aligned loan evaluation assistant.
You are given TWO authoritative sources:
SOURCE A β Loan Summary
β’ Structured facts extracted from uploaded loan documents
β’ This is the ONLY source for borrower name, loan type, interest rate,
amounts, collateral, and other loan-specific details
SOURCE B β FDIC RMS Manual Section 3.2 (Loans)
β’ This is the ONLY source for regulatory objectives, examiner expectations,
loan review systems, risk management, and policy intent
RULES (STRICT):
1. If the user asks for loan details β answer ONLY from SOURCE A
2. If the user asks regulatory or examiner questions β answer ONLY from SOURCE B
3. If the user asks a mixed question β clearly separate:
β’ factual loan details (SOURCE A)
β’ regulatory interpretation (SOURCE B)
4. Do NOT infer or assume missing facts
5. Do NOT use general banking knowledge
6. Do NOT approve, reject, or predict loan outcomes
7. If required information is missing, explicitly state that it is not available
Tone:
Professional, neutral, examiner-style.
No markdown. No speculation.
"""
# Placeholder-summary prompt for when no documents are uploaded.
# NOTE(review): defined but never referenced in this file — process_request
# deliberately skips no-doc extraction; confirm whether this is still needed.
NO_DOC_PROMPT = f"""
You are creating a placeholder loan summary.
Rules:
- Use ONLY the schema provided.
- Do NOT infer or fabricate details.
- Populate fields only if explicitly stated in the user input.
- Otherwise, use null or "unknown".
- Return ONLY valid JSON.
Schema:
{LOAN_SCHEMA}
"""
# =========================================================
# 10. LLM CALL
# =========================================================
def call_llm(system_prompt, user_prompt, model, temperature=0):
    """Send one system+user exchange to the Groq chat API.

    Returns the assistant message content with surrounding whitespace
    stripped. Temperature defaults to 0 for deterministic extraction.
    """
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    response = client.chat.completions.create(
        model=model,
        temperature=temperature,
        messages=messages,
    )
    return response.choices[0].message.content.strip()
# =========================================================
# 11. MAIN LOGIC (FINAL)
# =========================================================
def process_request(user_text, uploaded_files):
    """Run the full pipeline: guardrail -> OCR -> summary -> index -> answer.

    Args:
        user_text: Raw user question; sanitized and length-capped here.
        uploaded_files: Optional list of uploaded file paths. When empty,
            the cached session summary (if any) is reused.

    Returns:
        (ocr_text, answer). On the non-loan guardrail, ocr_text is ""
        and answer is a warning message.
    """
    user_text = sanitize_user_input(user_text)

    # Guardrail: refuse clearly off-topic questions before any LLM call.
    if is_non_loan_question(user_text):
        return "", "⚠️ Only FDIC Section 3.2 loan and regulatory questions are supported."

    # ------------------------------------------------------
    # LLM-1: OCR -> loan summary (only when files were uploaded)
    # ------------------------------------------------------
    if uploaded_files:
        ocr_text = run_ocr_pipeline(uploaded_files)
        loan_summary = safe_json_loads(
            call_llm(LLM1_SYSTEM_PROMPT, ocr_text, MODEL_LLM1),
            "LLM-1",
        )
        SESSION_STATE["ocr_text"] = ocr_text
        SESSION_STATE["loan_summary"] = loan_summary
    else:
        # Follow-up or regulatory-only question: reuse session memory.
        ocr_text = SESSION_STATE.get("ocr_text", "")
        loan_summary = SESSION_STATE.get("loan_summary")
        if loan_summary is None:
            # Deliberately do NOT force a no-doc extraction here; a stub
            # keeps LLM-4 honest about missing loan facts.
            loan_summary = {
                "note": "No loan documents uploaded. Loan-specific facts unavailable."
            }

    # ------------------------------------------------------
    # LLM-2: FDIC Section 3.2 topic indexing (headings only)
    # ------------------------------------------------------
    # Build the heading list once and reuse it for selection below
    # (the original constructed these dicts twice).
    headings = [
        {"chunk_id": c["chunk_id"], "heading": c.get("subtopic") or c.get("title")}
        for c in FDIC_CHUNKS
    ]
    headings_payload = {
        "user_question": user_text,
        "fdic_headings": headings,
    }
    selected_ids = safe_json_loads(
        call_llm(LLM2_SYSTEM_PROMPT, json.dumps(headings_payload), MODEL_LLM2),
        "LLM-2",
    ).get("selected_chunk_ids", [])

    # Hoisted: build the membership set once, not per chunk as before.
    selected_id_set = set(selected_ids)
    selected_chunks = [
        h for h in headings if h["chunk_id"] in selected_id_set
    ][:6]  # hard cap on regulatory context size

    # ------------------------------------------------------
    # LLM-4: final regulatory + factual answer
    # ------------------------------------------------------
    llm4_payload = {
        "loan_summary": loan_summary,
        "fdic_section_3_2": selected_chunks,
        "user_question": user_text,
    }
    answer = call_llm(
        LLM4_SYSTEM_PROMPT,
        json.dumps(llm4_payload),
        MODEL_LLM4,
        temperature=0.2,
    )
    return ocr_text, answer
# =========================================================
# 12. GRADIO UI
# =========================================================
def chat_handler(user_text, uploaded_files, chat_history):
    """Gradio callback: answer one question and append the exchange.

    Mutates and returns the (possibly fresh) chat history as a list of
    {"role", "content"} message dicts.
    """
    chat_history = chat_history or []
    _, answer = process_request(user_text, uploaded_files)
    for role, content in (("user", user_text), ("assistant", answer)):
        chat_history.append({"role": role, "content": content})
    return chat_history
# Declarative Gradio UI. chat_handler emits openai-style role/content
# dicts, so the Chatbot must be in "messages" format — recent Gradio
# versions reject dict entries under the default tuple format.
# NOTE(review): the "π" below looks like mojibake for an emoji in the
# original heading — confirm intended glyph; left unchanged.
with gr.Blocks(title="Regulatory Loan Evaluation Assistant") as demo:
    gr.Markdown("## π Regulatory Loan Evaluation Assistant")
    chat = gr.Chatbot(height=450, type="messages")
    files = gr.File(
        label="Upload Loan Documents (Optional)",
        file_types=[".pdf", ".png", ".jpg", ".jpeg"],
        file_count="multiple"
    )
    user_input = gr.Textbox(placeholder="Ask a regulatory or loan question")
    gr.Button("Send").click(
        fn=chat_handler,
        inputs=[user_input, files, chat],
        outputs=[chat]
    )
demo.launch()