"""
Stage 5: Gradio UI — Enhanced
──────────────────────────────
Hallucination Detection & Prompt Remediation System
  - RoBERTa classifier → overall hallucination signal
  - DeBERTa NLI → sentence-level contradiction scoring
  - Rule-based engine → 3-tier corrective prompt generation

Run: python app.py
"""

import html
import importlib
import json
import os
import subprocess
import sys
import tempfile
from datetime import datetime

import gradio as gr
import spacy
import torch
import torch.nn.functional as F
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# ── Model loading ─────────────────────────────────────────────────────────────
ROBERTA_REPO = "JBond07/hallucination-detector-roberta"
NLI_REPO = "cross-encoder/nli-deberta-v3-small"
LOCAL_MODEL = "models/final_model"
LOCAL_TOK = "models/tokenizer_saved"
SPACY_MODEL = "en_core_web_sm"

# Number of past analyses kept for the "Recent Tests" panel.
MAX_HISTORY = 5


def _load_spacy_model(name=SPACY_MODEL):
    """Load the spaCy pipeline *name*, downloading it only when it is missing.

    The download runs in a subprocess with ``check=True`` so that a failed
    download surfaces as ``subprocess.CalledProcessError`` instead of being
    silently ignored.
    """
    try:
        return spacy.load(name)
    except OSError:
        subprocess.run(
            [sys.executable, "-m", "spacy", "download", name],
            capture_output=True,
            check=True,
        )
        # Make the freshly installed package visible to this process.
        importlib.invalidate_caches()
        return spacy.load(name)


def _nli_label_index(config):
    """Return ``{label_name: logit_column}`` for the NLI checkpoint.

    The mapping is read from ``config.id2label`` so the scores cannot be
    mislabelled when a checkpoint orders its classes differently. When the
    config does not name all three classes, the order documented for the
    cross-encoder NLI checkpoints (contradiction, entailment, neutral) is used.
    """
    fallback = {"contradiction": 0, "entailment": 1, "neutral": 2}
    id2label = getattr(config, "id2label", None) or {}
    found = {str(label).lower(): int(idx) for idx, label in id2label.items()}
    if all(name in found for name in fallback):
        return {name: found[name] for name in fallback}
    return fallback


print("Loading models...")

# Prefer locally saved artifacts; otherwise pull from the hub repo.
tok_src = LOCAL_TOK if os.path.exists(LOCAL_TOK) else ROBERTA_REPO
mod_src = LOCAL_MODEL if os.path.exists(LOCAL_MODEL) else ROBERTA_REPO

tokenizer = AutoTokenizer.from_pretrained(tok_src)
model = AutoModelForSequenceClassification.from_pretrained(mod_src)
nli_tokenizer = AutoTokenizer.from_pretrained(NLI_REPO)
nli_model = AutoModelForSequenceClassification.from_pretrained(NLI_REPO)
nlp = _load_spacy_model()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device).eval()
nli_model.to(device).eval()
NLI_LABEL_INDEX = _nli_label_index(nli_model.config)

print(f"✓ All models ready on {device}")

# ── History store ─────────────────────────────────────────────────────────────
# NOTE(review): module-level, so it is shared by every session of the app.
history = []

# ── Helpers ───────────────────────────────────────────────────────────────────
PRONOUNS = {"it", "they", "he", "she", "this", "that", "its", "their"}


def resolve_pronouns(sentence, subject):
    """Replace a sentence-initial pronoun with *subject*.

    Only the first whitespace-separated token is examined. Trailing
    punctuation (``. , ! ?``) on that token is kept. The sentence is returned
    unchanged when it is empty, when *subject* is empty, or when the first
    token is not in ``PRONOUNS``.
    """
    tokens = sentence.split()
    if not tokens or not subject:
        return sentence
    first = tokens[0]
    core = first.rstrip(".,!?")
    if core.lower() in PRONOUNS:
        tokens[0] = subject + first[len(core):]
    return " ".join(tokens)


def extract_subject(doc):
    """Pick one token of *doc* to stand in for later pronouns.

    Preference order: first ``nsubj`` dependency, first proper noun, first
    noun, first token. Returns ``""`` for an empty doc.
    """
    for t in doc:
        if t.dep_ == "nsubj":
            return t.text
    for t in doc:
        if t.pos_ == "PROPN":
            return t.text
    for t in doc:
        if t.pos_ == "NOUN":
            return t.text
    return doc[0].text if doc else ""


def get_overall_prob(context, question, response):
    """Return the classifier's softmax probability for class index 1.

    The pair ``(context, question + " " + response)`` is truncated/padded to
    512 tokens. NOTE(review): index 1 is treated as "hallucinated"; confirm
    against the label order used when the classifier was trained.
    """
    inputs = tokenizer(
        context,
        question + " " + response,
        truncation=True,
        max_length=512,
        padding="max_length",
        return_tensors="pt",
    ).to(device)
    with torch.no_grad():
        probs = torch.softmax(model(**inputs).logits, dim=1)
    return probs[0][1].item()


def get_nli_scores(context, hypothesis):
    """Score *hypothesis* against *context* with the NLI cross-encoder.

    Returns a dict with keys ``contradiction``, ``neutral`` and
    ``entailment``; each value is a softmax probability rounded to 4 places.
    """
    inputs = nli_tokenizer(
        context,
        hypothesis,
        truncation=True,
        max_length=512,
        return_tensors="pt",
    ).to(device)
    with torch.no_grad():
        probs = F.softmax(nli_model(**inputs).logits, dim=1)
    row = probs[0]
    # Columns are looked up by label name rather than by assumed position.
    return {
        name: round(row[NLI_LABEL_INDEX[name]].item(), 4)
        for name in ("contradiction", "neutral", "entailment")
    }


def flag_sentences(context, question, response, threshold=0.6):
    """Run the overall classifier and the per-sentence NLI check.

    Returns ``(overall_prob, verdict, results)``. ``verdict`` is
    ``"HALLUCINATED"`` when any sentence's contradiction score exceeds
    *threshold*, otherwise ``"FAITHFUL"``; ``overall_prob`` is reported but
    does not influence the verdict. ``results`` holds one dict per sentence
    with keys ``sentence``, ``resolved``, ``scores``, ``flagged`` and
    ``flag_reason``.
    """
    overall_prob = get_overall_prob(context, question, response)

    # Parse once; the same doc supplies both the sentences and the subject.
    doc = nlp(response)
    sentences = [s.text.strip() for s in doc.sents if s.text.strip()]
    subject = extract_subject(doc)

    results = []
    for sent in sentences:
        resolved = resolve_pronouns(sent, subject)
        scores = get_nli_scores(context, resolved)
        flagged = scores["contradiction"] > threshold
        results.append({
            "sentence": sent,
            "resolved": resolved,
            "scores": scores,
            "flagged": flagged,
            "flag_reason": "CONTRADICTION" if flagged else None,
        })

    verdict = "HALLUCINATED" if any(r["flagged"] for r in results) else "FAITHFUL"
    return overall_prob, verdict, results


def generate_remediation(context, results):
    """Build the plain-text corrective prompt for the flagged sentences.

    Returns a short "no correction needed" message when nothing is flagged.
    """
    flagged = [r for r in results if r["flagged"]]
    if not flagged:
        return "✅ Response is faithful to the source context. No correction needed."

    bullets = "\n".join(
        f' • "{r["sentence"]} [{r["flag_reason"]}]"' for r in flagged
    )
    instructions = (
        "- Correct the contradictory claims listed above\n"
        "- Use only facts explicitly present in the context\n"
        "- Do not infer, guess, or add outside knowledge\n"
        "- If information is missing, explicitly state it is not available\n"
        "- Be factually precise and grounded to the source"
    )
    return (
        f"🚨 The following sentence(s) contradict the source context:\n\n"
        f"Flagged content:\n{bullets}\n\n"
        f"Please re-answer using ONLY the following context:\n\n"
        f'"""{context}"""\n\n'
        f"Instructions:\n{instructions}"
    )


# ── Output builders ───────────────────────────────────────────────────────────
# NOTE(review): the HTML markup in this section was reconstructed around the
# style variables the code computes; compare against the intended design.

def _placeholder_html(text):
    """Return *text* as a muted grey paragraph (used for empty panels)."""
    return f"<p style='color:#888;'>{html.escape(text)}</p>"


def build_verdict_html(overall_prob, verdict):
    """Render the verdict card: icon, verdict, probability and a bar."""
    hallucinated = verdict == "HALLUCINATED"
    pct = round(overall_prob * 100, 1)
    color = "#ef4444" if hallucinated else "#22c55e"
    icon = "🚨" if hallucinated else "✅"
    # The bar shows risk for a hallucinated verdict, confidence otherwise.
    bar_pct = round(pct if hallucinated else 100 - pct, 1)
    bar_color = "#ef4444" if hallucinated else "#22c55e"
    caption = (
        "High hallucination risk detected"
        if hallucinated
        else "Response appears grounded in context"
    )
    return f"""
<div style="padding:16px; border:2px solid {color}; border-radius:12px; background:#1a1a2e;">
  <div style="font-size:1.5em; font-weight:700; color:{color};">{icon} {verdict}</div>
  <div style="margin-top:8px; color:#ddd;">Hallucination Probability: <b>{pct}%</b></div>
  <div style="margin-top:8px; height:10px; border-radius:5px; background:#333;">
    <div style="width:{bar_pct}%; height:100%; border-radius:5px; background:{bar_color};"></div>
  </div>
  <div style="margin-top:8px; font-size:0.9em; color:#aaa;">{caption}</div>
</div>
"""


def build_breakdown_html(results):
    """Render one card per analysed sentence with its NLI scores.

    Sentence text originates from user input and is HTML-escaped before it is
    embedded in the markup.
    """
    rows = []
    for r in results:
        s = r["scores"]
        flagged = r["flagged"]
        icon = "🚨" if flagged else "✓"
        bg = "#3b0000" if flagged else "#0a2a0a"
        border = "#ef4444" if flagged else "#22c55e"
        if flagged:
            reason = html.escape(str(r["flag_reason"]))
            label = f'<span style="color:#ef4444; font-weight:bold;">[{reason}]</span>'
        else:
            label = '<span style="color:#22c55e;">Supported</span>'
        e_color = "#22c55e" if s["entailment"] > 0.5 else "#aaa"
        c_color = "#ef4444" if s["contradiction"] > 0.3 else "#aaa"

        sentence = html.escape(r["sentence"])
        resolved_note = ""
        if r["sentence"] != r["resolved"]:
            resolved = html.escape(r["resolved"])
            resolved_note = (
                '<div style="margin-top:4px; font-size:0.85em; color:#aaa;">'
                f'📝 Resolved: "{resolved}"'
                "</div>"
            )

        rows.append(f"""
<div style="margin:8px 0; padding:10px; border-left:4px solid {border}; border-radius:6px; background:{bg};">
  <div style="color:#eee;">{icon} {sentence}</div>
  {resolved_note}
  <div style="margin-top:6px; font-size:0.85em;">
    <span style="color:{e_color};">Entailment: {s['entailment']}</span>
    <span style="color:#aaa;">Neutral: {s['neutral']}</span>
    <span style="color:{c_color};">Contradiction: {s['contradiction']}</span>
    {label}
  </div>
</div>
""")

    flagged_count = sum(1 for r in results if r["flagged"])
    body = "".join(rows)
    return f"""
<div>
  <div style="margin-bottom:8px; color:#aaa;">
    {len(results)} sentence(s) analyzed — {flagged_count} flagged
  </div>
  {body}
</div>
"""


def build_history_html():
    """Render the most recent analyses from ``history``, newest first.

    At most ``MAX_HISTORY`` entries are shown. The response snippet is
    HTML-escaped and an ellipsis is added only when it was actually cut.
    """
    if not history:
        return _placeholder_html("No history yet.")

    rows = []
    for h in reversed(history[-MAX_HISTORY:]):
        hallucinated = h["verdict"] == "HALLUCINATED"
        icon = "🚨" if hallucinated else "✅"
        color = "#ef4444" if hallucinated else "#22c55e"
        response = h["response"]
        snippet = html.escape(response[:80])
        if len(response) > 80:
            snippet += "..."
        rows.append(f"""
<div style="margin:6px 0; padding:8px; border-left:3px solid {color}; border-radius:4px;">
  <div>
    <span style="color:{color}; font-weight:bold;">{icon} {h['verdict']}</span>
    <span style="float:right; font-size:0.8em; color:#888;">{h['time']}</span>
  </div>
  <div style="margin-top:4px; font-size:0.85em; color:#aaa;">{snippet}</div>
</div>
""")
    return "".join(rows)


def build_report(context, question, response, verdict, overall_prob,
                 results, remediation):
    """Assemble the downloadable plain-text report for one analysis."""
    lines = [
        "=" * 60,
        "HALLUCINATION DETECTION REPORT",
        f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
        "=" * 60,
        "",
        "CONTEXT:",
        context,
        "",
        "QUESTION:",
        question or "(none)",
        "",
        "RESPONSE:",
        response,
        "",
        "-" * 60,
        f"VERDICT: {verdict}",
        f"HALLUCINATION PROBABILITY: {round(overall_prob * 100, 1)}%",
        "",
        "SENTENCE ANALYSIS:",
    ]
    for r in results:
        status = "FLAGGED" if r["flagged"] else "OK"
        lines.append(
            f" [{status}] {r['sentence']}\n"
            f" E={r['scores']['entailment']} "
            f"N={r['scores']['neutral']} "
            f"C={r['scores']['contradiction']}"
        )
    lines += ["", "-" * 60, "CORRECTIVE PROMPT:", remediation, "=" * 60]
    return "\n".join(lines)


# ── Main analyze function ─────────────────────────────────────────────────────
def analyze(context, question, response):
    """Gradio handler: analyse *response* against *context*.

    Returns a 5-tuple matching the wired outputs: verdict HTML, breakdown
    HTML, corrective prompt text, history HTML and the report file path
    (``None`` when context or response is blank).
    """
    context = context or ""
    question = question or ""
    response = response or ""

    if not context.strip() or not response.strip():
        empty = _placeholder_html("Waiting for input...")
        # Keep showing earlier runs instead of blanking the history panel.
        return empty, empty, "", build_history_html(), None

    if not question.strip():
        question = "What does the response claim?"

    overall_prob, verdict, results = flag_sentences(context, question, response)
    remediation = generate_remediation(context, results)

    # History
    history.append({
        "verdict": verdict,
        "response": response,
        "time": datetime.now().strftime("%H:%M:%S"),
    })
    # Only the newest entries are ever displayed; drop the rest.
    del history[:-MAX_HISTORY]

    # Report file
    report_text = build_report(
        context, question, response, verdict, overall_prob, results, remediation
    )
    # A unique file per run keeps concurrent requests from overwriting each
    # other's report; UTF-8 is explicit because the report contains emoji.
    # The file is left on disk (delete=False) so Gradio can serve it.
    with tempfile.NamedTemporaryFile(
        "w",
        encoding="utf-8",
        prefix="hallucination_report_",
        suffix=".txt",
        delete=False,
    ) as f:
        f.write(report_text)
        report_path = f.name

    return (
        build_verdict_html(overall_prob, verdict),
        build_breakdown_html(results),
        remediation,
        build_history_html(),
        report_path,
    )


# ── Gradio UI ─────────────────────────────────────────────────────────────────
# NOTE(review): CSS is defined but never handed to Gradio, so it currently has
# no effect — confirm whether it should be passed alongside the theme.
CSS = """
.gradio-container { background: #0f0f1a !important; }
.gr-button-primary { background: #6366f1 !important; border: none !important; }
footer { display: none !important; }
"""

INTRO_MD = (
    "# 🔍 Hallucination Detection & Prompt Remediation\n"
    "\n"
    "**Checks whether an LLM response is grounded in the source context.**\n"
    "Flags contradictory sentences, explains why, and generates a corrective prompt.\n"
)

EXAMPLES = [
    [
        "The Eiffel Tower is located in Paris, France. It was built in 1889.",
        "Where is the Eiffel Tower and when was it built?",
        "The Eiffel Tower is in Paris. It was constructed in 1799. "
        "It is the tallest structure in Europe.",
    ],
    [
        "Python was created by Guido van Rossum and released in 1991.",
        "Who created Python and when?",
        "Python was created by Guido van Rossum. It was released in 1991.",
    ],
    [
        "The Amazon River flows through Brazil and is the largest river by discharge.",
        "What is the Amazon River known for?",
        "The Amazon River is the largest river in the world. It is also the longest.",
    ],
    [
        "Albert Einstein was born in Ulm, Germany in 1879. "
        "He developed the theory of relativity.",
        "Where was Einstein born and what is he known for?",
        "Einstein was born in Berlin. He is known for inventing the telephone.",
    ],
]

with gr.Blocks(title="Hallucination Detector") as demo:
    gr.Markdown(INTRO_MD)

    with gr.Row():
        # ── Left: Inputs ──
        with gr.Column(scale=1):
            gr.Markdown("### 📥 Input")
            context_input = gr.Textbox(
                label="📄 Source Context",
                placeholder="Paste the source document or reference context here...",
                lines=6,
            )
            question_input = gr.Textbox(
                label="❓ Question (optional)",
                placeholder=(
                    "What question was the LLM answering? "
                    "Leave blank if not applicable."
                ),
                lines=2,
            )
            response_input = gr.Textbox(
                label="🤖 LLM Response to Check",
                placeholder="Paste the LLM-generated response here...",
                lines=6,
            )
            analyze_btn = gr.Button(
                "🔍 Analyze Response", variant="primary", size="lg"
            )

        # ── Right: Outputs ──
        with gr.Column(scale=1):
            gr.Markdown("### 📊 Results")
            verdict_output = gr.HTML(
                label="Overall Verdict",
                value=_placeholder_html("Run analysis to see verdict."),
            )
            breakdown_output = gr.HTML(
                label="Sentence-Level Breakdown",
                value=_placeholder_html("Sentence analysis will appear here."),
            )

    with gr.Row():
        with gr.Column(scale=2):
            gr.Markdown("### 🛠️ Corrective Prompt")
            gr.Markdown(
                "<small>"
                "Copy this prompt and paste it back into your LLM "
                "to get a grounded response."
                "</small>"
            )
            remediation_output = gr.Textbox(
                label="",
                interactive=False,
                lines=10,
            )
        with gr.Column(scale=1):
            gr.Markdown("### 📥 Export & History")
            download_btn = gr.File(label="⬇️ Download Report", interactive=False)
            gr.Markdown("### 🕒 Recent Tests")
            history_output = gr.HTML(
                value=_placeholder_html("No history yet."),
            )

    # ── Examples ──
    gr.Markdown("### 💡 Try These Examples")
    gr.Examples(
        examples=EXAMPLES,
        inputs=[context_input, question_input, response_input],
        label="",
    )

    # ── Wiring ──
    analyze_btn.click(
        fn=analyze,
        inputs=[context_input, question_input, response_input],
        outputs=[
            verdict_output,
            breakdown_output,
            remediation_output,
            history_output,
            download_btn,
        ],
    )

if __name__ == "__main__":
    demo.launch(theme=gr.themes.Soft())