halilolcay committed on
Commit
b2b69aa
·
verified ·
1 Parent(s): 825a0a4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +81 -156
app.py CHANGED
@@ -1,8 +1,9 @@
1
-
2
  import warnings
3
  import json
4
  import torch
5
  import random
 
 
6
  from transformers import pipeline
7
  from datasets import load_dataset
8
  from sentence_transformers import SentenceTransformer, util
@@ -11,51 +12,18 @@ from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_sc
11
  warnings.filterwarnings("ignore")
12
 
13
  # ============================================================================
14
- # 1.DATA
15
  # ============================================================================
16
  device = "mps" if torch.backends.mps.is_available() else "cpu"
17
 
18
- dataset = load_dataset(
19
- "UTAustin-AIHealth/MedHallu",
20
- "pqa_labeled",
21
- split="train",
22
- streaming=True
23
- )
24
-
25
- data_pool = list(dataset.take(200))
26
- samples = random.sample(data_pool, 30)
27
-
28
- # ============================================================================
29
- # 2. MODELS
30
- # ============================================================================
31
-
32
- nli_model = pipeline(#mantık
33
- "text-classification",
34
- model="pritamdeka/PubMedBERT-MNLI-MedNLI",
35
- device=device,
36
- truncation=True,
37
- max_length=512
38
- )
39
-
40
- sim_model = SentenceTransformer("all-MiniLM-L6-v2", device=device)#kapsam
41
-
42
- clf_model = pipeline(#alaka
43
- "text-classification",
44
- model="cross-encoder/ms-marco-MiniLM-L-6-v2",
45
- device=device,
46
- truncation=True
47
- )
48
-
49
- # Instruction-following correction model
50
- correction_llm = pipeline(
51
- "text2text-generation",
52
- model="google/flan-t5-large",
53
- device=device,
54
- max_length=512
55
- )
56
 
57
  # ============================================================================
58
- # 3. DETECTION FUNCTIONS
59
  # ============================================================================
60
  def detect_nli(evidence, answer):
61
  res = nli_model(f"{evidence} [SEP] {answer}")[0]
@@ -69,129 +37,86 @@ def detect_similarity(evidence, answer):
69
  def detect_uncertainty(evidence, answer):
70
  return clf_model(f"{evidence} [SEP] {answer}")[0]["score"]
71
 
72
- # ============================================================================
73
- # 4. CORRECTION PROMPT
74
- # ============================================================================
75
  def build_correction_prompt(query, wrong, truth):
76
- return f"""
77
- You are a board-certified medical doctor.
78
-
79
- A previous AI answer contains a clinical error.
80
-
81
- QUESTION:
82
- {query}
83
-
84
- INCORRECT ANSWER:
85
- {wrong}
86
-
87
- VERIFIED MEDICAL EVIDENCE:
88
- {truth}
89
-
90
- TASK:
91
- 1. Briefly explain why the original answer is incorrect.
92
- 2. Provide the corrected, clinically accurate answer.
93
- """
94
 
95
  def generate_correction(prompt):
96
  return correction_llm(prompt)[0]["generated_text"]
97
 
98
  # ============================================================================
99
- # 5. EVALUATION LOOP
100
  # ============================================================================
101
- results = []
102
- y_true, y_pred = [], []
103
-
104
- for i, sample in enumerate(samples):
105
- evidence = " ".join(sample["Knowledge"])
106
- query = sample["Question"]
107
- hallucinated = sample["Hallucinated Answer"]
108
- factual = sample["Ground Truth"]
109
-
110
- # Balanced evaluation
111
- if i % 2 == 0:
112
- llm_answer = hallucinated
113
- label = 1
114
- else:
115
- llm_answer = factual
116
- label = 0
117
-
118
-
119
- nli_label, _ = detect_nli(evidence, llm_answer)
120
- sim_score = detect_similarity(evidence, llm_answer)
121
- unc_score = detect_uncertainty(evidence, llm_answer)
122
-
123
- detected = 0
124
- reason = "Consistent with evidence"
125
-
126
- # Safety-first but calibrated thresholds
127
- if nli_label == "contradiction":
128
- detected = 1
129
- reason = "Logical contradiction with medical evidence"
130
- elif sim_score < 0.30:
131
  detected = 1
132
- reason = "Semantic drift from clinical context"
133
- elif unc_score < 0.25:
134
- detected = 1
135
- reason = "Low relevance / high uncertainty"
136
-
137
- y_true.append(label)
138
- y_pred.append(detected)
139
-
140
- correction = None
141
- if detected:
142
- prompt = build_correction_prompt(query, llm_answer, factual)
143
- corrected_answer = generate_correction(prompt)
144
- correction = {
145
- "physician_prompt": prompt,
146
- "llm_corrected_answer": corrected_answer
147
- }
148
-
149
- results.append({
150
- "case_id": i + 1,
151
- "query": query,
152
- "llm_original_answer": llm_answer,
153
- "ground_truth_answer": factual,
154
- "detection": {
155
- "label": label,
156
- "prediction": detected,
157
- "reason": reason,
158
- "signals": {
159
- "nli": nli_label,
160
- "similarity": round(sim_score, 3),
161
- "uncertainty": round(unc_score, 3)
162
- }
163
- },
164
- "correction": correction
165
- })
166
-
167
- print(f"Case {i+1:02}: {'⚠️ Hallucination' if detected else '✅ Factual'}")
168
 
169
  # ============================================================================
170
- # 6. METRICS
171
  # ============================================================================
172
- acc = accuracy_score(y_true, y_pred)
173
- prec = precision_score(y_true, y_pred)
174
- rec = recall_score(y_true, y_pred)
175
- f1 = f1_score(y_true, y_pred)
176
- cm = confusion_matrix(y_true, y_pred)
177
-
178
- print("\n=== FINAL RESULTS ===")
179
- print(f"Accuracy : {acc:.3f}")
180
- print(f"Precision: {prec:.3f}")
181
- print(f"Recall : {rec:.3f}")
182
- print(f"F1-score : {f1:.3f}")
183
- print("Confusion Matrix:\n", cm)
184
-
185
- with open("final_clinical_hallucination_results.json", "w") as f:
186
- json.dump({
187
- "metrics": {
188
- "accuracy": acc,
189
- "precision": prec,
190
- "recall": rec,
191
- "f1": f1,
192
- "confusion_matrix": cm.tolist()
193
- },
194
- "results": results
195
- }, f, indent=2)
196
-
197
- print("\n✓ FINAL audit complete. Results saved.")
 
 
1
  import warnings
2
  import json
3
  import torch
4
  import random
5
+ import os
6
+ import gradio as gr
7
  from transformers import pipeline
8
  from datasets import load_dataset
9
  from sentence_transformers import SentenceTransformer, util
 
12
  warnings.filterwarnings("ignore")
13
 
14
  # ============================================================================
15
+ # 1. INITIALIZATION & MODELS
16
  # ============================================================================
17
# Prefer the Apple-Silicon GPU (MPS) when available; otherwise run on CPU.
device = "mps" if torch.backends.mps.is_available() else "cpu"

print("[INFO] Loading Expert Models...")

# NLI model: judges entailment/contradiction between evidence and answer.
# truncation/max_length guard against clinical evidence longer than the
# model's 512-token limit — without them long inputs raise a runtime error
# (the previous revision of this file set these explicitly).
nli_model = pipeline(
    "text-classification",
    model="pritamdeka/PubMedBERT-MNLI-MedNLI",
    device=device,
    truncation=True,
    max_length=512,
)

# Sentence embeddings used for the semantic-similarity ("coverage") signal.
sim_model = SentenceTransformer("all-MiniLM-L6-v2", device=device)

# Cross-encoder relevance score used as the uncertainty signal; truncation
# restored for the same over-length reason as nli_model.
clf_model = pipeline(
    "text-classification",
    model="cross-encoder/ms-marco-MiniLM-L-6-v2",
    device=device,
    truncation=True,
)

# Instruction-following model that generates the corrected answer text.
correction_llm = pipeline(
    "text2text-generation",
    model="google/flan-t5-large",
    device=device,
    max_length=512,
)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
 
25
  # ============================================================================
26
+ # 2. CORE FUNCTIONS
27
  # ============================================================================
28
  def detect_nli(evidence, answer):
29
  res = nli_model(f"{evidence} [SEP] {answer}")[0]
 
37
def detect_uncertainty(evidence, answer):
    """Return the cross-encoder relevance score for an evidence/answer pair."""
    pair_input = f"{evidence} [SEP] {answer}"
    top_prediction = clf_model(pair_input)[0]
    return top_prediction["score"]
39
 
 
 
 
40
def build_correction_prompt(query, wrong, truth):
    """Compose the instruction prompt asking the LLM to correct a wrong answer.

    The prompt embeds the incorrect answer, the verified ground truth, and the
    original question in a single instruction sentence.
    """
    return (
        "You are a doctor. Explain error in: "
        f"{wrong}. Correct it using: {truth} for Question: {query}"
    )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
 
43
def generate_correction(prompt):
    """Run the correction LLM on *prompt* and return only the generated text."""
    outputs = correction_llm(prompt)
    return outputs[0]["generated_text"]
45
 
46
  # ============================================================================
47
+ # 3. THE AUDIT ENGINE (Main Logic for Gradio)
48
  # ============================================================================
49
def run_clinical_audit():
    """Run the 30-case hallucination audit and return (summary_text, json_path).

    Streams MedHallu samples, alternates hallucinated/factual answers to build
    a balanced label set, flags hallucinations with three detectors (NLI
    contradiction, embedding similarity, cross-encoder relevance), generates a
    correction for each flagged case, and writes metrics plus per-case results
    to a JSON file that the Gradio UI offers for download.

    Returns:
        tuple[str, str]: human-readable status summary and the JSON file name.
    """
    # Load dataset lazily (streaming) and materialize a fixed pool so random
    # sampling stays cheap and bounded.
    dataset = load_dataset("UTAustin-AIHealth/MedHallu", "pqa_labeled", split="train", streaming=True)
    data_pool = list(dataset.take(100))
    # Guard: random.sample raises ValueError if the stream yielded < 30 rows.
    samples = random.sample(data_pool, min(30, len(data_pool)))

    results = []
    y_true, y_pred = [], []

    for i, sample in enumerate(samples):
        # NOTE(review): assumes "Knowledge" is a list of strings — confirm schema.
        evidence = " ".join(sample["Knowledge"])
        query = sample["Question"]
        factual = sample["Ground Truth"]

        # Alternate hallucinated/factual answers so labels stay balanced.
        label = 1 if i % 2 == 0 else 0
        llm_answer = sample["Hallucinated Answer"] if label == 1 else factual

        # Three independent detection signals.
        nli_label, _ = detect_nli(evidence, llm_answer)
        sim_score = detect_similarity(evidence, llm_answer)
        unc_score = detect_uncertainty(evidence, llm_answer)

        detected = 0
        reason = "Consistent"
        # Safety-first rule: any single tripped signal flags the answer.
        if nli_label == "contradiction" or sim_score < 0.30 or unc_score < 0.25:
            detected = 1
            reason = "Hallucination Detected"

        y_true.append(label)
        y_pred.append(detected)

        correction = None
        if detected:
            prompt = build_correction_prompt(query, llm_answer, factual)
            correction = {"corrected": generate_correction(prompt)}

        results.append({
            "case_id": i + 1,
            "query": query,
            "detection": {"label": label, "prediction": detected, "reason": reason},
            "correction": correction,
        })

    # Aggregate metrics. Precision is reported again (the module imports
    # precision_score and the previous revision included it); zero_division=0
    # makes recall/precision/F1 deterministic when a class is absent instead
    # of emitting an undefined-metric warning.
    metrics = {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": precision_score(y_true, y_pred, zero_division=0),
        "recall": recall_score(y_true, y_pred, zero_division=0),
        "f1": f1_score(y_true, y_pred, zero_division=0),
    }

    # Persist everything so the Gradio File component can serve it.
    file_name = "final_clinical_hallucination_results.json"
    with open(file_name, "w") as f:
        json.dump({"metrics": metrics, "results": results}, f, indent=2)

    return f"✅ Audit Complete!\nAccuracy: {metrics['accuracy']:.2f}\nRecall: {metrics['recall']:.2f}", file_name
 
 
 
 
 
 
106
 
107
  # ============================================================================
108
+ # 4. GRADIO INTERFACE (To see and download file)
109
  # ============================================================================
110
# Assemble the dashboard: a single button launches the audit; results land in
# a status textbox plus a downloadable JSON file.
with gr.Blocks() as demo:
    gr.Markdown("# 🩺 Healthcare LLM Hallucination Audit System")
    gr.Markdown("Click the button below to start the 30-case randomized clinical evaluation.")

    with gr.Row():
        start_button = gr.Button("🚀 Start Clinical Audit", variant="primary")

    status_box = gr.Textbox(label="Status & Summary")
    download_box = gr.File(label="📥 Download Result JSON")

    # Wire the button to the audit engine: no inputs, two outputs.
    start_button.click(
        fn=run_clinical_audit,
        inputs=None,
        outputs=[status_box, download_box],
    )

demo.launch()