halilolcay commited on
Commit
a7d786a
·
verified ·
1 Parent(s): b801753

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +79 -34
app.py CHANGED
@@ -4,7 +4,7 @@ import torch
4
  import random
5
  import os
6
  import gradio as gr
7
- from transformers import pipeline
8
  from datasets import load_dataset
9
  from sentence_transformers import SentenceTransformer, util
10
  from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
@@ -14,13 +14,31 @@ warnings.filterwarnings("ignore")
14
  # ============================================================================
15
  # 1. INITIALIZATION & MODELS
16
  # ============================================================================
17
- device = "mps" if torch.backends.mps.is_available() else "cpu"
18
 
19
- print("[INFO] Loading Expert Models...")
20
- nli_model = pipeline("text-classification", model="pritamdeka/PubMedBERT-MNLI-MedNLI", device=device,truncation=True, max_length=512)
21
  sim_model = SentenceTransformer("all-MiniLM-L6-v2", device=device)
22
  clf_model = pipeline("text-classification", model="cross-encoder/ms-marco-MiniLM-L-6-v2", device=device, truncation=True, max_length=512)
23
- correction_llm = pipeline("text2text-generation", model="google/flan-t5-large", device=device, max_length=512)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
 
25
  # ============================================================================
26
  # 2. CORE FUNCTIONS
@@ -37,20 +55,41 @@ def detect_similarity(evidence, answer):
37
  def detect_uncertainty(evidence, answer):
38
  return clf_model(f"{evidence} [SEP] {answer}")[0]["score"]
39
 
40
- def build_correction_prompt(query, wrong, truth):
41
- return f"You are a doctor. Explain error in: {wrong}. Correct it using: {truth} for Question: {query}"
42
-
43
- def generate_correction(prompt):
44
- return correction_llm(prompt)[0]["generated_text"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
 
46
  # ============================================================================
47
- # 3. THE AUDIT ENGINE (Main Logic for Gradio)
48
  # ============================================================================
49
  def run_clinical_audit():
50
- # Load Dataset (Streaming)
51
  dataset = load_dataset("UTAustin-AIHealth/MedHallu", "pqa_labeled", split="train", streaming=True)
52
- data_pool = list(dataset.take(100))
53
- samples = random.sample(data_pool, 30)
54
 
55
  results = []
56
  y_true, y_pred = [], []
@@ -59,45 +98,53 @@ def run_clinical_audit():
59
  evidence = " ".join(sample["Knowledge"])
60
  query = sample["Question"]
61
  factual = sample["Ground Truth"]
 
62
 
63
- # Balanced flip
64
  label = 1 if i % 2 == 0 else 0
65
- llm_answer = sample["Hallucinated Answer"] if label == 1 else factual
66
 
67
- # Detection logic
68
  nli_label, _ = detect_nli(evidence, llm_answer)
69
  sim_score = detect_similarity(evidence, llm_answer)
70
  unc_score = detect_uncertainty(evidence, llm_answer)
71
 
72
  detected = 0
73
- reason = "Consistent"
74
  if nli_label == "contradiction" or sim_score < 0.30 or unc_score < 0.25:
75
  detected = 1
76
- reason = "Hallucination Detected"
77
 
78
  y_true.append(label)
79
  y_pred.append(detected)
80
 
81
  correction = None
82
  if detected:
83
- prompt = build_correction_prompt(query, llm_answer, factual)
84
- correction = {"corrected": generate_correction(prompt)}
 
 
 
85
 
86
  results.append({
87
  "case_id": i + 1,
88
  "query": query,
89
- "detection": {"label": label, "prediction": detected, "reason": reason},
 
 
 
 
 
 
 
90
  "correction": correction
91
  })
92
 
93
- # Metrics
94
  metrics = {
95
  "accuracy": accuracy_score(y_true, y_pred),
96
  "recall": recall_score(y_true, y_pred),
97
- "f1": f1_score(y_true, y_pred)
 
98
  }
99
 
100
- # Save File
101
  file_name = "final_clinical_hallucination_results.json"
102
  with open(file_name, "w") as f:
103
  json.dump({"metrics": metrics, "results": results}, f, indent=2)
@@ -105,18 +152,16 @@ def run_clinical_audit():
105
  return f"✅ Audit Complete!\nAccuracy: {metrics['accuracy']:.2f}\nRecall: {metrics['recall']:.2f}", file_name
106
 
107
  # ============================================================================
108
- # 4. GRADIO INTERFACE (To see and download file)
109
  # ============================================================================
110
  with gr.Blocks() as demo:
111
- gr.Markdown("# 🩺 Healthcare LLM Hallucination Audit System")
112
- gr.Markdown("Click the button below to start the 30-case randomized clinical evaluation.")
113
-
114
- with gr.Row():
115
- run_btn = gr.Button("🚀 Start Clinical Audit", variant="primary")
116
 
117
- output_text = gr.Textbox(label="Status & Summary")
118
- output_file = gr.File(label="📥 Download Result JSON")
 
119
 
120
- run_btn.click(fn=run_clinical_audit, inputs=None, outputs=[output_text, output_file])
121
 
122
  demo.launch()
 
4
  import random
5
  import os
6
  import gradio as gr
7
+ from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
8
  from datasets import load_dataset
9
  from sentence_transformers import SentenceTransformer, util
10
  from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
 
14
# ============================================================================
# 1. INITIALIZATION & MODELS
# ============================================================================
# Prefer CUDA, then Apple Silicon MPS, then CPU.
device = "cuda" if torch.cuda.is_available() else ("mps" if torch.backends.mps.is_available() else "cpu")

print("[INFO] Loading Expert Models (NLI, Similarity, Uncertainty)...")
# NLI model: detects contradiction between evidence and answer.
nli_model = pipeline("text-classification", model="pritamdeka/PubMedBERT-MNLI-MedNLI", device=device, truncation=True, max_length=512)
# Sentence embedding model: semantic similarity between evidence and answer.
sim_model = SentenceTransformer("all-MiniLM-L6-v2", device=device)
# Cross-encoder relevance score, used here as an "uncertainty" signal.
clf_model = pipeline("text-classification", model="cross-encoder/ms-marco-MiniLM-L-6-v2", device=device, truncation=True, max_length=512)

# Load Nous-Hermes-2-Mistral-7B-DPO with 4-bit quantization.
print("[INFO] Loading Nous-Hermes-2-Mistral-7B-DPO (4-bit optimized)...")
model_id = "NousResearch/Nous-Hermes-2-Mistral-7B-DPO"

# Critical settings for a free HF Space (16 GB VRAM): NF4 4-bit weights with
# double quantization and fp16 compute keep the 7B model within memory limits.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
# device_map="auto" lets accelerate place layers across devices; do not call
# .to(device) on this model — move INPUTS to correction_model.device instead.
correction_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=quant_config,
    device_map="auto",
)
42
 
43
  # ============================================================================
44
  # 2. CORE FUNCTIONS
 
55
  def detect_uncertainty(evidence, answer):
56
  return clf_model(f"{evidence} [SEP] {answer}")[0]["score"]
57
 
58
def generate_correction(query, wrong, truth):
    """Ask the correction LLM to explain and fix a hallucinated answer.

    Builds a Nous-Hermes-2 ChatML prompt, generates with low temperature,
    and returns only the assistant's reply.

    Args:
        query: The clinical question being audited.
        wrong: The (suspected hallucinated) answer to correct.
        truth: Verified evidence / ground-truth text to correct against.

    Returns:
        The model's correction as a plain string.
    """
    # Nous-Hermes-2 ChatML format.
    prompt = f"""<|im_start|>system
You are a board-certified medical doctor. Analyze the clinical error and provide a fix based ONLY on verified evidence.<|im_end|>
<|im_start|>user
QUESTION: {query}
INCORRECT ANSWER: {wrong}
VERIFIED EVIDENCE: {truth}

TASK:
1. Explain why the answer is incorrect.
2. Provide the clinically accurate correction.<|im_end|>
<|im_start|>assistant
"""
    # BUG FIX: with device_map="auto" the embedding layer may not live on the
    # global `device`; send inputs to the model's own device to avoid a
    # cross-device error.
    inputs = tokenizer(prompt, return_tensors="pt").to(correction_model.device)

    with torch.no_grad():
        outputs = correction_model.generate(
            **inputs,
            max_new_tokens=300,
            # BUG FIX: temperature is ignored (with a warning) unless sampling
            # is enabled; the original intent was low-temperature sampling.
            do_sample=True,
            temperature=0.1,  # low temperature for clinical accuracy
            eos_token_id=tokenizer.eos_token_id,
        )

    # BUG FIX: skip_special_tokens strips the ChatML markers, so splitting the
    # full decode on the literal word "assistant" is fragile (and breaks if the
    # reply itself contains "assistant"). Decode only the new tokens instead.
    new_tokens = outputs[0][inputs["input_ids"].shape[-1]:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
85
 
86
# ============================================================================
# 3. THE AUDIT ENGINE (N=20)
# ============================================================================
def run_clinical_audit():
    """Run the randomized 20-case clinical hallucination audit.

    Streams the MedHallu dataset, alternates factual/hallucinated answers,
    runs the three detectors, generates LLM corrections for detected cases,
    writes a JSON report, and returns a (summary string, file path) pair for
    the Gradio outputs.
    """
    dataset = load_dataset("UTAustin-AIHealth/MedHallu", "pqa_labeled", split="train", streaming=True)
    data_pool = list(dataset.take(150))
    samples = random.sample(data_pool, 20)

    results = []
    y_true, y_pred = [], []

    for i, sample in enumerate(samples):
        # NOTE(review): " ".join assumes "Knowledge" is a list of strings; if
        # it is a plain string this joins individual characters — TODO confirm
        # against the dataset schema.
        evidence = " ".join(sample["Knowledge"])
        query = sample["Question"]
        factual = sample["Ground Truth"]
        hallucinated = sample["Hallucinated Answer"]

        # Balanced flip: even indices get the hallucinated answer (label 1).
        label = 1 if i % 2 == 0 else 0
        llm_answer = hallucinated if label == 1 else factual

        # Three independent detection signals.
        nli_label, _ = detect_nli(evidence, llm_answer)
        sim_score = detect_similarity(evidence, llm_answer)
        unc_score = detect_uncertainty(evidence, llm_answer)

        detected = 0
        reason = "Factual"
        if nli_label == "contradiction" or sim_score < 0.30 or unc_score < 0.25:
            detected = 1
            reason = "Clinical Hallucination Detected"

        y_true.append(label)
        y_pred.append(detected)

        correction = None
        if detected:
            corrected_text = generate_correction(query, llm_answer, factual)
            correction = {
                "physician_prompt": "Nous-Hermes-2 ChatML Structure",
                "llm_corrected_answer": corrected_text
            }

        results.append({
            "case_id": i + 1,
            "query": query,
            "llm_original_answer": llm_answer,
            "ground_truth_answer": factual,
            "detection": {
                "label": label,
                "prediction": detected,
                "reason": reason,
                "signals": {"nli": nli_label, "similarity": round(sim_score, 3), "uncertainty": round(unc_score, 3)}
            },
            "correction": correction
        })

    # zero_division=0 avoids the sklearn "ill-defined" warning when one class
    # is never predicted; precision was imported but previously unreported.
    metrics = {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": precision_score(y_true, y_pred, zero_division=0),
        "recall": recall_score(y_true, y_pred, zero_division=0),
        "f1": f1_score(y_true, y_pred, zero_division=0),
        "confusion_matrix": confusion_matrix(y_true, y_pred).tolist()
    }

    file_name = "final_clinical_hallucination_results.json"
    with open(file_name, "w") as f:
        json.dump({"metrics": metrics, "results": results}, f, indent=2)

    return f"✅ Audit Complete!\nAccuracy: {metrics['accuracy']:.2f}\nRecall: {metrics['recall']:.2f}", file_name
153
 
154
# ============================================================================
# 4. GRADIO INTERFACE
# ============================================================================
with gr.Blocks() as demo:
    # Page header and description (creation order defines the layout).
    gr.Markdown("# 🩺 Healthcare LLM Auditor (Nous-Hermes-2 Engine)")
    gr.Markdown("Bu sistem 20 vakayı 4-bit optimize edilmiş Nous-Hermes-2 ile denetler.")

    # Trigger button, status text box, and downloadable JSON report.
    audit_button = gr.Button("🚀 Start Clinical Audit", variant="primary")
    status_box = gr.Textbox(label="Status Summary")
    report_file = gr.File(label="📥 Download JSON Results")

    # One click runs the full audit; it returns (summary, file path).
    audit_button.click(fn=run_clinical_audit, outputs=[status_box, report_file])

demo.launch()