# Hugging Face Space app — clinical hallucination auditor (GGUF CPU edition).
# (Non-code scrape residue — build status, commit hashes, and a line-number
# gutter — removed from the top of this file so it parses as Python.)
import warnings
import json
import torch
import random
import os
import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
from transformers import pipeline
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
warnings.filterwarnings("ignore")
# ============================================================================
# 1. INITIALIZATION & EXPERT MODELS (Lightweight)
# ============================================================================
device = "cpu" # Required on the free Space tier — no GPU available
print("[INFO] Loading Expert Models (NLI, Similarity, Uncertainty)...")
# These models are small, so they run comfortably on CPU (device=-1 == CPU).
nli_model = pipeline("text-classification", model="pritamdeka/PubMedBERT-MNLI-MedNLI", device=-1)
sim_model = SentenceTransformer("all-MiniLM-L6-v2", device="cpu")
clf_model = pipeline("text-classification", model="cross-encoder/ms-marco-MiniLM-L-6-v2", device=-1)
# ============================================================================
# 2. LOADING GGUF MODEL (For CPU Correction)
# ============================================================================
print("[INFO] Downloading and Loading Nous-Hermes-2 GGUF (CPU Optimized)...")
# Download the CPU-friendly Q4_K_M (4-bit) quantized build of the model.
model_path = hf_hub_download(
    repo_id="QuantFactory/Nous-Hermes-2-Mistral-7B-DPO-GGUF",
    filename="Nous-Hermes-2-Mistral-7B-DPO.Q4_K_M.gguf"
)
correction_model = Llama(
    model_path=model_path,
    n_ctx=1024,     # Context window size (tokens)
    n_threads=4,    # Number of CPU threads used for inference
    n_gpu_layers=0  # No layers offloaded — there is no GPU here
)
# ============================================================================
# 3. CORE FUNCTIONS
# ============================================================================
def detect_nli(evidence, answer):
    """Classify the (evidence, answer) pair with the medical NLI model.

    Returns:
        tuple: (predicted label, confidence score) of the top prediction.
    """
    pair_text = f"{evidence} [SEP] {answer}"
    prediction = nli_model(pair_text, truncation=True, max_length=512)[0]
    return prediction["label"], prediction["score"]
def detect_similarity(evidence, answer):
    """Return the cosine similarity between the embeddings of both texts."""
    evidence_emb = sim_model.encode(evidence, convert_to_tensor=True)
    answer_emb = sim_model.encode(answer, convert_to_tensor=True)
    similarity = util.pytorch_cos_sim(evidence_emb, answer_emb)
    return similarity.item()
def detect_uncertainty(evidence, answer):
    """Score the answer's relevance to the evidence via the cross-encoder.

    Returns:
        float: score of the top prediction (used as an uncertainty signal).
    """
    scored = clf_model(f"{evidence} [SEP] {answer}", truncation=True, max_length=512)
    return scored[0]["score"]
def generate_correction(query, wrong, truth):
    """Ask the local GGUF model for a physician-style correction.

    Builds a ChatML prompt containing the question, the wrong answer, and the
    ground truth, then returns the model's stripped completion text.
    """
    # ChatML prompt format, adapted for the GGUF build of the model.
    chatml_prompt = f"<|im_start|>system\nYou are a doctor. Explain error and fix based on evidence.<|im_end|>\n<|im_start|>user\nQ: {query}\nWrong: {wrong}\nTruth: {truth}\n<|im_end|>\n<|im_start|>assistant\n"
    completion = correction_model(
        chatml_prompt,
        max_tokens=250,
        stop=["<|im_end|>"],  # stop at the ChatML turn delimiter
        echo=False,
    )
    return completion["choices"][0]["text"].strip()
# ============================================================================
# 4. THE AUDIT ENGINE (N=20)
# ============================================================================
def run_clinical_audit():
    """Audit 20 MedHallu cases for clinical hallucinations and correct them.

    Streams 100 rows from the MedHallu dataset, samples 20, alternates between
    hallucinated and factual answers to build a balanced label set, flags
    hallucinations via three signals (NLI, embedding similarity, cross-encoder
    relevance), generates corrections for flagged cases, and writes a JSON
    report.

    Returns:
        tuple[str, str]: (status summary for the UI, path to the JSON report).
    """
    # Stream only the first 100 rows to keep the free-CPU run fast.
    dataset = load_dataset("UTAustin-AIHealth/MedHallu", "pqa_labeled", split="train", streaming=True)
    data_pool = list(dataset.take(100))
    samples = random.sample(data_pool, 20)

    results = []
    y_true, y_pred = [], []

    for i, sample in enumerate(samples):
        # NOTE(review): assumes "Knowledge" is a list of passages; if it is a
        # plain string, " ".join() would space-separate characters — verify
        # against the MedHallu dataset schema.
        evidence = " ".join(sample["Knowledge"])
        query = sample["Question"]
        factual = sample["Ground Truth"]
        hallucinated = sample["Hallucinated Answer"]

        # Alternate labels for a balanced audit set: even indices get the
        # hallucinated answer (positive class 1), odd indices the factual one.
        label = 1 if i % 2 == 0 else 0
        llm_answer = hallucinated if label == 1 else factual

        nli_label, _ = detect_nli(evidence, llm_answer)
        sim_score = detect_similarity(evidence, llm_answer)
        unc_score = detect_uncertainty(evidence, llm_answer)

        # Detection thresholds (heuristic): flag on an NLI contradiction, low
        # semantic similarity, or low cross-encoder relevance.
        detected = 0
        reason = "Factual"
        if nli_label == "contradiction" or sim_score < 0.25 or unc_score < 0.20:
            detected = 1
            reason = "Clinical Hallucination Detected"

        y_true.append(label)
        y_pred.append(detected)

        # Only flagged cases get the (expensive) GGUF correction pass.
        correction = None
        if detected:
            corrected_text = generate_correction(query, llm_answer, factual)
            correction = {
                "physician_prompt": "Nous-Hermes-2 GGUF Structure",
                "llm_corrected_answer": corrected_text
            }

        results.append({
            "case_id": i + 1,
            "query": query,
            "llm_original_answer": llm_answer,
            "ground_truth_answer": factual,
            "detection": {
                "label": label,
                "prediction": detected,
                "reason": reason,
                "signals": {"nli": nli_label, "similarity": round(sim_score, 3), "uncertainty": round(unc_score, 3)}
            },
            "correction": correction
        })

    # zero_division=0 avoids UndefinedMetricWarning / undefined metrics when a
    # class is never predicted. precision was imported but previously unused.
    metrics = {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": precision_score(y_true, y_pred, zero_division=0),
        "recall": recall_score(y_true, y_pred, zero_division=0),
        "f1": f1_score(y_true, y_pred, zero_division=0),
        "confusion_matrix": confusion_matrix(y_true, y_pred).tolist()
    }

    file_name = "final_clinical_hallucination_results.json"
    # utf-8 + ensure_ascii=False keep non-ASCII medical text readable in JSON.
    with open(file_name, "w", encoding="utf-8") as f:
        json.dump({"metrics": metrics, "results": results}, f, indent=2, ensure_ascii=False)

    return f"✅ Audit Complete!\nAccuracy: {metrics['accuracy']:.2f}\nRecall: {metrics['recall']:.2f}", file_name
# ============================================================================
# 5. GRADIO INTERFACE
# ============================================================================
with gr.Blocks() as demo:
    gr.Markdown("# 🩺 Healthcare LLM Auditor (GGUF CPU Edition)")
    gr.Markdown("Ücretsiz CPU katmanı için optimize edilmiştir. 20 vakayı analiz eder.")

    run_btn = gr.Button("🚀 Start Clinical Audit", variant="primary")
    output_text = gr.Textbox(label="Status Summary")
    output_file = gr.File(label="📥 Download JSON Results")

    # Button fan-out: the audit returns (summary string, JSON file path).
    run_btn.click(fn=run_clinical_audit, outputs=[output_text, output_file])

# Fix: removed stray trailing " |" (scrape residue) that made this line a
# syntax error.
demo.launch()