# Spaces:
# Build error
# Build error
| import warnings | |
| import json | |
| import torch | |
| import random | |
| import os | |
| import gradio as gr | |
| from huggingface_hub import hf_hub_download | |
| from llama_cpp import Llama | |
| from transformers import pipeline | |
| from datasets import load_dataset | |
| from sentence_transformers import SentenceTransformer, util | |
| from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix | |
| warnings.filterwarnings("ignore") | |
# ============================================================================
# 1. INITIALIZATION & EXPERT MODELS (Lightweight)
# ============================================================================
device = "cpu"  # Required for the free (CPU-only) Space tier

print("[INFO] Loading Expert Models (NLI, Similarity, Uncertainty)...")

# These models are small enough to run comfortably on CPU (device=-1 / "cpu").
nli_model = pipeline("text-classification", model="pritamdeka/PubMedBERT-MNLI-MedNLI", device=-1)
sim_model = SentenceTransformer("all-MiniLM-L6-v2", device="cpu")
clf_model = pipeline("text-classification", model="cross-encoder/ms-marco-MiniLM-L-6-v2", device=-1)
# ============================================================================
# 2. LOADING GGUF MODEL (For CPU Correction)
# ============================================================================
print("[INFO] Downloading and Loading Nous-Hermes-2 GGUF (CPU Optimized)...")

# Download the CPU-friendly Q4_K_M (4-bit) quantization of the model.
model_path = hf_hub_download(
    repo_id="QuantFactory/Nous-Hermes-2-Mistral-7B-DPO-GGUF",
    filename="Nous-Hermes-2-Mistral-7B-DPO.Q4_K_M.gguf",
)

correction_model = Llama(
    model_path=model_path,
    n_ctx=1024,       # context window size
    n_threads=4,      # number of CPU threads to use
    n_gpu_layers=0,   # no GPU available on the free tier
)
| # ============================================================================ | |
| # 3. CORE FUNCTIONS | |
| # ============================================================================ | |
def detect_nli(evidence, answer):
    """Classify the (evidence, answer) pair with the medical NLI model.

    Returns:
        tuple[str, float]: the top predicted label (e.g. "contradiction")
        and its confidence score.
    """
    # Truncate to the encoder's 512-token limit; long evidence is common.
    result = nli_model(f"{evidence} [SEP] {answer}", truncation=True, max_length=512)[0]
    return result["label"], result["score"]
def detect_similarity(evidence, answer):
    """Return the cosine similarity between evidence and answer embeddings.

    Uses the MiniLM sentence-transformer; result is a plain Python float,
    roughly in [-1, 1] (typically [0, 1] for natural text).
    """
    emb_evidence = sim_model.encode(evidence, convert_to_tensor=True)
    emb_answer = sim_model.encode(answer, convert_to_tensor=True)
    return util.pytorch_cos_sim(emb_evidence, emb_answer).item()
def detect_uncertainty(evidence, answer):
    """Score the (evidence, answer) pair with the cross-encoder classifier.

    NOTE(review): cross-encoder/ms-marco-MiniLM-L-6-v2 is a passage-relevance
    model; its score is used here as a proxy "uncertainty" signal (low
    relevance → suspicious answer) — confirm this interpretation is intended.

    Returns:
        float: the classifier's confidence score for the top label.
    """
    result = clf_model(f"{evidence} [SEP] {answer}", truncation=True, max_length=512)[0]
    return result["score"]
def generate_correction(query, wrong, truth):
    """Generate a physician-style correction for a hallucinated answer.

    Builds a ChatML-format prompt (as expected by Nous-Hermes-2 GGUF) and
    runs the local llama.cpp model.

    Args:
        query: the clinical question.
        wrong: the (hallucinated) answer to be corrected.
        truth: the ground-truth answer used as evidence.

    Returns:
        str: the model's correction text, stripped of surrounding whitespace.
    """
    # ChatML format adapted for the GGUF model.
    prompt = f"<|im_start|>system\nYou are a doctor. Explain error and fix based on evidence.<|im_end|>\n<|im_start|>user\nQ: {query}\nWrong: {wrong}\nTruth: {truth}\n<|im_end|>\n<|im_start|>assistant\n"
    output = correction_model(
        prompt,
        max_tokens=250,
        stop=["<|im_end|>"],  # stop at the end of the assistant turn
        echo=False,           # do not echo the prompt back
    )
    return output["choices"][0]["text"].strip()
| # ============================================================================ | |
| # 4. THE AUDIT ENGINE (N=20) | |
| # ============================================================================ | |
def run_clinical_audit():
    """Audit 20 MedHallu cases for clinical hallucinations.

    For each sampled case, alternates between injecting the hallucinated
    answer (label=1) and the ground-truth answer (label=0), runs the three
    detector signals (NLI, embedding similarity, cross-encoder score), flags
    hallucinations by fixed thresholds, and generates a GGUF-model correction
    for every flagged case. Writes metrics + per-case results to a JSON file.

    Returns:
        tuple[str, str]: a human-readable status summary and the path of the
        JSON results file (consumed by the Gradio outputs).
    """
    dataset = load_dataset("UTAustin-AIHealth/MedHallu", "pqa_labeled", split="train", streaming=True)
    data_pool = list(dataset.take(100))
    samples = random.sample(data_pool, 20)

    results = []
    y_true, y_pred = [], []

    for i, sample in enumerate(samples):
        # NOTE(review): assumes "Knowledge" is a list of passages; if the
        # dataset returns a plain string, joining would interleave spaces
        # between characters, so guard on the type — confirm against schema.
        knowledge = sample["Knowledge"]
        evidence = " ".join(knowledge) if isinstance(knowledge, list) else str(knowledge)
        query = sample["Question"]
        factual = sample["Ground Truth"]
        hallucinated = sample["Hallucinated Answer"]

        # Alternate labels so the audit set is balanced 50/50.
        label = 1 if i % 2 == 0 else 0
        llm_answer = hallucinated if label == 1 else factual

        nli_label, _ = detect_nli(evidence, llm_answer)
        sim_score = detect_similarity(evidence, llm_answer)
        unc_score = detect_uncertainty(evidence, llm_answer)

        detected = 0
        reason = "Factual"
        # Detection thresholds: any single tripped signal flags the case.
        if nli_label == "contradiction" or sim_score < 0.25 or unc_score < 0.20:
            detected = 1
            reason = "Clinical Hallucination Detected"

        y_true.append(label)
        y_pred.append(detected)

        correction = None
        if detected:
            corrected_text = generate_correction(query, llm_answer, factual)
            correction = {
                "physician_prompt": "Nous-Hermes-2 GGUF Structure",
                "llm_corrected_answer": corrected_text,
            }

        results.append({
            "case_id": i + 1,
            "query": query,
            "llm_original_answer": llm_answer,
            "ground_truth_answer": factual,
            "detection": {
                "label": label,
                "prediction": detected,
                "reason": reason,
                "signals": {"nli": nli_label, "similarity": round(sim_score, 3), "uncertainty": round(unc_score, 3)},
            },
            "correction": correction,
        })

    # zero_division=0 avoids warnings/NaN if the detector never fires
    # (all-zero predictions make precision undefined).
    metrics = {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": precision_score(y_true, y_pred, zero_division=0),
        "recall": recall_score(y_true, y_pred, zero_division=0),
        "f1": f1_score(y_true, y_pred, zero_division=0),
        "confusion_matrix": confusion_matrix(y_true, y_pred).tolist(),
    }

    file_name = "final_clinical_hallucination_results.json"
    with open(file_name, "w") as f:
        json.dump({"metrics": metrics, "results": results}, f, indent=2)

    return f"✅ Audit Complete!\nAccuracy: {metrics['accuracy']:.2f}\nRecall: {metrics['recall']:.2f}", file_name
# ============================================================================
# 5. GRADIO INTERFACE
# ============================================================================
with gr.Blocks() as demo:
    gr.Markdown("# 🩺 Healthcare LLM Auditor (GGUF CPU Edition)")
    gr.Markdown("Ücretsiz CPU katmanı için optimize edilmiştir. 20 vakayı analiz eder.")

    run_btn = gr.Button("🚀 Start Clinical Audit", variant="primary")
    output_text = gr.Textbox(label="Status Summary")
    output_file = gr.File(label="📥 Download JSON Results")

    # run_clinical_audit returns (summary_text, json_path) matching outputs.
    run_btn.click(fn=run_clinical_audit, outputs=[output_text, output_file])

demo.launch()