# app.py — Healthcare LLM Auditor (Hugging Face Space, CPU GGUF edition)
import warnings
import json
import torch
import random
import os
import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
from transformers import pipeline
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
# Silence library deprecation/user warnings so the Space log stays readable.
warnings.filterwarnings("ignore")
# ============================================================================
# 1. INITIALIZATION & EXPERT MODELS (Lightweight)
# ============================================================================
# NOTE(review): `device` is assigned but not referenced below — the pipelines
# pass device=-1 / "cpu" explicitly. Kept for documentation value; confirm.
device = "cpu" # required on the free Space tier (CPU-only)
print("[INFO] Loading Expert Models (NLI, Similarity, Uncertainty)...")
# These models are small enough to run comfortably on CPU.
# NLI classifier for contradiction detection between evidence and answer.
nli_model = pipeline("text-classification", model="pritamdeka/PubMedBERT-MNLI-MedNLI", device=-1)
# Sentence embedder for evidence/answer cosine similarity.
sim_model = SentenceTransformer("all-MiniLM-L6-v2", device="cpu")
# Cross-encoder relevance scorer, used downstream as an "uncertainty" signal.
clf_model = pipeline("text-classification", model="cross-encoder/ms-marco-MiniLM-L-6-v2", device=-1)
# ============================================================================
# 2. LOADING GGUF MODEL (For CPU Correction)
# ============================================================================
print("[INFO] Downloading and Loading Nous-Hermes-2 GGUF (CPU Optimized)...")
# Download the CPU-friendly Q4_K_M (4-bit quantized) build of the model.
model_path = hf_hub_download(
repo_id="QuantFactory/Nous-Hermes-2-Mistral-7B-DPO-GGUF",
filename="Nous-Hermes-2-Mistral-7B-DPO.Q4_K_M.gguf"
)
correction_model = Llama(
model_path=model_path,
n_ctx=1024, # context window (tokens)
n_threads=4, # CPU threads used for inference
n_gpu_layers=0 # 0: no GPU available on this tier
)
# ============================================================================
# 3. CORE FUNCTIONS
# ============================================================================
def detect_nli(evidence, answer):
    """Classify the (evidence, answer) pair with the medical NLI model.

    Returns a ``(label, score)`` tuple — the top predicted class and its
    confidence. NOTE(review): the pair is joined with a literal "[SEP]"
    rather than the pipeline's text/text_pair interface — confirm the
    tokenizer maps it to the separator token as intended.
    """
    pair_text = f"{evidence} [SEP] {answer}"
    prediction = nli_model(pair_text, truncation=True, max_length=512)
    top = prediction[0]
    return top["label"], top["score"]
def detect_similarity(evidence, answer):
    """Return the cosine similarity between evidence and answer embeddings.

    Both texts are encoded with the MiniLM sentence encoder; the result is
    a float roughly in [-1, 1], where values near 1 mean the answer is
    semantically close to the evidence.
    """
    evidence_emb = sim_model.encode(evidence, convert_to_tensor=True)
    answer_emb = sim_model.encode(answer, convert_to_tensor=True)
    # util.cos_sim is the current sentence-transformers API;
    # util.pytorch_cos_sim is its deprecated alias with identical output.
    return util.cos_sim(evidence_emb, answer_emb).item()
def detect_uncertainty(evidence, answer):
    """Score the (evidence, answer) pair with the MS MARCO cross-encoder.

    Returns the top-class confidence score. NOTE(review): this model is a
    passage-relevance cross-encoder, not a calibrated uncertainty
    estimator — the score is only a rough confidence proxy; verify the
    downstream threshold against its actual output distribution.
    """
    combined = f"{evidence} [SEP] {answer}"
    scored = clf_model(combined, truncation=True, max_length=512)
    return scored[0]["score"]
def generate_correction(query, wrong, truth):
    """Ask the local GGUF model to explain and correct a wrong answer.

    Builds a ChatML-formatted prompt (system role + the question, the
    wrong answer, and the ground truth), runs the llama.cpp model, and
    returns the assistant completion text with surrounding whitespace
    stripped. Generation stops at the ChatML end-of-turn marker.
    """
    # ChatML format adapted for the GGUF model.
    chatml_prompt = f"<|im_start|>system\nYou are a doctor. Explain error and fix based on evidence.<|im_end|>\n<|im_start|>user\nQ: {query}\nWrong: {wrong}\nTruth: {truth}\n<|im_end|>\n<|im_start|>assistant\n"
    completion = correction_model(chatml_prompt, max_tokens=250, stop=["<|im_end|>"], echo=False)
    return completion["choices"][0]["text"].strip()
# ============================================================================
# 4. THE AUDIT ENGINE (N=20)
# ============================================================================
def run_clinical_audit():
    """Audit 20 MedHallu cases for hallucinations and write a JSON report.

    For each sampled case, one answer is shown to the detectors — cases
    alternate between the hallucinated answer (label=1) and the factual
    one (label=0). Three signals (NLI label, embedding similarity,
    cross-encoder score) drive the detection decision; flagged answers are
    corrected by the local GGUF model. Per-case results plus aggregate
    metrics are dumped to a JSON file.

    Returns:
        (summary_text, json_file_path) — the two Gradio outputs.
    """
    dataset = load_dataset("UTAustin-AIHealth/MedHallu", "pqa_labeled", split="train", streaming=True)
    # Materialize a 100-item pool from the stream, then pick 20 at random.
    data_pool = list(dataset.take(100))
    samples = random.sample(data_pool, 20)
    results = []
    y_true, y_pred = [], []
    for i, sample in enumerate(samples):
        # NOTE(review): assumes "Knowledge" is a list of strings — confirm schema.
        evidence = " ".join(sample["Knowledge"])
        query = sample["Question"]
        factual = sample["Ground Truth"]
        hallucinated = sample["Hallucinated Answer"]
        # Alternate: even cases get the hallucinated answer, odd the factual one.
        label = 1 if i % 2 == 0 else 0
        llm_answer = hallucinated if label == 1 else factual
        nli_label, _ = detect_nli(evidence, llm_answer)
        sim_score = detect_similarity(evidence, llm_answer)
        unc_score = detect_uncertainty(evidence, llm_answer)
        detected = 0
        reason = "Factual"
        # Detection thresholds. NOTE(review): verify the NLI pipeline really
        # emits lowercase "contradiction" — some HF model configs use
        # uppercase or "LABEL_n" ids, which would make this test never fire.
        if nli_label == "contradiction" or sim_score < 0.25 or unc_score < 0.20:
            detected = 1
            reason = "Clinical Hallucination Detected"
        y_true.append(label)
        y_pred.append(detected)
        correction = None
        if detected:
            corrected_text = generate_correction(query, llm_answer, factual)
            correction = {
                "physician_prompt": "Nous-Hermes-2 GGUF Structure",
                "llm_corrected_answer": corrected_text
            }
        results.append({
            "case_id": i + 1,
            "query": query,
            "llm_original_answer": llm_answer,
            "ground_truth_answer": factual,
            "detection": {
                "label": label,
                "prediction": detected,
                "reason": reason,
                "signals": {"nli": nli_label, "similarity": round(sim_score, 3), "uncertainty": round(unc_score, 3)}
            },
            "correction": correction
        })
    # zero_division=0 keeps the metrics defined when a class is never
    # predicted; precision_score was imported but previously unreported —
    # adding it only extends the JSON (backward compatible).
    metrics = {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": precision_score(y_true, y_pred, zero_division=0),
        "recall": recall_score(y_true, y_pred, zero_division=0),
        "f1": f1_score(y_true, y_pred, zero_division=0),
        "confusion_matrix": confusion_matrix(y_true, y_pred).tolist()
    }
    file_name = "final_clinical_hallucination_results.json"
    # Explicit UTF-8 + ensure_ascii=False: medical text and the dataset may
    # contain non-ASCII characters; keep them readable in the report.
    with open(file_name, "w", encoding="utf-8") as f:
        json.dump({"metrics": metrics, "results": results}, f, indent=2, ensure_ascii=False)
    return f"✅ Audit Complete!\nAccuracy: {metrics['accuracy']:.2f}\nRecall: {metrics['recall']:.2f}", file_name
# ============================================================================
# 5. GRADIO INTERFACE
# ============================================================================
# Minimal single-button UI: run the audit, show a text summary, and offer
# the JSON report as a downloadable file.
with gr.Blocks() as demo:
    gr.Markdown("# 🩺 Healthcare LLM Auditor (GGUF CPU Edition)")
    gr.Markdown("Ücretsiz CPU katmanı için optimize edilmiştir. 20 vakayı analiz eder.")
    run_btn = gr.Button("🚀 Start Clinical Audit", variant="primary")
    output_text = gr.Textbox(label="Status Summary")
    output_file = gr.File(label="📥 Download JSON Results")
    # run_clinical_audit returns (summary_text, file_path), mapped in order
    # onto the two output components.
    run_btn.click(fn=run_clinical_audit, outputs=[output_text, output_file])
demo.launch()