# app.py — Healthcare LLM Auditor (Hugging Face Space, CPU GGUF edition)
import warnings
import json
import torch
import random
import os
import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
from transformers import pipeline
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
# Silence library deprecation/user warnings so the Space log stays readable.
warnings.filterwarnings("ignore")
# ============================================================================
# 1. INITIALIZATION & EXPERT MODELS (Lightweight)
# ============================================================================
# NOTE(review): `device` is assigned but not referenced below — the pipelines
# pass device=-1 / "cpu" explicitly. Kept for documentation value; confirm.
device = "cpu" # required on the free Space tier (CPU-only)
print("[INFO] Loading Expert Models (NLI, Similarity, Uncertainty)...")
# These models are small enough to run comfortably on CPU.
# NLI classifier for contradiction detection between evidence and answer.
nli_model = pipeline("text-classification", model="pritamdeka/PubMedBERT-MNLI-MedNLI", device=-1)
# Sentence embedder for evidence/answer cosine similarity.
sim_model = SentenceTransformer("all-MiniLM-L6-v2", device="cpu")
# Cross-encoder relevance scorer, used downstream as an "uncertainty" signal.
clf_model = pipeline("text-classification", model="cross-encoder/ms-marco-MiniLM-L-6-v2", device=-1)
# ============================================================================
# 2. LOADING GGUF MODEL (For CPU Correction)
# ============================================================================
print("[INFO] Downloading and Loading Nous-Hermes-2 GGUF (CPU Optimized)...")
# Download the CPU-friendly Q4_K_M (4-bit quantized) build of the model.
model_path = hf_hub_download(
repo_id="QuantFactory/Nous-Hermes-2-Mistral-7B-DPO-GGUF",
filename="Nous-Hermes-2-Mistral-7B-DPO.Q4_K_M.gguf"
)
correction_model = Llama(
model_path=model_path,
n_ctx=1024, # context window (tokens)
n_threads=4, # CPU threads used for inference
n_gpu_layers=0 # 0: no GPU available on this tier
)
# ============================================================================
# 3. CORE FUNCTIONS
# ============================================================================
def detect_nli(evidence, answer):
    """Classify the (evidence, answer) pair with the medical NLI model.

    Returns a ``(label, score)`` tuple — the top predicted class and its
    confidence. NOTE(review): the pair is joined with a literal "[SEP]"
    rather than the pipeline's text/text_pair interface — confirm the
    tokenizer maps it to the separator token as intended.
    """
    pair_text = f"{evidence} [SEP] {answer}"
    prediction = nli_model(pair_text, truncation=True, max_length=512)
    top = prediction[0]
    return top["label"], top["score"]
def detect_similarity(evidence, answer):
    """Return the cosine similarity between evidence and answer embeddings.

    Both texts are encoded with the MiniLM sentence encoder; the result is
    a float roughly in [-1, 1], where values near 1 mean the answer is
    semantically close to the evidence.
    """
    evidence_emb = sim_model.encode(evidence, convert_to_tensor=True)
    answer_emb = sim_model.encode(answer, convert_to_tensor=True)
    # util.cos_sim is the current sentence-transformers API;
    # util.pytorch_cos_sim is its deprecated alias with identical output.
    return util.cos_sim(evidence_emb, answer_emb).item()
def detect_uncertainty(evidence, answer):
    """Score the (evidence, answer) pair with the MS MARCO cross-encoder.

    Returns the top-class confidence score. NOTE(review): this model is a
    passage-relevance cross-encoder, not a calibrated uncertainty
    estimator — the score is only a rough confidence proxy; verify the
    downstream threshold against its actual output distribution.
    """
    combined = f"{evidence} [SEP] {answer}"
    scored = clf_model(combined, truncation=True, max_length=512)
    return scored[0]["score"]
def generate_correction(query, wrong, truth):
    """Ask the local GGUF model to explain and correct a wrong answer.

    Builds a ChatML-formatted prompt (system role + the question, the
    wrong answer, and the ground truth), runs the llama.cpp model, and
    returns the assistant completion text with surrounding whitespace
    stripped. Generation stops at the ChatML end-of-turn marker.
    """
    # ChatML format adapted for the GGUF model.
    chatml_prompt = f"<|im_start|>system\nYou are a doctor. Explain error and fix based on evidence.<|im_end|>\n<|im_start|>user\nQ: {query}\nWrong: {wrong}\nTruth: {truth}\n<|im_end|>\n<|im_start|>assistant\n"
    completion = correction_model(chatml_prompt, max_tokens=250, stop=["<|im_end|>"], echo=False)
    return completion["choices"][0]["text"].strip()
# ============================================================================
# 4. THE AUDIT ENGINE (N=20)
# ============================================================================
def run_clinical_audit():
    """Audit 20 MedHallu cases for hallucinations and write a JSON report.

    For each sampled case, one answer is shown to the detectors — cases
    alternate between the hallucinated answer (label=1) and the factual
    one (label=0). Three signals (NLI label, embedding similarity,
    cross-encoder score) drive the detection decision; flagged answers are
    corrected by the local GGUF model. Per-case results plus aggregate
    metrics are dumped to a JSON file.

    Returns:
        (summary_text, json_file_path) — the two Gradio outputs.
    """
    dataset = load_dataset("UTAustin-AIHealth/MedHallu", "pqa_labeled", split="train", streaming=True)
    # Materialize a 100-item pool from the stream, then pick 20 at random.
    data_pool = list(dataset.take(100))
    samples = random.sample(data_pool, 20)
    results = []
    y_true, y_pred = [], []
    for i, sample in enumerate(samples):
        # NOTE(review): assumes "Knowledge" is a list of strings — confirm schema.
        evidence = " ".join(sample["Knowledge"])
        query = sample["Question"]
        factual = sample["Ground Truth"]
        hallucinated = sample["Hallucinated Answer"]
        # Alternate: even cases get the hallucinated answer, odd the factual one.
        label = 1 if i % 2 == 0 else 0
        llm_answer = hallucinated if label == 1 else factual
        nli_label, _ = detect_nli(evidence, llm_answer)
        sim_score = detect_similarity(evidence, llm_answer)
        unc_score = detect_uncertainty(evidence, llm_answer)
        detected = 0
        reason = "Factual"
        # Detection thresholds. NOTE(review): verify the NLI pipeline really
        # emits lowercase "contradiction" — some HF model configs use
        # uppercase or "LABEL_n" ids, which would make this test never fire.
        if nli_label == "contradiction" or sim_score < 0.25 or unc_score < 0.20:
            detected = 1
            reason = "Clinical Hallucination Detected"
        y_true.append(label)
        y_pred.append(detected)
        correction = None
        if detected:
            corrected_text = generate_correction(query, llm_answer, factual)
            correction = {
                "physician_prompt": "Nous-Hermes-2 GGUF Structure",
                "llm_corrected_answer": corrected_text
            }
        results.append({
            "case_id": i + 1,
            "query": query,
            "llm_original_answer": llm_answer,
            "ground_truth_answer": factual,
            "detection": {
                "label": label,
                "prediction": detected,
                "reason": reason,
                "signals": {"nli": nli_label, "similarity": round(sim_score, 3), "uncertainty": round(unc_score, 3)}
            },
            "correction": correction
        })
    # zero_division=0 keeps the metrics defined when a class is never
    # predicted; precision_score was imported but previously unreported —
    # adding it only extends the JSON (backward compatible).
    metrics = {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": precision_score(y_true, y_pred, zero_division=0),
        "recall": recall_score(y_true, y_pred, zero_division=0),
        "f1": f1_score(y_true, y_pred, zero_division=0),
        "confusion_matrix": confusion_matrix(y_true, y_pred).tolist()
    }
    file_name = "final_clinical_hallucination_results.json"
    # Explicit UTF-8 + ensure_ascii=False: medical text and the dataset may
    # contain non-ASCII characters; keep them readable in the report.
    with open(file_name, "w", encoding="utf-8") as f:
        json.dump({"metrics": metrics, "results": results}, f, indent=2, ensure_ascii=False)
    return f"✅ Audit Complete!\nAccuracy: {metrics['accuracy']:.2f}\nRecall: {metrics['recall']:.2f}", file_name
# ============================================================================
# 5. GRADIO INTERFACE
# ============================================================================
# Minimal single-button UI: run the audit, show a text summary, and offer
# the JSON report as a downloadable file.
with gr.Blocks() as demo:
    gr.Markdown("# 🩺 Healthcare LLM Auditor (GGUF CPU Edition)")
    gr.Markdown("Ücretsiz CPU katmanı için optimize edilmiştir. 20 vakayı analiz eder.")
    run_btn = gr.Button("🚀 Start Clinical Audit", variant="primary")
    output_text = gr.Textbox(label="Status Summary")
    output_file = gr.File(label="📥 Download JSON Results")
    # run_clinical_audit returns (summary_text, file_path), mapped in order
    # onto the two output components.
    run_btn.click(fn=run_clinical_audit, outputs=[output_text, output_file])
demo.launch()