import warnings
import json
import torch
import random
import os
import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
from transformers import pipeline
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

warnings.filterwarnings("ignore")

# ============================================================================
# 1. INITIALIZATION & EXPERT MODELS (Lightweight)
# ============================================================================
device = "cpu"  # mandatory on the free Space tier (no GPU)

print("[INFO] Loading Expert Models (NLI, Similarity, Uncertainty)...")

# These models are small enough to run comfortably on CPU (device=-1 / "cpu").
nli_model = pipeline("text-classification", model="pritamdeka/PubMedBERT-MNLI-MedNLI", device=-1)
sim_model = SentenceTransformer("all-MiniLM-L6-v2", device="cpu")
clf_model = pipeline("text-classification", model="cross-encoder/ms-marco-MiniLM-L-6-v2", device=-1)

# ============================================================================
# 2. LOADING GGUF MODEL (For CPU Correction)
# ============================================================================
print("[INFO] Downloading and Loading Nous-Hermes-2 GGUF (CPU Optimized)...")

# Fetch the CPU-friendly Q4_K_M (4-bit) quantization of the model.
model_path = hf_hub_download(
    repo_id="QuantFactory/Nous-Hermes-2-Mistral-7B-DPO-GGUF",
    filename="Nous-Hermes-2-Mistral-7B-DPO.Q4_K_M.gguf",
)

correction_model = Llama(
    model_path=model_path,
    n_ctx=1024,      # context window
    n_threads=4,     # CPU threads to use
    n_gpu_layers=0,  # no GPU available, keep everything on CPU
)

# ============================================================================
# 3. CORE FUNCTIONS
# ============================================================================
# 3. CORE FUNCTIONS
# ============================================================================
def detect_nli(evidence, answer):
    """Run the medical NLI model on the evidence/answer pair.

    Returns a (label, score) tuple for the top prediction.
    """
    top = nli_model(f"{evidence} [SEP] {answer}", truncation=True, max_length=512)[0]
    return top["label"], top["score"]


def detect_similarity(evidence, answer):
    """Cosine similarity between sentence embeddings of evidence and answer."""
    emb_evidence = sim_model.encode(evidence, convert_to_tensor=True)
    emb_answer = sim_model.encode(answer, convert_to_tensor=True)
    similarity = util.pytorch_cos_sim(emb_evidence, emb_answer)
    return similarity.item()


def detect_uncertainty(evidence, answer):
    """Score the pair with the cross-encoder; used as an uncertainty signal."""
    top = clf_model(f"{evidence} [SEP] {answer}", truncation=True, max_length=512)[0]
    return top["score"]


def generate_correction(query, wrong, truth):
    """Ask the GGUF model to explain the error and produce a corrected answer.

    The prompt uses the ChatML format expected by Nous-Hermes-2.
    """
    prompt = (
        f"<|im_start|>system\nYou are a doctor. Explain error and fix based on evidence.<|im_end|>\n"
        f"<|im_start|>user\nQ: {query}\nWrong: {wrong}\nTruth: {truth}\n<|im_end|>\n"
        f"<|im_start|>assistant\n"
    )
    completion = correction_model(
        prompt,
        max_tokens=250,
        stop=["<|im_end|>"],
        echo=False,
    )
    return completion["choices"][0]["text"].strip()
# ============================================================================
# 4. THE AUDIT ENGINE (N=20)
# ============================================================================
def run_clinical_audit():
    """Audit up to 20 MedHallu cases for hallucinations; write results to JSON.

    Alternates hallucinated / factual answers to build ground-truth labels,
    runs the three detection signals (NLI, embedding similarity, cross-encoder
    score), generates a GGUF-based correction for flagged cases, and returns a
    (summary string, result file name) tuple for the Gradio UI.
    """
    dataset = load_dataset("UTAustin-AIHealth/MedHallu", "pqa_labeled", split="train", streaming=True)
    data_pool = list(dataset.take(100))
    # Guard against the stream yielding fewer than 20 items — random.sample
    # raises ValueError when the sample size exceeds the population.
    samples = random.sample(data_pool, min(20, len(data_pool)))

    results = []
    y_true, y_pred = [], []

    for i, sample in enumerate(samples):
        # NOTE(review): assumes "Knowledge" is a list of passages; if it were a
        # plain string, join() would interleave spaces between characters — confirm
        # against the dataset schema.
        evidence = " ".join(sample["Knowledge"])
        query = sample["Question"]
        factual = sample["Ground Truth"]
        hallucinated = sample["Hallucinated Answer"]

        # Alternate labels: even indices get the hallucinated answer (positive class).
        label = 1 if i % 2 == 0 else 0
        llm_answer = hallucinated if label == 1 else factual

        nli_label, _ = detect_nli(evidence, llm_answer)
        sim_score = detect_similarity(evidence, llm_answer)
        unc_score = detect_uncertainty(evidence, llm_answer)

        detected = 0
        reason = "Factual"
        # Hand-tuned detection thresholds.
        if nli_label == "contradiction" or sim_score < 0.25 or unc_score < 0.20:
            detected = 1
            reason = "Clinical Hallucination Detected"

        y_true.append(label)
        y_pred.append(detected)

        correction = None
        if detected:
            corrected_text = generate_correction(query, llm_answer, factual)
            correction = {
                "physician_prompt": "Nous-Hermes-2 GGUF Structure",
                "llm_corrected_answer": corrected_text,
            }

        results.append({
            "case_id": i + 1,
            "query": query,
            "llm_original_answer": llm_answer,
            "ground_truth_answer": factual,
            "detection": {
                "label": label,
                "prediction": detected,
                "reason": reason,
                "signals": {
                    "nli": nli_label,
                    "similarity": round(sim_score, 3),
                    "uncertainty": round(unc_score, 3),
                },
            },
            "correction": correction,
        })

    # precision_score was imported at the top of the file but never reported;
    # include it. zero_division=0 avoids undefined-metric warnings when one
    # class happens to be absent from the predictions.
    metrics = {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": precision_score(y_true, y_pred, zero_division=0),
        "recall": recall_score(y_true, y_pred, zero_division=0),
        "f1": f1_score(y_true, y_pred, zero_division=0),
        "confusion_matrix": confusion_matrix(y_true, y_pred).tolist(),
    }

    file_name = "final_clinical_hallucination_results.json"
    with open(file_name, "w") as f:
        json.dump({"metrics": metrics, "results": results}, f, indent=2)

    return f"✅ Audit Complete!\nAccuracy: {metrics['accuracy']:.2f}\nRecall: {metrics['recall']:.2f}", file_name
# ============================================================================
# 5. GRADIO INTERFACE
# ============================================================================
with gr.Blocks() as demo:
    gr.Markdown("# 🩺 Healthcare LLM Auditor (GGUF CPU Edition)")
    gr.Markdown("Ücretsiz CPU katmanı için optimize edilmiştir. 20 vakayı analiz eder.")

    run_btn = gr.Button("🚀 Start Clinical Audit", variant="primary")
    output_text = gr.Textbox(label="Status Summary")
    output_file = gr.File(label="📥 Download JSON Results")

    # Wire the button: the audit returns (summary string, result file path).
    run_btn.click(fn=run_clinical_audit, outputs=[output_text, output_file])

demo.launch()