# """
# Evaluation module: loads models, computes metrics, and creates visualizations.
# Lightweight, CPU-friendly, no Java required.
# """

# import re
# import math
# import uuid
# from typing import List, Dict, Tuple

# import numpy as np
# import pandas as pd
# import matplotlib.pyplot as plt
# import seaborn as sns
# import torch
# from transformers import AutoTokenizer, AutoModelForSequenceClassification
# from sentence_transformers import SentenceTransformer, util

# # --------------------------
# # MODEL LOADING
# # --------------------------
# NLI_MODEL = "textattack/roberta-base-MNLI"
# EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"

# # Load NLI model & tokenizer
# nli_tokenizer = AutoTokenizer.from_pretrained(NLI_MODEL)
# nli_model = AutoModelForSequenceClassification.from_pretrained(NLI_MODEL)
# nli_model.to("cpu")
# nli_model.eval()

# # Load embedding model
# embed_model = SentenceTransformer(EMBED_MODEL)

# # Label mapping from config
# id2label = {int(k): v.upper() for k, v in nli_model.config.id2label.items()}


# # --------------------------
# # METRIC FUNCTIONS
# # --------------------------
# def check_instruction_following(prompt: str, response: str) -> float:
#     """Embedding-based similarity between prompt and response."""
#     if not prompt or not response:
#         return 0.0
#     p_emb = embed_model.encode(prompt, convert_to_tensor=True)
#     r_emb = embed_model.encode(response, convert_to_tensor=True)
#     sim = float(util.cos_sim(p_emb, r_emb).item())
#     return round(max(0.0, min(1.0, sim)), 3)


# def check_hallucination(reference: str, response: str) -> float:
#     """
#     Single hallucination score:
#     Entailment prob - Contradiction prob (normalized to [0,1]).
#     Higher = less hallucination.
#     """
#     if not reference or not response:
#         return 0.0
#     with torch.no_grad():
#         inputs = nli_tokenizer.encode_plus(reference, response, return_tensors="pt", truncation=True)
#         outputs = nli_model(**inputs)
#         probs = torch.softmax(outputs.logits, dim=-1).cpu().numpy()[0]

#     entail_prob, contra_prob = 0.0, 0.0
#     for idx, p in enumerate(probs):
#         label = id2label.get(idx, "")
#         if "ENTAIL" in label:
#             entail_prob = float(p)
#         elif "CONTRA" in label:
#             contra_prob = float(p)

#     score = entail_prob - contra_prob
#     score = (score + 1) / 2  # normalize [-1,1] → [0,1]
#     return round(max(0.0, min(1.0, score)), 3)


# def check_assumption(response: str) -> float:
#     """Detect speculative/hedging terms."""
#     if not response:
#         return 0.0
#     speculative_terms = ["maybe", "probably", "might", "perhaps", "i guess", "seems", "could"]
#     count = sum(1 for t in speculative_terms if t in response.lower())
#     score = 1.0 - min(count / 5.0, 1.0)  # smoother decay
#     return round(score, 3)


# def check_coherence(response: str) -> float:
#     """Heuristic coherence metric: penalizes very short/long, rewards sentence balance."""
#     if not response:
#         return 0.0
#     words = len(re.findall(r"\w+", response))
#     sents = max(1, len(re.split(r"[.!?]+", response)) - 1)
#     if words < 5:
#         return 0.3
#     if words > 200:
#         return 0.5
#     base = min(1.0, (words / 50.0) + (sents / 5.0))
#     return round(max(0.4, min(base, 0.95)), 3)


# def check_accuracy(reference: str, response: str) -> float:
#     """Semantic similarity between reference and response via embeddings (cosine)."""
#     if not reference or not response:
#         return 0.0
#     ref_emb = embed_model.encode(reference, convert_to_tensor=True)
#     resp_emb = embed_model.encode(response, convert_to_tensor=True)
#     sim = float(util.cos_sim(ref_emb, resp_emb).item())
#     return round(max(0.0, min(1.0, sim)), 3)


# # --------------------------
# # SCORING PIPELINE
# # --------------------------
# def compute_row_scores(prompt, response, reference) -> Dict:
#     instr = check_instruction_following(prompt, response)
#     halluc = check_hallucination(reference, response)
#     assum = check_assumption(response)
#     coh = check_coherence(response)
#     acc = check_accuracy(reference, response)

#     # Final score: average
#     components = [instr, halluc, assum, coh, acc]
#     final = round(float(sum(components) / len(components)), 3)

#     return {
#         "InstructionFollowing": instr,
#         "Hallucination": halluc,
#         "AssumptionControl": assum,
#         "Coherence": coh,
#         "Accuracy": acc,
#         "FinalScore": final,
#     }


# # --------------------------
# # VISUALIZATION HELPERS
# # --------------------------
# # def spider_net_multi(labels: List[str], rows: List[Dict], title: str, fill_alpha: float = 0.12):
# #     """Radar chart for multiple agents."""
# #     N = len(labels)
# #     angles = [n / float(N) * 2 * math.pi for n in range(N)]
# #     angles += angles[:1]

# #     fig = plt.figure(figsize=(6.5, 6.5))
# #     ax = plt.subplot(111, polar=True)
# #     ax.set_xticks(angles[:-1])
# #     ax.set_xticklabels(labels, fontsize=9)
# #     ax.set_ylim(0, 100)
# #     ax.set_yticks([0, 25, 50, 75, 100])

# #     for r in rows:
# #         values = r["values"]
# #         values_closed = values + values[:1]
# #         ax.plot(angles, values_closed, linewidth=1.5, label=r["name"])
# #         ax.fill(angles, values_closed, alpha=fill_alpha)

# #     ax.set_title(title, y=1.08, fontsize=12)
# #     ax.legend(loc="upper right", bbox_to_anchor=(1.25, 1.1))
# #     return fig


# # def heatmap_plot(df: pd.DataFrame, metric_cols: List[str], title: str = "Metric Correlations"):
# #     fig, ax = plt.subplots(figsize=(7, 5))
# #     sns.heatmap(df[metric_cols].corr(), annot=True, fmt=".2f", cmap="coolwarm", ax=ax)
# #     ax.set_title(title)
# #     return fig


# # --------------------------
# # HIGH-LEVEL EVALUATION
# # --------------------------
# def evaluate_dataframe(df: pd.DataFrame) -> Tuple[pd.DataFrame, List[Tuple[str, str]], pd.DataFrame]:
#     """
#     df must contain: prompt, response, task, agent, reference
#     Returns: metrics_df, [(image_path, caption)], leaderboard_df
#     """
#     df = df.rename(columns={c: c.strip() for c in df.columns})

#     rows = []
#     for _, r in df.iterrows():
#         prompt = r.get("prompt", "")
#         response = r.get("response", "")
#         reference = r.get("reference", "")
#         agent = r.get("agent", "Unknown")
#         task = r.get("task", "Unknown")

#         scores = compute_row_scores(prompt, response, reference)
#         entry = {
#             "Task": str(task).strip(),
#             "Agent": str(agent),
#             "Prompt": prompt,
#             "Response": response,
#             "Reference": reference,
#         }
#         entry.update(scores)
#         rows.append(entry)

#     metrics_df = pd.DataFrame(rows)

#     # Visualization artifacts
#     images = []
#     metric_labels = ["InstructionFollowing", "Hallucination", "AssumptionControl", "Coherence", "Accuracy"]

#     # Per-task radar and bar charts
#     for task, g in metrics_df.groupby("Task"):
#         series = []
#         for a in g["Agent"].unique():
#             subset = g[g["Agent"] == a]
#             vals = [round(float(subset[m].mean()) * 100, 2) for m in metric_labels]
#             series.append({"name": a, "values": vals})
#         if series:
#             fig = spider_net_multi(metric_labels, series, title=f"{task} — Agent Comparison")
#             fname = f"/tmp/{uuid.uuid4().hex}_{task}_radar.png"
#             fig.savefig(fname, bbox_inches="tight")
#             plt.close(fig)
#             images.append((fname, f"{task} - radar"))

#             fig2, ax = plt.subplots(figsize=(8, 4))
#             avg = g.groupby("Agent")[metric_labels].mean()
#             avg.plot(kind="bar", ax=ax)
#             ax.set_title(f"{task} — Average Metrics by Agent")
#             ax.set_ylabel("Score (0-1)")
#             plt.xticks(rotation=45)
#             fname2 = f"/tmp/{uuid.uuid4().hex}_{task}_bar.png"
#             fig2.savefig(fname2, bbox_inches="tight")
#             plt.close(fig2)
#             images.append((fname2, f"{task} - bar"))

#     # Global heatmap
#     metric_cols = metric_labels + ["FinalScore"]
#     figh = heatmap_plot(metrics_df, metric_cols)
#     fnameh = f"/tmp/{uuid.uuid4().hex}_heatmap.png"
#     figh.savefig(fnameh, bbox_inches="tight")
#     plt.close(figh)
#     images.append((fnameh, "Metric Correlations Heatmap"))

#     # Leaderboard
#     lb = metrics_df.groupby(["Agent", "Task"])["FinalScore"].mean().reset_index()
#     lb = lb.sort_values(["FinalScore"], ascending=False)

#     return metrics_df, images, lb


# # --------------------------
# # DEMO USAGE
# # --------------------------
# if __name__ == "__main__":
#     # Sample dataset
#     data = [
#         {"task": "Math QA", "agent": "AgentA", "prompt": "What is 2+2?", "response": "The answer is 4.", "reference": "2+2=4"},
#         {"task": "Math QA", "agent": "AgentB", "prompt": "What is 2+2?", "response": "It might be 5, but usually 4.", "reference": "2+2=4"},
#         {"task": "Summarization", "agent": "AgentA", "prompt": "Summarize: 'The cat sat on the mat. The dog barked.'", "response": "A cat sat while a dog barked.", "reference": "Cat on mat, dog barking."},
#     ]
#     df = pd.DataFrame(data)

#     metrics_df, images, leaderboard = evaluate_dataframe(df)

#     print("\n=== Metrics per response ===")
#     print(metrics_df[["Task", "Agent", "InstructionFollowing", "Hallucination", "AssumptionControl", "Coherence", "Accuracy", "FinalScore"]])

#     print("\n=== Leaderboard (average per task & agent) ===")
#     print(leaderboard)

#     print("\nVisualization files saved in /tmp/:")
#     for path, caption in images:
#         print(f"{caption}: {path}")

import re
import json
import torch
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import uuid
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sentence_transformers import SentenceTransformer, util

# --------------------------
# MODEL LOADING
# --------------------------
NLI_MODEL = "textattack/roberta-base-MNLI"
EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"

# Load NLI model & tokenizer
nli_tokenizer = AutoTokenizer.from_pretrained(NLI_MODEL)
nli_model = AutoModelForSequenceClassification.from_pretrained(NLI_MODEL)
nli_model.to("cpu")
nli_model.eval()

# Load embedding model
embed_model = SentenceTransformer(EMBED_MODEL)

# Label mapping from config
id2label = {int(k): v.upper() for k, v in nli_model.config.id2label.items()}


# --------------------------
# METRIC FUNCTIONS
# --------------------------
def check_instruction_following(prompt: str, response: str) -> float:
    """Embedding-based similarity between prompt and response."""
    if not prompt or not response:
        return 0.0
    p_emb = embed_model.encode(prompt, convert_to_tensor=True)
    r_emb = embed_model.encode(response, convert_to_tensor=True)
    sim = float(util.cos_sim(p_emb, r_emb).item())
    return round(max(0.0, min(1.0, sim)), 3)


def check_hallucination(reference: str, response: str) -> float:
    """
    Single hallucination score:
    Entailment prob - Contradiction prob (normalized to [0,1]).
    Higher = less hallucination.
    """
    if not reference or not response:
        return 0.0
    with torch.no_grad():
        inputs = nli_tokenizer.encode_plus(reference, response, return_tensors="pt", truncation=True)
        outputs = nli_model(**inputs)
        probs = torch.softmax(outputs.logits, dim=-1).cpu().numpy()[0]

    entail_prob, contra_prob = 0.0, 0.0
    for idx, p in enumerate(probs):
        label = id2label.get(idx, "")
        if "ENTAIL" in label:
            entail_prob = float(p)
        elif "CONTRA" in label:
            contra_prob = float(p)

    score = entail_prob - contra_prob
    score = (score + 1) / 2  # normalize [-1,1] → [0,1]
    return round(max(0.0, min(1.0, score)), 3)


def check_assumption(response: str) -> float:
    """Detect speculative/hedging terms."""
    if not response:
        return 0.0
    speculative_terms = ["maybe", "probably", "might", "perhaps", "i guess", "seems", "could"]
    count = sum(1 for t in speculative_terms if t in response.lower())
    score = 1.0 - min(count / 5.0, 1.0)  # smoother decay
    return round(score, 3)


def check_coherence(response: str) -> float:
    """Heuristic coherence metric: penalizes very short/long, rewards sentence balance."""
    if not response:
        return 0.0
    words = len(re.findall(r"\w+", response))
    sents = max(1, len(re.split(r"[.!?]+", response)) - 1)
    if words < 5:
        return 0.3
    if words > 200:
        return 0.5
    base = min(1.0, (words / 50.0) + (sents / 5.0))
    return round(max(0.4, min(base, 0.95)), 3)


def check_accuracy(reference: str, response: str) -> float:
    """Semantic similarity between reference and response via embeddings (cosine)."""
    if not reference or not response:
        return 0.0
    ref_emb = embed_model.encode(reference, convert_to_tensor=True)
    resp_emb = embed_model.encode(response, convert_to_tensor=True)
    sim = float(util.cos_sim(ref_emb, resp_emb).item())
    return round(max(0.0, min(1.0, sim)), 3)


# --------------------------
# ROW & DF EVALUATION
# --------------------------
def evaluate_row(row):
    prompt = row.get("prompt", "")
    response = row.get("response", "")
    reference = row.get("reference", "")

    metrics = {
        "task_id": row.get("task_id", ""),
        "agent": row.get("agent", ""),
        "instruction_following": check_instruction_following(prompt, response),
        "hallucination": check_hallucination(reference, response),
        "assumption": check_assumption(response),
        "coherence": check_coherence(response),
        "accuracy": check_accuracy(reference, response),
    }

    # Weighted avg score (you can adjust weights)
    metrics["final_score"] = round(
        0.25 * metrics["instruction_following"]
        + 0.25 * metrics["accuracy"]
        + 0.2 * metrics["hallucination"]
        + 0.15 * metrics["coherence"]
        + 0.15 * metrics["assumption"],
        3,
    )
    return metrics


def evaluate_dataframe(df: pd.DataFrame):
    metrics_df = df.apply(evaluate_row, axis=1, result_type="expand")

    # Leaderboard
    leaderboard = (
        metrics_df.groupby(["agent", "task_id"])["final_score"]
        .mean()
        .reset_index()
    )

    # Plots
    images = []
    out_dir = "/tmp/plots"
    os.makedirs(out_dir, exist_ok=True)

    # Histogram of scores
    plt.figure(figsize=(6, 4))
    sns.histplot(metrics_df["final_score"], bins=10, kde=False)
    plt.title("Distribution of Final Scores")
    hist_path = os.path.join(out_dir, f"hist_{uuid.uuid4().hex}.png")
    plt.savefig(hist_path)
    plt.close()
    images.append((hist_path, "Final Score Distribution"))

    # Per-agent average
    plt.figure(figsize=(6, 4))
    agent_scores = metrics_df.groupby("agent")["final_score"].mean().reset_index()
    sns.barplot(data=agent_scores, x="agent", y="final_score")
    plt.title("Average Final Score per Agent")
    bar_path = os.path.join(out_dir, f"bar_{uuid.uuid4().hex}.png")
    plt.savefig(bar_path)
    plt.close()
    images.append((bar_path, "Average Score per Agent"))

    return metrics_df, images, leaderboard