# """ # Evaluation module: loads models, computes metrics, and creates visualizations. # Lightweight, CPU-friendly, no Java required. # """ # import re # import math # import uuid # from typing import List, Dict, Tuple # import numpy as np # import pandas as pd # import matplotlib.pyplot as plt # import seaborn as sns # import torch # from transformers import AutoTokenizer, AutoModelForSequenceClassification # from sentence_transformers import SentenceTransformer, util # # -------------------------- # # MODEL LOADING # # -------------------------- # NLI_MODEL = "textattack/roberta-base-MNLI" # EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2" # # Load NLI model & tokenizer # nli_tokenizer = AutoTokenizer.from_pretrained(NLI_MODEL) # nli_model = AutoModelForSequenceClassification.from_pretrained(NLI_MODEL) # nli_model.to("cpu") # nli_model.eval() # # Load embedding model # embed_model = SentenceTransformer(EMBED_MODEL) # # Label mapping from config # id2label = {int(k): v.upper() for k, v in nli_model.config.id2label.items()} # # -------------------------- # # METRIC FUNCTIONS # # -------------------------- # def check_instruction_following(prompt: str, response: str) -> float: # """Embedding-based similarity between prompt and response.""" # if not prompt or not response: # return 0.0 # p_emb = embed_model.encode(prompt, convert_to_tensor=True) # r_emb = embed_model.encode(response, convert_to_tensor=True) # sim = float(util.cos_sim(p_emb, r_emb).item()) # return round(max(0.0, min(1.0, sim)), 3) # def check_hallucination(reference: str, response: str) -> float: # """ # Single hallucination score: # Entailment prob - Contradiction prob (normalized to [0,1]). # Higher = less hallucination. 
# """ # if not reference or not response: # return 0.0 # with torch.no_grad(): # inputs = nli_tokenizer.encode_plus(reference, response, return_tensors="pt", truncation=True) # outputs = nli_model(**inputs) # probs = torch.softmax(outputs.logits, dim=-1).cpu().numpy()[0] # entail_prob, contra_prob = 0.0, 0.0 # for idx, p in enumerate(probs): # label = id2label.get(idx, "") # if "ENTAIL" in label: # entail_prob = float(p) # elif "CONTRA" in label: # contra_prob = float(p) # score = entail_prob - contra_prob # score = (score + 1) / 2 # normalize [-1,1] → [0,1] # return round(max(0.0, min(1.0, score)), 3) # def check_assumption(response: str) -> float: # """Detect speculative/hedging terms.""" # if not response: # return 0.0 # speculative_terms = ["maybe", "probably", "might", "perhaps", "i guess", "seems", "could"] # count = sum(1 for t in speculative_terms if t in response.lower()) # score = 1.0 - min(count / 5.0, 1.0) # smoother decay # return round(score, 3) # def check_coherence(response: str) -> float: # """Heuristic coherence metric: penalizes very short/long, rewards sentence balance.""" # if not response: # return 0.0 # words = len(re.findall(r"\w+", response)) # sents = max(1, len(re.split(r"[.!?]+", response)) - 1) # if words < 5: # return 0.3 # if words > 200: # return 0.5 # base = min(1.0, (words / 50.0) + (sents / 5.0)) # return round(max(0.4, min(base, 0.95)), 3) # def check_accuracy(reference: str, response: str) -> float: # """Semantic similarity between reference and response via embeddings (cosine).""" # if not reference or not response: # return 0.0 # ref_emb = embed_model.encode(reference, convert_to_tensor=True) # resp_emb = embed_model.encode(response, convert_to_tensor=True) # sim = float(util.cos_sim(ref_emb, resp_emb).item()) # return round(max(0.0, min(1.0, sim)), 3) # # -------------------------- # # SCORING PIPELINE # # -------------------------- # def compute_row_scores(prompt, response, reference) -> Dict: # instr = 
check_instruction_following(prompt, response) # halluc = check_hallucination(reference, response) # assum = check_assumption(response) # coh = check_coherence(response) # acc = check_accuracy(reference, response) # # Final score: average # components = [instr, halluc, assum, coh, acc] # final = round(float(sum(components) / len(components)), 3) # return { # "InstructionFollowing": instr, # "Hallucination": halluc, # "AssumptionControl": assum, # "Coherence": coh, # "Accuracy": acc, # "FinalScore": final, # } # # -------------------------- # # VISUALIZATION HELPERS # # -------------------------- # # def spider_net_multi(labels: List[str], rows: List[Dict], title: str, fill_alpha: float = 0.12): # # """Radar chart for multiple agents.""" # # N = len(labels) # # angles = [n / float(N) * 2 * math.pi for n in range(N)] # # angles += angles[:1] # # fig = plt.figure(figsize=(6.5, 6.5)) # # ax = plt.subplot(111, polar=True) # # ax.set_xticks(angles[:-1]) # # ax.set_xticklabels(labels, fontsize=9) # # ax.set_ylim(0, 100) # # ax.set_yticks([0, 25, 50, 75, 100]) # # for r in rows: # # values = r["values"] # # values_closed = values + values[:1] # # ax.plot(angles, values_closed, linewidth=1.5, label=r["name"]) # # ax.fill(angles, values_closed, alpha=fill_alpha) # # ax.set_title(title, y=1.08, fontsize=12) # # ax.legend(loc="upper right", bbox_to_anchor=(1.25, 1.1)) # # return fig # # def heatmap_plot(df: pd.DataFrame, metric_cols: List[str], title: str = "Metric Correlations"): # # fig, ax = plt.subplots(figsize=(7, 5)) # # sns.heatmap(df[metric_cols].corr(), annot=True, fmt=".2f", cmap="coolwarm", ax=ax) # # ax.set_title(title) # # return fig # # -------------------------- # # HIGH-LEVEL EVALUATION # # -------------------------- # def evaluate_dataframe(df: pd.DataFrame) -> Tuple[pd.DataFrame, List[Tuple[str, str]], pd.DataFrame]: # """ # df must contain: prompt, response, task, agent, reference # Returns: metrics_df, [(image_path, caption)], leaderboard_df # """ # df = 
df.rename(columns={c: c.strip() for c in df.columns}) # rows = [] # for _, r in df.iterrows(): # prompt = r.get("prompt", "") # response = r.get("response", "") # reference = r.get("reference", "") # agent = r.get("agent", "Unknown") # task = r.get("task", "Unknown") # scores = compute_row_scores(prompt, response, reference) # entry = { # "Task": str(task).strip(), # "Agent": str(agent), # "Prompt": prompt, # "Response": response, # "Reference": reference, # } # entry.update(scores) # rows.append(entry) # metrics_df = pd.DataFrame(rows) # # Visualization artifacts # images = [] # metric_labels = ["InstructionFollowing", "Hallucination", "AssumptionControl", "Coherence", "Accuracy"] # # Per-task radar and bar charts # for task, g in metrics_df.groupby("Task"): # series = [] # for a in g["Agent"].unique(): # subset = g[g["Agent"] == a] # vals = [round(float(subset[m].mean()) * 100, 2) for m in metric_labels] # series.append({"name": a, "values": vals}) # if series: # fig = spider_net_multi(metric_labels, series, title=f"{task} — Agent Comparison") # fname = f"/tmp/{uuid.uuid4().hex}_{task}_radar.png" # fig.savefig(fname, bbox_inches="tight") # plt.close(fig) # images.append((fname, f"{task} - radar")) # fig2, ax = plt.subplots(figsize=(8, 4)) # avg = g.groupby("Agent")[metric_labels].mean() # avg.plot(kind="bar", ax=ax) # ax.set_title(f"{task} — Average Metrics by Agent") # ax.set_ylabel("Score (0-1)") # plt.xticks(rotation=45) # fname2 = f"/tmp/{uuid.uuid4().hex}_{task}_bar.png" # fig2.savefig(fname2, bbox_inches="tight") # plt.close(fig2) # images.append((fname2, f"{task} - bar")) # # Global heatmap # metric_cols = metric_labels + ["FinalScore"] # figh = heatmap_plot(metrics_df, metric_cols) # fnameh = f"/tmp/{uuid.uuid4().hex}_heatmap.png" # figh.savefig(fnameh, bbox_inches="tight") # plt.close(figh) # images.append((fnameh, "Metric Correlations Heatmap")) # # Leaderboard # lb = metrics_df.groupby(["Agent", "Task"])["FinalScore"].mean().reset_index() # lb = 
lb.sort_values(["FinalScore"], ascending=False) # return metrics_df, images, lb # # -------------------------- # # DEMO USAGE # # -------------------------- # if __name__ == "__main__": # # Sample dataset # data = [ # {"task": "Math QA", "agent": "AgentA", "prompt": "What is 2+2?", "response": "The answer is 4.", "reference": "2+2=4"}, # {"task": "Math QA", "agent": "AgentB", "prompt": "What is 2+2?", "response": "It might be 5, but usually 4.", "reference": "2+2=4"}, # {"task": "Summarization", "agent": "AgentA", "prompt": "Summarize: 'The cat sat on the mat. The dog barked.'", "response": "A cat sat while a dog barked.", "reference": "Cat on mat, dog barking."}, # ] # df = pd.DataFrame(data) # metrics_df, images, leaderboard = evaluate_dataframe(df) # print("\n=== Metrics per response ===") # print(metrics_df[["Task", "Agent", "InstructionFollowing", "Hallucination", "AssumptionControl", "Coherence", "Accuracy", "FinalScore"]]) # print("\n=== Leaderboard (average per task & agent) ===") # print(leaderboard) # print("\nVisualization files saved in /tmp/:") # for path, caption in images: # print(f"{caption}: {path}") import re import json import torch import pandas as pd import matplotlib.pyplot as plt import seaborn as sns import os import uuid from transformers import AutoTokenizer, AutoModelForSequenceClassification from sentence_transformers import SentenceTransformer, util # -------------------------- # MODEL LOADING # -------------------------- NLI_MODEL = "textattack/roberta-base-MNLI" EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2" # Load NLI model & tokenizer nli_tokenizer = AutoTokenizer.from_pretrained(NLI_MODEL) nli_model = AutoModelForSequenceClassification.from_pretrained(NLI_MODEL) nli_model.to("cpu") nli_model.eval() # Load embedding model embed_model = SentenceTransformer(EMBED_MODEL) # Label mapping from config id2label = {int(k): v.upper() for k, v in nli_model.config.id2label.items()} # -------------------------- # METRIC FUNCTIONS # 
def _embedding_similarity(text_a: str, text_b: str) -> float:
    """Cosine similarity between two texts via sentence embeddings, clamped to [0, 1]."""
    emb_a = embed_model.encode(text_a, convert_to_tensor=True)
    emb_b = embed_model.encode(text_b, convert_to_tensor=True)
    sim = float(util.cos_sim(emb_a, emb_b).item())
    return round(max(0.0, min(1.0, sim)), 3)


def check_instruction_following(prompt: str, response: str) -> float:
    """Embedding-based similarity between prompt and response.

    Returns 0.0 when either text is empty, otherwise a score in [0, 1].
    """
    if not prompt or not response:
        return 0.0
    return _embedding_similarity(prompt, response)


def check_hallucination(reference: str, response: str) -> float:
    """Single hallucination score from NLI.

    Entailment prob minus contradiction prob, normalized to [0, 1].
    Higher = less hallucination. Returns 0.0 when either text is empty.
    """
    if not reference or not response:
        return 0.0
    with torch.no_grad():
        # Tokenizer __call__ replaces the deprecated encode_plus API.
        inputs = nli_tokenizer(reference, response, return_tensors="pt", truncation=True)
        outputs = nli_model(**inputs)
        probs = torch.softmax(outputs.logits, dim=-1).cpu().numpy()[0]
    entail_prob, contra_prob = 0.0, 0.0
    for idx, p in enumerate(probs):
        label = id2label.get(idx, "")
        if "ENTAIL" in label:
            entail_prob = float(p)
        elif "CONTRA" in label:
            contra_prob = float(p)
    score = (entail_prob - contra_prob + 1) / 2  # normalize [-1, 1] -> [0, 1]
    return round(max(0.0, min(1.0, score)), 3)


def check_assumption(response: str) -> float:
    """Detect speculative/hedging terms; more hedging lowers the score.

    Uses word-boundary matching so e.g. "could" does not match inside
    "couldn't" and "might" does not match inside "mighty".
    """
    if not response:
        return 0.0
    speculative_terms = ["maybe", "probably", "might", "perhaps", "i guess", "seems", "could"]
    lowered = response.lower()
    count = sum(1 for t in speculative_terms if re.search(rf"\b{re.escape(t)}\b", lowered))
    score = 1.0 - min(count / 5.0, 1.0)  # smoother decay
    return round(score, 3)


def check_coherence(response: str) -> float:
    """Heuristic coherence metric: penalizes very short/long, rewards sentence balance."""
    if not response:
        return 0.0
    words = len(re.findall(r"\w+", response))
    # Sentence count: splitting on terminators yields a trailing empty piece,
    # hence the -1; floor at 1 so unterminated text still counts as one sentence.
    sents = max(1, len(re.split(r"[.!?]+", response)) - 1)
    if words < 5:
        return 0.3
    if words > 200:
        return 0.5
    base = min(1.0, (words / 50.0) + (sents / 5.0))
    return round(max(0.4, min(base, 0.95)), 3)


def check_accuracy(reference: str, response: str) -> float:
    """Semantic similarity between reference and response via embeddings (cosine)."""
    if not reference or not response:
        return 0.0
    return _embedding_similarity(reference, response)
# --------------------------
# ROW & DF EVALUATION
# --------------------------
def evaluate_row(row):
    """Score one dataset row.

    `row` must expose .get() (a pandas Series or dict) with keys:
    prompt, response, reference, task_id, agent. Returns a dict of
    per-metric scores plus a weighted final_score in [0, 1].
    """
    prompt = row.get("prompt", "")
    response = row.get("response", "")
    reference = row.get("reference", "")
    metrics = {
        "task_id": row.get("task_id", ""),
        "agent": row.get("agent", ""),
        "instruction_following": check_instruction_following(prompt, response),
        "hallucination": check_hallucination(reference, response),
        "assumption": check_assumption(response),
        "coherence": check_coherence(response),
        "accuracy": check_accuracy(reference, response),
    }
    # Weighted average; weights sum to 1.0 (you can adjust weights).
    metrics["final_score"] = round(
        0.25 * metrics["instruction_following"]
        + 0.25 * metrics["accuracy"]
        + 0.2 * metrics["hallucination"]
        + 0.15 * metrics["coherence"]
        + 0.15 * metrics["assumption"],
        3,
    )
    return metrics


def evaluate_dataframe(df: pd.DataFrame):
    """Evaluate every row of `df` and build summary artifacts.

    Returns (metrics_df, images, leaderboard):
      - metrics_df: one scored row per input row;
      - images: list of (path, caption) tuples for plots written to /tmp/plots;
      - leaderboard: mean final_score per (agent, task_id), best first.
    """
    # Guard: an empty input would otherwise crash (apply yields no columns,
    # then the groupby raises KeyError).
    if df.empty:
        empty = pd.DataFrame(columns=["agent", "task_id", "final_score"])
        return empty, [], empty.copy()

    metrics_df = df.apply(evaluate_row, axis=1, result_type="expand")

    # Leaderboard, ranked by mean final score (descending).
    leaderboard = (
        metrics_df.groupby(["agent", "task_id"])["final_score"]
        .mean()
        .reset_index()
        .sort_values("final_score", ascending=False)
        .reset_index(drop=True)
    )

    # Plots — use explicit figure handles rather than matplotlib's implicit
    # "current figure" state, so concurrent plotting can't cross-contaminate.
    images = []
    out_dir = "/tmp/plots"
    os.makedirs(out_dir, exist_ok=True)

    # Histogram of final scores.
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.histplot(metrics_df["final_score"], bins=10, kde=False, ax=ax)
    ax.set_title("Distribution of Final Scores")
    hist_path = os.path.join(out_dir, f"hist_{uuid.uuid4().hex}.png")
    fig.savefig(hist_path)
    plt.close(fig)
    images.append((hist_path, "Final Score Distribution"))

    # Per-agent average final score.
    fig, ax = plt.subplots(figsize=(6, 4))
    agent_scores = metrics_df.groupby("agent")["final_score"].mean().reset_index()
    sns.barplot(data=agent_scores, x="agent", y="final_score", ax=ax)
    ax.set_title("Average Final Score per Agent")
    bar_path = os.path.join(out_dir, f"bar_{uuid.uuid4().hex}.png")
    fig.savefig(bar_path)
    plt.close(fig)
    images.append((bar_path, "Average Score per Agent"))

    return metrics_df, images, leaderboard