Spaces:
Runtime error
Runtime error
| # """ | |
| # Evaluation module: loads models, computes metrics, and creates visualizations. | |
| # Lightweight, CPU-friendly, no Java required. | |
| # """ | |
| # import re | |
| # import math | |
| # import uuid | |
| # from typing import List, Dict, Tuple | |
| # import numpy as np | |
| # import pandas as pd | |
| # import matplotlib.pyplot as plt | |
| # import seaborn as sns | |
| # import torch | |
| # from transformers import AutoTokenizer, AutoModelForSequenceClassification | |
| # from sentence_transformers import SentenceTransformer, util | |
| # # -------------------------- | |
| # # MODEL LOADING | |
| # # -------------------------- | |
| # NLI_MODEL = "textattack/roberta-base-MNLI" | |
| # EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2" | |
| # # Load NLI model & tokenizer | |
| # nli_tokenizer = AutoTokenizer.from_pretrained(NLI_MODEL) | |
| # nli_model = AutoModelForSequenceClassification.from_pretrained(NLI_MODEL) | |
| # nli_model.to("cpu") | |
| # nli_model.eval() | |
| # # Load embedding model | |
| # embed_model = SentenceTransformer(EMBED_MODEL) | |
| # # Label mapping from config | |
| # id2label = {int(k): v.upper() for k, v in nli_model.config.id2label.items()} | |
| # # -------------------------- | |
| # # METRIC FUNCTIONS | |
| # # -------------------------- | |
| # def check_instruction_following(prompt: str, response: str) -> float: | |
| # """Embedding-based similarity between prompt and response.""" | |
| # if not prompt or not response: | |
| # return 0.0 | |
| # p_emb = embed_model.encode(prompt, convert_to_tensor=True) | |
| # r_emb = embed_model.encode(response, convert_to_tensor=True) | |
| # sim = float(util.cos_sim(p_emb, r_emb).item()) | |
| # return round(max(0.0, min(1.0, sim)), 3) | |
| # def check_hallucination(reference: str, response: str) -> float: | |
| # """ | |
| # Single hallucination score: | |
| # Entailment prob - Contradiction prob (normalized to [0,1]). | |
| # Higher = less hallucination. | |
| # """ | |
| # if not reference or not response: | |
| # return 0.0 | |
| # with torch.no_grad(): | |
| # inputs = nli_tokenizer.encode_plus(reference, response, return_tensors="pt", truncation=True) | |
| # outputs = nli_model(**inputs) | |
| # probs = torch.softmax(outputs.logits, dim=-1).cpu().numpy()[0] | |
| # entail_prob, contra_prob = 0.0, 0.0 | |
| # for idx, p in enumerate(probs): | |
| # label = id2label.get(idx, "") | |
| # if "ENTAIL" in label: | |
| # entail_prob = float(p) | |
| # elif "CONTRA" in label: | |
| # contra_prob = float(p) | |
| # score = entail_prob - contra_prob | |
| # score = (score + 1) / 2 # normalize [-1,1] → [0,1] | |
| # return round(max(0.0, min(1.0, score)), 3) | |
| # def check_assumption(response: str) -> float: | |
| # """Detect speculative/hedging terms.""" | |
| # if not response: | |
| # return 0.0 | |
| # speculative_terms = ["maybe", "probably", "might", "perhaps", "i guess", "seems", "could"] | |
| # count = sum(1 for t in speculative_terms if t in response.lower()) | |
| # score = 1.0 - min(count / 5.0, 1.0) # smoother decay | |
| # return round(score, 3) | |
| # def check_coherence(response: str) -> float: | |
| # """Heuristic coherence metric: penalizes very short/long, rewards sentence balance.""" | |
| # if not response: | |
| # return 0.0 | |
| # words = len(re.findall(r"\w+", response)) | |
| # sents = max(1, len(re.split(r"[.!?]+", response)) - 1) | |
| # if words < 5: | |
| # return 0.3 | |
| # if words > 200: | |
| # return 0.5 | |
| # base = min(1.0, (words / 50.0) + (sents / 5.0)) | |
| # return round(max(0.4, min(base, 0.95)), 3) | |
| # def check_accuracy(reference: str, response: str) -> float: | |
| # """Semantic similarity between reference and response via embeddings (cosine).""" | |
| # if not reference or not response: | |
| # return 0.0 | |
| # ref_emb = embed_model.encode(reference, convert_to_tensor=True) | |
| # resp_emb = embed_model.encode(response, convert_to_tensor=True) | |
| # sim = float(util.cos_sim(ref_emb, resp_emb).item()) | |
| # return round(max(0.0, min(1.0, sim)), 3) | |
| # # -------------------------- | |
| # # SCORING PIPELINE | |
| # # -------------------------- | |
| # def compute_row_scores(prompt, response, reference) -> Dict: | |
| # instr = check_instruction_following(prompt, response) | |
| # halluc = check_hallucination(reference, response) | |
| # assum = check_assumption(response) | |
| # coh = check_coherence(response) | |
| # acc = check_accuracy(reference, response) | |
| # # Final score: average | |
| # components = [instr, halluc, assum, coh, acc] | |
| # final = round(float(sum(components) / len(components)), 3) | |
| # return { | |
| # "InstructionFollowing": instr, | |
| # "Hallucination": halluc, | |
| # "AssumptionControl": assum, | |
| # "Coherence": coh, | |
| # "Accuracy": acc, | |
| # "FinalScore": final, | |
| # } | |
| # # -------------------------- | |
| # # VISUALIZATION HELPERS | |
| # # -------------------------- | |
| # # def spider_net_multi(labels: List[str], rows: List[Dict], title: str, fill_alpha: float = 0.12): | |
| # # """Radar chart for multiple agents.""" | |
| # # N = len(labels) | |
| # # angles = [n / float(N) * 2 * math.pi for n in range(N)] | |
| # # angles += angles[:1] | |
| # # fig = plt.figure(figsize=(6.5, 6.5)) | |
| # # ax = plt.subplot(111, polar=True) | |
| # # ax.set_xticks(angles[:-1]) | |
| # # ax.set_xticklabels(labels, fontsize=9) | |
| # # ax.set_ylim(0, 100) | |
| # # ax.set_yticks([0, 25, 50, 75, 100]) | |
| # # for r in rows: | |
| # # values = r["values"] | |
| # # values_closed = values + values[:1] | |
| # # ax.plot(angles, values_closed, linewidth=1.5, label=r["name"]) | |
| # # ax.fill(angles, values_closed, alpha=fill_alpha) | |
| # # ax.set_title(title, y=1.08, fontsize=12) | |
| # # ax.legend(loc="upper right", bbox_to_anchor=(1.25, 1.1)) | |
| # # return fig | |
| # # def heatmap_plot(df: pd.DataFrame, metric_cols: List[str], title: str = "Metric Correlations"): | |
| # # fig, ax = plt.subplots(figsize=(7, 5)) | |
| # # sns.heatmap(df[metric_cols].corr(), annot=True, fmt=".2f", cmap="coolwarm", ax=ax) | |
| # # ax.set_title(title) | |
| # # return fig | |
| # # -------------------------- | |
| # # HIGH-LEVEL EVALUATION | |
| # # -------------------------- | |
| # def evaluate_dataframe(df: pd.DataFrame) -> Tuple[pd.DataFrame, List[Tuple[str, str]], pd.DataFrame]: | |
| # """ | |
| # df must contain: prompt, response, task, agent, reference | |
| # Returns: metrics_df, [(image_path, caption)], leaderboard_df | |
| # """ | |
| # df = df.rename(columns={c: c.strip() for c in df.columns}) | |
| # rows = [] | |
| # for _, r in df.iterrows(): | |
| # prompt = r.get("prompt", "") | |
| # response = r.get("response", "") | |
| # reference = r.get("reference", "") | |
| # agent = r.get("agent", "Unknown") | |
| # task = r.get("task", "Unknown") | |
| # scores = compute_row_scores(prompt, response, reference) | |
| # entry = { | |
| # "Task": str(task).strip(), | |
| # "Agent": str(agent), | |
| # "Prompt": prompt, | |
| # "Response": response, | |
| # "Reference": reference, | |
| # } | |
| # entry.update(scores) | |
| # rows.append(entry) | |
| # metrics_df = pd.DataFrame(rows) | |
| # # Visualization artifacts | |
| # images = [] | |
| # metric_labels = ["InstructionFollowing", "Hallucination", "AssumptionControl", "Coherence", "Accuracy"] | |
| # # Per-task radar and bar charts | |
| # for task, g in metrics_df.groupby("Task"): | |
| # series = [] | |
| # for a in g["Agent"].unique(): | |
| # subset = g[g["Agent"] == a] | |
| # vals = [round(float(subset[m].mean()) * 100, 2) for m in metric_labels] | |
| # series.append({"name": a, "values": vals}) | |
| # if series: | |
| # fig = spider_net_multi(metric_labels, series, title=f"{task} β Agent Comparison") | |
| # fname = f"/tmp/{uuid.uuid4().hex}_{task}_radar.png" | |
| # fig.savefig(fname, bbox_inches="tight") | |
| # plt.close(fig) | |
| # images.append((fname, f"{task} - radar")) | |
| # fig2, ax = plt.subplots(figsize=(8, 4)) | |
| # avg = g.groupby("Agent")[metric_labels].mean() | |
| # avg.plot(kind="bar", ax=ax) | |
| # ax.set_title(f"{task} β Average Metrics by Agent") | |
| # ax.set_ylabel("Score (0-1)") | |
| # plt.xticks(rotation=45) | |
| # fname2 = f"/tmp/{uuid.uuid4().hex}_{task}_bar.png" | |
| # fig2.savefig(fname2, bbox_inches="tight") | |
| # plt.close(fig2) | |
| # images.append((fname2, f"{task} - bar")) | |
| # # Global heatmap | |
| # metric_cols = metric_labels + ["FinalScore"] | |
| # figh = heatmap_plot(metrics_df, metric_cols) | |
| # fnameh = f"/tmp/{uuid.uuid4().hex}_heatmap.png" | |
| # figh.savefig(fnameh, bbox_inches="tight") | |
| # plt.close(figh) | |
| # images.append((fnameh, "Metric Correlations Heatmap")) | |
| # # Leaderboard | |
| # lb = metrics_df.groupby(["Agent", "Task"])["FinalScore"].mean().reset_index() | |
| # lb = lb.sort_values(["FinalScore"], ascending=False) | |
| # return metrics_df, images, lb | |
| # # -------------------------- | |
| # # DEMO USAGE | |
| # # -------------------------- | |
| # if __name__ == "__main__": | |
| # # Sample dataset | |
| # data = [ | |
| # {"task": "Math QA", "agent": "AgentA", "prompt": "What is 2+2?", "response": "The answer is 4.", "reference": "2+2=4"}, | |
| # {"task": "Math QA", "agent": "AgentB", "prompt": "What is 2+2?", "response": "It might be 5, but usually 4.", "reference": "2+2=4"}, | |
| # {"task": "Summarization", "agent": "AgentA", "prompt": "Summarize: 'The cat sat on the mat. The dog barked.'", "response": "A cat sat while a dog barked.", "reference": "Cat on mat, dog barking."}, | |
| # ] | |
| # df = pd.DataFrame(data) | |
| # metrics_df, images, leaderboard = evaluate_dataframe(df) | |
| # print("\n=== Metrics per response ===") | |
| # print(metrics_df[["Task", "Agent", "InstructionFollowing", "Hallucination", "AssumptionControl", "Coherence", "Accuracy", "FinalScore"]]) | |
| # print("\n=== Leaderboard (average per task & agent) ===") | |
| # print(leaderboard) | |
| # print("\nVisualization files saved in /tmp/:") | |
| # for path, caption in images: | |
| # print(f"{caption}: {path}") | |
| import re | |
| import json | |
| import torch | |
| import pandas as pd | |
| import matplotlib.pyplot as plt | |
| import seaborn as sns | |
| import os | |
| import uuid | |
| from transformers import AutoTokenizer, AutoModelForSequenceClassification | |
| from sentence_transformers import SentenceTransformer, util | |
# --------------------------
# MODEL LOADING
# --------------------------
# Hub identifiers of the two checkpoints loaded at import time.
NLI_MODEL = "textattack/roberta-base-MNLI"
EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"

# NLI classifier plus its tokenizer; the model is kept on CPU in eval mode.
nli_tokenizer = AutoTokenizer.from_pretrained(NLI_MODEL)
nli_model = AutoModelForSequenceClassification.from_pretrained(NLI_MODEL)
nli_model.to("cpu").eval()

# Sentence-embedding model used by the similarity-based metrics.
embed_model = SentenceTransformer(EMBED_MODEL)

# Class index -> upper-cased label name, taken from the NLI model config.
id2label = {
    int(class_idx): label_name.upper()
    for class_idx, label_name in nli_model.config.id2label.items()
}
| # -------------------------- | |
| # METRIC FUNCTIONS | |
| # -------------------------- | |
def check_instruction_following(prompt: str, response: str) -> float:
    """Score how closely the response tracks the prompt.

    Both texts are embedded with the module-level ``embed_model`` and
    compared by cosine similarity.

    Args:
        prompt: The instruction text.
        response: The text produced for that instruction.

    Returns:
        The cosine similarity clamped to [0.0, 1.0] and rounded to three
        decimals, or 0.0 when either argument is falsy.
    """
    if prompt and response:
        prompt_vec = embed_model.encode(prompt, convert_to_tensor=True)
        response_vec = embed_model.encode(response, convert_to_tensor=True)
        cosine = float(util.cos_sim(prompt_vec, response_vec).item())
        # Cap at 1.0 first, then floor negative similarities at 0.0.
        capped = min(1.0, cosine)
        clamped = max(0.0, capped)
        return round(clamped, 3)
    return 0.0
def check_hallucination(reference: str, response: str) -> float:
    """Score how well the response is supported by the reference via NLI.

    The (reference, response) pair is fed to the module-level NLI
    classifier. The score is ``P(entailment) - P(contradiction)`` mapped
    from [-1, 1] onto [0, 1]; higher means less hallucination.

    Args:
        reference: Ground-truth text, passed as the first sequence.
        response: Text under evaluation, passed as the second sequence.

    Returns:
        Score in [0.0, 1.0] rounded to three decimals; 0.0 when either
        argument is falsy.
    """
    if not reference or not response:
        return 0.0

    with torch.no_grad():
        # Call the tokenizer directly: ``encode_plus`` is deprecated in
        # favour of ``__call__``, which takes the same (text, text_pair)
        # arguments and returns the same encoding.
        inputs = nli_tokenizer(reference, response, return_tensors="pt", truncation=True)
        outputs = nli_model(**inputs)
        probs = torch.softmax(outputs.logits, dim=-1).cpu().numpy()[0]

    # Labels are matched by substring against the upper-cased config names.
    # NOTE(review): if the checkpoint's config only carries generic names
    # (e.g. "LABEL_0"), neither branch fires, both probabilities stay 0.0
    # and the score is always 0.5 -- confirm id2label for NLI_MODEL.
    entail_prob, contra_prob = 0.0, 0.0
    for idx, p in enumerate(probs):
        label = id2label.get(idx, "")
        if "ENTAIL" in label:
            entail_prob = float(p)
        elif "CONTRA" in label:
            contra_prob = float(p)

    score = entail_prob - contra_prob
    score = (score + 1) / 2  # normalize [-1, 1] -> [0, 1]
    return round(max(0.0, min(1.0, score)), 3)
| def check_assumption(response: str) -> float: | |
| """Detect speculative/hedging terms.""" | |
| if not response: | |
| return 0.0 | |
| speculative_terms = ["maybe", "probably", "might", "perhaps", "i guess", "seems", "could"] | |
| count = sum(1 for t in speculative_terms if t in response.lower()) | |
| score = 1.0 - min(count / 5.0, 1.0) # smoother decay | |
| return round(score, 3) | |
def check_coherence(response: str) -> float:
    """Heuristic coherence score based on word and sentence counts.

    Responses with fewer than 5 words score a flat 0.3 and those with more
    than 200 words a flat 0.5. Otherwise the score is
    ``words / 50 + sentences / 5`` bounded to [0.4, 0.95], where the
    sentence count is the number of pieces produced by splitting on runs
    of ``.``, ``!`` or ``?``, minus one, and never less than 1.

    Args:
        response: Text under evaluation.

    Returns:
        Score rounded to three decimals; 0.0 when ``response`` is falsy.
    """
    if not response:
        return 0.0

    word_count = len(re.findall(r"\w+", response))
    if word_count < 5:
        return 0.3
    if word_count > 200:
        return 0.5

    fragments = re.split(r"[.!?]+", response)
    sentence_count = max(1, len(fragments) - 1)
    raw = (word_count / 50.0) + (sentence_count / 5.0)
    # Keep the heuristic inside its floor and ceiling.
    bounded = min(max(raw, 0.4), 0.95)
    return round(bounded, 3)
def check_accuracy(reference: str, response: str) -> float:
    """Score semantic agreement between the reference and the response.

    Both texts are embedded with the module-level ``embed_model`` and
    compared by cosine similarity.

    Args:
        reference: Ground-truth text.
        response: Text under evaluation.

    Returns:
        The cosine similarity clamped to [0.0, 1.0] and rounded to three
        decimals, or 0.0 when either argument is falsy.
    """
    if not (reference and response):
        return 0.0

    reference_vec = embed_model.encode(reference, convert_to_tensor=True)
    response_vec = embed_model.encode(response, convert_to_tensor=True)
    similarity = util.cos_sim(reference_vec, response_vec).item()
    similarity = float(similarity)

    # Cap at 1.0, then floor at 0.0, before rounding.
    upper_bounded = min(1.0, similarity)
    return round(max(0.0, upper_bounded), 3)
| # -------------------------- | |
| # ROW & DF EVALUATION | |
| # -------------------------- | |
def evaluate_row(row):
    """Compute every metric and the weighted final score for one record.

    Args:
        row: Record exposing ``.get(key, default)``. The keys ``prompt``,
            ``response``, ``reference``, ``task_id`` and ``agent`` are
            read, each defaulting to "" when absent.

    Returns:
        Dict with, in order: ``task_id``, ``agent``,
        ``instruction_following``, ``hallucination``, ``assumption``,
        ``coherence``, ``accuracy`` and ``final_score`` (the weighted sum
        of the five metrics, rounded to three decimals).
    """
    prompt = row.get("prompt", "")
    response = row.get("response", "")
    reference = row.get("reference", "")

    result = {"task_id": row.get("task_id", ""), "agent": row.get("agent", "")}
    result["instruction_following"] = check_instruction_following(prompt, response)
    result["hallucination"] = check_hallucination(reference, response)
    result["assumption"] = check_assumption(response)
    result["coherence"] = check_coherence(response)
    result["accuracy"] = check_accuracy(reference, response)

    # Weights sum to 1.0: prompt adherence and accuracy count most.
    weighted_total = (
        0.25 * result["instruction_following"]
        + 0.25 * result["accuracy"]
        + 0.2 * result["hallucination"]
        + 0.15 * result["coherence"]
        + 0.15 * result["assumption"]
    )
    result["final_score"] = round(weighted_total, 3)
    return result
def evaluate_dataframe(df: pd.DataFrame):
    """Score every row of ``df`` and build a leaderboard plus summary plots.

    Args:
        df: Input frame; each row is passed to ``evaluate_row``.

    Returns:
        Tuple ``(metrics_df, images, leaderboard)`` where ``metrics_df``
        holds the per-row output of ``evaluate_row``, ``images`` is a list
        of ``(png_path, caption)`` pairs for the plots written under
        ``/tmp/plots``, and ``leaderboard`` is the mean ``final_score``
        per ``(agent, task_id)``.
    """
    metrics_df = df.apply(evaluate_row, axis=1, result_type="expand")

    # Leaderboard: mean final score for each agent/task pair.
    grouped_scores = metrics_df.groupby(["agent", "task_id"])["final_score"]
    leaderboard = grouped_scores.mean().reset_index()

    images = []
    out_dir = "/tmp/plots"
    os.makedirs(out_dir, exist_ok=True)

    # Plot 1: histogram of the final scores.
    hist_fig, hist_ax = plt.subplots(figsize=(6, 4))
    sns.histplot(metrics_df["final_score"], bins=10, kde=False, ax=hist_ax)
    hist_ax.set_title("Distribution of Final Scores")
    hist_path = os.path.join(out_dir, f"hist_{uuid.uuid4().hex}.png")
    hist_fig.savefig(hist_path)
    plt.close(hist_fig)
    images.append((hist_path, "Final Score Distribution"))

    # Plot 2: mean final score per agent.
    bar_fig, bar_ax = plt.subplots(figsize=(6, 4))
    agent_scores = metrics_df.groupby("agent")["final_score"].mean().reset_index()
    sns.barplot(data=agent_scores, x="agent", y="final_score", ax=bar_ax)
    bar_ax.set_title("Average Final Score per Agent")
    bar_path = os.path.join(out_dir, f"bar_{uuid.uuid4().hex}.png")
    bar_fig.savefig(bar_path)
    plt.close(bar_fig)
    images.append((bar_path, "Average Score per Agent"))

    return metrics_df, images, leaderboard