Spaces:
Runtime error
Runtime error
Update evaluator.py
#4
by
manayporwal07
- opened
- evaluator.py +326 -157
evaluator.py
CHANGED
|
@@ -1,18 +1,278 @@
|
|
| 1 |
-
"""
|
| 2 |
-
Evaluation module: loads models, computes metrics, and creates visualizations.
|
| 3 |
-
Lightweight, CPU-friendly, no Java required.
|
| 4 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
|
| 6 |
import re
|
| 7 |
-
import
|
| 8 |
-
import
|
| 9 |
-
from typing import List, Dict, Tuple
|
| 10 |
-
|
| 11 |
-
import numpy as np
|
| 12 |
import pandas as pd
|
| 13 |
import matplotlib.pyplot as plt
|
| 14 |
import seaborn as sns
|
| 15 |
-
import
|
|
|
|
| 16 |
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
| 17 |
from sentence_transformers import SentenceTransformer, util
|
| 18 |
|
|
@@ -109,158 +369,67 @@ def check_accuracy(reference: str, response: str) -> float:
|
|
| 109 |
|
| 110 |
|
| 111 |
# --------------------------
|
| 112 |
-
#
|
| 113 |
# --------------------------
|
| 114 |
-
def
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
"
|
| 127 |
-
"Hallucination": halluc,
|
| 128 |
-
"AssumptionControl": assum,
|
| 129 |
-
"Coherence": coh,
|
| 130 |
-
"Accuracy": acc,
|
| 131 |
-
"FinalScore": final,
|
| 132 |
}
|
| 133 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 134 |
|
| 135 |
-
# --------------------------
|
| 136 |
-
# VISUALIZATION HELPERS
|
| 137 |
-
# --------------------------
|
| 138 |
-
# def spider_net_multi(labels: List[str], rows: List[Dict], title: str, fill_alpha: float = 0.12):
|
| 139 |
-
# """Radar chart for multiple agents."""
|
| 140 |
-
# N = len(labels)
|
| 141 |
-
# angles = [n / float(N) * 2 * math.pi for n in range(N)]
|
| 142 |
-
# angles += angles[:1]
|
| 143 |
-
|
| 144 |
-
# fig = plt.figure(figsize=(6.5, 6.5))
|
| 145 |
-
# ax = plt.subplot(111, polar=True)
|
| 146 |
-
# ax.set_xticks(angles[:-1])
|
| 147 |
-
# ax.set_xticklabels(labels, fontsize=9)
|
| 148 |
-
# ax.set_ylim(0, 100)
|
| 149 |
-
# ax.set_yticks([0, 25, 50, 75, 100])
|
| 150 |
-
|
| 151 |
-
# for r in rows:
|
| 152 |
-
# values = r["values"]
|
| 153 |
-
# values_closed = values + values[:1]
|
| 154 |
-
# ax.plot(angles, values_closed, linewidth=1.5, label=r["name"])
|
| 155 |
-
# ax.fill(angles, values_closed, alpha=fill_alpha)
|
| 156 |
-
|
| 157 |
-
# ax.set_title(title, y=1.08, fontsize=12)
|
| 158 |
-
# ax.legend(loc="upper right", bbox_to_anchor=(1.25, 1.1))
|
| 159 |
-
# return fig
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
# def heatmap_plot(df: pd.DataFrame, metric_cols: List[str], title: str = "Metric Correlations"):
|
| 163 |
-
# fig, ax = plt.subplots(figsize=(7, 5))
|
| 164 |
-
# sns.heatmap(df[metric_cols].corr(), annot=True, fmt=".2f", cmap="coolwarm", ax=ax)
|
| 165 |
-
# ax.set_title(title)
|
| 166 |
-
# return fig
|
| 167 |
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
# HIGH-LEVEL EVALUATION
|
| 171 |
-
# --------------------------
|
| 172 |
-
def evaluate_dataframe(df: pd.DataFrame) -> Tuple[pd.DataFrame, List[Tuple[str, str]], pd.DataFrame]:
|
| 173 |
-
"""
|
| 174 |
-
df must contain: prompt, response, task, agent, reference
|
| 175 |
-
Returns: metrics_df, [(image_path, caption)], leaderboard_df
|
| 176 |
-
"""
|
| 177 |
-
df = df.rename(columns={c: c.strip() for c in df.columns})
|
| 178 |
-
|
| 179 |
-
rows = []
|
| 180 |
-
for _, r in df.iterrows():
|
| 181 |
-
prompt = r.get("prompt", "")
|
| 182 |
-
response = r.get("response", "")
|
| 183 |
-
reference = r.get("reference", "")
|
| 184 |
-
agent = r.get("agent", "Unknown")
|
| 185 |
-
task = r.get("task", "Unknown")
|
| 186 |
-
|
| 187 |
-
scores = compute_row_scores(prompt, response, reference)
|
| 188 |
-
entry = {
|
| 189 |
-
"Task": str(task).strip(),
|
| 190 |
-
"Agent": str(agent),
|
| 191 |
-
"Prompt": prompt,
|
| 192 |
-
"Response": response,
|
| 193 |
-
"Reference": reference,
|
| 194 |
-
}
|
| 195 |
-
entry.update(scores)
|
| 196 |
-
rows.append(entry)
|
| 197 |
-
|
| 198 |
-
metrics_df = pd.DataFrame(rows)
|
| 199 |
-
|
| 200 |
-
# Visualization artifacts
|
| 201 |
-
images = []
|
| 202 |
-
metric_labels = ["InstructionFollowing", "Hallucination", "AssumptionControl", "Coherence", "Accuracy"]
|
| 203 |
-
|
| 204 |
-
# Per-task radar and bar charts
|
| 205 |
-
for task, g in metrics_df.groupby("Task"):
|
| 206 |
-
series = []
|
| 207 |
-
for a in g["Agent"].unique():
|
| 208 |
-
subset = g[g["Agent"] == a]
|
| 209 |
-
vals = [round(float(subset[m].mean()) * 100, 2) for m in metric_labels]
|
| 210 |
-
series.append({"name": a, "values": vals})
|
| 211 |
-
if series:
|
| 212 |
-
fig = spider_net_multi(metric_labels, series, title=f"{task} β Agent Comparison")
|
| 213 |
-
fname = f"/tmp/{uuid.uuid4().hex}_{task}_radar.png"
|
| 214 |
-
fig.savefig(fname, bbox_inches="tight")
|
| 215 |
-
plt.close(fig)
|
| 216 |
-
images.append((fname, f"{task} - radar"))
|
| 217 |
-
|
| 218 |
-
fig2, ax = plt.subplots(figsize=(8, 4))
|
| 219 |
-
avg = g.groupby("Agent")[metric_labels].mean()
|
| 220 |
-
avg.plot(kind="bar", ax=ax)
|
| 221 |
-
ax.set_title(f"{task} β Average Metrics by Agent")
|
| 222 |
-
ax.set_ylabel("Score (0-1)")
|
| 223 |
-
plt.xticks(rotation=45)
|
| 224 |
-
fname2 = f"/tmp/{uuid.uuid4().hex}_{task}_bar.png"
|
| 225 |
-
fig2.savefig(fname2, bbox_inches="tight")
|
| 226 |
-
plt.close(fig2)
|
| 227 |
-
images.append((fname2, f"{task} - bar"))
|
| 228 |
-
|
| 229 |
-
# Global heatmap
|
| 230 |
-
metric_cols = metric_labels + ["FinalScore"]
|
| 231 |
-
figh = heatmap_plot(metrics_df, metric_cols)
|
| 232 |
-
fnameh = f"/tmp/{uuid.uuid4().hex}_heatmap.png"
|
| 233 |
-
figh.savefig(fnameh, bbox_inches="tight")
|
| 234 |
-
plt.close(figh)
|
| 235 |
-
images.append((fnameh, "Metric Correlations Heatmap"))
|
| 236 |
|
| 237 |
# Leaderboard
|
| 238 |
-
|
| 239 |
-
|
|
|
|
|
|
|
|
|
|
| 240 |
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
#
|
| 247 |
-
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
|
| 251 |
-
|
| 252 |
-
|
| 253 |
-
|
| 254 |
-
|
| 255 |
-
|
| 256 |
-
|
| 257 |
-
|
| 258 |
-
|
| 259 |
-
|
| 260 |
-
|
| 261 |
-
|
| 262 |
-
|
| 263 |
-
|
| 264 |
-
|
| 265 |
-
|
| 266 |
-
print(f"{caption}: {path}")
|
|
|
|
| 1 |
+
# """
|
| 2 |
+
# Evaluation module: loads models, computes metrics, and creates visualizations.
|
| 3 |
+
# Lightweight, CPU-friendly, no Java required.
|
| 4 |
+
# """
|
| 5 |
+
|
| 6 |
+
# import re
|
| 7 |
+
# import math
|
| 8 |
+
# import uuid
|
| 9 |
+
# from typing import List, Dict, Tuple
|
| 10 |
+
|
| 11 |
+
# import numpy as np
|
| 12 |
+
# import pandas as pd
|
| 13 |
+
# import matplotlib.pyplot as plt
|
| 14 |
+
# import seaborn as sns
|
| 15 |
+
# import torch
|
| 16 |
+
# from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
| 17 |
+
# from sentence_transformers import SentenceTransformer, util
|
| 18 |
+
|
| 19 |
+
# # --------------------------
|
| 20 |
+
# # MODEL LOADING
|
| 21 |
+
# # --------------------------
|
| 22 |
+
# NLI_MODEL = "textattack/roberta-base-MNLI"
|
| 23 |
+
# EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
|
| 24 |
+
|
| 25 |
+
# # Load NLI model & tokenizer
|
| 26 |
+
# nli_tokenizer = AutoTokenizer.from_pretrained(NLI_MODEL)
|
| 27 |
+
# nli_model = AutoModelForSequenceClassification.from_pretrained(NLI_MODEL)
|
| 28 |
+
# nli_model.to("cpu")
|
| 29 |
+
# nli_model.eval()
|
| 30 |
+
|
| 31 |
+
# # Load embedding model
|
| 32 |
+
# embed_model = SentenceTransformer(EMBED_MODEL)
|
| 33 |
+
|
| 34 |
+
# # Label mapping from config
|
| 35 |
+
# id2label = {int(k): v.upper() for k, v in nli_model.config.id2label.items()}
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
# # --------------------------
|
| 39 |
+
# # METRIC FUNCTIONS
|
| 40 |
+
# # --------------------------
|
| 41 |
+
# def check_instruction_following(prompt: str, response: str) -> float:
|
| 42 |
+
# """Embedding-based similarity between prompt and response."""
|
| 43 |
+
# if not prompt or not response:
|
| 44 |
+
# return 0.0
|
| 45 |
+
# p_emb = embed_model.encode(prompt, convert_to_tensor=True)
|
| 46 |
+
# r_emb = embed_model.encode(response, convert_to_tensor=True)
|
| 47 |
+
# sim = float(util.cos_sim(p_emb, r_emb).item())
|
| 48 |
+
# return round(max(0.0, min(1.0, sim)), 3)
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
# def check_hallucination(reference: str, response: str) -> float:
|
| 52 |
+
# """
|
| 53 |
+
# Single hallucination score:
|
| 54 |
+
# Entailment prob - Contradiction prob (normalized to [0,1]).
|
| 55 |
+
# Higher = less hallucination.
|
| 56 |
+
# """
|
| 57 |
+
# if not reference or not response:
|
| 58 |
+
# return 0.0
|
| 59 |
+
# with torch.no_grad():
|
| 60 |
+
# inputs = nli_tokenizer.encode_plus(reference, response, return_tensors="pt", truncation=True)
|
| 61 |
+
# outputs = nli_model(**inputs)
|
| 62 |
+
# probs = torch.softmax(outputs.logits, dim=-1).cpu().numpy()[0]
|
| 63 |
+
|
| 64 |
+
# entail_prob, contra_prob = 0.0, 0.0
|
| 65 |
+
# for idx, p in enumerate(probs):
|
| 66 |
+
# label = id2label.get(idx, "")
|
| 67 |
+
# if "ENTAIL" in label:
|
| 68 |
+
# entail_prob = float(p)
|
| 69 |
+
# elif "CONTRA" in label:
|
| 70 |
+
# contra_prob = float(p)
|
| 71 |
+
|
| 72 |
+
# score = entail_prob - contra_prob
|
| 73 |
+
# score = (score + 1) / 2 # normalize [-1,1] → [0,1]
|
| 74 |
+
# return round(max(0.0, min(1.0, score)), 3)
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
# def check_assumption(response: str) -> float:
|
| 78 |
+
# """Detect speculative/hedging terms."""
|
| 79 |
+
# if not response:
|
| 80 |
+
# return 0.0
|
| 81 |
+
# speculative_terms = ["maybe", "probably", "might", "perhaps", "i guess", "seems", "could"]
|
| 82 |
+
# count = sum(1 for t in speculative_terms if t in response.lower())
|
| 83 |
+
# score = 1.0 - min(count / 5.0, 1.0) # smoother decay
|
| 84 |
+
# return round(score, 3)
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
# def check_coherence(response: str) -> float:
|
| 88 |
+
# """Heuristic coherence metric: penalizes very short/long, rewards sentence balance."""
|
| 89 |
+
# if not response:
|
| 90 |
+
# return 0.0
|
| 91 |
+
# words = len(re.findall(r"\w+", response))
|
| 92 |
+
# sents = max(1, len(re.split(r"[.!?]+", response)) - 1)
|
| 93 |
+
# if words < 5:
|
| 94 |
+
# return 0.3
|
| 95 |
+
# if words > 200:
|
| 96 |
+
# return 0.5
|
| 97 |
+
# base = min(1.0, (words / 50.0) + (sents / 5.0))
|
| 98 |
+
# return round(max(0.4, min(base, 0.95)), 3)
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
# def check_accuracy(reference: str, response: str) -> float:
|
| 102 |
+
# """Semantic similarity between reference and response via embeddings (cosine)."""
|
| 103 |
+
# if not reference or not response:
|
| 104 |
+
# return 0.0
|
| 105 |
+
# ref_emb = embed_model.encode(reference, convert_to_tensor=True)
|
| 106 |
+
# resp_emb = embed_model.encode(response, convert_to_tensor=True)
|
| 107 |
+
# sim = float(util.cos_sim(ref_emb, resp_emb).item())
|
| 108 |
+
# return round(max(0.0, min(1.0, sim)), 3)
|
| 109 |
+
|
| 110 |
+
|
| 111 |
+
# # --------------------------
|
| 112 |
+
# # SCORING PIPELINE
|
| 113 |
+
# # --------------------------
|
| 114 |
+
# def compute_row_scores(prompt, response, reference) -> Dict:
|
| 115 |
+
# instr = check_instruction_following(prompt, response)
|
| 116 |
+
# halluc = check_hallucination(reference, response)
|
| 117 |
+
# assum = check_assumption(response)
|
| 118 |
+
# coh = check_coherence(response)
|
| 119 |
+
# acc = check_accuracy(reference, response)
|
| 120 |
+
|
| 121 |
+
# # Final score: average
|
| 122 |
+
# components = [instr, halluc, assum, coh, acc]
|
| 123 |
+
# final = round(float(sum(components) / len(components)), 3)
|
| 124 |
+
|
| 125 |
+
# return {
|
| 126 |
+
# "InstructionFollowing": instr,
|
| 127 |
+
# "Hallucination": halluc,
|
| 128 |
+
# "AssumptionControl": assum,
|
| 129 |
+
# "Coherence": coh,
|
| 130 |
+
# "Accuracy": acc,
|
| 131 |
+
# "FinalScore": final,
|
| 132 |
+
# }
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
# # --------------------------
|
| 136 |
+
# # VISUALIZATION HELPERS
|
| 137 |
+
# # --------------------------
|
| 138 |
+
# # def spider_net_multi(labels: List[str], rows: List[Dict], title: str, fill_alpha: float = 0.12):
|
| 139 |
+
# # """Radar chart for multiple agents."""
|
| 140 |
+
# # N = len(labels)
|
| 141 |
+
# # angles = [n / float(N) * 2 * math.pi for n in range(N)]
|
| 142 |
+
# # angles += angles[:1]
|
| 143 |
+
|
| 144 |
+
# # fig = plt.figure(figsize=(6.5, 6.5))
|
| 145 |
+
# # ax = plt.subplot(111, polar=True)
|
| 146 |
+
# # ax.set_xticks(angles[:-1])
|
| 147 |
+
# # ax.set_xticklabels(labels, fontsize=9)
|
| 148 |
+
# # ax.set_ylim(0, 100)
|
| 149 |
+
# # ax.set_yticks([0, 25, 50, 75, 100])
|
| 150 |
+
|
| 151 |
+
# # for r in rows:
|
| 152 |
+
# # values = r["values"]
|
| 153 |
+
# # values_closed = values + values[:1]
|
| 154 |
+
# # ax.plot(angles, values_closed, linewidth=1.5, label=r["name"])
|
| 155 |
+
# # ax.fill(angles, values_closed, alpha=fill_alpha)
|
| 156 |
+
|
| 157 |
+
# # ax.set_title(title, y=1.08, fontsize=12)
|
| 158 |
+
# # ax.legend(loc="upper right", bbox_to_anchor=(1.25, 1.1))
|
| 159 |
+
# # return fig
|
| 160 |
+
|
| 161 |
+
|
| 162 |
+
# # def heatmap_plot(df: pd.DataFrame, metric_cols: List[str], title: str = "Metric Correlations"):
|
| 163 |
+
# # fig, ax = plt.subplots(figsize=(7, 5))
|
| 164 |
+
# # sns.heatmap(df[metric_cols].corr(), annot=True, fmt=".2f", cmap="coolwarm", ax=ax)
|
| 165 |
+
# # ax.set_title(title)
|
| 166 |
+
# # return fig
|
| 167 |
+
|
| 168 |
+
|
| 169 |
+
# # --------------------------
|
| 170 |
+
# # HIGH-LEVEL EVALUATION
|
| 171 |
+
# # --------------------------
|
| 172 |
+
# def evaluate_dataframe(df: pd.DataFrame) -> Tuple[pd.DataFrame, List[Tuple[str, str]], pd.DataFrame]:
|
| 173 |
+
# """
|
| 174 |
+
# df must contain: prompt, response, task, agent, reference
|
| 175 |
+
# Returns: metrics_df, [(image_path, caption)], leaderboard_df
|
| 176 |
+
# """
|
| 177 |
+
# df = df.rename(columns={c: c.strip() for c in df.columns})
|
| 178 |
+
|
| 179 |
+
# rows = []
|
| 180 |
+
# for _, r in df.iterrows():
|
| 181 |
+
# prompt = r.get("prompt", "")
|
| 182 |
+
# response = r.get("response", "")
|
| 183 |
+
# reference = r.get("reference", "")
|
| 184 |
+
# agent = r.get("agent", "Unknown")
|
| 185 |
+
# task = r.get("task", "Unknown")
|
| 186 |
+
|
| 187 |
+
# scores = compute_row_scores(prompt, response, reference)
|
| 188 |
+
# entry = {
|
| 189 |
+
# "Task": str(task).strip(),
|
| 190 |
+
# "Agent": str(agent),
|
| 191 |
+
# "Prompt": prompt,
|
| 192 |
+
# "Response": response,
|
| 193 |
+
# "Reference": reference,
|
| 194 |
+
# }
|
| 195 |
+
# entry.update(scores)
|
| 196 |
+
# rows.append(entry)
|
| 197 |
+
|
| 198 |
+
# metrics_df = pd.DataFrame(rows)
|
| 199 |
+
|
| 200 |
+
# # Visualization artifacts
|
| 201 |
+
# images = []
|
| 202 |
+
# metric_labels = ["InstructionFollowing", "Hallucination", "AssumptionControl", "Coherence", "Accuracy"]
|
| 203 |
+
|
| 204 |
+
# # Per-task radar and bar charts
|
| 205 |
+
# for task, g in metrics_df.groupby("Task"):
|
| 206 |
+
# series = []
|
| 207 |
+
# for a in g["Agent"].unique():
|
| 208 |
+
# subset = g[g["Agent"] == a]
|
| 209 |
+
# vals = [round(float(subset[m].mean()) * 100, 2) for m in metric_labels]
|
| 210 |
+
# series.append({"name": a, "values": vals})
|
| 211 |
+
# if series:
|
| 212 |
+
# fig = spider_net_multi(metric_labels, series, title=f"{task} β Agent Comparison")
|
| 213 |
+
# fname = f"/tmp/{uuid.uuid4().hex}_{task}_radar.png"
|
| 214 |
+
# fig.savefig(fname, bbox_inches="tight")
|
| 215 |
+
# plt.close(fig)
|
| 216 |
+
# images.append((fname, f"{task} - radar"))
|
| 217 |
+
|
| 218 |
+
# fig2, ax = plt.subplots(figsize=(8, 4))
|
| 219 |
+
# avg = g.groupby("Agent")[metric_labels].mean()
|
| 220 |
+
# avg.plot(kind="bar", ax=ax)
|
| 221 |
+
# ax.set_title(f"{task} β Average Metrics by Agent")
|
| 222 |
+
# ax.set_ylabel("Score (0-1)")
|
| 223 |
+
# plt.xticks(rotation=45)
|
| 224 |
+
# fname2 = f"/tmp/{uuid.uuid4().hex}_{task}_bar.png"
|
| 225 |
+
# fig2.savefig(fname2, bbox_inches="tight")
|
| 226 |
+
# plt.close(fig2)
|
| 227 |
+
# images.append((fname2, f"{task} - bar"))
|
| 228 |
+
|
| 229 |
+
# # Global heatmap
|
| 230 |
+
# metric_cols = metric_labels + ["FinalScore"]
|
| 231 |
+
# figh = heatmap_plot(metrics_df, metric_cols)
|
| 232 |
+
# fnameh = f"/tmp/{uuid.uuid4().hex}_heatmap.png"
|
| 233 |
+
# figh.savefig(fnameh, bbox_inches="tight")
|
| 234 |
+
# plt.close(figh)
|
| 235 |
+
# images.append((fnameh, "Metric Correlations Heatmap"))
|
| 236 |
+
|
| 237 |
+
# # Leaderboard
|
| 238 |
+
# lb = metrics_df.groupby(["Agent", "Task"])["FinalScore"].mean().reset_index()
|
| 239 |
+
# lb = lb.sort_values(["FinalScore"], ascending=False)
|
| 240 |
+
|
| 241 |
+
# return metrics_df, images, lb
|
| 242 |
+
|
| 243 |
+
|
| 244 |
+
# # --------------------------
|
| 245 |
+
# # DEMO USAGE
|
| 246 |
+
# # --------------------------
|
| 247 |
+
# if __name__ == "__main__":
|
| 248 |
+
# # Sample dataset
|
| 249 |
+
# data = [
|
| 250 |
+
# {"task": "Math QA", "agent": "AgentA", "prompt": "What is 2+2?", "response": "The answer is 4.", "reference": "2+2=4"},
|
| 251 |
+
# {"task": "Math QA", "agent": "AgentB", "prompt": "What is 2+2?", "response": "It might be 5, but usually 4.", "reference": "2+2=4"},
|
| 252 |
+
# {"task": "Summarization", "agent": "AgentA", "prompt": "Summarize: 'The cat sat on the mat. The dog barked.'", "response": "A cat sat while a dog barked.", "reference": "Cat on mat, dog barking."},
|
| 253 |
+
# ]
|
| 254 |
+
# df = pd.DataFrame(data)
|
| 255 |
+
|
| 256 |
+
# metrics_df, images, leaderboard = evaluate_dataframe(df)
|
| 257 |
+
|
| 258 |
+
# print("\n=== Metrics per response ===")
|
| 259 |
+
# print(metrics_df[["Task", "Agent", "InstructionFollowing", "Hallucination", "AssumptionControl", "Coherence", "Accuracy", "FinalScore"]])
|
| 260 |
+
|
| 261 |
+
# print("\n=== Leaderboard (average per task & agent) ===")
|
| 262 |
+
# print(leaderboard)
|
| 263 |
+
|
| 264 |
+
# print("\nVisualization files saved in /tmp/:")
|
| 265 |
+
# for path, caption in images:
|
| 266 |
+
# print(f"{caption}: {path}")
|
| 267 |
|
| 268 |
import re
|
| 269 |
+
import json
|
| 270 |
+
import torch
|
|
|
|
|
|
|
|
|
|
| 271 |
import pandas as pd
|
| 272 |
import matplotlib.pyplot as plt
|
| 273 |
import seaborn as sns
|
| 274 |
+
import os
|
| 275 |
+
import uuid
|
| 276 |
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
| 277 |
from sentence_transformers import SentenceTransformer, util
|
| 278 |
|
|
|
|
| 369 |
|
| 370 |
|
| 371 |
# --------------------------
|
| 372 |
+
# ROW & DF EVALUATION
|
| 373 |
# --------------------------
|
| 374 |
+
def _clean_text(value):
    """Return ``value`` unchanged unless it is a missing marker, then ``""``.

    ``None`` and float NaN are treated as missing. NaN is truthy, so without
    this normalisation a missing cell would slip past ``if not text`` style
    guards and be handed to the scoring functions as if it were real text.
    """
    # NaN is the only float that compares unequal to itself.
    if value is None or (isinstance(value, float) and value != value):
        return ""
    return value


def evaluate_row(row):
    """Score a single record and return its metrics as a flat dict.

    Args:
        row: Any object exposing ``.get(key, default)`` (a dict, or the
            Series passed by ``DataFrame.apply(..., axis=1)``). Keys read:
            ``prompt``, ``response``, ``reference``, ``task_id``, ``agent``.
            Absent keys default to ``""``.

    Returns:
        dict with keys ``task_id``, ``agent``, ``instruction_following``,
        ``hallucination``, ``assumption``, ``coherence``, ``accuracy`` and
        ``final_score``. ``final_score`` is the weighted sum of the five
        metric values, rounded to 3 decimals.
    """
    # Missing text cells (None / NaN) are scored as empty strings.
    prompt = _clean_text(row.get("prompt", ""))
    response = _clean_text(row.get("response", ""))
    reference = _clean_text(row.get("reference", ""))

    # NOTE(review): the check_* helpers are defined elsewhere in this module;
    # they presumably return floats in [0, 1] -- confirm against their bodies.
    metrics = {
        "task_id": row.get("task_id", ""),
        "agent": row.get("agent", ""),
        "instruction_following": check_instruction_following(prompt, response),
        "hallucination": check_hallucination(reference, response),
        "assumption": check_assumption(response),
        "coherence": check_coherence(response),
        "accuracy": check_accuracy(reference, response),
    }

    # Weighted avg score (you can adjust weights); the weights sum to 1.0.
    metrics["final_score"] = round(
        0.25 * metrics["instruction_following"]
        + 0.25 * metrics["accuracy"]
        + 0.2 * metrics["hallucination"]
        + 0.15 * metrics["coherence"]
        + 0.15 * metrics["assumption"],
        3,
    )
    return metrics
|
| 399 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 400 |
|
| 401 |
+
def evaluate_dataframe(df: pd.DataFrame):
    """Score every row of ``df`` and produce summary artifacts.

    Args:
        df: One record per row; each row is passed to ``evaluate_row``.

    Returns:
        A 3-tuple ``(metrics_df, images, leaderboard)``:

        * ``metrics_df`` -- one row per input row, columns taken from the
          dict returned by ``evaluate_row``.
        * ``images`` -- list of ``(file_path, caption)`` tuples for the PNG
          plots written under ``/tmp/plots``.
        * ``leaderboard`` -- mean ``final_score`` per ``(agent, task_id)``.

        For an empty ``df`` no plots are written: ``images`` is ``[]`` and
        both frames are empty but carry their usual column names.
    """
    # An empty frame makes ``apply(..., result_type="expand")`` return a frame
    # without the metric columns, so the groupby below would raise KeyError.
    # Return well-formed empty results instead.
    if df.empty:
        metric_columns = [
            "task_id",
            "agent",
            "instruction_following",
            "hallucination",
            "assumption",
            "coherence",
            "accuracy",
            "final_score",
        ]
        empty_metrics = pd.DataFrame(columns=metric_columns)
        empty_leaderboard = pd.DataFrame(
            columns=["agent", "task_id", "final_score"]
        )
        return empty_metrics, [], empty_leaderboard

    # Each returned dict is expanded into one column per key.
    metrics_df = df.apply(evaluate_row, axis=1, result_type="expand")

    # Leaderboard
    leaderboard = (
        metrics_df.groupby(["agent", "task_id"])["final_score"]
        .mean()
        .reset_index()
    )

    # Plots
    images = []
    out_dir = "/tmp/plots"
    os.makedirs(out_dir, exist_ok=True)

    # Histogram of scores
    plt.figure(figsize=(6, 4))
    sns.histplot(metrics_df["final_score"], bins=10, kde=False)
    plt.title("Distribution of Final Scores")
    # A random hex suffix keeps repeated runs from overwriting each other.
    hist_path = os.path.join(out_dir, f"hist_{uuid.uuid4().hex}.png")
    plt.savefig(hist_path)
    plt.close()
    images.append((hist_path, "Final Score Distribution"))

    # Per-agent average
    plt.figure(figsize=(6, 4))
    agent_scores = metrics_df.groupby("agent")["final_score"].mean().reset_index()
    sns.barplot(data=agent_scores, x="agent", y="final_score")
    plt.title("Average Final Score per Agent")
    bar_path = os.path.join(out_dir, f"bar_{uuid.uuid4().hex}.png")
    plt.savefig(bar_path)
    plt.close()
    images.append((bar_path, "Average Score per Agent"))

    return metrics_df, images, leaderboard
|
|
|