Update evaluator.py (#16)
- Update evaluator.py (260dd1f6f4d58ed1d98270526903191fc129a3a4)
Co-authored-by: Manay Porwal <manayporwal07@users.noreply.huggingface.co>
- evaluator.py +210 -211
evaluator.py CHANGED

@@ -204,258 +204,257 @@
Old side (removed):

 #
 ###############################################################################################################################

 """
-Evaluation logic for Agentic Evaluation Framework.
 """

-import
 import numpy as np
 import pandas as pd
-import torch
 import matplotlib.pyplot as plt
-
-from transformers import (
-    AutoTokenizer,
-    AutoModelForSequenceClassification,
-    AutoModelForCausalLM,
-    pipeline,
-)
-from sentence_transformers import SentenceTransformer
-import evaluate

 # -----------------------------
-#
 # -----------------------------
-NLI_MODEL = "
-EMBED_MODEL = "all-MiniLM-L6-v2"
-LLM_JUDGE_MODEL = "microsoft/DialoGPT-small"
-FLUENCY_MODEL = "textattack/roberta-base-CoLA"
-
-device = 0 if torch.cuda.is_available() else -1

-
-_nli_model = None
 _embed_model = None
-
-_fluency_checker = None
-
-# Metrics
-bertscore = evaluate.load("bertscore")
-bleu = evaluate.load("bleu")
-rouge = evaluate.load("rouge")
-

-
-
-
-def get_nli_model():
-    global _nli_model, _nli_tokenizer
     if _nli_model is None:
         _nli_tokenizer = AutoTokenizer.from_pretrained(NLI_MODEL)
-        _nli_model = AutoModelForSequenceClassification.from_pretrained(NLI_MODEL)
-
-        )
         _nli_model.eval()
-    return _nli_model, _nli_tokenizer
-

 def get_embed_model():
-    global _embed_model
-    if _embed_model is None:
-        _embed_model = SentenceTransformer(EMBED_MODEL, device="cuda" if torch.cuda.is_available() else "cpu")
     return _embed_model

-
-
-
-    if _judge_model is None:
-        _judge_tokenizer = AutoTokenizer.from_pretrained(LLM_JUDGE_MODEL)
-        _judge_model = AutoModelForCausalLM.from_pretrained(LLM_JUDGE_MODEL).to(
-            torch.device("cuda" if torch.cuda.is_available() else "cpu")
-        )
-    return _judge_model, _judge_tokenizer
-
-
-def get_fluency_checker():
-    global _fluency_checker
-    if _fluency_checker is None:
-        _fluency_checker = pipeline(
-            "text-classification", model=FLUENCY_MODEL, device=device
-        )
-    return _fluency_checker
-

 # -----------------------------
-#
 # -----------------------------
-def check_instruction_following(prompt, response):
-    try:
-        nli_model, nli_tokenizer = get_nli_model()
-        inputs = nli_tokenizer(prompt, response, return_tensors="pt", truncation=True, padding=True).to(
-            nli_model.device
-        )
-        with torch.no_grad():
-            logits = nli_model(**inputs).logits
-        probs = torch.softmax(logits, dim=-1).cpu().numpy()[0]
-        entailment_score = probs[2]  # entailment index
-        return float(entailment_score)
-    except Exception:
-        return 0.0

-
-def check_hallucination(reference, response):
-    try:
-        nli_model, nli_tokenizer = get_nli_model()
-        inputs = nli_tokenizer(reference, response, return_tensors="pt", truncation=True, padding=True).to(
-            nli_model.device
-        )
-        with torch.no_grad():
-            logits = nli_model(**inputs).logits
-        probs = torch.softmax(logits, dim=-1).cpu().numpy()[0]
-        contradiction_score = probs[0]  # contradiction index
-        return 1.0 - float(contradiction_score)
-    except Exception:
         return 0.0
-
-
-def check_assumption(prompt, response):
-    try:
-
-
-
-
-        judgment = judge_tokenizer.decode(outputs[0], skip_special_tokens=True).lower()
-        if "yes" in judgment:
-            return 0.0
-        elif "no" in judgment:
-            return 1.0
-        return 0.5
-    except Exception:
-        return 0.5
-
-
-def check_coherence(response):
-    try:
-        emb = get_embed_model().encode(response, convert_to_tensor=True, normalize_embeddings=True)
-        coherence = float(torch.mean(emb).cpu().item())
-        return coherence
-    except Exception:
         return 0.0
-
-
-def check_accuracy(reference, response):
-
-
-
-
-
-
-
-
-
-
-        bleu_score = 0.0
-
-    try:
-        rouge_results = rouge.compute(predictions=[response], references=[reference])
-        rouge_l = rouge_results["rougeL"]
-    except Exception:
-        rouge_l = 0.0
-
-    return float((bert_f1 + bleu_score + rouge_l) / 3)
-
-
-def check_relevance(prompt, response):
-    try:
-        model = get_embed_model()
-        emb1 = model.encode(prompt, convert_to_tensor=True)
-        emb2 = model.encode(response, convert_to_tensor=True)
-        cos_sim = torch.nn.functional.cosine_similarity(emb1, emb2, dim=0)
-        return float(cos_sim.item())
-    except Exception:
         return 0.0
-
-
-def check_fluency(response):
-
-
-
-
-
-        return 0.
-

 # -----------------------------
-#
 # -----------------------------
-def evaluate_row(row):
-    scores = {
-        "instruction_following": check_instruction_following(row["prompt"], row["response"]),
-        "hallucination": check_hallucination(row["reference"], row["response"]),
-        "assumption": check_assumption(row["prompt"], row["response"]),
-        "coherence": check_coherence(row["response"]),
-        "accuracy": check_accuracy(row["reference"], row["response"]),
-        "relevance": check_relevance(row["prompt"], row["response"]),
-        "fluency": check_fluency(row["response"]),
-    }
-    scores["final_score"] = np.mean(list(scores.values()))
-    return pd.Series(scores)


 # -----------------------------
-#
 # -----------------------------
-def plot_radar_chart(metrics_df, out_path="/tmp/radar.png"):
-    import seaborn as sns
-
-    mean_scores = metrics_df.mean(numeric_only=True).drop("final_score", errors="ignore")
-    categories = list(mean_scores.index)
-    values = mean_scores.values.tolist()

-
-

-
-
-
-
-
-
-
-
-
-
-    return out_path, "Radar Chart (Mean Scores)"
-
-
-def plot_leaderboard(metrics_df, out_path="/tmp/leaderboard.png"):
-    agent_means = metrics_df.groupby("agent")["final_score"].mean().sort_values(ascending=False)
-    plt.figure(figsize=(10, 5))
-    agent_means.plot(kind="bar", colormap="Set3", ax=plt.gca())
-    plt.title("Leaderboard: Avg Final Score per Agent")
-    plt.ylabel("Score")
-    plt.tight_layout()
-    plt.savefig(out_path)
-    plt.close()
-    return out_path, "Leaderboard"


-#
-
-# -----------------------------
-def evaluate_dataframe(df: pd.DataFrame):
-    metrics_df = df.apply(evaluate_row, axis=1, result_type="expand")
-    metrics_df = pd.concat([df, metrics_df], axis=1)
-
     leaderboard = (
-        metrics_df.groupby("agent"
         .mean()
         .reset_index()
-        .sort_values("final_score", ascending=False)
     )

     images = []
-    images.append(plot_radar_chart(metrics_df))
-    images.append(plot_leaderboard(metrics_df))

-
New side (added):

 #
 ###############################################################################################################################

+# evaluator.py
 """
+Upgraded evaluation logic for the Agentic Evaluation Framework.
+Provides scoring functions, visualization generation, and summary outputs.
 """

+import math
+import uuid
+from typing import List, Dict, Tuple
 import numpy as np
 import pandas as pd
 import matplotlib.pyplot as plt
+import seaborn as sns

 # -----------------------------
+# Lazy model loading
 # -----------------------------
+NLI_MODEL = "textattack/roberta-base-MNLI"
+EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"

+_nli_tokenizer = None
+_nli_model = None
 _embed_model = None
+_id2label = None

+def ensure_models_loaded():
+    global _nli_tokenizer, _nli_model, _embed_model, _id2label
+    if _embed_model is None:
+        from sentence_transformers import SentenceTransformer, util
+        _embed_model = SentenceTransformer(EMBED_MODEL)
+        globals()["util"] = util
     if _nli_model is None:
+        from transformers import AutoTokenizer, AutoModelForSequenceClassification
         _nli_tokenizer = AutoTokenizer.from_pretrained(NLI_MODEL)
+        _nli_model = AutoModelForSequenceClassification.from_pretrained(NLI_MODEL)
+        _nli_model.to("cpu")
         _nli_model.eval()
+        _id2label = {int(k): v.upper() for k, v in _nli_model.config.id2label.items()}

 def get_embed_model():
+    ensure_models_loaded()
     return _embed_model

+def get_nli_tokenizer_and_model():
+    ensure_models_loaded()
+    return _nli_tokenizer, _nli_model, _id2label
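
A quick orientation on the block above: all heavy dependencies are now imported lazily inside ensure_models_loaded(), so importing the module stays cheap and the first metric call pays the model-loading cost. A minimal smoke-test sketch (not part of the commit; it assumes this file is importable as evaluator):

    # Hypothetical usage sketch: force the lazy load once and inspect what comes back.
    from evaluator import get_embed_model, get_nli_tokenizer_and_model

    tokenizer, model, id2label = get_nli_tokenizer_and_model()
    print(id2label)  # MNLI-style heads typically expose CONTRADICTION / NEUTRAL / ENTAILMENT

    embedder = get_embed_model()
    print(embedder.encode("hello world").shape)  # (384,) for all-MiniLM-L6-v2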

 # -----------------------------
+# Metric functions
 # -----------------------------

+def check_instruction_following(prompt: str, response: str) -> float:
+    if not prompt or not response:
         return 0.0
+    embed_model = get_embed_model()
+    p_emb = embed_model.encode(prompt, convert_to_tensor=True)
+    r_emb = embed_model.encode(response, convert_to_tensor=True)
+    sim = float(util.cos_sim(p_emb, r_emb).item())
+    return round(max(0.0, min(1.0, sim)), 3)
+
+def check_hallucination(prompt: str, response: str) -> float:
+    if not prompt or not response:
         return 0.0
+    tokenizer, model, id2label = get_nli_tokenizer_and_model()
+    inputs = tokenizer.encode_plus(prompt, response, return_tensors="pt", truncation=True)
+    outputs = model(**inputs)
+    probs = outputs.logits.softmax(dim=1).detach().cpu().numpy()[0]
+    labels = [id2label[i] for i in range(len(probs))]
+    if "ENTAILMENT" in labels:
+        entailment_prob = float(probs[labels.index("ENTAILMENT")])
+    else:
+        entailment_prob = float(probs.max())
+    return round(entailment_prob, 3)
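
Note the score semantics: check_hallucination returns the NLI entailment probability of the response given the prompt, so higher means better grounded (it is not a hallucination rate). An illustrative sketch of the expected ordering (the values are made up, not real model output):

    grounded = check_hallucination(
        "Paris is the capital of France.",
        "The capital of France is Paris.",
    )
    ungrounded = check_hallucination(
        "Paris is the capital of France.",
        "The capital of France is Berlin.",
    )
    # A well-behaved MNLI head should rank these: grounded > ungrounded,
    # e.g. roughly 0.9 vs 0.1; both scores are clipped floats in [0, 1].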
+
+def check_accuracy(reference: str, response: str) -> float:
+    if not reference or not response:
         return 0.0
+    embed_model = get_embed_model()
+    ref_emb = embed_model.encode(reference, convert_to_tensor=True)
+    r_emb = embed_model.encode(response, convert_to_tensor=True)
+    sim = float(util.cos_sim(ref_emb, r_emb).item())
+    return round(max(0.0, min(1.0, sim)), 3)
+
+def check_coherence(response: str) -> float:
+    if not response or not isinstance(response, str):
+        return 0.0
+    sentences = [s.strip() for s in response.split(".") if s.strip()]
+    if not sentences:
+        return 0.0
+    lengths = [len(s.split()) for s in sentences]
+    avg_len = np.mean(lengths)
+    std = np.std(lengths)
+    score = max(0.0, min(1.0, 1.0 - (std / (avg_len + 1e-6))))
+    return round(score, 3)
+
+def check_fluency(response: str) -> float:
+    if not response or not isinstance(response, str):
+        return 0.0
+    letters = sum(ch.isalpha() for ch in response)
+    total = len(response)
+    return round(max(0.0, min(1.0, letters / max(1, total))), 3)
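
check_coherence and check_fluency are deliberately model-free heuristics: coherence rewards uniform sentence lengths, fluency the fraction of alphabetic characters. They can be traced by hand; sentence lengths of 5, 5, and 5 words give std = 0 and hence 1.0, while lengths of 2 and 10 give mean 6 and std 4, hence 1 - 4/6 ≈ 0.333:

    import numpy as np

    lengths = [2, 10]  # words per sentence, as check_coherence would count them
    avg_len, std = np.mean(lengths), np.std(lengths)
    score = max(0.0, min(1.0, 1.0 - std / (avg_len + 1e-6)))
    print(round(score, 3))  # 0.333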

 # -----------------------------
+# Visualization helpers
 # -----------------------------

+def spider_net_multi(labels: List[str], rows: List[Dict], title: str, fill_alpha: float = 0.12):
+    N = len(labels)
+    angles = [n / float(N) * 2 * math.pi for n in range(N)]
+    angles += angles[:1]
+    fig = plt.figure(figsize=(6.5, 6.5))
+    ax = plt.subplot(111, polar=True)
+    ax.set_xticks(angles[:-1])
+    ax.set_xticklabels(labels, fontsize=9)
+    ax.set_ylim(0, 1)
+    ax.set_yticks([0, 0.25, 0.5, 0.75, 1.0])
+    for r in rows:
+        values = r["values"]
+        values_closed = values + values[:1]
+        ax.plot(angles, values_closed, linewidth=1.5, label=r["name"])
+        ax.fill(angles, values_closed, alpha=fill_alpha)
+    ax.set_title(title, y=1.08, fontsize=12)
+    ax.legend(loc="upper right", bbox_to_anchor=(1.25, 1.1))
+    return fig
+
+def heatmap_plot(df: pd.DataFrame, metric_cols: List[str], title: str = "Metric Correlations"):
+    fig, ax = plt.subplots(figsize=(7, 5))
+    sns.heatmap(df[metric_cols].corr(), annot=True, fmt=".2f", cmap="coolwarm", ax=ax)
+    ax.set_title(title)
+    return fig
+
+def task_agent_heatmap(leaderboard: pd.DataFrame, metric: str):
+    pivot = leaderboard.pivot(index="task", columns="agent", values=metric)
+    fig, ax = plt.subplots(figsize=(7, 5))
+    sns.heatmap(pivot, annot=True, fmt=".2f", cmap="YlGnBu", ax=ax)
+    ax.set_title(f"Task-Agent Performance ({metric})")
+    return fig
+
+def leaderboard_barplot(leaderboard: pd.DataFrame, metric_cols: List[str]):
+    melted = leaderboard.melt(id_vars=["agent"], value_vars=metric_cols, var_name="metric", value_name="score")
+    fig, ax = plt.subplots(figsize=(8, 5))
+    sns.barplot(x="metric", y="score", hue="agent", data=melted, ax=ax)
+    ax.set_title("Leaderboard Bar Chart")
+    ax.set_ylim(0, 1)
+    return fig
+
+def distribution_plot(metrics_df: pd.DataFrame, metric: str):
+    fig, ax = plt.subplots(figsize=(7, 5))
+    sns.boxplot(x="agent", y=metric, data=metrics_df, ax=ax)
+    sns.stripplot(x="agent", y=metric, data=metrics_df, ax=ax, color="black", alpha=0.4, jitter=True)
+    ax.set_title(f"Distribution of {metric} Scores per Agent")
+    ax.set_ylim(0, 1)
+    return fig
+
+def scatter_two_metrics(metrics_df: pd.DataFrame, metric_x: str, metric_y: str):
+    fig, ax = plt.subplots(figsize=(6, 6))
+    sns.scatterplot(x=metric_x, y=metric_y, hue="agent", data=metrics_df, ax=ax, alpha=0.7)
+    ax.set_title(f"{metric_x} vs {metric_y}")
+    ax.set_xlim(0, 1)
+    ax.set_ylim(0, 1)
+    return fig
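
All of these helpers return a matplotlib Figure and leave saving to the caller. spider_net_multi expects one dict per series, with a "name" and a "values" list aligned to labels; a minimal sketch with hypothetical agent names (again assuming the module is importable as evaluator):

    from evaluator import spider_net_multi

    labels = ["instruction_following", "hallucination", "accuracy", "coherence", "fluency"]
    rows = [
        {"name": "agent_a", "values": [0.80, 0.70, 0.90, 0.60, 0.95]},
        {"name": "agent_b", "values": [0.60, 0.80, 0.70, 0.70, 0.90]},
    ]
    fig = spider_net_multi(labels, rows, "Agent Performance Radar")
    fig.savefig("/tmp/radar_demo.png", bbox_inches="tight")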

 # -----------------------------
+# Main evaluation entrypoint
 # -----------------------------

+def evaluate_dataframe(df: pd.DataFrame) -> Tuple[pd.DataFrame, List[Tuple[str, str]], pd.DataFrame]:
+    df = df.copy()

+    # compute scores per row
+    scores = []
+    for _, row in df.iterrows():
+        s = {}
+        s["instruction_following"] = check_instruction_following(str(row.get("prompt", "")), str(row.get("response", "")))
+        s["hallucination"] = check_hallucination(str(row.get("prompt", "")), str(row.get("response", "")))
+        s["accuracy"] = check_accuracy(str(row.get("reference", "")), str(row.get("response", "")))
+        s["coherence"] = check_coherence(str(row.get("response", "")))
+        s["fluency"] = check_fluency(str(row.get("response", "")))
+        scores.append(s)

+    metrics_df = pd.concat([df.reset_index(drop=True), pd.DataFrame(scores)], axis=1)

+    # leaderboard: average per agent & task
+    metric_cols = ["instruction_following", "hallucination", "accuracy", "coherence", "fluency"]
     leaderboard = (
+        metrics_df.groupby(["agent", "task"])[metric_cols]
         .mean()
         .reset_index()
     )

+    # -------------------
+    # Visualization images
+    # -------------------
     images = []

+    try:
+        rows = []
+        for agent in leaderboard["agent"].unique():
+            vals = leaderboard[leaderboard["agent"] == agent][metric_cols].mean().tolist()
+            rows.append({"name": agent, "values": vals})
+        fig1 = spider_net_multi(metric_cols, rows, "Agent Performance Radar")
+        path1 = f"/tmp/radar_{uuid.uuid4().hex}.png"
+        fig1.savefig(path1, bbox_inches="tight")
+        plt.close(fig1)
+        images.append((path1, "Radar Plot"))
+    except Exception as e:
+        print("Radar plot failed:", e)
+
+    try:
+        fig2 = heatmap_plot(metrics_df, metric_cols, title="Metric Correlation Heatmap")
+        path2 = f"/tmp/heatmap_{uuid.uuid4().hex}.png"
+        fig2.savefig(path2, bbox_inches="tight")
+        plt.close(fig2)
+        images.append((path2, "Correlation Heatmap"))
+    except Exception as e:
+        print("Heatmap failed:", e)
+
+    try:
+        fig3 = task_agent_heatmap(leaderboard, "accuracy")
+        path3 = f"/tmp/task_agent_{uuid.uuid4().hex}.png"
+        fig3.savefig(path3, bbox_inches="tight")
+        plt.close(fig3)
+        images.append((path3, "Task-Agent Heatmap (Accuracy)"))
+    except Exception as e:
+        print("Task-agent heatmap failed:", e)
+
+    try:
+        fig4 = leaderboard_barplot(leaderboard, metric_cols)
+        path4 = f"/tmp/barplot_{uuid.uuid4().hex}.png"
+        fig4.savefig(path4, bbox_inches="tight")
+        plt.close(fig4)
+        images.append((path4, "Leaderboard Bar Chart"))
+    except Exception as e:
+        print("Barplot failed:", e)
+
+    try:
+        fig5 = distribution_plot(metrics_df, "accuracy")
+        path5 = f"/tmp/distribution_{uuid.uuid4().hex}.png"
+        fig5.savefig(path5, bbox_inches="tight")
+        plt.close(fig5)
+        images.append((path5, "Accuracy Distribution"))
+    except Exception as e:
+        print("Distribution plot failed:", e)
+
+    try:
+        fig6 = scatter_two_metrics(metrics_df, "instruction_following", "accuracy")
+        path6 = f"/tmp/scatter_{uuid.uuid4().hex}.png"
+        fig6.savefig(path6, bbox_inches="tight")
+        plt.close(fig6)
+        images.append((path6, "Instruction Following vs Accuracy"))
+    except Exception as e:
+        print("Scatter plot failed:", e)
+
+    return metrics_df, images, leaderboard
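
End to end, evaluate_dataframe expects agent, task, prompt, response, and reference columns, and returns the scored rows, a list of (path, caption) chart tuples, and the per-agent/per-task leaderboard. A minimal driver sketch on toy data (assuming the module is importable as evaluator):

    import pandas as pd
    from evaluator import evaluate_dataframe

    df = pd.DataFrame([
        {"agent": "agent_a", "task": "qa", "prompt": "What is 2 + 2?",
         "response": "2 + 2 equals 4.", "reference": "4"},
        {"agent": "agent_b", "task": "qa", "prompt": "What is 2 + 2?",
         "response": "It is 5.", "reference": "4"},
    ])

    metrics_df, images, leaderboard = evaluate_dataframe(df)
    print(leaderboard)             # mean of each metric per (agent, task)
    for path, caption in images:
        print(caption, "->", path)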