Update evaluator.py

#18
by manayporwal07 - opened
Files changed (1)
  1. evaluator.py +112 -195
evaluator.py CHANGED
@@ -206,174 +206,85 @@
 
  # evaluator.py
  """
- Upgraded Evaluation logic for the Agentic Evaluation Framework.
- Provides scoring functions, visualization generation, and summary outputs.
  """
 
- import math
- import uuid
- from typing import List, Dict, Tuple
  import numpy as np
  import pandas as pd
  import matplotlib.pyplot as plt
  import seaborn as sns
 
- # -----------------------------
- # Lazy model loading
- # -----------------------------
- NLI_MODEL = "textattack/roberta-base-MNLI"
  EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
 
- _nli_tokenizer = None
- _nli_model = None
- _embed_model = None
- _id2label = None
-
- def ensure_models_loaded():
-     global _nli_tokenizer, _nli_model, _embed_model, _id2label
-     if _embed_model is None:
-         from sentence_transformers import SentenceTransformer, util
-         _embed_model = SentenceTransformer(EMBED_MODEL)
-         globals()["util"] = util
-     if _nli_model is None:
-         from transformers import AutoTokenizer, AutoModelForSequenceClassification
-         _nli_tokenizer = AutoTokenizer.from_pretrained(NLI_MODEL)
-         _nli_model = AutoModelForSequenceClassification.from_pretrained(NLI_MODEL)
-         _nli_model.to("cpu")
-         _nli_model.eval()
-         _id2label = {int(k): v.upper() for k, v in _nli_model.config.id2label.items()}
-
- def get_embed_model():
-     ensure_models_loaded()
-     return _embed_model
-
- def get_nli_tokenizer_and_model():
-     ensure_models_loaded()
-     return _nli_tokenizer, _nli_model, _id2label
-
- # -----------------------------
- # Metric functions
- # -----------------------------
-
- def check_instruction_following(prompt: str, response: str) -> float:
      if not prompt or not response:
          return 0.0
-     embed_model = get_embed_model()
-     p_emb = embed_model.encode(prompt, convert_to_tensor=True)
-     r_emb = embed_model.encode(response, convert_to_tensor=True)
      sim = float(util.cos_sim(p_emb, r_emb).item())
      return round(max(0.0, min(1.0, sim)), 3)
 
- def check_hallucination(prompt: str, response: str) -> float:
      if not prompt or not response:
          return 0.0
-     tokenizer, model, id2label = get_nli_tokenizer_and_model()
-     inputs = tokenizer.encode_plus(prompt, response, return_tensors="pt", truncation=True)
-     outputs = model(**inputs)
      probs = outputs.logits.softmax(dim=1).detach().cpu().numpy()[0]
-     labels = [id2label[i] for i in range(len(probs))]
-     if "ENTAILMENT" in labels:
-         entailment_prob = float(probs[labels.index("ENTAILMENT")])
-     else:
-         entailment_prob = float(probs.max())
-     return round(entailment_prob, 3)
-
- def check_accuracy(reference: str, response: str) -> float:
      if not reference or not response:
          return 0.0
-     embed_model = get_embed_model()
-     ref_emb = embed_model.encode(reference, convert_to_tensor=True)
-     r_emb = embed_model.encode(response, convert_to_tensor=True)
      sim = float(util.cos_sim(ref_emb, r_emb).item())
      return round(max(0.0, min(1.0, sim)), 3)
 
- def check_coherence(response: str) -> float:
-     if not response or not isinstance(response, str):
          return 0.0
-     sentences = [s.strip() for s in response.split(".") if s.strip()]
-     if not sentences:
-         return 0.0
-     lengths = [len(s.split()) for s in sentences]
-     avg_len = np.mean(lengths)
-     std = np.std(lengths)
-     score = max(0.0, min(1.0, 1.0 - (std / (avg_len + 1e-6))))
-     return round(score, 3)
-
- def check_fluency(response: str) -> float:
-     if not response or not isinstance(response, str):
          return 0.0
      letters = sum(ch.isalpha() for ch in response)
      total = len(response)
-     return round(max(0.0, min(1.0, letters / max(1, total))), 3)
-
- # -----------------------------
- # Visualization helpers
- # -----------------------------
-
- def spider_net_multi(labels: List[str], rows: List[Dict], title: str, fill_alpha: float = 0.12):
-     N = len(labels)
-     angles = [n / float(N) * 2 * math.pi for n in range(N)]
-     angles += angles[:1]
-     fig = plt.figure(figsize=(6.5, 6.5))
-     ax = plt.subplot(111, polar=True)
-     ax.set_xticks(angles[:-1])
-     ax.set_xticklabels(labels, fontsize=9)
-     ax.set_ylim(0, 1)
-     ax.set_yticks([0, 0.25, 0.5, 0.75, 1.0])
-     for r in rows:
-         values = r["values"]
-         values_closed = values + values[:1]
-         ax.plot(angles, values_closed, linewidth=1.5, label=r["name"])
-         ax.fill(angles, values_closed, alpha=fill_alpha)
-     ax.set_title(title, y=1.08, fontsize=12)
-     ax.legend(loc="upper right", bbox_to_anchor=(1.25, 1.1))
-     return fig
-
- def heatmap_plot(df: pd.DataFrame, metric_cols: List[str], title: str = "Metric Correlations"):
-     fig, ax = plt.subplots(figsize=(7, 5))
-     sns.heatmap(df[metric_cols].corr(), annot=True, fmt=".2f", cmap="coolwarm", ax=ax)
-     ax.set_title(title)
-     return fig
-
- def task_agent_heatmap(leaderboard: pd.DataFrame, metric: str):
-     pivot = leaderboard.pivot(index="task", columns="agent", values=metric)
-     fig, ax = plt.subplots(figsize=(7, 5))
-     sns.heatmap(pivot, annot=True, fmt=".2f", cmap="YlGnBu", ax=ax)
-     ax.set_title(f"Task-Agent Performance ({metric})")
-     return fig
-
- def leaderboard_barplot(leaderboard: pd.DataFrame, metric_cols: List[str]):
-     melted = leaderboard.melt(id_vars=["agent"], value_vars=metric_cols, var_name="metric", value_name="score")
-     fig, ax = plt.subplots(figsize=(8, 5))
-     sns.barplot(x="metric", y="score", hue="agent", data=melted, ax=ax)
-     ax.set_title("Leaderboard Bar Chart")
-     ax.set_ylim(0, 1)
-     return fig
-
- def distribution_plot(metrics_df: pd.DataFrame, metric: str):
-     fig, ax = plt.subplots(figsize=(7, 5))
-     sns.boxplot(x="agent", y=metric, data=metrics_df, ax=ax)
-     sns.stripplot(x="agent", y=metric, data=metrics_df, ax=ax, color="black", alpha=0.4, jitter=True)
-     ax.set_title(f"Distribution of {metric} Scores per Agent")
-     ax.set_ylim(0, 1)
-     return fig
-
- def scatter_two_metrics(metrics_df: pd.DataFrame, metric_x: str, metric_y: str):
-     fig, ax = plt.subplots(figsize=(6, 6))
-     sns.scatterplot(x=metric_x, y=metric_y, hue="agent", data=metrics_df, ax=ax, alpha=0.7)
-     ax.set_title(f"{metric_x} vs {metric_y}")
-     ax.set_xlim(0, 1)
-     ax.set_ylim(0, 1)
-     return fig
 
- # -----------------------------
- # Main evaluation entrypoint
- # -----------------------------
 
- def evaluate_dataframe(df: pd.DataFrame) -> Tuple[pd.DataFrame, List[Tuple[str,str]], pd.DataFrame]:
-     df = df.copy()
-
-     # compute scores per row
      scores = []
      for _, row in df.iterrows():
          s = {}
@@ -382,79 +293,85 @@ def evaluate_dataframe(df: pd.DataFrame) -> Tuple[pd.DataFrame, List[Tuple[str,s
          s["accuracy"] = check_accuracy(str(row.get("reference", "")), str(row.get("response", "")))
          s["coherence"] = check_coherence(str(row.get("response", "")))
          s["fluency"] = check_fluency(str(row.get("response", "")))
          scores.append(s)
 
      metrics_df = pd.concat([df.reset_index(drop=True), pd.DataFrame(scores)], axis=1)
 
-     # leaderboard: average per agent & task
-     metric_cols = ["instruction_following", "hallucination", "accuracy", "coherence", "fluency"]
      leaderboard = (
-         metrics_df.groupby(["agent", "task"])[metric_cols]
          .mean()
          .reset_index()
      )
 
-     # -------------------
-     # Visualization images
-     # -------------------
-     images = []
 
-     try:
-         rows = []
-         for agent in leaderboard["agent"].unique():
-             vals = leaderboard[leaderboard["agent"] == agent][metric_cols].mean().tolist()
-             rows.append({"name": agent, "values": vals})
-         fig1 = spider_net_multi(metric_cols, rows, "Agent Performance Radar")
-         path1 = f"/tmp/radar_{uuid.uuid4().hex}.png"
-         fig1.savefig(path1, bbox_inches="tight")
-         plt.close(fig1)
-         images.append((path1, "Radar Plot"))
-     except Exception as e:
-         print("Radar plot failed:", e)
 
-     try:
-         fig2 = heatmap_plot(metrics_df, metric_cols, title="Metric Correlation Heatmap")
-         path2 = f"/tmp/heatmap_{uuid.uuid4().hex}.png"
-         fig2.savefig(path2, bbox_inches="tight")
-         plt.close(fig2)
-         images.append((path2, "Correlation Heatmap"))
-     except Exception as e:
-         print("Heatmap failed:", e)
 
      try:
-         fig3 = task_agent_heatmap(leaderboard, "accuracy")
-         path3 = f"/tmp/task_agent_{uuid.uuid4().hex}.png"
-         fig3.savefig(path3, bbox_inches="tight")
-         plt.close(fig3)
-         images.append((path3, "Task-Agent Heatmap (Accuracy)"))
      except Exception as e:
-         print("Task-agent heatmap failed:", e)
-
      try:
-         fig4 = leaderboard_barplot(leaderboard, metric_cols)
-         path4 = f"/tmp/barplot_{uuid.uuid4().hex}.png"
-         fig4.savefig(path4, bbox_inches="tight")
-         plt.close(fig4)
-         images.append((path4, "Leaderboard Bar Chart"))
      except Exception as e:
-         print("Barplot failed:", e)
-
      try:
-         fig5 = distribution_plot(metrics_df, "accuracy")
-         path5 = f"/tmp/distribution_{uuid.uuid4().hex}.png"
-         fig5.savefig(path5, bbox_inches="tight")
-         plt.close(fig5)
-         images.append((path5, "Accuracy Distribution"))
      except Exception as e:
-         print("Distribution plot failed:", e)
-
      try:
-         fig6 = scatter_two_metrics(metrics_df, "instruction_following", "accuracy")
-         path6 = f"/tmp/scatter_{uuid.uuid4().hex}.png"
-         fig6.savefig(path6, bbox_inches="tight")
-         plt.close(fig6)
-         images.append((path6, "Instruction Following vs Accuracy"))
      except Exception as e:
-         print("Scatter plot failed:", e)
 
-     return metrics_df, images, leaderboard
 
  # evaluator.py
  """
+ Evaluator for the Agentic Evaluation Framework.
  """
 
  import numpy as np
  import pandas as pd
  import matplotlib.pyplot as plt
  import seaborn as sns
+ import math, uuid, re
 
+ from sentence_transformers import SentenceTransformer, util
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
+
+ # ------------------------
+ # Models (lightweight)
+ # ------------------------
  EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
+ NLI_MODEL = "textattack/roberta-base-MNLI"
 
+ _embed_model = SentenceTransformer(EMBED_MODEL)
+ _nli_tokenizer = AutoTokenizer.from_pretrained(NLI_MODEL)
+ _nli_model = AutoModelForSequenceClassification.from_pretrained(NLI_MODEL)
+ _id2label = {int(k): v.upper() for k, v in _nli_model.config.id2label.items()}
+
+ # ------------------------
+ # Metrics
+ # ------------------------
+
+ def check_instruction_following(prompt, response):
      if not prompt or not response:
          return 0.0
+     p_emb = _embed_model.encode(prompt, convert_to_tensor=True)
+     r_emb = _embed_model.encode(response, convert_to_tensor=True)
      sim = float(util.cos_sim(p_emb, r_emb).item())
      return round(max(0.0, min(1.0, sim)), 3)
 
+ def check_hallucination(prompt, response):
      if not prompt or not response:
          return 0.0
+     inputs = _nli_tokenizer.encode_plus(prompt, response, return_tensors="pt", truncation=True)
+     outputs = _nli_model(**inputs)
      probs = outputs.logits.softmax(dim=1).detach().cpu().numpy()[0]
+     labels = [_id2label[i] for i in range(len(probs))]
+     entailment_prob = float(probs[labels.index("ENTAILMENT")]) if "ENTAILMENT" in labels else float(probs.max())
+     return round(max(0.0, min(1.0, entailment_prob)), 3)
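
A reviewer-side sanity check for the entailment score above (a hypothetical snippet, not part of this diff; it assumes the module has been imported and the MNLI checkpoint downloaded). The returned value is an entailment probability, so higher means the response is better supported by the prompt:

# Hypothetical spot check; exact numbers depend on the MNLI checkpoint.
supported = check_hallucination("The Eiffel Tower is in Paris.",
                                "The tower is located in Paris.")
contradicted = check_hallucination("The Eiffel Tower is in Paris.",
                                   "The tower is located in Berlin.")
print(supported, contradicted)  # expect supported > contradicted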
+
+ def check_accuracy(reference, response):
      if not reference or not response:
          return 0.0
+     ref_emb = _embed_model.encode(reference, convert_to_tensor=True)
+     r_emb = _embed_model.encode(response, convert_to_tensor=True)
      sim = float(util.cos_sim(ref_emb, r_emb).item())
      return round(max(0.0, min(1.0, sim)), 3)
 
+ def check_coherence(response):
+     if not response:
          return 0.0
+     sents = [s.strip() for s in re.split(r"[.!?]+", response) if s.strip()]
+     if len(sents) <= 1:
+         return 1.0
+     embs = _embed_model.encode(sents, convert_to_tensor=True)
+     sims = []
+     for i in range(len(embs)):
+         for j in range(i + 1, len(embs)):
+             sims.append(float(util.cos_sim(embs[i], embs[j]).item()))
+     avg = np.mean(sims)
+     return round((avg + 1) / 2, 3)  # rescale cosine from [-1, 1] to [0, 1]
+
+ def check_fluency(response):
+     if not response:
          return 0.0
      letters = sum(ch.isalpha() for ch in response)
      total = len(response)
+     return round(letters / max(1, total), 3)
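
Taken together, the five metrics can be spot-checked on a single example (a minimal sketch, assuming the functions above are importable from this module; exact scores depend on the downloaded checkpoints):

# Hypothetical usage of the metric functions defined above.
prompt = "Summarize: The cat sat on the mat."
response = "A cat was sitting on a mat. It stayed there."
reference = "The cat sat on the mat."
print(check_instruction_following(prompt, response))  # prompt/response embedding cosine
print(check_hallucination(prompt, response))          # MNLI entailment probability
print(check_accuracy(reference, response))            # reference/response embedding cosine
print(check_coherence(response))                      # mean pairwise sentence similarity, rescaled
print(check_fluency(response))                        # share of alphabetic characters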
 
+ # ------------------------
+ # Evaluation
+ # ------------------------
 
+ def evaluate_dataframe(df: pd.DataFrame):
      scores = []
      for _, row in df.iterrows():
          s = {}
          s["accuracy"] = check_accuracy(str(row.get("reference", "")), str(row.get("response", "")))
          s["coherence"] = check_coherence(str(row.get("response", "")))
          s["fluency"] = check_fluency(str(row.get("response", "")))
+         # clamp all metric scores into [0, 1]
+         for k in s:
+             s[k] = max(0.0, min(1.0, s[k]))
+         s["final_score"] = round(float(np.mean(list(s.values()))), 3)
          scores.append(s)
 
      metrics_df = pd.concat([df.reset_index(drop=True), pd.DataFrame(scores)], axis=1)
+     metric_cols = ["instruction_following", "hallucination", "accuracy", "coherence", "fluency", "final_score"]
 
      leaderboard = (
+         metrics_df.groupby(["agent", "task_type"])[metric_cols]
          .mean()
          .reset_index()
      )
+     return metrics_df, [], leaderboard
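
As a usage sketch for the entry point (hypothetical data; the column names agent, task_type, prompt, response, and reference follow the row lookups and groupby used in this file):

df = pd.DataFrame([
    {"agent": "agent_a", "task_type": "qa",
     "prompt": "What is 2 + 2?", "response": "4", "reference": "4"},
    {"agent": "agent_b", "task_type": "qa",
     "prompt": "What is 2 + 2?", "response": "Probably 5.", "reference": "4"},
])
metrics_df, _, leaderboard = evaluate_dataframe(df)
print(leaderboard[["agent", "task_type", "final_score"]])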
 
+ # ------------------------
+ # Visualizations
+ # ------------------------
 
+ def plot_radar_chart(leaderboard, metric_cols):
+     categories = metric_cols
+     angles = np.linspace(0, 2 * np.pi, len(categories), endpoint=False).tolist()
+     angles += angles[:1]
+     fig = plt.figure(figsize=(6, 6))
+     ax = plt.subplot(111, polar=True)
+     for agent in leaderboard["agent"].unique():
+         vals = leaderboard[leaderboard["agent"] == agent][metric_cols].mean().tolist()
+         vals += vals[:1]
+         ax.plot(angles, vals, label=agent)
+         ax.fill(angles, vals, alpha=0.1)
+     ax.set_xticks(angles[:-1])
+     ax.set_xticklabels(categories)
+     ax.set_ylim(0, 1)
+     ax.legend(loc="upper right")
+     return fig
 
+ def plot_heatmap(metrics_df, metric_cols):
+     fig, ax = plt.subplots(figsize=(7, 5))
+     sns.heatmap(metrics_df[metric_cols].corr(), annot=True, fmt=".2f", cmap="coolwarm", ax=ax)
+     return fig
+
+ def plot_boxplot(metrics_df, metric_cols):
+     fig, ax = plt.subplots(figsize=(7, 5))
+     sns.boxplot(data=metrics_df[metric_cols], ax=ax)
+     return fig
 
+ def plot_bar(leaderboard, metric_cols):
+     # metric_cols is unused here; the bar chart shows final_score per agent
+     fig, ax = plt.subplots(figsize=(8, 5))
+     leaderboard.plot(x="agent", y="final_score", kind="bar", ax=ax, legend=False)
+     ax.set_ylabel("Final Score")
+     return fig
+
+ def generate_visualizations(metrics_df, leaderboard):
+     metric_cols = ["instruction_following", "hallucination", "accuracy", "coherence", "fluency", "final_score"]
+     figs = []
      try:
+         figs.append(plot_radar_chart(leaderboard, metric_cols))
      except Exception as e:
+         print("Radar failed:", e)
      try:
+         figs.append(plot_heatmap(metrics_df, metric_cols))
      except Exception as e:
+         print("Heatmap failed:", e)
      try:
+         figs.append(plot_boxplot(metrics_df, metric_cols))
      except Exception as e:
+         print("Boxplot failed:", e)
      try:
+         figs.append(plot_bar(leaderboard, metric_cols))
      except Exception as e:
+         print("Bar failed:", e)
+
+     # Save each figure to a temp file and return the paths as a gallery list
+     images = []
+     for fig in figs:
+         path = f"/tmp/viz_{uuid.uuid4().hex}.png"
+         fig.savefig(path, bbox_inches="tight")
+         plt.close(fig)
+         images.append(path)
+     return images
377