Update evaluator.py

#7
by manayporwal07 - opened
Files changed (1) hide show
  1. evaluator.py +87 -23
evaluator.py CHANGED
@@ -276,6 +276,57 @@ import uuid
276
  from transformers import AutoTokenizer, AutoModelForSequenceClassification
277
  from sentence_transformers import SentenceTransformer, util
278
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
279
  # --------------------------
280
  # MODEL LOADING
281
  # --------------------------
@@ -408,28 +459,41 @@ def evaluate_dataframe(df: pd.DataFrame):
408
  .reset_index()
409
  )
410
 
411
- # Plots
412
- images = []
413
- out_dir = "/tmp/plots"
414
- os.makedirs(out_dir, exist_ok=True)
415
-
416
- # Histogram of scores
417
- plt.figure(figsize=(6, 4))
418
- sns.histplot(metrics_df["final_score"], bins=10, kde=False)
419
- plt.title("Distribution of Final Scores")
420
- hist_path = os.path.join(out_dir, f"hist_{uuid.uuid4().hex}.png")
421
- plt.savefig(hist_path)
422
- plt.close()
423
- images.append((hist_path, "Final Score Distribution"))
424
-
425
- # Per-agent average
426
- plt.figure(figsize=(6, 4))
427
- agent_scores = metrics_df.groupby("agent")["final_score"].mean().reset_index()
428
- sns.barplot(data=agent_scores, x="agent", y="final_score")
429
- plt.title("Average Final Score per Agent")
430
- bar_path = os.path.join(out_dir, f"bar_{uuid.uuid4().hex}.png")
431
- plt.savefig(bar_path)
432
- plt.close()
433
- images.append((bar_path, "Average Score per Agent"))
434
 
 
 
 
 
 
 
 
 
 
 
 
 
 
435
  return metrics_df, images, leaderboard
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
276
  from transformers import AutoTokenizer, AutoModelForSequenceClassification
277
  from sentence_transformers import SentenceTransformer, util
278
 
279
+ import matplotlib.pyplot as plt
280
+ import numpy as np
281
+
282
def plot_radar_chart(metrics_df, agents, metrics, out_path="/tmp/radar.png"):
    """
    Radar chart comparing multiple agents across metrics.

    Args:
        metrics_df: DataFrame with an ``agent`` column and one column per
            entry in ``metrics``.
        agents: Iterable of agent names; one polygon is drawn per agent.
        metrics: Sequence of column names; one radial axis per metric.
        out_path: Path the rendered image is written to.

    Returns:
        ``out_path``, unchanged, so callers can append it to an image list.
    """
    labels = list(metrics)
    num_vars = len(labels)

    # One angle per metric, then repeat the first angle so each polygon closes.
    angles = np.linspace(0, 2 * np.pi, num_vars, endpoint=False).tolist()
    angles += angles[:1]

    fig, ax = plt.subplots(figsize=(6, 6), subplot_kw=dict(polar=True))
    try:
        for agent in agents:
            # Row mask is invariant across metrics, so compute it once.
            agent_mask = metrics_df["agent"] == agent
            values = []
            for m in labels:
                mean_val = metrics_df.loc[agent_mask, m].mean()
                # An agent with no rows (or all-NaN values) is plotted at 0.
                values.append(0 if np.isnan(mean_val) else mean_val)
            values += values[:1]  # close the polygon
            ax.plot(angles, values, label=agent, linewidth=2)
            ax.fill(angles, values, alpha=0.25)

        ax.set_xticks(angles[:-1])
        ax.set_xticklabels(labels)
        ax.set_yticklabels([])
        ax.legend(loc="upper right", bbox_to_anchor=(1.3, 1.1))
        ax.set_title("Agent Performance Radar Chart")

        # Operate on this figure explicitly rather than on pyplot's global
        # "current figure", which another plot call could have changed.
        fig.tight_layout()
        fig.savefig(out_path)
    finally:
        # Always release the figure, even if plotting or saving failed.
        plt.close(fig)
    return out_path
314
+
315
+ import seaborn as sns
316
+
317
def plot_heatmap(metrics_df, out_path="/tmp/heatmap.png", metrics=None):
    """
    Heatmap of the per-agent mean of each metric.

    Args:
        metrics_df: DataFrame with an ``agent`` column and one column per
            metric.
        out_path: Path the rendered image is written to.
        metrics: Optional sequence of metric column names. Defaults to the
            five metrics this function originally hard-coded.

    Returns:
        ``out_path``, unchanged, so callers can append it to an image list.
    """
    if metrics is None:
        metrics = [
            "accuracy",
            "hallucination",
            "instruction_following",
            "coherence",
            "assumption",
        ]

    # Rows: agents; columns: metrics; cells: mean score.
    pivot = metrics_df.groupby("agent")[list(metrics)].mean()

    fig = plt.figure(figsize=(8, 5))
    try:
        sns.heatmap(pivot, annot=True, cmap="viridis", fmt=".2f")
        # \u00d7 is the multiplication sign; written as an escape so the
        # title survives encoding round-trips of this source file.
        plt.title("Agent \u00d7 Metric Heatmap")
        plt.tight_layout()
        plt.savefig(out_path)
    finally:
        # Always release the figure, even if drawing or saving failed.
        plt.close(fig)
    return out_path
329
+
330
  # --------------------------
331
  # MODEL LOADING
332
  # --------------------------
 
459
  .reset_index()
460
  )
461
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
462
 
463
+ # # Plots
464
+ # images = []
465
+ # Existing images list
466
+ images = []
467
+
468
+ # Add radar chart
469
+ radar_path = plot_radar_chart(metrics_df, agents=df["agent"].unique(),
470
+ metrics=["accuracy", "hallucination", "instruction_following", "coherence", "assumption"])
471
+ images.append((radar_path, "Radar Chart: Agent vs Metrics"))
472
+
473
+ # Add heatmap
474
+ heatmap_path = plot_heatmap(metrics_df)
475
+ images.append((heatmap_path, "Heatmap: Agent vs Metrics"))
476
  return metrics_df, images, leaderboard
477
+ # out_dir = "/tmp/plots"
478
+ # os.makedirs(out_dir, exist_ok=True)
479
+
480
+ # # Histogram of scores
481
+ # plt.figure(figsize=(6, 4))
482
+ # sns.histplot(metrics_df["final_score"], bins=10, kde=False)
483
+ # plt.title("Distribution of Final Scores")
484
+ # hist_path = os.path.join(out_dir, f"hist_{uuid.uuid4().hex}.png")
485
+ # plt.savefig(hist_path)
486
+ # plt.close()
487
+ # images.append((hist_path, "Final Score Distribution"))
488
+
489
+ # # Per-agent average
490
+ # plt.figure(figsize=(6, 4))
491
+ # agent_scores = metrics_df.groupby("agent")["final_score"].mean().reset_index()
492
+ # sns.barplot(data=agent_scores, x="agent", y="final_score")
493
+ # plt.title("Average Final Score per Agent")
494
+ # bar_path = os.path.join(out_dir, f"bar_{uuid.uuid4().hex}.png")
495
+ # plt.savefig(bar_path)
496
+ # plt.close()
497
+ # images.append((bar_path, "Average Score per Agent"))
498
+
499
+ # return metrics_df, images, leaderboard