Spaces:
Runtime error
Runtime error
Update evaluator.py
#7
by
manayporwal07
- opened
- evaluator.py +87 -23
evaluator.py
CHANGED
|
@@ -276,6 +276,57 @@ import uuid
|
|
| 276 |
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
| 277 |
from sentence_transformers import SentenceTransformer, util
|
| 278 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 279 |
# --------------------------
|
| 280 |
# MODEL LOADING
|
| 281 |
# --------------------------
|
|
@@ -408,28 +459,41 @@ def evaluate_dataframe(df: pd.DataFrame):
|
|
| 408 |
.reset_index()
|
| 409 |
)
|
| 410 |
|
| 411 |
-
# Plots
|
| 412 |
-
images = []
|
| 413 |
-
out_dir = "/tmp/plots"
|
| 414 |
-
os.makedirs(out_dir, exist_ok=True)
|
| 415 |
-
|
| 416 |
-
# Histogram of scores
|
| 417 |
-
plt.figure(figsize=(6, 4))
|
| 418 |
-
sns.histplot(metrics_df["final_score"], bins=10, kde=False)
|
| 419 |
-
plt.title("Distribution of Final Scores")
|
| 420 |
-
hist_path = os.path.join(out_dir, f"hist_{uuid.uuid4().hex}.png")
|
| 421 |
-
plt.savefig(hist_path)
|
| 422 |
-
plt.close()
|
| 423 |
-
images.append((hist_path, "Final Score Distribution"))
|
| 424 |
-
|
| 425 |
-
# Per-agent average
|
| 426 |
-
plt.figure(figsize=(6, 4))
|
| 427 |
-
agent_scores = metrics_df.groupby("agent")["final_score"].mean().reset_index()
|
| 428 |
-
sns.barplot(data=agent_scores, x="agent", y="final_score")
|
| 429 |
-
plt.title("Average Final Score per Agent")
|
| 430 |
-
bar_path = os.path.join(out_dir, f"bar_{uuid.uuid4().hex}.png")
|
| 431 |
-
plt.savefig(bar_path)
|
| 432 |
-
plt.close()
|
| 433 |
-
images.append((bar_path, "Average Score per Agent"))
|
| 434 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 435 |
return metrics_df, images, leaderboard
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 276 |
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
| 277 |
from sentence_transformers import SentenceTransformer, util
|
| 278 |
|
| 279 |
+
import matplotlib.pyplot as plt
|
| 280 |
+
import numpy as np
|
| 281 |
+
|
| 282 |
+
def plot_radar_chart(metrics_df, agents, metrics, out_path="/tmp/radar.png"):
    """Draw a radar (polar) chart comparing agents across metrics and save it.

    For every agent, the mean of each metric column is taken over the rows of
    ``metrics_df`` whose ``agent`` column equals that agent. A NaN mean (for
    example, when no rows match) is plotted as 0.

    Args:
        metrics_df: DataFrame with an ``agent`` column and one column per
            entry in ``metrics``.
        agents: Iterable of agent identifiers; one polygon is drawn per agent.
        metrics: Sequence of metric column names; one radial axis per metric.
        out_path: File path the figure is written to. The default is a fixed
            path, so successive calls overwrite each other unless the caller
            supplies a different one.

    Returns:
        ``out_path``, unchanged.
    """
    labels = list(metrics)

    # One evenly spaced angle per metric; the first angle is repeated at the
    # end so that each plotted polygon closes on itself.
    angles = np.linspace(0, 2 * np.pi, len(labels), endpoint=False).tolist()
    angles += angles[:1]

    fig, ax = plt.subplots(figsize=(6, 6), subplot_kw=dict(polar=True))
    try:
        for agent in agents:
            # Row selection depends only on the agent, so build it once
            # instead of once per metric.
            agent_mask = metrics_df["agent"] == agent
            values = []
            for metric in labels:
                mean_val = metrics_df.loc[agent_mask, metric].mean()
                values.append(0 if np.isnan(mean_val) else mean_val)
            values += values[:1]  # close the polygon, mirroring `angles`
            ax.plot(angles, values, label=agent, linewidth=2)
            ax.fill(angles, values, alpha=0.25)

        ax.set_xticks(angles[:-1])
        ax.set_xticklabels(labels)
        ax.set_yticklabels([])
        ax.legend(loc="upper right", bbox_to_anchor=(1.3, 1.1))
        ax.set_title("Agent Performance Radar Chart")

        # Operate on `fig` explicitly rather than on pyplot's implicit
        # "current figure", which other code may have changed.
        fig.tight_layout()
        fig.savefig(out_path)
    finally:
        # Always release the figure, even if plotting or saving failed.
        plt.close(fig)
    return out_path
|
| 314 |
+
|
| 315 |
+
import seaborn as sns
|
| 316 |
+
|
| 317 |
+
def plot_heatmap(metrics_df, out_path="/tmp/heatmap.png"):
    """Save an annotated heatmap of per-agent mean metric values.

    Rows of ``metrics_df`` are grouped by the ``agent`` column and the mean of
    the ``accuracy``, ``hallucination``, ``instruction_following``,
    ``coherence`` and ``assumption`` columns is computed for each group. The
    resulting agent-by-metric table is rendered with two-decimal annotations.

    Args:
        metrics_df: DataFrame containing an ``agent`` column and the five
            metric columns listed above.
        out_path: File path the figure is written to. The default is a fixed
            path, so successive calls overwrite each other unless the caller
            supplies a different one.

    Returns:
        ``out_path``, unchanged.
    """
    metric_columns = [
        "accuracy",
        "hallucination",
        "instruction_following",
        "coherence",
        "assumption",
    ]
    pivot = metrics_df.groupby("agent")[metric_columns].mean()

    fig, ax = plt.subplots(figsize=(8, 5))
    try:
        # Draw on the axes created above instead of relying on pyplot's
        # implicit "current axes".
        sns.heatmap(pivot, annot=True, cmap="viridis", fmt=".2f", ax=ax)
        ax.set_title("Agent × Metric Heatmap")
        fig.tight_layout()
        fig.savefig(out_path)
    finally:
        # Always release the figure, even if plotting or saving failed.
        plt.close(fig)
    return out_path
|
| 329 |
+
|
| 330 |
# --------------------------
|
| 331 |
# MODEL LOADING
|
| 332 |
# --------------------------
|
|
|
|
| 459 |
.reset_index()
|
| 460 |
)
|
| 461 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 462 |
|
| 463 |
+
# # Plots
|
| 464 |
+
# images = []
|
| 465 |
+
# Existing images list
|
| 466 |
+
images = []
|
| 467 |
+
|
| 468 |
+
# Add radar chart
|
| 469 |
+
radar_path = plot_radar_chart(metrics_df, agents=df["agent"].unique(),
|
| 470 |
+
metrics=["accuracy", "hallucination", "instruction_following", "coherence", "assumption"])
|
| 471 |
+
images.append((radar_path, "Radar Chart: Agent vs Metrics"))
|
| 472 |
+
|
| 473 |
+
# Add heatmap
|
| 474 |
+
heatmap_path = plot_heatmap(metrics_df)
|
| 475 |
+
images.append((heatmap_path, "Heatmap: Agent vs Metrics"))
|
| 476 |
return metrics_df, images, leaderboard
|
| 477 |
+
# out_dir = "/tmp/plots"
|
| 478 |
+
# os.makedirs(out_dir, exist_ok=True)
|
| 479 |
+
|
| 480 |
+
# # Histogram of scores
|
| 481 |
+
# plt.figure(figsize=(6, 4))
|
| 482 |
+
# sns.histplot(metrics_df["final_score"], bins=10, kde=False)
|
| 483 |
+
# plt.title("Distribution of Final Scores")
|
| 484 |
+
# hist_path = os.path.join(out_dir, f"hist_{uuid.uuid4().hex}.png")
|
| 485 |
+
# plt.savefig(hist_path)
|
| 486 |
+
# plt.close()
|
| 487 |
+
# images.append((hist_path, "Final Score Distribution"))
|
| 488 |
+
|
| 489 |
+
# # Per-agent average
|
| 490 |
+
# plt.figure(figsize=(6, 4))
|
| 491 |
+
# agent_scores = metrics_df.groupby("agent")["final_score"].mean().reset_index()
|
| 492 |
+
# sns.barplot(data=agent_scores, x="agent", y="final_score")
|
| 493 |
+
# plt.title("Average Final Score per Agent")
|
| 494 |
+
# bar_path = os.path.join(out_dir, f"bar_{uuid.uuid4().hex}.png")
|
| 495 |
+
# plt.savefig(bar_path)
|
| 496 |
+
# plt.close()
|
| 497 |
+
# images.append((bar_path, "Average Score per Agent"))
|
| 498 |
+
|
| 499 |
+
# return metrics_df, images, leaderboard
|