Spaces:

Supastrikas-004
/

evaluation-framework

Runtime error

App Files Files Community

Update evaluator.py

by manayporwal07 - opened Sep 13, 2025

base: refs/heads/main

←

from: refs/pr/7

Discussion Files changed

+87

-23

Files changed (1) hide show

evaluator.py +87 -23

evaluator.py CHANGED Viewed

@@ -276,6 +276,57 @@ import uuid
 from transformers import AutoTokenizer, AutoModelForSequenceClassification
 from sentence_transformers import SentenceTransformer, util
 # --------------------------
 # MODEL LOADING
 # --------------------------
@@ -408,28 +459,41 @@ def evaluate_dataframe(df: pd.DataFrame):
         .reset_index()
     )
-    # Plots
-    images = []
-    out_dir = "/tmp/plots"
-    os.makedirs(out_dir, exist_ok=True)
-    # Histogram of scores
-    plt.figure(figsize=(6, 4))
-    sns.histplot(metrics_df["final_score"], bins=10, kde=False)
-    plt.title("Distribution of Final Scores")
-    hist_path = os.path.join(out_dir, f"hist_{uuid.uuid4().hex}.png")
-    plt.savefig(hist_path)
-    plt.close()
-    images.append((hist_path, "Final Score Distribution"))
-    # Per-agent average
-    plt.figure(figsize=(6, 4))
-    agent_scores = metrics_df.groupby("agent")["final_score"].mean().reset_index()
-    sns.barplot(data=agent_scores, x="agent", y="final_score")
-    plt.title("Average Final Score per Agent")
-    bar_path = os.path.join(out_dir, f"bar_{uuid.uuid4().hex}.png")
-    plt.savefig(bar_path)
-    plt.close()
-    images.append((bar_path, "Average Score per Agent"))
     return metrics_df, images, leaderboard

 from transformers import AutoTokenizer, AutoModelForSequenceClassification
 from sentence_transformers import SentenceTransformer, util
+import matplotlib.pyplot as plt
+import numpy as np
+def plot_radar_chart(metrics_df, agents, metrics, out_path="/tmp/radar.png"):
+    """
+    Radar chart comparing multiple agents across metrics.
+    """
+    labels = metrics
+    num_vars = len(labels)
+    # Compute angle for each axis
+    angles = np.linspace(0, 2 * np.pi, num_vars, endpoint=False).tolist()
+    angles += angles[:1]  # close loop
+    fig, ax = plt.subplots(figsize=(6, 6), subplot_kw=dict(polar=True))
+    for agent in agents:
+        values = []
+        for m in metrics:
+            mean_val = metrics_df.loc[metrics_df['agent'] == agent, m].mean()
+            values.append(mean_val if not np.isnan(mean_val) else 0)
+        values += values[:1]
+        ax.plot(angles, values, label=agent, linewidth=2)
+        ax.fill(angles, values, alpha=0.25)
+    ax.set_xticks(angles[:-1])
+    ax.set_xticklabels(labels)
+    ax.set_yticklabels([])
+    ax.legend(loc="upper right", bbox_to_anchor=(1.3, 1.1))
+    ax.set_title("Agent Performance Radar Chart")
+    plt.tight_layout()
+    plt.savefig(out_path)
+    plt.close()
+    return out_path
+import seaborn as sns
+def plot_heatmap(metrics_df, out_path="/tmp/heatmap.png"):
+    pivot = metrics_df.groupby("agent")[
+        ["accuracy", "hallucination", "instruction_following", "coherence", "assumption"]
+    ].mean()
+    plt.figure(figsize=(8, 5))
+    sns.heatmap(pivot, annot=True, cmap="viridis", fmt=".2f")
+    plt.title("Agent × Metric Heatmap")
+    plt.tight_layout()
+    plt.savefig(out_path)
+    plt.close()
+    return out_path
 # --------------------------
 # MODEL LOADING
 # --------------------------
         .reset_index()
     )
+    # # Plots
+    # images = []
+    # Existing images list
+    images = []
+    # Add radar chart
+    radar_path = plot_radar_chart(metrics_df, agents=df["agent"].unique(),
+                                  metrics=["accuracy", "hallucination", "instruction_following", "coherence", "assumption"])
+    images.append((radar_path, "Radar Chart: Agent vs Metrics"))
+    # Add heatmap
+    heatmap_path = plot_heatmap(metrics_df)
+    images.append((heatmap_path, "Heatmap: Agent vs Metrics"))
     return metrics_df, images, leaderboard
+    # out_dir = "/tmp/plots"
+    # os.makedirs(out_dir, exist_ok=True)
+    # # Histogram of scores
+    # plt.figure(figsize=(6, 4))
+    # sns.histplot(metrics_df["final_score"], bins=10, kde=False)
+    # plt.title("Distribution of Final Scores")
+    # hist_path = os.path.join(out_dir, f"hist_{uuid.uuid4().hex}.png")
+    # plt.savefig(hist_path)
+    # plt.close()
+    # images.append((hist_path, "Final Score Distribution"))
+    # # Per-agent average
+    # plt.figure(figsize=(6, 4))
+    # agent_scores = metrics_df.groupby("agent")["final_score"].mean().reset_index()
+    # sns.barplot(data=agent_scores, x="agent", y="final_score")
+    # plt.title("Average Final Score per Agent")
+    # bar_path = os.path.join(out_dir, f"bar_{uuid.uuid4().hex}.png")
+    # plt.savefig(bar_path)
+    # plt.close()
+    # images.append((bar_path, "Average Score per Agent"))
+    # return metrics_df, images, leaderboard