Files changed (1) hide show
  1. evaluator.py +326 -157
evaluator.py CHANGED
@@ -1,18 +1,278 @@
1
- """
2
- Evaluation module: loads models, computes metrics, and creates visualizations.
3
- Lightweight, CPU-friendly, no Java required.
4
- """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
  import re
7
- import math
8
- import uuid
9
- from typing import List, Dict, Tuple
10
-
11
- import numpy as np
12
  import pandas as pd
13
  import matplotlib.pyplot as plt
14
  import seaborn as sns
15
- import torch
 
16
  from transformers import AutoTokenizer, AutoModelForSequenceClassification
17
  from sentence_transformers import SentenceTransformer, util
18
 
@@ -109,158 +369,67 @@ def check_accuracy(reference: str, response: str) -> float:
109
 
110
 
111
  # --------------------------
112
- # SCORING PIPELINE
113
  # --------------------------
114
- def compute_row_scores(prompt, response, reference) -> Dict:
115
- instr = check_instruction_following(prompt, response)
116
- halluc = check_hallucination(reference, response)
117
- assum = check_assumption(response)
118
- coh = check_coherence(response)
119
- acc = check_accuracy(reference, response)
120
-
121
- # Final score: average
122
- components = [instr, halluc, assum, coh, acc]
123
- final = round(float(sum(components) / len(components)), 3)
124
-
125
- return {
126
- "InstructionFollowing": instr,
127
- "Hallucination": halluc,
128
- "AssumptionControl": assum,
129
- "Coherence": coh,
130
- "Accuracy": acc,
131
- "FinalScore": final,
132
  }
133
 
 
 
 
 
 
 
 
 
 
 
134
 
135
- # --------------------------
136
- # VISUALIZATION HELPERS
137
- # --------------------------
138
- # def spider_net_multi(labels: List[str], rows: List[Dict], title: str, fill_alpha: float = 0.12):
139
- # """Radar chart for multiple agents."""
140
- # N = len(labels)
141
- # angles = [n / float(N) * 2 * math.pi for n in range(N)]
142
- # angles += angles[:1]
143
-
144
- # fig = plt.figure(figsize=(6.5, 6.5))
145
- # ax = plt.subplot(111, polar=True)
146
- # ax.set_xticks(angles[:-1])
147
- # ax.set_xticklabels(labels, fontsize=9)
148
- # ax.set_ylim(0, 100)
149
- # ax.set_yticks([0, 25, 50, 75, 100])
150
-
151
- # for r in rows:
152
- # values = r["values"]
153
- # values_closed = values + values[:1]
154
- # ax.plot(angles, values_closed, linewidth=1.5, label=r["name"])
155
- # ax.fill(angles, values_closed, alpha=fill_alpha)
156
-
157
- # ax.set_title(title, y=1.08, fontsize=12)
158
- # ax.legend(loc="upper right", bbox_to_anchor=(1.25, 1.1))
159
- # return fig
160
-
161
-
162
- # def heatmap_plot(df: pd.DataFrame, metric_cols: List[str], title: str = "Metric Correlations"):
163
- # fig, ax = plt.subplots(figsize=(7, 5))
164
- # sns.heatmap(df[metric_cols].corr(), annot=True, fmt=".2f", cmap="coolwarm", ax=ax)
165
- # ax.set_title(title)
166
- # return fig
167
 
168
-
169
- # --------------------------
170
- # HIGH-LEVEL EVALUATION
171
- # --------------------------
172
- def evaluate_dataframe(df: pd.DataFrame) -> Tuple[pd.DataFrame, List[Tuple[str, str]], pd.DataFrame]:
173
- """
174
- df must contain: prompt, response, task, agent, reference
175
- Returns: metrics_df, [(image_path, caption)], leaderboard_df
176
- """
177
- df = df.rename(columns={c: c.strip() for c in df.columns})
178
-
179
- rows = []
180
- for _, r in df.iterrows():
181
- prompt = r.get("prompt", "")
182
- response = r.get("response", "")
183
- reference = r.get("reference", "")
184
- agent = r.get("agent", "Unknown")
185
- task = r.get("task", "Unknown")
186
-
187
- scores = compute_row_scores(prompt, response, reference)
188
- entry = {
189
- "Task": str(task).strip(),
190
- "Agent": str(agent),
191
- "Prompt": prompt,
192
- "Response": response,
193
- "Reference": reference,
194
- }
195
- entry.update(scores)
196
- rows.append(entry)
197
-
198
- metrics_df = pd.DataFrame(rows)
199
-
200
- # Visualization artifacts
201
- images = []
202
- metric_labels = ["InstructionFollowing", "Hallucination", "AssumptionControl", "Coherence", "Accuracy"]
203
-
204
- # Per-task radar and bar charts
205
- for task, g in metrics_df.groupby("Task"):
206
- series = []
207
- for a in g["Agent"].unique():
208
- subset = g[g["Agent"] == a]
209
- vals = [round(float(subset[m].mean()) * 100, 2) for m in metric_labels]
210
- series.append({"name": a, "values": vals})
211
- if series:
212
- fig = spider_net_multi(metric_labels, series, title=f"{task} β€” Agent Comparison")
213
- fname = f"/tmp/{uuid.uuid4().hex}_{task}_radar.png"
214
- fig.savefig(fname, bbox_inches="tight")
215
- plt.close(fig)
216
- images.append((fname, f"{task} - radar"))
217
-
218
- fig2, ax = plt.subplots(figsize=(8, 4))
219
- avg = g.groupby("Agent")[metric_labels].mean()
220
- avg.plot(kind="bar", ax=ax)
221
- ax.set_title(f"{task} β€” Average Metrics by Agent")
222
- ax.set_ylabel("Score (0-1)")
223
- plt.xticks(rotation=45)
224
- fname2 = f"/tmp/{uuid.uuid4().hex}_{task}_bar.png"
225
- fig2.savefig(fname2, bbox_inches="tight")
226
- plt.close(fig2)
227
- images.append((fname2, f"{task} - bar"))
228
-
229
- # Global heatmap
230
- metric_cols = metric_labels + ["FinalScore"]
231
- figh = heatmap_plot(metrics_df, metric_cols)
232
- fnameh = f"/tmp/{uuid.uuid4().hex}_heatmap.png"
233
- figh.savefig(fnameh, bbox_inches="tight")
234
- plt.close(figh)
235
- images.append((fnameh, "Metric Correlations Heatmap"))
236
 
237
  # Leaderboard
238
- lb = metrics_df.groupby(["Agent", "Task"])["FinalScore"].mean().reset_index()
239
- lb = lb.sort_values(["FinalScore"], ascending=False)
 
 
 
240
 
241
- return metrics_df, images, lb
242
-
243
-
244
- # --------------------------
245
- # DEMO USAGE
246
- # --------------------------
247
- if __name__ == "__main__":
248
- # Sample dataset
249
- data = [
250
- {"task": "Math QA", "agent": "AgentA", "prompt": "What is 2+2?", "response": "The answer is 4.", "reference": "2+2=4"},
251
- {"task": "Math QA", "agent": "AgentB", "prompt": "What is 2+2?", "response": "It might be 5, but usually 4.", "reference": "2+2=4"},
252
- {"task": "Summarization", "agent": "AgentA", "prompt": "Summarize: 'The cat sat on the mat. The dog barked.'", "response": "A cat sat while a dog barked.", "reference": "Cat on mat, dog barking."},
253
- ]
254
- df = pd.DataFrame(data)
255
-
256
- metrics_df, images, leaderboard = evaluate_dataframe(df)
257
-
258
- print("\n=== Metrics per response ===")
259
- print(metrics_df[["Task", "Agent", "InstructionFollowing", "Hallucination", "AssumptionControl", "Coherence", "Accuracy", "FinalScore"]])
260
-
261
- print("\n=== Leaderboard (average per task & agent) ===")
262
- print(leaderboard)
263
-
264
- print("\nVisualization files saved in /tmp/:")
265
- for path, caption in images:
266
- print(f"{caption}: {path}")
 
1
+ # """
2
+ # Evaluation module: loads models, computes metrics, and creates visualizations.
3
+ # Lightweight, CPU-friendly, no Java required.
4
+ # """
5
+
6
+ # import re
7
+ # import math
8
+ # import uuid
9
+ # from typing import List, Dict, Tuple
10
+
11
+ # import numpy as np
12
+ # import pandas as pd
13
+ # import matplotlib.pyplot as plt
14
+ # import seaborn as sns
15
+ # import torch
16
+ # from transformers import AutoTokenizer, AutoModelForSequenceClassification
17
+ # from sentence_transformers import SentenceTransformer, util
18
+
19
+ # # --------------------------
20
+ # # MODEL LOADING
21
+ # # --------------------------
22
+ # NLI_MODEL = "textattack/roberta-base-MNLI"
23
+ # EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
24
+
25
+ # # Load NLI model & tokenizer
26
+ # nli_tokenizer = AutoTokenizer.from_pretrained(NLI_MODEL)
27
+ # nli_model = AutoModelForSequenceClassification.from_pretrained(NLI_MODEL)
28
+ # nli_model.to("cpu")
29
+ # nli_model.eval()
30
+
31
+ # # Load embedding model
32
+ # embed_model = SentenceTransformer(EMBED_MODEL)
33
+
34
+ # # Label mapping from config
35
+ # id2label = {int(k): v.upper() for k, v in nli_model.config.id2label.items()}
36
+
37
+
38
+ # # --------------------------
39
+ # # METRIC FUNCTIONS
40
+ # # --------------------------
41
+ # def check_instruction_following(prompt: str, response: str) -> float:
42
+ # """Embedding-based similarity between prompt and response."""
43
+ # if not prompt or not response:
44
+ # return 0.0
45
+ # p_emb = embed_model.encode(prompt, convert_to_tensor=True)
46
+ # r_emb = embed_model.encode(response, convert_to_tensor=True)
47
+ # sim = float(util.cos_sim(p_emb, r_emb).item())
48
+ # return round(max(0.0, min(1.0, sim)), 3)
49
+
50
+
51
+ # def check_hallucination(reference: str, response: str) -> float:
52
+ # """
53
+ # Single hallucination score:
54
+ # Entailment prob - Contradiction prob (normalized to [0,1]).
55
+ # Higher = less hallucination.
56
+ # """
57
+ # if not reference or not response:
58
+ # return 0.0
59
+ # with torch.no_grad():
60
+ # inputs = nli_tokenizer.encode_plus(reference, response, return_tensors="pt", truncation=True)
61
+ # outputs = nli_model(**inputs)
62
+ # probs = torch.softmax(outputs.logits, dim=-1).cpu().numpy()[0]
63
+
64
+ # entail_prob, contra_prob = 0.0, 0.0
65
+ # for idx, p in enumerate(probs):
66
+ # label = id2label.get(idx, "")
67
+ # if "ENTAIL" in label:
68
+ # entail_prob = float(p)
69
+ # elif "CONTRA" in label:
70
+ # contra_prob = float(p)
71
+
72
+ # score = entail_prob - contra_prob
73
+ # score = (score + 1) / 2 # normalize [-1,1] β†’ [0,1]
74
+ # return round(max(0.0, min(1.0, score)), 3)
75
+
76
+
77
+ # def check_assumption(response: str) -> float:
78
+ # """Detect speculative/hedging terms."""
79
+ # if not response:
80
+ # return 0.0
81
+ # speculative_terms = ["maybe", "probably", "might", "perhaps", "i guess", "seems", "could"]
82
+ # count = sum(1 for t in speculative_terms if t in response.lower())
83
+ # score = 1.0 - min(count / 5.0, 1.0) # smoother decay
84
+ # return round(score, 3)
85
+
86
+
87
+ # def check_coherence(response: str) -> float:
88
+ # """Heuristic coherence metric: penalizes very short/long, rewards sentence balance."""
89
+ # if not response:
90
+ # return 0.0
91
+ # words = len(re.findall(r"\w+", response))
92
+ # sents = max(1, len(re.split(r"[.!?]+", response)) - 1)
93
+ # if words < 5:
94
+ # return 0.3
95
+ # if words > 200:
96
+ # return 0.5
97
+ # base = min(1.0, (words / 50.0) + (sents / 5.0))
98
+ # return round(max(0.4, min(base, 0.95)), 3)
99
+
100
+
101
+ # def check_accuracy(reference: str, response: str) -> float:
102
+ # """Semantic similarity between reference and response via embeddings (cosine)."""
103
+ # if not reference or not response:
104
+ # return 0.0
105
+ # ref_emb = embed_model.encode(reference, convert_to_tensor=True)
106
+ # resp_emb = embed_model.encode(response, convert_to_tensor=True)
107
+ # sim = float(util.cos_sim(ref_emb, resp_emb).item())
108
+ # return round(max(0.0, min(1.0, sim)), 3)
109
+
110
+
111
+ # # --------------------------
112
+ # # SCORING PIPELINE
113
+ # # --------------------------
114
+ # def compute_row_scores(prompt, response, reference) -> Dict:
115
+ # instr = check_instruction_following(prompt, response)
116
+ # halluc = check_hallucination(reference, response)
117
+ # assum = check_assumption(response)
118
+ # coh = check_coherence(response)
119
+ # acc = check_accuracy(reference, response)
120
+
121
+ # # Final score: average
122
+ # components = [instr, halluc, assum, coh, acc]
123
+ # final = round(float(sum(components) / len(components)), 3)
124
+
125
+ # return {
126
+ # "InstructionFollowing": instr,
127
+ # "Hallucination": halluc,
128
+ # "AssumptionControl": assum,
129
+ # "Coherence": coh,
130
+ # "Accuracy": acc,
131
+ # "FinalScore": final,
132
+ # }
133
+
134
+
135
+ # # --------------------------
136
+ # # VISUALIZATION HELPERS
137
+ # # --------------------------
138
+ # # def spider_net_multi(labels: List[str], rows: List[Dict], title: str, fill_alpha: float = 0.12):
139
+ # # """Radar chart for multiple agents."""
140
+ # # N = len(labels)
141
+ # # angles = [n / float(N) * 2 * math.pi for n in range(N)]
142
+ # # angles += angles[:1]
143
+
144
+ # # fig = plt.figure(figsize=(6.5, 6.5))
145
+ # # ax = plt.subplot(111, polar=True)
146
+ # # ax.set_xticks(angles[:-1])
147
+ # # ax.set_xticklabels(labels, fontsize=9)
148
+ # # ax.set_ylim(0, 100)
149
+ # # ax.set_yticks([0, 25, 50, 75, 100])
150
+
151
+ # # for r in rows:
152
+ # # values = r["values"]
153
+ # # values_closed = values + values[:1]
154
+ # # ax.plot(angles, values_closed, linewidth=1.5, label=r["name"])
155
+ # # ax.fill(angles, values_closed, alpha=fill_alpha)
156
+
157
+ # # ax.set_title(title, y=1.08, fontsize=12)
158
+ # # ax.legend(loc="upper right", bbox_to_anchor=(1.25, 1.1))
159
+ # # return fig
160
+
161
+
162
+ # # def heatmap_plot(df: pd.DataFrame, metric_cols: List[str], title: str = "Metric Correlations"):
163
+ # # fig, ax = plt.subplots(figsize=(7, 5))
164
+ # # sns.heatmap(df[metric_cols].corr(), annot=True, fmt=".2f", cmap="coolwarm", ax=ax)
165
+ # # ax.set_title(title)
166
+ # # return fig
167
+
168
+
169
+ # # --------------------------
170
+ # # HIGH-LEVEL EVALUATION
171
+ # # --------------------------
172
+ # def evaluate_dataframe(df: pd.DataFrame) -> Tuple[pd.DataFrame, List[Tuple[str, str]], pd.DataFrame]:
173
+ # """
174
+ # df must contain: prompt, response, task, agent, reference
175
+ # Returns: metrics_df, [(image_path, caption)], leaderboard_df
176
+ # """
177
+ # df = df.rename(columns={c: c.strip() for c in df.columns})
178
+
179
+ # rows = []
180
+ # for _, r in df.iterrows():
181
+ # prompt = r.get("prompt", "")
182
+ # response = r.get("response", "")
183
+ # reference = r.get("reference", "")
184
+ # agent = r.get("agent", "Unknown")
185
+ # task = r.get("task", "Unknown")
186
+
187
+ # scores = compute_row_scores(prompt, response, reference)
188
+ # entry = {
189
+ # "Task": str(task).strip(),
190
+ # "Agent": str(agent),
191
+ # "Prompt": prompt,
192
+ # "Response": response,
193
+ # "Reference": reference,
194
+ # }
195
+ # entry.update(scores)
196
+ # rows.append(entry)
197
+
198
+ # metrics_df = pd.DataFrame(rows)
199
+
200
+ # # Visualization artifacts
201
+ # images = []
202
+ # metric_labels = ["InstructionFollowing", "Hallucination", "AssumptionControl", "Coherence", "Accuracy"]
203
+
204
+ # # Per-task radar and bar charts
205
+ # for task, g in metrics_df.groupby("Task"):
206
+ # series = []
207
+ # for a in g["Agent"].unique():
208
+ # subset = g[g["Agent"] == a]
209
+ # vals = [round(float(subset[m].mean()) * 100, 2) for m in metric_labels]
210
+ # series.append({"name": a, "values": vals})
211
+ # if series:
212
+ # fig = spider_net_multi(metric_labels, series, title=f"{task} β€” Agent Comparison")
213
+ # fname = f"/tmp/{uuid.uuid4().hex}_{task}_radar.png"
214
+ # fig.savefig(fname, bbox_inches="tight")
215
+ # plt.close(fig)
216
+ # images.append((fname, f"{task} - radar"))
217
+
218
+ # fig2, ax = plt.subplots(figsize=(8, 4))
219
+ # avg = g.groupby("Agent")[metric_labels].mean()
220
+ # avg.plot(kind="bar", ax=ax)
221
+ # ax.set_title(f"{task} β€” Average Metrics by Agent")
222
+ # ax.set_ylabel("Score (0-1)")
223
+ # plt.xticks(rotation=45)
224
+ # fname2 = f"/tmp/{uuid.uuid4().hex}_{task}_bar.png"
225
+ # fig2.savefig(fname2, bbox_inches="tight")
226
+ # plt.close(fig2)
227
+ # images.append((fname2, f"{task} - bar"))
228
+
229
+ # # Global heatmap
230
+ # metric_cols = metric_labels + ["FinalScore"]
231
+ # figh = heatmap_plot(metrics_df, metric_cols)
232
+ # fnameh = f"/tmp/{uuid.uuid4().hex}_heatmap.png"
233
+ # figh.savefig(fnameh, bbox_inches="tight")
234
+ # plt.close(figh)
235
+ # images.append((fnameh, "Metric Correlations Heatmap"))
236
+
237
+ # # Leaderboard
238
+ # lb = metrics_df.groupby(["Agent", "Task"])["FinalScore"].mean().reset_index()
239
+ # lb = lb.sort_values(["FinalScore"], ascending=False)
240
+
241
+ # return metrics_df, images, lb
242
+
243
+
244
+ # # --------------------------
245
+ # # DEMO USAGE
246
+ # # --------------------------
247
+ # if __name__ == "__main__":
248
+ # # Sample dataset
249
+ # data = [
250
+ # {"task": "Math QA", "agent": "AgentA", "prompt": "What is 2+2?", "response": "The answer is 4.", "reference": "2+2=4"},
251
+ # {"task": "Math QA", "agent": "AgentB", "prompt": "What is 2+2?", "response": "It might be 5, but usually 4.", "reference": "2+2=4"},
252
+ # {"task": "Summarization", "agent": "AgentA", "prompt": "Summarize: 'The cat sat on the mat. The dog barked.'", "response": "A cat sat while a dog barked.", "reference": "Cat on mat, dog barking."},
253
+ # ]
254
+ # df = pd.DataFrame(data)
255
+
256
+ # metrics_df, images, leaderboard = evaluate_dataframe(df)
257
+
258
+ # print("\n=== Metrics per response ===")
259
+ # print(metrics_df[["Task", "Agent", "InstructionFollowing", "Hallucination", "AssumptionControl", "Coherence", "Accuracy", "FinalScore"]])
260
+
261
+ # print("\n=== Leaderboard (average per task & agent) ===")
262
+ # print(leaderboard)
263
+
264
+ # print("\nVisualization files saved in /tmp/:")
265
+ # for path, caption in images:
266
+ # print(f"{caption}: {path}")
267
 
268
  import re
269
+ import json
270
+ import torch
 
 
 
271
  import pandas as pd
272
  import matplotlib.pyplot as plt
273
  import seaborn as sns
274
+ import os
275
+ import uuid
276
  from transformers import AutoTokenizer, AutoModelForSequenceClassification
277
  from sentence_transformers import SentenceTransformer, util
278
 
 
369
 
370
 
371
  # --------------------------
372
+ # ROW & DF EVALUATION
373
  # --------------------------
374
def evaluate_row(row):
    """Score one evaluation record and return its metric dict.

    Expects a mapping (e.g. a pandas Series) with optional keys
    ``prompt``, ``response``, ``reference``, ``task_id`` and ``agent``;
    missing keys default to the empty string.

    Returns a dict holding the identifying fields, the five component
    metric scores, and a weighted-average ``final_score`` rounded to
    three decimals.
    """
    prompt = row.get("prompt", "")
    response = row.get("response", "")
    reference = row.get("reference", "")

    metrics = {
        "task_id": row.get("task_id", ""),
        "agent": row.get("agent", ""),
        "instruction_following": check_instruction_following(prompt, response),
        "hallucination": check_hallucination(reference, response),
        "assumption": check_assumption(response),
        "coherence": check_coherence(response),
        "accuracy": check_accuracy(reference, response),
    }

    # Weighted average of the components; weights sum to 1.0 and can be
    # tuned here. Insertion order matches the original summation order.
    weights = {
        "instruction_following": 0.25,
        "accuracy": 0.25,
        "hallucination": 0.2,
        "coherence": 0.15,
        "assumption": 0.15,
    }
    metrics["final_score"] = round(
        sum(w * metrics[name] for name, w in weights.items()), 3
    )
    return metrics
399
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
400
 
401
def evaluate_dataframe(df: pd.DataFrame, out_dir: str = "/tmp/plots"):
    """Evaluate every row of *df* and produce summary artifacts.

    Args:
        df: DataFrame whose rows carry ``prompt``, ``response``,
            ``reference``, ``task_id`` and ``agent`` columns
            (missing fields default to "" in :func:`evaluate_row`).
        out_dir: Directory where plot PNGs are written (created if
            absent). Defaults to the previous hard-coded ``/tmp/plots``.

    Returns:
        metrics_df: one row of metric scores per input row.
        images: list of ``(path, caption)`` tuples for the saved plots.
        leaderboard: mean ``final_score`` per (agent, task_id),
            sorted best-first.
    """
    metrics_df = df.apply(evaluate_row, axis=1, result_type="expand")

    # Leaderboard: mean final score per agent/task. Fix: sort descending
    # so the strongest agent is first — previously the groupby order
    # (alphabetical by agent) was returned, which is not a leaderboard.
    leaderboard = (
        metrics_df.groupby(["agent", "task_id"])["final_score"]
        .mean()
        .reset_index()
        .sort_values("final_score", ascending=False)
        .reset_index(drop=True)
    )

    # Plots
    images = []
    os.makedirs(out_dir, exist_ok=True)

    # Histogram of final scores across all evaluated rows.
    plt.figure(figsize=(6, 4))
    sns.histplot(metrics_df["final_score"], bins=10, kde=False)
    plt.title("Distribution of Final Scores")
    hist_path = os.path.join(out_dir, f"hist_{uuid.uuid4().hex}.png")
    # bbox_inches="tight" matches the savefig style used elsewhere in
    # this module and avoids clipped axis labels.
    plt.savefig(hist_path, bbox_inches="tight")
    plt.close()
    images.append((hist_path, "Final Score Distribution"))

    # Mean final score per agent.
    plt.figure(figsize=(6, 4))
    agent_scores = metrics_df.groupby("agent")["final_score"].mean().reset_index()
    sns.barplot(data=agent_scores, x="agent", y="final_score")
    plt.title("Average Final Score per Agent")
    bar_path = os.path.join(out_dir, f"bar_{uuid.uuid4().hex}.png")
    plt.savefig(bar_path, bbox_inches="tight")
    plt.close()
    images.append((bar_path, "Average Score per Agent"))

    return metrics_df, images, leaderboard