Files changed (1) hide show
  1. evaluator.py +485 -292
evaluator.py CHANGED
@@ -1,21 +1,66 @@
1
- # """
2
- # Evaluation module: loads models, computes metrics, and creates visualizations.
3
- # Lightweight, CPU-friendly, no Java required.
4
- # """
5
-
6
  # import re
7
- # import math
8
- # import uuid
9
- # from typing import List, Dict, Tuple
10
-
11
- # import numpy as np
12
  # import pandas as pd
13
  # import matplotlib.pyplot as plt
14
  # import seaborn as sns
15
- # import torch
 
16
  # from transformers import AutoTokenizer, AutoModelForSequenceClassification
17
  # from sentence_transformers import SentenceTransformer, util
18
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  # # --------------------------
20
  # # MODEL LOADING
21
  # # --------------------------
@@ -109,161 +154,55 @@
109
 
110
 
111
  # # --------------------------
112
- # # SCORING PIPELINE
113
  # # --------------------------
114
- # def compute_row_scores(prompt, response, reference) -> Dict:
115
- # instr = check_instruction_following(prompt, response)
116
- # halluc = check_hallucination(reference, response)
117
- # assum = check_assumption(response)
118
- # coh = check_coherence(response)
119
- # acc = check_accuracy(reference, response)
120
-
121
- # # Final score: average
122
- # components = [instr, halluc, assum, coh, acc]
123
- # final = round(float(sum(components) / len(components)), 3)
124
-
125
- # return {
126
- # "InstructionFollowing": instr,
127
- # "Hallucination": halluc,
128
- # "AssumptionControl": assum,
129
- # "Coherence": coh,
130
- # "Accuracy": acc,
131
- # "FinalScore": final,
132
  # }
133
 
 
 
 
 
 
 
 
 
 
 
134
 
135
- # # --------------------------
136
- # # VISUALIZATION HELPERS
137
- # # --------------------------
138
- # # def spider_net_multi(labels: List[str], rows: List[Dict], title: str, fill_alpha: float = 0.12):
139
- # # """Radar chart for multiple agents."""
140
- # # N = len(labels)
141
- # # angles = [n / float(N) * 2 * math.pi for n in range(N)]
142
- # # angles += angles[:1]
143
-
144
- # # fig = plt.figure(figsize=(6.5, 6.5))
145
- # # ax = plt.subplot(111, polar=True)
146
- # # ax.set_xticks(angles[:-1])
147
- # # ax.set_xticklabels(labels, fontsize=9)
148
- # # ax.set_ylim(0, 100)
149
- # # ax.set_yticks([0, 25, 50, 75, 100])
150
-
151
- # # for r in rows:
152
- # # values = r["values"]
153
- # # values_closed = values + values[:1]
154
- # # ax.plot(angles, values_closed, linewidth=1.5, label=r["name"])
155
- # # ax.fill(angles, values_closed, alpha=fill_alpha)
156
 
157
- # # ax.set_title(title, y=1.08, fontsize=12)
158
- # # ax.legend(loc="upper right", bbox_to_anchor=(1.25, 1.1))
159
- # # return fig
160
-
161
-
162
- # # def heatmap_plot(df: pd.DataFrame, metric_cols: List[str], title: str = "Metric Correlations"):
163
- # # fig, ax = plt.subplots(figsize=(7, 5))
164
- # # sns.heatmap(df[metric_cols].corr(), annot=True, fmt=".2f", cmap="coolwarm", ax=ax)
165
- # # ax.set_title(title)
166
- # # return fig
167
-
168
-
169
- # # --------------------------
170
- # # HIGH-LEVEL EVALUATION
171
- # # --------------------------
172
- # def evaluate_dataframe(df: pd.DataFrame) -> Tuple[pd.DataFrame, List[Tuple[str, str]], pd.DataFrame]:
173
- # """
174
- # df must contain: prompt, response, task, agent, reference
175
- # Returns: metrics_df, [(image_path, caption)], leaderboard_df
176
- # """
177
- # df = df.rename(columns={c: c.strip() for c in df.columns})
178
-
179
- # rows = []
180
- # for _, r in df.iterrows():
181
- # prompt = r.get("prompt", "")
182
- # response = r.get("response", "")
183
- # reference = r.get("reference", "")
184
- # agent = r.get("agent", "Unknown")
185
- # task = r.get("task", "Unknown")
186
-
187
- # scores = compute_row_scores(prompt, response, reference)
188
- # entry = {
189
- # "Task": str(task).strip(),
190
- # "Agent": str(agent),
191
- # "Prompt": prompt,
192
- # "Response": response,
193
- # "Reference": reference,
194
- # }
195
- # entry.update(scores)
196
- # rows.append(entry)
197
-
198
- # metrics_df = pd.DataFrame(rows)
199
-
200
- # # Visualization artifacts
201
- # images = []
202
- # metric_labels = ["InstructionFollowing", "Hallucination", "AssumptionControl", "Coherence", "Accuracy"]
203
-
204
- # # Per-task radar and bar charts
205
- # for task, g in metrics_df.groupby("Task"):
206
- # series = []
207
- # for a in g["Agent"].unique():
208
- # subset = g[g["Agent"] == a]
209
- # vals = [round(float(subset[m].mean()) * 100, 2) for m in metric_labels]
210
- # series.append({"name": a, "values": vals})
211
- # if series:
212
- # fig = spider_net_multi(metric_labels, series, title=f"{task} β€” Agent Comparison")
213
- # fname = f"/tmp/{uuid.uuid4().hex}_{task}_radar.png"
214
- # fig.savefig(fname, bbox_inches="tight")
215
- # plt.close(fig)
216
- # images.append((fname, f"{task} - radar"))
217
-
218
- # fig2, ax = plt.subplots(figsize=(8, 4))
219
- # avg = g.groupby("Agent")[metric_labels].mean()
220
- # avg.plot(kind="bar", ax=ax)
221
- # ax.set_title(f"{task} β€” Average Metrics by Agent")
222
- # ax.set_ylabel("Score (0-1)")
223
- # plt.xticks(rotation=45)
224
- # fname2 = f"/tmp/{uuid.uuid4().hex}_{task}_bar.png"
225
- # fig2.savefig(fname2, bbox_inches="tight")
226
- # plt.close(fig2)
227
- # images.append((fname2, f"{task} - bar"))
228
-
229
- # # Global heatmap
230
- # metric_cols = metric_labels + ["FinalScore"]
231
- # figh = heatmap_plot(metrics_df, metric_cols)
232
- # fnameh = f"/tmp/{uuid.uuid4().hex}_heatmap.png"
233
- # figh.savefig(fnameh, bbox_inches="tight")
234
- # plt.close(figh)
235
- # images.append((fnameh, "Metric Correlations Heatmap"))
236
 
237
  # # Leaderboard
238
- # lb = metrics_df.groupby(["Agent", "Task"])["FinalScore"].mean().reset_index()
239
- # lb = lb.sort_values(["FinalScore"], ascending=False)
240
-
241
- # return metrics_df, images, lb
242
-
243
-
244
- # # --------------------------
245
- # # DEMO USAGE
246
- # # --------------------------
247
- # if __name__ == "__main__":
248
- # # Sample dataset
249
- # data = [
250
- # {"task": "Math QA", "agent": "AgentA", "prompt": "What is 2+2?", "response": "The answer is 4.", "reference": "2+2=4"},
251
- # {"task": "Math QA", "agent": "AgentB", "prompt": "What is 2+2?", "response": "It might be 5, but usually 4.", "reference": "2+2=4"},
252
- # {"task": "Summarization", "agent": "AgentA", "prompt": "Summarize: 'The cat sat on the mat. The dog barked.'", "response": "A cat sat while a dog barked.", "reference": "Cat on mat, dog barking."},
253
- # ]
254
- # df = pd.DataFrame(data)
255
 
256
- # metrics_df, images, leaderboard = evaluate_dataframe(df)
257
 
258
- # print("\n=== Metrics per response ===")
259
- # print(metrics_df[["Task", "Agent", "InstructionFollowing", "Hallucination", "AssumptionControl", "Coherence", "Accuracy", "FinalScore"]])
260
-
261
- # print("\n=== Leaderboard (average per task & agent) ===")
262
- # print(leaderboard)
263
-
264
- # print("\nVisualization files saved in /tmp/:")
265
- # for path, caption in images:
266
- # print(f"{caption}: {path}")
267
 
268
  import re
269
  import json
@@ -273,151 +212,311 @@ import matplotlib.pyplot as plt
273
  import seaborn as sns
274
  import os
275
  import uuid
276
- from transformers import AutoTokenizer, AutoModelForSequenceClassification
277
- from sentence_transformers import SentenceTransformer, util
278
-
279
- import matplotlib.pyplot as plt
280
  import numpy as np
281
-
282
- def plot_radar_chart(metrics_df, agents, metrics, out_path="/tmp/radar.png"):
283
- """
284
- Radar chart comparing multiple agents across metrics.
285
- """
286
- labels = metrics
287
- num_vars = len(labels)
288
-
289
- # Compute angle for each axis
290
- angles = np.linspace(0, 2 * np.pi, num_vars, endpoint=False).tolist()
291
- angles += angles[:1] # close loop
292
-
293
- fig, ax = plt.subplots(figsize=(6, 6), subplot_kw=dict(polar=True))
294
-
295
- for agent in agents:
296
- values = []
297
- for m in metrics:
298
- mean_val = metrics_df.loc[metrics_df['agent'] == agent, m].mean()
299
- values.append(mean_val if not np.isnan(mean_val) else 0)
300
- values += values[:1]
301
- ax.plot(angles, values, label=agent, linewidth=2)
302
- ax.fill(angles, values, alpha=0.25)
303
-
304
- ax.set_xticks(angles[:-1])
305
- ax.set_xticklabels(labels)
306
- ax.set_yticklabels([])
307
- ax.legend(loc="upper right", bbox_to_anchor=(1.3, 1.1))
308
- ax.set_title("Agent Performance Radar Chart")
309
-
310
- plt.tight_layout()
311
- plt.savefig(out_path)
312
- plt.close()
313
- return out_path
314
-
315
- import seaborn as sns
316
-
317
- def plot_heatmap(metrics_df, out_path="/tmp/heatmap.png"):
318
- pivot = metrics_df.groupby("agent")[
319
- ["accuracy", "hallucination", "instruction_following", "coherence", "assumption"]
320
- ].mean()
321
-
322
- plt.figure(figsize=(8, 5))
323
- sns.heatmap(pivot, annot=True, cmap="viridis", fmt=".2f")
324
- plt.title("Agent Γ— Metric Heatmap")
325
- plt.tight_layout()
326
- plt.savefig(out_path)
327
- plt.close()
328
- return out_path
329
 
330
  # --------------------------
331
  # MODEL LOADING
332
  # --------------------------
333
- NLI_MODEL = "textattack/roberta-base-MNLI"
334
- EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
 
335
 
336
  # Load NLI model & tokenizer
337
  nli_tokenizer = AutoTokenizer.from_pretrained(NLI_MODEL)
338
  nli_model = AutoModelForSequenceClassification.from_pretrained(NLI_MODEL)
339
- nli_model.to("cpu")
340
  nli_model.eval()
341
 
342
  # Load embedding model
343
  embed_model = SentenceTransformer(EMBED_MODEL)
344
 
 
 
 
 
 
 
 
 
 
 
 
345
  # Label mapping from config
346
  id2label = {int(k): v.upper() for k, v in nli_model.config.id2label.items()}
347
 
348
-
349
  # --------------------------
350
- # METRIC FUNCTIONS
351
  # --------------------------
352
  def check_instruction_following(prompt: str, response: str) -> float:
353
- """Embedding-based similarity between prompt and response."""
354
  if not prompt or not response:
355
  return 0.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
356
  p_emb = embed_model.encode(prompt, convert_to_tensor=True)
357
  r_emb = embed_model.encode(response, convert_to_tensor=True)
358
- sim = float(util.cos_sim(p_emb, r_emb).item())
359
- return round(max(0.0, min(1.0, sim)), 3)
360
-
 
 
361
 
362
  def check_hallucination(reference: str, response: str) -> float:
363
- """
364
- Single hallucination score:
365
- Entailment prob - Contradiction prob (normalized to [0,1]).
366
- Higher = less hallucination.
367
- """
368
  if not reference or not response:
369
  return 0.0
 
 
370
  with torch.no_grad():
371
- inputs = nli_tokenizer.encode_plus(reference, response, return_tensors="pt", truncation=True)
 
 
 
 
 
 
 
372
  outputs = nli_model(**inputs)
373
  probs = torch.softmax(outputs.logits, dim=-1).cpu().numpy()[0]
374
-
375
- entail_prob, contra_prob = 0.0, 0.0
376
  for idx, p in enumerate(probs):
377
  label = id2label.get(idx, "")
378
- if "ENTAIL" in label:
379
- entail_prob = float(p)
380
- elif "CONTRA" in label:
381
  contra_prob = float(p)
382
-
383
- score = entail_prob - contra_prob
384
- score = (score + 1) / 2 # normalize [-1,1] β†’ [0,1]
385
- return round(max(0.0, min(1.0, score)), 3)
386
-
 
 
 
 
 
 
 
 
387
 
388
  def check_assumption(response: str) -> float:
389
- """Detect speculative/hedging terms."""
390
  if not response:
391
  return 0.0
392
- speculative_terms = ["maybe", "probably", "might", "perhaps", "i guess", "seems", "could"]
393
- count = sum(1 for t in speculative_terms if t in response.lower())
394
- score = 1.0 - min(count / 5.0, 1.0) # smoother decay
395
- return round(score, 3)
396
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
397
 
398
  def check_coherence(response: str) -> float:
399
- """Heuristic coherence metric: penalizes very short/long, rewards sentence balance."""
400
  if not response:
401
  return 0.0
402
- words = len(re.findall(r"\w+", response))
403
- sents = max(1, len(re.split(r"[.!?]+", response)) - 1)
404
- if words < 5:
405
- return 0.3
406
- if words > 200:
407
- return 0.5
408
- base = min(1.0, (words / 50.0) + (sents / 5.0))
409
- return round(max(0.4, min(base, 0.95)), 3)
410
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
411
 
412
  def check_accuracy(reference: str, response: str) -> float:
413
- """Semantic similarity between reference and response via embeddings (cosine)."""
414
  if not reference or not response:
415
  return 0.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
416
  ref_emb = embed_model.encode(reference, convert_to_tensor=True)
417
  resp_emb = embed_model.encode(response, convert_to_tensor=True)
418
- sim = float(util.cos_sim(ref_emb, resp_emb).item())
419
- return round(max(0.0, min(1.0, sim)), 3)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
420
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
421
 
422
  # --------------------------
423
  # ROW & DF EVALUATION
@@ -435,22 +534,130 @@ def evaluate_row(row):
435
  "assumption": check_assumption(response),
436
  "coherence": check_coherence(response),
437
  "accuracy": check_accuracy(reference, response),
 
 
438
  }
439
 
440
- # Weighted avg score (you can adjust weights)
441
  metrics["final_score"] = round(
442
- 0.25 * metrics["instruction_following"]
443
- + 0.25 * metrics["accuracy"]
444
- + 0.2 * metrics["hallucination"]
445
- + 0.15 * metrics["coherence"]
446
- + 0.15 * metrics["assumption"],
 
 
447
  3,
448
  )
449
  return metrics
450
 
 
 
 
 
 
 
 
 
 
 
 
451
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
452
  def evaluate_dataframe(df: pd.DataFrame):
453
- metrics_df = df.apply(evaluate_row, axis=1, result_type="expand")
 
454
 
455
  # Leaderboard
456
  leaderboard = (
@@ -459,41 +666,27 @@ def evaluate_dataframe(df: pd.DataFrame):
459
  .reset_index()
460
  )
461
 
462
-
463
- # # Plots
464
- # images = []
465
- # Existing images list
466
  images = []
467
 
468
- # Add radar chart
469
- radar_path = plot_radar_chart(metrics_df, agents=df["agent"].unique(),
470
- metrics=["accuracy", "hallucination", "instruction_following", "coherence", "assumption"])
 
 
 
471
  images.append((radar_path, "Radar Chart: Agent vs Metrics"))
472
 
473
- # Add heatmap
474
  heatmap_path = plot_heatmap(metrics_df)
475
  images.append((heatmap_path, "Heatmap: Agent vs Metrics"))
476
- return metrics_df, images, leaderboard
477
- # out_dir = "/tmp/plots"
478
- # os.makedirs(out_dir, exist_ok=True)
479
-
480
- # # Histogram of scores
481
- # plt.figure(figsize=(6, 4))
482
- # sns.histplot(metrics_df["final_score"], bins=10, kde=False)
483
- # plt.title("Distribution of Final Scores")
484
- # hist_path = os.path.join(out_dir, f"hist_{uuid.uuid4().hex}.png")
485
- # plt.savefig(hist_path)
486
- # plt.close()
487
- # images.append((hist_path, "Final Score Distribution"))
488
-
489
- # # Per-agent average
490
- # plt.figure(figsize=(6, 4))
491
- # agent_scores = metrics_df.groupby("agent")["final_score"].mean().reset_index()
492
- # sns.barplot(data=agent_scores, x="agent", y="final_score")
493
- # plt.title("Average Final Score per Agent")
494
- # bar_path = os.path.join(out_dir, f"bar_{uuid.uuid4().hex}.png")
495
- # plt.savefig(bar_path)
496
- # plt.close()
497
- # images.append((bar_path, "Average Score per Agent"))
498
-
499
- # return metrics_df, images, leaderboard
 
1
+ #####################################################################################################################################################################
 
 
 
 
2
  # import re
3
+ # import json
4
+ # import torch
 
 
 
5
  # import pandas as pd
6
  # import matplotlib.pyplot as plt
7
  # import seaborn as sns
8
+ # import os
9
+ # import uuid
10
  # from transformers import AutoTokenizer, AutoModelForSequenceClassification
11
  # from sentence_transformers import SentenceTransformer, util
12
 
13
+ # import matplotlib.pyplot as plt
14
+ # import numpy as np
15
+
16
+ # def plot_radar_chart(metrics_df, agents, metrics, out_path="/tmp/radar.png"):
17
+ # """
18
+ # Radar chart comparing multiple agents across metrics.
19
+ # """
20
+ # labels = metrics
21
+ # num_vars = len(labels)
22
+
23
+ # # Compute angle for each axis
24
+ # angles = np.linspace(0, 2 * np.pi, num_vars, endpoint=False).tolist()
25
+ # angles += angles[:1] # close loop
26
+
27
+ # fig, ax = plt.subplots(figsize=(6, 6), subplot_kw=dict(polar=True))
28
+
29
+ # for agent in agents:
30
+ # values = []
31
+ # for m in metrics:
32
+ # mean_val = metrics_df.loc[metrics_df['agent'] == agent, m].mean()
33
+ # values.append(mean_val if not np.isnan(mean_val) else 0)
34
+ # values += values[:1]
35
+ # ax.plot(angles, values, label=agent, linewidth=2)
36
+ # ax.fill(angles, values, alpha=0.25)
37
+
38
+ # ax.set_xticks(angles[:-1])
39
+ # ax.set_xticklabels(labels)
40
+ # ax.set_yticklabels([])
41
+ # ax.legend(loc="upper right", bbox_to_anchor=(1.3, 1.1))
42
+ # ax.set_title("Agent Performance Radar Chart")
43
+
44
+ # plt.tight_layout()
45
+ # plt.savefig(out_path)
46
+ # plt.close()
47
+ # return out_path
48
+
49
+ # import seaborn as sns
50
+
51
+ # def plot_heatmap(metrics_df, out_path="/tmp/heatmap.png"):
52
+ # pivot = metrics_df.groupby("agent")[
53
+ # ["accuracy", "hallucination", "instruction_following", "coherence", "assumption"]
54
+ # ].mean()
55
+
56
+ # plt.figure(figsize=(8, 5))
57
+ # sns.heatmap(pivot, annot=True, cmap="viridis", fmt=".2f")
58
+ # plt.title("Agent Γ— Metric Heatmap")
59
+ # plt.tight_layout()
60
+ # plt.savefig(out_path)
61
+ # plt.close()
62
+ # return out_path
63
+
64
  # # --------------------------
65
  # # MODEL LOADING
66
  # # --------------------------
 
154
 
155
 
156
  # # --------------------------
157
+ # # ROW & DF EVALUATION
158
  # # --------------------------
159
+ # def evaluate_row(row):
160
+ # prompt = row.get("prompt", "")
161
+ # response = row.get("response", "")
162
+ # reference = row.get("reference", "")
163
+
164
+ # metrics = {
165
+ # "task_id": row.get("task_id", ""),
166
+ # "agent": row.get("agent", ""),
167
+ # "instruction_following": check_instruction_following(prompt, response),
168
+ # "hallucination": check_hallucination(reference, response),
169
+ # "assumption": check_assumption(response),
170
+ # "coherence": check_coherence(response),
171
+ # "accuracy": check_accuracy(reference, response),
 
 
 
 
 
172
  # }
173
 
174
+ # # Weighted avg score (you can adjust weights)
175
+ # metrics["final_score"] = round(
176
+ # 0.25 * metrics["instruction_following"]
177
+ # + 0.25 * metrics["accuracy"]
178
+ # + 0.2 * metrics["hallucination"]
179
+ # + 0.15 * metrics["coherence"]
180
+ # + 0.15 * metrics["assumption"],
181
+ # 3,
182
+ # )
183
+ # return metrics
184
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
185
 
186
+ # def evaluate_dataframe(df: pd.DataFrame):
187
+ # metrics_df = df.apply(evaluate_row, axis=1, result_type="expand")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
188
 
189
  # # Leaderboard
190
+ # leaderboard = (
191
+ # metrics_df.groupby(["agent", "task_id"])["final_score"]
192
+ # .mean()
193
+ # .reset_index()
194
+ # )
 
 
 
 
 
 
 
 
 
 
 
 
195
 
 
196
 
197
+ # # # Plots
198
+ # # images = []
199
+ # # Existing images list
200
+ # images = []
201
+
202
+ # # Add radar chart
203
+ # radar_path = plot_radar_chart(metrics_df, agents=df["agent"].unique(),
204
+ #
205
+ ###############################################################################################################################
206
 
207
  import re
208
  import json
 
212
  import seaborn as sns
213
  import os
214
  import uuid
 
 
 
 
215
  import numpy as np
216
+ from transformers import (
217
+ AutoTokenizer,
218
+ AutoModelForSequenceClassification,
219
+ AutoModelForCausalLM,
220
+ pipeline
221
+ )
222
+ from sentence_transformers import SentenceTransformer, util
223
+ import evaluate
224
+ from sklearn.metrics import accuracy_score, f1_score
225
+ from collections import defaultdict
226
+ import warnings
227
+ warnings.filterwarnings('ignore')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
228
 
229
  # --------------------------
230
  # MODEL LOADING
231
  # --------------------------
232
NLI_MODEL = "microsoft/deberta-v2-xlarge-mnli"
EMBED_MODEL = "sentence-transformers/all-mpnet-base-v2"
LLM_JUDGE_MODEL = "microsoft/DialoGPT-large"  # Can be replaced with more powerful models

# Single device string shared by every torch model below.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# NLI model + tokenizer, used for entailment/contradiction probes.
nli_tokenizer = AutoTokenizer.from_pretrained(NLI_MODEL)
nli_model = AutoModelForSequenceClassification.from_pretrained(NLI_MODEL)
nli_model.to(DEVICE)
nli_model.eval()

# Sentence-embedding model for cosine-similarity metrics.
embed_model = SentenceTransformer(EMBED_MODEL)

# Causal-LM "judge" used for yes/no quality judgments.
judge_tokenizer = AutoTokenizer.from_pretrained(LLM_JUDGE_MODEL)
judge_model = AutoModelForCausalLM.from_pretrained(LLM_JUDGE_MODEL)
judge_model.to(DEVICE)
judge_model.eval()

# Reference-based metrics from the HF `evaluate` hub.
bertscore = evaluate.load("bertscore")
bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge")

# Map logit index -> upper-cased NLI label (e.g. ENTAILMENT, CONTRADICTION).
id2label = {int(k): v.upper() for k, v in nli_model.config.id2label.items()}
258
 
 
259
  # --------------------------
260
+ # IMPROVED METRIC FUNCTIONS
261
  # --------------------------
262
def check_instruction_following(prompt: str, response: str) -> float:
    """Score how well the response follows the prompt, in [0, 1].

    Blends two signals 70/30 and clamps to [0, 1]:
      * an NLI probe: entailment probability plus half the neutral mass, and
      * cosine similarity between prompt and response embeddings.
    Returns 0.0 when either input is empty.
    """
    if not prompt or not response:
        return 0.0

    # --- Signal 1: NLI entailment probe ---
    with torch.no_grad():
        encoded = nli_tokenizer.encode_plus(
            prompt,
            response,
            return_tensors="pt",
            truncation=True,
            max_length=512,
        ).to(nli_model.device)
        logits = nli_model(**encoded).logits
        class_probs = torch.softmax(logits, dim=-1).cpu().numpy()[0]

    entailment = 0.0
    neutral = 0.0
    for class_idx, prob in enumerate(class_probs):
        tag = id2label.get(class_idx, "")
        if "ENTAIL" in tag:
            entailment = float(prob)
        elif "NEUTRAL" in tag:
            neutral = float(prob)
    nli_component = entailment + neutral * 0.5

    # --- Signal 2: embedding cosine similarity ---
    prompt_vec = embed_model.encode(prompt, convert_to_tensor=True)
    response_vec = embed_model.encode(response, convert_to_tensor=True)
    similarity = float(util.cos_sim(prompt_vec, response_vec).item())

    blended = 0.7 * nli_component + 0.3 * similarity
    return round(max(0.0, min(1.0, blended)), 3)
298
 
299
def check_hallucination(reference: str, response: str) -> float:
    """Faithfulness score in [0, 1]; higher means less hallucination.

    Combines (70/30) an NLI contradiction probe — contradiction probability
    plus 30% of the neutral mass — with an embedding-dissimilarity penalty,
    then inverts and clamps so that faithful responses score high.
    Returns 0.0 when either input is empty.
    """
    if not reference or not response:
        return 0.0

    # NLI probe: how strongly does the response contradict the reference?
    with torch.no_grad():
        encoded = nli_tokenizer.encode_plus(
            reference,
            response,
            return_tensors="pt",
            truncation=True,
            max_length=512,
        ).to(nli_model.device)
        logits = nli_model(**encoded).logits
        class_probs = torch.softmax(logits, dim=-1).cpu().numpy()[0]

    contradiction = 0.0
    neutral = 0.0
    for class_idx, prob in enumerate(class_probs):
        tag = id2label.get(class_idx, "")
        if "CONTRA" in tag:
            contradiction = float(prob)
        elif "NEUTRAL" in tag:
            neutral = float(prob)
    nli_penalty = contradiction + neutral * 0.3

    # Embedding penalty: semantically distant responses are more suspect.
    ref_vec = embed_model.encode(reference, convert_to_tensor=True)
    resp_vec = embed_model.encode(response, convert_to_tensor=True)
    similarity = float(util.cos_sim(ref_vec, resp_vec).item())

    score = 1.0 - (0.7 * nli_penalty + 0.3 * (1 - similarity))
    return round(max(0.0, min(1.0, score)), 3)
335
 
336
def check_assumption(response: str) -> float:
    """Score how assertion-free the response is, in [0, 1].

    Higher means fewer hedges/assumptions. Combines two signals 60/40:
      * regex counts of speculative/hedging phrases, length-normalized, and
      * a yes/no judgment generated by the module-level LLM judge.
    Returns 0.0 for an empty response.
    """
    if not response:
        return 0.0

    # Pattern-based detection of speculative/hedging language.
    speculative_patterns = [
        r"\b(maybe|perhaps|possibly|probably|might|could|would|should)\b",
        r"\b(I think|I believe|I guess|I suppose|I assume)\b",
        r"\b(it seems|it appears|it looks like)\b",
        r"\b(likely|unlikely|presumably|arguably)\b",
        r"\b(some|many|most|often|usually|generally|typically)\b"
    ]

    # BUG FIX: the original matched these patterns against response.lower(),
    # so the capitalized alternatives ("I think", "I believe", ...) could
    # never match. Match case-insensitively against the raw text instead.
    pattern_count = sum(
        len(re.findall(pattern, response, flags=re.IGNORECASE))
        for pattern in speculative_patterns
    )

    # Normalize by length so long answers are not over-penalized.
    word_count = len(response.split())
    pattern_score = min(1.0, pattern_count / max(1, word_count / 5))

    # LLM-based judgment via the module-level judge model.
    assumption_prompt = f"""
    Determine if the following text contains assumptions, speculation, or hedging language.
    Text: {response}
    Answer with only 'yes' or 'no':
    """

    with torch.no_grad():
        inputs = judge_tokenizer.encode(assumption_prompt, return_tensors="pt")
        outputs = judge_model.generate(
            inputs,
            max_length=len(inputs[0]) + 3,
            pad_token_id=judge_tokenizer.eos_token_id
        )
        judgment = judge_tokenizer.decode(outputs[0], skip_special_tokens=True)

    # "yes" => the judge saw speculation => score 0 from this signal.
    llm_score = 0.0 if "yes" in judgment.lower() else 1.0

    # Combined score: more hedging -> lower score.
    final_score = 0.6 * (1 - pattern_score) + 0.4 * llm_score
    return round(final_score, 3)
380
 
381
def check_coherence(response: str) -> float:
    """Heuristic coherence score in [0, 1] built from surface features.

    Weighted blend of: sentence count (0.3, saturating at 5), evenness of
    sentence lengths (0.2), presence of discourse connectives (0.3,
    saturating at 3), and lexical diversity (0.2). Returns 0.0 for empty
    text or text with no sentences.
    """
    if not response:
        return 0.0

    lowered = response.lower()

    # Segment on terminal punctuation and drop empty fragments.
    pieces = [p.strip() for p in re.split(r"[.!?]+", response)]
    clauses = [p for p in pieces if p]
    if not clauses:
        return 0.0

    # Penalize wildly varying sentence lengths.
    lengths = [len(c.split()) for c in clauses]
    spread = np.var(lengths) if len(lengths) > 1 else 0
    evenness = 1.0 - min(1.0, spread / 100)

    # Reward discourse connectives (substring check on lowered text).
    connectives = (
        'however', 'therefore', 'moreover', 'furthermore', 'consequently',
        'additionally', 'likewise', 'similarly', 'nevertheless', 'nonetheless',
    )
    connective_hits = sum(1 for c in connectives if c in lowered)
    connective_score = min(1.0, connective_hits / 3)

    # Penalize word repetition via type/token ratio.
    tokens = lowered.split()
    diversity = len(set(tokens)) / max(1, len(tokens))

    blended = (
        0.3 * min(1.0, len(clauses) / 5) +
        0.2 * evenness +
        0.3 * connective_score +
        0.2 * diversity
    )

    return round(max(0.0, min(1.0, blended)), 3)
423
 
424
def check_accuracy(reference: str, response: str) -> float:
    """Lexical/semantic accuracy of the response vs. the reference, in [0, 1].

    Weighted blend: BERTScore F1 (0.4), ROUGE-L (0.3), BLEU (0.1) and
    embedding cosine similarity (0.2), clamped to [0, 1]. Returns 0.0 when
    either input is empty.
    """
    if not reference or not response:
        return 0.0

    # BERTScore: token-level contextual-embedding F1.
    bert_results = bertscore.compute(
        predictions=[response],
        references=[reference],
        lang="en",
        model_type=EMBED_MODEL
    )
    bert_f1 = bert_results['f1'][0]

    # ROUGE-L: longest-common-subsequence overlap.
    rouge_results = rouge.compute(
        predictions=[response],
        references=[reference],
        use_stemmer=True
    )
    rouge_l = rouge_results['rougeL']

    # BLEU. BUG FIX: HF `evaluate`'s "bleu" metric expects raw strings and
    # tokenizes internally; the original passed pre-split token lists, which
    # raised and was silently swallowed by a bare `except:` so BLEU was
    # always 0.0. Pass strings and catch only Exception.
    try:
        bleu_results = bleu.compute(
            predictions=[response],
            references=[[reference]]
        )
        bleu_score = bleu_results['bleu']
    except Exception:
        bleu_score = 0.0

    # Embedding cosine similarity.
    ref_emb = embed_model.encode(reference, convert_to_tensor=True)
    resp_emb = embed_model.encode(response, convert_to_tensor=True)
    semantic_sim = float(util.cos_sim(ref_emb, resp_emb).item())

    # Renamed from `accuracy_score`, which shadowed
    # sklearn.metrics.accuracy_score imported at module level.
    combined = (
        0.4 * bert_f1 +
        0.3 * rouge_l +
        0.1 * bleu_score +
        0.2 * semantic_sim
    )

    return round(max(0.0, min(1.0, combined)), 3)
470
+
471
def check_relevance(prompt: str, response: str) -> float:
    """Relevance of the response to the prompt, in [0, 1].

    Embeds both texts with the module-level sentence-transformer and
    returns their cosine similarity clamped to [0, 1]; 0.0 on empty input.
    """
    if not prompt or not response:
        return 0.0

    prompt_vec = embed_model.encode(prompt, convert_to_tensor=True)
    response_vec = embed_model.encode(response, convert_to_tensor=True)
    cosine = float(util.cos_sim(prompt_vec, response_vec).item())

    return round(min(1.0, max(0.0, cosine)), 3)
484
 
485
def check_fluency(response: str) -> float:
    """Fluency score in [0, 1] from a CoLA acceptability classifier.

    Scores up to the first three sentences (truncated to 512 chars each)
    and averages their acceptability probabilities. Falls back to a crude
    length heuristic if classification fails. Returns 0.0 for empty input,
    0.5 when no sentence is long enough to score.
    """
    if not response:
        return 0.0

    # PERF FIX: the original rebuilt the HF pipeline on every call, which
    # dominates runtime. Build it once and cache it on the function object.
    checker = getattr(check_fluency, "_checker", None)
    if checker is None:
        checker = pipeline(
            "text-classification",
            model="textattack/roberta-base-CoLA",
            device=0 if torch.cuda.is_available() else -1
        )
        check_fluency._checker = checker

    try:
        # Split into sentences; skip fragments of 5 chars or fewer.
        sentences = re.split(r'[.!?]+', response)
        sentences = [s.strip() for s in sentences if len(s.strip()) > 5]

        if not sentences:
            return 0.5

        # Score each sentence; LABEL_1 = acceptable for this CoLA model.
        fluency_scores = []
        for sent in sentences[:3]:  # limit to first 3 sentences
            result = checker(sent[:512])  # truncate overly long sentences
            score = result[0]['score'] if result[0]['label'] == 'LABEL_1' else 1 - result[0]['score']
            fluency_scores.append(score)

        avg_fluency = sum(fluency_scores) / len(fluency_scores)
        return round(avg_fluency, 3)
    except Exception:
        # BUG FIX: was a bare `except:` that also swallowed SystemExit /
        # KeyboardInterrupt. Fall back to a simple length heuristic.
        words = response.split()
        if len(words) < 3:
            return 0.3
        return 0.7
520
 
521
  # --------------------------
522
  # ROW & DF EVALUATION
 
534
  "assumption": check_assumption(response),
535
  "coherence": check_coherence(response),
536
  "accuracy": check_accuracy(reference, response),
537
+ "relevance": check_relevance(prompt, response),
538
+ "fluency": check_fluency(response),
539
  }
540
 
541
+ # Weighted avg score (adjust weights as needed)
542
  metrics["final_score"] = round(
543
+ 0.20 * metrics["instruction_following"] +
544
+ 0.20 * metrics["accuracy"] +
545
+ 0.15 * metrics["hallucination"] +
546
+ 0.10 * metrics["coherence"] +
547
+ 0.10 * metrics["assumption"] +
548
+ 0.15 * metrics["relevance"] +
549
+ 0.10 * metrics["fluency"],
550
  3,
551
  )
552
  return metrics
553
 
554
+ # --------------------------
555
+ # VISUALIZATION FUNCTIONS
556
+ # --------------------------
557
def plot_radar_chart(metrics_df, agents, metrics, out_path="/tmp/radar.png"):
    """Save a radar (spider) chart comparing agents across metrics.

    Each agent gets one closed polygon whose vertices are its mean value
    per metric (NaN means plotted as 0). Returns the saved file path.
    """
    axis_count = len(metrics)

    # One axis per metric; repeat the first angle to close each polygon.
    angles = np.linspace(0, 2 * np.pi, axis_count, endpoint=False).tolist()
    angles.append(angles[0])

    fig, ax = plt.subplots(figsize=(8, 8), subplot_kw=dict(polar=True))

    for agent in agents:
        per_metric = [
            metrics_df.loc[metrics_df['agent'] == agent, metric].mean()
            for metric in metrics
        ]
        ring = [0 if np.isnan(v) else v for v in per_metric]
        ring.append(ring[0])  # close the loop
        ax.plot(angles, ring, label=agent, linewidth=2)
        ax.fill(angles, ring, alpha=0.25)

    ax.set_xticks(angles[:-1])
    ax.set_xticklabels(metrics)
    ax.set_yticklabels([])
    ax.legend(loc="upper right", bbox_to_anchor=(1.3, 1.1))
    ax.set_title("Agent Performance Radar Chart")

    plt.tight_layout()
    plt.savefig(out_path)
    plt.close()
    return out_path
587
+
588
def plot_heatmap(metrics_df, out_path="/tmp/heatmap.png"):
    """Render a heatmap of mean metric scores per agent.

    Args:
        metrics_df: frame with an ``agent`` column plus one column per metric.
        out_path: destination PNG path.

    Returns:
        The path the figure was saved to.
    """
    metrics = ["accuracy", "hallucination", "instruction_following",
               "coherence", "assumption", "relevance", "fluency"]

    # One row per agent, one column per metric; cell = mean score.
    pivot = metrics_df.groupby("agent")[metrics].mean()

    plt.figure(figsize=(10, 6))
    # center=0.5 makes the diverging palette pivot at the score midpoint.
    sns.heatmap(pivot, annot=True, cmap="YlGnBu", fmt=".3f", center=0.5)
    # Fix: title previously contained the mojibake "Γ—" for the "×" sign.
    plt.title("Agent × Metric Heatmap")
    plt.tight_layout()
    plt.savefig(out_path)
    plt.close()
    return out_path
602
+
603
def plot_score_distribution(metrics_df, out_path="/tmp/distribution.png"):
    """Render a KDE of each agent's ``final_score`` distribution.

    Args:
        metrics_df: frame with ``agent`` and ``final_score`` columns.
        out_path: destination PNG path.

    Returns:
        The path the figure was saved to.
    """
    plt.figure(figsize=(10, 6))

    # One shaded density curve per agent.
    for name in metrics_df['agent'].unique():
        scores = metrics_df.loc[metrics_df['agent'] == name, 'final_score']
        sns.kdeplot(scores, label=name, fill=True, alpha=0.3)

    plt.xlabel('Final Score')
    plt.ylabel('Density')
    plt.title('Distribution of Final Scores by Agent')
    plt.legend()
    plt.tight_layout()
    plt.savefig(out_path)
    plt.close()
    return out_path
620
+
621
def plot_metric_correlation(metrics_df, out_path="/tmp/correlation.png"):
    """Render the pairwise correlation matrix of all metric columns.

    Args:
        metrics_df: frame containing every metric column listed below.
        out_path: destination PNG path.

    Returns:
        The path the figure was saved to.
    """
    cols = ["accuracy", "hallucination", "instruction_following",
            "coherence", "assumption", "relevance", "fluency", "final_score"]

    plt.figure(figsize=(10, 8))
    corr = metrics_df[cols].corr()
    # center=0 so positive/negative correlations get opposite hues.
    sns.heatmap(corr, annot=True, cmap="coolwarm", center=0,
                fmt=".2f", square=True)
    plt.title('Correlation Between Metrics')
    plt.tight_layout()
    plt.savefig(out_path)
    plt.close()
    return out_path
635
+
636
def plot_agent_comparison(metrics_df, out_path="/tmp/agent_comparison.png"):
    """Render a grouped bar chart of mean metric scores per agent.

    Args:
        metrics_df: frame with an ``agent`` column plus one column per metric.
        out_path: destination PNG path.

    Returns:
        The path the figure was saved to.
    """
    metrics = ["accuracy", "hallucination", "instruction_following",
               "coherence", "assumption", "relevance", "fluency"]

    agent_means = metrics_df.groupby('agent')[metrics].mean()

    # Fix: pass an explicit Axes. DataFrame.plot otherwise opens its own
    # figure, so the previous plt.figure(figsize=...) was left empty, its
    # figsize ignored, and the stray figure never closed (resource leak).
    fig, ax = plt.subplots(figsize=(12, 6))
    agent_means.plot(kind='bar', colormap='Set3', ax=ax)
    ax.set_title('Agent Performance Across Metrics')
    ax.set_xlabel('Agent')
    ax.set_ylabel('Score')
    ax.tick_params(axis='x', rotation=45)
    ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    fig.tight_layout()
    fig.savefig(out_path)
    plt.close(fig)
    return out_path
654
+
655
+ # --------------------------
656
+ # MAIN EVALUATION FUNCTION
657
+ # --------------------------
658
def evaluate_dataframe(df: pd.DataFrame):
    """Score every row of *df*, rank agents, and render all charts.

    Args:
        df: input frame; must contain an ``agent`` column plus whatever
            columns ``evaluate_row`` reads (prompt/response/reference).

    Returns:
        Tuple ``(metrics_df, images, leaderboard)`` where ``images`` is a
        list of ``(file_path, caption)`` pairs for the generated figures.
    """
    metrics_df = df.apply(evaluate_row, axis=1, result_type='expand')

    # Bug fix: evaluate_row returns only metric columns, but every plot
    # helper and the leaderboard filter/group on metrics_df['agent'];
    # without this column they raise KeyError. Carry it over from df.
    metrics_df['agent'] = df['agent'].values

    # Leaderboard: agents ranked by mean final score, best first.
    # NOTE(review): aggregation reconstructed from the visible
    # ``.reset_index()`` tail — confirm against the original source.
    leaderboard = (
        metrics_df.groupby("agent")["final_score"]
        .mean()
        .sort_values(ascending=False)
        .reset_index()
    )

    # Generate visualizations
    images = []

    agents = df["agent"].unique()
    metrics = ["accuracy", "hallucination", "instruction_following",
               "coherence", "assumption", "relevance", "fluency"]

    images.append((plot_radar_chart(metrics_df, agents, metrics),
                   "Radar Chart: Agent vs Metrics"))
    images.append((plot_heatmap(metrics_df),
                   "Heatmap: Agent vs Metrics"))
    images.append((plot_score_distribution(metrics_df),
                   "Score Distribution by Agent"))
    images.append((plot_metric_correlation(metrics_df),
                   "Metric Correlation Matrix"))
    images.append((plot_agent_comparison(metrics_df),
                   "Agent Comparison Chart"))

    return metrics_df, images, leaderboard