Supastrikas-004 manayporwal07 committed on
Commit fca26b1 · verified
1 Parent(s): d4e4912

Update evaluator.py (#16)


- Update evaluator.py (260dd1f6f4d58ed1d98270526903191fc129a3a4)


Co-authored-by: Manay Porwal <manayporwal07@users.noreply.huggingface.co>

Files changed (1)
  1. evaluator.py +210 -211
evaluator.py CHANGED
@@ -204,258 +204,257 @@
  #
  ###############################################################################################################################

  """
- Evaluation logic for Agentic Evaluation Framework.
  """

- import os
  import numpy as np
  import pandas as pd
- import torch
  import matplotlib.pyplot as plt
-
- from transformers import (
-     AutoTokenizer,
-     AutoModelForSequenceClassification,
-     AutoModelForCausalLM,
-     pipeline,
- )
- from sentence_transformers import SentenceTransformer
- import evaluate

  # -----------------------------
- # Global Config
  # -----------------------------
- NLI_MODEL = "microsoft/deberta-v2-xlarge-mnli"
- EMBED_MODEL = "all-MiniLM-L6-v2"
- LLM_JUDGE_MODEL = "microsoft/DialoGPT-small"
- FLUENCY_MODEL = "textattack/roberta-base-CoLA"
-
- device = 0 if torch.cuda.is_available() else -1

- # Caches
- _nli_model, _nli_tokenizer = None, None
  _embed_model = None
- _judge_model, _judge_tokenizer = None, None
- _fluency_checker = None
-
- # Metrics
- bertscore = evaluate.load("bertscore")
- bleu = evaluate.load("bleu")
- rouge = evaluate.load("rouge")
-

- # -----------------------------
- # Lazy Model Loaders
- # -----------------------------
- def get_nli_model():
-     global _nli_model, _nli_tokenizer
      if _nli_model is None:
          _nli_tokenizer = AutoTokenizer.from_pretrained(NLI_MODEL)
-         _nli_model = AutoModelForSequenceClassification.from_pretrained(NLI_MODEL).to(
-             torch.device("cuda" if torch.cuda.is_available() else "cpu")
-         )
          _nli_model.eval()
-     return _nli_model, _nli_tokenizer
-

  def get_embed_model():
-     global _embed_model
-     if _embed_model is None:
-         _embed_model = SentenceTransformer(EMBED_MODEL, device="cuda" if torch.cuda.is_available() else "cpu")
      return _embed_model

-
- def get_judge_model():
-     global _judge_model, _judge_tokenizer
-     if _judge_model is None:
-         _judge_tokenizer = AutoTokenizer.from_pretrained(LLM_JUDGE_MODEL)
-         _judge_model = AutoModelForCausalLM.from_pretrained(LLM_JUDGE_MODEL).to(
-             torch.device("cuda" if torch.cuda.is_available() else "cpu")
-         )
-     return _judge_model, _judge_tokenizer
-
-
- def get_fluency_checker():
-     global _fluency_checker
-     if _fluency_checker is None:
-         _fluency_checker = pipeline(
-             "text-classification", model=FLUENCY_MODEL, device=device
-         )
-     return _fluency_checker
-

  # -----------------------------
- # Evaluation Functions
  # -----------------------------
- def check_instruction_following(prompt, response):
-     try:
-         nli_model, nli_tokenizer = get_nli_model()
-         inputs = nli_tokenizer(prompt, response, return_tensors="pt", truncation=True, padding=True).to(
-             nli_model.device
-         )
-         with torch.no_grad():
-             logits = nli_model(**inputs).logits
-         probs = torch.softmax(logits, dim=-1).cpu().numpy()[0]
-         entailment_score = probs[2] # entailment index
-         return float(entailment_score)
-     except Exception:
-         return 0.0

-
- def check_hallucination(reference, response):
-     try:
-         nli_model, nli_tokenizer = get_nli_model()
-         inputs = nli_tokenizer(reference, response, return_tensors="pt", truncation=True, padding=True).to(
-             nli_model.device
-         )
-         with torch.no_grad():
-             logits = nli_model(**inputs).logits
-         probs = torch.softmax(logits, dim=-1).cpu().numpy()[0]
-         contradiction_score = probs[0] # contradiction index
-         return 1.0 - float(contradiction_score)
-     except Exception:
          return 0.0
-
-
- def check_assumption(prompt, response):
-     try:
-         judge_model, judge_tokenizer = get_judge_model()
-         input_text = f"Does this response make assumptions not in the prompt?\nPrompt: {prompt}\nResponse: {response}\nAnswer yes or no:"
-         inputs = judge_tokenizer.encode(input_text, return_tensors="pt").to(judge_model.device)
-         outputs = judge_model.generate(inputs, max_length=50)
-         judgment = judge_tokenizer.decode(outputs[0], skip_special_tokens=True).lower()
-         if "yes" in judgment:
-             return 0.0
-         elif "no" in judgment:
-             return 1.0
-         return 0.5
-     except Exception:
-         return 0.5
-
-
- def check_coherence(response):
-     try:
-         emb = get_embed_model().encode(response, convert_to_tensor=True, normalize_embeddings=True)
-         coherence = float(torch.mean(emb).cpu().item())
-         return coherence
-     except Exception:
          return 0.0
-
-
- def check_accuracy(reference, response):
-     try:
-         bert_results = bertscore.compute(predictions=[response], references=[reference], lang="en")
-         bert_f1 = bert_results["f1"][0]
-     except Exception:
-         bert_f1 = 0.0
-
-     try:
-         bleu_results = bleu.compute(predictions=[response], references=[[reference]])
-         bleu_score = bleu_results["bleu"]
-     except Exception:
-         bleu_score = 0.0
-
-     try:
-         rouge_results = rouge.compute(predictions=[response], references=[reference])
-         rouge_l = rouge_results["rougeL"]
-     except Exception:
-         rouge_l = 0.0
-
-     return float((bert_f1 + bleu_score + rouge_l) / 3)
-
-
- def check_relevance(prompt, response):
-     try:
-         model = get_embed_model()
-         emb1 = model.encode(prompt, convert_to_tensor=True)
-         emb2 = model.encode(response, convert_to_tensor=True)
-         cos_sim = torch.nn.functional.cosine_similarity(emb1, emb2, dim=0)
-         return float(cos_sim.item())
-     except Exception:
          return 0.0
-
-
- def check_fluency(response):
-     try:
-         fluency_checker = get_fluency_checker()
-         result = fluency_checker(response)[0]
-         return float(result["score"]) if result["label"] == "LABEL_1" else 1.0 - float(result["score"])
-     except Exception:
-         return 0.5
-

  # -----------------------------
- # Row Evaluation
  # -----------------------------
- def evaluate_row(row):
-     scores = {
-         "instruction_following": check_instruction_following(row["prompt"], row["response"]),
-         "hallucination": check_hallucination(row["reference"], row["response"]),
-         "assumption": check_assumption(row["prompt"], row["response"]),
-         "coherence": check_coherence(row["response"]),
-         "accuracy": check_accuracy(row["reference"], row["response"]),
-         "relevance": check_relevance(row["prompt"], row["response"]),
-         "fluency": check_fluency(row["response"]),
-     }
-     scores["final_score"] = np.mean(list(scores.values()))
-     return pd.Series(scores)

  # -----------------------------
- # Visualization Helpers
  # -----------------------------
- def plot_radar_chart(metrics_df, out_path="/tmp/radar.png"):
-     import seaborn as sns
-
-     mean_scores = metrics_df.mean(numeric_only=True).drop("final_score", errors="ignore")
-     categories = list(mean_scores.index)
-     values = mean_scores.values.tolist()

-     values += values[:1]
-     categories += categories[:1]

-     angles = np.linspace(0, 2 * np.pi, len(categories), endpoint=False).tolist()
-     angles += angles[:1]
-
-     plt.figure(figsize=(6, 6))
-     ax = plt.subplot(111, polar=True)
-     ax.plot(angles, values, "o-", linewidth=2)
-     ax.fill(angles, values, alpha=0.25)
-     ax.set_thetagrids(np.degrees(angles[:-1]), categories)
-     plt.savefig(out_path)
-     plt.close()
-     return out_path, "Radar Chart (Mean Scores)"
-
-
- def plot_leaderboard(metrics_df, out_path="/tmp/leaderboard.png"):
-     agent_means = metrics_df.groupby("agent")["final_score"].mean().sort_values(ascending=False)
-     plt.figure(figsize=(10, 5))
-     agent_means.plot(kind="bar", colormap="Set3", ax=plt.gca())
-     plt.title("Leaderboard: Avg Final Score per Agent")
-     plt.ylabel("Score")
-     plt.tight_layout()
-     plt.savefig(out_path)
-     plt.close()
-     return out_path, "Leaderboard"


- # -----------------------------
- # Main Evaluation Entry
- # -----------------------------
- def evaluate_dataframe(df: pd.DataFrame):
-     metrics_df = df.apply(evaluate_row, axis=1, result_type="expand")
-     metrics_df = pd.concat([df, metrics_df], axis=1)
-
      leaderboard = (
-         metrics_df.groupby("agent")["final_score"]
          .mean()
          .reset_index()
-         .sort_values("final_score", ascending=False)
      )

      images = []
-     images.append(plot_radar_chart(metrics_df))
-     images.append(plot_leaderboard(metrics_df))

-     return metrics_df, images, leaderboard
  #
  ###############################################################################################################################

+ # evaluator.py
  """
+ Upgraded Evaluation logic for the Agentic Evaluation Framework.
+ Provides scoring functions, visualization generation, and summary outputs.
  """

+ import math
+ import uuid
+ from typing import List, Dict, Tuple
  import numpy as np
  import pandas as pd
  import matplotlib.pyplot as plt
+ import seaborn as sns

  # -----------------------------
+ # Lazy model loading
  # -----------------------------
+ NLI_MODEL = "textattack/roberta-base-MNLI"
+ EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"

+ _nli_tokenizer = None
+ _nli_model = None
  _embed_model = None
+ _id2label = None

+ def ensure_models_loaded():
+     global _nli_tokenizer, _nli_model, _embed_model, _id2label
+     if _embed_model is None:
+         from sentence_transformers import SentenceTransformer, util
+         _embed_model = SentenceTransformer(EMBED_MODEL)
+         globals()["util"] = util
      if _nli_model is None:
+         from transformers import AutoTokenizer, AutoModelForSequenceClassification
          _nli_tokenizer = AutoTokenizer.from_pretrained(NLI_MODEL)
+         _nli_model = AutoModelForSequenceClassification.from_pretrained(NLI_MODEL)
+         _nli_model.to("cpu")
          _nli_model.eval()
+         _id2label = {int(k): v.upper() for k, v in _nli_model.config.id2label.items()}

  def get_embed_model():
+     ensure_models_loaded()
      return _embed_model

+ def get_nli_tokenizer_and_model():
+     ensure_models_loaded()
+     return _nli_tokenizer, _nli_model, _id2label

  # -----------------------------
+ # Metric functions
  # -----------------------------

+ def check_instruction_following(prompt: str, response: str) -> float:
+     if not prompt or not response:
          return 0.0
+     embed_model = get_embed_model()
+     p_emb = embed_model.encode(prompt, convert_to_tensor=True)
+     r_emb = embed_model.encode(response, convert_to_tensor=True)
+     sim = float(util.cos_sim(p_emb, r_emb).item())
+     return round(max(0.0, min(1.0, sim)), 3)
+
+ def check_hallucination(prompt: str, response: str) -> float:
+     if not prompt or not response:
          return 0.0
+     tokenizer, model, id2label = get_nli_tokenizer_and_model()
+     inputs = tokenizer.encode_plus(prompt, response, return_tensors="pt", truncation=True)
+     outputs = model(**inputs)
+     probs = outputs.logits.softmax(dim=1).detach().cpu().numpy()[0]
+     labels = [id2label[i] for i in range(len(probs))]
+     if "ENTAILMENT" in labels:
+         entailment_prob = float(probs[labels.index("ENTAILMENT")])
+     else:
+         entailment_prob = float(probs.max())
+     return round(entailment_prob, 3)
+
+ def check_accuracy(reference: str, response: str) -> float:
+     if not reference or not response:
          return 0.0
+     embed_model = get_embed_model()
+     ref_emb = embed_model.encode(reference, convert_to_tensor=True)
+     r_emb = embed_model.encode(response, convert_to_tensor=True)
+     sim = float(util.cos_sim(ref_emb, r_emb).item())
+     return round(max(0.0, min(1.0, sim)), 3)
+
+ def check_coherence(response: str) -> float:
+     if not response or not isinstance(response, str):
+         return 0.0
+     sentences = [s.strip() for s in response.split(".") if s.strip()]
+     if not sentences:
+         return 0.0
+     lengths = [len(s.split()) for s in sentences]
+     avg_len = np.mean(lengths)
+     std = np.std(lengths)
+     score = max(0.0, min(1.0, 1.0 - (std / (avg_len + 1e-6))))
+     return round(score, 3)
+
+ def check_fluency(response: str) -> float:
+     if not response or not isinstance(response, str):
+         return 0.0
+     letters = sum(ch.isalpha() for ch in response)
+     total = len(response)
+     return round(max(0.0, min(1.0, letters / max(1, total))), 3)

  # -----------------------------
+ # Visualization helpers
  # -----------------------------

+ def spider_net_multi(labels: List[str], rows: List[Dict], title: str, fill_alpha: float = 0.12):
+     N = len(labels)
+     angles = [n / float(N) * 2 * math.pi for n in range(N)]
+     angles += angles[:1]
+     fig = plt.figure(figsize=(6.5, 6.5))
+     ax = plt.subplot(111, polar=True)
+     ax.set_xticks(angles[:-1])
+     ax.set_xticklabels(labels, fontsize=9)
+     ax.set_ylim(0, 1)
+     ax.set_yticks([0, 0.25, 0.5, 0.75, 1.0])
+     for r in rows:
+         values = r["values"]
+         values_closed = values + values[:1]
+         ax.plot(angles, values_closed, linewidth=1.5, label=r["name"])
+         ax.fill(angles, values_closed, alpha=fill_alpha)
+     ax.set_title(title, y=1.08, fontsize=12)
+     ax.legend(loc="upper right", bbox_to_anchor=(1.25, 1.1))
+     return fig
+
+ def heatmap_plot(df: pd.DataFrame, metric_cols: List[str], title: str = "Metric Correlations"):
+     fig, ax = plt.subplots(figsize=(7, 5))
+     sns.heatmap(df[metric_cols].corr(), annot=True, fmt=".2f", cmap="coolwarm", ax=ax)
+     ax.set_title(title)
+     return fig
+
+ def task_agent_heatmap(leaderboard: pd.DataFrame, metric: str):
+     pivot = leaderboard.pivot(index="task", columns="agent", values=metric)
+     fig, ax = plt.subplots(figsize=(7, 5))
+     sns.heatmap(pivot, annot=True, fmt=".2f", cmap="YlGnBu", ax=ax)
+     ax.set_title(f"Task-Agent Performance ({metric})")
+     return fig
+
+ def leaderboard_barplot(leaderboard: pd.DataFrame, metric_cols: List[str]):
+     melted = leaderboard.melt(id_vars=["agent"], value_vars=metric_cols, var_name="metric", value_name="score")
+     fig, ax = plt.subplots(figsize=(8, 5))
+     sns.barplot(x="metric", y="score", hue="agent", data=melted, ax=ax)
+     ax.set_title("Leaderboard Bar Chart")
+     ax.set_ylim(0, 1)
+     return fig
+
+ def distribution_plot(metrics_df: pd.DataFrame, metric: str):
+     fig, ax = plt.subplots(figsize=(7, 5))
+     sns.boxplot(x="agent", y=metric, data=metrics_df, ax=ax)
+     sns.stripplot(x="agent", y=metric, data=metrics_df, ax=ax, color="black", alpha=0.4, jitter=True)
+     ax.set_title(f"Distribution of {metric} Scores per Agent")
+     ax.set_ylim(0, 1)
+     return fig
+
+ def scatter_two_metrics(metrics_df: pd.DataFrame, metric_x: str, metric_y: str):
+     fig, ax = plt.subplots(figsize=(6, 6))
+     sns.scatterplot(x=metric_x, y=metric_y, hue="agent", data=metrics_df, ax=ax, alpha=0.7)
+     ax.set_title(f"{metric_x} vs {metric_y}")
+     ax.set_xlim(0, 1)
+     ax.set_ylim(0, 1)
+     return fig

  # -----------------------------
370
+ # Main evaluation entrypoint
371
  # -----------------------------
 
 
 
 
 
 
372
 
373
+ def evaluate_dataframe(df: pd.DataFrame) -> Tuple[pd.DataFrame, List[Tuple[str,str]], pd.DataFrame]:
374
+ df = df.copy()
375
 
376
+ # compute scores per row
377
+ scores = []
378
+ for _, row in df.iterrows():
379
+ s = {}
380
+ s["instruction_following"] = check_instruction_following(str(row.get("prompt", "")), str(row.get("response", "")))
381
+ s["hallucination"] = check_hallucination(str(row.get("prompt", "")), str(row.get("response", "")))
382
+ s["accuracy"] = check_accuracy(str(row.get("reference", "")), str(row.get("response", "")))
383
+ s["coherence"] = check_coherence(str(row.get("response", "")))
384
+ s["fluency"] = check_fluency(str(row.get("response", "")))
385
+ scores.append(s)
 
 
 
 
 
 
 
 
 
 
 
 
 
386
 
387
+ metrics_df = pd.concat([df.reset_index(drop=True), pd.DataFrame(scores)], axis=1)
388
 
389
+ # leaderboard: average per agent & task
390
+ metric_cols = ["instruction_following", "hallucination", "accuracy", "coherence", "fluency"]
 
 
 
 
 
391
  leaderboard = (
392
+ metrics_df.groupby(["agent", "task"])[metric_cols]
393
  .mean()
394
  .reset_index()
 
395
  )
396
 
397
+ # -------------------
398
+ # Visualization images
399
+ # -------------------
400
  images = []
 
 
401
 
402
+ try:
403
+ rows = []
404
+ for agent in leaderboard["agent"].unique():
405
+ vals = leaderboard[leaderboard["agent"] == agent][metric_cols].mean().tolist()
406
+ rows.append({"name": agent, "values": vals})
407
+ fig1 = spider_net_multi(metric_cols, rows, "Agent Performance Radar")
408
+ path1 = f"/tmp/radar_{uuid.uuid4().hex}.png"
409
+ fig1.savefig(path1, bbox_inches="tight")
410
+ plt.close(fig1)
411
+ images.append((path1, "Radar Plot"))
412
+ except Exception as e:
413
+ print("Radar plot failed:", e)
414
+
415
+ try:
416
+ fig2 = heatmap_plot(metrics_df, metric_cols, title="Metric Correlation Heatmap")
417
+ path2 = f"/tmp/heatmap_{uuid.uuid4().hex}.png"
418
+ fig2.savefig(path2, bbox_inches="tight")
419
+ plt.close(fig2)
420
+ images.append((path2, "Correlation Heatmap"))
421
+ except Exception as e:
422
+ print("Heatmap failed:", e)
423
+
424
+ try:
425
+ fig3 = task_agent_heatmap(leaderboard, "accuracy")
426
+ path3 = f"/tmp/task_agent_{uuid.uuid4().hex}.png"
427
+ fig3.savefig(path3, bbox_inches="tight")
428
+ plt.close(fig3)
429
+ images.append((path3, "Task-Agent Heatmap (Accuracy)"))
430
+ except Exception as e:
431
+ print("Task-agent heatmap failed:", e)
432
+
433
+ try:
434
+ fig4 = leaderboard_barplot(leaderboard, metric_cols)
435
+ path4 = f"/tmp/barplot_{uuid.uuid4().hex}.png"
436
+ fig4.savefig(path4, bbox_inches="tight")
437
+ plt.close(fig4)
438
+ images.append((path4, "Leaderboard Bar Chart"))
439
+ except Exception as e:
440
+ print("Barplot failed:", e)
441
+
442
+ try:
443
+ fig5 = distribution_plot(metrics_df, "accuracy")
444
+ path5 = f"/tmp/distribution_{uuid.uuid4().hex}.png"
445
+ fig5.savefig(path5, bbox_inches="tight")
446
+ plt.close(fig5)
447
+ images.append((path5, "Accuracy Distribution"))
448
+ except Exception as e:
449
+ print("Distribution plot failed:", e)
450
+
451
+ try:
452
+ fig6 = scatter_two_metrics(metrics_df, "instruction_following", "accuracy")
453
+ path6 = f"/tmp/scatter_{uuid.uuid4().hex}.png"
454
+ fig6.savefig(path6, bbox_inches="tight")
455
+ plt.close(fig6)
456
+ images.append((path6, "Instruction Following vs Accuracy"))
457
+ except Exception as e:
458
+ print("Scatter plot failed:", e)
459
+
460
+ return metrics_df, images, leaderboard
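
For context, here is a minimal usage sketch of the updated evaluate_dataframe, not part of the commit itself. The example rows, agent names, and task names are invented for illustration; the column names mirror what the new code reads via row.get("prompt"/"response"/"reference") and groups by ("agent", "task").

# Hypothetical usage sketch -- assumes evaluator.py is importable as `evaluator`.
import pandas as pd
from evaluator import evaluate_dataframe

# Toy input: one row per (agent, task) pair, with made-up prompts/responses.
df = pd.DataFrame([
    {"agent": "agent_a", "task": "qa",
     "prompt": "What is the capital of France?",
     "response": "The capital of France is Paris.",
     "reference": "Paris is the capital of France."},
    {"agent": "agent_b", "task": "qa",
     "prompt": "What is the capital of France?",
     "response": "France's capital city is Lyon.",
     "reference": "Paris is the capital of France."},
])

metrics_df, images, leaderboard = evaluate_dataframe(df)

# metrics_df:  the input rows plus one column per metric (scores in [0, 1])
# images:      list of (png_path, title) tuples for the plots saved under /tmp
# leaderboard: mean metric scores per (agent, task) pair
print(leaderboard)
for path, title in images:
    print(title, "->", path)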