Update evaluator.py

#11
by manayporwal07 - opened
Files changed (1) hide show
  1. evaluator.py +204 -435
evaluator.py CHANGED
@@ -204,489 +204,258 @@
204
  #
205
  ###############################################################################################################################
206
 
207
- import re
208
- import json
209
- import torch
210
- import pandas as pd
211
- import matplotlib.pyplot as plt
212
- import seaborn as sns
213
  import os
214
- import uuid
215
  import numpy as np
 
 
 
 
216
  from transformers import (
217
- AutoTokenizer,
218
- AutoModelForSequenceClassification,
219
  AutoModelForCausalLM,
220
- pipeline
221
  )
222
- from sentence_transformers import SentenceTransformer, util
223
  import evaluate
224
- from sklearn.metrics import accuracy_score, f1_score
225
- from collections import defaultdict
226
- import warnings
227
- warnings.filterwarnings('ignore')
228
-
229
- # --------------------------
230
- # MODEL LOADING
231
- # --------------------------
232
- NLI_MODEL = "microsoft/deberta-v2-xlarge-mnli"
233
- EMBED_MODEL = "sentence-transformers/all-mpnet-base-v2"
234
- LLM_JUDGE_MODEL = "microsoft/DialoGPT-large" # Can be replaced with more powerful models
235
 
236
- # Load NLI model & tokenizer
237
- nli_tokenizer = AutoTokenizer.from_pretrained(NLI_MODEL)
238
- nli_model = AutoModelForSequenceClassification.from_pretrained(NLI_MODEL)
239
- nli_model.to("cuda" if torch.cuda.is_available() else "cpu")
240
- nli_model.eval()
 
 
241
 
242
- # Load embedding model
243
- embed_model = SentenceTransformer(EMBED_MODEL)
244
 
245
- # Load LLM judge
246
- judge_tokenizer = AutoTokenizer.from_pretrained(LLM_JUDGE_MODEL)
247
- judge_model = AutoModelForCausalLM.from_pretrained(LLM_JUDGE_MODEL)
248
- judge_model.to("cuda" if torch.cuda.is_available() else "cpu")
249
- judge_model.eval()
250
 
251
- # Load additional evaluation metrics
252
  bertscore = evaluate.load("bertscore")
253
  bleu = evaluate.load("bleu")
254
  rouge = evaluate.load("rouge")
255
 
256
- # Label mapping from config
257
- id2label = {int(k): v.upper() for k, v in nli_model.config.id2label.items()}
258
 
259
- # --------------------------
260
- # IMPROVED METRIC FUNCTIONS
261
- # --------------------------
262
- def check_instruction_following(prompt: str, response: str) -> float:
263
- """Improved instruction following using NLI and semantic similarity."""
264
- if not prompt or not response:
265
- return 0.0
266
-
267
- # Method 1: NLI-based evaluation
268
- with torch.no_grad():
269
- inputs = nli_tokenizer.encode_plus(
270
- prompt,
271
- response,
272
- return_tensors="pt",
273
- truncation=True,
274
- max_length=512
275
- ).to(nli_model.device)
276
-
277
- outputs = nli_model(**inputs)
278
- probs = torch.softmax(outputs.logits, dim=-1).cpu().numpy()[0]
279
-
280
- entail_prob, neutral_prob = 0.0, 0.0
281
- for idx, p in enumerate(probs):
282
- label = id2label.get(idx, "")
283
- if "ENTAIL" in label:
284
- entail_prob = float(p)
285
- elif "NEUTRAL" in label:
286
- neutral_prob = float(p)
287
-
288
- nli_score = entail_prob + (neutral_prob * 0.5)
289
-
290
- # Method 2: Semantic similarity
291
- p_emb = embed_model.encode(prompt, convert_to_tensor=True)
292
- r_emb = embed_model.encode(response, convert_to_tensor=True)
293
- sim_score = float(util.cos_sim(p_emb, r_emb).item())
294
-
295
- # Combined score (weighted average)
296
- final_score = 0.7 * nli_score + 0.3 * sim_score
297
- return round(max(0.0, min(1.0, final_score)), 3)
298
 
299
- def check_hallucination(reference: str, response: str) -> float:
300
- """Enhanced hallucination detection using multiple methods."""
301
- if not reference or not response:
302
- return 0.0
303
-
304
- # Method 1: NLI-based contradiction detection
305
- with torch.no_grad():
306
- inputs = nli_tokenizer.encode_plus(
307
- reference,
308
- response,
309
- return_tensors="pt",
310
- truncation=True,
311
- max_length=512
312
- ).to(nli_model.device)
313
-
314
- outputs = nli_model(**inputs)
315
- probs = torch.softmax(outputs.logits, dim=-1).cpu().numpy()[0]
316
-
317
- contra_prob, neutral_prob = 0.0, 0.0
318
- for idx, p in enumerate(probs):
319
- label = id2label.get(idx, "")
320
- if "CONTRA" in label:
321
- contra_prob = float(p)
322
- elif "NEUTRAL" in label:
323
- neutral_prob = float(p)
324
-
325
- nli_hallucination_score = contra_prob + (neutral_prob * 0.3)
326
-
327
- # Method 2: Semantic similarity penalty
328
- ref_emb = embed_model.encode(reference, convert_to_tensor=True)
329
- resp_emb = embed_model.encode(response, convert_to_tensor=True)
330
- semantic_sim = float(util.cos_sim(ref_emb, resp_emb).item())
331
-
332
- # Combined score: Higher when less hallucination
333
- hallucination_score = 1.0 - (0.7 * nli_hallucination_score + 0.3 * (1 - semantic_sim))
334
- return round(max(0.0, min(1.0, hallucination_score)), 3)
335
 
336
- def check_assumption(response: str) -> float:
337
- """Improved assumption detection using pattern matching and LLM judgment."""
338
- if not response:
339
- return 0.0
340
-
341
- # Pattern-based detection
342
- speculative_patterns = [
343
- r"\b(maybe|perhaps|possibly|probably|might|could|would|should)\b",
344
- r"\b(I think|I believe|I guess|I suppose|I assume)\b",
345
- r"\b(it seems|it appears|it looks like)\b",
346
- r"\b(likely|unlikely|presumably|arguably)\b",
347
- r"\b(some|many|most|often|usually|generally|typically)\b"
348
- ]
349
-
350
- pattern_count = sum(
351
- len(re.findall(pattern, response.lower()))
352
- for pattern in speculative_patterns
353
- )
354
-
355
- # Length normalization
356
- word_count = len(response.split())
357
- pattern_score = min(1.0, pattern_count / max(1, word_count / 5))
358
-
359
- # LLM-based judgment
360
- assumption_prompt = f"""
361
- Determine if the following text contains assumptions, speculation, or hedging language.
362
- Text: {response}
363
- Answer with only 'yes' or 'no':
364
- """
365
-
366
- with torch.no_grad():
367
- inputs = judge_tokenizer.encode(assumption_prompt, return_tensors="pt")
368
- outputs = judge_model.generate(
369
- inputs,
370
- max_length=len(inputs[0]) + 3,
371
- pad_token_id=judge_tokenizer.eos_token_id
372
  )
373
- judgment = judge_tokenizer.decode(outputs[0], skip_special_tokens=True)
374
-
375
- llm_score = 0.0 if "yes" in judgment.lower() else 1.0
376
-
377
- # Combined score
378
- final_score = 0.6 * (1 - pattern_score) + 0.4 * llm_score
379
- return round(final_score, 3)
 
 
 
380
 
381
- def check_coherence(response: str) -> float:
382
- """Enhanced coherence evaluation using multiple linguistic features."""
383
- if not response:
 
 
 
 
 
 
 
 
 
 
 
 
 
384
  return 0.0
385
-
386
- # Feature 1: Sentence structure
387
- sentences = re.split(r'[.!?]+', response)
388
- sentences = [s.strip() for s in sentences if len(s.strip()) > 0]
389
- num_sentences = len(sentences)
390
-
391
- if num_sentences == 0:
 
 
 
 
 
 
 
392
  return 0.0
393
-
394
- # Feature 2: Sentence length variation
395
- sent_lengths = [len(s.split()) for s in sentences]
396
- length_variance = np.var(sent_lengths) if len(sent_lengths) > 1 else 0
397
- length_score = 1.0 - min(1.0, length_variance / 100)
398
-
399
- # Feature 3: Transition words
400
- transition_words = [
401
- 'however', 'therefore', 'moreover', 'furthermore', 'consequently',
402
- 'additionally', 'likewise', 'similarly', 'nevertheless', 'nonetheless'
403
- ]
404
-
405
- transition_count = sum(1 for word in transition_words
406
- if word in response.lower())
407
- transition_score = min(1.0, transition_count / 3)
408
-
409
- # Feature 4: Repetition penalty
410
- words = response.lower().split()
411
- unique_words = set(words)
412
- repetition_ratio = len(unique_words) / max(1, len(words))
413
-
414
- # Combined score
415
- coherence_score = (
416
- 0.3 * min(1.0, num_sentences / 5) +
417
- 0.2 * length_score +
418
- 0.3 * transition_score +
419
- 0.2 * repetition_ratio
420
- )
421
-
422
- return round(max(0.0, min(1.0, coherence_score)), 3)
423
 
424
- def check_accuracy(reference: str, response: str) -> float:
425
- """Enhanced accuracy evaluation using multiple metrics."""
426
- if not reference or not response:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
427
  return 0.0
428
-
429
- # BERTScore
430
- bert_results = bertscore.compute(
431
- predictions=[response],
432
- references=[reference],
433
- lang="en",
434
- model_type=EMBED_MODEL
435
- )
436
- bert_f1 = bert_results['f1'][0]
437
-
438
- # ROUGE-L
439
- rouge_results = rouge.compute(
440
- predictions=[response],
441
- references=[reference],
442
- use_stemmer=True
443
- )
444
- rouge_l = rouge_results['rougeL']
445
-
446
- # BLEU (for shorter responses)
447
  try:
448
- bleu_results = bleu.compute(
449
- predictions=[response.split()],
450
- references=[[reference.split()]]
451
- )
452
- bleu_score = bleu_results['bleu']
453
- except:
 
 
 
454
  bleu_score = 0.0
455
-
456
- # Semantic similarity
457
- ref_emb = embed_model.encode(reference, convert_to_tensor=True)
458
- resp_emb = embed_model.encode(response, convert_to_tensor=True)
459
- semantic_sim = float(util.cos_sim(ref_emb, resp_emb).item())
460
-
461
- # Combined score (weighted average)
462
- accuracy_score = (
463
- 0.4 * bert_f1 +
464
- 0.3 * rouge_l +
465
- 0.1 * bleu_score +
466
- 0.2 * semantic_sim
467
- )
468
-
469
- return round(max(0.0, min(1.0, accuracy_score)), 3)
470
 
471
- def check_relevance(prompt: str, response: str) -> float:
472
- """Check how relevant the response is to the prompt."""
473
- if not prompt or not response:
474
- return 0.0
475
-
476
- # Encode both prompt and response
477
- p_emb = embed_model.encode(prompt, convert_to_tensor=True)
478
- r_emb = embed_model.encode(response, convert_to_tensor=True)
479
-
480
- # Calculate cosine similarity
481
- similarity = float(util.cos_sim(p_emb, r_emb).item())
482
-
483
- return round(max(0.0, min(1.0, similarity)), 3)
484
 
485
- def check_fluency(response: str) -> float:
486
- """Check the fluency of the response using perplexity-based approach."""
487
- if not response:
 
 
 
 
 
488
  return 0.0
489
-
490
- # Load a fluency model (perplexity-based)
491
- fluency_checker = pipeline(
492
- "text-classification",
493
- model="textattack/roberta-base-CoLA",
494
- device=0 if torch.cuda.is_available() else -1
495
- )
496
-
497
  try:
498
- # Split into sentences if too long
499
- sentences = re.split(r'[.!?]+', response)
500
- sentences = [s.strip() for s in sentences if len(s.strip()) > 5]
501
-
502
- if not sentences:
503
- return 0.5
504
-
505
- # Check each sentence
506
- fluency_scores = []
507
- for sent in sentences[:3]: # Limit to first 3 sentences
508
- result = fluency_checker(sent[:512]) # Truncate if too long
509
- score = result[0]['score'] if result[0]['label'] == 'LABEL_1' else 1 - result[0]['score']
510
- fluency_scores.append(score)
511
-
512
- avg_fluency = sum(fluency_scores) / len(fluency_scores)
513
- return round(avg_fluency, 3)
514
- except:
515
- # Fallback to simple heuristic
516
- words = response.split()
517
- if len(words) < 3:
518
- return 0.3
519
- return 0.7
520
-
521
- # --------------------------
522
- # ROW & DF EVALUATION
523
- # --------------------------
524
  def evaluate_row(row):
525
- prompt = row.get("prompt", "")
526
- response = row.get("response", "")
527
- reference = row.get("reference", "")
528
-
529
- metrics = {
530
- "task_id": row.get("task_id", ""),
531
- "agent": row.get("agent", ""),
532
- "instruction_following": check_instruction_following(prompt, response),
533
- "hallucination": check_hallucination(reference, response),
534
- "assumption": check_assumption(response),
535
- "coherence": check_coherence(response),
536
- "accuracy": check_accuracy(reference, response),
537
- "relevance": check_relevance(prompt, response),
538
- "fluency": check_fluency(response),
539
  }
 
 
540
 
541
- # Weighted avg score (adjust weights as needed)
542
- metrics["final_score"] = round(
543
- 0.20 * metrics["instruction_following"] +
544
- 0.20 * metrics["accuracy"] +
545
- 0.15 * metrics["hallucination"] +
546
- 0.10 * metrics["coherence"] +
547
- 0.10 * metrics["assumption"] +
548
- 0.15 * metrics["relevance"] +
549
- 0.10 * metrics["fluency"],
550
- 3,
551
- )
552
- return metrics
553
-
554
- # --------------------------
555
- # VISUALIZATION FUNCTIONS
556
- # --------------------------
557
- def plot_radar_chart(metrics_df, agents, metrics, out_path="/tmp/radar.png"):
558
- """Radar chart comparing multiple agents across metrics."""
559
- labels = metrics
560
- num_vars = len(labels)
561
-
562
- # Compute angle for each axis
563
- angles = np.linspace(0, 2 * np.pi, num_vars, endpoint=False).tolist()
564
- angles += angles[:1] # close loop
565
-
566
- fig, ax = plt.subplots(figsize=(8, 8), subplot_kw=dict(polar=True))
567
-
568
- for agent in agents:
569
- values = []
570
- for m in metrics:
571
- mean_val = metrics_df.loc[metrics_df['agent'] == agent, m].mean()
572
- values.append(mean_val if not np.isnan(mean_val) else 0)
573
- values += values[:1]
574
- ax.plot(angles, values, label=agent, linewidth=2)
575
- ax.fill(angles, values, alpha=0.25)
576
-
577
- ax.set_xticks(angles[:-1])
578
- ax.set_xticklabels(labels)
579
- ax.set_yticklabels([])
580
- ax.legend(loc="upper right", bbox_to_anchor=(1.3, 1.1))
581
- ax.set_title("Agent Performance Radar Chart")
582
-
583
- plt.tight_layout()
584
- plt.savefig(out_path)
585
- plt.close()
586
- return out_path
587
 
588
- def plot_heatmap(metrics_df, out_path="/tmp/heatmap.png"):
589
- """Heatmap of agent performance across metrics."""
590
- metrics = ["accuracy", "hallucination", "instruction_following",
591
- "coherence", "assumption", "relevance", "fluency"]
592
-
593
- pivot = metrics_df.groupby("agent")[metrics].mean()
594
-
595
- plt.figure(figsize=(10, 6))
596
- sns.heatmap(pivot, annot=True, cmap="YlGnBu", fmt=".3f", center=0.5)
597
- plt.title("Agent × Metric Heatmap")
598
- plt.tight_layout()
599
- plt.savefig(out_path)
600
- plt.close()
601
- return out_path
602
 
603
- def plot_score_distribution(metrics_df, out_path="/tmp/distribution.png"):
604
- """Distribution of final scores by agent."""
605
- plt.figure(figsize=(10, 6))
606
- agents = metrics_df['agent'].unique()
607
-
608
- for agent in agents:
609
- agent_scores = metrics_df[metrics_df['agent'] == agent]['final_score']
610
- sns.kdeplot(agent_scores, label=agent, fill=True, alpha=0.3)
611
-
612
- plt.xlabel('Final Score')
613
- plt.ylabel('Density')
614
- plt.title('Distribution of Final Scores by Agent')
615
- plt.legend()
616
- plt.tight_layout()
617
- plt.savefig(out_path)
618
- plt.close()
619
- return out_path
620
 
621
- def plot_metric_correlation(metrics_df, out_path="/tmp/correlation.png"):
622
- """Correlation matrix between different metrics."""
623
- metrics = ["accuracy", "hallucination", "instruction_following",
624
- "coherence", "assumption", "relevance", "fluency", "final_score"]
625
-
626
- plt.figure(figsize=(10, 8))
627
- correlation_matrix = metrics_df[metrics].corr()
628
- sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", center=0,
629
- fmt=".2f", square=True)
630
- plt.title('Correlation Between Metrics')
631
- plt.tight_layout()
632
  plt.savefig(out_path)
633
  plt.close()
634
- return out_path
635
 
636
- def plot_agent_comparison(metrics_df, out_path="/tmp/agent_comparison.png"):
637
- """Bar chart comparing agent performance across metrics."""
638
- metrics = ["accuracy", "hallucination", "instruction_following",
639
- "coherence", "assumption", "relevance", "fluency"]
640
-
641
- agent_means = metrics_df.groupby('agent')[metrics].mean()
642
-
643
- plt.figure(figsize=(12, 6))
644
- agent_means.plot(kind='bar', colormap='Set3')
645
- plt.title('Agent Performance Across Metrics')
646
- plt.xlabel('Agent')
647
- plt.ylabel('Score')
648
- plt.xticks(rotation=45)
649
- plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
650
  plt.tight_layout()
651
  plt.savefig(out_path)
652
  plt.close()
653
- return out_path
 
654
 
655
- # --------------------------
656
- # MAIN EVALUATION FUNCTION
657
- # --------------------------
658
  def evaluate_dataframe(df: pd.DataFrame):
659
- """Evaluate a dataframe of agent responses."""
660
- metrics_df = df.apply(evaluate_row, axis=1, result_type='expand')
661
 
662
- # Leaderboard
663
  leaderboard = (
664
- metrics_df.groupby(["agent", "task_id"])["final_score"]
665
  .mean()
666
  .reset_index()
 
667
  )
668
 
669
- # Generate visualizations
670
  images = []
671
-
672
- # Add all visualizations
673
- agents = df["agent"].unique()
674
- metrics = ["accuracy", "hallucination", "instruction_following",
675
- "coherence", "assumption", "relevance", "fluency"]
676
-
677
- radar_path = plot_radar_chart(metrics_df, agents, metrics)
678
- images.append((radar_path, "Radar Chart: Agent vs Metrics"))
679
-
680
- heatmap_path = plot_heatmap(metrics_df)
681
- images.append((heatmap_path, "Heatmap: Agent vs Metrics"))
682
-
683
- distribution_path = plot_score_distribution(metrics_df)
684
- images.append((distribution_path, "Score Distribution by Agent"))
685
-
686
- correlation_path = plot_metric_correlation(metrics_df)
687
- images.append((correlation_path, "Metric Correlation Matrix"))
688
-
689
- agent_comparison_path = plot_agent_comparison(metrics_df)
690
- images.append((agent_comparison_path, "Agent Comparison Chart"))
691
-
692
  return metrics_df, images, leaderboard
 
204
  #
205
  ###############################################################################################################################
206
 
207
+ """
208
+ Evaluation logic for Agentic Evaluation Framework.
209
+ """
210
+
 
 
211
  import os
 
212
  import numpy as np
213
+ import pandas as pd
214
+ import torch
215
+ import matplotlib.pyplot as plt
216
+
217
  from transformers import (
218
+ AutoTokenizer,
219
+ AutoModelForSequenceClassification,
220
  AutoModelForCausalLM,
221
+ pipeline,
222
  )
223
+ from sentence_transformers import SentenceTransformer
224
  import evaluate
 
 
 
 
 
 
 
 
 
 
 
225
 
226
+ # -----------------------------
227
+ # Global Config
228
+ # -----------------------------
229
+ NLI_MODEL = "microsoft/deberta-v2-xlarge-mnli"
230
+ EMBED_MODEL = "all-MiniLM-L6-v2"
231
+ LLM_JUDGE_MODEL = "microsoft/DialoGPT-small"
232
+ FLUENCY_MODEL = "textattack/roberta-base-CoLA"
233
 
234
+ device = 0 if torch.cuda.is_available() else -1
 
235
 
236
+ # Caches
237
+ _nli_model, _nli_tokenizer = None, None
238
+ _embed_model = None
239
+ _judge_model, _judge_tokenizer = None, None
240
+ _fluency_checker = None
241
 
242
+ # Metrics
243
  bertscore = evaluate.load("bertscore")
244
  bleu = evaluate.load("bleu")
245
  rouge = evaluate.load("rouge")
246
 
 
 
247
 
248
+ # -----------------------------
249
+ # Lazy Model Loaders
250
+ # -----------------------------
251
def get_nli_model():
    """Return the cached NLI ``(model, tokenizer)`` pair, loading both on first use.

    The model is moved to CUDA when ``torch.cuda.is_available()`` reports a
    GPU, otherwise it stays on CPU, and is switched to eval mode before being
    cached in the module-level ``_nli_model`` / ``_nli_tokenizer`` globals.
    """
    global _nli_model, _nli_tokenizer
    if _nli_model is not None:
        # Already loaded by an earlier call: reuse the cached pair.
        return _nli_model, _nli_tokenizer

    _nli_tokenizer = AutoTokenizer.from_pretrained(NLI_MODEL)
    target_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    loaded = AutoModelForSequenceClassification.from_pretrained(NLI_MODEL)
    _nli_model = loaded.to(target_device)
    _nli_model.eval()
    return _nli_model, _nli_tokenizer
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
260
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
261
 
262
def get_embed_model():
    """Return the cached sentence-embedding model, creating it on first use.

    The ``SentenceTransformer`` named by ``EMBED_MODEL`` is placed on CUDA
    when available, otherwise on CPU, and stored in ``_embed_model``.
    """
    global _embed_model
    if _embed_model is not None:
        return _embed_model

    device_name = "cuda" if torch.cuda.is_available() else "cpu"
    _embed_model = SentenceTransformer(EMBED_MODEL, device=device_name)
    return _embed_model
267
+
268
+
269
def get_judge_model():
    """Return the cached LLM-judge ``(model, tokenizer)`` pair, loading on first use.

    Both objects come from ``LLM_JUDGE_MODEL``; the model is moved to CUDA
    when available, otherwise CPU. Results are stored in the module-level
    ``_judge_model`` / ``_judge_tokenizer`` globals.
    """
    global _judge_model, _judge_tokenizer
    if _judge_model is not None:
        return _judge_model, _judge_tokenizer

    _judge_tokenizer = AutoTokenizer.from_pretrained(LLM_JUDGE_MODEL)
    target_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    _judge_model = AutoModelForCausalLM.from_pretrained(LLM_JUDGE_MODEL).to(target_device)
    return _judge_model, _judge_tokenizer
277
+
278
+
279
def get_fluency_checker():
    """Return the cached text-classification pipeline used for fluency scoring.

    The pipeline is built once from ``FLUENCY_MODEL`` on the module-level
    ``device`` and kept in ``_fluency_checker`` for later calls.
    """
    global _fluency_checker
    if _fluency_checker is not None:
        return _fluency_checker

    _fluency_checker = pipeline(
        "text-classification",
        model=FLUENCY_MODEL,
        device=device,
    )
    return _fluency_checker
286
 
287
+
288
+ # -----------------------------
289
+ # Evaluation Functions
290
+ # -----------------------------
291
def check_instruction_following(prompt, response):
    """Score how strongly ``response`` is entailed by ``prompt`` using the NLI model.

    Args:
        prompt: Instruction text, passed to the NLI tokenizer as the first segment.
        response: Agent output, passed as the second segment.

    Returns:
        The entailment probability as a float; 0.0 when either input is
        empty or when model loading / inference raises.
    """
    # Nothing to compare: scoring an empty string would yield a meaningless number.
    if not prompt or not response:
        return 0.0
    try:
        nli_model, nli_tokenizer = get_nli_model()
        inputs = nli_tokenizer(
            prompt,
            response,
            return_tensors="pt",
            truncation=True,
            max_length=512,  # explicit cap so long pairs are actually truncated
            padding=True,
        ).to(nli_model.device)
        with torch.no_grad():
            logits = nli_model(**inputs).logits
        probs = torch.softmax(logits, dim=-1).cpu().numpy()[0]

        # Resolve the entailment class from the model config instead of
        # hard-coding it, so swapping NLI_MODEL cannot silently read the
        # wrong column. Falls back to the previous hard-coded index 2.
        id2label = getattr(nli_model.config, "id2label", None) or {}
        entail_idx = next(
            (int(idx) for idx, name in id2label.items() if "ENTAIL" in str(name).upper()),
            2,
        )
        return float(probs[entail_idx])
    except Exception:
        # Best-effort metric: a failure scores as "not followed" rather than
        # aborting the whole evaluation run.
        return 0.0
304
+
305
+
306
def check_hallucination(reference, response):
    """Score how free ``response`` is of contradiction with ``reference``.

    Args:
        reference: Ground-truth text, passed to the NLI tokenizer as the first segment.
        response: Agent output, passed as the second segment.

    Returns:
        ``1 - P(contradiction)`` as a float (higher means less hallucination);
        0.0 when either input is empty or when model loading / inference raises.
    """
    # Without both texts there is nothing to check against.
    if not reference or not response:
        return 0.0
    try:
        nli_model, nli_tokenizer = get_nli_model()
        inputs = nli_tokenizer(
            reference,
            response,
            return_tensors="pt",
            truncation=True,
            max_length=512,  # explicit cap so long pairs are actually truncated
            padding=True,
        ).to(nli_model.device)
        with torch.no_grad():
            logits = nli_model(**inputs).logits
        probs = torch.softmax(logits, dim=-1).cpu().numpy()[0]

        # Look the contradiction class up in the model config rather than
        # assuming it is column 0; falls back to the previous hard-coded 0.
        id2label = getattr(nli_model.config, "id2label", None) or {}
        contra_idx = next(
            (int(idx) for idx, name in id2label.items() if "CONTRA" in str(name).upper()),
            0,
        )
        return 1.0 - float(probs[contra_idx])
    except Exception:
        # Best-effort metric: failures score as worst case instead of raising.
        return 0.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
319
 
320
+
321
def check_assumption(prompt, response):
    """Ask the judge LLM whether ``response`` makes assumptions not in ``prompt``.

    Args:
        prompt: The original task prompt.
        response: The agent output being judged.

    Returns:
        0.0 if the judge answers "yes" (assumptions present), 1.0 if it
        answers "no", and 0.5 when the answer is neither or any step raises.
    """
    import re  # local import: this module does not import ``re`` at file level

    try:
        judge_model, judge_tokenizer = get_judge_model()
        input_text = f"Does this response make assumptions not in the prompt?\nPrompt: {prompt}\nResponse: {response}\nAnswer yes or no:"
        inputs = judge_tokenizer.encode(input_text, return_tensors="pt").to(judge_model.device)
        outputs = judge_model.generate(
            inputs,
            # Budget for *new* tokens only; ``max_length`` also counts the
            # prompt, which left no room to generate for realistic inputs.
            max_new_tokens=5,
            pad_token_id=judge_tokenizer.eos_token_id,
        )
        # Decode only the continuation. The question itself contains the
        # words "yes" and "no", so decoding the full sequence made the
        # "yes" branch fire on every call.
        continuation = outputs[0][inputs.shape[-1]:]
        judgment = judge_tokenizer.decode(continuation, skip_special_tokens=True).lower()

        # Whole-word matches so "know", "not" or "eyes" do not count as answers.
        if re.search(r"\byes\b", judgment):
            return 0.0
        if re.search(r"\bno\b", judgment):
            return 1.0
        return 0.5
    except Exception:
        # Neutral score when the judge cannot be loaded or run.
        return 0.5
335
+
336
+
337
def check_coherence(response):
    """Estimate coherence as the mean similarity between consecutive sentences.

    The previous implementation returned the mean of the components of a
    unit-normalised embedding, which hovers around zero (and can be
    negative) regardless of the text, so it carried no coherence signal.
    This version embeds each sentence and averages the cosine similarity of
    adjacent pairs, clamped to [0, 1].

    Args:
        response: The agent output to score.

    Returns:
        A float in [0, 1]. 0.0 for empty / non-string input or on failure;
        1.0 for a single sentence (there is no adjacent pair to disagree).
    """
    import re  # local import: this module does not import ``re`` at file level

    if not isinstance(response, str) or not response.strip():
        return 0.0
    try:
        # Split after sentence-ending punctuation followed by whitespace.
        sentences = [s for s in re.split(r"(?<=[.!?])\s+", response.strip()) if s]
        if len(sentences) < 2:
            return 1.0

        emb = get_embed_model().encode(
            sentences, convert_to_tensor=True, normalize_embeddings=True
        )
        # Rows are unit-normalised, so the row-wise dot product of each
        # sentence with its successor is their cosine similarity.
        adjacent_sim = (emb[:-1] * emb[1:]).sum(dim=-1)
        coherence = float(adjacent_sim.mean().cpu().item())
        return max(0.0, min(1.0, coherence))
    except Exception:
        # Best-effort metric: failures score as 0.0 instead of raising.
        return 0.0
344
+
345
+
346
def check_accuracy(reference, response):
    """Average BERTScore F1, BLEU and ROUGE-L of ``response`` against ``reference``.

    Each metric is computed independently; a metric that raises contributes
    0.0 so one failing scorer does not discard the others.

    Returns:
        The unweighted mean of the three scores as a float.
    """

    def _or_zero(compute):
        # Run one metric, mapping any failure to a zero contribution.
        try:
            return compute()
        except Exception:
            return 0.0

    bert_f1 = _or_zero(
        lambda: bertscore.compute(predictions=[response], references=[reference], lang="en")["f1"][0]
    )
    bleu_score = _or_zero(
        lambda: bleu.compute(predictions=[response], references=[[reference]])["bleu"]
    )
    rouge_l = _or_zero(
        lambda: rouge.compute(predictions=[response], references=[reference])["rougeL"]
    )

    total = bert_f1 + bleu_score + rouge_l
    return float(total / 3)
366
+
 
 
 
 
 
367
 
368
def check_relevance(prompt, response):
    """Return the cosine similarity between the embeddings of ``prompt`` and ``response``.

    Both texts are encoded with the shared embedding model. Any failure
    (model load, encoding, similarity) yields 0.0.
    """
    try:
        encoder = get_embed_model()
        prompt_vec = encoder.encode(prompt, convert_to_tensor=True)
        response_vec = encoder.encode(response, convert_to_tensor=True)
        similarity = torch.nn.functional.cosine_similarity(prompt_vec, response_vec, dim=0)
    except Exception:
        return 0.0
    try:
        return float(similarity.item())
    except Exception:
        return 0.0
377
+
378
+
379
def check_fluency(response):
    """Score the fluency of ``response`` with the cached text-classification pipeline.

    Args:
        response: The agent output to score.

    Returns:
        The classifier score when the predicted label is ``"LABEL_1"``
        (treated here as the fluent class), otherwise ``1 - score``.
        Returns 0.5 when the pipeline cannot be loaded or run.
    """
    try:
        fluency_checker = get_fluency_checker()
        # Truncate at the tokenizer level: without this, a response longer
        # than the classifier's input limit raises and silently scores 0.5.
        result = fluency_checker(response, truncation=True, max_length=512)[0]
        score = float(result["score"])
        return score if result["label"] == "LABEL_1" else 1.0 - score
    except Exception:
        # Neutral score on failure so one bad row does not abort the run.
        return 0.5
386
+
387
+
388
+ # -----------------------------
389
+ # Row Evaluation
390
+ # -----------------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
391
  def evaluate_row(row):
392
+ scores = {
393
+ "instruction_following": check_instruction_following(row["prompt"], row["response"]),
394
+ "hallucination": check_hallucination(row["reference"], row["response"]),
395
+ "assumption": check_assumption(row["prompt"], row["response"]),
396
+ "coherence": check_coherence(row["response"]),
397
+ "accuracy": check_accuracy(row["reference"], row["response"]),
398
+ "relevance": check_relevance(row["prompt"], row["response"]),
399
+ "fluency": check_fluency(row["response"]),
 
 
 
 
 
 
400
  }
401
+ scores["final_score"] = np.mean(list(scores.values()))
402
+ return pd.Series(scores)
403
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
404
 
405
+ # -----------------------------
406
+ # Visualization Helpers
407
+ # -----------------------------
408
def plot_radar_chart(metrics_df, out_path="/tmp/radar.png"):
    """Save a radar chart of the mean of each numeric metric column.

    Args:
        metrics_df: DataFrame of per-row scores. Every numeric column except
            ``"final_score"`` becomes one axis.
            NOTE(review): numeric input columns such as a numeric task id
            would also be plotted — confirm against the caller's schema.
        out_path: Where the PNG is written.

    Returns:
        ``(out_path, caption)`` tuple.
    """
    mean_scores = metrics_df.mean(numeric_only=True).drop("final_score", errors="ignore")
    categories = list(mean_scores.index)
    values = mean_scores.values.tolist()

    # One angle per category, computed BEFORE closing the polygon. The
    # earlier version derived the angles from the already-closed category
    # list and then closed them again, leaving one more angle than values
    # and making ax.plot fail with a shape mismatch.
    angles = np.linspace(0, 2 * np.pi, len(categories), endpoint=False).tolist()

    # Repeat the first point so the outline joins back to its start.
    closed_values = values + values[:1]
    closed_angles = angles + angles[:1]

    plt.figure(figsize=(6, 6))
    ax = plt.subplot(111, polar=True)
    ax.plot(closed_angles, closed_values, "o-", linewidth=2)
    ax.fill(closed_angles, closed_values, alpha=0.25)
    if categories:
        # Label each spoke once (the duplicated closing point is excluded).
        ax.set_thetagrids(np.degrees(angles), categories)
    plt.savefig(out_path)
    plt.close()
    return out_path, "Radar Chart (Mean Scores)"
429
 
430
+
431
def plot_leaderboard(metrics_df, out_path="/tmp/leaderboard.png"):
    """Save a bar chart of the average ``final_score`` per agent, best first.

    Returns:
        ``(out_path, caption)`` tuple.
    """
    per_agent = metrics_df.groupby("agent")["final_score"].mean()
    ranked = per_agent.sort_values(ascending=False)

    plt.figure(figsize=(10, 5))
    axes = plt.gca()
    ranked.plot(kind="bar", colormap="Set3", ax=axes)
    plt.title("Leaderboard: Avg Final Score per Agent")
    plt.ylabel("Score")
    plt.tight_layout()
    plt.savefig(out_path)
    plt.close()

    caption = "Leaderboard"
    return out_path, caption
441
+
442
 
443
+ # -----------------------------
444
+ # Main Evaluation Entry
445
+ # -----------------------------
446
  def evaluate_dataframe(df: pd.DataFrame):
447
+ metrics_df = df.apply(evaluate_row, axis=1, result_type="expand")
448
+ metrics_df = pd.concat([df, metrics_df], axis=1)
449
 
 
450
  leaderboard = (
451
+ metrics_df.groupby("agent")["final_score"]
452
  .mean()
453
  .reset_index()
454
+ .sort_values("final_score", ascending=False)
455
  )
456
 
 
457
  images = []
458
+ images.append(plot_radar_chart(metrics_df))
459
+ images.append(plot_leaderboard(metrics_df))
460
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
461
  return metrics_df, images, leaderboard