Developer committed on
Commit
80326a1
·
1 Parent(s): d3be8f6

Add F1Score calculation for adherence metric aggregation

Browse files
Files changed (1) hide show
  1. advanced_rag_evaluator.py +145 -1
advanced_rag_evaluator.py CHANGED
@@ -15,7 +15,7 @@ import json
15
  import re
16
  from dataclasses import dataclass
17
  import numpy as np
18
- from sklearn.metrics import mean_squared_error, roc_auc_score, auc
19
  from sklearn.preprocessing import label_binarize
20
  import warnings
21
 
@@ -549,6 +549,140 @@ class AUCROCCalculator:
549
  return auc_results
550
 
551
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
552
  class DocumentSentencizer:
553
  """Split documents into sentences with keys (0a, 0b, 1a, etc.)."""
554
 
@@ -1065,4 +1199,14 @@ class AdvancedRAGEvaluator:
1065
  if aucroc_vs_ground_truth:
1066
  results["aucroc_vs_ground_truth"] = aucroc_vs_ground_truth
1067
 
 
 
 
 
 
 
 
 
 
 
1068
  return results
 
15
  import re
16
  from dataclasses import dataclass
17
  import numpy as np
18
+ from sklearn.metrics import mean_squared_error, roc_auc_score, auc, f1_score, precision_score, recall_score
19
  from sklearn.preprocessing import label_binarize
20
  import warnings
21
 
 
549
  return auc_results
550
 
551
 
552
+ class F1ScoreCalculator:
553
+ """Calculate F1Score for evaluation metrics (especially for adherence)."""
554
+
555
+ @staticmethod
556
+ def compute_f1_for_metric(predicted: List[float], ground_truth: List[float],
557
+ threshold: float = 0.5) -> Dict[str, float]:
558
+ """Compute F1 Score for a single metric using binary classification.
559
+
560
+ Converts continuous scores to binary labels using threshold, then calculates:
561
+ - Precision: TP / (TP + FP)
562
+ - Recall: TP / (TP + FN)
563
+ - F1 Score: 2 * (Precision * Recall) / (Precision + Recall)
564
+
565
+ Args:
566
+ predicted: List of predicted metric values (0-1)
567
+ ground_truth: List of ground truth metric values (0-1)
568
+ threshold: Threshold for binary classification (default 0.5)
569
+
570
+ Returns:
571
+ Dictionary with F1, Precision, Recall scores
572
+ """
573
+ if len(predicted) != len(ground_truth):
574
+ raise ValueError("Predicted and ground truth must have same length")
575
+
576
+ if len(predicted) <= 1:
577
+ return {"f1_score": 0.0, "precision": 0.0, "recall": 0.0}
578
+
579
+ try:
580
+ # Convert continuous scores to binary labels
581
+ pred_binary = [1 if score >= threshold else 0 for score in predicted]
582
+ truth_binary = [1 if score >= threshold else 0 for score in ground_truth]
583
+
584
+ # Calculate metrics
585
+ f1 = f1_score(truth_binary, pred_binary, zero_division=0)
586
+ precision = precision_score(truth_binary, pred_binary, zero_division=0)
587
+ recall = recall_score(truth_binary, pred_binary, zero_division=0)
588
+
589
+ return {
590
+ "f1_score": float(f1),
591
+ "precision": float(precision),
592
+ "recall": float(recall)
593
+ }
594
+ except Exception as e:
595
+ warnings.warn(f"Error computing F1 Score: {e}")
596
+ return {"f1_score": 0.0, "precision": 0.0, "recall": 0.0}
597
+
598
+ @staticmethod
599
+ def compute_adherence_f1(results: List[Dict]) -> Dict[str, float]:
600
+ """Compute F1 Score specifically for adherence metric aggregation.
601
+
602
+ Adherence is a binary metric (0 or 1), so F1 Score is particularly relevant.
603
+ Measures how well the predicted adherence scores match ground truth.
604
+
605
+ Args:
606
+ results: List of evaluation results with predicted and ground truth scores
607
+
608
+ Returns:
609
+ Dictionary with:
610
+ - adherence_f1: F1 Score for adherence
611
+ - adherence_precision: Precision for adherence
612
+ - adherence_recall: Recall for adherence
613
+ - num_evaluations: Number of evaluations used
614
+ """
615
+ predicted = []
616
+ ground_truth = []
617
+
618
+ for result in results:
619
+ if "metrics" in result and "adherence" in result["metrics"]:
620
+ predicted.append(result["metrics"]["adherence"])
621
+
622
+ if "ground_truth_scores" in result and "adherence" in result["ground_truth_scores"]:
623
+ ground_truth.append(result["ground_truth_scores"]["adherence"])
624
+ else:
625
+ if predicted:
626
+ predicted.pop()
627
+
628
+ if len(predicted) == 0 or len(ground_truth) == 0:
629
+ return {
630
+ "adherence_f1": 0.0,
631
+ "adherence_precision": 0.0,
632
+ "adherence_recall": 0.0,
633
+ "num_evaluations": 0
634
+ }
635
+
636
+ f1_metrics = F1ScoreCalculator.compute_f1_for_metric(predicted, ground_truth)
637
+
638
+ return {
639
+ "adherence_f1": f1_metrics["f1_score"],
640
+ "adherence_precision": f1_metrics["precision"],
641
+ "adherence_recall": f1_metrics["recall"],
642
+ "num_evaluations": len(predicted)
643
+ }
644
+
645
+ @staticmethod
646
+ def compute_f1_all_metrics(results: List[Dict]) -> Dict[str, float]:
647
+ """Compute F1 Score for all TRACE metrics.
648
+
649
+ Args:
650
+ results: List of evaluation results with predicted and ground truth scores
651
+
652
+ Returns:
653
+ Dictionary mapping metric names to F1 Scores with precision/recall
654
+ """
655
+ metrics = ["context_relevance", "context_utilization", "completeness", "adherence"]
656
+ f1_results = {}
657
+
658
+ for metric in metrics:
659
+ predicted = []
660
+ ground_truth = []
661
+
662
+ for result in results:
663
+ if "metrics" in result and metric in result["metrics"]:
664
+ predicted.append(result["metrics"][metric])
665
+
666
+ if "ground_truth_scores" in result and metric in result["ground_truth_scores"]:
667
+ ground_truth.append(result["ground_truth_scores"][metric])
668
+ else:
669
+ if predicted:
670
+ predicted.pop()
671
+
672
+ if len(predicted) > 0 and len(ground_truth) > 0:
673
+ f1_metrics = F1ScoreCalculator.compute_f1_for_metric(predicted, ground_truth)
674
+ f1_results[f"{metric}_f1"] = f1_metrics["f1_score"]
675
+ f1_results[f"{metric}_precision"] = f1_metrics["precision"]
676
+ f1_results[f"{metric}_recall"] = f1_metrics["recall"]
677
+
678
+ # Compute average F1 across all metrics
679
+ f1_scores = [v for k, v in f1_results.items() if k.endswith("_f1")]
680
+ if f1_scores:
681
+ f1_results["average_f1"] = float(np.mean(f1_scores))
682
+
683
+ return f1_results
684
+
685
+
686
  class DocumentSentencizer:
687
  """Split documents into sentences with keys (0a, 0b, 1a, etc.)."""
688
 
 
1199
  if aucroc_vs_ground_truth:
1200
  results["aucroc_vs_ground_truth"] = aucroc_vs_ground_truth
1201
 
1202
+ # Compute F1 Score for adherence aggregation
1203
+ adherence_f1_scores = F1ScoreCalculator.compute_adherence_f1(detailed_results)
1204
+ if adherence_f1_scores:
1205
+ results["adherence_f1_scores"] = adherence_f1_scores
1206
+
1207
+ # Compute F1 Scores for all metrics
1208
+ f1_all_metrics = F1ScoreCalculator.compute_f1_all_metrics(detailed_results)
1209
+ if f1_all_metrics:
1210
+ results["f1_scores"] = f1_all_metrics
1211
+
1212
  return results