Developer committed on
Commit
80326a1
·
1 Parent(s): d3be8f6

Add F1Score calculation for adherence metric aggregation

Browse files
Files changed (1) hide show
  1. advanced_rag_evaluator.py +145 -1
advanced_rag_evaluator.py CHANGED
@@ -15,7 +15,7 @@ import json
15
  import re
16
  from dataclasses import dataclass
17
  import numpy as np
18
- from sklearn.metrics import mean_squared_error, roc_auc_score, auc
19
  from sklearn.preprocessing import label_binarize
20
  import warnings
21
 
@@ -549,6 +549,140 @@ class AUCROCCalculator:
549
  return auc_results
550
 
551
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
552
  class DocumentSentencizer:
553
  """Split documents into sentences with keys (0a, 0b, 1a, etc.)."""
554
 
@@ -1065,4 +1199,14 @@ class AdvancedRAGEvaluator:
1065
  if aucroc_vs_ground_truth:
1066
  results["aucroc_vs_ground_truth"] = aucroc_vs_ground_truth
1067
 
 
 
 
 
 
 
 
 
 
 
1068
  return results
 
15
  import re
16
  from dataclasses import dataclass
17
  import numpy as np
18
+ from sklearn.metrics import mean_squared_error, roc_auc_score, auc, f1_score, precision_score, recall_score
19
  from sklearn.preprocessing import label_binarize
20
  import warnings
21
 
 
549
  return auc_results
550
 
551
 
552
+ class F1ScoreCalculator:
553
+ """Calculate F1Score for evaluation metrics (especially for adherence)."""
554
+
555
+ @staticmethod
556
+ def compute_f1_for_metric(predicted: List[float], ground_truth: List[float],
557
+ threshold: float = 0.5) -> Dict[str, float]:
558
+ """Compute F1 Score for a single metric using binary classification.
559
+
560
+ Converts continuous scores to binary labels using threshold, then calculates:
561
+ - Precision: TP / (TP + FP)
562
+ - Recall: TP / (TP + FN)
563
+ - F1 Score: 2 * (Precision * Recall) / (Precision + Recall)
564
+
565
+ Args:
566
+ predicted: List of predicted metric values (0-1)
567
+ ground_truth: List of ground truth metric values (0-1)
568
+ threshold: Threshold for binary classification (default 0.5)
569
+
570
+ Returns:
571
+ Dictionary with F1, Precision, Recall scores
572
+ """
573
+ if len(predicted) != len(ground_truth):
574
+ raise ValueError("Predicted and ground truth must have same length")
575
+
576
+ if len(predicted) <= 1:
577
+ return {"f1_score": 0.0, "precision": 0.0, "recall": 0.0}
578
+
579
+ try:
580
+ # Convert continuous scores to binary labels
581
+ pred_binary = [1 if score >= threshold else 0 for score in predicted]
582
+ truth_binary = [1 if score >= threshold else 0 for score in ground_truth]
583
+
584
+ # Calculate metrics
585
+ f1 = f1_score(truth_binary, pred_binary, zero_division=0)
586
+ precision = precision_score(truth_binary, pred_binary, zero_division=0)
587
+ recall = recall_score(truth_binary, pred_binary, zero_division=0)
588
+
589
+ return {
590
+ "f1_score": float(f1),
591
+ "precision": float(precision),
592
+ "recall": float(recall)
593
+ }
594
+ except Exception as e:
595
+ warnings.warn(f"Error computing F1 Score: {e}")
596
+ return {"f1_score": 0.0, "precision": 0.0, "recall": 0.0}
597
+
598
+ @staticmethod
599
+ def compute_adherence_f1(results: List[Dict]) -> Dict[str, float]:
600
+ """Compute F1 Score specifically for adherence metric aggregation.
601
+
602
+ Adherence is a binary metric (0 or 1), so F1 Score is particularly relevant.
603
+ Measures how well the predicted adherence scores match ground truth.
604
+
605
+ Args:
606
+ results: List of evaluation results with predicted and ground truth scores
607
+
608
+ Returns:
609
+ Dictionary with:
610
+ - adherence_f1: F1 Score for adherence
611
+ - adherence_precision: Precision for adherence
612
+ - adherence_recall: Recall for adherence
613
+ - num_evaluations: Number of evaluations used
614
+ """
615
+ predicted = []
616
+ ground_truth = []
617
+
618
+ for result in results:
619
+ if "metrics" in result and "adherence" in result["metrics"]:
620
+ predicted.append(result["metrics"]["adherence"])
621
+
622
+ if "ground_truth_scores" in result and "adherence" in result["ground_truth_scores"]:
623
+ ground_truth.append(result["ground_truth_scores"]["adherence"])
624
+ else:
625
+ if predicted:
626
+ predicted.pop()
627
+
628
+ if len(predicted) == 0 or len(ground_truth) == 0:
629
+ return {
630
+ "adherence_f1": 0.0,
631
+ "adherence_precision": 0.0,
632
+ "adherence_recall": 0.0,
633
+ "num_evaluations": 0
634
+ }
635
+
636
+ f1_metrics = F1ScoreCalculator.compute_f1_for_metric(predicted, ground_truth)
637
+
638
+ return {
639
+ "adherence_f1": f1_metrics["f1_score"],
640
+ "adherence_precision": f1_metrics["precision"],
641
+ "adherence_recall": f1_metrics["recall"],
642
+ "num_evaluations": len(predicted)
643
+ }
644
+
645
+ @staticmethod
646
+ def compute_f1_all_metrics(results: List[Dict]) -> Dict[str, float]:
647
+ """Compute F1 Score for all TRACE metrics.
648
+
649
+ Args:
650
+ results: List of evaluation results with predicted and ground truth scores
651
+
652
+ Returns:
653
+ Dictionary mapping metric names to F1 Scores with precision/recall
654
+ """
655
+ metrics = ["context_relevance", "context_utilization", "completeness", "adherence"]
656
+ f1_results = {}
657
+
658
+ for metric in metrics:
659
+ predicted = []
660
+ ground_truth = []
661
+
662
+ for result in results:
663
+ if "metrics" in result and metric in result["metrics"]:
664
+ predicted.append(result["metrics"][metric])
665
+
666
+ if "ground_truth_scores" in result and metric in result["ground_truth_scores"]:
667
+ ground_truth.append(result["ground_truth_scores"][metric])
668
+ else:
669
+ if predicted:
670
+ predicted.pop()
671
+
672
+ if len(predicted) > 0 and len(ground_truth) > 0:
673
+ f1_metrics = F1ScoreCalculator.compute_f1_for_metric(predicted, ground_truth)
674
+ f1_results[f"{metric}_f1"] = f1_metrics["f1_score"]
675
+ f1_results[f"{metric}_precision"] = f1_metrics["precision"]
676
+ f1_results[f"{metric}_recall"] = f1_metrics["recall"]
677
+
678
+ # Compute average F1 across all metrics
679
+ f1_scores = [v for k, v in f1_results.items() if k.endswith("_f1")]
680
+ if f1_scores:
681
+ f1_results["average_f1"] = float(np.mean(f1_scores))
682
+
683
+ return f1_results
684
+
685
+
686
  class DocumentSentencizer:
687
  """Split documents into sentences with keys (0a, 0b, 1a, etc.)."""
688
 
 
1199
  if aucroc_vs_ground_truth:
1200
  results["aucroc_vs_ground_truth"] = aucroc_vs_ground_truth
1201
 
1202
+ # Compute F1 Score for adherence aggregation
1203
+ adherence_f1_scores = F1ScoreCalculator.compute_adherence_f1(detailed_results)
1204
+ if adherence_f1_scores:
1205
+ results["adherence_f1_scores"] = adherence_f1_scores
1206
+
1207
+ # Compute F1 Scores for all metrics
1208
+ f1_all_metrics = F1ScoreCalculator.compute_f1_all_metrics(detailed_results)
1209
+ if f1_all_metrics:
1210
+ results["f1_scores"] = f1_all_metrics
1211
+
1212
  return results