Spaces:
Sleeping
Sleeping
Developer committed on
Commit ·
80326a1
1
Parent(s): d3be8f6
Add F1Score calculation for adherence metric aggregation
Browse files- advanced_rag_evaluator.py +145 -1
advanced_rag_evaluator.py
CHANGED
|
@@ -15,7 +15,7 @@ import json
|
|
| 15 |
import re
|
| 16 |
from dataclasses import dataclass
|
| 17 |
import numpy as np
|
| 18 |
-
from sklearn.metrics import mean_squared_error, roc_auc_score, auc
|
| 19 |
from sklearn.preprocessing import label_binarize
|
| 20 |
import warnings
|
| 21 |
|
|
@@ -549,6 +549,140 @@ class AUCROCCalculator:
|
|
| 549 |
return auc_results
|
| 550 |
|
| 551 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 552 |
class DocumentSentencizer:
|
| 553 |
"""Split documents into sentences with keys (0a, 0b, 1a, etc.)."""
|
| 554 |
|
|
@@ -1065,4 +1199,14 @@ class AdvancedRAGEvaluator:
|
|
| 1065 |
if aucroc_vs_ground_truth:
|
| 1066 |
results["aucroc_vs_ground_truth"] = aucroc_vs_ground_truth
|
| 1067 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1068 |
return results
|
|
|
|
| 15 |
import re
|
| 16 |
from dataclasses import dataclass
|
| 17 |
import numpy as np
|
| 18 |
+
from sklearn.metrics import mean_squared_error, roc_auc_score, auc, f1_score, precision_score, recall_score
|
| 19 |
from sklearn.preprocessing import label_binarize
|
| 20 |
import warnings
|
| 21 |
|
|
|
|
| 549 |
return auc_results
|
| 550 |
|
| 551 |
|
| 552 |
+
class F1ScoreCalculator:
    """Calculate F1 Score for evaluation metrics (especially for adherence)."""

    @staticmethod
    def compute_f1_for_metric(predicted: List[float], ground_truth: List[float],
                              threshold: float = 0.5) -> Dict[str, float]:
        """Compute F1 Score for a single metric using binary classification.

        Converts continuous scores to binary labels using threshold, then calculates:
        - Precision: TP / (TP + FP)
        - Recall: TP / (TP + FN)
        - F1 Score: 2 * (Precision * Recall) / (Precision + Recall)

        Args:
            predicted: List of predicted metric values (0-1)
            ground_truth: List of ground truth metric values (0-1)
            threshold: Threshold for binary classification (default 0.5)

        Returns:
            Dictionary with F1, Precision, Recall scores

        Raises:
            ValueError: If predicted and ground_truth differ in length.
        """
        if len(predicted) != len(ground_truth):
            raise ValueError("Predicted and ground truth must have same length")

        # F1 over zero or one sample is not meaningful; report zeros.
        if len(predicted) <= 1:
            return {"f1_score": 0.0, "precision": 0.0, "recall": 0.0}

        try:
            # Convert continuous scores to binary labels
            pred_binary = [1 if score >= threshold else 0 for score in predicted]
            truth_binary = [1 if score >= threshold else 0 for score in ground_truth]

            # zero_division=0 avoids warnings/NaN when a class never occurs.
            f1 = f1_score(truth_binary, pred_binary, zero_division=0)
            precision = precision_score(truth_binary, pred_binary, zero_division=0)
            recall = recall_score(truth_binary, pred_binary, zero_division=0)

            return {
                "f1_score": float(f1),
                "precision": float(precision),
                "recall": float(recall)
            }
        except Exception as e:
            # Best-effort aggregation: never let one bad metric kill the run.
            warnings.warn(f"Error computing F1 Score: {e}")
            return {"f1_score": 0.0, "precision": 0.0, "recall": 0.0}

    @staticmethod
    def _collect_aligned_pairs(results: List[Dict], metric: str):
        """Collect index-aligned (predicted, ground_truth) score lists for *metric*.

        A result contributes a pair ONLY when both its predicted score
        (result["metrics"][metric]) and its ground-truth score
        (result["ground_truth_scores"][metric]) are present. The previous
        append-then-pop scheme could desynchronize the two lists: a result
        with a ground truth but no prediction appended an unmatched ground
        truth, and a result with neither popped a previously matched
        prediction.

        Args:
            results: List of evaluation result dicts.
            metric: Metric key to extract (e.g. "adherence").

        Returns:
            Tuple of (predicted, ground_truth) lists of equal length.
        """
        predicted: List[float] = []
        ground_truth: List[float] = []
        for result in results:
            pred = result.get("metrics", {}).get(metric)
            truth = result.get("ground_truth_scores", {}).get(metric)
            if pred is not None and truth is not None:
                predicted.append(pred)
                ground_truth.append(truth)
        return predicted, ground_truth

    @staticmethod
    def compute_adherence_f1(results: List[Dict]) -> Dict[str, float]:
        """Compute F1 Score specifically for adherence metric aggregation.

        Adherence is a binary metric (0 or 1), so F1 Score is particularly relevant.
        Measures how well the predicted adherence scores match ground truth.

        Args:
            results: List of evaluation results with predicted and ground truth scores

        Returns:
            Dictionary with:
            - adherence_f1: F1 Score for adherence
            - adherence_precision: Precision for adherence
            - adherence_recall: Recall for adherence
            - num_evaluations: Number of evaluations used
        """
        predicted, ground_truth = F1ScoreCalculator._collect_aligned_pairs(
            results, "adherence")

        if not predicted:
            return {
                "adherence_f1": 0.0,
                "adherence_precision": 0.0,
                "adherence_recall": 0.0,
                "num_evaluations": 0
            }

        f1_metrics = F1ScoreCalculator.compute_f1_for_metric(predicted, ground_truth)

        return {
            "adherence_f1": f1_metrics["f1_score"],
            "adherence_precision": f1_metrics["precision"],
            "adherence_recall": f1_metrics["recall"],
            "num_evaluations": len(predicted)
        }

    @staticmethod
    def compute_f1_all_metrics(results: List[Dict]) -> Dict[str, float]:
        """Compute F1 Score for all TRACE metrics.

        Args:
            results: List of evaluation results with predicted and ground truth scores

        Returns:
            Dictionary mapping metric names to F1 Scores with precision/recall,
            plus "average_f1" across the metrics that had data.
        """
        metrics = ["context_relevance", "context_utilization", "completeness", "adherence"]
        f1_results: Dict[str, float] = {}

        for metric in metrics:
            predicted, ground_truth = F1ScoreCalculator._collect_aligned_pairs(
                results, metric)

            if predicted:
                f1_metrics = F1ScoreCalculator.compute_f1_for_metric(predicted, ground_truth)
                f1_results[f"{metric}_f1"] = f1_metrics["f1_score"]
                f1_results[f"{metric}_precision"] = f1_metrics["precision"]
                f1_results[f"{metric}_recall"] = f1_metrics["recall"]

        # Compute average F1 across all metrics that produced a score
        f1_scores = [v for k, v in f1_results.items() if k.endswith("_f1")]
        if f1_scores:
            f1_results["average_f1"] = float(np.mean(f1_scores))

        return f1_results
|
| 684 |
+
|
| 685 |
+
|
| 686 |
class DocumentSentencizer:
|
| 687 |
"""Split documents into sentences with keys (0a, 0b, 1a, etc.)."""
|
| 688 |
|
|
|
|
| 1199 |
if aucroc_vs_ground_truth:
|
| 1200 |
results["aucroc_vs_ground_truth"] = aucroc_vs_ground_truth
|
| 1201 |
|
| 1202 |
+
# Compute F1 Score for adherence aggregation
|
| 1203 |
+
adherence_f1_scores = F1ScoreCalculator.compute_adherence_f1(detailed_results)
|
| 1204 |
+
if adherence_f1_scores:
|
| 1205 |
+
results["adherence_f1_scores"] = adherence_f1_scores
|
| 1206 |
+
|
| 1207 |
+
# Compute F1 Scores for all metrics
|
| 1208 |
+
f1_all_metrics = F1ScoreCalculator.compute_f1_all_metrics(detailed_results)
|
| 1209 |
+
if f1_all_metrics:
|
| 1210 |
+
results["f1_scores"] = f1_all_metrics
|
| 1211 |
+
|
| 1212 |
return results
|