# NOTE(review): removed "Spaces: Sleeping" page-chrome residue left over from the
# original web export; it is not part of the script.
#!/usr/bin/env python
"""Test RMSE aggregation for TRACE metrics.

Exercises three entry points of the RMSE feature of ``advanced_rag_evaluator``:

* ``AdvancedTRACEScores.rmse_aggregation()`` -- consistency of the four metric
  values within a single evaluation (0.0 means all metrics agree).
* ``RMSECalculator.compute_rmse_single_trace_evaluation()`` -- per-metric and
  aggregated RMSE of one predicted score set against a ground-truth score set.
* ``RMSECalculator.compute_trace_rmse_aggregation()`` -- the same comparison
  aggregated over a batch of evaluation dicts.

Run as a script; any failure raises AssertionError.
"""
from advanced_rag_evaluator import AdvancedTRACEScores, RMSECalculator


def main():
    """Run the five RMSE checks in order; print progress and assert results."""
    # Test 1: all four metric values are equal, so the within-evaluation
    # spread is zero and rmse_aggregation() should return ~0.
    print('Test 1: Perfect Consistency')
    scores1 = AdvancedTRACEScores(
        context_relevance=0.85,
        context_utilization=0.85,
        completeness=0.85,
        adherence=0.85,
        overall_supported=True,
        num_fully_supported_sentences=8,
        num_partially_supported_sentences=0,
        num_unsupported_sentences=0
    )
    rmse1 = scores1.rmse_aggregation()
    print(f' RMSE: {rmse1:.4f} (expected: ~0.0000)')
    assert rmse1 < 0.001, f"Expected ~0, got {rmse1}"

    # Test 2: deliberately spread-out metric values (0.50 .. 0.95) should
    # produce a clearly non-zero RMSE, expected around 0.17.
    print('\nTest 2: Imbalanced Metrics')
    scores2 = AdvancedTRACEScores(
        context_relevance=0.95,
        context_utilization=0.50,
        completeness=0.85,
        adherence=0.70,
        overall_supported=True,
        num_fully_supported_sentences=6,
        num_partially_supported_sentences=2,
        num_unsupported_sentences=1
    )
    rmse2 = scores2.rmse_aggregation()
    print(f' RMSE: {rmse2:.4f} (expected: ~0.17)')
    assert 0.14 < rmse2 < 0.18, f"Expected ~0.17, got {rmse2}"

    # Test 3: the serialized form must carry the rmse_aggregation value so it
    # reaches JSON output automatically.
    print('\nTest 3: JSON Output')
    scores_dict = scores2.to_dict()
    has_rmse = 'rmse_aggregation' in scores_dict
    print(f' rmse_aggregation in dict: {has_rmse}')
    print(f' Value: {scores_dict["rmse_aggregation"]:.4f}')
    assert has_rmse, "rmse_aggregation not in dict output"

    # Test 4: compare one predicted score set against a close ground truth;
    # the result dict must expose both per-metric and aggregated RMSE.
    print('\nTest 4: Single Evaluation Comparison')
    predicted = AdvancedTRACEScores(
        context_relevance=0.85,
        context_utilization=0.80,
        completeness=0.88,
        adherence=0.82,
        overall_supported=True,
        num_fully_supported_sentences=8,
        num_partially_supported_sentences=1,
        num_unsupported_sentences=0
    )
    ground_truth = AdvancedTRACEScores(
        context_relevance=0.84,
        context_utilization=0.82,
        completeness=0.87,
        adherence=0.80,
        overall_supported=True,
        num_fully_supported_sentences=9,
        num_partially_supported_sentences=0,
        num_unsupported_sentences=0
    )
    comparison = RMSECalculator.compute_rmse_single_trace_evaluation(predicted, ground_truth)
    print(f' Per-metric RMSE: {comparison["per_metric"]}')
    print(f' Aggregated RMSE: {comparison["aggregated_rmse"]:.4f}')
    assert "per_metric" in comparison, "Missing per_metric in result"
    assert "aggregated_rmse" in comparison, "Missing aggregated_rmse in result"

    # Test 5: batch aggregation over two evaluation dicts, each pairing
    # "metrics" (predicted) with "ground_truth_scores".
    print('\nTest 5: Batch RMSE Aggregation')
    batch_results = [
        {
            "metrics": {
                "context_relevance": 0.85,
                "context_utilization": 0.80,
                "completeness": 0.88,
                "adherence": 0.82
            },
            "ground_truth_scores": {
                "context_relevance": 0.84,
                "context_utilization": 0.82,
                "completeness": 0.87,
                "adherence": 0.80
            }
        },
        {
            "metrics": {
                "context_relevance": 0.90,
                "context_utilization": 0.75,
                "completeness": 0.85,
                "adherence": 0.88
            },
            "ground_truth_scores": {
                "context_relevance": 0.88,
                "context_utilization": 0.78,
                "completeness": 0.84,
                "adherence": 0.86
            }
        }
    ]
    batch_rmse = RMSECalculator.compute_trace_rmse_aggregation(batch_results)
    print(f' Per-metric RMSE: {batch_rmse["per_metric_rmse"]}')
    print(f' Aggregated RMSE: {batch_rmse["aggregated_rmse"]:.4f}')
    print(f' Consistency Score: {batch_rmse["consistency_score"]:.4f}')
    print(f' Num Evaluations: {batch_rmse["num_evaluations"]}')
    assert batch_rmse["num_evaluations"] == 2, "Should process 2 evaluations"
    assert 0 <= batch_rmse["consistency_score"] <= 1, "Score should be 0-1"

    print('\n✓ All tests passed successfully!')
    print('\nImplementation Summary:')
    print(' - rmse_aggregation() computes consistency within single evaluation')
    print(' - compute_rmse_single_trace_evaluation() compares to ground truth')
    print(' - compute_trace_rmse_aggregation() processes batch evaluations')
    print(' - All metrics automatically included in JSON output')


if __name__ == "__main__":
    main()