#!/usr/bin/env python
"""Test RMSE aggregation for TRACE metrics."""

from advanced_rag_evaluator import AdvancedTRACEScores, RMSECalculator

# Test 1: Perfect consistency
print('Test 1: Perfect Consistency')
scores1 = AdvancedTRACEScores(
    context_relevance=0.85,
    context_utilization=0.85,
    completeness=0.85,
    adherence=0.85,
    overall_supported=True,
    num_fully_supported_sentences=8,
    num_partially_supported_sentences=0,
    num_unsupported_sentences=0
)
rmse1 = scores1.rmse_aggregation()
print(f' RMSE: {rmse1:.4f} (expected: ~0.0000)')
assert rmse1 < 0.001, f"Expected ~0, got {rmse1}"

# Test 2: Imbalanced metrics
# With metrics 0.95, 0.50, 0.85, 0.70 the mean is 0.75; if rmse_aggregation()
# is the RMSE of each metric's deviation from that mean, the result is
# sqrt((0.20^2 + 0.25^2 + 0.10^2 + 0.05^2) / 4) ~= 0.1696, hence the ~0.17 bound.
print('\nTest 2: Imbalanced Metrics')
scores2 = AdvancedTRACEScores(
    context_relevance=0.95,
    context_utilization=0.50,
    completeness=0.85,
    adherence=0.70,
    overall_supported=True,
    num_fully_supported_sentences=6,
    num_partially_supported_sentences=2,
    num_unsupported_sentences=1
)
rmse2 = scores2.rmse_aggregation()
print(f' RMSE: {rmse2:.4f} (expected: ~0.17)')
assert 0.14 < rmse2 < 0.18, f"Expected ~0.17, got {rmse2}"

# Test 3: to_dict includes rmse_aggregation
print('\nTest 3: JSON Output')
scores_dict = scores2.to_dict()
has_rmse = 'rmse_aggregation' in scores_dict
print(f' rmse_aggregation in dict: {has_rmse}')
print(f' Value: {scores_dict["rmse_aggregation"]:.4f}')
assert has_rmse, "rmse_aggregation not in dict output"

# Test 4: Single evaluation comparison against ground truth
print('\nTest 4: Single Evaluation Comparison')
predicted = AdvancedTRACEScores(
    context_relevance=0.85,
    context_utilization=0.80,
    completeness=0.88,
    adherence=0.82,
    overall_supported=True,
    num_fully_supported_sentences=8,
    num_partially_supported_sentences=1,
    num_unsupported_sentences=0
)
ground_truth = AdvancedTRACEScores(
    context_relevance=0.84,
    context_utilization=0.82,
    completeness=0.87,
    adherence=0.80,
    overall_supported=True,
    num_fully_supported_sentences=9,
    num_partially_supported_sentences=0,
    num_unsupported_sentences=0
)
comparison = RMSECalculator.compute_rmse_single_trace_evaluation(predicted, ground_truth)
print(f' Per-metric RMSE: {comparison["per_metric"]}')
print(f' Aggregated RMSE: {comparison["aggregated_rmse"]:.4f}')
assert "per_metric" in comparison, "Missing per_metric in result"
assert "aggregated_rmse" in comparison, "Missing aggregated_rmse in result"

# Test 5: Batch aggregation across multiple evaluations
print('\nTest 5: Batch RMSE Aggregation')
batch_results = [
    {
        "metrics": {
            "context_relevance": 0.85,
            "context_utilization": 0.80,
            "completeness": 0.88,
            "adherence": 0.82
        },
        "ground_truth_scores": {
            "context_relevance": 0.84,
            "context_utilization": 0.82,
            "completeness": 0.87,
            "adherence": 0.80
        }
    },
    {
        "metrics": {
            "context_relevance": 0.90,
            "context_utilization": 0.75,
            "completeness": 0.85,
            "adherence": 0.88
        },
        "ground_truth_scores": {
            "context_relevance": 0.88,
            "context_utilization": 0.78,
            "completeness": 0.84,
            "adherence": 0.86
        }
    }
]
batch_rmse = RMSECalculator.compute_trace_rmse_aggregation(batch_results)
print(f' Per-metric RMSE: {batch_rmse["per_metric_rmse"]}')
print(f' Aggregated RMSE: {batch_rmse["aggregated_rmse"]:.4f}')
print(f' Consistency Score: {batch_rmse["consistency_score"]:.4f}')
print(f' Num Evaluations: {batch_rmse["num_evaluations"]}')
assert batch_rmse["num_evaluations"] == 2, "Should process 2 evaluations"
assert 0 <= batch_rmse["consistency_score"] <= 1, "Score should be 0-1"

print('\n✓ All tests passed successfully!')
print('\nImplementation Summary:')
print(' - rmse_aggregation() computes consistency within a single evaluation')
print(' - compute_rmse_single_trace_evaluation() compares predicted scores to ground truth')
print(' - compute_trace_rmse_aggregation() processes batch evaluations')
print(' - All metrics automatically included in JSON output')
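
# --- Reference sketch (hypothetical; not part of advanced_rag_evaluator) ---
# A minimal model of the consistency semantics the tests above assume:
# rmse_aggregation() is treated as the RMSE of the four core TRACE metrics
# (context_relevance, context_utilization, completeness, adherence) around
# their mean, so perfectly balanced metrics score 0.0. This helper only
# cross-checks the expected values used in Tests 1 and 2.
def _reference_rmse_aggregation(metrics):
    """RMSE of each metric's deviation from the metrics' mean (assumed semantics)."""
    mean = sum(metrics) / len(metrics)
    return (sum((m - mean) ** 2 for m in metrics) / len(metrics)) ** 0.5

assert _reference_rmse_aggregation([0.85, 0.85, 0.85, 0.85]) < 0.001
assert 0.14 < _reference_rmse_aggregation([0.95, 0.50, 0.85, 0.70]) < 0.18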