#!/usr/bin/env python
"""Test RMSE aggregation for TRACE metrics."""
from advanced_rag_evaluator import AdvancedTRACEScores, RMSECalculator
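# AdvancedTRACEScores bundles the four TRACE metrics (context relevance, context
# utilization, completeness, adherence) plus sentence-support counts;
# RMSECalculator exposes the RMSE aggregation helpers exercised below.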
# Test 1: Perfect consistency
print('Test 1: Perfect Consistency')
scores1 = AdvancedTRACEScores(
    context_relevance=0.85,
    context_utilization=0.85,
    completeness=0.85,
    adherence=0.85,
    overall_supported=True,
    num_fully_supported_sentences=8,
    num_partially_supported_sentences=0,
    num_unsupported_sentences=0
)
rmse1 = scores1.rmse_aggregation()
print(f' RMSE: {rmse1:.4f} (expected: ~0.0000)')
assert rmse1 < 0.001, f"Expected ~0, got {rmse1}"
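# Assumed aggregation (inferred from the expected values, not copied from the
# implementation): rmse_aggregation() is the RMSE of the four core metrics
# around their mean, so four identical scores of 0.85 deviate by 0 and give ~0.0.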
# Test 2: Imbalanced metrics
print('\nTest 2: Imbalanced Metrics')
scores2 = AdvancedTRACEScores(
    context_relevance=0.95,
    context_utilization=0.50,
    completeness=0.85,
    adherence=0.70,
    overall_supported=True,
    num_fully_supported_sentences=6,
    num_partially_supported_sentences=2,
    num_unsupported_sentences=1
)
rmse2 = scores2.rmse_aggregation()
print(f' RMSE: {rmse2:.4f} (expected: ~0.17)')
assert 0.14 < rmse2 < 0.18, f"Expected ~0.17, got {rmse2}"
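# Worked example under the same assumption: mean = (0.95 + 0.50 + 0.85 + 0.70) / 4 = 0.75,
# squared deviations = [0.04, 0.0625, 0.01, 0.0025], RMSE = sqrt(0.115 / 4) ≈ 0.1696,
# which falls inside the asserted 0.14-0.18 window.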
# Test 3: to_dict includes rmse_aggregation
print('\nTest 3: JSON Output')
scores_dict = scores2.to_dict()
has_rmse = 'rmse_aggregation' in scores_dict
print(f' rmse_aggregation in dict: {has_rmse}')
assert has_rmse, "rmse_aggregation not in dict output"
print(f' Value: {scores_dict["rmse_aggregation"]:.4f}')
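# Only the 'rmse_aggregation' key is asserted here; the rest of the to_dict()
# output is not checked by this test.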
# Test 4: Single evaluation comparison
print('\nTest 4: Single Evaluation Comparison')
predicted = AdvancedTRACEScores(
    context_relevance=0.85,
    context_utilization=0.80,
    completeness=0.88,
    adherence=0.82,
    overall_supported=True,
    num_fully_supported_sentences=8,
    num_partially_supported_sentences=1,
    num_unsupported_sentences=0
)
ground_truth = AdvancedTRACEScores(
    context_relevance=0.84,
    context_utilization=0.82,
    completeness=0.87,
    adherence=0.80,
    overall_supported=True,
    num_fully_supported_sentences=9,
    num_partially_supported_sentences=0,
    num_unsupported_sentences=0
)
comparison = RMSECalculator.compute_rmse_single_trace_evaluation(predicted, ground_truth)
assert "per_metric" in comparison, "Missing per_metric in result"
assert "aggregated_rmse" in comparison, "Missing aggregated_rmse in result"
print(f' Per-metric RMSE: {comparison["per_metric"]}')
print(f' Aggregated RMSE: {comparison["aggregated_rmse"]:.4f}')
# Test 5: Batch aggregation
print('\nTest 5: Batch RMSE Aggregation')
batch_results = [
    {
        "metrics": {
            "context_relevance": 0.85,
            "context_utilization": 0.80,
            "completeness": 0.88,
            "adherence": 0.82
        },
        "ground_truth_scores": {
            "context_relevance": 0.84,
            "context_utilization": 0.82,
            "completeness": 0.87,
            "adherence": 0.80
        }
    },
    {
        "metrics": {
            "context_relevance": 0.90,
            "context_utilization": 0.75,
            "completeness": 0.85,
            "adherence": 0.88
        },
        "ground_truth_scores": {
            "context_relevance": 0.88,
            "context_utilization": 0.78,
            "completeness": 0.84,
            "adherence": 0.86
        }
    }
]
batch_rmse = RMSECalculator.compute_trace_rmse_aggregation(batch_results)
print(f' Per-metric RMSE: {batch_rmse["per_metric_rmse"]}')
print(f' Aggregated RMSE: {batch_rmse["aggregated_rmse"]:.4f}')
print(f' Consistency Score: {batch_rmse["consistency_score"]:.4f}')
print(f' Num Evaluations: {batch_rmse["num_evaluations"]}')
assert batch_rmse["num_evaluations"] == 2, "Should process 2 evaluations"
assert 0 <= batch_rmse["consistency_score"] <= 1, "Score should be 0-1"
print('\n✓ All tests passed successfully!')
print('\nImplementation Summary:')
print(' - rmse_aggregation() computes consistency within single evaluation')
print(' - compute_rmse_single_trace_evaluation() compares to ground truth')
print(' - compute_trace_rmse_aggregation() processes batch evaluations')
print(' - All metrics automatically included in JSON output')