Spaces:
Sleeping
Sleeping
| # test_properties_metrics.py | |
| """ | |
| Property-based tests for verification metrics calculator. | |
| Tests that metrics are calculated correctly across all inputs. | |
| """ | |
| import pytest | |
| from hypothesis import given, strategies as st, settings, HealthCheck | |
| from datetime import datetime | |
| from src.core.verification_models import VerificationRecord | |
| from src.core.verification_metrics import VerificationMetricsCalculator | |
def verification_record_strategy():
    """Hypothesis strategy producing randomly-populated VerificationRecord objects.

    Decision/label fields are drawn independently, so a record's
    ``is_correct`` flag is random and not derived from whether
    ``classifier_decision`` matches ``ground_truth_label`` — tests that
    rely on consistency construct their own records.
    """
    # Shared sub-strategies; Hypothesis strategies are immutable and reusable.
    label = st.sampled_from(["green", "yellow", "red"])
    ident_chars = (
        "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_"
    )
    return st.builds(
        VerificationRecord,
        message_id=st.text(alphabet=ident_chars, min_size=1, max_size=20),
        original_message=st.text(min_size=1, max_size=500),
        classifier_decision=label,
        classifier_confidence=st.floats(min_value=0.0, max_value=1.0),
        classifier_indicators=st.lists(st.text(min_size=1, max_size=50), max_size=5),
        ground_truth_label=label,
        verifier_notes=st.text(max_size=200),
        is_correct=st.booleans(),
        # now() is evaluated once at strategy-build time; every generated
        # record shares the same timestamp.
        timestamp=st.just(datetime.now()),
    )
class TestAccuracyCalculation:
    """
    **Feature: verification-mode, Property 4: Accuracy Calculation is Correct**
    Tests that accuracy is calculated correctly as (correct / total) * 100.
    """

    # NOTE(fix): the property tests below take a ``records`` argument but had
    # no ``@given`` decorator, so pytest would fail collection with
    # "fixture 'records' not found".  The file already imports ``given``,
    # ``settings`` and ``HealthCheck`` and defines
    # ``verification_record_strategy()`` — the Hypothesis decorators are
    # restored here.

    @settings(suppress_health_check=[HealthCheck.too_slow])
    @given(records=st.lists(verification_record_strategy(), min_size=1, max_size=50))
    def test_accuracy_calculation_is_correct(self, records):
        """
        **Feature: verification-mode, Property 4: Accuracy Calculation is Correct**
        **Validates: Requirements 5.3, 5.4, 9.2**
        For any set of verification records, the calculated accuracy should equal
        (correct_count / total_count) * 100.
        """
        # Calculate expected accuracy (min_size=1 guarantees no zero division).
        correct_count = sum(1 for r in records if r.is_correct)
        expected_accuracy = (correct_count / len(records)) * 100
        # Calculate actual accuracy
        actual_accuracy = VerificationMetricsCalculator.calculate_accuracy(records)
        # Verify accuracy is correct
        assert actual_accuracy == expected_accuracy

    def test_accuracy_with_all_correct(self):
        """
        **Feature: verification-mode, Property 4: Accuracy Calculation is Correct**
        **Validates: Requirements 5.3, 5.4, 9.2**
        When all records are correct, accuracy should be 100.
        """
        records = [
            VerificationRecord(
                message_id=f"msg_{i}",
                original_message=f"Message {i}",
                classifier_decision="green",
                classifier_confidence=0.9,
                classifier_indicators=["test"],
                ground_truth_label="green",
                verifier_notes="",
                is_correct=True,
                timestamp=datetime.now(),
            )
            for i in range(10)
        ]
        accuracy = VerificationMetricsCalculator.calculate_accuracy(records)
        assert accuracy == 100.0

    def test_accuracy_with_all_incorrect(self):
        """
        **Feature: verification-mode, Property 4: Accuracy Calculation is Correct**
        **Validates: Requirements 5.3, 5.4, 9.2**
        When all records are incorrect, accuracy should be 0.
        """
        records = [
            VerificationRecord(
                message_id=f"msg_{i}",
                original_message=f"Message {i}",
                classifier_decision="green",
                classifier_confidence=0.9,
                classifier_indicators=["test"],
                ground_truth_label="yellow",
                verifier_notes="",
                is_correct=False,
                timestamp=datetime.now(),
            )
            for i in range(10)
        ]
        accuracy = VerificationMetricsCalculator.calculate_accuracy(records)
        assert accuracy == 0.0

    def test_accuracy_with_empty_records(self):
        """
        **Feature: verification-mode, Property 4: Accuracy Calculation is Correct**
        **Validates: Requirements 5.3, 5.4, 9.2**
        When there are no records, accuracy should be 0.
        """
        accuracy = VerificationMetricsCalculator.calculate_accuracy([])
        assert accuracy == 0.0

    def test_accuracy_with_half_correct(self):
        """
        **Feature: verification-mode, Property 4: Accuracy Calculation is Correct**
        **Validates: Requirements 5.3, 5.4, 9.2**
        When half the records are correct, accuracy should be 50.
        """
        records = [
            VerificationRecord(
                message_id=f"msg_{i}",
                original_message=f"Message {i}",
                classifier_decision="green",
                classifier_confidence=0.9,
                classifier_indicators=["test"],
                # Even indices are correct matches, odd indices mismatches.
                ground_truth_label="green" if i % 2 == 0 else "yellow",
                verifier_notes="",
                is_correct=(i % 2 == 0),
                timestamp=datetime.now(),
            )
            for i in range(10)
        ]
        accuracy = VerificationMetricsCalculator.calculate_accuracy(records)
        assert accuracy == 50.0

    @settings(suppress_health_check=[HealthCheck.too_slow])
    @given(records=st.lists(verification_record_strategy(), max_size=50))
    def test_accuracy_by_type_calculation(self, records):
        """
        **Feature: verification-mode, Property 4: Accuracy Calculation is Correct**
        **Validates: Requirements 5.3, 5.4, 9.2**
        For any set of records, accuracy by type should correctly count records
        where classifier_decision equals ground_truth_label for each type.
        """
        accuracy_by_type = (
            VerificationMetricsCalculator.calculate_accuracy_by_type(records)
        )
        # Verify we have all three types
        assert "green" in accuracy_by_type
        assert "yellow" in accuracy_by_type
        assert "red" in accuracy_by_type
        # Verify each type's accuracy is correct
        for classification_type in ["green", "yellow", "red"]:
            type_records = [
                r for r in records
                if r.classifier_decision == classification_type
            ]
            if type_records:
                correct_count = sum(1 for r in type_records if r.is_correct)
                expected_accuracy = (correct_count / len(type_records)) * 100
                assert accuracy_by_type[classification_type] == expected_accuracy
            else:
                # Types with no records are expected to report 0.0, not be absent.
                assert accuracy_by_type[classification_type] == 0.0

    @settings(suppress_health_check=[HealthCheck.too_slow])
    @given(records=st.lists(verification_record_strategy(), max_size=50))
    def test_confusion_matrix_structure(self, records):
        """
        **Feature: verification-mode, Property 4: Accuracy Calculation is Correct**
        **Validates: Requirements 5.3, 5.4, 9.2**
        For any set of records, the confusion matrix should have correct structure
        and all counts should sum to total records.
        """
        matrix = VerificationMetricsCalculator.calculate_confusion_matrix(records)
        # Verify structure: a full 3x3 grid keyed classifier -> ground truth.
        assert "green" in matrix
        assert "yellow" in matrix
        assert "red" in matrix
        for classifier_type in ["green", "yellow", "red"]:
            assert "green" in matrix[classifier_type]
            assert "yellow" in matrix[classifier_type]
            assert "red" in matrix[classifier_type]
        # Verify all counts sum to total records (every record lands in
        # exactly one cell).
        total_count = sum(
            matrix[classifier][truth]
            for classifier in ["green", "yellow", "red"]
            for truth in ["green", "yellow", "red"]
        )
        assert total_count == len(records)

    @settings(suppress_health_check=[HealthCheck.too_slow])
    @given(records=st.lists(verification_record_strategy(), max_size=50))
    def test_metrics_summary_consistency(self, records):
        """
        **Feature: verification-mode, Property 4: Accuracy Calculation is Correct**
        **Validates: Requirements 5.3, 5.4, 9.2**
        For any set of records, the metrics summary should be internally consistent.
        """
        summary = VerificationMetricsCalculator.get_metrics_summary(records)
        # Verify counts are consistent
        assert summary["total_records"] == len(records)
        assert (
            summary["correct_count"] + summary["incorrect_count"]
            == summary["total_records"]
        )
        # Verify accuracy matches calculated value (empty input defines 0.0).
        expected_accuracy = (
            summary["correct_count"] / summary["total_records"] * 100
            if summary["total_records"] > 0
            else 0.0
        )
        assert summary["accuracy"] == expected_accuracy
        # Verify accuracy_by_type values are between 0 and 100
        for accuracy in summary["accuracy_by_type"].values():
            assert 0.0 <= accuracy <= 100.0