Spaces:
Sleeping
Sleeping
| # test_properties_metrics.py | |
| """ | |
| Property-based tests for verification metrics calculator. | |
| Tests that metrics are calculated correctly across all inputs. | |
| """ | |
| import pytest | |
| from hypothesis import given, strategies as st, settings, HealthCheck | |
| from datetime import datetime | |
| from src.core.verification_models import VerificationRecord | |
| from src.core.verification_metrics import VerificationMetricsCalculator | |
def verification_record_strategy():
    """Hypothesis strategy producing randomly-populated VerificationRecord objects.

    Decision/label fields are drawn independently, so a record's
    ``is_correct`` flag is random and not derived from whether
    ``classifier_decision`` matches ``ground_truth_label`` — tests that
    rely on consistency construct their own records.
    """
    # Shared sub-strategies; Hypothesis strategies are immutable and reusable.
    label = st.sampled_from(["green", "yellow", "red"])
    ident_chars = (
        "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_"
    )
    return st.builds(
        VerificationRecord,
        message_id=st.text(alphabet=ident_chars, min_size=1, max_size=20),
        original_message=st.text(min_size=1, max_size=500),
        classifier_decision=label,
        classifier_confidence=st.floats(min_value=0.0, max_value=1.0),
        classifier_indicators=st.lists(st.text(min_size=1, max_size=50), max_size=5),
        ground_truth_label=label,
        verifier_notes=st.text(max_size=200),
        is_correct=st.booleans(),
        # now() is evaluated once at strategy-build time; every generated
        # record shares the same timestamp.
        timestamp=st.just(datetime.now()),
    )
class TestAccuracyCalculation:
    """
    **Feature: verification-mode, Property 4: Accuracy Calculation is Correct**
    Tests that accuracy is calculated correctly as (correct / total) * 100.
    """

    # NOTE(fix): the property tests below take a ``records`` argument but had
    # no ``@given`` decorator, so pytest would fail collection with
    # "fixture 'records' not found".  The file already imports ``given``,
    # ``settings`` and ``HealthCheck`` and defines
    # ``verification_record_strategy()`` — the Hypothesis decorators are
    # restored here.

    @settings(suppress_health_check=[HealthCheck.too_slow])
    @given(records=st.lists(verification_record_strategy(), min_size=1, max_size=50))
    def test_accuracy_calculation_is_correct(self, records):
        """
        **Feature: verification-mode, Property 4: Accuracy Calculation is Correct**
        **Validates: Requirements 5.3, 5.4, 9.2**
        For any set of verification records, the calculated accuracy should equal
        (correct_count / total_count) * 100.
        """
        # Calculate expected accuracy (min_size=1 guarantees no zero division).
        correct_count = sum(1 for r in records if r.is_correct)
        expected_accuracy = (correct_count / len(records)) * 100
        # Calculate actual accuracy
        actual_accuracy = VerificationMetricsCalculator.calculate_accuracy(records)
        # Verify accuracy is correct
        assert actual_accuracy == expected_accuracy

    def test_accuracy_with_all_correct(self):
        """
        **Feature: verification-mode, Property 4: Accuracy Calculation is Correct**
        **Validates: Requirements 5.3, 5.4, 9.2**
        When all records are correct, accuracy should be 100.
        """
        records = [
            VerificationRecord(
                message_id=f"msg_{i}",
                original_message=f"Message {i}",
                classifier_decision="green",
                classifier_confidence=0.9,
                classifier_indicators=["test"],
                ground_truth_label="green",
                verifier_notes="",
                is_correct=True,
                timestamp=datetime.now(),
            )
            for i in range(10)
        ]
        accuracy = VerificationMetricsCalculator.calculate_accuracy(records)
        assert accuracy == 100.0

    def test_accuracy_with_all_incorrect(self):
        """
        **Feature: verification-mode, Property 4: Accuracy Calculation is Correct**
        **Validates: Requirements 5.3, 5.4, 9.2**
        When all records are incorrect, accuracy should be 0.
        """
        records = [
            VerificationRecord(
                message_id=f"msg_{i}",
                original_message=f"Message {i}",
                classifier_decision="green",
                classifier_confidence=0.9,
                classifier_indicators=["test"],
                ground_truth_label="yellow",
                verifier_notes="",
                is_correct=False,
                timestamp=datetime.now(),
            )
            for i in range(10)
        ]
        accuracy = VerificationMetricsCalculator.calculate_accuracy(records)
        assert accuracy == 0.0

    def test_accuracy_with_empty_records(self):
        """
        **Feature: verification-mode, Property 4: Accuracy Calculation is Correct**
        **Validates: Requirements 5.3, 5.4, 9.2**
        When there are no records, accuracy should be 0.
        """
        accuracy = VerificationMetricsCalculator.calculate_accuracy([])
        assert accuracy == 0.0

    def test_accuracy_with_half_correct(self):
        """
        **Feature: verification-mode, Property 4: Accuracy Calculation is Correct**
        **Validates: Requirements 5.3, 5.4, 9.2**
        When half the records are correct, accuracy should be 50.
        """
        records = [
            VerificationRecord(
                message_id=f"msg_{i}",
                original_message=f"Message {i}",
                classifier_decision="green",
                classifier_confidence=0.9,
                classifier_indicators=["test"],
                # Even indices are correct matches, odd indices mismatches.
                ground_truth_label="green" if i % 2 == 0 else "yellow",
                verifier_notes="",
                is_correct=(i % 2 == 0),
                timestamp=datetime.now(),
            )
            for i in range(10)
        ]
        accuracy = VerificationMetricsCalculator.calculate_accuracy(records)
        assert accuracy == 50.0

    @settings(suppress_health_check=[HealthCheck.too_slow])
    @given(records=st.lists(verification_record_strategy(), max_size=50))
    def test_accuracy_by_type_calculation(self, records):
        """
        **Feature: verification-mode, Property 4: Accuracy Calculation is Correct**
        **Validates: Requirements 5.3, 5.4, 9.2**
        For any set of records, accuracy by type should correctly count records
        where classifier_decision equals ground_truth_label for each type.
        """
        accuracy_by_type = (
            VerificationMetricsCalculator.calculate_accuracy_by_type(records)
        )
        # Verify we have all three types
        assert "green" in accuracy_by_type
        assert "yellow" in accuracy_by_type
        assert "red" in accuracy_by_type
        # Verify each type's accuracy is correct
        for classification_type in ["green", "yellow", "red"]:
            type_records = [
                r for r in records
                if r.classifier_decision == classification_type
            ]
            if type_records:
                correct_count = sum(1 for r in type_records if r.is_correct)
                expected_accuracy = (correct_count / len(type_records)) * 100
                assert accuracy_by_type[classification_type] == expected_accuracy
            else:
                # Types with no records are expected to report 0.0, not be absent.
                assert accuracy_by_type[classification_type] == 0.0

    @settings(suppress_health_check=[HealthCheck.too_slow])
    @given(records=st.lists(verification_record_strategy(), max_size=50))
    def test_confusion_matrix_structure(self, records):
        """
        **Feature: verification-mode, Property 4: Accuracy Calculation is Correct**
        **Validates: Requirements 5.3, 5.4, 9.2**
        For any set of records, the confusion matrix should have correct structure
        and all counts should sum to total records.
        """
        matrix = VerificationMetricsCalculator.calculate_confusion_matrix(records)
        # Verify structure: a full 3x3 grid keyed classifier -> ground truth.
        assert "green" in matrix
        assert "yellow" in matrix
        assert "red" in matrix
        for classifier_type in ["green", "yellow", "red"]:
            assert "green" in matrix[classifier_type]
            assert "yellow" in matrix[classifier_type]
            assert "red" in matrix[classifier_type]
        # Verify all counts sum to total records (every record lands in
        # exactly one cell).
        total_count = sum(
            matrix[classifier][truth]
            for classifier in ["green", "yellow", "red"]
            for truth in ["green", "yellow", "red"]
        )
        assert total_count == len(records)

    @settings(suppress_health_check=[HealthCheck.too_slow])
    @given(records=st.lists(verification_record_strategy(), max_size=50))
    def test_metrics_summary_consistency(self, records):
        """
        **Feature: verification-mode, Property 4: Accuracy Calculation is Correct**
        **Validates: Requirements 5.3, 5.4, 9.2**
        For any set of records, the metrics summary should be internally consistent.
        """
        summary = VerificationMetricsCalculator.get_metrics_summary(records)
        # Verify counts are consistent
        assert summary["total_records"] == len(records)
        assert (
            summary["correct_count"] + summary["incorrect_count"]
            == summary["total_records"]
        )
        # Verify accuracy matches calculated value (empty input defines 0.0).
        expected_accuracy = (
            summary["correct_count"] / summary["total_records"] * 100
            if summary["total_records"] > 0
            else 0.0
        )
        assert summary["accuracy"] == expected_accuracy
        # Verify accuracy_by_type values are between 0 and 100
        for accuracy in summary["accuracy_by_type"].values():
            assert 0.0 <= accuracy <= 100.0