#!/usr/bin/env python3
"""
Test Deterministic Evaluation Improvements
Validates that the enhanced evaluation system provides reproducible and
improved groundedness scoring.
"""
import json
import os
import sys
import tempfile
import unittest
from pathlib import Path

# Add the parent directory to sys.path so `src.evaluation` resolves when this
# test is run directly from a subdirectory of the project.
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))

# Import the enhanced evaluation components
from src.evaluation.deterministic import (  # noqa: E402
    evaluate_citation_accuracy_deterministic,
    evaluate_groundedness_deterministic,
    setup_deterministic_evaluation,
)
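
# Typical call pattern exercised below (a sketch inferred only from the tests
# in this file, not from separate API documentation); the variable names are
# illustrative placeholders:
#
#     evaluator = setup_deterministic_evaluation(seed=42)
#     scores = evaluate_groundedness_deterministic(response_text, source_texts, evaluator)
#     citations = evaluate_citation_accuracy_deterministic(
#         response_text, returned_sources, expected_filenames, evaluator
#     )
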
class TestDeterministicEvaluation(unittest.TestCase):
"""Test cases for deterministic evaluation improvements."""
def setUp(self):
"""Set up test fixtures."""
self.evaluator = setup_deterministic_evaluation(seed=42)
# Sample test data
self.sample_response = """
Based on the company's remote work policy, employees can work from home
up to 3 days per week. This policy was implemented to improve work-life
balance while maintaining team collaboration.
"""
self.sample_sources = [
"""
Remote Work Policy: All full-time employees are eligible to work
from home up to three days per week. Part-time employees may work
remotely up to 50% of their scheduled hours.
""",
"""
Work-Life Balance Initiative: The company recognizes the importance
of maintaining a healthy work-life balance. Our remote work policy
is designed to provide flexibility while ensuring team productivity.
""",
]
self.sample_returned_sources = [
{
"filename": "remote_work_policy.md",
"content": self.sample_sources[0],
"metadata": {"file": "remote_work_policy.md"},
},
{"filename": "work_life_balance.md", "content": self.sample_sources[1]},
]
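        # Note: the two returned-source dicts differ in shape (only the first
        # carries a "metadata" block), so the citation tests below also cover
        # sources without optional metadata.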
self.expected_sources = ["remote_work_policy.md", "work_life_balance.md"]
def test_deterministic_reproducibility(self):
"""Test that evaluation results are reproducible with same seed."""
# Run evaluation twice with same seed
evaluator1 = setup_deterministic_evaluation(seed=42)
evaluator2 = setup_deterministic_evaluation(seed=42)
result1 = evaluate_groundedness_deterministic(self.sample_response, self.sample_sources, evaluator1)
result2 = evaluate_groundedness_deterministic(self.sample_response, self.sample_sources, evaluator2)
# Results should be identical
self.assertEqual(result1, result2)
# Test citation accuracy reproducibility
citation1 = evaluate_citation_accuracy_deterministic(
self.sample_response, self.sample_returned_sources, self.expected_sources, evaluator1
)
citation2 = evaluate_citation_accuracy_deterministic(
self.sample_response, self.sample_returned_sources, self.expected_sources, evaluator2
)
self.assertEqual(citation1, citation2)
def test_groundedness_scoring(self):
"""Test groundedness scoring functionality."""
result = evaluate_groundedness_deterministic(self.sample_response, self.sample_sources, self.evaluator)
# Check that all expected metrics are present
expected_metrics = ["groundedness_score", "passage_coverage", "token_overlap", "exact_matches"]
for metric in expected_metrics:
self.assertIn(metric, result)
self.assertIsInstance(result[metric], float)
self.assertGreaterEqual(result[metric], 0.0)
self.assertLessEqual(result[metric], 1.0)
# Groundedness should be positive since response relates to sources
self.assertGreater(result["groundedness_score"], 0.0)
# Token overlap should be detected
self.assertGreater(result["token_overlap"], 0.0)
def test_citation_accuracy_scoring(self):
"""Test citation accuracy scoring functionality."""
result = evaluate_citation_accuracy_deterministic(
self.sample_response, self.sample_returned_sources, self.expected_sources, self.evaluator
)
# Check expected metrics
expected_metrics = ["citation_accuracy", "source_precision", "source_recall", "exact_filename_matches"]
for metric in expected_metrics:
self.assertIn(metric, result)
self.assertIsInstance(result[metric], float)
self.assertGreaterEqual(result[metric], 0.0)
self.assertLessEqual(result[metric], 1.0)
        # Exact filename matching and recall should be perfect, since every expected source is returned
self.assertEqual(result["exact_filename_matches"], 1.0)
self.assertEqual(result["source_recall"], 1.0)
def test_empty_inputs_handling(self):
"""Test handling of empty or invalid inputs."""
# Empty generated text
result = evaluate_groundedness_deterministic("", self.sample_sources, self.evaluator)
self.assertEqual(result["groundedness_score"], 0.0)
# Empty sources
result = evaluate_groundedness_deterministic(self.sample_response, [], self.evaluator)
self.assertEqual(result["groundedness_score"], 0.0)
# Empty expected sources for citation
result = evaluate_citation_accuracy_deterministic(
self.sample_response, self.sample_returned_sources, [], self.evaluator
)
# Should be 0.0 since sources were returned but none expected
self.assertEqual(result["citation_accuracy"], 0.0)
def test_float_precision_normalization(self):
"""Test that floating point values are normalized consistently."""
result = evaluate_groundedness_deterministic(self.sample_response, self.sample_sources, self.evaluator)
# Check that values are rounded to expected precision
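        # Note: str() may render very small floats in scientific notation (no "."),
        # which this check would count as zero decimal places; with values rounded
        # to a few decimal places as here, that case should not arise.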
for value in result.values():
if isinstance(value, float):
# Should not have more than configured decimal places
decimal_places = len(str(value).split(".")[-1]) if "." in str(value) else 0
self.assertLessEqual(decimal_places, self.evaluator.config.float_precision)
def test_consistent_ordering(self):
"""Test that evaluation produces consistent ordering."""
# Test with sources in different orders
sources_order1 = self.sample_sources
sources_order2 = list(reversed(self.sample_sources))
result1 = evaluate_groundedness_deterministic(self.sample_response, sources_order1, self.evaluator)
result2 = evaluate_groundedness_deterministic(self.sample_response, sources_order2, self.evaluator)
# Results should be identical due to internal sorting
self.assertEqual(result1, result2)
def test_filename_normalization(self):
"""Test citation filename normalization."""
# Test various filename formats
test_sources = [
{"filename": "policy.md"},
{"filename": "policy.markdown"},
{"filename": "/path/to/policy.md"},
{"filename": "policy.MD"},
{"url": "https://example.com/policy.md?v=1"},
]
expected = ["policy"] # All should normalize to "policy"
result = evaluate_citation_accuracy_deterministic("Test response", test_sources, expected, self.evaluator)
        # Recall should be positive since each returned filename normalizes to the expected "policy"
self.assertGreater(result["source_recall"], 0.0)
def test_edge_cases(self):
"""Test edge cases and error conditions."""
# Very long text
long_text = "word " * 1000
result = evaluate_groundedness_deterministic(long_text, [long_text], self.evaluator)
self.assertGreater(result["groundedness_score"], 0.8) # Should be high overlap
# Special characters
special_text = "Test with special chars: @#$%^&*()"
result = evaluate_groundedness_deterministic(special_text, [special_text], self.evaluator)
self.assertGreater(result["groundedness_score"], 0.0)
# Unicode text
unicode_text = "Testing unicode: 测试 тест परीक्षा"
result = evaluate_groundedness_deterministic(unicode_text, [unicode_text], self.evaluator)
self.assertGreater(result["groundedness_score"], 0.0)
def create_mock_evaluation_files(temp_dir: Path) -> tuple[str, str]:
"""Create mock evaluation files for testing."""
questions = [
{"id": "1", "question": "What is the remote work policy?"},
{"id": "2", "question": "How many days can employees work from home?"},
]
gold_answers = {
"1": {
"answer": "Employees can work remotely up to 3 days per week according to company policy.",
"expected_sources": ["remote_work_policy.md"],
},
"2": {
"answer": "Full-time employees can work from home up to three days per week.",
"expected_sources": ["remote_work_policy.md", "employee_handbook.md"],
},
}
questions_file = temp_dir / "test_questions.json"
gold_file = temp_dir / "test_gold.json"
with open(questions_file, "w") as f:
json.dump(questions, f, indent=2)
with open(gold_file, "w") as f:
json.dump(gold_answers, f, indent=2)
return str(questions_file), str(gold_file)
class TestEnhancedEvaluationIntegration(unittest.TestCase):
"""Integration tests for the enhanced evaluation system."""
def setUp(self):
"""Set up integration test fixtures."""
self.temp_dir = Path(tempfile.mkdtemp())
self.questions_file, self.gold_file = create_mock_evaluation_files(self.temp_dir)
def tearDown(self):
"""Clean up temporary files."""
import shutil
shutil.rmtree(self.temp_dir, ignore_errors=True)
def test_evaluation_file_creation(self):
"""Test that evaluation files are created correctly."""
self.assertTrue(Path(self.questions_file).exists())
self.assertTrue(Path(self.gold_file).exists())
# Validate file contents
with open(self.questions_file) as f:
questions = json.load(f)
self.assertEqual(len(questions), 2)
with open(self.gold_file) as f:
gold_data = json.load(f)
self.assertEqual(len(gold_data), 2)
def test_deterministic_configuration(self):
"""Test deterministic configuration setup."""
evaluator = setup_deterministic_evaluation(seed=123)
self.assertEqual(evaluator.config.random_seed, 123)
self.assertTrue(evaluator.config.deterministic_mode)
self.assertTrue(evaluator.config.sort_results)
self.assertTrue(evaluator.config.consistent_order)
def run_evaluation_tests():
"""Run all evaluation tests."""
# Create test suite
suite = unittest.TestSuite()
# Add test cases
loader = unittest.TestLoader()
suite.addTest(loader.loadTestsFromTestCase(TestDeterministicEvaluation))
suite.addTest(loader.loadTestsFromTestCase(TestEnhancedEvaluationIntegration))
# Run tests
runner = unittest.TextTestRunner(verbosity=2)
result = runner.run(suite)
return result.wasSuccessful()
if __name__ == "__main__":
print("Testing Deterministic Evaluation Improvements...")
print("=" * 60)
success = run_evaluation_tests()
if success:
print("\n✅ All tests passed! Deterministic evaluation improvements are working correctly.")
else:
print("\n❌ Some tests failed. Please check the implementation.")
        sys.exit(1)