#!/usr/bin/env python3
"""
Test Deterministic Evaluation Improvements

Validates that the enhanced evaluation system provides reproducible and
improved groundedness scoring.
"""

import json
import os
import sys
import tempfile
import unittest
from pathlib import Path

# Make the repository root importable so the "src" package resolves when the
# tests are run from this directory.
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))

# Import the enhanced evaluation components
from src.evaluation.deterministic import (  # noqa: E402
    evaluate_citation_accuracy_deterministic,
    evaluate_groundedness_deterministic,
    setup_deterministic_evaluation,
)


class TestDeterministicEvaluation(unittest.TestCase):
    """Test cases for deterministic evaluation improvements."""

    def setUp(self):
        """Set up test fixtures."""
        self.evaluator = setup_deterministic_evaluation(seed=42)

        # Sample test data
        self.sample_response = """
        Based on the company's remote work policy, employees can work from home
        up to 3 days per week. This policy was implemented to improve work-life
        balance while maintaining team collaboration.
        """

        self.sample_sources = [
            """
            Remote Work Policy:
            All full-time employees are eligible to work from home up to three days per week.
            Part-time employees may work remotely up to 50% of their scheduled hours.
            """,
            """
            Work-Life Balance Initiative:
            The company recognizes the importance of maintaining a healthy work-life balance.
            Our remote work policy is designed to provide flexibility while ensuring team productivity.
            """,
        ]

        self.sample_returned_sources = [
            {
                "filename": "remote_work_policy.md",
                "content": self.sample_sources[0],
                "metadata": {"file": "remote_work_policy.md"},
            },
            {"filename": "work_life_balance.md", "content": self.sample_sources[1]},
        ]

        self.expected_sources = ["remote_work_policy.md", "work_life_balance.md"]

    def test_deterministic_reproducibility(self):
        """Test that evaluation results are reproducible with the same seed."""
        # Run the evaluation twice with the same seed
        evaluator1 = setup_deterministic_evaluation(seed=42)
        evaluator2 = setup_deterministic_evaluation(seed=42)

        result1 = evaluate_groundedness_deterministic(self.sample_response, self.sample_sources, evaluator1)
        result2 = evaluate_groundedness_deterministic(self.sample_response, self.sample_sources, evaluator2)

        # Results should be identical
        self.assertEqual(result1, result2)

        # Citation accuracy should be reproducible as well
        citation1 = evaluate_citation_accuracy_deterministic(
            self.sample_response, self.sample_returned_sources, self.expected_sources, evaluator1
        )
        citation2 = evaluate_citation_accuracy_deterministic(
            self.sample_response, self.sample_returned_sources, self.expected_sources, evaluator2
        )

        self.assertEqual(citation1, citation2)

    def test_groundedness_scoring(self):
        """Test groundedness scoring functionality."""
        result = evaluate_groundedness_deterministic(self.sample_response, self.sample_sources, self.evaluator)

        # Check that all expected metrics are present and within [0.0, 1.0]
        expected_metrics = ["groundedness_score", "passage_coverage", "token_overlap", "exact_matches"]
        for metric in expected_metrics:
            self.assertIn(metric, result)
            self.assertIsInstance(result[metric], float)
            self.assertGreaterEqual(result[metric], 0.0)
            self.assertLessEqual(result[metric], 1.0)

        # Groundedness should be positive since the response relates to the sources
        self.assertGreater(result["groundedness_score"], 0.0)

        # Token overlap should be detected
        self.assertGreater(result["token_overlap"], 0.0)

    def test_citation_accuracy_scoring(self):
        """Test citation accuracy scoring functionality."""
        result = evaluate_citation_accuracy_deterministic(
            self.sample_response, self.sample_returned_sources, self.expected_sources, self.evaluator
        )

        # Check that the expected metrics are present and within [0.0, 1.0]
        expected_metrics = ["citation_accuracy", "source_precision", "source_recall", "exact_filename_matches"]
        for metric in expected_metrics:
            self.assertIn(metric, result)
            self.assertIsInstance(result[metric], float)
            self.assertGreaterEqual(result[metric], 0.0)
            self.assertLessEqual(result[metric], 1.0)

        # Every expected filename is returned verbatim, so exact matches and recall are perfect
        self.assertEqual(result["exact_filename_matches"], 1.0)
        self.assertEqual(result["source_recall"], 1.0)

    def test_empty_inputs_handling(self):
        """Test handling of empty or invalid inputs."""
        # Empty generated text
        result = evaluate_groundedness_deterministic("", self.sample_sources, self.evaluator)
        self.assertEqual(result["groundedness_score"], 0.0)

        # Empty sources
        result = evaluate_groundedness_deterministic(self.sample_response, [], self.evaluator)
        self.assertEqual(result["groundedness_score"], 0.0)

        # Empty expected sources for citation
        result = evaluate_citation_accuracy_deterministic(
            self.sample_response, self.sample_returned_sources, [], self.evaluator
        )
        # Should be 0.0 since sources were returned but none were expected
        self.assertEqual(result["citation_accuracy"], 0.0)

    def test_float_precision_normalization(self):
        """Test that floating point values are normalized consistently."""
        result = evaluate_groundedness_deterministic(self.sample_response, self.sample_sources, self.evaluator)

        # Check that values are rounded to the configured precision
        for value in result.values():
            if isinstance(value, float):
                # Should not have more than the configured number of decimal places
                decimal_places = len(str(value).split(".")[-1]) if "." in str(value) else 0
                self.assertLessEqual(decimal_places, self.evaluator.config.float_precision)

    def test_consistent_ordering(self):
        """Test that evaluation produces consistent ordering."""
        # Present the same sources in different orders
        sources_order1 = self.sample_sources
        sources_order2 = list(reversed(self.sample_sources))

        result1 = evaluate_groundedness_deterministic(self.sample_response, sources_order1, self.evaluator)
        result2 = evaluate_groundedness_deterministic(self.sample_response, sources_order2, self.evaluator)

        # Results should be identical due to internal sorting
        self.assertEqual(result1, result2)

    def test_filename_normalization(self):
        """Test citation filename normalization."""
        # Various filename formats that should all normalize to "policy"
        test_sources = [
            {"filename": "policy.md"},
            {"filename": "policy.markdown"},
            {"filename": "/path/to/policy.md"},
            {"filename": "policy.MD"},
            {"url": "https://example.com/policy.md?v=1"},
        ]
        expected = ["policy"]

        result = evaluate_citation_accuracy_deterministic("Test response", test_sources, expected, self.evaluator)

        # Recall should be nonzero since the normalized sources match the expected "policy"
        self.assertGreater(result["source_recall"], 0.0)

    def test_edge_cases(self):
        """Test edge cases and error conditions."""
        # Very long text
        long_text = "word " * 1000
        result = evaluate_groundedness_deterministic(long_text, [long_text], self.evaluator)
        self.assertGreater(result["groundedness_score"], 0.8)  # Identical text should overlap heavily

        # Special characters
        special_text = "Test with special chars: @#$%^&*()"
        result = evaluate_groundedness_deterministic(special_text, [special_text], self.evaluator)
        self.assertGreater(result["groundedness_score"], 0.0)

        # Unicode text
        unicode_text = "Testing unicode: 测试 тест परीक्षा"
        result = evaluate_groundedness_deterministic(unicode_text, [unicode_text], self.evaluator)
        self.assertGreater(result["groundedness_score"], 0.0)


def create_mock_evaluation_files(temp_dir: Path) -> tuple[str, str]:
    """Create mock evaluation files for testing."""
    questions = [
        {"id": "1", "question": "What is the remote work policy?"},
        {"id": "2", "question": "How many days can employees work from home?"},
    ]

    gold_answers = {
        "1": {
            "answer": "Employees can work remotely up to 3 days per week according to company policy.",
            "expected_sources": ["remote_work_policy.md"],
        },
        "2": {
            "answer": "Full-time employees can work from home up to three days per week.",
            "expected_sources": ["remote_work_policy.md", "employee_handbook.md"],
        },
    }

    questions_file = temp_dir / "test_questions.json"
    gold_file = temp_dir / "test_gold.json"

    with open(questions_file, "w") as f:
        json.dump(questions, f, indent=2)

    with open(gold_file, "w") as f:
        json.dump(gold_answers, f, indent=2)

    return str(questions_file), str(gold_file)


class TestEnhancedEvaluationIntegration(unittest.TestCase):
    """Integration tests for the enhanced evaluation system."""

    def setUp(self):
        """Set up integration test fixtures."""
        self.temp_dir = Path(tempfile.mkdtemp())
        self.questions_file, self.gold_file = create_mock_evaluation_files(self.temp_dir)

    def tearDown(self):
        """Clean up temporary files."""
        import shutil

        shutil.rmtree(self.temp_dir, ignore_errors=True)

    def test_evaluation_file_creation(self):
        """Test that evaluation files are created correctly."""
        self.assertTrue(Path(self.questions_file).exists())
        self.assertTrue(Path(self.gold_file).exists())

        # Validate file contents
        with open(self.questions_file) as f:
            questions = json.load(f)
        self.assertEqual(len(questions), 2)

        with open(self.gold_file) as f:
            gold_data = json.load(f)
        self.assertEqual(len(gold_data), 2)

    def test_deterministic_configuration(self):
        """Test deterministic configuration setup."""
        evaluator = setup_deterministic_evaluation(seed=123)

        self.assertEqual(evaluator.config.random_seed, 123)
        self.assertTrue(evaluator.config.deterministic_mode)
        self.assertTrue(evaluator.config.sort_results)
        self.assertTrue(evaluator.config.consistent_order)


def run_evaluation_tests():
    """Run all evaluation tests and report whether they passed."""
    # Build the suite
    loader = unittest.TestLoader()
    suite = unittest.TestSuite()
    suite.addTest(loader.loadTestsFromTestCase(TestDeterministicEvaluation))
    suite.addTest(loader.loadTestsFromTestCase(TestEnhancedEvaluationIntegration))

    # Run the suite
    runner = unittest.TextTestRunner(verbosity=2)
    result = runner.run(suite)

    return result.wasSuccessful()


if __name__ == "__main__":
    print("Testing Deterministic Evaluation Improvements...")
    print("=" * 60)

    success = run_evaluation_tests()

    if success:
        print("\n✅ All tests passed! Deterministic evaluation improvements are working correctly.")
    else:
        print("\n❌ Some tests failed. Please check the implementation.")
        sys.exit(1)
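

# ---------------------------------------------------------------------------
# Hedged reference sketch (an assumption, not the project's implementation).
# The real logic lives in src/evaluation/deterministic.py; the stand-ins below
# only illustrate the contract the assertions above pin down -- the function
# shapes, result keys, and config attributes. The metric formulas themselves
# (plain token overlap, set-based precision/recall over normalized filename
# stems) are illustrative placeholders, and every "_sketch_*" name is
# hypothetical.

from dataclasses import dataclass, field  # noqa: E402  (appendix-only import)


@dataclass
class _SketchConfig:
    """Config attributes the tests assert on; names are taken from the tests."""

    random_seed: int = 42
    deterministic_mode: bool = True
    sort_results: bool = True
    consistent_order: bool = True
    float_precision: int = 4


@dataclass
class _SketchEvaluator:
    config: _SketchConfig = field(default_factory=_SketchConfig)


def _sketch_setup(seed: int = 42) -> _SketchEvaluator:
    """Stand-in for setup_deterministic_evaluation: fix the seed, enable all determinism flags."""
    return _SketchEvaluator(config=_SketchConfig(random_seed=seed))


def _sketch_groundedness(generated: str, sources: list[str], ev: _SketchEvaluator) -> dict[str, float]:
    """Token-overlap groundedness, rounded to the configured precision; empty input scores 0.0."""
    keys = ["groundedness_score", "passage_coverage", "token_overlap", "exact_matches"]
    if not generated.strip() or not sources:
        return dict.fromkeys(keys, 0.0)
    gen = set(generated.lower().split())
    # A token set over all sources is inherently order-independent, which is
    # what test_consistent_ordering demands.
    src = set(" ".join(sources).lower().split())
    overlap = len(gen & src) / len(gen)
    p = ev.config.float_precision
    return {
        "groundedness_score": round(overlap, p),
        "passage_coverage": round(len(gen & src) / max(len(src), 1), p),
        "token_overlap": round(overlap, p),
        "exact_matches": round(float(generated in sources), p),
    }


def _sketch_citation_accuracy(
    generated: str, returned: list[dict], expected: list[str], ev: _SketchEvaluator
) -> dict[str, float]:
    """Set precision/recall over filename stems (path, extension, and query string stripped)."""

    def stem(name: str) -> str:
        base = name.rsplit("/", 1)[-1].split("?", 1)[0].lower()
        return base.rsplit(".", 1)[0]

    got = {stem(s.get("filename") or s.get("url", "")) for s in returned}
    want = {stem(name) for name in expected}
    precision = len(got & want) / len(got) if got else 0.0
    recall = len(got & want) / len(want) if want else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
    exact = {s.get("filename") for s in returned}
    p = ev.config.float_precision
    return {
        "citation_accuracy": round(f1, p),
        "source_precision": round(precision, p),
        "source_recall": round(recall, p),
        "exact_filename_matches": round(len(exact & set(expected)) / len(expected), p) if expected else 0.0,
    }


# Usage (hypothetical): because the test methods resolve the evaluation
# functions through module globals at call time, the suite can be dry-run
# against this sketch by rebinding the names at module scope before unittest
# discovery, e.g.:
#     evaluate_groundedness_deterministic = _sketch_groundedness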