#!/usr/bin/env python3
"""
Test Deterministic Evaluation Improvements

Validates that the enhanced evaluation system provides reproducible and
improved groundedness scoring.
"""
import json
import os
import sys
import tempfile
import unittest
from pathlib import Path

# typing imports not required here

sys.path.append(os.path.join(os.path.dirname(__file__), "..", "src"))

# Import the enhanced evaluation components
from src.evaluation.deterministic import (  # noqa: E402
    evaluate_citation_accuracy_deterministic,
    evaluate_groundedness_deterministic,
    setup_deterministic_evaluation,
)
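
# NOTE: the path manipulation above assumes the evaluation helpers live under
# src/evaluation/ relative to the repository root, one level above this test
# file; adjust the relative path if the project layout differs.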


class TestDeterministicEvaluation(unittest.TestCase):
    """Test cases for deterministic evaluation improvements."""

    def setUp(self):
        """Set up test fixtures."""
        self.evaluator = setup_deterministic_evaluation(seed=42)

        # Sample test data
        self.sample_response = """
        Based on the company's remote work policy, employees can work from home
        up to 3 days per week. This policy was implemented to improve work-life
        balance while maintaining team collaboration.
        """

        self.sample_sources = [
            """
            Remote Work Policy: All full-time employees are eligible to work
            from home up to three days per week. Part-time employees may work
            remotely up to 50% of their scheduled hours.
            """,
            """
            Work-Life Balance Initiative: The company recognizes the importance
            of maintaining a healthy work-life balance. Our remote work policy
            is designed to provide flexibility while ensuring team productivity.
            """,
        ]

        self.sample_returned_sources = [
            {
                "filename": "remote_work_policy.md",
                "content": self.sample_sources[0],
                "metadata": {"file": "remote_work_policy.md"},
            },
            {"filename": "work_life_balance.md", "content": self.sample_sources[1]},
        ]

        self.expected_sources = ["remote_work_policy.md", "work_life_balance.md"]
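
        # The sample response above paraphrases both sources, so the
        # groundedness and citation metrics exercised below are expected to be
        # well above zero.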

    def test_deterministic_reproducibility(self):
        """Test that evaluation results are reproducible with the same seed."""
        # Run evaluation twice with the same seed
        evaluator1 = setup_deterministic_evaluation(seed=42)
        evaluator2 = setup_deterministic_evaluation(seed=42)

        result1 = evaluate_groundedness_deterministic(self.sample_response, self.sample_sources, evaluator1)
        result2 = evaluate_groundedness_deterministic(self.sample_response, self.sample_sources, evaluator2)

        # Results should be identical
        self.assertEqual(result1, result2)

        # Test citation accuracy reproducibility
        citation1 = evaluate_citation_accuracy_deterministic(
            self.sample_response, self.sample_returned_sources, self.expected_sources, evaluator1
        )
        citation2 = evaluate_citation_accuracy_deterministic(
            self.sample_response, self.sample_returned_sources, self.expected_sources, evaluator2
        )
        self.assertEqual(citation1, citation2)

    def test_groundedness_scoring(self):
        """Test groundedness scoring functionality."""
        result = evaluate_groundedness_deterministic(self.sample_response, self.sample_sources, self.evaluator)

        # Check that all expected metrics are present
        expected_metrics = ["groundedness_score", "passage_coverage", "token_overlap", "exact_matches"]
        for metric in expected_metrics:
            self.assertIn(metric, result)
            self.assertIsInstance(result[metric], float)
            self.assertGreaterEqual(result[metric], 0.0)
            self.assertLessEqual(result[metric], 1.0)

        # Groundedness should be positive since the response relates to the sources
        self.assertGreater(result["groundedness_score"], 0.0)

        # Token overlap should be detected
        self.assertGreater(result["token_overlap"], 0.0)

    def test_citation_accuracy_scoring(self):
        """Test citation accuracy scoring functionality."""
        result = evaluate_citation_accuracy_deterministic(
            self.sample_response, self.sample_returned_sources, self.expected_sources, self.evaluator
        )

        # Check expected metrics
        expected_metrics = ["citation_accuracy", "source_precision", "source_recall", "exact_filename_matches"]
        for metric in expected_metrics:
            self.assertIn(metric, result)
            self.assertIsInstance(result[metric], float)
            self.assertGreaterEqual(result[metric], 0.0)
            self.assertLessEqual(result[metric], 1.0)

        # Should have perfect citation accuracy for exact filename matches
        self.assertEqual(result["exact_filename_matches"], 1.0)
        self.assertEqual(result["source_recall"], 1.0)

    def test_empty_inputs_handling(self):
        """Test handling of empty or invalid inputs."""
        # Empty generated text
        result = evaluate_groundedness_deterministic("", self.sample_sources, self.evaluator)
        self.assertEqual(result["groundedness_score"], 0.0)

        # Empty sources
        result = evaluate_groundedness_deterministic(self.sample_response, [], self.evaluator)
        self.assertEqual(result["groundedness_score"], 0.0)

        # Empty expected sources for citation
        result = evaluate_citation_accuracy_deterministic(
            self.sample_response, self.sample_returned_sources, [], self.evaluator
        )
        # Should be 0.0 since sources were returned but none were expected
        self.assertEqual(result["citation_accuracy"], 0.0)

    def test_float_precision_normalization(self):
        """Test that floating point values are normalized consistently."""
        result = evaluate_groundedness_deterministic(self.sample_response, self.sample_sources, self.evaluator)

        # Check that values are rounded to the configured precision
        for value in result.values():
            if isinstance(value, float):
                # Should not have more than the configured number of decimal places
                decimal_places = len(str(value).split(".")[-1]) if "." in str(value) else 0
                self.assertLessEqual(decimal_places, self.evaluator.config.float_precision)

    def test_consistent_ordering(self):
        """Test that evaluation produces consistent ordering."""
        # Test with sources in different orders
        sources_order1 = self.sample_sources
        sources_order2 = list(reversed(self.sample_sources))

        result1 = evaluate_groundedness_deterministic(self.sample_response, sources_order1, self.evaluator)
        result2 = evaluate_groundedness_deterministic(self.sample_response, sources_order2, self.evaluator)

        # Results should be identical due to internal sorting
        self.assertEqual(result1, result2)

    def test_filename_normalization(self):
        """Test citation filename normalization."""
        # Test various filename formats
        test_sources = [
            {"filename": "policy.md"},
            {"filename": "policy.markdown"},
            {"filename": "/path/to/policy.md"},
            {"filename": "policy.MD"},
            {"url": "https://example.com/policy.md?v=1"},
        ]
        expected = ["policy"]  # All should normalize to "policy"

        result = evaluate_citation_accuracy_deterministic("Test response", test_sources, expected, self.evaluator)

        # Should have high recall since all sources match the expected "policy"
        self.assertGreater(result["source_recall"], 0.0)

    def test_edge_cases(self):
        """Test edge cases and error conditions."""
        # Very long text
        long_text = "word " * 1000
        result = evaluate_groundedness_deterministic(long_text, [long_text], self.evaluator)
        self.assertGreater(result["groundedness_score"], 0.8)  # Should be high overlap

        # Special characters
        special_text = "Test with special chars: @#$%^&*()"
        result = evaluate_groundedness_deterministic(special_text, [special_text], self.evaluator)
        self.assertGreater(result["groundedness_score"], 0.0)

        # Unicode text
        unicode_text = "Testing unicode: 测试 тест परीक्षा"
        result = evaluate_groundedness_deterministic(unicode_text, [unicode_text], self.evaluator)
        self.assertGreater(result["groundedness_score"], 0.0)


def create_mock_evaluation_files(temp_dir: Path) -> tuple[str, str]:
    """Create mock evaluation files for testing."""
    questions = [
        {"id": "1", "question": "What is the remote work policy?"},
        {"id": "2", "question": "How many days can employees work from home?"},
    ]

    gold_answers = {
        "1": {
            "answer": "Employees can work remotely up to 3 days per week according to company policy.",
            "expected_sources": ["remote_work_policy.md"],
        },
        "2": {
            "answer": "Full-time employees can work from home up to three days per week.",
            "expected_sources": ["remote_work_policy.md", "employee_handbook.md"],
        },
    }

    questions_file = temp_dir / "test_questions.json"
    gold_file = temp_dir / "test_gold.json"

    with open(questions_file, "w") as f:
        json.dump(questions, f, indent=2)
    with open(gold_file, "w") as f:
        json.dump(gold_answers, f, indent=2)

    return str(questions_file), str(gold_file)


class TestEnhancedEvaluationIntegration(unittest.TestCase):
    """Integration tests for the enhanced evaluation system."""

    def setUp(self):
        """Set up integration test fixtures."""
        self.temp_dir = Path(tempfile.mkdtemp())
        self.questions_file, self.gold_file = create_mock_evaluation_files(self.temp_dir)

    def tearDown(self):
        """Clean up temporary files."""
        import shutil

        shutil.rmtree(self.temp_dir, ignore_errors=True)

    def test_evaluation_file_creation(self):
        """Test that evaluation files are created correctly."""
        self.assertTrue(Path(self.questions_file).exists())
        self.assertTrue(Path(self.gold_file).exists())

        # Validate file contents
        with open(self.questions_file) as f:
            questions = json.load(f)
        self.assertEqual(len(questions), 2)

        with open(self.gold_file) as f:
            gold_data = json.load(f)
        self.assertEqual(len(gold_data), 2)

    def test_deterministic_configuration(self):
        """Test deterministic configuration setup."""
        evaluator = setup_deterministic_evaluation(seed=123)

        self.assertEqual(evaluator.config.random_seed, 123)
        self.assertTrue(evaluator.config.deterministic_mode)
        self.assertTrue(evaluator.config.sort_results)
        self.assertTrue(evaluator.config.consistent_order)


def run_evaluation_tests():
    """Run all evaluation tests."""
    # Create the test suite
    suite = unittest.TestSuite()

    # Add test cases
    loader = unittest.TestLoader()
    suite.addTest(loader.loadTestsFromTestCase(TestDeterministicEvaluation))
    suite.addTest(loader.loadTestsFromTestCase(TestEnhancedEvaluationIntegration))

    # Run tests
    runner = unittest.TextTestRunner(verbosity=2)
    result = runner.run(suite)

    return result.wasSuccessful()


if __name__ == "__main__":
    print("Testing Deterministic Evaluation Improvements...")
    print("=" * 60)

    success = run_evaluation_tests()

    if success:
        print("\n✅ All tests passed! Deterministic evaluation improvements are working correctly.")
    else:
        print("\n❌ Some tests failed. Please check the implementation.")
        sys.exit(1)
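
# These tests can also be collected by standard unittest discovery
# (python -m unittest discover -v) or by pytest, assuming the file name
# matches the usual test_*.py pattern; the __main__ block above is just a
# convenience wrapper that prints a pass/fail summary.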