#!/usr/bin/env python3
"""
Test Deterministic Evaluation Improvements

Validates that the enhanced evaluation system produces reproducible
groundedness and citation-accuracy scores.
"""

import json
import os
import shutil
import sys
import tempfile
import unittest
from pathlib import Path

# Make the package importable when this file is run directly: the import below
# is `from src.evaluation.deterministic import ...`, so the repository root
# (the directory containing `src/`) must be on sys.path, not `src/` itself.
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))

from src.evaluation.deterministic import (  # noqa: E402
    evaluate_citation_accuracy_deterministic,
    evaluate_groundedness_deterministic,
    setup_deterministic_evaluation,
)
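
# NOTE: API surface assumed by these tests (inferred from usage below, not from
# the module's documentation): setup_deterministic_evaluation(seed) returns an
# evaluator whose .config exposes random_seed, deterministic_mode, sort_results,
# consistent_order, and float_precision; both evaluate_* helpers return dicts
# of float metrics normalized to [0.0, 1.0].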


class TestDeterministicEvaluation(unittest.TestCase):
    """Test cases for deterministic evaluation improvements."""

    def setUp(self):
        """Set up test fixtures."""
        self.evaluator = setup_deterministic_evaluation(seed=42)

        # Sample test data
        self.sample_response = """
        Based on the company's remote work policy, employees can work from home
        up to 3 days per week. This policy was implemented to improve work-life
        balance while maintaining team collaboration.
        """

        self.sample_sources = [
            """
            Remote Work Policy: All full-time employees are eligible to work
            from home up to three days per week. Part-time employees may work
            remotely up to 50% of their scheduled hours.
            """,
            """
            Work-Life Balance Initiative: The company recognizes the importance
            of maintaining a healthy work-life balance. Our remote work policy
            is designed to provide flexibility while ensuring team productivity.
            """,
        ]

        self.sample_returned_sources = [
            {
                "filename": "remote_work_policy.md",
                "content": self.sample_sources[0],
                "metadata": {"file": "remote_work_policy.md"},
            },
            {"filename": "work_life_balance.md", "content": self.sample_sources[1]},
        ]

        self.expected_sources = ["remote_work_policy.md", "work_life_balance.md"]

    def test_deterministic_reproducibility(self):
        """Test that evaluation results are reproducible with same seed."""
        # Run evaluation twice with same seed
        evaluator1 = setup_deterministic_evaluation(seed=42)
        evaluator2 = setup_deterministic_evaluation(seed=42)

        result1 = evaluate_groundedness_deterministic(self.sample_response, self.sample_sources, evaluator1)
        result2 = evaluate_groundedness_deterministic(self.sample_response, self.sample_sources, evaluator2)

        # Results should be identical
        self.assertEqual(result1, result2)

        # Test citation accuracy reproducibility
        citation1 = evaluate_citation_accuracy_deterministic(
            self.sample_response, self.sample_returned_sources, self.expected_sources, evaluator1
        )
        citation2 = evaluate_citation_accuracy_deterministic(
            self.sample_response, self.sample_returned_sources, self.expected_sources, evaluator2
        )

        self.assertEqual(citation1, citation2)

    def test_groundedness_scoring(self):
        """Test groundedness scoring functionality."""
        result = evaluate_groundedness_deterministic(self.sample_response, self.sample_sources, self.evaluator)

        # Check that all expected metrics are present
        expected_metrics = ["groundedness_score", "passage_coverage", "token_overlap", "exact_matches"]
        for metric in expected_metrics:
            self.assertIn(metric, result)
            self.assertIsInstance(result[metric], float)
            self.assertGreaterEqual(result[metric], 0.0)
            self.assertLessEqual(result[metric], 1.0)

        # Groundedness should be positive since response relates to sources
        self.assertGreater(result["groundedness_score"], 0.0)

        # Token overlap should be detected
        self.assertGreater(result["token_overlap"], 0.0)

    def test_citation_accuracy_scoring(self):
        """Test citation accuracy scoring functionality."""
        result = evaluate_citation_accuracy_deterministic(
            self.sample_response, self.sample_returned_sources, self.expected_sources, self.evaluator
        )

        # Check expected metrics
        expected_metrics = ["citation_accuracy", "source_precision", "source_recall", "exact_filename_matches"]
        for metric in expected_metrics:
            self.assertIn(metric, result)
            self.assertIsInstance(result[metric], float)
            self.assertGreaterEqual(result[metric], 0.0)
            self.assertLessEqual(result[metric], 1.0)

        # Should have perfect citation accuracy for exact filename matches
        self.assertEqual(result["exact_filename_matches"], 1.0)
        self.assertEqual(result["source_recall"], 1.0)

    def test_empty_inputs_handling(self):
        """Test handling of empty or invalid inputs."""
        # Empty generated text
        result = evaluate_groundedness_deterministic("", self.sample_sources, self.evaluator)
        self.assertEqual(result["groundedness_score"], 0.0)

        # Empty sources
        result = evaluate_groundedness_deterministic(self.sample_response, [], self.evaluator)
        self.assertEqual(result["groundedness_score"], 0.0)

        # Empty expected sources for citation
        result = evaluate_citation_accuracy_deterministic(
            self.sample_response, self.sample_returned_sources, [], self.evaluator
        )
        # Should be 0.0 since sources were returned but none expected
        self.assertEqual(result["citation_accuracy"], 0.0)

    def test_float_precision_normalization(self):
        """Test that floating point values are normalized consistently."""
        result = evaluate_groundedness_deterministic(self.sample_response, self.sample_sources, self.evaluator)

        # Check that values are rounded to expected precision
        for value in result.values():
            if isinstance(value, float):
                # Should not have more than configured decimal places
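                # (String-based check: assumes the usual decimal repr; a very
                # small value shown in scientific notation, e.g. 1e-06, would
                # count as zero decimal places and still pass.)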
                decimal_places = len(str(value).split(".")[-1]) if "." in str(value) else 0
                self.assertLessEqual(decimal_places, self.evaluator.config.float_precision)

    def test_consistent_ordering(self):
        """Test that evaluation produces consistent ordering."""
        # Test with sources in different orders
        sources_order1 = self.sample_sources
        sources_order2 = list(reversed(self.sample_sources))

        result1 = evaluate_groundedness_deterministic(self.sample_response, sources_order1, self.evaluator)
        result2 = evaluate_groundedness_deterministic(self.sample_response, sources_order2, self.evaluator)

        # Results should be identical due to internal sorting
        self.assertEqual(result1, result2)

    def test_filename_normalization(self):
        """Test citation filename normalization."""
        # Test various filename formats
        test_sources = [
            {"filename": "policy.md"},
            {"filename": "policy.markdown"},
            {"filename": "/path/to/policy.md"},
            {"filename": "policy.MD"},
            {"url": "https://example.com/policy.md?v=1"},
        ]
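        # Exercises the (presumed) filename normalization rules: path
        # stripping, case folding, extension aliasing (.md/.markdown),
        # and URL query-string removal.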

        expected = ["policy"]  # All should normalize to "policy"

        result = evaluate_citation_accuracy_deterministic("Test response", test_sources, expected, self.evaluator)

        # Recall should be positive: every source should normalize to "policy"
        self.assertGreater(result["source_recall"], 0.0)

    def test_edge_cases(self):
        """Test edge cases and error conditions."""
        # Very long text
        long_text = "word " * 1000
        result = evaluate_groundedness_deterministic(long_text, [long_text], self.evaluator)
        self.assertGreater(result["groundedness_score"], 0.8)  # Should be high overlap

        # Special characters
        special_text = "Test with special chars: @#$%^&*()"
        result = evaluate_groundedness_deterministic(special_text, [special_text], self.evaluator)
        self.assertGreater(result["groundedness_score"], 0.0)

        # Unicode text
        unicode_text = "Testing unicode: 测试 тест परीक्षा"
        result = evaluate_groundedness_deterministic(unicode_text, [unicode_text], self.evaluator)
        self.assertGreater(result["groundedness_score"], 0.0)


def create_mock_evaluation_files(temp_dir: Path) -> tuple[str, str]:
    """Create mock evaluation files for testing."""
    questions = [
        {"id": "1", "question": "What is the remote work policy?"},
        {"id": "2", "question": "How many days can employees work from home?"},
    ]

    gold_answers = {
        "1": {
            "answer": "Employees can work remotely up to 3 days per week according to company policy.",
            "expected_sources": ["remote_work_policy.md"],
        },
        "2": {
            "answer": "Full-time employees can work from home up to three days per week.",
            "expected_sources": ["remote_work_policy.md", "employee_handbook.md"],
        },
    }

    questions_file = temp_dir / "test_questions.json"
    gold_file = temp_dir / "test_gold.json"

    with open(questions_file, "w") as f:
        json.dump(questions, f, indent=2)

    with open(gold_file, "w") as f:
        json.dump(gold_answers, f, indent=2)

    return str(questions_file), str(gold_file)


class TestEnhancedEvaluationIntegration(unittest.TestCase):
    """Integration tests for the enhanced evaluation system."""

    def setUp(self):
        """Set up integration test fixtures."""
        self.temp_dir = Path(tempfile.mkdtemp())
        self.questions_file, self.gold_file = create_mock_evaluation_files(self.temp_dir)

    def tearDown(self):
        """Clean up temporary files."""
        shutil.rmtree(self.temp_dir, ignore_errors=True)

    def test_evaluation_file_creation(self):
        """Test that evaluation files are created correctly."""
        self.assertTrue(Path(self.questions_file).exists())
        self.assertTrue(Path(self.gold_file).exists())

        # Validate file contents
        with open(self.questions_file) as f:
            questions = json.load(f)
        self.assertEqual(len(questions), 2)

        with open(self.gold_file) as f:
            gold_data = json.load(f)
        self.assertEqual(len(gold_data), 2)

    def test_deterministic_configuration(self):
        """Test deterministic configuration setup."""
        evaluator = setup_deterministic_evaluation(seed=123)

        self.assertEqual(evaluator.config.random_seed, 123)
        self.assertTrue(evaluator.config.deterministic_mode)
        self.assertTrue(evaluator.config.sort_results)
        self.assertTrue(evaluator.config.consistent_order)


def run_evaluation_tests():
    """Run all evaluation tests."""
    # Create test suite
    suite = unittest.TestSuite()

    # Add test cases
    loader = unittest.TestLoader()
    suite.addTest(loader.loadTestsFromTestCase(TestDeterministicEvaluation))
    suite.addTest(loader.loadTestsFromTestCase(TestEnhancedEvaluationIntegration))

    # Run tests
    runner = unittest.TextTestRunner(verbosity=2)
    result = runner.run(suite)

    return result.wasSuccessful()


if __name__ == "__main__":
    print("Testing Deterministic Evaluation Improvements...")
    print("=" * 60)

    success = run_evaluation_tests()

    if success:
        print("\n✅ All tests passed! Deterministic evaluation improvements are working correctly.")
    else:
        print("\n❌ Some tests failed. Please check the implementation.")
        sys.exit(1)