#!/usr/bin/env python3
"""
Test Deterministic Evaluation Improvements

Validates that the enhanced evaluation system produces reproducible
groundedness and citation-accuracy scores.
"""

import json
import os
import shutil
import sys
import tempfile
import unittest
from pathlib import Path

# Make the package importable when this file is run directly: the import below
# is `from src.evaluation.deterministic import ...`, so the repository root
# (the directory containing `src/`) must be on sys.path, not `src/` itself.
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))

from src.evaluation.deterministic import (  # noqa: E402
    evaluate_citation_accuracy_deterministic,
    evaluate_groundedness_deterministic,
    setup_deterministic_evaluation,
)
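
# NOTE: API surface assumed by these tests (inferred from usage below, not from
# the module's documentation): setup_deterministic_evaluation(seed) returns an
# evaluator whose .config exposes random_seed, deterministic_mode, sort_results,
# consistent_order, and float_precision; both evaluate_* helpers return dicts
# of float metrics normalized to [0.0, 1.0].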


class TestDeterministicEvaluation(unittest.TestCase):
    """Test cases for deterministic evaluation improvements."""

    def setUp(self):
        """Set up test fixtures."""
        self.evaluator = setup_deterministic_evaluation(seed=42)

        # Sample test data
        self.sample_response = """
        Based on the company's remote work policy, employees can work from home
        up to 3 days per week. This policy was implemented to improve work-life
        balance while maintaining team collaboration.
        """

        self.sample_sources = [
            """
            Remote Work Policy: All full-time employees are eligible to work
            from home up to three days per week. Part-time employees may work
            remotely up to 50% of their scheduled hours.
            """,
            """
            Work-Life Balance Initiative: The company recognizes the importance
            of maintaining a healthy work-life balance. Our remote work policy
            is designed to provide flexibility while ensuring team productivity.
            """,
        ]

        self.sample_returned_sources = [
            {
                "filename": "remote_work_policy.md",
                "content": self.sample_sources[0],
                "metadata": {"file": "remote_work_policy.md"},
            },
            {"filename": "work_life_balance.md", "content": self.sample_sources[1]},
        ]

        self.expected_sources = ["remote_work_policy.md", "work_life_balance.md"]

    def test_deterministic_reproducibility(self):
        """Test that evaluation results are reproducible with same seed."""
        # Run evaluation twice with same seed
        evaluator1 = setup_deterministic_evaluation(seed=42)
        evaluator2 = setup_deterministic_evaluation(seed=42)

        result1 = evaluate_groundedness_deterministic(self.sample_response, self.sample_sources, evaluator1)
        result2 = evaluate_groundedness_deterministic(self.sample_response, self.sample_sources, evaluator2)

        # Results should be identical
        self.assertEqual(result1, result2)

        # Test citation accuracy reproducibility
        citation1 = evaluate_citation_accuracy_deterministic(
            self.sample_response, self.sample_returned_sources, self.expected_sources, evaluator1
        )
        citation2 = evaluate_citation_accuracy_deterministic(
            self.sample_response, self.sample_returned_sources, self.expected_sources, evaluator2
        )

        self.assertEqual(citation1, citation2)

    def test_groundedness_scoring(self):
        """Test groundedness scoring functionality."""
        result = evaluate_groundedness_deterministic(self.sample_response, self.sample_sources, self.evaluator)

        # Check that all expected metrics are present
        expected_metrics = ["groundedness_score", "passage_coverage", "token_overlap", "exact_matches"]
        for metric in expected_metrics:
            self.assertIn(metric, result)
            self.assertIsInstance(result[metric], float)
            self.assertGreaterEqual(result[metric], 0.0)
            self.assertLessEqual(result[metric], 1.0)

        # Groundedness should be positive since response relates to sources
        self.assertGreater(result["groundedness_score"], 0.0)

        # Token overlap should be detected
        self.assertGreater(result["token_overlap"], 0.0)

    def test_citation_accuracy_scoring(self):
        """Test citation accuracy scoring functionality."""
        result = evaluate_citation_accuracy_deterministic(
            self.sample_response, self.sample_returned_sources, self.expected_sources, self.evaluator
        )

        # Check expected metrics
        expected_metrics = ["citation_accuracy", "source_precision", "source_recall", "exact_filename_matches"]
        for metric in expected_metrics:
            self.assertIn(metric, result)
            self.assertIsInstance(result[metric], float)
            self.assertGreaterEqual(result[metric], 0.0)
            self.assertLessEqual(result[metric], 1.0)

        # Should have perfect citation accuracy for exact filename matches
        self.assertEqual(result["exact_filename_matches"], 1.0)
        self.assertEqual(result["source_recall"], 1.0)

    def test_empty_inputs_handling(self):
        """Test handling of empty or invalid inputs."""
        # Empty generated text
        result = evaluate_groundedness_deterministic("", self.sample_sources, self.evaluator)
        self.assertEqual(result["groundedness_score"], 0.0)

        # Empty sources
        result = evaluate_groundedness_deterministic(self.sample_response, [], self.evaluator)
        self.assertEqual(result["groundedness_score"], 0.0)

        # Empty expected sources for citation
        result = evaluate_citation_accuracy_deterministic(
            self.sample_response, self.sample_returned_sources, [], self.evaluator
        )
        # Should be 0.0 since sources were returned but none expected
        self.assertEqual(result["citation_accuracy"], 0.0)

    def test_float_precision_normalization(self):
        """Test that floating point values are normalized consistently."""
        result = evaluate_groundedness_deterministic(self.sample_response, self.sample_sources, self.evaluator)

        # Check that values are rounded to expected precision
        for value in result.values():
            if isinstance(value, float):
                # Should not have more than configured decimal places
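                # (String-based check: assumes the usual decimal repr; a very
                # small value shown in scientific notation, e.g. 1e-06, would
                # count as zero decimal places and still pass.)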
                decimal_places = len(str(value).split(".")[-1]) if "." in str(value) else 0
                self.assertLessEqual(decimal_places, self.evaluator.config.float_precision)

    def test_consistent_ordering(self):
        """Test that evaluation produces consistent ordering."""
        # Test with sources in different orders
        sources_order1 = self.sample_sources
        sources_order2 = list(reversed(self.sample_sources))

        result1 = evaluate_groundedness_deterministic(self.sample_response, sources_order1, self.evaluator)
        result2 = evaluate_groundedness_deterministic(self.sample_response, sources_order2, self.evaluator)

        # Results should be identical due to internal sorting
        self.assertEqual(result1, result2)

    def test_filename_normalization(self):
        """Test citation filename normalization."""
        # Test various filename formats
        test_sources = [
            {"filename": "policy.md"},
            {"filename": "policy.markdown"},
            {"filename": "/path/to/policy.md"},
            {"filename": "policy.MD"},
            {"url": "https://example.com/policy.md?v=1"},
        ]
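        # Exercises the (presumed) filename normalization rules: path
        # stripping, case folding, extension aliasing (.md/.markdown),
        # and URL query-string removal.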

        expected = ["policy"]  # All should normalize to "policy"

        result = evaluate_citation_accuracy_deterministic("Test response", test_sources, expected, self.evaluator)

        # Recall should be positive: every source should normalize to "policy"
        self.assertGreater(result["source_recall"], 0.0)

    def test_edge_cases(self):
        """Test edge cases and error conditions."""
        # Very long text
        long_text = "word " * 1000
        result = evaluate_groundedness_deterministic(long_text, [long_text], self.evaluator)
        self.assertGreater(result["groundedness_score"], 0.8)  # Should be high overlap

        # Special characters
        special_text = "Test with special chars: @#$%^&*()"
        result = evaluate_groundedness_deterministic(special_text, [special_text], self.evaluator)
        self.assertGreater(result["groundedness_score"], 0.0)

        # Unicode text
        unicode_text = "Testing unicode: 测试 тест परीक्षा"
        result = evaluate_groundedness_deterministic(unicode_text, [unicode_text], self.evaluator)
        self.assertGreater(result["groundedness_score"], 0.0)


def create_mock_evaluation_files(temp_dir: Path) -> tuple[str, str]:
    """Create mock evaluation files for testing."""
    questions = [
        {"id": "1", "question": "What is the remote work policy?"},
        {"id": "2", "question": "How many days can employees work from home?"},
    ]

    gold_answers = {
        "1": {
            "answer": "Employees can work remotely up to 3 days per week according to company policy.",
            "expected_sources": ["remote_work_policy.md"],
        },
        "2": {
            "answer": "Full-time employees can work from home up to three days per week.",
            "expected_sources": ["remote_work_policy.md", "employee_handbook.md"],
        },
    }

    questions_file = temp_dir / "test_questions.json"
    gold_file = temp_dir / "test_gold.json"

    with open(questions_file, "w") as f:
        json.dump(questions, f, indent=2)

    with open(gold_file, "w") as f:
        json.dump(gold_answers, f, indent=2)

    return str(questions_file), str(gold_file)


class TestEnhancedEvaluationIntegration(unittest.TestCase):
    """Integration tests for the enhanced evaluation system."""

    def setUp(self):
        """Set up integration test fixtures."""
        self.temp_dir = Path(tempfile.mkdtemp())
        self.questions_file, self.gold_file = create_mock_evaluation_files(self.temp_dir)

    def tearDown(self):
        """Clean up temporary files."""
        shutil.rmtree(self.temp_dir, ignore_errors=True)

    def test_evaluation_file_creation(self):
        """Test that evaluation files are created correctly."""
        self.assertTrue(Path(self.questions_file).exists())
        self.assertTrue(Path(self.gold_file).exists())

        # Validate file contents
        with open(self.questions_file) as f:
            questions = json.load(f)
        self.assertEqual(len(questions), 2)

        with open(self.gold_file) as f:
            gold_data = json.load(f)
        self.assertEqual(len(gold_data), 2)

    def test_deterministic_configuration(self):
        """Test deterministic configuration setup."""
        evaluator = setup_deterministic_evaluation(seed=123)

        self.assertEqual(evaluator.config.random_seed, 123)
        self.assertTrue(evaluator.config.deterministic_mode)
        self.assertTrue(evaluator.config.sort_results)
        self.assertTrue(evaluator.config.consistent_order)


def run_evaluation_tests():
    """Run all evaluation tests."""
    # Create test suite
    suite = unittest.TestSuite()

    # Add test cases
    loader = unittest.TestLoader()
    suite.addTest(loader.loadTestsFromTestCase(TestDeterministicEvaluation))
    suite.addTest(loader.loadTestsFromTestCase(TestEnhancedEvaluationIntegration))

    # Run tests
    runner = unittest.TextTestRunner(verbosity=2)
    result = runner.run(suite)

    return result.wasSuccessful()


if __name__ == "__main__":
    print("Testing Deterministic Evaluation Improvements...")
    print("=" * 60)

    success = run_evaluation_tests()

    if success:
        print("\n✅ All tests passed! Deterministic evaluation improvements are working correctly.")
    else:
        print("\n❌ Some tests failed. Please check the implementation.")
        sys.exit(1)