#!/usr/bin/env python3
"""
Test Deterministic Evaluation Improvements
Validates that the enhanced evaluation system provides reproducible and
improved groundedness scoring.
"""
import json
import os
import sys
import tempfile
import unittest
from pathlib import Path

# Add the parent directory to sys.path so `src.evaluation` resolves when this
# test is run directly from a subdirectory of the project.
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))

# Import the enhanced evaluation components
from src.evaluation.deterministic import (  # noqa: E402
    evaluate_citation_accuracy_deterministic,
    evaluate_groundedness_deterministic,
    setup_deterministic_evaluation,
)
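
# Typical call pattern exercised below (a sketch inferred only from the tests
# in this file, not from separate API documentation); the variable names are
# illustrative placeholders:
#
#     evaluator = setup_deterministic_evaluation(seed=42)
#     scores = evaluate_groundedness_deterministic(response_text, source_texts, evaluator)
#     citations = evaluate_citation_accuracy_deterministic(
#         response_text, returned_sources, expected_filenames, evaluator
#     )
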
class TestDeterministicEvaluation(unittest.TestCase):
"""Test cases for deterministic evaluation improvements."""
def setUp(self):
"""Set up test fixtures."""
self.evaluator = setup_deterministic_evaluation(seed=42)
# Sample test data
self.sample_response = """
Based on the company's remote work policy, employees can work from home
up to 3 days per week. This policy was implemented to improve work-life
balance while maintaining team collaboration.
"""
self.sample_sources = [
"""
Remote Work Policy: All full-time employees are eligible to work
from home up to three days per week. Part-time employees may work
remotely up to 50% of their scheduled hours.
""",
"""
Work-Life Balance Initiative: The company recognizes the importance
of maintaining a healthy work-life balance. Our remote work policy
is designed to provide flexibility while ensuring team productivity.
""",
]
self.sample_returned_sources = [
{
"filename": "remote_work_policy.md",
"content": self.sample_sources[0],
"metadata": {"file": "remote_work_policy.md"},
},
{"filename": "work_life_balance.md", "content": self.sample_sources[1]},
]
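        # Note: the two returned-source dicts differ in shape (only the first
        # carries a "metadata" block), so the citation tests below also cover
        # sources without optional metadata.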
self.expected_sources = ["remote_work_policy.md", "work_life_balance.md"]
def test_deterministic_reproducibility(self):
"""Test that evaluation results are reproducible with same seed."""
# Run evaluation twice with same seed
evaluator1 = setup_deterministic_evaluation(seed=42)
evaluator2 = setup_deterministic_evaluation(seed=42)
result1 = evaluate_groundedness_deterministic(self.sample_response, self.sample_sources, evaluator1)
result2 = evaluate_groundedness_deterministic(self.sample_response, self.sample_sources, evaluator2)
# Results should be identical
self.assertEqual(result1, result2)
# Test citation accuracy reproducibility
citation1 = evaluate_citation_accuracy_deterministic(
self.sample_response, self.sample_returned_sources, self.expected_sources, evaluator1
)
citation2 = evaluate_citation_accuracy_deterministic(
self.sample_response, self.sample_returned_sources, self.expected_sources, evaluator2
)
self.assertEqual(citation1, citation2)
def test_groundedness_scoring(self):
"""Test groundedness scoring functionality."""
result = evaluate_groundedness_deterministic(self.sample_response, self.sample_sources, self.evaluator)
# Check that all expected metrics are present
expected_metrics = ["groundedness_score", "passage_coverage", "token_overlap", "exact_matches"]
for metric in expected_metrics:
self.assertIn(metric, result)
self.assertIsInstance(result[metric], float)
self.assertGreaterEqual(result[metric], 0.0)
self.assertLessEqual(result[metric], 1.0)
# Groundedness should be positive since response relates to sources
self.assertGreater(result["groundedness_score"], 0.0)
# Token overlap should be detected
self.assertGreater(result["token_overlap"], 0.0)
def test_citation_accuracy_scoring(self):
"""Test citation accuracy scoring functionality."""
result = evaluate_citation_accuracy_deterministic(
self.sample_response, self.sample_returned_sources, self.expected_sources, self.evaluator
)
# Check expected metrics
expected_metrics = ["citation_accuracy", "source_precision", "source_recall", "exact_filename_matches"]
for metric in expected_metrics:
self.assertIn(metric, result)
self.assertIsInstance(result[metric], float)
self.assertGreaterEqual(result[metric], 0.0)
self.assertLessEqual(result[metric], 1.0)
        # Exact filename matching and recall should be perfect, since every expected source is returned
self.assertEqual(result["exact_filename_matches"], 1.0)
self.assertEqual(result["source_recall"], 1.0)
def test_empty_inputs_handling(self):
"""Test handling of empty or invalid inputs."""
# Empty generated text
result = evaluate_groundedness_deterministic("", self.sample_sources, self.evaluator)
self.assertEqual(result["groundedness_score"], 0.0)
# Empty sources
result = evaluate_groundedness_deterministic(self.sample_response, [], self.evaluator)
self.assertEqual(result["groundedness_score"], 0.0)
# Empty expected sources for citation
result = evaluate_citation_accuracy_deterministic(
self.sample_response, self.sample_returned_sources, [], self.evaluator
)
# Should be 0.0 since sources were returned but none expected
self.assertEqual(result["citation_accuracy"], 0.0)
def test_float_precision_normalization(self):
"""Test that floating point values are normalized consistently."""
result = evaluate_groundedness_deterministic(self.sample_response, self.sample_sources, self.evaluator)
# Check that values are rounded to expected precision
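        # Note: str() may render very small floats in scientific notation (no "."),
        # which this check would count as zero decimal places; with values rounded
        # to a few decimal places as here, that case should not arise.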
for value in result.values():
if isinstance(value, float):
# Should not have more than configured decimal places
decimal_places = len(str(value).split(".")[-1]) if "." in str(value) else 0
self.assertLessEqual(decimal_places, self.evaluator.config.float_precision)
def test_consistent_ordering(self):
"""Test that evaluation produces consistent ordering."""
# Test with sources in different orders
sources_order1 = self.sample_sources
sources_order2 = list(reversed(self.sample_sources))
result1 = evaluate_groundedness_deterministic(self.sample_response, sources_order1, self.evaluator)
result2 = evaluate_groundedness_deterministic(self.sample_response, sources_order2, self.evaluator)
# Results should be identical due to internal sorting
self.assertEqual(result1, result2)
def test_filename_normalization(self):
"""Test citation filename normalization."""
# Test various filename formats
test_sources = [
{"filename": "policy.md"},
{"filename": "policy.markdown"},
{"filename": "/path/to/policy.md"},
{"filename": "policy.MD"},
{"url": "https://example.com/policy.md?v=1"},
]
expected = ["policy"] # All should normalize to "policy"
result = evaluate_citation_accuracy_deterministic("Test response", test_sources, expected, self.evaluator)
        # Recall should be positive since each returned filename normalizes to the expected "policy"
self.assertGreater(result["source_recall"], 0.0)
def test_edge_cases(self):
"""Test edge cases and error conditions."""
# Very long text
long_text = "word " * 1000
result = evaluate_groundedness_deterministic(long_text, [long_text], self.evaluator)
self.assertGreater(result["groundedness_score"], 0.8) # Should be high overlap
# Special characters
special_text = "Test with special chars: @#$%^&*()"
result = evaluate_groundedness_deterministic(special_text, [special_text], self.evaluator)
self.assertGreater(result["groundedness_score"], 0.0)
# Unicode text
unicode_text = "Testing unicode: 测试 тест परीक्षा"
result = evaluate_groundedness_deterministic(unicode_text, [unicode_text], self.evaluator)
self.assertGreater(result["groundedness_score"], 0.0)
def create_mock_evaluation_files(temp_dir: Path) -> tuple[str, str]:
"""Create mock evaluation files for testing."""
questions = [
{"id": "1", "question": "What is the remote work policy?"},
{"id": "2", "question": "How many days can employees work from home?"},
]
gold_answers = {
"1": {
"answer": "Employees can work remotely up to 3 days per week according to company policy.",
"expected_sources": ["remote_work_policy.md"],
},
"2": {
"answer": "Full-time employees can work from home up to three days per week.",
"expected_sources": ["remote_work_policy.md", "employee_handbook.md"],
},
}
questions_file = temp_dir / "test_questions.json"
gold_file = temp_dir / "test_gold.json"
with open(questions_file, "w") as f:
json.dump(questions, f, indent=2)
with open(gold_file, "w") as f:
json.dump(gold_answers, f, indent=2)
return str(questions_file), str(gold_file)
class TestEnhancedEvaluationIntegration(unittest.TestCase):
"""Integration tests for the enhanced evaluation system."""
def setUp(self):
"""Set up integration test fixtures."""
self.temp_dir = Path(tempfile.mkdtemp())
self.questions_file, self.gold_file = create_mock_evaluation_files(self.temp_dir)
def tearDown(self):
"""Clean up temporary files."""
import shutil
shutil.rmtree(self.temp_dir, ignore_errors=True)
def test_evaluation_file_creation(self):
"""Test that evaluation files are created correctly."""
self.assertTrue(Path(self.questions_file).exists())
self.assertTrue(Path(self.gold_file).exists())
# Validate file contents
with open(self.questions_file) as f:
questions = json.load(f)
self.assertEqual(len(questions), 2)
with open(self.gold_file) as f:
gold_data = json.load(f)
self.assertEqual(len(gold_data), 2)
def test_deterministic_configuration(self):
"""Test deterministic configuration setup."""
evaluator = setup_deterministic_evaluation(seed=123)
self.assertEqual(evaluator.config.random_seed, 123)
self.assertTrue(evaluator.config.deterministic_mode)
self.assertTrue(evaluator.config.sort_results)
self.assertTrue(evaluator.config.consistent_order)
def run_evaluation_tests():
"""Run all evaluation tests."""
# Create test suite
suite = unittest.TestSuite()
# Add test cases
loader = unittest.TestLoader()
suite.addTest(loader.loadTestsFromTestCase(TestDeterministicEvaluation))
suite.addTest(loader.loadTestsFromTestCase(TestEnhancedEvaluationIntegration))
# Run tests
runner = unittest.TextTestRunner(verbosity=2)
result = runner.run(suite)
return result.wasSuccessful()
if __name__ == "__main__":
print("Testing Deterministic Evaluation Improvements...")
print("=" * 60)
success = run_evaluation_tests()
if success:
print("\n✅ All tests passed! Deterministic evaluation improvements are working correctly.")
else:
print("\n❌ Some tests failed. Please check the implementation.")
        sys.exit(1)