Spaces:

rfvasile
/

linalg-zero

Paused

linalg-zero/tests/grpo/test_math_verify.py

atomwalk12

initial commit

0dd6c2f 3 months ago

7.13 kB

	import pytest

	from linalg_zero.grpo.verify import parse_string, verify_answers


	class TestVerifyAnswersCorrectness:
	    """
	    Test the end-to-end correctness of verify_answers function.
	    Focus on whether the permissive extract_math_content leads to wrong verification results.

	    Every case is a pair of raw strings. Each string is first run through
	    parse_string and the two parsed values are then passed to verify_answers.
	    """

	    @staticmethod
	    def _verify(completion, ground_truth):
	        """Parse both raw strings and return verify_answers(ground_truth, completion).

	        Args:
	            completion: Raw candidate answer text.
	            ground_truth: Raw reference answer text.

	        Returns:
	            Whatever verify_answers returns for the two parsed values.
	        """
	        parsed_ground_truth = parse_string(ground_truth)
	        parsed_completion = parse_string(completion)
	        return verify_answers(parsed_ground_truth, parsed_completion)

	    def test_correct_exact_matches(self):
	        """Test that exact matches are correctly verified as True."""
	        test_cases = [
	            ("42", "42"),
	            ("-17.5", "-17.5"),
	            ("[[1, 2], [3, 4]]", "[[1, 2], [3, 4]]"),
	            ("[1, 2, 3]", "[1, 2, 3]"),
	            ("0", "0"),
	            ("1.5e-10", "1.5e-10"),
	        ]

	        for completion, ground_truth in test_cases:
	            result = self._verify(completion, ground_truth)
	            assert result is True, f"Failed: {completion} should equal {ground_truth}"

	    def test_incorrect_exact_mismatches(self):
	        """Test that clear mismatches are correctly verified as False."""
	        test_cases = [
	            ("42", "43"),
	            ("-17.5", "17.5"),
	            ("[[1, 2], [3, 4]]", "[[4, 3], [2, 1]]"),
	            ("[1, 2, 3]", "[3, 2, 1]"),
	            ("5", "0"),
	        ]

	        for completion, ground_truth in test_cases:
	            result = self._verify(completion, ground_truth)
	            assert result is False, f"Failed: {completion} should NOT equal {ground_truth}"

	    def test_current_behavior_strict_matching(self):
	        """Test that only bare literals match; answers wrapped in tags or prose do not."""
	        test_cases = [
	            ("42", "42", True),
	            ("[[1, 2], [3, 4]]", "[[1, 2], [3, 4]]", True),
	            ("-17.5", "-17.5", True),
	            ("5", "5", True),
	            ("<answer>42</answer>", "42", False),
	            ("Some text <answer>[[1, 2], [3, 4]]</answer> more text", "[[1, 2], [3, 4]]", False),
	            ("The determinant is <answer>-17.5</answer>", "-17.5", False),
	            ("The answer is 42", "42", False),
	        ]

	        for completion, ground_truth, expected in test_cases:
	            result = self._verify(completion, ground_truth)
	            assert result == expected, (
	                f"Failed: '{completion}' vs '{ground_truth}' expected {expected}, got {result}"
	            )

	    def test_no_answer_tags_fallback(self):
	        """Test that text without answer tags is processed as-is."""
	        # Only cases that must match live in this list; the prose case is
	        # checked separately below because it must NOT match.
	        test_cases = [
	            ("42", "42"),
	            ("[[1, 2], [3, 4]]", "[[1, 2], [3, 4]]"),
	            ("-17.5", "-17.5"),
	            ("5", "5"),
	        ]

	        for completion, ground_truth in test_cases:
	            result = self._verify(completion, ground_truth)
	            assert result is True, f"Failed: '{completion}' should match '{ground_truth}'"

	        # Test that unparseable text returns False.
	        # NOTE(review): parse_string is expected to return None for this
	        # completion -- confirm against the parser implementation.
	        result = self._verify("The answer is 42", "42")
	        assert result is False, "Text without answer tags should not flexibly extract numbers"

	    def test_malformed_input_edge_cases(self):
	        """
	        Test that malformed input cases return False as expected.
	        """
	        malformed_cases = [
	            ("[[1, 2], [3, 4", "[[1, 2], [3, 4]]"),
	            ("42.5.6", "42.5"),
	            ("Multiple answers: 42 and 24", "42"),
	            ("[[1, 2]] and [[3, 4]]", "[[1, 2]]"),
	            ("<answer>[[1, 2], [3, 4</answer>", "[[1, 2], [3, 4]]"),
	            ("Text <answer>42</answer> and <answer>24</answer>", "42"),
	        ]

	        for completion, ground_truth in malformed_cases:
	            result = self._verify(completion, ground_truth)
	            assert result is False, f"Malformed input should be False: '{completion}' vs '{ground_truth}'"

	    def test_verify_mathematical_equivalence(self):
	        """Test that mathematically equivalent but differently formatted answers are correctly identified."""
	        equivalence_cases = [
	            ("2.0", "2"),
	            ("[[1.0, 2.0], [3.0, 4.0]]", "[[1, 2], [3, 4]]"),
	            ("0.0", "0"),
	            ("-0", "0"),
	        ]

	        for completion, ground_truth in equivalence_cases:
	            result = self._verify(completion, ground_truth)
	            assert result is True, f"Mathematical equivalence failed: '{completion}' should equal '{ground_truth}'"

	    def test_empty_and_invalid_inputs(self):
	        """Test that empty or completely invalid inputs return False."""
	        edge_cases = [
	            ("", "42"),
	            ("42", ""),
	            ("", ""),
	            ("No numbers here", "42"),
	            ("The result is undefined", "42"),
	        ]

	        for completion, ground_truth in edge_cases:
	            result = self._verify(completion, ground_truth)
	            assert result is False, f"Edge case should be False: '{completion}' vs '{ground_truth}'"

	    @pytest.mark.parametrize(
	        "completion,ground_truth,should_match",
	        [
	            # Clear correct cases
	            ("42", "42", True),
	            ("[[1, 2], [3, 4]]", "[[1, 2], [3, 4]]", True),
	            # Clear incorrect cases
	            ("42", "43", False),
	            ("[[1, 2], [3, 4]]", "[[4, 3], [2, 1]]", False),
	            # Cases that fail
	            ("<answer>42</answer>", "42", False),
	            ("The answer is 42", "42", False),
	            ("The answer is 42", "43", False),
	            # Malformed input
	            ("[[1, 2], [3, 4", "[[1, 2], [3, 4]]", False),
	            ("42.5.6", "42.5", False),
	        ],
	    )
	    def test_verify_answers_comprehensive(self, completion, ground_truth, should_match):
	        """Comprehensive test of verify_answers behavior."""
	        result = self._verify(completion, ground_truth)

	        # Every parametrized case carries a definite expectation, so the
	        # assertion is unconditional: no case can pass without being checked.
	        assert result == should_match, (
	            f"Expected {should_match} for '{completion}' vs '{ground_truth}', got {result}"
	        )


	if __name__ == "__main__":
	    # Run this module's tests verbosely when executed as a script and
	    # propagate pytest's exit code, so a failing run yields a non-zero
	    # process status instead of always exiting 0.
	    raise SystemExit(pytest.main([__file__, "-v"]))