Spaces:
Sleeping
Sleeping
| """Tests for the code answer verifier.""" | |
| import time | |
| import pytest | |
| from data.verifiers.code_verifier import verify_code_answer | |
| # --------------------------------------------------------------------------- | |
| # execute_and_assert (MBPP-style) | |
| # --------------------------------------------------------------------------- | |
class TestExecuteAndAssert:
    """Checks for the ``execute_and_assert`` (MBPP-style) verification path.

    Every test builds a metadata dict whose ``verification_type`` is
    ``"execute_and_assert"`` and whose ``test_list`` holds assert statements,
    then checks the boolean returned by ``verify_code_answer`` for a candidate
    solution given as a source string.
    """

    def test_correct_solution_passes(self):
        """A candidate that satisfies every assert is reported as True."""
        meta = {
            "verification_type": "execute_and_assert",
            "test_list": [
                "assert add(1, 2) == 3",
                "assert add(-1, 1) == 0",
                "assert add(0, 0) == 0",
            ],
        }
        code = "def add(a, b):\n return a + b\n"
        assert verify_code_answer(code, meta) is True

    def test_buggy_solution_fails(self):
        """A candidate whose result violates an assert is reported as False."""
        meta = {
            "verification_type": "execute_and_assert",
            "test_list": ["assert add(1, 2) == 3"],
        }
        code = "def add(a, b):\n return a - b\n"  # bug: subtracts instead of adds
        assert verify_code_answer(code, meta) is False

    def test_syntax_error_returns_false(self):
        """A candidate that does not parse is reported as False."""
        meta = {
            "verification_type": "execute_and_assert",
            "test_list": ["assert add(1, 2) == 3"],
        }
        code = "def add(a, b:\n return a + b"  # broken syntax: unclosed parenthesis
        assert verify_code_answer(code, meta) is False

    def test_runtime_error_returns_false(self):
        """A candidate that raises while an assert runs is reported as False."""
        meta = {
            "verification_type": "execute_and_assert",
            "test_list": ["assert boom() == 1"],
        }
        code = "def boom():\n raise RuntimeError('nope')\n"
        assert verify_code_answer(code, meta) is False

    def test_infinite_loop_times_out(self):
        """A non-terminating candidate is reported as False in bounded time."""
        meta = {
            "verification_type": "execute_and_assert",
            "test_list": ["assert spin() == 1"],
        }
        # ``pass`` must be indented deeper than ``while``; with both at the
        # same level the candidate is an IndentationError and the verifier
        # would return False without ever exercising its timeout.
        code = "def spin():\n    while True:\n        pass\n"
        # Guard: fail loudly here if the candidate ever stops being valid
        # Python, instead of silently degrading into a syntax-error test.
        compile(code, "<candidate>", "exec")

        start = time.monotonic()
        result = verify_code_answer(code, meta, timeout_seconds=2)
        elapsed = time.monotonic() - start

        assert result is False
        # Must return promptly — the test itself must not hang.
        assert elapsed < 6, f"verifier hung for {elapsed:.1f}s"

    def test_missing_test_list_returns_false(self):
        """An empty ``test_list`` is reported as False even for valid code."""
        meta = {"verification_type": "execute_and_assert", "test_list": []}
        code = "def add(a, b):\n return a + b\n"
        assert verify_code_answer(code, meta) is False

    def test_test_imports_are_executed(self):
        """Names brought in by ``test_imports`` are usable inside the asserts."""
        meta = {
            "verification_type": "execute_and_assert",
            "test_imports": ["import math"],
            "test_list": ["assert math.isclose(sqrt2(), math.sqrt(2))"],
        }
        # The candidate deliberately does NOT import ``math``: the assert can
        # only resolve ``math`` if the verifier really ran ``test_imports``.
        # (If the candidate imported it, the test would pass either way.)
        code = "def sqrt2():\n    return 2 ** 0.5\n"
        assert verify_code_answer(code, meta) is True
| # --------------------------------------------------------------------------- | |
| # stdin_stdout (APPS-style) | |
| # --------------------------------------------------------------------------- | |
class TestStdinStdout:
    """Checks for the ``stdin_stdout`` (APPS-style) verification path.

    Each test pairs a list of stdin payloads with a list of expected stdout
    strings and checks the boolean returned by ``verify_code_answer``.
    """

    @staticmethod
    def _io_meta(stdin_cases, expected_stdout):
        """Return ``stdin_stdout`` metadata for the given input/output lists."""
        return {
            "verification_type": "stdin_stdout",
            "inputs": stdin_cases,
            "outputs": expected_stdout,
        }

    def test_echo_program_passes(self):
        """A program that echoes its stdin matches the expected output."""
        spec = self._io_meta(["hello\n"], ["hello\n"])
        program = "import sys\nprint(sys.stdin.read().strip())\n"
        assert verify_code_answer(program, spec) is True

    def test_multiple_cases_all_pass(self):
        """A program that is right on every input/output pair is accepted."""
        spec = self._io_meta(["3\n4\n", "10\n20\n"], ["7\n", "30\n"])
        program = "".join(
            [
                "import sys\n",
                "nums = [int(x) for x in sys.stdin.read().split()]\n",
                "print(sum(nums))\n",
            ]
        )
        assert verify_code_answer(program, spec) is True

    def test_wrong_output_fails(self):
        """A program printing the wrong value is rejected."""
        spec = self._io_meta(["3\n4\n"], ["7\n"])
        program = "import sys\nprint(99)\n"
        assert verify_code_answer(program, spec) is False

    def test_normalizes_trailing_whitespace(self):
        """Extra blank lines at the end of the expected output are ignored."""
        # The expected text carries three newlines; the program emits one.
        spec = self._io_meta(["1\n"], ["42\n\n\n"])
        program = "print(42)\n"
        assert verify_code_answer(program, spec) is True

    def test_empty_io_lists_fail(self):
        """With no input/output cases at all the answer is rejected."""
        spec = self._io_meta([], [])
        program = "print('anything')\n"
        assert verify_code_answer(program, spec) is False

    def test_mismatched_io_lengths_fail(self):
        """Two inputs against a single expected output is rejected."""
        spec = self._io_meta(["1\n", "2\n"], ["1\n"])
        program = "import sys\nprint(sys.stdin.read().strip())\n"
        assert verify_code_answer(program, spec) is False
| # --------------------------------------------------------------------------- | |
| # Defensive / routing behavior | |
| # --------------------------------------------------------------------------- | |
class TestDefensive:
    """Checks that malformed or unroutable arguments yield False.

    These tests feed ``verify_code_answer`` an unknown verification type,
    non-string / empty code, and non-dict metadata, and expect ``False`` in
    every case.
    """

    def test_unknown_verification_type_returns_false(self):
        """An unrecognised ``verification_type`` is rejected."""
        # The metadata carries data that would PASS under either known type
        # (``assert True`` for execute_and_assert; stdout "1\n" for
        # stdin_stdout with ``print(1)``). False can therefore only come from
        # rejecting the type itself, not from an empty test list.
        meta = {
            "verification_type": "nonsense",
            "test_list": ["assert True"],
            "inputs": [""],
            "outputs": ["1\n"],
        }
        assert verify_code_answer("print(1)", meta) is False

    def test_non_string_code_returns_false(self):
        """``None``, a non-string value, and the empty string are rejected."""
        meta = {
            "verification_type": "execute_and_assert",
            "test_list": ["assert True"],
        }
        # ``assert True`` would succeed if the code were executed, so a False
        # result shows the bad ``code`` argument itself was refused.
        for bad_code in (None, 123, ""):
            outcome = verify_code_answer(bad_code, meta)  # type: ignore[arg-type]
            assert outcome is False, f"accepted code={bad_code!r}"

    def test_non_dict_metadata_returns_false(self):
        """Metadata that is not a dict is rejected."""
        for bad_meta in (None, "bad"):
            outcome = verify_code_answer("print(1)", bad_meta)  # type: ignore[arg-type]
            assert outcome is False, f"accepted metadata={bad_meta!r}"