Spaces:
Sleeping
Sleeping
File size: 5,917 Bytes
"""Tests for the code answer verifier."""
import time
import pytest
from data.verifiers.code_verifier import verify_code_answer
# ---------------------------------------------------------------------------
# execute_and_assert (MBPP-style)
# ---------------------------------------------------------------------------
class TestExecuteAndAssert:
    """Tests for the ``execute_and_assert`` verification type.

    Each test hands ``verify_code_answer`` a candidate solution (a source
    string) plus metadata holding a ``test_list`` of assertion statements,
    and checks the boolean verdict that comes back.
    """

    def test_correct_solution_passes(self):
        """A solution that satisfies every assertion must verify as True."""
        meta = {
            "verification_type": "execute_and_assert",
            "test_list": [
                "assert add(1, 2) == 3",
                "assert add(-1, 1) == 0",
                "assert add(0, 0) == 0",
            ],
        }
        code = "def add(a, b):\n return a + b\n"
        assert verify_code_answer(code, meta) is True

    def test_buggy_solution_fails(self):
        """A solution whose result violates the assertion must verify as False."""
        meta = {
            "verification_type": "execute_and_assert",
            "test_list": ["assert add(1, 2) == 3"],
        }
        code = "def add(a, b):\n return a - b\n"  # bug: subtracts instead of adds
        assert verify_code_answer(code, meta) is False

    def test_syntax_error_returns_false(self):
        """Unparseable solution code must yield False."""
        meta = {
            "verification_type": "execute_and_assert",
            "test_list": ["assert add(1, 2) == 3"],
        }
        code = "def add(a, b:\n return a + b"  # broken syntax: unclosed parenthesis
        assert verify_code_answer(code, meta) is False

    def test_runtime_error_returns_false(self):
        """A solution that raises while being asserted on must yield False."""
        meta = {
            "verification_type": "execute_and_assert",
            "test_list": ["assert boom() == 1"],
        }
        code = "def boom():\n raise RuntimeError('nope')\n"
        assert verify_code_answer(code, meta) is False

    def test_infinite_loop_times_out(self):
        """A non-terminating solution must yield False without hanging the test."""
        meta = {
            "verification_type": "execute_and_assert",
            "test_list": ["assert spin() == 1"],
        }
        # The loop body must be indented deeper than the `while` line.
        # Otherwise the snippet is an IndentationError and this test would
        # pass via the syntax-error path without ever reaching the timeout.
        code = "def spin():\n    while True:\n        pass\n"
        # Guard: compile() only parses (it does not run the loop) and raises
        # SyntaxError if the snippet is ever malformed again.
        compile(code, "<spin>", "exec")
        start = time.monotonic()
        result = verify_code_answer(code, meta, timeout_seconds=2)
        elapsed = time.monotonic() - start
        assert result is False
        # Must return promptly — the test itself must not hang.
        assert elapsed < 6, f"verifier hung for {elapsed:.1f}s"

    def test_missing_test_list_returns_false(self):
        """An empty ``test_list`` must yield False even for working code."""
        meta = {"verification_type": "execute_and_assert", "test_list": []}
        code = "def add(a, b):\n return a + b\n"
        assert verify_code_answer(code, meta) is False

    def test_test_imports_are_executed(self):
        """Assertions that use a name from ``test_imports`` must verify as True."""
        meta = {
            "verification_type": "execute_and_assert",
            "test_imports": ["import math"],
            "test_list": ["assert sqrt2() == math.sqrt(2)"],
        }
        # NOTE(review): the solution imports math as well, so if assertions
        # share the solution's namespace this would pass even with
        # test_imports ignored — confirm against the verifier implementation.
        code = "import math\ndef sqrt2():\n return math.sqrt(2)\n"
        assert verify_code_answer(code, meta) is True
# ---------------------------------------------------------------------------
# stdin_stdout (APPS-style)
# ---------------------------------------------------------------------------
class TestStdinStdout:
    """Tests for the ``stdin_stdout`` verification type.

    Each test supplies parallel ``inputs`` / ``outputs`` lists and a program
    source string, then checks the verdict from ``verify_code_answer``.
    """

    @staticmethod
    def _io_meta(inputs, outputs):
        """Return stdin_stdout metadata for the given input/output lists."""
        return {
            "verification_type": "stdin_stdout",
            "inputs": inputs,
            "outputs": outputs,
        }

    def test_echo_program_passes(self):
        """A program echoing its stripped stdin matches the expected output."""
        spec = self._io_meta(["hello\n"], ["hello\n"])
        program = "import sys\nprint(sys.stdin.read().strip())\n"
        verdict = verify_code_answer(program, spec)
        assert verdict is True

    def test_multiple_cases_all_pass(self):
        """A summing program passes when every input/output pair matches."""
        spec = self._io_meta(["3\n4\n", "10\n20\n"], ["7\n", "30\n"])
        source_lines = (
            "import sys",
            "nums = [int(x) for x in sys.stdin.read().split()]",
            "print(sum(nums))",
        )
        program = "\n".join(source_lines) + "\n"
        verdict = verify_code_answer(program, spec)
        assert verdict is True

    def test_wrong_output_fails(self):
        """A program printing the wrong value must verify as False."""
        spec = self._io_meta(["3\n4\n"], ["7\n"])
        program = "import sys\nprint(99)\n"
        verdict = verify_code_answer(program, spec)
        assert verdict is False

    def test_normalizes_trailing_whitespace(self):
        """Extra trailing blank lines in the expected output still match."""
        # The expected output carries surplus trailing newlines that the
        # comparison is expected to strip.
        spec = self._io_meta(["1\n"], ["42\n\n\n"])
        program = "print(42)\n"
        verdict = verify_code_answer(program, spec)
        assert verdict is True

    def test_empty_io_lists_fail(self):
        """With no input/output cases at all, the verdict must be False."""
        spec = self._io_meta([], [])
        program = "print('anything')\n"
        verdict = verify_code_answer(program, spec)
        assert verdict is False

    def test_mismatched_io_lengths_fail(self):
        """Two inputs against one expected output must verify as False."""
        # Deliberate length mismatch between inputs and outputs.
        spec = self._io_meta(["1\n", "2\n"], ["1\n"])
        program = "import sys\nprint(sys.stdin.read().strip())\n"
        verdict = verify_code_answer(program, spec)
        assert verdict is False
# ---------------------------------------------------------------------------
# Defensive / routing behavior
# ---------------------------------------------------------------------------
class TestDefensive:
    """Tests that malformed arguments produce False from ``verify_code_answer``."""

    def test_unknown_verification_type_returns_false(self):
        """An unrecognised ``verification_type`` must yield False."""
        spec = dict(verification_type="nonsense", test_list=[])
        verdict = verify_code_answer("print(1)", spec)
        assert verdict is False

    def test_non_string_code_returns_false(self):
        """``None``, an int, and the empty string as code must each yield False."""
        spec = dict(
            verification_type="execute_and_assert",
            test_list=["assert True"],
        )
        # Checked in the same order as before: None, then 123, then "".
        for bad_code in (None, 123, ""):
            assert verify_code_answer(bad_code, spec) is False  # type: ignore[arg-type]

    def test_non_dict_metadata_returns_false(self):
        """``None`` and a plain string as metadata must each yield False."""
        for bad_meta in (None, "bad"):
            assert verify_code_answer("print(1)", bad_meta) is False  # type: ignore[arg-type]
|