Spaces:
Sleeping
Sleeping
File size: 6,246 Bytes
3040767 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 | """Verifier for code problems.
Exposes :func:`verify_code_answer`, which executes candidate Python
solutions in an isolated subprocess and returns ``True`` iff every test
case in the provided verification metadata passes.
The verifier supports two verification styles:
* ``execute_and_assert`` — MBPP-style: run the candidate code followed
by a list of ``assert`` statements; success iff the subprocess exits 0.
* ``stdin_stdout`` — APPS-style: for each input/output pair, run the
candidate code as a subprocess with the input on stdin and compare the
(normalized) stdout to the expected output.
Safety notes:
* The model's code is *never* imported, ``exec``'d, or ``eval``'d in the
parent process — it is always executed in a fresh subprocess via a
temp file, with a wall-clock timeout.
* On POSIX, a ``preexec_fn`` sets soft RLIMITs on CPU time and address
space to cap runaway solutions. These are best-effort — the parent
``subprocess.run(timeout=...)`` is the authoritative kill switch.
"""
from __future__ import annotations
import os
import subprocess
import sys
import tempfile
from pathlib import Path
from typing import Any, Dict, List, Optional
# Ceiling on each child's address space: 512 MiB.
_MEMORY_LIMIT_BYTES = 512 * 1024**2
# Ceiling on each child's CPU seconds. Meant to be >= the subprocess timeout,
# because the parent's wall-clock timeout is the authoritative limit.
_CPU_LIMIT_SECONDS = 15
def _set_child_limits() -> None:  # pragma: no cover — runs in child
    """Apply best-effort CPU and address-space rlimits to the current process.

    Each limit is set with identical soft and hard values taken from the
    module-level caps. The function never raises: a limit that the OS rejects
    is skipped, and any other failure (for example the ``resource`` module
    being unavailable) silently abandons the remaining work.
    """
    try:
        import resource

        caps = (
            ("RLIMIT_CPU", _CPU_LIMIT_SECONDS),
            ("RLIMIT_AS", _MEMORY_LIMIT_BYTES),
        )
        for limit_name, cap in caps:
            # Looked up outside the inner ``try``: a platform that lacks the
            # constant raises AttributeError, which only the outer handler
            # swallows, ending the function.
            which = getattr(resource, limit_name)
            try:
                resource.setrlimit(which, (cap, cap))
            except (ValueError, OSError):
                pass
    except Exception:
        pass
def _run_python(
    script_path: Path, stdin: str, timeout_seconds: int
) -> Optional[subprocess.CompletedProcess]:
    """Execute ``script_path`` with the current interpreter in a new process.

    The interpreter is started with ``-I``, ``stdin`` is fed to the child as
    text, and stdout/stderr are captured as text. On POSIX the child installs
    the rlimits from ``_set_child_limits`` before it starts running.

    Returns:
        The :class:`subprocess.CompletedProcess` when the child finishes
        within ``timeout_seconds``, or ``None`` when the timeout expires.
        Any other error is left to propagate to the caller.
    """
    command = [sys.executable, "-I", str(script_path)]

    limiter = None
    if os.name == "posix":
        limiter = _set_child_limits

    try:
        completed = subprocess.run(
            command,
            input=stdin,
            capture_output=True,
            text=True,
            timeout=timeout_seconds,
            preexec_fn=limiter,
        )
    except subprocess.TimeoutExpired:
        return None
    return completed
def _normalize_output(s: Any) -> str:
    """Return a canonical form of actual or expected output for comparison.

    ``None`` becomes the empty string. A list is joined with newlines (each
    item passed through ``str``); anything else is converted with ``str``.
    Line endings are then unified to ``\\n``, trailing whitespace is removed
    from every line, and leading/trailing whitespace (including blank lines)
    is removed from the whole text.
    """
    if s is None:
        return ""

    raw = "\n".join(map(str, s)) if isinstance(s, list) else str(s)
    unix = raw.replace("\r\n", "\n").replace("\r", "\n")
    trimmed = "\n".join(row.rstrip() for row in unix.split("\n"))
    # The outer strip also discards any run of trailing blank lines.
    return trimmed.strip()
def _coerce_stdin(value: Any) -> str:
    """Turn a stored test input into the text fed to the child's stdin.

    A list is joined with newlines (items converted via ``str``), ``None``
    yields the empty string, and any other value is converted with ``str``.
    """
    if isinstance(value, list):
        return "\n".join(map(str, value))
    return "" if value is None else str(value)
def _verify_execute_and_assert(
    model_code: str, metadata: Dict[str, Any], timeout_seconds: int
) -> bool:
    """Run ``model_code`` followed by the metadata's ``assert`` statements.

    The script is assembled as ``test_imports`` + ``model_code`` +
    ``test_list`` + a completion marker, written to a temp file and executed
    once in a subprocess.

    Args:
        model_code: Candidate solution source.
        metadata: Verification metadata. ``test_list`` holds the assert
            statements (an empty or missing list fails verification);
            ``test_imports`` optionally holds import lines that are placed
            ahead of the candidate code.
        timeout_seconds: Wall-clock budget for the subprocess.

    Returns:
        ``True`` only when the subprocess exits with status 0 *and* wrote the
        completion marker; ``False`` on timeout, non-zero exit, a missing
        marker, or an empty ``test_list``.
    """
    tests: List[str] = list(metadata.get("test_list") or [])
    test_imports: List[str] = list(metadata.get("test_imports") or [])
    if not tests:
        return False

    # Exit status alone does not prove the asserts ran: a candidate calling
    # ``sys.exit(0)`` / ``os._exit(0)`` (at import time or from inside a tested
    # function) ends the process "successfully" before any assert executes.
    # A per-run random marker emitted after the last assert confirms that
    # execution really reached the end of the script. This is best-effort
    # hardening, not a sandbox.
    marker = f"__verifier_done_{os.urandom(16).hex()}__"
    # Written straight to fd 1 through ``__import__`` so candidates that rebind
    # ``print``, ``os`` or ``sys.stdout`` do not cause false negatives. The
    # leading newline keeps the marker on a line of its own.
    epilogue = f'__import__("os").write(1, b"\\n{marker}\\n")'

    script = (
        "\n".join(test_imports)
        + "\n"
        + model_code
        + "\n\n"
        + "\n".join(tests)
        + "\n"
        + epilogue
        + "\n"
    )

    with tempfile.TemporaryDirectory() as tmpdir:
        script_path = Path(tmpdir) / "candidate.py"
        script_path.write_text(script, encoding="utf-8")
        result = _run_python(script_path, stdin="", timeout_seconds=timeout_seconds)

    if result is None or result.returncode != 0:
        return False
    return marker in result.stdout.splitlines()
def _verify_stdin_stdout(
    model_code: str, metadata: Dict[str, Any], timeout_seconds: int
) -> bool:
    """Check ``model_code`` against paired stdin/stdout cases.

    ``metadata["inputs"]`` and ``metadata["outputs"]`` must be non-empty lists
    of equal length. The candidate is written to a temp file once and run
    separately for each input; a case passes when the process exits 0 within
    ``timeout_seconds`` and its normalized stdout equals the normalized
    expected output.

    Returns:
        ``True`` when every case passes. ``False`` for malformed metadata or
        as soon as the first case fails (later cases are not run).
    """
    stdin_cases = metadata.get("inputs") or []
    expected_outputs = metadata.get("outputs") or []

    well_formed = (
        isinstance(stdin_cases, list)
        and isinstance(expected_outputs, list)
        and bool(stdin_cases)
        and len(stdin_cases) == len(expected_outputs)
    )
    if not well_formed:
        return False

    def case_passes(program: Path, raw_stdin: Any, wanted: Any) -> bool:
        # One subprocess per case; timeout and non-zero exit both count as failure.
        run = _run_python(
            program,
            stdin=_coerce_stdin(raw_stdin),
            timeout_seconds=timeout_seconds,
        )
        if run is None or run.returncode != 0:
            return False
        return _normalize_output(run.stdout) == _normalize_output(wanted)

    with tempfile.TemporaryDirectory() as workdir:
        program = Path(workdir) / "candidate.py"
        program.write_text(model_code, encoding="utf-8")
        # ``all`` over a generator stops at the first failing case.
        return all(
            case_passes(program, raw_stdin, wanted)
            for raw_stdin, wanted in zip(stdin_cases, expected_outputs)
        )
def verify_code_answer(
    model_code: str,
    verification_metadata: Dict[str, Any],
    timeout_seconds: int = 5,
) -> bool:
    """Report whether ``model_code`` passes all tests described by the metadata.

    ``verification_metadata["verification_type"]`` selects the strategy:
    ``"execute_and_assert"`` or ``"stdin_stdout"``. Blank or non-string code,
    non-dict metadata, and unknown verification types all yield ``False``.

    This function is designed never to raise: any exception from the
    candidate run or from the verification machinery is reported as ``False``.
    """
    try:
        usable_code = isinstance(model_code, str) and bool(model_code.strip())
        if not usable_code or not isinstance(verification_metadata, dict):
            return False

        style = verification_metadata.get("verification_type")
        if style == "execute_and_assert":
            runner = _verify_execute_and_assert
        elif style == "stdin_stdout":
            runner = _verify_stdin_stdout
        else:
            return False

        return runner(model_code, verification_metadata, timeout_seconds)
    except Exception:
        return False
|