HONEST-RL-Calibrator / data /verifiers /code_verifier.py
Rushhaabhhh's picture
HONEST-RL-Calibrator-v0
3040767 verified
"""Verifier for code problems.
Exposes :func:`verify_code_answer`, which executes candidate Python
solutions in an isolated subprocess and returns ``True`` iff every test
case in the provided verification metadata passes.
The verifier supports two verification styles:
* ``execute_and_assert`` — MBPP-style: run the candidate code followed
by a list of ``assert`` statements; success iff the subprocess exits 0.
* ``stdin_stdout`` — APPS-style: for each input/output pair, run the
candidate code as a subprocess with the input on stdin and compare the
(normalized) stdout to the expected output.
Safety notes:
* The model's code is *never* imported, ``exec``'d, or ``eval``'d in the
parent process — it is always executed in a fresh subprocess via a
temp file, with a wall-clock timeout.
* On POSIX, a ``preexec_fn`` sets both the soft and hard RLIMITs on CPU
time and address space to cap runaway solutions. These are best-effort — the
parent ``subprocess.run(timeout=...)`` is the authoritative kill switch.
"""
from __future__ import annotations
import os
import subprocess
import sys
import tempfile
from pathlib import Path
from typing import Any, Dict, List, Optional
# Address-space cap (used for RLIMIT_AS in _set_child_limits), in bytes.
_MEMORY_LIMIT_BYTES = 512 * 1024 * 1024 # 512 MB
# CPU-time cap (used for RLIMIT_CPU in _set_child_limits), in seconds.
# It is larger than verify_code_answer's default 5-second wall-clock timeout,
# so the parent's timeout normally fires first. NOTE(review): a caller passing
# timeout_seconds > 15 would let this cap trigger first for CPU-bound code.
_CPU_LIMIT_SECONDS = 15 # >= subprocess timeout; parent timeout is authoritative.
def _set_child_limits() -> None:  # pragma: no cover — runs in child
    """Apply CPU-time and address-space rlimits to the current process.

    Each limit is set with identical soft and hard values. A ``ValueError``
    or ``OSError`` raised while setting one limit is ignored so the next
    limit is still attempted; any other exception (for example the
    ``resource`` module being unavailable) silently abandons the whole
    routine. The function never raises.
    """
    try:
        import resource

        # Order matters: CPU first, then address space, mirroring the
        # sequence in which a failure would cut the routine short.
        caps = (
            ("RLIMIT_CPU", _CPU_LIMIT_SECONDS),
            ("RLIMIT_AS", _MEMORY_LIMIT_BYTES),
        )
        for attr_name, cap in caps:
            try:
                resource.setrlimit(getattr(resource, attr_name), (cap, cap))
            except (ValueError, OSError):
                # Best-effort: the platform refused this particular limit.
                pass
    except Exception:
        pass
def _run_python(
    script_path: Path, stdin: str, timeout_seconds: int
) -> Optional[subprocess.CompletedProcess]:
    """Run ``script_path`` as a fresh Python subprocess.

    The child is launched with ``-I`` (isolated mode), is fed ``stdin`` on
    its standard input, and has stdout/stderr captured as text. Its working
    directory is the directory that contains the script, so files the
    candidate writes via relative paths do not land in the caller's current
    working directory.

    Args:
        script_path: Path to the Python file to execute.
        stdin: Text passed to the child's standard input.
        timeout_seconds: Wall-clock limit handed to ``subprocess.run``.

    Returns:
        The :class:`CompletedProcess` when the child finishes in time, or
        ``None`` on timeout. Any other failure propagates to the caller.
    """
    # Resolve first: a relative path would otherwise be re-interpreted
    # against the new working directory set below.
    script = Path(script_path).resolve()
    # preexec_fn is POSIX-only; elsewhere the child runs without rlimits.
    preexec = _set_child_limits if os.name == "posix" else None
    try:
        return subprocess.run(
            [sys.executable, "-I", str(script)],
            input=stdin,
            capture_output=True,
            text=True,
            timeout=timeout_seconds,
            cwd=str(script.parent),
            preexec_fn=preexec,
        )
    except subprocess.TimeoutExpired:
        return None
def _normalize_output(s: Any) -> str:
"""Normalize stdout/expected output for comparison.
APPS sometimes stores outputs as lists (for multi-line expected
output); coerce to a single string with Unix line endings, trim
trailing whitespace per line, and strip leading/trailing whitespace.
"""
if s is None:
return ""
if isinstance(s, list):
text = "\n".join(str(x) for x in s)
else:
text = str(s)
text = text.replace("\r\n", "\n").replace("\r", "\n")
lines = [line.rstrip() for line in text.split("\n")]
# Drop trailing empty lines for forgiving comparison.
while lines and lines[-1] == "":
lines.pop()
return "\n".join(lines).strip()
def _coerce_stdin(value: Any) -> str:
if value is None:
return ""
if isinstance(value, list):
return "\n".join(str(x) for x in value)
return str(value)
def _verify_execute_and_assert(
model_code: str, metadata: Dict[str, Any], timeout_seconds: int
) -> bool:
tests: List[str] = list(metadata.get("test_list") or [])
test_imports: List[str] = list(metadata.get("test_imports") or [])
if not tests:
return False
script = "\n".join(test_imports) + "\n" + model_code + "\n\n" + "\n".join(tests) + "\n"
with tempfile.TemporaryDirectory() as tmpdir:
script_path = Path(tmpdir) / "candidate.py"
script_path.write_text(script, encoding="utf-8")
result = _run_python(script_path, stdin="", timeout_seconds=timeout_seconds)
if result is None:
return False
return result.returncode == 0
def _verify_stdin_stdout(
model_code: str, metadata: Dict[str, Any], timeout_seconds: int
) -> bool:
inputs = metadata.get("inputs") or []
outputs = metadata.get("outputs") or []
if not isinstance(inputs, list) or not isinstance(outputs, list):
return False
if not inputs or len(inputs) != len(outputs):
return False
with tempfile.TemporaryDirectory() as tmpdir:
script_path = Path(tmpdir) / "candidate.py"
script_path.write_text(model_code, encoding="utf-8")
for stdin_value, expected in zip(inputs, outputs):
result = _run_python(
script_path,
stdin=_coerce_stdin(stdin_value),
timeout_seconds=timeout_seconds,
)
if result is None or result.returncode != 0:
return False
if _normalize_output(result.stdout) != _normalize_output(expected):
return False
return True
def verify_code_answer(
    model_code: str,
    verification_metadata: Dict[str, Any],
    timeout_seconds: int = 5,
) -> bool:
    """Report whether ``model_code`` passes every test in the metadata.

    The ``verification_type`` entry of ``verification_metadata`` selects the
    checking strategy: ``"execute_and_assert"`` or ``"stdin_stdout"``. A
    non-string or blank ``model_code``, a non-dict metadata object, or an
    unrecognized verification type yields ``False``.

    Every exception raised along the way is caught and turned into
    ``False``, so this function never raises.
    """
    try:
        code_ok = isinstance(model_code, str) and bool(model_code.strip())
        if not (code_ok and isinstance(verification_metadata, dict)):
            return False

        style = verification_metadata.get("verification_type")
        if style == "execute_and_assert":
            checker = _verify_execute_and_assert
        elif style == "stdin_stdout":
            checker = _verify_stdin_stdout
        else:
            return False

        return checker(model_code, verification_metadata, timeout_seconds)
    except Exception:
        return False