| """Subprocess-based test runner — lighter sandbox for trusted eval flows. |
| |
| Used when Docker isn't available (e.g. CHTC execute nodes). Runs pytest in |
| a subprocess with a wall-clock timeout. No namespace isolation — appropriate |
| only for stock-model baselines, NEVER for Week 4 GRPO where model output is |
| actively adversarial. For that, use SandboxVerifier (Docker) or a future |
| Apptainer backend. |
| |
| Returns the same `RunResult` dataclass as `runner.run_code` so the Verifier |
| interface stays backend-agnostic. |
| """ |
|
|
| from __future__ import annotations |
|
|
| import os |
| import re |
| import subprocess |
| import sys |
| import tempfile |
| import time |
| from pathlib import Path |
|
|
| from .runner import RunResult |
|
|
# Wall-clock limit (seconds) for one pytest subprocess run; callers may
# override per call via run_code_subprocess(..., timeout_s=...).
DEFAULT_TIMEOUT_S: float = 10.0
|
|
|
|
def run_code_subprocess(
    code: str,
    tests: str,
    *,
    timeout_s: float = DEFAULT_TIMEOUT_S,
) -> RunResult:
    """Run `tests` against `code` via subprocess pytest; return a RunResult.

    Writes `code` to solution.py and `tests` to test_solution.py in a fresh
    temporary directory, then invokes `python -m pytest -q` there with a
    wall-clock timeout. No isolation beyond the subprocess boundary — see
    the module docstring for when this backend is appropriate.

    Args:
        code: Candidate solution source, importable by the tests as `solution`.
        tests: Pytest test source (written to test_solution.py).
        timeout_s: Wall-clock limit for the whole pytest run, in seconds.

    Returns:
        RunResult mirroring the contract of `runner.run_code`; on timeout,
        `timed_out=True` with whatever partial stdout was captured; on a
        spawn failure, `error` carries the exception text.
    """
    with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as workdir:
        work = Path(workdir)
        # Explicit UTF-8: write_text() otherwise uses the locale encoding,
        # which breaks on non-UTF-8 hosts when code/tests contain non-ASCII.
        (work / "solution.py").write_text(code, encoding="utf-8")
        (work / "test_solution.py").write_text(tests, encoding="utf-8")

        start = time.monotonic()
        try:
            proc = subprocess.run(
                [
                    sys.executable, "-m", "pytest", "-q",
                    "-p", "no:cacheprovider",
                    "test_solution.py",
                ],
                cwd=str(work),
                timeout=timeout_s,
                capture_output=True,
                text=True,
                encoding="utf-8",   # decode pytest output consistently...
                errors="replace",   # ...and never crash on undecodable bytes
                check=False,
                env=_minimal_env(),
            )
        except subprocess.TimeoutExpired as exc:
            # exc.stdout can be bytes even when text mode was requested,
            # depending on platform/version — normalize defensively.
            partial = exc.stdout or ""
            if isinstance(partial, bytes):
                partial = partial.decode("utf-8", errors="replace")
            return RunResult(
                passed=False,
                num_tests_passed=0,
                num_tests_total=0,
                runtime_ms=int(timeout_s * 1000),
                stdout=partial,
                stderr="timeout",
                timed_out=True,
                oom=False,
                error=None,
            )
        except Exception as exc:  # e.g. OSError spawning the interpreter
            return RunResult(
                passed=False, num_tests_passed=0, num_tests_total=0,
                runtime_ms=0, stdout="", stderr="", timed_out=False,
                oom=False, error=f"runner error: {exc}",
            )

        runtime_ms = int((time.monotonic() - start) * 1000)
        n_passed, n_total = _parse_pytest_output(proc.stdout + proc.stderr)
        # returncode == 0 alone is not enough: also require n_total > 0 so an
        # empty collection (pytest exit code 5) never counts as a pass.
        passed = proc.returncode == 0 and n_total > 0 and n_passed == n_total
        return RunResult(
            passed=passed,
            num_tests_passed=n_passed,
            num_tests_total=n_total,
            runtime_ms=runtime_ms,
            stdout=proc.stdout,
            stderr=proc.stderr,
            timed_out=False,
            oom=False,
            error=None,
        )
|
|
|
|
| def _parse_pytest_output(output: str) -> tuple[int, int]: |
| """Extract (num_passed, num_total) from pytest's summary output.""" |
| n_passed = 0 |
| n_failed = 0 |
| n_error = 0 |
| if m := re.search(r"(\d+) passed", output): |
| n_passed = int(m.group(1)) |
| if m := re.search(r"(\d+) failed", output): |
| n_failed = int(m.group(1)) |
| if m := re.search(r"(\d+) error", output): |
| n_error = int(m.group(1)) |
| return n_passed, n_passed + n_failed + n_error |
|
|
|
|
| def _minimal_env() -> dict[str, str]: |
| """Inherit only what pytest needs; strip everything else for cleanliness.""" |
| keep = ("PATH", "PYTHONPATH", "HOME", "LANG", "LC_ALL", "USER", "LOGNAME") |
| out = {"PYTHONDONTWRITEBYTECODE": "1"} |
| for k in keep: |
| if k in os.environ: |
| out[k] = os.environ[k] |
| return out |
|
|