"""Subprocess-based test runner — lighter sandbox for trusted eval flows. Used when Docker isn't available (e.g. CHTC execute nodes). Runs pytest in a subprocess with a wall-clock timeout. No namespace isolation — appropriate only for stock-model baselines, NEVER for Week 4 GRPO where model output is actively adversarial. For that, use SandboxVerifier (Docker) or a future Apptainer backend. Returns the same `RunResult` dataclass as `runner.run_code` so the Verifier interface stays backend-agnostic. """ from __future__ import annotations import os import re import subprocess import sys import tempfile import time from pathlib import Path from .runner import RunResult DEFAULT_TIMEOUT_S: float = 10.0 def run_code_subprocess( code: str, tests: str, *, timeout_s: float = DEFAULT_TIMEOUT_S, ) -> RunResult: """Run `tests` against `code` via subprocess pytest; return a RunResult.""" with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as workdir: work = Path(workdir) (work / "solution.py").write_text(code) (work / "test_solution.py").write_text(tests) start = time.monotonic() try: proc = subprocess.run( [ sys.executable, "-m", "pytest", "-q", "-p", "no:cacheprovider", "test_solution.py", ], cwd=str(work), timeout=timeout_s, capture_output=True, text=True, check=False, env=_minimal_env(), ) except subprocess.TimeoutExpired as exc: return RunResult( passed=False, num_tests_passed=0, num_tests_total=0, runtime_ms=int(timeout_s * 1000), stdout=(exc.stdout or b"").decode("utf-8", errors="replace") if isinstance(exc.stdout, bytes) else (exc.stdout or ""), stderr="timeout", timed_out=True, oom=False, error=None, ) except Exception as exc: return RunResult( passed=False, num_tests_passed=0, num_tests_total=0, runtime_ms=0, stdout="", stderr="", timed_out=False, oom=False, error=f"runner error: {exc}", ) runtime_ms = int((time.monotonic() - start) * 1000) n_passed, n_total = _parse_pytest_output(proc.stdout + proc.stderr) passed = proc.returncode == 0 and n_total > 0 and n_passed == n_total return RunResult( passed=passed, num_tests_passed=n_passed, num_tests_total=n_total, runtime_ms=runtime_ms, stdout=proc.stdout, stderr=proc.stderr, timed_out=False, oom=False, error=None, ) def _parse_pytest_output(output: str) -> tuple[int, int]: """Extract (num_passed, num_total) from pytest's summary output.""" n_passed = 0 n_failed = 0 n_error = 0 if m := re.search(r"(\d+) passed", output): n_passed = int(m.group(1)) if m := re.search(r"(\d+) failed", output): n_failed = int(m.group(1)) if m := re.search(r"(\d+) error", output): n_error = int(m.group(1)) return n_passed, n_passed + n_failed + n_error def _minimal_env() -> dict[str, str]: """Inherit only what pytest needs; strip everything else for cleanliness.""" keep = ("PATH", "PYTHONPATH", "HOME", "LANG", "LC_ALL", "USER", "LOGNAME") out = {"PYTHONDONTWRITEBYTECODE": "1"} for k in keep: if k in os.environ: out[k] = os.environ[k] return out