| """Subprocess-based test runner — lighter sandbox for trusted eval flows. |
| |
| Used when Docker isn't available (e.g. CHTC execute nodes). Runs pytest in |
| a subprocess with a wall-clock timeout. No namespace isolation — appropriate |
| only for stock-model baselines, NEVER for Week 4 GRPO where model output is |
| actively adversarial. For that, use SandboxVerifier (Docker) or a future |
| Apptainer backend. |
| |
| Returns the same `RunResult` dataclass as `runner.run_code` so the Verifier |
| interface stays backend-agnostic. |
| """ |
|
|
| from __future__ import annotations |
|
|
| import os |
| import re |
| import subprocess |
| import sys |
| import tempfile |
| import time |
| from pathlib import Path |
|
|
| from .runner import RunResult |
|
|
# Wall-clock limit (seconds) for one pytest subprocess run; callers may
# override per call via run_code_subprocess(..., timeout_s=...).
DEFAULT_TIMEOUT_S: float = 10.0
|
|
|
|
def run_code_subprocess(
    code: str,
    tests: str,
    *,
    timeout_s: float = DEFAULT_TIMEOUT_S,
) -> RunResult:
    """Run `tests` against `code` via subprocess pytest; return a RunResult.

    Writes `code` to solution.py and `tests` to test_solution.py in a fresh
    temporary directory, then invokes `python -m pytest -q` there with a
    wall-clock timeout. No isolation beyond the subprocess boundary — see
    the module docstring for when this backend is appropriate.

    Args:
        code: Candidate solution source, importable by the tests as `solution`.
        tests: Pytest test source (written to test_solution.py).
        timeout_s: Wall-clock limit for the whole pytest run, in seconds.

    Returns:
        RunResult mirroring the contract of `runner.run_code`; on timeout,
        `timed_out=True` with whatever partial stdout was captured; on a
        spawn failure, `error` carries the exception text.
    """
    with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as workdir:
        work = Path(workdir)
        # Explicit UTF-8: write_text() otherwise uses the locale encoding,
        # which breaks on non-UTF-8 hosts when code/tests contain non-ASCII.
        (work / "solution.py").write_text(code, encoding="utf-8")
        (work / "test_solution.py").write_text(tests, encoding="utf-8")

        start = time.monotonic()
        try:
            proc = subprocess.run(
                [
                    sys.executable, "-m", "pytest", "-q",
                    "-p", "no:cacheprovider",
                    "test_solution.py",
                ],
                cwd=str(work),
                timeout=timeout_s,
                capture_output=True,
                text=True,
                encoding="utf-8",   # decode pytest output consistently...
                errors="replace",   # ...and never crash on undecodable bytes
                check=False,
                env=_minimal_env(),
            )
        except subprocess.TimeoutExpired as exc:
            # exc.stdout can be bytes even when text mode was requested,
            # depending on platform/version — normalize defensively.
            partial = exc.stdout or ""
            if isinstance(partial, bytes):
                partial = partial.decode("utf-8", errors="replace")
            return RunResult(
                passed=False,
                num_tests_passed=0,
                num_tests_total=0,
                runtime_ms=int(timeout_s * 1000),
                stdout=partial,
                stderr="timeout",
                timed_out=True,
                oom=False,
                error=None,
            )
        except Exception as exc:  # e.g. OSError spawning the interpreter
            return RunResult(
                passed=False, num_tests_passed=0, num_tests_total=0,
                runtime_ms=0, stdout="", stderr="", timed_out=False,
                oom=False, error=f"runner error: {exc}",
            )

        runtime_ms = int((time.monotonic() - start) * 1000)
        n_passed, n_total = _parse_pytest_output(proc.stdout + proc.stderr)
        # returncode == 0 alone is not enough: also require n_total > 0 so an
        # empty collection (pytest exit code 5) never counts as a pass.
        passed = proc.returncode == 0 and n_total > 0 and n_passed == n_total
        return RunResult(
            passed=passed,
            num_tests_passed=n_passed,
            num_tests_total=n_total,
            runtime_ms=runtime_ms,
            stdout=proc.stdout,
            stderr=proc.stderr,
            timed_out=False,
            oom=False,
            error=None,
        )
|
|
|
|
| def _parse_pytest_output(output: str) -> tuple[int, int]: |
| """Extract (num_passed, num_total) from pytest's summary output.""" |
| n_passed = 0 |
| n_failed = 0 |
| n_error = 0 |
| if m := re.search(r"(\d+) passed", output): |
| n_passed = int(m.group(1)) |
| if m := re.search(r"(\d+) failed", output): |
| n_failed = int(m.group(1)) |
| if m := re.search(r"(\d+) error", output): |
| n_error = int(m.group(1)) |
| return n_passed, n_passed + n_failed + n_error |
|
|
|
|
| def _minimal_env() -> dict[str, str]: |
| """Inherit only what pytest needs; strip everything else for cleanliness.""" |
| keep = ("PATH", "PYTHONPATH", "HOME", "LANG", "LC_ALL", "USER", "LOGNAME") |
| out = {"PYTHONDONTWRITEBYTECODE": "1"} |
| for k in keep: |
| if k in os.environ: |
| out[k] = os.environ[k] |
| return out |
|
|