Spaces:

dmaheshwar22
/

verifiable-rl-coder

Sleeping

App Files Files Community

verifiable-rl-coder / src /verifiable_rl_coder /sandbox /runner.py

dmaheshwar22

fix: lazy-import docker so RunResult is loadable in CPU-only envs

b318b1d verified 25 days ago

raw

history blame contribute delete

4.06 kB

	"""Docker-based sandbox runner for untrusted, model-generated code.

	Every reward signal in the RL training loop flows through `run_code`. A bug
	here shows up as reward hacking: the model will eventually find any accounting
	or isolation mistake and exploit it. Treat this module as security-critical.
	"""

	from __future__ import annotations

	import contextlib
	import os
	import tempfile
	import time
	from dataclasses import dataclass
	from pathlib import Path

	# `docker` is only required by run_code() at runtime. Importing it lazily
	# means RunResult (used everywhere) stays importable in environments without
	# Docker installed (HF Spaces, CI, light dev installs).

	from .resource_limits import (
	DEFAULT_MEM_MB,
	DEFAULT_TIMEOUT_S,
	SANDBOX_IMAGE,
	WAIT_BUFFER_S,
	container_kwargs,
	)
	from .test_executor import parse_report, pytest_command, was_oom_killed


	@dataclass(frozen=True)
	class RunResult:
	"""Outcome of executing a candidate solution against a test suite."""

	passed: bool
	num_tests_passed: int
	num_tests_total: int
	runtime_ms: int
	stdout: str
	stderr: str
	timed_out: bool
	oom: bool
	error: str \| None


	def run_code(
	    code: str,
	    tests: str,
	    *,
	    timeout_s: float = DEFAULT_TIMEOUT_S,
	    mem_mb: int = DEFAULT_MEM_MB,
	) -> RunResult:
	    """Execute `tests` against `code` inside an isolated Docker container.

	    User-code failures (syntax errors, failing tests, timeouts, OOM) are
	    reflected in the result, not raised. Only internal runner failures
	    (Docker daemon unreachable, image missing, etc.) raise.

	    Args:
	        code: Candidate solution source; written to ``solution.py``.
	        tests: Test-suite source; written to ``test_solution.py``.
	        timeout_s: Time budget for the run. The runner waits
	            ``timeout_s + WAIT_BUFFER_S`` before killing the container.
	        mem_mb: Forwarded to ``container_kwargs`` (presumably the container
	            memory cap in MiB; confirm in ``resource_limits``).

	    Returns:
	        A ``RunResult``. ``passed`` is true only when the run did not time
	        out, was not OOM-killed, exited with status 0, and all of at least
	        one collected test passed. ``stdout`` holds whatever
	        ``container.logs()`` returns; ``stderr`` is always ``""`` and
	        ``error`` is always ``None``.
	    """
	    import docker  # type: ignore[import-untyped]  # lazy — keeps RunResult importable without docker installed

	    client = docker.from_env()
	    container = None
	    try:
	        # ignore_cleanup_errors: belt-and-braces in case any file somehow
	        # ends up owned by the sandbox uid (e.g. future plugin writes).
	        with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as workdir:
	            work = Path(workdir)
	            # Python source is UTF-8 by default; write it explicitly so
	            # non-ASCII model output does not depend on the host locale.
	            (work / "solution.py").write_text(code, encoding="utf-8")
	            (work / "test_solution.py").write_text(tests, encoding="utf-8")
	            # Container runs as UID 1000 (see Dockerfile). The host-side workdir
	            # might be owned by a different UID, so open it up so the sandbox
	            # user can write the pytest JSON report into the bind mount.
	            os.chmod(work, 0o777)

	            container = client.containers.run(
	                SANDBOX_IMAGE,
	                command=pytest_command(),
	                volumes={str(work): {"bind": "/work", "mode": "rw"}},
	                working_dir="/work",
	                detach=True,
	                **container_kwargs(mem_mb=mem_mb),
	            )

	            timed_out = False
	            wait_budget_s = timeout_s + WAIT_BUFFER_S
	            start = time.monotonic()
	            try:
	                status = container.wait(timeout=wait_budget_s)
	            except Exception:
	                # Any failure to observe the exit within the budget is
	                # treated as a timeout of the candidate code.
	                timed_out = True
	                # kill() raises if the container already exited; that is fine.
	                with contextlib.suppress(Exception):
	                    container.kill()
	                # Bounded: if the kill did not take effect, surface an
	                # internal runner failure instead of blocking forever.
	                status = container.wait(timeout=wait_budget_s)
	            runtime_ms = int((time.monotonic() - start) * 1000)

	            logs = container.logs().decode("utf-8", errors="replace")
	            # A missing StatusCode is treated as a failure (non-zero).
	            exit_code = int(status.get("StatusCode", 1))
	            oom = was_oom_killed(container)
	            # Read the report while the temporary directory still exists.
	            n_passed, n_total = parse_report(work)

	            return RunResult(
	                # n_total > 0 guards against an empty suite counting as a pass.
	                passed=(
	                    not timed_out
	                    and not oom
	                    and exit_code == 0
	                    and n_total > 0
	                    and n_passed == n_total
	                ),
	                num_tests_passed=n_passed,
	                num_tests_total=n_total,
	                runtime_ms=runtime_ms,
	                stdout=logs,
	                stderr="",
	                timed_out=timed_out,
	                oom=oom,
	                error=None,
	            )
	    finally:
	        # Best-effort cleanup: never let it mask the real outcome.
	        if container is not None:
	            with contextlib.suppress(Exception):
	                container.remove(force=True)
	        # Each call creates its own client; close it so its connection
	        # pool is not leaked across the many calls of a training loop.
	        with contextlib.suppress(Exception):
	            client.close()