"""Docker-based sandbox runner for untrusted, model-generated code. Every reward signal in the RL training loop flows through `run_code`. A bug here shows up as reward hacking: the model will eventually find any accounting or isolation mistake and exploit it. Treat this module as security-critical. """ from __future__ import annotations import contextlib import os import tempfile import time from dataclasses import dataclass from pathlib import Path # `docker` is only required by run_code() at runtime. Importing it lazily # means RunResult (used everywhere) stays importable in environments without # Docker installed (HF Spaces, CI, light dev installs). from .resource_limits import ( DEFAULT_MEM_MB, DEFAULT_TIMEOUT_S, SANDBOX_IMAGE, WAIT_BUFFER_S, container_kwargs, ) from .test_executor import parse_report, pytest_command, was_oom_killed @dataclass(frozen=True) class RunResult: """Outcome of executing a candidate solution against a test suite.""" passed: bool num_tests_passed: int num_tests_total: int runtime_ms: int stdout: str stderr: str timed_out: bool oom: bool error: str | None def run_code( code: str, tests: str, *, timeout_s: float = DEFAULT_TIMEOUT_S, mem_mb: int = DEFAULT_MEM_MB, ) -> RunResult: """Execute `tests` against `code` inside an isolated Docker container. User-code failures (syntax errors, failing tests, timeouts, OOM) are reflected in the result, not raised. Only internal runner failures (Docker daemon unreachable, image missing, etc.) raise. """ import docker # type: ignore[import-untyped] # lazy — keeps RunResult importable without docker installed client = docker.from_env() container = None try: # ignore_cleanup_errors: belt-and-braces in case any file somehow # ends up owned by the sandbox uid (e.g. future plugin writes). with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as workdir: work = Path(workdir) (work / "solution.py").write_text(code) (work / "test_solution.py").write_text(tests) # Container runs as UID 1000 (see Dockerfile). The host-side workdir # might be owned by a different UID, so open it up so the sandbox # user can write the pytest JSON report into the bind mount. os.chmod(work, 0o777) container = client.containers.run( SANDBOX_IMAGE, command=pytest_command(), volumes={str(work): {"bind": "/work", "mode": "rw"}}, working_dir="/work", detach=True, **container_kwargs(mem_mb=mem_mb), ) timed_out = False start = time.monotonic() try: status = container.wait(timeout=timeout_s + WAIT_BUFFER_S) except Exception: timed_out = True with contextlib.suppress(Exception): container.kill() status = container.wait() runtime_ms = int((time.monotonic() - start) * 1000) logs = container.logs().decode("utf-8", errors="replace") exit_code = int(status.get("StatusCode", 1)) oom = was_oom_killed(container) n_passed, n_total = parse_report(work) return RunResult( passed=( not timed_out and not oom and exit_code == 0 and n_total > 0 and n_passed == n_total ), num_tests_passed=n_passed, num_tests_total=n_total, runtime_ms=runtime_ms, stdout=logs, stderr="", timed_out=timed_out, oom=oom, error=None, ) finally: if container is not None: with contextlib.suppress(Exception): container.remove(force=True)