dmaheshwar22's picture
fix: lazy-import docker so RunResult is loadable in CPU-only envs
b318b1d verified
"""Docker-based sandbox runner for untrusted, model-generated code.
Every reward signal in the RL training loop flows through `run_code`. A bug
here shows up as reward hacking: the model will eventually find any accounting
or isolation mistake and exploit it. Treat this module as security-critical.
"""
from __future__ import annotations
import contextlib
import os
import tempfile
import time
from dataclasses import dataclass
from pathlib import Path
# `docker` is only required by run_code() at runtime. Importing it lazily
# means RunResult (used everywhere) stays importable in environments without
# Docker installed (HF Spaces, CI, light dev installs).
from .resource_limits import (
DEFAULT_MEM_MB,
DEFAULT_TIMEOUT_S,
SANDBOX_IMAGE,
WAIT_BUFFER_S,
container_kwargs,
)
from .test_executor import parse_report, pytest_command, was_oom_killed
@dataclass(frozen=True)
class RunResult:
    """Immutable record of one sandboxed run of a solution against its tests.

    Instances are frozen: fields cannot be reassigned after construction.
    """

    # Overall verdict. As filled in by run_code: true only when the run did
    # not time out, was not OOM-killed, exited with status 0, and every one
    # of a non-zero number of tests passed.
    passed: bool
    # Passed / total test counts taken from the parsed test report.
    num_tests_passed: int
    num_tests_total: int
    # Wall-clock time spent waiting on the container, in milliseconds.
    runtime_ms: int
    # Captured output. run_code stores the decoded container logs in
    # `stdout` and always sets `stderr` to the empty string.
    stdout: str
    stderr: str
    # Set when waiting for the container failed and it had to be killed.
    timed_out: bool
    # Set when the container was reported as killed for exceeding memory.
    oom: bool
    # Optional error description; run_code sets this to None.
    error: str | None
def run_code(
    code: str,
    tests: str,
    *,
    timeout_s: float = DEFAULT_TIMEOUT_S,
    mem_mb: int = DEFAULT_MEM_MB,
) -> RunResult:
    """Execute `tests` against `code` inside an isolated Docker container.

    User-code failures (syntax errors, failing tests, timeouts, OOM) are
    reflected in the result, not raised. Only internal runner failures
    (Docker daemon unreachable, image missing, etc.) raise.

    Args:
        code: Source of the candidate solution; written to `solution.py`.
        tests: Source of the test suite; written to `test_solution.py`.
        timeout_s: Time budget for the run. The container is waited on for
            `timeout_s + WAIT_BUFFER_S` seconds before it is killed.
        mem_mb: Memory limit forwarded to `container_kwargs`.

    Returns:
        A `RunResult`. `stdout` holds the decoded container logs, `stderr`
        is always empty and `error` is always None.
    """
    import docker  # type: ignore[import-untyped]  # lazy — keeps RunResult importable without docker installed

    client = docker.from_env()
    container = None
    try:
        # ignore_cleanup_errors: belt-and-braces in case any file somehow
        # ends up owned by the sandbox uid (e.g. future plugin writes).
        with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as workdir:
            work = Path(workdir)
            # Write explicitly as UTF-8 (Python's default source encoding) so
            # non-ASCII model output neither fails to encode nor gets written
            # in a host-locale codec the interpreter in the sandbox rejects.
            (work / "solution.py").write_text(code, encoding="utf-8")
            (work / "test_solution.py").write_text(tests, encoding="utf-8")
            # Container runs as UID 1000 (see Dockerfile). The host-side workdir
            # might be owned by a different UID, so open it up so the sandbox
            # user can write the pytest JSON report into the bind mount.
            # NOTE(review): the mount is rw and world-writable, so code running
            # in the sandbox can presumably create or replace files here,
            # including the report read by parse_report — confirm that the
            # report cannot be forged by the candidate solution.
            os.chmod(work, 0o777)
            container = client.containers.run(
                SANDBOX_IMAGE,
                command=pytest_command(),
                volumes={str(work): {"bind": "/work", "mode": "rw"}},
                working_dir="/work",
                detach=True,
                **container_kwargs(mem_mb=mem_mb),
            )

            timed_out = False
            start = time.monotonic()
            try:
                status = container.wait(timeout=timeout_s + WAIT_BUFFER_S)
            except Exception:
                # Any failure while waiting is treated as a timeout: kill the
                # container (best effort) and collect its final status.
                timed_out = True
                with contextlib.suppress(Exception):
                    container.kill()
                status = container.wait()
            runtime_ms = int((time.monotonic() - start) * 1000)

            logs = container.logs().decode("utf-8", errors="replace")
            # A missing status code is treated as a failure (non-zero).
            exit_code = int(status.get("StatusCode", 1))
            oom = was_oom_killed(container)
            # Must run before the temporary directory is deleted.
            n_passed, n_total = parse_report(work)

            return RunResult(
                passed=(
                    not timed_out
                    and not oom
                    and exit_code == 0
                    and n_total > 0
                    and n_passed == n_total
                ),
                num_tests_passed=n_passed,
                num_tests_total=n_total,
                runtime_ms=runtime_ms,
                stdout=logs,
                stderr="",
                timed_out=timed_out,
                oom=oom,
                error=None,
            )
    finally:
        if container is not None:
            with contextlib.suppress(Exception):
                container.remove(force=True)
        # A new client is created on every call; close it so its connections
        # are released instead of accumulating over a long training run.
        with contextlib.suppress(Exception):
            client.close()