Spaces:
Sleeping
Sleeping
File size: 4,061 Bytes
"""Docker-based sandbox runner for untrusted, model-generated code.
Every reward signal in the RL training loop flows through `run_code`. A bug
here shows up as reward hacking: the model will eventually find any accounting
or isolation mistake and exploit it. Treat this module as security-critical.
"""
from __future__ import annotations
import contextlib
import os
import tempfile
import time
from dataclasses import dataclass
from pathlib import Path
# `docker` is only required by run_code() at runtime. Importing it lazily
# means RunResult (used everywhere) stays importable in environments without
# Docker installed (HF Spaces, CI, light dev installs).
from .resource_limits import (
DEFAULT_MEM_MB,
DEFAULT_TIMEOUT_S,
SANDBOX_IMAGE,
WAIT_BUFFER_S,
container_kwargs,
)
from .test_executor import parse_report, pytest_command, was_oom_killed
@dataclass(frozen=True)
class RunResult:
    """Immutable record of one sandboxed run of a solution against its tests.

    Instances are frozen, so a result cannot be altered after it has been
    produced. The per-field comments below describe how ``run_code`` in this
    module populates each value.
    """

    # Overall verdict: True only when the run did not time out, was not
    # OOM-killed, exited with status 0, reported at least one test, and every
    # reported test passed.
    passed: bool

    # Passed / total test counts as parsed from the test report.
    num_tests_passed: int
    num_tests_total: int

    # Elapsed wall-clock time in milliseconds, measured from just after the
    # container is launched until waiting on it has finished.
    runtime_ms: int

    # Container log output decoded as UTF-8 (undecodable bytes replaced).
    stdout: str

    # Always the empty string as produced by run_code; logs go to `stdout`.
    stderr: str

    # True when waiting for the container to finish failed and it was killed.
    timed_out: bool

    # True when the container was detected as killed for exceeding memory.
    oom: bool

    # Optional error description; run_code currently always sets this to None.
    error: str | None
def run_code(
    code: str,
    tests: str,
    *,
    timeout_s: float = DEFAULT_TIMEOUT_S,
    mem_mb: int = DEFAULT_MEM_MB,
) -> RunResult:
    """Execute `tests` against `code` inside an isolated Docker container.

    `code` is written to ``solution.py`` and `tests` to ``test_solution.py``
    in a fresh temporary directory that is bind-mounted read-write at
    ``/work``; the container then runs ``pytest_command()`` there.

    User-code failures (syntax errors, failing tests, timeouts, OOM) are
    reflected in the result, not raised. Only internal runner failures
    (Docker daemon unreachable, image missing, etc.) raise.

    Args:
        code: Source text of the candidate solution.
        tests: Source text of the pytest test module run against it.
        timeout_s: Time budget for the run. The runner waits up to
            ``timeout_s + WAIT_BUFFER_S`` for the container to exit before
            killing it and marking the run as timed out.
        mem_mb: Memory limit forwarded to ``container_kwargs``.

    Returns:
        A ``RunResult``. ``passed`` is True only if the run did not time out,
        was not OOM-killed, exited with status 0, and reported a non-zero
        number of tests that all passed. ``stdout`` carries the container
        logs; ``stderr`` is always empty and ``error`` is always None.

    Raises:
        Exception: Whatever the Docker SDK raises for runner-level failures,
            including a container that cannot be reaped after being killed.
    """
    import docker  # type: ignore[import-untyped]  # lazy — keeps RunResult importable without docker installed

    # Upper bound on how long to wait for a killed container to be reaped.
    # Without a bound, a kill that silently failed would hang the caller.
    kill_grace_s = max(WAIT_BUFFER_S, 5.0)

    client = docker.from_env()
    try:
        # ignore_cleanup_errors: belt-and-braces in case any file somehow
        # ends up owned by the sandbox uid (e.g. future plugin writes).
        with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as workdir:
            work = Path(workdir)
            # Python source is decoded as UTF-8 inside the container, so write
            # it as UTF-8 regardless of the host locale.
            (work / "solution.py").write_text(code, encoding="utf-8")
            (work / "test_solution.py").write_text(tests, encoding="utf-8")
            # Container runs as UID 1000 (see Dockerfile). The host-side workdir
            # might be owned by a different UID, so open it up so the sandbox
            # user can write the pytest JSON report into the bind mount.
            # NOTE(review): the candidate code shares this writable mount with
            # the report, so it may be able to tamper with the report file —
            # confirm parse_report / the exit-code check make that harmless.
            os.chmod(work, 0o777)

            container = client.containers.run(
                SANDBOX_IMAGE,
                command=pytest_command(),
                volumes={str(work): {"bind": "/work", "mode": "rw"}},
                working_dir="/work",
                detach=True,
                **container_kwargs(mem_mb=mem_mb),
            )
            # Remove the container before the temporary directory is cleaned
            # up, so the bind-mount source outlives the container using it.
            try:
                timed_out = False
                start = time.monotonic()
                try:
                    status = container.wait(timeout=timeout_s + WAIT_BUFFER_S)
                except Exception:
                    # Any failure while waiting is treated as a timeout: kill
                    # the container (best effort) and collect its exit status.
                    timed_out = True
                    with contextlib.suppress(Exception):
                        container.kill()
                    # Bounded wait: if the kill did not take effect this
                    # raises (a runner failure) instead of blocking forever.
                    status = container.wait(timeout=kill_grace_s)
                runtime_ms = int((time.monotonic() - start) * 1000)

                logs = container.logs().decode("utf-8", errors="replace")
                # A status payload without a code is treated as a failure.
                exit_code = int(status.get("StatusCode", 1))
                oom = was_oom_killed(container)
                n_passed, n_total = parse_report(work)

                return RunResult(
                    passed=(
                        not timed_out
                        and not oom
                        and exit_code == 0
                        # Zero collected tests must never count as a pass.
                        and n_total > 0
                        and n_passed == n_total
                    ),
                    num_tests_passed=n_passed,
                    num_tests_total=n_total,
                    runtime_ms=runtime_ms,
                    stdout=logs,
                    stderr="",
                    timed_out=timed_out,
                    oom=oom,
                    error=None,
                )
            finally:
                with contextlib.suppress(Exception):
                    container.remove(force=True)
    finally:
        # Release the client's HTTP session; this function runs once per
        # evaluation, so an unclosed client would leak connections.
        with contextlib.suppress(Exception):
            client.close()
|