Spaces:
Sleeping
Sleeping
| """Docker-based sandbox runner for untrusted, model-generated code. | |
| Every reward signal in the RL training loop flows through `run_code`. A bug | |
| here shows up as reward hacking: the model will eventually find any accounting | |
| or isolation mistake and exploit it. Treat this module as security-critical. | |
| """ | |
| from __future__ import annotations | |
| import contextlib | |
| import os | |
| import tempfile | |
| import time | |
| from dataclasses import dataclass | |
| from pathlib import Path | |
| # `docker` is only required by run_code() at runtime. Importing it lazily | |
| # means RunResult (used everywhere) stays importable in environments without | |
| # Docker installed (HF Spaces, CI, light dev installs). | |
| from .resource_limits import ( | |
| DEFAULT_MEM_MB, | |
| DEFAULT_TIMEOUT_S, | |
| SANDBOX_IMAGE, | |
| WAIT_BUFFER_S, | |
| container_kwargs, | |
| ) | |
| from .test_executor import parse_report, pytest_command, was_oom_killed | |
| class RunResult: | |
| """Outcome of executing a candidate solution against a test suite.""" | |
| passed: bool | |
| num_tests_passed: int | |
| num_tests_total: int | |
| runtime_ms: int | |
| stdout: str | |
| stderr: str | |
| timed_out: bool | |
| oom: bool | |
| error: str | None | |
def run_code(
    code: str,
    tests: str,
    *,
    timeout_s: float = DEFAULT_TIMEOUT_S,
    mem_mb: int = DEFAULT_MEM_MB,
) -> RunResult:
    """Execute `tests` against `code` inside an isolated Docker container.

    User-code failures (syntax errors, failing tests, timeouts, OOM) are
    reflected in the result, not raised. Only internal runner failures
    (Docker daemon unreachable, image missing, etc.) raise.

    Args:
        code: Candidate solution source, written to ``solution.py``.
        tests: Test-suite source, written to ``test_solution.py``.
        timeout_s: Time budget for the run; the runner waits for
            ``timeout_s + WAIT_BUFFER_S`` before killing the container.
        mem_mb: Memory limit forwarded to ``container_kwargs``.

    Returns:
        A `RunResult`. ``passed`` is True only if the run did not time out,
        was not OOM-killed, exited with status 0, reported at least one test,
        and every reported test passed.
    """
    import docker  # type: ignore[import-untyped]  # lazy — keeps RunResult importable without docker installed

    client = docker.from_env()
    container = None
    try:
        # ignore_cleanup_errors: belt-and-braces in case any file somehow
        # ends up owned by the sandbox uid (e.g. future plugin writes).
        with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as workdir:
            work = Path(workdir)
            # Write explicitly as UTF-8: the default would be the host locale
            # encoding, which can fail on (or mis-encode) non-ASCII candidate
            # code, while Python reads source files as UTF-8 by default.
            (work / "solution.py").write_text(code, encoding="utf-8")
            (work / "test_solution.py").write_text(tests, encoding="utf-8")

            # Container runs as UID 1000 (see Dockerfile). The host-side workdir
            # might be owned by a different UID, so open it up so the sandbox
            # user can write the pytest JSON report into the bind mount.
            # NOTE(review): the report is read back from this same directory,
            # which untrusted code can also write to — confirm parse_report
            # cannot be fooled by a report forged by the candidate code.
            os.chmod(work, 0o777)

            container = client.containers.run(
                SANDBOX_IMAGE,
                command=pytest_command(),
                volumes={str(work): {"bind": "/work", "mode": "rw"}},
                working_dir="/work",
                detach=True,
                **container_kwargs(mem_mb=mem_mb),
            )

            timed_out = False
            start = time.monotonic()
            try:
                status = container.wait(timeout=timeout_s + WAIT_BUFFER_S)
            except Exception:
                # NOTE(review): any failure of wait() is treated as a timeout,
                # including non-timeout API errors — consider narrowing this.
                timed_out = True
                with contextlib.suppress(Exception):
                    container.kill()
                # Collect the final status of the (now killed) container.
                status = container.wait()
            runtime_ms = int((time.monotonic() - start) * 1000)

            logs = container.logs().decode("utf-8", errors="replace")
            exit_code = int(status.get("StatusCode", 1))
            oom = was_oom_killed(container)
            n_passed, n_total = parse_report(work)

            return RunResult(
                passed=(
                    not timed_out
                    and not oom
                    and exit_code == 0
                    # An empty report must never count as success.
                    and n_total > 0
                    and n_passed == n_total
                ),
                num_tests_passed=n_passed,
                num_tests_total=n_total,
                runtime_ms=runtime_ms,
                stdout=logs,
                stderr="",
                timed_out=timed_out,
                oom=oom,
                error=None,
            )
    finally:
        # Best-effort cleanup: never let teardown mask the real outcome.
        if container is not None:
            with contextlib.suppress(Exception):
                container.remove(force=True)
        # Release the client's connections; a fresh client is created on every
        # call, so leaving it open leaks sockets across a long training run.
        with contextlib.suppress(Exception):
            client.close()