File size: 4,061 Bytes
0dd7c80
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b318b1d
 
 
0dd7c80
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b318b1d
0dd7c80
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
"""Docker-based sandbox runner for untrusted, model-generated code.

Every reward signal in the RL training loop flows through `run_code`. A bug
here shows up as reward hacking: the model will eventually find any accounting
or isolation mistake and exploit it. Treat this module as security-critical.
"""

from __future__ import annotations

import contextlib
import os
import tempfile
import time
from dataclasses import dataclass
from pathlib import Path

# `docker` is only required by run_code() at runtime. Importing it lazily
# means RunResult (used everywhere) stays importable in environments without
# Docker installed (HF Spaces, CI, light dev installs).

from .resource_limits import (
    DEFAULT_MEM_MB,
    DEFAULT_TIMEOUT_S,
    SANDBOX_IMAGE,
    WAIT_BUFFER_S,
    container_kwargs,
)
from .test_executor import parse_report, pytest_command, was_oom_killed


@dataclass(frozen=True)
class RunResult:
    """Outcome of executing a candidate solution against a test suite."""

    passed: bool
    num_tests_passed: int
    num_tests_total: int
    runtime_ms: int
    stdout: str
    stderr: str
    timed_out: bool
    oom: bool
    error: str | None


def run_code(
    code: str,
    tests: str,
    *,
    timeout_s: float = DEFAULT_TIMEOUT_S,
    mem_mb: int = DEFAULT_MEM_MB,
) -> RunResult:
    """Execute `tests` against `code` inside an isolated Docker container.

    User-code failures (syntax errors, failing tests, timeouts, OOM) are
    reflected in the result, not raised. Only internal runner failures
    (Docker daemon unreachable, image missing, etc.) raise.

    Args:
        code: Source of the candidate solution; written to `solution.py`.
        tests: Source of the test module; written to `test_solution.py`.
        timeout_s: Time budget for the run in seconds. The runner waits
            `timeout_s + WAIT_BUFFER_S` before killing the container.
        mem_mb: Memory limit forwarded to `container_kwargs`.

    Returns:
        A `RunResult`. `stdout` holds the container's log output as returned
        by `container.logs()`; `stderr` is always empty and `error` is always
        None.

    Raises:
        Exceptions from the Docker SDK (client creation, container start,
        API errors while waiting or reading logs) propagate to the caller.
    """
    import docker  # type: ignore[import-untyped]  # lazy — keeps RunResult importable without docker installed

    client = docker.from_env()
    container = None
    try:
        # ignore_cleanup_errors: belt-and-braces in case any file somehow
        # ends up owned by the sandbox uid (e.g. future plugin writes).
        with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as workdir:
            work = Path(workdir)
            # Python source is UTF-8 by default; writing with the host's
            # locale encoding could fail on, or corrupt, non-ASCII code.
            (work / "solution.py").write_text(code, encoding="utf-8")
            (work / "test_solution.py").write_text(tests, encoding="utf-8")
            # Container runs as UID 1000 (see Dockerfile). The host-side workdir
            # might be owned by a different UID, so open it up so the sandbox
            # user can write the pytest JSON report into the bind mount.
            os.chmod(work, 0o777)

            container = client.containers.run(
                SANDBOX_IMAGE,
                command=pytest_command(),
                volumes={str(work): {"bind": "/work", "mode": "rw"}},
                working_dir="/work",
                detach=True,
                **container_kwargs(mem_mb=mem_mb),
            )

            timed_out = False
            start = time.monotonic()
            try:
                status = container.wait(timeout=timeout_s + WAIT_BUFFER_S)
            except docker.errors.APIError:
                # The daemon answered with an error: that is a runner failure,
                # not a user timeout, so it must not be scored as one.
                raise
            except Exception:
                # The SDK signals an expired wait with a transport-level
                # exception (not a docker.errors type), so anything left here
                # is treated as the time budget being exhausted.
                timed_out = True
                with contextlib.suppress(Exception):
                    container.kill()
                status = container.wait()
            runtime_ms = int((time.monotonic() - start) * 1000)

            logs = container.logs().decode("utf-8", errors="replace")
            exit_code = int(status.get("StatusCode", 1))
            oom = was_oom_killed(container)
            # NOTE(review): the report is read from a bind mount that the
            # sandboxed process can write to, and the exit code is produced by
            # the same process — confirm parse_report cannot be spoofed by the
            # code under test.
            n_passed, n_total = parse_report(work)

            return RunResult(
                passed=(
                    not timed_out
                    and not oom
                    and exit_code == 0
                    and n_total > 0
                    and n_passed == n_total
                ),
                num_tests_passed=n_passed,
                num_tests_total=n_total,
                runtime_ms=runtime_ms,
                stdout=logs,
                stderr="",
                timed_out=timed_out,
                oom=oom,
                error=None,
            )
    finally:
        if container is not None:
            with contextlib.suppress(Exception):
                container.remove(force=True)
        # Release the client's HTTP session; without this every call leaks
        # a connection pool to the Docker daemon.
        with contextlib.suppress(Exception):
            client.close()