File size: 4,208 Bytes
0dd7c80
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
"""Verifier agent — runs a candidate solution in the sandbox and scores it.

Wraps the sandbox + composite reward behind the same agent-style interface
that Proposer exposes. Week 5's ablations will compare verifier backends
(this sandbox, evalplus's native evaluator, a SWE-bench harness) by
swapping implementations behind this protocol — callers never change.
"""

from __future__ import annotations

from dataclasses import dataclass
from typing import Protocol

from ..benchmarks.base import Task, to_sandbox_inputs
from ..rewards.composite import (
    DEFAULT_WEIGHTS,
    RewardWeights,
    composite_reward,
)
from ..rewards.correctness import RewardMode
from ..sandbox.runner import RunResult, run_code
from ..sandbox.subprocess_runner import run_code_subprocess


@dataclass(frozen=True)
class VerifyResult:
    """Immutable record describing how a candidate solution fared.

    Attributes:
        passed: Overall verdict; True iff every test passed (intended to be
            equivalent to RunResult.passed).
        reward: Composite scalar score, suitable for RL / rejection sampling.
        raw: The underlying RunResult, carrying stdout, stderr, test counts
            and timing for callers that need more than the verdict.
    """

    # Overall pass/fail verdict.
    passed: bool
    # Scalar reward attached to this run.
    reward: float
    # Full execution record backing the two fields above.
    raw: RunResult


class Verifier(Protocol):
    """Structural interface that every verifier backend satisfies.

    Backends do not need to inherit from this class; exposing a ``verify``
    method with a compatible signature is enough for static type checkers.
    """

    def verify(self, code: str, task: Task, benchmark: str) -> VerifyResult:
        """Verify candidate ``code`` for ``task`` and return the outcome.

        Args:
            code: Source text of the candidate solution.
            task: The task the candidate is meant to solve.
            benchmark: Name of the benchmark the task belongs to.

        Returns:
            A VerifyResult describing the verdict, reward and raw run data.
        """
        ...


class SandboxVerifier:
    """Reference Verifier backend: Docker sandbox run + composite scoring.

    Instances only hold scoring configuration, so build one per eval
    (construction is cheap) and reuse it for every `verify` call.
    """

    def __init__(
        self,
        *,
        weights: RewardWeights = DEFAULT_WEIGHTS,
        mode: RewardMode = "binary",
    ) -> None:
        """Remember the scoring configuration used by `verify`.

        Args:
            weights: Weights forwarded to `composite_reward`.
            mode: Reward mode forwarded to `composite_reward`.
        """
        self._mode: RewardMode = mode
        self._weights: RewardWeights = weights

    def verify(self, code: str, task: Task, benchmark: str) -> VerifyResult:
        """Build the test file, run it in the sandbox, and score the run.

        Args:
            code: Source text of the candidate solution.
            task: The task whose tests the candidate is run against.
            benchmark: Benchmark name, forwarded to `to_sandbox_inputs`.

        Returns:
            A VerifyResult whose `passed` mirrors the run's own `passed`
            flag and whose `reward` is the composite score for that run.
        """
        solution_src, test_src = to_sandbox_inputs(task, code, benchmark)
        run_result = run_code(solution_src, test_src)
        # The reward is computed from the original candidate text plus the
        # run outcome, using the configuration captured at construction.
        score = composite_reward(
            code,
            run_result,
            weights=self._weights,
            mode=self._mode,
        )
        return VerifyResult(
            passed=run_result.passed,
            reward=score,
            raw=run_result,
        )


class SubprocessVerifier:
    """Verifier backend that needs no Docker: tests run via pytest subprocess.

    Meant for CHTC and other hosts without Docker, where the code being
    evaluated isn't adversarial (stock model on public benchmarks). It is
    faster than SandboxVerifier because no container has to start, but it is
    less isolated, so DO NOT use for Week 4 GRPO training.
    """

    def __init__(
        self,
        *,
        weights: RewardWeights = DEFAULT_WEIGHTS,
        mode: RewardMode = "binary",
        timeout_s: float = 10.0,
    ) -> None:
        """Remember the scoring configuration and the run timeout.

        Args:
            weights: Weights forwarded to `composite_reward`.
            mode: Reward mode forwarded to `composite_reward`.
            timeout_s: Timeout, in seconds, forwarded to
                `run_code_subprocess` on every `verify` call.
        """
        self._timeout_s: float = timeout_s
        self._mode: RewardMode = mode
        self._weights: RewardWeights = weights

    def verify(self, code: str, task: Task, benchmark: str) -> VerifyResult:
        """Run the candidate's tests in a subprocess and score the run.

        Args:
            code: Source text of the candidate solution.
            task: The task whose tests the candidate is run against.
            benchmark: Benchmark name, forwarded to `to_sandbox_inputs`.

        Returns:
            A VerifyResult whose `passed` mirrors the run's own `passed`
            flag and whose `reward` is the composite score for that run.
        """
        solution_src, test_src = to_sandbox_inputs(task, code, benchmark)
        run_result = run_code_subprocess(
            solution_src,
            test_src,
            timeout_s=self._timeout_s,
        )
        score = composite_reward(
            code,
            run_result,
            weights=self._weights,
            mode=self._mode,
        )
        return VerifyResult(
            passed=run_result.passed,
            reward=score,
            raw=run_result,
        )


class NoOpVerifier:
    """Verifier that runs nothing and always reports a zero-reward placeholder.

    Intended for flows that score elsewhere: when the Coordinator runs on a
    host without our Docker sandbox (e.g. CHTC execute nodes) and final
    scoring happens via `evalplus` after the loop. In that setup the
    Coordinator's retry decisions come from the Reviewer alone, and
    SolveResult.final_verify ends up informational-only.
    """

    def verify(self, code: str, task: Task, benchmark: str) -> VerifyResult:
        """Return a fixed failing, zero-reward result without running anything.

        All three arguments are accepted only to match the Verifier
        interface; none of them influences the result.
        """
        del code, task, benchmark  # deliberately unused

        # An empty run record: no tests executed, no output, no failure flags.
        placeholder = RunResult(
            passed=False,
            num_tests_passed=0,
            num_tests_total=0,
            runtime_ms=0,
            stdout="",
            stderr="",
            timed_out=False,
            oom=False,
            error=None,
        )
        return VerifyResult(passed=False, reward=0.0, raw=placeholder)