Spaces:
Sleeping
Sleeping
| """Verifier agent — runs a candidate solution in the sandbox and scores it. | |
| Wraps the sandbox + composite reward behind the same agent-style interface | |
| that Proposer exposes. Week 5's ablations will compare verifier backends | |
| (this sandbox, evalplus's native evaluator, a SWE-bench harness) by | |
| swapping implementations behind this protocol — callers never change. | |
| """ | |
| from __future__ import annotations | |
| from dataclasses import dataclass | |
| from typing import Protocol | |
| from ..benchmarks.base import Task, to_sandbox_inputs | |
| from ..rewards.composite import ( | |
| DEFAULT_WEIGHTS, | |
| RewardWeights, | |
| composite_reward, | |
| ) | |
| from ..rewards.correctness import RewardMode | |
| from ..sandbox.runner import RunResult, run_code | |
| from ..sandbox.subprocess_runner import run_code_subprocess | |
@dataclass
class VerifyResult:
    """Outcome of verifying a candidate solution against a task's tests.

    Declared as a dataclass so that the verifiers in this module can build
    it with keyword arguments
    (``VerifyResult(passed=..., reward=..., raw=...)``); without the
    decorator the class has no generated ``__init__`` and those calls fail.

    Fields:
        passed: True iff every test passed (equivalent to RunResult.passed).
        reward: Composite scalar, suitable for RL / rejection sampling.
        raw: The underlying RunResult — stdout, stderr, counts, timing.
    """

    passed: bool
    reward: float
    raw: RunResult
class Verifier(Protocol):
    """Structural interface that every verifier backend satisfies.

    Any object exposing a compatible ``verify`` method conforms; no
    inheritance from this class is required.
    """

    def verify(self, code: str, task: Task, benchmark: str) -> VerifyResult:
        """Verify the candidate ``code`` for ``task`` and return the result."""
        ...
class SandboxVerifier:
    """Reference Verifier backed by the Docker sandbox and composite reward.

    Instances are cheap to build: construct one per eval and call
    `verify` on it as many times as needed.
    """

    def __init__(
        self,
        *,
        weights: RewardWeights = DEFAULT_WEIGHTS,
        mode: RewardMode = "binary",
    ) -> None:
        """Store the scoring configuration used by every `verify` call.

        Args:
            weights: Weights forwarded to ``composite_reward``.
            mode: Reward mode forwarded to ``composite_reward``.
        """
        self._mode: RewardMode = mode
        self._weights: RewardWeights = weights

    def verify(self, code: str, task: Task, benchmark: str) -> VerifyResult:
        """Run ``code`` against the task's tests in the sandbox and score it.

        Args:
            code: Candidate solution source.
            task: Task the candidate is meant to solve.
            benchmark: Benchmark name, passed to ``to_sandbox_inputs``.

        Returns:
            A VerifyResult carrying the pass flag, the composite reward and
            the raw run result.
        """
        solution_src, tests_src = to_sandbox_inputs(task, code, benchmark)
        run_result = run_code(solution_src, tests_src)
        # Score before assembling the result, mirroring run -> score -> wrap.
        score = composite_reward(
            code, run_result, weights=self._weights, mode=self._mode
        )
        return VerifyResult(
            passed=run_result.passed,
            reward=score,
            raw=run_result,
        )
class SubprocessVerifier:
    """Verifier that runs tests via a pytest subprocess instead of Docker.

    Intended for CHTC and other hosts without Docker, where the evaluated
    code isn't adversarial (stock model on public benchmarks). It skips
    container startup, so it is faster than SandboxVerifier, but it is also
    less isolated — DO NOT use it for Week 4 GRPO training.
    """

    def __init__(
        self,
        *,
        weights: RewardWeights = DEFAULT_WEIGHTS,
        mode: RewardMode = "binary",
        timeout_s: float = 10.0,
    ) -> None:
        """Store the scoring configuration and the per-run timeout.

        Args:
            weights: Weights forwarded to ``composite_reward``.
            mode: Reward mode forwarded to ``composite_reward``.
            timeout_s: Timeout forwarded to ``run_code_subprocess``.
        """
        self._timeout_s: float = timeout_s
        self._mode: RewardMode = mode
        self._weights: RewardWeights = weights

    def verify(self, code: str, task: Task, benchmark: str) -> VerifyResult:
        """Run ``code`` against the task's tests in a subprocess and score it.

        Args:
            code: Candidate solution source.
            task: Task the candidate is meant to solve.
            benchmark: Benchmark name, passed to ``to_sandbox_inputs``.

        Returns:
            A VerifyResult carrying the pass flag, the composite reward and
            the raw run result.
        """
        solution_src, tests_src = to_sandbox_inputs(task, code, benchmark)
        run_result = run_code_subprocess(
            solution_src,
            tests_src,
            timeout_s=self._timeout_s,
        )
        score = composite_reward(
            code,
            run_result,
            weights=self._weights,
            mode=self._mode,
        )
        return VerifyResult(passed=run_result.passed, reward=score, raw=run_result)
class NoOpVerifier:
    """Verifier stand-in that always reports a zero-reward placeholder.

    For flows that score elsewhere: when the Coordinator runs on a host
    without our Docker sandbox (e.g. CHTC execute nodes), final scoring
    happens via `evalplus` after the loop. Retry decisions then come from
    the Reviewer alone, and SolveResult.final_verify is informational-only.
    """

    def verify(self, code: str, task: Task, benchmark: str) -> VerifyResult:
        """Ignore every argument and return a failed, zero-reward result.

        Nothing is executed; the embedded RunResult reports zero tests, zero
        runtime, empty output streams and no error.
        """
        del code, task, benchmark  # intentionally unused
        placeholder_run = RunResult(
            passed=False,
            num_tests_passed=0,
            num_tests_total=0,
            runtime_ms=0,
            stdout="",
            stderr="",
            timed_out=False,
            oom=False,
            error=None,
        )
        return VerifyResult(passed=False, reward=0.0, raw=placeholder_run)