"""Verifier agent: execute a candidate solution and turn the run into a score.

The sandbox and the composite reward are wrapped behind the same agent-style
interface that Proposer exposes. Week 5's ablations will compare verifier
backends (this sandbox, evalplus's native evaluator, a SWE-bench harness) by
swapping implementations behind this protocol, so callers never change.
"""

from __future__ import annotations

from dataclasses import dataclass
from typing import Protocol

from ..benchmarks.base import Task, to_sandbox_inputs
from ..rewards.composite import (
    DEFAULT_WEIGHTS,
    RewardWeights,
    composite_reward,
)
from ..rewards.correctness import RewardMode
from ..sandbox.runner import RunResult, run_code
from ..sandbox.subprocess_runner import run_code_subprocess


@dataclass(frozen=True)
class VerifyResult:
    """Result of checking one candidate solution against a task's tests.

    Attributes:
        passed: Mirrors ``RunResult.passed`` (True iff every test passed).
        reward: Composite scalar, suitable for RL / rejection sampling.
        raw: The underlying RunResult (stdout, stderr, counts, timing).
    """

    passed: bool
    reward: float
    raw: RunResult


def _score(
    code: str,
    run: RunResult,
    weights: RewardWeights,
    mode: RewardMode,
) -> VerifyResult:
    """Compute the composite reward for *run* and wrap it in a VerifyResult."""
    reward = composite_reward(code, run, weights=weights, mode=mode)
    return VerifyResult(passed=run.passed, reward=reward, raw=run)


class Verifier(Protocol):
    """Structural interface that every verifier backend satisfies."""

    def verify(self, code: str, task: Task, benchmark: str) -> VerifyResult:
        ...


class SandboxVerifier:
    """Reference Verifier: run code in the Docker sandbox, score with composite.

    Construction is cheap; build one per eval and reuse it across many
    ``verify`` calls.
    """

    def __init__(
        self,
        *,
        weights: RewardWeights = DEFAULT_WEIGHTS,
        mode: RewardMode = "binary",
    ) -> None:
        self._weights: RewardWeights = weights
        self._mode: RewardMode = mode

    def verify(self, code: str, task: Task, benchmark: str) -> VerifyResult:
        """Build the sandbox inputs, run them, and return the scored result."""
        solution_src, test_src = to_sandbox_inputs(task, code, benchmark)
        run = run_code(solution_src, test_src)
        return _score(code, run, self._weights, self._mode)


class SubprocessVerifier:
    """Docker-free Verifier that runs the tests via a pytest subprocess.

    Intended for CHTC and other hosts without Docker, where the evaluated code
    isn't adversarial (stock model on public benchmarks). Faster than
    SandboxVerifier because there is no container startup, but less isolated,
    so DO NOT use it for Week 4 GRPO training.
    """

    def __init__(
        self,
        *,
        weights: RewardWeights = DEFAULT_WEIGHTS,
        mode: RewardMode = "binary",
        timeout_s: float = 10.0,
    ) -> None:
        self._weights: RewardWeights = weights
        self._mode: RewardMode = mode
        self._timeout_s: float = timeout_s

    def verify(self, code: str, task: Task, benchmark: str) -> VerifyResult:
        """Run the task's tests in a subprocess and return the scored result."""
        solution_src, test_src = to_sandbox_inputs(task, code, benchmark)
        run = run_code_subprocess(
            solution_src,
            test_src,
            timeout_s=self._timeout_s,
        )
        return _score(code, run, self._weights, self._mode)


class NoOpVerifier:
    """Placeholder Verifier that always reports zero reward.

    For flows that score elsewhere: the Coordinator runs on a host without our
    Docker sandbox (e.g. CHTC execute nodes) and final scoring happens via
    ``evalplus`` after the loop. The Coordinator's retry decisions then come
    from the Reviewer alone, and SolveResult.final_verify ends up
    informational-only.
    """

    def verify(self, code: str, task: Task, benchmark: str) -> VerifyResult:
        """Ignore all arguments and return a fixed failing, zero-reward result."""
        del code, task, benchmark  # accepted only to satisfy the protocol
        empty_run = RunResult(
            passed=False,
            num_tests_passed=0,
            num_tests_total=0,
            runtime_ms=0,
            stdout="",
            stderr="",
            timed_out=False,
            oom=False,
            error=None,
        )
        return VerifyResult(passed=False, reward=0.0, raw=empty_run)