# NOTE: repository page header captured with this file; not part of the module.
# dmaheshwar22's picture
# deploy: replace template with real demo
# 0dd7c80 verified
"""Verifier agent — runs a candidate solution in the sandbox and scores it.
Wraps the sandbox + composite reward behind the same agent-style interface
that Proposer exposes. Week 5's ablations will compare verifier backends
(this sandbox, evalplus's native evaluator, a SWE-bench harness) by
swapping implementations behind this protocol — callers never change.
"""
from __future__ import annotations
from dataclasses import dataclass
from typing import Protocol
from ..benchmarks.base import Task, to_sandbox_inputs
from ..rewards.composite import (
DEFAULT_WEIGHTS,
RewardWeights,
composite_reward,
)
from ..rewards.correctness import RewardMode
from ..sandbox.runner import RunResult, run_code
from ..sandbox.subprocess_runner import run_code_subprocess
@dataclass(frozen=True)
class VerifyResult:
    """Outcome of verifying a candidate solution against a task's tests.

    The dataclass is frozen, so a result cannot be mutated after it has
    been handed back to the caller.

    Fields:
        passed: True iff every test passed (equivalent to RunResult.passed).
        reward: Composite scalar, suitable for RL / rejection sampling.
        raw: The underlying RunResult — stdout, stderr, counts, timing.
    """

    passed: bool
    reward: float
    raw: RunResult
class Verifier(Protocol):
    """Protocol every verifier backend implements.

    This is a structural interface: any object exposing a compatible
    ``verify`` method satisfies it, without needing to inherit from it.
    """

    def verify(self, code: str, task: Task, benchmark: str) -> VerifyResult:
        """Verify a candidate solution and return its scored outcome.

        Args:
            code: Source text of the candidate solution.
            task: The benchmark task the candidate is checked against.
            benchmark: Identifier of the benchmark the task belongs to
                (presumably a benchmark name; confirm against callers).

        Returns:
            A VerifyResult carrying pass/fail, the reward, and the raw run.
        """
        ...
class SandboxVerifier:
    """Reference Verifier: run code in the Docker sandbox, score with composite.

    Construct once per eval (cheap); reuse across many `verify` calls.
    """

    def __init__(
        self,
        *,
        weights: RewardWeights = DEFAULT_WEIGHTS,
        mode: RewardMode = "binary",
    ) -> None:
        """Store the scoring configuration used by every `verify` call.

        Args:
            weights: Weights forwarded to `composite_reward`.
            mode: Reward mode forwarded to `composite_reward`.
        """
        self._weights: RewardWeights = weights
        self._mode: RewardMode = mode

    def verify(self, code: str, task: Task, benchmark: str) -> VerifyResult:
        """Build the test file, run it in sandbox, return scored result.

        Args:
            code: Source text of the candidate solution.
            task: The task whose tests the candidate is run against.
            benchmark: Benchmark identifier passed to `to_sandbox_inputs`.

        Returns:
            A VerifyResult whose `passed` mirrors the run's `passed` flag
            and whose `raw` is the RunResult returned by `run_code`.
        """
        solution, tests = to_sandbox_inputs(task, code, benchmark)
        raw = run_code(solution, tests)
        # The reward is computed from the candidate `code` as submitted,
        # not from the `solution` text that was assembled for the sandbox.
        reward = composite_reward(
            code,
            raw,
            weights=self._weights,
            mode=self._mode,
        )
        return VerifyResult(passed=raw.passed, reward=reward, raw=raw)
class SubprocessVerifier:
    """Docker-free Verifier using pytest subprocess.

    For CHTC and other hosts without Docker, where eval code isn't
    adversarial (stock model on public benchmarks). Faster than SandboxVerifier
    since there's no container startup; less isolated, so DO NOT use for
    Week 4 GRPO training.
    """

    def __init__(
        self,
        *,
        weights: RewardWeights = DEFAULT_WEIGHTS,
        mode: RewardMode = "binary",
        timeout_s: float = 10.0,
    ) -> None:
        """Store the scoring configuration and the per-run timeout.

        Args:
            weights: Weights forwarded to `composite_reward`.
            mode: Reward mode forwarded to `composite_reward`.
            timeout_s: Timeout forwarded to `run_code_subprocess`
                (presumably in seconds, per the name; confirm in the runner).
        """
        self._weights: RewardWeights = weights
        self._mode: RewardMode = mode
        self._timeout_s: float = timeout_s

    def verify(self, code: str, task: Task, benchmark: str) -> VerifyResult:
        """Build the test file, run it in a subprocess, return scored result.

        Args:
            code: Source text of the candidate solution.
            task: The task whose tests the candidate is run against.
            benchmark: Benchmark identifier passed to `to_sandbox_inputs`.

        Returns:
            A VerifyResult whose `passed` mirrors the run's `passed` flag
            and whose `raw` is the RunResult from `run_code_subprocess`.
        """
        solution, tests = to_sandbox_inputs(task, code, benchmark)
        raw = run_code_subprocess(solution, tests, timeout_s=self._timeout_s)
        # The reward is computed from the candidate `code` as submitted,
        # not from the `solution` text that was assembled for the runner.
        reward = composite_reward(
            code,
            raw,
            weights=self._weights,
            mode=self._mode,
        )
        return VerifyResult(passed=raw.passed, reward=reward, raw=raw)
class NoOpVerifier:
    """Returns a zero-reward placeholder — for flows that score elsewhere.

    Used when the Coordinator runs on a host without our Docker sandbox
    (e.g. CHTC execute nodes) and final scoring happens via `evalplus`
    after the loop. The Coordinator's retry decisions come from the Reviewer
    alone; SolveResult.final_verify ends up informational-only.
    """

    def verify(self, code: str, task: Task, benchmark: str) -> VerifyResult:
        """Return a fixed placeholder result without running anything.

        The arguments are accepted only to match the Verifier signature;
        none of them is inspected.

        Returns:
            A VerifyResult with `passed=False`, `reward=0.0`, and an empty
            RunResult (zero tests, no output, no timeout/OOM/error flags).
        """
        del code, task, benchmark  # unused: signature kept for the protocol
        placeholder_run = RunResult(
            passed=False,
            num_tests_passed=0,
            num_tests_total=0,
            runtime_ms=0,
            stdout="",
            stderr="",
            timed_out=False,
            oom=False,
            error=None,
        )
        return VerifyResult(passed=False, reward=0.0, raw=placeholder_run)