Spaces:

dmaheshwar22
/

verifiable-rl-coder

Running

File size: 4,208 Bytes

0dd7c80

"""Verifier agent — runs a candidate solution in the sandbox and scores it.

Wraps the sandbox + composite reward behind the same agent-style interface
that Proposer exposes. Week 5's ablations will compare verifier backends
(this sandbox, evalplus's native evaluator, a SWE-bench harness) by
swapping implementations behind this protocol — callers never change.
"""

from __future__ import annotations

from dataclasses import dataclass
from typing import Protocol

from ..benchmarks.base import Task, to_sandbox_inputs
from ..rewards.composite import (
    DEFAULT_WEIGHTS,
    RewardWeights,
    composite_reward,
)
from ..rewards.correctness import RewardMode
from ..sandbox.runner import RunResult, run_code
from ..sandbox.subprocess_runner import run_code_subprocess


@dataclass(frozen=True)
class VerifyResult:
    """Immutable record of one verification run.

    Instances are frozen: attributes cannot be reassigned after construction.

    Attributes:
        passed: True iff every test passed (equivalent to RunResult.passed).
        reward: Composite scalar score, suitable for RL / rejection sampling.
        raw: The underlying RunResult, carrying stdout, stderr, test counts
            and timing for callers that need more than the verdict.
    """

    passed: bool  # overall pass/fail verdict
    reward: float  # scalar score derived from the run
    raw: RunResult  # full execution record behind the verdict


class Verifier(Protocol):
    """Structural interface that every verifier backend satisfies.

    Any object exposing a compatible ``verify`` method conforms; explicit
    subclassing is not required.
    """

    def verify(self, code: str, task: Task, benchmark: str) -> VerifyResult:
        """Score the candidate ``code`` for ``task`` from ``benchmark``.

        Args:
            code: Candidate solution source.
            task: The task the candidate is meant to solve.
            benchmark: Name of the benchmark the task belongs to.

        Returns:
            A VerifyResult describing the outcome.
        """
        ...


class SandboxVerifier:
    """Reference Verifier backed by the Docker sandbox and composite reward.

    Construction only stores configuration, so build one instance per eval
    and reuse it across many `verify` calls.
    """

    def __init__(
        self,
        *,
        weights: RewardWeights = DEFAULT_WEIGHTS,
        mode: RewardMode = "binary",
    ) -> None:
        """Store the scoring configuration.

        Args:
            weights: Weights handed to `composite_reward` on every call.
            mode: Reward mode handed to `composite_reward` on every call.
        """
        self._weights: RewardWeights = weights
        self._mode: RewardMode = mode

    def verify(self, code: str, task: Task, benchmark: str) -> VerifyResult:
        """Run the candidate in the sandbox and return the scored outcome.

        Args:
            code: Candidate solution source.
            task: Task whose tests the candidate is run against.
            benchmark: Benchmark name, forwarded to `to_sandbox_inputs`.

        Returns:
            VerifyResult whose `passed` mirrors the run's `passed` flag,
            whose `reward` is the composite score, and whose `raw` is the
            run result itself.
        """
        # Turn (task, code) into the solution/test pair the runner expects.
        solution_src, test_src = to_sandbox_inputs(task, code, benchmark)
        run_result = run_code(solution_src, test_src)
        score = composite_reward(
            code, run_result, weights=self._weights, mode=self._mode
        )
        return VerifyResult(
            passed=run_result.passed,
            reward=score,
            raw=run_result,
        )


class SubprocessVerifier:
    """Verifier that runs tests through a pytest subprocess, without Docker.

    Intended for CHTC and other hosts where Docker is unavailable and the
    evaluated code isn't adversarial (stock model on public benchmarks).
    Faster than SandboxVerifier because no container has to start, but
    less isolated — DO NOT use for Week 4 GRPO training.
    """

    def __init__(
        self,
        *,
        weights: RewardWeights = DEFAULT_WEIGHTS,
        mode: RewardMode = "binary",
        timeout_s: float = 10.0,
    ) -> None:
        """Store the scoring configuration and run timeout.

        Args:
            weights: Weights handed to `composite_reward` on every call.
            mode: Reward mode handed to `composite_reward` on every call.
            timeout_s: Timeout forwarded to `run_code_subprocess` as
                its `timeout_s` argument.
        """
        self._weights: RewardWeights = weights
        self._mode: RewardMode = mode
        self._timeout_s: float = timeout_s

    def verify(self, code: str, task: Task, benchmark: str) -> VerifyResult:
        """Run the candidate in a subprocess and return the scored outcome.

        Args:
            code: Candidate solution source.
            task: Task whose tests the candidate is run against.
            benchmark: Benchmark name, forwarded to `to_sandbox_inputs`.

        Returns:
            VerifyResult whose `passed` mirrors the run's `passed` flag,
            whose `reward` is the composite score, and whose `raw` is the
            run result itself.
        """
        solution_src, test_src = to_sandbox_inputs(task, code, benchmark)
        run_result = run_code_subprocess(
            solution_src,
            test_src,
            timeout_s=self._timeout_s,
        )
        score = composite_reward(
            code,
            run_result,
            weights=self._weights,
            mode=self._mode,
        )
        return VerifyResult(
            passed=run_result.passed,
            reward=score,
            raw=run_result,
        )


class NoOpVerifier:
    """Verifier stand-in that never runs code and always reports zero reward.

    For flows that score elsewhere: when the Coordinator runs on a host
    without our Docker sandbox (e.g. CHTC execute nodes) and final scoring
    happens via `evalplus` after the loop. The Coordinator's retry decisions
    then come from the Reviewer alone, and SolveResult.final_verify ends up
    informational-only.
    """

    def verify(self, code: str, task: Task, benchmark: str) -> VerifyResult:
        """Return a fixed failing, zero-reward placeholder result.

        All arguments are accepted only for interface compatibility and are
        ignored; nothing is executed.

        Returns:
            VerifyResult with `passed=False`, `reward=0.0`, and an empty
            RunResult (zero tests, no output, no error flags).
        """
        del code, task, benchmark  # intentionally unused
        placeholder_run = RunResult(
            passed=False,
            num_tests_passed=0,
            num_tests_total=0,
            runtime_ms=0,
            stdout="",
            stderr="",
            timed_out=False,
            oom=False,
            error=None,
        )
        return VerifyResult(passed=False, reward=0.0, raw=placeholder_run)