Spaces:

dmaheshwar22
/

verifiable-rl-coder

Sleeping

App Files Files Community

verifiable-rl-coder / src /verifiable_rl_coder /agents /verifier.py

dmaheshwar22

deploy: replace template with real demo

0dd7c80 verified 26 days ago

raw

history blame contribute delete

4.21 kB

	"""Verifier agent — runs a candidate solution in the sandbox and scores it.

	Wraps the sandbox + composite reward behind the same agent-style interface
	that Proposer exposes. Week 5's ablations will compare verifier backends
	(this sandbox, evalplus's native evaluator, a SWE-bench harness) by
	swapping implementations behind this protocol — callers never change.
	"""

	from __future__ import annotations

	from dataclasses import dataclass
	from typing import Protocol

	from ..benchmarks.base import Task, to_sandbox_inputs
	from ..rewards.composite import (
	DEFAULT_WEIGHTS,
	RewardWeights,
	composite_reward,
	)
	from ..rewards.correctness import RewardMode
	from ..sandbox.runner import RunResult, run_code
	from ..sandbox.subprocess_runner import run_code_subprocess


	@dataclass(frozen=True)
	class VerifyResult:
	    """Immutable record of a single verification of a candidate solution.

	    Bundles the pass/fail verdict, the scalar reward, and the raw run
	    output so callers can consume whichever level of detail they need.

	    Attributes:
	        passed: True iff every test passed; mirrors ``RunResult.passed``.
	        reward: Composite scalar score, usable for RL or rejection sampling.
	        raw: The ``RunResult`` behind the verdict (stdout, stderr, test
	            counts, timing).
	    """

	    # Overall verdict for the candidate.
	    passed: bool
	    # Scalar score attached to the run.
	    reward: float
	    # Unprocessed run output, kept for inspection and debugging.
	    raw: RunResult


	class Verifier(Protocol):
	    """Structural interface that every verifier backend satisfies.

	    Any object exposing a compatible ``verify`` method conforms; no
	    inheritance from this class is required.
	    """

	    def verify(self, code: str, task: Task, benchmark: str) -> VerifyResult:
	        """Check ``code`` as a candidate solution to ``task`` and score it.

	        Args:
	            code: Source text of the candidate solution.
	            task: The task the candidate is meant to solve.
	            benchmark: Benchmark identifier accompanying the task.

	        Returns:
	            A ``VerifyResult`` carrying the verdict, reward and raw run.
	        """
	        ...


	class SandboxVerifier:
	    """Reference ``Verifier`` backend: Docker sandbox run + composite reward.

	    Instances only hold scoring configuration, so building one is cheap;
	    create it once per eval and call ``verify`` as many times as needed.
	    """

	    def __init__(
	        self,
	        *,
	        weights: RewardWeights = DEFAULT_WEIGHTS,
	        mode: RewardMode = "binary",
	    ) -> None:
	        """Store the scoring configuration applied by every ``verify`` call.

	        Args:
	            weights: Weights handed to ``composite_reward``.
	            mode: Reward mode handed to ``composite_reward``.
	        """
	        self._mode: RewardMode = mode
	        self._weights: RewardWeights = weights

	    def verify(self, code: str, task: Task, benchmark: str) -> VerifyResult:
	        """Run ``code`` against the task's tests in the sandbox and score it.

	        Args:
	            code: Source text of the candidate solution.
	            task: The task whose tests the candidate is checked against.
	            benchmark: Benchmark identifier forwarded to ``to_sandbox_inputs``.

	        Returns:
	            A ``VerifyResult`` whose ``passed`` flag comes from the run and
	            whose ``reward`` comes from ``composite_reward``.
	        """
	        # Turn the task + candidate into the (solution, tests) pair the
	        # sandbox runner expects.
	        solution_src, test_src = to_sandbox_inputs(task, code, benchmark)
	        outcome = run_code(solution_src, test_src)

	        # The reward is computed from the candidate's own text (`code`),
	        # together with the run outcome.
	        score = composite_reward(
	            code,
	            outcome,
	            weights=self._weights,
	            mode=self._mode,
	        )
	        return VerifyResult(passed=outcome.passed, reward=score, raw=outcome)


	class SubprocessVerifier:
	    """Docker-free ``Verifier`` that runs the tests in a pytest subprocess.

	    Intended for CHTC and other hosts without Docker, where the evaluated
	    code is not adversarial (a stock model on public benchmarks). It skips
	    container startup and is therefore faster than ``SandboxVerifier``,
	    but it is also less isolated, so DO NOT use it for Week 4 GRPO
	    training.
	    """

	    def __init__(
	        self,
	        *,
	        weights: RewardWeights = DEFAULT_WEIGHTS,
	        mode: RewardMode = "binary",
	        timeout_s: float = 10.0,
	    ) -> None:
	        """Store the scoring configuration and the per-run timeout.

	        Args:
	            weights: Weights handed to ``composite_reward``.
	            mode: Reward mode handed to ``composite_reward``.
	            timeout_s: Timeout, in seconds, forwarded to
	                ``run_code_subprocess`` on every ``verify`` call.
	        """
	        self._timeout_s: float = timeout_s
	        self._mode: RewardMode = mode
	        self._weights: RewardWeights = weights

	    def verify(self, code: str, task: Task, benchmark: str) -> VerifyResult:
	        """Run ``code`` against the task's tests in a subprocess and score it.

	        Args:
	            code: Source text of the candidate solution.
	            task: The task whose tests the candidate is checked against.
	            benchmark: Benchmark identifier forwarded to ``to_sandbox_inputs``.

	        Returns:
	            A ``VerifyResult`` whose ``passed`` flag comes from the run and
	            whose ``reward`` comes from ``composite_reward``.
	        """
	        solution_src, test_src = to_sandbox_inputs(task, code, benchmark)
	        outcome = run_code_subprocess(
	            solution_src,
	            test_src,
	            timeout_s=self._timeout_s,
	        )
	        score = composite_reward(
	            code,
	            outcome,
	            weights=self._weights,
	            mode=self._mode,
	        )
	        return VerifyResult(passed=outcome.passed, reward=score, raw=outcome)


	class NoOpVerifier:
	    """Placeholder ``Verifier`` that never runs anything and scores zero.

	    For flows where scoring happens elsewhere: e.g. the Coordinator runs on
	    a host without our Docker sandbox (such as CHTC execute nodes) and the
	    final scoring is done with `evalplus` after the loop. In that setup the
	    Coordinator's retry decisions come from the Reviewer alone, and
	    SolveResult.final_verify is informational only.
	    """

	    def verify(self, code: str, task: Task, benchmark: str) -> VerifyResult:
	        """Return a fixed failing, zero-reward result without executing code.

	        All three arguments are accepted only to satisfy the ``Verifier``
	        interface and are ignored.
	        """
	        del code, task, benchmark  # intentionally unused

	        # An empty run: nothing executed, no tests counted, no output.
	        empty_run = RunResult(
	            passed=False,
	            num_tests_passed=0,
	            num_tests_total=0,
	            runtime_ms=0,
	            stdout="",
	            stderr="",
	            timed_out=False,
	            oom=False,
	            error=None,
	        )
	        return VerifyResult(passed=False, reward=0.0, raw=empty_run)