Spaces:

dmaheshwar22
/

verifiable-rl-coder

Sleeping

deploy: replace template with real demo

0dd7c80 verified about 1 month ago

1.45 kB

	"""Primary reward signal — fraction of tests passed in the sandbox.

	Modes
	-----
	- `binary`: 1.0 if every test passes, else 0.0. Matches pass@k semantics and
	what humans mean by "correct". Week 2 default.
	- `proportional`: `num_passed / num_total`. Denser learning signal for partial
	credit; compared head-to-head against binary in Week 5's ablation.

	Program failures (timeout, OOM, runner error) always return 0 regardless of
	mode — a program that didn't run is not partially correct.
	"""

	from __future__ import annotations

	from typing import Literal

	from ..sandbox.runner import RunResult

	# Reward modes accepted by `correctness_reward`:
	# "binary" is all-or-nothing, "proportional" is the fraction of tests passed.
	RewardMode = Literal["binary", "proportional"]


	def correctness_reward(
	    result: RunResult,
	    *,
	    mode: RewardMode = "binary",
	) -> float:
	    """Map a sandbox `RunResult` to a reward in `[0, 1]`.

	    Args:
	        result: Output of `sandbox.runner.run_code()`. The fields read here
	            are `timed_out`, `oom`, `error`, `num_tests_passed` and
	            `num_tests_total`.
	        mode: "binary" (default) or "proportional".

	    Returns:
	        Reward in `[0, 1]`. Always 0.0 for program failures
	        (timeout, OOM, runner error) or an empty test set. Otherwise
	        1.0/0.0 for all-tests-passed in "binary" mode, or
	        `num_tests_passed / num_tests_total` in "proportional" mode.

	    Raises:
	        ValueError: If `mode` is not one of the supported modes.
	    """
	    # Reject unknown modes up front. Without this check a typo such as
	    # "Binary" would silently be scored as proportional, and validating
	    # before the failure checks means a bad config is caught even when
	    # every run fails.
	    if mode not in ("binary", "proportional"):
	        raise ValueError(
	            f"unknown reward mode {mode!r}; expected 'binary' or 'proportional'"
	        )

	    # Program did not run to completion → no credit, regardless of mode.
	    if result.timed_out or result.oom or result.error is not None:
	        return 0.0
	    # No tests means nothing was verified; also avoids dividing by zero below.
	    if result.num_tests_total == 0:
	        return 0.0

	    if mode == "binary":
	        return 1.0 if result.num_tests_passed == result.num_tests_total else 0.0

	    return result.num_tests_passed / result.num_tests_total