dmaheshwar22's picture
deploy: replace template with real demo
0dd7c80 verified
"""Primary reward signal — fraction of tests passed in the sandbox.
Modes
-----
- `binary`: 1.0 if every test passes, else 0.0. Matches pass@k semantics and
what humans mean by "correct". Week 2 default.
- `proportional`: `num_passed / num_total`. Denser learning signal for partial
credit; compared head-to-head against binary in Week 5's ablation.
Program failures (timeout, OOM, runner error) always return 0 regardless of
mode — a program that didn't run is not partially correct.
"""
from __future__ import annotations
from typing import Literal
from ..sandbox.runner import RunResult
# Closed set of scoring modes accepted by `correctness_reward`:
# "binary" is all-or-nothing, "proportional" is the fraction of tests passed.
RewardMode = Literal["binary", "proportional"]
def correctness_reward(
    result: RunResult,
    *,
    mode: RewardMode = "binary",
) -> float:
    """Map a sandbox `RunResult` to a reward in `[0, 1]`.

    Args:
        result: Output of `sandbox.runner.run_code()`. Only the fields
            `timed_out`, `oom`, `error`, `num_tests_passed` and
            `num_tests_total` are read.
        mode: "binary" (default) or "proportional".

    Returns:
        Reward in `[0, 1]` (assuming the runner reports
        `0 <= num_tests_passed <= num_tests_total`). Always 0.0 for program
        failures (timeout, OOM, runner error) or an empty test set.

    Raises:
        ValueError: If `mode` is neither "binary" nor "proportional".
    """
    # Validate before anything else: a misspelled mode must fail loudly on the
    # first call instead of silently being scored as "proportional" (or being
    # masked by the early 0.0 returns below while every program still fails).
    if mode not in ("binary", "proportional"):
        raise ValueError(
            f"unknown reward mode {mode!r}; expected 'binary' or 'proportional'"
        )

    # Program did not run to completion → no credit, regardless of mode.
    if result.timed_out or result.oom or result.error is not None:
        return 0.0

    # No tests means nothing was demonstrated; this also guards the division.
    if result.num_tests_total == 0:
        return 0.0

    if mode == "binary":
        return 1.0 if result.num_tests_passed == result.num_tests_total else 0.0

    # mode == "proportional": partial credit for partially passing programs.
    return result.num_tests_passed / result.num_tests_total