"""Primary reward signal — fraction of tests passed in the sandbox. Modes ----- - `binary`: 1.0 if every test passes, else 0.0. Matches pass@k semantics and what humans mean by "correct". Week 2 default. - `proportional`: `num_passed / num_total`. Denser learning signal for partial credit; compared head-to-head against binary in Week 5's ablation. Program failures (timeout, OOM, runner error) always return 0 regardless of mode — a program that didn't run is not partially correct. """ from __future__ import annotations from typing import Literal from ..sandbox.runner import RunResult RewardMode = Literal["binary", "proportional"] def correctness_reward( result: RunResult, *, mode: RewardMode = "binary", ) -> float: """Map a sandbox `RunResult` to a reward in `[0, 1]`. Args: result: Output of `sandbox.runner.run_code()`. mode: "binary" (default) or "proportional". Returns: Reward in `[0, 1]`. Always 0.0 for program failures (timeout, OOM, runner error) or an empty test set. """ # Program did not run to completion → no credit, regardless of mode. if result.timed_out or result.oom or result.error is not None: return 0.0 if result.num_tests_total == 0: return 0.0 if mode == "binary": return 1.0 if result.num_tests_passed == result.num_tests_total else 0.0 return result.num_tests_passed / result.num_tests_total