Spaces:
Sleeping
Sleeping
| """Primary reward signal — fraction of tests passed in the sandbox. | |
| Modes | |
| ----- | |
| - `binary`: 1.0 if every test passes, else 0.0. Matches pass@k semantics and | |
| what humans mean by "correct". Week 2 default. | |
| - `proportional`: `num_passed / num_total`. Denser learning signal for partial | |
| credit; compared head-to-head against binary in Week 5's ablation. | |
| Program failures (timeout, OOM, runner error) always return 0 regardless of | |
| mode — a program that didn't run is not partially correct. | |
| """ | |
| from __future__ import annotations | |
| from typing import Literal | |
| from ..sandbox.runner import RunResult | |
# Closed set of mode names accepted by `correctness_reward(mode=...)`.
# `Literal` is only checked by static type checkers, not at runtime.
RewardMode = Literal["binary", "proportional"]
def correctness_reward(
    result: RunResult,
    *,
    mode: RewardMode = "binary",
) -> float:
    """Map a sandbox `RunResult` to a reward in `[0, 1]`.

    Args:
        result: Output of `sandbox.runner.run_code()`. Only the attributes
            `timed_out`, `oom`, `error`, `num_tests_passed` and
            `num_tests_total` are read.
        mode: "binary" (default) or "proportional".

    Returns:
        0.0 for program failures (timeout, OOM, runner error) or an empty
        test set. Otherwise, in "binary" mode, 1.0 if every test passed and
        0.0 if not; in "proportional" mode,
        `num_tests_passed / num_tests_total`.

    Raises:
        ValueError: If `mode` is neither "binary" nor "proportional".
    """
    # Validate before any early return: a misspelled mode must fail on the
    # very first call instead of being silently treated as "proportional"
    # (and only on the calls that happen to reach the scoring branch).
    if mode not in ("binary", "proportional"):
        raise ValueError(
            f"unknown reward mode {mode!r}; expected 'binary' or 'proportional'"
        )

    # Program did not run to completion → no credit, regardless of mode.
    if result.timed_out or result.oom or result.error is not None:
        return 0.0

    # No tests means nothing was verified; also avoids dividing by zero below.
    if result.num_tests_total == 0:
        return 0.0

    if mode == "binary":
        return 1.0 if result.num_tests_passed == result.num_tests_total else 0.0

    return result.num_tests_passed / result.num_tests_total