"""Primary reward signal — fraction of tests passed in the sandbox.

Modes
-----
- `binary`: 1.0 if every test passes, else 0.0. Matches pass@k semantics and
  what humans mean by "correct". Week 2 default.
- `proportional`: `num_passed / num_total`. Denser learning signal for partial
  credit; compared head-to-head against binary in Week 5's ablation.

Program failures (timeout, OOM, runner error) always return 0.0 regardless of
mode — a program that didn't run is not partially correct. A run with an
empty test set (zero tests total) also returns 0.0 in both modes.
"""

from __future__ import annotations

from typing import Literal

from ..sandbox.runner import RunResult

# Names of the supported reward modes; used as the type of the `mode`
# argument of `correctness_reward`. See the module docstring for semantics.
RewardMode = Literal["binary", "proportional"]


def correctness_reward(
    result: RunResult,
    *,
    mode: RewardMode = "binary",
) -> float:
    """Map a sandbox `RunResult` to a reward in `[0, 1]`.

    Args:
        result: Output of `sandbox.runner.run_code()`.
        mode: "binary" (default) scores 1.0 only when every test passes,
            else 0.0; "proportional" scores
            `num_tests_passed / num_tests_total`.

    Returns:
        Reward in `[0, 1]`. Always 0.0 for program failures
        (timeout, OOM, runner error) or an empty test set.

    Raises:
        ValueError: If `mode` is not "binary" or "proportional".
    """
    # Reject unknown modes up front. Without this check a misspelled mode
    # (e.g. "binray") would silently fall through to proportional scoring
    # and quietly change the reward signal.
    if mode not in ("binary", "proportional"):
        raise ValueError(
            f"unknown reward mode {mode!r}; expected 'binary' or 'proportional'"
        )

    # Program did not run to completion → no credit, regardless of mode.
    if result.timed_out or result.oom or result.error is not None:
        return 0.0
    # No tests means nothing was verified; this also avoids dividing by zero.
    if result.num_tests_total == 0:
        return 0.0

    if mode == "binary":
        return 1.0 if result.num_tests_passed == result.num_tests_total else 0.0

    # mode == "proportional": fraction of tests passed.
    return result.num_tests_passed / result.num_tests_total