File size: 1,287 Bytes
acf77ab
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
from __future__ import annotations

import math

# Relative weights of the two signals that are blended into `quality`.
_SANDBOX_WEIGHT = 0.6
_GROUNDING_WEIGHT = 0.4
# Upper bound on the calibration penalty: the squared confidence/quality gap
# is capped here, so the penalty can remove at most this fraction of quality.
_BRIER_CAP = 0.5
# The uncertainty floor applies only when an explicitly supplied confidence
# AND the computed quality are both strictly below these thresholds.
_UNCERTAIN_CONFIDENCE_THRESHOLD = 0.3
_UNCERTAIN_QUALITY_THRESHOLD = 0.5
_UNCERTAIN_FLOOR = 0.50
# Confidence assumed when the caller does not supply one.
_DEFAULT_CONFIDENCE = 0.5


def compute_reward(
    *,
    sandbox_score: float,
    groundedness: float,
    confidence: float | None = None,
) -> float:
    """Compute the final reward for a submit action.

    quality   = weighted combination of sandbox and grounding signals
    brier     = calibration penalty (overconfidence on bad code is punished)
    uncertain = floor reward for honest uncertainty

    Args:
        sandbox_score: Sandbox signal, weighted by ``_SANDBOX_WEIGHT``.
        groundedness: Grounding signal, weighted by ``_GROUNDING_WEIGHT``.
        confidence: Self-reported confidence. ``None`` is treated as
            ``_DEFAULT_CONFIDENCE`` for the calibration penalty, but never
            qualifies for the uncertainty floor.

    Returns:
        The reward clamped to ``[0.0, 1.0]`` and rounded to three decimals.
        If any input is NaN the reward is ``0.0``.
    """
    quality = _SANDBOX_WEIGHT * sandbox_score + _GROUNDING_WEIGHT * groundedness

    # Brier calibration: a missing confidence falls back to the default
    # (mediocre calibration) so agents cannot bypass the penalty entirely
    # by omitting confidence.
    effective_confidence = (
        _DEFAULT_CONFIDENCE if confidence is None else confidence
    )
    brier_penalty = min((effective_confidence - quality) ** 2, _BRIER_CAP)

    reward = quality * (1.0 - brier_penalty)

    # A NaN in any input propagates into `reward`. Every comparison with NaN
    # is false, so the final clamp would otherwise turn NaN into 1.0 (the
    # maximum reward). Treat an undefined reward as no reward instead.
    if math.isnan(reward):
        return 0.0

    # Honest uncertainty: explicitly low confidence on low-quality output
    # earns at least the floor.
    if (
        confidence is not None
        and confidence < _UNCERTAIN_CONFIDENCE_THRESHOLD
        and quality < _UNCERTAIN_QUALITY_THRESHOLD
    ):
        reward = max(reward, _UNCERTAIN_FLOOR)

    return round(max(0.0, min(1.0, reward)), 3)