from __future__ import annotations

import math

# Weights used to blend the two quality signals into a single score.
_SANDBOX_WEIGHT = 0.6
_GROUNDING_WEIGHT = 0.4

# Upper bound on the calibration penalty, so the multiplier applied to
# quality never drops below 1 - _BRIER_CAP.
_BRIER_CAP = 0.5

# Confidence assumed when the caller supplies none (or supplies NaN).
_DEFAULT_CONFIDENCE = 0.5

# The uncertainty floor applies only when BOTH the stated confidence and the
# computed quality are strictly below these thresholds.
_UNCERTAIN_CONFIDENCE_THRESHOLD = 0.3
_UNCERTAIN_QUALITY_THRESHOLD = 0.5
_UNCERTAIN_FLOOR = 0.50


def compute_reward(
    *,
    sandbox_score: float,
    groundedness: float,
    confidence: float | None = None,
) -> float:
    """Compute the final reward for a submit action.

    quality   = weighted combination of sandbox and grounding signals
    brier     = calibration penalty (overconfidence on bad code is punished)
    uncertain = floor reward for honest uncertainty

    Args:
        sandbox_score: Sandbox signal, weighted by ``_SANDBOX_WEIGHT``.
        groundedness: Grounding signal, weighted by ``_GROUNDING_WEIGHT``.
        confidence: Self-reported confidence. ``None`` (or NaN) is scored as
            ``_DEFAULT_CONFIDENCE`` and never qualifies for the uncertainty
            floor.

    Returns:
        The reward clamped to ``[0.0, 1.0]`` and rounded to 3 decimals.
        Returns ``0.0`` when the reward cannot be computed because an input
        is NaN.
    """
    # A NaN confidence carries no information; treat it exactly like an
    # omitted one so it cannot poison the arithmetic below.
    if confidence is not None and math.isnan(confidence):
        confidence = None

    quality = _SANDBOX_WEIGHT * sandbox_score + _GROUNDING_WEIGHT * groundedness

    # Brier calibration: a missing confidence is treated as 0.5 (mediocre
    # calibration) so agents cannot bypass Brier entirely by omitting it.
    effective_confidence = (
        confidence if confidence is not None else _DEFAULT_CONFIDENCE
    )
    brier_penalty = min((effective_confidence - quality) ** 2, _BRIER_CAP)
    reward = quality * (1.0 - brier_penalty)

    # Honest uncertainty: low stated confidence on low-quality work is lifted
    # to a fixed floor.
    if (
        confidence is not None
        and confidence < _UNCERTAIN_CONFIDENCE_THRESHOLD
        and quality < _UNCERTAIN_QUALITY_THRESHOLD
    ):
        reward = max(reward, _UNCERTAIN_FLOOR)

    # NaN must be rejected before clamping: min(1.0, nan) evaluates to 1.0,
    # which would silently turn an invalid score into the maximum reward.
    if math.isnan(reward):
        return 0.0

    return round(max(0.0, min(1.0, reward)), 3)