DGX_AI / codeforge /grader.py
vasiuuu's picture
Initial commit for CodeForge GRPO training
acf77ab
from __future__ import annotations

import math
# Weights used by compute_reward to blend the sandbox and grounding signals
# into a single quality value; the two weights sum to 1.0.
_SANDBOX_WEIGHT = 0.6
_GROUNDING_WEIGHT = 0.4
# Upper bound on the squared (confidence - quality) calibration penalty.
# Because reward = quality * (1 - penalty), the penalty can remove at most
# this fraction of the quality.
_BRIER_CAP = 0.5
# The uncertainty floor applies only when a reported confidence is below the
# first threshold AND the blended quality is below the second.
_UNCERTAIN_CONFIDENCE_THRESHOLD = 0.3
_UNCERTAIN_QUALITY_THRESHOLD = 0.5
# Minimum reward granted when both uncertainty conditions above hold.
_UNCERTAIN_FLOOR = 0.50
def compute_reward(
    *,
    sandbox_score: float,
    groundedness: float,
    confidence: float | None = None,
) -> float:
    """Compute the final reward for a submit action.

    The reward is assembled in three steps:

    quality   = weighted combination of sandbox and grounding signals
    brier     = calibration penalty (overconfidence on bad code is punished)
    uncertain = floor reward for honest uncertainty (below all task targets)

    Args:
        sandbox_score: Score from the sandbox signal.
        groundedness: Score from the grounding signal.
        confidence: Self-reported confidence, or None when not reported.
            None and NaN are both scored as a confidence of 0.5 and never
            qualify for the uncertainty floor.

    Returns:
        The reward clamped to [0.0, 1.0] and rounded to three decimals.
        Returns 0.0 when either score is NaN or infinite.
    """
    # A non-finite score is a broken grading signal, not evidence of quality.
    # Reject it up front: a NaN reward would otherwise leave the final
    # min()/max() clamp as 1.0, because every comparison with NaN is False.
    if not (math.isfinite(sandbox_score) and math.isfinite(groundedness)):
        return 0.0

    quality = _SANDBOX_WEIGHT * sandbox_score + _GROUNDING_WEIGHT * groundedness

    # Brier calibration: a missing confidence is scored as 0.5 (mediocre
    # calibration) so agents cannot bypass Brier entirely by omitting it.
    # A NaN confidence is handled the same way, for the same reason.
    confidence_reported = confidence is not None and not math.isnan(confidence)
    effective_confidence = confidence if confidence_reported else 0.5

    try:
        squared_error = (effective_confidence - quality) ** 2
    except OverflowError:
        # Float exponentiation raises instead of returning inf when the gap
        # is huge; the penalty is capped below, so inf is a safe stand-in.
        squared_error = math.inf
    brier_penalty = min(squared_error, _BRIER_CAP)
    reward = quality * (1.0 - brier_penalty)

    # Honest uncertainty: a low reported confidence on low-quality output is
    # lifted to the floor reward.
    if (
        confidence_reported
        and effective_confidence < _UNCERTAIN_CONFIDENCE_THRESHOLD
        and quality < _UNCERTAIN_QUALITY_THRESHOLD
    ):
        reward = max(reward, _UNCERTAIN_FLOOR)

    return round(max(0.0, min(1.0, reward)), 3)