from ..reward.trace_scorer import score_trace

# Small epsilon keeping rewards strictly inside (0.0, 1.0): the downstream
# validator rejects scores that sit exactly on either boundary.
_EPS = 1e-6


def compute_reward(test_score, trace_obj, code, steps_taken, max_steps,
                   prev_test_score=0.0, last_action_empty=False):
    """
    Compute the reward for one step of a code-fixing episode.

    The reward is a weighted sum of functional progress (test score, 90%)
    and reasoning-trace quality (10%), minus a small penalty when the test
    score regressed relative to the previous step.

    Args:
        test_score: Test execution score in [0.0, 1.0].
        trace_obj: TraceCollector object with action history (may be falsy,
            in which case the trace bonus is 0.0).
        code: Fixed code string for quality evaluation (currently unused;
            kept for interface compatibility).
        steps_taken: Number of steps taken (currently unused; kept for
            interface compatibility).
        max_steps: Maximum steps allowed (currently unused; kept for
            interface compatibility).
        prev_test_score: Previous test score, used for the regression penalty.
        last_action_empty: Whether the last action was empty/no-op.

    Returns:
        Reward score strictly inside the open interval (0.0, 1.0); values
        are clamped to [_EPS, 1.0 - _EPS] because the validator rejects
        boundary values.
    """
    # Empty/no-op actions receive the minimal legal reward: this discourages
    # stalling while still satisfying the validator's "strictly > 0.0" rule.
    if last_action_empty:
        return _EPS

    # 1. Functional progress (90% weight) — the primary learning signal.
    functional_reward = float(test_score)

    # 1b. Regression penalty: discourage making the tests worse. Penalize
    # proportionally to the size of the drop (10% of the negative delta).
    test_score_delta = test_score - prev_test_score
    regression_penalty = 0.1 * abs(test_score_delta) if test_score_delta < 0 else 0.0

    # 2. Reasoning quality (10% weight) — bonus for a good reasoning trace.
    # Defensively clamped to non-negative so the trace term can only help.
    trace_reward = max(0.0, score_trace(trace_obj) if trace_obj else 0.0)

    # Weighted sum — the coefficients sum to 1.0 before the penalty.
    reward = 0.9 * functional_reward + 0.1 * trace_reward - regression_penalty

    # Clamp to the open interval (0.0, 1.0) — validator rejects boundaries.
    return max(_EPS, min(1.0 - _EPS, reward))