def clamp_unit_interval(x: float) -> float:
    """Clamp *x* to the closed interval [0.0, 1.0] (Phase 1 / rubric).

    Args:
        x: Any value accepted by ``float()``.

    Returns:
        ``0.0`` for non-positive inputs and for NaN, ``1.0`` for inputs
        above one (including ``+inf``), otherwise ``float(x)`` unchanged.

    Raises:
        TypeError, ValueError: Propagated from ``float(x)`` when *x* cannot
            be converted.
    """
    value = float(x)
    # NaN fails every ordered comparison, so this single test covers both
    # non-positive inputs and NaN. A plain max(0.0, min(1.0, nan)) would
    # evaluate to 1.0 and turn an undefined score into a perfect one.
    if not value > 0.0:
        return 0.0
    return min(value, 1.0)


def compute_step_reward(action_type: str, is_stale: bool) -> float:
    """Return the per-step reward for taking *action_type* given staleness.

    Reward table (stale / not stale):

    * ``"invalidate"``:  1.0 / -0.5
    * ``"keep"``:       -0.6 /  0.8
    * ``"refresh"``:     0.6 /  0.2

    Args:
        action_type: Name of the action taken.
        is_stale: Truthy when the target of the action is stale.

    Returns:
        The reward as a float. Any unrecognized action yields ``0.0``.
    """
    if action_type == "invalidate":
        return 1.0 if is_stale else -0.5
    if action_type == "keep":
        return -0.6 if is_stale else 0.8
    if action_type == "refresh":
        return 0.6 if is_stale else 0.2
    # Unknown actions are neutral. Return a float (not int 0) so every path
    # has the same return type.
    return 0.0


def normalize_episode_score(total_reward, max_steps=10):
    """Scale an accumulated reward by the step budget and clamp to [0.0, 1.0].

    Args:
        total_reward: Sum of the rewards collected over the episode.
        max_steps: Step budget used as the divisor. Defaults to 10.

    Returns:
        ``total_reward / max_steps`` passed through ``clamp_unit_interval``.
        A non-positive ``max_steps`` yields ``0.0``.
    """
    # A zero budget would raise ZeroDivisionError and a negative one would
    # flip the sign of the score; neither describes a real episode, so score
    # it as 0.0, the same result evaluate_episode gives an empty history.
    if max_steps <= 0:
        return 0.0
    return clamp_unit_interval(total_reward / max_steps)


def evaluate_episode(history):
    """Grade a finished episode with a weighted score in [0.0, 1.0].

    ``history`` is a sized sequence of step records, each a mapping of the
    form ``{"action": str, "is_stale": bool}``.

    Three ratios are computed over the number of steps and combined as
    ``0.5 * freshness + 0.3 * efficiency + 0.2 * stability``:

    * freshness: share of steps with a correct decision, i.e. stale with
      ``"invalidate"`` or ``"refresh"``, or not stale with ``"keep"``.
    * efficiency: one minus the share of steps that used ``"invalidate"``
      while not stale.
    * stability: one minus the share of steps whose action differs from the
      previous step's action. The divisor is the step count, not the number
      of transitions, so an episode of ``n`` steps cannot score below
      ``1 / n`` here.

    Returns:
        The combined score passed through ``clamp_unit_interval``, or
        ``0.0`` for an empty history.

    Raises:
        KeyError: If a step record lacks ``"action"`` or ``"is_stale"``.
    """
    total_steps = len(history)
    if total_steps == 0:
        return 0.0

    correct_decisions = 0
    unnecessary_invalidations = 0
    oscillations = 0
    last_action = None

    for index, step in enumerate(history):
        action = step["action"]
        is_stale = step["is_stale"]

        if is_stale:
            if action in ("invalidate", "refresh"):
                correct_decisions += 1
        elif action == "keep":
            correct_decisions += 1
        elif action == "invalidate":
            unnecessary_invalidations += 1

        # Detect the first step by position rather than by the truthiness of
        # the previous action; otherwise a switch away from a falsy action
        # ("" / 0 / None) would never be counted as an oscillation.
        if index > 0 and last_action != action:
            oscillations += 1
        last_action = action

    freshness = correct_decisions / total_steps
    efficiency = 1 - (unnecessary_invalidations / total_steps)
    stability = 1 - (oscillations / total_steps)

    score = 0.5 * freshness + 0.3 * efficiency + 0.2 * stability
    return clamp_unit_interval(score)