def clamp_unit_interval(x: float) -> float:
    """Clamp *x* to the closed interval [0.0, 1.0] (Phase 1 / rubric).

    The input is converted with ``float()`` first, so ints and numeric
    strings are accepted. NaN maps to 0.0: an undefined score must not be
    rewarded (the ``max``/``min`` formulation would turn NaN into 1.0).

    Returns:
        A float in [0.0, 1.0].

    Raises:
        TypeError, ValueError: if ``float(x)`` fails.
    """
    value = float(x)
    # NaN fails every ordering comparison, so it skips both branches below
    # and lands on the 0.0 floor together with negative values.
    if value >= 1.0:
        return 1.0
    if value > 0.0:
        return value
    return 0.0


def compute_step_reward(action_type: str, is_stale: bool) -> float:
    """Return the reward for taking one action on an entry.

    Reward table (stale / not stale):
        "invalidate":  1.0 / -0.5
        "keep":       -0.6 /  0.8
        "refresh":     0.6 /  0.2

    Any other ``action_type`` yields 0.0. ``is_stale`` is evaluated for
    truthiness.
    """
    if action_type == "invalidate":
        return 1.0 if is_stale else -0.5
    if action_type == "keep":
        return -0.6 if is_stale else 0.8
    if action_type == "refresh":
        return 0.6 if is_stale else 0.2
    # Unrecognized actions are neutral; a float keeps the return type uniform.
    return 0.0


def normalize_episode_score(total_reward: float, max_steps: int = 10) -> float:
    """Scale an episode's total reward by ``max_steps`` and clamp to [0, 1].

    A non-positive ``max_steps`` gives 0.0 instead of dividing by zero (or
    flipping the sign of the score), mirroring how ``evaluate_episode``
    scores an empty history.
    """
    if max_steps <= 0:
        return clamp_unit_interval(0.0)
    return clamp_unit_interval(total_reward / max_steps)


def evaluate_episode(history) -> float:
    """Score an episode from its per-step history.

    ``history`` is a sized iterable of dicts, each shaped like::

        {"action": str, "is_stale": bool}

    Three ratios over the number of steps are combined:

    * freshness  -- share of correct steps: "invalidate" or "refresh" on a
      stale entry, or "keep" on a non-stale entry (weight 0.5);
    * efficiency -- 1 minus the share of "invalidate" steps taken on
      non-stale entries (weight 0.3);
    * stability  -- 1 minus the share of steps whose action differs from the
      previous step's action (weight 0.2).

    Returns:
        The weighted sum clamped to [0.0, 1.0]; 0.0 for an empty history.

    Raises:
        KeyError: if a step lacks "action" or "is_stale".
    """
    total_steps = len(history)
    if total_steps == 0:
        return clamp_unit_interval(0.0)

    correct_decisions = 0
    unnecessary_invalidations = 0
    oscillations = 0
    previous_action = None

    for index, step in enumerate(history):
        action = step["action"]
        is_stale = step["is_stale"]

        if is_stale:
            if action in ("invalidate", "refresh"):
                correct_decisions += 1
        elif action == "keep":
            correct_decisions += 1
        elif action == "invalidate":
            unnecessary_invalidations += 1

        # Use the step index, not the truthiness of the previous action, to
        # detect "no previous step": a falsy action value (e.g. "") must
        # still count as a change when the next action differs.
        if index > 0 and action != previous_action:
            oscillations += 1
        previous_action = action

    freshness = correct_decisions / total_steps
    efficiency = 1 - (unnecessary_invalidations / total_steps)
    stability = 1 - (oscillations / total_steps)

    score = 0.5 * freshness + 0.3 * efficiency + 0.2 * stability
    return clamp_unit_interval(score)