cache-env / env /grader.py
Parv Pareek
done
e75c8ce
def clamp_unit_interval(x: float) -> float:
    """Clamp *x* to the closed interval [0.0, 1.0] (Phase 1 / rubric).

    Args:
        x: Any value accepted by ``float()``.

    Returns:
        ``float(x)`` limited to [0.0, 1.0]. NaN maps to 0.0.

    Raises:
        TypeError, ValueError: If ``float(x)`` cannot convert the input.
    """
    value = float(x)
    # NaN is the only float that is not equal to itself. Without this guard,
    # min(1.0, nan) keeps its first argument and an undefined score would be
    # reported as a perfect 1.0.
    if value != value:
        return 0.0
    return max(0.0, min(1.0, value))
def compute_step_reward(action_type, is_stale):
    """Return the reward for a single action given the staleness flag.

    Reward table (stale / not stale):
        "invalidate":  1.0 / -0.5
        "keep":       -0.6 /  0.8
        "refresh":     0.6 /  0.2

    Args:
        action_type: Action name; compared against the three names above.
        is_stale: Evaluated for truthiness to pick the column of the table.

    Returns:
        The matching reward, or ``0`` when ``action_type`` is not one of
        the three recognised names.
    """
    # Select the (stale, fresh) reward pair for the action first, then pick
    # one side of it based on the staleness flag.
    if action_type == "invalidate":
        when_stale, when_fresh = 1.0, -0.5
    elif action_type == "keep":
        when_stale, when_fresh = -0.6, 0.8
    elif action_type == "refresh":
        when_stale, when_fresh = 0.6, 0.2
    else:
        # Unrecognised actions are neutral.
        return 0
    return when_stale if is_stale else when_fresh
def normalize_episode_score(total_reward, max_steps=10):
    """Scale an episode's summed reward to a score in [0.0, 1.0].

    Args:
        total_reward: Sum of the rewards collected over the episode.
        max_steps: Number of steps to divide by. Defaults to 10.

    Returns:
        ``total_reward / max_steps`` clamped to [0.0, 1.0], or 0.0 when
        ``max_steps`` is zero or negative.
    """
    if max_steps <= 0:
        # A non-positive step count has no meaningful per-step average:
        # zero would raise ZeroDivisionError and a negative value would
        # flip the sign of the reward. Score it as 0.0, the same value an
        # empty history receives in evaluate_episode.
        return 0.0
    return clamp_unit_interval(total_reward / max_steps)
def evaluate_episode(history):
    """Score a completed episode in [0.0, 1.0] from its step history.

    The score is a weighted sum of three ratios, each taken over the
    number of steps in ``history``:

    * freshness (weight 0.5): share of steps with a correct decision,
      i.e. "invalidate" or "refresh" on a stale step, or "keep" on a
      non-stale step.
    * efficiency (weight 0.3): one minus the share of steps that chose
      "invalidate" on a non-stale step.
    * stability (weight 0.2): one minus the share of steps whose action
      differs from the action of the step before it.

    Args:
        history: Sized iterable of dicts shaped like
            ``{"action": str, "is_stale": bool}``.

    Returns:
        The weighted score clamped to [0.0, 1.0]; 0.0 for an empty
        history.

    Raises:
        KeyError: If a step lacks the "action" or "is_stale" key.
    """
    total_steps = len(history)
    if total_steps == 0:
        return clamp_unit_interval(0.0)

    correct_decisions = 0
    unnecessary_invalidations = 0
    oscillations = 0

    # A private sentinel marks "no previous step". A truthiness test would
    # confuse a falsy action value (e.g. "" or None) with the start of the
    # episode and miss the action change that follows it.
    no_previous = object()
    last_action = no_previous

    for step in history:
        action = step["action"]
        is_stale = step["is_stale"]

        if (is_stale and action in ("invalidate", "refresh")) or (
            not is_stale and action == "keep"
        ):
            correct_decisions += 1

        if action == "invalidate" and not is_stale:
            unnecessary_invalidations += 1

        if last_action is not no_previous and last_action != action:
            oscillations += 1
        last_action = action

    freshness = correct_decisions / total_steps
    efficiency = 1 - (unnecessary_invalidations / total_steps)
    stability = 1 - (oscillations / total_steps)

    score = 0.5 * freshness + 0.3 * efficiency + 0.2 * stability
    return clamp_unit_interval(score)