from __future__ import annotations import time from collections.abc import Callable from functools import lru_cache from .environment import V3DeliveryDispatchEnv from .models import V3TaskResult from .policies import baseline_policy, heuristic_policy from .solver import solve_exact STRICT_SCORE_EPSILON = 1e-4 BASELINE_SCORE_ANCHOR = 0.05 def grade_episode(task_id: str, seed: int, raw_reward: float) -> V3TaskResult: baseline_reward = cached_rollout_policy(task_id, seed, policy_name="baseline") heuristic_reward = cached_rollout_policy(task_id, seed, policy_name="heuristic") target_reward = cached_optimal_reward(task_id, seed) score = normalize_score(raw_reward, baseline_reward, target_reward) return V3TaskResult( task_id=task_id, raw_reward=raw_reward, baseline_reward=baseline_reward, target_reward=target_reward, score=score, heuristic_reward=heuristic_reward, ) def rollout_policy(task_id: str, seed: int, policy_name: str = "baseline") -> float: env = V3DeliveryDispatchEnv(default_task_id=task_id) observation = env.reset_internal(task_id=task_id, internal_seed=seed) policy = baseline_policy if policy_name == "baseline" else heuristic_policy while not env.done: result = env.step(policy(observation), grade_terminal=False) observation = result.observation return env.cumulative_reward @lru_cache(maxsize=512) def cached_rollout_policy(task_id: str, seed: int, policy_name: str = "baseline") -> float: return rollout_policy(task_id, seed, policy_name=policy_name) def optimal_reward( task_id: str, seed: int, progress_callback: Callable[[dict[str, object]], None] | None = None, ) -> float: env = V3DeliveryDispatchEnv(default_task_id=task_id) env.reset_internal(task_id=task_id, internal_seed=seed) reward, _ = solve_exact( recipe=env._require_recipe(), start_round=env.round_index, start_counts=env.courier_counts, progress_callback=progress_callback, ) return reward @lru_cache(maxsize=512) def cached_optimal_reward(task_id: str, seed: int) -> float: return optimal_reward(task_id, seed) def timed_optimal_reward(task_id: str, seed: int) -> tuple[float, float]: started_at = time.perf_counter() reward = optimal_reward(task_id, seed) runtime_ms = (time.perf_counter() - started_at) * 1000.0 return reward, runtime_ms def normalize_score(raw_reward: float, baseline_reward: float, target_reward: float) -> float: lower = STRICT_SCORE_EPSILON baseline_anchor = BASELINE_SCORE_ANCHOR upper = 1.0 - STRICT_SCORE_EPSILON if target_reward <= baseline_reward: return upper if raw_reward >= target_reward else baseline_anchor gap = target_reward - baseline_reward normalized = (raw_reward - baseline_reward) / gap if normalized >= 0.0: score = baseline_anchor + normalized * (upper - baseline_anchor) else: score = baseline_anchor + normalized * (baseline_anchor - lower) return max(lower, min(upper, score))