File size: 3,094 Bytes
0355a51
 
 
 
9850bda
0355a51
 
 
 
 
 
9850bda
4ca7a83
9850bda
0355a51
 
9850bda
 
 
0355a51
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9850bda
 
 
 
 
0355a51
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9850bda
 
 
 
 
0355a51
 
 
 
 
 
 
 
9850bda
4ca7a83
9850bda
0355a51
4ca7a83
 
 
 
 
 
 
 
9850bda
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
from __future__ import annotations

import time
from collections.abc import Callable
from functools import lru_cache

from .environment import V3DeliveryDispatchEnv
from .models import V3TaskResult
from .policies import baseline_policy, heuristic_policy
from .solver import solve_exact

STRICT_SCORE_EPSILON = 1e-4
BASELINE_SCORE_ANCHOR = 0.05


def grade_episode(task_id: str, seed: int, raw_reward: float) -> V3TaskResult:
    baseline_reward = cached_rollout_policy(task_id, seed, policy_name="baseline")
    heuristic_reward = cached_rollout_policy(task_id, seed, policy_name="heuristic")
    target_reward = cached_optimal_reward(task_id, seed)
    score = normalize_score(raw_reward, baseline_reward, target_reward)
    return V3TaskResult(
        task_id=task_id,
        raw_reward=raw_reward,
        baseline_reward=baseline_reward,
        target_reward=target_reward,
        score=score,
        heuristic_reward=heuristic_reward,
    )


def rollout_policy(task_id: str, seed: int, policy_name: str = "baseline") -> float:
    env = V3DeliveryDispatchEnv(default_task_id=task_id)
    observation = env.reset_internal(task_id=task_id, internal_seed=seed)
    policy = baseline_policy if policy_name == "baseline" else heuristic_policy
    while not env.done:
        result = env.step(policy(observation), grade_terminal=False)
        observation = result.observation
    return env.cumulative_reward


@lru_cache(maxsize=512)
def cached_rollout_policy(task_id: str, seed: int, policy_name: str = "baseline") -> float:
    return rollout_policy(task_id, seed, policy_name=policy_name)


def optimal_reward(
    task_id: str,
    seed: int,
    progress_callback: Callable[[dict[str, object]], None] | None = None,
) -> float:
    env = V3DeliveryDispatchEnv(default_task_id=task_id)
    env.reset_internal(task_id=task_id, internal_seed=seed)
    reward, _ = solve_exact(
        recipe=env._require_recipe(),
        start_round=env.round_index,
        start_counts=env.courier_counts,
        progress_callback=progress_callback,
    )
    return reward


@lru_cache(maxsize=512)
def cached_optimal_reward(task_id: str, seed: int) -> float:
    return optimal_reward(task_id, seed)


def timed_optimal_reward(task_id: str, seed: int) -> tuple[float, float]:
    started_at = time.perf_counter()
    reward = optimal_reward(task_id, seed)
    runtime_ms = (time.perf_counter() - started_at) * 1000.0
    return reward, runtime_ms


def normalize_score(raw_reward: float, baseline_reward: float, target_reward: float) -> float:
    lower = STRICT_SCORE_EPSILON
    baseline_anchor = BASELINE_SCORE_ANCHOR
    upper = 1.0 - STRICT_SCORE_EPSILON
    if target_reward <= baseline_reward:
        return upper if raw_reward >= target_reward else baseline_anchor

    gap = target_reward - baseline_reward
    normalized = (raw_reward - baseline_reward) / gap
    if normalized >= 0.0:
        score = baseline_anchor + normalized * (upper - baseline_anchor)
    else:
        score = baseline_anchor + normalized * (baseline_anchor - lower)
    return max(lower, min(upper, score))