File size: 3,399 Bytes
3752981
 
42dd095
 
 
 
8ada670
 
 
 
42dd095
3752981
8ada670
 
 
 
8241eb5
 
ff634dc
 
8ada670
 
 
 
 
 
 
 
 
42dd095
8ada670
 
 
 
 
3752981
8ada670
 
 
 
 
 
 
 
 
 
3752981
8ada670
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3752981
8ada670
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3752981
8ada670
 
 
 
 
 
3752981
8ada670
 
8241eb5
8ada670
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
from __future__ import annotations

# Tuning constants for per-step reward shaping; they are read by
# compute_step_adjustments and _clamp_delta in this module.
# A raw score at or above this value earns MILESTONE_BONUS.
MILESTONE_HIGH_THRESHOLD = 0.8
# A raw score strictly below this value incurs MILESTONE_PENALTY.
MILESTONE_LOW_THRESHOLD = 0.2
# Amount added to the step reward when the high threshold is met.
MILESTONE_BONUS = 0.05
# Magnitude subtracted from the step reward when below the low threshold.
MILESTONE_PENALTY = 0.05
# Scale applied to (clamped score - previous average) to form the delta term.
DELTA_REWARD_WEIGHT = 0.08
# The delta term is limited to [-DELTA_REWARD_CAP, +DELTA_REWARD_CAP].
DELTA_REWARD_CAP = 0.04
# Upper bound on the caller-supplied process bonus (its lower bound is 0.0).
PROCESS_BONUS_CAP = 0.08
# Upper bound on the caller-supplied risk penalty (its lower bound is 0.0).
RISK_PENALTY_CAP = 0.12


def _clamp_unit_interval(value: float) -> float:
    """Limit *value* to the closed range [0.0, 1.0].

    Args:
        value: Number to clamp.

    Returns:
        0.0 for inputs below the range, 1.0 for inputs above it, and the
        input itself otherwise.
    """
    # Apply the ceiling first, then the floor, using the same built-in
    # calls and argument order so tie-breaking is unchanged.
    capped_above = min(1.0, value)
    return max(0.0, capped_above)


def clamp_open_unit_interval(value: float, epsilon: float = 0.0) -> float:
    """Clamp *value* into ``[epsilon, 1 - epsilon]``.

    With the default ``epsilon=0.0`` the result lies in the closed unit
    interval ``[0.0, 1.0]``. A positive ``epsilon`` keeps the result away
    from both endpoints, i.e. strictly inside the open interval ``(0, 1)``.

    Args:
        value: Number to clamp.
        epsilon: Margin kept between the result and each endpoint. Values
            outside ``[0.0, 0.5]`` are pulled back into that range so the
            lower bound can never exceed the upper bound.

    Returns:
        The clamped value.
    """
    # Bound the margin itself: a negative margin would widen the interval
    # beyond [0, 1], and one above 0.5 would invert the bounds.
    margin = max(0.0, min(0.5, epsilon))
    return max(margin, min(1.0 - margin, value))


def compute_step_adjustments(
    score: float,
    *,
    previous_average: float = 0.0,
    process_bonus: float = 0.0,
    risk_penalty: float = 0.0,
) -> dict[str, float]:
    """Break a single step's reward into its named components.

    Args:
        score: Raw step score. It is clamped to [0.0, 1.0] to form the base
            reward; the milestone check compares the raw, unclamped value.
        previous_average: Baseline subtracted from the clamped score when
            computing the delta term.
        process_bonus: Requested bonus, limited to [0.0, PROCESS_BONUS_CAP].
        risk_penalty: Requested penalty, limited to [0.0, RISK_PENALTY_CAP].

    Returns:
        A dict with the keys ``base_reward``, ``milestone_adjustment``,
        ``delta_adjustment``, ``process_bonus``, ``risk_penalty`` and
        ``final_reward``. ``final_reward`` is the sum of the base reward and
        the adjustments (penalty subtracted), clamped to [0.0, 1.0].
    """
    clamped_score = _clamp_unit_interval(score)

    # Milestone term: bonus at or above the high threshold, penalty strictly
    # below the low threshold, nothing in between.
    milestone = 0.0
    if score >= MILESTONE_HIGH_THRESHOLD:
        milestone = MILESTONE_BONUS
    elif score < MILESTONE_LOW_THRESHOLD:
        milestone = -MILESTONE_PENALTY

    # Delta term: weighted change versus the running average, capped.
    improvement = clamped_score - previous_average
    delta = _clamp_delta(improvement * DELTA_REWARD_WEIGHT)

    bonus = max(0.0, min(PROCESS_BONUS_CAP, process_bonus))
    penalty = max(0.0, min(RISK_PENALTY_CAP, risk_penalty))

    # Terms are summed left to right in a fixed order so floating-point
    # results stay reproducible.
    shaped_total = clamped_score + milestone + delta + bonus - penalty

    breakdown: dict[str, float] = {
        "base_reward": clamped_score,
        "milestone_adjustment": milestone,
        "delta_adjustment": delta,
        "process_bonus": bonus,
        "risk_penalty": penalty,
    }
    breakdown["final_reward"] = _clamp_unit_interval(shaped_total)
    return breakdown


def _clamp_delta(value: float) -> float:
    """Limit *value* to [-DELTA_REWARD_CAP, +DELTA_REWARD_CAP].

    Args:
        value: Number to clamp.

    Returns:
        The input restricted to the symmetric cap around zero.
    """
    floor = -DELTA_REWARD_CAP
    capped_above = min(DELTA_REWARD_CAP, value)
    return max(floor, capped_above)


def compute_step_reward(
    score: float,
    *,
    previous_average: float = 0.0,
    process_bonus: float = 0.0,
    risk_penalty: float = 0.0,
) -> float:
    """Return only the final reward for a single step.

    Forwards every argument unchanged to ``compute_step_adjustments`` and
    extracts the ``"final_reward"`` entry from its result.

    Args:
        score: Raw step score.
        previous_average: Baseline used for the delta term.
        process_bonus: Requested process bonus.
        risk_penalty: Requested risk penalty.

    Returns:
        The ``"final_reward"`` value of the step breakdown.
    """
    breakdown = compute_step_adjustments(
        score,
        previous_average=previous_average,
        process_bonus=process_bonus,
        risk_penalty=risk_penalty,
    )
    return breakdown["final_reward"]


def compute_trajectory_adjustments(
    per_ticket_scores: list[float],
    queue_size: int,
    steps_taken: int,
    *,
    completion_bonus: float = 0.0,
    consistency_bonus: float = 0.0,
) -> dict[str, float]:
    """Break a whole trajectory's reward into its named components.

    Args:
        per_ticket_scores: Scores to average. When empty (or otherwise
            falsy) every component of the result is 0.0.
        queue_size: Accepted but not used in the computation.
        steps_taken: Accepted but not used in the computation.
        completion_bonus: Requested bonus, limited to [0.0, 0.08].
        consistency_bonus: Requested bonus, limited to [0.0, 0.05].

    Returns:
        A dict with the keys ``average_reward``, ``completion_bonus``,
        ``consistency_bonus`` and ``final_reward``. ``final_reward`` is the
        average plus both bounded bonuses, clamped to [0.0, 1.0]; the
        average itself is reported unclamped.
    """
    if per_ticket_scores:
        mean_score = sum(per_ticket_scores) / len(per_ticket_scores)
        completion = max(0.0, min(0.08, completion_bonus))
        consistency = max(0.0, min(0.05, consistency_bonus))
        total = _clamp_unit_interval(mean_score + completion + consistency)
    else:
        # Nothing was scored: report zeros and ignore any requested bonuses.
        mean_score = completion = consistency = total = 0.0

    return {
        "average_reward": mean_score,
        "completion_bonus": completion,
        "consistency_bonus": consistency,
        "final_reward": total,
    }


def compute_trajectory_reward(
    per_ticket_scores: list[float],
    queue_size: int,
    steps_taken: int,
    *,
    completion_bonus: float = 0.0,
    consistency_bonus: float = 0.0,
) -> float:
    """Return only the final reward for a whole trajectory.

    Forwards every argument unchanged to ``compute_trajectory_adjustments``
    and extracts the ``"final_reward"`` entry from its result.

    Args:
        per_ticket_scores: Scores to average.
        queue_size: Passed through to ``compute_trajectory_adjustments``.
        steps_taken: Passed through to ``compute_trajectory_adjustments``.
        completion_bonus: Requested completion bonus.
        consistency_bonus: Requested consistency bonus.

    Returns:
        The ``"final_reward"`` value of the trajectory breakdown.
    """
    breakdown = compute_trajectory_adjustments(
        per_ticket_scores,
        queue_size,
        steps_taken,
        completion_bonus=completion_bonus,
        consistency_bonus=consistency_bonus,
    )
    return breakdown["final_reward"]