Spaces:
Running
Running
File size: 3,399 Bytes
from __future__ import annotations
# Tuning constants for per-step reward shaping; read by
# compute_step_adjustments and _clamp_delta in this module.
# A raw score at or above this value earns MILESTONE_BONUS.
MILESTONE_HIGH_THRESHOLD = 0.8
# A raw score strictly below this value incurs MILESTONE_PENALTY.
MILESTONE_LOW_THRESHOLD = 0.2
# Amount added to the step reward when the high threshold is met.
MILESTONE_BONUS = 0.05
# Amount subtracted from the step reward when the score is under the low threshold.
MILESTONE_PENALTY = 0.05
# Multiplier applied to (clamped score - previous_average) before capping.
DELTA_REWARD_WEIGHT = 0.08
# Symmetric bound: the weighted delta is limited to [-DELTA_REWARD_CAP, +DELTA_REWARD_CAP].
DELTA_REWARD_CAP = 0.04
# Upper bound on the caller-supplied process_bonus (the lower bound is 0.0).
PROCESS_BONUS_CAP = 0.08
# Upper bound on the caller-supplied risk_penalty (the lower bound is 0.0).
RISK_PENALTY_CAP = 0.12
def _clamp_unit_interval(value: float) -> float:
    """Limit *value* to the closed interval [0.0, 1.0].

    Inputs above 1.0 come back as 1.0 and inputs below 0.0 come back as 0.0;
    anything already inside the interval keeps its value.
    """
    # Cap from above first, then from below (same order as a nested max/min).
    upper_bounded = min(1.0, value)
    return max(0.0, upper_bounded)
def clamp_open_unit_interval(value: float, epsilon: float = 0.0) -> float:
    """Clamp *value* to ``[epsilon, 1.0 - epsilon]``.

    With the default ``epsilon=0.0`` this is a plain clamp to the closed unit
    interval. A positive ``epsilon`` pulls both ends inward, so the result
    stays strictly inside (0, 1).

    Args:
        value: Number to clamp.
        epsilon: Margin kept clear at each end of the unit interval. Values
            outside ``[0.0, 0.5]`` are themselves limited to that range so the
            lower bound can never exceed the upper bound.

    Returns:
        ``value`` limited to ``[epsilon, 1.0 - epsilon]``.
    """
    # Keep the margin sane: a negative margin would widen the interval past
    # [0, 1], and one above 0.5 would invert the bounds.
    margin = max(0.0, min(0.5, epsilon))
    lower = margin
    upper = 1.0 - margin
    return max(lower, min(upper, value))
def compute_step_adjustments(
    score: float,
    *,
    previous_average: float = 0.0,
    process_bonus: float = 0.0,
    risk_penalty: float = 0.0,
) -> dict[str, float]:
    """Break a single step's reward into its individual components.

    Args:
        score: Raw score for the step; clamped to [0, 1] to form the base reward.
        previous_average: Reference value the clamped score is compared with
            to derive the delta adjustment.
        process_bonus: Requested bonus; limited to [0, PROCESS_BONUS_CAP].
        risk_penalty: Requested penalty; limited to [0, RISK_PENALTY_CAP].

    Returns:
        A dict with the keys ``base_reward``, ``milestone_adjustment``,
        ``delta_adjustment``, ``process_bonus``, ``risk_penalty`` and
        ``final_reward``. ``final_reward`` is the sum of the base reward and
        all adjustments (penalty subtracted), clamped to [0, 1].
    """
    clamped_score = _clamp_unit_interval(score)

    # The milestone checks look at the raw score, not the clamped one.
    milestone = 0.0
    if score >= MILESTONE_HIGH_THRESHOLD:
        milestone = MILESTONE_BONUS
    elif score < MILESTONE_LOW_THRESHOLD:
        milestone = -MILESTONE_PENALTY

    # Weighted change relative to the previous average, capped by _clamp_delta.
    change = clamped_score - previous_average
    delta = _clamp_delta(change * DELTA_REWARD_WEIGHT)

    bonus = max(0.0, min(PROCESS_BONUS_CAP, process_bonus))
    penalty = max(0.0, min(RISK_PENALTY_CAP, risk_penalty))

    # Accumulate strictly left to right so float rounding is unchanged.
    total = clamped_score
    total += milestone
    total += delta
    total += bonus
    total -= penalty

    return {
        "base_reward": clamped_score,
        "milestone_adjustment": milestone,
        "delta_adjustment": delta,
        "process_bonus": bonus,
        "risk_penalty": penalty,
        "final_reward": _clamp_unit_interval(total),
    }
def _clamp_delta(value: float) -> float:
    """Limit *value* to the range [-DELTA_REWARD_CAP, DELTA_REWARD_CAP]."""
    capped_above = min(DELTA_REWARD_CAP, value)
    return max(-DELTA_REWARD_CAP, capped_above)
def compute_step_reward(
    score: float,
    *,
    previous_average: float = 0.0,
    process_bonus: float = 0.0,
    risk_penalty: float = 0.0,
) -> float:
    """Return only the ``final_reward`` entry of ``compute_step_adjustments``.

    All arguments are forwarded unchanged to ``compute_step_adjustments``.
    """
    breakdown = compute_step_adjustments(
        score,
        previous_average=previous_average,
        process_bonus=process_bonus,
        risk_penalty=risk_penalty,
    )
    return breakdown["final_reward"]
def compute_trajectory_adjustments(
    per_ticket_scores: list[float],
    queue_size: int,
    steps_taken: int,
    *,
    completion_bonus: float = 0.0,
    consistency_bonus: float = 0.0,
) -> dict[str, float]:
    """Break a trajectory-level reward into its individual components.

    Args:
        per_ticket_scores: Scores to average; an empty list yields all zeros.
        queue_size: Accepted but not used by this function.
        steps_taken: Accepted but not used by this function.
        completion_bonus: Requested bonus; limited to [0, 0.08].
        consistency_bonus: Requested bonus; limited to [0, 0.05].

    Returns:
        A dict with the keys ``average_reward``, ``completion_bonus``,
        ``consistency_bonus`` and ``final_reward``. ``average_reward`` is the
        plain mean of the scores (not clamped); ``final_reward`` is the mean
        plus both bounded bonuses, clamped to [0, 1].
    """
    if per_ticket_scores:
        mean_score = sum(per_ticket_scores) / len(per_ticket_scores)
        completion = max(0.0, min(0.08, completion_bonus))
        consistency = max(0.0, min(0.05, consistency_bonus))
        total = _clamp_unit_interval(mean_score + completion + consistency)
    else:
        # Nothing was scored: every component, bonuses included, is zero.
        mean_score = completion = consistency = total = 0.0

    return {
        "average_reward": mean_score,
        "completion_bonus": completion,
        "consistency_bonus": consistency,
        "final_reward": total,
    }
def compute_trajectory_reward(
    per_ticket_scores: list[float],
    queue_size: int,
    steps_taken: int,
    *,
    completion_bonus: float = 0.0,
    consistency_bonus: float = 0.0,
) -> float:
    """Return only the ``final_reward`` entry of ``compute_trajectory_adjustments``.

    All arguments are forwarded unchanged to ``compute_trajectory_adjustments``.
    """
    breakdown = compute_trajectory_adjustments(
        per_ticket_scores,
        queue_size,
        steps_taken,
        completion_bonus=completion_bonus,
        consistency_bonus=consistency_bonus,
    )
    return breakdown["final_reward"]
|