File size: 1,044 Bytes
43f41de
 
8fa7af1
 
43f41de
 
 
 
 
 
 
 
 
 
5869d56
43f41de
5869d56
43f41de
 
 
5869d56
 
 
 
 
43f41de
8fa7af1
 
 
 
 
 
43f41de
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
"""Shared limits and scoring helpers for explainer episodes."""

# Step limits. MAX_EXPLORE_STEPS also scales the denominator used by
# normalized_episode_score; MAX_REPAIR_STEPS is not referenced elsewhere in
# this module (presumably consumed by the episode loop -- confirm at caller).
MAX_EXPLORE_STEPS = 6
MAX_REPAIR_STEPS = 3

# Names of the tools known to this module. Presumably the set of tool calls
# an episode is allowed to issue -- verify against the code that dispatches
# them.
AVAILABLE_TOOLS = (
    "search_wikipedia",
    "search_hf_papers",
    "search_arxiv",
    "search_scholar",
    "fetch_docs",
    "search_hf_hub",
)

# Reward ceilings. MAX_EXPLORE_REWARD (multiplied by MAX_EXPLORE_STEPS) and
# MAX_GENERATE_REWARD make up the normalization denominator in
# normalized_episode_score; MAX_REPAIR_REWARD is intentionally left out of it.
MAX_EXPLORE_REWARD = 1.0
MAX_GENERATE_REWARD = 1.0
MAX_REPAIR_REWARD = 1.0
# Not used within this module; presumably the minimum normalized score for an
# episode to count as successful -- confirm against the caller.
SUCCESS_SCORE_THRESHOLD = 0.3


def clamp_action_reward(value: float) -> float:
    """Clamp any single action reward to the required [0, 1] range.

    Args:
        value: Raw reward for one action. May be any float, including
            infinities and NaN.

    Returns:
        ``0.0`` when *value* is NaN or not greater than zero, ``1.0`` when it
        exceeds one, and *value* itself otherwise.
    """
    # Every ordering comparison with NaN is False, so ``not value > 0.0``
    # catches NaN together with zero and negative rewards. A plain
    # ``min(max(value, 0.0), 1.0)`` would let NaN pass through unchanged.
    if not value > 0.0:
        return 0.0
    return min(value, 1.0)


def normalized_episode_score(total_reward: float) -> float:
    """Normalize an episode's accumulated reward to the required [0, 1] range.

    The reward is divided by the best achievable explore + generate total
    (``MAX_EXPLORE_STEPS * MAX_EXPLORE_REWARD + MAX_GENERATE_REWARD``) and the
    quotient is clamped.

    Repair is intentionally not added to the denominator: repair rewards are
    discounted so a failed generate + successful repair should not beat a clean
    first-pass generation.

    Args:
        total_reward: Sum of the rewards collected over the episode.

    Returns:
        A score in ``[0, 1]``. ``0.0`` is returned when the denominator is not
        positive or when the quotient is NaN or not greater than zero.
    """
    max_possible = MAX_EXPLORE_STEPS * MAX_EXPLORE_REWARD + MAX_GENERATE_REWARD
    # Guard against a misconfigured (zero or negative) denominator.
    if not max_possible > 0:
        return 0.0

    score = total_reward / max_possible
    # ``not score > 0.0`` is also True for NaN, which a bare
    # ``min(max(score, 0.0), 1.0)`` would return unchanged and thereby break
    # the [0, 1] contract.
    if not score > 0.0:
        return 0.0
    return min(score, 1.0)