"""Shared limits and scoring helpers for explainer episodes.""" MAX_EXPLORE_STEPS = 6 MAX_REPAIR_STEPS = 3 AVAILABLE_TOOLS = ( "search_wikipedia", "search_hf_papers", "search_arxiv", "search_scholar", "fetch_docs", "search_hf_hub", ) MAX_EXPLORE_REWARD = 1.0 MAX_GENERATE_REWARD = 1.0 MAX_REPAIR_REWARD = 1.0 SUCCESS_SCORE_THRESHOLD = 0.3 def clamp_action_reward(value: float) -> float: """Clamp any single action reward to the required [0, 1] range.""" return min(max(value, 0.0), 1.0) def normalized_episode_score(total_reward: float) -> float: """Normalize an episode's accumulated reward to the required [0, 1] range. Repair is intentionally not added to the denominator: repair rewards are discounted so a failed generate + successful repair should not beat a clean first-pass generation. """ max_possible = MAX_EXPLORE_STEPS * MAX_EXPLORE_REWARD + MAX_GENERATE_REWARD score = total_reward / max_possible if max_possible > 0 else 0.0 return min(max(score, 0.0), 1.0)