""" Episode Tier Lock — ensures both baseline and specialist output are scored through the SAME tier. Prevents the circular dependency bug from v3. """ from __future__ import annotations import random from enum import IntEnum from dataclasses import dataclass class RewardTier(IntEnum): TIER_0 = 0 # Free structural checks TIER_1 = 1 # Embedding similarity TIER_2 = 2 # Small LLM micro-judge (GPT-4o-mini) TIER_3 = 3 # Full LLM-as-judge (checkpoints only) def _load_tier_config() -> tuple[dict, dict]: """Load tier_map and tier2_sample_rates from training_config.yaml at import time.""" import yaml, os config_path = os.path.join( os.path.dirname(__file__), "..", "configs", "training_config.yaml" ) try: with open(config_path) as f: reward_cfg = yaml.safe_load(f).get("reward", {}) tier_map_raw = reward_cfg.get("tier_map", { "atomic": 0, "simple": 1, "moderate": 1, "complex": 2, "enterprise": 2, }) tier_map = {k: RewardTier(v) for k, v in tier_map_raw.items()} sample_rates = reward_cfg.get("tier2_sample_rates", { "moderate": 0.30, "complex": 1.00, "enterprise": 1.00, }) return tier_map, sample_rates except Exception: return ( {"atomic": RewardTier.TIER_0, "simple": RewardTier.TIER_1, "moderate": RewardTier.TIER_1, "complex": RewardTier.TIER_2, "enterprise": RewardTier.TIER_2}, {"moderate": 0.30, "complex": 1.00, "enterprise": 1.00}, ) TIER_MAP, TIER2_SAMPLE_RATES = _load_tier_config() @dataclass class EpisodeTierLock: """ Locked once at episode start. Both generalist and specialist outputs are scored through this exact tier. No drift. """ complexity_class: str locked_tier: RewardTier tier2_sample_rate: float @classmethod def for_task(cls, complexity_class: str) -> "EpisodeTierLock": tier = TIER_MAP.get(complexity_class, RewardTier.TIER_1) sample_rate = TIER2_SAMPLE_RATES.get(complexity_class, 0.0) if complexity_class == "moderate" and random.random() < sample_rate: tier = RewardTier.TIER_2 return cls( complexity_class=complexity_class, locked_tier=tier, tier2_sample_rate=sample_rate, )