# Cloud Arena Training — Mathematical Model (MaskablePPO)
# Extracted from cloud_arena_final.py (Cell 3)
import math
import os
import sys

import numpy as np
import torch
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize, sync_envs_normalization
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.monitor import Monitor
from sb3_contrib import MaskablePPO
from sb3_contrib.common.wrappers import ActionMasker
from sb3_contrib.common.maskable.callbacks import MaskableEvalCallback

from cloud_arena.environment import (
    CloudArenaEnv,
    get_action_masks,
    GLOBAL_SEED,
    N_ACTION_TYPES,
    MAX_RESOURCES,
    N_ACTIONS,
    MAX_STEPS,
)

# Default training budget; also the default denominator for the progress display.
TOTAL_TIMESTEPS = 500_000


def cosine_lr(progress_remaining: float, init_lr: float = 3e-4, min_lr: float = 5e-5) -> float:
    """Return a cosine-annealed learning rate.

    Args:
        progress_remaining: Fraction of training left; 1.0 yields ``init_lr``
            and 0.0 yields ``min_lr``.
        init_lr: Learning rate at the start of the schedule.
        min_lr: Learning rate at the end of the schedule.

    Returns:
        The learning rate for the given progress value.
    """
    progress_done = 1.0 - progress_remaining
    cosine_factor = 0.5 * (1.0 + math.cos(math.pi * progress_done))
    return min_lr + (init_lr - min_lr) * cosine_factor


class SafeMaskableEvalCallback(MaskableEvalCallback):
    """Eval callback that copies normalization stats to the eval env on every step."""

    def _on_step(self) -> bool:
        # NOTE(review): this sync runs on every callback step, not only when an
        # evaluation is due; the parent callback appears to perform its own sync
        # before evaluating, so this may be redundant work — confirm before
        # removing.
        if self.model.get_vec_normalize_env() is not None:
            sync_envs_normalization(self.training_env, self.eval_env)
        return super()._on_step()


class CloudArenaCallback(BaseCallback):
    """Track per-episode statistics and drive curriculum promotion.

    The callback keeps an exponential moving average (EMA) of the ``win`` flag
    reported in the episode info dict.  Once at least ``MIN_EPS_PER_PHASE``
    episodes have finished in the current phase and the EMA reaches that
    phase's threshold, the level is incremented and written to
    ``curriculum_ref[0]``.

    Args:
        curriculum_ref: Mutable sequence whose element 0 receives the current
            curriculum level on promotion.
        verbose: Verbosity forwarded to ``BaseCallback``.
        total_timesteps: Training budget used as the denominator of the
            progress percentage.  Defaults to the module-level
            ``TOTAL_TIMESTEPS`` for backward compatibility.
    """

    EMA_ALPHA = 0.02
    MIN_EPS_PER_PHASE = 800
    PHASE_THRESHOLDS = {0: 0.65, 1: 0.62, 2: 0.58, 3: 0.55, 4: 0.52}
    # Highest level the curriculum can be promoted to.
    MAX_LEVEL = 5
    PROGRESS_EVERY = 500

    def __init__(self, curriculum_ref, verbose=0, total_timesteps=TOTAL_TIMESTEPS):
        super().__init__(verbose)
        self._curriculum_ref = curriculum_ref
        self._total_timesteps = total_timesteps
        # Index of the last PROGRESS_EVERY-sized bucket that was printed.
        self._progress_bucket = 0
        self.ema_win_rate = 0.0
        self.current_level = 0
        self._phase_eps = 0
        self.episode_rewards = []
        self.episode_wins = []
        self.episode_savings = []
        self.episode_security = []
        self.episode_veto_rates = []
        # (num_timesteps, level) pairs recorded at start and on each promotion.
        self.curriculum_log = [(0, 0)]
        self.action_freq = np.zeros(N_ACTION_TYPES)

    def _on_step(self) -> bool:
        """Update progress output, action histogram and episode statistics.

        Returns:
            Always ``True`` so training continues.
        """
        # Print whenever a PROGRESS_EVERY boundary has been crossed.  With a
        # single env this fires at exactly the same steps as a modulo test, but
        # it does not miss boundaries when num_timesteps advances by more
        # than 1 per callback step.
        bucket = self.num_timesteps // self.PROGRESS_EVERY
        if bucket > self._progress_bucket:
            self._progress_bucket = bucket
            self._print_progress()

        actions = self.locals.get("actions")
        if actions is not None:
            # Flatten so each entry is a single scalar action index.
            for action in np.asarray(actions).reshape(-1):
                action_type = int(action) // MAX_RESOURCES
                if action_type < N_ACTION_TYPES:
                    self.action_freq[action_type] += 1

        dones = self.locals.get("dones")
        if dones is None:
            return True

        infos = self.locals.get("infos")
        for env_idx, done in enumerate(dones):
            if not done:
                continue
            info = infos[env_idx] if infos is not None else {}
            self._on_episode_end(info, env_idx)
        return True

    def _on_episode_end(self, info, env_idx=0):
        """Record the finished episode and promote the curriculum if eligible.

        Args:
            info: Info dict of the env whose episode just ended.
            env_idx: Index of that env in the vectorized env; only used for
                the fallback reward lookup.
        """
        outer_info = info
        if "final_info" in info:
            info = info["final_info"]

        win = int(info.get("win", 0))
        self.ema_win_rate = (1 - self.EMA_ALPHA) * self.ema_win_rate + self.EMA_ALPHA * win

        # Prefer the episode return recorded by the Monitor wrapper; the
        # "rewards" entry in locals is only the reward of the final step.
        episode_stats = info.get("episode") or outer_info.get("episode")
        if episode_stats is not None and "r" in episode_stats:
            episode_return = float(episode_stats["r"])
        else:
            step_rewards = self.locals.get("rewards")
            episode_return = float(step_rewards[env_idx]) if step_rewards is not None else 0.0

        self.episode_rewards.append(episode_return)
        self.episode_wins.append(win)
        self.episode_savings.append(info.get("savings_pct", 0))
        self.episode_security.append(info.get("security_score", 0))
        self.episode_veto_rates.append(info.get("veto_rate", 0))
        self._phase_eps += 1

        threshold = self.PHASE_THRESHOLDS.get(self.current_level, 0.50)
        can_promote = (
            self.current_level < self.MAX_LEVEL
            and self._phase_eps >= self.MIN_EPS_PER_PHASE
            and self.ema_win_rate >= threshold
        )
        if can_promote:
            self._try_promote()

    def _try_promote(self):
        """Advance one curriculum level and reset the per-phase counters."""
        self.current_level += 1
        self._curriculum_ref[0] = self.current_level
        self._phase_eps = 0
        self.ema_win_rate = 0.0
        self.curriculum_log.append((self.num_timesteps, self.current_level))
        # NOTE(review): the leading glyphs look like a mis-decoded UTF-8 symbol;
        # left unchanged because the intended character cannot be confirmed.
        print(f"\nāœ„ PROMOTED -> Phase {self.current_level}")

    def _print_progress(self):
        """Write a single-line, carriage-return-refreshed progress report."""
        # Guard against a zero/negative budget to avoid ZeroDivisionError.
        budget = max(1, self._total_timesteps)
        pct = min(100.0, self.num_timesteps / budget * 100)
        sys.stdout.write(f"\rProgress: {pct:.1f}% | Phase: {self.current_level} | EMA Win: {self.ema_win_rate*100:.1f}%")
        sys.stdout.flush()


def train_model(total_timesteps=TOTAL_TIMESTEPS, save_dir="./models"):
    """Train a MaskablePPO agent on CloudArenaEnv and save the results.

    Creates ``save_dir``, ``./logs/`` and ``./eval_logs/`` if missing, trains
    with the curriculum and evaluation callbacks, then writes the final model
    and the VecNormalize statistics into ``save_dir``.

    Args:
        total_timesteps: Number of environment steps to train for.
        save_dir: Directory for the best model, final model and VecNormalize
            statistics.

    Returns:
        Tuple ``(model, arena_cb, train_env)``.
    """
    os.makedirs(save_dir, exist_ok=True)
    os.makedirs("./logs/", exist_ok=True)
    os.makedirs("./eval_logs/", exist_ok=True)
    torch.manual_seed(GLOBAL_SEED)

    # Single-element lists act as mutable cells shared with the environments.
    curriculum_ref = [0]
    global_step_ref = [0]

    def make_env():
        env = CloudArenaEnv(curriculum_ref, global_step_ref)
        env = Monitor(env)
        return ActionMasker(env, get_action_masks)

    train_env = DummyVecEnv([make_env])
    train_env = VecNormalize(train_env, norm_obs=True, norm_reward=True, clip_obs=10.0)

    # NOTE(review): the eval env is built by the same factory, so it shares
    # curriculum_ref and global_step_ref with the training env — confirm that
    # evaluation episodes are not meant to have their own step counter.
    eval_env = DummyVecEnv([make_env])
    eval_env = VecNormalize(eval_env, norm_obs=True, norm_reward=False, training=False)
    eval_env.obs_rms = train_env.obs_rms

    model = MaskablePPO("MlpPolicy", train_env, learning_rate=cosine_lr, ent_coef=0.01, verbose=0)

    # Pass the actual budget so the progress percentage matches this run.
    arena_cb = CloudArenaCallback(curriculum_ref, total_timesteps=total_timesteps)
    eval_cb = SafeMaskableEvalCallback(eval_env, best_model_save_path=save_dir, eval_freq=10000)

    print("Starting Pipeline...")
    model.learn(total_timesteps=total_timesteps, callback=[arena_cb, eval_cb])

    model.save(os.path.join(save_dir, "cloud_arena_final"))
    train_env.save(os.path.join(save_dir, "cloud_arena_vecnorm.pkl"))
    # NOTE(review): the leading glyphs look like a mis-decoded UTF-8 symbol;
    # left unchanged because the intended character cannot be confirmed.
    print("\nāœ… Model and VecNormalize stats saved.")
    return model, arena_cb, train_env