Spaces:

trioskosmos
/

LovecaSim

Sleeping

App Files Files Community

trioskosmos commited on Feb 3

Commit

77b2fc5

verified ·

1 Parent(s): fa71ce1

Upload ai/training/train_vectorized.py with huggingface_hub

Browse files

Files changed (1) hide show

ai/training/train_vectorized.py +585 -0

ai/training/train_vectorized.py ADDED Viewed

	@@ -0,0 +1,585 @@

+import os
+import sys
+import time
+# Immediate feedback
+print(" [Init] Python process started. Loading libraries...")
+print(" [Init] Loading Pytorch...", end="", flush=True)
+import torch
+import torch as th
+import torch.nn.functional as F
+print(" Done.")
+print(" [Init] Loading Gymnasium & SB3...", end="", flush=True)
+import glob
+import warnings
+import numpy as np
+from gymnasium import spaces
+from sb3_contrib import MaskablePPO
+from stable_baselines3.common.callbacks import BaseCallback, CheckpointCallback
+from stable_baselines3.common.utils import explained_variance
+from tqdm import tqdm
+print(" Done.")
+# Filter Numba warning
+warnings.filterwarnings("ignore", category=RuntimeWarning, message="nopython is set for njit")
+# Ensure project root is in path
+sys.path.append(os.getcwd())
+print(" [Init] Loading LovecaSim Vector Engine...", end="", flush=True)
+from ai.environments.vec_env_adapter import VectorEnvAdapter
+from ai.utils.loveca_features_extractor import LovecaFeaturesExtractor
+print(" Done.")
+class TimeCheckpointCallback(BaseCallback):
+    """
+    Save the model every N minutes.
+    """
+    def __init__(self, save_freq_minutes: float, save_path: str, name_prefix: str, verbose: int = 0):
+        super().__init__(verbose)
+        self.save_freq_seconds = save_freq_minutes * 60
+        self.save_path = save_path
+        self.name_prefix = name_prefix
+        self.last_time_save = time.time()
+    def _on_step(self) -> bool:
+        if (time.time() - self.last_time_save) > self.save_freq_seconds:
+            save_path = os.path.join(self.save_path, f"{self.name_prefix}_time_auto")
+            self.model.save(save_path)
+            if self.verbose > 0:
+                print(f" [Save] Model auto-saved after 3 minutes to {save_path}")
+            self.last_time_save = time.time()
+        return True
+class ModelSnapshotCallback(BaseCallback):
+    """
+    Saves a 'Model Snapshot' every X minutes:
+    - model.zip
+    - verified_card_pool.json (Context)
+    - snapshot_meta.json (Architecture/Config)
+    """
+    def __init__(self, save_freq_minutes: float, save_path: str, verbose=0):
+        super().__init__(verbose)
+        self.save_freq_minutes = save_freq_minutes
+        self.save_path = save_path
+        self.last_save_time = time.time()
+        # Ensure historiccheckpoints exists
+        os.makedirs("historiccheckpoints", exist_ok=True)
+    def _on_step(self) -> bool:
+        if time.time() - self.last_save_time > self.save_freq_minutes * 60:
+            self.last_save_time = time.time()
+            self._save_snapshot()
+        return True
+    def _save_snapshot(self):
+        timestamp = time.strftime("%Y%m%d_%H%M%S")
+        steps = self.num_timesteps
+        snapshot_name = f"{timestamp}_{steps}_steps"
+        snapshot_dir = os.path.join("historiccheckpoints", snapshot_name)
+        if self.verbose > 0:
+            print(f" [Snapshot] Saving to {snapshot_dir}...")
+        os.makedirs(snapshot_dir, exist_ok=True)
+        # 1. Save Model
+        model_path = os.path.join(snapshot_dir, "model.zip")
+        self.model.save(model_path)
+        # 2. Save Card Pool (Context)
+        try:
+            import shutil
+            shutil.copy("verified_card_pool.json", os.path.join(snapshot_dir, "verified_card_pool.json"))
+        except Exception as e:
+            print(f" [Snapshot] Warning: Could not copy card pool: {e}")
+        # 3. Save Metadata (Architecture)
+        meta = {
+            "timestamp": timestamp,
+            "timesteps": int(steps),
+            "obs_dim": int(self.model.observation_space.shape[0]),
+            "action_space_size": int(self.model.action_space.n),
+            "features": ["GlobalVolumes", "LiveZone", "Traits", "TurnNumber"],
+            "notes": "Generated by ModelSnapshotCallback",
+        }
+        try:
+            import json
+            with open(os.path.join(snapshot_dir, "snapshot_meta.json"), "w") as f:
+                json.dump(meta, f, indent=2)
+        except Exception as e:
+            print(f" [Snapshot] Warning: Could not save meta: {e}")
+        # 4. Limit to Last 5 Snapshots
+        self._prune_snapshots()
+    def _prune_snapshots(self):
+        root = os.path.dirname(self.save_path)  # wait, save_path is "historiccheckpoints"?
+        # save_path passed in init is "historiccheckpoints" relative to cwd? Yes.
+        # But wait, self.save_path in init is used.
+        # Let's verify self.save_path from init
+        # It is "historiccheckpoints"
+        search_dir = self.save_path
+        if not os.path.exists(search_dir):
+            return
+        # Get list of directories
+        try:
+            subdirs = [
+                os.path.join(search_dir, d)
+                for d in os.listdir(search_dir)
+                if os.path.isdir(os.path.join(search_dir, d))
+            ]
+            # Sort by creation time (oldest first)
+            subdirs.sort(key=os.path.getctime)
+            # Keep last 5
+            max_keep = 5
+            if len(subdirs) > max_keep:
+                to_remove = subdirs[:-max_keep]
+                import shutil
+                for d in to_remove:
+                    try:
+                        shutil.rmtree(d)
+                        if self.verbose > 0:
+                            print(f" [Snapshot] Pruned old snapshot: {d}")
+                    except Exception as e:
+                        print(f" [Snapshot] Warning: Failed to prune {d}: {e}")
+        except Exception as e:
+            print(f" [Snapshot] Warning: Pruning failed: {e}")
+class DetailedStatusCallback(BaseCallback):
+    """
+    Logs detailed phase information (Collection vs Optimization) and VRAM usage.
+    """
+    def __init__(self, verbose=0):
+        super().__init__(verbose)
+        self.collection_start_time = 0.0
+    def _on_rollout_start(self) -> None:
+        """
+        A rollout is the collection of environment steps.
+        """
+        self.collection_start_time = time.time()
+        print(f"\n [Phase] Starting Rollout Collection (Steps: {self.model.n_steps})...")
+    def _on_rollout_end(self) -> None:
+        """
+        This event is triggered before updating the policy.
+        """
+        duration = time.time() - self.collection_start_time
+        n_envs = self.model.n_envs
+        n_steps = self.model.n_steps
+        total_steps = n_envs * n_steps
+        fps = total_steps / duration if duration > 0 else 0
+        print(f" [Phase] Collection Complete. Duration: {duration:.2f}s ({fps:.0f} FPS)")
+        # PPO optimization is about to start
+        print(f" [Phase] Starting PPO Optimization (Epochs: {self.model.n_epochs}, Batch: {self.model.batch_size})...")
+        if torch.cuda.is_available():
+            allocated = torch.cuda.memory_allocated() / 1024**3
+            reserved = torch.cuda.memory_reserved() / 1024**3
+            print(f" [VRAM] Allocated: {allocated:.2f} GB | Reserved: {reserved:.2f} GB")
+            print(" [Info] Optimization may take time if batch size is large. Please wait...")
+    def _on_step(self) -> bool:
+        return True
+class TrainingStatsCallback(BaseCallback):
+    """
+    Simple stats logging for Vectorized Training.
+    """
+    def __init__(self, verbose=0):
+        super().__init__(verbose)
+    def _on_step(self) -> bool:
+        # Log win rate if available in infos
+        infos = self.locals.get("infos")
+        if infos:
+            # VectorEnv doesn't emit 'win_rate' in infos by default unless we add it
+            # But we can look for 'episode' keys
+            episodes = [i.get("episode") for i in infos if "episode" in i]
+            if episodes:
+                rew = np.mean([ep["r"] for ep in episodes])
+                length = np.mean([ep["l"] for ep in episodes])
+                self.logger.record("rollout/ep_rew_mean", rew)
+                self.logger.record("rollout/ep_len_mean", length)
+        return True
+class ProgressMaskablePPO(MaskablePPO):
+    """
+    MaskablePPO with a tqdm progress bar during the optimization phase.
+    """
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.optimization_pbar = None
+    def train(self) -> None:
+        """
+        Update policy using the currently gathered rollout buffer.
+        """
+        # Switch to train mode (this affects batch norm / dropout)
+        self.policy.set_training_mode(True)
+        # Update optimizer learning rate
+        self._update_learning_rate(self.policy.optimizer)
+        # Compute current clip range
+        clip_range = self.clip_range(self._current_progress_remaining)  # type: ignore[operator]
+        # Optional: clip range for the value function
+        if self.clip_range_vf is not None:
+            clip_range_vf = self.clip_range_vf(self._current_progress_remaining)  # type: ignore[operator]
+        entropy_losses = []
+        pg_losses, value_losses = [], []
+        clip_fractions = []
+        continue_training = True
+        # train for n_epochs epochs
+        # ADDED: Persistent TQDM Progress Bar
+        total_steps = self.n_epochs * (self.rollout_buffer.buffer_size // self.batch_size)
+        if self.optimization_pbar is None:
+            self.optimization_pbar = tqdm(total=total_steps, desc="Optimization", unit="batch", leave=True)
+        else:
+            self.optimization_pbar.reset(total=total_steps)
+        for epoch in range(self.n_epochs):
+            approx_kl_divs = []
+            # Do a complete pass on the rollout buffer
+            for rollout_data in self.rollout_buffer.get(self.batch_size):
+                actions = rollout_data.actions
+                if isinstance(self.action_space, spaces.Discrete):
+                    # Convert discrete action from float to long
+                    actions = rollout_data.actions.long().flatten()
+                with th.cuda.amp.autocast(enabled=th.cuda.is_available()):
+                    values, log_prob, entropy = self.policy.evaluate_actions(
+                        rollout_data.observations,
+                        actions,
+                        action_masks=rollout_data.action_masks,
+                    )
+                    values = values.flatten()
+                    # Normalize advantage
+                    advantages = rollout_data.advantages
+                    if self.normalize_advantage:
+                        advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)
+                    # ratio between old and new policy, should be one at the first iteration
+                    ratio = th.exp(log_prob - rollout_data.old_log_prob)
+                    # clipped surrogate loss
+                    policy_loss_1 = advantages * ratio
+                    policy_loss_2 = advantages * th.clamp(ratio, 1 - clip_range, 1 + clip_range)
+                    policy_loss = -th.min(policy_loss_1, policy_loss_2).mean()
+                    # Logging
+                    pg_losses.append(policy_loss.item())
+                    clip_fraction = th.mean((th.abs(ratio - 1) > clip_range).float()).item()
+                    clip_fractions.append(clip_fraction)
+                    if self.clip_range_vf is None:
+                        # No clipping
+                        values_pred = values
+                    else:
+                        # Clip the different between old and new value
+                        # NOTE: this depends on the reward scaling
+                        values_pred = rollout_data.old_values + th.clamp(
+                            values - rollout_data.old_values, -clip_range_vf, clip_range_vf
+                        )
+                    # Value loss using the TD(gae_lambda) target
+                    value_loss = F.mse_loss(rollout_data.returns, values_pred)
+                    value_losses.append(value_loss.item())
+                    # Entropy loss favor exploration
+                    if entropy is None:
+                        # Approximate entropy when no analytical form
+                        entropy_loss = -th.mean(-log_prob)
+                    else:
+                        entropy_loss = -th.mean(entropy)
+                    entropy_losses.append(entropy_loss.item())
+                    loss = policy_loss + self.ent_coef * entropy_loss + self.vf_coef * value_loss
+                # Calculate approximate form of reverse KL Divergence for early stopping
+                # see issue #417: https://github.com/DLR-RM/stable-baselines3/issues/417
+                # and discussion in PR #419: https://github.com/DLR-RM/stable-baselines3/pull/419
+                # and Schulman blog: http://joschu.net/blog/kl-approx.html
+                with th.no_grad():
+                    log_ratio = log_prob - rollout_data.old_log_prob
+                    approx_kl_div = th.mean((th.exp(log_ratio) - 1) - log_ratio).cpu().numpy()
+                    approx_kl_divs.append(approx_kl_div)
+                if self.target_kl is not None and approx_kl_div > 1.5 * self.target_kl:
+                    continue_training = False
+                    if self.verbose >= 1:
+                        print(f"Early stopping at step {epoch} due to reaching max kl: {approx_kl_div:.2f}")
+                    break
+                # Optimization step
+                self.policy.optimizer.zero_grad()
+                # AMP: Automatic Mixed Precision
+                # Check if scaler exists (backward compatibility)
+                if not hasattr(self, "scaler"):
+                    self.scaler = th.cuda.amp.GradScaler(enabled=th.cuda.is_available())
+                # Backward pass
+                self.scaler.scale(loss).backward()
+                # Clip grad norm
+                self.scaler.unscale_(self.policy.optimizer)
+                th.nn.utils.clip_grad_norm_(self.policy.parameters(), self.max_grad_norm)
+                # Optimizer step
+                self.scaler.step(self.policy.optimizer)
+                self.scaler.update()
+                # Update Progress Bar
+                self.optimization_pbar.update(1)
+            if not continue_training:
+                break
+        # Don't close, just leave it for the next reset
+        self._n_updates += self.n_epochs
+        explained_var = explained_variance(self.rollout_buffer.values.flatten(), self.rollout_buffer.returns.flatten())
+        # Logs
+        self.logger.record("train/entropy_loss", np.mean(entropy_losses))
+        self.logger.record("train/policy_gradient_loss", np.mean(pg_losses))
+        self.logger.record("train/value_loss", np.mean(value_losses))
+        self.logger.record("train/approx_kl", np.mean(approx_kl_divs))
+        self.logger.record("train/clip_fraction", np.mean(clip_fractions))
+        self.logger.record("train/loss", loss.item())
+        self.logger.record("train/explained_variance", explained_var)
+        self.logger.record("train/n_updates", self._n_updates, exclude="tensorboard")
+        self.logger.record("train/clip_range", clip_range)
+        if self.clip_range_vf is not None:
+            self.logger.record("train/clip_range_vf", clip_range_vf)
+    def _excluded_save_params(self) -> list[str]:
+        """
+        Returns the names of the parameters that should be excluded from being saved.
+        """
+        return super()._excluded_save_params() + ["optimization_pbar"]
+def main():
+    print("========================================================")
+    print(" LovecaSim - STARTING VECTORIZED TRAINING (700k+ SPS)   ")
+    print("========================================================")
+    # Configuration from Environment Variables
+    TOTAL_TIMESTEPS = int(os.getenv("TRAIN_STEPS", "100_000_000"))
+    BATCH_SIZE = int(os.getenv("TRAIN_BATCH_SIZE", "8192"))
+    NUM_ENVS = int(os.getenv("TRAIN_ENVS", "4096"))
+    N_STEPS = int(os.getenv("TRAIN_N_STEPS", "256"))
+    # Advanced Hyperparameters
+    ENT_COEF = float(os.getenv("ENT_COEF", "0.01"))
+    GAMMA = float(os.getenv("GAMMA", "0.99"))
+    GAE_LAMBDA = float(os.getenv("GAE_LAMBDA", "0.95"))
+    SAVE_PATH = "./checkpoints/vector/"
+    os.makedirs(SAVE_PATH, exist_ok=True)
+    # Log Hardware/Threading Config
+    omp_threads = os.getenv("OMP_NUM_THREADS", "Unset (All Cores)")
+    print(f" [Config] Batch Size: {BATCH_SIZE}")
+    print(f" [Config] Num Envs:   {NUM_ENVS}")
+    print(f" [Config] N Steps:    {N_STEPS}")
+    print(f" [Config] CPU Cores:  {omp_threads}")
+    # 1. Create Vector Environment (Numba)
+    print(f" [Init] Creating {NUM_ENVS} parallel Numba environments...")
+    env = VectorEnvAdapter(num_envs=NUM_ENVS)
+    # --- WARMUP / COMPILATION ---
+    print(" [Init] Compiling Numba functions (Reset)... This may take 30s+")
+    env.reset()
+    print(" [Init] Compiling Numba functions (Step)... This may take 60s+")
+    # Perform a dummy step to force compilation of the massive step kernel
+    dummy_actions = np.zeros(NUM_ENVS, dtype=np.int32)
+    env.step(dummy_actions)
+    print(" [Init] Compilation complete! Starting training...")
+    # ----------------------------
+    # 2. Setup or Load PPO Agent
+    checkpoint_path = os.getenv("LOAD_CHECKPOINT", "")
+    # Auto-resolve "LATEST" or "AUTO"
+    force_restart = os.getenv("RESTART_TRAINING", "FALSE").upper() == "TRUE"
+    if force_restart:
+        print(" [Config] RESTART_TRAINING=TRUE. Ignoring checkpoints.")
+        checkpoint_path = ""
+    elif checkpoint_path.upper() in ["LATEST", "AUTO"]:
+        list_of_files = glob.glob(os.path.join(SAVE_PATH, "*.zip"))
+        if list_of_files:
+            checkpoint_path = max(list_of_files, key=os.path.getctime)
+            print(f" [Config] LOAD_CHECKPOINT='{os.getenv('LOAD_CHECKPOINT')}' -> Auto-resolved to: {checkpoint_path}")
+        else:
+            print(" [Config] LOAD_CHECKPOINT='LATEST' but no checkpoints found. Starting fresh.")
+            checkpoint_path = ""
+    model = None
+    if checkpoint_path and os.path.exists(checkpoint_path):
+        print(f" [Load] Scanning checkpoint: {checkpoint_path}")
+        try:
+            # Check dimensions before full load if possible, or load and check
+            temp_model = ProgressMaskablePPO.load(checkpoint_path, device="cpu")
+            model_obs_dim = temp_model.observation_space.shape[0]
+            env_obs_dim = env.observation_space.shape[0]
+            if model_obs_dim != env_obs_dim:
+                print(f" [Load] Dimension Mismatch! Model: {model_obs_dim}, Env: {env_obs_dim}")
+                print(f" [Load] Cannot resume training across eras. Starting FRESH {env_obs_dim}-dim model.")
+                model = None
+            else:
+                print(f" [Load] Dimensions match ({model_obs_dim}). Resuming training...")
+                model = ProgressMaskablePPO.load(
+                    checkpoint_path,
+                    env=env,
+                    device="cuda" if torch.cuda.is_available() else "cpu",
+                    custom_objects={
+                        "learning_rate": float(os.getenv("LEARNING_RATE", "3e-4")),
+                        "batch_size": BATCH_SIZE,
+                        "n_epochs": int(os.getenv("NUM_EPOCHS", "4")),
+                    },
+                )
+                reset_num_timesteps = False
+                print(" [Load] Success.")
+        except Exception as e:
+            print(f" [Error] Failed to load checkpoint: {e}")
+            print(" [Init] Falling back to fresh model...")
+            model = None
+    if model is None:
+        if checkpoint_path and not os.path.exists(checkpoint_path):
+            print(f" [Warning] Checkpoint file not found: {checkpoint_path}")
+        print(" [Init] Creating fresh ProgressMaskablePPO model...")
+        # Determine Policy Args
+        obs_mode_env = os.getenv("OBS_MODE", "STANDARD")
+        if obs_mode_env == "ATTENTION":
+            print(" [Init] Using LovecaFeaturesExtractor (Attention)")
+            policy_kwargs = dict(
+                features_extractor_class=LovecaFeaturesExtractor,
+                features_extractor_kwargs=dict(features_dim=256),
+                net_arch=[],
+            )
+        else:
+            policy_kwargs = dict(net_arch=[512, 512])
+        model = ProgressMaskablePPO(
+            "MlpPolicy",
+            env,
+            verbose=1,
+            learning_rate=float(os.getenv("LEARNING_RATE", "3e-4")),
+            n_steps=N_STEPS,
+            batch_size=BATCH_SIZE,
+            n_epochs=int(os.getenv("NUM_EPOCHS", "4")),
+            gamma=GAMMA,
+            gae_lambda=GAE_LAMBDA,
+            ent_coef=ENT_COEF,
+            tensorboard_log="./logs/vector_tensorboard/",
+            policy_kwargs=policy_kwargs,
+        )
+        reset_num_timesteps = True
+    print(f" [Init] PPO Model initialized. Device: {model.device}")
+    # 3. Callbacks
+    # Refactored: Callbacks moved to module level.
+    # Standard Checkpoint (Keep for compatibility/safety)
+    checkpoint_callback = CheckpointCallback(
+        save_freq=max(1, 1000000 // NUM_ENVS), save_path=SAVE_PATH, name_prefix="numba_ppo"
+    )
+    save_freq = float(os.getenv("SAVE_FREQ_MINS", "15.0"))
+    # Snapshot Callback (Replaces TimeCheckpointCallback)
+    snapshot_callback = ModelSnapshotCallback(
+        save_freq_minutes=save_freq,
+        save_path="historiccheckpoints",
+        verbose=1,
+    )
+    # Store OBS_MODE in snapshot meta
+    # (We need to update ModelSnapshotCallback logic or just trust env stores it?
+    #  Ideally pass it to callback or update meta generation.
+    #  Let's keep it simple: Environment tracks it.)
+    # 4. Train
+    print(" [Train] Starting training loop...")
+    print(f" [Train] Model Mode:   {os.getenv('OBS_MODE', 'STANDARD')}")
+    print(f" [Train] Reset Timesteps: {reset_num_timesteps}")
+    print(" [Note] Press Ctrl+C to stop and force-save.")
+    # Generate a timestamped run name for TensorBoard
+    run_name = f"ProgressPPO_{time.strftime('%m%d_%H%M%S')}"
+    if not reset_num_timesteps:
+        run_name += "_RESUME"
+    try:
+        model.learn(
+            total_timesteps=TOTAL_TIMESTEPS,
+            callback=[
+                checkpoint_callback,
+                snapshot_callback,
+                TrainingStatsCallback(),
+                DetailedStatusCallback(),
+            ],  # Use Snapshot + DetailedStatus!
+            progress_bar=True,
+            reset_num_timesteps=reset_num_timesteps,
+            tb_log_name=run_name,
+        )
+        print(" [Train] Training finished.")
+        model.save(f"{SAVE_PATH}/final_model")
+    except KeyboardInterrupt:
+        print("\n [Train] Interrupted by user. Saving model...")
+        model.save(f"{SAVE_PATH}/interrupted_model")
+        # Trigger explicit snapshot on interrupt
+        snapshot_callback._save_snapshot()
+        print(" [Train] Model saved.")
+    except Exception as e:
+        print(f"\n [Error] Training crashed: {e}")
+        import traceback
+        traceback.print_exc()
+        emergency_save = os.path.join(SAVE_PATH, "crash_emergency")
+        model.save(emergency_save)
+        print(f" [Save] Crash emergency checkpoint saved to: {emergency_save}")
+    finally:
+        print(" [Done] Exiting gracefully.")
+        env.close()
+if __name__ == "__main__":
+    main()