Spaces:

trioskosmos
/

LovecaSim

Sleeping

App Files Files Community

trioskosmos committed 17 days ago

Commit

250ac83

verified ·

1 Parent(s): dc702ee

Upload ai/environments/vector_env_gpu.py with huggingface_hub

Browse files

Files changed (1) hide show

ai/environments/vector_env_gpu.py +891 -0

ai/environments/vector_env_gpu.py ADDED Viewed

	@@ -0,0 +1,891 @@

+"""
+GPU-Native Vectorized Game Environment.
+This module provides VectorEnvGPU - a GPU-resident implementation using CuPy
+and Numba CUDA for maximum throughput. All game state arrays live in GPU VRAM,
+eliminating PCI-E transfer overhead during RL training.
+Usage:
+    Set USE_GPU_ENV=1 to enable GPU environment in training.
+"""
+import json
+import os
+import time
+import numpy as np
+# CUDA detection
+HAS_CUDA = False
+try:
+    import cupy as cp
+    from numba import cuda
+    if cuda.is_available():
+        HAS_CUDA = True
+        from numba.cuda.random import create_xoroshiro128p_states
+except ImportError:
+    pass
+# Mock objects for CPU fallback
+if not HAS_CUDA:
+    class MockCP:
+        """NumPy-backed stand-in for the parts of the CuPy API used in this module."""
+        int32 = np.int32
+        int8 = np.int8
+        float32 = np.float32
+        bool_ = np.bool_
+        def full(self, shape, val, dtype=None):
+            return np.full(shape, val, dtype=dtype)
+        def zeros(self, shape, dtype=None):
+            return np.zeros(shape, dtype=dtype)
+        def ones(self, shape, dtype=None):
+            return np.ones(shape, dtype=dtype)
+        def asnumpy(self, arr):
+            return np.array(arr)
+        def array(self, arr, dtype=None):
+            return np.array(arr, dtype=dtype)
+        def asarray(self, arr, dtype=None):
+            return np.asarray(arr, dtype=dtype)
+        def arange(self, n, dtype=None):
+            return np.arange(n, dtype=dtype)
+        def get_default_memory_pool(self):
+            class MockPool:
+                """Memory pool that always reports zero usage."""
+                def used_bytes(self):
+                    return 0
+            return MockPool()
+    cp = MockCP()
+    class MockCudaMod:
+        """Host-side stand-in for the parts of ``numba.cuda`` used in this module."""
+        def to_device(self, arr):
+            return arr
+        def device_array(self, shape, dtype=None):
+            return np.zeros(shape, dtype=dtype)
+        def synchronize(self):
+            pass
+        def jit(self, *args, **kwargs):
+            # Bare form ``@cuda.jit``: the decorated function is the only
+            # argument, so hand it back untouched instead of replacing it
+            # with an identity lambda.
+            if len(args) == 1 and not kwargs and callable(args[0]):
+                return args[0]
+            # Parameterised form ``@cuda.jit(...)``: return a no-op decorator.
+            return lambda func: func
+        def grid(self, x):
+            return 0
+    cuda = MockCudaMod()
+    def create_xoroshiro128p_states(n, seed):
+        """Fallback RNG factory: there are no device RNG states without CUDA."""
+        return None
+class VectorEnvGPU:
+    """
+    GPU-Resident Vectorized Game Environment.
+    All state arrays are CuPy arrays in GPU VRAM.
+    Observations and actions are passed as GPU tensors with zero-copy.
+    Args:
+        num_envs: Number of parallel environments
+        opp_mode: Opponent mode (0=Heuristic, 1=Random, 2=Solitaire)
+        force_start_order: -1=Random, 0=P1, 1=P2
+        seed: Seed for the per-environment GPU RNG states
+    """
+    def __init__(self, num_envs: int = 4096, opp_mode: int = 0, force_start_order: int = -1, seed: int = 42):
+        """Allocate every per-environment buffer and load the static card tables.
+        Args:
+            num_envs: Number of parallel environments (leading axis of each buffer).
+            opp_mode: Opponent mode (0=Heuristic, 1=Random, 2=Solitaire).
+            force_start_order: -1=Random, 0=P1, 1=P2.
+            seed: Seed passed to ``create_xoroshiro128p_states`` when CUDA is available.
+        Environment variables read here: OBS_MODE, SCENARIO_REWARD_SCALE,
+        USE_SCENARIOS and the GAME_* values stored in ``self.game_config``.
+        """
+        self.num_envs = num_envs
+        self.opp_mode = opp_mode  # 0=Heuristic, 1=Random, 2=Solitaire
+        self.force_start_order = force_start_order
+        self.seed = seed
+        print(f" [VectorEnvGPU] Initializing {num_envs} environments. CUDA: {HAS_CUDA}")
+        def int_buffer(*trailing):
+            """Zeroed int32 buffer with one leading row per environment."""
+            return cp.zeros((num_envs, *trailing), dtype=cp.int32)
+        def unset_stage():
+            """Stage buffer with three slots per environment, all initialised to -1."""
+            return cp.full((num_envs, 3), -1, dtype=cp.int32)
+        # ---------------- Agent state ----------------
+        self.batch_stage = unset_stage()
+        self.batch_energy_vec = int_buffer(3, 32)
+        self.batch_energy_count = int_buffer(3)
+        self.batch_continuous_vec = int_buffer(32, 10)
+        self.batch_continuous_ptr = int_buffer()
+        self.batch_tapped = int_buffer(16)
+        self.batch_live = int_buffer(50)
+        self.batch_opp_tapped = int_buffer(16)
+        self.batch_scores = int_buffer()
+        self.batch_flat_ctx = int_buffer(64)
+        self.batch_global_ctx = int_buffer(128)
+        self.batch_hand = int_buffer(60)
+        self.batch_deck = int_buffer(60)
+        self.batch_trash = int_buffer(60)
+        self.batch_opp_history = int_buffer(6)
+        # ---------------- Opponent state ----------------
+        self.opp_stage = unset_stage()
+        self.opp_energy_vec = int_buffer(3, 32)
+        self.opp_energy_count = int_buffer(3)
+        # Unlike the agent-side tapped buffers, this one is int8.
+        self.opp_tapped = cp.zeros((num_envs, 16), dtype=cp.int8)
+        self.opp_live = int_buffer(50)
+        self.opp_scores = int_buffer()
+        self.opp_global_ctx = int_buffer(128)
+        self.opp_hand = int_buffer(60)
+        self.opp_deck = int_buffer(60)
+        self.opp_trash = int_buffer(60)
+        # ---------------- Tracking state ----------------
+        self.prev_scores = int_buffer()
+        self.prev_opp_scores = int_buffer()
+        self.prev_phases = int_buffer()
+        self.episode_returns = cp.zeros(num_envs, dtype=cp.float32)
+        self.episode_lengths = int_buffer()
+        # ---------------- Observation mode ----------------
+        # Any mode other than the three named ones uses the 2304-wide layout.
+        self.obs_mode = os.getenv("OBS_MODE", "STANDARD")
+        self.obs_dim = {"COMPRESSED": 512, "IMAX": 8192, "ATTENTION": 2240}.get(self.obs_mode, 2304)
+        print(f" [VectorEnvGPU] Observation Mode: {self.obs_mode} ({self.obs_dim}-dim)")
+        obs_shape = (num_envs, self.obs_dim)
+        self.batch_obs = cp.zeros(obs_shape, dtype=cp.float32)
+        self.terminal_obs_buffer = cp.zeros(obs_shape, dtype=cp.float32)
+        # Rewards, done flags and the scores captured at episode end
+        self.rewards = cp.zeros(num_envs, dtype=cp.float32)
+        self.dones = cp.zeros(num_envs, dtype=cp.bool_)
+        self.term_scores_agent = int_buffer()
+        self.term_scores_opp = int_buffer()
+        # ---------------- Game config ----------------
+        self.scenario_reward_scale = float(os.getenv("SCENARIO_REWARD_SCALE", "1.0"))
+        scenarios_enabled = os.getenv("USE_SCENARIOS", "0") == "1"
+        if scenarios_enabled and self.scenario_reward_scale != 1.0:
+            print(f" [VectorEnvGPU] Scenario Reward Scale: {self.scenario_reward_scale}")
+        # Slot i of game_config holds the i-th setting below; slots 6-9 stay 0.
+        config_settings = (
+            ("GAME_TURN_LIMIT", "100"),
+            ("GAME_STEP_LIMIT", "1000"),
+            ("GAME_REWARD_WIN", "100.0"),
+            ("GAME_REWARD_LOSE", "-100.0"),
+            ("GAME_REWARD_SCORE_SCALE", "50.0"),
+            ("GAME_REWARD_TURN_PENALTY", "-0.05"),
+        )
+        self.game_config = cp.zeros(10, dtype=cp.float32)
+        for slot, (env_name, fallback) in enumerate(config_settings):
+            self.game_config[slot] = float(os.getenv(env_name, fallback))
+        # ---------------- GPU RNG ----------------
+        self.rng_states = create_xoroshiro128p_states(num_envs, seed=seed) if HAS_CUDA else None
+        # ---------------- Kernel launch configuration ----------------
+        self.threads_per_block = 128
+        # Ceiling division: enough blocks to give every environment a thread.
+        self.blocks_per_grid = -(-num_envs // self.threads_per_block)
+        # ---------------- Static data ----------------
+        self._load_bytecode()
+        self._load_card_stats()
+        self._load_deck_pool()
+        # Memory stats
+        if HAS_CUDA:
+            used_mb = cp.get_default_memory_pool().used_bytes() / 1024 / 1024
+            print(f" [VectorEnvGPU] GPU VRAM used: {used_mb:.2f} MB")
+    def _load_bytecode(self):
+        """Load compiled ability bytecode and store the lookup tables on ``self``.
+        Reads ``data/cards_numba.json``, a mapping from ``"<card_id>_<ability_id>"``
+        keys to flat integer lists, each reshaped into rows of four values.
+        Sets:
+            self.bytecode_map: (entries + 1, 128, 4) int32 table; programs longer
+                than 128 rows are truncated. Slot 0 is never assigned.
+            self.bytecode_index: (2000, 8) int32 table mapping (card_id, ability_id)
+                to a slot of ``bytecode_map``; 0 where nothing was loaded.
+        A missing or malformed file is reported with a warning and leaves the
+        tables empty (or partially filled if the failure happens mid-load).
+        """
+        max_cards = 2000
+        max_abilities = 8
+        max_len = 128
+        # Defaults used when the file cannot be read at all.
+        host_map = np.zeros((100, max_len, 4), dtype=np.int32)
+        host_idx = np.zeros((max_cards, max_abilities), dtype=np.int32)
+        try:
+            # Explicit encoding, matching the other loaders in this class.
+            with open("data/cards_numba.json", "r", encoding="utf-8") as f:
+                raw_map = json.load(f)
+            host_map = np.zeros((len(raw_map) + 1, max_len, 4), dtype=np.int32)
+            host_idx = np.zeros((max_cards, max_abilities), dtype=np.int32)
+            next_slot = 1
+            for key, bc_list in raw_map.items():
+                cid, aid = map(int, key.split("_"))
+                # Negative ids are rejected as well: they would otherwise wrap
+                # around and overwrite entries at the end of the index table.
+                if not (0 <= cid < max_cards and 0 <= aid < max_abilities):
+                    continue
+                bc_arr = np.array(bc_list, dtype=np.int32).reshape(-1, 4)
+                length = min(bc_arr.shape[0], max_len)
+                host_map[next_slot, :length] = bc_arr[:length]
+                host_idx[cid, aid] = next_slot
+                next_slot += 1
+            # Report what was actually stored, not the raw number of JSON entries.
+            print(f" [VectorEnvGPU] Loaded {next_slot - 1} compiled abilities.")
+        except FileNotFoundError:
+            print(" [VectorEnvGPU] Warning: cards_numba.json not found.")
+        except Exception as e:
+            # Best effort: training can start without bytecode, so only warn.
+            print(f" [VectorEnvGPU] Warning: Failed to load bytecode: {e}")
+        self.bytecode_map = cp.asarray(host_map)
+        self.bytecode_index = cp.asarray(host_idx)
+    def _load_card_stats(self):
+        """Load per-card statistics from ``data/cards_compiled.json``.
+        Sets ``self.card_stats``, a (2000, 80) int32 table indexed by card id.
+        Columns written here:
+            0: cost, 1: blades, 2: sum of hearts (members only)
+            10: card type (1 = member, 2 = live)
+            11: bitmask of ``groups`` (bit ``group % 20``, members only)
+            12-18: first seven ``hearts`` (members) or ``required_hearts`` (lives)
+            38: score (lives only)
+        Any failure while reading or parsing is reported with a warning and
+        leaves whatever was filled so far (all zeros if nothing was).
+        """
+        max_cards = 2000
+        host_stats = np.zeros((max_cards, 80), dtype=np.int32)
+        try:
+            with open("data/cards_compiled.json", "r", encoding="utf-8") as f:
+                db = json.load(f)
+            count = 0
+            if "member_db" in db:
+                for cid_str, card in db["member_db"].items():
+                    cid = int(cid_str)
+                    # Negative ids would wrap around to the end of the table.
+                    if not 0 <= cid < max_cards:
+                        continue
+                    hearts = card.get("hearts", [])
+                    host_stats[cid, 0] = card.get("cost", 0)
+                    host_stats[cid, 1] = card.get("blades", 0)
+                    host_stats[cid, 2] = sum(hearts)
+                    host_stats[cid, 10] = 1  # Type: Member
+                    # Hearts breakdown
+                    for offset, value in enumerate(hearts[:7]):
+                        host_stats[cid, 12 + offset] = value
+                    # Traits
+                    mask = 0
+                    for g in card.get("groups", []):
+                        try:
+                            mask |= 1 << (int(g) % 20)
+                        except (TypeError, ValueError, OverflowError):
+                            # Skip group entries that are not integer-like; a
+                            # bare except would also hide KeyboardInterrupt.
+                            continue
+                    host_stats[cid, 11] = mask
+                    count += 1
+            if "live_db" in db:
+                for cid_str, card in db["live_db"].items():
+                    cid = int(cid_str)
+                    if not 0 <= cid < max_cards:
+                        continue
+                    host_stats[cid, 10] = 2  # Type: Live
+                    reqs = card.get("required_hearts", [])
+                    for offset, value in enumerate(reqs[:7]):
+                        host_stats[cid, 12 + offset] = value
+                    host_stats[cid, 38] = card.get("score", 0)
+                    count += 1
+            print(f" [VectorEnvGPU] Loaded stats for {count} cards.")
+        except Exception as e:
+            # Best effort: the environment still constructs with empty stats.
+            print(f" [VectorEnvGPU] Warning: Failed to load card stats: {e}")
+        self.card_stats = cp.asarray(host_stats)
+    def _load_deck_pool(self):
+        """Build the member / live card-id pools used for deck generation.
+        Card numbers listed in ``data/verified_card_pool.json`` are translated to
+        integer ids through the ``card_no`` fields of ``data/cards_compiled.json``.
+        The verified file may be a flat list (each number is matched against
+        members first, then lives) or a dict with ``verified_abilities`` /
+        ``members`` / ``verified_lives`` / ``lives`` lists and ``vanilla_*``
+        fallbacks. An empty pool, or any error, falls back to ``[1]`` for
+        members and ``[999]`` for lives.
+        Sets ``self.ability_member_ids`` and ``self.ability_live_ids`` (int32).
+        """
+        def index_by_card_no(section):
+            """Map each entry's ``card_no`` to its integer card id."""
+            lookup = {}
+            for raw_id, entry in section.items():
+                lookup[entry["card_no"]] = int(raw_id)
+            return lookup
+        def resolve(card_numbers, lookup):
+            """Translate the card numbers that are present in ``lookup``."""
+            return [lookup[number] for number in card_numbers if number in lookup]
+        members = []
+        lives = []
+        try:
+            with open("data/verified_card_pool.json", "r", encoding="utf-8") as f:
+                verified_data = json.load(f)
+            with open("data/cards_compiled.json", "r", encoding="utf-8") as f:
+                db_data = json.load(f)
+            member_no_map = index_by_card_no(db_data.get("member_db", {}))
+            live_no_map = index_by_card_no(db_data.get("live_db", {}))
+            if isinstance(verified_data, list):
+                # Flat list: a number known to both maps counts as a member.
+                for card_no in verified_data:
+                    if card_no in member_no_map:
+                        members.append(member_no_map[card_no])
+                    elif card_no in live_no_map:
+                        lives.append(live_no_map[card_no])
+            else:
+                listed_members = verified_data.get("verified_abilities", []) + verified_data.get("members", [])
+                members.extend(resolve(listed_members, member_no_map))
+                listed_lives = verified_data.get("verified_lives", []) + verified_data.get("lives", [])
+                lives.extend(resolve(listed_lives, live_no_map))
+                # Vanilla cards are consulted only when the primary lists gave nothing.
+                if not members:
+                    members.extend(resolve(verified_data.get("vanilla_members", []), member_no_map))
+                if not lives:
+                    lives.extend(resolve(verified_data.get("vanilla_lives", []), live_no_map))
+            members = members or [1]
+            lives = lives or [999]
+            print(f" [VectorEnvGPU] Deck Pool: {len(members)} members, {len(lives)} lives")
+        except Exception as e:
+            print(f" [VectorEnvGPU] Deck Load Error: {e}")
+            members = [1]
+            lives = [999]
+        self.ability_member_ids = cp.array(members, dtype=cp.int32)
+        self.ability_live_ids = cp.array(lives, dtype=cp.int32)
+    # =========================================================
+    # PYTORCH INTERFACE
+    # =========================================================
+    def get_observations_tensor(self):
+        """Hand ``self.batch_obs`` to PyTorch as a tensor on the ``cuda`` device.
+        ``torch`` is imported inside the method, so importing this module does
+        not require it.
+        """
+        from torch import as_tensor
+        return as_tensor(self.batch_obs, device="cuda")
+    def get_action_masks_tensor(self):
+        """Compute the action masks and wrap them as a tensor on the ``cuda`` device."""
+        from torch import as_tensor
+        return as_tensor(self.get_action_masks(), device="cuda")
+    def get_rewards_tensor(self):
+        """Hand ``self.rewards`` to PyTorch as a tensor on the ``cuda`` device."""
+        from torch import as_tensor
+        return as_tensor(self.rewards, device="cuda")
+    def get_dones_tensor(self):
+        """Hand ``self.dones`` to PyTorch as a tensor on the ``cuda`` device."""
+        from torch import as_tensor
+        return as_tensor(self.dones, device="cuda")
+    # =========================================================
+    # ENVIRONMENT INTERFACE
+    # =========================================================
+    def reset(self, indices=None):
+        """Reset every environment, or only those listed in ``indices``.
+        On the CUDA path this launches ``reset_kernel`` for the selected
+        environments, re-encodes the observations of all environments and
+        clears the score / episode tracking of the selected ones. Without CUDA
+        a handful of agent buffers are cleared for every environment and
+        ``indices`` is ignored.
+        Args:
+            indices: Optional sequence of environment indices; ``None`` selects all.
+        Returns:
+            ``self.batch_obs``, the shared observation buffer.
+        """
+        if not HAS_CUDA:
+            # CPU fallback
+            self.batch_stage.fill(-1)
+            self.batch_scores.fill(0)
+            self.batch_global_ctx.fill(0)
+            self.batch_hand.fill(0)
+            self.batch_deck.fill(0)
+            return self.batch_obs
+        if indices is None:
+            indices_gpu = cp.arange(self.num_envs, dtype=cp.int32)
+        else:
+            indices_gpu = cp.array(indices, dtype=cp.int32)
+        if indices_gpu.size == 0:
+            # Nothing selected: avoid launching a kernel with zero blocks.
+            return self.batch_obs
+        self._launch_reset_kernel(indices_gpu)
+        # Encode initial observations
+        self._encode_observations()
+        self._clear_tracking(None if indices is None else indices_gpu)
+        return self.batch_obs
+    def _launch_reset_kernel(self, indices_gpu):
+        """Run ``reset_kernel`` with one thread per entry of ``indices_gpu``."""
+        from ai.cuda_kernels import reset_kernel
+        blocks = (len(indices_gpu) + self.threads_per_block - 1) // self.threads_per_block
+        reset_kernel[blocks, self.threads_per_block](
+            indices_gpu,
+            self.batch_stage,
+            self.batch_energy_vec,
+            self.batch_energy_count,
+            self.batch_continuous_vec,
+            self.batch_continuous_ptr,
+            self.batch_tapped,
+            self.batch_live,
+            self.batch_scores,
+            self.batch_flat_ctx,
+            self.batch_global_ctx,
+            self.batch_hand,
+            self.batch_deck,
+            self.batch_trash,
+            self.batch_opp_history,
+            self.opp_stage,
+            self.opp_energy_vec,
+            self.opp_energy_count,
+            self.opp_tapped,
+            self.opp_live,
+            self.opp_scores,
+            self.opp_global_ctx,
+            self.opp_hand,
+            self.opp_deck,
+            self.opp_trash,
+            self.ability_member_ids,
+            self.ability_live_ids,
+            self.rng_states,
+            self.force_start_order,
+            self.batch_obs,
+            self.card_stats,
+        )
+    def _encode_observations(self):
+        """Encode the current state of all environments into ``self.batch_obs``."""
+        from ai.cuda_kernels import encode_observations_attention_kernel, encode_observations_kernel
+        turn_num = 1  # Dummy, kernels use ctx
+        # Leading arguments shared by both encoder kernels.
+        shared_args = (
+            self.num_envs,
+            self.batch_hand,
+            self.batch_stage,
+            self.batch_energy_count,
+            self.batch_tapped,
+            self.batch_scores,
+            self.opp_scores,
+            self.opp_stage,
+            self.opp_tapped,
+            self.card_stats,
+            self.batch_global_ctx,
+            self.batch_live,
+        )
+        if self.obs_mode == "ATTENTION":
+            # The attention encoder additionally takes opponent history / context.
+            encode_observations_attention_kernel[self.blocks_per_grid, self.threads_per_block](
+                *shared_args,
+                self.batch_opp_history,
+                self.opp_global_ctx,
+                turn_num,
+                self.batch_obs,
+            )
+        else:
+            encode_observations_kernel[self.blocks_per_grid, self.threads_per_block](
+                *shared_args,
+                turn_num,
+                self.batch_obs,
+            )
+    def _clear_tracking(self, indices_gpu=None):
+        """Zero the score / episode trackers for ``indices_gpu`` (all envs when None)."""
+        trackers = (self.prev_scores, self.prev_opp_scores, self.episode_returns, self.episode_lengths)
+        for tracker in trackers:
+            if indices_gpu is None:
+                tracker.fill(0)
+            else:
+                tracker[indices_gpu] = 0
+    def _actions_to_device(self, actions):
+        """Return ``actions`` in the form handed to ``step_kernel``.
+        Torch tensors and NumPy arrays become int32 CuPy arrays; anything else
+        is passed through unchanged.
+        """
+        import sys
+        # A Tensor argument implies the caller already imported torch, so torch
+        # is looked up rather than imported and CuPy-only callers do not need it.
+        torch = sys.modules.get("torch")
+        if torch is not None and isinstance(actions, torch.Tensor):
+            if actions.is_cuda:
+                # CuPy reads CUDA tensors through __cuda_array_interface__, which
+                # avoids a device -> host -> device round trip.
+                # NOTE(review): assumes torch and the kernels share the default
+                # CUDA stream - confirm if custom streams are in use.
+                return cp.ascontiguousarray(cp.asarray(actions.detach()), dtype=cp.int32)
+            return cp.asarray(actions.detach().cpu().numpy(), dtype=cp.int32)
+        if isinstance(actions, np.ndarray):
+            return cp.asarray(actions, dtype=cp.int32)
+        return actions
+    def step(self, actions):
+        """
+        Step all environments, auto-resetting the ones that finished.
+        Args:
+            actions: CuPy array, NumPy array or PyTorch tensor of actions
+        Returns:
+            obs, rewards, dones, infos. ``obs`` is ``self.batch_obs``; for an
+            environment that finished it already holds the post-reset
+            observation, and ``infos[i]`` carries ``terminal_observation``,
+            ``episode`` (``r`` / ``l``), ``terminal_score_agent`` and
+            ``terminal_score_opp``. Other entries of ``infos`` are empty dicts.
+        """
+        if not HAS_CUDA:
+            # Fallback: one fresh dict per environment ([{}] * n would make every
+            # entry the same object).
+            return self.batch_obs, self.rewards, self.dones, [{} for _ in range(self.num_envs)]
+        from ai.cuda_kernels import step_kernel
+        actions_gpu = self._actions_to_device(actions)
+        # 1. Step kernel
+        step_kernel[self.blocks_per_grid, self.threads_per_block](
+            self.num_envs,
+            actions_gpu,
+            self.batch_hand,
+            self.batch_deck,
+            self.batch_stage,
+            self.batch_energy_vec,
+            self.batch_energy_count,
+            self.batch_continuous_vec,
+            self.batch_continuous_ptr,
+            self.batch_tapped,
+            self.batch_live,
+            self.batch_scores,
+            self.batch_flat_ctx,
+            self.batch_global_ctx,
+            self.opp_hand,
+            self.opp_deck,
+            self.opp_stage,
+            self.opp_energy_vec,
+            self.opp_energy_count,
+            self.opp_tapped,
+            self.opp_live,
+            self.opp_scores,
+            self.opp_global_ctx,
+            self.card_stats,
+            self.bytecode_map,
+            self.bytecode_index,
+            self.batch_obs,
+            self.rewards,
+            self.dones,
+            self.prev_scores,
+            self.prev_opp_scores,
+            self.prev_phases,
+            self.terminal_obs_buffer,
+            self.batch_trash,
+            self.opp_trash,
+            self.batch_opp_history,
+            self.term_scores_agent,
+            self.term_scores_opp,
+            self.ability_member_ids,
+            self.ability_live_ids,
+            self.rng_states,
+            self.game_config,
+            self.opp_mode,
+            self.force_start_order,
+        )
+        # Apply Scenario Reward Scaling
+        if self.scenario_reward_scale != 1.0 and os.getenv("USE_SCENARIOS", "0") == "1":
+            self.rewards *= self.scenario_reward_scale
+        # 2. Update Episodic Returns/Lengths (Vectorized GPU)
+        self.episode_returns += self.rewards
+        self.episode_lengths += 1
+        # 3. Encode all environments once: next state for running envs, terminal
+        #    state for finished ones. (Per the original author's notes,
+        #    step_kernel itself does not encode observations.)
+        self._encode_observations()
+        infos = [{} for _ in range(self.num_envs)]
+        done_indices = np.flatnonzero(cp.asnumpy(self.dones))
+        if done_indices.size == 0:
+            return self.batch_obs, self.rewards, self.dones, infos
+        done_indices_gpu = cp.array(done_indices, dtype=cp.int32)
+        # 4. Keep the terminal observations before the reset overwrites them.
+        self.terminal_obs_buffer[done_indices_gpu] = self.batch_obs[done_indices_gpu]
+        # 5. Fetch Terminal Info Metrics (Bulk D2H)
+        final_returns = cp.asnumpy(self.episode_returns[done_indices_gpu])
+        final_lengths = cp.asnumpy(self.episode_lengths[done_indices_gpu])
+        term_obs_cpu = cp.asnumpy(self.terminal_obs_buffer[done_indices_gpu])
+        term_scores_ag = cp.asnumpy(self.term_scores_agent[done_indices_gpu])
+        term_scores_op = cp.asnumpy(self.term_scores_opp[done_indices_gpu])
+        # 6. Populate Infos (CPU loop over the finished subset only)
+        for k, idx in enumerate(done_indices):
+            infos[idx] = {
+                "terminal_observation": term_obs_cpu[k],
+                "episode": {"r": float(final_returns[k]), "l": int(final_lengths[k])},
+                "terminal_score_agent": int(term_scores_ag[k]),
+                "terminal_score_opp": int(term_scores_op[k]),
+            }
+        # 7. Reset finished envs. Tracking is cleared the same way reset() does
+        #    it: reset_kernel is not given prev_scores / prev_opp_scores, so they
+        #    are zeroed here together with the episode accumulators.
+        self._clear_tracking(done_indices_gpu)
+        self._launch_reset_kernel(done_indices_gpu)
+        # 8. Re-encode so batch_obs shows the initial state of the reset envs.
+        #    The encoders take no index list, so this covers all envs again.
+        self._encode_observations()
+        return self.batch_obs, self.rewards, self.dones, infos
+    def get_observations(self):
+        """Return the shared observation buffer ``self.batch_obs``.
+        The buffer is returned as-is: nothing is copied or re-encoded here. It is
+        a CuPy array on the CUDA path and a NumPy array on the CPU fallback.
+        """
+        return self.batch_obs
+    def get_action_masks(self):
+        """Build a fresh ``(num_envs, 2000)`` boolean mask of actions.
+        With CUDA the mask starts all-False and is filled in by
+        ``compute_action_masks_kernel``; without CUDA every entry is True.
+        """
+        mask_shape = (self.num_envs, 2000)
+        if HAS_CUDA:
+            from ai.cuda_kernels import compute_action_masks_kernel
+            legal = cp.zeros(mask_shape, dtype=cp.bool_)
+            launch = compute_action_masks_kernel[self.blocks_per_grid, self.threads_per_block]
+            launch(
+                self.num_envs,
+                self.batch_hand,
+                self.batch_stage,
+                self.batch_tapped,
+                self.batch_global_ctx,
+                self.batch_live,
+                self.card_stats,
+                legal,
+            )
+            return legal
+        return cp.ones(mask_shape, dtype=cp.bool_)
+# ============================================================================
+# BENCHMARK
+# ============================================================================
+def benchmark_gpu_env(num_envs=4096, steps=1000):
+    """Benchmark environment throughput with the all-zero action.
+    Args:
+        num_envs: Number of parallel environments to create.
+        steps: Number of timed ``step`` calls (after 10 untimed warm-up calls).
+    Returns:
+        Environment steps per second (``num_envs * steps / elapsed``), or
+        ``inf`` when the timed section took no measurable time.
+    """
+    print("\n=== GPU Environment Benchmark ===")
+    print(f"Environments: {num_envs}")
+    print(f"Steps: {steps}")
+    env = VectorEnvGPU(num_envs=num_envs)
+    env.reset()
+    # Warmup
+    for _ in range(10):
+        actions = cp.zeros(num_envs, dtype=cp.int32)
+        env.step(actions)
+    if HAS_CUDA:
+        cuda.synchronize()
+    # Benchmark: perf_counter is monotonic and has the best available
+    # resolution, unlike the wall clock returned by time.time().
+    start = time.perf_counter()
+    for _ in range(steps):
+        actions = cp.zeros(num_envs, dtype=cp.int32)  # Pass action
+        env.step(actions)
+    if HAS_CUDA:
+        # Wait for queued kernels so the timing covers the actual GPU work.
+        cuda.synchronize()
+    elapsed = time.perf_counter() - start
+    total_steps = num_envs * steps
+    # Guard against a zero interval (e.g. steps=0) instead of dividing by zero.
+    sps = total_steps / elapsed if elapsed > 0 else float("inf")
+    print("\nResults:")
+    print(f"  Total Steps: {total_steps:,}")
+    print(f"  Time: {elapsed:.2f}s")
+    print(f"  Throughput: {sps:,.0f} steps/sec")
+    return sps
+if __name__ == "__main__":
+    # Smoke test: build a small batch, reset it and take one all-zero step.
+    smoke_envs = 128
+    env = VectorEnvGPU(num_envs=smoke_envs)
+    obs = env.reset()
+    print(f"Observation shape: {obs.shape}")
+    obs, rewards, dones, infos = env.step(cp.zeros(smoke_envs, dtype=cp.int32))
+    print(f"Step completed. Rewards shape: {rewards.shape}")
+    # Then a short throughput benchmark.
+    benchmark_gpu_env(num_envs=1024, steps=100)