# Source: Hugging Face Space "trioskosmos/LovecaSim", file ai/environments/vector_env_gpu.py
# Uploaded by trioskosmos with huggingface_hub (commit 250ac83, verified; 34.4 kB).

	"""
	GPU-Native Vectorized Game Environment.

	This module provides VectorEnvGPU - a GPU-resident implementation using CuPy
	and Numba CUDA for maximum throughput. All game state arrays live in GPU VRAM,
	eliminating PCI-E transfer overhead during RL training.

	Usage:
	Set USE_GPU_ENV=1 to enable GPU environment in training.
	"""

	import json
	import os
	import time

	import numpy as np

	# CUDA detection: HAS_CUDA becomes True only when both CuPy and Numba import
	# successfully AND Numba reports a usable CUDA device.
	HAS_CUDA = False
	try:
	    import cupy as cp
	    from numba import cuda

	    if cuda.is_available():
	        HAS_CUDA = True
	        from numba.cuda.random import create_xoroshiro128p_states
	except ImportError:
	    pass

	# Mock objects for CPU fallback. They rebind the names `cp`, `cuda` and
	# `create_xoroshiro128p_states` to NumPy-backed stand-ins so the rest of the
	# module can be imported and constructed without a GPU.
	if not HAS_CUDA:

	    class MockCP:
	        """Minimal NumPy-backed stand-in for the parts of CuPy this module uses."""

	        int32 = np.int32
	        int8 = np.int8
	        float32 = np.float32
	        bool_ = np.bool_

	        def full(self, shape, val, dtype=None):
	            return np.full(shape, val, dtype=dtype)

	        def zeros(self, shape, dtype=None):
	            return np.zeros(shape, dtype=dtype)

	        def ones(self, shape, dtype=None):
	            return np.ones(shape, dtype=dtype)

	        def asnumpy(self, arr):
	            return np.array(arr)

	        def array(self, arr, dtype=None):
	            return np.array(arr, dtype=dtype)

	        def asarray(self, arr, dtype=None):
	            return np.asarray(arr, dtype=dtype)

	        def arange(self, n, dtype=None):
	            return np.arange(n, dtype=dtype)

	        def get_default_memory_pool(self):
	            class MockPool:
	                def used_bytes(self):
	                    return 0

	            return MockPool()

	    cp = MockCP()

	    class MockCudaMod:
	        """Stand-in for `numba.cuda` exposing only the calls this module makes."""

	        def to_device(self, arr):
	            return arr

	        def device_array(self, shape, dtype=None):
	            return np.zeros(shape, dtype=dtype)

	        def synchronize(self):
	            pass

	        def jit(self, *args, **kwargs):
	            """No-op replacement for `cuda.jit`.

	            Supports both decorator forms: bare `@cuda.jit` (the function is
	            passed directly and returned unchanged) and parameterised
	            `@cuda.jit(signature, device=True, ...)` (an identity decorator
	            is returned).
	            """
	            if len(args) == 1 and callable(args[0]) and not kwargs:
	                return args[0]
	            return lambda func: func

	        def grid(self, x):
	            return 0

	    cuda = MockCudaMod()

	    def create_xoroshiro128p_states(n, seed):
	        """CPU fallback: there are no GPU RNG states to create."""
	        return None


	class VectorEnvGPU:
	    """
	    GPU-Resident Vectorized Game Environment.

	    All state arrays are allocated through the module-level ``cp`` namespace:
	    CuPy arrays in GPU VRAM when CUDA was detected, NumPy-backed stand-ins on
	    the CPU fallback. On the fallback path ``reset`` and ``step`` launch no
	    kernels and only return the (zeroed) buffers.

	    Args:
	        num_envs: Number of parallel environments.
	        opp_mode: Opponent mode (0=Heuristic, 1=Random, 2=Solitaire).
	        force_start_order: -1=Random, 0=P1, 1=P2.
	        seed: Seed used to create the per-environment GPU RNG states.
	    """

	    # Observation width per OBS_MODE value; any other mode uses the default.
	    _OBS_DIMS = {"COMPRESSED": 512, "IMAX": 8192, "ATTENTION": 2240}
	    _DEFAULT_OBS_DIM = 2304

	    def __init__(self, num_envs: int = 4096, opp_mode: int = 0, force_start_order: int = -1, seed: int = 42):
	        self.num_envs = num_envs
	        self.opp_mode = opp_mode  # 0=Heuristic, 1=Random, 2=Solitaire
	        self.force_start_order = force_start_order
	        self.seed = seed

	        print(f" [VectorEnvGPU] Initializing {num_envs} environments. CUDA: {HAS_CUDA}")

	        # =========================================================
	        # AGENT STATE (GPU-Resident)
	        # =========================================================
	        self.batch_stage = cp.full((num_envs, 3), -1, dtype=cp.int32)
	        self.batch_energy_vec = cp.zeros((num_envs, 3, 32), dtype=cp.int32)
	        self.batch_energy_count = cp.zeros((num_envs, 3), dtype=cp.int32)
	        self.batch_continuous_vec = cp.zeros((num_envs, 32, 10), dtype=cp.int32)
	        self.batch_continuous_ptr = cp.zeros(num_envs, dtype=cp.int32)
	        self.batch_tapped = cp.zeros((num_envs, 16), dtype=cp.int32)
	        self.batch_live = cp.zeros((num_envs, 50), dtype=cp.int32)
	        self.batch_opp_tapped = cp.zeros((num_envs, 16), dtype=cp.int32)
	        self.batch_scores = cp.zeros(num_envs, dtype=cp.int32)

	        self.batch_flat_ctx = cp.zeros((num_envs, 64), dtype=cp.int32)
	        self.batch_global_ctx = cp.zeros((num_envs, 128), dtype=cp.int32)

	        self.batch_hand = cp.zeros((num_envs, 60), dtype=cp.int32)
	        self.batch_deck = cp.zeros((num_envs, 60), dtype=cp.int32)
	        self.batch_trash = cp.zeros((num_envs, 60), dtype=cp.int32)
	        self.batch_opp_history = cp.zeros((num_envs, 6), dtype=cp.int32)

	        # =========================================================
	        # OPPONENT STATE (GPU-Resident)
	        # =========================================================
	        self.opp_stage = cp.full((num_envs, 3), -1, dtype=cp.int32)
	        self.opp_energy_vec = cp.zeros((num_envs, 3, 32), dtype=cp.int32)
	        self.opp_energy_count = cp.zeros((num_envs, 3), dtype=cp.int32)
	        # NOTE(review): int8 here while batch_tapped is int32 -- confirm the
	        # kernels in ai.cuda_kernels expect this dtype.
	        self.opp_tapped = cp.zeros((num_envs, 16), dtype=cp.int8)
	        self.opp_live = cp.zeros((num_envs, 50), dtype=cp.int32)
	        self.opp_scores = cp.zeros(num_envs, dtype=cp.int32)
	        self.opp_global_ctx = cp.zeros((num_envs, 128), dtype=cp.int32)
	        self.opp_hand = cp.zeros((num_envs, 60), dtype=cp.int32)
	        self.opp_deck = cp.zeros((num_envs, 60), dtype=cp.int32)
	        self.opp_trash = cp.zeros((num_envs, 60), dtype=cp.int32)

	        # =========================================================
	        # TRACKING STATE
	        # =========================================================
	        self.prev_scores = cp.zeros(num_envs, dtype=cp.int32)
	        self.prev_opp_scores = cp.zeros(num_envs, dtype=cp.int32)
	        self.prev_phases = cp.zeros(num_envs, dtype=cp.int32)
	        self.episode_returns = cp.zeros(num_envs, dtype=cp.float32)
	        self.episode_lengths = cp.zeros(num_envs, dtype=cp.int32)

	        # =========================================================
	        # OBSERVATION MODE
	        # =========================================================
	        self.obs_mode = os.getenv("OBS_MODE", "STANDARD")
	        self.obs_dim = self._OBS_DIMS.get(self.obs_mode, self._DEFAULT_OBS_DIM)
	        print(f" [VectorEnvGPU] Observation Mode: {self.obs_mode} ({self.obs_dim}-dim)")

	        self.batch_obs = cp.zeros((num_envs, self.obs_dim), dtype=cp.float32)
	        self.terminal_obs_buffer = cp.zeros((num_envs, self.obs_dim), dtype=cp.float32)

	        # Rewards and Dones
	        self.rewards = cp.zeros(num_envs, dtype=cp.float32)
	        self.dones = cp.zeros(num_envs, dtype=cp.bool_)
	        self.term_scores_agent = cp.zeros(num_envs, dtype=cp.int32)
	        self.term_scores_opp = cp.zeros(num_envs, dtype=cp.int32)

	        # =========================================================
	        # GAME CONFIG
	        # =========================================================
	        self.scenario_reward_scale = float(os.getenv("SCENARIO_REWARD_SCALE", "1.0"))
	        if os.getenv("USE_SCENARIOS", "0") == "1" and self.scenario_reward_scale != 1.0:
	            print(f" [VectorEnvGPU] Scenario Reward Scale: {self.scenario_reward_scale}")

	        # Build the config on the host and upload once instead of issuing one
	        # tiny device write per element. Slots 6-9 stay zero.
	        host_config = np.zeros(10, dtype=np.float32)
	        host_config[0] = float(os.getenv("GAME_TURN_LIMIT", "100"))
	        host_config[1] = float(os.getenv("GAME_STEP_LIMIT", "1000"))
	        host_config[2] = float(os.getenv("GAME_REWARD_WIN", "100.0"))
	        host_config[3] = float(os.getenv("GAME_REWARD_LOSE", "-100.0"))
	        host_config[4] = float(os.getenv("GAME_REWARD_SCORE_SCALE", "50.0"))
	        host_config[5] = float(os.getenv("GAME_REWARD_TURN_PENALTY", "-0.05"))
	        self.game_config = cp.asarray(host_config)

	        # =========================================================
	        # GPU RNG
	        # =========================================================
	        self.rng_states = create_xoroshiro128p_states(num_envs, seed=seed) if HAS_CUDA else None

	        # =========================================================
	        # KERNEL CONFIGURATION
	        # =========================================================
	        self.threads_per_block = 128
	        # Ceiling division: enough blocks to give every env one thread.
	        self.blocks_per_grid = (num_envs + self.threads_per_block - 1) // self.threads_per_block

	        # =========================================================
	        # LOAD DATA
	        # =========================================================
	        self._load_bytecode()
	        self._load_card_stats()
	        self._load_deck_pool()

	        # Memory stats
	        if HAS_CUDA:
	            mempool = cp.get_default_memory_pool()
	            used_mb = mempool.used_bytes() / 1024 / 1024
	            print(f" [VectorEnvGPU] GPU VRAM used: {used_mb:.2f} MB")

	    # =========================================================
	    # DATA LOADING
	    # =========================================================

	    def _load_bytecode(self):
	        """Load compiled ability bytecode from data/cards_numba.json.

	        Sets ``self.bytecode_map`` (one row of up to 128 four-int
	        instructions per ability; row 0 is left empty) and
	        ``self.bytecode_index`` (card id x ability id -> row in the map,
	        0 meaning "no bytecode"). Loading is best-effort: on any failure a
	        warning is printed and whatever was filled so far is uploaded.
	        """
	        max_cards = 2000
	        max_abilities = 8
	        max_len = 128

	        host_map = np.zeros((100, max_len, 4), dtype=np.int32)
	        host_idx = np.zeros((max_cards, max_abilities), dtype=np.int32)

	        try:
	            with open("data/cards_numba.json", "r", encoding="utf-8") as f:
	                raw_map = json.load(f)

	            host_map = np.zeros((len(raw_map) + 1, max_len, 4), dtype=np.int32)
	            host_idx = np.zeros((max_cards, max_abilities), dtype=np.int32)

	            next_row = 1  # row 0 is reserved as the "no ability" entry
	            for key, bc_list in raw_map.items():
	                # Keys look like "<card_id>_<ability_id>".
	                cid, aid = map(int, key.split("_"))
	                if cid < max_cards and aid < max_abilities:
	                    bc_arr = np.array(bc_list, dtype=np.int32).reshape(-1, 4)
	                    length = min(bc_arr.shape[0], max_len)
	                    host_map[next_row, :length] = bc_arr[:length]
	                    host_idx[cid, aid] = next_row
	                    next_row += 1

	            # Report what was actually stored, not the raw entry count.
	            print(f" [VectorEnvGPU] Loaded {next_row - 1} compiled abilities.")
	        except FileNotFoundError:
	            print(" [VectorEnvGPU] Warning: cards_numba.json not found.")
	        except Exception as e:
	            print(f" [VectorEnvGPU] Warning: Failed to load bytecode: {e}")

	        self.bytecode_map = cp.asarray(host_map)
	        self.bytecode_index = cp.asarray(host_idx)

	    def _load_card_stats(self):
	        """Load per-card statistics from data/cards_compiled.json.

	        Sets ``self.card_stats``, a (2000, 80) int32 table indexed by card
	        id. Columns written here: 0 cost, 1 blades, 2 total hearts,
	        10 card type (1=member, 2=live), 11 group bitmask, 12-18 hearts /
	        required-hearts breakdown, 38 live score. Best-effort: on failure a
	        warning is printed and the partially filled table is uploaded.
	        """
	        max_cards = 2000
	        host_stats = np.zeros((max_cards, 80), dtype=np.int32)

	        try:
	            with open("data/cards_compiled.json", "r", encoding="utf-8") as f:
	                db = json.load(f)

	            count = 0
	            for cid_str, card in db.get("member_db", {}).items():
	                cid = int(cid_str)
	                if cid >= max_cards:
	                    continue
	                hearts = card.get("hearts", [])
	                host_stats[cid, 0] = card.get("cost", 0)
	                host_stats[cid, 1] = card.get("blades", 0)
	                host_stats[cid, 2] = sum(hearts)
	                host_stats[cid, 10] = 1  # Type: Member

	                # Hearts breakdown (at most 7 entries)
	                for r_idx, amount in enumerate(hearts[:7]):
	                    host_stats[cid, 12 + r_idx] = amount

	                # Traits: one bit per group id, folded into 20 bits.
	                mask = 0
	                for g in card.get("groups", []):
	                    try:
	                        mask |= 1 << (int(g) % 20)
	                    except (TypeError, ValueError):
	                        # Non-numeric group labels carry no bit.
	                        pass
	                host_stats[cid, 11] = mask
	                count += 1

	            for cid_str, card in db.get("live_db", {}).items():
	                cid = int(cid_str)
	                if cid >= max_cards:
	                    continue
	                host_stats[cid, 10] = 2  # Type: Live
	                for r_idx, amount in enumerate(card.get("required_hearts", [])[:7]):
	                    host_stats[cid, 12 + r_idx] = amount
	                host_stats[cid, 38] = card.get("score", 0)
	                count += 1

	            print(f" [VectorEnvGPU] Loaded stats for {count} cards.")
	        except Exception as e:
	            print(f" [VectorEnvGPU] Warning: Failed to load card stats: {e}")

	        self.card_stats = cp.asarray(host_stats)

	    def _load_deck_pool(self):
	        """Load the verified card pool used for deck generation.

	        Reads data/verified_card_pool.json (either a flat list of card
	        numbers or a dict with ``verified_abilities``/``members``/
	        ``verified_lives``/``lives`` and ``vanilla_*`` fallbacks) and maps
	        card numbers to ids via data/cards_compiled.json. Sets
	        ``self.ability_member_ids`` and ``self.ability_live_ids``; each
	        falls back to ``[1]`` / ``[999]`` when empty or when loading fails.
	        """
	        member_ids = []
	        live_ids = []

	        def collect(card_nos, lookup, out):
	            # Append the id of every card number known to `lookup`.
	            out.extend(lookup[no] for no in card_nos if no in lookup)

	        try:
	            with open("data/verified_card_pool.json", "r", encoding="utf-8") as f:
	                verified = json.load(f)

	            with open("data/cards_compiled.json", "r", encoding="utf-8") as f:
	                db = json.load(f)

	            member_no_map = {c["card_no"]: int(cid) for cid, c in db.get("member_db", {}).items()}
	            live_no_map = {c["card_no"]: int(cid) for cid, c in db.get("live_db", {}).items()}

	            if isinstance(verified, list):
	                # Flat list: a number found in both maps counts as a member only.
	                for v_no in verified:
	                    if v_no in member_no_map:
	                        member_ids.append(member_no_map[v_no])
	                    elif v_no in live_no_map:
	                        live_ids.append(live_no_map[v_no])
	            else:
	                collect(
	                    verified.get("verified_abilities", []) + verified.get("members", []),
	                    member_no_map,
	                    member_ids,
	                )
	                collect(
	                    verified.get("verified_lives", []) + verified.get("lives", []),
	                    live_no_map,
	                    live_ids,
	                )
	                if not member_ids:
	                    collect(verified.get("vanilla_members", []), member_no_map, member_ids)
	                if not live_ids:
	                    collect(verified.get("vanilla_lives", []), live_no_map, live_ids)

	            if not member_ids:
	                member_ids = [1]
	            if not live_ids:
	                live_ids = [999]

	            print(f" [VectorEnvGPU] Deck Pool: {len(member_ids)} members, {len(live_ids)} lives")
	        except Exception as e:
	            print(f" [VectorEnvGPU] Deck Load Error: {e}")
	            member_ids = [1]
	            live_ids = [999]

	        self.ability_member_ids = cp.array(member_ids, dtype=cp.int32)
	        self.ability_live_ids = cp.array(live_ids, dtype=cp.int32)

	    # =========================================================
	    # PYTORCH INTERFACE
	    # =========================================================

	    @staticmethod
	    def _as_torch(array):
	        """Wrap `array` as a torch tensor, on CUDA whenever CUDA is usable.

	        Falls back to a CPU tensor only when neither this module nor torch
	        sees a CUDA device (previously that case raised).
	        """
	        import torch

	        device = "cuda" if (HAS_CUDA or torch.cuda.is_available()) else "cpu"
	        return torch.as_tensor(array, device=device)

	    def get_observations_tensor(self):
	        """Return observations as PyTorch CUDA tensor (zero-copy)."""
	        return self._as_torch(self.batch_obs)

	    def get_action_masks_tensor(self):
	        """Return action masks as PyTorch CUDA tensor."""
	        return self._as_torch(self.get_action_masks())

	    def get_rewards_tensor(self):
	        """Return rewards as PyTorch CUDA tensor."""
	        return self._as_torch(self.rewards)

	    def get_dones_tensor(self):
	        """Return dones as PyTorch CUDA tensor."""
	        return self._as_torch(self.dones)

	    # =========================================================
	    # KERNEL LAUNCH HELPERS (CUDA path only)
	    # =========================================================

	    def _launch_reset_kernel(self, indices_gpu):
	        """Launch `reset_kernel` for the environments listed in `indices_gpu`."""
	        from ai.cuda_kernels import reset_kernel

	        blocks = (len(indices_gpu) + self.threads_per_block - 1) // self.threads_per_block
	        reset_kernel[blocks, self.threads_per_block](
	            indices_gpu,
	            self.batch_stage,
	            self.batch_energy_vec,
	            self.batch_energy_count,
	            self.batch_continuous_vec,
	            self.batch_continuous_ptr,
	            self.batch_tapped,
	            self.batch_live,
	            self.batch_scores,
	            self.batch_flat_ctx,
	            self.batch_global_ctx,
	            self.batch_hand,
	            self.batch_deck,
	            self.batch_trash,
	            self.batch_opp_history,
	            self.opp_stage,
	            self.opp_energy_vec,
	            self.opp_energy_count,
	            self.opp_tapped,
	            self.opp_live,
	            self.opp_scores,
	            self.opp_global_ctx,
	            self.opp_hand,
	            self.opp_deck,
	            self.opp_trash,
	            self.ability_member_ids,
	            self.ability_live_ids,
	            self.rng_states,
	            self.force_start_order,
	            self.batch_obs,
	            self.card_stats,
	        )

	    def _encode_observations(self, turn_num=1):
	        """Encode the current state of ALL environments into `self.batch_obs`.

	        `turn_num` is forwarded to the kernel; callers in this class always
	        pass the placeholder value 1.
	        """
	        from ai.cuda_kernels import encode_observations_attention_kernel, encode_observations_kernel

	        launch = (self.blocks_per_grid, self.threads_per_block)
	        if self.obs_mode == "ATTENTION":
	            encode_observations_attention_kernel[launch](
	                self.num_envs,
	                self.batch_hand,
	                self.batch_stage,
	                self.batch_energy_count,
	                self.batch_tapped,
	                self.batch_scores,
	                self.opp_scores,
	                self.opp_stage,
	                self.opp_tapped,
	                self.card_stats,
	                self.batch_global_ctx,
	                self.batch_live,
	                self.batch_opp_history,
	                self.opp_global_ctx,
	                turn_num,
	                self.batch_obs,
	            )
	        else:
	            encode_observations_kernel[launch](
	                self.num_envs,
	                self.batch_hand,
	                self.batch_stage,
	                self.batch_energy_count,
	                self.batch_tapped,
	                self.batch_scores,
	                self.opp_scores,
	                self.opp_stage,
	                self.opp_tapped,
	                self.card_stats,
	                self.batch_global_ctx,
	                self.batch_live,
	                turn_num,
	                self.batch_obs,
	            )

	    @staticmethod
	    def _actions_to_device(actions):
	        """Convert `actions` to an int32 device array for `step_kernel`.

	        NumPy arrays and CPU torch tensors are uploaded; CUDA torch tensors
	        are wrapped in place through the CUDA array interface instead of
	        being round-tripped through host memory. Anything else (e.g. a
	        CuPy array) is passed through unchanged.
	        """
	        if isinstance(actions, np.ndarray):
	            return cp.asarray(actions, dtype=cp.int32)

	        try:
	            import torch
	        except ImportError:
	            torch = None

	        if torch is not None and isinstance(actions, torch.Tensor):
	            actions = actions.detach()
	            if actions.is_cuda:
	                # NOTE(review): assumes torch and CuPy/Numba work on the same
	                # (default) stream, as the CuPy pass-through path already does.
	                return cp.asarray(actions).astype(cp.int32, copy=False)
	            return cp.asarray(actions.numpy(), dtype=cp.int32)

	        return actions

	    # =========================================================
	    # ENVIRONMENT INTERFACE
	    # =========================================================

	    def reset(self, indices=None):
	        """Reset environments and return the observation buffer.

	        Args:
	            indices: Environment indices to reset, or None for all.

	        Returns:
	            ``self.batch_obs`` (re-encoded for all envs on the CUDA path).
	        """
	        if not HAS_CUDA:
	            # CPU fallback: clears the whole batch regardless of `indices`.
	            self.batch_stage.fill(-1)
	            self.batch_scores.fill(0)
	            self.batch_global_ctx.fill(0)
	            self.batch_hand.fill(0)
	            self.batch_deck.fill(0)
	            return self.batch_obs

	        if indices is None:
	            indices_gpu = cp.arange(self.num_envs, dtype=cp.int32)
	        else:
	            indices_gpu = cp.array(indices, dtype=cp.int32)

	        # Nothing to reset; a zero-block kernel launch is not valid.
	        if indices_gpu.size == 0:
	            return self.batch_obs

	        self._launch_reset_kernel(indices_gpu)

	        # Encode initial observations
	        self._encode_observations()

	        # Reset tracking
	        trackers = (self.prev_scores, self.prev_opp_scores, self.episode_returns, self.episode_lengths)
	        if indices is None:
	            for tracker in trackers:
	                tracker.fill(0)
	        else:
	            for tracker in trackers:
	                tracker[indices_gpu] = 0

	        return self.batch_obs

	    def step(self, actions):
	        """
	        Step all environments, auto-resetting those that finished.

	        Args:
	            actions: CuPy array, NumPy array or PyTorch tensor of actions.

	        Returns:
	            obs, rewards, dones, infos. ``infos`` holds one dict per env;
	            for envs that finished this step it contains
	            ``terminal_observation``, ``episode`` (``r``/``l``) and the
	            terminal scores. ``obs`` for those envs is the freshly reset
	            state.
	        """
	        if not HAS_CUDA:
	            # Fallback: one independent dict per env (no shared aliasing).
	            return self.batch_obs, self.rewards, self.dones, [{} for _ in range(self.num_envs)]

	        from ai.cuda_kernels import step_kernel

	        actions_gpu = self._actions_to_device(actions)

	        # 1. Step kernel
	        step_kernel[self.blocks_per_grid, self.threads_per_block](
	            self.num_envs,
	            actions_gpu,
	            self.batch_hand,
	            self.batch_deck,
	            self.batch_stage,
	            self.batch_energy_vec,
	            self.batch_energy_count,
	            self.batch_continuous_vec,
	            self.batch_continuous_ptr,
	            self.batch_tapped,
	            self.batch_live,
	            self.batch_scores,
	            self.batch_flat_ctx,
	            self.batch_global_ctx,
	            self.opp_hand,
	            self.opp_deck,
	            self.opp_stage,
	            self.opp_energy_vec,
	            self.opp_energy_count,
	            self.opp_tapped,
	            self.opp_live,
	            self.opp_scores,
	            self.opp_global_ctx,
	            self.card_stats,
	            self.bytecode_map,
	            self.bytecode_index,
	            self.batch_obs,
	            self.rewards,
	            self.dones,
	            self.prev_scores,
	            self.prev_opp_scores,
	            self.prev_phases,
	            self.terminal_obs_buffer,
	            self.batch_trash,
	            self.opp_trash,
	            self.batch_opp_history,
	            self.term_scores_agent,
	            self.term_scores_opp,
	            self.ability_member_ids,
	            self.ability_live_ids,
	            self.rng_states,
	            self.game_config,
	            self.opp_mode,
	            self.force_start_order,
	        )

	        # Apply Scenario Reward Scaling
	        if self.scenario_reward_scale != 1.0 and os.getenv("USE_SCENARIOS", "0") == "1":
	            self.rewards *= self.scenario_reward_scale

	        # 2. Update Episodic Returns/Lengths (Vectorized GPU)
	        self.episode_returns += self.rewards
	        self.episode_lengths += 1

	        # 3. Encode every env once: next state for running envs, terminal
	        #    state for finished ones. Per the original author's notes,
	        #    step_kernel itself does not encode, so batch_obs is stale
	        #    until this call.
	        dones_cpu = cp.asnumpy(self.dones)
	        infos = [{} for _ in range(self.num_envs)]
	        self._encode_observations()

	        if not np.any(dones_cpu):
	            return self.batch_obs, self.rewards, self.dones, infos

	        # 4. Handle Auto-Reset
	        done_indices = np.where(dones_cpu)[0]
	        done_indices_gpu = cp.array(done_indices, dtype=cp.int32)

	        # A. Keep the encoded terminal observations (device-to-device copy).
	        self.terminal_obs_buffer[done_indices_gpu] = self.batch_obs[done_indices_gpu]

	        # B. Fetch terminal metrics for the finished envs in bulk.
	        final_returns = cp.asnumpy(self.episode_returns[done_indices_gpu])
	        final_lengths = cp.asnumpy(self.episode_lengths[done_indices_gpu])
	        term_obs_cpu = cp.asnumpy(self.terminal_obs_buffer[done_indices_gpu])
	        term_scores_ag = cp.asnumpy(self.term_scores_agent[done_indices_gpu])
	        term_scores_op = cp.asnumpy(self.term_scores_opp[done_indices_gpu])

	        # C. Populate infos (CPU loop over the finished subset only).
	        for k, idx in enumerate(done_indices):
	            infos[idx] = {
	                "terminal_observation": term_obs_cpu[k],
	                "episode": {"r": float(final_returns[k]), "l": int(final_lengths[k])},
	                "terminal_score_agent": int(term_scores_ag[k]),
	                "terminal_score_opp": int(term_scores_op[k]),
	            }

	        # D. Reset accumulators and game state of the finished envs.
	        self.episode_returns[done_indices_gpu] = 0
	        self.episode_lengths[done_indices_gpu] = 0
	        self._launch_reset_kernel(done_indices_gpu)

	        # E. Re-encode so the returned obs holds the initial state of the
	        #    reset envs. The encode kernels take no index list, so this
	        #    re-encodes every env; redundant for running envs but correct.
	        self._encode_observations()

	        return self.batch_obs, self.rewards, self.dones, infos

	    def get_observations(self):
	        """Return observation buffer (CuPy array)."""
	        return self.batch_obs

	    def get_action_masks(self):
	        """Compute and return action masks (CuPy array).

	        Returns a (num_envs, 2000) boolean array; on the CPU fallback every
	        action is reported as legal.
	        """
	        if not HAS_CUDA:
	            return cp.ones((self.num_envs, 2000), dtype=cp.bool_)

	        from ai.cuda_kernels import compute_action_masks_kernel

	        masks = cp.zeros((self.num_envs, 2000), dtype=cp.bool_)

	        compute_action_masks_kernel[self.blocks_per_grid, self.threads_per_block](
	            self.num_envs,
	            self.batch_hand,
	            self.batch_stage,
	            self.batch_tapped,
	            self.batch_global_ctx,
	            self.batch_live,
	            self.card_stats,
	            masks,
	        )

	        return masks


	# ============================================================================
	# BENCHMARK
	# ============================================================================


	def benchmark_gpu_env(num_envs=4096, steps=1000):
	    """Benchmark GPU environment throughput.

	    Builds a fresh VectorEnvGPU, runs 10 warm-up steps, then times `steps`
	    steps in which every environment receives action 0.

	    Args:
	        num_envs: Number of parallel environments to simulate.
	        steps: Number of timed `env.step` calls.

	    Returns:
	        Throughput in environment-steps per second (``inf`` if the timed
	        section took no measurable time).
	    """
	    print("\n=== GPU Environment Benchmark ===")
	    print(f"Environments: {num_envs}")
	    print(f"Steps: {steps}")

	    env = VectorEnvGPU(num_envs=num_envs)
	    env.reset()

	    # Warmup (also triggers kernel compilation before timing starts)
	    for _ in range(10):
	        actions = cp.zeros(num_envs, dtype=cp.int32)
	        env.step(actions)

	    if HAS_CUDA:
	        cuda.synchronize()

	    # Benchmark. perf_counter is monotonic and high-resolution, unlike
	    # time.time(), which can report a zero interval for short runs.
	    start = time.perf_counter()
	    for _ in range(steps):
	        actions = cp.zeros(num_envs, dtype=cp.int32)  # Pass action
	        env.step(actions)

	    # Wait for queued GPU work so it is included in the measurement.
	    if HAS_CUDA:
	        cuda.synchronize()

	    elapsed = time.perf_counter() - start
	    total_steps = num_envs * steps
	    sps = total_steps / elapsed if elapsed > 0 else float("inf")

	    print("\nResults:")
	    print(f" Total Steps: {total_steps:,}")
	    print(f" Time: {elapsed:.2f}s")
	    print(f" Throughput: {sps:,.0f} steps/sec")

	    return sps


	if __name__ == "__main__":
	    # Smoke test: build a small batch, reset it and take a single step.
	    smoke_envs = 128
	    smoke_env = VectorEnvGPU(num_envs=smoke_envs)
	    first_obs = smoke_env.reset()
	    print(f"Observation shape: {first_obs.shape}")

	    zero_actions = cp.zeros(smoke_envs, dtype=cp.int32)
	    step_result = smoke_env.step(zero_actions)
	    step_rewards = step_result[1]
	    print(f"Step completed. Rewards shape: {step_rewards.shape}")

	    # Then run a short throughput benchmark.
	    benchmark_gpu_env(num_envs=1024, steps=100)