# GRPO_chessagent / model_aggressive.py
# NOTE FOR COLAB USERS: Run in a separate cell first:
# !pip -q install chess numpy torch matplotlib pandas
"""
Aggressive GRPO Chess Agent — T4/Colab Optimized
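
Typical invocation (CLI flags are defined in the __main__ block at the bottom of this file):
    python model_aggressive.py --iterations 10000   # default iteration count
    python model_aggressive.py --test-batch         # 2-iteration smoke test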
"""
import os, sys, csv, time, math, shutil, argparse, random
import numpy as np
import pandas as pd
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
try:
import chess
except ImportError:
os.system("pip install -q chess")
import chess
import torch
import torch.nn as nn
import torch.nn.functional as F
# ── Hardware flags ─────────────────────────────────────────────────────────────
torch.backends.cudnn.benchmark = True
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
if hasattr(torch, 'set_float32_matmul_precision'):
torch.set_float32_matmul_precision('high')
# ── Constants ──────────────────────────────────────────────────────────────────
PIECE_VAL = {
chess.PAWN: 1.0, chess.KNIGHT: 3.0, chess.BISHOP: 3.2,
chess.ROOK: 5.0, chess.QUEEN: 9.0, chess.KING: 0.0,
}
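# These values feed the capture-shaping term in the rollout: reward += (value / 9) * 0.3,
# so winning a queen earns the full +0.3 capture bonus.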
RANDOM_BASELINE_ELO = 800 # estimated ELO of uniform-random player
CONFIG = {
"num_envs": 256,
"grpo_group_size": 8, # G envs per group, all start from same opening position
"ppo_epochs": 3,
"mini_batch_size": 4096,
"learning_rate": 2e-4,
"weight_decay": 1e-4,
"gamma": 0.98, # lower β†’ discount future more β†’ prefer fast wins
"clip_epsilon": 0.15,
"entropy_coef": 0.02, # low β†’ exploit aggressive lines
"value_coef": 0.5,
"max_steps": 100,
"opening_max_moves": 10, # randomize opening for GRPO diversity
"checkpoint_dir": "./checkpoints",
"save_interval": 50,
"log_interval": 1,
"elo_eval_interval": 100, # evaluate ELO every N iterations
"elo_eval_games": 32,
"max_runtime_hours": 4.5, # auto-save + download before Colab kills session
"device": "cuda" if torch.cuda.is_available() else "cpu",
"seed": 42,
}
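# With these defaults: 256 envs / group size 8 = 32 GRPO groups per iteration,
# each group rolling out up to 100 plies from a shared random opening.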
# ── Action Space ───────────────────────────────────────────────────────────────
class ActionMapper:
__slots__ = ['move_to_idx', 'idx_to_move', 'num_actions']
def __init__(self):
self.move_to_idx: dict[str, int] = {}
self.idx_to_move: list[str] = []
idx = 0
for f in range(64):
for t in range(64):
if f == t: continue
uci = chess.SQUARE_NAMES[f] + chess.SQUARE_NAMES[t]
self.move_to_idx[uci] = idx
self.idx_to_move.append(uci)
idx += 1
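# Promotion actions are over-generated: every source square on the 2nd/7th rank gets
# 4 promotion variants (n/b/r/q) toward any target within one file. Unreachable entries
# are harmless; the legal-move mask filters them out at runtime.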
if chess.square_rank(f) in (1, 6) and \
abs(chess.square_file(f) - chess.square_file(t)) <= 1:
for promo in "nbrq":
puci = uci + promo
self.move_to_idx[puci] = idx
self.idx_to_move.append(puci)
idx += 1
self.num_actions = idx
ACTION_MAPPER = ActionMapper()
# ── Board Encoding ─────────────────────────────────────────────────────────────
def populate_states_fast(envs: list, active_mask: np.ndarray,
bbs_np: np.ndarray, meta_np: np.ndarray) -> None:
"""Fill bbs_np [B,12] int64 and meta_np [B,3] float32 for active envs."""
for b in range(len(envs)):
if not active_mask[b]: continue
env = envs[b]
w = env.occupied_co[chess.WHITE]
bc = env.occupied_co[chess.BLACK]
bbs_np[b, 0] = env.pawns & w; bbs_np[b, 1] = env.knights & w
bbs_np[b, 2] = env.bishops & w; bbs_np[b, 3] = env.rooks & w
bbs_np[b, 4] = env.queens & w; bbs_np[b, 5] = env.kings & w
bbs_np[b, 6] = env.pawns & bc; bbs_np[b, 7] = env.knights & bc
bbs_np[b, 8] = env.bishops & bc; bbs_np[b, 9] = env.rooks & bc
bbs_np[b, 10] = env.queens & bc; bbs_np[b, 11] = env.kings & bc
meta_np[b, 0] = 1.0 if env.turn else -1.0
meta_np[b, 1] = bin(env.castling_rights).count("1") / 4.0 # castling_rights is a rook-square bitmask; popcount of rights, scaled to [0,1]
meta_np[b, 2] = 1.0 if env.ep_square is not None else 0.0
def get_legal_masks(envs: list, active_mask: np.ndarray):
masks = np.zeros((len(envs), ACTION_MAPPER.num_actions), dtype=np.bool_)
moves_list = [None] * len(envs)
for b in range(len(envs)):
if not active_mask[b]: continue
legal = list(envs[b].legal_moves)
moves_list[b] = legal
for m in legal:
masks[b, ACTION_MAPPER.move_to_idx[m.uci()]] = True
return masks, moves_list
# ── Neural Network ─────────────────────────────────────────────────────────────
class ChessNet(nn.Module):
def __init__(self, res_blocks: int = 8, channels: int = 128):
super().__init__()
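# Input: 14 planes of 8x8: 12 piece bitboards (white P,N,B,R,Q,K then black),
# a side-to-move plane, and a castling/en-passant plane.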
self.conv_in = nn.Conv2d(14, channels, 3, padding=1, bias=False)
self.bn_in = nn.BatchNorm2d(channels)
self.res_blocks = nn.ModuleList([
nn.Sequential(
nn.Conv2d(channels, channels, 3, padding=1, bias=False),
nn.BatchNorm2d(channels), nn.ReLU(inplace=True),
nn.Conv2d(channels, channels, 3, padding=1, bias=False),
nn.BatchNorm2d(channels),
) for _ in range(res_blocks)
])
self.policy_head = nn.Sequential(
nn.Conv2d(channels, 32, 1, bias=False), nn.BatchNorm2d(32),
nn.ReLU(inplace=True), nn.Flatten(),
nn.Linear(32 * 64, ACTION_MAPPER.num_actions),
)
# No Tanh — shaped rewards exceed [-1,1]; unbounded linear output
self.value_head = nn.Sequential(
nn.Conv2d(channels, 32, 1, bias=False), nn.BatchNorm2d(32),
nn.ReLU(inplace=True), nn.Flatten(),
nn.Linear(32 * 64, 256), nn.ReLU(inplace=True),
nn.Linear(256, 1),
)
def forward(self, x):
x = F.relu(self.bn_in(self.conv_in(x)), inplace=True)
for blk in self.res_blocks:
x = F.relu(x + blk(x), inplace=True)
return self.policy_head(x), self.value_head(x)
# ── ELO Tracker ───────────────────────────────────────────────────────────────
class ELOTracker:
def __init__(self, initial_elo: float = 1200.0, K: float = 32.0):
self.elo = initial_elo
self.K = K
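# Standard Elo model: expected score E = 1 / (1 + 10^((R_opp - R_self) / 400));
# update R <- R + K * (actual_score - E).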
def expected(self, opp_elo: float) -> float:
return 1.0 / (1.0 + 10.0 ** ((opp_elo - self.elo) / 400.0))
def update(self, score: float, opp_elo: float) -> None:
self.elo += self.K * (score - self.expected(opp_elo))
# ── Opening Position Generator ─────────────────────────────────────────────────
def get_opening_position(max_moves: int = 10) -> chess.Board:
"""Play 0..max_moves random half-moves from start for GRPO diversity."""
board = chess.Board()
for _ in range(random.randint(0, max_moves)):
if board.is_game_over(): break
board.push(random.choice(list(board.legal_moves)))
return chess.Board(board.fen()) # detached copy
# ── Auto-download ──────────────────────────────────────────────────────────────
def auto_download(checkpoint_dir: str) -> None:
"""Sync to Google Drive if mounted, else trigger browser downloads."""
try:
from google.colab import files as _cf
drive_dst = '/content/drive/MyDrive/chess_agent'
if os.path.exists('/content/drive/MyDrive'):
os.makedirs(drive_dst, exist_ok=True)
shutil.copytree(checkpoint_dir, drive_dst, dirs_exist_ok=True)
print(f"[AutoSave] Synced β†’ {drive_dst}")
else:
for fname in ['best.pt', 'latest.pt', 'training_log.csv',
'elo_log.csv', 'training_performance.png']:
fpath = os.path.join(checkpoint_dir, fname)
if os.path.exists(fpath):
_cf.download(fpath)
print(f"[AutoSave] Downloaded {fname}")
except Exception as e:
print(f"[AutoSave] {e}")
# ── GRPO Trainer ───────────────────────────────────────────────────────────────
class GRPOTrainer:
def __init__(self):
self.device = CONFIG["device"]
_model = ChessNet(res_blocks=8, channels=128)
_model = _model.to(self.device).to(memory_format=torch.channels_last)
try:
print("Compiling model (reduce-overhead)…")
self.model = torch.compile(_model, mode="reduce-overhead")
except Exception:
self.model = _model
self.optimizer = torch.optim.AdamW(
self.model.parameters(),
lr=CONFIG["learning_rate"],
weight_decay=CONFIG["weight_decay"],
fused=torch.cuda.is_available(),
)
self.scaler = torch.amp.GradScaler('cuda')
self.start_iter = 0
self.best_win_rate = 0.0
self.elo_tracker = ELOTracker()
# Shared shift tensor for bit-unpacking (avoid repeated allocation)
self.shifts = torch.arange(64, dtype=torch.int64,
device=self.device).view(1, 1, 64)
os.makedirs(CONFIG["checkpoint_dir"], exist_ok=True)
self.log_file = os.path.join(CONFIG["checkpoint_dir"], "training_log.csv")
self.elo_log_file = os.path.join(CONFIG["checkpoint_dir"], "elo_log.csv")
if not os.path.exists(self.log_file):
with open(self.log_file, "w", newline="") as f:
csv.writer(f).writerow([
"iteration", "p_loss", "v_loss", "v_mean", "fps",
"win_rate", "draw_rate", "check_rate", "capture_rate", "avg_game_len",
])
if not os.path.exists(self.elo_log_file):
with open(self.elo_log_file, "w", newline="") as f:
csv.writer(f).writerow(
["iteration", "elo", "eval_wins", "eval_draws", "eval_losses"])
self._init_checkpointing()
# ── Checkpointing ──────────────────────────────────────────────────────────
def _init_checkpointing(self) -> None:
latest = os.path.join(CONFIG["checkpoint_dir"], "latest.pt")
if not os.path.exists(latest):
return
try:
ckpt = torch.load(latest, map_location=self.device, weights_only=False)
sd = ckpt['model_state_dict']
# Handle compiled (_orig_mod. prefix) vs uncompiled state dicts
loaded = False
for attempt in [
sd,
{k.replace('_orig_mod.', ''): v for k, v in sd.items()},
{'_orig_mod.' + k: v for k, v in sd.items()},
]:
try:
self.model.load_state_dict(attempt); loaded = True; break
except RuntimeError:
continue
if not loaded:
raise RuntimeError("All state dict key variants failed.")
self.optimizer.load_state_dict(ckpt['optimizer_state_dict'])
self.scaler.load_state_dict(ckpt['scaler_state_dict'])
self.start_iter = ckpt.get('iteration', 0) + 1
self.elo_tracker.elo = ckpt.get('elo', 1200.0)
self.best_win_rate = ckpt.get('best_win_rate', 0.0)
print(f"Resumed from iter {self.start_iter} | "
f"ELO {self.elo_tracker.elo:.0f} | best_win {self.best_win_rate:.3f}")
except Exception as e:
print(f"Checkpoint load failed ({e}). Starting fresh.")
def save_checkpoint(self, iteration: int, is_best: bool = False) -> None:
ckpt = {
'iteration': iteration,
'model_state_dict': self.model.state_dict(),
'optimizer_state_dict': self.optimizer.state_dict(),
'scaler_state_dict': self.scaler.state_dict(),
'elo': self.elo_tracker.elo,
'best_win_rate': self.best_win_rate,
'config': CONFIG,
}
cdir = CONFIG["checkpoint_dir"]
path = os.path.join(cdir, f"iter_{iteration:04d}.pt")
# Atomic write: write to .tmp then os.replace (single syscall, crash-safe)
torch.save(ckpt, path + ".tmp"); os.replace(path + ".tmp", path)
latest = os.path.join(cdir, "latest.pt")
shutil.copy2(path, latest + ".tmp"); os.replace(latest + ".tmp", latest)
if is_best:
best = os.path.join(cdir, "best.pt")
shutil.copy2(path, best + ".tmp"); os.replace(best + ".tmp", best)
# ── ELO Evaluation (batched, greedy) ──────────────────────────────────────
def _elo_game_done(self, board: chess.Board, idx: int, agent_color,
scores: np.ndarray, active: np.ndarray) -> None:
if board.is_game_over():
res = board.result()
if (res == "1-0" and agent_color == chess.WHITE) or \
(res == "0-1" and agent_color == chess.BLACK):
scores[idx] = 1.0
elif res == "1/2-1/2":
scores[idx] = 0.5
else:
scores[idx] = 0.0
active[idx] = False
def evaluate_elo(self, n_games: int = 32, max_ply: int = 200) -> tuple:
"""
Play n_games vs random opponent (batched GPU for agent moves).
Half games as White, half as Black.
Returns (wins, draws, losses) from agent's perspective.
"""
self.model.eval()
boards = [chess.Board() for _ in range(n_games)]
agent_colors = [chess.WHITE if i % 2 == 0 else chess.BLACK
for i in range(n_games)]
scores = np.full(n_games, 0.5, dtype=np.float32) # default: draw
active = np.ones(n_games, dtype=bool)
bbs_sub = np.zeros((n_games, 12), dtype=np.uint64)  # uint64: bitboards may set bit 63 (square h8)
meta_sub= np.zeros((n_games, 3), dtype=np.float32)
for _ in range(max_ply):
if not active.any(): break
# Random moves (opponent turns) β€” CPU
for i in [i for i in range(n_games)
if active[i] and boards[i].turn != agent_colors[i]]:
legal = list(boards[i].legal_moves)
if legal: boards[i].push(random.choice(legal))
self._elo_game_done(boards[i], i, agent_colors[i], scores, active)
# Agent moves (batched GPU)
ag_idx = [i for i in range(n_games)
if active[i] and boards[i].turn == agent_colors[i]]
if not ag_idx:
continue
n = len(ag_idx)
sub = [boards[i] for i in ag_idx]
act_sub = np.ones(n, dtype=bool)
populate_states_fast(sub, act_sub, bbs_sub[:n], meta_sub[:n])
bbs_t = torch.tensor(bbs_sub[:n].view(np.int64), dtype=torch.int64, device=self.device)  # reinterpret uint64 bits as int64
unpacked = ((bbs_t.unsqueeze(-1) >> self.shifts) & 1).float().view(n, 12, 8, 8)
state = torch.zeros(n, 14, 8, 8, device=self.device, dtype=torch.float32)
state[:, :12] = unpacked
state[:, 12] = torch.tensor(meta_sub[:n, 0], device=self.device).view(n, 1, 1).expand(n, 8, 8)
state[:, 13] = torch.tensor(meta_sub[:n, 1], device=self.device).view(n, 1, 1).expand(n, 8, 8)
for lj in range(n):
if meta_sub[lj, 2]:
state[lj, 13, 0, 1] = float(meta_sub[lj, 2])
with torch.no_grad(), torch.amp.autocast('cuda'):
logits, _ = self.model(state.to(memory_format=torch.channels_last))
logits = logits.float()
masks_np, legal_lists = get_legal_masks(sub, act_sub)
masks_t = torch.tensor(masks_np, dtype=torch.bool, device=self.device)
logits = torch.where(masks_t, logits,
torch.tensor(-60000.0, device=self.device))
best_acts = logits.argmax(dim=-1).cpu().numpy() # greedy for evaluation
for lj, gi in enumerate(ag_idx):
if not active[gi]: continue
move_uci = ACTION_MAPPER.idx_to_move[best_acts[lj]]
move = chess.Move.from_uci(move_uci)
legal = legal_lists[lj] or list(boards[gi].legal_moves)
if not legal:
active[gi] = False; continue
if move not in legal:
move = random.choice(legal)
boards[gi].push(move)
self._elo_game_done(boards[gi], gi, agent_colors[gi], scores, active)
wins = int((scores == 1.0).sum())
draws = int((scores == 0.5).sum())
losses = int((scores == 0.0).sum())
for s in scores:
self.elo_tracker.update(float(s), RANDOM_BASELINE_ELO)
return wins, draws, losses
# ── Main Training Loop ─────────────────────────────────────────────────────
def train(self, num_iterations: int) -> None:
B = CONFIG["num_envs"]
max_steps = CONFIG["max_steps"]
G = CONFIG["grpo_group_size"]
num_groups= B // G
gamma = CONFIG["gamma"]
t_start = time.time()
max_rt = CONFIG["max_runtime_hours"] * 3600.0
# ── Preallocate GPU buffers (int8/bool minimizes VRAM footprint) ──────
states_buf = torch.zeros((max_steps, B, 14, 8, 8), dtype=torch.int8, device=self.device)
actions_buf = torch.zeros((max_steps, B), dtype=torch.int16, device=self.device)
logprobs_buf= torch.zeros((max_steps, B), dtype=torch.float32, device=self.device)
values_buf = torch.zeros((max_steps, B), dtype=torch.float32, device=self.device)
rewards_buf = torch.zeros((max_steps, B), dtype=torch.float32, device=self.device)
dones_buf = torch.zeros((max_steps, B), dtype=torch.bool, device=self.device)
active_buf = torch.zeros((max_steps, B), dtype=torch.bool, device=self.device)
bbs_np = np.zeros((B, 12), dtype=np.uint64) # uint64: bitboards with bit 63 set (h8) overflow int64; reinterpreted for torch below
meta_np = np.zeros((B, 3), dtype=np.float32)
vram_gb = (torch.cuda.get_device_properties(0).total_memory / 1e9
if torch.cuda.is_available() else 0.0)
print(f"\nπŸš€ Aggressive GRPO Chess Agent")
print(f" Envs:{B} | Groups:{num_groups}Γ—G:{G} | Device:{self.device.upper()} | "
f"VRAM:{vram_gb:.1f}GB")
print(f" Reward: capture(0-0.3)+check(0.3)+checkmate_speed(1.0-1.5)"
f"+draw_penalty(-0.5)+time(-0.003/step)")
print(f" gamma:{gamma} | entropy:{CONFIG['entropy_coef']} | "
f"lr:{CONFIG['learning_rate']}")
for iteration in range(self.start_iter, num_iterations):
# ── Runtime guard ──────────────────────────────────────────────
elapsed = time.time() - t_start
if elapsed > max_rt:
print(f"\n⏱ {elapsed/3600:.2f}h reached. Saving & downloading…")
self.save_checkpoint(iteration)
self.plot_metrics()
auto_download(CONFIG["checkpoint_dir"])
break
iter_start = time.time()
# Zero buffers in-place (no reallocation)
states_buf.zero_(); actions_buf.zero_(); logprobs_buf.zero_()
values_buf.zero_(); rewards_buf.zero_()
dones_buf.fill_(False); active_buf.fill_(False)
# ── GRPO: each group of G envs shares an opening position ──────
fens = [get_opening_position(CONFIG["opening_max_moves"]).fen()
for _ in range(num_groups)]
envs: list[chess.Board] = []
for gi in range(num_groups):
for _ in range(G):
envs.append(chess.Board(fens[gi]))
active = np.ones(B, dtype=bool)
game_lengths = np.zeros(B, dtype=np.int32)
# Per-iteration attack metrics
white_wins = black_wins = draws_count = 0
total_checks = total_captures = 0
# ── PHASE 1: ROLLOUT ───────────────────────────────────────────
for t in range(max_steps):
if not active.any(): break
populate_states_fast(envs, active, bbs_np, meta_np)
# Bit-unpack bitboards → int8 state tensor (no float copy).
# uint64 bitboards are reinterpreted as int64; (bb >> k) & 1 still extracts bit k
# (square k) even when bit 63 makes the reinterpreted value negative.
bbs_t = torch.as_tensor(bbs_np.view(np.int64), dtype=torch.int64, device=self.device)
unpacked = ((bbs_t.unsqueeze(-1) >> self.shifts) & 1).to(torch.int8)
meta_t = torch.as_tensor(meta_np, dtype=torch.float32, device=self.device)
# Pack into int8 buffer (scale float meta to [-127,127])
states_buf[t, :, :12, :, :] = unpacked.view(B, 12, 8, 8)
states_buf[t, :, 12, :, :] = (meta_t[:, 0] * 127).clamp(-127, 127) \
.to(torch.int8).view(B, 1, 1).expand(B, 8, 8)
states_buf[t, :, 13, :, :] = (meta_t[:, 1] * 127).clamp(0, 127) \
.to(torch.int8).view(B, 1, 1).expand(B, 8, 8)
states_buf[t, :, 13, 0, 1]= (meta_t[:, 2] * 127).clamp(0, 127).to(torch.int8)
active_buf[t] = torch.as_tensor(active, dtype=torch.bool, device=self.device)
# Normalize int8β†’float32 for forward pass
model_input = states_buf[t].to(
dtype=torch.float32, memory_format=torch.channels_last) / 127.0
self.model.eval()
with torch.no_grad(), torch.amp.autocast('cuda'):
logits, values = self.model(model_input)
masks_np, legal_moves_list = get_legal_masks(envs, active)
masks_t = torch.as_tensor(masks_np, dtype=torch.bool, device=self.device)
logits = logits.float()
logits = torch.where(masks_t, logits,
torch.tensor(-60000.0, device=self.device))
no_legal = ~masks_t.any(dim=-1, keepdim=True)
logits.masked_fill_(no_legal, 0.0)
probs = F.softmax(logits, dim=-1)
dist = torch.distributions.Categorical(probs)
actions = dist.sample()
actions_buf[t] = actions.to(torch.int16)
logprobs_buf[t] = dist.log_prob(actions)
values_buf[t] = values.squeeze(-1)
actions_cpu = actions.cpu().numpy()
for b in range(B):
if not active[b]: continue
move_uci = ACTION_MAPPER.idx_to_move[actions_cpu[b]]
move = chess.Move.from_uci(move_uci)
if move not in legal_moves_list[b]:
move = random.choice(legal_moves_list[b])
board = envs[b]
mover_is_white = (board.turn == chess.WHITE)
sign = 1.0 if mover_is_white else -1.0
# ── Reward: pre-push components ─────────────────────
r = -0.003 * sign # time penalty (per-mover, white-perspective)
if board.is_capture(move):
if board.is_en_passant(move):
cap_val = 1.0
else:
cp = board.piece_at(move.to_square)
cap_val = PIECE_VAL.get(cp.piece_type, 0.0) if cp else 0.0
r += sign * (cap_val / 9.0) * 0.3 # [0, 0.3]
total_captures += 1
if move.promotion in (chess.QUEEN, chess.ROOK):
r += sign * 0.15 # aggressive promotion
board.push(move)
game_lengths[b] += 1
# ── Reward: post-push components ────────────────────
if board.is_check():
r += sign * 0.3 # gave check
total_checks += 1
if board.is_game_over():
if board.is_checkmate():
# Mover delivered checkmate
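# speed_bonus = 0.5 * exp(-plies/20): about +0.48 for an immediate mate, ~+0.07 after 40 plies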
speed_bonus = 0.5 * math.exp(-game_lengths[b] / 20.0)
r += sign * (1.0 + speed_bonus) # ~1.0-1.5
if mover_is_white: white_wins += 1
else: black_wins += 1
else:
# Draw (stalemate / 50-move / repetition / insufficient material)
r -= 0.5 # flat penalty from white's perspective β€” attack to WIN
draws_count += 1
dones_buf[t, b] = True
active[b] = False
rewards_buf[t, b] = r
# end per-env loop
# end rollout
# ── PHASE 2: VECTORIZED RETURNS ────────────────────────────────
returns = torch.zeros(B, dtype=torch.float32, device=self.device)
returns_buf = torch.zeros((max_steps, B), dtype=torch.float32, device=self.device)
not_done_f = (~dones_buf).float()
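# Backward recursion over the rollout, vectorized across all B envs:
# G_t = r_t + gamma * (1 - done_t) * G_{t+1}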
for step in reversed(range(max_steps)):
returns = rewards_buf[step] + gamma * returns * not_done_f[step]
returns_buf[step]= returns
# ── PHASE 3: GRPO GROUP-WISE ADVANTAGE NORMALIZATION ───────────
# advantages shape [max_steps, B]
adv_raw = returns_buf - values_buf
active_f = active_buf.float()
# Reshape to [max_steps, num_groups, G] and normalize within each group
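# Per group g, over its active env-steps: A_hat = (A - mean_g) / max(std_g, 1e-8),
# i.e. the group-relative advantage used by GRPO.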
adv_3d = adv_raw.view(max_steps, num_groups, G)
act_3d = active_f.view(max_steps, num_groups, G)
g_count = act_3d.sum(dim=[0, 2]).clamp(min=1.0) # [num_groups]
g_mean = (adv_3d * act_3d).sum(dim=[0, 2]) / g_count # [num_groups]
g_sq_diff = ((adv_3d - g_mean.view(1, num_groups, 1)) ** 2
* act_3d).sum(dim=[0, 2])
g_std = (g_sq_diff / g_count).sqrt().clamp(min=1e-8) # [num_groups]
adv_3d = (adv_3d - g_mean.view(1, num_groups, 1)) / \
g_std.view(1, num_groups, 1)
adv_norm = adv_3d.view(max_steps, B)
# Flatten, filter to active steps only
valid_mask = active_buf.view(-1)
flat_states = (states_buf.view(-1, 14, 8, 8)[valid_mask]
.to(torch.float32, memory_format=torch.channels_last)
.div_(127.0))
flat_actions = actions_buf.view(-1)[valid_mask].to(torch.int64)
flat_old_lp = logprobs_buf.view(-1)[valid_mask]
flat_returns = returns_buf.view(-1)[valid_mask]
flat_advantages = adv_norm.view(-1)[valid_mask]
dataset_size = flat_states.size(0)
if dataset_size < 100:
continue # skip degenerate rollout (all games ended instantly)
# ── PHASE 4: PPO OPTIMIZATION ──────────────────────────────────
self.model.train()
total_p_loss = total_v_loss = 0.0
num_updates = 0
mb_size = CONFIG["mini_batch_size"]
for _ in range(CONFIG["ppo_epochs"]):
perm = torch.randperm(dataset_size, device=self.device)
for start in range(0, dataset_size, mb_size):
mb = perm[start: start + mb_size]
with torch.amp.autocast('cuda'):
new_logits, new_vals = self.model(flat_states[mb])
new_dist = torch.distributions.Categorical(logits=new_logits)
new_lp = new_dist.log_prob(flat_actions[mb])
ratio = torch.exp(new_lp - flat_old_lp[mb])
adv = flat_advantages[mb]
surr1 = ratio * adv
surr2 = torch.clamp(
ratio,
1.0 - CONFIG["clip_epsilon"],
1.0 + CONFIG["clip_epsilon"],
) * adv
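# PPO clipped surrogate: maximize E[min(ratio * A, clip(ratio, 1-eps, 1+eps) * A)]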
p_loss = -torch.min(surr1, surr2).mean()
v_loss = F.mse_loss(new_vals.squeeze(-1), flat_returns[mb])
entropy = new_dist.entropy().mean()
loss = (p_loss
+ CONFIG["value_coef"] * v_loss
- CONFIG["entropy_coef"] * entropy)
self.optimizer.zero_grad(set_to_none=True)
self.scaler.scale(loss).backward()
self.scaler.unscale_(self.optimizer)
nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
self.scaler.step(self.optimizer)
self.scaler.update()
total_p_loss += p_loss.item()
total_v_loss += v_loss.item()
num_updates += 1
# ── PHASE 5: METRICS & LOGGING ────────────────────────────────
done_count = white_wins + black_wins + draws_count
win_rate = white_wins / max(done_count, 1)
draw_rate = draws_count / max(done_count, 1)
active_steps = int(active_buf.sum().item())
check_rate = total_checks / max(active_steps, 1)
capture_rate = total_captures / max(active_steps, 1)
avg_game_len = float(game_lengths.mean())
fps = dataset_size / max(time.time() - iter_start, 1e-3)
if (iteration + 1) % CONFIG["log_interval"] == 0:
vram_alloc = (torch.cuda.memory_allocated() / 1e9
if torch.cuda.is_available() else 0.0)
vram_res = (torch.cuda.memory_reserved() / 1e9
if torch.cuda.is_available() else 0.0)
print(
f"[{iteration+1:05d}] "
f"P:{total_p_loss/max(1,num_updates):.4f} "
f"V:{total_v_loss/max(1,num_updates):.4f} | "
f"W:{win_rate:.3f} D:{draw_rate:.3f} "
f"Chk:{check_rate:.4f} Cap:{capture_rate:.4f} "
f"Len:{avg_game_len:.1f} | "
f"ELO:{self.elo_tracker.elo:.0f} | "
f"FPS:{fps:.0f} | "
f"VRAM:{vram_alloc:.2f}/{vram_res:.2f}GB"
)
with open(self.log_file, "a", newline="") as f:
csv.writer(f).writerow([
iteration + 1,
total_p_loss / max(1, num_updates),
total_v_loss / max(1, num_updates),
flat_returns.mean().item(),
fps, win_rate, draw_rate,
check_rate, capture_rate, avg_game_len,
])
# Save best checkpoint when win_rate improves
if win_rate > self.best_win_rate:
self.best_win_rate = win_rate
self.save_checkpoint(iteration + 1, is_best=True)
if (iteration + 1) % CONFIG["save_interval"] == 0:
self.save_checkpoint(iteration + 1)
self.plot_metrics()
# ELO evaluation
if (iteration + 1) % CONFIG["elo_eval_interval"] == 0:
elo_before = self.elo_tracker.elo
ew, ed, el = self.evaluate_elo(CONFIG["elo_eval_games"])
print(
f" [ELO eval] {elo_before:.0f} β†’ {self.elo_tracker.elo:.0f} | "
f"W:{ew} D:{ed} L:{el} vs random({RANDOM_BASELINE_ELO})"
)
with open(self.elo_log_file, "a", newline="") as f:
csv.writer(f).writerow(
[iteration + 1, self.elo_tracker.elo, ew, ed, el])
self.plot_metrics()
# Aggressive cache reclaim (free fragmented blocks, not pinned allocs)
torch.cuda.empty_cache()
# ── Plotting ───────────────────────────────────────────────────────────────
def plot_metrics(self) -> None:
if not os.path.exists(self.log_file): return
df = pd.read_csv(self.log_file)
if len(df) < 2: return
elo_df = None
if os.path.exists(self.elo_log_file):
elo_df = pd.read_csv(self.elo_log_file)
fig, axs = plt.subplots(3, 2, figsize=(14, 12))
fig.suptitle("Aggressive GRPO Chess Agent — Training Dashboard", fontsize=14)
# Row 0: Losses
axs[0, 0].plot(df['iteration'], df['p_loss'], color='steelblue', linewidth=1.2)
axs[0, 0].set_title('Policy Loss'); axs[0, 0].set_xlabel('Iteration')
axs[0, 1].plot(df['iteration'], df['v_loss'], color='tomato', linewidth=1.2)
axs[0, 1].set_title('Value Loss'); axs[0, 1].set_xlabel('Iteration')
# Row 1: Outcomes
axs[1, 0].plot(df['iteration'], df['win_rate'], label='Win', color='green')
axs[1, 0].plot(df['iteration'], df['draw_rate'], label='Draw', color='orange')
axs[1, 0].set_title('Outcomes (White perspective)')
axs[1, 0].legend(); axs[1, 0].set_xlabel('Iteration')
# Row 1: Attack metrics
axs[1, 1].plot(df['iteration'], df['check_rate'], label='Check/step', color='purple')
axs[1, 1].plot(df['iteration'], df['capture_rate'], label='Capture/step', color='darkorange')
axs[1, 1].set_title('Attack Metrics (↑ = more aggressive)')
axs[1, 1].legend(); axs[1, 1].set_xlabel('Iteration')
# Row 2: ELO Rating
if elo_df is not None and len(elo_df) > 0:
axs[2, 0].plot(elo_df['iteration'], elo_df['elo'],
color='gold', linewidth=2.0, label='Agent ELO')
axs[2, 0].axhline(RANDOM_BASELINE_ELO, linestyle='--',
color='gray', alpha=0.8, label=f'Random ({RANDOM_BASELINE_ELO})')
axs[2, 0].axhline(1200, linestyle=':', color='lightblue',
alpha=0.6, label='Start (1200)')
axs[2, 0].fill_between(elo_df['iteration'], RANDOM_BASELINE_ELO,
elo_df['elo'], alpha=0.15, color='gold')
axs[2, 0].set_title('ELO Rating vs Random Baseline')
axs[2, 0].legend(); axs[2, 0].set_xlabel('Iteration')
else:
axs[2, 0].text(0.5, 0.5, f'ELO eval every {CONFIG["elo_eval_interval"]} iters',
ha='center', va='center', transform=axs[2, 0].transAxes,
color='gray', fontsize=11)
axs[2, 0].set_title('ELO Rating (pending)')
# Row 2: Average game length
axs[2, 1].plot(df['iteration'], df['avg_game_len'], color='teal', linewidth=1.2)
axs[2, 1].set_title('Avg Game Length (↓ = faster checkmates)')
axs[2, 1].set_xlabel('Iteration')
for ax in axs.flat:
ax.grid(True, alpha=0.25)
plt.tight_layout()
out = os.path.join(CONFIG["checkpoint_dir"], "training_performance.png")
plt.savefig(out, dpi=100, bbox_inches='tight')
plt.close(fig)
print(f" [Plot] saved β†’ {out}")
# ── Entry Point ────────────────────────────────────────────────────────────────
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Aggressive GRPO Chess Agent (T4/Colab)")
parser.add_argument("--iterations", type=int, default=10000,
help="Total training iterations")
parser.add_argument("--test-batch", action="store_true",
help="Run 2 iterations for smoke-test")
args, _ = parser.parse_known_args()
torch.manual_seed(CONFIG["seed"])
np.random.seed(CONFIG["seed"])
random.seed(CONFIG["seed"])
# Print VRAM summary at startup
if torch.cuda.is_available():
props = torch.cuda.get_device_properties(0)
print(f"GPU: {props.name} | VRAM: {props.total_memory/1e9:.1f}GB | "
f"SM: {props.multi_processor_count} | "
f"Compute: {props.major}.{props.minor}")
trainer = GRPOTrainer()
trainer.train(2 if args.test_batch else args.iterations)