Spaces:

robell05
/

GptForChess

Sleeping

App Files Files Community

robell05 committed 18 days ago

Commit

6d75857

1 Parent(s): 4c1be1b

serving model

Browse files

Files changed (14) hide show

.gitignore +17 -0
Dockerfile +36 -0
app.py +43 -0
model/policy_model.pt +3 -0
model/tokenizer.pt +3 -0
requirements.txt +28 -0
README.md → src/README.md +0 -0
src/__init__.py +0 -0
src/benchmark.py +204 -0
src/build_datasets.py +697 -0
src/minimax.py +116 -0
src/model.py +327 -0
src/tokenizer.py +104 -0
src/train.py +1319 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,17 @@

+# Virtual environments
+.venv/
+venv/
+env/
+# Python
+__pycache__/
+*.py[cod]
+*.egg-info/
+.pytest_cache/
+# IDE
+.vscode/
+.idea/
+# OS
+.DS_Store

Dockerfile ADDED Viewed

	@@ -0,0 +1,36 @@

+FROM python:3.12.2-slim
+ENV PYTHONDONTWRITEBYTECODE=1 \
+PYTHONUNBUFFERED=1 \
+PIP_NO_CACHE_DIR=1
+# build-essential covers the few deps with C extensions (numpy/pandas wheels
+# are usually prebuilt for 3.12, but this is cheap insurance).
+RUN apt-get update && apt-get install -y --no-install-recommends \
+build-essential \
+&& rm -rf /var/lib/apt/lists/*
+WORKDIR /app
+# Deps first so editing app.py doesn't reinstall torch on every rebuild.
+COPY requirements.txt .
+RUN pip install --extra-index-url https://download.pytorch.org/whl/cpu -r requirements.txt
+# Source next.
+COPY src/ ./src/
+COPY app.py ./
+# Weights last — biggest layer, changes least often, so cache stays warm
+# when you iterate on app.py.
+COPY model/ ./model/
+# torch.load on tokenizer.pt unpickles a `Tokenizer` instance that was
+# saved from the module path `tokenizer` (src/tokenizer.py). Making /app/src
+# importable lets the unpickler find that class.
+ENV PYTHONPATH=/app/src
+# HF Spaces routes external traffic to $PORT; 7860 is the convention.
+# Single worker — the model lives in process memory and multiple workers
+# would multiply the ~1.5 GB RSS.
+EXPOSE 7860
+CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "1"]

app.py ADDED Viewed

	@@ -0,0 +1,43 @@

+from fastapi import FastAPI, HTTPException
+import chess
+from contextlib import asynccontextmanager
+import sys
+from src.model import ChessPolicyModel, PolicyModelInference
+from src.tokenizer import tokenizer
+import torch
+from pydantic import BaseModel
+ml = {}
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    tokenizer = torch.load("./model/tokenizer.pt", weights_only=False, map_location=torch.device('cpu'))
+    model = ChessPolicyModel(vocab_size=tokenizer.language_size)
+    model.load_state_dict(
+        torch.load("./model/policy_model.pt", weights_only=False, map_location=torch.device('cpu'))
+    )
+    ml["inference"] = PolicyModelInference(model, tokenizer, device="cpu")
+    yield
+    ml.clear()
+app = FastAPI(lifespan=lifespan)
+class InferenceRequest(BaseModel):
+    """Request body for POST /inference."""
+    # Moves played so far, replayed in order on a fresh chess.Board via
+    # push_uci — so each entry must be a UCI move string.
+    moves: list[str]
+@app.post("/inference")
+def model_inference(req: InferenceRequest):
+    board = chess.Board()
+    for move in req.moves:
+        try:
+            board.push_uci(move)
+        except ValueError as e:
+            raise HTTPException(status_code=400, detail=f"Incorrect move {move}: {e}")
+    try:
+        return {"move" : ml["inference"](board)}
+    except ValueError as e:
+        raise HTTPException(status_code=500, detail=f"Model Failed to evaluate: {e}")

model/policy_model.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b7c3556e2842ff3f6b20a11b9cc0759a89ca39b2d7cde30b54d8d3ba1bee573a
+size 322812083

model/tokenizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:45887b381da27b8cd119274704fb0b4766125a9218f87643be8260017869471b
+size 57977

requirements.txt ADDED Viewed

	@@ -0,0 +1,28 @@

+annotated-doc==0.0.4
+annotated-types==0.7.0
+anyio==4.13.0
+chess==1.11.2
+click==8.4.0
+fastapi==0.136.1
+filelock==3.29.0
+fsspec==2026.4.0
+h11==0.16.0
+idna==3.15
+Jinja2==3.1.6
+MarkupSafe==3.0.3
+mpmath==1.3.0
+networkx==3.6.1
+numpy==2.4.5
+pandas==3.0.3
+pydantic==2.13.4
+pydantic_core==2.46.4
+python-chess==1.999
+python-dateutil==2.9.0.post0
+setuptools==81.0.0
+six==1.17.0
+starlette==1.0.0
+sympy==1.14.0
+torch==2.12.0
+typing-inspection==0.4.2
+typing_extensions==4.15.0
+uvicorn==0.47.0

README.md → src/README.md RENAMED Viewed

File without changes

src/__init__.py ADDED Viewed

File without changes

src/benchmark.py ADDED Viewed

	@@ -0,0 +1,204 @@

+"""Evaluate trained reward and policy models on held-out test sets.
+Test sets are built by src/build_datasets.py (run once before benchmarking):
+  stockfish_test_*.bin  — 50K Stockfish-labeled positions  (reward model)
+  policy_test_*.bin     — 50K game sequences               (policy model)
+  puzzle_test_*.bin     — 100K puzzle sequences            (puzzle solve rate)
+Metrics:
+  Reward:  MSE, MAE, Pearson r
+  Policy:  loss, perplexity, top-1 move accuracy, top-5 move accuracy
+  Puzzle:  first-move solve rate (top-1), all-moves solve rate
+Usage:
+  arch -arm64 poetry run python src/benchmark.py
+  arch -arm64 poetry run python src/benchmark.py \\
+    --reward-model reward_model.pt \\
+    --policy-model policy_model.pt \\
+    --data-dir data/ \\
+    --batch-size 512
+"""
+import argparse
+import time
+from pathlib import Path
+import torch
+from torch.utils.data import DataLoader
+from model import ChessRewardModel, ChessPolicyModel, PAD_TOKEN
+from train import (
+    ChessPositionDataset,
+    ChessPolicyDataset,
+    collate_fn_memmap,
+    collate_fn_policy,
+    eval_reward,
+    eval_policy,
+    eval_puzzle_solve_rate,
+)
+def _fmt(v: float, pct: bool = False) -> str:
+    return f"{v * 100:.2f}%" if pct else f"{v:.4f}"
+def run_benchmark(
+    data_dir: Path,
+    reward_model_path: str | None,
+    policy_model_path: str | None,
+    batch_size: int = 512,
+    num_workers: int = 4,
+    device: str | None = None,
+) -> dict:
+    """Run all available benchmarks and return a results dict.
+    Returns a dict with keys 'reward', 'policy', 'puzzle' (each a sub-dict of
+    metrics), or omits a key if the test set / model is unavailable.
+    Args:
+        data_dir: directory holding tokenizer.pt and the *_test_* memmap files.
+        reward_model_path: reward checkpoint; the reward eval is skipped when
+            this is falsy or the file does not exist.
+        policy_model_path: policy checkpoint; policy and puzzle evals are
+            skipped when this is falsy or the file does not exist.
+        batch_size: DataLoader batch size used for every eval.
+        num_workers: DataLoader worker count used for every eval.
+        device: torch device string; None picks "cuda" when available, else "cpu".
+    Raises:
+        FileNotFoundError: if data_dir / "tokenizer.pt" does not exist.
+    """
+    if device is None:
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+    print(f"Benchmark device: {device}")
+    tokenizer_path = data_dir / "tokenizer.pt"
+    if not tokenizer_path.exists():
+        raise FileNotFoundError(f"Tokenizer not found at {tokenizer_path}")
+    # weights_only=False: the tokenizer is loaded as a pickled object (its
+    # attributes are read below), not as a tensor state dict.
+    tokenizer = torch.load(tokenizer_path, weights_only=False)
+    vocab_size = tokenizer.language_size
+    pad_id = tokenizer.symbol_to_token[PAD_TOKEN]
+    results = {}
+    # ── Reward model ──────────────────────────────────────────────────────────
+    # Runs only when both the checkpoint and the test-set meta file exist; the
+    # elif branches below report which of the two was missing.
+    reward_test_meta = data_dir / "stockfish_test_meta.pt"
+    if reward_model_path and Path(reward_model_path).exists() and reward_test_meta.exists():
+        print(f"\nLoading reward model from {reward_model_path}...")
+        reward_model = ChessRewardModel(vocab_size=vocab_size).to(device)
+        reward_model.load_state_dict(torch.load(reward_model_path, map_location=device, weights_only=True))
+        print("Loading reward test set...")
+        reward_test_ds = ChessPositionDataset.from_memmap(data_dir, "stockfish_test", tokenizer)
+        reward_test_loader = DataLoader(
+            reward_test_ds, batch_size=batch_size, shuffle=False,
+            collate_fn=collate_fn_memmap, num_workers=num_workers, pin_memory=True,
+        )
+        print(f"  {len(reward_test_ds):,} test positions")
+        t0 = time.time()
+        m = eval_reward(reward_model, reward_test_loader, device)
+        print(
+            f"  Reward  |  MSE={_fmt(m['mse'])}  MAE={_fmt(m['mae'])}"
+            f"  Pearson r={_fmt(m['pearson_r'])}  ({time.time()-t0:.1f}s)"
+        )
+        results["reward"] = m
+    elif not reward_test_meta.exists():
+        print("\nReward test set not found (run build_datasets.py to create it). Skipping.")
+    elif not reward_model_path or not Path(reward_model_path).exists():
+        print(f"\nReward model not found at {reward_model_path}. Skipping.")
+    # ── Policy model ──────────────────────────────────────────────────────────
+    # The policy checkpoint is loaded once and shared by the policy eval and
+    # the puzzle eval below.
+    policy_test_meta = data_dir / "policy_test_meta.pt"
+    puzzle_test_meta = data_dir / "puzzle_test_meta.pt"
+    policy_model = None
+    if policy_model_path and Path(policy_model_path).exists():
+        print(f"\nLoading policy model from {policy_model_path}...")
+        policy_model = ChessPolicyModel(vocab_size=vocab_size).to(device)
+        policy_model.load_state_dict(torch.load(policy_model_path, map_location=device, weights_only=True))
+    if policy_model is not None and policy_test_meta.exists():
+        print("Loading policy test set...")
+        policy_test_ds = ChessPolicyDataset.from_memmap(data_dir, tokenizer, name="policy_test")
+        policy_test_loader = DataLoader(
+            policy_test_ds, batch_size=batch_size, shuffle=False,
+            collate_fn=collate_fn_policy, num_workers=num_workers, pin_memory=True,
+        )
+        print(f"  {len(policy_test_ds):,} test sequences")
+        t0 = time.time()
+        m = eval_policy(policy_model, policy_test_loader, device, pad_id)
+        print(
+            f"  Policy  |  loss={_fmt(m['loss'])}  ppl={m['perplexity']:.2f}"
+            f"  top1={_fmt(m['top1_acc'], pct=True)}  top5={_fmt(m['top5_acc'], pct=True)}"
+            f"  ({time.time()-t0:.1f}s)"
+        )
+        results["policy"] = m
+    elif policy_model is None:
+        print(f"\nPolicy model not found at {policy_model_path}. Skipping policy + puzzle eval.")
+    elif not policy_test_meta.exists():
+        print("\nPolicy test set not found (run build_datasets.py to create it). Skipping.")
+    # Puzzle eval reuses the policy model; the missing-model case was already
+    # reported above, so only a missing puzzle test set is reported here.
+    if policy_model is not None and puzzle_test_meta.exists():
+        print("Loading puzzle test set...")
+        puzzle_test_ds = ChessPolicyDataset.from_memmap(data_dir, tokenizer, name="puzzle_test")
+        puzzle_test_loader = DataLoader(
+            puzzle_test_ds, batch_size=batch_size, shuffle=False,
+            collate_fn=collate_fn_policy, num_workers=num_workers, pin_memory=True,
+        )
+        print(f"  {len(puzzle_test_ds):,} test puzzles")
+        t0 = time.time()
+        m = eval_puzzle_solve_rate(policy_model, puzzle_test_loader, device, pad_id)
+        print(
+            f"  Puzzle  |  first_move={_fmt(m['first_move_solve_rate'], pct=True)}"
+            f"  all_moves={_fmt(m['all_moves_solve_rate'], pct=True)}"
+            f"  ({time.time()-t0:.1f}s)"
+        )
+        results["puzzle"] = m
+    elif policy_model is not None and not puzzle_test_meta.exists():
+        print("\nPuzzle test set not found (run build_datasets.py to create it). Skipping.")
+    return results
+def _print_summary(results: dict) -> None:
+    print("\n" + "=" * 60)
+    print("BENCHMARK SUMMARY")
+    print("=" * 60)
+    if "reward" in results:
+        m = results["reward"]
+        print(f"  Reward model  MSE={m['mse']:.4f}  MAE={m['mae']:.4f}  Pearson r={m['pearson_r']:.4f}")
+    if "policy" in results:
+        m = results["policy"]
+        print(
+            f"  Policy model  loss={m['loss']:.4f}  perplexity={m['perplexity']:.2f}"
+            f"  top-1={m['top1_acc']*100:.2f}%  top-5={m['top5_acc']*100:.2f}%"
+        )
+    if "puzzle" in results:
+        m = results["puzzle"]
+        print(
+            f"  Puzzle eval   first-move solve={m['first_move_solve_rate']*100:.2f}%"
+            f"  all-moves solve={m['all_moves_solve_rate']*100:.2f}%"
+        )
+    if not results:
+        print("  No results — check that models and test sets exist.")
+    print("=" * 60)
+def _build_argparser() -> argparse.ArgumentParser:
+    p = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
+    p.add_argument("--reward-model", default="reward_model.pt",
+        help="Path to reward_model.pt (default: reward_model.pt)")
+    p.add_argument("--policy-model", default="policy_model.pt",
+        help="Path to policy_model.pt (default: policy_model.pt)")
+    p.add_argument("--data-dir", type=Path, default=Path("data"),
+        help="Directory containing test set .bin files and tokenizer.pt (default: data/)")
+    p.add_argument("--batch-size", type=int, default=512,
+        help="Batch size for evaluation (default: 512)")
+    p.add_argument("--num-workers", type=int, default=4,
+        help="DataLoader worker count (default: 4)")
+    p.add_argument("--device", default=None,
+        help="Device override, e.g. 'cpu' or 'cuda:0' (default: auto-detect)")
+    return p
+# CLI entry point: parse the flags, run every benchmark whose model and test
+# set are available, then print the recap.
+if __name__ == "__main__":
+    args = _build_argparser().parse_args()
+    results = run_benchmark(
+        data_dir=args.data_dir,
+        reward_model_path=args.reward_model,
+        policy_model_path=args.policy_model,
+        batch_size=args.batch_size,
+        num_workers=args.num_workers,
+        device=args.device,
+    )
+    _print_summary(results)

src/build_datasets.py ADDED Viewed

	@@ -0,0 +1,697 @@

+"""Build the training datasets for hybrid two-phase training.
+Runs resumable stages, including:
+  1. Stream the Lichess HF dataset, filter by Elo + Termination, save two
+     raw-game subsets (movetext + Result only). The subsets are filled
+     independently, so a game that meets both Elo thresholds is stored in both.
+  2. Build the shared tokenizer (from all UCI moves) and generate
+     outcome-labeled samples ({+1, 0, -1} from game Result) from the outcome subset.
+  3. Run parallel Stockfish on the stockfish subset to produce precisely
+     labeled samples (tanh(cp/400)).
+  4. Tokenize the outcome-subset games into full-game policy sequences.
+A further step builds held-out test splits from the existing memmaps.
+Each stage skips if its output files already exist. Use --force to re-run.
+Outputs (under --out-dir):
+  games_outcome.pt       raw outcome-subset games
+  games_stockfish.pt     raw stockfish-subset games
+  tokenizer.pt           shared Tokenizer
+  outcome_{tokens,labels,lengths}.bin + outcome_meta.pt       outcome-labeled samples (memmap)
+  stockfish_{tokens,labels,lengths}.bin + stockfish_meta.pt   Stockfish-labeled samples (memmap)
+  policy_{tokens,lengths}.bin + policy_meta.pt                policy sequences (memmap)
+"""
+import argparse
+import random
+from pathlib import Path
+import chess
+import numpy as np
+import torch
+from datasets import load_dataset
+from tqdm import tqdm
+from model import CLS_TOKEN
+from train import (
+    build_tokenizer_from_games,
+    generate_samples_stockfish_parallel,
+    parse_movetext,
+)
+# Lichess Result → outcome label from white's perspective.
+RESULT_TO_LABEL = {"1-0": 1.0, "0-1": -1.0, "1/2-1/2": 0.0}
+def _save_as_memmap(
+    samples: list[tuple[list[int], float]], out_dir: Path, name: str, max_seq_len: int = 128
+) -> None:
+    """Save samples as memory-mapped arrays for fast DataLoader access.
+    Sequences longer than max_seq_len are truncated (keeps the most recent tokens,
+    since the CLS token is at position 0 we keep ids[:max_seq_len]).
+    Produces three files:
+      {name}_tokens.bin   — (N, max_seq_len) int32, zero-padded
+      {name}_labels.bin   — (N,) float32
+      {name}_lengths.bin  — (N,) int32, actual sequence length per sample (capped at max_seq_len)
+      {name}_meta.pt      — dict with 'n' and 'max_len'
+    """
+    n = len(samples)
+    max_len = min(max(len(ids) for ids, _ in samples), max_seq_len)
+    print(f"  memmap {name}: {n:,} samples, max_seq_len={max_len}")
+    tokens = np.memmap(out_dir / f"{name}_tokens.bin", dtype=np.int32, mode="w+", shape=(n, max_len))
+    labels = np.memmap(out_dir / f"{name}_labels.bin", dtype=np.float32, mode="w+", shape=(n,))
+    lengths = np.memmap(out_dir / f"{name}_lengths.bin", dtype=np.int32, mode="w+", shape=(n,))
+    for i, (ids, label) in enumerate(tqdm(samples, desc=f"  writing {name}", unit="sample")):
+        ids = ids[:max_len]
+        l = len(ids)
+        tokens[i, :l] = ids
+        labels[i] = label
+        lengths[i] = l
+    tokens.flush()
+    labels.flush()
+    lengths.flush()
+    torch.save({"n": n, "max_len": max_len}, out_dir / f"{name}_meta.pt")
+    size_gb = (tokens.nbytes + labels.nbytes + lengths.nbytes) / 1024 ** 3
+    print(f"  memmap {name} saved ({size_gb:.2f} GB)")
+def stage1_collect_games(args: argparse.Namespace) -> None:
+    """Stage 1: stream Lichess games and save the raw policy / reward game lists.
+    Writes games_outcome.pt (the "policy" list) and, unless --policy-only is
+    set, games_stockfish.pt (the "reward" list). Each saved game is a dict with
+    only the 'movetext' and 'Result' fields. Skips when the outputs already
+    exist and --force is not given.
+    NOTE(review): the two lists are filled by independent `if` checks, so a game
+    that satisfies both Elo thresholds is appended to both lists — the subsets
+    are not disjoint. Confirm whether that overlap is intended.
+    """
+    policy_games_path = args.out_dir / "games_outcome.pt"
+    reward_games_path = args.out_dir / "games_stockfish.pt"
+    # getattr with a default: callers may pass a Namespace without this flag.
+    policy_only = getattr(args, "policy_only", False)
+    if policy_only:
+        # Reward subset is irrelevant when we're only training the policy model.
+        # Skip-condition checks only the policy artifact.
+        if policy_games_path.exists() and not args.force:
+            print(f"Stage 1: skipping — {policy_games_path.name} exists (--policy-only).")
+            return
+    else:
+        if policy_games_path.exists() and reward_games_path.exists() and not args.force:
+            print(f"Stage 1: skipping — {policy_games_path.name} and {reward_games_path.name} exist.")
+            return
+    if policy_only:
+        lower_elo = args.policy_min_elo
+        print(
+            f"Stage 1: streaming Lichess/standard-chess-games (Termination == 'Normal'), "
+            f"policy Elo >= {args.policy_min_elo} (target {args.policy_games:,}). "
+            f"Reward subset skipped (--policy-only)."
+        )
+    else:
+        lower_elo = min(args.reward_min_elo, args.policy_min_elo)
+        print(
+            f"Stage 1: streaming Lichess/standard-chess-games (Termination == 'Normal'), "
+            f"reward Elo >= {args.reward_min_elo} (target {args.reward_games:,}), "
+            f"policy Elo >= {args.policy_min_elo} (target {args.policy_games:,})..."
+        )
+    ds = load_dataset("Lichess/standard-chess-games", split="train", streaming=True)
+    # Pre-filter by the lower of the two thresholds to skip clearly ineligible games.
+    ds = ds.filter(
+        lambda r: (
+            r.get("WhiteElo") is not None
+            and r.get("BlackElo") is not None
+            and r["WhiteElo"] >= lower_elo
+            and r["BlackElo"] >= lower_elo
+            and r.get("Termination") == "Normal"
+        )
+    )
+    policy_games: list[dict] = []
+    reward_games: list[dict] = []
+    keep_keys = ("movetext", "Result")
+    for row in tqdm(ds, desc="Stage 1: streaming", unit="game"):
+        white_elo = row.get("WhiteElo", 0)
+        black_elo = row.get("BlackElo", 0)
+        # Keep only the fields later stages read, to bound the saved file size.
+        minimal = {k: row.get(k) for k in keep_keys}
+        if (
+            not policy_only
+            and len(reward_games) < args.reward_games
+            and white_elo >= args.reward_min_elo
+            and black_elo >= args.reward_min_elo
+        ):
+            reward_games.append(minimal)
+        # Independent of the check above: the same dict can land in both lists.
+        if len(policy_games) < args.policy_games and white_elo >= args.policy_min_elo and black_elo >= args.policy_min_elo:
+            policy_games.append(minimal)
+        # Stop streaming as soon as every requested list has reached its target.
+        if policy_only:
+            if len(policy_games) >= args.policy_games:
+                break
+        elif len(reward_games) >= args.reward_games and len(policy_games) >= args.policy_games:
+            break
+    # Reaching this point short of a target means the stream ran out of games.
+    if policy_only:
+        if len(policy_games) < args.policy_games:
+            print(
+                f"  WARNING: dataset exhausted before target — "
+                f"got {len(policy_games):,} policy games."
+            )
+    elif len(reward_games) < args.reward_games or len(policy_games) < args.policy_games:
+        print(
+            f"  WARNING: dataset exhausted before target — "
+            f"got {len(reward_games):,} reward + {len(policy_games):,} policy games."
+        )
+    if not policy_only:
+        print(f"Stage 1: saving {reward_games_path} ({len(reward_games):,} games)...")
+        torch.save(reward_games, reward_games_path)
+    print(f"Stage 1: saving {policy_games_path} ({len(policy_games):,} games)...")
+    torch.save(policy_games, policy_games_path)
+def _generate_outcome_samples(games, tokenizer, max_positions_per_game, skip_ply):
+    """Build (token_ids, outcome_label) samples for the phase-1 dataset."""
+    cls_id = tokenizer.symbol_to_token[CLS_TOKEN]
+    samples: list[tuple[list[int], float]] = []
+    with tqdm(games, desc="Stage 2: outcome samples", unit="game") as pbar:
+        for idx, game in enumerate(pbar):
+            result = game.get("Result")
+            if result not in RESULT_TO_LABEL:
+                continue
+            label = RESULT_TO_LABEL[result]
+            movetext = game.get("movetext", "")
+            if not movetext:
+                continue
+            move_sans = parse_movetext(movetext)
+            if len(move_sans) < max(2, skip_ply + 1):
+                continue
+            eligible = list(range(skip_ply, len(move_sans)))
+            num_positions = min(max_positions_per_game, len(eligible))
+            rng = random.Random(idx)
+            sample_indices = set(rng.sample(eligible, num_positions))
+            board = chess.Board()
+            valid_moves: list[str] = []
+            for i, san in enumerate(move_sans):
+                try:
+                    move = board.parse_san(san)
+                    board.push(move)
+                    valid_moves.append(move.uci())
+                except (chess.InvalidMoveError, chess.AmbiguousMoveError):
+                    break
+                if i in sample_indices:
+                    token_ids = [cls_id] + tokenizer.encode_moves(valid_moves)
+                    samples.append((token_ids, label))
+            if (idx + 1) % 50_000 == 0:
+                pbar.set_postfix(samples=f"{len(samples):,}")
+    return samples
+def stage2_outcome_samples(args: argparse.Namespace) -> None:
+    """Stage 2: build and save the tokenizer, then write the outcome-labeled memmap.
+    Reads games_outcome.pt from args.out_dir, writes tokenizer.pt and the
+    outcome_* memmap files. Skips when both tokenizer.pt and outcome_meta.pt
+    exist and --force is not given.
+    """
+    tokenizer_path = args.out_dir / "tokenizer.pt"
+    meta_path = args.out_dir / "outcome_meta.pt"
+    if tokenizer_path.exists() and meta_path.exists() and not args.force:
+        print(f"Stage 2: skipping — {tokenizer_path.name} and {meta_path.name} exist.")
+        return
+    raw_games_path = args.out_dir / "games_outcome.pt"
+    print(f"Stage 2: loading outcome games from {raw_games_path}...")
+    # weights_only=False: the file holds a pickled list of dicts, not tensors.
+    games = torch.load(raw_games_path, weights_only=False)
+    print("Stage 2: building tokenizer from all UCI moves...")
+    # NOTE(review): called with no arguments even though `games` was just
+    # loaded — confirm the helper builds its vocabulary without the games.
+    tokenizer = build_tokenizer_from_games()
+    print(f"Stage 2: tokenizer vocab size = {tokenizer.language_size}")
+    torch.save(tokenizer, tokenizer_path)
+    print("Stage 2: generating outcome samples (up to 20 per game)...")
+    samples = _generate_outcome_samples(
+        games,
+        tokenizer,
+        max_positions_per_game=20,
+        skip_ply=0,
+    )
+    print(f"Stage 2: saving {len(samples):,} outcome samples as memmap...")
+    _save_as_memmap(samples, args.out_dir, "outcome", max_seq_len=args.max_seq_len)
+def _generate_policy_sequences(games, tokenizer, max_seq_len: int = 128) -> list[list[int]]:
+    """Tokenize full game sequences for policy training.
+    Each output sequence is [CLS, m1, m2, ..., mN], truncated to max_seq_len.
+    Games with fewer than 2 valid UCI moves are skipped.
+    """
+    cls_id = tokenizer.symbol_to_token[CLS_TOKEN]
+    sequences: list[list[int]] = []
+    with tqdm(games, desc="Stage 4: policy sequences", unit="game") as pbar:
+        for game in pbar:
+            movetext = game.get("movetext", "")
+            if not movetext:
+                continue
+            move_sans = parse_movetext(movetext)
+            if len(move_sans) < 2:
+                continue
+            board = chess.Board()
+            move_ucis: list[str] = []
+            for san in move_sans:
+                try:
+                    move = board.parse_san(san)
+                    board.push(move)
+                    move_ucis.append(move.uci())
+                except (chess.InvalidMoveError, chess.AmbiguousMoveError):
+                    break
+            if len(move_ucis) < 2:
+                continue
+            move_ucis = move_ucis[:max_seq_len - 1]
+            sequences.append([cls_id] + tokenizer.encode_moves(move_ucis))
+    return sequences
+def _save_policy_memmap(
+    sequences: list[list[int]], out_dir: Path, name: str, max_seq_len: int = 128,
+    fens: list[str] | None = None, fen_len: int = 100,
+) -> None:
+    """Save policy sequences as memory-mapped arrays (no labels).
+    Produces:
+      {name}_tokens.bin   — (N, max_len) int32, zero-padded
+      {name}_lengths.bin  — (N,) int32, actual sequence length per sample
+      {name}_meta.pt      — dict with 'n', 'max_len', and (if fens given) 'fen_len'
+    If `fens` is provided, also writes {name}_fens.bin — (N, fen_len) uint8
+    holding zero-padded ASCII FEN strings, one per sample. Used by the
+    CNN-conditioned policy training to reconstruct each sample's starting board.
+    """
+    n = len(sequences)
+    max_len = min(max(len(s) for s in sequences), max_seq_len)
+    print(f"  memmap {name}: {n:,} sequences, max_seq_len={max_len}")
+    tokens = np.memmap(out_dir / f"{name}_tokens.bin", dtype=np.int32, mode="w+", shape=(n, max_len))
+    lengths = np.memmap(out_dir / f"{name}_lengths.bin", dtype=np.int32, mode="w+", shape=(n,))
+    for i, seq in enumerate(tqdm(sequences, desc=f"  writing {name}", unit="seq")):
+        seq = seq[:max_len]
+        l = len(seq)
+        tokens[i, :l] = seq
+        lengths[i] = l
+    tokens.flush()
+    lengths.flush()
+    meta = {"n": n, "max_len": max_len}
+    extra_bytes = 0
+    if fens is not None:
+        assert len(fens) == n, f"fen count {len(fens)} mismatch with sequence count {n}"
+        fens_mm = np.memmap(out_dir / f"{name}_fens.bin", dtype=np.uint8, mode="w+", shape=(n, fen_len))
+        for i, fen in enumerate(fens):
+            b = fen.encode("ascii")[:fen_len]
+            fens_mm[i, :len(b)] = list(b)
+        fens_mm.flush()
+        meta["fen_len"] = fen_len
+        extra_bytes = fens_mm.nbytes
+    torch.save(meta, out_dir / f"{name}_meta.pt")
+    size_gb = (tokens.nbytes + lengths.nbytes + extra_bytes) / 1024 ** 3
+    print(f"  memmap {name} saved ({size_gb:.3f} GB)")
+def _process_puzzle(
+    row: dict,
+    tokenizer_symbol_map: dict,
+    cls_id: int,
+) -> tuple[list[int], str] | None:
+    """Parse one Lichess puzzle row into a (token_sequence, FEN) pair.
+    Sequence layout: [CLS, setup_move, solver_move1, opp_response, solver_move2, ...]
+    The setup move (Moves[0]) is included as context so the model conditions on it
+    when predicting the solution. During training the loss on the setup move position
+    is masked out — we model P[m_n | S, m_{<n}] where S is the setup move.
+    The FEN is the puzzle's starting board position. It is persisted alongside the
+    token sequence so the CNN-conditioned policy training can reconstruct the
+    starting board planes (CNN's input) at __getitem__ time.
+    Returns None if any move is illegal, unknown to the tokenizer, or the sequence
+    has fewer than 3 tokens (CLS + setup + at least one solver move).
+    Args:
+        row: puzzle record; the 'FEN' and 'Moves' (space-separated UCI) keys are read.
+        tokenizer_symbol_map: mapping from UCI move string to token id.
+        cls_id: token id placed at the start of every sequence.
+    """
+    fen = row.get("FEN", "")
+    moves_str = row.get("Moves", "")
+    if not fen or not moves_str:
+        return None
+    uci_moves = moves_str.strip().split()
+    if len(uci_moves) < 2:  # need setup + at least one solver move
+        return None
+    try:
+        board = chess.Board(fen)
+    except ValueError:
+        return None
+    # Tokenize all moves: setup first (as context), then the full solution.
+    token_ids: list[int] = [cls_id]
+    for uci in uci_moves:
+        try:
+            move = chess.Move.from_uci(uci)
+        except ValueError:
+            return None
+        # Legality is checked against the evolving position, so one bad move
+        # rejects the whole puzzle.
+        if move not in board.legal_moves:
+            return None
+        if uci not in tokenizer_symbol_map:
+            return None
+        token_ids.append(tokenizer_symbol_map[uci])
+        board.push(move)
+    # Redundant after the len(uci_moves) < 2 guard above (every move either
+    # appends a token or returns None); kept as a defensive check.
+    if len(token_ids) < 3:  # CLS + setup + at least one solver move
+        return None
+    return token_ids, fen
+def stage3_stockfish_samples(args: argparse.Namespace) -> None:
+    meta_path = args.out_dir / "stockfish_meta.pt"
+    if meta_path.exists() and not args.force:
+        print(f"Stage 3: skipping — {meta_path.name} exists.")
+        return
+    games_path = args.out_dir / "games_stockfish.pt"
+    tokenizer_path = args.out_dir / "tokenizer.pt"
+    print(f"Stage 3: loading {games_path} and {tokenizer_path}...")
+    games = torch.load(games_path, weights_only=False)
+    tokenizer = torch.load(tokenizer_path, weights_only=False)
+    print(
+        f"Stage 3: running parallel Stockfish ({args.workers} workers, "
+        f"depth {args.stockfish_depth}) on {len(games):,} games..."
+    )
+    samples = generate_samples_stockfish_parallel(
+        games,
+        tokenizer,
+        num_workers=args.workers,
+        stockfish_depth=args.stockfish_depth,
+        sample_rate=args.sample_rate,
+        skew_exponent=args.position_skew,
+    )
+    print(f"Stage 3: saving {len(samples):,} stockfish samples as memmap...")
+    _save_as_memmap(samples, args.out_dir, "stockfish", max_seq_len=args.max_seq_len)
+def stage4_policy_sequences(args: argparse.Namespace) -> None:
+    meta_path = args.out_dir / "policy_meta.pt"
+    if meta_path.exists() and not args.force:
+        print(f"Stage 4: skipping — {meta_path.name} exists.")
+        return
+    games_path = args.out_dir / "games_outcome.pt"
+    tokenizer_path = args.out_dir / "tokenizer.pt"
+    print(f"Stage 4: loading {games_path} and {tokenizer_path}...")
+    games = torch.load(games_path, weights_only=False)
+    tokenizer = torch.load(tokenizer_path, weights_only=False)
+    print(f"Stage 4: tokenizing {len(games):,} games into policy sequences...")
+    sequences = _generate_policy_sequences(games, tokenizer, max_seq_len=args.max_seq_len)
+    print(f"Stage 4: saving {len(sequences):,} policy sequences as memmap...")
+    _save_policy_memmap(sequences, args.out_dir, "policy", max_seq_len=args.max_seq_len)
+def _write_test_subset_reward(out_dir: Path, src_name: str, dst_name: str, indices: np.ndarray) -> None:
+    """Write a subset of a reward memmap (tokens+labels+lengths) to new files."""
+    meta = torch.load(out_dir / f"{src_name}_meta.pt", weights_only=True)
+    n_src, max_len = meta["n"], meta["max_len"]
+    src_tokens = np.memmap(out_dir / f"{src_name}_tokens.bin", dtype=np.int32, mode="r", shape=(n_src, max_len))
+    src_labels = np.memmap(out_dir / f"{src_name}_labels.bin", dtype=np.float32, mode="r", shape=(n_src,))
+    src_lengths = np.memmap(out_dir / f"{src_name}_lengths.bin", dtype=np.int32, mode="r", shape=(n_src,))
+    n_test = len(indices)
+    dst_tokens = np.memmap(out_dir / f"{dst_name}_tokens.bin", dtype=np.int32, mode="w+", shape=(n_test, max_len))
+    dst_labels = np.memmap(out_dir / f"{dst_name}_labels.bin", dtype=np.float32, mode="w+", shape=(n_test,))
+    dst_lengths = np.memmap(out_dir / f"{dst_name}_lengths.bin", dtype=np.int32, mode="w+", shape=(n_test,))
+    for i, idx in enumerate(tqdm(indices, desc=f"  writing {dst_name}", unit="sample")):
+        dst_tokens[i] = src_tokens[idx]
+        dst_labels[i] = src_labels[idx]
+        dst_lengths[i] = src_lengths[idx]
+    dst_tokens.flush()
+    dst_labels.flush()
+    dst_lengths.flush()
+    torch.save({"n": n_test, "max_len": max_len}, out_dir / f"{dst_name}_meta.pt")
+    print(f"  {dst_name}: {n_test:,} samples written")
+def _write_test_subset_policy(out_dir: Path, src_name: str, dst_name: str, indices: np.ndarray) -> None:
+    """Write a subset of a policy memmap (tokens+lengths, no labels) to new files."""
+    meta = torch.load(out_dir / f"{src_name}_meta.pt", weights_only=True)
+    n_src, max_len = meta["n"], meta["max_len"]
+    src_tokens = np.memmap(out_dir / f"{src_name}_tokens.bin", dtype=np.int32, mode="r", shape=(n_src, max_len))
+    src_lengths = np.memmap(out_dir / f"{src_name}_lengths.bin", dtype=np.int32, mode="r", shape=(n_src,))
+    n_test = len(indices)
+    dst_tokens = np.memmap(out_dir / f"{dst_name}_tokens.bin", dtype=np.int32, mode="w+", shape=(n_test, max_len))
+    dst_lengths = np.memmap(out_dir / f"{dst_name}_lengths.bin", dtype=np.int32, mode="w+", shape=(n_test,))
+    for i, idx in enumerate(tqdm(indices, desc=f"  writing {dst_name}", unit="seq")):
+        dst_tokens[i] = src_tokens[idx]
+        dst_lengths[i] = src_lengths[idx]
+    dst_tokens.flush()
+    dst_lengths.flush()
+    torch.save({"n": n_test, "max_len": max_len}, out_dir / f"{dst_name}_meta.pt")
+    print(f"  {dst_name}: {n_test:,} sequences written")
+def stage_build_test_splits(args: argparse.Namespace, out_dir: Path) -> None:
+    """Build held-out test sets for reward and policy models from existing memmaps.
+    Uses a fixed random seed (42) so the same indices are always selected.
+    Saves the chosen indices to {name}_test_indices.npy so the corresponding
+    training memmap loader can exclude them — making train and test disjoint
+    even though they share the underlying .bin file.
+    Produces:
+      stockfish_test_*.bin / stockfish_test_meta.pt / stockfish_test_indices.npy
+      policy_test_*.bin    / policy_test_meta.pt    / policy_test_indices.npy
+    """
+    rng = np.random.default_rng(42)
+    policy_only = getattr(args, "policy_only", False)
+    # Reward test set — skipped when --policy-only since no Stockfish data exists.
+    reward_test_meta = out_dir / "stockfish_test_meta.pt"
+    sf_meta_path = out_dir / "stockfish_meta.pt"
+    if policy_only:
+        print("Test splits: stockfish_test skipped (--policy-only).")
+    elif (not reward_test_meta.exists() or args.force) and sf_meta_path.exists():
+        print(f"Test splits: building stockfish_test ({args.reward_test_size:,} samples)...")
+        sf_meta = torch.load(sf_meta_path, weights_only=True)
+        n = sf_meta["n"]
+        test_n = min(args.reward_test_size, n)
+        idx = rng.choice(n, size=test_n, replace=False)
+        idx.sort()
+        _write_test_subset_reward(out_dir, "stockfish", "stockfish_test", idx)
+        np.save(out_dir / "stockfish_test_indices.npy", idx)
+        print(f"  saved stockfish_test_indices.npy ({test_n:,} indices excluded from training)")
+    elif reward_test_meta.exists():
+        print("Test splits: stockfish_test already exists, skipping.")
+    # Policy test set
+    policy_test_meta = out_dir / "policy_test_meta.pt"
+    pol_meta_path = out_dir / "policy_meta.pt"
+    if (not policy_test_meta.exists() or args.force) and pol_meta_path.exists():
+        print(f"Test splits: building policy_test ({args.policy_test_size:,} sequences)...")
+        pol_meta = torch.load(pol_meta_path, weights_only=True)
+        n = pol_meta["n"]
+        test_n = min(args.policy_test_size, n)
+        idx = rng.choice(n, size=test_n, replace=False)
+        idx.sort()
+        _write_test_subset_policy(out_dir, "policy", "policy_test", idx)
+        np.save(out_dir / "policy_test_indices.npy", idx)
+        print(f"  saved policy_test_indices.npy ({test_n:,} indices excluded from training)")
+    elif policy_test_meta.exists():
+        print("Test splits: policy_test already exists, skipping.")
+def stage5_puzzle_samples(args: argparse.Namespace, tokenizer, out_dir: Path) -> None:
+    train_done = (out_dir / "puzzle_meta.pt").exists()
+    test_done = (out_dir / "puzzle_test_meta.pt").exists()
+    if train_done and test_done and not args.force:
+        print("Stage 5: skipping — puzzle_meta.pt and puzzle_test_meta.pt exist.")
+        return
+    print("Stage 5: loading Lichess/chess-puzzles from HuggingFace...")
+    ds = load_dataset("Lichess/chess-puzzles", split="train", streaming=True)
+    min_pop = args.min_puzzle_popularity
+    min_plays = args.min_puzzle_plays
+    cls_id = tokenizer.symbol_to_token[CLS_TOKEN]
+    sym_map = tokenizer.symbol_to_token
+    test_seqs: list[list[int]] = []
+    test_fens: list[str] = []
+    train_seqs: list[list[int]] = []
+    train_fens: list[str] = []
+    skipped = 0
+    test_target = args.puzzle_test_size
+    train_target = args.puzzle_count
+    with tqdm(ds, desc="Stage 5: puzzles", unit="puzzle") as pbar:
+        for row in pbar:
+            if min_pop is not None and row.get("Popularity", 0) < min_pop:
+                continue
+            if min_plays is not None and row.get("NbPlays", 0) < min_plays:
+                continue
+            result = _process_puzzle(row, sym_map, cls_id)
+            if result is None:
+                skipped += 1
+                continue
+            seq, fen = result
+            if len(test_seqs) < test_target:
+                test_seqs.append(seq)
+                test_fens.append(fen)
+            else:
+                train_seqs.append(seq)
+                train_fens.append(fen)
+            pbar.set_postfix(test=len(test_seqs), train=len(train_seqs), skipped=skipped)
+            if train_target is not None and len(train_seqs) >= train_target:
+                break
+    print(
+        f"Stage 5: test={len(test_seqs):,} puzzles, "
+        f"train={len(train_seqs):,} puzzles, "
+        f"skipped={skipped:,} invalid."
+    )
+    if test_seqs and not test_done:
+        _save_policy_memmap(
+            test_seqs, out_dir, "puzzle_test", max_seq_len=args.max_seq_len, fens=test_fens,
+        )
+    if train_seqs and not train_done:
+        _save_policy_memmap(
+            train_seqs, out_dir, "puzzle", max_seq_len=args.max_seq_len, fens=train_fens,
+        )
+    elif not train_seqs:
+        print("Stage 5: WARNING — no training puzzles collected.")
+def main():
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("--out-dir", type=Path, default=Path("data"))
+    parser.add_argument("--policy-games", type=int, default=1_000_000,
+                        help="Number of games to collect for policy model training")
+    parser.add_argument("--reward-games", type=int, default=1_000_000,
+                        help="Number of games to collect for reward model (Stockfish eval)")
+    parser.add_argument("--policy-min-elo", type=int, default=1800,
+                        help="Min Elo for both players in policy training games")
+    parser.add_argument("--reward-min-elo", type=int, default=1500,
+                        help="Min Elo for both players in reward model training games")
+    parser.add_argument("--sample-rate", type=float, default=0.25,
+                        help="Fraction of positions to sample per game (scales with game length)")
+    parser.add_argument("--position-skew", type=float, default=1.5,
+                        help="Power-law exponent weighting later positions; 1.0=linear, higher=more mid/late")
+    parser.add_argument("--workers", type=int, default=16)
+    parser.add_argument("--stockfish-depth", type=int, default=12)
+    parser.add_argument("--max-seq-len", type=int, default=128,
+                        help="Truncate token sequences to this length when writing .bin files")
+    parser.add_argument(
+        "--force",
+        action="store_true",
+        help="Re-run all stages even if their outputs already exist",
+    )
+    parser.add_argument("--puzzle-count", type=int, default=None, dest="puzzle_count",
+        help="Max puzzles to include (default: all ~4.99M)")
+    parser.add_argument("--min-puzzle-popularity", type=int, default=None, dest="min_puzzle_popularity",
+        help="Min Lichess Popularity score (0-100 scale)")
+    parser.add_argument("--min-puzzle-plays", type=int, default=None, dest="min_puzzle_plays",
+        help="Min NbPlays for a puzzle to be included")
+    parser.add_argument("--skip-puzzles", action="store_true",
+        help="Skip Stage 5 puzzle processing")
+    parser.add_argument("--puzzles-only", action="store_true",
+        help="Only run Stage 5 (puzzle processing). Skips game collection, "
+             "outcome/Stockfish/policy memmaps, and test splits. Requires "
+             "tokenizer.pt to exist (or it will be built from the UCI vocab).")
+    parser.add_argument("--policy-only", action="store_true",
+        help="Skip everything Stockfish/reward-related: Stage 1 collects only "
+             "policy games, Stage 3 (Stockfish labeling) is skipped, and the "
+             "stockfish_test split is not built. Stages 1/2/4/5 + policy_test "
+             "still run, producing tokenizer.pt, policy_* / puzzle_* memmaps, "
+             "and the policy_test split.")
+    parser.add_argument("--puzzle-test-size", type=int, default=100_000, dest="puzzle_test_size",
+        help="Number of puzzle sequences held out for the test set (default: 100000)")
+    parser.add_argument("--reward-test-size", type=int, default=50_000, dest="reward_test_size",
+        help="Number of reward positions held out for the test set (default: 50000)")
+    parser.add_argument("--policy-test-size", type=int, default=50_000, dest="policy_test_size",
+        help="Number of policy sequences held out for the test set (default: 50000)")
+    args = parser.parse_args()
+    args.out_dir.mkdir(parents=True, exist_ok=True)
+    if args.puzzles_only and args.policy_only:
+        parser.error("--puzzles-only and --policy-only are mutually exclusive.")
+    if args.puzzles_only:
+        print("--puzzles-only: skipping Stages 1-4 and test-split builder.")
+        tokenizer_path = args.out_dir / "tokenizer.pt"
+        if tokenizer_path.exists():
+            tokenizer = torch.load(tokenizer_path, weights_only=False)
+        else:
+            # Tokenizer is just the enumerated UCI vocab — no games needed.
+            print("  tokenizer.pt missing; building from UCI vocab...")
+            tokenizer = build_tokenizer_from_games()
+            torch.save(tokenizer, tokenizer_path)
+        stage5_puzzle_samples(args, tokenizer, args.out_dir)
+    elif args.policy_only:
+        print("--policy-only: skipping Stage 3 (Stockfish labeling) and stockfish_test split.")
+        stage1_collect_games(args)
+        stage2_outcome_samples(args)
+        stage4_policy_sequences(args)
+        tokenizer_path = args.out_dir / "tokenizer.pt"
+        if not args.skip_puzzles and tokenizer_path.exists():
+            tokenizer = torch.load(tokenizer_path, weights_only=False)
+            stage5_puzzle_samples(args, tokenizer, args.out_dir)
+        elif not args.skip_puzzles:
+            print("Stage 5: skipping — tokenizer.pt not found (run stages 1-2 first).")
+        stage_build_test_splits(args, args.out_dir)
+    else:
+        stage1_collect_games(args)
+        stage2_outcome_samples(args)
+        stage3_stockfish_samples(args)
+        stage4_policy_sequences(args)
+        tokenizer_path = args.out_dir / "tokenizer.pt"
+        if not args.skip_puzzles and tokenizer_path.exists():
+            tokenizer = torch.load(tokenizer_path, weights_only=False)
+            stage5_puzzle_samples(args, tokenizer, args.out_dir)
+        elif not args.skip_puzzles:
+            print("Stage 5: skipping — tokenizer.pt not found (run stages 1-2 first).")
+        stage_build_test_splits(args, args.out_dir)
+    print("\nAll stages complete. Artifacts:")
+    for name in (
+        "games_outcome.pt",
+        "games_stockfish.pt",
+        "tokenizer.pt",
+        "outcome_tokens.bin",
+        "outcome_labels.bin",
+        "outcome_lengths.bin",
+        "outcome_meta.pt",
+        "stockfish_tokens.bin",
+        "stockfish_labels.bin",
+        "stockfish_lengths.bin",
+        "stockfish_meta.pt",
+        "policy_tokens.bin",
+        "policy_lengths.bin",
+        "policy_meta.pt",
+        "puzzle_tokens.bin",
+        "puzzle_lengths.bin",
+        "puzzle_fens.bin",
+        "puzzle_meta.pt",
+        "puzzle_test_tokens.bin",
+        "puzzle_test_lengths.bin",
+        "puzzle_test_fens.bin",
+        "puzzle_test_meta.pt",
+        "stockfish_test_tokens.bin",
+        "stockfish_test_labels.bin",
+        "stockfish_test_lengths.bin",
+        "stockfish_test_meta.pt",
+        "policy_test_tokens.bin",
+        "policy_test_lengths.bin",
+        "policy_test_meta.pt",
+    ):
+        path = args.out_dir / name
+        size_mb = path.stat().st_size / 1024 / 1024 if path.exists() else 0
+        print(f"  {path}  ({size_mb:.1f} MB)")
+# Run the dataset-building pipeline only when this file is executed as a script.
+if __name__ == "__main__":
+    main()

src/minimax.py ADDED Viewed

	@@ -0,0 +1,116 @@

+import chess
+import math
+from typing import Callable
+from model import PIECE_VALUES
+def dummy_reward_fn(board: chess.Board) -> float:
+    """Material-count heuristic: positive favors white."""
+    score = 0.0
+    for piece_type in PIECE_VALUES:
+        score += len(board.pieces(piece_type, chess.WHITE)) * PIECE_VALUES[piece_type]
+        score -= len(board.pieces(piece_type, chess.BLACK)) * PIECE_VALUES[piece_type]
+    return math.tanh(score / 10.0)
+class MinimaxSearch:
+    """Minimax search with top-N move pruning.
+    At each node, evaluates all legal moves with the reward function,
+    keeps the top N candidates, and recurses to the given depth.
+    Alternates between maximizing (white) and minimizing (black).
+    """
+    def __init__(
+        self,
+        reward_fn: Callable[[chess.Board], float],
+        depth: int = 3,
+        top_n: int = 5,
+    ):
+        self.reward_fn = reward_fn
+        self.depth = depth
+        self.top_n = top_n
+    def search(self, board: chess.Board) -> chess.Move:
+        """Return the best move for the current side to play."""
+        legal_moves = list(board.legal_moves)
+        if not legal_moves:
+            raise ValueError("No legal moves available")
+        if len(legal_moves) == 1:
+            return legal_moves[0]
+        maximizing = board.turn == chess.WHITE
+        # Score every legal move with a shallow reward evaluation
+        scored_moves = []
+        for move in legal_moves:
+            board.push(move)
+            score = self.reward_fn(board)
+            board.pop()
+            scored_moves.append((score, move))
+        # Keep top N candidates (best for current side)
+        scored_moves.sort(key=lambda x: x[0], reverse=maximizing)
+        candidates = scored_moves[: self.top_n]
+        # Recurse on each candidate to find the best
+        best_move = candidates[0][1]
+        best_value = float("-inf") if maximizing else float("inf")
+        for _, move in candidates:
+            board.push(move)
+            value = self._minimax(board, self.depth - 1, not maximizing)
+            board.pop()
+            if maximizing and value > best_value:
+                best_value = value
+                best_move = move
+            elif not maximizing and value < best_value:
+                best_value = value
+                best_move = move
+        return best_move
+    def _minimax(self, board: chess.Board, depth: int, maximizing: bool) -> float:
+        if depth <= 0 or board.is_game_over():
+            return self._terminal_eval(board)
+        legal_moves = list(board.legal_moves)
+        if not legal_moves:
+            return self._terminal_eval(board)
+        # Score all moves, keep top N for the current side
+        scored_moves = []
+        for move in legal_moves:
+            board.push(move)
+            score = self.reward_fn(board)
+            board.pop()
+            scored_moves.append((score, move))
+        scored_moves.sort(key=lambda x: x[0], reverse=maximizing)
+        candidates = scored_moves[: self.top_n]
+        if maximizing:
+            best = float("-inf")
+            for _, move in candidates:
+                board.push(move)
+                best = max(best, self._minimax(board, depth - 1, False))
+                board.pop()
+            return best
+        else:
+            best = float("inf")
+            for _, move in candidates:
+                board.push(move)
+                best = min(best, self._minimax(board, depth - 1, True))
+                board.pop()
+            return best
+    def _terminal_eval(self, board: chess.Board) -> float:
+        """Evaluate a terminal or leaf node."""
+        if board.is_checkmate():
+            # The side to move is checkmated
+            return -1.0 if board.turn == chess.WHITE else 1.0
+        if board.is_game_over():
+            return 0.0
+        return self.reward_fn(board)

src/model.py ADDED Viewed

	@@ -0,0 +1,327 @@

+import math
+import torch
+import torch.nn as nn
+import chess
+from src.tokenizer import Tokenizer
+# Special vocabulary symbols. The inference wrappers in this module look both
+# up in the tokenizer and prepend the [CLS] id to every encoded move sequence.
+CLS_TOKEN = "[CLS]"
+PAD_TOKEN = "[PAD]"  # presumably the batch-padding symbol; only its id lookup is visible here
+# Material value per piece type, used by the material-count reward heuristic.
+# The king is worth 0, so it never changes the material balance.
+PIECE_VALUES = {
+    chess.PAWN: 1,
+    chess.KNIGHT: 3,
+    chess.BISHOP: 3,
+    chess.ROOK: 5,
+    chess.QUEEN: 9,
+    chess.KING: 0,
+}
+# Number of feature planes produced by board_to_planes: 12 piece planes, side
+# to move, 4 castling-rights planes, en-passant square and halfmove clock.
+BOARD_PLANES = 19
+def board_to_planes(board: chess.Board) -> torch.Tensor:
+    """chess.Board -> (19, 8, 8) float tensor."""
+    planes = torch.zeros(BOARD_PLANES, 8, 8, dtype=torch.float32)
+    pieces = [chess.PAWN, chess.KNIGHT, chess.BISHOP, chess.ROOK, chess.QUEEN, chess.KING]
+    colors = [chess.WHITE, chess.BLACK]
+    piece_to_plane = {(piece, color) : 6 * color_num + piece_num  for piece_num, piece in enumerate(pieces) for color_num, color in enumerate(colors)}
+    for sq, piece in board.piece_map().items():
+        r, c = chess.square_rank(sq), chess.square_file(sq)
+        planes[piece_to_plane[(piece.piece_type, piece.color)], r, c] = 1.0
+    if board.turn == chess.WHITE:
+        planes[12].fill_(1.0)
+    if board.has_kingside_castling_rights(chess.WHITE):  planes[13].fill_(1.0)
+    if board.has_queenside_castling_rights(chess.WHITE): planes[14].fill_(1.0)
+    if board.has_kingside_castling_rights(chess.BLACK):  planes[15].fill_(1.0)
+    if board.has_queenside_castling_rights(chess.BLACK): planes[16].fill_(1.0)
+    if board.ep_square is not None:
+        r, c = chess.square_rank(board.ep_square), chess.square_file(board.ep_square)
+        planes[17, r, c] = 1.0
+    planes[18].fill_(min(board.halfmove_clock, 100) / 100.0)
+    return planes
+def _group_norm(channels: int, groups: int = 32) -> nn.GroupNorm:
+    return nn.GroupNorm(num_groups=min(groups, channels), num_channels=channels)
+class ResidualBlock(nn.Module):
+    def __init__(self, channels: int):
+        super().__init__()
+        self.conv1 = nn.Conv2d(channels, channels, 3, padding=1, bias=False)
+        self.norm1 = _group_norm(channels)
+        self.conv2 = nn.Conv2d(channels, channels, 3, padding=1, bias=False)
+        self.norm2 = _group_norm(channels)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        h = torch.relu(self.norm1(self.conv1(x)))
+        h = self.norm2(self.conv2(h))
+        return torch.relu(h + x)
+class BoardCNN(nn.Module):
+    def __init__(self, d_model, channels=128, num_blocks=6):
+        super().__init__()
+        self.stem = nn.Sequential(
+            nn.Conv2d(BOARD_PLANES, channels, 3, padding=1, bias=False),
+            _group_norm(channels),
+            nn.ReLU(inplace=True),
+        )
+        self.blocks = nn.Sequential(*[ResidualBlock(channels) for _ in range(num_blocks)])
+        self.proj = nn.Linear(channels, d_model)
+        self.square_pos = nn.Embedding(64, d_model)
+    def forward(self, planes : torch.Tensor) -> torch.Tensor:
+        x = self.stem(planes)
+        x = self.blocks(x) # (N, C, 8, 8)
+        x = x.permute(0, 2, 3, 1).reshape(x.size(0), 64, -1) # (n, 64, C)
+        x = self.proj(x) + self.square_pos.weight # (n, 64, d_model)
+        return x
+class CrossAttnBlock(nn.Module):
+    def __init__(self, d_model, n_head, dim_ff, dropout):
+        super().__init__()
+        self.self_attn = nn.MultiheadAttention(d_model, n_head, dropout = dropout, batch_first=True)
+        self.cross_attn = nn.MultiheadAttention(d_model, n_head, dropout = dropout, batch_first = True)
+        self.ff = nn.Sequential(
+            nn.Linear(d_model, dim_ff), nn.GELU(), nn.Linear(dim_ff, d_model)
+        )
+        self.norm1 = nn.LayerNorm(d_model)
+        self.norm2 = nn.LayerNorm(d_model)
+        self.norm3 = nn.LayerNorm(d_model)
+        self.drop = nn.Dropout(dropout)
+        #Adding this gate which is init to 0 so cross-attn starts disabled
+        self.cross_gate = nn.Parameter(torch.zeros(1))
+    def forward(self, moves, board, key_padding_mask, attn_mask):
+        """
+        moves:             (B, T, d)
+        board:             (B, T, 64, d)  -- per-position K/V banks
+        key_padding_mask:  (B, T)         -- True = padded move position
+        attn_mask:         (T, T)         -- causal mask for self-attn
+        """
+        m = self.norm1(moves)
+        sa, _ = self.self_attn(m, m, m, attn_mask = attn_mask, key_padding_mask=key_padding_mask, need_weights=False)
+        moves = moves + self.drop(sa)
+        B, T, d = moves.shape
+        q = self.norm2(moves).reshape(B * T, 1, d)
+        kv = board.reshape(B * T, 64, d)
+        ca, _ = self.cross_attn(q, kv, kv, need_weights = False)
+        ca = ca.reshape(B, T, d)
+        moves = moves + self.drop(self.cross_gate.tanh() * ca)
+        # FFN
+        moves = moves + self.drop(self.ff(self.norm3(moves)))
+        return moves
+class PositionalEncoding(nn.Module):
+    def __init__(self, d_model: int, max_len: int = 512, dropout: float = 0.1):
+        super().__init__()
+        self.dropout = nn.Dropout(p=dropout)
+        pe = torch.zeros(max_len, d_model)
+        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
+        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
+        pe[:, 0::2] = torch.sin(position * div_term)
+        pe[:, 1::2] = torch.cos(position * div_term)
+        pe = pe.unsqueeze(0)  # (1, max_len, d_model)
+        self.register_buffer("pe", pe)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = x + self.pe[:, :x.size(1)]
+        return self.dropout(x)
+class ChessRewardModel(nn.Module):
+    def __init__(
+        self,
+        vocab_size: int,
+        d_model: int = 768,
+        nhead: int = 12,
+        num_layers: int = 8,
+        dim_feedforward: int = 3072,
+        max_seq_len: int = 128,
+        dropout: float = 0.1,
+    ):
+        super().__init__()
+        self.token_embedding = nn.Embedding(vocab_size, d_model)
+        self.pos_encoding = PositionalEncoding(d_model, max_seq_len, dropout)
+        encoder_layer = nn.TransformerEncoderLayer(
+            d_model, nhead, dim_feedforward, dropout, batch_first=True
+        )
+        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
+        self.reward_head = nn.Linear(d_model, 1)
+    def forward(
+        self,
+        token_ids: torch.Tensor,
+        attention_mask: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        """
+        Args:
+            token_ids: (batch, seq_len) int tensor with CLS prepended
+            attention_mask: (batch, seq_len) bool tensor, True where padded
+        Returns:
+            (batch,) float tensor bounded to [-1, 1]
+        """
+        x = self.token_embedding(token_ids)
+        x = self.pos_encoding(x)
+        x = self.encoder(x, src_key_padding_mask=attention_mask)
+        cls_hidden = x[:, 0, :]  # CLS token at position 0
+        reward = self.reward_head(cls_hidden).squeeze(-1)
+        return torch.tanh(reward)
+class ChessPolicyModel(nn.Module):
+    """Causal next-move predictor with per-position live-board cross-attention.
+    Two streams flow through every block:
+      - Move stream: token embeddings + sinusoidal positional encoding, doing
+        causal self-attention over the move history.
+      - Board stream: a (B, T, 64, d_model) bank of CNN-encoded board features
+        where bank `t` is the state after token_ids[1..t] have been played.
+        At each block, the move query at position t cross-attends only to its
+        own 64 board-square keys — implicit causality via data layout, no
+        masking needed.
+    The board representation never depends on a token the model is being
+    asked to predict, so multi-position LM-style training is leak-safe.
+    """
+    def __init__(
+        self,
+        vocab_size: int,
+        d_model: int = 768,
+        nhead: int = 12,
+        num_layers: int = 8,
+        dim_feedforward: int = 3072,
+        max_seq_len: int = 128,
+        dropout: float = 0.1,
+        cnn_channels: int = 128,
+        cnn_blocks: int = 6,
+    ):
+        super().__init__()
+        self.vocab_size = vocab_size
+        self.token_embedding = nn.Embedding(vocab_size, d_model)
+        self.board_cnn = BoardCNN(d_model, cnn_channels, cnn_blocks)
+        self.pos_encoding = PositionalEncoding(d_model, max_seq_len, dropout)
+        self.blocks = nn.ModuleList([
+            CrossAttnBlock(d_model, nhead, dim_feedforward, dropout)
+            for _ in range(num_layers)
+        ])
+        self.norm_out = nn.LayerNorm(d_model)
+        self.prob_head = nn.Linear(d_model, vocab_size)
+    def forward(
+        self,
+        token_ids: torch.Tensor,
+        board_planes: torch.Tensor,
+        attention_mask: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        """
+        Args:
+            token_ids:      (B, T) int — CLS at position 0
+            board_planes:   (B, T, 19, 8, 8) float — per-position live planes;
+                            planes[:, t] is the board state after token_ids[1..t]
+            attention_mask: (B, T) bool — True where padded
+        Returns:
+            (B, T, vocab_size) raw logits at every position
+        """
+        B, T = token_ids.shape
+        moves = self.token_embedding(token_ids)
+        moves = self.pos_encoding(moves)                          # (B, T, d)
+        # Vectorize the CNN over (B*T) boards — one big conv batch, not a loop.
+        planes_flat = board_planes.reshape(B * T, BOARD_PLANES, 8, 8)
+        board_feats = self.board_cnn(planes_flat)                 # (B*T, 64, d)
+        board_feats = board_feats.reshape(B, T, 64, -1)           # (B, T, 64, d)
+        # Bool causal mask (True = masked future position) to match the bool
+        # key_padding_mask. PyTorch deprecates mixing float and bool masks.
+        causal = torch.triu(
+            torch.ones(T, T, dtype=torch.bool, device=token_ids.device), diagonal=1
+        )
+        for blk in self.blocks:
+            moves = blk(moves, board_feats, attention_mask, causal)
+        moves = self.norm_out(moves)
+        return self.prob_head(moves)                              # (B, T, vocab)
+class DummyRewardModel:
+    """Material-count heuristic for MCTS testing."""
+    def __call__(self, board: chess.Board) -> float:
+        score = 0.0
+        for piece_type in PIECE_VALUES:
+            score += len(board.pieces(piece_type, chess.WHITE)) * PIECE_VALUES[piece_type]
+            score -= len(board.pieces(piece_type, chess.BLACK)) * PIECE_VALUES[piece_type]
+        return math.tanh(score / 10.0)
+class RewardModelInference:
+    """Wraps ChessRewardModel + Tokenizer for use in minimax"""
+    def __init__(self, model: ChessRewardModel, tokenizer: Tokenizer, device: str = "cpu"):
+        self.model = model
+        self.tokenizer = tokenizer
+        self.device = device
+        self.cls_id = tokenizer.symbol_to_token[CLS_TOKEN]
+        self.pad_id = tokenizer.symbol_to_token[PAD_TOKEN]
+        self.model.eval()
+    @torch.no_grad()
+    def __call__(self, board: chess.Board, max_seq_len: int = 128) -> float:
+        moves_uci = [move.uci() for move in board.move_stack]
+        # Keep the most recent moves to stay within the training sequence length.
+        # CLS occupies position 0, so cap move history at max_seq_len - 1.
+        moves_uci = moves_uci[-(max_seq_len - 1):]
+        token_ids = [self.cls_id] + self.tokenizer.encode_moves(moves_uci)
+        token_tensor = torch.tensor([token_ids], dtype=torch.long, device=self.device)
+        reward = self.model(token_tensor)
+        return reward.item()
+class PolicyModelInference:
+    """Wraps ChessPolicyModel + Tokenizer"""
+    def __init__(self, model: ChessPolicyModel, tokenizer: Tokenizer, device: str = "cpu"):
+        self.model = model
+        self.tokenizer = tokenizer
+        self.device = device
+        self.cls_id = tokenizer.symbol_to_token[CLS_TOKEN]
+        self.pad_id = tokenizer.symbol_to_token[PAD_TOKEN]
+        self.model.eval()
+    @torch.no_grad()
+    def __call__(self, board: chess.Board) -> str:
+        moves_uci = [move.uci() for move in board.move_stack]
+        token_ids = [self.cls_id] + self.tokenizer.encode_moves(moves_uci)
+        token_tensor = torch.tensor([token_ids], dtype=torch.long, device=self.device)
+        # Replay the full move history on a fresh board, snapshotting planes
+        # at every position. planes[0] = standard starting board (model has
+        # only seen [CLS]); planes[t] = state after the first t moves played.
+        # This matches the training pipeline (ChessPolicyDataset._replay_planes
+        # with start_board=chess.Board()) exactly.
+        replay_board = chess.Board()
+        plane_list = [board_to_planes(replay_board)]
+        for uci in moves_uci:
+            replay_board.push(chess.Move.from_uci(uci))
+            plane_list.append(board_to_planes(replay_board))
+        planes = torch.stack(plane_list).unsqueeze(0).to(self.device)  # (1, T, 19, 8, 8)
+        logits = self.model(token_tensor, planes)  # (1, T, vocab_size)
+        last_logits = logits[0, -1]                # last position predicts the next move
+        legal_move_ids = [self.tokenizer.symbol_to_token[move.uci()] for move in board.legal_moves]
+        mask = torch.full((self.tokenizer.language_size,), float('-inf'), device=self.device)
+        mask[legal_move_ids] = 0.0
+        best_move_idx = (last_logits + mask).argmax().item()
+        return self.tokenizer.token_to_symbol[best_move_idx]

src/tokenizer.py ADDED Viewed

	@@ -0,0 +1,104 @@

+import pandas as pd
+import numpy as np
+import typing
+from collections import deque, defaultdict
+class Tokenizer():
+    def __init__(self):
+        self.symbol_set : set = None
+        self.symbol_to_token = {}
+        self.token_to_symbol = {}
+        self.language_size = 0
+        self.corpus = None
+    def train_tokenizer(self, input, max_language_size: int) -> None:
+        if type(input) == str:
+            self.corpus = input.split(",")
+        else:
+            self.corpus = input
+        self.symbol_set = set(self.corpus)
+        for sym in self.symbol_set:
+            self.symbol_to_token[sym] = self.language_size
+            self.token_to_symbol[self.language_size] = sym
+            self.language_size += 1
+        # Converted everythign to tokens from symbolic form
+        self.corpus = np.array([self.symbol_to_token[sym] for sym in self.corpus], dtype=int)
+        while self.language_size < max_language_size:
+            temp_corpus = self.corpus
+            common_pair = None
+            highest_pair_count = 0
+            pair_counts = defaultdict(int)
+            for i in range(len(temp_corpus)-1):
+                pair = (temp_corpus[i], temp_corpus[i+1])
+                pair_counts[pair] += 1
+                if (pair_counts[pair] > highest_pair_count):
+                    highest_pair_count = pair_counts[pair]
+                    common_pair = pair
+            synthetic_symbol = self.token_to_symbol[common_pair[0]] + self.token_to_symbol[common_pair[1]]
+            self.symbol_to_token[synthetic_symbol] = self.language_size
+            self.token_to_symbol[self.language_size] = synthetic_symbol
+            self.language_size += 1
+            combine_tokens = deque(temp_corpus)
+            self.corpus = []
+            while (len(combine_tokens) > 1):
+                first_elem = combine_tokens.popleft()
+                second_elem = combine_tokens.popleft()
+                if ((first_elem, second_elem) == common_pair):
+                    combine_tokens.appendleft(self.language_size - 1)
+                else:
+                    self.corpus.append(first_elem)
+                    self.corpus.append(second_elem)
+            if (len(combine_tokens) > 0):
+                self.corpus.append(combine_tokens.popleft())
+        self.corpus = None
+    def decode(self, tokens: list[int]) -> str:
+        return "".join([self.token_to_symbol[t] for t in tokens])
+    def encode(self, message: str):
+        char_list = list(message)
+        char_inputs = deque(char_list)
+        result_tokens = []
+        curr_symbol = ""
+        while (len(char_inputs) > 0):
+            f_char = char_inputs.popleft()
+            curr_symbol += f_char
+            if (curr_symbol not in self.symbol_to_token.keys()):
+                curr_symbol = curr_symbol[:-1]
+                result_tokens.append(self.symbol_to_token[curr_symbol])
+                char_inputs.appendleft(f_char)
+                curr_symbol = ""
+        if (len(curr_symbol) > 0):
+            result_tokens.append(self.symbol_to_token[curr_symbol])
+        return result_tokens
+    def encode_moves(self, moves: list[str]) -> list[int]:
+        return [self.symbol_to_token[move] for move in moves]
+    def add_special_tokens(self, tokens: list[str]) -> dict[str, int]:
+        mapping = {}
+        for tok in tokens:
+            self.symbol_to_token[tok] = self.language_size
+            self.token_to_symbol[self.language_size] = tok
+            mapping[tok] = self.language_size
+            self.language_size += 1
+        return mapping
+class DataLoader():
+    corpus = None
+    def __init__(self, file_name: str):
+        with open(file_name, "r") as f:
+            self.corpus = f.read()

src/train.py ADDED Viewed

	@@ -0,0 +1,1319 @@

+import argparse
+import atexit
+import contextlib
+import math  # noqa: F401 (used in eval_policy)
+import multiprocessing as mp
+import re
+import shutil
+import time
+import numpy as np
+from pathlib import Path
+import chess
+import chess.engine
+import torch
+import torch.nn.functional as F
+from torch.utils.data import Dataset, DataLoader
+from torch.utils.tensorboard import SummaryWriter
+from datasets import load_dataset
+from tokenizer import Tokenizer
+from model import (
+    ChessRewardModel,
+    ChessPolicyModel,
+    DummyRewardModel,
+    CLS_TOKEN,
+    PAD_TOKEN,
+    board_to_planes,
+)
+# Prefer a Stockfish binary found on PATH; otherwise fall back to a fixed location.
+STOCKFISH_PATH = shutil.which("stockfish") or "/usr/local/bin/stockfish"
+# Game-result markers that can appear in PGN movetext.
+RESULT_TOKENS = {"1-0", "0-1", "1/2-1/2", "*"}
+# Move-number tokens such as '12.' or '12...'.
+MOVE_NUMBER_RE = re.compile(r"^\d+\.(\.\.)?$")
+# Brace-delimited PGN comments like {[%eval 0.37]} and {[%clk 0:05:00]}.
+# The negated class [^}]* stops at the first closing brace, so several
+# comments in one movetext are matched separately.
+BRACE_COMMENT_RE = re.compile(r"\{[^}]*\}")
+def normalize_cp(centipawns: int) -> float:
+    """Map centipawn score to [-1, 1] using tanh scaling."""
+    return math.tanh(centipawns / 400.0)
+def material_eval(board: chess.Board) -> float:
+    """Material-count evaluation as a fallback for Stockfish."""
+    return DummyRewardModel()(board)
+class StockfishEvaluator:
+    """Wraps a persistent Stockfish engine for batch evaluation."""
+    def __init__(self, engine_path: str = STOCKFISH_PATH, depth: int = 15):
+        self.engine = chess.engine.SimpleEngine.popen_uci(engine_path)
+        self.depth = depth
+    def __call__(self, board: chess.Board) -> float:
+        info = self.engine.analyse(board, chess.engine.Limit(depth=self.depth))
+        score = info["score"].white()
+        if score.is_mate():
+            return 1.0 if score.mate() > 0 else -1.0
+        return normalize_cp(score.score())
+    def close(self):
+        self.engine.quit()
+def parse_movetext(movetext: str) -> list[str]:
+    """Parse PGN movetext into a list of SAN moves.
+    Handles Lichess-style annotations like '{[%eval 0.37]}' and '{[%clk 0:05:00]}'
+    by stripping them before tokenization. Without this, annotated games get
+    truncated mid-replay when parse_san chokes on comment fragments.
+    Input format: '1. d4 {[%eval 0.13]} d5 2. Nf3 ... 1-0'
+    Returns: ['d4', 'd5', 'Nf3', ...]
+    """
+    cleaned = BRACE_COMMENT_RE.sub(" ", movetext)
+    tokens = cleaned.split()
+    moves = []
+    for tok in tokens:
+        if tok in RESULT_TOKENS:
+            continue
+        if MOVE_NUMBER_RE.match(tok):
+            continue
+        moves.append(tok)
+    return moves
+def load_filtered_dataset(min_elo: int = 1500, min_rows: int = 100_000):
+    """Load and filter the Lichess HuggingFace dataset.
+    Filters:
+    - WhiteElo >= min_elo AND BlackElo >= min_elo
+    - Termination == 'Normal'
+    Returns the filtered dataset and raises ValueError if too few rows.
+    """
+    print("Loading Lichess dataset from HuggingFace...")
+    ds = load_dataset("Lichess/standard-chess-games", split="train", streaming=True)
+    print(f"Filtering for Elo >= {min_elo} and Termination == 'Normal'...")
+    ds_filtered = ds.filter(
+        lambda row: (
+            row["WhiteElo"] is not None
+            and row["BlackElo"] is not None
+            and row["WhiteElo"] >= min_elo
+            and row["BlackElo"] >= min_elo
+            and row.get("Termination") == "Normal"
+        )
+    )
+    # Materialize enough rows to validate the threshold
+    print(f"Collecting at least {min_rows:,} filtered games...")
+    rows = []
+    for row in ds_filtered:
+        rows.append(row)
+        if len(rows) % 50_000 == 0:
+            print(f"  collected {len(rows):,} games so far...")
+        if len(rows) >= min_rows:
+            break
+    if len(rows) < min_rows:
+        raise ValueError(
+            f"Only found {len(rows):,} games matching filters, "
+            f"need at least {min_rows:,}."
+        )
+    print(f"Collected {len(rows):,} games (target met).")
+    return rows
+def _enumerate_all_uci_moves() -> list[str]:
+    """Enumerate every UCI move string that can legally appear in a chess game.
+    Uses direct geometric enumeration rather than board simulation to avoid
+    edge cases where king placement blocks valid destination squares.
+    Covers all piece movement patterns: lines (rook/queen), diagonals
+    (bishop/queen), L-shapes (knight), and pawn promotions.
+    """
+    seen: set[str] = set()
+    for from_sq in chess.SQUARES:
+        fr = chess.square_rank(from_sq)
+        ff = chess.square_file(from_sq)
+        for to_sq in chess.SQUARES:
+            if from_sq == to_sq:
+                continue
+            tr = chess.square_rank(to_sq)
+            tf = chess.square_file(to_sq)
+            dr = abs(tr - fr)
+            df = abs(tf - ff)
+            is_line   = (dr == 0 or df == 0)                           # rook / queen
+            is_diag   = (dr == df)                                      # bishop / queen
+            is_knight = (dr == 2 and df == 1) or (dr == 1 and df == 2)
+            if not (is_line or is_diag or is_knight):
+                continue
+            seen.add(chess.Move(from_sq, to_sq).uci())
+            # Promotion variants: pawn on 7th rank advancing to 8th (or 2nd→1st)
+            if ((fr == 6 and tr == 7) or (fr == 1 and tr == 0)) and df <= 1:
+                for promo in (chess.QUEEN, chess.ROOK, chess.BISHOP, chess.KNIGHT):
+                    seen.add(chess.Move(from_sq, to_sq, promotion=promo).uci())
+    return list(seen)
+def _weighted_sample(eligible: list[int], k: int, skew_exponent: float, seed: int) -> set[int]:
+    """Sample k positions from eligible without replacement, skewed toward later positions.
+    Weights grow as (position_rank + 1)^skew_exponent so later positions in a game
+    are proportionally more likely to be selected. skew_exponent=1.0 gives linear
+    weighting; higher values concentrate more mass at the end of the game.
+    """
+    n = len(eligible)
+    k = min(k, n)
+    if k == n:
+        return set(eligible)
+    weights = np.array([(i + 1) ** skew_exponent for i in range(n)], dtype=np.float64)
+    weights /= weights.sum()
+    rng = np.random.default_rng(seed)
+    chosen = rng.choice(n, size=k, replace=False, p=weights)
+    return {eligible[i] for i in chosen}
+def build_tokenizer_from_games(games: list[dict] | None = None) -> Tokenizer:
+    """Build a move-level tokenizer covering all 1968 UCI moves."""
+    uci_moves = _enumerate_all_uci_moves()
+    print(f"  building tokenizer from {len(set(uci_moves)):,} UCI moves (no BPE)")
+    tokenizer = Tokenizer()
+    tokenizer.train_tokenizer(uci_moves, max_language_size=len(set(uci_moves)))
+    tokenizer.add_special_tokens([CLS_TOKEN, PAD_TOKEN])
+    return tokenizer
+def _load_train_idx(out_dir: Path, name: str, n: int) -> np.ndarray | None:
+    """If `{name}_test_indices.npy` exists, return the complement (training-only
+    indices into the full memmap). Returns None when no test split is recorded —
+    in that case the caller indexes into the memmap directly.
+    Names ending in '_test' always return None (the test memmap should not
+    exclude itself).
+    """
+    if name.endswith("_test"):
+        return None
+    test_idx_file = out_dir / f"{name}_test_indices.npy"
+    if not test_idx_file.exists():
+        return None
+    test_idx = np.load(test_idx_file)
+    mask = np.ones(n, dtype=bool)
+    mask[test_idx] = False
+    return np.where(mask)[0]
+class ChessPositionDataset(Dataset):
+    def __init__(
+        self,
+        games: list[dict],
+        tokenizer: Tokenizer,
+        eval_fn=material_eval,
+        sample_rate: float = 0.25,
+        skew_exponent: float = 1.5,
+    ):
+        self.tokenizer = tokenizer
+        self.cls_id = tokenizer.symbol_to_token[CLS_TOKEN]
+        self.samples: list[tuple[list[int], float]] = []
+        self._memmap = False
+        self._train_idx: np.ndarray | None = None
+        self._generate_samples(games, eval_fn, sample_rate, skew_exponent)
+    def _generate_samples(self, games, eval_fn, sample_rate, skew_exponent):
+        for idx, game in enumerate(games):
+            movetext = game.get("movetext", "")
+            if not movetext:
+                continue
+            move_sans = parse_movetext(movetext)
+            if len(move_sans) < 2:
+                continue
+            board = chess.Board()
+            eligible = list(range(len(move_sans)))
+            # Scale sample count with game length — longer games have more
+            # evaluation swings and contribute proportionally more samples.
+            num_positions = max(1, int(len(move_sans) * sample_rate))
+            # Deterministic weighted sampling seeded by game index so serial
+            # and parallel paths produce identical sample sets for the same input.
+            sample_indices = _weighted_sample(eligible, num_positions, skew_exponent, seed=idx)
+            valid_moves = []
+            for i, san in enumerate(move_sans):
+                try:
+                    move = board.parse_san(san)
+                    board.push(move)
+                    valid_moves.append(move.uci())
+                except (chess.InvalidMoveError, chess.AmbiguousMoveError):
+                    break
+                if i in sample_indices:
+                    token_ids = [self.cls_id] + self.tokenizer.encode_moves(valid_moves)
+                    score = eval_fn(board)
+                    self.samples.append((token_ids, score))
+            if (idx + 1) % 10_000 == 0:
+                print(f"  processed {idx + 1:,} games, {len(self.samples):,} positions...")
+    def __len__(self) -> int:
+        if self._memmap:
+            if self._train_idx is not None:
+                return len(self._train_idx)
+            return len(self._mm_labels)
+        return len(self.samples)
+    def __getitem__(self, idx: int):
+        if self._memmap:
+            if self._train_idx is not None:
+                idx = int(self._train_idx[idx])
+            tokens = torch.from_numpy(np.array(self._mm_tokens[idx], dtype=np.int32)).long()
+            length = int(self._mm_lengths[idx])
+            mask = torch.arange(tokens.shape[0]) >= length  # True = padded
+            return tokens, mask, float(self._mm_labels[idx])
+        token_ids, score = self.samples[idx]
+        return torch.tensor(token_ids, dtype=torch.long), score
+    @classmethod
+    def from_samples(cls, samples, tokenizer: Tokenizer):
+        """Build a dataset from pre-generated (token_ids, score) samples."""
+        inst = cls.__new__(cls)
+        inst.tokenizer = tokenizer
+        inst.cls_id = tokenizer.symbol_to_token[CLS_TOKEN]
+        inst.samples = list(samples)
+        inst._memmap = False
+        return inst
+    @classmethod
+    def from_file(cls, samples_path: str, tokenizer: Tokenizer):
+        """Load (token_ids, score) samples from a torch.save file."""
+        samples = torch.load(samples_path, weights_only=False)
+        return cls.from_samples(samples, tokenizer)
+    @classmethod
+    def from_memmap(cls, out_dir: Path, name: str, tokenizer: Tokenizer):
+        """Load pre-padded samples from memory-mapped arrays (fast DataLoader path).
+        If a sibling file `{name}_test_indices.npy` exists, those indices are
+        excluded from this dataset — used to make training disjoint from the
+        held-out test split that shares the same underlying .bin file.
+        """
+        meta = torch.load(out_dir / f"{name}_meta.pt", weights_only=True)
+        n, max_len = meta["n"], meta["max_len"]
+        inst = cls.__new__(cls)
+        inst.tokenizer = tokenizer
+        inst.cls_id = tokenizer.symbol_to_token[CLS_TOKEN]
+        inst._memmap = True
+        inst._mm_tokens = np.memmap(out_dir / f"{name}_tokens.bin", dtype=np.int32, mode="r", shape=(n, max_len))
+        inst._mm_labels = np.memmap(out_dir / f"{name}_labels.bin", dtype=np.float32, mode="r", shape=(n,))
+        inst._mm_lengths = np.memmap(out_dir / f"{name}_lengths.bin", dtype=np.int32, mode="r", shape=(n,))
+        inst._train_idx = _load_train_idx(out_dir, name, n)
+        return inst
+# ---------------------------------------------------------------------------
+# Parallel Stockfish-backed sample generation.
+#
+# One Stockfish subprocess per worker process. Each worker:
+#   1. receives a game + its index (used to seed the per-game NumPy sampling RNG)
+#   2. replays the game, samples positions, tokenizes move prefixes
+#   3. evaluates each sampled position with its own Stockfish engine
+#   4. returns a list of (token_ids, score) tuples
+#
+# The main process collects results via imap_unordered and flattens them.
+# ---------------------------------------------------------------------------
+# Module-level state populated by _init_worker in each spawned process.
+_worker_engine = None
+_worker_tokenizer = None
+_worker_cls_id = None
+_worker_sample_rate = None
+_worker_skew = None
+_worker_depth = None
+def _shutdown_worker():
+    """Called at worker exit to cleanly quit the Stockfish engine."""
+    global _worker_engine
+    if _worker_engine is not None:
+        try:
+            _worker_engine.quit()
+        except Exception:
+            pass
+        _worker_engine = None
+def _init_worker(engine_path, depth, tokenizer, cls_id, sample_rate, skew_exponent):
+    """Pool initializer: create one Stockfish engine per worker.
+    If engine_path is None, workers fall back to material_eval. This lets
+    tests exercise the parallel machinery without requiring Stockfish.
+    """
+    global _worker_engine, _worker_tokenizer, _worker_cls_id
+    global _worker_sample_rate, _worker_skew, _worker_depth
+    _worker_tokenizer = tokenizer
+    _worker_cls_id = cls_id
+    _worker_sample_rate = sample_rate
+    _worker_skew = skew_exponent
+    _worker_depth = depth
+    if engine_path is not None:
+        _worker_engine = chess.engine.SimpleEngine.popen_uci(engine_path)
+        atexit.register(_shutdown_worker)
+    else:
+        _worker_engine = None
+def _worker_eval(board: chess.Board) -> float:
+    if _worker_engine is None:
+        return material_eval(board)
+    info = _worker_engine.analyse(board, chess.engine.Limit(depth=_worker_depth))
+    score = info["score"].white()
+    if score.is_mate():
+        return 1.0 if score.mate() > 0 else -1.0
+    return normalize_cp(score.score())
+def _process_game(game_with_seed):
+    """Worker task: parse, replay, sample, tokenize, and evaluate one game."""
+    game, seed = game_with_seed
+    movetext = game.get("movetext", "")
+    if not movetext:
+        return []
+    move_sans = parse_movetext(movetext)
+    if len(move_sans) < 2:
+        return []
+    eligible = list(range(len(move_sans)))
+    num_positions = max(1, int(len(move_sans) * _worker_sample_rate))
+    sample_indices = _weighted_sample(eligible, num_positions, _worker_skew, seed=seed)
+    samples = []
+    board = chess.Board()
+    valid_moves = []
+    for i, san in enumerate(move_sans):
+        try:
+            move = board.parse_san(san)
+            board.push(move)
+            valid_moves.append(move.uci())
+        except (chess.InvalidMoveError, chess.AmbiguousMoveError):
+            break
+        if i in sample_indices:
+            token_ids = [_worker_cls_id] + _worker_tokenizer.encode_moves(valid_moves)
+            score = _worker_eval(board)
+            samples.append((token_ids, score))
+    return samples
+def generate_samples_stockfish_parallel(
+    games: list[dict],
+    tokenizer: Tokenizer,
+    num_workers: int = 8,
+    stockfish_depth: int = 12,
+    sample_rate: float = 0.25,
+    skew_exponent: float = 1.5,
+    engine_path: str | None = STOCKFISH_PATH,
+    chunksize: int = 8,
+    progress_every: int = 1000,
+) -> list[tuple[list[int], float]]:
+    """Parallel Stockfish-backed sample generation.
+    Spawns `num_workers` processes, each owning one Stockfish subprocess.
+    If `engine_path` is None, workers use material_eval instead of Stockfish
+    (used by tests to verify the parallel machinery without the binary).
+    Each game contributes `max(1, game_length * sample_rate)` positions,
+    weighted toward mid/late game by `skew_exponent`. Sampling is seeded
+    per-game-index for determinism across runs and worker counts.
+    """
+    cls_id = tokenizer.symbol_to_token[CLS_TOKEN]
+    tasks = [(game, idx) for idx, game in enumerate(games)]
+    # spawn context: safest across macOS/Linux and avoids fork-safety issues
+    # with chess.engine's subprocess.
+    ctx = mp.get_context("spawn")
+    samples: list[tuple[list[int], float]] = []
+    with ctx.Pool(
+        processes=num_workers,
+        initializer=_init_worker,
+        initargs=(engine_path, stockfish_depth, tokenizer, cls_id, sample_rate, skew_exponent),
+    ) as pool:
+        for i, game_samples in enumerate(
+            pool.imap_unordered(_process_game, tasks, chunksize=chunksize)
+        ):
+            samples.extend(game_samples)
+            if progress_every and (i + 1) % progress_every == 0:
+                print(
+                    f"  processed {i + 1:,}/{len(games):,} games, "
+                    f"{len(samples):,} positions..."
+                )
+    return samples
+def collate_fn(batch):
+    """Pad token sequences and create attention mask."""
+    tokens, labels = zip(*batch)
+    max_len = max(len(t) for t in tokens)
+    padded = torch.zeros(len(tokens), max_len, dtype=torch.long)
+    attention_mask = torch.ones(len(tokens), max_len, dtype=torch.bool)  # True = masked
+    for i, t in enumerate(tokens):
+        padded[i, :len(t)] = t
+        attention_mask[i, :len(t)] = False
+    labels_tensor = torch.tensor(labels, dtype=torch.float)
+    return padded, attention_mask, labels_tensor
+def collate_fn_memmap(batch):
+    """Collate pre-padded memmap samples — just stack, no per-batch padding needed."""
+    tokens, masks, labels = zip(*batch)
+    return torch.stack(tokens), torch.stack(masks), torch.tensor(labels, dtype=torch.float)
+def collate_fn_policy(batch):
+    """Pad token sequences and per-position board planes for policy training.
+    Each batch element is (tokens, planes, weight, source_tag) where
+    `tokens` is shape (L,) long and `planes` is shape (L, 19, 8, 8) float —
+    one set of board planes per position in the sequence. We pad both
+    along the sequence dimension to the batch's max length. Padded
+    positions get zero token, zero planes, and mask=True; downstream
+    loss masking ignores them.
+    Returns (padded_tokens, attention_mask, planes, weights, sources).
+    """
+    tokens_list, planes_list, weights_list, sources_list = zip(*batch)
+    B = len(tokens_list)
+    max_len = max(len(t) for t in tokens_list)
+    padded = torch.zeros(B, max_len, dtype=torch.long)
+    mask = torch.ones(B, max_len, dtype=torch.bool)  # True = padded
+    planes = torch.zeros(B, max_len, 19, 8, 8)
+    for i, (t, p) in enumerate(zip(tokens_list, planes_list)):
+        L = len(t)
+        padded[i, :L] = t
+        mask[i, :L] = False
+        planes[i, :L] = p
+    weights = torch.tensor(weights_list, dtype=torch.float)
+    sources = torch.tensor(sources_list, dtype=torch.long)
+    return padded, mask, planes, weights, sources
+class MixedBatchSampler(torch.utils.data.Sampler):
+    """Hard-balanced sampler over a ConcatDataset([games, puzzles]).
+    Each batch contains exactly `n_game_per_batch` game indices (drawn from
+    [0, n_game)) and `n_puzzle_per_batch` puzzle indices (drawn from
+    [n_game, n_game + n_puzzle)). Both pools are shuffled and consumed in
+    parallel; when the smaller (puzzle) pool runs out it gets re-shuffled,
+    so puzzles are effectively oversampled to match the game stream.
+    This guarantees a consistent gradient signal per batch and prevents the
+    puzzle samples from being statistical outliers under BatchNorm (already
+    moot now that the CNN uses GroupNorm, but still matters for loss-level
+    balance).
+    """
+    def __init__(
+        self,
+        n_game: int,
+        n_puzzle: int,
+        batch_size: int,
+        game_ratio: float = 0.8,
+        drop_last: bool = True,
+    ):
+        self.n_game = n_game
+        self.n_puzzle = n_puzzle
+        self.batch_size = batch_size
+        self.n_game_per_batch = max(1, int(round(batch_size * game_ratio)))
+        self.n_puzzle_per_batch = batch_size - self.n_game_per_batch
+        self.drop_last = drop_last
+    def __iter__(self):
+        game_perm = torch.randperm(self.n_game).tolist()
+        puzzle_perm = torch.randperm(self.n_puzzle).tolist() if self.n_puzzle > 0 else []
+        gi, pi = 0, 0
+        for _ in range(len(self)):
+            if gi + self.n_game_per_batch > self.n_game:
+                game_perm = torch.randperm(self.n_game).tolist()
+                gi = 0
+            if self.n_puzzle_per_batch > 0 and pi + self.n_puzzle_per_batch > self.n_puzzle:
+                puzzle_perm = torch.randperm(self.n_puzzle).tolist()
+                pi = 0
+            batch = []
+            for _ in range(self.n_game_per_batch):
+                batch.append(game_perm[gi]); gi += 1
+            for _ in range(self.n_puzzle_per_batch):
+                batch.append(self.n_game + puzzle_perm[pi]); pi += 1
+            yield batch
+    def __len__(self):
+        # One pass over the (more numerous) game pool defines an epoch.
+        return self.n_game // self.n_game_per_batch
+class ChessPolicyDataset(Dataset):
+    """Full game sequences for next-move prediction training.
+    Each sample yields (token_ids, board_planes, weight, source_tag):
+    - token_ids:    full tokenized sequence [CLS, m1, m2, ..., mN]
+    - board_planes: (L, 19, 8, 8) tensor of per-position planes built by
+                    replaying the move sequence. planes[0] is the starting
+                    board (the standard chess start for games, the puzzle
+                    FEN for puzzles); planes[t] is the board state after
+                    token_ids[1..t] have been played. This is the
+                    information-leak-safe per-position anchor that lets the
+                    model cross-attend to the live board at every step.
+    - weight:       per-sample loss weight (1.0 for games, default 5.0 for
+                    puzzles) so puzzle samples have outsized gradient pull.
+    - source_tag:   0 = game, 1 = puzzle. Used by the mixed training loop to
+                    mask the setup-move target on puzzle samples.
+    """
+    def __init__(self, games: list[dict], tokenizer: Tokenizer, max_seq_len: int = 128):
+        cls_id = tokenizer.symbol_to_token[CLS_TOKEN]
+        self.tokenizer = tokenizer
+        self._memmap = False
+        self._train_idx: np.ndarray | None = None
+        self._mm_fens = None
+        self._fen_len = None
+        self.source_tag: int = 0
+        self.loss_weight: float = 1.0
+        self.samples: list[list[int]] = []
+        for game in games:
+            movetext = game.get("movetext", "")
+            if not movetext:
+                continue
+            move_sans = parse_movetext(movetext)
+            if len(move_sans) < 2:
+                continue
+            board = chess.Board()
+            move_ucis: list[str] = []
+            for san in move_sans:
+                try:
+                    move = board.parse_san(san)
+                    board.push(move)
+                    move_ucis.append(move.uci())
+                except (chess.InvalidMoveError, chess.AmbiguousMoveError):
+                    break
+            if len(move_ucis) < 2:
+                continue
+            move_ucis = move_ucis[:max_seq_len - 1]  # reserve slot for CLS
+            self.samples.append([cls_id] + tokenizer.encode_moves(move_ucis))
+    def _get_start_board(self, idx: int) -> chess.Board:
+        """Resolve the starting board for the per-position replay.
+        Puzzles with a `{name}_fens.bin` sidecar use the puzzle's FEN.
+        Everything else (games, puzzles without FENs) starts from the
+        standard chess starting position. A corrupt FEN silently falls
+        back to the starting position so the loader doesn't crash.
+        """
+        if self._memmap and self._mm_fens is not None:
+            fen_bytes = bytes(self._mm_fens[idx])
+            fen_str = fen_bytes.rstrip(b"\x00").decode("ascii")
+            try:
+                return chess.Board(fen_str)
+            except ValueError:
+                return chess.Board()
+        return chess.Board()
+    def __len__(self) -> int:
+        if self._memmap:
+            if self._train_idx is not None:
+                return len(self._train_idx)
+            return len(self._mm_lengths)
+        return len(self.samples)
+    def __getitem__(self, idx: int):
+        if self._memmap:
+            if self._train_idx is not None:
+                idx = int(self._train_idx[idx])
+            length = int(self._mm_lengths[idx])
+            tokens = torch.from_numpy(np.array(self._mm_tokens[idx, :length], dtype=np.int32)).long()
+        else:
+            tokens = torch.tensor(self.samples[idx], dtype=torch.long)
+        start_board = self._get_start_board(idx)
+        planes = self._replay_planes(tokens.tolist(), start_board)
+        return tokens, planes, self.loss_weight, self.source_tag
+    @classmethod
+    def from_memmap(
+        cls,
+        out_dir: Path,
+        tokenizer: Tokenizer,
+        name: str = "policy",
+        source_tag: int = 0,
+        loss_weight: float = 1.0,
+    ):
+        """Load pre-tokenized policy sequences from memory-mapped arrays.
+        Args:
+            name:        filename prefix; use 'puzzle' to load puzzle_*.bin files.
+            source_tag:  0 for game data, 1 for puzzle data (drives setup-move
+                         masking in the mixed training loop).
+            loss_weight: per-sample weight applied to this dataset's samples in
+                         the weighted cross-entropy loss.
+        If a sibling file `{name}_fens.bin` exists, FENs are loaded and used
+        to reconstruct each sample's starting-board planes. Otherwise the
+        standard chess starting position is used.
+        If `{name}_test_indices.npy` exists, those indices are excluded from
+        this dataset — used to make training disjoint from the held-out test
+        split that shares the same underlying .bin file.
+        """
+        meta = torch.load(out_dir / f"{name}_meta.pt", weights_only=True)
+        n, max_len = meta["n"], meta["max_len"]
+        inst = cls.__new__(cls)
+        inst._memmap = True
+        inst._mm_tokens = np.memmap(out_dir / f"{name}_tokens.bin", dtype=np.int32, mode="r", shape=(n, max_len))
+        inst._mm_lengths = np.memmap(out_dir / f"{name}_lengths.bin", dtype=np.int32, mode="r", shape=(n,))
+        inst._train_idx = _load_train_idx(out_dir, name, n)
+        fen_path = out_dir / f"{name}_fens.bin"
+        if fen_path.exists() and "fen_len" in meta:
+            inst._mm_fens = np.memmap(fen_path, dtype=np.uint8, mode="r", shape=(n, meta["fen_len"]))
+            inst._fen_len = meta["fen_len"]
+        else:
+            inst._mm_fens = None
+            inst._fen_len = None
+            if source_tag == 1:
+                # Puzzle data without FENs: CNN will see the standard starting
+                # position for every puzzle, which is wrong. Loud warning.
+                print(
+                    f"WARNING: {name}_fens.bin not found — puzzle samples will "
+                    f"feed the starting-position planes to the CNN, defeating "
+                    f"the point of puzzle conditioning. Rebuild with the "
+                    f"updated build_datasets.py to fix."
+                )
+        inst.tokenizer = tokenizer
+        inst.source_tag = source_tag
+        inst.loss_weight = loss_weight
+        return inst
+    def _replay_planes(self, token_ids: list[int], start_board: chess.Board) -> torch.Tensor:
+        """Returns (L, 19, 8, 8) tensor of board planes per position.
+        plane_t = state of the board after token_ids[1..t] have been played.
+        plane_0 = start_board (the model has only seen [CLS] at that point).
+        If a token in the sequence isn't a parseable UCI move (corrupt
+        data, non-move special token mid-stream), we freeze planes at the
+        last valid state and return. The loss already masks padded targets,
+        so the worst case is a few positions with stale board input rather
+        than a crashed worker.
+        """
+        L = len(token_ids)
+        planes = torch.zeros(L, 19, 8, 8)
+        board = start_board.copy()
+        planes[0] = board_to_planes(board)
+        for t in range(1, L):
+            uci = self.tokenizer.token_to_symbol[int(token_ids[t])]
+            try:
+                board.push(chess.Move.from_uci(uci))
+            except (chess.InvalidMoveError, ValueError):
+                planes[t:] = planes[t - 1]
+                return planes
+            planes[t] = board_to_planes(board)
+        return planes
+def _fmt_duration(seconds: float) -> str:
+    h, m = divmod(int(seconds), 3600)
+    m, s = divmod(m, 60)
+    return f"{h}h {m:02d}m {s:02d}s" if h else f"{m}m {s:02d}s"
+def _amp_ctx(device):
+    """BF16 autocast on CUDA, no-op elsewhere.
+    BF16 is preferred over FP16 here: same dynamic range as FP32 (no GradScaler
+    needed) and full tensor-core acceleration on Ampere+ / Ada / Blackwell.
+    On Blackwell (RTX PRO 6000 / B200) this typically gives 2-3x training speedup
+    on transformer matmuls.
+    """
+    dev = device if isinstance(device, str) else getattr(device, "type", "cpu")
+    if dev == "cuda":
+        return torch.autocast(device_type="cuda", dtype=torch.bfloat16)
+    return contextlib.nullcontext()
def _run_epoch_reward(model, loader, optimizer, device, writer, global_step, epoch_idx):
    """Run one reward-model training epoch: MSE against the loader's labels.

    Args:
        model: Called as ``model(tokens, attention_mask=mask)``.
        loader: Sized iterable yielding ``(tokens, mask, labels)`` batches.
        optimizer: Optimizer stepping ``model``'s parameters.
        device: Device the batches are moved to; also selects autocast.
        writer: Object with ``add_scalar(tag, value, step)`` (TensorBoard-style).
        global_step: Running batch counter used as the x-axis of batch scalars.
        epoch_idx: Step value used for the per-epoch loss scalar.

    Returns:
        ``(avg_loss, global_step, epoch_elapsed)``: mean batch loss over the
        epoch (0.0 for an empty loader), the advanced step counter, and the
        epoch's wall-clock duration in seconds.
    """
    model.train()
    total_loss = 0.0
    samples_seen = 0
    n_batches = len(loader)
    log_every = max(1, n_batches // 20)  # roughly 20 progress lines per epoch
    epoch_start = time.time()
    for i, (batch_tokens, batch_mask, batch_labels) in enumerate(loader):
        batch_tokens = batch_tokens.to(device)
        batch_mask = batch_mask.to(device)
        batch_labels = batch_labels.to(device)
        with _amp_ctx(device):
            predictions = model(batch_tokens, attention_mask=batch_mask)
            loss = F.mse_loss(predictions, batch_labels)
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        # Read the scalar once; each .item() forces a device->host sync.
        loss_value = loss.item()
        total_loss += loss_value
        # Count real samples so throughput stays right with a short last batch.
        samples_seen += batch_tokens.size(0)
        writer.add_scalar("train/reward_batch_loss", loss_value, global_step)
        global_step += 1
        batches_done = i + 1
        if batches_done % log_every == 0 or batches_done == n_batches:
            # Floor avoids a zero division on very fast (or coarse-clock) runs.
            elapsed = max(time.time() - epoch_start, 1e-9)
            eta = elapsed / batches_done * (n_batches - batches_done)
            samples_per_sec = samples_seen / elapsed
            avg_so_far = total_loss / batches_done
            print(
                f"    batch {batches_done:,}/{n_batches:,}  "
                f"loss={avg_so_far:.4f}  "
                f"{samples_per_sec:,.0f} samples/s  "
                f"eta {_fmt_duration(eta)}"
            )
    epoch_elapsed = time.time() - epoch_start
    # max(..., 1) keeps an empty loader from raising ZeroDivisionError, matching
    # the policy epoch runner.
    avg = total_loss / max(n_batches, 1)
    writer.add_scalar("train/reward_epoch_loss", avg, epoch_idx)
    return avg, global_step, epoch_elapsed
def _run_epoch_policy_mixed(
    model, loader, optimizer, device, writer, global_step, epoch_idx, pad_id,
):
    """Run one policy training epoch over mixed game + puzzle batches.

    Loader yields (tokens, mask, planes, weights, sources). For each batch:
    1. Inputs are all positions but the last (tokens, mask and planes are
       sliced together); targets are the tokens shifted left by one.
    2. Per-position cross-entropy is taken at every non-padded target.
    3. For puzzle rows (source == 1) the first target, the setup move, is
       replaced by ``pad_id`` so it acts as context, not a prediction target.
    4. The loss is a weighted mean: each position counts with its sample's
       weight, normalised by the total weight of unmasked positions.

    Args:
        model: Called as ``model(tokens, planes, attention_mask=mask)`` and
            expected to return logits of shape ``(B, T, V)``.
        loader: Sized iterable of the 5-tuples described above.
        optimizer: Optimizer stepping ``model``'s parameters.
        device: Device the batches are moved to; also selects autocast.
        writer: Object with ``add_scalar(tag, value, step)``.
        global_step: Running batch counter used for per-batch scalars.
        epoch_idx: Step value used for the per-epoch loss scalar.
        pad_id: Token id ignored by the loss.

    Returns:
        ``(avg_loss, global_step, epoch_elapsed)``.
    """
    model.train()
    total_loss = 0.0
    samples_seen = 0
    n_batches = len(loader)
    log_every = max(1, n_batches // 20)  # roughly 20 progress lines per epoch
    epoch_start = time.time()
    for i, (batch_tokens, batch_mask, batch_planes, batch_weights, batch_sources) in enumerate(loader):
        batch_tokens = batch_tokens.to(device, non_blocking=True)
        batch_mask = batch_mask.to(device, non_blocking=True)
        batch_planes = batch_planes.to(device, non_blocking=True)
        batch_weights = batch_weights.to(device, non_blocking=True)
        batch_sources = batch_sources.to(device, non_blocking=True)
        input_tokens = batch_tokens[:, :-1]
        input_mask = batch_mask[:, :-1]
        input_planes = batch_planes[:, :-1]  # planes are per-position; slice with tokens
        targets = batch_tokens[:, 1:].contiguous()
        # Mask the setup-move target (position 0 of the shifted target) for
        # puzzle rows — it's the opponent's forcing move given as context.
        is_puzzle = (batch_sources == 1)
        if is_puzzle.any():
            # Clone first so the write can never alias batch_tokens.
            targets = targets.clone()
            targets[is_puzzle, 0] = pad_id
        with _amp_ctx(device):
            logits = model(input_tokens, input_planes, attention_mask=input_mask)
            B, T, V = logits.shape
            ce = F.cross_entropy(
                logits.reshape(-1, V),
                targets.reshape(-1),
                ignore_index=pad_id,
                reduction="none",
            ).reshape(B, T)
            position_mask = (targets != pad_id).float()
            sample_weights = batch_weights.unsqueeze(1)
            weighted = ce * position_mask * sample_weights
            denom = (position_mask * sample_weights).sum().clamp(min=1.0)
            loss = weighted.sum() / denom
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        # Read the scalar once; each .item() forces a device->host sync.
        loss_value = loss.item()
        total_loss += loss_value
        # Count real samples so throughput stays right with a short last batch.
        samples_seen += batch_tokens.size(0)
        writer.add_scalar("train_policy/batch_loss", loss_value, global_step)
        global_step += 1
        batches_done = i + 1
        if batches_done % log_every == 0 or batches_done == n_batches:
            # Floor avoids a zero division on very fast (or coarse-clock) runs.
            elapsed = max(time.time() - epoch_start, 1e-9)
            eta = elapsed / batches_done * (n_batches - batches_done)
            samples_per_sec = samples_seen / elapsed
            avg_so_far = total_loss / batches_done
            print(
                f"    batch {batches_done:,}/{n_batches:,}  "
                f"loss={avg_so_far:.4f}  "
                f"{samples_per_sec:,.0f} samples/s  "
                f"eta {_fmt_duration(eta)}"
            )
    epoch_elapsed = time.time() - epoch_start
    avg = total_loss / max(n_batches, 1)
    writer.add_scalar("train_policy/epoch_loss", avg, epoch_idx)
    return avg, global_step, epoch_elapsed
def eval_reward(model, loader, device) -> dict:
    """Score the reward model on a held-out loader.

    Runs the model in eval mode without gradients over every
    ``(tokens, mask, labels)`` batch, gathers predictions (cast to float32 on
    the CPU) and labels, then computes the metrics over the full set at once.

    Returns:
        Dict with keys ``"mse"``, ``"mae"`` and ``"pearson_r"`` (Python floats).
    """
    model.eval()
    pred_chunks = []
    label_chunks = []
    with torch.no_grad(), _amp_ctx(device):
        for tokens, mask, batch_y in loader:
            out = model(tokens.to(device), attention_mask=mask.to(device))
            pred_chunks.append(out.float().cpu())
            label_chunks.append(batch_y)
    preds = torch.cat(pred_chunks)
    labels = torch.cat(label_chunks)

    mse = F.mse_loss(preds, labels).item()
    mae = (preds - labels).abs().mean().item()

    # Pearson r: dot product of the mean-centred vectors over the product of
    # their norms; the clamp keeps a constant vector from dividing by zero.
    pred_dev = preds - preds.mean()
    label_dev = labels - labels.mean()
    scale = (pred_dev.norm() * label_dev.norm()).clamp(min=1e-8)
    correlation = (pred_dev * label_dev).sum() / scale

    return {"mse": mse, "mae": mae, "pearson_r": correlation.item()}
def eval_policy(model, loader, device, pad_id: int) -> dict:
    """Score the policy model on a held-out loader.

    Loader yields (tokens, mask, planes, weights, sources); weights and
    sources are unused, so every non-padded target position counts equally.

    Returns:
        Dict with ``"loss"`` (mean cross-entropy per non-padded position),
        ``"perplexity"`` (exp of the loss, with the exponent capped at 20),
        ``"top1_acc"`` and ``"top5_acc"``.
    """
    model.eval()
    loss_sum = 0.0
    hits_top1 = 0
    hits_top5 = 0
    n_positions = 0
    with torch.no_grad(), _amp_ctx(device):
        for tokens, mask, planes, _weights, _sources in loader:
            tokens = tokens.to(device)
            mask = mask.to(device)
            planes = planes.to(device)

            # Next-token setup: feed everything but the last position and
            # predict the sequence shifted left by one.
            logits = model(tokens[:, :-1], planes[:, :-1], attention_mask=mask[:, :-1])
            targets = tokens[:, 1:].contiguous()

            flat_logits = logits.reshape(-1, logits.size(-1))
            flat_targets = targets.reshape(-1)
            keep = flat_targets != pad_id

            batch_loss = F.cross_entropy(
                flat_logits, flat_targets, ignore_index=pad_id, reduction="sum",
            )
            loss_sum += batch_loss.item()
            n_positions += keep.sum().item()

            kept_targets = flat_targets[keep]
            top5 = flat_logits[keep].topk(5, dim=-1).indices
            hits_top1 += (top5[:, 0] == kept_targets).sum().item()
            hits_top5 += (top5 == kept_targets.unsqueeze(1)).any(dim=1).sum().item()

    denom = max(n_positions, 1)  # avoid 0/0 when nothing was scored
    avg_loss = loss_sum / denom
    return {
        "loss": avg_loss,
        "perplexity": math.exp(min(avg_loss, 20)),
        "top1_acc": hits_top1 / denom,
        "top5_acc": hits_top5 / denom,
    }
def eval_puzzle_solve_rate(model, loader, device, pad_id: int) -> dict:
    """Evaluate puzzle solve rate: share of solver positions where the model's
    top-1 prediction matches the ground-truth token.

    Sequence layout: [CLS, setup, solver1, opp1, solver2, ...]
    Solver moves are at token positions 2, 4, 6, ... and are predicted by
    logit positions 1, 3, 5, ... The setup move at token position 1 (logit 0)
    is excluded — it's context, not a prediction target. Padded target
    positions are not counted.

    Loader yields (tokens, mask, planes, weights, sources); weights and
    sources are ignored.

    Returns:
        Dict with ``"first_move_solve_rate"`` (first solver move only) and
        ``"all_moves_solve_rate"`` (every solver move); both are 0.0 when no
        position was scored.
    """
    model.eval()
    first_correct = 0
    first_total = 0
    all_correct = 0
    all_total = 0
    with torch.no_grad(), _amp_ctx(device):
        for batch_tokens, batch_mask, batch_planes, _, _ in loader:
            batch_tokens = batch_tokens.to(device)
            batch_mask = batch_mask.to(device)
            batch_planes = batch_planes.to(device)
            input_tokens = batch_tokens[:, :-1]
            input_mask = batch_mask[:, :-1]
            input_planes = batch_planes[:, :-1]
            logits = model(input_tokens, input_planes, attention_mask=input_mask)
            # Strided slices pick every solver slot at once: targets at token
            # positions 2, 4, ... line up with logits at positions 1, 3, ...
            # This replaces a per-position Python loop and its dead
            # `solver_token_pos >= seq_len` guard.
            solver_targets = batch_tokens[:, 2::2]
            if solver_targets.size(1) == 0:
                continue  # sequences too short to contain a solver move
            solver_preds = logits[:, 1::2].argmax(dim=-1)
            valid = solver_targets != pad_id
            hits = (solver_preds == solver_targets) & valid
            all_correct += hits.sum().item()
            all_total += valid.sum().item()
            # Column 0 is the first solver move (logit position 1).
            first_correct += hits[:, 0].sum().item()
            first_total += valid[:, 0].sum().item()
    return {
        "first_move_solve_rate": first_correct / max(first_total, 1),
        "all_moves_solve_rate": all_correct / max(all_total, 1),
    }
def train(
    tokenizer_path,
    stockfish_samples_path,
    outcome_games_path,
    epochs,
    policy_epochs,
    batch_size,
    learning_rate,
    max_seq_len,
    log_dir,
    num_workers,
    puzzle_data_dir=None,
    puzzle_epochs=5,  # kept for CLI compat; no longer used (mixed training merges phases)
    puzzle_loss_weight=5.0,
    puzzle_ratio=0.2,
    skip_reward=False,
    keep_last_n_checkpoints=3,
):
    """Train the reward model then the policy model.
    Phase 1: MSE on Stockfish-labeled positions (reward model).
    Phase 2: Mixed game + puzzle policy training. Each batch is hard-balanced
        at `puzzle_ratio` (default 20% puzzle) and puzzle samples carry a
        `puzzle_loss_weight` (default 5x) in the weighted cross-entropy loss.
        Games feed the CNN the standard chess starting board (constant signal,
        effectively a no-op); puzzles feed the FEN-derived board.
    If `skip_reward` is True, Phase 1 is skipped entirely — the reward dataset
    is not loaded, no reward model is created, and `reward_model.pt` on disk is
    untouched. Use this for iterating on Phase 2 without burning hours on a
    Phase 1 that hasn't changed.
    Requires stockfish memmap files, outcome games, and (for mixed training)
    puzzle memmaps with FENs built by src/build_datasets.py.
    """
    # NOTE: the docstring above is also the CLI description
    # (`description=train.__doc__` in `_build_argparser`), so its wording is
    # user-facing. Parameter notes therefore live in this comment:
    #   tokenizer_path          pickled tokenizer loaded with torch.load.
    #   stockfish_samples_path  reward data file; its parent directory is also
    #                           searched for stockfish_* memmaps and test sets.
    #   outcome_games_path      game data file; its parent directory is also
    #                           searched for policy_* / puzzle_* memmaps.
    #   epochs / policy_epochs  epoch counts for Phase 1 / Phase 2.
    #   batch_size, learning_rate, max_seq_len, num_workers
    #                           passed to the DataLoaders, AdamW and the models.
    #   log_dir                 TensorBoard log directory.
    #   puzzle_data_dir         optional directory holding the puzzle memmaps.
    #   keep_last_n_checkpoints per-epoch checkpoints kept on disk (0 = all).
    # Returns (reward_model, policy_model, tokenizer); reward_model is None
    # when Phase 1 is skipped.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    amp_dtype = "bfloat16 autocast" if device == "cuda" else "fp32 (CPU)"
    print(f"Using device: {device} ({amp_dtype})")
    # Pinned host memory only speeds up host->GPU copies; requesting it on a
    # CPU-only machine just produces a DataLoader warning.
    pin_memory = device == "cuda"
    print(f"Loading tokenizer from {tokenizer_path}...")
    # SECURITY: weights_only=False unpickles arbitrary Python objects. Only
    # load tokenizer files that come from a trusted source.
    tokenizer = torch.load(tokenizer_path, weights_only=False)
    vocab_size = tokenizer.language_size
    pad_id = tokenizer.symbol_to_token[PAD_TOKEN]
    writer = SummaryWriter(log_dir=log_dir)
    # ── Test loaders (optional, skip silently if test sets not built yet) ───────
    out_dir = Path(stockfish_samples_path).parent
    reward_test_loader = None
    if not skip_reward and (out_dir / "stockfish_test_meta.pt").exists():
        reward_test_ds = ChessPositionDataset.from_memmap(out_dir, "stockfish_test", tokenizer)
        reward_test_loader = DataLoader(
            reward_test_ds, batch_size=batch_size, shuffle=False, collate_fn=collate_fn_memmap,
            num_workers=num_workers, pin_memory=pin_memory,
        )
        print(f"Reward test set: {len(reward_test_ds):,} positions loaded")
    policy_data_dir = Path(outcome_games_path).parent
    policy_test_loader = None
    if (policy_data_dir / "policy_test_meta.pt").exists():
        policy_test_ds = ChessPolicyDataset.from_memmap(policy_data_dir, tokenizer, name="policy_test")
        policy_test_loader = DataLoader(
            policy_test_ds, batch_size=batch_size, shuffle=False, collate_fn=collate_fn_policy,
            num_workers=num_workers, pin_memory=pin_memory,
        )
        print(f"Policy test set: {len(policy_test_ds):,} sequences loaded")
    # Puzzle files live in --puzzle-data when given, otherwise next to the
    # policy files. The same directory serves the test and training sets.
    puzzle_dir = Path(puzzle_data_dir) if puzzle_data_dir is not None else policy_data_dir
    puzzle_test_loader = None
    if (puzzle_dir / "puzzle_test_meta.pt").exists():
        puzzle_test_ds = ChessPolicyDataset.from_memmap(
            puzzle_dir, tokenizer, name="puzzle_test",
            source_tag=1, loss_weight=1.0,  # eval uses uniform weighting
        )
        puzzle_test_loader = DataLoader(
            puzzle_test_ds, batch_size=batch_size, shuffle=False, collate_fn=collate_fn_policy,
            num_workers=num_workers, pin_memory=pin_memory,
        )
        print(f"Puzzle test set: {len(puzzle_test_ds):,} sequences loaded")
    # ── Phase 1: reward model ────────────────────────────────────────────────
    reward_model = None
    global_step = 0
    if skip_reward:
        print("\n── Phase 1: SKIPPED (--skip-reward) — existing reward_model.pt untouched.")
    else:
        sf_meta = out_dir / "stockfish_meta.pt"
        if sf_meta.exists():
            print(f"Loading Stockfish samples from memmap ({out_dir}/stockfish_*)...")
            sf_ds = ChessPositionDataset.from_memmap(out_dir, "stockfish", tokenizer)
            sf_collate = collate_fn_memmap
        else:
            print(f"Loading Stockfish samples from {stockfish_samples_path}...")
            sf_ds = ChessPositionDataset.from_file(stockfish_samples_path, tokenizer)
            sf_collate = collate_fn
        print(f"Reward dataset: {len(sf_ds):,} positions")
        sf_loader = DataLoader(
            sf_ds, batch_size=batch_size, shuffle=True, collate_fn=sf_collate,
            num_workers=num_workers, pin_memory=pin_memory,
        )
        reward_model = ChessRewardModel(vocab_size=vocab_size, max_seq_len=max_seq_len).to(device)
        reward_optimizer = torch.optim.AdamW(reward_model.parameters(), lr=learning_rate)
        print(f"\n── Phase 1: reward model — {epochs} epochs, lr={learning_rate}")
        phase1_start = time.time()
        for epoch in range(epochs):
            epoch_num = epoch + 1
            print(f"  [epoch {epoch_num}/{epochs}] starting...")
            avg_loss, global_step, epoch_secs = _run_epoch_reward(
                reward_model, sf_loader, reward_optimizer, device, writer, global_step, epoch
            )
            epochs_left = epochs - epoch_num
            print(
                f"  [epoch {epoch_num}/{epochs}]  "
                f"loss={avg_loss:.4f}  "
                f"epoch_time={_fmt_duration(epoch_secs)}  "
                f"eta={_fmt_duration(epoch_secs * epochs_left)}"
            )
            if reward_test_loader is not None:
                m = eval_reward(reward_model, reward_test_loader, device)
                writer.add_scalar("test/reward_mse", m["mse"], epoch_num)
                writer.add_scalar("test/reward_mae", m["mae"], epoch_num)
                writer.add_scalar("test/reward_pearson_r", m["pearson_r"], epoch_num)
                print(
                    f"  [test]  mse={m['mse']:.4f}  mae={m['mae']:.4f}  r={m['pearson_r']:.4f}"
                )
        print(f"Phase 1 complete in {_fmt_duration(time.time() - phase1_start)}")
        torch.save(reward_model.state_dict(), "reward_model.pt")
        print("Reward model saved to reward_model.pt")
    # ── Phase 2: mixed game + puzzle policy training ─────────────────────────
    policy_meta = policy_data_dir / "policy_meta.pt"
    if policy_meta.exists():
        print(f"Loading policy sequences from memmap ({policy_data_dir}/policy_*)...")
        game_ds = ChessPolicyDataset.from_memmap(
            policy_data_dir, tokenizer, name="policy",
            source_tag=0, loss_weight=1.0,
        )
    else:
        print(f"Loading outcome games from {outcome_games_path} (tokenizing on-the-fly)...")
        # SECURITY: same pickle caveat as the tokenizer load above.
        outcome_games = torch.load(outcome_games_path, weights_only=False)
        game_ds = ChessPolicyDataset(outcome_games, tokenizer, max_seq_len=max_seq_len)
    print(f"Game dataset: {len(game_ds):,} sequences")
    # Puzzle dataset (optional — falls back to game-only training if absent).
    # If --puzzle-data isn't passed, look for puzzle_*.bin alongside policy_*.bin
    # so a `build_datasets.py --policy-only` layout (everything in data/) is
    # picked up automatically without an extra CLI flag.
    puzzle_ds = None
    if (puzzle_dir / "puzzle_meta.pt").exists():
        puzzle_ds = ChessPolicyDataset.from_memmap(
            puzzle_dir, tokenizer, name="puzzle",
            source_tag=1, loss_weight=puzzle_loss_weight,
        )
        print(f"Puzzle dataset ({puzzle_dir}): {len(puzzle_ds):,} sequences (loss_weight={puzzle_loss_weight}x)")
    elif puzzle_data_dir is not None:
        print(f"WARNING: --puzzle-data given but {puzzle_dir}/puzzle_meta.pt not found.")
    if puzzle_ds is not None:
        mixed_ds = torch.utils.data.ConcatDataset([game_ds, puzzle_ds])
        sampler = MixedBatchSampler(
            n_game=len(game_ds),
            n_puzzle=len(puzzle_ds),
            batch_size=batch_size,
            game_ratio=1.0 - puzzle_ratio,
        )
        print(
            f"Mixed batch composition: {sampler.n_game_per_batch} game + "
            f"{sampler.n_puzzle_per_batch} puzzle per batch (puzzle_ratio={puzzle_ratio})"
        )
        policy_loader = DataLoader(
            mixed_ds, batch_sampler=sampler, collate_fn=collate_fn_policy,
            num_workers=num_workers, pin_memory=pin_memory,
        )
    else:
        policy_loader = DataLoader(
            game_ds, batch_size=batch_size, shuffle=True, collate_fn=collate_fn_policy,
            num_workers=num_workers, pin_memory=pin_memory,
        )
    policy_model = ChessPolicyModel(vocab_size=vocab_size, max_seq_len=max_seq_len).to(device)
    policy_optimizer = torch.optim.AdamW(policy_model.parameters(), lr=learning_rate)
    global_step = 0  # Phase 2 batch scalars restart their own step axis

    def _run_policy_test(epoch_num: int, tb_prefix: str) -> None:
        """Evaluate on whichever policy / puzzle test sets exist and log them."""
        if policy_test_loader is not None:
            m = eval_policy(policy_model, policy_test_loader, device, pad_id)
            writer.add_scalar(f"{tb_prefix}/policy_loss", m["loss"], epoch_num)
            writer.add_scalar(f"{tb_prefix}/policy_perplexity", m["perplexity"], epoch_num)
            writer.add_scalar(f"{tb_prefix}/policy_top1_acc", m["top1_acc"], epoch_num)
            writer.add_scalar(f"{tb_prefix}/policy_top5_acc", m["top5_acc"], epoch_num)
            print(
                f"  [policy test]  loss={m['loss']:.4f}  ppl={m['perplexity']:.2f}"
                f"  top1={m['top1_acc']:.3f}  top5={m['top5_acc']:.3f}"
            )
        if puzzle_test_loader is not None:
            m = eval_puzzle_solve_rate(policy_model, puzzle_test_loader, device, pad_id)
            writer.add_scalar(f"{tb_prefix}/puzzle_first_move", m["first_move_solve_rate"], epoch_num)
            writer.add_scalar(f"{tb_prefix}/puzzle_all_moves", m["all_moves_solve_rate"], epoch_num)
            print(
                f"  [puzzle test]  first_move={m['first_move_solve_rate']:.3f}"
                f"  all_moves={m['all_moves_solve_rate']:.3f}"
            )

    def _save_epoch_checkpoint(epoch_num: int) -> None:
        """Save the policy model after a completed epoch.

        Each checkpoint is `policy_model_epoch_{NN}.pt` in the working
        directory, next to the final `policy_model.pt`. If
        `keep_last_n_checkpoints > 0`, the oldest checkpoints are pruned to
        cap disk usage. The final end-of-Phase-2 save (`policy_model.pt`) is
        not touched by pruning.
        """
        ckpt_path = Path(f"policy_model_epoch_{epoch_num:02d}.pt")
        torch.save(policy_model.state_dict(), ckpt_path)
        print(f"  [checkpoint]  saved {ckpt_path.name}")
        if keep_last_n_checkpoints and keep_last_n_checkpoints > 0:
            # Order by modification time, not by name. A name sort would treat
            # leftovers from an earlier run with higher epoch numbers as
            # "newest" and delete the checkpoint just written; it also orders
            # "epoch_100" before "epoch_11".
            existing = sorted(
                Path(".").glob("policy_model_epoch_*.pt"),
                key=lambda p: (p.stat().st_mtime, p.name),
            )
            stale = existing[:-keep_last_n_checkpoints]
            for p in stale:
                try:
                    p.unlink()
                except OSError as exc:
                    # Pruning is best-effort, but say so instead of hiding it.
                    print(f"  [checkpoint]  WARNING: could not remove {p.name}: {exc}")

    def _log_cross_gates(epoch_num: int) -> None:
        """Log per-block cross-attention gate values to TensorBoard.

        For every entry of `policy_model.blocks`, reads the scalar parameter
        `cross_gate` and logs both its tanh and its raw value. Per the model
        design, tanh(gate) scales the board cross-attention residual and the
        gate starts at 0 (cross-attention disabled) — see src/model.py.
        TB tags:
          cross_gate/block_{i}      effective gate tanh(α) ∈ (-1, 1)
          cross_gate_raw/block_{i}  raw parameter α (unbounded)
        """
        blocks = getattr(policy_model, "blocks", None)
        if blocks is None:
            return  # Older model variants without CrossAttnBlock stack.
        gates_tanh = {}
        for i, blk in enumerate(blocks):
            raw = blk.cross_gate.detach()
            tanh_val = raw.tanh().item()
            raw_val = raw.item()
            writer.add_scalar(f"cross_gate/block_{i}", tanh_val, epoch_num)
            writer.add_scalar(f"cross_gate_raw/block_{i}", raw_val, epoch_num)
            gates_tanh[f"block_{i}"] = tanh_val
        # Overlay all blocks on a single chart for easy at-a-glance comparison.
        writer.add_scalars("cross_gate_all", gates_tanh, epoch_num)
        gate_summary = "  ".join(f"L{i}={v:+.3f}" for i, v in enumerate(gates_tanh.values()))
        print(f"  [cross_gate]  {gate_summary}")

    print(f"\n── Phase 2: mixed policy training — {policy_epochs} epochs, lr={learning_rate}")
    phase2_start = time.time()
    # Log the gate values before any training so TB charts start at epoch 0.
    _log_cross_gates(0)
    for epoch in range(policy_epochs):
        epoch_num = epoch + 1
        print(f"  [epoch {epoch_num}/{policy_epochs}] starting...")
        avg_loss, global_step, epoch_secs = _run_epoch_policy_mixed(
            policy_model, policy_loader, policy_optimizer, device, writer, global_step, epoch, pad_id,
        )
        epochs_left = policy_epochs - epoch_num
        print(
            f"  [epoch {epoch_num}/{policy_epochs}]  "
            f"loss={avg_loss:.4f}  "
            f"epoch_time={_fmt_duration(epoch_secs)}  "
            f"eta={_fmt_duration(epoch_secs * epochs_left)}"
        )
        _run_policy_test(epoch_num, "test_mixed")
        _log_cross_gates(epoch_num)
        _save_epoch_checkpoint(epoch_num)
    print(f"Phase 2 complete in {_fmt_duration(time.time() - phase2_start)}")
    torch.save(policy_model.state_dict(), "policy_model.pt")
    print("Policy model saved to policy_model.pt")
    # Flush and release the event file so the last scalars reach disk.
    writer.close()
    return reward_model, policy_model, tokenizer
def _build_argparser():
    """Build the command-line parser for the training script.

    Every option's destination matches a keyword parameter of `train`, and the
    parser description is `train`'s docstring.
    """
    parser = argparse.ArgumentParser(description=train.__doc__)

    # Input artefacts.
    parser.add_argument("--tokenizer-path", default="data/tokenizer.pt")
    parser.add_argument("--stockfish-samples-path", default="data/stockfish_samples.pt")
    parser.add_argument("--outcome-games-path", default="data/games_outcome.pt")

    # Optimisation hyper-parameters and runtime settings.
    parser.add_argument("--epochs", type=int, default=15)
    parser.add_argument("--policy-epochs", type=int, default=15)
    parser.add_argument("--batch-size", type=int, default=1024)
    parser.add_argument("--learning-rate", type=float, default=3e-5)
    parser.add_argument("--max-seq-len", type=int, default=128)
    parser.add_argument("--log-dir", default="runs/chess_models")
    parser.add_argument("--num-workers", type=int, default=8)

    # Puzzle mixing.
    parser.add_argument(
        "--puzzle-data",
        dest="puzzle_data_dir",
        default=None,
        help="Directory containing puzzle_tokens.bin / puzzle_lengths.bin / puzzle_fens.bin / puzzle_meta.pt",
    )
    parser.add_argument(
        "--puzzle-epochs",
        dest="puzzle_epochs",
        type=int,
        default=5,
        help="(Deprecated, retained for CLI compat) — mixed training merges game/puzzle into Phase 2.",
    )
    parser.add_argument(
        "--puzzle-loss-weight",
        dest="puzzle_loss_weight",
        type=float,
        default=5.0,
        help=(
            "Per-sample loss weight applied to puzzle samples in the mixed-training "
            "weighted cross-entropy (default 5.0)."
        ),
    )
    parser.add_argument(
        "--puzzle-ratio",
        dest="puzzle_ratio",
        type=float,
        default=0.2,
        help="Fraction of each mixed batch drawn from the puzzle dataset (default 0.2).",
    )

    # Phase control and checkpoint retention.
    parser.add_argument(
        "--skip-reward",
        dest="skip_reward",
        action="store_true",
        help=(
            "Skip Phase 1 (reward model training). Existing reward_model.pt is "
            "left untouched. Use when iterating on Phase 2 only."
        ),
    )
    parser.add_argument(
        "--keep-last-n-checkpoints",
        dest="keep_last_n_checkpoints",
        type=int,
        default=3,
        help=(
            "Number of per-epoch policy_model_epoch_NN.pt files to keep on disk "
            "(default 3). Use 0 to keep all epochs. Final policy_model.pt is kept "
            "regardless."
        ),
    )
    return parser
if __name__ == "__main__":
    # Script entry point. Every argparse destination defined in
    # _build_argparser() has the same name as a keyword parameter of train(),
    # so the parsed namespace is forwarded as keyword arguments.
    cli_args = _build_argparser().parse_args()
    train(**vars(cli_args))