noamdwc committed
Commit 60fd122 · 1 Parent(s): f488ed9

Switch Space to Docker + FastAPI

Files changed (46)
  1. .dockerignore +8 -0
  2. Dockerfile +16 -0
  3. README.md +1 -3
  4. app.py +22 -42
  5. hf_space_repo/README.md +428 -0
  6. hf_space_repo/__init__.py +30 -0
  7. hf_space_repo/chess/__init__.py +0 -0
  8. hf_space_repo/chess/boards_dataset.py +465 -0
  9. hf_space_repo/chess/chess_logic.py +63 -0
  10. hf_space_repo/chess/policy_player.py +98 -0
  11. hf_space_repo/chess/rewards.py +108 -0
  12. hf_space_repo/chess/searcher.py +90 -0
  13. hf_space_repo/chess/stockfish.py +288 -0
  14. hf_space_repo/configs/__init__.py +43 -0
  15. hf_space_repo/configs/config_loader.py +290 -0
  16. hf_space_repo/configs/default.yaml +123 -0
  17. hf_space_repo/configs/pretrain.yaml +49 -0
  18. hf_space_repo/constants.py +15 -0
  19. hf_space_repo/eval_utils.py +211 -0
  20. hf_space_repo/evaluator.py +118 -0
  21. hf_space_repo/grpo_logic/__init__.py +0 -0
  22. hf_space_repo/grpo_logic/loss.py +235 -0
  23. hf_space_repo/grpo_logic/model.py +782 -0
  24. hf_space_repo/grpo_logic/sampling.py +243 -0
  25. hf_space_repo/logging_utils.py +32 -0
  26. hf_space_repo/models.py +234 -0
  27. hf_space_repo/pretrain/README.md +153 -0
  28. hf_space_repo/pretrain/__init__.py +15 -0
  29. hf_space_repo/pretrain/pretrain.py +579 -0
  30. hf_space_repo/pretrain/pretrain_dataset.py +328 -0
  31. hf_space_repo/pretrain/pretrain_load_config.py +21 -0
  32. hf_space_repo/searchless_chess_imports.py +3 -0
  33. hf_space_repo/searchless_chess_model/.gitattributes +35 -0
  34. hf_space_repo/searchless_chess_model/README.md +177 -0
  35. hf_space_repo/searchless_chess_model/config.json +10 -0
  36. hf_space_repo/searchless_chess_model/model_info.json +13 -0
  37. hf_space_repo/searchless_chess_model/searchless_chess_code/__init__.py +1 -0
  38. hf_space_repo/searchless_chess_model/searchless_chess_code/config.py +90 -0
  39. hf_space_repo/searchless_chess_model/searchless_chess_code/constants.py +119 -0
  40. hf_space_repo/searchless_chess_model/searchless_chess_code/hf_model.py +329 -0
  41. hf_space_repo/searchless_chess_model/searchless_chess_code/tokenizer.py +116 -0
  42. hf_space_repo/searchless_chess_model/searchless_chess_code/transformer.py +284 -0
  43. hf_space_repo/searchless_chess_model/searchless_chess_code/utils.py +162 -0
  44. hf_space_repo/train_self_play.py +72 -0
  45. hf_space_repo/trainer.py +74 -0
  46. requirements.txt +1 -5
.dockerignore ADDED
@@ -0,0 +1,8 @@
__pycache__/
*.pyc
*.pyo
*.pyd
.pytest_cache/
.git/
.DS_Store
node_modules/
Dockerfile ADDED
@@ -0,0 +1,16 @@
FROM python:3.11-slim

ENV PYTHONDONTWRITEBYTECODE=1
ENV PYTHONUNBUFFERED=1
ENV PORT=7860

WORKDIR /app

COPY requirements.txt /app/requirements.txt
RUN pip install --no-cache-dir -r /app/requirements.txt

COPY . /app

EXPOSE 7860

CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
README.md CHANGED
@@ -15,9 +15,7 @@ title: grpo-chess-api
 emoji: ♟️
 colorFrom: amber
 colorTo: red
-sdk: gradio
-sdk_version: 4.44.1
-app_file: app.py
+sdk: docker
 pinned: false
 ---
app.py CHANGED
@@ -2,8 +2,9 @@ import os
 from pathlib import Path
 
 import chess
-import gradio as gr
 import torch
+from fastapi import FastAPI, HTTPException
+from fastapi.middleware.cors import CORSMiddleware
 from huggingface_hub import hf_hub_download
 from pydantic import BaseModel
 from safetensors.torch import load_file
@@ -69,51 +70,30 @@ def choose_move(model, board: chess.Board, temperature: float, greedy: bool) ->
     return move
 
 
+app = FastAPI()
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=False,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+
+
+@app.get("/health")
+def health():
+    return {"status": "ok"}
+
+
+@app.post("/move", response_model=MoveResponse)
 def move(req: MoveRequest):
-    board = chess.Board(req.fen)
+    try:
+        board = chess.Board(req.fen)
+    except Exception as exc:
+        raise HTTPException(status_code=400, detail=f"Invalid FEN: {exc}")
     model = load_model()
     move = choose_move(model, board, req.temperature, req.greedy)
     san = board.san(move)
    board.push(move)
     return MoveResponse(uci=move.uci(), san=san, fen=board.fen())
 
-
-def gradio_move(fen: str, temperature: float, greedy: bool):
-    req = MoveRequest(fen=fen, temperature=temperature, greedy=greedy)
-    res = move(req)
-    return res.uci, res.san, res.fen
-
-
-with gr.Blocks(title="GRPO Chess API") as demo:
-    gr.Markdown(
-        "## GRPO Chess Model API\n"
-        "Use this panel to test the model. The website calls the Gradio API at "
-        "`/run/move`."
-    )
-    fen = gr.Textbox(
-        label="FEN",
-        value="rnbqkbnr/pppppppp/8/8/8/8/PPPPPPPP/RNBQKBNR w KQkq - 0 1",
-    )
-    temperature = gr.Slider(0.1, 2.0, value=1.0, step=0.1, label="Temperature")
-    greedy = gr.Checkbox(label="Greedy", value=False)
-    btn = gr.Button("Get Move")
-    uci = gr.Textbox(label="UCI Move")
-    san = gr.Textbox(label="SAN Move")
-    fen_out = gr.Textbox(label="Next FEN")
-    btn.click(
-        gradio_move,
-        inputs=[fen, temperature, greedy],
-        outputs=[uci, san, fen_out],
-        api_name="move",
-    )
-
-
-app = demo
-
-
-if __name__ == "__main__":
-    demo.launch(
-        server_name="0.0.0.0",
-        server_port=int(os.environ.get("PORT", 7860)),
-        show_error=True,
-    )
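With the Space now serving plain HTTP, any client can call the endpoints directly. A minimal usage sketch (the base URL is a placeholder for your Space's hostname; assumes the `requests` package is installed):

```python
import requests

BASE_URL = "https://your-space.hf.space"  # placeholder; substitute your Space URL

# Liveness check
print(requests.get(f"{BASE_URL}/health").json())  # -> {"status": "ok"}

# Request a move from the starting position
payload = {
    "fen": "rnbqkbnr/pppppppp/8/8/8/8/PPPPPPPP/RNBQKBNR w KQkq - 0 1",
    "temperature": 1.0,
    "greedy": False,
}
resp = requests.post(f"{BASE_URL}/move", json=payload)
resp.raise_for_status()
print(resp.json())  # -> {"uci": ..., "san": ..., "fen": ...}
```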
hf_space_repo/README.md ADDED
@@ -0,0 +1,428 @@
# GRPO Self-Play Chess Module

An experimental, research-grade implementation of **Group Relative Policy Optimization (GRPO)** for training transformer-based chess policies through self-play. This module implements a full reinforcement learning pipeline for chess, but training stability and final strength are still under active investigation.

## Overview

This module trains neural network chess policies using GRPO, a variant of Proximal Policy Optimization (PPO) that uses group-based advantage estimation. The system learns to play chess by:

1. **Self-Play**: Sampling multiple trajectory groups from diverse starting positions
2. **Reward Computation**: Using Stockfish evaluations to compute dense rewards
3. **Policy Optimization**: Applying GRPO with PPO clipping and KL divergence penalties
4. **Evaluation**: Comprehensive benchmarking against Stockfish at multiple skill levels

## Key Features

### 🎯 Core Capabilities

- **Transformer-Based Policy Network**: Deep neural network architecture that processes FEN-encoded board states
- **GRPO Training Algorithm**: Group-relative advantage estimation with PPO-style clipping
- **Self-Play Training Loop**: Infinite dataset of diverse chess positions for robust learning
- **Stockfish Integration**: Professional-grade evaluation and reward computation
- **Comprehensive Evaluation**: Multi-level skill ladder evaluation against Stockfish
- **PyTorch Lightning Integration**: Scalable training with automatic mixed precision, gradient clipping, and checkpointing
- **Weights & Biases Logging**: Full experiment tracking and visualization

### 🏗️ Architecture Highlights

- **Modular Design**: Clean separation between model, training logic, chess rules, and evaluation
- **Efficient Batching**: Parallel trajectory sampling across multiple board positions
- **Legal Move Masking**: Proper handling of chess rules with action space masking
- **Trajectory Search**: Optional trajectory search wrapper for improved play strength
- **Resource Management**: Efficient Stockfish engine pooling and caching

## Installation

```bash
# Install dependencies
pip install torch pytorch-lightning wandb chess python-chess

# Ensure Stockfish is available
# On Ubuntu/Debian: sudo apt-get install stockfish
# On macOS: brew install stockfish
# Or download from: https://stockfishchess.org/download/
```

## Quick Start

### Basic Training

The easiest way to start training is using the YAML-based configuration system:

```python
from src.grpo_self_play.train_self_play import train

# Use default configuration (loads from configs/default.yaml)
train()

# Use a custom config file
train(config_path="my_experiment.yaml")

# Override specific hyperparameters programmatically
train(
    config_path="default.yaml",
    overrides={
        "grpo": {"lr": 1e-4, "num_trajectories": 8},
        "training": {"num_epochs": 100},
    }
)
```

All hyperparameters (learning rate, model architecture, training settings, etc.) are defined in YAML configuration files. See the [Configuration](#configuration) section below for details.

### Running Training in Google Colab

**Note for AI agents and contributors**: The primary way this code is run is through the `chess_model_run_git.ipynb` notebook in Google Colab. This notebook is the actual workflow used for training and evaluation.

The `chess_model_run_git.ipynb` notebook provides:

- **Automated Setup**: Clones the repository, installs dependencies, and downloads the searchless chess model
- **Complete Configuration**: Pre-configured settings for GRPO training, dataset generation, and evaluation
- **Phase-Aware Dataset**: Example configuration using `ChessDatasetConfig` with `phase_distribution` for balanced training across opening, middlegame, and endgame positions
- **Evaluation Pipeline**: Integrated evaluation against Stockfish at multiple skill levels

The notebook handles all setup steps including:
1. Repository cloning and branch checkout
2. Dependency installation (PyTorch Lightning, WandB, python-chess, etc.)
3. Downloading the searchless chess model from HuggingFace
4. Stockfish installation
5. Training configuration with phase-distributed dataset sampling
6. Model training and periodic evaluation

### Evaluation

```python
from src.grpo_self_play import Evaluator, EvalConfig
from src.grpo_self_play.chess.stockfish import StockfishConfig

# Create evaluator
evaluator = Evaluator(
    eval_cfg=EvalConfig(games=50),
    stockfish_cfg=StockfishConfig(skill_level=10, movetime_ms=100)
)

# Single evaluation
results, policy = evaluator.single_evaluation(model)
print(f"Win rate: {results['score']:.2%}")
print(f"Approx Elo diff: {results['elo_diff_vs_stockfish_approx']:.0f}")

# Skill ladder evaluation
skill_results = evaluator.eval_ladder(model)
for skill, score in skill_results.items():
    print(f"Skill {skill}: {score:.2%} win rate")
```

## Architecture

### Model Architecture

The `ChessTransformer` processes chess positions using:

- **Input Encoding**: FEN strings tokenized using DeepMind's chess tokenizer
- **Transformer Encoder**: Multi-head self-attention with learnable positional encodings
- **Policy Head**: Dense layers outputting logits over 1968 possible moves
- **Legal Move Masking**: Automatic filtering of illegal moves during inference
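The sketch below illustrates this encoder-plus-policy-head shape. It is a simplified stand-in, not the module's actual `ChessTransformer`; class name, dimensions, and sequence length are illustrative:

```python
import torch
import torch.nn as nn

class TinyChessPolicy(nn.Module):
    """Schematic only: embed tokens -> transformer encoder -> move logits."""
    def __init__(self, vocab_size=128, embed_dim=256, num_layers=4,
                 num_heads=8, action_dim=1968, max_len=80):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_dim)
        self.pos = nn.Parameter(torch.zeros(1, max_len, embed_dim))  # learnable positions
        layer = nn.TransformerEncoderLayer(embed_dim, num_heads, batch_first=True)
        self.encoder = nn.TransformerEncoder(layer, num_layers)
        self.head = nn.Linear(embed_dim, action_dim)  # logits over the move action space

    def forward(self, tokens: torch.Tensor) -> torch.Tensor:
        # tokens: [B, L] token ids for a tokenized FEN string
        x = self.embed(tokens) + self.pos[:, : tokens.size(1)]
        x = self.encoder(x)
        return self.head(x.mean(dim=1))  # [B, action_dim]
```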
### GRPO Algorithm

Group Relative Policy Optimization extends PPO by:

1. **Group-Based Sampling**: Sample G trajectories per starting position
2. **Group Rewards**: Compute final reward for each trajectory group
3. **Relative Advantages**: Normalize advantages within each batch using group statistics
4. **PPO Clipping**: Prevent large policy updates with clipped importance ratios
5. **KL Penalty**: Regularize policy updates to prevent divergence

The loss function combines:
- **PPO Surrogate Loss**: `L_clip = E[min(r(θ)A, clip(r(θ), 1-ε, 1+ε)A)]`
- **KL Divergence Penalty**: `β * KL(π_old || π_new)`
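For concreteness, a minimal sketch of this combined objective in PyTorch (tensor names and coefficient values are illustrative, not the module's actual API):

```python
import torch

def grpo_loss_sketch(logp_new, logp_old, advantages, kl, clip_eps=0.2, beta=0.04):
    """Clipped surrogate plus KL penalty, mirroring the formulas above."""
    ratio = torch.exp(logp_new - logp_old)                                # r(θ)
    unclipped = ratio * advantages
    clipped = torch.clamp(ratio, 1.0 - clip_eps, 1.0 + clip_eps) * advantages
    surrogate = torch.min(unclipped, clipped).mean()                      # E[min(·, ·)]
    return -surrogate + beta * kl.mean()                                  # minimized by the optimizer
```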
### Training Pipeline

```
1. Sample random starting positions (FEN strings)
2. For each position:
   - Sample G trajectory groups using old policy
   - Compute group rewards using Stockfish evaluation
3. Compute advantages via group normalization
4. Update policy using GRPO loss
5. Sync old policy every epoch
6. Periodic evaluation against Stockfish
```

## Module Structure

```
grpo_self_play/
├── models.py            # ChessTransformer architecture
├── trainer.py           # PyTorch Lightning trainer setup
├── train_self_play.py   # Main training script
├── evaluator.py         # Evaluation framework
├── eval_utils.py        # Evaluation utilities
├── constants.py         # Configuration constants
├── grpo_logic/
│   ├── model.py         # GRPOChessTransformer (Lightning module)
│   ├── loss.py          # GRPO loss computation
│   └── sampling.py      # Trajectory sampling logic
└── chess/
    ├── chess_logic.py   # Board encoding, legal moves
    ├── policy_player.py # Policy-based player
    ├── searcher.py      # Trajectory search wrapper
    ├── rewards.py       # Stockfish reward computation
    └── stockfish.py     # Stockfish engine integration
```

## Key Design Decisions

### 1. Group-Based Advantage Estimation

Instead of using value functions or Monte Carlo returns, GRPO computes advantages by normalizing rewards within trajectory groups. This approach:
- Eliminates the need for value function approximation
- Provides stable learning signals through relative comparisons
- Reduces variance in advantage estimates
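A minimal sketch of group-relative normalization, assuming a reward tensor of shape `[B, G]` with G trajectories per starting position (shapes are illustrative):

```python
import torch

def group_advantages(rewards: torch.Tensor, eps: float = 1e-8) -> torch.Tensor:
    """Normalize rewards within each group of G trajectories (shape [B, G])."""
    mean = rewards.mean(dim=-1, keepdim=True)
    std = rewards.std(dim=-1, keepdim=True)
    return (rewards - mean) / (std + eps)  # zero-mean, unit-variance per group
```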
### 2. Stockfish-Based Rewards

Using Stockfish for reward computation provides:
- **Dense Rewards**: Evaluation at every position, not just terminal states
- **High-Quality Signals**: Professional-grade position evaluation
- **Caching**: LRU cache for efficient reward computation during training

### 3. Legal Move Masking

The action space (1968 moves) is larger than legal moves in any position. The system:
- Masks illegal moves with `-inf` in logits
- Ensures policy only samples legal moves
- Handles edge cases (no legal moves, promotion moves)

### 4. Trajectory Padding and Masking

Trajectories have variable lengths due to game terminations. The implementation:
- Pads trajectories to fixed length for batching
- Uses attention masks to ignore padding
- Only considers moves from the starting player's perspective
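A sketch of this padding scheme, assuming each trajectory is a list of per-move scalars such as log-probabilities (the helper name and input type are illustrative):

```python
import torch

def pad_trajectories(trajs: list[list[float]], pad_value: float = 0.0):
    """Pad variable-length trajectories to a fixed length and build a mask."""
    T = max(len(t) for t in trajs)
    padded = torch.full((len(trajs), T), pad_value)
    mask = torch.zeros(len(trajs), T, dtype=torch.bool)
    for i, t in enumerate(trajs):
        padded[i, : len(t)] = torch.tensor(t)
        mask[i, : len(t)] = True  # True = real move, False = padding
    return padded, mask
```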
## Configuration

This module uses a **YAML-based configuration system**: all training hyperparameters, model architecture settings, and evaluation configurations are centralized in YAML files located in `configs/`.

### Configuration Files

The default configuration file is `configs/default.yaml`, which contains all hyperparameters organized into sections:

- **`training`**: Training loop settings (epochs, batch size, steps per epoch)
- **`grpo`**: GRPO algorithm hyperparameters (learning rate, trajectories, clipping, KL penalty, entropy regularization, adaptive KL control)
- **`transformer`**: Model architecture (embedding dimension, layers, attention heads, vocabulary size, action space)
- **`eval`**: Evaluation settings (number of games, max plies, opening randomization)
- **`stockfish`**: Stockfish engine configuration (path, skill level, time limits, resource usage)
- **`policy`**: Policy player settings (temperature, greedy mode, branching factor, search depth)
- **`searcher`**: Optional trajectory search configuration
- **`dataset`**: Dataset generation settings (position phases, quality filters, evaluation bounds)

### Using Configurations

#### Loading Configurations

```python
from src.grpo_self_play.configs.config_loader import load_experiment_config

# Load default config
config = load_experiment_config("default.yaml")

# Load with overrides
config = load_experiment_config("default.yaml", overrides={
    "grpo": {"lr": 1e-4, "entropy_coef": 0.2},
    "training": {"num_epochs": 100},
})

# Access config values
print(config.grpo.lr)
print(config.training.batch_size)
print(config.transformer.embed_dim)
```

#### Training with Configurations

```python
from src.grpo_self_play.train_self_play import train

# Use default config
train()

# Use custom config file
train(config_path="my_experiment.yaml")

# Override specific values
train(
    config_path="default.yaml",
    overrides={
        "grpo": {"lr": 1e-4},
        "training": {"num_epochs": 50},
    },
    dataloader_kwargs={"num_workers": 4}  # Override DataLoader args
)
```

### Creating Custom Configurations

1. Copy the default config:
```bash
cp configs/default.yaml configs/my_experiment.yaml
```

2. Edit `my_experiment.yaml` to modify hyperparameters

3. Use your custom config:
```python
train(config_path="my_experiment.yaml")
```

### Configuration Dataclasses

The configuration system converts YAML files into typed dataclasses:

- **`TrainingConfig`**: Training loop settings
- **`GRPOConfig`**: GRPO algorithm hyperparameters
- **`ChessTransformerConfig`**: Model architecture
- **`EvalConfig`**: Evaluation settings
- **`StockfishConfig`**: Stockfish engine settings
- **`PolicyConfig`**: Policy player settings
- **`SearchConfig`**: Trajectory search settings (optional)
- **`ChessDatasetConfig`**: Dataset generation settings

All configs are combined into an `ExperimentConfig` object that provides type-safe access to all settings.

### Key Hyperparameters

All hyperparameters are defined in YAML files. Key settings include:

**GRPO Algorithm:**
- `grpo.lr`: Learning rate for policy optimization
- `grpo.num_trajectories`: Number of trajectory groups per starting position
- `grpo.trajectory_depth`: Maximum moves per trajectory
- `grpo.clip_ratio`: PPO clipping epsilon (prevents large policy updates)
- `grpo.kl_coef`: KL divergence penalty coefficient
- `grpo.entropy_coef`: Entropy regularization coefficient
- `grpo.adaptive_kl`: Enable adaptive KL coefficient adjustment
- `grpo.use_entropy_floor`: Monitor and respond to entropy collapse

**Model Architecture:**
- `transformer.embed_dim`: Transformer embedding dimension
- `transformer.num_layers`: Number of transformer layers
- `transformer.num_heads`: Number of attention heads
- `transformer.vocab_size`: Token vocabulary size
- `transformer.action_dim`: Action space size (1968 for chess)

**Training:**
- `training.num_epochs`: Total number of training epochs
- `training.batch_size`: Batch size for training
- `training.steps_per_epoch`: Number of training steps per epoch

See `configs/default.yaml` for the complete list of all hyperparameters and their default values.

## Advanced Usage

### Custom Reward Function

```python
from src.grpo_self_play.chess.rewards import reward_board

def custom_reward(board, start_board):
    # Your custom reward logic
    return reward_board(board, start_board, depth=8, movetime_ms=50)
```

### Trajectory Search

```python
from src.grpo_self_play.chess.searcher import TrajectorySearcher, SearchConfig
from src.grpo_self_play.chess.policy_player import PolicyPlayer

policy = PolicyPlayer(model)
searcher = TrajectorySearcher(
    policy,
    cfg=SearchConfig(n_trajectories=10, trajectory_depth=3)
)
```

### Custom Training Loop

```python
import pytorch_lightning as pl
from src.grpo_self_play.grpo_logic.model import GRPOChessTransformer

model = GRPOChessTransformer(transformer_config, grpo_config)
trainer = pl.Trainer(
    max_epochs=1000,
    gradient_clip_val=1.0,
    accelerator="gpu",
    devices=1
)
trainer.fit(model, dataloader)
```

## Performance Considerations

- **Batch Size**: Larger batches improve advantage normalization quality
- **Trajectory Depth**: Deeper trajectories provide more learning signal but increase compute
- **Stockfish Depth**: Higher depth = better rewards but slower training
- **Caching**: Reward caching significantly speeds up training
- **Gradient Clipping**: Prevents exploding gradients in transformer training

## Monitoring and Logging

The module logs comprehensive metrics to Weights & Biases:

- **Training Metrics**: Loss, KL divergence, policy ratios, reward statistics
- **Evaluation Metrics**: Win rate, Elo difference, game outcomes
- **System Metrics**: Trajectory lengths, padding fractions, gradient norms

## Research Background

GRPO (Group Relative Policy Optimization) is inspired by:
- **PPO (Proximal Policy Optimization)**: Clipped surrogate objective
- **REINFORCE**: Policy gradient methods
- **Self-Play**: Learning through playing against oneself
- **AlphaZero**: Combining deep learning with game tree search

This implementation adapts these ideas specifically for chess, using Stockfish for reward signals and evaluation.

## Technical Highlights

- ✅ **Practical Infrastructure**: Error handling, resource management, logging
- ✅ **Scalable Design**: Efficient batching, parallel trajectory sampling
- ✅ **Extensible**: Modular design allows easy customization
- ✅ **Documented**: Type hints, docstrings, clear structure
- ⚠️ **Status**: This is a research system, not a production-ready chess engine

## Future Enhancements

Potential improvements:
- Value function approximation for better advantage estimates
- More robust entropy and KL control for GRPO
- Multi-GPU training support
- Distributed self-play
- Opening book integration
- Endgame tablebase integration

## License

[Specify your license here]

## Citation

If you use this code in your research, please cite:

```bibtex
@software{grpo_chess,
  title = {GRPO Self-Play Chess Module},
  author = {Your Name},
  year = {2024},
  url = {https://github.com/yourusername/grpo_chess}
}
```

## Contact

For questions or contributions, please open an issue or contact [your email].
hf_space_repo/__init__.py ADDED
@@ -0,0 +1,30 @@
"""GRPO Self-Play Module for Chess.

This module implements Group Relative Policy Optimization (GRPO) for training
chess policies through self-play. It includes:
- Transformer-based chess policy models
- GRPO training logic with PPO clipping
- Trajectory sampling and reward computation
- Evaluation against Stockfish
"""

__version__ = "0.1.0"

# Main exports
from src.grpo_self_play.models import ChessTransformer, ChessTransformerConfig
from src.grpo_self_play.grpo_logic.model import GRPOChessTransformer, GRPOConfig
from src.grpo_self_play.grpo_logic.loss import grpo_ppo_loss, GRPOLossInfo
from src.grpo_self_play.evaluator import Evaluator
from src.grpo_self_play.eval_utils import EvalConfig

__all__ = [
    "ChessTransformer",
    "ChessTransformerConfig",
    "GRPOChessTransformer",
    "GRPOConfig",
    "grpo_ppo_loss",
    "GRPOLossInfo",
    "Evaluator",
    "EvalConfig",
]
hf_space_repo/chess/__init__.py ADDED
File without changes
hf_space_repo/chess/boards_dataset.py ADDED
@@ -0,0 +1,465 @@
"""Dataset of random chess boards."""

import chess
import random
import torch
from collections import deque

from typing import Optional, Dict
from dataclasses import dataclass
from torch.utils.data import IterableDataset
from src.grpo_self_play.chess.rewards import evaluate_fen


def generate_random_board(step_num=30):
    """Generate a random board position by making random moves from the starting position.

    Args:
        step_num: Maximum number of random moves to make

    Returns:
        Chess board after random moves
    """
    board = chess.Board()
    random_steps = random.randint(0, step_num)
    for _ in range(random_steps):
        if board.is_game_over():
            break
        board.push(random.choice(list(board.legal_moves)))
    return board


def get_game_phase(board: chess.Board) -> str:
    """Determine the game phase (opening, middlegame, or endgame).

    Args:
        board: Chess board position

    Returns:
        "opening", "middlegame", or "endgame"
    """
    move_count = board.fullmove_number * 2 - (1 if board.turn == chess.BLACK else 0)

    # Count material (excluding kings)
    material_count = sum(
        len(board.pieces(pt, color))
        for pt in [chess.PAWN, chess.ROOK, chess.KNIGHT, chess.BISHOP, chess.QUEEN]
        for color in [chess.WHITE, chess.BLACK]
    )

    # Endgame: few pieces remaining (typically < 12-14 pieces)
    if material_count <= 12:
        return "endgame"
    # Opening: early moves (typically first 15 moves)
    elif move_count <= 15:
        return "opening"
    # Middlegame: everything else
    else:
        return "middlegame"


def evaluate_position_quality(board: chess.Board, depth: int = 2) -> Optional[float]:
    """Quick Stockfish evaluation to check position quality.

    Args:
        board: Chess board position
        depth: Stockfish search depth (shallow for speed)

    Returns:
        Centipawn evaluation from White's perspective, or None if evaluation fails
    """
    try:
        fen = board.fen()
        # Raw centipawns from White's POV (normalize=False) so the value is
        # comparable with the min_eval_cp/max_eval_cp centipawn thresholds below.
        return evaluate_fen(fen, pov_is_white=True, movetime_ms=0, depth=depth, normalize=False)
    except Exception:
        return None


def generate_opening_position(max_moves: int = 15) -> chess.Board:
    """Generate a realistic opening position using common opening moves.

    Args:
        max_moves: Maximum number of opening moves to make

    Returns:
        Chess board in opening phase
    """
    board = chess.Board()

    # Common first moves for White
    first_moves = [
        chess.Move.from_uci("e2e4"),  # King's pawn
        chess.Move.from_uci("d2d4"),  # Queen's pawn
        chess.Move.from_uci("g1f3"),  # King's knight
        chess.Move.from_uci("c2c4"),  # English opening
    ]

    # Make first move
    if first_moves:
        first_move = random.choice(first_moves)
        if first_move in board.legal_moves:
            board.push(first_move)

    # Continue with semi-random play (preferring development moves)
    moves_made = 1
    while moves_made < max_moves and not board.is_game_over():
        legal_moves = list(board.legal_moves)
        if not legal_moves:
            break

        # Prefer piece development over pawn moves in opening
        piece_moves = [m for m in legal_moves if board.piece_at(m.from_square) and
                       board.piece_at(m.from_square).piece_type != chess.PAWN]

        if piece_moves and random.random() < 0.6:  # 60% chance to prefer piece moves
            move = random.choice(piece_moves)
        else:
            move = random.choice(legal_moves)

        board.push(move)
        moves_made += 1

    return board


def generate_middlegame_position(min_moves: int = 15, max_moves: int = 40) -> chess.Board:
    """Generate a middlegame position from a reasonable starting point.

    Args:
        min_moves: Minimum moves to reach middlegame
        max_moves: Maximum moves for middlegame

    Returns:
        Chess board in middlegame phase
    """
    # Start from an opening position
    board = generate_opening_position(max_moves=min_moves)

    # Continue with random play to reach middlegame
    target_moves = random.randint(min_moves, max_moves)
    moves_made = len(board.move_stack)

    while moves_made < target_moves and not board.is_game_over():
        legal_moves = list(board.legal_moves)
        if not legal_moves:
            break
        board.push(random.choice(legal_moves))
        moves_made += 1

    return board


def generate_endgame_position() -> chess.Board:  # TODO: This is not working as expected; it should generate a random endgame position.
    """Generate an endgame position by playing down from a middlegame position.

    Returns:
        Chess board in endgame phase
    """
    # Start with a middlegame position
    board = generate_middlegame_position(min_moves=20, max_moves=35)

    # Pieces cannot simply be deleted from a python-chess Board without risking
    # unreachable positions, so instead keep playing (preferring captures)
    # until the material count drops to endgame levels.

    # Count material
    def count_material(b: chess.Board) -> int:
        return sum(
            len(b.pieces(pt, color))
            for pt in [chess.PAWN, chess.ROOK, chess.KNIGHT, chess.BISHOP, chess.QUEEN]
            for color in [chess.WHITE, chess.BLACK]
        )

    # Play random moves until we reach endgame material (<= 12 pieces)
    max_attempts = 100
    attempts = 0
    while count_material(board) > 12 and attempts < max_attempts and not board.is_game_over():
        legal_moves = list(board.legal_moves)
        if not legal_moves:
            break

        # Prefer captures to reduce material
        captures = [m for m in legal_moves if board.is_capture(m)]
        if captures:
            move = random.choice(captures)
        else:
            move = random.choice(legal_moves)

        board.push(move)
        attempts += 1

    return board


def generate_position_by_phase(phase: str) -> chess.Board:
    """Generate a position for a specific game phase.

    Args:
        phase: "opening", "middlegame", or "endgame"

    Returns:
        Chess board in the specified phase
    """
    if phase == "opening":
        return generate_opening_position()
    elif phase == "middlegame":
        return generate_middlegame_position()
    elif phase == "endgame":
        return generate_endgame_position()
    else:
        raise ValueError(f"Unknown phase: {phase}. Must be 'opening', 'middlegame', or 'endgame'")


def generate_quality_filtered_board(
    step_num: int = 30,
    min_eval_cp: int = -200,
    max_eval_cp: int = 200,
    filter_depth: int = 2,
    max_attempts: int = 50,
    phase: Optional[str] = None
) -> Optional[chess.Board]:
    """Generate a random board position filtered by Stockfish evaluation quality.

    Args:
        step_num: Maximum number of random moves (if phase is None)
        min_eval_cp: Minimum centipawn evaluation to accept
        max_eval_cp: Maximum centipawn evaluation to accept
        filter_depth: Stockfish depth for filtering (shallow for speed)
        max_attempts: Maximum attempts to generate a valid position
        phase: Optional phase to generate ("opening", "middlegame", "endgame")

    Returns:
        Chess board within evaluation range, or None if no valid position found
    """
    for attempt in range(max_attempts):
        # Generate position
        if phase:
            board = generate_position_by_phase(phase)
        else:
            board = generate_random_board(step_num)

        # Skip if game over or no legal moves
        if board.is_game_over() or not list(board.legal_moves):
            continue

        # Evaluate position quality
        eval_cp = evaluate_position_quality(board, depth=filter_depth)
        if eval_cp is None:
            continue

        # Check if evaluation is within acceptable range
        if min_eval_cp <= eval_cp <= max_eval_cp:
            return board

    # If we couldn't find a good position, return a random one anyway
    if phase:
        return generate_position_by_phase(phase)
    else:
        return generate_random_board(step_num)


@dataclass
class ChessDatasetConfig:
    """Configuration for the Chess Start States Dataset.

    Attributes:
        max_steps: Maximum number of positions to generate per epoch
        random_walk_gen_steps: Maximum random moves (legacy, used if phase_distribution is None)
        phase_distribution: Dict mapping phase names to weights, e.g. {"opening": 0.3, "middlegame": 0.5, "endgame": 0.2}
        min_eval_cp: Minimum centipawn evaluation to accept (-200)
        max_eval_cp: Maximum centipawn evaluation to accept (+200)
        use_opening_book: Whether to use opening book moves for opening positions
        stockfish_filter_depth: Stockfish depth for quality filtering (2-4 for speed)
        cache_positions: Whether to cache and reuse high-quality positions
        cache_size: Maximum number of positions to cache
        quality_filter: Whether to filter positions by Stockfish evaluation
    """
    max_steps: int = 10000
    random_walk_gen_steps: int = 30
    phase_distribution: Optional[Dict[str, float]] = None
    min_eval_cp: int = -200
    max_eval_cp: int = 200
    use_opening_book: bool = True
    stockfish_filter_depth: int = 2
    cache_positions: bool = False
    cache_size: int = 1000
    quality_filter: bool = True


class ChessStartStatesDataset(IterableDataset):
    """
    Infinite dataset that yields high-quality FEN strings from diverse game phases.

    Supports quality filtering, phase-aware generation, and position caching.
    """
    def __init__(
        self,
        config: ChessDatasetConfig = ChessDatasetConfig()
    ):
        """
        Initialize dataset with quality filtering and phase diversity options.

        Args:
            config: ChessDatasetConfig object with all configuration parameters.
                Defaults to ChessDatasetConfig() if no config is provided.
                See ChessDatasetConfig for the meaning of each field.
        """
        # All settings come from the config dataclass
        self.max_steps = config.max_steps
        self.random_walk_gen_steps = config.random_walk_gen_steps
        self.phase_distribution = config.phase_distribution
        self.min_eval_cp = config.min_eval_cp
        self.max_eval_cp = config.max_eval_cp
        self.use_opening_book = config.use_opening_book
        self.stockfish_filter_depth = config.stockfish_filter_depth
        self.cache_positions = config.cache_positions
        self.cache_size = config.cache_size
        self.quality_filter = config.quality_filter

        # Normalize phase distribution (only if not None)
        if self.phase_distribution is not None:
            total_weight = sum(self.phase_distribution.values())
            if total_weight > 0:
                self.phase_distribution = {k: v / total_weight for k, v in self.phase_distribution.items()}

        # Position cache
        self._position_cache: deque = deque(maxlen=self.cache_size)
        self._cache_stats = {"hits": 0, "misses": 0, "generated": 0}

        # Statistics tracking
        self._stats = {
            "opening": 0,
            "middlegame": 0,
            "endgame": 0,
            "filtered_out": 0,
            "total_generated": 0,
        }

    def _sample_phase(self) -> str:
        """Sample a game phase according to phase_distribution weights.

        Returns:
            Phase name: "opening", "middlegame", or "endgame"
        """
        rand = random.random()
        cumulative = 0.0
        for phase, weight in self.phase_distribution.items():
            cumulative += weight
            if rand <= cumulative:
                return phase
        # Fallback to middlegame
        return "middlegame"

    def _generate_position(self) -> Optional[chess.Board]:
        """Generate a single position according to configuration.

        Returns:
            Chess board or None if generation fails
        """
        # Check cache first
        if self.cache_positions and self._position_cache:
            if random.random() < 0.3:  # 30% chance to use cached position
                cached_pos = random.choice(self._position_cache)
                self._cache_stats["hits"] += 1
                return chess.Board(cached_pos)
            self._cache_stats["misses"] += 1

        # Determine phase
        if self.phase_distribution:
            phase = self._sample_phase()
        else:
            phase = None

        # Generate position
        if self.quality_filter:
            board = generate_quality_filtered_board(
                step_num=self.random_walk_gen_steps,
                min_eval_cp=self.min_eval_cp,
                max_eval_cp=self.max_eval_cp,
                filter_depth=self.stockfish_filter_depth,
                phase=phase
            )
        else:
            if phase:
                board = generate_position_by_phase(phase)
            else:
                board = generate_random_board(self.random_walk_gen_steps)

        if board is None:
            return None

        # Update statistics
        if not board.is_game_over():
            actual_phase = get_game_phase(board)
            self._stats[actual_phase] = self._stats.get(actual_phase, 0) + 1
            self._stats["total_generated"] += 1

        # Cache position if enabled
        if self.cache_positions:
            self._position_cache.append(board.fen())
            self._cache_stats["generated"] += 1

        return board

    def get_stats(self) -> Dict:
        """Get statistics about generated positions.

        Returns:
            Dictionary with statistics
        """
        stats = self._stats.copy()
        if self.cache_positions:
            stats["cache"] = self._cache_stats.copy()
            stats["cache"]["size"] = len(self._position_cache)
        return stats

    def __iter__(self):
        worker_info = torch.utils.data.get_worker_info()

        # Determine how many steps this worker should generate
        if worker_info is not None:
            # Split work among workers
            num_workers = worker_info.num_workers
            worker_id = worker_info.id
            per_worker = self.max_steps // num_workers
            # Give remainder to the last worker
            if worker_id == num_workers - 1:
                per_worker += self.max_steps % num_workers

            # Set deterministic seed per worker for reproducibility and isolation
            worker_seed = 42 + worker_id * 1000
            random.seed(worker_seed)
            torch.manual_seed(worker_seed)
            steps_to_generate = per_worker
        else:
            # Single process mode
            steps_to_generate = self.max_steps

        # Generate positions for this worker's share
        for step in range(steps_to_generate):
            board = self._generate_position()
            if board is not None and not board.is_game_over():
                yield board.fen()
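A usage sketch for this dataset (phase weights and DataLoader settings are illustrative; quality filtering requires a working Stockfish install):

```python
from torch.utils.data import DataLoader
from src.grpo_self_play.chess.boards_dataset import (ChessDatasetConfig,
                                                     ChessStartStatesDataset)

cfg = ChessDatasetConfig(
    max_steps=1000,
    phase_distribution={"opening": 0.3, "middlegame": 0.5, "endgame": 0.2},
    quality_filter=True,
)
loader = DataLoader(ChessStartStatesDataset(cfg), batch_size=32)

for fens in loader:  # each batch is a sequence of FEN strings
    print(fens[0])
    break
```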
hf_space_repo/chess/chess_logic.py ADDED
@@ -0,0 +1,63 @@
import chess
import torch

from typing import Optional
from src.grpo_self_play.searchless_chess_imports import (MOVE_TO_ACTION,
                                                         ACTION_TO_MOVE,
                                                         tokenize as deepmind_tokenize)

MAX_ACTION = max(ACTION_TO_MOVE.keys())


def board_to_tensor(board, device: str | torch.device = 'cpu') -> torch.Tensor:
    fen = board.fen()
    token_ids = list(deepmind_tokenize(fen))  # Returns list of ints
    input_tensor = torch.tensor([token_ids], dtype=torch.long, device=device)
    return input_tensor


def get_legal_moves_indices(board):
    legal_moves = list(board.legal_moves)
    legal_indices = []
    for move in legal_moves:
        # move.uci() returns "e2e4" or "a7a8q", which matches the MOVE_TO_ACTION keys
        uci_str = move.uci()
        if uci_str in MOVE_TO_ACTION:
            legal_indices.append(MOVE_TO_ACTION[uci_str])
        else:
            # Fallback: unlikely if MOVE_TO_ACTION is complete
            raise ValueError(f"Invalid move: {uci_str}")
    return legal_indices


def get_legal_moves_mask(board, device: str | torch.device = 'cpu') -> torch.Tensor:
    legal_moves = list(board.legal_moves)
    mask = torch.zeros(MAX_ACTION + 1, dtype=torch.bool)
    for move in legal_moves:
        uci_str = move.uci()
        action_idx = MOVE_TO_ACTION.get(uci_str)
        if action_idx is not None:
            mask[action_idx] = True
    return mask.to(device)


def action_to_move(board: chess.Board, action_idx: int):
    uci = ACTION_TO_MOVE.get(action_idx)
    if uci is None:
        return None
    try:
        mv = chess.Move.from_uci(uci)
    except ValueError:
        return None
    return mv if mv in board.legal_moves else None


class ChessPlayer:
    """
    An abstract chess player interface.
    """
    def act(self, board: chess.Board) -> Optional[chess.Move]:
        """
        Given a chess.Board, return a chess.Move or None to resign.
        """
        raise NotImplementedError()
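A quick illustration of how these helpers combine to mask a policy's output (the random logits below are a stand-in for real model output):

```python
import chess
import torch
from src.grpo_self_play.chess.chess_logic import MAX_ACTION, get_legal_moves_mask

board = chess.Board()
mask = get_legal_moves_mask(board)                 # [MAX_ACTION + 1] boolean mask
logits = torch.randn(MAX_ACTION + 1)               # stand-in for model output
masked = logits.masked_fill(~mask, float("-inf"))  # illegal moves -> -inf
probs = torch.softmax(masked, dim=-1)              # zero mass on illegal moves
print(int(probs.argmax()))                         # index of the most likely legal move
```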
hf_space_repo/chess/policy_player.py ADDED
@@ -0,0 +1,98 @@

import random
import torch
import torch.nn.functional as F
from src.grpo_self_play.chess.chess_logic import (board_to_tensor,
                                                  get_legal_moves_indices,
                                                  action_to_move,
                                                  ChessPlayer)

from dataclasses import dataclass


@dataclass
class PolicyConfig:
    temperature: float = 1.0
    greedy: bool = False       # if True, pick argmax among legal moves
    branching_factor: int = 4  # for search; 0 = no limit
    search_depth: int = 2      # for search; 0 = no search


# Register as safe for torch.load with weights_only=True (PyTorch 2.6+ compatibility)
torch.serialization.add_safe_globals([PolicyConfig])


class PolicyPlayer(ChessPlayer):
    def __init__(self, model, device=None, cfg=PolicyConfig()):
        self.model = model.eval()
        self.device = device or next(model.parameters()).device
        self.cfg = cfg
        self.stats = {"no_legal_idxs": 0, "mapping_failed": 0, "random_fallback": 0}

    @torch.no_grad()
    def act(self, board):
        legal_moves_indices = get_legal_moves_indices(board)
        if not legal_moves_indices:
            self.stats["no_legal_idxs"] += 1
            self.stats["random_fallback"] += 1
            return random.choice(list(board.legal_moves))
        return self.sample_move(board, legal_moves_indices)

    @torch.no_grad()
    def sample_move(self, board, legal_moves_indices=None):
        if legal_moves_indices is None:
            legal_moves_indices = get_legal_moves_indices(board)
        if not legal_moves_indices:
            self.stats["no_legal_idxs"] += 1
            self.stats["random_fallback"] += 1
            return random.choice(list(board.legal_moves))
        board_tensor = board_to_tensor(board, self.device)
        logits = self.model(board_tensor)  # [1, A]

        A = logits.size(-1)
        masked = torch.full(
            (A,),
            -float("inf"),
            device=self.device,
            dtype=logits.dtype,
        )
        li = torch.tensor(legal_moves_indices, device=self.device, dtype=torch.long)
        masked[li] = logits[0, li]

        if self.cfg.greedy:
            action_idx = int(torch.argmax(masked).item())
        else:
            temp = max(1e-6, self.cfg.temperature)
            probs = F.softmax(masked / temp, dim=-1)
            action_idx = int(torch.multinomial(probs, 1).item())
        move = action_to_move(board, action_idx)
        if move is None:
            self.stats["mapping_failed"] += 1
            self.stats["random_fallback"] += 1
            return random.choice(list(board.legal_moves))
        return move

    @torch.no_grad()
    def eval_board(self, board, root_color):
        board_tensor = board_to_tensor(board, self.device)
        legal_moves_indices = get_legal_moves_indices(board)
        if not legal_moves_indices:
            # no moves -> treat via game result if available
            outcome = board.outcome()
            if outcome is not None:
                if outcome.winner is None:
                    return 0.0
                return 1.0 if outcome.winner == root_color else -1.0

        logits = self.model(board_tensor)  # [1, A]
        A = logits.size(-1)
        masked = torch.full(
            (A,),
            -float("inf"),
            device=self.device,
            dtype=logits.dtype,
        )
        li = torch.tensor(legal_moves_indices, device=self.device, dtype=torch.long)
        masked[li] = logits[-1, li]
        best_logit = float(torch.max(F.tanh(masked)).item())
        return best_logit if board.turn == root_color else -best_logit
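A usage sketch (assumes a trained policy network is already bound to `model`; the temperature value is illustrative):

```python
import chess
from src.grpo_self_play.chess.policy_player import PolicyPlayer, PolicyConfig

player = PolicyPlayer(model, cfg=PolicyConfig(temperature=0.7))
board = chess.Board()
move = player.act(board)  # a legal chess.Move sampled from the masked policy
board.push(move)
print(board.fen(), player.stats)
```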
hf_space_repo/chess/rewards.py ADDED
@@ -0,0 +1,108 @@
import os
import chess
import chess.engine

from functools import lru_cache
from src.grpo_self_play.chess.stockfish import stockfish_analyse, DEFAULT_STOCKFISH_TIMEOUT

# Engine name for reward evaluation
REWARD_ENGINE_NAME = f"reward_engine_{os.getpid()}"


def _get_reward_engine_name() -> str:
    """Get process-specific engine name for reward evaluation."""
    return f"reward_engine_{os.getpid()}"


def _raw_white_reward(fen: str, movetime_ms: int, depth: int, timeout: float = DEFAULT_STOCKFISH_TIMEOUT) -> float:
    """Get raw centipawn evaluation from White's perspective using the centralized wrapper."""
    if depth and depth > 0:
        limit = chess.engine.Limit(depth=depth)
    else:
        limit = chess.engine.Limit(time=movetime_ms / 1000.0)

    info = stockfish_analyse(_get_reward_engine_name(), chess.Board(fen), limit, timeout=timeout)

    if info is None:
        return 0.0  # Fallback on engine failure

    score = info["score"].pov(chess.WHITE)
    if score.is_mate():
        return 10000.0 if score.mate() > 0 else -10000.0
    return float(score.score())


@lru_cache(maxsize=50_000)
def cached_raw_reward_white(fen: str, depth: int) -> float:
    """
    Cached Stockfish raw eval for a given FEN from White's POV.
    Returns centipawn score (positive = White is better).
    Caches by depth only, not movetime, since movetime-limited search is not deterministic.
    """
    return _raw_white_reward(fen, movetime_ms=10, depth=depth)


def normalize_cp(raw_cp: float) -> float:
    """Normalize raw centipawn score to [-2, 2] using linear clipping."""
    return float(max(-2.0, min(2.0, raw_cp / 1000.0)))


def evaluate_fen(fen: str, pov_is_white: bool, movetime_ms: int, depth: int, normalize: bool = True):
    """
    Cached Stockfish eval for a given FEN and settings.
    Returns a normalized reward in [-2, 2] (or raw centipawns if normalize=False).
    """
    if depth and depth > 0:
        raw_score = cached_raw_reward_white(fen, depth)
    else:
        raw_score = _raw_white_reward(fen, movetime_ms, depth)

    if not pov_is_white:  # Flip sign for black POV
        raw_score = -raw_score
    # Normalize raw score using linear clipping instead of tanh.
    # Linear clipping preserves gradient signal regardless of position evaluation;
    # tanh was compressing differentials at higher absolute values.
    if normalize:
        return normalize_cp(raw_score)
    else:
        return raw_score


def evaluate_board(board: chess.Board, pov_is_white: bool, depth: int = 16, normalize: bool = True) -> float:
    """
    Evaluate a board position from a given POV.
    Returns normalized reward in [-2, 2] or raw centipawns if normalize=False.
    """
    if board.is_game_over(claim_draw=True):
        if board.is_checkmate():
            pov_loses = (board.turn == (chess.WHITE if pov_is_white else chess.BLACK))
            raw = -10000.0 if pov_loses else 10000.0
        else:
            raw = 0.0  # Draw
        return normalize_cp(raw) if normalize else raw
    else:
        return evaluate_fen(board.fen(), pov_is_white, movetime_ms=0, depth=depth, normalize=normalize)


def reward_board(env: chess.Board, board_start: chess.Board, movetime_ms: int = 0, depth: int = 16) -> float:
    """
    Stockfish-based reward from the perspective of board_start.turn.

    env: current board (python-chess Board)
    board_start: board at trajectory start (used for POV)
    """
    pov_is_white = (board_start.turn == chess.WHITE)
    if env.is_game_over(claim_draw=True):  # Terminal state
        if env.is_checkmate():
            pov_loses = (env.turn == (chess.WHITE if pov_is_white else chess.BLACK))
            r_t = -1.0 if pov_loses else 1.0
        else:
            r_t = 0.0  # Draw
    else:
        fen_t = env.fen()
        r_t = evaluate_fen(fen_t, pov_is_white, movetime_ms, depth)

    fen_0 = board_start.fen()
    r_0 = evaluate_fen(fen_0, pov_is_white, movetime_ms, depth)
    return r_t - r_0  # Reward is the change in eval
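A usage sketch of this potential-difference reward (requires a local Stockfish binary; the depth is chosen for speed):

```python
import chess
from src.grpo_self_play.chess.rewards import reward_board

start = chess.Board()
env = start.copy()
env.push(chess.Move.from_uci("e2e4"))

# Reward = eval(current) - eval(start), from the starting player's POV
print(reward_board(env, start, depth=8))
```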
hf_space_repo/chess/searcher.py ADDED
@@ -0,0 +1,90 @@
'''
Implement search method to choose moves based on a policy network.
'''

import chess
import torch

from typing import Optional
from dataclasses import dataclass
from src.grpo_self_play.chess.chess_logic import ChessPlayer
from src.grpo_self_play.chess.policy_player import PolicyPlayer


@dataclass
class SearchConfig:
    n_trajectories: int = 1    # G: number of sampled trajectories
    trajectory_depth: int = 1  # T: max plies per trajectory


# Register as safe for torch.load with weights_only=True (PyTorch 2.6+ compatibility)
torch.serialization.add_safe_globals([SearchConfig])


class TrajectorySearcher(ChessPlayer):
    """
    Searcher that uses a PolicyPlayer to:
    - sample trajectories using the policy
    - evaluate their final states using the policy
    and picks the first move of the best-scoring trajectory.
    """

    def __init__(self, policy: PolicyPlayer, cfg: SearchConfig = SearchConfig()):
        self.policy = policy
        self.cfg = cfg

    @torch.no_grad()
    def act(self, board: chess.Board) -> Optional[chess.Move]:
        """
        If n_trajectories or trajectory_depth is <= 1:
            Just use the policy's one-step act() (no search).

        Otherwise:
            Sample G trajectories, score each by final state,
            pick the first move of the best trajectory.
        """
        if self.cfg.n_trajectories <= 1 or self.cfg.trajectory_depth <= 1:
            return self.policy.act(board)

        root_color = board.turn
        best_score = -float("inf")
        best_first_move = None

        for g in range(self.cfg.n_trajectories):
            rollout_board = board.copy()

            first_move = None
            for step in range(self.cfg.trajectory_depth):
                if rollout_board.is_game_over():
                    break

                mv = self.policy.sample_move(rollout_board)
                if mv is None:
                    # no move available -> end trajectory
                    break

                if first_move is None:
                    first_move = mv

                rollout_board.push(mv)

            if first_move is None:
                # This trajectory failed to get any move (should be rare)
                continue

            score = self.policy.eval_board(rollout_board, root_color)

            if score > best_score:
                best_score = score
                best_first_move = first_move

        if best_first_move is None:
            # Fallback to simple 1-step policy
            return self.policy.act(board)

        return best_first_move

    @property
    def stats(self) -> dict:
        return self.policy.stats
hf_space_repo/chess/stockfish.py ADDED
@@ -0,0 +1,288 @@
1
+ import os
2
+ import threading
3
+ import chess
4
+ import chess.engine
5
+ import torch
6
+ from typing import Optional
7
+ from dataclasses import dataclass
8
+ from concurrent.futures import TimeoutError as FuturesTimeoutError
9
+ from src.grpo_self_play.chess.chess_logic import ChessPlayer
10
+ from src.grpo_self_play.logging_utils import get_logger
11
+
12
+ logger = get_logger("grpo_chess.stockfish")
13
+
14
+ DEFAULT_STOCKFISH_PATH = "/usr/games/stockfish"
15
+
16
+
17
+ @dataclass(frozen=True)
18
+ class StockfishConfig:
19
+ path: str = DEFAULT_STOCKFISH_PATH
20
+ skill_level: int = 20
21
+ use_elo_limit: bool = False
22
+ elo: int = 2500
23
+ movetime_ms: int = 50
24
+ threads: int = 1
25
+ hash_mb: int = 128
26
+
27
+
28
+ # Register as safe for torch.load with weights_only=True (PyTorch 2.6+ compatibility)
29
+ torch.serialization.add_safe_globals([StockfishConfig])
30
+
31
+
32
+ class StockfishManager:
33
+ '''
34
+ Manage stockfish engine instances by name for player, eval and reward engines.
35
+ For example, we will use several engines at different levels for evaluation,
36
+ or limit the reward engine by move time.
37
+ '''
38
+ _pid: int = os.getpid()
39
+ _engines: dict[str, chess.engine.SimpleEngine] = {}
40
+ _cfgs: dict[str, StockfishConfig] = {}
41
+ _locks: dict[str, threading.Lock] = {} # Per-engine locks for thread safety
42
+ _manager_lock: threading.Lock = threading.Lock() # Lock for managing _engines/_locks dicts
43
+
44
+
45
+ @classmethod
46
+ def ensure_pid(cls) -> None:
47
+ pid = os.getpid()
48
+ if pid != cls._pid:
49
+ # We are in a forked/spawned child; discard inherited engine handles.
50
+ # This is a workaround to avoid issues with multiprocessing.
51
+ cls._pid = pid
52
+ cls._engines = {}
53
+ cls._cfgs = {}
54
+ cls._locks = {}
55
+ cls._manager_lock = threading.Lock()
56
+
57
+ @classmethod
58
+ def _configure_engine(cls, engine: chess.engine.SimpleEngine, cfg: StockfishConfig) -> None:
59
+ try:
60
+ engine.configure({"Threads": cfg.threads})
61
+ except Exception:
62
+ logger.warning("Failed to set Stockfish threads")
63
+
64
+ try:
65
+ engine.configure({"Hash": cfg.hash_mb})
66
+ except Exception:
67
+ logger.warning("Failed to set Stockfish hash size")
68
+
69
+ try:
70
+ engine.configure({"Skill Level": cfg.skill_level})
71
+ except Exception:
72
+ logger.warning("Failed to set Stockfish skill level")
73
+
74
+ if cfg.use_elo_limit:
75
+ try:
76
+ engine.configure({
77
+ "UCI_LimitStrength": True,
78
+ "UCI_Elo": cfg.elo,
79
+ })
80
+ except Exception:
81
+ logger.warning("Failed to set Stockfish ELO limit")
82
+
83
+
84
+ @classmethod
85
+ def is_name_registered(cls, name: str) -> bool:
86
+ return name in cls._engines
87
+
88
+ @classmethod
89
+ def get_lock(cls, name: str) -> threading.Lock:
90
+ """Get the lock for a named engine (creates if needed)."""
91
+ with cls._manager_lock:
92
+ if name not in cls._locks:
93
+ cls._locks[name] = threading.Lock()
94
+ return cls._locks[name]
95
+
96
+ @classmethod
97
+ def get_engine(cls, name: str, cfg: StockfishConfig | None = None) -> chess.engine.SimpleEngine:
98
+ """
99
+ Get (or create) a named engine instance.
100
+ - name: e.g. "reward", "player"
101
+ - cfg: config to use when creating it (ignored on later calls).
102
+ """
103
+ cls.ensure_pid() # Check if we are in a forked/spawned child and discard inherited engine handles.
104
+ with cls._manager_lock:
105
+ if not cls.is_name_registered(name):
106
+ if cfg is None:
107
+ cfg = StockfishConfig()
108
+ engine = chess.engine.SimpleEngine.popen_uci(cfg.path)
109
+ cls._configure_engine(engine, cfg)
110
+ cls._engines[name] = engine
111
+ cls._cfgs[name] = cfg
112
+ cls._locks[name] = threading.Lock()
113
+ return cls._engines[name]
114
+
115
+
116
+ @classmethod
117
+ def close(cls, name: str) -> None:
118
+ with cls._manager_lock:
119
+ engine = cls._engines.get(name)
120
+ if engine is not None:
121
+ try:
122
+ engine.quit()
123
+ except Exception:
124
+ logger.warning(f"Failed to close Stockfish engine '{name}'")
125
+ finally:
126
+ cls._engines.pop(name, None)
127
+ cls._cfgs.pop(name, None)
128
+ cls._locks.pop(name, None)
129
+
130
+
131
+ @classmethod
132
+ def close_all(cls) -> None:
133
+ for name in list(cls._engines.keys()):
134
+ cls.close(name)
135
+
136
+
137
+
138
+ # Default timeout for Stockfish operations (seconds)
139
+ DEFAULT_STOCKFISH_TIMEOUT = 10.0
140
+
141
+
142
+ def run_with_timeout(func, timeout: float, *args, **kwargs):
143
+ """Run a function with a timeout.
144
+
145
+ Uses a single threading.Thread + join(timeout) instead of ThreadPoolExecutor
146
+ so that this works correctly in forked child processes (ProcessPoolExecutor
147
+ with fork). ThreadPoolExecutor can deadlock in forked workers due to
148
+ inherited lock state.
149
+
150
+ Args:
151
+ func: Function to call
152
+ timeout: Maximum time to wait (seconds)
153
+ *args, **kwargs: Arguments to pass to func
154
+
155
+ Returns:
156
+ Result of func
157
+
158
+ Raises:
159
+ FuturesTimeoutError: If the function doesn't complete within timeout
160
+ """
161
+ result_holder: list = []
162
+ exc_holder: list = []
163
+
164
+ def target() -> None:
165
+ try:
166
+ out = func(*args, **kwargs)
167
+ result_holder.append(out)
168
+ except BaseException as e:
169
+ exc_holder.append(e)
170
+
171
+ t = threading.Thread(target=target, daemon=True)
172
+ t.start()
173
+ t.join(timeout=timeout)
174
+ if t.is_alive():
175
+ raise FuturesTimeoutError()
176
+ if exc_holder:
177
+ raise exc_holder[0]
178
+ return result_holder[0]
179
+
180
+
181
+ def stockfish_analyse(
182
+ engine_name: str,
183
+ board: chess.Board,
184
+ limit: chess.engine.Limit,
185
+ timeout: float = DEFAULT_STOCKFISH_TIMEOUT,
186
+ cfg: StockfishConfig | None = None,
187
+ attempts_n: int = 2
188
+ ) -> Optional[chess.engine.InfoDict]:
189
+ """Analyse a position with Stockfish, with timeout and crash recovery.
190
+
191
+ Args:
192
+ engine_name: Name of the engine instance to use
193
+ board: Chess board position to analyse
194
+ limit: Search limit (depth, time, etc.)
195
+ timeout: Maximum time to wait for response (seconds)
196
+ cfg: Optional config for engine creation
197
+ attempts_n: number of attempts to make before giving up
198
+
199
+ Returns:
200
+ Analysis info dict, or None if analysis failed
201
+ """
202
+ for attempt in range(attempts_n):
203
+ try:
204
+ engine = StockfishManager.get_engine(engine_name, cfg)
205
+ lock = StockfishManager.get_lock(engine_name)
206
+ with lock:
207
+ return run_with_timeout(engine.analyse, timeout, board, limit)
208
+ except chess.engine.EngineTerminatedError:
209
+ logger.error(f"Stockfish engine '{engine_name}' terminated unexpectedly, recreating...")
210
+ StockfishManager.close(engine_name)
211
+ if attempt == attempts_n - 1: # give up after the final attempt
212
+ return None
213
+ except FuturesTimeoutError:
214
+ logger.warning(f"Stockfish analyse timed out after {timeout}s for engine '{engine_name}'")
215
+ return None
216
+ except Exception as e:
217
+ logger.error(f"Stockfish analyse error: {e}")
218
+ return None
219
+ return None
220
+
221
+
222
+ def stockfish_play(
223
+ engine_name: str,
224
+ board: chess.Board,
225
+ limit: chess.engine.Limit,
226
+ timeout: float = DEFAULT_STOCKFISH_TIMEOUT,
227
+ cfg: StockfishConfig | None = None,
228
+ ) -> Optional[chess.Move]:
229
+ """Get best move from Stockfish, with timeout and crash recovery.
230
+
231
+ Args:
232
+ engine_name: Name of the engine instance to use
233
+ board: Chess board position
234
+ limit: Search limit (depth, time, etc.)
235
+ timeout: Maximum time to wait for response (seconds)
236
+ cfg: Optional config for engine creation
237
+
238
+ Returns:
239
+ Best move, or None if engine failed
240
+ """
241
+ if board.is_game_over():
242
+ return None
243
+
244
+ for attempt in range(2):
245
+ try:
246
+ engine = StockfishManager.get_engine(engine_name, cfg)
247
+ lock = StockfishManager.get_lock(engine_name)
248
+ with lock:
249
+ result = run_with_timeout(engine.play, timeout, board, limit)
250
+ return result.move
251
+ except chess.engine.EngineTerminatedError:
252
+ logger.error(f"Stockfish engine '{engine_name}' terminated unexpectedly, recreating...")
253
+ StockfishManager.close(engine_name)
254
+ if attempt == 1:
255
+ return None
256
+ except FuturesTimeoutError:
257
+ logger.warning(f"Stockfish play timed out after {timeout}s for engine '{engine_name}'")
258
+ return None
259
+ except Exception as e:
260
+ logger.error(f"Stockfish play error: {e}")
261
+ return None
262
+ return None
263
+
264
+
265
+ class StockfishPlayer(ChessPlayer):
266
+ '''
267
+ A chess player that uses the Stockfish engine to select moves.
268
+ '''
269
+
270
+ DEFAULT_PLAYER_ENGINE_NAME = "player_engine"
271
+
272
+ def __init__(self, cfg: StockfishConfig, engine_name: Optional[str] = None):
273
+ if engine_name is None:
274
+ engine_name = self.DEFAULT_PLAYER_ENGINE_NAME
275
+ self.engine_name = engine_name
276
+ self.cfg = cfg
277
+ self.engine = StockfishManager.get_engine(self.engine_name, cfg)
278
+
279
+
280
+ def close(self):
281
+ try:
282
+ StockfishManager.close(self.engine_name)
283
+ except Exception:
284
+ logger.warning("Failed to close Stockfish engine in StockfishPlayer")
285
+
286
+ def act(self, board: chess.Board) -> chess.Move | None:
287
+ limit = chess.engine.Limit(time=self.cfg.movetime_ms / 1000.0)
288
+ return stockfish_play(self.engine_name, board, limit, cfg=self.cfg)
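
A hedged sketch of how the manager and the analyse helper compose; it assumes a Stockfish binary exists at the configured path:

import chess
import chess.engine
from src.grpo_self_play.chess.stockfish import (
    StockfishConfig, StockfishManager, stockfish_analyse,
)

cfg = StockfishConfig(path="/usr/games/stockfish", movetime_ms=50)
board = chess.Board()

# Named engines are created lazily on first use and reused on later calls.
info = stockfish_analyse("reward", board, chess.engine.Limit(depth=8), cfg=cfg)
if info is not None:
    print(info["score"])  # e.g. PovScore(Cp(+20), WHITE)

StockfishManager.close_all()  # shut all engines down when finished
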
hf_space_repo/configs/__init__.py ADDED
@@ -0,0 +1,43 @@
1
+ """
2
+ Config module for GRPO Chess experiments.
3
+
4
+ Provides YAML-based configuration loading with override support.
5
+
6
+ Usage:
7
+ from src.grpo_self_play.configs import load_experiment_config
8
+
9
+ # Load default config
10
+ config = load_experiment_config()
11
+
12
+ # Load with overrides
13
+ config = load_experiment_config("default.yaml", overrides={
14
+ "grpo": {"lr": 1e-4},
15
+ "training": {"num_epochs": 100},
16
+ })
17
+ """
18
+
19
+ from src.grpo_self_play.configs.config_loader import (
20
+ ExperimentConfig,
21
+ TrainingConfig,
22
+ load_experiment_config,
23
+ load_grpo_config,
24
+ load_transformer_config,
25
+ load_eval_config,
26
+ load_stockfish_config,
27
+ load_dataset_config,
28
+ list_available_configs,
29
+ print_config_summary,
30
+ )
31
+
32
+ __all__ = [
33
+ "ExperimentConfig",
34
+ "TrainingConfig",
35
+ "load_experiment_config",
36
+ "load_grpo_config",
37
+ "load_transformer_config",
38
+ "load_eval_config",
39
+ "load_stockfish_config",
40
+ "load_dataset_config",
41
+ "list_available_configs",
42
+ "print_config_summary",
43
+ ]
hf_space_repo/configs/config_loader.py ADDED
@@ -0,0 +1,290 @@
1
+ """
2
+ Config loader for GRPO Chess experiments.
3
+
4
+ This module provides utilities to load experiment configurations from YAML files
5
+ and convert them to the appropriate dataclass objects.
6
+
7
+ Usage:
8
+ from src.grpo_self_play.configs.config_loader import load_experiment_config
9
+
10
+ # Load a complete experiment config
11
+ config = load_experiment_config("default.yaml")
12
+
13
+ # Load with overrides
14
+ config = load_experiment_config("default.yaml", overrides={
15
+ "grpo": {"lr": 1e-4, "entropy_coef": 0.2},
16
+ "training": {"num_epochs": 100},
17
+ })
18
+
19
+ # Access configs
20
+ grpo_config = config.grpo
21
+ transformer_config = config.transformer
22
+ """
23
+
24
+ from dataclasses import dataclass, fields
25
+ from pathlib import Path
26
+ from typing import Any, Optional, TypeVar, Type
27
+ import yaml
28
+
29
+ # Import all config dataclasses
30
+ from src.grpo_self_play.grpo_logic.model import GRPOConfig
31
+ from src.grpo_self_play.models import ChessTransformerConfig
32
+ from src.grpo_self_play.eval_utils import EvalConfig
33
+ from src.grpo_self_play.chess.stockfish import StockfishConfig
34
+ from src.grpo_self_play.chess.policy_player import PolicyConfig
35
+ from src.grpo_self_play.chess.searcher import SearchConfig
36
+ from src.grpo_self_play.chess.boards_dataset import ChessDatasetConfig
37
+ from src.grpo_self_play.pretrain.pretrain_load_config import PretrainLoadConfig
38
+
39
+
40
+ # Directory containing config YAML files
41
+ CONFIGS_DIR = Path(__file__).parent
42
+
43
+
44
+ @dataclass
45
+ class TrainingConfig:
46
+ """Training loop configuration."""
47
+ num_epochs: int = 400
48
+ batch_size: int = 32
49
+ steps_per_epoch: int = 512
50
+ checkpoint_every_n_epochs: int = 5
51
+ keep_n_checkpoints: int = 3
52
+
53
+
54
+ @dataclass
55
+ class ExperimentConfig:
56
+ """Complete experiment configuration containing all sub-configs."""
57
+ training: TrainingConfig
58
+ grpo: GRPOConfig
59
+ transformer: ChessTransformerConfig
60
+ eval: EvalConfig
61
+ stockfish: StockfishConfig
62
+ policy: PolicyConfig
63
+ searcher: Optional[SearchConfig]
64
+ dataset: ChessDatasetConfig
65
+ pretrain: PretrainLoadConfig
66
+
67
+
68
+ T = TypeVar('T')
69
+
70
+
71
+ def _deep_merge(base: dict, overrides: dict) -> dict:
72
+ """Deep merge two dictionaries, with overrides taking precedence.
73
+
74
+ Args:
75
+ base: Base dictionary
76
+ overrides: Dictionary with values to override
77
+
78
+ Returns:
79
+ Merged dictionary
80
+ """
81
+ result = base.copy()
82
+ for key, value in overrides.items():
83
+ if key in result and isinstance(result[key], dict) and isinstance(value, dict):
84
+ result[key] = _deep_merge(result[key], value)
85
+ else:
86
+ result[key] = value
87
+ return result
88
+
89
+
90
+ def dict_to_dataclass(cls: Type[T], data: dict[str, Any]) -> T:
91
+ """Convert a dictionary to a dataclass, ignoring extra keys.
92
+
93
+ Args:
94
+ cls: The dataclass type to instantiate
95
+ data: Dictionary with field values
96
+
97
+ Returns:
98
+ Instance of the dataclass with values from data, or None if data is None
99
+ """
100
+ if data is None:
101
+ return None
102
+
103
+ # Get valid field names for this dataclass
104
+ valid_fields = {f.name for f in fields(cls)}
105
+
106
+ # Filter to only include valid fields
107
+ filtered_data = {k: v for k, v in data.items() if k in valid_fields}
108
+
109
+ return cls(**filtered_data)
110
+
111
+
112
+ def load_yaml_file(path: str | Path) -> dict[str, Any]:
113
+ """Load a YAML config file.
114
+
115
+ Args:
116
+ path: Path to the YAML file (absolute or relative to configs dir)
117
+
118
+ Returns:
119
+ Dictionary containing the parsed YAML
120
+ """
121
+ path = Path(path)
122
+
123
+ # If not absolute, look in configs directory
124
+ if not path.is_absolute():
125
+ path = CONFIGS_DIR / path
126
+
127
+ if not path.exists():
128
+ raise FileNotFoundError(f"Config file not found: {path}")
129
+
130
+ with open(path, 'r') as f:
131
+ return yaml.safe_load(f)
132
+
133
+
134
+ def load_experiment_config(
135
+ path: str | Path = "default.yaml",
136
+ overrides: dict[str, dict[str, Any]] | None = None
137
+ ) -> ExperimentConfig:
138
+ """Load a complete experiment configuration from a YAML file.
139
+
140
+ Args:
141
+ path: Path to the YAML file (absolute or relative to configs dir)
142
+ overrides: Optional dict of overrides per section. Example:
143
+ {
144
+ "grpo": {"lr": 1e-4, "entropy_coef": 0.2},
145
+ "training": {"num_epochs": 100},
146
+ "stockfish": {"skill_level": 5},
147
+ }
148
+
149
+ Returns:
150
+ ExperimentConfig containing all sub-configs
151
+ """
152
+ data = load_yaml_file(path)
153
+
154
+ # Apply overrides if provided
155
+ if overrides:
156
+ data = _deep_merge(data, overrides)
157
+
158
+ # Convert each section to its dataclass
159
+ training = dict_to_dataclass(TrainingConfig, data.get('training', {}))
160
+ grpo = dict_to_dataclass(GRPOConfig, data.get('grpo', {}))
161
+ transformer = dict_to_dataclass(ChessTransformerConfig, data.get('transformer', {}))
162
+ eval_cfg = dict_to_dataclass(EvalConfig, data.get('eval', {}))
163
+ stockfish = dict_to_dataclass(StockfishConfig, data.get('stockfish', {}))
164
+ policy = dict_to_dataclass(PolicyConfig, data.get('policy', {}))
165
+ dataset = dict_to_dataclass(ChessDatasetConfig, data.get('dataset', {}))
166
+ pretrain = dict_to_dataclass(PretrainLoadConfig, data.get('pretrain', {}))
167
+
168
+ # Searcher is optional (can be null)
169
+ searcher_data = data.get('searcher')
170
+ searcher = dict_to_dataclass(SearchConfig, searcher_data) if searcher_data else None
171
+
172
+ return ExperimentConfig(
173
+ training=training,
174
+ grpo=grpo,
175
+ transformer=transformer,
176
+ eval=eval_cfg,
177
+ stockfish=stockfish,
178
+ policy=policy,
179
+ searcher=searcher,
180
+ dataset=dataset,
181
+ pretrain=pretrain,
182
+ )
183
+
184
+
185
+ def load_grpo_config(
186
+ path: str | Path = "default.yaml",
187
+ overrides: dict[str, Any] | None = None
188
+ ) -> GRPOConfig:
189
+ """Load just the GRPO config from a YAML file.
190
+
191
+ Args:
192
+ path: Path to the YAML file
193
+ overrides: Optional dict of field overrides. Example: {"lr": 1e-4}
194
+ """
195
+ data = load_yaml_file(path)
196
+ grpo_data = data.get('grpo', {})
197
+ if overrides:
198
+ grpo_data = _deep_merge(grpo_data, overrides)
199
+ return dict_to_dataclass(GRPOConfig, grpo_data)
200
+
201
+
202
+ def load_transformer_config(
203
+ path: str | Path = "default.yaml",
204
+ overrides: dict[str, Any] | None = None
205
+ ) -> ChessTransformerConfig:
206
+ """Load just the transformer config from a YAML file."""
207
+ data = load_yaml_file(path)
208
+ cfg_data = data.get('transformer', {})
209
+ if overrides:
210
+ cfg_data = _deep_merge(cfg_data, overrides)
211
+ return dict_to_dataclass(ChessTransformerConfig, cfg_data)
212
+
213
+
214
+ def load_eval_config(
215
+ path: str | Path = "default.yaml",
216
+ overrides: dict[str, Any] | None = None
217
+ ) -> EvalConfig:
218
+ """Load just the eval config from a YAML file."""
219
+ data = load_yaml_file(path)
220
+ cfg_data = data.get('eval', {})
221
+ if overrides:
222
+ cfg_data = _deep_merge(cfg_data, overrides)
223
+ return dict_to_dataclass(EvalConfig, cfg_data)
224
+
225
+
226
+ def load_stockfish_config(
227
+ path: str | Path = "default.yaml",
228
+ overrides: dict[str, Any] | None = None
229
+ ) -> StockfishConfig:
230
+ """Load just the stockfish config from a YAML file."""
231
+ data = load_yaml_file(path)
232
+ cfg_data = data.get('stockfish', {})
233
+ if overrides:
234
+ cfg_data = _deep_merge(cfg_data, overrides)
235
+ return dict_to_dataclass(StockfishConfig, cfg_data)
236
+
237
+
238
+ def load_dataset_config(
239
+ path: str | Path = "default.yaml",
240
+ overrides: dict[str, Any] | None = None
241
+ ) -> ChessDatasetConfig:
242
+ """Load just the dataset config from a YAML file."""
243
+ data = load_yaml_file(path)
244
+ cfg_data = data.get('dataset', {})
245
+ if overrides:
246
+ cfg_data = _deep_merge(cfg_data, overrides)
247
+ return dict_to_dataclass(ChessDatasetConfig, cfg_data)
248
+
249
+
250
+ def list_available_configs() -> list[str]:
251
+ """List all available YAML config files in the configs directory."""
252
+ return [f.name for f in CONFIGS_DIR.glob("*.yaml")]
253
+
254
+
255
+ def print_config_summary(config: ExperimentConfig) -> None:
256
+ """Print a summary of the experiment configuration."""
257
+ print("=" * 60)
258
+ print("EXPERIMENT CONFIGURATION")
259
+ print("=" * 60)
260
+
261
+ print("\n[Training]")
262
+ print(f" epochs: {config.training.num_epochs}")
263
+ print(f" batch_size: {config.training.batch_size}")
264
+ print(f" steps_per_epoch: {config.training.steps_per_epoch}")
265
+
266
+ print("\n[GRPO]")
267
+ print(f" lr: {config.grpo.lr}")
268
+ print(f" num_trajectories: {config.grpo.num_trajectories}")
269
+ print(f" trajectory_depth: {config.grpo.trajectory_depth}")
270
+ print(f" entropy_coef: {config.grpo.entropy_coef}")
271
+ print(f" rollout_temperature: {config.grpo.rollout_temperature}")
272
+ print(f" adaptive_kl: {config.grpo.adaptive_kl}")
273
+ print(f" use_entropy_floor: {config.grpo.use_entropy_floor}")
274
+
275
+ print("\n[Transformer]")
276
+ print(f" embed_dim: {config.transformer.embed_dim}")
277
+ print(f" num_layers: {config.transformer.num_layers}")
278
+ print(f" num_heads: {config.transformer.num_heads}")
279
+
280
+ print("\n[Eval]")
281
+ print(f" games: {config.eval.games}")
282
+ print(f" max_plies: {config.eval.max_plies}")
283
+
284
+ print("\n[Stockfish]")
285
+ print(f" skill_level: {config.stockfish.skill_level}")
286
+
287
+ print("\n[Searcher]")
288
+ print(f" enabled: {config.searcher is not None}")
289
+
290
+ print("=" * 60)
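
To make the override semantics concrete, here is a small illustration of _deep_merge; the values are invented for the example:

base = {"grpo": {"lr": 1e-6, "clip_ratio": 0.2}, "training": {"num_epochs": 400}}
overrides = {"grpo": {"lr": 1e-4}}

merged = _deep_merge(base, overrides)
# merged == {"grpo": {"lr": 1e-4, "clip_ratio": 0.2}, "training": {"num_epochs": 400}}
# Nested dicts are merged key by key; only the overridden leaf changes.
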
hf_space_repo/configs/default.yaml ADDED
@@ -0,0 +1,123 @@
1
+ # Default experiment configuration
2
+ # This file contains all hyperparameters for a training run.
3
+ # Copy this file and modify for new experiments.
4
+
5
+ # =============================================================================
6
+ # Training Loop Settings
7
+ # =============================================================================
8
+ training:
9
+ num_epochs: 400
10
+ batch_size: 32
11
+ steps_per_epoch: 512
12
+ checkpoint_every_n_epochs: 5 # Save periodic checkpoint every N epochs for crash recovery
13
+ keep_n_checkpoints: 3 # Keep last N periodic checkpoints per run
14
+
15
+ # =============================================================================
16
+ # GRPO (Group Relative Policy Optimization) Config
17
+ # Clean run config (see research_docs/2026-02-06_loss-budget-and-monitor-analysis.md)
18
+ # =============================================================================
19
+ grpo:
20
+ lr: 0.000001 # 1e-6: reduced because PPO signal now dominates gradient
21
+ num_trajectories: 16
22
+ trajectory_depth: 16
23
+ clip_ratio: 0.20
24
+ kl_coef: 0.001 # reduced from 0.01 (was being overridden to 0.1 by adaptive KL)
25
+ entropy_coef: 0.0 # removed: not part of original GRPO loss, was 95% of gradient
26
+ eval_every_n_epochs: 10
27
+ ppo_steps: 1
28
+ rollout_temperature: 1.3
29
+
30
+ # Entropy floor monitoring — disabled (never triggered, see research doc)
31
+ use_entropy_floor: false
32
+ entropy_floor: 1.5
33
+ entropy_floor_steps: 150
34
+ entropy_floor_action: "boost"
35
+ entropy_boost_factor: 1.5
36
+
37
+ # Adaptive KL controller — disabled (saturated at max instantly, see research doc)
38
+ adaptive_kl: false
39
+ target_kl: 0.012
40
+ kl_adapt_rate: 1.2
41
+ kl_coef_min: 0.001
42
+ kl_coef_max: 0.1
43
+
44
+ # Safety checks
45
+ enable_safety_checks: false
46
+ safety_patience_steps: 1000
47
+ max_clip_fraction: 0.95
48
+ min_entropy: 0.5
49
+ max_kl_divergence: 0.08
50
+
51
+ # Teacher forcing: use Stockfish for rival moves during trajectory sampling
52
+ teacher_forcing_prob: 0.1 # 10% of rival moves will be from Stockfish
53
+ teacher_forcing_depth: 4
54
+
55
+ # =============================================================================
56
+ # Transformer Model Config
57
+ # =============================================================================
58
+ transformer:
59
+ vocab_size: 300
60
+ embed_dim: 256
61
+ num_layers: 4
62
+ num_heads: 8
63
+ action_dim: 1968
64
+
65
+ # =============================================================================
66
+ # Evaluation Config (vs Stockfish)
67
+ # =============================================================================
68
+ eval:
69
+ games: 64
70
+ seed: 0
71
+ max_plies: 400
72
+ randomize_opening: true
73
+ opening_plies: 6
74
+
75
+ # =============================================================================
76
+ # Stockfish Config
77
+ # =============================================================================
78
+ stockfish:
79
+ path: "/usr/games/stockfish" # Override in colab/local as needed
80
+ skill_level: 2
81
+ use_elo_limit: false
82
+ elo: 2500
83
+ movetime_ms: 50
84
+ threads: 1
85
+ hash_mb: 128
86
+
87
+ # =============================================================================
88
+ # Policy Player Config (for evaluation)
89
+ # =============================================================================
90
+ policy:
91
+ temperature: 0.8
92
+ greedy: true
93
+ branching_factor: 4
94
+ search_depth: 2
95
+
96
+ # =============================================================================
97
+ # Searcher Config (optional - set to null to disable)
98
+ # =============================================================================
99
+ searcher: null
100
+ # searcher:
101
+ # n_trajectories: 4
102
+ # trajectory_depth: 8
103
+
104
+ # =============================================================================
105
+ # Pretraining (optional - load pretrained weights before GRPO)
106
+ # =============================================================================
107
+ pretrain:
108
+ checkpoint_path: null # Path to pretrained checkpoint (e.g., "checkpoints/pretrain/pretrain_final.pt")
109
+ freeze_layers: 2 # Freeze first 2 transformer layers to preserve learned representations
110
+
111
+ # =============================================================================
112
+ # Dataset Config (Chess Start States)
113
+ # =============================================================================
114
+ dataset:
115
+ max_steps: 512 # Should match steps_per_epoch
116
+ phase_distribution:
117
+ opening: 0.33
118
+ middlegame: 0.34
119
+ endgame: 0.33
120
+ min_eval_cp: -200
121
+ max_eval_cp: 200
122
+ quality_filter: true
123
+ stockfish_filter_depth: 4
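
Loading this file and overriding a couple of fields might look like the following; paths assume the package layout used above:

from src.grpo_self_play.configs import load_experiment_config, print_config_summary

config = load_experiment_config("default.yaml", overrides={
    "grpo": {"lr": 1e-5},
    "stockfish": {"skill_level": 5},
})
print_config_summary(config)
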
hf_space_repo/configs/pretrain.yaml ADDED
@@ -0,0 +1,49 @@
1
+ # Pretraining configuration for chess model
2
+ # This file contains hyperparameters for supervised pretraining on Lichess games.
3
+ #
4
+ # Usage:
5
+ # python -m src.grpo_self_play.pretrain.pretrain --config pretrain.yaml
6
+
7
+ # =============================================================================
8
+ # Pretraining Settings
9
+ # =============================================================================
10
+ pretrain:
11
+ lr: 0.0001 # Learning rate (higher than GRPO fine-tuning)
12
+ batch_size: 4096 # Batch size for pretraining
13
+ num_epochs: 22 # Number of passes through the dataset
14
+ warmup_steps: 1000 # Linear warmup steps
15
+ weight_decay: 0.01 # AdamW weight decay
16
+ max_grad_norm: 1.0 # Gradient clipping
17
+ checkpoint_dir: "checkpoints/pretrain"
18
+ resume_from: null # Path to resume from (optional)
19
+ use_wandb: true
20
+ wandb_project: "chess-grpo-pretrain"
21
+ label_smoothing: 0.1 # Prevents overconfidence
22
+ num_workers: 4 # DataLoader workers
23
+ val_check_interval: 0.1 # Validate every 10% of epoch
24
+
25
+ # =============================================================================
26
+ # Dataset Settings (Lichess games from HuggingFace)
27
+ # =============================================================================
28
+ dataset:
29
+ min_elo: 1800 # Minimum player rating to include
30
+ max_samples: 5000000 # Max samples per epoch (null = unlimited)
31
+ skip_first_n_moves: 5 # Skip opening moves (book territory)
32
+ skip_last_n_moves: 5 # Skip endgame/resignation moves
33
+ sample_positions_per_game: 3 # Positions to sample from each game
34
+ buffer_size: 10000 # Shuffle buffer size for streaming
35
+ filter_abandoned: true # Skip abandoned games
36
+ dataset_name: "Lichess/standard-chess-games"
37
+ split: "train" # Dataset split to use
38
+ is_eval: false # False for training, True for evaluation
39
+ eval_fraction: 0.05 # 5% of games held out for evaluation
40
+
41
+ # =============================================================================
42
+ # Transformer Model Config (should match GRPO training)
43
+ # =============================================================================
44
+ transformer:
45
+ vocab_size: 300
46
+ embed_dim: 256
47
+ num_layers: 4
48
+ num_heads: 8
49
+ action_dim: 1968
hf_space_repo/constants.py ADDED
@@ -0,0 +1,15 @@
1
+ """Constants used across the GRPO self-play module."""
2
+
3
+ # Sequence length for tokenized FEN strings
4
+ SEQUENCE_LENGTH = 77
5
+
6
+ # Default training hyperparameters
7
+ DEFAULT_LEARNING_RATE = 1e-4
8
+ DEFAULT_NUM_TRAJECTORIES = 4
9
+ DEFAULT_TRAJECTORY_DEPTH = 5
10
+ DEFAULT_CLIP_RATIO = 0.2
11
+ DEFAULT_KL_COEF = 0.01
12
+
13
+ # Default evaluation settings
14
+ DEFAULT_EVAL_GAMES = 50
15
+ DEFAULT_EVAL_MAX_PLIES = 400
hf_space_repo/eval_utils.py ADDED
@@ -0,0 +1,211 @@
1
+ """Utilities for evaluating chess policies against Stockfish."""
2
+ import io
3
+ import math
4
+ import chess
5
+ import chess.pgn
6
+ import chess.engine
7
+ import random
8
+
9
+ import torch
10
+
11
+ from dataclasses import dataclass
12
+ from typing import Dict, List, Tuple
13
+
14
+ from src.grpo_self_play.chess.chess_logic import MOVE_TO_ACTION
15
+ from src.grpo_self_play.chess.policy_player import PolicyPlayer, PolicyConfig
16
+ from src.grpo_self_play.chess.searcher import TrajectorySearcher, SearchConfig
17
+ from src.grpo_self_play.chess.stockfish import StockfishPlayer, StockfishConfig, DEFAULT_STOCKFISH_PATH as STOCKFISH_PATH
18
+
19
+
20
+ @dataclass
21
+ class EvalConfig:
22
+ games: int = 50
23
+ seed: int = 0
24
+ max_plies: int = 400 # safety to avoid extremely long games
25
+ randomize_opening: bool = False
26
+ opening_plies: int = 6 # random legal moves to diversify early positions
27
+
28
+
29
+ # Register as safe for torch.load with weights_only=True (PyTorch 2.6+ compatibility)
30
+ torch.serialization.add_safe_globals([EvalConfig])
31
+
32
+
33
+ def debug_legal_coverage(board: chess.Board) -> tuple[int, int, list[str]]:
34
+ """Debug function to check coverage of legal moves in action space.
35
+
36
+ Args:
37
+ board: Chess board position
38
+
39
+ Returns:
40
+ Tuple of (covered_count, total_legal_moves, list_of_missing_moves)
41
+ """
42
+ legals = list(board.legal_moves)
43
+ covered = 0
44
+ missing = []
45
+ for mv in legals:
46
+ u = mv.uci()
47
+ if u in MOVE_TO_ACTION:
48
+ covered += 1
49
+ else:
50
+ missing.append(u)
51
+ return covered, len(legals), missing[:10]
52
+
53
+
54
+
55
+
56
+
57
+ def play_one_game(
58
+ policy: PolicyPlayer | TrajectorySearcher,
59
+ stockfish: StockfishPlayer,
60
+ policy_is_white: bool,
61
+ cfg: EvalConfig,
62
+ game_number: int = 0,
63
+ ) -> Tuple[str, str, str]:
64
+ """Play a single game between policy and Stockfish.
65
+
66
+ Args:
67
+ policy: Policy player to evaluate
68
+ stockfish: Stockfish player
69
+ policy_is_white: Whether policy plays as white
70
+ cfg: Evaluation configuration
71
+ game_number: Game number for PGN metadata
72
+
73
+ Returns:
74
+ Tuple of (result_str, termination_reason, pgn_str)
75
+ result_str in {"1-0", "0-1", "1/2-1/2"}
76
+ """
77
+
78
+ board = chess.Board()
79
+ game = chess.pgn.Game()
80
+ game.headers["Event"] = "Policy vs Stockfish Evaluation"
81
+ game.headers["White"] = "Policy" if policy_is_white else "Stockfish"
82
+ game.headers["Black"] = "Stockfish" if policy_is_white else "Policy"
83
+ game.headers["Round"] = str(game_number + 1)
84
+ node = game
85
+
86
+ # Optional random opening to reduce overfitting to a single line
87
+ if cfg.randomize_opening and cfg.opening_plies > 0:
88
+ for _ in range(cfg.opening_plies):
89
+ if board.is_game_over():
90
+ break
91
+ move = random.choice(list(board.legal_moves))
92
+ board.push(move)
93
+ node = node.add_variation(move)
94
+
95
+ for ply in range(cfg.max_plies):
96
+ if board.is_game_over(claim_draw=True):
97
+ break
98
+
99
+ is_white_to_move = board.turn
100
+ policy_turn = (is_white_to_move == policy_is_white)
101
+
102
+ if policy_turn:
103
+ move = policy.act(board)
104
+ else:
105
+ move = stockfish.act(board)
106
+ if move is None:
107
+ break # no legal moves
108
+
109
+ board.push(move)
110
+ node = node.add_variation(move)
111
+
112
+ # Determine result
113
+ if board.is_game_over(claim_draw=True):
114
+ res = board.result(claim_draw=True)
115
+ reason = "game_over"
116
+ else:
117
+ # Reached max plies: treat as draw
118
+ res = "1/2-1/2"
119
+ reason = "max_plies"
120
+
121
+ game.headers["Result"] = res
122
+
123
+ # Generate PGN string
124
+ pgn_output = io.StringIO()
125
+ exporter = chess.pgn.FileExporter(pgn_output)
126
+ game.accept(exporter)
127
+ pgn_str = pgn_output.getvalue()
128
+
129
+ return res, reason, pgn_str
130
+
131
+
132
+ def estimate_elo_diff(score: float) -> float:
133
+ """Estimate Elo difference from match score.
134
+
135
+ Uses logistic model: S = 1/(1+10^(-d/400)) => d = -400*log10(1/S - 1)
136
+ Clamped for numeric stability.
137
+
138
+ Args:
139
+ score: Win rate score in [0, 1]
140
+
141
+ Returns:
142
+ Estimated Elo difference
143
+ """
144
+ eps = 1e-6
145
+ s = min(max(score, eps), 1 - eps)
146
+ return -400.0 * math.log10(1.0 / s - 1.0)
147
+
148
+
149
+ def evaluate_policy_vs_stockfish(
150
+ policy: PolicyPlayer | TrajectorySearcher,
151
+ sf: StockfishPlayer,
152
+ eval_cfg: EvalConfig,
153
+ ) -> Tuple[Dict, PolicyPlayer | TrajectorySearcher, List[str]]:
154
+ """Evaluate a policy by playing multiple games against Stockfish.
155
+
156
+ Args:
157
+ policy: Policy player to evaluate
158
+ sf: Stockfish player
159
+ eval_cfg: Evaluation configuration
160
+
161
+ Returns:
162
+ Tuple of (results_dict, policy_player, pgns)
163
+ results_dict contains: games, wins, draws, losses, score, elo_diff, etc.
164
+ pgns is a list of PGN strings for all games played
165
+ """
166
+ random.seed(eval_cfg.seed)
167
+ torch.manual_seed(eval_cfg.seed)
168
+
169
+ wins = draws = losses = 0
170
+ term_reasons = {}
171
+ pgns: List[str] = []
172
+
173
+ try:
174
+ for g in range(eval_cfg.games):
175
+ policy_is_white = (g % 2 == 0)
176
+ res, reason, pgn = play_one_game(policy, sf, policy_is_white, eval_cfg, game_number=g)
177
+ term_reasons[reason] = term_reasons.get(reason, 0) + 1
178
+ pgns.append(pgn)
179
+
180
+ # From policy perspective
181
+ if res == "1-0":
182
+ if policy_is_white:
183
+ wins += 1
184
+ else:
185
+ losses += 1
186
+ elif res == "0-1":
187
+ if policy_is_white:
188
+ losses += 1
189
+ else:
190
+ wins += 1
191
+ else:
192
+ draws += 1
193
+
194
+ finally:
195
+ sf.close()
196
+
197
+ total = wins + draws + losses
198
+ score = (wins + 0.5 * draws) / total if total else 0.0
199
+ elo_diff = estimate_elo_diff(score) if total else 0.0
200
+
201
+ return {
202
+ "games": total,
203
+ "wins": wins,
204
+ "draws": draws,
205
+ "losses": losses,
206
+ "score": score,
207
+ "elo_diff_vs_stockfish_approx": elo_diff,
208
+ "termination_reasons": term_reasons,
209
+ "eval_cfg": eval_cfg,
210
+ }, policy, pgns
211
+
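
As a sanity check on the Elo formula: a 75% score corresponds to roughly +191 Elo.

import math

score = 0.75
elo_diff = -400.0 * math.log10(1.0 / score - 1.0)
print(round(elo_diff, 1))  # 190.8 -> winning three games out of four is about +191 Elo
# score = 0.5 gives 0.0; the epsilon clamp in estimate_elo_diff keeps scores of 0.0 and 1.0 finite.
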
hf_space_repo/evaluator.py ADDED
@@ -0,0 +1,118 @@
1
+ from typing import Dict, List, Optional, Tuple
2
+ from chess import engine
3
+ import torch.nn as nn
4
+
5
+ from src.grpo_self_play.chess.policy_player import PolicyPlayer, PolicyConfig
6
+ from src.grpo_self_play.chess.searcher import TrajectorySearcher, SearchConfig
7
+ from src.grpo_self_play.chess.stockfish import StockfishPlayer, StockfishConfig, StockfishManager
8
+ from src.grpo_self_play.eval_utils import EvalConfig, evaluate_policy_vs_stockfish
9
+
10
+
11
+
12
+ class Evaluator:
13
+ """Evaluate a chess model by playing against Stockfish.
14
+
15
+ Handles evaluation of chess policies against Stockfish at various skill levels.
16
+ Supports both single evaluations and skill ladder evaluations.
17
+ """
18
+ def __init__(self,
19
+ eval_cfg: EvalConfig = EvalConfig(),
20
+ policy_cfg: PolicyConfig = PolicyConfig(),
21
+ searcher_cfg: Optional[SearchConfig] = None,
22
+ stockfish_cfg: StockfishConfig = StockfishConfig()):
23
+ """
24
+ Initialize evaluator.
25
+
26
+ Args:
27
+ eval_cfg: Evaluation configuration (number of games, etc.)
28
+ policy_cfg: Policy player configuration
29
+ searcher_cfg: Optional search configuration for tree search
30
+ stockfish_cfg: Stockfish engine configuration
31
+ """
32
+ self.eval_cfg = eval_cfg
33
+ self.policy_cfg = policy_cfg
34
+ self.searcher_cfg = searcher_cfg
35
+ self.default_stockfish_cfg = stockfish_cfg
36
+
37
+ def _make_policy(self, model: nn.Module) -> PolicyPlayer | TrajectorySearcher:
38
+ """Create a policy player (optionally wrapped with search).
39
+
40
+ Args:
41
+ model: Neural network model
42
+
43
+ Returns:
44
+ Policy player, optionally wrapped with trajectory search
45
+ """
46
+ policy = PolicyPlayer(model, cfg=self.policy_cfg)
47
+ if self.searcher_cfg is not None:
48
+ policy = TrajectorySearcher(policy, cfg=self.searcher_cfg)
49
+ return policy
50
+
51
+ def _make_stockfish(self) -> StockfishPlayer:
52
+ """Create a Stockfish player with default configuration.
53
+
54
+ Returns:
55
+ Stockfish player instance
56
+ """
57
+ return StockfishPlayer(self.default_stockfish_cfg)
58
+
59
+ def single_evaluation(self, model: nn.Module) -> Tuple[Dict, PolicyPlayer | TrajectorySearcher, List[str]]:
60
+ """Evaluate the model by playing games against Stockfish.
61
+
62
+ Args:
63
+ model: Neural network model to evaluate
64
+
65
+ Returns:
66
+ Tuple of (results_dict, policy_or_searcher, pgns)
67
+ pgns is a list of PGN strings for all games played
68
+ """
69
+ stockfish_player = self._make_stockfish()
70
+ policy = self._make_policy(model)
71
+ results, policy_or_searcher, pgns = evaluate_policy_vs_stockfish(
72
+ policy,
73
+ stockfish_player,
74
+ self.eval_cfg,
75
+ )
76
+ return results, policy_or_searcher, pgns
77
+
78
+ def eval_ladder(self, model: nn.Module) -> Dict[int, float]:
79
+ """Evaluate model against Stockfish at multiple skill levels.
80
+
81
+ Args:
82
+ model: Neural network model to evaluate
83
+
84
+ Returns:
85
+ Dictionary mapping skill level to win rate score
86
+ """
87
+ policy = self._make_policy(model)
88
+ results = {}
89
+ skill_levels = [1, 3, 5, 8, 10]
90
+ for skill in skill_levels:
91
+ stockfish_cfg = StockfishConfig(
92
+ path=self.default_stockfish_cfg.path,
93
+ skill_level=skill,
94
+ movetime_ms=self.default_stockfish_cfg.movetime_ms,
95
+ )
96
+ engine_name = f"stockfish_skill_{skill}"
97
+ stockfish_player = StockfishPlayer(stockfish_cfg, engine_name=engine_name)
98
+
99
+ try:
100
+ r, policy_wrapper, _ = evaluate_policy_vs_stockfish(
101
+ policy,
102
+ stockfish_player,
103
+ self.eval_cfg,
104
+ )
105
+ results[skill] = r["score"]
106
+ print(f"Skill {skill}: {r}")
107
+ if hasattr(policy_wrapper, 'stats'):
108
+ print(f'Policy stats: {policy_wrapper.stats}')
109
+ except Exception as e:
110
+ print(f"Error evaluating at skill {skill}: {e}")
111
+ results[skill] = 0.0
112
+ finally:
113
+ StockfishManager.close(engine_name) # Close engine to free resources
114
+ return results
115
+
116
+
117
+
118
+
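
A sketch of a single evaluation run; `model` stands in for a trained ChessTransformer and is not defined here, and a Stockfish binary is assumed to be installed:

from src.grpo_self_play.evaluator import Evaluator
from src.grpo_self_play.eval_utils import EvalConfig

evaluator = Evaluator(eval_cfg=EvalConfig(games=8, max_plies=200))
results, _, pgns = evaluator.single_evaluation(model)  # `model` assumed trained
print(results["score"], results["elo_diff_vs_stockfish_approx"])
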
hf_space_repo/grpo_logic/__init__.py ADDED
File without changes
hf_space_repo/grpo_logic/loss.py ADDED
@@ -0,0 +1,235 @@
1
+ import torch
2
+ from typing import Tuple
3
+ from dataclasses import dataclass
4
+
5
+
6
+ @dataclass
7
+ class GRPOLossInfo:
8
+ """Information about GRPO loss components for logging and debugging."""
9
+ kl_div: torch.Tensor
10
+ mean_ratio: torch.Tensor
11
+ mean_clip_fraction: torch.Tensor
12
+ ppo_loss: torch.Tensor
13
+ entropy: torch.Tensor
14
+ loss_without_entropy: torch.Tensor
15
+
16
+ def grpo_chess_loss(
17
+ logprobs_new: torch.Tensor, # [G, T] log πθ(a_{g,k,t} | s_{g,k,t})
18
+ logprobs_old: torch.Tensor, # [G, T] log πold(a_{g,k,t} | s_{g,k,t})
19
+ advantages: torch.Tensor, # [G, T]
20
+ clip_eps: float = 0.2, # ε in the formula
21
+ beta_kl: float = 0.0, # β in the formula (0 = no explicit KL penalty)
22
+ eps: float = 1e-8) -> Tuple[torch.Tensor, torch.Tensor]:
23
+ """
24
+ Compute GRPO chess loss (legacy function, consider using grpo_ppo_loss instead).
25
+
26
+ Args:
27
+ logprobs_new: New policy log probabilities [G, T]
28
+ logprobs_old: Old policy log probabilities [G, T]
29
+ advantages: Advantage values [G, T]
30
+ clip_eps: PPO clipping epsilon
31
+ beta_kl: KL penalty coefficient
32
+ eps: Numerical stability epsilon
33
+
34
+ Returns:
35
+ Tuple of (loss, approximate_kl_divergence)
36
+ """
37
+
38
+ # ------------------------------------------------------------
39
+ # 3. Probability ratio r_{g,k,t}(θ)
40
+ #
41
+ # r_{g,k,t}(θ) = πθ(a_{g,k,t}|s_{g,k,t}) / πold(a_{g,k,t}|s_{g,k,t})
42
+ # = exp( logπθ - logπold )
43
+ # ------------------------------------------------------------
44
+ ratio = (logprobs_new - logprobs_old).exp() # [G, T]
45
+ pg_unclipped = -advantages * ratio # [G, T]
46
+ pg_clipped = -advantages * ratio.clamp(1.0 - clip_eps, 1.0 + clip_eps) # [G, T]
47
+
48
+ # Surrogate policy gradient loss (PPO-clip part)
49
+ # This corresponds to the -E[min(...)] in the formula.
50
+ policy_loss = torch.max(pg_unclipped, pg_clipped).mean()
51
+ approx_kl = (logprobs_old - logprobs_new).mean()
52
+
53
+ # KL penalty: β * E[ KL(...) ]
54
+ kl_loss = beta_kl * approx_kl
55
+ loss = policy_loss + kl_loss
56
+
57
+ return loss, approx_kl
58
+
59
+
60
+ # Utils functions for GRPO
61
+ def group_advantage(group_rewards: torch.Tensor) -> torch.Tensor:
62
+ """
63
+ Compute normalized advantages from group rewards using standardization.
64
+
65
+ Args:
66
+ group_rewards: Group rewards tensor [B, G] or [G]
67
+
68
+ Returns:
69
+ Normalized advantages with same shape as input
70
+ """
71
+ mean_reward = group_rewards.mean(dim=-1, keepdim=True)
72
+ std_reward = group_rewards.std(dim=-1, unbiased=False, keepdim=True) + 1e-8
73
+ advantages = (group_rewards - mean_reward) / std_reward
74
+ return advantages
75
+
76
+
77
+ def step_group_advantage(step_rewards: torch.Tensor, pad_mask: torch.Tensor | None = None) -> torch.Tensor:
78
+ """
79
+ Compute per-step normalized advantages from step rewards.
80
+ For each timestep t, normalizes across the G dimension (trajectories).
81
+
82
+ NOTE: No std normalization is applied here, following the Dr. GRPO paper.
83
+ Args:
84
+ step_rewards: Per-step rewards tensor [B, G, T]
85
+ pad_mask: Optional mask for valid steps [B, G, T], True=valid
86
+
87
+ Returns:
88
+ Normalized advantages [B, G, T] where each timestep is normalized across G
89
+ """
90
+ # Normalize across G dimension for each (batch, timestep)
91
+ # step_rewards: [B, G, T]
92
+ mean_t = step_rewards.mean(dim=1, keepdim=True) # [B, 1, T]
93
+ advantages = (step_rewards - mean_t) # [B, G, T]
94
+
95
+ if pad_mask is not None:
96
+ advantages = advantages * pad_mask.float()
97
+
98
+ return advantages
99
+
100
+
101
+ def ppo_chess_loss(
102
+ logprobs_new: torch.Tensor, # [G, T] log πθ(a_{g,k,t} | s_{g,k,t})
103
+ logprobs_old: torch.Tensor, # [G, T] log πold(a_{g,k,t} | s_{g,k,t})
104
+ advantages: torch.Tensor, # [G, T]
105
+ clip_eps: float = 0.2, # ε in the formula
106
+ pad_mask: torch.Tensor | None = None, # [G, T], True = real, False = pad
107
+ return_info: bool = False,
108
+ ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
109
+ """
110
+ Compute PPO-clip loss for chess policy optimization.
111
+
112
+ Args:
113
+ logprobs_new: New policy log probabilities [B, G, T] or [G, T]
114
+ logprobs_old: Old policy log probabilities [B, G, T] or [G, T]
115
+ advantages: Advantage values [B, G, T] or [G, T]
116
+ clip_eps: PPO clipping epsilon (default: 0.2)
117
+ pad_mask: Mask indicating valid steps, True=valid, False=padding
118
+ return_info: If True, return additional statistics
119
+
120
+ Returns:
121
+ If return_info=False: policy loss tensor [B, G, T] or [G, T]
122
+ If return_info=True: tuple of (policy_loss, mean_ratio, mean_clip_fraction)
123
+ """
124
+ if pad_mask is None:
125
+ pad_mask = torch.ones_like(logprobs_new, dtype=torch.bool)
126
+ ratio = (logprobs_new - logprobs_old).exp() # [G, T]
127
+ pg_unclipped = -advantages * ratio # [G, T]
128
+ pg_clipped = -advantages * ratio.clamp(1.0 - clip_eps, 1.0 + clip_eps) # [G, T]
129
+ # Surrogate policy gradient loss (PPO-clip part)
130
+ # This corresponds to the -E[min(...)] in the formula.
131
+ policy_loss = torch.max(pg_unclipped, pg_clipped) * pad_mask.float()
132
+ if return_info:
133
+ valid_steps = pad_mask.sum().clamp_min(1.0)
134
+ mean_padded_ratio = (ratio * pad_mask.float()).sum() / valid_steps
135
+ clip_fraction_mask = (ratio > (1.0 + clip_eps)) | (ratio < (1.0 - clip_eps))
136
+ mean_clip_fraction = (clip_fraction_mask.float() * pad_mask.float()).sum() / valid_steps
137
+ return policy_loss, mean_padded_ratio, mean_clip_fraction # [G, T], scalar, scalar
138
+ return policy_loss # [G, T]
139
+
140
+
141
+ def kl_penalty(logprobs_new: torch.Tensor,
142
+ logprobs_old: torch.Tensor,
143
+ pad_mask: torch.Tensor | None = None) -> torch.Tensor:
144
+ """
145
+ Compute KL divergence penalty between old and new policies.
146
+
147
+ Args:
148
+ logprobs_new: New policy log probabilities
149
+ logprobs_old: Old policy log probabilities
150
+ pad_mask: Optional mask for valid steps
151
+
152
+ Returns:
153
+ Mean KL divergence over valid steps
154
+ """
155
+ if pad_mask is None:
156
+ pad_mask = torch.ones_like(logprobs_new, dtype=torch.bool)
157
+ return (logprobs_old - logprobs_new)[pad_mask].mean()
158
+
159
+
160
+ def grpo_ppo_loss(
161
+ logprobs_new: torch.Tensor, # [B, G, T] or [G, T]
162
+ logprobs_old: torch.Tensor, # [B, G, T] or [G, T]
163
+ step_rewards: torch.Tensor, # [B, G, T] or [G, T] - per-step rewards
164
+ pad_mask: torch.Tensor | None = None, # [B, G, T] or [G, T]
165
+ clip_ratio: float = 0.2, # PPO clipping ratio (epsilon in paper)
166
+ kl_coef: float = 0.01, # KL penalty coefficient (beta in paper)
167
+ entropy_coef: float = 0.1, # Entropy bonus coefficient (prevents policy collapse)
168
+ return_info: bool = False, # Return extra info for logging
169
+ ) -> torch.Tensor | Tuple[torch.Tensor, GRPOLossInfo]:
170
+ """
171
+ Compute GRPO (Group Relative Policy Optimization) loss with PPO clipping.
172
+
173
+ This combines PPO-clip loss with KL divergence penalty and optional entropy bonus.
174
+ Advantages are computed per-step by normalizing step rewards across trajectories
175
+ (G dimension) for each timestep.
176
+
177
+ Args:
178
+ logprobs_new: New policy log probabilities [B, G, T] or [G, T]
179
+ logprobs_old: Old policy log probabilities [B, G, T] or [G, T]
180
+ step_rewards: Per-step rewards [B, G, T] or [G, T]
181
+ pad_mask: Mask indicating valid steps, True=valid, False=padding
182
+ clip_ratio: PPO clipping ratio (default: 0.2)
183
+ kl_coef: KL divergence penalty coefficient (default: 0.01)
184
+ entropy_coef: Entropy bonus coefficient (default: 0.1; set to 0 to disable)
185
+ return_info: If True, return GRPOLossInfo for logging
186
+
187
+ Returns:
188
+ If return_info=False: scalar loss tensor
189
+ If return_info=True: tuple of (loss, GRPOLossInfo)
190
+ """
191
+ # Handle 2D input (no batch dimension) by adding batch dimension
192
+ if logprobs_new.ndim == 2:
193
+ logprobs_new = logprobs_new.unsqueeze(0)
194
+ logprobs_old = logprobs_old.unsqueeze(0)
195
+ step_rewards = step_rewards.unsqueeze(0)
196
+ if pad_mask is not None:
197
+ pad_mask = pad_mask.unsqueeze(0)
198
+
199
+ if pad_mask is None:
200
+ pad_mask = torch.ones_like(logprobs_new, dtype=torch.bool)
201
+
202
+ # Compute per-step advantages (normalized across G for each timestep)
203
+ advantages = step_group_advantage(step_rewards, pad_mask).detach() # [B, G, T]
204
+
205
+ ppo_loss, mean_ratio, mean_clip_fraction = ppo_chess_loss(logprobs_new,
206
+ logprobs_old,
207
+ advantages,
208
+ clip_ratio,
209
+ pad_mask,
210
+ return_info=True)
211
+ valid_steps = pad_mask.sum().clamp_min(1)
212
+ ppo_loss = ppo_loss.sum() / valid_steps
213
+ kl_div = kl_penalty(logprobs_new, logprobs_old, pad_mask)
214
+
215
+ # Entropy bonus: H(π) ≈ -E[log π(a|s)] encourages exploration
216
+ # We use the negative log_probs of selected actions as an estimate
217
+ entropy = -logprobs_new[pad_mask].mean()
218
+
219
+ # Loss components:
220
+ # - loss_without_entropy = PPO loss + KL penalty
221
+ # - total loss = loss_without_entropy - entropy bonus
222
+ loss_without_entropy = ppo_loss + kl_coef * kl_div
223
+ loss = loss_without_entropy - entropy_coef * entropy
224
+
225
+ if return_info:
226
+ return loss, GRPOLossInfo(
227
+ kl_div=kl_div.detach(),
228
+ mean_ratio=mean_ratio.detach(),
229
+ mean_clip_fraction=mean_clip_fraction.detach(),
230
+ ppo_loss=ppo_loss.detach(),
231
+ entropy=entropy.detach(),
232
+ loss_without_entropy=loss_without_entropy.detach(),
233
+ )
234
+ return loss
235
+
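
A minimal smoke test of the loss on toy tensors; the shapes matter, the numbers are arbitrary:

import torch
from src.grpo_self_play.grpo_logic.loss import grpo_ppo_loss

G, T = 4, 3  # four trajectories, three plies each
logprobs_old = torch.randn(G, T)
logprobs_new = logprobs_old + 0.01 * torch.randn(G, T)  # a small policy update
step_rewards = torch.randn(G, T)

loss, info = grpo_ppo_loss(logprobs_new, logprobs_old, step_rewards, return_info=True)
print(loss.item(), info.kl_div.item(), info.mean_clip_fraction.item())
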
hf_space_repo/grpo_logic/model.py ADDED
@@ -0,0 +1,782 @@
1
+ from typing import Optional
2
+ import torch
3
+ import pytorch_lightning as pl
4
+ import chess
5
+
6
+ from dataclasses import dataclass
7
+
8
+ from src.grpo_self_play.evaluator import Evaluator
9
+ from src.grpo_self_play.models import ChessTransformer, ChessTransformerConfig
10
+ from src.grpo_self_play.grpo_logic.loss import grpo_ppo_loss
11
+ from src.grpo_self_play.grpo_logic.sampling import sample_trajectories_batched
12
+ from src.grpo_self_play.eval_utils import EvalConfig
13
+ from src.grpo_self_play.chess.policy_player import PolicyConfig
14
+ from src.grpo_self_play.chess.searcher import SearchConfig
15
+ from src.grpo_self_play.chess.stockfish import StockfishConfig
16
+ from src.grpo_self_play.pretrain.pretrain_load_config import PretrainLoadConfig
17
+
18
+
19
+ class EntropyFloorMonitor:
20
+ """Monitors entropy and takes action when it falls below a floor (Recommendation 1).
21
+
22
+ Tracks consecutive steps where entropy is below a threshold and triggers
23
+ configurable actions (warn, stop, or boost entropy_coef) when the threshold
24
+ is breached for too long.
25
+ """
26
+
27
+ def __init__(self, floor: float, steps_threshold: int, action: str, boost_factor: float):
28
+ """
29
+ Args:
30
+ floor: Minimum entropy threshold
31
+ steps_threshold: Consecutive steps below floor before action
32
+ action: Action to take ("warn", "stop", "boost")
33
+ boost_factor: Factor to multiply entropy_coef when boosting
34
+ """
35
+ self.floor = floor
36
+ self.steps_threshold = steps_threshold
37
+ self.action = action
38
+ self.boost_factor = boost_factor
39
+ self.consecutive_low_steps = 0
40
+ self.triggered = False
41
+
42
+ def check(self, entropy: float, current_entropy_coef: float) -> tuple[float, dict]:
43
+ """Check entropy and return updated entropy_coef and metrics.
44
+
45
+ Args:
46
+ entropy: Current entropy value
47
+ current_entropy_coef: Current entropy coefficient
48
+
49
+ Returns:
50
+ Tuple of (new_entropy_coef, metrics_dict)
51
+ """
52
+ metrics = {}
53
+ new_entropy_coef = current_entropy_coef
54
+
55
+ if entropy < self.floor:
56
+ self.consecutive_low_steps += 1
57
+
58
+ if self.consecutive_low_steps >= self.steps_threshold and not self.triggered:
59
+ self.triggered = True
60
+ if self.action == "warn":
61
+ print(f"WARNING: Entropy collapse detected! Entropy={entropy:.4f} < floor={self.floor} "
62
+ f"for {self.consecutive_low_steps} consecutive steps.")
63
+ elif self.action == "stop":
64
+ raise RuntimeError(
65
+ f"STOPPING: Entropy collapse detected! Entropy={entropy:.4f} < floor={self.floor} "
66
+ f"for {self.consecutive_low_steps} consecutive steps.")
67
+ elif self.action == "boost":
68
+ new_entropy_coef = current_entropy_coef * self.boost_factor
69
+ print(f"BOOSTING entropy_coef: {current_entropy_coef:.4f} -> {new_entropy_coef:.4f} "
70
+ f"(entropy={entropy:.4f} < floor={self.floor})")
71
+ self.consecutive_low_steps = 0
72
+ self.triggered = False
73
+ else:
74
+ self.consecutive_low_steps = 0
75
+ self.triggered = False
76
+
77
+ metrics["entropy_floor/consecutive_low_steps"] = self.consecutive_low_steps
78
+ metrics["entropy_floor/below_floor"] = float(entropy < self.floor)
79
+ metrics["entropy_floor/current_entropy_coef"] = new_entropy_coef
80
+
81
+ return new_entropy_coef, metrics
82
+
83
+
+ def compute_group_collapse_metrics(
+     actions: torch.Tensor,
+     group_rewards: torch.Tensor,
+     step_rewards: torch.Tensor,
+     pad_mask: torch.Tensor,
+ ) -> dict:
+     """Compute within-board group collapse metrics (Recommendation 4).
+
+     These metrics directly measure whether all G trajectories from the same board
+     are converging to the same moves, which is the key failure mode in entropy collapse.
+
+     Args:
+         actions: Action indices [B, G, T]
+         group_rewards: Final rewards for each trajectory [B, G]
+         step_rewards: Per-step rewards [B, G, T]
+         pad_mask: Mask indicating valid steps [B, G, T], True=valid
+
+     Returns:
+         Dictionary of metrics for logging
+     """
+     B, _, T = actions.shape
+     metrics = {}
+
+     # 1. Action agreement: for each (b, t), what fraction of trajectories chose the most common action?
+     #    agreement[b, t] = max_count(actions[b, :, t]) / num_valid
+     action_agreement = torch.zeros(B, T, device=actions.device)
+     for b in range(B):
+         for t in range(T):
+             if pad_mask[b, :, t].any():  # At least one valid trajectory at this timestep
+                 valid_actions = actions[b, pad_mask[b, :, t], t]
+                 if len(valid_actions) > 0:
+                     # Count occurrences of each action
+                     _, counts = valid_actions.unique(return_counts=True)
+                     max_count = counts.max().item()
+                     num_valid = pad_mask[b, :, t].sum().item()
+                     action_agreement[b, t] = max_count / num_valid
+
+     # Mask to only consider valid (b, t) pairs
+     valid_bt_mask = pad_mask.any(dim=1)  # [B, T] - True if any trajectory valid at (b, t)
+     valid_agreements = action_agreement[valid_bt_mask]
+
+     if len(valid_agreements) > 0:
+         metrics["group_collapse/action_agreement_mean"] = valid_agreements.mean().item()
+         metrics["group_collapse/action_agreement_p90"] = valid_agreements.quantile(0.9).item()
+         metrics["group_collapse/action_agreement_max"] = valid_agreements.max().item()
+     else:
+         metrics["group_collapse/action_agreement_mean"] = 0.0
+         metrics["group_collapse/action_agreement_p90"] = 0.0
+         metrics["group_collapse/action_agreement_max"] = 0.0
+
+     # 2. Within-board reward diversity: std(group_rewards[b, :]) for each board b.
+     #    This measures whether trajectories from the same starting position get similar rewards.
+     reward_std_within = group_rewards.std(dim=1)  # [B]
+     metrics["group_collapse/reward_std_within_mean"] = reward_std_within.mean().item()
+     metrics["group_collapse/reward_std_within_min"] = reward_std_within.min().item()
+
+     # 3. Within-board step reward diversity: std(step_rewards[b, :, t]) for each (b, t).
+     #    Only computed for valid (b, t) pairs.
+     step_reward_std_within = torch.zeros(B, T, device=step_rewards.device)
+     for b in range(B):
+         for t in range(T):
+             valid_mask_bt = pad_mask[b, :, t]
+             if valid_mask_bt.sum() > 1:  # Need at least 2 valid trajectories for std
+                 step_reward_std_within[b, t] = step_rewards[b, valid_mask_bt, t].std().item()
+
+     valid_step_stds = step_reward_std_within[valid_bt_mask]
+     if len(valid_step_stds) > 0:
+         metrics["group_collapse/step_reward_std_within_mean"] = valid_step_stds.mean().item()
+         metrics["group_collapse/step_reward_std_within_min"] = valid_step_stds.min().item()
+     else:
+         metrics["group_collapse/step_reward_std_within_mean"] = 0.0
+         metrics["group_collapse/step_reward_std_within_min"] = 0.0
+
+     return metrics
+
+
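+ # Shape sketch (illustrative): with B=2 boards, G=4 trajectories, and T=5 steps,
+ #   compute_group_collapse_metrics(actions,        # [2, 4, 5] long
+ #                                  group_rewards,  # [2, 4] float
+ #                                  step_rewards,   # [2, 4, 5] float
+ #                                  pad_mask)       # [2, 4, 5] bool
+ # returns scalar metrics; an action_agreement_mean near 1.0 means the G
+ # trajectories have collapsed onto the same moves.
+
+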
+ class AdaptiveKLController:
+     """Adapts KL coefficient to maintain target KL divergence (Recommendation 2).
+
+     Implements a simple multiplicative controller that increases kl_coef when
+     KL divergence exceeds target and decreases it when below target.
+     """
+
+     def __init__(self, initial_kl_coef: float, target_kl: float, adapt_rate: float,
+                  kl_coef_min: float, kl_coef_max: float):
+         """
+         Args:
+             initial_kl_coef: Starting KL coefficient
+             target_kl: Target KL divergence value
+             adapt_rate: Multiplicative factor for adjustment
+             kl_coef_min: Minimum allowed kl_coef
+             kl_coef_max: Maximum allowed kl_coef
+         """
+         self.current_kl_coef = initial_kl_coef
+         self.target_kl = target_kl
+         self.adapt_rate = adapt_rate
+         self.kl_coef_min = kl_coef_min
+         self.kl_coef_max = kl_coef_max
+
+     def update(self, kl_div: float) -> dict:
+         """Update KL coefficient based on current KL divergence.
+
+         Args:
+             kl_div: Current KL divergence value
+
+         Returns:
+             Metrics dict for logging
+         """
+         if kl_div > self.target_kl:
+             self.current_kl_coef = min(self.current_kl_coef * self.adapt_rate, self.kl_coef_max)
+         else:
+             self.current_kl_coef = max(self.current_kl_coef / self.adapt_rate, self.kl_coef_min)
+
+         return {
+             "adaptive_kl/current_kl_coef": self.current_kl_coef,
+             "adaptive_kl/target_kl": self.target_kl,
+             "adaptive_kl/kl_ratio": kl_div / self.target_kl if self.target_kl > 0 else 0.0,
+         }
+
+
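+ # Worked example (illustrative): with adapt_rate=1.2 and target_kl=0.015,
+ #   ctrl = AdaptiveKLController(initial_kl_coef=0.01, target_kl=0.015,
+ #                               adapt_rate=1.2, kl_coef_min=0.003, kl_coef_max=0.05)
+ #   ctrl.update(kl_div=0.03)   # above target -> kl_coef rises to 0.012
+ #   ctrl.update(kl_div=0.005)  # below target -> kl_coef falls back to 0.01
+
+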
+ @dataclass
+ class GRPOConfig:
+     """Configuration for GRPO (Group Relative Policy Optimization) training.
+
+     Attributes:
+         lr: Learning rate for optimizer
+         num_trajectories: Number of trajectory groups to sample per batch
+         trajectory_depth: Maximum depth of each trajectory
+         clip_ratio: PPO clipping ratio (epsilon)
+         kl_coef: KL divergence penalty coefficient (beta)
+         entropy_coef: Entropy bonus coefficient (encourages exploration, prevents policy collapse)
+         eval_every_n_epochs: Frequency of evaluation runs (not used in model, but useful for trainer)
+
+         # Entropy floor monitoring (Recommendation 1)
+         use_entropy_floor: Whether to enable entropy floor monitoring
+         entropy_floor: Minimum entropy threshold for collapse detection
+         entropy_floor_steps: Number of consecutive steps below floor before alert/action
+         entropy_floor_action: Action to take when entropy floor is breached ("warn", "stop", "boost")
+         entropy_boost_factor: Factor to multiply entropy_coef when boosting (if action="boost")
+
+         # Adaptive KL controller (Recommendation 2)
+         adaptive_kl: Whether to use adaptive KL coefficient
+         target_kl: Target KL divergence value
+         kl_adapt_rate: Rate at which to adjust kl_coef (higher = faster adaptation)
+         kl_coef_min: Minimum allowed kl_coef
+         kl_coef_max: Maximum allowed kl_coef
+
+         # PPO-style multiple updates
+         ppo_steps: Number of optimization steps per sampled trajectory batch (reuses samples)
+
+         # Rollout temperature for exploration
+         rollout_temperature: Temperature for action sampling during rollouts (>1 increases exploration)
+
+         # Safety checks on training dynamics
+         enable_safety_checks: Whether to abort training when known-bad patterns persist
+         safety_patience_steps: Number of training steps to tolerate violations before aborting
+         max_clip_fraction: If mean_clip_fraction > this for too long -> abort
+         min_entropy: If entropy < this for too long -> abort
+         max_kl_divergence: If KL >> target_kl for too long -> abort
+     """
+     # Clean run defaults (see research_docs/2026-02-06_loss-budget-and-monitor-analysis.md)
+     lr: float = 1e-6  # Reduced: PPO signal now dominates gradient
+     num_trajectories: int = 4
+     trajectory_depth: int = 5
+     clip_ratio: float = 0.2
+     kl_coef: float = 0.001  # Reduced from 0.01 (was overridden to 0.1 by adaptive KL)
+     entropy_coef: float = 0.0  # Removed: not in original GRPO loss, was 95% of gradient
+     eval_every_n_epochs: int = 10
+
+     # Entropy floor monitoring — disabled by default (never triggered in practice)
+     use_entropy_floor: bool = False
+     entropy_floor: float = 1.5
+     entropy_floor_steps: int = 200
+     entropy_floor_action: str = "boost"
+     entropy_boost_factor: float = 2.0
+
+     # Adaptive KL controller — disabled by default (saturated at max instantly)
+     adaptive_kl: bool = False
+     target_kl: float = 0.015
+     kl_adapt_rate: float = 1.2
+     kl_coef_min: float = 0.003
+     kl_coef_max: float = 0.05
+
+     # PPO-style multiple updates per sample
+     ppo_steps: int = 1
+
+     # Rollout temperature for exploration (>1 flattens distribution, increases entropy)
+     rollout_temperature: float = 1.0
+
+     # Safety checks on training dynamics
+     enable_safety_checks: bool = False
+     safety_patience_steps: int = 1000  # Number of training steps to tolerate violations
+     # Thresholds derived from prior research docs
+     max_clip_fraction: float = 0.95  # If mean_clip_fraction > this for too long -> abort
+     min_entropy: float = 0.5  # If entropy < this for too long -> abort
+     max_kl_divergence: float = 0.08  # If KL >> target_kl for too long -> abort
+
+     # Teacher forcing: use Stockfish for rival moves during trajectory sampling
+     teacher_forcing_prob: float = 0.0  # Probability of using Stockfish for rival (opponent) moves
+     teacher_forcing_depth: int = 4  # Stockfish search depth for teacher forcing moves
+
+
+ # Register as safe for torch.load with weights_only=True (PyTorch 2.6+ compatibility)
+ torch.serialization.add_safe_globals([GRPOConfig])
+
+
+ class GRPOChessTransformer(pl.LightningModule):
+     """PyTorch Lightning module for training chess policy with GRPO.
+
+     This module implements Group Relative Policy Optimization (GRPO) for training
+     a chess transformer policy. It maintains both a current policy and an old policy
+     for computing importance sampling ratios in the PPO loss.
+
+     Attributes:
+         policy_model: Current policy model being trained
+         old_policy_model: Frozen copy of policy for importance sampling
+         evaluator: Evaluator for running games against Stockfish
+         eval_every_n_epochs: Frequency of evaluation runs
+         entropy_monitor: Optional entropy floor monitor (Recommendation 1)
+         kl_controller: Optional adaptive KL controller (Recommendation 2)
+         current_entropy_coef: Current entropy coefficient (mutable for entropy boosting)
+         automatic_optimization: Set to False for manual PPO steps
+     """
+     automatic_optimization = False  # Manual optimization for ppo_steps
+
+     def __init__(self,
+                  transformer_config: ChessTransformerConfig,
+                  grpo_config: GRPOConfig,
+                  eval_cfg: EvalConfig | None = None,
+                  stockfish_cfg: StockfishConfig | None = None,
+                  policy_cfg: PolicyConfig | None = None,
+                  searcher_cfg: SearchConfig | None = None,
+                  pretrain_cfg: PretrainLoadConfig | None = None):
+         """
+         Initialize the GRPO Chess Transformer.
+
+         Args:
+             transformer_config: Configuration for the chess transformer model
+             grpo_config: GRPO training configuration
+             eval_cfg: Optional evaluation configuration
+             stockfish_cfg: Optional Stockfish configuration for evaluation
+             policy_cfg: Optional policy player configuration
+             searcher_cfg: Optional search configuration
+             pretrain_cfg: Optional pretrain config for loading pretrained weights
+         """
+         super().__init__()
+         self.save_hyperparameters()
+         self.policy_model = ChessTransformer(transformer_config)
+         self.old_policy_model = ChessTransformer(transformer_config)
+
+         # Load pretrained weights if specified
+         if pretrain_cfg and pretrain_cfg.checkpoint_path:
+             self._load_pretrained_weights(pretrain_cfg)
+
+         self._sync_old_policy()
+
+         # Evaluation config
+         self.eval_every_n_epochs = grpo_config.eval_every_n_epochs
+         self.evaluator = Evaluator(eval_cfg=eval_cfg or EvalConfig(),
+                                    policy_cfg=policy_cfg or PolicyConfig(),
+                                    stockfish_cfg=stockfish_cfg or StockfishConfig(),
+                                    searcher_cfg=searcher_cfg)
+
+         # Entropy floor monitor (Recommendation 1) - optional
+         self.entropy_monitor: EntropyFloorMonitor | None = None
+         if grpo_config.use_entropy_floor:
+             self.entropy_monitor = EntropyFloorMonitor(
+                 floor=grpo_config.entropy_floor,
+                 steps_threshold=grpo_config.entropy_floor_steps,
+                 action=grpo_config.entropy_floor_action,
+                 boost_factor=grpo_config.entropy_boost_factor,
+             )
+         self.current_entropy_coef = grpo_config.entropy_coef
+
+         # Adaptive KL controller (Recommendation 2) - optional
+         self.kl_controller: AdaptiveKLController | None = None
+         if grpo_config.adaptive_kl:
+             self.kl_controller = AdaptiveKLController(
+                 initial_kl_coef=grpo_config.kl_coef,
+                 target_kl=grpo_config.target_kl,
+                 adapt_rate=grpo_config.kl_adapt_rate,
+                 kl_coef_min=grpo_config.kl_coef_min,
+                 kl_coef_max=grpo_config.kl_coef_max,
+             )
+
+         # Safety-check state (for tracking persistent violations)
+         self._safety_step_idx: int = 0
+         self._high_clip_steps: int = 0
+         self._low_entropy_steps: int = 0
+         self._high_kl_steps: int = 0
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         """Forward pass through the current policy model.
+
+         Args:
+             x: Input tensor [batch, seq_len]
+
+         Returns:
+             Policy logits [batch, action_dim]
+         """
+         return self.policy_model(x)
+
+     def _old_forward(self, x: torch.Tensor) -> torch.Tensor:
+         """Forward pass through the old (frozen) policy model.
+
+         Args:
+             x: Input tensor [batch, seq_len]
+
+         Returns:
+             Policy logits [batch, action_dim]
+         """
+         return self.old_policy_model(x)
+
+     def _sync_old_policy(self) -> None:
+         """Synchronize old policy model with current policy and freeze it."""
+         self.old_policy_model.load_state_dict(self.policy_model.state_dict())
+         # Freeze old policy parameters
+         for param in self.old_policy_model.parameters():
+             param.requires_grad = False
+
+     def _load_pretrained_weights(self, pretrain_cfg: PretrainLoadConfig) -> None:
+         """Load pretrained weights from a checkpoint.
+
+         Args:
+             pretrain_cfg: Pretrain configuration with checkpoint path and freeze settings
+         """
+         checkpoint_path = pretrain_cfg.checkpoint_path
+         print(f"Loading pretrained weights from: {checkpoint_path}")
+
+         checkpoint = torch.load(checkpoint_path, map_location='cpu', weights_only=False)
+
+         # Handle different checkpoint formats
+         if 'model_state_dict' in checkpoint:
+             state_dict = checkpoint['model_state_dict']
+         elif 'state_dict' in checkpoint:
+             # Lightning checkpoint format - extract policy_model weights
+             state_dict = {}
+             for k, v in checkpoint['state_dict'].items():
+                 if k.startswith('model.'):
+                     # From PretrainChessTransformer
+                     state_dict[k[6:]] = v  # Remove 'model.' prefix
+                 elif k.startswith('policy_model.'):
+                     # From GRPOChessTransformer
+                     state_dict[k[13:]] = v  # Remove 'policy_model.' prefix
+         else:
+             # Assume it's a raw state dict
+             state_dict = checkpoint
+
+         # Load into policy model
+         missing, unexpected = self.policy_model.load_state_dict(state_dict, strict=False)
+         if missing:
+             print(f"Warning: Missing keys in pretrained checkpoint: {missing}")
+         if unexpected:
+             print(f"Warning: Unexpected keys in pretrained checkpoint: {unexpected}")
+
+         print("Successfully loaded pretrained weights")
+
+         # Optionally freeze transformer layers
+         if pretrain_cfg.freeze_layers > 0:
+             self._freeze_transformer_layers(pretrain_cfg.freeze_layers)
+
+     def _freeze_transformer_layers(self, num_layers: int) -> None:
+         """Freeze the first N transformer encoder layers.
+
+         Args:
+             num_layers: Number of layers to freeze (from the bottom)
+         """
+         # Freeze embedding and positional encoding
+         for param in self.policy_model.embedding.parameters():
+             param.requires_grad = False
+         self.policy_model.pos_encoding.requires_grad = False
+
+         # Freeze the specified number of transformer layers
+         for i, layer in enumerate(self.policy_model.transformer.layers):
+             if i < num_layers:
+                 for param in layer.parameters():
+                     param.requires_grad = False
+                 print(f"Froze transformer layer {i}")
+
+         # Count trainable parameters
+         trainable = sum(p.numel() for p in self.policy_model.parameters() if p.requires_grad)
+         total = sum(p.numel() for p in self.policy_model.parameters())
+         print(f"Trainable parameters: {trainable:,} / {total:,} ({100 * trainable / total:.1f}%)")
+
+     def _log_rewards_metrics(self, batch_group_rewards: torch.Tensor, prefix: str = "train/") -> None:
+         """Log reward statistics for monitoring training progress.
+
+         Args:
+             batch_group_rewards: Group rewards tensor [B, G]
+             prefix: Prefix for log keys (default: "train/")
+         """
+         mean_r = batch_group_rewards.mean()
+         best = batch_group_rewards.max()
+         gap = best - mean_r
+
+         self.log(prefix + "avg_reward", mean_r, prog_bar=True)
+         self.log(prefix + "reward_std", batch_group_rewards.std())
+         self.log(prefix + "reward_p50", batch_group_rewards.median())
+         self.log(prefix + "reward_p90", batch_group_rewards.quantile(0.9))
+         self.log(prefix + "reward_best", best)
+         self.log(prefix + "reward_gap_best_minus_mean", gap)
+
+     def on_train_epoch_start(self) -> None:
+         """Called at the start of each training epoch. Syncs old policy."""
+         self._sync_old_policy()
+
+     def _ppo_step(
+         self,
+         trajectories_states: torch.Tensor,
+         trajectories_actions: torch.Tensor,
+         trajectories_old_log_probs: torch.Tensor,
+         trajectories_legal_masks: torch.Tensor | None,
+         step_rewards: torch.Tensor,
+         effective_pad_mask: torch.Tensor,
+     ) -> tuple[torch.Tensor, object]:
+         """Perform a single PPO optimization step.
+
+         Args:
+             trajectories_states: State tensors [B, G, T, SEQ]
+             trajectories_actions: Action indices [B, G, T]
+             trajectories_old_log_probs: Log probs from old policy [B, G, T]
+             trajectories_legal_masks: Legal move masks [B, G, T, A] or None
+             step_rewards: Per-step rewards [B, G, T]
+             effective_pad_mask: Mask for valid steps [B, G, T]
+
+         Returns:
+             Tuple of (loss, loss_info)
+         """
+         # Compute new log probs with current policy
+         new_log_probs = self.policy_model.get_group_log_probs(
+             trajectories_states, trajectories_actions, trajectories_legal_masks
+         )
+
+         # Use current (possibly adapted) coefficients
+         kl_coef = self.kl_controller.current_kl_coef if self.kl_controller else self.hparams.grpo_config.kl_coef
+
+         loss, loss_info = grpo_ppo_loss(
+             new_log_probs,
+             trajectories_old_log_probs,
+             step_rewards,
+             effective_pad_mask,
+             clip_ratio=self.hparams.grpo_config.clip_ratio,
+             kl_coef=kl_coef,
+             entropy_coef=self.current_entropy_coef,
+             return_info=True,
+         )
+
+         if not torch.isfinite(loss):
+             raise ValueError(f"Non-finite loss encountered: {loss.item()}")
+
+         return loss, loss_info
+
+     def _run_safety_checks(self, loss_info) -> None:
+         """Run safety checks on training dynamics and abort if they persistently fail."""
+         cfg = self.hparams.grpo_config
+         if not cfg.enable_safety_checks:
+             return
+
+         self._safety_step_idx += 1
+
+         # 1) PPO clipping saturation
+         if loss_info.mean_clip_fraction.item() > cfg.max_clip_fraction:
+             self._high_clip_steps += 1
+         else:
+             self._high_clip_steps = 0
+
+         # 2) Entropy collapse
+         if loss_info.entropy.item() < cfg.min_entropy:
+             self._low_entropy_steps += 1
+         else:
+             self._low_entropy_steps = 0
+
+         # 3) Excessive KL divergence
+         if loss_info.kl_div.item() > cfg.max_kl_divergence:
+             self._high_kl_steps += 1
+         else:
+             self._high_kl_steps = 0
+
+         # Log safety counters for debugging
+         self.log("safety/high_clip_steps", float(self._high_clip_steps))
+         self.log("safety/low_entropy_steps", float(self._low_entropy_steps))
+         self.log("safety/high_kl_steps", float(self._high_kl_steps))
+
+         if (
+             self._high_clip_steps >= cfg.safety_patience_steps
+             or self._low_entropy_steps >= cfg.safety_patience_steps
+             or self._high_kl_steps >= cfg.safety_patience_steps
+         ):
+             raise RuntimeError(
+                 "Safety checks triggered: training aborted due to persistent "
+                 f"bad dynamics (clip={loss_info.mean_clip_fraction.item():.3f}, "
+                 f"entropy={loss_info.entropy.item():.3f}, "
+                 f"kl={loss_info.kl_div.item():.4f}). "
+                 "Adjust GRPOConfig or investigate recent research docs."
+             )
+
+     def training_step(self, batch_fens: list[str], batch_idx: int) -> None:
+         """Perform a training step with multiple PPO optimization iterations.
+
+         Samples trajectories once, then performs ppo_steps optimization iterations
+         on the same sampled data to improve compute efficiency.
+
+         Args:
+             batch_fens: List of FEN strings representing starting positions
+             batch_idx: Batch index (unused)
+         """
+         opt = self.optimizers()
+
+         boards = [chess.Board(start_fen) for start_fen in batch_fens]
+         boards = [board for board in boards if not board.is_game_over()]
+         if not boards:
+             return  # Skip if all games are over
+
+         trajectories_sample = sample_trajectories_batched(
+             self.old_policy_model,
+             boards,
+             self.hparams.grpo_config.num_trajectories,
+             self.hparams.grpo_config.trajectory_depth,
+             temperature=self.hparams.grpo_config.rollout_temperature,
+             teacher_forcing_prob=self.hparams.grpo_config.teacher_forcing_prob,
+             teacher_forcing_depth=self.hparams.grpo_config.teacher_forcing_depth,
+         )
+         if trajectories_sample is None:
+             return  # Skip if no moves
+
+         # Extract trajectory components (sampled once, reused for ppo_steps)
+         trajectories_old_log_probs = trajectories_sample.trajectories_log_probs  # [B, G, T]
+         trajectories_actions = trajectories_sample.trajectories_actions  # [B, G, T]
+         trajectories_states = trajectories_sample.trajectories_states  # [B, G, T, SEQ]
+         batch_group_rewards = trajectories_sample.group_rewards  # [B, G] (for logging)
+         step_rewards = trajectories_sample.step_rewards  # [B, G, T]
+         pad_mask = trajectories_sample.pad_mask  # [B, G, T]
+         trajectories_legal_masks = trajectories_sample.trajectories_legal_masks  # [B, G, T, A] or None
+
+         # Add starting-player mask (only consider moves from the starting player's perspective)
+         _, _, T = pad_mask.shape
+         t = torch.arange(T, device=pad_mask.device)
+         start_player_mask = (t % 2 == 0)[None, None, :]  # [1, 1, T]
+         effective_pad_mask = pad_mask & start_player_mask  # [B, G, T]
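+         # Illustrative: with trajectory_depth T=5 this keeps timesteps 0, 2 and 4,
+         # i.e. the starting player's moves; odd indices are the rival's replies.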
+
+         ppo_steps = self.hparams.grpo_config.ppo_steps
+
+         # Perform multiple PPO optimization steps on the same sampled trajectories
+         for ppo_step_idx in range(ppo_steps):
+             loss, loss_info = self._ppo_step(
+                 trajectories_states,
+                 trajectories_actions,
+                 trajectories_old_log_probs,
+                 trajectories_legal_masks,
+                 step_rewards,
+                 effective_pad_mask,
+             )
+
+             # Manual optimization step
+             opt.zero_grad()
+             self.manual_backward(loss)
+             self.clip_gradients(opt, gradient_clip_val=1.0, gradient_clip_algorithm="norm")
+             opt.step()
+
+             # Entropy floor monitoring (Recommendation 1) - only on last ppo_step
+             if ppo_step_idx == ppo_steps - 1 and self.entropy_monitor is not None:
+                 self.current_entropy_coef, entropy_metrics = self.entropy_monitor.check(
+                     loss_info.entropy.item(), self.current_entropy_coef
+                 )
+                 for key, value in entropy_metrics.items():
+                     self.log(key, value)
+
+             # Adaptive KL controller (Recommendation 2) - only on last ppo_step
+             if ppo_step_idx == ppo_steps - 1 and self.kl_controller is not None:
+                 kl_metrics = self.kl_controller.update(loss_info.kl_div.item())
+                 for key, value in kl_metrics.items():
+                     self.log(key, value)
+
+         # Within-board group collapse metrics (Recommendation 4) - log once per training_step
+         collapse_metrics = compute_group_collapse_metrics(
+             trajectories_actions, batch_group_rewards, step_rewards, pad_mask
+         )
+         for key, value in collapse_metrics.items():
+             self.log(key, value)
+
+         # Standard logging (log final ppo_step metrics)
+         valid_mask = pad_mask.float()  # [B, G, T], 1 = real step
+
+         self.log("train_total_loss", loss, prog_bar=True)
+         self.log("pad_fraction", 1.0 - valid_mask.mean())
+         self.log("avg_trajectory_length", pad_mask.float().sum(dim=-1).mean())
+
+         self.log("mean_kl_divergence", loss_info.kl_div)
+         self.log("mean_ratio", loss_info.mean_ratio)
+         self.log("mean_clip_fraction", loss_info.mean_clip_fraction)
+         self.log("ppo_loss", loss_info.ppo_loss)
+         self.log("entropy", loss_info.entropy)
+         # Loss without the entropy bonus term (PPO + KL only)
+         self.log("train/loss_without_entropy", loss_info.loss_without_entropy)
+         self.log("ppo_steps", float(ppo_steps))
+         self._log_rewards_metrics(batch_group_rewards, prefix="train/")
+
+         # Log step reward statistics (only for valid steps)
+         valid_step_rewards = step_rewards[pad_mask]
+         self.log("train/step_reward_mean", valid_step_rewards.mean())
+         self.log("train/step_reward_std", valid_step_rewards.std())
+
+         # Log raw centipawn step rewards (before normalization) for debugging
+         raw_step_cp = trajectories_sample.raw_step_cp
+         valid_raw_step_cp = raw_step_cp[pad_mask]
+         self.log("train/raw_step_cp_mean", valid_raw_step_cp.mean())
+         self.log("train/raw_step_cp_std", valid_raw_step_cp.std())
+         self.log("train/raw_step_cp_abs_mean", valid_raw_step_cp.abs().mean())
+
+         # Run safety checks on the final loss statistics
+         self._run_safety_checks(loss_info)
+
+     def configure_optimizers(self) -> torch.optim.Adam:
+         """Configure optimizer for training.
+
+         Returns:
+             Adam optimizer with learning rate from GRPO config
+         """
+         return torch.optim.Adam(self.parameters(), lr=self.hparams.grpo_config.lr)
+
+     def _evaluate_against_stockfish(self) -> Optional[tuple[dict, list[str]]]:
+         """Run a single game evaluation against Stockfish with current policy model.
+
+         Returns:
+             Tuple of (results_dict, pgns) or None if evaluation failed.
+             pgns is a list of PGN strings for all games played.
+         """
+         was_training = self.training
+         self.eval()
+         try:
+             with torch.no_grad():
+                 results, _, pgns = self.evaluator.single_evaluation(self.policy_model)
+             return results, pgns
+         except Exception as e:
+             # Prefer the attached logger when it supports warning(); fall back to stdout
+             message = f"Evaluation against Stockfish failed: {e}"
+             if getattr(self, 'logger', None) is not None and hasattr(self.logger, 'warning'):
+                 self.logger.warning(message)
+             else:
+                 print(message)
+             return None
+         finally:
+             if was_training:
+                 self.train()
+
+     def _log_stockfish_eval(self, results: dict) -> None:
+         """Log scalar evaluation metrics from the Stockfish evaluation.
+
+         Args:
+             results: Dictionary containing evaluation results with keys:
+                 - games: Total number of games played
+                 - wins: Number of wins
+                 - draws: Number of draws
+                 - losses: Number of losses
+                 - score: Win rate (0-1)
+                 - elo_diff_vs_stockfish_approx: Approximate Elo difference
+                 - termination_reasons: Dict mapping termination reasons to counts
+         """
+         # Scalar stats
+         self.log("eval_stockfish/games", results["games"])
+         self.log("eval_stockfish/wins", results["wins"])
+         self.log("eval_stockfish/draws", results["draws"])
+         self.log("eval_stockfish/losses", results["losses"])
+         self.log("eval_stockfish/score", results["score"], prog_bar=True)
+         self.log("eval_stockfish/elo_diff", results["elo_diff_vs_stockfish_approx"], prog_bar=True)
+
+         # Termination reasons as fractions
+         games = results["games"] or 1
+         for reason, cnt in results["termination_reasons"].items():
+             frac = cnt / games
+             self.log(f"eval_stockfish/term_{reason}", frac)
+
+     def _log_pgns(self, pgns: list[str]) -> None:
+         """Log PGNs to WandB as a text artifact.
+
+         Args:
+             pgns: List of PGN strings for all games played
+         """
+         if not pgns:
+             return
+
+         # Combine all PGNs into a single string
+         combined_pgn = "\n\n".join(pgns)
+
+         # Log to WandB if available
+         if self.logger and hasattr(self.logger, 'experiment'):
+             try:
+                 import wandb
+                 # Log as a text artifact
+                 self.logger.experiment.log({
+                     "eval_stockfish/pgns": wandb.Html(f"<pre>{combined_pgn}</pre>"),
+                     "eval_stockfish/pgn_text": combined_pgn,
+                 })
+             except Exception as e:
+                 print(f"Failed to log PGNs to WandB: {e}")
+
+     def on_train_epoch_end(self) -> None:
+         """Called at the end of each training epoch. Runs evaluation if scheduled."""
+         if (self.current_epoch + 1) % self.eval_every_n_epochs == 0:
+             eval_result = self._evaluate_against_stockfish()
+             if eval_result is not None:
+                 results, pgns = eval_result
+                 self._log_stockfish_eval(results)
+                 self._log_pgns(pgns)
hf_space_repo/grpo_logic/sampling.py ADDED
@@ -0,0 +1,243 @@
+ import os
+ import random
+ from typing import List, Optional
+ import chess
+ import chess.engine
+ import torch
+ import torch.nn.functional as F
+ from dataclasses import dataclass
+
+ from src.grpo_self_play.chess.rewards import reward_board, evaluate_board, normalize_cp
+ from src.grpo_self_play.models import ChessTransformer
+ from src.grpo_self_play.searchless_chess_imports import ACTION_TO_MOVE, SEQUENCE_LENGTH, MOVE_TO_ACTION
+ from src.grpo_self_play.chess.chess_logic import board_to_tensor, get_legal_moves_mask
+ from src.grpo_self_play.chess.stockfish import stockfish_play, DEFAULT_STOCKFISH_TIMEOUT
+
+
+ def _get_teacher_engine_name() -> str:
+     """Get a process-specific engine name for teacher forcing."""
+     return f"teacher_forcing_{os.getpid()}"
+
+
+ def get_stockfish_move(board: chess.Board, depth: int = 4, timeout: float = DEFAULT_STOCKFISH_TIMEOUT) -> Optional[chess.Move]:
+     """Get the best move from Stockfish for a given board position.
+
+     Args:
+         board: Chess board position
+         depth: Stockfish search depth
+         timeout: Maximum time to wait for response (seconds)
+
+     Returns:
+         Best move from Stockfish, or None if no move available or on error
+     """
+     limit = chess.engine.Limit(depth=depth)
+     return stockfish_play(_get_teacher_engine_name(), board, limit, timeout=timeout)
+
+
+ # Trajectory sampling logic
+ @dataclass
+ class TrajectoriesSample:
+     """Container for batched trajectory samples.
+
+     Attributes:
+         trajectories_log_probs: Log probabilities of sampled actions [B, G, T]
+         trajectories_actions: Action indices [B, G, T]
+         trajectories_states: State tensors [B, G, T, SEQ]
+         group_rewards: Final rewards for each trajectory group [B, G] (for logging)
+         step_rewards: Per-step rewards [B, G, T] where step_rewards[b, g, t] = eval(s_{t+1}) - eval(s_t)
+         pad_mask: Mask indicating valid steps, True=valid, False=padding [B, G, T]
+         trajectories_legal_masks: Legal move masks [B, G, T, A]
+         raw_step_cp: Raw centipawn step rewards [B, G, T] (for logging, not normalized)
+     """
+     trajectories_log_probs: torch.Tensor  # [B, G, T]
+     trajectories_actions: torch.Tensor  # [B, G, T]
+     trajectories_states: torch.Tensor  # [B, G, T, SEQ]
+     group_rewards: torch.Tensor  # [B, G]
+     step_rewards: torch.Tensor  # [B, G, T]
+     pad_mask: torch.Tensor  # [B, G, T]
+     trajectories_legal_masks: torch.Tensor  # [B, G, T, A]
+     raw_step_cp: torch.Tensor  # [B, G, T] - raw centipawn differences
+
+
+ def batched_policy_step(model: ChessTransformer, boards: List[chess.Board], temperature: float = 1.0) -> Optional[tuple]:
+     """Sample actions from the policy for a batch of boards.
+
+     Args:
+         model: Chess transformer model
+         boards: List of chess board positions
+         temperature: Temperature for sampling
+
+     Returns:
+         Tuple of (action_indices, log_probs, moves, states_tensor, legal_mask) or None if empty
+     """
+     N = len(boards)
+     if N == 0:
+         return None
+     device = next(model.parameters()).device
+     states_list = []
+     legal_masks = []
+     for board in boards:
+         state = board_to_tensor(board, device=device)
+         states_list.append(state)
+         mask = get_legal_moves_mask(board, device=device)
+         if mask.ndim == 2:
+             mask = mask.squeeze(0)
+         assert mask.ndim == 1, f"legal_moves_mask must be 1D [A], got {mask.shape}"
+         legal_masks.append(mask)
+
+     states_tensor = torch.cat(states_list, dim=0)  # [N, SEQ]
+     legal_mask = torch.stack(legal_masks, dim=0)  # [N, A] bool
+     assert legal_mask.dtype == torch.bool, "legal_mask must be bool dtype"
+     assert legal_mask.shape[0] == N, f"legal_mask batch size mismatch {legal_mask.shape[0]} vs {N}"
+     assert legal_mask.shape[1] == model.action_size, f"legal_mask action size mismatch {legal_mask.shape[1]} vs {model.action_size}"
+     if not legal_mask.any(dim=1).all():
+         bad = (~legal_mask.any(dim=1)).nonzero(as_tuple=False).flatten().tolist()
+         raise ValueError(f"Empty legal mask for boards: {bad}")
+     probs = model.get_legal_moves_probs(states_tensor, legal_mask, temperature)  # [N, A]
+
+     action_idx = torch.multinomial(probs, 1).squeeze(1)  # [N]
+     chosen_probs = probs.gather(1, action_idx.unsqueeze(1)).squeeze(1)  # [N]
+     chosen_log_probs = torch.log(chosen_probs + 1e-12)  # [N], avoid log(0)
+
+     # Convert action indices to moves, ensure legality
+     moves = []
+     for i, idx in enumerate(action_idx.tolist()):
+         uci = ACTION_TO_MOVE[idx]
+         move = chess.Move.from_uci(uci)
+         if move not in boards[i].legal_moves:
+             raise ValueError(f"Sampled illegal move {uci} for board:\n{boards[i]}")
+         moves.append(move)
+     return action_idx, chosen_log_probs, moves, states_tensor, legal_mask
+
+
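+ # Usage sketch (illustrative; `model` is an already-constructed ChessTransformer):
+ # one policy step over two fresh boards, unpacking the five-tuple documented above.
+ #   idx, logp, moves, states, mask = batched_policy_step(model, [chess.Board(), chess.Board()])
+
+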
+ def sample_trajectories_batched(model: ChessTransformer,
+                                 boards: List[chess.Board],
+                                 num_trajectories: int,
+                                 trajectory_depth: int,
+                                 reward_depth: int = 4,
+                                 temperature: float = 1.0,
+                                 teacher_forcing_prob: float = 0.0,
+                                 teacher_forcing_depth: int = 4) -> Optional[TrajectoriesSample]:
+     """Sample multiple trajectories from each board position using the policy model.
+
+     Args:
+         model: Chess transformer model for action selection
+         boards: List of starting board positions [B]
+         num_trajectories: Number of trajectory groups per board (G)
+         trajectory_depth: Maximum depth of each trajectory (T)
+         reward_depth: Stockfish depth for reward computation (default: 4)
+         temperature: Temperature for action sampling (default: 1.0, >1 increases exploration)
+         teacher_forcing_prob: Probability of using Stockfish for rival moves (default: 0.0)
+         teacher_forcing_depth: Stockfish depth for teacher forcing moves (default: 4)
+
+     Returns:
+         TrajectoriesSample containing batched trajectory data, or None if no boards
+     """
+     device = next(model.parameters()).device
+     B, G, T = len(boards), num_trajectories, trajectory_depth
+     if B == 0:
+         return None
+
+     # Create B*G copies of the boards for parallel trajectory sampling
+     envs = [boards[b].copy() for b in range(B) for _ in range(G)]  # Length B*G
+     # Per-(b, g) storage as nested lists
+     traj_log_probs = [[[] for _ in range(G)] for _ in range(B)]
+     traj_actions = [[[] for _ in range(G)] for _ in range(B)]
+     traj_states = [[[] for _ in range(G)] for _ in range(B)]
+     traj_legal_masks = [[[] for _ in range(G)] for _ in range(B)]
+     traj_step_rewards = [[[] for _ in range(G)] for _ in range(B)]
+     traj_raw_step_cp = [[[] for _ in range(G)] for _ in range(B)]  # Raw centipawn differences for logging
+
+     # Track POV and the previous raw eval for each trajectory (step rewards are normalized later)
+     pov_is_white = [(boards[b].turn == chess.WHITE) for b in range(B) for _ in range(G)]
+     prev_evals_raw = [evaluate_board(boards[b], pov_is_white[b * G], depth=reward_depth, normalize=False)
+                       for b in range(B) for _ in range(G)]
+
+     # Rollout: sample trajectories in batches
+     for t in range(T):
+         active_env_idx = [i for i, e in enumerate(envs) if not e.is_game_over()]
+         if not active_env_idx:
+             break
+
+         # Determine whether this is the rival's turn (odd timesteps)
+         is_rival_turn = (t % 2 == 1)
+         use_teacher_forcing = is_rival_turn and teacher_forcing_prob > 0 and random.random() < teacher_forcing_prob
+
+         active_boards = [envs[i] for i in active_env_idx]
+         roll_out_step = batched_policy_step(model, active_boards, temperature=temperature)
+         if roll_out_step is None:
+             break
+
+         action_indices, log_probs, moves, states_batch, legal_mask = roll_out_step
+         if action_indices is None:
+             break
+
+         for j, env_idx_j in enumerate(active_env_idx):
+             move_j = moves[j]
+             if move_j is None:
+                 continue  # End of game for this env
+             b_idx = env_idx_j // G
+             g_idx = env_idx_j % G
+             state_j = states_batch[j]
+
+             # Teacher forcing: override the rival's move with Stockfish
+             if use_teacher_forcing:
+                 sf_move = get_stockfish_move(envs[env_idx_j], depth=teacher_forcing_depth)
+                 if sf_move is not None and sf_move in envs[env_idx_j].legal_moves:
+                     move_j = sf_move
+                     # Update the action index to match the Stockfish move
+                     action_indices[j] = MOVE_TO_ACTION[move_j.uci()]
+
+             traj_log_probs[b_idx][g_idx].append(log_probs[j])
+             traj_actions[b_idx][g_idx].append(int(action_indices[j].item()))
+             traj_states[b_idx][g_idx].append(state_j)
+             traj_legal_masks[b_idx][g_idx].append(legal_mask[j])
+             envs[env_idx_j].push(move_j)
+
+             # Compute the step reward: eval(new_state) - eval(prev_state).
+             # Get the raw centipawn value, then normalize for step_rewards.
+             new_eval_raw = evaluate_board(envs[env_idx_j], pov_is_white[env_idx_j], depth=reward_depth, normalize=False)
+             raw_step_cp = new_eval_raw - prev_evals_raw[env_idx_j]
+             step_reward = normalize_cp(new_eval_raw) - normalize_cp(prev_evals_raw[env_idx_j])
+             traj_step_rewards[b_idx][g_idx].append(step_reward)
+             traj_raw_step_cp[b_idx][g_idx].append(raw_step_cp)
+             prev_evals_raw[env_idx_j] = new_eval_raw
+
+     # Compute group_rewards for logging (sum of step rewards = final - initial)
+     group_rewards = torch.zeros(B, G, dtype=torch.float32, device=device)
+     for env_idx, env in enumerate(envs):
+         b_idx = env_idx // G
+         g_idx = env_idx % G
+         group_rewards[b_idx, g_idx] = reward_board(env, boards[b_idx], depth=reward_depth, movetime_ms=0)
+
+     # Allocate padded tensors
+     trajectories_log_probs = torch.zeros(B, G, T, dtype=torch.float32, device=device)
+     trajectories_actions = torch.zeros(B, G, T, dtype=torch.long, device=device)
+     trajectories_states = torch.zeros(B, G, T, SEQUENCE_LENGTH, dtype=torch.long, device=device)
+     trajectories_legal_masks = torch.zeros(B, G, T, model.action_size, dtype=torch.bool, device=device)
+     trajectories_legal_masks[..., 0] = True  # Ensure at least one legal move (empty legal masks -> NaNs in log_softmax)
+     step_rewards = torch.zeros(B, G, T, dtype=torch.float32, device=device)
+     raw_step_cp = torch.zeros(B, G, T, dtype=torch.float32, device=device)
+     pad_mask = torch.zeros(B, G, T, dtype=torch.bool, device=device)
+     for b in range(B):
+         for g in range(G):
+             L = len(traj_log_probs[b][g])
+             assert L <= T, f"Trajectory length {L} exceeds pad_length {T}"
+             pad_mask[b, g, :L] = True
+             if L > 0:  # Guard every stack: torch.stack raises on empty lists
+                 trajectories_log_probs[b, g, :L] = torch.stack(traj_log_probs[b][g], dim=0)
+                 trajectories_actions[b, g, :L] = torch.tensor(traj_actions[b][g], dtype=torch.long, device=device)
+                 trajectories_states[b, g, :L] = torch.stack(traj_states[b][g], dim=0)
+                 trajectories_legal_masks[b, g, :L] = torch.stack(traj_legal_masks[b][g], dim=0)
+                 step_rewards[b, g, :L] = torch.tensor(traj_step_rewards[b][g], dtype=torch.float32, device=device)
+                 raw_step_cp[b, g, :L] = torch.tensor(traj_raw_step_cp[b][g], dtype=torch.float32, device=device)
+
+     return TrajectoriesSample(trajectories_log_probs,
+                               trajectories_actions,
+                               trajectories_states,
+                               group_rewards,
+                               step_rewards,
+                               pad_mask,
+                               trajectories_legal_masks,
+                               raw_step_cp)
+
hf_space_repo/logging_utils.py ADDED
@@ -0,0 +1,32 @@
+ """Logging utilities for GRPO training.
+
+ Uses Python's standard logging module which WandB captures automatically
+ in the Logs tab of a run.
+ """
+
+ import logging
+
+ _initialized_loggers = set()
+
+
+ def get_logger(name: str = "grpo_chess") -> logging.Logger:
+     """Get a logger that appears in the WandB Logs tab.
+
+     Args:
+         name: Logger name (default: "grpo_chess")
+
+     Returns:
+         Configured logger instance
+     """
+     logger = logging.getLogger(name)
+
+     if name not in _initialized_loggers:
+         logger.setLevel(logging.INFO)
+         handler = logging.StreamHandler()
+         handler.setFormatter(logging.Formatter(
+             '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+         ))
+         logger.addHandler(handler)
+         _initialized_loggers.add(name)
+
+     return logger
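+
+
+ # Usage sketch (illustrative):
+ #   logger = get_logger(__name__)
+ #   logger.info("This message is captured in the WandB Logs tab")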
hf_space_repo/models.py ADDED
@@ -0,0 +1,234 @@
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ import chess
+ from typing import Optional
+
+ from dataclasses import dataclass
+ from src.grpo_self_play.searchless_chess_imports import ACTION_TO_MOVE
+ from src.grpo_self_play.chess.chess_logic import board_to_tensor, get_legal_moves_indices
+
+
+ @dataclass
+ class ChessTransformerConfig:
+     """Configuration for the Chess Transformer model.
+
+     Attributes:
+         vocab_size: Size of the vocabulary (token dictionary)
+         embed_dim: Embedding dimension for transformer
+         num_layers: Number of transformer encoder layers
+         num_heads: Number of attention heads
+         action_dim: Dimension of action space (number of possible moves)
+     """
+     vocab_size: int = 300
+     embed_dim: int = 256
+     num_layers: int = 4
+     num_heads: int = 8
+     action_dim: int = 1968
+
+
+ # Register as safe for torch.load with weights_only=True (PyTorch 2.6+ compatibility)
+ torch.serialization.add_safe_globals([ChessTransformerConfig])
+
+
+ class ChessTransformer(nn.Module):
+     """Transformer-based chess policy network.
+
+     Takes FEN-encoded board states as input and outputs action logits.
+     Uses a transformer encoder with learnable positional encodings.
+     """
+     def __init__(self, transformer_config: ChessTransformerConfig):
+         """
+         Initialize the Chess Transformer.
+
+         Args:
+             transformer_config: Configuration for the transformer model
+         """
+         super().__init__()
+         vocab_size = transformer_config.vocab_size
+         embed_dim = transformer_config.embed_dim
+         num_layers = transformer_config.num_layers
+         num_heads = transformer_config.num_heads
+         action_dim = transformer_config.action_dim
+
+         self.embedding = nn.Embedding(vocab_size, embed_dim)
+
+         # DeepMind uses absolute or relative positional encoding.
+         # For simplicity, we use a learnable absolute encoding for the FEN length (~80 chars).
+         self.pos_encoding = nn.Parameter(torch.randn(1, 128, embed_dim))
+
+         encoder_layer = nn.TransformerEncoderLayer(d_model=embed_dim, nhead=num_heads, batch_first=True)
+         self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
+
+         # Head outputs 1968 logits (one for each possible unique move type)
+         self.policy_head = nn.Sequential(
+             nn.Linear(embed_dim, embed_dim),
+             nn.ReLU(),
+             nn.Linear(embed_dim, action_dim)
+         )
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         """Forward pass through the transformer.
+
+         Args:
+             x: Input tensor of token IDs [batch, seq_len]
+
+         Returns:
+             Action logits [batch, action_dim]
+         """
+         batch, seq = x.shape
+
+         # Create padding mask: True indicates a masked position (padding token 0)
+         src_key_padding_mask = (x == 0)
+         x = self.embedding(x) + self.pos_encoding[:, :seq, :]
+
+         # Pass the padding mask to the transformer
+         out = self.transformer(x, src_key_padding_mask=src_key_padding_mask)
+
+         # Pool: mean of the non-masked tokens
+         mask = ~src_key_padding_mask
+         mask_expanded = mask.unsqueeze(-1).float()  # [B, SEQ, 1]
+         pooled = (out * mask_expanded).sum(dim=1) / mask_expanded.sum(dim=1).clamp_min(1)
+
+         return self.policy_head(pooled)
+
+     @property
+     def device(self) -> torch.device:
+         """Get the device of the model parameters."""
+         return next(self.parameters()).device
+
+     @property
+     def action_size(self) -> int:
+         """Get the size of the action space."""
+         return self.policy_head[-1].out_features
+
+     def get_legal_moves_logits(self, tensor_state: torch.Tensor,
+                                legal_moves_mask: torch.Tensor,
+                                temperature: float = 1.0) -> torch.Tensor:
+         """Get logits for legal moves only, masking illegal moves.
+
+         Args:
+             tensor_state: Board state tensor [B, SEQ]
+             legal_moves_mask: Boolean mask for legal moves [B, A]
+             temperature: Temperature for scaling logits
+
+         Returns:
+             Masked logits [B, A] with illegal moves set to -inf
+         """
+         assert legal_moves_mask is not None, "legal_moves_mask cannot be None"
+         logits = self(tensor_state) / temperature
+         return logits.masked_fill(~legal_moves_mask, -float('inf'))
+
+     def get_legal_moves_probs(self, tensor_state: torch.Tensor,
+                               legal_moves_mask: torch.Tensor,
+                               temperature: float = 1.0) -> torch.Tensor:
+         """Get a probability distribution over legal moves.
+
+         Args:
+             tensor_state: Board state tensor [B, SEQ]
+             legal_moves_mask: Boolean mask for legal moves [B, A]
+             temperature: Temperature for scaling logits
+
+         Returns:
+             Probability distribution [B, A] over legal moves
+         """
+         mask_logits = self.get_legal_moves_logits(tensor_state, legal_moves_mask, temperature)
+         return F.softmax(mask_logits, dim=-1)
+
+     def get_group_log_probs(self,
+                             trajectories_states: torch.Tensor,
+                             action_idx: torch.Tensor,
+                             legal_moves_mask: torch.Tensor,
+                             temperature: float = 1.0) -> torch.Tensor:
+         """Get log probabilities for actions in batched trajectories.
+
+         Args:
+             trajectories_states: State tensors [B, G, T, SEQ]
+             action_idx: Action indices [B, G, T]
+             legal_moves_mask: Legal moves mask [B, G, T, A]
+             temperature: Temperature for scaling logits
+
+         Returns:
+             Log probabilities [B, G, T] for the selected actions
+         """
+         assert legal_moves_mask is not None, "legal_moves_mask cannot be None"
+         assert legal_moves_mask.dtype == torch.bool, "legal_moves_mask must be bool dtype"
+         x = trajectories_states  # [B, G, T, SEQ]
+         B, G, T, L = x.shape
+         x_flat = x.view(B * G * T, L)  # [B*G*T, SEQ]
+         legal_moves_mask = legal_moves_mask.view(B * G * T, -1)  # [B*G*T, A]
+         masked_logits = self.get_legal_moves_logits(x_flat, legal_moves_mask, temperature)  # [B*G*T, A]
+         log_probs_all = F.log_softmax(masked_logits, dim=-1)  # [B*G*T, A]
+
+         action_idx_flat = action_idx.view(B * G * T, 1)  # [B*G*T, 1]
+         log_probs_flat = log_probs_all.gather(1, action_idx_flat).squeeze(-1)  # [B*G*T]
+         log_probs = log_probs_flat.view(B, G, T)  # [B, G, T]
+         return log_probs
+
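+     # Shape sketch (illustrative): states [B, G, T, SEQ] and actions [B, G, T]
+     # yield log-probs [B, G, T]; internally everything is flattened to [B*G*T, ...]
+     # so a single forward pass covers every step of every trajectory.
+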
+     def _get_action_logits(self, board: chess.Board, temperature: float = 1.0) -> Optional[torch.Tensor]:
+         """Get action logits for a single board position.
+
+         Args:
+             board: Chess board position
+             temperature: Temperature for scaling logits
+
+         Returns:
+             Logits tensor [1, action_dim] or None if no legal moves
+         """
+         legal_moves = list(board.legal_moves)
+         legal_indices = get_legal_moves_indices(board)
+
+         if not legal_moves:
+             return None
+
+         # Run model
+         state = board_to_tensor(board, device=self.device)
+         logits = self(state)  # [1, A]
+
+         output = torch.full_like(logits, -float('inf'))
+         output[0, legal_indices] = logits[0, legal_indices] / temperature
+         return output
+
+     def select_action(self, board: chess.Board, temperature: float = 1.0) -> tuple[Optional[chess.Move], Optional[torch.Tensor], Optional[int]]:
+         """Sample an action from the policy for a given board position.
+
+         Args:
+             board: Chess board position
+             temperature: Temperature for sampling (higher = more random)
+
+         Returns:
+             Tuple of (move, log_prob, action_idx) or (None, None, None) if no legal moves
+         """
+         logits = self._get_action_logits(board, temperature)
+         if logits is None:
+             return None, None, None
+         logits = logits.squeeze(0)  # Remove batch dimension
+         probs = F.softmax(logits, dim=0)
+
+         # Sample
+         action_idx = int(torch.multinomial(probs, 1).item())
+         chosen_move = ACTION_TO_MOVE[action_idx]
+         log_prob = torch.log(probs[action_idx] + 1e-12)  # Avoid log(0)
+
+         return chess.Move.from_uci(chosen_move), log_prob, action_idx
+
+
+ def select_action_greedy(model: ChessTransformer, board: chess.Board, temperature: float = 1.0) -> Optional[chess.Move]:
+     """Select the best action greedily (no sampling).
+
+     Args:
+         model: Chess transformer model
+         board: Chess board position
+         temperature: Temperature for scaling logits (does not change the argmax choice)
+
+     Returns:
+         Best move or None if no legal moves
+     """
+     logits = model._get_action_logits(board, temperature)
+     if logits is None:
+         return None
+     logits = logits.squeeze(0)  # Remove batch dimension
+     probs = F.softmax(logits, dim=0)
+     action_idx = int(torch.argmax(probs).item())
+     chosen_move = ACTION_TO_MOVE[action_idx]
+     return chess.Move.from_uci(chosen_move)
hf_space_repo/pretrain/README.md ADDED
@@ -0,0 +1,153 @@
+ # Chess Model Pretraining
+
+ This module provides supervised pretraining on expert chess moves from Lichess games before GRPO reinforcement learning fine-tuning.
+
+ ## Overview
+
+ The pretraining pipeline:
+ 1. Streams chess games from HuggingFace (`Lichess/standard-chess-games`)
+ 2. Filters by player ELO rating
+ 3. Extracts positions and moves from games
+ 4. Trains the ChessTransformer with cross-entropy loss on expert moves
+ 5. Saves checkpoints compatible with GRPO training
+
+ ## Quick Start
+
+ ```bash
+ # Run pretraining with default config
+ python -m src.grpo_self_play.pretrain.pretrain --config pretrain.yaml
+
+ # With custom parameters
+ python -m src.grpo_self_play.pretrain.pretrain --config pretrain.yaml \
+     --lr 1e-4 --batch_size 512 --min_elo 1800
+
+ # Disable wandb logging
+ python -m src.grpo_self_play.pretrain.pretrain --no_wandb
+ ```
+
+ ## Configuration
+
+ Configuration is in `src/grpo_self_play/configs/pretrain.yaml`:
+
+ ```yaml
+ pretrain:
+   lr: 0.0001                    # Learning rate
+   batch_size: 256               # Batch size
+   num_epochs: 1                 # Number of epochs
+   warmup_steps: 1000            # Linear warmup steps
+   weight_decay: 0.01            # AdamW weight decay
+   max_grad_norm: 1.0            # Gradient clipping
+   label_smoothing: 0.1          # Prevents overconfidence
+   val_check_interval: 0.1       # Validate every 10% of epoch
+
+ dataset:
+   min_elo: 1800                 # Minimum player rating
+   skip_first_n_moves: 5         # Skip opening moves
+   skip_last_n_moves: 5          # Skip endgame moves
+   sample_positions_per_game: 3  # Positions per game
+   eval_fraction: 0.05           # 5% held out for evaluation
+
+ transformer:
+   embed_dim: 256
+   num_layers: 4
+   num_heads: 8
+ ```
+
+ ## Train/Eval Split
+
+ The dataset uses a **hash-based deterministic split** to ensure:
+ - No data leakage between training and evaluation
+ - Consistent splits across runs
+ - Process-safe multi-worker data loading
+
+ Games are assigned to train or eval based on:
+ ```python
+ is_eval = hash(game_site_url) % 10000 < (eval_fraction * 10000)
+ ```
+
+ This means the same game always goes to the same split, regardless of worker or epoch.
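+
+ A minimal sketch of the same idea using `hashlib` (illustrative only; the real logic lives in `pretrain_dataset.py`, and `is_eval_game` is a hypothetical helper name). Unlike Python's built-in `hash()`, which is salted per process unless `PYTHONHASHSEED` is pinned, an md5 digest is stable across workers and runs:
+
+ ```python
+ import hashlib
+
+ def is_eval_game(game_site_url: str, eval_fraction: float = 0.05) -> bool:
+     # md5 of the game's unique Site URL is stable across processes and epochs
+     digest = hashlib.md5(game_site_url.encode("utf-8")).hexdigest()
+     return int(digest, 16) % 10000 < int(eval_fraction * 10000)
+ ```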
+
+ ## Using Pretrained Weights in GRPO
+
+ After pretraining, use the checkpoint for GRPO fine-tuning by updating `default.yaml`:
+
+ ```yaml
+ pretrain:
+   checkpoint_path: "checkpoints/pretrain/pretrain_final.pt"
+   freeze_layers: 0  # Optional: freeze first N transformer layers
+ ```
+
+ Then run training with that config:
+ ```bash
+ python -m src.grpo_self_play.train_self_play --config default.yaml
+ ```
+
+ ## Module Structure
+
+ ```
+ pretrain/
+ ├── __init__.py              # Package exports
+ ├── pretrain.py              # PyTorch Lightning training module
+ ├── pretrain_dataset.py      # Streaming dataset from HuggingFace
+ ├── pretrain_load_config.py  # Config for loading pretrained weights
+ └── README.md                # This file
+ ```
+
+ ## Key Classes
+
+ ### PretrainChessTransformer
+
+ PyTorch Lightning module that wraps the ChessTransformer for supervised learning.
+
+ ```python
+ from src.grpo_self_play.pretrain.pretrain import PretrainChessTransformer, PretrainConfig
+ from src.grpo_self_play.models import ChessTransformerConfig
+
+ model = PretrainChessTransformer(
+     transformer_config=ChessTransformerConfig(embed_dim=256, num_layers=4, num_heads=8),
+     pretrain_config=PretrainConfig(lr=1e-4, batch_size=256),
+ )
+ ```
+
+ ### ChessPretrainDataset
+
+ Streaming dataset that yields (board_tokens, action, legal_mask) tuples.
+
+ ```python
+ from src.grpo_self_play.pretrain import ChessPretrainDataset, PretrainDatasetConfig
+
+ dataset = ChessPretrainDataset(PretrainDatasetConfig(
+     min_elo=1800,
+     is_eval=False,  # True for the evaluation set
+ ))
+ ```
+
+ ## Metrics
+
+ The following metrics are logged during training:
+
+ | Metric | Description |
+ |--------|-------------|
+ | `train/loss` | Cross-entropy loss with label smoothing |
+ | `train/accuracy` | Top-1 move prediction accuracy |
+ | `train/top5_accuracy` | Top-5 move prediction accuracy |
+ | `train/entropy` | Policy entropy (confidence measure) |
+ | `train/perplexity` | Exponential of loss |
+
+ ## Tests
+
+ Run the test suite:
+ ```bash
+ pytest tests/test_pretrain_pipeline.py -v
+ ```
+
+ Tests cover:
+ - Configuration dataclasses
+ - PGN move parsing
+ - Position extraction from games
+ - UCI to action conversion
+ - Collate function
+ - Model creation and forward pass
+ - Training and validation steps
+ - Hash-based train/eval splitting
+ - Integration with PyTorch Lightning
hf_space_repo/pretrain/__init__.py ADDED
@@ -0,0 +1,15 @@
+ """Pretraining module for chess model."""
+
+ from src.grpo_self_play.pretrain.pretrain_load_config import PretrainLoadConfig
+ from src.grpo_self_play.pretrain.pretrain_dataset import (
+     ChessPretrainDataset,
+     PretrainDatasetConfig,
+     collate_pretrain_batch,
+ )
+
+ __all__ = [
+     "PretrainLoadConfig",
+     "ChessPretrainDataset",
+     "PretrainDatasetConfig",
+     "collate_pretrain_batch",
+ ]
hf_space_repo/pretrain/pretrain.py ADDED
@@ -0,0 +1,579 @@
1
+ """Pretraining script for chess model on Lichess games using PyTorch Lightning.
2
+
3
+ This script trains the ChessTransformer model using supervised learning
4
+ on expert moves from Lichess games before GRPO reinforcement learning.
5
+
6
+ Usage:
7
+ python -m src.grpo_self_play.pretrain.pretrain --config pretrain.yaml
8
+
9
+ # Or with overrides:
10
+ python -m src.grpo_self_play.pretrain.pretrain --config pretrain.yaml \
11
+ --lr 1e-4 --batch_size 512 --min_elo 1800
12
+ """
13
+
14
+ import argparse
15
+ from dataclasses import dataclass
16
+ from pathlib import Path
17
+ from typing import Optional
18
+
19
+ import torch
20
+ import torch.nn.functional as F
21
+ import pytorch_lightning as pl
22
+ from pytorch_lightning.loggers import WandbLogger
23
+ from pytorch_lightning.callbacks import ModelCheckpoint, LearningRateMonitor
24
+ from torch.utils.data import DataLoader
25
+
26
+ from src.grpo_self_play.models import ChessTransformer, ChessTransformerConfig
27
+ from src.grpo_self_play.pretrain.pretrain_dataset import (
28
+ ChessPretrainDataset,
29
+ PretrainDatasetConfig,
30
+ collate_pretrain_batch,
31
+ )
32
+ from src.grpo_self_play.configs.config_loader import (
33
+ load_yaml_file,
34
+ dict_to_dataclass,
35
+ )
36
+
37
+
38
+ @dataclass
39
+ class PretrainConfig:
40
+ """Configuration for pretraining.
41
+
42
+ Attributes:
43
+ lr: Learning rate
44
+ batch_size: Batch size for training
45
+ num_epochs: Number of epochs to train
46
+ warmup_steps: Number of warmup steps for learning rate
47
+ weight_decay: Weight decay for AdamW
48
+ max_grad_norm: Maximum gradient norm for clipping
49
+ checkpoint_dir: Directory to save checkpoints
50
+ resume_from: Path to checkpoint to resume from
51
+ use_wandb: Whether to use Weights & Biases logging
52
+ wandb_project: WandB project name
53
+ label_smoothing: Label smoothing factor for cross-entropy
54
+ num_workers: Number of DataLoader workers
55
+ val_check_interval: Validation check interval (fraction of epoch or int steps)
56
+ """
57
+ lr: float = 1e-4
58
+ batch_size: int = 256
59
+ num_epochs: int = 1
60
+ warmup_steps: int = 1000
61
+ weight_decay: float = 0.01
62
+ max_grad_norm: float = 1.0
63
+ checkpoint_dir: str = "checkpoints/pretrain"
64
+ resume_from: Optional[str] = None
65
+ use_wandb: bool = True
66
+ wandb_project: str = "chess-grpo-pretrain"
67
+ label_smoothing: float = 0.1
68
+ num_workers: int = 4
69
+ val_check_interval: float = 0.1
70
+
71
+
72
+ # Register as safe for torch.load with weights_only=True (PyTorch 2.6+ compatibility)
73
+ torch.serialization.add_safe_globals([PretrainConfig])
74
+
75
+
76
+ class PretrainChessTransformer(pl.LightningModule):
77
+ """PyTorch Lightning module for pretraining chess policy with supervised learning.
78
+
79
+ This module implements supervised learning on expert chess moves from Lichess games.
80
+ The pretrained model can then be fine-tuned with GRPO reinforcement learning.
81
+
82
+ Attributes:
83
+ model: The ChessTransformer policy model
84
+ pretrain_config: Pretraining configuration
85
+ transformer_config: Model architecture configuration
86
+ """
87
+
88
+ def __init__(
89
+ self,
90
+ transformer_config: ChessTransformerConfig,
91
+ pretrain_config: PretrainConfig,
92
+ ):
93
+ """Initialize pretraining module.
94
+
95
+ Args:
96
+ transformer_config: Configuration for the chess transformer model
97
+ pretrain_config: Pretraining configuration
98
+ """
99
+ super().__init__()
100
+ self.save_hyperparameters()
101
+
102
+ self.model = ChessTransformer(transformer_config)
103
+ self.pretrain_config = pretrain_config
104
+ self.transformer_config = transformer_config
105
+
106
+ # For warmup scheduler
107
+ self._num_training_steps = None
108
+
109
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
110
+ """Forward pass through the model.
111
+
112
+ Args:
113
+ x: Input tensor [batch, seq_len]
114
+
115
+ Returns:
116
+ Policy logits [batch, action_dim]
117
+ """
118
+ return self.model(x)
119
+
120
+ def _compute_loss(
121
+ self,
122
+ logits: torch.Tensor,
123
+ targets: torch.Tensor,
124
+ legal_masks: torch.Tensor,
125
+ ) -> tuple[torch.Tensor, dict]:
126
+ """Compute cross-entropy loss with legal move masking.
127
+
128
+ Args:
129
+ logits: Model output logits [B, num_actions]
130
+ targets: Target action indices [B]
131
+ legal_masks: Legal moves mask [B, num_actions]
132
+
133
+ Returns:
134
+ Tuple of (loss, metrics_dict)
135
+ """
136
+ # Validate shapes match
137
+ B, action_dim = logits.shape
138
+ if legal_masks.shape != (B, action_dim):
139
+ raise ValueError(
140
+ f"Shape mismatch: logits {logits.shape} vs legal_masks {legal_masks.shape}. "
141
+ f"Expected legal_masks to be [{B}, {action_dim}]"
142
+ )
143
+ if targets.shape != (B,):
144
+ raise ValueError(
145
+ f"Shape mismatch: targets {targets.shape} vs expected [{B}]"
146
+ )
147
+
148
+ # Validate target actions are within bounds
149
+ max_target = targets.max().item()
150
+ min_target = targets.min().item()
151
+ if max_target >= action_dim or min_target < 0:
152
+ raise ValueError(
153
+ f"Target action indices out of bounds: min={min_target}, max={max_target}, "
154
+ f"action_dim={action_dim}. This suggests a mismatch between dataset action "
155
+ f"space and model action_dim."
156
+ )
157
+
158
+ # Validate target actions are legal (should always be true, but check defensively)
159
+ target_legal = legal_masks.gather(1, targets.unsqueeze(1)).squeeze(1)
160
+ if not target_legal.all():
161
+ illegal_count = (~target_legal).sum().item()
162
+ illegal_indices = (~target_legal).nonzero(as_tuple=False).flatten().tolist()
163
+ raise ValueError(
164
+ f"Found {illegal_count} illegal target actions in batch (out of {B}). "
165
+ f"First few batch indices: {illegal_indices[:10]}. "
166
+ f"This should not happen - dataset should filter these out."
167
+ )
168
+
169
+ # Check for NaN or Inf in raw logits (before masking)
170
+ if not torch.isfinite(logits).all():
171
+ nan_count = (~torch.isfinite(logits)).sum().item()
172
+ raise ValueError(
173
+ f"Found {nan_count} non-finite values in raw logits before masking. "
174
+ f"This suggests the model is outputting NaN/Inf."
175
+ )
176
+
177
+ # Mask illegal moves to -inf
178
+ masked_logits = logits.masked_fill(~legal_masks, float('-inf'))
179
+
180
+ # Check that each sample has at least one legal move (before checking masked logits)
181
+ legal_per_sample = legal_masks.sum(dim=1)
182
+ if (legal_per_sample == 0).any():
183
+ empty_samples = (legal_per_sample == 0).nonzero(as_tuple=False).flatten().tolist()
184
+ raise ValueError(
185
+ f"Found {len(empty_samples)} samples with no legal moves. "
186
+ f"Batch indices: {empty_samples[:10]}. This should not happen."
187
+ )
188
+
189
+ # Check masked logits: each sample must have at least one finite logit (legal move)
190
+ finite_per_sample = torch.isfinite(masked_logits).sum(dim=1)
191
+ if (finite_per_sample == 0).any():
192
+ bad_samples = (finite_per_sample == 0).nonzero(as_tuple=False).flatten().tolist()
193
+ raise ValueError(
194
+ f"Found {len(bad_samples)} samples with all -inf logits after masking. "
195
+ f"Batch indices: {bad_samples[:10]}. This means no legal moves have finite logits."
196
+ )
197
+
198
+ # Ensure target actions are not masked (defensive check)
199
+ target_logits = masked_logits.gather(1, targets.unsqueeze(1)).squeeze(1)
200
+ if not torch.isfinite(target_logits).all():
201
+ inf_count = (~torch.isfinite(target_logits)).sum().item()
202
+ raise ValueError(
203
+ f"Found {inf_count} target actions with -inf logits after masking. "
204
+ f"This means target actions are being masked as illegal, which should not happen."
205
+ )
206
+
207
+ # Compute NLL loss (works correctly with -inf masked logits)
208
+ nll_loss = F.cross_entropy(masked_logits, targets, reduction='mean')
209
+
210
+ # Apply label smoothing only over legal moves to avoid inf from -inf logits
211
+ # Standard F.cross_entropy with label_smoothing averages log_softmax over ALL
212
+ # actions, but -inf logits cause smooth_loss = +inf
213
+ eps = self.pretrain_config.label_smoothing
214
+ if eps > 0:
215
+ # Compute log_softmax (illegal moves will be -inf)
216
+ log_probs = F.log_softmax(masked_logits, dim=-1)
217
+ # Zero out illegal moves so they don't contribute to smoothing term
218
+ log_probs_legal = log_probs.masked_fill(~legal_masks, 0.0)
219
+ # Average only over legal moves
220
+ num_legal = legal_masks.sum(dim=-1).float() # [B]
221
+ smooth_loss = -log_probs_legal.sum(dim=-1) / num_legal # [B]
222
+ loss = (1 - eps) * nll_loss + eps * smooth_loss.mean()
223
+ else:
224
+ loss = nll_loss
225
+
226
+ # Check if loss is infinite or NaN
227
+ if not torch.isfinite(loss):
228
+ # Additional debugging info
229
+ target_logits_debug = masked_logits.gather(1, targets.unsqueeze(1)).squeeze(1)
230
+ print(f"DEBUG: Loss is {loss.item()}")
231
+ print(f"DEBUG: NLL loss: {nll_loss.item()}")
232
+ if eps > 0:
233
+ print(f"DEBUG: Smooth loss mean: {smooth_loss.mean().item()}")
234
+ print(f"DEBUG: Logits shape: {logits.shape}")
235
+ print(f"DEBUG: Legal masks shape: {legal_masks.shape}")
236
+ print(f"DEBUG: Targets range: [{targets.min().item()}, {targets.max().item()}]")
237
+ print(f"DEBUG: Target logits range: [{target_logits_debug.min().item():.2f}, {target_logits_debug.max().item():.2f}]")
238
+ print(f"DEBUG: Legal moves per sample: min={legal_per_sample.min().item()}, max={legal_per_sample.max().item()}")
239
+ raise ValueError(
240
+ f"Loss is {loss.item()}. This can happen if:\n"
241
+ f"1. Target actions are out of bounds\n"
242
+ f"2. Target actions are masked as illegal\n"
243
+ f"3. Model outputs contain NaN/Inf\n"
244
+ f"4. All logits are -inf (no legal moves)"
245
+ )
246
+
247
+ # Compute metrics
248
+ with torch.no_grad():
249
+ # Top-1 accuracy
250
+ predictions = masked_logits.argmax(dim=-1)
251
+ accuracy = (predictions == targets).float().mean()
252
+
253
+ # Top-5 accuracy
254
+ _, top5_preds = masked_logits.topk(5, dim=-1)
255
+ top5_correct = (top5_preds == targets.unsqueeze(-1)).any(dim=-1)
256
+ top5_accuracy = top5_correct.float().mean()
257
+
258
+ # Entropy of the distribution (measure of confidence)
259
+ probs = F.softmax(masked_logits, dim=-1)
260
+ log_probs = F.log_softmax(masked_logits, dim=-1)
261
+ # Handle -inf * 0 = nan by replacing with 0
262
+ entropy_terms = probs * log_probs
263
+ entropy_terms = torch.where(
264
+ torch.isfinite(entropy_terms),
265
+ entropy_terms,
266
+ torch.zeros_like(entropy_terms)
267
+ )
268
+ entropy = -entropy_terms.sum(dim=-1).mean()
269
+
270
+ # Perplexity - clamp to avoid inf
271
+ perplexity = torch.exp(loss.clamp(max=50))
272
+
273
+ metrics = {
274
+ 'accuracy': accuracy,
275
+ 'top5_accuracy': top5_accuracy,
276
+ 'entropy': entropy,
277
+ 'perplexity': perplexity,
278
+ }
279
+
280
+ return loss, metrics
281
+
282
+ def training_step(self, batch: tuple, batch_idx: int) -> torch.Tensor:
283
+ """Perform a training step.
284
+
285
+ Args:
286
+ batch: Tuple of (boards, actions, legal_masks)
287
+ batch_idx: Batch index
288
+
289
+ Returns:
290
+ Loss value
291
+ """
292
+ boards, actions, legal_masks = batch
293
+
294
+ # Forward pass
295
+ logits = self(boards)
296
+
297
+ # Compute loss and metrics
298
+ loss, metrics = self._compute_loss(logits, actions, legal_masks)
299
+
300
+ # Log metrics
301
+ self.log('train/loss', loss, prog_bar=True)
302
+ self.log('train/accuracy', metrics['accuracy'], prog_bar=True)
303
+ self.log('train/top5_accuracy', metrics['top5_accuracy'])
304
+ self.log('train/entropy', metrics['entropy'])
305
+ self.log('train/perplexity', metrics['perplexity'])
306
+
307
+ return loss
308
+
309
+ def validation_step(self, batch: tuple, batch_idx: int) -> torch.Tensor:
310
+ """Perform a validation step.
311
+
312
+ Args:
313
+ batch: Tuple of (boards, actions, legal_masks)
314
+ batch_idx: Batch index
315
+
316
+ Returns:
317
+ Loss value
318
+ """
319
+ boards, actions, legal_masks = batch
320
+
321
+ # Forward pass
322
+ logits = self(boards)
323
+
324
+ # Compute loss and metrics
325
+ loss, metrics = self._compute_loss(logits, actions, legal_masks)
326
+
327
+ # Log metrics
328
+ self.log('val/loss', loss, prog_bar=True, sync_dist=True)
329
+ self.log('val/accuracy', metrics['accuracy'], prog_bar=True, sync_dist=True)
330
+ self.log('val/top5_accuracy', metrics['top5_accuracy'], sync_dist=True)
331
+ self.log('val/entropy', metrics['entropy'], sync_dist=True)
332
+ self.log('val/perplexity', metrics['perplexity'], sync_dist=True)
333
+
334
+ return loss
335
+
336
+ def configure_optimizers(self):
337
+ """Configure optimizer and learning rate scheduler.
338
+
339
+ Returns:
340
+ Dictionary with optimizer and lr_scheduler configuration
341
+ """
342
+ optimizer = torch.optim.AdamW(
343
+ self.parameters(),
344
+ lr=self.pretrain_config.lr,
345
+ weight_decay=self.pretrain_config.weight_decay,
346
+ )
347
+
348
+ # Linear warmup + cosine decay scheduler
349
+ def lr_lambda(current_step: int) -> float:
350
+ warmup_steps = self.pretrain_config.warmup_steps
351
+ if current_step < warmup_steps:
352
+ return float(current_step) / float(max(1, warmup_steps))
353
+ return 1.0 # After warmup, use constant LR (or add cosine decay)
354
+
355
+ scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)
356
+
357
+ return {
358
+ 'optimizer': optimizer,
359
+ 'lr_scheduler': {
360
+ 'scheduler': scheduler,
361
+ 'interval': 'step',
362
+ 'frequency': 1,
363
+ }
364
+ }
365
+
366
+
367
+ def get_pretrain_trainer(
368
+ pretrain_config: PretrainConfig,
369
+ run_name: str,
370
+ ) -> pl.Trainer:
371
+ """Create a PyTorch Lightning trainer for pretraining.
372
+
373
+ Args:
374
+ pretrain_config: Pretraining configuration
375
+ run_name: Name for this training run
376
+
377
+ Returns:
378
+ Configured PyTorch Lightning trainer
379
+ """
380
+ # Create checkpoint directory
381
+ checkpoint_dir = Path(pretrain_config.checkpoint_dir)
382
+ checkpoint_dir.mkdir(parents=True, exist_ok=True)
383
+
384
+ callbacks = [
385
+ ModelCheckpoint(
386
+ dirpath=str(checkpoint_dir),
387
+ filename=run_name + "-{epoch:02d}-{train/loss:.4f}",
388
+ save_top_k=3,
389
+ monitor="train/loss",
390
+ mode="min",
391
+ save_last=True,
392
+ ),
393
+ LearningRateMonitor(logging_interval='step'),
394
+ ]
395
+
396
+ logger = None
397
+ if pretrain_config.use_wandb:
398
+ logger = WandbLogger(
399
+ project=pretrain_config.wandb_project,
400
+ name=run_name,
401
+ log_model=True,
402
+ )
403
+
404
+ trainer = pl.Trainer(
405
+ max_epochs=pretrain_config.num_epochs,
406
+ accelerator="auto",
407
+ devices=1,
408
+ logger=logger,
409
+ callbacks=callbacks,
410
+ gradient_clip_val=pretrain_config.max_grad_norm,
411
+ log_every_n_steps=50,
412
+ val_check_interval=pretrain_config.val_check_interval,
413
+ )
414
+
415
+ return trainer
416
+
417
+
418
+ def load_pretrain_config(
419
+ path: str = "pretrain.yaml",
420
+ overrides: dict = None,
421
+ ) -> tuple[PretrainConfig, PretrainDatasetConfig, ChessTransformerConfig]:
422
+ """Load pretraining configuration from YAML file.
423
+
424
+ Args:
425
+ path: Path to config file (relative to configs dir or absolute)
426
+ overrides: Optional dict of overrides
427
+
428
+ Returns:
429
+ Tuple of (PretrainConfig, PretrainDatasetConfig, ChessTransformerConfig)
430
+ """
431
+ data = load_yaml_file(path)
432
+
433
+ if overrides:
434
+ for section, section_overrides in overrides.items():
435
+ if section in data:
436
+ data[section].update(section_overrides)
437
+ else:
438
+ data[section] = section_overrides
439
+
440
+ pretrain = dict_to_dataclass(PretrainConfig, data.get('pretrain', {}))
441
+ dataset = dict_to_dataclass(PretrainDatasetConfig, data.get('dataset', {}))
442
+ transformer = dict_to_dataclass(ChessTransformerConfig, data.get('transformer', {}))
443
+
444
+ return pretrain, dataset, transformer
445
+
446
+
447
+ def train(
448
+ pretrain_config: PretrainConfig,
449
+ dataset_config: PretrainDatasetConfig,
450
+ transformer_config: ChessTransformerConfig,
451
+ ) -> str:
452
+ """Main pretraining function.
453
+
454
+ Args:
455
+ pretrain_config: Pretraining configuration
456
+ dataset_config: Dataset configuration
457
+ transformer_config: Model configuration
458
+
459
+ Returns:
460
+ Path to final checkpoint
461
+ """
462
+ import time
463
+ import random
464
+ import string
465
+
466
+ # Generate run name
467
+ timestamp = time.strftime("%Y%m%d-%H%M")
468
+ random_suffix = ''.join(random.choices(string.ascii_lowercase + string.digits, k=4))
469
+ run_name = f"pretrain-{timestamp}-{random_suffix}"
470
+ print(f"Run name: {run_name}")
471
+
472
+ # Create model
473
+ model = PretrainChessTransformer(transformer_config, pretrain_config)
474
+ print(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}")
475
+
476
+ # Create datasets
477
+ train_dataset = ChessPretrainDataset(dataset_config)
478
+
479
+ # Create validation dataset using hash-based split
480
+ val_dataset_config = PretrainDatasetConfig(
481
+ min_elo=dataset_config.min_elo,
482
+ max_samples=10000, # Smaller validation set
483
+ skip_first_n_moves=dataset_config.skip_first_n_moves,
484
+ skip_last_n_moves=dataset_config.skip_last_n_moves,
485
+ sample_positions_per_game=1, # Less samples per game for validation
486
+ is_eval=True, # Use eval portion of hash-based split
487
+ eval_fraction=dataset_config.eval_fraction,
488
+ cache_path=dataset_config.cache_path,
489
+ )
490
+ val_dataset = ChessPretrainDataset(val_dataset_config)
491
+ print(f"Train: {len(train_dataset):,} samples, Eval: {len(val_dataset):,} samples")
492
+
493
+ # Create dataloaders
494
+ train_dataloader = DataLoader(
495
+ train_dataset,
496
+ batch_size=pretrain_config.batch_size,
497
+ shuffle=True, # Shuffle for training
498
+ num_workers=pretrain_config.num_workers,
499
+ collate_fn=collate_pretrain_batch,
500
+ pin_memory=True,
501
+ )
502
+
503
+ val_dataloader = DataLoader(
504
+ val_dataset,
505
+ batch_size=pretrain_config.batch_size,
506
+ shuffle=False,
507
+ num_workers=max(1, pretrain_config.num_workers // 2),
508
+ collate_fn=collate_pretrain_batch,
509
+ pin_memory=True,
510
+ )
511
+
512
+ # Create trainer
513
+ trainer = get_pretrain_trainer(pretrain_config, run_name)
514
+
515
+ # Resume from checkpoint if specified
516
+ ckpt_path = pretrain_config.resume_from
517
+
518
+ # Train
519
+ trainer.fit(model, train_dataloader, val_dataloader, ckpt_path=ckpt_path)
520
+
521
+ # Save final checkpoint in a standard location
522
+ final_path = Path(pretrain_config.checkpoint_dir) / "pretrain_final.pt"
523
+ torch.save({
524
+ 'model_state_dict': model.model.state_dict(),
525
+ 'transformer_config': transformer_config,
526
+ 'pretrain_config': pretrain_config,
527
+ }, final_path)
528
+
529
+ print(f"\nPretraining complete! Final checkpoint saved to {final_path}")
530
+ return str(final_path)
531
+
532
+
533
+ def main():
534
+ """Main entry point for pretraining script."""
535
+ parser = argparse.ArgumentParser(description="Pretrain chess model on Lichess games")
536
+ parser.add_argument("--config", type=str, default="pretrain.yaml",
537
+ help="Path to config file")
538
+
539
+ # Allow command-line overrides for common parameters
540
+ parser.add_argument("--lr", type=float, help="Learning rate")
541
+ parser.add_argument("--batch_size", type=int, help="Batch size")
542
+ parser.add_argument("--num_epochs", type=int, help="Number of epochs")
543
+ parser.add_argument("--min_elo", type=int, help="Minimum player ELO")
544
+ parser.add_argument("--max_samples", type=int, help="Max samples per epoch")
545
+ parser.add_argument("--resume_from", type=str, help="Resume from checkpoint")
546
+ parser.add_argument("--no_wandb", action="store_true", help="Disable wandb logging")
547
+
548
+ args = parser.parse_args()
549
+
550
+ # Build overrides from command-line arguments
551
+ overrides = {'pretrain': {}, 'dataset': {}}
552
+
553
+ if args.lr:
554
+ overrides['pretrain']['lr'] = args.lr
555
+ if args.batch_size:
556
+ overrides['pretrain']['batch_size'] = args.batch_size
557
+ if args.num_epochs:
558
+ overrides['pretrain']['num_epochs'] = args.num_epochs
559
+ if args.resume_from:
560
+ overrides['pretrain']['resume_from'] = args.resume_from
561
+ if args.no_wandb:
562
+ overrides['pretrain']['use_wandb'] = False
563
+ if args.min_elo:
564
+ overrides['dataset']['min_elo'] = args.min_elo
565
+ if args.max_samples:
566
+ overrides['dataset']['max_samples'] = args.max_samples
567
+
568
+ # Load config
569
+ pretrain_config, dataset_config, transformer_config = load_pretrain_config(
570
+ args.config,
571
+ overrides=overrides if any(v for v in overrides.values()) else None
572
+ )
573
+
574
+ # Run training
575
+ train(pretrain_config, dataset_config, transformer_config)
576
+
577
+
578
+ if __name__ == "__main__":
579
+ main()
hf_space_repo/pretrain/pretrain_dataset.py ADDED
@@ -0,0 +1,328 @@
+ """Dataset for pretraining on chess games from HuggingFace.
+
+ Uses angeluriot/chess_games: 14M high-ELO games (7.3GB download).
+ Mean ELO ~2355, moves already in UCI format - no parsing needed.
+ """
+
+ import hashlib
+ import os
+ import chess
+ import torch
+ import random
+ from typing import Optional
+ from dataclasses import dataclass
+ from multiprocessing import cpu_count
+ from torch.utils.data import Dataset
+ from datasets import load_dataset
+ from tqdm import tqdm
+
+ from src.grpo_self_play.searchless_chess_imports import MOVE_TO_ACTION, tokenize
+
+ # Global constant
+ _ACTION_SPACE_SIZE = max(MOVE_TO_ACTION.values()) + 1
+
+
+ @dataclass
+ class PretrainDatasetConfig:
+     """Configuration for the pretraining dataset.
+
+     Uses angeluriot/chess_games: 14M high-ELO games (7.3GB download).
+     Mean ELO ~2355, moves already in UCI format.
+
+     Attributes:
+         min_elo: Minimum player ELO to include games
+         max_samples: Maximum number of samples per epoch (None for unlimited)
+         skip_first_n_moves: Skip the first N moves (avoid memorizing openings)
+         skip_last_n_moves: Skip the last N moves (avoid noisy endgame positions)
+         sample_positions_per_game: Number of positions to sample from each game
+         is_eval: If True, use eval portion of hash-based split.
+         eval_fraction: Fraction of data to use for evaluation (default 0.05 = 5%)
+         cache_path: Path to save/load filtered dataset (e.g., Google Drive, studio storage).
+             If set and exists, loads from cache. Otherwise downloads, filters, and saves.
+     """
+     min_elo: int = 2000
+     max_samples: Optional[int] = None
+     skip_first_n_moves: int = 5
+     skip_last_n_moves: int = 5
+     sample_positions_per_game: int = 3
+     is_eval: bool = False
+     eval_fraction: float = 0.05
+     cache_path: Optional[str] = None
+
+
+ def uci_to_action(uci_move: str) -> Optional[int]:
+     """Convert UCI move string to action index."""
+     return MOVE_TO_ACTION.get(uci_move)
+
+
+ def get_positions_from_game(
+     moves: list[str],
+     skip_first_n: int = 5,
+     skip_last_n: int = 5,
+     sample_n: int = 3,
+ ) -> list[tuple[str, str, int]]:
+     """Extract (FEN, move_played, move_number) tuples from a game.
+
+     Args:
+         moves: List of UCI moves
+         skip_first_n: Skip first N moves (opening book territory)
+         skip_last_n: Skip last N moves (endgame/resignation noise)
+         sample_n: Number of positions to randomly sample
+
+     Returns:
+         List of (fen, uci_move, move_number) tuples
+     """
+     if len(moves) <= skip_first_n + skip_last_n:
+         return []
+
+     board = chess.Board()
+     positions = []
+
+     for i, uci_move in enumerate(moves):
+         if i < skip_first_n:
+             try:
+                 board.push_uci(uci_move)
+             except (ValueError, chess.InvalidMoveError):
+                 return positions
+             continue
+
+         if i >= len(moves) - skip_last_n:
+             break
+
+         fen = board.fen()
+         positions.append((fen, uci_move, i))
+
+         try:
+             board.push_uci(uci_move)
+         except (ValueError, chess.InvalidMoveError):
+             break
+
+     if len(positions) > sample_n:
+         positions = random.sample(positions, sample_n)
+
+     return positions
+
+
+ class ChessPretrainDataset(Dataset):
+     """Dataset for chess pretraining from angeluriot/chess_games.
+
+     Downloads the full dataset (7.3GB) and processes games into
+     (board_tensor, target_action, legal_moves_mask) tuples.
+
+     Example:
+         >>> config = PretrainDatasetConfig(min_elo=2000)
+         >>> dataset = ChessPretrainDataset(config)
+         >>> dataloader = DataLoader(dataset, batch_size=256, shuffle=True)
+     """
+
+     def __init__(self, config: PretrainDatasetConfig = PretrainDatasetConfig()):
+         """Initialize the dataset - downloads and processes all games."""
+         self.config = config
+         self._action_space_size = max(MOVE_TO_ACTION.values()) + 1
+         self._samples: list[tuple[torch.Tensor, int, torch.Tensor]] = []
+
+         self._load_and_process()
+
+     def _load_and_process(self):
+         """Download dataset and process all games into samples."""
+         # Try loading processed samples from cache
+         if self.config.cache_path:
+             cache_file = self._get_cache_filename()
+             if os.path.exists(cache_file):
+                 print(f"Loading processed samples from {cache_file}...")
+                 self._samples = torch.load(cache_file)
+                 print(f"Loaded {len(self._samples):,} samples from cache")
+                 return
+
+         # Download, filter, and process
+         dataset = self._load_filtered_dataset()
+
+         # Limit dataset size if max_samples is set
+         if self.config.max_samples:
+             max_games = self.config.max_samples // self.config.sample_positions_per_game + 1000
+             if len(dataset) > max_games:
+                 dataset = dataset.select(range(max_games))
+                 print(f"Limited to {len(dataset):,} games")
+
+         # Process games using HuggingFace's optimized map
+         num_workers = min(8, cpu_count() or 4)
+         print(f"Processing games into samples with {num_workers} workers...")
+
+         skip_first = self.config.skip_first_n_moves
+         skip_last = self.config.skip_last_n_moves
+         sample_n = self.config.sample_positions_per_game
+
+         def process_batch(batch):
+             """Process a batch of games - returns lists for HF dataset."""
+             all_boards, all_actions, all_masks = [], [], []
+
+             for i in range(len(batch['moves_uci'])):
+                 moves = batch['moves_uci'][i]
+                 if not moves:
+                     continue
+
+                 positions = get_positions_from_game(moves, skip_first, skip_last, sample_n)
+
+                 for fen, uci_move, _ in positions:
+                     action_idx = MOVE_TO_ACTION.get(uci_move)
+                     if action_idx is None:
+                         continue
+                     try:
+                         token_ids = list(tokenize(fen))
+                         board = chess.Board(fen)
+                         legal_mask = [False] * _ACTION_SPACE_SIZE
+                         for move in board.legal_moves:
+                             move_idx = MOVE_TO_ACTION.get(move.uci())
+                             if move_idx is not None:
+                                 legal_mask[move_idx] = True
+                         if not legal_mask[action_idx]:
+                             continue
+                         all_boards.append(token_ids)
+                         all_actions.append(action_idx)
+                         all_masks.append(legal_mask)
+                     except Exception:
+                         continue
+
+             return {'boards': all_boards, 'actions': all_actions, 'masks': all_masks}
+
+         processed = dataset.map(
+             process_batch,
+             batched=True,
+             batch_size=1000,
+             num_proc=num_workers,
+             remove_columns=dataset.column_names,
+             desc="Processing"
+         )
+
+         # Convert to tensors (HF map flattens the lists)
+         print("Converting to tensors...")
+         for i in tqdm(range(len(processed)), desc="Tensorizing"):
+             board_tensor = torch.tensor(processed[i]['boards'], dtype=torch.long)
+             legal_mask = torch.tensor(processed[i]['masks'], dtype=torch.bool)
+             self._samples.append((board_tensor, processed[i]['actions'], legal_mask))
+             if self.config.max_samples and len(self._samples) >= self.config.max_samples:
+                 break
+
+         print(f"Done: {len(self._samples):,} samples")
+
+         # Save processed samples to cache
+         if self.config.cache_path:
+             cache_file = self._get_cache_filename()
+             print(f"Saving processed samples to {cache_file}...")
+             os.makedirs(self.config.cache_path, exist_ok=True)
+             torch.save(self._samples, cache_file)
+             print("Saved to cache")
+
+     def _get_cache_filename(self) -> str:
+         """Generate cache filename based on config."""
+         split = 'eval' if self.config.is_eval else 'train'
+         max_samples = self.config.max_samples or 'all'
+         return f"{self.config.cache_path}/processed_elo{self.config.min_elo}_{split}_{max_samples}.pt"
+
+     def _load_filtered_dataset(self):
+         """Download and filter dataset."""
+         # Download (uses cache_path for HuggingFace cache)
+         print("Downloading angeluriot/chess_games (7.3GB)...")
+         cache_dir = self.config.cache_path if self.config.cache_path else None
+         dataset = load_dataset("angeluriot/chess_games", split="train", cache_dir=cache_dir)
+         print(f"Loaded {len(dataset):,} games")
+
+         # Fast batched filtering
+         print(f"Filtering games (min_elo={self.config.min_elo})...")
+         min_elo = self.config.min_elo
+         eval_frac = self.config.eval_fraction
+         is_eval = self.config.is_eval
+
+         def batch_filter(batch):
+             """Filter a batch of games - much faster than per-example."""
+             keep = []
+             for i in range(len(batch['white_elo'])):
+                 white_elo = batch['white_elo'][i]
+                 black_elo = batch['black_elo'][i]
+
+                 # Skip if ELO is missing
+                 if white_elo is None or black_elo is None:
+                     keep.append(False)
+                     continue
+                 # ELO filter
+                 if white_elo < min_elo or black_elo < min_elo:
+                     keep.append(False)
+                     continue
+                 # Moves filter
+                 if len(batch['moves_uci'][i]) < 10:
+                     keep.append(False)
+                     continue
+                 # Hash-based train/eval split. Use a stable digest rather than
+                 # the built-in hash(), which is salted per process and would
+                 # change the split between runs and workers.
+                 game_id = f"{batch['date'][i]}-{white_elo}-{black_elo}"
+                 hash_val = int(hashlib.md5(game_id.encode()).hexdigest(), 16) % 10000
+                 is_eval_game = hash_val < (eval_frac * 10000)
+                 if is_eval_game != is_eval:
+                     keep.append(False)
+                     continue
+                 keep.append(True)
+             return keep
+
+         dataset = dataset.filter(batch_filter, batched=True, batch_size=10000, desc="Filtering")
+         print(f"After filtering: {len(dataset):,} games")
+
+         return dataset
+
+     def _process_game(self, game: dict):
+         """Process a single game and yield training samples."""
+         moves = game.get('moves_uci', [])
+
+         positions = get_positions_from_game(
+             moves,
+             skip_first_n=self.config.skip_first_n_moves,
+             skip_last_n=self.config.skip_last_n_moves,
+             sample_n=self.config.sample_positions_per_game,
+         )
+
+         for fen, uci_move, _ in positions:
+             action_idx = uci_to_action(uci_move)
+             if action_idx is None:
+                 continue
+
+             try:
+                 token_ids = list(tokenize(fen))
+                 board_tensor = torch.tensor(token_ids, dtype=torch.long)
+             except Exception:
+                 continue
+
+             try:
+                 board = chess.Board(fen)
+                 legal_mask = torch.zeros(self._action_space_size, dtype=torch.bool)
+                 for move in board.legal_moves:
+                     move_idx = MOVE_TO_ACTION.get(move.uci())
+                     if move_idx is not None:
+                         legal_mask[move_idx] = True
+             except Exception:
+                 continue
+
+             if not legal_mask[action_idx]:
+                 continue
+
+             yield board_tensor, action_idx, legal_mask
+
+     def __len__(self) -> int:
+         return len(self._samples)
+
+     def __getitem__(self, idx: int) -> tuple[torch.Tensor, int, torch.Tensor]:
+         return self._samples[idx]
+
+
+ def collate_pretrain_batch(
+     batch: list[tuple[torch.Tensor, int, torch.Tensor]]
+ ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+     """Collate function for DataLoader.
+
+     Returns:
+         Tuple of (boards [B, 77], actions [B], legal_masks [B, num_actions])
+     """
+     boards, actions, masks = zip(*batch)
+
+     boards = torch.stack(boards)
+     actions = torch.tensor(actions, dtype=torch.long)
+     masks = torch.stack(masks)
+
+     return boards, actions, masks
hf_space_repo/pretrain/pretrain_load_config.py ADDED
@@ -0,0 +1,21 @@
+ """Pretrain load configuration - separated to avoid circular imports."""
+
+ import torch
+ from dataclasses import dataclass
+ from typing import Optional
+
+
+ @dataclass
+ class PretrainLoadConfig:
+     """Configuration for loading pretrained weights.
+
+     Attributes:
+         checkpoint_path: Path to pretrained checkpoint file
+         freeze_layers: Number of transformer layers to freeze (0 = train all)
+     """
+     checkpoint_path: Optional[str] = None
+     freeze_layers: int = 0
+
+
+ # Register as safe for torch.load with weights_only=True (PyTorch 2.6+ compatibility)
+ torch.serialization.add_safe_globals([PretrainLoadConfig])
hf_space_repo/searchless_chess_imports.py ADDED
@@ -0,0 +1,3 @@
+ from src.searchless_chess_model.searchless_chess_code.utils import ACTION_TO_MOVE, MOVE_TO_ACTION
+ from src.searchless_chess_model.searchless_chess_code.tokenizer import tokenize, SEQUENCE_LENGTH
+
hf_space_repo/searchless_chess_model/.gitattributes ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
hf_space_repo/searchless_chess_model/README.md ADDED
@@ -0,0 +1,177 @@
+ ---
+ license: apache-2.0
+ tags:
+ - chess
+ - reinforcement-learning
+ - jax
+ - transformer
+ language:
+ - en
+ library_name: jax
+ ---
+
+ # Searchless Chess 9M Self-Play
+
+ A 9-million parameter transformer-based chess engine trained via self-play with Stockfish evaluation. This model learns to play chess without explicit search during inference, relying purely on learned pattern recognition.
+
+ ## Model Description
+
+ - **Model Size**: 9M parameters (8 layers, 256 embedding dim, 8 attention heads)
+ - **Architecture**: Decoder-only Transformer with learned positional encodings
+ - **Training Method**: Self-play with Stockfish rewards
+ - **Framework**: JAX + Haiku
+ - **Q-Value Distribution**: 128 return buckets for action-value prediction
+
+ This model predicts action-values (Q-values) for chess positions without performing tree search, making it extremely fast for inference while maintaining strong play.
+
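+ The value head emits a distribution over 128 return buckets; the scalar Q-value is its expectation. A minimal numpy sketch of that readout (uniform bucket centers over [0, 1] are an illustrative assumption):
+
+ ```python
+ import numpy as np
+
+ num_buckets = 128
+ bucket_values = (np.arange(num_buckets) + 0.5) / num_buckets  # assumed centers
+
+ # Dummy stand-in for the model's per-bucket log-probabilities.
+ bucket_log_probs = np.log(np.full(num_buckets, 1.0 / num_buckets))
+ q_value = float(np.exp(bucket_log_probs) @ bucket_values)  # expectation, ~0.5 here
+ ```
+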
+ ## Installation
+
+ ### CPU Installation
+
+ Install the required dependencies for CPU inference:
+
+ ```bash
+ pip install jax jaxlib dm-haiku orbax-checkpoint numpy chess huggingface-hub jaxtyping apache-beam grain
+ ```
+
+ ### GPU Installation (Recommended)
+
+ For GPU acceleration with CUDA 12:
+
+ ```bash
+ pip install --upgrade "jax[cuda12_pip]" -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html
+ pip install dm-haiku orbax-checkpoint numpy chess huggingface-hub jaxtyping apache-beam grain
+ ```
+
+ For other CUDA versions, see the [JAX installation guide](https://github.com/google/jax#installation).
+
+ **Note**: This model includes all necessary code and can be used **without cloning the original repository**.
+
+ ## Quick Start
+
+ ```python
+ import sys
+ from huggingface_hub import snapshot_download
+
+ # Download model from HuggingFace Hub
+ model_path = snapshot_download(
+     repo_id="dbest-isi/searchless-chess-9M-selfplay",
+     local_dir="./searchless_chess_model"
+ )
+
+ # Add bundled code to Python path
+ sys.path.insert(0, f"{model_path}/searchless_chess_code")
+
+ # Import model wrapper
+ import hf_model
+
+ # Load the model
+ model = hf_model.SearchlessChessModel.from_pretrained(model_path)
+
+ # Make a prediction
+ fen = "rnbqkbnr/pppppppp/8/8/4P3/8/PPPP1PPP/RNBQKBNR b KQkq e3 0 1"
+ result = model.predict(fen, temperature=1.0)
+
+ print(f"Best move: {result['best_move']}")
+ print(f"Q-value: {result['q_value']:.4f}")
+ print(f"Action probabilities shape: {result['action_probs'].shape}")
+ ```
+
+ ## Example Output
+
+ ```
+ Best move: e7e5
+ Q-value: 0.0119
+ Action probabilities shape: (1968,)
+ ```
+
+ ## Full Example with Multiple Positions
+
+ ```python
+ import sys
+ from huggingface_hub import snapshot_download
+
+ # Download and setup
+ model_path = snapshot_download(
+     repo_id="dbest-isi/searchless-chess-9M-selfplay",
+     local_dir="./searchless_chess_model"
+ )
+ sys.path.insert(0, f"{model_path}/searchless_chess_code")
+
+ import hf_model
+
+ # Load model
+ print("Loading model...")
+ model = hf_model.SearchlessChessModel.from_pretrained(model_path)
+ print("Model loaded!")
+
+ # Test on multiple positions
+ positions = [
+     ("Starting position", "rnbqkbnr/pppppppp/8/8/8/8/PPPPPPPP/RNBQKBNR w KQkq - 0 1"),
+     ("After 1.e4", "rnbqkbnr/pppppppp/8/8/4P3/8/PPPP1PPP/RNBQKBNR b KQkq e3 0 1"),
+     ("Scandinavian Defense", "rnbqkbnr/ppp1pppp/8/3p4/4P3/8/PPPP1PPP/RNBQKBNR w KQkq d6 0 2"),
+ ]
+
+ for name, fen in positions:
+     result = model.predict(fen)
+     print(f"\n{name}")
+     print(f"  FEN: {fen}")
+     print(f"  Best move: {result['best_move']}")
+     print(f"  Q-value: {result['q_value']:.4f}")
+ ```
+
+ ## Model Architecture
+
+ ```python
+ TransformerConfig(
+     vocab_size=1968,
+     output_size=128,
+     embedding_dim=256,
+     num_layers=8,
+     num_heads=8,
+     max_sequence_length=79,
+     num_return_buckets=128,
+     pos_encodings="LEARNED",
+     apply_post_ln=True,
+     apply_qk_layernorm=False,
+     use_causal_mask=False,
+ )
+ ```
+
+ ## Training Details
+
+ - **Base Model**: Initialized from pretrained 9M checkpoint
+ - **Training Method**: Self-play reinforcement learning
+ - **Reward Signal**: Stockfish evaluation at depth 20
+ - **Iteration**: 22 (EMA parameters)
+ - **Action Space**: 1968 possible moves (all legal chess moves)
+ - **Value Representation**: Discretized into 128 buckets (see the sketch after this list)
+
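+ Going the other way, discretizing a scalar return into one of 128 uniform buckets can be sketched as below (a simplified stand-in for `utils.get_uniform_buckets_edges_values`, assuming values in [0, 1]):
+
+ ```python
+ import numpy as np
+
+ def uniform_buckets(num_buckets: int = 128):
+     """Inner edges and centers of uniform buckets over a [0, 1] win probability."""
+     edges = np.linspace(0.0, 1.0, num_buckets + 1)
+     values = (edges[:-1] + edges[1:]) / 2  # bucket centers
+     return edges[1:-1], values             # inner edges, ready for np.digitize
+
+ edges, values = uniform_buckets()
+ bucket = int(np.digitize(0.73, edges))     # bucket index of a 0.73 win prob
+ ```
+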
+ ## Use Cases
+
+ - Fast chess move prediction without search
+ - Chess position evaluation
+ - Research on learned planning in board games
+ - Integration into chess applications requiring low-latency move suggestions
+
+ ## Limitations
+
+ - Does not perform explicit search (unlike traditional chess engines)
+ - May make suboptimal moves in complex tactical positions
+ - Performance depends on training data distribution
+ - Best suited for fast move suggestions rather than deep analysis
+
+ ## Background
+
+ This model is based on the architecture from DeepMind's [Searchless Chess](https://github.com/google-deepmind/searchless_chess) work. The **self-play training implementation and this trained model** are original work by Darrell Best.
+
+ For the full self-play training implementation and codebase, visit:
+ - Repository: https://github.com/DarrellBest/searchless_chess
+
+ ## License
+
+ Apache 2.0
+
+ ## Model Card Contact
+
+ For questions or issues, please open an issue on the [GitHub repository](https://github.com/DarrellBest/searchless_chess).
hf_space_repo/searchless_chess_model/config.json ADDED
@@ -0,0 +1,10 @@
+ {
+   "vocab_size": 1968,
+   "output_size": 128,
+   "embedding_dim": 256,
+   "num_layers": 8,
+   "num_heads": 8,
+   "max_sequence_length": 79,
+   "num_return_buckets": 128,
+   "model_name": "9M"
+ }
hf_space_repo/searchless_chess_model/model_info.json ADDED
@@ -0,0 +1,13 @@
+ {
+   "model_type": "searchless_chess",
+   "framework": "jax",
+   "library": "dm-haiku",
+   "includes_source": true,
+   "source_modules": [
+     "tokenizer.py",
+     "transformer.py",
+     "constants.py",
+     "utils.py",
+     "config.py"
+   ]
+ }
hf_space_repo/searchless_chess_model/searchless_chess_code/__init__.py ADDED
@@ -0,0 +1 @@
+ # Searchless Chess code bundle
hf_space_repo/searchless_chess_model/searchless_chess_code/config.py ADDED
@@ -0,0 +1,90 @@
+ # Copyright 2025 DeepMind Technologies Limited
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ # ==============================================================================
+
+ """Defines the configuration dataclasses."""
+
+ import dataclasses
+ from typing import Literal
+
+
+ PolicyType = Literal['action_value', 'state_value', 'behavioral_cloning']
+ POLICY_TYPES = ['action_value', 'state_value', 'behavioral_cloning']
+
+
+ @dataclasses.dataclass(kw_only=True)
+ class DataConfig:
+   """Config for the data generation."""
+
+   # The batch size for the sequences.
+   batch_size: int
+   # Whether to shuffle the dataset (shuffling is applied per epoch).
+   shuffle: bool = False
+   # The seed used for shuffling and transformations of the data.
+   seed: int | None = 0
+   # Whether to drop partial batches.
+   drop_remainder: bool = False
+   # The number of child processes launched to parallelize the transformations.
+   worker_count: int | None = 0
+   # The number of return buckets.
+   num_return_buckets: int
+   # The dataset split.
+   split: Literal['train', 'test']
+   # The policy used to create the dataset.
+   policy: PolicyType
+   # The number of records to read from the dataset (can be useful when, e.g.,
+   # the dataset does not fit into memory).
+   num_records: int | None = None
+
+
+ @dataclasses.dataclass(kw_only=True)
+ class TrainConfig:
+   """Config for the training function."""
+
+   # The data configuration for training.
+   data: DataConfig
+   # The learning rate for Adam.
+   learning_rate: float
+   # The gradient clipping value.
+   max_grad_norm: float = 1.0
+   # The number of gradient steps.
+   num_steps: int
+   # The frequency (in gradient steps) at which checkpoints should be saved
+   # (`None` means there is no checkpointing).
+   ckpt_frequency: int | None = None
+   # If provided, the maximum number of checkpoints to keep.
+   ckpt_max_to_keep: int | None = 1
+   # The frequency (in gradient steps) at which checkpoints should be saved
+   # permanently (`None` means all checkpoints are temporary).
+   save_frequency: int | None = None
+   # The frequency of logging in gradient steps (`None` means no logging).
+   log_frequency: int | None = None
+
+
+ @dataclasses.dataclass(kw_only=True)
+ class EvalConfig:
+   """Config for the evaluator."""
+
+   # The data configuration for evaluation.
+   data: DataConfig
+   # How many data points to consider for evaluation.
+   num_eval_data: int | None = None
+   # Enables use of ema-ed params in eval.
+   use_ema_params: bool = False
+   # The policy used to play moves with the model.
+   policy: PolicyType
+   # The number of return buckets.
+   num_return_buckets: int
+   # The batch size for evaluation.
+   batch_size: int | None = None
hf_space_repo/searchless_chess_model/searchless_chess_code/constants.py ADDED
@@ -0,0 +1,119 @@
+ # Copyright 2025 DeepMind Technologies Limited
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ # ==============================================================================
+
+ """Constants, interfaces, and types."""
+
+ import abc
+ from collections.abc import Callable, Mapping
+ import dataclasses
+ from typing import Any, NamedTuple, Protocol
+
+ from apache_beam import coders
+ from grain import python as pygrain
+ import haiku as hk
+ import jaxtyping as jtp
+
+ import config as config_lib
+
+
+ # Integer sequences of token ids.
+ Sequences = jtp.UInt32[jtp.Array, 'B T']
+
+ # The predictions are log-probabilities (natural logarithm) for the passed
+ # sequences. It can either be marginal log-probabilities (i.e. log P(s) for all
+ # sequences s in the batch), or full conditionals (i.e. log P(token | s_<t) for
+ # all sequence s, time t and token in the alphabet).
+ Marginals = jtp.Float32[jtp.Array, '*B']
+ Conditionals = jtp.Float32[jtp.Array, '*B T F']
+ Predictions = Marginals | Conditionals
+
+ # True means the loss will be masked there, i.e. we ignore it.
+ LossMask = jtp.Bool[jtp.Array, 'B T']
+
+
+ @dataclasses.dataclass
+ class Predictor:
+   """Defines the predictor interface."""
+
+   initial_params: Callable[..., hk.MutableParams]
+   predict: Callable[..., Predictions]
+
+
+ class DataLoaderBuilder(Protocol):
+
+   def __call__(self, config: config_lib.DataConfig) -> pygrain.DataLoader:
+     """Returns a PyGrain data loader from the `config`."""
+
+
+ class Evaluator(abc.ABC):
+   """Defines the interface of the evaluator that evaluates a predictor."""
+
+   @abc.abstractmethod
+   def step(self, params: hk.Params, step: int) -> Mapping[str, Any]:
+     """Returns the results of evaluating the predictor with `params`."""
+
+
+ class EvaluatorBuilder(Protocol):
+
+   def __call__(
+       self,
+       predictor: Predictor,
+       config: config_lib.EvalConfig,
+   ) -> Evaluator:
+     """Returns an evaluator for the `predictor` and `config`.
+
+     Args:
+       predictor: The predictor to be evaluated. The training loop continuously
+         saves the predictor's parameters, which are then loaded in the
+         evaluation loop and passed to the evaluator's step method.
+       config: The configuration of the evaluator.
+     """
+
+
+ CODERS = {
+     'fen': coders.StrUtf8Coder(),
+     'move': coders.StrUtf8Coder(),
+     'count': coders.BigIntegerCoder(),
+     'win_prob': coders.FloatCoder(),
+ }
+ CODERS['state_value'] = coders.TupleCoder((
+     CODERS['fen'],
+     CODERS['win_prob'],
+ ))
+ CODERS['action_value'] = coders.TupleCoder((
+     CODERS['fen'],
+     CODERS['move'],
+     CODERS['win_prob'],
+ ))
+ CODERS['behavioral_cloning'] = coders.TupleCoder((
+     CODERS['fen'],
+     CODERS['move'],
+ ))
+
+
+ class BehavioralCloningData(NamedTuple):
+   fen: str
+   move: str
+
+
+ class StateValueData(NamedTuple):
+   fen: str
+   win_prob: float
+
+
+ class ActionValueData(NamedTuple):
+   fen: str
+   move: str
+   win_prob: float
hf_space_repo/searchless_chess_model/searchless_chess_code/hf_model.py ADDED
@@ -0,0 +1,329 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """HuggingFace model wrapper for searchless chess."""
2
+
3
+ import json
4
+ import os
5
+ from typing import Dict, Optional
6
+
7
+ import haiku as hk
8
+ import jax
9
+ import jax.numpy as jnp
10
+ import numpy as np
11
+ import orbax.checkpoint as ocp
12
+
13
+ import tokenizer
14
+ import transformer
15
+ import utils
16
+
17
+
18
+ class SearchlessChessConfig:
19
+ """Configuration for SearchlessChess model."""
20
+
21
+ def __init__(
22
+ self,
23
+ vocab_size: int = 1968,
24
+ output_size: int = 128,
25
+ embedding_dim: int = 256,
26
+ num_layers: int = 8,
27
+ num_heads: int = 8,
28
+ max_sequence_length: int = 79,
29
+ num_return_buckets: int = 128,
30
+ model_name: str = "9M",
31
+ **kwargs,
32
+ ):
33
+ self.vocab_size = vocab_size
34
+ self.output_size = output_size
35
+ self.embedding_dim = embedding_dim
36
+ self.num_layers = num_layers
37
+ self.num_heads = num_heads
38
+ self.max_sequence_length = max_sequence_length
39
+ self.num_return_buckets = num_return_buckets
40
+ self.model_name = model_name
41
+
42
+ # Store any extra kwargs
43
+ for key, value in kwargs.items():
44
+ setattr(self, key, value)
45
+
46
+ def to_dict(self) -> Dict:
47
+ """Convert config to dictionary."""
48
+ return {
49
+ "vocab_size": self.vocab_size,
50
+ "output_size": self.output_size,
51
+ "embedding_dim": self.embedding_dim,
52
+ "num_layers": self.num_layers,
53
+ "num_heads": self.num_heads,
54
+ "max_sequence_length": self.max_sequence_length,
55
+ "num_return_buckets": self.num_return_buckets,
56
+ "model_name": self.model_name,
57
+ }
58
+
59
+ @classmethod
60
+ def from_dict(cls, config_dict: Dict) -> "SearchlessChessConfig":
61
+ """Load config from dictionary."""
62
+ return cls(**config_dict)
63
+
64
+ def save_pretrained(self, save_directory: str):
65
+ """Save config to directory."""
66
+ os.makedirs(save_directory, exist_ok=True)
67
+ config_path = os.path.join(save_directory, "config.json")
68
+ with open(config_path, "w") as f:
69
+ json.dump(self.to_dict(), f, indent=2)
70
+
71
+ @classmethod
72
+ def from_pretrained(cls, model_path: str) -> "SearchlessChessConfig":
73
+ """Load config from directory."""
74
+ config_path = os.path.join(model_path, "config.json")
75
+ with open(config_path, "r") as f:
76
+ config_dict = json.load(f)
77
+ return cls.from_dict(config_dict)
78
+
79
+
80
+ class SearchlessChessModel:
81
+ """HuggingFace-compatible wrapper for SearchlessChess JAX/Haiku model."""
82
+
83
+ def __init__(self, config: SearchlessChessConfig):
84
+ self.config = config
85
+
86
+ # Build transformer config
87
+ self.transformer_config = transformer.TransformerConfig(
88
+ vocab_size=config.vocab_size,
89
+ output_size=config.output_size,
90
+ pos_encodings=transformer.PositionalEncodings.LEARNED,
91
+ max_sequence_length=config.max_sequence_length,
92
+ num_heads=config.num_heads,
93
+ num_layers=config.num_layers,
94
+ embedding_dim=config.embedding_dim,
95
+ apply_post_ln=True,
96
+ apply_qk_layernorm=False,
97
+ use_causal_mask=False,
98
+ )
99
+
100
+ # Build predictor
101
+ self.predictor = transformer.build_transformer_predictor(self.transformer_config)
102
+
103
+ # Initialize params
104
+ self.params = None
105
+ self.return_buckets_values = None
106
+
107
+ # Get return bucket values
108
+ _, self.return_buckets_values = utils.get_uniform_buckets_edges_values(
109
+ config.num_return_buckets
110
+ )
111
+
112
+ def load_params(self, params_path: str):
113
+ """Load parameters from Orbax checkpoint."""
114
+ # Convert to absolute path (Orbax requires absolute paths)
115
+ params_path = os.path.abspath(params_path)
116
+
117
+ # Create dummy params for structure
118
+ dummy_params = self.predictor.initial_params(
119
+ rng=jax.random.PRNGKey(0),
120
+ targets=np.ones((1, 1), dtype=np.uint32),
121
+ )
122
+
123
+ # Load checkpoint
124
+ restore_args = ocp.checkpoint_utils.construct_restore_args(dummy_params)
125
+ checkpointer = ocp.Checkpointer(ocp.PyTreeCheckpointHandler())
126
+ self.params = checkpointer.restore(params_path, restore_args=restore_args)
127
+
128
+ def predict(self, fen: str, temperature: float = 1.0) -> Dict:
129
+ """Predict move from FEN position.
130
+
131
+ Args:
132
+ fen: Chess position in FEN notation
133
+ temperature: Temperature for sampling (1.0 = no modification)
134
+
135
+ Returns:
136
+ Dictionary with:
137
+ - q_values: Q-value distribution
138
+ - action_probs: Action probabilities
139
+ - best_action: Best action index
140
+ - best_move: Best move in UCI notation
141
+ """
142
+ if self.params is None:
143
+ raise ValueError("Model parameters not loaded. Call load_params() first.")
144
+
145
+ # Tokenize input
146
+ tokens = tokenizer.tokenize(fen)
147
+ tokens = tokens[None, :] # Add batch dimension
148
+
149
+ # Get predictions
150
+ bucket_log_probs = self.predictor.predict(
151
+ params=self.params,
152
+ targets=tokens,
153
+ rng=None,
154
+ )
155
+
156
+ # Extract action Q-values (second to last position)
157
+ action_bucket_log_probs = bucket_log_probs[0, -2] # [num_return_buckets]
158
+ action_bucket_probs = jnp.exp(action_bucket_log_probs)
159
+
160
+ # Expected return: probability-weighted average of the bucket values
161
+ q_value = float(jnp.dot(action_bucket_probs, self.return_buckets_values))
162
+
163
+ # Get action probabilities from Q-values
164
+ # Use softmax over return bucket expectations
165
+ action_values = jnp.dot(
166
+ jnp.exp(bucket_log_probs[0, -2:]),
167
+ self.return_buckets_values,
168
+ )
169
+
170
+ # Apply temperature and softmax
171
+ action_logits = action_values / temperature
172
+ action_probs = jax.nn.softmax(action_logits)
173
+
174
+ # Get best action
175
+ best_action = int(jnp.argmax(action_probs))
176
+
177
+ # Convert action to move
178
+ best_move = utils.ACTION_TO_MOVE.get(best_action, "unknown")
179
+
180
+ return {
181
+ "q_value": q_value,
182
+ "action_probs": np.array(action_probs),
183
+ "best_action": best_action,
184
+ "best_move": best_move,
185
+ }
186
+
187
+ def save_pretrained(self, save_directory: str):
188
+ """Save model to directory in HuggingFace format."""
189
+ os.makedirs(save_directory, exist_ok=True)
190
+
191
+ # Save config
192
+ self.config.save_pretrained(save_directory)
193
+
194
+ # Save parameters as numpy arrays
195
+ if self.params is not None:
196
+ params_cpu = jax.device_get(self.params)
197
+ params_flat, tree_def = jax.tree.flatten(params_cpu)
198
+
199
+ # Save flattened params
200
+ params_path = os.path.join(save_directory, "params.npz")
201
+ np.savez(params_path, *params_flat)
202
+
203
+ # Save tree structure
204
+ import pickle
205
+ tree_path = os.path.join(save_directory, "tree_structure.pkl")
206
+ with open(tree_path, "wb") as f:
207
+ pickle.dump(tree_def, f)
208
+
209
+ # Copy necessary source files for standalone usage
211
+ src_dir = os.path.dirname(__file__)
212
+ code_dir = os.path.join(save_directory, "searchless_chess_code")
213
+ os.makedirs(code_dir, exist_ok=True)
214
+
215
+ # Copy core modules and fix imports for standalone usage
216
+ def fix_imports(content):
217
+ """Replace absolute imports with relative imports."""
218
+ content = content.replace("import tokenizer", "import tokenizer")
219
+ content = content.replace("import transformer", "import transformer")
220
+ content = content.replace("import utils", "import utils")
221
+ content = content.replace("import constants", "import constants")
222
+ content = content.replace("import config as config_lib", "import config as config_lib")
223
+ content = content.replace("import config", "import config")
224
+ return content
225
+
226
+ for module in ["tokenizer.py", "transformer.py", "constants.py", "utils.py", "config.py"]:
227
+ src_file = os.path.join(src_dir, module)
228
+ dst_file = os.path.join(code_dir, module)
229
+ if os.path.exists(src_file):
230
+ with open(src_file, 'r') as f:
231
+ content = fix_imports(f.read())
232
+ with open(dst_file, 'w') as f:
233
+ f.write(content)
234
+
235
+ # Create standalone hf_model.py
236
+ standalone_hf_model = os.path.join(code_dir, "hf_model.py")
237
+ with open(__file__, 'r') as source:
238
+ content = fix_imports(source.read())
239
+ with open(standalone_hf_model, 'w') as dest:
240
+ dest.write(content)
241
+
242
+ # Create __init__.py
243
+ with open(os.path.join(code_dir, "__init__.py"), "w") as f:
244
+ f.write("# Searchless Chess code bundle\n")
245
+
246
+ # Save model info
247
+ model_info = {
248
+ "model_type": "searchless_chess",
249
+ "framework": "jax",
250
+ "library": "dm-haiku",
251
+ "includes_source": True,
252
+ "source_modules": ["tokenizer.py", "transformer.py", "constants.py", "utils.py", "config.py"],
253
+ }
254
+ with open(os.path.join(save_directory, "model_info.json"), "w") as f:
255
+ json.dump(model_info, f, indent=2)
256
+
257
+ @classmethod
258
+ def from_pretrained(cls, model_path: str) -> "SearchlessChessModel":
259
+ """Load model from directory."""
260
+ # Load config
261
+ config = SearchlessChessConfig.from_pretrained(model_path)
262
+
263
+ # Create model
264
+ model = cls(config)
265
+
266
+ # Load parameters
267
+ params_path = os.path.join(model_path, "params.npz")
268
+ tree_path = os.path.join(model_path, "tree_structure.pkl")
269
+
270
+ if os.path.exists(params_path) and os.path.exists(tree_path):
271
+ # Load tree structure
272
+ import pickle
273
+ with open(tree_path, "rb") as f:
274
+ tree_def = pickle.load(f)
275
+
276
+ # Load params
277
+ params_data = np.load(params_path)
278
+ params_flat = [params_data[f"arr_{i}"] for i in range(len(params_data.files))]
279
+
280
+ # Reconstruct pytree
281
+ model.params = jax.tree.unflatten(tree_def, params_flat)
282
+
283
+ return model
284
+
285
+
286
+ def create_model_from_checkpoint(
287
+ checkpoint_path: str,
288
+ model_name: str = "9M",
289
+ use_ema: bool = True,
290
+ ) -> SearchlessChessModel:
291
+ """Create HuggingFace model from existing checkpoint.
292
+
293
+ Args:
294
+ checkpoint_path: Path to checkpoint directory (e.g., checkpoints/9M_selfplay/4)
295
+ model_name: Model size (9M, 136M, 270M)
296
+ use_ema: Whether to load EMA parameters
297
+
298
+ Returns:
299
+ SearchlessChessModel ready to save or use
300
+ """
301
+ # Determine architecture from model name
302
+ if model_name == "9M":
303
+ num_layers, embedding_dim, num_heads = 8, 256, 8
304
+ elif model_name == "136M":
305
+ num_layers, embedding_dim, num_heads = 8, 1024, 8
306
+ else: # 270M
307
+ num_layers, embedding_dim, num_heads = 16, 1024, 8
308
+
309
+ # Create config
310
+ config = SearchlessChessConfig(
311
+ vocab_size=1968,
312
+ output_size=128,
313
+ embedding_dim=embedding_dim,
314
+ num_layers=num_layers,
315
+ num_heads=num_heads,
316
+ max_sequence_length=79,
317
+ num_return_buckets=128,
318
+ model_name=model_name,
319
+ )
320
+
321
+ # Create model
322
+ model = SearchlessChessModel(config)
323
+
324
+ # Load parameters from Orbax checkpoint
325
+ params_dir = "params_ema" if use_ema else "params"
326
+ params_path = os.path.join(checkpoint_path, params_dir)
327
+ model.load_params(params_path)
328
+
329
+ return model
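Putting the pieces together, a minimal usage sketch (paths are illustrative; this assumes the file is importable as hf_model):

# Convert an Orbax checkpoint into the self-contained HF-style layout,
# then reload it and query a position.
from hf_model import SearchlessChessModel, create_model_from_checkpoint

model = create_model_from_checkpoint("checkpoints/9M_selfplay/4", model_name="9M")
model.save_pretrained("searchless_chess_model")

model = SearchlessChessModel.from_pretrained("searchless_chess_model")
out = model.predict("rnbqkbnr/pppppppp/8/8/8/8/PPPPPPPP/RNBQKBNR w KQkq - 0 1")
print(out["best_move"], out["q_value"])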
hf_space_repo/searchless_chess_model/searchless_chess_code/tokenizer.py ADDED
@@ -0,0 +1,116 @@
1
+ # Copyright 2025 DeepMind Technologies Limited
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ # ==============================================================================
15
+
16
+ """Implements tokenization of FEN strings."""
17
+
18
+ import jaxtyping as jtp
19
+ import numpy as np
20
+
21
+
22
+ # pyfmt: disable
23
+ _CHARACTERS = [
24
+ '0',
25
+ '1',
26
+ '2',
27
+ '3',
28
+ '4',
29
+ '5',
30
+ '6',
31
+ '7',
32
+ '8',
33
+ '9',
34
+ 'a',
35
+ 'b',
36
+ 'c',
37
+ 'd',
38
+ 'e',
39
+ 'f',
40
+ 'g',
41
+ 'h',
42
+ 'p',
43
+ 'n',
44
+ 'r',
45
+ 'k',
46
+ 'q',
47
+ 'P',
48
+ 'B',
49
+ 'N',
50
+ 'R',
51
+ 'Q',
52
+ 'K',
53
+ 'w',
54
+ '.',
55
+ ]
56
+ # pyfmt: enable
57
+ _CHARACTERS_INDEX = {letter: index for index, letter in enumerate(_CHARACTERS)}
58
+ _SPACES_CHARACTERS = frozenset({'1', '2', '3', '4', '5', '6', '7', '8'})
59
+ SEQUENCE_LENGTH = 77
60
+
61
+
62
+ def tokenize(fen: str) -> jtp.Int32[jtp.Array, 'T']:
63
+ """Returns an array of tokens from a fen string.
64
+
65
+ We compute a tokenized representation of the board, from the FEN string.
66
+ The final array of tokens is a mapping from this string to numbers, which
67
+ are defined in the dictionary `_CHARACTERS_INDEX`.
68
+ For the 'en passant' information, we convert the '-' (which means there is
69
+ no en passant relevant square) to '..', to always have two characters, and
70
+ a fixed length output.
71
+
72
+ Args:
73
+ fen: The board position in Forsyth-Edwards Notation.
74
+ """
75
+ # Extracting the relevant information from the FEN.
76
+ board, side, castling, en_passant, halfmoves_last, fullmoves = fen.split(' ')
77
+ board = board.replace('/', '')
78
+ board = side + board
79
+
80
+ indices = list()
81
+
82
+ for char in board:
83
+ if char in _SPACES_CHARACTERS:
84
+ indices.extend(int(char) * [_CHARACTERS_INDEX['.']])
85
+ else:
86
+ indices.append(_CHARACTERS_INDEX[char])
87
+
88
+ if castling == '-':
89
+ indices.extend(4 * [_CHARACTERS_INDEX['.']])
90
+ else:
91
+ for char in castling:
92
+ indices.append(_CHARACTERS_INDEX[char])
93
+ # Padding castling to have exactly 4 characters.
94
+ if len(castling) < 4:
95
+ indices.extend((4 - len(castling)) * [_CHARACTERS_INDEX['.']])
96
+
97
+ if en_passant == '-':
98
+ indices.extend(2 * [_CHARACTERS_INDEX['.']])
99
+ else:
100
+ # En passant is a square like 'e3'.
101
+ for char in en_passant:
102
+ indices.append(_CHARACTERS_INDEX[char])
103
+
104
+ # Three digits for halfmoves (since last capture) is enough since the game
105
+ # ends at 50.
106
+ halfmoves_last += '.' * (3 - len(halfmoves_last))
107
+ indices.extend([_CHARACTERS_INDEX[x] for x in halfmoves_last])
108
+
109
+ # Three digits for full moves is enough (no game lasts longer than 999
110
+ # moves).
111
+ fullmoves += '.' * (3 - len(fullmoves))
112
+ indices.extend([_CHARACTERS_INDEX[x] for x in fullmoves])
113
+
114
+ assert len(indices) == SEQUENCE_LENGTH
115
+
116
+ return np.asarray(indices, dtype=np.uint8)
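A quick illustration of the fixed-length encoding (a sketch; assumes the module is importable as tokenizer):

import tokenizer

# Starting position: 65 board+side tokens, 4 castling, 2 en passant,
# 3 halfmove digits, 3 fullmove digits = 77 tokens total.
tokens = tokenizer.tokenize("rnbqkbnr/pppppppp/8/8/8/8/PPPPPPPP/RNBQKBNR w KQkq - 0 1")
assert tokens.shape == (tokenizer.SEQUENCE_LENGTH,)  # (77,)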
hf_space_repo/searchless_chess_model/searchless_chess_code/transformer.py ADDED
@@ -0,0 +1,284 @@
1
+ # Copyright 2025 DeepMind Technologies Limited
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ # ==============================================================================
15
+
16
+ """Transformer model."""
17
+
18
+ import dataclasses
19
+ import enum
20
+ import functools
21
+
22
+ import haiku as hk
23
+ import jax
24
+ import jax.nn as jnn
25
+ import jax.numpy as jnp
26
+ import numpy as np
27
+
28
+ import constants
29
+
30
+
31
+ class PositionalEncodings(enum.Enum):
32
+ SINUSOID = enum.auto()
33
+ LEARNED = enum.auto()
34
+
35
+
36
+ @dataclasses.dataclass(kw_only=True)
37
+ class TransformerConfig:
38
+ """Hyperparameters used in the Transformer architectures."""
39
+
40
+ # The random seed for parameter initialization.
41
+ seed: int = 1
42
+ # The input vocabulary size.
43
+ vocab_size: int
44
+ # The output size (by default equal to the vocabulary size).
45
+ output_size: int | None = None
46
+ # The dimension of the first embedding.
47
+ embedding_dim: int = 64
48
+ # The number of multi-head attention layers.
49
+ num_layers: int = 4
50
+ # The number of heads per layer.
51
+ num_heads: int = 8
52
+ # Whether to use a causal mask or not.
53
+ use_causal_mask: bool = True
54
+ # The parameter initialization scale for the embeddings.
55
+ emb_init_scale: float = 0.02
56
+ # Positional encodings to use.
57
+ pos_encodings: PositionalEncodings = PositionalEncodings.SINUSOID
58
+ # Maximum sequence length, useful for the LEARNED positional encodings.
59
+ max_sequence_length: int | None = None
60
+ # How much larger the hidden layer of the feedforward network should be
61
+ # compared to the `embedding_dim`.
62
+ widening_factor: int = 4
63
+ # Whether to apply QK normalization trick in attention layer.
64
+ apply_qk_layernorm: bool = False
65
+ # Whether to apply post LN after attention + MLP blocks
66
+ apply_post_ln: bool = True
67
+
68
+ def __post_init__(self):
69
+ if self.output_size is None:
70
+ self.output_size = self.vocab_size
71
+
72
+
73
+ class MultiHeadDotProductAttention(hk.Module):
74
+ """Multi-head dot-product attention (Vaswani et al., 2017)."""
75
+
76
+ def __init__(
77
+ self,
78
+ num_heads: int,
79
+ num_hiddens_per_head: int,
80
+ name: str | None = None,
81
+ apply_qk_layernorm: bool = False,
82
+ ) -> None:
83
+ """Initializes the attention module.
84
+
85
+ Args:
86
+ num_heads: Number of heads to use.
87
+ num_hiddens_per_head: Number of hidden neurons per head.
88
+ name: Name of the module.
89
+ apply_qk_layernorm: Applies layernorm to query and key matrices, this
90
+ helps training stability.
91
+ """
92
+ super().__init__(name=name)
93
+ self._num_heads = num_heads
94
+ self._num_hiddens_per_head = num_hiddens_per_head
95
+ self._apply_qk_layernorm = apply_qk_layernorm
96
+
97
+ def __call__(
98
+ self,
99
+ inputs_q: jax.Array,
100
+ inputs_kv: jax.Array,
101
+ mask: jax.Array | None = None,
102
+ ) -> jax.Array:
103
+ """Returns the output of the multi-head attention."""
104
+ batch_size, sequence_length, embedding_size = inputs_q.shape
105
+
106
+ num_hiddens = self._num_hiddens_per_head * self._num_heads
107
+ q = hk.Linear(num_hiddens, with_bias=False)(inputs_q)
108
+ k = hk.Linear(num_hiddens, with_bias=False)(inputs_kv)
109
+
110
+ if self._apply_qk_layernorm:
111
+ q = layer_norm(q)
112
+ k = layer_norm(k)
113
+
114
+ v = hk.Linear(num_hiddens, with_bias=False)(inputs_kv)
115
+ # The second (sequence) dimension is undefined since it can differ between
116
+ # queries and keys/values when decoding. Also checking that the inputs have
117
+ # the same batch size as the reshape below does not guarantee a failure if
118
+ # they are different.
119
+ new_shape = (batch_size, -1, self._num_heads, self._num_hiddens_per_head)
120
+ q = jnp.reshape(q, new_shape)
121
+ k = jnp.reshape(k, new_shape)
122
+ v = jnp.reshape(v, new_shape)
123
+
124
+ # Let b=batch_size, t=seq_len, h=num_heads, and d=num_hiddens_per_head.
125
+ attention = jnp.einsum('bthd,bThd->bhtT', q, k)
126
+ attention *= 1.0 / jnp.sqrt(self._num_hiddens_per_head)
127
+
128
+ if mask is not None:
129
+ attention = jnp.where(mask, attention, jnp.finfo(jnp.float32).min)
130
+
131
+ normalized_attention = jnn.softmax(attention)
132
+
133
+ output = jnp.einsum('bhtT,bThd->bthd', normalized_attention, v)
134
+ output = jnp.reshape(output, (batch_size, sequence_length, num_hiddens))
135
+ return hk.Linear(embedding_size, with_bias=False)(output)
136
+
137
+
138
+ def sinusoid_position_encoding(
139
+ sequence_length: int,
140
+ hidden_size: int,
141
+ max_timescale: float = 1e4,
142
+ ) -> np.ndarray:
143
+ """Creates sinusoidal encodings from the original transformer paper.
144
+
145
+ The returned values are, for all i < D/2:
146
+ array[pos, i] = sin(pos / (max_timescale^(2*i / D)))
147
+ array[pos, D/2 + i] = cos(pos / (max_timescale^(2*i / D)))
148
+
149
+ Args:
150
+ sequence_length: Sequence length.
151
+ hidden_size: Dimension of the positional encoding vectors, D. Should be
152
+ even.
153
+ max_timescale: Maximum timescale for the frequency.
154
+
155
+ Returns:
156
+ An array of shape [L, D] containing the sinusoidal encodings.
158
+ """
159
+ freqs = np.arange(0, hidden_size + 1, 2)
160
+ inv_freq = max_timescale ** (-freqs / hidden_size)
161
+
162
+ pos_seq = np.arange(start=0, stop=sequence_length)
163
+
164
+ sinusoid_inp = np.einsum('i,j->ij', pos_seq, inv_freq)
165
+ embeddings = np.concatenate(
166
+ [np.sin(sinusoid_inp), np.cos(sinusoid_inp)], axis=-1
167
+ )
168
+ return embeddings[:, :hidden_size]
169
+
170
+
171
+ def embed_sequences(
172
+ sequences: jax.Array,
173
+ config: TransformerConfig,
174
+ ) -> jax.Array:
175
+ """Returns embeddings for sequences of tokens."""
176
+ embs_init = hk.initializers.TruncatedNormal(stddev=config.emb_init_scale)
177
+ embeddings_layer = hk.Embed(
178
+ vocab_size=config.vocab_size,
179
+ embed_dim=config.embedding_dim,
180
+ lookup_style=hk.EmbedLookupStyle.ARRAY_INDEX,
181
+ w_init=embs_init,
182
+ )
183
+ embeddings = embeddings_layer(sequences)
184
+ embeddings *= jnp.sqrt(config.embedding_dim)
185
+
186
+ _, sequence_length, embedding_size = embeddings.shape
187
+ match config.pos_encodings:
188
+ case PositionalEncodings.SINUSOID:
189
+ pos_encodings = sinusoid_position_encoding(
190
+ sequence_length=sequence_length,
191
+ hidden_size=embedding_size,
192
+ )
193
+ case PositionalEncodings.LEARNED:
194
+ assert sequence_length <= config.max_sequence_length
195
+ positions = jnp.arange(sequence_length)
196
+ pos_encodings = hk.Embed(
197
+ vocab_size=config.max_sequence_length,
198
+ embed_dim=embedding_size,
199
+ )(positions)
200
+ return embeddings + pos_encodings
201
+
202
+
203
+ def layer_norm(x: jax.Array) -> jax.Array:
204
+ """Helper function for layer norm."""
205
+ return hk.LayerNorm(axis=-1, create_scale=True, create_offset=True)(x)
206
+
207
+
208
+ def shift_right(sequences: jax.Array) -> jax.Array:
209
+ """Right-shift the one-hot encoded input by padding on the temporal axis."""
210
+ bos_array = jnp.zeros((sequences.shape[0], 1), dtype=jnp.uint8)
211
+ padded_sequences = jnp.concatenate([bos_array, sequences], axis=1)
212
+ return padded_sequences[:, :-1]
213
+
214
+
215
+ def _mlp_block(inputs: jax.Array, config: TransformerConfig) -> jax.Array:
216
+ """Gated MLP block for the Transformer."""
217
+ ffn_dim = config.embedding_dim * config.widening_factor
218
+ split_1 = hk.Linear(ffn_dim, with_bias=False)(inputs)
219
+ split_2 = hk.Linear(ffn_dim, with_bias=False)(inputs)
220
+ gate_output = jnn.silu(split_1) * split_2
221
+ return hk.Linear(config.embedding_dim, with_bias=False)(gate_output)
222
+
223
+
224
+ def _attention_block(inputs: jax.Array, config: TransformerConfig) -> jax.Array:
225
+ """Attention block for the Transformer."""
226
+ batch_size, sequence_length = inputs.shape[:2]
227
+ if config.use_causal_mask:
228
+ causal_mask = np.tril(
229
+ np.ones((batch_size, 1, sequence_length, sequence_length))
230
+ )
231
+ else:
232
+ causal_mask = None
233
+ block = MultiHeadDotProductAttention(
234
+ num_heads=config.num_heads,
235
+ num_hiddens_per_head=config.embedding_dim // config.num_heads,
236
+ apply_qk_layernorm=config.apply_qk_layernorm,
237
+ )
238
+ return block(inputs_q=inputs, inputs_kv=inputs, mask=causal_mask)
239
+
240
+
241
+ def transformer_decoder(
242
+ targets: jax.Array,
243
+ config: TransformerConfig,
244
+ ) -> jax.Array:
245
+ """Returns the transformer decoder output, shape [B, T, V].
246
+
247
+ Follows the LLaMa architecture:
248
+ https://github.com/facebookresearch/llama/blob/main/llama/model.py
249
+ Main changes to the original Transformer decoder:
250
+ - Using gating in the MLP block, with SwiGLU activation function.
251
+ - Using normalization before the attention and MLP blocks.
252
+
253
+ Args:
254
+ targets: The integer target values, shape [B, T].
255
+ config: The config to use for the transformer.
256
+ """
257
+ # Right shift the targets to get the inputs (the first token is now a 0).
258
+ inputs = shift_right(targets)
259
+
260
+ # Embeds the inputs and adds positional encodings.
261
+ embeddings = embed_sequences(inputs, config)
262
+
263
+ h = embeddings
264
+ for _ in range(config.num_layers):
265
+ attention_input = layer_norm(h)
266
+ attention = _attention_block(attention_input, config)
267
+ h += attention
268
+
269
+ mlp_input = layer_norm(h)
270
+ mlp_output = _mlp_block(mlp_input, config)
271
+ h += mlp_output
272
+
273
+ if config.apply_post_ln:
274
+ h = layer_norm(h)
275
+ logits = hk.Linear(config.output_size)(h)
276
+ return jnn.log_softmax(logits, axis=-1)
277
+
278
+
279
+ def build_transformer_predictor(
280
+ config: TransformerConfig,
281
+ ) -> constants.Predictor:
282
+ """Returns a transformer predictor."""
283
+ model = hk.transform(functools.partial(transformer_decoder, config=config))
284
+ return constants.Predictor(initial_params=model.init, predict=model.apply)
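A smoke-test sketch mirroring how hf_model.py builds and calls the predictor (the 9M hyperparameters below are the ones used in create_model_from_checkpoint):

import jax
import numpy as np
from transformer import PositionalEncodings, TransformerConfig, build_transformer_predictor

config = TransformerConfig(
    vocab_size=1968,
    output_size=128,
    pos_encodings=PositionalEncodings.LEARNED,
    max_sequence_length=79,
    num_heads=8,
    num_layers=8,
    embedding_dim=256,
    use_causal_mask=False,
)
predictor = build_transformer_predictor(config)
params = predictor.initial_params(
    rng=jax.random.PRNGKey(0),
    targets=np.ones((1, 1), dtype=np.uint32),
)
log_probs = predictor.predict(params=params, targets=np.ones((1, 79), dtype=np.uint32), rng=None)
# log_probs has shape [1, 79, 128]: per-position log-probabilities over return buckets.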
hf_space_repo/searchless_chess_model/searchless_chess_code/utils.py ADDED
@@ -0,0 +1,162 @@
1
+ # Copyright 2025 DeepMind Technologies Limited
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ # ==============================================================================
15
+
16
+ """Implements some utility functions."""
17
+
18
+ import math
19
+
20
+ import chess
21
+ import numpy as np
22
+
23
+
24
+ # The lists of the strings of the row and columns of a chess board,
25
+ # traditionally named rank and file.
26
+ _CHESS_FILE = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h']
27
+
28
+
29
+ def _compute_all_possible_actions() -> tuple[dict[str, int], dict[int, str]]:
30
+ """Returns two dicts converting moves to actions and actions to moves.
31
+
32
+ These dicts contain all possible chess moves.
33
+ """
34
+ all_moves = []
35
+
36
+ # First, deal with the normal moves.
37
+ # Note that this includes castling, as it is just a rook or king move from one
38
+ # square to another.
39
+ board = chess.BaseBoard.empty()
40
+ for square in range(64):
41
+ next_squares = []
42
+
43
+ # Place the queen and see where it attacks (we don't need to cover the case
44
+ # for a bishop, rook, or pawn because the queen's moves includes all their
45
+ # squares).
46
+ board.set_piece_at(square, chess.Piece.from_symbol('Q'))
47
+ next_squares += board.attacks(square)
48
+
49
+ # Place knight and see where it attacks
50
+ board.set_piece_at(square, chess.Piece.from_symbol('N'))
51
+ next_squares += board.attacks(square)
52
+ board.remove_piece_at(square)
53
+
54
+ for next_square in next_squares:
55
+ all_moves.append(
56
+ chess.square_name(square) + chess.square_name(next_square)
57
+ )
58
+
59
+ # Then deal with promotions.
60
+ # Only look at the last ranks.
61
+ promotion_moves = []
62
+ for rank, next_rank in [('2', '1'), ('7', '8')]:
63
+ for index_file, file in enumerate(_CHESS_FILE):
64
+ # Normal promotions.
65
+ move = f'{file}{rank}{file}{next_rank}'
66
+ promotion_moves += [(move + piece) for piece in ['q', 'r', 'b', 'n']]
67
+
68
+ # Capture promotions.
69
+ # Left side.
70
+ if file > 'a':
71
+ next_file = _CHESS_FILE[index_file - 1]
72
+ move = f'{file}{rank}{next_file}{next_rank}'
73
+ promotion_moves += [(move + piece) for piece in ['q', 'r', 'b', 'n']]
74
+ # Right side.
75
+ if file < 'h':
76
+ next_file = _CHESS_FILE[index_file + 1]
77
+ move = f'{file}{rank}{next_file}{next_rank}'
78
+ promotion_moves += [(move + piece) for piece in ['q', 'r', 'b', 'n']]
79
+ all_moves += promotion_moves
80
+
81
+ move_to_action, action_to_move = {}, {}
82
+ for action, move in enumerate(all_moves):
83
+ assert move not in move_to_action
84
+ move_to_action[move] = action
85
+ action_to_move[action] = move
86
+
87
+ return move_to_action, action_to_move
88
+
89
+
90
+ MOVE_TO_ACTION, ACTION_TO_MOVE = _compute_all_possible_actions()
91
+ NUM_ACTIONS = len(MOVE_TO_ACTION)
92
+
93
+
94
+ def centipawns_to_win_probability(centipawns: int) -> float:
95
+ """Returns the win probability (in [0, 1]) converted from the centipawn score.
96
+
97
+ Reference: https://lichess.org/page/accuracy
98
+ Well-known transformation, backed by real-world data.
99
+
100
+ Args:
101
+ centipawns: The chess score in centipawns.
102
+ """
103
+ return 0.5 + 0.5 * (2 / (1 + math.exp(-0.00368208 * centipawns)) - 1)
104
+
105
+
106
+ def get_uniform_buckets_edges_values(
107
+ num_buckets: int,
108
+ ) -> tuple[np.ndarray, np.ndarray]:
109
+ """Returns edges and values of uniformly sampled buckets in [0, 1].
110
+
111
+ Example: for num_buckets=4, it returns:
112
+ edges=[0.25, 0.50, 0.75]
113
+ values=[0.125, 0.375, 0.625, 0.875]
114
+
115
+ Args:
116
+ num_buckets: Number of buckets to create.
117
+ """
118
+ full_linspace = np.linspace(0.0, 1.0, num_buckets + 1)
119
+ edges = full_linspace[1:-1]
120
+ values = (full_linspace[:-1] + full_linspace[1:]) / 2
121
+ return edges, values
122
+
123
+
124
+ def compute_return_buckets_from_returns(
125
+ returns: np.ndarray,
126
+ bins_edges: np.ndarray,
127
+ ) -> np.ndarray:
128
+ """Arranges the discounted returns into bins.
129
+
130
+ The returns are put into the bins specified by `bin_edges`. The length of
131
+ `bin_edges` is equal to the number of buckets minus 1. In case of a tie (if
132
+ the return is exactly equal to an edge), we take the bucket right before the
133
+ edge. See example below.
134
+ This function is purely using np.searchsorted, so it's a good reference to
135
+ look at.
136
+
137
+ Examples:
138
+ * bin_edges=[0.5] and returns=[0., 1.] gives the buckets [0, 1].
139
+ * bin_edges=[-30., 30.] and returns=[-200., -30., 0., 1.] gives the buckets
140
+ [0, 0, 1, 1].
141
+
142
+ Args:
143
+ returns: An array of discounted returns, rank 1.
144
+ bins_edges: The boundary values of the return buckets, rank 1.
145
+
146
+ Returns:
147
+ An array of buckets, described as integers, rank 1.
148
+
149
+ Raises:
150
+ ValueError if `returns` or `bins_edges` are not of rank 1.
151
+ """
152
+ if len(returns.shape) != 1:
153
+ raise ValueError(
154
+ 'The passed returns should be of rank 1. Got'
155
+ f' rank={len(returns.shape)}.'
156
+ )
157
+ if len(bins_edges.shape) != 1:
158
+ raise ValueError(
159
+ 'The passed bins_edges should be of rank 1. Got'
160
+ f' rank={len(bins_edges.shape)}.'
161
+ )
162
+ return np.searchsorted(bins_edges, returns, side='left')
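Sanity checks for the bucket utilities, matching the docstring examples above (a sketch, run within this module):

import numpy as np

edges, values = get_uniform_buckets_edges_values(4)
# edges == [0.25, 0.5, 0.75]; values == [0.125, 0.375, 0.625, 0.875]

buckets = compute_return_buckets_from_returns(
    returns=np.array([0.0, 1.0]), bins_edges=np.array([0.5])
)
# buckets == [0, 1]

assert centipawns_to_win_probability(0) == 0.5  # an even position is a coin flip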
hf_space_repo/train_self_play.py ADDED
@@ -0,0 +1,72 @@
1
+ """Training script for GRPO chess self-play."""
2
+ import argparse
3
+ import warnings
4
+ from typing import Any
5
+ from torch.utils.data import DataLoader
6
+
7
+ from src.grpo_self_play.trainer import get_trainer
8
+ from src.grpo_self_play.chess.boards_dataset import ChessStartStatesDataset
9
+ from src.grpo_self_play.grpo_logic.model import GRPOChessTransformer
10
+ from src.grpo_self_play.configs.config_loader import load_experiment_config
11
+
12
+
13
+ def train(
14
+ config_path: str = "default.yaml",
15
+ overrides: dict[str, dict[str, Any]] | None = None,
16
+ dataloader_kwargs: dict[str, Any] | None = None
17
+ ) -> None:
18
+ """Main training function for GRPO chess self-play.
19
+
20
+ Args:
21
+ config_path: Path to the YAML config file (relative to configs directory)
22
+ overrides: Optional dict of overrides per section. Example:
23
+ {
24
+ "grpo": {"lr": 1e-4, "entropy_coef": 0.2},
25
+ "training": {"num_epochs": 100},
26
+ "stockfish": {"skill_level": 5},
27
+ }
28
+ dataloader_kwargs: Optional dict of arguments to pass to DataLoader constructor.
29
+ These override config values. Example: {"batch_size": 64, "num_workers": 4}
30
+ """
31
+ config = load_experiment_config(config_path, overrides=overrides)
32
+
33
+ # Build dataloader kwargs from config, with defaults
34
+ dataloader_config = {
35
+ "batch_size": config.training.batch_size,
36
+ "num_workers": 2,
37
+ }
38
+
39
+ # Apply dataloader_kwargs overrides and warn if overriding config values
40
+ if dataloader_kwargs:
41
+ for key, value in dataloader_kwargs.items():
42
+ if key in dataloader_config:
43
+ warnings.warn(
44
+ f"Overriding DataLoader '{key}' from config ({dataloader_config[key]}) "
45
+ f"with provided value ({value})",
46
+ UserWarning,
47
+ stacklevel=2
48
+ )
49
+ dataloader_config[key] = value
50
+
51
+ trainer = get_trainer(num_epochs=config.training.num_epochs)
52
+ dataset = ChessStartStatesDataset(config.dataset)
53
+ dataloader = DataLoader(dataset, **dataloader_config)
54
+ model = GRPOChessTransformer(
55
+ transformer_config=config.transformer,
56
+ grpo_config=config.grpo,
57
+ eval_cfg=config.eval,
58
+ stockfish_cfg=config.stockfish,
59
+ policy_cfg=config.policy,
60
+ searcher_cfg=config.searcher,
61
+ pretrain_cfg=config.pretrain,
62
+ )
63
+
64
+ print("Starting Training with WandB Tracking...")
65
+ trainer.fit(model, dataloader)
66
+
67
+
68
+ if __name__ == "__main__":
69
+ parser = argparse.ArgumentParser()
70
+ parser.add_argument("--config", type=str, default="default.yaml")
71
+ args = parser.parse_args()
72
+ train(config_path=args.config)
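For example, a short smoke run with a few overrides (a sketch; the override keys follow the config sections listed in the docstring):

train(
    config_path="default.yaml",
    overrides={
        "training": {"num_epochs": 2},
        "stockfish": {"skill_level": 1},
    },
    dataloader_kwargs={"batch_size": 8, "num_workers": 0},
)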
hf_space_repo/trainer.py ADDED
@@ -0,0 +1,74 @@
1
+ import time
2
+ import random
3
+ import string
4
+ import pytorch_lightning as pl
5
+
6
+ from pytorch_lightning.loggers import WandbLogger
7
+ from pytorch_lightning.callbacks import ModelCheckpoint
8
+
9
+ def generate_run_name(project: str = "chess-grpo") -> str:
10
+ """Generate a unique run name with timestamp and random suffix.
11
+
12
+ Args:
13
+ project: Project name prefix
14
+
15
+ Returns:
16
+ Unique run name string
17
+ """
18
+ timestamp = time.strftime("%Y%m%d-%H%M")
19
+ random_suffix = ''.join(random.choices(string.ascii_lowercase + string.digits, k=4))
20
+ return f"{project}-{timestamp}-{random_suffix}"
21
+
22
+
23
+ def get_trainer(num_epochs: int = 5000,
24
+ checkpoint_dir: str = "/content/drive/MyDrive/data/grpo-chess/checkpoints/",
25
+ checkpoint_every_n_epochs: int = 5,
26
+ keep_n_checkpoints: int = 3) -> pl.Trainer:
27
+ """Create a PyTorch Lightning trainer with WandB logging and checkpointing.
28
+
29
+ Args:
30
+ num_epochs: Maximum number of training epochs
31
+ checkpoint_dir: Directory to save model checkpoints
32
+ checkpoint_every_n_epochs: Save periodic checkpoint every N epochs
33
+ keep_n_checkpoints: Keep last N periodic checkpoints per run
34
+
35
+ Returns:
36
+ Configured PyTorch Lightning trainer
37
+ """
38
+ run_name = generate_run_name()
39
+ print(f"Generated run name: {run_name}")
40
+
41
+ wandb_logger = WandbLogger(project="Chess-GRPO-Bot", log_model=True, name=run_name)
42
+
43
+ # Best checkpoint - saves top 2 based on loss
44
+ best_checkpoint_cb = ModelCheckpoint(
45
+ dirpath=checkpoint_dir,
46
+ filename=run_name + "-best-{epoch:02d}-{train_total_loss:.4f}",
47
+ save_top_k=2,
48
+ monitor="train_total_loss",
49
+ mode="min"
50
+ )
51
+
52
+ # Periodic checkpoint for crash recovery
53
+ # Fixed filenames (periodic-0, periodic-1, etc.) that rotate within each run
54
+ periodic_checkpoint_cb = ModelCheckpoint(
55
+ dirpath=checkpoint_dir,
56
+ filename=run_name + "-periodic",
57
+ save_top_k=keep_n_checkpoints,
58
+ monitor="train_total_loss",
59
+ mode="min",
60
+ every_n_epochs=checkpoint_every_n_epochs,
61
+ save_last=True, # Always keep the very last checkpoint
62
+ )
63
+
64
+ return pl.Trainer(
65
+ max_epochs=num_epochs,
66
+ # Gradient clipping handled manually in GRPOChessTransformer.training_step
67
+ accelerator="auto",
68
+ devices=1,
69
+ logger=wandb_logger,
70
+ callbacks=[best_checkpoint_cb, periodic_checkpoint_cb],
71
+ log_every_n_steps=1 # Log every step for GRPO debug
72
+ )
73
+
74
+
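Usage sketch (the checkpoint directory is illustrative; see train_self_play.py for the full wiring):

trainer = get_trainer(
    num_epochs=100,
    checkpoint_dir="./checkpoints/",
    checkpoint_every_n_epochs=5,
    keep_n_checkpoints=3,
)
# trainer.fit(model, dataloader)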
requirements.txt CHANGED
@@ -4,9 +4,5 @@ torch
  safetensors
  python-chess
  huggingface_hub
- pytorch_lightning
- mcp>=0.9.0
- wandb>=0.16.0
+ numpy
  jaxtyping
- datasets
- gradio>=4.44.1