bouhss commited on
Commit
663d8ea
·
verified ·
1 Parent(s): 1045380

Chess Challenge submission by bouhss

Browse files
README.md ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: transformers
3
+ tags:
4
+ - chess
5
+ - llm-course
6
+ - chess-challenge
7
+ license: mit
8
+ ---
9
+
10
+ # chess-stockbird2
11
+
12
+ Chess model submitted to the LLM Course Chess Challenge.
13
+
14
+ ## Submission Info
15
+ - **Submitted by**: bouhss
16
+ - **Parameters**: 992,032
17
+ - **Vocab size**: 148
18
+ - **Embedding dim**: 128
19
+ - **Layers**: 6
20
+ - **Heads**: 8
config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "my_model_gpu_full/final_model",
3
+ "architectures": [
4
+ "ChessForCausalLM"
5
+ ],
6
+ "auto_map": {
7
+ "AutoConfig": "model.ChessConfig",
8
+ "AutoModelForCausalLM": "model.ChessForCausalLM"
9
+ },
10
+ "bos_token_id": 1,
11
+ "dropout": 0.05,
12
+ "eos_token_id": 2,
13
+ "layer_norm_epsilon": 1e-06,
14
+ "mlp_type": "swiglu",
15
+ "model_type": "chess_transformer",
16
+ "n_ctx": 256,
17
+ "n_embd": 128,
18
+ "n_head": 8,
19
+ "n_inner": 248,
20
+ "n_layer": 6,
21
+ "pad_token_id": 0,
22
+ "rope_theta": 10000.0,
23
+ "tie_weights": true,
24
+ "torch_dtype": "float32",
25
+ "transformers_version": "4.39.3",
26
+ "use_rmsnorm": true,
27
+ "use_rope": true,
28
+ "vocab_size": 148
29
+ }
model.py ADDED
@@ -0,0 +1,446 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Chess Transformer Model for the Chess Challenge.
3
+
4
+ Modern small-LLM upgrades:
5
+ - RoPE (rotary positional embeddings): no learned positional embeddings needed
6
+ - RMSNorm (optional, default True)
7
+ - SwiGLU MLP (optional, default True)
8
+ - Weight tying (default True)
9
+ - Safe loss ignore_index = -100 (HF convention)
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import math
15
+ from typing import Optional, Tuple, Union
16
+
17
+ import torch
18
+ import torch.nn as nn
19
+ import torch.nn.functional as F
20
+ from transformers import PretrainedConfig, PreTrainedModel
21
+ from transformers.modeling_outputs import CausalLMOutputWithPast
22
+
23
+
24
class ChessConfig(PretrainedConfig):
    """Configuration for the chess transformer.

    Captures architecture size (n_embd / n_layer / n_head / n_ctx), the MLP
    inner width, feature switches (RoPE, RMSNorm, SwiGLU, weight tying) and
    the special-token ids forwarded to PretrainedConfig.
    """

    model_type = "chess_transformer"

    def __init__(
        self,
        vocab_size: int = 1200,
        # Architecture (defaults tuned to be < 1M params for common vocabs)
        n_embd: int = 112,
        n_layer: int = 7,
        n_head: int = 7,
        # Context window
        n_ctx: int = 512,
        # MLP hidden size: SwiGLU hidden h for mlp_type="swiglu",
        # otherwise the FFN inner size for mlp_type="gelu".
        n_inner: Optional[int] = 192,
        dropout: float = 0.05,
        layer_norm_epsilon: float = 1e-6,
        # Position encoding
        use_rope: bool = True,
        rope_theta: float = 10000.0,
        # Normalization / MLP type
        use_rmsnorm: bool = True,
        mlp_type: str = "swiglu",  # "swiglu" or "gelu"
        # Weight tying
        tie_weights: bool = True,
        pad_token_id: int = 0,
        bos_token_id: int = 1,
        eos_token_id: int = 2,
        **kwargs,
    ):
        # Fail fast on impossible head geometry before any other setup.
        if n_embd % n_head != 0:
            raise ValueError(f"n_embd ({n_embd}) must be divisible by n_head ({n_head})")

        head_dim = n_embd // n_head
        if use_rope and (head_dim % 2 != 0):
            raise ValueError(
                f"RoPE requires even head_dim, got head_dim={head_dim}. "
                f"Choose n_embd/n_head even."
            )

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            **kwargs,
        )

        self.vocab_size = vocab_size
        self.n_embd = n_embd
        self.n_layer = n_layer
        self.n_head = n_head
        self.n_ctx = n_ctx
        # Fall back to a 2x expansion when no inner width is given.
        self.n_inner = n_inner if n_inner is not None else (2 * n_embd)
        self.dropout = dropout
        self.layer_norm_epsilon = layer_norm_epsilon

        self.use_rope = use_rope
        self.rope_theta = rope_theta

        self.use_rmsnorm = use_rmsnorm
        self.mlp_type = mlp_type

        self.tie_weights = tie_weights
        # HF reads tie_word_embeddings to drive embedding-tying behavior.
        self.tie_word_embeddings = bool(tie_weights)
98
+
99
+
100
class RMSNorm(nn.Module):
    """Root-mean-square layer norm (no mean subtraction, no bias).

    Normalizes the last dimension by its RMS and applies a learned
    per-channel gain.
    """

    def __init__(self, dim: int, eps: float = 1e-6):
        super().__init__()
        self.eps = eps  # numerical floor inside the rsqrt
        self.weight = nn.Parameter(torch.ones(dim))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        mean_square = x.pow(2).mean(dim=-1, keepdim=True)
        scale = torch.rsqrt(mean_square + self.eps)
        return x * scale * self.weight
109
+
110
+
111
def rotate_half(x: torch.Tensor) -> torch.Tensor:
    """Rotate interleaved pairs: (x0, x1, x2, x3, ...) -> (-x1, x0, -x3, x2, ...).

    Each (even, odd) pair is treated as a complex number and multiplied
    by i; used by the interleaved RoPE convention.
    """
    even = x[..., 0::2]
    odd = x[..., 1::2]
    # Stack (-odd, even) pairwise, then flatten back to the original layout.
    return torch.stack((-odd, even), dim=-1).flatten(-2)
118
+
119
+
120
class RotaryEmbedding(nn.Module):
    """Rotary positional embedding (interleaved-pair convention).

    Lazily builds and caches cos/sin tables per (length, device, dtype)
    and rotates q/k of shape (B, H, T, D).
    """

    def __init__(self, head_dim: int, theta: float = 10000.0):
        super().__init__()
        if head_dim % 2 != 0:
            raise ValueError(f"RoPE requires even head_dim, got {head_dim}")

        inv_freq = 1.0 / (theta ** (torch.arange(0, head_dim, 2).float() / head_dim))
        self.register_buffer("inv_freq", inv_freq, persistent=False)

        # Cache state: invalidated when length grows or device/dtype change.
        self._cos_cached = None
        self._sin_cached = None
        self._seq_len_cached = 0
        self._device_cached = None
        self._dtype_cached = None

    def _build_cache(self, seq_len: int, device: torch.device, dtype: torch.dtype):
        """(Re)compute cos/sin tables of shape (seq_len, D/2)."""
        positions = torch.arange(seq_len, device=device, dtype=self.inv_freq.dtype)
        angles = torch.einsum("i,j->ij", positions, self.inv_freq)

        self._cos_cached = angles.cos().to(dtype=dtype)
        self._sin_cached = angles.sin().to(dtype=dtype)
        self._seq_len_cached = seq_len
        self._device_cached = device
        self._dtype_cached = dtype

    @staticmethod
    def _rotate(x: torch.Tensor) -> torch.Tensor:
        # Interleaved rotation: (x0, x1, ...) -> (-x1, x0, ...); inlined
        # equivalent of the module-level rotate_half helper.
        even = x[..., 0::2]
        odd = x[..., 1::2]
        return torch.stack((-odd, even), dim=-1).flatten(-2)

    def forward(self, q: torch.Tensor, k: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """Apply RoPE to q and k (both (B, H, T, D)); return the rotated pair."""
        seq_len, device, dtype = q.size(-2), q.device, q.dtype

        cache_stale = (
            self._cos_cached is None
            or seq_len > self._seq_len_cached
            or device != self._device_cached
            or dtype != self._dtype_cached
        )
        if cache_stale:
            self._build_cache(seq_len, device, dtype)

        # Expand (T, D/2) tables to (1, 1, T, D) by duplicating each column,
        # matching the interleaved layout of _rotate.
        cos = torch.repeat_interleave(self._cos_cached[:seq_len].unsqueeze(0).unsqueeze(0), 2, dim=-1)
        sin = torch.repeat_interleave(self._sin_cached[:seq_len].unsqueeze(0).unsqueeze(0), 2, dim=-1)

        q_rot = (q * cos) + (self._rotate(q) * sin)
        k_rot = (k * cos) + (self._rotate(k) * sin)
        return q_rot, k_rot
176
+
177
+
178
+ class MultiHeadAttention(nn.Module):
179
+ def __init__(self, config: ChessConfig):
180
+ super().__init__()
181
+
182
+ self.n_head = config.n_head
183
+ self.n_embd = config.n_embd
184
+ self.head_dim = config.n_embd // config.n_head
185
+
186
+ self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd)
187
+ self.c_proj = nn.Linear(config.n_embd, config.n_embd)
188
+ self.dropout = nn.Dropout(config.dropout)
189
+
190
+ self.use_rope = bool(config.use_rope)
191
+ self.rope = RotaryEmbedding(self.head_dim, theta=config.rope_theta) if self.use_rope else None
192
+
193
+ # causal mask buffer (expandable)
194
+ self.register_buffer(
195
+ "bias",
196
+ torch.tril(torch.ones(config.n_ctx, config.n_ctx)).view(1, 1, config.n_ctx, config.n_ctx),
197
+ persistent=False,
198
+ )
199
+
200
+ def _ensure_causal_mask(self, seq_len: int, device: torch.device, dtype: torch.dtype):
201
+ if self.bias.size(-1) >= seq_len and self.bias.device == device:
202
+ return
203
+ self.bias = torch.tril(torch.ones(seq_len, seq_len, device=device, dtype=dtype)).view(1, 1, seq_len, seq_len)
204
+
205
+ def forward(self, x: torch.Tensor, attention_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
206
+ B, T, _ = x.size()
207
+
208
+ qkv = self.c_attn(x)
209
+ q, k, v = qkv.split(self.n_embd, dim=2)
210
+
211
+ q = q.view(B, T, self.n_head, self.head_dim).transpose(1, 2) # (B,H,T,D)
212
+ k = k.view(B, T, self.n_head, self.head_dim).transpose(1, 2)
213
+ v = v.view(B, T, self.n_head, self.head_dim).transpose(1, 2)
214
+
215
+ if self.use_rope:
216
+ q, k = self.rope(q, k)
217
+
218
+ attn = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.head_dim)
219
+
220
+ self._ensure_causal_mask(T, attn.device, attn.dtype)
221
+ causal_mask = self.bias[:, :, :T, :T]
222
+ mask_value = torch.finfo(attn.dtype).min
223
+ attn = attn.masked_fill(causal_mask == 0, mask_value)
224
+
225
+ # padding mask (1=keep, 0=mask)
226
+ if attention_mask is not None:
227
+ am = attention_mask.unsqueeze(1).unsqueeze(2) # (B,1,1,T)
228
+ attn = attn.masked_fill(am == 0, mask_value)
229
+
230
+ attn = F.softmax(attn, dim=-1)
231
+ attn = self.dropout(attn)
232
+
233
+ y = torch.matmul(attn, v) # (B,H,T,D)
234
+ y = y.transpose(1, 2).contiguous().view(B, T, self.n_embd)
235
+
236
+ y = self.c_proj(y)
237
+ y = self.dropout(y)
238
+ return y
239
+
240
+
241
+ class SwiGLU(nn.Module):
242
+ def __init__(self, config: ChessConfig):
243
+ super().__init__()
244
+ h = config.n_inner
245
+ self.w12 = nn.Linear(config.n_embd, 2 * h)
246
+ self.w3 = nn.Linear(h, config.n_embd)
247
+ self.dropout = nn.Dropout(config.dropout)
248
+
249
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
250
+ x12 = self.w12(x)
251
+ x1, x2 = x12.chunk(2, dim=-1)
252
+ x = F.silu(x1) * x2
253
+ x = self.w3(x)
254
+ x = self.dropout(x)
255
+ return x
256
+
257
+
258
+ class FeedForwardGELU(nn.Module):
259
+ def __init__(self, config: ChessConfig):
260
+ super().__init__()
261
+ self.c_fc = nn.Linear(config.n_embd, config.n_inner)
262
+ self.c_proj = nn.Linear(config.n_inner, config.n_embd)
263
+ self.dropout = nn.Dropout(config.dropout)
264
+
265
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
266
+ x = self.c_fc(x)
267
+ x = F.gelu(x)
268
+ x = self.c_proj(x)
269
+ x = self.dropout(x)
270
+ return x
271
+
272
+
273
class TransformerBlock(nn.Module):
    """Pre-norm transformer block: x + attn(norm(x)), then x + mlp(norm(x))."""

    def __init__(self, config: ChessConfig):
        super().__init__()

        # Norm flavor and MLP flavor are both chosen from the config.
        norm_cls = RMSNorm if config.use_rmsnorm else nn.LayerNorm
        self.ln_1 = norm_cls(config.n_embd, eps=config.layer_norm_epsilon)
        self.ln_2 = norm_cls(config.n_embd, eps=config.layer_norm_epsilon)

        self.attn = MultiHeadAttention(config)

        mlp_cls = SwiGLU if config.mlp_type.lower() == "swiglu" else FeedForwardGELU
        self.mlp = mlp_cls(config)

    def forward(self, x: torch.Tensor, attention_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
        # Residual around attention, then residual around the MLP.
        x = x + self.attn(self.ln_1(x), attention_mask=attention_mask)
        return x + self.mlp(self.ln_2(x))
295
+
296
+
297
class ChessForCausalLM(PreTrainedModel):
    """Decoder-only causal language model over chess-move tokens.

    Pipeline: token embedding (plus learned positional embedding only when
    RoPE is disabled) -> dropout -> n_layer TransformerBlocks -> final norm
    -> linear LM head, weight-tied to the embedding when configured.
    """

    config_class = ChessConfig
    base_model_prefix = "transformer"
    supports_gradient_checkpointing = True
    # With tied weights, checkpoints may omit lm_head.weight; don't warn.
    keys_to_ignore_on_load_missing = ["lm_head.weight"]
    # Keep a whole block on one device when sharding (accelerate).
    _no_split_modules = ["TransformerBlock"]

    def __init__(self, config: ChessConfig):
        super().__init__(config)

        self.wte = nn.Embedding(config.vocab_size, config.n_embd)

        # learned positional embeddings only if RoPE disabled
        self.wpe = None
        if not config.use_rope:
            self.wpe = nn.Embedding(config.n_ctx, config.n_embd)

        self.drop = nn.Dropout(config.dropout)
        self.h = nn.ModuleList([TransformerBlock(config) for _ in range(config.n_layer)])

        if config.use_rmsnorm:
            self.ln_f = RMSNorm(config.n_embd, eps=config.layer_norm_epsilon)
        else:
            self.ln_f = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)

        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

        if config.tie_weights:
            # Tells HF save/load machinery which parameters are shared.
            self._tied_weights_keys = ["lm_head.weight"]

        # post_init() runs HF weight init (_init_weights below); tying is
        # re-applied afterwards so initialization cannot break the share.
        self.post_init()

        if config.tie_weights:
            self.tie_weights()

    def get_input_embeddings(self) -> nn.Module:
        """Return the token-embedding module (HF resize/tie hook)."""
        return self.wte

    def set_input_embeddings(self, new_embeddings: nn.Module):
        """Swap the token embedding; re-tie the LM head if tying is enabled."""
        self.wte = new_embeddings
        if getattr(self.config, "tie_weights", False):
            self.tie_weights()

    def get_output_embeddings(self) -> nn.Module:
        """Return the LM head (HF resize/tie hook)."""
        return self.lm_head

    def set_output_embeddings(self, new_embeddings: nn.Module):
        """Replace the LM head module."""
        self.lm_head = new_embeddings

    def tie_weights(self):
        """Share (or clone, per HF internals) lm_head.weight with wte.weight."""
        if getattr(self.config, "tie_weights", False) or getattr(self.config, "tie_word_embeddings", False):
            self._tie_or_clone_weights(self.lm_head, self.wte)

    def _init_weights(self, module: nn.Module):
        """GPT-2-style init: N(0, 0.02) weights, zero biases (HF init hook)."""
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(
        self,
        input_ids: torch.LongTensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        return_dict: Optional[bool] = None,
        **kwargs,
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        """Run the language model.

        Args:
            input_ids: (B, T) token ids.
            attention_mask: (B, T), 1 = keep, 0 = padding.
            position_ids: explicit positions; only consulted when learned
                positional embeddings are active (use_rope=False).
            labels: (B, T) targets; -100 entries are ignored by the loss.
                Shifting is done here (predict token t+1 from prefix <= t).
            return_dict: HF convention; tuple output when False.
            **kwargs: ignored extra Trainer-passed arguments.

        Returns:
            CausalLMOutputWithPast (or tuple) with `loss` (if labels given)
            and `logits` of shape (B, T, vocab_size).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        B, T = input_ids.size()
        device = input_ids.device

        x = self.wte(input_ids)

        # Learned positions are only added when RoPE is off.
        if self.wpe is not None:
            if position_ids is None:
                position_ids = torch.arange(T, device=device).unsqueeze(0).expand(B, -1)
            x = x + self.wpe(position_ids)

        x = self.drop(x)

        for block in self.h:
            x = block(x, attention_mask=attention_mask)

        x = self.ln_f(x)
        logits = self.lm_head(x)

        loss = None
        if labels is not None:
            # Standard causal shift: logits[t] predicts labels[t+1].
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            loss_fct = nn.CrossEntropyLoss(ignore_index=-100)
            loss = loss_fct(
                shift_logits.view(-1, shift_logits.size(-1)),
                shift_labels.view(-1),
            )

        if not return_dict:
            output = (logits,)
            return ((loss,) + output) if loss is not None else output

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=None,
            hidden_states=None,
            attentions=None,
        )

    @torch.no_grad()
    def generate_move(
        self,
        input_ids: torch.LongTensor,
        temperature: float = 0.7,
        top_k: Optional[int] = 50,
        top_p: Optional[float] = None,
    ) -> int:
        """Sample the next token id for a (1, T) prompt.

        Applies temperature scaling, then optional top-k and nucleus
        (top-p) filtering, then samples from what remains.

        Returns:
            The sampled token id as a plain int.
        """
        self.eval()

        outputs = self(input_ids)
        # Last-position logits; temperature is clamped away from zero.
        logits = outputs.logits[:, -1, :] / max(float(temperature), 1e-6)

        if top_k is not None and top_k > 0:
            # Mask everything strictly below the k-th largest logit.
            k = min(int(top_k), logits.size(-1))
            thresh = torch.topk(logits, k)[0][..., -1, None]
            logits = logits.masked_fill(logits < thresh, torch.finfo(logits.dtype).min)

        if top_p is not None:
            sorted_logits, sorted_indices = torch.sort(logits, descending=True)
            probs = F.softmax(sorted_logits, dim=-1)
            cum = torch.cumsum(probs, dim=-1)
            to_remove = cum > float(top_p)
            # Shift right so the first token crossing the threshold is kept.
            to_remove[..., 1:] = to_remove[..., :-1].clone()
            to_remove[..., 0] = 0
            # Scatter un-sorts the removal mask back to vocabulary order.
            indices_to_remove = to_remove.scatter(dim=-1, index=sorted_indices, src=to_remove)
            logits = logits.masked_fill(indices_to_remove, torch.finfo(logits.dtype).min)

        probs = F.softmax(logits, dim=-1)
        next_token = torch.multinomial(probs, num_samples=1)
        return int(next_token.item())
440
+
441
+
442
# Register the custom architecture with the transformers Auto* factories so
# AutoConfig / AutoModelForCausalLM resolve "chess_transformer" (e.g. when
# loading this repo with trust_remote_code=True).
from transformers import AutoConfig, AutoModelForCausalLM

AutoConfig.register("chess_transformer", ChessConfig)
AutoModelForCausalLM.register(ChessConfig, ChessForCausalLM)
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9148bdf02f882142d8414a64c14a568340412aa2d8c046ee1979da5d498f62e3
3
+ size 3973424
special_tokens_map.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "[BOS]",
3
+ "eos_token": "[EOS]",
4
+ "pad_token": "[PAD]",
5
+ "unk_token": "[UNK]"
6
+ }
src/.ipynb_checkpoints/__init__-checkpoint.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Chess Challenge source module."""

from .model import ChessConfig, ChessForCausalLM
from .tokenizer import ChessTokenizer

# Names re-exported lazily from .evaluate so importing this package as a
# module does not trigger a RuntimeWarning from an eager evaluate import.
_LAZY_EVALUATE_EXPORTS = ("ChessEvaluator", "load_model_from_hub")


def __getattr__(name):
    """PEP 562 hook: resolve evaluate exports on first access."""
    if name in _LAZY_EVALUATE_EXPORTS:
        from . import evaluate
        return getattr(evaluate, name)
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")


__all__ = [
    "ChessConfig",
    "ChessForCausalLM",
    "ChessTokenizer",
    "ChessEvaluator",
    "load_model_from_hub",
]
src/.ipynb_checkpoints/data-checkpoint.py ADDED
@@ -0,0 +1,205 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Data loading utilities for the Chess Challenge.
3
+
4
+ This module provides functions to load and process chess game data
5
+ from the Lichess dataset on Hugging Face.
6
+
7
+ IMPORTANT NOTE (compat with template evaluate + custom tokenizers):
8
+ - Do NOT manually prepend BOS in the raw text.
9
+ The tokenizer should handle BOS via build_inputs_with_special_tokens.
10
+ This avoids double-BOS issues and keeps train/eval conventions aligned.
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ from typing import Dict, Iterator, List, Optional
16
+
17
+ import torch
18
+ from torch.utils.data import Dataset
19
+
20
+
21
class ChessDataset(Dataset):
    """PyTorch Dataset over tokenized chess games.

    Every game is tokenized and truncated/padded to ``max_length``.
    ``labels`` mirror ``input_ids`` (the model shifts internally), with
    padding positions set to -100 so cross-entropy ignores them.
    """

    def __init__(
        self,
        tokenizer,
        dataset_name: str = "dlouapre/lichess_2025-01_1M",
        split: str = "train",
        column: str = "text",
        max_length: int = 256,
        max_samples: Optional[int] = None,
    ):
        from datasets import load_dataset

        self.tokenizer = tokenizer
        self.max_length = max_length
        self.column = column

        loaded = load_dataset(dataset_name, split=split)
        if max_samples is not None:
            keep = min(max_samples, len(loaded))
            loaded = loaded.select(range(keep))
        self.data = loaded

    def __len__(self) -> int:
        return len(self.data)

    def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]:
        game_text = self.data[idx][self.column]

        # BOS is deliberately NOT prepended here; the tokenizer is expected
        # to add special tokens itself (build_inputs_with_special_tokens),
        # keeping train/eval conventions aligned and avoiding double BOS.
        enc = self.tokenizer(
            game_text,
            truncation=True,
            max_length=self.max_length,
            padding="max_length",
            return_tensors="pt",
        )

        ids = enc["input_ids"].squeeze(0)
        mask = enc["attention_mask"].squeeze(0)

        # HF convention: -100 labels are ignored by the CE loss.
        labels = ids.clone()
        labels[mask == 0] = -100

        return {
            "input_ids": ids,
            "attention_mask": mask,
            "labels": labels,
        }
+ }
80
+
81
+
82
class ChessDataCollator:
    """Collator that stacks pre-padded examples into batch tensors.

    ChessDataset already pads every sequence to a fixed length, so
    collation reduces to one torch.stack per field.
    """

    def __init__(self, tokenizer, max_length: int = 256):
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __call__(self, features: List[Dict]) -> Dict[str, torch.Tensor]:
        return {
            key: torch.stack([feature[key] for feature in features])
            for key in ("input_ids", "attention_mask", "labels")
        }
+ }
104
+
105
+
106
def create_train_val_datasets(
    tokenizer,
    dataset_name: str = "dlouapre/lichess_2025-01_1M",
    max_length: int = 256,
    train_samples: Optional[int] = None,
    val_samples: int = 5000,
    val_ratio: float = 0.05,
):
    """
    Create training and validation datasets.

    Splits the dataset deterministically by index:
        - train: [0:n_train)
        - val:   [n_train:n_train+n_val)

    Args:
        tokenizer: tokenizer handed to each ChessDataset.
        dataset_name: HF dataset repo id.
        max_length: max token length per game.
        train_samples: cap on training examples; None means use
            (1 - val_ratio) of the dataset.
        val_samples: desired validation size (clamped to what remains).
        val_ratio: validation fraction used when train_samples is None.

    Returns:
        (train_dataset, val_dataset)
    """
    from datasets import load_dataset

    full_dataset = load_dataset(dataset_name, split="train")
    total = len(full_dataset)

    if train_samples is not None:
        n_train = min(train_samples, total - val_samples)
    else:
        n_train = int(total * (1 - val_ratio))

    n_val = min(val_samples, total - n_train)

    def _wrap(subset):
        # BUGFIX: previously each ChessDataset(...) constructor re-loaded the
        # full dataset from the hub only to have its .data overwritten right
        # after — three loads total. Build the wrapper around the
        # already-loaded split instead (bypassing __init__ via __new__).
        ds = ChessDataset.__new__(ChessDataset)
        ds.tokenizer = tokenizer
        ds.max_length = max_length
        ds.column = "text"  # ChessDataset's default column
        ds.data = subset
        return ds

    train_dataset = _wrap(full_dataset.select(range(n_train)))
    val_dataset = _wrap(full_dataset.select(range(n_train, n_train + n_val)))

    return train_dataset, val_dataset
+ return train_dataset, val_dataset
154
+
155
+
156
def stream_games(
    dataset_name: str = "dlouapre/lichess_2025-01_1M",
    split: str = "train",
    column: str = "text",
) -> Iterator[str]:
    """
    Stream games from the dataset for memory-efficient processing.

    Uses HF streaming mode, so the dataset is never fully materialized.
    """
    from datasets import load_dataset

    stream = load_dataset(dataset_name, split=split, streaming=True)
    yield from (record[column] for record in stream)
+ yield example[column]
169
+
170
+
171
def analyze_dataset_statistics(
    dataset_name: str = "dlouapre/lichess_2025-01_1M",
    max_samples: int = 10000,
) -> Dict:
    """
    Analyze statistics of the chess dataset (non-streaming).

    Games are whitespace-split on the "text" column; the first 4 tokens of
    each game are treated as its opening signature.

    Args:
        dataset_name: HF dataset repo id.
        max_samples: cap on how many games are analyzed.

    Returns:
        Dict with game-length stats, move frequencies, and common openings.
        For an empty dataset, returns zeroed stats instead of raising.
    """
    from collections import Counter
    from datasets import load_dataset

    dataset = load_dataset(dataset_name, split="train")
    dataset = dataset.select(range(min(max_samples, len(dataset))))

    game_lengths = []
    move_counts = Counter()
    opening_moves = Counter()

    for example in dataset:
        moves = example["text"].strip().split()
        game_lengths.append(len(moves))
        move_counts.update(moves)

        if len(moves) >= 4:
            opening = " ".join(moves[:4])
            opening_moves[opening] += 1

    # BUGFIX: the original divided by len(game_lengths) and called min()/max()
    # unconditionally, crashing on an empty dataset/selection.
    if not game_lengths:
        return {
            "total_games": 0,
            "avg_game_length": 0.0,
            "min_game_length": 0,
            "max_game_length": 0,
            "unique_moves": 0,
            "most_common_moves": [],
            "most_common_openings": [],
        }

    return {
        "total_games": len(dataset),
        "avg_game_length": sum(game_lengths) / len(game_lengths),
        "min_game_length": min(game_lengths),
        "max_game_length": max(game_lengths),
        "unique_moves": len(move_counts),
        "most_common_moves": move_counts.most_common(20),
        "most_common_openings": opening_moves.most_common(10),
    }
+ }
src/.ipynb_checkpoints/evaluate-checkpoint.py ADDED
@@ -0,0 +1,710 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Evaluation script for the Chess Challenge.
3
+
4
+ This script evaluates a trained chess model by playing games against
5
+ Stockfish and computing ELO ratings.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import argparse
11
+ import random
12
+ import re
13
+ from dataclasses import dataclass
14
+ from typing import List, Optional, Tuple
15
+
16
+ import torch
17
+
18
+
19
@dataclass
class GameResult:
    """Result of a single game.

    Plain record of one model-vs-engine game: what was played, who the
    model was, and how the game ended.
    """
    moves: List[str]  # moves in play order; notation depends on the evaluator's detected format
    result: str  # "1-0", "0-1", or "1/2-1/2"
    model_color: str  # "white" or "black"
    termination: str  # "checkmate", "stalemate", "illegal_move", "max_moves", etc.
    illegal_move_count: int  # how many illegal moves the model generated during the game
27
+
28
+
29
+ class ChessEvaluator:
30
+ """
31
+ Evaluator for chess models.
32
+
33
+ This class handles playing games between a trained model and Stockfish,
34
+ tracking results, and computing ELO ratings.
35
+
36
+ Supports any tokenization format as long as the model generates valid
37
+ chess squares (e.g., e2, e4). The evaluator extracts UCI moves by finding
38
+ square patterns in the generated output.
39
+ """
40
+
41
+ # Regex pattern to match chess squares
42
+ SQUARE_PATTERN = r"[a-h][1-8]"
43
+
44
    def __init__(
        self,
        model,
        tokenizer,
        stockfish_path: Optional[str] = None,
        stockfish_level: int = 1,
        max_retries: int = 3,
        device: str = "cuda" if torch.cuda.is_available() else "cpu",
    ):
        """
        Initialize the evaluator.

        Args:
            model: The trained chess model.
            tokenizer: The chess tokenizer.
            stockfish_path: Path to Stockfish executable.
            stockfish_level: Stockfish skill level (0-20).
            max_retries: Maximum retries for illegal moves.
            device: Device to run the model on. NOTE: the default is
                evaluated once at class-definition time, not per call.

        Raises:
            ImportError: if python-chess is not installed.
        """
        self.model = model.to(device)
        self.model.eval()  # inference only: disables dropout
        self.tokenizer = tokenizer
        self.max_retries = max_retries
        self.device = device

        # Initialize Stockfish
        try:
            import chess
            import chess.engine

            # Keep the chess module handy for other methods (Board, square_name, ...).
            self.chess = chess

            if stockfish_path is None:
                # Try common paths
                import shutil

                stockfish_path = shutil.which("stockfish")

            if stockfish_path:
                # Spawns a Stockfish subprocess; shut down in __del__.
                self.engine = chess.engine.SimpleEngine.popen_uci(stockfish_path)
                self.engine.configure({"Skill Level": stockfish_level})
            else:
                # Degraded mode: evaluation that needs an engine won't work.
                print("WARNING: Stockfish not found. Install it for full evaluation.")
                self.engine = None

        except ImportError:
            raise ImportError(
                "python-chess is required for evaluation. "
                "Install it with: pip install python-chess"
            )
95
+
96
+ def __del__(self):
97
+ """Clean up Stockfish engine."""
98
+ if hasattr(self, "engine") and self.engine:
99
+ self.engine.quit()
100
+
101
    def _detect_tokenizer_format(self) -> str:
        """
        Detect the tokenizer's expected move format by testing tokenization.

        Tests various formats with a sample move and picks the one that
        produces the fewest unknown tokens. This makes evaluation work
        with any tokenizer format.

        Supported formats:
            - 'decomposed': "WP e2_f e4_t" (piece, from_suffix, to_suffix)
            - 'standard': "WPe2e4" (combined with optional annotations)
            - 'uci': "e2e4" (pure UCI notation)
            - 'uci_spaced': "e2 e4" (UCI with space separator)

        Returns:
            The format string that best matches the tokenizer's vocabulary.
        """
        # Memoized: the tokenizer does not change over the evaluator's lifetime.
        if hasattr(self, "_cached_format"):
            return self._cached_format

        test_formats = {
            "decomposed": "WP e2_f e4_t",
            "standard": "WPe2e4",
            "uci": "e2e4",
            "uci_spaced": "e2 e4",
        }

        unk_token_id = getattr(self.tokenizer, "unk_token_id", None)
        best_format = "standard"  # fallback if every probe fails or raises
        min_unk_count = float("inf")

        for fmt, sample in test_formats.items():
            try:
                tokens = self.tokenizer.encode(sample, add_special_tokens=False)
                unk_count = tokens.count(unk_token_id) if unk_token_id is not None else 0
                # A single token that is itself <unk> means the tokenizer
                # understood nothing about this format: penalize heavily.
                if len(tokens) == 1 and unk_count == 1:
                    unk_count = 100  # heavy penalty
                # Strict '<' keeps the FIRST format on ties, so the dict
                # order above doubles as the preference order.
                if unk_count < min_unk_count:
                    min_unk_count = unk_count
                    best_format = fmt
            except Exception:
                # Some tokenizers may raise on alien input; just skip.
                continue

        self._cached_format = best_format
        return best_format
146
+
147
+ def _format_move(
148
+ self,
149
+ color: str,
150
+ piece: str,
151
+ from_sq: str,
152
+ to_sq: str,
153
+ promotion: str = None,
154
+ ) -> str:
155
+ fmt = self._detect_tokenizer_format()
156
+
157
+ if fmt == "decomposed":
158
+ move_str = f"{color}{piece} {from_sq}_f {to_sq}_t"
159
+ elif fmt == "uci":
160
+ move_str = f"{from_sq}{to_sq}"
161
+ if promotion:
162
+ move_str += promotion.lower()
163
+ elif fmt == "uci_spaced":
164
+ move_str = f"{from_sq} {to_sq}"
165
+ if promotion:
166
+ move_str += f" {promotion.lower()}"
167
+ else: # standard
168
+ move_str = f"{color}{piece}{from_sq}{to_sq}"
169
+ if promotion:
170
+ move_str += f"={promotion}"
171
+
172
+ return move_str
173
+
174
    def _convert_board_to_moves(self, board) -> str:
        """Re-render a finished board's move stack in the tokenizer's format.

        Replays the game on a scratch board so per-move state (capture,
        check, castling) can be annotated as it was at the time of each move.
        Annotations — "(x)", "(+)", "(x+)", "(+*)", "(x+*)", "(o)"/"(O)" —
        are only emitted for the "standard" format; other formats get bare
        moves. Returns the space-joined move string.
        """
        moves = []
        temp_board = self.chess.Board()
        fmt = self._detect_tokenizer_format()

        for move in board.move_stack:
            # Side to move and moving piece must be read BEFORE pushing.
            color = "W" if temp_board.turn == self.chess.WHITE else "B"
            piece = temp_board.piece_at(move.from_square)
            piece_letter = piece.symbol().upper() if piece else "P"

            from_sq = self.chess.square_name(move.from_square)
            to_sq = self.chess.square_name(move.to_square)

            promo = None
            if move.promotion:
                promo = self.chess.piece_symbol(move.promotion).upper()

            move_str = self._format_move(color, piece_letter, from_sq, to_sq, promo)

            if fmt == "standard":
                # Capture is tested pre-push; check/checkmate post-push.
                if temp_board.is_capture(move):
                    move_str += "(x)"

                temp_board.push(move)

                if temp_board.is_checkmate():
                    if "(x)" in move_str:
                        move_str = move_str.replace("(x)", "(x+*)")
                    else:
                        move_str += "(+*)"
                elif temp_board.is_check():
                    if "(x)" in move_str:
                        move_str = move_str.replace("(x)", "(x+)")
                    else:
                        move_str += "(+)"

                # Castling: king moves more than one file. Any earlier
                # annotation is replaced by "(o)" (kingside, to g-file)
                # or "(O)" (queenside).
                if piece_letter == "K":
                    if abs(ord(from_sq[0]) - ord(to_sq[0])) > 1:
                        if to_sq[0] == "g":
                            move_str = move_str.split("(")[0] + "(o)"
                        else:
                            move_str = move_str.split("(")[0] + "(O)"
            else:
                # Non-standard formats: just advance the scratch board.
                temp_board.push(move)

            moves.append(move_str)

        return " ".join(moves)
222
+
223
+ def _is_separator_token(self, token_str: str) -> bool:
224
+ if hasattr(self.tokenizer, "eos_token") and token_str == self.tokenizer.eos_token:
225
+ return True
226
+ if token_str.strip() == "" and len(token_str) > 0:
227
+ return True
228
+ if token_str != token_str.rstrip():
229
+ return True
230
+ return False
231
+
232
+ def _extract_uci_move(self, text: str) -> Optional[str]:
233
+ if not text:
234
+ return None
235
+
236
+ squares = re.findall(self.SQUARE_PATTERN, text)
237
+ if len(squares) < 2:
238
+ return None
239
+
240
+ from_sq, to_sq = squares[0], squares[1]
241
+ uci_move = from_sq + to_sq
242
+
243
+ to_sq_idx = text.find(to_sq)
244
+ if to_sq_idx != -1:
245
+ remaining = text[to_sq_idx + 2 : to_sq_idx + 5]
246
+ promo_match = re.search(r"[=]?([qrbnQRBN])", remaining)
247
+ if promo_match:
248
+ uci_move += promo_match.group(1).lower()
249
+
250
+ return uci_move
251
+
252
+ def _has_complete_move(self, text: str) -> bool:
253
+ squares = re.findall(self.SQUARE_PATTERN, text)
254
+ return len(squares) >= 2
255
+
256
def _generate_move_tokens(
    self,
    input_ids: torch.Tensor,
    temperature: float = 0.7,
    top_k: int = 10,
    max_tokens: int = 20,
) -> str:
    """Sample tokens from the model until one complete move is produced.

    Performs temperature + top-k sampling one token at a time, stopping at
    a separator token once a full move (two square names) has accumulated,
    or after ``max_tokens`` tokens.

    Args:
        input_ids: Prompt token ids; presumably shape (1, T) — TODO confirm.
        temperature: Softmax temperature (clamped away from zero).
        top_k: Keep only the k most likely tokens per step; <= 0 disables.
        max_tokens: Hard cap on tokens generated per move.

    Returns:
        The decoded, stripped move text, or "" if nothing was generated.
    """
    generated_tokens = []
    current_ids = input_ids.clone()
    accumulated_text = ""

    for _ in range(max_tokens):
        with torch.no_grad():
            outputs = self.model(input_ids=current_ids)
            # Clamp temperature to avoid division by zero.
            logits = outputs.logits[:, -1, :] / max(temperature, 1e-6)

        if top_k > 0:
            # Mask out everything below the k-th largest logit.
            top_k_vals = torch.topk(logits, min(top_k, logits.size(-1)))
            indices_to_remove = logits < top_k_vals[0][..., -1, None]
            logits[indices_to_remove] = float("-inf")

        probs = torch.softmax(logits, dim=-1)
        next_token = torch.multinomial(probs, num_samples=1)

        token_str = self.tokenizer.decode(next_token[0])

        if self._is_separator_token(token_str):
            # Separator after a complete move: the move is done.
            if self._has_complete_move(accumulated_text):
                break
            # EOS always terminates, complete move or not.
            if hasattr(self.tokenizer, "eos_token") and token_str == self.tokenizer.eos_token:
                break
            # Separator after a partial move: abandon this attempt.
            if accumulated_text:
                break

        generated_tokens.append(next_token[0])
        current_ids = torch.cat([current_ids, next_token], dim=-1)
        accumulated_text += token_str

        if self._has_complete_move(accumulated_text):
            squares = re.findall(self.SQUARE_PATTERN, accumulated_text)
            if len(squares) >= 2:
                to_sq = squares[1]
                if to_sq[1] in "18":
                    # Destination on a back rank: keep sampling briefly in
                    # case a promotion piece token follows.
                    if len(generated_tokens) > 3:
                        break
                else:
                    break

    if generated_tokens:
        all_tokens = torch.cat(generated_tokens, dim=0)
        move_str = self.tokenizer.decode(all_tokens, skip_special_tokens=True)
        return move_str.strip()

    return ""
310
+
311
def _get_model_move(
    self,
    board,
    temperature: float = 0.7,
    top_k: int = 10,
) -> Tuple[Optional[str], int]:
    """Ask the model for a legal move in the given position.

    Encodes the game so far, samples up to ``self.max_retries`` candidate
    moves, and returns the first one that is legal on ``board``.

    Args:
        board: Current board (python-chess style interface).
        temperature: Sampling temperature forwarded to generation.
        top_k: Top-k cutoff forwarded to generation.

    Returns:
        ``(uci_move, retries)`` — ``uci_move`` is a legal UCI string or
        None if every attempt failed; ``retries`` counts failed attempts
        before success (== self.max_retries on total failure).
    """
    self.model.eval()

    moves_str = self._convert_board_to_moves(board)

    # Prompt is BOS, then the move history (if any).
    if not moves_str:
        input_text = self.tokenizer.bos_token
    else:
        input_text = self.tokenizer.bos_token + " " + moves_str

    inputs = self.tokenizer(
        input_text,
        return_tensors="pt",
        truncation=True,
        # Leave headroom for the tokens of the move to be generated.
        max_length=self.model.config.n_ctx - 10,
    ).to(self.device)

    for retry in range(self.max_retries):
        move_text = self._generate_move_tokens(
            inputs["input_ids"],
            temperature=temperature,
            top_k=top_k,
        )

        uci_move = self._extract_uci_move(move_text)

        if uci_move:
            try:
                move = self.chess.Move.from_uci(uci_move)
                if move in board.legal_moves:
                    return uci_move, retry
            except (ValueError, self.chess.InvalidMoveError):
                # Malformed UCI string: count as a failed attempt.
                pass

    return None, self.max_retries
351
+
352
+ def _get_stockfish_move(self, board, time_limit: float = 0.1) -> str:
353
+ if self.engine is None:
354
+ raise RuntimeError("Stockfish engine not initialized")
355
+
356
+ result = self.engine.play(board, self.chess.engine.Limit(time=time_limit))
357
+ return result.move.uci()
358
+
359
def play_game(
    self,
    model_color: str = "white",
    max_moves: int = 200,
    temperature: float = 0.7,
) -> GameResult:
    """Play one full game: the model vs. Stockfish (or a random mover).

    The opponent is Stockfish when an engine is configured, otherwise a
    uniform-random legal mover. If the model cannot produce a legal move
    within its retry budget it forfeits immediately.

    Args:
        model_color: "white" or "black" — the side the model plays.
        max_moves: Cap on total half-moves before declaring a draw.
        temperature: Sampling temperature for the model's moves.

    Returns:
        A GameResult with the move list, result string, termination reason
        and the accumulated number of illegal generation attempts.
    """
    board = self.chess.Board()
    moves = []
    illegal_move_count = 0

    model_is_white = model_color == "white"

    while not board.is_game_over() and len(moves) < max_moves:
        is_model_turn = (board.turn == self.chess.WHITE) == model_is_white

        if is_model_turn:
            uci_move, retries = self._get_model_move(board, temperature)
            illegal_move_count += retries

            if uci_move is None:
                # Model never produced a legal move: immediate forfeit.
                return GameResult(
                    moves=moves,
                    result="0-1" if model_is_white else "1-0",
                    model_color=model_color,
                    termination="illegal_move",
                    illegal_move_count=illegal_move_count + 1,
                )

            move = self.chess.Move.from_uci(uci_move)
        else:
            if self.engine:
                uci_move = self._get_stockfish_move(board)
                move = self.chess.Move.from_uci(uci_move)
            else:
                # No engine configured: opponent plays random legal moves.
                move = random.choice(list(board.legal_moves))

        board.push(move)
        moves.append(move.uci())

    # Classify the final position. board.turn is the side *to move*, i.e.
    # the side that has been checkmated when is_checkmate() is True.
    if board.is_checkmate():
        if board.turn == self.chess.WHITE:
            result = "0-1"
        else:
            result = "1-0"
        termination = "checkmate"
    elif board.is_stalemate():
        result = "1/2-1/2"
        termination = "stalemate"
    elif board.is_insufficient_material():
        result = "1/2-1/2"
        termination = "insufficient_material"
    elif board.can_claim_draw():
        result = "1/2-1/2"
        termination = "draw_claim"
    elif len(moves) >= max_moves:
        result = "1/2-1/2"
        termination = "max_moves"
    else:
        result = "1/2-1/2"
        termination = "unknown"

    return GameResult(
        moves=moves,
        result=result,
        model_color=model_color,
        termination=termination,
        illegal_move_count=illegal_move_count,
    )
427
+
428
def evaluate_legal_moves(
    self,
    n_positions: int = 1000,
    temperature: float = 0.7,
    verbose: bool = True,
    seed: int = 42,
) -> dict:
    """Measure how often the model proposes a legal move.

    Generates random positions (5-40 random plies from the start), asks
    the model for a move in each, and tallies first-try / with-retry
    legality.

    Args:
        n_positions: Number of random positions to attempt.
        temperature: Sampling temperature for the model.
        verbose: Print a progress line every 100 positions.
        seed: Seed for both ``random`` and torch, for reproducibility.

    Returns:
        Dict with raw counts, per-position records, and legality rates.
    """
    random.seed(seed)
    torch.manual_seed(seed)

    results = {
        "total_positions": 0,
        "legal_first_try": 0,
        "legal_with_retry": 0,
        "illegal_all_retries": 0,
        "positions": [],
    }

    for i in range(n_positions):
        board = self.chess.Board()

        # Reach a random position by playing 5-40 random legal moves.
        n_random_moves = random.randint(5, 40)
        for _ in range(n_random_moves):
            if board.is_game_over():
                break
            move = random.choice(list(board.legal_moves))
            board.push(move)

        # Skip finished games: there is no move for the model to make.
        if board.is_game_over():
            continue

        results["total_positions"] += 1

        uci_move, retries = self._get_model_move(board, temperature)

        position_result = {
            "fen": board.fen(),
            "move_number": len(board.move_stack),
            "legal": uci_move is not None,
            "retries": retries,
        }
        results["positions"].append(position_result)

        if uci_move is not None:
            if retries == 0:
                results["legal_first_try"] += 1
            else:
                results["legal_with_retry"] += 1
        else:
            results["illegal_all_retries"] += 1

        if verbose and (i + 1) % 100 == 0:
            legal_rate = (results["legal_first_try"] + results["legal_with_retry"]) / results["total_positions"]
            print(f" Positions: {i + 1}/{n_positions} | Legal rate: {legal_rate:.1%}")

    # Convert counts to rates; guard against zero tested positions.
    total = results["total_positions"]
    if total > 0:
        results["legal_rate_first_try"] = results["legal_first_try"] / total
        results["legal_rate_with_retry"] = (results["legal_first_try"] + results["legal_with_retry"]) / total
        results["illegal_rate"] = results["illegal_all_retries"] / total
    else:
        results["legal_rate_first_try"] = 0
        results["legal_rate_with_retry"] = 0
        results["illegal_rate"] = 1

    return results
494
+
495
def evaluate(
    self,
    n_games: int = 100,
    temperature: float = 0.7,
    verbose: bool = True,
) -> dict:
    """Play ``n_games`` against the opponent and aggregate statistics.

    The model alternates colors between games. Results include win/draw/
    loss counts and rates, move statistics, and a rough ELO estimate
    relative to the opponent.

    Args:
        n_games: Number of games to play.
        temperature: Sampling temperature for the model's moves.
        verbose: Print a progress line every 10 games.

    Returns:
        Dict of aggregate statistics plus the per-game GameResult list.
    """
    import math  # local import: only needed for the ELO estimate

    results = {
        "wins": 0,
        "losses": 0,
        "draws": 0,
        "illegal_moves": 0,
        "total_moves": 0,
        "games": [],
    }

    for i in range(n_games):
        # Alternate colors so the model plays both sides equally.
        model_color = "white" if i % 2 == 0 else "black"

        game = self.play_game(
            model_color=model_color,
            temperature=temperature,
        )

        results["games"].append(game)
        results["total_moves"] += len(game.moves)
        results["illegal_moves"] += game.illegal_move_count

        if game.result == "1/2-1/2":
            results["draws"] += 1
        elif (game.result == "1-0" and model_color == "white") or (game.result == "0-1" and model_color == "black"):
            results["wins"] += 1
        else:
            results["losses"] += 1

        if verbose and (i + 1) % 10 == 0:
            print(
                f" Games: {i + 1}/{n_games} | "
                f"W: {results['wins']} L: {results['losses']} D: {results['draws']}"
            )

    total = results["wins"] + results["losses"] + results["draws"]
    results["win_rate"] = results["wins"] / total if total > 0 else 0
    results["draw_rate"] = results["draws"] / total if total > 0 else 0
    results["loss_rate"] = results["losses"] / total if total > 0 else 0

    total_attempts = results["total_moves"] + results["illegal_moves"]
    results["avg_game_length"] = total_attempts / total if total > 0 else 0
    results["illegal_move_rate"] = results["illegal_moves"] / total_attempts if total_attempts > 0 else 0

    # --- ELO estimate ----------------------------------------------------
    # BUGFIX: the previous formula, -400 * (1 - 2*r) / (1 if r > 0.5 else -1),
    # produced a *positive* rating difference for losing records (r < 0.5).
    # Use the standard logistic Elo model instead:
    #     expected score r = 1 / (1 + 10^(-d/400))  =>  d = 400*log10(r/(1-r))
    stockfish_elo = 1350  # assumed opponent rating at this skill level
    if results["win_rate"] > 0 or results["loss_rate"] > 0:
        score = results["wins"] + 0.5 * results["draws"]
        if score > 0:
            win_ratio = score / total
            if 0 < win_ratio < 1:
                elo_diff = 400 * math.log10(win_ratio / (1 - win_ratio))
                # Clamp: tiny samples make extreme ratios meaningless.
                elo_diff = max(-400.0, min(400.0, elo_diff))
                results["estimated_elo"] = stockfish_elo + elo_diff
            else:
                results["estimated_elo"] = stockfish_elo + (400 if win_ratio >= 1 else -400)
        else:
            results["estimated_elo"] = stockfish_elo - 400
    else:
        results["estimated_elo"] = None

    return results
560
+
561
+
562
def load_model_from_hub(model_id: str, device: str = "auto", verbose: bool = True):
    """Load a chess model + tokenizer from the Hugging Face Hub.

    Tries AutoTokenizer with trust_remote_code first; falls back to the
    bundled ChessTokenizer class (vocab still fetched from the Hub).

    Args:
        model_id: Hub repo id, e.g. "user/chess-model".
        device: Passed through as ``device_map`` to ``from_pretrained``.
        verbose: Print diagnostics about which tokenizer path was used.

    Returns:
        ``(model, tokenizer)`` tuple.
    """
    from transformers import AutoModelForCausalLM, AutoTokenizer

    # Import to register custom classes
    from src.model import ChessConfig, ChessForCausalLM
    from src.tokenizer import ChessTokenizer

    tokenizer_source = None
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
        tokenizer_source = "AutoTokenizer (from Hub with trust_remote_code=True)"
    except Exception as e:
        # Broad catch is deliberate: any tokenizer failure falls back to
        # the local implementation rather than aborting the evaluation.
        if verbose:
            print(f" AutoTokenizer failed: {e}")
        tokenizer = ChessTokenizer.from_pretrained(model_id)
        tokenizer_source = "ChessTokenizer (local class, vocab from Hub)"

    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        trust_remote_code=True,
        device_map=device,
    )

    if verbose:
        print(f" Tokenizer loaded via: {tokenizer_source}")
        print(f" Tokenizer class: {type(tokenizer).__name__}")
        print(f" Tokenizer vocab size: {tokenizer.vocab_size}")
        if hasattr(tokenizer, "_vocab"):
            print(f" Tokenizer has _vocab attribute: yes ({len(tokenizer._vocab)} entries)")

    return model, tokenizer
593
+
594
+
595
def main():
    """CLI entry point: load a model, then run legal-move and/or win-rate evals."""
    parser = argparse.ArgumentParser(description="Evaluate a chess model")

    parser.add_argument("--model_path", type=str, required=True, help="Path to the model or Hugging Face model ID")
    parser.add_argument("--mode", type=str, default="legal", choices=["legal", "winrate", "both"])
    parser.add_argument("--stockfish_path", type=str, default=None, help="Path to Stockfish executable")
    parser.add_argument("--stockfish_level", type=int, default=1, help="Stockfish skill level (0-20)")
    parser.add_argument("--n_positions", type=int, default=500, help="Number of positions for legal move evaluation")
    parser.add_argument("--seed", type=int, default=42, help="Random seed for reproducibility")
    parser.add_argument("--n_games", type=int, default=100, help="Number of games to play for win rate evaluation")
    parser.add_argument("--temperature", type=float, default=0.7, help="Sampling temperature")

    args = parser.parse_args()

    print("=" * 60)
    print("CHESS CHALLENGE - EVALUATION")
    print("=" * 60)

    print(f"\nLoading model from: {args.model_path}")

    import os
    # An existing filesystem path takes priority over a Hub repo id.
    is_local_path = os.path.exists(args.model_path)

    if is_local_path:
        # Local path
        from transformers import AutoModelForCausalLM
        from src.tokenizer import ChessTokenizer
        from src.model import ChessConfig, ChessForCausalLM

        tokenizer = ChessTokenizer.from_pretrained(args.model_path)

        # IMPORTANT FIX:
        # Our custom ChessForCausalLM does NOT support device_map="auto" unless _no_split_modules is defined.
        # So we load normally and move to device explicitly.
        device = "cuda" if torch.cuda.is_available() else "cpu"

        model = AutoModelForCausalLM.from_pretrained(
            args.model_path,
            trust_remote_code=True,
        )
        model.to(device)
        model.eval()
    else:
        # A path-looking argument that does not exist gets a clear error
        # instead of being mistaken for a Hub repo id.
        if args.model_path.startswith(".") or args.model_path.startswith("/"):
            raise FileNotFoundError(
                f"Local model path not found: {args.model_path}\n"
                f"Please check that the path exists and contains model files."
            )
        model, tokenizer = load_model_from_hub(args.model_path)

    print(f"\nSetting up evaluator...")
    evaluator = ChessEvaluator(
        model=model,
        tokenizer=tokenizer,
        stockfish_path=args.stockfish_path,
        stockfish_level=args.stockfish_level,
    )

    if args.mode in ["legal", "both"]:
        print(f"\n" + "=" * 60)
        print("PHASE 1: LEGAL MOVE EVALUATION")
        print("=" * 60)
        print(f"Testing {args.n_positions} random positions...")

        legal_results = evaluator.evaluate_legal_moves(
            n_positions=args.n_positions,
            temperature=args.temperature,
            verbose=True,
            seed=args.seed,
        )

        print("\n" + "-" * 40)
        print("LEGAL MOVE RESULTS")
        print("-" * 40)
        print(f" Positions tested: {legal_results['total_positions']}")
        print(f" Legal (1st try): {legal_results['legal_first_try']} ({legal_results['legal_rate_first_try']:.1%})")
        print(
            f" Legal (with retry): {legal_results['legal_first_try'] + legal_results['legal_with_retry']}"
            f" ({legal_results['legal_rate_with_retry']:.1%})"
        )
        print(f" Always illegal: {legal_results['illegal_all_retries']} ({legal_results['illegal_rate']:.1%})")

    if args.mode in ["winrate", "both"]:
        print(f"\n" + "=" * 60)
        print("PHASE 2: WIN RATE EVALUATION")
        print("=" * 60)
        print(f"Playing {args.n_games} games against Stockfish (Level {args.stockfish_level})...")

        winrate_results = evaluator.evaluate(
            n_games=args.n_games,
            temperature=args.temperature,
            verbose=True,
        )

        print("\n" + "-" * 40)
        print("WIN RATE RESULTS")
        print("-" * 40)
        print(f" Wins: {winrate_results['wins']}")
        print(f" Losses: {winrate_results['losses']}")
        print(f" Draws: {winrate_results['draws']}")
        print(f"\n Win Rate: {winrate_results['win_rate']:.1%}")
        print(f" Draw Rate: {winrate_results['draw_rate']:.1%}")
        print(f" Loss Rate: {winrate_results['loss_rate']:.1%}")
        print(f"\n Avg Game Length: {winrate_results['avg_game_length']:.1f} moves")
        print(f" Illegal Move Rate: {winrate_results['illegal_move_rate']:.2%}")

        if winrate_results.get("estimated_elo", None):
            print(f"\n Estimated ELO: {winrate_results['estimated_elo']:.0f}")

    print("\n" + "=" * 60)
    print("EVALUATION COMPLETE")
    print("=" * 60)


if __name__ == "__main__":
    main()
src/.ipynb_checkpoints/model-checkpoint.py ADDED
@@ -0,0 +1,446 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Chess Transformer Model for the Chess Challenge.
3
+
4
+ Modern small-LLM upgrades:
5
+ - RoPE (rotary positional embeddings): no learned positional embeddings needed
6
+ - RMSNorm (optional, default True)
7
+ - SwiGLU MLP (optional, default True)
8
+ - Weight tying (default True)
9
+ - Safe loss ignore_index = -100 (HF convention)
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import math
15
+ from typing import Optional, Tuple, Union
16
+
17
+ import torch
18
+ import torch.nn as nn
19
+ import torch.nn.functional as F
20
+ from transformers import PretrainedConfig, PreTrainedModel
21
+ from transformers.modeling_outputs import CausalLMOutputWithPast
22
+
23
+
24
class ChessConfig(PretrainedConfig):
    """Configuration for the chess transformer.

    Validates that n_embd is divisible by n_head and, when RoPE is enabled,
    that the resulting head dimension is even. Mirrors ``tie_weights`` into
    the HF-standard ``tie_word_embeddings`` field.
    """

    model_type = "chess_transformer"

    def __init__(
        self,
        vocab_size: int = 1200,

        # Architecture (defaults tuned to be < 1M params for common vocabs)
        n_embd: int = 112,
        n_layer: int = 7,
        n_head: int = 7,

        # Context window
        n_ctx: int = 512,

        # MLP hidden size:
        # - if mlp_type="swiglu", this is SwiGLU hidden size h
        # - if mlp_type="gelu", this is FFN inner size
        n_inner: Optional[int] = 192,

        dropout: float = 0.05,
        layer_norm_epsilon: float = 1e-6,

        # Position encoding
        use_rope: bool = True,
        rope_theta: float = 10000.0,

        # Normalization / MLP type
        use_rmsnorm: bool = True,
        mlp_type: str = "swiglu",  # "swiglu" or "gelu"

        # Weight tying
        tie_weights: bool = True,

        pad_token_id: int = 0,
        bos_token_id: int = 1,
        eos_token_id: int = 2,
        **kwargs,
    ):
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            **kwargs,
        )

        if n_embd % n_head != 0:
            raise ValueError(f"n_embd ({n_embd}) must be divisible by n_head ({n_head})")

        head_dim = n_embd // n_head
        # RoPE rotates pairs of channels, so head_dim must be even.
        if use_rope and (head_dim % 2 != 0):
            raise ValueError(
                f"RoPE requires even head_dim, got head_dim={head_dim}. "
                f"Choose n_embd/n_head even."
            )

        self.vocab_size = vocab_size
        self.n_embd = n_embd
        self.n_layer = n_layer
        self.n_head = n_head
        self.n_ctx = n_ctx
        # Fall back to a 2x expansion when no inner size is given.
        self.n_inner = n_inner if n_inner is not None else (2 * n_embd)
        self.dropout = dropout
        self.layer_norm_epsilon = layer_norm_epsilon

        self.use_rope = use_rope
        self.rope_theta = rope_theta

        self.use_rmsnorm = use_rmsnorm
        self.mlp_type = mlp_type

        self.tie_weights = tie_weights
        # HF uses this field for embedding tying behavior
        self.tie_word_embeddings = bool(tie_weights)
98
+
99
+
100
class RMSNorm(nn.Module):
    """Root-mean-square layer norm: no mean-centering, no bias, learned gain."""

    def __init__(self, dim: int, eps: float = 1e-6):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Normalize by the RMS over the last dimension, then scale.
        mean_sq = x.pow(2).mean(dim=-1, keepdim=True)
        return x * torch.rsqrt(mean_sq + self.eps) * self.weight
109
+
110
+
111
def rotate_half(x: torch.Tensor) -> torch.Tensor:
    """Interleaved RoPE rotation: (x0, x1, x2, x3, ...) -> (-x1, x0, -x3, x2, ...)."""
    evens = x[..., 0::2]
    odds = x[..., 1::2]
    # Pair (-odd, even) along a fresh axis, then flatten back.
    return torch.stack((-odds, evens), dim=-1).flatten(-2)
118
+
119
+
120
class RotaryEmbedding(nn.Module):
    """Rotary positional embedding (RoPE) applied to q/k of shape (B, H, T, D).

    Caches cos/sin tables and rebuilds them only when the sequence grows or
    the device/dtype changes.
    """

    def __init__(self, head_dim: int, theta: float = 10000.0):
        super().__init__()
        if head_dim % 2 != 0:
            raise ValueError(f"RoPE requires even head_dim, got {head_dim}")

        inv_freq = 1.0 / (theta ** (torch.arange(0, head_dim, 2).float() / head_dim))
        self.register_buffer("inv_freq", inv_freq, persistent=False)

        self._cos_cached = None
        self._sin_cached = None
        self._seq_len_cached = 0
        self._device_cached = None
        self._dtype_cached = None

    def _build_cache(self, seq_len: int, device: torch.device, dtype: torch.dtype):
        # Angle table: outer product of positions and inverse frequencies.
        positions = torch.arange(seq_len, device=device, dtype=self.inv_freq.dtype)
        angles = torch.einsum("i,j->ij", positions, self.inv_freq)  # (T, D/2)

        self._cos_cached = angles.cos().to(dtype=dtype)
        self._sin_cached = angles.sin().to(dtype=dtype)
        self._seq_len_cached = seq_len
        self._device_cached = device
        self._dtype_cached = dtype

    @staticmethod
    def _rotate(t: torch.Tensor) -> torch.Tensor:
        # Same transform as the module-level rotate_half, inlined here:
        # (t0, t1, t2, t3, ...) -> (-t1, t0, -t3, t2, ...)
        evens = t[..., 0::2]
        odds = t[..., 1::2]
        rotated = torch.empty_like(t)
        rotated[..., 0::2] = -odds
        rotated[..., 1::2] = evens
        return rotated

    def forward(self, q: torch.Tensor, k: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        # q, k: (B, H, T, D)
        seq_len, device, dtype = q.size(-2), q.device, q.dtype

        stale = (
            self._cos_cached is None
            or seq_len > self._seq_len_cached
            or device != self._device_cached
            or dtype != self._dtype_cached
        )
        if stale:
            self._build_cache(seq_len, device, dtype)

        # Broadcast to (1, 1, T, D): each (T, D/2) entry duplicated per pair.
        cos = self._cos_cached[:seq_len].unsqueeze(0).unsqueeze(0).repeat_interleave(2, dim=-1)
        sin = self._sin_cached[:seq_len].unsqueeze(0).unsqueeze(0).repeat_interleave(2, dim=-1)

        q_rot = q * cos + self._rotate(q) * sin
        k_rot = k * cos + self._rotate(k) * sin
        return q_rot, k_rot
176
+
177
+
178
class MultiHeadAttention(nn.Module):
    """Causal multi-head self-attention with optional RoPE.

    Q/K/V come from a single fused projection; scores are masked causally
    and, when provided, by a padding mask (1 = keep, 0 = mask).
    """

    def __init__(self, config: ChessConfig):
        super().__init__()

        self.n_head = config.n_head
        self.n_embd = config.n_embd
        self.head_dim = config.n_embd // config.n_head

        # Fused QKV projection (GPT-2 naming convention).
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd)
        self.c_proj = nn.Linear(config.n_embd, config.n_embd)
        self.dropout = nn.Dropout(config.dropout)

        self.use_rope = bool(config.use_rope)
        self.rope = RotaryEmbedding(self.head_dim, theta=config.rope_theta) if self.use_rope else None

        # causal mask buffer (expandable)
        self.register_buffer(
            "bias",
            torch.tril(torch.ones(config.n_ctx, config.n_ctx)).view(1, 1, config.n_ctx, config.n_ctx),
            persistent=False,
        )

    def _ensure_causal_mask(self, seq_len: int, device: torch.device, dtype: torch.dtype):
        # Rebuild the cached lower-triangular mask only when it is too
        # small for seq_len or lives on the wrong device.
        if self.bias.size(-1) >= seq_len and self.bias.device == device:
            return
        self.bias = torch.tril(torch.ones(seq_len, seq_len, device=device, dtype=dtype)).view(1, 1, seq_len, seq_len)

    def forward(self, x: torch.Tensor, attention_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
        """Apply causal self-attention to x of shape (B, T, n_embd)."""
        B, T, _ = x.size()

        qkv = self.c_attn(x)
        q, k, v = qkv.split(self.n_embd, dim=2)

        q = q.view(B, T, self.n_head, self.head_dim).transpose(1, 2)  # (B,H,T,D)
        k = k.view(B, T, self.n_head, self.head_dim).transpose(1, 2)
        v = v.view(B, T, self.n_head, self.head_dim).transpose(1, 2)

        if self.use_rope:
            q, k = self.rope(q, k)

        # Scaled dot-product scores: (B,H,T,T).
        attn = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.head_dim)

        self._ensure_causal_mask(T, attn.device, attn.dtype)
        causal_mask = self.bias[:, :, :T, :T]
        # Use the dtype's min instead of -inf so fp16 stays finite.
        mask_value = torch.finfo(attn.dtype).min
        attn = attn.masked_fill(causal_mask == 0, mask_value)

        # padding mask (1=keep, 0=mask)
        if attention_mask is not None:
            am = attention_mask.unsqueeze(1).unsqueeze(2)  # (B,1,1,T)
            attn = attn.masked_fill(am == 0, mask_value)

        attn = F.softmax(attn, dim=-1)
        attn = self.dropout(attn)

        y = torch.matmul(attn, v)  # (B,H,T,D)
        y = y.transpose(1, 2).contiguous().view(B, T, self.n_embd)

        y = self.c_proj(y)
        y = self.dropout(y)
        return y
239
+
240
+
241
class SwiGLU(nn.Module):
    """SwiGLU feed-forward: silu(W1 x) * (W2 x) projected by W3, with dropout.

    W1 and W2 are fused into a single linear layer (``w12``) whose output is
    split in half.
    """

    def __init__(self, config: ChessConfig):
        super().__init__()
        hidden = config.n_inner
        self.w12 = nn.Linear(config.n_embd, 2 * hidden)
        self.w3 = nn.Linear(hidden, config.n_embd)
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        gate, value = self.w12(x).chunk(2, dim=-1)
        return self.dropout(self.w3(F.silu(gate) * value))
256
+
257
+
258
class FeedForwardGELU(nn.Module):
    """Classic transformer MLP: linear -> GELU -> linear, with dropout."""

    def __init__(self, config: ChessConfig):
        super().__init__()
        self.c_fc = nn.Linear(config.n_embd, config.n_inner)
        self.c_proj = nn.Linear(config.n_inner, config.n_embd)
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        hidden = F.gelu(self.c_fc(x))
        return self.dropout(self.c_proj(hidden))
271
+
272
+
273
class TransformerBlock(nn.Module):
    """Pre-norm transformer block: x + attn(norm(x)), then x + mlp(norm(x))."""

    def __init__(self, config: ChessConfig):
        super().__init__()

        # Both norms share the same class; RMSNorm and nn.LayerNorm take
        # the same (dim, eps) constructor arguments.
        norm_cls = RMSNorm if config.use_rmsnorm else nn.LayerNorm
        self.ln_1 = norm_cls(config.n_embd, eps=config.layer_norm_epsilon)
        self.ln_2 = norm_cls(config.n_embd, eps=config.layer_norm_epsilon)

        self.attn = MultiHeadAttention(config)

        if config.mlp_type.lower() == "swiglu":
            self.mlp = SwiGLU(config)
        else:
            self.mlp = FeedForwardGELU(config)

    def forward(self, x: torch.Tensor, attention_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
        x = x + self.attn(self.ln_1(x), attention_mask=attention_mask)
        return x + self.mlp(self.ln_2(x))
295
+
296
+
297
class ChessForCausalLM(PreTrainedModel):
    """Decoder-only causal LM over chess-move tokens.

    GPT-style stack: token embeddings (plus learned positions only when
    RoPE is disabled), ``n_layer`` pre-norm TransformerBlocks, a final
    norm, and an optionally weight-tied LM head.
    """

    config_class = ChessConfig
    base_model_prefix = "transformer"
    supports_gradient_checkpointing = True
    # lm_head.weight may be absent from checkpoints when weights are tied.
    keys_to_ignore_on_load_missing = ["lm_head.weight"]
    _no_split_modules = ["TransformerBlock"]


    def __init__(self, config: ChessConfig):
        super().__init__(config)

        self.wte = nn.Embedding(config.vocab_size, config.n_embd)

        # learned positional embeddings only if RoPE disabled
        self.wpe = None
        if not config.use_rope:
            self.wpe = nn.Embedding(config.n_ctx, config.n_embd)

        self.drop = nn.Dropout(config.dropout)
        self.h = nn.ModuleList([TransformerBlock(config) for _ in range(config.n_layer)])

        if config.use_rmsnorm:
            self.ln_f = RMSNorm(config.n_embd, eps=config.layer_norm_epsilon)
        else:
            self.ln_f = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)

        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

        if config.tie_weights:
            # Tell HF serialization which tensors are tied duplicates.
            self._tied_weights_keys = ["lm_head.weight"]

        self.post_init()

        if config.tie_weights:
            self.tie_weights()

    def get_input_embeddings(self) -> nn.Module:
        return self.wte

    def set_input_embeddings(self, new_embeddings: nn.Module):
        self.wte = new_embeddings
        # Re-tie so the LM head tracks the replaced embedding matrix.
        if getattr(self.config, "tie_weights", False):
            self.tie_weights()

    def get_output_embeddings(self) -> nn.Module:
        return self.lm_head

    def set_output_embeddings(self, new_embeddings: nn.Module):
        self.lm_head = new_embeddings

    def tie_weights(self):
        # Share (or clone, under torchscript) the embedding weights into
        # the LM head when either tying flag is set.
        if getattr(self.config, "tie_weights", False) or getattr(self.config, "tie_word_embeddings", False):
            self._tie_or_clone_weights(self.lm_head, self.wte)

    def _init_weights(self, module: nn.Module):
        # GPT-2-style init: N(0, 0.02) weights, zero biases.
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(
        self,
        input_ids: torch.LongTensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        return_dict: Optional[bool] = None,
        **kwargs,
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        """Run the decoder stack; computes shifted cross-entropy when labels are given."""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        B, T = input_ids.size()
        device = input_ids.device

        x = self.wte(input_ids)

        # Learned positions are only present when RoPE is disabled.
        if self.wpe is not None:
            if position_ids is None:
                position_ids = torch.arange(T, device=device).unsqueeze(0).expand(B, -1)
            x = x + self.wpe(position_ids)

        x = self.drop(x)

        for block in self.h:
            x = block(x, attention_mask=attention_mask)

        x = self.ln_f(x)
        logits = self.lm_head(x)

        loss = None
        if labels is not None:
            # Next-token prediction: logits at t predict labels at t+1;
            # -100 labels are ignored (HF convention).
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            loss_fct = nn.CrossEntropyLoss(ignore_index=-100)
            loss = loss_fct(
                shift_logits.view(-1, shift_logits.size(-1)),
                shift_labels.view(-1),
            )

        if not return_dict:
            output = (logits,)
            return ((loss,) + output) if loss is not None else output

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=None,
            hidden_states=None,
            attentions=None,
        )

    @torch.no_grad()
    def generate_move(
        self,
        input_ids: torch.LongTensor,
        temperature: float = 0.7,
        top_k: Optional[int] = 50,
        top_p: Optional[float] = None,
    ) -> int:
        """Sample one next-token id with temperature / top-k / top-p filtering.

        Returns:
            The sampled token id as a plain int.
        """
        self.eval()

        outputs = self(input_ids)
        # Clamp temperature away from zero to avoid division by zero.
        logits = outputs.logits[:, -1, :] / max(float(temperature), 1e-6)

        if top_k is not None and top_k > 0:
            # Drop everything below the k-th largest logit.
            k = min(int(top_k), logits.size(-1))
            thresh = torch.topk(logits, k)[0][..., -1, None]
            logits = logits.masked_fill(logits < thresh, torch.finfo(logits.dtype).min)

        if top_p is not None:
            # Nucleus sampling: keep the smallest prefix whose cumulative
            # probability exceeds top_p.
            sorted_logits, sorted_indices = torch.sort(logits, descending=True)
            probs = F.softmax(sorted_logits, dim=-1)
            cum = torch.cumsum(probs, dim=-1)
            to_remove = cum > float(top_p)
            # Shift right so the first token crossing the threshold survives.
            to_remove[..., 1:] = to_remove[..., :-1].clone()
            to_remove[..., 0] = 0
            indices_to_remove = to_remove.scatter(dim=-1, index=sorted_indices, src=to_remove)
            logits = logits.masked_fill(indices_to_remove, torch.finfo(logits.dtype).min)

        probs = F.softmax(logits, dim=-1)
        next_token = torch.multinomial(probs, num_samples=1)
        return int(next_token.item())
440
+
441
+
442
# Register the custom config/model with the HF Auto* factories so that
# AutoModelForCausalLM.from_pretrained(..., trust_remote_code=True)
# resolves "chess_transformer" checkpoints to these classes.
from transformers import AutoConfig, AutoModelForCausalLM

AutoConfig.register("chess_transformer", ChessConfig)
AutoModelForCausalLM.register(ChessConfig, ChessForCausalLM)
src/.ipynb_checkpoints/tokenizer-checkpoint.py ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Decomposed Chess Tokenizer for the Chess Challenge.
3
+
4
+ Each move becomes 3 or 4 tokens:
5
+ WP e2_f e4_t
6
+ BN g8_f f6_t
7
+ Promotion adds an extra token:
8
+ WP e7_f e8_t =q
9
+
10
+ Why this helps:
11
+ - Fixed small vocab (~150 tokens)
12
+ - Near-zero OOV / UNK, so the evaluator can always parse squares
13
+ - Compatible with the provided evaluate.py (it auto-detects 'decomposed')
14
+
15
+ Special tokens behavior:
16
+ - Adds BOS only (NO EOS)
17
+ - If BOS already present, does not add it twice
18
+ """
19
+
20
+ from __future__ import annotations
21
+
22
+ import json
23
+ import os
24
+ from typing import Dict, List, Optional
25
+
26
+ from transformers import PreTrainedTokenizer
27
+
28
+
29
class ChessTokenizer(PreTrainedTokenizer):
    """Fixed-vocabulary tokenizer that decomposes extended-UCI moves.

    Each move token like ``WPe2e4`` is split into piece, from-square and
    to-square tokens (``WP e2_f e4_t``), plus an optional promotion token
    (``=q`` etc.).  The vocabulary is fully enumerable (~150 tokens), so
    generation can never produce an out-of-vocabulary square.

    Special-token behavior: only BOS is auto-added (never EOS), and BOS is
    not duplicated if the sequence already starts with it.
    """

    model_input_names = ["input_ids", "attention_mask"]
    vocab_files_names = {"vocab_file": "vocab.json"}

    PAD_TOKEN = "[PAD]"
    BOS_TOKEN = "[BOS]"
    EOS_TOKEN = "[EOS]"  # kept for compatibility, not auto-added
    UNK_TOKEN = "[UNK]"

    def __init__(
        self,
        vocab_file: Optional[str] = None,
        vocab: Optional[Dict[str, int]] = None,
        **kwargs,
    ):
        """Build the tokenizer from an explicit vocab dict, a vocab.json
        file, or (by default) the fixed generated vocabulary.

        The vocab must exist BEFORE super().__init__ runs, because recent
        transformers versions call get_vocab() during base-class init.
        """
        self._pad_token = self.PAD_TOKEN
        self._bos_token = self.BOS_TOKEN
        self._eos_token = self.EOS_TOKEN
        self._unk_token = self.UNK_TOKEN

        # avoid duplicates from kwargs (we always pass our own below)
        kwargs.pop("pad_token", None)
        kwargs.pop("bos_token", None)
        kwargs.pop("eos_token", None)
        kwargs.pop("unk_token", None)

        if vocab is not None:
            self._vocab = vocab
        elif vocab_file is not None and os.path.exists(vocab_file):
            with open(vocab_file, "r", encoding="utf-8") as f:
                self._vocab = json.load(f)
        else:
            self._vocab = self._build_fixed_vocab()

        # Reverse mapping for id -> token lookups.
        self._ids_to_tokens = {v: k for k, v in self._vocab.items()}

        super().__init__(
            pad_token=self._pad_token,
            bos_token=self._bos_token,
            eos_token=self._eos_token,
            unk_token=self._unk_token,
            **kwargs,
        )

    # --------------------------
    # Fixed vocab: pieces + squares + promos
    # --------------------------
    @staticmethod
    def _all_squares() -> List[str]:
        """Return the 64 board squares, rank-major: a1..h1, a2..h2, ... h8."""
        files = "abcdefgh"
        ranks = "12345678"
        return [f + r for r in ranks for f in files]  # a1..h8

    def _build_fixed_vocab(self) -> Dict[str, int]:
        """Enumerate the full token set and assign sequential ids.

        Order: specials, 12 piece tokens, 64 from-squares, 64 to-squares,
        4 promotion tokens (148 tokens total).
        """
        special = [self.PAD_TOKEN, self.BOS_TOKEN, self.EOS_TOKEN, self.UNK_TOKEN]

        # piece tokens: WP..WK, BP..BK
        piece_tokens = [f"{c}{p}" for c in "WB" for p in "PNBRQK"]

        squares = self._all_squares()
        from_tokens = [f"{sq}_f" for sq in squares]
        to_tokens = [f"{sq}_t" for sq in squares]

        promo_tokens = ["=q", "=r", "=b", "=n"]

        tokens = special + piece_tokens + from_tokens + to_tokens + promo_tokens
        return {tok: i for i, tok in enumerate(tokens)}

    # --------------------------
    # Special tokens handling (robust with evaluate.py)
    # --------------------------
    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """Prepend BOS (never EOS); skip if BOS is already first."""
        # BOS only, NO EOS
        if token_ids_1 is not None:
            token_ids_0 = token_ids_0 + token_ids_1

        if token_ids_0 and token_ids_0[0] == self.bos_token_id:
            return token_ids_0
        return [self.bos_token_id] + token_ids_0

    def get_special_tokens_mask(
        self,
        token_ids_0: List[int],
        token_ids_1: Optional[List[int]] = None,
        already_has_special_tokens: bool = False,
    ) -> List[int]:
        """Return 1 for special-token positions, 0 for regular tokens.

        The non-`already_has_special_tokens` branches account for the single
        BOS that build_inputs_with_special_tokens prepends.
        """
        if already_has_special_tokens:
            specials = {self.pad_token_id, self.bos_token_id, self.eos_token_id, self.unk_token_id}
            return [1 if t in specials else 0 for t in token_ids_0]

        if token_ids_1 is None:
            return [1] + [0] * len(token_ids_0)
        return [1] + [0] * (len(token_ids_0) + len(token_ids_1))

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """Single-segment model: all type ids are 0 (the +1 is for BOS)."""
        if token_ids_1 is None:
            return [0] * (len(token_ids_0) + 1)
        return [0] * (len(token_ids_0) + len(token_ids_1) + 1)

    # --------------------------
    # Tokenization
    # --------------------------
    def _tokenize(self, text: str) -> List[str]:
        """Split whitespace-separated moves into decomposed tokens.

        Accepts literal special tokens, already-decomposed tokens, and
        extended-UCI moves (e.g. "WPe2e4", "BNg8f6(x)", "WPe7e8=Q(+)").
        Unparseable items become [UNK].
        """
        if not text or not text.strip():
            return []

        parts = text.strip().split()
        out: List[str] = []

        for tok in parts:
            # allow literal special tokens present in text
            if tok in {self.PAD_TOKEN, self.BOS_TOKEN, self.EOS_TOKEN, self.UNK_TOKEN}:
                out.append(tok)
                continue

            # already decomposed tokens
            if (len(tok) == 2 and tok[0] in "WB" and tok[1] in "PNBRQK") or tok.endswith("_f") or tok.endswith("_t") or tok in {"=q", "=r", "=b", "=n"}:
                out.append(tok)
                continue

            # parse extended UCI (dataset): WPe2e4, BNg8f6(x), WPe7e8=Q(+), ...
            # anything shorter than [color][piece][from][to] is malformed
            if len(tok) < 6:
                out.append(self.UNK_TOKEN)
                continue

            color = tok[0]
            piece = tok[1]
            from_sq = tok[2:4]
            to_sq = tok[4:6]

            out.append(f"{color}{piece}")
            out.append(f"{from_sq}_f")
            out.append(f"{to_sq}_t")

            # promotion like "=Q"
            if "=" in tok:
                try:
                    promo_part = tok.split("=", 1)[1]
                    promo_letter = promo_part[0].lower()
                    promo_tok = f"={promo_letter}"
                    if promo_tok in self._vocab:
                        out.append(promo_tok)
                except Exception:
                    # malformed promotion suffix: emit just the 3 base tokens
                    pass

        return out

    def _convert_token_to_id(self, token: str) -> int:
        """Map a token string to its id, falling back to [UNK]."""
        return self._vocab.get(token, self._vocab[self.UNK_TOKEN])

    def _convert_id_to_token(self, index: int) -> str:
        """Map an id back to its token string, falling back to [UNK]."""
        return self._ids_to_tokens.get(index, self.UNK_TOKEN)

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        """Join tokens with single spaces (no de-decomposition)."""
        return " ".join(tokens)

    # --------------------------
    # Vocab I/O
    # --------------------------
    @property
    def vocab_size(self) -> int:
        """Number of entries in the (fixed) vocabulary."""
        return len(self._vocab)

    def get_vocab(self) -> Dict[str, int]:
        """Return a copy of the token -> id mapping."""
        return dict(self._vocab)

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple:
        """Write vocab.json to `save_directory`; returns the written path(s)."""
        os.makedirs(save_directory, exist_ok=True)
        vocab_file = os.path.join(
            save_directory,
            (filename_prefix + "-" if filename_prefix else "") + "vocab.json",
        )
        with open(vocab_file, "w", encoding="utf-8") as f:
            json.dump(self._vocab, f, ensure_ascii=False, indent=2)
        return (vocab_file,)
src/.ipynb_checkpoints/train-checkpoint.py ADDED
@@ -0,0 +1,268 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Training script for the Chess Challenge.
3
+
4
+ GPU-optimized version (still compatible with older transformers/accelerate):
5
+ - Uses fp16/bf16 automatically on GPU
6
+ - Uses evaluation + saving per EPOCH by default (much faster than steps)
7
+ - Enables dataloader_num_workers + pin_memory on GPU
8
+ - Optional torch.compile for speed (safe-guarded)
9
+ - Keeps your robust TrainingArguments compatibility (evaluation_strategy vs eval_strategy)
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import argparse
15
+ import os
16
+ import warnings
17
+ from pathlib import Path
18
+
19
+ warnings.filterwarnings("ignore", message="'return' in a 'finally' block")
20
+
21
+ import torch
22
+ from transformers import Trainer, TrainingArguments, set_seed
23
+
24
+ from src.data import ChessDataCollator, create_train_val_datasets
25
+ from src.model import ChessConfig, ChessForCausalLM
26
+ from src.tokenizer import ChessTokenizer
27
+ from src.utils import count_parameters, print_parameter_budget
28
+
29
+
30
def parse_args():
    """Parse command-line arguments for training.

    Groups: model architecture, data source, optimizer/training schedule,
    logging/checkpointing, and GPU speed knobs.  Returns the argparse
    Namespace consumed by main().
    """
    p = argparse.ArgumentParser(description="Train a chess-playing language model")

    # ---------------- Model ----------------
    p.add_argument("--n_embd", type=int, default=128, help="Embedding dimension")
    p.add_argument("--n_layer", type=int, default=6, help="Number of transformer layers")
    p.add_argument("--n_head", type=int, default=8, help="Number of attention heads")
    # For speed on GPU, 256 is often a great default; override via CLI if needed.
    p.add_argument("--n_ctx", type=int, default=256, help="Maximum context length")

    p.add_argument("--n_inner", type=int, default=248, help="MLP hidden size (SwiGLU: h)")
    p.add_argument("--dropout", type=float, default=0.05, help="Dropout probability")
    p.add_argument("--no_tie_weights", action="store_true", help="Disable weight tying")

    # improved model.py flags (all default OFF; pass explicitly to enable)
    p.add_argument("--use_rope", action="store_true", help="Use RoPE (recommended)")
    p.add_argument("--mlp_type", type=str, default="swiglu", choices=["swiglu", "gelu"], help="MLP type")
    p.add_argument("--use_rmsnorm", action="store_true", help="Use RMSNorm (recommended)")

    # ---------------- Data ----------------
    p.add_argument("--dataset_name", type=str, default="dlouapre/lichess_2025-01_1M")
    p.add_argument("--max_train_samples", type=int, default=None, help="Optional cap for train samples")
    p.add_argument("--val_samples", type=int, default=5000)

    p.add_argument(
        "--tokenizer_dir",
        type=str,
        default="./tokenizer_cache",
        help="Where to save/load the tokenizer (vocab.json)",
    )

    # ---------------- Training ----------------
    p.add_argument("--output_dir", type=str, default="./output")
    p.add_argument("--num_train_epochs", type=int, default=3)

    # For speed: prefer larger batch and smaller accumulation.
    p.add_argument("--per_device_train_batch_size", type=int, default=64)
    p.add_argument("--per_device_eval_batch_size", type=int, default=128)
    p.add_argument("--gradient_accumulation_steps", type=int, default=1)

    p.add_argument("--learning_rate", type=float, default=3e-4)
    p.add_argument("--weight_decay", type=float, default=0.1)
    p.add_argument("--warmup_steps", type=int, default=300)

    p.add_argument("--seed", type=int, default=42)

    # ---------------- Logging / Save ----------------
    p.add_argument("--logging_steps", type=int, default=50)

    # Eval/save config: epoch by default (much faster). Still allow steps if user wants.
    p.add_argument("--eval_strategy", type=str, default="epoch", choices=["epoch", "steps"], help="Evaluation strategy")
    p.add_argument("--save_strategy", type=str, default="epoch", choices=["epoch", "steps"], help="Save strategy")
    p.add_argument("--eval_steps", type=int, default=1000, help="Only used if eval_strategy=steps")
    p.add_argument("--save_steps", type=int, default=1000, help="Only used if save_strategy=steps")

    # ---------------- Speed knobs ----------------
    p.add_argument("--dataloader_num_workers", type=int, default=2, help="CPU workers for dataloader")
    p.add_argument("--torch_compile", action="store_true", help="Enable torch.compile on GPU (can speed up)")

    return p.parse_args()
90
+
91
+
92
def load_or_create_tokenizer(args) -> ChessTokenizer:
    """Return a ChessTokenizer, reusing a cached vocab.json when available.

    Looks in ``args.tokenizer_dir`` first; if no vocab.json is found there,
    builds the fixed decomposed vocabulary and caches it for future runs.
    """
    directory = Path(args.tokenizer_dir)
    directory.mkdir(parents=True, exist_ok=True)

    cached_vocab = directory / "vocab.json"
    if cached_vocab.exists():
        print(f"Loading tokenizer from {directory} ...")
        return ChessTokenizer(vocab_file=str(cached_vocab))

    # No cache yet: build the fixed vocabulary and persist it.
    print("Creating fixed-vocab tokenizer (decomposed) ...")
    tokenizer = ChessTokenizer()
    tokenizer.save_pretrained(str(directory))
    print(f"Tokenizer saved to {directory} (vocab_size={tokenizer.vocab_size})")
    return tokenizer
106
+
107
+
108
def _make_training_args(args) -> TrainingArguments:
    """
    Compatibility layer for transformers versions:
    - some use evaluation_strategy, others use eval_strategy
    - we keep it robust while using faster defaults (epoch eval/save).

    Precision is auto-selected: bf16 on supporting GPUs, fp16 on other
    GPUs, full fp32 on CPU.
    """
    use_gpu = torch.cuda.is_available()
    use_bf16 = bool(use_gpu and torch.cuda.is_bf16_supported())
    use_fp16 = bool(use_gpu and not use_bf16)

    common = dict(
        output_dir=args.output_dir,
        num_train_epochs=args.num_train_epochs,

        per_device_train_batch_size=args.per_device_train_batch_size,
        per_device_eval_batch_size=args.per_device_eval_batch_size,
        gradient_accumulation_steps=args.gradient_accumulation_steps,

        learning_rate=args.learning_rate,
        weight_decay=args.weight_decay,
        warmup_steps=args.warmup_steps,
        lr_scheduler_type="cosine",

        max_grad_norm=1.0,

        logging_dir=os.path.join(args.output_dir, "logs"),
        logging_steps=args.logging_steps,

        save_total_limit=2,
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,

        seed=args.seed,
        report_to=["none"],

        # Mixed precision for GPU speed
        fp16=use_fp16,
        bf16=use_bf16,

        # DataLoader perf
        dataloader_num_workers=args.dataloader_num_workers,
        dataloader_pin_memory=use_gpu,

        # Important for custom batches
        remove_unused_columns=False,
    )

    # Build kwargs depending on epoch vs steps
    # (eval_steps/save_steps are only meaningful for the "steps" strategy)
    eval_kwargs = {}
    if args.eval_strategy == "steps":
        eval_kwargs["eval_steps"] = args.eval_steps
    save_kwargs = {}
    if args.save_strategy == "steps":
        save_kwargs["save_steps"] = args.save_steps

    # Try standard HF arg names first
    try:
        return TrainingArguments(
            **common,
            evaluation_strategy=args.eval_strategy,
            save_strategy=args.save_strategy,
            **eval_kwargs,
            **save_kwargs,
        )
    except TypeError:
        # Fallback for forks/older variants that renamed args
        # (newer transformers renamed evaluation_strategy -> eval_strategy)
        return TrainingArguments(
            **common,
            eval_strategy=args.eval_strategy,
            save_strategy=args.save_strategy,
            **eval_kwargs,
            **save_kwargs,
        )
182
+
183
+
184
def main():
    """Entry point: build tokenizer, model, datasets, and run training.

    Pipeline: parse CLI args -> seed -> tokenizer -> ChessConfig/model
    (with parameter-budget report) -> datasets + collator -> HF Trainer ->
    train -> save final model + tokenizer to <output_dir>/final_model.
    """
    args = parse_args()
    set_seed(args.seed)

    print("=" * 60)
    print("CHESS CHALLENGE - TRAINING")
    print("=" * 60)

    tokenizer = load_or_create_tokenizer(args)
    # Use the tokenizer's actual vocab size (may differ from any default).
    actual_vocab_size = tokenizer.vocab_size
    print(f" Vocab size used: {actual_vocab_size}")

    print("\nCreating model configuration...")
    config = ChessConfig(
        vocab_size=actual_vocab_size,
        n_embd=args.n_embd,
        n_layer=args.n_layer,
        n_head=args.n_head,
        n_ctx=args.n_ctx,
        n_inner=args.n_inner,
        dropout=args.dropout,
        tie_weights=not args.no_tie_weights,
        pad_token_id=tokenizer.pad_token_id,
        bos_token_id=tokenizer.bos_token_id,
        eos_token_id=tokenizer.eos_token_id,
        use_rope=bool(args.use_rope),
        mlp_type=args.mlp_type,
        use_rmsnorm=bool(args.use_rmsnorm),
    )

    print_parameter_budget(config)

    print("\nCreating model...")
    model = ChessForCausalLM(config)

    # Optional torch.compile (GPU only); failure is non-fatal.
    if args.torch_compile and torch.cuda.is_available():
        try:
            model = torch.compile(model)
            print("✓ torch.compile enabled")
        except Exception as e:
            print(f"WARNING: torch.compile failed ({e}). Continuing without it.")

    n_params = count_parameters(model)
    print(f" Total parameters: {n_params:,}")
    print("✓ Model is within 1M parameter limit" if n_params <= 1_000_000 else "WARNING: Model exceeds 1M!")

    print("\nLoading datasets...")
    train_dataset, val_dataset = create_train_val_datasets(
        tokenizer=tokenizer,
        dataset_name=args.dataset_name,
        max_length=args.n_ctx,
        train_samples=args.max_train_samples,
        val_samples=args.val_samples,
    )
    print(f" Training samples: {len(train_dataset):,}")
    print(f" Validation samples: {len(val_dataset):,}")

    data_collator = ChessDataCollator(tokenizer, max_length=args.n_ctx)

    training_args = _make_training_args(args)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        data_collator=data_collator,
        tokenizer=tokenizer,
    )

    print("\nStarting training...")
    trainer.train()

    out_final = os.path.join(args.output_dir, "final_model")
    print("\nSaving final model...")
    trainer.save_model(out_final)
    tokenizer.save_pretrained(out_final)

    print("\nTraining complete!")
    print(f" Model saved to: {out_final}")
266
+
267
# Script entry point (no side effects on import).
if __name__ == "__main__":
    main()
src/.ipynb_checkpoints/utils-checkpoint.py ADDED
@@ -0,0 +1,369 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Utility functions for the Chess Challenge.
3
+
4
+ This module provides helper functions for:
5
+ - Parameter counting and budget analysis (including RoPE / SwiGLU / RMSNorm variants)
6
+ - Move validation and conversion with python-chess
7
+ - Optional: compute legal-move rate over a whole game string
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import re
13
+ from typing import Dict, Optional, TYPE_CHECKING
14
+
15
+ import torch.nn as nn
16
+
17
+ if TYPE_CHECKING:
18
+ from src.model import ChessConfig
19
+
20
+
21
+ # =========================
22
+ # Parameter counting
23
+ # =========================
24
+
25
def count_parameters(model: nn.Module, trainable_only: bool = True) -> int:
    """
    Count the number of parameters in a model.

    Args:
        model: The PyTorch model.
        trainable_only: If True, only count trainable parameters.

    Returns:
        Total number of parameters.
    """
    params = model.parameters()
    if trainable_only:
        # Skip anything frozen via requires_grad=False.
        params = (p for p in params if p.requires_grad)
    return sum(p.numel() for p in params)
39
+
40
+
41
def count_parameters_by_component(model: nn.Module) -> Dict[str, int]:
    """
    Count parameters broken down by leaf modules.

    Args:
        model: The PyTorch model.

    Returns:
        Dictionary mapping leaf-module names to their (non-zero)
        parameter counts.
    """
    counts: Dict[str, int] = {}
    for name, module in model.named_modules():
        # Only leaf modules; containers are covered by their children.
        if next(iter(module.children()), None) is not None:
            continue
        n_params = sum(p.numel() for p in module.parameters(recurse=False))
        if n_params:
            counts[name] = n_params
    return counts
58
+
59
+
60
def estimate_parameters(config: "ChessConfig") -> Dict[str, int]:
    """
    Estimate the parameter count (weights + biases) for a configuration.

    Handles every architecture variant used in this repo:
    - learned position embeddings (wpe) vs RoPE (no positional params)
    - GELU FFN (d -> n_inner -> d) vs SwiGLU FFN (d -> 2h and h -> d, h = n_inner)
    - LayerNorm (weight + bias) vs RMSNorm (weight only)
    - tied vs untied LM head (untied adds V*d; lm_head has no bias)

    Returns a breakdown dict with per-component counts, a "total", and a
    "notes" sub-dict recording which variants were assumed.
    """
    vocab = int(config.vocab_size)
    dim = int(config.n_embd)
    depth = int(config.n_layer)
    ctx = int(config.n_ctx)
    inner = int(config.n_inner)

    rope = bool(getattr(config, "use_rope", False))
    rmsnorm = bool(getattr(config, "use_rmsnorm", False))
    mlp_kind = str(getattr(config, "mlp_type", "gelu")).lower()
    tied = bool(getattr(config, "tie_weights", True))

    # Embedding tables (RoPE contributes no positional parameters).
    tok_emb = vocab * dim
    pos_emb = 0 if rope else ctx * dim

    # Attention: c_attn is d -> 3d (weight + bias), c_proj is d -> d.
    qkv_per_layer = 3 * dim * dim + 3 * dim
    proj_per_layer = dim * dim + dim

    # Feed-forward block.
    if mlp_kind == "swiglu":
        # w12: d -> 2h (weight 2h*d, bias 2h); w3: h -> d (weight d*h, bias d)
        ffn_per_layer = 3 * dim * inner + 2 * inner + dim
    else:
        # GELU: d -> n_inner -> d, both with bias.
        ffn_per_layer = 2 * dim * inner + inner + dim

    # One norm's params: RMSNorm has only a weight, LayerNorm weight + bias.
    one_norm = dim if rmsnorm else 2 * dim
    norms_per_layer = 2 * one_norm  # ln_1 + ln_2
    final_norm = one_norm

    per_layer = qkv_per_layer + proj_per_layer + ffn_per_layer + norms_per_layer
    all_layers = depth * per_layer

    # lm_head is Linear(d, V, bias=False); tied heads share the embedding.
    head = 0 if tied else vocab * dim

    total = tok_emb + pos_emb + all_layers + final_norm + head

    return {
        "token_embeddings": tok_emb,
        "position_embeddings": pos_emb,
        "attention_qkv_per_layer": qkv_per_layer,
        "attention_proj_per_layer": proj_per_layer,
        "ffn_per_layer": ffn_per_layer,
        "norms_per_layer": norms_per_layer,
        "final_norm": final_norm,
        "total_transformer_layers": all_layers,
        "lm_head": head,
        "total": total,
        "notes": {
            "use_rope": rope,
            "use_rmsnorm": rmsnorm,
            "mlp_type": mlp_kind,
            "tie_weights": tied,
        },
    }
139
+
140
+
141
def print_parameter_budget(config: "ChessConfig", limit: int = 1_000_000) -> None:
    """
    Print a formatted parameter budget analysis.

    Uses estimate_parameters() for the counts; purely informational
    (writes to stdout, returns nothing).

    Args:
        config: Model configuration.
        limit: Parameter limit (challenge budget, default 1M).
    """
    est = estimate_parameters(config)

    print("=" * 60)
    print("PARAMETER BUDGET ANALYSIS")
    print("=" * 60)
    print("\nConfiguration:")
    print(f" vocab_size (V) = {config.vocab_size}")
    print(f" n_embd (d) = {config.n_embd}")
    print(f" n_layer (L) = {config.n_layer}")
    print(f" n_head = {config.n_head}")
    print(f" n_ctx = {config.n_ctx}")
    print(f" n_inner = {config.n_inner}")
    print(f" tie_weights = {getattr(config, 'tie_weights', True)}")
    # Variant flags are optional on older configs; only show when present.
    if hasattr(config, "use_rope"):
        print(f" use_rope = {getattr(config, 'use_rope', False)}")
    if hasattr(config, "mlp_type"):
        print(f" mlp_type = {getattr(config, 'mlp_type', 'gelu')}")
    if hasattr(config, "use_rmsnorm"):
        print(f" use_rmsnorm = {getattr(config, 'use_rmsnorm', False)}")

    print("\nParameter Breakdown (estimate):")
    print(f" Token Embeddings: {est['token_embeddings']:>10,}")
    print(f" Position Embeddings: {est['position_embeddings']:>10,}")
    print(f" Transformer Layers: {est['total_transformer_layers']:>10,}")
    print(f" Final Norm: {est['final_norm']:>10,}")
    if getattr(config, "tie_weights", True):
        print(f" LM Head: {'(tied)':>10}")
    else:
        print(f" LM Head: {est['lm_head']:>10,}")

    print(" " + "-" * 32)
    print(f" TOTAL: {est['total']:>10,}")

    remaining = limit - est["total"]
    print("\nBudget Status:")
    print(f" Limit: {limit:>10,}")
    print(f" Used: {est['total']:>10,}")
    print(f" Remaining: {remaining:>10,}")

    if est["total"] <= limit:
        print(f"\n✓ Within budget! ({est['total'] / limit * 100:.1f}% used)")
    else:
        # remaining is negative here; negate for a readable overage figure.
        print(f"\n✗ OVER BUDGET by {-remaining:,} parameters!")
    print("=" * 60)
193
+
194
+
195
+ # =========================
196
+ # Move conversion / validation (python-chess)
197
+ # =========================
198
+
199
def convert_extended_uci_to_uci(move: str) -> str:
    """
    Convert extended UCI format to standard UCI format.

    Extended UCI format (dataset):
        [W|B][Piece][from_sq][to_sq][suffixes...]
        e.g. "WPe2e4", "BNg8f6(x)", "WKe1g1(o)", "WPe7e8=Q(+)"
    Standard UCI:
        "e2e4", "g8f6", "e1g1", "e7e8q"

    Moves shorter than the minimal 6-char extended form are returned
    unchanged.
    """
    if len(move) < 6:
        return move

    # Squares sit at fixed offsets after the 2-char color+piece prefix.
    squares = move[2:6]

    # Promotion piece, if any, is the character right after '='.
    promotion = ""
    eq_pos = move.find("=")
    if eq_pos != -1 and eq_pos + 1 < len(move):
        promotion = move[eq_pos + 1].lower()

    return squares + promotion
222
+
223
+
224
def validate_move_with_chess(move: str, board_fen: Optional[str] = None) -> bool:
    """
    Validate a single move using python-chess against a given board state.

    IMPORTANT:
    - If board_fen is None, validation is against the initial position.
      For validating a *game*, use `legal_rate_game_text` which advances the board.

    Args:
        move: Move in extended UCI format (e.g. "WPe2e4").
        board_fen: FEN string of the current board (optional).

    Returns:
        True if move is legal on that board, else False.

    Raises:
        ImportError: If python-chess is not installed.
    """
    try:
        import chess
    except ImportError:
        raise ImportError(
            "python-chess is required for move validation. Install it with: pip install python-chess"
        )

    # Too short to contain color+piece+from+to: reject without parsing.
    if len(move) < 6:
        return False

    board = chess.Board(board_fen) if board_fen else chess.Board()
    uci_move = convert_extended_uci_to_uci(move)

    try:
        move_obj = chess.Move.from_uci(uci_move)
        return move_obj in board.legal_moves
    except Exception:
        # Malformed UCI string (from_uci raised): treat as illegal.
        return False
257
+
258
+
259
def legal_rate_game_text(game_text: str, stop_on_illegal: bool = True) -> float:
    """
    Compute the fraction of legal moves in a space-separated extended-UCI game string.

    The board is advanced after every legal move, so each move is checked
    against the actual position it would be played from.

    Args:
        game_text: "WPe2e4 BPe7e5 ..." (space-separated moves)
        stop_on_illegal: If True, stop at first illegal move.

    Returns:
        legal / total (total is moves processed, or total moves if stop_on_illegal=False).
        Returns 0.0 for an empty/whitespace-only string.

    Raises:
        ImportError: If python-chess is not installed.
    """
    try:
        import chess
    except ImportError:
        raise ImportError("python-chess is required. Install it with: pip install python-chess")

    moves = game_text.strip().split()
    if not moves:
        return 0.0

    board = chess.Board()
    legal = 0
    total = 0

    for mv in moves:
        total += 1
        uci = convert_extended_uci_to_uci(mv)
        try:
            m = chess.Move.from_uci(uci)
        except Exception:
            # Unparseable move counts toward total but not legal.
            if stop_on_illegal:
                break
            continue

        if m in board.legal_moves:
            legal += 1
            board.push(m)  # advance so later moves see the right position
        else:
            if stop_on_illegal:
                break

    # max() guards division even though moves is non-empty here.
    return legal / max(total, 1)
301
+
302
+
303
def convert_uci_to_extended(uci_move: str, board_fen: str) -> str:
    """
    Convert standard UCI move to extended UCI format used by the dataset.

    Args:
        uci_move: e.g., "e2e4", "e7e8q", "e1g1"
        board_fen: FEN of current board (must match move)

    Returns:
        Extended UCI like "WPe2e4", with suffixes:
        - (x) capture
        - (+) check
        - (+*) checkmate
        - (x+) capture+check
        - (x+*) capture+checkmate
        - (o) / (O) castling (kingside / queenside; replaces other suffixes)
        - promotions as "=Q" etc

    Raises:
        ImportError: If python-chess is not installed.
    """
    try:
        import chess
    except ImportError:
        raise ImportError("python-chess is required for move conversion. Install it with: pip install python-chess")

    board = chess.Board(board_fen)
    move = chess.Move.from_uci(uci_move)

    color = "W" if board.turn == chess.WHITE else "B"

    piece = board.piece_at(move.from_square)
    # Fallback to "P" if the square is empty (inconsistent FEN/move input).
    piece_letter = piece.symbol().upper() if piece else "P"

    from_sq = chess.square_name(move.from_square)
    to_sq = chess.square_name(move.to_square)

    result = f"{color}{piece_letter}{from_sq}{to_sq}"

    # Promotion
    if move.promotion:
        result += f"={chess.piece_symbol(move.promotion).upper()}"

    # Capture suffix (must be tested BEFORE pushing the move)
    if board.is_capture(move):
        result += "(x)"

    # Check / mate suffix (need to push); board is restored with pop()
    # so the castling test below sees the pre-move position again.
    board.push(move)
    if board.is_checkmate():
        if "(x)" in result:
            result = result.replace("(x)", "(x+*)")
        else:
            result += "(+*)"
    elif board.is_check():
        if "(x)" in result:
            result = result.replace("(x)", "(x+)")
        else:
            result += "(+)"
    board.pop()

    # Castling (dataset wants (o)/(O), usually no other suffix with it)
    if board.is_castling(move):
        result = re.sub(r"\([^)]*\)", "", result)  # drop any (...) suffix
        # (o) = kingside (king lands on g1/g8), (O) = queenside
        if move.to_square in [chess.G1, chess.G8]:
            result += "(o)"
        else:
            result += "(O)"

    return result
src/__init__.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Chess Challenge source module."""
2
+
3
+ from .model import ChessConfig, ChessForCausalLM
4
+ from .tokenizer import ChessTokenizer
5
+
6
+ # Lazy import for evaluate to avoid RuntimeWarning when running as module
7
+ def __getattr__(name):
8
+ if name == "ChessEvaluator":
9
+ from .evaluate import ChessEvaluator
10
+ return ChessEvaluator
11
+ if name == "load_model_from_hub":
12
+ from .evaluate import load_model_from_hub
13
+ return load_model_from_hub
14
+ raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
15
+
16
+ __all__ = [
17
+ "ChessConfig",
18
+ "ChessForCausalLM",
19
+ "ChessTokenizer",
20
+ "ChessEvaluator",
21
+ "load_model_from_hub",
22
+ ]
src/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (878 Bytes). View file
 
src/__pycache__/data.cpython-311.pyc ADDED
Binary file (8.93 kB). View file
 
src/__pycache__/evaluate.cpython-311.pyc ADDED
Binary file (32.5 kB). View file
 
src/__pycache__/model.cpython-311.pyc ADDED
Binary file (26.3 kB). View file
 
src/__pycache__/tokenizer.cpython-311.pyc ADDED
Binary file (11.4 kB). View file
 
src/__pycache__/train.cpython-311.pyc ADDED
Binary file (13.2 kB). View file
 
src/__pycache__/utils.cpython-311.pyc ADDED
Binary file (15.8 kB). View file
 
src/data.py ADDED
@@ -0,0 +1,205 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Data loading utilities for the Chess Challenge.
3
+
4
+ This module provides functions to load and process chess game data
5
+ from the Lichess dataset on Hugging Face.
6
+
7
+ IMPORTANT NOTE (compat with template evaluate + custom tokenizers):
8
+ - Do NOT manually prepend BOS in the raw text.
9
+ The tokenizer should handle BOS via build_inputs_with_special_tokens.
10
+ This avoids double-BOS issues and keeps train/eval conventions aligned.
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ from typing import Dict, Iterator, List, Optional
16
+
17
+ import torch
18
+ from torch.utils.data import Dataset
19
+
20
+
21
class ChessDataset(Dataset):
    """
    PyTorch Dataset for chess games.

    Each game is tokenized and truncated/padded to max_length.
    Labels are identical to input_ids; the model shifts internally.
    Padding labels are set to -100 (HF convention) so they are ignored by CE loss.
    """

    def __init__(
        self,
        tokenizer,
        dataset_name: str = "dlouapre/lichess_2025-01_1M",
        split: str = "train",
        column: str = "text",
        max_length: int = 256,
        max_samples: Optional[int] = None,
    ):
        from datasets import load_dataset

        self.tokenizer = tokenizer
        self.max_length = max_length
        self.column = column

        loaded = load_dataset(dataset_name, split=split)
        if max_samples is not None:
            # Keep only a leading slice when a sample cap is requested.
            loaded = loaded.select(range(min(max_samples, len(loaded))))
        self.data = loaded

    def __len__(self) -> int:
        return len(self.data)

    def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]:
        """Tokenize one game into fixed-length tensors with masked labels."""
        game_text = self.data[idx][self.column]

        # IMPORTANT: do NOT prepend BOS manually in the raw text; the
        # tokenizer adds special tokens via build_inputs_with_special_tokens,
        # which keeps train/eval conventions aligned with evaluate.py.
        encoded = self.tokenizer(
            game_text,
            truncation=True,
            max_length=self.max_length,
            padding="max_length",
            return_tensors="pt",
        )

        ids = encoded["input_ids"].squeeze(0)
        mask = encoded["attention_mask"].squeeze(0)

        # Padding positions get label -100 so cross-entropy skips them.
        masked_labels = ids.clone()
        masked_labels[mask == 0] = -100

        return {
            "input_ids": ids,
            "attention_mask": mask,
            "labels": masked_labels,
        }
80
+
81
+
82
class ChessDataCollator:
    """
    Data collator for chess games.

    ChessDataset already pads every example to max_length, so collation
    reduces to stacking the per-example tensors along a new batch axis.
    """

    def __init__(self, tokenizer, max_length: int = 256):
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __call__(self, features: List[Dict]) -> Dict[str, torch.Tensor]:
        """Stack the fixed-size tensors of each feature into one batch."""
        batch_keys = ("input_ids", "attention_mask", "labels")
        return {
            key: torch.stack([example[key] for example in features])
            for key in batch_keys
        }
104
+
105
+
106
def create_train_val_datasets(
    tokenizer,
    dataset_name: str = "dlouapre/lichess_2025-01_1M",
    max_length: int = 256,
    train_samples: Optional[int] = None,
    val_samples: int = 5000,
    val_ratio: float = 0.05,
):
    """
    Create training and validation datasets.

    The split is deterministic by index:
    - train: [0:n_train)
    - val:   [n_train:n_train+n_val)

    Returns:
        (train_dataset, val_dataset)
    """
    from datasets import load_dataset

    full = load_dataset(dataset_name, split="train")
    total = len(full)

    # Training-slice size: explicit cap if given, otherwise (1 - val_ratio).
    n_train = (
        min(train_samples, total - val_samples)
        if train_samples is not None
        else int(total * (1 - val_ratio))
    )
    n_val = min(val_samples, total - n_train)

    def _wrap(rows):
        # Build a ChessDataset, then swap in the pre-selected rows.
        subset = ChessDataset(
            tokenizer=tokenizer,
            dataset_name=dataset_name,
            max_length=max_length,
        )
        subset.data = rows
        return subset

    train_dataset = _wrap(full.select(range(n_train)))
    val_dataset = _wrap(full.select(range(n_train, n_train + n_val)))
    return train_dataset, val_dataset
154
+
155
+
156
def stream_games(
    dataset_name: str = "dlouapre/lichess_2025-01_1M",
    split: str = "train",
    column: str = "text",
) -> Iterator[str]:
    """Lazily yield one game string per dataset row (streaming mode)."""
    from datasets import load_dataset

    for row in load_dataset(dataset_name, split=split, streaming=True):
        yield row[column]
169
+
170
+
171
def analyze_dataset_statistics(
    dataset_name: str = "dlouapre/lichess_2025-01_1M",
    max_samples: int = 10000,
) -> Dict:
    """
    Analyze statistics of the chess dataset (non-streaming).

    Args:
        dataset_name: Hugging Face dataset identifier.
        max_samples: Upper bound on the number of games inspected.

    Returns:
        Dict with game-length statistics, move frequencies, and the most
        common opening lines. When no games are available the numeric
        stats are reported as 0 instead of raising.
    """
    from collections import Counter
    from datasets import load_dataset

    dataset = load_dataset(dataset_name, split="train")
    dataset = dataset.select(range(min(max_samples, len(dataset))))

    game_lengths = []
    move_counts = Counter()
    opening_moves = Counter()

    for example in dataset:
        moves = example["text"].strip().split()
        game_lengths.append(len(moves))
        move_counts.update(moves)

        # The first four tokens identify the opening line.
        if len(moves) >= 4:
            opening = " ".join(moves[:4])
            opening_moves[opening] += 1

    # FIX: guard the empty case (max_samples <= 0 or an empty dataset); the
    # original raised ZeroDivisionError on the average and ValueError on
    # min()/max() of an empty sequence.
    n_games = len(game_lengths)
    return {
        "total_games": len(dataset),
        "avg_game_length": sum(game_lengths) / n_games if n_games else 0,
        "min_game_length": min(game_lengths) if n_games else 0,
        "max_game_length": max(game_lengths) if n_games else 0,
        "unique_moves": len(move_counts),
        "most_common_moves": move_counts.most_common(20),
        "most_common_openings": opening_moves.most_common(10),
    }
src/evaluate.py ADDED
@@ -0,0 +1,710 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Evaluation script for the Chess Challenge.
3
+
4
+ This script evaluates a trained chess model by playing games against
5
+ Stockfish and computing ELO ratings.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import argparse
11
+ import random
12
+ import re
13
+ from dataclasses import dataclass
14
+ from typing import List, Optional, Tuple
15
+
16
+ import torch
17
+
18
+
19
@dataclass
class GameResult:
    """Result of a single game between the model and its opponent."""
    # Moves in the order played, as UCI strings (see play_game).
    moves: List[str]
    result: str  # "1-0", "0-1", or "1/2-1/2"
    model_color: str  # "white" or "black"
    termination: str  # "checkmate", "stalemate", "illegal_move", "max_moves", etc.
    # Total retries spent on illegal generations over the whole game.
    illegal_move_count: int
+
28
+
29
+ class ChessEvaluator:
30
+ """
31
+ Evaluator for chess models.
32
+
33
+ This class handles playing games between a trained model and Stockfish,
34
+ tracking results, and computing ELO ratings.
35
+
36
+ Supports any tokenization format as long as the model generates valid
37
+ chess squares (e.g., e2, e4). The evaluator extracts UCI moves by finding
38
+ square patterns in the generated output.
39
+ """
40
+
41
+ # Regex pattern to match chess squares
42
+ SQUARE_PATTERN = r"[a-h][1-8]"
43
+
44
    def __init__(
        self,
        model,
        tokenizer,
        stockfish_path: Optional[str] = None,
        stockfish_level: int = 1,
        max_retries: int = 3,
        device: str = "cuda" if torch.cuda.is_available() else "cpu",
    ):
        """
        Initialize the evaluator.

        Args:
            model: The trained chess model.
            tokenizer: The chess tokenizer.
            stockfish_path: Path to Stockfish executable.
            stockfish_level: Stockfish skill level (0-20).
            max_retries: Maximum retries for illegal moves.
            device: Device to run the model on.

        Raises:
            ImportError: If python-chess is not installed.
        """
        self.model = model.to(device)
        self.model.eval()
        self.tokenizer = tokenizer
        self.max_retries = max_retries
        self.device = device

        # Initialize Stockfish
        try:
            import chess
            import chess.engine

            # Keep a handle on the chess module so other methods can use it
            # without re-importing.
            self.chess = chess

            if stockfish_path is None:
                # Try common paths
                import shutil

                # Fall back to whatever "stockfish" binary is on PATH.
                stockfish_path = shutil.which("stockfish")

            if stockfish_path:
                self.engine = chess.engine.SimpleEngine.popen_uci(stockfish_path)
                self.engine.configure({"Skill Level": stockfish_level})
            else:
                # Degrade gracefully: with engine=None the opponent plays
                # random legal moves instead (see play_game).
                print("WARNING: Stockfish not found. Install it for full evaluation.")
                self.engine = None

        except ImportError:
            raise ImportError(
                "python-chess is required for evaluation. "
                "Install it with: pip install python-chess"
            )
+ )
95
+
96
+ def __del__(self):
97
+ """Clean up Stockfish engine."""
98
+ if hasattr(self, "engine") and self.engine:
99
+ self.engine.quit()
100
+
101
    def _detect_tokenizer_format(self) -> str:
        """
        Detect the tokenizer's expected move format by testing tokenization.

        Tests various formats with a sample move and picks the one that
        produces the fewest unknown tokens. This makes evaluation work
        with any tokenizer format.

        Supported formats:
            - 'decomposed': "WP e2_f e4_t" (piece, from_suffix, to_suffix)
            - 'standard': "WPe2e4" (combined with optional annotations)
            - 'uci': "e2e4" (pure UCI notation)
            - 'uci_spaced': "e2 e4" (UCI with space separator)

        Returns:
            The format string that best matches the tokenizer's vocabulary.
        """
        # Detection is deterministic for a given tokenizer; cache the result.
        if hasattr(self, "_cached_format"):
            return self._cached_format

        test_formats = {
            "decomposed": "WP e2_f e4_t",
            "standard": "WPe2e4",
            "uci": "e2e4",
            "uci_spaced": "e2 e4",
        }

        unk_token_id = getattr(self.tokenizer, "unk_token_id", None)
        best_format = "standard"
        min_unk_count = float("inf")

        for fmt, sample in test_formats.items():
            try:
                tokens = self.tokenizer.encode(sample, add_special_tokens=False)
                unk_count = tokens.count(unk_token_id) if unk_token_id is not None else 0
                # A single token that *is* the unk token means the whole
                # sample was unrecognized; penalize so it never wins.
                if len(tokens) == 1 and unk_count == 1:
                    unk_count = 100  # heavy penalty
                if unk_count < min_unk_count:
                    min_unk_count = unk_count
                    best_format = fmt
            except Exception:
                # Some tokenizers raise on text they cannot encode; skip.
                continue

        self._cached_format = best_format
        return best_format
146
+
147
+ def _format_move(
148
+ self,
149
+ color: str,
150
+ piece: str,
151
+ from_sq: str,
152
+ to_sq: str,
153
+ promotion: str = None,
154
+ ) -> str:
155
+ fmt = self._detect_tokenizer_format()
156
+
157
+ if fmt == "decomposed":
158
+ move_str = f"{color}{piece} {from_sq}_f {to_sq}_t"
159
+ elif fmt == "uci":
160
+ move_str = f"{from_sq}{to_sq}"
161
+ if promotion:
162
+ move_str += promotion.lower()
163
+ elif fmt == "uci_spaced":
164
+ move_str = f"{from_sq} {to_sq}"
165
+ if promotion:
166
+ move_str += f" {promotion.lower()}"
167
+ else: # standard
168
+ move_str = f"{color}{piece}{from_sq}{to_sq}"
169
+ if promotion:
170
+ move_str += f"={promotion}"
171
+
172
+ return move_str
173
+
174
    def _convert_board_to_moves(self, board) -> str:
        """Re-encode *board*'s move stack as tokenizer-format move strings.

        Replays the game from the initial position so per-move facts
        (mover color, moved piece, capture/check status) can be recomputed.
        Annotation suffixes — (x), (+), (+*), (o)/(O) — are produced only
        for the "standard" format.

        Returns:
            Space-joined move strings, empty string for an empty game.
        """
        moves = []
        temp_board = self.chess.Board()
        fmt = self._detect_tokenizer_format()

        for move in board.move_stack:
            # Mover color and piece must be read BEFORE pushing the move.
            color = "W" if temp_board.turn == self.chess.WHITE else "B"
            piece = temp_board.piece_at(move.from_square)
            piece_letter = piece.symbol().upper() if piece else "P"

            from_sq = self.chess.square_name(move.from_square)
            to_sq = self.chess.square_name(move.to_square)

            promo = None
            if move.promotion:
                promo = self.chess.piece_symbol(move.promotion).upper()

            move_str = self._format_move(color, piece_letter, from_sq, to_sq, promo)

            if fmt == "standard":
                # Capture is checked pre-push; check/checkmate post-push.
                if temp_board.is_capture(move):
                    move_str += "(x)"

                temp_board.push(move)

                if temp_board.is_checkmate():
                    if "(x)" in move_str:
                        move_str = move_str.replace("(x)", "(x+*)")
                    else:
                        move_str += "(+*)"
                elif temp_board.is_check():
                    if "(x)" in move_str:
                        move_str = move_str.replace("(x)", "(x+)")
                    else:
                        move_str += "(+)"

                # A king moving more than one file is castling; the dataset
                # encodes it as (o) kingside / (O) queenside and drops any
                # other suffix.
                if piece_letter == "K":
                    if abs(ord(from_sq[0]) - ord(to_sq[0])) > 1:
                        if to_sq[0] == "g":
                            move_str = move_str.split("(")[0] + "(o)"
                        else:
                            move_str = move_str.split("(")[0] + "(O)"
            else:
                # Non-"standard" formats carry no annotations; just advance.
                temp_board.push(move)

            moves.append(move_str)

        return " ".join(moves)
222
+
223
+ def _is_separator_token(self, token_str: str) -> bool:
224
+ if hasattr(self.tokenizer, "eos_token") and token_str == self.tokenizer.eos_token:
225
+ return True
226
+ if token_str.strip() == "" and len(token_str) > 0:
227
+ return True
228
+ if token_str != token_str.rstrip():
229
+ return True
230
+ return False
231
+
232
+ def _extract_uci_move(self, text: str) -> Optional[str]:
233
+ if not text:
234
+ return None
235
+
236
+ squares = re.findall(self.SQUARE_PATTERN, text)
237
+ if len(squares) < 2:
238
+ return None
239
+
240
+ from_sq, to_sq = squares[0], squares[1]
241
+ uci_move = from_sq + to_sq
242
+
243
+ to_sq_idx = text.find(to_sq)
244
+ if to_sq_idx != -1:
245
+ remaining = text[to_sq_idx + 2 : to_sq_idx + 5]
246
+ promo_match = re.search(r"[=]?([qrbnQRBN])", remaining)
247
+ if promo_match:
248
+ uci_move += promo_match.group(1).lower()
249
+
250
+ return uci_move
251
+
252
+ def _has_complete_move(self, text: str) -> bool:
253
+ squares = re.findall(self.SQUARE_PATTERN, text)
254
+ return len(squares) >= 2
255
+
256
    def _generate_move_tokens(
        self,
        input_ids: torch.Tensor,
        temperature: float = 0.7,
        top_k: int = 10,
        max_tokens: int = 20,
    ) -> str:
        """Sample tokens until one full move (two squares) has been produced.

        Args:
            input_ids: Prompt token ids; assumed shape (1, seq_len) — TODO confirm.
            temperature: Softmax temperature (clamped away from zero).
            top_k: Keep only the k most likely tokens before sampling.
            max_tokens: Hard cap on generated tokens for a single move.

        Returns:
            The decoded, stripped move text; empty string if nothing was kept.
        """
        generated_tokens = []
        current_ids = input_ids.clone()
        accumulated_text = ""

        for _ in range(max_tokens):
            with torch.no_grad():
                outputs = self.model(input_ids=current_ids)
                # Last-position logits; max() guards temperature == 0.
                logits = outputs.logits[:, -1, :] / max(temperature, 1e-6)

            # Top-k filtering: mask everything below the k-th largest logit.
            if top_k > 0:
                top_k_vals = torch.topk(logits, min(top_k, logits.size(-1)))
                indices_to_remove = logits < top_k_vals[0][..., -1, None]
                logits[indices_to_remove] = float("-inf")

            probs = torch.softmax(logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)

            token_str = self.tokenizer.decode(next_token[0])

            # Separator handling: stop once a complete move has been seen,
            # on EOS, or on any separator after partial text.
            if self._is_separator_token(token_str):
                if self._has_complete_move(accumulated_text):
                    break
                if hasattr(self.tokenizer, "eos_token") and token_str == self.tokenizer.eos_token:
                    break
                if accumulated_text:
                    break

            generated_tokens.append(next_token[0])
            current_ids = torch.cat([current_ids, next_token], dim=-1)
            accumulated_text += token_str

            if self._has_complete_move(accumulated_text):
                squares = re.findall(self.SQUARE_PATTERN, accumulated_text)
                if len(squares) >= 2:
                    to_sq = squares[1]
                    # Back-rank destination: keep sampling briefly, since a
                    # promotion letter may still follow the to-square.
                    if to_sq[1] in "18":
                        if len(generated_tokens) > 3:
                            break
                    else:
                        break

        if generated_tokens:
            all_tokens = torch.cat(generated_tokens, dim=0)
            move_str = self.tokenizer.decode(all_tokens, skip_special_tokens=True)
            return move_str.strip()

        return ""
310
+
311
    def _get_model_move(
        self,
        board,
        temperature: float = 0.7,
        top_k: int = 10,
    ) -> Tuple[Optional[str], int]:
        """Ask the model for a legal move in the current position.

        The game so far is re-encoded in the tokenizer's format, generation
        is attempted up to ``self.max_retries`` times, and the first legal
        UCI move is returned.

        Returns:
            (uci_move, retries_used); uci_move is None when every attempt
            produced an illegal or unparseable move.
        """
        self.model.eval()

        moves_str = self._convert_board_to_moves(board)

        # BOS-prefixed prompt; an empty history means BOS alone.
        if not moves_str:
            input_text = self.tokenizer.bos_token
        else:
            input_text = self.tokenizer.bos_token + " " + moves_str

        # Leave headroom below the context window for the generated move.
        inputs = self.tokenizer(
            input_text,
            return_tensors="pt",
            truncation=True,
            max_length=self.model.config.n_ctx - 10,
        ).to(self.device)

        for retry in range(self.max_retries):
            move_text = self._generate_move_tokens(
                inputs["input_ids"],
                temperature=temperature,
                top_k=top_k,
            )

            uci_move = self._extract_uci_move(move_text)

            if uci_move:
                try:
                    move = self.chess.Move.from_uci(uci_move)
                    if move in board.legal_moves:
                        return uci_move, retry
                except (ValueError, self.chess.InvalidMoveError):
                    # Malformed UCI string: treat as a failed attempt.
                    pass

        return None, self.max_retries
351
+
352
+ def _get_stockfish_move(self, board, time_limit: float = 0.1) -> str:
353
+ if self.engine is None:
354
+ raise RuntimeError("Stockfish engine not initialized")
355
+
356
+ result = self.engine.play(board, self.chess.engine.Limit(time=time_limit))
357
+ return result.move.uci()
358
+
359
    def play_game(
        self,
        model_color: str = "white",
        max_moves: int = 200,
        temperature: float = 0.7,
    ) -> GameResult:
        """Play one full game: the model vs. Stockfish (or random moves).

        Args:
            model_color: "white" or "black" — the model's side.
            max_moves: Ply cap; reaching it scores the game as a draw.
            temperature: Sampling temperature for move generation.

        Returns:
            A GameResult with the move list, score, and termination reason.
        """
        board = self.chess.Board()
        moves = []
        illegal_move_count = 0

        model_is_white = model_color == "white"

        while not board.is_game_over() and len(moves) < max_moves:
            is_model_turn = (board.turn == self.chess.WHITE) == model_is_white

            if is_model_turn:
                uci_move, retries = self._get_model_move(board, temperature)
                illegal_move_count += retries

                # The model never produced a legal move: it forfeits.
                if uci_move is None:
                    return GameResult(
                        moves=moves,
                        result="0-1" if model_is_white else "1-0",
                        model_color=model_color,
                        termination="illegal_move",
                        illegal_move_count=illegal_move_count + 1,
                    )

                move = self.chess.Move.from_uci(uci_move)
            else:
                if self.engine:
                    uci_move = self._get_stockfish_move(board)
                    move = self.chess.Move.from_uci(uci_move)
                else:
                    # No engine available: opponent plays uniform random.
                    move = random.choice(list(board.legal_moves))

            board.push(move)
            moves.append(move.uci())

        # Score the final position. On checkmate the side to move is the
        # side that got mated.
        if board.is_checkmate():
            if board.turn == self.chess.WHITE:
                result = "0-1"
            else:
                result = "1-0"
            termination = "checkmate"
        elif board.is_stalemate():
            result = "1/2-1/2"
            termination = "stalemate"
        elif board.is_insufficient_material():
            result = "1/2-1/2"
            termination = "insufficient_material"
        elif board.can_claim_draw():
            result = "1/2-1/2"
            termination = "draw_claim"
        elif len(moves) >= max_moves:
            result = "1/2-1/2"
            termination = "max_moves"
        else:
            result = "1/2-1/2"
            termination = "unknown"

        return GameResult(
            moves=moves,
            result=result,
            model_color=model_color,
            termination=termination,
            illegal_move_count=illegal_move_count,
        )
427
+
428
    def evaluate_legal_moves(
        self,
        n_positions: int = 1000,
        temperature: float = 0.7,
        verbose: bool = True,
        seed: int = 42,
    ) -> dict:
        """Measure how often the model proposes legal moves.

        Random positions are reached by playing 5-40 random plies from the
        start; the model then gets up to ``max_retries`` attempts per
        position.

        Returns:
            Dict with per-position records and aggregate legal-move rates.
        """
        # Seed both RNGs so position sampling and generation reproduce.
        random.seed(seed)
        torch.manual_seed(seed)

        results = {
            "total_positions": 0,
            "legal_first_try": 0,
            "legal_with_retry": 0,
            "illegal_all_retries": 0,
            "positions": [],
        }

        for i in range(n_positions):
            board = self.chess.Board()

            # Walk to a random mid-game position.
            n_random_moves = random.randint(5, 40)
            for _ in range(n_random_moves):
                if board.is_game_over():
                    break
                move = random.choice(list(board.legal_moves))
                board.push(move)

            # A finished game has no move to predict; skip the position.
            if board.is_game_over():
                continue

            results["total_positions"] += 1

            uci_move, retries = self._get_model_move(board, temperature)

            position_result = {
                "fen": board.fen(),
                "move_number": len(board.move_stack),
                "legal": uci_move is not None,
                "retries": retries,
            }
            results["positions"].append(position_result)

            if uci_move is not None:
                if retries == 0:
                    results["legal_first_try"] += 1
                else:
                    results["legal_with_retry"] += 1
            else:
                results["illegal_all_retries"] += 1

            if verbose and (i + 1) % 100 == 0:
                legal_rate = (results["legal_first_try"] + results["legal_with_retry"]) / results["total_positions"]
                print(f" Positions: {i + 1}/{n_positions} | Legal rate: {legal_rate:.1%}")

        # Aggregate rates; degenerate case: nothing was testable.
        total = results["total_positions"]
        if total > 0:
            results["legal_rate_first_try"] = results["legal_first_try"] / total
            results["legal_rate_with_retry"] = (results["legal_first_try"] + results["legal_with_retry"]) / total
            results["illegal_rate"] = results["illegal_all_retries"] / total
        else:
            results["legal_rate_first_try"] = 0
            results["legal_rate_with_retry"] = 0
            results["illegal_rate"] = 1

        return results
494
+
495
+ def evaluate(
496
+ self,
497
+ n_games: int = 100,
498
+ temperature: float = 0.7,
499
+ verbose: bool = True,
500
+ ) -> dict:
501
+ results = {
502
+ "wins": 0,
503
+ "losses": 0,
504
+ "draws": 0,
505
+ "illegal_moves": 0,
506
+ "total_moves": 0,
507
+ "games": [],
508
+ }
509
+
510
+ for i in range(n_games):
511
+ model_color = "white" if i % 2 == 0 else "black"
512
+
513
+ game = self.play_game(
514
+ model_color=model_color,
515
+ temperature=temperature,
516
+ )
517
+
518
+ results["games"].append(game)
519
+ results["total_moves"] += len(game.moves)
520
+ results["illegal_moves"] += game.illegal_move_count
521
+
522
+ if game.result == "1/2-1/2":
523
+ results["draws"] += 1
524
+ elif (game.result == "1-0" and model_color == "white") or (game.result == "0-1" and model_color == "black"):
525
+ results["wins"] += 1
526
+ else:
527
+ results["losses"] += 1
528
+
529
+ if verbose and (i + 1) % 10 == 0:
530
+ print(
531
+ f" Games: {i + 1}/{n_games} | "
532
+ f"W: {results['wins']} L: {results['losses']} D: {results['draws']}"
533
+ )
534
+
535
+ total = results["wins"] + results["losses"] + results["draws"]
536
+ results["win_rate"] = results["wins"] / total if total > 0 else 0
537
+ results["draw_rate"] = results["draws"] / total if total > 0 else 0
538
+ results["loss_rate"] = results["losses"] / total if total > 0 else 0
539
+
540
+ total_attempts = results["total_moves"] + results["illegal_moves"]
541
+ results["avg_game_length"] = total_attempts / total if total > 0 else 0
542
+ results["illegal_move_rate"] = results["illegal_moves"] / total_attempts if total_attempts > 0 else 0
543
+
544
+ stockfish_elo = 1350
545
+ if results["win_rate"] > 0 or results["loss_rate"] > 0:
546
+ score = results["wins"] + 0.5 * results["draws"]
547
+ if score > 0:
548
+ win_ratio = score / total
549
+ if 0 < win_ratio < 1:
550
+ elo_diff = -400 * (1 - 2 * win_ratio) / (1 if win_ratio > 0.5 else -1)
551
+ results["estimated_elo"] = stockfish_elo + elo_diff
552
+ else:
553
+ results["estimated_elo"] = stockfish_elo + (400 if win_ratio >= 1 else -400)
554
+ else:
555
+ results["estimated_elo"] = stockfish_elo - 400
556
+ else:
557
+ results["estimated_elo"] = None
558
+
559
+ return results
560
+
561
+
562
def load_model_from_hub(model_id: str, device: str = "auto", verbose: bool = True):
    """Load a chess model + tokenizer from the Hugging Face Hub.

    Tries AutoTokenizer (with remote code) first and falls back to the
    local ChessTokenizer class with vocabulary files fetched from the Hub.

    Args:
        model_id: Hub repository id.
        device: Passed through as ``device_map`` to ``from_pretrained``.
        verbose: Print diagnostics about what was loaded.

    Returns:
        (model, tokenizer)
    """
    from transformers import AutoModelForCausalLM, AutoTokenizer

    # Import to register custom classes
    from src.model import ChessConfig, ChessForCausalLM
    from src.tokenizer import ChessTokenizer

    tokenizer_source = None
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
        tokenizer_source = "AutoTokenizer (from Hub with trust_remote_code=True)"
    except Exception as e:
        if verbose:
            print(f" AutoTokenizer failed: {e}")
        # Fallback: local tokenizer class, vocabulary from the Hub.
        tokenizer = ChessTokenizer.from_pretrained(model_id)
        tokenizer_source = "ChessTokenizer (local class, vocab from Hub)"

    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        trust_remote_code=True,
        device_map=device,
    )

    if verbose:
        print(f" Tokenizer loaded via: {tokenizer_source}")
        print(f" Tokenizer class: {type(tokenizer).__name__}")
        print(f" Tokenizer vocab size: {tokenizer.vocab_size}")
        if hasattr(tokenizer, "_vocab"):
            print(f" Tokenizer has _vocab attribute: yes ({len(tokenizer._vocab)} entries)")

    return model, tokenizer
593
+
594
+
595
def main():
    """CLI entry point: load a model (local path or Hub id) and evaluate it.

    Two phases, selectable via --mode: "legal" checks the legal-move rate
    on random positions; "winrate" plays full games against Stockfish.
    """
    parser = argparse.ArgumentParser(description="Evaluate a chess model")

    parser.add_argument("--model_path", type=str, required=True, help="Path to the model or Hugging Face model ID")
    parser.add_argument("--mode", type=str, default="legal", choices=["legal", "winrate", "both"])
    parser.add_argument("--stockfish_path", type=str, default=None, help="Path to Stockfish executable")
    parser.add_argument("--stockfish_level", type=int, default=1, help="Stockfish skill level (0-20)")
    parser.add_argument("--n_positions", type=int, default=500, help="Number of positions for legal move evaluation")
    parser.add_argument("--seed", type=int, default=42, help="Random seed for reproducibility")
    parser.add_argument("--n_games", type=int, default=100, help="Number of games to play for win rate evaluation")
    parser.add_argument("--temperature", type=float, default=0.7, help="Sampling temperature")

    args = parser.parse_args()

    print("=" * 60)
    print("CHESS CHALLENGE - EVALUATION")
    print("=" * 60)

    print(f"\nLoading model from: {args.model_path}")

    import os
    is_local_path = os.path.exists(args.model_path)

    if is_local_path:
        # Local path
        from transformers import AutoModelForCausalLM
        from src.tokenizer import ChessTokenizer
        from src.model import ChessConfig, ChessForCausalLM

        tokenizer = ChessTokenizer.from_pretrained(args.model_path)

        # IMPORTANT FIX:
        # Our custom ChessForCausalLM does NOT support device_map="auto" unless _no_split_modules is defined.
        # So we load normally and move to device explicitly.
        device = "cuda" if torch.cuda.is_available() else "cpu"

        model = AutoModelForCausalLM.from_pretrained(
            args.model_path,
            trust_remote_code=True,
        )
        model.to(device)
        model.eval()
    else:
        # A path-like string that does not exist is a user error, not a Hub id.
        if args.model_path.startswith(".") or args.model_path.startswith("/"):
            raise FileNotFoundError(
                f"Local model path not found: {args.model_path}\n"
                f"Please check that the path exists and contains model files."
            )
        model, tokenizer = load_model_from_hub(args.model_path)

    print(f"\nSetting up evaluator...")
    evaluator = ChessEvaluator(
        model=model,
        tokenizer=tokenizer,
        stockfish_path=args.stockfish_path,
        stockfish_level=args.stockfish_level,
    )

    # Phase 1: how often does the model produce a legal move?
    if args.mode in ["legal", "both"]:
        print(f"\n" + "=" * 60)
        print("PHASE 1: LEGAL MOVE EVALUATION")
        print("=" * 60)
        print(f"Testing {args.n_positions} random positions...")

        legal_results = evaluator.evaluate_legal_moves(
            n_positions=args.n_positions,
            temperature=args.temperature,
            verbose=True,
            seed=args.seed,
        )

        print("\n" + "-" * 40)
        print("LEGAL MOVE RESULTS")
        print("-" * 40)
        print(f" Positions tested: {legal_results['total_positions']}")
        print(f" Legal (1st try): {legal_results['legal_first_try']} ({legal_results['legal_rate_first_try']:.1%})")
        print(
            f" Legal (with retry): {legal_results['legal_first_try'] + legal_results['legal_with_retry']}"
            f" ({legal_results['legal_rate_with_retry']:.1%})"
        )
        print(f" Always illegal: {legal_results['illegal_all_retries']} ({legal_results['illegal_rate']:.1%})")

    # Phase 2: full games against Stockfish.
    if args.mode in ["winrate", "both"]:
        print(f"\n" + "=" * 60)
        print("PHASE 2: WIN RATE EVALUATION")
        print("=" * 60)
        print(f"Playing {args.n_games} games against Stockfish (Level {args.stockfish_level})...")

        winrate_results = evaluator.evaluate(
            n_games=args.n_games,
            temperature=args.temperature,
            verbose=True,
        )

        print("\n" + "-" * 40)
        print("WIN RATE RESULTS")
        print("-" * 40)
        print(f" Wins: {winrate_results['wins']}")
        print(f" Losses: {winrate_results['losses']}")
        print(f" Draws: {winrate_results['draws']}")
        print(f"\n Win Rate: {winrate_results['win_rate']:.1%}")
        print(f" Draw Rate: {winrate_results['draw_rate']:.1%}")
        print(f" Loss Rate: {winrate_results['loss_rate']:.1%}")
        print(f"\n Avg Game Length: {winrate_results['avg_game_length']:.1f} moves")
        print(f" Illegal Move Rate: {winrate_results['illegal_move_rate']:.2%}")

        if winrate_results.get("estimated_elo", None):
            print(f"\n Estimated ELO: {winrate_results['estimated_elo']:.0f}")

    print("\n" + "=" * 60)
    print("EVALUATION COMPLETE")
    print("=" * 60)
+
708
+
709
+ if __name__ == "__main__":
710
+ main()
src/model.py ADDED
@@ -0,0 +1,446 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Chess Transformer Model for the Chess Challenge.
3
+
4
+ Modern small-LLM upgrades:
5
+ - RoPE (rotary positional embeddings): no learned positional embeddings needed
6
+ - RMSNorm (optional, default True)
7
+ - SwiGLU MLP (optional, default True)
8
+ - Weight tying (default True)
9
+ - Safe loss ignore_index = -100 (HF convention)
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import math
15
+ from typing import Optional, Tuple, Union
16
+
17
+ import torch
18
+ import torch.nn as nn
19
+ import torch.nn.functional as F
20
+ from transformers import PretrainedConfig, PreTrainedModel
21
+ from transformers.modeling_outputs import CausalLMOutputWithPast
22
+
23
+
24
class ChessConfig(PretrainedConfig):
    """Configuration for the small chess causal LM.

    Defaults are tuned so the full model stays under 1M parameters for
    common vocabulary sizes. Validates head divisibility, RoPE head-dim
    parity, and the MLP type at construction time.
    """

    model_type = "chess_transformer"

    def __init__(
        self,
        vocab_size: int = 1200,

        # Architecture (defaults tuned to be < 1M params for common vocabs)
        n_embd: int = 112,
        n_layer: int = 7,
        n_head: int = 7,

        # Context window
        n_ctx: int = 512,

        # MLP hidden size:
        # - if mlp_type="swiglu", this is SwiGLU hidden size h
        # - if mlp_type="gelu", this is FFN inner size
        n_inner: Optional[int] = 192,

        dropout: float = 0.05,
        layer_norm_epsilon: float = 1e-6,

        # Position encoding
        use_rope: bool = True,
        rope_theta: float = 10000.0,

        # Normalization / MLP type
        use_rmsnorm: bool = True,
        mlp_type: str = "swiglu",  # "swiglu" or "gelu"

        # Weight tying
        tie_weights: bool = True,

        pad_token_id: int = 0,
        bos_token_id: int = 1,
        eos_token_id: int = 2,
        **kwargs,
    ):
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            **kwargs,
        )

        if n_embd % n_head != 0:
            raise ValueError(f"n_embd ({n_embd}) must be divisible by n_head ({n_head})")

        head_dim = n_embd // n_head
        if use_rope and (head_dim % 2 != 0):
            raise ValueError(
                f"RoPE requires even head_dim, got head_dim={head_dim}. "
                f"Choose n_embd/n_head even."
            )

        # FIX: fail fast on an unknown MLP type. Previously an invalid value
        # silently fell through to the GELU branch in TransformerBlock.
        if mlp_type.lower() not in ("swiglu", "gelu"):
            raise ValueError(f"mlp_type must be 'swiglu' or 'gelu', got {mlp_type!r}")

        self.vocab_size = vocab_size
        self.n_embd = n_embd
        self.n_layer = n_layer
        self.n_head = n_head
        self.n_ctx = n_ctx
        # Fall back to a 2x expansion when no inner size is given.
        self.n_inner = n_inner if n_inner is not None else (2 * n_embd)
        self.dropout = dropout
        self.layer_norm_epsilon = layer_norm_epsilon

        self.use_rope = use_rope
        self.rope_theta = rope_theta

        self.use_rmsnorm = use_rmsnorm
        self.mlp_type = mlp_type

        self.tie_weights = tie_weights
        # HF uses this field for embedding tying behavior
        self.tie_word_embeddings = bool(tie_weights)
98
+
99
+
100
class RMSNorm(nn.Module):
    """Root-mean-square layer norm: no mean subtraction, no bias, one gain."""

    def __init__(self, dim: int, eps: float = 1e-6):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Scale each feature vector by the reciprocal of its RMS, then
        # apply the learned per-dimension gain.
        mean_sq = x.pow(2).mean(dim=-1, keepdim=True)
        return x * torch.rsqrt(mean_sq + self.eps) * self.weight
109
+
110
+
111
def rotate_half(x: torch.Tensor) -> torch.Tensor:
    """Map interleaved pairs (a, b) to (-b, a) along the last dim (RoPE helper)."""
    even = x[..., 0::2]
    odd = x[..., 1::2]
    # Pair up (-odd, even) and flatten back to the original interleaving.
    return torch.stack((-odd, even), dim=-1).flatten(start_dim=-2)
118
+
119
+
120
class RotaryEmbedding(nn.Module):
    """Rotary positional embeddings (RoPE).

    Lazily builds cos/sin tables and applies the rotation to q and k of
    shape (B, H, T, D). The cache is rebuilt whenever the requested length
    grows or the device/dtype changes.
    """

    def __init__(self, head_dim: int, theta: float = 10000.0):
        super().__init__()
        if head_dim % 2 != 0:
            raise ValueError(f"RoPE requires even head_dim, got {head_dim}")

        inv_freq = 1.0 / (theta ** (torch.arange(0, head_dim, 2).float() / head_dim))
        self.register_buffer("inv_freq", inv_freq, persistent=False)

        # Cached tables plus the key they were built for.
        self._cos_cached = None
        self._sin_cached = None
        self._seq_len_cached = 0
        self._device_cached = None
        self._dtype_cached = None

    def _build_cache(self, seq_len: int, device: torch.device, dtype: torch.dtype):
        positions = torch.arange(seq_len, device=device, dtype=self.inv_freq.dtype)
        angles = torch.einsum("i,j->ij", positions, self.inv_freq)  # (T, D/2)

        self._cos_cached = angles.cos().to(dtype=dtype)
        self._sin_cached = angles.sin().to(dtype=dtype)
        self._seq_len_cached = seq_len
        self._device_cached = device
        self._dtype_cached = dtype

    def forward(self, q: torch.Tensor, k: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        # q, k: (B, H, T, D)
        seq_len = q.size(-2)
        device, dtype = q.device, q.dtype

        cache_stale = (
            self._cos_cached is None
            or seq_len > self._seq_len_cached
            or device != self._device_cached
            or dtype != self._dtype_cached
        )
        if cache_stale:
            self._build_cache(seq_len, device, dtype)

        # (T, D/2) -> (1, 1, T, D): duplicate each angle for its (a, b) pair.
        cos = torch.repeat_interleave(self._cos_cached[:seq_len].unsqueeze(0).unsqueeze(0), 2, dim=-1)
        sin = torch.repeat_interleave(self._sin_cached[:seq_len].unsqueeze(0).unsqueeze(0), 2, dim=-1)

        rotated_q = (q * cos) + (rotate_half(q) * sin)
        rotated_k = (k * cos) + (rotate_half(k) * sin)
        return rotated_q, rotated_k
176
+
177
+
178
class MultiHeadAttention(nn.Module):
    """Causal multi-head self-attention with optional rotary embeddings.

    Operates on inputs of shape (B, T, n_embd); applies a lower-triangular
    causal mask and, when provided, a padding mask (1 = keep, 0 = mask).
    """

    def __init__(self, config: ChessConfig):
        super().__init__()

        self.n_head = config.n_head
        self.n_embd = config.n_embd
        self.head_dim = config.n_embd // config.n_head

        # Fused QKV projection plus output projection (GPT-2 naming).
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd)
        self.c_proj = nn.Linear(config.n_embd, config.n_embd)
        self.dropout = nn.Dropout(config.dropout)

        self.use_rope = bool(config.use_rope)
        self.rope = RotaryEmbedding(self.head_dim, theta=config.rope_theta) if self.use_rope else None

        # causal mask buffer (expandable); non-persistent so it is rebuilt
        # rather than loaded from checkpoints.
        self.register_buffer(
            "bias",
            torch.tril(torch.ones(config.n_ctx, config.n_ctx)).view(1, 1, config.n_ctx, config.n_ctx),
            persistent=False,
        )

    def _ensure_causal_mask(self, seq_len: int, device: torch.device, dtype: torch.dtype):
        # Rebuild the triangular mask only when the cached one is too small
        # or sits on the wrong device; otherwise reuse a slice of it.
        if self.bias.size(-1) >= seq_len and self.bias.device == device:
            return
        self.bias = torch.tril(torch.ones(seq_len, seq_len, device=device, dtype=dtype)).view(1, 1, seq_len, seq_len)

    def forward(self, x: torch.Tensor, attention_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
        """Return the attention output, same shape as ``x`` (B, T, n_embd)."""
        B, T, _ = x.size()

        qkv = self.c_attn(x)
        q, k, v = qkv.split(self.n_embd, dim=2)

        q = q.view(B, T, self.n_head, self.head_dim).transpose(1, 2)  # (B,H,T,D)
        k = k.view(B, T, self.n_head, self.head_dim).transpose(1, 2)
        v = v.view(B, T, self.n_head, self.head_dim).transpose(1, 2)

        if self.use_rope:
            q, k = self.rope(q, k)

        # Scaled dot-product scores: (B, H, T, T).
        attn = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.head_dim)

        self._ensure_causal_mask(T, attn.device, attn.dtype)
        causal_mask = self.bias[:, :, :T, :T]
        # Most-negative finite value of the score dtype stands in for -inf.
        mask_value = torch.finfo(attn.dtype).min
        attn = attn.masked_fill(causal_mask == 0, mask_value)

        # padding mask (1=keep, 0=mask)
        if attention_mask is not None:
            am = attention_mask.unsqueeze(1).unsqueeze(2)  # (B,1,1,T)
            attn = attn.masked_fill(am == 0, mask_value)

        attn = F.softmax(attn, dim=-1)
        attn = self.dropout(attn)

        y = torch.matmul(attn, v)  # (B,H,T,D)
        y = y.transpose(1, 2).contiguous().view(B, T, self.n_embd)

        y = self.c_proj(y)
        y = self.dropout(y)
        return y
239
+
240
+
241
class SwiGLU(nn.Module):
    """SwiGLU feed-forward: w3(silu(w1 x) * (w2 x)), with w1/w2 fused in w12."""

    def __init__(self, config: "ChessConfig"):
        super().__init__()
        hidden = config.n_inner
        # Single projection producing both the gate and the value halves.
        self.w12 = nn.Linear(config.n_embd, 2 * hidden)
        self.w3 = nn.Linear(hidden, config.n_embd)
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        gate, value = self.w12(x).chunk(2, dim=-1)
        return self.dropout(self.w3(F.silu(gate) * value))
256
+
257
+
258
class FeedForwardGELU(nn.Module):
    """Classic two-layer GELU feed-forward block (GPT-2 style naming)."""

    def __init__(self, config: "ChessConfig"):
        super().__init__()
        self.c_fc = nn.Linear(config.n_embd, config.n_inner)
        self.c_proj = nn.Linear(config.n_inner, config.n_embd)
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        hidden = F.gelu(self.c_fc(x))
        return self.dropout(self.c_proj(hidden))
271
+
272
+
273
class TransformerBlock(nn.Module):
    """Pre-norm transformer block: x + attn(norm(x)), then x + mlp(norm(x))."""

    def __init__(self, config: "ChessConfig"):
        super().__init__()

        # Pick the normalization class once, build both layers from it.
        norm_cls = RMSNorm if config.use_rmsnorm else nn.LayerNorm
        self.ln_1 = norm_cls(config.n_embd, eps=config.layer_norm_epsilon)
        self.ln_2 = norm_cls(config.n_embd, eps=config.layer_norm_epsilon)

        self.attn = MultiHeadAttention(config)

        if config.mlp_type.lower() == "swiglu":
            self.mlp = SwiGLU(config)
        else:
            self.mlp = FeedForwardGELU(config)

    def forward(self, x: torch.Tensor, attention_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
        x = x + self.attn(self.ln_1(x), attention_mask=attention_mask)
        return x + self.mlp(self.ln_2(x))
295
+
296
+
297
class ChessForCausalLM(PreTrainedModel):
    """Decoder-only causal LM over decomposed chess-move tokens.

    Token embedding -> n_layer pre-norm TransformerBlocks -> final norm ->
    (optionally weight-tied) LM head. Loss follows the HF convention of
    shifting logits/labels and ignoring label index -100.
    """

    config_class = ChessConfig
    base_model_prefix = "transformer"
    supports_gradient_checkpointing = True
    # FIX: Transformers reads the underscore-prefixed class attribute;
    # the previous `keys_to_ignore_on_load_missing` spelling was silently
    # ignored, so tied checkpoints warned about a missing lm_head.weight.
    _keys_to_ignore_on_load_missing = ["lm_head.weight"]
    _no_split_modules = ["TransformerBlock"]

    def __init__(self, config: ChessConfig):
        super().__init__(config)

        self.wte = nn.Embedding(config.vocab_size, config.n_embd)

        # learned positional embeddings only if RoPE disabled
        self.wpe = None
        if not config.use_rope:
            self.wpe = nn.Embedding(config.n_ctx, config.n_embd)

        self.drop = nn.Dropout(config.dropout)
        self.h = nn.ModuleList([TransformerBlock(config) for _ in range(config.n_layer)])

        if config.use_rmsnorm:
            self.ln_f = RMSNorm(config.n_embd, eps=config.layer_norm_epsilon)
        else:
            self.ln_f = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)

        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

        if config.tie_weights:
            # Tell save/load machinery the head shares storage with wte.
            self._tied_weights_keys = ["lm_head.weight"]

        self.post_init()

        if config.tie_weights:
            self.tie_weights()

    def get_input_embeddings(self) -> nn.Module:
        return self.wte

    def set_input_embeddings(self, new_embeddings: nn.Module):
        # Re-tie so the LM head tracks the replaced embedding matrix.
        self.wte = new_embeddings
        if getattr(self.config, "tie_weights", False):
            self.tie_weights()

    def get_output_embeddings(self) -> nn.Module:
        return self.lm_head

    def set_output_embeddings(self, new_embeddings: nn.Module):
        self.lm_head = new_embeddings

    def tie_weights(self):
        if getattr(self.config, "tie_weights", False) or getattr(self.config, "tie_word_embeddings", False):
            self._tie_or_clone_weights(self.lm_head, self.wte)

    def _init_weights(self, module: nn.Module):
        # GPT-2-style init: N(0, 0.02) weights, zero biases.
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(
        self,
        input_ids: torch.LongTensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        return_dict: Optional[bool] = None,
        **kwargs,
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        """Run the transformer; if ``labels`` is given, also compute the
        shifted cross-entropy loss (ignore_index=-100)."""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        B, T = input_ids.size()
        device = input_ids.device

        x = self.wte(input_ids)

        # Learned positions are only present when RoPE is disabled.
        if self.wpe is not None:
            if position_ids is None:
                position_ids = torch.arange(T, device=device).unsqueeze(0).expand(B, -1)
            x = x + self.wpe(position_ids)

        x = self.drop(x)

        for block in self.h:
            x = block(x, attention_mask=attention_mask)

        x = self.ln_f(x)
        logits = self.lm_head(x)

        loss = None
        if labels is not None:
            # Predict token t+1 from positions <= t.
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            loss_fct = nn.CrossEntropyLoss(ignore_index=-100)
            loss = loss_fct(
                shift_logits.view(-1, shift_logits.size(-1)),
                shift_labels.view(-1),
            )

        if not return_dict:
            output = (logits,)
            return ((loss,) + output) if loss is not None else output

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=None,
            hidden_states=None,
            attentions=None,
        )

    @torch.no_grad()
    def generate_move(
        self,
        input_ids: torch.LongTensor,
        temperature: float = 0.7,
        top_k: Optional[int] = 50,
        top_p: Optional[float] = None,
    ) -> int:
        """Sample one next-token id with temperature / top-k / top-p filtering.

        NOTE: switches the module to eval mode and leaves it there.
        """
        self.eval()

        outputs = self(input_ids)
        # Clamp temperature away from zero to avoid division blow-up.
        logits = outputs.logits[:, -1, :] / max(float(temperature), 1e-6)

        if top_k is not None and top_k > 0:
            k = min(int(top_k), logits.size(-1))
            thresh = torch.topk(logits, k)[0][..., -1, None]
            logits = logits.masked_fill(logits < thresh, torch.finfo(logits.dtype).min)

        if top_p is not None:
            # Nucleus filtering: drop the tail whose cumulative prob > top_p,
            # always keeping at least the most likely token.
            sorted_logits, sorted_indices = torch.sort(logits, descending=True)
            probs = F.softmax(sorted_logits, dim=-1)
            cum = torch.cumsum(probs, dim=-1)
            to_remove = cum > float(top_p)
            to_remove[..., 1:] = to_remove[..., :-1].clone()
            to_remove[..., 0] = 0
            indices_to_remove = to_remove.scatter(dim=-1, index=sorted_indices, src=to_remove)
            logits = logits.masked_fill(indices_to_remove, torch.finfo(logits.dtype).min)

        probs = F.softmax(logits, dim=-1)
        next_token = torch.multinomial(probs, num_samples=1)
        return int(next_token.item())
440
+
441
+
442
# Register the model with Auto classes so that
# AutoModelForCausalLM.from_pretrained(..., trust_remote_code=True)
# resolves model_type "chess_transformer" to the classes in this module.
from transformers import AutoConfig, AutoModelForCausalLM

AutoConfig.register("chess_transformer", ChessConfig)
AutoModelForCausalLM.register(ChessConfig, ChessForCausalLM)
src/tokenizer.py ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Decomposed Chess Tokenizer for the Chess Challenge.
3
+
4
+ Each move becomes 3 or 4 tokens:
5
+ WP e2_f e4_t
6
+ BN g8_f f6_t
7
+ Promotion adds an extra token:
8
+ WP e7_f e8_t =q
9
+
10
+ Why this helps:
11
+ - Fixed small vocab (~150 tokens)
12
+ - Near-zero OOV / UNK, so the evaluator can always parse squares
13
+ - Compatible with the provided evaluate.py (it auto-detects 'decomposed')
14
+
15
+ Special tokens behavior:
16
+ - Adds BOS only (NO EOS)
17
+ - If BOS already present, does not add it twice
18
+ """
19
+
20
+ from __future__ import annotations
21
+
22
+ import json
23
+ import os
24
+ from typing import Dict, List, Optional
25
+
26
+ from transformers import PreTrainedTokenizer
27
+
28
+
29
class ChessTokenizer(PreTrainedTokenizer):
    """Fixed-vocabulary tokenizer that decomposes each extended-UCI move
    into piece / from-square / to-square tokens (plus optional promotion).

    Vocabulary layout: 4 specials + 12 piece tokens + 64 "<sq>_f" +
    64 "<sq>_t" + 4 promotion tokens (~148 total). BOS is added on encode;
    EOS is kept in the vocab for compatibility but never auto-added.
    """

    model_input_names = ["input_ids", "attention_mask"]
    vocab_files_names = {"vocab_file": "vocab.json"}

    PAD_TOKEN = "[PAD]"
    BOS_TOKEN = "[BOS]"
    EOS_TOKEN = "[EOS]"  # kept for compatibility, not auto-added
    UNK_TOKEN = "[UNK]"

    def __init__(
        self,
        vocab_file: Optional[str] = None,
        vocab: Optional[Dict[str, int]] = None,
        **kwargs,
    ):
        # Vocabulary source priority: explicit dict > vocab.json file >
        # freshly built fixed vocabulary.
        self._pad_token = self.PAD_TOKEN
        self._bos_token = self.BOS_TOKEN
        self._eos_token = self.EOS_TOKEN
        self._unk_token = self.UNK_TOKEN

        # avoid duplicates from kwargs
        kwargs.pop("pad_token", None)
        kwargs.pop("bos_token", None)
        kwargs.pop("eos_token", None)
        kwargs.pop("unk_token", None)

        if vocab is not None:
            self._vocab = vocab
        elif vocab_file is not None and os.path.exists(vocab_file):
            with open(vocab_file, "r", encoding="utf-8") as f:
                self._vocab = json.load(f)
        else:
            self._vocab = self._build_fixed_vocab()

        # Reverse map must exist before the base-class __init__ queries it.
        self._ids_to_tokens = {v: k for k, v in self._vocab.items()}

        super().__init__(
            pad_token=self._pad_token,
            bos_token=self._bos_token,
            eos_token=self._eos_token,
            unk_token=self._unk_token,
            **kwargs,
        )

    # --------------------------
    # Fixed vocab: pieces + squares + promos
    # --------------------------
    @staticmethod
    def _all_squares() -> List[str]:
        """Return the 64 square names a1..h8, rank-major."""
        files = "abcdefgh"
        ranks = "12345678"
        return [f + r for r in ranks for f in files]  # a1..h8

    def _build_fixed_vocab(self) -> Dict[str, int]:
        """Build the deterministic token->id map (specials first, id 0 = PAD)."""
        special = [self.PAD_TOKEN, self.BOS_TOKEN, self.EOS_TOKEN, self.UNK_TOKEN]

        # piece tokens: WP..WK, BP..BK
        piece_tokens = [f"{c}{p}" for c in "WB" for p in "PNBRQK"]

        squares = self._all_squares()
        from_tokens = [f"{sq}_f" for sq in squares]
        to_tokens = [f"{sq}_t" for sq in squares]

        promo_tokens = ["=q", "=r", "=b", "=n"]

        tokens = special + piece_tokens + from_tokens + to_tokens + promo_tokens
        return {tok: i for i, tok in enumerate(tokens)}

    # --------------------------
    # Special tokens handling (robust with evaluate.py)
    # --------------------------
    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """Prepend BOS exactly once; never append EOS."""
        # BOS only, NO EOS
        if token_ids_1 is not None:
            token_ids_0 = token_ids_0 + token_ids_1

        if token_ids_0 and token_ids_0[0] == self.bos_token_id:
            return token_ids_0
        return [self.bos_token_id] + token_ids_0

    def get_special_tokens_mask(
        self,
        token_ids_0: List[int],
        token_ids_1: Optional[List[int]] = None,
        already_has_special_tokens: bool = False,
    ) -> List[int]:
        """Return 1 for special-token positions, 0 for regular tokens."""
        if already_has_special_tokens:
            specials = {self.pad_token_id, self.bos_token_id, self.eos_token_id, self.unk_token_id}
            return [1 if t in specials else 0 for t in token_ids_0]

        # The leading 1 accounts for the BOS added on encode.
        if token_ids_1 is None:
            return [1] + [0] * len(token_ids_0)
        return [1] + [0] * (len(token_ids_0) + len(token_ids_1))

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """Single-segment model: all token type ids are 0 (incl. the BOS slot)."""
        if token_ids_1 is None:
            return [0] * (len(token_ids_0) + 1)
        return [0] * (len(token_ids_0) + len(token_ids_1) + 1)

    # --------------------------
    # Tokenization
    # --------------------------
    def _tokenize(self, text: str) -> List[str]:
        """Split whitespace-separated moves, decomposing extended-UCI moves
        (e.g. "WPe2e4") into piece / from / to (+ promotion) tokens.

        Already-decomposed tokens and literal special tokens pass through;
        anything unparseable maps to [UNK].
        """
        if not text or not text.strip():
            return []

        parts = text.strip().split()
        out: List[str] = []

        for tok in parts:
            # allow literal special tokens present in text
            if tok in {self.PAD_TOKEN, self.BOS_TOKEN, self.EOS_TOKEN, self.UNK_TOKEN}:
                out.append(tok)
                continue

            # already decomposed tokens
            if (len(tok) == 2 and tok[0] in "WB" and tok[1] in "PNBRQK") or tok.endswith("_f") or tok.endswith("_t") or tok in {"=q", "=r", "=b", "=n"}:
                out.append(tok)
                continue

            # parse extended UCI (dataset): WPe2e4, BNg8f6(x), WPe7e8=Q(+), ...
            if len(tok) < 6:
                out.append(self.UNK_TOKEN)
                continue

            color = tok[0]
            piece = tok[1]
            from_sq = tok[2:4]
            to_sq = tok[4:6]

            out.append(f"{color}{piece}")
            out.append(f"{from_sq}_f")
            out.append(f"{to_sq}_t")

            # promotion like "=Q"
            if "=" in tok:
                try:
                    promo_part = tok.split("=", 1)[1]
                    promo_letter = promo_part[0].lower()
                    promo_tok = f"={promo_letter}"
                    if promo_tok in self._vocab:
                        out.append(promo_tok)
                except Exception:
                    # Malformed promotion suffix: emit the move without it.
                    pass

        return out

    def _convert_token_to_id(self, token: str) -> int:
        """Map a token to its id, falling back to the [UNK] id."""
        return self._vocab.get(token, self._vocab[self.UNK_TOKEN])

    def _convert_id_to_token(self, index: int) -> str:
        """Map an id back to its token, falling back to [UNK]."""
        return self._ids_to_tokens.get(index, self.UNK_TOKEN)

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        """Join decomposed tokens with single spaces."""
        return " ".join(tokens)

    # --------------------------
    # Vocab I/O
    # --------------------------
    @property
    def vocab_size(self) -> int:
        return len(self._vocab)

    def get_vocab(self) -> Dict[str, int]:
        # Return a copy so callers cannot mutate the internal map.
        return dict(self._vocab)

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple:
        """Write vocab.json into ``save_directory``; returns the file path tuple."""
        os.makedirs(save_directory, exist_ok=True)
        vocab_file = os.path.join(
            save_directory,
            (filename_prefix + "-" if filename_prefix else "") + "vocab.json",
        )
        with open(vocab_file, "w", encoding="utf-8") as f:
            json.dump(self._vocab, f, ensure_ascii=False, indent=2)
        return (vocab_file,)
src/train.py ADDED
@@ -0,0 +1,268 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Training script for the Chess Challenge.
3
+
4
+ GPU-optimized version (still compatible with older transformers/accelerate):
5
+ - Uses fp16/bf16 automatically on GPU
6
+ - Uses evaluation + saving per EPOCH by default (much faster than steps)
7
+ - Enables dataloader_num_workers + pin_memory on GPU
8
+ - Optional torch.compile for speed (safe-guarded)
9
+ - Keeps your robust TrainingArguments compatibility (evaluation_strategy vs eval_strategy)
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import argparse
15
+ import os
16
+ import warnings
17
+ from pathlib import Path
18
+
19
+ warnings.filterwarnings("ignore", message="'return' in a 'finally' block")
20
+
21
+ import torch
22
+ from transformers import Trainer, TrainingArguments, set_seed
23
+
24
+ from src.data import ChessDataCollator, create_train_val_datasets
25
+ from src.model import ChessConfig, ChessForCausalLM
26
+ from src.tokenizer import ChessTokenizer
27
+ from src.utils import count_parameters, print_parameter_budget
28
+
29
+
30
def parse_args():
    """Build and parse the CLI for model, data, training, and speed options."""
    parser = argparse.ArgumentParser(description="Train a chess-playing language model")

    # ---------------- Model ----------------
    parser.add_argument("--n_embd", type=int, default=128, help="Embedding dimension")
    parser.add_argument("--n_layer", type=int, default=6, help="Number of transformer layers")
    parser.add_argument("--n_head", type=int, default=8, help="Number of attention heads")
    # For speed on GPU, 256 is often a great default; override via CLI if needed.
    parser.add_argument("--n_ctx", type=int, default=256, help="Maximum context length")

    parser.add_argument("--n_inner", type=int, default=248, help="MLP hidden size (SwiGLU: h)")
    parser.add_argument("--dropout", type=float, default=0.05, help="Dropout probability")
    parser.add_argument("--no_tie_weights", action="store_true", help="Disable weight tying")

    # Flags matching the improved model.py (all default to off).
    parser.add_argument("--use_rope", action="store_true", help="Use RoPE (recommended)")
    parser.add_argument("--mlp_type", type=str, default="swiglu", choices=["swiglu", "gelu"], help="MLP type")
    parser.add_argument("--use_rmsnorm", action="store_true", help="Use RMSNorm (recommended)")

    # ---------------- Data ----------------
    parser.add_argument("--dataset_name", type=str, default="dlouapre/lichess_2025-01_1M")
    parser.add_argument("--max_train_samples", type=int, default=None, help="Optional cap for train samples")
    parser.add_argument("--val_samples", type=int, default=5000)

    parser.add_argument(
        "--tokenizer_dir",
        type=str,
        default="./tokenizer_cache",
        help="Where to save/load the tokenizer (vocab.json)",
    )

    # ---------------- Training ----------------
    parser.add_argument("--output_dir", type=str, default="./output")
    parser.add_argument("--num_train_epochs", type=int, default=3)

    # For speed: prefer larger batch and smaller accumulation.
    parser.add_argument("--per_device_train_batch_size", type=int, default=64)
    parser.add_argument("--per_device_eval_batch_size", type=int, default=128)
    parser.add_argument("--gradient_accumulation_steps", type=int, default=1)

    parser.add_argument("--learning_rate", type=float, default=3e-4)
    parser.add_argument("--weight_decay", type=float, default=0.1)
    parser.add_argument("--warmup_steps", type=int, default=300)

    parser.add_argument("--seed", type=int, default=42)

    # ---------------- Logging / Save ----------------
    parser.add_argument("--logging_steps", type=int, default=50)

    # Eval/save per epoch by default (much faster); steps mode still available.
    parser.add_argument("--eval_strategy", type=str, default="epoch", choices=["epoch", "steps"], help="Evaluation strategy")
    parser.add_argument("--save_strategy", type=str, default="epoch", choices=["epoch", "steps"], help="Save strategy")
    parser.add_argument("--eval_steps", type=int, default=1000, help="Only used if eval_strategy=steps")
    parser.add_argument("--save_steps", type=int, default=1000, help="Only used if save_strategy=steps")

    # ---------------- Speed knobs ----------------
    parser.add_argument("--dataloader_num_workers", type=int, default=2, help="CPU workers for dataloader")
    parser.add_argument("--torch_compile", action="store_true", help="Enable torch.compile on GPU (can speed up)")

    return parser.parse_args()
90
+
91
+
92
def load_or_create_tokenizer(args) -> ChessTokenizer:
    """Load the cached fixed-vocab tokenizer, or build and cache a new one."""
    cache_dir = Path(args.tokenizer_dir)
    cache_dir.mkdir(parents=True, exist_ok=True)

    vocab_path = cache_dir / "vocab.json"
    if vocab_path.exists():
        print(f"Loading tokenizer from {cache_dir} ...")
        return ChessTokenizer(vocab_file=str(vocab_path))

    # No cached vocabulary yet: build the deterministic one and persist it.
    print("Creating fixed-vocab tokenizer (decomposed) ...")
    tokenizer = ChessTokenizer()
    tokenizer.save_pretrained(str(cache_dir))
    print(f"Tokenizer saved to {cache_dir} (vocab_size={tokenizer.vocab_size})")
    return tokenizer
106
+
107
+
108
def _make_training_args(args) -> TrainingArguments:
    """
    Compatibility layer for transformers versions:
    - some use evaluation_strategy, others use eval_strategy
    - we keep it robust while using faster defaults (epoch eval/save).
    """
    # Mixed precision: prefer bf16 where the GPU supports it, else fp16.
    use_gpu = torch.cuda.is_available()
    use_bf16 = bool(use_gpu and torch.cuda.is_bf16_supported())
    use_fp16 = bool(use_gpu and not use_bf16)

    # Arguments shared by both TrainingArguments spellings below.
    common = dict(
        output_dir=args.output_dir,
        num_train_epochs=args.num_train_epochs,

        per_device_train_batch_size=args.per_device_train_batch_size,
        per_device_eval_batch_size=args.per_device_eval_batch_size,
        gradient_accumulation_steps=args.gradient_accumulation_steps,

        learning_rate=args.learning_rate,
        weight_decay=args.weight_decay,
        warmup_steps=args.warmup_steps,
        lr_scheduler_type="cosine",

        max_grad_norm=1.0,

        logging_dir=os.path.join(args.output_dir, "logs"),
        logging_steps=args.logging_steps,

        save_total_limit=2,
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,

        seed=args.seed,
        report_to=["none"],

        # Mixed precision for GPU speed
        fp16=use_fp16,
        bf16=use_bf16,

        # DataLoader perf
        dataloader_num_workers=args.dataloader_num_workers,
        dataloader_pin_memory=use_gpu,

        # Important for custom batches
        remove_unused_columns=False,
    )

    # Build kwargs depending on epoch vs steps
    eval_kwargs = {}
    if args.eval_strategy == "steps":
        eval_kwargs["eval_steps"] = args.eval_steps
    save_kwargs = {}
    if args.save_strategy == "steps":
        save_kwargs["save_steps"] = args.save_steps

    # Try standard HF arg names first
    try:
        return TrainingArguments(
            **common,
            evaluation_strategy=args.eval_strategy,
            save_strategy=args.save_strategy,
            **eval_kwargs,
            **save_kwargs,
        )
    except TypeError:
        # Fallback for forks/older variants that renamed args
        return TrainingArguments(
            **common,
            eval_strategy=args.eval_strategy,
            save_strategy=args.save_strategy,
            **eval_kwargs,
            **save_kwargs,
        )
182
+
183
+
184
def main():
    """End-to-end training entry point: tokenizer -> config -> model -> Trainer."""
    args = parse_args()
    set_seed(args.seed)

    print("=" * 60)
    print("CHESS CHALLENGE - TRAINING")
    print("=" * 60)

    tokenizer = load_or_create_tokenizer(args)
    actual_vocab_size = tokenizer.vocab_size
    print(f" Vocab size used: {actual_vocab_size}")

    print("\nCreating model configuration...")
    # The config's vocab size comes from the tokenizer, not the CLI.
    config = ChessConfig(
        vocab_size=actual_vocab_size,
        n_embd=args.n_embd,
        n_layer=args.n_layer,
        n_head=args.n_head,
        n_ctx=args.n_ctx,
        n_inner=args.n_inner,
        dropout=args.dropout,
        tie_weights=not args.no_tie_weights,
        pad_token_id=tokenizer.pad_token_id,
        bos_token_id=tokenizer.bos_token_id,
        eos_token_id=tokenizer.eos_token_id,
        use_rope=bool(args.use_rope),
        mlp_type=args.mlp_type,
        use_rmsnorm=bool(args.use_rmsnorm),
    )

    print_parameter_budget(config)

    print("\nCreating model...")
    model = ChessForCausalLM(config)

    # Optional torch.compile (GPU only); failures fall back to eager mode.
    if args.torch_compile and torch.cuda.is_available():
        try:
            model = torch.compile(model)
            print("✓ torch.compile enabled")
        except Exception as e:
            print(f"WARNING: torch.compile failed ({e}). Continuing without it.")

    n_params = count_parameters(model)
    print(f" Total parameters: {n_params:,}")
    print("✓ Model is within 1M parameter limit" if n_params <= 1_000_000 else "WARNING: Model exceeds 1M!")

    print("\nLoading datasets...")
    train_dataset, val_dataset = create_train_val_datasets(
        tokenizer=tokenizer,
        dataset_name=args.dataset_name,
        max_length=args.n_ctx,
        train_samples=args.max_train_samples,
        val_samples=args.val_samples,
    )
    print(f" Training samples: {len(train_dataset):,}")
    print(f" Validation samples: {len(val_dataset):,}")

    data_collator = ChessDataCollator(tokenizer, max_length=args.n_ctx)

    training_args = _make_training_args(args)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        data_collator=data_collator,
        tokenizer=tokenizer,
    )

    print("\nStarting training...")
    trainer.train()

    # Persist model and tokenizer together so the folder is self-contained.
    out_final = os.path.join(args.output_dir, "final_model")
    print("\nSaving final model...")
    trainer.save_model(out_final)
    tokenizer.save_pretrained(out_final)

    print("\nTraining complete!")
    print(f" Model saved to: {out_final}")


if __name__ == "__main__":
    main()
src/utils.py ADDED
@@ -0,0 +1,369 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Utility functions for the Chess Challenge.
3
+
4
+ This module provides helper functions for:
5
+ - Parameter counting and budget analysis (including RoPE / SwiGLU / RMSNorm variants)
6
+ - Move validation and conversion with python-chess
7
+ - Optional: compute legal-move rate over a whole game string
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import re
13
+ from typing import Dict, Optional, TYPE_CHECKING
14
+
15
+ import torch.nn as nn
16
+
17
+ if TYPE_CHECKING:
18
+ from src.model import ChessConfig
19
+
20
+
21
+ # =========================
22
+ # Parameter counting
23
+ # =========================
24
+
25
def count_parameters(model: nn.Module, trainable_only: bool = True) -> int:
    """
    Count the number of parameters in a model.

    Args:
        model: The PyTorch model.
        trainable_only: If True, only count parameters with ``requires_grad=True``.

    Returns:
        Total number of (trainable) parameters.
    """
    params = model.parameters()
    if trainable_only:
        # Frozen parameters (requires_grad=False) are excluded from the budget.
        params = (p for p in params if p.requires_grad)
    return sum(p.numel() for p in params)
39
+
40
+
41
def count_parameters_by_component(model: nn.Module) -> Dict[str, int]:
    """
    Count parameters broken down by leaf modules.

    Args:
        model: The PyTorch model.

    Returns:
        Dictionary mapping leaf-module names to their parameter counts
        (modules with zero parameters are omitted).
    """
    breakdown: Dict[str, int] = {}
    for name, module in model.named_modules():
        # Only leaf modules are counted, so every parameter is
        # attributed exactly once (no double counting via parents).
        if next(module.children(), None) is not None:
            continue
        n_params = sum(p.numel() for p in module.parameters(recurse=False))
        if n_params > 0:
            breakdown[name] = n_params
    return breakdown
58
+
59
+
60
def estimate_parameters(config: "ChessConfig") -> Dict[str, int]:
    """
    Estimate parameter count for a configuration.

    Works for:
      - learned position embeddings (wpe) or RoPE (no positional params)
      - GELU FFN (d -> n_inner -> d)
      - SwiGLU FFN (d -> 2h, h -> d) where h = n_inner
      - LayerNorm (weight+bias) vs RMSNorm (weight only)
      - tied or untied LM head

    NOTE: This is an estimate of *weights + biases* for the common
    implementation patterns used in this repo.
    """
    vocab = int(config.vocab_size)
    dim = int(config.n_embd)
    n_layers = int(config.n_layer)
    ctx = int(config.n_ctx)
    inner = int(config.n_inner)

    use_rope = bool(getattr(config, "use_rope", False))
    use_rmsnorm = bool(getattr(config, "use_rmsnorm", False))
    mlp_type = str(getattr(config, "mlp_type", "gelu")).lower()
    tied = bool(getattr(config, "tie_weights", True))

    # Embeddings: RoPE computes positions analytically, so no learned table.
    token_embeddings = vocab * dim
    position_embeddings = 0 if use_rope else ctx * dim

    # Attention per layer:
    #   fused QKV projection d -> 3d (weight 3d*d, bias 3d)
    #   output projection    d -> d  (weight d*d,  bias d)
    attn_qkv_per_layer = 3 * dim * dim + 3 * dim
    attn_proj_per_layer = dim * dim + dim

    # FFN per layer.
    if mlp_type == "swiglu":
        # gate+up fused: d -> 2h (weight 2h*d, bias 2h); down: h -> d (weight d*h, bias d)
        ffn_per_layer = (2 * inner * dim + 2 * inner) + (dim * inner + dim)
    else:
        # classic GELU MLP: d -> n_inner -> d, both with biases
        ffn_per_layer = (dim * inner + inner) + (inner * dim + dim)

    # Norms: LayerNorm carries weight+bias (2d); RMSNorm only weight (d).
    norm_params = dim if use_rmsnorm else 2 * dim
    norms_per_layer = 2 * norm_params  # pre-attention + pre-FFN
    final_norm = norm_params

    per_layer = attn_qkv_per_layer + attn_proj_per_layer + ffn_per_layer + norms_per_layer
    total_transformer_layers = n_layers * per_layer

    # LM head: Linear(d, V, bias=False) when untied; tied reuses token embeddings.
    lm_head = 0 if tied else vocab * dim

    total = (
        token_embeddings
        + position_embeddings
        + total_transformer_layers
        + final_norm
        + lm_head
    )

    return {
        "token_embeddings": token_embeddings,
        "position_embeddings": position_embeddings,
        "attention_qkv_per_layer": attn_qkv_per_layer,
        "attention_proj_per_layer": attn_proj_per_layer,
        "ffn_per_layer": ffn_per_layer,
        "norms_per_layer": norms_per_layer,
        "final_norm": final_norm,
        "total_transformer_layers": total_transformer_layers,
        "lm_head": lm_head,
        "total": total,
        "notes": {
            "use_rope": use_rope,
            "use_rmsnorm": use_rmsnorm,
            "mlp_type": mlp_type,
            "tie_weights": tied,
        },
    }
139
+
140
+
141
def print_parameter_budget(config: "ChessConfig", limit: int = 1_000_000) -> None:
    """
    Print a formatted parameter budget analysis.

    Delegates the arithmetic to :func:`estimate_parameters` and renders the
    breakdown plus a within-budget / over-budget verdict to stdout.

    Args:
        config: Model configuration.
        limit: Parameter limit (defaults to the challenge's 1M cap).
    """
    est = estimate_parameters(config)

    print("=" * 60)
    print("PARAMETER BUDGET ANALYSIS")
    print("=" * 60)
    print("\nConfiguration:")
    print(f" vocab_size (V) = {config.vocab_size}")
    print(f" n_embd (d) = {config.n_embd}")
    print(f" n_layer (L) = {config.n_layer}")
    print(f" n_head = {config.n_head}")
    print(f" n_ctx = {config.n_ctx}")
    print(f" n_inner = {config.n_inner}")
    print(f" tie_weights = {getattr(config, 'tie_weights', True)}")
    # Optional architecture flags are only printed when the config defines them,
    # so the report stays valid for older/simpler configs.
    if hasattr(config, "use_rope"):
        print(f" use_rope = {getattr(config, 'use_rope', False)}")
    if hasattr(config, "mlp_type"):
        print(f" mlp_type = {getattr(config, 'mlp_type', 'gelu')}")
    if hasattr(config, "use_rmsnorm"):
        print(f" use_rmsnorm = {getattr(config, 'use_rmsnorm', False)}")

    print("\nParameter Breakdown (estimate):")
    print(f" Token Embeddings: {est['token_embeddings']:>10,}")
    print(f" Position Embeddings: {est['position_embeddings']:>10,}")
    print(f" Transformer Layers: {est['total_transformer_layers']:>10,}")
    print(f" Final Norm: {est['final_norm']:>10,}")
    # A tied head shares the token-embedding matrix, so it adds no parameters.
    if getattr(config, "tie_weights", True):
        print(f" LM Head: {'(tied)':>10}")
    else:
        print(f" LM Head: {est['lm_head']:>10,}")

    print(" " + "-" * 32)
    print(f" TOTAL: {est['total']:>10,}")

    # remaining < 0 means the configuration is over budget.
    remaining = limit - est["total"]
    print("\nBudget Status:")
    print(f" Limit: {limit:>10,}")
    print(f" Used: {est['total']:>10,}")
    print(f" Remaining: {remaining:>10,}")

    if est["total"] <= limit:
        print(f"\n✓ Within budget! ({est['total'] / limit * 100:.1f}% used)")
    else:
        print(f"\n✗ OVER BUDGET by {-remaining:,} parameters!")
    print("=" * 60)
193
+
194
+
195
+ # =========================
196
+ # Move conversion / validation (python-chess)
197
+ # =========================
198
+
199
def convert_extended_uci_to_uci(move: str) -> str:
    """
    Convert extended UCI format to standard UCI format.

    Extended UCI format (dataset):
        [W|B][Piece][from_sq][to_sq][suffixes...]
        e.g. "WPe2e4", "BNg8f6(x)", "WKe1g1(o)", "WPe7e8=Q(+)"
    Standard UCI:
        "e2e4", "g8f6", "e1g1", "e7e8q"
    """
    # Anything shorter than color+piece+two squares is passed through untouched.
    if len(move) < 6:
        return move

    source = move[2:4]
    target = move[4:6]

    # A promotion is encoded as "=X" somewhere after the squares;
    # standard UCI wants the lowercase piece letter appended.
    promo = ""
    eq_pos = move.find("=")
    if eq_pos != -1 and eq_pos + 1 < len(move):
        promo = move[eq_pos + 1].lower()

    return source + target + promo
222
+
223
+
224
def validate_move_with_chess(move: str, board_fen: Optional[str] = None) -> bool:
    """
    Validate a single move using python-chess against a given board state.

    IMPORTANT:
        - If board_fen is None, validation is against the initial position.
          For validating a *game*, use `legal_rate_game_text` which advances
          the board after every move.

    Args:
        move: Move in extended UCI format.
        board_fen: FEN string of the current board (optional).

    Returns:
        True if move is legal on that board, else False.
    """
    try:
        import chess
    except ImportError:
        raise ImportError(
            "python-chess is required for move validation. Install it with: pip install python-chess"
        )

    # Extended-UCI moves carry at least color+piece+from+to (6 chars).
    if len(move) < 6:
        return False

    board = chess.Board(board_fen) if board_fen else chess.Board()

    try:
        candidate = chess.Move.from_uci(convert_extended_uci_to_uci(move))
        return candidate in board.legal_moves
    except Exception:
        # Unparseable UCI (bad squares, bad promotion letter, ...) is illegal.
        return False
257
+
258
+
259
def legal_rate_game_text(game_text: str, stop_on_illegal: bool = True) -> float:
    """
    Compute the fraction of legal moves in a space-separated extended-UCI game string.

    Args:
        game_text: "WPe2e4 BPe7e5 ..." (space-separated moves)
        stop_on_illegal: If True, stop at first illegal move.

    Returns:
        legal / total (total is moves processed, or total moves if stop_on_illegal=False)
    """
    try:
        import chess
    except ImportError:
        raise ImportError("python-chess is required. Install it with: pip install python-chess")

    tokens = game_text.strip().split()
    if not tokens:
        return 0.0

    board = chess.Board()
    n_legal = 0
    n_seen = 0

    for raw in tokens:
        n_seen += 1

        # Unparseable UCI counts as illegal.
        try:
            parsed = chess.Move.from_uci(convert_extended_uci_to_uci(raw))
        except Exception:
            parsed = None

        if parsed is not None and parsed in board.legal_moves:
            n_legal += 1
            # Advance the position so later moves are checked in context.
            board.push(parsed)
        elif stop_on_illegal:
            break

    # n_seen >= 1 here, but guard against division by zero defensively.
    return n_legal / max(n_seen, 1)
301
+
302
+
303
def convert_uci_to_extended(uci_move: str, board_fen: str) -> str:
    """
    Convert standard UCI move to extended UCI format used by the dataset.

    Args:
        uci_move: e.g., "e2e4", "e7e8q", "e1g1"
        board_fen: FEN of current board (must match move)

    Returns:
        Extended UCI like "WPe2e4", with suffixes:
            - (x) capture
            - (+) check
            - (+*) checkmate
            - (x+) capture+check
            - (x+*) capture+checkmate
            - (o) / (O) castling
            - promotions as "=Q" etc
    """
    try:
        import chess
    except ImportError:
        raise ImportError("python-chess is required for move conversion. Install it with: pip install python-chess")

    board = chess.Board(board_fen)
    move = chess.Move.from_uci(uci_move)

    # The side to move determines the color prefix.
    color = "W" if board.turn == chess.WHITE else "B"

    # Piece letter comes from the origin square; fall back to "P" defensively
    # if the board and move disagree (should not happen for legal moves).
    piece = board.piece_at(move.from_square)
    piece_letter = piece.symbol().upper() if piece else "P"

    from_sq = chess.square_name(move.from_square)
    to_sq = chess.square_name(move.to_square)

    result = f"{color}{piece_letter}{from_sq}{to_sq}"

    # Promotion (dataset uses uppercase, e.g. "=Q")
    if move.promotion:
        result += f"={chess.piece_symbol(move.promotion).upper()}"

    # Capture suffix — evaluated on the pre-move board (covers en passant).
    if board.is_capture(move):
        result += "(x)"

    # Check / mate suffix: the move must be pushed to inspect the resulting
    # position; pop() restores the board for the castling test below.
    board.push(move)
    if board.is_checkmate():
        # Merge with an existing capture marker into the combined suffix.
        if "(x)" in result:
            result = result.replace("(x)", "(x+*)")
        else:
            result += "(+*)"
    elif board.is_check():
        if "(x)" in result:
            result = result.replace("(x)", "(x+)")
        else:
            result += "(+)"
    board.pop()

    # Castling (dataset wants (o)/(O), usually no other suffix with it)
    if board.is_castling(move):
        result = re.sub(r"\([^)]*\)", "", result)  # drop any (...) suffix
        if move.to_square in [chess.G1, chess.G8]:
            result += "(o)"  # kingside
        else:
            result += "(O)"  # queenside

    return result
tokenizer.py ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Decomposed Chess Tokenizer for the Chess Challenge.
3
+
4
+ Each move becomes 3 or 4 tokens:
5
+ WP e2_f e4_t
6
+ BN g8_f f6_t
7
+ Promotion adds an extra token:
8
+ WP e7_f e8_t =q
9
+
10
+ Why this helps:
11
+ - Fixed small vocab (~150 tokens)
12
+ - Near-zero OOV / UNK, so the evaluator can always parse squares
13
+ - Compatible with the provided evaluate.py (it auto-detects 'decomposed')
14
+
15
+ Special tokens behavior:
16
+ - Adds BOS only (NO EOS)
17
+ - If BOS already present, does not add it twice
18
+ """
19
+
20
+ from __future__ import annotations
21
+
22
+ import json
23
+ import os
24
+ from typing import Dict, List, Optional
25
+
26
+ from transformers import PreTrainedTokenizer
27
+
28
+
29
class ChessTokenizer(PreTrainedTokenizer):
    """Word-level tokenizer that decomposes each extended-UCI chess move.

    A move such as ``WPe2e4`` becomes three tokens ``WP e2_f e4_t``; a
    promotion such as ``WPe7e8=Q`` adds a fourth token ``=q``.  The
    vocabulary is fixed and small (~148 tokens), so well-formed games
    produce essentially no [UNK] tokens.

    Special-token behavior:
        - ``build_inputs_with_special_tokens`` adds BOS only (no EOS),
          and never duplicates an existing leading BOS.
    """

    model_input_names = ["input_ids", "attention_mask"]
    vocab_files_names = {"vocab_file": "vocab.json"}

    # Literal special-token strings; their ids are fixed by vocab order
    # ([PAD]=0, [BOS]=1, [EOS]=2, [UNK]=3 in the built-in vocab).
    PAD_TOKEN = "[PAD]"
    BOS_TOKEN = "[BOS]"
    EOS_TOKEN = "[EOS]"  # kept for compatibility, not auto-added
    UNK_TOKEN = "[UNK]"

    def __init__(
        self,
        vocab_file: Optional[str] = None,
        vocab: Optional[Dict[str, int]] = None,
        **kwargs,
    ):
        # Stash the special-token strings before the base constructor runs,
        # since it may consult them via properties.
        self._pad_token = self.PAD_TOKEN
        self._bos_token = self.BOS_TOKEN
        self._eos_token = self.EOS_TOKEN
        self._unk_token = self.UNK_TOKEN

        # avoid duplicates from kwargs (e.g. when reloaded via from_pretrained)
        kwargs.pop("pad_token", None)
        kwargs.pop("bos_token", None)
        kwargs.pop("eos_token", None)
        kwargs.pop("unk_token", None)

        # Vocabulary priority: explicit dict > vocab file on disk > built-in fixed vocab.
        if vocab is not None:
            self._vocab = vocab
        elif vocab_file is not None and os.path.exists(vocab_file):
            with open(vocab_file, "r", encoding="utf-8") as f:
                self._vocab = json.load(f)
        else:
            self._vocab = self._build_fixed_vocab()

        # Reverse mapping used for id -> token decoding.
        self._ids_to_tokens = {v: k for k, v in self._vocab.items()}

        # NOTE: _vocab must exist before super().__init__, which can call
        # get_vocab()/vocab_size while registering special tokens.
        super().__init__(
            pad_token=self._pad_token,
            bos_token=self._bos_token,
            eos_token=self._eos_token,
            unk_token=self._unk_token,
            **kwargs,
        )

    # --------------------------
    # Fixed vocab: pieces + squares + promos
    # --------------------------
    @staticmethod
    def _all_squares() -> List[str]:
        # Rank-major order (a1..h1, a2..h2, ...) — must match vocab.json.
        files = "abcdefgh"
        ranks = "12345678"
        return [f + r for r in ranks for f in files]  # a1..h8

    def _build_fixed_vocab(self) -> Dict[str, int]:
        """Build the deterministic 148-token vocabulary (order defines ids)."""
        special = [self.PAD_TOKEN, self.BOS_TOKEN, self.EOS_TOKEN, self.UNK_TOKEN]

        # piece tokens: WP..WK, BP..BK
        piece_tokens = [f"{c}{p}" for c in "WB" for p in "PNBRQK"]

        # Separate "_f"/"_t" suffixes disambiguate origin vs destination squares.
        squares = self._all_squares()
        from_tokens = [f"{sq}_f" for sq in squares]
        to_tokens = [f"{sq}_t" for sq in squares]

        promo_tokens = ["=q", "=r", "=b", "=n"]

        tokens = special + piece_tokens + from_tokens + to_tokens + promo_tokens
        return {tok: i for i, tok in enumerate(tokens)}

    # --------------------------
    # Special tokens handling (robust with evaluate.py)
    # --------------------------
    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        # BOS only, NO EOS
        if token_ids_1 is not None:
            token_ids_0 = token_ids_0 + token_ids_1

        # Avoid a double BOS when the text already started with one.
        if token_ids_0 and token_ids_0[0] == self.bos_token_id:
            return token_ids_0
        return [self.bos_token_id] + token_ids_0

    def get_special_tokens_mask(
        self,
        token_ids_0: List[int],
        token_ids_1: Optional[List[int]] = None,
        already_has_special_tokens: bool = False,
    ) -> List[int]:
        # 1 marks a special token, 0 a regular move token.
        if already_has_special_tokens:
            specials = {self.pad_token_id, self.bos_token_id, self.eos_token_id, self.unk_token_id}
            return [1 if t in specials else 0 for t in token_ids_0]

        # Otherwise only the BOS that build_inputs... will prepend is special.
        if token_ids_1 is None:
            return [1] + [0] * len(token_ids_0)
        return [1] + [0] * (len(token_ids_0) + len(token_ids_1))

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        # Single-segment model: everything (incl. the prepended BOS) is type 0.
        if token_ids_1 is None:
            return [0] * (len(token_ids_0) + 1)
        return [0] * (len(token_ids_0) + len(token_ids_1) + 1)

    # --------------------------
    # Tokenization
    # --------------------------
    def _tokenize(self, text: str) -> List[str]:
        """Split whitespace-separated moves into decomposed sub-tokens."""
        if not text or not text.strip():
            return []

        parts = text.strip().split()
        out: List[str] = []

        for tok in parts:
            # allow literal special tokens present in text
            if tok in {self.PAD_TOKEN, self.BOS_TOKEN, self.EOS_TOKEN, self.UNK_TOKEN}:
                out.append(tok)
                continue

            # already decomposed tokens (idempotent re-tokenization)
            if (len(tok) == 2 and tok[0] in "WB" and tok[1] in "PNBRQK") or tok.endswith("_f") or tok.endswith("_t") or tok in {"=q", "=r", "=b", "=n"}:
                out.append(tok)
                continue

            # parse extended UCI (dataset): WPe2e4, BNg8f6(x), WPe7e8=Q(+), ...
            if len(tok) < 6:
                out.append(self.UNK_TOKEN)
                continue

            color = tok[0]
            piece = tok[1]
            from_sq = tok[2:4]
            to_sq = tok[4:6]

            out.append(f"{color}{piece}")
            out.append(f"{from_sq}_f")
            out.append(f"{to_sq}_t")

            # promotion like "=Q" -> lowercase promo token "=q"
            if "=" in tok:
                try:
                    promo_part = tok.split("=", 1)[1]
                    promo_letter = promo_part[0].lower()
                    promo_tok = f"={promo_letter}"
                    # Drop unknown promotion letters silently rather than
                    # emitting an out-of-vocab token.
                    if promo_tok in self._vocab:
                        out.append(promo_tok)
                except Exception:
                    pass

        return out

    def _convert_token_to_id(self, token: str) -> int:
        # Unknown tokens map to the [UNK] id.
        return self._vocab.get(token, self._vocab[self.UNK_TOKEN])

    def _convert_id_to_token(self, index: int) -> str:
        return self._ids_to_tokens.get(index, self.UNK_TOKEN)

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        # Decomposed sub-tokens are simply space-joined (no move re-assembly).
        return " ".join(tokens)

    # --------------------------
    # Vocab I/O
    # --------------------------
    @property
    def vocab_size(self) -> int:
        return len(self._vocab)

    def get_vocab(self) -> Dict[str, int]:
        # Return a copy so callers cannot mutate the internal mapping.
        return dict(self._vocab)

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple:
        """Write vocab.json to *save_directory*; returns the file path tuple."""
        os.makedirs(save_directory, exist_ok=True)
        vocab_file = os.path.join(
            save_directory,
            (filename_prefix + "-" if filename_prefix else "") + "vocab.json",
        )
        with open(vocab_file, "w", encoding="utf-8") as f:
            json.dump(self._vocab, f, ensure_ascii=False, indent=2)
        return (vocab_file,)
tokenizer_config.json ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "[BOS]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "[EOS]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "[UNK]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ }
35
+ },
36
+ "auto_map": {
37
+ "AutoTokenizer": [
38
+ "tokenizer.ChessTokenizer",
39
+ null
40
+ ]
41
+ },
42
+ "bos_token": "[BOS]",
43
+ "clean_up_tokenization_spaces": true,
44
+ "eos_token": "[EOS]",
45
+ "model_max_length": 1000000000000000019884624838656,
46
+ "pad_token": "[PAD]",
47
+ "tokenizer_class": "ChessTokenizer",
48
+ "unk_token": "[UNK]"
49
+ }
vocab.json ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "[PAD]": 0,
3
+ "[BOS]": 1,
4
+ "[EOS]": 2,
5
+ "[UNK]": 3,
6
+ "WP": 4,
7
+ "WN": 5,
8
+ "WB": 6,
9
+ "WR": 7,
10
+ "WQ": 8,
11
+ "WK": 9,
12
+ "BP": 10,
13
+ "BN": 11,
14
+ "BB": 12,
15
+ "BR": 13,
16
+ "BQ": 14,
17
+ "BK": 15,
18
+ "a1_f": 16,
19
+ "b1_f": 17,
20
+ "c1_f": 18,
21
+ "d1_f": 19,
22
+ "e1_f": 20,
23
+ "f1_f": 21,
24
+ "g1_f": 22,
25
+ "h1_f": 23,
26
+ "a2_f": 24,
27
+ "b2_f": 25,
28
+ "c2_f": 26,
29
+ "d2_f": 27,
30
+ "e2_f": 28,
31
+ "f2_f": 29,
32
+ "g2_f": 30,
33
+ "h2_f": 31,
34
+ "a3_f": 32,
35
+ "b3_f": 33,
36
+ "c3_f": 34,
37
+ "d3_f": 35,
38
+ "e3_f": 36,
39
+ "f3_f": 37,
40
+ "g3_f": 38,
41
+ "h3_f": 39,
42
+ "a4_f": 40,
43
+ "b4_f": 41,
44
+ "c4_f": 42,
45
+ "d4_f": 43,
46
+ "e4_f": 44,
47
+ "f4_f": 45,
48
+ "g4_f": 46,
49
+ "h4_f": 47,
50
+ "a5_f": 48,
51
+ "b5_f": 49,
52
+ "c5_f": 50,
53
+ "d5_f": 51,
54
+ "e5_f": 52,
55
+ "f5_f": 53,
56
+ "g5_f": 54,
57
+ "h5_f": 55,
58
+ "a6_f": 56,
59
+ "b6_f": 57,
60
+ "c6_f": 58,
61
+ "d6_f": 59,
62
+ "e6_f": 60,
63
+ "f6_f": 61,
64
+ "g6_f": 62,
65
+ "h6_f": 63,
66
+ "a7_f": 64,
67
+ "b7_f": 65,
68
+ "c7_f": 66,
69
+ "d7_f": 67,
70
+ "e7_f": 68,
71
+ "f7_f": 69,
72
+ "g7_f": 70,
73
+ "h7_f": 71,
74
+ "a8_f": 72,
75
+ "b8_f": 73,
76
+ "c8_f": 74,
77
+ "d8_f": 75,
78
+ "e8_f": 76,
79
+ "f8_f": 77,
80
+ "g8_f": 78,
81
+ "h8_f": 79,
82
+ "a1_t": 80,
83
+ "b1_t": 81,
84
+ "c1_t": 82,
85
+ "d1_t": 83,
86
+ "e1_t": 84,
87
+ "f1_t": 85,
88
+ "g1_t": 86,
89
+ "h1_t": 87,
90
+ "a2_t": 88,
91
+ "b2_t": 89,
92
+ "c2_t": 90,
93
+ "d2_t": 91,
94
+ "e2_t": 92,
95
+ "f2_t": 93,
96
+ "g2_t": 94,
97
+ "h2_t": 95,
98
+ "a3_t": 96,
99
+ "b3_t": 97,
100
+ "c3_t": 98,
101
+ "d3_t": 99,
102
+ "e3_t": 100,
103
+ "f3_t": 101,
104
+ "g3_t": 102,
105
+ "h3_t": 103,
106
+ "a4_t": 104,
107
+ "b4_t": 105,
108
+ "c4_t": 106,
109
+ "d4_t": 107,
110
+ "e4_t": 108,
111
+ "f4_t": 109,
112
+ "g4_t": 110,
113
+ "h4_t": 111,
114
+ "a5_t": 112,
115
+ "b5_t": 113,
116
+ "c5_t": 114,
117
+ "d5_t": 115,
118
+ "e5_t": 116,
119
+ "f5_t": 117,
120
+ "g5_t": 118,
121
+ "h5_t": 119,
122
+ "a6_t": 120,
123
+ "b6_t": 121,
124
+ "c6_t": 122,
125
+ "d6_t": 123,
126
+ "e6_t": 124,
127
+ "f6_t": 125,
128
+ "g6_t": 126,
129
+ "h6_t": 127,
130
+ "a7_t": 128,
131
+ "b7_t": 129,
132
+ "c7_t": 130,
133
+ "d7_t": 131,
134
+ "e7_t": 132,
135
+ "f7_t": 133,
136
+ "g7_t": 134,
137
+ "h7_t": 135,
138
+ "a8_t": 136,
139
+ "b8_t": 137,
140
+ "c8_t": 138,
141
+ "d8_t": 139,
142
+ "e8_t": 140,
143
+ "f8_t": 141,
144
+ "g8_t": 142,
145
+ "h8_t": 143,
146
+ "=q": 144,
147
+ "=r": 145,
148
+ "=b": 146,
149
+ "=n": 147
150
+ }