Chess Challenge submission by iliasslasri
Browse files- README.md +3 -3
- config.json +8 -6
- model.py +86 -4
- model.safetensors +2 -2
README.md
CHANGED
|
@@ -14,13 +14,13 @@ Chess model submitted to the LLM Course Chess Challenge.
|
|
| 14 |
## Submission Info
|
| 15 |
|
| 16 |
- **Submitted by**: [iliasslasri](https://huggingface.co/iliasslasri)
|
| 17 |
-
- **Parameters**:
|
| 18 |
- **Organization**: LLM-course
|
| 19 |
|
| 20 |
## Model Details
|
| 21 |
|
| 22 |
- **Architecture**: Chess Transformer (GPT-style)
|
| 23 |
- **Vocab size**: 75
|
| 24 |
-
- **Embedding dim**:
|
| 25 |
- **Layers**: 11
|
| 26 |
-
- **Heads**:
|
|
|
|
| 14 |
## Submission Info
|
| 15 |
|
| 16 |
- **Submitted by**: [iliasslasri](https://huggingface.co/iliasslasri)
|
| 17 |
+
- **Parameters**: 998,036
|
| 18 |
- **Organization**: LLM-course
|
| 19 |
|
| 20 |
## Model Details
|
| 21 |
|
| 22 |
- **Architecture**: Chess Transformer (GPT-style)
|
| 23 |
- **Vocab size**: 75
|
| 24 |
+
- **Embedding dim**: 96
|
| 25 |
- **Layers**: 11
|
| 26 |
+
- **Heads**: 8
|
config.json
CHANGED
|
@@ -1,9 +1,9 @@
|
|
| 1 |
{
|
| 2 |
-
"_name_or_path": "./
|
| 3 |
"architectures": [
|
| 4 |
"ChessForCausalLM"
|
| 5 |
],
|
| 6 |
-
"attn": "
|
| 7 |
"auto_map": {
|
| 8 |
"AutoConfig": "model.ChessConfig",
|
| 9 |
"AutoModelForCausalLM": "model.ChessForCausalLM"
|
|
@@ -14,12 +14,14 @@
|
|
| 14 |
"layer_norm_epsilon": 1e-05,
|
| 15 |
"model_type": "chess_transformer",
|
| 16 |
"n_ctx": 256,
|
| 17 |
-
"n_embd":
|
| 18 |
-
"n_head":
|
| 19 |
-
"n_inner":
|
| 20 |
"n_layer": 11,
|
| 21 |
-
"num_groups":
|
| 22 |
"pad_token_id": 0,
|
|
|
|
|
|
|
| 23 |
"tie_weights": false,
|
| 24 |
"tie_word_embeddings": false,
|
| 25 |
"torch_dtype": "float32",
|
|
|
|
| 1 |
{
|
| 2 |
+
"_name_or_path": "./gqa_rpe/checkpoint-311724/",
|
| 3 |
"architectures": [
|
| 4 |
"ChessForCausalLM"
|
| 5 |
],
|
| 6 |
+
"attn": "GQA",
|
| 7 |
"auto_map": {
|
| 8 |
"AutoConfig": "model.ChessConfig",
|
| 9 |
"AutoModelForCausalLM": "model.ChessForCausalLM"
|
|
|
|
| 14 |
"layer_norm_epsilon": 1e-05,
|
| 15 |
"model_type": "chess_transformer",
|
| 16 |
"n_ctx": 256,
|
| 17 |
+
"n_embd": 96,
|
| 18 |
+
"n_head": 8,
|
| 19 |
+
"n_inner": 316,
|
| 20 |
"n_layer": 11,
|
| 21 |
+
"num_groups": 4,
|
| 22 |
"pad_token_id": 0,
|
| 23 |
+
"rot_pos_emb": true,
|
| 24 |
+
"rotary_base": 10000,
|
| 25 |
"tie_weights": false,
|
| 26 |
"tie_word_embeddings": false,
|
| 27 |
"torch_dtype": "float32",
|
model.py
CHANGED
|
@@ -66,6 +66,8 @@ class ChessConfig(PretrainedConfig):
|
|
| 66 |
eos_token_id: int = 2,
|
| 67 |
attn: str = "MHA",
|
| 68 |
num_groups: int = 2,
|
|
|
|
|
|
|
| 69 |
**kwargs,
|
| 70 |
):
|
| 71 |
super().__init__(
|
|
@@ -91,6 +93,11 @@ class ChessConfig(PretrainedConfig):
|
|
| 91 |
self.attn = attn
|
| 92 |
self.num_groups = num_groups
|
| 93 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 94 |
|
| 95 |
class MultiHeadAttention(nn.Module):
|
| 96 |
"""
|
|
@@ -110,6 +117,14 @@ class MultiHeadAttention(nn.Module):
|
|
| 110 |
self.n_embd = config.n_embd
|
| 111 |
self.head_dim = config.n_embd // config.n_head
|
| 112 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 113 |
# Combined QKV projection for efficiency
|
| 114 |
self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd)
|
| 115 |
self.c_proj = nn.Linear(config.n_embd, config.n_embd)
|
|
@@ -141,6 +156,10 @@ class MultiHeadAttention(nn.Module):
|
|
| 141 |
k = k.view(batch_size, seq_len, self.n_head, self.head_dim).transpose(1, 2)
|
| 142 |
v = v.view(batch_size, seq_len, self.n_head, self.head_dim).transpose(1, 2)
|
| 143 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 144 |
# Scaled dot-product attention
|
| 145 |
attn_weights = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.head_dim)
|
| 146 |
|
|
@@ -328,8 +347,10 @@ class ChessForCausalLM(PreTrainedModel):
|
|
| 328 |
|
| 329 |
# Token and position embeddings
|
| 330 |
self.wte = nn.Embedding(config.vocab_size, config.n_embd)
|
| 331 |
-
|
| 332 |
-
|
|
|
|
|
|
|
| 333 |
self.drop = nn.Dropout(config.dropout)
|
| 334 |
|
| 335 |
# Transformer blocks
|
|
@@ -418,8 +439,11 @@ class ChessForCausalLM(PreTrainedModel):
|
|
| 418 |
|
| 419 |
# Get embeddings
|
| 420 |
token_embeds = self.wte(input_ids)
|
| 421 |
-
|
| 422 |
-
|
|
|
|
|
|
|
|
|
|
| 423 |
|
| 424 |
# Pass through transformer blocks
|
| 425 |
for block in self.h:
|
|
@@ -510,6 +534,64 @@ class ChessForCausalLM(PreTrainedModel):
|
|
| 510 |
|
| 511 |
return next_token.item()
|
| 512 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 513 |
|
| 514 |
# Register the model with Auto classes for easy loading
|
| 515 |
from transformers import AutoConfig, AutoModelForCausalLM
|
|
|
|
| 66 |
eos_token_id: int = 2,
|
| 67 |
attn: str = "MHA",
|
| 68 |
num_groups: int = 2,
|
| 69 |
+
rot_pos_emb=False,
|
| 70 |
+
rotary_base=10000,
|
| 71 |
**kwargs,
|
| 72 |
):
|
| 73 |
super().__init__(
|
|
|
|
| 93 |
self.attn = attn
|
| 94 |
self.num_groups = num_groups
|
| 95 |
|
| 96 |
+
# rot_pos_emb
|
| 97 |
+
self.rot_pos_emb = rot_pos_emb
|
| 98 |
+
self.rotary_base = rotary_base
|
| 99 |
+
|
| 100 |
+
|
| 101 |
|
| 102 |
class MultiHeadAttention(nn.Module):
|
| 103 |
"""
|
|
|
|
| 117 |
self.n_embd = config.n_embd
|
| 118 |
self.head_dim = config.n_embd // config.n_head
|
| 119 |
|
| 120 |
+
self.rot_pos_emb = config.rot_pos_emb
|
| 121 |
+
if self.rot_pos_emb:
|
| 122 |
+
self.rotary_emb = RotaryEmbedding(
|
| 123 |
+
self.head_dim,
|
| 124 |
+
max_position_embeddings=config.n_ctx,
|
| 125 |
+
base=getattr(config, 'rotary_base', 10000)
|
| 126 |
+
)
|
| 127 |
+
|
| 128 |
# Combined QKV projection for efficiency
|
| 129 |
self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd)
|
| 130 |
self.c_proj = nn.Linear(config.n_embd, config.n_embd)
|
|
|
|
| 156 |
k = k.view(batch_size, seq_len, self.n_head, self.head_dim).transpose(1, 2)
|
| 157 |
v = v.view(batch_size, seq_len, self.n_head, self.head_dim).transpose(1, 2)
|
| 158 |
|
| 159 |
+
if self.rot_pos_emb:
|
| 160 |
+
cos, sin = self.rotary_emb(v, seq_len=seq_len)
|
| 161 |
+
q, k = apply_rotary_pos_emb(q, k, cos, sin)
|
| 162 |
+
|
| 163 |
# Scaled dot-product attention
|
| 164 |
attn_weights = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.head_dim)
|
| 165 |
|
|
|
|
| 347 |
|
| 348 |
# Token and position embeddings
|
| 349 |
self.wte = nn.Embedding(config.vocab_size, config.n_embd)
|
| 350 |
+
if not config.rot_pos_emb:
|
| 351 |
+
self.wpe = nn.Embedding(config.n_ctx, config.n_embd)
|
| 352 |
+
self.rot_pos_emb = config.rot_pos_emb
|
| 353 |
+
|
| 354 |
self.drop = nn.Dropout(config.dropout)
|
| 355 |
|
| 356 |
# Transformer blocks
|
|
|
|
| 439 |
|
| 440 |
# Get embeddings
|
| 441 |
token_embeds = self.wte(input_ids)
|
| 442 |
+
if not self.rot_pos_emb:
|
| 443 |
+
position_embeds = self.wpe(position_ids)
|
| 444 |
+
hidden_states = self.drop(token_embeds + position_embeds)
|
| 445 |
+
else:
|
| 446 |
+
hidden_states = self.drop(token_embeds)
|
| 447 |
|
| 448 |
# Pass through transformer blocks
|
| 449 |
for block in self.h:
|
|
|
|
| 534 |
|
| 535 |
return next_token.item()
|
| 536 |
|
| 537 |
+
class RotaryEmbedding(nn.Module):
    """Rotary position embedding (RoPE), LLaMA-style duplicated-frequency layout.

    Precomputes cos/sin tables for positions up to ``max_position_embeddings``
    and regrows the cache on demand when a longer sequence is requested.

    Args:
        dim: per-head dimension the rotation spans; must be even.
        max_position_embeddings: initial length of the cos/sin cache.
        base: frequency base (10000, as in the original RoPE formulation).
        device: optional device for the inverse-frequency buffer.
    """

    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
        super().__init__()
        self.dim = dim
        self.max_position_embeddings = max_position_embeddings
        self.base = base
        # inv_freq[i] = base^(-2i/dim) for i in [0, dim/2)
        inv_freq = 1.0 / (self.base ** (torch.arange(0, dim, 2).float().to(device) / dim))
        self.register_buffer("inv_freq", inv_freq, persistent=False)

        # Build the cache up front so `forward` is normally just a slice.
        self._set_cos_sin_cache(
            seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()
        )

    def _set_cos_sin_cache(self, seq_len, device, dtype):
        """(Re)build the cached cos/sin tables, each of shape [seq_len, dim]."""
        self.max_seq_len_cached = seq_len
        t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)

        freqs = torch.outer(t, self.inv_freq)
        # LLaMA-style expansion: duplicate the frequencies so the table
        # covers the full head dimension (pairs with rotate_half).
        emb = torch.cat((freqs, freqs), dim=-1)
        self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
        self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)

    def forward(self, x, seq_len=None):
        """Return (cos, sin) tables for the first ``seq_len`` positions.

        Args:
            x: tensor on the target device/dtype whose second-to-last
               dimension is the sequence length (used as the fallback when
               ``seq_len`` is omitted).
            seq_len: number of positions needed; defaults to ``x.shape[-2]``.

        Returns:
            Tuple ``(cos, sin)``, each of shape [seq_len, dim], cast to x.dtype.
        """
        # BUGFIX: the original compared `None > int` (TypeError) whenever the
        # declared default seq_len=None was actually used; infer it from x.
        if seq_len is None:
            seq_len = x.shape[-2]
        if seq_len > self.max_seq_len_cached:
            self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)

        return (
            self.cos_cached[:seq_len].to(dtype=x.dtype),
            self.sin_cached[:seq_len].to(dtype=x.dtype),
        )
|
| 570 |
+
|
| 571 |
+
def rotate_half(x):
    """Map the two halves (a, b) of the last dimension to (-b, a)."""
    half = x.shape[-1] // 2
    front = x[..., :half]
    back = x[..., half:]
    return torch.cat((-back, front), dim=-1)
|
| 576 |
+
|
| 577 |
+
def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None):
    """Rotate query/key tensors using precomputed cos/sin tables.

    Expects ``q`` and ``k`` shaped [batch, heads, seq_len, head_dim]
    (i.e. after the attention module's view + transpose) and ``cos``/``sin``
    shaped [seq_len, head_dim]. Positions are assumed to be the standard
    0..seq_len-1 range; ``position_ids`` is accepted for interface
    compatibility but is not used.
    """
    # Insert singleton batch and head dims so the tables broadcast:
    # [seq_len, head_dim] -> [1, 1, seq_len, head_dim]
    cos = cos.unsqueeze(0).unsqueeze(0)
    sin = sin.unsqueeze(0).unsqueeze(0)

    rotated_q = q * cos + rotate_half(q) * sin
    rotated_k = k * cos + rotate_half(k) * sin
    return rotated_q, rotated_k
|
| 595 |
|
| 596 |
# Register the model with Auto classes for easy loading
|
| 597 |
from transformers import AutoConfig, AutoModelForCausalLM
|
model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:105d6544cc86f6cad342e7ed9602f9628e3399e230a2048d1149d8dc09d35aa6
|
| 3 |
+
size 4007408
|