MaximeMuhlethaler committed (verified)
Commit f98ff7a · 1 Parent(s): 62d9175

Chess Challenge submission by MaximeMuhlethaler

Files changed (5)
  1. README.md +1 -4
  2. config.json +7 -5
  3. model.py +150 -0
  4. pytorch_model.bin +3 -0
  5. tokenizer.py +1 -6
README.md CHANGED
@@ -12,15 +12,12 @@ license: mit
 Chess model submitted to the LLM Course Chess Challenge.
 
 ## Submission Info
-
 - **Submitted by**: [MaximeMuhlethaler](https://huggingface.co/MaximeMuhlethaler)
 - **Parameters**: 980,000
 - **Organization**: LLM-course
+- **Architecture**: Custom Chess Transformer (Regex Tokenizer + EOS Protection)
 
 ## Model Details
-
-- **Architecture**: Chess Transformer (GPT-style)
 - **Vocab size**: 1700
-- **Embedding dim**: 112
 - **Layers**: 6
 - **Heads**: 8
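As a quick sanity check (not part of the commit), the 980,000-parameter figure matches a tied-embedding GPT-style model at these sizes. The sketch below assumes the embedding dim of 112 listed in the previous README revision, a 256-token context, and an MLP inner dim of 3x the embedding dim, as in model.py:

```python
V, D, L, CTX = 1700, 112, 6, 256              # vocab, embed dim, layers, context
embeddings = V * D + CTX * D                  # token + position embedding tables
per_layer = (
    (D * 3 * D + 3 * D) + (D * D + D)         # attention: qkv proj + output proj
    + 2 * (2 * D)                             # two LayerNorms (weight + bias)
    + (D * 3 * D + 3 * D) + (3 * D * D + D)   # MLP: up proj + down proj (n_inner = 3*D)
)
total = embeddings + L * per_layer + 2 * D    # plus the final LayerNorm
print(total)  # 980000 (lm_head is tied to the token embeddings, so it adds nothing)
```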
config.json CHANGED
@@ -1,7 +1,4 @@
 {
-  "architectures": [
-    "ChessForCausalLM"
-  ],
   "bos_token_id": 1,
   "dropout": 0.1,
   "dtype": "float32",
@@ -16,5 +13,10 @@
   "pad_token_id": 0,
   "tie_weights": true,
   "transformers_version": "4.57.5",
-  "vocab_size": 1700
-}
+  "unk_token_id": 3,
+  "vocab_size": 1700,
+  "auto_map": {
+    "AutoModelForCausalLM": "model.ChessForCausalLM",
+    "AutoConfig": "model.ChessConfig"
+  }
+}
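The new `auto_map` block is what lets the Hub's Auto classes resolve the custom classes defined in model.py. A minimal loading sketch (the repository id below is a placeholder; `trust_remote_code=True` is required because the model code lives in the repo):

```python
from transformers import AutoConfig, AutoModelForCausalLM

repo_id = "LLM-course/<submission-repo>"  # placeholder, not the actual repo id
config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)
print(config.model_type)  # "chess_transformer"
model = AutoModelForCausalLM.from_pretrained(repo_id, trust_remote_code=True)
```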
model.py ADDED
@@ -0,0 +1,150 @@
+"""
+Chess Transformer Model - The "Nuclear Patch" Edition
+"""
+from __future__ import annotations
+import math
+from typing import Optional, Tuple, Union
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from transformers import PretrainedConfig, PreTrainedModel
+from transformers.modeling_outputs import CausalLMOutputWithPast
+
+class ChessConfig(PretrainedConfig):
+    model_type = "chess_transformer"
+
+    def __init__(
+        self,
+        vocab_size=1200,
+        n_embd=128,
+        n_layer=6,
+        n_head=4,
+        n_ctx=256,
+        n_inner=None,
+        dropout=0.1,
+        layer_norm_epsilon=1e-5,
+        tie_weights=True,
+        pad_token_id=0,
+        bos_token_id=1,
+        eos_token_id=2,
+        unk_token_id=3,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.n_embd = n_embd
+        self.n_layer = n_layer
+        self.n_head = n_head
+        self.n_ctx = n_ctx
+        self.n_inner = n_inner if n_inner is not None else 3 * n_embd
+        self.dropout = dropout
+        self.layer_norm_epsilon = layer_norm_epsilon
+        self.tie_weights = tie_weights
+
+        # Pass the critical token IDs through kwargs to the parent config
+        kwargs["pad_token_id"] = pad_token_id
+        kwargs["bos_token_id"] = bos_token_id
+        kwargs["eos_token_id"] = eos_token_id
+        kwargs["unk_token_id"] = unk_token_id
+
+        super().__init__(**kwargs)
+
+class MultiHeadAttention(nn.Module):
+    def __init__(self, config: ChessConfig):
+        super().__init__()
+        self.n_head = config.n_head
+        self.n_embd = config.n_embd
+        self.head_dim = config.n_embd // config.n_head
+        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd)
+        self.c_proj = nn.Linear(config.n_embd, config.n_embd)
+        self.dropout = nn.Dropout(config.dropout)
+        self.register_buffer("bias", torch.tril(torch.ones(config.n_ctx, config.n_ctx)).view(1, 1, config.n_ctx, config.n_ctx), persistent=False)
+
+    def forward(self, x, attention_mask=None):
+        B, T, C = x.size()
+        qkv = self.c_attn(x)
+        q, k, v = qkv.split(self.n_embd, dim=2)
+        q = q.view(B, T, self.n_head, self.head_dim).transpose(1, 2)
+        k = k.view(B, T, self.n_head, self.head_dim).transpose(1, 2)
+        v = v.view(B, T, self.n_head, self.head_dim).transpose(1, 2)
+        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
+        att = att.masked_fill(self.bias[:, :, :T, :T] == 0, float('-inf'))
+        if attention_mask is not None:
+            att = att.masked_fill(attention_mask.view(B, 1, 1, T) == 0, float('-inf'))
+        att = F.softmax(att, dim=-1)
+        att = self.dropout(att)
+        y = att @ v
+        y = y.transpose(1, 2).contiguous().view(B, T, C)
+        return self.c_proj(y)
+
+class FeedForward(nn.Module):
+    def __init__(self, config: ChessConfig):
+        super().__init__()
+        self.c_fc = nn.Linear(config.n_embd, config.n_inner)
+        self.c_proj = nn.Linear(config.n_inner, config.n_embd)
+        self.dropout = nn.Dropout(config.dropout)
+    def forward(self, x):
+        return self.dropout(self.c_proj(F.gelu(self.c_fc(x))))
+
+class TransformerBlock(nn.Module):
+    def __init__(self, config: ChessConfig):
+        super().__init__()
+        self.ln_1 = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
+        self.attn = MultiHeadAttention(config)
+        self.ln_2 = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
+        self.mlp = FeedForward(config)
+    def forward(self, x, attention_mask=None):
+        x = x + self.attn(self.ln_1(x), attention_mask)
+        x = x + self.mlp(self.ln_2(x))
+        return x
+
+class ChessForCausalLM(PreTrainedModel):
+    config_class = ChessConfig
+    base_model_prefix = "transformer"
+
+    def __init__(self, config: ChessConfig):
+        super().__init__(config)
+        self.wte = nn.Embedding(config.vocab_size, config.n_embd)
+        self.wpe = nn.Embedding(config.n_ctx, config.n_embd)
+        self.drop = nn.Dropout(config.dropout)
+        self.h = nn.ModuleList([TransformerBlock(config) for _ in range(config.n_layer)])
+        self.ln_f = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
+        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
+        if config.tie_weights: self.post_init()
+
+    def get_input_embeddings(self): return self.wte
+    def set_input_embeddings(self, new_embeddings): self.wte = new_embeddings
+    def get_output_embeddings(self): return self.lm_head
+    def set_output_embeddings(self, new_embeddings): self.lm_head = new_embeddings
+
+    def forward(self, input_ids, attention_mask=None, position_ids=None, labels=None, return_dict=None, **kwargs):
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        if return_dict is None: return_dict = True
+
+        device = input_ids.device
+        b, t = input_ids.size()
+        if position_ids is None: position_ids = torch.arange(t, device=device).unsqueeze(0)
+
+        x = self.wte(input_ids) + self.wpe(position_ids)
+        x = self.drop(x)
+        for block in self.h: x = block(x, attention_mask)
+        x = self.ln_f(x)
+        logits = self.lm_head(x)
+        if labels is None:
+            # Inference-time "nuclear patch": special tokens can never be sampled
+            nuclear_bad_ids = [0, 1, 2, 3]  # pad, bos, eos, unk
+
+            logits[:, :, nuclear_bad_ids] = float("-inf")
+        loss = None
+        if labels is not None:
+            shift_logits = logits[..., :-1, :].contiguous()
+            shift_labels = labels[..., 1:].contiguous()
+            loss = nn.CrossEntropyLoss(ignore_index=self.config.pad_token_id)(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
+
+        if not return_dict:
+            return ((loss,) + (logits,)) if loss is not None else (logits,)
+
+        return CausalLMOutputWithPast(loss=loss, logits=logits)
+
+from transformers import AutoConfig, AutoModelForCausalLM
+AutoConfig.register("chess_transformer", ChessConfig)
+AutoModelForCausalLM.register(ChessConfig, ChessForCausalLM)
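A minimal smoke test (not part of the commit; it assumes the file is importable as `model` and reuses the hyper-parameters from config.json and the README, with `n_ctx` left at its default of 256):

```python
# Hypothetical check: build the model with the submission's sizes and confirm
# that, when no labels are passed, the special-token logits are forced to -inf.
import torch
from model import ChessConfig, ChessForCausalLM

config = ChessConfig(vocab_size=1700, n_embd=112, n_layer=6, n_head=8)
model = ChessForCausalLM(config).eval()

input_ids = torch.tensor([[1, 42, 97]])  # BOS followed by two arbitrary move ids
with torch.no_grad():
    out = model(input_ids)

print(out.logits.shape)       # torch.Size([1, 3, 1700])
print(out.logits[0, -1, :4])  # tensor([-inf, -inf, -inf, -inf]): pad/bos/eos/unk masked
```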
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:484991ced4936bfd0aa741082d14f41c4390b2d6dc09773b55f56b125add4cd3
+size 3943211
tokenizer.py CHANGED
@@ -9,7 +9,6 @@ import re
 from typing import Dict, List, Optional
 from transformers import PreTrainedTokenizer
 
-# --- REGEX (to clean up the moves) ---
 MOVE_RE = re.compile(r"([a-h][1-8])([a-h][1-8])")
 PROMO_RE = re.compile(r"=([NBRQ])")
 
@@ -41,7 +40,6 @@ class ChessTokenizer(PreTrainedTokenizer):
 
         for t in ["pad_token", "bos_token", "eos_token", "unk_token"]: kwargs.pop(t, None)
 
-        # PATH FIX
         if vocab is None:
             if vocab_file is None:
                 vocab_file = os.path.join(os.path.dirname(__file__), "vocab.json")
@@ -56,7 +54,7 @@
         self._ids_to_tokens = {v: k for k, v in self._vocab.items()}
         super().__init__(pad_token=self.PAD_TOKEN, bos_token=self.BOS_TOKEN, eos_token=self.EOS_TOKEN, unk_token=self.UNK_TOKEN, **kwargs)
 
-    # AUTO-COPY (vital for the submission)
+
     def save_pretrained(self, save_directory: str, **kwargs):
         super().save_pretrained(save_directory, **kwargs)
         src_path = os.path.abspath(__file__)
@@ -72,7 +70,6 @@
     def _create_default_vocab(self):
         return {t: i for i, t in enumerate([self.PAD_TOKEN, self.BOS_TOKEN, self.EOS_TOKEN, self.UNK_TOKEN])}
 
-    # THE FUNCTION THAT HANDLES THE FIXED VOCAB SIZE
    @classmethod
     def build_vocab_from_dataset(cls, dataset_name, split="train", column="text", min_frequency=2, max_vocab_size=1700, max_samples=100000):
         from datasets import load_dataset
@@ -87,9 +84,7 @@
             moves = [normalize_move(t) for t in ex[column].split()]
             counter.update(moves)
 
-        # FORCE THE MAXIMUM VOCAB SIZE HERE
         special = [cls.PAD_TOKEN, cls.BOS_TOKEN, cls.EOS_TOKEN, cls.UNK_TOKEN]
-        # Take the N most frequent tokens to fill up to max_vocab_size
         most_common = counter.most_common(max_vocab_size - len(special))
 
         vocab = {t: i for i, t in enumerate(special + [t for t, c in most_common])}
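For context, the two module-level regexes accept plain coordinate moves and promotion suffixes. A quick illustration (assuming UCI-style move strings such as "e2e4" or "e7e8=Q"; `normalize_move` itself lies outside the hunks shown here):

```python
import re

MOVE_RE = re.compile(r"([a-h][1-8])([a-h][1-8])")
PROMO_RE = re.compile(r"=([NBRQ])")

print(MOVE_RE.match("e2e4").groups())      # ('e2', 'e4')
print(PROMO_RE.search("e7e8=Q").group(1))  # 'Q'
```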