ali-javani committed on
Commit
24ff34e
·
verified ·
1 Parent(s): f3996bb

Upload folder using huggingface_hub

Browse files
README.md ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: transformers
3
+ tags:
4
+ - chess
5
+ - llm-course
6
+ - chess-challenge
7
+ license: mit
8
+ ---
9
+ # chess-aj-split-v2
10
+ Custom Chess Model with Split Tokenizer and Attention Fix.
11
+ - **Parameters**: 1,117,056
12
+ - **Submitted by**: ali-javani
config.json ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "ChessForCausalLM"
4
+ ],
5
+ "auto_map": {
6
+ "AutoConfig": "model.ChessConfig",
7
+ "AutoModelForCausalLM": "model.ChessForCausalLM"
8
+ },
9
+ "bos_token_id": 22,
10
+ "dropout": 0.1,
11
+ "dtype": "float32",
12
+ "eos_token_id": 23,
13
+ "hidden_size": 96,
14
+ "model_type": "chess_transformer",
15
+ "n_ctx": 512,
16
+ "n_embd": 96,
17
+ "n_head": 4,
18
+ "n_inner": 256,
19
+ "n_layer": 10,
20
+ "num_attention_heads": 4,
21
+ "num_hidden_layers": 10,
22
+ "pad_token_id": 24,
23
+ "rms_norm_eps": 1e-06,
24
+ "tie_weights": true,
25
+ "transformers_version": "4.57.6",
26
+ "vocab_size": 90
27
+ }
generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 22,
4
+ "eos_token_id": 23,
5
+ "pad_token_id": 24,
6
+ "transformers_version": "4.57.6"
7
+ }
model.py ADDED
@@ -0,0 +1,662 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # """
2
+ # SOTA Chess Transformer (Llama/DeepSeek Style)
3
+ # Updated for the 1M Parameter Challenge.
4
+
5
+ # Improvements over baseline:
6
+ # 1. RoPE (Rotary Positional Embeddings) - Saves ~32k params, better context.
7
+ # 2. RMSNorm - More stable than LayerNorm.
8
+ # 3. SwiGLU - Better activation function for reasoning.
9
+ # 4. QK-Norm - (From OLMo 2) Stabilizes attention.
10
+ # """
11
+
12
+ # from __future__ import annotations
13
+
14
+ # import math
15
+ # from dataclasses import dataclass
16
+ # from typing import Optional, Tuple, Union
17
+
18
+ # import torch
19
+ # import torch.nn as nn
20
+ # import torch.nn.functional as F
21
+ # from transformers import PretrainedConfig, PreTrainedModel
22
+ # from transformers.modeling_outputs import CausalLMOutputWithPast
23
+ # from transformers import LogitsProcessor, LogitsProcessorList
24
+ # from transformers.generation import GenerationMixin
25
+
26
+
27
+ # class ChessConfig(PretrainedConfig):
28
+ # model_type = "chess_transformer"
29
+
30
+ # def __init__(
31
+ # self,
32
+ # vocab_size: int = 1200,
33
+ # n_embd: int = 128,
34
+ # n_layer: int = 8, # Increased default depth since RoPE saves params
35
+ # n_head: int = 4,
36
+ # n_ctx: int = 256,
37
+ # n_inner: Optional[int] = None,
38
+ # dropout: float = 0.0, # Modern LLMs often use 0 dropout
39
+ # rms_norm_eps: float = 1e-6,
40
+ # tie_weights: bool = True,
41
+ # pad_token_id: int = 0,
42
+ # bos_token_id: int = 1,
43
+ # eos_token_id: int = 2,
44
+ # **kwargs,
45
+ # ):
46
+ # super().__init__(
47
+ # pad_token_id=pad_token_id,
48
+ # bos_token_id=bos_token_id,
49
+ # eos_token_id=eos_token_id,
50
+ # is_decoder=True,
51
+ # **kwargs,
52
+ # )
53
+ # self.vocab_size = vocab_size
54
+ # self.n_embd = n_embd
55
+ # self.n_layer = n_layer
56
+ # self.n_head = n_head
57
+ # # Mapping for Hugging Face compatibility
58
+ # self.num_hidden_layers = n_layer
59
+ # self.hidden_size = n_embd
60
+ # self.num_attention_heads = n_head
61
+
62
+ # self.n_ctx = n_ctx
63
+ # # SwiGLU needs a different inner dimension to match parameter count.
64
+ # # Usually 2/3 of 4d, but we can tune this.
65
+ # self.n_inner = n_inner if n_inner is not None else int(8/3 * n_embd)
66
+ # self.dropout = dropout
67
+ # self.rms_norm_eps = rms_norm_eps
68
+ # self.tie_weights = tie_weights
69
+ # self.tie_word_embeddings = bool(tie_weights)
70
+
71
+
72
+ # class RMSNorm(nn.Module):
73
+ # """Root Mean Square Layer Normalization (Llama style)."""
74
+ # def __init__(self, dim: int, eps: float = 1e-6):
75
+ # super().__init__()
76
+ # self.eps = eps
77
+ # self.weight = nn.Parameter(torch.ones(dim))
78
+
79
+ # def _norm(self, x):
80
+ # return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
81
+
82
+ # def forward(self, x):
83
+ # output = self._norm(x.float()).type_as(x)
84
+ # return output * self.weight
85
+
86
+
87
+ # def apply_rotary_pos_emb(q, k, cos, sin):
88
+ # """Apply Rotary Positional Embeddings (RoPE)."""
89
+ # # Reshape cos/sin to match q/k: [batch, 1, seq_len, head_dim]
90
+ # # Note: This is a simplified implementation for the challenge
91
+ # cos = cos.unsqueeze(1)
92
+ # sin = sin.unsqueeze(1)
93
+
94
+ # q_embed = (q * cos) + (rotate_half(q) * sin)
95
+ # k_embed = (k * cos) + (rotate_half(k) * sin)
96
+ # return q_embed, k_embed
97
+
98
+ # def rotate_half(x):
99
+ # """Rotates half the hidden dims of the input."""
100
+ # x1 = x[..., : x.shape[-1] // 2]
101
+ # x2 = x[..., x.shape[-1] // 2 :]
102
+ # return torch.cat((-x2, x1), dim=-1)
103
+
104
+
105
+ # class SOTAMultiHeadAttention(nn.Module):
106
+ # def __init__(self, config: ChessConfig):
107
+ # super().__init__()
108
+ # self.n_head = config.n_head
109
+ # self.n_embd = config.n_embd
110
+ # self.head_dim = config.n_embd // config.n_head
111
+
112
+ # # QKV Projections
113
+ # self.q_proj = nn.Linear(config.n_embd, config.n_embd, bias=False)
114
+ # self.k_proj = nn.Linear(config.n_embd, config.n_embd, bias=False)
115
+ # self.v_proj = nn.Linear(config.n_embd, config.n_embd, bias=False)
116
+ # self.o_proj = nn.Linear(config.n_embd, config.n_embd, bias=False)
117
+
118
+ # # QK-Norm (from OLMo 2) - Stabilizes training
119
+ # self.q_norm = RMSNorm(self.head_dim, eps=config.rms_norm_eps)
120
+ # self.k_norm = RMSNorm(self.head_dim, eps=config.rms_norm_eps)
121
+
122
+ # # RoPE cache
123
+ # self.register_buffer("inv_freq", 1.0 / (10000 ** (torch.arange(0, self.head_dim, 2).float() / self.head_dim)), persistent=False)
124
+
125
+ # def get_rope_embeddings(self, seq_len, device):
126
+ # t = torch.arange(seq_len, device=device, dtype=self.inv_freq.dtype)
127
+ # freqs = torch.einsum("i,j->ij", t, self.inv_freq)
128
+ # emb = torch.cat((freqs, freqs), dim=-1)
129
+ # return emb.cos(), emb.sin()
130
+
131
+ # def forward(self, x, attention_mask=None):
132
+ # batch_size, seq_len, _ = x.size()
133
+
134
+ # # 1. Project
135
+ # q = self.q_proj(x).view(batch_size, seq_len, self.n_head, self.head_dim)
136
+ # k = self.k_proj(x).view(batch_size, seq_len, self.n_head, self.head_dim)
137
+ # v = self.v_proj(x).view(batch_size, seq_len, self.n_head, self.head_dim)
138
+
139
+ # # 2. QK-Norm (OLMo style) - Normalize BEFORE RoPE
140
+ # q = self.q_norm(q)
141
+ # k = self.k_norm(k)
142
+
143
+ # # 3. Apply RoPE
144
+ # # Transpose to [batch, head, seq, dim] for easier math
145
+ # q = q.transpose(1, 2)
146
+ # k = k.transpose(1, 2)
147
+ # v = v.transpose(1, 2)
148
+
149
+ # cos, sin = self.get_rope_embeddings(seq_len, x.device)
150
+ # # Match dimensions for broadcasting
151
+ # cos = cos.unsqueeze(0).unsqueeze(0) # [1, 1, seq, dim]
152
+ # sin = sin.unsqueeze(0).unsqueeze(0)
153
+
154
+ # q = (q * cos) + (rotate_half(q) * sin)
155
+ # k = (k * cos) + (rotate_half(k) * sin)
156
+
157
+ # # 4. Attention
158
+ # # Efficient Flash Attention if available (or standard)
159
+ # attn_output = F.scaled_dot_product_attention(
160
+ # q, k, v,
161
+ # attn_mask=None,
162
+ # dropout_p=0.0,
163
+ # is_causal=True
164
+ # )
165
+
166
+ # # 5. Output Projection
167
+ # attn_output = attn_output.transpose(1, 2).contiguous().view(batch_size, seq_len, self.n_embd)
168
+ # return self.o_proj(attn_output)
169
+
170
+
171
+ # class SwiGLUFeedForward(nn.Module):
172
+ # """SwiGLU FFN (Llama/DeepSeek style)."""
173
+ # def __init__(self, config: ChessConfig):
174
+ # super().__init__()
175
+ # # SwiGLU has 3 projections: Gate, Value, Output
176
+ # self.gate_proj = nn.Linear(config.n_embd, config.n_inner, bias=False)
177
+ # self.up_proj = nn.Linear(config.n_embd, config.n_inner, bias=False)
178
+ # self.down_proj = nn.Linear(config.n_inner, config.n_embd, bias=False)
179
+
180
+ # def forward(self, x):
181
+ # # SwiGLU: (Swish(Gate) * Up) -> Down
182
+ # return self.down_proj(F.silu(self.gate_proj(x)) * self.up_proj(x))
183
+
184
+
185
+ # class SOTATransformerBlock(nn.Module):
186
+ # def __init__(self, config: ChessConfig):
187
+ # super().__init__()
188
+ # self.input_layernorm = RMSNorm(config.n_embd, eps=config.rms_norm_eps)
189
+ # self.self_attn = SOTAMultiHeadAttention(config)
190
+ # self.post_attention_layernorm = RMSNorm(config.n_embd, eps=config.rms_norm_eps)
191
+ # self.mlp = SwiGLUFeedForward(config)
192
+
193
+ # def forward(self, x, attention_mask=None):
194
+ # # Pre-norm architecture
195
+ # x = x + self.self_attn(self.input_layernorm(x), attention_mask)
196
+ # x = x + self.mlp(self.post_attention_layernorm(x))
197
+ # return x
198
+
199
+
200
+ # class FourStepConsistency(LogitsProcessor):
201
+ # """
202
+ # Enforces the 4-step rhythm: [Piece] -> [From] -> [To] -> [Suffix]
203
+ # """
204
+ # def __init__(self, tokenizer, start_len):
205
+ # self.tokenizer = tokenizer
206
+ # self.start_len = start_len
207
+
208
+ # all_ids = set(range(tokenizer.vocab_size))
209
+
210
+ # # 1. Piece IDs
211
+ # self.piece_ids = {tokenizer.convert_tokens_to_ids(t) for t in tokenizer.PIECES if t in tokenizer.get_vocab()}
212
+ # # 2. Square IDs (Used for both From and To)
213
+ # self.square_ids = {tokenizer.convert_tokens_to_ids(t) for t in tokenizer.SQUARES if t in tokenizer.get_vocab()}
214
+ # # 3. Suffix IDs
215
+ # self.suffix_ids = {tokenizer.convert_tokens_to_ids(t) for t in tokenizer.SUFFIXES if t in tokenizer.get_vocab()}
216
+
217
+ # def __call__(self, input_ids, scores):
218
+ # cur_len = input_ids.shape[1]
219
+ # relative_pos = (cur_len - self.start_len) % 4
220
+
221
+ # mask_ids = set()
222
+
223
+ # if relative_pos == 0: # Step 1: Piece
224
+ # mask_ids = self.piece_ids
225
+ # elif relative_pos == 1: # Step 2: From Square
226
+ # mask_ids = self.square_ids
227
+ # elif relative_pos == 2: # Step 3: To Square
228
+ # mask_ids = self.square_ids
229
+ # else: # Step 4: Suffix
230
+ # mask_ids = self.suffix_ids
231
+
232
+ # # Mask out disallowed tokens
233
+ # for i in range(scores.shape[1]):
234
+ # if i not in mask_ids and i != self.tokenizer.eos_token_id:
235
+ # scores[:, i] = float("-inf")
236
+
237
+ # return scores
238
+
239
+
240
+ # class ChessForCausalLM(PreTrainedModel, GenerationMixin):
241
+ # config_class = ChessConfig
242
+
243
+ # def __init__(self, config: ChessConfig):
244
+ # super().__init__(config)
245
+
246
+ # # 1. Embeddings (No Position Embeddings needed, RoPE handles it!)
247
+ # self.embed_tokens = nn.Embedding(config.vocab_size, config.n_embd)
248
+
249
+ # # 2. Layers
250
+ # self.layers = nn.ModuleList([
251
+ # SOTATransformerBlock(config) for _ in range(config.n_layer)
252
+ # ])
253
+
254
+ # # 3. Final Norm
255
+ # self.norm = RMSNorm(config.n_embd, eps=config.rms_norm_eps)
256
+
257
+ # # 4. Head
258
+ # self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
259
+
260
+ # # Tie weights
261
+ # if config.tie_weights:
262
+ # self.lm_head.weight = self.embed_tokens.weight
263
+ # self._tied_weights_keys = ["lm_head.weight"]
264
+
265
+ # self.post_init()
266
+
267
+ # def get_input_embeddings(self):
268
+ # return self.embed_tokens
269
+
270
+ # def set_input_embeddings(self, value):
271
+ # self.embed_tokens = value
272
+
273
+ # def get_output_embeddings(self):
274
+ # return self.lm_head
275
+
276
+ # def set_output_embeddings(self, new_embeddings):
277
+ # self.lm_head = new_embeddings
278
+
279
+ # def forward(
280
+ # self,
281
+ # input_ids: torch.LongTensor = None,
282
+ # attention_mask: Optional[torch.Tensor] = None,
283
+ # labels: Optional[torch.LongTensor] = None,
284
+ # return_dict: Optional[bool] = None,
285
+ # **kwargs,
286
+ # ) -> Union[Tuple, CausalLMOutputWithPast]:
287
+
288
+ # batch_size, seq_len = input_ids.shape
289
+ # hidden_states = self.embed_tokens(input_ids)
290
+
291
+ # for layer in self.layers:
292
+ # hidden_states = layer(hidden_states, attention_mask)
293
+
294
+ # hidden_states = self.norm(hidden_states)
295
+ # logits = self.lm_head(hidden_states)
296
+
297
+ # loss = None
298
+ # if labels is not None:
299
+ # shift_logits = logits[..., :-1, :].contiguous()
300
+ # shift_labels = labels[..., 1:].contiguous()
301
+ # loss_fct = nn.CrossEntropyLoss(ignore_index=-100)
302
+ # loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
303
+
304
+ # return_dict = return_dict if return_dict is not None else self.config.use_return_dict
305
+ # if not return_dict:
306
+ # output = (logits,)
307
+ # return ((loss,) + output) if loss is not None else output
308
+
309
+ # return CausalLMOutputWithPast(
310
+ # loss=loss,
311
+ # logits=logits,
312
+ # past_key_values=None,
313
+ # hidden_states=None,
314
+ # attentions=None,
315
+ # )
316
+
317
+ # def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwargs):
318
+ # # 1. Handle Cache (Past Key Values)
319
+ # # If we have a cache, we only need to process the very last token we generated
320
+ # if past_key_values:
321
+ # input_ids = input_ids[:, -1:]
322
+
323
+ # # 2. Handle Position IDs
324
+ # # If the user didn't provide position_ids, we might need to create them from the attention_mask
325
+ # position_ids = kwargs.get("position_ids", None)
326
+
327
+ # # FIX: Explicitly check 'is not None' to avoid the ambiguous Tensor error
328
+ # attention_mask = kwargs.get("attention_mask", None)
329
+ # if attention_mask is not None:
330
+ # # Create position_ids based on the mask (0, 1, 2... ignoring padding)
331
+ # if position_ids is None:
332
+ # position_ids = attention_mask.long().cumsum(-1) - 1
333
+ # position_ids.masked_fill_(attention_mask == 0, 1)
334
+
335
+ # # If using cache, we only need the position ID for the last token
336
+ # if past_key_values:
337
+ # position_ids = position_ids[:, -1].unsqueeze(-1)
338
+
339
+ # return {
340
+ # "input_ids": input_ids,
341
+ # "past_key_values": past_key_values,
342
+ # "use_cache": kwargs.get("use_cache"),
343
+ # "position_ids": position_ids,
344
+ # "attention_mask": attention_mask,
345
+ # }
346
+
347
+ # def generate(self, input_ids, **kwargs):
348
+ # tokenizer = kwargs.pop("tokenizer", None)
349
+ # if tokenizer is not None:
350
+ # # Use the 4-step synthesizer
351
+ # synthesizer = FourStepConsistency(tokenizer, input_ids.shape[1])
352
+ # logits_processor = kwargs.get("logits_processor", LogitsProcessorList())
353
+ # logits_processor.append(synthesizer)
354
+ # kwargs["logits_processor"] = logits_processor
355
+
356
+ # # Call GenerationMixin directly to bypass any PreTrainedModel ambiguity
357
+ # return GenerationMixin.generate(self, input_ids, **kwargs)
358
+
359
+ # # Register
360
+ # from transformers import AutoConfig, AutoModelForCausalLM
361
+ # AutoConfig.register("chess_transformer", ChessConfig)
362
+ # AutoModelForCausalLM.register(ChessConfig, ChessForCausalLM)
363
+
364
+
365
+ """
366
+ SOTA Chess Transformer (Llama/DeepSeek Style)
367
+ Updated for the 1M Parameter Challenge.
368
+ """
369
+ from __future__ import annotations
370
+
371
+ import math
372
+ from dataclasses import dataclass
373
+ from typing import Optional, Tuple, Union
374
+
375
+ import torch
376
+ import torch.nn as nn
377
+ import torch.nn.functional as F
378
+ from transformers import PretrainedConfig, PreTrainedModel
379
+ from transformers.modeling_outputs import CausalLMOutputWithPast
380
+ from transformers import LogitsProcessor, LogitsProcessorList
381
+ from transformers.generation import GenerationMixin
382
+
383
class ChessConfig(PretrainedConfig):
    """Configuration for the chess decoder-only transformer.

    Args:
        vocab_size: Number of entries in the token embedding table.
        n_embd: Width of the residual stream (also exposed as ``hidden_size``).
        n_layer: Number of transformer blocks (also ``num_hidden_layers``).
        n_head: Number of attention heads (also ``num_attention_heads``).
        n_ctx: Context length stored on the config.
        n_inner: Inner width of the feed-forward block. Defaults to
            ``int(8/3 * n_embd)`` when ``None``.
        dropout: Dropout probability stored on the config.
        rms_norm_eps: Epsilon used by the RMSNorm layers.
        tie_weights: Whether the LM head shares the embedding matrix; mirrored
            into ``tie_word_embeddings``.
        pad_token_id: Id of the padding token.
        bos_token_id: Id of the beginning-of-sequence token.
        eos_token_id: Id of the end-of-sequence token.
        **kwargs: Forwarded to ``PretrainedConfig.__init__``.
    """

    model_type = "chess_transformer"

    def __init__(
        self,
        vocab_size: int = 1200,
        n_embd: int = 128,
        n_layer: int = 8,
        n_head: int = 4,
        n_ctx: int = 256,
        n_inner: Optional[int] = None,
        dropout: float = 0.0,
        rms_norm_eps: float = 1e-6,
        tie_weights: bool = True,
        pad_token_id: int = 0,
        bos_token_id: int = 1,
        eos_token_id: int = 2,
        **kwargs,
    ):
        # A serialized config may already contain ``is_decoder``. Passing it
        # explicitly *and* through **kwargs raises "got multiple values for
        # keyword argument", so only provide the default when it is absent.
        kwargs.setdefault("is_decoder", True)
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            **kwargs,
        )
        self.vocab_size = vocab_size
        self.n_embd = n_embd
        self.n_layer = n_layer
        self.n_head = n_head
        self.n_ctx = n_ctx
        self.n_inner = n_inner if n_inner is not None else int(8/3 * n_embd)
        self.dropout = dropout
        self.rms_norm_eps = rms_norm_eps
        self.tie_weights = tie_weights
        self.tie_word_embeddings = bool(tie_weights)

        # Aliases under the attribute names Hugging Face utilities look for.
        self.num_hidden_layers = n_layer
        self.hidden_size = n_embd
        self.num_attention_heads = n_head
423
+
424
+
425
class RMSNorm(nn.Module):
    """Root-mean-square normalization over the last dimension with a learned gain."""

    def __init__(self, dim: int, eps: float = 1e-6):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(dim))
        self.eps = eps

    def _norm(self, x):
        """Scale ``x`` by the reciprocal RMS of its last dimension."""
        mean_square = x.pow(2).mean(-1, keepdim=True)
        inv_rms = torch.rsqrt(mean_square + self.eps)
        return x * inv_rms

    def forward(self, x):
        """Normalize in float32, cast back to ``x``'s dtype, then apply the gain."""
        normed = self._norm(x.float())
        return normed.type_as(x) * self.weight
437
+
438
def rotate_half(x):
    """Split the last dimension in two halves ``(a, b)`` and return ``(-b, a)``."""
    split = x.shape[-1] // 2
    front, back = x[..., :split], x[..., split:]
    return torch.cat((-back, front), dim=-1)
442
+
443
class SOTAMultiHeadAttention(nn.Module):
    """Causal multi-head self-attention with per-head QK RMSNorm and RoPE.

    The full sequence is processed on every call; there is no key/value cache.
    """

    def __init__(self, config: ChessConfig):
        super().__init__()
        self.n_head = config.n_head
        self.n_embd = config.n_embd
        self.head_dim = config.n_embd // config.n_head

        self.q_proj = nn.Linear(config.n_embd, config.n_embd, bias=False)
        self.k_proj = nn.Linear(config.n_embd, config.n_embd, bias=False)
        self.v_proj = nn.Linear(config.n_embd, config.n_embd, bias=False)
        self.o_proj = nn.Linear(config.n_embd, config.n_embd, bias=False)

        # QK-norm: queries and keys are normalized per head before RoPE.
        self.q_norm = RMSNorm(self.head_dim, eps=config.rms_norm_eps)
        self.k_norm = RMSNorm(self.head_dim, eps=config.rms_norm_eps)

        # RoPE inverse frequencies; non-persistent, so not stored in checkpoints.
        self.register_buffer(
            "inv_freq",
            1.0 / (10000 ** (torch.arange(0, self.head_dim, 2).float() / self.head_dim)),
            persistent=False,
        )

    def get_rope_embeddings(self, position_ids, device):
        """Return ``(cos, sin)`` tables of shape ``[batch, seq, head_dim]``.

        Args:
            position_ids: ``[batch, seq]`` tensor of token positions.
            device: Device on which the tables are computed.
        """
        inv_freq = self.inv_freq.to(device)
        # Outer product position x frequency -> [batch, seq, head_dim / 2].
        freqs = torch.einsum("bs,d->bsd", position_ids.float(), inv_freq)
        emb = torch.cat((freqs, freqs), dim=-1)
        return emb.cos(), emb.sin()

    @staticmethod
    def _build_attn_mask(attention_mask, seq_len, device):
        """Merge the caller's mask with the causal mask for SDPA.

        ``F.scaled_dot_product_attention`` rejects an explicit ``attn_mask``
        combined with ``is_causal=True``, and its boolean masks use
        ``True == may attend``. Causality is therefore folded into the mask
        here and the caller passes ``is_causal=False`` whenever this returns
        a tensor.

        Args:
            attention_mask: ``None``; a 2-D ``[batch, seq]`` mask where non-zero
                means "real token"; a >2-D bool/int mask where non-zero means
                "may attend"; or a >2-D float mask used as an additive bias.
            seq_len: Sequence length (queries and keys have the same length).
            device: Device for the returned mask.

        Returns:
            ``None`` when plain causal attention is sufficient, otherwise a
            mask broadcastable to ``[batch, heads, seq, seq]``.
        """
        if attention_mask is None:
            return None

        causal = torch.ones(seq_len, seq_len, dtype=torch.bool, device=device).tril()

        if attention_mask.dim() > 2 and attention_mask.is_floating_point():
            # Additive bias: add -inf above the diagonal to enforce causality.
            causal_bias = torch.zeros(
                seq_len, seq_len, dtype=attention_mask.dtype, device=device
            ).masked_fill(~causal, float("-inf"))
            return attention_mask.to(device) + causal_bias

        keep = attention_mask.to(device=device, dtype=torch.bool)
        if keep.dim() == 2:
            if bool(keep.all()):
                # No padding anywhere: let SDPA use its built-in causal path.
                return None
            keep = keep[:, None, None, :]  # [batch, 1, 1, seq] over the key axis

        allowed = causal & keep
        # Each query always sees itself. Without this, a padding query under
        # left padding has no visible key and its softmax row becomes NaN.
        return allowed | torch.eye(seq_len, dtype=torch.bool, device=device)

    def forward(self, x, attention_mask=None, position_ids=None):
        """Apply causal self-attention.

        Args:
            x: ``[batch, seq, n_embd]`` hidden states.
            attention_mask: Optional padding/attention mask, see
                ``_build_attn_mask``.
            position_ids: Optional ``[batch, seq]`` positions for RoPE;
                defaults to ``0..seq-1`` for every row.

        Returns:
            ``[batch, seq, n_embd]`` tensor after the output projection.
        """
        batch_size, seq_len, _ = x.size()

        q = self.q_proj(x).view(batch_size, seq_len, self.n_head, self.head_dim)
        k = self.k_proj(x).view(batch_size, seq_len, self.n_head, self.head_dim)
        v = self.v_proj(x).view(batch_size, seq_len, self.n_head, self.head_dim)

        q = self.q_norm(q)
        k = self.k_norm(k)

        # [batch, heads, seq, head_dim] for RoPE and SDPA.
        q = q.transpose(1, 2)
        k = k.transpose(1, 2)
        v = v.transpose(1, 2)

        if position_ids is None:
            position_ids = torch.arange(seq_len, device=x.device).unsqueeze(0).expand(batch_size, -1)

        cos, sin = self.get_rope_embeddings(position_ids, x.device)
        cos = cos.unsqueeze(1)  # broadcast over heads
        sin = sin.unsqueeze(1)

        q = (q * cos) + (rotate_half(q) * sin)
        k = (k * cos) + (rotate_half(k) * sin)

        attn_mask = self._build_attn_mask(attention_mask, seq_len, x.device)
        attn_output = F.scaled_dot_product_attention(
            q, k, v,
            attn_mask=attn_mask,
            dropout_p=0.0,
            # Causality is already inside attn_mask when one is supplied.
            is_causal=attn_mask is None,
        )

        attn_output = attn_output.transpose(1, 2).contiguous().view(batch_size, seq_len, self.n_embd)
        return self.o_proj(attn_output)
531
+
532
+
533
class SwiGLUFeedForward(nn.Module):
    """Gated feed-forward block: ``down(silu(gate(x)) * up(x))``, all bias-free."""

    def __init__(self, config: ChessConfig):
        super().__init__()
        width, inner = config.n_embd, config.n_inner
        # Creation order (gate, up, down) is kept so parameter init order is unchanged.
        self.gate_proj = nn.Linear(width, inner, bias=False)
        self.up_proj = nn.Linear(width, inner, bias=False)
        self.down_proj = nn.Linear(inner, width, bias=False)

    def forward(self, x):
        """Project up with a SiLU gate, then project back to the model width."""
        gate = F.silu(self.gate_proj(x))
        gated = gate * self.up_proj(x)
        return self.down_proj(gated)
542
+
543
+
544
class SOTATransformerBlock(nn.Module):
    """Pre-norm transformer block: attention and feed-forward, each with a residual."""

    def __init__(self, config: ChessConfig):
        super().__init__()
        width, eps = config.n_embd, config.rms_norm_eps
        self.input_layernorm = RMSNorm(width, eps=eps)
        self.self_attn = SOTAMultiHeadAttention(config)
        self.post_attention_layernorm = RMSNorm(width, eps=eps)
        self.mlp = SwiGLUFeedForward(config)

    def forward(self, x, attention_mask=None, position_ids=None):
        """Run attention then the MLP, forwarding the mask and position ids to attention."""
        attended = self.self_attn(self.input_layernorm(x), attention_mask, position_ids)
        x = x + attended
        fed_forward = self.mlp(self.post_attention_layernorm(x))
        return x + fed_forward
557
+
558
+
559
class FourStepConsistency(LogitsProcessor):
    """Constrain generation to the cycle [piece] -> [from] -> [to] -> [suffix].

    The step is derived from how many tokens have been generated since the
    prompt (``start_len``). At each step every token outside the allowed
    category is set to ``-inf``; the tokenizer's EOS token stays available.
    """

    def __init__(self, tokenizer, start_len):
        """
        Args:
            tokenizer: Tokenizer exposing ``PIECES``, ``SQUARES``, ``SUFFIXES``,
                ``get_vocab()``, ``convert_tokens_to_ids()`` and ``eos_token_id``.
            start_len: Prompt length in tokens; generation step 0 starts there.
        """
        self.tokenizer = tokenizer
        self.start_len = start_len
        # Fetch the vocabulary once instead of once per candidate token.
        vocab = tokenizer.get_vocab()
        self.piece_ids = {tokenizer.convert_tokens_to_ids(t) for t in tokenizer.PIECES if t in vocab}
        self.square_ids = {tokenizer.convert_tokens_to_ids(t) for t in tokenizer.SQUARES if t in vocab}
        self.suffix_ids = {tokenizer.convert_tokens_to_ids(t) for t in tokenizer.SUFFIXES if t in vocab}

    def __call__(self, input_ids, scores):
        """Mask ``scores`` (``[batch, vocab]``) in place and return it."""
        relative_pos = (input_ids.shape[1] - self.start_len) % 4
        if relative_pos == 0:
            allowed = self.piece_ids
        elif relative_pos in (1, 2):
            allowed = self.square_ids
        else:
            allowed = self.suffix_ids

        vocab_size = scores.shape[1]
        keep = [i for i in allowed if 0 <= i < vocab_size]
        eos_id = self.tokenizer.eos_token_id
        if eos_id is not None and 0 <= eos_id < vocab_size:
            keep.append(eos_id)

        # One vectorized fill instead of a Python loop over every vocab column.
        blocked = torch.ones(vocab_size, dtype=torch.bool, device=scores.device)
        if keep:
            blocked[torch.tensor(keep, dtype=torch.long, device=scores.device)] = False
        scores.masked_fill_(blocked, float("-inf"))
        return scores
580
+
581
+
582
class ChessForCausalLM(PreTrainedModel, GenerationMixin):
    """Decoder-only chess language model: embeddings, transformer blocks, final norm, LM head.

    No key/value cache is used; generation re-processes the full sequence on
    every step (see ``prepare_inputs_for_generation``).
    """

    config_class = ChessConfig

    def __init__(self, config: ChessConfig):
        super().__init__(config)
        self.embed_tokens = nn.Embedding(config.vocab_size, config.n_embd)
        self.layers = nn.ModuleList([SOTATransformerBlock(config) for _ in range(config.n_layer)])
        self.norm = RMSNorm(config.n_embd, eps=config.rms_norm_eps)
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

        if config.tie_weights:
            # Share one matrix between the input embedding and the output head.
            self.lm_head.weight = self.embed_tokens.weight
            self._tied_weights_keys = ["lm_head.weight"]
        self.post_init()

    def get_input_embeddings(self):
        return self.embed_tokens

    def set_input_embeddings(self, value):
        self.embed_tokens = value

    def get_output_embeddings(self):
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings

    def forward(self, input_ids: torch.LongTensor = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, labels: Optional[torch.LongTensor] = None, return_dict: Optional[bool] = None, **kwargs):
        """Compute next-token logits and, when ``labels`` is given, the LM loss.

        Args:
            input_ids: ``[batch, seq]`` token ids.
            attention_mask: Optional mask forwarded to every attention layer.
            position_ids: Optional ``[batch, seq]`` positions for RoPE;
                defaults to ``0..seq-1``.
            labels: Optional ``[batch, seq]`` targets. They are shifted by one
                inside this method; entries equal to ``-100`` are ignored.
            return_dict: Return a ``CausalLMOutputWithPast`` when true
                (default taken from the config), otherwise a tuple.
            **kwargs: Accepted for generation-API compatibility and ignored.

        Returns:
            ``CausalLMOutputWithPast`` or ``(loss, logits)`` / ``(logits,)``.
        """
        batch_size, seq_len = input_ids.shape
        hidden_states = self.embed_tokens(input_ids)

        if position_ids is None:
            position_ids = torch.arange(seq_len, device=input_ids.device).unsqueeze(0).expand(batch_size, -1)

        for layer in self.layers:
            hidden_states = layer(hidden_states, attention_mask, position_ids)

        hidden_states = self.norm(hidden_states)
        logits = self.lm_head(hidden_states)

        loss = None
        if labels is not None:
            # Position t predicts token t + 1.
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            loss_fct = nn.CrossEntropyLoss(ignore_index=-100)
            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))

        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        if not return_dict:
            output = (logits,)
            return ((loss,) + output) if loss is not None else output
        return CausalLMOutputWithPast(loss=loss, logits=logits, past_key_values=None, hidden_states=None, attentions=None)

    def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwargs):
        """Build the ``forward`` kwargs for one generation step without any cache.

        The full ``input_ids`` are always returned, because the attention
        layers recompute the whole history on every call.
        """
        position_ids = kwargs.get("position_ids", None)
        attention_mask = kwargs.get("attention_mask", None)

        if attention_mask is not None and position_ids is None:
            # Positions count only real tokens; padded slots get a dummy value of 1.
            position_ids = attention_mask.long().cumsum(-1) - 1
            position_ids.masked_fill_(attention_mask == 0, 1)

        return {
            "input_ids": input_ids,
            "past_key_values": None,
            "use_cache": False,
            "position_ids": position_ids,
            "attention_mask": attention_mask,
        }

    def generate(self, input_ids, **kwargs):
        """Generate tokens; with ``tokenizer=...`` the 4-step move structure is enforced.

        Args:
            input_ids: ``[batch, seq]`` prompt token ids.
            **kwargs: Forwarded to ``GenerationMixin.generate``. ``tokenizer``
                is consumed here and, when provided, a ``FourStepConsistency``
                processor is appended to ``logits_processor``.
        """
        tokenizer = kwargs.pop("tokenizer", None)
        if tokenizer is not None:
            # Copy into a fresh list: the caller's list must not be mutated
            # (it would accumulate one processor per call), and an explicit
            # ``logits_processor=None`` must not crash.
            logits_processor = LogitsProcessorList(kwargs.get("logits_processor") or [])
            logits_processor.append(FourStepConsistency(tokenizer, input_ids.shape[1]))
            kwargs["logits_processor"] = logits_processor
        return GenerationMixin.generate(self, input_ids, **kwargs)
658
+
659
+ # Register
660
+ from transformers import AutoConfig, AutoModelForCausalLM
661
+ AutoConfig.register("chess_transformer", ChessConfig)
662
+ AutoModelForCausalLM.register(ChessConfig, ChessForCausalLM)
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b4f29d086270b5b16d38d94f77b485e8f52dc37423b5e56f8335a05b23563c39
3
+ size 4479240
special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "[BOS]",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "[EOS]",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "[PAD]",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "[UNK]",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
tokenizer.py ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ 4-Step Split Tokenizer
3
+ Splits moves into: [Piece] -> [From] -> [To] -> [Suffix]
4
+ Minimizes vocabulary to 90 tokens (4 special + 12 piece + 64 square + 10 suffix).
5
+ """
6
+ from __future__ import annotations
7
+ import json
8
+ import os
9
+ import re
10
+ from typing import Dict, List, Optional
11
+ from transformers import PreTrainedTokenizer, AutoTokenizer
12
+
13
class ChessTokenizer(PreTrainedTokenizer):
    """Tokenizer that splits each move into [piece] [from] [to] [suffix].

    Input text is a whitespace-separated list of moves such as ``WPe2e4`` or
    ``BNg8f6(x)``. A move without a suffix gets the explicit ``(-)`` token so
    every move is exactly four tokens long; a move that does not match the
    pattern becomes a single ``[UNK]`` token.
    """

    vocab_files_names = {"vocab_file": "vocab.json"}
    model_input_names = ["input_ids", "attention_mask"]

    # 1. Pieces
    PIECES = ["WP", "WN", "WB", "WR", "WQ", "WK", "BP", "BN", "BB", "BR", "BQ", "BK"]
    # 2. Squares
    SQUARES = [f"{c}{r}" for c in "abcdefgh" for r in "12345678"]
    # 3. Suffixes ("(-)" stands for "no suffix / quiet move")
    SUFFIXES = ["(-)", "(x)", "(+)", "(#)", "(x+)", "(x#)", "(O)", "(o)", "(Q)", "=Q"]

    PAD_TOKEN = "[PAD]"
    BOS_TOKEN = "[BOS]"
    EOS_TOKEN = "[EOS]"
    UNK_TOKEN = "[UNK]"

    # (piece)(from square)(to square)(optional suffix); compiled once for all calls.
    _MOVE_RE = re.compile(r"([WB][PNBRQK])([a-h][1-8])([a-h][1-8])(.*)")

    def __init__(self, vocab_file: Optional[str] = None, vocab: Optional[Dict[str, int]] = None, **kwargs):
        """
        Args:
            vocab_file: Path to a JSON token->id mapping; used when it exists.
            vocab: Explicit token->id mapping, used when no file is loaded.
            **kwargs: Forwarded to ``PreTrainedTokenizer.__init__``. Special
                tokens given here take precedence over the class constants.
        """
        # The vocabulary must exist before the parent constructor runs.
        self._vocab = vocab
        if vocab_file and os.path.exists(vocab_file):
            with open(vocab_file, "r", encoding="utf-8") as f:
                self._vocab = json.load(f)

        if not self._vocab:
            self._vocab = self._build_split_vocab()

        self._ids_to_tokens = {v: k for k, v in self._vocab.items()}

        # Pop special tokens so they are not passed twice ("multiple values"
        # error) when a saved tokenizer config supplies them through kwargs.
        pad_token = kwargs.pop("pad_token", self.PAD_TOKEN)
        bos_token = kwargs.pop("bos_token", self.BOS_TOKEN)
        eos_token = kwargs.pop("eos_token", self.EOS_TOKEN)
        unk_token = kwargs.pop("unk_token", self.UNK_TOKEN)

        super().__init__(
            pad_token=pad_token,
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            **kwargs,
        )

    def _build_split_vocab(self):
        """Build the static vocabulary; ids follow the sorted token order."""
        tokens = [self.PAD_TOKEN, self.BOS_TOKEN, self.EOS_TOKEN, self.UNK_TOKEN]
        tokens += self.PIECES + self.SQUARES + self.SUFFIXES
        return {t: i for i, t in enumerate(sorted(set(tokens)))}

    def get_vocab(self) -> Dict[str, int]:
        """Return a copy of the token->id mapping."""
        return dict(self._vocab)

    @property
    def vocab_size(self) -> int:
        return len(self._vocab)

    def _tokenize(self, text: str) -> List[str]:
        """Split whitespace-separated moves into piece/from/to/suffix tokens."""
        tokens = []
        for move in text.strip().split():
            match = self._MOVE_RE.match(move)
            if match:
                p, s, t, suf = match.groups()
                tokens.extend([p, s, t])
                tokens.append(suf if suf else "(-)")
            else:
                tokens.append(self.UNK_TOKEN)
        return tokens

    def _convert_token_to_id(self, token: str) -> int:
        return self._vocab.get(token, self._vocab.get(self.UNK_TOKEN))

    def _convert_id_to_token(self, index: int) -> str:
        return self._ids_to_tokens.get(index, self.UNK_TOKEN)

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        """Join tokens back into space-separated moves.

        Special tokens are dropped and ``(-)`` contributes no text. A move ends
        after four tokens or when the next piece token appears, so a dropped
        ``[UNK]`` or a truncated move cannot shift the grouping of the moves
        that follow it.
        """
        specials = {self.PAD_TOKEN, self.BOS_TOKEN, self.EOS_TOKEN, self.UNK_TOKEN}
        pieces = set(self.PIECES)

        out = []
        parts: List[str] = []
        count = 0  # tokens consumed by the current move, including "(-)"
        for t in tokens:
            if t in specials:
                continue
            if t in pieces and count:
                # A new piece starts a new move even if the last one was short.
                if parts:
                    out.append("".join(parts))
                parts, count = [], 0
            if t != "(-)":
                parts.append(t)
            count += 1
            if count == 4:
                out.append("".join(parts))
                parts, count = [], 0

        if parts:
            out.append("".join(parts))
        return " ".join(out)

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple:
        """Write the vocabulary to ``<save_directory>/[<prefix>-]vocab.json``."""
        path = os.path.join(save_directory, (filename_prefix + "-" if filename_prefix else "") + "vocab.json")
        # Same encoding as the loader in __init__.
        with open(path, "w", encoding="utf-8") as f:
            json.dump(self._vocab, f)
        return (path,)

    @classmethod
    def build_vocab_from_dataset(cls, *args, **kwargs):
        """Return a tokenizer with the static vocabulary; the arguments are ignored."""
        print("Using static 4-Step Split vocabulary.")
        return cls()
148
+
149
+ # Register
150
+ AutoTokenizer.register("ChessTokenizer", ChessTokenizer)
tokenizer_config.json ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "22": {
4
+ "content": "[BOS]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "23": {
12
+ "content": "[EOS]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "24": {
20
+ "content": "[PAD]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "25": {
28
+ "content": "[UNK]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ }
35
+ },
36
+ "auto_map": {
37
+ "AutoTokenizer": [
38
+ "tokenizer.ChessTokenizer",
39
+ null
40
+ ]
41
+ },
42
+ "bos_token": "[BOS]",
43
+ "clean_up_tokenization_spaces": false,
44
+ "eos_token": "[EOS]",
45
+ "extra_special_tokens": {},
46
+ "model_max_length": 1000000000000000019884624838656,
47
+ "pad_token": "[PAD]",
48
+ "tokenizer_class": "ChessTokenizer",
49
+ "unk_token": "[UNK]",
50
+ "bos_token_id": 22
51
+ }
vocab.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"(#)": 0, "(+)": 1, "(-)": 2, "(O)": 3, "(Q)": 4, "(o)": 5, "(x#)": 6, "(x)": 7, "(x+)": 8, "=Q": 9, "BB": 10, "BK": 11, "BN": 12, "BP": 13, "BQ": 14, "BR": 15, "WB": 16, "WK": 17, "WN": 18, "WP": 19, "WQ": 20, "WR": 21, "[BOS]": 22, "[EOS]": 23, "[PAD]": 24, "[UNK]": 25, "a1": 26, "a2": 27, "a3": 28, "a4": 29, "a5": 30, "a6": 31, "a7": 32, "a8": 33, "b1": 34, "b2": 35, "b3": 36, "b4": 37, "b5": 38, "b6": 39, "b7": 40, "b8": 41, "c1": 42, "c2": 43, "c3": 44, "c4": 45, "c5": 46, "c6": 47, "c7": 48, "c8": 49, "d1": 50, "d2": 51, "d3": 52, "d4": 53, "d5": 54, "d6": 55, "d7": 56, "d8": 57, "e1": 58, "e2": 59, "e3": 60, "e4": 61, "e5": 62, "e6": 63, "e7": 64, "e8": 65, "f1": 66, "f2": 67, "f3": 68, "f4": 69, "f5": 70, "f6": 71, "f7": 72, "f8": 73, "g1": 74, "g2": 75, "g3": 76, "g4": 77, "g5": 78, "g6": 79, "g7": 80, "g8": 81, "h1": 82, "h2": 83, "h3": 84, "h4": 85, "h5": 86, "h6": 87, "h7": 88, "h8": 89}