MDaytek commited on
Commit
a15908c
·
verified ·
1 Parent(s): 375d124

Chess Challenge submission by MDaytek

Browse files
Files changed (8) hide show
  1. README.md +31 -0
  2. config.json +24 -0
  3. model.py +159 -0
  4. model.safetensors +3 -0
  5. special_tokens_map.json +6 -0
  6. tokenizer.py +55 -0
  7. tokenizer_config.json +13 -0
  8. vocab.json +35 -0
README.md ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: transformers
3
+ tags:
4
+ - chess
5
+ - llm-course
6
+ - chess-challenge
7
+ license: mit
8
+ ---
9
+
10
+ # chess-MDaytek
11
+
12
+ Chess model submitted to the LLM Course Chess Challenge.
13
+
14
+ ## Submission Info
15
+
16
+ - **Submitted by**: [MDaytek](https://huggingface.co/MDaytek)
17
+ - **Parameters**: 749,856
18
+ - **Organization**: LLM-course
19
+
20
+ ## Usage
21
+
22
+ ```python
23
+ from transformers import AutoModelForCausalLM, AutoTokenizer
24
+
25
+ model = AutoModelForCausalLM.from_pretrained("LLM-course/chess-MDaytek", trust_remote_code=True)
26
+ tokenizer = AutoTokenizer.from_pretrained("LLM-course/chess-MDaytek", trust_remote_code=True)
27
+ ```
28
+
29
+ ## Evaluation
30
+
31
+ This model is evaluated at the [Chess Challenge Arena](https://huggingface.co/spaces/LLM-course/Chess1MChallenge).
config.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "ChessForCausalLM"
4
+ ],
5
+ "auto_map": {
6
+ "AutoConfig": "model.ChessConfig",
7
+ "AutoModelForCausalLM": "model.ChessForCausalLM"
8
+ },
9
+ "bos_token_id": 1,
10
+ "dropout": 0.15,
11
+ "dtype": "float32",
12
+ "eos_token_id": 2,
13
+ "layer_norm_epsilon": 1e-05,
14
+ "model_type": "chess_transformer",
15
+ "n_ctx": 256,
16
+ "n_embd": 96,
17
+ "n_head": 8,
18
+ "n_inner": 288,
19
+ "n_layer": 8,
20
+ "pad_token_id": 0,
21
+ "tie_weights": true,
22
+ "transformers_version": "4.57.6",
23
+ "vocab_size": 33
24
+ }
model.py ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import torch
3
+ import torch.nn as nn
4
+ import torch.nn.functional as F
5
+ from transformers import PretrainedConfig, PreTrainedModel
6
+ from transformers.modeling_outputs import CausalLMOutputWithPast
7
+
8
def rotate_half(x):
    """Rotate the last dimension by half: (x1, x2) -> (-x2, x1).

    Helper for rotary position embeddings; pairs each feature with the one
    half a head-dimension away.
    """
    half = x.shape[-1] // 2
    first, second = x[..., :half], x[..., half:]
    return torch.cat((-second, first), dim=-1)
12
+
13
def apply_rope(q, k):
    """Apply rotary position embeddings (RoPE) to query and key tensors.

    Expects tensors shaped (..., seq_len, head_dim); positions are derived
    from the sequence axis, so no explicit position ids are needed. Returns
    the rotated (q, k) pair.
    """
    head_dim = q.shape[-1]
    seq_len = q.shape[-2]
    # Inverse frequencies over even feature indices, classic 10000^(-2i/d).
    inv_freq = 1.0 / (10000 ** (torch.arange(0, head_dim, 2, device=q.device).float() / head_dim))
    positions = torch.arange(seq_len, device=q.device).float()
    angles = torch.outer(positions, inv_freq)
    angles = torch.cat((angles, angles), dim=-1)
    # Broadcast over (batch, heads) leading dimensions.
    cos = angles.cos()[None, None, :, :]
    sin = angles.sin()[None, None, :, :]

    def _rotate(t):
        a, b = t[..., : head_dim // 2], t[..., head_dim // 2 :]
        return torch.cat((-b, a), dim=-1)

    return (q * cos) + (_rotate(q) * sin), (k * cos) + (_rotate(k) * sin)
26
+
27
+
28
class ChessConfig(PretrainedConfig):
    """Hyper-parameters for the chess transformer (GPT-style decoder with RoPE)."""

    model_type = "chess_transformer"

    def __init__(
        self,
        vocab_size=1200,
        n_embd=104,
        n_layer=8,
        n_head=8,
        n_ctx=256,
        n_inner=None,
        dropout=0.15,
        layer_norm_epsilon=1e-5,
        tie_weights=True,
        pad_token_id=0,
        bos_token_id=1,
        eos_token_id=2,
        **kwargs,
    ):
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            **kwargs,
        )
        self.vocab_size = vocab_size
        self.n_embd = n_embd
        self.n_layer = n_layer
        self.n_head = n_head
        self.n_ctx = n_ctx
        # MLP width defaults to 2.5x the embedding size when not given explicitly.
        self.n_inner = int(2.5 * n_embd) if n_inner is None else n_inner
        self.dropout = dropout
        self.layer_norm_epsilon = layer_norm_epsilon
        self.tie_weights = tie_weights
        # Signal to the HF base class that input/output embeddings share weights.
        self.tie_word_embeddings = True
42
+
43
class MultiHeadAttention(nn.Module):
    """Causal multi-head self-attention with rotary position embeddings.

    Uses a fused QKV projection (`c_attn`) and an output projection
    (`c_proj`); a lower-triangular mask buffer enforces causality up to
    `config.n_ctx` positions.
    """

    def __init__(self, config):
        super().__init__()
        assert config.n_embd % config.n_head == 0
        self.n_head = config.n_head
        self.n_embd = config.n_embd
        self.head_dim = config.n_embd // config.n_head
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd)
        self.c_proj = nn.Linear(config.n_embd, config.n_embd)
        self.dropout = nn.Dropout(config.dropout)
        # Cached causal mask; non-persistent so it never lands in checkpoints.
        causal = torch.tril(torch.ones(config.n_ctx, config.n_ctx))
        self.register_buffer(
            "bias", causal.view(1, 1, config.n_ctx, config.n_ctx), persistent=False
        )

    def forward(self, x, attention_mask=None):
        bsz, q_len, _ = x.size()

        def _split_heads(t):
            # (bsz, seq, embd) -> (bsz, heads, seq, head_dim)
            return t.view(bsz, q_len, self.n_head, self.head_dim).transpose(1, 2)

        query, key, value = (
            _split_heads(t) for t in self.c_attn(x).split(self.n_embd, dim=2)
        )
        query, key = apply_rope(query, key)
        scores = (query @ key.transpose(-2, -1)) / math.sqrt(self.head_dim)
        # Causal masking first, then padding mask (1 = keep, 0 = mask out).
        scores = scores.masked_fill(self.bias[:, :, :q_len, :q_len] == 0, float("-inf"))
        if attention_mask is not None:
            pad_mask = attention_mask.unsqueeze(1).unsqueeze(2)
            scores = scores.masked_fill(pad_mask == 0, float("-inf"))
        probs = self.dropout(F.softmax(scores, dim=-1))
        context = (probs @ value).transpose(1, 2).contiguous().view(bsz, q_len, self.n_embd)
        return self.c_proj(context)
74
+
75
class FeedForward(nn.Module):
    """Position-wise MLP: linear expansion -> GELU -> projection -> dropout."""

    def __init__(self, config):
        super().__init__()
        self.c_fc = nn.Linear(config.n_embd, config.n_inner)
        self.c_proj = nn.Linear(config.n_inner, config.n_embd)
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, x):
        hidden = F.gelu(self.c_fc(x))
        return self.dropout(self.c_proj(hidden))
87
+
88
class TransformerBlock(nn.Module):
    """Pre-norm decoder block: x + attn(ln_1(x)), then x + mlp(ln_2(x))."""

    def __init__(self, config):
        super().__init__()
        self.ln_1 = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
        self.attn = MultiHeadAttention(config)
        self.ln_2 = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
        self.mlp = FeedForward(config)

    def forward(self, x, attention_mask=None):
        attn_out = self.attn(self.ln_1(x), attention_mask=attention_mask)
        x = x + attn_out
        return x + self.mlp(self.ln_2(x))
99
+
100
class ChessForCausalLM(PreTrainedModel):
    """GPT-style decoder-only language model over chess-move tokens.

    Wired for Hugging Face ``Auto*`` loading via the ``auto_map`` entry in
    config.json. Input and output embeddings share weights when
    ``config.tie_weights`` is set. No KV cache is implemented:
    ``past_key_values`` in the output is always None, so generation
    recomputes the full prefix each step.
    """
    config_class = ChessConfig
    base_model_prefix = "transformer"
    supports_gradient_checkpointing = True
    # With tied weights, checkpoints legitimately omit lm_head.weight;
    # suppress the missing-key warning on load.
    keys_to_ignore_on_load_missing = ["lm_head.weight"]

    def __init__(self, config):
        super().__init__(config)
        # Token embedding table (also serves as the LM head when tied).
        self.wte = nn.Embedding(config.vocab_size, config.n_embd)
        self.drop = nn.Dropout(config.dropout)
        self.h = nn.ModuleList([TransformerBlock(config) for _ in range(config.n_layer)])
        self.ln_f = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
        if config.tie_weights:
            # Tell the HF save/load machinery that lm_head.weight aliases wte.weight.
            self._tied_weights_keys = ["lm_head.weight"]
        self.post_init()
        if config.tie_weights:
            # Re-tie explicitly: post_init's weight init may have re-initialized lm_head.
            self.tie_weights()

    def get_input_embeddings(self):
        return self.wte

    def set_input_embeddings(self, new_embeddings):
        self.wte = new_embeddings
        if getattr(self.config, "tie_weights", False):
            # Keep lm_head pointing at the replacement embedding matrix.
            self.tie_weights()

    def get_output_embeddings(self):
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings

    def tie_weights(self):
        # Honors either flag; _tie_or_clone_weights makes lm_head share wte's weight.
        if getattr(self.config, "tie_weights", False) or getattr(self.config, "tie_word_embeddings", False):
            self._tie_or_clone_weights(self.lm_head, self.wte)

    def _init_weights(self, module):
        # GPT-2-style init: N(0, 0.02) for linear/embedding weights, zero biases,
        # identity LayerNorm.
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
        elif isinstance(module, nn.LayerNorm):
            torch.nn.init.ones_(module.weight)
            torch.nn.init.zeros_(module.bias)

    def forward(self, input_ids, attention_mask=None, position_ids=None, labels=None, return_dict=None, **kwargs):
        """Run the decoder and optionally compute the causal-LM loss.

        ``position_ids`` is accepted for HF API compatibility but unused:
        positions come from the rotary embeddings inside attention.
        ``attention_mask`` is 1 for real tokens, 0 for padding. ``labels``
        follows the HF convention with ignore_index -100.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        batch_size, seq_len = input_ids.size()
        token_embeds = self.wte(input_ids)
        hidden_states = self.drop(token_embeds)
        for block in self.h:
            hidden_states = block(hidden_states, attention_mask=attention_mask)
        hidden_states = self.ln_f(hidden_states)
        logits = self.lm_head(hidden_states)
        loss = None
        if labels is not None:
            # Standard next-token shift: predict token t+1 from positions <= t.
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            loss_fct = nn.CrossEntropyLoss(ignore_index=-100)
            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
        if not return_dict:
            output = (logits,)
            return ((loss,) + output) if loss is not None else output
        return CausalLMOutputWithPast(loss=loss, logits=logits, past_key_values=None, hidden_states=None, attentions=None)
159
+
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:605ee3e104ab4399da70b315787459ee6001bbabdea4c4737f2022db410c208a
3
+ size 3007720
special_tokens_map.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "[BOS]",
3
+ "eos_token": "[EOS]",
4
+ "pad_token": "[PAD]",
5
+ "unk_token": "[UNK]"
6
+ }
tokenizer.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ from transformers import PreTrainedTokenizer
4
+
5
+
6
class ChessTokenizer(PreTrainedTokenizer):
    """Whitespace tokenizer over a closed vocabulary of encoded chess moves.

    Vocabulary entries look like "WPe2e4" (side, piece, from-square,
    to-square, with optional "(x)"/"(+)" suffixes — see vocab.json).
    Unknown tokens map to [UNK]; decoding drops all special tokens.
    """
    model_input_names = ["input_ids", "attention_mask"]
    vocab_files_names = {"vocab_file": "vocab.json"}
    PAD_TOKEN = "[PAD]"
    BOS_TOKEN = "[BOS]"
    EOS_TOKEN = "[EOS]"
    UNK_TOKEN = "[UNK]"

    def __init__(self, vocab_file=None, vocab=None, **kwargs):
        # NOTE: the private token attributes and the vocab must be set BEFORE
        # super().__init__ runs — the PreTrainedTokenizer constructor may call
        # back into tokenization methods that read them.
        self._pad_token = self.PAD_TOKEN
        self._bos_token = self.BOS_TOKEN
        self._eos_token = self.EOS_TOKEN
        self._unk_token = self.UNK_TOKEN
        # Discard caller-supplied special tokens so the class-level ones always win.
        kwargs.pop("pad_token", None)
        kwargs.pop("bos_token", None)
        kwargs.pop("eos_token", None)
        kwargs.pop("unk_token", None)
        # Vocab resolution order: explicit dict > vocab file on disk > default.
        if vocab is not None:
            self._vocab = vocab
        elif vocab_file is not None and os.path.exists(vocab_file):
            with open(vocab_file, "r", encoding="utf-8") as f:
                self._vocab = json.load(f)
        else:
            # Minimal fallback vocabulary of just the four special tokens.
            self._vocab = self._create_default_vocab()
        self._ids_to_tokens = {v: k for k, v in self._vocab.items()}
        super().__init__(pad_token=self._pad_token, bos_token=self._bos_token, eos_token=self._eos_token, unk_token=self._unk_token, **kwargs)

    def _create_default_vocab(self):
        """Return {token: id} for the special tokens, ids 0..3."""
        special_tokens = [self.PAD_TOKEN, self.BOS_TOKEN, self.EOS_TOKEN, self.UNK_TOKEN]
        return {token: idx for idx, token in enumerate(special_tokens)}

    @property
    def vocab_size(self):
        return len(self._vocab)

    def get_vocab(self):
        # Return a copy so callers can't mutate internal state.
        return dict(self._vocab)

    def _tokenize(self, text):
        # Moves are space-separated atoms; no sub-word splitting.
        return text.strip().split()

    def _convert_token_to_id(self, token):
        # Fall back to [UNK]'s id, or 0 if even [UNK] is absent from the vocab.
        return self._vocab.get(token, self._vocab.get(self.UNK_TOKEN, 0))

    def _convert_id_to_token(self, index):
        return self._ids_to_tokens.get(index, self.UNK_TOKEN)

    def convert_tokens_to_string(self, tokens):
        # Special tokens are silently dropped from decoded text.
        special = {self.PAD_TOKEN, self.BOS_TOKEN, self.EOS_TOKEN, self.UNK_TOKEN}
        return " ".join(t for t in tokens if t not in special)

    def save_vocabulary(self, save_directory, filename_prefix=None):
        """Write the vocab to <prefix->vocab.json in save_directory; return its path."""
        if not os.path.isdir(save_directory):
            os.makedirs(save_directory, exist_ok=True)
        vocab_file = os.path.join(save_directory, (filename_prefix + "-" if filename_prefix else "") + "vocab.json")
        with open(vocab_file, "w", encoding="utf-8") as f:
            json.dump(self._vocab, f, ensure_ascii=False, indent=2)
        return (vocab_file,)
55
+
tokenizer_config.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_map": {
3
+ "AutoTokenizer": [
4
+ "tokenizer.ChessTokenizer",
5
+ null
6
+ ]
7
+ },
8
+ "tokenizer_class": "ChessTokenizer",
9
+ "bos_token": "[BOS]",
10
+ "eos_token": "[EOS]",
11
+ "pad_token": "[PAD]",
12
+ "unk_token": "[UNK]"
13
+ }
vocab.json ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "[PAD]": 0,
3
+ "[BOS]": 1,
4
+ "[EOS]": 2,
5
+ "[UNK]": 3,
6
+ "WNb1c3": 4,
7
+ "BPe7e5": 5,
8
+ "WPe2e4": 6,
9
+ "BBf8c5": 7,
10
+ "WQd1h5": 8,
11
+ "BNg8f6": 9,
12
+ "WQh5e5(x+)": 10,
13
+ "BBc5e7": 11,
14
+ "WPd2d3": 12,
15
+ "BPd7d6": 13,
16
+ "WQe5f4": 14,
17
+ "BNf6h5": 15,
18
+ "WQf4f3": 16,
19
+ "BNh5f6": 17,
20
+ "WQf3g3": 18,
21
+ "BNf6g4": 19,
22
+ "WBf1e2": 20,
23
+ "BPh7h5": 21,
24
+ "WBe2g4(x)": 22,
25
+ "BBc8g4(x)": 23,
26
+ "WPf2f3": 24,
27
+ "BPh5h4": 25,
28
+ "WQg3g4(x)": 26,
29
+ "BPg7g6": 27,
30
+ "WNc3d5": 28,
31
+ "BBe7f8": 29,
32
+ "WBc1g5": 30,
33
+ "BQd8d7": 31,
34
+ "WNd5f6(+)": 32
35
+ }