Eithannak committed
Commit 71b2e68 · verified · Parent: e37798c

Chess Challenge submission by Eithannak
Files changed (6)
  1. README.md +4 -4
  2. config.json +5 -5
  3. pytorch_model.bin +3 -0
  4. tokenizer.py +154 -0
  5. tokenizer_config.json +4 -1
  6. vocab.json +0 -0
README.md CHANGED
@@ -14,13 +14,13 @@ Chess model submitted to the LLM Course Chess Challenge.
 ## Submission Info
 
 - **Submitted by**: [Eithannak](https://huggingface.co/Eithannak)
-- **Parameters**: 997,872
+- **Parameters**: 965,040
 - **Organization**: LLM-course
 
 ## Model Details
 
 - **Architecture**: Chess Transformer (GPT-style)
-- **Vocab size**: 5581
-- **Embedding dim**: 96
-- **Layers**: 5
+- **Vocab size**: 512
+- **Embedding dim**: 120
+- **Layers**: 6
 - **Heads**: 4
config.json CHANGED
@@ -9,13 +9,13 @@
   "layer_norm_epsilon": 1e-05,
   "model_type": "chess_transformer",
   "n_ctx": 256,
-  "n_embd": 96,
+  "n_embd": 120,
   "n_head": 4,
-  "n_inner": 192,
-  "n_layer": 5,
+  "n_inner": 360,
+  "n_layer": 6,
   "pad_token_id": 0,
-  "rope_theta": 10000.0,
   "tie_weights": true,
   "transformers_version": "4.57.5",
-  "vocab_size": 5581
+  "unk_token_id": 3,
+  "vocab_size": 512
 }
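
The new hyperparameters are consistent with the updated README parameter count. Below is a minimal sanity check, assuming a GPT-2-style block layout with biases, learned positional embeddings over n_ctx positions (plausible, since rope_theta was removed), and an output head tied to the token embeddings:

V, E, L, I, CTX = 512, 120, 6, 360, 256   # from config.json

tok_emb = V * E                           # 61,440 token embeddings
pos_emb = CTX * E                         # 30,720 (assumes learned positions)
attn = 3 * (E * E + E) + (E * E + E)      # QKV plus output projection
mlp = (E * I + I) + (I * E + E)           # up- and down-projection
ln = 2 * (2 * E)                          # two LayerNorms per block
final_ln = 2 * E

total = tok_emb + pos_emb + L * (attn + mlp + ln) + final_ln
print(total)  # 965,040, matching the README

With "tie_weights": true the output head shares the token embedding matrix and adds no parameters.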
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b8ee69ac57bb9c15cff3921b710f4b63230e178ce1d73a156fcea9b3e45e00de
+size 3881771
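
The pointer's size field is roughly in line with the parameter count above, assuming float32 storage; the remaining ~21 KB would plausibly be checkpoint serialization overhead (an assumption, not verified against the file):

print(965_040 * 4)  # 3,860,160 bytes vs. the pointer's 3,881,771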
tokenizer.py ADDED
@@ -0,0 +1,154 @@
+from __future__ import annotations
+
+import json
+import os
+import re
+import shutil
+from collections import Counter
+
+from datasets import load_dataset
+from transformers import PreTrainedTokenizer
+
+# A move carries coordinates as <from-square><to-square>, e.g. "e2e4".
+SQUARE_MOVE_PATTERN = re.compile(r"([a-h][1-8])([a-h][1-8])")
+PROMOTION_PATTERN = re.compile(r"=([NBRQ])")
+
+
+def normalize_move(token: str) -> str:
+    """Canonicalize a move token to <piece><from><to>[=<promotion>] form."""
+    # Special tokens such as [BOS] pass through unchanged.
+    if token.startswith("["):
+        return token
+
+    move_match = SQUARE_MOVE_PATTERN.search(token)
+    if not move_match:
+        return token
+
+    from_sq, to_sq = move_match.group(1), move_match.group(2)
+
+    promotion_suffix = ""
+    promo_match = PROMOTION_PATTERN.search(token)
+    if promo_match:
+        promotion_suffix = "=" + promo_match.group(1)
+
+    # The first two characters carry the piece (e.g. "WP"); fall back to a
+    # white pawn if the token is too short to carry a prefix.
+    piece_prefix = token[:2] if len(token) >= 2 else "WP"
+
+    return f"{piece_prefix}{from_sq}{to_sq}{promotion_suffix}"
+
+
+class ChessTokenizer(PreTrainedTokenizer):
+    model_input_names = ["input_ids", "attention_mask"]
+    vocab_files_names = {"vocab_file": "vocab.json"}
+
+    PAD_TOKEN = "[PAD]"
+    BOS_TOKEN = "[BOS]"
+    EOS_TOKEN = "[EOS]"
+    UNK_TOKEN = "[UNK]"
+
+    def __init__(self, vocab_file=None, vocab=None, **kwargs):
+        self._pad_token = self.PAD_TOKEN
+        self._bos_token = self.BOS_TOKEN
+        self._eos_token = self.EOS_TOKEN
+        self._unk_token = self.UNK_TOKEN
+
+        # Drop caller-supplied special tokens so the class constants win.
+        for t in ["pad_token", "bos_token", "eos_token", "unk_token"]:
+            kwargs.pop(t, None)
+
+        if vocab is None:
+            if vocab_file is None:
+                vocab_file = os.path.join(os.path.dirname(__file__), "vocab.json")
+            self.vocab_file = vocab_file
+            if os.path.exists(vocab_file):
+                with open(vocab_file, "r", encoding="utf-8") as f:
+                    self._vocab = json.load(f)
+            else:
+                self._vocab = self._create_default_vocab()
+        else:
+            self._vocab = vocab
+            self.vocab_file = vocab_file
+
+        # The vocab must exist before super().__init__, which consults it
+        # when registering the special tokens.
+        self._ids_to_tokens = {v: k for k, v in self._vocab.items()}
+        super().__init__(
+            pad_token=self.PAD_TOKEN,
+            bos_token=self.BOS_TOKEN,
+            eos_token=self.EOS_TOKEN,
+            unk_token=self.UNK_TOKEN,
+            **kwargs,
+        )
+
+    def save_pretrained(self, save_directory: str, **kwargs):
+        super().save_pretrained(save_directory, **kwargs)
+        # Ship this module with the checkpoint so auto_map can import it.
+        src_path = os.path.abspath(__file__)
+        dst_path = os.path.join(save_directory, "tokenizer.py")
+        if src_path != dst_path:
+            shutil.copy(src_path, dst_path)
+
+        config_path = os.path.join(save_directory, "tokenizer_config.json")
+        if os.path.exists(config_path):
+            with open(config_path, "r") as f:
+                cfg = json.load(f)
+            cfg["auto_map"] = {"AutoTokenizer": "tokenizer.ChessTokenizer"}
+            with open(config_path, "w") as f:
+                json.dump(cfg, f, indent=2)
+
+    def _create_default_vocab(self):
+        return {
+            t: i
+            for i, t in enumerate([self.PAD_TOKEN, self.BOS_TOKEN, self.EOS_TOKEN, self.UNK_TOKEN])
+        }
+
+    @classmethod
+    def build_vocab_from_dataset(
+        cls,
+        dataset_name,
+        split="train",
+        column="text",
+        max_vocab_size=512,
+        min_frequency=500,
+        max_samples=100000,
+    ):
+        ds = load_dataset(dataset_name, split=split, streaming=True)
+        ds = ds.take(max_samples)
+
+        counter = Counter()
+        for ex in ds:
+            moves = [normalize_move(t) for t in ex[column].split()]
+            counter.update(moves)
+
+        # Reserve slots for the special tokens, then keep the most frequent
+        # moves that clear the min_frequency threshold.
+        special = [cls.PAD_TOKEN, cls.BOS_TOKEN, cls.EOS_TOKEN, cls.UNK_TOKEN]
+        most_common = [
+            (t, c)
+            for t, c in counter.most_common(max_vocab_size - len(special))
+            if c >= min_frequency
+        ]
+
+        vocab = {t: i for i, t in enumerate(special + [t for t, c in most_common])}
+        return cls(vocab=vocab)
+
+    @property
+    def vocab_size(self):
+        return len(self._vocab)
+
+    def get_vocab(self):
+        return dict(self._vocab)
+
+    def _tokenize(self, text):
+        return [normalize_move(t) for t in text.strip().split()]
+
+    def _convert_token_to_id(self, token):
+        return self._vocab.get(token, self._vocab.get(self.UNK_TOKEN))
+
+    def _convert_id_to_token(self, index):
+        return self._ids_to_tokens.get(index, self.UNK_TOKEN)
+
+    def convert_tokens_to_string(self, tokens):
+        # Special tokens are dropped when rendering back to a move string.
+        return " ".join(
+            t
+            for t in tokens
+            if t not in [self.PAD_TOKEN, self.BOS_TOKEN, self.EOS_TOKEN, self.UNK_TOKEN]
+        )
+
+    def save_vocabulary(self, save_directory, filename_prefix=None):
+        if not os.path.isdir(save_directory):
+            os.makedirs(save_directory, exist_ok=True)
+        path = os.path.join(
+            save_directory, (filename_prefix + "-" if filename_prefix else "") + "vocab.json"
+        )
+        with open(path, "w", encoding="utf-8") as f:
+            json.dump(self._vocab, f, ensure_ascii=False, indent=2)
+        return (path,)
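
For reference, a minimal usage sketch of the tokenizer above. The move strings are hypothetical, inferred from normalize_move's convention of a two-character piece prefix (e.g. "WP") followed by from/to squares:

from tokenizer import ChessTokenizer  # assumes a local checkout of this repo

tok = ChessTokenizer(vocab_file="vocab.json")
enc = tok("WPe2e4 BPe7e5")            # hypothetical move tokens
print(enc["input_ids"])
print(tok.decode(enc["input_ids"]))   # special tokens are filtered back out

Tokens missing from vocab.json fall back to [UNK], whose id (3, after [PAD], [BOS], [EOS]) matches the "unk_token_id": 3 added to config.json.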
tokenizer_config.json CHANGED
@@ -33,6 +33,9 @@
       "special": true
     }
   },
+  "auto_map": {
+    "AutoTokenizer": "tokenizer.ChessTokenizer"
+  },
   "bos_token": "[BOS]",
   "clean_up_tokenization_spaces": false,
   "eos_token": "[EOS]",
@@ -41,4 +44,4 @@
   "pad_token": "[PAD]",
   "tokenizer_class": "ChessTokenizer",
   "unk_token": "[UNK]"
-}
+}
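
The added auto_map entry is what lets the Auto API resolve the custom class shipped in tokenizer.py; because that executes repo code, loading requires opting in. A sketch, with a placeholder repo id:

from transformers import AutoTokenizer

# "LLM-course/chess-challenge-Eithannak" is a hypothetical repo id.
tok = AutoTokenizer.from_pretrained(
    "LLM-course/chess-challenge-Eithannak",
    trust_remote_code=True,  # required because ChessTokenizer ships as repo code
)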
vocab.json CHANGED
The diff for this file is too large to render. See raw diff