StentorLabs committed
Commit 16c1cde (verified) · Parent: 546303d

Auto-push latest Stentor2-12M checkpoint
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ tokenmonster.vocab filter=lfs diff=lfs merge=lfs -text
.ready ADDED
@@ -0,0 +1 @@
+ ready
README.md CHANGED
@@ -1,3 +1,28 @@
- ---
- license: apache-2.0
- ---
+ # Stentor-12M
+
+ This checkpoint was produced by the Stentor-12M training pipeline.
+
+ ## Metadata
+
+ ```json
+ {
+   "hidden_size": 256,
+   "intermediate_size": 768,
+   "learning_rate": 0.0008,
+   "max_position_embeddings": 1024,
+   "mixed_precision": "fp16",
+   "model_arch": "llama",
+   "num_attention_heads": 4,
+   "num_hidden_layers": 12,
+   "num_key_value_heads": 4,
+   "optimizer": "adamw",
+   "pad_vocab_to_multiple": 128,
+   "rope_theta": null,
+   "scheduler": "cosine",
+   "stable_ratio": 0.8,
+   "torch_compile": false,
+   "vocab_size": 8064,
+   "warmup_ratio": 0.05,
+   "weight_decay": 0.01
+ }
+ ```
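Together with the files added below, the checkpoint loads as a standard `transformers` causal LM with a custom remote-code tokenizer. A minimal loading sketch, assuming the hypothetical repo id `StentorLabs/Stentor2-12M` (substitute the actual Hub path) and that the `tokenmonster` package is installed:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

repo_id = "StentorLabs/Stentor2-12M"  # assumption: replace with the real repo id

# trust_remote_code is needed because tokenizer_config.json maps AutoTokenizer
# to the custom tokenmonster_hf.TokenMonsterTokenizer shipped in this commit.
tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(repo_id)

inputs = tokenizer("The quick brown fox", return_tensors="pt")
output = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```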
chat_template.jinja ADDED
@@ -0,0 +1,4 @@
+ {% for message in messages %}<|{{ message['role'] }}|>
+ {{ message['content'] }}
+ {% endfor %}{% if add_generation_prompt %}<|assistant|>
+ {% endif %}
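The template wraps each turn in `<|role|>` markers and appends `<|assistant|>` when a generation prompt is requested. A sketch of the expected rendering through `apply_chat_template` (the repo id is hypothetical):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    "StentorLabs/Stentor2-12M",  # assumption: replace with the real repo id
    trust_remote_code=True,
)
text = tokenizer.apply_chat_template(
    [{"role": "user", "content": "What is a cloud?"}],
    tokenize=False,
    add_generation_prompt=True,
)
# Expected rendering:
# <|user|>
# What is a cloud?
# <|assistant|>
print(text)
```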
config.json ADDED
@@ -0,0 +1,32 @@
+ {
+   "architectures": [
+     "LlamaForCausalLM"
+   ],
+   "attention_bias": false,
+   "attention_dropout": 0.0,
+   "bos_token_id": 7886,
+   "dtype": "float32",
+   "eos_token_id": 7887,
+   "head_dim": 64,
+   "hidden_act": "silu",
+   "hidden_size": 256,
+   "initializer_range": 0.02,
+   "intermediate_size": 768,
+   "max_position_embeddings": 1024,
+   "mlp_bias": false,
+   "model_type": "llama",
+   "num_attention_heads": 4,
+   "num_hidden_layers": 12,
+   "num_key_value_heads": 4,
+   "pad_token_id": 7889,
+   "pretraining_tp": 1,
+   "rms_norm_eps": 1e-05,
+   "rope_parameters": {
+     "rope_theta": 10000.0,
+     "rope_type": "default"
+   },
+   "tie_word_embeddings": true,
+   "transformers_version": "5.2.0",
+   "use_cache": true,
+   "vocab_size": 8064
+ }
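For orientation, the model size implied by these values can be estimated directly; a rough sketch assuming tied embeddings (`tie_word_embeddings` is true) and no biases (`attention_bias` and `mlp_bias` are false):

```python
# Back-of-the-envelope parameter count from the config values above.
vocab_size, hidden, intermediate, layers = 8064, 256, 768, 12

embedding = vocab_size * hidden                  # tied with the LM head, counted once
attention = 4 * hidden * hidden                  # q, k, v, o projections (4 KV heads == 4 heads)
mlp = 3 * hidden * intermediate                  # gate, up, down projections
norms = 2 * hidden                               # input + post-attention RMSNorm weights
per_layer = attention + mlp + norms

total = embedding + layers * per_layer + hidden  # plus the final RMSNorm
print(f"{total:,}")  # 12,294,400 (~12.3M), consistent with the ~49 MB float32 safetensors file
```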
generation_config.json ADDED
@@ -0,0 +1,10 @@
+ {
+   "_from_model_config": true,
+   "bos_token_id": 7886,
+   "eos_token_id": 7887,
+   "output_attentions": false,
+   "output_hidden_states": false,
+   "pad_token_id": 7889,
+   "transformers_version": "5.2.0",
+   "use_cache": true
+ }
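These defaults are picked up by `model.generate()` automatically; they can also be inspected on their own (the repo id below is hypothetical):

```python
from transformers import GenerationConfig

gen_config = GenerationConfig.from_pretrained("StentorLabs/Stentor2-12M")  # assumption: real repo id
print(gen_config.bos_token_id, gen_config.eos_token_id, gen_config.pad_token_id)  # 7886 7887 7889
```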
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d6c39fcd467d0534c820c929fcae2ce55f2cf306735ad0a238fbf527b2ca7eba
+ size 49189568
tokenizer_config.json ADDED
@@ -0,0 +1,50 @@
+ {
+   "added_tokens_decoder": {
+     "7886": {
+       "content": "<s>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "7887": {
+       "content": "</s>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "7888": {
+       "content": "<unk>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "7889": {
+       "content": "<pad>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "auto_map": {
+     "AutoTokenizer": [
+       "tokenmonster_hf.TokenMonsterTokenizer",
+       null
+     ]
+   },
+   "backend": "custom",
+   "bos_token": "<s>",
+   "eos_token": "</s>",
+   "model_max_length": 1000000,
+   "pad_token": "<pad>",
+   "tokenizer_class": "TokenMonsterTokenizer",
+   "unk_token": "<unk>",
+   "vocab_file": "tokenmonster.vocab"
+ }
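The `auto_map` entry is what routes `AutoTokenizer` to the custom class in `tokenmonster_hf.py`, which is why loading requires `trust_remote_code=True`. A quick check of the declared special-token ids (the repo id is hypothetical):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    "StentorLabs/Stentor2-12M",  # assumption: replace with the real repo id
    trust_remote_code=True,
)
for token in ("<s>", "</s>", "<unk>", "<pad>"):
    print(token, tokenizer.convert_tokens_to_ids(token))
# Expected ids per added_tokens_decoder: 7886, 7887, 7888, 7889
```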
tokenmonster.vocab ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:aae1dba4cd80a638125689c559726706dccf04563a4aad141ad040b13fcb3279
+ size 210283
tokenmonster_hf.py ADDED
@@ -0,0 +1,589 @@
+ import importlib
+ import inspect
+ import json
+ import os
+ import shutil
+ import subprocess
+ import sys
+ import tempfile
+ from typing import Dict, Iterable, List, Optional, Sequence, Tuple
+
+ import numpy as np
+ from huggingface_hub import hf_hub_download
+ from transformers import PreTrainedTokenizer
+ from transformers.utils import cached_file
+
+ try:
+     import tokenmonster
+ except ImportError:
+     # Resolved lazily by _ensure_tokenmonster_available() when not installed.
+     tokenmonster = None
+
+ def _prepare_tokenmonster_runtime(tokenmonster_module) -> None:
+     candidate_roots = [
+         os.environ.get("STENTOR_TOKENMONSTER_DIR"),
+         "/root/.cache" if os.path.isdir("/root/.cache") else None,
+         os.getcwd(),
+         os.environ.get("HOME"),
+         os.environ.get("TMPDIR"),
+         tempfile.gettempdir(),
+     ]
+     runtime_dir = None
+     for root in candidate_roots:
+         if not root:
+             continue
+         try:
+             os.makedirs(root, exist_ok=True)
+         except Exception:
+             continue
+         if not (os.access(root, os.W_OK) and os.access(root, os.X_OK)):
+             continue
+         runtime_dir = os.path.join(root, "stentor_tokenmonster_runtime")
+         break
+     if runtime_dir is None:
+         raise SystemExit(
+             "Unable to find a writable/executable directory for TokenMonster runtime files."
+         )
+     rank_suffix = os.environ.get("LOCAL_RANK") or os.environ.get("RANK")
+     if rank_suffix is not None and str(rank_suffix).strip() != "":
+         runtime_dir = os.path.join(runtime_dir, f"rank_{rank_suffix}")
+     os.makedirs(runtime_dir, exist_ok=True)
+     if hasattr(tokenmonster_module, "set_local_directory"):
+         tokenmonster_module.set_local_directory(runtime_dir)
+
+     server_name = "tokenmonsterserver"
+     os_name = None
+     if hasattr(tokenmonster_module, "_get_binary_filename"):
+         try:
+             os_name, server_name = tokenmonster_module._get_binary_filename()
+         except Exception:
+             os_name, server_name = None, "tokenmonsterserver"
+     server_path = os.path.join(runtime_dir, server_name)
+     if (not os.path.exists(server_path)) and os_name:
+         try:
+             cached_server = hf_hub_download(
+                 repo_id=DEFAULT_TOKENMONSTER_REPO,
+                 filename=f"binaries/{os_name}/{server_name}",
+                 token=resolve_hf_token(),
+             )
+             shutil.copy2(cached_server, server_path)
+         except Exception:
+             pass
+     if os.path.exists(server_path):
+         try:
+             os.chmod(server_path, 0o755)
+         except Exception:
+             pass
+         if not os.access(server_path, os.X_OK):
+             raise SystemExit(
+                 f"TokenMonster server binary is not executable after chmod: {server_path}"
+             )
+
+
+ def _repair_tokenmonster_server(tokenmonster_module) -> None:
+     _prepare_tokenmonster_runtime(tokenmonster_module)
+     vocab_cls = getattr(tokenmonster_module, "Vocab", None)
+     if vocab_cls is None:
+         return
+
+     runtime_dir = getattr(vocab_cls, "_dir", None)
+     server_name = None
+     os_name = None
+     if hasattr(tokenmonster_module, "_get_binary_filename"):
+         try:
+             os_name, server_name = tokenmonster_module._get_binary_filename()
+         except Exception:
+             os_name, server_name = None, None
+     if not runtime_dir or not server_name:
+         return
+
+     server_path = os.path.join(runtime_dir, server_name)
+     try:
+         if os.path.exists(server_path):
+             os.remove(server_path)
+     except Exception:
+         pass
+
+     if os_name:
+         try:
+             cached_server = hf_hub_download(
+                 repo_id=DEFAULT_TOKENMONSTER_REPO,
+                 filename=f"binaries/{os_name}/{server_name}",
+                 token=resolve_hf_token(),
+             )
+             shutil.copy2(cached_server, server_path)
+         except Exception:
+             pass
+
+     if os.path.exists(server_path):
+         try:
+             os.chmod(server_path, 0o755)
+         except Exception:
+             pass
+
+
+ def _ensure_tokenmonster_available():
+     global tokenmonster
+     if tokenmonster is not None:
+         _prepare_tokenmonster_runtime(tokenmonster)
+         return tokenmonster
+     try:
+         subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "tokenmonster"])
+         tokenmonster = importlib.import_module("tokenmonster")
+         _prepare_tokenmonster_runtime(tokenmonster)
+         return tokenmonster
+     except Exception as exc:
+         raise SystemExit(
+             "tokenmonster is required to use TokenMonster vocab files. "
+             "Automatic install failed. Install with `pip install tokenmonster`."
+         ) from exc
+
+
+ def _tokenmonster_remote_module_source() -> str:
+     imports = [
+         "import importlib",
+         "import inspect",
+         "import json",
+         "import os",
+         "import shutil",
+         "import subprocess",
+         "import sys",
+         "from typing import Dict, Iterable, List, Optional, Sequence, Tuple",
+         "",
+         "import numpy as np",
+         "from transformers import PreTrainedTokenizer",
+         "from transformers.utils import cached_file",
+     ]
+     try:
+         prepare_source = inspect.getsource(_prepare_tokenmonster_runtime)
+         repair_source = inspect.getsource(_repair_tokenmonster_server)
+         helper_source = inspect.getsource(_ensure_tokenmonster_available)
+         remote_source = inspect.getsource(_tokenmonster_remote_module_source)
+         class_source = inspect.getsource(TokenMonsterTokenizer)
+     except (OSError, TypeError):
+         source_text = _load_current_source_text()
+         prepare_source = _extract_source_block(
+             source_text,
+             "def _prepare_tokenmonster_runtime(tokenmonster_module) -> None:",
+             ["def _repair_tokenmonster_server(", "def print_hardware_diagnostic(", "def _ensure_tokenmonster_available("],
+         )
+         repair_source = _extract_source_block(
+             source_text,
+             "def _repair_tokenmonster_server(tokenmonster_module) -> None:",
+             ["def print_hardware_diagnostic(", "def _ensure_tokenmonster_available("],
+         )
+         helper_source = _extract_source_block(
+             source_text,
+             "def _ensure_tokenmonster_available():",
+             ["def _extract_source_block(", "def _tokenmonster_remote_module_source("],
+         )
+         extract_source = _extract_source_block(
+             source_text,
+             "def _extract_source_block(",
+             ["def _load_current_source_text(", "def _tokenmonster_remote_module_source("],
+         )
+         load_source = _extract_source_block(
+             source_text,
+             "def _load_current_source_text(",
+             ["def _tokenmonster_remote_module_source(", "class TokenMonsterTokenizer("],
+         )
+         remote_source = _extract_source_block(
+             source_text,
+             "def _tokenmonster_remote_module_source() -> str:",
+             ["class TokenMonsterTokenizer("],
+         )
+         class_source = _extract_source_block(
+             source_text,
+             "class TokenMonsterTokenizer(PreTrainedTokenizer):",
+             ["@dataclass", "\n\ndef parse_args("],
+         )
+         return "\n\n".join(
+             [
+                 "\n".join(imports),
+                 prepare_source.strip(),
+                 repair_source.strip(),
+                 helper_source.strip(),
+                 extract_source.strip(),
+                 load_source.strip(),
+                 remote_source.strip(),
+                 class_source.strip(),
+             ]
+         ) + "\n"
+     return "\n\n".join(
+         [
+             "\n".join(imports),
+             prepare_source,
+             repair_source,
+             helper_source,
+             remote_source,
+             class_source,
+         ]
+     ) + "\n"
+
+
+ class TokenMonsterTokenizer(PreTrainedTokenizer):
+     vocab_files_names = {"vocab_file": "tokenmonster.vocab"}
+     model_input_names = ["input_ids", "attention_mask"]
+
+     @classmethod
+     def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
+         if "vocab_file" not in kwargs:
+             resolved_vocab = None
+             if os.path.isdir(pretrained_model_name_or_path):
+                 candidate = os.path.join(
+                     pretrained_model_name_or_path,
+                     cls.vocab_files_names["vocab_file"],
+                 )
+                 if os.path.exists(candidate):
+                     resolved_vocab = candidate
+             if resolved_vocab is None:
+                 cached_kwargs = {
+                     key: kwargs[key]
+                     for key in (
+                         "cache_dir",
+                         "force_download",
+                         "local_files_only",
+                         "proxies",
+                         "resume_download",
+                         "revision",
+                         "subfolder",
+                         "token",
+                     )
+                     if key in kwargs
+                 }
+                 try:
+                     resolved_vocab = cached_file(
+                         pretrained_model_name_or_path,
+                         cls.vocab_files_names["vocab_file"],
+                         **cached_kwargs,
+                     )
+                 except Exception:
+                     resolved_vocab = None
+             if resolved_vocab is not None:
+                 kwargs["vocab_file"] = resolved_vocab
+         return super().from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
+
+     def __init__(self, vocab_file: str, model_max_length: int = 1_000_000, **kwargs) -> None:
+         name_or_path = kwargs.get("name_or_path")
+         if (
+             vocab_file
+             and not os.path.isabs(vocab_file)
+             and not os.path.exists(vocab_file)
+             and name_or_path
+             and os.path.isdir(name_or_path)
+         ):
+             candidate = os.path.join(name_or_path, vocab_file)
+             if os.path.exists(candidate):
+                 vocab_file = candidate
+         self.vocab_file_path = os.path.abspath(vocab_file)
+         self._tm = _ensure_tokenmonster_available()
+         try:
+             self._vocab = self._tm.load(self.vocab_file_path)
+         except RuntimeError as exc:
+             if "enable execute permissions" not in str(exc):
+                 raise
+             _repair_tokenmonster_server(self._tm)
+             self._vocab = self._tm.load(self.vocab_file_path)
+         self._token_to_id: Dict[str, int] = {}
+         self._id_to_token: Dict[int, str] = {}
+         self._vocab_size_with_gaps = 0
+         self._refresh_dictionary()
+
+         kwargs.setdefault("bos_token", "<s>")
+         kwargs.setdefault("eos_token", "</s>")
+         kwargs.setdefault("unk_token", "<unk>")
+         kwargs.setdefault("pad_token", "<pad>")
+         kwargs.setdefault("model_max_length", int(model_max_length))
+
+         super().__init__(vocab_file=self.vocab_file_path, **kwargs)
+         if not hasattr(self, "additional_special_tokens"):
+             self.additional_special_tokens = []
+         elif self.additional_special_tokens is None:
+             self.additional_special_tokens = []
+
+     @property
+     def vocab_size(self) -> int:
+         return self._vocab_size_with_gaps
+
+     def __len__(self) -> int:
+         return self._vocab_size_with_gaps
+
+     def get_vocab(self) -> Dict[str, int]:
+         return dict(self._token_to_id)
+
+     def prepare_for_tokenization(self, text, is_split_into_words: bool = False, **kwargs):
+         return (text, kwargs)
+
+     def _refresh_dictionary(self) -> None:
+         token_to_id: Dict[str, int] = {}
+         id_to_token: Dict[int, str] = {}
+         max_token_id = -1
+         entries = self._vocab.get_dictionary()
+         if isinstance(entries, dict):
+             entries = entries.values()
+         for entry in entries:
+             token = entry["token"]
+             if isinstance(token, bytes):
+                 token = token.decode("utf-8", errors="replace")
+             token_id = int(entry["id"])
+             token_to_id[str(token)] = token_id
+             id_to_token[token_id] = str(token)
+             if token_id > max_token_id:
+                 max_token_id = token_id
+         self._token_to_id = token_to_id
+         self._id_to_token = id_to_token
+         self._vocab_size_with_gaps = max(0, max_token_id + 1)
+
+     def _clear_added_token_state(self) -> None:
+         # Intentionally left as a no-op. Clearing HF's added-token state here causes
+         # additional/special tokens to disappear from saved checkpoints.
+         return
+
+     @staticmethod
+     def _normalize_token_value(token) -> Optional[str]:
+         if token is None:
+             return None
+         if hasattr(token, "content"):
+             return str(token.content)
+         return str(token)
+
+     @staticmethod
+     def _normalize_token_ids(token_ids) -> List[int]:
+         if isinstance(token_ids, np.ndarray):
+             return [int(x) for x in token_ids.tolist()]
+         if isinstance(token_ids, (list, tuple)):
+             return [int(x) for x in token_ids]
+         return [int(token_ids)]
+
+     def _add_backend_token(self, token: Optional[str], special: bool) -> bool:
+         if not token:
+             return False
+         if token in self._token_to_id:
+             return False
+         if special:
+             self._vocab.add_special_token(token)
+         else:
+             self._vocab.add_token(token)
+         self._refresh_dictionary()
+         return token in self._token_to_id
+
+     def _tokenize(self, text: str) -> List[str]:
+         ids = self._normalize_token_ids(self._vocab.tokenize(text))
+         unk_token = self.unk_token or "<unk>"
+         return [self._id_to_token.get(int(i), unk_token) for i in ids]
+
+     def _convert_token_to_id(self, token: str) -> Optional[int]:
+         if token is None:
+             return None
+         token_id = self._token_to_id.get(token)
+         if token_id is not None:
+             return int(token_id)
+         if self.unk_token is not None:
+             return self._token_to_id.get(self.unk_token)
+         return None
+
+     def _convert_id_to_token(self, index: int) -> str:
+         token = self._id_to_token.get(int(index))
+         if token is not None:
+             return token
+         return self.unk_token or "<unk>"
+
+     def convert_tokens_to_string(self, tokens: Sequence[str]) -> str:
+         token_ids: List[int] = []
+         for token in tokens:
+             token_id = self._convert_token_to_id(token)
+             if token_id is not None:
+                 token_ids.append(int(token_id))
+         if not token_ids:
+             return ""
+         return self._vocab.decode(token_ids)
+
+     def build_inputs_with_special_tokens(
+         self,
+         token_ids_0: List[int],
+         token_ids_1: Optional[List[int]] = None,
+     ) -> List[int]:
+         output = list(token_ids_0)
+         if self.bos_token_id is not None:
+             output = [self.bos_token_id] + output
+         if self.eos_token_id is not None:
+             output = output + [self.eos_token_id]
+         if token_ids_1 is None:
+             return output
+         second = list(token_ids_1)
+         if self.eos_token_id is not None:
+             second = second + [self.eos_token_id]
+         return output + second
+
+     def get_special_tokens_mask(
+         self,
+         token_ids_0: List[int],
+         token_ids_1: Optional[List[int]] = None,
+         already_has_special_tokens: bool = False,
+     ) -> List[int]:
+         special_ids = set(self.all_special_ids)
+         if already_has_special_tokens:
+             combined = list(token_ids_0)
+             return [1 if token in special_ids else 0 for token in combined]
+         mask = [0] * len(token_ids_0)
+         if self.bos_token_id is not None:
+             mask = [1] + mask
+         if self.eos_token_id is not None:
+             mask = mask + [1]
+         if token_ids_1 is None:
+             return mask
+         second_mask = [0] * len(token_ids_1)
+         if self.eos_token_id is not None:
+             second_mask = second_mask + [1]
+         return mask + second_mask
+
+     def create_token_type_ids_from_sequences(
+         self,
+         token_ids_0: List[int],
+         token_ids_1: Optional[List[int]] = None,
+     ) -> List[int]:
+         length = len(self.build_inputs_with_special_tokens(token_ids_0, token_ids_1))
+         return [0] * length
+
+     def add_tokens(self, new_tokens: Iterable[str], special_tokens: bool = False) -> int:
+         if isinstance(new_tokens, str):
+             new_tokens = [new_tokens]
+         added = 0
+         for token in new_tokens:
+             if self._add_backend_token(self._normalize_token_value(token), special_tokens):
+                 added += 1
+         return added
+
+     def add_special_tokens(
+         self,
+         special_tokens_dict: Dict[str, object],
+         replace_additional_special_tokens: bool = True,
+     ) -> int:
+         added = 0
+         single_token_keys = (
+             "unk_token",
+             "bos_token",
+             "eos_token",
+             "pad_token",
+             "sep_token",
+             "cls_token",
+             "mask_token",
+         )
+         for key in single_token_keys:
+             token = self._normalize_token_value(special_tokens_dict.get(key))
+             if token is None:
+                 continue
+             if self._add_backend_token(token, special=True):
+                 added += 1
+             setattr(self, key, token)
+
+         extras = special_tokens_dict.get("additional_special_tokens")
+         if extras is not None:
+             normalized_extras: List[str] = []
+             for token in extras:
+                 normalized = self._normalize_token_value(token)
+                 if normalized is None:
+                     continue
+                 if self._add_backend_token(normalized, special=True):
+                     added += 1
+                 if normalized not in normalized_extras:
+                     normalized_extras.append(normalized)
+             if replace_additional_special_tokens:
+                 self.additional_special_tokens = normalized_extras
+             else:
+                 merged = list(getattr(self, "additional_special_tokens", []) or [])
+                 for token in normalized_extras:
+                     if token not in merged:
+                         merged.append(token)
+                 self.additional_special_tokens = merged
+         return added
+
+     def _decode(
+         self,
+         token_ids,
+         skip_special_tokens: bool = False,
+         clean_up_tokenization_spaces: Optional[bool] = None,
+         spaces_between_special_tokens: bool = False,
+         **kwargs,
+     ) -> str:
+         ids = self._normalize_token_ids(token_ids)
+         if skip_special_tokens:
+             special_ids = set(self.all_special_ids)
+             ids = [token_id for token_id in ids if token_id not in special_ids]
+         if not ids:
+             return ""
+         return self._vocab.decode(ids)
+
+     def save_vocabulary(
+         self,
+         save_directory: str,
+         filename_prefix: Optional[str] = None,
+     ) -> Tuple[str]:
+         os.makedirs(save_directory, exist_ok=True)
+         prefix = f"{filename_prefix}-" if filename_prefix else ""
+         vocab_path = os.path.join(save_directory, f"{prefix}tokenmonster.vocab")
+         vocab_json_path = os.path.join(save_directory, f"{prefix}vocab.json")
+         self._vocab.save(vocab_path)
+         with open(vocab_json_path, "w", encoding="utf-8") as handle:
+             json.dump(self.get_vocab(), handle, ensure_ascii=False, indent=2, sort_keys=True)
+         return (vocab_path,)
+
+     def save_pretrained(
+         self,
+         save_directory: str,
+         legacy_format: Optional[bool] = None,
+         filename_prefix: Optional[str] = None,
+         push_to_hub: bool = False,
+         **kwargs,
+     ):
+         saved_files = super().save_pretrained(
+             save_directory,
+             legacy_format=legacy_format,
+             filename_prefix=filename_prefix,
+             push_to_hub=push_to_hub,
+             **kwargs,
+         )
+
+         module_out = os.path.join(save_directory, "tokenmonster_hf.py")
+         try:
+             module_path = inspect.getsourcefile(type(self))
+         except (OSError, TypeError):
+             module_path = None
+         if module_path and os.path.isfile(module_path) and os.path.basename(module_path) == "tokenmonster_hf.py":
+             shutil.copy(module_path, module_out)
+         else:
+             with open(module_out, "w", encoding="utf-8") as handle:
+                 handle.write(_tokenmonster_remote_module_source())
+
+         prefix = f"{filename_prefix}-" if filename_prefix else ""
+         config_path = os.path.join(save_directory, f"{prefix}tokenizer_config.json")
+         config = {}
+         if os.path.exists(config_path):
+             with open(config_path, "r", encoding="utf-8") as handle:
+                 config = json.load(handle)
+         config.update(
+             {
+                 "tokenizer_class": "TokenMonsterTokenizer",
+                 "auto_map": {
+                     "AutoTokenizer": ["tokenmonster_hf.TokenMonsterTokenizer", None]
+                 },
+                 "vocab_file": self.vocab_files_names["vocab_file"],
+                 "bos_token": self.bos_token,
+                 "eos_token": self.eos_token,
+                 "unk_token": self.unk_token,
+                 "pad_token": self.pad_token,
+             }
+         )
+         with open(config_path, "w", encoding="utf-8") as handle:
+             json.dump(config, handle, ensure_ascii=False, indent=2, sort_keys=True)
+         added_tokens_path = os.path.join(save_directory, f"{prefix}added_tokens.json")
+         added_tokens = {}
+         for token in list(getattr(self, "additional_special_tokens", []) or []):
+             token_id = self._convert_token_to_id(token)
+             if token_id is not None:
+                 added_tokens[str(token)] = int(token_id)
+         if added_tokens:
+             with open(added_tokens_path, "w", encoding="utf-8") as handle:
+                 json.dump(added_tokens, handle, ensure_ascii=False, indent=2, sort_keys=True)
+         return saved_files
+
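For offline use, the class above can also be instantiated directly against the LFS-tracked vocab file. A minimal round-trip sketch, assuming the repository has been cloned locally so `tokenmonster_hf.py` is importable (the path below is an assumption) and the `tokenmonster` package is installed:

```python
from tokenmonster_hf import TokenMonsterTokenizer

tokenizer = TokenMonsterTokenizer(vocab_file="./Stentor2-12M/tokenmonster.vocab")  # assumed local path
ids = tokenizer.encode("Stentor says hello")            # <s> ... </s> added by build_inputs_with_special_tokens
print(ids)
print(tokenizer.decode(ids, skip_special_tokens=True))  # should round-trip the input text
```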
vocab.json ADDED
The diff for this file is too large to render.