TheOpenMachine committed on
Commit
37646f7
·
verified ·
1 Parent(s): 4e64f69

Initial release: UnitronX Tokenizer 32k v1

Browse files
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
meta.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "UnitronX-Tokenizer-32k-v1",
3
+ "saved_at": "2025-11-06T21:44:44Z",
4
+ "vocab_size": 32000,
5
+ "special_tokens": [
6
+ "<unk>",
7
+ "<s>",
8
+ "</s>",
9
+ "<pad>",
10
+ "<mask>",
11
+ "<eot>",
12
+ "<URL>",
13
+ "<EMAIL>",
14
+ "<PATH>",
15
+ "<HEX>",
16
+ "<UUID>",
17
+ "<HANDLE>",
18
+ "<HASHTAG>",
19
+ "<NUM_2D>",
20
+ "<NUM_4D>",
21
+ "<NUM_N>"
22
+ ],
23
+ "byte_fallback": true,
24
+ "sha256": "b69c786fa3cee01dd1f60835cd1b5081efd197c6ea84214fc2ba0d605c229df0"
25
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "unk_token": "<unk>",
3
+ "pad_token": "<pad>",
4
+ "bos_token": "<s>",
5
+ "eos_token": "</s>",
6
+ "mask_token": "<mask>",
7
+ "additional_special_tokens": [
8
+ "<eot>",
9
+ "<URL>",
10
+ "<EMAIL>",
11
+ "<PATH>",
12
+ "<HEX>",
13
+ "<UUID>",
14
+ "<HANDLE>",
15
+ "<HASHTAG>",
16
+ "<NUM_2D>",
17
+ "<NUM_4D>",
18
+ "<NUM_N>"
19
+ ]
20
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_max_length": 4096,
3
+ "padding_side": "right",
4
+ "truncation_side": "right",
5
+ "add_prefix_space": true,
6
+ "name_or_path": "UnitronX-Tokenizer-32k-v1",
7
+ "bos_token": "<s>",
8
+ "eos_token": "</s>"
9
+ }
unitronx.tiktoken.json ADDED
The diff for this file is too large to render. See raw diff
 
vocab.json ADDED
The diff for this file is too large to render. See raw diff