damfle committed on
Commit
f65e646
·
unverified ·
1 Parent(s): 2c2adf2

init: initial commit

Browse files
Files changed (4) hide show
  1. README.md +32 -3
  2. special_tokens_map.json +30 -0
  3. tokenizer.json +0 -0
  4. tokenizer_config.json +31 -0
README.md CHANGED
@@ -1,3 +1,32 @@
1
- ---
2
- license: isc
3
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Multistral Tokenizer
2
+
3
+ Training completed successfully!
4
+
5
+ ## Configuration
6
+ - Vocabulary size: 127,989
7
+ - Special tokens: 13
8
+ - Min frequency: 2
9
+ - Training samples: up to 500,000
10
+
11
+ ## Datasets
12
+ - nick007x/github-code-2025 (35%)
13
+ - HuggingFaceFW/fineweb-2 (10%)
14
+ - HuggingFaceFW/fineweb-2 (15%)
15
+ - HuggingFaceFW/fineweb-2 (15%)
16
+ - HuggingFaceFW/fineweb (25%)
17
+
18
+ ## Special Tokens
19
+ <|begin|>, <|return|>, <|pad|>, <|start|>, <|channel|>, <|end|>, <|message|>, <|image|>, <|video|>, <|audio|>, <|call|>, <|constrain|>, <|unknown|>
20
+
21
+ ## Enforced Vocabulary
22
+ analysis, assistant, commentary, developer, final, json, system, tool, toon, user, yaml
23
+
24
+ ## Usage
25
+
26
+ ```python
27
+ from multistral.multistraltokenizer import MultistralTokenizer
28
+
29
+ tokenizer = MultistralTokenizer.from_pretrained("models/aizia_tokenizer")
30
+ tokens = tokenizer.encode("Your text here")
31
+ text = tokenizer.decode(tokens)
32
+ ```
special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<|begin|>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<|return|>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "<|pad|>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "<|unknown|>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|start|>",
4
+ "<|channel|>",
5
+ "<|end|>",
6
+ "<|message|>",
7
+ "<|image|>",
8
+ "<|video|>",
9
+ "<|audio|>",
10
+ "<|call|>",
11
+ "<|constrain|>"
12
+ ],
13
+ "backend": "tokenizers",
14
+ "bos_token": "<|begin|>",
15
+ "eos_token": "<|return|>",
16
+ "extra_special_tokens": [
17
+ "<|start|>",
18
+ "<|channel|>",
19
+ "<|end|>",
20
+ "<|message|>",
21
+ "<|image|>",
22
+ "<|video|>",
23
+ "<|audio|>",
24
+ "<|call|>",
25
+ "<|constrain|>"
26
+ ],
27
+ "model_max_length": 1000000000000000019884624838656,
28
+ "pad_token": "<|pad|>",
29
+ "tokenizer_class": "MultistralTokenizer",
30
+ "unk_token": "<|unknown|>"
31
+ }