umarbutler committed
Commit d5c6acb · verified · 1 Parent(s): 3f61691

Added Kanon 2's tokenizer.

merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
special_tokens_map.json ADDED
@@ -0,0 +1,58 @@
+{
+  "bos_token": {
+    "content": "<|startoftext|>",
+    "single_word": false,
+    "lstrip": false,
+    "rstrip": false,
+    "normalized": false,
+    "special": true
+  },
+  "eos_token": {
+    "content": "<|endoftext|>",
+    "single_word": false,
+    "lstrip": false,
+    "rstrip": false,
+    "normalized": false,
+    "special": true
+  },
+  "unk_token": {
+    "content": "<|unknowntoken|>",
+    "single_word": false,
+    "lstrip": false,
+    "rstrip": false,
+    "normalized": false,
+    "special": true
+  },
+  "sep_token": {
+    "content": "<|endoftext|>",
+    "single_word": false,
+    "lstrip": false,
+    "rstrip": false,
+    "normalized": false,
+    "special": true
+  },
+  "pad_token": {
+    "content": "<|paddingtoken|>",
+    "single_word": false,
+    "lstrip": false,
+    "rstrip": false,
+    "normalized": false,
+    "special": true
+  },
+  "cls_token": {
+    "content": "<|startoftext|>",
+    "single_word": false,
+    "lstrip": false,
+    "rstrip": false,
+    "normalized": false,
+    "special": true
+  },
+  "mask_token": {
+    "content": "<|masktoken|>",
+    "single_word": false,
+    "lstrip": true,
+    "rstrip": false,
+    "normalized": false,
+    "special": true
+  }
+}
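
For reference, a minimal sketch of how these special tokens surface once the tokenizer is loaded with Hugging Face transformers. The repository id below is a placeholder for wherever this commit lives; it is not stated in the commit itself.

from transformers import AutoTokenizer

# Placeholder repo id; substitute the actual Hugging Face repository for Kanon 2's tokenizer.
tokenizer = AutoTokenizer.from_pretrained("umarbutler/kanon-2-tokenizer")

# The tokens declared in special_tokens_map.json are exposed as attributes.
print(tokenizer.bos_token)   # <|startoftext|> (also reused as cls_token)
print(tokenizer.eos_token)   # <|endoftext|>   (also reused as sep_token)
print(tokenizer.unk_token)   # <|unknowntoken|>
print(tokenizer.pad_token)   # <|paddingtoken|>
print(tokenizer.mask_token)  # <|masktoken|>
print(tokenizer.special_tokens_map)
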
tiktoken_tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,57 @@
+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<|paddingtoken|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "1": {
+      "content": "<|startoftext|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "2": {
+      "content": "<|endoftext|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "3": {
+      "content": "<|masktoken|>",
+      "single_word": false,
+      "lstrip": true,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "4": {
+      "content": "<|unknowntoken|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    }
+  },
+  "clean_up_tokenization_spaces": false,
+  "extra_special_tokens": {},
+  "model_max_length": 8192,
+  "use_fast": true,
+  "tokenizer_class": "GPT2TokenizerFast",
+  "bos_token": "<|startoftext|>",
+  "eos_token": "<|endoftext|>",
+  "unk_token": "<|unknowntoken|>",
+  "sep_token": "<|endoftext|>",
+  "pad_token": "<|paddingtoken|>",
+  "cls_token": "<|startoftext|>",
+  "mask_token": "<|masktoken|>"
+}
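
As a rough usage sketch under the same placeholder repo id assumption: the IDs in added_tokens_decoder pin the special tokens to slots 0 through 4, and model_max_length caps sequences at 8,192 tokens when truncation is requested.

from transformers import AutoTokenizer

# Placeholder repo id, as above; not confirmed by the commit.
tokenizer = AutoTokenizer.from_pretrained("umarbutler/kanon-2-tokenizer")

# Per added_tokens_decoder: pad=0, bos=1, eos=2, mask=3, unk=4.
assert tokenizer.pad_token_id == 0
assert tokenizer.bos_token_id == 1
assert tokenizer.eos_token_id == 2
assert tokenizer.mask_token_id == 3
assert tokenizer.unk_token_id == 4

# model_max_length (8192) is honoured when truncation is enabled.
encoded = tokenizer(
    "An example passage of legal text.",
    truncation=True,
    max_length=tokenizer.model_max_length,
)
print(len(encoded["input_ids"]), encoded["input_ids"][:10])
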
vocab.json ADDED
The diff for this file is too large to render. See raw diff