crumb committed on
Commit
398d198
·
1 Parent(s): f1bdc49

Upload tokenizer

Browse files
Files changed (3) hide show
  1. special_tokens_map.json +71 -0
  2. tokenizer.json +0 -0
  3. tokenizer_config.json +13 -0
special_tokens_map.json ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "[R-DENOISING]",
4
+ "[X-DENOISING]",
5
+ "[S-DENOISING]",
6
+ "<extra_token_0>",
7
+ "<extra_token_1>",
8
+ "<extra_token_2>",
9
+ "<extra_token_3>",
10
+ "<extra_token_4>",
11
+ "<extra_token_5>",
12
+ "<extra_token_6>",
13
+ "<extra_token_7>",
14
+ "<extra_token_8>",
15
+ "<extra_token_9>",
16
+ "<extra_token_10>",
17
+ "<extra_token_11>",
18
+ "<extra_token_12>",
19
+ "<extra_token_13>",
20
+ "<extra_token_14>",
21
+ "<extra_token_15>",
22
+ "<extra_token_16>",
23
+ "<extra_token_17>",
24
+ "<extra_token_18>",
25
+ "<extra_token_19>",
26
+ "<extra_token_20>",
27
+ "<extra_token_21>",
28
+ "<extra_token_22>",
29
+ "<extra_token_23>",
30
+ "<extra_token_24>",
31
+ "<extra_token_25>",
32
+ "<extra_token_26>",
33
+ "<extra_token_27>",
34
+ "<extra_token_28>",
35
+ "<extra_token_29>",
36
+ "<extra_token_30>",
37
+ "<extra_token_31>",
38
+ "<extra_token_32>",
39
+ "<extra_token_33>",
40
+ "<extra_token_34>",
41
+ "<extra_token_35>",
42
+ "<extra_token_36>",
43
+ "<extra_token_37>",
44
+ "<extra_token_38>",
45
+ "<extra_token_39>",
46
+ "<extra_token_40>",
47
+ "<extra_token_41>",
48
+ "<extra_token_42>",
49
+ "<extra_token_43>",
50
+ "<extra_token_44>",
51
+ "<extra_token_45>",
52
+ "<extra_token_46>",
53
+ "<extra_token_47>",
54
+ "<extra_token_48>",
55
+ "<extra_token_49>",
56
+ "<extra_token_50>",
57
+ "<extra_token_51>",
58
+ "<extra_token_52>",
59
+ "<extra_token_53>",
60
+ "<extra_token_54>",
61
+ "<extra_token_55>",
62
+ "<extra_token_56>",
63
+ "<extra_token_57>",
64
+ "<extra_token_58>",
65
+ "<extra_token_59>"
66
+ ],
67
+ "bos_token": "<|endoftext|>",
68
+ "eos_token": "<|endoftext|>",
69
+ "pad_token": "<|endoftext|>",
70
+ "unk_token": "<|endoftext|>"
71
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "bos_token": "<|endoftext|>",
4
+ "clean_up_tokenization_spaces": true,
5
+ "eos_token": "<|endoftext|>",
6
+ "max_length": 256,
7
+ "model_max_length": 1024,
8
+ "stride": 0,
9
+ "tokenizer_class": "GPTNeoXTokenizer",
10
+ "truncation_side": "right",
11
+ "truncation_strategy": "longest_first",
12
+ "unk_token": "<|endoftext|>"
13
+ }