nroggendorff committed on
Commit
bd2ef39
·
verified ·
1 Parent(s): 8097882

0.6030588073730468

Browse files
Files changed (3) hide show
  1. special_tokens_map.json +37 -0
  2. tokenizer.json +190 -0
  3. tokenizer_config.json +52 -0
special_tokens_map.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "mask_token": {
17
+ "content": "<mask>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "pad_token": {
24
+ "content": "<pad>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "unk_token": {
31
+ "content": "<unk>",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ }
37
+ }
tokenizer.json ADDED
@@ -0,0 +1,190 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": "1.0",
3
+ "truncation": {
4
+ "direction": "Right",
5
+ "max_length": 512,
6
+ "strategy": "LongestFirst",
7
+ "stride": 0
8
+ },
9
+ "padding": null,
10
+ "added_tokens": [
11
+ {
12
+ "id": 96,
13
+ "content": "<s>",
14
+ "single_word": false,
15
+ "lstrip": false,
16
+ "rstrip": false,
17
+ "normalized": false,
18
+ "special": true
19
+ },
20
+ {
21
+ "id": 97,
22
+ "content": "<pad>",
23
+ "single_word": false,
24
+ "lstrip": false,
25
+ "rstrip": false,
26
+ "normalized": false,
27
+ "special": true
28
+ },
29
+ {
30
+ "id": 98,
31
+ "content": "</s>",
32
+ "single_word": false,
33
+ "lstrip": false,
34
+ "rstrip": false,
35
+ "normalized": false,
36
+ "special": true
37
+ },
38
+ {
39
+ "id": 99,
40
+ "content": "<unk>",
41
+ "single_word": false,
42
+ "lstrip": false,
43
+ "rstrip": false,
44
+ "normalized": false,
45
+ "special": true
46
+ },
47
+ {
48
+ "id": 100,
49
+ "content": "<mask>",
50
+ "single_word": false,
51
+ "lstrip": false,
52
+ "rstrip": false,
53
+ "normalized": false,
54
+ "special": true
55
+ }
56
+ ],
57
+ "normalizer": null,
58
+ "pre_tokenizer": {
59
+ "type": "ByteLevel",
60
+ "add_prefix_space": false,
61
+ "trim_offsets": true,
62
+ "use_regex": true
63
+ },
64
+ "post_processor": {
65
+ "type": "ByteLevel",
66
+ "add_prefix_space": true,
67
+ "trim_offsets": false,
68
+ "use_regex": true
69
+ },
70
+ "decoder": {
71
+ "type": "ByteLevel",
72
+ "add_prefix_space": true,
73
+ "trim_offsets": true,
74
+ "use_regex": true
75
+ },
76
+ "model": {
77
+ "type": "BPE",
78
+ "dropout": null,
79
+ "unk_token": null,
80
+ "continuing_subword_prefix": "",
81
+ "end_of_word_suffix": "",
82
+ "fuse_unk": false,
83
+ "byte_fallback": false,
84
+ "ignore_merges": false,
85
+ "vocab": {
86
+ "!": 0,
87
+ "\"": 1,
88
+ "#": 2,
89
+ "$": 3,
90
+ "%": 4,
91
+ "&": 5,
92
+ "'": 6,
93
+ "(": 7,
94
+ ")": 8,
95
+ "*": 9,
96
+ "+": 10,
97
+ ",": 11,
98
+ "-": 12,
99
+ ".": 13,
100
+ "/": 14,
101
+ "0": 15,
102
+ "1": 16,
103
+ "2": 17,
104
+ "3": 18,
105
+ "4": 19,
106
+ "5": 20,
107
+ "6": 21,
108
+ "7": 22,
109
+ "8": 23,
110
+ "9": 24,
111
+ ":": 25,
112
+ ";": 26,
113
+ "<": 27,
114
+ "=": 28,
115
+ ">": 29,
116
+ "?": 30,
117
+ "@": 31,
118
+ "A": 32,
119
+ "B": 33,
120
+ "C": 34,
121
+ "D": 35,
122
+ "E": 36,
123
+ "F": 37,
124
+ "G": 38,
125
+ "H": 39,
126
+ "I": 40,
127
+ "J": 41,
128
+ "K": 42,
129
+ "L": 43,
130
+ "M": 44,
131
+ "N": 45,
132
+ "O": 46,
133
+ "P": 47,
134
+ "Q": 48,
135
+ "R": 49,
136
+ "S": 50,
137
+ "T": 51,
138
+ "U": 52,
139
+ "V": 53,
140
+ "W": 54,
141
+ "X": 55,
142
+ "Y": 56,
143
+ "Z": 57,
144
+ "[": 58,
145
+ "\\": 59,
146
+ "]": 60,
147
+ "^": 61,
148
+ "_": 62,
149
+ "`": 63,
150
+ "a": 64,
151
+ "b": 65,
152
+ "c": 66,
153
+ "d": 67,
154
+ "e": 68,
155
+ "f": 69,
156
+ "g": 70,
157
+ "h": 71,
158
+ "i": 72,
159
+ "j": 73,
160
+ "k": 74,
161
+ "l": 75,
162
+ "m": 76,
163
+ "n": 77,
164
+ "o": 78,
165
+ "p": 79,
166
+ "q": 80,
167
+ "r": 81,
168
+ "s": 82,
169
+ "t": 83,
170
+ "u": 84,
171
+ "v": 85,
172
+ "w": 86,
173
+ "x": 87,
174
+ "y": 88,
175
+ "z": 89,
176
+ "{": 90,
177
+ "|": 91,
178
+ "}": 92,
179
+ "~": 93,
180
+ "Ġ": 94,
181
+ "Ċ": 95,
182
+ "<s>": 96,
183
+ "<pad>": 97,
184
+ "</s>": 98,
185
+ "<unk>": 99,
186
+ "<mask>": 100
187
+ },
188
+ "merges": []
189
+ }
190
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "96": {
4
+ "content": "<s>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "97": {
12
+ "content": "<pad>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "98": {
20
+ "content": "</s>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "99": {
28
+ "content": "<unk>",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "100": {
36
+ "content": "<mask>",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "bos_token": "<s>",
45
+ "clean_up_tokenization_spaces": true,
46
+ "eos_token": "</s>",
47
+ "mask_token": "<mask>",
48
+ "model_max_length": 1000000000000000019884624838656,
49
+ "pad_token": "<pad>",
50
+ "tokenizer_class": "PreTrainedTokenizerFast",
51
+ "unk_token": "<unk>"
52
+ }