wonderwind271 committed on
Commit
be75f93
·
verified ·
1 Parent(s): ebde8a3

Upload tokenizer

Browse files
added_tokens.json ADDED
@@ -0,0 +1,210 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "<pad>": 30002,
3
+ "[CLS]": 30000,
4
+ "[MASK]": 30003,
5
+ "[SEP]": 30001,
6
+ "[XXXXX0]": 30004,
7
+ "[XXXXX100]": 30104,
8
+ "[XXXXX101]": 30105,
9
+ "[XXXXX102]": 30106,
10
+ "[XXXXX103]": 30107,
11
+ "[XXXXX104]": 30108,
12
+ "[XXXXX105]": 30109,
13
+ "[XXXXX106]": 30110,
14
+ "[XXXXX107]": 30111,
15
+ "[XXXXX108]": 30112,
16
+ "[XXXXX109]": 30113,
17
+ "[XXXXX10]": 30014,
18
+ "[XXXXX110]": 30114,
19
+ "[XXXXX111]": 30115,
20
+ "[XXXXX112]": 30116,
21
+ "[XXXXX113]": 30117,
22
+ "[XXXXX114]": 30118,
23
+ "[XXXXX115]": 30119,
24
+ "[XXXXX116]": 30120,
25
+ "[XXXXX117]": 30121,
26
+ "[XXXXX118]": 30122,
27
+ "[XXXXX119]": 30123,
28
+ "[XXXXX11]": 30015,
29
+ "[XXXXX120]": 30124,
30
+ "[XXXXX121]": 30125,
31
+ "[XXXXX122]": 30126,
32
+ "[XXXXX123]": 30127,
33
+ "[XXXXX124]": 30128,
34
+ "[XXXXX125]": 30129,
35
+ "[XXXXX126]": 30130,
36
+ "[XXXXX127]": 30131,
37
+ "[XXXXX128]": 30132,
38
+ "[XXXXX129]": 30133,
39
+ "[XXXXX12]": 30016,
40
+ "[XXXXX130]": 30134,
41
+ "[XXXXX131]": 30135,
42
+ "[XXXXX132]": 30136,
43
+ "[XXXXX133]": 30137,
44
+ "[XXXXX134]": 30138,
45
+ "[XXXXX135]": 30139,
46
+ "[XXXXX136]": 30140,
47
+ "[XXXXX137]": 30141,
48
+ "[XXXXX138]": 30142,
49
+ "[XXXXX139]": 30143,
50
+ "[XXXXX13]": 30017,
51
+ "[XXXXX140]": 30144,
52
+ "[XXXXX141]": 30145,
53
+ "[XXXXX142]": 30146,
54
+ "[XXXXX143]": 30147,
55
+ "[XXXXX144]": 30148,
56
+ "[XXXXX145]": 30149,
57
+ "[XXXXX146]": 30150,
58
+ "[XXXXX147]": 30151,
59
+ "[XXXXX148]": 30152,
60
+ "[XXXXX149]": 30153,
61
+ "[XXXXX14]": 30018,
62
+ "[XXXXX150]": 30154,
63
+ "[XXXXX151]": 30155,
64
+ "[XXXXX152]": 30156,
65
+ "[XXXXX153]": 30157,
66
+ "[XXXXX154]": 30158,
67
+ "[XXXXX155]": 30159,
68
+ "[XXXXX156]": 30160,
69
+ "[XXXXX157]": 30161,
70
+ "[XXXXX158]": 30162,
71
+ "[XXXXX159]": 30163,
72
+ "[XXXXX15]": 30019,
73
+ "[XXXXX160]": 30164,
74
+ "[XXXXX161]": 30165,
75
+ "[XXXXX162]": 30166,
76
+ "[XXXXX163]": 30167,
77
+ "[XXXXX164]": 30168,
78
+ "[XXXXX165]": 30169,
79
+ "[XXXXX166]": 30170,
80
+ "[XXXXX167]": 30171,
81
+ "[XXXXX168]": 30172,
82
+ "[XXXXX169]": 30173,
83
+ "[XXXXX16]": 30020,
84
+ "[XXXXX170]": 30174,
85
+ "[XXXXX171]": 30175,
86
+ "[XXXXX172]": 30176,
87
+ "[XXXXX173]": 30177,
88
+ "[XXXXX174]": 30178,
89
+ "[XXXXX175]": 30179,
90
+ "[XXXXX176]": 30180,
91
+ "[XXXXX177]": 30181,
92
+ "[XXXXX178]": 30182,
93
+ "[XXXXX179]": 30183,
94
+ "[XXXXX17]": 30021,
95
+ "[XXXXX180]": 30184,
96
+ "[XXXXX181]": 30185,
97
+ "[XXXXX182]": 30186,
98
+ "[XXXXX183]": 30187,
99
+ "[XXXXX184]": 30188,
100
+ "[XXXXX185]": 30189,
101
+ "[XXXXX186]": 30190,
102
+ "[XXXXX187]": 30191,
103
+ "[XXXXX188]": 30192,
104
+ "[XXXXX189]": 30193,
105
+ "[XXXXX18]": 30022,
106
+ "[XXXXX190]": 30194,
107
+ "[XXXXX191]": 30195,
108
+ "[XXXXX192]": 30196,
109
+ "[XXXXX193]": 30197,
110
+ "[XXXXX194]": 30198,
111
+ "[XXXXX195]": 30199,
112
+ "[XXXXX196]": 30200,
113
+ "[XXXXX197]": 30201,
114
+ "[XXXXX198]": 30202,
115
+ "[XXXXX199]": 30203,
116
+ "[XXXXX19]": 30023,
117
+ "[XXXXX1]": 30005,
118
+ "[XXXXX200]": 30204,
119
+ "[XXXXX201]": 30205,
120
+ "[XXXXX202]": 30206,
121
+ "[XXXXX203]": 30207,
122
+ "[XXXXX20]": 30024,
123
+ "[XXXXX21]": 30025,
124
+ "[XXXXX22]": 30026,
125
+ "[XXXXX23]": 30027,
126
+ "[XXXXX24]": 30028,
127
+ "[XXXXX25]": 30029,
128
+ "[XXXXX26]": 30030,
129
+ "[XXXXX27]": 30031,
130
+ "[XXXXX28]": 30032,
131
+ "[XXXXX29]": 30033,
132
+ "[XXXXX2]": 30006,
133
+ "[XXXXX30]": 30034,
134
+ "[XXXXX31]": 30035,
135
+ "[XXXXX32]": 30036,
136
+ "[XXXXX33]": 30037,
137
+ "[XXXXX34]": 30038,
138
+ "[XXXXX35]": 30039,
139
+ "[XXXXX36]": 30040,
140
+ "[XXXXX37]": 30041,
141
+ "[XXXXX38]": 30042,
142
+ "[XXXXX39]": 30043,
143
+ "[XXXXX3]": 30007,
144
+ "[XXXXX40]": 30044,
145
+ "[XXXXX41]": 30045,
146
+ "[XXXXX42]": 30046,
147
+ "[XXXXX43]": 30047,
148
+ "[XXXXX44]": 30048,
149
+ "[XXXXX45]": 30049,
150
+ "[XXXXX46]": 30050,
151
+ "[XXXXX47]": 30051,
152
+ "[XXXXX48]": 30052,
153
+ "[XXXXX49]": 30053,
154
+ "[XXXXX4]": 30008,
155
+ "[XXXXX50]": 30054,
156
+ "[XXXXX51]": 30055,
157
+ "[XXXXX52]": 30056,
158
+ "[XXXXX53]": 30057,
159
+ "[XXXXX54]": 30058,
160
+ "[XXXXX55]": 30059,
161
+ "[XXXXX56]": 30060,
162
+ "[XXXXX57]": 30061,
163
+ "[XXXXX58]": 30062,
164
+ "[XXXXX59]": 30063,
165
+ "[XXXXX5]": 30009,
166
+ "[XXXXX60]": 30064,
167
+ "[XXXXX61]": 30065,
168
+ "[XXXXX62]": 30066,
169
+ "[XXXXX63]": 30067,
170
+ "[XXXXX64]": 30068,
171
+ "[XXXXX65]": 30069,
172
+ "[XXXXX66]": 30070,
173
+ "[XXXXX67]": 30071,
174
+ "[XXXXX68]": 30072,
175
+ "[XXXXX69]": 30073,
176
+ "[XXXXX6]": 30010,
177
+ "[XXXXX70]": 30074,
178
+ "[XXXXX71]": 30075,
179
+ "[XXXXX72]": 30076,
180
+ "[XXXXX73]": 30077,
181
+ "[XXXXX74]": 30078,
182
+ "[XXXXX75]": 30079,
183
+ "[XXXXX76]": 30080,
184
+ "[XXXXX77]": 30081,
185
+ "[XXXXX78]": 30082,
186
+ "[XXXXX79]": 30083,
187
+ "[XXXXX7]": 30011,
188
+ "[XXXXX80]": 30084,
189
+ "[XXXXX81]": 30085,
190
+ "[XXXXX82]": 30086,
191
+ "[XXXXX83]": 30087,
192
+ "[XXXXX84]": 30088,
193
+ "[XXXXX85]": 30089,
194
+ "[XXXXX86]": 30090,
195
+ "[XXXXX87]": 30091,
196
+ "[XXXXX88]": 30092,
197
+ "[XXXXX89]": 30093,
198
+ "[XXXXX8]": 30012,
199
+ "[XXXXX90]": 30094,
200
+ "[XXXXX91]": 30095,
201
+ "[XXXXX92]": 30096,
202
+ "[XXXXX93]": 30097,
203
+ "[XXXXX94]": 30098,
204
+ "[XXXXX95]": 30099,
205
+ "[XXXXX96]": 30100,
206
+ "[XXXXX97]": 30101,
207
+ "[XXXXX98]": 30102,
208
+ "[XXXXX99]": 30103,
209
+ "[XXXXX9]": 30013
210
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,221 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "[XXXXX0]",
4
+ "[XXXXX1]",
5
+ "[XXXXX2]",
6
+ "[XXXXX3]",
7
+ "[XXXXX4]",
8
+ "[XXXXX5]",
9
+ "[XXXXX6]",
10
+ "[XXXXX7]",
11
+ "[XXXXX8]",
12
+ "[XXXXX9]",
13
+ "[XXXXX10]",
14
+ "[XXXXX11]",
15
+ "[XXXXX12]",
16
+ "[XXXXX13]",
17
+ "[XXXXX14]",
18
+ "[XXXXX15]",
19
+ "[XXXXX16]",
20
+ "[XXXXX17]",
21
+ "[XXXXX18]",
22
+ "[XXXXX19]",
23
+ "[XXXXX20]",
24
+ "[XXXXX21]",
25
+ "[XXXXX22]",
26
+ "[XXXXX23]",
27
+ "[XXXXX24]",
28
+ "[XXXXX25]",
29
+ "[XXXXX26]",
30
+ "[XXXXX27]",
31
+ "[XXXXX28]",
32
+ "[XXXXX29]",
33
+ "[XXXXX30]",
34
+ "[XXXXX31]",
35
+ "[XXXXX32]",
36
+ "[XXXXX33]",
37
+ "[XXXXX34]",
38
+ "[XXXXX35]",
39
+ "[XXXXX36]",
40
+ "[XXXXX37]",
41
+ "[XXXXX38]",
42
+ "[XXXXX39]",
43
+ "[XXXXX40]",
44
+ "[XXXXX41]",
45
+ "[XXXXX42]",
46
+ "[XXXXX43]",
47
+ "[XXXXX44]",
48
+ "[XXXXX45]",
49
+ "[XXXXX46]",
50
+ "[XXXXX47]",
51
+ "[XXXXX48]",
52
+ "[XXXXX49]",
53
+ "[XXXXX50]",
54
+ "[XXXXX51]",
55
+ "[XXXXX52]",
56
+ "[XXXXX53]",
57
+ "[XXXXX54]",
58
+ "[XXXXX55]",
59
+ "[XXXXX56]",
60
+ "[XXXXX57]",
61
+ "[XXXXX58]",
62
+ "[XXXXX59]",
63
+ "[XXXXX60]",
64
+ "[XXXXX61]",
65
+ "[XXXXX62]",
66
+ "[XXXXX63]",
67
+ "[XXXXX64]",
68
+ "[XXXXX65]",
69
+ "[XXXXX66]",
70
+ "[XXXXX67]",
71
+ "[XXXXX68]",
72
+ "[XXXXX69]",
73
+ "[XXXXX70]",
74
+ "[XXXXX71]",
75
+ "[XXXXX72]",
76
+ "[XXXXX73]",
77
+ "[XXXXX74]",
78
+ "[XXXXX75]",
79
+ "[XXXXX76]",
80
+ "[XXXXX77]",
81
+ "[XXXXX78]",
82
+ "[XXXXX79]",
83
+ "[XXXXX80]",
84
+ "[XXXXX81]",
85
+ "[XXXXX82]",
86
+ "[XXXXX83]",
87
+ "[XXXXX84]",
88
+ "[XXXXX85]",
89
+ "[XXXXX86]",
90
+ "[XXXXX87]",
91
+ "[XXXXX88]",
92
+ "[XXXXX89]",
93
+ "[XXXXX90]",
94
+ "[XXXXX91]",
95
+ "[XXXXX92]",
96
+ "[XXXXX93]",
97
+ "[XXXXX94]",
98
+ "[XXXXX95]",
99
+ "[XXXXX96]",
100
+ "[XXXXX97]",
101
+ "[XXXXX98]",
102
+ "[XXXXX99]",
103
+ "[XXXXX100]",
104
+ "[XXXXX101]",
105
+ "[XXXXX102]",
106
+ "[XXXXX103]",
107
+ "[XXXXX104]",
108
+ "[XXXXX105]",
109
+ "[XXXXX106]",
110
+ "[XXXXX107]",
111
+ "[XXXXX108]",
112
+ "[XXXXX109]",
113
+ "[XXXXX110]",
114
+ "[XXXXX111]",
115
+ "[XXXXX112]",
116
+ "[XXXXX113]",
117
+ "[XXXXX114]",
118
+ "[XXXXX115]",
119
+ "[XXXXX116]",
120
+ "[XXXXX117]",
121
+ "[XXXXX118]",
122
+ "[XXXXX119]",
123
+ "[XXXXX120]",
124
+ "[XXXXX121]",
125
+ "[XXXXX122]",
126
+ "[XXXXX123]",
127
+ "[XXXXX124]",
128
+ "[XXXXX125]",
129
+ "[XXXXX126]",
130
+ "[XXXXX127]",
131
+ "[XXXXX128]",
132
+ "[XXXXX129]",
133
+ "[XXXXX130]",
134
+ "[XXXXX131]",
135
+ "[XXXXX132]",
136
+ "[XXXXX133]",
137
+ "[XXXXX134]",
138
+ "[XXXXX135]",
139
+ "[XXXXX136]",
140
+ "[XXXXX137]",
141
+ "[XXXXX138]",
142
+ "[XXXXX139]",
143
+ "[XXXXX140]",
144
+ "[XXXXX141]",
145
+ "[XXXXX142]",
146
+ "[XXXXX143]",
147
+ "[XXXXX144]",
148
+ "[XXXXX145]",
149
+ "[XXXXX146]",
150
+ "[XXXXX147]",
151
+ "[XXXXX148]",
152
+ "[XXXXX149]",
153
+ "[XXXXX150]",
154
+ "[XXXXX151]",
155
+ "[XXXXX152]",
156
+ "[XXXXX153]",
157
+ "[XXXXX154]",
158
+ "[XXXXX155]",
159
+ "[XXXXX156]",
160
+ "[XXXXX157]",
161
+ "[XXXXX158]",
162
+ "[XXXXX159]",
163
+ "[XXXXX160]",
164
+ "[XXXXX161]",
165
+ "[XXXXX162]",
166
+ "[XXXXX163]",
167
+ "[XXXXX164]",
168
+ "[XXXXX165]",
169
+ "[XXXXX166]",
170
+ "[XXXXX167]",
171
+ "[XXXXX168]",
172
+ "[XXXXX169]",
173
+ "[XXXXX170]",
174
+ "[XXXXX171]",
175
+ "[XXXXX172]",
176
+ "[XXXXX173]",
177
+ "[XXXXX174]",
178
+ "[XXXXX175]",
179
+ "[XXXXX176]",
180
+ "[XXXXX177]",
181
+ "[XXXXX178]",
182
+ "[XXXXX179]",
183
+ "[XXXXX180]",
184
+ "[XXXXX181]",
185
+ "[XXXXX182]",
186
+ "[XXXXX183]",
187
+ "[XXXXX184]",
188
+ "[XXXXX185]",
189
+ "[XXXXX186]",
190
+ "[XXXXX187]",
191
+ "[XXXXX188]",
192
+ "[XXXXX189]",
193
+ "[XXXXX190]",
194
+ "[XXXXX191]",
195
+ "[XXXXX192]",
196
+ "[XXXXX193]",
197
+ "[XXXXX194]",
198
+ "[XXXXX195]",
199
+ "[XXXXX196]",
200
+ "[XXXXX197]",
201
+ "[XXXXX198]",
202
+ "[XXXXX199]",
203
+ "[XXXXX200]",
204
+ "[XXXXX201]",
205
+ "[XXXXX202]",
206
+ "[XXXXX203]"
207
+ ],
208
+ "bos_token": "[CLS]",
209
+ "cls_token": "[CLS]",
210
+ "eos_token": "[SEP]",
211
+ "mask_token": {
212
+ "content": "[MASK]",
213
+ "lstrip": true,
214
+ "normalized": false,
215
+ "rstrip": false,
216
+ "single_word": false
217
+ },
218
+ "pad_token": "<pad>",
219
+ "sep_token": "[SEP]",
220
+ "unk_token": "<unk>"
221
+ }
spiece.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1840794a944b1ca5d804431a8fb26e42fa55680e0c845315adef099c24d322d5
3
+ size 756326
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "[CLS]",
3
+ "clean_up_tokenization_spaces": true,
4
+ "cls_token": "[CLS]",
5
+ "do_lower_case": false,
6
+ "eos_token": "[SEP]",
7
+ "keep_accents": false,
8
+ "mask_token": {
9
+ "__type": "AddedToken",
10
+ "content": "[MASK]",
11
+ "lstrip": true,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "model_max_length": 1000000000000000019884624838656,
17
+ "pad_token": "<pad>",
18
+ "remove_space": true,
19
+ "sep_token": "[SEP]",
20
+ "sp_model_kwargs": {},
21
+ "tokenizer_class": "AlbertTokenizer",
22
+ "unk_token": "<unk>"
23
+ }