Clemylia committed
Commit d8e681e · verified · Parent: e49b899

Add the tokenizer associated with the final model

Files changed (3)
  1. special_tokens_map.json +9 -0
  2. tokenizer.json +359 -0
  3. tokenizer_config.json +49 -0
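
These are the three standard tokenizer artifacts that transformers expects: tokenizer.json holds the serialized fast-tokenizer pipeline (vocabulary, merges, pre-tokenizer), special_tokens_map.json names the special tokens, and tokenizer_config.json carries the loading parameters such as the tokenizer class.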
special_tokens_map.json ADDED
@@ -0,0 +1,9 @@
+ {
+   "pad_token": {
+     "content": "[PAD]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
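
special_tokens_map.json records only the padding token for this tokenizer. A minimal sketch of checking it after downloading the files, assuming they sit in a hypothetical local directory ./tokenizer:

    from transformers import AutoTokenizer

    # Load from the directory containing the three files in this commit
    # ("./tokenizer" is a placeholder path).
    tok = AutoTokenizer.from_pretrained("./tokenizer")

    print(tok.pad_token)     # "[PAD]"
    print(tok.pad_token_id)  # 3, matching the vocab in tokenizer.json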
tokenizer.json ADDED
@@ -0,0 +1,359 @@
+ {
+   "version": "1.0",
+   "truncation": null,
+   "padding": null,
+   "added_tokens": [
+     {
+       "id": 0,
+       "content": "[UNK]",
+       "single_word": false,
+       "lstrip": false,
+       "rstrip": false,
+       "normalized": false,
+       "special": true
+     },
+     {
+       "id": 1,
+       "content": "[CLS]",
+       "single_word": false,
+       "lstrip": false,
+       "rstrip": false,
+       "normalized": false,
+       "special": true
+     },
+     {
+       "id": 2,
+       "content": "[SEP]",
+       "single_word": false,
+       "lstrip": false,
+       "rstrip": false,
+       "normalized": false,
+       "special": true
+     },
+     {
+       "id": 3,
+       "content": "[PAD]",
+       "single_word": false,
+       "lstrip": false,
+       "rstrip": false,
+       "normalized": false,
+       "special": true
+     },
+     {
+       "id": 4,
+       "content": "[MASK]",
+       "single_word": false,
+       "lstrip": false,
+       "rstrip": false,
+       "normalized": false,
+       "special": true
+     }
+   ],
+   "normalizer": null,
+   "pre_tokenizer": {
+     "type": "Whitespace"
+   },
+   "post_processor": null,
+   "decoder": null,
+   "model": {
+     "type": "BPE",
+     "dropout": null,
+     "unk_token": "[UNK]",
+     "continuing_subword_prefix": null,
+     "end_of_word_suffix": null,
+     "fuse_unk": false,
+     "byte_fallback": false,
+     "ignore_merges": false,
+     "vocab": {
+       "[UNK]": 0,
+       "[CLS]": 1,
+       "[SEP]": 2,
+       "[PAD]": 3,
+       "[MASK]": 4,
+       "!": 5,
+       "‍": 6,
+       "⁉": 7,
+       "↔": 8,
+       "↕": 9,
+       "↩": 10,
+       "▶": 11,
+       "◀": 12,
+       "☂": 13,
+       "☪": 14,
+       "☹": 15,
+       "☺": 16,
+       "♑": 17,
+       "♣": 18,
+       "♥": 19,
+       "♦": 20,
+       "⚕": 21,
+       "⚙": 22,
+       "✅": 23,
+       "❄": 24,
+       "❌": 25,
+       "〽": 26,
+       "️": 27,
+       "🆘": 28,
+       "🇦": 29,
+       "🇧": 30,
+       "🇬": 31,
+       "🇭": 32,
+       "🇮": 33,
+       "🇰": 34,
+       "🇹": 35,
+       "🇼": 36,
+       "🉑": 37,
+       "🌀": 38,
+       "🌃": 39,
+       "🌫": 40,
+       "🌶": 41,
+       "🌷": 42,
+       "🌺": 43,
+       "🍄": 44,
+       "🍏": 45,
+       "🍛": 46,
+       "🍝": 47,
+       "🍟": 48,
+       "🍦": 49,
+       "🍩": 50,
+       "🍯": 51,
+       "🍱": 52,
+       "🍵": 53,
+       "🎊": 54,
+       "🎒": 55,
+       "🎢": 56,
+       "🎤": 57,
+       "🎩": 58,
+       "🎭": 59,
+       "🎯": 60,
+       "🏍": 61,
+       "🏮": 62,
+       "🏴": 63,
+       "🐒": 64,
+       "🐠": 65,
+       "🐻": 66,
+       "🐽": 67,
+       "👁": 68,
+       "👔": 69,
+       "👘": 70,
+       "👚": 71,
+       "👠": 72,
+       "💊": 73,
+       "💍": 74,
+       "💒": 75,
+       "💖": 76,
+       "💧": 77,
+       "💨": 78,
+       "💫": 79,
+       "💮": 80,
+       "💲": 81,
+       "📀": 82,
+       "📢": 83,
+       "📺": 84,
+       "🔍": 85,
+       "🔔": 86,
+       "🔪": 87,
+       "🔵": 88,
+       "🕥": 89,
+       "🖍": 90,
+       "🗨": 91,
+       "😀": 92,
+       "😁": 93,
+       "😂": 94,
+       "😃": 95,
+       "😄": 96,
+       "😅": 97,
+       "😆": 98,
+       "😉": 99,
+       "😊": 100,
+       "😋": 101,
+       "😌": 102,
+       "😍": 103,
+       "😏": 104,
+       "😐": 105,
+       "😑": 106,
+       "😒": 107,
+       "😓": 108,
+       "😔": 109,
+       "😕": 110,
+       "😖": 111,
+       "😗": 112,
+       "😘": 113,
+       "😙": 114,
+       "😚": 115,
+       "😛": 116,
+       "😜": 117,
+       "😝": 118,
+       "😞": 119,
+       "😟": 120,
+       "😠": 121,
+       "😡": 122,
+       "😢": 123,
+       "😣": 124,
+       "😤": 125,
+       "😥": 126,
+       "😦": 127,
+       "😧": 128,
+       "😨": 129,
+       "😩": 130,
+       "😪": 131,
+       "😫": 132,
+       "😬": 133,
+       "😭": 134,
+       "😮": 135,
+       "😯": 136,
+       "😰": 137,
+       "😱": 138,
+       "😲": 139,
+       "😳": 140,
+       "😴": 141,
+       "😵": 142,
+       "😶": 143,
+       "🙁": 144,
+       "🙂": 145,
+       "🙃": 146,
+       "🙄": 147,
+       "🚊": 148,
+       "🚏": 149,
+       "🚷": 150,
+       "🚿": 151,
+       "🛃": 152,
+       "🛒": 153,
+       "🛡": 154,
+       "🛤": 155,
+       "🤐": 156,
+       "🤔": 157,
+       "🤗": 158,
+       "🤢": 159,
+       "🤣": 160,
+       "🤤": 161,
+       "🤧": 162,
+       "🤨": 163,
+       "🤩": 164,
+       "🤪": 165,
+       "🤫": 166,
+       "🤬": 167,
+       "🤭": 168,
+       "🤮": 169,
+       "🤯": 170,
+       "🥌": 171,
+       "🥎": 172,
+       "🥏": 173,
+       "🥕": 174,
+       "🥘": 175,
+       "🥡": 176,
+       "🥩": 177,
+       "🥫": 178,
+       "🥰": 179,
+       "🥱": 180,
+       "🥲": 181,
+       "🥳": 182,
+       "🥴": 183,
+       "🥵": 184,
+       "🥶": 185,
+       "🥹": 186,
+       "🥺": 187,
+       "🦔": 188,
+       "🧐": 189,
+       "🧩": 190,
+       "����": 191,
+       "🧬": 192,
+       "🧳": 193,
+       "🧹": 194,
+       "🧼": 195,
+       "🩴": 196,
+       "🩷": 197,
+       "🩺": 198,
+       "🩻": 199,
+       "🪇": 200,
+       "🪑": 201,
+       "🪗": 202,
+       "🪘": 203,
+       "🪢": 204,
+       "🪪": 205,
+       "🪴": 206,
+       "🫖": 207,
+       "🫟": 208,
+       "🫠": 209,
+       "🫡": 210,
+       "🫢": 211,
+       "🫣": 212,
+       "🫤": 213,
+       "🫥": 214,
+       "🫨": 215,
+       "🫩": 216,
+       "󠁢": 217,
+       "󠁥": 218,
+       "󠁧": 219,
+       "󠁮": 220,
+       "󠁿": 221,
+       "😀!": 222,
+       "️‍": 223,
+       "🇧🇼": 224,
+       "🇬🇦": 225,
+       "🇬🇼": 226,
+       "🇭🇰": 227,
+       "🇮🇹": 228,
+       "🏴󠁧": 229,
+       "󠁢󠁥": 230,
+       "󠁧󠁿": 231,
+       "󠁮󠁧󠁿": 232,
+       "🏴󠁧󠁢󠁥": 233,
+       "🏴󠁧󠁢󠁥󠁮󠁧󠁿": 234
+     },
+     "merges": [
+       [
+         "😀",
+         "!"
+       ],
+       [
+         "️",
+         "‍"
+       ],
+       [
+         "🇧",
+         "🇼"
+       ],
+       [
+         "🇬",
+         "🇦"
+       ],
+       [
+         "🇬",
+         "🇼"
+       ],
+       [
+         "🇭",
+         "🇰"
+       ],
+       [
+         "🇮",
+         "🇹"
+       ],
+       [
+         "🏴",
+         "󠁧"
+       ],
+       [
+         "󠁢",
+         "󠁥"
+       ],
+       [
+         "󠁧",
+         "󠁿"
+       ],
+       [
+         "󠁮",
+         "󠁧󠁿"
+       ],
+       [
+         "🏴󠁧",
+         "󠁢󠁥"
+       ],
+       [
+         "🏴󠁧󠁢󠁥",
+         "󠁮󠁧󠁿"
+       ]
+     ]
+   }
+ }
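
tokenizer.json serializes the full fast-tokenizer pipeline: no normalizer, Whitespace pre-tokenization, and a BPE model whose 235-entry vocabulary is almost entirely emoji. The merges exist to reassemble multi-codepoint sequences; the flag of England 🏴󠁧󠁢󠁥󠁮󠁧󠁿 (a black flag followed by six invisible tag characters) is rebuilt in six merge steps into the single token with id 234. A minimal sketch with the tokenizers library (the sample string is illustrative):

    from tokenizers import Tokenizer

    # Load the serialized pipeline added in this commit.
    tok = Tokenizer.from_file("tokenizer.json")

    # Whitespace pre-tokenization splits the two emoji apart; the BPE
    # merges then fuse the 7-codepoint England flag back into one token.
    enc = tok.encode("😊 🏴󠁧󠁢󠁥󠁮󠁧󠁿")
    print(enc.tokens)  # ['😊', '🏴󠁧󠁢󠁥󠁮󠁧󠁿']
    print(enc.ids)     # [100, 234]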
tokenizer_config.json ADDED
@@ -0,0 +1,49 @@
+ {
+   "added_tokens_decoder": {
+     "0": {
+       "content": "[UNK]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "[CLS]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "[SEP]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "3": {
+       "content": "[PAD]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "4": {
+       "content": "[MASK]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "clean_up_tokenization_spaces": false,
+   "extra_special_tokens": {},
+   "model_max_length": 1000000000000000019884624838656,
+   "pad_token": "[PAD]",
+   "tokenizer_class": "PreTrainedTokenizerFast"
+ }
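
tokenizer_config.json tells transformers how to rebuild the tokenizer: the class to instantiate (PreTrainedTokenizerFast), the decoded special tokens, and the pad token. The conspicuous model_max_length of 1000000000000000019884624838656 is int(1e30), the library's sentinel for "no maximum length configured", so truncation needs an explicit bound. A minimal sketch of batch encoding with padding, again assuming the hypothetical local directory ./tokenizer:

    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("./tokenizer")

    # model_max_length is the "unset" sentinel, so pass max_length
    # explicitly when truncating.
    batch = tok(["😊 😭", "🤔"], padding=True, truncation=True, max_length=8)
    print(batch["input_ids"])       # shorter row is padded with id 3 ([PAD])
    print(batch["attention_mask"])  # 0s mark the padding positions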