Clemylia committed on
Commit
2d0ebbd
·
verified ·
1 Parent(s): c547f03

Ajout du tokenizer associé au modèle final

Browse files
Files changed (3) hide show
  1. special_tokens_map.json +9 -0
  2. tokenizer.json +420 -0
  3. tokenizer_config.json +49 -0
special_tokens_map.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "pad_token": {
3
+ "content": "[PAD]",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ }
9
+ }
tokenizer.json ADDED
@@ -0,0 +1,420 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": "1.0",
3
+ "truncation": null,
4
+ "padding": null,
5
+ "added_tokens": [
6
+ {
7
+ "id": 0,
8
+ "content": "[UNK]",
9
+ "single_word": false,
10
+ "lstrip": false,
11
+ "rstrip": false,
12
+ "normalized": false,
13
+ "special": true
14
+ },
15
+ {
16
+ "id": 1,
17
+ "content": "[CLS]",
18
+ "single_word": false,
19
+ "lstrip": false,
20
+ "rstrip": false,
21
+ "normalized": false,
22
+ "special": true
23
+ },
24
+ {
25
+ "id": 2,
26
+ "content": "[SEP]",
27
+ "single_word": false,
28
+ "lstrip": false,
29
+ "rstrip": false,
30
+ "normalized": false,
31
+ "special": true
32
+ },
33
+ {
34
+ "id": 3,
35
+ "content": "[PAD]",
36
+ "single_word": false,
37
+ "lstrip": false,
38
+ "rstrip": false,
39
+ "normalized": false,
40
+ "special": true
41
+ },
42
+ {
43
+ "id": 4,
44
+ "content": "[MASK]",
45
+ "single_word": false,
46
+ "lstrip": false,
47
+ "rstrip": false,
48
+ "normalized": false,
49
+ "special": true
50
+ }
51
+ ],
52
+ "normalizer": null,
53
+ "pre_tokenizer": {
54
+ "type": "Whitespace"
55
+ },
56
+ "post_processor": null,
57
+ "decoder": null,
58
+ "model": {
59
+ "type": "BPE",
60
+ "dropout": null,
61
+ "unk_token": "[UNK]",
62
+ "continuing_subword_prefix": null,
63
+ "end_of_word_suffix": null,
64
+ "fuse_unk": false,
65
+ "byte_fallback": false,
66
+ "ignore_merges": false,
67
+ "vocab": {
68
+ "[UNK]": 0,
69
+ "[CLS]": 1,
70
+ "[SEP]": 2,
71
+ "[PAD]": 3,
72
+ "[MASK]": 4,
73
+ "!": 5,
74
+ ".": 6,
75
+ "G": 7,
76
+ "M": 8,
77
+ "Q": 9,
78
+ "T": 10,
79
+ "W": 11,
80
+ "a": 12,
81
+ "b": 13,
82
+ "d": 14,
83
+ "e": 15,
84
+ "f": 16,
85
+ "i": 17,
86
+ "j": 18,
87
+ "l": 19,
88
+ "m": 20,
89
+ "n": 21,
90
+ "o": 22,
91
+ "p": 23,
92
+ "q": 24,
93
+ "r": 25,
94
+ "s": 26,
95
+ "t": 27,
96
+ "u": 28,
97
+ "v": 29,
98
+ "w": 30,
99
+ "ç": 31,
100
+ "é": 32,
101
+ "..": 33,
102
+ "ou": 34,
103
+ "aou": 35,
104
+ "aouf": 36,
105
+ "Waouf": 37,
106
+ "rr": 38,
107
+ "is": 39,
108
+ "Grr": 40,
109
+ "le": 41,
110
+ "on": 42,
111
+ "su": 43,
112
+ "...": 44,
113
+ "Grrr": 45,
114
+ "suis": 46,
115
+ "as": 47,
116
+ "ble": 48,
117
+ "dis": 49,
118
+ "ible": 50,
119
+ "je": 51,
120
+ "oi": 52,
121
+ "pon": 53,
122
+ "pas": 54,
123
+ "rd": 55,
124
+ "sou": 56,
125
+ "te": 57,
126
+ "uoi": 58,
127
+ "....": 59,
128
+ "dispon": 60,
129
+ "sourd": 61,
130
+ "disponible": 62,
131
+ "!!": 63,
132
+ "!.": 64,
133
+ "Ma": 65,
134
+ "Quoi": 66,
135
+ "Ta": 67,
136
+ "Tu": 68,
137
+ "aa": 69,
138
+ "ai": 70,
139
+ "dé": 71,
140
+ "eu": 72,
141
+ "fai": 73,
142
+ "ir": 74,
143
+ "iu": 75,
144
+ "me": 76,
145
+ "non": 77,
146
+ "or": 78,
147
+ "quoi": 79,
148
+ "riu": 80,
149
+ "ste": 81,
150
+ "sor": 82,
151
+ "tir": 83,
152
+ "veu": 84,
153
+ "waouf": 85,
154
+ "waa": 86,
155
+ "ça": 87,
156
+ "les": 88,
157
+ "teste": 89,
158
+ "Mariu": 90,
159
+ "déteste": 91,
160
+ "fait": 92,
161
+ "sortir": 93,
162
+ "veut": 94,
163
+ "Marius": 95
164
+ },
165
+ "merges": [
166
+ [
167
+ ".",
168
+ "."
169
+ ],
170
+ [
171
+ "o",
172
+ "u"
173
+ ],
174
+ [
175
+ "a",
176
+ "ou"
177
+ ],
178
+ [
179
+ "aou",
180
+ "f"
181
+ ],
182
+ [
183
+ "W",
184
+ "aouf"
185
+ ],
186
+ [
187
+ "r",
188
+ "r"
189
+ ],
190
+ [
191
+ "i",
192
+ "s"
193
+ ],
194
+ [
195
+ "G",
196
+ "rr"
197
+ ],
198
+ [
199
+ "l",
200
+ "e"
201
+ ],
202
+ [
203
+ "o",
204
+ "n"
205
+ ],
206
+ [
207
+ "s",
208
+ "u"
209
+ ],
210
+ [
211
+ "..",
212
+ "."
213
+ ],
214
+ [
215
+ "Grr",
216
+ "r"
217
+ ],
218
+ [
219
+ "su",
220
+ "is"
221
+ ],
222
+ [
223
+ "a",
224
+ "s"
225
+ ],
226
+ [
227
+ "b",
228
+ "le"
229
+ ],
230
+ [
231
+ "d",
232
+ "is"
233
+ ],
234
+ [
235
+ "i",
236
+ "ble"
237
+ ],
238
+ [
239
+ "j",
240
+ "e"
241
+ ],
242
+ [
243
+ "o",
244
+ "i"
245
+ ],
246
+ [
247
+ "p",
248
+ "on"
249
+ ],
250
+ [
251
+ "p",
252
+ "as"
253
+ ],
254
+ [
255
+ "r",
256
+ "d"
257
+ ],
258
+ [
259
+ "s",
260
+ "ou"
261
+ ],
262
+ [
263
+ "t",
264
+ "e"
265
+ ],
266
+ [
267
+ "u",
268
+ "oi"
269
+ ],
270
+ [
271
+ "..",
272
+ ".."
273
+ ],
274
+ [
275
+ "dis",
276
+ "pon"
277
+ ],
278
+ [
279
+ "sou",
280
+ "rd"
281
+ ],
282
+ [
283
+ "dispon",
284
+ "ible"
285
+ ],
286
+ [
287
+ "!",
288
+ "!"
289
+ ],
290
+ [
291
+ "!",
292
+ "."
293
+ ],
294
+ [
295
+ "M",
296
+ "a"
297
+ ],
298
+ [
299
+ "Q",
300
+ "uoi"
301
+ ],
302
+ [
303
+ "T",
304
+ "a"
305
+ ],
306
+ [
307
+ "T",
308
+ "u"
309
+ ],
310
+ [
311
+ "a",
312
+ "a"
313
+ ],
314
+ [
315
+ "a",
316
+ "i"
317
+ ],
318
+ [
319
+ "d",
320
+ "é"
321
+ ],
322
+ [
323
+ "e",
324
+ "u"
325
+ ],
326
+ [
327
+ "f",
328
+ "ai"
329
+ ],
330
+ [
331
+ "i",
332
+ "r"
333
+ ],
334
+ [
335
+ "i",
336
+ "u"
337
+ ],
338
+ [
339
+ "m",
340
+ "e"
341
+ ],
342
+ [
343
+ "n",
344
+ "on"
345
+ ],
346
+ [
347
+ "o",
348
+ "r"
349
+ ],
350
+ [
351
+ "q",
352
+ "uoi"
353
+ ],
354
+ [
355
+ "r",
356
+ "iu"
357
+ ],
358
+ [
359
+ "s",
360
+ "te"
361
+ ],
362
+ [
363
+ "s",
364
+ "or"
365
+ ],
366
+ [
367
+ "t",
368
+ "ir"
369
+ ],
370
+ [
371
+ "v",
372
+ "eu"
373
+ ],
374
+ [
375
+ "w",
376
+ "aouf"
377
+ ],
378
+ [
379
+ "w",
380
+ "aa"
381
+ ],
382
+ [
383
+ "ç",
384
+ "a"
385
+ ],
386
+ [
387
+ "le",
388
+ "s"
389
+ ],
390
+ [
391
+ "te",
392
+ "ste"
393
+ ],
394
+ [
395
+ "Ma",
396
+ "riu"
397
+ ],
398
+ [
399
+ "dé",
400
+ "teste"
401
+ ],
402
+ [
403
+ "fai",
404
+ "t"
405
+ ],
406
+ [
407
+ "sor",
408
+ "tir"
409
+ ],
410
+ [
411
+ "veu",
412
+ "t"
413
+ ],
414
+ [
415
+ "Mariu",
416
+ "s"
417
+ ]
418
+ ]
419
+ }
420
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[UNK]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "[CLS]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "[SEP]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "[PAD]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "4": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "clean_up_tokenization_spaces": false,
45
+ "extra_special_tokens": {},
46
+ "model_max_length": 1000000000000000019884624838656,
47
+ "pad_token": "[PAD]",
48
+ "tokenizer_class": "PreTrainedTokenizerFast"
49
+ }