Floncer committed on
Commit
3e60858
·
verified ·
1 Parent(s): 5555448

Upload 7 files

Browse files
Files changed (3) hide show
  1. merges.txt +18 -0
  2. tokenizer_config.json +5 -4
  3. vocab.json +514 -0
merges.txt ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #version: 0.2
2
+ П р
3
+ р и
4
+ и в
5
+ в е
6
+ е т
7
+ Ч т
8
+ т о
9
+ о
10
+ т
11
+ т а
12
+ а к
13
+ к о
14
+ о е
15
+ H e
16
+ e l
17
+ l l
18
+ l o
tokenizer_config.json CHANGED
@@ -1,9 +1,10 @@
1
  {
2
- "backend": "tokenizers",
3
  "bos_token": "<|endoftext|>",
4
  "eos_token": "<|endoftext|>",
5
- "model_max_length": 1000000000000000019884624838656,
6
  "pad_token": "<pad>",
7
- "tokenizer_class": "TokenizersBackend",
8
- "unk_token": "<unk>"
 
 
9
  }
 
1
  {
 
2
  "bos_token": "<|endoftext|>",
3
  "eos_token": "<|endoftext|>",
4
+ "model_max_length": 128,
5
  "pad_token": "<pad>",
6
+ "tokenizer_class": "GPT2Tokenizer",
7
+ "unk_token": "<unk>",
8
+ "add_prefix_space": false,
9
+ "clean_up_tokenization_spaces": true
10
  }
vocab.json ADDED
@@ -0,0 +1,514 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "до": 451,
3
+ "fast": 302,
4
+ "ек": 331,
5
+ "ky": 396,
6
+ "G": 23,
7
+ "of": 231,
8
+ "ед": 168,
9
+ "п": 102,
10
+ "ation": 342,
11
+ "us": 147,
12
+ "ig": 204,
13
+ "а": 87,
14
+ "за": 160,
15
+ "уч": 241,
16
+ "то": 125,
17
+ "arning": 213,
18
+ "ly": 309,
19
+ "такое": 200,
20
+ "пр": 141,
21
+ "a": 40,
22
+ "бо": 264,
23
+ "Э": 85,
24
+ "pu": 317,
25
+ "ста": 478,
26
+ "ит": 460,
27
+ "это": 197,
28
+ "кция": 465,
29
+ "во": 330,
30
+ "ygen": 435,
31
+ "лени": 469,
32
+ "ht": 304,
33
+ "ud": 162,
34
+ "w": 61,
35
+ "по": 211,
36
+ "ch": 225,
37
+ "ology": 363,
38
+ "sle": 420,
39
+ "gy": 228,
40
+ "ст": 239,
41
+ "2": 10,
42
+ "ун": 336,
43
+ "hav": 253,
44
+ "know": 397,
45
+ "ны": 271,
46
+ "дел": 452,
47
+ "art": 496,
48
+ "ор": 210,
49
+ "ает": 442,
50
+ "ater": 341,
51
+ "y": 63,
52
+ "про": 282,
53
+ "й": 96,
54
+ "ce": 250,
55
+ "v": 60,
56
+ "x": 62,
57
+ "om": 151,
58
+ "en": 135,
59
+ "eb": 299,
60
+ "-": 5,
61
+ "studies": 172,
62
+ "gr": 388,
63
+ "жи": 456,
64
+ "к": 97,
65
+ "wi": 430,
66
+ "se": 418,
67
+ "all": 245,
68
+ "ab": 249,
69
+ "ло": 467,
70
+ "ies": 149,
71
+ "ней": 334,
72
+ "are": 183,
73
+ "ts": 233,
74
+ "ель": 247,
75
+ "J": 26,
76
+ "Г": 68,
77
+ "age": 293,
78
+ "ме": 470,
79
+ "f": 45,
80
+ "рон": 476,
81
+ "чем": 490,
82
+ "ning": 164,
83
+ "ain": 157,
84
+ "ру": 475,
85
+ "О": 76,
86
+ "hy": 305,
87
+ "langu": 354,
88
+ "ния": 502,
89
+ "тек": 349,
90
+ "s": 57,
91
+ "ус": 337,
92
+ "ел": 169,
93
+ "S": 34,
94
+ "z": 64,
95
+ "<|endoftext|>": 2,
96
+ "no": 230,
97
+ "Ка": 327,
98
+ "сп": 277,
99
+ "O": 30,
100
+ "il": 393,
101
+ "each": 385,
102
+ "ho": 391,
103
+ "еоб": 332,
104
+ "из": 236,
105
+ "ace": 346,
106
+ "peop": 415,
107
+ "l": 50,
108
+ "ay": 174,
109
+ "<unk>": 0,
110
+ "ence": 508,
111
+ "ь": 115,
112
+ "ся": 278,
113
+ "р": 103,
114
+ "но": 238,
115
+ "tel": 423,
116
+ "те": 154,
117
+ "es": 119,
118
+ "мбед": 472,
119
+ "sky": 421,
120
+ "lan": 188,
121
+ "ame": 376,
122
+ "ая": 208,
123
+ "пред": 283,
124
+ "Foo": 369,
125
+ "ines": 493,
126
+ "N": 29,
127
+ "ем": 170,
128
+ "L": 27,
129
+ "столица": 221,
130
+ "раз": 214,
131
+ "los": 348,
132
+ "id": 392,
133
+ "ner": 406,
134
+ "С": 79,
135
+ "Mat": 372,
136
+ "н": 100,
137
+ "ov": 314,
138
+ "ine": 243,
139
+ "Ear": 288,
140
+ "вод": 446,
141
+ "H": 24,
142
+ "many": 362,
143
+ "ин": 192,
144
+ "F": 22,
145
+ "for": 203,
146
+ "T": 35,
147
+ "хани": 488,
148
+ "al": 134,
149
+ "М": 74,
150
+ "ball": 295,
151
+ "iology": 395,
152
+ "oo": 259,
153
+ "Ц": 83,
154
+ "have": 286,
155
+ "5": 13,
156
+ "A": 17,
157
+ "con": 202,
158
+ "ma": 256,
159
+ "б": 88,
160
+ "by": 294,
161
+ "Д": 69,
162
+ "has": 306,
163
+ "man": 312,
164
+ "lu": 398,
165
+ "исл": 461,
166
+ "to": 166,
167
+ "iv": 205,
168
+ "ыва": 338,
169
+ "is": 120,
170
+ "з": 94,
171
+ "Lo": 371,
172
+ "th": 178,
173
+ "Х": 82,
174
+ "oc": 257,
175
+ "our": 316,
176
+ "on": 131,
177
+ "ю": 117,
178
+ "q": 55,
179
+ "ха": 486,
180
+ "end": 506,
181
+ "ze": 436,
182
+ "E": 21,
183
+ "преоб": 510,
184
+ "ри": 275,
185
+ "ли": 138,
186
+ "4": 12,
187
+ "sy": 320,
188
+ "t": 58,
189
+ "цес": 489,
190
+ "now": 357,
191
+ "Earth": 361,
192
+ "h": 47,
193
+ "els": 353,
194
+ "В": 67,
195
+ "Ч": 84,
196
+ "инте": 355,
197
+ "ени": 163,
198
+ "м": 99,
199
+ "сем": 480,
200
+ "кени": 270,
201
+ "ль": 468,
202
+ "ск": 276,
203
+ "т": 105,
204
+ "И": 71,
205
+ "ти": 483,
206
+ "ни": 129,
207
+ "00": 222,
208
+ "ac": 143,
209
+ "ast": 224,
210
+ "me": 206,
211
+ "у": 106,
212
+ "W": 38,
213
+ "ле": 466,
214
+ "com": 251,
215
+ "ut": 322,
216
+ "flo": 301,
217
+ "ons": 343,
218
+ "et": 155,
219
+ ",": 4,
220
+ "мер": 195,
221
+ "Я": 86,
222
+ "C": 19,
223
+ "re": 142,
224
+ "k": 49,
225
+ "ring": 261,
226
+ "ат": 329,
227
+ "7": 15,
228
+ "ow": 408,
229
+ "/": 7,
230
+ "mach": 313,
231
+ "ces": 296,
232
+ "B": 18,
233
+ "Win": 291,
234
+ "ming": 402,
235
+ "Цель": 441,
236
+ "stem": 500,
237
+ "lo": 150,
238
+ "ки": 194,
239
+ "ну": 473,
240
+ "ная": 272,
241
+ "em": 300,
242
+ "gu": 303,
243
+ "ter": 167,
244
+ "olo": 315,
245
+ ".": 6,
246
+ "hum": 307,
247
+ "er": 124,
248
+ "the": 146,
249
+ "щ": 112,
250
+ "ro": 165,
251
+ "Mo": 248,
252
+ "ен": 179,
253
+ "made": 359,
254
+ "The": 223,
255
+ "и": 95,
256
+ "иб": 458,
257
+ "hi": 390,
258
+ "ко": 153,
259
+ "Мо": 439,
260
+ "ages": 378,
261
+ "ш": 111,
262
+ "ns": 405,
263
+ "brings": 380,
264
+ "M": 28,
265
+ "ут": 240,
266
+ "яет": 339,
267
+ "rings": 360,
268
+ "ч": 110,
269
+ "сто": 212,
270
+ "модели": 358,
271
+ "свет": 482,
272
+ "planet": 365,
273
+ "pre": 412,
274
+ "eop": 386,
275
+ "plan": 319,
276
+ "Что": 191,
277
+ "ep": 185,
278
+ "lif": 400,
279
+ "used": 511,
280
+ "мен": 471,
281
+ "То": 440,
282
+ "ure": 427,
283
+ "ас": 328,
284
+ "ани": 263,
285
+ "pt": 411,
286
+ "Fi": 368,
287
+ "ку": 463,
288
+ "тор": 244,
289
+ "air": 377,
290
+ "ход": 487,
291
+ "ра": 136,
292
+ "and": 504,
293
+ "m": 51,
294
+ "bit": 379,
295
+ "ion": 177,
296
+ "сте": 479,
297
+ "П": 77,
298
+ "ar": 123,
299
+ "нима": 503,
300
+ "сет": 335,
301
+ "lay": 399,
302
+ "I": 25,
303
+ "д": 91,
304
+ "ош": 274,
305
+ "space": 422,
306
+ "ne": 404,
307
+ "ry": 260,
308
+ "ца": 182,
309
+ "ъ": 113,
310
+ "si": 262,
311
+ "мод": 237,
312
+ "нов": 333,
313
+ "ent": 507,
314
+ "Т": 80,
315
+ "Вод": 437,
316
+ "ес": 180,
317
+ "ают": 443,
318
+ "токени": 497,
319
+ "З": 70,
320
+ "обуч": 350,
321
+ "Q": 32,
322
+ "La": 370,
323
+ "э": 116,
324
+ "you": 234,
325
+ "name": 407,
326
+ "ics": 215,
327
+ "од": 140,
328
+ "ран": 344,
329
+ "o": 53,
330
+ "fr": 227,
331
+ "жен": 457,
332
+ "fut": 387,
333
+ "est": 340,
334
+ "he": 127,
335
+ "он": 273,
336
+ "г": 90,
337
+ "ist": 280,
338
+ "orbit": 505,
339
+ "ers": 198,
340
+ "ks": 255,
341
+ "сия": 481,
342
+ "ения": 352,
343
+ "uc": 425,
344
+ "atch": 495,
345
+ "Nic": 373,
346
+ "language": 367,
347
+ "А": 65,
348
+ "ket": 308,
349
+ "3": 11,
350
+ "day": 226,
351
+ "el": 176,
352
+ "дней": 454,
353
+ "puter": 364,
354
+ "de": 175,
355
+ "qu": 416,
356
+ "ги": 449,
357
+ "им": 269,
358
+ "lar": 310,
359
+ "ия": 193,
360
+ "ial": 394,
361
+ "am": 292,
362
+ "гра": 450,
363
+ "Б": 66,
364
+ "ars": 281,
365
+ "in": 121,
366
+ "der": 298,
367
+ "an": 130,
368
+ "as": 161,
369
+ "фун": 485,
370
+ "нии": 501,
371
+ "бед": 445,
372
+ "Ф": 81,
373
+ "ges": 252,
374
+ "ов": 139,
375
+ "ы": 114,
376
+ "rat": 417,
377
+ "n": 52,
378
+ "ven": 429,
379
+ "p": 54,
380
+ "pages": 414,
381
+ "ur": 158,
382
+ "1": 9,
383
+ "tr": 321,
384
+ "ма": 181,
385
+ "Y": 39,
386
+ "вет": 265,
387
+ "Sp": 289,
388
+ "ет": 133,
389
+ "at": 122,
390
+ "бя": 444,
391
+ "ей": 267,
392
+ "пт": 474,
393
+ "gen": 389,
394
+ "learning": 246,
395
+ "х": 108,
396
+ "ge": 186,
397
+ "whe": 431,
398
+ "ово": 509,
399
+ "wit": 433,
400
+ "ели": 268,
401
+ "it": 144,
402
+ "pro": 318,
403
+ "ach": 218,
404
+ "К": 72,
405
+ "tain": 189,
406
+ "ps": 410,
407
+ "?": 16,
408
+ "war": 323,
409
+ "ass": 351,
410
+ "Sun": 290,
411
+ "U": 36,
412
+ "um": 190,
413
+ "ic": 137,
414
+ "mov": 403,
415
+ "c": 42,
416
+ "R": 33,
417
+ "vis": 428,
418
+ "при": 217,
419
+ "чес": 491,
420
+ "i": 48,
421
+ "e": 44,
422
+ "col": 383,
423
+ "heal": 498,
424
+ "d": 43,
425
+ "op": 232,
426
+ "dat": 297,
427
+ "За": 438,
428
+ "eep": 384,
429
+ "wor": 432,
430
+ "if": 254,
431
+ "D": 20,
432
+ "тако": 199,
433
+ "ует": 484,
434
+ "les": 187,
435
+ "lig": 311,
436
+ "u": 59,
437
+ "teach": 424,
438
+ "ция": 242,
439
+ "я": 118,
440
+ "mat": 401,
441
+ "ul": 426,
442
+ "ity": 347,
443
+ "from": 356,
444
+ "затор": 284,
445
+ "ля": 209,
446
+ "ер": 159,
447
+ "0": 8,
448
+ "о": 101,
449
+ "ц": 109,
450
+ "St": 374,
451
+ "6": 14,
452
+ "cur": 382,
453
+ "star": 499,
454
+ "Wh": 201,
455
+ "intel": 494,
456
+ "лица": 216,
457
+ "ют": 279,
458
+ "дус": 455,
459
+ "изатор": 462,
460
+ "ва": 235,
461
+ "Л": 73,
462
+ "ol": 258,
463
+ "ou": 207,
464
+ "е": 92,
465
+ "ir": 229,
466
+ "ess": 492,
467
+ "or": 132,
468
+ "ф": 107,
469
+ "сл": 196,
470
+ "Зем": 326,
471
+ "ков": 464,
472
+ "для": 266,
473
+ "AI": 287,
474
+ "st": 128,
475
+ "V": 37,
476
+ "Как": 366,
477
+ "<pad>": 1,
478
+ "сф": 477,
479
+ "Р": 78,
480
+ "r": 56,
481
+ "Sat": 375,
482
+ "uses": 219,
483
+ "ж": 93,
484
+ "ed": 184,
485
+ "cre": 381,
486
+ "об": 156,
487
+ "contain": 285,
488
+ "le": 145,
489
+ "ox": 409,
490
+ "av": 173,
491
+ "stud": 171,
492
+ "ing": 126,
493
+ "P": 31,
494
+ "web": 324,
495
+ "дин": 453,
496
+ "water": 434,
497
+ "!": 3,
498
+ "sh": 419,
499
+ "yc": 325,
500
+ "с": 104,
501
+ "un": 152,
502
+ "b": 41,
503
+ "Н": 75,
504
+ "рабо": 345,
505
+ "та": 148,
506
+ "What": 220,
507
+ "вес": 447,
508
+ "g": 46,
509
+ "л": 98,
510
+ "га": 448,
511
+ "ии": 459,
512
+ "в": 89,
513
+ "pace": 413
514
+ }