LKarlo commited on
Commit
35e291d
·
1 Parent(s): 192ee1c

training roberta structure with 4808259 samples, 2406 test samples, 500 vocab size, 3 hidden layers, 256 hidden size, 4 attention heads, 0.15 mlm probability, 10 num process, 512 max length, 0.0005 train test split, 50 min sub seq length, 2000 max sub seq length, 42 seed

Browse files
Files changed (3) hide show
  1. special_tokens_map.json +37 -0
  2. tokenizer.json +1068 -0
  3. tokenizer_config.json +56 -0
special_tokens_map.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": {
3
+ "content": "<CLS>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "mask_token": {
10
+ "content": "<MASK>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "<PAD>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "sep_token": {
24
+ "content": "<SEP>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "unk_token": {
31
+ "content": "<UNK>",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ }
37
+ }
tokenizer.json ADDED
@@ -0,0 +1,1068 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": "1.0",
3
+ "truncation": {
4
+ "direction": "Right",
5
+ "max_length": 512,
6
+ "strategy": "LongestFirst",
7
+ "stride": 0
8
+ },
9
+ "padding": null,
10
+ "added_tokens": [
11
+ {
12
+ "id": 0,
13
+ "content": "<UNK>",
14
+ "single_word": false,
15
+ "lstrip": false,
16
+ "rstrip": false,
17
+ "normalized": false,
18
+ "special": true
19
+ },
20
+ {
21
+ "id": 1,
22
+ "content": "<SEP>",
23
+ "single_word": false,
24
+ "lstrip": false,
25
+ "rstrip": false,
26
+ "normalized": false,
27
+ "special": true
28
+ },
29
+ {
30
+ "id": 2,
31
+ "content": "<MASK>",
32
+ "single_word": false,
33
+ "lstrip": false,
34
+ "rstrip": false,
35
+ "normalized": false,
36
+ "special": true
37
+ },
38
+ {
39
+ "id": 3,
40
+ "content": "<CLS>",
41
+ "single_word": false,
42
+ "lstrip": false,
43
+ "rstrip": false,
44
+ "normalized": false,
45
+ "special": true
46
+ },
47
+ {
48
+ "id": 4,
49
+ "content": "<PAD>",
50
+ "single_word": false,
51
+ "lstrip": false,
52
+ "rstrip": false,
53
+ "normalized": false,
54
+ "special": true
55
+ }
56
+ ],
57
+ "normalizer": null,
58
+ "pre_tokenizer": {
59
+ "type": "Whitespace"
60
+ },
61
+ "post_processor": {
62
+ "type": "RobertaProcessing",
63
+ "sep": [
64
+ "<SEP>",
65
+ 1
66
+ ],
67
+ "cls": [
68
+ "<CLS>",
69
+ 3
70
+ ],
71
+ "trim_offsets": true,
72
+ "add_prefix_space": true
73
+ },
74
+ "decoder": null,
75
+ "model": {
76
+ "type": "BPE",
77
+ "dropout": null,
78
+ "unk_token": "<UNK>",
79
+ "continuing_subword_prefix": null,
80
+ "end_of_word_suffix": null,
81
+ "fuse_unk": false,
82
+ "byte_fallback": false,
83
+ "vocab": {
84
+ "<UNK>": 0,
85
+ "<SEP>": 1,
86
+ "<MASK>": 2,
87
+ "<CLS>": 3,
88
+ "<PAD>": 4,
89
+ "A": 5,
90
+ "B": 6,
91
+ "C": 7,
92
+ "D": 8,
93
+ "G": 9,
94
+ "H": 10,
95
+ "K": 11,
96
+ "M": 12,
97
+ "N": 13,
98
+ "R": 14,
99
+ "S": 15,
100
+ "T": 16,
101
+ "V": 17,
102
+ "W": 18,
103
+ "Y": 19,
104
+ "AA": 20,
105
+ "TT": 21,
106
+ "GC": 22,
107
+ "GA": 23,
108
+ "TC": 24,
109
+ "TA": 25,
110
+ "TG": 26,
111
+ "CC": 27,
112
+ "CA": 28,
113
+ "GG": 29,
114
+ "CG": 30,
115
+ "AC": 31,
116
+ "AG": 32,
117
+ "GT": 33,
118
+ "AT": 34,
119
+ "CT": 35,
120
+ "ATT": 36,
121
+ "GAA": 37,
122
+ "CAA": 38,
123
+ "TAA": 39,
124
+ "GAT": 40,
125
+ "ATC": 41,
126
+ "GTT": 42,
127
+ "CTT": 43,
128
+ "GCA": 44,
129
+ "GCG": 45,
130
+ "ACC": 46,
131
+ "GCT": 47,
132
+ "GAC": 48,
133
+ "GCC": 49,
134
+ "GAG": 50,
135
+ "GTC": 51,
136
+ "TAT": 52,
137
+ "TGG": 53,
138
+ "CTC": 54,
139
+ "ACA": 55,
140
+ "TGT": 56,
141
+ "TAC": 57,
142
+ "AAAA": 58,
143
+ "TTTT": 59,
144
+ "AAAT": 60,
145
+ "TAG": 61,
146
+ "GGT": 62,
147
+ "ATTT": 63,
148
+ "AAGA": 64,
149
+ "TAAA": 65,
150
+ "AATA": 66,
151
+ "TATT": 67,
152
+ "TTTA": 68,
153
+ "AGAA": 69,
154
+ "AATT": 70,
155
+ "ATAT": 71,
156
+ "TTAT": 72,
157
+ "TCAA": 73,
158
+ "ATAA": 74,
159
+ "TGAA": 75,
160
+ "ATCA": 76,
161
+ "GAAG": 77,
162
+ "TGAT": 78,
163
+ "GAAA": 79,
164
+ "ATTA": 80,
165
+ "TTAA": 81,
166
+ "TTCA": 82,
167
+ "TAAT": 83,
168
+ "AACA": 84,
169
+ "AAAG": 85,
170
+ "TTGA": 86,
171
+ "CAAA": 87,
172
+ "ATGA": 88,
173
+ "ACAA": 89,
174
+ "TTCT": 90,
175
+ "GGG": 91,
176
+ "TCTT": 92,
177
+ "GATG": 93,
178
+ "TGTT": 94,
179
+ "TTTG": 95,
180
+ "TTTC": 96,
181
+ "AATG": 97,
182
+ "TCAT": 98,
183
+ "CTTT": 99,
184
+ "CATC": 100,
185
+ "CGGC": 101,
186
+ "TGGT": 102,
187
+ "AAAC": 103,
188
+ "CTTC": 104,
189
+ "AGAT": 105,
190
+ "GCTG": 106,
191
+ "CAAC": 107,
192
+ "GGCG": 108,
193
+ "CAAG": 109,
194
+ "CAAT": 110,
195
+ "ATTG": 111,
196
+ "GCCG": 112,
197
+ "TGGA": 113,
198
+ "ACCA": 114,
199
+ "CATT": 115,
200
+ "GATT": 116,
201
+ "GTTT": 117,
202
+ "TTGT": 118,
203
+ "TATA": 119,
204
+ "AACT": 120,
205
+ "CTGG": 121,
206
+ "ATGG": 122,
207
+ "CGCC": 123,
208
+ "AATC": 124,
209
+ "CGAC": 125,
210
+ "GAAC": 126,
211
+ "AGCA": 127,
212
+ "GAAT": 128,
213
+ "GCAA": 129,
214
+ "GTTC": 130,
215
+ "ATCT": 131,
216
+ "AGTT": 132,
217
+ "GGTG": 133,
218
+ "ATTC": 134,
219
+ "CAGC": 135,
220
+ "TGCT": 136,
221
+ "GATA": 137,
222
+ "GCGC": 138,
223
+ "GACG": 139,
224
+ "TATC": 140,
225
+ "CAGA": 141,
226
+ "GCAG": 142,
227
+ "GGAA": 143,
228
+ "AAGT": 144,
229
+ "TCGA": 145,
230
+ "GTTG": 146,
231
+ "AAGG": 147,
232
+ "CTGA": 148,
233
+ "AAGC": 149,
234
+ "ACAT": 150,
235
+ "CACC": 151,
236
+ "ACGA": 152,
237
+ "TGGC": 153,
238
+ "ACTT": 154,
239
+ "CCAA": 155,
240
+ "CCGC": 156,
241
+ "ATGT": 157,
242
+ "CGCG": 158,
243
+ "TTGG": 159,
244
+ "TACA": 160,
245
+ "TCCA": 161,
246
+ "CGAA": 162,
247
+ "TCTG": 163,
248
+ "AGGA": 164,
249
+ "GCGG": 165,
250
+ "CTGC": 166,
251
+ "CCAG": 167,
252
+ "ATCG": 168,
253
+ "TCAG": 169,
254
+ "TGAC": 170,
255
+ "GTCG": 171,
256
+ "GATC": 172,
257
+ "TTCC": 173,
258
+ "CGAT": 174,
259
+ "TATG": 175,
260
+ "GTGG": 176,
261
+ "TGCA": 177,
262
+ "AACG": 178,
263
+ "GGCA": 179,
264
+ "CGTC": 180,
265
+ "TTGC": 181,
266
+ "GAGA": 182,
267
+ "TTCG": 183,
268
+ "CTTG": 184,
269
+ "AACC": 185,
270
+ "CCAT": 186,
271
+ "TTAC": 187,
272
+ "GCGA": 188,
273
+ "GCTT": 189,
274
+ "GTAT": 190,
275
+ "AGCT": 191,
276
+ "CCGA": 192,
277
+ "TCGT": 193,
278
+ "GTAA": 194,
279
+ "GCCA": 195,
280
+ "GACA": 196,
281
+ "GGTT": 197,
282
+ "GACC": 198,
283
+ "ACCG": 199,
284
+ "CAGG": 200,
285
+ "ATGC": 201,
286
+ "CCTG": 202,
287
+ "CGAG": 203,
288
+ "GTCA": 204,
289
+ "TGTA": 205,
290
+ "ACTG": 206,
291
+ "ATAC": 207,
292
+ "CATG": 208,
293
+ "CCGG": 209,
294
+ "GGAT": 210,
295
+ "ACAG": 211,
296
+ "TACT": 212,
297
+ "TCGC": 213,
298
+ "GTGA": 214,
299
+ "GCAT": 215,
300
+ "AGTA": 216,
301
+ "AGGT": 217,
302
+ "ACCT": 218,
303
+ "CGGT": 219,
304
+ "GTTA": 220,
305
+ "TCAC": 221,
306
+ "TAAC": 222,
307
+ "CGTT": 223,
308
+ "ATCC": 224,
309
+ "TCCT": 225,
310
+ "CGCA": 226,
311
+ "GGCC": 227,
312
+ "CTCG": 228,
313
+ "TCGG": 229,
314
+ "GAGG": 230,
315
+ "CGCT": 231,
316
+ "ACGC": 232,
317
+ "CTGT": 233,
318
+ "CAGT": 234,
319
+ "GAGC": 235,
320
+ "CCTT": 236,
321
+ "GGTC": 237,
322
+ "GGAG": 238,
323
+ "AGAG": 239,
324
+ "GCTC": 240,
325
+ "GGTA": 241,
326
+ "CTAT": 242,
327
+ "AGAC": 243,
328
+ "ACGG": 244,
329
+ "CATA": 245,
330
+ "CGTG": 246,
331
+ "TCTA": 247,
332
+ "ACAC": 248,
333
+ "TGAG": 249,
334
+ "TGCC": 250,
335
+ "TCTC": 251,
336
+ "GCAC": 252,
337
+ "CCAC": 253,
338
+ "TAGA": 254,
339
+ "GGCT": 255,
340
+ "AGCG": 256,
341
+ "TGTC": 257,
342
+ "GCGT": 258,
343
+ "CTAC": 259,
344
+ "CTCA": 260,
345
+ "ACTA": 261,
346
+ "CACA": 262,
347
+ "CTAA": 263,
348
+ "AGGC": 264,
349
+ "ACG": 265,
350
+ "ACGT": 266,
351
+ "TGGG": 267,
352
+ "TACC": 268,
353
+ "CCCG": 269,
354
+ "GTGC": 270,
355
+ "CACG": 271,
356
+ "TGCG": 272,
357
+ "GGGC": 273,
358
+ "CCTC": 274,
359
+ "TGTG": 275,
360
+ "GTGT": 276,
361
+ "CTCT": 277,
362
+ "TAAG": 278,
363
+ "CCGT": 279,
364
+ "GGAC": 280,
365
+ "GAGT": 281,
366
+ "TTAG": 282,
367
+ "GTAC": 283,
368
+ "GTCT": 284,
369
+ "GACT": 285,
370
+ "CGGA": 286,
371
+ "ATAG": 287,
372
+ "CTTA": 288,
373
+ "ACTC": 289,
374
+ "TCCG": 290,
375
+ "AGCC": 291,
376
+ "ACCC": 292,
377
+ "CTCC": 293,
378
+ "CGGG": 294,
379
+ "AGTG": 295,
380
+ "GCCT": 296,
381
+ "GCCC": 297,
382
+ "CACT": 298,
383
+ "GCTA": 299,
384
+ "TACG": 300,
385
+ "GGGT": 301,
386
+ "TAGT": 302,
387
+ "CCCA": 303,
388
+ "AGTC": 304,
389
+ "GGGA": 305,
390
+ "GTAG": 306,
391
+ "GTCC": 307,
392
+ "CGTA": 308,
393
+ "AGGG": 309,
394
+ "TCCC": 310,
395
+ "TAGC": 311,
396
+ "CCCT": 312,
397
+ "CCCC": 313,
398
+ "GGGG": 314,
399
+ "CCTA": 315,
400
+ "TAGG": 316,
401
+ "CTAG": 317,
402
+ "NN": 318,
403
+ "NNNN": 319,
404
+ "TY": 320,
405
+ "RA": 321,
406
+ "YA": 322,
407
+ "TR": 323,
408
+ "AR": 324,
409
+ "AY": 325,
410
+ "YT": 326,
411
+ "CY": 327,
412
+ "CR": 328,
413
+ "YG": 329,
414
+ "NNN": 330,
415
+ "RT": 331,
416
+ "RG": 332,
417
+ "GY": 333,
418
+ "GR": 334,
419
+ "YC": 335,
420
+ "AAA": 336,
421
+ "TN": 337,
422
+ "RC": 338,
423
+ "AN": 339,
424
+ "TW": 340,
425
+ "NA": 341,
426
+ "TK": 342,
427
+ "WA": 343,
428
+ "KG": 344,
429
+ "CS": 345,
430
+ "MA": 346,
431
+ "TTT": 347,
432
+ "SG": 348,
433
+ "MC": 349,
434
+ "AW": 350,
435
+ "GN": 351,
436
+ "GK": 352,
437
+ "CM": 353,
438
+ "AAT": 354,
439
+ "CN": 355,
440
+ "AM": 356,
441
+ "NG": 357,
442
+ "CW": 358,
443
+ "WT": 359,
444
+ "GS": 360,
445
+ "KA": 361,
446
+ "SC": 362,
447
+ "NT": 363,
448
+ "TM": 364,
449
+ "NC": 365,
450
+ "CK": 366,
451
+ "WC": 367,
452
+ "KT": 368,
453
+ "AAG": 369,
454
+ "MT": 370,
455
+ "WG": 371,
456
+ "TTG": 372,
457
+ "TTA": 373,
458
+ "TS": 374,
459
+ "GAY": 375,
460
+ "MG": 376,
461
+ "AAC": 377,
462
+ "ATA": 378,
463
+ "CCA": 379,
464
+ "KC": 380,
465
+ "AK": 381,
466
+ "GM": 382,
467
+ "GAR": 383,
468
+ "AS": 384,
469
+ "ST": 385,
470
+ "YTC": 386,
471
+ "SA": 387,
472
+ "GW": 388,
473
+ "TTC": 389,
474
+ "GCY": 390,
475
+ "AGT": 391,
476
+ "CAT": 392,
477
+ "RTC": 393,
478
+ "RAAA": 394,
479
+ "TTTY": 395,
480
+ "CCC": 396,
481
+ "ACT": 397,
482
+ "CTG": 398,
483
+ "AGG": 399,
484
+ "TTYA": 400,
485
+ "CGG": 401,
486
+ "TYAA": 402,
487
+ "GTA": 403,
488
+ "CCT": 404,
489
+ "TRAA": 405,
490
+ "ATG": 406,
491
+ "GCR": 407,
492
+ "TTYT": 408,
493
+ "RR": 409,
494
+ "YAAA": 410,
495
+ "AARA": 411,
496
+ "CCG": 412,
497
+ "ARAA": 413,
498
+ "RAAT": 414,
499
+ "CAG": 415,
500
+ "ANNN": 416,
501
+ "WM": 417,
502
+ "ATTY": 418,
503
+ "TNNN": 419,
504
+ "NNNA": 420,
505
+ "WW": 421,
506
+ "YTTT": 422,
507
+ "TTYG": 423,
508
+ "NNNG": 424,
509
+ "GNNN": 425,
510
+ "AAAN": 426,
511
+ "TTRA": 427,
512
+ "GTG": 428,
513
+ "YY": 429,
514
+ "AAAR": 430,
515
+ "CAC": 431,
516
+ "TYTT": 432,
517
+ "CGT": 433,
518
+ "NNNC": 434,
519
+ "CTA": 435,
520
+ "TYAT": 436,
521
+ "YAAT": 437,
522
+ "TRAT": 438,
523
+ "CRAA": 439,
524
+ "TTTN": 440,
525
+ "ATYA": 441,
526
+ "WY": 442,
527
+ "GTTY": 443,
528
+ "CNNN": 444,
529
+ "RTTT": 445,
530
+ "AATY": 446,
531
+ "YATT": 447,
532
+ "ATTR": 448,
533
+ "CTTY": 449,
534
+ "AYTT": 450,
535
+ "TAYT": 451,
536
+ "ATRA": 452,
537
+ "AAYA": 453,
538
+ "GAN": 454,
539
+ "TCRA": 455,
540
+ "RATA": 456,
541
+ "AART": 457,
542
+ "CAAR": 458,
543
+ "TYGA": 459,
544
+ "ARTT": 460,
545
+ "AGC": 461,
546
+ "RAAC": 462,
547
+ "WR": 463,
548
+ "AAAY": 464,
549
+ "ATYT": 465,
550
+ "AYAT": 466,
551
+ "NNNT": 467,
552
+ "RATT": 468,
553
+ "RGAA": 469,
554
+ "YTGT": 470,
555
+ "AARG": 471,
556
+ "ACRA": 472,
557
+ "GAAR": 473,
558
+ "NAAA": 474,
559
+ "TTTR": 475,
560
+ "TCA": 476,
561
+ "GRAA": 477,
562
+ "CCSC": 478,
563
+ "RAAG": 479,
564
+ "YTAT": 480,
565
+ "AAYT": 481,
566
+ "ARAT": 482,
567
+ "ATAY": 483,
568
+ "YAAC": 484,
569
+ "TTCR": 485,
570
+ "GGSG": 486,
571
+ "ARGA": 487,
572
+ "AYAA": 488,
573
+ "YGAA": 489,
574
+ "YCAT": 490,
575
+ "GAK": 491,
576
+ "TCTY": 492,
577
+ "TATY": 493,
578
+ "TYTA": 494,
579
+ "TYGT": 495,
580
+ "GSGG": 496,
581
+ "GAM": 497,
582
+ "CTRA": 498,
583
+ "TYCA": 499
584
+ },
585
+ "merges": [
586
+ "A A",
587
+ "T T",
588
+ "G C",
589
+ "G A",
590
+ "T C",
591
+ "T A",
592
+ "T G",
593
+ "C C",
594
+ "C A",
595
+ "G G",
596
+ "C G",
597
+ "A C",
598
+ "A G",
599
+ "G T",
600
+ "A T",
601
+ "C T",
602
+ "A TT",
603
+ "G AA",
604
+ "C AA",
605
+ "T AA",
606
+ "GA T",
607
+ "A TC",
608
+ "G TT",
609
+ "C TT",
610
+ "GC A",
611
+ "GC G",
612
+ "A CC",
613
+ "GC T",
614
+ "GA C",
615
+ "GC C",
616
+ "GA G",
617
+ "G TC",
618
+ "TA T",
619
+ "TG G",
620
+ "C TC",
621
+ "A CA",
622
+ "TG T",
623
+ "TA C",
624
+ "AA AA",
625
+ "TT TT",
626
+ "AA AT",
627
+ "TA G",
628
+ "GG T",
629
+ "ATT T",
630
+ "AA GA",
631
+ "TAA A",
632
+ "AA TA",
633
+ "TA TT",
634
+ "TT TA",
635
+ "AG AA",
636
+ "AA TT",
637
+ "A TAT",
638
+ "TT AT",
639
+ "TC AA",
640
+ "AT AA",
641
+ "TG AA",
642
+ "ATC A",
643
+ "GAA G",
644
+ "T GAT",
645
+ "GAA A",
646
+ "ATT A",
647
+ "TT AA",
648
+ "TT CA",
649
+ "TAA T",
650
+ "AA CA",
651
+ "AA AG",
652
+ "TT GA",
653
+ "CAA A",
654
+ "AT GA",
655
+ "AC AA",
656
+ "TT CT",
657
+ "GG G",
658
+ "TC TT",
659
+ "GA TG",
660
+ "TG TT",
661
+ "TT TG",
662
+ "TT TC",
663
+ "AA TG",
664
+ "TC AT",
665
+ "CTT T",
666
+ "CA TC",
667
+ "CG GC",
668
+ "TG GT",
669
+ "AA AC",
670
+ "CTT C",
671
+ "A GAT",
672
+ "GC TG",
673
+ "CAA C",
674
+ "G GCG",
675
+ "CAA G",
676
+ "CAA T",
677
+ "ATT G",
678
+ "GC CG",
679
+ "TG GA",
680
+ "ACC A",
681
+ "CA TT",
682
+ "GA TT",
683
+ "GTT T",
684
+ "TT GT",
685
+ "TA TA",
686
+ "AA CT",
687
+ "C TGG",
688
+ "A TGG",
689
+ "C GCC",
690
+ "AA TC",
691
+ "C GAC",
692
+ "GAA C",
693
+ "A GCA",
694
+ "GAA T",
695
+ "GC AA",
696
+ "GTT C",
697
+ "ATC T",
698
+ "AG TT",
699
+ "GG TG",
700
+ "ATT C",
701
+ "CA GC",
702
+ "T GCT",
703
+ "GA TA",
704
+ "GC GC",
705
+ "GA CG",
706
+ "TA TC",
707
+ "CA GA",
708
+ "GC AG",
709
+ "GG AA",
710
+ "AA GT",
711
+ "TC GA",
712
+ "GTT G",
713
+ "AA GG",
714
+ "CT GA",
715
+ "AA GC",
716
+ "ACA T",
717
+ "CA CC",
718
+ "AC GA",
719
+ "TG GC",
720
+ "AC TT",
721
+ "CC AA",
722
+ "CC GC",
723
+ "A TGT",
724
+ "C GCG",
725
+ "TT GG",
726
+ "TA CA",
727
+ "TC CA",
728
+ "CG AA",
729
+ "TC TG",
730
+ "AG GA",
731
+ "GC GG",
732
+ "CT GC",
733
+ "CC AG",
734
+ "ATC G",
735
+ "TC AG",
736
+ "T GAC",
737
+ "GTC G",
738
+ "GA TC",
739
+ "TT CC",
740
+ "C GAT",
741
+ "TA TG",
742
+ "G TGG",
743
+ "T GCA",
744
+ "AA CG",
745
+ "G GCA",
746
+ "CG TC",
747
+ "TT GC",
748
+ "GA GA",
749
+ "TT CG",
750
+ "CTT G",
751
+ "AA CC",
752
+ "CC AT",
753
+ "TT AC",
754
+ "GC GA",
755
+ "GC TT",
756
+ "G TAT",
757
+ "A GCT",
758
+ "CC GA",
759
+ "TC GT",
760
+ "GT AA",
761
+ "GC CA",
762
+ "GA CA",
763
+ "GG TT",
764
+ "GA CC",
765
+ "ACC G",
766
+ "CA GG",
767
+ "AT GC",
768
+ "CC TG",
769
+ "C GAG",
770
+ "GTC A",
771
+ "TG TA",
772
+ "AC TG",
773
+ "A TAC",
774
+ "CA TG",
775
+ "CC GG",
776
+ "G GAT",
777
+ "ACA G",
778
+ "TA CT",
779
+ "TC GC",
780
+ "GT GA",
781
+ "GC AT",
782
+ "AG TA",
783
+ "A GGT",
784
+ "ACC T",
785
+ "C GGT",
786
+ "GTT A",
787
+ "TC AC",
788
+ "TAA C",
789
+ "CG TT",
790
+ "ATC C",
791
+ "TC CT",
792
+ "C GCA",
793
+ "G GCC",
794
+ "CTC G",
795
+ "TC GG",
796
+ "GA GG",
797
+ "C GCT",
798
+ "AC GC",
799
+ "C TGT",
800
+ "CA GT",
801
+ "GA GC",
802
+ "CC TT",
803
+ "GG TC",
804
+ "G GAG",
805
+ "A GAG",
806
+ "GC TC",
807
+ "GG TA",
808
+ "C TAT",
809
+ "A GAC",
810
+ "AC GG",
811
+ "CA TA",
812
+ "CG TG",
813
+ "TC TA",
814
+ "ACA C",
815
+ "T GAG",
816
+ "T GCC",
817
+ "TC TC",
818
+ "GC AC",
819
+ "CC AC",
820
+ "TA GA",
821
+ "G GCT",
822
+ "A GCG",
823
+ "TG TC",
824
+ "GC GT",
825
+ "C TAC",
826
+ "CTC A",
827
+ "AC TA",
828
+ "CA CA",
829
+ "CT AA",
830
+ "AG GC",
831
+ "A CG",
832
+ "ACG T",
833
+ "TG GG",
834
+ "TA CC",
835
+ "CC CG",
836
+ "GT GC",
837
+ "CA CG",
838
+ "T GCG",
839
+ "GG GC",
840
+ "CC TC",
841
+ "TG TG",
842
+ "G TGT",
843
+ "CTC T",
844
+ "TAA G",
845
+ "CC GT",
846
+ "G GAC",
847
+ "GA GT",
848
+ "TT AG",
849
+ "G TAC",
850
+ "GTC T",
851
+ "GA CT",
852
+ "CG GA",
853
+ "A TAG",
854
+ "CTT A",
855
+ "AC TC",
856
+ "TC CG",
857
+ "A GCC",
858
+ "ACC C",
859
+ "CTC C",
860
+ "C GGG",
861
+ "AG TG",
862
+ "GC CT",
863
+ "GC CC",
864
+ "CA CT",
865
+ "GC TA",
866
+ "TA CG",
867
+ "GG GT",
868
+ "TA GT",
869
+ "CC CA",
870
+ "AG TC",
871
+ "GG GA",
872
+ "G TAG",
873
+ "GTC C",
874
+ "CG TA",
875
+ "A GGG",
876
+ "TC CC",
877
+ "TA GC",
878
+ "CC CT",
879
+ "CC CC",
880
+ "GG GG",
881
+ "CC TA",
882
+ "TA GG",
883
+ "C TAG",
884
+ "N N",
885
+ "NN NN",
886
+ "T Y",
887
+ "R A",
888
+ "Y A",
889
+ "T R",
890
+ "A R",
891
+ "A Y",
892
+ "Y T",
893
+ "C Y",
894
+ "C R",
895
+ "Y G",
896
+ "NN N",
897
+ "R T",
898
+ "R G",
899
+ "G Y",
900
+ "G R",
901
+ "Y C",
902
+ "AA A",
903
+ "T N",
904
+ "R C",
905
+ "A N",
906
+ "T W",
907
+ "N A",
908
+ "T K",
909
+ "W A",
910
+ "K G",
911
+ "C S",
912
+ "M A",
913
+ "TT T",
914
+ "S G",
915
+ "M C",
916
+ "A W",
917
+ "G N",
918
+ "G K",
919
+ "C M",
920
+ "AA T",
921
+ "C N",
922
+ "A M",
923
+ "N G",
924
+ "C W",
925
+ "W T",
926
+ "G S",
927
+ "K A",
928
+ "S C",
929
+ "N T",
930
+ "T M",
931
+ "N C",
932
+ "C K",
933
+ "W C",
934
+ "K T",
935
+ "AA G",
936
+ "M T",
937
+ "W G",
938
+ "TT G",
939
+ "TT A",
940
+ "T S",
941
+ "GA Y",
942
+ "M G",
943
+ "AA C",
944
+ "A TA",
945
+ "CC A",
946
+ "K C",
947
+ "A K",
948
+ "G M",
949
+ "GA R",
950
+ "A S",
951
+ "S T",
952
+ "Y TC",
953
+ "S A",
954
+ "G W",
955
+ "TT C",
956
+ "GC Y",
957
+ "AG T",
958
+ "CA T",
959
+ "R TC",
960
+ "R AAA",
961
+ "TT TY",
962
+ "CC C",
963
+ "AC T",
964
+ "C TG",
965
+ "A GG",
966
+ "TT YA",
967
+ "C GG",
968
+ "TY AA",
969
+ "G TA",
970
+ "CC T",
971
+ "TR AA",
972
+ "A TG",
973
+ "GC R",
974
+ "TT YT",
975
+ "R R",
976
+ "Y AAA",
977
+ "AA RA",
978
+ "CC G",
979
+ "AR AA",
980
+ "R AAT",
981
+ "CA G",
982
+ "A NNN",
983
+ "W M",
984
+ "ATT Y",
985
+ "T NNN",
986
+ "NNN A",
987
+ "W W",
988
+ "Y TTT",
989
+ "TT YG",
990
+ "NNN G",
991
+ "G NNN",
992
+ "AAA N",
993
+ "TT RA",
994
+ "G TG",
995
+ "Y Y",
996
+ "AA AR",
997
+ "CA C",
998
+ "TY TT",
999
+ "CG T",
1000
+ "NNN C",
1001
+ "C TA",
1002
+ "TY AT",
1003
+ "Y AAT",
1004
+ "TR AT",
1005
+ "CR AA",
1006
+ "TT TN",
1007
+ "AT YA",
1008
+ "W Y",
1009
+ "GTT Y",
1010
+ "C NNN",
1011
+ "R TTT",
1012
+ "AA TY",
1013
+ "Y ATT",
1014
+ "ATT R",
1015
+ "CTT Y",
1016
+ "AY TT",
1017
+ "TA YT",
1018
+ "AT RA",
1019
+ "AA YA",
1020
+ "GA N",
1021
+ "TC RA",
1022
+ "RA TA",
1023
+ "AA RT",
1024
+ "CAA R",
1025
+ "TY GA",
1026
+ "AR TT",
1027
+ "A GC",
1028
+ "R AAC",
1029
+ "W R",
1030
+ "AA AY",
1031
+ "AT YT",
1032
+ "AY AT",
1033
+ "NNN T",
1034
+ "R ATT",
1035
+ "R GAA",
1036
+ "Y TGT",
1037
+ "AA RG",
1038
+ "AC RA",
1039
+ "GAA R",
1040
+ "N AAA",
1041
+ "TT TR",
1042
+ "TC A",
1043
+ "GR AA",
1044
+ "CC SC",
1045
+ "R AAG",
1046
+ "Y TAT",
1047
+ "AA YT",
1048
+ "AR AT",
1049
+ "ATA Y",
1050
+ "Y AAC",
1051
+ "TT CR",
1052
+ "GG SG",
1053
+ "AR GA",
1054
+ "AY AA",
1055
+ "Y GAA",
1056
+ "Y CAT",
1057
+ "GA K",
1058
+ "TC TY",
1059
+ "TAT Y",
1060
+ "TY TA",
1061
+ "TY GT",
1062
+ "GS GG",
1063
+ "GA M",
1064
+ "CT RA",
1065
+ "TY CA"
1066
+ ]
1067
+ }
1068
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "<UNK>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "<SEP>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "<MASK>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "<CLS>",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "4": {
36
+ "content": "<PAD>",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "clean_up_tokenization_spaces": true,
45
+ "cls_token": "<CLS>",
46
+ "mask_token": "<MASK>",
47
+ "max_length": 512,
48
+ "model_max_length": 512,
49
+ "pad_token": "<PAD>",
50
+ "sep_token": "<SEP>",
51
+ "stride": 0,
52
+ "tokenizer_class": "PreTrainedTokenizerFast",
53
+ "truncation_side": "right",
54
+ "truncation_strategy": "longest_first",
55
+ "unk_token": "<UNK>"
56
+ }