vinucmer-small / tokenizer.json
LKarlo's picture
training roberta structure with 4808259 samples, 2406 test samples, 500 vocab size, 3 hidden layers, 256 hidden size, 4 attention heads, 0.15 mlm probability, 10 num process, 512 max length, 0.0005 train test split, 50 min sub seq length, 2000 max sub seq length, 42 seed
35e291d
{
"version": "1.0",
"truncation": {
"direction": "Right",
"max_length": 512,
"strategy": "LongestFirst",
"stride": 0
},
"padding": null,
"added_tokens": [
{
"id": 0,
"content": "<UNK>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 1,
"content": "<SEP>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 2,
"content": "<MASK>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 3,
"content": "<CLS>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 4,
"content": "<PAD>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
}
],
"normalizer": null,
"pre_tokenizer": {
"type": "Whitespace"
},
"post_processor": {
"type": "RobertaProcessing",
"sep": [
"<SEP>",
1
],
"cls": [
"<CLS>",
3
],
"trim_offsets": true,
"add_prefix_space": true
},
"decoder": null,
"model": {
"type": "BPE",
"dropout": null,
"unk_token": "<UNK>",
"continuing_subword_prefix": null,
"end_of_word_suffix": null,
"fuse_unk": false,
"byte_fallback": false,
"vocab": {
"<UNK>": 0,
"<SEP>": 1,
"<MASK>": 2,
"<CLS>": 3,
"<PAD>": 4,
"A": 5,
"B": 6,
"C": 7,
"D": 8,
"G": 9,
"H": 10,
"K": 11,
"M": 12,
"N": 13,
"R": 14,
"S": 15,
"T": 16,
"V": 17,
"W": 18,
"Y": 19,
"AA": 20,
"TT": 21,
"GC": 22,
"GA": 23,
"TC": 24,
"TA": 25,
"TG": 26,
"CC": 27,
"CA": 28,
"GG": 29,
"CG": 30,
"AC": 31,
"AG": 32,
"GT": 33,
"AT": 34,
"CT": 35,
"ATT": 36,
"GAA": 37,
"CAA": 38,
"TAA": 39,
"GAT": 40,
"ATC": 41,
"GTT": 42,
"CTT": 43,
"GCA": 44,
"GCG": 45,
"ACC": 46,
"GCT": 47,
"GAC": 48,
"GCC": 49,
"GAG": 50,
"GTC": 51,
"TAT": 52,
"TGG": 53,
"CTC": 54,
"ACA": 55,
"TGT": 56,
"TAC": 57,
"AAAA": 58,
"TTTT": 59,
"AAAT": 60,
"TAG": 61,
"GGT": 62,
"ATTT": 63,
"AAGA": 64,
"TAAA": 65,
"AATA": 66,
"TATT": 67,
"TTTA": 68,
"AGAA": 69,
"AATT": 70,
"ATAT": 71,
"TTAT": 72,
"TCAA": 73,
"ATAA": 74,
"TGAA": 75,
"ATCA": 76,
"GAAG": 77,
"TGAT": 78,
"GAAA": 79,
"ATTA": 80,
"TTAA": 81,
"TTCA": 82,
"TAAT": 83,
"AACA": 84,
"AAAG": 85,
"TTGA": 86,
"CAAA": 87,
"ATGA": 88,
"ACAA": 89,
"TTCT": 90,
"GGG": 91,
"TCTT": 92,
"GATG": 93,
"TGTT": 94,
"TTTG": 95,
"TTTC": 96,
"AATG": 97,
"TCAT": 98,
"CTTT": 99,
"CATC": 100,
"CGGC": 101,
"TGGT": 102,
"AAAC": 103,
"CTTC": 104,
"AGAT": 105,
"GCTG": 106,
"CAAC": 107,
"GGCG": 108,
"CAAG": 109,
"CAAT": 110,
"ATTG": 111,
"GCCG": 112,
"TGGA": 113,
"ACCA": 114,
"CATT": 115,
"GATT": 116,
"GTTT": 117,
"TTGT": 118,
"TATA": 119,
"AACT": 120,
"CTGG": 121,
"ATGG": 122,
"CGCC": 123,
"AATC": 124,
"CGAC": 125,
"GAAC": 126,
"AGCA": 127,
"GAAT": 128,
"GCAA": 129,
"GTTC": 130,
"ATCT": 131,
"AGTT": 132,
"GGTG": 133,
"ATTC": 134,
"CAGC": 135,
"TGCT": 136,
"GATA": 137,
"GCGC": 138,
"GACG": 139,
"TATC": 140,
"CAGA": 141,
"GCAG": 142,
"GGAA": 143,
"AAGT": 144,
"TCGA": 145,
"GTTG": 146,
"AAGG": 147,
"CTGA": 148,
"AAGC": 149,
"ACAT": 150,
"CACC": 151,
"ACGA": 152,
"TGGC": 153,
"ACTT": 154,
"CCAA": 155,
"CCGC": 156,
"ATGT": 157,
"CGCG": 158,
"TTGG": 159,
"TACA": 160,
"TCCA": 161,
"CGAA": 162,
"TCTG": 163,
"AGGA": 164,
"GCGG": 165,
"CTGC": 166,
"CCAG": 167,
"ATCG": 168,
"TCAG": 169,
"TGAC": 170,
"GTCG": 171,
"GATC": 172,
"TTCC": 173,
"CGAT": 174,
"TATG": 175,
"GTGG": 176,
"TGCA": 177,
"AACG": 178,
"GGCA": 179,
"CGTC": 180,
"TTGC": 181,
"GAGA": 182,
"TTCG": 183,
"CTTG": 184,
"AACC": 185,
"CCAT": 186,
"TTAC": 187,
"GCGA": 188,
"GCTT": 189,
"GTAT": 190,
"AGCT": 191,
"CCGA": 192,
"TCGT": 193,
"GTAA": 194,
"GCCA": 195,
"GACA": 196,
"GGTT": 197,
"GACC": 198,
"ACCG": 199,
"CAGG": 200,
"ATGC": 201,
"CCTG": 202,
"CGAG": 203,
"GTCA": 204,
"TGTA": 205,
"ACTG": 206,
"ATAC": 207,
"CATG": 208,
"CCGG": 209,
"GGAT": 210,
"ACAG": 211,
"TACT": 212,
"TCGC": 213,
"GTGA": 214,
"GCAT": 215,
"AGTA": 216,
"AGGT": 217,
"ACCT": 218,
"CGGT": 219,
"GTTA": 220,
"TCAC": 221,
"TAAC": 222,
"CGTT": 223,
"ATCC": 224,
"TCCT": 225,
"CGCA": 226,
"GGCC": 227,
"CTCG": 228,
"TCGG": 229,
"GAGG": 230,
"CGCT": 231,
"ACGC": 232,
"CTGT": 233,
"CAGT": 234,
"GAGC": 235,
"CCTT": 236,
"GGTC": 237,
"GGAG": 238,
"AGAG": 239,
"GCTC": 240,
"GGTA": 241,
"CTAT": 242,
"AGAC": 243,
"ACGG": 244,
"CATA": 245,
"CGTG": 246,
"TCTA": 247,
"ACAC": 248,
"TGAG": 249,
"TGCC": 250,
"TCTC": 251,
"GCAC": 252,
"CCAC": 253,
"TAGA": 254,
"GGCT": 255,
"AGCG": 256,
"TGTC": 257,
"GCGT": 258,
"CTAC": 259,
"CTCA": 260,
"ACTA": 261,
"CACA": 262,
"CTAA": 263,
"AGGC": 264,
"ACG": 265,
"ACGT": 266,
"TGGG": 267,
"TACC": 268,
"CCCG": 269,
"GTGC": 270,
"CACG": 271,
"TGCG": 272,
"GGGC": 273,
"CCTC": 274,
"TGTG": 275,
"GTGT": 276,
"CTCT": 277,
"TAAG": 278,
"CCGT": 279,
"GGAC": 280,
"GAGT": 281,
"TTAG": 282,
"GTAC": 283,
"GTCT": 284,
"GACT": 285,
"CGGA": 286,
"ATAG": 287,
"CTTA": 288,
"ACTC": 289,
"TCCG": 290,
"AGCC": 291,
"ACCC": 292,
"CTCC": 293,
"CGGG": 294,
"AGTG": 295,
"GCCT": 296,
"GCCC": 297,
"CACT": 298,
"GCTA": 299,
"TACG": 300,
"GGGT": 301,
"TAGT": 302,
"CCCA": 303,
"AGTC": 304,
"GGGA": 305,
"GTAG": 306,
"GTCC": 307,
"CGTA": 308,
"AGGG": 309,
"TCCC": 310,
"TAGC": 311,
"CCCT": 312,
"CCCC": 313,
"GGGG": 314,
"CCTA": 315,
"TAGG": 316,
"CTAG": 317,
"NN": 318,
"NNNN": 319,
"TY": 320,
"RA": 321,
"YA": 322,
"TR": 323,
"AR": 324,
"AY": 325,
"YT": 326,
"CY": 327,
"CR": 328,
"YG": 329,
"NNN": 330,
"RT": 331,
"RG": 332,
"GY": 333,
"GR": 334,
"YC": 335,
"AAA": 336,
"TN": 337,
"RC": 338,
"AN": 339,
"TW": 340,
"NA": 341,
"TK": 342,
"WA": 343,
"KG": 344,
"CS": 345,
"MA": 346,
"TTT": 347,
"SG": 348,
"MC": 349,
"AW": 350,
"GN": 351,
"GK": 352,
"CM": 353,
"AAT": 354,
"CN": 355,
"AM": 356,
"NG": 357,
"CW": 358,
"WT": 359,
"GS": 360,
"KA": 361,
"SC": 362,
"NT": 363,
"TM": 364,
"NC": 365,
"CK": 366,
"WC": 367,
"KT": 368,
"AAG": 369,
"MT": 370,
"WG": 371,
"TTG": 372,
"TTA": 373,
"TS": 374,
"GAY": 375,
"MG": 376,
"AAC": 377,
"ATA": 378,
"CCA": 379,
"KC": 380,
"AK": 381,
"GM": 382,
"GAR": 383,
"AS": 384,
"ST": 385,
"YTC": 386,
"SA": 387,
"GW": 388,
"TTC": 389,
"GCY": 390,
"AGT": 391,
"CAT": 392,
"RTC": 393,
"RAAA": 394,
"TTTY": 395,
"CCC": 396,
"ACT": 397,
"CTG": 398,
"AGG": 399,
"TTYA": 400,
"CGG": 401,
"TYAA": 402,
"GTA": 403,
"CCT": 404,
"TRAA": 405,
"ATG": 406,
"GCR": 407,
"TTYT": 408,
"RR": 409,
"YAAA": 410,
"AARA": 411,
"CCG": 412,
"ARAA": 413,
"RAAT": 414,
"CAG": 415,
"ANNN": 416,
"WM": 417,
"ATTY": 418,
"TNNN": 419,
"NNNA": 420,
"WW": 421,
"YTTT": 422,
"TTYG": 423,
"NNNG": 424,
"GNNN": 425,
"AAAN": 426,
"TTRA": 427,
"GTG": 428,
"YY": 429,
"AAAR": 430,
"CAC": 431,
"TYTT": 432,
"CGT": 433,
"NNNC": 434,
"CTA": 435,
"TYAT": 436,
"YAAT": 437,
"TRAT": 438,
"CRAA": 439,
"TTTN": 440,
"ATYA": 441,
"WY": 442,
"GTTY": 443,
"CNNN": 444,
"RTTT": 445,
"AATY": 446,
"YATT": 447,
"ATTR": 448,
"CTTY": 449,
"AYTT": 450,
"TAYT": 451,
"ATRA": 452,
"AAYA": 453,
"GAN": 454,
"TCRA": 455,
"RATA": 456,
"AART": 457,
"CAAR": 458,
"TYGA": 459,
"ARTT": 460,
"AGC": 461,
"RAAC": 462,
"WR": 463,
"AAAY": 464,
"ATYT": 465,
"AYAT": 466,
"NNNT": 467,
"RATT": 468,
"RGAA": 469,
"YTGT": 470,
"AARG": 471,
"ACRA": 472,
"GAAR": 473,
"NAAA": 474,
"TTTR": 475,
"TCA": 476,
"GRAA": 477,
"CCSC": 478,
"RAAG": 479,
"YTAT": 480,
"AAYT": 481,
"ARAT": 482,
"ATAY": 483,
"YAAC": 484,
"TTCR": 485,
"GGSG": 486,
"ARGA": 487,
"AYAA": 488,
"YGAA": 489,
"YCAT": 490,
"GAK": 491,
"TCTY": 492,
"TATY": 493,
"TYTA": 494,
"TYGT": 495,
"GSGG": 496,
"GAM": 497,
"CTRA": 498,
"TYCA": 499
},
"merges": [
"A A",
"T T",
"G C",
"G A",
"T C",
"T A",
"T G",
"C C",
"C A",
"G G",
"C G",
"A C",
"A G",
"G T",
"A T",
"C T",
"A TT",
"G AA",
"C AA",
"T AA",
"GA T",
"A TC",
"G TT",
"C TT",
"GC A",
"GC G",
"A CC",
"GC T",
"GA C",
"GC C",
"GA G",
"G TC",
"TA T",
"TG G",
"C TC",
"A CA",
"TG T",
"TA C",
"AA AA",
"TT TT",
"AA AT",
"TA G",
"GG T",
"ATT T",
"AA GA",
"TAA A",
"AA TA",
"TA TT",
"TT TA",
"AG AA",
"AA TT",
"A TAT",
"TT AT",
"TC AA",
"AT AA",
"TG AA",
"ATC A",
"GAA G",
"T GAT",
"GAA A",
"ATT A",
"TT AA",
"TT CA",
"TAA T",
"AA CA",
"AA AG",
"TT GA",
"CAA A",
"AT GA",
"AC AA",
"TT CT",
"GG G",
"TC TT",
"GA TG",
"TG TT",
"TT TG",
"TT TC",
"AA TG",
"TC AT",
"CTT T",
"CA TC",
"CG GC",
"TG GT",
"AA AC",
"CTT C",
"A GAT",
"GC TG",
"CAA C",
"G GCG",
"CAA G",
"CAA T",
"ATT G",
"GC CG",
"TG GA",
"ACC A",
"CA TT",
"GA TT",
"GTT T",
"TT GT",
"TA TA",
"AA CT",
"C TGG",
"A TGG",
"C GCC",
"AA TC",
"C GAC",
"GAA C",
"A GCA",
"GAA T",
"GC AA",
"GTT C",
"ATC T",
"AG TT",
"GG TG",
"ATT C",
"CA GC",
"T GCT",
"GA TA",
"GC GC",
"GA CG",
"TA TC",
"CA GA",
"GC AG",
"GG AA",
"AA GT",
"TC GA",
"GTT G",
"AA GG",
"CT GA",
"AA GC",
"ACA T",
"CA CC",
"AC GA",
"TG GC",
"AC TT",
"CC AA",
"CC GC",
"A TGT",
"C GCG",
"TT GG",
"TA CA",
"TC CA",
"CG AA",
"TC TG",
"AG GA",
"GC GG",
"CT GC",
"CC AG",
"ATC G",
"TC AG",
"T GAC",
"GTC G",
"GA TC",
"TT CC",
"C GAT",
"TA TG",
"G TGG",
"T GCA",
"AA CG",
"G GCA",
"CG TC",
"TT GC",
"GA GA",
"TT CG",
"CTT G",
"AA CC",
"CC AT",
"TT AC",
"GC GA",
"GC TT",
"G TAT",
"A GCT",
"CC GA",
"TC GT",
"GT AA",
"GC CA",
"GA CA",
"GG TT",
"GA CC",
"ACC G",
"CA GG",
"AT GC",
"CC TG",
"C GAG",
"GTC A",
"TG TA",
"AC TG",
"A TAC",
"CA TG",
"CC GG",
"G GAT",
"ACA G",
"TA CT",
"TC GC",
"GT GA",
"GC AT",
"AG TA",
"A GGT",
"ACC T",
"C GGT",
"GTT A",
"TC AC",
"TAA C",
"CG TT",
"ATC C",
"TC CT",
"C GCA",
"G GCC",
"CTC G",
"TC GG",
"GA GG",
"C GCT",
"AC GC",
"C TGT",
"CA GT",
"GA GC",
"CC TT",
"GG TC",
"G GAG",
"A GAG",
"GC TC",
"GG TA",
"C TAT",
"A GAC",
"AC GG",
"CA TA",
"CG TG",
"TC TA",
"ACA C",
"T GAG",
"T GCC",
"TC TC",
"GC AC",
"CC AC",
"TA GA",
"G GCT",
"A GCG",
"TG TC",
"GC GT",
"C TAC",
"CTC A",
"AC TA",
"CA CA",
"CT AA",
"AG GC",
"A CG",
"ACG T",
"TG GG",
"TA CC",
"CC CG",
"GT GC",
"CA CG",
"T GCG",
"GG GC",
"CC TC",
"TG TG",
"G TGT",
"CTC T",
"TAA G",
"CC GT",
"G GAC",
"GA GT",
"TT AG",
"G TAC",
"GTC T",
"GA CT",
"CG GA",
"A TAG",
"CTT A",
"AC TC",
"TC CG",
"A GCC",
"ACC C",
"CTC C",
"C GGG",
"AG TG",
"GC CT",
"GC CC",
"CA CT",
"GC TA",
"TA CG",
"GG GT",
"TA GT",
"CC CA",
"AG TC",
"GG GA",
"G TAG",
"GTC C",
"CG TA",
"A GGG",
"TC CC",
"TA GC",
"CC CT",
"CC CC",
"GG GG",
"CC TA",
"TA GG",
"C TAG",
"N N",
"NN NN",
"T Y",
"R A",
"Y A",
"T R",
"A R",
"A Y",
"Y T",
"C Y",
"C R",
"Y G",
"NN N",
"R T",
"R G",
"G Y",
"G R",
"Y C",
"AA A",
"T N",
"R C",
"A N",
"T W",
"N A",
"T K",
"W A",
"K G",
"C S",
"M A",
"TT T",
"S G",
"M C",
"A W",
"G N",
"G K",
"C M",
"AA T",
"C N",
"A M",
"N G",
"C W",
"W T",
"G S",
"K A",
"S C",
"N T",
"T M",
"N C",
"C K",
"W C",
"K T",
"AA G",
"M T",
"W G",
"TT G",
"TT A",
"T S",
"GA Y",
"M G",
"AA C",
"A TA",
"CC A",
"K C",
"A K",
"G M",
"GA R",
"A S",
"S T",
"Y TC",
"S A",
"G W",
"TT C",
"GC Y",
"AG T",
"CA T",
"R TC",
"R AAA",
"TT TY",
"CC C",
"AC T",
"C TG",
"A GG",
"TT YA",
"C GG",
"TY AA",
"G TA",
"CC T",
"TR AA",
"A TG",
"GC R",
"TT YT",
"R R",
"Y AAA",
"AA RA",
"CC G",
"AR AA",
"R AAT",
"CA G",
"A NNN",
"W M",
"ATT Y",
"T NNN",
"NNN A",
"W W",
"Y TTT",
"TT YG",
"NNN G",
"G NNN",
"AAA N",
"TT RA",
"G TG",
"Y Y",
"AA AR",
"CA C",
"TY TT",
"CG T",
"NNN C",
"C TA",
"TY AT",
"Y AAT",
"TR AT",
"CR AA",
"TT TN",
"AT YA",
"W Y",
"GTT Y",
"C NNN",
"R TTT",
"AA TY",
"Y ATT",
"ATT R",
"CTT Y",
"AY TT",
"TA YT",
"AT RA",
"AA YA",
"GA N",
"TC RA",
"RA TA",
"AA RT",
"CAA R",
"TY GA",
"AR TT",
"A GC",
"R AAC",
"W R",
"AA AY",
"AT YT",
"AY AT",
"NNN T",
"R ATT",
"R GAA",
"Y TGT",
"AA RG",
"AC RA",
"GAA R",
"N AAA",
"TT TR",
"TC A",
"GR AA",
"CC SC",
"R AAG",
"Y TAT",
"AA YT",
"AR AT",
"ATA Y",
"Y AAC",
"TT CR",
"GG SG",
"AR GA",
"AY AA",
"Y GAA",
"Y CAT",
"GA K",
"TC TY",
"TAT Y",
"TY TA",
"TY GT",
"GS GG",
"GA M",
"CT RA",
"TY CA"
]
}
}