imgremlin's picture
Upload tokenizer
d6b38b1
{
"version": "1.0",
"truncation": null,
"padding": null,
"added_tokens": [
{
"id": 0,
"content": "<s>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 1,
"content": "<pad>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 2,
"content": "</s>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 3,
"content": "<unk>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 4,
"content": "<mask>",
"single_word": false,
"lstrip": true,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 15,
"content": "+",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": true,
"special": false
},
{
"id": 20,
"content": "0",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": true,
"special": false
},
{
"id": 21,
"content": "1",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": true,
"special": false
},
{
"id": 22,
"content": "2",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": true,
"special": false
},
{
"id": 23,
"content": "3",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": true,
"special": false
},
{
"id": 24,
"content": "4",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": true,
"special": false
},
{
"id": 25,
"content": "5",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": true,
"special": false
},
{
"id": 26,
"content": "6",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": true,
"special": false
},
{
"id": 27,
"content": "7",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": true,
"special": false
},
{
"id": 28,
"content": "8",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": true,
"special": false
},
{
"id": 29,
"content": "9",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": true,
"special": false
},
{
"id": 38,
"content": "B",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": true,
"special": false
},
{
"id": 39,
"content": "C",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": true,
"special": false
},
{
"id": 42,
"content": "F",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": true,
"special": false
},
{
"id": 44,
"content": "H",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": true,
"special": false
},
{
"id": 45,
"content": "I",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": true,
"special": false
},
{
"id": 50,
"content": "N",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": true,
"special": false
},
{
"id": 51,
"content": "O",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": true,
"special": false
},
{
"id": 52,
"content": "P",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": true,
"special": false
},
{
"id": 55,
"content": "S",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": true,
"special": false
},
{
"id": 274,
"content": "Cl",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": true,
"special": false
},
{
"id": 284,
"content": "Br",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": true,
"special": false
},
{
"id": 417,
"content": "Si",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": true,
"special": false
},
{
"id": 512,
"content": "Se",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": true,
"special": false
},
{
"id": 513,
"content": "As",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": true,
"special": false
},
{
"id": 514,
"content": "Sn",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": true,
"special": false
}
],
"normalizer": null,
"pre_tokenizer": {
"type": "ByteLevel",
"add_prefix_space": false,
"trim_offsets": true,
"use_regex": true
},
"post_processor": {
"type": "RobertaProcessing",
"sep": [
"</s>",
2
],
"cls": [
"<s>",
0
],
"trim_offsets": true,
"add_prefix_space": false
},
"decoder": {
"type": "ByteLevel",
"add_prefix_space": true,
"trim_offsets": true,
"use_regex": true
},
"model": {
"type": "BPE",
"dropout": null,
"unk_token": null,
"continuing_subword_prefix": "",
"end_of_word_suffix": "",
"fuse_unk": false,
"byte_fallback": false,
"vocab": {
"<s>": 0,
"<pad>": 1,
"</s>": 2,
"<unk>": 3,
"<mask>": 4,
"!": 5,
"\"": 6,
"#": 7,
"$": 8,
"%": 9,
"&": 10,
"'": 11,
"(": 12,
")": 13,
"*": 14,
"+": 15,
",": 16,
"-": 17,
".": 18,
"/": 19,
"0": 20,
"1": 21,
"2": 22,
"3": 23,
"4": 24,
"5": 25,
"6": 26,
"7": 27,
"8": 28,
"9": 29,
":": 30,
";": 31,
"<": 32,
"=": 33,
">": 34,
"?": 35,
"@": 36,
"A": 37,
"B": 38,
"C": 39,
"D": 40,
"E": 41,
"F": 42,
"G": 43,
"H": 44,
"I": 45,
"J": 46,
"K": 47,
"L": 48,
"M": 49,
"N": 50,
"O": 51,
"P": 52,
"Q": 53,
"R": 54,
"S": 55,
"T": 56,
"U": 57,
"V": 58,
"W": 59,
"X": 60,
"Y": 61,
"Z": 62,
"[": 63,
"\\": 64,
"]": 65,
"^": 66,
"_": 67,
"`": 68,
"a": 69,
"b": 70,
"c": 71,
"d": 72,
"e": 73,
"f": 74,
"g": 75,
"h": 76,
"i": 77,
"j": 78,
"k": 79,
"l": 80,
"m": 81,
"n": 82,
"o": 83,
"p": 84,
"q": 85,
"r": 86,
"s": 87,
"t": 88,
"u": 89,
"v": 90,
"w": 91,
"x": 92,
"y": 93,
"z": 94,
"{": 95,
"|": 96,
"}": 97,
"~": 98,
"¡": 99,
"¢": 100,
"£": 101,
"¤": 102,
"¥": 103,
"¦": 104,
"§": 105,
"¨": 106,
"©": 107,
"ª": 108,
"«": 109,
"¬": 110,
"®": 111,
"¯": 112,
"°": 113,
"±": 114,
"²": 115,
"³": 116,
"´": 117,
"µ": 118,
"¶": 119,
"·": 120,
"¸": 121,
"¹": 122,
"º": 123,
"»": 124,
"¼": 125,
"½": 126,
"¾": 127,
"¿": 128,
"À": 129,
"Á": 130,
"Â": 131,
"Ã": 132,
"Ä": 133,
"Å": 134,
"Æ": 135,
"Ç": 136,
"È": 137,
"É": 138,
"Ê": 139,
"Ë": 140,
"Ì": 141,
"Í": 142,
"Î": 143,
"Ï": 144,
"Ð": 145,
"Ñ": 146,
"Ò": 147,
"Ó": 148,
"Ô": 149,
"Õ": 150,
"Ö": 151,
"×": 152,
"Ø": 153,
"Ù": 154,
"Ú": 155,
"Û": 156,
"Ü": 157,
"Ý": 158,
"Þ": 159,
"ß": 160,
"à": 161,
"á": 162,
"â": 163,
"ã": 164,
"ä": 165,
"å": 166,
"æ": 167,
"ç": 168,
"è": 169,
"é": 170,
"ê": 171,
"ë": 172,
"ì": 173,
"í": 174,
"î": 175,
"ï": 176,
"ð": 177,
"ñ": 178,
"ò": 179,
"ó": 180,
"ô": 181,
"õ": 182,
"ö": 183,
"÷": 184,
"ø": 185,
"ù": 186,
"ú": 187,
"û": 188,
"ü": 189,
"ý": 190,
"þ": 191,
"ÿ": 192,
"Ā": 193,
"ā": 194,
"Ă": 195,
"ă": 196,
"Ą": 197,
"ą": 198,
"Ć": 199,
"ć": 200,
"Ĉ": 201,
"ĉ": 202,
"Ċ": 203,
"ċ": 204,
"Č": 205,
"č": 206,
"Ď": 207,
"ď": 208,
"Đ": 209,
"đ": 210,
"Ē": 211,
"ē": 212,
"Ĕ": 213,
"ĕ": 214,
"Ė": 215,
"ė": 216,
"Ę": 217,
"ę": 218,
"Ě": 219,
"ě": 220,
"Ĝ": 221,
"ĝ": 222,
"Ğ": 223,
"ğ": 224,
"Ġ": 225,
"ġ": 226,
"Ģ": 227,
"ģ": 228,
"Ĥ": 229,
"ĥ": 230,
"Ħ": 231,
"ħ": 232,
"Ĩ": 233,
"ĩ": 234,
"Ī": 235,
"ī": 236,
"Ĭ": 237,
"ĭ": 238,
"Į": 239,
"į": 240,
"İ": 241,
"ı": 242,
"IJ": 243,
"ij": 244,
"Ĵ": 245,
"ĵ": 246,
"Ķ": 247,
"ķ": 248,
"ĸ": 249,
"Ĺ": 250,
"ĺ": 251,
"Ļ": 252,
"ļ": 253,
"Ľ": 254,
"ľ": 255,
"Ŀ": 256,
"ŀ": 257,
"Ł": 258,
"ł": 259,
"Ń": 260,
"cc": 261,
"CC": 262,
"(=": 263,
"ccc": 264,
"OC": 265,
"CCCC": 266,
"CCC": 267,
"ccccc": 268,
"NC": 269,
"CO": 270,
")(": 271,
"Cc": 272,
"nc": 273,
"Cl": 274,
"Nc": 275,
"nH": 276,
"12": 277,
"cccc": 278,
"COc": 279,
"OCC": 280,
"(-": 281,
")=": 282,
"CCCCCCCC": 283,
"Br": 284,
"COC": 285,
"+]": 286,
")[": 287,
"CCCCC": 288,
"-]": 289,
"Oc": 290,
"oc": 291,
"CCN": 292,
"CN": 293,
")(=": 294,
"-])": 295,
"21": 296,
"([": 297,
"CCc": 298,
"+](=": 299,
"cn": 300,
"CS": 301,
"23": 302,
"CCCN": 303,
"CCCCCC": 304,
"ncc": 305,
"CCOC": 306,
"cnc": 307,
"NCC": 308,
"sc": 309,
"CNC": 310,
"CCCCN": 311,
"nn": 312,
"NCc": 313,
"CCCCCCCCC": 314,
"Cn": 315,
"COP": 316,
"OP": 317,
"CCCCCCC": 318,
"NCCCC": 319,
"cccnc": 320,
"OCc": 321,
"ncnc": 322,
"CCNCC": 323,
"+]([": 324,
"NS": 325,
"CCOCC": 326,
"+](": 327,
"CCl": 328,
"32": 329,
"nnc": 330,
"OCO": 331,
"Clc": 332,
"34": 333,
"CCO": 334,
"43": 335,
"CCNC": 336,
"=[": 337,
"NCCCCC": 338,
"SC": 339,
"CCOc": 340,
"ccnc": 341,
"CSc": 342,
"ccncc": 343,
"Sc": 344,
"ccccn": 345,
"CCCCCCCCCCCCCCCC": 346,
"Fc": 347,
"NNC": 348,
"NN": 349,
"CCCc": 350,
"NCCc": 351,
"cccs": 352,
"13": 353,
"SCC": 354,
"ncn": 355,
"Brc": 356,
"CSCCC": 357,
"csc": 358,
"CNc": 359,
"coc": 360,
"cs": 361,
"CCn": 362,
"NCCC": 363,
"ccoc": 364,
"ccco": 365,
")([": 366,
"ncccc": 367,
"ccn": 368,
"OCCN": 369,
"cnn": 370,
"CCCCCCCCCCCCCC": 371,
"CCCCCn": 372,
"nccc": 373,
"FC": 374,
"CCOP": 375,
"CCCNC": 376,
"CCCCCCCCCCCCCCC": 377,
"OCCc": 378,
"NO": 379,
"no": 380,
"NCCO": 381,
"NCCN": 382,
"OCCOCC": 383,
"45": 384,
"CCS": 385,
"CCCl": 386,
"CCCCCCCCCCCC": 387,
"CCNc": 388,
"CSC": 389,
"CCCCCCCCCC": 390,
"cncc": 391,
"NNc": 392,
"OCCO": 393,
"OS": 394,
"CNCc": 395,
"CCCCCCCCCCCCCCCCC": 396,
"CCCCc": 397,
"CCCCCCCCCCC": 398,
"31": 399,
"SCc": 400,
"CCCO": 401,
"COCC": 402,
"CNCC": 403,
"CCCn": 404,
"occc": 405,
"noc": 406,
"cncn": 407,
"CBr": 408,
"CCCCCF": 409,
"OCCC": 410,
")-": 411,
"CCCCn": 412,
"nccs": 413,
"56": 414,
"ccsc": 415,
"CCCCCCCCCCCCC": 416,
"Si": 417,
"sccc": 418,
"CCSc": 419,
"CNS": 420,
"NCCS": 421,
"ON": 422,
"occ": 423,
"CCSC": 424,
"](": 425,
"ncccn": 426,
"OCCCC": 427,
"CCNCc": 428,
"CCCCCCCCCCCCCCCCCC": 429,
"CCNS": 430,
"cnccc": 431,
"54": 432,
"Nn": 433,
"CCCOc": 434,
"nccn": 435,
"CP": 436,
"SCCNC": 437,
"CCCCNC": 438,
"nccnc": 439,
"NCCCN": 440,
"nonc": 441,
"14": 442,
"on": 443,
"cnccn": 444,
"CSCC": 445,
"cccn": 446,
"24": 447,
"ns": 448,
"CCCOC": 449,
"CNCCc": 450,
"On": 451,
"OCCCN": 452,
"CCCCOC": 453,
"CCCCCO": 454,
"CCCNCC": 455,
"OO": 456,
"CCCCCc": 457,
"CCCCCN": 458,
"nnnn": 459,
"35": 460,
"CCCCOCC": 461,
"CCCS": 462,
"CON": 463,
"ClCc": 464,
"nnn": 465,
"onc": 466,
"CCCCCCCCc": 467,
"NCCNC": 468,
"Ic": 469,
"NOCC": 470,
"CCCCCNC": 471,
"CNCCC": 472,
"nsnc": 473,
"65": 474,
"ccnn": 475,
"CNCCN": 476,
"cnnc": 477,
"CCCNS": 478,
"CCCCCCCCCCCCCCCCOCC": 479,
"CCSCC": 480,
"CCCCCCc": 481,
"OCCOc": 482,
"COCc": 483,
"nnnc": 484,
"OCCOCCOCCOCC": 485,
"NCCCCCC": 486,
"scc": 487,
"CCCCOc": 488,
"cncnc": 489,
"46": 490,
"NCCCCN": 491,
"NCCCc": 492,
"SSc": 493,
"CCCCNc": 494,
"CCCNc": 495,
"-][": 496,
"67": 497,
"SCCC": 498,
"SSC": 499,
"OCCOC": 500,
"341": 501,
"SP": 502,
"CCP": 503,
"OCOC": 504,
"COCCOCC": 505,
"ncncc": 506,
"CCCCCCO": 507,
"nncs": 508,
"NCCCn": 509,
"NOS": 510,
"10": 511
},
"merges": [
"c c",
"C C",
"( =",
"cc c",
"O C",
"CC CC",
"CC C",
"cc ccc",
"N C",
"C O",
") (",
"C c",
"n c",
"C l",
"N c",
"n H",
"1 2",
"cc cc",
"CO c",
"O CC",
"( -",
") =",
"CCCC CCCC",
"B r",
"C OC",
"+ ]",
") [",
"CCCC C",
"- ]",
"O c",
"o c",
"CC N",
"C N",
") (=",
"-] )",
"2 1",
"( [",
"CC c",
"+] (=",
"c n",
"C S",
"2 3",
"CCC N",
"CCCC CC",
"n cc",
"CC OC",
"c nc",
"N CC",
"s c",
"C NC",
"CCCC N",
"n n",
"NC c",
"CCCCCCCC C",
"C n",
"CO P",
"O P",
"CCCC CCC",
"N CCCC",
"ccc nc",
"OC c",
"nc nc",
"CCN CC",
"+] ([",
"N S",
"CC OCC",
"+] (",
"CC l",
"3 2",
"n nc",
"OC O",
"Cl c",
"3 4",
"CC O",
"4 3",
"CC NC",
"= [",
"N CCCCC",
"S C",
"CC Oc",
"cc nc",
"CS c",
"cc ncc",
"S c",
"cccc n",
"CCCCCCCC CCCCCCCC",
"F c",
"N NC",
"N N",
"CCC c",
"N CCc",
"ccc s",
"1 3",
"S CC",
"nc n",
"Br c",
"CS CCC",
"c sc",
"C Nc",
"c oc",
"c s",
"CC n",
"N CCC",
"cc oc",
"ccc o",
")( [",
"n cccc",
"cc n",
"OCC N",
"cn n",
"CCCCCCCC CCCCCC",
"CCCCC n",
"n ccc",
"F C",
"CC OP",
"CCC NC",
"CCCCCCCC CCCCCCC",
"OCC c",
"N O",
"n o",
"NCC O",
"N CCN",
"OCC OCC",
"4 5",
"CC S",
"CCC l",
"CCCCCCCC CCCC",
"CC Nc",
"CS C",
"CCCCCCCC CC",
"cn cc",
"N Nc",
"OCC O",
"O S",
"CNC c",
"CCCCCCCC CCCCCCCCC",
"CCCC c",
"CCCCCCCC CCC",
"3 1",
"S Cc",
"CCC O",
"CO CC",
"CN CC",
"CCC n",
"o ccc",
"n oc",
"cnc n",
"C Br",
"CCCCC F",
"O CCC",
") -",
"CCCC n",
"ncc s",
"5 6",
"cc sc",
"CCCCCCCC CCCCC",
"S i",
"s ccc",
"CC Sc",
"CN S",
"NCC S",
"O N",
"o cc",
"CC SC",
"] (",
"nccc n",
"O CCCC",
"CC NCc",
"CCCCCCCCCCCCCCCC CC",
"CCN S",
"cn ccc",
"5 4",
"N n",
"CCC Oc",
"ncc n",
"C P",
"S CCNC",
"CCCC NC",
"ncc nc",
"N CCCN",
"no nc",
"1 4",
"o n",
"cn ccn",
"CS CC",
"ccc n",
"2 4",
"n s",
"CCC OC",
"CN CCc",
"O n",
"O CCCN",
"CCCC OC",
"CCCC CO",
"CCCN CC",
"O O",
"CCCC Cc",
"CCCCC N",
"nn nn",
"3 5",
"CCCC OCC",
"CCC S",
"CO N",
"Cl Cc",
"nn n",
"o nc",
"CCCCCCCC c",
"NCC NC",
"I c",
"N OCC",
"CCCCC NC",
"CN CCC",
"ns nc",
"6 5",
"cc nn",
"CN CCN",
"cn nc",
"CCCN S",
"CCCCCCCCCCCCCCCC OCC",
"CC SCC",
"CCCC CCc",
"OCC Oc",
"COC c",
"nn nc",
"OCCOCC OCCOCC",
"N CCCCCC",
"s cc",
"CCCC Oc",
"cnc nc",
"4 6",
"N CCCCN",
"N CCCc",
"S Sc",
"CCCC Nc",
"CCC Nc",
"-] [",
"6 7",
"S CCC",
"S SC",
"OCC OC",
"34 1",
"S P",
"CC P",
"OC OC",
"CO CCOCC",
"nc ncc",
"CCCCCC O",
"nnc s",
"NCCC n",
"NO S",
"1 0"
]
}
}