PeptideMLM_base / tokenizer.json
aaronfeller's picture
Upload tokenizer
ee2b9b6 verified
{
"version": "1.0",
"truncation": null,
"padding": null,
"added_tokens": [
{
"id": 0,
"content": "[PAD]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 1,
"content": "[UNK]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 2,
"content": "[CLS]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 3,
"content": "[SEP]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 4,
"content": "[MASK]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
}
],
"normalizer": null,
"pre_tokenizer": {
"type": "Split",
"pattern": {
"Regex": "(\\[[^\\]]+]|C\\(=N\\)N|CCC\\(C\\)|\\(CCCN\\)|NC\\(=O\\)|C\\(C\\)=O|=C\\(N\\)N|N=C\\(N\\)|NC\\(=N\\)|C\\(=O\\)C|CS\\(=O\\)|OC\\(=O\\)|C\\(=O\\)c|c\\(=O\\)n|C\\(=O\\)O|C\\(N\\)=O|cc\\(Br\\)|CC\\(=O\\)|C\\(=O\\)N|ccc\\(C\\)|ccc\\(F\\)|c\\(=O\\)|C\\(=N\\)|c\\(O\\)c|NC\\(C\\)|n\\(C\\)c|CC\\(O\\)|cc\\(N\\)|CC\\(C\\)|cc\\(C\\)|C\\(=O\\)|cc\\(O\\)|c\\(N\\)c|c\\(Cl\\)|C\\(N\\)N|N\\(C\\)C|NC\\(N\\)|=C\\(N\\)|C\\(O\\)C|c\\(OC\\)|\\(C#N\\)|C\\(C\\)C|CC\\(N\\)|C\\(C\\)N|c\\(CO\\)|c\\(Br\\)|\\(CCO\\)|C\\(CC\\)|S\\(=O\\)|c\\(C\\)c|\\(=N\\)|c\\(O\\)|\\(Br\\)|\\(CS\\)|c\\(C\\)|\\(CC\\)|c\\(I\\)|C\\(C\\)|N\\(C\\)|C\\(O\\)|C\\(I\\)|C\\(F\\)|\\(Cl\\)|n\\(C\\)|\\(OC\\)|\\(=O\\)|c\\(F\\)|CCCN\\)|\\(=S\\)|c\\(N\\)|\\(CO\\)|C\\(N\\)|\\(C\\)|ccccc|\\(S\\)|\\(F\\)|\\(O\\)|C#N\\)|CCO\\)|\\(N\\)|C\\(=N|\\(I\\)|CSSC|=N\\)|CC=O|CCCO|Cl\\)|CCNO|=O\\)|CCSC|\\(=N|CO\\)|CCNC|CCCC|=S\\)|CN=C|CCCS|cccc|CCCN|Br\\)|cccn|CS\\)|C=CC|OC\\)|CC=C|cnn|=NC|COC|OCC|\\(O|CCS|CNc|#Cc|=CC|ccn|C=C|CSc|ccc|NCc|CCO|N=C|cnc|I\\)|CCc|OCc|CCl|ccs|COc|CCn|CSC|SCC|NCC|CCN|CNC|C#C|C=O|CNO|CCC|SSC|C#N|O=C|NOC|S\\)|csc|ncc|C\\)|N\\)|\\(C|ncn|F\\)|O\\)|N#C|nnc|CSS|cco|Cl|NC|nc|co|CS|CO|no|cc|CN|cn|SS|OC|\\)|SN|nn|CC|#C|NO|=S|NS|cs|=C|Oc|=O|oc|Nc|Cc|=N|NN|C=|C#|\\(|SC|sc|Br|N#|#N|p|O|I|N|C|s|=|c|B|S|F|n|P|#|o)"
},
"behavior": "Isolated",
"invert": false
},
"post_processor": {
"type": "TemplateProcessing",
"single": [
{
"SpecialToken": {
"id": "[CLS]",
"type_id": 0
}
},
{
"Sequence": {
"id": "A",
"type_id": 0
}
},
{
"SpecialToken": {
"id": "[SEP]",
"type_id": 0
}
}
],
"pair": [
{
"SpecialToken": {
"id": "[CLS]",
"type_id": 0
}
},
{
"Sequence": {
"id": "A",
"type_id": 0
}
},
{
"SpecialToken": {
"id": "[SEP]",
"type_id": 0
}
},
{
"Sequence": {
"id": "B",
"type_id": 1
}
},
{
"SpecialToken": {
"id": "[SEP]",
"type_id": 1
}
}
],
"special_tokens": {
"[CLS]": {
"id": "[CLS]",
"ids": [
2
],
"tokens": [
"[CLS]"
]
},
"[SEP]": {
"id": "[SEP]",
"ids": [
3
],
"tokens": [
"[SEP]"
]
}
}
},
"decoder": null,
"model": {
"type": "WordPiece",
"unk_token": "[UNK]",
"continuing_subword_prefix": "##",
"max_input_chars_per_word": 100,
"vocab": {
"[PAD]": 0,
"[UNK]": 1,
"[CLS]": 2,
"[SEP]": 3,
"[MASK]": 4,
":": 5,
"%11": 6,
"-": 7,
"[As]": 8,
"[pH]": 9,
"[Po]": 10,
"[Ra]": 11,
"[3H]": 12,
"[S-]": 13,
"8": 14,
"%21": 15,
"[CH-]": 16,
"[IH]": 17,
"P": 18,
"[SeH]": 19,
"[O]": 20,
"4": 21,
"/": 22,
"[N-]": 23,
"[129Xe]": 24,
"[Cl+3]": 25,
"3": 26,
"[C@@]": 27,
"[11CH3]": 28,
"[13C]": 29,
"[Sn+]": 30,
"[P@@]": 31,
"[Ge]": 32,
"[BH3-]": 33,
"[123I]": 34,
"[14CH2]": 35,
"[Al-]": 36,
"[Si]": 37,
"[S@]": 38,
"[W]": 39,
"=": 40,
"%19": 41,
"Cl": 42,
"[Cl+2]": 43,
"%14": 44,
"[Al]": 45,
"9": 46,
"[B-]": 47,
"[Cl+]": 48,
"[TlH2]": 49,
"[NH2+]": 50,
"[11CH]": 51,
"[SnH]": 52,
"[SiH3]": 53,
"[Sn]": 54,
"[11C]": 55,
"S": 56,
"[SiH2]": 57,
"%18": 58,
"[BH-]": 59,
"[Ru]": 60,
"%10": 61,
"[V]": 62,
"[o+]": 63,
"[O+]": 64,
"c": 65,
"[I-]": 66,
"[C@@H]": 67,
"n": 68,
"2": 69,
"[Se-]": 70,
"[N+]": 71,
"N": 72,
"s": 73,
"[PH+]": 74,
"[C@]": 75,
"[N@]": 76,
"[C+]": 77,
"[s+]": 78,
"[N@+]": 79,
"[125I]": 80,
"[cH-]": 81,
"[Th]": 82,
"C": 83,
"[Sb]": 84,
"5": 85,
"[c-]": 86,
"#": 87,
"[Ca]": 88,
"%16": 89,
"[Tl]": 90,
"[18F]": 91,
"[223Ra]": 92,
"[BH2-]": 93,
"[O-]": 94,
"[Bi]": 95,
"[te]": 96,
"Br": 97,
"[Cr]": 98,
"[N@@]": 99,
"[Hg]": 100,
"[S@+]": 101,
"\\": 102,
"[n+]": 103,
"%15": 104,
"[123Te]": 105,
"[C-]": 106,
"1": 107,
"[NH+]": 108,
"[I+]": 109,
"[CH]": 110,
"%13": 111,
"[Pb]": 112,
"[14C]": 113,
"[2H]": 114,
"[P@]": 115,
"[OH+]": 116,
")": 117,
"[Tc]": 118,
"[se+]": 119,
"[NH-]": 120,
"[nH]": 121,
"B": 122,
"[CH2]": 123,
"[P+]": 124,
"[se]": 125,
"[In]": 126,
"[Te]": 127,
"[Se+]": 128,
"%12": 129,
"[S+]": 130,
"o": 131,
"[C]": 132,
"[N@@+]": 133,
"[n-]": 134,
"6": 135,
"[S@@]": 136,
"[nH+]": 137,
"[Si+]": 138,
"[PH]": 139,
"[Hg+]": 140,
"[C@H]": 141,
"[Ga]": 142,
"[S@@+]": 143,
"[NH3+]": 144,
"[SiH]": 145,
"[11c]": 146,
"%20": 147,
"%17": 148,
"(": 149,
"O": 150,
"[IH2]": 151,
"[As+]": 152,
"F": 153,
"[CH2-]": 154,
"[Se]": 155,
"[c+]": 156,
"%23": 157,
"[SH]": 158,
"I": 159,
"7": 160,
"%22": 161,
"[Os]": 162,
"[OH]": 163,
"p": 164,
"[P@+]": 165,
"[Ag+]": 166,
"[Ag-4]": 167,
"[Ag]": 168,
"[Al-3]": 169,
"[AsH3]": 170,
"[AsH]": 171,
"[At]": 172,
"[B@-]": 173,
"[B@@-]": 174,
"[B]": 175,
"[Ba]": 176,
"[Br+2]": 177,
"[BrH]": 178,
"[Br]": 179,
"[CH3]": 180,
"[CaH2]": 181,
"[Cs]": 182,
"[FH]": 183,
"[F]": 184,
"[H]": 185,
"[He]": 186,
"[I+2]": 187,
"[I+3]": 188,
"[I]": 189,
"[K]": 190,
"[Kr]": 191,
"[Li+]": 192,
"[LiH]": 193,
"[MgH2]": 194,
"[Mg]": 195,
"[NH3]": 196,
"[N]": 197,
"[Na]": 198,
"[OH2]": 199,
"[P@@+]": 200,
"[PH2]": 201,
"[P]": 202,
"[Rb]": 203,
"[SH+]": 204,
"[SH2]": 205,
"[S]": 206,
"[Se-2]": 207,
"[SeH2]": 208,
"[Si@]": 209,
"[SrH2]": 210,
"[TeH]": 211,
"[Xe]": 212,
"[Zn+2]": 213,
"[Zn-2]": 214,
"[Zn]": 215,
"[n]": 216,
"[te+]": 217,
"=O": 218,
"CC": 219,
"NC": 220,
"CO": 221,
"cc": 222,
"CCC": 223,
"CCCC": 224,
"ccc": 225,
"CCN": 226,
"CCCN": 227,
"CN": 228,
"CNC": 229,
"cccc": 230,
"ccccc": 231,
"N)": 232,
"(N)": 233,
"=O)": 234,
"(=O)": 235,
"C(=O)": 236,
"C(=O)N": 237,
"O)": 238,
"(C": 239,
"(C)": 240,
"C(C)": 241,
"C(C)C": 242,
"CC(=O)": 243,
"C(=O)O": 244,
"C(=O)C": 245,
"C(N)": 246,
"CC(N)": 247,
"C(N)=O": 248,
"CO)": 249,
"(CO)": 250,
"CC(C)": 251,
"CS": 252,
"=N": 253,
"CCNC": 254,
"NC(=O)": 255,
"=N)": 256,
"(=N)": 257,
"C(=N)": 258,
"CC=O": 259,
"CCCN)": 260,
"(CCCN)": 261,
"NC(=N)": 262,
"Br)": 263,
"(Br)": 264,
"F)": 265,
"(F)": 266,
"S)": 267,
"(S)": 268,
"C)": 269,
"(O)": 270,
"CCS": 271,
"CCCS": 272,
"CCSC": 273,
"cn": 274,
"ccn": 275,
"cccn": 276,
"CSC": 277,
"=C": 278,
"CCO": 279,
"(O": 280,
"(=N": 281,
"C(=N": 282,
"c(O)": 283,
"OC": 284,
"SCC": 285,
"ccc(F)": 286,
"S(=O)": 287,
"O=C": 288,
"CCc": 289,
"OC(=O)": 290,
"C#": 291,
"Cc": 292,
"C=C": 293,
"C=": 294,
"#N": 295,
"C#N": 296,
"ccs": 297,
"NO": 298,
"C(O)": 299,
"csc": 300,
"ccc(C)": 301,
"cc(Br)": 302,
"ncn": 303,
"CCNO": 304,
"CCCO": 305,
"CSS": 306,
"CSSC": 307,
"=CC": 308,
"I)": 309,
"(I)": 310,
"CNO": 311,
"N(C)": 312,
"N(C)C": 313,
"C(N)N": 314,
"NOC": 315,
"C(C)=O": 316,
"#C": 317,
"cco": 318,
"NS": 319,
"SN": 320,
"c(=O)n": 321,
"=S)": 322,
"(=S)": 323,
"c(N)c": 324,
"N=C": 325,
"SC": 326,
"SSC": 327,
"CCC(C)": 328,
"c(=O)": 329,
"C#N)": 330,
"(C#N)": 331,
"SS": 332,
"=S": 333,
"oc": 334,
"co": 335,
"no": 336,
"N#": 337,
"N#C": 338,
"nc": 339,
"sc": 340,
"C(=N)N": 341,
"C=O": 342,
"c(F)": 343,
"C(F)": 344,
"c(I)": 345,
"C(I)": 346,
"cnn": 347,
"cc(N)": 348,
"NC(N)": 349,
"OC)": 350,
"(OC)": 351,
"c(OC)": 352,
"c(Br)": 353,
"c(N)": 354,
"cc(O)": 355,
"CS)": 356,
"(CS)": 357,
"Oc": 358,
"cnc": 359,
"Cl)": 360,
"(Cl)": 361,
"c(Cl)": 362,
"c(O)c": 363,
"NCC": 364,
"COC": 365,
"OCC": 366,
"Nc": 367,
"ncc": 368,
"cc(C)": 369,
"nn": 370,
"cs": 371,
"c(C)c": 372,
"COc": 373,
"C(=O)c": 374,
"c(C)": 375,
"(CC)": 376,
"NCc": 377,
"nnc": 378,
"C(O)C": 379,
"=C(N)": 380,
"C=CC": 381,
"=C(N)N": 382,
"N=C(N)": 383,
"OCc": 384,
"CC=C": 385,
"CCl": 386,
"CCn": 387,
"CNc": 388,
"CC(O)": 389,
"NN": 390,
"CSc": 391,
"NC(C)": 392,
"CS(=O)": 393,
"C(CC)": 394,
"C#C": 395,
"C(C)N": 396,
"CCO)": 397,
"(CCO)": 398,
"CN=C": 399,
"n(C)": 400,
"n(C)c": 401,
"c(CO)": 402,
"#Cc": 403,
"=NC": 404
}
}
}