ProtoBind-Diff / tokenizer_smiles_diffusion.json
manuylo's picture
Upload dataset, tokenizer, categorical_mappings
a22a49d verified
raw
history blame contribute delete
974 Bytes
{
"properties": {
"regex": "(\\[[^\\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\\(|\\)|\\.|=|#|-|\\+|\\\\|\\/|:|~|@|\\?|>|\\*|\\$|\\%[0-9]{2}|[0-9]|.)",
"special_tokens": {
"start": "^",
"end": "&",
"pad": "<PAD>",
"unknown": "?",
"mask": "<MASK>",
"sep": "<SEP>"
},
"chem_start_idx": 6
},
"vocabulary": [
"<PAD>",
"?",
"^",
"&",
"<MASK>",
"<SEP>",
"O",
"=",
"C",
"1",
"N",
"(",
"/",
"c",
"2",
"n",
"[nH]",
")",
"[C@H]",
"[C@@H]",
"3",
"Br",
"F",
"S",
"Cl",
"\\",
"[N+]",
"[O-]",
"#",
"4",
"s",
"-",
"o",
"5",
"[C@@]",
"[C@]",
"6",
"P",
"7"
]
}