ApeTokenizer-SMILES / metadata.json
jsture's picture
Add ChEMBL36 APE SMILES tokenizer max6 mf3000
01bd7f8 verified
Raw
History Blame Contribute Delete
912 Bytes
{
"ape_source": "modernmolbert.local",
"created_at_utc": "2026-05-22T04:05:53.431998+00:00",
"creation_command": "python -m modernmolbert.train_ape_tokenizer",
"dataset_name": "data/pretrain/chembl36_selfies",
"extra_vocab_selfies_path": null,
"extra_vocab_symbols_added": 0,
"extra_vocab_symbols_path": null,
"extra_vocab_symbols_requested": 0,
"max_merge_pieces": 6,
"max_vocab_size": 2000,
"min_freq_for_merge": 3000,
"molecule_column": "smiles_canonical_clean",
"representation": "SMILES",
"seed": 42,
"shuffle_buffer_size": 100000,
"special_ids": {
"bos_token": 0,
"eos_token": 2,
"mask_token": 4,
"pad_token": 1,
"unk_token": 3
},
"tokenizer_path": "tokenizer/chembl36_smiles_2m_ape_max6_mf3000.json",
"tokenizer_sha256": "faf7748e8959b252c9d0ad83c2228df37a45dc9a68c15ead1ced2942cc8f155e",
"tokenizer_train_size": 2000000,
"vocab_size": 1386
}