File size: 965 Bytes
891e224
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
{
  "ape_source": "modernmolbert.local",
  "created_at_utc": "2026-05-19T16:24:35.971419+00:00",
  "creation_command": "python -m modernmolbert.train_ape_tokenizer",
  "dataset_name": "data/pretrain/chembl36_selfies",
  "extra_vocab_selfies_path": null,
  "extra_vocab_symbols_added": 42,
  "extra_vocab_symbols_path": "tokenizer/extra_symbols/benchmark_missing_selfies_symbols_min10.txt",
  "extra_vocab_symbols_requested": 42,
  "max_merge_pieces": 2,
  "max_vocab_size": 2000,
  "min_freq_for_merge": 3000,
  "representation": "SELFIES",
  "seed": 42,
  "selfies_column": "selfies",
  "shuffle_buffer_size": 100000,
  "special_ids": {
    "bos_token": 0,
    "eos_token": 2,
    "mask_token": 4,
    "pad_token": 1,
    "unk_token": 3
  },
  "tokenizer_path": "tokenizer/chembl36_selfies_2m_ape_max2_min3000.json",
  "tokenizer_sha256": "26ad0e90de9c0a469eb6e3b7aa985e2a2d7cc3f0dd7b2b229ccfce41f639e208",
  "tokenizer_train_size": 2000000,
  "vocab_size": 631
}