jsture committed on
Commit
01bd7f8
·
verified ·
1 Parent(s): 9a67064

Add ChEMBL36 APE SMILES tokenizer max6 mf3000

Browse files
Files changed (5) hide show
  1. metadata.json +28 -0
  2. special_tokens_map.json +7 -0
  3. tokenization_ape.py +709 -0
  4. tokenizer_config.json +58 -0
  5. vocab.json +1388 -0
metadata.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "ape_source": "modernmolbert.local",
3
+ "created_at_utc": "2026-05-22T04:05:53.431998+00:00",
4
+ "creation_command": "python -m modernmolbert.train_ape_tokenizer",
5
+ "dataset_name": "data/pretrain/chembl36_selfies",
6
+ "extra_vocab_selfies_path": null,
7
+ "extra_vocab_symbols_added": 0,
8
+ "extra_vocab_symbols_path": null,
9
+ "extra_vocab_symbols_requested": 0,
10
+ "max_merge_pieces": 6,
11
+ "max_vocab_size": 2000,
12
+ "min_freq_for_merge": 3000,
13
+ "molecule_column": "smiles_canonical_clean",
14
+ "representation": "SMILES",
15
+ "seed": 42,
16
+ "shuffle_buffer_size": 100000,
17
+ "special_ids": {
18
+ "bos_token": 0,
19
+ "eos_token": 2,
20
+ "mask_token": 4,
21
+ "pad_token": 1,
22
+ "unk_token": 3
23
+ },
24
+ "tokenizer_path": "tokenizer/chembl36_smiles_2m_ape_max6_mf3000.json",
25
+ "tokenizer_sha256": "faf7748e8959b252c9d0ad83c2228df37a45dc9a68c15ead1ced2942cc8f155e",
26
+ "tokenizer_train_size": 2000000,
27
+ "vocab_size": 1386
28
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<s>",
3
+ "eos_token": "</s>",
4
+ "unk_token": "<unk>",
5
+ "pad_token": "<pad>",
6
+ "mask_token": "<mask>"
7
+ }
tokenization_ape.py ADDED
@@ -0,0 +1,709 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Hugging Face-compatible tokenizer for APE molecular vocabularies.
2
+
3
+ This file is intentionally self-contained so it can be copied into a model repo
4
+ and loaded by ``AutoTokenizer.from_pretrained(..., trust_remote_code=True)``.
5
+ """
6
+
7
+ import json
8
+ import os
9
+ import re
10
+ from collections.abc import Mapping
11
+ from collections import defaultdict
12
+ from pathlib import Path
13
+ from typing import Any, Literal
14
+
15
+ from transformers import PreTrainedTokenizer
16
+
17
+
18
# The two molecular string formats this module knows how to pre-tokenize.
Representation = Literal["SELFIES", "SMILES"]

# Logical vocab-file keys mapped to the file names looked up in a tokenizer
# directory. The representation-specific entries take precedence over the
# generic "vocab_file" when the matching representation is active.
VOCAB_FILES_NAMES = {
    "vocab_file": "vocab.json",
    "selfies_vocab_file": "selfies_vocab.json",
    "smiles_vocab_file": "smiles_vocab.json",
}

# One SELFIES symbol is a single bracketed group, e.g. "[C]" or "[=Branch1]".
SELFIES_RE = re.compile(r"\[[^\]]+\]")

# Primitive SMILES pieces. Alternation is ordered, so every two-letter element
# symbol must be listed before the one-letter symbols: the earlier form
# ("Si?" before "Se?", "Cl?" before "Ca?") let "Si?"/"Cl?" match a lone "S"/"C"
# first, which made "Se" and "Ca" unreachable as single pieces.
# Characters that match nothing here are not lost: pre_tokenize_molecule emits
# them one character at a time.
SMILES_RE = re.compile(
    r"(\[[^\]]+\]|"
    r"Br|Cl|Si|Se|Li|Na|Mg|Al|Ca|Fe|Zn|"
    r"N|O|S|P|F|I|K|B|C|H|"
    r"b|c|n|o|s|p|"
    r"\%\d{2}|\d|"
    r"\(|\)|\.|=|#|-|\+|\\|/|:|~|@|\?|\*|\$)"
)
33
+
34
+
35
def _base_piece_count(token: str, representation: str) -> int:
    """Return how many primitive pieces ``token`` splits into, never less than 1.

    The token is split with ``pre_tokenize_molecule`` for the given
    representation; a token that yields no pieces still counts as one.
    """
    piece_total = len(pre_tokenize_molecule(token, representation))
    return piece_total if piece_total > 1 else 1
39
+
40
+
41
def _max_vocab_piece_span(vocab: dict[str, int], representation: str) -> int:
    """Return the largest primitive-piece count of any non-special vocab token.

    Tokens that both start with ``<`` and end with ``>`` are treated as special
    tokens and ignored. The result is at least 1, even for an empty vocabulary.
    """
    spans = [
        _base_piece_count(entry, representation)
        for entry in vocab
        if not (entry.startswith("<") and entry.endswith(">"))
    ]
    # Seed with 1 so an empty or all-special vocabulary still yields 1.
    return max([1, *spans])
49
+
50
+
51
+ def _coerce_vocab(vocab: Mapping[str, Any]) -> dict[str, int]:
52
+ if not isinstance(vocab, Mapping):
53
+ raise ValueError("Vocabulary must be a JSON object mapping token strings to integer IDs.")
54
+ out = {str(token): int(idx) for token, idx in vocab.items()}
55
+ if len(set(out.values())) != len(out):
56
+ raise ValueError("Vocabulary token IDs must be unique.")
57
+ return out
58
+
59
+
60
+ def _token_text(token: Any) -> str:
61
+ return str(getattr(token, "content", token))
62
+
63
+
64
def _normalize_representation(representation: str) -> Representation:
    """Upper-case ``representation`` and check it names a supported format.

    Raises:
        ValueError: if the upper-cased value is neither ``"SELFIES"`` nor
            ``"SMILES"``.
    """
    upper_name = representation.upper()
    if upper_name == "SELFIES" or upper_name == "SMILES":
        return upper_name  # type: ignore[return-value]
    raise ValueError(f"representation must be 'SELFIES' or 'SMILES', got {representation!r}")
69
+
70
+
71
def _select_vocab_file(
    *,
    representation: Representation,
    vocab_file: str | os.PathLike[str] | None,
    selfies_vocab_file: str | os.PathLike[str] | None,
    smiles_vocab_file: str | os.PathLike[str] | None,
) -> str | os.PathLike[str] | None:
    """Pick the vocabulary file to load for ``representation``.

    The representation-specific file wins when it was supplied; otherwise the
    generic ``vocab_file`` (which may itself be ``None``) is returned.
    """
    dedicated_by_representation = {
        "SELFIES": selfies_vocab_file,
        "SMILES": smiles_vocab_file,
    }
    dedicated = dedicated_by_representation.get(representation)
    return vocab_file if dedicated is None else dedicated
83
+
84
+
85
def pre_tokenize_molecule(molecule: str, representation: str) -> list[str]:
    """Split a molecule string into its primitive pieces.

    For SELFIES, the pieces are the bracketed groups matched by ``SELFIES_RE``;
    any text outside brackets is dropped. For SMILES, the pieces are the
    ``SMILES_RE`` matches in order, with every unmatched character kept as its
    own one-character piece; empty and whitespace-only pieces are removed.

    Raises:
        ValueError: if ``representation`` is not a supported format.
    """
    if _normalize_representation(representation) == "SELFIES":
        return SELFIES_RE.findall(molecule)

    pieces: list[str] = []
    consumed = 0
    for hit in SMILES_RE.finditer(molecule):
        begin, finish = hit.span()
        # Extending a list with a string adds its characters one by one, so
        # text the pattern skipped over is preserved character-wise.
        pieces.extend(molecule[consumed:begin])
        pieces.append(hit.group(0))
        consumed = finish
    pieces.extend(molecule[consumed:])

    return [piece for piece in pieces if piece and not piece.isspace()]
100
+
101
+
102
def ape_tokenize(
    text: str,
    vocab: dict[str, int],
    representation: str,
    unk_token: str = "<unk>",
    max_piece_span: int | None = None,
) -> list[str]:
    """Tokenize ``text`` by greedy longest-match against ``vocab``.

    The text is first split into primitive pieces. At each position the longest
    run of at most ``max_piece_span`` consecutive pieces whose concatenation is
    a vocab entry is emitted; if no run matches, ``unk_token`` is emitted and
    the position advances by one piece.

    Args:
        text: Molecule string to tokenize.
        vocab: Token-to-id mapping; only key membership is used.
        representation: ``"SELFIES"`` or ``"SMILES"`` (case-insensitive).
        unk_token: Token emitted for unmatched pieces and for empty input.
        max_piece_span: Longest run of pieces to try. When ``None`` it is
            derived from ``vocab`` on every call.

    Returns:
        The token strings; ``[unk_token]`` when ``text`` yields no pieces.
    """
    pieces = pre_tokenize_molecule(text, representation)
    if not pieces:
        return [unk_token]

    span_limit = (
        _max_vocab_piece_span(vocab, representation)
        if max_piece_span is None
        else max_piece_span
    )

    total = len(pieces)
    output: list[str] = []
    start = 0
    while start < total:
        # Shrink the window from the longest allowed run down to one piece.
        stop = min(total, start + span_limit)
        while stop > start:
            candidate = "".join(pieces[start:stop])
            if candidate in vocab:
                break
            stop -= 1

        if stop > start:
            output.append(candidate)
            start = stop
        else:
            output.append(unk_token)
            start += 1

    return output
137
+
138
+
139
class APEPreTrainedTokenizer(PreTrainedTokenizer):
    """Hugging Face tokenizer backend for APE molecular tokenization. (Not fast)

    The tokenizer holds a token-to-id vocabulary and tokenizes SELFIES or
    SMILES strings by greedy longest-match over primitive molecular pieces
    (see ``ape_tokenize``). It can also train a vocabulary from a corpus by
    repeatedly merging the most frequent adjacent pair of tokens.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file: str | os.PathLike[str] | None = None,
        selfies_vocab_file: str | os.PathLike[str] | None = None,
        smiles_vocab_file: str | os.PathLike[str] | None = None,
        vocab: dict[str, Any] | None = None,
        representation: str = "SELFIES",
        bos_token: str = "<s>",
        eos_token: str = "</s>",
        unk_token: str = "<unk>",
        pad_token: str = "<pad>",
        mask_token: str = "<mask>",
        model_max_length: int = 256,
        **kwargs,
    ) -> None:
        """Build the tokenizer from an in-memory vocab or a vocab file.

        Args:
            vocab_file: Generic vocabulary JSON file.
            selfies_vocab_file: Vocabulary file preferred for SELFIES.
            smiles_vocab_file: Vocabulary file preferred for SMILES.
            vocab: In-memory vocabulary; when given, no file is read.
            representation: ``"SELFIES"`` or ``"SMILES"`` (case-insensitive).
            bos_token, eos_token, unk_token, pad_token, mask_token: Special
                tokens; every one of them must be present in the vocabulary.
            model_max_length: Forwarded to the base class.
            **kwargs: Forwarded to the base class.

        Raises:
            ValueError: for an unsupported representation, a vocabulary that
                is ``None`` or malformed, or missing special tokens.
        """
        self.representation = _normalize_representation(representation)
        active_vocab_file = _select_vocab_file(
            representation=self.representation,
            vocab_file=vocab_file,
            selfies_vocab_file=selfies_vocab_file,
            smiles_vocab_file=smiles_vocab_file,
        )

        if vocab is None:
            if active_vocab_file is None:
                # No vocabulary at all: start from the special tokens only.
                vocab = {
                    bos_token: 0,
                    pad_token: 1,
                    eos_token: 2,
                    unk_token: 3,
                    mask_token: 4,
                }
            else:
                with open(active_vocab_file, encoding="utf-8") as f:
                    vocab = json.load(f)

        # Still None here means the JSON file itself contained ``null``.
        if vocab is None:
            raise ValueError("Loaded vocabulary is None.")

        self.vocab_file = str(active_vocab_file) if active_vocab_file is not None else None
        self.selfies_vocab_file = (
            str(selfies_vocab_file) if selfies_vocab_file is not None else None
        )
        self.smiles_vocab_file = str(smiles_vocab_file) if smiles_vocab_file is not None else None
        self.vocab = _coerce_vocab(vocab)
        self._require_special_tokens(
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            pad_token=pad_token,
            mask_token=mask_token,
        )
        self.ids_to_tokens = {idx: token for token, idx in self.vocab.items()}
        self.vocabulary_frequency: dict[str, int] = {}
        self.pair_counts: dict[tuple[str, str], int] = {}
        self._max_piece_span = _max_vocab_piece_span(self.vocab, self.representation)

        # The vocabulary attributes are assigned before the base initializer
        # runs; ``representation`` is forwarded to it as a keyword argument.
        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            pad_token=pad_token,
            mask_token=mask_token,
            model_max_length=model_max_length,
            representation=self.representation,
            **kwargs,
        )

    @property
    def vocab_size(self) -> int:
        """Number of entries in the vocabulary, special tokens included."""
        return len(self.vocab)

    @property
    def vocabulary(self) -> dict[str, int]:
        """Legacy alias for callers that previously used APETokenizer."""
        return self.vocab

    @vocabulary.setter
    def vocabulary(self, value: dict[str, int]) -> None:
        self.vocab = _coerce_vocab(value)
        self.update_reverse_vocabulary()
        self._refresh_tokenization_cache()

    @property
    def special_tokens(self) -> dict[str, int]:
        """Mapping of the five special token strings to their ids."""
        bos_token = str(self.bos_token)
        pad_token = str(self.pad_token)
        eos_token = str(self.eos_token)
        unk_token = str(self.unk_token)
        mask_token = str(self.mask_token)
        return {
            bos_token: self._convert_token_to_id(bos_token),
            pad_token: self._convert_token_to_id(pad_token),
            eos_token: self._convert_token_to_id(eos_token),
            unk_token: self._convert_token_to_id(unk_token),
            mask_token: self._convert_token_to_id(mask_token),
        }

    @special_tokens.setter
    def special_tokens(self, value: dict[str, int]) -> None:
        # ``setdefault`` only adds tokens that are not in the vocab yet; ids of
        # tokens already present are left untouched.
        for token, token_id in value.items():
            self.vocab.setdefault(str(token), int(token_id))
        self.vocab = _coerce_vocab(self.vocab)
        self.update_reverse_vocabulary()
        self._refresh_tokenization_cache()

    def get_vocab(self) -> dict[str, int]:
        """Return a shallow copy of the token-to-id vocabulary."""
        return dict(self.vocab)

    def update_reverse_vocabulary(self) -> None:
        """Rebuild the id-to-token lookup from the current vocabulary."""
        self.ids_to_tokens = {idx: token for token, idx in self.vocab.items()}

    def _refresh_tokenization_cache(self) -> None:
        """Recompute the cached longest piece span after a vocabulary change."""
        self._max_piece_span = _max_vocab_piece_span(self.vocab, self.representation)

    def _require_special_tokens(
        self,
        *,
        bos_token: str,
        eos_token: str,
        unk_token: str,
        pad_token: str,
        mask_token: str,
        vocab: Mapping[str, int] | None = None,
    ) -> None:
        """Raise ``ValueError`` if any special token is absent from the vocab.

        Args:
            vocab: Vocabulary to check; defaults to ``self.vocab``. Passing a
                candidate vocabulary allows validation before it is installed.
        """
        target = self.vocab if vocab is None else vocab
        missing = [
            token_text
            for token in [bos_token, eos_token, unk_token, pad_token, mask_token]
            if (token_text := _token_text(token)) not in target
        ]
        if missing:
            raise ValueError(f"Vocabulary is missing required special tokens: {missing}")

    def _vocab_from_frequencies(self, vocabulary_frequency: Mapping[str, int]) -> dict[str, int]:
        """Build a vocab with special tokens at ids 0-4 and learned tokens from 5.

        Learned tokens are numbered in the iteration order of
        ``vocabulary_frequency``.
        """
        return {
            str(self.bos_token): 0,
            str(self.pad_token): 1,
            str(self.eos_token): 2,
            str(self.unk_token): 3,
            str(self.mask_token): 4,
            **{word: idx for idx, word in enumerate(vocabulary_frequency, start=5)},
        }

    def pre_tokenize(self, molecule: str, representation: str | None = None) -> list[str]:
        """Split ``molecule`` into primitive pieces.

        Uses the tokenizer's own representation unless one is given.
        """
        return pre_tokenize_molecule(molecule, representation or self.representation)

    def _tokenize(self, text: str, **kwargs) -> list[str]:
        """Tokenize ``text`` with greedy longest-match; ``kwargs`` are ignored."""
        return ape_tokenize(
            text,
            vocab=self.vocab,
            representation=self.representation,
            unk_token=str(self.unk_token),
            max_piece_span=self._max_piece_span,
        )

    def encode_molecule(
        self,
        text: str,
        add_special_tokens: bool = True,
        max_length: int | None = None,
        truncation: bool = True,
    ) -> list[int]:
        """Fast molecular encode path avoiding generic Hugging Face tokenizer overhead.

        Args:
            text: Molecule string to encode.
            add_special_tokens: Wrap the ids with the BOS/EOS tokens.
            max_length: Maximum length of the returned list, special tokens
                included. ``None`` disables truncation.
            truncation: Whether ``max_length`` is enforced.

        Returns:
            The token ids. When truncation applies together with special
            tokens, the molecule ids are shortened *before* wrapping so the
            result still ends with the EOS id instead of having it cut off.
        """
        ids = [self._convert_token_to_id(token) for token in self._tokenize(text)]
        should_truncate = truncation and max_length is not None

        if not add_special_tokens:
            return ids[:max_length] if should_truncate else ids

        if should_truncate:
            # Reserve room for the special tokens that wrap a single sequence.
            overhead = len(self.build_inputs_with_special_tokens([]))
            ids = ids[: max(0, max_length - overhead)]

        ids = self.build_inputs_with_special_tokens(ids)

        if should_truncate:
            # Only has an effect when max_length is smaller than the overhead.
            ids = ids[:max_length]

        return ids

    def _convert_token_to_id(self, token: str) -> int:
        """Return the id of ``token``, or the unknown-token id if absent."""
        return self.vocab.get(token, self.vocab[str(self.unk_token)])

    def _convert_id_to_token(self, index: int) -> str:
        """Return the token for ``index``, or the unknown token if absent."""
        return self.ids_to_tokens.get(int(index), str(self.unk_token))

    def convert_tokens_to_string(self, tokens: list[str]) -> str:
        """Concatenate tokens with no separator."""
        return "".join(tokens)

    def _required_special_token_id(
        self,
        token_value: int | list[int] | str | list[str] | None,
        token_name: str,
    ) -> int:
        """Resolve a special token given as an id, a string, or a 1-item list.

        Raises:
            ValueError: if the value is ``None`` or cannot be reduced to a
                single id.
        """
        if token_value is None:
            raise ValueError(f"{token_name} must be set.")
        if isinstance(token_value, int):
            return token_value
        if isinstance(token_value, str):
            return self._convert_token_to_id(token_value)
        if len(token_value) == 1:
            only_value = token_value[0]
            if isinstance(only_value, int):
                return only_value
            if isinstance(only_value, str):
                return self._convert_token_to_id(only_value)
        raise ValueError(f"{token_name} must resolve to a single token id.")

    def build_inputs_with_special_tokens(
        self,
        token_ids_0: list[int],
        token_ids_1: list[int] | None = None,
    ) -> list[int]:
        """Wrap ids as ``BOS A EOS`` or, for a pair, ``BOS A EOS B EOS``."""
        bos_id = self._required_special_token_id(self.bos_token, "bos_token")
        eos_id = self._required_special_token_id(self.eos_token, "eos_token")
        if token_ids_1 is None:
            return [bos_id, *token_ids_0, eos_id]
        return [bos_id, *token_ids_0, eos_id, *token_ids_1, eos_id]

    def create_token_type_ids_from_sequences(
        self,
        token_ids_0: list[int],
        token_ids_1: list[int] | None = None,
    ) -> list[int]:
        """Return all-zero token type ids matching the wrapped sequence length."""
        return [0] * len(self.build_inputs_with_special_tokens(token_ids_0, token_ids_1))

    @staticmethod
    def _has_length(value: Any) -> bool:
        """Return True when ``len(value)`` works (i.e. it is not a scalar label)."""
        try:
            len(value)
        except TypeError:
            return False
        return True

    def pad(
        self,
        encoded_inputs: Any,
        padding: Any = True,
        max_length: int | None = None,
        pad_to_multiple_of: int | None = None,
        padding_side: str | None = None,
        return_attention_mask: bool | None = None,
        return_tensors: Any = None,
        verbose: bool = True,
    ):
        """Pad a batch, additionally padding per-token ``labels`` with -100.

        When padding is enabled and ``encoded_inputs`` is a list in which at
        least one item carries sequence-valued ``labels``, every item's labels
        are padded with -100 to the batch target length before the call is
        delegated to the base class. Items without labels receive an
        all--100 list. Scalar labels are passed through untouched instead of
        raising ``TypeError``.

        Labels are padded on the side given by ``padding_side`` when provided,
        falling back to ``self.padding_side``, so they stay aligned with the
        side the base class uses for the other fields.
        """
        padding_enabled = padding not in (False, "do_not_pad")
        if (
            padding_enabled
            and isinstance(encoded_inputs, list)
            and any(
                "labels" in item and self._has_length(item["labels"])
                for item in encoded_inputs
            )
        ):
            target_length = max(
                len(item.get("input_ids", item.get("labels", []))) for item in encoded_inputs
            )
            if padding == "max_length" and max_length is not None:
                target_length = max_length

            if pad_to_multiple_of and target_length % pad_to_multiple_of:
                target_length = ((target_length // pad_to_multiple_of) + 1) * pad_to_multiple_of

            # Honour a per-call override of the padding side.
            effective_side = padding_side if padding_side is not None else self.padding_side

            padded_inputs = []
            for item in encoded_inputs:
                item = dict(item)
                labels = item.get("labels", [])
                if self._has_length(labels):
                    labels = list(labels)
                    pad_len = max(0, target_length - len(labels))
                    if pad_len:
                        label_padding = [-100] * pad_len
                        if effective_side == "left":
                            labels = label_padding + labels
                        else:
                            labels = labels + label_padding
                    item["labels"] = labels
                padded_inputs.append(item)
            encoded_inputs = padded_inputs

        return super().pad(
            encoded_inputs,
            padding=padding,
            max_length=max_length,
            pad_to_multiple_of=pad_to_multiple_of,
            padding_side=padding_side,
            return_attention_mask=return_attention_mask,
            return_tensors=return_tensors,
            verbose=verbose,
        )

    def save_vocabulary(
        self,
        save_directory: str,
        filename_prefix: str | None = None,
    ) -> tuple[str, ...]:
        """Write the vocabulary as JSON into ``save_directory``.

        The file is named ``vocab.json``, or ``<prefix>-vocab.json`` when a
        prefix is given.

        Returns:
            A one-element tuple with the written file path.

        Raises:
            ValueError: if ``save_directory`` is not an existing directory.
        """
        if not os.path.isdir(save_directory):
            raise ValueError(f"Vocabulary path ({save_directory}) should be a directory.")

        vocab_file = Path(save_directory) / (
            f"{filename_prefix}-vocab.json" if filename_prefix else "vocab.json"
        )
        with vocab_file.open("w", encoding="utf-8") as f:
            json.dump(self.vocab, f, ensure_ascii=False, indent=4)
        return (str(vocab_file),)

    def add_tokens_to_vocabulary(self, tokens: list[str]) -> int:
        """Add tokens to the tokenizer vocabulary if they are not already present.

        This is intended for forcing coverage of rare valid molecular primitive
        symbols, especially SELFIES bracket tokens, after APE merge training.

        Tokens are stripped of surrounding whitespace; empty and already-known
        tokens are skipped. New tokens get consecutive ids after the current
        maximum id.

        Returns:
            The number of tokens actually added.
        """
        if not tokens:
            return 0

        next_id = max(self.vocab.values(), default=-1) + 1
        added = 0

        for token in tokens:
            token = str(token).strip()
            if not token:
                continue
            if token in self.vocab:
                continue

            self.vocab[token] = next_id
            next_id += 1
            added += 1

        if added:
            self.update_reverse_vocabulary()
            self._refresh_tokenization_cache()

        return added

    def save_pretrained(self, save_directory: str | os.PathLike[str], *args, **kwargs):
        """Save via the base class, then rewrite two config files.

        After the base-class save, ``special_tokens_map.json`` is overwritten
        with the five special tokens as plain strings, and
        ``tokenizer_config.json`` is updated: ``tokenizer_class`` is removed
        and ``representation``, ``model_max_length`` and an ``auto_map`` entry
        pointing at this class are set.

        Returns:
            Whatever the base-class ``save_pretrained`` returned.
        """
        saved_files = super().save_pretrained(save_directory, *args, **kwargs)
        save_path = Path(save_directory)

        special_tokens_map = {
            "bos_token": str(self.bos_token),
            "eos_token": str(self.eos_token),
            "unk_token": str(self.unk_token),
            "pad_token": str(self.pad_token),
            "mask_token": str(self.mask_token),
        }
        with (save_path / "special_tokens_map.json").open("w", encoding="utf-8") as f:
            json.dump(special_tokens_map, f, ensure_ascii=False, indent=2)

        tokenizer_config_path = save_path / "tokenizer_config.json"
        if tokenizer_config_path.exists():
            with tokenizer_config_path.open(encoding="utf-8") as f:
                tokenizer_config = json.load(f)
        else:
            tokenizer_config = {}
        tokenizer_config.pop("tokenizer_class", None)
        tokenizer_config.update(
            {
                "representation": self.representation,
                "model_max_length": self.model_max_length,
                "auto_map": {
                    # Second slot is left empty (no second class is provided).
                    "AutoTokenizer": [
                        "tokenization_ape.APEPreTrainedTokenizer",
                        None,
                    ],
                },
            }
        )
        with tokenizer_config_path.open("w", encoding="utf-8") as f:
            json.dump(tokenizer_config, f, ensure_ascii=False, indent=2)

        return saved_files

    def save_vocabulary_file(self, file_path: str | os.PathLike[str]) -> None:
        """Write the vocab to ``file_path`` and frequencies next to it.

        Parent directories are created as needed. Frequencies go to
        ``<stem>_freq.json`` in the same directory.
        """
        path = Path(file_path)
        path.parent.mkdir(parents=True, exist_ok=True)
        freq_path = path.with_name(f"{path.stem}_freq.json")

        with path.open("w", encoding="utf-8") as f:
            json.dump(self.vocab, f, ensure_ascii=False, indent=4)
        with freq_path.open("w", encoding="utf-8") as f:
            json.dump(self.vocabulary_frequency, f, ensure_ascii=False, indent=4)

    def load_vocabulary_file(
        self,
        file_path: str | os.PathLike[str],
        representation: str | None = None,
    ) -> None:
        """Replace the vocabulary with the contents of a JSON file.

        The new vocabulary (and representation, if given) is validated in full
        before any tokenizer state is changed, so a failed load leaves the
        tokenizer exactly as it was.

        Raises:
            ValueError: for an unsupported representation, a malformed
                vocabulary, or missing special tokens.
        """
        new_representation = (
            self.representation
            if representation is None
            else _normalize_representation(representation)
        )
        with open(file_path, encoding="utf-8") as f:
            loaded = json.load(f)
        new_vocab = _coerce_vocab(loaded)
        self._require_special_tokens(
            bos_token=str(self.bos_token),
            eos_token=str(self.eos_token),
            unk_token=str(self.unk_token),
            pad_token=str(self.pad_token),
            mask_token=str(self.mask_token),
            vocab=new_vocab,
        )

        # Everything validated: commit the new state in one go.
        self.representation = new_representation
        self.vocab = new_vocab
        self.update_reverse_vocabulary()
        self._refresh_tokenization_cache()

    def train(
        self,
        corpus,
        type: str = "selfies",
        representation: str | None = None,
        max_vocab_size: int = 5000,
        min_freq_for_merge: int = 2000,
        max_merge_pieces: int | None = 8,
        save_checkpoint: bool = False,
        checkpoint_path: str = "checkpoint",
        checkpoint_interval: int = 500,
    ) -> None:
        """Learn a vocabulary by iteratively merging the most frequent token pair.

        Args:
            corpus: Iterable of molecule strings (items are passed to ``str``).
            type: Legacy name for the representation; used only when
                ``representation`` is not given.
            representation: ``"SELFIES"`` or ``"SMILES"``. If it differs from
                the tokenizer's current representation, a warning is issued
                and the tokenizer's representation is overwritten.
            max_vocab_size: Stop once this many non-special tokens exist.
            min_freq_for_merge: Stop when the best pair is rarer than this.
            max_merge_pieces: Skip merges whose result would span more than
                this many primitive pieces; ``None`` disables the limit.
            save_checkpoint: Periodically save the vocabulary while training.
            checkpoint_path: Directory that receives checkpoints.
            checkpoint_interval: Number of new tokens between checkpoints.

        Raises:
            ValueError: for a non-positive ``checkpoint_interval`` with
                checkpointing enabled, an unsupported representation, or a
                corpus that yields no tokens.
        """
        import warnings

        # Fail fast on a bad argument before the expensive corpus pass.
        if save_checkpoint and checkpoint_interval <= 0:
            raise ValueError(
                "checkpoint_interval must be positive when save_checkpoint is enabled."
            )

        new_rep = _normalize_representation(representation or type)
        if new_rep != self.representation:
            warnings.warn(
                f"train() representation={new_rep!r} differs from tokenizer "
                f"representation={self.representation!r}. Overwriting.",
                UserWarning,
                stacklevel=2,
            )
            self.representation = new_rep

        if not corpus:
            raise ValueError("Cannot train APE tokenizer on an empty corpus.")

        print(f"Pretokenizing {self.representation}...", flush=True)
        tokenized_corpus = []
        vocabulary_frequency: defaultdict[str, int] = defaultdict(int)
        saw_tokens = False

        for sentence in corpus:
            tokens = self.pre_tokenize(str(sentence))
            if not tokens:
                continue
            saw_tokens = True
            for token in tokens:
                vocabulary_frequency[token] += 1
            # A single-token molecule has no adjacent pair to merge.
            if len(tokens) > 1:
                tokenized_corpus.append(tokens)
        print(
            f"Pretokenization complete, found {len(vocabulary_frequency)} tokens",
            flush=True,
        )

        if not saw_tokens:
            raise ValueError("Cannot train APE tokenizer on an empty corpus.")

        pre_tokens_counts = len(vocabulary_frequency)
        merged_counter = len(vocabulary_frequency) + 1
        checkpoint_increment = checkpoint_interval
        # Vocabulary size at which the next checkpoint is written.
        batch = checkpoint_interval + pre_tokens_counts
        piece_count_cache: dict[str, int] = {}

        def merged_piece_count(token: str) -> int:
            """Memoized primitive-piece count of ``token``."""
            count = piece_count_cache.get(token)
            if count is None:
                count = _base_piece_count(token, self.representation)
                piece_count_cache[token] = count
            return count

        def get_most_common_pair(tokenized):
            """Count adjacent pairs and return ``(pair, count)`` of the top one.

            Pairs whose merge would exceed ``max_merge_pieces`` are ignored.
            Ties go to the pair seen first. Returns ``(("", ""), 0)`` when no
            pair is eligible.
            """
            pair_counts: defaultdict[tuple[str, str], int] = defaultdict(int)
            for tokens in tokenized:
                for pair in zip(tokens, tokens[1:]):
                    if (
                        max_merge_pieces is not None
                        and merged_piece_count(pair[0] + pair[1]) > max_merge_pieces
                    ):
                        continue
                    pair_counts[pair] += 1

            self.pair_counts = dict(pair_counts)
            if not pair_counts:
                return ("", ""), 0

            # ``max`` keeps the first maximal entry, matching insertion order.
            return max(pair_counts.items(), key=lambda entry: entry[1])

        while True:
            if save_checkpoint and len(vocabulary_frequency) >= batch:
                self.vocabulary_frequency = dict(vocabulary_frequency)
                self.vocab = self._vocab_from_frequencies(vocabulary_frequency)
                self.ids_to_tokens = {idx: token for token, idx in self.vocab.items()}
                self._refresh_tokenization_cache()
                checkpoint_dir = Path(checkpoint_path)
                checkpoint_dir.mkdir(parents=True, exist_ok=True)
                self.save_vocabulary_file(checkpoint_dir / f"checkpoint_{batch}.json")
                self.save_pretrained(str(checkpoint_dir / f"checkpoint_{batch}"))
                print(f"Checkpoint saved at {checkpoint_dir}/checkpoint_{batch}.json")
                batch += checkpoint_increment

            if len(vocabulary_frequency) >= max_vocab_size:
                print("Max vocabulary achieved", flush=True)
                break

            if not tokenized_corpus:
                print("No more mergeable pairs", flush=True)
                break

            most_common_pair, freq = get_most_common_pair(tokenized_corpus)
            if freq < min_freq_for_merge:
                print("Not enough frequency found", flush=True)
                break

            if not most_common_pair[0] or not most_common_pair[1]:
                print("No valid merge pair found", flush=True)
                break

            left_token, right_token = most_common_pair
            merged_word = left_token + right_token
            if merged_word not in vocabulary_frequency:
                print(
                    f"New merge found: {merged_word} {merged_counter}/{max_vocab_size} "
                    f"{round(merged_counter / max_vocab_size * 100, 2)}%",
                    flush=True,
                )
                merged_counter += 1
            vocabulary_frequency[merged_word] += freq

            # Rewrite every sequence with the chosen pair fused, scanning
            # left to right so overlapping occurrences merge only once.
            new_tokenized_corpus = []
            for tokens in tokenized_corpus:
                new_tokens = []
                append_token = new_tokens.append
                i = 0
                token_count = len(tokens)
                while i < token_count:
                    if (
                        i < token_count - 1
                        and tokens[i] == left_token
                        and tokens[i + 1] == right_token
                    ):
                        append_token(merged_word)
                        i += 2
                    else:
                        append_token(tokens[i])
                        i += 1

                # Sequences reduced to one token can no longer contribute pairs.
                if len(new_tokens) > 1:
                    new_tokenized_corpus.append(new_tokens)

            tokenized_corpus = new_tokenized_corpus

        self.vocabulary_frequency = dict(vocabulary_frequency)
        self.vocab = self._vocab_from_frequencies(vocabulary_frequency)

        self.ids_to_tokens = {idx: token for token, idx in self.vocab.items()}
        self._refresh_tokenization_cache()

    def train_from_iterator(self, iterator, *args, **kwargs) -> None:
        """Not supported; always raises ``NotImplementedError``."""
        raise NotImplementedError("train_from_iterator is not implemented for APE")


# Register the class so saved checkpoints can map AutoTokenizer to it.
APEPreTrainedTokenizer.register_for_auto_class("AutoTokenizer")
tokenizer_config.json ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "<s>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "<pad>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "</s>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "<unk>",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "4": {
36
+ "content": "<mask>",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "auto_map": {
45
+ "AutoTokenizer": [
46
+ "tokenization_ape.APEPreTrainedTokenizer",
47
+ null
48
+ ]
49
+ },
50
+ "backend": "custom",
51
+ "bos_token": "<s>",
52
+ "eos_token": "</s>",
53
+ "mask_token": "<mask>",
54
+ "model_max_length": 256,
55
+ "pad_token": "<pad>",
56
+ "representation": "SMILES",
57
+ "unk_token": "<unk>"
58
+ }
vocab.json ADDED
@@ -0,0 +1,1388 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "#": 18,
3
+ "#N)": 643,
4
+ "%10": 81,
5
+ "%11": 91,
6
+ "%12": 92,
7
+ "%13": 93,
8
+ "%14": 94,
9
+ "%15": 95,
10
+ "%16": 96,
11
+ "%17": 97,
12
+ "%18": 98,
13
+ "%19": 99,
14
+ "%20": 100,
15
+ "%21": 101,
16
+ "%22": 102,
17
+ "%23": 103,
18
+ "%24": 104,
19
+ "%25": 105,
20
+ "%26": 106,
21
+ "%27": 250,
22
+ "(": 9,
23
+ "(-": 420,
24
+ "(-c2cc": 993,
25
+ "(-c3cc": 1123,
26
+ "(-c4cc": 1016,
27
+ "(/C=C/": 1255,
28
+ "(=O)": 288,
29
+ "(=O)=O": 842,
30
+ "(=O)CC": 1181,
31
+ "(=O)N": 321,
32
+ "(=O)N(": 1384,
33
+ "(=O)N1": 607,
34
+ "(=O)N2": 690,
35
+ "(=O)O": 908,
36
+ "(=O)O)": 637,
37
+ "(=O)[O-]": 397,
38
+ "(=O)o": 1083,
39
+ "(C": 309,
40
+ "(C#": 1018,
41
+ "(C#N)": 829,
42
+ "(C(=O)": 515,
43
+ "(C(F)": 664,
44
+ "(C)": 287,
45
+ "(C)(C)": 1194,
46
+ "(C)C": 518,
47
+ "(C)C)": 369,
48
+ "(C)CC": 394,
49
+ "(C)CCC": 1325,
50
+ "(C)N": 1324,
51
+ "(C)O": 557,
52
+ "(C)O)": 912,
53
+ "(C)c(": 1351,
54
+ "(C2": 1046,
55
+ "(C3)": 1292,
56
+ "(CC": 324,
57
+ "(CC(C": 961,
58
+ "(CC)": 952,
59
+ "(CCCC": 824,
60
+ "(CCN": 1041,
61
+ "(CN": 865,
62
+ "(CO": 868,
63
+ "(CO)": 653,
64
+ "(Cc1cc": 988,
65
+ "(Cc2cc": 726,
66
+ "(Cc3cc": 923,
67
+ "(Cl)": 746,
68
+ "(F)": 315,
69
+ "(F)(F)": 812,
70
+ "(F)F": 498,
71
+ "(F)F)": 361,
72
+ "(F)F.": 1079,
73
+ "(F)c(": 992,
74
+ "(N": 398,
75
+ "(N)": 574,
76
+ "(O": 764,
77
+ "(O)": 316,
78
+ "(O)CC": 1127,
79
+ "(O)O": 765,
80
+ "(O)O)": 980,
81
+ "(OC)": 748,
82
+ "(OC)c1": 1376,
83
+ "(S(=O)": 1210,
84
+ "([O-])": 570,
85
+ "(c1cc": 793,
86
+ "(c2cc": 632,
87
+ "(c3cc": 1011,
88
+ ")": 11,
89
+ ")C(=O)": 990,
90
+ ")CC": 668,
91
+ ")N": 350,
92
+ ")N1": 525,
93
+ ")N1CC": 818,
94
+ ")Nc1cc": 941,
95
+ ")c1": 771,
96
+ ")c1cc": 780,
97
+ ")cc": 285,
98
+ ")cc(": 530,
99
+ ")cc1": 303,
100
+ ")cc1)": 848,
101
+ ")cc12": 974,
102
+ ")cc2": 491,
103
+ ")cc2)": 554,
104
+ ")cc3": 858,
105
+ ")cc3)": 752,
106
+ ")ccc1": 507,
107
+ ")ccc1O": 1182,
108
+ ")ccn1": 1166,
109
+ "-": 29,
110
+ "-2": 1000,
111
+ "-c1cc": 471,
112
+ "-c1cn": 1289,
113
+ "-c2cc": 345,
114
+ "-c2cc(": 608,
115
+ "-c2cc3": 914,
116
+ "-c2cn": 710,
117
+ "-c2n": 564,
118
+ "-c2nc(": 1030,
119
+ "-c3cc": 381,
120
+ "-c3cn": 1153,
121
+ "-c3n": 1352,
122
+ "-c4cc": 869,
123
+ "-n2": 602,
124
+ ".": 23,
125
+ ".Cl": 470,
126
+ ".Cl.Cl": 1086,
127
+ ".[Br-]": 998,
128
+ ".[Cl-]": 1217,
129
+ ".[I-]": 1296,
130
+ ".[Na+]": 891,
131
+ "/": 19,
132
+ "/C": 346,
133
+ "/C(": 424,
134
+ "/C(=C/": 732,
135
+ "/C(=C\\": 702,
136
+ "/C(=N/": 1066,
137
+ "/C(=N\\": 755,
138
+ "/C(C)": 853,
139
+ "/C1": 955,
140
+ "/C=C": 809,
141
+ "/C=C(\\": 828,
142
+ "/C=C/": 390,
143
+ "/C=C2\\": 968,
144
+ "/C=C\\": 657,
145
+ "/C=N/": 794,
146
+ "/C=N/N": 1108,
147
+ "/N": 439,
148
+ "/N=C(\\": 1149,
149
+ "/N=C/": 614,
150
+ "/c(": 1126,
151
+ "1": 8,
152
+ "1)": 401,
153
+ "1)c1cc": 798,
154
+ "1C": 1023,
155
+ "1CC": 342,
156
+ "1CCC(": 542,
157
+ "1CCC[C@H]1": 1319,
158
+ "1CCN(": 932,
159
+ "2": 12,
160
+ "2)": 291,
161
+ "2)C1": 620,
162
+ "2)CC1": 436,
163
+ "2)c1": 407,
164
+ "2)c1=O": 1245,
165
+ "2)cc(": 1258,
166
+ "2)cc1": 358,
167
+ "2)cc1)": 1349,
168
+ "2)ccc1": 982,
169
+ "2)n1": 813,
170
+ "2C": 1196,
171
+ "2CC": 351,
172
+ "2CC2)": 969,
173
+ "2CC3CC": 1109,
174
+ "2CCC(": 604,
175
+ "2CCCC": 548,
176
+ "2CCN(": 618,
177
+ "2CCO": 897,
178
+ "2c(": 671,
179
+ "2c(cc1": 839,
180
+ "3": 13,
181
+ "3)": 302,
182
+ "3)C1": 1374,
183
+ "3)CC1": 956,
184
+ "3)c1": 859,
185
+ "3)cc": 379,
186
+ "3)cc1": 481,
187
+ "3)cc12": 1334,
188
+ "3)cc2": 572,
189
+ "3)cc2)": 679,
190
+ "3)cc21": 1287,
191
+ "3)ccc2": 1309,
192
+ "3)n": 404,
193
+ "3)n2)": 1313,
194
+ "3C(=O)": 1002,
195
+ "3C)": 716,
196
+ "3CC": 377,
197
+ "3CC3)": 1055,
198
+ "3CC4CC": 1238,
199
+ "3CCC(": 938,
200
+ "3CCCC": 535,
201
+ "3CCN": 861,
202
+ "3CCN(": 694,
203
+ "3CCOCC": 592,
204
+ "4": 14,
205
+ "4)": 336,
206
+ "4)CC": 841,
207
+ "4)CC3)": 1064,
208
+ "4)c3": 901,
209
+ "4)c3)": 1306,
210
+ "4)cc": 417,
211
+ "4)cc2": 1084,
212
+ "4)cc3": 766,
213
+ "4)cc3)": 749,
214
+ "4)ccc3": 1314,
215
+ "4CC": 426,
216
+ "4CC4)": 763,
217
+ "4CCCC": 756,
218
+ "4CCN": 1338,
219
+ "4CCN(": 1340,
220
+ "4CCOCC": 745,
221
+ "5": 35,
222
+ "5)": 432,
223
+ "5)CC": 1299,
224
+ "5)cc": 656,
225
+ "5CC": 591,
226
+ "5CCCC": 1382,
227
+ "5CCOCC": 1214,
228
+ "6": 48,
229
+ "6)": 819,
230
+ "6CC": 1283,
231
+ "7": 49,
232
+ "8": 50,
233
+ "9": 80,
234
+ "</s>": 2,
235
+ "<mask>": 4,
236
+ "<pad>": 1,
237
+ "<s>": 0,
238
+ "<unk>": 3,
239
+ "=": 17,
240
+ "=C": 329,
241
+ "=C(": 306,
242
+ "=C(/": 1075,
243
+ "=C(C)": 556,
244
+ "=C(N)": 1231,
245
+ "=C(\\": 651,
246
+ "=C/": 355,
247
+ "=C1": 595,
248
+ "=C1\\": 1035,
249
+ "=C2": 566,
250
+ "=C2\\": 843,
251
+ "=C3": 1288,
252
+ "=CC": 734,
253
+ "=C\\": 469,
254
+ "=N": 364,
255
+ "=N)": 1356,
256
+ "=N/": 562,
257
+ "=N/N": 881,
258
+ "=N\\": 578,
259
+ "=O": 356,
260
+ "=O)": 278,
261
+ "=O)CC": 1169,
262
+ "=O)c1": 1124,
263
+ "=O)cc": 1071,
264
+ "=O)cc1": 727,
265
+ "=O)cc2": 1261,
266
+ "=O)n1": 1308,
267
+ "=S)": 918,
268
+ "=[N+]": 1199,
269
+ "B": 54,
270
+ "Br": 20,
271
+ "Br)": 461,
272
+ "Br)cc": 623,
273
+ "Br)cc1": 697,
274
+ "Br)cc2": 1133,
275
+ "Br)cc2)": 1146,
276
+ "Br.": 1280,
277
+ "C": 5,
278
+ "C#": 368,
279
+ "C#C": 1092,
280
+ "C#N": 551,
281
+ "C#N)": 433,
282
+ "C#N)cc": 1053,
283
+ "C(": 279,
284
+ "C(=": 431,
285
+ "C(=N)": 584,
286
+ "C(=N)N": 831,
287
+ "C(=O)": 283,
288
+ "C(=O)C": 698,
289
+ "C(=O)N": 296,
290
+ "C(=O)O": 396,
291
+ "C(=S": 539,
292
+ "C(=S)": 1129,
293
+ "C(=S)N": 597,
294
+ "C(C": 1304,
295
+ "C(C#N)": 1236,
296
+ "C(C)": 318,
297
+ "C(C)C": 768,
298
+ "C(C)C)": 422,
299
+ "C(CC": 1051,
300
+ "C(Cl)": 1277,
301
+ "C(F)": 325,
302
+ "C(F)F)": 1031,
303
+ "C(N": 758,
304
+ "C(N)": 435,
305
+ "C(N)=N": 1327,
306
+ "C(N)=O": 1020,
307
+ "C(O)": 510,
308
+ "C(c1cc": 870,
309
+ "C(c2cc": 777,
310
+ "C(c3cc": 1226,
311
+ "C)": 281,
312
+ "C)CC": 1274,
313
+ "C)c(": 964,
314
+ "C)cc": 1233,
315
+ "C)cc1": 571,
316
+ "C)cc2": 1034,
317
+ "C)cc2)": 1070,
318
+ "C)cc3)": 1122,
319
+ "C/C=C\\": 1364,
320
+ "C1": 300,
321
+ "C1(": 1058,
322
+ "C1)": 508,
323
+ "C1)C2": 1229,
324
+ "C1)N": 1271,
325
+ "C1=": 862,
326
+ "C1=C(": 786,
327
+ "C1=N": 834,
328
+ "C1=O": 484,
329
+ "C1=O)": 773,
330
+ "C1CC": 523,
331
+ "C1CC1": 790,
332
+ "C1CC1)": 1278,
333
+ "C1CCCC": 559,
334
+ "C1CCN(": 706,
335
+ "C1c1cc": 1320,
336
+ "C2": 312,
337
+ "C2(": 627,
338
+ "C2)": 392,
339
+ "C2)C1": 1097,
340
+ "C2)CC1": 1062,
341
+ "C2)c1": 778,
342
+ "C2)cc1": 573,
343
+ "C2)n1": 1183,
344
+ "C2=": 1163,
345
+ "C2=C(": 920,
346
+ "C2=N": 810,
347
+ "C2=O": 696,
348
+ "C2=O)": 509,
349
+ "C2CC": 601,
350
+ "C2CC2)": 837,
351
+ "C2CCCC": 645,
352
+ "C2CCN(": 948,
353
+ "C3": 349,
354
+ "C3(CC": 1154,
355
+ "C3)": 441,
356
+ "C3)cc": 970,
357
+ "C3)cc1": 1078,
358
+ "C3)n": 883,
359
+ "C3=C(": 1385,
360
+ "C3=O)": 648,
361
+ "C3CC": 669,
362
+ "C3CC3)": 871,
363
+ "C3CCCC": 774,
364
+ "C4": 500,
365
+ "C4)": 563,
366
+ "C4=O)": 1132,
367
+ "C4CC": 1114,
368
+ "C4CC4)": 1076,
369
+ "C5": 801,
370
+ "C5)": 935,
371
+ "C=": 418,
372
+ "C=C": 665,
373
+ "C=C(": 776,
374
+ "C=C(C)": 913,
375
+ "C=C1": 808,
376
+ "C=C2": 1311,
377
+ "C=CC": 585,
378
+ "C=O)": 1152,
379
+ "CC": 274,
380
+ "CC#": 1189,
381
+ "CC(": 462,
382
+ "CC(=O)": 409,
383
+ "CC(C)": 343,
384
+ "CC(C)=": 1305,
385
+ "CC(C)N": 1138,
386
+ "CC(C)O": 1150,
387
+ "CC(N": 1110,
388
+ "CC(N)": 1336,
389
+ "CC(O)": 772,
390
+ "CC)": 806,
391
+ "CC1": 313,
392
+ "CC1(": 1088,
393
+ "CC1(C)": 826,
394
+ "CC1)": 493,
395
+ "CC1=C(": 1013,
396
+ "CC1CC": 1329,
397
+ "CC2": 449,
398
+ "CC2)": 373,
399
+ "CC2)C1": 1165,
400
+ "CC2)c1": 631,
401
+ "CC2)n1": 978,
402
+ "CC2CC": 1348,
403
+ "CC3": 1021,
404
+ "CC3)": 465,
405
+ "CC3)cc": 1221,
406
+ "CC3)n": 1027,
407
+ "CC4)": 720,
408
+ "CCC": 543,
409
+ "CCC(": 641,
410
+ "CCC(C)": 904,
411
+ "CCC1": 691,
412
+ "CCC2": 1007,
413
+ "CCCC": 298,
414
+ "CCCC(": 1307,
415
+ "CCCC1": 677,
416
+ "CCCC2": 893,
417
+ "CCCCC": 1112,
418
+ "CCCCC1": 715,
419
+ "CCCCCC": 565,
420
+ "CCCCCN": 1253,
421
+ "CCCCN": 621,
422
+ "CCCCN)": 1268,
423
+ "CCCCN1": 1223,
424
+ "CCCCO": 1026,
425
+ "CCCN": 459,
426
+ "CCCN1": 972,
427
+ "CCCN2": 1372,
428
+ "CCCO": 739,
429
+ "CCN": 301,
430
+ "CCN(": 344,
431
+ "CCN(C": 446,
432
+ "CCN(C)": 709,
433
+ "CCN(CC": 448,
434
+ "CCN1": 472,
435
+ "CCN1CC": 1054,
436
+ "CCN2": 719,
437
+ "CCNCC": 1087,
438
+ "CCO": 317,
439
+ "CCO)": 1328,
440
+ "CCOCC": 428,
441
+ "CCOCC1": 681,
442
+ "CCOCCO": 889,
443
+ "CCS": 852,
444
+ "CC[C@@H]1": 1230,
445
+ "CC[C@H](": 1369,
446
+ "CC[C@H](C)": 1218,
447
+ "CC[C@H]1": 1069,
448
+ "CCc1cc": 659,
449
+ "CCc1n": 940,
450
+ "CCc2cc": 977,
451
+ "CCc3cc": 1375,
452
+ "CCn1": 534,
453
+ "CCn1c(": 1224,
454
+ "CN": 295,
455
+ "CN(": 391,
456
+ "CN(C": 541,
457
+ "CN(C)": 444,
458
+ "CN(C)C": 1276,
459
+ "CN(CC": 646,
460
+ "CN)": 1107,
461
+ "CN1": 395,
462
+ "CN1CC": 647,
463
+ "CN2": 479,
464
+ "CN2CC": 898,
465
+ "CN3": 1293,
466
+ "CN3CC": 1208,
467
+ "CNCC": 1113,
468
+ "CO": 294,
469
+ "CO)": 489,
470
+ "CO1": 1318,
471
+ "COC": 1343,
472
+ "COC1": 1157,
473
+ "COCC": 888,
474
+ "COCCN": 1174,
475
+ "COCCN1": 1264,
476
+ "COCCO": 1171,
477
+ "CO[C@H]1": 1284,
478
+ "COc1c(": 1275,
479
+ "COc1cc": 340,
480
+ "COc1n": 1377,
481
+ "COc2cc": 791,
482
+ "COc3cc": 1065,
483
+ "CS": 413,
484
+ "CS(=O)": 580,
485
+ "CSc1n": 930,
486
+ "CSc2n": 1219,
487
+ "C[C@@H]": 339,
488
+ "C[C@@H](": 502,
489
+ "C[C@@H](C)": 840,
490
+ "C[C@@H](N": 877,
491
+ "C[C@@H](N)": 1367,
492
+ "C[C@@H](O)": 626,
493
+ "C[C@@H]1": 482,
494
+ "C[C@@H]1CC": 924,
495
+ "C[C@@H]1CN(": 1193,
496
+ "C[C@@H]2": 579,
497
+ "C[C@@H]3": 700,
498
+ "C[C@@H]4": 1029,
499
+ "C[C@@]1": 963,
500
+ "C[C@@]2": 1273,
501
+ "C[C@H]": 335,
502
+ "C[C@H](": 473,
503
+ "C[C@H](C)": 872,
504
+ "C[C@H](N": 799,
505
+ "C[C@H](N)": 1004,
506
+ "C[C@H](O)": 638,
507
+ "C[C@H]1": 460,
508
+ "C[C@H]1CC": 1036,
509
+ "C[C@H]1CN(": 1357,
510
+ "C[C@H]1O[C@@H](": 1074,
511
+ "C[C@H]2": 550,
512
+ "C[C@H]3": 685,
513
+ "C[C@H]4": 1151,
514
+ "C[C@]1": 851,
515
+ "C[C@]12CC": 1341,
516
+ "Cc1": 378,
517
+ "Cc1c(": 576,
518
+ "Cc1c(-": 1345,
519
+ "Cc1c[nH]": 1365,
520
+ "Cc1cc": 326,
521
+ "Cc1cc(": 490,
522
+ "Cc1cc2": 999,
523
+ "Cc1cn": 619,
524
+ "Cc1cs": 1252,
525
+ "Cc1n": 438,
526
+ "Cc1nc(": 792,
527
+ "Cc1no": 1072,
528
+ "Cc2c(": 1073,
529
+ "Cc2cc": 425,
530
+ "Cc2cc(": 1005,
531
+ "Cc3cc": 501,
532
+ "Cc4cc": 692,
533
+ "Cl": 22,
534
+ "Cl)": 320,
535
+ "Cl)c(": 775,
536
+ "Cl)c(Cl)": 667,
537
+ "Cl)cc": 347,
538
+ "Cl)cc(": 1290,
539
+ "Cl)cc1": 434,
540
+ "Cl)cc1)": 895,
541
+ "Cl)cc1Cl": 1344,
542
+ "Cl)cc2": 599,
543
+ "Cl)cc2)": 589,
544
+ "Cl)cc3": 811,
545
+ "Cl)cc3)": 725,
546
+ "Cl)ccc1": 934,
547
+ "Cl)ccc2": 835,
548
+ "Cl)ccc3": 1204,
549
+ "Cl.": 717,
550
+ "Clc1cc": 1256,
551
+ "Cn1": 399,
552
+ "Cn1c(": 890,
553
+ "Cn1cc": 1162,
554
+ "Cn1cc(": 762,
555
+ "Cn1cn": 926,
556
+ "Cn1nc(": 1337,
557
+ "Cn2": 567,
558
+ "Cn3": 1240,
559
+ "F": 15,
560
+ "F)": 289,
561
+ "F)c(": 954,
562
+ "F)c(Cl)": 1254,
563
+ "F)c(F)": 973,
564
+ "F)cc": 333,
565
+ "F)cc(": 1177,
566
+ "F)cc1": 423,
567
+ "F)cc1)": 728,
568
+ "F)cc1F": 1281,
569
+ "F)cc2": 568,
570
+ "F)cc2)": 546,
571
+ "F)cc3": 705,
572
+ "F)cc3)": 605,
573
+ "F)cc4": 1167,
574
+ "F)cc4)": 986,
575
+ "F)ccc1": 1008,
576
+ "F)ccc2": 1176,
577
+ "F)ccc3": 1373,
578
+ "FC(F)": 1049,
579
+ "Fc1cc": 1197,
580
+ "I": 40,
581
+ "I)": 1212,
582
+ "N": 10,
583
+ "N#": 476,
584
+ "N#C": 770,
585
+ "N(": 704,
586
+ "N(C": 823,
587
+ "N(C)": 654,
588
+ "N(C)C)": 937,
589
+ "N(CC": 795,
590
+ "N)": 311,
591
+ "N)cc": 1232,
592
+ "N)cc1": 1301,
593
+ "N)ncn": 817,
594
+ "N1": 427,
595
+ "N1CC": 864,
596
+ "N1CCN(": 900,
597
+ "N2": 445,
598
+ "N2CC": 1024,
599
+ "N2CCCC": 1155,
600
+ "N2CCN(": 945,
601
+ "N2CCO": 1259,
602
+ "N3": 1170,
603
+ "N4": 1068,
604
+ "N=C(": 558,
605
+ "N=C(N)": 789,
606
+ "NC": 984,
607
+ "NC(": 1160,
608
+ "NC(=O)": 314,
609
+ "NC(C)": 894,
610
+ "NCC": 633,
611
+ "NCc1cc": 1216,
612
+ "NS(=O)": 544,
613
+ "Nc1cc": 673,
614
+ "Nc1n": 549,
615
+ "Nc1nc(": 959,
616
+ "Nc2cc": 784,
617
+ "Nc2n": 1118,
618
+ "O": 6,
619
+ "O)": 275,
620
+ "O)CC": 827,
621
+ "O)c(": 887,
622
+ "O)c(O)": 850,
623
+ "O)c1": 921,
624
+ "O)c1cc": 1209,
625
+ "O)cc": 814,
626
+ "O)cc1": 516,
627
+ "O)cc1)": 1121,
628
+ "O)cc2": 804,
629
+ "O)cc2)": 838,
630
+ "O)cc3": 1366,
631
+ "O)cc3)": 1111,
632
+ "O1": 617,
633
+ "O2": 1042,
634
+ "O2)": 997,
635
+ "O3)": 1302,
636
+ "O=": 374,
637
+ "O=C(": 323,
638
+ "O=C(C": 1361,
639
+ "O=C(CC": 1200,
640
+ "O=C(CO": 1105,
641
+ "O=C(CS": 1312,
642
+ "O=C(N": 385,
643
+ "O=C(O)": 415,
644
+ "O=C1": 450,
645
+ "O=C1N": 879,
646
+ "O=[N+]([O-])": 821,
647
+ "O=c1": 693,
648
+ "O=c1[nH]": 1213,
649
+ "OC": 360,
650
+ "OC(": 701,
651
+ "OC(=O)": 467,
652
+ "OC(C)": 475,
653
+ "OC(F)": 561,
654
+ "OC)": 328,
655
+ "OC)c(": 1103,
656
+ "OC)c1": 569,
657
+ "OC)cc1": 906,
658
+ "OC1": 922,
659
+ "OC2": 1235,
660
+ "OCC": 376,
661
+ "OCC(O)": 1342,
662
+ "OCC)": 917,
663
+ "OCCCC": 899,
664
+ "OCCCN": 1350,
665
+ "OCCN": 830,
666
+ "OCCO": 629,
667
+ "OCCO2": 1381,
668
+ "OCO": 511,
669
+ "OCO4)": 1244,
670
+ "OCc1cc": 892,
671
+ "OCc2cc": 724,
672
+ "OCc3cc": 874,
673
+ "OO": 1360,
674
+ "O[C@@H](": 708,
675
+ "O[C@@H]1": 1038,
676
+ "O[C@@H]2": 1315,
677
+ "O[C@H](": 1242,
678
+ "O[C@H](CO)": 929,
679
+ "O[C@H]1": 958,
680
+ "O[C@H]2": 1333,
681
+ "Oc1cc": 642,
682
+ "Oc2cc": 644,
683
+ "Oc3cc": 753,
684
+ "Oc4cc": 1285,
685
+ "P": 43,
686
+ "P(=O)": 524,
687
+ "P(=O)(": 1243,
688
+ "S": 21,
689
+ "S(=O)": 319,
690
+ "S(C)": 521,
691
+ "S(N)": 723,
692
+ "S)": 652,
693
+ "S)N": 1317,
694
+ "S1": 943,
695
+ "S2": 1362,
696
+ "SC": 1291,
697
+ "SCC": 612,
698
+ "Sc2cc": 1363,
699
+ "[10B]": 228,
700
+ "[11C@@H]": 246,
701
+ "[11CH2]": 203,
702
+ "[11CH3]": 60,
703
+ "[11C]": 117,
704
+ "[11c]": 112,
705
+ "[123I-]": 253,
706
+ "[123I]": 56,
707
+ "[123Te]": 155,
708
+ "[124I-]": 265,
709
+ "[124I]": 113,
710
+ "[125I-]": 143,
711
+ "[125I]": 57,
712
+ "[127I]": 244,
713
+ "[127Xe]": 252,
714
+ "[129Xe]": 237,
715
+ "[131Cs]": 222,
716
+ "[131I-]": 269,
717
+ "[131I]": 68,
718
+ "[133Xe]": 263,
719
+ "[13CH3]": 224,
720
+ "[13CH]": 219,
721
+ "[13C]": 140,
722
+ "[13cH]": 164,
723
+ "[13c]": 163,
724
+ "[14C@@H]": 259,
725
+ "[14C@@]": 245,
726
+ "[14C@H]": 248,
727
+ "[14CH2]": 146,
728
+ "[14CH3]": 147,
729
+ "[14CH]": 154,
730
+ "[14C]": 121,
731
+ "[14cH]": 132,
732
+ "[14c]": 206,
733
+ "[15nH]": 165,
734
+ "[15n]": 114,
735
+ "[17F]": 179,
736
+ "[18F-]": 239,
737
+ "[18F]": 59,
738
+ "[18OH]": 205,
739
+ "[18O]": 258,
740
+ "[19F]": 122,
741
+ "[211At]": 231,
742
+ "[223Ra]": 162,
743
+ "[22Na+]": 160,
744
+ "[2H]": 69,
745
+ "[32PH]": 210,
746
+ "[32P]": 201,
747
+ "[35S]": 151,
748
+ "[3H]": 77,
749
+ "[42K+]": 157,
750
+ "[45Ca+2]": 148,
751
+ "[47Ca+2]": 166,
752
+ "[68Ga+3]": 220,
753
+ "[73Se]": 200,
754
+ "[75Se]": 267,
755
+ "[76Br]": 128,
756
+ "[81Kr]": 266,
757
+ "[82Rb+]": 255,
758
+ "[82Rb]": 226,
759
+ "[85Sr+2]": 150,
760
+ "[89Sr+2]": 260,
761
+ "[Ag+]": 159,
762
+ "[Ag-4]": 268,
763
+ "[Ag]": 123,
764
+ "[Al+3]": 184,
765
+ "[Al]": 186,
766
+ "[Ar]": 214,
767
+ "[As+]": 158,
768
+ "[As-]": 241,
769
+ "[AsH3]": 257,
770
+ "[AsH]": 227,
771
+ "[As]": 66,
772
+ "[At]": 235,
773
+ "[B-]": 76,
774
+ "[B@-]": 170,
775
+ "[B@@-]": 172,
776
+ "[BH-]": 204,
777
+ "[BH2-]": 167,
778
+ "[BH3-]": 111,
779
+ "[B]": 139,
780
+ "[Ba+2]": 133,
781
+ "[Ba]": 236,
782
+ "[Be+2]": 202,
783
+ "[Bi+3]": 141,
784
+ "[Bi]": 131,
785
+ "[Br+2]": 207,
786
+ "[Br-]": 47,
787
+ "[C+]": 79,
788
+ "[C-]": 90,
789
+ "[C@@H]": 24,
790
+ "[C@@H](": 365,
791
+ "[C@@H](C": 488,
792
+ "[C@@H](C)": 456,
793
+ "[C@@H](CC": 582,
794
+ "[C@@H](CO": 1359,
795
+ "[C@@H](CO)": 849,
796
+ "[C@@H](N": 1144,
797
+ "[C@@H](N)": 910,
798
+ "[C@@H](O": 857,
799
+ "[C@@H](O)": 419,
800
+ "[C@@H]1": 375,
801
+ "[C@@H]12": 951,
802
+ "[C@@H]1CC": 609,
803
+ "[C@@H]1O": 581,
804
+ "[C@@H]2": 406,
805
+ "[C@@H]2CC": 660,
806
+ "[C@@H]2O": 722,
807
+ "[C@@H]3": 477,
808
+ "[C@@H]3CC": 713,
809
+ "[C@@H]4": 613,
810
+ "[C@@H]4CC": 1120,
811
+ "[C@@H]4[C@@]5": 1178,
812
+ "[C@@H]5": 1047,
813
+ "[C@@H]5CC": 1096,
814
+ "[C@@]": 25,
815
+ "[C@@](C)": 721,
816
+ "[C@@](O)": 1207,
817
+ "[C@@]1": 532,
818
+ "[C@@]12": 1383,
819
+ "[C@@]2": 512,
820
+ "[C@@]2(C)": 855,
821
+ "[C@@]3": 615,
822
+ "[C@@]3(C)": 1139,
823
+ "[C@@]4": 767,
824
+ "[C@@]4(C)": 1370,
825
+ "[C@@]5": 942,
826
+ "[C@H]": 31,
827
+ "[C@H](": 357,
828
+ "[C@H](C": 453,
829
+ "[C@H](C)": 443,
830
+ "[C@H](C)CC": 1006,
831
+ "[C@H](CC": 596,
832
+ "[C@H](CO": 947,
833
+ "[C@H](CO)": 600,
834
+ "[C@H](N": 1040,
835
+ "[C@H](N)": 966,
836
+ "[C@H](O": 854,
837
+ "[C@H](O)": 412,
838
+ "[C@H]1": 383,
839
+ "[C@H]1CC": 547,
840
+ "[C@H]1CC[C@H](": 1227,
841
+ "[C@H]1CC[C@H]2": 1248,
842
+ "[C@H]1O": 655,
843
+ "[C@H]2": 402,
844
+ "[C@H]2CC": 683,
845
+ "[C@H]2O)": 1028,
846
+ "[C@H]3": 480,
847
+ "[C@H]3CC": 703,
848
+ "[C@H]3O)": 1239,
849
+ "[C@H]4": 666,
850
+ "[C@H]4CC": 1148,
851
+ "[C@H]5": 1134,
852
+ "[C@]": 32,
853
+ "[C@](C)": 769,
854
+ "[C@](O)": 1206,
855
+ "[C@]1": 468,
856
+ "[C@]1(C)": 1279,
857
+ "[C@]12": 712,
858
+ "[C@]12C": 1032,
859
+ "[C@]2": 494,
860
+ "[C@]2(": 1246,
861
+ "[C@]2(C)": 757,
862
+ "[C@]3": 552,
863
+ "[C@]3(C)": 781,
864
+ "[C@]4": 635,
865
+ "[C@]4(C)": 981,
866
+ "[C@]43C)": 1185,
867
+ "[C@]5": 1045,
868
+ "[CH-]": 125,
869
+ "[CH2]": 185,
870
+ "[CH3]": 187,
871
+ "[CH]": 264,
872
+ "[C]": 188,
873
+ "[Ca+2]": 89,
874
+ "[CaH2]": 233,
875
+ "[Ca]": 193,
876
+ "[Cl+2]": 199,
877
+ "[Cl+3]": 53,
878
+ "[Cl+]": 119,
879
+ "[Cl-]": 38,
880
+ "[Cl]": 108,
881
+ "[Cs+]": 67,
882
+ "[Cs]": 229,
883
+ "[F-]": 84,
884
+ "[H+]": 127,
885
+ "[H-]": 230,
886
+ "[He]": 86,
887
+ "[I+2]": 249,
888
+ "[I+3]": 251,
889
+ "[I+]": 129,
890
+ "[I-]": 55,
891
+ "[I]": 152,
892
+ "[K+]": 65,
893
+ "[KH]": 189,
894
+ "[K]": 254,
895
+ "[Kr]": 256,
896
+ "[Li+]": 83,
897
+ "[LiH]": 192,
898
+ "[Li]": 223,
899
+ "[Mg+2]": 61,
900
+ "[Mg+]": 234,
901
+ "[MgH2]": 195,
902
+ "[Mg]": 194,
903
+ "[N+]": 27,
904
+ "[N+](=O)[O-]": 403,
905
+ "[N+](C)": 902,
906
+ "[N+]([O-])": 785,
907
+ "[N-]": 34,
908
+ "[N@+]": 75,
909
+ "[N@@+]": 82,
910
+ "[N@@]": 120,
911
+ "[N@H+]": 240,
912
+ "[N@]": 72,
913
+ "[NH+]": 178,
914
+ "[NH-]": 107,
915
+ "[NH2+]": 177,
916
+ "[NH3+]": 238,
917
+ "[NH4+]": 211,
918
+ "[NH]": 136,
919
+ "[N]": 197,
920
+ "[Na+]": 39,
921
+ "[NaH]": 183,
922
+ "[Na]": 216,
923
+ "[O+]": 116,
924
+ "[O-2]": 217,
925
+ "[O-]": 28,
926
+ "[O-])": 528,
927
+ "[OH+]": 182,
928
+ "[OH-]": 130,
929
+ "[OH3+]": 221,
930
+ "[OH]": 175,
931
+ "[O]": 74,
932
+ "[P+]": 85,
933
+ "[P-]": 115,
934
+ "[P@+]": 137,
935
+ "[P@@+]": 196,
936
+ "[P@@]": 62,
937
+ "[P@]": 58,
938
+ "[PH2+]": 247,
939
+ "[PH2]": 168,
940
+ "[PH]": 109,
941
+ "[Ra]": 156,
942
+ "[Rb+]": 171,
943
+ "[Rb]": 262,
944
+ "[S+]": 42,
945
+ "[S+]([O-])": 1300,
946
+ "[S-2]": 144,
947
+ "[S-]": 52,
948
+ "[S@+]": 51,
949
+ "[S@@+]": 73,
950
+ "[S@@]": 46,
951
+ "[S@]": 138,
952
+ "[SH+]": 213,
953
+ "[SH-]": 169,
954
+ "[SH2]": 242,
955
+ "[SH]": 126,
956
+ "[S]": 142,
957
+ "[Sb]": 174,
958
+ "[Se+]": 190,
959
+ "[SeH2]": 270,
960
+ "[SeH]": 110,
961
+ "[Se]": 33,
962
+ "[Si-]": 271,
963
+ "[Si@]": 232,
964
+ "[SiH-]": 161,
965
+ "[SiH2]": 243,
966
+ "[SiH3-]": 181,
967
+ "[SiH4]": 218,
968
+ "[SiH]": 78,
969
+ "[Si]": 45,
970
+ "[Sr+2]": 134,
971
+ "[Sr]": 272,
972
+ "[TeH2]": 198,
973
+ "[TeH]": 225,
974
+ "[Te]": 63,
975
+ "[Xe]": 261,
976
+ "[Yb]": 173,
977
+ "[Zn+2]": 124,
978
+ "[Zn+]": 176,
979
+ "[Zn-2]": 212,
980
+ "[Zn]": 64,
981
+ "[b-]": 135,
982
+ "[c+]": 153,
983
+ "[c-]": 88,
984
+ "[cH-]": 118,
985
+ "[c]": 215,
986
+ "[n+]": 30,
987
+ "[n+]1": 797,
988
+ "[n+]2": 1077,
989
+ "[n-]": 70,
990
+ "[nH+]": 191,
991
+ "[nH]": 37,
992
+ "[nH]1": 730,
993
+ "[nH]2)": 1205,
994
+ "[nH]2)cc1": 1322,
995
+ "[nH]c(": 844,
996
+ "[nH]c(-": 1332,
997
+ "[nH]c(=O)": 695,
998
+ "[nH]c2c1": 1137,
999
+ "[nH]c3cc": 1251,
1000
+ "[o+]": 87,
1001
+ "[s+]": 44,
1002
+ "[se+]": 145,
1003
+ "[se]": 71,
1004
+ "[te+]": 149,
1005
+ "[te]": 180,
1006
+ "\\": 41,
1007
+ "b": 209,
1008
+ "c": 7,
1009
+ "c(": 277,
1010
+ "c(-": 359,
1011
+ "c(-c5": 1211,
1012
+ "c(=O)": 348,
1013
+ "c(=O)[nH]": 517,
1014
+ "c(=O)o": 979,
1015
+ "c(Br)": 1057,
1016
+ "c(C#N)": 836,
1017
+ "c(C(F)": 625,
1018
+ "c(C)": 380,
1019
+ "c(C)c1": 598,
1020
+ "c(C)cc": 1346,
1021
+ "c(CC": 1063,
1022
+ "c(CN": 707,
1023
+ "c(Cl)": 430,
1024
+ "c(Cl)c1": 867,
1025
+ "c(F)": 442,
1026
+ "c(F)c1": 761,
1027
+ "c(N": 362,
1028
+ "c(N)": 540,
1029
+ "c(O": 689,
1030
+ "c(O)": 414,
1031
+ "c(O)c(": 1321,
1032
+ "c(O)c1": 1116,
1033
+ "c(OC)": 416,
1034
+ "c(OCC": 1270,
1035
+ "c(S": 1014,
1036
+ "c(cc1": 1265,
1037
+ "c(cc2": 1056,
1038
+ "c(cc3": 1249,
1039
+ "c1": 276,
1040
+ "c1)": 366,
1041
+ "c1)N": 1247,
1042
+ "c1)OCO": 928,
1043
+ "c1-": 650,
1044
+ "c12": 455,
1045
+ "c12)": 1358,
1046
+ "c1=O": 526,
1047
+ "c1=O)": 1260,
1048
+ "c1C": 805,
1049
+ "c1Cl": 1142,
1050
+ "c1N": 1326,
1051
+ "c1O": 670,
1052
+ "c1[nH]": 1104,
1053
+ "c1c(": 495,
1054
+ "c1c(-": 1257,
1055
+ "c1c(C)": 803,
1056
+ "c1c(N": 1298,
1057
+ "c1c(O)": 916,
1058
+ "c1c2c(": 1316,
1059
+ "c1c[nH]": 744,
1060
+ "c1cc": 282,
1061
+ "c1cc(": 372,
1062
+ "c1cc(-": 875,
1063
+ "c1cc(C": 1330,
1064
+ "c1cc(N": 946,
1065
+ "c1cc2": 522,
1066
+ "c1ccc(": 292,
1067
+ "c1cn": 411,
1068
+ "c1cnc(": 1158,
1069
+ "c1cs": 994,
1070
+ "c1n": 332,
1071
+ "c1nc(": 478,
1072
+ "c1nc(-": 925,
1073
+ "c1nc(N": 661,
1074
+ "c1ncc": 676,
1075
+ "c1ncc(": 886,
1076
+ "c1ncn": 1025,
1077
+ "c1ncn2": 1136,
1078
+ "c1nnc(": 699,
1079
+ "c1s": 751,
1080
+ "c2": 280,
1081
+ "c2)": 338,
1082
+ "c2)C1": 962,
1083
+ "c2)CC1": 680,
1084
+ "c2)OCO": 991,
1085
+ "c2)c1": 575,
1086
+ "c2)cc1": 492,
1087
+ "c2)cn1": 1100,
1088
+ "c2)n1": 796,
1089
+ "c21": 514,
1090
+ "c23)": 678,
1091
+ "c2=O)": 545,
1092
+ "c2C)": 820,
1093
+ "c2C1": 1095,
1094
+ "c2Cl)": 1080,
1095
+ "c2F)": 863,
1096
+ "c2[nH]": 505,
1097
+ "c2[nH]1": 1234,
1098
+ "c2c(": 334,
1099
+ "c2c(-": 971,
1100
+ "c2c(C)": 688,
1101
+ "c2c(N": 1022,
1102
+ "c2c(O)": 1215,
1103
+ "c2c(c1": 788,
1104
+ "c2c1": 386,
1105
+ "c2c1)": 1099,
1106
+ "c2c3c(": 1089,
1107
+ "c2c[nH]": 735,
1108
+ "c2cc": 286,
1109
+ "c2cc(": 363,
1110
+ "c2cc(-": 740,
1111
+ "c2cc(C": 1310,
1112
+ "c2cc(N": 833,
1113
+ "c2cc1": 634,
1114
+ "c2cc3": 616,
1115
+ "c2ccc(": 308,
1116
+ "c2cccn": 873,
1117
+ "c2cn": 400,
1118
+ "c2cs": 816,
1119
+ "c2n": 341,
1120
+ "c2n1": 553,
1121
+ "c2nc(": 451,
1122
+ "c2nc(-": 760,
1123
+ "c2nc(N": 639,
1124
+ "c2ncc": 590,
1125
+ "c2ncc(": 856,
1126
+ "c2ncn": 684,
1127
+ "c2nnc(": 729,
1128
+ "c2nnn": 1323,
1129
+ "c2o": 742,
1130
+ "c2s": 663,
1131
+ "c3": 284,
1132
+ "c3)": 367,
1133
+ "c3)cc": 822,
1134
+ "c3)cc2": 1085,
1135
+ "c32)": 983,
1136
+ "c34)": 1017,
1137
+ "c3=O)": 1037,
1138
+ "c3C)": 949,
1139
+ "c3F)": 1172,
1140
+ "c3[nH]": 649,
1141
+ "c3c(": 371,
1142
+ "c3c(-": 1059,
1143
+ "c3c(C)": 880,
1144
+ "c3c(N": 1355,
1145
+ "c3c2": 750,
1146
+ "c3c2)": 1161,
1147
+ "c3c[nH]": 1147,
1148
+ "c3cc": 290,
1149
+ "c3cc(": 463,
1150
+ "c3cc4": 847,
1151
+ "c3ccc(": 331,
1152
+ "c3cn": 458,
1153
+ "c3n": 389,
1154
+ "c3nc(": 815,
1155
+ "c3nc(-": 1001,
1156
+ "c3nc(N": 1048,
1157
+ "c3ncc": 682,
1158
+ "c3ncc(": 1220,
1159
+ "c3ncn": 743,
1160
+ "c3nn": 845,
1161
+ "c3o": 1143,
1162
+ "c3s": 1052,
1163
+ "c4": 299,
1164
+ "c4)": 486,
1165
+ "c4)cc": 1272,
1166
+ "c43)": 1303,
1167
+ "c4[nH]": 800,
1168
+ "c4c(": 513,
1169
+ "c4c3": 1262,
1170
+ "c4c3)": 1195,
1171
+ "c4cc": 330,
1172
+ "c4cc(": 622,
1173
+ "c4ccc(": 466,
1174
+ "c4cccn": 1267,
1175
+ "c4cn": 630,
1176
+ "c4n": 759,
1177
+ "c4ncc": 1061,
1178
+ "c5": 354,
1179
+ "c5)": 953,
1180
+ "c5c(": 919,
1181
+ "c5cc": 457,
1182
+ "c5ccc(": 882,
1183
+ "c6": 560,
1184
+ "c7": 1191,
1185
+ "c[n+]": 1198,
1186
+ "c[nH]": 447,
1187
+ "cc": 273,
1188
+ "cc(": 297,
1189
+ "cc(-": 405,
1190
+ "cc(Br)": 783,
1191
+ "cc(C)": 520,
1192
+ "cc(CN": 672,
1193
+ "cc(CO": 1297,
1194
+ "cc(Cl)": 452,
1195
+ "cc(F)": 483,
1196
+ "cc(N": 496,
1197
+ "cc(O": 975,
1198
+ "cc(O)": 603,
1199
+ "cc(OC)": 731,
1200
+ "cc(OCC": 1106,
1201
+ "cc1": 337,
1202
+ "cc1-": 1101,
1203
+ "cc12": 1145,
1204
+ "cc2": 352,
1205
+ "cc2)": 529,
1206
+ "cc21": 1094,
1207
+ "cc2Cl)": 1371,
1208
+ "cc2c(": 1159,
1209
+ "cc2c1": 738,
1210
+ "cc2cc": 909,
1211
+ "cc3": 393,
1212
+ "cc3)": 593,
1213
+ "cc3c2": 1201,
1214
+ "cc3cc": 876,
1215
+ "cc4": 533,
1216
+ "cc4cc": 1368,
1217
+ "cc5": 960,
1218
+ "ccc(": 747,
1219
+ "ccc1": 310,
1220
+ "ccc1)": 408,
1221
+ "ccc1)N": 733,
1222
+ "ccc1-": 658,
1223
+ "ccc12": 536,
1224
+ "ccc12)": 741,
1225
+ "ccc1CN": 1335,
1226
+ "ccc1Cl": 1043,
1227
+ "ccc1Cl)": 1202,
1228
+ "ccc1F": 1135,
1229
+ "ccc1F)": 1380,
1230
+ "ccc1N": 1050,
1231
+ "ccc1O": 807,
1232
+ "ccc2": 307,
1233
+ "ccc2)": 353,
1234
+ "ccc2)N": 1186,
1235
+ "ccc21": 519,
1236
+ "ccc23)": 586,
1237
+ "ccc2C1": 1282,
1238
+ "ccc2Cl)": 860,
1239
+ "ccc2F)": 985,
1240
+ "ccc2c1": 718,
1241
+ "ccc2n1": 846,
1242
+ "ccc3": 322,
1243
+ "ccc3)": 388,
1244
+ "ccc32)": 1044,
1245
+ "ccc34)": 736,
1246
+ "ccc3Cl)": 1115,
1247
+ "ccc3F)": 1140,
1248
+ "ccc4": 382,
1249
+ "ccc4)": 531,
1250
+ "ccc43)": 1156,
1251
+ "ccc5": 587,
1252
+ "ccc5)": 1175,
1253
+ "cccc": 931,
1254
+ "cccc1": 1012,
1255
+ "cccc2": 1179,
1256
+ "ccccc2": 1130,
1257
+ "ccccc3": 1019,
1258
+ "ccccc4": 1331,
1259
+ "ccn": 437,
1260
+ "ccn1": 714,
1261
+ "ccn1)": 1228,
1262
+ "ccn2": 1378,
1263
+ "ccn2)": 911,
1264
+ "ccn3)": 976,
1265
+ "cn": 293,
1266
+ "cn1": 474,
1267
+ "cn1)": 885,
1268
+ "cn2": 577,
1269
+ "cn2)": 711,
1270
+ "cn3": 967,
1271
+ "cn3)": 782,
1272
+ "cnc1": 588,
1273
+ "cnc1)": 1203,
1274
+ "cnc2": 611,
1275
+ "cnc2)": 1263,
1276
+ "cnc2c1": 1164,
1277
+ "cnc3": 555,
1278
+ "cnc3)": 939,
1279
+ "cnn2": 987,
1280
+ "co": 440,
1281
+ "co1": 802,
1282
+ "co1)": 1250,
1283
+ "co2)": 989,
1284
+ "co3)": 1081,
1285
+ "cs": 387,
1286
+ "cs1": 662,
1287
+ "cs1)": 1168,
1288
+ "cs2)": 905,
1289
+ "cs3)": 1009,
1290
+ "n": 16,
1291
+ "n(": 497,
1292
+ "n(-": 675,
1293
+ "n(C": 537,
1294
+ "n(C)": 485,
1295
+ "n(CC": 624,
1296
+ "n1": 305,
1297
+ "n1)": 527,
1298
+ "n1-": 1119,
1299
+ "n12": 866,
1300
+ "n1C": 1015,
1301
+ "n1c(": 1295,
1302
+ "n1cc(": 1141,
1303
+ "n1cn": 950,
1304
+ "n2": 327,
1305
+ "n2)": 538,
1306
+ "n2)CC1": 1090,
1307
+ "n2)c1": 896,
1308
+ "n2)cc1": 504,
1309
+ "n2C": 1067,
1310
+ "n2C)": 1093,
1311
+ "n2c(": 1117,
1312
+ "n2c1": 1237,
1313
+ "n2cc": 1003,
1314
+ "n2cc(": 1033,
1315
+ "n2cn": 944,
1316
+ "n2nc(": 1241,
1317
+ "n3": 410,
1318
+ "n3)": 610,
1319
+ "n3)cc": 1082,
1320
+ "n3C)": 1269,
1321
+ "n3cc": 1102,
1322
+ "n3cn": 1039,
1323
+ "n4": 628,
1324
+ "n4)": 1184,
1325
+ "n5": 1286,
1326
+ "n[nH]": 640,
1327
+ "nc(": 304,
1328
+ "nc(-": 421,
1329
+ "nc(C)": 583,
1330
+ "nc(N": 384,
1331
+ "nc(N)": 687,
1332
+ "nc(S": 1128,
1333
+ "nc1": 429,
1334
+ "nc1-": 1353,
1335
+ "nc12": 957,
1336
+ "nc2": 464,
1337
+ "nc21": 1188,
1338
+ "nc2c(": 907,
1339
+ "nc2c1": 787,
1340
+ "nc2cc": 737,
1341
+ "nc2cc(": 1222,
1342
+ "nc2n1": 1354,
1343
+ "nc3": 506,
1344
+ "nc3)": 1098,
1345
+ "nc3cc": 825,
1346
+ "nc4": 936,
1347
+ "nc4cc": 1379,
1348
+ "ncc": 487,
1349
+ "ncc(": 995,
1350
+ "ncc1": 606,
1351
+ "ncc2": 674,
1352
+ "ncc2)": 1173,
1353
+ "ncc3": 754,
1354
+ "ncc3)": 1187,
1355
+ "ncc4": 1266,
1356
+ "ncn": 454,
1357
+ "nn": 370,
1358
+ "nn(C)": 1180,
1359
+ "nn1": 594,
1360
+ "nn2": 686,
1361
+ "nn2)": 996,
1362
+ "nn3": 1010,
1363
+ "nn3)": 1131,
1364
+ "nnc1": 1091,
1365
+ "no": 636,
1366
+ "no1": 884,
1367
+ "o": 36,
1368
+ "o1": 503,
1369
+ "o2)": 915,
1370
+ "o2)cc1": 878,
1371
+ "o3)": 1190,
1372
+ "oc(": 832,
1373
+ "oc(-": 1294,
1374
+ "oc2c1": 1225,
1375
+ "oc2cc": 1347,
1376
+ "on1": 1125,
1377
+ "p": 208,
1378
+ "s": 26,
1379
+ "s1": 499,
1380
+ "s1)": 903,
1381
+ "s2)": 779,
1382
+ "s2)cc1": 927,
1383
+ "s3)": 933,
1384
+ "sc(": 1192,
1385
+ "sc1": 965,
1386
+ "sc2": 1339,
1387
+ "sc2c1": 1060
1388
+ }