Add ChEMBL36 APE SMILES tokenizer max6 mf3000

Browse files

Files changed (5) hide show

metadata.json +28 -0
special_tokens_map.json +7 -0
tokenization_ape.py +709 -0
tokenizer_config.json +58 -0
vocab.json +1388 -0

metadata.json ADDED Viewed

	@@ -0,0 +1,28 @@

+{
+  "ape_source": "modernmolbert.local",
+  "created_at_utc": "2026-05-22T04:05:53.431998+00:00",
+  "creation_command": "python -m modernmolbert.train_ape_tokenizer",
+  "dataset_name": "data/pretrain/chembl36_selfies",
+  "extra_vocab_selfies_path": null,
+  "extra_vocab_symbols_added": 0,
+  "extra_vocab_symbols_path": null,
+  "extra_vocab_symbols_requested": 0,
+  "max_merge_pieces": 6,
+  "max_vocab_size": 2000,
+  "min_freq_for_merge": 3000,
+  "molecule_column": "smiles_canonical_clean",
+  "representation": "SMILES",
+  "seed": 42,
+  "shuffle_buffer_size": 100000,
+  "special_ids": {
+    "bos_token": 0,
+    "eos_token": 2,
+    "mask_token": 4,
+    "pad_token": 1,
+    "unk_token": 3
+  },
+  "tokenizer_path": "tokenizer/chembl36_smiles_2m_ape_max6_mf3000.json",
+  "tokenizer_sha256": "faf7748e8959b252c9d0ad83c2228df37a45dc9a68c15ead1ced2942cc8f155e",
+  "tokenizer_train_size": 2000000,
+  "vocab_size": 1386
+}

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "bos_token": "<s>",
+  "eos_token": "</s>",
+  "unk_token": "<unk>",
+  "pad_token": "<pad>",
+  "mask_token": "<mask>"
+}

tokenization_ape.py ADDED Viewed

	@@ -0,0 +1,709 @@

+"""Hugging Face-compatible tokenizer for APE molecular vocabularies.
+This file is intentionally self-contained so it can be copied into a model repo
+and loaded by ``AutoTokenizer.from_pretrained(..., trust_remote_code=True)``.
+"""
+import json
+import os
+import re
+from collections.abc import Mapping
+from collections import defaultdict
+from pathlib import Path
+from typing import Any, Literal
+from transformers import PreTrainedTokenizer
# The two molecular string formats this tokenizer can be configured for.
Representation = Literal["SELFIES", "SMILES"]

# Logical vocabulary-file slots mapped to the file names expected on disk.
VOCAB_FILES_NAMES = dict(
    vocab_file="vocab.json",
    selfies_vocab_file="selfies_vocab.json",
    smiles_vocab_file="smiles_vocab.json",
)

# A SELFIES primitive is one bracketed symbol: "[" ... "]" with no nested "]".
SELFIES_RE = re.compile(r"\[[^\]]+\]")

# SMILES primitives. Alternation in ``re`` is ordered (first alternative that
# matches wins, not the longest), which has two visible consequences here:
#   * ``Si?`` accepts a lone "S", so ``Se?`` is never reached and an
#     unbracketed "Se" is split into "S" followed by a stray "e";
#   * ``Cl?`` accepts a lone "C", so ``Ca?`` is never reached either.
# NOTE(review): the pattern is deliberately kept byte-for-byte; the vocabulary
# shipped next to this file was presumably built with exactly this splitting.
SMILES_RE = re.compile(
    "".join(
        (
            r"(\[[^\]]+\]|Br?|Cl?|Si?|Se?|Li?|Na?|Mg?|Al?|Ca?|Fe?|Zn?|",  # bracket atoms, 1-2 letter symbols
            r"N|O|S|P|F|I|K|B|C|H|",  # single-letter upper-case symbols
            r"b|c|n|o|s|p|",  # lower-case (aromatic) symbols
            r"\%\d{2}|\d|",  # two-digit (%nn) and single-digit ring closures
            r"\(|\)|\.|=|#|-|\+|\\|/|:|~|@|\?|\*|\$)",  # branches, bonds, misc. punctuation
        )
    )
)
def _base_piece_count(token: str, representation: str) -> int:
    """Return how many primitive pieces ``token`` splits into, never less than 1.

    The token is run through ``pre_tokenize_molecule`` for the given
    representation; a token that yields no pieces still counts as one.
    """
    piece_total = len(pre_tokenize_molecule(token, representation))
    return piece_total if piece_total > 1 else 1
def _max_vocab_piece_span(vocab: dict[str, int], representation: str) -> int:
    """Return the largest primitive-piece count of any non-special vocab token.

    Tokens written as ``<...>`` are treated as special and ignored. The result
    is 1 when the vocabulary holds no other tokens.
    """
    spans = (
        _base_piece_count(entry, representation)
        for entry in vocab
        if not (entry.startswith("<") and entry.endswith(">"))
    )
    # Every span is already >= 1, so a default of 1 matches the empty case.
    return max(spans, default=1)
+def _coerce_vocab(vocab: Mapping[str, Any]) -> dict[str, int]:
+    if not isinstance(vocab, Mapping):
+        raise ValueError("Vocabulary must be a JSON object mapping token strings to integer IDs.")
+    out = {str(token): int(idx) for token, idx in vocab.items()}
+    if len(set(out.values())) != len(out):
+        raise ValueError("Vocabulary token IDs must be unique.")
+    return out
+def _token_text(token: Any) -> str:
+    return str(getattr(token, "content", token))
+def _normalize_representation(representation: str) -> Representation:
+    normalized = representation.upper()
+    if normalized not in {"SELFIES", "SMILES"}:
+        raise ValueError(f"representation must be 'SELFIES' or 'SMILES', got {representation!r}")
+    return normalized  # type: ignore[return-value]
+def _select_vocab_file(
+    *,
+    representation: Representation,
+    vocab_file: str | os.PathLike[str] | None,
+    selfies_vocab_file: str | os.PathLike[str] | None,
+    smiles_vocab_file: str | os.PathLike[str] | None,
+) -> str | os.PathLike[str] | None:
+    if representation == "SELFIES" and selfies_vocab_file is not None:
+        return selfies_vocab_file
+    if representation == "SMILES" and smiles_vocab_file is not None:
+        return smiles_vocab_file
+    return vocab_file
def pre_tokenize_molecule(molecule: str, representation: str) -> list[str]:
    """Split ``molecule`` into primitive pieces for the given representation.

    SELFIES input yields its bracketed symbols only. SMILES input yields every
    ``SMILES_RE`` match in order; characters the pattern does not cover are
    emitted one by one, and whitespace-only pieces are dropped.

    Raises:
        ValueError: If ``representation`` is not a supported format.
    """
    if _normalize_representation(representation) == "SELFIES":
        return SELFIES_RE.findall(molecule)

    pieces: list[str] = []
    position = 0
    for found in SMILES_RE.finditer(molecule):
        start, end = found.span()
        # Text between two matches is kept, one character per piece.
        pieces += molecule[position:start]
        pieces.append(found.group(0))
        position = end
    # Trailing text after the last match gets the same per-character treatment.
    pieces += molecule[position:]
    return [piece for piece in pieces if piece and not piece.isspace()]
def ape_tokenize(
    text: str,
    vocab: dict[str, int],
    representation: str,
    unk_token: str = "<unk>",
    max_piece_span: int | None = None,
) -> list[str]:
    """Tokenize ``text`` by greedy longest-match over primitive pieces.

    Starting at each position, the longest run of consecutive pieces (at most
    ``max_piece_span`` long) whose concatenation is in ``vocab`` becomes one
    token. When no run matches, ``unk_token`` is emitted and a single piece is
    skipped.

    Args:
        text: Molecule string to tokenize.
        vocab: Token-to-ID mapping; only key membership is consulted.
        representation: ``"SELFIES"`` or ``"SMILES"`` (case-insensitive).
        unk_token: Token emitted for unmatched pieces and for input that has
            no pieces at all.
        max_piece_span: Longest run to try; computed from ``vocab`` when
            ``None``.

    Returns:
        The list of tokens, never empty.
    """
    pieces = pre_tokenize_molecule(text, representation)
    if not pieces:
        return [unk_token]

    span = (
        _max_vocab_piece_span(vocab, representation)
        if max_piece_span is None
        else max_piece_span
    )
    total = len(pieces)
    output: list[str] = []
    start = 0
    while start < total:
        furthest = min(total, start + span)
        # Try the longest window first and shrink until something is known.
        match_end = next(
            (
                end
                for end in range(furthest, start, -1)
                if "".join(pieces[start:end]) in vocab
            ),
            None,
        )
        if match_end is None:
            output.append(unk_token)
            start += 1
        else:
            output.append("".join(pieces[start:match_end]))
            start = match_end
    return output
class APEPreTrainedTokenizer(PreTrainedTokenizer):
    """Hugging Face tokenizer backend for APE molecular tokenization. (Not fast)

    A molecule string (SELFIES or SMILES) is split into primitive pieces and
    then matched greedily against the vocabulary, longest run of pieces first.
    Vocabularies produced by ``train`` reserve IDs 0-4 for the bos, pad, eos,
    unk and mask tokens, in that order.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file: str | os.PathLike[str] | None = None,
        selfies_vocab_file: str | os.PathLike[str] | None = None,
        smiles_vocab_file: str | os.PathLike[str] | None = None,
        vocab: dict[str, Any] | None = None,
        representation: str = "SELFIES",
        bos_token: str = "<s>",
        eos_token: str = "</s>",
        unk_token: str = "<unk>",
        pad_token: str = "<pad>",
        mask_token: str = "<mask>",
        model_max_length: int = 256,
        **kwargs,
    ) -> None:
        """Build the tokenizer from an in-memory vocab, a vocab file, or neither.

        Args:
            vocab_file: Generic vocabulary JSON file.
            selfies_vocab_file: Vocabulary file preferred when the
                representation is SELFIES.
            smiles_vocab_file: Vocabulary file preferred when the
                representation is SMILES.
            vocab: In-memory vocabulary; when given, no file is read.
            representation: ``"SELFIES"`` or ``"SMILES"`` (case-insensitive).
            bos_token, eos_token, unk_token, pad_token, mask_token: Special
                tokens; each must be present in the vocabulary.
            model_max_length: Forwarded to the base class.
            **kwargs: Forwarded to the base class.

        Raises:
            ValueError: If the representation is unsupported, the loaded
                vocabulary is ``None`` or malformed, or a special token is
                missing from it.
        """
        self.representation = _normalize_representation(representation)
        active_vocab_file = _select_vocab_file(
            representation=self.representation,
            vocab_file=vocab_file,
            selfies_vocab_file=selfies_vocab_file,
            smiles_vocab_file=smiles_vocab_file,
        )
        if vocab is None:
            if active_vocab_file is None:
                # No vocabulary supplied at all: start with the specials only.
                vocab = {
                    bos_token: 0,
                    pad_token: 1,
                    eos_token: 2,
                    unk_token: 3,
                    mask_token: 4,
                }
            else:
                with open(active_vocab_file, encoding="utf-8") as f:
                    vocab = json.load(f)
        # Still None here only when the JSON file itself contained ``null``.
        if vocab is None:
            raise ValueError("Loaded vocabulary is None.")
        self.vocab_file = str(active_vocab_file) if active_vocab_file is not None else None
        self.selfies_vocab_file = (
            str(selfies_vocab_file) if selfies_vocab_file is not None else None
        )
        self.smiles_vocab_file = str(smiles_vocab_file) if smiles_vocab_file is not None else None
        self.vocab = _coerce_vocab(vocab)
        self._require_special_tokens(
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            pad_token=pad_token,
            mask_token=mask_token,
        )
        self.ids_to_tokens = {idx: token for token, idx in self.vocab.items()}
        # Training statistics; filled in by ``train``.
        self.vocabulary_frequency: dict[str, int] = {}
        self.pair_counts: dict[tuple[str, str], int] = {}
        self._max_piece_span = _max_vocab_piece_span(self.vocab, self.representation)
        # NOTE(review): the vocabulary is set up before the base initializer
        # runs, presumably because the base class queries it - confirm.
        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            pad_token=pad_token,
            mask_token=mask_token,
            model_max_length=model_max_length,
            representation=self.representation,
            **kwargs,
        )

    @property
    def vocab_size(self) -> int:
        """Number of entries in the vocabulary, special tokens included."""
        return len(self.vocab)

    @property
    def vocabulary(self) -> dict[str, int]:
        """Legacy alias for callers that previously used APETokenizer."""
        return self.vocab

    @vocabulary.setter
    def vocabulary(self, value: dict[str, int]) -> None:
        self.vocab = _coerce_vocab(value)
        self.update_reverse_vocabulary()
        self._refresh_tokenization_cache()

    @property
    def special_tokens(self) -> dict[str, int]:
        """Mapping of the five special token strings to their current IDs."""
        bos_token = str(self.bos_token)
        pad_token = str(self.pad_token)
        eos_token = str(self.eos_token)
        unk_token = str(self.unk_token)
        mask_token = str(self.mask_token)
        return {
            bos_token: self._convert_token_to_id(bos_token),
            pad_token: self._convert_token_to_id(pad_token),
            eos_token: self._convert_token_to_id(eos_token),
            unk_token: self._convert_token_to_id(unk_token),
            mask_token: self._convert_token_to_id(mask_token),
        }

    @special_tokens.setter
    def special_tokens(self, value: dict[str, int]) -> None:
        # Tokens already in the vocabulary keep their ID (``setdefault``).
        # Work on a copy so a rejected update (duplicate ID) leaves the live
        # vocabulary untouched instead of half-modified.
        candidate = dict(self.vocab)
        for token, token_id in value.items():
            candidate.setdefault(str(token), int(token_id))
        self.vocab = _coerce_vocab(candidate)
        self.update_reverse_vocabulary()
        self._refresh_tokenization_cache()

    def get_vocab(self) -> dict[str, int]:
        """Return a shallow copy of the token-to-ID mapping."""
        return dict(self.vocab)

    def update_reverse_vocabulary(self) -> None:
        """Rebuild the ID-to-token lookup from the current vocabulary."""
        self.ids_to_tokens = {idx: token for token, idx in self.vocab.items()}

    def _refresh_tokenization_cache(self) -> None:
        """Recompute the cached longest-token span after a vocabulary change."""
        self._max_piece_span = _max_vocab_piece_span(self.vocab, self.representation)

    def _require_special_tokens(
        self,
        *,
        bos_token: str,
        eos_token: str,
        unk_token: str,
        pad_token: str,
        mask_token: str,
        vocab: Mapping[str, int] | None = None,
    ) -> None:
        """Raise ``ValueError`` unless every special token is in the vocabulary.

        Args:
            bos_token, eos_token, unk_token, pad_token, mask_token: Tokens to
                look up; objects with a ``content`` attribute are accepted.
            vocab: Vocabulary to check; defaults to ``self.vocab``. Passing a
                candidate lets callers validate before committing it.
        """
        target = self.vocab if vocab is None else vocab
        missing = [
            token_text
            for token in [bos_token, eos_token, unk_token, pad_token, mask_token]
            if (token_text := _token_text(token)) not in target
        ]
        if missing:
            raise ValueError(f"Vocabulary is missing required special tokens: {missing}")

    def pre_tokenize(self, molecule: str, representation: str | None = None) -> list[str]:
        """Split ``molecule`` into primitive pieces.

        Uses ``representation`` when given, else the tokenizer's own.
        """
        return pre_tokenize_molecule(molecule, representation or self.representation)

    def _tokenize(self, text: str, **kwargs) -> list[str]:
        """Tokenize ``text`` with the current vocabulary; ``kwargs`` are ignored."""
        return ape_tokenize(
            text,
            vocab=self.vocab,
            representation=self.representation,
            unk_token=str(self.unk_token),
            max_piece_span=self._max_piece_span,
        )

    def encode_molecule(
        self,
        text: str,
        add_special_tokens: bool = True,
        max_length: int | None = None,
        truncation: bool = True,
    ) -> list[int]:
        """Fast molecular encode path avoiding generic Hugging Face tokenizer overhead.

        Args:
            text: Molecule string to encode.
            add_special_tokens: Wrap the IDs with the bos/eos IDs.
            max_length: Maximum number of IDs to return when truncating.
            truncation: Cut the result to ``max_length`` when it is set.

        Returns:
            The list of token IDs.
        """
        tokens = self._tokenize(text)
        ids = [self._convert_token_to_id(token) for token in tokens]
        if add_special_tokens:
            ids = self.build_inputs_with_special_tokens(ids)
        # NOTE(review): truncation happens after the special tokens are added,
        # so a truncated sequence loses its trailing eos ID. Kept as is because
        # changing it would alter existing encodings.
        if max_length is not None and truncation:
            ids = ids[:max_length]
        return ids

    def _convert_token_to_id(self, token: str) -> int:
        """Return the ID of ``token``, or the unk ID when it is unknown."""
        return self.vocab.get(token, self.vocab[str(self.unk_token)])

    def _convert_id_to_token(self, index: int) -> str:
        """Return the token for ``index``, or the unk token when it is unknown."""
        return self.ids_to_tokens.get(int(index), str(self.unk_token))

    def convert_tokens_to_string(self, tokens: list[str]) -> str:
        """Concatenate tokens without a separator."""
        return "".join(tokens)

    def _required_special_token_id(
        self,
        token_value: int | list[int] | str | list[str] | None,
        token_name: str,
    ) -> int:
        """Resolve a special-token setting to exactly one ID.

        Accepts an ID, a token string, or a one-element list of either.

        Raises:
            ValueError: If the value is ``None`` or does not resolve to a
                single ID.
        """
        if token_value is None:
            raise ValueError(f"{token_name} must be set.")
        if isinstance(token_value, int):
            return token_value
        if isinstance(token_value, str):
            return self._convert_token_to_id(token_value)
        if len(token_value) == 1:
            only_value = token_value[0]
            if isinstance(only_value, int):
                return only_value
            if isinstance(only_value, str):
                return self._convert_token_to_id(only_value)
        raise ValueError(f"{token_name} must resolve to a single token id.")

    def build_inputs_with_special_tokens(
        self,
        token_ids_0: list[int],
        token_ids_1: list[int] | None = None,
    ) -> list[int]:
        """Wrap one or two ID sequences with special tokens.

        Single sequence: ``bos A eos``. Pair: ``bos A eos B eos``.
        """
        bos_id = self._required_special_token_id(self.bos_token, "bos_token")
        eos_id = self._required_special_token_id(self.eos_token, "eos_token")
        if token_ids_1 is None:
            return [bos_id, *token_ids_0, eos_id]
        return [bos_id, *token_ids_0, eos_id, *token_ids_1, eos_id]

    def create_token_type_ids_from_sequences(
        self,
        token_ids_0: list[int],
        token_ids_1: list[int] | None = None,
    ) -> list[int]:
        """Return all-zero token type IDs matching the wrapped input length."""
        return [0] * len(self.build_inputs_with_special_tokens(token_ids_0, token_ids_1))

    def pad(
        self,
        encoded_inputs: Any,
        padding: Any = True,
        max_length: int | None = None,
        pad_to_multiple_of: int | None = None,
        padding_side: str | None = None,
        return_attention_mask: bool | None = None,
        return_tensors: Any = None,
        verbose: bool = True,
    ):
        """Pad a batch, extending any ``labels`` with ``-100`` first.

        When padding is enabled and ``encoded_inputs`` is a list in which at
        least one item has ``labels``, every item's labels are padded to the
        batch target length before the call is delegated to the base class.
        All other arguments are passed through unchanged.
        """
        padding_enabled = padding not in (False, "do_not_pad")
        if (
            padding_enabled
            and isinstance(encoded_inputs, list)
            and any("labels" in item for item in encoded_inputs)
        ):
            # Honour an explicit ``padding_side`` so labels are padded on the
            # same side the base class is asked to pad the inputs on.
            label_side = padding_side if padding_side is not None else self.padding_side
            target_length = max(
                len(item.get("input_ids", item.get("labels", []))) for item in encoded_inputs
            )
            if padding == "max_length" and max_length is not None:
                target_length = max_length
            if pad_to_multiple_of and target_length % pad_to_multiple_of:
                # Round up to the next multiple.
                target_length = ((target_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
            padded_inputs = []
            for item in encoded_inputs:
                item = dict(item)  # copy so the caller's item is not mutated
                labels = list(item.get("labels", []))
                pad_len = max(0, target_length - len(labels))
                if pad_len:
                    label_padding = [-100] * pad_len
                    if label_side == "left":
                        labels = label_padding + labels
                    else:
                        labels = labels + label_padding
                    item["labels"] = labels
                padded_inputs.append(item)
            encoded_inputs = padded_inputs
        return super().pad(
            encoded_inputs,
            padding=padding,
            max_length=max_length,
            pad_to_multiple_of=pad_to_multiple_of,
            padding_side=padding_side,
            return_attention_mask=return_attention_mask,
            return_tensors=return_tensors,
            verbose=verbose,
        )

    def save_vocabulary(
        self,
        save_directory: str,
        filename_prefix: str | None = None,
    ) -> tuple[str, ...]:
        """Write the vocabulary as JSON into ``save_directory``.

        Returns:
            A one-element tuple holding the path of the written file.

        Raises:
            ValueError: If ``save_directory`` is not an existing directory.
        """
        if not os.path.isdir(save_directory):
            raise ValueError(f"Vocabulary path ({save_directory}) should be a directory.")
        vocab_file = Path(save_directory) / (
            f"{filename_prefix}-vocab.json" if filename_prefix else "vocab.json"
        )
        with vocab_file.open("w", encoding="utf-8") as f:
            json.dump(self.vocab, f, ensure_ascii=False, indent=4)
        return (str(vocab_file),)

    def add_tokens_to_vocabulary(self, tokens: list[str]) -> int:
        """Add tokens to the tokenizer vocabulary if they are not already present.

        This is intended for forcing coverage of rare valid molecular primitive
        symbols, especially SELFIES bracket tokens, after APE merge training.

        Tokens are stripped of surrounding whitespace; empty and already-known
        tokens are skipped. New tokens get consecutive IDs after the current
        maximum.

        Returns:
            The number of tokens actually added.
        """
        if not tokens:
            return 0
        next_id = max(self.vocab.values(), default=-1) + 1
        added = 0
        for token in tokens:
            token = str(token).strip()
            if not token:
                continue
            if token in self.vocab:
                continue
            self.vocab[token] = next_id
            next_id += 1
            added += 1
        if added:
            self.update_reverse_vocabulary()
            self._refresh_tokenization_cache()
        return added

    def save_pretrained(self, save_directory: str | os.PathLike[str], *args, **kwargs):
        """Save via the base class, then rewrite two of the JSON side files.

        ``special_tokens_map.json`` is overwritten with the five special
        tokens as plain strings. ``tokenizer_config.json`` loses its
        ``tokenizer_class`` entry and gains ``representation``,
        ``model_max_length`` and an ``auto_map`` pointing at this class.

        Returns:
            Whatever the base ``save_pretrained`` returned.
        """
        saved_files = super().save_pretrained(save_directory, *args, **kwargs)
        save_path = Path(save_directory)
        special_tokens_map = {
            "bos_token": str(self.bos_token),
            "eos_token": str(self.eos_token),
            "unk_token": str(self.unk_token),
            "pad_token": str(self.pad_token),
            "mask_token": str(self.mask_token),
        }
        with (save_path / "special_tokens_map.json").open("w", encoding="utf-8") as f:
            json.dump(special_tokens_map, f, ensure_ascii=False, indent=2)
        tokenizer_config_path = save_path / "tokenizer_config.json"
        if tokenizer_config_path.exists():
            with tokenizer_config_path.open(encoding="utf-8") as f:
                tokenizer_config = json.load(f)
        else:
            tokenizer_config = {}
        tokenizer_config.pop("tokenizer_class", None)
        tokenizer_config.update(
            {
                "representation": self.representation,
                "model_max_length": self.model_max_length,
                "auto_map": {
                    "AutoTokenizer": [
                        "tokenization_ape.APEPreTrainedTokenizer",
                        None,
                    ],
                },
            }
        )
        with tokenizer_config_path.open("w", encoding="utf-8") as f:
            json.dump(tokenizer_config, f, ensure_ascii=False, indent=2)
        return saved_files

    def save_vocabulary_file(self, file_path: str | os.PathLike[str]) -> None:
        """Write the vocabulary to ``file_path`` and frequencies next to it.

        The frequency table goes to ``<stem>_freq.json`` in the same
        directory. Parent directories are created as needed.
        """
        path = Path(file_path)
        path.parent.mkdir(parents=True, exist_ok=True)
        freq_path = path.with_name(f"{path.stem}_freq.json")
        with path.open("w", encoding="utf-8") as f:
            json.dump(self.vocab, f, ensure_ascii=False, indent=4)
        with freq_path.open("w", encoding="utf-8") as f:
            json.dump(self.vocabulary_frequency, f, ensure_ascii=False, indent=4)

    def load_vocabulary_file(
        self,
        file_path: str | os.PathLike[str],
        representation: str | None = None,
    ) -> None:
        """Replace the vocabulary with the one stored in ``file_path``.

        Args:
            file_path: JSON file mapping tokens to IDs.
            representation: Optional new representation to switch to.

        Raises:
            ValueError: If the representation is unsupported, the file content
                is not a valid vocabulary, or a special token is missing. The
                tokenizer is left unchanged in all of these cases.
        """
        new_representation = (
            self.representation
            if representation is None
            else _normalize_representation(representation)
        )
        with open(file_path, encoding="utf-8") as f:
            loaded = json.load(f)
        new_vocab = _coerce_vocab(loaded)
        # Validate the candidate before touching any state, so a bad file
        # cannot leave the tokenizer with a vocabulary lacking its specials.
        self._require_special_tokens(
            bos_token=str(self.bos_token),
            eos_token=str(self.eos_token),
            unk_token=str(self.unk_token),
            pad_token=str(self.pad_token),
            mask_token=str(self.mask_token),
            vocab=new_vocab,
        )
        self.representation = new_representation
        self.vocab = new_vocab
        self.ids_to_tokens = {idx: token for token, idx in self.vocab.items()}
        self._refresh_tokenization_cache()

    def _vocab_from_frequencies(self, vocabulary_frequency: Mapping[str, int]) -> dict[str, int]:
        """Lay out a vocabulary: five specials at IDs 0-4, learned tokens from 5.

        Learned tokens keep the iteration order of ``vocabulary_frequency``.
        """
        return {
            str(self.bos_token): 0,
            str(self.pad_token): 1,
            str(self.eos_token): 2,
            str(self.unk_token): 3,
            str(self.mask_token): 4,
            **{word: idx for idx, word in enumerate(vocabulary_frequency, start=5)},
        }

    def train(
        self,
        corpus,
        type: str = "selfies",
        representation: str | None = None,
        max_vocab_size: int = 5000,
        min_freq_for_merge: int = 2000,
        max_merge_pieces: int | None = 8,
        save_checkpoint: bool = False,
        checkpoint_path: str = "checkpoint",
        checkpoint_interval: int = 500,
    ) -> None:
        """Learn a merge vocabulary from ``corpus`` and install it.

        Each round counts adjacent token pairs over the corpus, merges the
        most frequent pair everywhere, and records the merged token. Training
        stops when the learned token count reaches ``max_vocab_size``, the
        best pair is rarer than ``min_freq_for_merge``, or nothing is left to
        merge.

        Args:
            corpus: Iterable of molecule strings.
            type: Representation name used when ``representation`` is unset.
            representation: Representation to train for; a warning is issued
                when it differs from the tokenizer's current one.
            max_vocab_size: Upper bound on learned tokens (the five special
                tokens are added on top).
            min_freq_for_merge: Minimum pair count required to merge.
            max_merge_pieces: Longest merged token allowed, in primitive
                pieces; ``None`` disables the limit.
            save_checkpoint: Write intermediate vocabularies while training.
            checkpoint_path: Directory receiving checkpoints.
            checkpoint_interval: Learned-token count between checkpoints.

        Raises:
            ValueError: If the corpus is empty or yields no tokens, or if
                checkpointing is enabled with a non-positive interval.
        """
        import warnings

        new_rep = _normalize_representation(representation or type)
        if new_rep != self.representation:
            warnings.warn(
                f"train() representation={new_rep!r} differs from tokenizer "
                f"representation={self.representation!r}. Overwriting.",
                UserWarning,
                stacklevel=2,
            )
        self.representation = new_rep
        if not corpus:
            raise ValueError("Cannot train APE tokenizer on an empty corpus.")
        print(f"Pretokenizing {self.representation}...", flush=True)
        tokenized_corpus = []
        vocabulary_frequency: defaultdict[str, int] = defaultdict(int)
        saw_tokens = False
        for sentence in corpus:
            tokens = self.pre_tokenize(str(sentence))
            if not tokens:
                continue
            saw_tokens = True
            for token in tokens:
                vocabulary_frequency[token] += 1
            # Single-token sequences have no pairs and cannot contribute merges.
            if len(tokens) > 1:
                tokenized_corpus.append(tokens)
        print(
            f"Pretokenization complete, found {len(vocabulary_frequency)} tokens",
            flush=True,
        )
        if not saw_tokens:
            raise ValueError("Cannot train APE tokenizer on an empty corpus.")
        pre_tokens_counts = len(vocabulary_frequency)
        merged_counter = len(vocabulary_frequency) + 1
        if save_checkpoint and checkpoint_interval <= 0:
            raise ValueError(
                "checkpoint_interval must be positive when save_checkpoint is enabled."
            )
        checkpoint_increment = checkpoint_interval
        # Learned-token count at which the next checkpoint is written.
        batch = checkpoint_interval + pre_tokens_counts
        piece_count_cache: dict[str, int] = {}

        def merged_piece_count(token: str) -> int:
            """Memoized primitive-piece count of ``token``."""
            count = piece_count_cache.get(token)
            if count is None:
                count = _base_piece_count(token, self.representation)
                piece_count_cache[token] = count
            return count

        def get_most_common_pair(tokenized):
            """Return ``(pair, count)`` for the most frequent allowed pair."""
            pair_counts: defaultdict[tuple[str, str], int] = defaultdict(int)
            for tokens in tokenized:
                for pair in zip(tokens, tokens[1:]):
                    if (
                        max_merge_pieces is not None
                        and merged_piece_count(pair[0] + pair[1]) > max_merge_pieces
                    ):
                        continue
                    pair_counts[pair] += 1
            self.pair_counts = dict(pair_counts)
            if not pair_counts:
                return ("", ""), 0
            # ``max`` keeps the first maximal item, i.e. ties go to the pair
            # seen earliest.
            return max(pair_counts.items(), key=lambda item: item[1])

        while True:
            if save_checkpoint and len(vocabulary_frequency) >= batch:
                self.vocabulary_frequency = dict(vocabulary_frequency)
                self.vocab = self._vocab_from_frequencies(vocabulary_frequency)
                self.ids_to_tokens = {idx: token for token, idx in self.vocab.items()}
                self._refresh_tokenization_cache()
                checkpoint_dir = Path(checkpoint_path)
                checkpoint_dir.mkdir(parents=True, exist_ok=True)
                self.save_vocabulary_file(checkpoint_dir / f"checkpoint_{batch}.json")
                self.save_pretrained(str(checkpoint_dir / f"checkpoint_{batch}"))
                print(f"Checkpoint saved at {checkpoint_dir}/checkpoint_{batch}.json")
                batch += checkpoint_increment
            if len(vocabulary_frequency) >= max_vocab_size:
                print("Max vocabulary achieved", flush=True)
                break
            if not tokenized_corpus:
                print("No more mergeable pairs", flush=True)
                break
            most_common_pair, freq = get_most_common_pair(tokenized_corpus)
            if freq < min_freq_for_merge:
                print("Not enough frequency found", flush=True)
                break
            if not most_common_pair[0] or not most_common_pair[1]:
                print("No valid merge pair found", flush=True)
                break
            left_token, right_token = most_common_pair
            merged_word = left_token + right_token
            if merged_word not in vocabulary_frequency:
                print(
                    f"New merge found: {merged_word} {merged_counter}/{max_vocab_size} "
                    f"{round(merged_counter / max_vocab_size * 100, 2)}%",
                    flush=True,
                )
                merged_counter += 1
            vocabulary_frequency[merged_word] += freq
            # Rewrite the corpus with every occurrence of the pair merged.
            new_tokenized_corpus = []
            for tokens in tokenized_corpus:
                new_tokens = []
                append_token = new_tokens.append
                i = 0
                token_count = len(tokens)
                while i < token_count:
                    if (
                        i < token_count - 1
                        and tokens[i] == left_token
                        and tokens[i + 1] == right_token
                    ):
                        append_token(merged_word)
                        i += 2
                    else:
                        append_token(tokens[i])
                        i += 1
                # Fully merged sequences have no pairs left; drop them.
                if len(new_tokens) > 1:
                    new_tokenized_corpus.append(new_tokens)
            tokenized_corpus = new_tokenized_corpus
        self.vocabulary_frequency = dict(vocabulary_frequency)
        self.vocab = self._vocab_from_frequencies(vocabulary_frequency)
        self.ids_to_tokens = {idx: token for token, idx in self.vocab.items()}
        self._refresh_tokenization_cache()

    def train_from_iterator(self, iterator, *args, **kwargs) -> None:
        """Not supported; always raises ``NotImplementedError``."""
        raise NotImplementedError("train_from_iterator is not implemented for APE")


# Make ``save_pretrained`` / ``AutoTokenizer`` aware of this custom class.
APEPreTrainedTokenizer.register_for_auto_class("AutoTokenizer")

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,58 @@

+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<pad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "4": {
+      "content": "<mask>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "auto_map": {
+    "AutoTokenizer": [
+      "tokenization_ape.APEPreTrainedTokenizer",
+      null
+    ]
+  },
+  "backend": "custom",
+  "bos_token": "<s>",
+  "eos_token": "</s>",
+  "mask_token": "<mask>",
+  "model_max_length": 256,
+  "pad_token": "<pad>",
+  "representation": "SMILES",
+  "unk_token": "<unk>"
+}

vocab.json ADDED Viewed

	@@ -0,0 +1,1388 @@

+{
+    "#": 18,
+    "#N)": 643,
+    "%10": 81,
+    "%11": 91,
+    "%12": 92,
+    "%13": 93,
+    "%14": 94,
+    "%15": 95,
+    "%16": 96,
+    "%17": 97,
+    "%18": 98,
+    "%19": 99,
+    "%20": 100,
+    "%21": 101,
+    "%22": 102,
+    "%23": 103,
+    "%24": 104,
+    "%25": 105,
+    "%26": 106,
+    "%27": 250,
+    "(": 9,
+    "(-": 420,
+    "(-c2cc": 993,
+    "(-c3cc": 1123,
+    "(-c4cc": 1016,
+    "(/C=C/": 1255,
+    "(=O)": 288,
+    "(=O)=O": 842,
+    "(=O)CC": 1181,
+    "(=O)N": 321,
+    "(=O)N(": 1384,
+    "(=O)N1": 607,
+    "(=O)N2": 690,
+    "(=O)O": 908,
+    "(=O)O)": 637,
+    "(=O)[O-]": 397,
+    "(=O)o": 1083,
+    "(C": 309,
+    "(C#": 1018,
+    "(C#N)": 829,
+    "(C(=O)": 515,
+    "(C(F)": 664,
+    "(C)": 287,
+    "(C)(C)": 1194,
+    "(C)C": 518,
+    "(C)C)": 369,
+    "(C)CC": 394,
+    "(C)CCC": 1325,
+    "(C)N": 1324,
+    "(C)O": 557,
+    "(C)O)": 912,
+    "(C)c(": 1351,
+    "(C2": 1046,
+    "(C3)": 1292,
+    "(CC": 324,
+    "(CC(C": 961,
+    "(CC)": 952,
+    "(CCCC": 824,
+    "(CCN": 1041,
+    "(CN": 865,
+    "(CO": 868,
+    "(CO)": 653,
+    "(Cc1cc": 988,
+    "(Cc2cc": 726,
+    "(Cc3cc": 923,
+    "(Cl)": 746,
+    "(F)": 315,
+    "(F)(F)": 812,
+    "(F)F": 498,
+    "(F)F)": 361,
+    "(F)F.": 1079,
+    "(F)c(": 992,
+    "(N": 398,
+    "(N)": 574,
+    "(O": 764,
+    "(O)": 316,
+    "(O)CC": 1127,
+    "(O)O": 765,
+    "(O)O)": 980,
+    "(OC)": 748,
+    "(OC)c1": 1376,
+    "(S(=O)": 1210,
+    "([O-])": 570,
+    "(c1cc": 793,
+    "(c2cc": 632,
+    "(c3cc": 1011,
+    ")": 11,
+    ")C(=O)": 990,
+    ")CC": 668,
+    ")N": 350,
+    ")N1": 525,
+    ")N1CC": 818,
+    ")Nc1cc": 941,
+    ")c1": 771,
+    ")c1cc": 780,
+    ")cc": 285,
+    ")cc(": 530,
+    ")cc1": 303,
+    ")cc1)": 848,
+    ")cc12": 974,
+    ")cc2": 491,
+    ")cc2)": 554,
+    ")cc3": 858,
+    ")cc3)": 752,
+    ")ccc1": 507,
+    ")ccc1O": 1182,
+    ")ccn1": 1166,
+    "-": 29,
+    "-2": 1000,
+    "-c1cc": 471,
+    "-c1cn": 1289,
+    "-c2cc": 345,
+    "-c2cc(": 608,
+    "-c2cc3": 914,
+    "-c2cn": 710,
+    "-c2n": 564,
+    "-c2nc(": 1030,
+    "-c3cc": 381,
+    "-c3cn": 1153,
+    "-c3n": 1352,
+    "-c4cc": 869,
+    "-n2": 602,
+    ".": 23,
+    ".Cl": 470,
+    ".Cl.Cl": 1086,
+    ".[Br-]": 998,
+    ".[Cl-]": 1217,
+    ".[I-]": 1296,
+    ".[Na+]": 891,
+    "/": 19,
+    "/C": 346,
+    "/C(": 424,
+    "/C(=C/": 732,
+    "/C(=C\\": 702,
+    "/C(=N/": 1066,
+    "/C(=N\\": 755,
+    "/C(C)": 853,
+    "/C1": 955,
+    "/C=C": 809,
+    "/C=C(\\": 828,
+    "/C=C/": 390,
+    "/C=C2\\": 968,
+    "/C=C\\": 657,
+    "/C=N/": 794,
+    "/C=N/N": 1108,
+    "/N": 439,
+    "/N=C(\\": 1149,
+    "/N=C/": 614,
+    "/c(": 1126,
+    "1": 8,
+    "1)": 401,
+    "1)c1cc": 798,
+    "1C": 1023,
+    "1CC": 342,
+    "1CCC(": 542,
+    "1CCC[C@H]1": 1319,
+    "1CCN(": 932,
+    "2": 12,
+    "2)": 291,
+    "2)C1": 620,
+    "2)CC1": 436,
+    "2)c1": 407,
+    "2)c1=O": 1245,
+    "2)cc(": 1258,
+    "2)cc1": 358,
+    "2)cc1)": 1349,
+    "2)ccc1": 982,
+    "2)n1": 813,
+    "2C": 1196,
+    "2CC": 351,
+    "2CC2)": 969,
+    "2CC3CC": 1109,
+    "2CCC(": 604,
+    "2CCCC": 548,
+    "2CCN(": 618,
+    "2CCO": 897,
+    "2c(": 671,
+    "2c(cc1": 839,
+    "3": 13,
+    "3)": 302,
+    "3)C1": 1374,
+    "3)CC1": 956,
+    "3)c1": 859,
+    "3)cc": 379,
+    "3)cc1": 481,
+    "3)cc12": 1334,
+    "3)cc2": 572,
+    "3)cc2)": 679,
+    "3)cc21": 1287,
+    "3)ccc2": 1309,
+    "3)n": 404,
+    "3)n2)": 1313,
+    "3C(=O)": 1002,
+    "3C)": 716,
+    "3CC": 377,
+    "3CC3)": 1055,
+    "3CC4CC": 1238,
+    "3CCC(": 938,
+    "3CCCC": 535,
+    "3CCN": 861,
+    "3CCN(": 694,
+    "3CCOCC": 592,
+    "4": 14,
+    "4)": 336,
+    "4)CC": 841,
+    "4)CC3)": 1064,
+    "4)c3": 901,
+    "4)c3)": 1306,
+    "4)cc": 417,
+    "4)cc2": 1084,
+    "4)cc3": 766,
+    "4)cc3)": 749,
+    "4)ccc3": 1314,
+    "4CC": 426,
+    "4CC4)": 763,
+    "4CCCC": 756,
+    "4CCN": 1338,
+    "4CCN(": 1340,
+    "4CCOCC": 745,
+    "5": 35,
+    "5)": 432,
+    "5)CC": 1299,
+    "5)cc": 656,
+    "5CC": 591,
+    "5CCCC": 1382,
+    "5CCOCC": 1214,
+    "6": 48,
+    "6)": 819,
+    "6CC": 1283,
+    "7": 49,
+    "8": 50,
+    "9": 80,
+    "</s>": 2,
+    "<mask>": 4,
+    "<pad>": 1,
+    "<s>": 0,
+    "<unk>": 3,
+    "=": 17,
+    "=C": 329,
+    "=C(": 306,
+    "=C(/": 1075,
+    "=C(C)": 556,
+    "=C(N)": 1231,
+    "=C(\\": 651,
+    "=C/": 355,
+    "=C1": 595,
+    "=C1\\": 1035,
+    "=C2": 566,
+    "=C2\\": 843,
+    "=C3": 1288,
+    "=CC": 734,
+    "=C\\": 469,
+    "=N": 364,
+    "=N)": 1356,
+    "=N/": 562,
+    "=N/N": 881,
+    "=N\\": 578,
+    "=O": 356,
+    "=O)": 278,
+    "=O)CC": 1169,
+    "=O)c1": 1124,
+    "=O)cc": 1071,
+    "=O)cc1": 727,
+    "=O)cc2": 1261,
+    "=O)n1": 1308,
+    "=S)": 918,
+    "=[N+]": 1199,
+    "B": 54,
+    "Br": 20,
+    "Br)": 461,
+    "Br)cc": 623,
+    "Br)cc1": 697,
+    "Br)cc2": 1133,
+    "Br)cc2)": 1146,
+    "Br.": 1280,
+    "C": 5,
+    "C#": 368,
+    "C#C": 1092,
+    "C#N": 551,
+    "C#N)": 433,
+    "C#N)cc": 1053,
+    "C(": 279,
+    "C(=": 431,
+    "C(=N)": 584,
+    "C(=N)N": 831,
+    "C(=O)": 283,
+    "C(=O)C": 698,
+    "C(=O)N": 296,
+    "C(=O)O": 396,
+    "C(=S": 539,
+    "C(=S)": 1129,
+    "C(=S)N": 597,
+    "C(C": 1304,
+    "C(C#N)": 1236,
+    "C(C)": 318,
+    "C(C)C": 768,
+    "C(C)C)": 422,
+    "C(CC": 1051,
+    "C(Cl)": 1277,
+    "C(F)": 325,
+    "C(F)F)": 1031,
+    "C(N": 758,
+    "C(N)": 435,
+    "C(N)=N": 1327,
+    "C(N)=O": 1020,
+    "C(O)": 510,
+    "C(c1cc": 870,
+    "C(c2cc": 777,
+    "C(c3cc": 1226,
+    "C)": 281,
+    "C)CC": 1274,
+    "C)c(": 964,
+    "C)cc": 1233,
+    "C)cc1": 571,
+    "C)cc2": 1034,
+    "C)cc2)": 1070,
+    "C)cc3)": 1122,
+    "C/C=C\\": 1364,
+    "C1": 300,
+    "C1(": 1058,
+    "C1)": 508,
+    "C1)C2": 1229,
+    "C1)N": 1271,
+    "C1=": 862,
+    "C1=C(": 786,
+    "C1=N": 834,
+    "C1=O": 484,
+    "C1=O)": 773,
+    "C1CC": 523,
+    "C1CC1": 790,
+    "C1CC1)": 1278,
+    "C1CCCC": 559,
+    "C1CCN(": 706,
+    "C1c1cc": 1320,
+    "C2": 312,
+    "C2(": 627,
+    "C2)": 392,
+    "C2)C1": 1097,
+    "C2)CC1": 1062,
+    "C2)c1": 778,
+    "C2)cc1": 573,
+    "C2)n1": 1183,
+    "C2=": 1163,
+    "C2=C(": 920,
+    "C2=N": 810,
+    "C2=O": 696,
+    "C2=O)": 509,
+    "C2CC": 601,
+    "C2CC2)": 837,
+    "C2CCCC": 645,
+    "C2CCN(": 948,
+    "C3": 349,
+    "C3(CC": 1154,
+    "C3)": 441,
+    "C3)cc": 970,
+    "C3)cc1": 1078,
+    "C3)n": 883,
+    "C3=C(": 1385,
+    "C3=O)": 648,
+    "C3CC": 669,
+    "C3CC3)": 871,
+    "C3CCCC": 774,
+    "C4": 500,
+    "C4)": 563,
+    "C4=O)": 1132,
+    "C4CC": 1114,
+    "C4CC4)": 1076,
+    "C5": 801,
+    "C5)": 935,
+    "C=": 418,
+    "C=C": 665,
+    "C=C(": 776,
+    "C=C(C)": 913,
+    "C=C1": 808,
+    "C=C2": 1311,
+    "C=CC": 585,
+    "C=O)": 1152,
+    "CC": 274,
+    "CC#": 1189,
+    "CC(": 462,
+    "CC(=O)": 409,
+    "CC(C)": 343,
+    "CC(C)=": 1305,
+    "CC(C)N": 1138,
+    "CC(C)O": 1150,
+    "CC(N": 1110,
+    "CC(N)": 1336,
+    "CC(O)": 772,
+    "CC)": 806,
+    "CC1": 313,
+    "CC1(": 1088,
+    "CC1(C)": 826,
+    "CC1)": 493,
+    "CC1=C(": 1013,
+    "CC1CC": 1329,
+    "CC2": 449,
+    "CC2)": 373,
+    "CC2)C1": 1165,
+    "CC2)c1": 631,
+    "CC2)n1": 978,
+    "CC2CC": 1348,
+    "CC3": 1021,
+    "CC3)": 465,
+    "CC3)cc": 1221,
+    "CC3)n": 1027,
+    "CC4)": 720,
+    "CCC": 543,
+    "CCC(": 641,
+    "CCC(C)": 904,
+    "CCC1": 691,
+    "CCC2": 1007,
+    "CCCC": 298,
+    "CCCC(": 1307,
+    "CCCC1": 677,
+    "CCCC2": 893,
+    "CCCCC": 1112,
+    "CCCCC1": 715,
+    "CCCCCC": 565,
+    "CCCCCN": 1253,
+    "CCCCN": 621,
+    "CCCCN)": 1268,
+    "CCCCN1": 1223,
+    "CCCCO": 1026,
+    "CCCN": 459,
+    "CCCN1": 972,
+    "CCCN2": 1372,
+    "CCCO": 739,
+    "CCN": 301,
+    "CCN(": 344,
+    "CCN(C": 446,
+    "CCN(C)": 709,
+    "CCN(CC": 448,
+    "CCN1": 472,
+    "CCN1CC": 1054,
+    "CCN2": 719,
+    "CCNCC": 1087,
+    "CCO": 317,
+    "CCO)": 1328,
+    "CCOCC": 428,
+    "CCOCC1": 681,
+    "CCOCCO": 889,
+    "CCS": 852,
+    "CC[C@@H]1": 1230,
+    "CC[C@H](": 1369,
+    "CC[C@H](C)": 1218,
+    "CC[C@H]1": 1069,
+    "CCc1cc": 659,
+    "CCc1n": 940,
+    "CCc2cc": 977,
+    "CCc3cc": 1375,
+    "CCn1": 534,
+    "CCn1c(": 1224,
+    "CN": 295,
+    "CN(": 391,
+    "CN(C": 541,
+    "CN(C)": 444,
+    "CN(C)C": 1276,
+    "CN(CC": 646,
+    "CN)": 1107,
+    "CN1": 395,
+    "CN1CC": 647,
+    "CN2": 479,
+    "CN2CC": 898,
+    "CN3": 1293,
+    "CN3CC": 1208,
+    "CNCC": 1113,
+    "CO": 294,
+    "CO)": 489,
+    "CO1": 1318,
+    "COC": 1343,
+    "COC1": 1157,
+    "COCC": 888,
+    "COCCN": 1174,
+    "COCCN1": 1264,
+    "COCCO": 1171,
+    "CO[C@H]1": 1284,
+    "COc1c(": 1275,
+    "COc1cc": 340,
+    "COc1n": 1377,
+    "COc2cc": 791,
+    "COc3cc": 1065,
+    "CS": 413,
+    "CS(=O)": 580,
+    "CSc1n": 930,
+    "CSc2n": 1219,
+    "C[C@@H]": 339,
+    "C[C@@H](": 502,
+    "C[C@@H](C)": 840,
+    "C[C@@H](N": 877,
+    "C[C@@H](N)": 1367,
+    "C[C@@H](O)": 626,
+    "C[C@@H]1": 482,
+    "C[C@@H]1CC": 924,
+    "C[C@@H]1CN(": 1193,
+    "C[C@@H]2": 579,
+    "C[C@@H]3": 700,
+    "C[C@@H]4": 1029,
+    "C[C@@]1": 963,
+    "C[C@@]2": 1273,
+    "C[C@H]": 335,
+    "C[C@H](": 473,
+    "C[C@H](C)": 872,
+    "C[C@H](N": 799,
+    "C[C@H](N)": 1004,
+    "C[C@H](O)": 638,
+    "C[C@H]1": 460,
+    "C[C@H]1CC": 1036,
+    "C[C@H]1CN(": 1357,
+    "C[C@H]1O[C@@H](": 1074,
+    "C[C@H]2": 550,
+    "C[C@H]3": 685,
+    "C[C@H]4": 1151,
+    "C[C@]1": 851,
+    "C[C@]12CC": 1341,
+    "Cc1": 378,
+    "Cc1c(": 576,
+    "Cc1c(-": 1345,
+    "Cc1c[nH]": 1365,
+    "Cc1cc": 326,
+    "Cc1cc(": 490,
+    "Cc1cc2": 999,
+    "Cc1cn": 619,
+    "Cc1cs": 1252,
+    "Cc1n": 438,
+    "Cc1nc(": 792,
+    "Cc1no": 1072,
+    "Cc2c(": 1073,
+    "Cc2cc": 425,
+    "Cc2cc(": 1005,
+    "Cc3cc": 501,
+    "Cc4cc": 692,
+    "Cl": 22,
+    "Cl)": 320,
+    "Cl)c(": 775,
+    "Cl)c(Cl)": 667,
+    "Cl)cc": 347,
+    "Cl)cc(": 1290,
+    "Cl)cc1": 434,
+    "Cl)cc1)": 895,
+    "Cl)cc1Cl": 1344,
+    "Cl)cc2": 599,
+    "Cl)cc2)": 589,
+    "Cl)cc3": 811,
+    "Cl)cc3)": 725,
+    "Cl)ccc1": 934,
+    "Cl)ccc2": 835,
+    "Cl)ccc3": 1204,
+    "Cl.": 717,
+    "Clc1cc": 1256,
+    "Cn1": 399,
+    "Cn1c(": 890,
+    "Cn1cc": 1162,
+    "Cn1cc(": 762,
+    "Cn1cn": 926,
+    "Cn1nc(": 1337,
+    "Cn2": 567,
+    "Cn3": 1240,
+    "F": 15,
+    "F)": 289,
+    "F)c(": 954,
+    "F)c(Cl)": 1254,
+    "F)c(F)": 973,
+    "F)cc": 333,
+    "F)cc(": 1177,
+    "F)cc1": 423,
+    "F)cc1)": 728,
+    "F)cc1F": 1281,
+    "F)cc2": 568,
+    "F)cc2)": 546,
+    "F)cc3": 705,
+    "F)cc3)": 605,
+    "F)cc4": 1167,
+    "F)cc4)": 986,
+    "F)ccc1": 1008,
+    "F)ccc2": 1176,
+    "F)ccc3": 1373,
+    "FC(F)": 1049,
+    "Fc1cc": 1197,
+    "I": 40,
+    "I)": 1212,
+    "N": 10,
+    "N#": 476,
+    "N#C": 770,
+    "N(": 704,
+    "N(C": 823,
+    "N(C)": 654,
+    "N(C)C)": 937,
+    "N(CC": 795,
+    "N)": 311,
+    "N)cc": 1232,
+    "N)cc1": 1301,
+    "N)ncn": 817,
+    "N1": 427,
+    "N1CC": 864,
+    "N1CCN(": 900,
+    "N2": 445,
+    "N2CC": 1024,
+    "N2CCCC": 1155,
+    "N2CCN(": 945,
+    "N2CCO": 1259,
+    "N3": 1170,
+    "N4": 1068,
+    "N=C(": 558,
+    "N=C(N)": 789,
+    "NC": 984,
+    "NC(": 1160,
+    "NC(=O)": 314,
+    "NC(C)": 894,
+    "NCC": 633,
+    "NCc1cc": 1216,
+    "NS(=O)": 544,
+    "Nc1cc": 673,
+    "Nc1n": 549,
+    "Nc1nc(": 959,
+    "Nc2cc": 784,
+    "Nc2n": 1118,
+    "O": 6,
+    "O)": 275,
+    "O)CC": 827,
+    "O)c(": 887,
+    "O)c(O)": 850,
+    "O)c1": 921,
+    "O)c1cc": 1209,
+    "O)cc": 814,
+    "O)cc1": 516,
+    "O)cc1)": 1121,
+    "O)cc2": 804,
+    "O)cc2)": 838,
+    "O)cc3": 1366,
+    "O)cc3)": 1111,
+    "O1": 617,
+    "O2": 1042,
+    "O2)": 997,
+    "O3)": 1302,
+    "O=": 374,
+    "O=C(": 323,
+    "O=C(C": 1361,
+    "O=C(CC": 1200,
+    "O=C(CO": 1105,
+    "O=C(CS": 1312,
+    "O=C(N": 385,
+    "O=C(O)": 415,
+    "O=C1": 450,
+    "O=C1N": 879,
+    "O=[N+]([O-])": 821,
+    "O=c1": 693,
+    "O=c1[nH]": 1213,
+    "OC": 360,
+    "OC(": 701,
+    "OC(=O)": 467,
+    "OC(C)": 475,
+    "OC(F)": 561,
+    "OC)": 328,
+    "OC)c(": 1103,
+    "OC)c1": 569,
+    "OC)cc1": 906,
+    "OC1": 922,
+    "OC2": 1235,
+    "OCC": 376,
+    "OCC(O)": 1342,
+    "OCC)": 917,
+    "OCCCC": 899,
+    "OCCCN": 1350,
+    "OCCN": 830,
+    "OCCO": 629,
+    "OCCO2": 1381,
+    "OCO": 511,
+    "OCO4)": 1244,
+    "OCc1cc": 892,
+    "OCc2cc": 724,
+    "OCc3cc": 874,
+    "OO": 1360,
+    "O[C@@H](": 708,
+    "O[C@@H]1": 1038,
+    "O[C@@H]2": 1315,
+    "O[C@H](": 1242,
+    "O[C@H](CO)": 929,
+    "O[C@H]1": 958,
+    "O[C@H]2": 1333,
+    "Oc1cc": 642,
+    "Oc2cc": 644,
+    "Oc3cc": 753,
+    "Oc4cc": 1285,
+    "P": 43,
+    "P(=O)": 524,
+    "P(=O)(": 1243,
+    "S": 21,
+    "S(=O)": 319,
+    "S(C)": 521,
+    "S(N)": 723,
+    "S)": 652,
+    "S)N": 1317,
+    "S1": 943,
+    "S2": 1362,
+    "SC": 1291,
+    "SCC": 612,
+    "Sc2cc": 1363,
+    "[10B]": 228,
+    "[11C@@H]": 246,
+    "[11CH2]": 203,
+    "[11CH3]": 60,
+    "[11C]": 117,
+    "[11c]": 112,
+    "[123I-]": 253,
+    "[123I]": 56,
+    "[123Te]": 155,
+    "[124I-]": 265,
+    "[124I]": 113,
+    "[125I-]": 143,
+    "[125I]": 57,
+    "[127I]": 244,
+    "[127Xe]": 252,
+    "[129Xe]": 237,
+    "[131Cs]": 222,
+    "[131I-]": 269,
+    "[131I]": 68,
+    "[133Xe]": 263,
+    "[13CH3]": 224,
+    "[13CH]": 219,
+    "[13C]": 140,
+    "[13cH]": 164,
+    "[13c]": 163,
+    "[14C@@H]": 259,
+    "[14C@@]": 245,
+    "[14C@H]": 248,
+    "[14CH2]": 146,
+    "[14CH3]": 147,
+    "[14CH]": 154,
+    "[14C]": 121,
+    "[14cH]": 132,
+    "[14c]": 206,
+    "[15nH]": 165,
+    "[15n]": 114,
+    "[17F]": 179,
+    "[18F-]": 239,
+    "[18F]": 59,
+    "[18OH]": 205,
+    "[18O]": 258,
+    "[19F]": 122,
+    "[211At]": 231,
+    "[223Ra]": 162,
+    "[22Na+]": 160,
+    "[2H]": 69,
+    "[32PH]": 210,
+    "[32P]": 201,
+    "[35S]": 151,
+    "[3H]": 77,
+    "[42K+]": 157,
+    "[45Ca+2]": 148,
+    "[47Ca+2]": 166,
+    "[68Ga+3]": 220,
+    "[73Se]": 200,
+    "[75Se]": 267,
+    "[76Br]": 128,
+    "[81Kr]": 266,
+    "[82Rb+]": 255,
+    "[82Rb]": 226,
+    "[85Sr+2]": 150,
+    "[89Sr+2]": 260,
+    "[Ag+]": 159,
+    "[Ag-4]": 268,
+    "[Ag]": 123,
+    "[Al+3]": 184,
+    "[Al]": 186,
+    "[Ar]": 214,
+    "[As+]": 158,
+    "[As-]": 241,
+    "[AsH3]": 257,
+    "[AsH]": 227,
+    "[As]": 66,
+    "[At]": 235,
+    "[B-]": 76,
+    "[B@-]": 170,
+    "[B@@-]": 172,
+    "[BH-]": 204,
+    "[BH2-]": 167,
+    "[BH3-]": 111,
+    "[B]": 139,
+    "[Ba+2]": 133,
+    "[Ba]": 236,
+    "[Be+2]": 202,
+    "[Bi+3]": 141,
+    "[Bi]": 131,
+    "[Br+2]": 207,
+    "[Br-]": 47,
+    "[C+]": 79,
+    "[C-]": 90,
+    "[C@@H]": 24,
+    "[C@@H](": 365,
+    "[C@@H](C": 488,
+    "[C@@H](C)": 456,
+    "[C@@H](CC": 582,
+    "[C@@H](CO": 1359,
+    "[C@@H](CO)": 849,
+    "[C@@H](N": 1144,
+    "[C@@H](N)": 910,
+    "[C@@H](O": 857,
+    "[C@@H](O)": 419,
+    "[C@@H]1": 375,
+    "[C@@H]12": 951,
+    "[C@@H]1CC": 609,
+    "[C@@H]1O": 581,
+    "[C@@H]2": 406,
+    "[C@@H]2CC": 660,
+    "[C@@H]2O": 722,
+    "[C@@H]3": 477,
+    "[C@@H]3CC": 713,
+    "[C@@H]4": 613,
+    "[C@@H]4CC": 1120,
+    "[C@@H]4[C@@]5": 1178,
+    "[C@@H]5": 1047,
+    "[C@@H]5CC": 1096,
+    "[C@@]": 25,
+    "[C@@](C)": 721,
+    "[C@@](O)": 1207,
+    "[C@@]1": 532,
+    "[C@@]12": 1383,
+    "[C@@]2": 512,
+    "[C@@]2(C)": 855,
+    "[C@@]3": 615,
+    "[C@@]3(C)": 1139,
+    "[C@@]4": 767,
+    "[C@@]4(C)": 1370,
+    "[C@@]5": 942,
+    "[C@H]": 31,
+    "[C@H](": 357,
+    "[C@H](C": 453,
+    "[C@H](C)": 443,
+    "[C@H](C)CC": 1006,
+    "[C@H](CC": 596,
+    "[C@H](CO": 947,
+    "[C@H](CO)": 600,
+    "[C@H](N": 1040,
+    "[C@H](N)": 966,
+    "[C@H](O": 854,
+    "[C@H](O)": 412,
+    "[C@H]1": 383,
+    "[C@H]1CC": 547,
+    "[C@H]1CC[C@H](": 1227,
+    "[C@H]1CC[C@H]2": 1248,
+    "[C@H]1O": 655,
+    "[C@H]2": 402,
+    "[C@H]2CC": 683,
+    "[C@H]2O)": 1028,
+    "[C@H]3": 480,
+    "[C@H]3CC": 703,
+    "[C@H]3O)": 1239,
+    "[C@H]4": 666,
+    "[C@H]4CC": 1148,
+    "[C@H]5": 1134,
+    "[C@]": 32,
+    "[C@](C)": 769,
+    "[C@](O)": 1206,
+    "[C@]1": 468,
+    "[C@]1(C)": 1279,
+    "[C@]12": 712,
+    "[C@]12C": 1032,
+    "[C@]2": 494,
+    "[C@]2(": 1246,
+    "[C@]2(C)": 757,
+    "[C@]3": 552,
+    "[C@]3(C)": 781,
+    "[C@]4": 635,
+    "[C@]4(C)": 981,
+    "[C@]43C)": 1185,
+    "[C@]5": 1045,
+    "[CH-]": 125,
+    "[CH2]": 185,
+    "[CH3]": 187,
+    "[CH]": 264,
+    "[C]": 188,
+    "[Ca+2]": 89,
+    "[CaH2]": 233,
+    "[Ca]": 193,
+    "[Cl+2]": 199,
+    "[Cl+3]": 53,
+    "[Cl+]": 119,
+    "[Cl-]": 38,
+    "[Cl]": 108,
+    "[Cs+]": 67,
+    "[Cs]": 229,
+    "[F-]": 84,
+    "[H+]": 127,
+    "[H-]": 230,
+    "[He]": 86,
+    "[I+2]": 249,
+    "[I+3]": 251,
+    "[I+]": 129,
+    "[I-]": 55,
+    "[I]": 152,
+    "[K+]": 65,
+    "[KH]": 189,
+    "[K]": 254,
+    "[Kr]": 256,
+    "[Li+]": 83,
+    "[LiH]": 192,
+    "[Li]": 223,
+    "[Mg+2]": 61,
+    "[Mg+]": 234,
+    "[MgH2]": 195,
+    "[Mg]": 194,
+    "[N+]": 27,
+    "[N+](=O)[O-]": 403,
+    "[N+](C)": 902,
+    "[N+]([O-])": 785,
+    "[N-]": 34,
+    "[N@+]": 75,
+    "[N@@+]": 82,
+    "[N@@]": 120,
+    "[N@H+]": 240,
+    "[N@]": 72,
+    "[NH+]": 178,
+    "[NH-]": 107,
+    "[NH2+]": 177,
+    "[NH3+]": 238,
+    "[NH4+]": 211,
+    "[NH]": 136,
+    "[N]": 197,
+    "[Na+]": 39,
+    "[NaH]": 183,
+    "[Na]": 216,
+    "[O+]": 116,
+    "[O-2]": 217,
+    "[O-]": 28,
+    "[O-])": 528,
+    "[OH+]": 182,
+    "[OH-]": 130,
+    "[OH3+]": 221,
+    "[OH]": 175,
+    "[O]": 74,
+    "[P+]": 85,
+    "[P-]": 115,
+    "[P@+]": 137,
+    "[P@@+]": 196,
+    "[P@@]": 62,
+    "[P@]": 58,
+    "[PH2+]": 247,
+    "[PH2]": 168,
+    "[PH]": 109,
+    "[Ra]": 156,
+    "[Rb+]": 171,
+    "[Rb]": 262,
+    "[S+]": 42,
+    "[S+]([O-])": 1300,
+    "[S-2]": 144,
+    "[S-]": 52,
+    "[S@+]": 51,
+    "[S@@+]": 73,
+    "[S@@]": 46,
+    "[S@]": 138,
+    "[SH+]": 213,
+    "[SH-]": 169,
+    "[SH2]": 242,
+    "[SH]": 126,
+    "[S]": 142,
+    "[Sb]": 174,
+    "[Se+]": 190,
+    "[SeH2]": 270,
+    "[SeH]": 110,
+    "[Se]": 33,
+    "[Si-]": 271,
+    "[Si@]": 232,
+    "[SiH-]": 161,
+    "[SiH2]": 243,
+    "[SiH3-]": 181,
+    "[SiH4]": 218,
+    "[SiH]": 78,
+    "[Si]": 45,
+    "[Sr+2]": 134,
+    "[Sr]": 272,
+    "[TeH2]": 198,
+    "[TeH]": 225,
+    "[Te]": 63,
+    "[Xe]": 261,
+    "[Yb]": 173,
+    "[Zn+2]": 124,
+    "[Zn+]": 176,
+    "[Zn-2]": 212,
+    "[Zn]": 64,
+    "[b-]": 135,
+    "[c+]": 153,
+    "[c-]": 88,
+    "[cH-]": 118,
+    "[c]": 215,
+    "[n+]": 30,
+    "[n+]1": 797,
+    "[n+]2": 1077,
+    "[n-]": 70,
+    "[nH+]": 191,
+    "[nH]": 37,
+    "[nH]1": 730,
+    "[nH]2)": 1205,
+    "[nH]2)cc1": 1322,
+    "[nH]c(": 844,
+    "[nH]c(-": 1332,
+    "[nH]c(=O)": 695,
+    "[nH]c2c1": 1137,
+    "[nH]c3cc": 1251,
+    "[o+]": 87,
+    "[s+]": 44,
+    "[se+]": 145,
+    "[se]": 71,
+    "[te+]": 149,
+    "[te]": 180,
+    "\\": 41,
+    "b": 209,
+    "c": 7,
+    "c(": 277,
+    "c(-": 359,
+    "c(-c5": 1211,
+    "c(=O)": 348,
+    "c(=O)[nH]": 517,
+    "c(=O)o": 979,
+    "c(Br)": 1057,
+    "c(C#N)": 836,
+    "c(C(F)": 625,
+    "c(C)": 380,
+    "c(C)c1": 598,
+    "c(C)cc": 1346,
+    "c(CC": 1063,
+    "c(CN": 707,
+    "c(Cl)": 430,
+    "c(Cl)c1": 867,
+    "c(F)": 442,
+    "c(F)c1": 761,
+    "c(N": 362,
+    "c(N)": 540,
+    "c(O": 689,
+    "c(O)": 414,
+    "c(O)c(": 1321,
+    "c(O)c1": 1116,
+    "c(OC)": 416,
+    "c(OCC": 1270,
+    "c(S": 1014,
+    "c(cc1": 1265,
+    "c(cc2": 1056,
+    "c(cc3": 1249,
+    "c1": 276,
+    "c1)": 366,
+    "c1)N": 1247,
+    "c1)OCO": 928,
+    "c1-": 650,
+    "c12": 455,
+    "c12)": 1358,
+    "c1=O": 526,
+    "c1=O)": 1260,
+    "c1C": 805,
+    "c1Cl": 1142,
+    "c1N": 1326,
+    "c1O": 670,
+    "c1[nH]": 1104,
+    "c1c(": 495,
+    "c1c(-": 1257,
+    "c1c(C)": 803,
+    "c1c(N": 1298,
+    "c1c(O)": 916,
+    "c1c2c(": 1316,
+    "c1c[nH]": 744,
+    "c1cc": 282,
+    "c1cc(": 372,
+    "c1cc(-": 875,
+    "c1cc(C": 1330,
+    "c1cc(N": 946,
+    "c1cc2": 522,
+    "c1ccc(": 292,
+    "c1cn": 411,
+    "c1cnc(": 1158,
+    "c1cs": 994,
+    "c1n": 332,
+    "c1nc(": 478,
+    "c1nc(-": 925,
+    "c1nc(N": 661,
+    "c1ncc": 676,
+    "c1ncc(": 886,
+    "c1ncn": 1025,
+    "c1ncn2": 1136,
+    "c1nnc(": 699,
+    "c1s": 751,
+    "c2": 280,
+    "c2)": 338,
+    "c2)C1": 962,
+    "c2)CC1": 680,
+    "c2)OCO": 991,
+    "c2)c1": 575,
+    "c2)cc1": 492,
+    "c2)cn1": 1100,
+    "c2)n1": 796,
+    "c21": 514,
+    "c23)": 678,
+    "c2=O)": 545,
+    "c2C)": 820,
+    "c2C1": 1095,
+    "c2Cl)": 1080,
+    "c2F)": 863,
+    "c2[nH]": 505,
+    "c2[nH]1": 1234,
+    "c2c(": 334,
+    "c2c(-": 971,
+    "c2c(C)": 688,
+    "c2c(N": 1022,
+    "c2c(O)": 1215,
+    "c2c(c1": 788,
+    "c2c1": 386,
+    "c2c1)": 1099,
+    "c2c3c(": 1089,
+    "c2c[nH]": 735,
+    "c2cc": 286,
+    "c2cc(": 363,
+    "c2cc(-": 740,
+    "c2cc(C": 1310,
+    "c2cc(N": 833,
+    "c2cc1": 634,
+    "c2cc3": 616,
+    "c2ccc(": 308,
+    "c2cccn": 873,
+    "c2cn": 400,
+    "c2cs": 816,
+    "c2n": 341,
+    "c2n1": 553,
+    "c2nc(": 451,
+    "c2nc(-": 760,
+    "c2nc(N": 639,
+    "c2ncc": 590,
+    "c2ncc(": 856,
+    "c2ncn": 684,
+    "c2nnc(": 729,
+    "c2nnn": 1323,
+    "c2o": 742,
+    "c2s": 663,
+    "c3": 284,
+    "c3)": 367,
+    "c3)cc": 822,
+    "c3)cc2": 1085,
+    "c32)": 983,
+    "c34)": 1017,
+    "c3=O)": 1037,
+    "c3C)": 949,
+    "c3F)": 1172,
+    "c3[nH]": 649,
+    "c3c(": 371,
+    "c3c(-": 1059,
+    "c3c(C)": 880,
+    "c3c(N": 1355,
+    "c3c2": 750,
+    "c3c2)": 1161,
+    "c3c[nH]": 1147,
+    "c3cc": 290,
+    "c3cc(": 463,
+    "c3cc4": 847,
+    "c3ccc(": 331,
+    "c3cn": 458,
+    "c3n": 389,
+    "c3nc(": 815,
+    "c3nc(-": 1001,
+    "c3nc(N": 1048,
+    "c3ncc": 682,
+    "c3ncc(": 1220,
+    "c3ncn": 743,
+    "c3nn": 845,
+    "c3o": 1143,
+    "c3s": 1052,
+    "c4": 299,
+    "c4)": 486,
+    "c4)cc": 1272,
+    "c43)": 1303,
+    "c4[nH]": 800,
+    "c4c(": 513,
+    "c4c3": 1262,
+    "c4c3)": 1195,
+    "c4cc": 330,
+    "c4cc(": 622,
+    "c4ccc(": 466,
+    "c4cccn": 1267,
+    "c4cn": 630,
+    "c4n": 759,
+    "c4ncc": 1061,
+    "c5": 354,
+    "c5)": 953,
+    "c5c(": 919,
+    "c5cc": 457,
+    "c5ccc(": 882,
+    "c6": 560,
+    "c7": 1191,
+    "c[n+]": 1198,
+    "c[nH]": 447,
+    "cc": 273,
+    "cc(": 297,
+    "cc(-": 405,
+    "cc(Br)": 783,
+    "cc(C)": 520,
+    "cc(CN": 672,
+    "cc(CO": 1297,
+    "cc(Cl)": 452,
+    "cc(F)": 483,
+    "cc(N": 496,
+    "cc(O": 975,
+    "cc(O)": 603,
+    "cc(OC)": 731,
+    "cc(OCC": 1106,
+    "cc1": 337,
+    "cc1-": 1101,
+    "cc12": 1145,
+    "cc2": 352,
+    "cc2)": 529,
+    "cc21": 1094,
+    "cc2Cl)": 1371,
+    "cc2c(": 1159,
+    "cc2c1": 738,
+    "cc2cc": 909,
+    "cc3": 393,
+    "cc3)": 593,
+    "cc3c2": 1201,
+    "cc3cc": 876,
+    "cc4": 533,
+    "cc4cc": 1368,
+    "cc5": 960,
+    "ccc(": 747,
+    "ccc1": 310,
+    "ccc1)": 408,
+    "ccc1)N": 733,
+    "ccc1-": 658,
+    "ccc12": 536,
+    "ccc12)": 741,
+    "ccc1CN": 1335,
+    "ccc1Cl": 1043,
+    "ccc1Cl)": 1202,
+    "ccc1F": 1135,
+    "ccc1F)": 1380,
+    "ccc1N": 1050,
+    "ccc1O": 807,
+    "ccc2": 307,
+    "ccc2)": 353,
+    "ccc2)N": 1186,
+    "ccc21": 519,
+    "ccc23)": 586,
+    "ccc2C1": 1282,
+    "ccc2Cl)": 860,
+    "ccc2F)": 985,
+    "ccc2c1": 718,
+    "ccc2n1": 846,
+    "ccc3": 322,
+    "ccc3)": 388,
+    "ccc32)": 1044,
+    "ccc34)": 736,
+    "ccc3Cl)": 1115,
+    "ccc3F)": 1140,
+    "ccc4": 382,
+    "ccc4)": 531,
+    "ccc43)": 1156,
+    "ccc5": 587,
+    "ccc5)": 1175,
+    "cccc": 931,
+    "cccc1": 1012,
+    "cccc2": 1179,
+    "ccccc2": 1130,
+    "ccccc3": 1019,
+    "ccccc4": 1331,
+    "ccn": 437,
+    "ccn1": 714,
+    "ccn1)": 1228,
+    "ccn2": 1378,
+    "ccn2)": 911,
+    "ccn3)": 976,
+    "cn": 293,
+    "cn1": 474,
+    "cn1)": 885,
+    "cn2": 577,
+    "cn2)": 711,
+    "cn3": 967,
+    "cn3)": 782,
+    "cnc1": 588,
+    "cnc1)": 1203,
+    "cnc2": 611,
+    "cnc2)": 1263,
+    "cnc2c1": 1164,
+    "cnc3": 555,
+    "cnc3)": 939,
+    "cnn2": 987,
+    "co": 440,
+    "co1": 802,
+    "co1)": 1250,
+    "co2)": 989,
+    "co3)": 1081,
+    "cs": 387,
+    "cs1": 662,
+    "cs1)": 1168,
+    "cs2)": 905,
+    "cs3)": 1009,
+    "n": 16,
+    "n(": 497,
+    "n(-": 675,
+    "n(C": 537,
+    "n(C)": 485,
+    "n(CC": 624,
+    "n1": 305,
+    "n1)": 527,
+    "n1-": 1119,
+    "n12": 866,
+    "n1C": 1015,
+    "n1c(": 1295,
+    "n1cc(": 1141,
+    "n1cn": 950,
+    "n2": 327,
+    "n2)": 538,
+    "n2)CC1": 1090,
+    "n2)c1": 896,
+    "n2)cc1": 504,
+    "n2C": 1067,
+    "n2C)": 1093,
+    "n2c(": 1117,
+    "n2c1": 1237,
+    "n2cc": 1003,
+    "n2cc(": 1033,
+    "n2cn": 944,
+    "n2nc(": 1241,
+    "n3": 410,
+    "n3)": 610,
+    "n3)cc": 1082,
+    "n3C)": 1269,
+    "n3cc": 1102,
+    "n3cn": 1039,
+    "n4": 628,
+    "n4)": 1184,
+    "n5": 1286,
+    "n[nH]": 640,
+    "nc(": 304,
+    "nc(-": 421,
+    "nc(C)": 583,
+    "nc(N": 384,
+    "nc(N)": 687,
+    "nc(S": 1128,
+    "nc1": 429,
+    "nc1-": 1353,
+    "nc12": 957,
+    "nc2": 464,
+    "nc21": 1188,
+    "nc2c(": 907,
+    "nc2c1": 787,
+    "nc2cc": 737,
+    "nc2cc(": 1222,
+    "nc2n1": 1354,
+    "nc3": 506,
+    "nc3)": 1098,
+    "nc3cc": 825,
+    "nc4": 936,
+    "nc4cc": 1379,
+    "ncc": 487,
+    "ncc(": 995,
+    "ncc1": 606,
+    "ncc2": 674,
+    "ncc2)": 1173,
+    "ncc3": 754,
+    "ncc3)": 1187,
+    "ncc4": 1266,
+    "ncn": 454,
+    "nn": 370,
+    "nn(C)": 1180,
+    "nn1": 594,
+    "nn2": 686,
+    "nn2)": 996,
+    "nn3": 1010,
+    "nn3)": 1131,
+    "nnc1": 1091,
+    "no": 636,
+    "no1": 884,
+    "o": 36,
+    "o1": 503,
+    "o2)": 915,
+    "o2)cc1": 878,
+    "o3)": 1190,
+    "oc(": 832,
+    "oc(-": 1294,
+    "oc2c1": 1225,
+    "oc2cc": 1347,
+    "on1": 1125,
+    "p": 208,
+    "s": 26,
+    "s1": 499,
+    "s1)": 903,
+    "s2)": 779,
+    "s2)cc1": 927,
+    "s3)": 933,
+    "sc(": 1192,
+    "sc1": 965,
+    "sc2": 1339,
+    "sc2c1": 1060
+}