# File size: 29,070 Bytes

"""Hugging Face-compatible tokenizer for APE molecular vocabularies.

This file is intentionally self-contained so it can be copied into a model repo
and loaded by ``AutoTokenizer.from_pretrained(..., trust_remote_code=True)``.
"""

import json
import os
import re
from collections.abc import Mapping
from collections import defaultdict
from pathlib import Path
from typing import Any, Literal

from transformers import PreTrainedTokenizer


# The two molecular string notations this module accepts (see
# ``_normalize_representation``, which upper-cases and validates them).
Representation = Literal["SELFIES", "SMILES"]

# Vocabulary file names keyed by the constructor argument they feed; exposed to
# the tokenizer class below through its ``vocab_files_names`` attribute.
VOCAB_FILES_NAMES = {
    "vocab_file": "vocab.json",
    "selfies_vocab_file": "selfies_vocab.json",
    "smiles_vocab_file": "smiles_vocab.json",
}
# One SELFIES primitive: "[" + one or more non-"]" characters + "]".
SELFIES_RE = re.compile(r"\[[^\]]+\]")
# SMILES primitives. Alternation order matters: the bracket-atom branch comes
# first, then Br?/Cl? so that "Br" and "Cl" are kept as single pieces.
#
# Only the organic subset (B C N O P S F Cl Br I) may appear unbracketed in
# canonical SMILES; other two-letter elements (Si, Se, Na, Mg, Al, Ca, Fe, Zn,
# ...) are always bracketed and matched by the leading \[[^\]]+\] branch. The
# previous pattern listed those elements as optional-second-letter alternatives
# (Si?, Na?, ...), which could match bare invalid single letters (L, M, A, Z)
# and was dead weight for valid input.
#
# NOTE: because Br?/Cl? already match a lone "B"/"C", the standalone B and C
# alternatives on the second line can never be reached. Bare K and H are also
# accepted by this pattern. The remaining branches cover aromatic atoms, the
# two-digit "%NN" form and single digits, and bond/branch/charge/chirality
# punctuation. Characters that match no branch are handled by the caller
# (``pre_tokenize_molecule``), not by this pattern.
SMILES_RE = re.compile(
    r"(\[[^\]]+\]|Br?|Cl?|"
    r"N|O|S|P|F|I|K|B|C|H|"
    r"b|c|n|o|s|p|"
    r"\%\d{2}|\d|"
    r"\(|\)|\.|=|#|-|\+|\\|/|:|~|@|\?|\*|\$)"
)


def _base_piece_count(token: str, representation: str) -> int:
    """Return how many primitive pieces ``token`` splits into, never less than 1.

    The token is split with ``pre_tokenize_molecule`` under its default
    settings, so any ``ValueError`` that function raises propagates.
    """
    piece_total = len(pre_tokenize_molecule(token, representation))
    return piece_total if piece_total > 1 else 1


def _max_vocab_piece_span(vocab: dict[str, int], representation: str) -> int:
    """Return the largest primitive-piece count of any non-special vocab token.

    Tokens written as ``<...>`` are treated as special and skipped. The result
    is at least 1, including for an empty or all-special vocabulary.
    """
    spans = (
        _base_piece_count(entry, representation)
        for entry in vocab
        if not (entry.startswith("<") and entry.endswith(">"))
    )
    # Seed with 1 so the floor holds even when every entry was skipped.
    return max([1, *spans])


def _coerce_vocab(vocab: Mapping[str, Any]) -> dict[str, int]:
    if not isinstance(vocab, Mapping):
        raise ValueError("Vocabulary must be a JSON object mapping token strings to integer IDs.")
    out = {str(token): int(idx) for token, idx in vocab.items()}
    if len(set(out.values())) != len(out):
        raise ValueError("Vocabulary token IDs must be unique.")
    return out


def _token_text(token: Any) -> str:
    return str(getattr(token, "content", token))


def _normalize_representation(representation: str) -> Representation:
    normalized = representation.upper()
    if normalized not in {"SELFIES", "SMILES"}:
        raise ValueError(f"representation must be 'SELFIES' or 'SMILES', got {representation!r}")
    return normalized  # type: ignore[return-value]


def _select_vocab_file(
    *,
    representation: Representation,
    vocab_file: str | os.PathLike[str] | None,
    selfies_vocab_file: str | os.PathLike[str] | None,
    smiles_vocab_file: str | os.PathLike[str] | None,
) -> str | os.PathLike[str] | None:
    if representation == "SELFIES" and selfies_vocab_file is not None:
        return selfies_vocab_file
    if representation == "SMILES" and smiles_vocab_file is not None:
        return smiles_vocab_file
    return vocab_file


def _pre_tokenize_selfies(molecule: str, *, strict: bool = True) -> list[str]:
    """Split a SELFIES string into its bracketed symbols.

    Args:
        molecule: The SELFIES text to split.
        strict: When true, reject input that has any text outside the
            bracketed symbols; when false, such text is silently dropped.

    Raises:
        ValueError: In strict mode, if the matched symbols do not reassemble
            to exactly ``molecule``.
    """
    symbols = SELFIES_RE.findall(molecule)

    if strict:
        reassembled = "".join(symbols)
        if reassembled != molecule:
            raise ValueError(
                "Malformed SELFIES string contains unmatched text outside "
                f"bracketed SELFIES tokens: {molecule!r}"
            )

    return symbols


def pre_tokenize_molecule(
    molecule: str,
    representation: str,
    *,
    strict_selfies: bool = True,
) -> list[str]:
    """Split ``molecule`` into primitive pieces for the given representation.

    SELFIES input is delegated to ``_pre_tokenize_selfies`` (``strict_selfies``
    is forwarded as its ``strict`` flag). SMILES input is scanned with
    ``SMILES_RE``; any character the pattern does not match is emitted as its
    own one-character piece, and empty or whitespace-only pieces are dropped.

    Raises:
        ValueError: If ``representation`` is unsupported, or for malformed
            SELFIES input in strict mode.
    """
    if _normalize_representation(representation) == "SELFIES":
        return _pre_tokenize_selfies(molecule, strict=strict_selfies)

    pieces: list[str] = []
    position = 0
    for hit in SMILES_RE.finditer(molecule):
        start, end = hit.span()
        # Unmatched text between two matches is split character by character
        # (extending with an empty slice is a no-op).
        pieces.extend(molecule[position:start])
        pieces.append(hit.group(0))
        position = end

    # Trailing unmatched text, again one character per piece.
    pieces.extend(molecule[position:])

    return [piece for piece in pieces if piece and not piece.isspace()]


def ape_tokenize(
    text: str,
    vocab: dict[str, int],
    representation: str,
    unk_token: str = "<unk>",
    max_piece_span: int | None = None,
) -> list[str]:
    """Segment a molecule into vocab tokens by greedy longest match.

    The molecule is first split into primitive pieces. At each position the
    longest run of consecutive pieces (at most ``max_piece_span`` of them)
    whose concatenation is a vocab key is emitted as one token; if no run
    matches, ``unk_token`` is emitted and one piece is skipped.

    This deliberately does *not* replay the training merges in learned order:
    training decides which substrings become vocab entries, while encoding only
    consults the resulting vocab set. The two can disagree on segmentation.
    That is intended — pretraining and fine-tuning both encode through this
    function, so the model only ever sees greedy-longest-match output.

    Args:
        text: Molecule string to segment.
        vocab: Token-to-id mapping; only key membership is used here.
        representation: ``"SELFIES"`` or ``"SMILES"`` (case-insensitive).
        unk_token: Token emitted for unmatched pieces and unusable input.
        max_piece_span: Longest run of pieces to try; computed from ``vocab``
            when ``None``.

    Returns:
        The token list, or ``[unk_token]`` when the input yields no pieces or
        pre-tokenization raises ``ValueError``.
    """
    # A malformed molecule must not crash encoding mid-batch: collapse it to a
    # single <unk> so it stays detectable downstream instead of raising.
    try:
        pieces = pre_tokenize_molecule(text, representation)
    except ValueError:
        pieces = []
    if not pieces:
        return [unk_token]

    span_limit = max_piece_span
    if span_limit is None:
        span_limit = _max_vocab_piece_span(vocab, representation)

    total = len(pieces)
    segmented: list[str] = []
    start = 0

    while start < total:
        # Shrink the window from the longest allowed run down to one piece.
        stop = min(total, start + span_limit)
        while stop > start:
            candidate = "".join(pieces[start:stop])
            if candidate in vocab:
                break
            stop -= 1

        if stop > start:
            segmented.append(candidate)
            start = stop
        else:
            segmented.append(unk_token)
            start += 1

    return segmented


class APEPreTrainedTokenizer(PreTrainedTokenizer):
    """Hugging Face tokenizer backend for APE molecular tokenization. (Not fast)"""

    vocab_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file: str | os.PathLike[str] | None = None,
        selfies_vocab_file: str | os.PathLike[str] | None = None,
        smiles_vocab_file: str | os.PathLike[str] | None = None,
        vocab: dict[str, Any] | None = None,
        representation: str = "SELFIES",
        bos_token: str = "<s>",
        eos_token: str = "</s>",
        unk_token: str = "<unk>",
        pad_token: str = "<pad>",
        mask_token: str = "<mask>",
        model_max_length: int = 256,
        **kwargs,
    ) -> None:
        """Create a tokenizer from an explicit vocab, a vocab file, or a default.

        Vocabulary resolution order: the ``vocab`` mapping when given;
        otherwise the JSON file chosen by ``_select_vocab_file`` (the
        representation-specific file if supplied, else ``vocab_file``);
        otherwise a five-entry vocabulary holding only the special tokens
        (bos=0, pad=1, eos=2, unk=3, mask=4).

        Raises:
            ValueError: If the representation is unsupported, the vocabulary
                file contains JSON ``null`` or a non-object, token IDs are not
                unique, or a special token is absent from the vocabulary.
        """
        self.representation = _normalize_representation(representation)
        active_vocab_file = _select_vocab_file(
            representation=self.representation,
            vocab_file=vocab_file,
            selfies_vocab_file=selfies_vocab_file,
            smiles_vocab_file=smiles_vocab_file,
        )

        if vocab is None and active_vocab_file is not None:
            with open(active_vocab_file, encoding="utf-8") as handle:
                vocab = json.load(handle)
        elif vocab is None:
            vocab = {
                bos_token: 0,
                pad_token: 1,
                eos_token: 2,
                unk_token: 3,
                mask_token: 4,
            }

        # Still None here means the file held JSON ``null``.
        if vocab is None:
            raise ValueError("Loaded vocabulary is None.")

        def _path_text(path):
            return None if path is None else str(path)

        self.vocab_file = _path_text(active_vocab_file)
        self.selfies_vocab_file = _path_text(selfies_vocab_file)
        self.smiles_vocab_file = _path_text(smiles_vocab_file)
        self.vocab = _coerce_vocab(vocab)
        self._require_special_tokens(
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            pad_token=pad_token,
            mask_token=mask_token,
        )
        self.ids_to_tokens = {token_id: token for token, token_id in self.vocab.items()}
        self.vocabulary_frequency: dict[str, int] = {}
        self.pair_counts: dict[tuple[str, str], int] = {}
        self._max_piece_span = _max_vocab_piece_span(self.vocab, self.representation)

        # All vocabulary state is in place before the base constructor runs
        # (presumably because it may query the vocabulary — keep this order).
        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            pad_token=pad_token,
            mask_token=mask_token,
            model_max_length=model_max_length,
            representation=self.representation,
            **kwargs,
        )

    @property
    def vocab_size(self) -> int:
        """Number of entries in the token-to-id mapping (special tokens included)."""
        size = len(self.vocab)
        return size

    @property
    def vocabulary(self) -> dict[str, int]:
        """Legacy alias of ``vocab`` for callers that previously used APETokenizer.

        Returns the live mapping itself, not a copy.
        """
        return self.vocab

    @vocabulary.setter
    def vocabulary(self, value: dict[str, int]) -> None:
        """Replace the vocabulary and rebuild the reverse map and span cache.

        Raises:
            ValueError: If ``value`` is not a mapping or has duplicate IDs.
        """
        coerced = _coerce_vocab(value)
        self.vocab = coerced
        self.update_reverse_vocabulary()
        self._refresh_tokenization_cache()

    @property
    def special_tokens(self) -> dict[str, int]:
        """Map each special token (bos, pad, eos, unk, mask) to its current ID."""
        names = (
            self.bos_token,
            self.pad_token,
            self.eos_token,
            self.unk_token,
            self.mask_token,
        )
        return {text: self._convert_token_to_id(text) for text in map(str, names)}

    @special_tokens.setter
    def special_tokens(self, value: dict[str, int]) -> None:
        """Add any special tokens from ``value`` that the vocabulary lacks.

        Tokens already present keep their existing ID; the ID supplied in
        ``value`` is only used for tokens that are new.

        Raises:
            ValueError: If adding the new tokens would give two tokens the same
                ID. The current vocabulary is left untouched in that case.
        """
        # Work on a copy so a rejected update cannot leave the live vocabulary
        # holding duplicate IDs.
        candidate = dict(self.vocab)
        for token, token_id in value.items():
            candidate.setdefault(str(token), int(token_id))
        self.vocab = _coerce_vocab(candidate)
        self.update_reverse_vocabulary()
        self._refresh_tokenization_cache()

    def get_vocab(self) -> dict[str, int]:
        """Return a shallow copy of the token-to-id mapping."""
        snapshot = dict(self.vocab)
        return snapshot

    def update_reverse_vocabulary(self) -> None:
        """Rebuild ``ids_to_tokens`` (id -> token) from the current ``vocab``."""
        reverse: dict[int, str] = {}
        for token, token_id in self.vocab.items():
            reverse[token_id] = token
        self.ids_to_tokens = reverse

    def _refresh_tokenization_cache(self) -> None:
        """Recompute the cached maximum piece span for the current vocab and representation."""
        span = _max_vocab_piece_span(self.vocab, self.representation)
        self._max_piece_span = span

    def _require_special_tokens(
        self,
        *,
        bos_token: str,
        eos_token: str,
        unk_token: str,
        pad_token: str,
        mask_token: str,
    ) -> None:
        """Check that every given special token is a key of ``self.vocab``.

        Each token is reduced to its text with ``_token_text`` before lookup.

        Raises:
            ValueError: Listing the texts of all tokens that are missing.
        """
        absent = []
        for candidate in (bos_token, eos_token, unk_token, pad_token, mask_token):
            text = _token_text(candidate)
            if text not in self.vocab:
                absent.append(text)

        if absent:
            raise ValueError(f"Vocabulary is missing required special tokens: {absent}")

    def pre_tokenize(self, molecule: str, representation: str | None = None) -> list[str]:
        """Split ``molecule`` into primitive pieces.

        Uses ``representation`` when it is given and non-empty, otherwise the
        tokenizer's own representation.
        """
        active = representation if representation else self.representation
        return pre_tokenize_molecule(molecule, active)

    def _tokenize(self, text: str, **kwargs) -> list[str]:
        """Segment ``text`` with ``ape_tokenize`` using this tokenizer's state.

        Extra keyword arguments are accepted and ignored.
        """
        unknown = str(self.unk_token)
        return ape_tokenize(
            text,
            self.vocab,
            self.representation,
            unk_token=unknown,
            max_piece_span=self._max_piece_span,
        )

    def encode_molecule(
        self,
        text: str,
        add_special_tokens: bool = True,
        max_length: int | None = None,
        truncation: bool = True,
    ) -> list[int]:
        """Fast molecular encode path avoiding generic Hugging Face tokenizer overhead.

        Args:
            text: Molecule string to encode.
            add_special_tokens: Wrap the ids with the special tokens produced
                by ``build_inputs_with_special_tokens``.
            max_length: Maximum length of the returned list; ``None`` disables
                truncation.
            truncation: Whether ``max_length`` is enforced.

        Returns:
            The token ids. When truncation applies together with special
            tokens, the molecule ids are shortened first so the closing
            special token is kept rather than being cut off.
        """
        ids = [self._convert_token_to_id(token) for token in self._tokenize(text)]
        should_truncate = truncation and max_length is not None

        if add_special_tokens:
            if should_truncate:
                # Reserve room for the special tokens; slicing the finished
                # sequence alone would drop the trailing EOS on long inputs.
                overhead = len(self.build_inputs_with_special_tokens([]))
                ids = ids[: max(0, max_length - overhead)]
            ids = self.build_inputs_with_special_tokens(ids)

        if should_truncate:
            # Final guard for a max_length smaller than the special-token
            # overhead itself.
            ids = ids[:max_length]

        return ids

    def _convert_token_to_id(self, token: str) -> int:
        """Return the ID of ``token``, or the unknown-token ID if it is not in the vocab.

        The unknown-token ID is looked up unconditionally, so a vocabulary
        without the unknown token raises ``KeyError``.
        """
        unk_id = self.vocab[str(self.unk_token)]
        return self.vocab.get(token, unk_id)

    def _convert_id_to_token(self, index: int) -> str:
        """Return the token for ``index`` (coerced to ``int``), or the unknown token if unmapped."""
        token_id = int(index)
        fallback = str(self.unk_token)
        return self.ids_to_tokens.get(token_id, fallback)

    def convert_tokens_to_string(self, tokens: list[str]) -> str:
        """Concatenate ``tokens`` with no separator."""
        separator = ""
        return separator.join(tokens)

    def _required_special_token_id(
        self,
        token_value: int | list[int] | str | list[str] | None,
        token_name: str,
    ) -> int:
        """Resolve a special-token setting to exactly one token ID.

        Accepts an ID, a token string, or a one-element sequence holding either
        of those. ``token_name`` is only used in error messages.

        Raises:
            ValueError: If the value is ``None`` or does not resolve to a
                single ID.
        """
        if token_value is None:
            raise ValueError(f"{token_name} must be set.")

        candidate = token_value
        # Unwrap a one-element sequence; scalars are left as they are.
        if not isinstance(candidate, (int, str)) and len(candidate) == 1:
            candidate = candidate[0]

        if isinstance(candidate, int):
            return candidate
        if isinstance(candidate, str):
            return self._convert_token_to_id(candidate)
        raise ValueError(f"{token_name} must resolve to a single token id.")

    def build_inputs_with_special_tokens(
        self,
        token_ids_0: list[int],
        token_ids_1: list[int] | None = None,
    ) -> list[int]:
        """Wrap one or two ID sequences with BOS/EOS markers.

        Single sequence: ``bos A eos``. Pair: ``bos A eos B eos``.
        """
        bos_id = self._required_special_token_id(self.bos_token, "bos_token")
        eos_id = self._required_special_token_id(self.eos_token, "eos_token")

        sequence = [bos_id, *token_ids_0, eos_id]
        if token_ids_1 is not None:
            sequence.extend(token_ids_1)
            sequence.append(eos_id)
        return sequence

    def create_token_type_ids_from_sequences(
        self,
        token_ids_0: list[int],
        token_ids_1: list[int] | None = None,
    ) -> list[int]:
        """Return all-zero token type IDs, one per position of the special-token-wrapped input."""
        combined = self.build_inputs_with_special_tokens(token_ids_0, token_ids_1)
        return [0 for _ in combined]

    def pad(
        self,
        encoded_inputs: Any,
        padding: Any = True,
        max_length: int | None = None,
        pad_to_multiple_of: int | None = None,
        padding_side: str | None = None,
        return_attention_mask: bool | None = None,
        return_tensors: Any = None,
        verbose: bool = True,
    ):
        """Pad a batch, extending ``labels`` with ``-100`` before delegating.

        When padding is enabled and ``encoded_inputs`` is a list in which at
        least one item has a ``"labels"`` entry, every item's labels are padded
        with ``-100`` to a common target length (items without labels receive
        an all ``-100`` list). Everything else is handled by the base class
        ``pad``, to which all arguments are forwarded unchanged.

        The target length is the longest ``input_ids`` (falling back to
        ``labels``) in the batch, replaced by ``max_length`` when
        ``padding == "max_length"`` and ``max_length`` is given, then rounded
        up to a multiple of ``pad_to_multiple_of`` if set. Labels already at
        least that long are left as they are.
        """
        padding_enabled = padding not in (False, "do_not_pad")
        if (
            padding_enabled
            and isinstance(encoded_inputs, list)
            and any("labels" in item for item in encoded_inputs)
        ):
            target_length = max(
                len(item.get("input_ids", item.get("labels", []))) for item in encoded_inputs
            )
            if padding == "max_length" and max_length is not None:
                target_length = max_length

            if pad_to_multiple_of and target_length % pad_to_multiple_of:
                target_length = ((target_length // pad_to_multiple_of) + 1) * pad_to_multiple_of

            # Labels must be padded on the same side as the inputs. A per-call
            # ``padding_side`` overrides the tokenizer default and is also what
            # gets forwarded to the base class below.
            label_side = padding_side if padding_side is not None else self.padding_side

            padded_inputs = []
            for item in encoded_inputs:
                item = dict(item)  # shallow copy: never mutate the caller's batch
                labels = list(item.get("labels", []))
                pad_len = max(0, target_length - len(labels))
                if pad_len:
                    label_padding = [-100] * pad_len
                    if label_side == "left":
                        labels = label_padding + labels
                    else:
                        labels = labels + label_padding
                    item["labels"] = labels
                padded_inputs.append(item)
            encoded_inputs = padded_inputs

        return super().pad(
            encoded_inputs,
            padding=padding,
            max_length=max_length,
            pad_to_multiple_of=pad_to_multiple_of,
            padding_side=padding_side,
            return_attention_mask=return_attention_mask,
            return_tensors=return_tensors,
            verbose=verbose,
        )

    def save_vocabulary(
        self,
        save_directory: str,
        filename_prefix: str | None = None,
    ) -> tuple[str, ...]:
        """Write the vocabulary as JSON into ``save_directory``.

        The file is named ``vocab.json``, or ``<prefix>-vocab.json`` when a
        non-empty ``filename_prefix`` is given.

        Returns:
            A one-element tuple holding the written file's path.

        Raises:
            ValueError: If ``save_directory`` is not an existing directory.
        """
        if not os.path.isdir(save_directory):
            raise ValueError(f"Vocabulary path ({save_directory}) should be a directory.")

        file_name = "vocab.json"
        if filename_prefix:
            file_name = f"{filename_prefix}-vocab.json"

        target = Path(save_directory) / file_name
        with target.open("w", encoding="utf-8") as handle:
            json.dump(self.vocab, handle, ensure_ascii=False, indent=4)
        return (str(target),)

    def add_tokens_to_vocabulary(self, tokens: list[str]) -> int:
        """Append tokens that are not yet in the vocabulary and return how many were added.

        Intended for forcing coverage of rare valid molecular primitives
        (especially SELFIES bracket symbols) after APE merge training. Each
        token is stringified and stripped; blank or already-known tokens are
        skipped. New tokens get consecutive IDs starting after the current
        maximum ID. The reverse map and span cache are rebuilt only when
        something was actually added.
        """
        if not tokens:
            return 0

        next_id = max(self.vocab.values(), default=-1) + 1
        size_before = len(self.vocab)

        for raw in tokens:
            text = str(raw).strip()
            if text and text not in self.vocab:
                self.vocab[text] = next_id
                next_id += 1

        added = len(self.vocab) - size_before
        if added:
            self.update_reverse_vocabulary()
            self._refresh_tokenization_cache()

        return added

    def save_pretrained(self, save_directory: str | os.PathLike[str], *args, **kwargs):
        """Save via the base class, then rewrite two JSON files in the output directory.

        After the base ``save_pretrained`` runs:

        * ``special_tokens_map.json`` is overwritten with the five special
          tokens as plain strings.
        * ``tokenizer_config.json`` is loaded if present (else started empty),
          its ``tokenizer_class`` entry is removed, and ``representation``,
          ``model_max_length`` and an ``auto_map`` entry pointing at
          ``tokenization_ape.APEPreTrainedTokenizer`` are set.

        Returns:
            Whatever the base class ``save_pretrained`` returned.
        """
        saved_files = super().save_pretrained(save_directory, *args, **kwargs)
        out_dir = Path(save_directory)

        token_attributes = ("bos_token", "eos_token", "unk_token", "pad_token", "mask_token")
        special_tokens_map = {name: str(getattr(self, name)) for name in token_attributes}
        with (out_dir / "special_tokens_map.json").open("w", encoding="utf-8") as handle:
            json.dump(special_tokens_map, handle, ensure_ascii=False, indent=2)

        config_path = out_dir / "tokenizer_config.json"
        tokenizer_config = {}
        if config_path.exists():
            with config_path.open(encoding="utf-8") as handle:
                tokenizer_config = json.load(handle)

        tokenizer_config.pop("tokenizer_class", None)
        tokenizer_config["representation"] = self.representation
        tokenizer_config["model_max_length"] = self.model_max_length
        tokenizer_config["auto_map"] = {
            "AutoTokenizer": [
                "tokenization_ape.APEPreTrainedTokenizer",
                None,
            ],
        }

        with config_path.open("w", encoding="utf-8") as handle:
            json.dump(tokenizer_config, handle, ensure_ascii=False, indent=2)

        return saved_files

    def save_vocabulary_file(self, file_path: str | os.PathLike[str]) -> None:
        """Write the vocabulary to ``file_path`` and the frequencies next to it.

        Parent directories are created as needed. The frequency table goes to
        ``<stem>_freq.json`` in the same directory as ``file_path``.
        """
        vocab_path = Path(file_path)
        vocab_path.parent.mkdir(parents=True, exist_ok=True)

        outputs = (
            (vocab_path, self.vocab),
            (vocab_path.with_name(f"{vocab_path.stem}_freq.json"), self.vocabulary_frequency),
        )
        for destination, payload in outputs:
            with destination.open("w", encoding="utf-8") as handle:
                json.dump(payload, handle, ensure_ascii=False, indent=4)

    def load_vocabulary_file(
        self,
        file_path: str | os.PathLike[str],
        representation: str | None = None,
    ) -> None:
        """Replace the vocabulary (and optionally the representation) from a JSON file.

        The load is all-or-nothing: if reading, coercion, the special-token
        check or the cache refresh fails, the tokenizer keeps the
        representation, vocabulary and reverse map it had before the call.

        Args:
            file_path: JSON file mapping token strings to integer IDs.
            representation: New representation to switch to; ``None`` keeps
                the current one.

        Raises:
            ValueError: If ``representation`` is unsupported, the file content
                is not a valid vocabulary, or a special token is missing.
            OSError: If the file cannot be opened.
        """
        target_representation = (
            self.representation
            if representation is None
            else _normalize_representation(representation)
        )
        with open(file_path, encoding="utf-8") as f:
            candidate = _coerce_vocab(json.load(f))

        # Snapshot the state so a failed validation can be rolled back instead
        # of leaving a new vocab paired with a stale reverse map.
        previous_state = (self.representation, self.vocab, self.ids_to_tokens)
        try:
            self.representation = target_representation
            self.vocab = candidate
            self._require_special_tokens(
                bos_token=str(self.bos_token),
                eos_token=str(self.eos_token),
                unk_token=str(self.unk_token),
                pad_token=str(self.pad_token),
                mask_token=str(self.mask_token),
            )
            self.ids_to_tokens = {idx: token for token, idx in self.vocab.items()}
            self._refresh_tokenization_cache()
        except ValueError:
            self.representation, self.vocab, self.ids_to_tokens = previous_state
            raise

    def train(
        self,
        corpus,
        type: str = "selfies",
        representation: str | None = None,
        max_vocab_size: int = 5000,
        min_freq_for_merge: int = 2000,
        max_merge_pieces: int | None = 8,
        save_checkpoint: bool = False,
        checkpoint_path: str = "checkpoint",
        checkpoint_interval: int = 500,
    ) -> None:
        """Learn an APE vocabulary from ``corpus`` by repeated pair merging.

        Every molecule is split into primitive pieces. Then, until a stop
        condition is met, the most frequent adjacent token pair is merged into
        one new token everywhere in the corpus. On return ``self.vocab`` holds
        the five special tokens (IDs 0-4) followed by every primitive and
        merged token in order of first appearance, and
        ``self.vocabulary_frequency`` holds the post-merge token counts.

        Args:
            corpus: Iterable of molecules; each item is passed through ``str``.
            type: Legacy name for the representation, used only when
                ``representation`` is not given.
            representation: ``"SELFIES"`` or ``"SMILES"``; overwrites the
                tokenizer's representation (with a warning if it differs).
            max_vocab_size: Stop once this many non-special tokens exist (the
                special tokens are added on top of this number).
            min_freq_for_merge: Stop when the best pair occurs fewer times.
            max_merge_pieces: Pairs whose merged token would span more than
                this many primitive pieces are never merged; ``None`` disables
                the limit.
            save_checkpoint: Write intermediate vocabularies while training.
            checkpoint_path: Directory that receives the checkpoints.
            checkpoint_interval: Number of new tokens between checkpoints.

        Raises:
            ValueError: For an unsupported representation, a non-positive
                ``checkpoint_interval`` with checkpointing enabled, or a corpus
                that is empty or yields no tokens.
        """
        import warnings

        # Fail fast on a bad checkpoint setting instead of discovering it only
        # after the whole corpus has been pre-tokenized.
        if save_checkpoint and checkpoint_interval <= 0:
            raise ValueError(
                "checkpoint_interval must be positive when save_checkpoint is enabled."
            )

        new_rep = _normalize_representation(representation or type)
        if new_rep != self.representation:
            warnings.warn(
                f"train() representation={new_rep!r} differs from tokenizer "
                f"representation={self.representation!r}. Overwriting.",
                UserWarning,
                stacklevel=2,
            )
        self.representation = new_rep

        if not corpus:
            raise ValueError("Cannot train APE tokenizer on an empty corpus.")

        print(f"Pretokenizing {self.representation}...", flush=True)
        tokenized_corpus = []
        vocabulary_frequency: defaultdict[str, int] = defaultdict(int)
        saw_tokens = False
        skipped_malformed = 0

        for sentence in corpus:
            # One malformed row must not abort a long training run. Skip and
            # count it; surface the total so a corrupt corpus is still visible.
            try:
                tokens = self.pre_tokenize(str(sentence))
            except ValueError:
                skipped_malformed += 1
                continue
            if not tokens:
                continue
            saw_tokens = True
            for token in tokens:
                vocabulary_frequency[token] += 1
            # Single-token sequences contribute counts but can never be merged.
            if len(tokens) > 1:
                tokenized_corpus.append(tokens)
        if skipped_malformed:
            print(f"Skipped {skipped_malformed} malformed sequences", flush=True)
        print(
            f"Pretokenization complete, found {len(vocabulary_frequency)} tokens",
            flush=True,
        )

        if not saw_tokens:
            raise ValueError("Cannot train APE tokenizer on an empty corpus.")

        pre_tokens_counts = len(vocabulary_frequency)
        merged_counter = len(vocabulary_frequency) + 1
        checkpoint_increment = checkpoint_interval
        # Vocabulary size at which the next checkpoint is written.
        batch = checkpoint_interval + pre_tokens_counts
        piece_count_cache: dict[str, int] = {}

        def merged_piece_count(token: str) -> int:
            """Memoized primitive-piece count of ``token``."""
            count = piece_count_cache.get(token)
            if count is None:
                count = _base_piece_count(token, self.representation)
                piece_count_cache[token] = count
            return count

        def get_most_common_pair(tokenized):
            """Return the most frequent eligible adjacent pair and its count.

            Ties go to the pair encountered first. Returns ``(("", ""), 0)``
            when no pair is eligible.
            """
            pair_counts: defaultdict[tuple[str, str], int] = defaultdict(int)
            for tokens in tokenized:
                for i in range(len(tokens) - 1):
                    pair = (tokens[i], tokens[i + 1])

                    if max_merge_pieces is not None:
                        merged_candidate = pair[0] + pair[1]
                        if merged_piece_count(merged_candidate) > max_merge_pieces:
                            continue

                    pair_counts[pair] += 1

            if not pair_counts:
                return ("", ""), 0

            # max() keeps the first maximal entry, i.e. first-seen wins ties.
            return max(pair_counts.items(), key=lambda entry: entry[1])

        def install_vocabulary() -> None:
            """Publish the current training state onto the tokenizer."""
            self.vocabulary_frequency = dict(vocabulary_frequency)
            self.vocab = {
                str(self.bos_token): 0,
                str(self.pad_token): 1,
                str(self.eos_token): 2,
                str(self.unk_token): 3,
                str(self.mask_token): 4,
                **{word: idx for idx, word in enumerate(vocabulary_frequency, start=5)},
            }
            self.ids_to_tokens = {idx: token for token, idx in self.vocab.items()}
            self._refresh_tokenization_cache()

        while True:
            if save_checkpoint and len(vocabulary_frequency) >= batch:
                install_vocabulary()
                checkpoint_dir = Path(checkpoint_path)
                checkpoint_dir.mkdir(parents=True, exist_ok=True)
                self.save_vocabulary_file(checkpoint_dir / f"checkpoint_{batch}.json")
                self.save_pretrained(str(checkpoint_dir / f"checkpoint_{batch}"))
                print(f"Checkpoint saved at {checkpoint_dir}/checkpoint_{batch}.json")
                batch += checkpoint_increment

            if len(vocabulary_frequency) >= max_vocab_size:
                print("Max vocabulary achieved", flush=True)
                break

            if not tokenized_corpus:
                print("No more mergeable pairs", flush=True)
                break

            most_common_pair, freq = get_most_common_pair(tokenized_corpus)
            if freq < min_freq_for_merge:
                print("Not enough frequency found", flush=True)
                break

            if not most_common_pair[0] or not most_common_pair[1]:
                print("No valid merge pair found", flush=True)
                break

            left_token, right_token = most_common_pair
            merged_word = left_token + right_token
            if merged_word not in vocabulary_frequency:
                print(
                    f"New merge found: {merged_word} {merged_counter}/{max_vocab_size} "
                    f"{round(merged_counter / max_vocab_size * 100, 2)}%",
                    flush=True,
                )
                merged_counter += 1

            new_tokenized_corpus = []
            append_seq = new_tokenized_corpus.append
            merges_applied = 0
            for tokens in tokenized_corpus:
                token_count = len(tokens)

                # Fast path: a sequence with no adjacent (left, right) is
                # unchanged by this merge, so keep the existing list by
                # reference instead of rebuilding it.
                has_pair = any(
                    tokens[i] == left_token and tokens[i + 1] == right_token
                    for i in range(token_count - 1)
                )
                if not has_pair:
                    append_seq(tokens)
                    continue

                new_tokens = []
                append_token = new_tokens.append
                i = 0
                while i < token_count:
                    if (
                        i < token_count - 1
                        and tokens[i] == left_token
                        and tokens[i + 1] == right_token
                    ):
                        append_token(merged_word)
                        merges_applied += 1
                        i += 2
                    else:
                        append_token(tokens[i])
                        i += 1

                # A sequence collapsed to one token has nothing left to merge.
                if len(new_tokens) > 1:
                    append_seq(new_tokens)

            tokenized_corpus = new_tokenized_corpus

            # Book the merges that were actually performed, not the adjacent
            # pair count: for a self-pair (left == right) adjacent occurrences
            # overlap, e.g. [A, A, A] counts (A, A) twice but merges only once.
            # Each merge consumes one left and one right piece. Keys are never
            # removed — a primitive merged down to zero must stay in the vocab
            # for coverage.
            vocabulary_frequency[merged_word] += merges_applied
            vocabulary_frequency[left_token] = max(
                0, vocabulary_frequency[left_token] - merges_applied
            )
            vocabulary_frequency[right_token] = max(
                0, vocabulary_frequency[right_token] - merges_applied
            )

        install_vocabulary()

    def train_from_iterator(self, iterator, *args, **kwargs) -> None:
        """Unsupported for this tokenizer; always raises ``NotImplementedError``."""
        message = "train_from_iterator is not implemented for APE"
        raise NotImplementedError(message)


# Register the class with the Transformers auto-class machinery under
# ``AutoTokenizer`` at import time (see ``register_for_auto_class`` in the
# Transformers documentation for what this records on save).
APEPreTrainedTokenizer.register_for_auto_class("AutoTokenizer")