""" Hugging Face tokenizer class for MINDI 1.0 420M. """ from pathlib import Path from transformers import PreTrainedTokenizerFast class MindiTokenizer(PreTrainedTokenizerFast): vocab_files_names = {"tokenizer_file": "tokenizer.json"} model_input_names = ["input_ids", "attention_mask"] @classmethod def from_pretrained(cls, pretrained_model_name_or_path, *init_inputs, **kwargs): if kwargs.get("tokenizer_file") is None: local_candidate = Path(str(pretrained_model_name_or_path)) / "tokenizer.json" if local_candidate.exists(): kwargs["tokenizer_file"] = str(local_candidate) return super().from_pretrained(pretrained_model_name_or_path, *init_inputs, **kwargs) def __init__(self, tokenizer_file=None, **kwargs): name_or_path = kwargs.pop("name_or_path", None) if tokenizer_file is None and name_or_path is not None: candidate = Path(name_or_path) / "tokenizer.json" if candidate.exists(): tokenizer_file = str(candidate) if tokenizer_file is None: tokenizer_file = str(Path(__file__).resolve().parent / "tokenizer.json") kwargs.setdefault("bos_token", "") kwargs.setdefault("eos_token", "") kwargs.setdefault("unk_token", "") kwargs.setdefault("pad_token", "") super().__init__(tokenizer_file=tokenizer_file, **kwargs)