"""
Hugging Face tokenizer class for MINDI 1.0 420M.
"""
from pathlib import Path
from transformers import PreTrainedTokenizerFast
class MindiTokenizer(PreTrainedTokenizerFast):
    """Hugging Face fast tokenizer for MINDI 1.0 420M.

    Thin wrapper around ``PreTrainedTokenizerFast`` that automatically
    locates a local ``tokenizer.json`` and pins the model's special
    tokens (``<BOS>``/``<EOS>``/``<UNK>``/``<PAD>``).
    """

    # File name(s) the save/load machinery associates with this tokenizer.
    vocab_files_names = {"tokenizer_file": "tokenizer.json"}
    model_input_names = ["input_ids", "attention_mask"]

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *init_inputs, **kwargs):
        """Load the tokenizer, resolving a local ``tokenizer.json`` when present.

        Behaves exactly like the parent implementation, except that when no
        explicit ``tokenizer_file`` is supplied and
        ``pretrained_model_name_or_path`` is a local directory containing
        ``tokenizer.json``, that file is used directly. For remote Hub ids
        the candidate path simply won't exist and the parent logic applies.
        """
        if kwargs.get("tokenizer_file") is None:
            local_candidate = Path(str(pretrained_model_name_or_path)) / "tokenizer.json"
            if local_candidate.exists():
                kwargs["tokenizer_file"] = str(local_candidate)
        return super().from_pretrained(pretrained_model_name_or_path, *init_inputs, **kwargs)

    def __init__(self, tokenizer_file=None, **kwargs):
        """Initialise from ``tokenizer_file``, with sensible fallbacks.

        Resolution order for the tokenizer file:
        1. the explicit ``tokenizer_file`` argument;
        2. ``<name_or_path>/tokenizer.json`` if a ``name_or_path`` kwarg
           points at a directory containing one;
        3. the ``tokenizer.json`` shipped next to this module.
        """
        # BUG FIX: read -- do not pop -- "name_or_path". The transformers
        # base __init__ consumes this kwarg to set `self.name_or_path`;
        # popping it here silently dropped that attribute on the instance.
        name_or_path = kwargs.get("name_or_path")
        if tokenizer_file is None and name_or_path is not None:
            candidate = Path(name_or_path) / "tokenizer.json"
            if candidate.exists():
                tokenizer_file = str(candidate)
        if tokenizer_file is None:
            # Last resort: the tokenizer.json bundled alongside this file.
            tokenizer_file = str(Path(__file__).resolve().parent / "tokenizer.json")
        # Pin the model's special tokens unless the caller overrides them.
        kwargs.setdefault("bos_token", "<BOS>")
        kwargs.setdefault("eos_token", "<EOS>")
        kwargs.setdefault("unk_token", "<UNK>")
        kwargs.setdefault("pad_token", "<PAD>")
        super().__init__(tokenizer_file=tokenizer_file, **kwargs)