mindi-backup / final_model /tokenization_mindi.py
Mindigenous
Sync latest workspace state: data/scripts updates and archive cleanup
5ae3e12
"""
Hugging Face tokenizer class for MINDI 1.0 420M.
"""
from pathlib import Path
from transformers import PreTrainedTokenizerFast
class MindiTokenizer(PreTrainedTokenizerFast):
    """Fast tokenizer for MINDI backed by a single ``tokenizer.json`` file.

    The ``tokenizer.json`` file is located, in order of preference, from:

    1. an explicit ``tokenizer_file`` argument,
    2. a ``tokenizer.json`` inside the ``name_or_path`` directory, or
    3. a ``tokenizer.json`` sitting next to this module.

    ``<BOS>``, ``<EOS>``, ``<UNK>`` and ``<PAD>`` are used as the special
    tokens unless the caller supplies its own values.
    """

    # The only serialized file this tokenizer declares.
    vocab_files_names = {"tokenizer_file": "tokenizer.json"}
    model_input_names = ["input_ids", "attention_mask"]

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *init_inputs, **kwargs):
        """Load the tokenizer, preferring a local ``tokenizer.json`` if present.

        When the caller did not pass ``tokenizer_file`` (or passed ``None``)
        and ``pretrained_model_name_or_path`` is a directory containing
        ``tokenizer.json``, that file is injected as ``tokenizer_file`` before
        delegating to the parent implementation.

        Args:
            pretrained_model_name_or_path: Model identifier or local path;
                converted with ``str()`` before being used as a directory.
            *init_inputs: Forwarded unchanged to the parent ``from_pretrained``.
            **kwargs: Forwarded to the parent ``from_pretrained``; may gain a
                ``tokenizer_file`` entry as described above.

        Returns:
            Whatever the parent ``from_pretrained`` returns.
        """
        if kwargs.get("tokenizer_file") is None:
            local_candidate = Path(str(pretrained_model_name_or_path)) / "tokenizer.json"
            if local_candidate.exists():
                kwargs["tokenizer_file"] = str(local_candidate)
        return super().from_pretrained(pretrained_model_name_or_path, *init_inputs, **kwargs)

    def __init__(self, tokenizer_file=None, **kwargs):
        """Resolve the ``tokenizer.json`` location and initialise the tokenizer.

        Args:
            tokenizer_file: Path to the serialized tokenizer. When ``None``,
                it is looked up under ``name_or_path`` and then next to this
                module.
            **kwargs: Forwarded to the parent constructor. ``name_or_path`` is
                read (not consumed) to help locate ``tokenizer.json``; the
                special-token entries are filled in only when absent.
        """
        # Read without popping so the parent constructor still receives
        # ``name_or_path``; popping it would discard the value before the
        # base class can see it.
        name_or_path = kwargs.get("name_or_path")
        if name_or_path is None:
            # An explicit ``None`` carries no information; drop it so the
            # parent falls back to its own default.
            kwargs.pop("name_or_path", None)
        elif tokenizer_file is None:
            candidate = Path(str(name_or_path)) / "tokenizer.json"
            if candidate.exists():
                tokenizer_file = str(candidate)

        if tokenizer_file is None:
            # Last resort: the tokenizer.json located beside this source file.
            tokenizer_file = str(Path(__file__).resolve().parent / "tokenizer.json")

        # Default special tokens; caller-supplied values take precedence.
        kwargs.setdefault("bos_token", "<BOS>")
        kwargs.setdefault("eos_token", "<EOS>")
        kwargs.setdefault("unk_token", "<UNK>")
        kwargs.setdefault("pad_token", "<PAD>")
        super().__init__(tokenizer_file=tokenizer_file, **kwargs)