File size: 1,438 Bytes
5ae3e12
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
"""
Hugging Face tokenizer class for MINDI 1.0 420M.
"""

from pathlib import Path
from transformers import PreTrainedTokenizerFast


class MindiTokenizer(PreTrainedTokenizerFast):
    """Fast tokenizer for MINDI 1.0 420M.

    Thin wrapper around :class:`PreTrainedTokenizerFast` whose only job is
    locating ``tokenizer.json``: an explicitly passed ``tokenizer_file`` wins,
    then a ``tokenizer.json`` sitting next to the model path, and finally the
    copy shipped alongside this module.
    """

    # Tells the HF loader which file backs this tokenizer.
    vocab_files_names = {"tokenizer_file": "tokenizer.json"}
    # Tensor names produced by __call__ for model consumption.
    model_input_names = ["input_ids", "attention_mask"]

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *init_inputs, **kwargs):
        """Load the tokenizer, preferring a local ``tokenizer.json`` beside the model path.

        Only fills in ``tokenizer_file`` when the caller did not supply one and
        the candidate file actually exists on disk (hub-style IDs simply fall
        through to the default resolution).
        """
        if kwargs.get("tokenizer_file") is None:
            sibling = Path(str(pretrained_model_name_or_path)) / "tokenizer.json"
            if sibling.exists():
                kwargs["tokenizer_file"] = str(sibling)
        return super().from_pretrained(pretrained_model_name_or_path, *init_inputs, **kwargs)

    def __init__(self, tokenizer_file=None, **kwargs):
        """Initialize from ``tokenizer_file``, resolving it if not given.

        Resolution order: explicit argument -> ``tokenizer.json`` under the
        popped ``name_or_path`` kwarg (if present on disk) -> ``tokenizer.json``
        next to this source file. Special-token defaults are applied without
        overriding caller-supplied values.
        """
        # Consumed here so it is not forwarded to the base initializer.
        origin = kwargs.pop("name_or_path", None)
        if tokenizer_file is None:
            if origin is not None:
                probe = Path(origin) / "tokenizer.json"
                if probe.exists():
                    tokenizer_file = str(probe)
            if tokenizer_file is None:
                # Last resort: the tokenizer.json bundled with this module.
                tokenizer_file = str(Path(__file__).resolve().parent / "tokenizer.json")
        for token_kwarg, default_token in (
            ("bos_token", "<BOS>"),
            ("eos_token", "<EOS>"),
            ("unk_token", "<UNK>"),
            ("pad_token", "<PAD>"),
        ):
            kwargs.setdefault(token_kwarg, default_token)
        super().__init__(tokenizer_file=tokenizer_file, **kwargs)