# tokenization_tinytransformer.py (most complete fixed version)

import json
import os
from typing import Dict, List, Optional

from transformers import PreTrainedTokenizer

class TinyTokenizer(PreTrainedTokenizer):
    vocab_files_names = {"vocab_file": "vocab.json"}
    pretrained_vocab_files_map = {}
    max_model_input_sizes = {"tinytransformer": 512}
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(self, vocab_file: Optional[str] = None, **kwargs):
        # Special tokens
        self.special_tokens = {
            "[PAD]": 0,
            "[UNK]": 1,
            "[CLS]": 2,
            "[SEP]": 3,
        }

        # Build the vocab. `vocab_file` is accepted for from_pretrained
        # compatibility, but the vocab is rebuilt deterministically here.
        self.vocab: Dict[str, int] = self.special_tokens.copy()
        offset = len(self.vocab)

        # Printable ASCII characters
        for i in range(32, 127):
            char = chr(i)
            self.vocab[char] = offset + i - 32

        # Chinese support (CJK Unified Ideographs range; extend as needed)
        for i in range(0x4E00, 0x9FFF + 1):
            char = chr(i)
            if char not in self.vocab:
                self.vocab[char] = len(self.vocab)

        self.id_to_token = {v: k for k, v in self.vocab.items()}

        # The special token ids (0-3) are resolved by the base class from the
        # token strings below, via their positions in the vocab.
        super().__init__(
            pad_token="[PAD]",
            unk_token="[UNK]",
            cls_token="[CLS]",
            sep_token="[SEP]",
            **kwargs,
        )

    def get_vocab(self) -> Dict[str, int]:
        return self.vocab.copy()

    @property
    def vocab_size(self) -> int:
        return len(self.vocab)

    def _tokenize(self, text: str) -> List[str]:
        # Character-level tokenization
        return list(text)

    # The base class's public convert_tokens_to_ids / convert_ids_to_tokens
    # dispatch to these hooks for both single tokens and lists of tokens.
    def _convert_token_to_id(self, token: str) -> int:
        return self.vocab.get(token, self.special_tokens["[UNK]"])

    def _convert_id_to_token(self, index: int) -> str:
        return self.id_to_token.get(index, "[UNK]")

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        if token_ids_1 is None:
            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
        return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + token_ids_1 + [self.sep_token_id]

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        # [CLS] tokens_0 [SEP] -> segment 0; tokens_1 [SEP] -> segment 1
        if token_ids_1 is None:
            return [0] * (len(token_ids_0) + 2)
        return [0] * (len(token_ids_0) + 2) + [1] * (len(token_ids_1) + 1)

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple:
        prefix = filename_prefix + "-" if filename_prefix else ""
        vocab_path = os.path.join(save_directory, prefix + "vocab.json")
        with open(vocab_path, "w", encoding="utf-8") as f:
            json.dump(self.vocab, f, ensure_ascii=False, indent=2)
        return (vocab_path,)
    

# # At the very bottom of the file, after the class definition
# from transformers import TOKENIZER_MAPPING_NAMES

# # Register the tokenizer (runs only once, so this is the safest place for it)
# TOKENIZER_MAPPING_NAMES["tinytransformer"] = "TinyTokenizer"
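
# A minimal usage sketch, assuming this file is executed directly. It only
# exercises the character-level vocab built above; the padding/truncation
# values are illustrative, not requirements of the tokenizer.
if __name__ == "__main__":
    tokenizer = TinyTokenizer()

    text = "Hello, 世界!"
    encoded = tokenizer(text, padding="max_length", max_length=16, truncation=True)
    print(encoded["input_ids"])       # [CLS]=2 ... [SEP]=3, padded with [PAD]=0 up to max_length
    print(encoded["attention_mask"])  # 1 for real tokens, 0 for padding

    # Round-trip ids back to characters (special tokens included)
    print(tokenizer.convert_ids_to_tokens(encoded["input_ids"]))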