| import os |
| from typing import Optional, Dict, Any, Tuple |
|
|
| import sentencepiece |
| from torch import TensorType |
| from transformers import PreTrainedTokenizer |
| from transformers.models.deberta_v2.tokenization_deberta_v2 import SPMTokenizer |
| from transformers.tokenization_utils_base import TextInput, PreTokenizedInput, TruncationStrategy |
| from transformers.utils import PaddingStrategy |
|
|
|
|
class QiDeBERTaTokenizer(PreTrainedTokenizer):
    """
    SentencePiece-based tokenizer for QiDeBERTa, built on DeBERTa-v2's `SPMTokenizer`.

    Sequences are assembled in the DeBERTa format:

    - single sequence: ``[CLS] X [SEP]``
    - pair of sequences: ``[CLS] A [SEP] B [SEP]``
    """

    vocab_files_names = {"spm_model_file": "spm.model"}

    def __init__(
        self,
        spm_model_file: str,
        bos_token: str = '<s>',
        eos_token: str = '</s>',
        unk_token: str = '<unk>',
        sep_token: str = '</s>',
        pad_token: str = '<pad>',
        cls_token: str = '<s>',
        mask_token: str = '<mask>',
        do_lower_case: bool = False,
        split_by_punct: bool = False,
        sp_model_kwargs: Optional[Dict[str, Any]] = None,
        **kwargs,
    ):
        """
        Args:
            spm_model_file (`str`):
                Path to the SentencePiece model file; must point to an existing file.
            bos_token / eos_token / unk_token / sep_token / pad_token / cls_token / mask_token (`str`):
                Surface forms of the special tokens, forwarded to the base tokenizer.
            do_lower_case (`bool`, *optional*, defaults to `False`):
                Whether to lowercase input text before tokenizing.
            split_by_punct (`bool`, *optional*, defaults to `False`):
                Forwarded to `SPMTokenizer`; whether to split tokens on punctuation.
            sp_model_kwargs (`Dict[str, Any]`, *optional*):
                Extra keyword arguments for the underlying `sentencepiece.SentencePieceProcessor`.

        Raises:
            ValueError: If `spm_model_file` does not exist on disk.
        """
        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs

        if not os.path.isfile(spm_model_file):
            raise ValueError(
                f"Can't find a vocabulary file at path '{spm_model_file}'. To load the vocabulary from a Google pretrained"
                " model use `tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
            )
        self.do_lower_case = do_lower_case
        self.split_by_punct = split_by_punct
        self.spm_model_file = spm_model_file
        # The wrapped tokenizer must exist before super().__init__ runs, because the
        # base class may call back into _tokenize()/_convert_token_to_id() during init.
        self._tokenizer = SPMTokenizer(
            spm_model_file, None, split_by_punct=split_by_punct, sp_model_kwargs=self.sp_model_kwargs
        )

        super().__init__(
            do_lower_case=do_lower_case,
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            sep_token=sep_token,
            pad_token=pad_token,
            cls_token=cls_token,
            mask_token=mask_token,
            split_by_punct=split_by_punct,
            sp_model_kwargs=self.sp_model_kwargs,
            **kwargs,
        )
        # Make the wrapped SPMTokenizer aware of every special token (including any
        # added via kwargs) so it treats them atomically.
        self._tokenizer.special_tokens = self.all_special_tokens
        # Id of the standalone SentencePiece word-boundary piece '▁'.
        self.space_token_id = self._tokenizer.spm.PieceToId('▁')

    @property
    def vocab_size(self) -> int:
        """Size of the base (non-added) vocabulary."""
        return len(self.vocab)

    @property
    def vocab(self) -> Dict[str, int]:
        """Token-to-id mapping maintained by the wrapped `SPMTokenizer`."""
        return self._tokenizer.vocab

    def __call__(
        self,
        text: TextInput|PreTokenizedInput|list[TextInput]|list[PreTokenizedInput],
        text_pair: Optional[TextInput|PreTokenizedInput|list[TextInput]|list[PreTokenizedInput]] = None,
        text_target: Optional[TextInput|PreTokenizedInput|list[TextInput]|list[PreTokenizedInput]] = None,
        text_pair_target: Optional[TextInput|PreTokenizedInput|list[TextInput]|list[PreTokenizedInput]] = None,
        add_special_tokens: bool = True,
        padding: bool|str|PaddingStrategy = False,
        truncation: Optional[bool|str|TruncationStrategy] = None,
        max_length: Optional[int] = None,
        stride: int = 0,
        is_split_into_words: bool = False,
        pad_to_multiple_of: Optional[int] = None,
        padding_side: Optional[str] = None,
        return_tensors: str|TensorType = 'pt',
        return_token_type_ids: bool = False,
        return_attention_mask: bool = True,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        **kwargs,
    ):
        """
        Encode one or more (pairs of) sequences.

        Thin pass-through to `PreTrainedTokenizer.__call__`; it only changes the
        defaults: PyTorch tensors (`return_tensors='pt'`), attention mask returned,
        token type ids not returned.
        """
        return super().__call__(
            text=text,
            text_pair=text_pair,
            text_target=text_target,
            text_pair_target=text_pair_target,
            add_special_tokens=add_special_tokens,
            padding=padding,
            truncation=truncation,
            max_length=max_length,
            stride=stride,
            is_split_into_words=is_split_into_words,
            pad_to_multiple_of=pad_to_multiple_of,
            padding_side=padding_side,
            return_tensors=return_tensors,
            return_token_type_ids=return_token_type_ids,
            return_attention_mask=return_attention_mask,
            return_overflowing_tokens=return_overflowing_tokens,
            return_special_tokens_mask=return_special_tokens_mask,
            return_offsets_mapping=return_offsets_mapping,
            return_length=return_length,
            verbose=verbose,
            **kwargs,
        )

    def get_vocab(self) -> Dict[str, int]:
        """Return the full vocabulary: base vocab plus tokens added after loading."""
        vocab = self.vocab.copy()
        vocab.update(self.get_added_vocab())
        return vocab

    def _tokenize(self, text: str) -> list[str]:
        """Take as input a string and return a list of strings (tokens) for words/sub-words"""
        if self.do_lower_case:
            text = text.lower()
        return self._tokenizer.tokenize(text)

    def tokenize(self, text: TextInput, **kwargs) -> list[str]:
        """
        Tokenize `text`, dropping a leading standalone word-boundary piece ('▁')
        if the underlying SentencePiece model emits one.
        """
        result = super().tokenize(text, **kwargs)
        # Guard against an empty token list (e.g. empty or whitespace-only input);
        # unconditional `result[0]` would raise IndexError.
        return result[1:] if result and result[0] == '▁' else result

    def _convert_token_to_id(self, token: str) -> int:
        """Converts a token (str) in an id using the vocab."""
        return self._tokenizer.spm.PieceToId(token)

    def _convert_id_to_token(self, index: int) -> str:
        """Converts an index (integer) in a token (str) using the vocab; out-of-range ids map to the unk token."""
        return self._tokenizer.spm.IdToPiece(index) if index < self.vocab_size else self.unk_token

    def convert_tokens_to_string(self, tokens) -> str:
        """Converts a sequence of tokens (string) in a single string."""
        return self._tokenizer.decode(tokens)

    def build_inputs_with_special_tokens(self, token_ids_0: list[int]|str, token_ids_1: Optional[list[int]|str] = None):
        """
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
        adding special tokens. A DeBERTa sequence has the following format:

        - single sequence: [CLS] X [SEP]
        - pair of sequences: [CLS] A [SEP] B [SEP]

        Args:
            token_ids_0 (`List[int]` or `str`):
                List of IDs to which the special tokens will be added. A raw string is
                accepted too and is encoded on the fly.
            token_ids_1 (`List[int]` or `str`, *optional*):
                Optional second list of IDs (or raw string) for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        """
        cls = [self.cls_token_id]
        sep = [self.sep_token_id]

        if isinstance(token_ids_0, str):
            # [1:] drops the first encoded id — presumably the leading word-boundary
            # piece produced by the spm model — TODO confirm against the model.
            token_ids_0 = self._tokenizer.spm.encode_as_ids(token_ids_0)[1:]

        if token_ids_1 is None:
            return cls + token_ids_0 + sep
        else:
            if isinstance(token_ids_1, str):
                token_ids_1 = self._tokenizer.spm.encode_as_ids(token_ids_1)[1:]

            return cls + token_ids_0 + sep + token_ids_1 + sep

    def get_special_tokens_mask(self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False):
        """
        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether the token list is already formatted with the model's special tokens.

        Returns:
            `List[int]`: List of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """

        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        if token_ids_1 is not None:
            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
        return [1] + ([0] * len(token_ids_0)) + [1]

    def create_token_type_ids_from_sequences(self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None):
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. A DeBERTa
        sequence pair mask has the following format:

        ```
        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence |
        ```

        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]
        if token_ids_1 is None:
            return len(cls + token_ids_0 + sep) * [0]
        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]

    def prepare_for_tokenization(self, text: str, is_split_into_words: bool = False, **kwargs):
        """
        Pre-process `text` before tokenization: prepend a space when the input is
        pre-split into words or when `add_prefix_space=True` is passed.

        Returns:
            `Tuple[str, dict]`: The (possibly modified) text and the remaining kwargs.
        """
        add_prefix_space = kwargs.pop("add_prefix_space", False)
        if is_split_into_words or add_prefix_space:
            text = " " + text
        return (text, kwargs)

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """Save the SentencePiece model into `save_directory`; returns the written path(s)."""
        return self._tokenizer.save_pretrained(save_directory, filename_prefix=filename_prefix)

    def _get_bos_piece(self) -> str:
        """Return the surface form (piece) of the BOS token from the spm model."""
        return self._tokenizer.spm.IdToPiece(self._tokenizer.spm.bos_id())

    def _get_eos_piece(self) -> str:
        """Return the surface form (piece) of the EOS token from the spm model."""
        return self._tokenizer.spm.IdToPiece(self._tokenizer.spm.eos_id())

    def processor(self) -> sentencepiece.SentencePieceProcessor:
        """Expose the underlying `SentencePieceProcessor` instance."""
        return self._tokenizer.spm
|
|