Spaces:
Running
Running
| # coding=utf-8 | |
| # Copyright 2020 The HuggingFace Inc. team. | |
| # | |
| # Licensed under the Apache License, Version 2.0 (the "License"); | |
| # you may not use this file except in compliance with the License. | |
| # You may obtain a copy of the License at | |
| # | |
| # http://www.apache.org/licenses/LICENSE-2.0 | |
| # | |
| # Unless required by applicable law or agreed to in writing, software | |
| # distributed under the License is distributed on an "AS IS" BASIS, | |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| # See the License for the specific language governing permissions and | |
| # limitations under the License. | |
""" Base classes common to both the slow and the fast tokenization classes:
    PreTrainedTokenizerBase (hosts all the user-facing encoding methods),
    SpecialTokensMixin (hosts the special tokens logic) and
    BatchEncoding (wraps the dictionary of outputs with special methods for the fast tokenizers)
"""
| import copy | |
| import json | |
| import logging | |
| import os | |
| import warnings | |
| from collections import UserDict | |
| from enum import Enum | |
| from typing import Any, Dict, List, NamedTuple, Optional, Sequence, Tuple, Union | |
| import numpy as np | |
| from tokenizers import AddedToken | |
| from tokenizers import Encoding as EncodingFast | |
| from .file_utils import ( | |
| add_end_docstrings, | |
| cached_path, | |
| hf_bucket_url, | |
| is_remote_url, | |
| is_tf_available, | |
| is_torch_available, | |
| torch_required, | |
| ) | |
| if is_tf_available(): | |
| import tensorflow as tf | |
| if is_torch_available(): | |
| import torch | |
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Stand-in for "no limit": used as the max input length of a model that accepts inputs of any size.
VERY_LARGE_INTEGER = int(1e30)
# For when something big is needed that is still smaller than VERY_LARGE_INTEGER.
LARGE_INTEGER = int(1e20)

# Type aliases for the inputs accepted by the tokenizers.
# Single sequences: raw text, a list of words, or a list of token ids.
TextInput = str
PreTokenizedInput = List[str]
EncodedInput = List[int]
# Pairs of sequences, in the same three flavours.
TextInputPair = Tuple[str, str]
PreTokenizedInputPair = Tuple[List[str], List[str]]
EncodedInputPair = Tuple[List[int], List[int]]

# File names used when a slow tokenizer is saved (historically three separate files).
SPECIAL_TOKENS_MAP_FILE = "special_tokens_map.json"
ADDED_TOKENS_FILE = "added_tokens.json"
TOKENIZER_CONFIG_FILE = "tokenizer_config.json"

# Fast tokenizers (HuggingFace `tokenizers` library) can be saved in a single file.
FULL_TOKENIZER_FILE = "tokenizer.json"
class ExplicitEnum(Enum):
    """Enum with a more explicit error message for missing values."""

    # `Enum` looks this hook up on the class and calls it as ``cls._missing_(value)``,
    # so it has to be a classmethod; as a plain function the call fails with a
    # TypeError instead of producing the message below.
    @classmethod
    def _missing_(cls, value):
        """Called by ``Enum`` when ``value`` matches no member.

        Raises:
            ValueError: always, listing the values that would have been accepted.
        """
        raise ValueError(
            "%r is not a valid %s, please select one of %s"
            % (value, cls.__name__, str(list(cls._value2member_map_.keys())))
        )
class TruncationStrategy(ExplicitEnum):
    """String-valued options naming the available truncation behaviours."""

    ONLY_FIRST = "only_first"
    ONLY_SECOND = "only_second"
    LONGEST_FIRST = "longest_first"
    DO_NOT_TRUNCATE = "do_not_truncate"
class PaddingStrategy(ExplicitEnum):
    """String-valued options naming the available padding behaviours."""

    LONGEST = "longest"
    MAX_LENGTH = "max_length"
    DO_NOT_PAD = "do_not_pad"
class TensorType(ExplicitEnum):
    """Short string codes for the tensor frameworks outputs can be converted to."""

    PYTORCH = "pt"
    TENSORFLOW = "tf"
    NUMPY = "np"
class CharSpan(NamedTuple):
    """Half-open character span ``[start, end)`` in the original string.

    Args:
        start: index of the first character in the original string
        end: index of the character following the last character in the original string
    """

    start: int
    end: int
class TokenSpan(NamedTuple):
    """Half-open token span ``[start, end)`` in an encoded string (list of tokens).

    Args:
        start: index of the first token in the span
        end: index of the token following the last token in the span
    """

    start: int
    end: int
class BatchEncoding(UserDict):
    """Hold the output of the encode and batch_encode methods (tokens, attention_masks, etc).

    This class is derived from a Python dictionary and can be used as a dictionary.
    In addition, it exposes utility methods to map from word/char space to token space.

    Args:
        data (:obj:`dict`):
            Dictionary of lists/arrays returned by the encode/batch_encode methods
            ('input_ids', 'attention_mask'...)
        encoding (:obj:`EncodingFast`, :obj:`list(EncodingFast)`, `optional`, defaults to :obj:`None`):
            If the tokenizer is a fast tokenizer which outputs additional information such as the mapping
            from word/char space to token space, the `EncodingFast` instance or list of instances
            (for batches) holds this information.
        tensor_type (:obj:`Union[None, str, TensorType]`, `optional`, defaults to :obj:`None`):
            You can give a tensor_type here to convert the lists of integers to PyTorch/TF/Numpy tensors
            at initialization.
        prepend_batch_axis (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Set to True to add a batch axis when converting to tensors (see :obj:`tensor_type` above).
    """

    def __init__(
        self,
        data: Optional[Dict[str, Any]] = None,
        encoding: Optional[Union[EncodingFast, Sequence[EncodingFast]]] = None,
        tensor_type: Union[None, str, TensorType] = None,
        prepend_batch_axis: bool = False,
    ):
        super().__init__(data)

        # Always store the backend encodings as a list (or None), even for a single sequence.
        if isinstance(encoding, EncodingFast):
            encoding = [encoding]
        self._encodings = encoding

        self.convert_to_tensors(tensor_type=tensor_type, prepend_batch_axis=prepend_batch_axis)

    @property
    def is_fast(self):
        """
        Indicate if this BatchEncoding was generated from the result of a PreTrainedTokenizerFast

        Returns: True if backend encodings were provided at construction, False otherwise
        """
        return self._encodings is not None

    def __getitem__(self, item: Union[int, str]) -> Union[Any, EncodingFast]:
        """ If the key is a string, get the value of the dict associated to `key` ('input_ids', 'attention_mask'...)
            If the key is an integer, get the EncodingFast for batch item with index `key`

        Raises:
            KeyError: if `item` is not a string and no backend encodings are available,
                or if `item` is a string that is not a key of the dict.
        """
        if isinstance(item, str):
            return self.data[item]
        elif self._encodings is not None:
            return self._encodings[item]
        else:
            raise KeyError(
                "Indexing with integers (to access backend Encoding for a given batch index) "
                "is not available when using Python based tokenizers"
            )

    def __getattr__(self, item: str):
        """Expose the dict entries as attributes (``batch.input_ids``).

        Only called when regular attribute lookup fails.

        Raises:
            AttributeError: if `item` is not a key of the dict.
        """
        # `data` itself can be missing on a half-built instance (e.g. created through
        # `__new__` while copying); looking it up through `self.data` would recurse forever.
        if item == "data":
            raise AttributeError(item)
        try:
            return self.data[item]
        except KeyError:
            raise AttributeError(item) from None

    def __getstate__(self):
        """Return the picklable state: the dict content and the backend encodings."""
        return {"data": self.data, "encodings": self._encodings}

    def __setstate__(self, state):
        """Restore whichever of ``data`` / ``encodings`` is present in `state`."""
        if "data" in state:
            self.data = state["data"]

        if "encodings" in state:
            self._encodings = state["encodings"]

    def keys(self):
        return self.data.keys()

    def values(self):
        return self.data.values()

    def items(self):
        return self.data.items()

    # After this point:
    # Extended properties and methods only available for fast (Rust-based) tokenizers
    # provided by HuggingFace tokenizers library.

    @staticmethod
    def _split_batch_index(first: int, second: Optional[int]) -> Tuple[int, int]:
        """Resolve the ``f(index)`` / ``f(batch_index, index)`` calling convention into ``(batch_index, index)``."""
        if second is not None:
            return first, second
        return 0, first

    @property
    def encodings(self) -> Optional[List[EncodingFast]]:
        """
        Return the list of all encodings from the tokenization process

        Returns: List[EncodingFast] or None if input was tokenized through Python (i.e. not fast) tokenizer
        """
        return self._encodings

    def tokens(self, batch_index: int = 0) -> List[str]:
        """Return the ``tokens`` of the backend encoding at `batch_index`.

        Raises:
            ValueError: if no backend encodings are available.
        """
        if not self._encodings:
            raise ValueError("tokens() is not available when using Python based tokenizers")
        return self._encodings[batch_index].tokens

    def words(self, batch_index: int = 0) -> List[Optional[int]]:
        """Return the ``words`` of the backend encoding at `batch_index`.

        Raises:
            ValueError: if no backend encodings are available.
        """
        if not self._encodings:
            raise ValueError("words() is not available when using Python based tokenizers")
        return self._encodings[batch_index].words

    def token_to_word(self, batch_or_token_index: int, token_index: Optional[int] = None) -> int:
        """
        Get the index of the word corresponding to (i.e. comprising) an encoded token
        in a sequence of the batch.

        Can be called as:

        - ``self.token_to_word(token_index)`` if batch size is 1
        - ``self.token_to_word(batch_index, token_index)`` if batch size is greater than 1

        This method is particularly suited when the input sequences are provided as
        pre-tokenized sequences (i.e. words are defined by the user). In this case it allows
        to easily associate encoded tokens with provided tokenized words.

        Args:
            batch_or_token_index (:obj:`int`):
                Index of the sequence in the batch. If the batch only comprises one sequence,
                this can be the index of the token in the sequence
            token_index (:obj:`int`, `optional`):
                If a batch index is provided in `batch_or_token_index`, this can be the index
                of the token in the sequence.

        Returns:
            :obj:`int`:
                index of the word in the input sequence.

        Raises:
            ValueError: if no backend encodings are available.
        """
        if not self._encodings:
            raise ValueError("token_to_word() is not available when using Python based tokenizers")
        batch_index, token_index = self._split_batch_index(batch_or_token_index, token_index)
        # Negative indices count from the end. The sizes are read from the encodings
        # themselves: this class never defines `_batch_size` / `_seq_len`.
        if batch_index < 0:
            batch_index = len(self._encodings) + batch_index
        if token_index < 0:
            token_index = len(self._encodings[batch_index].tokens) + token_index
        return self._encodings[batch_index].token_to_word(token_index)

    def word_to_tokens(self, batch_or_word_index: int, word_index: Optional[int] = None) -> TokenSpan:
        """
        Get the encoded token span corresponding to a word in the sequence of the batch.

        Token spans are returned as a TokenSpan NamedTuple with:

        - start: index of the first token
        - end: index of the token following the last token

        Can be called as:

        - ``self.word_to_tokens(word_index)`` if batch size is 1
        - ``self.word_to_tokens(batch_index, word_index)`` if batch size is greater or equal to 1

        This method is particularly suited when the input sequences are provided as
        pre-tokenized sequences (i.e. words are defined by the user). In this case it allows
        to easily associate encoded tokens with provided tokenized words.

        Args:
            batch_or_word_index (:obj:`int`):
                Index of the sequence in the batch. If the batch only comprises one sequence,
                this can be the index of the word in the sequence
            word_index (:obj:`int`, `optional`):
                If a batch index is provided in `batch_or_word_index`, this can be the index
                of the word in the sequence.

        Returns:
            :obj:`TokenSpan`:
                Span of tokens in the encoded sequence.

                :obj:`TokenSpan` are NamedTuple with:

                - start: index of the first token
                - end: index of the token following the last token

        Raises:
            ValueError: if no backend encodings are available.
        """
        if not self._encodings:
            raise ValueError("word_to_tokens() is not available when using Python based tokenizers")
        batch_index, word_index = self._split_batch_index(batch_or_word_index, word_index)
        if batch_index < 0:
            batch_index = len(self._encodings) + batch_index
        if word_index < 0:
            # Negative word indices count from the last word. The number of words is taken
            # as (highest word id + 1) among the per-token word ids, skipping the None
            # entries. NOTE(review): assumes word ids are 0-based and contiguous — confirm.
            word_ids = [w for w in self._encodings[batch_index].words if w is not None]
            word_count = max(word_ids) + 1 if word_ids else 0
            word_index = word_count + word_index
        return TokenSpan(*(self._encodings[batch_index].word_to_tokens(word_index)))

    def token_to_chars(self, batch_or_token_index: int, token_index: Optional[int] = None) -> CharSpan:
        """
        Get the character span corresponding to an encoded token in a sequence of the batch.

        Character spans are returned as a CharSpan NamedTuple with:

        - start: index of the first character in the original string associated to the token
        - end: index of the character following the last character in the original string associated to the token

        Can be called as:

        - ``self.token_to_chars(token_index)`` if batch size is 1
        - ``self.token_to_chars(batch_index, token_index)`` if batch size is greater or equal to 1

        Args:
            batch_or_token_index (:obj:`int`):
                Index of the sequence in the batch. If the batch only comprises one sequence,
                this can be the index of the token in the sequence
            token_index (:obj:`int`, `optional`):
                If a batch index is provided in `batch_or_token_index`, this can be the index
                of the token in the sequence.

        Returns:
            :obj:`CharSpan`:
                Span of characters in the original string.

                :obj:`CharSpan` are NamedTuple with:

                - start: index of the first character in the original string
                - end: index of the character following the last character in the original string

        Raises:
            ValueError: if no backend encodings are available.
        """
        if not self._encodings:
            raise ValueError("token_to_chars() is not available when using Python based tokenizers")
        batch_index, token_index = self._split_batch_index(batch_or_token_index, token_index)
        return CharSpan(*(self._encodings[batch_index].token_to_chars(token_index)))

    def char_to_token(self, batch_or_char_index: int, char_index: Optional[int] = None) -> int:
        """
        Get the index of the token in the encoded output comprising a character
        in the original string for a sequence of the batch.

        Can be called as:

        - ``self.char_to_token(char_index)`` if batch size is 1
        - ``self.char_to_token(batch_index, char_index)`` if batch size is greater or equal to 1

        This method is particularly suited when the input sequences are provided as
        pre-tokenized sequences (i.e. words are defined by the user). In this case it allows
        to easily associate encoded tokens with provided tokenized words.

        Args:
            batch_or_char_index (:obj:`int`):
                Index of the sequence in the batch. If the batch only comprises one sequence,
                this can be the index of the character in the original string
            char_index (:obj:`int`, `optional`):
                If a batch index is provided in `batch_or_char_index`, this can be the index
                of the character in the original string.

        Returns:
            :obj:`int`: Index of the token.

        Raises:
            ValueError: if no backend encodings are available.
        """
        if not self._encodings:
            raise ValueError("char_to_token() is not available when using Python based tokenizers")
        batch_index, char_index = self._split_batch_index(batch_or_char_index, char_index)
        return self._encodings[batch_index].char_to_token(char_index)

    def word_to_chars(self, batch_or_word_index: int, word_index: Optional[int] = None) -> CharSpan:
        """
        Get the character span in the original string corresponding to a given word in a sequence
        of the batch.

        Character spans are returned as a CharSpan NamedTuple with:

        - start: index of the first character in the original string
        - end: index of the character following the last character in the original string

        Can be called as:

        - ``self.word_to_chars(word_index)`` if batch size is 1
        - ``self.word_to_chars(batch_index, word_index)`` if batch size is greater or equal to 1

        Args:
            batch_or_word_index (:obj:`int`):
                Index of the sequence in the batch. If the batch only comprises one sequence,
                this can be the index of the word in the sequence
            word_index (:obj:`int`, `optional`):
                If a batch index is provided in `batch_or_word_index`, this can be the index
                of the word in the sequence.

        Returns:
            :obj:`CharSpan`:
                Span of the characters of the word in the original string.
                CharSpan are NamedTuple with:

                - start: index of the first character of the word in the original string
                - end: index of the character following the last character of the word in the original string

        Raises:
            ValueError: if no backend encodings are available.
        """
        if not self._encodings:
            raise ValueError("word_to_chars() is not available when using Python based tokenizers")
        batch_index, word_index = self._split_batch_index(batch_or_word_index, word_index)
        return CharSpan(*(self._encodings[batch_index].word_to_chars(word_index)))

    def char_to_word(self, batch_or_char_index: int, char_index: Optional[int] = None) -> int:
        """
        Get the word in the original string corresponding to a character in the original string of
        a sequence of the batch.

        Can be called as:

        - ``self.char_to_word(char_index)`` if batch size is 1
        - ``self.char_to_word(batch_index, char_index)`` if batch size is greater than 1

        This method is particularly suited when the input sequences are provided as
        pre-tokenized sequences (i.e. words are defined by the user). In this case it allows
        to easily associate encoded tokens with provided tokenized words.

        Args:
            batch_or_char_index (:obj:`int`):
                Index of the sequence in the batch. If the batch only comprises one sequence,
                this can be the index of the character in the original string.
            char_index (:obj:`int`, `optional`):
                If a batch index is provided in `batch_or_char_index`, this can be the index
                of the character in the original string.

        Returns:
            :obj:`int`:
                Index of the word comprising the character.

        Raises:
            ValueError: if no backend encodings are available.
        """
        if not self._encodings:
            raise ValueError("char_to_word() is not available when using Python based tokenizers")
        batch_index, char_index = self._split_batch_index(batch_or_char_index, char_index)
        return self._encodings[batch_index].char_to_word(char_index)

    def convert_to_tensors(self, tensor_type: Union[None, str, TensorType], prepend_batch_axis: bool = False):
        """Convert every value of the dict, in place, to tensors of the requested framework.

        Args:
            tensor_type: ``None`` (no conversion), a :class:`TensorType` or one of its string values.
            prepend_batch_axis: wrap each value in a one-element list before converting it.

        Returns:
            ``self``, to allow chaining.

        Raises:
            ImportError: if the requested framework is not available.
            ValueError: if a value cannot be turned into a tensor (e.g. ragged lists).
        """
        if tensor_type is None:
            return self

        # Convert to TensorType
        if not isinstance(tensor_type, TensorType):
            tensor_type = TensorType(tensor_type)

        # Get a function reference for the correct framework
        if tensor_type == TensorType.TENSORFLOW and is_tf_available():
            as_tensor = tf.constant
        elif tensor_type == TensorType.PYTORCH and is_torch_available():
            as_tensor = torch.tensor
        elif tensor_type == TensorType.NUMPY:
            as_tensor = np.asarray
        else:
            raise ImportError(
                "Unable to convert output to tensors format {}, PyTorch or TensorFlow is not available.".format(
                    tensor_type
                )
            )

        # Do the tensor conversion in batch
        for key, value in self.items():
            try:
                if prepend_batch_axis:
                    value = [value]

                tensor = as_tensor(value)

                # at-least2d
                if tensor.ndim > 2:
                    tensor = tensor.squeeze(0)
                elif tensor.ndim < 2:
                    tensor = tensor[None, :]

                self[key] = tensor
            except Exception as err:
                # `Exception` rather than a bare except, so KeyboardInterrupt/SystemExit are
                # not turned into a ValueError; the original cause is kept for debugging.
                raise ValueError(
                    "Unable to create tensor, you should probably activate truncation and/or padding "
                    "with 'padding=True' 'truncation=True' to have batched tensors with the same length."
                ) from err

        return self

    @torch_required
    def to(self, device: str):
        """Send all values to device by calling v.to(device)"""
        self.data = {k: v.to(device) for k, v in self.data.items()}
        return self
| # class AddedToken(UserString): | |
| # """ AddedToken represents a token to be added to a Tokenizer | |
| # An AddedToken can have special options defining the way it should behave. | |
| # Args: | |
| # content: str: | |
| # The content of the token | |
| # single_word: bool | |
| # Whether this token should only match against single word. If True, | |
| # this token will never match inside of a word. | |
| # lstrip: bool | |
| # Whether this token should strip all potential whitespaces on the left side. | |
| # If True, this token will greedily match any whitespace on the left and then strip | |
| # them out. | |
| # rstrip: bool | |
| # Whether this token should strip all potential whitespaces on the right side. | |
| # If True, this token will greedily match any whitespace on the right and then strip | |
| # them out. | |
| # """ | |
| # def __init__( | |
| # self, data: str, single_word: bool = False, lstrip: bool = False, rstrip: bool = False, | |
| # ): | |
| # super().__init__(data) | |
| # self._single_word = single_word | |
| # self._lstrip = lstrip | |
| # self._rstrip = rstrip | |
| # def lower(self): | |
| # return AddedToken(self.data.lower(), self._single_word, self._lstrip, self._rstrip) | |
| class SpecialTokensMixin: | |
    """ SpecialTokensMixin is derived by ``PreTrainedTokenizer`` and ``PreTrainedTokenizerFast`` and
    handles specific behaviors related to special tokens. In particular, this class holds the
    attributes which can be used to directly access these special tokens in a
    model-independent manner and allows setting and updating the special tokens.
    """
| SPECIAL_TOKENS_ATTRIBUTES = [ | |
| "bos_token", | |
| "eos_token", | |
| "unk_token", | |
| "sep_token", | |
| "pad_token", | |
| "cls_token", | |
| "mask_token", | |
| "additional_special_tokens", | |
| ] | |
def __init__(self, verbose=True, **kwargs):
    """Initialise every special token as unset, then apply the special tokens found in `kwargs`.

    Keyword arguments whose name is not listed in ``SPECIAL_TOKENS_ATTRIBUTES`` are ignored.

    Raises:
        TypeError: if a single special token is neither a ``str`` nor an ``AddedToken``.
        AssertionError: if ``additional_special_tokens`` is not a list/tuple of ``str``.
    """
    self._bos_token = self._eos_token = self._unk_token = None
    self._sep_token = self._pad_token = None
    self._cls_token = self._mask_token = None
    self._pad_token_type_id = 0
    self._additional_special_tokens = []
    self.verbose = verbose

    # We directly set the hidden value to allow initialization with special tokens
    # which are not yet in the vocabulary. Necessary for serialization/de-serialization
    # TODO clean this up at some point (probably by switching to fast tokenizers)
    for name, token in kwargs.items():
        if name not in self.SPECIAL_TOKENS_ATTRIBUTES:
            continue
        if name == "additional_special_tokens":
            assert isinstance(token, (list, tuple)) and all(isinstance(t, str) for t in token)
        elif not isinstance(token, (str, AddedToken)):
            raise TypeError(
                "special token {} has to be either str or AddedToken but got: {}".format(name, type(token))
            )
        setattr(self, name, token)
def sanitize_special_tokens(self) -> int:
    """ Make sure that all the special tokens attributes of the tokenizer (tokenizer.mask_token,
        tokenizer.cls_token, ...) are in the vocabulary, adding the missing ones if needed.

        Return:
            Number of tokens added to the vocabulary during the operation.
    """
    every_special_token = self.all_special_tokens_extended
    return self.add_tokens(every_special_token, special_tokens=True)
def add_special_tokens(self, special_tokens_dict: Dict[str, Union[str, AddedToken]]) -> int:
    """
    Add a dictionary of special tokens (eos, pad, cls...) to the encoder and link them
    to class attributes. If special tokens are NOT in the vocabulary, they are added
    to it (indexed starting from the last index of the current vocabulary).

    Using `add_special_tokens` will ensure your special tokens can be used in several ways:

    - special tokens are carefully handled by the tokenizer (they are never split)
    - you can easily refer to special tokens using tokenizer class attributes like `tokenizer.cls_token`.
      This makes it easy to develop model-agnostic training and fine-tuning scripts.

    When possible, special tokens are already registered for provided pretrained models
    (ex: BertTokenizer cls_token is already registered to be '[CLS]' and XLM's one is also registered to be '</s>')

    Args:
        special_tokens_dict: dict of string. Keys should be in the list of predefined special attributes:
            [``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``,
            ``additional_special_tokens``].

            Tokens are only added if they are not already in the vocabulary (tested by checking if the
            tokenizer assign the index of the ``unk_token`` to them).

    Returns:
        Number of tokens added to the vocabulary.

    Raises:
        AssertionError: if a key is not a known special token attribute, or if a value has the wrong type.
            The offending entry is rejected before it is assigned to the tokenizer.

    Examples::

        # Let's see how to add a new classification token to GPT-2
        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        model = GPT2Model.from_pretrained('gpt2')

        special_tokens_dict = {'cls_token': '<CLS>'}

        num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
        print('We have added', num_added_toks, 'tokens')
        model.resize_token_embeddings(len(tokenizer))  # Notice: resize_token_embeddings expect to receive the full size of the new vocabulary, i.e. the length of the tokenizer.

        assert tokenizer.cls_token == '<CLS>'
    """
    if not special_tokens_dict:
        return 0

    added_tokens = 0
    for key, value in special_tokens_dict.items():
        assert key in self.SPECIAL_TOKENS_ATTRIBUTES, f"Key {key} is not a special token"

        # Validate the value *before* assigning it, so a rejected entry never
        # ends up stored on the tokenizer.
        if key == "additional_special_tokens":
            assert isinstance(value, (list, tuple)) and all(
                isinstance(t, (str, AddedToken)) for t in value
            ), f"Tokens {value} for key {key} should all be str or AddedToken instances"
            tokens_to_add = value
        else:
            assert isinstance(
                value, (str, AddedToken)
            ), f"Token {value} for key {key} should be a str or an AddedToken instance"
            tokens_to_add = [value]

        if self.verbose:
            logger.info("Assigning %s to the %s key of the tokenizer", value, key)
        setattr(self, key, value)

        added_tokens += self.add_tokens(tokens_to_add, special_tokens=True)

    return added_tokens
def add_tokens(self, new_tokens: Union[str, AddedToken, List[str], List[AddedToken]], special_tokens=False) -> int:
    """
    Add a list of new tokens to the tokenizer class. If the new tokens are not in the
    vocabulary, they are added to it with indices starting from length of the current vocabulary.

    Args:
        new_tokens: string or list of string or :class:`~transformers.AddedToken`. Each string is a token to add.
            Tokens are only added if they are not already in the vocabulary. AddedToken wraps a string token to
            let you personalize its behavior (whether this token should only match against a single word, whether
            this token should strip all potential whitespaces on the left side, whether this token should strip
            all potential whitespaces on the right side...).
        special_tokens: can be used to specify if the token is a special token. This mostly changes the
            normalization behavior (special tokens like CLS or [MASK] are usually not lower-cased for instance)

            See details for :class:`~transformers.AddedToken` in HuggingFace tokenizers library.

    Returns:
        Number of tokens added to the vocabulary.

    Examples::

        # Let's see how to increase the vocabulary of Bert model and tokenizer
        tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
        model = BertModel.from_pretrained('bert-base-uncased')

        num_added_toks = tokenizer.add_tokens(['new_tok1', 'my_new-tok2'])
        print('We have added', num_added_toks, 'tokens')
        model.resize_token_embeddings(len(tokenizer))  # Notice: resize_token_embeddings expect to receive the full size of the new vocabulary, i.e. the length of the tokenizer.
    """
    # Nothing to add: empty string, empty list, None...
    if not new_tokens:
        return 0

    # A lone token is wrapped so that `_add_tokens` always receives a sequence.
    token_batch = new_tokens if isinstance(new_tokens, (list, tuple)) else [new_tokens]
    return self._add_tokens(token_batch, special_tokens=special_tokens)
# Special token accessors.
#
# Each getter returns the token as a plain string (or None when unset, logging an
# error first if `self.verbose` is on); each setter stores the raw value — a str or
# an AddedToken — in the matching hidden `_xxx_token` attribute.
#
# The getters must be properties and the same-named definitions their setters:
# the rest of the class reads them as attributes (e.g. `self.bos_token`) and
# assigns them through `setattr(self, key, value)`.

@property
def bos_token(self):
    """ Beginning of sentence token (string). Log an error if used while not having been set. """
    if self._bos_token is None:
        if self.verbose:
            logger.error("Using bos_token, but it is not set yet.")
        # Return None whether or not verbose is on, never the string "None".
        return None
    return str(self._bos_token)

@bos_token.setter
def bos_token(self, value):
    self._bos_token = value

@property
def eos_token(self):
    """ End of sentence token (string). Log an error if used while not having been set. """
    if self._eos_token is None:
        if self.verbose:
            logger.error("Using eos_token, but it is not set yet.")
        return None
    return str(self._eos_token)

@eos_token.setter
def eos_token(self, value):
    self._eos_token = value

@property
def unk_token(self):
    """ Unknown token (string). Log an error if used while not having been set. """
    if self._unk_token is None:
        if self.verbose:
            logger.error("Using unk_token, but it is not set yet.")
        return None
    return str(self._unk_token)

@unk_token.setter
def unk_token(self, value):
    self._unk_token = value

@property
def sep_token(self):
    """ Separation token (string). E.g. separate context and query in an input sequence. Log an error if used while not having been set. """
    if self._sep_token is None:
        if self.verbose:
            logger.error("Using sep_token, but it is not set yet.")
        return None
    return str(self._sep_token)

@sep_token.setter
def sep_token(self, value):
    self._sep_token = value

@property
def pad_token(self):
    """ Padding token (string). Log an error if used while not having been set. """
    if self._pad_token is None:
        if self.verbose:
            logger.error("Using pad_token, but it is not set yet.")
        return None
    return str(self._pad_token)

@pad_token.setter
def pad_token(self, value):
    self._pad_token = value

@property
def cls_token(self):
    """ Classification token (string). E.g. to extract a summary of an input sequence leveraging self-attention along the full depth of the model. Log an error if used while not having been set. """
    if self._cls_token is None:
        if self.verbose:
            logger.error("Using cls_token, but it is not set yet.")
        return None
    return str(self._cls_token)

@cls_token.setter
def cls_token(self, value):
    self._cls_token = value

@property
def mask_token(self):
    """ Mask token (string). E.g. when training a model with masked-language modeling. Log an error if used while not having been set. """
    if self._mask_token is None:
        if self.verbose:
            logger.error("Using mask_token, but it is not set yet.")
        return None
    return str(self._mask_token)

@mask_token.setter
def mask_token(self, value):
    self._mask_token = value

@property
def additional_special_tokens(self):
    """ All the additional special tokens you may want to use (list of strings). Log an error if used while not having been set. """
    if self._additional_special_tokens is None:
        if self.verbose:
            logger.error("Using additional_special_tokens, but it is not set yet.")
        # Without this early return the comprehension below would iterate over None.
        return None
    return [str(tok) for tok in self._additional_special_tokens]

@additional_special_tokens.setter
def additional_special_tokens(self, value):
    self._additional_special_tokens = value
@property
def bos_token_id(self):
    """ Id of the beginning of sentence token in the vocabulary, or None if the token has not been set. """
    if self._bos_token is None:
        return None
    return self.convert_tokens_to_ids(self.bos_token)
@property
def eos_token_id(self):
    """ Id of the end of sentence token in the vocabulary, or None if the token has not been set. """
    if self._eos_token is None:
        return None
    return self.convert_tokens_to_ids(self.eos_token)
@property
def unk_token_id(self):
    """ Id of the unknown token in the vocabulary, or None if the token has not been set. """
    if self._unk_token is None:
        return None
    return self.convert_tokens_to_ids(self.unk_token)
@property
def sep_token_id(self):
    """ Id of the separation token in the vocabulary (e.g. separating context and query in an input sequence), or None if the token has not been set. """
    if self._sep_token is None:
        return None
    return self.convert_tokens_to_ids(self.sep_token)
@property
def pad_token_id(self):
    """ Id of the padding token in the vocabulary, or None if the token has not been set. """
    if self._pad_token is None:
        return None
    return self.convert_tokens_to_ids(self.pad_token)
@property
def pad_token_type_id(self):
    """ Id of the padding token type in the vocabulary."""
    return self._pad_token_type_id
def cls_token_id(self):
    """ Id of the classification token in the vocabulary (e.g. the token used to extract a summary of an
    input sequence leveraging self-attention along the full depth of the model), or ``None`` when no such
    token has been set.
    """
    return None if self._cls_token is None else self.convert_tokens_to_ids(self.cls_token)
def mask_token_id(self):
    """ Id of the mask token in the vocabulary (e.g. used when training a model with masked-language
    modeling), or ``None`` when no such token has been set.
    """
    return None if self._mask_token is None else self.convert_tokens_to_ids(self.mask_token)
def additional_special_tokens_ids(self):
    """ Ids of all the additional special tokens in the vocabulary, obtained by passing
    ``additional_special_tokens`` through ``convert_tokens_to_ids``.
    """
    extra_tokens = self.additional_special_tokens
    return self.convert_tokens_to_ids(extra_tokens)
def special_tokens_map(self):
    """ A dictionary mapping special token class attribute (cls_token, unk_token...) to their
    values ('<unk>', '<cls>'...)

    Tokens of AddedToken type are converted to strings, so every returned token is a string.
    Attributes holding a list or tuple of tokens (e.g. ``additional_special_tokens``) are returned as a
    list or tuple of strings rather than as the string representation of the whole container.
    Attributes whose stored value is falsy (unset or empty) are left out.
    """
    set_attr = {}
    for attr in self.SPECIAL_TOKENS_ATTRIBUTES:
        attr_value = getattr(self, "_" + attr)
        if not attr_value:
            continue
        if isinstance(attr_value, (list, tuple)):
            # Stringify each token individually; str() on the container would yield its repr.
            as_strings = [str(tok) for tok in attr_value]
            set_attr[attr] = as_strings if isinstance(attr_value, list) else tuple(as_strings)
        else:
            set_attr[attr] = str(attr_value)
    return set_attr
def special_tokens_map_extended(self):
    """ A dictionary mapping special token class attribute (cls_token, unk_token...) to their
    values ('<unk>', '<cls>'...)

    Values are returned exactly as stored, so tokens of AddedToken type stay AddedToken
    (AddedToken can be used to control more finely how special tokens are tokenized).
    Attributes whose stored value is falsy are left out.
    """
    stored = ((attr, getattr(self, "_" + attr)) for attr in self.SPECIAL_TOKENS_ATTRIBUTES)
    return {attr: attr_value for attr, attr_value in stored if attr_value}
def all_special_tokens(self):
    """ List all the special tokens ('<unk>', '<cls>'...) mapped to class attributes
    (cls_token, unk_token...).

    Every entry of ``all_special_tokens_extended`` is converted with ``str``, so all returned
    tokens are strings (including those of AddedToken type).
    """
    return list(map(str, self.all_special_tokens_extended))
def all_special_tokens_extended(self):
    """ List all the special tokens ('<unk>', '<cls>'...) mapped to class attributes

    Tokens are kept as stored (AddedToken stays AddedToken; AddedToken can be used to control more
    finely how special tokens are tokenized). List or tuple values are flattened into the result.
    Duplicates are removed and the first occurrence of each token keeps its position, so the order is
    the same from one run to the next (a plain ``set`` round-trip does not guarantee that).
    """
    all_toks = []
    seen = set()
    for attr_value in self.special_tokens_map_extended.values():
        candidates = attr_value if isinstance(attr_value, (list, tuple)) else [attr_value]
        for tok in candidates:
            # Same hash/equality based de-duplication as a set, but order-preserving.
            if tok not in seen:
                seen.add(tok)
                all_toks.append(tok)
    return all_toks
def all_special_ids(self):
    """ List the vocabulary indices of the special tokens ('<unk>', '<cls>'...) mapped to
    class attributes (cls_token, unk_token...), i.e. ``all_special_tokens`` passed through
    ``convert_tokens_to_ids``.
    """
    return self.convert_tokens_to_ids(self.all_special_tokens)
| ENCODE_KWARGS_DOCSTRING = r""" | |
| add_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`True`): | |
| If set to ``True``, the sequences will be encoded with the special tokens relative | |
| to their model. | |
| `padding` (:obj:`Union[bool, str]`, `optional`, defaults to :obj:`False`): | |
| Activate and control padding. Accepts the following values: | |
| * `True` or `'longest'`: pad to the longest sequence in the batch (or no padding if only a single sequence is provided), | |
| * `'max_length'`: pad to a max length specified in `max_length` or to the max acceptable input length for the model if no length is provided (`max_length=None`) | |
| * `False` or `'do_not_pad'` (default): No padding (i.e. can output batch with sequences of uneven lengths) | |
| `truncation` (:obj:`Union[bool, str]`, `optional`, defaults to :obj:`False`): | |
| Activate and control truncation. Accepts the following values: | |
| * `True` or `'longest_first'`: truncate to a max length specified in `max_length` or to the max acceptable input length for the model if no length is provided (`max_length=None`). This will truncate token by token, removing a token from the longest sequence in the pair if a pair of sequences (or a batch of pairs) is provided, | |
| * `'only_first'`: truncate to a max length specified in `max_length` or to the max acceptable input length for the model if no length is provided (`max_length=None`). This will only truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided, | |
| * `'only_second'`: truncate to a max length specified in `max_length` or to the max acceptable input length for the model if no length is provided (`max_length=None`). This will only truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided, | |
| * `False` or `'do_not_truncate'` (default): No truncation (i.e. can output batch with sequences length greater than the model max admissible input size) | |
| `max_length` (:obj:`Union[int, None]`, `optional`, defaults to :obj:`None`): | |
| Control the length for padding/truncation. Accepts the following values | |
| * `None` (default): This will use the predefined model max length if required by one of the truncation/padding parameters. If the model has no specific max input length (e.g. XLNet) truncation/padding to max length is deactivated. | |
| * `any integer value` (e.g. `42`): Use this specific maximum length value if required by one of the truncation/padding parameters. | |
| stride (:obj:`int`, `optional`, defaults to ``0``): | |
| If set to a number along with max_length, the overflowing tokens returned when `return_overflowing_tokens=True` | |
| will contain some tokens from the end of the truncated sequence returned to provide some overlap between truncated and overflowing sequences. | |
| The value of this argument defines the number of overlapping tokens. | |
| is_pretokenized (:obj:`bool`, defaults to :obj:`False`): | |
| Set to True to indicate the input is already tokenized | |
| pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. | |
| This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability | |
| >= 7.5 (Volta). | |
| return_tensors (:obj:`str`, `optional`, defaults to :obj:`None`): | |
| Can be set to 'tf', 'pt' or 'np' to return respectively TensorFlow :obj:`tf.constant`, | |
| PyTorch :obj:`torch.Tensor` or Numpy :obj:`np.ndarray` instead of a list of python integers. | |
| """ | |
| ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING = r""" | |
| return_token_type_ids (:obj:`bool`, `optional`, defaults to :obj:`None`): | |
| Whether to return token type IDs. If left to the default, will return the token type IDs according | |
| to the specific tokenizer's default, defined by the :obj:`return_outputs` attribute. | |
| `What are token type IDs? <../glossary.html#token-type-ids>`_ | |
| return_attention_mask (:obj:`bool`, `optional`, defaults to :obj:`None`): | |
| Whether to return the attention mask. If left to the default, will return the attention mask according | |
| to the specific tokenizer's default, defined by the :obj:`return_outputs` attribute. | |
| `What are attention masks? <../glossary.html#attention-mask>`__ | |
| return_overflowing_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): | |
| Set to True to return overflowing token sequences (default False). | |
| return_special_tokens_mask (:obj:`bool`, `optional`, defaults to :obj:`False`): | |
| Set to True to return special tokens mask information (default False). | |
| return_offsets_mapping (:obj:`bool`, `optional`, defaults to :obj:`False`): | |
| Set to True to return (char_start, char_end) for each token (default False). | |
| If using Python's tokenizer, this method will raise NotImplementedError. | |
| This one is only available on fast tokenizers inheriting from PreTrainedTokenizerFast. | |
| **kwargs: passed to the `self.tokenize()` method | |
| Return: | |
| A Dictionary of shape:: | |
| { | |
| input_ids: list[int], | |
| token_type_ids: list[int] if return_token_type_ids is True (default) | |
| attention_mask: list[int] if return_attention_mask is True (default) | |
| overflowing_tokens: list[int] if the tokenizer is a slow tokenizer, else a List[List[int]] if a ``max_length`` is specified and ``return_overflowing_tokens=True`` | |
| special_tokens_mask: list[int] if ``add_special_tokens`` is set to ``True`` | |
| and return_special_tokens_mask is True | |
| } | |
| With the fields: | |
| - ``input_ids``: list of token ids to be fed to a model | |
| - ``token_type_ids``: list of token type ids to be fed to a model | |
| - ``attention_mask``: list of indices specifying which tokens should be attended to by the model | |
| - ``overflowing_tokens``: list of overflowing tokens sequences if a max length is specified and ``return_overflowing_tokens=True``. | |
| - ``special_tokens_mask``: if adding special tokens, this is a list of [0, 1], with 0 specifying special added | |
| tokens and 1 specifying sequence tokens. | |
| """ | |
class PreTrainedTokenizerBase(SpecialTokensMixin):
    """ Base class for slow and fast tokenizers.

    Handle shared (mostly boiler plate) methods for slow and fast tokenizers.
    """

    # Class-level defaults below are empty here; presumably concrete tokenizer classes override them.

    # Maps a file id (also used as an ``__init__`` keyword name when loading) to the vocabulary file name.
    vocab_files_names: Dict[str, str] = {}
    # Maps each file id to a {model shortcut name: file location} dictionary.
    pretrained_vocab_files_map: Dict[str, Dict[str, str]] = {}
    # Maps a model shortcut name to default ``__init__`` keyword arguments for that model.
    pretrained_init_configuration: Dict[str, Dict[str, Any]] = {}
    # Maps a model shortcut name to its maximum input length; its keys are the known shortcut names.
    max_model_input_sizes: Dict[str, int] = {}
    # Default model input names; can be replaced per instance with the ``model_input_names`` keyword.
    model_input_names: List[str] = ["token_type_ids", "attention_mask"]
    # Default padding side, "right" or "left"; can be replaced per instance with ``padding_side``.
    padding_side: str = "right"
def __init__(self, **kwargs):
    """ Record the construction arguments and set up max length, padding side and model input names.

    The following keyword arguments are consumed here (removed from ``kwargs``); everything else is
    forwarded to the parent initializer:
        model_max_length: maximum input length; ``max_len`` is accepted as a backward compatible
            alias. When neither is given (or the value is None) ``VERY_LARGE_INTEGER`` is used.
        padding_side: 'right' or 'left'; defaults to the class attribute.
        model_input_names: defaults to the class attribute.

    Raises:
        AssertionError: if ``padding_side`` is neither 'right' nor 'left'.
    """
    # inputs and kwargs for saving and re-loading (see ``from_pretrained`` and ``save_pretrained``)
    self.init_inputs = ()
    # Take a snapshot *before* the pops below: keeping a reference to ``kwargs`` itself would silently
    # drop model_max_length / padding_side / model_input_names from what ``save_pretrained`` records.
    self.init_kwargs = copy.deepcopy(kwargs)

    # For backward compatibility we fallback to set model_max_length from max_len if provided
    model_max_length = kwargs.pop("model_max_length", kwargs.pop("max_len", None))
    self.model_max_length = model_max_length if model_max_length is not None else VERY_LARGE_INTEGER

    # Padding side is right by default and overridden in subclasses. If specified in the kwargs, it is changed.
    self.padding_side = kwargs.pop("padding_side", self.padding_side)
    assert self.padding_side in [
        "right",
        "left",
    ], f"Padding side should be selected between 'right' and 'left', current value: {self.padding_side}"
    self.model_input_names = kwargs.pop("model_input_names", self.model_input_names)

    super().__init__(**kwargs)
def max_len(self) -> int:
    """ Backward compatible alias: returns ``model_max_length``
    (the attribute was renamed to avoid ambiguity).
    """
    limit = self.model_max_length
    return limit
def max_len_single_sentence(self) -> int:
    """ ``model_max_length`` minus the number of special tokens added to a single sequence. """
    total = self.model_max_length
    reserved = self.num_special_tokens_to_add(pair=False)
    return total - reserved
def max_len_sentences_pair(self) -> int:
    """ ``model_max_length`` minus the number of special tokens added to a pair of sequences. """
    total = self.model_max_length
    reserved = self.num_special_tokens_to_add(pair=True)
    return total - reserved
def max_len_single_sentence(self, value) -> None:
    """ For backward compatibility, allow to try to setup 'max_len_single_sentence'.

    Nothing is stored: the value is always derived from ``model_max_length`` and the number of special
    tokens. Assigning the value it already has is tolerated (with a deprecation warning when
    ``self.verbose`` is set); assigning anything else is an error.

    Raises:
        ValueError: if ``value`` differs from the automatically computed value.
    """
    expected = self.model_max_length - self.num_special_tokens_to_add(pair=False)
    if value != expected:
        raise ValueError("Setting 'max_len_single_sentence' is now deprecated. This value is automatically set up.")
    # ``verbose`` only controls the warning; it must not turn a harmless assignment into an error.
    if self.verbose:
        logger.warning("Setting 'max_len_single_sentence' is now deprecated. This value is automatically set up.")
def max_len_sentences_pair(self, value) -> None:
    """ For backward compatibility, allow to try to setup 'max_len_sentences_pair'.

    Nothing is stored: the value is always derived from ``model_max_length`` and the number of special
    tokens. Assigning the value it already has is tolerated (with a deprecation warning when
    ``self.verbose`` is set); assigning anything else is an error.

    Raises:
        ValueError: if ``value`` differs from the automatically computed value.
    """
    expected = self.model_max_length - self.num_special_tokens_to_add(pair=True)
    if value != expected:
        raise ValueError("Setting 'max_len_sentences_pair' is now deprecated. This value is automatically set up.")
    # ``verbose`` only controls the warning; it must not turn a harmless assignment into an error.
    if self.verbose:
        logger.warning("Setting 'max_len_sentences_pair' is now deprecated. This value is automatically set up.")
def from_pretrained(cls, *inputs, **kwargs):
    r"""
    Instantiate a :class:`~transformers.PreTrainedTokenizer` (or a derived class) from a predefined tokenizer.

    All arguments are handed over unchanged to ``cls._from_pretrained``.

    Args:
        pretrained_model_name_or_path: one of:

            - the `shortcut name` of a predefined tokenizer to load from cache or download, e.g.: ``bert-base-uncased``.
            - the `identifier name` of a predefined tokenizer that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
            - a path to a `directory` containing the vocabulary files required by the tokenizer, for instance saved using the :func:`~transformers.PreTrainedTokenizer.save_pretrained` method, e.g.: ``./my_model_directory/``.
            - (not applicable to all derived classes, deprecated) a path or url to a single saved vocabulary file, if and only if the tokenizer only requires a single vocabulary file (e.g. Bert, XLNet), e.g.: ``./my_model_directory/vocab.txt``.
        cache_dir: (`optional`) string:
            Directory in which downloaded tokenizer vocabulary files should be cached if the standard cache should not be used.
        force_download: (`optional`) boolean, default False:
            Force to (re-)download the vocabulary files and override the cached versions if they exist.
        resume_download: (`optional`) boolean, default False:
            Do not delete an incompletely received file. Attempt to resume the download if such a file exists.
        proxies: (`optional`) dict, default None:
            A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
            The proxies are used on each request.
        inputs: (`optional`) positional arguments: will be passed to the Tokenizer ``__init__`` method.
        kwargs: (`optional`) keyword arguments: will be passed to the Tokenizer ``__init__`` method. Can be used to set special tokens like ``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``, ``additional_special_tokens``. See parameters in the doc string of :class:`~transformers.PreTrainedTokenizer` for details.

    Examples::

        # We can't instantiate directly the base class `PreTrainedTokenizer` so let's show our examples on a derived class: BertTokenizer

        # Download vocabulary from S3 and cache.
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

        # Download vocabulary from S3 (user-uploaded) and cache.
        tokenizer = BertTokenizer.from_pretrained('dbmdz/bert-base-german-cased')

        # If vocabulary files are in a directory (e.g. tokenizer was saved using `save_pretrained('./test/saved_model/')`)
        tokenizer = BertTokenizer.from_pretrained('./test/saved_model/')

        # If the tokenizer uses a single vocabulary file, you can point directly to this file
        tokenizer = BertTokenizer.from_pretrained('./test/saved_model/my_vocab.txt')

        # You can link tokens to special vocabulary when instantiating
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', unk_token='<unk>')
        # You should be sure '<unk>' is in the vocabulary when doing that.
        # Otherwise use tokenizer.add_special_tokens({'unk_token': '<unk>'}) instead)
        assert tokenizer.unk_token == '<unk>'
    """
    loader = cls._from_pretrained
    return loader(*inputs, **kwargs)
def _from_pretrained(cls, pretrained_model_name_or_path, *init_inputs, **kwargs):
    """ Locate the tokenizer files, resolve them through the cache, and build the tokenizer.

    Args:
        pretrained_model_name_or_path: a known shortcut name (a key of ``cls.max_model_input_sizes``),
            a directory, a model identifier, or (deprecated) a path/url to a single vocabulary file.
        init_inputs: positional arguments for the tokenizer ``__init__``; when empty, the
            ``init_inputs`` saved in the tokenizer config file (if any) are used instead.
        kwargs: ``cache_dir``, ``force_download``, ``resume_download``, ``proxies`` and
            ``local_files_only`` are consumed for file resolution; the remaining entries override the
            saved/default ``__init__`` keyword arguments.

    Returns:
        The instantiated tokenizer, with saved special tokens and added tokens restored.

    Raises:
        ValueError: a single file/url was given but the tokenizer needs more than one vocabulary file.
        EnvironmentError: the files could not be resolved, or none of them was found.
        OSError: the tokenizer could not be instantiated from the resolved files.
        AssertionError: the saved added tokens do not have consecutive indices.
    """
    cache_dir = kwargs.pop("cache_dir", None)
    force_download = kwargs.pop("force_download", False)
    resume_download = kwargs.pop("resume_download", False)
    proxies = kwargs.pop("proxies", None)
    local_files_only = kwargs.pop("local_files_only", False)

    s3_models = list(cls.max_model_input_sizes.keys())
    vocab_files = {}
    init_configuration = {}
    if pretrained_model_name_or_path in s3_models:
        # Get the vocabulary from AWS S3 bucket
        for file_id, map_list in cls.pretrained_vocab_files_map.items():
            vocab_files[file_id] = map_list[pretrained_model_name_or_path]
        if (
            cls.pretrained_init_configuration
            and pretrained_model_name_or_path in cls.pretrained_init_configuration
        ):
            init_configuration = cls.pretrained_init_configuration[pretrained_model_name_or_path].copy()
    else:
        # Get the vocabulary from local files
        logger.info(
            "Model name '{}' not found in model shortcut name list ({}). "
            "Assuming '{}' is a path, a model identifier, or url to a directory containing tokenizer files.".format(
                pretrained_model_name_or_path, ", ".join(s3_models), pretrained_model_name_or_path
            )
        )

        if os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path):
            if len(cls.vocab_files_names) > 1:
                raise ValueError(
                    "Calling {}.from_pretrained() with the path to a single file or url is not supported."
                    "Use a model identifier or the path to a directory instead.".format(cls.__name__)
                )
            logger.warning(
                "Calling {}.from_pretrained() with the path to a single file or url is deprecated".format(
                    cls.__name__
                )
            )
            file_id = list(cls.vocab_files_names.keys())[0]
            vocab_files[file_id] = pretrained_model_name_or_path
        else:
            # At this point pretrained_model_name_or_path is either a directory or a model identifier name
            additional_files_names = {
                "added_tokens_file": ADDED_TOKENS_FILE,
                "special_tokens_map_file": SPECIAL_TOKENS_MAP_FILE,
                "tokenizer_config_file": TOKENIZER_CONFIG_FILE,
                "full_tokenizer_file": FULL_TOKENIZER_FILE,
            }
            # Look for the tokenizer files
            for file_id, file_name in {**cls.vocab_files_names, **additional_files_names}.items():
                if os.path.isdir(pretrained_model_name_or_path):
                    full_file_name = os.path.join(pretrained_model_name_or_path, file_name)
                    if not os.path.exists(full_file_name):
                        logger.info("Didn't find file {}. We won't load it.".format(full_file_name))
                        full_file_name = None
                else:
                    full_file_name = hf_bucket_url(
                        pretrained_model_name_or_path, filename=file_name, use_cdn=False
                    )

                vocab_files[file_id] = full_file_name

    # Get files from url, cache, or disk depending on the case
    try:
        resolved_vocab_files = {}
        for file_id, file_path in vocab_files.items():
            if file_path is None:
                resolved_vocab_files[file_id] = None
            else:
                resolved_vocab_files[file_id] = cached_path(
                    file_path,
                    cache_dir=cache_dir,
                    force_download=force_download,
                    proxies=proxies,
                    resume_download=resume_download,
                    local_files_only=local_files_only,
                )
    except EnvironmentError:
        if pretrained_model_name_or_path in s3_models:
            # The placeholder has to be filled in, otherwise the user sees a literal '{}'.
            msg = "Couldn't reach server at '{}' to download vocabulary files.".format(
                pretrained_model_name_or_path
            )
        else:
            msg = (
                "Model name '{}' was not found in tokenizers model name list ({}). "
                "We assumed '{}' was a path or url to a directory containing vocabulary files "
                "named {}, but couldn't find such vocabulary files at this path or url.".format(
                    pretrained_model_name_or_path,
                    ", ".join(s3_models),
                    pretrained_model_name_or_path,
                    list(cls.vocab_files_names.values()),
                )
            )

        raise EnvironmentError(msg)

    if all(full_file_name is None for full_file_name in resolved_vocab_files.values()):
        raise EnvironmentError(
            "Model name '{}' was not found in tokenizers model name list ({}). "
            "We assumed '{}' was a path, a model identifier, or url to a directory containing vocabulary files "
            "named {} but couldn't find such vocabulary files at this path or url.".format(
                pretrained_model_name_or_path,
                ", ".join(s3_models),
                pretrained_model_name_or_path,
                list(cls.vocab_files_names.values()),
            )
        )

    for file_id, file_path in vocab_files.items():
        if file_path == resolved_vocab_files[file_id]:
            logger.info("loading file {}".format(file_path))
        else:
            logger.info("loading file {} from cache at {}".format(file_path, resolved_vocab_files[file_id]))

    # Prepare tokenizer initialization kwargs
    # Did we saved some inputs and kwargs to reload ?
    tokenizer_config_file = resolved_vocab_files.pop("tokenizer_config_file", None)
    if tokenizer_config_file is not None:
        with open(tokenizer_config_file, encoding="utf-8") as tokenizer_config_handle:
            init_kwargs = json.load(tokenizer_config_handle)
        saved_init_inputs = init_kwargs.pop("init_inputs", ())
        if not init_inputs:
            init_inputs = saved_init_inputs
    else:
        init_kwargs = init_configuration

    # Update with newly provided kwargs
    init_kwargs.update(kwargs)

    # Set max length if needed
    if pretrained_model_name_or_path in cls.max_model_input_sizes:
        # if we're using a pretrained model, ensure the tokenizer
        # wont index sequences longer than the number of positional embeddings
        model_max_length = cls.max_model_input_sizes[pretrained_model_name_or_path]
        if model_max_length is not None and isinstance(model_max_length, (int, float)):
            init_kwargs["model_max_length"] = min(init_kwargs.get("model_max_length", int(1e30)), model_max_length)

    # Merge resolved_vocab_files arguments in init_kwargs.
    added_tokens_file = resolved_vocab_files.pop("added_tokens_file", None)
    for args_name, file_path in resolved_vocab_files.items():
        if args_name not in init_kwargs:
            init_kwargs[args_name] = file_path

    # Instantiate tokenizer.
    try:
        tokenizer = cls(*init_inputs, **init_kwargs)
    except OSError:
        raise OSError(
            "Unable to load vocabulary from file. "
            "Please check that the provided vocabulary is accessible and not corrupted."
        )

    # Save inputs and kwargs for saving and re-loading with ``save_pretrained``
    tokenizer.init_inputs = init_inputs
    tokenizer.init_kwargs = init_kwargs

    # If there is a complementary special token map, load it
    special_tokens_map_file = resolved_vocab_files.pop("special_tokens_map_file", None)
    if special_tokens_map_file is not None:
        with open(special_tokens_map_file, encoding="utf-8") as special_tokens_map_handle:
            special_tokens_map = json.load(special_tokens_map_handle)

        for key, value in special_tokens_map.items():
            # Tokens saved as a dictionary are rebuilt as AddedToken.
            if isinstance(value, dict):
                value = AddedToken(**value)
            setattr(tokenizer, key, value)

    # Add supplementary tokens.
    special_tokens = tokenizer.all_special_tokens
    if added_tokens_file is not None:
        with open(added_tokens_file, encoding="utf-8") as added_tokens_handle:
            added_tok_encoder = json.load(added_tokens_handle)

        # Sort added tokens by index
        added_tok_encoder_sorted = sorted(added_tok_encoder.items(), key=lambda x: x[1])

        for token, index in added_tok_encoder_sorted:
            assert index == len(tokenizer), (
                f"Non-consecutive added token '{token}' found. "
                f"Should have index {len(tokenizer)} but has index {index} in saved vocabulary."
            )
            tokenizer.add_tokens(token, special_tokens=token in special_tokens)

    # Check all our special tokens are registrered as "no split" token (we don't cut them) and are in the vocab
    added_tokens = tokenizer.sanitize_special_tokens()
    if added_tokens:
        logger.warning(
            "Special tokens have been added in the vocabulary, make sure the associated word emebedding are fine-tuned or trained."
        )

    return tokenizer
def save_pretrained(self, save_directory) -> Tuple[str]:
    """ Write the tokenizer to ``save_directory`` so it can be re-loaded with
    :func:`~transformers.PreTrainedTokenizer.from_pretrained`.

    Files written, besides the vocabulary files produced by ``save_vocabulary``:
        - the tokenizer config (instantiation positional and keyword inputs, e.g. do_lower_case for Bert),
        - the special-tokens-to-class-attributes mapping,
        - the added tokens (only when there are any).

    Warning: This won't save modifications you may have applied to the tokenizer after the instantiation
    (e.g. modifying tokenizer.do_lower_case after creation).

    Returns:
        The paths returned by ``save_vocabulary`` followed by the special tokens map path and the added
        tokens path, or ``None`` (after logging an error) when ``save_directory`` is an existing file.
    """
    if os.path.isfile(save_directory):
        logger.error("Provided path ({}) should be a directory, not a file".format(save_directory))
        return
    os.makedirs(save_directory, exist_ok=True)

    special_tokens_map_file, added_tokens_file, tokenizer_config_file = (
        os.path.join(save_directory, file_name)
        for file_name in (SPECIAL_TOKENS_MAP_FILE, ADDED_TOKENS_FILE, TOKENIZER_CONFIG_FILE)
    )

    # The config is the recorded init kwargs, plus the positional inputs, minus the vocabulary file paths.
    config_to_save = copy.deepcopy(self.init_kwargs)
    if self.init_inputs:
        config_to_save["init_inputs"] = copy.deepcopy(self.init_inputs)
    for file_id in self.vocab_files_names:
        config_to_save.pop(file_id, None)

    with open(tokenizer_config_file, "w", encoding="utf-8") as config_handle:
        config_handle.write(json.dumps(config_to_save, ensure_ascii=False))

    with open(special_tokens_map_file, "w", encoding="utf-8") as map_handle:
        # AddedToken values are stored through their ``__getstate__`` output.
        serializable_map = {
            key: (value.__getstate__() if isinstance(value, AddedToken) else value)
            for key, value in self.special_tokens_map_extended.items()
        }
        map_handle.write(json.dumps(serializable_map, ensure_ascii=False))

    added_vocab = self.get_added_vocab()
    if added_vocab:
        with open(added_tokens_file, "w", encoding="utf-8") as added_handle:
            added_handle.write(json.dumps(added_vocab, ensure_ascii=False))

    return self.save_vocabulary(save_directory) + (special_tokens_map_file, added_tokens_file)
def encode(
    self,
    text: Union[TextInput, PreTokenizedInput, EncodedInput],
    text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None,
    add_special_tokens: bool = True,
    padding: Union[bool, str] = False,
    truncation: Union[bool, str] = False,
    max_length: Optional[int] = None,
    stride: int = 0,
    return_tensors: Optional[Union[str, TensorType]] = None,
    **kwargs
):
    """
    Converts a string in a sequence of ids (integer), using the tokenizer and vocabulary.

    Same as doing ``self.convert_tokens_to_ids(self.tokenize(text))``.
    Implemented as a call to ``encode_plus`` with every argument forwarded unchanged, of which only the
    ``"input_ids"`` entry is returned.

    Args:
        text (:obj:`str`, :obj:`List[str]` or :obj:`List[int]`):
            The first sequence to be encoded. This can be a string, a list of strings (tokenized string using
            the `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids`
            method)
        text_pair (:obj:`str`, :obj:`List[str]` or :obj:`List[int]`, `optional`, defaults to :obj:`None`):
            Optional second sequence to be encoded. This can be a string, a list of strings (tokenized
            string using the `tokenize` method) or a list of integers (tokenized string ids using the
            `convert_tokens_to_ids` method)
    """
    return self.encode_plus(
        text,
        text_pair=text_pair,
        add_special_tokens=add_special_tokens,
        padding=padding,
        truncation=truncation,
        max_length=max_length,
        stride=stride,
        return_tensors=return_tensors,
        **kwargs,
    )["input_ids"]
def num_special_tokens_to_add(self, pair: bool = False) -> int:
    """ Placeholder: always raises ``NotImplementedError`` in this class.

    Other methods of this class subtract the result from ``model_max_length``, with ``pair`` telling
    whether a single sequence or a pair of sequences is meant.
    """
    raise NotImplementedError()
def _get_padding_truncation_strategies(
    self, padding=False, truncation=False, max_length=None, pad_to_multiple_of=None, verbose=True, **kwargs
):
    """ Find the correct padding/truncation strategy with backward compatibility
    for old arguments (truncation_strategy and pad_to_max_length) and behaviors.

    Args:
        padding: ``False``, ``True``, a value accepted by ``PaddingStrategy`` or a ``PaddingStrategy`` member.
        truncation: ``False``, ``True``, a value accepted by ``TruncationStrategy`` or a
            ``TruncationStrategy`` member.
        max_length: explicit length, or ``None`` to fall back on ``self.model_max_length`` when a
            strategy needs one.
        pad_to_multiple_of: when padding and truncation are both active, ``max_length`` must be a
            multiple of this value.
        verbose: whether to emit the warnings.
        kwargs: the deprecated ``truncation_strategy`` and ``pad_to_max_length`` are consumed here.

    Returns:
        ``(padding_strategy, truncation_strategy, max_length, kwargs)`` where ``kwargs`` no longer
        contains the two deprecated arguments.

    Raises:
        ValueError: padding is requested without a usable padding token, or ``max_length`` is not a
            multiple of ``pad_to_multiple_of`` while both padding and truncation are active.
    """
    old_truncation_strategy = kwargs.pop("truncation_strategy", "do_not_truncate")
    old_pad_to_max_length = kwargs.pop("pad_to_max_length", False)

    # Backward compatibility for previous behavior, maybe we should deprecate it:
    # If you only set max_length, it activates truncation for max_length
    if max_length is not None and padding is False and truncation is False:
        if verbose:
            logger.warning(
                "Truncation was not explicitely activated but `max_length` is provided a specific value, "
                "please use `truncation=True` to explicitely truncate examples to max length. "
                "Defaulting to 'longest_first' truncation strategy. "
                "If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy "
                "more precisely by providing a specific strategy to `truncation`."
            )
        truncation = "longest_first"

    # Get padding strategy
    if padding is False and old_pad_to_max_length:
        if verbose:
            warnings.warn(
                "The `pad_to_max_length` argument is deprecated and will be removed in a future version, "
                "use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or "
                "use `padding='max_length'` to pad to a max length. In this case, you can give a specific "
                "length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the "
                "maximal input size of the model (e.g. 512 for Bert).",
                DeprecationWarning,
            )
        if max_length is None:
            padding_strategy = PaddingStrategy.LONGEST
        else:
            padding_strategy = PaddingStrategy.MAX_LENGTH
    elif padding is not False:
        if padding is True:
            padding_strategy = PaddingStrategy.LONGEST  # Default to pad to the longest sequence in the batch
        elif isinstance(padding, PaddingStrategy):
            # Already a strategy member: use it as is (otherwise the variable would stay unassigned).
            padding_strategy = padding
        else:
            padding_strategy = PaddingStrategy(padding)
    else:
        padding_strategy = PaddingStrategy.DO_NOT_PAD

    # Get truncation strategy
    if truncation is False and old_truncation_strategy != "do_not_truncate":
        if verbose:
            warnings.warn(
                "The `truncation_strategy` argument is deprecated and will be removed in a future version, "
                "use `truncation=True` to truncate examples to a max length. You can give a specific "
                "length with `max_length` (e.g. `max_length=45`) or leave max_length to None to truncate to the "
                "maximal input size of the model (e.g. 512 for Bert). "
                " If you have pairs of inputs, you can give a specific truncation strategy selected among "
                "`truncation='only_first'` (will only truncate the first sentence in the pairs) "
                "`truncation='only_second'` (will only truncate the second sentence in the pairs) "
                "or `truncation='longest_first'` (will iteratively remove tokens from the longest sentence in the pairs).",
                DeprecationWarning,
            )
        truncation_strategy = TruncationStrategy(old_truncation_strategy)
    elif truncation is not False:
        if truncation is True:
            truncation_strategy = (
                TruncationStrategy.LONGEST_FIRST
            )  # Default to truncate the longest sequences in pairs of inputs
        elif isinstance(truncation, TruncationStrategy):
            # Already a strategy member: use it as is (otherwise the variable would stay unassigned).
            truncation_strategy = truncation
        else:
            truncation_strategy = TruncationStrategy(truncation)
    else:
        truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE

    # Set max length if needed
    if max_length is None:
        if padding_strategy == PaddingStrategy.MAX_LENGTH:
            if self.model_max_length > LARGE_INTEGER:
                if verbose:
                    logger.warning(
                        "Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. "
                        "Default to no padding."
                    )
                padding_strategy = PaddingStrategy.DO_NOT_PAD
            else:
                max_length = self.model_max_length

        if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE:
            if self.model_max_length > LARGE_INTEGER:
                if verbose:
                    logger.warning(
                        "Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. "
                        "Default to no truncation."
                    )
                truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE
            else:
                max_length = self.model_max_length

    # Test if we have a padding token
    if padding_strategy != PaddingStrategy.DO_NOT_PAD and (not self.pad_token or self.pad_token_id < 0):
        raise ValueError(
            "Asking to pad but the tokenizer does not have a padding token. "
            "Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` "
            "or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`."
        )

    # Check that we will truncate to a multiple of pad_to_multiple_of if both are provided
    if (
        truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE
        and padding_strategy != PaddingStrategy.DO_NOT_PAD
        and pad_to_multiple_of is not None
        and max_length is not None
        and (max_length % pad_to_multiple_of != 0)
    ):
        raise ValueError(
            f"Truncation and padding are both activated but "
            f"truncation length ({max_length}) is not a multiple of pad_to_multiple_of ({pad_to_multiple_of})."
        )

    return padding_strategy, truncation_strategy, max_length, kwargs
def __call__(
    self,
    text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]],
    text_pair: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None,
    add_special_tokens: bool = True,
    padding: Union[bool, str] = False,
    truncation: Union[bool, str] = False,
    max_length: Optional[int] = None,
    stride: int = 0,
    is_pretokenized: bool = False,
    pad_to_multiple_of: Optional[int] = None,
    return_tensors: Optional[Union[str, TensorType]] = None,
    return_token_type_ids: Optional[bool] = None,
    return_attention_mask: Optional[bool] = None,
    return_overflowing_tokens: bool = False,
    return_special_tokens_mask: bool = False,
    return_offsets_mapping: bool = False,
    return_length: bool = False,
    verbose: bool = True,
    **kwargs
) -> BatchEncoding:
    """
    Returns a dictionary containing the encoded sequence or sequence pair and additional information:
    the mask for sequence classification and the overflowing elements if a ``max_length`` is specified.

    The input is dispatched to :obj:`batch_encode_plus` when it is detected as a batch and to
    :obj:`encode_plus` otherwise. All arguments other than ``text`` and ``text_pair`` are forwarded
    unchanged to the selected method.

    Args:
        text (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`):
            The sequence or batch of sequences to be encoded.
            Each sequence can be a string or a list of strings (pre-tokenized string).
            If the sequences are provided as list of strings (pretokenized), you must set `is_pretokenized=True`
            (to lift the ambiguity with a batch of sequences)
        text_pair (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`):
            The sequence or batch of sequences to be encoded.
            Each sequence can be a string or a list of strings (pre-tokenized string).
            If the sequences are provided as list of strings (pretokenized), you must set `is_pretokenized=True`
            (to lift the ambiguity with a batch of sequences)

    Raises:
        AssertionError: if ``text`` or ``text_pair`` is not a string, a list/tuple of strings or a
            list/tuple of lists/tuples of strings (only the first element is inspected).
        TypeError: if ``text`` is a batch but ``text_pair`` is a single string.
        ValueError: if ``text`` and ``text_pair`` are batches of different lengths.
    """

    def _is_valid_text_input(value) -> bool:
        # Accepts `str`, `List[str]` or `List[List[str]]`. Only the first element (and its first
        # element) is checked, so this is a cheap sanity check rather than a full validation.
        if isinstance(value, str):
            return True
        if not isinstance(value, (list, tuple)):
            return False
        if len(value) == 0 or isinstance(value[0], str):
            return True
        first = value[0]
        return isinstance(first, (list, tuple)) and (len(first) == 0 or isinstance(first[0], str))

    # Input type checking for clearer error
    assert _is_valid_text_input(text), (
        "text input must of type `str` (single example), `List[str]` (batch or single pretokenized example) "
        "or `List[List[str]]` (batch of pretokenized examples)."
    )
    assert text_pair is None or _is_valid_text_input(text_pair), (
        "text_pair input must of type `str` (single example), `List[str]` (batch or single pretokenized example) "
        "or `List[List[str]]` (batch of pretokenized examples)."
    )

    # A list is a batch unless `is_pretokenized` is set, in which case only a list of lists is a batch.
    is_batched = bool(
        (not is_pretokenized and isinstance(text, (list, tuple)))
        or (is_pretokenized and isinstance(text, (list, tuple)) and text and isinstance(text[0], (list, tuple)))
    )

    # Options shared by both encoding entry points.
    shared_options = dict(
        add_special_tokens=add_special_tokens,
        padding=padding,
        truncation=truncation,
        max_length=max_length,
        stride=stride,
        is_pretokenized=is_pretokenized,
        pad_to_multiple_of=pad_to_multiple_of,
        return_tensors=return_tensors,
        return_token_type_ids=return_token_type_ids,
        return_attention_mask=return_attention_mask,
        return_overflowing_tokens=return_overflowing_tokens,
        return_special_tokens_mask=return_special_tokens_mask,
        return_offsets_mapping=return_offsets_mapping,
        return_length=return_length,
        verbose=verbose,
    )

    if is_batched:
        if text_pair is not None:
            # `zip` would otherwise pair each text with single characters of the string ...
            if isinstance(text_pair, str):
                raise TypeError(
                    "when tokenizing batches of text, `text_pair` must be a list or tuple with the same "
                    "length as `text`."
                )
            # ... or silently drop the trailing examples of the longer batch.
            if len(text) != len(text_pair):
                raise ValueError(
                    f"batch length of `text`: {len(text)} does not match batch length of "
                    f"`text_pair`: {len(text_pair)}."
                )
            batch_text_or_text_pairs = list(zip(text, text_pair))
        else:
            batch_text_or_text_pairs = text
        return self.batch_encode_plus(
            batch_text_or_text_pairs=batch_text_or_text_pairs, **shared_options, **kwargs
        )

    return self.encode_plus(text=text, text_pair=text_pair, **shared_options, **kwargs)
def encode_plus(
    self,
    text: Union[TextInput, PreTokenizedInput, EncodedInput],
    text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None,
    add_special_tokens: bool = True,
    padding: Union[bool, str] = False,
    truncation: Union[bool, str] = False,
    max_length: Optional[int] = None,
    stride: int = 0,
    is_pretokenized: bool = False,
    pad_to_multiple_of: Optional[int] = None,
    return_tensors: Optional[Union[str, TensorType]] = None,
    return_token_type_ids: Optional[bool] = None,
    return_attention_mask: Optional[bool] = None,
    return_overflowing_tokens: bool = False,
    return_special_tokens_mask: bool = False,
    return_offsets_mapping: bool = False,
    return_length: bool = False,
    verbose: bool = True,
    **kwargs
) -> BatchEncoding:
    """
    Encode a single sequence or a single pair of sequences.

    The ``padding`` / ``truncation`` / ``max_length`` arguments (and any extra keyword arguments) are
    first resolved by :obj:`_get_padding_truncation_strategies`; the actual encoding is then delegated
    to :obj:`_encode_plus` with the resolved strategies.

    Args:
        text (:obj:`str`, :obj:`List[str]` or :obj:`List[int]` (the later only for not-fast tokenizers)):
            The first sequence to be encoded. This can be a string, a list of strings (tokenized string using
            the `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids`
            method)
        text_pair (:obj:`str`, :obj:`List[str]` or :obj:`List[int]`, `optional`, defaults to :obj:`None`):
            Optional second sequence to be encoded. This can be a string, a list of strings (tokenized
            string using the `tokenize` method) or a list of integers (tokenized string ids using the
            `convert_tokens_to_ids` method)
    """
    # Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
    resolved = self._get_padding_truncation_strategies(
        padding=padding,
        truncation=truncation,
        max_length=max_length,
        pad_to_multiple_of=pad_to_multiple_of,
        verbose=verbose,
        **kwargs,
    )
    padding_strategy, truncation_strategy, max_length, kwargs = resolved

    forwarded = dict(
        text=text,
        text_pair=text_pair,
        add_special_tokens=add_special_tokens,
        padding_strategy=padding_strategy,
        truncation_strategy=truncation_strategy,
        max_length=max_length,
        stride=stride,
        is_pretokenized=is_pretokenized,
        pad_to_multiple_of=pad_to_multiple_of,
        return_tensors=return_tensors,
        return_token_type_ids=return_token_type_ids,
        return_attention_mask=return_attention_mask,
        return_overflowing_tokens=return_overflowing_tokens,
        return_special_tokens_mask=return_special_tokens_mask,
        return_offsets_mapping=return_offsets_mapping,
        return_length=return_length,
        verbose=verbose,
    )
    # Two separate unpackings keep the "multiple values for keyword argument" error on duplicates.
    return self._encode_plus(**forwarded, **kwargs)
def _encode_plus(
    self,
    text: Union[TextInput, PreTokenizedInput, EncodedInput],
    text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None,
    add_special_tokens: bool = True,
    padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
    truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
    max_length: Optional[int] = None,
    stride: int = 0,
    is_pretokenized: bool = False,
    pad_to_multiple_of: Optional[int] = None,
    return_tensors: Optional[Union[str, TensorType]] = None,
    return_token_type_ids: Optional[bool] = None,
    return_attention_mask: Optional[bool] = None,
    return_overflowing_tokens: bool = False,
    return_special_tokens_mask: bool = False,
    return_offsets_mapping: bool = False,
    return_length: bool = False,
    verbose: bool = True,
    **kwargs
) -> BatchEncoding:
    """
    Backend of :obj:`encode_plus`, called with already resolved padding and truncation strategies.

    This base implementation only raises :obj:`NotImplementedError`.
    """
    raise NotImplementedError()
def batch_encode_plus(
    self,
    batch_text_or_text_pairs: Union[
        List[TextInput],
        List[TextInputPair],
        List[PreTokenizedInput],
        List[PreTokenizedInputPair],
        List[EncodedInput],
        List[EncodedInputPair],
    ],
    add_special_tokens: bool = True,
    padding: Union[bool, str] = False,
    truncation: Union[bool, str] = False,
    max_length: Optional[int] = None,
    stride: int = 0,
    is_pretokenized: bool = False,
    pad_to_multiple_of: Optional[int] = None,
    return_tensors: Optional[Union[str, TensorType]] = None,
    return_token_type_ids: Optional[bool] = None,
    return_attention_mask: Optional[bool] = None,
    return_overflowing_tokens: bool = False,
    return_special_tokens_mask: bool = False,
    return_offsets_mapping: bool = False,
    return_length: bool = False,
    verbose: bool = True,
    **kwargs
) -> BatchEncoding:
    """
    Encode a batch of sequences or a batch of pairs of sequences.

    The ``padding`` / ``truncation`` / ``max_length`` arguments (and any extra keyword arguments) are
    first resolved by :obj:`_get_padding_truncation_strategies`; the actual encoding is then delegated
    to :obj:`_batch_encode_plus` with the resolved strategies.

    Args:
        batch_text_or_text_pairs (:obj:`List[str]`, :obj:`List[Tuple[str, str]]`,
                                  :obj:`List[List[str]]`, :obj:`List[Tuple[List[str], List[str]]]`,
                                  and for not-fast tokenizers, also:
                                  :obj:`List[List[int]]`, :obj:`List[Tuple[List[int], List[int]]]`):
            Batch of sequences or pair of sequences to be encoded.
            This can be a list of string/string-sequences/int-sequences or a list of pair of
            string/string-sequences/int-sequence (see details in encode_plus)
    """
    # Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
    resolved = self._get_padding_truncation_strategies(
        padding=padding,
        truncation=truncation,
        max_length=max_length,
        pad_to_multiple_of=pad_to_multiple_of,
        verbose=verbose,
        **kwargs,
    )
    padding_strategy, truncation_strategy, max_length, kwargs = resolved

    forwarded = dict(
        batch_text_or_text_pairs=batch_text_or_text_pairs,
        add_special_tokens=add_special_tokens,
        padding_strategy=padding_strategy,
        truncation_strategy=truncation_strategy,
        max_length=max_length,
        stride=stride,
        is_pretokenized=is_pretokenized,
        pad_to_multiple_of=pad_to_multiple_of,
        return_tensors=return_tensors,
        return_token_type_ids=return_token_type_ids,
        return_attention_mask=return_attention_mask,
        return_overflowing_tokens=return_overflowing_tokens,
        return_special_tokens_mask=return_special_tokens_mask,
        return_offsets_mapping=return_offsets_mapping,
        return_length=return_length,
        verbose=verbose,
    )
    # Two separate unpackings keep the "multiple values for keyword argument" error on duplicates.
    return self._batch_encode_plus(**forwarded, **kwargs)
def _batch_encode_plus(
    self,
    batch_text_or_text_pairs: Union[
        List[TextInput],
        List[TextInputPair],
        List[PreTokenizedInput],
        List[PreTokenizedInputPair],
        List[EncodedInput],
        List[EncodedInputPair],
    ],
    add_special_tokens: bool = True,
    padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
    truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
    max_length: Optional[int] = None,
    stride: int = 0,
    is_pretokenized: bool = False,
    pad_to_multiple_of: Optional[int] = None,
    return_tensors: Optional[Union[str, TensorType]] = None,
    return_token_type_ids: Optional[bool] = None,
    return_attention_mask: Optional[bool] = None,
    return_overflowing_tokens: bool = False,
    return_special_tokens_mask: bool = False,
    return_offsets_mapping: bool = False,
    return_length: bool = False,
    verbose: bool = True,
    **kwargs
) -> BatchEncoding:
    """
    Backend of :obj:`batch_encode_plus`, called with already resolved padding and truncation strategies.

    This base implementation only raises :obj:`NotImplementedError`.
    """
    raise NotImplementedError()
def pad(
    self,
    encoded_inputs: Union[
        BatchEncoding,
        List[BatchEncoding],
        Dict[str, EncodedInput],
        Dict[str, List[EncodedInput]],
        List[Dict[str, EncodedInput]],
    ],
    padding: Union[bool, str] = True,
    max_length: Optional[int] = None,
    pad_to_multiple_of: Optional[int] = None,
    return_attention_mask: Optional[bool] = None,
    return_tensors: Optional[Union[str, TensorType]] = None,
    verbose: bool = True,
) -> BatchEncoding:
    """ Pad a single encoded input or a batch of encoded inputs up to predefined length or to the max sequence length in the batch.

        Padding side (left/right) padding token ids are defined at the tokenizer level
        (with ``self.padding_side``, ``self.pad_token_id`` and ``self.pad_token_type_id``)

    Args:
        encoded_inputs: Dictionary of tokenized inputs (`Dict[str, List[int]]`) or batch of tokenized inputs.
            Batch of tokenized inputs can be given as dicts of lists or lists of dicts, both work so you can
            use ``tokenizer.pad()`` during pre-processing as well as in a PyTorch Dataloader collate function.
            (`Dict[str, List[List[int]]]` or `List[Dict[str, List[int]]]`).
        padding: Boolean or specific strategy to use for padding.
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index) among:
            - 'longest' (or `True`, the default): Pad to the longest sequence in the batch
            - 'max_length': Pad to the max length
            - 'do_not_pad' (or `False`): Do not pad
        max_length: length to pad to when the 'max_length' strategy is used. This method never truncates.
        pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
            This is especially useful to enable the use of Tensor Cores on NVIDIA hardware.
        return_attention_mask: (optional) Set to False to avoid returning attention mask (default: set to model specifics)
        return_tensors (:obj:`str`, `optional`, defaults to :obj:`None`):
            Can be set to 'tf', 'pt' or 'np' to return respectively TensorFlow :obj:`tf.constant`,
            PyTorch :obj:`torch.Tensor` or Numpy :obj:`np.ndarray` instead of a list of python integers.
        verbose (:obj:`bool`, `optional`, defaults to :obj:`True`):
            Set to ``False`` to avoid printing infos and warnings.

    Returns:
        A :obj:`BatchEncoding` holding the padded inputs. When ``input_ids`` is empty the input is
        returned as given (only an empty ``attention_mask`` is added if it was requested).

    Raises:
        AssertionError: if no ``input_ids`` entry can be found (this includes an empty list of
            encodings) or if the entries of a batch do not all have the same batch size.
    """
    # If we have a list of dicts, let's convert it in a dict of lists.
    # An empty list is left untouched so that the assertion below reports it with a clear message
    # instead of an IndexError on `encoded_inputs[0]`.
    if (
        isinstance(encoded_inputs, (list, tuple))
        and encoded_inputs
        and isinstance(encoded_inputs[0], (dict, BatchEncoding))
    ):
        encoded_inputs = {key: [example[key] for example in encoded_inputs] for key in encoded_inputs[0].keys()}

    assert "input_ids" in encoded_inputs, (
        "You should supply an encoding or a list of encodings to this method. "
        "An encoding is the output of one the encoding methods of the tokenizer, i.e. "
        "__call__/encode_plus/batch_encode_plus. "
    )

    # Nothing to pad
    if not encoded_inputs["input_ids"]:
        if return_attention_mask:
            encoded_inputs["attention_mask"] = []
        return encoded_inputs

    # Convert padding_strategy in PaddingStrategy
    padding_strategy, _, max_length, _ = self._get_padding_truncation_strategies(
        padding=padding, max_length=max_length, verbose=verbose
    )

    # Single example: `input_ids` is a flat list (it is known to be non-empty at this point).
    if not isinstance(encoded_inputs["input_ids"][0], (list, tuple)):
        encoded_inputs = self._pad(
            encoded_inputs,
            max_length=max_length,
            padding_strategy=padding_strategy,
            pad_to_multiple_of=pad_to_multiple_of,
            return_attention_mask=return_attention_mask,
        )
        return BatchEncoding(encoded_inputs, tensor_type=return_tensors)

    batch_size = len(encoded_inputs["input_ids"])
    assert all(
        len(v) == batch_size for v in encoded_inputs.values()
    ), "Some items in the output dictionnary have a different batch size than others."

    # 'longest' has to be resolved over the whole batch before padding examples one by one.
    if padding_strategy == PaddingStrategy.LONGEST:
        max_length = max(len(inputs) for inputs in encoded_inputs["input_ids"])
        padding_strategy = PaddingStrategy.MAX_LENGTH

    batch_outputs = {}
    for i in range(batch_size):
        inputs = {key: values[i] for key, values in encoded_inputs.items()}
        outputs = self._pad(
            inputs,
            max_length=max_length,
            padding_strategy=padding_strategy,
            pad_to_multiple_of=pad_to_multiple_of,
            return_attention_mask=return_attention_mask,
        )
        for key, value in outputs.items():
            batch_outputs.setdefault(key, []).append(value)

    return BatchEncoding(batch_outputs, tensor_type=return_tensors)
def create_token_type_ids_from_sequences(self, token_ids_0: List, token_ids_1: Optional[List] = None) -> List[int]:
    """
    Build the token type ids of a sequence or a pair of sequences: ``0`` for every token of the first
    sequence followed by ``1`` for every token of the second sequence (if any).
    """
    first_segment = [0] * len(token_ids_0)
    if token_ids_1 is not None:
        return first_segment + [1] * len(token_ids_1)
    return first_segment
def build_inputs_with_special_tokens(self, token_ids_0: List, token_ids_1: Optional[List] = None) -> List:
    """
    Build model inputs from a sequence or a pair of sequences by concatenating them.

    This implementation does not add any special token: a single sequence is returned as is (the same
    list object), a pair is returned as the concatenation of both sequences.
    """
    return token_ids_0 if token_ids_1 is None else token_ids_0 + token_ids_1
def prepare_for_model(
    self,
    ids: List[int],
    pair_ids: Optional[List[int]] = None,
    add_special_tokens: bool = True,
    padding: Union[bool, str] = False,
    truncation: Union[bool, str] = False,
    max_length: Optional[int] = None,
    stride: int = 0,
    pad_to_multiple_of: Optional[int] = None,
    return_tensors: Optional[Union[str, TensorType]] = None,
    return_token_type_ids: Optional[bool] = None,
    return_attention_mask: Optional[bool] = None,
    return_overflowing_tokens: bool = False,
    return_special_tokens_mask: bool = False,
    return_offsets_mapping: bool = False,
    return_length: bool = False,
    verbose: bool = True,
    prepend_batch_axis: bool = False,
    **kwargs
) -> BatchEncoding:
    """ Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model.
    It adds special tokens, truncates sequences if overflowing while taking into account the special tokens and
    manages a moving window (with user defined stride) for overflowing tokens

    Args:
        ids: list of tokenized input ids. Can be obtained from a string by chaining the
            `tokenize` and `convert_tokens_to_ids` methods.
        pair_ids: Optional second list of input ids. Can be obtained from a string by chaining the
            `tokenize` and `convert_tokens_to_ids` methods.

    Returns:
        A :obj:`BatchEncoding` with ``input_ids`` and, depending on the ``return_*`` flags and on whether
        truncation/padding took place, ``token_type_ids``, ``special_tokens_mask``, ``attention_mask``,
        ``overflowing_tokens``, ``num_truncated_tokens`` and ``length``.
    """
    if "return_lengths" in kwargs:
        if verbose:
            warnings.warn(
                "The PreTrainedTokenizerBase.prepare_for_model `return_lengths` parameter is deprecated. "
                "Please use `return_length` instead.",
                FutureWarning,
            )
        return_length = kwargs["return_lengths"]

    # Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
    padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
        padding=padding,
        truncation=truncation,
        max_length=max_length,
        pad_to_multiple_of=pad_to_multiple_of,
        verbose=verbose,
        **kwargs,
    )

    pair = pair_ids is not None
    len_ids = len(ids)
    len_pair_ids = len(pair_ids) if pair else 0

    # Load from model defaults
    if return_token_type_ids is None:
        return_token_type_ids = "token_type_ids" in self.model_input_names
    if return_attention_mask is None:
        return_attention_mask = "attention_mask" in self.model_input_names

    encoded_inputs = {}

    # Compute the total size of the returned encodings
    total_len = len_ids + len_pair_ids + (self.num_special_tokens_to_add(pair=pair) if add_special_tokens else 0)

    # Truncation: Handle max sequence length
    if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE and max_length and total_len > max_length:
        ids, pair_ids, overflowing_tokens = self.truncate_sequences(
            ids,
            pair_ids=pair_ids,
            num_tokens_to_remove=total_len - max_length,
            truncation_strategy=truncation_strategy,
            stride=stride,
        )
        if return_overflowing_tokens:
            encoded_inputs["overflowing_tokens"] = overflowing_tokens
            encoded_inputs["num_truncated_tokens"] = total_len - max_length

    # Add special tokens
    if add_special_tokens:
        sequence = self.build_inputs_with_special_tokens(ids, pair_ids)
        token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids)
    else:
        sequence = ids + pair_ids if pair else ids
        token_type_ids = [0] * len(ids) + ([1] * len(pair_ids) if pair else [])

    # Build output dictionnary
    encoded_inputs["input_ids"] = sequence
    if return_token_type_ids:
        encoded_inputs["token_type_ids"] = token_type_ids
    if return_special_tokens_mask:
        if add_special_tokens:
            encoded_inputs["special_tokens_mask"] = self.get_special_tokens_mask(ids, pair_ids)
        else:
            encoded_inputs["special_tokens_mask"] = [0] * len(sequence)

    # Check lengths. The reported length is the one that is actually compared against the model
    # maximum, i.e. the full encoded sequence (both sequences plus special tokens).
    encoded_len = len(encoded_inputs["input_ids"])
    if max_length is None and encoded_len > self.model_max_length and verbose:
        logger.warning(
            "Token indices sequence length is longer than the specified maximum sequence length "
            "for this model ({} > {}). Running this sequence through the model will result in "
            "indexing errors".format(encoded_len, self.model_max_length)
        )

    # Padding (also used to attach the attention mask when no padding is requested)
    if padding_strategy != PaddingStrategy.DO_NOT_PAD or return_attention_mask:
        encoded_inputs = self.pad(
            encoded_inputs,
            max_length=max_length,
            padding=padding_strategy.value,
            pad_to_multiple_of=pad_to_multiple_of,
            return_attention_mask=return_attention_mask,
            verbose=verbose,
        )

    if return_length:
        encoded_inputs["length"] = len(encoded_inputs["input_ids"])

    batch_outputs = BatchEncoding(
        encoded_inputs, tensor_type=return_tensors, prepend_batch_axis=prepend_batch_axis
    )

    return batch_outputs
def truncate_sequences(
    self,
    ids: List[int],
    pair_ids: Optional[List[int]] = None,
    num_tokens_to_remove: int = 0,
    truncation_strategy: Union[str, TruncationStrategy] = "longest_first",
    stride: int = 0,
) -> Tuple[List[int], List[int], List[int]]:
    """ Truncates a sequence or a sequence pair by removing tokens from the end.

    The input lists are not modified: truncated copies are returned.

    Args:
        ids: list of tokenized input ids. Can be obtained from a string by chaining the
            `tokenize` and `convert_tokens_to_ids` methods.
        pair_ids: Optional second list of input ids. Can be obtained from a string by chaining the
            `tokenize` and `convert_tokens_to_ids` methods.
        num_tokens_to_remove (:obj:`int`, `optional`, defaults to ``0``):
            number of tokens to remove using the truncation strategy
        truncation_strategy (:obj:`string`, `optional`, defaults to "longest_first"):
            String selected in the following options:

            - 'longest_first' (default): Remove one token at a time from the end of the currently longest
              sequence (the second sequence on ties) until ``num_tokens_to_remove`` tokens are removed.
            - 'only_first': Only truncate the first sequence. If the first sequence is not longer than
              ``num_tokens_to_remove``, an error is logged and nothing is truncated.
            - 'only_second': Only truncate the second sequence. If the second sequence is not longer than
              ``num_tokens_to_remove``, an error is logged and nothing is truncated.
            - 'do_not_truncate'
        stride (:obj:`int`, `optional`, defaults to ``0``):
            If set to a number along with max_length, the overflowing tokens returned will contain some tokens
            from the main sequence returned. The value of this argument defines the number of additional tokens.

    Returns:
        A tuple ``(ids, pair_ids, overflowing_tokens)``.
    """
    if num_tokens_to_remove <= 0:
        return ids, pair_ids, []

    if not isinstance(truncation_strategy, TruncationStrategy):
        truncation_strategy = TruncationStrategy(truncation_strategy)

    overflowing_tokens = []
    if truncation_strategy == TruncationStrategy.LONGEST_FIRST:
        # NOTE: after the first window, every removed token is appended as it is removed, so the
        # overflowing tokens past that window are in reverse positional order and may mix tokens
        # of both sequences.
        for _ in range(num_tokens_to_remove):
            if pair_ids is None or len(ids) > len(pair_ids):
                if not overflowing_tokens:
                    window_len = min(len(ids), stride + 1)
                else:
                    window_len = 1
                overflowing_tokens.extend(ids[-window_len:])
                ids = ids[:-1]
            else:
                if not overflowing_tokens:
                    window_len = min(len(pair_ids), stride + 1)
                else:
                    window_len = 1
                overflowing_tokens.extend(pair_ids[-window_len:])
                pair_ids = pair_ids[:-1]
    elif truncation_strategy == TruncationStrategy.ONLY_FIRST:
        if len(ids) > num_tokens_to_remove:
            window_len = min(len(ids), stride + num_tokens_to_remove)
            overflowing_tokens = ids[-window_len:]
            ids = ids[:-num_tokens_to_remove]
        else:
            logger.error(
                f"We need to remove {num_tokens_to_remove} to truncate the input "
                f"but the first sequence has a length {len(ids)}. "
                f"Please select another truncation strategy than {truncation_strategy}, "
                f"for instance 'longest_first' or 'only_second'."
            )
    elif truncation_strategy == TruncationStrategy.ONLY_SECOND and pair_ids is not None:
        if len(pair_ids) > num_tokens_to_remove:
            window_len = min(len(pair_ids), stride + num_tokens_to_remove)
            overflowing_tokens = pair_ids[-window_len:]
            pair_ids = pair_ids[:-num_tokens_to_remove]
        else:
            logger.error(
                f"We need to remove {num_tokens_to_remove} to truncate the input "
                f"but the second sequence has a length {len(pair_ids)}. "
                f"Please select another truncation strategy than {truncation_strategy}, "
                f"for instance 'longest_first' or 'only_first'."
            )

    return (ids, pair_ids, overflowing_tokens)
def _pad(
    self,
    encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
    max_length: Optional[int] = None,
    padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
    pad_to_multiple_of: Optional[int] = None,
    return_attention_mask: Optional[bool] = None,
) -> dict:
    """ Pad one encoded input (on left/right and up to predefined length or max length in the batch)

    ``encoded_inputs`` is updated in place and returned.

    Args:
        encoded_inputs: Dictionary of tokenized inputs (`List[int]`) for a single example.
        max_length: length to pad to. Ignored (replaced by the current length of ``input_ids``) when the
            strategy is PaddingStrategy.LONGEST. No truncation is performed here.
        padding_strategy: PaddingStrategy to use for padding.
            - PaddingStrategy.LONGEST: the target length is the length of ``input_ids``
            - PaddingStrategy.MAX_LENGTH: Pad to ``max_length``
            - PaddingStrategy.DO_NOT_PAD: Do not pad (default)
            The tokenizer padding sides are defined in self.padding_side:
                - 'left': pads on the left of the sequences
                - 'right': pads on the right of the sequences
        pad_to_multiple_of: (optional) Integer if set, the target length is rounded up to a multiple of the
            provided value.
        return_attention_mask: (optional) Set to False to avoid returning attention mask (default: set to model specifics)
    """
    # Load from model defaults
    if return_attention_mask is None:
        return_attention_mask = "attention_mask" in self.model_input_names

    if padding_strategy == PaddingStrategy.LONGEST:
        max_length = len(encoded_inputs["input_ids"])

    # Round the target length up to the next multiple when it is not one already.
    if not (max_length is None or pad_to_multiple_of is None) and (max_length % pad_to_multiple_of != 0):
        max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of

    # Guard clause: no padding requested, or the sequence already has the target length.
    if padding_strategy == PaddingStrategy.DO_NOT_PAD or len(encoded_inputs["input_ids"]) == max_length:
        if return_attention_mask:
            encoded_inputs["attention_mask"] = [1] * len(encoded_inputs["input_ids"])
        return encoded_inputs

    sequence_length = len(encoded_inputs["input_ids"])
    difference = max_length - sequence_length

    pad_on_right = self.padding_side == "right"
    if not pad_on_right and self.padding_side != "left":
        raise ValueError("Invalid padding strategy:" + str(self.padding_side))

    def _with_padding(values, filler):
        # Put the filler after the values for right padding, before them for left padding.
        return values + filler if pad_on_right else filler + values

    if return_attention_mask:
        encoded_inputs["attention_mask"] = _with_padding([1] * sequence_length, [0] * difference)
    if "token_type_ids" in encoded_inputs:
        encoded_inputs["token_type_ids"] = _with_padding(
            encoded_inputs["token_type_ids"], [self.pad_token_type_id] * difference
        )
    if "special_tokens_mask" in encoded_inputs:
        encoded_inputs["special_tokens_mask"] = _with_padding(
            encoded_inputs["special_tokens_mask"], [1] * difference
        )
    encoded_inputs["input_ids"] = _with_padding(encoded_inputs["input_ids"], [self.pad_token_id] * difference)

    return encoded_inputs
def batch_decode(self, sequences: List[List[int]], **kwargs) -> List[str]:
    """
    Decode every sequence of ``sequences`` with :obj:`decode`, forwarding ``kwargs`` to each call,
    and return the results as a list in the same order.
    """
    decoded = []
    for token_ids in sequences:
        decoded.append(self.decode(token_ids, **kwargs))
    return decoded
def decode(
    self, token_ids: List[int], skip_special_tokens: bool = False, clean_up_tokenization_spaces: bool = True
) -> str:
    """
    Converts a sequence of ids (integer) in a string, using the tokenizer and vocabulary
    with options to remove special tokens and clean up tokenization spaces.
    Similar to doing ``self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))``.

    This base implementation only raises :obj:`NotImplementedError`.

    Args:
        token_ids: list of tokenized input ids. Can be obtained using the `encode` or `encode_plus` methods.
        skip_special_tokens: if set to True, will remove special tokens.
        clean_up_tokenization_spaces: if set to True, will clean up the tokenization spaces.
    """
    raise NotImplementedError()
def get_special_tokens_mask(
    self, token_ids_0: List, token_ids_1: Optional[List] = None, already_has_special_tokens: bool = False
) -> List[int]:
    """
    Build a mask flagging which ids of an already formatted sequence are special tokens.

    This implementation only handles a single sequence that already contains its special tokens:
    each id of ``token_ids_0`` is checked against ``self.all_special_ids``.

    Args:
        token_ids_0: list of ids, already formatted with the special tokens of the model.
        token_ids_1: must be left to ``None``; sequence pairs are rejected by this implementation.
        already_has_special_tokens: must be set to True; the default (False) is rejected by this
            implementation.

    Returns:
        A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.

    Raises:
        AssertionError: if ``already_has_special_tokens`` is falsy or ``token_ids_1`` is not ``None``.
    """
    # Raise explicitly instead of using an ``assert`` statement so the check is not stripped when
    # Python runs with ``-O``. AssertionError is kept so existing callers see the same exception type.
    if not already_has_special_tokens or token_ids_1 is not None:
        raise AssertionError(
            "You cannot use ``already_has_special_tokens=False`` with this tokenizer. "
            "Please use a slow (full python) tokenizer to activate this argument. "
            "Or set `return_special_tokens_mask=True` when calling the encoding method "
            "to get the special tokens mask in any tokenizer. "
        )

    all_special_ids = self.all_special_ids  # read the attribute once instead of once per token
    special_tokens_mask = [1 if token in all_special_ids else 0 for token in token_ids_0]
    return special_tokens_mask
def clean_up_tokenization(out_string: str) -> str:
    """
    Remove simple English tokenization artifacts from a string: the space left before
    punctuation marks and before abbreviated forms (``n't``, ``'m``, ``'s``, ``'ve``, ``'re``).

    Args:
        out_string: the text to clean up.

    Returns:
        The text with the artifacts replaced.
    """
    # Applied sequentially, in this exact order: each replacement sees the output of the previous one.
    substitutions = (
        (" .", "."),
        (" ?", "?"),
        (" !", "!"),
        (" ,", ","),
        (" ' ", "'"),
        (" n't", "n't"),
        (" 'm", "'m"),
        (" 's", "'s"),
        (" 've", "'ve"),
        (" 're", "'re"),
    )
    for artifact, cleaned in substitutions:
        out_string = out_string.replace(artifact, cleaned)
    return out_string