Upload 3 files

Browse files

Files changed (3) hide show

__init__.cpython-312.pyc +0 -0
__init__.py +13 -6
__init__.pyi +382 -363

__init__.cpython-312.pyc CHANGED Viewed

Binary files a/__init__.cpython-312.pyc and b/__init__.cpython-312.pyc differ

__init__.py CHANGED Viewed

@@ -1,8 +1,15 @@
 # Generated content DO NOT EDIT
-from .. import models
-Model = models.Model
-BPE = models.BPE
-Unigram = models.Unigram
-WordLevel = models.WordLevel
-WordPiece = models.WordPiece

 # Generated content DO NOT EDIT
+from .. import pre_tokenizers
+PreTokenizer = pre_tokenizers.PreTokenizer
+BertPreTokenizer = pre_tokenizers.BertPreTokenizer
+ByteLevel = pre_tokenizers.ByteLevel
+CharDelimiterSplit = pre_tokenizers.CharDelimiterSplit
+Digits = pre_tokenizers.Digits
+Metaspace = pre_tokenizers.Metaspace
+Punctuation = pre_tokenizers.Punctuation
+Sequence = pre_tokenizers.Sequence
+Split = pre_tokenizers.Split
+UnicodeScripts = pre_tokenizers.UnicodeScripts
+Whitespace = pre_tokenizers.Whitespace
+WhitespaceSplit = pre_tokenizers.WhitespaceSplit

__init__.pyi CHANGED Viewed

@@ -1,591 +1,610 @@
 # Generated content DO NOT EDIT
-class Model:
     """
-    Base class for all models
-    The model represents the actual tokenization algorithm. This is the part that
-    will contain and manage the learned vocabulary.
-    This class cannot be constructed directly. Please use one of the concrete models.
     """
-    def get_trainer(self):
         """
-        Get the associated :class:`~tokenizers.trainers.Trainer`
-        Retrieve the :class:`~tokenizers.trainers.Trainer` associated to this
-        :class:`~tokenizers.models.Model`.
-        Returns:
-            :class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model
         """
         pass
-    def id_to_token(self, id):
         """
-        Get the token associated to an ID
         Args:
-            id (:obj:`int`):
-                An ID to convert to a token
         Returns:
-            :obj:`str`: The token associated to the ID
         """
         pass
-    def save(self, folder, prefix):
-        """
-        Save the current model
-        Save the current model in the given folder, using the given prefix for the various
-        files that will get created.
-        Any file with the same name that already exists in this folder will be overwritten.
-        Args:
-            folder (:obj:`str`):
-                The path to the target folder in which to save the various files
-            prefix (:obj:`str`, `optional`):
-                An optional prefix, used to prefix each file name
-        Returns:
-            :obj:`List[str]`: The list of saved files
-        """
         pass
-    def token_to_id(self, tokens):
         """
-        Get the ID associated to a token
-        Args:
-            token (:obj:`str`):
-                A token to convert to an ID
-        Returns:
-            :obj:`int`: The ID associated to the token
         """
         pass
-    def tokenize(self, sequence):
         """
-        Tokenize a sequence
         Args:
             sequence (:obj:`str`):
-                A sequence to tokenize
         Returns:
-            A :obj:`List` of :class:`~tokenizers.Token`: The generated tokens
         """
         pass
-class BPE(Model):
     """
-    An implementation of the BPE (Byte-Pair Encoding) algorithm
-    Args:
-        vocab (:obj:`Dict[str, int]`, `optional`):
-            A dictionary of string keys and their ids :obj:`{"am": 0,...}`
-        merges (:obj:`List[Tuple[str, str]]`, `optional`):
-            A list of pairs of tokens (:obj:`Tuple[str, str]`) :obj:`[("a", "b"),...]`
-        cache_capacity (:obj:`int`, `optional`):
-            The number of words that the BPE cache can contain. The cache allows
-            to speed-up the process by keeping the result of the merge operations
-            for a number of words.
-        dropout (:obj:`float`, `optional`):
-            A float between 0 and 1 that represents the BPE dropout to use.
-        unk_token (:obj:`str`, `optional`):
-            The unknown token to be used by the model.
-        continuing_subword_prefix (:obj:`str`, `optional`):
-            The prefix to attach to subword units that don't represent a beginning of word.
-        end_of_word_suffix (:obj:`str`, `optional`):
-            The suffix to attach to subword units that represent an end of word.
-        fuse_unk (:obj:`bool`, `optional`):
-            Whether to fuse any subsequent unknown tokens into a single one
-        byte_fallback (:obj:`bool`, `optional`):
-            Whether to use spm byte-fallback trick (defaults to False)
-        ignore_merges (:obj:`bool`, `optional`):
-            Whether or not to match tokens with the vocab before using merges.
     """
-    def __init__(
-        self,
-        vocab=None,
-        merges=None,
-        cache_capacity=None,
-        dropout=None,
-        unk_token=None,
-        continuing_subword_prefix=None,
-        end_of_word_suffix=None,
-        fuse_unk=None,
-        byte_fallback=False,
-        ignore_merges=False,
-    ):
         pass
     @staticmethod
-    def from_file(cls, vocab, merge, **kwargs):
         """
-        Instantiate a BPE model from the given files.
-        This method is roughly equivalent to doing::
-           vocab, merges = BPE.read_file(vocab_filename, merges_filename)
-           bpe = BPE(vocab, merges)
-        If you don't need to keep the :obj:`vocab, merges` values lying around,
-        this method is more optimized than manually calling
-        :meth:`~tokenizers.models.BPE.read_file` to initialize a :class:`~tokenizers.models.BPE`
-        Args:
-            vocab (:obj:`str`):
-                The path to a :obj:`vocab.json` file
-            merges (:obj:`str`):
-                The path to a :obj:`merges.txt` file
         Returns:
-            :class:`~tokenizers.models.BPE`: An instance of BPE loaded from these files
         """
         pass
-    def get_trainer(self):
         """
-        Get the associated :class:`~tokenizers.trainers.Trainer`
-        Retrieve the :class:`~tokenizers.trainers.Trainer` associated to this
-        :class:`~tokenizers.models.Model`.
-        Returns:
-            :class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model
-        """
-        pass
-    def id_to_token(self, id):
-        """
-        Get the token associated to an ID
         Args:
-            id (:obj:`int`):
-                An ID to convert to a token
-        Returns:
-            :obj:`str`: The token associated to the ID
         """
         pass
-    @staticmethod
-    def read_file(self, vocab, merges):
         """
-        Read a :obj:`vocab.json` and a :obj:`merges.txt` files
-        This method provides a way to read and parse the content of these files,
-        returning the relevant data structures. If you want to instantiate some BPE models
-        from memory, this method gives you the expected input from the standard files.
         Args:
-            vocab (:obj:`str`):
-                The path to a :obj:`vocab.json` file
-            merges (:obj:`str`):
-                The path to a :obj:`merges.txt` file
         Returns:
-            A :obj:`Tuple` with the vocab and the merges:
-                The vocabulary and merges loaded into memory
         """
         pass
-    def save(self, folder, prefix):
-        """
-        Save the current model
-        Save the current model in the given folder, using the given prefix for the various
-        files that will get created.
-        Any file with the same name that already exists in this folder will be overwritten.
-        Args:
-            folder (:obj:`str`):
-                The path to the target folder in which to save the various files
-            prefix (:obj:`str`, `optional`):
-                An optional prefix, used to prefix each file name
-        Returns:
-            :obj:`List[str]`: The list of saved files
         """
-        pass
-    def token_to_id(self, tokens):
-        """
-        Get the ID associated to a token
         Args:
-            token (:obj:`str`):
-                A token to convert to an ID
-        Returns:
-            :obj:`int`: The ID associated to the token
         """
         pass
-    def tokenize(self, sequence):
         """
-        Tokenize a sequence
         Args:
             sequence (:obj:`str`):
-                A sequence to tokenize
         Returns:
-            A :obj:`List` of :class:`~tokenizers.Token`: The generated tokens
         """
         pass
-class Unigram(Model):
     """
-    An implementation of the Unigram algorithm
     Args:
-        vocab (:obj:`List[Tuple[str, float]]`, `optional`, `optional`):
-            A list of vocabulary items and their relative score [("am", -0.2442),...]
     """
-    def __init__(self, vocab, unk_id, byte_fallback):
         pass
-    def get_trainer(self):
         """
-        Get the associated :class:`~tokenizers.trainers.Trainer`
-        Retrieve the :class:`~tokenizers.trainers.Trainer` associated to this
-        :class:`~tokenizers.models.Model`.
-        Returns:
-            :class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model
         """
         pass
-    def id_to_token(self, id):
         """
-        Get the token associated to an ID
         Args:
-            id (:obj:`int`):
-                An ID to convert to a token
         Returns:
-            :obj:`str`: The token associated to the ID
         """
         pass
-    def save(self, folder, prefix):
-        """
-        Save the current model
-        Save the current model in the given folder, using the given prefix for the various
-        files that will get created.
-        Any file with the same name that already exists in this folder will be overwritten.
-        Args:
-            folder (:obj:`str`):
-                The path to the target folder in which to save the various files
-            prefix (:obj:`str`, `optional`):
-                An optional prefix, used to prefix each file name
-        Returns:
-            :obj:`List[str]`: The list of saved files
-        """
         pass
-    def token_to_id(self, tokens):
         """
-        Get the ID associated to a token
-        Args:
-            token (:obj:`str`):
-                A token to convert to an ID
-        Returns:
-            :obj:`int`: The ID associated to the token
         """
         pass
-    def tokenize(self, sequence):
         """
-        Tokenize a sequence
         Args:
             sequence (:obj:`str`):
-                A sequence to tokenize
         Returns:
-            A :obj:`List` of :class:`~tokenizers.Token`: The generated tokens
         """
         pass
-class WordLevel(Model):
     """
-    An implementation of the WordLevel algorithm
-    Most simple tokenizer model based on mapping tokens to their corresponding id.
     Args:
-        vocab (:obj:`str`, `optional`):
-            A dictionary of string keys and their ids :obj:`{"am": 0,...}`
-        unk_token (:obj:`str`, `optional`):
-            The unknown token to be used by the model.
     """
-    def __init__(self, vocab, unk_token):
         pass
-    @staticmethod
-    def from_file(vocab, unk_token):
         """
-        Instantiate a WordLevel model from the given file
-        This method is roughly equivalent to doing::
-            vocab = WordLevel.read_file(vocab_filename)
-            wordlevel = WordLevel(vocab)
-        If you don't need to keep the :obj:`vocab` values lying around, this method is
-        more optimized than manually calling :meth:`~tokenizers.models.WordLevel.read_file` to
-        initialize a :class:`~tokenizers.models.WordLevel`
         Args:
-            vocab (:obj:`str`):
-                The path to a :obj:`vocab.json` file
-        Returns:
-            :class:`~tokenizers.models.WordLevel`: An instance of WordLevel loaded from file
         """
         pass
-    def get_trainer(self):
         """
-        Get the associated :class:`~tokenizers.trainers.Trainer`
-        Retrieve the :class:`~tokenizers.trainers.Trainer` associated to this
-        :class:`~tokenizers.models.Model`.
         Returns:
-            :class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model
         """
         pass
-    def id_to_token(self, id):
         """
-        Get the token associated to an ID
-        Args:
-            id (:obj:`int`):
-                An ID to convert to a token
-        Returns:
-            :obj:`str`: The token associated to the ID
         """
         pass
-    @staticmethod
-    def read_file(vocab):
         """
-        Read a :obj:`vocab.json`
-        This method provides a way to read and parse the content of a vocabulary file,
-        returning the relevant data structures. If you want to instantiate some WordLevel models
-        from memory, this method gives you the expected input from the standard files.
         Args:
-            vocab (:obj:`str`):
-                The path to a :obj:`vocab.json` file
         Returns:
-            :obj:`Dict[str, int]`: The vocabulary as a :obj:`dict`
         """
         pass
-    def save(self, folder, prefix):
-        """
-        Save the current model
-        Save the current model in the given folder, using the given prefix for the various
-        files that will get created.
-        Any file with the same name that already exists in this folder will be overwritten.
-        Args:
-            folder (:obj:`str`):
-                The path to the target folder in which to save the various files
-            prefix (:obj:`str`, `optional`):
-                An optional prefix, used to prefix each file name
-        Returns:
-            :obj:`List[str]`: The list of saved files
-        """
         pass
-    def token_to_id(self, tokens):
         """
-        Get the ID associated to a token
-        Args:
-            token (:obj:`str`):
-                A token to convert to an ID
-        Returns:
-            :obj:`int`: The ID associated to the token
         """
         pass
-    def tokenize(self, sequence):
         """
-        Tokenize a sequence
         Args:
             sequence (:obj:`str`):
-                A sequence to tokenize
         Returns:
-            A :obj:`List` of :class:`~tokenizers.Token`: The generated tokens
         """
         pass
-class WordPiece(Model):
     """
-    An implementation of the WordPiece algorithm
-    Args:
-        vocab (:obj:`Dict[str, int]`, `optional`):
-            A dictionary of string keys and their ids :obj:`{"am": 0,...}`
-        unk_token (:obj:`str`, `optional`):
-            The unknown token to be used by the model.
-        max_input_chars_per_word (:obj:`int`, `optional`):
-            The maximum number of characters to authorize in a single word.
     """
-    def __init__(self, vocab, unk_token, max_input_chars_per_word):
         pass
-    @staticmethod
-    def from_file(vocab, **kwargs):
         """
-        Instantiate a WordPiece model from the given file
-        This method is roughly equivalent to doing::
-            vocab = WordPiece.read_file(vocab_filename)
-            wordpiece = WordPiece(vocab)
-        If you don't need to keep the :obj:`vocab` values lying around, this method is
-        more optimized than manually calling :meth:`~tokenizers.models.WordPiece.read_file` to
-        initialize a :class:`~tokenizers.models.WordPiece`
         Args:
-            vocab (:obj:`str`):
-                The path to a :obj:`vocab.txt` file
-        Returns:
-            :class:`~tokenizers.models.WordPiece`: An instance of WordPiece loaded from file
         """
         pass
-    def get_trainer(self):
         """
-        Get the associated :class:`~tokenizers.trainers.Trainer`
-        Retrieve the :class:`~tokenizers.trainers.Trainer` associated to this
-        :class:`~tokenizers.models.Model`.
-        Returns:
-            :class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model
-        """
-        pass
-    def id_to_token(self, id):
-        """
-        Get the token associated to an ID
         Args:
-            id (:obj:`int`):
-                An ID to convert to a token
         Returns:
-            :obj:`str`: The token associated to the ID
         """
         pass
-    @staticmethod
-    def read_file(vocab):
         """
-        Read a :obj:`vocab.txt` file
-        This method provides a way to read and parse the content of a standard `vocab.txt`
-        file as used by the WordPiece Model, returning the relevant data structures. If you
-        want to instantiate some WordPiece models from memory, this method gives you the
-        expected input from the standard files.
         Args:
-            vocab (:obj:`str`):
-                The path to a :obj:`vocab.txt` file
-        Returns:
-            :obj:`Dict[str, int]`: The vocabulary as a :obj:`dict`
         """
         pass
-    def save(self, folder, prefix):
         """
-        Save the current model
-        Save the current model in the given folder, using the given prefix for the various
-        files that will get created.
-        Any file with the same name that already exists in this folder will be overwritten.
         Args:
-            folder (:obj:`str`):
-                The path to the target folder in which to save the various files
-            prefix (:obj:`str`, `optional`):
-                An optional prefix, used to prefix each file name
         Returns:
-            :obj:`List[str]`: The list of saved files
         """
         pass
-    def token_to_id(self, tokens):
         """
-        Get the ID associated to a token
-        Args:
-            token (:obj:`str`):
-                A token to convert to an ID
-        Returns:
-            :obj:`int`: The ID associated to the token
         """
         pass
-    def tokenize(self, sequence):
         """
-        Tokenize a sequence
         Args:
             sequence (:obj:`str`):
-                A sequence to tokenize
         Returns:
-            A :obj:`List` of :class:`~tokenizers.Token`: The generated tokens
         """
         pass

 # Generated content DO NOT EDIT
+class PreTokenizer:
     """
+    Base class for all pre-tokenizers
+    This class is not supposed to be instantiated directly. Instead, any implementation of a
+    PreTokenizer will return an instance of this class when instantiated.
     """
+    def pre_tokenize(self, pretok):
         """
+        Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
+        This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
+        keep track of the pre-tokenization, and leverage the capabilities of the
+        :class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
+        the pre-tokenization of a raw string, you can use
+        :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
+        Args:
+            pretok (:class:`~tokenizers.PreTokenizedString`):
+                The pre-tokenized string on which to apply this
+                :class:`~tokenizers.pre_tokenizers.PreTokenizer`
         """
         pass
+    def pre_tokenize_str(self, sequence):
         """
+        Pre tokenize the given string
+        This method provides a way to visualize the effect of a
+        :class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
+        alignment, nor does it provide all the capabilities of the
+        :class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
+        :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
         Args:
+            sequence (:obj:`str`):
+                A string to pre-tokenize
         Returns:
+            :obj:`List[Tuple[str, Offsets]]`:
+                A list of tuples with the pre-tokenized parts and their offsets
         """
         pass
+class BertPreTokenizer(PreTokenizer):
+    """
+    BertPreTokenizer
+    This pre-tokenizer splits tokens on spaces, and also on punctuation.
+    Each occurrence of a punctuation character will be treated separately.
+    """
+    def __init__(self):
         pass
+    def pre_tokenize(self, pretok):
         """
+        Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
+        This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
+        keep track of the pre-tokenization, and leverage the capabilities of the
+        :class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
+        the pre-tokenization of a raw string, you can use
+        :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
+        Args:
+            pretok (:class:`~tokenizers.PreTokenizedString`):
+                The pre-tokenized string on which to apply this
+                :class:`~tokenizers.pre_tokenizers.PreTokenizer`
         """
         pass
+    def pre_tokenize_str(self, sequence):
         """
+        Pre tokenize the given string
+        This method provides a way to visualize the effect of a
+        :class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
+        alignment, nor does it provide all the capabilities of the
+        :class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
+        :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
         Args:
             sequence (:obj:`str`):
+                A string to pre-tokenize
         Returns:
+            :obj:`List[Tuple[str, Offsets]]`:
+                A list of tuples with the pre-tokenized parts and their offsets
         """
         pass
+class ByteLevel(PreTokenizer):
     """
+    ByteLevel PreTokenizer
+    This pre-tokenizer takes care of replacing all bytes of the given string
+    with a corresponding representation, as well as splitting into words.
+    Args:
+        add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            Whether to add a space to the first word if there isn't already one. This
+            lets us treat `hello` exactly like `say hello`.
+        use_regex (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            Set this to :obj:`False` to prevent this `pre_tokenizer` from using
+            the GPT2 specific regexp for splitting on whitespace.
     """
+    def __init__(self, add_prefix_space=True, use_regex=True):
         pass
     @staticmethod
+    def alphabet():
         """
+        Returns the alphabet used by this PreTokenizer.
+        Since the ByteLevel works as its name suggests, at the byte level, it
+        encodes each byte value to a unique visible character. This means that there is a
+        total of 256 different characters composing this alphabet.
         Returns:
+            :obj:`List[str]`: A list of characters that compose the alphabet
         """
         pass
+    def pre_tokenize(self, pretok):
         """
+        Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
+        This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
+        keep track of the pre-tokenization, and leverage the capabilities of the
+        :class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
+        the pre-tokenization of a raw string, you can use
+        :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
         Args:
+            pretok (:class:`~tokenizers.PreTokenizedString`):
+                The pre-tokenized string on which to apply this
+                :class:`~tokenizers.pre_tokenizers.PreTokenizer`
         """
         pass
+    def pre_tokenize_str(self, sequence):
         """
+        Pre tokenize the given string
+        This method provides a way to visualize the effect of a
+        :class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
+        alignment, nor does it provide all the capabilities of the
+        :class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
+        :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
         Args:
+            sequence (:obj:`str`):
+                A string to pre-tokenize
         Returns:
+            :obj:`List[Tuple[str, Offsets]]`:
+                A list of tuples with the pre-tokenized parts and their offsets
         """
         pass
+class CharDelimiterSplit(PreTokenizer):
+    """
+    This pre-tokenizer simply splits on the provided char. Works like `.split(delimiter)`
+    Args:
+        delimiter: str:
+            The delimiter char that will be used to split input
+    """
+    def pre_tokenize(self, pretok):
         """
+        Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
+        This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
+        keep track of the pre-tokenization, and leverage the capabilities of the
+        :class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
+        the pre-tokenization of a raw string, you can use
+        :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
         Args:
+            pretok (:class:`~tokenizers.PreTokenizedString`):
+                The pre-tokenized string on which to apply this
+                :class:`~tokenizers.pre_tokenizers.PreTokenizer`
         """
         pass
+    def pre_tokenize_str(self, sequence):
         """
+        Pre tokenize the given string
+        This method provides a way to visualize the effect of a
+        :class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
+        alignment, nor does it provide all the capabilities of the
+        :class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
+        :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
         Args:
             sequence (:obj:`str`):
+                A string to pre-tokenize
         Returns:
+            :obj:`List[Tuple[str, Offsets]]`:
+                A list of tuples with the pre-tokenized parts and their offsets
         """
         pass
+class Digits(PreTokenizer):
     """
+    This pre-tokenizer simply splits using the digits in separate tokens
     Args:
+        individual_digits (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            If set to True, digits will each be separated as follows::
+                "Call 123 please" -> "Call ", "1", "2", "3", " please"
+            If set to False, digits will be grouped as follows::
+                "Call 123 please" -> "Call ", "123", " please"
     """
+    def __init__(self, individual_digits=False):
         pass
+    def pre_tokenize(self, pretok):
         """
+        Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
+        This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
+        keep track of the pre-tokenization, and leverage the capabilities of the
+        :class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
+        the pre-tokenization of a raw string, you can use
+        :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
+        Args:
+            pretok (:class:`~tokenizers.PreTokenizedString`):
+                The pre-tokenized string on which to apply this
+                :class:`~tokenizers.pre_tokenizers.PreTokenizer`
         """
         pass
+    def pre_tokenize_str(self, sequence):
         """
+        Pre tokenize the given string
+        This method provides a way to visualize the effect of a
+        :class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
+        alignment, nor does it provide all the capabilities of the
+        :class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
+        :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
         Args:
+            sequence (:obj:`str`):
+                A string to pre-tokenize
         Returns:
+            :obj:`List[Tuple[str, Offsets]]`:
+                A list of tuples with the pre-tokenized parts and their offsets
         """
         pass
+class Metaspace(PreTokenizer):
+    """
+    Metaspace pre-tokenizer
+    This pre-tokenizer replaces any whitespace by the provided replacement character.
+    It then tries to split on these spaces.
+    Args:
+        replacement (:obj:`str`, `optional`, defaults to :obj:`▁`):
+            The replacement character. Must be exactly one character. By default we
+            use the `▁` (U+2581) meta symbol (Same as in SentencePiece).
+        prepend_scheme (:obj:`str`, `optional`, defaults to :obj:`"always"`):
+            Whether to add a space to the first word if there isn't already one. This
+            lets us treat `hello` exactly like `say hello`.
+            Choices: "always", "never", "first". First means the space is only added on the first
+            token (relevant when special tokens are used or other pre_tokenizer are used).
+    """
+    def __init__(self, replacement="_", prepend_scheme="always", split=True):
         pass
+    def pre_tokenize(self, pretok):
         """
+        Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
+        This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
+        keep track of the pre-tokenization, and leverage the capabilities of the
+        :class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
+        the pre-tokenization of a raw string, you can use
+        :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
+        Args:
+            pretok (:class:`~tokenizers.PreTokenizedString`):
+                The pre-tokenized string on which to apply this
+                :class:`~tokenizers.pre_tokenizers.PreTokenizer`
         """
         pass
+    def pre_tokenize_str(self, sequence):
         """
+        Pre tokenize the given string
+        This method provides a way to visualize the effect of a
+        :class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
+        alignment, nor does it provide all the capabilities of the
+        :class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
+        :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
         Args:
             sequence (:obj:`str`):
+                A string to pre-tokenize
         Returns:
+            :obj:`List[Tuple[str, Offsets]]`:
+                A list of tuples with the pre-tokenized parts and their offsets
         """
         pass
+class Punctuation(PreTokenizer):
     """
+    This pre-tokenizer simply splits on punctuation as individual characters.
     Args:
+        behavior (:class:`~tokenizers.SplitDelimiterBehavior`):
+            The behavior to use when splitting.
+            Choices: "removed", "isolated" (default), "merged_with_previous", "merged_with_next",
+            "contiguous"
     """
+    def __init__(self, behavior="isolated"):
         pass
+    def pre_tokenize(self, pretok):
         """
+        Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
+        This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
+        keep track of the pre-tokenization, and leverage the capabilities of the
+        :class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
+        the pre-tokenization of a raw string, you can use
+        :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
         Args:
+            pretok (:class:`~tokenizers.PreTokenizedString`):
+                The pre-tokenized string on which to apply this
+                :class:`~tokenizers.pre_tokenizers.PreTokenizer`
         """
         pass
+    def pre_tokenize_str(self, sequence):
         """
+        Pre tokenize the given string
+        This method provides a way to visualize the effect of a
+        :class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
+        alignment, nor does it provide all the capabilities of the
+        :class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
+        :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
+        Args:
+            sequence (:obj:`str`):
+                A string to pre-tokeize
         Returns:
+            :obj:`List[Tuple[str, Offsets]]`:
+                A list of tuple with the pre-tokenized parts and their offsets
         """
         pass
+class Sequence(PreTokenizer):
+    """
+    This pre-tokenizer composes other pre-tokenizers and applies them in sequence
+    """
+    def __init__(self, pretokenizers):
+        pass
+    def pre_tokenize(self, pretok):
         """
+        Pre-tokenize a :class:`~tokenizers.PreTokenizedString` in-place
+        This method allows modifying a :class:`~tokenizers.PreTokenizedString` to
+        keep track of the pre-tokenization, and leverage the capabilities of the
+        :class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
+        the pre-tokenization of a raw string, you can use
+        :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
+        Args:
+            pretok (:class:`~tokenizers.PreTokenizedString`):
+                The pre-tokenized string on which to apply this
+                :class:`~tokenizers.pre_tokenizers.PreTokenizer`
         """
         pass
+    def pre_tokenize_str(self, sequence):
         """
+        Pre-tokenize the given string
+        This method provides a way to visualize the effect of a
+        :class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
+        alignment, nor does it provide all the capabilities of the
+        :class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
+        :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
         Args:
+            sequence (:obj:`str`):
+                A string to pre-tokenize
         Returns:
+            :obj:`List[Tuple[str, Offsets]]`:
+                A list of tuples with the pre-tokenized parts and their offsets
         """
         pass
+class Split(PreTokenizer):
+    """
+    Split PreTokenizer
+    This versatile pre-tokenizer splits using the provided pattern and
+    according to the provided behavior. The pattern can be inverted by
+    making use of the invert flag.
+    Args:
+        pattern (:obj:`str` or :class:`~tokenizers.Regex`):
+            A pattern used to split the string. Usually a string or a regex built with `tokenizers.Regex`.
+            If you want to use a regex pattern, it has to be wrapped in a `tokenizers.Regex`,
+            otherwise we consider it as a string pattern. For example `pattern="|"`
+            means you want to split on `|` (imagine a csv file for example), while
+            `pattern=tokenizers.Regex("1|2")` means you split on either '1' or '2'.
+        behavior (:class:`~tokenizers.SplitDelimiterBehavior`):
+            The behavior to use when splitting.
+            Choices: "removed", "isolated", "merged_with_previous", "merged_with_next",
+            "contiguous"
+        invert (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            Whether to invert the pattern.
+    """
+    def __init__(self, pattern, behavior, invert=False):
         pass
+    def pre_tokenize(self, pretok):
         """
+        Pre-tokenize a :class:`~tokenizers.PreTokenizedString` in-place
+        This method allows modifying a :class:`~tokenizers.PreTokenizedString` to
+        keep track of the pre-tokenization, and leverage the capabilities of the
+        :class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
+        the pre-tokenization of a raw string, you can use
+        :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
+        Args:
+            pretok (:class:`~tokenizers.PreTokenizedString`):
+                The pre-tokenized string on which to apply this
+                :class:`~tokenizers.pre_tokenizers.PreTokenizer`
         """
         pass
+    def pre_tokenize_str(self, sequence):
         """
+        Pre-tokenize the given string
+        This method provides a way to visualize the effect of a
+        :class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
+        alignment, nor does it provide all the capabilities of the
+        :class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
+        :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
         Args:
             sequence (:obj:`str`):
+                A string to pre-tokenize
         Returns:
+            :obj:`List[Tuple[str, Offsets]]`:
+                A list of tuples with the pre-tokenized parts and their offsets
         """
         pass
+class UnicodeScripts(PreTokenizer):
     """
+    This pre-tokenizer splits on characters that belong to different language families
+    It roughly follows https://github.com/google/sentencepiece/blob/master/data/Scripts.txt
+    Actually Hiragana and Katakana are fused with Han, and 0x30FC is Han too.
+    This mimics the SentencePiece Unigram implementation.
     """
+    def __init__(self):
         pass
+    def pre_tokenize(self, pretok):
         """
+        Pre-tokenize a :class:`~tokenizers.PreTokenizedString` in-place
+        This method allows modifying a :class:`~tokenizers.PreTokenizedString` to
+        keep track of the pre-tokenization, and leverage the capabilities of the
+        :class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
+        the pre-tokenization of a raw string, you can use
+        :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
         Args:
+            pretok (:class:`~tokenizers.PreTokenizedString`):
+                The pre-tokenized string on which to apply this
+                :class:`~tokenizers.pre_tokenizers.PreTokenizer`
         """
         pass
+    def pre_tokenize_str(self, sequence):
         """
+        Pre-tokenize the given string
+        This method provides a way to visualize the effect of a
+        :class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
+        alignment, nor does it provide all the capabilities of the
+        :class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
+        :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
         Args:
+            sequence (:obj:`str`):
+                A string to pre-tokenize
         Returns:
+            :obj:`List[Tuple[str, Offsets]]`:
+                A list of tuples with the pre-tokenized parts and their offsets
         """
         pass
+class Whitespace(PreTokenizer):
+    """
+    This pre-tokenizer simply splits using the following regex: `\w+|[^\w\s]+`
+    """
+    def __init__(self):
+        pass
+    def pre_tokenize(self, pretok):
         """
+        Pre-tokenize a :class:`~tokenizers.PreTokenizedString` in-place
+        This method allows modifying a :class:`~tokenizers.PreTokenizedString` to
+        keep track of the pre-tokenization, and leverage the capabilities of the
+        :class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
+        the pre-tokenization of a raw string, you can use
+        :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
         Args:
+            pretok (:class:`~tokenizers.PreTokenizedString`):
+                The pre-tokenized string on which to apply this
+                :class:`~tokenizers.pre_tokenizers.PreTokenizer`
         """
         pass
+    def pre_tokenize_str(self, sequence):
         """
+        Pre-tokenize the given string
+        This method provides a way to visualize the effect of a
+        :class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
+        alignment, nor does it provide all the capabilities of the
+        :class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
+        :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
         Args:
+            sequence (:obj:`str`):
+                A string to pre-tokenize
         Returns:
+            :obj:`List[Tuple[str, Offsets]]`:
+                A list of tuples with the pre-tokenized parts and their offsets
         """
         pass
+class WhitespaceSplit(PreTokenizer):
+    """
+    This pre-tokenizer simply splits on the whitespace. Works like `.split()`
+    """
+    def __init__(self):
+        pass
+    def pre_tokenize(self, pretok):
         """
+        Pre-tokenize a :class:`~tokenizers.PreTokenizedString` in-place
+        This method allows modifying a :class:`~tokenizers.PreTokenizedString` to
+        keep track of the pre-tokenization, and leverage the capabilities of the
+        :class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
+        the pre-tokenization of a raw string, you can use
+        :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
+        Args:
+            pretok (:class:`~tokenizers.PreTokenizedString`):
+                The pre-tokenized string on which to apply this
+                :class:`~tokenizers.pre_tokenizers.PreTokenizer`
         """
         pass
+    def pre_tokenize_str(self, sequence):
         """
+        Pre-tokenize the given string
+        This method provides a way to visualize the effect of a
+        :class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
+        alignment, nor does it provide all the capabilities of the
+        :class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
+        :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
         Args:
             sequence (:obj:`str`):
+                A string to pre-tokenize
         Returns:
+            :obj:`List[Tuple[str, Offsets]]`:
+                A list of tuples with the pre-tokenized parts and their offsets
         """
         pass