Add source batch 10/11
This view is limited to 50 files because it contains too many changes.
- .gitattributes +18 -0
- source/tiktoken-0.12.0.dist-info/INSTALLER +1 -0
- source/tiktoken-0.12.0.dist-info/METADATA +171 -0
- source/tiktoken-0.12.0.dist-info/RECORD +22 -0
- source/tiktoken-0.12.0.dist-info/WHEEL +5 -0
- source/tiktoken-0.12.0.dist-info/licenses/LICENSE +21 -0
- source/tiktoken-0.12.0.dist-info/top_level.txt +2 -0
- source/tiktoken_ext/openai_public.py +162 -0
- source/tokenizers-0.22.2.dist-info/INSTALLER +1 -0
- source/tokenizers-0.22.2.dist-info/METADATA +214 -0
- source/tokenizers-0.22.2.dist-info/RECORD +46 -0
- source/tokenizers-0.22.2.dist-info/WHEEL +5 -0
- source/tokenizers/__init__.py +100 -0
- source/tokenizers/__init__.pyi +1800 -0
- source/tokenizers/decoders/__init__.py +15 -0
- source/tokenizers/decoders/__init__.pyi +569 -0
- source/tokenizers/implementations/__init__.py +6 -0
- source/tokenizers/implementations/base_tokenizer.py +459 -0
- source/tokenizers/implementations/bert_wordpiece.py +151 -0
- source/tokenizers/implementations/byte_level_bpe.py +122 -0
- source/tokenizers/implementations/char_level_bpe.py +150 -0
- source/tokenizers/implementations/sentencepiece_bpe.py +103 -0
- source/tokenizers/implementations/sentencepiece_unigram.py +196 -0
- source/tokenizers/models/__init__.py +8 -0
- source/tokenizers/models/__init__.pyi +744 -0
- source/tokenizers/normalizers/__init__.py +29 -0
- source/tokenizers/normalizers/__init__.pyi +946 -0
- source/tokenizers/pre_tokenizers/__init__.py +16 -0
- source/tokenizers/pre_tokenizers/__init__.pyi +1015 -0
- source/tokenizers/processors/__init__.py +9 -0
- source/tokenizers/processors/__init__.pyi +519 -0
- source/tokenizers/tokenizers.abi3.so +3 -0
- source/tokenizers/tokenizers.pyi +17 -0
- source/tokenizers/tools/__init__.py +1 -0
- source/tokenizers/tools/visualizer-styles.css +170 -0
- source/tokenizers/tools/visualizer.py +407 -0
- source/tokenizers/trainers/__init__.py +8 -0
- source/tokenizers/trainers/__init__.pyi +462 -0
- source/torchaudio-2.9.1.dist-info/INSTALLER +1 -0
- source/torchaudio-2.9.1.dist-info/METADATA +133 -0
- source/torchaudio-2.9.1.dist-info/RECORD +166 -0
- source/torchaudio-2.9.1.dist-info/WHEEL +5 -0
- source/torchaudio-2.9.1.dist-info/licenses/LICENSE +25 -0
- source/torchaudio-2.9.1.dist-info/top_level.txt +1 -0
- source/torchaudio/__init__.py +204 -0
- source/torchaudio/_extension/__init__.py +61 -0
- source/torchaudio/_extension/utils.py +133 -0
- source/torchaudio/_internal/__init__.py +10 -0
- source/torchaudio/_internal/module_utils.py +171 -0
- source/torchaudio/_torchcodec.py +340 -0
.gitattributes CHANGED
@@ -249,3 +249,21 @@ source/rpds/rpds.cpython-312-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
 source/safetensors/_safetensors_rust.abi3.so filter=lfs diff=lfs merge=lfs -text
 source/sentencepiece/_sentencepiece.cpython-312-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
 source/tiktoken/_tiktoken.cpython-312-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
+source/tokenizers/tokenizers.abi3.so filter=lfs diff=lfs merge=lfs -text
+source/torchaudio/lib/_torchaudio.so filter=lfs diff=lfs merge=lfs -text
+source/torchaudio/lib/libctc_prefix_decoder.so filter=lfs diff=lfs merge=lfs -text
+source/torchaudio/lib/libtorchaudio.so filter=lfs diff=lfs merge=lfs -text
+source/torchaudio/lib/pybind11_prefixctc.so filter=lfs diff=lfs merge=lfs -text
+source/torchvision/_C.so filter=lfs diff=lfs merge=lfs -text
+source/torchvision/image.so filter=lfs diff=lfs merge=lfs -text
+source/torchvision.libs/libcudart.e8e8b82a.so.12 filter=lfs diff=lfs merge=lfs -text
+source/torchvision.libs/libjpeg.d246b9ea.so.8 filter=lfs diff=lfs merge=lfs -text
+source/torchvision.libs/libnvjpeg.8dd2b5e6.so.12 filter=lfs diff=lfs merge=lfs -text
+source/torchvision.libs/libpng16.4ef4b109.so.16 filter=lfs diff=lfs merge=lfs -text
+source/torchvision.libs/libwebp.121d56b5.so.7 filter=lfs diff=lfs merge=lfs -text
+source/torchvision.libs/libz.cac6d5fc.so.1 filter=lfs diff=lfs merge=lfs -text
+source/tvm_ffi/core.abi3.so filter=lfs diff=lfs merge=lfs -text
+source/tvm_ffi/lib/libtvm_ffi.so filter=lfs diff=lfs merge=lfs -text
+source/tvm_ffi/lib/libtvm_ffi_testing.so filter=lfs diff=lfs merge=lfs -text
+source/uvloop/loop.cpython-312-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
+source/watchfiles/_rust_notify.cpython-312-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
source/tiktoken-0.12.0.dist-info/INSTALLER ADDED
@@ -0,0 +1 @@
pip
source/tiktoken-0.12.0.dist-info/METADATA ADDED
@@ -0,0 +1,171 @@
Metadata-Version: 2.4
Name: tiktoken
Version: 0.12.0
Summary: tiktoken is a fast BPE tokeniser for use with OpenAI's models
Author: Shantanu Jain
Author-email: shantanu@openai.com
License: MIT License

        Copyright (c) 2022 OpenAI, Shantanu Jain

        Permission is hereby granted, free of charge, to any person obtaining a copy
        of this software and associated documentation files (the "Software"), to deal
        in the Software without restriction, including without limitation the rights
        to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
        copies of the Software, and to permit persons to whom the Software is
        furnished to do so, subject to the following conditions:

        The above copyright notice and this permission notice shall be included in all
        copies or substantial portions of the Software.

        THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
        IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
        FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
        AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
        LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
        OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
        SOFTWARE.

Project-URL: homepage, https://github.com/openai/tiktoken
Project-URL: repository, https://github.com/openai/tiktoken
Project-URL: changelog, https://github.com/openai/tiktoken/blob/main/CHANGELOG.md
Requires-Python: >=3.9
Description-Content-Type: text/markdown
License-File: LICENSE
Requires-Dist: regex>=2022.1.18
Requires-Dist: requests>=2.26.0
Provides-Extra: blobfile
Requires-Dist: blobfile>=2; extra == "blobfile"
Dynamic: license-file

# ⏳ tiktoken

tiktoken is a fast [BPE](https://en.wikipedia.org/wiki/Byte_pair_encoding) tokeniser for use with
OpenAI's models.

```python
import tiktoken
enc = tiktoken.get_encoding("o200k_base")
assert enc.decode(enc.encode("hello world")) == "hello world"

# To get the tokeniser corresponding to a specific model in the OpenAI API:
enc = tiktoken.encoding_for_model("gpt-4o")
```

The open source version of `tiktoken` can be installed from [PyPI](https://pypi.org/project/tiktoken):
```
pip install tiktoken
```

The tokeniser API is documented in `tiktoken/core.py`.

Example code using `tiktoken` can be found in the
[OpenAI Cookbook](https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb).


## Performance

`tiktoken` is between 3-6x faster than a comparable open source tokeniser:



Performance measured on 1GB of text using the GPT-2 tokeniser, using `GPT2TokenizerFast` from
`tokenizers==0.13.2`, `transformers==4.24.0` and `tiktoken==0.2.0`.


## Getting help

Please post questions in the [issue tracker](https://github.com/openai/tiktoken/issues).

If you work at OpenAI, make sure to check the internal documentation or feel free to contact
@shantanu.

## What is BPE anyway?

Language models don't see text like you and I, instead they see a sequence of numbers (known as tokens).
Byte pair encoding (BPE) is a way of converting text into tokens. It has a couple desirable
properties:
1) It's reversible and lossless, so you can convert tokens back into the original text
2) It works on arbitrary text, even text that is not in the tokeniser's training data
3) It compresses the text: the token sequence is shorter than the bytes corresponding to the
   original text. On average, in practice, each token corresponds to about 4 bytes.
4) It attempts to let the model see common subwords. For instance, "ing" is a common subword in
   English, so BPE encodings will often split "encoding" into tokens like "encod" and "ing"
   (instead of e.g. "enc" and "oding"). Because the model will then see the "ing" token again and
   again in different contexts, it helps models generalise and better understand grammar.

`tiktoken` contains an educational submodule that is friendlier if you want to learn more about
the details of BPE, including code that helps visualise the BPE procedure:
```python
from tiktoken._educational import *

# Train a BPE tokeniser on a small amount of text
enc = train_simple_encoding()

# Visualise how the GPT-4 encoder encodes text
enc = SimpleBytePairEncoding.from_tiktoken("cl100k_base")
enc.encode("hello world aaaaaaaaaaaa")
```


## Extending tiktoken

You may wish to extend `tiktoken` to support new encodings. There are two ways to do this.


**Create your `Encoding` object exactly the way you want and simply pass it around.**

```python
cl100k_base = tiktoken.get_encoding("cl100k_base")

# In production, load the arguments directly instead of accessing private attributes
# See openai_public.py for examples of arguments for specific encodings
enc = tiktoken.Encoding(
    # If you're changing the set of special tokens, make sure to use a different name
    # It should be clear from the name what behaviour to expect.
    name="cl100k_im",
    pat_str=cl100k_base._pat_str,
    mergeable_ranks=cl100k_base._mergeable_ranks,
    special_tokens={
        **cl100k_base._special_tokens,
        "<|im_start|>": 100264,
        "<|im_end|>": 100265,
    }
)
```

**Use the `tiktoken_ext` plugin mechanism to register your `Encoding` objects with `tiktoken`.**

This is only useful if you need `tiktoken.get_encoding` to find your encoding, otherwise prefer
option 1.

To do this, you'll need to create a namespace package under `tiktoken_ext`.

Layout your project like this, making sure to omit the `tiktoken_ext/__init__.py` file:
```
my_tiktoken_extension
├── tiktoken_ext
│   └── my_encodings.py
└── setup.py
```

`my_encodings.py` should be a module that contains a variable named `ENCODING_CONSTRUCTORS`.
This is a dictionary from an encoding name to a function that takes no arguments and returns
arguments that can be passed to `tiktoken.Encoding` to construct that encoding. For an example, see
`tiktoken_ext/openai_public.py`. For precise details, see `tiktoken/registry.py`.
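
For concreteness, here is a minimal sketch of what `my_encodings.py` could contain, reusing the `cl100k_im` arguments from option 1 above; the module and encoding names are the illustrative ones from the layout, not part of any real package:

```python
# tiktoken_ext/my_encodings.py -- minimal sketch following the layout above.
# The "cl100k_im" name and token IDs come from the option-1 example.
import tiktoken


def cl100k_im():
    # As in option 1, production code should load these arguments directly
    # instead of reading private attributes.
    base = tiktoken.get_encoding("cl100k_base")
    return {
        "name": "cl100k_im",
        "pat_str": base._pat_str,
        "mergeable_ranks": base._mergeable_ranks,
        "special_tokens": {
            **base._special_tokens,
            "<|im_start|>": 100264,
            "<|im_end|>": 100265,
        },
    }


# tiktoken/registry.py looks this dictionary up by encoding name.
ENCODING_CONSTRUCTORS = {"cl100k_im": cl100k_im}
```
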
Your `setup.py` should look something like this:
```python
from setuptools import setup, find_namespace_packages

setup(
    name="my_tiktoken_extension",
    packages=find_namespace_packages(include=['tiktoken_ext*']),
    install_requires=["tiktoken"],
    ...
)
```

Then simply `pip install ./my_tiktoken_extension` and you should be able to use your
custom encodings! Make sure **not** to use an editable install.
source/tiktoken-0.12.0.dist-info/RECORD ADDED
@@ -0,0 +1,22 @@
tiktoken-0.12.0.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
tiktoken-0.12.0.dist-info/METADATA,sha256=07KObsjwnEfLTQ-oRD0vhrE3Zw-oNMJqTlBbEeJxtZ8,6688
tiktoken-0.12.0.dist-info/RECORD,,
tiktoken-0.12.0.dist-info/WHEEL,sha256=VXvNKn6nFeCM45GEUrNLJOO_J_e-cNJphGt9rWFxyE0,113
tiktoken-0.12.0.dist-info/licenses/LICENSE,sha256=QYy0mbQ2Eo1lPXmUEzOlQ3t74uqSE9zC8E0V1dLFHYY,1078
tiktoken-0.12.0.dist-info/top_level.txt,sha256=54G5MceQnuD7EXvp7jzGxDDapA1iOwsh77jhCN9WKkc,22
tiktoken/__init__.py,sha256=eHlkakibO43-11JFQJUgpC8z2v4ID1r3l3LXjMyEwKc,346
tiktoken/__pycache__/__init__.cpython-312.pyc,,
tiktoken/__pycache__/_educational.cpython-312.pyc,,
tiktoken/__pycache__/core.cpython-312.pyc,,
tiktoken/__pycache__/load.cpython-312.pyc,,
tiktoken/__pycache__/model.cpython-312.pyc,,
tiktoken/__pycache__/registry.cpython-312.pyc,,
tiktoken/_educational.py,sha256=TUFOp8Q91WjrTvGKhCNEyrhtva82UlenXfhPy9zS7VQ,8229
tiktoken/_tiktoken.cpython-312-x86_64-linux-gnu.so,sha256=qCn0iO_VQ7YJKD6D8jDXz6WHH64mSUK59dicngi37S8,3525056
tiktoken/core.py,sha256=TCwORlettZl-da55Ysp52TlLk18nKD6e62Q_0ZFA404,17458
tiktoken/load.py,sha256=dhTOiVIInbhiQ_zmtOZDshKvqSKzXyNOJJWPmJ0S9RU,5919
tiktoken/model.py,sha256=d57kixsksIv6VESndVjvmGBRj8LrSFGAwUCV5xZtxRk,4061
tiktoken/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
tiktoken/registry.py,sha256=7fktZbJ1Kcm8sVyWgEfIy-ZxfUvcXupLUNXKPfSGwQU,3256
tiktoken_ext/__pycache__/openai_public.cpython-312.pyc,,
tiktoken_ext/openai_public.py,sha256=lUOSc45g0Pttyh2tgIcu_EfI4nM7q-y78KI5cO1mwss,5613
source/tiktoken-0.12.0.dist-info/WHEEL ADDED
@@ -0,0 +1,5 @@
Wheel-Version: 1.0
Generator: setuptools (80.9.0)
Root-Is-Purelib: false
Tag: cp312-cp312-manylinux_2_28_x86_64
source/tiktoken-0.12.0.dist-info/licenses/LICENSE ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2022 OpenAI, Shantanu Jain

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
source/tiktoken-0.12.0.dist-info/top_level.txt ADDED
@@ -0,0 +1,2 @@
tiktoken
tiktoken_ext
source/tiktoken_ext/openai_public.py ADDED
@@ -0,0 +1,162 @@
from tiktoken.load import data_gym_to_mergeable_bpe_ranks, load_tiktoken_bpe

ENDOFTEXT = "<|endoftext|>"
FIM_PREFIX = "<|fim_prefix|>"
FIM_MIDDLE = "<|fim_middle|>"
FIM_SUFFIX = "<|fim_suffix|>"
ENDOFPROMPT = "<|endofprompt|>"

# The pattern in the original GPT-2 release is:
# r"""'s|'t|'re|'ve|'m|'ll|'d| ?[\p{L}]+| ?[\p{N}]+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
# This is equivalent, but executes faster:
r50k_pat_str = (
    r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}++| ?\p{N}++| ?[^\s\p{L}\p{N}]++|\s++$|\s+(?!\S)|\s"""
)


def gpt2():
    mergeable_ranks = data_gym_to_mergeable_bpe_ranks(
        vocab_bpe_file="https://openaipublic.blob.core.windows.net/gpt-2/encodings/main/vocab.bpe",
        encoder_json_file="https://openaipublic.blob.core.windows.net/gpt-2/encodings/main/encoder.json",
        vocab_bpe_hash="1ce1664773c50f3e0cc8842619a93edc4624525b728b188a9e0be33b7726adc5",
        encoder_json_hash="196139668be63f3b5d6574427317ae82f612a97c5d1cdaf36ed2256dbf636783",
    )
    return {
        "name": "gpt2",
        "explicit_n_vocab": 50257,
        "pat_str": r50k_pat_str,
        "mergeable_ranks": mergeable_ranks,
        "special_tokens": {ENDOFTEXT: 50256},
    }


def r50k_base():
    mergeable_ranks = load_tiktoken_bpe(
        "https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken",
        expected_hash="306cd27f03c1a714eca7108e03d66b7dc042abe8c258b44c199a7ed9838dd930",
    )
    return {
        "name": "r50k_base",
        "explicit_n_vocab": 50257,
        "pat_str": r50k_pat_str,
        "mergeable_ranks": mergeable_ranks,
        "special_tokens": {ENDOFTEXT: 50256},
    }


def p50k_base():
    mergeable_ranks = load_tiktoken_bpe(
        "https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken",
        expected_hash="94b5ca7dff4d00767bc256fdd1b27e5b17361d7b8a5f968547f9f23eb70d2069",
    )
    return {
        "name": "p50k_base",
        "explicit_n_vocab": 50281,
        "pat_str": r50k_pat_str,
        "mergeable_ranks": mergeable_ranks,
        "special_tokens": {ENDOFTEXT: 50256},
    }


def p50k_edit():
    mergeable_ranks = load_tiktoken_bpe(
        "https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken",
        expected_hash="94b5ca7dff4d00767bc256fdd1b27e5b17361d7b8a5f968547f9f23eb70d2069",
    )
    special_tokens = {ENDOFTEXT: 50256, FIM_PREFIX: 50281, FIM_MIDDLE: 50282, FIM_SUFFIX: 50283}
    return {
        "name": "p50k_edit",
        "pat_str": r50k_pat_str,
        "mergeable_ranks": mergeable_ranks,
        "special_tokens": special_tokens,
    }


def cl100k_base():
    mergeable_ranks = load_tiktoken_bpe(
        "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken",
        expected_hash="223921b76ee99bde995b7ff738513eef100fb51d18c93597a113bcffe865b2a7",
    )
    special_tokens = {
        ENDOFTEXT: 100257,
        FIM_PREFIX: 100258,
        FIM_MIDDLE: 100259,
        FIM_SUFFIX: 100260,
        ENDOFPROMPT: 100276,
    }
    return {
        "name": "cl100k_base",
        "pat_str": r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}++|\p{N}{1,3}+| ?[^\s\p{L}\p{N}]++[\r\n]*+|\s++$|\s*[\r\n]|\s+(?!\S)|\s""",
        "mergeable_ranks": mergeable_ranks,
        "special_tokens": special_tokens,
    }


def o200k_base():
    mergeable_ranks = load_tiktoken_bpe(
        "https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken",
        expected_hash="446a9538cb6c348e3516120d7c08b09f57c36495e2acfffe59a5bf8b0cfb1a2d",
    )
    special_tokens = {ENDOFTEXT: 199999, ENDOFPROMPT: 200018}
    # This regex could be made more efficient. If I was the one working on this encoding, I would
    # have done a few other things differently too, e.g. I think you can allocate tokens more
    # efficiently across languages.
    pat_str = "|".join(
        [
            r"""[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?""",
            r"""[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?""",
            r"""\p{N}{1,3}""",
            r""" ?[^\s\p{L}\p{N}]+[\r\n/]*""",
            r"""\s*[\r\n]+""",
            r"""\s+(?!\S)""",
            r"""\s+""",
        ]
    )
    return {
        "name": "o200k_base",
        "pat_str": pat_str,
        "mergeable_ranks": mergeable_ranks,
        "special_tokens": special_tokens,
    }


def o200k_harmony():
    base_enc = o200k_base()
    name = "o200k_harmony"
    pat_str = base_enc["pat_str"]
    mergeable_ranks = base_enc["mergeable_ranks"]
    special_tokens = {
        **base_enc["special_tokens"],
        "<|startoftext|>": 199998,
        "<|endoftext|>": 199999,
        "<|reserved_200000|>": 200000,
        "<|reserved_200001|>": 200001,
        "<|return|>": 200002,
        "<|constrain|>": 200003,
        "<|reserved_200004|>": 200004,
        "<|channel|>": 200005,
        "<|start|>": 200006,
        "<|end|>": 200007,
        "<|message|>": 200008,
        "<|reserved_200009|>": 200009,
        "<|reserved_200010|>": 200010,
        "<|reserved_200011|>": 200011,
        "<|call|>": 200012,
    } | {f"<|reserved_{i}|>": i for i in range(200013, 201088)}
    return {
        "name": name,
        "pat_str": pat_str,
        "mergeable_ranks": mergeable_ranks,
        "special_tokens": special_tokens,
    }


ENCODING_CONSTRUCTORS = {
    "gpt2": gpt2,
    "r50k_base": r50k_base,
    "p50k_base": p50k_base,
    "p50k_edit": p50k_edit,
    "cl100k_base": cl100k_base,
    "o200k_base": o200k_base,
    "o200k_harmony": o200k_harmony,
}
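
Each constructor above returns plain keyword arguments for `tiktoken.Encoding`, exactly as the README's `ENCODING_CONSTRUCTORS` description says. A minimal sketch of consuming the table directly, bypassing the registry (assumes `tiktoken` is installed and the BPE files can be downloaded on first use):

```python
import tiktoken
from tiktoken_ext.openai_public import ENCODING_CONSTRUCTORS

# Build the gpt2 encoding from its zero-argument constructor function.
enc = tiktoken.Encoding(**ENCODING_CONSTRUCTORS["gpt2"]())
tokens = enc.encode("hello world")
assert enc.decode(tokens) == "hello world"  # BPE is reversible and lossless
assert enc.n_vocab == 50257  # matches explicit_n_vocab above
```
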
source/tokenizers-0.22.2.dist-info/INSTALLER ADDED
@@ -0,0 +1 @@
pip
source/tokenizers-0.22.2.dist-info/METADATA ADDED
@@ -0,0 +1,214 @@
Metadata-Version: 2.4
Name: tokenizers
Version: 0.22.2
Classifier: Development Status :: 5 - Production/Stable
Classifier: Intended Audience :: Developers
Classifier: Intended Audience :: Education
Classifier: Intended Audience :: Science/Research
Classifier: License :: OSI Approved :: Apache Software License
Classifier: Operating System :: OS Independent
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Classifier: Programming Language :: Python :: 3.13
Classifier: Programming Language :: Python :: 3 :: Only
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
Requires-Dist: huggingface-hub>=0.16.4,<2.0
Requires-Dist: pytest ; extra == 'testing'
Requires-Dist: pytest-asyncio ; extra == 'testing'
Requires-Dist: requests ; extra == 'testing'
Requires-Dist: numpy ; extra == 'testing'
Requires-Dist: datasets ; extra == 'testing'
Requires-Dist: ruff ; extra == 'testing'
Requires-Dist: ty ; extra == 'testing'
Requires-Dist: sphinx ; extra == 'docs'
Requires-Dist: sphinx-rtd-theme ; extra == 'docs'
Requires-Dist: setuptools-rust ; extra == 'docs'
Requires-Dist: tokenizers[testing] ; extra == 'dev'
Provides-Extra: testing
Provides-Extra: docs
Provides-Extra: dev
Keywords: NLP,tokenizer,BPE,transformer,deep learning
Author-email: Nicolas Patry <patry.nicolas@protonmail.com>, Anthony Moi <anthony@huggingface.co>
Requires-Python: >=3.9
Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
Project-URL: Homepage, https://github.com/huggingface/tokenizers
Project-URL: Source, https://github.com/huggingface/tokenizers

<p align="center">
    <br>
    <img src="https://huggingface.co/landing/assets/tokenizers/tokenizers-logo.png" width="600"/>
    <br>
<p>
<p align="center">
    <a href="https://badge.fury.io/py/tokenizers">
        <img alt="Build" src="https://badge.fury.io/py/tokenizers.svg">
    </a>
    <a href="https://github.com/huggingface/tokenizers/blob/master/LICENSE">
        <img alt="GitHub" src="https://img.shields.io/github/license/huggingface/tokenizers.svg?color=blue">
    </a>
</p>
<br>

# Tokenizers

Provides an implementation of today's most used tokenizers, with a focus on performance and
versatility.

Bindings over the [Rust](https://github.com/huggingface/tokenizers/tree/master/tokenizers) implementation.
If you are interested in the High-level design, you can go check it there.

Otherwise, let's dive in!

## Main features:

 - Train new vocabularies and tokenize using 4 pre-made tokenizers (Bert WordPiece and the 3
   most common BPE versions).
 - Extremely fast (both training and tokenization), thanks to the Rust implementation. Takes
   less than 20 seconds to tokenize a GB of text on a server's CPU.
 - Easy to use, but also extremely versatile.
 - Designed for research and production.
 - Normalization comes with alignments tracking. It's always possible to get the part of the
   original sentence that corresponds to a given token.
 - Does all the pre-processing: Truncate, Pad, add the special tokens your model needs.

### Installation

#### With pip:

```bash
pip install tokenizers
```

#### From sources:

To use this method, you need to have the Rust installed:

```bash
# Install with:
curl https://sh.rustup.rs -sSf | sh -s -- -y
export PATH="$HOME/.cargo/bin:$PATH"
```

Once Rust is installed, you can compile doing the following

```bash
git clone https://github.com/huggingface/tokenizers
cd tokenizers/bindings/python

# Create a virtual env (you can use yours as well)
python -m venv .env
source .env/bin/activate

# Install `tokenizers` in the current virtual env
pip install -e .
```

### Load a pretrained tokenizer from the Hub

```python
from tokenizers import Tokenizer

tokenizer = Tokenizer.from_pretrained("bert-base-cased")
```

### Using the provided Tokenizers

We provide some pre-build tokenizers to cover the most common cases. You can easily load one of
these using some `vocab.json` and `merges.txt` files:

```python
from tokenizers import CharBPETokenizer

# Initialize a tokenizer
vocab = "./path/to/vocab.json"
merges = "./path/to/merges.txt"
tokenizer = CharBPETokenizer(vocab, merges)

# And then encode:
encoded = tokenizer.encode("I can feel the magic, can you?")
print(encoded.ids)
print(encoded.tokens)
```
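
Because normalization tracks alignments (see the feature list above), every `Encoding` also exposes `offsets`, mapping each token back to a span of the original sentence. A small sketch continuing from the `encoded` object above:

```python
# Each token aligns to a (start, end) span of the original input string.
text = "I can feel the magic, can you?"
for token, (start, end) in zip(encoded.tokens, encoded.offsets):
    print(token, "->", repr(text[start:end]))
```
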

And you can train them just as simply:

```python
from tokenizers import CharBPETokenizer

# Initialize a tokenizer
tokenizer = CharBPETokenizer()

# Then train it!
tokenizer.train([ "./path/to/files/1.txt", "./path/to/files/2.txt" ])

# Now, let's use it:
encoded = tokenizer.encode("I can feel the magic, can you?")

# And finally save it somewhere
tokenizer.save("./path/to/directory/my-bpe.tokenizer.json")
```

#### Provided Tokenizers

 - `CharBPETokenizer`: The original BPE
 - `ByteLevelBPETokenizer`: The byte level version of the BPE
 - `SentencePieceBPETokenizer`: A BPE implementation compatible with the one used by SentencePiece
 - `BertWordPieceTokenizer`: The famous Bert tokenizer, using WordPiece

All of these can be used and trained as explained above!

### Build your own

Whenever these provided tokenizers don't give you enough freedom, you can build your own tokenizer,
by putting all the different parts you need together.
You can check how we implemented the [provided tokenizers](https://github.com/huggingface/tokenizers/tree/master/bindings/python/py_src/tokenizers/implementations) and adapt them easily to your own needs.

#### Building a byte-level BPE

Here is an example showing how to build your own byte-level BPE by putting all the different pieces
together, and then saving it to a single file:

```python
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, trainers, processors

# Initialize a tokenizer
tokenizer = Tokenizer(models.BPE())

# Customize pre-tokenization and decoding
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)
tokenizer.decoder = decoders.ByteLevel()
tokenizer.post_processor = processors.ByteLevel(trim_offsets=True)

# And then train
trainer = trainers.BpeTrainer(
    vocab_size=20000,
    min_frequency=2,
    initial_alphabet=pre_tokenizers.ByteLevel.alphabet()
)
tokenizer.train([
    "./path/to/dataset/1.txt",
    "./path/to/dataset/2.txt",
    "./path/to/dataset/3.txt"
], trainer=trainer)

# And Save it
tokenizer.save("byte-level-bpe.tokenizer.json", pretty=True)
```

Now, when you want to use this tokenizer, this is as simple as:

```python
from tokenizers import Tokenizer

tokenizer = Tokenizer.from_file("byte-level-bpe.tokenizer.json")

encoded = tokenizer.encode("I can feel the magic, can you?")
```

### Typing support and `stub.py`

The compiled PyO3 extension does not expose type annotations, so editors and type checkers would otherwise see most objects as `Any`. The `stub.py` helper walks the loaded extension modules, renders `.pyi` stub files (plus minimal forwarding `__init__.py` shims), and formats them so that tools like mypy/pyright can understand the public API. Run `python stub.py` whenever you change the Python-visible surface to keep the generated stubs in sync.
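
As a toy illustration of that idea (this is not the real `stub.py`, just a sketch of walking a compiled module and rendering a crude stub):

```python
# Sketch: render a minimal .pyi-style listing for one extension submodule.
import inspect

from tokenizers import decoders


def render_stub(mod):
    lines = []
    for name, obj in inspect.getmembers(mod):
        if name.startswith("_") or not inspect.isclass(obj):
            continue
        doc = inspect.getdoc(obj) or ""
        first_line = doc.splitlines()[0] if doc else ""
        lines.append(f"class {name}:")
        lines.append(f'    """{first_line}"""')
        lines.append("    ...")
    return "\n".join(lines)


print(render_stub(decoders))
```
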
source/tokenizers-0.22.2.dist-info/RECORD ADDED
@@ -0,0 +1,46 @@
tokenizers-0.22.2.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
tokenizers-0.22.2.dist-info/METADATA,sha256=FaXdr0ifWSt34Kk0wO60a1ETCpQGTKEpIyr9sKOGjvw,7254
tokenizers-0.22.2.dist-info/RECORD,,
tokenizers-0.22.2.dist-info/WHEEL,sha256=5mwg5nCvp3YrLxikUrE5E0HBDKerMOoBBb70NjCncME,143
tokenizers/__init__.py,sha256=FI7LEi8_7gO-mrsf4hPdhfvGkb8q0rQ3_1MVM3gaajo,2639
tokenizers/__init__.pyi,sha256=MKWF2m4mz7IG1bPTdJ7AjXkQDNzkmQSLMmACQ2VUYJU,55891
tokenizers/__pycache__/__init__.cpython-312.pyc,,
tokenizers/decoders/__init__.py,sha256=hfwM6CFUDvlMGGL4-xsaaYz81K9P5rQI5ZL5UHWK8Y4,372
tokenizers/decoders/__init__.pyi,sha256=T60mFckMbS8YrsonOAPtfvb7VYHUJi9mm47Wd8pT62o,12019
tokenizers/decoders/__pycache__/__init__.cpython-312.pyc,,
tokenizers/implementations/__init__.py,sha256=VzAsplaIo7rl4AFO8Miu7ig7MfZjvonwVblZw01zR6M,310
tokenizers/implementations/__pycache__/__init__.cpython-312.pyc,,
tokenizers/implementations/__pycache__/base_tokenizer.cpython-312.pyc,,
tokenizers/implementations/__pycache__/bert_wordpiece.cpython-312.pyc,,
tokenizers/implementations/__pycache__/byte_level_bpe.cpython-312.pyc,,
tokenizers/implementations/__pycache__/char_level_bpe.cpython-312.pyc,,
tokenizers/implementations/__pycache__/sentencepiece_bpe.cpython-312.pyc,,
tokenizers/implementations/__pycache__/sentencepiece_unigram.cpython-312.pyc,,
tokenizers/implementations/base_tokenizer.py,sha256=PtQ2TSmoMGlTpL8oc8fDvwJVIY6isWGmps9comzsWjE,15806
tokenizers/implementations/bert_wordpiece.py,sha256=sKCum0FKPYdSgJFJN8LDerVBoTDRSqyqSdrcm-lvQqI,5520
tokenizers/implementations/byte_level_bpe.py,sha256=iBepM_z1s5Ky7zFDVrYLc3L5byYrIouk7-k0JGuF10s,4272
tokenizers/implementations/char_level_bpe.py,sha256=Nag_HFq8Rvcucqi8MhV1-0xtoR0C7FjHOecFVURL7ss,5449
tokenizers/implementations/sentencepiece_bpe.py,sha256=c08fKf6i92E2RsKgsxy7LzZfYX8-MACHSRG8U_I5ytY,3721
tokenizers/implementations/sentencepiece_unigram.py,sha256=2RoIfFVpiMkJOtOCskM_VCeCELWaC_bNnds6GvtE0KQ,7630
tokenizers/models/__init__.py,sha256=eJZ4HTAQZpxnKILNylWaTFqxXy-Ba6OKswWN47feeV8,176
tokenizers/models/__init__.pyi,sha256=2gZPQR1Z5_krTzLXx-ts5ai7Fz7bTZ0QI1OSJ5MyOuc,19517
tokenizers/models/__pycache__/__init__.cpython-312.pyc,,
tokenizers/normalizers/__init__.py,sha256=_06w4cqRItveEgIddYaLMScgkSOkIAMIzYCesb5AA4U,841
tokenizers/normalizers/__init__.pyi,sha256=6zYmbFtvdF1WhoWQSdEN974mxHjc7ZwJBA0TI2dJk98,25709
tokenizers/normalizers/__pycache__/__init__.cpython-312.pyc,,
tokenizers/pre_tokenizers/__init__.py,sha256=KV9-EsAykGENUUzkGWCbv4n6YM6hYa1hfnY-gzBpMNE,598
tokenizers/pre_tokenizers/__init__.pyi,sha256=_pc34-Kd2N7Nvs7vTHPULBKjm18iJRM9qLOClVHw9n4,31566
tokenizers/pre_tokenizers/__pycache__/__init__.cpython-312.pyc,,
tokenizers/processors/__init__.py,sha256=xM2DEKwKtHIumHsszM8AMkq-AlaqvBZFXWgLU8SNhOY,307
tokenizers/processors/__init__.pyi,sha256=5L5OBZ7SXCg7AEy51jyDHViaCSHG5c7vW4eWjSVQbUs,14348
tokenizers/processors/__pycache__/__init__.cpython-312.pyc,,
tokenizers/tokenizers.abi3.so,sha256=wRb88egNRhzgo1wzKXTyWUnoNZQW9Qs9UzcYENLOHMw,10074176
tokenizers/tokenizers.pyi,sha256=Mq4G5RcxKiVc0FZd_Omi-bT7YQMRc-iDBU_nPCmCZOA,468
tokenizers/tools/__init__.py,sha256=xG8caB9OHC8cbB01S5vYV14HZxhO6eWbLehsb70ppio,55
tokenizers/tools/__pycache__/__init__.cpython-312.pyc,,
tokenizers/tools/__pycache__/visualizer.cpython-312.pyc,,
tokenizers/tools/visualizer-styles.css,sha256=zAydq1oGWD8QEll4-eyL8Llw0B1sty_hpIE3tYxL02k,4850
tokenizers/tools/visualizer.py,sha256=jtxka01phNP47uQSocIQFO_DMnL3ZHdwohGVDqqYJPo,14834
tokenizers/trainers/__init__.py,sha256=UTu22AGcp76IvpW45xLRbJWET04NxPW6NfCb2YYz0EM,248
tokenizers/trainers/__init__.pyi,sha256=jKtDNXnoX6FWeCTeHz-W62Cj2_JErgYG7h1PReUz1rU,10719
tokenizers/trainers/__pycache__/__init__.cpython-312.pyc,,
source/tokenizers-0.22.2.dist-info/WHEEL ADDED
@@ -0,0 +1,5 @@
Wheel-Version: 1.0
Generator: maturin (1.10.2)
Root-Is-Purelib: false
Tag: cp39-abi3-manylinux_2_17_x86_64
Tag: cp39-abi3-manylinux2014_x86_64
source/tokenizers/__init__.py ADDED
@@ -0,0 +1,100 @@
from enum import Enum
from typing import List, Tuple, Union


Offsets = Tuple[int, int]

TextInputSequence = str
"""A :obj:`str` that represents an input sequence """

PreTokenizedInputSequence = Union[List[str], Tuple[str]]
"""A pre-tokenized input sequence. Can be one of:

- A :obj:`List` of :obj:`str`
- A :obj:`Tuple` of :obj:`str`
"""

TextEncodeInput = Union[
    TextInputSequence,
    Tuple[TextInputSequence, TextInputSequence],
    List[TextInputSequence],
]
"""Represents a textual input for encoding. Can be either:

- A single sequence: :data:`~tokenizers.TextInputSequence`
- A pair of sequences:

  - A :obj:`Tuple` of :data:`~tokenizers.TextInputSequence`
  - Or a :obj:`List` of :data:`~tokenizers.TextInputSequence` of size 2
"""

PreTokenizedEncodeInput = Union[
    PreTokenizedInputSequence,
    Tuple[PreTokenizedInputSequence, PreTokenizedInputSequence],
    List[PreTokenizedInputSequence],
]
"""Represents a pre-tokenized input for encoding. Can be either:

- A single sequence: :data:`~tokenizers.PreTokenizedInputSequence`
- A pair of sequences:

  - A :obj:`Tuple` of :data:`~tokenizers.PreTokenizedInputSequence`
  - Or a :obj:`List` of :data:`~tokenizers.PreTokenizedInputSequence` of size 2
"""

InputSequence = Union[TextInputSequence, PreTokenizedInputSequence]
"""Represents all the possible types of input sequences for encoding. Can be:

- When ``is_pretokenized=False``: :data:`~TextInputSequence`
- When ``is_pretokenized=True``: :data:`~PreTokenizedInputSequence`
"""

EncodeInput = Union[TextEncodeInput, PreTokenizedEncodeInput]
"""Represents all the possible types of input for encoding. Can be:

- When ``is_pretokenized=False``: :data:`~TextEncodeInput`
- When ``is_pretokenized=True``: :data:`~PreTokenizedEncodeInput`
"""


class OffsetReferential(Enum):
    ORIGINAL = "original"
    NORMALIZED = "normalized"


class OffsetType(Enum):
    BYTE = "byte"
    CHAR = "char"


class SplitDelimiterBehavior(Enum):
    REMOVED = "removed"
    ISOLATED = "isolated"
    MERGED_WITH_PREVIOUS = "merged_with_previous"
    MERGED_WITH_NEXT = "merged_with_next"
    CONTIGUOUS = "contiguous"


from .tokenizers import (  # type: ignore[import]
    AddedToken,
    Encoding,
    NormalizedString,
    PreTokenizedString,
    Regex,
    Token,
    Tokenizer,
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    __version__,
)
from .implementations import (
    BertWordPieceTokenizer,
    ByteLevelBPETokenizer,
    CharBPETokenizer,
    SentencePieceBPETokenizer,
    SentencePieceUnigramTokenizer,
)
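
These aliases mirror the input shapes that `Tokenizer.encode` accepts; a short sketch using the `bert-base-cased` tokenizer from the README example above:

```python
from tokenizers import Tokenizer

tok = Tokenizer.from_pretrained("bert-base-cased")

# TextInputSequence: a single str.
tok.encode("Hello world")
# TextEncodeInput as a pair of sequences.
tok.encode("Hello", "world")
# PreTokenizedInputSequence: a list of str, flagged with is_pretokenized=True.
tok.encode(["Hello", "world"], is_pretokenized=True)
```
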
source/tokenizers/__init__.pyi
ADDED
|
@@ -0,0 +1,1800 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Generated content DO NOT EDIT
|
| 2 |
+
class AddedToken:
|
| 3 |
+
"""
|
| 4 |
+
Represents a token that can be added to a :class:`~tokenizers.Tokenizer`.
|
| 5 |
+
It can have special options that define the way it should behave.
|
| 6 |
+
|
| 7 |
+
Args:
|
| 8 |
+
content (:obj:`str`): The content of the token
|
| 9 |
+
|
| 10 |
+
single_word (:obj:`bool`, defaults to :obj:`False`):
|
| 11 |
+
Defines whether this token should only match single words. If :obj:`True`, this
|
| 12 |
+
token will never match inside of a word. For example the token ``ing`` would match
|
| 13 |
+
on ``tokenizing`` if this option is :obj:`False`, but not if it is :obj:`True`.
|
| 14 |
+
The notion of "`inside of a word`" is defined by the word boundaries pattern in
|
| 15 |
+
regular expressions (ie. the token should start and end with word boundaries).
|
| 16 |
+
|
| 17 |
+
lstrip (:obj:`bool`, defaults to :obj:`False`):
|
| 18 |
+
Defines whether this token should strip all potential whitespaces on its left side.
|
| 19 |
+
If :obj:`True`, this token will greedily match any whitespace on its left. For
|
| 20 |
+
example if we try to match the token ``[MASK]`` with ``lstrip=True``, in the text
|
| 21 |
+
``"I saw a [MASK]"``, we would match on ``" [MASK]"``. (Note the space on the left).
|
| 22 |
+
|
| 23 |
+
rstrip (:obj:`bool`, defaults to :obj:`False`):
|
| 24 |
+
Defines whether this token should strip all potential whitespaces on its right
|
| 25 |
+
side. If :obj:`True`, this token will greedily match any whitespace on its right.
|
| 26 |
+
It works just like :obj:`lstrip` but on the right.
|
| 27 |
+
|
| 28 |
+
normalized (:obj:`bool`, defaults to :obj:`True` with :meth:`~tokenizers.Tokenizer.add_tokens` and :obj:`False` with :meth:`~tokenizers.Tokenizer.add_special_tokens`):
|
| 29 |
+
Defines whether this token should match against the normalized version of the input
|
| 30 |
+
text. For example, with the added token ``"yesterday"``, and a normalizer in charge of
|
| 31 |
+
lowercasing the text, the token could be extracted from the input ``"I saw a lion
|
| 32 |
+
Yesterday"``.
|
| 33 |
+
special (:obj:`bool`, defaults to :obj:`False` with :meth:`~tokenizers.Tokenizer.add_tokens` and :obj:`True` with :meth:`~tokenizers.Tokenizer.add_special_tokens`):
|
| 34 |
+
Defines whether this token should be skipped when decoding.
|
| 35 |
+
|
| 36 |
+
"""
|
| 37 |
+
def __init__(self, content=None, single_word=False, lstrip=False, rstrip=False, normalized=True, special=False):
|
| 38 |
+
pass
|
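A minimal usage sketch for the options documented above; the tiny WordLevel vocabulary, the ``ing``/``[MASK]`` tokens and the example sentence are made up purely for illustration::

    from tokenizers import AddedToken, Tokenizer
    from tokenizers.models import WordLevel
    from tokenizers.pre_tokenizers import Whitespace

    # Throwaway vocabulary, just enough to encode the example sentence
    tok = Tokenizer(WordLevel({"i": 0, "saw": 1, "a": 2, "[UNK]": 3}, unk_token="[UNK]"))
    tok.pre_tokenizer = Whitespace()

    # "ing" only matches as a standalone word, never inside "tokenizing"
    tok.add_tokens([AddedToken("ing", single_word=True)])
    # "[MASK]" greedily swallows the whitespace on its left
    tok.add_special_tokens([AddedToken("[MASK]", lstrip=True)])

    print(tok.encode("i saw a [MASK]").tokens)   # ['i', 'saw', 'a', '[MASK]']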
| 39 |
+
|
| 40 |
+
def __getstate__(self):
|
| 41 |
+
""" """
|
| 42 |
+
pass
|
| 43 |
+
|
| 44 |
+
def __setstate__(self, state):
|
| 45 |
+
""" """
|
| 46 |
+
pass
|
| 47 |
+
|
| 48 |
+
@property
|
| 49 |
+
def content(self):
|
| 50 |
+
"""
|
| 51 |
+
Get the content of this :obj:`AddedToken`
|
| 52 |
+
"""
|
| 53 |
+
pass
|
| 54 |
+
|
| 55 |
+
@content.setter
|
| 56 |
+
def content(self, value):
|
| 57 |
+
"""
|
| 58 |
+
Get the content of this :obj:`AddedToken`
|
| 59 |
+
"""
|
| 60 |
+
pass
|
| 61 |
+
|
| 62 |
+
@property
|
| 63 |
+
def lstrip(self):
|
| 64 |
+
"""
|
| 65 |
+
Get the value of the :obj:`lstrip` option
|
| 66 |
+
"""
|
| 67 |
+
pass
|
| 68 |
+
|
| 69 |
+
@lstrip.setter
|
| 70 |
+
def lstrip(self, value):
|
| 71 |
+
"""
|
| 72 |
+
Get the value of the :obj:`lstrip` option
|
| 73 |
+
"""
|
| 74 |
+
pass
|
| 75 |
+
|
| 76 |
+
@property
|
| 77 |
+
def normalized(self):
|
| 78 |
+
"""
|
| 79 |
+
Get the value of the :obj:`normalized` option
|
| 80 |
+
"""
|
| 81 |
+
pass
|
| 82 |
+
|
| 83 |
+
@normalized.setter
|
| 84 |
+
def normalized(self, value):
|
| 85 |
+
"""
|
| 86 |
+
Get the value of the :obj:`normalized` option
|
| 87 |
+
"""
|
| 88 |
+
pass
|
| 89 |
+
|
| 90 |
+
@property
|
| 91 |
+
def rstrip(self):
|
| 92 |
+
"""
|
| 93 |
+
Get the value of the :obj:`rstrip` option
|
| 94 |
+
"""
|
| 95 |
+
pass
|
| 96 |
+
|
| 97 |
+
@rstrip.setter
|
| 98 |
+
def rstrip(self, value):
|
| 99 |
+
"""
|
| 100 |
+
Get the value of the :obj:`rstrip` option
|
| 101 |
+
"""
|
| 102 |
+
pass
|
| 103 |
+
|
| 104 |
+
@property
|
| 105 |
+
def single_word(self):
|
| 106 |
+
"""
|
| 107 |
+
Get the value of the :obj:`single_word` option
|
| 108 |
+
"""
|
| 109 |
+
pass
|
| 110 |
+
|
| 111 |
+
@single_word.setter
|
| 112 |
+
def single_word(self, value):
|
| 113 |
+
"""
|
| 114 |
+
Get the value of the :obj:`single_word` option
|
| 115 |
+
"""
|
| 116 |
+
pass
|
| 117 |
+
|
| 118 |
+
@property
|
| 119 |
+
def special(self):
|
| 120 |
+
"""
|
| 121 |
+
Get the value of the :obj:`special` option
|
| 122 |
+
"""
|
| 123 |
+
pass
|
| 124 |
+
|
| 125 |
+
@special.setter
|
| 126 |
+
def special(self, value):
|
| 127 |
+
"""
|
| 128 |
+
Get the value of the :obj:`special` option
|
| 129 |
+
"""
|
| 130 |
+
pass
|
| 131 |
+
|
| 132 |
+
class Encoding:
|
| 133 |
+
"""
|
| 134 |
+
The :class:`~tokenizers.Encoding` represents the output of a :class:`~tokenizers.Tokenizer`.
|
| 135 |
+
"""
|
| 136 |
+
def __init__(self):
|
| 137 |
+
pass
|
| 138 |
+
|
| 139 |
+
def __getstate__(self):
|
| 140 |
+
""" """
|
| 141 |
+
pass
|
| 142 |
+
|
| 143 |
+
def __setstate__(self, state):
|
| 144 |
+
""" """
|
| 145 |
+
pass
|
| 146 |
+
|
| 147 |
+
@property
|
| 148 |
+
def attention_mask(self):
|
| 149 |
+
"""
|
| 150 |
+
The attention mask
|
| 151 |
+
|
| 152 |
+
This indicates to the LM which tokens should be attended to, and which should not.
|
| 153 |
+
This is especially important when batching sequences, where we need to apply
|
| 154 |
+
padding.
|
| 155 |
+
|
| 156 |
+
Returns:
|
| 157 |
+
:obj:`List[int]`: The attention mask
|
| 158 |
+
"""
|
| 159 |
+
pass
|
| 160 |
+
|
| 161 |
+
@attention_mask.setter
|
| 162 |
+
def attention_mask(self, value):
|
| 163 |
+
"""
|
| 164 |
+
The attention mask
|
| 165 |
+
|
| 166 |
+
This indicates to the LM which tokens should be attended to, and which should not.
|
| 167 |
+
This is especially important when batching sequences, where we need to apply
|
| 168 |
+
padding.
|
| 169 |
+
|
| 170 |
+
Returns:
|
| 171 |
+
:obj:`List[int]`: The attention mask
|
| 172 |
+
"""
|
| 173 |
+
pass
|
| 174 |
+
|
| 175 |
+
def char_to_token(self, char_pos, sequence_index=0):
|
| 176 |
+
"""
|
| 177 |
+
Get the token that contains the char at the given position in the input sequence.
|
| 178 |
+
|
| 179 |
+
Args:
|
| 180 |
+
char_pos (:obj:`int`):
|
| 181 |
+
The position of a char in the input string
|
| 182 |
+
sequence_index (:obj:`int`, defaults to :obj:`0`):
|
| 183 |
+
The index of the sequence that contains the target char
|
| 184 |
+
|
| 185 |
+
Returns:
|
| 186 |
+
:obj:`int`: The index of the token that contains this char in the encoded sequence
|
| 187 |
+
"""
|
| 188 |
+
pass
|
| 189 |
+
|
| 190 |
+
def char_to_word(self, char_pos, sequence_index=0):
|
| 191 |
+
"""
|
| 192 |
+
Get the word that contains the char at the given position in the input sequence.
|
| 193 |
+
|
| 194 |
+
Args:
|
| 195 |
+
char_pos (:obj:`int`):
|
| 196 |
+
The position of a char in the input string
|
| 197 |
+
sequence_index (:obj:`int`, defaults to :obj:`0`):
|
| 198 |
+
The index of the sequence that contains the target char
|
| 199 |
+
|
| 200 |
+
Returns:
|
| 201 |
+
:obj:`int`: The index of the word that contains this char in the input sequence
|
| 202 |
+
"""
|
| 203 |
+
pass
|
| 204 |
+
|
| 205 |
+
@property
|
| 206 |
+
def ids(self):
|
| 207 |
+
"""
|
| 208 |
+
The generated IDs
|
| 209 |
+
|
| 210 |
+
The IDs are the main input to a Language Model. They are the token indices,
|
| 211 |
+
the numerical representations that a LM understands.
|
| 212 |
+
|
| 213 |
+
Returns:
|
| 214 |
+
:obj:`List[int]`: The list of IDs
|
| 215 |
+
"""
|
| 216 |
+
pass
|
| 217 |
+
|
| 218 |
+
@ids.setter
|
| 219 |
+
def ids(self, value):
|
| 220 |
+
"""
|
| 221 |
+
The generated IDs
|
| 222 |
+
|
| 223 |
+
The IDs are the main input to a Language Model. They are the token indices,
|
| 224 |
+
the numerical representations that a LM understands.
|
| 225 |
+
|
| 226 |
+
Returns:
|
| 227 |
+
:obj:`List[int]`: The list of IDs
|
| 228 |
+
"""
|
| 229 |
+
pass
|
| 230 |
+
|
| 231 |
+
@staticmethod
|
| 232 |
+
def merge(encodings, growing_offsets=True):
|
| 233 |
+
"""
|
| 234 |
+
Merge the list of encodings into one final :class:`~tokenizers.Encoding`
|
| 235 |
+
|
| 236 |
+
Args:
|
| 237 |
+
encodings (A :obj:`List` of :class:`~tokenizers.Encoding`):
|
| 238 |
+
The list of encodings that should be merged in one
|
| 239 |
+
|
| 240 |
+
growing_offsets (:obj:`bool`, defaults to :obj:`True`):
|
| 241 |
+
Whether the offsets should accumulate while merging
|
| 242 |
+
|
| 243 |
+
Returns:
|
| 244 |
+
:class:`~tokenizers.Encoding`: The resulting Encoding
|
| 245 |
+
"""
|
| 246 |
+
pass
|
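A small sketch of how ``merge`` can stitch several encodings together; the throwaway WordLevel tokenizer exists only to produce two encodings to merge::

    from tokenizers import Encoding, Tokenizer
    from tokenizers.models import WordLevel
    from tokenizers.pre_tokenizers import Whitespace

    tok = Tokenizer(WordLevel({"hello": 0, "world": 1, "[UNK]": 2}, unk_token="[UNK]"))
    tok.pre_tokenizer = Whitespace()

    first = tok.encode("hello")
    second = tok.encode("world")

    # With growing_offsets=True the offsets accumulate while merging,
    # instead of restarting at 0 for every merged piece.
    merged = Encoding.merge([first, second], growing_offsets=True)
    print(merged.tokens)    # ['hello', 'world']
    print(merged.offsets)   # second piece's offsets are shifted, not reset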
| 247 |
+
|
| 248 |
+
@property
|
| 249 |
+
def n_sequences(self):
|
| 250 |
+
"""
|
| 251 |
+
The number of sequences represented
|
| 252 |
+
|
| 253 |
+
Returns:
|
| 254 |
+
:obj:`int`: The number of sequences in this :class:`~tokenizers.Encoding`
|
| 255 |
+
"""
|
| 256 |
+
pass
|
| 257 |
+
|
| 258 |
+
@n_sequences.setter
|
| 259 |
+
def n_sequences(self, value):
|
| 260 |
+
"""
|
| 261 |
+
The number of sequences represented
|
| 262 |
+
|
| 263 |
+
Returns:
|
| 264 |
+
:obj:`int`: The number of sequences in this :class:`~tokenizers.Encoding`
|
| 265 |
+
"""
|
| 266 |
+
pass
|
| 267 |
+
|
| 268 |
+
@property
|
| 269 |
+
def offsets(self):
|
| 270 |
+
"""
|
| 271 |
+
The offsets associated to each token
|
| 272 |
+
|
| 273 |
+
These offsets let you slice the input string, and thus retrieve the original
|
| 274 |
+
part that led to producing the corresponding token.
|
| 275 |
+
|
| 276 |
+
Returns:
|
| 277 |
+
A :obj:`List` of :obj:`Tuple[int, int]`: The list of offsets
|
| 278 |
+
"""
|
| 279 |
+
pass
|
| 280 |
+
|
| 281 |
+
@offsets.setter
|
| 282 |
+
def offsets(self, value):
|
| 283 |
+
"""
|
| 284 |
+
The offsets associated to each token
|
| 285 |
+
|
| 286 |
+
These offsets let you slice the input string, and thus retrieve the original
|
| 287 |
+
part that led to producing the corresponding token.
|
| 288 |
+
|
| 289 |
+
Returns:
|
| 290 |
+
A :obj:`List` of :obj:`Tuple[int, int]`: The list of offsets
|
| 291 |
+
"""
|
| 292 |
+
pass
|
| 293 |
+
|
| 294 |
+
@property
|
| 295 |
+
def overflowing(self):
|
| 296 |
+
"""
|
| 297 |
+
A :obj:`List` of overflowing :class:`~tokenizers.Encoding`
|
| 298 |
+
|
| 299 |
+
When using truncation, the :class:`~tokenizers.Tokenizer` takes care of splitting
|
| 300 |
+
the output into as many pieces as required to match the specified maximum length.
|
| 301 |
+
This field lets you retrieve all the subsequent pieces.
|
| 302 |
+
|
| 303 |
+
When you use pairs of sequences, the overflowing pieces will contain enough
|
| 304 |
+
variations to cover all the possible combinations, while respecting the provided
|
| 305 |
+
maximum length.
|
| 306 |
+
"""
|
| 307 |
+
pass
|
| 308 |
+
|
| 309 |
+
@overflowing.setter
|
| 310 |
+
def overflowing(self, value):
|
| 311 |
+
"""
|
| 312 |
+
A :obj:`List` of overflowing :class:`~tokenizers.Encoding`
|
| 313 |
+
|
| 314 |
+
When using truncation, the :class:`~tokenizers.Tokenizer` takes care of splitting
|
| 315 |
+
the output into as many pieces as required to match the specified maximum length.
|
| 316 |
+
This field lets you retrieve all the subsequent pieces.
|
| 317 |
+
|
| 318 |
+
When you use pairs of sequences, the overflowing pieces will contain enough
|
| 319 |
+
variations to cover all the possible combinations, while respecting the provided
|
| 320 |
+
maximum length.
|
| 321 |
+
"""
|
| 322 |
+
pass
|
| 323 |
+
|
| 324 |
+
def pad(self, length, direction="right", pad_id=0, pad_type_id=0, pad_token="[PAD]"):
|
| 325 |
+
"""
|
| 326 |
+
Pad the :class:`~tokenizers.Encoding` at the given length
|
| 327 |
+
|
| 328 |
+
Args:
|
| 329 |
+
length (:obj:`int`):
|
| 330 |
+
The desired length
|
| 331 |
+
|
| 332 |
+
direction: (:obj:`str`, defaults to :obj:`right`):
|
| 333 |
+
The expected padding direction. Can be either :obj:`right` or :obj:`left`
|
| 334 |
+
|
| 335 |
+
pad_id (:obj:`int`, defaults to :obj:`0`):
|
| 336 |
+
The ID corresponding to the padding token
|
| 337 |
+
|
| 338 |
+
pad_type_id (:obj:`int`, defaults to :obj:`0`):
|
| 339 |
+
The type ID corresponding to the padding token
|
| 340 |
+
|
| 341 |
+
pad_token (:obj:`str`, defaults to `[PAD]`):
|
| 342 |
+
The pad token to use
|
| 343 |
+
"""
|
| 344 |
+
pass
|
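A short sketch of ``pad`` on a single encoding; the vocabulary and the padding id are illustrative only::

    from tokenizers import Tokenizer
    from tokenizers.models import WordLevel
    from tokenizers.pre_tokenizers import Whitespace

    vocab = {"[PAD]": 0, "hello": 1, "world": 2, "[UNK]": 3}
    tok = Tokenizer(WordLevel(vocab, unk_token="[UNK]"))
    tok.pre_tokenizer = Whitespace()

    enc = tok.encode("hello world")
    enc.pad(5, direction="right", pad_id=0, pad_token="[PAD]")
    print(enc.ids)              # [1, 2, 0, 0, 0]
    print(enc.attention_mask)   # [1, 1, 0, 0, 0]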
| 345 |
+
|
| 346 |
+
@property
|
| 347 |
+
def sequence_ids(self):
|
| 348 |
+
"""
|
| 349 |
+
The generated sequence indices.
|
| 350 |
+
|
| 351 |
+
They represent the index of the input sequence associated to each token.
|
| 352 |
+
The sequence id can be None if the token is not related to any input sequence,
|
| 353 |
+
like for example with special tokens.
|
| 354 |
+
|
| 355 |
+
Returns:
|
| 356 |
+
A :obj:`List` of :obj:`Optional[int]`: A list of optional sequence index.
|
| 357 |
+
"""
|
| 358 |
+
pass
|
| 359 |
+
|
| 360 |
+
@sequence_ids.setter
|
| 361 |
+
def sequence_ids(self, value):
|
| 362 |
+
"""
|
| 363 |
+
The generated sequence indices.
|
| 364 |
+
|
| 365 |
+
They represent the index of the input sequence associated to each token.
|
| 366 |
+
The sequence id can be None if the token is not related to any input sequence,
|
| 367 |
+
like for example with special tokens.
|
| 368 |
+
|
| 369 |
+
Returns:
|
| 370 |
+
A :obj:`List` of :obj:`Optional[int]`: A list of optional sequence index.
|
| 371 |
+
"""
|
| 372 |
+
pass
|
| 373 |
+
|
| 374 |
+
def set_sequence_id(self, sequence_id):
|
| 375 |
+
"""
|
| 376 |
+
Set the given sequence index
|
| 377 |
+
|
| 378 |
+
Set the given sequence index for the whole range of tokens contained in this
|
| 379 |
+
:class:`~tokenizers.Encoding`.
|
| 380 |
+
"""
|
| 381 |
+
pass
|
| 382 |
+
|
| 383 |
+
@property
|
| 384 |
+
def special_tokens_mask(self):
|
| 385 |
+
"""
|
| 386 |
+
The special token mask
|
| 387 |
+
|
| 388 |
+
This indicates which tokens are special tokens, and which are not.
|
| 389 |
+
|
| 390 |
+
Returns:
|
| 391 |
+
:obj:`List[int]`: The special tokens mask
|
| 392 |
+
"""
|
| 393 |
+
pass
|
| 394 |
+
|
| 395 |
+
@special_tokens_mask.setter
|
| 396 |
+
def special_tokens_mask(self, value):
|
| 397 |
+
"""
|
| 398 |
+
The special token mask
|
| 399 |
+
|
| 400 |
+
This indicates which tokens are special tokens, and which are not.
|
| 401 |
+
|
| 402 |
+
Returns:
|
| 403 |
+
:obj:`List[int]`: The special tokens mask
|
| 404 |
+
"""
|
| 405 |
+
pass
|
| 406 |
+
|
| 407 |
+
def token_to_chars(self, token_index):
|
| 408 |
+
"""
|
| 409 |
+
Get the offsets of the token at the given index.
|
| 410 |
+
|
| 411 |
+
The returned offsets are related to the input sequence that contains the
|
| 412 |
+
token. In order to determine in which input sequence it belongs, you
|
| 413 |
+
must call :meth:`~tokenizers.Encoding.token_to_sequence()`.
|
| 414 |
+
|
| 415 |
+
Args:
|
| 416 |
+
token_index (:obj:`int`):
|
| 417 |
+
The index of a token in the encoded sequence.
|
| 418 |
+
|
| 419 |
+
Returns:
|
| 420 |
+
:obj:`Tuple[int, int]`: The token offsets :obj:`(first, last + 1)`
|
| 421 |
+
"""
|
| 422 |
+
pass
|
| 423 |
+
|
| 424 |
+
def token_to_sequence(self, token_index):
|
| 425 |
+
"""
|
| 426 |
+
Get the index of the sequence represented by the given token.
|
| 427 |
+
|
| 428 |
+
In the general use case, this method returns :obj:`0` for a single sequence or
|
| 429 |
+
the first sequence of a pair, and :obj:`1` for the second sequence of a pair
|
| 430 |
+
|
| 431 |
+
Args:
|
| 432 |
+
token_index (:obj:`int`):
|
| 433 |
+
The index of a token in the encoded sequence.
|
| 434 |
+
|
| 435 |
+
Returns:
|
| 436 |
+
:obj:`int`: The sequence id of the given token
|
| 437 |
+
"""
|
| 438 |
+
pass
|
| 439 |
+
|
| 440 |
+
def token_to_word(self, token_index):
|
| 441 |
+
"""
|
| 442 |
+
Get the index of the word that contains the token in one of the input sequences.
|
| 443 |
+
|
| 444 |
+
The returned word index is related to the input sequence that contains
|
| 445 |
+
the token. In order to determine in which input sequence it belongs, you
|
| 446 |
+
must call :meth:`~tokenizers.Encoding.token_to_sequence()`.
|
| 447 |
+
|
| 448 |
+
Args:
|
| 449 |
+
token_index (:obj:`int`):
|
| 450 |
+
The index of a token in the encoded sequence.
|
| 451 |
+
|
| 452 |
+
Returns:
|
| 453 |
+
:obj:`int`: The index of the word in the relevant input sequence.
|
| 454 |
+
"""
|
| 455 |
+
pass
|
| 456 |
+
|
| 457 |
+
@property
|
| 458 |
+
def tokens(self):
|
| 459 |
+
"""
|
| 460 |
+
The generated tokens
|
| 461 |
+
|
| 462 |
+
They are the string representation of the IDs.
|
| 463 |
+
|
| 464 |
+
Returns:
|
| 465 |
+
:obj:`List[str]`: The list of tokens
|
| 466 |
+
"""
|
| 467 |
+
pass
|
| 468 |
+
|
| 469 |
+
@tokens.setter
|
| 470 |
+
def tokens(self, value):
|
| 471 |
+
"""
|
| 472 |
+
The generated tokens
|
| 473 |
+
|
| 474 |
+
They are the string representation of the IDs.
|
| 475 |
+
|
| 476 |
+
Returns:
|
| 477 |
+
:obj:`List[str]`: The list of tokens
|
| 478 |
+
"""
|
| 479 |
+
pass
|
| 480 |
+
|
| 481 |
+
def truncate(self, max_length, stride=0, direction="right"):
|
| 482 |
+
"""
|
| 483 |
+
Truncate the :class:`~tokenizers.Encoding` at the given length
|
| 484 |
+
|
| 485 |
+
If this :class:`~tokenizers.Encoding` represents multiple sequences, when truncating
|
| 486 |
+
this information is lost. It will be considered as representing a single sequence.
|
| 487 |
+
|
| 488 |
+
Args:
|
| 489 |
+
max_length (:obj:`int`):
|
| 490 |
+
The desired length
|
| 491 |
+
|
| 492 |
+
stride (:obj:`int`, defaults to :obj:`0`):
|
| 493 |
+
The length of previous content to be included in each overflowing piece
|
| 494 |
+
|
| 495 |
+
direction (:obj:`str`, defaults to :obj:`right`):
|
| 496 |
+
Truncate direction
|
| 497 |
+
"""
|
| 498 |
+
pass
|
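A sketch of ``truncate`` together with the ``overflowing`` field described above, again on a made-up WordLevel tokenizer; the exact content of the overflowing pieces depends on the stride::

    from tokenizers import Tokenizer
    from tokenizers.models import WordLevel
    from tokenizers.pre_tokenizers import Whitespace

    vocab = {"one": 0, "two": 1, "three": 2, "four": 3, "[UNK]": 4}
    tok = Tokenizer(WordLevel(vocab, unk_token="[UNK]"))
    tok.pre_tokenizer = Whitespace()

    enc = tok.encode("one two three four")
    enc.truncate(2, stride=1)
    print(enc.tokens)                                    # ['one', 'two']
    print([piece.tokens for piece in enc.overflowing])   # remaining pieces, each re-using 1 token of context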
| 499 |
+
|
| 500 |
+
@property
|
| 501 |
+
def type_ids(self):
|
| 502 |
+
"""
|
| 503 |
+
The generated type IDs
|
| 504 |
+
|
| 505 |
+
Generally used for tasks like sequence classification or question answering,
|
| 506 |
+
these tokens let the LM know which input sequence corresponds to each token.
|
| 507 |
+
|
| 508 |
+
Returns:
|
| 509 |
+
:obj:`List[int]`: The list of type ids
|
| 510 |
+
"""
|
| 511 |
+
pass
|
| 512 |
+
|
| 513 |
+
@type_ids.setter
|
| 514 |
+
def type_ids(self, value):
|
| 515 |
+
"""
|
| 516 |
+
The generated type IDs
|
| 517 |
+
|
| 518 |
+
Generally used for tasks like sequence classification or question answering,
|
| 519 |
+
these tokens let the LM know which input sequence corresponds to each token.
|
| 520 |
+
|
| 521 |
+
Returns:
|
| 522 |
+
:obj:`List[int]`: The list of type ids
|
| 523 |
+
"""
|
| 524 |
+
pass
|
| 525 |
+
|
| 526 |
+
@property
|
| 527 |
+
def word_ids(self):
|
| 528 |
+
"""
|
| 529 |
+
The generated word indices.
|
| 530 |
+
|
| 531 |
+
They represent the index of the word associated to each token.
|
| 532 |
+
When the input is pre-tokenized, they correspond to the ID of the given input label,
|
| 533 |
+
otherwise they correspond to the word indices as defined by the
|
| 534 |
+
:class:`~tokenizers.pre_tokenizers.PreTokenizer` that was used.
|
| 535 |
+
|
| 536 |
+
For special tokens and such (any token that was generated from something that was
|
| 537 |
+
not part of the input), the output is :obj:`None`
|
| 538 |
+
|
| 539 |
+
Returns:
|
| 540 |
+
A :obj:`List` of :obj:`Optional[int]`: A list of optional word index.
|
| 541 |
+
"""
|
| 542 |
+
pass
|
| 543 |
+
|
| 544 |
+
@word_ids.setter
|
| 545 |
+
def word_ids(self, value):
|
| 546 |
+
"""
|
| 547 |
+
The generated word indices.
|
| 548 |
+
|
| 549 |
+
They represent the index of the word associated to each token.
|
| 550 |
+
When the input is pre-tokenized, they correspond to the ID of the given input label,
|
| 551 |
+
otherwise they correspond to the word indices as defined by the
|
| 552 |
+
:class:`~tokenizers.pre_tokenizers.PreTokenizer` that was used.
|
| 553 |
+
|
| 554 |
+
For special tokens and such (any token that was generated from something that was
|
| 555 |
+
not part of the input), the output is :obj:`None`
|
| 556 |
+
|
| 557 |
+
Returns:
|
| 558 |
+
A :obj:`List` of :obj:`Optional[int]`: A list of optional word index.
|
| 559 |
+
"""
|
| 560 |
+
pass
|
| 561 |
+
|
| 562 |
+
def word_to_chars(self, word_index, sequence_index=0):
|
| 563 |
+
"""
|
| 564 |
+
Get the offsets of the word at the given index in one of the input sequences.
|
| 565 |
+
|
| 566 |
+
Args:
|
| 567 |
+
word_index (:obj:`int`):
|
| 568 |
+
The index of a word in one of the input sequences.
|
| 569 |
+
sequence_index (:obj:`int`, defaults to :obj:`0`):
|
| 570 |
+
The index of the sequence that contains the target word
|
| 571 |
+
|
| 572 |
+
Returns:
|
| 573 |
+
:obj:`Tuple[int, int]`: The range of characters (span) :obj:`(first, last + 1)`
|
| 574 |
+
"""
|
| 575 |
+
pass
|
| 576 |
+
|
| 577 |
+
def word_to_tokens(self, word_index, sequence_index=0):
|
| 578 |
+
"""
|
| 579 |
+
Get the encoded tokens corresponding to the word at the given index
|
| 580 |
+
in one of the input sequences.
|
| 581 |
+
|
| 582 |
+
Args:
|
| 583 |
+
word_index (:obj:`int`):
|
| 584 |
+
The index of a word in one of the input sequences.
|
| 585 |
+
sequence_index (:obj:`int`, defaults to :obj:`0`):
|
| 586 |
+
The index of the sequence that contains the target word
|
| 587 |
+
|
| 588 |
+
Returns:
|
| 589 |
+
:obj:`Tuple[int, int]`: The range of tokens: :obj:`(first, last + 1)`
|
| 590 |
+
"""
|
| 591 |
+
pass
|
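The mapping helpers above (``char_to_token``, ``token_to_word``, ``word_to_tokens``) can be combined like this; the vocabulary and the sentence are invented for the example::

    from tokenizers import Tokenizer
    from tokenizers.models import WordLevel
    from tokenizers.pre_tokenizers import Whitespace

    tok = Tokenizer(WordLevel({"hello": 0, "big": 1, "world": 2, "[UNK]": 3}, unk_token="[UNK]"))
    tok.pre_tokenizer = Whitespace()

    enc = tok.encode("hello big world")
    print(enc.char_to_token(6))    # char 6 ('b') falls inside the token for "big" -> 1
    print(enc.token_to_word(2))    # token 2 comes from word 2 ("world") -> 2
    print(enc.word_to_tokens(1))   # word 1 ("big") spans the token range (1, 2)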
| 592 |
+
|
| 593 |
+
@property
|
| 594 |
+
def words(self):
|
| 595 |
+
"""
|
| 596 |
+
The generated word indices.
|
| 597 |
+
|
| 598 |
+
.. warning::
|
| 599 |
+
This is deprecated and will be removed in a future version.
|
| 600 |
+
Please use :obj:`~tokenizers.Encoding.word_ids` instead.
|
| 601 |
+
|
| 602 |
+
They represent the index of the word associated to each token.
|
| 603 |
+
When the input is pre-tokenized, they correspond to the ID of the given input label,
|
| 604 |
+
otherwise they correspond to the word indices as defined by the
|
| 605 |
+
:class:`~tokenizers.pre_tokenizers.PreTokenizer` that was used.
|
| 606 |
+
|
| 607 |
+
For special tokens and such (any token that was generated from something that was
|
| 608 |
+
not part of the input), the output is :obj:`None`
|
| 609 |
+
|
| 610 |
+
Returns:
|
| 611 |
+
A :obj:`List` of :obj:`Optional[int]`: A list of optional word index.
|
| 612 |
+
"""
|
| 613 |
+
pass
|
| 614 |
+
|
| 615 |
+
@words.setter
|
| 616 |
+
def words(self, value):
|
| 617 |
+
"""
|
| 618 |
+
The generated word indices.
|
| 619 |
+
|
| 620 |
+
.. warning::
|
| 621 |
+
This is deprecated and will be removed in a future version.
|
| 622 |
+
Please use :obj:`~tokenizers.Encoding.word_ids` instead.
|
| 623 |
+
|
| 624 |
+
They represent the index of the word associated to each token.
|
| 625 |
+
When the input is pre-tokenized, they correspond to the ID of the given input label,
|
| 626 |
+
otherwise they correspond to the word indices as defined by the
|
| 627 |
+
:class:`~tokenizers.pre_tokenizers.PreTokenizer` that was used.
|
| 628 |
+
|
| 629 |
+
For special tokens and such (any token that was generated from something that was
|
| 630 |
+
not part of the input), the output is :obj:`None`
|
| 631 |
+
|
| 632 |
+
Returns:
|
| 633 |
+
A :obj:`List` of :obj:`Optional[int]`: A list of optional word index.
|
| 634 |
+
"""
|
| 635 |
+
pass
|
| 636 |
+
|
| 637 |
+
class NormalizedString:
|
| 638 |
+
"""
|
| 639 |
+
NormalizedString
|
| 640 |
+
|
| 641 |
+
A NormalizedString takes care of modifying an "original" string, to obtain a "normalized" one.
|
| 642 |
+
While making all the requested modifications, it keeps track of the alignment information
|
| 643 |
+
between the two versions of the string.
|
| 644 |
+
|
| 645 |
+
Args:
|
| 646 |
+
sequence: str:
|
| 647 |
+
The string sequence used to initialize this NormalizedString
|
| 648 |
+
"""
|
| 649 |
+
def __init__(self, sequence):
|
| 650 |
+
pass
|
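A sketch of chaining a few of the operations below to strip accents and lowercase a string, following the pattern commonly used for custom normalizers; ``unicodedata`` is only used here to detect combining characters::

    import unicodedata
    from tokenizers import NormalizedString

    ns = NormalizedString("Héllo  WORLD")
    ns.nfd()                                            # decompose accented characters
    ns.filter(lambda c: not unicodedata.combining(c))   # drop the combining accents
    ns.lowercase()

    print(ns.normalized)   # "hello  world"
    print(ns.original)     # the untouched input; alignment between the two is tracked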
| 651 |
+
|
| 652 |
+
def __getitem__(self, key):
|
| 653 |
+
"""
|
| 654 |
+
Return self[key].
|
| 655 |
+
"""
|
| 656 |
+
pass
|
| 657 |
+
|
| 658 |
+
def __getstate__(self, /):
|
| 659 |
+
"""
|
| 660 |
+
Helper for pickle.
|
| 661 |
+
"""
|
| 662 |
+
pass
|
| 663 |
+
|
| 664 |
+
def append(self, s):
|
| 665 |
+
"""
|
| 666 |
+
Append the given sequence to the string
|
| 667 |
+
"""
|
| 668 |
+
pass
|
| 669 |
+
|
| 670 |
+
def clear(self):
|
| 671 |
+
"""
|
| 672 |
+
Clears the string
|
| 673 |
+
"""
|
| 674 |
+
pass
|
| 675 |
+
|
| 676 |
+
def filter(self, func):
|
| 677 |
+
"""
|
| 678 |
+
Filter each character of the string using the given func
|
| 679 |
+
"""
|
| 680 |
+
pass
|
| 681 |
+
|
| 682 |
+
def for_each(self, func):
|
| 683 |
+
"""
|
| 684 |
+
Calls the given function for each character of the string
|
| 685 |
+
"""
|
| 686 |
+
pass
|
| 687 |
+
|
| 688 |
+
def lowercase(self):
|
| 689 |
+
"""
|
| 690 |
+
Lowercase the string
|
| 691 |
+
"""
|
| 692 |
+
pass
|
| 693 |
+
|
| 694 |
+
def lstrip(self):
|
| 695 |
+
"""
|
| 696 |
+
Strip the left of the string
|
| 697 |
+
"""
|
| 698 |
+
pass
|
| 699 |
+
|
| 700 |
+
def map(self, func):
|
| 701 |
+
"""
|
| 702 |
+
Calls the given function for each character of the string
|
| 703 |
+
|
| 704 |
+
Replaces each character of the string using the returned value. Each
|
| 705 |
+
returned value **must** be a str of length 1 (ie a character).
|
| 706 |
+
"""
|
| 707 |
+
pass
|
| 708 |
+
|
| 709 |
+
def nfc(self):
|
| 710 |
+
"""
|
| 711 |
+
Runs the NFC normalization
|
| 712 |
+
"""
|
| 713 |
+
pass
|
| 714 |
+
|
| 715 |
+
def nfd(self):
|
| 716 |
+
"""
|
| 717 |
+
Runs the NFD normalization
|
| 718 |
+
"""
|
| 719 |
+
pass
|
| 720 |
+
|
| 721 |
+
def nfkc(self):
|
| 722 |
+
"""
|
| 723 |
+
Runs the NFKC normalization
|
| 724 |
+
"""
|
| 725 |
+
pass
|
| 726 |
+
|
| 727 |
+
def nfkd(self):
|
| 728 |
+
"""
|
| 729 |
+
Runs the NFKD normalization
|
| 730 |
+
"""
|
| 731 |
+
pass
|
| 732 |
+
|
| 733 |
+
@property
|
| 734 |
+
def normalized(self):
|
| 735 |
+
"""
|
| 736 |
+
The normalized part of the string
|
| 737 |
+
"""
|
| 738 |
+
pass
|
| 739 |
+
|
| 740 |
+
@normalized.setter
|
| 741 |
+
def normalized(self, value):
|
| 742 |
+
"""
|
| 743 |
+
The normalized part of the string
|
| 744 |
+
"""
|
| 745 |
+
pass
|
| 746 |
+
|
| 747 |
+
@property
|
| 748 |
+
def original(self):
|
| 749 |
+
""" """
|
| 750 |
+
pass
|
| 751 |
+
|
| 752 |
+
@original.setter
|
| 753 |
+
def original(self, value):
|
| 754 |
+
""" """
|
| 755 |
+
pass
|
| 756 |
+
|
| 757 |
+
def prepend(self, s):
|
| 758 |
+
"""
|
| 759 |
+
Prepend the given sequence to the string
|
| 760 |
+
"""
|
| 761 |
+
pass
|
| 762 |
+
|
| 763 |
+
def replace(self, pattern, content):
|
| 764 |
+
"""
|
| 765 |
+
Replace the content of the given pattern with the provided content
|
| 766 |
+
|
| 767 |
+
Args:
|
| 768 |
+
pattern: Pattern:
|
| 769 |
+
A pattern used to match the string. Usually a string or a Regex
|
| 770 |
+
|
| 771 |
+
content: str:
|
| 772 |
+
The content to be used as replacement
|
| 773 |
+
"""
|
| 774 |
+
pass
|
| 775 |
+
|
| 776 |
+
def rstrip(self):
|
| 777 |
+
"""
|
| 778 |
+
Strip the right of the string
|
| 779 |
+
"""
|
| 780 |
+
pass
|
| 781 |
+
|
| 782 |
+
def slice(self, range):
|
| 783 |
+
"""
|
| 784 |
+
Slice the string using the given range
|
| 785 |
+
"""
|
| 786 |
+
pass
|
| 787 |
+
|
| 788 |
+
def split(self, pattern, behavior):
|
| 789 |
+
"""
|
| 790 |
+
Split the NormalizedString using the given pattern and the specified behavior
|
| 791 |
+
|
| 792 |
+
Args:
|
| 793 |
+
pattern: Pattern:
|
| 794 |
+
A pattern used to split the string. Usually a string or a regex built with `tokenizers.Regex`
|
| 795 |
+
|
| 796 |
+
behavior: SplitDelimiterBehavior:
|
| 797 |
+
The behavior to use when splitting.
|
| 798 |
+
Choices: "removed", "isolated", "merged_with_previous", "merged_with_next",
|
| 799 |
+
"contiguous"
|
| 800 |
+
|
| 801 |
+
Returns:
|
| 802 |
+
A list of NormalizedString, representing each split
|
| 803 |
+
"""
|
| 804 |
+
pass
|
| 805 |
+
|
| 806 |
+
def strip(self):
|
| 807 |
+
"""
|
| 808 |
+
Strip both ends of the string
|
| 809 |
+
"""
|
| 810 |
+
pass
|
| 811 |
+
|
| 812 |
+
def uppercase(self):
|
| 813 |
+
"""
|
| 814 |
+
Uppercase the string
|
| 815 |
+
"""
|
| 816 |
+
pass
|
| 817 |
+
|
| 818 |
+
class PreTokenizedString:
|
| 819 |
+
"""
|
| 820 |
+
PreTokenizedString
|
| 821 |
+
|
| 822 |
+
Wrapper over a string, that provides a way to normalize, pre-tokenize, tokenize the
|
| 823 |
+
underlying string, while keeping track of the alignment information (offsets).
|
| 824 |
+
|
| 825 |
+
The PreTokenizedString manages what we call `splits`. Each split represents a substring
|
| 826 |
+
which is a subpart of the original string, with the relevant offsets and tokens.
|
| 827 |
+
|
| 828 |
+
When calling one of the methods used to modify the PreTokenizedString (namely one of
|
| 829 |
+
`split`, `normalize` or `tokenize`), only the `splits` that don't have any associated
|
| 830 |
+
tokens will get modified.
|
| 831 |
+
|
| 832 |
+
Args:
|
| 833 |
+
sequence: str:
|
| 834 |
+
The string sequence used to initialize this PreTokenizedString
|
| 835 |
+
"""
|
| 836 |
+
def __init__(self, sequence):
|
| 837 |
+
pass
|
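A sketch of driving ``split`` with a custom callback, assuming a plain string pattern and the ``"removed"`` behavior (one of the choices listed for :meth:`NormalizedString.split` above) are accepted; the input string is arbitrary::

    from tokenizers import NormalizedString, PreTokenizedString

    def split_on_dash(i, normalized):
        # Each returned piece must come from .split() or slicing of the received
        # NormalizedString so that offsets keep pointing into the original text.
        return normalized.split("-", "removed")

    pts = PreTokenizedString("state-of-the-art")
    pts.split(split_on_dash)

    # Each split is reported with its offsets into the original string
    print(pts.get_splits(offset_referential="original", offset_type="char"))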
| 838 |
+
|
| 839 |
+
def __getstate__(self, /):
|
| 840 |
+
"""
|
| 841 |
+
Helper for pickle.
|
| 842 |
+
"""
|
| 843 |
+
pass
|
| 844 |
+
|
| 845 |
+
def get_splits(self, offset_referential="original", offset_type="char"):
|
| 846 |
+
"""
|
| 847 |
+
Get the splits currently managed by the PreTokenizedString
|
| 848 |
+
|
| 849 |
+
Args:
|
| 850 |
+
offset_referential: :obj:`str`
|
| 851 |
+
Whether the returned splits should have offsets expressed relative
|
| 852 |
+
to the original string, or the normalized one. choices: "original", "normalized".
|
| 853 |
+
|
| 854 |
+
offset_type: :obj:`str`
|
| 855 |
+
Whether the returned splits should have offsets expressed in bytes or chars.
|
| 856 |
+
When slicing an str, we usually want to use chars, which is the default value.
|
| 857 |
+
Now in some cases it might be interesting to get these offsets expressed in bytes,
|
| 858 |
+
so it is possible to change this here.
|
| 859 |
+
choices: "char", "bytes"
|
| 860 |
+
|
| 861 |
+
Returns
|
| 862 |
+
A list of splits
|
| 863 |
+
"""
|
| 864 |
+
pass
|
| 865 |
+
|
| 866 |
+
def normalize(self, func):
|
| 867 |
+
"""
|
| 868 |
+
Normalize each split of the `PreTokenizedString` using the given `func`
|
| 869 |
+
|
| 870 |
+
Args:
|
| 871 |
+
func: Callable[[NormalizedString], None]:
|
| 872 |
+
The function used to normalize each underlying split. This function
|
| 873 |
+
does not need to return anything, just calling the methods on the provided
|
| 874 |
+
NormalizedString allows its modification.
|
| 875 |
+
"""
|
| 876 |
+
pass
|
| 877 |
+
|
| 878 |
+
def split(self, func):
|
| 879 |
+
"""
|
| 880 |
+
Split the PreTokenizedString using the given `func`
|
| 881 |
+
|
| 882 |
+
Args:
|
| 883 |
+
func: Callable[[index, NormalizedString], List[NormalizedString]]:
|
| 884 |
+
The function used to split each underlying split.
|
| 885 |
+
It is expected to return a list of `NormalizedString`, that represent the new
|
| 886 |
+
splits. If the given `NormalizedString` does not need any splitting, we can
|
| 887 |
+
just return it directly.
|
| 888 |
+
In order for the offsets to be tracked accurately, any returned `NormalizedString`
|
| 889 |
+
should come from calling either `.split` or `.slice` on the received one.
|
| 890 |
+
"""
|
| 891 |
+
pass
|
| 892 |
+
|
| 893 |
+
def to_encoding(self, type_id=0, word_idx=None):
|
| 894 |
+
"""
|
| 895 |
+
Return an Encoding generated from this PreTokenizedString
|
| 896 |
+
|
| 897 |
+
Args:
|
| 898 |
+
type_id: int = 0:
|
| 899 |
+
The type_id to be used on the generated Encoding.
|
| 900 |
+
|
| 901 |
+
word_idx: Optional[int] = None:
|
| 902 |
+
An optional word index to be used for each token of this Encoding. If provided,
|
| 903 |
+
all the word indices in the generated Encoding will use this value, instead
|
| 904 |
+
of the one automatically tracked during pre-tokenization.
|
| 905 |
+
|
| 906 |
+
Returns:
|
| 907 |
+
An Encoding
|
| 908 |
+
"""
|
| 909 |
+
pass
|
| 910 |
+
|
| 911 |
+
def tokenize(self, func):
|
| 912 |
+
"""
|
| 913 |
+
Tokenize each split of the `PreTokenizedString` using the given `func`
|
| 914 |
+
|
| 915 |
+
Args:
|
| 916 |
+
func: Callable[[str], List[Token]]:
|
| 917 |
+
The function used to tokenize each underlying split. This function must return
|
| 918 |
+
a list of Token generated from the input str.
|
| 919 |
+
"""
|
| 920 |
+
pass
|
| 921 |
+
|
| 922 |
+
class Regex:
|
| 923 |
+
"""
|
| 924 |
+
Instantiate a new Regex with the given pattern
|
| 925 |
+
"""
|
| 926 |
+
def __init__(self, pattern):
|
| 927 |
+
pass
|
| 928 |
+
|
| 929 |
+
def __getstate__(self, /):
|
| 930 |
+
"""
|
| 931 |
+
Helper for pickle.
|
| 932 |
+
"""
|
| 933 |
+
pass
|
| 934 |
+
|
| 935 |
+
class Token:
|
| 936 |
+
def __init__(self, id, value, offsets):
|
| 937 |
+
pass
|
| 938 |
+
|
| 939 |
+
def __getstate__(self, /):
|
| 940 |
+
"""
|
| 941 |
+
Helper for pickle.
|
| 942 |
+
"""
|
| 943 |
+
pass
|
| 944 |
+
|
| 945 |
+
def as_tuple(self):
|
| 946 |
+
""" """
|
| 947 |
+
pass
|
| 948 |
+
|
| 949 |
+
@property
|
| 950 |
+
def id(self):
|
| 951 |
+
""" """
|
| 952 |
+
pass
|
| 953 |
+
|
| 954 |
+
@id.setter
|
| 955 |
+
def id(self, value):
|
| 956 |
+
""" """
|
| 957 |
+
pass
|
| 958 |
+
|
| 959 |
+
@property
|
| 960 |
+
def offsets(self):
|
| 961 |
+
""" """
|
| 962 |
+
pass
|
| 963 |
+
|
| 964 |
+
@offsets.setter
|
| 965 |
+
def offsets(self, value):
|
| 966 |
+
""" """
|
| 967 |
+
pass
|
| 968 |
+
|
| 969 |
+
@property
|
| 970 |
+
def value(self):
|
| 971 |
+
""" """
|
| 972 |
+
pass
|
| 973 |
+
|
| 974 |
+
@value.setter
|
| 975 |
+
def value(self, value):
|
| 976 |
+
""" """
|
| 977 |
+
pass
|
| 978 |
+
|
| 979 |
+
class Tokenizer:
|
| 980 |
+
"""
|
| 981 |
+
A :obj:`Tokenizer` works as a pipeline. It processes some raw text as input
|
| 982 |
+
and outputs an :class:`~tokenizers.Encoding`.
|
| 983 |
+
|
| 984 |
+
Args:
|
| 985 |
+
model (:class:`~tokenizers.models.Model`):
|
| 986 |
+
The core algorithm that this :obj:`Tokenizer` should be using.
|
| 987 |
+
|
| 988 |
+
"""
|
| 989 |
+
def __init__(self, model):
|
| 990 |
+
pass
|
| 991 |
+
|
| 992 |
+
def __getnewargs__(self):
|
| 993 |
+
""" """
|
| 994 |
+
pass
|
| 995 |
+
|
| 996 |
+
def __getstate__(self):
|
| 997 |
+
""" """
|
| 998 |
+
pass
|
| 999 |
+
|
| 1000 |
+
def __setstate__(self, state):
|
| 1001 |
+
""" """
|
| 1002 |
+
pass
|
| 1003 |
+
|
| 1004 |
+
def add_special_tokens(self, tokens):
|
| 1005 |
+
"""
|
| 1006 |
+
Add the given special tokens to the Tokenizer.
|
| 1007 |
+
|
| 1008 |
+
If these tokens are already part of the vocabulary, it just lets the Tokenizer know about
|
| 1009 |
+
them. If they don't exist, the Tokenizer creates them, giving them a new id.
|
| 1010 |
+
|
| 1011 |
+
These special tokens will never be processed by the model (ie won't be split into
|
| 1012 |
+
multiple tokens), and they can be removed from the output when decoding.
|
| 1013 |
+
|
| 1014 |
+
Args:
|
| 1015 |
+
tokens (A :obj:`List` of :class:`~tokenizers.AddedToken` or :obj:`str`):
|
| 1016 |
+
The list of special tokens we want to add to the vocabulary. Each token can either
|
| 1017 |
+
be a string or an instance of :class:`~tokenizers.AddedToken` for more
|
| 1018 |
+
customization.
|
| 1019 |
+
|
| 1020 |
+
Returns:
|
| 1021 |
+
:obj:`int`: The number of tokens that were created in the vocabulary
|
| 1022 |
+
"""
|
| 1023 |
+
pass
|
| 1024 |
+
|
| 1025 |
+
def add_tokens(self, tokens):
|
| 1026 |
+
"""
|
| 1027 |
+
Add the given tokens to the vocabulary
|
| 1028 |
+
|
| 1029 |
+
The given tokens are added only if they don't already exist in the vocabulary.
|
| 1030 |
+
Each token then gets a newly attributed id.
|
| 1031 |
+
|
| 1032 |
+
Args:
|
| 1033 |
+
tokens (A :obj:`List` of :class:`~tokenizers.AddedToken` or :obj:`str`):
|
| 1034 |
+
The list of tokens we want to add to the vocabulary. Each token can be either a
|
| 1035 |
+
string or an instance of :class:`~tokenizers.AddedToken` for more customization.
|
| 1036 |
+
|
| 1037 |
+
Returns:
|
| 1038 |
+
:obj:`int`: The number of tokens that were created in the vocabulary
|
| 1039 |
+
"""
|
| 1040 |
+
pass
|
| 1041 |
+
|
| 1042 |
+
def async_decode_batch(self, sequences, skip_special_tokens=True):
|
| 1043 |
+
"""
|
| 1044 |
+
Decode a batch of ids back to their corresponding string
|
| 1045 |
+
|
| 1046 |
+
Args:
|
| 1047 |
+
sequences (:obj:`List` of :obj:`List[int]`):
|
| 1048 |
+
The batch of sequences we want to decode
|
| 1049 |
+
|
| 1050 |
+
skip_special_tokens (:obj:`bool`, defaults to :obj:`True`):
|
| 1051 |
+
Whether the special tokens should be removed from the decoded strings
|
| 1052 |
+
|
| 1053 |
+
Returns:
|
| 1054 |
+
:obj:`List[str]`: A list of decoded strings
|
| 1055 |
+
"""
|
| 1056 |
+
pass
|
| 1057 |
+
|
| 1058 |
+
def async_encode(self, sequence, pair=None, is_pretokenized=False, add_special_tokens=True):
|
| 1059 |
+
"""
|
| 1060 |
+
Asynchronously encode the given input with character offsets.
|
| 1061 |
+
|
| 1062 |
+
This is an async version of encode that can be awaited in async Python code.
|
| 1063 |
+
|
| 1064 |
+
Example:
|
| 1065 |
+
Here are some examples of the inputs that are accepted::
|
| 1066 |
+
|
| 1067 |
+
await async_encode("A single sequence")
|
| 1068 |
+
|
| 1069 |
+
Args:
|
| 1070 |
+
sequence (:obj:`~tokenizers.InputSequence`):
|
| 1071 |
+
The main input sequence we want to encode. This sequence can be either raw
|
| 1072 |
+
text or pre-tokenized, according to the ``is_pretokenized`` argument:
|
| 1073 |
+
|
| 1074 |
+
- If ``is_pretokenized=False``: :class:`~tokenizers.TextInputSequence`
|
| 1075 |
+
- If ``is_pretokenized=True``: :class:`~tokenizers.PreTokenizedInputSequence`
|
| 1076 |
+
|
| 1077 |
+
pair (:obj:`~tokenizers.InputSequence`, `optional`):
|
| 1078 |
+
An optional input sequence. The expected format is the same as for ``sequence``.
|
| 1079 |
+
|
| 1080 |
+
is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
|
| 1081 |
+
Whether the input is already pre-tokenized
|
| 1082 |
+
|
| 1083 |
+
add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
|
| 1084 |
+
Whether to add the special tokens
|
| 1085 |
+
|
| 1086 |
+
Returns:
|
| 1087 |
+
:class:`~tokenizers.Encoding`: The encoded result
|
| 1088 |
+
|
| 1089 |
+
"""
|
| 1090 |
+
pass
|
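A sketch of calling the async variants documented here from an ``asyncio`` program; the toy WordLevel vocabulary and example strings are made up for illustration::

    import asyncio
    from tokenizers import Tokenizer
    from tokenizers.models import WordLevel
    from tokenizers.pre_tokenizers import Whitespace

    tok = Tokenizer(WordLevel({"hello": 0, "world": 1, "[UNK]": 2}, unk_token="[UNK]"))
    tok.pre_tokenizer = Whitespace()

    async def main():
        # The async variants are awaited, so the event loop is not blocked
        enc = await tok.async_encode("hello world")
        batch = await tok.async_encode_batch(["hello world", "world hello"])
        print(enc.ids, [e.ids for e in batch])

    asyncio.run(main())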
| 1091 |
+
|
| 1092 |
+
def async_encode_batch(self, input, is_pretokenized=False, add_special_tokens=True):
|
| 1093 |
+
"""
|
| 1094 |
+
Asynchronously encode the given batch of inputs with character offsets.
|
| 1095 |
+
|
| 1096 |
+
This is an async version of encode_batch that can be awaited in async Python code.
|
| 1097 |
+
|
| 1098 |
+
Example:
|
| 1099 |
+
Here are some examples of the inputs that are accepted::
|
| 1100 |
+
|
| 1101 |
+
await async_encode_batch([
|
| 1102 |
+
"A single sequence",
|
| 1103 |
+
("A tuple with a sequence", "And its pair"),
|
| 1104 |
+
[ "A", "pre", "tokenized", "sequence" ],
|
| 1105 |
+
([ "A", "pre", "tokenized", "sequence" ], "And its pair")
|
| 1106 |
+
])
|
| 1107 |
+
|
| 1108 |
+
Args:
|
| 1109 |
+
input (A :obj:`List`/:obj:`Tuple` of :obj:`~tokenizers.EncodeInput`):
|
| 1110 |
+
A list of single sequences or pair sequences to encode. Each sequence
|
| 1111 |
+
can be either raw text or pre-tokenized, according to the ``is_pretokenized``
|
| 1112 |
+
argument:
|
| 1113 |
+
|
| 1114 |
+
- If ``is_pretokenized=False``: :class:`~tokenizers.TextEncodeInput`
|
| 1115 |
+
- If ``is_pretokenized=True``: :class:`~tokenizers.PreTokenizedEncodeInput`
|
| 1116 |
+
|
| 1117 |
+
is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
|
| 1118 |
+
Whether the input is already pre-tokenized
|
| 1119 |
+
|
| 1120 |
+
add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
|
| 1121 |
+
Whether to add the special tokens
|
| 1122 |
+
|
| 1123 |
+
Returns:
|
| 1124 |
+
A :obj:`List` of :class:`~tokenizers.Encoding`: The encoded batch
|
| 1125 |
+
|
| 1126 |
+
"""
|
| 1127 |
+
pass
|
| 1128 |
+
|
| 1129 |
+
def async_encode_batch_fast(self, input, is_pretokenized=False, add_special_tokens=True):
|
| 1130 |
+
"""
|
| 1131 |
+
Asynchronously encode the given batch of inputs without tracking character offsets.
|
| 1132 |
+
|
| 1133 |
+
This is an async version of encode_batch_fast that can be awaited in async Python code.
|
| 1134 |
+
|
| 1135 |
+
Example:
|
| 1136 |
+
Here are some examples of the inputs that are accepted::
|
| 1137 |
+
|
| 1138 |
+
await async_encode_batch_fast([
|
| 1139 |
+
"A single sequence",
|
| 1140 |
+
("A tuple with a sequence", "And its pair"),
|
| 1141 |
+
[ "A", "pre", "tokenized", "sequence" ],
|
| 1142 |
+
([ "A", "pre", "tokenized", "sequence" ], "And its pair")
|
| 1143 |
+
])
|
| 1144 |
+
|
| 1145 |
+
Args:
|
| 1146 |
+
input (A :obj:`List`/:obj:`Tuple` of :obj:`~tokenizers.EncodeInput`):
|
| 1147 |
+
A list of single sequences or pair sequences to encode. Each sequence
|
| 1148 |
+
can be either raw text or pre-tokenized, according to the ``is_pretokenized``
|
| 1149 |
+
argument:
|
| 1150 |
+
|
| 1151 |
+
- If ``is_pretokenized=False``: :class:`~tokenizers.TextEncodeInput`
|
| 1152 |
+
- If ``is_pretokenized=True``: :class:`~tokenizers.PreTokenizedEncodeInput`
|
| 1153 |
+
|
| 1154 |
+
is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
|
| 1155 |
+
Whether the input is already pre-tokenized
|
| 1156 |
+
|
| 1157 |
+
add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
|
| 1158 |
+
Whether to add the special tokens
|
| 1159 |
+
|
| 1160 |
+
Returns:
|
| 1161 |
+
A :obj:`List` of :class:`~tokenizers.Encoding`: The encoded batch
|
| 1162 |
+
|
| 1163 |
+
"""
|
| 1164 |
+
pass
|
| 1165 |
+
|
| 1166 |
+
def decode(self, ids, skip_special_tokens=True):
|
| 1167 |
+
"""
|
| 1168 |
+
Decode the given list of ids back to a string
|
| 1169 |
+
|
| 1170 |
+
This is used to decode anything coming back from a Language Model
|
| 1171 |
+
|
| 1172 |
+
Args:
|
| 1173 |
+
ids (A :obj:`List/Tuple` of :obj:`int`):
|
| 1174 |
+
The list of ids that we want to decode
|
| 1175 |
+
|
| 1176 |
+
skip_special_tokens (:obj:`bool`, defaults to :obj:`True`):
|
| 1177 |
+
Whether the special tokens should be removed from the decoded string
|
| 1178 |
+
|
| 1179 |
+
Returns:
|
| 1180 |
+
:obj:`str`: The decoded string
|
| 1181 |
+
"""
|
| 1182 |
+
pass
|
| 1183 |
+
|
| 1184 |
+
def decode_batch(self, sequences, skip_special_tokens=True):
|
| 1185 |
+
"""
|
| 1186 |
+
Decode a batch of ids back to their corresponding string
|
| 1187 |
+
|
| 1188 |
+
Args:
|
| 1189 |
+
sequences (:obj:`List` of :obj:`List[int]`):
|
| 1190 |
+
The batch of sequences we want to decode
|
| 1191 |
+
|
| 1192 |
+
skip_special_tokens (:obj:`bool`, defaults to :obj:`True`):
|
| 1193 |
+
Whether the special tokens should be removed from the decoded strings
|
| 1194 |
+
|
| 1195 |
+
Returns:
|
| 1196 |
+
:obj:`List[str]`: A list of decoded strings
|
| 1197 |
+
"""
|
| 1198 |
+
pass
|
| 1199 |
+
|
| 1200 |
+
@property
|
| 1201 |
+
def decoder(self):
|
| 1202 |
+
"""
|
| 1203 |
+
The `optional` :class:`~tokenizers.decoders.Decoder` in use by the Tokenizer
|
| 1204 |
+
"""
|
| 1205 |
+
pass
|
| 1206 |
+
|
| 1207 |
+
@decoder.setter
|
| 1208 |
+
def decoder(self, value):
|
| 1209 |
+
"""
|
| 1210 |
+
The `optional` :class:`~tokenizers.decoders.Decoder` in use by the Tokenizer
|
| 1211 |
+
"""
|
| 1212 |
+
pass
|
| 1213 |
+
|
| 1214 |
+
def enable_padding(
|
| 1215 |
+
self, direction="right", pad_id=0, pad_type_id=0, pad_token="[PAD]", length=None, pad_to_multiple_of=None
|
| 1216 |
+
):
|
| 1217 |
+
"""
|
| 1218 |
+
Enable the padding
|
| 1219 |
+
|
| 1220 |
+
Args:
|
| 1221 |
+
direction (:obj:`str`, `optional`, defaults to :obj:`right`):
|
| 1222 |
+
The direction in which to pad. Can be either ``right`` or ``left``
|
| 1223 |
+
|
| 1224 |
+
pad_to_multiple_of (:obj:`int`, `optional`):
|
| 1225 |
+
If specified, the padding length should always snap to the next multiple of the
|
| 1226 |
+
given value. For example, if we were going to pad with a length of 250 but
|
| 1227 |
+
``pad_to_multiple_of=8`` then we will pad to 256.
|
| 1228 |
+
|
| 1229 |
+
pad_id (:obj:`int`, defaults to 0):
|
| 1230 |
+
The id to be used when padding
|
| 1231 |
+
|
| 1232 |
+
pad_type_id (:obj:`int`, defaults to 0):
|
| 1233 |
+
The type id to be used when padding
|
| 1234 |
+
|
| 1235 |
+
pad_token (:obj:`str`, defaults to :obj:`[PAD]`):
|
| 1236 |
+
The pad token to be used when padding
|
| 1237 |
+
|
| 1238 |
+
length (:obj:`int`, `optional`):
|
| 1239 |
+
If specified, the length at which to pad. If not specified we pad using the size of
|
| 1240 |
+
the longest sequence in a batch.
|
| 1241 |
+
"""
|
| 1242 |
+
pass
|
| 1243 |
+
|
| 1244 |
+
def enable_truncation(self, max_length, stride=0, strategy="longest_first", direction="right"):
|
| 1245 |
+
"""
|
| 1246 |
+
Enable truncation
|
| 1247 |
+
|
| 1248 |
+
Args:
|
| 1249 |
+
max_length (:obj:`int`):
|
| 1250 |
+
The max length at which to truncate
|
| 1251 |
+
|
| 1252 |
+
stride (:obj:`int`, `optional`):
|
| 1253 |
+
The length of the previous first sequence to be included in the overflowing
|
| 1254 |
+
sequence
|
| 1255 |
+
|
| 1256 |
+
strategy (:obj:`str`, `optional`, defaults to :obj:`longest_first`):
|
| 1257 |
+
The strategy to use for truncation. Can be one of ``longest_first``, ``only_first`` or
|
| 1258 |
+
``only_second``.
|
| 1259 |
+
|
| 1260 |
+
direction (:obj:`str`, defaults to :obj:`right`):
|
| 1261 |
+
Truncate direction
|
| 1262 |
+
"""
|
| 1263 |
+
pass
|
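A sketch combining ``enable_truncation`` and ``enable_padding`` on a toy WordLevel tokenizer so a batch comes out rectangular; the ids and the vocabulary are illustrative only::

    from tokenizers import Tokenizer
    from tokenizers.models import WordLevel
    from tokenizers.pre_tokenizers import Whitespace

    vocab = {"[PAD]": 0, "hello": 1, "big": 2, "world": 3, "[UNK]": 4}
    tok = Tokenizer(WordLevel(vocab, unk_token="[UNK]"))
    tok.pre_tokenizer = Whitespace()

    tok.enable_truncation(max_length=4)                         # cut every sequence at 4 tokens
    tok.enable_padding(pad_id=0, pad_token="[PAD]", length=4)   # and pad the short ones up to 4

    for enc in tok.encode_batch(["hello", "hello big world hello world"]):
        print(enc.ids, enc.attention_mask)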
| 1264 |
+
|
| 1265 |
+
def encode(self, sequence, pair=None, is_pretokenized=False, add_special_tokens=True):
|
| 1266 |
+
"""
|
| 1267 |
+
Encode the given sequence and pair. This method can process raw text sequences
|
| 1268 |
+
as well as already pre-tokenized sequences.
|
| 1269 |
+
|
| 1270 |
+
Example:
|
| 1271 |
+
Here are some examples of the inputs that are accepted::
|
| 1272 |
+
|
| 1273 |
+
encode("A single sequence")`
|
| 1274 |
+
encode("A sequence", "And its pair")`
|
| 1275 |
+
encode([ "A", "pre", "tokenized", "sequence" ], is_pretokenized=True)`
|
| 1276 |
+
encode(
|
| 1277 |
+
[ "A", "pre", "tokenized", "sequence" ], [ "And", "its", "pair" ],
|
| 1278 |
+
is_pretokenized=True
|
| 1279 |
+
)
|
| 1280 |
+
|
| 1281 |
+
Args:
|
| 1282 |
+
sequence (:obj:`~tokenizers.InputSequence`):
|
| 1283 |
+
The main input sequence we want to encode. This sequence can be either raw
|
| 1284 |
+
text or pre-tokenized, according to the ``is_pretokenized`` argument:
|
| 1285 |
+
|
| 1286 |
+
- If ``is_pretokenized=False``: :class:`~tokenizers.TextInputSequence`
|
| 1287 |
+
- If ``is_pretokenized=True``: :class:`~tokenizers.PreTokenizedInputSequence`
|
| 1288 |
+
|
| 1289 |
+
pair (:obj:`~tokenizers.InputSequence`, `optional`):
|
| 1290 |
+
An optional input sequence. The expected format is the same as for ``sequence``.
|
| 1291 |
+
|
| 1292 |
+
is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
|
| 1293 |
+
Whether the input is already pre-tokenized
|
| 1294 |
+
|
| 1295 |
+
add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
|
| 1296 |
+
Whether to add the special tokens
|
| 1297 |
+
|
| 1298 |
+
Returns:
|
| 1299 |
+
:class:`~tokenizers.Encoding`: The encoded result
|
| 1300 |
+
|
| 1301 |
+
"""
|
| 1302 |
+
pass
|
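A sketch of the two input shapes described above, a raw pair and a pre-tokenized list, with a throwaway WordLevel vocabulary::

    from tokenizers import Tokenizer
    from tokenizers.models import WordLevel
    from tokenizers.pre_tokenizers import Whitespace

    tok = Tokenizer(WordLevel({"hello": 0, "world": 1, "[UNK]": 2}, unk_token="[UNK]"))
    tok.pre_tokenizer = Whitespace()

    # A raw pair of sequences ...
    pair = tok.encode("hello world", "world hello")
    print(pair.sequence_ids)     # 0 for tokens of the first sequence, 1 for the pair

    # ... and a pre-tokenized input, where word_ids follow the given words
    pre = tok.encode(["hello", "world"], is_pretokenized=True)
    print(pre.word_ids)          # [0, 1]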
| 1303 |
+
|
| 1304 |
+
def encode_batch(self, input, is_pretokenized=False, add_special_tokens=True):
|
| 1305 |
+
"""
|
| 1306 |
+
Encode the given batch of inputs. This method accepts both raw text sequences
|
| 1307 |
+
as well as already pre-tokenized sequences. The reason we use `PySequence` is
|
| 1308 |
+
because it allows type checking with zero-cost (according to PyO3) as we don't
|
| 1309 |
+
have to convert to check.
|
| 1310 |
+
|
| 1311 |
+
Example:
|
| 1312 |
+
Here are some examples of the inputs that are accepted::
|
| 1313 |
+
|
| 1314 |
+
encode_batch([
|
| 1315 |
+
"A single sequence",
|
| 1316 |
+
("A tuple with a sequence", "And its pair"),
|
| 1317 |
+
[ "A", "pre", "tokenized", "sequence" ],
|
| 1318 |
+
([ "A", "pre", "tokenized", "sequence" ], "And its pair")
|
| 1319 |
+
])
|
| 1320 |
+
|
| 1321 |
+
Args:
|
| 1322 |
+
input (A :obj:`List`/:obj:`Tuple` of :obj:`~tokenizers.EncodeInput`):
|
| 1323 |
+
A list of single sequences or pair sequences to encode. Each sequence
|
| 1324 |
+
can be either raw text or pre-tokenized, according to the ``is_pretokenized``
|
| 1325 |
+
argument:
|
| 1326 |
+
|
| 1327 |
+
- If ``is_pretokenized=False``: :class:`~tokenizers.TextEncodeInput`
|
| 1328 |
+
- If ``is_pretokenized=True``: :class:`~tokenizers.PreTokenizedEncodeInput`
|
| 1329 |
+
|
| 1330 |
+
is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
|
| 1331 |
+
Whether the input is already pre-tokenized
|
| 1332 |
+
|
| 1333 |
+
add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
|
| 1334 |
+
Whether to add the special tokens
|
| 1335 |
+
|
| 1336 |
+
Returns:
|
| 1337 |
+
A :obj:`List` of :class:`~tokenizers.Encoding`: The encoded batch
|
| 1338 |
+
|
| 1339 |
+
"""
|
| 1340 |
+
pass
|
| 1341 |
+
|
| 1342 |
+
def encode_batch_fast(self, input, is_pretokenized=False, add_special_tokens=True):
|
| 1343 |
+
"""
|
| 1344 |
+
Encode the given batch of inputs. This method is faster than `encode_batch`
|
| 1345 |
+
because it doesn't keep track of offsets, they will be all zeros.
|
| 1346 |
+
|
| 1347 |
+
Example:
|
| 1348 |
+
Here are some examples of the inputs that are accepted::
|
| 1349 |
+
|
| 1350 |
+
encode_batch_fast([
|
| 1351 |
+
"A single sequence",
|
| 1352 |
+
("A tuple with a sequence", "And its pair"),
|
| 1353 |
+
[ "A", "pre", "tokenized", "sequence" ],
|
| 1354 |
+
([ "A", "pre", "tokenized", "sequence" ], "And its pair")
|
| 1355 |
+
])
|
| 1356 |
+
|
| 1357 |
+
Args:
|
| 1358 |
+
input (A :obj:`List`/:obj:`Tuple` of :obj:`~tokenizers.EncodeInput`):
|
| 1359 |
+
A list of single sequences or pair sequences to encode. Each sequence
|
| 1360 |
+
can be either raw text or pre-tokenized, according to the ``is_pretokenized``
|
| 1361 |
+
argument:
|
| 1362 |
+
|
| 1363 |
+
- If ``is_pretokenized=False``: :class:`~tokenizers.TextEncodeInput`
|
| 1364 |
+
- If ``is_pretokenized=True``: :class:`~tokenizers.PreTokenizedEncodeInput`
|
| 1365 |
+
|
| 1366 |
+
is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
|
| 1367 |
+
Whether the input is already pre-tokenized
|
| 1368 |
+
|
| 1369 |
+
add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
|
| 1370 |
+
Whether to add the special tokens
|
| 1371 |
+
|
| 1372 |
+
Returns:
|
| 1373 |
+
A :obj:`List` of :class:`~tokenizers.Encoding`: The encoded batch
|
| 1374 |
+
|
| 1375 |
+
"""
|
| 1376 |
+
pass
|
| 1377 |
+
|
| 1378 |
+
@property
|
| 1379 |
+
def encode_special_tokens(self):
|
| 1380 |
+
"""
|
| 1381 |
+
Modifies whether the tokenizer should use the special tokens
|
| 1382 |
+
during encoding.
|
| 1383 |
+
|
| 1384 |
+
Args:
|
| 1385 |
+
value (:obj:`bool`):
|
| 1386 |
+
Whether to use the special tokens or not
|
| 1387 |
+
|
| 1388 |
+
"""
|
| 1389 |
+
pass
|
| 1390 |
+
|
| 1391 |
+
@encode_special_tokens.setter
|
| 1392 |
+
def encode_special_tokens(self, value):
|
| 1393 |
+
"""
|
| 1394 |
+
Modifies whether the tokenizer should use the special tokens
|
| 1395 |
+
during encoding.
|
| 1396 |
+
|
| 1397 |
+
Args:
|
| 1398 |
+
value (:obj:`bool`):
|
| 1399 |
+
Whether to use the special tokens or not
|
| 1400 |
+
|
| 1401 |
+
"""
|
| 1402 |
+
pass
|
| 1403 |
+
|
| 1404 |
+
@staticmethod
|
| 1405 |
+
def from_buffer(buffer):
|
| 1406 |
+
"""
|
| 1407 |
+
Instantiate a new :class:`~tokenizers.Tokenizer` from the given buffer.
|
| 1408 |
+
|
| 1409 |
+
Args:
|
| 1410 |
+
buffer (:obj:`bytes`):
|
| 1411 |
+
A buffer containing a previously serialized :class:`~tokenizers.Tokenizer`
|
| 1412 |
+
|
| 1413 |
+
Returns:
|
| 1414 |
+
:class:`~tokenizers.Tokenizer`: The new tokenizer
|
| 1415 |
+
"""
|
| 1416 |
+
pass
|
| 1417 |
+
|
| 1418 |
+
@staticmethod
|
| 1419 |
+
def from_file(path):
|
| 1420 |
+
"""
|
| 1421 |
+
Instantiate a new :class:`~tokenizers.Tokenizer` from the file at the given path.
|
| 1422 |
+
|
| 1423 |
+
Args:
|
| 1424 |
+
path (:obj:`str`):
|
| 1425 |
+
A path to a local JSON file representing a previously serialized
|
| 1426 |
+
:class:`~tokenizers.Tokenizer`
|
| 1427 |
+
|
| 1428 |
+
Returns:
|
| 1429 |
+
:class:`~tokenizers.Tokenizer`: The new tokenizer
|
| 1430 |
+
"""
|
| 1431 |
+
pass
|
| 1432 |
+
|
| 1433 |
+
@staticmethod
|
| 1434 |
+
def from_pretrained(identifier, revision="main", token=None):
|
| 1435 |
+
"""
|
| 1436 |
+
Instantiate a new :class:`~tokenizers.Tokenizer` from an existing file on the
|
| 1437 |
+
Hugging Face Hub.
|
| 1438 |
+
|
| 1439 |
+
Args:
|
| 1440 |
+
identifier (:obj:`str`):
|
| 1441 |
+
The identifier of a Model on the Hugging Face Hub, that contains
|
| 1442 |
+
a tokenizer.json file
|
| 1443 |
+
revision (:obj:`str`, defaults to `main`):
|
| 1444 |
+
A branch or commit id
|
| 1445 |
+
token (:obj:`str`, `optional`, defaults to `None`):
|
| 1446 |
+
An optional auth token used to access private repositories on the
|
| 1447 |
+
Hugging Face Hub
|
| 1448 |
+
|
| 1449 |
+
Returns:
|
| 1450 |
+
:class:`~tokenizers.Tokenizer`: The new tokenizer
|
| 1451 |
+
"""
|
| 1452 |
+
pass
|
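A sketch of loading a tokenizer from the Hub as documented above; it needs network access, and ``bert-base-uncased`` is simply a well-known repository that ships a ``tokenizer.json``::

    from tokenizers import Tokenizer

    # Downloads tokenizer.json from the Hugging Face Hub
    tok = Tokenizer.from_pretrained("bert-base-uncased", revision="main")
    print(tok.encode("Hello world").tokens)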
| 1453 |
+
|
| 1454 |
+
@staticmethod
|
| 1455 |
+
def from_str(json):
|
| 1456 |
+
"""
|
| 1457 |
+
Instantiate a new :class:`~tokenizers.Tokenizer` from the given JSON string.
|
| 1458 |
+
|
| 1459 |
+
Args:
|
| 1460 |
+
json (:obj:`str`):
|
| 1461 |
+
A valid JSON string representing a previously serialized
|
| 1462 |
+
:class:`~tokenizers.Tokenizer`
|
| 1463 |
+
|
| 1464 |
+
Returns:
|
| 1465 |
+
:class:`~tokenizers.Tokenizer`: The new tokenizer
|
| 1466 |
+
"""
|
| 1467 |
+
pass
|
| 1468 |
+
|
| 1469 |
+
def get_added_tokens_decoder(self):
|
| 1470 |
+
"""
|
| 1471 |
+
Get the underlying vocabulary
|
| 1472 |
+
|
| 1473 |
+
Returns:
|
| 1474 |
+
:obj:`Dict[int, AddedToken]`: The vocabulary
|
| 1475 |
+
"""
|
| 1476 |
+
pass
|
| 1477 |
+
|
| 1478 |
+
def get_vocab(self, with_added_tokens=True):
|
| 1479 |
+
"""
|
| 1480 |
+
Get the underlying vocabulary
|
| 1481 |
+
|
| 1482 |
+
Args:
|
| 1483 |
+
with_added_tokens (:obj:`bool`, defaults to :obj:`True`):
|
| 1484 |
+
Whether to include the added tokens
|
| 1485 |
+
|
| 1486 |
+
Returns:
|
| 1487 |
+
:obj:`Dict[str, int]`: The vocabulary
|
| 1488 |
+
"""
|
| 1489 |
+
pass
|
| 1490 |
+
|
| 1491 |
+
def get_vocab_size(self, with_added_tokens=True):
|
| 1492 |
+
"""
|
| 1493 |
+
Get the size of the underlying vocabulary
|
| 1494 |
+
|
| 1495 |
+
Args:
|
| 1496 |
+
with_added_tokens (:obj:`bool`, defaults to :obj:`True`):
|
| 1497 |
+
Whether to include the added tokens
|
| 1498 |
+
|
| 1499 |
+
Returns:
|
| 1500 |
+
:obj:`int`: The size of the vocabulary
|
| 1501 |
+
"""
|
| 1502 |
+
pass
|
| 1503 |
+
|
| 1504 |
+
def id_to_token(self, id):
|
| 1505 |
+
"""
|
| 1506 |
+
Convert the given id to its corresponding token if it exists
|
| 1507 |
+
|
| 1508 |
+
Args:
|
| 1509 |
+
id (:obj:`int`):
|
| 1510 |
+
The id to convert
|
| 1511 |
+
|
| 1512 |
+
Returns:
|
| 1513 |
+
:obj:`Optional[str]`: An optional token, :obj:`None` if out of vocabulary
|
| 1514 |
+
"""
|
| 1515 |
+
pass
|
| 1516 |
+
|
| 1517 |
+
@property
|
| 1518 |
+
def model(self):
|
| 1519 |
+
"""
|
| 1520 |
+
The :class:`~tokenizers.models.Model` in use by the Tokenizer
|
| 1521 |
+
"""
|
| 1522 |
+
pass
|
| 1523 |
+
|
| 1524 |
+
@model.setter
|
| 1525 |
+
def model(self, value):
|
| 1526 |
+
"""
|
| 1527 |
+
The :class:`~tokenizers.models.Model` in use by the Tokenizer
|
| 1528 |
+
"""
|
| 1529 |
+
pass
|
| 1530 |
+
|
| 1531 |
+
def no_padding(self):
|
| 1532 |
+
"""
|
| 1533 |
+
Disable padding
|
| 1534 |
+
"""
|
| 1535 |
+
pass
|
| 1536 |
+
|
| 1537 |
+
def no_truncation(self):
|
| 1538 |
+
"""
|
| 1539 |
+
Disable truncation
|
| 1540 |
+
"""
|
| 1541 |
+
pass
|
| 1542 |
+
|
| 1543 |
+
@property
|
| 1544 |
+
def normalizer(self):
|
| 1545 |
+
"""
|
| 1546 |
+
The `optional` :class:`~tokenizers.normalizers.Normalizer` in use by the Tokenizer
|
| 1547 |
+
"""
|
| 1548 |
+
pass
|
| 1549 |
+
|
| 1550 |
+
@normalizer.setter
|
| 1551 |
+
def normalizer(self, value):
|
| 1552 |
+
"""
|
| 1553 |
+
The `optional` :class:`~tokenizers.normalizers.Normalizer` in use by the Tokenizer
|
| 1554 |
+
"""
|
| 1555 |
+
pass
|
| 1556 |
+
|
| 1557 |
+
def num_special_tokens_to_add(self, is_pair):
|
| 1558 |
+
"""
|
| 1559 |
+
Return the number of special tokens that would be added for single/pair sentences.
|
| 1560 |
+
:param is_pair: Boolean indicating if the input would be a single sentence or a pair
|
| 1561 |
+
:return:
|
| 1562 |
+
"""
|
| 1563 |
+
pass
|
| 1564 |
+
|
| 1565 |
+
@property
|
| 1566 |
+
def padding(self):
|
| 1567 |
+
"""
|
| 1568 |
+
Get the current padding parameters
|
| 1569 |
+
|
| 1570 |
+
`Cannot be set, use` :meth:`~tokenizers.Tokenizer.enable_padding` `instead`
|
| 1571 |
+
|
| 1572 |
+
Returns:
|
| 1573 |
+
(:obj:`dict`, `optional`):
|
| 1574 |
+
A dict with the current padding parameters if padding is enabled
|
| 1575 |
+
"""
|
| 1576 |
+
pass
|
| 1577 |
+
|
| 1578 |
+
@padding.setter
|
| 1579 |
+
def padding(self, value):
|
| 1580 |
+
"""
|
| 1581 |
+
Get the current padding parameters
|
| 1582 |
+
|
| 1583 |
+
`Cannot be set, use` :meth:`~tokenizers.Tokenizer.enable_padding` `instead`
|
| 1584 |
+
|
| 1585 |
+
Returns:
|
| 1586 |
+
(:obj:`dict`, `optional`):
|
| 1587 |
+
A dict with the current padding parameters if padding is enabled
|
| 1588 |
+
"""
|
| 1589 |
+
pass
|
| 1590 |
+
|
| 1591 |
+
def post_process(self, encoding, pair=None, add_special_tokens=True):
|
| 1592 |
+
"""
|
| 1593 |
+
Apply all the post-processing steps to the given encodings.
|
| 1594 |
+
|
| 1595 |
+
The various steps are:
|
| 1596 |
+
|
| 1597 |
+
1. Truncate according to the set truncation params (provided with
|
| 1598 |
+
:meth:`~tokenizers.Tokenizer.enable_truncation`)
|
| 1599 |
+
2. Apply the :class:`~tokenizers.processors.PostProcessor`
|
| 1600 |
+
3. Pad according to the set padding params (provided with
|
| 1601 |
+
:meth:`~tokenizers.Tokenizer.enable_padding`)
|
| 1602 |
+
|
| 1603 |
+
Args:
|
| 1604 |
+
encoding (:class:`~tokenizers.Encoding`):
|
| 1605 |
+
The :class:`~tokenizers.Encoding` corresponding to the main sequence.
|
| 1606 |
+
|
| 1607 |
+
pair (:class:`~tokenizers.Encoding`, `optional`):
|
| 1608 |
+
An optional :class:`~tokenizers.Encoding` corresponding to the pair sequence.
|
| 1609 |
+
|
| 1610 |
+
add_special_tokens (:obj:`bool`):
|
| 1611 |
+
Whether to add the special tokens
|
| 1612 |
+
|
| 1613 |
+
Returns:
|
| 1614 |
+
:class:`~tokenizers.Encoding`: The final post-processed encoding
|
| 1615 |
+
"""
|
| 1616 |
+
pass
|
| 1617 |
+
|
| 1618 |
+
@property
|
| 1619 |
+
def post_processor(self):
|
| 1620 |
+
"""
|
| 1621 |
+
The `optional` :class:`~tokenizers.processors.PostProcessor` in use by the Tokenizer
|
| 1622 |
+
"""
|
| 1623 |
+
pass
|
| 1624 |
+
|
| 1625 |
+
@post_processor.setter
|
| 1626 |
+
def post_processor(self, value):
|
| 1627 |
+
"""
|
| 1628 |
+
The `optional` :class:`~tokenizers.processors.PostProcessor` in use by the Tokenizer
|
| 1629 |
+
"""
|
| 1630 |
+
pass
|
| 1631 |
+
|
| 1632 |
+
@property
|
| 1633 |
+
def pre_tokenizer(self):
|
| 1634 |
+
"""
|
| 1635 |
+
The `optional` :class:`~tokenizers.pre_tokenizers.PreTokenizer` in use by the Tokenizer
|
| 1636 |
+
"""
|
| 1637 |
+
pass
|
| 1638 |
+
|
| 1639 |
+
@pre_tokenizer.setter
|
| 1640 |
+
def pre_tokenizer(self, value):
|
| 1641 |
+
"""
|
| 1642 |
+
The `optional` :class:`~tokenizers.pre_tokenizers.PreTokenizer` in use by the Tokenizer
|
| 1643 |
+
"""
|
| 1644 |
+
pass
|
| 1645 |
+
|
| 1646 |
+
def save(self, path, pretty=True):
|
| 1647 |
+
"""
|
| 1648 |
+
Save the :class:`~tokenizers.Tokenizer` to the file at the given path.
|
| 1649 |
+
|
| 1650 |
+
Args:
|
| 1651 |
+
path (:obj:`str`):
|
| 1652 |
+
A path to a file in which to save the serialized tokenizer.
|
| 1653 |
+
|
| 1654 |
+
pretty (:obj:`bool`, defaults to :obj:`True`):
|
| 1655 |
+
Whether the JSON file should be pretty formatted.
|
| 1656 |
+
"""
|
| 1657 |
+
pass
|
| 1658 |
+
|
| 1659 |
+
def to_str(self, pretty=False):
|
| 1660 |
+
"""
|
| 1661 |
+
Gets a serialized string representing this :class:`~tokenizers.Tokenizer`.
|
| 1662 |
+
|
| 1663 |
+
Args:
|
| 1664 |
+
pretty (:obj:`bool`, defaults to :obj:`False`):
|
| 1665 |
+
Whether the JSON string should be pretty formatted.
|
| 1666 |
+
|
| 1667 |
+
Returns:
|
| 1668 |
+
:obj:`str`: A string representing the serialized Tokenizer
|
| 1669 |
+
"""
|
| 1670 |
+
pass
|
| 1671 |
+
|
| 1672 |
+
def token_to_id(self, token):
|
| 1673 |
+
"""
|
| 1674 |
+
Convert the given token to its corresponding id if it exists
|
| 1675 |
+
|
| 1676 |
+
Args:
|
| 1677 |
+
token (:obj:`str`):
|
| 1678 |
+
The token to convert
|
| 1679 |
+
|
| 1680 |
+
Returns:
|
| 1681 |
+
:obj:`Optional[int]`: An optional id, :obj:`None` if out of vocabulary
|
| 1682 |
+
"""
|
| 1683 |
+
pass
|
| 1684 |
+
|
| 1685 |
+
def train(self, files, trainer=None):
|
| 1686 |
+
"""
|
| 1687 |
+
Train the Tokenizer using the given files.
|
| 1688 |
+
|
| 1689 |
+
Reads the files line by line, while keeping all the whitespace, even new lines.
|
| 1690 |
+
If you want to train from data store in-memory, you can check
|
| 1691 |
+
:meth:`~tokenizers.Tokenizer.train_from_iterator`
|
| 1692 |
+
|
| 1693 |
+
Args:
|
| 1694 |
+
files (:obj:`List[str]`):
|
| 1695 |
+
A list of path to the files that we should use for training
|
| 1696 |
+
|
| 1697 |
+
trainer (:obj:`~tokenizers.trainers.Trainer`, `optional`):
|
| 1698 |
+
An optional trainer that should be used to train our Model
|
| 1699 |
+
"""
|
| 1700 |
+
pass
|
| 1701 |
+
|
| 1702 |
+
def train_from_iterator(self, iterator, trainer=None, length=None):
|
| 1703 |
+
"""
|
| 1704 |
+
Train the Tokenizer using the provided iterator.
|
| 1705 |
+
|
| 1706 |
+
You can provide anything that is a Python Iterator
|
| 1707 |
+
|
| 1708 |
+
* A list of sequences :obj:`List[str]`
|
| 1709 |
+
* A generator that yields :obj:`str` or :obj:`List[str]`
|
| 1710 |
+
* A Numpy array of strings
|
| 1711 |
+
* ...
|
| 1712 |
+
|
| 1713 |
+
Args:
|
| 1714 |
+
iterator (:obj:`Iterator`):
|
| 1715 |
+
Any iterator over strings or list of strings
|
| 1716 |
+
|
| 1717 |
+
trainer (:obj:`~tokenizers.trainers.Trainer`, `optional`):
|
| 1718 |
+
An optional trainer that should be used to train our Model
|
| 1719 |
+
|
| 1720 |
+
length (:obj:`int`, `optional`):
|
| 1721 |
+
The total number of sequences in the iterator. This is used to
|
| 1722 |
+
provide meaningful progress tracking
|
| 1723 |
+
"""
|
| 1724 |
+
pass
|
| 1725 |
+
|
| 1726 |
+
@property
|
| 1727 |
+
def truncation(self):
|
| 1728 |
+
"""
|
| 1729 |
+
Get the currently set truncation parameters
|
| 1730 |
+
|
| 1731 |
+
`Cannot set, use` :meth:`~tokenizers.Tokenizer.enable_truncation` `instead`
|
| 1732 |
+
|
| 1733 |
+
Returns:
|
| 1734 |
+
(:obj:`dict`, `optional`):
|
| 1735 |
+
A dict with the current truncation parameters if truncation is enabled
|
| 1736 |
+
"""
|
| 1737 |
+
pass
|
| 1738 |
+
|
| 1739 |
+
@truncation.setter
|
| 1740 |
+
def truncation(self, value):
|
| 1741 |
+
"""
|
| 1742 |
+
Get the currently set truncation parameters
|
| 1743 |
+
|
| 1744 |
+
`Cannot set, use` :meth:`~tokenizers.Tokenizer.enable_truncation` `instead`
|
| 1745 |
+
|
| 1746 |
+
Returns:
|
| 1747 |
+
(:obj:`dict`, `optional`):
|
| 1748 |
+
A dict with the current truncation parameters if truncation is enabled
|
| 1749 |
+
"""
|
| 1750 |
+
pass
|
from enum import Enum
from typing import List, Tuple, Union, Any

Offsets = Tuple[int, int]
TextInputSequence = str
PreTokenizedInputSequence = Union[List[str], Tuple[str, ...]]
TextEncodeInput = Union[
    TextInputSequence,
    Tuple[TextInputSequence, TextInputSequence],
    List[TextInputSequence],
]
PreTokenizedEncodeInput = Union[
    PreTokenizedInputSequence,
    Tuple[PreTokenizedInputSequence, PreTokenizedInputSequence],
    List[PreTokenizedInputSequence],
]
InputSequence = Union[TextInputSequence, PreTokenizedInputSequence]
EncodeInput = Union[TextEncodeInput, PreTokenizedEncodeInput]

class OffsetReferential(Enum):
    ORIGINAL = "original"
    NORMALIZED = "normalized"

class OffsetType(Enum):
    BYTE = "byte"
    CHAR = "char"

class SplitDelimiterBehavior(Enum):
    REMOVED = "removed"
    ISOLATED = "isolated"
    MERGED_WITH_PREVIOUS = "merged_with_previous"
    MERGED_WITH_NEXT = "merged_with_next"
    CONTIGUOUS = "contiguous"

from .implementations import (
    BertWordPieceTokenizer,
    ByteLevelBPETokenizer,
    CharBPETokenizer,
    SentencePieceBPETokenizer,
    SentencePieceUnigramTokenizer,
)

def __getattr__(name: str) -> Any: ...

BertWordPieceTokenizer: Any
ByteLevelBPETokenizer: Any
CharBPETokenizer: Any
SentencePieceBPETokenizer: Any
SentencePieceUnigramTokenizer: Any
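The stubs above cover loading, serialization, vocabulary inspection and training of a Tokenizer. A minimal usage sketch tying those methods together (the tiny corpus and vocabulary size are illustrative, not part of the package source):

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer

# Train a small BPE tokenizer from an in-memory iterator (see train_from_iterator above).
tok = Tokenizer(BPE(unk_token="[UNK]"))
corpus = ["hello world", "hello tokenizers"]
trainer = BpeTrainer(special_tokens=["[UNK]"], vocab_size=200)
tok.train_from_iterator(corpus, trainer=trainer, length=len(corpus))

# Vocabulary lookups return None for out-of-vocabulary items.
print(tok.get_vocab_size(with_added_tokens=True))
print(tok.token_to_id("hello"), tok.id_to_token(0))

# Round-trip through the JSON serialization described by to_str / from_str.
clone = Tokenizer.from_str(tok.to_str(pretty=False))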
source/tokenizers/decoders/__init__.py
ADDED
|
@@ -0,0 +1,15 @@
from .. import decoders


Decoder = decoders.Decoder
ByteLevel = decoders.ByteLevel
Replace = decoders.Replace
WordPiece = decoders.WordPiece
ByteFallback = decoders.ByteFallback
Fuse = decoders.Fuse
Strip = decoders.Strip
Metaspace = decoders.Metaspace
BPEDecoder = decoders.BPEDecoder
CTC = decoders.CTC
Sequence = decoders.Sequence
DecodeStream = decoders.DecodeStream
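These aliases re-export the Rust-backed decoders. A small sketch of chaining them with Sequence (the token list is illustrative; the exact output depends on the tokens fed in):

from tokenizers import decoders

# ByteFallback turns <0xNN> tokens back into raw bytes, Fuse joins everything into one string.
dec = decoders.Sequence([decoders.ByteFallback(), decoders.Fuse()])
print(dec.decode(["<0x68>", "<0x65>", "y"]))  # expected to yield something like "hey"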
source/tokenizers/decoders/__init__.pyi
ADDED
|
@@ -0,0 +1,569 @@
| 1 |
+
# Generated content DO NOT EDIT
|
| 2 |
+
class DecodeStream:
|
| 3 |
+
"""
|
| 4 |
+
Class needed for streaming decode
|
| 5 |
+
|
| 6 |
+
"""
|
| 7 |
+
def __init__(self, ids=None, skip_special_tokens=False):
|
| 8 |
+
pass
|
| 9 |
+
|
| 10 |
+
def __getstate__(self, /):
|
| 11 |
+
"""
|
| 12 |
+
Helper for pickle.
|
| 13 |
+
"""
|
| 14 |
+
pass
|
| 15 |
+
|
| 16 |
+
def step(self, tokenizer, id):
|
| 17 |
+
"""
|
| 18 |
+
Streaming decode step
|
| 19 |
+
|
| 20 |
+
Args:
|
| 21 |
+
tokenizer (:class:`~tokenizers.Tokenizer`):
|
| 22 |
+
The tokenizer to use for decoding
|
| 23 |
+
id (:obj:`int` or `List[int]`):
|
| 24 |
+
The next token id or list of token ids to add to the stream
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
Returns:
|
| 28 |
+
:obj:`Optional[str]`: The next decoded string chunk, or None if not enough
|
| 29 |
+
tokens have been provided yet.
|
| 30 |
+
"""
|
| 31 |
+
pass
|
| 32 |
+
|
| 33 |
+
class Decoder:
|
| 34 |
+
"""
|
| 35 |
+
Base class for all decoders
|
| 36 |
+
|
| 37 |
+
This class is not supposed to be instantiated directly. Instead, any implementation of
|
| 38 |
+
a Decoder will return an instance of this class when instantiated.
|
| 39 |
+
"""
|
| 40 |
+
def __getstate__(self):
|
| 41 |
+
""" """
|
| 42 |
+
pass
|
| 43 |
+
|
| 44 |
+
def __setstate__(self, state):
|
| 45 |
+
""" """
|
| 46 |
+
pass
|
| 47 |
+
|
| 48 |
+
@staticmethod
|
| 49 |
+
def custom(decoder):
|
| 50 |
+
""" """
|
| 51 |
+
pass
|
| 52 |
+
|
| 53 |
+
def decode(self, tokens):
|
| 54 |
+
"""
|
| 55 |
+
Decode the given list of tokens to a final string
|
| 56 |
+
|
| 57 |
+
Args:
|
| 58 |
+
tokens (:obj:`List[str]`):
|
| 59 |
+
The list of tokens to decode
|
| 60 |
+
|
| 61 |
+
Returns:
|
| 62 |
+
:obj:`str`: The decoded string
|
| 63 |
+
"""
|
| 64 |
+
pass
|
| 65 |
+
|
| 66 |
+
class BPEDecoder(Decoder):
|
| 67 |
+
"""
|
| 68 |
+
BPEDecoder Decoder
|
| 69 |
+
|
| 70 |
+
Args:
|
| 71 |
+
suffix (:obj:`str`, `optional`, defaults to :obj:`</w>`):
|
| 72 |
+
The suffix that was used to characterize an end-of-word. This suffix will
|
| 73 |
+
be replaced by whitespaces during the decoding
|
| 74 |
+
"""
|
| 75 |
+
def __init__(self, suffix="</w>"):
|
| 76 |
+
pass
|
| 77 |
+
|
| 78 |
+
def __getstate__(self):
|
| 79 |
+
""" """
|
| 80 |
+
pass
|
| 81 |
+
|
| 82 |
+
def __setstate__(self, state):
|
| 83 |
+
""" """
|
| 84 |
+
pass
|
| 85 |
+
|
| 86 |
+
@staticmethod
|
| 87 |
+
def custom(decoder):
|
| 88 |
+
""" """
|
| 89 |
+
pass
|
| 90 |
+
|
| 91 |
+
def decode(self, tokens):
|
| 92 |
+
"""
|
| 93 |
+
Decode the given list of tokens to a final string
|
| 94 |
+
|
| 95 |
+
Args:
|
| 96 |
+
tokens (:obj:`List[str]`):
|
| 97 |
+
The list of tokens to decode
|
| 98 |
+
|
| 99 |
+
Returns:
|
| 100 |
+
:obj:`str`: The decoded string
|
| 101 |
+
"""
|
| 102 |
+
pass
|
| 103 |
+
|
| 104 |
+
@property
|
| 105 |
+
def suffix(self):
|
| 106 |
+
""" """
|
| 107 |
+
pass
|
| 108 |
+
|
| 109 |
+
@suffix.setter
|
| 110 |
+
def suffix(self, value):
|
| 111 |
+
""" """
|
| 112 |
+
pass
|
| 113 |
+
|
| 114 |
+
class ByteFallback(Decoder):
|
| 115 |
+
"""
|
| 116 |
+
ByteFallback Decoder
|
| 117 |
+
ByteFallback is a simple trick which converts tokens looking like `<0x61>`
|
| 118 |
+
to pure bytes, and attempts to make them into a string. If the tokens
|
| 119 |
+
cannot be decoded you will get � instead for each inconvertible byte token
|
| 120 |
+
|
| 121 |
+
"""
|
| 122 |
+
def __init__(self):
|
| 123 |
+
pass
|
| 124 |
+
|
| 125 |
+
def __getstate__(self):
|
| 126 |
+
""" """
|
| 127 |
+
pass
|
| 128 |
+
|
| 129 |
+
def __setstate__(self, state):
|
| 130 |
+
""" """
|
| 131 |
+
pass
|
| 132 |
+
|
| 133 |
+
@staticmethod
|
| 134 |
+
def custom(decoder):
|
| 135 |
+
""" """
|
| 136 |
+
pass
|
| 137 |
+
|
| 138 |
+
def decode(self, tokens):
|
| 139 |
+
"""
|
| 140 |
+
Decode the given list of tokens to a final string
|
| 141 |
+
|
| 142 |
+
Args:
|
| 143 |
+
tokens (:obj:`List[str]`):
|
| 144 |
+
The list of tokens to decode
|
| 145 |
+
|
| 146 |
+
Returns:
|
| 147 |
+
:obj:`str`: The decoded string
|
| 148 |
+
"""
|
| 149 |
+
pass
|
| 150 |
+
|
| 151 |
+
class ByteLevel(Decoder):
|
| 152 |
+
"""
|
| 153 |
+
ByteLevel Decoder
|
| 154 |
+
|
| 155 |
+
This decoder is to be used in tandem with the :class:`~tokenizers.pre_tokenizers.ByteLevel`
|
| 156 |
+
:class:`~tokenizers.pre_tokenizers.PreTokenizer`.
|
| 157 |
+
"""
|
| 158 |
+
def __init__(self):
|
| 159 |
+
pass
|
| 160 |
+
|
| 161 |
+
def __getstate__(self):
|
| 162 |
+
""" """
|
| 163 |
+
pass
|
| 164 |
+
|
| 165 |
+
def __setstate__(self, state):
|
| 166 |
+
""" """
|
| 167 |
+
pass
|
| 168 |
+
|
| 169 |
+
@staticmethod
|
| 170 |
+
def custom(decoder):
|
| 171 |
+
""" """
|
| 172 |
+
pass
|
| 173 |
+
|
| 174 |
+
def decode(self, tokens):
|
| 175 |
+
"""
|
| 176 |
+
Decode the given list of tokens to a final string
|
| 177 |
+
|
| 178 |
+
Args:
|
| 179 |
+
tokens (:obj:`List[str]`):
|
| 180 |
+
The list of tokens to decode
|
| 181 |
+
|
| 182 |
+
Returns:
|
| 183 |
+
:obj:`str`: The decoded string
|
| 184 |
+
"""
|
| 185 |
+
pass
|
| 186 |
+
|
| 187 |
+
class CTC(Decoder):
|
| 188 |
+
"""
|
| 189 |
+
CTC Decoder
|
| 190 |
+
|
| 191 |
+
Args:
|
| 192 |
+
pad_token (:obj:`str`, `optional`, defaults to :obj:`<pad>`):
|
| 193 |
+
The pad token used by CTC to delimit a new token.
|
| 194 |
+
word_delimiter_token (:obj:`str`, `optional`, defaults to :obj:`|`):
|
| 195 |
+
The word delimiter token. It will be replaced by a <space>
|
| 196 |
+
cleanup (:obj:`bool`, `optional`, defaults to :obj:`True`):
|
| 197 |
+
Whether to cleanup some tokenization artifacts.
|
| 198 |
+
Mainly spaces before punctuation, and some abbreviated english forms.
|
| 199 |
+
"""
|
| 200 |
+
def __init__(self, pad_token="<pad>", word_delimiter_token="|", cleanup=True):
|
| 201 |
+
pass
|
| 202 |
+
|
| 203 |
+
def __getstate__(self):
|
| 204 |
+
""" """
|
| 205 |
+
pass
|
| 206 |
+
|
| 207 |
+
def __setstate__(self, state):
|
| 208 |
+
""" """
|
| 209 |
+
pass
|
| 210 |
+
|
| 211 |
+
@property
|
| 212 |
+
def cleanup(self):
|
| 213 |
+
""" """
|
| 214 |
+
pass
|
| 215 |
+
|
| 216 |
+
@cleanup.setter
|
| 217 |
+
def cleanup(self, value):
|
| 218 |
+
""" """
|
| 219 |
+
pass
|
| 220 |
+
|
| 221 |
+
@staticmethod
|
| 222 |
+
def custom(decoder):
|
| 223 |
+
""" """
|
| 224 |
+
pass
|
| 225 |
+
|
| 226 |
+
def decode(self, tokens):
|
| 227 |
+
"""
|
| 228 |
+
Decode the given list of tokens to a final string
|
| 229 |
+
|
| 230 |
+
Args:
|
| 231 |
+
tokens (:obj:`List[str]`):
|
| 232 |
+
The list of tokens to decode
|
| 233 |
+
|
| 234 |
+
Returns:
|
| 235 |
+
:obj:`str`: The decoded string
|
| 236 |
+
"""
|
| 237 |
+
pass
|
| 238 |
+
|
| 239 |
+
@property
|
| 240 |
+
def pad_token(self):
|
| 241 |
+
""" """
|
| 242 |
+
pass
|
| 243 |
+
|
| 244 |
+
@pad_token.setter
|
| 245 |
+
def pad_token(self, value):
|
| 246 |
+
""" """
|
| 247 |
+
pass
|
| 248 |
+
|
| 249 |
+
@property
|
| 250 |
+
def word_delimiter_token(self):
|
| 251 |
+
""" """
|
| 252 |
+
pass
|
| 253 |
+
|
| 254 |
+
@word_delimiter_token.setter
|
| 255 |
+
def word_delimiter_token(self, value):
|
| 256 |
+
""" """
|
| 257 |
+
pass
|
| 258 |
+
|
| 259 |
+
class Fuse(Decoder):
|
| 260 |
+
"""
|
| 261 |
+
Fuse Decoder
|
| 262 |
+
Fuse simply fuses every token into a single string.
|
| 263 |
+
This is the last step of decoding, this decoder exists only if
|
| 264 |
+
there is need to add other decoders *after* the fusion
|
| 265 |
+
"""
|
| 266 |
+
def __init__(self):
|
| 267 |
+
pass
|
| 268 |
+
|
| 269 |
+
def __getstate__(self):
|
| 270 |
+
""" """
|
| 271 |
+
pass
|
| 272 |
+
|
| 273 |
+
def __setstate__(self, state):
|
| 274 |
+
""" """
|
| 275 |
+
pass
|
| 276 |
+
|
| 277 |
+
@staticmethod
|
| 278 |
+
def custom(decoder):
|
| 279 |
+
""" """
|
| 280 |
+
pass
|
| 281 |
+
|
| 282 |
+
def decode(self, tokens):
|
| 283 |
+
"""
|
| 284 |
+
Decode the given list of tokens to a final string
|
| 285 |
+
|
| 286 |
+
Args:
|
| 287 |
+
tokens (:obj:`List[str]`):
|
| 288 |
+
The list of tokens to decode
|
| 289 |
+
|
| 290 |
+
Returns:
|
| 291 |
+
:obj:`str`: The decoded string
|
| 292 |
+
"""
|
| 293 |
+
pass
|
| 294 |
+
|
| 295 |
+
class Metaspace(Decoder):
|
| 296 |
+
"""
|
| 297 |
+
Metaspace Decoder
|
| 298 |
+
|
| 299 |
+
Args:
|
| 300 |
+
replacement (:obj:`str`, `optional`, defaults to :obj:`▁`):
|
| 301 |
+
The replacement character. Must be exactly one character. By default we
|
| 302 |
+
use the `▁` (U+2581) meta symbol (Same as in SentencePiece).
|
| 303 |
+
|
| 304 |
+
prepend_scheme (:obj:`str`, `optional`, defaults to :obj:`"always"`):
|
| 305 |
+
Whether to add a space to the first word if there isn't already one. This
|
| 306 |
+
lets us treat `hello` exactly like `say hello`.
|
| 307 |
+
Choices: "always", "never", "first". First means the space is only added on the first
|
| 308 |
+
token (relevant when special tokens are used or other pre_tokenizer are used).
|
| 309 |
+
"""
|
| 310 |
+
def __init__(self, replacement="▁", prepend_scheme="always", split=True):
|
| 311 |
+
pass
|
| 312 |
+
|
| 313 |
+
def __getstate__(self):
|
| 314 |
+
""" """
|
| 315 |
+
pass
|
| 316 |
+
|
| 317 |
+
def __setstate__(self, state):
|
| 318 |
+
""" """
|
| 319 |
+
pass
|
| 320 |
+
|
| 321 |
+
@staticmethod
|
| 322 |
+
def custom(decoder):
|
| 323 |
+
""" """
|
| 324 |
+
pass
|
| 325 |
+
|
| 326 |
+
def decode(self, tokens):
|
| 327 |
+
"""
|
| 328 |
+
Decode the given list of tokens to a final string
|
| 329 |
+
|
| 330 |
+
Args:
|
| 331 |
+
tokens (:obj:`List[str]`):
|
| 332 |
+
The list of tokens to decode
|
| 333 |
+
|
| 334 |
+
Returns:
|
| 335 |
+
:obj:`str`: The decoded string
|
| 336 |
+
"""
|
| 337 |
+
pass
|
| 338 |
+
|
| 339 |
+
@property
|
| 340 |
+
def prepend_scheme(self):
|
| 341 |
+
""" """
|
| 342 |
+
pass
|
| 343 |
+
|
| 344 |
+
@prepend_scheme.setter
|
| 345 |
+
def prepend_scheme(self, value):
|
| 346 |
+
""" """
|
| 347 |
+
pass
|
| 348 |
+
|
| 349 |
+
@property
|
| 350 |
+
def replacement(self):
|
| 351 |
+
""" """
|
| 352 |
+
pass
|
| 353 |
+
|
| 354 |
+
@replacement.setter
|
| 355 |
+
def replacement(self, value):
|
| 356 |
+
""" """
|
| 357 |
+
pass
|
| 358 |
+
|
| 359 |
+
@property
|
| 360 |
+
def split(self):
|
| 361 |
+
""" """
|
| 362 |
+
pass
|
| 363 |
+
|
| 364 |
+
@split.setter
|
| 365 |
+
def split(self, value):
|
| 366 |
+
""" """
|
| 367 |
+
pass
|
| 368 |
+
|
| 369 |
+
class Replace(Decoder):
|
| 370 |
+
"""
|
| 371 |
+
Replace Decoder
|
| 372 |
+
|
| 373 |
+
This decoder is to be used in tandem with the :class:`~tokenizers.pre_tokenizers.Replace`
|
| 374 |
+
:class:`~tokenizers.pre_tokenizers.PreTokenizer`.
|
| 375 |
+
"""
|
| 376 |
+
def __init__(self, pattern, content):
|
| 377 |
+
pass
|
| 378 |
+
|
| 379 |
+
def __getstate__(self):
|
| 380 |
+
""" """
|
| 381 |
+
pass
|
| 382 |
+
|
| 383 |
+
def __setstate__(self, state):
|
| 384 |
+
""" """
|
| 385 |
+
pass
|
| 386 |
+
|
| 387 |
+
@staticmethod
|
| 388 |
+
def custom(decoder):
|
| 389 |
+
""" """
|
| 390 |
+
pass
|
| 391 |
+
|
| 392 |
+
def decode(self, tokens):
|
| 393 |
+
"""
|
| 394 |
+
Decode the given list of tokens to a final string
|
| 395 |
+
|
| 396 |
+
Args:
|
| 397 |
+
tokens (:obj:`List[str]`):
|
| 398 |
+
The list of tokens to decode
|
| 399 |
+
|
| 400 |
+
Returns:
|
| 401 |
+
:obj:`str`: The decoded string
|
| 402 |
+
"""
|
| 403 |
+
pass
|
| 404 |
+
|
| 405 |
+
class Sequence(Decoder):
|
| 406 |
+
"""
|
| 407 |
+
Sequence Decoder
|
| 408 |
+
|
| 409 |
+
Args:
|
| 410 |
+
decoders (:obj:`List[Decoder]`)
|
| 411 |
+
The decoders that need to be chained
|
| 412 |
+
"""
|
| 413 |
+
def __init__(self, decoders):
|
| 414 |
+
pass
|
| 415 |
+
|
| 416 |
+
def __getnewargs__(self):
|
| 417 |
+
""" """
|
| 418 |
+
pass
|
| 419 |
+
|
| 420 |
+
def __getstate__(self):
|
| 421 |
+
""" """
|
| 422 |
+
pass
|
| 423 |
+
|
| 424 |
+
def __setstate__(self, state):
|
| 425 |
+
""" """
|
| 426 |
+
pass
|
| 427 |
+
|
| 428 |
+
@staticmethod
|
| 429 |
+
def custom(decoder):
|
| 430 |
+
""" """
|
| 431 |
+
pass
|
| 432 |
+
|
| 433 |
+
def decode(self, tokens):
|
| 434 |
+
"""
|
| 435 |
+
Decode the given list of tokens to a final string
|
| 436 |
+
|
| 437 |
+
Args:
|
| 438 |
+
tokens (:obj:`List[str]`):
|
| 439 |
+
The list of tokens to decode
|
| 440 |
+
|
| 441 |
+
Returns:
|
| 442 |
+
:obj:`str`: The decoded string
|
| 443 |
+
"""
|
| 444 |
+
pass
|
| 445 |
+
|
| 446 |
+
class Strip(Decoder):
|
| 447 |
+
"""
|
| 448 |
+
Strip normalizer
|
| 449 |
+
Strips n left characters of each token, or n right characters of each token
|
| 450 |
+
"""
|
| 451 |
+
def __init__(self, content=" ", left=0, right=0):
|
| 452 |
+
pass
|
| 453 |
+
|
| 454 |
+
def __getstate__(self):
|
| 455 |
+
""" """
|
| 456 |
+
pass
|
| 457 |
+
|
| 458 |
+
def __setstate__(self, state):
|
| 459 |
+
""" """
|
| 460 |
+
pass
|
| 461 |
+
|
| 462 |
+
@property
|
| 463 |
+
def content(self):
|
| 464 |
+
""" """
|
| 465 |
+
pass
|
| 466 |
+
|
| 467 |
+
@content.setter
|
| 468 |
+
def content(self, value):
|
| 469 |
+
""" """
|
| 470 |
+
pass
|
| 471 |
+
|
| 472 |
+
@staticmethod
|
| 473 |
+
def custom(decoder):
|
| 474 |
+
""" """
|
| 475 |
+
pass
|
| 476 |
+
|
| 477 |
+
def decode(self, tokens):
|
| 478 |
+
"""
|
| 479 |
+
Decode the given list of tokens to a final string
|
| 480 |
+
|
| 481 |
+
Args:
|
| 482 |
+
tokens (:obj:`List[str]`):
|
| 483 |
+
The list of tokens to decode
|
| 484 |
+
|
| 485 |
+
Returns:
|
| 486 |
+
:obj:`str`: The decoded string
|
| 487 |
+
"""
|
| 488 |
+
pass
|
| 489 |
+
|
| 490 |
+
@property
|
| 491 |
+
def start(self):
|
| 492 |
+
""" """
|
| 493 |
+
pass
|
| 494 |
+
|
| 495 |
+
@start.setter
|
| 496 |
+
def start(self, value):
|
| 497 |
+
""" """
|
| 498 |
+
pass
|
| 499 |
+
|
| 500 |
+
@property
|
| 501 |
+
def stop(self):
|
| 502 |
+
""" """
|
| 503 |
+
pass
|
| 504 |
+
|
| 505 |
+
@stop.setter
|
| 506 |
+
def stop(self, value):
|
| 507 |
+
""" """
|
| 508 |
+
pass
|
| 509 |
+
|
| 510 |
+
class WordPiece(Decoder):
|
| 511 |
+
"""
|
| 512 |
+
WordPiece Decoder
|
| 513 |
+
|
| 514 |
+
Args:
|
| 515 |
+
prefix (:obj:`str`, `optional`, defaults to :obj:`##`):
|
| 516 |
+
The prefix to use for subwords that are not a beginning-of-word
|
| 517 |
+
|
| 518 |
+
cleanup (:obj:`bool`, `optional`, defaults to :obj:`True`):
|
| 519 |
+
Whether to cleanup some tokenization artifacts. Mainly spaces before punctuation,
|
| 520 |
+
and some abbreviated english forms.
|
| 521 |
+
"""
|
| 522 |
+
def __init__(self, prefix="##", cleanup=True):
|
| 523 |
+
pass
|
| 524 |
+
|
| 525 |
+
def __getstate__(self):
|
| 526 |
+
""" """
|
| 527 |
+
pass
|
| 528 |
+
|
| 529 |
+
def __setstate__(self, state):
|
| 530 |
+
""" """
|
| 531 |
+
pass
|
| 532 |
+
|
| 533 |
+
@property
|
| 534 |
+
def cleanup(self):
|
| 535 |
+
""" """
|
| 536 |
+
pass
|
| 537 |
+
|
| 538 |
+
@cleanup.setter
|
| 539 |
+
def cleanup(self, value):
|
| 540 |
+
""" """
|
| 541 |
+
pass
|
| 542 |
+
|
| 543 |
+
@staticmethod
|
| 544 |
+
def custom(decoder):
|
| 545 |
+
""" """
|
| 546 |
+
pass
|
| 547 |
+
|
| 548 |
+
def decode(self, tokens):
|
| 549 |
+
"""
|
| 550 |
+
Decode the given list of tokens to a final string
|
| 551 |
+
|
| 552 |
+
Args:
|
| 553 |
+
tokens (:obj:`List[str]`):
|
| 554 |
+
The list of tokens to decode
|
| 555 |
+
|
| 556 |
+
Returns:
|
| 557 |
+
:obj:`str`: The decoded string
|
| 558 |
+
"""
|
| 559 |
+
pass
|
| 560 |
+
|
| 561 |
+
@property
|
| 562 |
+
def prefix(self):
|
| 563 |
+
""" """
|
| 564 |
+
pass
|
| 565 |
+
|
| 566 |
+
@prefix.setter
|
| 567 |
+
def prefix(self, value):
|
| 568 |
+
""" """
|
| 569 |
+
pass
|
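The DecodeStream stub at the top of this file describes incremental decoding. A sketch of how it is typically driven (the "gpt2" checkpoint is an example and needs Hub access; any serialized tokenizer works):

from tokenizers import Tokenizer
from tokenizers.decoders import DecodeStream

tok = Tokenizer.from_pretrained("gpt2")          # illustrative checkpoint, downloaded from the Hub
stream = DecodeStream(skip_special_tokens=True)
for token_id in tok.encode("Hello world").ids:
    chunk = stream.step(tok, token_id)           # returns None until a full text chunk is ready
    if chunk is not None:
        print(chunk, end="")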
source/tokenizers/implementations/__init__.py
ADDED
|
@@ -0,0 +1,6 @@
from .base_tokenizer import BaseTokenizer
from .bert_wordpiece import BertWordPieceTokenizer
from .byte_level_bpe import ByteLevelBPETokenizer
from .char_level_bpe import CharBPETokenizer
from .sentencepiece_bpe import SentencePieceBPETokenizer
from .sentencepiece_unigram import SentencePieceUnigramTokenizer
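The classes exported here are pre-configured wrappers around Tokenizer. A quick sketch with one of them (corpus and vocabulary size are illustrative):

from tokenizers.implementations import SentencePieceBPETokenizer

sp_tok = SentencePieceBPETokenizer()      # untrained, SentencePiece-style BPE
sp_tok.train_from_iterator(["nice day", "nice weather"], vocab_size=100, min_frequency=1)
print(sp_tok.encode("nice day").tokens)   # pieces carry the ▁ meta symbol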
source/tokenizers/implementations/base_tokenizer.py
ADDED
|
@@ -0,0 +1,459 @@
| 1 |
+
from typing import Dict, List, Optional, Tuple, Union
|
| 2 |
+
|
| 3 |
+
from tokenizers import AddedToken, EncodeInput, Encoding, InputSequence, Tokenizer
|
| 4 |
+
from tokenizers.decoders import Decoder
|
| 5 |
+
from tokenizers.models import Model
|
| 6 |
+
from tokenizers.normalizers import Normalizer
|
| 7 |
+
from tokenizers.pre_tokenizers import PreTokenizer
|
| 8 |
+
from tokenizers.processors import PostProcessor
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
Offsets = Tuple[int, int]
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class BaseTokenizer:
|
| 15 |
+
def __init__(self, tokenizer: Tokenizer, parameters=None):
|
| 16 |
+
self._tokenizer = tokenizer
|
| 17 |
+
self._parameters = parameters if parameters is not None else {}
|
| 18 |
+
|
| 19 |
+
def __repr__(self):
|
| 20 |
+
return "Tokenizer(vocabulary_size={}, {})".format(
|
| 21 |
+
self._tokenizer.get_vocab_size(),
|
| 22 |
+
", ".join(k + "=" + str(v) for k, v in self._parameters.items()),
|
| 23 |
+
)
|
| 24 |
+
|
| 25 |
+
def num_special_tokens_to_add(self, is_pair: bool) -> int:
|
| 26 |
+
"""
|
| 27 |
+
Return the number of special tokens that would be added for single/pair sentences.
|
| 28 |
+
:param is_pair: Boolean indicating if the input would be a single sentence or a pair
|
| 29 |
+
:return:
|
| 30 |
+
"""
|
| 31 |
+
return self._tokenizer.num_special_tokens_to_add(is_pair)
|
| 32 |
+
|
| 33 |
+
def get_vocab(self, with_added_tokens: bool = True) -> Dict[str, int]:
|
| 34 |
+
"""Returns the vocabulary
|
| 35 |
+
|
| 36 |
+
Args:
|
| 37 |
+
with_added_tokens: boolean:
|
| 38 |
+
Whether to include the added tokens in the vocabulary
|
| 39 |
+
|
| 40 |
+
Returns:
|
| 41 |
+
The vocabulary
|
| 42 |
+
"""
|
| 43 |
+
return self._tokenizer.get_vocab(with_added_tokens=with_added_tokens)
|
| 44 |
+
|
| 45 |
+
def get_added_tokens_decoder(self) -> Dict[int, AddedToken]:
|
| 46 |
+
"""Returns the added reverse vocabulary
|
| 47 |
+
|
| 48 |
+
Returns:
|
| 49 |
+
The added vocabulary mapping ints to AddedTokens
|
| 50 |
+
"""
|
| 51 |
+
return self._tokenizer.get_added_tokens_decoder()
|
| 52 |
+
|
| 53 |
+
def get_vocab_size(self, with_added_tokens: bool = True) -> int:
|
| 54 |
+
"""Return the size of vocabulary, with or without added tokens.
|
| 55 |
+
|
| 56 |
+
Args:
|
| 57 |
+
with_added_tokens: (`optional`) bool:
|
| 58 |
+
Whether to count in added special tokens or not
|
| 59 |
+
|
| 60 |
+
Returns:
|
| 61 |
+
Size of vocabulary
|
| 62 |
+
"""
|
| 63 |
+
return self._tokenizer.get_vocab_size(with_added_tokens=with_added_tokens)
|
| 64 |
+
|
| 65 |
+
def enable_padding(
|
| 66 |
+
self,
|
| 67 |
+
direction: Optional[str] = "right",
|
| 68 |
+
pad_to_multiple_of: Optional[int] = None,
|
| 69 |
+
pad_id: Optional[int] = 0,
|
| 70 |
+
pad_type_id: Optional[int] = 0,
|
| 71 |
+
pad_token: Optional[str] = "[PAD]",
|
| 72 |
+
length: Optional[int] = None,
|
| 73 |
+
):
|
| 74 |
+
"""Change the padding strategy
|
| 75 |
+
|
| 76 |
+
Args:
|
| 77 |
+
direction: (`optional`) str:
|
| 78 |
+
Can be one of: `right` or `left`
|
| 79 |
+
|
| 80 |
+
pad_to_multiple_of: (`optional`) unsigned int:
|
| 81 |
+
If specified, the padding length should always snap to the next multiple of
|
| 82 |
+
the given value. For example if we were going to pad with a length of 250 but
|
| 83 |
+
`pad_to_multiple_of=8` then we will pad to 256.
|
| 84 |
+
|
| 85 |
+
pad_id: (`optional`) unsigned int:
|
| 86 |
+
The indice to be used when padding
|
| 87 |
+
|
| 88 |
+
pad_type_id: (`optional`) unsigned int:
|
| 89 |
+
The type indice to be used when padding
|
| 90 |
+
|
| 91 |
+
pad_token: (`optional`) str:
|
| 92 |
+
The pad token to be used when padding
|
| 93 |
+
|
| 94 |
+
length: (`optional`) unsigned int:
|
| 95 |
+
If specified, the length at which to pad. If not specified
|
| 96 |
+
we pad using the size of the longest sequence in a batch
|
| 97 |
+
"""
|
| 98 |
+
return self._tokenizer.enable_padding(
|
| 99 |
+
direction=direction,
|
| 100 |
+
pad_to_multiple_of=pad_to_multiple_of,
|
| 101 |
+
pad_id=pad_id,
|
| 102 |
+
pad_type_id=pad_type_id,
|
| 103 |
+
pad_token=pad_token,
|
| 104 |
+
length=length,
|
| 105 |
+
)
|
| 106 |
+
|
| 107 |
+
def no_padding(self):
|
| 108 |
+
"""Disable padding"""
|
| 109 |
+
return self._tokenizer.no_padding()
|
| 110 |
+
|
| 111 |
+
@property
|
| 112 |
+
def padding(self) -> Optional[dict]:
|
| 113 |
+
"""Get the current padding parameters
|
| 114 |
+
|
| 115 |
+
Returns:
|
| 116 |
+
None if padding is disabled, a dict with the currently set parameters
|
| 117 |
+
if the padding is enabled.
|
| 118 |
+
"""
|
| 119 |
+
return self._tokenizer.padding
|
| 120 |
+
|
| 121 |
+
def enable_truncation(self, max_length: int, stride: Optional[int] = 0, strategy: Optional[str] = "longest_first"):
|
| 122 |
+
"""Change the truncation options
|
| 123 |
+
|
| 124 |
+
Args:
|
| 125 |
+
max_length: unsigned int:
|
| 126 |
+
The maximum length at which to truncate
|
| 127 |
+
|
| 128 |
+
stride: (`optional`) unsigned int:
|
| 129 |
+
The length of the previous first sequence to be included
|
| 130 |
+
in the overflowing sequence
|
| 131 |
+
|
| 132 |
+
strategy: (`optional`) str:
|
| 133 |
+
Can be one of `longest_first`, `only_first` or `only_second`
|
| 134 |
+
"""
|
| 135 |
+
return self._tokenizer.enable_truncation(max_length, stride=stride, strategy=strategy)
|
| 136 |
+
|
| 137 |
+
def no_truncation(self):
|
| 138 |
+
"""Disable truncation"""
|
| 139 |
+
return self._tokenizer.no_truncation()
|
| 140 |
+
|
| 141 |
+
@property
|
| 142 |
+
def truncation(self) -> Optional[dict]:
|
| 143 |
+
"""Get the current truncation parameters
|
| 144 |
+
|
| 145 |
+
Returns:
|
| 146 |
+
None if truncation is disabled, a dict with the current truncation parameters if
|
| 147 |
+
truncation is enabled
|
| 148 |
+
"""
|
| 149 |
+
return self._tokenizer.truncation
|
| 150 |
+
|
| 151 |
+
def add_tokens(self, tokens: List[Union[str, AddedToken]]) -> int:
|
| 152 |
+
"""Add the given tokens to the vocabulary
|
| 153 |
+
|
| 154 |
+
Args:
|
| 155 |
+
tokens: List[Union[str, AddedToken]]:
|
| 156 |
+
A list of tokens to add to the vocabulary. Each token can either be
|
| 157 |
+
a string, or an instance of AddedToken
|
| 158 |
+
|
| 159 |
+
Returns:
|
| 160 |
+
The number of tokens that were added to the vocabulary
|
| 161 |
+
"""
|
| 162 |
+
return self._tokenizer.add_tokens(tokens)
|
| 163 |
+
|
| 164 |
+
def add_special_tokens(self, special_tokens: List[Union[str, AddedToken]]) -> int:
|
| 165 |
+
"""Add the given special tokens to the vocabulary, and treat them as special tokens.
|
| 166 |
+
|
| 167 |
+
The special tokens will never be processed by the model, and will be
|
| 168 |
+
removed while decoding.
|
| 169 |
+
|
| 170 |
+
Args:
|
| 171 |
+
tokens: List[Union[str, AddedToken]]:
|
| 172 |
+
A list of special tokens to add to the vocabulary. Each token can either be
|
| 173 |
+
a string, or an instance of AddedToken
|
| 174 |
+
|
| 175 |
+
Returns:
|
| 176 |
+
The number of tokens that were added to the vocabulary
|
| 177 |
+
"""
|
| 178 |
+
return self._tokenizer.add_special_tokens(special_tokens)
|
| 179 |
+
|
| 180 |
+
def normalize(self, sequence: str) -> str:
|
| 181 |
+
"""Normalize the given sequence
|
| 182 |
+
|
| 183 |
+
Args:
|
| 184 |
+
sequence: str:
|
| 185 |
+
The sequence to normalize
|
| 186 |
+
|
| 187 |
+
Returns:
|
| 188 |
+
The normalized string
|
| 189 |
+
"""
|
| 190 |
+
return self._tokenizer.normalizer.normalize_str(sequence)
|
| 191 |
+
|
| 192 |
+
def encode(
|
| 193 |
+
self,
|
| 194 |
+
sequence: InputSequence,
|
| 195 |
+
pair: Optional[InputSequence] = None,
|
| 196 |
+
is_pretokenized: bool = False,
|
| 197 |
+
add_special_tokens: bool = True,
|
| 198 |
+
) -> Encoding:
|
| 199 |
+
"""Encode the given sequence and pair. This method can process raw text sequences as well
|
| 200 |
+
as already pre-tokenized sequences.
|
| 201 |
+
|
| 202 |
+
Args:
|
| 203 |
+
sequence: InputSequence:
|
| 204 |
+
The sequence we want to encode. This sequence can be either raw text or
|
| 205 |
+
pre-tokenized, according to the `is_pretokenized` argument:
|
| 206 |
+
|
| 207 |
+
- If `is_pretokenized=False`: `InputSequence` is expected to be `str`
|
| 208 |
+
- If `is_pretokenized=True`: `InputSequence` is expected to be
|
| 209 |
+
`Union[List[str], Tuple[str]]`
|
| 210 |
+
|
| 211 |
+
is_pretokenized: bool:
|
| 212 |
+
Whether the input is already pre-tokenized.
|
| 213 |
+
|
| 214 |
+
add_special_tokens: bool:
|
| 215 |
+
Whether to add the special tokens while encoding.
|
| 216 |
+
|
| 217 |
+
Returns:
|
| 218 |
+
An Encoding
|
| 219 |
+
"""
|
| 220 |
+
if sequence is None:
|
| 221 |
+
raise ValueError("encode: `sequence` can't be `None`")
|
| 222 |
+
|
| 223 |
+
return self._tokenizer.encode(sequence, pair, is_pretokenized, add_special_tokens)
|
| 224 |
+
|
| 225 |
+
def encode_batch(
|
| 226 |
+
self,
|
| 227 |
+
inputs: List[EncodeInput],
|
| 228 |
+
is_pretokenized: bool = False,
|
| 229 |
+
add_special_tokens: bool = True,
|
| 230 |
+
) -> List[Encoding]:
|
| 231 |
+
"""Encode the given inputs. This method accept both raw text sequences as well as already
|
| 232 |
+
pre-tokenized sequences.
|
| 233 |
+
|
| 234 |
+
Args:
|
| 235 |
+
inputs: List[EncodeInput]:
|
| 236 |
+
A list of single sequences or pair sequences to encode. Each `EncodeInput` is
|
| 237 |
+
expected to be of the following form:
|
| 238 |
+
`Union[InputSequence, Tuple[InputSequence, InputSequence]]`
|
| 239 |
+
|
| 240 |
+
Each `InputSequence` can either be raw text or pre-tokenized,
|
| 241 |
+
according to the `is_pretokenized` argument:
|
| 242 |
+
|
| 243 |
+
- If `is_pretokenized=False`: `InputSequence` is expected to be `str`
|
| 244 |
+
- If `is_pretokenized=True`: `InputSequence` is expected to be
|
| 245 |
+
`Union[List[str], Tuple[str]]`
|
| 246 |
+
|
| 247 |
+
is_pretokenized: bool:
|
| 248 |
+
Whether the input is already pre-tokenized.
|
| 249 |
+
|
| 250 |
+
add_special_tokens: bool:
|
| 251 |
+
Whether to add the special tokens while encoding.
|
| 252 |
+
|
| 253 |
+
Returns:
|
| 254 |
+
A list of Encoding
|
| 255 |
+
"""
|
| 256 |
+
|
| 257 |
+
if inputs is None:
|
| 258 |
+
raise ValueError("encode_batch: `inputs` can't be `None`")
|
| 259 |
+
|
| 260 |
+
return self._tokenizer.encode_batch(inputs, is_pretokenized, add_special_tokens)
|
| 261 |
+
|
| 262 |
+
async def async_encode_batch(
|
| 263 |
+
self,
|
| 264 |
+
inputs: List[EncodeInput],
|
| 265 |
+
is_pretokenized: bool = False,
|
| 266 |
+
add_special_tokens: bool = True,
|
| 267 |
+
) -> List[Encoding]:
|
| 268 |
+
"""Asynchronously encode a batch (tracks character offsets).
|
| 269 |
+
|
| 270 |
+
Args:
|
| 271 |
+
inputs: A list of single or pair sequences to encode.
|
| 272 |
+
is_pretokenized: Whether inputs are already pre-tokenized.
|
| 273 |
+
add_special_tokens: Whether to add special tokens.
|
| 274 |
+
|
| 275 |
+
Returns:
|
| 276 |
+
A list of Encoding.
|
| 277 |
+
"""
|
| 278 |
+
if inputs is None:
|
| 279 |
+
raise ValueError("async_encode_batch: `inputs` can't be `None`")
|
| 280 |
+
# Exposed by the Rust bindings via pyo3_async_runtimes::tokio::future_into_py
|
| 281 |
+
return await self._tokenizer.async_encode_batch(inputs, is_pretokenized, add_special_tokens)
|
| 282 |
+
|
| 283 |
+
async def async_encode_batch_fast(
|
| 284 |
+
self,
|
| 285 |
+
inputs: List[EncodeInput],
|
| 286 |
+
is_pretokenized: bool = False,
|
| 287 |
+
add_special_tokens: bool = True,
|
| 288 |
+
) -> List[Encoding]:
|
| 289 |
+
"""Asynchronously encode a batch (no character offsets, faster).
|
| 290 |
+
|
| 291 |
+
Args:
|
| 292 |
+
inputs: A list of single or pair sequences to encode.
|
| 293 |
+
is_pretokenized: Whether inputs are already pre-tokenized.
|
| 294 |
+
add_special_tokens: Whether to add special tokens.
|
| 295 |
+
|
| 296 |
+
Returns:
|
| 297 |
+
A list of Encoding.
|
| 298 |
+
"""
|
| 299 |
+
if inputs is None:
|
| 300 |
+
raise ValueError("async_encode_batch_fast: `inputs` can't be `None`")
|
| 301 |
+
return await self._tokenizer.async_encode_batch_fast(inputs, is_pretokenized, add_special_tokens)
|
| 302 |
+
|
| 303 |
+
def decode(self, ids: List[int], skip_special_tokens: Optional[bool] = True) -> str:
|
| 304 |
+
"""Decode the given list of ids to a string sequence
|
| 305 |
+
|
| 306 |
+
Args:
|
| 307 |
+
ids: List[unsigned int]:
|
| 308 |
+
A list of ids to be decoded
|
| 309 |
+
|
| 310 |
+
skip_special_tokens: (`optional`) boolean:
|
| 311 |
+
Whether to remove all the special tokens from the output string
|
| 312 |
+
|
| 313 |
+
Returns:
|
| 314 |
+
The decoded string
|
| 315 |
+
"""
|
| 316 |
+
if ids is None:
|
| 317 |
+
raise ValueError("None input is not valid. Should be a list of integers.")
|
| 318 |
+
|
| 319 |
+
return self._tokenizer.decode(ids, skip_special_tokens=skip_special_tokens)
|
| 320 |
+
|
| 321 |
+
def decode_batch(self, sequences: List[List[int]], skip_special_tokens: Optional[bool] = True) -> str:
|
| 322 |
+
"""Decode the list of sequences to a list of string sequences
|
| 323 |
+
|
| 324 |
+
Args:
|
| 325 |
+
sequences: List[List[unsigned int]]:
|
| 326 |
+
A list of sequence of ids to be decoded
|
| 327 |
+
|
| 328 |
+
skip_special_tokens: (`optional`) boolean:
|
| 329 |
+
Whether to remove all the special tokens from the output strings
|
| 330 |
+
|
| 331 |
+
Returns:
|
| 332 |
+
A list of decoded strings
|
| 333 |
+
"""
|
| 334 |
+
if sequences is None:
|
| 335 |
+
raise ValueError("None input is not valid. Should be list of list of integers.")
|
| 336 |
+
|
| 337 |
+
return self._tokenizer.decode_batch(sequences, skip_special_tokens=skip_special_tokens)
|
| 338 |
+
|
| 339 |
+
def token_to_id(self, token: str) -> Optional[int]:
|
| 340 |
+
"""Convert the given token to its corresponding id
|
| 341 |
+
|
| 342 |
+
Args:
|
| 343 |
+
token: str:
|
| 344 |
+
The token to convert
|
| 345 |
+
|
| 346 |
+
Returns:
|
| 347 |
+
The corresponding id if it exists, None otherwise
|
| 348 |
+
"""
|
| 349 |
+
return self._tokenizer.token_to_id(token)
|
| 350 |
+
|
| 351 |
+
def id_to_token(self, id: int) -> Optional[str]:
|
| 352 |
+
"""Convert the given token id to its corresponding string
|
| 353 |
+
|
| 354 |
+
Args:
|
| 355 |
+
token: id:
|
| 356 |
+
The token id to convert
|
| 357 |
+
|
| 358 |
+
Returns:
|
| 359 |
+
The corresponding string if it exists, None otherwise
|
| 360 |
+
"""
|
| 361 |
+
return self._tokenizer.id_to_token(id)
|
| 362 |
+
|
| 363 |
+
def save_model(self, directory: str, prefix: Optional[str] = None):
|
| 364 |
+
"""Save the current model to the given directory
|
| 365 |
+
|
| 366 |
+
Args:
|
| 367 |
+
directory: str:
|
| 368 |
+
A path to the destination directory
|
| 369 |
+
|
| 370 |
+
prefix: (Optional) str:
|
| 371 |
+
An optional prefix, used to prefix each file name
|
| 372 |
+
"""
|
| 373 |
+
return self._tokenizer.model.save(directory, prefix=prefix)
|
| 374 |
+
|
| 375 |
+
def save(self, path: str, pretty: bool = True):
|
| 376 |
+
"""Save the current Tokenizer at the given path
|
| 377 |
+
|
| 378 |
+
Args:
|
| 379 |
+
path: str:
|
| 380 |
+
A path to the destination Tokenizer file
|
| 381 |
+
"""
|
| 382 |
+
return self._tokenizer.save(path, pretty)
|
| 383 |
+
|
| 384 |
+
def to_str(self, pretty: bool = False):
|
| 385 |
+
"""Get a serialized JSON version of the Tokenizer as a str
|
| 386 |
+
|
| 387 |
+
Args:
|
| 388 |
+
pretty: bool:
|
| 389 |
+
Whether the JSON string should be prettified
|
| 390 |
+
|
| 391 |
+
Returns:
|
| 392 |
+
str
|
| 393 |
+
"""
|
| 394 |
+
return self._tokenizer.to_str(pretty)
|
| 395 |
+
|
| 396 |
+
def post_process(
|
| 397 |
+
self, encoding: Encoding, pair: Optional[Encoding] = None, add_special_tokens: bool = True
|
| 398 |
+
) -> Encoding:
|
| 399 |
+
"""Apply all the post-processing steps to the given encodings.
|
| 400 |
+
|
| 401 |
+
The various steps are:
|
| 402 |
+
1. Truncate according to global params (provided to `enable_truncation`)
|
| 403 |
+
2. Apply the PostProcessor
|
| 404 |
+
3. Pad according to global params. (provided to `enable_padding`)
|
| 405 |
+
|
| 406 |
+
Args:
|
| 407 |
+
encoding: Encoding:
|
| 408 |
+
The main Encoding to post process
|
| 409 |
+
|
| 410 |
+
pair: Optional[Encoding]:
|
| 411 |
+
An optional pair Encoding
|
| 412 |
+
|
| 413 |
+
add_special_tokens: bool:
|
| 414 |
+
Whether to add special tokens
|
| 415 |
+
|
| 416 |
+
Returns:
|
| 417 |
+
The resulting Encoding
|
| 418 |
+
"""
|
| 419 |
+
return self._tokenizer.post_process(encoding, pair, add_special_tokens)
|
| 420 |
+
|
| 421 |
+
@property
|
| 422 |
+
def model(self) -> Model:
|
| 423 |
+
return self._tokenizer.model
|
| 424 |
+
|
| 425 |
+
@model.setter
|
| 426 |
+
def model(self, model: Model):
|
| 427 |
+
self._tokenizer.model = model
|
| 428 |
+
|
| 429 |
+
@property
|
| 430 |
+
def normalizer(self) -> Normalizer:
|
| 431 |
+
return self._tokenizer.normalizer
|
| 432 |
+
|
| 433 |
+
@normalizer.setter
|
| 434 |
+
def normalizer(self, normalizer: Normalizer):
|
| 435 |
+
self._tokenizer.normalizer = normalizer
|
| 436 |
+
|
| 437 |
+
@property
|
| 438 |
+
def pre_tokenizer(self) -> PreTokenizer:
|
| 439 |
+
return self._tokenizer.pre_tokenizer
|
| 440 |
+
|
| 441 |
+
@pre_tokenizer.setter
|
| 442 |
+
def pre_tokenizer(self, pre_tokenizer: PreTokenizer):
|
| 443 |
+
self._tokenizer.pre_tokenizer = pre_tokenizer
|
| 444 |
+
|
| 445 |
+
@property
|
| 446 |
+
def post_processor(self) -> PostProcessor:
|
| 447 |
+
return self._tokenizer.post_processor
|
| 448 |
+
|
| 449 |
+
@post_processor.setter
|
| 450 |
+
def post_processor(self, post_processor: PostProcessor):
|
| 451 |
+
self._tokenizer.post_processor = post_processor
|
| 452 |
+
|
| 453 |
+
@property
|
| 454 |
+
def decoder(self) -> Decoder:
|
| 455 |
+
return self._tokenizer.decoder
|
| 456 |
+
|
| 457 |
+
@decoder.setter
|
| 458 |
+
def decoder(self, decoder: Decoder):
|
| 459 |
+
self._tokenizer.decoder = decoder
|
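BaseTokenizer forwards almost everything to the wrapped Tokenizer, so padding and truncation behave exactly as the docstrings above describe. A sketch of the batch path (the checkpoint name is illustrative and needs Hub access):

from tokenizers import Tokenizer

tok = Tokenizer.from_pretrained("bert-base-uncased")
tok.enable_truncation(max_length=8, stride=0, strategy="longest_first")
tok.enable_padding(pad_id=0, pad_token="[PAD]", length=8)

for enc in tok.encode_batch(["a short sentence", "a much longer sentence that gets truncated"]):
    print(enc.ids)                      # every row padded/truncated to length 8
print(tok.padding, tok.truncation)      # dicts describing the current settings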
source/tokenizers/implementations/bert_wordpiece.py
ADDED
|
@@ -0,0 +1,151 @@
| 1 |
+
from typing import Dict, Iterator, List, Optional, Union
|
| 2 |
+
|
| 3 |
+
from tokenizers import AddedToken, Tokenizer, decoders, trainers
|
| 4 |
+
from tokenizers.models import WordPiece
|
| 5 |
+
from tokenizers.normalizers import BertNormalizer
|
| 6 |
+
from tokenizers.pre_tokenizers import BertPreTokenizer
|
| 7 |
+
from tokenizers.processors import BertProcessing
|
| 8 |
+
|
| 9 |
+
from .base_tokenizer import BaseTokenizer
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class BertWordPieceTokenizer(BaseTokenizer):
|
| 13 |
+
"""Bert WordPiece Tokenizer"""
|
| 14 |
+
|
| 15 |
+
def __init__(
|
| 16 |
+
self,
|
| 17 |
+
vocab: Optional[Union[str, Dict[str, int]]] = None,
|
| 18 |
+
unk_token: Union[str, AddedToken] = "[UNK]",
|
| 19 |
+
sep_token: Union[str, AddedToken] = "[SEP]",
|
| 20 |
+
cls_token: Union[str, AddedToken] = "[CLS]",
|
| 21 |
+
pad_token: Union[str, AddedToken] = "[PAD]",
|
| 22 |
+
mask_token: Union[str, AddedToken] = "[MASK]",
|
| 23 |
+
clean_text: bool = True,
|
| 24 |
+
handle_chinese_chars: bool = True,
|
| 25 |
+
strip_accents: Optional[bool] = None,
|
| 26 |
+
lowercase: bool = True,
|
| 27 |
+
wordpieces_prefix: str = "##",
|
| 28 |
+
):
|
| 29 |
+
if vocab is not None:
|
| 30 |
+
tokenizer = Tokenizer(WordPiece(vocab, unk_token=str(unk_token)))
|
| 31 |
+
else:
|
| 32 |
+
tokenizer = Tokenizer(WordPiece(unk_token=str(unk_token)))
|
| 33 |
+
|
| 34 |
+
# Let the tokenizer know about special tokens if they are part of the vocab
|
| 35 |
+
if tokenizer.token_to_id(str(unk_token)) is not None:
|
| 36 |
+
tokenizer.add_special_tokens([str(unk_token)])
|
| 37 |
+
if tokenizer.token_to_id(str(sep_token)) is not None:
|
| 38 |
+
tokenizer.add_special_tokens([str(sep_token)])
|
| 39 |
+
if tokenizer.token_to_id(str(cls_token)) is not None:
|
| 40 |
+
tokenizer.add_special_tokens([str(cls_token)])
|
| 41 |
+
if tokenizer.token_to_id(str(pad_token)) is not None:
|
| 42 |
+
tokenizer.add_special_tokens([str(pad_token)])
|
| 43 |
+
if tokenizer.token_to_id(str(mask_token)) is not None:
|
| 44 |
+
tokenizer.add_special_tokens([str(mask_token)])
|
| 45 |
+
|
| 46 |
+
tokenizer.normalizer = BertNormalizer(
|
| 47 |
+
clean_text=clean_text,
|
| 48 |
+
handle_chinese_chars=handle_chinese_chars,
|
| 49 |
+
strip_accents=strip_accents,
|
| 50 |
+
lowercase=lowercase,
|
| 51 |
+
)
|
| 52 |
+
tokenizer.pre_tokenizer = BertPreTokenizer()
|
| 53 |
+
|
| 54 |
+
if vocab is not None:
|
| 55 |
+
sep_token_id = tokenizer.token_to_id(str(sep_token))
|
| 56 |
+
if sep_token_id is None:
|
| 57 |
+
raise TypeError("sep_token not found in the vocabulary")
|
| 58 |
+
cls_token_id = tokenizer.token_to_id(str(cls_token))
|
| 59 |
+
if cls_token_id is None:
|
| 60 |
+
raise TypeError("cls_token not found in the vocabulary")
|
| 61 |
+
|
| 62 |
+
tokenizer.post_processor = BertProcessing((str(sep_token), sep_token_id), (str(cls_token), cls_token_id))
|
| 63 |
+
tokenizer.decoder = decoders.WordPiece(prefix=wordpieces_prefix)
|
| 64 |
+
|
| 65 |
+
parameters = {
|
| 66 |
+
"model": "BertWordPiece",
|
| 67 |
+
"unk_token": unk_token,
|
| 68 |
+
"sep_token": sep_token,
|
| 69 |
+
"cls_token": cls_token,
|
| 70 |
+
"pad_token": pad_token,
|
| 71 |
+
"mask_token": mask_token,
|
| 72 |
+
"clean_text": clean_text,
|
| 73 |
+
"handle_chinese_chars": handle_chinese_chars,
|
| 74 |
+
"strip_accents": strip_accents,
|
| 75 |
+
"lowercase": lowercase,
|
| 76 |
+
"wordpieces_prefix": wordpieces_prefix,
|
| 77 |
+
}
|
| 78 |
+
|
| 79 |
+
super().__init__(tokenizer, parameters)
|
| 80 |
+
|
| 81 |
+
@staticmethod
|
| 82 |
+
def from_file(vocab: str, **kwargs):
|
| 83 |
+
vocab = WordPiece.read_file(vocab)
|
| 84 |
+
return BertWordPieceTokenizer(vocab, **kwargs)
|
| 85 |
+
|
| 86 |
+
def train(
|
| 87 |
+
self,
|
| 88 |
+
files: Union[str, List[str]],
|
| 89 |
+
vocab_size: int = 30000,
|
| 90 |
+
min_frequency: int = 2,
|
| 91 |
+
limit_alphabet: int = 1000,
|
| 92 |
+
initial_alphabet: List[str] = [],
|
| 93 |
+
special_tokens: List[Union[str, AddedToken]] = [
|
| 94 |
+
"[PAD]",
|
| 95 |
+
"[UNK]",
|
| 96 |
+
"[CLS]",
|
| 97 |
+
"[SEP]",
|
| 98 |
+
"[MASK]",
|
| 99 |
+
],
|
| 100 |
+
show_progress: bool = True,
|
| 101 |
+
wordpieces_prefix: str = "##",
|
| 102 |
+
):
|
| 103 |
+
"""Train the model using the given files"""
|
| 104 |
+
|
| 105 |
+
trainer = trainers.WordPieceTrainer(
|
| 106 |
+
vocab_size=vocab_size,
|
| 107 |
+
min_frequency=min_frequency,
|
| 108 |
+
limit_alphabet=limit_alphabet,
|
| 109 |
+
initial_alphabet=initial_alphabet,
|
| 110 |
+
special_tokens=special_tokens,
|
| 111 |
+
show_progress=show_progress,
|
| 112 |
+
continuing_subword_prefix=wordpieces_prefix,
|
| 113 |
+
)
|
| 114 |
+
if isinstance(files, str):
|
| 115 |
+
files = [files]
|
| 116 |
+
self._tokenizer.train(files, trainer=trainer)
|
| 117 |
+
|
| 118 |
+
def train_from_iterator(
|
| 119 |
+
self,
|
| 120 |
+
iterator: Union[Iterator[str], Iterator[Iterator[str]]],
|
| 121 |
+
vocab_size: int = 30000,
|
| 122 |
+
min_frequency: int = 2,
|
| 123 |
+
limit_alphabet: int = 1000,
|
| 124 |
+
initial_alphabet: List[str] = [],
|
| 125 |
+
special_tokens: List[Union[str, AddedToken]] = [
|
| 126 |
+
"[PAD]",
|
| 127 |
+
"[UNK]",
|
| 128 |
+
"[CLS]",
|
| 129 |
+
"[SEP]",
|
| 130 |
+
"[MASK]",
|
| 131 |
+
],
|
| 132 |
+
show_progress: bool = True,
|
| 133 |
+
wordpieces_prefix: str = "##",
|
| 134 |
+
length: Optional[int] = None,
|
| 135 |
+
):
|
| 136 |
+
"""Train the model using the given iterator"""
|
| 137 |
+
|
| 138 |
+
trainer = trainers.WordPieceTrainer(
|
| 139 |
+
vocab_size=vocab_size,
|
| 140 |
+
min_frequency=min_frequency,
|
| 141 |
+
limit_alphabet=limit_alphabet,
|
| 142 |
+
initial_alphabet=initial_alphabet,
|
| 143 |
+
special_tokens=special_tokens,
|
| 144 |
+
show_progress=show_progress,
|
| 145 |
+
continuing_subword_prefix=wordpieces_prefix,
|
| 146 |
+
)
|
| 147 |
+
self._tokenizer.train_from_iterator(
|
| 148 |
+
iterator,
|
| 149 |
+
trainer=trainer,
|
| 150 |
+
length=length,
|
| 151 |
+
)
|
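A sketch of training and using BertWordPieceTokenizer end to end (the two-sentence corpus and the vocabulary size are illustrative):

from tokenizers.implementations import BertWordPieceTokenizer

bert_tok = BertWordPieceTokenizer(lowercase=True)
corpus = ["Hello there!", "General Kenobi."]
bert_tok.train_from_iterator(corpus, vocab_size=200, min_frequency=1, length=len(corpus))

print(bert_tok.encode("Hello there!").tokens)  # continuation pieces are prefixed with "##"
bert_tok.save_model(".")                       # writes vocab.txt to the current directory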
source/tokenizers/implementations/byte_level_bpe.py
ADDED
|
@@ -0,0 +1,122 @@
| 1 |
+
from typing import Dict, Iterator, List, Optional, Tuple, Union
|
| 2 |
+
|
| 3 |
+
from tokenizers import AddedToken, Tokenizer, decoders, pre_tokenizers, processors, trainers
|
| 4 |
+
from tokenizers.models import BPE
|
| 5 |
+
from tokenizers.normalizers import Lowercase, Sequence, unicode_normalizer_from_str
|
| 6 |
+
|
| 7 |
+
from .base_tokenizer import BaseTokenizer
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class ByteLevelBPETokenizer(BaseTokenizer):
|
| 11 |
+
"""ByteLevelBPETokenizer
|
| 12 |
+
|
| 13 |
+
Represents a Byte-level BPE as introduced by OpenAI with their GPT-2 model
|
| 14 |
+
"""
|
| 15 |
+
|
| 16 |
+
def __init__(
|
| 17 |
+
self,
|
| 18 |
+
vocab: Optional[Union[str, Dict[str, int]]] = None,
|
| 19 |
+
merges: Optional[Union[str, List[Tuple[str, str]]]] = None,
|
| 20 |
+
add_prefix_space: bool = False,
|
| 21 |
+
lowercase: bool = False,
|
| 22 |
+
dropout: Optional[float] = None,
|
| 23 |
+
unicode_normalizer: Optional[str] = None,
|
| 24 |
+
continuing_subword_prefix: Optional[str] = None,
|
| 25 |
+
end_of_word_suffix: Optional[str] = None,
|
| 26 |
+
trim_offsets: bool = False,
|
| 27 |
+
):
|
| 28 |
+
if vocab is not None and merges is not None:
|
| 29 |
+
tokenizer = Tokenizer(
|
| 30 |
+
BPE(
|
| 31 |
+
vocab,
|
| 32 |
+
merges,
|
| 33 |
+
dropout=dropout,
|
| 34 |
+
continuing_subword_prefix=continuing_subword_prefix or "",
|
| 35 |
+
end_of_word_suffix=end_of_word_suffix or "",
|
| 36 |
+
)
|
| 37 |
+
)
|
| 38 |
+
else:
|
| 39 |
+
tokenizer = Tokenizer(BPE())
|
| 40 |
+
|
| 41 |
+
# Check for Unicode normalization first (before everything else)
|
| 42 |
+
normalizers = []
|
| 43 |
+
|
| 44 |
+
if unicode_normalizer:
|
| 45 |
+
normalizers += [unicode_normalizer_from_str(unicode_normalizer)]
|
| 46 |
+
|
| 47 |
+
if lowercase:
|
| 48 |
+
normalizers += [Lowercase()]
|
| 49 |
+
|
| 50 |
+
# Create the normalizer structure
|
| 51 |
+
if len(normalizers) > 0:
|
| 52 |
+
if len(normalizers) > 1:
|
| 53 |
+
tokenizer.normalizer = Sequence(normalizers)
|
| 54 |
+
else:
|
| 55 |
+
tokenizer.normalizer = normalizers[0]
|
| 56 |
+
|
| 57 |
+
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=add_prefix_space)
|
| 58 |
+
tokenizer.decoder = decoders.ByteLevel()
|
| 59 |
+
tokenizer.post_processor = processors.ByteLevel(trim_offsets=trim_offsets)
|
| 60 |
+
|
| 61 |
+
parameters = {
|
| 62 |
+
"model": "ByteLevelBPE",
|
| 63 |
+
"add_prefix_space": add_prefix_space,
|
| 64 |
+
"lowercase": lowercase,
|
| 65 |
+
"dropout": dropout,
|
| 66 |
+
"unicode_normalizer": unicode_normalizer,
|
| 67 |
+
"continuing_subword_prefix": continuing_subword_prefix,
|
| 68 |
+
"end_of_word_suffix": end_of_word_suffix,
|
| 69 |
+
"trim_offsets": trim_offsets,
|
| 70 |
+
}
|
| 71 |
+
|
| 72 |
+
super().__init__(tokenizer, parameters)
|
| 73 |
+
|
| 74 |
+
@staticmethod
|
| 75 |
+
def from_file(vocab_filename: str, merges_filename: str, **kwargs):
|
| 76 |
+
vocab, merges = BPE.read_file(vocab_filename, merges_filename)
|
| 77 |
+
return ByteLevelBPETokenizer(vocab, merges, **kwargs)
|
| 78 |
+
|
| 79 |
+
def train(
|
| 80 |
+
self,
|
| 81 |
+
files: Union[str, List[str]],
|
| 82 |
+
vocab_size: int = 30000,
|
| 83 |
+
min_frequency: int = 2,
|
| 84 |
+
show_progress: bool = True,
|
| 85 |
+
special_tokens: List[Union[str, AddedToken]] = [],
|
| 86 |
+
):
|
| 87 |
+
"""Train the model using the given files"""
|
| 88 |
+
|
| 89 |
+
trainer = trainers.BpeTrainer(
|
| 90 |
+
vocab_size=vocab_size,
|
| 91 |
+
min_frequency=min_frequency,
|
| 92 |
+
show_progress=show_progress,
|
| 93 |
+
special_tokens=special_tokens,
|
| 94 |
+
initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
|
| 95 |
+
)
|
| 96 |
+
if isinstance(files, str):
|
| 97 |
+
files = [files]
|
| 98 |
+
self._tokenizer.train(files, trainer=trainer)
|
| 99 |
+
|
| 100 |
+
def train_from_iterator(
|
| 101 |
+
self,
|
| 102 |
+
iterator: Union[Iterator[str], Iterator[Iterator[str]]],
|
| 103 |
+
vocab_size: int = 30000,
|
| 104 |
+
min_frequency: int = 2,
|
| 105 |
+
show_progress: bool = True,
|
| 106 |
+
special_tokens: List[Union[str, AddedToken]] = [],
|
| 107 |
+
length: Optional[int] = None,
|
| 108 |
+
):
|
| 109 |
+
"""Train the model using the given iterator"""
|
| 110 |
+
|
| 111 |
+
trainer = trainers.BpeTrainer(
|
| 112 |
+
vocab_size=vocab_size,
|
| 113 |
+
min_frequency=min_frequency,
|
| 114 |
+
show_progress=show_progress,
|
| 115 |
+
special_tokens=special_tokens,
|
| 116 |
+
initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
|
| 117 |
+
)
|
| 118 |
+
self._tokenizer.train_from_iterator(
|
| 119 |
+
iterator,
|
| 120 |
+
trainer=trainer,
|
| 121 |
+
length=length,
|
| 122 |
+
)
|
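
A minimal usage sketch of the class above (not part of the added file); the vocab/merges filenames are hypothetical placeholders:

# Hypothetical usage sketch: load a GPT-2-style byte-level BPE from its
# vocab/merges files and round-trip a string through encode/decode.
from tokenizers.implementations import ByteLevelBPETokenizer

tok = ByteLevelBPETokenizer.from_file("vocab.json", "merges.txt", add_prefix_space=True)
ids = tok.encode("Byte-level BPE never needs an <unk> token.").ids
print(tok.decode(ids))  # the byte-level decoder restores the original string
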
source/tokenizers/implementations/char_level_bpe.py
ADDED
@@ -0,0 +1,150 @@
from typing import Dict, Iterator, List, Optional, Tuple, Union

from .. import AddedToken, Tokenizer, decoders, pre_tokenizers, trainers
from ..models import BPE
from ..normalizers import BertNormalizer, Lowercase, Sequence, unicode_normalizer_from_str
from .base_tokenizer import BaseTokenizer


class CharBPETokenizer(BaseTokenizer):
    """Original BPE Tokenizer

    Represents the BPE algorithm, as introduced by Rico Sennrich
    (https://arxiv.org/abs/1508.07909)

    The default settings correspond to the OpenAI GPT BPE tokenizer and differ from the original
    Sennrich subword-nmt implementation in the following options, which you can deactivate:
        - adding a normalizer to clean up the text (deactivate with `bert_normalizer=False`) by:
            * removing any control characters and replacing all whitespace with the classic space.
            * handling Chinese characters by putting spaces around them.
            * stripping all accents.
        - splitting on punctuation in addition to whitespace (deactivate with
          `split_on_whitespace_only=True`)
    """

    def __init__(
        self,
        vocab: Optional[Union[str, Dict[str, int]]] = None,
        merges: Optional[Union[str, List[Tuple[str, str]]]] = None,
        unk_token: Union[str, AddedToken] = "<unk>",
        suffix: str = "</w>",
        dropout: Optional[float] = None,
        lowercase: bool = False,
        unicode_normalizer: Optional[str] = None,
        bert_normalizer: bool = True,
        split_on_whitespace_only: bool = False,
    ):
        if vocab is not None and merges is not None:
            tokenizer = Tokenizer(
                BPE(
                    vocab,
                    merges,
                    dropout=dropout,
                    unk_token=str(unk_token),
                    end_of_word_suffix=suffix,
                )
            )
        else:
            tokenizer = Tokenizer(BPE(unk_token=str(unk_token), dropout=dropout, end_of_word_suffix=suffix))

        if tokenizer.token_to_id(str(unk_token)) is not None:
            tokenizer.add_special_tokens([str(unk_token)])

        # Check for Unicode normalization first (before everything else)
        normalizers = []

        if unicode_normalizer:
            normalizers += [unicode_normalizer_from_str(unicode_normalizer)]

        if bert_normalizer:
            normalizers += [BertNormalizer(lowercase=False)]

        if lowercase:
            normalizers += [Lowercase()]

        # Create the normalizer structure
        if len(normalizers) > 0:
            if len(normalizers) > 1:
                tokenizer.normalizer = Sequence(normalizers)
            else:
                tokenizer.normalizer = normalizers[0]

        if split_on_whitespace_only:
            tokenizer.pre_tokenizer = pre_tokenizers.WhitespaceSplit()
        else:
            tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()

        tokenizer.decoder = decoders.BPEDecoder(suffix=suffix)

        parameters = {
            "model": "BPE",
            "unk_token": unk_token,
            "suffix": suffix,
            "dropout": dropout,
            "lowercase": lowercase,
            "unicode_normalizer": unicode_normalizer,
            "bert_normalizer": bert_normalizer,
            "split_on_whitespace_only": split_on_whitespace_only,
        }

        super().__init__(tokenizer, parameters)

    @staticmethod
    def from_file(vocab_filename: str, merges_filename: str, **kwargs):
        vocab, merges = BPE.read_file(vocab_filename, merges_filename)
        return CharBPETokenizer(vocab, merges, **kwargs)

    def train(
        self,
        files: Union[str, List[str]],
        vocab_size: int = 30000,
        min_frequency: int = 2,
        special_tokens: List[Union[str, AddedToken]] = ["<unk>"],
        limit_alphabet: int = 1000,
        initial_alphabet: List[str] = [],
        suffix: Optional[str] = "</w>",
        show_progress: bool = True,
    ):
        """Train the model using the given files"""

        trainer = trainers.BpeTrainer(
            vocab_size=vocab_size,
            min_frequency=min_frequency,
            special_tokens=special_tokens,
            limit_alphabet=limit_alphabet,
            initial_alphabet=initial_alphabet,
            end_of_word_suffix=suffix,
            show_progress=show_progress,
        )
        if isinstance(files, str):
            files = [files]
        self._tokenizer.train(files, trainer=trainer)

    def train_from_iterator(
        self,
        iterator: Union[Iterator[str], Iterator[Iterator[str]]],
        vocab_size: int = 30000,
        min_frequency: int = 2,
        special_tokens: List[Union[str, AddedToken]] = ["<unk>"],
        limit_alphabet: int = 1000,
        initial_alphabet: List[str] = [],
        suffix: Optional[str] = "</w>",
        show_progress: bool = True,
        length: Optional[int] = None,
    ):
        """Train the model using the given iterator"""

        trainer = trainers.BpeTrainer(
            vocab_size=vocab_size,
            min_frequency=min_frequency,
            special_tokens=special_tokens,
            limit_alphabet=limit_alphabet,
            initial_alphabet=initial_alphabet,
            end_of_word_suffix=suffix,
            show_progress=show_progress,
        )
        self._tokenizer.train_from_iterator(
            iterator,
            trainer=trainer,
            length=length,
        )
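
A minimal usage sketch of the class above (not part of the added file); the toy corpus is a placeholder for illustration only:

# Hypothetical usage sketch: train the character-level BPE on an in-memory corpus,
# then inspect the "</w>" end-of-word suffix this tokenizer attaches.
from tokenizers.implementations import CharBPETokenizer

corpus = ["low lower lowest", "new newer newest"]  # toy data, illustration only
tok = CharBPETokenizer(lowercase=True)
tok.train_from_iterator(corpus, vocab_size=100, min_frequency=1)
print(tok.encode("lowest").tokens)  # subwords; word-final pieces end with "</w>"
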
source/tokenizers/implementations/sentencepiece_bpe.py
ADDED
@@ -0,0 +1,103 @@
from typing import Dict, Iterator, List, Optional, Tuple, Union

from tokenizers import AddedToken, Tokenizer, decoders, pre_tokenizers, trainers
from tokenizers.models import BPE
from tokenizers.normalizers import NFKC

from .base_tokenizer import BaseTokenizer


class SentencePieceBPETokenizer(BaseTokenizer):
    """SentencePiece BPE Tokenizer

    Represents the BPE algorithm, with the pretokenization used by SentencePiece
    """

    def __init__(
        self,
        vocab: Optional[Union[str, Dict[str, int]]] = None,
        merges: Optional[Union[str, List[Tuple[str, str]]]] = None,
        unk_token: Union[str, AddedToken] = "<unk>",
        replacement: str = "▁",
        add_prefix_space: bool = True,
        dropout: Optional[float] = None,
        fuse_unk: Optional[bool] = False,
    ):
        if vocab is not None and merges is not None:
            tokenizer = Tokenizer(BPE(vocab, merges, dropout=dropout, unk_token=unk_token, fuse_unk=fuse_unk))
        else:
            tokenizer = Tokenizer(BPE(dropout=dropout, unk_token=unk_token, fuse_unk=fuse_unk))

        if tokenizer.token_to_id(str(unk_token)) is not None:
            tokenizer.add_special_tokens([str(unk_token)])

        tokenizer.normalizer = NFKC()
        prepend_scheme = "always" if add_prefix_space else "never"
        tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme)
        tokenizer.decoder = decoders.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme)

        parameters = {
            "model": "SentencePieceBPE",
            "unk_token": unk_token,
            "replacement": replacement,
            "add_prefix_space": add_prefix_space,
            "dropout": dropout,
        }

        super().__init__(tokenizer, parameters)

    @staticmethod
    def from_file(vocab_filename: str, merges_filename: str, **kwargs):
        vocab, merges = BPE.read_file(vocab_filename, merges_filename)
        return SentencePieceBPETokenizer(vocab, merges, **kwargs)

    def train(
        self,
        files: Union[str, List[str]],
        vocab_size: int = 30000,
        min_frequency: int = 2,
        special_tokens: List[Union[str, AddedToken]] = ["<unk>"],
        limit_alphabet: int = 1000,
        initial_alphabet: List[str] = [],
        show_progress: bool = True,
    ):
        """Train the model using the given files"""

        trainer = trainers.BpeTrainer(
            vocab_size=vocab_size,
            min_frequency=min_frequency,
            special_tokens=special_tokens,
            limit_alphabet=limit_alphabet,
            initial_alphabet=initial_alphabet,
            show_progress=show_progress,
        )
        if isinstance(files, str):
            files = [files]
        self._tokenizer.train(files, trainer=trainer)

    def train_from_iterator(
        self,
        iterator: Union[Iterator[str], Iterator[Iterator[str]]],
        vocab_size: int = 30000,
        min_frequency: int = 2,
        special_tokens: List[Union[str, AddedToken]] = ["<unk>"],
        limit_alphabet: int = 1000,
        initial_alphabet: List[str] = [],
        show_progress: bool = True,
        length: Optional[int] = None,
    ):
        """Train the model using the given iterator"""

        trainer = trainers.BpeTrainer(
            vocab_size=vocab_size,
            min_frequency=min_frequency,
            special_tokens=special_tokens,
            limit_alphabet=limit_alphabet,
            initial_alphabet=initial_alphabet,
            show_progress=show_progress,
        )
        self._tokenizer.train_from_iterator(
            iterator,
            trainer=trainer,
            length=length,
        )
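
A minimal usage sketch of the class above (not part of the added file); the two training sentences are placeholder data:

# Hypothetical usage sketch: the Metaspace pre-tokenizer replaces spaces with "▁",
# so tokens carry whitespace information the way SentencePiece does.
from tokenizers.implementations import SentencePieceBPETokenizer

tok = SentencePieceBPETokenizer()
tok.train_from_iterator(["hello world", "hello there"], vocab_size=60, special_tokens=["<unk>"])
print(tok.encode("hello world").tokens)  # e.g. ['▁hello', '▁world'] after enough merges
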
source/tokenizers/implementations/sentencepiece_unigram.py
ADDED
@@ -0,0 +1,196 @@
import json
import os
from typing import Iterator, List, Optional, Union, Tuple

from tokenizers import AddedToken, Regex, Tokenizer, decoders, normalizers, pre_tokenizers, trainers
from tokenizers.models import Unigram

from .base_tokenizer import BaseTokenizer


class SentencePieceUnigramTokenizer(BaseTokenizer):
    """SentencePiece Unigram Tokenizer

    Represents the Unigram algorithm, with the pretokenization used by SentencePiece
    """

    def __init__(
        self,
        vocab: Optional[List[Tuple[str, float]]] = None,
        replacement: str = "▁",
        add_prefix_space: bool = True,
    ):
        if vocab is not None:
            # Let Unigram(..) fail if only one of them is None
            tokenizer = Tokenizer(Unigram(vocab))
        else:
            tokenizer = Tokenizer(Unigram())

        tokenizer.normalizer = normalizers.Sequence(
            [normalizers.Nmt(), normalizers.NFKC(), normalizers.Replace(Regex(" {2,}"), " ")]
        )
        prepend_scheme = "always" if add_prefix_space else "never"
        tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme)
        tokenizer.decoder = decoders.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme)

        parameters = {
            "model": "SentencePieceUnigram",
            "replacement": replacement,
            "add_prefix_space": add_prefix_space,
        }

        super().__init__(tokenizer, parameters)

    def train(
        self,
        files: Union[str, List[str]],
        vocab_size: int = 8000,
        show_progress: bool = True,
        special_tokens: Optional[List[Union[str, AddedToken]]] = None,
        initial_alphabet: Optional[List[str]] = None,
        unk_token: Optional[str] = None,
    ):
        """
        Train the model using the given files

        Args:
            files (:obj:`List[str]`):
                A list of paths to the files that we should use for training
            vocab_size (:obj:`int`):
                The size of the final vocabulary, including all tokens and alphabet.
            show_progress (:obj:`bool`):
                Whether to show progress bars while training.
            special_tokens (:obj:`List[Union[str, AddedToken]]`, `optional`):
                A list of special tokens the model should know of.
            initial_alphabet (:obj:`List[str]`, `optional`):
                A list of characters to include in the initial alphabet, even
                if not seen in the training dataset.
                If the strings contain more than one character, only the first one
                is kept.
            unk_token (:obj:`str`, `optional`):
                The unknown token to be used by the model.
        """

        if special_tokens is None:
            special_tokens = []

        if initial_alphabet is None:
            initial_alphabet = []

        trainer = trainers.UnigramTrainer(
            vocab_size=vocab_size,
            special_tokens=special_tokens,
            show_progress=show_progress,
            initial_alphabet=initial_alphabet,
            unk_token=unk_token,
        )

        if isinstance(files, str):
            files = [files]
        self._tokenizer.train(files, trainer=trainer)

    def train_from_iterator(
        self,
        iterator: Union[Iterator[str], Iterator[Iterator[str]]],
        vocab_size: int = 8000,
        show_progress: bool = True,
        special_tokens: Optional[List[Union[str, AddedToken]]] = None,
        initial_alphabet: Optional[List[str]] = None,
        unk_token: Optional[str] = None,
        length: Optional[int] = None,
    ):
        """
        Train the model using the given iterator

        Args:
            iterator (:obj:`Union[Iterator[str], Iterator[Iterator[str]]]`):
                Any iterator over strings or list of strings
            vocab_size (:obj:`int`):
                The size of the final vocabulary, including all tokens and alphabet.
            show_progress (:obj:`bool`):
                Whether to show progress bars while training.
            special_tokens (:obj:`List[Union[str, AddedToken]]`, `optional`):
                A list of special tokens the model should know of.
            initial_alphabet (:obj:`List[str]`, `optional`):
                A list of characters to include in the initial alphabet, even
                if not seen in the training dataset.
                If the strings contain more than one character, only the first one
                is kept.
            unk_token (:obj:`str`, `optional`):
                The unknown token to be used by the model.
            length (:obj:`int`, `optional`):
                The total number of sequences in the iterator. This is used to
                provide meaningful progress tracking
        """

        if special_tokens is None:
            special_tokens = []

        if initial_alphabet is None:
            initial_alphabet = []

        trainer = trainers.UnigramTrainer(
            vocab_size=vocab_size,
            special_tokens=special_tokens,
            show_progress=show_progress,
            initial_alphabet=initial_alphabet,
            unk_token=unk_token,
        )

        self._tokenizer.train_from_iterator(
            iterator,
            trainer=trainer,
            length=length,
        )

    @staticmethod
    def from_spm(filename: str):
        try:
            import sys

            sys.path.append(".")

            import sentencepiece_model_pb2 as model  # type: ignore[import]
        except Exception:
            raise Exception(
                "You don't seem to have the required protobuf file. In order to use this function you need to run `pip install protobuf` and `wget https://raw.githubusercontent.com/google/sentencepiece/master/python/src/sentencepiece/sentencepiece_model_pb2.py` for us to be able to read the intrinsics of your spm_file. `pip install sentencepiece` is not required."
            )

        m = model.ModelProto()
        m.ParseFromString(open(filename, "rb").read())

        precompiled_charsmap = m.normalizer_spec.precompiled_charsmap
        vocab = [(piece.piece, piece.score) for piece in m.pieces]
        unk_id = m.trainer_spec.unk_id
        model_type = m.trainer_spec.model_type
        byte_fallback = m.trainer_spec.byte_fallback
        if model_type != 1:
            raise Exception(
                "You're trying to run a `Unigram` model but your file was trained with a different algorithm"
            )

        replacement = "▁"
        add_prefix_space = True

        tokenizer = Tokenizer(Unigram(vocab, unk_id, byte_fallback))

        if precompiled_charsmap:
            tokenizer.normalizer = normalizers.Sequence(
                [
                    normalizers.Precompiled(precompiled_charsmap),
                    normalizers.Replace(Regex(" {2,}"), " "),
                ]
            )
        else:
            tokenizer.normalizer = normalizers.Sequence([normalizers.Replace(Regex(" {2,}"), " ")])
        prepend_scheme = "always" if add_prefix_space else "never"
        tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme)
        tokenizer.decoder = decoders.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme)

        parameters = {
            "model": "SentencePieceUnigram",
        }

        obj = BaseTokenizer.__new__(SentencePieceUnigramTokenizer, tokenizer, parameters)  # type: ignore[arg-type]
        BaseTokenizer.__init__(obj, tokenizer, parameters)
        return obj
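
A minimal usage sketch of `from_spm` above (not part of the added file); the model path is a hypothetical placeholder, and the call only works once the `sentencepiece_model_pb2` module described in the error message is available on the path:

# Hypothetical usage sketch: convert an existing SentencePiece unigram model file
# into a tokenizers-native tokenizer, then encode with it.
from tokenizers.implementations import SentencePieceUnigramTokenizer

tok = SentencePieceUnigramTokenizer.from_spm("spm.model")  # "spm.model" is a placeholder
print(tok.encode("Converted from an .spm file").tokens)
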
source/tokenizers/models/__init__.py
ADDED
@@ -0,0 +1,8 @@
# Generated content DO NOT EDIT
from .. import models

Model = models.Model
BPE = models.BPE
Unigram = models.Unigram
WordLevel = models.WordLevel
WordPiece = models.WordPiece
source/tokenizers/models/__init__.pyi
ADDED
@@ -0,0 +1,744 @@
# Generated content DO NOT EDIT
class Model:
    """
    Base class for all models

    The model represents the actual tokenization algorithm. This is the part that
    will contain and manage the learned vocabulary.

    This class cannot be constructed directly. Please use one of the concrete models.
    """
    def __init__(self):
        pass

    def __getstate__(self):
        """ """
        pass

    def __setstate__(self, state):
        """ """
        pass

    def get_trainer(self):
        """
        Get the associated :class:`~tokenizers.trainers.Trainer`

        Retrieve the :class:`~tokenizers.trainers.Trainer` associated to this
        :class:`~tokenizers.models.Model`.

        Returns:
            :class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model
        """
        pass

    def id_to_token(self, id):
        """
        Get the token associated to an ID

        Args:
            id (:obj:`int`):
                An ID to convert to a token

        Returns:
            :obj:`str`: The token associated to the ID
        """
        pass

    def save(self, folder, prefix):
        """
        Save the current model

        Save the current model in the given folder, using the given prefix for the various
        files that will get created.
        Any file with the same name that already exists in this folder will be overwritten.

        Args:
            folder (:obj:`str`):
                The path to the target folder in which to save the various files

            prefix (:obj:`str`, `optional`):
                An optional prefix, used to prefix each file name

        Returns:
            :obj:`List[str]`: The list of saved files
        """
        pass

    def token_to_id(self, tokens):
        """
        Get the ID associated to a token

        Args:
            token (:obj:`str`):
                A token to convert to an ID

        Returns:
            :obj:`int`: The ID associated to the token
        """
        pass

    def tokenize(self, sequence):
        """
        Tokenize a sequence

        Args:
            sequence (:obj:`str`):
                A sequence to tokenize

        Returns:
            A :obj:`List` of :class:`~tokenizers.Token`: The generated tokens
        """
        pass

class BPE(Model):
    """
    An implementation of the BPE (Byte-Pair Encoding) algorithm

    Args:
        vocab (:obj:`Dict[str, int]`, `optional`):
            A dictionary of string keys and their ids :obj:`{"am": 0,...}`

        merges (:obj:`List[Tuple[str, str]]`, `optional`):
            A list of pairs of tokens (:obj:`Tuple[str, str]`) :obj:`[("a", "b"),...]`

        cache_capacity (:obj:`int`, `optional`):
            The number of words that the BPE cache can contain. The cache speeds up
            the process by keeping the results of the merge operations for a number
            of words.

        dropout (:obj:`float`, `optional`):
            A float between 0 and 1 that represents the BPE dropout to use.

        unk_token (:obj:`str`, `optional`):
            The unknown token to be used by the model.

        continuing_subword_prefix (:obj:`str`, `optional`):
            The prefix to attach to subword units that don't represent a beginning of word.

        end_of_word_suffix (:obj:`str`, `optional`):
            The suffix to attach to subword units that represent an end of word.

        fuse_unk (:obj:`bool`, `optional`):
            Whether to fuse any subsequent unknown tokens into a single one

        byte_fallback (:obj:`bool`, `optional`):
            Whether to use the spm byte-fallback trick (defaults to False)

        ignore_merges (:obj:`bool`, `optional`):
            Whether or not to match tokens with the vocab before using merges.
    """
    def __init__(
        self,
        vocab=None,
        merges=None,
        cache_capacity=None,
        dropout=None,
        unk_token=None,
        continuing_subword_prefix=None,
        end_of_word_suffix=None,
        fuse_unk=None,
        byte_fallback=False,
        ignore_merges=False,
    ):
        pass

    def __getstate__(self):
        """ """
        pass

    def __setstate__(self, state):
        """ """
        pass

    @property
    def byte_fallback(self):
        """ """
        pass

    @byte_fallback.setter
    def byte_fallback(self, value):
        """ """
        pass

    @property
    def continuing_subword_prefix(self):
        """ """
        pass

    @continuing_subword_prefix.setter
    def continuing_subword_prefix(self, value):
        """ """
        pass

    @property
    def dropout(self):
        """ """
        pass

    @dropout.setter
    def dropout(self, value):
        """ """
        pass

    @property
    def end_of_word_suffix(self):
        """ """
        pass

    @end_of_word_suffix.setter
    def end_of_word_suffix(self, value):
        """ """
        pass

    @staticmethod
    def from_file(vocab, merges, **kwargs):
        """
        Instantiate a BPE model from the given files.

        This method is roughly equivalent to doing::

            vocab, merges = BPE.read_file(vocab_filename, merges_filename)
            bpe = BPE(vocab, merges)

        If you don't need to keep the :obj:`vocab, merges` values lying around,
        this method is more optimized than manually calling
        :meth:`~tokenizers.models.BPE.read_file` to initialize a :class:`~tokenizers.models.BPE`

        Args:
            vocab (:obj:`str`):
                The path to a :obj:`vocab.json` file

            merges (:obj:`str`):
                The path to a :obj:`merges.txt` file

        Returns:
            :class:`~tokenizers.models.BPE`: An instance of BPE loaded from these files
        """
        pass

    @property
    def fuse_unk(self):
        """ """
        pass

    @fuse_unk.setter
    def fuse_unk(self, value):
        """ """
        pass

    def get_trainer(self):
        """
        Get the associated :class:`~tokenizers.trainers.Trainer`

        Retrieve the :class:`~tokenizers.trainers.Trainer` associated to this
        :class:`~tokenizers.models.Model`.

        Returns:
            :class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model
        """
        pass

    def id_to_token(self, id):
        """
        Get the token associated to an ID

        Args:
            id (:obj:`int`):
                An ID to convert to a token

        Returns:
            :obj:`str`: The token associated to the ID
        """
        pass

    @property
    def ignore_merges(self):
        """ """
        pass

    @ignore_merges.setter
    def ignore_merges(self, value):
        """ """
        pass

    @staticmethod
    def read_file(vocab, merges):
        """
        Read :obj:`vocab.json` and :obj:`merges.txt` files

        This method provides a way to read and parse the content of these files,
        returning the relevant data structures. If you want to instantiate some BPE models
        from memory, this method gives you the expected input from the standard files.

        Args:
            vocab (:obj:`str`):
                The path to a :obj:`vocab.json` file

            merges (:obj:`str`):
                The path to a :obj:`merges.txt` file

        Returns:
            A :obj:`Tuple` with the vocab and the merges:
                The vocabulary and merges loaded into memory
        """
        pass

    def save(self, folder, prefix):
        """
        Save the current model

        Save the current model in the given folder, using the given prefix for the various
        files that will get created.
        Any file with the same name that already exists in this folder will be overwritten.

        Args:
            folder (:obj:`str`):
                The path to the target folder in which to save the various files

            prefix (:obj:`str`, `optional`):
                An optional prefix, used to prefix each file name

        Returns:
            :obj:`List[str]`: The list of saved files
        """
        pass

    def token_to_id(self, tokens):
        """
        Get the ID associated to a token

        Args:
            token (:obj:`str`):
                A token to convert to an ID

        Returns:
            :obj:`int`: The ID associated to the token
        """
        pass

    def tokenize(self, sequence):
        """
        Tokenize a sequence

        Args:
            sequence (:obj:`str`):
                A sequence to tokenize

        Returns:
            A :obj:`List` of :class:`~tokenizers.Token`: The generated tokens
        """
        pass

    @property
    def unk_token(self):
        """ """
        pass

    @unk_token.setter
    def unk_token(self, value):
        """ """
        pass

class Unigram(Model):
    """
    An implementation of the Unigram algorithm

    Args:
        vocab (:obj:`List[Tuple[str, float]]`, `optional`):
            A list of vocabulary items and their relative score [("am", -0.2442),...]
    """
    def __init__(self, vocab=None, unk_id=None, byte_fallback=None):
        pass

    def __getstate__(self):
        """ """
        pass

    def __setstate__(self, state):
        """ """
        pass

    def get_trainer(self):
        """
        Get the associated :class:`~tokenizers.trainers.Trainer`

        Retrieve the :class:`~tokenizers.trainers.Trainer` associated to this
        :class:`~tokenizers.models.Model`.

        Returns:
            :class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model
        """
        pass

    def id_to_token(self, id):
        """
        Get the token associated to an ID

        Args:
            id (:obj:`int`):
                An ID to convert to a token

        Returns:
            :obj:`str`: The token associated to the ID
        """
        pass

    def save(self, folder, prefix):
        """
        Save the current model

        Save the current model in the given folder, using the given prefix for the various
        files that will get created.
        Any file with the same name that already exists in this folder will be overwritten.

        Args:
            folder (:obj:`str`):
                The path to the target folder in which to save the various files

            prefix (:obj:`str`, `optional`):
                An optional prefix, used to prefix each file name

        Returns:
            :obj:`List[str]`: The list of saved files
        """
        pass

    def token_to_id(self, tokens):
        """
        Get the ID associated to a token

        Args:
            token (:obj:`str`):
                A token to convert to an ID

        Returns:
            :obj:`int`: The ID associated to the token
        """
        pass

    def tokenize(self, sequence):
        """
        Tokenize a sequence

        Args:
            sequence (:obj:`str`):
                A sequence to tokenize

        Returns:
            A :obj:`List` of :class:`~tokenizers.Token`: The generated tokens
        """
        pass

class WordLevel(Model):
    """
    An implementation of the WordLevel algorithm

    Most simple tokenizer model based on mapping tokens to their corresponding id.

    Args:
        vocab (:obj:`Dict[str, int]`, `optional`):
            A dictionary of string keys and their ids :obj:`{"am": 0,...}`

        unk_token (:obj:`str`, `optional`):
            The unknown token to be used by the model.
    """
    def __init__(self, vocab=None, unk_token=None):
        pass

    def __getstate__(self):
        """ """
        pass

    def __setstate__(self, state):
        """ """
        pass

    @staticmethod
    def from_file(vocab, unk_token=None):
        """
        Instantiate a WordLevel model from the given file

        This method is roughly equivalent to doing::

            vocab = WordLevel.read_file(vocab_filename)
            wordlevel = WordLevel(vocab)

        If you don't need to keep the :obj:`vocab` values lying around, this method is
        more optimized than manually calling :meth:`~tokenizers.models.WordLevel.read_file` to
        initialize a :class:`~tokenizers.models.WordLevel`

        Args:
            vocab (:obj:`str`):
                The path to a :obj:`vocab.json` file

        Returns:
            :class:`~tokenizers.models.WordLevel`: An instance of WordLevel loaded from file
        """
        pass

    def get_trainer(self):
        """
        Get the associated :class:`~tokenizers.trainers.Trainer`

        Retrieve the :class:`~tokenizers.trainers.Trainer` associated to this
        :class:`~tokenizers.models.Model`.

        Returns:
            :class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model
        """
        pass

    def id_to_token(self, id):
        """
        Get the token associated to an ID

        Args:
            id (:obj:`int`):
                An ID to convert to a token

        Returns:
            :obj:`str`: The token associated to the ID
        """
        pass

    @staticmethod
    def read_file(vocab):
        """
        Read a :obj:`vocab.json`

        This method provides a way to read and parse the content of a vocabulary file,
        returning the relevant data structures. If you want to instantiate some WordLevel models
        from memory, this method gives you the expected input from the standard files.

        Args:
            vocab (:obj:`str`):
                The path to a :obj:`vocab.json` file

        Returns:
            :obj:`Dict[str, int]`: The vocabulary as a :obj:`dict`
        """
        pass

    def save(self, folder, prefix):
        """
        Save the current model

        Save the current model in the given folder, using the given prefix for the various
        files that will get created.
        Any file with the same name that already exists in this folder will be overwritten.

        Args:
            folder (:obj:`str`):
                The path to the target folder in which to save the various files

            prefix (:obj:`str`, `optional`):
                An optional prefix, used to prefix each file name

        Returns:
            :obj:`List[str]`: The list of saved files
        """
        pass

    def token_to_id(self, tokens):
        """
        Get the ID associated to a token

        Args:
            token (:obj:`str`):
                A token to convert to an ID

        Returns:
            :obj:`int`: The ID associated to the token
        """
        pass

    def tokenize(self, sequence):
        """
        Tokenize a sequence

        Args:
            sequence (:obj:`str`):
                A sequence to tokenize

        Returns:
            A :obj:`List` of :class:`~tokenizers.Token`: The generated tokens
        """
        pass

    @property
    def unk_token(self):
        """ """
        pass

    @unk_token.setter
    def unk_token(self, value):
        """ """
        pass

class WordPiece(Model):
    """
    An implementation of the WordPiece algorithm

    Args:
        vocab (:obj:`Dict[str, int]`, `optional`):
            A dictionary of string keys and their ids :obj:`{"am": 0,...}`

        unk_token (:obj:`str`, `optional`):
            The unknown token to be used by the model.

        max_input_chars_per_word (:obj:`int`, `optional`):
            The maximum number of characters to allow in a single word.
    """
    def __init__(self, vocab=None, unk_token="[UNK]", max_input_chars_per_word=100, continuing_subword_prefix="##"):
        pass

    def __getstate__(self):
        """ """
        pass

    def __setstate__(self, state):
        """ """
        pass

    @property
    def continuing_subword_prefix(self):
        """ """
        pass

    @continuing_subword_prefix.setter
    def continuing_subword_prefix(self, value):
        """ """
        pass

    @staticmethod
    def from_file(vocab, **kwargs):
        """
        Instantiate a WordPiece model from the given file

        This method is roughly equivalent to doing::

            vocab = WordPiece.read_file(vocab_filename)
            wordpiece = WordPiece(vocab)

        If you don't need to keep the :obj:`vocab` values lying around, this method is
        more optimized than manually calling :meth:`~tokenizers.models.WordPiece.read_file` to
        initialize a :class:`~tokenizers.models.WordPiece`

        Args:
            vocab (:obj:`str`):
                The path to a :obj:`vocab.txt` file

        Returns:
            :class:`~tokenizers.models.WordPiece`: An instance of WordPiece loaded from file
        """
        pass

    def get_trainer(self):
        """
        Get the associated :class:`~tokenizers.trainers.Trainer`

        Retrieve the :class:`~tokenizers.trainers.Trainer` associated to this
        :class:`~tokenizers.models.Model`.

        Returns:
            :class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model
        """
        pass

    def id_to_token(self, id):
        """
        Get the token associated to an ID

        Args:
            id (:obj:`int`):
                An ID to convert to a token

        Returns:
            :obj:`str`: The token associated to the ID
        """
        pass

    @property
    def max_input_chars_per_word(self):
        """ """
        pass

    @max_input_chars_per_word.setter
    def max_input_chars_per_word(self, value):
        """ """
        pass

    @staticmethod
    def read_file(vocab):
        """
        Read a :obj:`vocab.txt` file

        This method provides a way to read and parse the content of a standard `vocab.txt`
        file as used by the WordPiece Model, returning the relevant data structures. If you
        want to instantiate some WordPiece models from memory, this method gives you the
        expected input from the standard files.

        Args:
            vocab (:obj:`str`):
                The path to a :obj:`vocab.txt` file

        Returns:
            :obj:`Dict[str, int]`: The vocabulary as a :obj:`dict`
        """
        pass

    def save(self, folder, prefix):
        """
        Save the current model

        Save the current model in the given folder, using the given prefix for the various
        files that will get created.
        Any file with the same name that already exists in this folder will be overwritten.

        Args:
            folder (:obj:`str`):
                The path to the target folder in which to save the various files

            prefix (:obj:`str`, `optional`):
                An optional prefix, used to prefix each file name

        Returns:
            :obj:`List[str]`: The list of saved files
        """
        pass

    def token_to_id(self, tokens):
        """
        Get the ID associated to a token

        Args:
            token (:obj:`str`):
                A token to convert to an ID

        Returns:
            :obj:`int`: The ID associated to the token
        """
        pass

    def tokenize(self, sequence):
        """
        Tokenize a sequence

        Args:
            sequence (:obj:`str`):
                A sequence to tokenize

        Returns:
            A :obj:`List` of :class:`~tokenizers.Token`: The generated tokens
        """
        pass

    @property
    def unk_token(self):
        """ """
        pass

    @unk_token.setter
    def unk_token(self, value):
        """ """
        pass
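
A minimal usage sketch of the `BPE.from_file` / `BPE.read_file` pattern documented in the stubs above (not part of the added file); the vocab/merges filenames are hypothetical placeholders:

# Hypothetical usage sketch: the one-shot constructor and the explicit
# read_file + __init__ path are roughly equivalent, per the docstrings above.
from tokenizers import Tokenizer
from tokenizers.models import BPE

bpe = BPE.from_file("vocab.json", "merges.txt", unk_token="<unk>")
vocab, merges = BPE.read_file("vocab.json", "merges.txt")  # same data, kept in memory
tokenizer = Tokenizer(BPE(vocab, merges, unk_token="<unk>"))
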
source/tokenizers/normalizers/__init__.py
ADDED
@@ -0,0 +1,29 @@
from .. import normalizers


Normalizer = normalizers.Normalizer
BertNormalizer = normalizers.BertNormalizer
NFD = normalizers.NFD
NFKD = normalizers.NFKD
NFC = normalizers.NFC
NFKC = normalizers.NFKC
Sequence = normalizers.Sequence
Lowercase = normalizers.Lowercase
Prepend = normalizers.Prepend
Strip = normalizers.Strip
StripAccents = normalizers.StripAccents
Nmt = normalizers.Nmt
Precompiled = normalizers.Precompiled
Replace = normalizers.Replace
ByteLevel = normalizers.ByteLevel

NORMALIZERS = {"nfc": NFC, "nfd": NFD, "nfkc": NFKC, "nfkd": NFKD}


def unicode_normalizer_from_str(normalizer: str) -> Normalizer:
    if normalizer not in NORMALIZERS:
        raise ValueError(
            "{} is not a known unicode normalizer. Available are {}".format(normalizer, NORMALIZERS.keys())
        )

    return NORMALIZERS[normalizer]()
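
A minimal usage sketch of the helper above (not part of the added file):

# Hypothetical usage sketch: look up a unicode normalizer by its lowercase name.
from tokenizers.normalizers import unicode_normalizer_from_str

nfkc = unicode_normalizer_from_str("nfkc")
print(nfkc.normalize_str("ﬁve"))  # "five": NFKC decomposes the "ﬁ" ligature
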
source/tokenizers/normalizers/__init__.pyi
ADDED
|
@@ -0,0 +1,946 @@
| 1 |
+
# Generated content DO NOT EDIT
|
| 2 |
+
class Normalizer:
|
| 3 |
+
"""
|
| 4 |
+
Base class for all normalizers
|
| 5 |
+
|
| 6 |
+
This class is not supposed to be instantiated directly. Instead, any implementation of a
|
| 7 |
+
Normalizer will return an instance of this class when instantiated.
|
| 8 |
+
"""
|
| 9 |
+
def __getstate__(self):
|
| 10 |
+
""" """
|
| 11 |
+
pass
|
| 12 |
+
|
| 13 |
+
def __setstate__(self, state):
|
| 14 |
+
""" """
|
| 15 |
+
pass
|
| 16 |
+
|
| 17 |
+
@staticmethod
|
| 18 |
+
def custom(normalizer):
|
| 19 |
+
""" """
|
| 20 |
+
pass
|
| 21 |
+
|
| 22 |
+
def normalize(self, normalized):
|
| 23 |
+
"""
|
| 24 |
+
Normalize a :class:`~tokenizers.NormalizedString` in-place
|
| 25 |
+
|
| 26 |
+
This method allows to modify a :class:`~tokenizers.NormalizedString` to
|
| 27 |
+
keep track of the alignment information. If you just want to see the result
|
| 28 |
+
of the normalization on a raw string, you can use
|
| 29 |
+
:meth:`~tokenizers.normalizers.Normalizer.normalize_str`
|
| 30 |
+
|
| 31 |
+
Args:
|
| 32 |
+
normalized (:class:`~tokenizers.NormalizedString`):
|
| 33 |
+
The normalized string on which to apply this
|
| 34 |
+
:class:`~tokenizers.normalizers.Normalizer`
|
| 35 |
+
"""
|
| 36 |
+
pass
|
| 37 |
+
|
| 38 |
+
def normalize_str(self, sequence):
|
| 39 |
+
"""
|
| 40 |
+
Normalize the given string
|
| 41 |
+
|
| 42 |
+
This method provides a way to visualize the effect of a
|
| 43 |
+
:class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
|
| 44 |
+
information. If you need to get/convert offsets, you can use
|
| 45 |
+
:meth:`~tokenizers.normalizers.Normalizer.normalize`
|
| 46 |
+
|
| 47 |
+
Args:
|
| 48 |
+
sequence (:obj:`str`):
|
| 49 |
+
A string to normalize
|
| 50 |
+
|
| 51 |
+
Returns:
|
| 52 |
+
:obj:`str`: A string after normalization
|
| 53 |
+
"""
|
| 54 |
+
pass
|
| 55 |
+
|
| 56 |
+
class BertNormalizer(Normalizer):
|
| 57 |
+
"""
|
| 58 |
+
BertNormalizer
|
| 59 |
+
|
| 60 |
+
Takes care of normalizing raw text before giving it to a Bert model.
|
| 61 |
+
This includes cleaning the text, handling accents, chinese chars and lowercasing
|
| 62 |
+
|
| 63 |
+
Args:
|
| 64 |
+
clean_text (:obj:`bool`, `optional`, defaults to :obj:`True`):
|
| 65 |
+
Whether to clean the text, by removing any control characters
|
| 66 |
+
and replacing all whitespaces by the classic one.
|
| 67 |
+
|
| 68 |
+
handle_chinese_chars (:obj:`bool`, `optional`, defaults to :obj:`True`):
|
| 69 |
+
Whether to handle chinese chars by putting spaces around them.
|
| 70 |
+
|
| 71 |
+
strip_accents (:obj:`bool`, `optional`):
|
| 72 |
+
Whether to strip all accents. If this option is not specified (ie == None),
|
| 73 |
+
then it will be determined by the value for `lowercase` (as in the original Bert).
|
| 74 |
+
|
| 75 |
+
lowercase (:obj:`bool`, `optional`, defaults to :obj:`True`):
|
| 76 |
+
Whether to lowercase.
|
| 77 |
+
"""
|
| 78 |
+
def __init__(self, clean_text=True, handle_chinese_chars=True, strip_accents=None, lowercase=True):
|
| 79 |
+
pass
|
| 80 |
+
|
| 81 |
+
def __getstate__(self):
|
| 82 |
+
""" """
|
| 83 |
+
pass
|
| 84 |
+
|
| 85 |
+
def __setstate__(self, state):
|
| 86 |
+
""" """
|
| 87 |
+
pass
|
| 88 |
+
|
| 89 |
+
@property
|
| 90 |
+
def clean_text(self):
|
| 91 |
+
""" """
|
| 92 |
+
pass
|
| 93 |
+
|
| 94 |
+
@clean_text.setter
|
| 95 |
+
def clean_text(self, value):
|
| 96 |
+
""" """
|
| 97 |
+
pass
|
| 98 |
+
|
| 99 |
+
@staticmethod
|
| 100 |
+
def custom(normalizer):
|
| 101 |
+
""" """
|
| 102 |
+
pass
|
| 103 |
+
|
| 104 |
+
@property
|
| 105 |
+
def handle_chinese_chars(self):
|
| 106 |
+
""" """
|
| 107 |
+
pass
|
| 108 |
+
|
| 109 |
+
@handle_chinese_chars.setter
|
| 110 |
+
def handle_chinese_chars(self, value):
|
| 111 |
+
""" """
|
| 112 |
+
pass
|
| 113 |
+
|
| 114 |
+
@property
|
| 115 |
+
def lowercase(self):
|
| 116 |
+
""" """
|
| 117 |
+
pass
|
| 118 |
+
|
| 119 |
+
@lowercase.setter
|
| 120 |
+
def lowercase(self, value):
|
| 121 |
+
""" """
|
| 122 |
+
pass
|
| 123 |
+
|
| 124 |
+
def normalize(self, normalized):
|
| 125 |
+
"""
|
| 126 |
+
Normalize a :class:`~tokenizers.NormalizedString` in-place
|
| 127 |
+
|
| 128 |
+
This method allows to modify a :class:`~tokenizers.NormalizedString` to
|
| 129 |
+
keep track of the alignment information. If you just want to see the result
|
| 130 |
+
of the normalization on a raw string, you can use
|
| 131 |
+
:meth:`~tokenizers.normalizers.Normalizer.normalize_str`
|
| 132 |
+
|
| 133 |
+
Args:
|
| 134 |
+
normalized (:class:`~tokenizers.NormalizedString`):
|
| 135 |
+
The normalized string on which to apply this
|
| 136 |
+
:class:`~tokenizers.normalizers.Normalizer`
|
| 137 |
+
"""
|
| 138 |
+
pass
|
| 139 |
+
|
| 140 |
+
def normalize_str(self, sequence):
|
| 141 |
+
"""
|
| 142 |
+
Normalize the given string
|
| 143 |
+
|
| 144 |
+
This method provides a way to visualize the effect of a
|
| 145 |
+
:class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
|
| 146 |
+
information. If you need to get/convert offsets, you can use
|
| 147 |
+
:meth:`~tokenizers.normalizers.Normalizer.normalize`
|
| 148 |
+
|
| 149 |
+
Args:
|
| 150 |
+
sequence (:obj:`str`):
|
| 151 |
+
A string to normalize
|
| 152 |
+
|
| 153 |
+
Returns:
|
| 154 |
+
:obj:`str`: A string after normalization
|
| 155 |
+
"""
|
| 156 |
+
pass
|
| 157 |
+
|
| 158 |
+
@property
|
| 159 |
+
def strip_accents(self):
|
| 160 |
+
""" """
|
| 161 |
+
pass
|
| 162 |
+
|
| 163 |
+
@strip_accents.setter
|
| 164 |
+
def strip_accents(self, value):
|
| 165 |
+
""" """
|
| 166 |
+
pass
|
| 167 |
+
|
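Based on the BertNormalizer arguments documented above, a hedged usage sketch (the sample string and expected output follow the usual BERT normalization behaviour):

```python
from tokenizers.normalizers import BertNormalizer

norm = BertNormalizer(clean_text=True, handle_chinese_chars=True,
                      strip_accents=None, lowercase=True)
# With strip_accents=None, accent stripping follows lowercase=True,
# as in the original BERT.
print(norm.normalize_str("Héllò hôw are ü?"))   # -> "hello how are u?"
```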
| 168 |
+
class ByteLevel(Normalizer):
|
| 169 |
+
"""
|
| 170 |
+
Bytelevel Normalizer
|
| 171 |
+
"""
|
| 172 |
+
def __init__(self):
|
| 173 |
+
pass
|
| 174 |
+
|
| 175 |
+
def __getstate__(self):
|
| 176 |
+
""" """
|
| 177 |
+
pass
|
| 178 |
+
|
| 179 |
+
def __setstate__(self, state):
|
| 180 |
+
""" """
|
| 181 |
+
pass
|
| 182 |
+
|
| 183 |
+
@staticmethod
|
| 184 |
+
def custom(normalizer):
|
| 185 |
+
""" """
|
| 186 |
+
pass
|
| 187 |
+
|
| 188 |
+
def normalize(self, normalized):
|
| 189 |
+
"""
|
| 190 |
+
Normalize a :class:`~tokenizers.NormalizedString` in-place
|
| 191 |
+
|
| 192 |
+
This method allows to modify a :class:`~tokenizers.NormalizedString` to
|
| 193 |
+
keep track of the alignment information. If you just want to see the result
|
| 194 |
+
of the normalization on a raw string, you can use
|
| 195 |
+
:meth:`~tokenizers.normalizers.Normalizer.normalize_str`
|
| 196 |
+
|
| 197 |
+
Args:
|
| 198 |
+
normalized (:class:`~tokenizers.NormalizedString`):
|
| 199 |
+
The normalized string on which to apply this
|
| 200 |
+
:class:`~tokenizers.normalizers.Normalizer`
|
| 201 |
+
"""
|
| 202 |
+
pass
|
| 203 |
+
|
| 204 |
+
def normalize_str(self, sequence):
|
| 205 |
+
"""
|
| 206 |
+
Normalize the given string
|
| 207 |
+
|
| 208 |
+
This method provides a way to visualize the effect of a
|
| 209 |
+
:class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
|
| 210 |
+
information. If you need to get/convert offsets, you can use
|
| 211 |
+
:meth:`~tokenizers.normalizers.Normalizer.normalize`
|
| 212 |
+
|
| 213 |
+
Args:
|
| 214 |
+
sequence (:obj:`str`):
|
| 215 |
+
A string to normalize
|
| 216 |
+
|
| 217 |
+
Returns:
|
| 218 |
+
:obj:`str`: A string after normalization
|
| 219 |
+
"""
|
| 220 |
+
pass
|
| 221 |
+
|
| 222 |
+
class Lowercase(Normalizer):
|
| 223 |
+
"""
|
| 224 |
+
Lowercase Normalizer
|
| 225 |
+
"""
|
| 226 |
+
def __init__(self):
|
| 227 |
+
pass
|
| 228 |
+
|
| 229 |
+
def __getstate__(self):
|
| 230 |
+
""" """
|
| 231 |
+
pass
|
| 232 |
+
|
| 233 |
+
def __setstate__(self, state):
|
| 234 |
+
""" """
|
| 235 |
+
pass
|
| 236 |
+
|
| 237 |
+
@staticmethod
|
| 238 |
+
def custom(normalizer):
|
| 239 |
+
""" """
|
| 240 |
+
pass
|
| 241 |
+
|
| 242 |
+
def normalize(self, normalized):
|
| 243 |
+
"""
|
| 244 |
+
Normalize a :class:`~tokenizers.NormalizedString` in-place
|
| 245 |
+
|
| 246 |
+
This method allows to modify a :class:`~tokenizers.NormalizedString` to
|
| 247 |
+
keep track of the alignment information. If you just want to see the result
|
| 248 |
+
of the normalization on a raw string, you can use
|
| 249 |
+
:meth:`~tokenizers.normalizers.Normalizer.normalize_str`
|
| 250 |
+
|
| 251 |
+
Args:
|
| 252 |
+
normalized (:class:`~tokenizers.NormalizedString`):
|
| 253 |
+
The normalized string on which to apply this
|
| 254 |
+
:class:`~tokenizers.normalizers.Normalizer`
|
| 255 |
+
"""
|
| 256 |
+
pass
|
| 257 |
+
|
| 258 |
+
def normalize_str(self, sequence):
|
| 259 |
+
"""
|
| 260 |
+
Normalize the given string
|
| 261 |
+
|
| 262 |
+
This method provides a way to visualize the effect of a
|
| 263 |
+
:class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
|
| 264 |
+
information. If you need to get/convert offsets, you can use
|
| 265 |
+
:meth:`~tokenizers.normalizers.Normalizer.normalize`
|
| 266 |
+
|
| 267 |
+
Args:
|
| 268 |
+
sequence (:obj:`str`):
|
| 269 |
+
A string to normalize
|
| 270 |
+
|
| 271 |
+
Returns:
|
| 272 |
+
:obj:`str`: A string after normalization
|
| 273 |
+
"""
|
| 274 |
+
pass
|
| 275 |
+
|
| 276 |
+
class NFC(Normalizer):
|
| 277 |
+
"""
|
| 278 |
+
NFC Unicode Normalizer
|
| 279 |
+
"""
|
| 280 |
+
def __init__(self):
|
| 281 |
+
pass
|
| 282 |
+
|
| 283 |
+
def __getstate__(self):
|
| 284 |
+
""" """
|
| 285 |
+
pass
|
| 286 |
+
|
| 287 |
+
def __setstate__(self, state):
|
| 288 |
+
""" """
|
| 289 |
+
pass
|
| 290 |
+
|
| 291 |
+
@staticmethod
|
| 292 |
+
def custom(normalizer):
|
| 293 |
+
""" """
|
| 294 |
+
pass
|
| 295 |
+
|
| 296 |
+
def normalize(self, normalized):
|
| 297 |
+
"""
|
| 298 |
+
Normalize a :class:`~tokenizers.NormalizedString` in-place
|
| 299 |
+
|
| 300 |
+
This method allows to modify a :class:`~tokenizers.NormalizedString` to
|
| 301 |
+
keep track of the alignment information. If you just want to see the result
|
| 302 |
+
of the normalization on a raw string, you can use
|
| 303 |
+
:meth:`~tokenizers.normalizers.Normalizer.normalize_str`
|
| 304 |
+
|
| 305 |
+
Args:
|
| 306 |
+
normalized (:class:`~tokenizers.NormalizedString`):
|
| 307 |
+
The normalized string on which to apply this
|
| 308 |
+
:class:`~tokenizers.normalizers.Normalizer`
|
| 309 |
+
"""
|
| 310 |
+
pass
|
| 311 |
+
|
| 312 |
+
def normalize_str(self, sequence):
|
| 313 |
+
"""
|
| 314 |
+
Normalize the given string
|
| 315 |
+
|
| 316 |
+
This method provides a way to visualize the effect of a
|
| 317 |
+
:class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
|
| 318 |
+
information. If you need to get/convert offsets, you can use
|
| 319 |
+
:meth:`~tokenizers.normalizers.Normalizer.normalize`
|
| 320 |
+
|
| 321 |
+
Args:
|
| 322 |
+
sequence (:obj:`str`):
|
| 323 |
+
A string to normalize
|
| 324 |
+
|
| 325 |
+
Returns:
|
| 326 |
+
:obj:`str`: A string after normalization
|
| 327 |
+
"""
|
| 328 |
+
pass
|
| 329 |
+
|
| 330 |
+
class NFD(Normalizer):
|
| 331 |
+
"""
|
| 332 |
+
NFD Unicode Normalizer
|
| 333 |
+
"""
|
| 334 |
+
def __init__(self):
|
| 335 |
+
pass
|
| 336 |
+
|
| 337 |
+
def __getstate__(self):
|
| 338 |
+
""" """
|
| 339 |
+
pass
|
| 340 |
+
|
| 341 |
+
def __setstate__(self, state):
|
| 342 |
+
""" """
|
| 343 |
+
pass
|
| 344 |
+
|
| 345 |
+
@staticmethod
|
| 346 |
+
def custom(normalizer):
|
| 347 |
+
""" """
|
| 348 |
+
pass
|
| 349 |
+
|
| 350 |
+
def normalize(self, normalized):
|
| 351 |
+
"""
|
| 352 |
+
Normalize a :class:`~tokenizers.NormalizedString` in-place
|
| 353 |
+
|
| 354 |
+
This method allows to modify a :class:`~tokenizers.NormalizedString` to
|
| 355 |
+
keep track of the alignment information. If you just want to see the result
|
| 356 |
+
of the normalization on a raw string, you can use
|
| 357 |
+
:meth:`~tokenizers.normalizers.Normalizer.normalize_str`
|
| 358 |
+
|
| 359 |
+
Args:
|
| 360 |
+
normalized (:class:`~tokenizers.NormalizedString`):
|
| 361 |
+
The normalized string on which to apply this
|
| 362 |
+
:class:`~tokenizers.normalizers.Normalizer`
|
| 363 |
+
"""
|
| 364 |
+
pass
|
| 365 |
+
|
| 366 |
+
def normalize_str(self, sequence):
|
| 367 |
+
"""
|
| 368 |
+
Normalize the given string
|
| 369 |
+
|
| 370 |
+
This method provides a way to visualize the effect of a
|
| 371 |
+
:class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
|
| 372 |
+
information. If you need to get/convert offsets, you can use
|
| 373 |
+
:meth:`~tokenizers.normalizers.Normalizer.normalize`
|
| 374 |
+
|
| 375 |
+
Args:
|
| 376 |
+
sequence (:obj:`str`):
|
| 377 |
+
A string to normalize
|
| 378 |
+
|
| 379 |
+
Returns:
|
| 380 |
+
:obj:`str`: A string after normalization
|
| 381 |
+
"""
|
| 382 |
+
pass
|
| 383 |
+
|
| 384 |
+
class NFKC(Normalizer):
|
| 385 |
+
"""
|
| 386 |
+
NFKC Unicode Normalizer
|
| 387 |
+
"""
|
| 388 |
+
def __init__(self):
|
| 389 |
+
pass
|
| 390 |
+
|
| 391 |
+
def __getstate__(self):
|
| 392 |
+
""" """
|
| 393 |
+
pass
|
| 394 |
+
|
| 395 |
+
def __setstate__(self, state):
|
| 396 |
+
""" """
|
| 397 |
+
pass
|
| 398 |
+
|
| 399 |
+
@staticmethod
|
| 400 |
+
def custom(normalizer):
|
| 401 |
+
""" """
|
| 402 |
+
pass
|
| 403 |
+
|
| 404 |
+
def normalize(self, normalized):
|
| 405 |
+
"""
|
| 406 |
+
Normalize a :class:`~tokenizers.NormalizedString` in-place
|
| 407 |
+
|
| 408 |
+
This method allows to modify a :class:`~tokenizers.NormalizedString` to
|
| 409 |
+
keep track of the alignment information. If you just want to see the result
|
| 410 |
+
of the normalization on a raw string, you can use
|
| 411 |
+
:meth:`~tokenizers.normalizers.Normalizer.normalize_str`
|
| 412 |
+
|
| 413 |
+
Args:
|
| 414 |
+
normalized (:class:`~tokenizers.NormalizedString`):
|
| 415 |
+
The normalized string on which to apply this
|
| 416 |
+
:class:`~tokenizers.normalizers.Normalizer`
|
| 417 |
+
"""
|
| 418 |
+
pass
|
| 419 |
+
|
| 420 |
+
def normalize_str(self, sequence):
|
| 421 |
+
"""
|
| 422 |
+
Normalize the given string
|
| 423 |
+
|
| 424 |
+
This method provides a way to visualize the effect of a
|
| 425 |
+
:class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
|
| 426 |
+
information. If you need to get/convert offsets, you can use
|
| 427 |
+
:meth:`~tokenizers.normalizers.Normalizer.normalize`
|
| 428 |
+
|
| 429 |
+
Args:
|
| 430 |
+
sequence (:obj:`str`):
|
| 431 |
+
A string to normalize
|
| 432 |
+
|
| 433 |
+
Returns:
|
| 434 |
+
:obj:`str`: A string after normalization
|
| 435 |
+
"""
|
| 436 |
+
pass
|
| 437 |
+
|
| 438 |
+
class NFKD(Normalizer):
|
| 439 |
+
"""
|
| 440 |
+
NFKD Unicode Normalizer
|
| 441 |
+
"""
|
| 442 |
+
def __init__(self):
|
| 443 |
+
pass
|
| 444 |
+
|
| 445 |
+
def __getstate__(self):
|
| 446 |
+
""" """
|
| 447 |
+
pass
|
| 448 |
+
|
| 449 |
+
def __setstate__(self, state):
|
| 450 |
+
""" """
|
| 451 |
+
pass
|
| 452 |
+
|
| 453 |
+
@staticmethod
|
| 454 |
+
def custom(normalizer):
|
| 455 |
+
""" """
|
| 456 |
+
pass
|
| 457 |
+
|
| 458 |
+
def normalize(self, normalized):
|
| 459 |
+
"""
|
| 460 |
+
Normalize a :class:`~tokenizers.NormalizedString` in-place
|
| 461 |
+
|
| 462 |
+
This method allows to modify a :class:`~tokenizers.NormalizedString` to
|
| 463 |
+
keep track of the alignment information. If you just want to see the result
|
| 464 |
+
of the normalization on a raw string, you can use
|
| 465 |
+
:meth:`~tokenizers.normalizers.Normalizer.normalize_str`
|
| 466 |
+
|
| 467 |
+
Args:
|
| 468 |
+
normalized (:class:`~tokenizers.NormalizedString`):
|
| 469 |
+
The normalized string on which to apply this
|
| 470 |
+
:class:`~tokenizers.normalizers.Normalizer`
|
| 471 |
+
"""
|
| 472 |
+
pass
|
| 473 |
+
|
| 474 |
+
def normalize_str(self, sequence):
|
| 475 |
+
"""
|
| 476 |
+
Normalize the given string
|
| 477 |
+
|
| 478 |
+
This method provides a way to visualize the effect of a
|
| 479 |
+
:class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
|
| 480 |
+
information. If you need to get/convert offsets, you can use
|
| 481 |
+
:meth:`~tokenizers.normalizers.Normalizer.normalize`
|
| 482 |
+
|
| 483 |
+
Args:
|
| 484 |
+
sequence (:obj:`str`):
|
| 485 |
+
A string to normalize
|
| 486 |
+
|
| 487 |
+
Returns:
|
| 488 |
+
:obj:`str`: A string after normalization
|
| 489 |
+
"""
|
| 490 |
+
pass
|
| 491 |
+
|
| 492 |
+
class Nmt(Normalizer):
|
| 493 |
+
"""
|
| 494 |
+
Nmt normalizer
|
| 495 |
+
"""
|
| 496 |
+
def __init__(self):
|
| 497 |
+
pass
|
| 498 |
+
|
| 499 |
+
def __getstate__(self):
|
| 500 |
+
""" """
|
| 501 |
+
pass
|
| 502 |
+
|
| 503 |
+
def __setstate__(self, state):
|
| 504 |
+
""" """
|
| 505 |
+
pass
|
| 506 |
+
|
| 507 |
+
@staticmethod
|
| 508 |
+
def custom(normalizer):
|
| 509 |
+
""" """
|
| 510 |
+
pass
|
| 511 |
+
|
| 512 |
+
def normalize(self, normalized):
|
| 513 |
+
"""
|
| 514 |
+
Normalize a :class:`~tokenizers.NormalizedString` in-place
|
| 515 |
+
|
| 516 |
+
This method allows to modify a :class:`~tokenizers.NormalizedString` to
|
| 517 |
+
keep track of the alignment information. If you just want to see the result
|
| 518 |
+
of the normalization on a raw string, you can use
|
| 519 |
+
:meth:`~tokenizers.normalizers.Normalizer.normalize_str`
|
| 520 |
+
|
| 521 |
+
Args:
|
| 522 |
+
normalized (:class:`~tokenizers.NormalizedString`):
|
| 523 |
+
The normalized string on which to apply this
|
| 524 |
+
:class:`~tokenizers.normalizers.Normalizer`
|
| 525 |
+
"""
|
| 526 |
+
pass
|
| 527 |
+
|
| 528 |
+
def normalize_str(self, sequence):
|
| 529 |
+
"""
|
| 530 |
+
Normalize the given string
|
| 531 |
+
|
| 532 |
+
This method provides a way to visualize the effect of a
|
| 533 |
+
:class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
|
| 534 |
+
information. If you need to get/convert offsets, you can use
|
| 535 |
+
:meth:`~tokenizers.normalizers.Normalizer.normalize`
|
| 536 |
+
|
| 537 |
+
Args:
|
| 538 |
+
sequence (:obj:`str`):
|
| 539 |
+
A string to normalize
|
| 540 |
+
|
| 541 |
+
Returns:
|
| 542 |
+
:obj:`str`: A string after normalization
|
| 543 |
+
"""
|
| 544 |
+
pass
|
| 545 |
+
|
| 546 |
+
class Precompiled(Normalizer):
|
| 547 |
+
"""
|
| 548 |
+
Precompiled normalizer
|
| 549 |
+
Don't use manually it is used for compatibility for SentencePiece.
|
| 550 |
+
"""
|
| 551 |
+
def __init__(self, precompiled_charsmap):
|
| 552 |
+
pass
|
| 553 |
+
|
| 554 |
+
def __getstate__(self):
|
| 555 |
+
""" """
|
| 556 |
+
pass
|
| 557 |
+
|
| 558 |
+
def __setstate__(self, state):
|
| 559 |
+
""" """
|
| 560 |
+
pass
|
| 561 |
+
|
| 562 |
+
@staticmethod
|
| 563 |
+
def custom(normalizer):
|
| 564 |
+
""" """
|
| 565 |
+
pass
|
| 566 |
+
|
| 567 |
+
def normalize(self, normalized):
|
| 568 |
+
"""
|
| 569 |
+
Normalize a :class:`~tokenizers.NormalizedString` in-place
|
| 570 |
+
|
| 571 |
+
This method allows to modify a :class:`~tokenizers.NormalizedString` to
|
| 572 |
+
keep track of the alignment information. If you just want to see the result
|
| 573 |
+
of the normalization on a raw string, you can use
|
| 574 |
+
:meth:`~tokenizers.normalizers.Normalizer.normalize_str`
|
| 575 |
+
|
| 576 |
+
Args:
|
| 577 |
+
normalized (:class:`~tokenizers.NormalizedString`):
|
| 578 |
+
The normalized string on which to apply this
|
| 579 |
+
:class:`~tokenizers.normalizers.Normalizer`
|
| 580 |
+
"""
|
| 581 |
+
pass
|
| 582 |
+
|
| 583 |
+
def normalize_str(self, sequence):
|
| 584 |
+
"""
|
| 585 |
+
Normalize the given string
|
| 586 |
+
|
| 587 |
+
This method provides a way to visualize the effect of a
|
| 588 |
+
:class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
|
| 589 |
+
information. If you need to get/convert offsets, you can use
|
| 590 |
+
:meth:`~tokenizers.normalizers.Normalizer.normalize`
|
| 591 |
+
|
| 592 |
+
Args:
|
| 593 |
+
sequence (:obj:`str`):
|
| 594 |
+
A string to normalize
|
| 595 |
+
|
| 596 |
+
Returns:
|
| 597 |
+
:obj:`str`: A string after normalization
|
| 598 |
+
"""
|
| 599 |
+
pass
|
| 600 |
+
|
| 601 |
+
class Prepend(Normalizer):
|
| 602 |
+
"""
|
| 603 |
+
Prepend normalizer
|
| 604 |
+
"""
|
| 605 |
+
def __init__(self, prepend):
|
| 606 |
+
pass
|
| 607 |
+
|
| 608 |
+
def __getstate__(self):
|
| 609 |
+
""" """
|
| 610 |
+
pass
|
| 611 |
+
|
| 612 |
+
def __setstate__(self, state):
|
| 613 |
+
""" """
|
| 614 |
+
pass
|
| 615 |
+
|
| 616 |
+
@staticmethod
|
| 617 |
+
def custom(normalizer):
|
| 618 |
+
""" """
|
| 619 |
+
pass
|
| 620 |
+
|
| 621 |
+
def normalize(self, normalized):
|
| 622 |
+
"""
|
| 623 |
+
Normalize a :class:`~tokenizers.NormalizedString` in-place
|
| 624 |
+
|
| 625 |
+
This method allows to modify a :class:`~tokenizers.NormalizedString` to
|
| 626 |
+
keep track of the alignment information. If you just want to see the result
|
| 627 |
+
of the normalization on a raw string, you can use
|
| 628 |
+
:meth:`~tokenizers.normalizers.Normalizer.normalize_str`
|
| 629 |
+
|
| 630 |
+
Args:
|
| 631 |
+
normalized (:class:`~tokenizers.NormalizedString`):
|
| 632 |
+
The normalized string on which to apply this
|
| 633 |
+
:class:`~tokenizers.normalizers.Normalizer`
|
| 634 |
+
"""
|
| 635 |
+
pass
|
| 636 |
+
|
| 637 |
+
def normalize_str(self, sequence):
|
| 638 |
+
"""
|
| 639 |
+
Normalize the given string
|
| 640 |
+
|
| 641 |
+
This method provides a way to visualize the effect of a
|
| 642 |
+
:class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
|
| 643 |
+
information. If you need to get/convert offsets, you can use
|
| 644 |
+
:meth:`~tokenizers.normalizers.Normalizer.normalize`
|
| 645 |
+
|
| 646 |
+
Args:
|
| 647 |
+
sequence (:obj:`str`):
|
| 648 |
+
A string to normalize
|
| 649 |
+
|
| 650 |
+
Returns:
|
| 651 |
+
:obj:`str`: A string after normalization
|
| 652 |
+
"""
|
| 653 |
+
pass
|
| 654 |
+
|
| 655 |
+
@property
|
| 656 |
+
def prepend(self):
|
| 657 |
+
""" """
|
| 658 |
+
pass
|
| 659 |
+
|
| 660 |
+
@prepend.setter
|
| 661 |
+
def prepend(self, value):
|
| 662 |
+
""" """
|
| 663 |
+
pass
|
| 664 |
+
|
| 665 |
+
class Replace(Normalizer):
|
| 666 |
+
"""
|
| 667 |
+
Replace normalizer
|
| 668 |
+
"""
|
| 669 |
+
def __init__(self, pattern, content):
|
| 670 |
+
pass
|
| 671 |
+
|
| 672 |
+
def __getstate__(self):
|
| 673 |
+
""" """
|
| 674 |
+
pass
|
| 675 |
+
|
| 676 |
+
def __setstate__(self, state):
|
| 677 |
+
""" """
|
| 678 |
+
pass
|
| 679 |
+
|
| 680 |
+
@property
|
| 681 |
+
def content(self):
|
| 682 |
+
""" """
|
| 683 |
+
pass
|
| 684 |
+
|
| 685 |
+
@content.setter
|
| 686 |
+
def content(self, value):
|
| 687 |
+
""" """
|
| 688 |
+
pass
|
| 689 |
+
|
| 690 |
+
@staticmethod
|
| 691 |
+
def custom(normalizer):
|
| 692 |
+
""" """
|
| 693 |
+
pass
|
| 694 |
+
|
| 695 |
+
def normalize(self, normalized):
|
| 696 |
+
"""
|
| 697 |
+
Normalize a :class:`~tokenizers.NormalizedString` in-place
|
| 698 |
+
|
| 699 |
+
This method allows to modify a :class:`~tokenizers.NormalizedString` to
|
| 700 |
+
keep track of the alignment information. If you just want to see the result
|
| 701 |
+
of the normalization on a raw string, you can use
|
| 702 |
+
:meth:`~tokenizers.normalizers.Normalizer.normalize_str`
|
| 703 |
+
|
| 704 |
+
Args:
|
| 705 |
+
normalized (:class:`~tokenizers.NormalizedString`):
|
| 706 |
+
The normalized string on which to apply this
|
| 707 |
+
:class:`~tokenizers.normalizers.Normalizer`
|
| 708 |
+
"""
|
| 709 |
+
pass
|
| 710 |
+
|
| 711 |
+
def normalize_str(self, sequence):
|
| 712 |
+
"""
|
| 713 |
+
Normalize the given string
|
| 714 |
+
|
| 715 |
+
This method provides a way to visualize the effect of a
|
| 716 |
+
:class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
|
| 717 |
+
information. If you need to get/convert offsets, you can use
|
| 718 |
+
:meth:`~tokenizers.normalizers.Normalizer.normalize`
|
| 719 |
+
|
| 720 |
+
Args:
|
| 721 |
+
sequence (:obj:`str`):
|
| 722 |
+
A string to normalize
|
| 723 |
+
|
| 724 |
+
Returns:
|
| 725 |
+
:obj:`str`: A string after normalization
|
| 726 |
+
"""
|
| 727 |
+
pass
|
| 728 |
+
|
| 729 |
+
@property
|
| 730 |
+
def pattern(self):
|
| 731 |
+
""" """
|
| 732 |
+
pass
|
| 733 |
+
|
| 734 |
+
@pattern.setter
|
| 735 |
+
def pattern(self, value):
|
| 736 |
+
""" """
|
| 737 |
+
pass
|
| 738 |
+
|
| 739 |
+
class Sequence(Normalizer):
|
| 740 |
+
"""
|
| 741 |
+
Allows concatenating multiple other Normalizer as a Sequence.
|
| 742 |
+
All the normalizers run in sequence in the given order
|
| 743 |
+
|
| 744 |
+
Args:
|
| 745 |
+
normalizers (:obj:`List[Normalizer]`):
|
| 746 |
+
A list of Normalizer to be run as a sequence
|
| 747 |
+
"""
|
| 748 |
+
def __init__(self, normalizers):
|
| 749 |
+
pass
|
| 750 |
+
|
| 751 |
+
def __getitem__(self, key):
|
| 752 |
+
"""
|
| 753 |
+
Return self[key].
|
| 754 |
+
"""
|
| 755 |
+
pass
|
| 756 |
+
|
| 757 |
+
def __getnewargs__(self):
|
| 758 |
+
""" """
|
| 759 |
+
pass
|
| 760 |
+
|
| 761 |
+
def __getstate__(self):
|
| 762 |
+
""" """
|
| 763 |
+
pass
|
| 764 |
+
|
| 765 |
+
def __setitem__(self, key, value):
|
| 766 |
+
"""
|
| 767 |
+
Set self[key] to value.
|
| 768 |
+
"""
|
| 769 |
+
pass
|
| 770 |
+
|
| 771 |
+
def __setstate__(self, state):
|
| 772 |
+
""" """
|
| 773 |
+
pass
|
| 774 |
+
|
| 775 |
+
@staticmethod
|
| 776 |
+
def custom(normalizer):
|
| 777 |
+
""" """
|
| 778 |
+
pass
|
| 779 |
+
|
| 780 |
+
def normalize(self, normalized):
|
| 781 |
+
"""
|
| 782 |
+
Normalize a :class:`~tokenizers.NormalizedString` in-place
|
| 783 |
+
|
| 784 |
+
This method allows to modify a :class:`~tokenizers.NormalizedString` to
|
| 785 |
+
keep track of the alignment information. If you just want to see the result
|
| 786 |
+
of the normalization on a raw string, you can use
|
| 787 |
+
:meth:`~tokenizers.normalizers.Normalizer.normalize_str`
|
| 788 |
+
|
| 789 |
+
Args:
|
| 790 |
+
normalized (:class:`~tokenizers.NormalizedString`):
|
| 791 |
+
The normalized string on which to apply this
|
| 792 |
+
:class:`~tokenizers.normalizers.Normalizer`
|
| 793 |
+
"""
|
| 794 |
+
pass
|
| 795 |
+
|
| 796 |
+
def normalize_str(self, sequence):
|
| 797 |
+
"""
|
| 798 |
+
Normalize the given string
|
| 799 |
+
|
| 800 |
+
This method provides a way to visualize the effect of a
|
| 801 |
+
:class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
|
| 802 |
+
information. If you need to get/convert offsets, you can use
|
| 803 |
+
:meth:`~tokenizers.normalizers.Normalizer.normalize`
|
| 804 |
+
|
| 805 |
+
Args:
|
| 806 |
+
sequence (:obj:`str`):
|
| 807 |
+
A string to normalize
|
| 808 |
+
|
| 809 |
+
Returns:
|
| 810 |
+
:obj:`str`: A string after normalization
|
| 811 |
+
"""
|
| 812 |
+
pass
|
| 813 |
+
|
| 814 |
+
class Strip(Normalizer):
|
| 815 |
+
"""
|
| 816 |
+
Strip normalizer
|
| 817 |
+
"""
|
| 818 |
+
def __init__(self, left=True, right=True):
|
| 819 |
+
pass
|
| 820 |
+
|
| 821 |
+
def __getstate__(self):
|
| 822 |
+
""" """
|
| 823 |
+
pass
|
| 824 |
+
|
| 825 |
+
def __setstate__(self, state):
|
| 826 |
+
""" """
|
| 827 |
+
pass
|
| 828 |
+
|
| 829 |
+
@staticmethod
|
| 830 |
+
def custom(normalizer):
|
| 831 |
+
""" """
|
| 832 |
+
pass
|
| 833 |
+
|
| 834 |
+
@property
|
| 835 |
+
def left(self):
|
| 836 |
+
""" """
|
| 837 |
+
pass
|
| 838 |
+
|
| 839 |
+
@left.setter
|
| 840 |
+
def left(self, value):
|
| 841 |
+
""" """
|
| 842 |
+
pass
|
| 843 |
+
|
| 844 |
+
def normalize(self, normalized):
|
| 845 |
+
"""
|
| 846 |
+
Normalize a :class:`~tokenizers.NormalizedString` in-place
|
| 847 |
+
|
| 848 |
+
This method allows to modify a :class:`~tokenizers.NormalizedString` to
|
| 849 |
+
keep track of the alignment information. If you just want to see the result
|
| 850 |
+
of the normalization on a raw string, you can use
|
| 851 |
+
:meth:`~tokenizers.normalizers.Normalizer.normalize_str`
|
| 852 |
+
|
| 853 |
+
Args:
|
| 854 |
+
normalized (:class:`~tokenizers.NormalizedString`):
|
| 855 |
+
The normalized string on which to apply this
|
| 856 |
+
:class:`~tokenizers.normalizers.Normalizer`
|
| 857 |
+
"""
|
| 858 |
+
pass
|
| 859 |
+
|
| 860 |
+
def normalize_str(self, sequence):
|
| 861 |
+
"""
|
| 862 |
+
Normalize the given string
|
| 863 |
+
|
| 864 |
+
This method provides a way to visualize the effect of a
|
| 865 |
+
:class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
|
| 866 |
+
information. If you need to get/convert offsets, you can use
|
| 867 |
+
:meth:`~tokenizers.normalizers.Normalizer.normalize`
|
| 868 |
+
|
| 869 |
+
Args:
|
| 870 |
+
sequence (:obj:`str`):
|
| 871 |
+
A string to normalize
|
| 872 |
+
|
| 873 |
+
Returns:
|
| 874 |
+
:obj:`str`: A string after normalization
|
| 875 |
+
"""
|
| 876 |
+
pass
|
| 877 |
+
|
| 878 |
+
@property
|
| 879 |
+
def right(self):
|
| 880 |
+
""" """
|
| 881 |
+
pass
|
| 882 |
+
|
| 883 |
+
@right.setter
|
| 884 |
+
def right(self, value):
|
| 885 |
+
""" """
|
| 886 |
+
pass
|
| 887 |
+
|
| 888 |
+
class StripAccents(Normalizer):
|
| 889 |
+
"""
|
| 890 |
+
StripAccents normalizer
|
| 891 |
+
"""
|
| 892 |
+
def __init__(self):
|
| 893 |
+
pass
|
| 894 |
+
|
| 895 |
+
def __getstate__(self):
|
| 896 |
+
""" """
|
| 897 |
+
pass
|
| 898 |
+
|
| 899 |
+
def __setstate__(self, state):
|
| 900 |
+
""" """
|
| 901 |
+
pass
|
| 902 |
+
|
| 903 |
+
@staticmethod
|
| 904 |
+
def custom(normalizer):
|
| 905 |
+
""" """
|
| 906 |
+
pass
|
| 907 |
+
|
| 908 |
+
def normalize(self, normalized):
|
| 909 |
+
"""
|
| 910 |
+
Normalize a :class:`~tokenizers.NormalizedString` in-place
|
| 911 |
+
|
| 912 |
+
This method allows to modify a :class:`~tokenizers.NormalizedString` to
|
| 913 |
+
keep track of the alignment information. If you just want to see the result
|
| 914 |
+
of the normalization on a raw string, you can use
|
| 915 |
+
:meth:`~tokenizers.normalizers.Normalizer.normalize_str`
|
| 916 |
+
|
| 917 |
+
Args:
|
| 918 |
+
normalized (:class:`~tokenizers.NormalizedString`):
|
| 919 |
+
The normalized string on which to apply this
|
| 920 |
+
:class:`~tokenizers.normalizers.Normalizer`
|
| 921 |
+
"""
|
| 922 |
+
pass
|
| 923 |
+
|
| 924 |
+
def normalize_str(self, sequence):
|
| 925 |
+
"""
|
| 926 |
+
Normalize the given string
|
| 927 |
+
|
| 928 |
+
This method provides a way to visualize the effect of a
|
| 929 |
+
:class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
|
| 930 |
+
information. If you need to get/convert offsets, you can use
|
| 931 |
+
:meth:`~tokenizers.normalizers.Normalizer.normalize`
|
| 932 |
+
|
| 933 |
+
Args:
|
| 934 |
+
sequence (:obj:`str`):
|
| 935 |
+
A string to normalize
|
| 936 |
+
|
| 937 |
+
Returns:
|
| 938 |
+
:obj:`str`: A string after normalization
|
| 939 |
+
"""
|
| 940 |
+
pass
|
| 941 |
+
|
| 942 |
+
from typing import Dict
|
| 943 |
+
|
| 944 |
+
NORMALIZERS: Dict[str, Normalizer]
|
| 945 |
+
|
| 946 |
+
def unicode_normalizer_from_str(normalizer: str) -> Normalizer: ...
|
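The stubs above also describe `normalizers.Sequence`; a small sketch of composing several normalizers and checking the result with `normalize_str`:

```python
from tokenizers import normalizers
from tokenizers.normalizers import NFD, StripAccents, Lowercase, Strip

# NFD decomposes accented characters, StripAccents drops the combining marks,
# then the text is lowercased and surrounding whitespace is stripped.
seq = normalizers.Sequence([NFD(), StripAccents(), Lowercase(), Strip()])
print(seq.normalize_str("  Héllo Wörld  "))   # -> "hello world"
```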
source/tokenizers/pre_tokenizers/__init__.py ADDED
@@ -0,0 +1,16 @@
+# Generated content DO NOT EDIT
+from .. import pre_tokenizers
+
+PreTokenizer = pre_tokenizers.PreTokenizer
+BertPreTokenizer = pre_tokenizers.BertPreTokenizer
+ByteLevel = pre_tokenizers.ByteLevel
+CharDelimiterSplit = pre_tokenizers.CharDelimiterSplit
+Digits = pre_tokenizers.Digits
+FixedLength = pre_tokenizers.FixedLength
+Metaspace = pre_tokenizers.Metaspace
+Punctuation = pre_tokenizers.Punctuation
+Sequence = pre_tokenizers.Sequence
+Split = pre_tokenizers.Split
+UnicodeScripts = pre_tokenizers.UnicodeScripts
+Whitespace = pre_tokenizers.Whitespace
+WhitespaceSplit = pre_tokenizers.WhitespaceSplit
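A minimal sketch of the pre-tokenizers re-exported above; `pre_tokenize_str` returns a list of (piece, (start, end)) tuples, and the example mirrors the `Digits` docstring further down in this diff:

```python
from tokenizers.pre_tokenizers import Whitespace, Digits, Sequence

# Whitespace splits on word boundaries, then Digits separates each digit.
pre = Sequence([Whitespace(), Digits(individual_digits=True)])
print(pre.pre_tokenize_str("Call 123 please"))
# -> [("Call", (0, 4)), ("1", (5, 6)), ("2", (6, 7)), ("3", (7, 8)), ("please", (9, 15))]
```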
source/tokenizers/pre_tokenizers/__init__.pyi ADDED
@@ -0,0 +1,1015 @@
| 1 |
+
# Generated content DO NOT EDIT
|
| 2 |
+
class PreTokenizer:
|
| 3 |
+
"""
|
| 4 |
+
Base class for all pre-tokenizers
|
| 5 |
+
|
| 6 |
+
This class is not supposed to be instantiated directly. Instead, any implementation of a
|
| 7 |
+
PreTokenizer will return an instance of this class when instantiated.
|
| 8 |
+
"""
|
| 9 |
+
def __getstate__(self):
|
| 10 |
+
""" """
|
| 11 |
+
pass
|
| 12 |
+
|
| 13 |
+
def __setstate__(self, state):
|
| 14 |
+
""" """
|
| 15 |
+
pass
|
| 16 |
+
|
| 17 |
+
@staticmethod
|
| 18 |
+
def custom(pretok):
|
| 19 |
+
""" """
|
| 20 |
+
pass
|
| 21 |
+
|
| 22 |
+
def pre_tokenize(self, pretok):
|
| 23 |
+
"""
|
| 24 |
+
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
|
| 25 |
+
|
| 26 |
+
This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
|
| 27 |
+
keep track of the pre-tokenization, and leverage the capabilities of the
|
| 28 |
+
:class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
|
| 29 |
+
the pre-tokenization of a raw string, you can use
|
| 30 |
+
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
|
| 31 |
+
|
| 32 |
+
Args:
|
| 33 |
+
pretok (:class:`~tokenizers.PreTokenizedString):
|
| 34 |
+
The pre-tokenized string on which to apply this
|
| 35 |
+
:class:`~tokenizers.pre_tokenizers.PreTokenizer`
|
| 36 |
+
"""
|
| 37 |
+
pass
|
| 38 |
+
|
| 39 |
+
def pre_tokenize_str(self, sequence):
|
| 40 |
+
"""
|
| 41 |
+
Pre tokenize the given string
|
| 42 |
+
|
| 43 |
+
This method provides a way to visualize the effect of a
|
| 44 |
+
:class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
|
| 45 |
+
alignment, nor does it provide all the capabilities of the
|
| 46 |
+
:class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
|
| 47 |
+
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
|
| 48 |
+
|
| 49 |
+
Args:
|
| 50 |
+
sequence (:obj:`str`):
|
| 51 |
+
A string to pre-tokeize
|
| 52 |
+
|
| 53 |
+
Returns:
|
| 54 |
+
:obj:`List[Tuple[str, Offsets]]`:
|
| 55 |
+
A list of tuple with the pre-tokenized parts and their offsets
|
| 56 |
+
"""
|
| 57 |
+
pass
|
| 58 |
+
|
| 59 |
+
class BertPreTokenizer(PreTokenizer):
|
| 60 |
+
"""
|
| 61 |
+
BertPreTokenizer
|
| 62 |
+
|
| 63 |
+
This pre-tokenizer splits tokens on spaces, and also on punctuation.
|
| 64 |
+
Each occurrence of a punctuation character will be treated separately.
|
| 65 |
+
"""
|
| 66 |
+
def __init__(self):
|
| 67 |
+
pass
|
| 68 |
+
|
| 69 |
+
def __getstate__(self):
|
| 70 |
+
""" """
|
| 71 |
+
pass
|
| 72 |
+
|
| 73 |
+
def __setstate__(self, state):
|
| 74 |
+
""" """
|
| 75 |
+
pass
|
| 76 |
+
|
| 77 |
+
@staticmethod
|
| 78 |
+
def custom(pretok):
|
| 79 |
+
""" """
|
| 80 |
+
pass
|
| 81 |
+
|
| 82 |
+
def pre_tokenize(self, pretok):
|
| 83 |
+
"""
|
| 84 |
+
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
|
| 85 |
+
|
| 86 |
+
This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
|
| 87 |
+
keep track of the pre-tokenization, and leverage the capabilities of the
|
| 88 |
+
:class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
|
| 89 |
+
the pre-tokenization of a raw string, you can use
|
| 90 |
+
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
|
| 91 |
+
|
| 92 |
+
Args:
|
| 93 |
+
pretok (:class:`~tokenizers.PreTokenizedString):
|
| 94 |
+
The pre-tokenized string on which to apply this
|
| 95 |
+
:class:`~tokenizers.pre_tokenizers.PreTokenizer`
|
| 96 |
+
"""
|
| 97 |
+
pass
|
| 98 |
+
|
| 99 |
+
def pre_tokenize_str(self, sequence):
|
| 100 |
+
"""
|
| 101 |
+
Pre tokenize the given string
|
| 102 |
+
|
| 103 |
+
This method provides a way to visualize the effect of a
|
| 104 |
+
:class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
|
| 105 |
+
alignment, nor does it provide all the capabilities of the
|
| 106 |
+
:class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
|
| 107 |
+
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
|
| 108 |
+
|
| 109 |
+
Args:
|
| 110 |
+
sequence (:obj:`str`):
|
| 111 |
+
A string to pre-tokeize
|
| 112 |
+
|
| 113 |
+
Returns:
|
| 114 |
+
:obj:`List[Tuple[str, Offsets]]`:
|
| 115 |
+
A list of tuple with the pre-tokenized parts and their offsets
|
| 116 |
+
"""
|
| 117 |
+
pass
|
| 118 |
+
|
| 119 |
+
class ByteLevel(PreTokenizer):
|
| 120 |
+
"""
|
| 121 |
+
ByteLevel PreTokenizer
|
| 122 |
+
|
| 123 |
+
This pre-tokenizer takes care of replacing all bytes of the given string
|
| 124 |
+
with a corresponding representation, as well as splitting into words.
|
| 125 |
+
|
| 126 |
+
Args:
|
| 127 |
+
add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`True`):
|
| 128 |
+
Whether to add a space to the first word if there isn't already one. This
|
| 129 |
+
lets us treat `hello` exactly like `say hello`.
|
| 130 |
+
use_regex (:obj:`bool`, `optional`, defaults to :obj:`True`):
|
| 131 |
+
Set this to :obj:`False` to prevent this `pre_tokenizer` from using
|
| 132 |
+
the GPT2 specific regexp for spliting on whitespace.
|
| 133 |
+
"""
|
| 134 |
+
def __init__(self, add_prefix_space=True, trim_offsets=True, use_regex=True):
|
| 135 |
+
pass
|
| 136 |
+
|
| 137 |
+
def __getstate__(self):
|
| 138 |
+
""" """
|
| 139 |
+
pass
|
| 140 |
+
|
| 141 |
+
def __setstate__(self, state):
|
| 142 |
+
""" """
|
| 143 |
+
pass
|
| 144 |
+
|
| 145 |
+
@property
|
| 146 |
+
def add_prefix_space(self):
|
| 147 |
+
""" """
|
| 148 |
+
pass
|
| 149 |
+
|
| 150 |
+
@add_prefix_space.setter
|
| 151 |
+
def add_prefix_space(self, value):
|
| 152 |
+
""" """
|
| 153 |
+
pass
|
| 154 |
+
|
| 155 |
+
@staticmethod
|
| 156 |
+
def alphabet():
|
| 157 |
+
"""
|
| 158 |
+
Returns the alphabet used by this PreTokenizer.
|
| 159 |
+
|
| 160 |
+
Since the ByteLevel works as its name suggests, at the byte level, it
|
| 161 |
+
encodes each byte value to a unique visible character. This means that there is a
|
| 162 |
+
total of 256 different characters composing this alphabet.
|
| 163 |
+
|
| 164 |
+
Returns:
|
| 165 |
+
:obj:`List[str]`: A list of characters that compose the alphabet
|
| 166 |
+
"""
|
| 167 |
+
pass
|
| 168 |
+
|
| 169 |
+
@staticmethod
|
| 170 |
+
def custom(pretok):
|
| 171 |
+
""" """
|
| 172 |
+
pass
|
| 173 |
+
|
| 174 |
+
def pre_tokenize(self, pretok):
|
| 175 |
+
"""
|
| 176 |
+
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
|
| 177 |
+
|
| 178 |
+
This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
|
| 179 |
+
keep track of the pre-tokenization, and leverage the capabilities of the
|
| 180 |
+
:class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
|
| 181 |
+
the pre-tokenization of a raw string, you can use
|
| 182 |
+
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
|
| 183 |
+
|
| 184 |
+
Args:
|
| 185 |
+
pretok (:class:`~tokenizers.PreTokenizedString):
|
| 186 |
+
The pre-tokenized string on which to apply this
|
| 187 |
+
:class:`~tokenizers.pre_tokenizers.PreTokenizer`
|
| 188 |
+
"""
|
| 189 |
+
pass
|
| 190 |
+
|
| 191 |
+
def pre_tokenize_str(self, sequence):
|
| 192 |
+
"""
|
| 193 |
+
Pre tokenize the given string
|
| 194 |
+
|
| 195 |
+
This method provides a way to visualize the effect of a
|
| 196 |
+
:class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
|
| 197 |
+
alignment, nor does it provide all the capabilities of the
|
| 198 |
+
:class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
|
| 199 |
+
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
|
| 200 |
+
|
| 201 |
+
Args:
|
| 202 |
+
sequence (:obj:`str`):
|
| 203 |
+
A string to pre-tokeize
|
| 204 |
+
|
| 205 |
+
Returns:
|
| 206 |
+
:obj:`List[Tuple[str, Offsets]]`:
|
| 207 |
+
A list of tuple with the pre-tokenized parts and their offsets
|
| 208 |
+
"""
|
| 209 |
+
pass
|
| 210 |
+
|
| 211 |
+
@property
|
| 212 |
+
def trim_offsets(self):
|
| 213 |
+
""" """
|
| 214 |
+
pass
|
| 215 |
+
|
| 216 |
+
@trim_offsets.setter
|
| 217 |
+
def trim_offsets(self, value):
|
| 218 |
+
""" """
|
| 219 |
+
pass
|
| 220 |
+
|
| 221 |
+
@property
|
| 222 |
+
def use_regex(self):
|
| 223 |
+
""" """
|
| 224 |
+
pass
|
| 225 |
+
|
| 226 |
+
@use_regex.setter
|
| 227 |
+
def use_regex(self, value):
|
| 228 |
+
""" """
|
| 229 |
+
pass
|
| 230 |
+
|
| 231 |
+
class CharDelimiterSplit(PreTokenizer):
|
| 232 |
+
"""
|
| 233 |
+
This pre-tokenizer simply splits on the provided char. Works like `.split(delimiter)`
|
| 234 |
+
|
| 235 |
+
Args:
|
| 236 |
+
delimiter: str:
|
| 237 |
+
The delimiter char that will be used to split input
|
| 238 |
+
"""
|
| 239 |
+
def __init__(self, delimiter):
|
| 240 |
+
pass
|
| 241 |
+
|
| 242 |
+
def __getnewargs__(self):
|
| 243 |
+
""" """
|
| 244 |
+
pass
|
| 245 |
+
|
| 246 |
+
def __getstate__(self):
|
| 247 |
+
""" """
|
| 248 |
+
pass
|
| 249 |
+
|
| 250 |
+
def __setstate__(self, state):
|
| 251 |
+
""" """
|
| 252 |
+
pass
|
| 253 |
+
|
| 254 |
+
@staticmethod
|
| 255 |
+
def custom(pretok):
|
| 256 |
+
""" """
|
| 257 |
+
pass
|
| 258 |
+
|
| 259 |
+
@property
|
| 260 |
+
def delimiter(self):
|
| 261 |
+
""" """
|
| 262 |
+
pass
|
| 263 |
+
|
| 264 |
+
@delimiter.setter
|
| 265 |
+
def delimiter(self, value):
|
| 266 |
+
""" """
|
| 267 |
+
pass
|
| 268 |
+
|
| 269 |
+
def pre_tokenize(self, pretok):
|
| 270 |
+
"""
|
| 271 |
+
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
|
| 272 |
+
|
| 273 |
+
This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
|
| 274 |
+
keep track of the pre-tokenization, and leverage the capabilities of the
|
| 275 |
+
:class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
|
| 276 |
+
the pre-tokenization of a raw string, you can use
|
| 277 |
+
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
|
| 278 |
+
|
| 279 |
+
Args:
|
| 280 |
+
pretok (:class:`~tokenizers.PreTokenizedString):
|
| 281 |
+
The pre-tokenized string on which to apply this
|
| 282 |
+
:class:`~tokenizers.pre_tokenizers.PreTokenizer`
|
| 283 |
+
"""
|
| 284 |
+
pass
|
| 285 |
+
|
| 286 |
+
def pre_tokenize_str(self, sequence):
|
| 287 |
+
"""
|
| 288 |
+
Pre tokenize the given string
|
| 289 |
+
|
| 290 |
+
This method provides a way to visualize the effect of a
|
| 291 |
+
:class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
|
| 292 |
+
alignment, nor does it provide all the capabilities of the
|
| 293 |
+
:class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
|
| 294 |
+
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
|
| 295 |
+
|
| 296 |
+
Args:
|
| 297 |
+
sequence (:obj:`str`):
|
| 298 |
+
A string to pre-tokeize
|
| 299 |
+
|
| 300 |
+
Returns:
|
| 301 |
+
:obj:`List[Tuple[str, Offsets]]`:
|
| 302 |
+
A list of tuple with the pre-tokenized parts and their offsets
|
| 303 |
+
"""
|
| 304 |
+
pass
|
| 305 |
+
|
| 306 |
+
class Digits(PreTokenizer):
|
| 307 |
+
"""
|
| 308 |
+
This pre-tokenizer simply splits using the digits in separate tokens
|
| 309 |
+
|
| 310 |
+
Args:
|
| 311 |
+
individual_digits (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
| 312 |
+
If set to True, digits will each be separated as follows::
|
| 313 |
+
|
| 314 |
+
"Call 123 please" -> "Call ", "1", "2", "3", " please"
|
| 315 |
+
|
| 316 |
+
If set to False, digits will grouped as follows::
|
| 317 |
+
|
| 318 |
+
"Call 123 please" -> "Call ", "123", " please"
|
| 319 |
+
"""
|
| 320 |
+
def __init__(self, individual_digits=False):
|
| 321 |
+
pass
|
| 322 |
+
|
| 323 |
+
def __getstate__(self):
|
| 324 |
+
""" """
|
| 325 |
+
pass
|
| 326 |
+
|
| 327 |
+
def __setstate__(self, state):
|
| 328 |
+
""" """
|
| 329 |
+
pass
|
| 330 |
+
|
| 331 |
+
@staticmethod
|
| 332 |
+
def custom(pretok):
|
| 333 |
+
""" """
|
| 334 |
+
pass
|
| 335 |
+
|
| 336 |
+
@property
|
| 337 |
+
def individual_digits(self):
|
| 338 |
+
""" """
|
| 339 |
+
pass
|
| 340 |
+
|
| 341 |
+
@individual_digits.setter
|
| 342 |
+
def individual_digits(self, value):
|
| 343 |
+
""" """
|
| 344 |
+
pass
|
| 345 |
+
|
| 346 |
+
def pre_tokenize(self, pretok):
|
| 347 |
+
"""
|
| 348 |
+
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
|
| 349 |
+
|
| 350 |
+
This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
|
| 351 |
+
keep track of the pre-tokenization, and leverage the capabilities of the
|
| 352 |
+
:class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
|
| 353 |
+
the pre-tokenization of a raw string, you can use
|
| 354 |
+
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
|
| 355 |
+
|
| 356 |
+
Args:
|
| 357 |
+
pretok (:class:`~tokenizers.PreTokenizedString):
|
| 358 |
+
The pre-tokenized string on which to apply this
|
| 359 |
+
:class:`~tokenizers.pre_tokenizers.PreTokenizer`
|
| 360 |
+
"""
|
| 361 |
+
pass
|
| 362 |
+
|
| 363 |
+
def pre_tokenize_str(self, sequence):
|
| 364 |
+
"""
|
| 365 |
+
Pre tokenize the given string
|
| 366 |
+
|
| 367 |
+
This method provides a way to visualize the effect of a
|
| 368 |
+
:class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
|
| 369 |
+
alignment, nor does it provide all the capabilities of the
|
| 370 |
+
:class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
|
| 371 |
+
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
|
| 372 |
+
|
| 373 |
+
Args:
|
| 374 |
+
sequence (:obj:`str`):
|
| 375 |
+
A string to pre-tokenize
|
| 376 |
+
|
| 377 |
+
Returns:
|
| 378 |
+
:obj:`List[Tuple[str, Offsets]]`:
|
| 379 |
+
A list of tuples with the pre-tokenized parts and their offsets
|
| 380 |
+
"""
|
| 381 |
+
pass
|
| 382 |
+
|
| 383 |
+
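As a quick check of the two modes described in the docstring, here is a small sketch (the expected outputs follow the docstring's example; the exact offsets are our reading of the API):

```python
from tokenizers.pre_tokenizers import Digits

# individual_digits=True: every digit becomes its own token
print(Digits(individual_digits=True).pre_tokenize_str("Call 123 please"))
# [('Call ', (0, 5)), ('1', (5, 6)), ('2', (6, 7)), ('3', (7, 8)), (' please', (8, 15))]

# individual_digits=False (the default): digit runs stay together
print(Digits(individual_digits=False).pre_tokenize_str("Call 123 please"))
# [('Call ', (0, 5)), ('123', (5, 8)), (' please', (8, 15))]
```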
class FixedLength(PreTokenizer):
|
| 384 |
+
"""
|
| 385 |
+
This pre-tokenizer splits the text into fixed length chunks as used
|
| 386 |
+
[here](https://www.biorxiv.org/content/10.1101/2023.01.11.523679v1.full)
|
| 387 |
+
|
| 388 |
+
Args:
|
| 389 |
+
length (:obj:`int`, `optional`, defaults to :obj:`5`):
|
| 390 |
+
The length of the chunks to split the text into.
|
| 391 |
+
|
| 392 |
+
Strings are split on the character level rather than the byte level to avoid
|
| 393 |
+
splitting unicode characters consisting of multiple bytes.
|
| 394 |
+
"""
|
| 395 |
+
def __init__(self, length=5):
|
| 396 |
+
pass
|
| 397 |
+
|
| 398 |
+
def __getstate__(self):
|
| 399 |
+
""" """
|
| 400 |
+
pass
|
| 401 |
+
|
| 402 |
+
def __setstate__(self, state):
|
| 403 |
+
""" """
|
| 404 |
+
pass
|
| 405 |
+
|
| 406 |
+
@staticmethod
|
| 407 |
+
def custom(pretok):
|
| 408 |
+
""" """
|
| 409 |
+
pass
|
| 410 |
+
|
| 411 |
+
@property
|
| 412 |
+
def length(self):
|
| 413 |
+
""" """
|
| 414 |
+
pass
|
| 415 |
+
|
| 416 |
+
@length.setter
|
| 417 |
+
def length(self, value):
|
| 418 |
+
""" """
|
| 419 |
+
pass
|
| 420 |
+
|
| 421 |
+
def pre_tokenize(self, pretok):
|
| 422 |
+
"""
|
| 423 |
+
Pre-tokenize a :class:`~tokenizers.PreTokenizedString` in-place
|
| 424 |
+
|
| 425 |
+
This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
|
| 426 |
+
keep track of the pre-tokenization, and leverage the capabilities of the
|
| 427 |
+
:class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
|
| 428 |
+
the pre-tokenization of a raw string, you can use
|
| 429 |
+
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
|
| 430 |
+
|
| 431 |
+
Args:
|
| 432 |
+
pretok (:class:`~tokenizers.PreTokenizedString`):
|
| 433 |
+
The pre-tokenized string on which to apply this
|
| 434 |
+
:class:`~tokenizers.pre_tokenizers.PreTokenizer`
|
| 435 |
+
"""
|
| 436 |
+
pass
|
| 437 |
+
|
| 438 |
+
def pre_tokenize_str(self, sequence):
|
| 439 |
+
"""
|
| 440 |
+
Pre-tokenize the given string
|
| 441 |
+
|
| 442 |
+
This method provides a way to visualize the effect of a
|
| 443 |
+
:class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
|
| 444 |
+
alignment, nor does it provide all the capabilities of the
|
| 445 |
+
:class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
|
| 446 |
+
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
|
| 447 |
+
|
| 448 |
+
Args:
|
| 449 |
+
sequence (:obj:`str`):
|
| 450 |
+
A string to pre-tokenize
|
| 451 |
+
|
| 452 |
+
Returns:
|
| 453 |
+
:obj:`List[Tuple[str, Offsets]]`:
|
| 454 |
+
A list of tuples with the pre-tokenized parts and their offsets
|
| 455 |
+
"""
|
| 456 |
+
pass
|
| 457 |
+
|
| 458 |
+
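A short sketch of the chunking behavior; the nucleotide-style input is our own illustration, chosen because the linked paper applies fixed-length chunks to genomic sequences:

```python
from tokenizers.pre_tokenizers import FixedLength

# Chunks are cut on the character level, so the last piece may be shorter.
pre_tok = FixedLength(length=4)
print(pre_tok.pre_tokenize_str("ACGTACGTAC"))
# [('ACGT', (0, 4)), ('ACGT', (4, 8)), ('AC', (8, 10))]
```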
class Metaspace(PreTokenizer):
|
| 459 |
+
"""
|
| 460 |
+
Metaspace pre-tokenizer
|
| 461 |
+
|
| 462 |
+
This pre-tokenizer replaces any whitespace by the provided replacement character.
|
| 463 |
+
It then tries to split on these spaces.
|
| 464 |
+
|
| 465 |
+
Args:
|
| 466 |
+
replacement (:obj:`str`, `optional`, defaults to :obj:`▁`):
|
| 467 |
+
The replacement character. Must be exactly one character. By default we
|
| 468 |
+
use the `▁` (U+2581) meta symbol (same as in SentencePiece).
|
| 469 |
+
|
| 470 |
+
prepend_scheme (:obj:`str`, `optional`, defaults to :obj:`"always"`):
|
| 471 |
+
Whether to add a space to the first word if there isn't already one. This
|
| 472 |
+
lets us treat `hello` exactly like `say hello`.
|
| 473 |
+
Choices: "always", "never", "first". First means the space is only added on the first
|
| 474 |
+
token (relevant when special tokens are used or other pre-tokenizers are used).
|
| 475 |
+
|
| 476 |
+
"""
|
| 477 |
+
def __init__(self, replacement="▁", prepend_scheme="always", split=True):
|
| 478 |
+
pass
|
| 479 |
+
|
| 480 |
+
def __getstate__(self):
|
| 481 |
+
""" """
|
| 482 |
+
pass
|
| 483 |
+
|
| 484 |
+
def __setstate__(self, state):
|
| 485 |
+
""" """
|
| 486 |
+
pass
|
| 487 |
+
|
| 488 |
+
@staticmethod
|
| 489 |
+
def custom(pretok):
|
| 490 |
+
""" """
|
| 491 |
+
pass
|
| 492 |
+
|
| 493 |
+
def pre_tokenize(self, pretok):
|
| 494 |
+
"""
|
| 495 |
+
Pre-tokenize a :class:`~tokenizers.PreTokenizedString` in-place
|
| 496 |
+
|
| 497 |
+
This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
|
| 498 |
+
keep track of the pre-tokenization, and leverage the capabilities of the
|
| 499 |
+
:class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
|
| 500 |
+
the pre-tokenization of a raw string, you can use
|
| 501 |
+
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
|
| 502 |
+
|
| 503 |
+
Args:
|
| 504 |
+
pretok (:class:`~tokenizers.PreTokenizedString`):
|
| 505 |
+
The pre-tokenized string on which to apply this
|
| 506 |
+
:class:`~tokenizers.pre_tokenizers.PreTokenizer`
|
| 507 |
+
"""
|
| 508 |
+
pass
|
| 509 |
+
|
| 510 |
+
def pre_tokenize_str(self, sequence):
|
| 511 |
+
"""
|
| 512 |
+
Pre-tokenize the given string
|
| 513 |
+
|
| 514 |
+
This method provides a way to visualize the effect of a
|
| 515 |
+
:class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
|
| 516 |
+
alignment, nor does it provide all the capabilities of the
|
| 517 |
+
:class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
|
| 518 |
+
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
|
| 519 |
+
|
| 520 |
+
Args:
|
| 521 |
+
sequence (:obj:`str`):
|
| 522 |
+
A string to pre-tokenize
|
| 523 |
+
|
| 524 |
+
Returns:
|
| 525 |
+
:obj:`List[Tuple[str, Offsets]]`:
|
| 526 |
+
A list of tuples with the pre-tokenized parts and their offsets
|
| 527 |
+
"""
|
| 528 |
+
pass
|
| 529 |
+
|
| 530 |
+
@property
|
| 531 |
+
def prepend_scheme(self):
|
| 532 |
+
""" """
|
| 533 |
+
pass
|
| 534 |
+
|
| 535 |
+
@prepend_scheme.setter
|
| 536 |
+
def prepend_scheme(self, value):
|
| 537 |
+
""" """
|
| 538 |
+
pass
|
| 539 |
+
|
| 540 |
+
@property
|
| 541 |
+
def replacement(self):
|
| 542 |
+
""" """
|
| 543 |
+
pass
|
| 544 |
+
|
| 545 |
+
@replacement.setter
|
| 546 |
+
def replacement(self, value):
|
| 547 |
+
""" """
|
| 548 |
+
pass
|
| 549 |
+
|
| 550 |
+
@property
|
| 551 |
+
def split(self):
|
| 552 |
+
""" """
|
| 553 |
+
pass
|
| 554 |
+
|
| 555 |
+
@split.setter
|
| 556 |
+
def split(self, value):
|
| 557 |
+
""" """
|
| 558 |
+
pass
|
| 559 |
+
|
| 560 |
+
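A minimal sketch of the replacement behavior (the printed output reflects our understanding of the API; the meta symbol `▁` is U+2581):

```python
from tokenizers.pre_tokenizers import Metaspace

pre_tok = Metaspace(replacement="▁", prepend_scheme="always")
print(pre_tok.pre_tokenize_str("say hello"))
# [('▁say', (0, 3)), ('▁hello', (3, 9))]  # every word now starts with the meta symbol
```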
class Punctuation(PreTokenizer):
|
| 561 |
+
"""
|
| 562 |
+
This pre-tokenizer simply splits on punctuation as individual characters.
|
| 563 |
+
|
| 564 |
+
Args:
|
| 565 |
+
behavior (:class:`~tokenizers.SplitDelimiterBehavior`):
|
| 566 |
+
The behavior to use when splitting.
|
| 567 |
+
Choices: "removed", "isolated" (default), "merged_with_previous", "merged_with_next",
|
| 568 |
+
"contiguous"
|
| 569 |
+
"""
|
| 570 |
+
def __init__(self, behavior="isolated"):
|
| 571 |
+
pass
|
| 572 |
+
|
| 573 |
+
def __getstate__(self):
|
| 574 |
+
""" """
|
| 575 |
+
pass
|
| 576 |
+
|
| 577 |
+
def __setstate__(self, state):
|
| 578 |
+
""" """
|
| 579 |
+
pass
|
| 580 |
+
|
| 581 |
+
@property
|
| 582 |
+
def behavior(self):
|
| 583 |
+
""" """
|
| 584 |
+
pass
|
| 585 |
+
|
| 586 |
+
@behavior.setter
|
| 587 |
+
def behavior(self, value):
|
| 588 |
+
""" """
|
| 589 |
+
pass
|
| 590 |
+
|
| 591 |
+
@staticmethod
|
| 592 |
+
def custom(pretok):
|
| 593 |
+
""" """
|
| 594 |
+
pass
|
| 595 |
+
|
| 596 |
+
def pre_tokenize(self, pretok):
|
| 597 |
+
"""
|
| 598 |
+
Pre-tokenize a :class:`~tokenizers.PreTokenizedString` in-place
|
| 599 |
+
|
| 600 |
+
This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
|
| 601 |
+
keep track of the pre-tokenization, and leverage the capabilities of the
|
| 602 |
+
:class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
|
| 603 |
+
the pre-tokenization of a raw string, you can use
|
| 604 |
+
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
|
| 605 |
+
|
| 606 |
+
Args:
|
| 607 |
+
pretok (:class:`~tokenizers.PreTokenizedString`):
|
| 608 |
+
The pre-tokenized string on which to apply this
|
| 609 |
+
:class:`~tokenizers.pre_tokenizers.PreTokenizer`
|
| 610 |
+
"""
|
| 611 |
+
pass
|
| 612 |
+
|
| 613 |
+
def pre_tokenize_str(self, sequence):
|
| 614 |
+
"""
|
| 615 |
+
Pre-tokenize the given string
|
| 616 |
+
|
| 617 |
+
This method provides a way to visualize the effect of a
|
| 618 |
+
:class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
|
| 619 |
+
alignment, nor does it provide all the capabilities of the
|
| 620 |
+
:class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
|
| 621 |
+
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
|
| 622 |
+
|
| 623 |
+
Args:
|
| 624 |
+
sequence (:obj:`str`):
|
| 625 |
+
A string to pre-tokenize
|
| 626 |
+
|
| 627 |
+
Returns:
|
| 628 |
+
:obj:`List[Tuple[str, Offsets]]`:
|
| 629 |
+
A list of tuples with the pre-tokenized parts and their offsets
|
| 630 |
+
"""
|
| 631 |
+
pass
|
| 632 |
+
|
| 633 |
+
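A minimal sketch with the default "isolated" behavior, which keeps each punctuation character as its own token (sample text is ours):

```python
from tokenizers.pre_tokenizers import Punctuation

print(Punctuation(behavior="isolated").pre_tokenize_str("Hey, friend!"))
# [('Hey', (0, 3)), (',', (3, 4)), (' friend', (4, 11)), ('!', (11, 12))]
```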
class Sequence(PreTokenizer):
|
| 634 |
+
"""
|
| 635 |
+
This pre-tokenizer composes other pre_tokenizers and applies them in sequence
|
| 636 |
+
"""
|
| 637 |
+
def __init__(self, pretokenizers):
|
| 638 |
+
pass
|
| 639 |
+
|
| 640 |
+
def __getitem__(self, key):
|
| 641 |
+
"""
|
| 642 |
+
Return self[key].
|
| 643 |
+
"""
|
| 644 |
+
pass
|
| 645 |
+
|
| 646 |
+
def __getnewargs__(self):
|
| 647 |
+
""" """
|
| 648 |
+
pass
|
| 649 |
+
|
| 650 |
+
def __getstate__(self):
|
| 651 |
+
""" """
|
| 652 |
+
pass
|
| 653 |
+
|
| 654 |
+
def __setitem__(self, key, value):
|
| 655 |
+
"""
|
| 656 |
+
Set self[key] to value.
|
| 657 |
+
"""
|
| 658 |
+
pass
|
| 659 |
+
|
| 660 |
+
def __setstate__(self, state):
|
| 661 |
+
""" """
|
| 662 |
+
pass
|
| 663 |
+
|
| 664 |
+
@staticmethod
|
| 665 |
+
def custom(pretok):
|
| 666 |
+
""" """
|
| 667 |
+
pass
|
| 668 |
+
|
| 669 |
+
def pre_tokenize(self, pretok):
|
| 670 |
+
"""
|
| 671 |
+
Pre-tokenize a :class:`~tokenizers.PreTokenizedString` in-place
|
| 672 |
+
|
| 673 |
+
This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
|
| 674 |
+
keep track of the pre-tokenization, and leverage the capabilities of the
|
| 675 |
+
:class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
|
| 676 |
+
the pre-tokenization of a raw string, you can use
|
| 677 |
+
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
|
| 678 |
+
|
| 679 |
+
Args:
|
| 680 |
+
pretok (:class:`~tokenizers.PreTokenizedString`):
|
| 681 |
+
The pre-tokenized string on which to apply this
|
| 682 |
+
:class:`~tokenizers.pre_tokenizers.PreTokenizer`
|
| 683 |
+
"""
|
| 684 |
+
pass
|
| 685 |
+
|
| 686 |
+
def pre_tokenize_str(self, sequence):
|
| 687 |
+
"""
|
| 688 |
+
Pre-tokenize the given string
|
| 689 |
+
|
| 690 |
+
This method provides a way to visualize the effect of a
|
| 691 |
+
:class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
|
| 692 |
+
alignment, nor does it provide all the capabilities of the
|
| 693 |
+
:class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
|
| 694 |
+
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
|
| 695 |
+
|
| 696 |
+
Args:
|
| 697 |
+
sequence (:obj:`str`):
|
| 698 |
+
A string to pre-tokenize
|
| 699 |
+
|
| 700 |
+
Returns:
|
| 701 |
+
:obj:`List[Tuple[str, Offsets]]`:
|
| 702 |
+
A list of tuples with the pre-tokenized parts and their offsets
|
| 703 |
+
"""
|
| 704 |
+
pass
|
| 705 |
+
|
| 706 |
+
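A minimal composition sketch: each pre-tokenizer is applied to the pieces produced by the previous one (the expected output is our reading of the API):

```python
from tokenizers.pre_tokenizers import Digits, Sequence, WhitespaceSplit

pre_tok = Sequence([WhitespaceSplit(), Digits(individual_digits=False)])
print(pre_tok.pre_tokenize_str("Call 123 please"))
# [('Call', (0, 4)), ('123', (5, 8)), ('please', (9, 15))]
```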
class Split(PreTokenizer):
|
| 707 |
+
"""
|
| 708 |
+
Split PreTokenizer
|
| 709 |
+
|
| 710 |
+
This versatile pre-tokenizer splits using the provided pattern and
|
| 711 |
+
according to the provided behavior. The pattern can be inverted by
|
| 712 |
+
making use of the invert flag.
|
| 713 |
+
|
| 714 |
+
Args:
|
| 715 |
+
pattern (:obj:`str` or :class:`~tokenizers.Regex`):
|
| 716 |
+
A pattern used to split the string. Usually a string or a regex built with `tokenizers.Regex`.
|
| 717 |
+
If you want to use a regex pattern, it has to be wrapped in a `tokenizers.Regex`,
|
| 718 |
+
otherwise we consider it a string pattern. For example `pattern="|"`
|
| 719 |
+
means you want to split on `|` (imagine a csv file for example), while
|
| 720 |
+
`pattern=tokenizers.Regex("1|2")` means you split on either '1' or '2'.
|
| 721 |
+
behavior (:class:`~tokenizers.SplitDelimiterBehavior`):
|
| 722 |
+
The behavior to use when splitting.
|
| 723 |
+
Choices: "removed", "isolated", "merged_with_previous", "merged_with_next",
|
| 724 |
+
"contiguous"
|
| 725 |
+
|
| 726 |
+
invert (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
| 727 |
+
Whether to invert the pattern.
|
| 728 |
+
"""
|
| 729 |
+
def __init__(self, pattern, behavior, invert=False):
|
| 730 |
+
pass
|
| 731 |
+
|
| 732 |
+
def __getnewargs__(self):
|
| 733 |
+
""" """
|
| 734 |
+
pass
|
| 735 |
+
|
| 736 |
+
def __getstate__(self):
|
| 737 |
+
""" """
|
| 738 |
+
pass
|
| 739 |
+
|
| 740 |
+
def __setstate__(self, state):
|
| 741 |
+
""" """
|
| 742 |
+
pass
|
| 743 |
+
|
| 744 |
+
@property
|
| 745 |
+
def behavior(self):
|
| 746 |
+
""" """
|
| 747 |
+
pass
|
| 748 |
+
|
| 749 |
+
@behavior.setter
|
| 750 |
+
def behavior(self, value):
|
| 751 |
+
""" """
|
| 752 |
+
pass
|
| 753 |
+
|
| 754 |
+
@staticmethod
|
| 755 |
+
def custom(pretok):
|
| 756 |
+
""" """
|
| 757 |
+
pass
|
| 758 |
+
|
| 759 |
+
@property
|
| 760 |
+
def invert(self):
|
| 761 |
+
""" """
|
| 762 |
+
pass
|
| 763 |
+
|
| 764 |
+
@invert.setter
|
| 765 |
+
def invert(self, value):
|
| 766 |
+
""" """
|
| 767 |
+
pass
|
| 768 |
+
|
| 769 |
+
@property
|
| 770 |
+
def pattern(self):
|
| 771 |
+
""" """
|
| 772 |
+
pass
|
| 773 |
+
|
| 774 |
+
@pattern.setter
|
| 775 |
+
def pattern(self, value):
|
| 776 |
+
""" """
|
| 777 |
+
pass
|
| 778 |
+
|
| 779 |
+
def pre_tokenize(self, pretok):
|
| 780 |
+
"""
|
| 781 |
+
Pre-tokenize a :class:`~tokenizers.PreTokenizedString` in-place
|
| 782 |
+
|
| 783 |
+
This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
|
| 784 |
+
keep track of the pre-tokenization, and leverage the capabilities of the
|
| 785 |
+
:class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
|
| 786 |
+
the pre-tokenization of a raw string, you can use
|
| 787 |
+
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
|
| 788 |
+
|
| 789 |
+
Args:
|
| 790 |
+
pretok (:class:`~tokenizers.PreTokenizedString`):
|
| 791 |
+
The pre-tokenized string on which to apply this
|
| 792 |
+
:class:`~tokenizers.pre_tokenizers.PreTokenizer`
|
| 793 |
+
"""
|
| 794 |
+
pass
|
| 795 |
+
|
| 796 |
+
def pre_tokenize_str(self, sequence):
|
| 797 |
+
"""
|
| 798 |
+
Pre-tokenize the given string
|
| 799 |
+
|
| 800 |
+
This method provides a way to visualize the effect of a
|
| 801 |
+
:class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
|
| 802 |
+
alignment, nor does it provide all the capabilities of the
|
| 803 |
+
:class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
|
| 804 |
+
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
|
| 805 |
+
|
| 806 |
+
Args:
|
| 807 |
+
sequence (:obj:`str`):
|
| 808 |
+
A string to pre-tokenize
|
| 809 |
+
|
| 810 |
+
Returns:
|
| 811 |
+
:obj:`List[Tuple[str, Offsets]]`:
|
| 812 |
+
A list of tuples with the pre-tokenized parts and their offsets
|
| 813 |
+
"""
|
| 814 |
+
pass
|
| 815 |
+
|
| 816 |
+
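A sketch of the string-vs-regex distinction described in the docstring (sample inputs are ours):

```python
from tokenizers import Regex
from tokenizers.pre_tokenizers import Split

# A plain string is matched literally; "removed" drops the delimiters.
print(Split(pattern="|", behavior="removed").pre_tokenize_str("a|b|c"))
# [('a', (0, 1)), ('b', (2, 3)), ('c', (4, 5))]

# Wrapping the pattern in Regex makes it a regular expression; "isolated"
# keeps each match as its own piece.
print(Split(pattern=Regex(r"\d+"), behavior="isolated").pre_tokenize_str("ab12cd"))
# [('ab', (0, 2)), ('12', (2, 4)), ('cd', (4, 6))]
```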
class UnicodeScripts(PreTokenizer):
|
| 817 |
+
"""
|
| 818 |
+
This pre-tokenizer splits on characters that belong to different language families.
|
| 819 |
+
It roughly follows https://github.com/google/sentencepiece/blob/master/data/Scripts.txt
|
| 820 |
+
In practice, Hiragana and Katakana are fused with Han, and U+30FC is treated as Han too.
|
| 821 |
+
This mimics the SentencePiece Unigram implementation.
|
| 822 |
+
"""
|
| 823 |
+
def __init__(self):
|
| 824 |
+
pass
|
| 825 |
+
|
| 826 |
+
def __getstate__(self):
|
| 827 |
+
""" """
|
| 828 |
+
pass
|
| 829 |
+
|
| 830 |
+
def __setstate__(self, state):
|
| 831 |
+
""" """
|
| 832 |
+
pass
|
| 833 |
+
|
| 834 |
+
@staticmethod
|
| 835 |
+
def custom(pretok):
|
| 836 |
+
""" """
|
| 837 |
+
pass
|
| 838 |
+
|
| 839 |
+
def pre_tokenize(self, pretok):
|
| 840 |
+
"""
|
| 841 |
+
Pre-tokenize a :class:`~tokenizers.PreTokenizedString` in-place
|
| 842 |
+
|
| 843 |
+
This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
|
| 844 |
+
keep track of the pre-tokenization, and leverage the capabilities of the
|
| 845 |
+
:class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
|
| 846 |
+
the pre-tokenization of a raw string, you can use
|
| 847 |
+
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
|
| 848 |
+
|
| 849 |
+
Args:
|
| 850 |
+
pretok (:class:`~tokenizers.PreTokenizedString`):
|
| 851 |
+
The pre-tokenized string on which to apply this
|
| 852 |
+
:class:`~tokenizers.pre_tokenizers.PreTokenizer`
|
| 853 |
+
"""
|
| 854 |
+
pass
|
| 855 |
+
|
| 856 |
+
def pre_tokenize_str(self, sequence):
|
| 857 |
+
"""
|
| 858 |
+
Pre-tokenize the given string
|
| 859 |
+
|
| 860 |
+
This method provides a way to visualize the effect of a
|
| 861 |
+
:class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
|
| 862 |
+
alignment, nor does it provide all the capabilities of the
|
| 863 |
+
:class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
|
| 864 |
+
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
|
| 865 |
+
|
| 866 |
+
Args:
|
| 867 |
+
sequence (:obj:`str`):
|
| 868 |
+
A string to pre-tokenize
|
| 869 |
+
|
| 870 |
+
Returns:
|
| 871 |
+
:obj:`List[Tuple[str, Offsets]]`:
|
| 872 |
+
A list of tuples with the pre-tokenized parts and their offsets
|
| 873 |
+
"""
|
| 874 |
+
pass
|
| 875 |
+
|
| 876 |
+
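A minimal sketch of a script-boundary split (Latin followed by Han; sample text and expected output are ours):

```python
from tokenizers.pre_tokenizers import UnicodeScripts

print(UnicodeScripts().pre_tokenize_str("hello你好"))
# [('hello', (0, 5)), ('你好', (5, 7))]
```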
class Whitespace(PreTokenizer):
|
| 877 |
+
"""
|
| 878 |
+
This pre-tokenizer splits on word boundaries according to the `\w+|[^\w\s]+`
|
| 879 |
+
regex pattern, i.e. it matches runs of word characters or runs of characters that are neither
|
| 880 |
+
word characters nor whitespace (punctuation such as hyphens, apostrophes, commas, etc.).
|
| 881 |
+
|
| 882 |
+
Example:
|
| 883 |
+
Use the `Whitespace` pre-tokenizer as shown below::
|
| 884 |
+
|
| 885 |
+
```python
|
| 886 |
+
from tokenizers.pre_tokenizers import Whitespace
|
| 887 |
+
|
| 888 |
+
pre_tokenizer = Whitespace()
|
| 889 |
+
text = "Hello, world! Let's try the Whitespace pre-tokenizer."
|
| 890 |
+
pre_tokenizer.pre_tokenize_str(text)
|
| 891 |
+
[('Hello', (0, 5)),
|
| 892 |
+
(',', (5, 6)),
|
| 893 |
+
('world', (7, 12)),
|
| 894 |
+
('!', (12, 13)),
|
| 895 |
+
('Let', (14, 17)),
|
| 896 |
+
("'", (17, 18)),
|
| 897 |
+
('s', (18, 19)),
|
| 898 |
+
('try', (20, 23)),
|
| 899 |
+
('the', (24, 27)),
|
| 900 |
+
('Whitespace', (28, 38)),
|
| 901 |
+
('pre', (39, 42)),
|
| 902 |
+
('-', (42, 43)),
|
| 903 |
+
('tokenizer', (43, 52)),
|
| 904 |
+
('.', (52, 53))]
|
| 905 |
+
```
|
| 906 |
+
"""
|
| 907 |
+
def __init__(self):
|
| 908 |
+
pass
|
| 909 |
+
|
| 910 |
+
def __getstate__(self):
|
| 911 |
+
""" """
|
| 912 |
+
pass
|
| 913 |
+
|
| 914 |
+
def __setstate__(self, state):
|
| 915 |
+
""" """
|
| 916 |
+
pass
|
| 917 |
+
|
| 918 |
+
@staticmethod
|
| 919 |
+
def custom(pretok):
|
| 920 |
+
""" """
|
| 921 |
+
pass
|
| 922 |
+
|
| 923 |
+
def pre_tokenize(self, pretok):
|
| 924 |
+
"""
|
| 925 |
+
Pre-tokenize a :class:`~tokenizers.PreTokenizedString` in-place
|
| 926 |
+
|
| 927 |
+
This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
|
| 928 |
+
keep track of the pre-tokenization, and leverage the capabilities of the
|
| 929 |
+
:class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
|
| 930 |
+
the pre-tokenization of a raw string, you can use
|
| 931 |
+
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
|
| 932 |
+
|
| 933 |
+
Args:
|
| 934 |
+
pretok (:class:`~tokenizers.PreTokenizedString`):
|
| 935 |
+
The pre-tokenized string on which to apply this
|
| 936 |
+
:class:`~tokenizers.pre_tokenizers.PreTokenizer`
|
| 937 |
+
"""
|
| 938 |
+
pass
|
| 939 |
+
|
| 940 |
+
def pre_tokenize_str(self, sequence):
|
| 941 |
+
"""
|
| 942 |
+
Pre-tokenize the given string
|
| 943 |
+
|
| 944 |
+
This method provides a way to visualize the effect of a
|
| 945 |
+
:class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
|
| 946 |
+
alignment, nor does it provide all the capabilities of the
|
| 947 |
+
:class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
|
| 948 |
+
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
|
| 949 |
+
|
| 950 |
+
Args:
|
| 951 |
+
sequence (:obj:`str`):
|
| 952 |
+
A string to pre-tokenize
|
| 953 |
+
|
| 954 |
+
Returns:
|
| 955 |
+
:obj:`List[Tuple[str, Offsets]]`:
|
| 956 |
+
A list of tuples with the pre-tokenized parts and their offsets
|
| 957 |
+
"""
|
| 958 |
+
pass
|
| 959 |
+
|
| 960 |
+
class WhitespaceSplit(PreTokenizer):
|
| 961 |
+
"""
|
| 962 |
+
This pre-tokenizer simply splits on whitespace. Works like `str.split()`
|
| 963 |
+
"""
|
| 964 |
+
def __init__(self):
|
| 965 |
+
pass
|
| 966 |
+
|
| 967 |
+
def __getstate__(self):
|
| 968 |
+
""" """
|
| 969 |
+
pass
|
| 970 |
+
|
| 971 |
+
def __setstate__(self, state):
|
| 972 |
+
""" """
|
| 973 |
+
pass
|
| 974 |
+
|
| 975 |
+
@staticmethod
|
| 976 |
+
def custom(pretok):
|
| 977 |
+
""" """
|
| 978 |
+
pass
|
| 979 |
+
|
| 980 |
+
def pre_tokenize(self, pretok):
|
| 981 |
+
"""
|
| 982 |
+
Pre-tokenize a :class:`~tokenizers.PreTokenizedString` in-place
|
| 983 |
+
|
| 984 |
+
This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
|
| 985 |
+
keep track of the pre-tokenization, and leverage the capabilities of the
|
| 986 |
+
:class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
|
| 987 |
+
the pre-tokenization of a raw string, you can use
|
| 988 |
+
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
|
| 989 |
+
|
| 990 |
+
Args:
|
| 991 |
+
pretok (:class:`~tokenizers.PreTokenizedString`):
|
| 992 |
+
The pre-tokenized string on which to apply this
|
| 993 |
+
:class:`~tokenizers.pre_tokenizers.PreTokenizer`
|
| 994 |
+
"""
|
| 995 |
+
pass
|
| 996 |
+
|
| 997 |
+
def pre_tokenize_str(self, sequence):
|
| 998 |
+
"""
|
| 999 |
+
Pre-tokenize the given string
|
| 1000 |
+
|
| 1001 |
+
This method provides a way to visualize the effect of a
|
| 1002 |
+
:class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
|
| 1003 |
+
alignment, nor does it provide all the capabilities of the
|
| 1004 |
+
:class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
|
| 1005 |
+
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
|
| 1006 |
+
|
| 1007 |
+
Args:
|
| 1008 |
+
sequence (:obj:`str`):
|
| 1009 |
+
A string to pre-tokenize
|
| 1010 |
+
|
| 1011 |
+
Returns:
|
| 1012 |
+
:obj:`List[Tuple[str, Offsets]]`:
|
| 1013 |
+
A list of tuples with the pre-tokenized parts and their offsets
|
| 1014 |
+
"""
|
| 1015 |
+
pass
|
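To make the difference between the last two classes concrete, a short comparison sketch (sample text is ours):

```python
from tokenizers.pre_tokenizers import Whitespace, WhitespaceSplit

text = "Hello, world!"

# WhitespaceSplit only cuts on whitespace, so punctuation stays attached.
print(WhitespaceSplit().pre_tokenize_str(text))
# [('Hello,', (0, 6)), ('world!', (7, 13))]

# Whitespace also separates punctuation via its \w+|[^\w\s]+ pattern.
print(Whitespace().pre_tokenize_str(text))
# [('Hello', (0, 5)), (',', (5, 6)), ('world', (7, 12)), ('!', (12, 13))]
```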
source/tokenizers/processors/__init__.py
ADDED
|
@@ -0,0 +1,9 @@
|
| 1 |
+
# Generated content DO NOT EDIT
|
| 2 |
+
from .. import processors
|
| 3 |
+
|
| 4 |
+
PostProcessor = processors.PostProcessor
|
| 5 |
+
BertProcessing = processors.BertProcessing
|
| 6 |
+
ByteLevel = processors.ByteLevel
|
| 7 |
+
RobertaProcessing = processors.RobertaProcessing
|
| 8 |
+
Sequence = processors.Sequence
|
| 9 |
+
TemplateProcessing = processors.TemplateProcessing
|
source/tokenizers/processors/__init__.pyi
ADDED
|
@@ -0,0 +1,519 @@
|
| 1 |
+
# Generated content DO NOT EDIT
|
| 2 |
+
class PostProcessor:
|
| 3 |
+
"""
|
| 4 |
+
Base class for all post-processors
|
| 5 |
+
|
| 6 |
+
This class is not supposed to be instantiated directly. Instead, any implementation of
|
| 7 |
+
a PostProcessor will return an instance of this class when instantiated.
|
| 8 |
+
"""
|
| 9 |
+
def __getstate__(self):
|
| 10 |
+
""" """
|
| 11 |
+
pass
|
| 12 |
+
|
| 13 |
+
def __setstate__(self, state):
|
| 14 |
+
""" """
|
| 15 |
+
pass
|
| 16 |
+
|
| 17 |
+
def num_special_tokens_to_add(self, is_pair):
|
| 18 |
+
"""
|
| 19 |
+
Return the number of special tokens that would be added for single/pair sentences.
|
| 20 |
+
|
| 21 |
+
Args:
|
| 22 |
+
is_pair (:obj:`bool`):
|
| 23 |
+
Whether the input would be a pair of sequences
|
| 24 |
+
|
| 25 |
+
Returns:
|
| 26 |
+
:obj:`int`: The number of tokens to add
|
| 27 |
+
"""
|
| 28 |
+
pass
|
| 29 |
+
|
| 30 |
+
def process(self, encoding, pair=None, add_special_tokens=True):
|
| 31 |
+
"""
|
| 32 |
+
Post-process the given encodings, generating the final one
|
| 33 |
+
|
| 34 |
+
Args:
|
| 35 |
+
encoding (:class:`~tokenizers.Encoding`):
|
| 36 |
+
The encoding for the first sequence
|
| 37 |
+
|
| 38 |
+
pair (:class:`~tokenizers.Encoding`, `optional`):
|
| 39 |
+
The encoding for the pair sequence
|
| 40 |
+
|
| 41 |
+
add_special_tokens (:obj:`bool`):
|
| 42 |
+
Whether to add the special tokens
|
| 43 |
+
|
| 44 |
+
Return:
|
| 45 |
+
:class:`~tokenizers.Encoding`: The final encoding
|
| 46 |
+
"""
|
| 47 |
+
pass
|
| 48 |
+
|
| 49 |
+
class BertProcessing(PostProcessor):
|
| 50 |
+
"""
|
| 51 |
+
This post-processor takes care of adding the special tokens needed by
|
| 52 |
+
a Bert model:
|
| 53 |
+
|
| 54 |
+
- a SEP token
|
| 55 |
+
- a CLS token
|
| 56 |
+
|
| 57 |
+
Args:
|
| 58 |
+
sep (:obj:`Tuple[str, int]`):
|
| 59 |
+
A tuple with the string representation of the SEP token, and its id
|
| 60 |
+
|
| 61 |
+
cls (:obj:`Tuple[str, int]`):
|
| 62 |
+
A tuple with the string representation of the CLS token, and its id
|
| 63 |
+
"""
|
| 64 |
+
def __init__(self, sep, cls):
|
| 65 |
+
pass
|
| 66 |
+
|
| 67 |
+
def __getnewargs__(self):
|
| 68 |
+
""" """
|
| 69 |
+
pass
|
| 70 |
+
|
| 71 |
+
def __getstate__(self):
|
| 72 |
+
""" """
|
| 73 |
+
pass
|
| 74 |
+
|
| 75 |
+
def __setstate__(self, state):
|
| 76 |
+
""" """
|
| 77 |
+
pass
|
| 78 |
+
|
| 79 |
+
@property
|
| 80 |
+
def cls(self):
|
| 81 |
+
""" """
|
| 82 |
+
pass
|
| 83 |
+
|
| 84 |
+
@cls.setter
|
| 85 |
+
def cls(self, value):
|
| 86 |
+
""" """
|
| 87 |
+
pass
|
| 88 |
+
|
| 89 |
+
def num_special_tokens_to_add(self, is_pair):
|
| 90 |
+
"""
|
| 91 |
+
Return the number of special tokens that would be added for single/pair sentences.
|
| 92 |
+
|
| 93 |
+
Args:
|
| 94 |
+
is_pair (:obj:`bool`):
|
| 95 |
+
Whether the input would be a pair of sequences
|
| 96 |
+
|
| 97 |
+
Returns:
|
| 98 |
+
:obj:`int`: The number of tokens to add
|
| 99 |
+
"""
|
| 100 |
+
pass
|
| 101 |
+
|
| 102 |
+
def process(self, encoding, pair=None, add_special_tokens=True):
|
| 103 |
+
"""
|
| 104 |
+
Post-process the given encodings, generating the final one
|
| 105 |
+
|
| 106 |
+
Args:
|
| 107 |
+
encoding (:class:`~tokenizers.Encoding`):
|
| 108 |
+
The encoding for the first sequence
|
| 109 |
+
|
| 110 |
+
pair (:class:`~tokenizers.Encoding`, `optional`):
|
| 111 |
+
The encoding for the pair sequence
|
| 112 |
+
|
| 113 |
+
add_special_tokens (:obj:`bool`):
|
| 114 |
+
Whether to add the special tokens
|
| 115 |
+
|
| 116 |
+
Return:
|
| 117 |
+
:class:`~tokenizers.Encoding`: The final encoding
|
| 118 |
+
"""
|
| 119 |
+
pass
|
| 120 |
+
|
| 121 |
+
@property
|
| 122 |
+
def sep(self):
|
| 123 |
+
""" """
|
| 124 |
+
pass
|
| 125 |
+
|
| 126 |
+
@sep.setter
|
| 127 |
+
def sep(self, value):
|
| 128 |
+
""" """
|
| 129 |
+
pass
|
| 130 |
+
|
| 131 |
+
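A minimal sketch of the special-token accounting; the ids 101/102 are the usual bert-base vocabulary ids and are an assumption here — they must match the vocabulary of the tokenizer actually in use:

```python
from tokenizers.processors import BertProcessing

post = BertProcessing(sep=("[SEP]", 102), cls=("[CLS]", 101))  # (token, id) pairs
print(post.num_special_tokens_to_add(is_pair=False))  # 2: [CLS] ... [SEP]
print(post.num_special_tokens_to_add(is_pair=True))   # 3: [CLS] ... [SEP] ... [SEP]
```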
class ByteLevel(PostProcessor):
|
| 132 |
+
"""
|
| 133 |
+
This post-processor takes care of trimming the offsets.
|
| 134 |
+
|
| 135 |
+
By default, the ByteLevel BPE might include whitespaces in the produced tokens. If you don't
|
| 136 |
+
want the offsets to include these whitespaces, then this PostProcessor must be used.
|
| 137 |
+
|
| 138 |
+
Args:
|
| 139 |
+
trim_offsets (:obj:`bool`):
|
| 140 |
+
Whether to trim the whitespaces from the produced offsets.
|
| 141 |
+
|
| 142 |
+
add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`True`):
|
| 143 |
+
If :obj:`True`, keeps the first token's offset as is. If :obj:`False`, increments
|
| 144 |
+
the start of the first token's offset by 1. Only has an effect if :obj:`trim_offsets`
|
| 145 |
+
is set to :obj:`True`.
|
| 146 |
+
"""
|
| 147 |
+
def __init__(self, add_prefix_space=None, trim_offsets=None, use_regex=None):
|
| 148 |
+
pass
|
| 149 |
+
|
| 150 |
+
def __getstate__(self):
|
| 151 |
+
""" """
|
| 152 |
+
pass
|
| 153 |
+
|
| 154 |
+
def __setstate__(self, state):
|
| 155 |
+
""" """
|
| 156 |
+
pass
|
| 157 |
+
|
| 158 |
+
@property
|
| 159 |
+
def add_prefix_space(self):
|
| 160 |
+
""" """
|
| 161 |
+
pass
|
| 162 |
+
|
| 163 |
+
@add_prefix_space.setter
|
| 164 |
+
def add_prefix_space(self, value):
|
| 165 |
+
""" """
|
| 166 |
+
pass
|
| 167 |
+
|
| 168 |
+
def num_special_tokens_to_add(self, is_pair):
|
| 169 |
+
"""
|
| 170 |
+
Return the number of special tokens that would be added for single/pair sentences.
|
| 171 |
+
|
| 172 |
+
Args:
|
| 173 |
+
is_pair (:obj:`bool`):
|
| 174 |
+
Whether the input would be a pair of sequences
|
| 175 |
+
|
| 176 |
+
Returns:
|
| 177 |
+
:obj:`int`: The number of tokens to add
|
| 178 |
+
"""
|
| 179 |
+
pass
|
| 180 |
+
|
| 181 |
+
def process(self, encoding, pair=None, add_special_tokens=True):
|
| 182 |
+
"""
|
| 183 |
+
Post-process the given encodings, generating the final one
|
| 184 |
+
|
| 185 |
+
Args:
|
| 186 |
+
encoding (:class:`~tokenizers.Encoding`):
|
| 187 |
+
The encoding for the first sequence
|
| 188 |
+
|
| 189 |
+
pair (:class:`~tokenizers.Encoding`, `optional`):
|
| 190 |
+
The encoding for the pair sequence
|
| 191 |
+
|
| 192 |
+
add_special_tokens (:obj:`bool`):
|
| 193 |
+
Whether to add the special tokens
|
| 194 |
+
|
| 195 |
+
Return:
|
| 196 |
+
:class:`~tokenizers.Encoding`: The final encoding
|
| 197 |
+
"""
|
| 198 |
+
pass
|
| 199 |
+
|
| 200 |
+
@property
|
| 201 |
+
def trim_offsets(self):
|
| 202 |
+
""" """
|
| 203 |
+
pass
|
| 204 |
+
|
| 205 |
+
@trim_offsets.setter
|
| 206 |
+
def trim_offsets(self, value):
|
| 207 |
+
""" """
|
| 208 |
+
pass
|
| 209 |
+
|
| 210 |
+
@property
|
| 211 |
+
def use_regex(self):
|
| 212 |
+
""" """
|
| 213 |
+
pass
|
| 214 |
+
|
| 215 |
+
@use_regex.setter
|
| 216 |
+
def use_regex(self, value):
|
| 217 |
+
""" """
|
| 218 |
+
pass
|
| 219 |
+
|
| 220 |
+
class RobertaProcessing(PostProcessor):
|
| 221 |
+
"""
|
| 222 |
+
This post-processor takes care of adding the special tokens needed by
|
| 223 |
+
a Roberta model:
|
| 224 |
+
|
| 225 |
+
- a SEP token
|
| 226 |
+
- a CLS token
|
| 227 |
+
|
| 228 |
+
It also takes care of trimming the offsets.
|
| 229 |
+
By default, the ByteLevel BPE might include whitespaces in the produced tokens. If you don't
|
| 230 |
+
want the offsets to include these whitespaces, then this PostProcessor should be initialized
|
| 231 |
+
with :obj:`trim_offsets=True`
|
| 232 |
+
|
| 233 |
+
Args:
|
| 234 |
+
sep (:obj:`Tuple[str, int]`):
|
| 235 |
+
A tuple with the string representation of the SEP token, and its id
|
| 236 |
+
|
| 237 |
+
cls (:obj:`Tuple[str, int]`):
|
| 238 |
+
A tuple with the string representation of the CLS token, and its id
|
| 239 |
+
|
| 240 |
+
trim_offsets (:obj:`bool`, `optional`, defaults to :obj:`True`):
|
| 241 |
+
Whether to trim the whitespaces from the produced offsets.
|
| 242 |
+
|
| 243 |
+
add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`True`):
|
| 244 |
+
Whether the add_prefix_space option was enabled during pre-tokenization. This
|
| 245 |
+
is relevant because it defines the way the offsets are trimmed out.
|
| 246 |
+
"""
|
| 247 |
+
def __init__(self, sep, cls, trim_offsets=True, add_prefix_space=True):
|
| 248 |
+
pass
|
| 249 |
+
|
| 250 |
+
def __getnewargs__(self):
|
| 251 |
+
""" """
|
| 252 |
+
pass
|
| 253 |
+
|
| 254 |
+
def __getstate__(self):
|
| 255 |
+
""" """
|
| 256 |
+
pass
|
| 257 |
+
|
| 258 |
+
def __setstate__(self, state):
|
| 259 |
+
""" """
|
| 260 |
+
pass
|
| 261 |
+
|
| 262 |
+
@property
|
| 263 |
+
def add_prefix_space(self):
|
| 264 |
+
""" """
|
| 265 |
+
pass
|
| 266 |
+
|
| 267 |
+
@add_prefix_space.setter
|
| 268 |
+
def add_prefix_space(self, value):
|
| 269 |
+
""" """
|
| 270 |
+
pass
|
| 271 |
+
|
| 272 |
+
@property
|
| 273 |
+
def cls(self):
|
| 274 |
+
""" """
|
| 275 |
+
pass
|
| 276 |
+
|
| 277 |
+
@cls.setter
|
| 278 |
+
def cls(self, value):
|
| 279 |
+
""" """
|
| 280 |
+
pass
|
| 281 |
+
|
| 282 |
+
def num_special_tokens_to_add(self, is_pair):
|
| 283 |
+
"""
|
| 284 |
+
Return the number of special tokens that would be added for single/pair sentences.
|
| 285 |
+
|
| 286 |
+
Args:
|
| 287 |
+
is_pair (:obj:`bool`):
|
| 288 |
+
Whether the input would be a pair of sequences
|
| 289 |
+
|
| 290 |
+
Returns:
|
| 291 |
+
:obj:`int`: The number of tokens to add
|
| 292 |
+
"""
|
| 293 |
+
pass
|
| 294 |
+
|
| 295 |
+
def process(self, encoding, pair=None, add_special_tokens=True):
|
| 296 |
+
"""
|
| 297 |
+
Post-process the given encodings, generating the final one
|
| 298 |
+
|
| 299 |
+
Args:
|
| 300 |
+
encoding (:class:`~tokenizers.Encoding`):
|
| 301 |
+
The encoding for the first sequence
|
| 302 |
+
|
| 303 |
+
pair (:class:`~tokenizers.Encoding`, `optional`):
|
| 304 |
+
The encoding for the pair sequence
|
| 305 |
+
|
| 306 |
+
add_special_tokens (:obj:`bool`):
|
| 307 |
+
Whether to add the special tokens
|
| 308 |
+
|
| 309 |
+
Return:
|
| 310 |
+
:class:`~tokenizers.Encoding`: The final encoding
|
| 311 |
+
"""
|
| 312 |
+
pass
|
| 313 |
+
|
| 314 |
+
@property
|
| 315 |
+
def sep(self):
|
| 316 |
+
""" """
|
| 317 |
+
pass
|
| 318 |
+
|
| 319 |
+
@sep.setter
|
| 320 |
+
def sep(self, value):
|
| 321 |
+
""" """
|
| 322 |
+
pass
|
| 323 |
+
|
| 324 |
+
@property
|
| 325 |
+
def trim_offsets(self):
|
| 326 |
+
""" """
|
| 327 |
+
pass
|
| 328 |
+
|
| 329 |
+
@trim_offsets.setter
|
| 330 |
+
def trim_offsets(self, value):
|
| 331 |
+
""" """
|
| 332 |
+
pass
|
| 333 |
+
|
| 334 |
+
class Sequence(PostProcessor):
|
| 335 |
+
"""
|
| 336 |
+
Sequence Processor
|
| 337 |
+
|
| 338 |
+
Args:
|
| 339 |
+
processors (:obj:`List[PostProcessor]`):
|
| 340 |
+
The processors that need to be chained
|
| 341 |
+
"""
|
| 342 |
+
def __init__(self, processors):
|
| 343 |
+
pass
|
| 344 |
+
|
| 345 |
+
def __getitem__(self, key):
|
| 346 |
+
"""
|
| 347 |
+
Return self[key].
|
| 348 |
+
"""
|
| 349 |
+
pass
|
| 350 |
+
|
| 351 |
+
def __getnewargs__(self):
|
| 352 |
+
""" """
|
| 353 |
+
pass
|
| 354 |
+
|
| 355 |
+
def __getstate__(self):
|
| 356 |
+
""" """
|
| 357 |
+
pass
|
| 358 |
+
|
| 359 |
+
def __setitem__(self, key, value):
|
| 360 |
+
"""
|
| 361 |
+
Set self[key] to value.
|
| 362 |
+
"""
|
| 363 |
+
pass
|
| 364 |
+
|
| 365 |
+
def __setstate__(self, state):
|
| 366 |
+
""" """
|
| 367 |
+
pass
|
| 368 |
+
|
| 369 |
+
def num_special_tokens_to_add(self, is_pair):
|
| 370 |
+
"""
|
| 371 |
+
Return the number of special tokens that would be added for single/pair sentences.
|
| 372 |
+
|
| 373 |
+
Args:
|
| 374 |
+
is_pair (:obj:`bool`):
|
| 375 |
+
Whether the input would be a pair of sequences
|
| 376 |
+
|
| 377 |
+
Returns:
|
| 378 |
+
:obj:`int`: The number of tokens to add
|
| 379 |
+
"""
|
| 380 |
+
pass
|
| 381 |
+
|
| 382 |
+
def process(self, encoding, pair=None, add_special_tokens=True):
|
| 383 |
+
"""
|
| 384 |
+
Post-process the given encodings, generating the final one
|
| 385 |
+
|
| 386 |
+
Args:
|
| 387 |
+
encoding (:class:`~tokenizers.Encoding`):
|
| 388 |
+
The encoding for the first sequence
|
| 389 |
+
|
| 390 |
+
pair (:class:`~tokenizers.Encoding`, `optional`):
|
| 391 |
+
The encoding for the pair sequence
|
| 392 |
+
|
| 393 |
+
add_special_tokens (:obj:`bool`):
|
| 394 |
+
Whether to add the special tokens
|
| 395 |
+
|
| 396 |
+
Return:
|
| 397 |
+
:class:`~tokenizers.Encoding`: The final encoding
|
| 398 |
+
"""
|
| 399 |
+
pass
|
| 400 |
+
|
| 401 |
+
class TemplateProcessing(PostProcessor):
|
| 402 |
+
"""
|
| 403 |
+
Provides a way to specify templates in order to add the special tokens to each
|
| 404 |
+
input sequence as relevant.
|
| 405 |
+
|
| 406 |
+
Let's take :obj:`BERT` tokenizer as an example. It uses two special tokens, used to
|
| 407 |
+
delimit each sequence. :obj:`[CLS]` is always used at the beginning of the first
|
| 408 |
+
sequence, and :obj:`[SEP]` is added at the end of both the first, and the pair
|
| 409 |
+
sequences. The final result looks like this:
|
| 410 |
+
|
| 411 |
+
- Single sequence: :obj:`[CLS] Hello there [SEP]`
|
| 412 |
+
- Pair sequences: :obj:`[CLS] My name is Anthony [SEP] What is my name? [SEP]`
|
| 413 |
+
|
| 414 |
+
With the type ids as following::
|
| 415 |
+
|
| 416 |
+
[CLS] ... [SEP] ... [SEP]
|
| 417 |
+
0 0 0 1 1
|
| 418 |
+
|
| 419 |
+
You can achieve such behavior using a TemplateProcessing::
|
| 420 |
+
|
| 421 |
+
TemplateProcessing(
|
| 422 |
+
single="[CLS] $0 [SEP]",
|
| 423 |
+
pair="[CLS] $A [SEP] $B:1 [SEP]:1",
|
| 424 |
+
special_tokens=[("[CLS]", 1), ("[SEP]", 0)],
|
| 425 |
+
)
|
| 426 |
+
|
| 427 |
+
In this example, each input sequence is identified using a ``$`` construct. This identifier
|
| 428 |
+
lets us specify each input sequence, and the type_id to use. When nothing is specified,
|
| 429 |
+
it uses the default values. Here are the different ways to specify it:
|
| 430 |
+
|
| 431 |
+
- Specifying the sequence, with default ``type_id == 0``: ``$A`` or ``$B``
|
| 432 |
+
- Specifying the `type_id` with default ``sequence == A``: ``$0``, ``$1``, ``$2``, ...
|
| 433 |
+
- Specifying both: ``$A:0``, ``$B:1``, ...
|
| 434 |
+
|
| 435 |
+
The same construct is used for special tokens: ``<identifier>(:<type_id>)?``.
|
| 436 |
+
|
| 437 |
+
**Warning**: You must ensure that you are giving the correct tokens/ids as these
|
| 438 |
+
will be added to the Encoding without any further check. If the given ids correspond
|
| 439 |
+
to something totally different in a `Tokenizer` using this `PostProcessor`, it
|
| 440 |
+
might lead to unexpected results.
|
| 441 |
+
|
| 442 |
+
Args:
|
| 443 |
+
single (:obj:`Template`):
|
| 444 |
+
The template used for single sequences
|
| 445 |
+
|
| 446 |
+
pair (:obj:`Template`):
|
| 447 |
+
The template used when both sequences are specified
|
| 448 |
+
|
| 449 |
+
special_tokens (:obj:`Tokens`):
|
| 450 |
+
The list of special tokens used in each sequence
|
| 451 |
+
|
| 452 |
+
Types:
|
| 453 |
+
|
| 454 |
+
Template (:obj:`str` or :obj:`List`):
|
| 455 |
+
- If a :obj:`str` is provided, whitespace is used as the delimiter between tokens
|
| 456 |
+
- If a :obj:`List[str]` is provided, a list of tokens
|
| 457 |
+
|
| 458 |
+
Tokens (:obj:`List[Union[Tuple[int, str], Tuple[str, int], dict]]`):
|
| 459 |
+
- A :obj:`Tuple` with both a token and its associated ID, in any order
|
| 460 |
+
- A :obj:`dict` with the following keys:
|
| 461 |
+
- "id": :obj:`str` => The special token id, as specified in the Template
|
| 462 |
+
- "ids": :obj:`List[int]` => The associated IDs
|
| 463 |
+
- "tokens": :obj:`List[str]` => The associated tokens
|
| 464 |
+
|
| 465 |
+
The given dict expects the provided :obj:`ids` and :obj:`tokens` lists to have
|
| 466 |
+
the same length.
|
| 467 |
+
"""
|
| 468 |
+
def __init__(self, single=None, pair=None, special_tokens=None):
|
| 469 |
+
pass
|
| 470 |
+
|
| 471 |
+
def __getstate__(self):
|
| 472 |
+
""" """
|
| 473 |
+
pass
|
| 474 |
+
|
| 475 |
+
def __setstate__(self, state):
|
| 476 |
+
""" """
|
| 477 |
+
pass
|
| 478 |
+
|
| 479 |
+
def num_special_tokens_to_add(self, is_pair):
|
| 480 |
+
"""
|
| 481 |
+
Return the number of special tokens that would be added for single/pair sentences.
|
| 482 |
+
|
| 483 |
+
Args:
|
| 484 |
+
is_pair (:obj:`bool`):
|
| 485 |
+
Whether the input would be a pair of sequences
|
| 486 |
+
|
| 487 |
+
Returns:
|
| 488 |
+
:obj:`int`: The number of tokens to add
|
| 489 |
+
"""
|
| 490 |
+
pass
|
| 491 |
+
|
| 492 |
+
def process(self, encoding, pair=None, add_special_tokens=True):
|
| 493 |
+
"""
|
| 494 |
+
Post-process the given encodings, generating the final one
|
| 495 |
+
|
| 496 |
+
Args:
|
| 497 |
+
encoding (:class:`~tokenizers.Encoding`):
|
| 498 |
+
The encoding for the first sequence
|
| 499 |
+
|
| 500 |
+
pair (:class:`~tokenizers.Encoding`, `optional`):
|
| 501 |
+
The encoding for the pair sequence
|
| 502 |
+
|
| 503 |
+
add_special_tokens (:obj:`bool`):
|
| 504 |
+
Whether to add the special tokens
|
| 505 |
+
|
| 506 |
+
Return:
|
| 507 |
+
:class:`~tokenizers.Encoding`: The final encoding
|
| 508 |
+
"""
|
| 509 |
+
pass
|
| 510 |
+
|
| 511 |
+
@property
|
| 512 |
+
def single(self):
|
| 513 |
+
""" """
|
| 514 |
+
pass
|
| 515 |
+
|
| 516 |
+
@single.setter
|
| 517 |
+
def single(self, value):
|
| 518 |
+
""" """
|
| 519 |
+
pass
|
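Tying the template syntax above together, a runnable sketch of the BERT layout from the docstring; as before, the ids 101/102 are assumed bert-base values and must match your tokenizer's vocabulary:

```python
from tokenizers.processors import TemplateProcessing

post = TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",  # $B and the second [SEP] get type_id 1
    special_tokens=[("[CLS]", 101), ("[SEP]", 102)],
)
print(post.num_special_tokens_to_add(is_pair=True))  # 3
```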
source/tokenizers/tokenizers.abi3.so
ADDED
|
@@ -0,0 +1,3 @@
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c116fcf1e80d461ce0a35c332974f25949e8359416f50b3d53371810d2ce1ccc
|
| 3 |
+
size 10074176
|
source/tokenizers/tokenizers.pyi
ADDED
|
@@ -0,0 +1,17 @@
|
| 1 |
+
# Generated content DO NOT EDIT
|
| 2 |
+
from . import (
|
| 3 |
+
AddedToken as AddedToken,
|
| 4 |
+
Encoding as Encoding,
|
| 5 |
+
NormalizedString as NormalizedString,
|
| 6 |
+
PreTokenizedString as PreTokenizedString,
|
| 7 |
+
Regex as Regex,
|
| 8 |
+
Token as Token,
|
| 9 |
+
Tokenizer as Tokenizer,
|
| 10 |
+
__version__ as __version__,
|
| 11 |
+
decoders as decoders,
|
| 12 |
+
models as models,
|
| 13 |
+
normalizers as normalizers,
|
| 14 |
+
pre_tokenizers as pre_tokenizers,
|
| 15 |
+
processors as processors,
|
| 16 |
+
trainers as trainers,
|
| 17 |
+
)
|
source/tokenizers/tools/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
| 1 |
+
from .visualizer import Annotation, EncodingVisualizer
|
source/tokenizers/tools/visualizer-styles.css
ADDED
|
@@ -0,0 +1,170 @@
|
| 1 |
+
.tokenized-text {
|
| 2 |
+
width:100%;
|
| 3 |
+
padding:2rem;
|
| 4 |
+
max-height: 400px;
|
| 5 |
+
overflow-y: auto;
|
| 6 |
+
box-sizing:border-box;
|
| 7 |
+
line-height:4rem; /* Lots of space between lines */
|
| 8 |
+
font-family: "Roboto Light", "Ubuntu Light", "Ubuntu", monospace;
|
| 9 |
+
box-shadow: 2px 2px 2px rgba(0,0,0,0.2);
|
| 10 |
+
background-color: rgba(0,0,0,0.01);
|
| 11 |
+
letter-spacing:2px; /* Give some extra separation between chars */
|
| 12 |
+
}
|
| 13 |
+
.non-token{
|
| 14 |
+
/* White space and other things the tokenizer ignores*/
|
| 15 |
+
white-space: pre;
|
| 16 |
+
letter-spacing:4px;
|
| 17 |
+
border-top:1px solid #A0A0A0; /* A gentle border on top and bottom makes tabs more obvious*/
|
| 18 |
+
border-bottom:1px solid #A0A0A0;
|
| 19 |
+
line-height: 1rem;
|
| 20 |
+
height: calc(100% - 2px);
|
| 21 |
+
}
|
| 22 |
+
|
| 23 |
+
.token {
|
| 24 |
+
white-space: pre;
|
| 25 |
+
position:relative;
|
| 26 |
+
color:black;
|
| 27 |
+
letter-spacing:2px;
|
| 28 |
+
}
|
| 29 |
+
|
| 30 |
+
.annotation{
|
| 31 |
+
white-space:nowrap; /* Important - ensures that annotations appear even if the annotated text wraps a line */
|
| 32 |
+
border-radius:4px;
|
| 33 |
+
position:relative;
|
| 34 |
+
width:fit-content;
|
| 35 |
+
}
|
| 36 |
+
.annotation:before {
|
| 37 |
+
/*The before holds the text and the after holds the background*/
|
| 38 |
+
z-index:1000; /* Make sure this is above the background */
|
| 39 |
+
content:attr(data-label); /* The annotations label is on a data attribute */
|
| 40 |
+
color:white;
|
| 41 |
+
position:absolute;
|
| 42 |
+
font-size:1rem;
|
| 43 |
+
text-align:center;
|
| 44 |
+
font-weight:bold;
|
| 45 |
+
|
| 46 |
+
top:1.75rem;
|
| 47 |
+
line-height:0;
|
| 48 |
+
left:0;
|
| 49 |
+
width:100%;
|
| 50 |
+
padding:0.5rem 0;
|
| 51 |
+
/* These make it so an annotation doesn't stretch beyond the annotated text if the label is longer*/
|
| 52 |
+
overflow: hidden;
|
| 53 |
+
white-space: nowrap;
|
| 54 |
+
text-overflow:ellipsis;
|
| 55 |
+
}
|
| 56 |
+
|
| 57 |
+
.annotation:after {
|
| 58 |
+
content:attr(data-label); /* The content defines the width of the annotation*/
|
| 59 |
+
position:absolute;
|
| 60 |
+
font-size:0.75rem;
|
| 61 |
+
text-align:center;
|
| 62 |
+
font-weight:bold;
|
| 63 |
+
text-overflow:ellipsis;
|
| 64 |
+
top:1.75rem;
|
| 65 |
+
line-height:0;
|
| 66 |
+
overflow: hidden;
|
| 67 |
+
white-space: nowrap;
|
| 68 |
+
|
| 69 |
+
left:0;
|
| 70 |
+
width:100%; /* 100% of the parent, which is the annotation whose width is the tokens inside it*/
|
| 71 |
+
|
| 72 |
+
padding:0.5rem 0;
|
| 73 |
+
/* Nasty hack below:
|
| 74 |
+
We set the annotations color in code because we don't know the colors at css time.
|
| 75 |
+
But you can't pass a color as a data attribute to get it into the pseudo element (this thing)
|
| 76 |
+
So to get around that, annotations have the color set on them with a style attribute and then we
|
| 77 |
+
can get the color with currentColor.
|
| 78 |
+
Annotations wrap tokens and tokens set the color back to black
|
| 79 |
+
*/
|
| 80 |
+
background-color: currentColor;
|
| 81 |
+
}
|
| 82 |
+
.annotation:hover::after, .annotation:hover::before{
|
| 83 |
+
/* When the user hovers over an annotation expand the label to display in full
|
| 84 |
+
*/
|
| 85 |
+
min-width: fit-content;
|
| 86 |
+
}
|
| 87 |
+
|
| 88 |
+
.annotation:hover{
|
| 89 |
+
/* Emphasize the annotation start end with a border on hover*/
|
| 90 |
+
border-color: currentColor;
|
| 91 |
+
border: 2px solid;
|
| 92 |
+
}
|
| 93 |
+
.special-token:not(:empty){
|
| 94 |
+
/*
|
| 95 |
+
A none empty special token is like UNK (as opposed to CLS which has no representation in the text )
|
| 96 |
+
*/
|
| 97 |
+
position:relative;
|
| 98 |
+
}
|
| 99 |
+
.special-token:empty::before{
|
| 100 |
+
/* Special tokens that don't have text are displayed as pseudo elements so we don't select them with the mouse*/
|
| 101 |
+
content:attr(data-stok);
|
| 102 |
+
background:#202020;
|
| 103 |
+
font-size:0.75rem;
|
| 104 |
+
color:white;
|
| 105 |
+
margin: 0 0.25rem;
|
| 106 |
+
padding: 0.25rem;
|
| 107 |
+
border-radius:4px
|
| 108 |
+
}
|
| 109 |
+
|
| 110 |
+
.special-token:not(:empty):before {
|
| 111 |
+
/* Special tokens that have text (UNK) are displayed above the actual text*/
|
| 112 |
+
content:attr(data-stok);
|
| 113 |
+
position:absolute;
|
| 114 |
+
bottom:1.75rem;
|
| 115 |
+
min-width:100%;
|
| 116 |
+
width:100%;
|
| 117 |
+
height:1rem;
|
| 118 |
+
line-height:1rem;
|
| 119 |
+
font-size:1rem;
|
| 120 |
+
text-align:center;
|
| 121 |
+
color:white;
|
| 122 |
+
font-weight:bold;
|
| 123 |
+
background:#202020;
|
| 124 |
+
border-radius:10%;
|
| 125 |
+
}
|
| 126 |
+
/*
|
| 127 |
+
We want to alternate the color of tokens, but we can't use nth child because tokens might be broken up by annotations
|
| 128 |
+
instead we apply even and odd class at generation time and color them that way
|
| 129 |
+
*/
|
| 130 |
+
.even-token{
|
| 131 |
+
background:#DCDCDC ;
|
| 132 |
+
border: 1px solid #DCDCDC;
|
| 133 |
+
}
|
| 134 |
+
.odd-token{
|
| 135 |
+
background:#A0A0A0;
|
| 136 |
+
border: 1px solid #A0A0A0;
|
| 137 |
+
}
|
| 138 |
+
.even-token.multi-token,.odd-token.multi-token{
|
| 139 |
+
background: repeating-linear-gradient(
|
| 140 |
+
45deg,
|
| 141 |
+
transparent,
|
| 142 |
+
transparent 1px,
|
| 143 |
+
#ccc 1px,
|
| 144 |
+
#ccc 1px
|
| 145 |
+
),
|
| 146 |
+
/* on "bottom" */
|
| 147 |
+
linear-gradient(
|
| 148 |
+
to bottom,
|
| 149 |
+
#FFB6C1,
|
| 150 |
+
#999
|
| 151 |
+
);
|
| 152 |
+
}
|
| 153 |
+
|
| 154 |
+
.multi-token:hover::after {
|
| 155 |
+
content:"This char has more than 1 token"; /* The content defines the width of the annotation*/
|
| 156 |
+
color:white;
|
| 157 |
+
background-color: black;
|
| 158 |
+
position:absolute;
|
| 159 |
+
font-size:0.75rem;
|
| 160 |
+
text-align:center;
|
| 161 |
+
font-weight:bold;
|
| 162 |
+
text-overflow:ellipsis;
|
| 163 |
+
top:1.75rem;
|
| 164 |
+
line-height:0;
|
| 165 |
+
overflow: hidden;
|
| 166 |
+
white-space: nowrap;
|
| 167 |
+
left:0;
|
| 168 |
+
width:fit-content; /* 100% of the parent, which is the annotation whose width is the tokens inside it*/
|
| 169 |
+
padding:0.5rem 0;
|
| 170 |
+
}
|
source/tokenizers/tools/visualizer.py
ADDED
|
@@ -0,0 +1,407 @@
|
| 1 |
+
import itertools
|
| 2 |
+
import os
|
| 3 |
+
import re
|
| 4 |
+
from string import Template
|
| 5 |
+
from typing import Any, Callable, Dict, List, NamedTuple, Optional, Tuple
|
| 6 |
+
|
| 7 |
+
from tokenizers import Encoding, Tokenizer
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
dirname = os.path.dirname(__file__)
|
| 11 |
+
css_filename = os.path.join(dirname, "visualizer-styles.css")
|
| 12 |
+
with open(css_filename) as f:
|
| 13 |
+
css = f.read()
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class Annotation:
|
| 17 |
+
start: int
|
| 18 |
+
end: int
|
| 19 |
+
label: str
|
| 20 |
+
|
| 21 |
+
def __init__(self, start: int, end: int, label: str):
|
| 22 |
+
self.start = start
|
| 23 |
+
self.end = end
|
| 24 |
+
self.label = label
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
AnnotationList = List[Annotation]
|
| 28 |
+
PartialIntList = List[Optional[int]]
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
class CharStateKey(NamedTuple):
|
| 32 |
+
token_ix: Optional[int]
|
| 33 |
+
anno_ix: Optional[int]
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
class CharState:
|
| 37 |
+
char_ix: Optional[int]
|
| 38 |
+
|
| 39 |
+
def __init__(self, char_ix):
|
| 40 |
+
self.char_ix = char_ix
|
| 41 |
+
|
| 42 |
+
self.anno_ix: Optional[int] = None
|
| 43 |
+
self.tokens: List[int] = []
|
| 44 |
+
|
| 45 |
+
@property
|
| 46 |
+
def token_ix(self):
|
| 47 |
+
return self.tokens[0] if len(self.tokens) > 0 else None
|
| 48 |
+
|
| 49 |
+
@property
|
| 50 |
+
def is_multitoken(self):
|
| 51 |
+
"""
|
| 52 |
+
BPE tokenizers can output more than one token for a char
|
| 53 |
+
"""
|
| 54 |
+
return len(self.tokens) > 1
|
| 55 |
+
|
| 56 |
+
def partition_key(self) -> CharStateKey:
|
| 57 |
+
return CharStateKey(
|
| 58 |
+
token_ix=self.token_ix,
|
| 59 |
+
anno_ix=self.anno_ix,
|
| 60 |
+
)
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
class Aligned:
|
| 64 |
+
pass
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
class EncodingVisualizer:
|
| 68 |
+
"""
|
| 69 |
+
Build an EncodingVisualizer
|
| 70 |
+
|
| 71 |
+
Args:
|
| 72 |
+
|
| 73 |
+
tokenizer (:class:`~tokenizers.Tokenizer`):
|
| 74 |
+
A tokenizer instance
|
| 75 |
+
|
| 76 |
+
default_to_notebook (:obj:`bool`):
|
| 77 |
+
Whether to render html output in a notebook by default
|
| 78 |
+
|
| 79 |
+
annotation_converter (:obj:`Callable`, `optional`):
|
| 80 |
+
An optional (lambda) function that takes an annotation in any format and returns
|
| 81 |
+
an Annotation object
|
| 82 |
+
"""
|
| 83 |
+
|
| 84 |
+
unk_token_regex = re.compile(r"(.{1}\b)?(unk|oov)(\b.{1})?", flags=re.IGNORECASE)
|
| 85 |
+
|
| 86 |
+
def __init__(
|
| 87 |
+
self,
|
| 88 |
+
tokenizer: Tokenizer,
|
| 89 |
+
default_to_notebook: bool = True,
|
| 90 |
+
annotation_converter: Optional[Callable[[Any], Annotation]] = None,
|
| 91 |
+
):
|
| 92 |
+
if default_to_notebook:
|
| 93 |
+
try:
|
| 94 |
+
from IPython.core.display import HTML, display # type: ignore[attr-defined]
|
| 95 |
+
except ImportError:
|
| 96 |
+
raise Exception(
|
| 97 |
+
"""We couldn't import IPython utils for html display.
|
| 98 |
+
Are you running in a notebook?
|
| 99 |
+
You can also pass `default_to_notebook=False` to get back raw HTML
|
| 100 |
+
"""
|
| 101 |
+
)
|
| 102 |
+
|
| 103 |
+
self.tokenizer = tokenizer
|
| 104 |
+
self.default_to_notebook = default_to_notebook
|
| 105 |
+
self.annotation_coverter = annotation_converter
|
| 106 |
+
pass
|
| 107 |
+
|
| 108 |
+
def __call__(
|
| 109 |
+
self,
|
| 110 |
+
text: str,
|
| 111 |
+
annotations: Optional[List[Any]] = None,
|
| 112 |
+
default_to_notebook: Optional[bool] = None,
|
| 113 |
+
) -> Optional[str]:
|
| 114 |
+
"""
|
| 115 |
+
Build a visualization of the given text
|
| 116 |
+
|
| 117 |
+
Args:
|
| 118 |
+
text (:obj:`str`):
|
| 119 |
+
The text to tokenize
|
| 120 |
+
|
| 121 |
+
annotations (:obj:`List[Annotation]`, `optional`):
|
| 122 |
+
An optional list of annotations of the text. They can either be an annotation class
|
| 123 |
+
or anything else if you instantiated the visualizer with a converter function
|
| 124 |
+
|
| 125 |
+
default_to_notebook (:obj:`bool`, `optional`, defaults to `False`):
|
| 126 |
+
If True, will render the html in a notebook. Otherwise returns an html string.
|
| 127 |
+
|
| 128 |
+
Returns:
|
| 129 |
+
The HTML string if default_to_notebook is False, otherwise (default) returns None and
|
| 130 |
+
renders the HTML in the notebook
|
| 131 |
+
|
| 132 |
+
"""
|
| 133 |
+
final_default_to_notebook = self.default_to_notebook
|
| 134 |
+
if default_to_notebook is not None:
|
| 135 |
+
final_default_to_notebook = default_to_notebook
|
| 136 |
+
if final_default_to_notebook:
|
| 137 |
+
try:
|
| 138 |
+
from IPython.core.display import HTML, display # type: ignore[attr-defined]
|
| 139 |
+
except ImportError:
|
| 140 |
+
raise Exception(
|
| 141 |
+
"""We couldn't import IPython utils for html display.
|
| 142 |
+
Are you running in a notebook?"""
|
| 143 |
+
)
|
| 144 |
+
if annotations is None:
|
| 145 |
+
annotations = []
|
| 146 |
+
if self.annotation_coverter is not None:
|
| 147 |
+
annotations = list(map(self.annotation_coverter, annotations))
|
| 148 |
+
encoding = self.tokenizer.encode(text)
|
| 149 |
+
html = EncodingVisualizer.__make_html(text, encoding, annotations)
|
| 150 |
+
if final_default_to_notebook:
|
| 151 |
+
display(HTML(html))
|
| 152 |
+
else:
|
| 153 |
+
return html
|
| 154 |
+
|
| 155 |
+
@staticmethod
|
| 156 |
+
def calculate_label_colors(annotations: AnnotationList) -> Dict[str, str]:
|
| 157 |
+
"""
|
| 158 |
+
Generates a color palette for all the labels in a given set of annotations
|
| 159 |
+
|
| 160 |
+
Args:
|
| 161 |
+
annotations (:obj:`Annotation`):
|
| 162 |
+
A list of annotations
|
| 163 |
+
|
| 164 |
+
Returns:
|
| 165 |
+
:obj:`dict`: A dictionary mapping labels to colors in HSL format
|
| 166 |
+
"""
|
| 167 |
+
if len(annotations) == 0:
|
| 168 |
+
return {}
|
| 169 |
+
labels = set(map(lambda x: x.label, annotations))
|
| 170 |
+
num_labels = len(labels)
|
| 171 |
+
h_step = int(255 / num_labels)
|
| 172 |
+
if h_step < 20:
|
| 173 |
+
h_step = 20
|
| 174 |
+
s = 32
|
| 175 |
+
l = 64 # noqa: E741
|
| 176 |
+
h = 10
|
| 177 |
+
colors = {}
|
| 178 |
+
|
| 179 |
+
for label in sorted(labels): # sort so we always get the same colors for a given set of labels
|
| 180 |
+
colors[label] = f"hsl({h},{s}%,{l}%)"
|
| 181 |
+
h += h_step
|
| 182 |
+
return colors
|
| 183 |
+
|
| 184 |
+
@staticmethod
|
| 185 |
+
def consecutive_chars_to_html(
|
| 186 |
+
consecutive_chars_list: List[CharState],
|
| 187 |
+
text: str,
|
| 188 |
+
encoding: Encoding,
|
| 189 |
+
):
|
| 190 |
+
"""
|
| 191 |
+
Converts a list of "consecutive chars" into a single HTML element.
|
| 192 |
+
Chars are consecutive if they fall under the same word, token and annotation.
|
| 193 |
+
The CharState class is a named tuple with a "partition_key" method that makes it easy to
|
| 194 |
+
compare if two chars are consecutive.
|
| 195 |
+
|
| 196 |
+
Args:
|
| 197 |
+
consecutive_chars_list (:obj:`List[CharState]`):
|
| 198 |
+
A list of CharStates that have been grouped together
|
| 199 |
+
|
| 200 |
+
text (:obj:`str`):
|
| 201 |
+
The original text being processed
|
| 202 |
+
|
| 203 |
+
encoding (:class:`~tokenizers.Encoding`):
|
| 204 |
+
The encoding returned from the tokenizer
|
| 205 |
+
|
| 206 |
+
Returns:
|
| 207 |
+
:obj:`str`: The HTML span for a set of consecutive chars
|
| 208 |
+
"""
|
| 209 |
+
first = consecutive_chars_list[0]
|
| 210 |
+
if first.char_ix is None:
|
| 211 |
+
# it's a special token
|
| 212 |
+
stoken = encoding.tokens[first.token_ix]
|
| 213 |
+
# special tokens are represented as empty spans. We use the data attribute and css
|
| 214 |
+
# magic to display it
|
| 215 |
+
return f'<span class="special-token" data-stoken={stoken}></span>'
|
| 216 |
+
# We're not in a special token so this group has a start and end.
|
| 217 |
+
last = consecutive_chars_list[-1]
|
| 218 |
+
assert first.char_ix is not None
|
| 219 |
+
assert last.char_ix is not None
|
| 220 |
+
start = first.char_ix
|
| 221 |
+
end = last.char_ix + 1
|
| 222 |
+
span_text = text[start:end]
|
| 223 |
+
css_classes = [] # What css classes will we apply on the resulting span
|
| 224 |
+
data_items = {} # What data attributes will we apply on the result span
|
| 225 |
+
if first.token_ix is not None:
|
| 226 |
+
# We can either be in a token or not (e.g. in white space)
|
| 227 |
+
css_classes.append("token")
|
| 228 |
+
if first.is_multitoken:
|
| 229 |
+
css_classes.append("multi-token")
|
| 230 |
+
if first.token_ix % 2:
|
| 231 |
+
# We use this to color alternating tokens.
|
| 232 |
+
# A token might be split by an annotation that ends in the middle of it, so this
|
| 233 |
+
# lets us visually indicate a consecutive token despite its possible splitting in
|
| 234 |
+
# the html markup
|
| 235 |
+
css_classes.append("odd-token")
|
| 236 |
+
else:
|
| 237 |
+
# Like above, but a different color so we can see the tokens alternate
|
| 238 |
+
css_classes.append("even-token")
|
| 239 |
+
if EncodingVisualizer.unk_token_regex.search(encoding.tokens[first.token_ix]) is not None:
|
| 240 |
+
# This is a special token that is in the text. probably UNK
|
| 241 |
+
css_classes.append("special-token")
|
| 242 |
+
# TODO is this the right name for the data attribute ?
|
| 243 |
+
data_items["stok"] = encoding.tokens[first.token_ix]
|
| 244 |
+
else:
|
| 245 |
+
# In this case we are looking at a group/single char that is not tokenized.
|
| 246 |
+
# e.g. white space
|
| 247 |
+
css_classes.append("non-token")
|
| 248 |
+
css = f'''class="{" ".join(css_classes)}"'''
|
| 249 |
+
data = ""
|
| 250 |
+
for key, val in data_items.items():
|
| 251 |
+
data += f' data-{key}="{val}"'
|
| 252 |
+
return f"<span {css} {data} >{span_text}</span>"
|
| 253 |
+
|
| 254 |
+
@staticmethod
|
| 255 |
+
def __make_html(text: str, encoding: Encoding, annotations: AnnotationList) -> str:
|
| 256 |
+
char_states = EncodingVisualizer.__make_char_states(text, encoding, annotations)
|
| 257 |
+
current_consecutive_chars = [char_states[0]]
|
| 258 |
+
prev_anno_ix = char_states[0].anno_ix
|
| 259 |
+
spans = []
|
| 260 |
+
label_colors_dict = EncodingVisualizer.calculate_label_colors(annotations)
|
| 261 |
+
cur_anno_ix = char_states[0].anno_ix
|
| 262 |
+
if cur_anno_ix is not None:
|
| 263 |
+
# If we started in an annotation make a span for it
|
| 264 |
+
anno = annotations[cur_anno_ix]
|
| 265 |
+
label = anno.label
|
| 266 |
+
color = label_colors_dict[label]
|
| 267 |
+
spans.append(f'<span class="annotation" style="color:{color}" data-label="{label}">')
|
| 268 |
+
|
| 269 |
+
for cs in char_states[1:]:
|
| 270 |
+
cur_anno_ix = cs.anno_ix
|
| 271 |
+
if cur_anno_ix != prev_anno_ix:
|
| 272 |
+
# If we've transitioned in or out of an annotation
|
| 273 |
+
spans.append(
|
| 274 |
+
# Create a span from the current consecutive characters
|
| 275 |
+
EncodingVisualizer.consecutive_chars_to_html(
|
| 276 |
+
current_consecutive_chars,
|
| 277 |
+
text=text,
|
| 278 |
+
encoding=encoding,
|
| 279 |
+
)
|
| 280 |
+
)
|
| 281 |
+
current_consecutive_chars = [cs]
|
| 282 |
+
|
| 283 |
+
if prev_anno_ix is not None:
|
| 284 |
+
# if we transitioned out of an annotation, close its span
|
| 285 |
+
spans.append("</span>")
|
| 286 |
+
if cur_anno_ix is not None:
|
| 287 |
+
# If we entered a new annotation make a span for it
|
| 288 |
+
anno = annotations[cur_anno_ix]
|
| 289 |
+
label = anno.label
|
| 290 |
+
color = label_colors_dict[label]
|
| 291 |
+
spans.append(f'<span class="annotation" style="color:{color}" data-label="{label}">')
|
| 292 |
+
prev_anno_ix = cur_anno_ix
|
| 293 |
+
|
| 294 |
+
if cs.partition_key() == current_consecutive_chars[0].partition_key():
|
| 295 |
+
# If the current character is in the same "group" as the previous one
|
| 296 |
+
current_consecutive_chars.append(cs)
|
| 297 |
+
else:
|
| 298 |
+
# Otherwise we make a span for the previous group
|
| 299 |
+
spans.append(
|
| 300 |
+
EncodingVisualizer.consecutive_chars_to_html(
|
| 301 |
+
current_consecutive_chars,
|
| 302 |
+
text=text,
|
| 303 |
+
encoding=encoding,
|
| 304 |
+
)
|
| 305 |
+
)
|
| 306 |
+
# And reset the consecutive_char_list to form a new group
|
| 307 |
+
current_consecutive_chars = [cs]
|
| 308 |
+
# All that's left is to fill out the final span
|
| 309 |
+
# TODO I think there is an edge case here where an annotation's span might not close
|
| 310 |
+
spans.append(
|
| 311 |
+
EncodingVisualizer.consecutive_chars_to_html(
|
| 312 |
+
current_consecutive_chars,
|
| 313 |
+
text=text,
|
| 314 |
+
encoding=encoding,
|
| 315 |
+
)
|
| 316 |
+
)
|
| 317 |
+
res = HTMLBody(spans) # Send the list of spans to the body of our html
|
| 318 |
+
return res
|
| 319 |
+
|
| 320 |
+
@staticmethod
|
| 321 |
+
def __make_anno_map(text: str, annotations: AnnotationList) -> PartialIntList:
|
| 322 |
+
"""
|
| 323 |
+
Args:
|
| 324 |
+
text (:obj:`str`):
|
| 325 |
+
The raw text we want to align to
|
| 326 |
+
|
| 327 |
+
annotations (:obj:`AnnotationList`):
|
| 328 |
+
A (possibly empty) list of annotations
|
| 329 |
+
|
| 330 |
+
Returns:
|
| 331 |
+
A list of length len(text) whose entry at index i is None if there is no annotation on
|
| 332 |
+
character i or k, the index of the annotation that covers index i where k is with
|
| 333 |
+
respect to the list of annotations
|
| 334 |
+
"""
|
| 335 |
+
annotation_map = [None] * len(text)
|
| 336 |
+
for anno_ix, a in enumerate(annotations):
|
| 337 |
+
for i in range(a.start, a.end):
|
| 338 |
+
annotation_map[i] = anno_ix
|
| 339 |
+
return annotation_map
|
| 340 |
+
|
| 341 |
+
@staticmethod
|
| 342 |
+
def __make_char_states(text: str, encoding: Encoding, annotations: AnnotationList) -> List[CharState]:
|
| 343 |
+
"""
|
| 344 |
+
For each character in the original text, we emit a tuple representing its "state":
|
| 345 |
+
|
| 346 |
+
* which token_ix it corresponds to
|
| 347 |
+
* which word_ix it corresponds to
|
| 348 |
+
* which annotation_ix it corresponds to
|
| 349 |
+
|
| 350 |
+
Args:
|
| 351 |
+
text (:obj:`str`):
|
| 352 |
+
The raw text we want to align to
|
| 353 |
+
|
| 354 |
+
annotations (:obj:`List[Annotation]`):
|
| 355 |
+
A (possibly empty) list of annotations
|
| 356 |
+
|
| 357 |
+
encoding: (:class:`~tokenizers.Encoding`):
|
| 358 |
+
The encoding returned from the tokenizer
|
| 359 |
+
|
| 360 |
+
Returns:
|
| 361 |
+
:obj:`List[CharState]`: A list of CharStates, indicating for each char in the text what
|
| 362 |
+
its state is
|
| 363 |
+
"""
|
| 364 |
+
annotation_map = EncodingVisualizer.__make_anno_map(text, annotations)
|
| 365 |
+
# Todo make this a dataclass or named tuple
|
| 366 |
+
char_states: List[CharState] = [CharState(char_ix) for char_ix in range(len(text))]
|
| 367 |
+
for token_ix, token in enumerate(encoding.tokens):
|
| 368 |
+
offsets = encoding.token_to_chars(token_ix)
|
| 369 |
+
if offsets is not None:
|
| 370 |
+
start, end = offsets
|
| 371 |
+
for i in range(start, end):
|
| 372 |
+
char_states[i].tokens.append(token_ix)
|
| 373 |
+
for char_ix, anno_ix in enumerate(annotation_map):
|
| 374 |
+
char_states[char_ix].anno_ix = anno_ix
|
| 375 |
+
|
| 376 |
+
return char_states
|
| 377 |
+
|
| 378 |
+
|
| 379 |
+
def HTMLBody(children: List[str], css_styles=css) -> str:
|
| 380 |
+
"""
|
| 381 |
+
Generates the full html with css from a list of html spans
|
| 382 |
+
|
| 383 |
+
Args:
|
| 384 |
+
children (:obj:`List[str]`):
|
| 385 |
+
A list of strings, assumed to be html elements
|
| 386 |
+
|
| 387 |
+
css_styles (:obj:`str`, `optional`):
|
| 388 |
+
Optional alternative implementation of the css
|
| 389 |
+
|
| 390 |
+
Returns:
|
| 391 |
+
:obj:`str`: An HTML string with style markup
|
| 392 |
+
"""
|
| 393 |
+
children_text = "".join(children)
|
| 394 |
+
return f"""
|
| 395 |
+
<html>
|
| 396 |
+
<head>
|
| 397 |
+
<style>
|
| 398 |
+
{css_styles}
|
| 399 |
+
</style>
|
| 400 |
+
</head>
|
| 401 |
+
<body>
|
| 402 |
+
<div class="tokenized-text" dir=auto>
|
| 403 |
+
{children_text}
|
| 404 |
+
</div>
|
| 405 |
+
</body>
|
| 406 |
+
</html>
|
| 407 |
+
"""
|
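The visualizer file above is easiest to follow from a usage sketch. The snippet below is illustrative only and is not part of this diff: the `tokenizer.json` path, the sample text, and the annotation labels are assumptions; only `EncodingVisualizer` and `Annotation` come from the module added here.

```python
# Hypothetical usage of the EncodingVisualizer added in visualizer.py.
from tokenizers import Tokenizer
from tokenizers.tools import Annotation, EncodingVisualizer

# "tokenizer.json" is an assumed local file, not shipped in this diff.
tokenizer = Tokenizer.from_file("tokenizer.json")

# default_to_notebook=False makes the call return the raw HTML string
# instead of rendering it through IPython.
viz = EncodingVisualizer(tokenizer, default_to_notebook=False)

text = "Hugging Face is based in NYC"
annotations = [
    Annotation(start=0, end=12, label="ORG"),   # "Hugging Face"
    Annotation(start=25, end=28, label="LOC"),  # "NYC"
]

html = viz(text, annotations=annotations)
with open("encoding.html", "w") as f:
    f.write(html)
```

Tokens, annotations, and special tokens are rendered as nested `<span>` elements styled by `visualizer-styles.css`, so the resulting file can be opened directly in a browser.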
source/tokenizers/trainers/__init__.py
ADDED
|
@@ -0,0 +1,8 @@
# Generated content DO NOT EDIT
from .. import trainers

Trainer = trainers.Trainer
BpeTrainer = trainers.BpeTrainer
UnigramTrainer = trainers.UnigramTrainer
WordLevelTrainer = trainers.WordLevelTrainer
WordPieceTrainer = trainers.WordPieceTrainer
source/tokenizers/trainers/__init__.pyi
ADDED
|
@@ -0,0 +1,462 @@
| 1 |
+
# Generated content DO NOT EDIT
|
| 2 |
+
class Trainer:
|
| 3 |
+
"""
|
| 4 |
+
Base class for all trainers
|
| 5 |
+
|
| 6 |
+
This class is not supposed to be instantiated directly. Instead, any implementation of a
|
| 7 |
+
Trainer will return an instance of this class when instantiated.
|
| 8 |
+
"""
|
| 9 |
+
def __getstate__(self):
|
| 10 |
+
""" """
|
| 11 |
+
pass
|
| 12 |
+
|
| 13 |
+
def __setstate__(self, state):
|
| 14 |
+
""" """
|
| 15 |
+
pass
|
| 16 |
+
|
| 17 |
+
class BpeTrainer(Trainer):
|
| 18 |
+
"""
|
| 19 |
+
Trainer capable of training a BPE model
|
| 20 |
+
|
| 21 |
+
Args:
|
| 22 |
+
vocab_size (:obj:`int`, `optional`):
|
| 23 |
+
The size of the final vocabulary, including all tokens and alphabet.
|
| 24 |
+
|
| 25 |
+
min_frequency (:obj:`int`, `optional`):
|
| 26 |
+
The minimum frequency a pair should have in order to be merged.
|
| 27 |
+
|
| 28 |
+
show_progress (:obj:`bool`, `optional`):
|
| 29 |
+
Whether to show progress bars while training.
|
| 30 |
+
|
| 31 |
+
special_tokens (:obj:`List[Union[str, AddedToken]]`, `optional`):
|
| 32 |
+
A list of special tokens the model should know of.
|
| 33 |
+
|
| 34 |
+
limit_alphabet (:obj:`int`, `optional`):
|
| 35 |
+
The maximum different characters to keep in the alphabet.
|
| 36 |
+
|
| 37 |
+
initial_alphabet (:obj:`List[str]`, `optional`):
|
| 38 |
+
A list of characters to include in the initial alphabet, even
|
| 39 |
+
if not seen in the training dataset.
|
| 40 |
+
If the strings contain more than one character, only the first one
|
| 41 |
+
is kept.
|
| 42 |
+
|
| 43 |
+
continuing_subword_prefix (:obj:`str`, `optional`):
|
| 44 |
+
A prefix to be used for every subword that is not a beginning-of-word.
|
| 45 |
+
|
| 46 |
+
end_of_word_suffix (:obj:`str`, `optional`):
|
| 47 |
+
A suffix to be used for every subword that is an end-of-word.
|
| 48 |
+
|
| 49 |
+
max_token_length (:obj:`int`, `optional`):
|
| 50 |
+
Prevents creating tokens longer than the specified size.
|
| 51 |
+
This can help with reducing polluting your vocabulary with
|
| 52 |
+
highly repetitive tokens like `======` for wikipedia
|
| 53 |
+
|
| 54 |
+
"""
|
| 55 |
+
def __init__(
|
| 56 |
+
self,
|
| 57 |
+
vocab_size=30000,
|
| 58 |
+
min_frequency=0,
|
| 59 |
+
show_progress=True,
|
| 60 |
+
special_tokens=[],
|
| 61 |
+
limit_alphabet=None,
|
| 62 |
+
initial_alphabet=[],
|
| 63 |
+
continuing_subword_prefix=None,
|
| 64 |
+
end_of_word_suffix=None,
|
| 65 |
+
max_token_length=None,
|
| 66 |
+
words={},
|
| 67 |
+
):
|
| 68 |
+
pass
|
| 69 |
+
|
| 70 |
+
def __getstate__(self):
|
| 71 |
+
""" """
|
| 72 |
+
pass
|
| 73 |
+
|
| 74 |
+
def __setstate__(self, state):
|
| 75 |
+
""" """
|
| 76 |
+
pass
|
| 77 |
+
|
| 78 |
+
@property
|
| 79 |
+
def continuing_subword_prefix(self):
|
| 80 |
+
""" """
|
| 81 |
+
pass
|
| 82 |
+
|
| 83 |
+
@continuing_subword_prefix.setter
|
| 84 |
+
def continuing_subword_prefix(self, value):
|
| 85 |
+
""" """
|
| 86 |
+
pass
|
| 87 |
+
|
| 88 |
+
@property
|
| 89 |
+
def end_of_word_suffix(self):
|
| 90 |
+
""" """
|
| 91 |
+
pass
|
| 92 |
+
|
| 93 |
+
@end_of_word_suffix.setter
|
| 94 |
+
def end_of_word_suffix(self, value):
|
| 95 |
+
""" """
|
| 96 |
+
pass
|
| 97 |
+
|
| 98 |
+
@property
|
| 99 |
+
def initial_alphabet(self):
|
| 100 |
+
""" """
|
| 101 |
+
pass
|
| 102 |
+
|
| 103 |
+
@initial_alphabet.setter
|
| 104 |
+
def initial_alphabet(self, value):
|
| 105 |
+
""" """
|
| 106 |
+
pass
|
| 107 |
+
|
| 108 |
+
@property
|
| 109 |
+
def limit_alphabet(self):
|
| 110 |
+
""" """
|
| 111 |
+
pass
|
| 112 |
+
|
| 113 |
+
@limit_alphabet.setter
|
| 114 |
+
def limit_alphabet(self, value):
|
| 115 |
+
""" """
|
| 116 |
+
pass
|
| 117 |
+
|
| 118 |
+
@property
|
| 119 |
+
def max_token_length(self):
|
| 120 |
+
""" """
|
| 121 |
+
pass
|
| 122 |
+
|
| 123 |
+
@max_token_length.setter
|
| 124 |
+
def max_token_length(self, value):
|
| 125 |
+
""" """
|
| 126 |
+
pass
|
| 127 |
+
|
| 128 |
+
@property
|
| 129 |
+
def min_frequency(self):
|
| 130 |
+
""" """
|
| 131 |
+
pass
|
| 132 |
+
|
| 133 |
+
@min_frequency.setter
|
| 134 |
+
def min_frequency(self, value):
|
| 135 |
+
""" """
|
| 136 |
+
pass
|
| 137 |
+
|
| 138 |
+
@property
|
| 139 |
+
def show_progress(self):
|
| 140 |
+
""" """
|
| 141 |
+
pass
|
| 142 |
+
|
| 143 |
+
@show_progress.setter
|
| 144 |
+
def show_progress(self, value):
|
| 145 |
+
""" """
|
| 146 |
+
pass
|
| 147 |
+
|
| 148 |
+
@property
|
| 149 |
+
def special_tokens(self):
|
| 150 |
+
""" """
|
| 151 |
+
pass
|
| 152 |
+
|
| 153 |
+
@special_tokens.setter
|
| 154 |
+
def special_tokens(self, value):
|
| 155 |
+
""" """
|
| 156 |
+
pass
|
| 157 |
+
|
| 158 |
+
@property
|
| 159 |
+
def vocab_size(self):
|
| 160 |
+
""" """
|
| 161 |
+
pass
|
| 162 |
+
|
| 163 |
+
@vocab_size.setter
|
| 164 |
+
def vocab_size(self, value):
|
| 165 |
+
""" """
|
| 166 |
+
pass
|
| 167 |
+
|
| 168 |
+
class UnigramTrainer(Trainer):
|
| 169 |
+
"""
|
| 170 |
+
Trainer capable of training a Unigram model
|
| 171 |
+
|
| 172 |
+
Args:
|
| 173 |
+
vocab_size (:obj:`int`):
|
| 174 |
+
The size of the final vocabulary, including all tokens and alphabet.
|
| 175 |
+
|
| 176 |
+
show_progress (:obj:`bool`):
|
| 177 |
+
Whether to show progress bars while training.
|
| 178 |
+
|
| 179 |
+
special_tokens (:obj:`List[Union[str, AddedToken]]`):
|
| 180 |
+
A list of special tokens the model should know of.
|
| 181 |
+
|
| 182 |
+
initial_alphabet (:obj:`List[str]`):
|
| 183 |
+
A list of characters to include in the initial alphabet, even
|
| 184 |
+
if not seen in the training dataset.
|
| 185 |
+
If the strings contain more than one character, only the first one
|
| 186 |
+
is kept.
|
| 187 |
+
|
| 188 |
+
shrinking_factor (:obj:`float`):
|
| 189 |
+
The shrinking factor used at each step of the training to prune the
|
| 190 |
+
vocabulary.
|
| 191 |
+
|
| 192 |
+
unk_token (:obj:`str`):
|
| 193 |
+
The token used for out-of-vocabulary tokens.
|
| 194 |
+
|
| 195 |
+
max_piece_length (:obj:`int`):
|
| 196 |
+
The maximum length of a given token.
|
| 197 |
+
|
| 198 |
+
n_sub_iterations (:obj:`int`):
|
| 199 |
+
The number of iterations of the EM algorithm to perform before
|
| 200 |
+
pruning the vocabulary.
|
| 201 |
+
"""
|
| 202 |
+
def __init__(
|
| 203 |
+
self,
|
| 204 |
+
vocab_size=8000,
|
| 205 |
+
show_progress=True,
|
| 206 |
+
special_tokens=[],
|
| 207 |
+
initial_alphabet=[],
|
| 208 |
+
shrinking_factor=0.75,
|
| 209 |
+
unk_token=None,
|
| 210 |
+
max_piece_length=16,
|
| 211 |
+
n_sub_iterations=2,
|
| 212 |
+
):
|
| 213 |
+
pass
|
| 214 |
+
|
| 215 |
+
def __getstate__(self):
|
| 216 |
+
""" """
|
| 217 |
+
pass
|
| 218 |
+
|
| 219 |
+
def __setstate__(self, state):
|
| 220 |
+
""" """
|
| 221 |
+
pass
|
| 222 |
+
|
| 223 |
+
@property
|
| 224 |
+
def initial_alphabet(self):
|
| 225 |
+
""" """
|
| 226 |
+
pass
|
| 227 |
+
|
| 228 |
+
@initial_alphabet.setter
|
| 229 |
+
def initial_alphabet(self, value):
|
| 230 |
+
""" """
|
| 231 |
+
pass
|
| 232 |
+
|
| 233 |
+
@property
|
| 234 |
+
def show_progress(self):
|
| 235 |
+
""" """
|
| 236 |
+
pass
|
| 237 |
+
|
| 238 |
+
@show_progress.setter
|
| 239 |
+
def show_progress(self, value):
|
| 240 |
+
""" """
|
| 241 |
+
pass
|
| 242 |
+
|
| 243 |
+
@property
|
| 244 |
+
def special_tokens(self):
|
| 245 |
+
""" """
|
| 246 |
+
pass
|
| 247 |
+
|
| 248 |
+
@special_tokens.setter
|
| 249 |
+
def special_tokens(self, value):
|
| 250 |
+
""" """
|
| 251 |
+
pass
|
| 252 |
+
|
| 253 |
+
@property
|
| 254 |
+
def vocab_size(self):
|
| 255 |
+
""" """
|
| 256 |
+
pass
|
| 257 |
+
|
| 258 |
+
@vocab_size.setter
|
| 259 |
+
def vocab_size(self, value):
|
| 260 |
+
""" """
|
| 261 |
+
pass
|
| 262 |
+
|
| 263 |
+
class WordLevelTrainer(Trainer):
|
| 264 |
+
"""
|
| 265 |
+
Trainer capable of training a WordLevel model
|
| 266 |
+
|
| 267 |
+
Args:
|
| 268 |
+
vocab_size (:obj:`int`, `optional`):
|
| 269 |
+
The size of the final vocabulary, including all tokens and alphabet.
|
| 270 |
+
|
| 271 |
+
min_frequency (:obj:`int`, `optional`):
|
| 272 |
+
The minimum frequency a pair should have in order to be merged.
|
| 273 |
+
|
| 274 |
+
show_progress (:obj:`bool`, `optional`):
|
| 275 |
+
Whether to show progress bars while training.
|
| 276 |
+
|
| 277 |
+
special_tokens (:obj:`List[Union[str, AddedToken]]`):
|
| 278 |
+
A list of special tokens the model should know of.
|
| 279 |
+
"""
|
| 280 |
+
def __init__(self, vocab_size=30000, min_frequency=0, show_progress=True, special_tokens=[]):
|
| 281 |
+
pass
|
| 282 |
+
|
| 283 |
+
def __getstate__(self):
|
| 284 |
+
""" """
|
| 285 |
+
pass
|
| 286 |
+
|
| 287 |
+
def __setstate__(self, state):
|
| 288 |
+
""" """
|
| 289 |
+
pass
|
| 290 |
+
|
| 291 |
+
@property
|
| 292 |
+
def min_frequency(self):
|
| 293 |
+
""" """
|
| 294 |
+
pass
|
| 295 |
+
|
| 296 |
+
@min_frequency.setter
|
| 297 |
+
def min_frequency(self, value):
|
| 298 |
+
""" """
|
| 299 |
+
pass
|
| 300 |
+
|
| 301 |
+
@property
|
| 302 |
+
def show_progress(self):
|
| 303 |
+
""" """
|
| 304 |
+
pass
|
| 305 |
+
|
| 306 |
+
@show_progress.setter
|
| 307 |
+
def show_progress(self, value):
|
| 308 |
+
""" """
|
| 309 |
+
pass
|
| 310 |
+
|
| 311 |
+
@property
|
| 312 |
+
def special_tokens(self):
|
| 313 |
+
""" """
|
| 314 |
+
pass
|
| 315 |
+
|
| 316 |
+
@special_tokens.setter
|
| 317 |
+
def special_tokens(self, value):
|
| 318 |
+
""" """
|
| 319 |
+
pass
|
| 320 |
+
|
| 321 |
+
@property
|
| 322 |
+
def vocab_size(self):
|
| 323 |
+
""" """
|
| 324 |
+
pass
|
| 325 |
+
|
| 326 |
+
@vocab_size.setter
|
| 327 |
+
def vocab_size(self, value):
|
| 328 |
+
""" """
|
| 329 |
+
pass
|
| 330 |
+
|
| 331 |
+
class WordPieceTrainer(Trainer):
|
| 332 |
+
"""
|
| 333 |
+
Trainer capable of training a WordPiece model
|
| 334 |
+
|
| 335 |
+
Args:
|
| 336 |
+
vocab_size (:obj:`int`, `optional`):
|
| 337 |
+
The size of the final vocabulary, including all tokens and alphabet.
|
| 338 |
+
|
| 339 |
+
min_frequency (:obj:`int`, `optional`):
|
| 340 |
+
The minimum frequency a pair should have in order to be merged.
|
| 341 |
+
|
| 342 |
+
show_progress (:obj:`bool`, `optional`):
|
| 343 |
+
Whether to show progress bars while training.
|
| 344 |
+
|
| 345 |
+
special_tokens (:obj:`List[Union[str, AddedToken]]`, `optional`):
|
| 346 |
+
A list of special tokens the model should know of.
|
| 347 |
+
|
| 348 |
+
limit_alphabet (:obj:`int`, `optional`):
|
| 349 |
+
The maximum different characters to keep in the alphabet.
|
| 350 |
+
|
| 351 |
+
initial_alphabet (:obj:`List[str]`, `optional`):
|
| 352 |
+
A list of characters to include in the initial alphabet, even
|
| 353 |
+
if not seen in the training dataset.
|
| 354 |
+
If the strings contain more than one character, only the first one
|
| 355 |
+
is kept.
|
| 356 |
+
|
| 357 |
+
continuing_subword_prefix (:obj:`str`, `optional`):
|
| 358 |
+
A prefix to be used for every subword that is not a beginning-of-word.
|
| 359 |
+
|
| 360 |
+
end_of_word_suffix (:obj:`str`, `optional`):
|
| 361 |
+
A suffix to be used for every subword that is an end-of-word.
|
| 362 |
+
"""
|
| 363 |
+
def __init__(
|
| 364 |
+
self,
|
| 365 |
+
vocab_size=30000,
|
| 366 |
+
min_frequency=0,
|
| 367 |
+
show_progress=True,
|
| 368 |
+
special_tokens=[],
|
| 369 |
+
limit_alphabet=None,
|
| 370 |
+
initial_alphabet=[],
|
| 371 |
+
continuing_subword_prefix="##",
|
| 372 |
+
end_of_word_suffix=None,
|
| 373 |
+
):
|
| 374 |
+
pass
|
| 375 |
+
|
| 376 |
+
def __getstate__(self):
|
| 377 |
+
""" """
|
| 378 |
+
pass
|
| 379 |
+
|
| 380 |
+
def __setstate__(self, state):
|
| 381 |
+
""" """
|
| 382 |
+
pass
|
| 383 |
+
|
| 384 |
+
@property
|
| 385 |
+
def continuing_subword_prefix(self):
|
| 386 |
+
""" """
|
| 387 |
+
pass
|
| 388 |
+
|
| 389 |
+
@continuing_subword_prefix.setter
|
| 390 |
+
def continuing_subword_prefix(self, value):
|
| 391 |
+
""" """
|
| 392 |
+
pass
|
| 393 |
+
|
| 394 |
+
@property
|
| 395 |
+
def end_of_word_suffix(self):
|
| 396 |
+
""" """
|
| 397 |
+
pass
|
| 398 |
+
|
| 399 |
+
@end_of_word_suffix.setter
|
| 400 |
+
def end_of_word_suffix(self, value):
|
| 401 |
+
""" """
|
| 402 |
+
pass
|
| 403 |
+
|
| 404 |
+
@property
|
| 405 |
+
def initial_alphabet(self):
|
| 406 |
+
""" """
|
| 407 |
+
pass
|
| 408 |
+
|
| 409 |
+
@initial_alphabet.setter
|
| 410 |
+
def initial_alphabet(self, value):
|
| 411 |
+
""" """
|
| 412 |
+
pass
|
| 413 |
+
|
| 414 |
+
@property
|
| 415 |
+
def limit_alphabet(self):
|
| 416 |
+
""" """
|
| 417 |
+
pass
|
| 418 |
+
|
| 419 |
+
@limit_alphabet.setter
|
| 420 |
+
def limit_alphabet(self, value):
|
| 421 |
+
""" """
|
| 422 |
+
pass
|
| 423 |
+
|
| 424 |
+
@property
|
| 425 |
+
def min_frequency(self):
|
| 426 |
+
""" """
|
| 427 |
+
pass
|
| 428 |
+
|
| 429 |
+
@min_frequency.setter
|
| 430 |
+
def min_frequency(self, value):
|
| 431 |
+
""" """
|
| 432 |
+
pass
|
| 433 |
+
|
| 434 |
+
@property
|
| 435 |
+
def show_progress(self):
|
| 436 |
+
""" """
|
| 437 |
+
pass
|
| 438 |
+
|
| 439 |
+
@show_progress.setter
|
| 440 |
+
def show_progress(self, value):
|
| 441 |
+
""" """
|
| 442 |
+
pass
|
| 443 |
+
|
| 444 |
+
@property
|
| 445 |
+
def special_tokens(self):
|
| 446 |
+
""" """
|
| 447 |
+
pass
|
| 448 |
+
|
| 449 |
+
@special_tokens.setter
|
| 450 |
+
def special_tokens(self, value):
|
| 451 |
+
""" """
|
| 452 |
+
pass
|
| 453 |
+
|
| 454 |
+
@property
|
| 455 |
+
def vocab_size(self):
|
| 456 |
+
""" """
|
| 457 |
+
pass
|
| 458 |
+
|
| 459 |
+
@vocab_size.setter
|
| 460 |
+
def vocab_size(self, value):
|
| 461 |
+
""" """
|
| 462 |
+
pass
|
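The stub file above only documents the trainer signatures, so a short sketch may help show how they are typically wired together. This is an illustrative example, not part of the diff: the corpus file names, special-token list, and parameter values are assumptions, and `BPE` and `Whitespace` come from other tokenizers submodules.

```python
# Hypothetical sketch: training a BPE tokenizer with BpeTrainer.
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import BpeTrainer

tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Whitespace()

trainer = BpeTrainer(
    vocab_size=30000,   # final vocabulary size, including special tokens and alphabet
    min_frequency=2,    # only merge pairs seen at least twice
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
)

# train() consumes plain-text files; the paths below are placeholders.
tokenizer.train(["corpus-part-1.txt", "corpus-part-2.txt"], trainer=trainer)
tokenizer.save("bpe-tokenizer.json")
```

The other trainers (`UnigramTrainer`, `WordLevelTrainer`, `WordPieceTrainer`) follow the same pattern, each paired with its matching model class.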
source/torchaudio-2.9.1.dist-info/INSTALLER
ADDED
|
@@ -0,0 +1 @@
pip
source/torchaudio-2.9.1.dist-info/METADATA
ADDED
|
@@ -0,0 +1,133 @@
| 1 |
+
Metadata-Version: 2.4
|
| 2 |
+
Name: torchaudio
|
| 3 |
+
Version: 2.9.1
|
| 4 |
+
Summary: An audio package for PyTorch
|
| 5 |
+
Home-page: https://github.com/pytorch/audio
|
| 6 |
+
Author: Soumith Chintala, David Pollack, Sean Naren, Peter Goldsborough, Moto Hira, Caroline Chen, Jeff Hwang, Zhaoheng Ni, Xiaohui Zhang
|
| 7 |
+
Author-email: soumith@pytorch.org
|
| 8 |
+
Maintainer: Moto Hira, Caroline Chen, Jeff Hwang, Zhaoheng Ni, Xiaohui Zhang
|
| 9 |
+
Maintainer-email: moto@meta.com
|
| 10 |
+
Classifier: Environment :: Plugins
|
| 11 |
+
Classifier: Intended Audience :: Developers
|
| 12 |
+
Classifier: Intended Audience :: Science/Research
|
| 13 |
+
Classifier: License :: OSI Approved :: BSD License
|
| 14 |
+
Classifier: Operating System :: MacOS :: MacOS X
|
| 15 |
+
Classifier: Operating System :: Microsoft :: Windows
|
| 16 |
+
Classifier: Operating System :: POSIX
|
| 17 |
+
Classifier: Programming Language :: C++
|
| 18 |
+
Classifier: Programming Language :: Python :: 3.10
|
| 19 |
+
Classifier: Programming Language :: Python :: 3.11
|
| 20 |
+
Classifier: Programming Language :: Python :: 3.12
|
| 21 |
+
Classifier: Programming Language :: Python :: 3.13
|
| 22 |
+
Classifier: Programming Language :: Python :: 3.14
|
| 23 |
+
Classifier: Programming Language :: Python :: Implementation :: CPython
|
| 24 |
+
Classifier: Topic :: Multimedia :: Sound/Audio
|
| 25 |
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
| 26 |
+
Description-Content-Type: text/markdown
|
| 27 |
+
License-File: LICENSE
|
| 28 |
+
Requires-Dist: torch==2.9.1
|
| 29 |
+
Dynamic: author
|
| 30 |
+
Dynamic: author-email
|
| 31 |
+
Dynamic: classifier
|
| 32 |
+
Dynamic: description
|
| 33 |
+
Dynamic: description-content-type
|
| 34 |
+
Dynamic: home-page
|
| 35 |
+
Dynamic: license-file
|
| 36 |
+
Dynamic: maintainer
|
| 37 |
+
Dynamic: maintainer-email
|
| 38 |
+
Dynamic: requires-dist
|
| 39 |
+
Dynamic: summary
|
| 40 |
+
|
| 41 |
+
torchaudio: an audio library for PyTorch
|
| 42 |
+
========================================
|
| 43 |
+
|
| 44 |
+
[](https://pytorch.org/audio/main/)
|
| 45 |
+
[](https://anaconda.org/pytorch/torchaudio)
|
| 46 |
+
[](https://anaconda.org/pytorch/torchaudio)
|
| 47 |
+
|
| 48 |
+

|
| 49 |
+
|
| 50 |
+
> [!NOTE]
|
| 51 |
+
> **We have transitioned TorchAudio into a
|
| 52 |
+
> maintenance phase. This process removed some user-facing
|
| 53 |
+
> features. These features were deprecated from TorchAudio 2.8 and removed in 2.9.
|
| 54 |
+
> Our main goals were to reduce redundancies with the rest of the
|
| 55 |
+
> PyTorch ecosystem, make it easier to maintain, and create a version of
|
| 56 |
+
> TorchAudio that is more tightly scoped to its strengths: processing audio
|
| 57 |
+
> data for ML. Please see
|
| 58 |
+
> [our community message](https://github.com/pytorch/audio/issues/3902)
|
| 59 |
+
> for more details.**
|
| 60 |
+
|
| 61 |
+
The aim of torchaudio is to apply [PyTorch](https://github.com/pytorch/pytorch) to
|
| 62 |
+
the audio domain. By supporting PyTorch, torchaudio follows the same philosophy
|
| 63 |
+
of providing strong GPU acceleration, having a focus on trainable features through
|
| 64 |
+
the autograd system, and having consistent style (tensor names and dimension names).
|
| 65 |
+
Therefore, it is primarily a machine learning library and not a general signal
|
| 66 |
+
processing library. The benefits of PyTorch can be seen in torchaudio through
|
| 67 |
+
having all the computations be through PyTorch operations which makes it easy
|
| 68 |
+
to use and feel like a natural extension.
|
| 69 |
+
|
| 70 |
+
- [Dataloaders for common audio datasets](http://pytorch.org/audio/main/datasets.html)
|
| 71 |
+
- Audio and speech processing functions
|
| 72 |
+
- [forced_align](https://pytorch.org/audio/main/generated/torchaudio.functional.forced_align.html)
|
| 73 |
+
- Common audio transforms
|
| 74 |
+
- [Spectrogram, AmplitudeToDB, MelScale, MelSpectrogram, MFCC, MuLawEncoding, MuLawDecoding, Resample](http://pytorch.org/audio/main/transforms.html)
|
| 75 |
+
- Compliance interfaces: Run code using PyTorch that aligns with other libraries
|
| 76 |
+
- [Kaldi: spectrogram, fbank, mfcc](https://pytorch.org/audio/main/compliance.kaldi.html)
|
| 77 |
+
|
| 78 |
+
Installation
|
| 79 |
+
------------
|
| 80 |
+
|
| 81 |
+
Please refer to https://pytorch.org/audio/main/installation.html for installation and build process of TorchAudio.
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
API Reference
|
| 85 |
+
-------------
|
| 86 |
+
|
| 87 |
+
API Reference is located here: http://pytorch.org/audio/main/
|
| 88 |
+
|
| 89 |
+
Contributing Guidelines
|
| 90 |
+
-----------------------
|
| 91 |
+
|
| 92 |
+
Please refer to [CONTRIBUTING.md](./CONTRIBUTING.md)
|
| 93 |
+
|
| 94 |
+
Citation
|
| 95 |
+
--------
|
| 96 |
+
|
| 97 |
+
If you find this package useful, please cite as:
|
| 98 |
+
|
| 99 |
+
```bibtex
|
| 100 |
+
@article{yang2021torchaudio,
|
| 101 |
+
title={TorchAudio: Building Blocks for Audio and Speech Processing},
|
| 102 |
+
author={Yao-Yuan Yang and Moto Hira and Zhaoheng Ni and Anjali Chourdia and Artyom Astafurov and Caroline Chen and Ching-Feng Yeh and Christian Puhrsch and David Pollack and Dmitriy Genzel and Donny Greenberg and Edward Z. Yang and Jason Lian and Jay Mahadeokar and Jeff Hwang and Ji Chen and Peter Goldsborough and Prabhat Roy and Sean Narenthiran and Shinji Watanabe and Soumith Chintala and Vincent Quenneville-Bélair and Yangyang Shi},
|
| 103 |
+
journal={arXiv preprint arXiv:2110.15018},
|
| 104 |
+
year={2021}
|
| 105 |
+
}
|
| 106 |
+
```
|
| 107 |
+
|
| 108 |
+
```bibtex
|
| 109 |
+
@misc{hwang2023torchaudio,
|
| 110 |
+
title={TorchAudio 2.1: Advancing speech recognition, self-supervised learning, and audio processing components for PyTorch},
|
| 111 |
+
author={Jeff Hwang and Moto Hira and Caroline Chen and Xiaohui Zhang and Zhaoheng Ni and Guangzhi Sun and Pingchuan Ma and Ruizhe Huang and Vineel Pratap and Yuekai Zhang and Anurag Kumar and Chin-Yun Yu and Chuang Zhu and Chunxi Liu and Jacob Kahn and Mirco Ravanelli and Peng Sun and Shinji Watanabe and Yangyang Shi and Yumeng Tao and Robin Scheibler and Samuele Cornell and Sean Kim and Stavros Petridis},
|
| 112 |
+
year={2023},
|
| 113 |
+
eprint={2310.17864},
|
| 114 |
+
archivePrefix={arXiv},
|
| 115 |
+
primaryClass={eess.AS}
|
| 116 |
+
}
|
| 117 |
+
```
|
| 118 |
+
|
| 119 |
+
Disclaimer on Datasets
|
| 120 |
+
----------------------
|
| 121 |
+
|
| 122 |
+
This is a utility library that downloads and prepares public datasets. We do not host or distribute these datasets, vouch for their quality or fairness, or claim that you have license to use the dataset. It is your responsibility to determine whether you have permission to use the dataset under the dataset's license.
|
| 123 |
+
|
| 124 |
+
If you're a dataset owner and wish to update any part of it (description, citation, etc.), or do not want your dataset to be included in this library, please get in touch through a GitHub issue. Thanks for your contribution to the ML community!
|
| 125 |
+
|
| 126 |
+
Pre-trained Model License
|
| 127 |
+
-------------------------
|
| 128 |
+
|
| 129 |
+
The pre-trained models provided in this library may have their own licenses or terms and conditions derived from the dataset used for training. It is your responsibility to determine whether you have permission to use the models for your use case.
|
| 130 |
+
|
| 131 |
+
For instance, SquimSubjective model is released under the Creative Commons Attribution Non Commercial 4.0 International (CC-BY-NC 4.0) license. See [the link](https://zenodo.org/record/4660670#.ZBtWPOxuerN) for additional details.
|
| 132 |
+
|
| 133 |
+
Other pre-trained models that have different licenses are noted in the documentation. Please check out the [documentation page](https://pytorch.org/audio/main/).
|
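As a quick orientation to the transforms mentioned in the README above, here is a minimal sketch. It is illustrative only: the waveform is synthetic random data and the parameter values are arbitrary assumptions, not defaults taken from this package.

```python
# Hypothetical sketch: chaining two torchaudio transforms named in the README.
import torch
import torchaudio.transforms as T

sample_rate = 16000
waveform = torch.randn(1, sample_rate)  # one second of fake mono audio

resample = T.Resample(orig_freq=sample_rate, new_freq=8000)  # downsample to 8 kHz
melspec = T.MelSpectrogram(sample_rate=8000, n_mels=64)      # mel filterbank features

features = melspec(resample(waveform))
print(features.shape)  # (channels, n_mels, time_frames)
```

Because the transforms are built from PyTorch operations as `torch.nn.Module`s, they run on GPU and participate in autograd, in line with the README's description.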
source/torchaudio-2.9.1.dist-info/RECORD
ADDED
|
@@ -0,0 +1,166 @@
| 1 |
+
torchaudio-2.9.1.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
|
| 2 |
+
torchaudio-2.9.1.dist-info/METADATA,sha256=nhbW4EcMPskMVtj653CC9bZ2xsogo4xjD7mkmp7K9Sg,6911
|
| 3 |
+
torchaudio-2.9.1.dist-info/RECORD,,
|
| 4 |
+
torchaudio-2.9.1.dist-info/WHEEL,sha256=VXvNKn6nFeCM45GEUrNLJOO_J_e-cNJphGt9rWFxyE0,113
|
| 5 |
+
torchaudio-2.9.1.dist-info/licenses/LICENSE,sha256=k6WIYahYzBCOa2uDPgjnbosqZjOeSoAHyKWowf-cQNY,1338
|
| 6 |
+
torchaudio-2.9.1.dist-info/top_level.txt,sha256=mPKWMIRWWW2JwbJN6wRckeN1gpbjhifapAF0Z9t7SMo,11
|
| 7 |
+
torchaudio/__init__.py,sha256=8OB3EPGCViF7LgBWy_bUyZUF6HJUIpbTI8ouRGwn6lU,7878
|
| 8 |
+
torchaudio/__pycache__/__init__.cpython-312.pyc,,
|
| 9 |
+
torchaudio/__pycache__/_torchcodec.cpython-312.pyc,,
|
| 10 |
+
torchaudio/__pycache__/version.cpython-312.pyc,,
|
| 11 |
+
torchaudio/_extension/__init__.py,sha256=A8oH7eF2Fx4d68LddkFE1Ylq3AE3X2sgZdXjvaMEdjQ,1905
|
| 12 |
+
torchaudio/_extension/__pycache__/__init__.cpython-312.pyc,,
|
| 13 |
+
torchaudio/_extension/__pycache__/utils.cpython-312.pyc,,
|
| 14 |
+
torchaudio/_extension/utils.py,sha256=UQCObmKAsgdHhXU2dQYYxyFXwfdTsBO9bnrQmpQNN_I,4926
|
| 15 |
+
torchaudio/_internal/__init__.py,sha256=gjU8g9HhVd9hHrHXJM0xOlZL6cT8ktO60MN8RHI6ZbA,241
|
| 16 |
+
torchaudio/_internal/__pycache__/__init__.cpython-312.pyc,,
|
| 17 |
+
torchaudio/_internal/__pycache__/module_utils.cpython-312.pyc,,
|
| 18 |
+
torchaudio/_internal/module_utils.py,sha256=eosQSGtN5WhHhATJGBWJIGUM_nvtgLPRkQ8BH_Zd53o,5229
|
| 19 |
+
torchaudio/_torchcodec.py,sha256=Z1TpONctbL80DufuWhLRj4dC0rVhjKu6hOYeglcLwvU,13424
|
| 20 |
+
torchaudio/compliance/__init__.py,sha256=hhNObUS0c-fS-VMudM7zl3-CvupvCDmESlikntSMn5g,48
|
| 21 |
+
torchaudio/compliance/__pycache__/__init__.cpython-312.pyc,,
|
| 22 |
+
torchaudio/compliance/__pycache__/kaldi.cpython-312.pyc,,
|
| 23 |
+
torchaudio/compliance/kaldi.py,sha256=XL6hpYTd6nSPb2imIdeU4TM06I2fqh1AmG968y8ZbSk,36666
|
| 24 |
+
torchaudio/datasets/__init__.py,sha256=taRr3duDaEK1Pfzj9N1dFuZpXfy8e4uFItcJiRLAQwQ,1171
|
| 25 |
+
torchaudio/datasets/__pycache__/__init__.cpython-312.pyc,,
|
| 26 |
+
torchaudio/datasets/__pycache__/cmuarctic.cpython-312.pyc,,
|
| 27 |
+
torchaudio/datasets/__pycache__/cmudict.cpython-312.pyc,,
|
| 28 |
+
torchaudio/datasets/__pycache__/commonvoice.cpython-312.pyc,,
|
| 29 |
+
torchaudio/datasets/__pycache__/dr_vctk.cpython-312.pyc,,
|
| 30 |
+
torchaudio/datasets/__pycache__/fluentcommands.cpython-312.pyc,,
|
| 31 |
+
torchaudio/datasets/__pycache__/gtzan.cpython-312.pyc,,
|
| 32 |
+
torchaudio/datasets/__pycache__/iemocap.cpython-312.pyc,,
|
| 33 |
+
torchaudio/datasets/__pycache__/librilight_limited.cpython-312.pyc,,
|
| 34 |
+
torchaudio/datasets/__pycache__/librimix.cpython-312.pyc,,
|
| 35 |
+
torchaudio/datasets/__pycache__/librispeech.cpython-312.pyc,,
|
| 36 |
+
torchaudio/datasets/__pycache__/librispeech_biasing.cpython-312.pyc,,
|
| 37 |
+
torchaudio/datasets/__pycache__/libritts.cpython-312.pyc,,
|
| 38 |
+
torchaudio/datasets/__pycache__/ljspeech.cpython-312.pyc,,
|
| 39 |
+
torchaudio/datasets/__pycache__/musdb_hq.cpython-312.pyc,,
|
| 40 |
+
torchaudio/datasets/__pycache__/quesst14.cpython-312.pyc,,
|
| 41 |
+
torchaudio/datasets/__pycache__/snips.cpython-312.pyc,,
|
| 42 |
+
torchaudio/datasets/__pycache__/speechcommands.cpython-312.pyc,,
|
| 43 |
+
torchaudio/datasets/__pycache__/tedlium.cpython-312.pyc,,
|
| 44 |
+
torchaudio/datasets/__pycache__/utils.cpython-312.pyc,,
|
| 45 |
+
torchaudio/datasets/__pycache__/vctk.cpython-312.pyc,,
|
| 46 |
+
torchaudio/datasets/__pycache__/voxceleb1.cpython-312.pyc,,
|
| 47 |
+
torchaudio/datasets/__pycache__/yesno.cpython-312.pyc,,
|
| 48 |
+
torchaudio/datasets/cmuarctic.py,sha256=2e5Oh_jDHRs8ORhNONsD9NhI_OfQSHDLQAM-tWpgZ-U,7081
|
| 49 |
+
torchaudio/datasets/cmudict.py,sha256=9OEpNDYpyqeEyinAnyGIU8FampDj7ziSOHRwJLIlq2M,5990
|
| 50 |
+
torchaudio/datasets/commonvoice.py,sha256=9khedUCmdEkCKPU6_r8VWz6I2VdJokatuziZ6BxJMZs,2763
|
| 51 |
+
torchaudio/datasets/dr_vctk.py,sha256=Km4-tKllAgnOKCuq66YRWhTlNWmC7D0Xz3dAttRRGSo,4377
|
| 52 |
+
torchaudio/datasets/fluentcommands.py,sha256=u3tkO4-AAaTWdbRQi6lIvad4x2plZgXM39KljGtmRsw,3245
|
| 53 |
+
torchaudio/datasets/gtzan.py,sha256=I5dRP_QGuQ1joXWRwZwtvpwi22uZTb8QZm9Mr2W55Mg,24357
|
| 54 |
+
torchaudio/datasets/iemocap.py,sha256=X_WCoXOzRqcWRRRoUtY0AlD9SJcUUOACIcgbV0irt48,4930
|
| 55 |
+
torchaudio/datasets/librilight_limited.py,sha256=fAwpX0hEMze5aV57BP7rjBLwRiZa3Aje_NXi_3o16wA,4179
|
| 56 |
+
torchaudio/datasets/librimix.py,sha256=VtKOhf6VJc1ysWCvUvh0SbtjOkXJChmBM_BhoSkg_2A,5116
|
| 57 |
+
torchaudio/datasets/librispeech.py,sha256=zkzJFWchWs4AktYAI-ghmWH4ZeJ84C0uDo9E1_pTgSI,6308
|
| 58 |
+
torchaudio/datasets/librispeech_biasing.py,sha256=d-02tyrXI-CSGbXBFYFcnM_yT8WSGABHfpNiFxyadL0,6958
|
| 59 |
+
torchaudio/datasets/libritts.py,sha256=EtWOoCDz7_qGLZF5YcZfnHaLxH4Y8QJCnopafLiqFno,5870
|
| 60 |
+
torchaudio/datasets/ljspeech.py,sha256=92NeLQsC1iKpqfiMkKKbcJDpaYdZKVdVEBQJze1wmxY,3494
|
| 61 |
+
torchaudio/datasets/musdb_hq.py,sha256=TYKjpat6JKr9bkFqUecu7_hRdshRfQP2UbknaYR3Q0U,5075
|
| 62 |
+
torchaudio/datasets/quesst14.py,sha256=QyGd4fMS820ATbP8YgBtu7bSSK09pw5RZklsPJ8Jf0Y,4455
|
| 63 |
+
torchaudio/datasets/snips.py,sha256=WaYUknGFM3rnLklOj5ZYHSX5mhlf_Ce4p3LBZdA9yJc,5008
|
| 64 |
+
torchaudio/datasets/speechcommands.py,sha256=cLSgiVYlQjEOuYPpFeAtcXSGirraH4IMoP8p9WIvUoY,7481
|
| 65 |
+
torchaudio/datasets/tedlium.py,sha256=a8Hf2QvOki7_chgXcMAFMk-piTjodktfnc3HRbUVJkU,8698
|
| 66 |
+
torchaudio/datasets/utils.py,sha256=P6nckh2YrAfOPMphHlxyfI-HBmNg39DTlxQ8-asG4MY,1703
|
| 67 |
+
torchaudio/datasets/vctk.py,sha256=twR_n8LyQcT8A_HrJoMx3RkaVrRXXZAnIVU1d0E0npQ,5699
|
| 68 |
+
torchaudio/datasets/voxceleb1.py,sha256=9vU0ftB4-2usO8ZiEUKR_IQTEdHhA0M8l9scXCNehnw,11725
|
| 69 |
+
torchaudio/datasets/yesno.py,sha256=4sgfMeSxz8HaRDk6A2UIFP-20q29MwEO_r8DoEtfbvE,3026
|
| 70 |
+
torchaudio/functional/__init__.py,sha256=_5eT3FZFO6GXmKqFkPY4c_w7F7Isqnd8CTP2FdMxfVM,2451
|
| 71 |
+
torchaudio/functional/__pycache__/__init__.cpython-312.pyc,,
|
| 72 |
+
torchaudio/functional/__pycache__/_alignment.cpython-312.pyc,,
|
| 73 |
+
torchaudio/functional/__pycache__/filtering.cpython-312.pyc,,
|
| 74 |
+
torchaudio/functional/__pycache__/functional.cpython-312.pyc,,
|
| 75 |
+
torchaudio/functional/_alignment.py,sha256=NveQ74x8PmleuB-Ka9eEYYyshbV7nYc0g-Tu3NGHdz0,4739
|
| 76 |
+
torchaudio/functional/filtering.py,sha256=rML8MismfehSeglw65kUkfugoP6XDtWcs_XhCl6aJM4,62325
|
| 77 |
+
torchaudio/functional/functional.py,sha256=5l-07BLVAs1PNU8NM2CPV_GTnq3V8nbV9tI7t0v79Y4,94731
|
| 78 |
+
torchaudio/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
| 79 |
+
torchaudio/lib/__pycache__/__init__.cpython-312.pyc,,
|
| 80 |
+
torchaudio/lib/_torchaudio.so,sha256=70IIA3F6xzE4xCzUrpWAGTfHu1KqWzIM1KxU4dDTTsg,171552
|
| 81 |
+
torchaudio/lib/libctc_prefix_decoder.so,sha256=1DlwTtNZXE_P0zsHvoVP7mBzpSpUJFaSSaaQjg8oX0E,6268592
|
| 82 |
+
torchaudio/lib/libtorchaudio.so,sha256=L8j8al4FAtxyb5w5Arp5eo-jpVlM2HcswimRW7c2mII,2573624
|
| 83 |
+
torchaudio/lib/pybind11_prefixctc.so,sha256=VdBdtmt8GU2y1ulK-S4oLR5jWYA5K0PnlEBSqrV4F3A,277688
|
| 84 |
+
torchaudio/models/__init__.py,sha256=BNMNGuwpJAFRsdtwHYQ6slGClkrUTu31_7mXh7FjeV4,1995
|
| 85 |
+
torchaudio/models/__pycache__/__init__.cpython-312.pyc,,
|
| 86 |
+
torchaudio/models/__pycache__/_hdemucs.cpython-312.pyc,,
|
| 87 |
+
torchaudio/models/__pycache__/conformer.cpython-312.pyc,,
|
| 88 |
+
torchaudio/models/__pycache__/conv_tasnet.cpython-312.pyc,,
|
| 89 |
+
torchaudio/models/__pycache__/deepspeech.cpython-312.pyc,,
|
| 90 |
+
torchaudio/models/__pycache__/emformer.cpython-312.pyc,,
|
| 91 |
+
torchaudio/models/__pycache__/rnnt.cpython-312.pyc,,
|
| 92 |
+
torchaudio/models/__pycache__/rnnt_decoder.cpython-312.pyc,,
|
| 93 |
+
torchaudio/models/__pycache__/tacotron2.cpython-312.pyc,,
|
| 94 |
+
torchaudio/models/__pycache__/wav2letter.cpython-312.pyc,,
|
| 95 |
+
torchaudio/models/__pycache__/wavernn.cpython-312.pyc,,
|
| 96 |
+
torchaudio/models/_hdemucs.py,sha256=VPnQ73lA9lfAxRjZ85NCGJYP36mPNwTjS-TU4qelu_k,38242
|
| 97 |
+
torchaudio/models/conformer.py,sha256=5IceU-jcZKofkHTTqRKoytubQ75MzZPrPlfkLsIlxeA,10068
|
| 98 |
+
torchaudio/models/conv_tasnet.py,sha256=v-DI_Ej9FCBBbSH-Spkh3tzq8rkBhbQNA-Wp52Uf32E,12540
|
| 99 |
+
torchaudio/models/decoder/__init__.py,sha256=HxU2Bgyea0No8SORRfxgMZNwwEDTrjlT3bDW_GxzpTU,1899
|
| 100 |
+
torchaudio/models/decoder/__pycache__/__init__.cpython-312.pyc,,
|
| 101 |
+
torchaudio/models/decoder/__pycache__/_ctc_decoder.cpython-312.pyc,,
|
| 102 |
+
torchaudio/models/decoder/__pycache__/_cuda_ctc_decoder.cpython-312.pyc,,
|
| 103 |
+
torchaudio/models/decoder/_ctc_decoder.py,sha256=AmLQAcm4Q4bFPqnq-SF7Lpvg2QPK88xyio8ol_OJjvU,20086
|
| 104 |
+
torchaudio/models/decoder/_cuda_ctc_decoder.py,sha256=xFrj1cTEsS-MxAO5Vgdutcb3kTb7Jv-OFhS6cmfFKhA,7186
|
| 105 |
+
torchaudio/models/deepspeech.py,sha256=kQW3B6YcjYuq7xRzWjRJFGr7ZNraY9gMYDTxII7Cgtg,2746
|
| 106 |
+
torchaudio/models/emformer.py,sha256=ncDeEcYegUmIKQoDBoufUhVWj4dYpZAXxLX0qmEqt1A,37766
|
| 107 |
+
torchaudio/models/rnnt.py,sha256=jz66nwDd1qGT6KQR1lbA_urPktygewhm0FH66T7P3Ek,35541
|
| 108 |
+
torchaudio/models/rnnt_decoder.py,sha256=IwlDsuw1SA-uCRrXGMBqm05auGFSha2bZ-8BOImnK0c,12839
|
| 109 |
+
torchaudio/models/squim/__init__.py,sha256=b98nAaL28Q4w3lrqd_6wUd0An-xNhhJn4Tj8oZlzQnc,346
|
| 110 |
+
torchaudio/models/squim/__pycache__/__init__.cpython-312.pyc,,
|
| 111 |
+
torchaudio/models/squim/__pycache__/objective.cpython-312.pyc,,
|
| 112 |
+
torchaudio/models/squim/__pycache__/subjective.cpython-312.pyc,,
|
| 113 |
+
torchaudio/models/squim/objective.py,sha256=gvUasz7RpqgKeGf04yHUotshSIzH3KzjW90-iHeDo2g,12281
|
| 114 |
+
torchaudio/models/squim/subjective.py,sha256=N00kILSPm0akWyNsrNYKmHgZmooo8gbyUm5IVLf7bx8,5797
|
| 115 |
+
torchaudio/models/tacotron2.py,sha256=FimYhGSI8FKwWb87CLk4h3yKWatCU2HvFmU1t5WUn4E,45914
|
| 116 |
+
torchaudio/models/wav2letter.py,sha256=KNcq4p0qZG2Bwfdakv7YwLCvi_yGT-qB4fJwGMuFQhg,3278
|
| 117 |
+
torchaudio/models/wav2vec2/__init__.py,sha256=WlafukV6GwuSNh0CZifrYUt4V5l59kjvGX7AZNonjfk,927
|
| 118 |
+
torchaudio/models/wav2vec2/__pycache__/__init__.cpython-312.pyc,,
|
| 119 |
+
torchaudio/models/wav2vec2/__pycache__/components.cpython-312.pyc,,
|
| 120 |
+
torchaudio/models/wav2vec2/__pycache__/model.cpython-312.pyc,,
|
| 121 |
+
torchaudio/models/wav2vec2/__pycache__/wavlm_attention.cpython-312.pyc,,
|
| 122 |
+
torchaudio/models/wav2vec2/components.py,sha256=DRmW-GHYf-JReCg_0l1ovNWJBnAavePO3S2vPY-1ze4,47077
|
| 123 |
+
torchaudio/models/wav2vec2/model.py,sha256=Z2VN6KbDOOdq5JtP7lxPQebwYqsxKms1Eu4IjDJtZaQ,60092
|
| 124 |
+
torchaudio/models/wav2vec2/utils/__init__.py,sha256=qmMbz4HAN5kEEyl4cSGm_JQZI47beyh4witydPC_qns,181
|
| 125 |
+
torchaudio/models/wav2vec2/utils/__pycache__/__init__.cpython-312.pyc,,
|
| 126 |
+
torchaudio/models/wav2vec2/utils/__pycache__/import_fairseq.cpython-312.pyc,,
|
| 127 |
+
torchaudio/models/wav2vec2/utils/__pycache__/import_huggingface.cpython-312.pyc,,
|
| 128 |
+
torchaudio/models/wav2vec2/utils/import_fairseq.py,sha256=oCwG6qpG0bCXue2V56fjDcC8cA2rgy4b3O_nu_FI9ZY,9198
|
| 129 |
+
torchaudio/models/wav2vec2/utils/import_huggingface.py,sha256=1nVCipp-lOUAyl_-P103DWLUeTOZi9X_ffX93bOXxEk,5946
|
| 130 |
+
torchaudio/models/wav2vec2/wavlm_attention.py,sha256=1DU_pkoLCeHQwSF4lJ06cez0PsMVoXNxiYKP0Yv0qFQ,10844
|
| 131 |
+
torchaudio/models/wavernn.py,sha256=5xUyao5g69jRXX4ReNi4mP_aTSIonJPP6XcPrqKybEk,15446
|
| 132 |
+
torchaudio/pipelines/__init__.py,sha256=Xy8NmInKwTcNBHwLTTjHjrfczRLuQq8a67ENt1OTVXM,2745
|
| 133 |
+
torchaudio/pipelines/__pycache__/__init__.cpython-312.pyc,,
|
| 134 |
+
torchaudio/pipelines/__pycache__/_source_separation_pipeline.cpython-312.pyc,,
|
| 135 |
+
torchaudio/pipelines/__pycache__/_squim_pipeline.cpython-312.pyc,,
|
| 136 |
+
torchaudio/pipelines/__pycache__/rnnt_pipeline.cpython-312.pyc,,
|
| 137 |
+
torchaudio/pipelines/_source_separation_pipeline.py,sha256=ogWakvaOv6OegmREcbagvfIm0jNWjzEtsdMYTialRNk,4225
|
| 138 |
+
torchaudio/pipelines/_squim_pipeline.py,sha256=852SYXqUZDgTPegL7LqgVQr0PXG94da_DTDF2bwDhVE,6282
|
| 139 |
+
torchaudio/pipelines/_tts/__init__.py,sha256=PP7l8XzVURqelwuMJFgfOCv4fvzZunDiy90ZQlRkv7g,426
|
| 140 |
+
torchaudio/pipelines/_tts/__pycache__/__init__.cpython-312.pyc,,
|
| 141 |
+
torchaudio/pipelines/_tts/__pycache__/impl.cpython-312.pyc,,
|
| 142 |
+
torchaudio/pipelines/_tts/__pycache__/interface.cpython-312.pyc,,
|
| 143 |
+
torchaudio/pipelines/_tts/__pycache__/utils.cpython-312.pyc,,
|
| 144 |
+
torchaudio/pipelines/_tts/impl.py,sha256=Tig4_5sITJADwxN5eZGek7Ath_-e3sV8CTM5t6UpeUU,15374
|
| 145 |
+
torchaudio/pipelines/_tts/interface.py,sha256=yUaS0UK3PTRruYXRWFil7lAhr-1iYiyBaDBLmEnJPUQ,10224
|
| 146 |
+
torchaudio/pipelines/_tts/utils.py,sha256=KGrFoetCZ4l4FJkINFptAc8Pvrbo9e4QQhCIMCp8NYY,4810
|
| 147 |
+
torchaudio/pipelines/_wav2vec2/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
| 148 |
+
torchaudio/pipelines/_wav2vec2/__pycache__/__init__.cpython-312.pyc,,
|
| 149 |
+
torchaudio/pipelines/_wav2vec2/__pycache__/aligner.cpython-312.pyc,,
|
| 150 |
+
torchaudio/pipelines/_wav2vec2/__pycache__/impl.cpython-312.pyc,,
|
| 151 |
+
torchaudio/pipelines/_wav2vec2/__pycache__/utils.cpython-312.pyc,,
|
| 152 |
+
torchaudio/pipelines/_wav2vec2/aligner.py,sha256=pIWRgQ-kdYUxtL8bdc0qk9wBjwRrHY1uSWL3L4e2vxs,2709
|
| 153 |
+
torchaudio/pipelines/_wav2vec2/impl.py,sha256=zdXFjytJO5MvnB-3aygzUUFKxCTkQGU_OX_rhUh9c0k,65561
|
| 154 |
+
torchaudio/pipelines/_wav2vec2/utils.py,sha256=Q8_fWOR2JDnHu0TTRmHzRjI3BOJa0hGIAl0cjtALgsQ,6971
|
| 155 |
+
torchaudio/pipelines/rnnt_pipeline.py,sha256=56nQnCcjY4xewDqXR1Rkrh_hyoK42CsYumpU8mUNs1w,13753
|
| 156 |
+
torchaudio/transforms/__init__.py,sha256=8_47qPRjXNg332f2kcNP_T5UXCn6jQmUUMkIgyIByjY,1398
|
| 157 |
+
torchaudio/transforms/__pycache__/__init__.cpython-312.pyc,,
|
| 158 |
+
torchaudio/transforms/__pycache__/_multi_channel.cpython-312.pyc,,
|
| 159 |
+
torchaudio/transforms/__pycache__/_transforms.cpython-312.pyc,,
|
| 160 |
+
torchaudio/transforms/_multi_channel.py,sha256=GZ2rrwFt2KtSG7At7kS9Bqh1KmYYw0HwcUnEjc-AWr8,22221
|
| 161 |
+
torchaudio/transforms/_transforms.py,sha256=i-xEARqCfnaDk9b0yzmYkPo9Gg1N1iKvZiLSMdX14-Q,86919
|
| 162 |
+
torchaudio/utils/__init__.py,sha256=adAdfYm9DJBC2JXxRCTrjxOUU1vKJ9w3rFke-DzKKqU,70
|
| 163 |
+
torchaudio/utils/__pycache__/__init__.cpython-312.pyc,,
|
| 164 |
+
torchaudio/utils/__pycache__/download.cpython-312.pyc,,
|
| 165 |
+
torchaudio/utils/download.py,sha256=gZA7CijUoAu3Q0Qd6dKpFQAEjcdnxR6xOT59lTgEIOo,2883
|
| 166 |
+
torchaudio/version.py,sha256=sBUsm0oAwNdEIPgVIrOs5KflkkUDSnUNB0usP957SGE,85
|
source/torchaudio-2.9.1.dist-info/WHEEL
ADDED
|
@@ -0,0 +1,5 @@
Wheel-Version: 1.0
Generator: setuptools (80.9.0)
Root-Is-Purelib: false
Tag: cp312-cp312-manylinux_2_28_x86_64
source/torchaudio-2.9.1.dist-info/licenses/LICENSE
ADDED
|
@@ -0,0 +1,25 @@
BSD 2-Clause License

Copyright (c) 2017 Facebook Inc. (Soumith Chintala),
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
  list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
source/torchaudio-2.9.1.dist-info/top_level.txt
ADDED
@@ -0,0 +1 @@
torchaudio
source/torchaudio/__init__.py
ADDED
@@ -0,0 +1,204 @@
import os
from typing import BinaryIO, Optional, Tuple, Union

import torch

# Initialize extension and backend first
from . import _extension  # noqa  # usort: skip
from . import compliance, datasets, functional, models, pipelines, transforms, utils  # noqa: F401
from ._torchcodec import load_with_torchcodec, save_with_torchcodec


try:
    from .version import __version__, git_version  # noqa: F401
except ImportError:
    pass


def load(
    uri: Union[BinaryIO, str, os.PathLike],
    frame_offset: int = 0,
    num_frames: int = -1,
    normalize: bool = True,
    channels_first: bool = True,
    format: Optional[str] = None,
    buffer_size: int = 4096,
    backend: Optional[str] = None,
) -> Tuple[torch.Tensor, int]:
    """Load audio data from source using TorchCodec's AudioDecoder.

    .. note::

        As of TorchAudio 2.9, this function relies on TorchCodec's decoding capabilities under the hood. It is
        provided for convenience, but we do recommend that you port your code to
        natively use ``torchcodec``'s ``AudioDecoder`` class for better
        performance:
        https://docs.pytorch.org/torchcodec/stable/generated/torchcodec.decoders.AudioDecoder.
        Because of the reliance on TorchCodec, the parameters ``normalize``, ``buffer_size``, and
        ``backend`` are ignored and accepted only for backwards compatibility.
        To install torchcodec, follow the instructions at https://github.com/pytorch/torchcodec#installing-torchcodec.


    Args:
        uri (path-like object or file-like object):
            Source of audio data. The following types are accepted:

            * ``path-like``: File path or URL.
            * ``file-like``: Object with ``read(size: int) -> bytes`` method.

        frame_offset (int, optional):
            Number of samples to skip before start reading data.
        num_frames (int, optional):
            Maximum number of samples to read. ``-1`` reads all the remaining samples,
            starting from ``frame_offset``.
        normalize (bool, optional):
            TorchCodec always returns normalized float32 samples. This parameter
            is ignored and a warning is issued if set to False.
            Default: ``True``.
        channels_first (bool, optional):
            When True, the returned Tensor has dimension `[channel, time]`.
            Otherwise, the returned Tensor's dimension is `[time, channel]`.
        format (str or None, optional):
            Format hint for the decoder. May not be supported by all TorchCodec
            decoders. (Default: ``None``)
        buffer_size (int, optional):
            Not used by TorchCodec AudioDecoder. Provided for API compatibility.
        backend (str or None, optional):
            Not used by TorchCodec AudioDecoder. Provided for API compatibility.

    Returns:
        (torch.Tensor, int): Resulting Tensor and sample rate.
            Always returns float32 tensors. If ``channels_first=True``, shape is
            `[channel, time]`, otherwise `[time, channel]`.

    Raises:
        ImportError: If torchcodec is not available.
        ValueError: If unsupported parameters are used.
        RuntimeError: If TorchCodec fails to decode the audio.

    Note:
        - TorchCodec always returns normalized float32 samples, so the ``normalize``
          parameter has no effect.
        - The ``buffer_size`` and ``backend`` parameters are ignored.
        - Not all audio formats supported by torchaudio backends may be supported
          by TorchCodec.
    """
    return load_with_torchcodec(
        uri,
        frame_offset=frame_offset,
        num_frames=num_frames,
        normalize=normalize,
        channels_first=channels_first,
        format=format,
        buffer_size=buffer_size,
        backend=backend,
    )


def save(
    uri: Union[str, os.PathLike],
    src: torch.Tensor,
    sample_rate: int,
    channels_first: bool = True,
    format: Optional[str] = None,
    encoding: Optional[str] = None,
    bits_per_sample: Optional[int] = None,
    buffer_size: int = 4096,
    backend: Optional[str] = None,
    compression: Optional[Union[float, int]] = None,
) -> None:
    """Save audio data to file using TorchCodec's AudioEncoder.

    .. note::

        As of TorchAudio 2.9, this function relies on TorchCodec's encoding capabilities under the hood.
        It is provided for convenience, but we do recommend that you port your code to
        natively use ``torchcodec``'s ``AudioEncoder`` class for better
        performance:
        https://docs.pytorch.org/torchcodec/stable/generated/torchcodec.encoders.AudioEncoder.
        Because of the reliance on TorchCodec, the parameters ``format``, ``encoding``,
        ``bits_per_sample``, ``buffer_size``, and ``backend`` are ignored and accepted only for
        backwards compatibility.
        To install torchcodec, follow the instructions at https://github.com/pytorch/torchcodec#installing-torchcodec.

    Args:
        uri (path-like object):
            Path to save the audio file. The file extension determines the format.

        src (torch.Tensor):
            Audio data to save. Must be a 1D or 2D tensor with float32 values
            in the range [-1, 1]. If 2D, shape should be [channel, time] when
            channels_first=True, or [time, channel] when channels_first=False.

        sample_rate (int):
            Sample rate of the audio data.

        channels_first (bool, optional):
            Indicates whether the input tensor has channels as the first dimension.
            If True, expects [channel, time]. If False, expects [time, channel].
            Default: True.

        format (str or None, optional):
            Audio format hint. Not used by TorchCodec (format is determined by
            file extension). A warning is issued if provided.
            Default: None.

        encoding (str or None, optional):
            Audio encoding. Not fully supported by TorchCodec AudioEncoder.
            A warning is issued if provided. Default: None.

        bits_per_sample (int or None, optional):
            Bits per sample. Not directly supported by TorchCodec AudioEncoder.
            A warning is issued if provided. Default: None.

        buffer_size (int, optional):
            Not used by TorchCodec AudioEncoder. Provided for API compatibility.
            A warning is issued if not default value. Default: 4096.

        backend (str or None, optional):
            Not used by TorchCodec AudioEncoder. Provided for API compatibility.
            A warning is issued if provided. Default: None.

        compression (float, int or None, optional):
            Compression level or bit rate. Maps to bit_rate parameter in
            TorchCodec AudioEncoder. Default: None.

    Raises:
        ImportError: If torchcodec is not available.
        ValueError: If input parameters are invalid.
        RuntimeError: If TorchCodec fails to encode the audio.

    Note:
        - TorchCodec AudioEncoder expects float32 samples in [-1, 1] range.
        - Some parameters (format, encoding, bits_per_sample, buffer_size, backend)
          are not used by TorchCodec but are provided for API compatibility.
        - The output format is determined by the file extension in the uri.
        - TorchCodec uses FFmpeg under the hood for encoding.
    """
    return save_with_torchcodec(
        uri,
        src,
        sample_rate,
        channels_first=channels_first,
        format=format,
        encoding=encoding,
        bits_per_sample=bits_per_sample,
        buffer_size=buffer_size,
        backend=backend,
        compression=compression,
    )


__all__ = [
    "load",
    "load_with_torchcodec",
    "save_with_torchcodec",
    "save",
    "compliance",
    "datasets",
    "functional",
    "models",
    "pipelines",
    "utils",
    "transforms",
]
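
A minimal round-trip sketch of the load/save wrappers defined in the file above. This is an illustrative note, not part of the packaged sources; "sample.wav" and "clip.mp3" are hypothetical local paths, and torchcodec must be installed for either call to succeed.

import torchaudio

# Decode: always returns float32 samples, shaped [channel, time] by default.
waveform, sample_rate = torchaudio.load("sample.wav")
print(waveform.shape, sample_rate)

# Re-encode the first second; the output format is inferred from the ".mp3"
# extension, and `compression` is forwarded as a bit rate (value assumed here).
torchaudio.save("clip.mp3", waveform[:, :sample_rate], sample_rate, compression=128_000)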
source/torchaudio/_extension/__init__.py
ADDED
@@ -0,0 +1,61 @@
import logging
import os
import sys

from torchaudio._internal.module_utils import fail_with_message, is_module_available, no_op

from .utils import _check_cuda_version, _init_dll_path, _load_lib

_LG = logging.getLogger(__name__)


# Note:
# `_check_cuda_version` is not meant to be used by regular users.
# Builder uses it for debugging purpose, so we export it.
# https://github.com/pytorch/builder/blob/e2e4542b8eb0bdf491214451a1a4128bd606cce2/test/smoke_test/smoke_test.py#L80
__all__ = [
    "_check_cuda_version",
    "_IS_TORCHAUDIO_EXT_AVAILABLE",
    "_IS_RIR_AVAILABLE",
]


if os.name == "nt" and (3, 8) <= sys.version_info < (3, 9):
    _init_dll_path()


# When the extension module is built, we initialize it.
# In case of an error, we do not catch the failure as it suggests there is something
# wrong with the installation.
_IS_TORCHAUDIO_EXT_AVAILABLE = is_module_available("torchaudio.lib._torchaudio")
# RIR features are implemented in _torchaudio extension, but they can be individually
# turned on/off at build time. Available means that _torchaudio is loaded properly, and
# RIR features are found there.
_IS_RIR_AVAILABLE = False
_IS_ALIGN_AVAILABLE = False
if _IS_TORCHAUDIO_EXT_AVAILABLE:
    _load_lib("libtorchaudio")

    import torchaudio.lib._torchaudio  # noqa

    _check_cuda_version()
    _IS_RIR_AVAILABLE = torchaudio.lib._torchaudio.is_rir_available()
    _IS_ALIGN_AVAILABLE = torchaudio.lib._torchaudio.is_align_available()


fail_if_no_rir = (
    no_op
    if _IS_RIR_AVAILABLE
    else fail_with_message(
        "requires RIR extension, but TorchAudio is not compiled with it. Please build TorchAudio with RIR support."
    )
)

fail_if_no_align = (
    no_op
    if _IS_ALIGN_AVAILABLE
    else fail_with_message(
        "Requires alignment extension, but TorchAudio is not compiled with it. \
        Please build TorchAudio with alignment support."
    )
)
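
A hypothetical sketch of how the `fail_if_no_rir` guard defined above is meant to be applied; `simulate_rir` is a made-up function name used only for illustration.

from torchaudio._extension import fail_if_no_rir

@fail_if_no_rir
def simulate_rir(room, source, mic_array):
    ...

# With the RIR extension built, fail_if_no_rir is the no_op decorator and the
# function runs normally; without it, calling simulate_rir(...) raises a
# RuntimeError telling the user to rebuild TorchAudio with RIR support.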
source/torchaudio/_extension/utils.py
ADDED
@@ -0,0 +1,133 @@
"""Module to implement logic used for initializing extensions.

The implementations here should be stateless.
They should not depend on external state.
Anything that depends on external state should happen in __init__.py
"""
import logging
import os
import types
from pathlib import Path

import torch

_LG = logging.getLogger(__name__)
_LIB_DIR = Path(__file__).parent.parent / "lib"


def _get_lib_path(lib: str):
    suffix = "pyd" if os.name == "nt" else "so"
    path = _LIB_DIR / f"{lib}.{suffix}"
    return path


def _load_lib(lib: str) -> bool:
    """Load extension module

    Note:
        In case `torchaudio` is deployed with `pex` format, the library file
        is not in a standard location.
        In this case, we expect that `libtorchaudio` is available somewhere
        in the search path of dynamic loading mechanism, so that importing
        `_torchaudio` will have library loader find and load `libtorchaudio`.
        This is the reason why the function should not raise an error when the library
        file is not found.

    Returns:
        bool:
            True if the library file is found AND the library loaded without failure.
            False if the library file is not found (like in the case where torchaudio
            is deployed with pex format, thus the shared library file is
            in a non-standard location.).
            If the library file is found but there is an issue loading the library,
            (such as missing dependency) then this function raises the exception as-is.

    Raises:
        Exception:
            If the library file is found, but there is an issue loading the library file,
            (when underlying `ctype.DLL` throws an exception), this function will pass
            the exception as-is, instead of catching it and returning bool.
            The expected case is `OSError` thrown by `ctype.DLL` when a dynamic dependency
            is not found.
            This behavior was chosen because the expected failure case is not recoverable.
            If a dependency is missing, then users have to install it.
    """
    path = _get_lib_path(lib)
    if not path.exists():
        return False
    torch.ops.load_library(path)
    return True


class _LazyImporter(types.ModuleType):
    """Lazily import module/extension."""

    def __init__(self, name, import_func):
        super().__init__(name)
        self.import_func = import_func
        self.module = None

    # Note:
    # Python caches what was retrieved with `__getattr__`, so this method will not be
    # called again for the same item.
    def __getattr__(self, item):
        self._import_once()
        return getattr(self.module, item)

    def __repr__(self):
        if self.module is None:
            return f"<module '{self.__module__}.{self.__class__.__name__}(\"{self.name}\")'>"
        return repr(self.module)

    def __dir__(self):
        self._import_once()
        return dir(self.module)

    def _import_once(self):
        if self.module is None:
            self.module = self.import_func()
            # Note:
            # By attaching the module attributes to self,
            # module attributes are directly accessible.
            # This allows to avoid calling __getattr__ for every attribute access.
            self.__dict__.update(self.module.__dict__)

    def is_available(self):
        try:
            self._import_once()
        except Exception:
            return False
        return True


def _init_dll_path():
    # On Windows Python-3.8+ has `os.add_dll_directory` call,
    # which is called to configure dll search path.
    # To find cuda related dlls we need to make sure the
    # conda environment/bin path is configured. Please take a look:
    # https://stackoverflow.com/questions/59330863/cant-import-dll-module-in-python
    # Please note: if some path can't be added using add_dll_directory we simply ignore this path
    for path in os.environ.get("PATH", "").split(";"):
        if os.path.exists(path):
            try:
                os.add_dll_directory(path)
            except Exception:
                pass


def _check_cuda_version():
    import torchaudio.lib._torchaudio

    version = torchaudio.lib._torchaudio.cuda_version()
    if version is not None and torch.version.cuda is not None:
        version_str = str(version)
        ta_version = f"{version_str[:-3]}.{version_str[-2]}"
        t_version = torch.version.cuda.split(".")
        t_version = f"{t_version[0]}.{t_version[1]}"
        if ta_version != t_version:
            raise RuntimeError(
                "Detected that PyTorch and TorchAudio were compiled with different CUDA versions. "
                f"PyTorch has CUDA version {t_version} whereas TorchAudio has CUDA version {ta_version}. "
                "Please install the TorchAudio version that matches your PyTorch version."
            )
    return version
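
A worked example of the version-string slicing in `_check_cuda_version` above. The integer 12060 is an assumed value for what the extension's `cuda_version()` might report; the slicing reduces it to a "major.minor" string that is compared against `torch.version.cuda`.

# Sketch of the comparison logic only; not part of the packaged sources.
version_str = str(12060)
ta_version = f"{version_str[:-3]}.{version_str[-2]}"   # "12" + "." + "6" -> "12.6"
assert ta_version == "12.6"

# torch.version.cuda is a string like "12.6.3" (assumed example); only
# major.minor takes part in the comparison.
t_version = "12.6.3".split(".")
t_version = f"{t_version[0]}.{t_version[1]}"            # -> "12.6"
assert ta_version == t_version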
source/torchaudio/_internal/__init__.py
ADDED
@@ -0,0 +1,10 @@
try:
    from .fb import download_url_to_file, load_state_dict_from_url
except ImportError:
    from torch.hub import download_url_to_file, load_state_dict_from_url


__all__ = [
    "load_state_dict_from_url",
    "download_url_to_file",
]
source/torchaudio/_internal/module_utils.py
ADDED
@@ -0,0 +1,171 @@
import importlib.util
import os
import warnings
from functools import partial, wraps
from typing import Optional


def eval_env(var, default):
    """Check if environment variable has True-y value"""
    if var not in os.environ:
        return default

    val = os.environ.get(var, "0")
    trues = ["1", "true", "TRUE", "on", "ON", "yes", "YES"]
    falses = ["0", "false", "FALSE", "off", "OFF", "no", "NO"]
    if val in trues:
        return True
    if val not in falses:
        # fmt: off
        raise RuntimeError(
            f"Unexpected environment variable value `{var}={val}`. "
            f"Expected one of {trues + falses}")
        # fmt: on
    return False


def is_module_available(*modules: str) -> bool:
    r"""Returns if a top-level module with :attr:`name` exists *without*
    importing it. This is generally safer than try-catch block around a
    `import X`. It avoids third party libraries breaking assumptions of some of
    our tests, e.g., setting multiprocessing start method when imported
    (see librosa/#747, torchvision/#544).
    """
    return all(importlib.util.find_spec(m) is not None for m in modules)


def requires_module(*modules: str):
    """Decorate function to give error message if invoked without required optional modules.

    This decorator is to give better error message to users rather
    than raising ``NameError: name 'module' is not defined`` at random places.
    """
    missing = [m for m in modules if not is_module_available(m)]

    if not missing:
        # fall through. If all the modules are available, no need to decorate
        def decorator(func):
            return func

    else:
        req = f"module: {missing[0]}" if len(missing) == 1 else f"modules: {missing}"

        def decorator(func):
            @wraps(func)
            def wrapped(*args, **kwargs):
                raise RuntimeError(f"{func.__module__}.{func.__name__} requires {req}")

            return wrapped

    return decorator


UNSUPPORTED = []


def wrap_deprecated(func, name, direction: str, version: Optional[str] = None, remove: bool = False):
    @wraps(func)
    def wrapped(*args, **kwargs):
        message = f"{name} has been deprecated. {direction}"
        if remove:
            message += f' It will be removed from {"a future" if version is None else "the " + str(version)} release. '
        warnings.warn(message, stacklevel=2)
        return func(*args, **kwargs)

    return wrapped


def deprecated(direction: str, version: Optional[str] = None, remove: bool = False):
    """Decorator to add deprecation message

    Args:
        direction (str): Migration steps to be given to users.
        version (str or int): The version when the object will be removed
        remove (bool): If enabled, append future removal message.
    """

    def decorator(func):
        wrapped = wrap_deprecated(func, f"{func.__module__}.{func.__name__}", direction, version=version, remove=remove)

        message = "This function has been deprecated. "
        if remove:
            message += f'It will be removed from {"future" if version is None else version} release. '

        wrapped.__doc__ = f"""DEPRECATED

    .. warning::

       {message}
       {direction}

    {func.__doc__}
    """

        return wrapped

    return decorator


DEPRECATION_MSG = (
    "This deprecation is part of a large refactoring effort to transition TorchAudio into a maintenance phase. "
    "Please see https://github.com/pytorch/audio/issues/3902 for more information."
)

IO_DEPRECATION_MSG = (
    "This deprecation is part of a large refactoring effort to transition TorchAudio into a maintenance phase. "
    "The decoding and encoding capabilities of PyTorch for both audio"
    " and video are being consolidated into TorchCodec. "
    "Please see https://github.com/pytorch/audio/issues/3902 for more information."
)

dropping_support = deprecated(DEPRECATION_MSG, version="2.9", remove=True)


def dropping_class_support(c, msg=DEPRECATION_MSG):
    c.__init__ = wrap_deprecated(c.__init__, f"{c.__module__}.{c.__name__}", msg, version="2.9", remove=True)
    c.__doc__ = f"""DEPRECATED

    .. warning::

       This class is deprecated from version 2.8. It will be removed in the 2.9 release.
       {msg}
    {c.__doc__}
    """

    UNSUPPORTED.append(c)
    return c


def dropping_const_support(c, msg=DEPRECATION_MSG, name=None):
    c.__doc__ = f"""[DEPRECATED]

    .. warning::

       This object is deprecated from version 2.8. It will be removed in the 2.9 release.
       {msg}
    {c.__doc__}
    """
    return c


dropping_class_io_support = partial(dropping_class_support, msg=IO_DEPRECATION_MSG)

dropping_io_support = deprecated(IO_DEPRECATION_MSG, version="2.9", remove=True)


def fail_with_message(message):
    """Generate decorator to give users message about missing TorchAudio extension."""

    def decorator(func):
        @wraps(func)
        def wrapped(*args, **kwargs):
            raise RuntimeError(f"{func.__module__}.{func.__name__} {message}")

        return wrapped

    return decorator


def no_op(func):
    """No-op decorator. Used in place of fail_with_message when a functionality that requires extension works fine."""
    return func
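
A hypothetical usage sketch of the helpers in module_utils.py above; `my_resample`, `read_with_soundfile`, and the `TORCHAUDIO_USE_FOO` variable are made-up names used only to illustrate how the decorators compose.

from torchaudio._internal.module_utils import deprecated, eval_env, requires_module

# eval_env maps "1"/"true"/"yes"-style values to True, "0"/"false"/"no" to False,
# and falls back to the given default when the variable is unset.
USE_FOO = eval_env("TORCHAUDIO_USE_FOO", default=False)

@deprecated("Please use torchaudio.functional.resample instead.", version="2.9", remove=True)
def my_resample(waveform, orig_freq, new_freq):
    ...

@requires_module("soundfile")
def read_with_soundfile(path):
    ...

# Calling the deprecated wrapper emits a warning that mentions removal in 2.9;
# calling read_with_soundfile raises a RuntimeError only if soundfile is missing.
my_resample(None, 16000, 8000)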
source/torchaudio/_torchcodec.py
ADDED
@@ -0,0 +1,340 @@
"""TorchCodec integration for TorchAudio."""

import os
from typing import BinaryIO, Optional, Tuple, Union

import torch


def load_with_torchcodec(
    uri: Union[BinaryIO, str, os.PathLike],
    frame_offset: int = 0,
    num_frames: int = -1,
    normalize: bool = True,
    channels_first: bool = True,
    format: Optional[str] = None,
    buffer_size: int = 4096,
    backend: Optional[str] = None,
) -> Tuple[torch.Tensor, int]:
    """Load audio data from source using TorchCodec's AudioDecoder.

    .. note::

        This function supports the same API as :func:`~torchaudio.load`, and
        relies on TorchCodec's decoding capabilities under the hood. It is
        provided for convenience, but we do recommend that you port your code to
        natively use ``torchcodec``'s ``AudioDecoder`` class for better
        performance:
        https://docs.pytorch.org/torchcodec/stable/generated/torchcodec.decoders.AudioDecoder.
        As of TorchAudio 2.9, :func:`~torchaudio.load` relies on
        :func:`~torchaudio.load_with_torchcodec`. Note that some parameters of
        :func:`~torchaudio.load`, like ``normalize``, ``buffer_size``, and
        ``backend``, are ignored by :func:`~torchaudio.load_with_torchcodec`.
        To install torchcodec, follow the instructions at https://github.com/pytorch/torchcodec#installing-torchcodec.


    Args:
        uri (path-like object or file-like object):
            Source of audio data. The following types are accepted:

            * ``path-like``: File path or URL.
            * ``file-like``: Object with ``read(size: int) -> bytes`` method.

        frame_offset (int, optional):
            Number of samples to skip before start reading data.
        num_frames (int, optional):
            Maximum number of samples to read. ``-1`` reads all the remaining samples,
            starting from ``frame_offset``.
        normalize (bool, optional):
            TorchCodec always returns normalized float32 samples. This parameter
            is ignored and a warning is issued if set to False.
            Default: ``True``.
        channels_first (bool, optional):
            When True, the returned Tensor has dimension `[channel, time]`.
            Otherwise, the returned Tensor's dimension is `[time, channel]`.
        format (str or None, optional):
            Format hint for the decoder. May not be supported by all TorchCodec
            decoders. (Default: ``None``)
        buffer_size (int, optional):
            Not used by TorchCodec AudioDecoder. Provided for API compatibility.
        backend (str or None, optional):
            Not used by TorchCodec AudioDecoder. Provided for API compatibility.

    Returns:
        (torch.Tensor, int): Resulting Tensor and sample rate.
            Always returns float32 tensors. If ``channels_first=True``, shape is
            `[channel, time]`, otherwise `[time, channel]`.

    Raises:
        ImportError: If torchcodec is not available.
        ValueError: If unsupported parameters are used.
        RuntimeError: If TorchCodec fails to decode the audio.

    Note:
        - TorchCodec always returns normalized float32 samples, so the ``normalize``
          parameter has no effect.
        - The ``buffer_size`` and ``backend`` parameters are ignored.
        - Not all audio formats supported by torchaudio backends may be supported
          by TorchCodec.
    """
    # Import torchcodec here to provide clear error if not available
    try:
        from torchcodec.decoders import AudioDecoder
    except ImportError as e:
        raise ImportError(
            "TorchCodec is required for load_with_torchcodec. Please install torchcodec to use this function."
        ) from e

    # Parameter validation and warnings
    if not normalize:
        import warnings

        warnings.warn(
            "TorchCodec AudioDecoder always returns normalized float32 samples. "
            "The 'normalize=False' parameter is ignored.",
            UserWarning,
            stacklevel=2,
        )

    if buffer_size != 4096:
        import warnings

        warnings.warn("The 'buffer_size' parameter is not used by TorchCodec AudioDecoder.", UserWarning, stacklevel=2)

    if backend is not None:
        import warnings

        warnings.warn("The 'backend' parameter is not used by TorchCodec AudioDecoder.", UserWarning, stacklevel=2)

    if format is not None:
        import warnings

        warnings.warn("The 'format' parameter is not supported by TorchCodec AudioDecoder.", UserWarning, stacklevel=2)

    # Create AudioDecoder
    try:
        decoder = AudioDecoder(uri)
    except Exception as e:
        raise RuntimeError(f"Failed to create AudioDecoder for {uri}: {e}") from e

    # Get sample rate from metadata
    sample_rate = decoder.metadata.sample_rate
    if sample_rate is None:
        raise RuntimeError("Unable to determine sample rate from audio metadata")

    # Decode the entire file first, then subsample manually
    # This is the simplest approach since torchcodec uses time-based indexing
    try:
        audio_samples = decoder.get_all_samples()
    except Exception as e:
        raise RuntimeError(f"Failed to decode audio samples: {e}") from e

    data = audio_samples.data

    # Apply frame_offset and num_frames (which are actually sample offsets)
    if frame_offset > 0:
        if frame_offset >= data.shape[1]:
            # Return empty tensor if offset is beyond available data
            empty_shape = (data.shape[0], 0) if channels_first else (0, data.shape[0])
            return torch.zeros(empty_shape, dtype=torch.float32), sample_rate
        data = data[:, frame_offset:]

    if num_frames == 0:
        # Return empty tensor if num_frames is 0
        empty_shape = (data.shape[0], 0) if channels_first else (0, data.shape[0])
        return torch.zeros(empty_shape, dtype=torch.float32), sample_rate
    elif num_frames > 0:
        data = data[:, :num_frames]

    # TorchCodec returns data in [channel, time] format by default
    # Handle channels_first parameter
    if not channels_first:
        data = data.transpose(0, 1)  # [channel, time] -> [time, channel]

    return data, sample_rate


def save_with_torchcodec(
    uri: Union[str, os.PathLike],
    src: torch.Tensor,
    sample_rate: int,
    channels_first: bool = True,
    format: Optional[str] = None,
    encoding: Optional[str] = None,
    bits_per_sample: Optional[int] = None,
    buffer_size: int = 4096,
    backend: Optional[str] = None,
    compression: Optional[Union[float, int]] = None,
) -> None:
    """Save audio data to file using TorchCodec's AudioEncoder.

    .. note::

        This function supports the same API as :func:`~torchaudio.save`, and
        relies on TorchCodec's encoding capabilities under the hood. It is
        provided for convenience, but we do recommend that you port your code to
        natively use ``torchcodec``'s ``AudioEncoder`` class for better
        performance:
        https://docs.pytorch.org/torchcodec/stable/generated/torchcodec.encoders.AudioEncoder.
        As of TorchAudio 2.9, :func:`~torchaudio.save` relies on
        :func:`~torchaudio.save_with_torchcodec`. Note that some parameters of
        :func:`~torchaudio.save`, like ``format``, ``encoding``,
        ``bits_per_sample``, ``buffer_size``, and ``backend``, are ignored by
        :func:`~torchaudio.save_with_torchcodec`.
        To install torchcodec, follow the instructions at https://github.com/pytorch/torchcodec#installing-torchcodec.

    This function provides a TorchCodec-based alternative to torchaudio.save
    with the same API. TorchCodec's AudioEncoder provides efficient encoding
    with FFmpeg under the hood.

    Args:
        uri (path-like object):
            Path to save the audio file. The file extension determines the format.

        src (torch.Tensor):
            Audio data to save. Must be a 1D or 2D tensor with float32 values
            in the range [-1, 1]. If 2D, shape should be [channel, time] when
            channels_first=True, or [time, channel] when channels_first=False.

        sample_rate (int):
            Sample rate of the audio data.

        channels_first (bool, optional):
            Indicates whether the input tensor has channels as the first dimension.
            If True, expects [channel, time]. If False, expects [time, channel].
            Default: True.

        format (str or None, optional):
            Audio format hint. Not used by TorchCodec (format is determined by
            file extension). A warning is issued if provided.
            Default: None.

        encoding (str or None, optional):
            Audio encoding. Not fully supported by TorchCodec AudioEncoder.
            A warning is issued if provided. Default: None.

        bits_per_sample (int or None, optional):
            Bits per sample. Not directly supported by TorchCodec AudioEncoder.
            A warning is issued if provided. Default: None.

        buffer_size (int, optional):
            Not used by TorchCodec AudioEncoder. Provided for API compatibility.
            A warning is issued if not default value. Default: 4096.

        backend (str or None, optional):
            Not used by TorchCodec AudioEncoder. Provided for API compatibility.
            A warning is issued if provided. Default: None.

        compression (float, int or None, optional):
            Compression level or bit rate. Maps to bit_rate parameter in
            TorchCodec AudioEncoder. Default: None.

    Raises:
        ImportError: If torchcodec is not available.
        ValueError: If input parameters are invalid.
        RuntimeError: If TorchCodec fails to encode the audio.

    Note:
        - TorchCodec AudioEncoder expects float32 samples in [-1, 1] range.
        - Some parameters (format, encoding, bits_per_sample, buffer_size, backend)
          are not used by TorchCodec but are provided for API compatibility.
        - The output format is determined by the file extension in the uri.
        - TorchCodec uses FFmpeg under the hood for encoding.
    """
    # Import torchcodec here to provide clear error if not available
    try:
        from torchcodec.encoders import AudioEncoder
    except ImportError as e:
        raise ImportError(
            "TorchCodec is required for save_with_torchcodec. Please install torchcodec to use this function."
        ) from e

    # Parameter validation and warnings
    if format is not None:
        import warnings

        warnings.warn(
            "The 'format' parameter is not used by TorchCodec AudioEncoder. "
            "Format is determined by the file extension.",
            UserWarning,
            stacklevel=2,
        )

    if encoding is not None:
        import warnings

        warnings.warn(
            "The 'encoding' parameter is not fully supported by TorchCodec AudioEncoder.", UserWarning, stacklevel=2
        )

    if bits_per_sample is not None:
        import warnings

        warnings.warn(
            "The 'bits_per_sample' parameter is not directly supported by TorchCodec AudioEncoder.",
            UserWarning,
            stacklevel=2,
        )

    if buffer_size != 4096:
        import warnings

        warnings.warn("The 'buffer_size' parameter is not used by TorchCodec AudioEncoder.", UserWarning, stacklevel=2)

    if backend is not None:
        import warnings

        warnings.warn("The 'backend' parameter is not used by TorchCodec AudioEncoder.", UserWarning, stacklevel=2)

    # Input validation
    if not isinstance(src, torch.Tensor):
        raise ValueError(f"Expected src to be a torch.Tensor, got {type(src)}")

    if src.dtype != torch.float32:
        src = src.float()

    if sample_rate <= 0:
        raise ValueError(f"sample_rate must be positive, got {sample_rate}")

    # Handle tensor shape and channels_first
    if src.ndim == 1:
        # Convert to 2D: [1, time] for channels_first=True
        if channels_first:
            data = src.unsqueeze(0)  # [1, time]
        else:
            # For channels_first=False, input is [time] -> reshape to [time, 1] -> transpose to [1, time]
            data = src.unsqueeze(1).transpose(0, 1)  # [time, 1] -> [1, time]
    elif src.ndim == 2:
        if channels_first:
            data = src  # Already [channel, time]
        else:
            data = src.transpose(0, 1)  # [time, channel] -> [channel, time]
    else:
        raise ValueError(f"Expected 1D or 2D tensor, got {src.ndim}D tensor")

    # Create AudioEncoder
    try:
        encoder = AudioEncoder(data, sample_rate=sample_rate)
    except Exception as e:
        raise RuntimeError(f"Failed to create AudioEncoder: {e}") from e

    # Determine bit_rate from compression parameter
    bit_rate = None
    if compression is not None:
        if isinstance(compression, (int, float)):
            bit_rate = int(compression)
        else:
            import warnings

            warnings.warn(
                f"Unsupported compression type {type(compression)}. "
                "TorchCodec AudioEncoder expects int or float for bit_rate.",
                UserWarning,
                stacklevel=2,
            )

    # Save to file
    try:
        encoder.to_file(uri, bit_rate=bit_rate)
    except Exception as e:
        raise RuntimeError(f"Failed to save audio to {uri}: {e}") from e
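
A hedged sketch of the native TorchCodec path that the docstrings above recommend migrating to, using only the AudioDecoder/AudioEncoder calls already exercised by this module. "in.wav" and "out.mp3" are illustrative paths and the 192_000 bit rate is an assumed value; torchcodec must be installed.

from torchcodec.decoders import AudioDecoder
from torchcodec.encoders import AudioEncoder

decoder = AudioDecoder("in.wav")
samples = decoder.get_all_samples()          # float32 samples, [channel, time]
print(samples.data.shape, decoder.metadata.sample_rate)

# Re-encode directly with the encoder class; the output format follows the
# file extension, mirroring save_with_torchcodec above.
encoder = AudioEncoder(samples.data, sample_rate=decoder.metadata.sample_rate)
encoder.to_file("out.mp3", bit_rate=192_000)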