Harmony18090 commited on
Commit
cabdcdf
·
verified ·
1 Parent(s): 4612cc9

Add source batch 10/11

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +18 -0
  2. source/tiktoken-0.12.0.dist-info/INSTALLER +1 -0
  3. source/tiktoken-0.12.0.dist-info/METADATA +171 -0
  4. source/tiktoken-0.12.0.dist-info/RECORD +22 -0
  5. source/tiktoken-0.12.0.dist-info/WHEEL +5 -0
  6. source/tiktoken-0.12.0.dist-info/licenses/LICENSE +21 -0
  7. source/tiktoken-0.12.0.dist-info/top_level.txt +2 -0
  8. source/tiktoken_ext/openai_public.py +162 -0
  9. source/tokenizers-0.22.2.dist-info/INSTALLER +1 -0
  10. source/tokenizers-0.22.2.dist-info/METADATA +214 -0
  11. source/tokenizers-0.22.2.dist-info/RECORD +46 -0
  12. source/tokenizers-0.22.2.dist-info/WHEEL +5 -0
  13. source/tokenizers/__init__.py +100 -0
  14. source/tokenizers/__init__.pyi +1800 -0
  15. source/tokenizers/decoders/__init__.py +15 -0
  16. source/tokenizers/decoders/__init__.pyi +569 -0
  17. source/tokenizers/implementations/__init__.py +6 -0
  18. source/tokenizers/implementations/base_tokenizer.py +459 -0
  19. source/tokenizers/implementations/bert_wordpiece.py +151 -0
  20. source/tokenizers/implementations/byte_level_bpe.py +122 -0
  21. source/tokenizers/implementations/char_level_bpe.py +150 -0
  22. source/tokenizers/implementations/sentencepiece_bpe.py +103 -0
  23. source/tokenizers/implementations/sentencepiece_unigram.py +196 -0
  24. source/tokenizers/models/__init__.py +8 -0
  25. source/tokenizers/models/__init__.pyi +744 -0
  26. source/tokenizers/normalizers/__init__.py +29 -0
  27. source/tokenizers/normalizers/__init__.pyi +946 -0
  28. source/tokenizers/pre_tokenizers/__init__.py +16 -0
  29. source/tokenizers/pre_tokenizers/__init__.pyi +1015 -0
  30. source/tokenizers/processors/__init__.py +9 -0
  31. source/tokenizers/processors/__init__.pyi +519 -0
  32. source/tokenizers/tokenizers.abi3.so +3 -0
  33. source/tokenizers/tokenizers.pyi +17 -0
  34. source/tokenizers/tools/__init__.py +1 -0
  35. source/tokenizers/tools/visualizer-styles.css +170 -0
  36. source/tokenizers/tools/visualizer.py +407 -0
  37. source/tokenizers/trainers/__init__.py +8 -0
  38. source/tokenizers/trainers/__init__.pyi +462 -0
  39. source/torchaudio-2.9.1.dist-info/INSTALLER +1 -0
  40. source/torchaudio-2.9.1.dist-info/METADATA +133 -0
  41. source/torchaudio-2.9.1.dist-info/RECORD +166 -0
  42. source/torchaudio-2.9.1.dist-info/WHEEL +5 -0
  43. source/torchaudio-2.9.1.dist-info/licenses/LICENSE +25 -0
  44. source/torchaudio-2.9.1.dist-info/top_level.txt +1 -0
  45. source/torchaudio/__init__.py +204 -0
  46. source/torchaudio/_extension/__init__.py +61 -0
  47. source/torchaudio/_extension/utils.py +133 -0
  48. source/torchaudio/_internal/__init__.py +10 -0
  49. source/torchaudio/_internal/module_utils.py +171 -0
  50. source/torchaudio/_torchcodec.py +340 -0
.gitattributes CHANGED
@@ -249,3 +249,21 @@ source/rpds/rpds.cpython-312-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -
249
  source/safetensors/_safetensors_rust.abi3.so filter=lfs diff=lfs merge=lfs -text
250
  source/sentencepiece/_sentencepiece.cpython-312-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
251
  source/tiktoken/_tiktoken.cpython-312-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
249
  source/safetensors/_safetensors_rust.abi3.so filter=lfs diff=lfs merge=lfs -text
250
  source/sentencepiece/_sentencepiece.cpython-312-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
251
  source/tiktoken/_tiktoken.cpython-312-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
252
+ source/tokenizers/tokenizers.abi3.so filter=lfs diff=lfs merge=lfs -text
253
+ source/torchaudio/lib/_torchaudio.so filter=lfs diff=lfs merge=lfs -text
254
+ source/torchaudio/lib/libctc_prefix_decoder.so filter=lfs diff=lfs merge=lfs -text
255
+ source/torchaudio/lib/libtorchaudio.so filter=lfs diff=lfs merge=lfs -text
256
+ source/torchaudio/lib/pybind11_prefixctc.so filter=lfs diff=lfs merge=lfs -text
257
+ source/torchvision/_C.so filter=lfs diff=lfs merge=lfs -text
258
+ source/torchvision/image.so filter=lfs diff=lfs merge=lfs -text
259
+ source/torchvision.libs/libcudart.e8e8b82a.so.12 filter=lfs diff=lfs merge=lfs -text
260
+ source/torchvision.libs/libjpeg.d246b9ea.so.8 filter=lfs diff=lfs merge=lfs -text
261
+ source/torchvision.libs/libnvjpeg.8dd2b5e6.so.12 filter=lfs diff=lfs merge=lfs -text
262
+ source/torchvision.libs/libpng16.4ef4b109.so.16 filter=lfs diff=lfs merge=lfs -text
263
+ source/torchvision.libs/libwebp.121d56b5.so.7 filter=lfs diff=lfs merge=lfs -text
264
+ source/torchvision.libs/libz.cac6d5fc.so.1 filter=lfs diff=lfs merge=lfs -text
265
+ source/tvm_ffi/core.abi3.so filter=lfs diff=lfs merge=lfs -text
266
+ source/tvm_ffi/lib/libtvm_ffi.so filter=lfs diff=lfs merge=lfs -text
267
+ source/tvm_ffi/lib/libtvm_ffi_testing.so filter=lfs diff=lfs merge=lfs -text
268
+ source/uvloop/loop.cpython-312-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
269
+ source/watchfiles/_rust_notify.cpython-312-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
source/tiktoken-0.12.0.dist-info/INSTALLER ADDED
@@ -0,0 +1 @@
 
 
1
+ pip
source/tiktoken-0.12.0.dist-info/METADATA ADDED
@@ -0,0 +1,171 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Metadata-Version: 2.4
2
+ Name: tiktoken
3
+ Version: 0.12.0
4
+ Summary: tiktoken is a fast BPE tokeniser for use with OpenAI's models
5
+ Author: Shantanu Jain
6
+ Author-email: shantanu@openai.com
7
+ License: MIT License
8
+
9
+ Copyright (c) 2022 OpenAI, Shantanu Jain
10
+
11
+ Permission is hereby granted, free of charge, to any person obtaining a copy
12
+ of this software and associated documentation files (the "Software"), to deal
13
+ in the Software without restriction, including without limitation the rights
14
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
15
+ copies of the Software, and to permit persons to whom the Software is
16
+ furnished to do so, subject to the following conditions:
17
+
18
+ The above copyright notice and this permission notice shall be included in all
19
+ copies or substantial portions of the Software.
20
+
21
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
22
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
23
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
24
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
25
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
26
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
27
+ SOFTWARE.
28
+
29
+ Project-URL: homepage, https://github.com/openai/tiktoken
30
+ Project-URL: repository, https://github.com/openai/tiktoken
31
+ Project-URL: changelog, https://github.com/openai/tiktoken/blob/main/CHANGELOG.md
32
+ Requires-Python: >=3.9
33
+ Description-Content-Type: text/markdown
34
+ License-File: LICENSE
35
+ Requires-Dist: regex>=2022.1.18
36
+ Requires-Dist: requests>=2.26.0
37
+ Provides-Extra: blobfile
38
+ Requires-Dist: blobfile>=2; extra == "blobfile"
39
+ Dynamic: license-file
40
+
41
+ # ⏳ tiktoken
42
+
43
+ tiktoken is a fast [BPE](https://en.wikipedia.org/wiki/Byte_pair_encoding) tokeniser for use with
44
+ OpenAI's models.
45
+
46
+ ```python
47
+ import tiktoken
48
+ enc = tiktoken.get_encoding("o200k_base")
49
+ assert enc.decode(enc.encode("hello world")) == "hello world"
50
+
51
+ # To get the tokeniser corresponding to a specific model in the OpenAI API:
52
+ enc = tiktoken.encoding_for_model("gpt-4o")
53
+ ```
54
+
55
+ The open source version of `tiktoken` can be installed from [PyPI](https://pypi.org/project/tiktoken):
56
+ ```
57
+ pip install tiktoken
58
+ ```
59
+
60
+ The tokeniser API is documented in `tiktoken/core.py`.
61
+
62
+ Example code using `tiktoken` can be found in the
63
+ [OpenAI Cookbook](https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb).
64
+
65
+
66
+ ## Performance
67
+
68
+ `tiktoken` is between 3-6x faster than a comparable open source tokeniser:
69
+
70
+ ![image](https://raw.githubusercontent.com/openai/tiktoken/main/perf.svg)
71
+
72
+ Performance measured on 1GB of text using the GPT-2 tokeniser, using `GPT2TokenizerFast` from
73
+ `tokenizers==0.13.2`, `transformers==4.24.0` and `tiktoken==0.2.0`.
74
+
75
+
76
+ ## Getting help
77
+
78
+ Please post questions in the [issue tracker](https://github.com/openai/tiktoken/issues).
79
+
80
+ If you work at OpenAI, make sure to check the internal documentation or feel free to contact
81
+ @shantanu.
82
+
83
+ ## What is BPE anyway?
84
+
85
+ Language models don't see text like you and I, instead they see a sequence of numbers (known as tokens).
86
+ Byte pair encoding (BPE) is a way of converting text into tokens. It has a couple desirable
87
+ properties:
88
+ 1) It's reversible and lossless, so you can convert tokens back into the original text
89
+ 2) It works on arbitrary text, even text that is not in the tokeniser's training data
90
+ 3) It compresses the text: the token sequence is shorter than the bytes corresponding to the
91
+ original text. On average, in practice, each token corresponds to about 4 bytes.
92
+ 4) It attempts to let the model see common subwords. For instance, "ing" is a common subword in
93
+ English, so BPE encodings will often split "encoding" into tokens like "encod" and "ing"
94
+ (instead of e.g. "enc" and "oding"). Because the model will then see the "ing" token again and
95
+ again in different contexts, it helps models generalise and better understand grammar.
96
+
97
+ `tiktoken` contains an educational submodule that is friendlier if you want to learn more about
98
+ the details of BPE, including code that helps visualise the BPE procedure:
99
+ ```python
100
+ from tiktoken._educational import *
101
+
102
+ # Train a BPE tokeniser on a small amount of text
103
+ enc = train_simple_encoding()
104
+
105
+ # Visualise how the GPT-4 encoder encodes text
106
+ enc = SimpleBytePairEncoding.from_tiktoken("cl100k_base")
107
+ enc.encode("hello world aaaaaaaaaaaa")
108
+ ```
109
+
110
+
111
+ ## Extending tiktoken
112
+
113
+ You may wish to extend `tiktoken` to support new encodings. There are two ways to do this.
114
+
115
+
116
+ **Create your `Encoding` object exactly the way you want and simply pass it around.**
117
+
118
+ ```python
119
+ cl100k_base = tiktoken.get_encoding("cl100k_base")
120
+
121
+ # In production, load the arguments directly instead of accessing private attributes
122
+ # See openai_public.py for examples of arguments for specific encodings
123
+ enc = tiktoken.Encoding(
124
+ # If you're changing the set of special tokens, make sure to use a different name
125
+ # It should be clear from the name what behaviour to expect.
126
+ name="cl100k_im",
127
+ pat_str=cl100k_base._pat_str,
128
+ mergeable_ranks=cl100k_base._mergeable_ranks,
129
+ special_tokens={
130
+ **cl100k_base._special_tokens,
131
+ "<|im_start|>": 100264,
132
+ "<|im_end|>": 100265,
133
+ }
134
+ )
135
+ ```
136
+
137
+ **Use the `tiktoken_ext` plugin mechanism to register your `Encoding` objects with `tiktoken`.**
138
+
139
+ This is only useful if you need `tiktoken.get_encoding` to find your encoding, otherwise prefer
140
+ option 1.
141
+
142
+ To do this, you'll need to create a namespace package under `tiktoken_ext`.
143
+
144
+ Layout your project like this, making sure to omit the `tiktoken_ext/__init__.py` file:
145
+ ```
146
+ my_tiktoken_extension
147
+ ├── tiktoken_ext
148
+ │   └── my_encodings.py
149
+ └── setup.py
150
+ ```
151
+
152
+ `my_encodings.py` should be a module that contains a variable named `ENCODING_CONSTRUCTORS`.
153
+ This is a dictionary from an encoding name to a function that takes no arguments and returns
154
+ arguments that can be passed to `tiktoken.Encoding` to construct that encoding. For an example, see
155
+ `tiktoken_ext/openai_public.py`. For precise details, see `tiktoken/registry.py`.
156
+
157
+ Your `setup.py` should look something like this:
158
+ ```python
159
+ from setuptools import setup, find_namespace_packages
160
+
161
+ setup(
162
+ name="my_tiktoken_extension",
163
+ packages=find_namespace_packages(include=['tiktoken_ext*']),
164
+ install_requires=["tiktoken"],
165
+ ...
166
+ )
167
+ ```
168
+
169
+ Then simply `pip install ./my_tiktoken_extension` and you should be able to use your
170
+ custom encodings! Make sure **not** to use an editable install.
171
+
source/tiktoken-0.12.0.dist-info/RECORD ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ tiktoken-0.12.0.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
2
+ tiktoken-0.12.0.dist-info/METADATA,sha256=07KObsjwnEfLTQ-oRD0vhrE3Zw-oNMJqTlBbEeJxtZ8,6688
3
+ tiktoken-0.12.0.dist-info/RECORD,,
4
+ tiktoken-0.12.0.dist-info/WHEEL,sha256=VXvNKn6nFeCM45GEUrNLJOO_J_e-cNJphGt9rWFxyE0,113
5
+ tiktoken-0.12.0.dist-info/licenses/LICENSE,sha256=QYy0mbQ2Eo1lPXmUEzOlQ3t74uqSE9zC8E0V1dLFHYY,1078
6
+ tiktoken-0.12.0.dist-info/top_level.txt,sha256=54G5MceQnuD7EXvp7jzGxDDapA1iOwsh77jhCN9WKkc,22
7
+ tiktoken/__init__.py,sha256=eHlkakibO43-11JFQJUgpC8z2v4ID1r3l3LXjMyEwKc,346
8
+ tiktoken/__pycache__/__init__.cpython-312.pyc,,
9
+ tiktoken/__pycache__/_educational.cpython-312.pyc,,
10
+ tiktoken/__pycache__/core.cpython-312.pyc,,
11
+ tiktoken/__pycache__/load.cpython-312.pyc,,
12
+ tiktoken/__pycache__/model.cpython-312.pyc,,
13
+ tiktoken/__pycache__/registry.cpython-312.pyc,,
14
+ tiktoken/_educational.py,sha256=TUFOp8Q91WjrTvGKhCNEyrhtva82UlenXfhPy9zS7VQ,8229
15
+ tiktoken/_tiktoken.cpython-312-x86_64-linux-gnu.so,sha256=qCn0iO_VQ7YJKD6D8jDXz6WHH64mSUK59dicngi37S8,3525056
16
+ tiktoken/core.py,sha256=TCwORlettZl-da55Ysp52TlLk18nKD6e62Q_0ZFA404,17458
17
+ tiktoken/load.py,sha256=dhTOiVIInbhiQ_zmtOZDshKvqSKzXyNOJJWPmJ0S9RU,5919
18
+ tiktoken/model.py,sha256=d57kixsksIv6VESndVjvmGBRj8LrSFGAwUCV5xZtxRk,4061
19
+ tiktoken/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
20
+ tiktoken/registry.py,sha256=7fktZbJ1Kcm8sVyWgEfIy-ZxfUvcXupLUNXKPfSGwQU,3256
21
+ tiktoken_ext/__pycache__/openai_public.cpython-312.pyc,,
22
+ tiktoken_ext/openai_public.py,sha256=lUOSc45g0Pttyh2tgIcu_EfI4nM7q-y78KI5cO1mwss,5613
source/tiktoken-0.12.0.dist-info/WHEEL ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (80.9.0)
3
+ Root-Is-Purelib: false
4
+ Tag: cp312-cp312-manylinux_2_28_x86_64
5
+
source/tiktoken-0.12.0.dist-info/licenses/LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2022 OpenAI, Shantanu Jain
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
source/tiktoken-0.12.0.dist-info/top_level.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ tiktoken
2
+ tiktoken_ext
source/tiktoken_ext/openai_public.py ADDED
@@ -0,0 +1,162 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from tiktoken.load import data_gym_to_mergeable_bpe_ranks, load_tiktoken_bpe
2
+
3
+ ENDOFTEXT = "<|endoftext|>"
4
+ FIM_PREFIX = "<|fim_prefix|>"
5
+ FIM_MIDDLE = "<|fim_middle|>"
6
+ FIM_SUFFIX = "<|fim_suffix|>"
7
+ ENDOFPROMPT = "<|endofprompt|>"
8
+
9
+ # The pattern in the original GPT-2 release is:
10
+ # r"""'s|'t|'re|'ve|'m|'ll|'d| ?[\p{L}]+| ?[\p{N}]+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
11
+ # This is equivalent, but executes faster:
12
+ r50k_pat_str = (
13
+ r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}++| ?\p{N}++| ?[^\s\p{L}\p{N}]++|\s++$|\s+(?!\S)|\s"""
14
+ )
15
+
16
+
17
+ def gpt2():
18
+ mergeable_ranks = data_gym_to_mergeable_bpe_ranks(
19
+ vocab_bpe_file="https://openaipublic.blob.core.windows.net/gpt-2/encodings/main/vocab.bpe",
20
+ encoder_json_file="https://openaipublic.blob.core.windows.net/gpt-2/encodings/main/encoder.json",
21
+ vocab_bpe_hash="1ce1664773c50f3e0cc8842619a93edc4624525b728b188a9e0be33b7726adc5",
22
+ encoder_json_hash="196139668be63f3b5d6574427317ae82f612a97c5d1cdaf36ed2256dbf636783",
23
+ )
24
+ return {
25
+ "name": "gpt2",
26
+ "explicit_n_vocab": 50257,
27
+ "pat_str": r50k_pat_str,
28
+ "mergeable_ranks": mergeable_ranks,
29
+ "special_tokens": {ENDOFTEXT: 50256},
30
+ }
31
+
32
+
33
+ def r50k_base():
34
+ mergeable_ranks = load_tiktoken_bpe(
35
+ "https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken",
36
+ expected_hash="306cd27f03c1a714eca7108e03d66b7dc042abe8c258b44c199a7ed9838dd930",
37
+ )
38
+ return {
39
+ "name": "r50k_base",
40
+ "explicit_n_vocab": 50257,
41
+ "pat_str": r50k_pat_str,
42
+ "mergeable_ranks": mergeable_ranks,
43
+ "special_tokens": {ENDOFTEXT: 50256},
44
+ }
45
+
46
+
47
+ def p50k_base():
48
+ mergeable_ranks = load_tiktoken_bpe(
49
+ "https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken",
50
+ expected_hash="94b5ca7dff4d00767bc256fdd1b27e5b17361d7b8a5f968547f9f23eb70d2069",
51
+ )
52
+ return {
53
+ "name": "p50k_base",
54
+ "explicit_n_vocab": 50281,
55
+ "pat_str": r50k_pat_str,
56
+ "mergeable_ranks": mergeable_ranks,
57
+ "special_tokens": {ENDOFTEXT: 50256},
58
+ }
59
+
60
+
61
+ def p50k_edit():
62
+ mergeable_ranks = load_tiktoken_bpe(
63
+ "https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken",
64
+ expected_hash="94b5ca7dff4d00767bc256fdd1b27e5b17361d7b8a5f968547f9f23eb70d2069",
65
+ )
66
+ special_tokens = {ENDOFTEXT: 50256, FIM_PREFIX: 50281, FIM_MIDDLE: 50282, FIM_SUFFIX: 50283}
67
+ return {
68
+ "name": "p50k_edit",
69
+ "pat_str": r50k_pat_str,
70
+ "mergeable_ranks": mergeable_ranks,
71
+ "special_tokens": special_tokens,
72
+ }
73
+
74
+
75
+ def cl100k_base():
76
+ mergeable_ranks = load_tiktoken_bpe(
77
+ "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken",
78
+ expected_hash="223921b76ee99bde995b7ff738513eef100fb51d18c93597a113bcffe865b2a7",
79
+ )
80
+ special_tokens = {
81
+ ENDOFTEXT: 100257,
82
+ FIM_PREFIX: 100258,
83
+ FIM_MIDDLE: 100259,
84
+ FIM_SUFFIX: 100260,
85
+ ENDOFPROMPT: 100276,
86
+ }
87
+ return {
88
+ "name": "cl100k_base",
89
+ "pat_str": r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}++|\p{N}{1,3}+| ?[^\s\p{L}\p{N}]++[\r\n]*+|\s++$|\s*[\r\n]|\s+(?!\S)|\s""",
90
+ "mergeable_ranks": mergeable_ranks,
91
+ "special_tokens": special_tokens,
92
+ }
93
+
94
+
95
+ def o200k_base():
96
+ mergeable_ranks = load_tiktoken_bpe(
97
+ "https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken",
98
+ expected_hash="446a9538cb6c348e3516120d7c08b09f57c36495e2acfffe59a5bf8b0cfb1a2d",
99
+ )
100
+ special_tokens = {ENDOFTEXT: 199999, ENDOFPROMPT: 200018}
101
+ # This regex could be made more efficient. If I was the one working on this encoding, I would
102
+ # have done a few other things differently too, e.g. I think you can allocate tokens more
103
+ # efficiently across languages.
104
+ pat_str = "|".join(
105
+ [
106
+ r"""[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?""",
107
+ r"""[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?""",
108
+ r"""\p{N}{1,3}""",
109
+ r""" ?[^\s\p{L}\p{N}]+[\r\n/]*""",
110
+ r"""\s*[\r\n]+""",
111
+ r"""\s+(?!\S)""",
112
+ r"""\s+""",
113
+ ]
114
+ )
115
+ return {
116
+ "name": "o200k_base",
117
+ "pat_str": pat_str,
118
+ "mergeable_ranks": mergeable_ranks,
119
+ "special_tokens": special_tokens,
120
+ }
121
+
122
+
123
+ def o200k_harmony():
124
+ base_enc = o200k_base()
125
+ name = "o200k_harmony"
126
+ pat_str = base_enc["pat_str"]
127
+ mergeable_ranks = base_enc["mergeable_ranks"]
128
+ special_tokens = {
129
+ **base_enc["special_tokens"],
130
+ "<|startoftext|>": 199998,
131
+ "<|endoftext|>": 199999,
132
+ "<|reserved_200000|>": 200000,
133
+ "<|reserved_200001|>": 200001,
134
+ "<|return|>": 200002,
135
+ "<|constrain|>": 200003,
136
+ "<|reserved_200004|>": 200004,
137
+ "<|channel|>": 200005,
138
+ "<|start|>": 200006,
139
+ "<|end|>": 200007,
140
+ "<|message|>": 200008,
141
+ "<|reserved_200009|>": 200009,
142
+ "<|reserved_200010|>": 200010,
143
+ "<|reserved_200011|>": 200011,
144
+ "<|call|>": 200012,
145
+ } | {f"<|reserved_{i}|>": i for i in range(200013, 201088)}
146
+ return {
147
+ "name": name,
148
+ "pat_str": pat_str,
149
+ "mergeable_ranks": mergeable_ranks,
150
+ "special_tokens": special_tokens,
151
+ }
152
+
153
+
154
+ ENCODING_CONSTRUCTORS = {
155
+ "gpt2": gpt2,
156
+ "r50k_base": r50k_base,
157
+ "p50k_base": p50k_base,
158
+ "p50k_edit": p50k_edit,
159
+ "cl100k_base": cl100k_base,
160
+ "o200k_base": o200k_base,
161
+ "o200k_harmony": o200k_harmony,
162
+ }
source/tokenizers-0.22.2.dist-info/INSTALLER ADDED
@@ -0,0 +1 @@
 
 
1
+ pip
source/tokenizers-0.22.2.dist-info/METADATA ADDED
@@ -0,0 +1,214 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Metadata-Version: 2.4
2
+ Name: tokenizers
3
+ Version: 0.22.2
4
+ Classifier: Development Status :: 5 - Production/Stable
5
+ Classifier: Intended Audience :: Developers
6
+ Classifier: Intended Audience :: Education
7
+ Classifier: Intended Audience :: Science/Research
8
+ Classifier: License :: OSI Approved :: Apache Software License
9
+ Classifier: Operating System :: OS Independent
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: Programming Language :: Python :: 3.9
12
+ Classifier: Programming Language :: Python :: 3.10
13
+ Classifier: Programming Language :: Python :: 3.11
14
+ Classifier: Programming Language :: Python :: 3.12
15
+ Classifier: Programming Language :: Python :: 3.13
16
+ Classifier: Programming Language :: Python :: 3 :: Only
17
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
18
+ Requires-Dist: huggingface-hub>=0.16.4,<2.0
19
+ Requires-Dist: pytest ; extra == 'testing'
20
+ Requires-Dist: pytest-asyncio ; extra == 'testing'
21
+ Requires-Dist: requests ; extra == 'testing'
22
+ Requires-Dist: numpy ; extra == 'testing'
23
+ Requires-Dist: datasets ; extra == 'testing'
24
+ Requires-Dist: ruff ; extra == 'testing'
25
+ Requires-Dist: ty ; extra == 'testing'
26
+ Requires-Dist: sphinx ; extra == 'docs'
27
+ Requires-Dist: sphinx-rtd-theme ; extra == 'docs'
28
+ Requires-Dist: setuptools-rust ; extra == 'docs'
29
+ Requires-Dist: tokenizers[testing] ; extra == 'dev'
30
+ Provides-Extra: testing
31
+ Provides-Extra: docs
32
+ Provides-Extra: dev
33
+ Keywords: NLP,tokenizer,BPE,transformer,deep learning
34
+ Author-email: Nicolas Patry <patry.nicolas@protonmail.com>, Anthony Moi <anthony@huggingface.co>
35
+ Requires-Python: >=3.9
36
+ Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
37
+ Project-URL: Homepage, https://github.com/huggingface/tokenizers
38
+ Project-URL: Source, https://github.com/huggingface/tokenizers
39
+
40
+ <p align="center">
41
+ <br>
42
+ <img src="https://huggingface.co/landing/assets/tokenizers/tokenizers-logo.png" width="600"/>
43
+ <br>
44
+ <p>
45
+ <p align="center">
46
+ <a href="https://badge.fury.io/py/tokenizers">
47
+ <img alt="Build" src="https://badge.fury.io/py/tokenizers.svg">
48
+ </a>
49
+ <a href="https://github.com/huggingface/tokenizers/blob/master/LICENSE">
50
+ <img alt="GitHub" src="https://img.shields.io/github/license/huggingface/tokenizers.svg?color=blue">
51
+ </a>
52
+ </p>
53
+ <br>
54
+
55
+ # Tokenizers
56
+
57
+ Provides an implementation of today's most used tokenizers, with a focus on performance and
58
+ versatility.
59
+
60
+ Bindings over the [Rust](https://github.com/huggingface/tokenizers/tree/master/tokenizers) implementation.
61
+ If you are interested in the High-level design, you can go check it there.
62
+
63
+ Otherwise, let's dive in!
64
+
65
+ ## Main features:
66
+
67
+ - Train new vocabularies and tokenize using 4 pre-made tokenizers (Bert WordPiece and the 3
68
+ most common BPE versions).
69
+ - Extremely fast (both training and tokenization), thanks to the Rust implementation. Takes
70
+ less than 20 seconds to tokenize a GB of text on a server's CPU.
71
+ - Easy to use, but also extremely versatile.
72
+ - Designed for research and production.
73
+ - Normalization comes with alignments tracking. It's always possible to get the part of the
74
+ original sentence that corresponds to a given token.
75
+ - Does all the pre-processing: Truncate, Pad, add the special tokens your model needs.
76
+
77
+ ### Installation
78
+
79
+ #### With pip:
80
+
81
+ ```bash
82
+ pip install tokenizers
83
+ ```
84
+
85
+ #### From sources:
86
+
87
+ To use this method, you need to have the Rust installed:
88
+
89
+ ```bash
90
+ # Install with:
91
+ curl https://sh.rustup.rs -sSf | sh -s -- -y
92
+ export PATH="$HOME/.cargo/bin:$PATH"
93
+ ```
94
+
95
+ Once Rust is installed, you can compile doing the following
96
+
97
+ ```bash
98
+ git clone https://github.com/huggingface/tokenizers
99
+ cd tokenizers/bindings/python
100
+
101
+ # Create a virtual env (you can use yours as well)
102
+ python -m venv .env
103
+ source .env/bin/activate
104
+
105
+ # Install `tokenizers` in the current virtual env
106
+ pip install -e .
107
+ ```
108
+
109
+ ### Load a pretrained tokenizer from the Hub
110
+
111
+ ```python
112
+ from tokenizers import Tokenizer
113
+
114
+ tokenizer = Tokenizer.from_pretrained("bert-base-cased")
115
+ ```
116
+
117
+ ### Using the provided Tokenizers
118
+
119
+ We provide some pre-build tokenizers to cover the most common cases. You can easily load one of
120
+ these using some `vocab.json` and `merges.txt` files:
121
+
122
+ ```python
123
+ from tokenizers import CharBPETokenizer
124
+
125
+ # Initialize a tokenizer
126
+ vocab = "./path/to/vocab.json"
127
+ merges = "./path/to/merges.txt"
128
+ tokenizer = CharBPETokenizer(vocab, merges)
129
+
130
+ # And then encode:
131
+ encoded = tokenizer.encode("I can feel the magic, can you?")
132
+ print(encoded.ids)
133
+ print(encoded.tokens)
134
+ ```
135
+
136
+ And you can train them just as simply:
137
+
138
+ ```python
139
+ from tokenizers import CharBPETokenizer
140
+
141
+ # Initialize a tokenizer
142
+ tokenizer = CharBPETokenizer()
143
+
144
+ # Then train it!
145
+ tokenizer.train([ "./path/to/files/1.txt", "./path/to/files/2.txt" ])
146
+
147
+ # Now, let's use it:
148
+ encoded = tokenizer.encode("I can feel the magic, can you?")
149
+
150
+ # And finally save it somewhere
151
+ tokenizer.save("./path/to/directory/my-bpe.tokenizer.json")
152
+ ```
153
+
154
+ #### Provided Tokenizers
155
+
156
+ - `CharBPETokenizer`: The original BPE
157
+ - `ByteLevelBPETokenizer`: The byte level version of the BPE
158
+ - `SentencePieceBPETokenizer`: A BPE implementation compatible with the one used by SentencePiece
159
+ - `BertWordPieceTokenizer`: The famous Bert tokenizer, using WordPiece
160
+
161
+ All of these can be used and trained as explained above!
162
+
163
+ ### Build your own
164
+
165
+ Whenever these provided tokenizers don't give you enough freedom, you can build your own tokenizer,
166
+ by putting all the different parts you need together.
167
+ You can check how we implemented the [provided tokenizers](https://github.com/huggingface/tokenizers/tree/master/bindings/python/py_src/tokenizers/implementations) and adapt them easily to your own needs.
168
+
169
+ #### Building a byte-level BPE
170
+
171
+ Here is an example showing how to build your own byte-level BPE by putting all the different pieces
172
+ together, and then saving it to a single file:
173
+
174
+ ```python
175
+ from tokenizers import Tokenizer, models, pre_tokenizers, decoders, trainers, processors
176
+
177
+ # Initialize a tokenizer
178
+ tokenizer = Tokenizer(models.BPE())
179
+
180
+ # Customize pre-tokenization and decoding
181
+ tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)
182
+ tokenizer.decoder = decoders.ByteLevel()
183
+ tokenizer.post_processor = processors.ByteLevel(trim_offsets=True)
184
+
185
+ # And then train
186
+ trainer = trainers.BpeTrainer(
187
+ vocab_size=20000,
188
+ min_frequency=2,
189
+ initial_alphabet=pre_tokenizers.ByteLevel.alphabet()
190
+ )
191
+ tokenizer.train([
192
+ "./path/to/dataset/1.txt",
193
+ "./path/to/dataset/2.txt",
194
+ "./path/to/dataset/3.txt"
195
+ ], trainer=trainer)
196
+
197
+ # And Save it
198
+ tokenizer.save("byte-level-bpe.tokenizer.json", pretty=True)
199
+ ```
200
+
201
+ Now, when you want to use this tokenizer, this is as simple as:
202
+
203
+ ```python
204
+ from tokenizers import Tokenizer
205
+
206
+ tokenizer = Tokenizer.from_file("byte-level-bpe.tokenizer.json")
207
+
208
+ encoded = tokenizer.encode("I can feel the magic, can you?")
209
+ ```
210
+
211
+ ### Typing support and `stub.py`
212
+
213
+ The compiled PyO3 extension does not expose type annotations, so editors and type checkers would otherwise see most objects as `Any`. The `stub.py` helper walks the loaded extension modules, renders `.pyi` stub files (plus minimal forwarding `__init__.py` shims), and formats them so that tools like mypy/pyright can understand the public API. Run `python stub.py` whenever you change the Python-visible surface to keep the generated stubs in sync.
214
+
source/tokenizers-0.22.2.dist-info/RECORD ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ tokenizers-0.22.2.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
2
+ tokenizers-0.22.2.dist-info/METADATA,sha256=FaXdr0ifWSt34Kk0wO60a1ETCpQGTKEpIyr9sKOGjvw,7254
3
+ tokenizers-0.22.2.dist-info/RECORD,,
4
+ tokenizers-0.22.2.dist-info/WHEEL,sha256=5mwg5nCvp3YrLxikUrE5E0HBDKerMOoBBb70NjCncME,143
5
+ tokenizers/__init__.py,sha256=FI7LEi8_7gO-mrsf4hPdhfvGkb8q0rQ3_1MVM3gaajo,2639
6
+ tokenizers/__init__.pyi,sha256=MKWF2m4mz7IG1bPTdJ7AjXkQDNzkmQSLMmACQ2VUYJU,55891
7
+ tokenizers/__pycache__/__init__.cpython-312.pyc,,
8
+ tokenizers/decoders/__init__.py,sha256=hfwM6CFUDvlMGGL4-xsaaYz81K9P5rQI5ZL5UHWK8Y4,372
9
+ tokenizers/decoders/__init__.pyi,sha256=T60mFckMbS8YrsonOAPtfvb7VYHUJi9mm47Wd8pT62o,12019
10
+ tokenizers/decoders/__pycache__/__init__.cpython-312.pyc,,
11
+ tokenizers/implementations/__init__.py,sha256=VzAsplaIo7rl4AFO8Miu7ig7MfZjvonwVblZw01zR6M,310
12
+ tokenizers/implementations/__pycache__/__init__.cpython-312.pyc,,
13
+ tokenizers/implementations/__pycache__/base_tokenizer.cpython-312.pyc,,
14
+ tokenizers/implementations/__pycache__/bert_wordpiece.cpython-312.pyc,,
15
+ tokenizers/implementations/__pycache__/byte_level_bpe.cpython-312.pyc,,
16
+ tokenizers/implementations/__pycache__/char_level_bpe.cpython-312.pyc,,
17
+ tokenizers/implementations/__pycache__/sentencepiece_bpe.cpython-312.pyc,,
18
+ tokenizers/implementations/__pycache__/sentencepiece_unigram.cpython-312.pyc,,
19
+ tokenizers/implementations/base_tokenizer.py,sha256=PtQ2TSmoMGlTpL8oc8fDvwJVIY6isWGmps9comzsWjE,15806
20
+ tokenizers/implementations/bert_wordpiece.py,sha256=sKCum0FKPYdSgJFJN8LDerVBoTDRSqyqSdrcm-lvQqI,5520
21
+ tokenizers/implementations/byte_level_bpe.py,sha256=iBepM_z1s5Ky7zFDVrYLc3L5byYrIouk7-k0JGuF10s,4272
22
+ tokenizers/implementations/char_level_bpe.py,sha256=Nag_HFq8Rvcucqi8MhV1-0xtoR0C7FjHOecFVURL7ss,5449
23
+ tokenizers/implementations/sentencepiece_bpe.py,sha256=c08fKf6i92E2RsKgsxy7LzZfYX8-MACHSRG8U_I5ytY,3721
24
+ tokenizers/implementations/sentencepiece_unigram.py,sha256=2RoIfFVpiMkJOtOCskM_VCeCELWaC_bNnds6GvtE0KQ,7630
25
+ tokenizers/models/__init__.py,sha256=eJZ4HTAQZpxnKILNylWaTFqxXy-Ba6OKswWN47feeV8,176
26
+ tokenizers/models/__init__.pyi,sha256=2gZPQR1Z5_krTzLXx-ts5ai7Fz7bTZ0QI1OSJ5MyOuc,19517
27
+ tokenizers/models/__pycache__/__init__.cpython-312.pyc,,
28
+ tokenizers/normalizers/__init__.py,sha256=_06w4cqRItveEgIddYaLMScgkSOkIAMIzYCesb5AA4U,841
29
+ tokenizers/normalizers/__init__.pyi,sha256=6zYmbFtvdF1WhoWQSdEN974mxHjc7ZwJBA0TI2dJk98,25709
30
+ tokenizers/normalizers/__pycache__/__init__.cpython-312.pyc,,
31
+ tokenizers/pre_tokenizers/__init__.py,sha256=KV9-EsAykGENUUzkGWCbv4n6YM6hYa1hfnY-gzBpMNE,598
32
+ tokenizers/pre_tokenizers/__init__.pyi,sha256=_pc34-Kd2N7Nvs7vTHPULBKjm18iJRM9qLOClVHw9n4,31566
33
+ tokenizers/pre_tokenizers/__pycache__/__init__.cpython-312.pyc,,
34
+ tokenizers/processors/__init__.py,sha256=xM2DEKwKtHIumHsszM8AMkq-AlaqvBZFXWgLU8SNhOY,307
35
+ tokenizers/processors/__init__.pyi,sha256=5L5OBZ7SXCg7AEy51jyDHViaCSHG5c7vW4eWjSVQbUs,14348
36
+ tokenizers/processors/__pycache__/__init__.cpython-312.pyc,,
37
+ tokenizers/tokenizers.abi3.so,sha256=wRb88egNRhzgo1wzKXTyWUnoNZQW9Qs9UzcYENLOHMw,10074176
38
+ tokenizers/tokenizers.pyi,sha256=Mq4G5RcxKiVc0FZd_Omi-bT7YQMRc-iDBU_nPCmCZOA,468
39
+ tokenizers/tools/__init__.py,sha256=xG8caB9OHC8cbB01S5vYV14HZxhO6eWbLehsb70ppio,55
40
+ tokenizers/tools/__pycache__/__init__.cpython-312.pyc,,
41
+ tokenizers/tools/__pycache__/visualizer.cpython-312.pyc,,
42
+ tokenizers/tools/visualizer-styles.css,sha256=zAydq1oGWD8QEll4-eyL8Llw0B1sty_hpIE3tYxL02k,4850
43
+ tokenizers/tools/visualizer.py,sha256=jtxka01phNP47uQSocIQFO_DMnL3ZHdwohGVDqqYJPo,14834
44
+ tokenizers/trainers/__init__.py,sha256=UTu22AGcp76IvpW45xLRbJWET04NxPW6NfCb2YYz0EM,248
45
+ tokenizers/trainers/__init__.pyi,sha256=jKtDNXnoX6FWeCTeHz-W62Cj2_JErgYG7h1PReUz1rU,10719
46
+ tokenizers/trainers/__pycache__/__init__.cpython-312.pyc,,
source/tokenizers-0.22.2.dist-info/WHEEL ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ Wheel-Version: 1.0
2
+ Generator: maturin (1.10.2)
3
+ Root-Is-Purelib: false
4
+ Tag: cp39-abi3-manylinux_2_17_x86_64
5
+ Tag: cp39-abi3-manylinux2014_x86_64
source/tokenizers/__init__.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from enum import Enum
2
+ from typing import List, Tuple, Union
3
+
4
+
5
+ Offsets = Tuple[int, int]
6
+
7
+ TextInputSequence = str
8
+ """A :obj:`str` that represents an input sequence """
9
+
10
+ PreTokenizedInputSequence = Union[List[str], Tuple[str]]
11
+ """A pre-tokenized input sequence. Can be one of:
12
+
13
+ - A :obj:`List` of :obj:`str`
14
+ - A :obj:`Tuple` of :obj:`str`
15
+ """
16
+
17
+ TextEncodeInput = Union[
18
+ TextInputSequence,
19
+ Tuple[TextInputSequence, TextInputSequence],
20
+ List[TextInputSequence],
21
+ ]
22
+ """Represents a textual input for encoding. Can be either:
23
+
24
+ - A single sequence: :data:`~tokenizers.TextInputSequence`
25
+ - A pair of sequences:
26
+
27
+ - A :obj:`Tuple` of :data:`~tokenizers.TextInputSequence`
28
+ - Or a :obj:`List` of :data:`~tokenizers.TextInputSequence` of size 2
29
+ """
30
+
31
+ PreTokenizedEncodeInput = Union[
32
+ PreTokenizedInputSequence,
33
+ Tuple[PreTokenizedInputSequence, PreTokenizedInputSequence],
34
+ List[PreTokenizedInputSequence],
35
+ ]
36
+ """Represents a pre-tokenized input for encoding. Can be either:
37
+
38
+ - A single sequence: :data:`~tokenizers.PreTokenizedInputSequence`
39
+ - A pair of sequences:
40
+
41
+ - A :obj:`Tuple` of :data:`~tokenizers.PreTokenizedInputSequence`
42
+ - Or a :obj:`List` of :data:`~tokenizers.PreTokenizedInputSequence` of size 2
43
+ """
44
+
45
+ InputSequence = Union[TextInputSequence, PreTokenizedInputSequence]
46
+ """Represents all the possible types of input sequences for encoding. Can be:
47
+
48
+ - When ``is_pretokenized=False``: :data:`~TextInputSequence`
49
+ - When ``is_pretokenized=True``: :data:`~PreTokenizedInputSequence`
50
+ """
51
+
52
+ EncodeInput = Union[TextEncodeInput, PreTokenizedEncodeInput]
53
+ """Represents all the possible types of input for encoding. Can be:
54
+
55
+ - When ``is_pretokenized=False``: :data:`~TextEncodeInput`
56
+ - When ``is_pretokenized=True``: :data:`~PreTokenizedEncodeInput`
57
+ """
58
+
59
+
60
+ class OffsetReferential(Enum):
61
+ ORIGINAL = "original"
62
+ NORMALIZED = "normalized"
63
+
64
+
65
+ class OffsetType(Enum):
66
+ BYTE = "byte"
67
+ CHAR = "char"
68
+
69
+
70
+ class SplitDelimiterBehavior(Enum):
71
+ REMOVED = "removed"
72
+ ISOLATED = "isolated"
73
+ MERGED_WITH_PREVIOUS = "merged_with_previous"
74
+ MERGED_WITH_NEXT = "merged_with_next"
75
+ CONTIGUOUS = "contiguous"
76
+
77
+
78
+ from .tokenizers import ( # type: ignore[import]
79
+ AddedToken,
80
+ Encoding,
81
+ NormalizedString,
82
+ PreTokenizedString,
83
+ Regex,
84
+ Token,
85
+ Tokenizer,
86
+ decoders,
87
+ models,
88
+ normalizers,
89
+ pre_tokenizers,
90
+ processors,
91
+ trainers,
92
+ __version__,
93
+ )
94
+ from .implementations import (
95
+ BertWordPieceTokenizer,
96
+ ByteLevelBPETokenizer,
97
+ CharBPETokenizer,
98
+ SentencePieceBPETokenizer,
99
+ SentencePieceUnigramTokenizer,
100
+ )
source/tokenizers/__init__.pyi ADDED
@@ -0,0 +1,1800 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Generated content DO NOT EDIT
2
+ class AddedToken:
3
+ """
4
+ Represents a token that can be added to a :class:`~tokenizers.Tokenizer`.
5
+ It can have special options that define the way it should behave.
6
+
7
+ Args:
8
+ content (:obj:`str`): The content of the token
9
+
10
+ single_word (:obj:`bool`, defaults to :obj:`False`):
11
+ Defines whether this token should only match single words. If :obj:`True`, this
12
+ token will never match inside of a word. For example the token ``ing`` would match
13
+ on ``tokenizing`` if this option is :obj:`False`, but not if it is :obj:`True`.
14
+ The notion of "`inside of a word`" is defined by the word boundaries pattern in
15
+ regular expressions (ie. the token should start and end with word boundaries).
16
+
17
+ lstrip (:obj:`bool`, defaults to :obj:`False`):
18
+ Defines whether this token should strip all potential whitespaces on its left side.
19
+ If :obj:`True`, this token will greedily match any whitespace on its left. For
20
+ example if we try to match the token ``[MASK]`` with ``lstrip=True``, in the text
21
+ ``"I saw a [MASK]"``, we would match on ``" [MASK]"``. (Note the space on the left).
22
+
23
+ rstrip (:obj:`bool`, defaults to :obj:`False`):
24
+ Defines whether this token should strip all potential whitespaces on its right
25
+ side. If :obj:`True`, this token will greedily match any whitespace on its right.
26
+ It works just like :obj:`lstrip` but on the right.
27
+
28
+ normalized (:obj:`bool`, defaults to :obj:`True` with :meth:`~tokenizers.Tokenizer.add_tokens` and :obj:`False` with :meth:`~tokenizers.Tokenizer.add_special_tokens`):
29
+ Defines whether this token should match against the normalized version of the input
30
+ text. For example, with the added token ``"yesterday"``, and a normalizer in charge of
31
+ lowercasing the text, the token could be extracted from the input ``"I saw a lion
32
+ Yesterday"``.
33
+ special (:obj:`bool`, defaults to :obj:`False` with :meth:`~tokenizers.Tokenizer.add_tokens` and :obj:`False` with :meth:`~tokenizers.Tokenizer.add_special_tokens`):
34
+ Defines whether this token should be skipped when decoding.
35
+
36
+ """
37
+ def __init__(self, content=None, single_word=False, lstrip=False, rstrip=False, normalized=True, special=False):
38
+ pass
39
+
40
+ def __getstate__(self):
41
+ """ """
42
+ pass
43
+
44
+ def __setstate__(self, state):
45
+ """ """
46
+ pass
47
+
48
+ @property
49
+ def content(self):
50
+ """
51
+ Get the content of this :obj:`AddedToken`
52
+ """
53
+ pass
54
+
55
+ @content.setter
56
+ def content(self, value):
57
+ """
58
+ Get the content of this :obj:`AddedToken`
59
+ """
60
+ pass
61
+
62
+ @property
63
+ def lstrip(self):
64
+ """
65
+ Get the value of the :obj:`lstrip` option
66
+ """
67
+ pass
68
+
69
+ @lstrip.setter
70
+ def lstrip(self, value):
71
+ """
72
+ Get the value of the :obj:`lstrip` option
73
+ """
74
+ pass
75
+
76
+ @property
77
+ def normalized(self):
78
+ """
79
+ Get the value of the :obj:`normalized` option
80
+ """
81
+ pass
82
+
83
+ @normalized.setter
84
+ def normalized(self, value):
85
+ """
86
+ Get the value of the :obj:`normalized` option
87
+ """
88
+ pass
89
+
90
+ @property
91
+ def rstrip(self):
92
+ """
93
+ Get the value of the :obj:`rstrip` option
94
+ """
95
+ pass
96
+
97
+ @rstrip.setter
98
+ def rstrip(self, value):
99
+ """
100
+ Get the value of the :obj:`rstrip` option
101
+ """
102
+ pass
103
+
104
+ @property
105
+ def single_word(self):
106
+ """
107
+ Get the value of the :obj:`single_word` option
108
+ """
109
+ pass
110
+
111
+ @single_word.setter
112
+ def single_word(self, value):
113
+ """
114
+ Get the value of the :obj:`single_word` option
115
+ """
116
+ pass
117
+
118
+ @property
119
+ def special(self):
120
+ """
121
+ Get the value of the :obj:`special` option
122
+ """
123
+ pass
124
+
125
+ @special.setter
126
+ def special(self, value):
127
+ """
128
+ Get the value of the :obj:`special` option
129
+ """
130
+ pass
131
+
132
+ class Encoding:
133
+ """
134
+ The :class:`~tokenizers.Encoding` represents the output of a :class:`~tokenizers.Tokenizer`.
135
+ """
136
+ def __init__(self):
137
+ pass
138
+
139
+ def __getstate__(self):
140
+ """ """
141
+ pass
142
+
143
+ def __setstate__(self, state):
144
+ """ """
145
+ pass
146
+
147
+ @property
148
+ def attention_mask(self):
149
+ """
150
+ The attention mask
151
+
152
+ This indicates to the LM which tokens should be attended to, and which should not.
153
+ This is especially important when batching sequences, where we need to apply
154
+ padding.
155
+
156
+ Returns:
157
+ :obj:`List[int]`: The attention mask
158
+ """
159
+ pass
160
+
161
+ @attention_mask.setter
162
+ def attention_mask(self, value):
163
+ """
164
+ The attention mask
165
+
166
+ This indicates to the LM which tokens should be attended to, and which should not.
167
+ This is especially important when batching sequences, where we need to apply
168
+ padding.
169
+
170
+ Returns:
171
+ :obj:`List[int]`: The attention mask
172
+ """
173
+ pass
174
+
175
+ def char_to_token(self, char_pos, sequence_index=0):
176
+ """
177
+ Get the token that contains the char at the given position in the input sequence.
178
+
179
+ Args:
180
+ char_pos (:obj:`int`):
181
+ The position of a char in the input string
182
+ sequence_index (:obj:`int`, defaults to :obj:`0`):
183
+ The index of the sequence that contains the target char
184
+
185
+ Returns:
186
+ :obj:`int`: The index of the token that contains this char in the encoded sequence
187
+ """
188
+ pass
189
+
190
+ def char_to_word(self, char_pos, sequence_index=0):
191
+ """
192
+ Get the word that contains the char at the given position in the input sequence.
193
+
194
+ Args:
195
+ char_pos (:obj:`int`):
196
+ The position of a char in the input string
197
+ sequence_index (:obj:`int`, defaults to :obj:`0`):
198
+ The index of the sequence that contains the target char
199
+
200
+ Returns:
201
+ :obj:`int`: The index of the word that contains this char in the input sequence
202
+ """
203
+ pass
204
+
205
+ @property
206
+ def ids(self):
207
+ """
208
+ The generated IDs
209
+
210
+ The IDs are the main input to a Language Model. They are the token indices,
211
+ the numerical representations that a LM understands.
212
+
213
+ Returns:
214
+ :obj:`List[int]`: The list of IDs
215
+ """
216
+ pass
217
+
218
+ @ids.setter
219
+ def ids(self, value):
220
+ """
221
+ The generated IDs
222
+
223
+ The IDs are the main input to a Language Model. They are the token indices,
224
+ the numerical representations that a LM understands.
225
+
226
+ Returns:
227
+ :obj:`List[int]`: The list of IDs
228
+ """
229
+ pass
230
+
231
+ @staticmethod
232
+ def merge(encodings, growing_offsets=True):
233
+ """
234
+ Merge the list of encodings into one final :class:`~tokenizers.Encoding`
235
+
236
+ Args:
237
+ encodings (A :obj:`List` of :class:`~tokenizers.Encoding`):
238
+ The list of encodings that should be merged in one
239
+
240
+ growing_offsets (:obj:`bool`, defaults to :obj:`True`):
241
+ Whether the offsets should accumulate while merging
242
+
243
+ Returns:
244
+ :class:`~tokenizers.Encoding`: The resulting Encoding
245
+ """
246
+ pass
247
+
248
+ @property
249
+ def n_sequences(self):
250
+ """
251
+ The number of sequences represented
252
+
253
+ Returns:
254
+ :obj:`int`: The number of sequences in this :class:`~tokenizers.Encoding`
255
+ """
256
+ pass
257
+
258
+ @n_sequences.setter
259
+ def n_sequences(self, value):
260
+ """
261
+ The number of sequences represented
262
+
263
+ Returns:
264
+ :obj:`int`: The number of sequences in this :class:`~tokenizers.Encoding`
265
+ """
266
+ pass
267
+
268
+ @property
269
+ def offsets(self):
270
+ """
271
+ The offsets associated to each token
272
+
273
+ These offsets let you slice the input string, and thus retrieve the original
274
+ part that led to producing the corresponding token.
275
+
276
+ Returns:
277
+ A :obj:`List` of :obj:`Tuple[int, int]`: The list of offsets
278
+ """
279
+ pass
280
+
281
+ @offsets.setter
282
+ def offsets(self, value):
283
+ """
284
+ The offsets associated to each token
285
+
286
+ These offsets let you slice the input string, and thus retrieve the original
287
+ part that led to producing the corresponding token.
288
+
289
+ Returns:
290
+ A :obj:`List` of :obj:`Tuple[int, int]`: The list of offsets
291
+ """
292
+ pass
293
+
294
+ @property
295
+ def overflowing(self):
296
+ """
297
+ A :obj:`List` of overflowing :class:`~tokenizers.Encoding`
298
+
299
+ When using truncation, the :class:`~tokenizers.Tokenizer` takes care of splitting
300
+ the output into as many pieces as required to match the specified maximum length.
301
+ This field lets you retrieve all the subsequent pieces.
302
+
303
+ When you use pairs of sequences, the overflowing pieces will contain enough
304
+ variations to cover all the possible combinations, while respecting the provided
305
+ maximum length.
306
+ """
307
+ pass
308
+
309
+ @overflowing.setter
310
+ def overflowing(self, value):
311
+ """
312
+ A :obj:`List` of overflowing :class:`~tokenizers.Encoding`
313
+
314
+ When using truncation, the :class:`~tokenizers.Tokenizer` takes care of splitting
315
+ the output into as many pieces as required to match the specified maximum length.
316
+ This field lets you retrieve all the subsequent pieces.
317
+
318
+ When you use pairs of sequences, the overflowing pieces will contain enough
319
+ variations to cover all the possible combinations, while respecting the provided
320
+ maximum length.
321
+ """
322
+ pass
323
+
324
+ def pad(self, length, direction="right", pad_id=0, pad_type_id=0, pad_token="[PAD]"):
325
+ """
326
+ Pad the :class:`~tokenizers.Encoding` at the given length
327
+
328
+ Args:
329
+ length (:obj:`int`):
330
+ The desired length
331
+
332
+ direction (:obj:`str`, defaults to :obj:`right`):
333
+ The expected padding direction. Can be either :obj:`right` or :obj:`left`
334
+
335
+ pad_id (:obj:`int`, defaults to :obj:`0`):
336
+ The ID corresponding to the padding token
337
+
338
+ pad_type_id (:obj:`int`, defaults to :obj:`0`):
339
+ The type ID corresponding to the padding token
340
+
341
+ pad_token (:obj:`str`, defaults to `[PAD]`):
342
+ The pad token to use
343
+ """
344
+ pass
345
+
346
+ @property
347
+ def sequence_ids(self):
348
+ """
349
+ The generated sequence indices.
350
+
351
+ They represent the index of the input sequence associated to each token.
352
+ The sequence id can be None if the token is not related to any input sequence,
353
+ like for example with special tokens.
354
+
355
+ Returns:
356
+ A :obj:`List` of :obj:`Optional[int]`: A list of optional sequence index.
357
+ """
358
+ pass
359
+
360
+ @sequence_ids.setter
361
+ def sequence_ids(self, value):
362
+ """
363
+ The generated sequence indices.
364
+
365
+ They represent the index of the input sequence associated to each token.
366
+ The sequence id can be None if the token is not related to any input sequence,
367
+ like for example with special tokens.
368
+
369
+ Returns:
370
+ A :obj:`List` of :obj:`Optional[int]`: A list of optional sequence index.
371
+ """
372
+ pass
373
+
374
+ def set_sequence_id(self, sequence_id):
375
+ """
376
+ Set the given sequence index
377
+
378
+ Set the given sequence index for the whole range of tokens contained in this
379
+ :class:`~tokenizers.Encoding`.
380
+ """
381
+ pass
382
+
383
+ @property
384
+ def special_tokens_mask(self):
385
+ """
386
+ The special token mask
387
+
388
+ This indicates which tokens are special tokens, and which are not.
389
+
390
+ Returns:
391
+ :obj:`List[int]`: The special tokens mask
392
+ """
393
+ pass
394
+
395
+ @special_tokens_mask.setter
396
+ def special_tokens_mask(self, value):
397
+ """
398
+ The special token mask
399
+
400
+ This indicates which tokens are special tokens, and which are not.
401
+
402
+ Returns:
403
+ :obj:`List[int]`: The special tokens mask
404
+ """
405
+ pass
406
+
407
+ def token_to_chars(self, token_index):
408
+ """
409
+ Get the offsets of the token at the given index.
410
+
411
+ The returned offsets are related to the input sequence that contains the
412
+ token. In order to determine in which input sequence it belongs, you
413
+ must call :meth:`~tokenizers.Encoding.token_to_sequence()`.
414
+
415
+ Args:
416
+ token_index (:obj:`int`):
417
+ The index of a token in the encoded sequence.
418
+
419
+ Returns:
420
+ :obj:`Tuple[int, int]`: The token offsets :obj:`(first, last + 1)`
421
+ """
422
+ pass
423
+
424
+ def token_to_sequence(self, token_index):
425
+ """
426
+ Get the index of the sequence represented by the given token.
427
+
428
+ In the general use case, this method returns :obj:`0` for a single sequence or
429
+ the first sequence of a pair, and :obj:`1` for the second sequence of a pair
430
+
431
+ Args:
432
+ token_index (:obj:`int`):
433
+ The index of a token in the encoded sequence.
434
+
435
+ Returns:
436
+ :obj:`int`: The sequence id of the given token
437
+ """
438
+ pass
439
+
440
+ def token_to_word(self, token_index):
441
+ """
442
+ Get the index of the word that contains the token in one of the input sequences.
443
+
444
+ The returned word index is related to the input sequence that contains
445
+ the token. In order to determine in which input sequence it belongs, you
446
+ must call :meth:`~tokenizers.Encoding.token_to_sequence()`.
447
+
448
+ Args:
449
+ token_index (:obj:`int`):
450
+ The index of a token in the encoded sequence.
451
+
452
+ Returns:
453
+ :obj:`int`: The index of the word in the relevant input sequence.
454
+ """
455
+ pass
456
+
457
+ @property
458
+ def tokens(self):
459
+ """
460
+ The generated tokens
461
+
462
+ They are the string representation of the IDs.
463
+
464
+ Returns:
465
+ :obj:`List[str]`: The list of tokens
466
+ """
467
+ pass
468
+
469
+ @tokens.setter
470
+ def tokens(self, value):
471
+ """
472
+ The generated tokens
473
+
474
+ They are the string representation of the IDs.
475
+
476
+ Returns:
477
+ :obj:`List[str]`: The list of tokens
478
+ """
479
+ pass
480
+
481
+ def truncate(self, max_length, stride=0, direction="right"):
482
+ """
483
+ Truncate the :class:`~tokenizers.Encoding` at the given length
484
+
485
+ If this :class:`~tokenizers.Encoding` represents multiple sequences, when truncating
486
+ this information is lost. It will be considered as representing a single sequence.
487
+
488
+ Args:
489
+ max_length (:obj:`int`):
490
+ The desired length
491
+
492
+ stride (:obj:`int`, defaults to :obj:`0`):
493
+ The length of previous content to be included in each overflowing piece
494
+
495
+ direction (:obj:`str`, defaults to :obj:`right`):
496
+ Truncate direction
497
+ """
498
+ pass
499
+
500
+ @property
501
+ def type_ids(self):
502
+ """
503
+ The generated type IDs
504
+
505
+ Generally used for tasks like sequence classification or question answering,
506
+ these tokens let the LM know which input sequence corresponds to each token.
507
+
508
+ Returns:
509
+ :obj:`List[int]`: The list of type ids
510
+ """
511
+ pass
512
+
513
+ @type_ids.setter
514
+ def type_ids(self, value):
515
+ """
516
+ The generated type IDs
517
+
518
+ Generally used for tasks like sequence classification or question answering,
519
+ these tokens let the LM know which input sequence corresponds to each token.
520
+
521
+ Returns:
522
+ :obj:`List[int]`: The list of type ids
523
+ """
524
+ pass
525
+
526
+ @property
527
+ def word_ids(self):
528
+ """
529
+ The generated word indices.
530
+
531
+ They represent the index of the word associated to each token.
532
+ When the input is pre-tokenized, they correspond to the ID of the given input label,
533
+ otherwise they correspond to the words indices as defined by the
534
+ :class:`~tokenizers.pre_tokenizers.PreTokenizer` that was used.
535
+
536
+ For special tokens and such (any token that was generated from something that was
537
+ not part of the input), the output is :obj:`None`
538
+
539
+ Returns:
540
+ A :obj:`List` of :obj:`Optional[int]`: A list of optional word index.
541
+ """
542
+ pass
543
+
544
+ @word_ids.setter
545
+ def word_ids(self, value):
546
+ """
547
+ The generated word indices.
548
+
549
+ They represent the index of the word associated to each token.
550
+ When the input is pre-tokenized, they correspond to the ID of the given input label,
551
+ otherwise they correspond to the words indices as defined by the
552
+ :class:`~tokenizers.pre_tokenizers.PreTokenizer` that was used.
553
+
554
+ For special tokens and such (any token that was generated from something that was
555
+ not part of the input), the output is :obj:`None`
556
+
557
+ Returns:
558
+ A :obj:`List` of :obj:`Optional[int]`: A list of optional word index.
559
+ """
560
+ pass
561
+
562
+ def word_to_chars(self, word_index, sequence_index=0):
563
+ """
564
+ Get the offsets of the word at the given index in one of the input sequences.
565
+
566
+ Args:
567
+ word_index (:obj:`int`):
568
+ The index of a word in one of the input sequences.
569
+ sequence_index (:obj:`int`, defaults to :obj:`0`):
570
+ The index of the sequence that contains the target word
571
+
572
+ Returns:
573
+ :obj:`Tuple[int, int]`: The range of characters (span) :obj:`(first, last + 1)`
574
+ """
575
+ pass
576
+
577
+ def word_to_tokens(self, word_index, sequence_index=0):
578
+ """
579
+ Get the encoded tokens corresponding to the word at the given index
580
+ in one of the input sequences.
581
+
582
+ Args:
583
+ word_index (:obj:`int`):
584
+ The index of a word in one of the input sequences.
585
+ sequence_index (:obj:`int`, defaults to :obj:`0`):
586
+ The index of the sequence that contains the target word
587
+
588
+ Returns:
589
+ :obj:`Tuple[int, int]`: The range of tokens: :obj:`(first, last + 1)`
590
+ """
591
+ pass
592
+
593
+ @property
594
+ def words(self):
595
+ """
596
+ The generated word indices.
597
+
598
+ .. warning::
599
+ This is deprecated and will be removed in a future version.
600
+ Please use :obj:`~tokenizers.Encoding.word_ids` instead.
601
+
602
+ They represent the index of the word associated to each token.
603
+ When the input is pre-tokenized, they correspond to the ID of the given input label,
604
+ otherwise they correspond to the words indices as defined by the
605
+ :class:`~tokenizers.pre_tokenizers.PreTokenizer` that was used.
606
+
607
+ For special tokens and such (any token that was generated from something that was
608
+ not part of the input), the output is :obj:`None`
609
+
610
+ Returns:
611
+ A :obj:`List` of :obj:`Optional[int]`: A list of optional word index.
612
+ """
613
+ pass
614
+
615
+ @words.setter
616
+ def words(self, value):
617
+ """
618
+ The generated word indices.
619
+
620
+ .. warning::
621
+ This is deprecated and will be removed in a future version.
622
+ Please use :obj:`~tokenizers.Encoding.word_ids` instead.
623
+
624
+ They represent the index of the word associated to each token.
625
+ When the input is pre-tokenized, they correspond to the ID of the given input label,
626
+ otherwise they correspond to the words indices as defined by the
627
+ :class:`~tokenizers.pre_tokenizers.PreTokenizer` that was used.
628
+
629
+ For special tokens and such (any token that was generated from something that was
630
+ not part of the input), the output is :obj:`None`
631
+
632
+ Returns:
633
+ A :obj:`List` of :obj:`Optional[int]`: A list of optional word index.
634
+ """
635
+ pass
636
+
637
+ class NormalizedString:
638
+ """
639
+ NormalizedString
640
+
641
+ A NormalizedString takes care of modifying an "original" string, to obtain a "normalized" one.
642
+ While making all the requested modifications, it keeps track of the alignment information
643
+ between the two versions of the string.
644
+
645
+ Args:
646
+ sequence: str:
647
+ The string sequence used to initialize this NormalizedString
648
+ """
649
+ def __init__(self, sequence):
650
+ pass
651
+
652
+ def __getitem__(self, key):
653
+ """
654
+ Return self[key].
655
+ """
656
+ pass
657
+
658
+ def __getstate__(self, /):
659
+ """
660
+ Helper for pickle.
661
+ """
662
+ pass
663
+
664
+ def append(self, s):
665
+ """
666
+ Append the given sequence to the string
667
+ """
668
+ pass
669
+
670
+ def clear(self):
671
+ """
672
+ Clears the string
673
+ """
674
+ pass
675
+
676
+ def filter(self, func):
677
+ """
678
+ Filter each character of the string using the given func
679
+ """
680
+ pass
681
+
682
+ def for_each(self, func):
683
+ """
684
+ Calls the given function for each character of the string
685
+ """
686
+ pass
687
+
688
+ def lowercase(self):
689
+ """
690
+ Lowercase the string
691
+ """
692
+ pass
693
+
694
+ def lstrip(self):
695
+ """
696
+ Strip the left of the string
697
+ """
698
+ pass
699
+
700
+ def map(self, func):
701
+ """
702
+ Calls the given function for each character of the string
703
+
704
+ Replaces each character of the string using the returned value. Each
705
+ returned value **must** be a str of length 1 (ie a character).
706
+ """
707
+ pass
708
+
709
+ def nfc(self):
710
+ """
711
+ Runs the NFC normalization
712
+ """
713
+ pass
714
+
715
+ def nfd(self):
716
+ """
717
+ Runs the NFD normalization
718
+ """
719
+ pass
720
+
721
+ def nfkc(self):
722
+ """
723
+ Runs the NFKC normalization
724
+ """
725
+ pass
726
+
727
+ def nfkd(self):
728
+ """
729
+ Runs the NFKD normalization
730
+ """
731
+ pass
732
+
733
+ @property
734
+ def normalized(self):
735
+ """
736
+ The normalized part of the string
737
+ """
738
+ pass
739
+
740
+ @normalized.setter
741
+ def normalized(self, value):
742
+ """
743
+ The normalized part of the string
744
+ """
745
+ pass
746
+
747
+ @property
748
+ def original(self):
749
+ """ """
750
+ pass
751
+
752
+ @original.setter
753
+ def original(self, value):
754
+ """ """
755
+ pass
756
+
757
+ def prepend(self, s):
758
+ """
759
+ Prepend the given sequence to the string
760
+ """
761
+ pass
762
+
763
+ def replace(self, pattern, content):
764
+ """
765
+ Replace the content of the given pattern with the provided content
766
+
767
+ Args:
768
+ pattern: Pattern:
769
+ A pattern used to match the string. Usually a string or a Regex
770
+
771
+ content: str:
772
+ The content to be used as replacement
773
+ """
774
+ pass
775
+
776
+ def rstrip(self):
777
+ """
778
+ Strip the right of the string
779
+ """
780
+ pass
781
+
782
+ def slice(self, range):
783
+ """
784
+ Slice the string using the given range
785
+ """
786
+ pass
787
+
788
+ def split(self, pattern, behavior):
789
+ """
790
+ Split the NormalizedString using the given pattern and the specified behavior
791
+
792
+ Args:
793
+ pattern: Pattern:
794
+ A pattern used to split the string. Usually a string or a regex built with `tokenizers.Regex`
795
+
796
+ behavior: SplitDelimiterBehavior:
797
+ The behavior to use when splitting.
798
+ Choices: "removed", "isolated", "merged_with_previous", "merged_with_next",
799
+ "contiguous"
800
+
801
+ Returns:
802
+ A list of NormalizedString, representing each split
803
+ """
804
+ pass
805
+
806
+ def strip(self):
807
+ """
808
+ Strip both ends of the string
809
+ """
810
+ pass
811
+
812
+ def uppercase(self):
813
+ """
814
+ Uppercase the string
815
+ """
816
+ pass
817
+
818
+ class PreTokenizedString:
819
+ """
820
+ PreTokenizedString
821
+
822
+ Wrapper over a string, that provides a way to normalize, pre-tokenize, tokenize the
823
+ underlying string, while keeping track of the alignment information (offsets).
824
+
825
+ The PreTokenizedString manages what we call `splits`. Each split represents a substring
826
+ which is a subpart of the original string, with the relevant offsets and tokens.
827
+
828
+ When calling one of the methods used to modify the PreTokenizedString (namely one of
829
+ `split`, `normalize` or `tokenize), only the `splits` that don't have any associated
830
+ tokens will get modified.
831
+
832
+ Args:
833
+ sequence: str:
834
+ The string sequence used to initialize this PreTokenizedString
835
+ """
836
+ def __init__(self, sequence):
837
+ pass
838
+
839
+ def __getstate__(self, /):
840
+ """
841
+ Helper for pickle.
842
+ """
843
+ pass
844
+
845
+ def get_splits(self, offset_referential="original", offset_type="char"):
846
+ """
847
+ Get the splits currently managed by the PreTokenizedString
848
+
849
+ Args:
850
+ offset_referential: :obj:`str`
851
+ Whether the returned splits should have offsets expressed relative
852
+ to the original string, or the normalized one. choices: "original", "normalized".
853
+
854
+ offset_type: :obj:`str`
855
+ Whether the returned splits should have offsets expressed in bytes or chars.
856
+ When slicing an str, we usually want to use chars, which is the default value.
857
+ Now in some cases it might be interesting to get these offsets expressed in bytes,
858
+ so it is possible to change this here.
859
+ choices: "char", "bytes"
860
+
861
+ Returns
862
+ A list of splits
863
+ """
864
+ pass
865
+
866
+ def normalize(self, func):
867
+ """
868
+ Normalize each split of the `PreTokenizedString` using the given `func`
869
+
870
+ Args:
871
+ func: Callable[[NormalizedString], None]:
872
+ The function used to normalize each underlying split. This function
873
+ does not need to return anything, just calling the methods on the provided
874
+ NormalizedString allow its modification.
875
+ """
876
+ pass
877
+
878
+ def split(self, func):
879
+ """
880
+ Split the PreTokenizedString using the given `func`
881
+
882
+ Args:
883
+ func: Callable[[index, NormalizedString], List[NormalizedString]]:
884
+ The function used to split each underlying split.
885
+ It is expected to return a list of `NormalizedString`, that represent the new
886
+ splits. If the given `NormalizedString` does not need any splitting, we can
887
+ just return it directly.
888
+ In order for the offsets to be tracked accurately, any returned `NormalizedString`
889
+ should come from calling either `.split` or `.slice` on the received one.
890
+ """
891
+ pass
892
+
893
+ def to_encoding(self, type_id=0, word_idx=None):
894
+ """
895
+ Return an Encoding generated from this PreTokenizedString
896
+
897
+ Args:
898
+ type_id: int = 0:
899
+ The type_id to be used on the generated Encoding.
900
+
901
+ word_idx: Optional[int] = None:
902
+ An optional word index to be used for each token of this Encoding. If provided,
903
+ all the word indices in the generated Encoding will use this value, instead
904
+ of the one automatically tracked during pre-tokenization.
905
+
906
+ Returns:
907
+ An Encoding
908
+ """
909
+ pass
910
+
911
+ def tokenize(self, func):
912
+ """
913
+ Tokenize each split of the `PreTokenizedString` using the given `func`
914
+
915
+ Args:
916
+ func: Callable[[str], List[Token]]:
917
+ The function used to tokenize each underlying split. This function must return
918
+ a list of Token generated from the input str.
919
+ """
920
+ pass
921
+
922
+ class Regex:
923
+ """
924
+ Instantiate a new Regex with the given pattern
925
+ """
926
+ def __init__(self, pattern):
927
+ pass
928
+
929
+ def __getstate__(self, /):
930
+ """
931
+ Helper for pickle.
932
+ """
933
+ pass
934
+
935
+ class Token:
936
+ def __init__(self, id, value, offsets):
937
+ pass
938
+
939
+ def __getstate__(self, /):
940
+ """
941
+ Helper for pickle.
942
+ """
943
+ pass
944
+
945
+ def as_tuple(self):
946
+ """ """
947
+ pass
948
+
949
+ @property
950
+ def id(self):
951
+ """ """
952
+ pass
953
+
954
+ @id.setter
955
+ def id(self, value):
956
+ """ """
957
+ pass
958
+
959
+ @property
960
+ def offsets(self):
961
+ """ """
962
+ pass
963
+
964
+ @offsets.setter
965
+ def offsets(self, value):
966
+ """ """
967
+ pass
968
+
969
+ @property
970
+ def value(self):
971
+ """ """
972
+ pass
973
+
974
+ @value.setter
975
+ def value(self, value):
976
+ """ """
977
+ pass
978
+
979
+ class Tokenizer:
980
+ """
981
+ A :obj:`Tokenizer` works as a pipeline. It processes some raw text as input
982
+ and outputs an :class:`~tokenizers.Encoding`.
983
+
984
+ Args:
985
+ model (:class:`~tokenizers.models.Model`):
986
+ The core algorithm that this :obj:`Tokenizer` should be using.
987
+
988
+ """
989
+ def __init__(self, model):
990
+ pass
991
+
992
+ def __getnewargs__(self):
993
+ """ """
994
+ pass
995
+
996
+ def __getstate__(self):
997
+ """ """
998
+ pass
999
+
1000
+ def __setstate__(self, state):
1001
+ """ """
1002
+ pass
1003
+
1004
+ def add_special_tokens(self, tokens):
1005
+ """
1006
+ Add the given special tokens to the Tokenizer.
1007
+
1008
+ If these tokens are already part of the vocabulary, it just let the Tokenizer know about
1009
+ them. If they don't exist, the Tokenizer creates them, giving them a new id.
1010
+
1011
+ These special tokens will never be processed by the model (ie won't be split into
1012
+ multiple tokens), and they can be removed from the output when decoding.
1013
+
1014
+ Args:
1015
+ tokens (A :obj:`List` of :class:`~tokenizers.AddedToken` or :obj:`str`):
1016
+ The list of special tokens we want to add to the vocabulary. Each token can either
1017
+ be a string or an instance of :class:`~tokenizers.AddedToken` for more
1018
+ customization.
1019
+
1020
+ Returns:
1021
+ :obj:`int`: The number of tokens that were created in the vocabulary
1022
+ """
1023
+ pass
1024
+
1025
+ def add_tokens(self, tokens):
1026
+ """
1027
+ Add the given tokens to the vocabulary
1028
+
1029
+ The given tokens are added only if they don't already exist in the vocabulary.
1030
+ Each token then gets a new attributed id.
1031
+
1032
+ Args:
1033
+ tokens (A :obj:`List` of :class:`~tokenizers.AddedToken` or :obj:`str`):
1034
+ The list of tokens we want to add to the vocabulary. Each token can be either a
1035
+ string or an instance of :class:`~tokenizers.AddedToken` for more customization.
1036
+
1037
+ Returns:
1038
+ :obj:`int`: The number of tokens that were created in the vocabulary
1039
+ """
1040
+ pass
1041
+
1042
+ def async_decode_batch(self, sequences, skip_special_tokens=True):
1043
+ """
1044
+ Decode a batch of ids back to their corresponding string
1045
+
1046
+ Args:
1047
+ sequences (:obj:`List` of :obj:`List[int]`):
1048
+ The batch of sequences we want to decode
1049
+
1050
+ skip_special_tokens (:obj:`bool`, defaults to :obj:`True`):
1051
+ Whether the special tokens should be removed from the decoded strings
1052
+
1053
+ Returns:
1054
+ :obj:`List[str]`: A list of decoded strings
1055
+ """
1056
+ pass
1057
+
1058
+ def async_encode(self, sequence, pair=None, is_pretokenized=False, add_special_tokens=True):
1059
+ """
1060
+ Asynchronously encode the given input with character offsets.
1061
+
1062
+ This is an async version of encode that can be awaited in async Python code.
1063
+
1064
+ Example:
1065
+ Here are some examples of the inputs that are accepted::
1066
+
1067
+ await async_encode("A single sequence")
1068
+
1069
+ Args:
1070
+ sequence (:obj:`~tokenizers.InputSequence`):
1071
+ The main input sequence we want to encode. This sequence can be either raw
1072
+ text or pre-tokenized, according to the ``is_pretokenized`` argument:
1073
+
1074
+ - If ``is_pretokenized=False``: :class:`~tokenizers.TextInputSequence`
1075
+ - If ``is_pretokenized=True``: :class:`~tokenizers.PreTokenizedInputSequence`
1076
+
1077
+ pair (:obj:`~tokenizers.InputSequence`, `optional`):
1078
+ An optional input sequence. The expected format is the same that for ``sequence``.
1079
+
1080
+ is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
1081
+ Whether the input is already pre-tokenized
1082
+
1083
+ add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
1084
+ Whether to add the special tokens
1085
+
1086
+ Returns:
1087
+ :class:`~tokenizers.Encoding`: The encoded result
1088
+
1089
+ """
1090
+ pass
1091
+
1092
+ def async_encode_batch(self, input, is_pretokenized=False, add_special_tokens=True):
1093
+ """
1094
+ Asynchronously encode the given batch of inputs with character offsets.
1095
+
1096
+ This is an async version of encode_batch that can be awaited in async Python code.
1097
+
1098
+ Example:
1099
+ Here are some examples of the inputs that are accepted::
1100
+
1101
+ await async_encode_batch([
1102
+ "A single sequence",
1103
+ ("A tuple with a sequence", "And its pair"),
1104
+ [ "A", "pre", "tokenized", "sequence" ],
1105
+ ([ "A", "pre", "tokenized", "sequence" ], "And its pair")
1106
+ ])
1107
+
1108
+ Args:
1109
+ input (A :obj:`List`/:obj:`Tuple` of :obj:`~tokenizers.EncodeInput`):
1110
+ A list of single sequences or pair sequences to encode. Each sequence
1111
+ can be either raw text or pre-tokenized, according to the ``is_pretokenized``
1112
+ argument:
1113
+
1114
+ - If ``is_pretokenized=False``: :class:`~tokenizers.TextEncodeInput`
1115
+ - If ``is_pretokenized=True``: :class:`~tokenizers.PreTokenizedEncodeInput`
1116
+
1117
+ is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
1118
+ Whether the input is already pre-tokenized
1119
+
1120
+ add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
1121
+ Whether to add the special tokens
1122
+
1123
+ Returns:
1124
+ A :obj:`List` of :class:`~tokenizers.Encoding`: The encoded batch
1125
+
1126
+ """
1127
+ pass
1128
+
1129
+ def async_encode_batch_fast(self, input, is_pretokenized=False, add_special_tokens=True):
1130
+ """
1131
+ Asynchronously encode the given batch of inputs without tracking character offsets.
1132
+
1133
+ This is an async version of encode_batch_fast that can be awaited in async Python code.
1134
+
1135
+ Example:
1136
+ Here are some examples of the inputs that are accepted::
1137
+
1138
+ await async_encode_batch_fast([
1139
+ "A single sequence",
1140
+ ("A tuple with a sequence", "And its pair"),
1141
+ [ "A", "pre", "tokenized", "sequence" ],
1142
+ ([ "A", "pre", "tokenized", "sequence" ], "And its pair")
1143
+ ])
1144
+
1145
+ Args:
1146
+ input (A :obj:`List`/:obj:`Tuple` of :obj:`~tokenizers.EncodeInput`):
1147
+ A list of single sequences or pair sequences to encode. Each sequence
1148
+ can be either raw text or pre-tokenized, according to the ``is_pretokenized``
1149
+ argument:
1150
+
1151
+ - If ``is_pretokenized=False``: :class:`~tokenizers.TextEncodeInput`
1152
+ - If ``is_pretokenized=True``: :class:`~tokenizers.PreTokenizedEncodeInput`
1153
+
1154
+ is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
1155
+ Whether the input is already pre-tokenized
1156
+
1157
+ add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
1158
+ Whether to add the special tokens
1159
+
1160
+ Returns:
1161
+ A :obj:`List` of :class:`~tokenizers.Encoding`: The encoded batch
1162
+
1163
+ """
1164
+ pass
1165
+
1166
+ def decode(self, ids, skip_special_tokens=True):
1167
+ """
1168
+ Decode the given list of ids back to a string
1169
+
1170
+ This is used to decode anything coming back from a Language Model
1171
+
1172
+ Args:
1173
+ ids (A :obj:`List/Tuple` of :obj:`int`):
1174
+ The list of ids that we want to decode
1175
+
1176
+ skip_special_tokens (:obj:`bool`, defaults to :obj:`True`):
1177
+ Whether the special tokens should be removed from the decoded string
1178
+
1179
+ Returns:
1180
+ :obj:`str`: The decoded string
1181
+ """
1182
+ pass
1183
+
1184
+ def decode_batch(self, sequences, skip_special_tokens=True):
1185
+ """
1186
+ Decode a batch of ids back to their corresponding string
1187
+
1188
+ Args:
1189
+ sequences (:obj:`List` of :obj:`List[int]`):
1190
+ The batch of sequences we want to decode
1191
+
1192
+ skip_special_tokens (:obj:`bool`, defaults to :obj:`True`):
1193
+ Whether the special tokens should be removed from the decoded strings
1194
+
1195
+ Returns:
1196
+ :obj:`List[str]`: A list of decoded strings
1197
+ """
1198
+ pass
1199
+
1200
+ @property
1201
+ def decoder(self):
1202
+ """
1203
+ The `optional` :class:`~tokenizers.decoders.Decoder` in use by the Tokenizer
1204
+ """
1205
+ pass
1206
+
1207
+ @decoder.setter
1208
+ def decoder(self, value):
1209
+ """
1210
+ The `optional` :class:`~tokenizers.decoders.Decoder` in use by the Tokenizer
1211
+ """
1212
+ pass
1213
+
1214
+ def enable_padding(
1215
+ self, direction="right", pad_id=0, pad_type_id=0, pad_token="[PAD]", length=None, pad_to_multiple_of=None
1216
+ ):
1217
+ """
1218
+ Enable the padding
1219
+
1220
+ Args:
1221
+ direction (:obj:`str`, `optional`, defaults to :obj:`right`):
1222
+ The direction in which to pad. Can be either ``right`` or ``left``
1223
+
1224
+ pad_to_multiple_of (:obj:`int`, `optional`):
1225
+ If specified, the padding length should always snap to the next multiple of the
1226
+ given value. For example if we were going to pad witha length of 250 but
1227
+ ``pad_to_multiple_of=8`` then we will pad to 256.
1228
+
1229
+ pad_id (:obj:`int`, defaults to 0):
1230
+ The id to be used when padding
1231
+
1232
+ pad_type_id (:obj:`int`, defaults to 0):
1233
+ The type id to be used when padding
1234
+
1235
+ pad_token (:obj:`str`, defaults to :obj:`[PAD]`):
1236
+ The pad token to be used when padding
1237
+
1238
+ length (:obj:`int`, `optional`):
1239
+ If specified, the length at which to pad. If not specified we pad using the size of
1240
+ the longest sequence in a batch.
1241
+ """
1242
+ pass
1243
+
1244
+ def enable_truncation(self, max_length, stride=0, strategy="longest_first", direction="right"):
1245
+ """
1246
+ Enable truncation
1247
+
1248
+ Args:
1249
+ max_length (:obj:`int`):
1250
+ The max length at which to truncate
1251
+
1252
+ stride (:obj:`int`, `optional`):
1253
+ The length of the previous first sequence to be included in the overflowing
1254
+ sequence
1255
+
1256
+ strategy (:obj:`str`, `optional`, defaults to :obj:`longest_first`):
1257
+ The strategy used to truncation. Can be one of ``longest_first``, ``only_first`` or
1258
+ ``only_second``.
1259
+
1260
+ direction (:obj:`str`, defaults to :obj:`right`):
1261
+ Truncate direction
1262
+ """
1263
+ pass
1264
+
1265
+ def encode(self, sequence, pair=None, is_pretokenized=False, add_special_tokens=True):
1266
+ """
1267
+ Encode the given sequence and pair. This method can process raw text sequences
1268
+ as well as already pre-tokenized sequences.
1269
+
1270
+ Example:
1271
+ Here are some examples of the inputs that are accepted::
1272
+
1273
+ encode("A single sequence")`
1274
+ encode("A sequence", "And its pair")`
1275
+ encode([ "A", "pre", "tokenized", "sequence" ], is_pretokenized=True)`
1276
+ encode(
1277
+ [ "A", "pre", "tokenized", "sequence" ], [ "And", "its", "pair" ],
1278
+ is_pretokenized=True
1279
+ )
1280
+
1281
+ Args:
1282
+ sequence (:obj:`~tokenizers.InputSequence`):
1283
+ The main input sequence we want to encode. This sequence can be either raw
1284
+ text or pre-tokenized, according to the ``is_pretokenized`` argument:
1285
+
1286
+ - If ``is_pretokenized=False``: :class:`~tokenizers.TextInputSequence`
1287
+ - If ``is_pretokenized=True``: :class:`~tokenizers.PreTokenizedInputSequence`
1288
+
1289
+ pair (:obj:`~tokenizers.InputSequence`, `optional`):
1290
+ An optional input sequence. The expected format is the same that for ``sequence``.
1291
+
1292
+ is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
1293
+ Whether the input is already pre-tokenized
1294
+
1295
+ add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
1296
+ Whether to add the special tokens
1297
+
1298
+ Returns:
1299
+ :class:`~tokenizers.Encoding`: The encoded result
1300
+
1301
+ """
1302
+ pass
1303
+
1304
+ def encode_batch(self, input, is_pretokenized=False, add_special_tokens=True):
1305
+ """
1306
+ Encode the given batch of inputs. This method accept both raw text sequences
1307
+ as well as already pre-tokenized sequences. The reason we use `PySequence` is
1308
+ because it allows type checking with zero-cost (according to PyO3) as we don't
1309
+ have to convert to check.
1310
+
1311
+ Example:
1312
+ Here are some examples of the inputs that are accepted::
1313
+
1314
+ encode_batch([
1315
+ "A single sequence",
1316
+ ("A tuple with a sequence", "And its pair"),
1317
+ [ "A", "pre", "tokenized", "sequence" ],
1318
+ ([ "A", "pre", "tokenized", "sequence" ], "And its pair")
1319
+ ])
1320
+
1321
+ Args:
1322
+ input (A :obj:`List`/:obj:`Tuple` of :obj:`~tokenizers.EncodeInput`):
1323
+ A list of single sequences or pair sequences to encode. Each sequence
1324
+ can be either raw text or pre-tokenized, according to the ``is_pretokenized``
1325
+ argument:
1326
+
1327
+ - If ``is_pretokenized=False``: :class:`~tokenizers.TextEncodeInput`
1328
+ - If ``is_pretokenized=True``: :class:`~tokenizers.PreTokenizedEncodeInput`
1329
+
1330
+ is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
1331
+ Whether the input is already pre-tokenized
1332
+
1333
+ add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
1334
+ Whether to add the special tokens
1335
+
1336
+ Returns:
1337
+ A :obj:`List` of :class:`~tokenizers.Encoding`: The encoded batch
1338
+
1339
+ """
1340
+ pass
1341
+
1342
+ def encode_batch_fast(self, input, is_pretokenized=False, add_special_tokens=True):
1343
+ """
1344
+ Encode the given batch of inputs. This method is faster than `encode_batch`
1345
+ because it doesn't keep track of offsets, they will be all zeros.
1346
+
1347
+ Example:
1348
+ Here are some examples of the inputs that are accepted::
1349
+
1350
+ encode_batch_fast([
1351
+ "A single sequence",
1352
+ ("A tuple with a sequence", "And its pair"),
1353
+ [ "A", "pre", "tokenized", "sequence" ],
1354
+ ([ "A", "pre", "tokenized", "sequence" ], "And its pair")
1355
+ ])
1356
+
1357
+ Args:
1358
+ input (A :obj:`List`/:obj:`Tuple` of :obj:`~tokenizers.EncodeInput`):
1359
+ A list of single sequences or pair sequences to encode. Each sequence
1360
+ can be either raw text or pre-tokenized, according to the ``is_pretokenized``
1361
+ argument:
1362
+
1363
+ - If ``is_pretokenized=False``: :class:`~tokenizers.TextEncodeInput`
1364
+ - If ``is_pretokenized=True``: :class:`~tokenizers.PreTokenizedEncodeInput`
1365
+
1366
+ is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
1367
+ Whether the input is already pre-tokenized
1368
+
1369
+ add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
1370
+ Whether to add the special tokens
1371
+
1372
+ Returns:
1373
+ A :obj:`List` of :class:`~tokenizers.Encoding`: The encoded batch
1374
+
1375
+ """
1376
+ pass
1377
+
1378
+ @property
1379
+ def encode_special_tokens(self):
1380
+ """
1381
+ Modifies the tokenizer in order to use or not the special tokens
1382
+ during encoding.
1383
+
1384
+ Args:
1385
+ value (:obj:`bool`):
1386
+ Whether to use the special tokens or not
1387
+
1388
+ """
1389
+ pass
1390
+
1391
+ @encode_special_tokens.setter
1392
+ def encode_special_tokens(self, value):
1393
+ """
1394
+ Modifies the tokenizer in order to use or not the special tokens
1395
+ during encoding.
1396
+
1397
+ Args:
1398
+ value (:obj:`bool`):
1399
+ Whether to use the special tokens or not
1400
+
1401
+ """
1402
+ pass
1403
+
1404
+ @staticmethod
1405
+ def from_buffer(buffer):
1406
+ """
1407
+ Instantiate a new :class:`~tokenizers.Tokenizer` from the given buffer.
1408
+
1409
+ Args:
1410
+ buffer (:obj:`bytes`):
1411
+ A buffer containing a previously serialized :class:`~tokenizers.Tokenizer`
1412
+
1413
+ Returns:
1414
+ :class:`~tokenizers.Tokenizer`: The new tokenizer
1415
+ """
1416
+ pass
1417
+
1418
+ @staticmethod
1419
+ def from_file(path):
1420
+ """
1421
+ Instantiate a new :class:`~tokenizers.Tokenizer` from the file at the given path.
1422
+
1423
+ Args:
1424
+ path (:obj:`str`):
1425
+ A path to a local JSON file representing a previously serialized
1426
+ :class:`~tokenizers.Tokenizer`
1427
+
1428
+ Returns:
1429
+ :class:`~tokenizers.Tokenizer`: The new tokenizer
1430
+ """
1431
+ pass
1432
+
1433
+ @staticmethod
1434
+ def from_pretrained(identifier, revision="main", token=None):
1435
+ """
1436
+ Instantiate a new :class:`~tokenizers.Tokenizer` from an existing file on the
1437
+ Hugging Face Hub.
1438
+
1439
+ Args:
1440
+ identifier (:obj:`str`):
1441
+ The identifier of a Model on the Hugging Face Hub, that contains
1442
+ a tokenizer.json file
1443
+ revision (:obj:`str`, defaults to `main`):
1444
+ A branch or commit id
1445
+ token (:obj:`str`, `optional`, defaults to `None`):
1446
+ An optional auth token used to access private repositories on the
1447
+ Hugging Face Hub
1448
+
1449
+ Returns:
1450
+ :class:`~tokenizers.Tokenizer`: The new tokenizer
1451
+ """
1452
+ pass
1453
+
1454
+ @staticmethod
1455
+ def from_str(json):
1456
+ """
1457
+ Instantiate a new :class:`~tokenizers.Tokenizer` from the given JSON string.
1458
+
1459
+ Args:
1460
+ json (:obj:`str`):
1461
+ A valid JSON string representing a previously serialized
1462
+ :class:`~tokenizers.Tokenizer`
1463
+
1464
+ Returns:
1465
+ :class:`~tokenizers.Tokenizer`: The new tokenizer
1466
+ """
1467
+ pass
1468
+
1469
+ def get_added_tokens_decoder(self):
1470
+ """
1471
+ Get the underlying vocabulary
1472
+
1473
+ Returns:
1474
+ :obj:`Dict[int, AddedToken]`: The vocabulary
1475
+ """
1476
+ pass
1477
+
1478
+ def get_vocab(self, with_added_tokens=True):
1479
+ """
1480
+ Get the underlying vocabulary
1481
+
1482
+ Args:
1483
+ with_added_tokens (:obj:`bool`, defaults to :obj:`True`):
1484
+ Whether to include the added tokens
1485
+
1486
+ Returns:
1487
+ :obj:`Dict[str, int]`: The vocabulary
1488
+ """
1489
+ pass
1490
+
1491
+ def get_vocab_size(self, with_added_tokens=True):
1492
+ """
1493
+ Get the size of the underlying vocabulary
1494
+
1495
+ Args:
1496
+ with_added_tokens (:obj:`bool`, defaults to :obj:`True`):
1497
+ Whether to include the added tokens
1498
+
1499
+ Returns:
1500
+ :obj:`int`: The size of the vocabulary
1501
+ """
1502
+ pass
1503
+
1504
+ def id_to_token(self, id):
1505
+ """
1506
+ Convert the given id to its corresponding token if it exists
1507
+
1508
+ Args:
1509
+ id (:obj:`int`):
1510
+ The id to convert
1511
+
1512
+ Returns:
1513
+ :obj:`Optional[str]`: An optional token, :obj:`None` if out of vocabulary
1514
+ """
1515
+ pass
1516
+
1517
+ @property
1518
+ def model(self):
1519
+ """
1520
+ The :class:`~tokenizers.models.Model` in use by the Tokenizer
1521
+ """
1522
+ pass
1523
+
1524
+ @model.setter
1525
+ def model(self, value):
1526
+ """
1527
+ The :class:`~tokenizers.models.Model` in use by the Tokenizer
1528
+ """
1529
+ pass
1530
+
1531
+ def no_padding(self):
1532
+ """
1533
+ Disable padding
1534
+ """
1535
+ pass
1536
+
1537
+ def no_truncation(self):
1538
+ """
1539
+ Disable truncation
1540
+ """
1541
+ pass
1542
+
1543
+ @property
1544
+ def normalizer(self):
1545
+ """
1546
+ The `optional` :class:`~tokenizers.normalizers.Normalizer` in use by the Tokenizer
1547
+ """
1548
+ pass
1549
+
1550
+ @normalizer.setter
1551
+ def normalizer(self, value):
1552
+ """
1553
+ The `optional` :class:`~tokenizers.normalizers.Normalizer` in use by the Tokenizer
1554
+ """
1555
+ pass
1556
+
1557
+ def num_special_tokens_to_add(self, is_pair):
1558
+ """
1559
+ Return the number of special tokens that would be added for single/pair sentences.
1560
+ :param is_pair: Boolean indicating if the input would be a single sentence or a pair
1561
+ :return:
1562
+ """
1563
+ pass
1564
+
1565
+ @property
1566
+ def padding(self):
1567
+ """
1568
+ Get the current padding parameters
1569
+
1570
+ `Cannot be set, use` :meth:`~tokenizers.Tokenizer.enable_padding` `instead`
1571
+
1572
+ Returns:
1573
+ (:obj:`dict`, `optional`):
1574
+ A dict with the current padding parameters if padding is enabled
1575
+ """
1576
+ pass
1577
+
1578
+ @padding.setter
1579
+ def padding(self, value):
1580
+ """
1581
+ Get the current padding parameters
1582
+
1583
+ `Cannot be set, use` :meth:`~tokenizers.Tokenizer.enable_padding` `instead`
1584
+
1585
+ Returns:
1586
+ (:obj:`dict`, `optional`):
1587
+ A dict with the current padding parameters if padding is enabled
1588
+ """
1589
+ pass
1590
+
1591
+ def post_process(self, encoding, pair=None, add_special_tokens=True):
1592
+ """
1593
+ Apply all the post-processing steps to the given encodings.
1594
+
1595
+ The various steps are:
1596
+
1597
+ 1. Truncate according to the set truncation params (provided with
1598
+ :meth:`~tokenizers.Tokenizer.enable_truncation`)
1599
+ 2. Apply the :class:`~tokenizers.processors.PostProcessor`
1600
+ 3. Pad according to the set padding params (provided with
1601
+ :meth:`~tokenizers.Tokenizer.enable_padding`)
1602
+
1603
+ Args:
1604
+ encoding (:class:`~tokenizers.Encoding`):
1605
+ The :class:`~tokenizers.Encoding` corresponding to the main sequence.
1606
+
1607
+ pair (:class:`~tokenizers.Encoding`, `optional`):
1608
+ An optional :class:`~tokenizers.Encoding` corresponding to the pair sequence.
1609
+
1610
+ add_special_tokens (:obj:`bool`):
1611
+ Whether to add the special tokens
1612
+
1613
+ Returns:
1614
+ :class:`~tokenizers.Encoding`: The final post-processed encoding
1615
+ """
1616
+ pass
1617
+
1618
+ @property
1619
+ def post_processor(self):
1620
+ """
1621
+ The `optional` :class:`~tokenizers.processors.PostProcessor` in use by the Tokenizer
1622
+ """
1623
+ pass
1624
+
1625
+ @post_processor.setter
1626
+ def post_processor(self, value):
1627
+ """
1628
+ The `optional` :class:`~tokenizers.processors.PostProcessor` in use by the Tokenizer
1629
+ """
1630
+ pass
1631
+
1632
+ @property
1633
+ def pre_tokenizer(self):
1634
+ """
1635
+ The `optional` :class:`~tokenizers.pre_tokenizers.PreTokenizer` in use by the Tokenizer
1636
+ """
1637
+ pass
1638
+
1639
+ @pre_tokenizer.setter
1640
+ def pre_tokenizer(self, value):
1641
+ """
1642
+ The `optional` :class:`~tokenizers.pre_tokenizers.PreTokenizer` in use by the Tokenizer
1643
+ """
1644
+ pass
1645
+
1646
+ def save(self, path, pretty=True):
1647
+ """
1648
+ Save the :class:`~tokenizers.Tokenizer` to the file at the given path.
1649
+
1650
+ Args:
1651
+ path (:obj:`str`):
1652
+ A path to a file in which to save the serialized tokenizer.
1653
+
1654
+ pretty (:obj:`bool`, defaults to :obj:`True`):
1655
+ Whether the JSON file should be pretty formatted.
1656
+ """
1657
+ pass
1658
+
1659
+ def to_str(self, pretty=False):
1660
+ """
1661
+ Gets a serialized string representing this :class:`~tokenizers.Tokenizer`.
1662
+
1663
+ Args:
1664
+ pretty (:obj:`bool`, defaults to :obj:`False`):
1665
+ Whether the JSON string should be pretty formatted.
1666
+
1667
+ Returns:
1668
+ :obj:`str`: A string representing the serialized Tokenizer
1669
+ """
1670
+ pass
1671
+
1672
+ def token_to_id(self, token):
1673
+ """
1674
+ Convert the given token to its corresponding id if it exists
1675
+
1676
+ Args:
1677
+ token (:obj:`str`):
1678
+ The token to convert
1679
+
1680
+ Returns:
1681
+ :obj:`Optional[int]`: An optional id, :obj:`None` if out of vocabulary
1682
+ """
1683
+ pass
1684
+
1685
+ def train(self, files, trainer=None):
1686
+ """
1687
+ Train the Tokenizer using the given files.
1688
+
1689
+ Reads the files line by line, while keeping all the whitespace, even new lines.
1690
+ If you want to train from data store in-memory, you can check
1691
+ :meth:`~tokenizers.Tokenizer.train_from_iterator`
1692
+
1693
+ Args:
1694
+ files (:obj:`List[str]`):
1695
+ A list of path to the files that we should use for training
1696
+
1697
+ trainer (:obj:`~tokenizers.trainers.Trainer`, `optional`):
1698
+ An optional trainer that should be used to train our Model
1699
+ """
1700
+ pass
1701
+
1702
+ def train_from_iterator(self, iterator, trainer=None, length=None):
1703
+ """
1704
+ Train the Tokenizer using the provided iterator.
1705
+
1706
+ You can provide anything that is a Python Iterator
1707
+
1708
+ * A list of sequences :obj:`List[str]`
1709
+ * A generator that yields :obj:`str` or :obj:`List[str]`
1710
+ * A Numpy array of strings
1711
+ * ...
1712
+
1713
+ Args:
1714
+ iterator (:obj:`Iterator`):
1715
+ Any iterator over strings or list of strings
1716
+
1717
+ trainer (:obj:`~tokenizers.trainers.Trainer`, `optional`):
1718
+ An optional trainer that should be used to train our Model
1719
+
1720
+ length (:obj:`int`, `optional`):
1721
+ The total number of sequences in the iterator. This is used to
1722
+ provide meaningful progress tracking
1723
+ """
1724
+ pass
1725
+
1726
+ @property
1727
+ def truncation(self):
1728
+ """
1729
+ Get the currently set truncation parameters
1730
+
1731
+ `Cannot set, use` :meth:`~tokenizers.Tokenizer.enable_truncation` `instead`
1732
+
1733
+ Returns:
1734
+ (:obj:`dict`, `optional`):
1735
+ A dict with the current truncation parameters if truncation is enabled
1736
+ """
1737
+ pass
1738
+
1739
+ @truncation.setter
1740
+ def truncation(self, value):
1741
+ """
1742
+ Get the currently set truncation parameters
1743
+
1744
+ `Cannot set, use` :meth:`~tokenizers.Tokenizer.enable_truncation` `instead`
1745
+
1746
+ Returns:
1747
+ (:obj:`dict`, `optional`):
1748
+ A dict with the current truncation parameters if truncation is enabled
1749
+ """
1750
+ pass
1751
+
1752
+ from enum import Enum
1753
+ from typing import List, Tuple, Union, Any
1754
+
1755
+ Offsets = Tuple[int, int]
1756
+ TextInputSequence = str
1757
+ PreTokenizedInputSequence = Union[List[str], Tuple[str, ...]]
1758
+ TextEncodeInput = Union[
1759
+ TextInputSequence,
1760
+ Tuple[TextInputSequence, TextInputSequence],
1761
+ List[TextInputSequence],
1762
+ ]
1763
+ PreTokenizedEncodeInput = Union[
1764
+ PreTokenizedInputSequence,
1765
+ Tuple[PreTokenizedInputSequence, PreTokenizedInputSequence],
1766
+ List[PreTokenizedInputSequence],
1767
+ ]
1768
+ InputSequence = Union[TextInputSequence, PreTokenizedInputSequence]
1769
+ EncodeInput = Union[TextEncodeInput, PreTokenizedEncodeInput]
1770
+
1771
+ class OffsetReferential(Enum):
1772
+ ORIGINAL = "original"
1773
+ NORMALIZED = "normalized"
1774
+
1775
+ class OffsetType(Enum):
1776
+ BYTE = "byte"
1777
+ CHAR = "char"
1778
+
1779
+ class SplitDelimiterBehavior(Enum):
1780
+ REMOVED = "removed"
1781
+ ISOLATED = "isolated"
1782
+ MERGED_WITH_PREVIOUS = "merged_with_previous"
1783
+ MERGED_WITH_NEXT = "merged_with_next"
1784
+ CONTIGUOUS = "contiguous"
1785
+
1786
+ from .implementations import (
1787
+ BertWordPieceTokenizer,
1788
+ ByteLevelBPETokenizer,
1789
+ CharBPETokenizer,
1790
+ SentencePieceBPETokenizer,
1791
+ SentencePieceUnigramTokenizer,
1792
+ )
1793
+
1794
+ def __getattr__(name: str) -> Any: ...
1795
+
1796
+ BertWordPieceTokenizer: Any
1797
+ ByteLevelBPETokenizer: Any
1798
+ CharBPETokenizer: Any
1799
+ SentencePieceBPETokenizer: Any
1800
+ SentencePieceUnigramTokenizer: Any
source/tokenizers/decoders/__init__.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Re-export the decoder classes implemented in the native extension under the
# `tokenizers.decoders` namespace, so users can write
# `from tokenizers.decoders import ByteLevel` etc.
from .. import decoders


Decoder = decoders.Decoder
ByteLevel = decoders.ByteLevel
Replace = decoders.Replace
WordPiece = decoders.WordPiece
ByteFallback = decoders.ByteFallback
Fuse = decoders.Fuse
Strip = decoders.Strip
Metaspace = decoders.Metaspace
BPEDecoder = decoders.BPEDecoder
CTC = decoders.CTC
Sequence = decoders.Sequence
DecodeStream = decoders.DecodeStream
source/tokenizers/decoders/__init__.pyi ADDED
@@ -0,0 +1,569 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Generated content DO NOT EDIT
2
class DecodeStream:
    """
    Class needed for streaming decode

    """
    def __init__(self, ids=None, skip_special_tokens=False):
        # `ids` presumably primes the stream with already-seen token ids —
        # TODO confirm against the Rust binding.
        pass

    def __getstate__(self, /):
        """
        Helper for pickle.
        """
        pass

    def step(self, tokenizer, id):
        """
        Streaming decode step

        Args:
            tokenizer (:class:`~tokenizers.Tokenizer`):
                The tokenizer to use for decoding
            id (:obj:`int` or `List[int]`):
                The next token id or list of token ids to add to the stream


        Returns:
            :obj:`Optional[str]`: The next decoded string chunk, or None if not enough
            tokens have been provided yet.
        """
        pass
32
+
33
class Decoder:
    """
    Base class for all decoders

    This class is not supposed to be instantiated directly. Instead, any implementation of
    a Decoder will return an instance of this class when instantiated.
    """
    def __getstate__(self):
        """Support for pickling."""
        pass

    def __setstate__(self, state):
        """Support for unpickling."""
        pass

    @staticmethod
    def custom(decoder):
        """Create a Decoder backed by a custom Python object."""
        pass

    def decode(self, tokens):
        """
        Decode the given list of tokens to a final string

        Args:
            tokens (:obj:`List[str]`):
                The list of tokens to decode

        Returns:
            :obj:`str`: The decoded string
        """
        pass
65
+
66
class BPEDecoder(Decoder):
    """
    BPEDecoder Decoder

    Args:
        suffix (:obj:`str`, `optional`, defaults to :obj:`</w>`):
            The suffix that was used to characterize an end-of-word. This suffix will
            be replaced by whitespaces during the decoding
    """
    def __init__(self, suffix="</w>"):
        pass

    def __getstate__(self):
        """Support for pickling."""
        pass

    def __setstate__(self, state):
        """Support for unpickling."""
        pass

    @staticmethod
    def custom(decoder):
        """Create a Decoder backed by a custom Python object."""
        pass

    def decode(self, tokens):
        """
        Decode the given list of tokens to a final string

        Args:
            tokens (:obj:`List[str]`):
                The list of tokens to decode

        Returns:
            :obj:`str`: The decoded string
        """
        pass

    @property
    def suffix(self):
        """The end-of-word suffix replaced by whitespace during decoding."""
        pass

    @suffix.setter
    def suffix(self, value):
        """Set the end-of-word suffix."""
        pass
113
+
114
class ByteFallback(Decoder):
    """
    ByteFallback Decoder
    ByteFallback is a simple trick which converts tokens looking like `<0x61>`
    to pure bytes, and attempts to make them into a string. If the tokens
    cannot be decoded you will get � instead for each inconvertible byte token

    """
    def __init__(self):
        pass

    def __getstate__(self):
        """Support for pickling."""
        pass

    def __setstate__(self, state):
        """Support for unpickling."""
        pass

    @staticmethod
    def custom(decoder):
        """Create a Decoder backed by a custom Python object."""
        pass

    def decode(self, tokens):
        """
        Decode the given list of tokens to a final string

        Args:
            tokens (:obj:`List[str]`):
                The list of tokens to decode

        Returns:
            :obj:`str`: The decoded string
        """
        pass
150
+
151
class ByteLevel(Decoder):
    """
    ByteLevel Decoder

    This decoder is to be used in tandem with the :class:`~tokenizers.pre_tokenizers.ByteLevel`
    :class:`~tokenizers.pre_tokenizers.PreTokenizer`.
    """
    def __init__(self):
        pass

    def __getstate__(self):
        """Support for pickling."""
        pass

    def __setstate__(self, state):
        """Support for unpickling."""
        pass

    @staticmethod
    def custom(decoder):
        """Create a Decoder backed by a custom Python object."""
        pass

    def decode(self, tokens):
        """
        Decode the given list of tokens to a final string

        Args:
            tokens (:obj:`List[str]`):
                The list of tokens to decode

        Returns:
            :obj:`str`: The decoded string
        """
        pass
186
+
187
class CTC(Decoder):
    """
    CTC Decoder

    Args:
        pad_token (:obj:`str`, `optional`, defaults to :obj:`<pad>`):
            The pad token used by CTC to delimit a new token.
        word_delimiter_token (:obj:`str`, `optional`, defaults to :obj:`|`):
            The word delimiter token. It will be replaced by a <space>
        cleanup (:obj:`bool`, `optional`, defaults to :obj:`True`):
            Whether to cleanup some tokenization artifacts.
            Mainly spaces before punctuation, and some abbreviated english forms.
    """
    def __init__(self, pad_token="<pad>", word_delimiter_token="|", cleanup=True):
        pass

    def __getstate__(self):
        """Support for pickling."""
        pass

    def __setstate__(self, state):
        """Support for unpickling."""
        pass

    @property
    def cleanup(self):
        """Whether tokenization artifacts are cleaned up."""
        pass

    @cleanup.setter
    def cleanup(self, value):
        """Set whether tokenization artifacts are cleaned up."""
        pass

    @staticmethod
    def custom(decoder):
        """Create a Decoder backed by a custom Python object."""
        pass

    def decode(self, tokens):
        """
        Decode the given list of tokens to a final string

        Args:
            tokens (:obj:`List[str]`):
                The list of tokens to decode

        Returns:
            :obj:`str`: The decoded string
        """
        pass

    @property
    def pad_token(self):
        """The pad token used by CTC to delimit a new token."""
        pass

    @pad_token.setter
    def pad_token(self, value):
        """Set the pad token."""
        pass

    @property
    def word_delimiter_token(self):
        """The word delimiter token (replaced by a space when decoding)."""
        pass

    @word_delimiter_token.setter
    def word_delimiter_token(self, value):
        """Set the word delimiter token."""
        pass
258
+
259
class Fuse(Decoder):
    """
    Fuse Decoder
    Fuse simply fuses every token into a single string.
    This is the last step of decoding, this decoder exists only if
    there is need to add other decoders *after* the fusion
    """
    def __init__(self):
        pass

    def __getstate__(self):
        """Support for pickling."""
        pass

    def __setstate__(self, state):
        """Support for unpickling."""
        pass

    @staticmethod
    def custom(decoder):
        """Create a Decoder backed by a custom Python object."""
        pass

    def decode(self, tokens):
        """
        Decode the given list of tokens to a final string

        Args:
            tokens (:obj:`List[str]`):
                The list of tokens to decode

        Returns:
            :obj:`str`: The decoded string
        """
        pass
294
+
295
class Metaspace(Decoder):
    """
    Metaspace Decoder

    Args:
        replacement (:obj:`str`, `optional`, defaults to :obj:`▁`):
            The replacement character. Must be exactly one character. By default we
            use the `▁` (U+2581) meta symbol (Same as in SentencePiece).

        prepend_scheme (:obj:`str`, `optional`, defaults to :obj:`"always"`):
            Whether to add a space to the first word if there isn't already one. This
            lets us treat `hello` exactly like `say hello`.
            Choices: "always", "never", "first". First means the space is only added on the first
            token (relevant when special tokens are used or other pre_tokenizer are used).
    """
    def __init__(self, replacement="▁", prepend_scheme="always", split=True):
        pass

    def __getstate__(self):
        """Support for pickling."""
        pass

    def __setstate__(self, state):
        """Support for unpickling."""
        pass

    @staticmethod
    def custom(decoder):
        """Create a Decoder backed by a custom Python object."""
        pass

    def decode(self, tokens):
        """
        Decode the given list of tokens to a final string

        Args:
            tokens (:obj:`List[str]`):
                The list of tokens to decode

        Returns:
            :obj:`str`: The decoded string
        """
        pass

    @property
    def prepend_scheme(self):
        """When the space is prepended: "always", "never" or "first"."""
        pass

    @prepend_scheme.setter
    def prepend_scheme(self, value):
        """Set the prepend scheme."""
        pass

    @property
    def replacement(self):
        """The replacement character (defaults to `▁`, U+2581)."""
        pass

    @replacement.setter
    def replacement(self, value):
        """Set the replacement character."""
        pass

    @property
    def split(self):
        # NOTE(review): not described in the class docstring — presumably
        # whether decoding splits on the replacement character; confirm upstream.
        """ """
        pass

    @split.setter
    def split(self, value):
        """ """
        pass
368
+
369
class Replace(Decoder):
    """
    Replace Decoder

    This decoder is to be used in tandem with the :class:`~tokenizers.pre_tokenizers.Replace`
    :class:`~tokenizers.pre_tokenizers.PreTokenizer`.
    """
    def __init__(self, pattern, content):
        pass

    def __getstate__(self):
        """Support for pickling."""
        pass

    def __setstate__(self, state):
        """Support for unpickling."""
        pass

    @staticmethod
    def custom(decoder):
        """Create a Decoder backed by a custom Python object."""
        pass

    def decode(self, tokens):
        """
        Decode the given list of tokens to a final string

        Args:
            tokens (:obj:`List[str]`):
                The list of tokens to decode

        Returns:
            :obj:`str`: The decoded string
        """
        pass
404
+
405
class Sequence(Decoder):
    """
    Sequence Decoder

    Args:
        decoders (:obj:`List[Decoder]`)
            The decoders that need to be chained
    """
    def __init__(self, decoders):
        pass

    def __getnewargs__(self):
        """Support for pickling (arguments passed to ``__new__``)."""
        pass

    def __getstate__(self):
        """Support for pickling."""
        pass

    def __setstate__(self, state):
        """Support for unpickling."""
        pass

    @staticmethod
    def custom(decoder):
        """Create a Decoder backed by a custom Python object."""
        pass

    def decode(self, tokens):
        """
        Decode the given list of tokens to a final string

        Args:
            tokens (:obj:`List[str]`):
                The list of tokens to decode

        Returns:
            :obj:`str`: The decoded string
        """
        pass
445
+
446
class Strip(Decoder):
    """
    Strip normalizer
    Strips n left characters of each token, or n right characters of each token
    """
    def __init__(self, content=" ", left=0, right=0):
        pass

    def __getstate__(self):
        """Support for pickling."""
        pass

    def __setstate__(self, state):
        """Support for unpickling."""
        pass

    @property
    def content(self):
        """The character stripped from each token's edges."""
        pass

    @content.setter
    def content(self, value):
        """Set the character to strip."""
        pass

    @staticmethod
    def custom(decoder):
        """Create a Decoder backed by a custom Python object."""
        pass

    def decode(self, tokens):
        """
        Decode the given list of tokens to a final string

        Args:
            tokens (:obj:`List[str]`):
                The list of tokens to decode

        Returns:
            :obj:`str`: The decoded string
        """
        pass

    @property
    def start(self):
        # NOTE(review): exposed as `start`/`stop` while __init__ takes
        # `left`/`right` — presumably the same values; confirm against the
        # Rust binding.
        """ """
        pass

    @start.setter
    def start(self, value):
        """ """
        pass

    @property
    def stop(self):
        """ """
        pass

    @stop.setter
    def stop(self, value):
        """ """
        pass
509
+
510
class WordPiece(Decoder):
    """
    WordPiece Decoder

    Args:
        prefix (:obj:`str`, `optional`, defaults to :obj:`##`):
            The prefix to use for subwords that are not a beginning-of-word

        cleanup (:obj:`bool`, `optional`, defaults to :obj:`True`):
            Whether to cleanup some tokenization artifacts. Mainly spaces before punctuation,
            and some abbreviated english forms.
    """
    def __init__(self, prefix="##", cleanup=True):
        pass

    def __getstate__(self):
        """Support for pickling."""
        pass

    def __setstate__(self, state):
        """Support for unpickling."""
        pass

    @property
    def cleanup(self):
        """Whether tokenization artifacts are cleaned up."""
        pass

    @cleanup.setter
    def cleanup(self, value):
        """Set whether tokenization artifacts are cleaned up."""
        pass

    @staticmethod
    def custom(decoder):
        """Create a Decoder backed by a custom Python object."""
        pass

    def decode(self, tokens):
        """
        Decode the given list of tokens to a final string

        Args:
            tokens (:obj:`List[str]`):
                The list of tokens to decode

        Returns:
            :obj:`str`: The decoded string
        """
        pass

    @property
    def prefix(self):
        """The prefix marking subwords that are not a beginning-of-word."""
        pass

    @prefix.setter
    def prefix(self, value):
        """Set the subword prefix."""
        pass
source/tokenizers/implementations/__init__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ from .base_tokenizer import BaseTokenizer
2
+ from .bert_wordpiece import BertWordPieceTokenizer
3
+ from .byte_level_bpe import ByteLevelBPETokenizer
4
+ from .char_level_bpe import CharBPETokenizer
5
+ from .sentencepiece_bpe import SentencePieceBPETokenizer
6
+ from .sentencepiece_unigram import SentencePieceUnigramTokenizer
source/tokenizers/implementations/base_tokenizer.py ADDED
@@ -0,0 +1,459 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict, List, Optional, Tuple, Union
2
+
3
+ from tokenizers import AddedToken, EncodeInput, Encoding, InputSequence, Tokenizer
4
+ from tokenizers.decoders import Decoder
5
+ from tokenizers.models import Model
6
+ from tokenizers.normalizers import Normalizer
7
+ from tokenizers.pre_tokenizers import PreTokenizer
8
+ from tokenizers.processors import PostProcessor
9
+
10
+
11
+ Offsets = Tuple[int, int]
12
+
13
+
14
class BaseTokenizer:
    """Thin Python wrapper around a native :class:`~tokenizers.Tokenizer`.

    Concrete implementations (BERT WordPiece, byte-level BPE, ...) build a
    fully configured ``Tokenizer`` and hand it to this base class, which
    delegates every call to the wrapped native object.
    """

    def __init__(self, tokenizer: Tokenizer, parameters: Optional[dict] = None):
        # The wrapped native tokenizer, and the construction parameters kept
        # only for display in __repr__.
        self._tokenizer = tokenizer
        self._parameters = parameters if parameters is not None else {}

    def __repr__(self):
        return "Tokenizer(vocabulary_size={}, {})".format(
            self._tokenizer.get_vocab_size(),
            ", ".join(k + "=" + str(v) for k, v in self._parameters.items()),
        )

    def num_special_tokens_to_add(self, is_pair: bool) -> int:
        """
        Return the number of special tokens that would be added for single/pair sentences.
        :param is_pair: Boolean indicating if the input would be a single sentence or a pair
        :return:
        """
        return self._tokenizer.num_special_tokens_to_add(is_pair)

    def get_vocab(self, with_added_tokens: bool = True) -> Dict[str, int]:
        """Returns the vocabulary

        Args:
            with_added_tokens: boolean:
                Whether to include the added tokens in the vocabulary

        Returns:
            The vocabulary
        """
        return self._tokenizer.get_vocab(with_added_tokens=with_added_tokens)

    def get_added_tokens_decoder(self) -> Dict[int, AddedToken]:
        """Returns the added reverse vocabulary

        Returns:
            The added vocabulary mapping ints to AddedTokens
        """
        return self._tokenizer.get_added_tokens_decoder()

    def get_vocab_size(self, with_added_tokens: bool = True) -> int:
        """Return the size of vocabulary, with or without added tokens.

        Args:
            with_added_tokens: (`optional`) bool:
                Whether to count in added special tokens or not

        Returns:
            Size of vocabulary
        """
        return self._tokenizer.get_vocab_size(with_added_tokens=with_added_tokens)

    def enable_padding(
        self,
        direction: Optional[str] = "right",
        pad_to_multiple_of: Optional[int] = None,
        pad_id: Optional[int] = 0,
        pad_type_id: Optional[int] = 0,
        pad_token: Optional[str] = "[PAD]",
        length: Optional[int] = None,
    ):
        """Change the padding strategy

        Args:
            direction: (`optional`) str:
                Can be one of: `right` or `left`

            pad_to_multiple_of: (`optional`) unsigned int:
                If specified, the padding length should always snap to the next multiple of
                the given value. For example if we were going to pad with a length of 250 but
                `pad_to_multiple_of=8` then we will pad to 256.

            pad_id: (`optional`) unsigned int:
                The indice to be used when padding

            pad_type_id: (`optional`) unsigned int:
                The type indice to be used when padding

            pad_token: (`optional`) str:
                The pad token to be used when padding

            length: (`optional`) unsigned int:
                If specified, the length at which to pad. If not specified
                we pad using the size of the longest sequence in a batch
        """
        return self._tokenizer.enable_padding(
            direction=direction,
            pad_to_multiple_of=pad_to_multiple_of,
            pad_id=pad_id,
            pad_type_id=pad_type_id,
            pad_token=pad_token,
            length=length,
        )

    def no_padding(self):
        """Disable padding"""
        return self._tokenizer.no_padding()

    @property
    def padding(self) -> Optional[dict]:
        """Get the current padding parameters

        Returns:
            None if padding is disabled, a dict with the currently set parameters
            if the padding is enabled.
        """
        return self._tokenizer.padding

    def enable_truncation(self, max_length: int, stride: Optional[int] = 0, strategy: Optional[str] = "longest_first"):
        """Change the truncation options

        Args:
            max_length: unsigned int:
                The maximum length at which to truncate

            stride: (`optional`) unsigned int:
                The length of the previous first sequence to be included
                in the overflowing sequence

            strategy: (`optional`) str:
                Can be one of `longest_first`, `only_first` or `only_second`
        """
        return self._tokenizer.enable_truncation(max_length, stride=stride, strategy=strategy)

    def no_truncation(self):
        """Disable truncation"""
        return self._tokenizer.no_truncation()

    @property
    def truncation(self) -> Optional[dict]:
        """Get the current truncation parameters

        Returns:
            None if truncation is disabled, a dict with the current truncation parameters if
            truncation is enabled
        """
        return self._tokenizer.truncation

    def add_tokens(self, tokens: List[Union[str, AddedToken]]) -> int:
        """Add the given tokens to the vocabulary

        Args:
            tokens: List[Union[str, AddedToken]]:
                A list of tokens to add to the vocabulary. Each token can either be
                a string, or an instance of AddedToken

        Returns:
            The number of tokens that were added to the vocabulary
        """
        return self._tokenizer.add_tokens(tokens)

    def add_special_tokens(self, special_tokens: List[Union[str, AddedToken]]) -> int:
        """Add the given special tokens to the vocabulary, and treat them as special tokens.

        The special tokens will never be processed by the model, and will be
        removed while decoding.

        Args:
            special_tokens: List[Union[str, AddedToken]]:
                A list of special tokens to add to the vocabulary. Each token can either be
                a string, or an instance of AddedToken

        Returns:
            The number of tokens that were added to the vocabulary
        """
        return self._tokenizer.add_special_tokens(special_tokens)

    def normalize(self, sequence: str) -> str:
        """Normalize the given sequence

        Args:
            sequence: str:
                The sequence to normalize

        Returns:
            The normalized string
        """
        return self._tokenizer.normalizer.normalize_str(sequence)

    def encode(
        self,
        sequence: InputSequence,
        pair: Optional[InputSequence] = None,
        is_pretokenized: bool = False,
        add_special_tokens: bool = True,
    ) -> Encoding:
        """Encode the given sequence and pair. This method can process raw text sequences as well
        as already pre-tokenized sequences.

        Args:
            sequence: InputSequence:
                The sequence we want to encode. This sequence can be either raw text or
                pre-tokenized, according to the `is_pretokenized` argument:

                - If `is_pretokenized=False`: `InputSequence` is expected to be `str`
                - If `is_pretokenized=True`: `InputSequence` is expected to be
                    `Union[List[str], Tuple[str]]`

            pair: (`optional`) InputSequence:
                An optional pair sequence, in the same format as `sequence`.

            is_pretokenized: bool:
                Whether the input is already pre-tokenized.

            add_special_tokens: bool:
                Whether to add the special tokens while encoding.

        Returns:
            An Encoding
        """
        if sequence is None:
            raise ValueError("encode: `sequence` can't be `None`")

        return self._tokenizer.encode(sequence, pair, is_pretokenized, add_special_tokens)

    def encode_batch(
        self,
        inputs: List[EncodeInput],
        is_pretokenized: bool = False,
        add_special_tokens: bool = True,
    ) -> List[Encoding]:
        """Encode the given inputs. This method accept both raw text sequences as well as already
        pre-tokenized sequences.

        Args:
            inputs: List[EncodeInput]:
                A list of single sequences or pair sequences to encode. Each `EncodeInput` is
                expected to be of the following form:
                `Union[InputSequence, Tuple[InputSequence, InputSequence]]`

                Each `InputSequence` can either be raw text or pre-tokenized,
                according to the `is_pretokenized` argument:

                - If `is_pretokenized=False`: `InputSequence` is expected to be `str`
                - If `is_pretokenized=True`: `InputSequence` is expected to be
                    `Union[List[str], Tuple[str]]`

            is_pretokenized: bool:
                Whether the input is already pre-tokenized.

            add_special_tokens: bool:
                Whether to add the special tokens while encoding.

        Returns:
            A list of Encoding
        """

        if inputs is None:
            raise ValueError("encode_batch: `inputs` can't be `None`")

        return self._tokenizer.encode_batch(inputs, is_pretokenized, add_special_tokens)

    async def async_encode_batch(
        self,
        inputs: List[EncodeInput],
        is_pretokenized: bool = False,
        add_special_tokens: bool = True,
    ) -> List[Encoding]:
        """Asynchronously encode a batch (tracks character offsets).

        Args:
            inputs: A list of single or pair sequences to encode.
            is_pretokenized: Whether inputs are already pre-tokenized.
            add_special_tokens: Whether to add special tokens.

        Returns:
            A list of Encoding.
        """
        if inputs is None:
            raise ValueError("async_encode_batch: `inputs` can't be `None`")
        # Exposed by the Rust bindings via pyo3_async_runtimes::tokio::future_into_py
        return await self._tokenizer.async_encode_batch(inputs, is_pretokenized, add_special_tokens)

    async def async_encode_batch_fast(
        self,
        inputs: List[EncodeInput],
        is_pretokenized: bool = False,
        add_special_tokens: bool = True,
    ) -> List[Encoding]:
        """Asynchronously encode a batch (no character offsets, faster).

        Args:
            inputs: A list of single or pair sequences to encode.
            is_pretokenized: Whether inputs are already pre-tokenized.
            add_special_tokens: Whether to add special tokens.

        Returns:
            A list of Encoding.
        """
        if inputs is None:
            raise ValueError("async_encode_batch_fast: `inputs` can't be `None`")
        return await self._tokenizer.async_encode_batch_fast(inputs, is_pretokenized, add_special_tokens)

    def decode(self, ids: List[int], skip_special_tokens: Optional[bool] = True) -> str:
        """Decode the given list of ids to a string sequence

        Args:
            ids: List[unsigned int]:
                A list of ids to be decoded

            skip_special_tokens: (`optional`) boolean:
                Whether to remove all the special tokens from the output string

        Returns:
            The decoded string
        """
        if ids is None:
            raise ValueError("None input is not valid. Should be a list of integers.")

        return self._tokenizer.decode(ids, skip_special_tokens=skip_special_tokens)

    def decode_batch(self, sequences: List[List[int]], skip_special_tokens: Optional[bool] = True) -> List[str]:
        """Decode the list of sequences to a list of string sequences

        Args:
            sequences: List[List[unsigned int]]:
                A list of sequence of ids to be decoded

            skip_special_tokens: (`optional`) boolean:
                Whether to remove all the special tokens from the output strings

        Returns:
            A list of decoded strings
        """
        if sequences is None:
            raise ValueError("None input is not valid. Should be list of list of integers.")

        return self._tokenizer.decode_batch(sequences, skip_special_tokens=skip_special_tokens)

    def token_to_id(self, token: str) -> Optional[int]:
        """Convert the given token to its corresponding id

        Args:
            token: str:
                The token to convert

        Returns:
            The corresponding id if it exists, None otherwise
        """
        return self._tokenizer.token_to_id(token)

    def id_to_token(self, id: int) -> Optional[str]:
        """Convert the given token id to its corresponding string

        Args:
            id: int:
                The token id to convert

        Returns:
            The corresponding string if it exists, None otherwise
        """
        return self._tokenizer.id_to_token(id)

    def save_model(self, directory: str, prefix: Optional[str] = None):
        """Save the current model to the given directory

        Args:
            directory: str:
                A path to the destination directory

            prefix: (Optional) str:
                An optional prefix, used to prefix each file name
        """
        return self._tokenizer.model.save(directory, prefix=prefix)

    def save(self, path: str, pretty: bool = True):
        """Save the current Tokenizer at the given path

        Args:
            path: str:
                A path to the destination Tokenizer file

            pretty: bool:
                Whether the saved JSON should be prettified
        """
        return self._tokenizer.save(path, pretty)

    def to_str(self, pretty: bool = False):
        """Get a serialized JSON version of the Tokenizer as a str

        Args:
            pretty: bool:
                Whether the JSON string should be prettified

        Returns:
            str
        """
        return self._tokenizer.to_str(pretty)

    def post_process(
        self, encoding: Encoding, pair: Optional[Encoding] = None, add_special_tokens: bool = True
    ) -> Encoding:
        """Apply all the post-processing steps to the given encodings.

        The various steps are:
            1. Truncate according to global params (provided to `enable_truncation`)
            2. Apply the PostProcessor
            3. Pad according to global params. (provided to `enable_padding`)

        Args:
            encoding: Encoding:
                The main Encoding to post process

            pair: Optional[Encoding]:
                An optional pair Encoding

            add_special_tokens: bool:
                Whether to add special tokens

        Returns:
            The resulting Encoding
        """
        return self._tokenizer.post_process(encoding, pair, add_special_tokens)

    @property
    def model(self) -> Model:
        # Direct pass-through accessors for the wrapped tokenizer's components.
        return self._tokenizer.model

    @model.setter
    def model(self, model: Model):
        self._tokenizer.model = model

    @property
    def normalizer(self) -> Normalizer:
        return self._tokenizer.normalizer

    @normalizer.setter
    def normalizer(self, normalizer: Normalizer):
        self._tokenizer.normalizer = normalizer

    @property
    def pre_tokenizer(self) -> PreTokenizer:
        return self._tokenizer.pre_tokenizer

    @pre_tokenizer.setter
    def pre_tokenizer(self, pre_tokenizer: PreTokenizer):
        self._tokenizer.pre_tokenizer = pre_tokenizer

    @property
    def post_processor(self) -> PostProcessor:
        return self._tokenizer.post_processor

    @post_processor.setter
    def post_processor(self, post_processor: PostProcessor):
        self._tokenizer.post_processor = post_processor

    @property
    def decoder(self) -> Decoder:
        return self._tokenizer.decoder

    @decoder.setter
    def decoder(self, decoder: Decoder):
        self._tokenizer.decoder = decoder
source/tokenizers/implementations/bert_wordpiece.py ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict, Iterator, List, Optional, Union
2
+
3
+ from tokenizers import AddedToken, Tokenizer, decoders, trainers
4
+ from tokenizers.models import WordPiece
5
+ from tokenizers.normalizers import BertNormalizer
6
+ from tokenizers.pre_tokenizers import BertPreTokenizer
7
+ from tokenizers.processors import BertProcessing
8
+
9
+ from .base_tokenizer import BaseTokenizer
10
+
11
+
12
class BertWordPieceTokenizer(BaseTokenizer):
    """Bert WordPiece Tokenizer.

    Builds the tokenization pipeline used by BERT: a ``BertNormalizer``, a
    ``BertPreTokenizer``, a ``WordPiece`` model and, when a vocabulary is
    provided, a ``BertProcessing`` post-processor that adds the ``[CLS]`` /
    ``[SEP]`` special tokens around the sequences.
    """

    def __init__(
        self,
        vocab: Optional[Union[str, Dict[str, int]]] = None,
        unk_token: Union[str, AddedToken] = "[UNK]",
        sep_token: Union[str, AddedToken] = "[SEP]",
        cls_token: Union[str, AddedToken] = "[CLS]",
        pad_token: Union[str, AddedToken] = "[PAD]",
        mask_token: Union[str, AddedToken] = "[MASK]",
        clean_text: bool = True,
        handle_chinese_chars: bool = True,
        strip_accents: Optional[bool] = None,
        lowercase: bool = True,
        wordpieces_prefix: str = "##",
    ):
        if vocab is not None:
            tokenizer = Tokenizer(WordPiece(vocab, unk_token=str(unk_token)))
        else:
            tokenizer = Tokenizer(WordPiece(unk_token=str(unk_token)))

        # Let the tokenizer know about special tokens if they are part of the vocab
        for special_token in (unk_token, sep_token, cls_token, pad_token, mask_token):
            if tokenizer.token_to_id(str(special_token)) is not None:
                tokenizer.add_special_tokens([str(special_token)])

        tokenizer.normalizer = BertNormalizer(
            clean_text=clean_text,
            handle_chinese_chars=handle_chinese_chars,
            strip_accents=strip_accents,
            lowercase=lowercase,
        )
        tokenizer.pre_tokenizer = BertPreTokenizer()

        if vocab is not None:
            # The post-processor needs the real ids of [SEP] / [CLS]; without a
            # vocabulary they cannot exist yet, so it is only set up here.
            sep_token_id = tokenizer.token_to_id(str(sep_token))
            if sep_token_id is None:
                raise TypeError("sep_token not found in the vocabulary")
            cls_token_id = tokenizer.token_to_id(str(cls_token))
            if cls_token_id is None:
                raise TypeError("cls_token not found in the vocabulary")

            tokenizer.post_processor = BertProcessing((str(sep_token), sep_token_id), (str(cls_token), cls_token_id))
        tokenizer.decoder = decoders.WordPiece(prefix=wordpieces_prefix)

        parameters = {
            "model": "BertWordPiece",
            "unk_token": unk_token,
            "sep_token": sep_token,
            "cls_token": cls_token,
            "pad_token": pad_token,
            "mask_token": mask_token,
            "clean_text": clean_text,
            "handle_chinese_chars": handle_chinese_chars,
            "strip_accents": strip_accents,
            "lowercase": lowercase,
            "wordpieces_prefix": wordpieces_prefix,
        }

        super().__init__(tokenizer, parameters)

    @staticmethod
    def from_file(vocab: str, **kwargs):
        """Instantiate a tokenizer from a ``vocab.txt`` file path."""
        vocab = WordPiece.read_file(vocab)
        return BertWordPieceTokenizer(vocab, **kwargs)

    def train(
        self,
        files: Union[str, List[str]],
        vocab_size: int = 30000,
        min_frequency: int = 2,
        limit_alphabet: int = 1000,
        initial_alphabet: Optional[List[str]] = None,
        special_tokens: Optional[List[Union[str, AddedToken]]] = None,
        show_progress: bool = True,
        wordpieces_prefix: str = "##",
    ):
        """Train the model using the given files.

        ``initial_alphabet`` defaults to an empty list and ``special_tokens``
        to the standard BERT tokens; ``None`` is used as the declared default
        to avoid the shared-mutable-default-argument pitfall.
        """
        if initial_alphabet is None:
            initial_alphabet = []
        if special_tokens is None:
            special_tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]

        trainer = trainers.WordPieceTrainer(
            vocab_size=vocab_size,
            min_frequency=min_frequency,
            limit_alphabet=limit_alphabet,
            initial_alphabet=initial_alphabet,
            special_tokens=special_tokens,
            show_progress=show_progress,
            continuing_subword_prefix=wordpieces_prefix,
        )
        if isinstance(files, str):
            files = [files]
        self._tokenizer.train(files, trainer=trainer)

    def train_from_iterator(
        self,
        iterator: Union[Iterator[str], Iterator[Iterator[str]]],
        vocab_size: int = 30000,
        min_frequency: int = 2,
        limit_alphabet: int = 1000,
        initial_alphabet: Optional[List[str]] = None,
        special_tokens: Optional[List[Union[str, AddedToken]]] = None,
        show_progress: bool = True,
        wordpieces_prefix: str = "##",
        length: Optional[int] = None,
    ):
        """Train the model using the given iterator.

        ``length``, when known, allows meaningful progress reporting.
        Defaults mirror :meth:`train` (see note there about ``None``).
        """
        if initial_alphabet is None:
            initial_alphabet = []
        if special_tokens is None:
            special_tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]

        trainer = trainers.WordPieceTrainer(
            vocab_size=vocab_size,
            min_frequency=min_frequency,
            limit_alphabet=limit_alphabet,
            initial_alphabet=initial_alphabet,
            special_tokens=special_tokens,
            show_progress=show_progress,
            continuing_subword_prefix=wordpieces_prefix,
        )
        self._tokenizer.train_from_iterator(
            iterator,
            trainer=trainer,
            length=length,
        )
source/tokenizers/implementations/byte_level_bpe.py ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict, Iterator, List, Optional, Tuple, Union
2
+
3
+ from tokenizers import AddedToken, Tokenizer, decoders, pre_tokenizers, processors, trainers
4
+ from tokenizers.models import BPE
5
+ from tokenizers.normalizers import Lowercase, Sequence, unicode_normalizer_from_str
6
+
7
+ from .base_tokenizer import BaseTokenizer
8
+
9
+
10
class ByteLevelBPETokenizer(BaseTokenizer):
    """ByteLevelBPETokenizer

    Represents a Byte-level BPE as introduced by OpenAI with their GPT-2 model.
    The pre-tokenizer, decoder and post-processor all work at the byte level,
    so no unknown token is ever needed.
    """

    def __init__(
        self,
        vocab: Optional[Union[str, Dict[str, int]]] = None,
        merges: Optional[Union[str, List[Tuple[str, str]]]] = None,
        add_prefix_space: bool = False,
        lowercase: bool = False,
        dropout: Optional[float] = None,
        unicode_normalizer: Optional[str] = None,
        continuing_subword_prefix: Optional[str] = None,
        end_of_word_suffix: Optional[str] = None,
        trim_offsets: bool = False,
    ):
        # A usable model needs both the vocab and the merges; otherwise start
        # from an empty BPE that can be trained afterwards.
        if vocab is not None and merges is not None:
            tokenizer = Tokenizer(
                BPE(
                    vocab,
                    merges,
                    dropout=dropout,
                    continuing_subword_prefix=continuing_subword_prefix or "",
                    end_of_word_suffix=end_of_word_suffix or "",
                )
            )
        else:
            tokenizer = Tokenizer(BPE())

        # Check for Unicode normalization first (before everything else)
        normalizers = []

        if unicode_normalizer:
            normalizers += [unicode_normalizer_from_str(unicode_normalizer)]

        if lowercase:
            normalizers += [Lowercase()]

        # Create the normalizer structure
        if len(normalizers) > 0:
            if len(normalizers) > 1:
                tokenizer.normalizer = Sequence(normalizers)
            else:
                tokenizer.normalizer = normalizers[0]

        tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=add_prefix_space)
        tokenizer.decoder = decoders.ByteLevel()
        tokenizer.post_processor = processors.ByteLevel(trim_offsets=trim_offsets)

        parameters = {
            "model": "ByteLevelBPE",
            "add_prefix_space": add_prefix_space,
            "lowercase": lowercase,
            "dropout": dropout,
            "unicode_normalizer": unicode_normalizer,
            "continuing_subword_prefix": continuing_subword_prefix,
            "end_of_word_suffix": end_of_word_suffix,
            "trim_offsets": trim_offsets,
        }

        super().__init__(tokenizer, parameters)

    @staticmethod
    def from_file(vocab_filename: str, merges_filename: str, **kwargs):
        """Instantiate a tokenizer from ``vocab.json`` / ``merges.txt`` file paths."""
        vocab, merges = BPE.read_file(vocab_filename, merges_filename)
        return ByteLevelBPETokenizer(vocab, merges, **kwargs)

    def train(
        self,
        files: Union[str, List[str]],
        vocab_size: int = 30000,
        min_frequency: int = 2,
        show_progress: bool = True,
        special_tokens: Optional[List[Union[str, AddedToken]]] = None,
    ):
        """Train the model using the given files.

        ``special_tokens`` defaults to an empty list; ``None`` is used as the
        declared default to avoid the shared-mutable-default-argument pitfall.
        """
        if special_tokens is None:
            special_tokens = []

        trainer = trainers.BpeTrainer(
            vocab_size=vocab_size,
            min_frequency=min_frequency,
            show_progress=show_progress,
            special_tokens=special_tokens,
            # The byte-level alphabet must always be fully covered.
            initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
        )
        if isinstance(files, str):
            files = [files]
        self._tokenizer.train(files, trainer=trainer)

    def train_from_iterator(
        self,
        iterator: Union[Iterator[str], Iterator[Iterator[str]]],
        vocab_size: int = 30000,
        min_frequency: int = 2,
        show_progress: bool = True,
        special_tokens: Optional[List[Union[str, AddedToken]]] = None,
        length: Optional[int] = None,
    ):
        """Train the model using the given iterator.

        ``length``, when known, allows meaningful progress reporting.
        See :meth:`train` for the ``special_tokens`` default.
        """
        if special_tokens is None:
            special_tokens = []

        trainer = trainers.BpeTrainer(
            vocab_size=vocab_size,
            min_frequency=min_frequency,
            show_progress=show_progress,
            special_tokens=special_tokens,
            initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
        )
        self._tokenizer.train_from_iterator(
            iterator,
            trainer=trainer,
            length=length,
        )
source/tokenizers/implementations/char_level_bpe.py ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict, Iterator, List, Optional, Tuple, Union
2
+
3
+ from .. import AddedToken, Tokenizer, decoders, pre_tokenizers, trainers
4
+ from ..models import BPE
5
+ from ..normalizers import BertNormalizer, Lowercase, Sequence, unicode_normalizer_from_str
6
+ from .base_tokenizer import BaseTokenizer
7
+
8
+
9
class CharBPETokenizer(BaseTokenizer):
    """Original BPE Tokenizer

    Represents the BPE algorithm, as introduced by Rico Sennrich
    (https://arxiv.org/abs/1508.07909)

    The default settings correspond to OpenAI GPT BPE tokenizers and differ from the original
    Sennrich subword-nmt implementation by the following options that you can deactivate:
        - adding a normalizer to clean up the text (deactivate with `bert_normalizer=False`) by:
            * removing any control characters and replacing all whitespaces by the classic one.
            * handle chinese chars by putting spaces around them.
            * strip all accents.
        - splitting on punctuation in addition to whitespaces (deactivate it with
          `split_on_whitespace_only=True`)
    """

    def __init__(
        self,
        vocab: Optional[Union[str, Dict[str, int]]] = None,
        merges: Optional[Union[str, List[Tuple[str, str]]]] = None,
        unk_token: Union[str, AddedToken] = "<unk>",
        suffix: str = "</w>",
        dropout: Optional[float] = None,
        lowercase: bool = False,
        unicode_normalizer: Optional[str] = None,
        bert_normalizer: bool = True,
        split_on_whitespace_only: bool = False,
    ):
        # A usable model needs both the vocab and the merges; otherwise start
        # from an empty BPE that can be trained afterwards.
        if vocab is not None and merges is not None:
            tokenizer = Tokenizer(
                BPE(
                    vocab,
                    merges,
                    dropout=dropout,
                    unk_token=str(unk_token),
                    end_of_word_suffix=suffix,
                )
            )
        else:
            tokenizer = Tokenizer(BPE(unk_token=str(unk_token), dropout=dropout, end_of_word_suffix=suffix))

        if tokenizer.token_to_id(str(unk_token)) is not None:
            tokenizer.add_special_tokens([str(unk_token)])

        # Check for Unicode normalization first (before everything else)
        normalizers = []

        if unicode_normalizer:
            normalizers += [unicode_normalizer_from_str(unicode_normalizer)]

        if bert_normalizer:
            normalizers += [BertNormalizer(lowercase=False)]

        if lowercase:
            normalizers += [Lowercase()]

        # Create the normalizer structure
        if len(normalizers) > 0:
            if len(normalizers) > 1:
                tokenizer.normalizer = Sequence(normalizers)
            else:
                tokenizer.normalizer = normalizers[0]

        if split_on_whitespace_only:
            tokenizer.pre_tokenizer = pre_tokenizers.WhitespaceSplit()
        else:
            tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()

        tokenizer.decoder = decoders.BPEDecoder(suffix=suffix)

        parameters = {
            "model": "BPE",
            "unk_token": unk_token,
            "suffix": suffix,
            "dropout": dropout,
            "lowercase": lowercase,
            "unicode_normalizer": unicode_normalizer,
            "bert_normalizer": bert_normalizer,
            "split_on_whitespace_only": split_on_whitespace_only,
        }

        super().__init__(tokenizer, parameters)

    @staticmethod
    def from_file(vocab_filename: str, merges_filename: str, **kwargs):
        """Instantiate a tokenizer from ``vocab.json`` / ``merges.txt`` file paths."""
        vocab, merges = BPE.read_file(vocab_filename, merges_filename)
        return CharBPETokenizer(vocab, merges, **kwargs)

    def train(
        self,
        files: Union[str, List[str]],
        vocab_size: int = 30000,
        min_frequency: int = 2,
        special_tokens: Optional[List[Union[str, AddedToken]]] = None,
        limit_alphabet: int = 1000,
        initial_alphabet: Optional[List[str]] = None,
        suffix: Optional[str] = "</w>",
        show_progress: bool = True,
    ):
        """Train the model using the given files.

        ``special_tokens`` defaults to ``["<unk>"]`` and ``initial_alphabet``
        to an empty list; ``None`` is used as the declared default to avoid
        the shared-mutable-default-argument pitfall.
        """
        if special_tokens is None:
            special_tokens = ["<unk>"]
        if initial_alphabet is None:
            initial_alphabet = []

        trainer = trainers.BpeTrainer(
            vocab_size=vocab_size,
            min_frequency=min_frequency,
            special_tokens=special_tokens,
            limit_alphabet=limit_alphabet,
            initial_alphabet=initial_alphabet,
            end_of_word_suffix=suffix,
            show_progress=show_progress,
        )
        if isinstance(files, str):
            files = [files]
        self._tokenizer.train(files, trainer=trainer)

    def train_from_iterator(
        self,
        iterator: Union[Iterator[str], Iterator[Iterator[str]]],
        vocab_size: int = 30000,
        min_frequency: int = 2,
        special_tokens: Optional[List[Union[str, AddedToken]]] = None,
        limit_alphabet: int = 1000,
        initial_alphabet: Optional[List[str]] = None,
        suffix: Optional[str] = "</w>",
        show_progress: bool = True,
        length: Optional[int] = None,
    ):
        """Train the model using the given iterator.

        ``length``, when known, allows meaningful progress reporting.
        Defaults mirror :meth:`train` (see note there about ``None``).
        """
        if special_tokens is None:
            special_tokens = ["<unk>"]
        if initial_alphabet is None:
            initial_alphabet = []

        trainer = trainers.BpeTrainer(
            vocab_size=vocab_size,
            min_frequency=min_frequency,
            special_tokens=special_tokens,
            limit_alphabet=limit_alphabet,
            initial_alphabet=initial_alphabet,
            end_of_word_suffix=suffix,
            show_progress=show_progress,
        )
        self._tokenizer.train_from_iterator(
            iterator,
            trainer=trainer,
            length=length,
        )
source/tokenizers/implementations/sentencepiece_bpe.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict, Iterator, List, Optional, Tuple, Union
2
+
3
+ from tokenizers import AddedToken, Tokenizer, decoders, pre_tokenizers, trainers
4
+ from tokenizers.models import BPE
5
+ from tokenizers.normalizers import NFKC
6
+
7
+ from .base_tokenizer import BaseTokenizer
8
+
9
+
10
class SentencePieceBPETokenizer(BaseTokenizer):
    """SentencePiece BPE Tokenizer

    Represents the BPE algorithm, with the pretokenization used by SentencePiece:
    NFKC normalization plus a Metaspace pre-tokenizer/decoder using the
    ``replacement`` character.
    """

    def __init__(
        self,
        vocab: Optional[Union[str, Dict[str, int]]] = None,
        merges: Optional[Union[str, List[Tuple[str, str]]]] = None,
        unk_token: Union[str, AddedToken] = "<unk>",
        replacement: str = "▁",
        add_prefix_space: bool = True,
        dropout: Optional[float] = None,
        fuse_unk: Optional[bool] = False,
    ):
        # A usable model needs both the vocab and the merges; otherwise start
        # from an empty BPE that can be trained afterwards.
        # NOTE(review): unlike CharBPETokenizer, `unk_token` is passed to BPE
        # without a str() conversion — presumably BPE accepts AddedToken too;
        # confirm before relying on AddedToken inputs here.
        if vocab is not None and merges is not None:
            tokenizer = Tokenizer(BPE(vocab, merges, dropout=dropout, unk_token=unk_token, fuse_unk=fuse_unk))
        else:
            tokenizer = Tokenizer(BPE(dropout=dropout, unk_token=unk_token, fuse_unk=fuse_unk))

        if tokenizer.token_to_id(str(unk_token)) is not None:
            tokenizer.add_special_tokens([str(unk_token)])

        tokenizer.normalizer = NFKC()
        prepend_scheme = "always" if add_prefix_space else "never"
        tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme)
        tokenizer.decoder = decoders.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme)

        # NOTE(review): `fuse_unk` is deliberately kept out of `parameters` to
        # preserve the historical serialized form of this wrapper.
        parameters = {
            "model": "SentencePieceBPE",
            "unk_token": unk_token,
            "replacement": replacement,
            "add_prefix_space": add_prefix_space,
            "dropout": dropout,
        }

        super().__init__(tokenizer, parameters)

    @staticmethod
    def from_file(vocab_filename: str, merges_filename: str, **kwargs):
        """Instantiate a tokenizer from ``vocab.json`` / ``merges.txt`` file paths."""
        vocab, merges = BPE.read_file(vocab_filename, merges_filename)
        return SentencePieceBPETokenizer(vocab, merges, **kwargs)

    def train(
        self,
        files: Union[str, List[str]],
        vocab_size: int = 30000,
        min_frequency: int = 2,
        special_tokens: Optional[List[Union[str, AddedToken]]] = None,
        limit_alphabet: int = 1000,
        initial_alphabet: Optional[List[str]] = None,
        show_progress: bool = True,
    ):
        """Train the model using the given files.

        ``special_tokens`` defaults to ``["<unk>"]`` and ``initial_alphabet``
        to an empty list; ``None`` is used as the declared default to avoid
        the shared-mutable-default-argument pitfall.
        """
        if special_tokens is None:
            special_tokens = ["<unk>"]
        if initial_alphabet is None:
            initial_alphabet = []

        trainer = trainers.BpeTrainer(
            vocab_size=vocab_size,
            min_frequency=min_frequency,
            special_tokens=special_tokens,
            limit_alphabet=limit_alphabet,
            initial_alphabet=initial_alphabet,
            show_progress=show_progress,
        )
        if isinstance(files, str):
            files = [files]
        self._tokenizer.train(files, trainer=trainer)

    def train_from_iterator(
        self,
        iterator: Union[Iterator[str], Iterator[Iterator[str]]],
        vocab_size: int = 30000,
        min_frequency: int = 2,
        special_tokens: Optional[List[Union[str, AddedToken]]] = None,
        limit_alphabet: int = 1000,
        initial_alphabet: Optional[List[str]] = None,
        show_progress: bool = True,
        length: Optional[int] = None,
    ):
        """Train the model using the given iterator.

        ``length``, when known, allows meaningful progress reporting.
        Defaults mirror :meth:`train` (see note there about ``None``).
        """
        if special_tokens is None:
            special_tokens = ["<unk>"]
        if initial_alphabet is None:
            initial_alphabet = []

        trainer = trainers.BpeTrainer(
            vocab_size=vocab_size,
            min_frequency=min_frequency,
            special_tokens=special_tokens,
            limit_alphabet=limit_alphabet,
            initial_alphabet=initial_alphabet,
            show_progress=show_progress,
        )
        self._tokenizer.train_from_iterator(
            iterator,
            trainer=trainer,
            length=length,
        )
source/tokenizers/implementations/sentencepiece_unigram.py ADDED
@@ -0,0 +1,196 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ from typing import Iterator, List, Optional, Union, Tuple
4
+
5
+ from tokenizers import AddedToken, Regex, Tokenizer, decoders, normalizers, pre_tokenizers, trainers
6
+ from tokenizers.models import Unigram
7
+
8
+ from .base_tokenizer import BaseTokenizer
9
+
10
+
11
class SentencePieceUnigramTokenizer(BaseTokenizer):
    """SentencePiece Unigram Tokenizer

    Represents the Unigram algorithm, with the pretokenization used by
    SentencePiece: Nmt + NFKC normalization, whitespace collapsing, and a
    Metaspace pre-tokenizer/decoder using the ``replacement`` character.
    """

    def __init__(
        self,
        vocab: Optional[List[Tuple[str, float]]] = None,
        replacement: str = "▁",
        add_prefix_space: bool = True,
    ):
        if vocab is not None:
            # Let Unigram(..) fail if only one of them is None
            tokenizer = Tokenizer(Unigram(vocab))
        else:
            tokenizer = Tokenizer(Unigram())

        tokenizer.normalizer = normalizers.Sequence(
            [normalizers.Nmt(), normalizers.NFKC(), normalizers.Replace(Regex(" {2,}"), " ")]
        )
        prepend_scheme = "always" if add_prefix_space else "never"
        tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme)
        tokenizer.decoder = decoders.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme)

        parameters = {
            "model": "SentencePieceUnigram",
            "replacement": replacement,
            "add_prefix_space": add_prefix_space,
        }

        super().__init__(tokenizer, parameters)

    def train(
        self,
        files: Union[str, List[str]],
        vocab_size: int = 8000,
        show_progress: bool = True,
        special_tokens: Optional[List[Union[str, AddedToken]]] = None,
        initial_alphabet: Optional[List[str]] = None,
        unk_token: Optional[str] = None,
    ):
        """
        Train the model using the given files

        Args:
            files (:obj:`List[str]`):
                A list of path to the files that we should use for training
            vocab_size (:obj:`int`):
                The size of the final vocabulary, including all tokens and alphabet.
            show_progress (:obj:`bool`):
                Whether to show progress bars while training.
            special_tokens (:obj:`List[Union[str, AddedToken]]`, `optional`):
                A list of special tokens the model should know of.
            initial_alphabet (:obj:`List[str]`, `optional`):
                A list of characters to include in the initial alphabet, even
                if not seen in the training dataset.
                If the strings contain more than one character, only the first one
                is kept.
            unk_token (:obj:`str`, `optional`):
                The unknown token to be used by the model.
        """

        if special_tokens is None:
            special_tokens = []

        if initial_alphabet is None:
            initial_alphabet = []

        trainer = trainers.UnigramTrainer(
            vocab_size=vocab_size,
            special_tokens=special_tokens,
            show_progress=show_progress,
            initial_alphabet=initial_alphabet,
            unk_token=unk_token,
        )

        if isinstance(files, str):
            files = [files]
        self._tokenizer.train(files, trainer=trainer)

    def train_from_iterator(
        self,
        iterator: Union[Iterator[str], Iterator[Iterator[str]]],
        vocab_size: int = 8000,
        show_progress: bool = True,
        special_tokens: Optional[List[Union[str, AddedToken]]] = None,
        initial_alphabet: Optional[List[str]] = None,
        unk_token: Optional[str] = None,
        length: Optional[int] = None,
    ):
        """
        Train the model using the given iterator

        Args:
            iterator (:obj:`Union[Iterator[str], Iterator[Iterator[str]]]`):
                Any iterator over strings or list of strings
            vocab_size (:obj:`int`):
                The size of the final vocabulary, including all tokens and alphabet.
            show_progress (:obj:`bool`):
                Whether to show progress bars while training.
            special_tokens (:obj:`List[Union[str, AddedToken]]`, `optional`):
                A list of special tokens the model should know of.
            initial_alphabet (:obj:`List[str]`, `optional`):
                A list of characters to include in the initial alphabet, even
                if not seen in the training dataset.
                If the strings contain more than one character, only the first one
                is kept.
            unk_token (:obj:`str`, `optional`):
                The unknown token to be used by the model.
            length (:obj:`int`, `optional`):
                The total number of sequences in the iterator. This is used to
                provide meaningful progress tracking
        """

        if special_tokens is None:
            special_tokens = []

        if initial_alphabet is None:
            initial_alphabet = []

        trainer = trainers.UnigramTrainer(
            vocab_size=vocab_size,
            special_tokens=special_tokens,
            show_progress=show_progress,
            initial_alphabet=initial_alphabet,
            unk_token=unk_token,
        )

        self._tokenizer.train_from_iterator(
            iterator,
            trainer=trainer,
            length=length,
        )

    @staticmethod
    def from_spm(filename: str):
        """Build a tokenizer from a trained SentencePiece ``.model`` file.

        Requires the standalone ``sentencepiece_model_pb2`` protobuf module to
        be importable from the current directory (see the error message below).
        Only Unigram-trained models (``model_type == 1``) are supported.
        """
        try:
            import sys

            # Allow picking up sentencepiece_model_pb2.py from the CWD.
            sys.path.append(".")

            import sentencepiece_model_pb2 as model  # type: ignore[import]
        except Exception:
            raise Exception(
                "You don't seem to have the required protobuf file, in order to use this function you need to run `pip install protobuf` and `wget https://raw.githubusercontent.com/google/sentencepiece/master/python/src/sentencepiece/sentencepiece_model_pb2.py` for us to be able to read the intrinsics of your spm_file. `pip install sentencepiece` is not required."
            )

        m = model.ModelProto()
        # Use a context manager so the file handle is always closed.
        with open(filename, "rb") as f:
            m.ParseFromString(f.read())

        precompiled_charsmap = m.normalizer_spec.precompiled_charsmap
        vocab = [(piece.piece, piece.score) for piece in m.pieces]
        unk_id = m.trainer_spec.unk_id
        model_type = m.trainer_spec.model_type
        byte_fallback = m.trainer_spec.byte_fallback
        if model_type != 1:
            raise Exception(
                "You're trying to run a `Unigram` model but your file was trained with a different algorithm"
            )

        replacement = "▁"
        add_prefix_space = True

        tokenizer = Tokenizer(Unigram(vocab, unk_id, byte_fallback))

        if precompiled_charsmap:
            tokenizer.normalizer = normalizers.Sequence(
                [
                    normalizers.Precompiled(precompiled_charsmap),
                    normalizers.Replace(Regex(" {2,}"), " "),
                ]
            )
        else:
            tokenizer.normalizer = normalizers.Sequence([normalizers.Replace(Regex(" {2,}"), " ")])
        prepend_scheme = "always" if add_prefix_space else "never"
        tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme)
        tokenizer.decoder = decoders.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme)

        parameters = {
            "model": "SentencePieceUnigram",
        }

        # Bypass this class's __init__ (which would build a default pipeline)
        # and initialize the base wrapper with the tokenizer built above.
        obj = BaseTokenizer.__new__(SentencePieceUnigramTokenizer)
        BaseTokenizer.__init__(obj, tokenizer, parameters)
        return obj
source/tokenizers/models/__init__.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
# Generated content DO NOT EDIT
# Re-export the model classes implemented by the compiled `tokenizers.models`
# extension module so they can be imported from `tokenizers.models` directly.
from .. import models

Model = models.Model
BPE = models.BPE
Unigram = models.Unigram
WordLevel = models.WordLevel
WordPiece = models.WordPiece
source/tokenizers/models/__init__.pyi ADDED
@@ -0,0 +1,744 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Generated content DO NOT EDIT
2
# Typing stub for the Rust-backed `tokenizers.models.Model` class; every
# method body is `pass` because the real implementation lives in the
# compiled extension module.
class Model:
    """
    Base class for all models

    The model represents the actual tokenization algorithm. This is the part that
    will contain and manage the learned vocabulary.

    This class cannot be constructed directly. Please use one of the concrete models.
    """

    def __init__(self):
        pass

    def __getstate__(self):
        """ """
        pass

    def __setstate__(self, state):
        """ """
        pass

    def get_trainer(self):
        """
        Get the associated :class:`~tokenizers.trainers.Trainer`

        Retrieve the :class:`~tokenizers.trainers.Trainer` associated to this
        :class:`~tokenizers.models.Model`.

        Returns:
            :class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model
        """
        pass

    def id_to_token(self, id):
        """
        Get the token associated to an ID

        Args:
            id (:obj:`int`):
                An ID to convert to a token

        Returns:
            :obj:`str`: The token associated to the ID
        """
        pass

    def save(self, folder, prefix):
        """
        Save the current model

        Save the current model in the given folder, using the given prefix for the various
        files that will get created.
        Any file with the same name that already exists in this folder will be overwritten.

        Args:
            folder (:obj:`str`):
                The path to the target folder in which to save the various files

            prefix (:obj:`str`, `optional`):
                An optional prefix, used to prefix each file name

        Returns:
            :obj:`List[str]`: The list of saved files
        """
        pass

    def token_to_id(self, tokens):
        """
        Get the ID associated to a token

        Args:
            token (:obj:`str`):
                A token to convert to an ID

        Returns:
            :obj:`int`: The ID associated to the token
        """
        pass

    def tokenize(self, sequence):
        """
        Tokenize a sequence

        Args:
            sequence (:obj:`str`):
                A sequence to tokenize

        Returns:
            A :obj:`List` of :class:`~tokenizers.Token`: The generated tokens
        """
        pass
92
+
93
+ class BPE(Model):
94
+ """
95
+ An implementation of the BPE (Byte-Pair Encoding) algorithm
96
+
97
+ Args:
98
+ vocab (:obj:`Dict[str, int]`, `optional`):
99
+ A dictionary of string keys and their ids :obj:`{"am": 0,...}`
100
+
101
+ merges (:obj:`List[Tuple[str, str]]`, `optional`):
102
+ A list of pairs of tokens (:obj:`Tuple[str, str]`) :obj:`[("a", "b"),...]`
103
+
104
+ cache_capacity (:obj:`int`, `optional`):
105
+ The number of words that the BPE cache can contain. The cache allows
106
+ to speed-up the process by keeping the result of the merge operations
107
+ for a number of words.
108
+
109
+ dropout (:obj:`float`, `optional`):
110
+ A float between 0 and 1 that represents the BPE dropout to use.
111
+
112
+ unk_token (:obj:`str`, `optional`):
113
+ The unknown token to be used by the model.
114
+
115
+ continuing_subword_prefix (:obj:`str`, `optional`):
116
+ The prefix to attach to subword units that don't represent a beginning of word.
117
+
118
+ end_of_word_suffix (:obj:`str`, `optional`):
119
+ The suffix to attach to subword units that represent an end of word.
120
+
121
+ fuse_unk (:obj:`bool`, `optional`):
122
+ Whether to fuse any subsequent unknown tokens into a single one
123
+
124
+ byte_fallback (:obj:`bool`, `optional`):
125
+ Whether to use spm byte-fallback trick (defaults to False)
126
+
127
+ ignore_merges (:obj:`bool`, `optional`):
128
+ Whether or not to match tokens with the vocab before using merges.
129
+ """
130
+ def __init__(
131
+ self,
132
+ vocab=None,
133
+ merges=None,
134
+ cache_capacity=None,
135
+ dropout=None,
136
+ unk_token=None,
137
+ continuing_subword_prefix=None,
138
+ end_of_word_suffix=None,
139
+ fuse_unk=None,
140
+ byte_fallback=False,
141
+ ignore_merges=False,
142
+ ):
143
+ pass
144
+
145
+ def __getstate__(self):
146
+ """ """
147
+ pass
148
+
149
+ def __setstate__(self, state):
150
+ """ """
151
+ pass
152
+
153
+ @property
154
+ def byte_fallback(self):
155
+ """ """
156
+ pass
157
+
158
+ @byte_fallback.setter
159
+ def byte_fallback(self, value):
160
+ """ """
161
+ pass
162
+
163
+ @property
164
+ def continuing_subword_prefix(self):
165
+ """ """
166
+ pass
167
+
168
+ @continuing_subword_prefix.setter
169
+ def continuing_subword_prefix(self, value):
170
+ """ """
171
+ pass
172
+
173
+ @property
174
+ def dropout(self):
175
+ """ """
176
+ pass
177
+
178
+ @dropout.setter
179
+ def dropout(self, value):
180
+ """ """
181
+ pass
182
+
183
+ @property
184
+ def end_of_word_suffix(self):
185
+ """ """
186
+ pass
187
+
188
+ @end_of_word_suffix.setter
189
+ def end_of_word_suffix(self, value):
190
+ """ """
191
+ pass
192
+
193
+ @staticmethod
194
+ def from_file(vocab, merges, **kwargs):
195
+ """
196
+ Instantiate a BPE model from the given files.
197
+
198
+ This method is roughly equivalent to doing::
199
+
200
+ vocab, merges = BPE.read_file(vocab_filename, merges_filename)
201
+ bpe = BPE(vocab, merges)
202
+
203
+ If you don't need to keep the :obj:`vocab, merges` values lying around,
204
+ this method is more optimized than manually calling
205
+ :meth:`~tokenizers.models.BPE.read_file` to initialize a :class:`~tokenizers.models.BPE`
206
+
207
+ Args:
208
+ vocab (:obj:`str`):
209
+ The path to a :obj:`vocab.json` file
210
+
211
+ merges (:obj:`str`):
212
+ The path to a :obj:`merges.txt` file
213
+
214
+ Returns:
215
+ :class:`~tokenizers.models.BPE`: An instance of BPE loaded from these files
216
+ """
217
+ pass
218
+
219
+ @property
220
+ def fuse_unk(self):
221
+ """ """
222
+ pass
223
+
224
+ @fuse_unk.setter
225
+ def fuse_unk(self, value):
226
+ """ """
227
+ pass
228
+
229
+ def get_trainer(self):
230
+ """
231
+ Get the associated :class:`~tokenizers.trainers.Trainer`
232
+
233
+ Retrieve the :class:`~tokenizers.trainers.Trainer` associated to this
234
+ :class:`~tokenizers.models.Model`.
235
+
236
+ Returns:
237
+ :class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model
238
+ """
239
+ pass
240
+
241
+ def id_to_token(self, id):
242
+ """
243
+ Get the token associated to an ID
244
+
245
+ Args:
246
+ id (:obj:`int`):
247
+ An ID to convert to a token
248
+
249
+ Returns:
250
+ :obj:`str`: The token associated to the ID
251
+ """
252
+ pass
253
+
254
+ @property
255
+ def ignore_merges(self):
256
+ """ """
257
+ pass
258
+
259
+ @ignore_merges.setter
260
+ def ignore_merges(self, value):
261
+ """ """
262
+ pass
263
+
264
+ @staticmethod
265
+ def read_file(vocab, merges):
266
+ """
267
+ Read a :obj:`vocab.json` and a :obj:`merges.txt` files
268
+
269
+ This method provides a way to read and parse the content of these files,
270
+ returning the relevant data structures. If you want to instantiate some BPE models
271
+ from memory, this method gives you the expected input from the standard files.
272
+
273
+ Args:
274
+ vocab (:obj:`str`):
275
+ The path to a :obj:`vocab.json` file
276
+
277
+ merges (:obj:`str`):
278
+ The path to a :obj:`merges.txt` file
279
+
280
+ Returns:
281
+ A :obj:`Tuple` with the vocab and the merges:
282
+ The vocabulary and merges loaded into memory
283
+ """
284
+ pass
285
+
286
+ def save(self, folder, prefix):
287
+ """
288
+ Save the current model
289
+
290
+ Save the current model in the given folder, using the given prefix for the various
291
+ files that will get created.
292
+ Any file with the same name that already exists in this folder will be overwritten.
293
+
294
+ Args:
295
+ folder (:obj:`str`):
296
+ The path to the target folder in which to save the various files
297
+
298
+ prefix (:obj:`str`, `optional`):
299
+ An optional prefix, used to prefix each file name
300
+
301
+ Returns:
302
+ :obj:`List[str]`: The list of saved files
303
+ """
304
+ pass
305
+
306
+ def token_to_id(self, tokens):
307
+ """
308
+ Get the ID associated to a token
309
+
310
+ Args:
311
+ token (:obj:`str`):
312
+ A token to convert to an ID
313
+
314
+ Returns:
315
+ :obj:`int`: The ID associated to the token
316
+ """
317
+ pass
318
+
319
+ def tokenize(self, sequence):
320
+ """
321
+ Tokenize a sequence
322
+
323
+ Args:
324
+ sequence (:obj:`str`):
325
+ A sequence to tokenize
326
+
327
+ Returns:
328
+ A :obj:`List` of :class:`~tokenizers.Token`: The generated tokens
329
+ """
330
+ pass
331
+
332
+ @property
333
+ def unk_token(self):
334
+ """ """
335
+ pass
336
+
337
+ @unk_token.setter
338
+ def unk_token(self, value):
339
+ """ """
340
+ pass
341
+
342
+ class Unigram(Model):
343
+ """
344
+ An implementation of the Unigram algorithm
345
+
346
+ Args:
347
+ vocab (:obj:`List[Tuple[str, float]]`, `optional`, `optional`):
348
+ A list of vocabulary items and their relative score [("am", -0.2442),...]
349
+ """
350
+ def __init__(self, vocab=None, unk_id=None, byte_fallback=None):
351
+ pass
352
+
353
+ def __getstate__(self):
354
+ """ """
355
+ pass
356
+
357
+ def __setstate__(self, state):
358
+ """ """
359
+ pass
360
+
361
+ def get_trainer(self):
362
+ """
363
+ Get the associated :class:`~tokenizers.trainers.Trainer`
364
+
365
+ Retrieve the :class:`~tokenizers.trainers.Trainer` associated to this
366
+ :class:`~tokenizers.models.Model`.
367
+
368
+ Returns:
369
+ :class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model
370
+ """
371
+ pass
372
+
373
+ def id_to_token(self, id):
374
+ """
375
+ Get the token associated to an ID
376
+
377
+ Args:
378
+ id (:obj:`int`):
379
+ An ID to convert to a token
380
+
381
+ Returns:
382
+ :obj:`str`: The token associated to the ID
383
+ """
384
+ pass
385
+
386
+ def save(self, folder, prefix):
387
+ """
388
+ Save the current model
389
+
390
+ Save the current model in the given folder, using the given prefix for the various
391
+ files that will get created.
392
+ Any file with the same name that already exists in this folder will be overwritten.
393
+
394
+ Args:
395
+ folder (:obj:`str`):
396
+ The path to the target folder in which to save the various files
397
+
398
+ prefix (:obj:`str`, `optional`):
399
+ An optional prefix, used to prefix each file name
400
+
401
+ Returns:
402
+ :obj:`List[str]`: The list of saved files
403
+ """
404
+ pass
405
+
406
+ def token_to_id(self, tokens):
407
+ """
408
+ Get the ID associated to a token
409
+
410
+ Args:
411
+ token (:obj:`str`):
412
+ A token to convert to an ID
413
+
414
+ Returns:
415
+ :obj:`int`: The ID associated to the token
416
+ """
417
+ pass
418
+
419
+ def tokenize(self, sequence):
420
+ """
421
+ Tokenize a sequence
422
+
423
+ Args:
424
+ sequence (:obj:`str`):
425
+ A sequence to tokenize
426
+
427
+ Returns:
428
+ A :obj:`List` of :class:`~tokenizers.Token`: The generated tokens
429
+ """
430
+ pass
431
+
432
+ class WordLevel(Model):
433
+ """
434
+ An implementation of the WordLevel algorithm
435
+
436
+ Most simple tokenizer model based on mapping tokens to their corresponding id.
437
+
438
+ Args:
439
+ vocab (:obj:`str`, `optional`):
440
+ A dictionary of string keys and their ids :obj:`{"am": 0,...}`
441
+
442
+ unk_token (:obj:`str`, `optional`):
443
+ The unknown token to be used by the model.
444
+ """
445
+ def __init__(self, vocab=None, unk_token=None):
446
+ pass
447
+
448
+ def __getstate__(self):
449
+ """ """
450
+ pass
451
+
452
+ def __setstate__(self, state):
453
+ """ """
454
+ pass
455
+
456
+ @staticmethod
457
+ def from_file(vocab, unk_token=None):
458
+ """
459
+ Instantiate a WordLevel model from the given file
460
+
461
+ This method is roughly equivalent to doing::
462
+
463
+ vocab = WordLevel.read_file(vocab_filename)
464
+ wordlevel = WordLevel(vocab)
465
+
466
+ If you don't need to keep the :obj:`vocab` values lying around, this method is
467
+ more optimized than manually calling :meth:`~tokenizers.models.WordLevel.read_file` to
468
+ initialize a :class:`~tokenizers.models.WordLevel`
469
+
470
+ Args:
471
+ vocab (:obj:`str`):
472
+ The path to a :obj:`vocab.json` file
473
+
474
+ Returns:
475
+ :class:`~tokenizers.models.WordLevel`: An instance of WordLevel loaded from file
476
+ """
477
+ pass
478
+
479
+ def get_trainer(self):
480
+ """
481
+ Get the associated :class:`~tokenizers.trainers.Trainer`
482
+
483
+ Retrieve the :class:`~tokenizers.trainers.Trainer` associated to this
484
+ :class:`~tokenizers.models.Model`.
485
+
486
+ Returns:
487
+ :class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model
488
+ """
489
+ pass
490
+
491
+ def id_to_token(self, id):
492
+ """
493
+ Get the token associated to an ID
494
+
495
+ Args:
496
+ id (:obj:`int`):
497
+ An ID to convert to a token
498
+
499
+ Returns:
500
+ :obj:`str`: The token associated to the ID
501
+ """
502
+ pass
503
+
504
+ @staticmethod
505
+ def read_file(vocab):
506
+ """
507
+ Read a :obj:`vocab.json`
508
+
509
+ This method provides a way to read and parse the content of a vocabulary file,
510
+ returning the relevant data structures. If you want to instantiate some WordLevel models
511
+ from memory, this method gives you the expected input from the standard files.
512
+
513
+ Args:
514
+ vocab (:obj:`str`):
515
+ The path to a :obj:`vocab.json` file
516
+
517
+ Returns:
518
+ :obj:`Dict[str, int]`: The vocabulary as a :obj:`dict`
519
+ """
520
+ pass
521
+
522
+ def save(self, folder, prefix):
523
+ """
524
+ Save the current model
525
+
526
+ Save the current model in the given folder, using the given prefix for the various
527
+ files that will get created.
528
+ Any file with the same name that already exists in this folder will be overwritten.
529
+
530
+ Args:
531
+ folder (:obj:`str`):
532
+ The path to the target folder in which to save the various files
533
+
534
+ prefix (:obj:`str`, `optional`):
535
+ An optional prefix, used to prefix each file name
536
+
537
+ Returns:
538
+ :obj:`List[str]`: The list of saved files
539
+ """
540
+ pass
541
+
542
+ def token_to_id(self, tokens):
543
+ """
544
+ Get the ID associated to a token
545
+
546
+ Args:
547
+ token (:obj:`str`):
548
+ A token to convert to an ID
549
+
550
+ Returns:
551
+ :obj:`int`: The ID associated to the token
552
+ """
553
+ pass
554
+
555
+ def tokenize(self, sequence):
556
+ """
557
+ Tokenize a sequence
558
+
559
+ Args:
560
+ sequence (:obj:`str`):
561
+ A sequence to tokenize
562
+
563
+ Returns:
564
+ A :obj:`List` of :class:`~tokenizers.Token`: The generated tokens
565
+ """
566
+ pass
567
+
568
+ @property
569
+ def unk_token(self):
570
+ """ """
571
+ pass
572
+
573
+ @unk_token.setter
574
+ def unk_token(self, value):
575
+ """ """
576
+ pass
577
+
578
+ class WordPiece(Model):
579
+ """
580
+ An implementation of the WordPiece algorithm
581
+
582
+ Args:
583
+ vocab (:obj:`Dict[str, int]`, `optional`):
584
+ A dictionary of string keys and their ids :obj:`{"am": 0,...}`
585
+
586
+ unk_token (:obj:`str`, `optional`):
587
+ The unknown token to be used by the model.
588
+
589
+ max_input_chars_per_word (:obj:`int`, `optional`):
590
+ The maximum number of characters to authorize in a single word.
591
+ """
592
+ def __init__(self, vocab=None, unk_token="[UNK]", max_input_chars_per_word=100, continuing_subword_prefix="##"):
593
+ pass
594
+
595
+ def __getstate__(self):
596
+ """ """
597
+ pass
598
+
599
+ def __setstate__(self, state):
600
+ """ """
601
+ pass
602
+
603
+ @property
604
+ def continuing_subword_prefix(self):
605
+ """ """
606
+ pass
607
+
608
+ @continuing_subword_prefix.setter
609
+ def continuing_subword_prefix(self, value):
610
+ """ """
611
+ pass
612
+
613
+ @staticmethod
614
+ def from_file(vocab, **kwargs):
615
+ """
616
+ Instantiate a WordPiece model from the given file
617
+
618
+ This method is roughly equivalent to doing::
619
+
620
+ vocab = WordPiece.read_file(vocab_filename)
621
+ wordpiece = WordPiece(vocab)
622
+
623
+ If you don't need to keep the :obj:`vocab` values lying around, this method is
624
+ more optimized than manually calling :meth:`~tokenizers.models.WordPiece.read_file` to
625
+ initialize a :class:`~tokenizers.models.WordPiece`
626
+
627
+ Args:
628
+ vocab (:obj:`str`):
629
+ The path to a :obj:`vocab.txt` file
630
+
631
+ Returns:
632
+ :class:`~tokenizers.models.WordPiece`: An instance of WordPiece loaded from file
633
+ """
634
+ pass
635
+
636
+ def get_trainer(self):
637
+ """
638
+ Get the associated :class:`~tokenizers.trainers.Trainer`
639
+
640
+ Retrieve the :class:`~tokenizers.trainers.Trainer` associated to this
641
+ :class:`~tokenizers.models.Model`.
642
+
643
+ Returns:
644
+ :class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model
645
+ """
646
+ pass
647
+
648
+ def id_to_token(self, id):
649
+ """
650
+ Get the token associated to an ID
651
+
652
+ Args:
653
+ id (:obj:`int`):
654
+ An ID to convert to a token
655
+
656
+ Returns:
657
+ :obj:`str`: The token associated to the ID
658
+ """
659
+ pass
660
+
661
+ @property
662
+ def max_input_chars_per_word(self):
663
+ """ """
664
+ pass
665
+
666
+ @max_input_chars_per_word.setter
667
+ def max_input_chars_per_word(self, value):
668
+ """ """
669
+ pass
670
+
671
+ @staticmethod
672
+ def read_file(vocab):
673
+ """
674
+ Read a :obj:`vocab.txt` file
675
+
676
+ This method provides a way to read and parse the content of a standard `vocab.txt`
677
+ file as used by the WordPiece Model, returning the relevant data structures. If you
678
+ want to instantiate some WordPiece models from memory, this method gives you the
679
+ expected input from the standard files.
680
+
681
+ Args:
682
+ vocab (:obj:`str`):
683
+ The path to a :obj:`vocab.txt` file
684
+
685
+ Returns:
686
+ :obj:`Dict[str, int]`: The vocabulary as a :obj:`dict`
687
+ """
688
+ pass
689
+
690
+ def save(self, folder, prefix):
691
+ """
692
+ Save the current model
693
+
694
+ Save the current model in the given folder, using the given prefix for the various
695
+ files that will get created.
696
+ Any file with the same name that already exists in this folder will be overwritten.
697
+
698
+ Args:
699
+ folder (:obj:`str`):
700
+ The path to the target folder in which to save the various files
701
+
702
+ prefix (:obj:`str`, `optional`):
703
+ An optional prefix, used to prefix each file name
704
+
705
+ Returns:
706
+ :obj:`List[str]`: The list of saved files
707
+ """
708
+ pass
709
+
710
+ def token_to_id(self, tokens):
711
+ """
712
+ Get the ID associated to a token
713
+
714
+ Args:
715
+ token (:obj:`str`):
716
+ A token to convert to an ID
717
+
718
+ Returns:
719
+ :obj:`int`: The ID associated to the token
720
+ """
721
+ pass
722
+
723
+ def tokenize(self, sequence):
724
+ """
725
+ Tokenize a sequence
726
+
727
+ Args:
728
+ sequence (:obj:`str`):
729
+ A sequence to tokenize
730
+
731
+ Returns:
732
+ A :obj:`List` of :class:`~tokenizers.Token`: The generated tokens
733
+ """
734
+ pass
735
+
736
+ @property
737
+ def unk_token(self):
738
+ """ """
739
+ pass
740
+
741
+ @unk_token.setter
742
+ def unk_token(self, value):
743
+ """ """
744
+ pass
source/tokenizers/normalizers/__init__.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .. import normalizers
2
+
3
+
4
+ Normalizer = normalizers.Normalizer
5
+ BertNormalizer = normalizers.BertNormalizer
6
+ NFD = normalizers.NFD
7
+ NFKD = normalizers.NFKD
8
+ NFC = normalizers.NFC
9
+ NFKC = normalizers.NFKC
10
+ Sequence = normalizers.Sequence
11
+ Lowercase = normalizers.Lowercase
12
+ Prepend = normalizers.Prepend
13
+ Strip = normalizers.Strip
14
+ StripAccents = normalizers.StripAccents
15
+ Nmt = normalizers.Nmt
16
+ Precompiled = normalizers.Precompiled
17
+ Replace = normalizers.Replace
18
+ ByteLevel = normalizers.ByteLevel
19
+
20
+ NORMALIZERS = {"nfc": NFC, "nfd": NFD, "nfkc": NFKC, "nfkd": NFKD}
21
+
22
+
23
+ def unicode_normalizer_from_str(normalizer: str) -> Normalizer:
24
+ if normalizer not in NORMALIZERS:
25
+ raise ValueError(
26
+ "{} is not a known unicode normalizer. Available are {}".format(normalizer, NORMALIZERS.keys())
27
+ )
28
+
29
+ return NORMALIZERS[normalizer]()
source/tokenizers/normalizers/__init__.pyi ADDED
@@ -0,0 +1,946 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Generated content DO NOT EDIT
2
+ class Normalizer:
3
+ """
4
+ Base class for all normalizers
5
+
6
+ This class is not supposed to be instantiated directly. Instead, any implementation of a
7
+ Normalizer will return an instance of this class when instantiated.
8
+ """
9
+ def __getstate__(self):
10
+ """ """
11
+ pass
12
+
13
+ def __setstate__(self, state):
14
+ """ """
15
+ pass
16
+
17
+ @staticmethod
18
+ def custom(normalizer):
19
+ """ """
20
+ pass
21
+
22
+ def normalize(self, normalized):
23
+ """
24
+ Normalize a :class:`~tokenizers.NormalizedString` in-place
25
+
26
+ This method allows to modify a :class:`~tokenizers.NormalizedString` to
27
+ keep track of the alignment information. If you just want to see the result
28
+ of the normalization on a raw string, you can use
29
+ :meth:`~tokenizers.normalizers.Normalizer.normalize_str`
30
+
31
+ Args:
32
+ normalized (:class:`~tokenizers.NormalizedString`):
33
+ The normalized string on which to apply this
34
+ :class:`~tokenizers.normalizers.Normalizer`
35
+ """
36
+ pass
37
+
38
+ def normalize_str(self, sequence):
39
+ """
40
+ Normalize the given string
41
+
42
+ This method provides a way to visualize the effect of a
43
+ :class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
44
+ information. If you need to get/convert offsets, you can use
45
+ :meth:`~tokenizers.normalizers.Normalizer.normalize`
46
+
47
+ Args:
48
+ sequence (:obj:`str`):
49
+ A string to normalize
50
+
51
+ Returns:
52
+ :obj:`str`: A string after normalization
53
+ """
54
+ pass
55
+
56
+ class BertNormalizer(Normalizer):
57
+ """
58
+ BertNormalizer
59
+
60
+ Takes care of normalizing raw text before giving it to a Bert model.
61
+ This includes cleaning the text, handling accents, chinese chars and lowercasing
62
+
63
+ Args:
64
+ clean_text (:obj:`bool`, `optional`, defaults to :obj:`True`):
65
+ Whether to clean the text, by removing any control characters
66
+ and replacing all whitespaces by the classic one.
67
+
68
+ handle_chinese_chars (:obj:`bool`, `optional`, defaults to :obj:`True`):
69
+ Whether to handle chinese chars by putting spaces around them.
70
+
71
+ strip_accents (:obj:`bool`, `optional`):
72
+ Whether to strip all accents. If this option is not specified (ie == None),
73
+ then it will be determined by the value for `lowercase` (as in the original Bert).
74
+
75
+ lowercase (:obj:`bool`, `optional`, defaults to :obj:`True`):
76
+ Whether to lowercase.
77
+ """
78
+ def __init__(self, clean_text=True, handle_chinese_chars=True, strip_accents=None, lowercase=True):
79
+ pass
80
+
81
+ def __getstate__(self):
82
+ """ """
83
+ pass
84
+
85
+ def __setstate__(self, state):
86
+ """ """
87
+ pass
88
+
89
+ @property
90
+ def clean_text(self):
91
+ """ """
92
+ pass
93
+
94
+ @clean_text.setter
95
+ def clean_text(self, value):
96
+ """ """
97
+ pass
98
+
99
+ @staticmethod
100
+ def custom(normalizer):
101
+ """ """
102
+ pass
103
+
104
+ @property
105
+ def handle_chinese_chars(self):
106
+ """ """
107
+ pass
108
+
109
+ @handle_chinese_chars.setter
110
+ def handle_chinese_chars(self, value):
111
+ """ """
112
+ pass
113
+
114
+ @property
115
+ def lowercase(self):
116
+ """ """
117
+ pass
118
+
119
+ @lowercase.setter
120
+ def lowercase(self, value):
121
+ """ """
122
+ pass
123
+
124
+ def normalize(self, normalized):
125
+ """
126
+ Normalize a :class:`~tokenizers.NormalizedString` in-place
127
+
128
+ This method allows to modify a :class:`~tokenizers.NormalizedString` to
129
+ keep track of the alignment information. If you just want to see the result
130
+ of the normalization on a raw string, you can use
131
+ :meth:`~tokenizers.normalizers.Normalizer.normalize_str`
132
+
133
+ Args:
134
+ normalized (:class:`~tokenizers.NormalizedString`):
135
+ The normalized string on which to apply this
136
+ :class:`~tokenizers.normalizers.Normalizer`
137
+ """
138
+ pass
139
+
140
+ def normalize_str(self, sequence):
141
+ """
142
+ Normalize the given string
143
+
144
+ This method provides a way to visualize the effect of a
145
+ :class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
146
+ information. If you need to get/convert offsets, you can use
147
+ :meth:`~tokenizers.normalizers.Normalizer.normalize`
148
+
149
+ Args:
150
+ sequence (:obj:`str`):
151
+ A string to normalize
152
+
153
+ Returns:
154
+ :obj:`str`: A string after normalization
155
+ """
156
+ pass
157
+
158
+ @property
159
+ def strip_accents(self):
160
+ """ """
161
+ pass
162
+
163
+ @strip_accents.setter
164
+ def strip_accents(self, value):
165
+ """ """
166
+ pass
167
+
168
+ class ByteLevel(Normalizer):
169
+ """
170
+ Bytelevel Normalizer
171
+ """
172
+ def __init__(self):
173
+ pass
174
+
175
+ def __getstate__(self):
176
+ """ """
177
+ pass
178
+
179
+ def __setstate__(self, state):
180
+ """ """
181
+ pass
182
+
183
+ @staticmethod
184
+ def custom(normalizer):
185
+ """ """
186
+ pass
187
+
188
+ def normalize(self, normalized):
189
+ """
190
+ Normalize a :class:`~tokenizers.NormalizedString` in-place
191
+
192
+ This method allows to modify a :class:`~tokenizers.NormalizedString` to
193
+ keep track of the alignment information. If you just want to see the result
194
+ of the normalization on a raw string, you can use
195
+ :meth:`~tokenizers.normalizers.Normalizer.normalize_str`
196
+
197
+ Args:
198
+ normalized (:class:`~tokenizers.NormalizedString`):
199
+ The normalized string on which to apply this
200
+ :class:`~tokenizers.normalizers.Normalizer`
201
+ """
202
+ pass
203
+
204
+ def normalize_str(self, sequence):
205
+ """
206
+ Normalize the given string
207
+
208
+ This method provides a way to visualize the effect of a
209
+ :class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
210
+ information. If you need to get/convert offsets, you can use
211
+ :meth:`~tokenizers.normalizers.Normalizer.normalize`
212
+
213
+ Args:
214
+ sequence (:obj:`str`):
215
+ A string to normalize
216
+
217
+ Returns:
218
+ :obj:`str`: A string after normalization
219
+ """
220
+ pass
221
+
222
+ class Lowercase(Normalizer):
223
+ """
224
+ Lowercase Normalizer
225
+ """
226
+ def __init__(self):
227
+ pass
228
+
229
+ def __getstate__(self):
230
+ """ """
231
+ pass
232
+
233
+ def __setstate__(self, state):
234
+ """ """
235
+ pass
236
+
237
+ @staticmethod
238
+ def custom(normalizer):
239
+ """ """
240
+ pass
241
+
242
+ def normalize(self, normalized):
243
+ """
244
+ Normalize a :class:`~tokenizers.NormalizedString` in-place
245
+
246
+ This method allows to modify a :class:`~tokenizers.NormalizedString` to
247
+ keep track of the alignment information. If you just want to see the result
248
+ of the normalization on a raw string, you can use
249
+ :meth:`~tokenizers.normalizers.Normalizer.normalize_str`
250
+
251
+ Args:
252
+ normalized (:class:`~tokenizers.NormalizedString`):
253
+ The normalized string on which to apply this
254
+ :class:`~tokenizers.normalizers.Normalizer`
255
+ """
256
+ pass
257
+
258
+ def normalize_str(self, sequence):
259
+ """
260
+ Normalize the given string
261
+
262
+ This method provides a way to visualize the effect of a
263
+ :class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
264
+ information. If you need to get/convert offsets, you can use
265
+ :meth:`~tokenizers.normalizers.Normalizer.normalize`
266
+
267
+ Args:
268
+ sequence (:obj:`str`):
269
+ A string to normalize
270
+
271
+ Returns:
272
+ :obj:`str`: A string after normalization
273
+ """
274
+ pass
275
+
276
+ class NFC(Normalizer):
277
+ """
278
+ NFC Unicode Normalizer
279
+ """
280
+ def __init__(self):
281
+ pass
282
+
283
+ def __getstate__(self):
284
+ """ """
285
+ pass
286
+
287
+ def __setstate__(self, state):
288
+ """ """
289
+ pass
290
+
291
+ @staticmethod
292
+ def custom(normalizer):
293
+ """ """
294
+ pass
295
+
296
+ def normalize(self, normalized):
297
+ """
298
+ Normalize a :class:`~tokenizers.NormalizedString` in-place
299
+
300
+ This method allows to modify a :class:`~tokenizers.NormalizedString` to
301
+ keep track of the alignment information. If you just want to see the result
302
+ of the normalization on a raw string, you can use
303
+ :meth:`~tokenizers.normalizers.Normalizer.normalize_str`
304
+
305
+ Args:
306
+ normalized (:class:`~tokenizers.NormalizedString`):
307
+ The normalized string on which to apply this
308
+ :class:`~tokenizers.normalizers.Normalizer`
309
+ """
310
+ pass
311
+
312
+ def normalize_str(self, sequence):
313
+ """
314
+ Normalize the given string
315
+
316
+ This method provides a way to visualize the effect of a
317
+ :class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
318
+ information. If you need to get/convert offsets, you can use
319
+ :meth:`~tokenizers.normalizers.Normalizer.normalize`
320
+
321
+ Args:
322
+ sequence (:obj:`str`):
323
+ A string to normalize
324
+
325
+ Returns:
326
+ :obj:`str`: A string after normalization
327
+ """
328
+ pass
329
+
330
+ class NFD(Normalizer):
331
+ """
332
+ NFD Unicode Normalizer
333
+ """
334
+ def __init__(self):
335
+ pass
336
+
337
+ def __getstate__(self):
338
+ """ """
339
+ pass
340
+
341
+ def __setstate__(self, state):
342
+ """ """
343
+ pass
344
+
345
+ @staticmethod
346
+ def custom(normalizer):
347
+ """ """
348
+ pass
349
+
350
+ def normalize(self, normalized):
351
+ """
352
+ Normalize a :class:`~tokenizers.NormalizedString` in-place
353
+
354
+ This method allows to modify a :class:`~tokenizers.NormalizedString` to
355
+ keep track of the alignment information. If you just want to see the result
356
+ of the normalization on a raw string, you can use
357
+ :meth:`~tokenizers.normalizers.Normalizer.normalize_str`
358
+
359
+ Args:
360
+ normalized (:class:`~tokenizers.NormalizedString`):
361
+ The normalized string on which to apply this
362
+ :class:`~tokenizers.normalizers.Normalizer`
363
+ """
364
+ pass
365
+
366
+ def normalize_str(self, sequence):
367
+ """
368
+ Normalize the given string
369
+
370
+ This method provides a way to visualize the effect of a
371
+ :class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
372
+ information. If you need to get/convert offsets, you can use
373
+ :meth:`~tokenizers.normalizers.Normalizer.normalize`
374
+
375
+ Args:
376
+ sequence (:obj:`str`):
377
+ A string to normalize
378
+
379
+ Returns:
380
+ :obj:`str`: A string after normalization
381
+ """
382
+ pass
383
+
384
+ class NFKC(Normalizer):
385
+ """
386
+ NFKC Unicode Normalizer
387
+ """
388
+ def __init__(self):
389
+ pass
390
+
391
+ def __getstate__(self):
392
+ """ """
393
+ pass
394
+
395
+ def __setstate__(self, state):
396
+ """ """
397
+ pass
398
+
399
+ @staticmethod
400
+ def custom(normalizer):
401
+ """ """
402
+ pass
403
+
404
+ def normalize(self, normalized):
405
+ """
406
+ Normalize a :class:`~tokenizers.NormalizedString` in-place
407
+
408
+ This method allows to modify a :class:`~tokenizers.NormalizedString` to
409
+ keep track of the alignment information. If you just want to see the result
410
+ of the normalization on a raw string, you can use
411
+ :meth:`~tokenizers.normalizers.Normalizer.normalize_str`
412
+
413
+ Args:
414
+ normalized (:class:`~tokenizers.NormalizedString`):
415
+ The normalized string on which to apply this
416
+ :class:`~tokenizers.normalizers.Normalizer`
417
+ """
418
+ pass
419
+
420
+ def normalize_str(self, sequence):
421
+ """
422
+ Normalize the given string
423
+
424
+ This method provides a way to visualize the effect of a
425
+ :class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
426
+ information. If you need to get/convert offsets, you can use
427
+ :meth:`~tokenizers.normalizers.Normalizer.normalize`
428
+
429
+ Args:
430
+ sequence (:obj:`str`):
431
+ A string to normalize
432
+
433
+ Returns:
434
+ :obj:`str`: A string after normalization
435
+ """
436
+ pass
437
+
438
+ class NFKD(Normalizer):
439
+ """
440
+ NFKD Unicode Normalizer
441
+ """
442
+ def __init__(self):
443
+ pass
444
+
445
+ def __getstate__(self):
446
+ """ """
447
+ pass
448
+
449
+ def __setstate__(self, state):
450
+ """ """
451
+ pass
452
+
453
+ @staticmethod
454
+ def custom(normalizer):
455
+ """ """
456
+ pass
457
+
458
+ def normalize(self, normalized):
459
+ """
460
+ Normalize a :class:`~tokenizers.NormalizedString` in-place
461
+
462
+ This method allows to modify a :class:`~tokenizers.NormalizedString` to
463
+ keep track of the alignment information. If you just want to see the result
464
+ of the normalization on a raw string, you can use
465
+ :meth:`~tokenizers.normalizers.Normalizer.normalize_str`
466
+
467
+ Args:
468
+ normalized (:class:`~tokenizers.NormalizedString`):
469
+ The normalized string on which to apply this
470
+ :class:`~tokenizers.normalizers.Normalizer`
471
+ """
472
+ pass
473
+
474
+ def normalize_str(self, sequence):
475
+ """
476
+ Normalize the given string
477
+
478
+ This method provides a way to visualize the effect of a
479
+ :class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
480
+ information. If you need to get/convert offsets, you can use
481
+ :meth:`~tokenizers.normalizers.Normalizer.normalize`
482
+
483
+ Args:
484
+ sequence (:obj:`str`):
485
+ A string to normalize
486
+
487
+ Returns:
488
+ :obj:`str`: A string after normalization
489
+ """
490
+ pass
491
+
492
+ class Nmt(Normalizer):
493
+ """
494
+ Nmt normalizer
495
+ """
496
+ def __init__(self):
497
+ pass
498
+
499
+ def __getstate__(self):
500
+ """ """
501
+ pass
502
+
503
+ def __setstate__(self, state):
504
+ """ """
505
+ pass
506
+
507
+ @staticmethod
508
+ def custom(normalizer):
509
+ """ """
510
+ pass
511
+
512
+ def normalize(self, normalized):
513
+ """
514
+ Normalize a :class:`~tokenizers.NormalizedString` in-place
515
+
516
+ This method allows to modify a :class:`~tokenizers.NormalizedString` to
517
+ keep track of the alignment information. If you just want to see the result
518
+ of the normalization on a raw string, you can use
519
+ :meth:`~tokenizers.normalizers.Normalizer.normalize_str`
520
+
521
+ Args:
522
+ normalized (:class:`~tokenizers.NormalizedString`):
523
+ The normalized string on which to apply this
524
+ :class:`~tokenizers.normalizers.Normalizer`
525
+ """
526
+ pass
527
+
528
+ def normalize_str(self, sequence):
529
+ """
530
+ Normalize the given string
531
+
532
+ This method provides a way to visualize the effect of a
533
+ :class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
534
+ information. If you need to get/convert offsets, you can use
535
+ :meth:`~tokenizers.normalizers.Normalizer.normalize`
536
+
537
+ Args:
538
+ sequence (:obj:`str`):
539
+ A string to normalize
540
+
541
+ Returns:
542
+ :obj:`str`: A string after normalization
543
+ """
544
+ pass
545
+
546
+ class Precompiled(Normalizer):
547
+ """
548
+ Precompiled normalizer
549
+ Don't use manually it is used for compatibility for SentencePiece.
550
+ """
551
+ def __init__(self, precompiled_charsmap):
552
+ pass
553
+
554
+ def __getstate__(self):
555
+ """ """
556
+ pass
557
+
558
+ def __setstate__(self, state):
559
+ """ """
560
+ pass
561
+
562
+ @staticmethod
563
+ def custom(normalizer):
564
+ """ """
565
+ pass
566
+
567
+ def normalize(self, normalized):
568
+ """
569
+ Normalize a :class:`~tokenizers.NormalizedString` in-place
570
+
571
+ This method allows to modify a :class:`~tokenizers.NormalizedString` to
572
+ keep track of the alignment information. If you just want to see the result
573
+ of the normalization on a raw string, you can use
574
+ :meth:`~tokenizers.normalizers.Normalizer.normalize_str`
575
+
576
+ Args:
577
+ normalized (:class:`~tokenizers.NormalizedString`):
578
+ The normalized string on which to apply this
579
+ :class:`~tokenizers.normalizers.Normalizer`
580
+ """
581
+ pass
582
+
583
+ def normalize_str(self, sequence):
584
+ """
585
+ Normalize the given string
586
+
587
+ This method provides a way to visualize the effect of a
588
+ :class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
589
+ information. If you need to get/convert offsets, you can use
590
+ :meth:`~tokenizers.normalizers.Normalizer.normalize`
591
+
592
+ Args:
593
+ sequence (:obj:`str`):
594
+ A string to normalize
595
+
596
+ Returns:
597
+ :obj:`str`: A string after normalization
598
+ """
599
+ pass
600
+
601
+ class Prepend(Normalizer):
602
+ """
603
+ Prepend normalizer
604
+ """
605
+ def __init__(self, prepend):
606
+ pass
607
+
608
+ def __getstate__(self):
609
+ """ """
610
+ pass
611
+
612
+ def __setstate__(self, state):
613
+ """ """
614
+ pass
615
+
616
+ @staticmethod
617
+ def custom(normalizer):
618
+ """ """
619
+ pass
620
+
621
+ def normalize(self, normalized):
622
+ """
623
+ Normalize a :class:`~tokenizers.NormalizedString` in-place
624
+
625
+ This method allows to modify a :class:`~tokenizers.NormalizedString` to
626
+ keep track of the alignment information. If you just want to see the result
627
+ of the normalization on a raw string, you can use
628
+ :meth:`~tokenizers.normalizers.Normalizer.normalize_str`
629
+
630
+ Args:
631
+ normalized (:class:`~tokenizers.NormalizedString`):
632
+ The normalized string on which to apply this
633
+ :class:`~tokenizers.normalizers.Normalizer`
634
+ """
635
+ pass
636
+
637
+ def normalize_str(self, sequence):
638
+ """
639
+ Normalize the given string
640
+
641
+ This method provides a way to visualize the effect of a
642
+ :class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
643
+ information. If you need to get/convert offsets, you can use
644
+ :meth:`~tokenizers.normalizers.Normalizer.normalize`
645
+
646
+ Args:
647
+ sequence (:obj:`str`):
648
+ A string to normalize
649
+
650
+ Returns:
651
+ :obj:`str`: A string after normalization
652
+ """
653
+ pass
654
+
655
+ @property
656
+ def prepend(self):
657
+ """ """
658
+ pass
659
+
660
+ @prepend.setter
661
+ def prepend(self, value):
662
+ """ """
663
+ pass
664
+
665
+ class Replace(Normalizer):
666
+ """
667
+ Replace normalizer
668
+ """
669
+ def __init__(self, pattern, content):
670
+ pass
671
+
672
+ def __getstate__(self):
673
+ """ """
674
+ pass
675
+
676
+ def __setstate__(self, state):
677
+ """ """
678
+ pass
679
+
680
+ @property
681
+ def content(self):
682
+ """ """
683
+ pass
684
+
685
+ @content.setter
686
+ def content(self, value):
687
+ """ """
688
+ pass
689
+
690
+ @staticmethod
691
+ def custom(normalizer):
692
+ """ """
693
+ pass
694
+
695
+ def normalize(self, normalized):
696
+ """
697
+ Normalize a :class:`~tokenizers.NormalizedString` in-place
698
+
699
+ This method allows to modify a :class:`~tokenizers.NormalizedString` to
700
+ keep track of the alignment information. If you just want to see the result
701
+ of the normalization on a raw string, you can use
702
+ :meth:`~tokenizers.normalizers.Normalizer.normalize_str`
703
+
704
+ Args:
705
+ normalized (:class:`~tokenizers.NormalizedString`):
706
+ The normalized string on which to apply this
707
+ :class:`~tokenizers.normalizers.Normalizer`
708
+ """
709
+ pass
710
+
711
+ def normalize_str(self, sequence):
712
+ """
713
+ Normalize the given string
714
+
715
+ This method provides a way to visualize the effect of a
716
+ :class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
717
+ information. If you need to get/convert offsets, you can use
718
+ :meth:`~tokenizers.normalizers.Normalizer.normalize`
719
+
720
+ Args:
721
+ sequence (:obj:`str`):
722
+ A string to normalize
723
+
724
+ Returns:
725
+ :obj:`str`: A string after normalization
726
+ """
727
+ pass
728
+
729
+ @property
730
+ def pattern(self):
731
+ """ """
732
+ pass
733
+
734
+ @pattern.setter
735
+ def pattern(self, value):
736
+ """ """
737
+ pass
738
+
739
+ class Sequence(Normalizer):
740
+ """
741
+ Allows concatenating multiple other Normalizer as a Sequence.
742
+ All the normalizers run in sequence in the given order
743
+
744
+ Args:
745
+ normalizers (:obj:`List[Normalizer]`):
746
+ A list of Normalizer to be run as a sequence
747
+ """
748
+ def __init__(self, normalizers):
749
+ pass
750
+
751
+ def __getitem__(self, key):
752
+ """
753
+ Return self[key].
754
+ """
755
+ pass
756
+
757
+ def __getnewargs__(self):
758
+ """ """
759
+ pass
760
+
761
+ def __getstate__(self):
762
+ """ """
763
+ pass
764
+
765
+ def __setitem__(self, key, value):
766
+ """
767
+ Set self[key] to value.
768
+ """
769
+ pass
770
+
771
+ def __setstate__(self, state):
772
+ """ """
773
+ pass
774
+
775
+ @staticmethod
776
+ def custom(normalizer):
777
+ """ """
778
+ pass
779
+
780
+ def normalize(self, normalized):
781
+ """
782
+ Normalize a :class:`~tokenizers.NormalizedString` in-place
783
+
784
+ This method allows to modify a :class:`~tokenizers.NormalizedString` to
785
+ keep track of the alignment information. If you just want to see the result
786
+ of the normalization on a raw string, you can use
787
+ :meth:`~tokenizers.normalizers.Normalizer.normalize_str`
788
+
789
+ Args:
790
+ normalized (:class:`~tokenizers.NormalizedString`):
791
+ The normalized string on which to apply this
792
+ :class:`~tokenizers.normalizers.Normalizer`
793
+ """
794
+ pass
795
+
796
+ def normalize_str(self, sequence):
797
+ """
798
+ Normalize the given string
799
+
800
+ This method provides a way to visualize the effect of a
801
+ :class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
802
+ information. If you need to get/convert offsets, you can use
803
+ :meth:`~tokenizers.normalizers.Normalizer.normalize`
804
+
805
+ Args:
806
+ sequence (:obj:`str`):
807
+ A string to normalize
808
+
809
+ Returns:
810
+ :obj:`str`: A string after normalization
811
+ """
812
+ pass
813
+
814
+ class Strip(Normalizer):
815
+ """
816
+ Strip normalizer
817
+ """
818
+ def __init__(self, left=True, right=True):
819
+ pass
820
+
821
+ def __getstate__(self):
822
+ """ """
823
+ pass
824
+
825
+ def __setstate__(self, state):
826
+ """ """
827
+ pass
828
+
829
+ @staticmethod
830
+ def custom(normalizer):
831
+ """ """
832
+ pass
833
+
834
+ @property
835
+ def left(self):
836
+ """ """
837
+ pass
838
+
839
+ @left.setter
840
+ def left(self, value):
841
+ """ """
842
+ pass
843
+
844
+ def normalize(self, normalized):
845
+ """
846
+ Normalize a :class:`~tokenizers.NormalizedString` in-place
847
+
848
+ This method allows to modify a :class:`~tokenizers.NormalizedString` to
849
+ keep track of the alignment information. If you just want to see the result
850
+ of the normalization on a raw string, you can use
851
+ :meth:`~tokenizers.normalizers.Normalizer.normalize_str`
852
+
853
+ Args:
854
+ normalized (:class:`~tokenizers.NormalizedString`):
855
+ The normalized string on which to apply this
856
+ :class:`~tokenizers.normalizers.Normalizer`
857
+ """
858
+ pass
859
+
860
+ def normalize_str(self, sequence):
861
+ """
862
+ Normalize the given string
863
+
864
+ This method provides a way to visualize the effect of a
865
+ :class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
866
+ information. If you need to get/convert offsets, you can use
867
+ :meth:`~tokenizers.normalizers.Normalizer.normalize`
868
+
869
+ Args:
870
+ sequence (:obj:`str`):
871
+ A string to normalize
872
+
873
+ Returns:
874
+ :obj:`str`: A string after normalization
875
+ """
876
+ pass
877
+
878
+ @property
879
+ def right(self):
880
+ """ """
881
+ pass
882
+
883
+ @right.setter
884
+ def right(self, value):
885
+ """ """
886
+ pass
887
+
888
+ class StripAccents(Normalizer):
889
+ """
890
+ StripAccents normalizer
891
+ """
892
+ def __init__(self):
893
+ pass
894
+
895
+ def __getstate__(self):
896
+ """ """
897
+ pass
898
+
899
+ def __setstate__(self, state):
900
+ """ """
901
+ pass
902
+
903
+ @staticmethod
904
+ def custom(normalizer):
905
+ """ """
906
+ pass
907
+
908
+ def normalize(self, normalized):
909
+ """
910
+ Normalize a :class:`~tokenizers.NormalizedString` in-place
911
+
912
+ This method allows to modify a :class:`~tokenizers.NormalizedString` to
913
+ keep track of the alignment information. If you just want to see the result
914
+ of the normalization on a raw string, you can use
915
+ :meth:`~tokenizers.normalizers.Normalizer.normalize_str`
916
+
917
+ Args:
918
+ normalized (:class:`~tokenizers.NormalizedString`):
919
+ The normalized string on which to apply this
920
+ :class:`~tokenizers.normalizers.Normalizer`
921
+ """
922
+ pass
923
+
924
+ def normalize_str(self, sequence):
925
+ """
926
+ Normalize the given string
927
+
928
+ This method provides a way to visualize the effect of a
929
+ :class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
930
+ information. If you need to get/convert offsets, you can use
931
+ :meth:`~tokenizers.normalizers.Normalizer.normalize`
932
+
933
+ Args:
934
+ sequence (:obj:`str`):
935
+ A string to normalize
936
+
937
+ Returns:
938
+ :obj:`str`: A string after normalization
939
+ """
940
+ pass
941
+
942
+ from typing import Dict
943
+
944
+ NORMALIZERS: Dict[str, Normalizer]
945
+
946
+ def unicode_normalizer_from_str(normalizer: str) -> Normalizer: ...
source/tokenizers/pre_tokenizers/__init__.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Generated content DO NOT EDIT
2
+ from .. import pre_tokenizers
3
+
4
+ PreTokenizer = pre_tokenizers.PreTokenizer
5
+ BertPreTokenizer = pre_tokenizers.BertPreTokenizer
6
+ ByteLevel = pre_tokenizers.ByteLevel
7
+ CharDelimiterSplit = pre_tokenizers.CharDelimiterSplit
8
+ Digits = pre_tokenizers.Digits
9
+ FixedLength = pre_tokenizers.FixedLength
10
+ Metaspace = pre_tokenizers.Metaspace
11
+ Punctuation = pre_tokenizers.Punctuation
12
+ Sequence = pre_tokenizers.Sequence
13
+ Split = pre_tokenizers.Split
14
+ UnicodeScripts = pre_tokenizers.UnicodeScripts
15
+ Whitespace = pre_tokenizers.Whitespace
16
+ WhitespaceSplit = pre_tokenizers.WhitespaceSplit
source/tokenizers/pre_tokenizers/__init__.pyi ADDED
@@ -0,0 +1,1015 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Generated content DO NOT EDIT
2
+ class PreTokenizer:
3
+ """
4
+ Base class for all pre-tokenizers
5
+
6
+ This class is not supposed to be instantiated directly. Instead, any implementation of a
7
+ PreTokenizer will return an instance of this class when instantiated.
8
+ """
9
+ def __getstate__(self):
10
+ """ """
11
+ pass
12
+
13
+ def __setstate__(self, state):
14
+ """ """
15
+ pass
16
+
17
+ @staticmethod
18
+ def custom(pretok):
19
+ """ """
20
+ pass
21
+
22
+ def pre_tokenize(self, pretok):
23
+ """
24
+ Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
25
+
26
+ This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
27
+ keep track of the pre-tokenization, and leverage the capabilities of the
28
+ :class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
29
+ the pre-tokenization of a raw string, you can use
30
+ :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
31
+
32
+ Args:
33
+ pretok (:class:`~tokenizers.PreTokenizedString):
34
+ The pre-tokenized string on which to apply this
35
+ :class:`~tokenizers.pre_tokenizers.PreTokenizer`
36
+ """
37
+ pass
38
+
39
+ def pre_tokenize_str(self, sequence):
40
+ """
41
+ Pre tokenize the given string
42
+
43
+ This method provides a way to visualize the effect of a
44
+ :class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
45
+ alignment, nor does it provide all the capabilities of the
46
+ :class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
47
+ :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
48
+
49
+ Args:
50
+ sequence (:obj:`str`):
51
+ A string to pre-tokeize
52
+
53
+ Returns:
54
+ :obj:`List[Tuple[str, Offsets]]`:
55
+ A list of tuple with the pre-tokenized parts and their offsets
56
+ """
57
+ pass
58
+
59
+ class BertPreTokenizer(PreTokenizer):
60
+ """
61
+ BertPreTokenizer
62
+
63
+ This pre-tokenizer splits tokens on spaces, and also on punctuation.
64
+ Each occurrence of a punctuation character will be treated separately.
65
+ """
66
+ def __init__(self):
67
+ pass
68
+
69
+ def __getstate__(self):
70
+ """ """
71
+ pass
72
+
73
+ def __setstate__(self, state):
74
+ """ """
75
+ pass
76
+
77
+ @staticmethod
78
+ def custom(pretok):
79
+ """ """
80
+ pass
81
+
82
+ def pre_tokenize(self, pretok):
83
+ """
84
+ Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
85
+
86
+ This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
87
+ keep track of the pre-tokenization, and leverage the capabilities of the
88
+ :class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
89
+ the pre-tokenization of a raw string, you can use
90
+ :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
91
+
92
+ Args:
93
+ pretok (:class:`~tokenizers.PreTokenizedString):
94
+ The pre-tokenized string on which to apply this
95
+ :class:`~tokenizers.pre_tokenizers.PreTokenizer`
96
+ """
97
+ pass
98
+
99
+ def pre_tokenize_str(self, sequence):
100
+ """
101
+ Pre tokenize the given string
102
+
103
+ This method provides a way to visualize the effect of a
104
+ :class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
105
+ alignment, nor does it provide all the capabilities of the
106
+ :class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
107
+ :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
108
+
109
+ Args:
110
+ sequence (:obj:`str`):
111
+ A string to pre-tokeize
112
+
113
+ Returns:
114
+ :obj:`List[Tuple[str, Offsets]]`:
115
+ A list of tuple with the pre-tokenized parts and their offsets
116
+ """
117
+ pass
118
+
119
+ class ByteLevel(PreTokenizer):
120
+ """
121
+ ByteLevel PreTokenizer
122
+
123
+ This pre-tokenizer takes care of replacing all bytes of the given string
124
+ with a corresponding representation, as well as splitting into words.
125
+
126
+ Args:
127
+ add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`True`):
128
+ Whether to add a space to the first word if there isn't already one. This
129
+ lets us treat `hello` exactly like `say hello`.
130
+ use_regex (:obj:`bool`, `optional`, defaults to :obj:`True`):
131
+ Set this to :obj:`False` to prevent this `pre_tokenizer` from using
132
+ the GPT2 specific regexp for spliting on whitespace.
133
+ """
134
+ def __init__(self, add_prefix_space=True, trim_offsets=True, use_regex=True):
135
+ pass
136
+
137
+ def __getstate__(self):
138
+ """ """
139
+ pass
140
+
141
+ def __setstate__(self, state):
142
+ """ """
143
+ pass
144
+
145
+ @property
146
+ def add_prefix_space(self):
147
+ """ """
148
+ pass
149
+
150
+ @add_prefix_space.setter
151
+ def add_prefix_space(self, value):
152
+ """ """
153
+ pass
154
+
155
+ @staticmethod
156
+ def alphabet():
157
+ """
158
+ Returns the alphabet used by this PreTokenizer.
159
+
160
+ Since the ByteLevel works as its name suggests, at the byte level, it
161
+ encodes each byte value to a unique visible character. This means that there is a
162
+ total of 256 different characters composing this alphabet.
163
+
164
+ Returns:
165
+ :obj:`List[str]`: A list of characters that compose the alphabet
166
+ """
167
+ pass
168
+
169
+ @staticmethod
170
+ def custom(pretok):
171
+ """ """
172
+ pass
173
+
174
+ def pre_tokenize(self, pretok):
175
+ """
176
+ Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
177
+
178
+ This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
179
+ keep track of the pre-tokenization, and leverage the capabilities of the
180
+ :class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
181
+ the pre-tokenization of a raw string, you can use
182
+ :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
183
+
184
+ Args:
185
+ pretok (:class:`~tokenizers.PreTokenizedString):
186
+ The pre-tokenized string on which to apply this
187
+ :class:`~tokenizers.pre_tokenizers.PreTokenizer`
188
+ """
189
+ pass
190
+
191
+ def pre_tokenize_str(self, sequence):
192
+ """
193
+ Pre tokenize the given string
194
+
195
+ This method provides a way to visualize the effect of a
196
+ :class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
197
+ alignment, nor does it provide all the capabilities of the
198
+ :class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
199
+ :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
200
+
201
+ Args:
202
+ sequence (:obj:`str`):
203
+ A string to pre-tokeize
204
+
205
+ Returns:
206
+ :obj:`List[Tuple[str, Offsets]]`:
207
+ A list of tuple with the pre-tokenized parts and their offsets
208
+ """
209
+ pass
210
+
211
+ @property
212
+ def trim_offsets(self):
213
+ """ """
214
+ pass
215
+
216
+ @trim_offsets.setter
217
+ def trim_offsets(self, value):
218
+ """ """
219
+ pass
220
+
221
+ @property
222
+ def use_regex(self):
223
+ """ """
224
+ pass
225
+
226
+ @use_regex.setter
227
+ def use_regex(self, value):
228
+ """ """
229
+ pass
230
+
231
+ class CharDelimiterSplit(PreTokenizer):
232
+ """
233
+ This pre-tokenizer simply splits on the provided char. Works like `.split(delimiter)`
234
+
235
+ Args:
236
+ delimiter: str:
237
+ The delimiter char that will be used to split input
238
+ """
239
+ def __init__(self, delimiter):
240
+ pass
241
+
242
+ def __getnewargs__(self):
243
+ """ """
244
+ pass
245
+
246
+ def __getstate__(self):
247
+ """ """
248
+ pass
249
+
250
+ def __setstate__(self, state):
251
+ """ """
252
+ pass
253
+
254
+ @staticmethod
255
+ def custom(pretok):
256
+ """ """
257
+ pass
258
+
259
+ @property
260
+ def delimiter(self):
261
+ """ """
262
+ pass
263
+
264
+ @delimiter.setter
265
+ def delimiter(self, value):
266
+ """ """
267
+ pass
268
+
269
+ def pre_tokenize(self, pretok):
270
+ """
271
+ Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
272
+
273
+ This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
274
+ keep track of the pre-tokenization, and leverage the capabilities of the
275
+ :class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
276
+ the pre-tokenization of a raw string, you can use
277
+ :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
278
+
279
+ Args:
280
+ pretok (:class:`~tokenizers.PreTokenizedString):
281
+ The pre-tokenized string on which to apply this
282
+ :class:`~tokenizers.pre_tokenizers.PreTokenizer`
283
+ """
284
+ pass
285
+
286
+ def pre_tokenize_str(self, sequence):
287
+ """
288
+ Pre tokenize the given string
289
+
290
+ This method provides a way to visualize the effect of a
291
+ :class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
292
+ alignment, nor does it provide all the capabilities of the
293
+ :class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
294
+ :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
295
+
296
+ Args:
297
+ sequence (:obj:`str`):
298
+ A string to pre-tokeize
299
+
300
+ Returns:
301
+ :obj:`List[Tuple[str, Offsets]]`:
302
+ A list of tuple with the pre-tokenized parts and their offsets
303
+ """
304
+ pass
305
+
306
+ class Digits(PreTokenizer):
307
+ """
308
+ This pre-tokenizer simply splits using the digits in separate tokens
309
+
310
+ Args:
311
+ individual_digits (:obj:`bool`, `optional`, defaults to :obj:`False`):
312
+ If set to True, digits will each be separated as follows::
313
+
314
+ "Call 123 please" -> "Call ", "1", "2", "3", " please"
315
+
316
+ If set to False, digits will grouped as follows::
317
+
318
+ "Call 123 please" -> "Call ", "123", " please"
319
+ """
320
+ def __init__(self, individual_digits=False):
321
+ pass
322
+
323
+ def __getstate__(self):
324
+ """ """
325
+ pass
326
+
327
+ def __setstate__(self, state):
328
+ """ """
329
+ pass
330
+
331
+ @staticmethod
332
+ def custom(pretok):
333
+ """ """
334
+ pass
335
+
336
+ @property
337
+ def individual_digits(self):
338
+ """ """
339
+ pass
340
+
341
+ @individual_digits.setter
342
+ def individual_digits(self, value):
343
+ """ """
344
+ pass
345
+
346
+ def pre_tokenize(self, pretok):
347
+ """
348
+ Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
349
+
350
+ This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
351
+ keep track of the pre-tokenization, and leverage the capabilities of the
352
+ :class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
353
+ the pre-tokenization of a raw string, you can use
354
+ :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
355
+
356
+ Args:
357
+ pretok (:class:`~tokenizers.PreTokenizedString):
358
+ The pre-tokenized string on which to apply this
359
+ :class:`~tokenizers.pre_tokenizers.PreTokenizer`
360
+ """
361
+ pass
362
+
363
+ def pre_tokenize_str(self, sequence):
364
+ """
365
+ Pre tokenize the given string
366
+
367
+ This method provides a way to visualize the effect of a
368
+ :class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
369
+ alignment, nor does it provide all the capabilities of the
370
+ :class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
371
+ :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
372
+
373
+ Args:
374
+ sequence (:obj:`str`):
375
+ A string to pre-tokeize
376
+
377
+ Returns:
378
+ :obj:`List[Tuple[str, Offsets]]`:
379
+ A list of tuple with the pre-tokenized parts and their offsets
380
+ """
381
+ pass
382
+
383
+ class FixedLength(PreTokenizer):
384
+ """
385
+ This pre-tokenizer splits the text into fixed length chunks as used
386
+ [here](https://www.biorxiv.org/content/10.1101/2023.01.11.523679v1.full)
387
+
388
+ Args:
389
+ length (:obj:`int`, `optional`, defaults to :obj:`5`):
390
+ The length of the chunks to split the text into.
391
+
392
+ Strings are split on the character level rather than the byte level to avoid
393
+ splitting unicode characters consisting of multiple bytes.
394
+ """
395
+ def __init__(self, length=5):
396
+ pass
397
+
398
+ def __getstate__(self):
399
+ """ """
400
+ pass
401
+
402
+ def __setstate__(self, state):
403
+ """ """
404
+ pass
405
+
406
+ @staticmethod
407
+ def custom(pretok):
408
+ """ """
409
+ pass
410
+
411
+ @property
412
+ def length(self):
413
+ """ """
414
+ pass
415
+
416
+ @length.setter
417
+ def length(self, value):
418
+ """ """
419
+ pass
420
+
421
+ def pre_tokenize(self, pretok):
422
+ """
423
+ Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
424
+
425
+ This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
426
+ keep track of the pre-tokenization, and leverage the capabilities of the
427
+ :class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
428
+ the pre-tokenization of a raw string, you can use
429
+ :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
430
+
431
+ Args:
432
+ pretok (:class:`~tokenizers.PreTokenizedString):
433
+ The pre-tokenized string on which to apply this
434
+ :class:`~tokenizers.pre_tokenizers.PreTokenizer`
435
+ """
436
+ pass
437
+
438
+ def pre_tokenize_str(self, sequence):
439
+ """
440
+ Pre tokenize the given string
441
+
442
+ This method provides a way to visualize the effect of a
443
+ :class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
444
+ alignment, nor does it provide all the capabilities of the
445
+ :class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
446
+ :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
447
+
448
+ Args:
449
+ sequence (:obj:`str`):
450
+ A string to pre-tokeize
451
+
452
+ Returns:
453
+ :obj:`List[Tuple[str, Offsets]]`:
454
+ A list of tuple with the pre-tokenized parts and their offsets
455
+ """
456
+ pass
457
+
458
+ class Metaspace(PreTokenizer):
459
+ """
460
+ Metaspace pre-tokenizer
461
+
462
+ This pre-tokenizer replaces any whitespace by the provided replacement character.
463
+ It then tries to split on these spaces.
464
+
465
+ Args:
466
+ replacement (:obj:`str`, `optional`, defaults to :obj:`▁`):
467
+ The replacement character. Must be exactly one character. By default we
468
+ use the `▁` (U+2581) meta symbol (Same as in SentencePiece).
469
+
470
+ prepend_scheme (:obj:`str`, `optional`, defaults to :obj:`"always"`):
471
+ Whether to add a space to the first word if there isn't already one. This
472
+ lets us treat `hello` exactly like `say hello`.
473
+ Choices: "always", "never", "first". First means the space is only added on the first
474
+ token (relevant when special tokens are used or other pre_tokenizer are used).
475
+
476
+ """
477
+ def __init__(self, replacement="_", prepend_scheme="always", split=True):
478
+ pass
479
+
480
+ def __getstate__(self):
481
+ """ """
482
+ pass
483
+
484
+ def __setstate__(self, state):
485
+ """ """
486
+ pass
487
+
488
+ @staticmethod
489
+ def custom(pretok):
490
+ """ """
491
+ pass
492
+
493
+ def pre_tokenize(self, pretok):
494
+ """
495
+ Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
496
+
497
+ This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
498
+ keep track of the pre-tokenization, and leverage the capabilities of the
499
+ :class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
500
+ the pre-tokenization of a raw string, you can use
501
+ :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
502
+
503
+ Args:
504
+ pretok (:class:`~tokenizers.PreTokenizedString):
505
+ The pre-tokenized string on which to apply this
506
+ :class:`~tokenizers.pre_tokenizers.PreTokenizer`
507
+ """
508
+ pass
509
+
510
+ def pre_tokenize_str(self, sequence):
511
+ """
512
+ Pre tokenize the given string
513
+
514
+ This method provides a way to visualize the effect of a
515
+ :class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
516
+ alignment, nor does it provide all the capabilities of the
517
+ :class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
518
+ :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
519
+
520
+ Args:
521
+ sequence (:obj:`str`):
522
+ A string to pre-tokeize
523
+
524
+ Returns:
525
+ :obj:`List[Tuple[str, Offsets]]`:
526
+ A list of tuple with the pre-tokenized parts and their offsets
527
+ """
528
+ pass
529
+
530
+ @property
531
+ def prepend_scheme(self):
532
+ """ """
533
+ pass
534
+
535
+ @prepend_scheme.setter
536
+ def prepend_scheme(self, value):
537
+ """ """
538
+ pass
539
+
540
+ @property
541
+ def replacement(self):
542
+ """ """
543
+ pass
544
+
545
+ @replacement.setter
546
+ def replacement(self, value):
547
+ """ """
548
+ pass
549
+
550
+ @property
551
+ def split(self):
552
+ """ """
553
+ pass
554
+
555
+ @split.setter
556
+ def split(self, value):
557
+ """ """
558
+ pass
559
+
560
+ class Punctuation(PreTokenizer):
561
+ """
562
+ This pre-tokenizer simply splits on punctuation as individual characters.
563
+
564
+ Args:
565
+ behavior (:class:`~tokenizers.SplitDelimiterBehavior`):
566
+ The behavior to use when splitting.
567
+ Choices: "removed", "isolated" (default), "merged_with_previous", "merged_with_next",
568
+ "contiguous"
569
+ """
570
+ def __init__(self, behavior="isolated"):
571
+ pass
572
+
573
+ def __getstate__(self):
574
+ """ """
575
+ pass
576
+
577
+ def __setstate__(self, state):
578
+ """ """
579
+ pass
580
+
581
+ @property
582
+ def behavior(self):
583
+ """ """
584
+ pass
585
+
586
+ @behavior.setter
587
+ def behavior(self, value):
588
+ """ """
589
+ pass
590
+
591
+ @staticmethod
592
+ def custom(pretok):
593
+ """ """
594
+ pass
595
+
596
+ def pre_tokenize(self, pretok):
597
+ """
598
+ Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
599
+
600
+ This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
601
+ keep track of the pre-tokenization, and leverage the capabilities of the
602
+ :class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
603
+ the pre-tokenization of a raw string, you can use
604
+ :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
605
+
606
+ Args:
607
+ pretok (:class:`~tokenizers.PreTokenizedString):
608
+ The pre-tokenized string on which to apply this
609
+ :class:`~tokenizers.pre_tokenizers.PreTokenizer`
610
+ """
611
+ pass
612
+
613
+ def pre_tokenize_str(self, sequence):
614
+ """
615
+ Pre tokenize the given string
616
+
617
+ This method provides a way to visualize the effect of a
618
+ :class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
619
+ alignment, nor does it provide all the capabilities of the
620
+ :class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
621
+ :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
622
+
623
+ Args:
624
+ sequence (:obj:`str`):
625
+ A string to pre-tokeize
626
+
627
+ Returns:
628
+ :obj:`List[Tuple[str, Offsets]]`:
629
+ A list of tuple with the pre-tokenized parts and their offsets
630
+ """
631
+ pass
632
+
633
+ class Sequence(PreTokenizer):
634
+ """
635
+ This pre-tokenizer composes other pre_tokenizers and applies them in sequence
636
+ """
637
+ def __init__(self, pretokenizers):
638
+ pass
639
+
640
+ def __getitem__(self, key):
641
+ """
642
+ Return self[key].
643
+ """
644
+ pass
645
+
646
+ def __getnewargs__(self):
647
+ """ """
648
+ pass
649
+
650
+ def __getstate__(self):
651
+ """ """
652
+ pass
653
+
654
+ def __setitem__(self, key, value):
655
+ """
656
+ Set self[key] to value.
657
+ """
658
+ pass
659
+
660
+ def __setstate__(self, state):
661
+ """ """
662
+ pass
663
+
664
+ @staticmethod
665
+ def custom(pretok):
666
+ """ """
667
+ pass
668
+
669
+ def pre_tokenize(self, pretok):
670
+ """
671
+ Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
672
+
673
+ This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
674
+ keep track of the pre-tokenization, and leverage the capabilities of the
675
+ :class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
676
+ the pre-tokenization of a raw string, you can use
677
+ :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
678
+
679
+ Args:
680
+ pretok (:class:`~tokenizers.PreTokenizedString):
681
+ The pre-tokenized string on which to apply this
682
+ :class:`~tokenizers.pre_tokenizers.PreTokenizer`
683
+ """
684
+ pass
685
+
686
+ def pre_tokenize_str(self, sequence):
687
+ """
688
+ Pre tokenize the given string
689
+
690
+ This method provides a way to visualize the effect of a
691
+ :class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
692
+ alignment, nor does it provide all the capabilities of the
693
+ :class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
694
+ :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
695
+
696
+ Args:
697
+ sequence (:obj:`str`):
698
+ A string to pre-tokeize
699
+
700
+ Returns:
701
+ :obj:`List[Tuple[str, Offsets]]`:
702
+ A list of tuple with the pre-tokenized parts and their offsets
703
+ """
704
+ pass
705
+
706
+ class Split(PreTokenizer):
707
+ """
708
+ Split PreTokenizer
709
+
710
+ This versatile pre-tokenizer splits using the provided pattern and
711
+ according to the provided behavior. The pattern can be inverted by
712
+ making use of the invert flag.
713
+
714
+ Args:
715
+ pattern (:obj:`str` or :class:`~tokenizers.Regex`):
716
+ A pattern used to split the string. Usually a string or a regex built with `tokenizers.Regex`.
717
+ If you want to use a regex pattern, it has to be wrapped around a `tokenizers.Regex`,
718
+ otherwise we consider is as a string pattern. For example `pattern="|"`
719
+ means you want to split on `|` (imagine a csv file for example), while
720
+ `pattern=tokenizers.Regex("1|2")` means you split on either '1' or '2'.
721
+ behavior (:class:`~tokenizers.SplitDelimiterBehavior`):
722
+ The behavior to use when splitting.
723
+ Choices: "removed", "isolated", "merged_with_previous", "merged_with_next",
724
+ "contiguous"
725
+
726
+ invert (:obj:`bool`, `optional`, defaults to :obj:`False`):
727
+ Whether to invert the pattern.
728
+ """
729
+ def __init__(self, pattern, behavior, invert=False):
730
+ pass
731
+
732
+ def __getnewargs__(self):
733
+ """ """
734
+ pass
735
+
736
+ def __getstate__(self):
737
+ """ """
738
+ pass
739
+
740
+ def __setstate__(self, state):
741
+ """ """
742
+ pass
743
+
744
+ @property
745
+ def behavior(self):
746
+ """ """
747
+ pass
748
+
749
+ @behavior.setter
750
+ def behavior(self, value):
751
+ """ """
752
+ pass
753
+
754
+ @staticmethod
755
+ def custom(pretok):
756
+ """ """
757
+ pass
758
+
759
+ @property
760
+ def invert(self):
761
+ """ """
762
+ pass
763
+
764
+ @invert.setter
765
+ def invert(self, value):
766
+ """ """
767
+ pass
768
+
769
+ @property
770
+ def pattern(self):
771
+ """ """
772
+ pass
773
+
774
+ @pattern.setter
775
+ def pattern(self, value):
776
+ """ """
777
+ pass
778
+
779
+ def pre_tokenize(self, pretok):
780
+ """
781
+ Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
782
+
783
+ This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
784
+ keep track of the pre-tokenization, and leverage the capabilities of the
785
+ :class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
786
+ the pre-tokenization of a raw string, you can use
787
+ :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
788
+
789
+ Args:
790
+ pretok (:class:`~tokenizers.PreTokenizedString):
791
+ The pre-tokenized string on which to apply this
792
+ :class:`~tokenizers.pre_tokenizers.PreTokenizer`
793
+ """
794
+ pass
795
+
796
+ def pre_tokenize_str(self, sequence):
797
+ """
798
+ Pre tokenize the given string
799
+
800
+ This method provides a way to visualize the effect of a
801
+ :class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
802
+ alignment, nor does it provide all the capabilities of the
803
+ :class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
804
+ :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
805
+
806
+ Args:
807
+ sequence (:obj:`str`):
808
+ A string to pre-tokeize
809
+
810
+ Returns:
811
+ :obj:`List[Tuple[str, Offsets]]`:
812
+ A list of tuple with the pre-tokenized parts and their offsets
813
+ """
814
+ pass
815
+
816
+ class UnicodeScripts(PreTokenizer):
817
+ """
818
+ This pre-tokenizer splits on characters that belong to different language family
819
+ It roughly follows https://github.com/google/sentencepiece/blob/master/data/Scripts.txt
820
+ Actually Hiragana and Katakana are fused with Han, and 0x30FC is Han too.
821
+ This mimicks SentencePiece Unigram implementation.
822
+ """
823
+ def __init__(self):
824
+ pass
825
+
826
+ def __getstate__(self):
827
+ """ """
828
+ pass
829
+
830
+ def __setstate__(self, state):
831
+ """ """
832
+ pass
833
+
834
+ @staticmethod
835
+ def custom(pretok):
836
+ """ """
837
+ pass
838
+
839
+ def pre_tokenize(self, pretok):
840
+ """
841
+ Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
842
+
843
+ This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
844
+ keep track of the pre-tokenization, and leverage the capabilities of the
845
+ :class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
846
+ the pre-tokenization of a raw string, you can use
847
+ :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
848
+
849
+ Args:
850
+ pretok (:class:`~tokenizers.PreTokenizedString):
851
+ The pre-tokenized string on which to apply this
852
+ :class:`~tokenizers.pre_tokenizers.PreTokenizer`
853
+ """
854
+ pass
855
+
856
+ def pre_tokenize_str(self, sequence):
857
+ """
858
+ Pre tokenize the given string
859
+
860
+ This method provides a way to visualize the effect of a
861
+ :class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
862
+ alignment, nor does it provide all the capabilities of the
863
+ :class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
864
+ :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
865
+
866
+ Args:
867
+ sequence (:obj:`str`):
868
+ A string to pre-tokeize
869
+
870
+ Returns:
871
+ :obj:`List[Tuple[str, Offsets]]`:
872
+ A list of tuple with the pre-tokenized parts and their offsets
873
+ """
874
+ pass
875
+
876
+ class Whitespace(PreTokenizer):
877
+ """
878
+ This pre-tokenizer splits on word boundaries according to the `\w+|[^\w\s]+`
879
+ regex pattern. It splits on word characters or characters that aren't words or
880
+ whitespaces (punctuation such as hyphens, apostrophes, commas, etc.).
881
+
882
+ Example:
883
+ Use the `Whitespace` function as shown below::
884
+
885
+ ```python
886
+ from tokenizers.pre_tokenizers import Whitespace
887
+
888
+ pre_tokenizer = Whitespace()
889
+ text = "Hello, world! Let's try the Whitespace pre-tokenizer."
890
+ pre_tokenizer.pre_tokenize_str(text)
891
+ [('Hello', (0, 5)),
892
+ (',', (5, 6)),
893
+ ('world', (7, 12)),
894
+ ('!', (12, 13)),
895
+ ('Let', (14, 17)),
896
+ ("'", (17, 18)),
897
+ ('s', (18, 19)),
898
+ ('try', (20, 23)),
899
+ ('the', (24, 27)),
900
+ ('Whitespace', (28, 38)),
901
+ ('pre', (39, 42)),
902
+ ('-', (42, 43)),
903
+ ('tokenizer', (43, 52)),
904
+ ('.', (52, 53))]
905
+ ```
906
+ """
907
+ def __init__(self):
908
+ pass
909
+
910
+ def __getstate__(self):
911
+ """ """
912
+ pass
913
+
914
+ def __setstate__(self, state):
915
+ """ """
916
+ pass
917
+
918
+ @staticmethod
919
+ def custom(pretok):
920
+ """ """
921
+ pass
922
+
923
+ def pre_tokenize(self, pretok):
924
+ """
925
+ Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
926
+
927
+ This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
928
+ keep track of the pre-tokenization, and leverage the capabilities of the
929
+ :class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
930
+ the pre-tokenization of a raw string, you can use
931
+ :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
932
+
933
+ Args:
934
+ pretok (:class:`~tokenizers.PreTokenizedString):
935
+ The pre-tokenized string on which to apply this
936
+ :class:`~tokenizers.pre_tokenizers.PreTokenizer`
937
+ """
938
+ pass
939
+
940
+ def pre_tokenize_str(self, sequence):
941
+ """
942
+ Pre tokenize the given string
943
+
944
+ This method provides a way to visualize the effect of a
945
+ :class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
946
+ alignment, nor does it provide all the capabilities of the
947
+ :class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
948
+ :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
949
+
950
+ Args:
951
+ sequence (:obj:`str`):
952
+ A string to pre-tokeize
953
+
954
+ Returns:
955
+ :obj:`List[Tuple[str, Offsets]]`:
956
+ A list of tuple with the pre-tokenized parts and their offsets
957
+ """
958
+ pass
959
+
960
+ class WhitespaceSplit(PreTokenizer):
961
+ """
962
+ This pre-tokenizer simply splits on the whitespace. Works like `.split()`
963
+ """
964
+ def __init__(self):
965
+ pass
966
+
967
+ def __getstate__(self):
968
+ """ """
969
+ pass
970
+
971
+ def __setstate__(self, state):
972
+ """ """
973
+ pass
974
+
975
+ @staticmethod
976
+ def custom(pretok):
977
+ """ """
978
+ pass
979
+
980
+ def pre_tokenize(self, pretok):
981
+ """
982
+ Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
983
+
984
+ This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
985
+ keep track of the pre-tokenization, and leverage the capabilities of the
986
+ :class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
987
+ the pre-tokenization of a raw string, you can use
988
+ :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
989
+
990
+ Args:
991
+ pretok (:class:`~tokenizers.PreTokenizedString):
992
+ The pre-tokenized string on which to apply this
993
+ :class:`~tokenizers.pre_tokenizers.PreTokenizer`
994
+ """
995
+ pass
996
+
997
+ def pre_tokenize_str(self, sequence):
998
+ """
999
+ Pre tokenize the given string
1000
+
1001
+ This method provides a way to visualize the effect of a
1002
+ :class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
1003
+ alignment, nor does it provide all the capabilities of the
1004
+ :class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
1005
+ :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
1006
+
1007
+ Args:
1008
+ sequence (:obj:`str`):
1009
+ A string to pre-tokeize
1010
+
1011
+ Returns:
1012
+ :obj:`List[Tuple[str, Offsets]]`:
1013
+ A list of tuple with the pre-tokenized parts and their offsets
1014
+ """
1015
+ pass
source/tokenizers/processors/__init__.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ # Generated content DO NOT EDIT
2
+ from .. import processors
3
+
4
+ PostProcessor = processors.PostProcessor
5
+ BertProcessing = processors.BertProcessing
6
+ ByteLevel = processors.ByteLevel
7
+ RobertaProcessing = processors.RobertaProcessing
8
+ Sequence = processors.Sequence
9
+ TemplateProcessing = processors.TemplateProcessing
source/tokenizers/processors/__init__.pyi ADDED
@@ -0,0 +1,519 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Generated content DO NOT EDIT
2
+ class PostProcessor:
3
+ """
4
+ Base class for all post-processors
5
+
6
+ This class is not supposed to be instantiated directly. Instead, any implementation of
7
+ a PostProcessor will return an instance of this class when instantiated.
8
+ """
9
+ def __getstate__(self):
10
+ """ """
11
+ pass
12
+
13
+ def __setstate__(self, state):
14
+ """ """
15
+ pass
16
+
17
+ def num_special_tokens_to_add(self, is_pair):
18
+ """
19
+ Return the number of special tokens that would be added for single/pair sentences.
20
+
21
+ Args:
22
+ is_pair (:obj:`bool`):
23
+ Whether the input would be a pair of sequences
24
+
25
+ Returns:
26
+ :obj:`int`: The number of tokens to add
27
+ """
28
+ pass
29
+
30
+ def process(self, encoding, pair=None, add_special_tokens=True):
31
+ """
32
+ Post-process the given encodings, generating the final one
33
+
34
+ Args:
35
+ encoding (:class:`~tokenizers.Encoding`):
36
+ The encoding for the first sequence
37
+
38
+ pair (:class:`~tokenizers.Encoding`, `optional`):
39
+ The encoding for the pair sequence
40
+
41
+ add_special_tokens (:obj:`bool`):
42
+ Whether to add the special tokens
43
+
44
+ Return:
45
+ :class:`~tokenizers.Encoding`: The final encoding
46
+ """
47
+ pass
48
+
49
+ class BertProcessing(PostProcessor):
50
+ """
51
+ This post-processor takes care of adding the special tokens needed by
52
+ a Bert model:
53
+
54
+ - a SEP token
55
+ - a CLS token
56
+
57
+ Args:
58
+ sep (:obj:`Tuple[str, int]`):
59
+ A tuple with the string representation of the SEP token, and its id
60
+
61
+ cls (:obj:`Tuple[str, int]`):
62
+ A tuple with the string representation of the CLS token, and its id
63
+ """
64
+ def __init__(self, sep, cls):
65
+ pass
66
+
67
+ def __getnewargs__(self):
68
+ """ """
69
+ pass
70
+
71
+ def __getstate__(self):
72
+ """ """
73
+ pass
74
+
75
+ def __setstate__(self, state):
76
+ """ """
77
+ pass
78
+
79
+ @property
80
+ def cls(self):
81
+ """ """
82
+ pass
83
+
84
+ @cls.setter
85
+ def cls(self, value):
86
+ """ """
87
+ pass
88
+
89
+ def num_special_tokens_to_add(self, is_pair):
90
+ """
91
+ Return the number of special tokens that would be added for single/pair sentences.
92
+
93
+ Args:
94
+ is_pair (:obj:`bool`):
95
+ Whether the input would be a pair of sequences
96
+
97
+ Returns:
98
+ :obj:`int`: The number of tokens to add
99
+ """
100
+ pass
101
+
102
+ def process(self, encoding, pair=None, add_special_tokens=True):
103
+ """
104
+ Post-process the given encodings, generating the final one
105
+
106
+ Args:
107
+ encoding (:class:`~tokenizers.Encoding`):
108
+ The encoding for the first sequence
109
+
110
+ pair (:class:`~tokenizers.Encoding`, `optional`):
111
+ The encoding for the pair sequence
112
+
113
+ add_special_tokens (:obj:`bool`):
114
+ Whether to add the special tokens
115
+
116
+ Return:
117
+ :class:`~tokenizers.Encoding`: The final encoding
118
+ """
119
+ pass
120
+
121
+ @property
122
+ def sep(self):
123
+ """ """
124
+ pass
125
+
126
+ @sep.setter
127
+ def sep(self, value):
128
+ """ """
129
+ pass
130
+
131
+ class ByteLevel(PostProcessor):
132
+ """
133
+ This post-processor takes care of trimming the offsets.
134
+
135
+ By default, the ByteLevel BPE might include whitespaces in the produced tokens. If you don't
136
+ want the offsets to include these whitespaces, then this PostProcessor must be used.
137
+
138
+ Args:
139
+ trim_offsets (:obj:`bool`):
140
+ Whether to trim the whitespaces from the produced offsets.
141
+
142
+ add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`True`):
143
+ If :obj:`True`, keeps the first token's offset as is. If :obj:`False`, increments
144
+ the start of the first token's offset by 1. Only has an effect if :obj:`trim_offsets`
145
+ is set to :obj:`True`.
146
+ """
147
+ def __init__(self, add_prefix_space=None, trim_offsets=None, use_regex=None):
148
+ pass
149
+
150
+ def __getstate__(self):
151
+ """ """
152
+ pass
153
+
154
+ def __setstate__(self, state):
155
+ """ """
156
+ pass
157
+
158
+ @property
159
+ def add_prefix_space(self):
160
+ """ """
161
+ pass
162
+
163
+ @add_prefix_space.setter
164
+ def add_prefix_space(self, value):
165
+ """ """
166
+ pass
167
+
168
+ def num_special_tokens_to_add(self, is_pair):
169
+ """
170
+ Return the number of special tokens that would be added for single/pair sentences.
171
+
172
+ Args:
173
+ is_pair (:obj:`bool`):
174
+ Whether the input would be a pair of sequences
175
+
176
+ Returns:
177
+ :obj:`int`: The number of tokens to add
178
+ """
179
+ pass
180
+
181
+ def process(self, encoding, pair=None, add_special_tokens=True):
182
+ """
183
+ Post-process the given encodings, generating the final one
184
+
185
+ Args:
186
+ encoding (:class:`~tokenizers.Encoding`):
187
+ The encoding for the first sequence
188
+
189
+ pair (:class:`~tokenizers.Encoding`, `optional`):
190
+ The encoding for the pair sequence
191
+
192
+ add_special_tokens (:obj:`bool`):
193
+ Whether to add the special tokens
194
+
195
+ Return:
196
+ :class:`~tokenizers.Encoding`: The final encoding
197
+ """
198
+ pass
199
+
200
+ @property
201
+ def trim_offsets(self):
202
+ """ """
203
+ pass
204
+
205
+ @trim_offsets.setter
206
+ def trim_offsets(self, value):
207
+ """ """
208
+ pass
209
+
210
+ @property
211
+ def use_regex(self):
212
+ """ """
213
+ pass
214
+
215
+ @use_regex.setter
216
+ def use_regex(self, value):
217
+ """ """
218
+ pass
219
+
220
+ class RobertaProcessing(PostProcessor):
221
+ """
222
+ This post-processor takes care of adding the special tokens needed by
223
+ a Roberta model:
224
+
225
+ - a SEP token
226
+ - a CLS token
227
+
228
+ It also takes care of trimming the offsets.
229
+ By default, the ByteLevel BPE might include whitespaces in the produced tokens. If you don't
230
+ want the offsets to include these whitespaces, then this PostProcessor should be initialized
231
+ with :obj:`trim_offsets=True`
232
+
233
+ Args:
234
+ sep (:obj:`Tuple[str, int]`):
235
+ A tuple with the string representation of the SEP token, and its id
236
+
237
+ cls (:obj:`Tuple[str, int]`):
238
+ A tuple with the string representation of the CLS token, and its id
239
+
240
+ trim_offsets (:obj:`bool`, `optional`, defaults to :obj:`True`):
241
+ Whether to trim the whitespaces from the produced offsets.
242
+
243
+ add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`True`):
244
+ Whether the add_prefix_space option was enabled during pre-tokenization. This
245
+ is relevant because it defines the way the offsets are trimmed out.
246
+ """
247
+ def __init__(self, sep, cls, trim_offsets=True, add_prefix_space=True):
248
+ pass
249
+
250
+ def __getnewargs__(self):
251
+ """ """
252
+ pass
253
+
254
+ def __getstate__(self):
255
+ """ """
256
+ pass
257
+
258
+ def __setstate__(self, state):
259
+ """ """
260
+ pass
261
+
262
+ @property
263
+ def add_prefix_space(self):
264
+ """ """
265
+ pass
266
+
267
+ @add_prefix_space.setter
268
+ def add_prefix_space(self, value):
269
+ """ """
270
+ pass
271
+
272
+ @property
273
+ def cls(self):
274
+ """ """
275
+ pass
276
+
277
+ @cls.setter
278
+ def cls(self, value):
279
+ """ """
280
+ pass
281
+
282
+ def num_special_tokens_to_add(self, is_pair):
283
+ """
284
+ Return the number of special tokens that would be added for single/pair sentences.
285
+
286
+ Args:
287
+ is_pair (:obj:`bool`):
288
+ Whether the input would be a pair of sequences
289
+
290
+ Returns:
291
+ :obj:`int`: The number of tokens to add
292
+ """
293
+ pass
294
+
295
+ def process(self, encoding, pair=None, add_special_tokens=True):
296
+ """
297
+ Post-process the given encodings, generating the final one
298
+
299
+ Args:
300
+ encoding (:class:`~tokenizers.Encoding`):
301
+ The encoding for the first sequence
302
+
303
+ pair (:class:`~tokenizers.Encoding`, `optional`):
304
+ The encoding for the pair sequence
305
+
306
+ add_special_tokens (:obj:`bool`):
307
+ Whether to add the special tokens
308
+
309
+ Return:
310
+ :class:`~tokenizers.Encoding`: The final encoding
311
+ """
312
+ pass
313
+
314
+ @property
315
+ def sep(self):
316
+ """ """
317
+ pass
318
+
319
+ @sep.setter
320
+ def sep(self, value):
321
+ """ """
322
+ pass
323
+
324
+ @property
325
+ def trim_offsets(self):
326
+ """ """
327
+ pass
328
+
329
+ @trim_offsets.setter
330
+ def trim_offsets(self, value):
331
+ """ """
332
+ pass
333
+
334
+ class Sequence(PostProcessor):
335
+ """
336
+ Sequence Processor
337
+
338
+ Args:
339
+ processors (:obj:`List[PostProcessor]`)
340
+ The processors that need to be chained
341
+ """
342
+ def __init__(self, processors):
343
+ pass
344
+
345
+ def __getitem__(self, key):
346
+ """
347
+ Return self[key].
348
+ """
349
+ pass
350
+
351
+ def __getnewargs__(self):
352
+ """ """
353
+ pass
354
+
355
+ def __getstate__(self):
356
+ """ """
357
+ pass
358
+
359
+ def __setitem__(self, key, value):
360
+ """
361
+ Set self[key] to value.
362
+ """
363
+ pass
364
+
365
+ def __setstate__(self, state):
366
+ """ """
367
+ pass
368
+
369
+ def num_special_tokens_to_add(self, is_pair):
370
+ """
371
+ Return the number of special tokens that would be added for single/pair sentences.
372
+
373
+ Args:
374
+ is_pair (:obj:`bool`):
375
+ Whether the input would be a pair of sequences
376
+
377
+ Returns:
378
+ :obj:`int`: The number of tokens to add
379
+ """
380
+ pass
381
+
382
+ def process(self, encoding, pair=None, add_special_tokens=True):
383
+ """
384
+ Post-process the given encodings, generating the final one
385
+
386
+ Args:
387
+ encoding (:class:`~tokenizers.Encoding`):
388
+ The encoding for the first sequence
389
+
390
+ pair (:class:`~tokenizers.Encoding`, `optional`):
391
+ The encoding for the pair sequence
392
+
393
+ add_special_tokens (:obj:`bool`):
394
+ Whether to add the special tokens
395
+
396
+ Return:
397
+ :class:`~tokenizers.Encoding`: The final encoding
398
+ """
399
+ pass
400
+
401
+ class TemplateProcessing(PostProcessor):
402
+ """
403
+ Provides a way to specify templates in order to add the special tokens to each
404
+ input sequence as relevant.
405
+
406
+ Let's take :obj:`BERT` tokenizer as an example. It uses two special tokens, used to
407
+ delimitate each sequence. :obj:`[CLS]` is always used at the beginning of the first
408
+ sequence, and :obj:`[SEP]` is added at the end of both the first, and the pair
409
+ sequences. The final result looks like this:
410
+
411
+ - Single sequence: :obj:`[CLS] Hello there [SEP]`
412
+ - Pair sequences: :obj:`[CLS] My name is Anthony [SEP] What is my name? [SEP]`
413
+
414
+ With the type ids as following::
415
+
416
+ [CLS] ... [SEP] ... [SEP]
417
+ 0 0 0 1 1
418
+
419
+ You can achieve such behavior using a TemplateProcessing::
420
+
421
+ TemplateProcessing(
422
+ single="[CLS] $0 [SEP]",
423
+ pair="[CLS] $A [SEP] $B:1 [SEP]:1",
424
+ special_tokens=[("[CLS]", 1), ("[SEP]", 0)],
425
+ )
426
+
427
+ In this example, each input sequence is identified using a ``$`` construct. This identifier
428
+ lets us specify each input sequence, and the type_id to use. When nothing is specified,
429
+ it uses the default values. Here are the different ways to specify it:
430
+
431
+ - Specifying the sequence, with default ``type_id == 0``: ``$A`` or ``$B``
432
+ - Specifying the `type_id` with default ``sequence == A``: ``$0``, ``$1``, ``$2``, ...
433
+ - Specifying both: ``$A:0``, ``$B:1``, ...
434
+
435
+ The same construct is used for special tokens: ``<identifier>(:<type_id>)?``.
436
+
437
+ **Warning**: You must ensure that you are giving the correct tokens/ids as these
438
+ will be added to the Encoding without any further check. If the given ids correspond
439
+ to something totally different in a `Tokenizer` using this `PostProcessor`, it
440
+ might lead to unexpected results.
441
+
442
+ Args:
443
+ single (:obj:`Template`):
444
+ The template used for single sequences
445
+
446
+ pair (:obj:`Template`):
447
+ The template used when both sequences are specified
448
+
449
+ special_tokens (:obj:`Tokens`):
450
+ The list of special tokens used in each sequences
451
+
452
+ Types:
453
+
454
+ Template (:obj:`str` or :obj:`List`):
455
+ - If a :obj:`str` is provided, the whitespace is used as delimiter between tokens
456
+ - If a :obj:`List[str]` is provided, a list of tokens
457
+
458
+ Tokens (:obj:`List[Union[Tuple[int, str], Tuple[str, int], dict]]`):
459
+ - A :obj:`Tuple` with both a token and its associated ID, in any order
460
+ - A :obj:`dict` with the following keys:
461
+ - "id": :obj:`str` => The special token id, as specified in the Template
462
+ - "ids": :obj:`List[int]` => The associated IDs
463
+ - "tokens": :obj:`List[str]` => The associated tokens
464
+
465
+ The given dict expects the provided :obj:`ids` and :obj:`tokens` lists to have
466
+ the same length.
467
+ """
468
+ def __init__(self, single=None, pair=None, special_tokens=None):
469
+ pass
470
+
471
+ def __getstate__(self):
472
+ """ """
473
+ pass
474
+
475
+ def __setstate__(self, state):
476
+ """ """
477
+ pass
478
+
479
+ def num_special_tokens_to_add(self, is_pair):
480
+ """
481
+ Return the number of special tokens that would be added for single/pair sentences.
482
+
483
+ Args:
484
+ is_pair (:obj:`bool`):
485
+ Whether the input would be a pair of sequences
486
+
487
+ Returns:
488
+ :obj:`int`: The number of tokens to add
489
+ """
490
+ pass
491
+
492
+ def process(self, encoding, pair=None, add_special_tokens=True):
493
+ """
494
+ Post-process the given encodings, generating the final one
495
+
496
+ Args:
497
+ encoding (:class:`~tokenizers.Encoding`):
498
+ The encoding for the first sequence
499
+
500
+ pair (:class:`~tokenizers.Encoding`, `optional`):
501
+ The encoding for the pair sequence
502
+
503
+ add_special_tokens (:obj:`bool`):
504
+ Whether to add the special tokens
505
+
506
+ Return:
507
+ :class:`~tokenizers.Encoding`: The final encoding
508
+ """
509
+ pass
510
+
511
+ @property
512
+ def single(self):
513
+ """ """
514
+ pass
515
+
516
+ @single.setter
517
+ def single(self, value):
518
+ """ """
519
+ pass
source/tokenizers/tokenizers.abi3.so ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c116fcf1e80d461ce0a35c332974f25949e8359416f50b3d53371810d2ce1ccc
3
+ size 10074176
source/tokenizers/tokenizers.pyi ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Generated content DO NOT EDIT
2
+ from . import (
3
+ AddedToken as AddedToken,
4
+ Encoding as Encoding,
5
+ NormalizedString as NormalizedString,
6
+ PreTokenizedString as PreTokenizedString,
7
+ Regex as Regex,
8
+ Token as Token,
9
+ Tokenizer as Tokenizer,
10
+ __version__ as __version__,
11
+ decoders as decoders,
12
+ models as models,
13
+ normalizers as normalizers,
14
+ pre_tokenizers as pre_tokenizers,
15
+ processors as processors,
16
+ trainers as trainers,
17
+ )
source/tokenizers/tools/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ from .visualizer import Annotation, EncodingVisualizer
source/tokenizers/tools/visualizer-styles.css ADDED
@@ -0,0 +1,170 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ .tokenized-text {
2
+ width:100%;
3
+ padding:2rem;
4
+ max-height: 400px;
5
+ overflow-y: auto;
6
+ box-sizing:border-box;
7
+ line-height:4rem; /* Lots of space between lines */
8
+ font-family: "Roboto Light", "Ubuntu Light", "Ubuntu", monospace;
9
+ box-shadow: 2px 2px 2px rgba(0,0,0,0.2);
10
+ background-color: rgba(0,0,0,0.01);
11
+ letter-spacing:2px; /* Give some extra separation between chars */
12
+ }
13
+ .non-token{
14
+ /* White space and other things the tokenizer ignores*/
15
+ white-space: pre;
16
+ letter-spacing:4px;
17
+ border-top:1px solid #A0A0A0; /* A gentle border on top and bottom makes tabs more ovious*/
18
+ border-bottom:1px solid #A0A0A0;
19
+ line-height: 1rem;
20
+ height: calc(100% - 2px);
21
+ }
22
+
23
+ .token {
24
+ white-space: pre;
25
+ position:relative;
26
+ color:black;
27
+ letter-spacing:2px;
28
+ }
29
+
30
+ .annotation{
31
+ white-space:nowrap; /* Important - ensures that annotations appears even if the annotated text wraps a line */
32
+ border-radius:4px;
33
+ position:relative;
34
+ width:fit-content;
35
+ }
36
+ .annotation:before {
37
+ /*The before holds the text and the after holds the background*/
38
+ z-index:1000; /* Make sure this is above the background */
39
+ content:attr(data-label); /* The annotations label is on a data attribute */
40
+ color:white;
41
+ position:absolute;
42
+ font-size:1rem;
43
+ text-align:center;
44
+ font-weight:bold;
45
+
46
+ top:1.75rem;
47
+ line-height:0;
48
+ left:0;
49
+ width:100%;
50
+ padding:0.5rem 0;
51
+ /* These make it so an annotation doesn't stretch beyond the annotated text if the label is longer*/
52
+ overflow: hidden;
53
+ white-space: nowrap;
54
+ text-overflow:ellipsis;
55
+ }
56
+
57
+ .annotation:after {
58
+ content:attr(data-label); /* The content defines the width of the annotation*/
59
+ position:absolute;
60
+ font-size:0.75rem;
61
+ text-align:center;
62
+ font-weight:bold;
63
+ text-overflow:ellipsis;
64
+ top:1.75rem;
65
+ line-height:0;
66
+ overflow: hidden;
67
+ white-space: nowrap;
68
+
69
+ left:0;
70
+ width:100%; /* 100% of the parent, which is the annotation whose width is the tokens inside it*/
71
+
72
+ padding:0.5rem 0;
73
+ /* Nast hack below:
74
+ We set the annotations color in code because we don't know the colors at css time.
75
+ But you can't pass a color as a data attribute to get it into the pseudo element (this thing)
76
+ So to get around that, annotations have the color set on them with a style attribute and then we
77
+ can get the color with currentColor.
78
+ Annotations wrap tokens and tokens set the color back to black
79
+ */
80
+ background-color: currentColor;
81
+ }
82
+ .annotation:hover::after, .annotation:hover::before{
83
+ /* When the user hovers over an annotation expand the label to display in full
84
+ */
85
+ min-width: fit-content;
86
+ }
87
+
88
+ .annotation:hover{
89
+ /* Emphasize the annotation start end with a border on hover*/
90
+ border-color: currentColor;
91
+ border: 2px solid;
92
+ }
93
+ .special-token:not(:empty){
94
+ /*
95
+ A none empty special token is like UNK (as opposed to CLS which has no representation in the text )
96
+ */
97
+ position:relative;
98
+ }
99
+ .special-token:empty::before{
100
+ /* Special tokens that don't have text are displayed as pseudo elements so we dont select them with the mouse*/
101
+ content:attr(data-stok);
102
+ background:#202020;
103
+ font-size:0.75rem;
104
+ color:white;
105
+ margin: 0 0.25rem;
106
+ padding: 0.25rem;
107
+ border-radius:4px
108
+ }
109
+
110
+ .special-token:not(:empty):before {
111
+ /* Special tokens that have text (UNK) are displayed above the actual text*/
112
+ content:attr(data-stok);
113
+ position:absolute;
114
+ bottom:1.75rem;
115
+ min-width:100%;
116
+ width:100%;
117
+ height:1rem;
118
+ line-height:1rem;
119
+ font-size:1rem;
120
+ text-align:center;
121
+ color:white;
122
+ font-weight:bold;
123
+ background:#202020;
124
+ border-radius:10%;
125
+ }
126
+ /*
127
+ We want to alternate the color of tokens, but we can't use nth child because tokens might be broken up by annotations
128
+ instead we apply even and odd class at generation time and color them that way
129
+ */
130
+ .even-token{
131
+ background:#DCDCDC ;
132
+ border: 1px solid #DCDCDC;
133
+ }
134
+ .odd-token{
135
+ background:#A0A0A0;
136
+ border: 1px solid #A0A0A0;
137
+ }
138
+ .even-token.multi-token,.odd-token.multi-token{
139
+ background: repeating-linear-gradient(
140
+ 45deg,
141
+ transparent,
142
+ transparent 1px,
143
+ #ccc 1px,
144
+ #ccc 1px
145
+ ),
146
+ /* on "bottom" */
147
+ linear-gradient(
148
+ to bottom,
149
+ #FFB6C1,
150
+ #999
151
+ );
152
+ }
153
+
154
+ .multi-token:hover::after {
155
+ content:"This char has more than 1 token"; /* The content defines the width of the annotation*/
156
+ color:white;
157
+ background-color: black;
158
+ position:absolute;
159
+ font-size:0.75rem;
160
+ text-align:center;
161
+ font-weight:bold;
162
+ text-overflow:ellipsis;
163
+ top:1.75rem;
164
+ line-height:0;
165
+ overflow: hidden;
166
+ white-space: nowrap;
167
+ left:0;
168
+ width:fit-content; /* 100% of the parent, which is the annotation whose width is the tokens inside it*/
169
+ padding:0.5rem 0;
170
+ }
source/tokenizers/tools/visualizer.py ADDED
@@ -0,0 +1,407 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import itertools
2
+ import os
3
+ import re
4
+ from string import Template
5
+ from typing import Any, Callable, Dict, List, NamedTuple, Optional, Tuple
6
+
7
+ from tokenizers import Encoding, Tokenizer
8
+
9
+
10
+ dirname = os.path.dirname(__file__)
11
+ css_filename = os.path.join(dirname, "visualizer-styles.css")
12
+ with open(css_filename) as f:
13
+ css = f.read()
14
+
15
+
16
+ class Annotation:
17
+ start: int
18
+ end: int
19
+ label: str
20
+
21
+ def __init__(self, start: int, end: int, label: str):
22
+ self.start = start
23
+ self.end = end
24
+ self.label = label
25
+
26
+
27
+ AnnotationList = List[Annotation]
28
+ PartialIntList = List[Optional[int]]
29
+
30
+
31
+ class CharStateKey(NamedTuple):
32
+ token_ix: Optional[int]
33
+ anno_ix: Optional[int]
34
+
35
+
36
+ class CharState:
37
+ char_ix: Optional[int]
38
+
39
+ def __init__(self, char_ix):
40
+ self.char_ix = char_ix
41
+
42
+ self.anno_ix: Optional[int] = None
43
+ self.tokens: List[int] = []
44
+
45
+ @property
46
+ def token_ix(self):
47
+ return self.tokens[0] if len(self.tokens) > 0 else None
48
+
49
+ @property
50
+ def is_multitoken(self):
51
+ """
52
+ BPE tokenizers can output more than one token for a char
53
+ """
54
+ return len(self.tokens) > 1
55
+
56
+ def partition_key(self) -> CharStateKey:
57
+ return CharStateKey(
58
+ token_ix=self.token_ix,
59
+ anno_ix=self.anno_ix,
60
+ )
61
+
62
+
63
+ class Aligned:
64
+ pass
65
+
66
+
67
+ class EncodingVisualizer:
68
+ """
69
+ Build an EncodingVisualizer
70
+
71
+ Args:
72
+
73
+ tokenizer (:class:`~tokenizers.Tokenizer`):
74
+ A tokenizer instance
75
+
76
+ default_to_notebook (:obj:`bool`):
77
+ Whether to render html output in a notebook by default
78
+
79
+ annotation_converter (:obj:`Callable`, `optional`):
80
+ An optional (lambda) function that takes an annotation in any format and returns
81
+ an Annotation object
82
+ """
83
+
84
+ unk_token_regex = re.compile("(.{1}\b)?(unk|oov)(\b.{1})?", flags=re.IGNORECASE)
85
+
86
+ def __init__(
87
+ self,
88
+ tokenizer: Tokenizer,
89
+ default_to_notebook: bool = True,
90
+ annotation_converter: Optional[Callable[[Any], Annotation]] = None,
91
+ ):
92
+ if default_to_notebook:
93
+ try:
94
+ from IPython.core.display import HTML, display # type: ignore[attr-defined]
95
+ except ImportError:
96
+ raise Exception(
97
+ """We couldn't import IPython utils for html display.
98
+ Are you running in a notebook?
99
+ You can also pass `default_to_notebook=False` to get back raw HTML
100
+ """
101
+ )
102
+
103
+ self.tokenizer = tokenizer
104
+ self.default_to_notebook = default_to_notebook
105
+ self.annotation_coverter = annotation_converter
106
+ pass
107
+
108
+ def __call__(
109
+ self,
110
+ text: str,
111
+ annotations: Optional[List[Any]] = None,
112
+ default_to_notebook: Optional[bool] = None,
113
+ ) -> Optional[str]:
114
+ """
115
+ Build a visualization of the given text
116
+
117
+ Args:
118
+ text (:obj:`str`):
119
+ The text to tokenize
120
+
121
+ annotations (:obj:`List[Annotation]`, `optional`):
122
+ An optional list of annotations of the text. The can either be an annotation class
123
+ or anything else if you instantiated the visualizer with a converter function
124
+
125
+ default_to_notebook (:obj:`bool`, `optional`, defaults to `False`):
126
+ If True, will render the html in a notebook. Otherwise returns an html string.
127
+
128
+ Returns:
129
+ The HTML string if default_to_notebook is False, otherwise (default) returns None and
130
+ renders the HTML in the notebook
131
+
132
+ """
133
+ final_default_to_notebook = self.default_to_notebook
134
+ if default_to_notebook is not None:
135
+ final_default_to_notebook = default_to_notebook
136
+ if final_default_to_notebook:
137
+ try:
138
+ from IPython.core.display import HTML, display # type: ignore[attr-defined]
139
+ except ImportError:
140
+ raise Exception(
141
+ """We couldn't import IPython utils for html display.
142
+ Are you running in a notebook?"""
143
+ )
144
+ if annotations is None:
145
+ annotations = []
146
+ if self.annotation_coverter is not None:
147
+ annotations = list(map(self.annotation_coverter, annotations))
148
+ encoding = self.tokenizer.encode(text)
149
+ html = EncodingVisualizer.__make_html(text, encoding, annotations)
150
+ if final_default_to_notebook:
151
+ display(HTML(html))
152
+ else:
153
+ return html
154
+
155
+ @staticmethod
156
+ def calculate_label_colors(annotations: AnnotationList) -> Dict[str, str]:
157
+ """
158
+ Generates a color palette for all the labels in a given set of annotations
159
+
160
+ Args:
161
+ annotations (:obj:`Annotation`):
162
+ A list of annotations
163
+
164
+ Returns:
165
+ :obj:`dict`: A dictionary mapping labels to colors in HSL format
166
+ """
167
+ if len(annotations) == 0:
168
+ return {}
169
+ labels = set(map(lambda x: x.label, annotations))
170
+ num_labels = len(labels)
171
+ h_step = int(255 / num_labels)
172
+ if h_step < 20:
173
+ h_step = 20
174
+ s = 32
175
+ l = 64 # noqa: E741
176
+ h = 10
177
+ colors = {}
178
+
179
+ for label in sorted(labels): # sort so we always get the same colors for a given set of labels
180
+ colors[label] = f"hsl({h},{s}%,{l}%)"
181
+ h += h_step
182
+ return colors
183
+
184
+ @staticmethod
185
+ def consecutive_chars_to_html(
186
+ consecutive_chars_list: List[CharState],
187
+ text: str,
188
+ encoding: Encoding,
189
+ ):
190
+ """
191
+ Converts a list of "consecutive chars" into a single HTML element.
192
+ Chars are consecutive if they fall under the same word, token and annotation.
193
+ The CharState class is a named tuple with a "partition_key" method that makes it easy to
194
+ compare if two chars are consecutive.
195
+
196
+ Args:
197
+ consecutive_chars_list (:obj:`List[CharState]`):
198
+ A list of CharStates that have been grouped together
199
+
200
+ text (:obj:`str`):
201
+ The original text being processed
202
+
203
+ encoding (:class:`~tokenizers.Encoding`):
204
+ The encoding returned from the tokenizer
205
+
206
+ Returns:
207
+ :obj:`str`: The HTML span for a set of consecutive chars
208
+ """
209
+ first = consecutive_chars_list[0]
210
+ if first.char_ix is None:
211
+ # its a special token
212
+ stoken = encoding.tokens[first.token_ix]
213
+ # special tokens are represented as empty spans. We use the data attribute and css
214
+ # magic to display it
215
+ return f'<span class="special-token" data-stoken={stoken}></span>'
216
+ # We're not in a special token so this group has a start and end.
217
+ last = consecutive_chars_list[-1]
218
+ assert first.char_ix is not None
219
+ assert last.char_ix is not None
220
+ start = first.char_ix
221
+ end = last.char_ix + 1
222
+ span_text = text[start:end]
223
+ css_classes = [] # What css classes will we apply on the resulting span
224
+ data_items = {} # What data attributes will we apply on the result span
225
+ if first.token_ix is not None:
226
+ # We can either be in a token or not (e.g. in white space)
227
+ css_classes.append("token")
228
+ if first.is_multitoken:
229
+ css_classes.append("multi-token")
230
+ if first.token_ix % 2:
231
+ # We use this to color alternating tokens.
232
+ # A token might be split by an annotation that ends in the middle of it, so this
233
+ # lets us visually indicate a consecutive token despite its possible splitting in
234
+ # the html markup
235
+ css_classes.append("odd-token")
236
+ else:
237
+ # Like above, but a different color so we can see the tokens alternate
238
+ css_classes.append("even-token")
239
+ if EncodingVisualizer.unk_token_regex.search(encoding.tokens[first.token_ix]) is not None:
240
+ # This is a special token that is in the text. probably UNK
241
+ css_classes.append("special-token")
242
+ # TODO is this the right name for the data attribute ?
243
+ data_items["stok"] = encoding.tokens[first.token_ix]
244
+ else:
245
+ # In this case we are looking at a group/single char that is not tokenized.
246
+ # e.g. white space
247
+ css_classes.append("non-token")
248
+ css = f'''class="{" ".join(css_classes)}"'''
249
+ data = ""
250
+ for key, val in data_items.items():
251
+ data += f' data-{key}="{val}"'
252
+ return f"<span {css} {data} >{span_text}</span>"
253
+
254
+ @staticmethod
255
+ def __make_html(text: str, encoding: Encoding, annotations: AnnotationList) -> str:
256
+ char_states = EncodingVisualizer.__make_char_states(text, encoding, annotations)
257
+ current_consecutive_chars = [char_states[0]]
258
+ prev_anno_ix = char_states[0].anno_ix
259
+ spans = []
260
+ label_colors_dict = EncodingVisualizer.calculate_label_colors(annotations)
261
+ cur_anno_ix = char_states[0].anno_ix
262
+ if cur_anno_ix is not None:
263
+ # If we started in an annotation make a span for it
264
+ anno = annotations[cur_anno_ix]
265
+ label = anno.label
266
+ color = label_colors_dict[label]
267
+ spans.append(f'<span class="annotation" style="color:{color}" data-label="{label}">')
268
+
269
+ for cs in char_states[1:]:
270
+ cur_anno_ix = cs.anno_ix
271
+ if cur_anno_ix != prev_anno_ix:
272
+ # If we've transitioned in or out of an annotation
273
+ spans.append(
274
+ # Create a span from the current consecutive characters
275
+ EncodingVisualizer.consecutive_chars_to_html(
276
+ current_consecutive_chars,
277
+ text=text,
278
+ encoding=encoding,
279
+ )
280
+ )
281
+ current_consecutive_chars = [cs]
282
+
283
+ if prev_anno_ix is not None:
284
+ # if we transitioned out of an annotation close it's span
285
+ spans.append("</span>")
286
+ if cur_anno_ix is not None:
287
+ # If we entered a new annotation make a span for it
288
+ anno = annotations[cur_anno_ix]
289
+ label = anno.label
290
+ color = label_colors_dict[label]
291
+ spans.append(f'<span class="annotation" style="color:{color}" data-label="{label}">')
292
+ prev_anno_ix = cur_anno_ix
293
+
294
+ if cs.partition_key() == current_consecutive_chars[0].partition_key():
295
+ # If the current charchter is in the same "group" as the previous one
296
+ current_consecutive_chars.append(cs)
297
+ else:
298
+ # Otherwise we make a span for the previous group
299
+ spans.append(
300
+ EncodingVisualizer.consecutive_chars_to_html(
301
+ current_consecutive_chars,
302
+ text=text,
303
+ encoding=encoding,
304
+ )
305
+ )
306
+ # An reset the consecutive_char_list to form a new group
307
+ current_consecutive_chars = [cs]
308
+ # All that's left is to fill out the final span
309
+ # TODO I think there is an edge case here where an annotation's span might not close
310
+ spans.append(
311
+ EncodingVisualizer.consecutive_chars_to_html(
312
+ current_consecutive_chars,
313
+ text=text,
314
+ encoding=encoding,
315
+ )
316
+ )
317
+ res = HTMLBody(spans) # Send the list of spans to the body of our html
318
+ return res
319
+
320
+ @staticmethod
321
+ def __make_anno_map(text: str, annotations: AnnotationList) -> PartialIntList:
322
+ """
323
+ Args:
324
+ text (:obj:`str`):
325
+ The raw text we want to align to
326
+
327
+ annotations (:obj:`AnnotationList`):
328
+ A (possibly empty) list of annotations
329
+
330
+ Returns:
331
+ A list of length len(text) whose entry at index i is None if there is no annotation on
332
+ character i or k, the index of the annotation that covers index i where k is with
333
+ respect to the list of annotations
334
+ """
335
+ annotation_map = [None] * len(text)
336
+ for anno_ix, a in enumerate(annotations):
337
+ for i in range(a.start, a.end):
338
+ annotation_map[i] = anno_ix
339
+ return annotation_map
340
+
341
+ @staticmethod
342
+ def __make_char_states(text: str, encoding: Encoding, annotations: AnnotationList) -> List[CharState]:
343
+ """
344
+ For each character in the original text, we emit a tuple representing it's "state":
345
+
346
+ * which token_ix it corresponds to
347
+ * which word_ix it corresponds to
348
+ * which annotation_ix it corresponds to
349
+
350
+ Args:
351
+ text (:obj:`str`):
352
+ The raw text we want to align to
353
+
354
+ annotations (:obj:`List[Annotation]`):
355
+ A (possibly empty) list of annotations
356
+
357
+ encoding: (:class:`~tokenizers.Encoding`):
358
+ The encoding returned from the tokenizer
359
+
360
+ Returns:
361
+ :obj:`List[CharState]`: A list of CharStates, indicating for each char in the text what
362
+ it's state is
363
+ """
364
+ annotation_map = EncodingVisualizer.__make_anno_map(text, annotations)
365
+ # Todo make this a dataclass or named tuple
366
+ char_states: List[CharState] = [CharState(char_ix) for char_ix in range(len(text))]
367
+ for token_ix, token in enumerate(encoding.tokens):
368
+ offsets = encoding.token_to_chars(token_ix)
369
+ if offsets is not None:
370
+ start, end = offsets
371
+ for i in range(start, end):
372
+ char_states[i].tokens.append(token_ix)
373
+ for char_ix, anno_ix in enumerate(annotation_map):
374
+ char_states[char_ix].anno_ix = anno_ix
375
+
376
+ return char_states
377
+
378
+
379
+ def HTMLBody(children: List[str], css_styles=css) -> str:
380
+ """
381
+ Generates the full html with css from a list of html spans
382
+
383
+ Args:
384
+ children (:obj:`List[str]`):
385
+ A list of strings, assumed to be html elements
386
+
387
+ css_styles (:obj:`str`, `optional`):
388
+ Optional alternative implementation of the css
389
+
390
+ Returns:
391
+ :obj:`str`: An HTML string with style markup
392
+ """
393
+ children_text = "".join(children)
394
+ return f"""
395
+ <html>
396
+ <head>
397
+ <style>
398
+ {css_styles}
399
+ </style>
400
+ </head>
401
+ <body>
402
+ <div class="tokenized-text" dir=auto>
403
+ {children_text}
404
+ </div>
405
+ </body>
406
+ </html>
407
+ """
source/tokenizers/trainers/__init__.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ # Generated content DO NOT EDIT
2
+ from .. import trainers
3
+
4
+ Trainer = trainers.Trainer
5
+ BpeTrainer = trainers.BpeTrainer
6
+ UnigramTrainer = trainers.UnigramTrainer
7
+ WordLevelTrainer = trainers.WordLevelTrainer
8
+ WordPieceTrainer = trainers.WordPieceTrainer
source/tokenizers/trainers/__init__.pyi ADDED
@@ -0,0 +1,462 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Generated content DO NOT EDIT
2
+ class Trainer:
3
+ """
4
+ Base class for all trainers
5
+
6
+ This class is not supposed to be instantiated directly. Instead, any implementation of a
7
+ Trainer will return an instance of this class when instantiated.
8
+ """
9
+ def __getstate__(self):
10
+ """ """
11
+ pass
12
+
13
+ def __setstate__(self, state):
14
+ """ """
15
+ pass
16
+
17
+ class BpeTrainer(Trainer):
18
+ """
19
+ Trainer capable of training a BPE model
20
+
21
+ Args:
22
+ vocab_size (:obj:`int`, `optional`):
23
+ The size of the final vocabulary, including all tokens and alphabet.
24
+
25
+ min_frequency (:obj:`int`, `optional`):
26
+ The minimum frequency a pair should have in order to be merged.
27
+
28
+ show_progress (:obj:`bool`, `optional`):
29
+ Whether to show progress bars while training.
30
+
31
+ special_tokens (:obj:`List[Union[str, AddedToken]]`, `optional`):
32
+ A list of special tokens the model should know of.
33
+
34
+ limit_alphabet (:obj:`int`, `optional`):
35
+ The maximum different characters to keep in the alphabet.
36
+
37
+ initial_alphabet (:obj:`List[str]`, `optional`):
38
+ A list of characters to include in the initial alphabet, even
39
+ if not seen in the training dataset.
40
+ If the strings contain more than one character, only the first one
41
+ is kept.
42
+
43
+ continuing_subword_prefix (:obj:`str`, `optional`):
44
+ A prefix to be used for every subword that is not a beginning-of-word.
45
+
46
+ end_of_word_suffix (:obj:`str`, `optional`):
47
+ A suffix to be used for every subword that is a end-of-word.
48
+
49
+ max_token_length (:obj:`int`, `optional`):
50
+ Prevents creating tokens longer than the specified size.
51
+ This can help with reducing polluting your vocabulary with
52
+ highly repetitive tokens like `======` for wikipedia
53
+
54
+ """
55
+ def __init__(
56
+ self,
57
+ vocab_size=30000,
58
+ min_frequency=0,
59
+ show_progress=True,
60
+ special_tokens=[],
61
+ limit_alphabet=None,
62
+ initial_alphabet=[],
63
+ continuing_subword_prefix=None,
64
+ end_of_word_suffix=None,
65
+ max_token_length=None,
66
+ words={},
67
+ ):
68
+ pass
69
+
70
+ def __getstate__(self):
71
+ """ """
72
+ pass
73
+
74
+ def __setstate__(self, state):
75
+ """ """
76
+ pass
77
+
78
+ @property
79
+ def continuing_subword_prefix(self):
80
+ """ """
81
+ pass
82
+
83
+ @continuing_subword_prefix.setter
84
+ def continuing_subword_prefix(self, value):
85
+ """ """
86
+ pass
87
+
88
+ @property
89
+ def end_of_word_suffix(self):
90
+ """ """
91
+ pass
92
+
93
+ @end_of_word_suffix.setter
94
+ def end_of_word_suffix(self, value):
95
+ """ """
96
+ pass
97
+
98
+ @property
99
+ def initial_alphabet(self):
100
+ """ """
101
+ pass
102
+
103
+ @initial_alphabet.setter
104
+ def initial_alphabet(self, value):
105
+ """ """
106
+ pass
107
+
108
+ @property
109
+ def limit_alphabet(self):
110
+ """ """
111
+ pass
112
+
113
+ @limit_alphabet.setter
114
+ def limit_alphabet(self, value):
115
+ """ """
116
+ pass
117
+
118
+ @property
119
+ def max_token_length(self):
120
+ """ """
121
+ pass
122
+
123
+ @max_token_length.setter
124
+ def max_token_length(self, value):
125
+ """ """
126
+ pass
127
+
128
+ @property
129
+ def min_frequency(self):
130
+ """ """
131
+ pass
132
+
133
+ @min_frequency.setter
134
+ def min_frequency(self, value):
135
+ """ """
136
+ pass
137
+
138
+ @property
139
+ def show_progress(self):
140
+ """ """
141
+ pass
142
+
143
+ @show_progress.setter
144
+ def show_progress(self, value):
145
+ """ """
146
+ pass
147
+
148
+ @property
149
+ def special_tokens(self):
150
+ """ """
151
+ pass
152
+
153
+ @special_tokens.setter
154
+ def special_tokens(self, value):
155
+ """ """
156
+ pass
157
+
158
+ @property
159
+ def vocab_size(self):
160
+ """ """
161
+ pass
162
+
163
+ @vocab_size.setter
164
+ def vocab_size(self, value):
165
+ """ """
166
+ pass
167
+
168
+ class UnigramTrainer(Trainer):
169
+ """
170
+ Trainer capable of training a Unigram model
171
+
172
+ Args:
173
+ vocab_size (:obj:`int`):
174
+ The size of the final vocabulary, including all tokens and alphabet.
175
+
176
+ show_progress (:obj:`bool`):
177
+ Whether to show progress bars while training.
178
+
179
+ special_tokens (:obj:`List[Union[str, AddedToken]]`):
180
+ A list of special tokens the model should know of.
181
+
182
+ initial_alphabet (:obj:`List[str]`):
183
+ A list of characters to include in the initial alphabet, even
184
+ if not seen in the training dataset.
185
+ If the strings contain more than one character, only the first one
186
+ is kept.
187
+
188
+ shrinking_factor (:obj:`float`):
189
+ The shrinking factor used at each step of the training to prune the
190
+ vocabulary.
191
+
192
+ unk_token (:obj:`str`):
193
+ The token used for out-of-vocabulary tokens.
194
+
195
+ max_piece_length (:obj:`int`):
196
+ The maximum length of a given token.
197
+
198
+ n_sub_iterations (:obj:`int`):
199
+ The number of iterations of the EM algorithm to perform before
200
+ pruning the vocabulary.
201
+ """
202
+ def __init__(
203
+ self,
204
+ vocab_size=8000,
205
+ show_progress=True,
206
+ special_tokens=[],
207
+ initial_alphabet=[],
208
+ shrinking_factor=0.75,
209
+ unk_token=None,
210
+ max_piece_length=16,
211
+ n_sub_iterations=2,
212
+ ):
213
+ pass
214
+
215
+ def __getstate__(self):
216
+ """ """
217
+ pass
218
+
219
+ def __setstate__(self, state):
220
+ """ """
221
+ pass
222
+
223
+ @property
224
+ def initial_alphabet(self):
225
+ """ """
226
+ pass
227
+
228
+ @initial_alphabet.setter
229
+ def initial_alphabet(self, value):
230
+ """ """
231
+ pass
232
+
233
+ @property
234
+ def show_progress(self):
235
+ """ """
236
+ pass
237
+
238
+ @show_progress.setter
239
+ def show_progress(self, value):
240
+ """ """
241
+ pass
242
+
243
+ @property
244
+ def special_tokens(self):
245
+ """ """
246
+ pass
247
+
248
+ @special_tokens.setter
249
+ def special_tokens(self, value):
250
+ """ """
251
+ pass
252
+
253
+ @property
254
+ def vocab_size(self):
255
+ """ """
256
+ pass
257
+
258
+ @vocab_size.setter
259
+ def vocab_size(self, value):
260
+ """ """
261
+ pass
262
+
263
+ class WordLevelTrainer(Trainer):
264
+ """
265
+ Trainer capable of training a WorldLevel model
266
+
267
+ Args:
268
+ vocab_size (:obj:`int`, `optional`):
269
+ The size of the final vocabulary, including all tokens and alphabet.
270
+
271
+ min_frequency (:obj:`int`, `optional`):
272
+ The minimum frequency a pair should have in order to be merged.
273
+
274
+ show_progress (:obj:`bool`, `optional`):
275
+ Whether to show progress bars while training.
276
+
277
+ special_tokens (:obj:`List[Union[str, AddedToken]]`):
278
+ A list of special tokens the model should know of.
279
+ """
280
+ def __init__(self, vocab_size=30000, min_frequency=0, show_progress=True, special_tokens=[]):
281
+ pass
282
+
283
+ def __getstate__(self):
284
+ """ """
285
+ pass
286
+
287
+ def __setstate__(self, state):
288
+ """ """
289
+ pass
290
+
291
+ @property
292
+ def min_frequency(self):
293
+ """ """
294
+ pass
295
+
296
+ @min_frequency.setter
297
+ def min_frequency(self, value):
298
+ """ """
299
+ pass
300
+
301
+ @property
302
+ def show_progress(self):
303
+ """ """
304
+ pass
305
+
306
+ @show_progress.setter
307
+ def show_progress(self, value):
308
+ """ """
309
+ pass
310
+
311
+ @property
312
+ def special_tokens(self):
313
+ """ """
314
+ pass
315
+
316
+ @special_tokens.setter
317
+ def special_tokens(self, value):
318
+ """ """
319
+ pass
320
+
321
+ @property
322
+ def vocab_size(self):
323
+ """ """
324
+ pass
325
+
326
+ @vocab_size.setter
327
+ def vocab_size(self, value):
328
+ """ """
329
+ pass
330
+
331
+ class WordPieceTrainer(Trainer):
332
+ """
333
+ Trainer capable of training a WordPiece model
334
+
335
+ Args:
336
+ vocab_size (:obj:`int`, `optional`):
337
+ The size of the final vocabulary, including all tokens and alphabet.
338
+
339
+ min_frequency (:obj:`int`, `optional`):
340
+ The minimum frequency a pair should have in order to be merged.
341
+
342
+ show_progress (:obj:`bool`, `optional`):
343
+ Whether to show progress bars while training.
344
+
345
+ special_tokens (:obj:`List[Union[str, AddedToken]]`, `optional`):
346
+ A list of special tokens the model should know of.
347
+
348
+ limit_alphabet (:obj:`int`, `optional`):
349
+ The maximum different characters to keep in the alphabet.
350
+
351
+ initial_alphabet (:obj:`List[str]`, `optional`):
352
+ A list of characters to include in the initial alphabet, even
353
+ if not seen in the training dataset.
354
+ If the strings contain more than one character, only the first one
355
+ is kept.
356
+
357
+ continuing_subword_prefix (:obj:`str`, `optional`):
358
+ A prefix to be used for every subword that is not a beginning-of-word.
359
+
360
+ end_of_word_suffix (:obj:`str`, `optional`):
361
+ A suffix to be used for every subword that is a end-of-word.
362
+ """
363
+ def __init__(
364
+ self,
365
+ vocab_size=30000,
366
+ min_frequency=0,
367
+ show_progress=True,
368
+ special_tokens=[],
369
+ limit_alphabet=None,
370
+ initial_alphabet=[],
371
+ continuing_subword_prefix="##",
372
+ end_of_word_suffix=None,
373
+ ):
374
+ pass
375
+
376
+ def __getstate__(self):
377
+ """ """
378
+ pass
379
+
380
+ def __setstate__(self, state):
381
+ """ """
382
+ pass
383
+
384
+ @property
385
+ def continuing_subword_prefix(self):
386
+ """ """
387
+ pass
388
+
389
+ @continuing_subword_prefix.setter
390
+ def continuing_subword_prefix(self, value):
391
+ """ """
392
+ pass
393
+
394
+ @property
395
+ def end_of_word_suffix(self):
396
+ """ """
397
+ pass
398
+
399
+ @end_of_word_suffix.setter
400
+ def end_of_word_suffix(self, value):
401
+ """ """
402
+ pass
403
+
404
+ @property
405
+ def initial_alphabet(self):
406
+ """ """
407
+ pass
408
+
409
+ @initial_alphabet.setter
410
+ def initial_alphabet(self, value):
411
+ """ """
412
+ pass
413
+
414
+ @property
415
+ def limit_alphabet(self):
416
+ """ """
417
+ pass
418
+
419
+ @limit_alphabet.setter
420
+ def limit_alphabet(self, value):
421
+ """ """
422
+ pass
423
+
424
+ @property
425
+ def min_frequency(self):
426
+ """ """
427
+ pass
428
+
429
+ @min_frequency.setter
430
+ def min_frequency(self, value):
431
+ """ """
432
+ pass
433
+
434
+ @property
435
+ def show_progress(self):
436
+ """ """
437
+ pass
438
+
439
+ @show_progress.setter
440
+ def show_progress(self, value):
441
+ """ """
442
+ pass
443
+
444
+ @property
445
+ def special_tokens(self):
446
+ """ """
447
+ pass
448
+
449
+ @special_tokens.setter
450
+ def special_tokens(self, value):
451
+ """ """
452
+ pass
453
+
454
+ @property
455
+ def vocab_size(self):
456
+ """ """
457
+ pass
458
+
459
+ @vocab_size.setter
460
+ def vocab_size(self, value):
461
+ """ """
462
+ pass
source/torchaudio-2.9.1.dist-info/INSTALLER ADDED
@@ -0,0 +1 @@
 
 
1
+ pip
source/torchaudio-2.9.1.dist-info/METADATA ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Metadata-Version: 2.4
2
+ Name: torchaudio
3
+ Version: 2.9.1
4
+ Summary: An audio package for PyTorch
5
+ Home-page: https://github.com/pytorch/audio
6
+ Author: Soumith Chintala, David Pollack, Sean Naren, Peter Goldsborough, Moto Hira, Caroline Chen, Jeff Hwang, Zhaoheng Ni, Xiaohui Zhang
7
+ Author-email: soumith@pytorch.org
8
+ Maintainer: Moto Hira, Caroline Chen, Jeff Hwang, Zhaoheng Ni, Xiaohui Zhang
9
+ Maintainer-email: moto@meta.com
10
+ Classifier: Environment :: Plugins
11
+ Classifier: Intended Audience :: Developers
12
+ Classifier: Intended Audience :: Science/Research
13
+ Classifier: License :: OSI Approved :: BSD License
14
+ Classifier: Operating System :: MacOS :: MacOS X
15
+ Classifier: Operating System :: Microsoft :: Windows
16
+ Classifier: Operating System :: POSIX
17
+ Classifier: Programming Language :: C++
18
+ Classifier: Programming Language :: Python :: 3.10
19
+ Classifier: Programming Language :: Python :: 3.11
20
+ Classifier: Programming Language :: Python :: 3.12
21
+ Classifier: Programming Language :: Python :: 3.13
22
+ Classifier: Programming Language :: Python :: 3.14
23
+ Classifier: Programming Language :: Python :: Implementation :: CPython
24
+ Classifier: Topic :: Multimedia :: Sound/Audio
25
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
26
+ Description-Content-Type: text/markdown
27
+ License-File: LICENSE
28
+ Requires-Dist: torch==2.9.1
29
+ Dynamic: author
30
+ Dynamic: author-email
31
+ Dynamic: classifier
32
+ Dynamic: description
33
+ Dynamic: description-content-type
34
+ Dynamic: home-page
35
+ Dynamic: license-file
36
+ Dynamic: maintainer
37
+ Dynamic: maintainer-email
38
+ Dynamic: requires-dist
39
+ Dynamic: summary
40
+
41
+ torchaudio: an audio library for PyTorch
42
+ ========================================
43
+
44
+ [![Documentation](https://img.shields.io/badge/dynamic/json.svg?label=docs&url=https%3A%2F%2Fpypi.org%2Fpypi%2Ftorchaudio%2Fjson&query=%24.info.version&colorB=brightgreen&prefix=v)](https://pytorch.org/audio/main/)
45
+ [![Anaconda Badge](https://anaconda.org/pytorch/torchaudio/badges/downloads.svg)](https://anaconda.org/pytorch/torchaudio)
46
+ [![Anaconda-Server Badge](https://anaconda.org/pytorch/torchaudio/badges/platforms.svg)](https://anaconda.org/pytorch/torchaudio)
47
+
48
+ ![TorchAudio Logo](docs/source/_static/img/logo.png)
49
+
50
+ > [!NOTE]
51
+ > **We have transitioned TorchAudio into a
52
+ > maintenance phase. This process removed some user-facing
53
+ > features. These features were deprecated from TorchAudio 2.8 and removed in 2.9.
54
+ > Our main goals were to reduce redundancies with the rest of the
55
+ > PyTorch ecosystem, make it easier to maintain, and create a version of
56
+ > TorchAudio that is more tightly scoped to its strengths: processing audio
57
+ > data for ML. Please see
58
+ > [our community message](https://github.com/pytorch/audio/issues/3902)
59
+ > for more details.**
60
+
61
+ The aim of torchaudio is to apply [PyTorch](https://github.com/pytorch/pytorch) to
62
+ the audio domain. By supporting PyTorch, torchaudio follows the same philosophy
63
+ of providing strong GPU acceleration, having a focus on trainable features through
64
+ the autograd system, and having consistent style (tensor names and dimension names).
65
+ Therefore, it is primarily a machine learning library and not a general signal
66
+ processing library. The benefits of PyTorch can be seen in torchaudio through
67
+ having all the computations be through PyTorch operations which makes it easy
68
+ to use and feel like a natural extension.
69
+
70
+ - [Dataloaders for common audio datasets](http://pytorch.org/audio/main/datasets.html)
71
+ - Audio and speech processing functions
72
+ - [forced_align](https://pytorch.org/audio/main/generated/torchaudio.functional.forced_align.html)
73
+ - Common audio transforms
74
+ - [Spectrogram, AmplitudeToDB, MelScale, MelSpectrogram, MFCC, MuLawEncoding, MuLawDecoding, Resample](http://pytorch.org/audio/main/transforms.html)
75
+ - Compliance interfaces: Run code using PyTorch that align with other libraries
76
+ - [Kaldi: spectrogram, fbank, mfcc](https://pytorch.org/audio/main/compliance.kaldi.html)
77
+
78
+ Installation
79
+ ------------
80
+
81
+ Please refer to https://pytorch.org/audio/main/installation.html for installation and build process of TorchAudio.
82
+
83
+
84
+ API Reference
85
+ -------------
86
+
87
+ API Reference is located here: http://pytorch.org/audio/main/
88
+
89
+ Contributing Guidelines
90
+ -----------------------
91
+
92
+ Please refer to [CONTRIBUTING.md](./CONTRIBUTING.md)
93
+
94
+ Citation
95
+ --------
96
+
97
+ If you find this package useful, please cite as:
98
+
99
+ ```bibtex
100
+ @article{yang2021torchaudio,
101
+ title={TorchAudio: Building Blocks for Audio and Speech Processing},
102
+ author={Yao-Yuan Yang and Moto Hira and Zhaoheng Ni and Anjali Chourdia and Artyom Astafurov and Caroline Chen and Ching-Feng Yeh and Christian Puhrsch and David Pollack and Dmitriy Genzel and Donny Greenberg and Edward Z. Yang and Jason Lian and Jay Mahadeokar and Jeff Hwang and Ji Chen and Peter Goldsborough and Prabhat Roy and Sean Narenthiran and Shinji Watanabe and Soumith Chintala and Vincent Quenneville-Bélair and Yangyang Shi},
103
+ journal={arXiv preprint arXiv:2110.15018},
104
+ year={2021}
105
+ }
106
+ ```
107
+
108
+ ```bibtex
109
+ @misc{hwang2023torchaudio,
110
+ title={TorchAudio 2.1: Advancing speech recognition, self-supervised learning, and audio processing components for PyTorch},
111
+ author={Jeff Hwang and Moto Hira and Caroline Chen and Xiaohui Zhang and Zhaoheng Ni and Guangzhi Sun and Pingchuan Ma and Ruizhe Huang and Vineel Pratap and Yuekai Zhang and Anurag Kumar and Chin-Yun Yu and Chuang Zhu and Chunxi Liu and Jacob Kahn and Mirco Ravanelli and Peng Sun and Shinji Watanabe and Yangyang Shi and Yumeng Tao and Robin Scheibler and Samuele Cornell and Sean Kim and Stavros Petridis},
112
+ year={2023},
113
+ eprint={2310.17864},
114
+ archivePrefix={arXiv},
115
+ primaryClass={eess.AS}
116
+ }
117
+ ```
118
+
119
+ Disclaimer on Datasets
120
+ ----------------------
121
+
122
+ This is a utility library that downloads and prepares public datasets. We do not host or distribute these datasets, vouch for their quality or fairness, or claim that you have license to use the dataset. It is your responsibility to determine whether you have permission to use the dataset under the dataset's license.
123
+
124
+ If you're a dataset owner and wish to update any part of it (description, citation, etc.), or do not want your dataset to be included in this library, please get in touch through a GitHub issue. Thanks for your contribution to the ML community!
125
+
126
+ Pre-trained Model License
127
+ -------------------------
128
+
129
+ The pre-trained models provided in this library may have their own licenses or terms and conditions derived from the dataset used for training. It is your responsibility to determine whether you have permission to use the models for your use case.
130
+
131
+ For instance, SquimSubjective model is released under the Creative Commons Attribution Non Commercial 4.0 International (CC-BY-NC 4.0) license. See [the link](https://zenodo.org/record/4660670#.ZBtWPOxuerN) for additional details.
132
+
133
+ Other pre-trained models that have different license are noted in documentation. Please checkout the [documentation page](https://pytorch.org/audio/main/).
source/torchaudio-2.9.1.dist-info/RECORD ADDED
@@ -0,0 +1,166 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ torchaudio-2.9.1.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
2
+ torchaudio-2.9.1.dist-info/METADATA,sha256=nhbW4EcMPskMVtj653CC9bZ2xsogo4xjD7mkmp7K9Sg,6911
3
+ torchaudio-2.9.1.dist-info/RECORD,,
4
+ torchaudio-2.9.1.dist-info/WHEEL,sha256=VXvNKn6nFeCM45GEUrNLJOO_J_e-cNJphGt9rWFxyE0,113
5
+ torchaudio-2.9.1.dist-info/licenses/LICENSE,sha256=k6WIYahYzBCOa2uDPgjnbosqZjOeSoAHyKWowf-cQNY,1338
6
+ torchaudio-2.9.1.dist-info/top_level.txt,sha256=mPKWMIRWWW2JwbJN6wRckeN1gpbjhifapAF0Z9t7SMo,11
7
+ torchaudio/__init__.py,sha256=8OB3EPGCViF7LgBWy_bUyZUF6HJUIpbTI8ouRGwn6lU,7878
8
+ torchaudio/__pycache__/__init__.cpython-312.pyc,,
9
+ torchaudio/__pycache__/_torchcodec.cpython-312.pyc,,
10
+ torchaudio/__pycache__/version.cpython-312.pyc,,
11
+ torchaudio/_extension/__init__.py,sha256=A8oH7eF2Fx4d68LddkFE1Ylq3AE3X2sgZdXjvaMEdjQ,1905
12
+ torchaudio/_extension/__pycache__/__init__.cpython-312.pyc,,
13
+ torchaudio/_extension/__pycache__/utils.cpython-312.pyc,,
14
+ torchaudio/_extension/utils.py,sha256=UQCObmKAsgdHhXU2dQYYxyFXwfdTsBO9bnrQmpQNN_I,4926
15
+ torchaudio/_internal/__init__.py,sha256=gjU8g9HhVd9hHrHXJM0xOlZL6cT8ktO60MN8RHI6ZbA,241
16
+ torchaudio/_internal/__pycache__/__init__.cpython-312.pyc,,
17
+ torchaudio/_internal/__pycache__/module_utils.cpython-312.pyc,,
18
+ torchaudio/_internal/module_utils.py,sha256=eosQSGtN5WhHhATJGBWJIGUM_nvtgLPRkQ8BH_Zd53o,5229
19
+ torchaudio/_torchcodec.py,sha256=Z1TpONctbL80DufuWhLRj4dC0rVhjKu6hOYeglcLwvU,13424
20
+ torchaudio/compliance/__init__.py,sha256=hhNObUS0c-fS-VMudM7zl3-CvupvCDmESlikntSMn5g,48
21
+ torchaudio/compliance/__pycache__/__init__.cpython-312.pyc,,
22
+ torchaudio/compliance/__pycache__/kaldi.cpython-312.pyc,,
23
+ torchaudio/compliance/kaldi.py,sha256=XL6hpYTd6nSPb2imIdeU4TM06I2fqh1AmG968y8ZbSk,36666
24
+ torchaudio/datasets/__init__.py,sha256=taRr3duDaEK1Pfzj9N1dFuZpXfy8e4uFItcJiRLAQwQ,1171
25
+ torchaudio/datasets/__pycache__/__init__.cpython-312.pyc,,
26
+ torchaudio/datasets/__pycache__/cmuarctic.cpython-312.pyc,,
27
+ torchaudio/datasets/__pycache__/cmudict.cpython-312.pyc,,
28
+ torchaudio/datasets/__pycache__/commonvoice.cpython-312.pyc,,
29
+ torchaudio/datasets/__pycache__/dr_vctk.cpython-312.pyc,,
30
+ torchaudio/datasets/__pycache__/fluentcommands.cpython-312.pyc,,
31
+ torchaudio/datasets/__pycache__/gtzan.cpython-312.pyc,,
32
+ torchaudio/datasets/__pycache__/iemocap.cpython-312.pyc,,
33
+ torchaudio/datasets/__pycache__/librilight_limited.cpython-312.pyc,,
34
+ torchaudio/datasets/__pycache__/librimix.cpython-312.pyc,,
35
+ torchaudio/datasets/__pycache__/librispeech.cpython-312.pyc,,
36
+ torchaudio/datasets/__pycache__/librispeech_biasing.cpython-312.pyc,,
37
+ torchaudio/datasets/__pycache__/libritts.cpython-312.pyc,,
38
+ torchaudio/datasets/__pycache__/ljspeech.cpython-312.pyc,,
39
+ torchaudio/datasets/__pycache__/musdb_hq.cpython-312.pyc,,
40
+ torchaudio/datasets/__pycache__/quesst14.cpython-312.pyc,,
41
+ torchaudio/datasets/__pycache__/snips.cpython-312.pyc,,
42
+ torchaudio/datasets/__pycache__/speechcommands.cpython-312.pyc,,
43
+ torchaudio/datasets/__pycache__/tedlium.cpython-312.pyc,,
44
+ torchaudio/datasets/__pycache__/utils.cpython-312.pyc,,
45
+ torchaudio/datasets/__pycache__/vctk.cpython-312.pyc,,
46
+ torchaudio/datasets/__pycache__/voxceleb1.cpython-312.pyc,,
47
+ torchaudio/datasets/__pycache__/yesno.cpython-312.pyc,,
48
+ torchaudio/datasets/cmuarctic.py,sha256=2e5Oh_jDHRs8ORhNONsD9NhI_OfQSHDLQAM-tWpgZ-U,7081
49
+ torchaudio/datasets/cmudict.py,sha256=9OEpNDYpyqeEyinAnyGIU8FampDj7ziSOHRwJLIlq2M,5990
50
+ torchaudio/datasets/commonvoice.py,sha256=9khedUCmdEkCKPU6_r8VWz6I2VdJokatuziZ6BxJMZs,2763
51
+ torchaudio/datasets/dr_vctk.py,sha256=Km4-tKllAgnOKCuq66YRWhTlNWmC7D0Xz3dAttRRGSo,4377
52
+ torchaudio/datasets/fluentcommands.py,sha256=u3tkO4-AAaTWdbRQi6lIvad4x2plZgXM39KljGtmRsw,3245
53
+ torchaudio/datasets/gtzan.py,sha256=I5dRP_QGuQ1joXWRwZwtvpwi22uZTb8QZm9Mr2W55Mg,24357
54
+ torchaudio/datasets/iemocap.py,sha256=X_WCoXOzRqcWRRRoUtY0AlD9SJcUUOACIcgbV0irt48,4930
55
+ torchaudio/datasets/librilight_limited.py,sha256=fAwpX0hEMze5aV57BP7rjBLwRiZa3Aje_NXi_3o16wA,4179
56
+ torchaudio/datasets/librimix.py,sha256=VtKOhf6VJc1ysWCvUvh0SbtjOkXJChmBM_BhoSkg_2A,5116
57
+ torchaudio/datasets/librispeech.py,sha256=zkzJFWchWs4AktYAI-ghmWH4ZeJ84C0uDo9E1_pTgSI,6308
58
+ torchaudio/datasets/librispeech_biasing.py,sha256=d-02tyrXI-CSGbXBFYFcnM_yT8WSGABHfpNiFxyadL0,6958
59
+ torchaudio/datasets/libritts.py,sha256=EtWOoCDz7_qGLZF5YcZfnHaLxH4Y8QJCnopafLiqFno,5870
60
+ torchaudio/datasets/ljspeech.py,sha256=92NeLQsC1iKpqfiMkKKbcJDpaYdZKVdVEBQJze1wmxY,3494
61
+ torchaudio/datasets/musdb_hq.py,sha256=TYKjpat6JKr9bkFqUecu7_hRdshRfQP2UbknaYR3Q0U,5075
62
+ torchaudio/datasets/quesst14.py,sha256=QyGd4fMS820ATbP8YgBtu7bSSK09pw5RZklsPJ8Jf0Y,4455
63
+ torchaudio/datasets/snips.py,sha256=WaYUknGFM3rnLklOj5ZYHSX5mhlf_Ce4p3LBZdA9yJc,5008
64
+ torchaudio/datasets/speechcommands.py,sha256=cLSgiVYlQjEOuYPpFeAtcXSGirraH4IMoP8p9WIvUoY,7481
65
+ torchaudio/datasets/tedlium.py,sha256=a8Hf2QvOki7_chgXcMAFMk-piTjodktfnc3HRbUVJkU,8698
66
+ torchaudio/datasets/utils.py,sha256=P6nckh2YrAfOPMphHlxyfI-HBmNg39DTlxQ8-asG4MY,1703
67
+ torchaudio/datasets/vctk.py,sha256=twR_n8LyQcT8A_HrJoMx3RkaVrRXXZAnIVU1d0E0npQ,5699
68
+ torchaudio/datasets/voxceleb1.py,sha256=9vU0ftB4-2usO8ZiEUKR_IQTEdHhA0M8l9scXCNehnw,11725
69
+ torchaudio/datasets/yesno.py,sha256=4sgfMeSxz8HaRDk6A2UIFP-20q29MwEO_r8DoEtfbvE,3026
70
+ torchaudio/functional/__init__.py,sha256=_5eT3FZFO6GXmKqFkPY4c_w7F7Isqnd8CTP2FdMxfVM,2451
71
+ torchaudio/functional/__pycache__/__init__.cpython-312.pyc,,
72
+ torchaudio/functional/__pycache__/_alignment.cpython-312.pyc,,
73
+ torchaudio/functional/__pycache__/filtering.cpython-312.pyc,,
74
+ torchaudio/functional/__pycache__/functional.cpython-312.pyc,,
75
+ torchaudio/functional/_alignment.py,sha256=NveQ74x8PmleuB-Ka9eEYYyshbV7nYc0g-Tu3NGHdz0,4739
76
+ torchaudio/functional/filtering.py,sha256=rML8MismfehSeglw65kUkfugoP6XDtWcs_XhCl6aJM4,62325
77
+ torchaudio/functional/functional.py,sha256=5l-07BLVAs1PNU8NM2CPV_GTnq3V8nbV9tI7t0v79Y4,94731
78
+ torchaudio/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
79
+ torchaudio/lib/__pycache__/__init__.cpython-312.pyc,,
80
+ torchaudio/lib/_torchaudio.so,sha256=70IIA3F6xzE4xCzUrpWAGTfHu1KqWzIM1KxU4dDTTsg,171552
81
+ torchaudio/lib/libctc_prefix_decoder.so,sha256=1DlwTtNZXE_P0zsHvoVP7mBzpSpUJFaSSaaQjg8oX0E,6268592
82
+ torchaudio/lib/libtorchaudio.so,sha256=L8j8al4FAtxyb5w5Arp5eo-jpVlM2HcswimRW7c2mII,2573624
83
+ torchaudio/lib/pybind11_prefixctc.so,sha256=VdBdtmt8GU2y1ulK-S4oLR5jWYA5K0PnlEBSqrV4F3A,277688
84
+ torchaudio/models/__init__.py,sha256=BNMNGuwpJAFRsdtwHYQ6slGClkrUTu31_7mXh7FjeV4,1995
85
+ torchaudio/models/__pycache__/__init__.cpython-312.pyc,,
86
+ torchaudio/models/__pycache__/_hdemucs.cpython-312.pyc,,
87
+ torchaudio/models/__pycache__/conformer.cpython-312.pyc,,
88
+ torchaudio/models/__pycache__/conv_tasnet.cpython-312.pyc,,
89
+ torchaudio/models/__pycache__/deepspeech.cpython-312.pyc,,
90
+ torchaudio/models/__pycache__/emformer.cpython-312.pyc,,
91
+ torchaudio/models/__pycache__/rnnt.cpython-312.pyc,,
92
+ torchaudio/models/__pycache__/rnnt_decoder.cpython-312.pyc,,
93
+ torchaudio/models/__pycache__/tacotron2.cpython-312.pyc,,
94
+ torchaudio/models/__pycache__/wav2letter.cpython-312.pyc,,
95
+ torchaudio/models/__pycache__/wavernn.cpython-312.pyc,,
96
+ torchaudio/models/_hdemucs.py,sha256=VPnQ73lA9lfAxRjZ85NCGJYP36mPNwTjS-TU4qelu_k,38242
97
+ torchaudio/models/conformer.py,sha256=5IceU-jcZKofkHTTqRKoytubQ75MzZPrPlfkLsIlxeA,10068
98
+ torchaudio/models/conv_tasnet.py,sha256=v-DI_Ej9FCBBbSH-Spkh3tzq8rkBhbQNA-Wp52Uf32E,12540
99
+ torchaudio/models/decoder/__init__.py,sha256=HxU2Bgyea0No8SORRfxgMZNwwEDTrjlT3bDW_GxzpTU,1899
100
+ torchaudio/models/decoder/__pycache__/__init__.cpython-312.pyc,,
101
+ torchaudio/models/decoder/__pycache__/_ctc_decoder.cpython-312.pyc,,
102
+ torchaudio/models/decoder/__pycache__/_cuda_ctc_decoder.cpython-312.pyc,,
103
+ torchaudio/models/decoder/_ctc_decoder.py,sha256=AmLQAcm4Q4bFPqnq-SF7Lpvg2QPK88xyio8ol_OJjvU,20086
104
+ torchaudio/models/decoder/_cuda_ctc_decoder.py,sha256=xFrj1cTEsS-MxAO5Vgdutcb3kTb7Jv-OFhS6cmfFKhA,7186
105
+ torchaudio/models/deepspeech.py,sha256=kQW3B6YcjYuq7xRzWjRJFGr7ZNraY9gMYDTxII7Cgtg,2746
106
+ torchaudio/models/emformer.py,sha256=ncDeEcYegUmIKQoDBoufUhVWj4dYpZAXxLX0qmEqt1A,37766
107
+ torchaudio/models/rnnt.py,sha256=jz66nwDd1qGT6KQR1lbA_urPktygewhm0FH66T7P3Ek,35541
108
+ torchaudio/models/rnnt_decoder.py,sha256=IwlDsuw1SA-uCRrXGMBqm05auGFSha2bZ-8BOImnK0c,12839
109
+ torchaudio/models/squim/__init__.py,sha256=b98nAaL28Q4w3lrqd_6wUd0An-xNhhJn4Tj8oZlzQnc,346
110
+ torchaudio/models/squim/__pycache__/__init__.cpython-312.pyc,,
111
+ torchaudio/models/squim/__pycache__/objective.cpython-312.pyc,,
112
+ torchaudio/models/squim/__pycache__/subjective.cpython-312.pyc,,
113
+ torchaudio/models/squim/objective.py,sha256=gvUasz7RpqgKeGf04yHUotshSIzH3KzjW90-iHeDo2g,12281
114
+ torchaudio/models/squim/subjective.py,sha256=N00kILSPm0akWyNsrNYKmHgZmooo8gbyUm5IVLf7bx8,5797
115
+ torchaudio/models/tacotron2.py,sha256=FimYhGSI8FKwWb87CLk4h3yKWatCU2HvFmU1t5WUn4E,45914
116
+ torchaudio/models/wav2letter.py,sha256=KNcq4p0qZG2Bwfdakv7YwLCvi_yGT-qB4fJwGMuFQhg,3278
117
+ torchaudio/models/wav2vec2/__init__.py,sha256=WlafukV6GwuSNh0CZifrYUt4V5l59kjvGX7AZNonjfk,927
118
+ torchaudio/models/wav2vec2/__pycache__/__init__.cpython-312.pyc,,
119
+ torchaudio/models/wav2vec2/__pycache__/components.cpython-312.pyc,,
120
+ torchaudio/models/wav2vec2/__pycache__/model.cpython-312.pyc,,
121
+ torchaudio/models/wav2vec2/__pycache__/wavlm_attention.cpython-312.pyc,,
122
+ torchaudio/models/wav2vec2/components.py,sha256=DRmW-GHYf-JReCg_0l1ovNWJBnAavePO3S2vPY-1ze4,47077
123
+ torchaudio/models/wav2vec2/model.py,sha256=Z2VN6KbDOOdq5JtP7lxPQebwYqsxKms1Eu4IjDJtZaQ,60092
124
+ torchaudio/models/wav2vec2/utils/__init__.py,sha256=qmMbz4HAN5kEEyl4cSGm_JQZI47beyh4witydPC_qns,181
125
+ torchaudio/models/wav2vec2/utils/__pycache__/__init__.cpython-312.pyc,,
126
+ torchaudio/models/wav2vec2/utils/__pycache__/import_fairseq.cpython-312.pyc,,
127
+ torchaudio/models/wav2vec2/utils/__pycache__/import_huggingface.cpython-312.pyc,,
128
+ torchaudio/models/wav2vec2/utils/import_fairseq.py,sha256=oCwG6qpG0bCXue2V56fjDcC8cA2rgy4b3O_nu_FI9ZY,9198
129
+ torchaudio/models/wav2vec2/utils/import_huggingface.py,sha256=1nVCipp-lOUAyl_-P103DWLUeTOZi9X_ffX93bOXxEk,5946
130
+ torchaudio/models/wav2vec2/wavlm_attention.py,sha256=1DU_pkoLCeHQwSF4lJ06cez0PsMVoXNxiYKP0Yv0qFQ,10844
131
+ torchaudio/models/wavernn.py,sha256=5xUyao5g69jRXX4ReNi4mP_aTSIonJPP6XcPrqKybEk,15446
132
+ torchaudio/pipelines/__init__.py,sha256=Xy8NmInKwTcNBHwLTTjHjrfczRLuQq8a67ENt1OTVXM,2745
133
+ torchaudio/pipelines/__pycache__/__init__.cpython-312.pyc,,
134
+ torchaudio/pipelines/__pycache__/_source_separation_pipeline.cpython-312.pyc,,
135
+ torchaudio/pipelines/__pycache__/_squim_pipeline.cpython-312.pyc,,
136
+ torchaudio/pipelines/__pycache__/rnnt_pipeline.cpython-312.pyc,,
137
+ torchaudio/pipelines/_source_separation_pipeline.py,sha256=ogWakvaOv6OegmREcbagvfIm0jNWjzEtsdMYTialRNk,4225
138
+ torchaudio/pipelines/_squim_pipeline.py,sha256=852SYXqUZDgTPegL7LqgVQr0PXG94da_DTDF2bwDhVE,6282
139
+ torchaudio/pipelines/_tts/__init__.py,sha256=PP7l8XzVURqelwuMJFgfOCv4fvzZunDiy90ZQlRkv7g,426
140
+ torchaudio/pipelines/_tts/__pycache__/__init__.cpython-312.pyc,,
141
+ torchaudio/pipelines/_tts/__pycache__/impl.cpython-312.pyc,,
142
+ torchaudio/pipelines/_tts/__pycache__/interface.cpython-312.pyc,,
143
+ torchaudio/pipelines/_tts/__pycache__/utils.cpython-312.pyc,,
144
+ torchaudio/pipelines/_tts/impl.py,sha256=Tig4_5sITJADwxN5eZGek7Ath_-e3sV8CTM5t6UpeUU,15374
145
+ torchaudio/pipelines/_tts/interface.py,sha256=yUaS0UK3PTRruYXRWFil7lAhr-1iYiyBaDBLmEnJPUQ,10224
146
+ torchaudio/pipelines/_tts/utils.py,sha256=KGrFoetCZ4l4FJkINFptAc8Pvrbo9e4QQhCIMCp8NYY,4810
147
+ torchaudio/pipelines/_wav2vec2/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
148
+ torchaudio/pipelines/_wav2vec2/__pycache__/__init__.cpython-312.pyc,,
149
+ torchaudio/pipelines/_wav2vec2/__pycache__/aligner.cpython-312.pyc,,
150
+ torchaudio/pipelines/_wav2vec2/__pycache__/impl.cpython-312.pyc,,
151
+ torchaudio/pipelines/_wav2vec2/__pycache__/utils.cpython-312.pyc,,
152
+ torchaudio/pipelines/_wav2vec2/aligner.py,sha256=pIWRgQ-kdYUxtL8bdc0qk9wBjwRrHY1uSWL3L4e2vxs,2709
153
+ torchaudio/pipelines/_wav2vec2/impl.py,sha256=zdXFjytJO5MvnB-3aygzUUFKxCTkQGU_OX_rhUh9c0k,65561
154
+ torchaudio/pipelines/_wav2vec2/utils.py,sha256=Q8_fWOR2JDnHu0TTRmHzRjI3BOJa0hGIAl0cjtALgsQ,6971
155
+ torchaudio/pipelines/rnnt_pipeline.py,sha256=56nQnCcjY4xewDqXR1Rkrh_hyoK42CsYumpU8mUNs1w,13753
156
+ torchaudio/transforms/__init__.py,sha256=8_47qPRjXNg332f2kcNP_T5UXCn6jQmUUMkIgyIByjY,1398
157
+ torchaudio/transforms/__pycache__/__init__.cpython-312.pyc,,
158
+ torchaudio/transforms/__pycache__/_multi_channel.cpython-312.pyc,,
159
+ torchaudio/transforms/__pycache__/_transforms.cpython-312.pyc,,
160
+ torchaudio/transforms/_multi_channel.py,sha256=GZ2rrwFt2KtSG7At7kS9Bqh1KmYYw0HwcUnEjc-AWr8,22221
161
+ torchaudio/transforms/_transforms.py,sha256=i-xEARqCfnaDk9b0yzmYkPo9Gg1N1iKvZiLSMdX14-Q,86919
162
+ torchaudio/utils/__init__.py,sha256=adAdfYm9DJBC2JXxRCTrjxOUU1vKJ9w3rFke-DzKKqU,70
163
+ torchaudio/utils/__pycache__/__init__.cpython-312.pyc,,
164
+ torchaudio/utils/__pycache__/download.cpython-312.pyc,,
165
+ torchaudio/utils/download.py,sha256=gZA7CijUoAu3Q0Qd6dKpFQAEjcdnxR6xOT59lTgEIOo,2883
166
+ torchaudio/version.py,sha256=sBUsm0oAwNdEIPgVIrOs5KflkkUDSnUNB0usP957SGE,85
source/torchaudio-2.9.1.dist-info/WHEEL ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (80.9.0)
3
+ Root-Is-Purelib: false
4
+ Tag: cp312-cp312-manylinux_2_28_x86_64
5
+
source/torchaudio-2.9.1.dist-info/licenses/LICENSE ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ BSD 2-Clause License
2
+
3
+ Copyright (c) 2017 Facebook Inc. (Soumith Chintala),
4
+ All rights reserved.
5
+
6
+ Redistribution and use in source and binary forms, with or without
7
+ modification, are permitted provided that the following conditions are met:
8
+
9
+ * Redistributions of source code must retain the above copyright notice, this
10
+ list of conditions and the following disclaimer.
11
+
12
+ * Redistributions in binary form must reproduce the above copyright notice,
13
+ this list of conditions and the following disclaimer in the documentation
14
+ and/or other materials provided with the distribution.
15
+
16
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
20
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
source/torchaudio-2.9.1.dist-info/top_level.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ torchaudio
source/torchaudio/__init__.py ADDED
@@ -0,0 +1,204 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from typing import BinaryIO, Optional, Tuple, Union
3
+
4
+ import torch
5
+
6
+ # Initialize extension and backend first
7
+ from . import _extension # noqa # usort: skip
8
+ from . import compliance, datasets, functional, models, pipelines, transforms, utils # noqa: F401
9
+ from ._torchcodec import load_with_torchcodec, save_with_torchcodec
10
+
11
+
12
# Version metadata is generated at build time; it may be absent in
# development / editable installs, in which case we simply omit it.
try:
    from .version import __version__, git_version  # noqa: F401
except ImportError:
    pass
16
+
17
+
18
def load(
    uri: Union[BinaryIO, str, os.PathLike],
    frame_offset: int = 0,
    num_frames: int = -1,
    normalize: bool = True,
    channels_first: bool = True,
    format: Optional[str] = None,
    buffer_size: int = 4096,
    backend: Optional[str] = None,
) -> Tuple[torch.Tensor, int]:
    """Decode audio from ``uri`` using TorchCodec's ``AudioDecoder``.

    .. note::

        Since TorchAudio 2.9 this function is a thin convenience wrapper around
        TorchCodec's decoding machinery. For better performance, consider using
        ``torchcodec.decoders.AudioDecoder`` directly:
        https://docs.pytorch.org/torchcodec/stable/generated/torchcodec.decoders.AudioDecoder.
        Because of that delegation, the ``normalize``, ``buffer_size``, and
        ``backend`` parameters are ignored and accepted only for backwards
        compatibility.
        To install torchcodec, follow the instructions at
        https://github.com/pytorch/torchcodec#installing-torchcodec.

    Args:
        uri (path-like object or file-like object):
            Audio source. Either a file path / URL, or an object exposing a
            ``read(size: int) -> bytes`` method.
        frame_offset (int, optional):
            Number of samples to skip before reading starts.
        num_frames (int, optional):
            Maximum number of samples to read; ``-1`` (default) reads every
            remaining sample after ``frame_offset``.
        normalize (bool, optional):
            Ignored — TorchCodec always produces normalized float32 samples.
            A warning is issued if set to ``False``. Default: ``True``.
        channels_first (bool, optional):
            If ``True`` the result is shaped `[channel, time]`; otherwise
            `[time, channel]`.
        format (str or None, optional):
            Format hint for the decoder; may not be honored by every TorchCodec
            decoder. (Default: ``None``)
        buffer_size (int, optional):
            Ignored by TorchCodec; kept for API compatibility.
        backend (str or None, optional):
            Ignored by TorchCodec; kept for API compatibility.

    Returns:
        (torch.Tensor, int): The decoded waveform and its sample rate.
        Always a float32 tensor, shaped `[channel, time]` when
        ``channels_first=True`` and `[time, channel]` otherwise.

    Raises:
        ImportError: If torchcodec is not available.
        ValueError: If unsupported parameters are used.
        RuntimeError: If TorchCodec fails to decode the audio.

    Note:
        - TorchCodec always returns normalized float32 samples, so ``normalize``
          has no effect.
        - ``buffer_size`` and ``backend`` are ignored.
        - Not every format supported by the legacy torchaudio backends is
          necessarily supported by TorchCodec.
    """
    # Forward everything to the TorchCodec-backed implementation unchanged.
    decode_kwargs = dict(
        frame_offset=frame_offset,
        num_frames=num_frames,
        normalize=normalize,
        channels_first=channels_first,
        format=format,
        buffer_size=buffer_size,
        backend=backend,
    )
    return load_with_torchcodec(uri, **decode_kwargs)
96
+
97
+
98
def save(
    uri: Union[str, os.PathLike],
    src: torch.Tensor,
    sample_rate: int,
    channels_first: bool = True,
    format: Optional[str] = None,
    encoding: Optional[str] = None,
    bits_per_sample: Optional[int] = None,
    buffer_size: int = 4096,
    backend: Optional[str] = None,
    compression: Optional[Union[float, int]] = None,
) -> None:
    """Encode ``src`` and write it to ``uri`` using TorchCodec's ``AudioEncoder``.

    .. note::

        Since TorchAudio 2.9 this function is a thin convenience wrapper around
        TorchCodec's encoding machinery. For better performance, consider using
        ``torchcodec.encoders.AudioEncoder`` directly:
        https://docs.pytorch.org/torchcodec/stable/generated/torchcodec.encoders.AudioEncoder.
        Because of that delegation, the ``format``, ``encoding``,
        ``bits_per_sample``, ``buffer_size``, and ``backend`` parameters are
        ignored and accepted only for backwards compatibility.
        To install torchcodec, follow the instructions at
        https://github.com/pytorch/torchcodec#installing-torchcodec.

    Args:
        uri (path-like object):
            Destination path; the file extension determines the output format.
        src (torch.Tensor):
            Audio to save. 1D or 2D float32 tensor with values in [-1, 1].
            2D input is `[channel, time]` when ``channels_first=True``,
            `[time, channel]` otherwise.
        sample_rate (int):
            Sample rate of the audio data.
        channels_first (bool, optional):
            Layout of a 2D ``src`` (see above). Default: ``True``.
        format (str or None, optional):
            Ignored — TorchCodec derives the format from the file extension.
            A warning is issued if provided. Default: ``None``.
        encoding (str or None, optional):
            Not fully supported by TorchCodec; a warning is issued if provided.
            Default: ``None``.
        bits_per_sample (int or None, optional):
            Not directly supported by TorchCodec; a warning is issued if
            provided. Default: ``None``.
        buffer_size (int, optional):
            Ignored; a warning is issued for non-default values. Default: 4096.
        backend (str or None, optional):
            Ignored; a warning is issued if provided. Default: ``None``.
        compression (float, int or None, optional):
            Compression level / bit rate; forwarded as TorchCodec's
            ``bit_rate``. Default: ``None``.

    Raises:
        ImportError: If torchcodec is not available.
        ValueError: If input parameters are invalid.
        RuntimeError: If TorchCodec fails to encode the audio.

    Note:
        - TorchCodec expects float32 samples in the [-1, 1] range.
        - Several parameters exist purely for API compatibility (see above).
        - TorchCodec uses FFmpeg under the hood for encoding.
    """
    # Forward everything to the TorchCodec-backed implementation unchanged.
    encode_kwargs = dict(
        channels_first=channels_first,
        format=format,
        encoding=encoding,
        bits_per_sample=bits_per_sample,
        buffer_size=buffer_size,
        backend=backend,
        compression=compression,
    )
    return save_with_torchcodec(uri, src, sample_rate, **encode_kwargs)
189
+ )
190
+
191
+
192
# Public surface of the top-level ``torchaudio`` namespace.
__all__ = [
    "load",
    "load_with_torchcodec",
    "save_with_torchcodec",
    "save",
    "compliance",
    "datasets",
    "functional",
    "models",
    "pipelines",
    "utils",
    "transforms",
]
source/torchaudio/_extension/__init__.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Load torchaudio's native extension and expose feature-availability flags."""
import logging
import os
import sys

from torchaudio._internal.module_utils import fail_with_message, is_module_available, no_op

from .utils import _check_cuda_version, _init_dll_path, _load_lib

_LG = logging.getLogger(__name__)


# Note:
# `_check_cuda_version` is not meant to be used by regular users.
# Builder uses it for debugging purpose, so we export it.
# https://github.com/pytorch/builder/blob/e2e4542b8eb0bdf491214451a1a4128bd606cce2/test/smoke_test/smoke_test.py#L80
__all__ = [
    "_check_cuda_version",
    "_IS_TORCHAUDIO_EXT_AVAILABLE",
    "_IS_RIR_AVAILABLE",
]


# NOTE(review): the guard limits the DLL-path workaround to Python 3.8 on
# Windows — presumably because of 3.8's DLL-resolution changes; confirm before
# changing the version bounds.
if os.name == "nt" and (3, 8) <= sys.version_info < (3, 9):
    _init_dll_path()


# When the extension module is built, we initialize it.
# In case of an error, we do not catch the failure as it suggests there is something
# wrong with the installation.
_IS_TORCHAUDIO_EXT_AVAILABLE = is_module_available("torchaudio.lib._torchaudio")
# RIR features are implemented in _torchaudio extension, but they can be individually
# turned on/off at build time. Available means that _torchaudio is loaded properly, and
# RIR features are found there.
_IS_RIR_AVAILABLE = False
_IS_ALIGN_AVAILABLE = False
if _IS_TORCHAUDIO_EXT_AVAILABLE:
    _load_lib("libtorchaudio")

    import torchaudio.lib._torchaudio  # noqa

    # Raises if PyTorch and TorchAudio were built against different CUDA versions.
    _check_cuda_version()
    _IS_RIR_AVAILABLE = torchaudio.lib._torchaudio.is_rir_available()
    _IS_ALIGN_AVAILABLE = torchaudio.lib._torchaudio.is_align_available()


# Decorators: no-ops when the corresponding feature was compiled in,
# otherwise they raise an informative error at call time.
fail_if_no_rir = (
    no_op
    if _IS_RIR_AVAILABLE
    else fail_with_message(
        "requires RIR extension, but TorchAudio is not compiled with it. Please build TorchAudio with RIR support."
    )
)

fail_if_no_align = (
    no_op
    if _IS_ALIGN_AVAILABLE
    else fail_with_message(
        "Requires alignment extension, but TorchAudio is not compiled with it. \
        Please build TorchAudio with alignment support."
    )
)
source/torchaudio/_extension/utils.py ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Module to implement logics used for initializing extensions.
2
+
3
+ The implementations here should be stateless.
4
+ They should not depend on external state.
5
+ Anything that depends on external state should happen in __init__.py
6
+ """
7
+ import logging
8
+ import os
9
+ import types
10
+ from pathlib import Path
11
+
12
+ import torch
13
+
14
_LG = logging.getLogger(__name__)
# Directory holding the bundled native libraries: <package root>/torchaudio/lib
_LIB_DIR = Path(__file__).parent.parent / "lib"
16
+
17
+
18
def _get_lib_path(lib: str):
    """Return the expected on-disk path of the bundled native library *lib*."""
    # Windows extensions use the ``.pyd`` suffix; everything else uses ``.so``.
    if os.name == "nt":
        ext = "pyd"
    else:
        ext = "so"
    return _LIB_DIR / f"{lib}.{ext}"
22
+
23
+
24
def _load_lib(lib: str) -> bool:
    """Load extension module

    Note:
        In case `torchaudio` is deployed with `pex` format, the library file
        is not in a standard location.
        In this case, we expect that `libtorchaudio` is available somewhere
        in the search path of dynamic loading mechanism, so that importing
        `_torchaudio` will have library loader find and load `libtorchaudio`.
        This is the reason why the function should not raise an error when the
        library file is not found.

    Returns:
        bool:
            True if the library file is found AND the library loaded without failure.
            False if the library file is not found (like in the case where torchaudio
            is deployed with pex format, thus the shared library file is
            in a non-standard location.).
            If the library file is found but there is an issue loading the library,
            (such as missing dependency) then this function raises the exception as-is.

    Raises:
        Exception:
            If the library file is found, but there is an issue loading the library file,
            (when underlying `ctype.DLL` throws an exception), this function will pass
            the exception as-is, instead of catching it and returning bool.
            The expected case is `OSError` thrown by `ctype.DLL` when a dynamic dependency
            is not found.
            This behavior was chosen because the expected failure case is not recoverable.
            If a dependency is missing, then users have to install it.
    """
    path = _get_lib_path(lib)
    if not path.exists():
        # Non-standard deployment (e.g. pex): rely on the dynamic loader instead.
        return False
    torch.ops.load_library(path)
    return True
59
+ return True
60
+
61
+
62
class _LazyImporter(types.ModuleType):
    """Module proxy that defers the real import until first attribute access.

    Args:
        name: Name to report for this module object.
        import_func: Zero-argument callable that performs the actual import
            and returns the imported module.
    """

    def __init__(self, name, import_func):
        super().__init__(name)
        self.import_func = import_func
        self.module = None

    # Note:
    # Python caches what was retrieved with `__getattr__`, so this method will not be
    # called again for the same item.
    def __getattr__(self, item):
        self._import_once()
        return getattr(self.module, item)

    def __repr__(self):
        if self.module is None:
            # Bug fix: use ``self.__name__`` (stored by ModuleType.__init__)
            # instead of ``self.name``. ``name`` is not a real attribute, so
            # reading it went through ``__getattr__`` and triggered the very
            # import this branch is meant to avoid (and raised AttributeError
            # when the underlying module had no ``name`` attribute).
            return f"<module '{self.__module__}.{self.__class__.__name__}(\"{self.__name__}\")'>"
        return repr(self.module)

    def __dir__(self):
        self._import_once()
        return dir(self.module)

    def _import_once(self):
        if self.module is None:
            self.module = self.import_func()
            # Note:
            # By attaching the module attributes to self,
            # module attributes are directly accessible.
            # This allows to avoid calling __getattr__ for every attribute access.
            self.__dict__.update(self.module.__dict__)

    def is_available(self):
        """Return True if the deferred import succeeds (importing as a side effect)."""
        try:
            self._import_once()
        except Exception:
            return False
        return True
100
+ return True
101
+
102
+
103
def _init_dll_path():
    """Register every existing directory on ``PATH`` as a DLL search location.

    On Windows, Python 3.8+ provides ``os.add_dll_directory``, which must be
    called so that CUDA-related DLLs (e.g. from a conda environment's bin
    directory) can be resolved. See
    https://stackoverflow.com/questions/59330863/cant-import-dll-module-in-python
    Entries that cannot be registered are silently skipped.
    """
    for entry in os.environ.get("PATH", "").split(";"):
        if not os.path.exists(entry):
            continue
        try:
            os.add_dll_directory(entry)
        except Exception:
            # Best-effort: ignore directories that cannot be added.
            pass
116
+
117
+
118
def _check_cuda_version():
    """Verify PyTorch and TorchAudio were compiled against the same CUDA version.

    Returns:
        The CUDA version integer reported by the torchaudio extension
        (``None`` for CPU-only builds).

    Raises:
        RuntimeError: If both PyTorch and TorchAudio were built with CUDA but
            their major.minor versions differ.
    """
    import torchaudio.lib._torchaudio

    version = torchaudio.lib._torchaudio.cuda_version()
    if version is not None and torch.version.cuda is not None:
        version_str = str(version)
        # The extension reports CUDA as an integer, presumably encoded as
        # 1000 * major + 10 * minor (e.g. 12010 -> "12.1"); the slicing below
        # assumes a single-digit minor version — TODO confirm encoding.
        ta_version = f"{version_str[:-3]}.{version_str[-2]}"
        t_version = torch.version.cuda.split(".")
        t_version = f"{t_version[0]}.{t_version[1]}"
        if ta_version != t_version:
            raise RuntimeError(
                "Detected that PyTorch and TorchAudio were compiled with different CUDA versions. "
                f"PyTorch has CUDA version {t_version} whereas TorchAudio has CUDA version {ta_version}. "
                "Please install the TorchAudio version that matches your PyTorch version."
            )
    return version
source/torchaudio/_internal/__init__.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
# Prefer the internal (fb) implementations when available; otherwise fall back
# to the public helpers from torch.hub.
try:
    from .fb import download_url_to_file, load_state_dict_from_url
except ImportError:
    from torch.hub import download_url_to_file, load_state_dict_from_url


__all__ = [
    "load_state_dict_from_url",
    "download_url_to_file",
]
source/torchaudio/_internal/module_utils.py ADDED
@@ -0,0 +1,171 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import importlib.util
2
+ import os
3
+ import warnings
4
+ from functools import partial, wraps
5
+ from typing import Optional
6
+
7
+
8
+ def eval_env(var, default):
9
+ """Check if environment varable has True-y value"""
10
+ if var not in os.environ:
11
+ return default
12
+
13
+ val = os.environ.get(var, "0")
14
+ trues = ["1", "true", "TRUE", "on", "ON", "yes", "YES"]
15
+ falses = ["0", "false", "FALSE", "off", "OFF", "no", "NO"]
16
+ if val in trues:
17
+ return True
18
+ if val not in falses:
19
+ # fmt: off
20
+ raise RuntimeError(
21
+ f"Unexpected environment variable value `{var}={val}`. "
22
+ f"Expected one of {trues + falses}")
23
+ # fmt: on
24
+ return False
25
+
26
+
27
def is_module_available(*modules: str) -> bool:
    r"""Return whether every top-level module in ``modules`` exists *without*
    importing it. This is generally safer than a try-catch block around
    ``import X``. It avoids third party libraries breaking assumptions of some
    of our tests, e.g., setting multiprocessing start method when imported
    (see librosa/#747, torchvision/#544).
    """
    return all(importlib.util.find_spec(m) is not None for m in modules)
35
+
36
+
37
def requires_module(*modules: str):
    """Decorate a function so it raises a clear error when required optional
    modules are missing.

    This gives users a better message than the ``NameError: name 'module' is
    not defined`` they would otherwise hit at an arbitrary call site.
    """
    unavailable = [m for m in modules if importlib.util.find_spec(m) is None]

    if not unavailable:
        # All requirements are importable: leave functions untouched.
        return lambda func: func

    if len(unavailable) == 1:
        req = f"module: {unavailable[0]}"
    else:
        req = f"modules: {unavailable}"

    def decorator(func):
        @wraps(func)
        def wrapped(*args, **kwargs):
            raise RuntimeError(f"{func.__module__}.{func.__name__} requires {req}")

        return wrapped

    return decorator
61
+
62
+
63
# Registry of deprecated classes; populated by dropping_class_support below.
UNSUPPORTED = []
64
+
65
+
66
def wrap_deprecated(func, name, direction: str, version: Optional[str] = None, remove: bool = False):
    """Return ``func`` wrapped so that every call emits a deprecation warning.

    Args:
        func: Callable to wrap.
        name (str): Fully qualified name shown in the warning message.
        direction (str): Migration guidance appended to the warning.
        version (str, optional): Release in which the object will be removed.
        remove (bool): When True, append a removal notice to the warning.
    """

    @wraps(func)
    def wrapped(*args, **kwargs):
        pieces = [f"{name} has been deprecated. {direction}"]
        if remove:
            when = "a future" if version is None else "the " + str(version)
            pieces.append(f' It will be removed from {when} release. ')
        warnings.warn("".join(pieces), stacklevel=2)
        return func(*args, **kwargs)

    return wrapped
76
+
77
+
78
def deprecated(direction: str, version: Optional[str] = None, remove: bool = False):
    """Decorator to add deprecation message

    Args:
        direction (str): Migration steps to be given to users.
        version (str or int): The version when the object will be removed
        remove (bool): If enabled, append future removal message.
    """

    def decorator(func):
        qualified = f"{func.__module__}.{func.__name__}"

        @wraps(func)
        def wrapped(*args, **kwargs):
            warn_msg = f"{qualified} has been deprecated. {direction}"
            if remove:
                warn_msg += f' It will be removed from {"a future" if version is None else "the " + str(version)} release. '
            warnings.warn(warn_msg, stacklevel=2)
            return func(*args, **kwargs)

        doc_msg = "This function has been deprecated. "
        if remove:
            doc_msg += f'It will be removed from {"future" if version is None else version} release. '

        wrapped.__doc__ = f"""DEPRECATED

    .. warning::

       {doc_msg}
       {direction}

    {func.__doc__}
    """

        return wrapped

    return decorator
107
+
108
+
109
# Generic notice appended to deprecation warnings while TorchAudio moves
# into maintenance phase.
DEPRECATION_MSG = (
    "This deprecation is part of a large refactoring effort to transition TorchAudio into a maintenance phase. "
    "Please see https://github.com/pytorch/audio/issues/3902 for more information."
)

# Variant of the notice for decoding/encoding (I/O) APIs, whose capabilities
# are being consolidated into TorchCodec.
IO_DEPRECATION_MSG = (
    "This deprecation is part of a large refactoring effort to transition TorchAudio into a maintenance phase. "
    "The decoding and encoding capabilities of PyTorch for both audio"
    " and video are being consolidated into TorchCodec. "
    "Please see https://github.com/pytorch/audio/issues/3902 for more information."
)

# Ready-made decorator marking a function for removal in the 2.9 release.
dropping_support = deprecated(DEPRECATION_MSG, version="2.9", remove=True)
122
+
123
+
124
def dropping_class_support(c, msg=DEPRECATION_MSG):
    """Mark class ``c`` as deprecated for removal in the 2.9 release.

    Wraps ``c.__init__`` so every instantiation emits a deprecation warning,
    prepends a deprecation notice to the class docstring, and records the
    class in ``UNSUPPORTED``. Returns ``c`` so it can be used as a decorator.
    """
    qualified = f"{c.__module__}.{c.__name__}"
    c.__init__ = wrap_deprecated(c.__init__, qualified, msg, version="2.9", remove=True)
    c.__doc__ = f"""DEPRECATED

    .. warning::

       This class is deprecated from version 2.8. It will be removed in the 2.9 release.
       {msg}
    {c.__doc__}
    """

    UNSUPPORTED.append(c)
    return c
137
+
138
+
139
def dropping_const_support(c, msg=DEPRECATION_MSG, name=None):
    """Annotate a module-level constant/object as deprecated for 2.9 removal.

    Args:
        c: Object whose ``__doc__`` is prefixed with the deprecation notice.
        msg (str): Deprecation context appended to the notice.
        name (str, optional): Unused; kept for API compatibility.

    Returns:
        The same object ``c``, with its docstring updated.
    """
    # NOTE: fixed duplicated word ("deprecated deprecated") in the notice.
    c.__doc__ = f"""[DEPRECATED]

    .. warning::

       This object is deprecated from version 2.8. It will be removed in the 2.9 release.
       {msg}
    {c.__doc__}
    """
    return c
149
+
150
+
151
# Class decorator marking I/O-related classes for removal in 2.9 (uses the
# TorchCodec-consolidation message).
dropping_class_io_support = partial(dropping_class_support, msg=IO_DEPRECATION_MSG)

# Function decorator marking I/O-related functions for removal in 2.9.
dropping_io_support = deprecated(IO_DEPRECATION_MSG, version="2.9", remove=True)
154
+
155
+
156
def fail_with_message(message):
    """Generate a decorator that makes the decorated function raise
    ``RuntimeError``, telling users about a missing TorchAudio extension."""

    def decorator(func):
        error = f"{func.__module__}.{func.__name__} {message}"

        @wraps(func)
        def wrapped(*args, **kwargs):
            raise RuntimeError(error)

        return wrapped

    return decorator
167
+
168
+
169
def no_op(func):
    """No-op decorator. Used in place of fail_with_message when a functionality that requires extension works fine."""
    return func
source/torchaudio/_torchcodec.py ADDED
@@ -0,0 +1,340 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """TorchCodec integration for TorchAudio."""
2
+
3
+ import os
4
+ from typing import BinaryIO, Optional, Tuple, Union
5
+
6
+ import torch
7
+
8
+
9
+ def load_with_torchcodec(
10
+ uri: Union[BinaryIO, str, os.PathLike],
11
+ frame_offset: int = 0,
12
+ num_frames: int = -1,
13
+ normalize: bool = True,
14
+ channels_first: bool = True,
15
+ format: Optional[str] = None,
16
+ buffer_size: int = 4096,
17
+ backend: Optional[str] = None,
18
+ ) -> Tuple[torch.Tensor, int]:
19
+ """Load audio data from source using TorchCodec's AudioDecoder.
20
+
21
+ .. note::
22
+
23
+ This function supports the same API as :func:`~torchaudio.load`, and
24
+ relies on TorchCodec's decoding capabilities under the hood. It is
25
+ provided for convenience, but we do recommend that you port your code to
26
+ natively use ``torchcodec``'s ``AudioDecoder`` class for better
27
+ performance:
28
+ https://docs.pytorch.org/torchcodec/stable/generated/torchcodec.decoders.AudioDecoder.
29
+ As of TorchAudio 2.9, :func:`~torchaudio.load` relies on
30
+ :func:`~torchaudio.load_with_torchcodec`. Note that some parameters of
31
+ :func:`~torchaudio.load`, like ``normalize``, ``buffer_size``, and
32
+ ``backend``, are ignored by :func:`~torchaudio.load_with_torchcodec`.
33
+ To install torchcodec, follow the instructions at https://github.com/pytorch/torchcodec#installing-torchcodec.
34
+
35
+
36
+ Args:
37
+ uri (path-like object or file-like object):
38
+ Source of audio data. The following types are accepted:
39
+
40
+ * ``path-like``: File path or URL.
41
+ * ``file-like``: Object with ``read(size: int) -> bytes`` method.
42
+
43
+ frame_offset (int, optional):
44
+ Number of samples to skip before start reading data.
45
+ num_frames (int, optional):
46
+ Maximum number of samples to read. ``-1`` reads all the remaining samples,
47
+ starting from ``frame_offset``.
48
+ normalize (bool, optional):
49
+ TorchCodec always returns normalized float32 samples. This parameter
50
+ is ignored and a warning is issued if set to False.
51
+ Default: ``True``.
52
+ channels_first (bool, optional):
53
+ When True, the returned Tensor has dimension `[channel, time]`.
54
+ Otherwise, the returned Tensor's dimension is `[time, channel]`.
55
+ format (str or None, optional):
56
+ Format hint for the decoder. May not be supported by all TorchCodec
57
+ decoders. (Default: ``None``)
58
+ buffer_size (int, optional):
59
+ Not used by TorchCodec AudioDecoder. Provided for API compatibility.
60
+ backend (str or None, optional):
61
+ Not used by TorchCodec AudioDecoder. Provided for API compatibility.
62
+
63
+ Returns:
64
+ (torch.Tensor, int): Resulting Tensor and sample rate.
65
+ Always returns float32 tensors. If ``channels_first=True``, shape is
66
+ `[channel, time]`, otherwise `[time, channel]`.
67
+
68
+ Raises:
69
+ ImportError: If torchcodec is not available.
70
+ ValueError: If unsupported parameters are used.
71
+ RuntimeError: If TorchCodec fails to decode the audio.
72
+
73
+ Note:
74
+ - TorchCodec always returns normalized float32 samples, so the ``normalize``
75
+ parameter has no effect.
76
+ - The ``buffer_size`` and ``backend`` parameters are ignored.
77
+ - Not all audio formats supported by torchaudio backends may be supported
78
+ by TorchCodec.
79
+ """
80
+ # Import torchcodec here to provide clear error if not available
81
+ try:
82
+ from torchcodec.decoders import AudioDecoder
83
+ except ImportError as e:
84
+ raise ImportError(
85
+ "TorchCodec is required for load_with_torchcodec. " "Please install torchcodec to use this function."
86
+ ) from e
87
+
88
+ # Parameter validation and warnings
89
+ if not normalize:
90
+ import warnings
91
+
92
+ warnings.warn(
93
+ "TorchCodec AudioDecoder always returns normalized float32 samples. "
94
+ "The 'normalize=False' parameter is ignored.",
95
+ UserWarning,
96
+ stacklevel=2,
97
+ )
98
+
99
+ if buffer_size != 4096:
100
+ import warnings
101
+
102
+ warnings.warn("The 'buffer_size' parameter is not used by TorchCodec AudioDecoder.", UserWarning, stacklevel=2)
103
+
104
+ if backend is not None:
105
+ import warnings
106
+
107
+ warnings.warn("The 'backend' parameter is not used by TorchCodec AudioDecoder.", UserWarning, stacklevel=2)
108
+
109
+ if format is not None:
110
+ import warnings
111
+
112
+ warnings.warn("The 'format' parameter is not supported by TorchCodec AudioDecoder.", UserWarning, stacklevel=2)
113
+
114
+ # Create AudioDecoder
115
+ try:
116
+ decoder = AudioDecoder(uri)
117
+ except Exception as e:
118
+ raise RuntimeError(f"Failed to create AudioDecoder for {uri}: {e}") from e
119
+
120
+ # Get sample rate from metadata
121
+ sample_rate = decoder.metadata.sample_rate
122
+ if sample_rate is None:
123
+ raise RuntimeError("Unable to determine sample rate from audio metadata")
124
+
125
+ # Decode the entire file first, then subsample manually
126
+ # This is the simplest approach since torchcodec uses time-based indexing
127
+ try:
128
+ audio_samples = decoder.get_all_samples()
129
+ except Exception as e:
130
+ raise RuntimeError(f"Failed to decode audio samples: {e}") from e
131
+
132
+ data = audio_samples.data
133
+
134
+ # Apply frame_offset and num_frames (which are actually sample offsets)
135
+ if frame_offset > 0:
136
+ if frame_offset >= data.shape[1]:
137
+ # Return empty tensor if offset is beyond available data
138
+ empty_shape = (data.shape[0], 0) if channels_first else (0, data.shape[0])
139
+ return torch.zeros(empty_shape, dtype=torch.float32), sample_rate
140
+ data = data[:, frame_offset:]
141
+
142
+ if num_frames == 0:
143
+ # Return empty tensor if num_frames is 0
144
+ empty_shape = (data.shape[0], 0) if channels_first else (0, data.shape[0])
145
+ return torch.zeros(empty_shape, dtype=torch.float32), sample_rate
146
+ elif num_frames > 0:
147
+ data = data[:, :num_frames]
148
+
149
+ # TorchCodec returns data in [channel, time] format by default
150
+ # Handle channels_first parameter
151
+ if not channels_first:
152
+ data = data.transpose(0, 1) # [channel, time] -> [time, channel]
153
+
154
+ return data, sample_rate
155
+
156
+
157
def save_with_torchcodec(
    uri: Union[str, os.PathLike],
    src: torch.Tensor,
    sample_rate: int,
    channels_first: bool = True,
    format: Optional[str] = None,
    encoding: Optional[str] = None,
    bits_per_sample: Optional[int] = None,
    buffer_size: int = 4096,
    backend: Optional[str] = None,
    compression: Optional[Union[float, int]] = None,
) -> None:
    """Save audio data to file using TorchCodec's AudioEncoder.

    .. note::

        This function supports the same API as :func:`~torchaudio.save`, and
        relies on TorchCodec's encoding capabilities under the hood. It is
        provided for convenience, but we do recommend that you port your code to
        natively use ``torchcodec``'s ``AudioEncoder`` class for better
        performance:
        https://docs.pytorch.org/torchcodec/stable/generated/torchcodec.encoders.AudioEncoder.
        As of TorchAudio 2.9, :func:`~torchaudio.save` relies on
        :func:`~torchaudio.save_with_torchcodec`. Note that some parameters of
        :func:`~torchaudio.save`, like ``format``, ``encoding``,
        ``bits_per_sample``, ``buffer_size``, and ``backend``, are ignored by
        :func:`~torchaudio.save_with_torchcodec`.
        To install torchcodec, follow the instructions at https://github.com/pytorch/torchcodec#installing-torchcodec.

    This function provides a TorchCodec-based alternative to torchaudio.save
    with the same API. TorchCodec's AudioEncoder provides efficient encoding
    with FFmpeg under the hood.

    Args:
        uri (path-like object):
            Path to save the audio file. The file extension determines the format.

        src (torch.Tensor):
            Audio data to save. Must be a 1D or 2D tensor with float32 values
            in the range [-1, 1]. If 2D, shape should be [channel, time] when
            channels_first=True, or [time, channel] when channels_first=False.

        sample_rate (int):
            Sample rate of the audio data.

        channels_first (bool, optional):
            Indicates whether the input tensor has channels as the first dimension.
            If True, expects [channel, time]. If False, expects [time, channel].
            Default: True.

        format (str or None, optional):
            Audio format hint. Not used by TorchCodec (format is determined by
            file extension). A warning is issued if provided.
            Default: None.

        encoding (str or None, optional):
            Audio encoding. Not fully supported by TorchCodec AudioEncoder.
            A warning is issued if provided. Default: None.

        bits_per_sample (int or None, optional):
            Bits per sample. Not directly supported by TorchCodec AudioEncoder.
            A warning is issued if provided. Default: None.

        buffer_size (int, optional):
            Not used by TorchCodec AudioEncoder. Provided for API compatibility.
            A warning is issued if not default value. Default: 4096.

        backend (str or None, optional):
            Not used by TorchCodec AudioEncoder. Provided for API compatibility.
            A warning is issued if provided. Default: None.

        compression (float, int or None, optional):
            Compression level or bit rate. Maps to bit_rate parameter in
            TorchCodec AudioEncoder. Default: None.

    Raises:
        ImportError: If torchcodec is not available.
        ValueError: If input parameters are invalid.
        RuntimeError: If TorchCodec fails to encode the audio.

    Note:
        - TorchCodec AudioEncoder expects float32 samples in [-1, 1] range.
        - Some parameters (format, encoding, bits_per_sample, buffer_size, backend)
          are not used by TorchCodec but are provided for API compatibility.
        - The output format is determined by the file extension in the uri.
        - TorchCodec uses FFmpeg under the hood for encoding.
    """
    # Single import instead of one per warning branch.
    import warnings

    # Import torchcodec here to provide clear error if not available
    try:
        from torchcodec.encoders import AudioEncoder
    except ImportError as e:
        raise ImportError(
            "TorchCodec is required for save_with_torchcodec. " "Please install torchcodec to use this function."
        ) from e

    # Parameter validation and warnings
    if format is not None:
        warnings.warn(
            "The 'format' parameter is not used by TorchCodec AudioEncoder. "
            "Format is determined by the file extension.",
            UserWarning,
            stacklevel=2,
        )

    if encoding is not None:
        warnings.warn(
            "The 'encoding' parameter is not fully supported by TorchCodec AudioEncoder.", UserWarning, stacklevel=2
        )

    if bits_per_sample is not None:
        warnings.warn(
            "The 'bits_per_sample' parameter is not directly supported by TorchCodec AudioEncoder.",
            UserWarning,
            stacklevel=2,
        )

    if buffer_size != 4096:
        warnings.warn("The 'buffer_size' parameter is not used by TorchCodec AudioEncoder.", UserWarning, stacklevel=2)

    if backend is not None:
        warnings.warn("The 'backend' parameter is not used by TorchCodec AudioEncoder.", UserWarning, stacklevel=2)

    # Input validation
    if not isinstance(src, torch.Tensor):
        raise ValueError(f"Expected src to be a torch.Tensor, got {type(src)}")

    if src.dtype != torch.float32:
        src = src.float()

    if sample_rate <= 0:
        raise ValueError(f"sample_rate must be positive, got {sample_rate}")

    # Handle tensor shape and channels_first
    if src.ndim == 1:
        # Convert to 2D: [1, time] for channels_first=True
        if channels_first:
            data = src.unsqueeze(0)  # [1, time]
        else:
            # For channels_first=False, input is [time] -> reshape to [time, 1] -> transpose to [1, time]
            data = src.unsqueeze(1).transpose(0, 1)  # [time, 1] -> [1, time]
    elif src.ndim == 2:
        if channels_first:
            data = src  # Already [channel, time]
        else:
            data = src.transpose(0, 1)  # [time, channel] -> [channel, time]
    else:
        raise ValueError(f"Expected 1D or 2D tensor, got {src.ndim}D tensor")

    # Create AudioEncoder
    try:
        encoder = AudioEncoder(data, sample_rate=sample_rate)
    except Exception as e:
        raise RuntimeError(f"Failed to create AudioEncoder: {e}") from e

    # Determine bit_rate from compression parameter
    bit_rate = None
    if compression is not None:
        if isinstance(compression, (int, float)):
            bit_rate = int(compression)
        else:
            warnings.warn(
                f"Unsupported compression type {type(compression)}. "
                "TorchCodec AudioEncoder expects int or float for bit_rate.",
                UserWarning,
                stacklevel=2,
            )

    # Save to file
    try:
        encoder.to_file(uri, bit_rate=bit_rate)
    except Exception as e:
        raise RuntimeError(f"Failed to save audio to {uri}: {e}") from e