Harmony18090 commited on
Commit
4612cc9
·
verified ·
1 Parent(s): fc6fcf5

Add source batch 9/11

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +3 -0
  2. source/rpds_py-0.30.0.dist-info/INSTALLER +1 -0
  3. source/rpds_py-0.30.0.dist-info/METADATA +99 -0
  4. source/rpds_py-0.30.0.dist-info/RECORD +10 -0
  5. source/rpds_py-0.30.0.dist-info/WHEEL +5 -0
  6. source/rpds_py-0.30.0.dist-info/licenses/LICENSE +19 -0
  7. source/safetensors-0.7.0.dist-info/INSTALLER +1 -0
  8. source/safetensors-0.7.0.dist-info/METADATA +133 -0
  9. source/safetensors-0.7.0.dist-info/RECORD +23 -0
  10. source/safetensors-0.7.0.dist-info/REQUESTED +0 -0
  11. source/safetensors-0.7.0.dist-info/WHEEL +5 -0
  12. source/safetensors-0.7.0.dist-info/licenses/LICENSE +201 -0
  13. source/safetensors/__init__.py +10 -0
  14. source/safetensors/__init__.pyi +164 -0
  15. source/safetensors/_safetensors_rust.abi3.so +3 -0
  16. source/safetensors/flax.py +138 -0
  17. source/safetensors/mlx.py +140 -0
  18. source/safetensors/numpy.py +187 -0
  19. source/safetensors/paddle.py +290 -0
  20. source/safetensors/py.typed +0 -0
  21. source/safetensors/tensorflow.py +139 -0
  22. source/safetensors/torch.py +550 -0
  23. source/sentencepiece-0.2.1.dist-info/INSTALLER +1 -0
  24. source/sentencepiece-0.2.1.dist-info/METADATA +251 -0
  25. source/sentencepiece-0.2.1.dist-info/RECORD +20 -0
  26. source/sentencepiece-0.2.1.dist-info/WHEEL +6 -0
  27. source/sentencepiece-0.2.1.dist-info/top_level.txt +5 -0
  28. source/sentencepiece/__init__.py +1230 -0
  29. source/sentencepiece/_sentencepiece.cpython-312-x86_64-linux-gnu.so +3 -0
  30. source/sentencepiece/_version.py +1 -0
  31. source/sentencepiece/package_data/nfkc.bin +3 -0
  32. source/sentencepiece/package_data/nfkc_cf.bin +3 -0
  33. source/sentencepiece/package_data/nmt_nfkc.bin +3 -0
  34. source/sentencepiece/package_data/nmt_nfkc_cf.bin +3 -0
  35. source/sentencepiece/sentencepiece.i +2013 -0
  36. source/sentencepiece/sentencepiece_model_pb2.py +44 -0
  37. source/sentencepiece/sentencepiece_pb2.py +30 -0
  38. source/sentencepiece/sentencepiece_wrap.cxx +0 -0
  39. source/sentry_sdk-2.53.0.dist-info/INSTALLER +1 -0
  40. source/sentry_sdk-2.53.0.dist-info/METADATA +268 -0
  41. source/sentry_sdk-2.53.0.dist-info/RECORD +386 -0
  42. source/sentry_sdk-2.53.0.dist-info/WHEEL +6 -0
  43. source/sentry_sdk-2.53.0.dist-info/entry_points.txt +2 -0
  44. source/sentry_sdk-2.53.0.dist-info/licenses/LICENSE +21 -0
  45. source/sentry_sdk-2.53.0.dist-info/top_level.txt +1 -0
  46. source/sentry_sdk/__init__.py +67 -0
  47. source/sentry_sdk/_batcher.py +139 -0
  48. source/sentry_sdk/_compat.py +94 -0
  49. source/sentry_sdk/_init_implementation.py +79 -0
  50. source/sentry_sdk/_log_batcher.py +56 -0
.gitattributes CHANGED
@@ -246,3 +246,6 @@ source/ray/thirdparty_files/psutil/_psutil_linux.abi3.so filter=lfs diff=lfs mer
246
  source/regex/_regex.cpython-312-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
247
  source/rignore/rignore.cpython-312-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
248
  source/rpds/rpds.cpython-312-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
 
 
 
 
246
  source/regex/_regex.cpython-312-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
247
  source/rignore/rignore.cpython-312-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
248
  source/rpds/rpds.cpython-312-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
249
+ source/safetensors/_safetensors_rust.abi3.so filter=lfs diff=lfs merge=lfs -text
250
+ source/sentencepiece/_sentencepiece.cpython-312-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
251
+ source/tiktoken/_tiktoken.cpython-312-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
source/rpds_py-0.30.0.dist-info/INSTALLER ADDED
@@ -0,0 +1 @@
 
 
1
+ pip
source/rpds_py-0.30.0.dist-info/METADATA ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Metadata-Version: 2.4
2
+ Name: rpds-py
3
+ Version: 0.30.0
4
+ Classifier: Development Status :: 3 - Alpha
5
+ Classifier: Intended Audience :: Developers
6
+ Classifier: Operating System :: OS Independent
7
+ Classifier: Programming Language :: Rust
8
+ Classifier: Programming Language :: Python :: 3.10
9
+ Classifier: Programming Language :: Python :: 3.11
10
+ Classifier: Programming Language :: Python :: 3.12
11
+ Classifier: Programming Language :: Python :: 3.13
12
+ Classifier: Programming Language :: Python :: 3.14
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Programming Language :: Python :: Implementation :: CPython
15
+ Classifier: Programming Language :: Python :: Implementation :: PyPy
16
+ License-File: LICENSE
17
+ Summary: Python bindings to Rust's persistent data structures (rpds)
18
+ Keywords: data structures,rust,persistent
19
+ Author-email: Julian Berman <Julian+rpds@GrayVines.com>
20
+ License-Expression: MIT
21
+ Requires-Python: >=3.10
22
+ Description-Content-Type: text/x-rst; charset=UTF-8
23
+ Project-URL: Documentation, https://rpds.readthedocs.io/
24
+ Project-URL: Homepage, https://github.com/crate-py/rpds
25
+ Project-URL: Issues, https://github.com/crate-py/rpds/issues/
26
+ Project-URL: Funding, https://github.com/sponsors/Julian
27
+ Project-URL: Tidelift, https://tidelift.com/subscription/pkg/pypi-rpds-py?utm_source=pypi-rpds-py&utm_medium=referral&utm_campaign=pypi-link
28
+ Project-URL: Source, https://github.com/crate-py/rpds
29
+ Project-URL: Upstream, https://github.com/orium/rpds
30
+
31
+ ===========
32
+ ``rpds.py``
33
+ ===========
34
+
35
+ |PyPI| |Pythons| |CI|
36
+
37
+ .. |PyPI| image:: https://img.shields.io/pypi/v/rpds-py.svg
38
+ :alt: PyPI version
39
+ :target: https://pypi.org/project/rpds-py/
40
+
41
+ .. |Pythons| image:: https://img.shields.io/pypi/pyversions/rpds-py.svg
42
+ :alt: Supported Python versions
43
+ :target: https://pypi.org/project/rpds-py/
44
+
45
+ .. |CI| image:: https://github.com/crate-py/rpds/workflows/CI/badge.svg
46
+ :alt: Build status
47
+ :target: https://github.com/crate-py/rpds/actions?query=workflow%3ACI
48
+
49
+ .. |ReadTheDocs| image:: https://readthedocs.org/projects/referencing/badge/?version=stable&style=flat
50
+ :alt: ReadTheDocs status
51
+ :target: https://referencing.readthedocs.io/en/stable/
52
+
53
+
54
+ Python bindings to the `Rust rpds crate <https://docs.rs/rpds/>`_ for persistent data structures.
55
+
56
+ What's here is quite minimal (in transparency, it was written initially to support replacing ``pyrsistent`` in the `referencing library <https://github.com/python-jsonschema/referencing>`_).
57
+ If you see something missing (which is very likely), a PR is definitely welcome to add it.
58
+
59
+ Installation
60
+ ------------
61
+
62
+ The distribution on PyPI is named ``rpds.py`` (equivalently ``rpds-py``), and thus can be installed via e.g.:
63
+
64
+ .. code:: sh
65
+
66
+ $ pip install rpds-py
67
+
68
+ Note that if you install ``rpds-py`` from source, you will need a Rust toolchain installed, as it is a build-time dependency.
69
+ An example of how to do so in a ``Dockerfile`` can be found `here <https://github.com/bowtie-json-schema/bowtie/blob/e77fd93598cb6e7dc1b8b1f53c00e5aa410c201a/implementations/python-jsonschema/Dockerfile#L1-L8>`_.
70
+
71
+ If you believe you are on a common platform which should have wheels built (i.e. and not need to compile from source), feel free to file an issue or pull request modifying the GitHub action used here to build wheels via ``maturin``.
72
+
73
+ Usage
74
+ -----
75
+
76
+ Methods in general are named similarly to their ``rpds`` counterparts (rather than ``pyrsistent``\ 's conventions, though probably a full drop-in ``pyrsistent``\ -compatible wrapper module is a good addition at some point).
77
+
78
+ .. code:: python
79
+
80
+ >>> from rpds import HashTrieMap, HashTrieSet, List
81
+
82
+ >>> m = HashTrieMap({"foo": "bar", "baz": "quux"})
83
+ >>> m.insert("spam", 37) == HashTrieMap({"foo": "bar", "baz": "quux", "spam": 37})
84
+ True
85
+ >>> m.remove("foo") == HashTrieMap({"baz": "quux"})
86
+ True
87
+
88
+ >>> s = HashTrieSet({"foo", "bar", "baz", "quux"})
89
+ >>> s.insert("spam") == HashTrieSet({"foo", "bar", "baz", "quux", "spam"})
90
+ True
91
+ >>> s.remove("foo") == HashTrieSet({"bar", "baz", "quux"})
92
+ True
93
+
94
+ >>> L = List([1, 3, 5])
95
+ >>> L.push_front(-1) == List([-1, 1, 3, 5])
96
+ True
97
+ >>> L.rest == List([3, 5])
98
+ True
99
+
source/rpds_py-0.30.0.dist-info/RECORD ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ rpds/__init__.py,sha256=w3MgXW7lpTCICw0KXbw20QX573_kbsEnWIeMsCAugvM,99
2
+ rpds/__init__.pyi,sha256=am7x6oMa_pu_kv1NlolqJbPr6_UvCvoyxGKrDGSMKEk,2602
3
+ rpds/__pycache__/__init__.cpython-312.pyc,,
4
+ rpds/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
+ rpds/rpds.cpython-312-x86_64-linux-gnu.so,sha256=YnSRYU7s221B-NVp0g5gVF1LPFzVnkzK3CdalA33MzY,1060936
6
+ rpds_py-0.30.0.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
7
+ rpds_py-0.30.0.dist-info/METADATA,sha256=zAHQl925x-B6KclVPujdNc-SDoX7V097DCPWgTinFr8,4145
8
+ rpds_py-0.30.0.dist-info/RECORD,,
9
+ rpds_py-0.30.0.dist-info/WHEEL,sha256=m2ROzCpH5Kw6bN_3jKfw80jyQS9OqSulcWBhBkC07PU,147
10
+ rpds_py-0.30.0.dist-info/licenses/LICENSE,sha256=MU5Okb47qpPA-0vMyeTpfNZD64ObBlr5IXgsIXX-mQk,1057
source/rpds_py-0.30.0.dist-info/WHEEL ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ Wheel-Version: 1.0
2
+ Generator: maturin (1.10.2)
3
+ Root-Is-Purelib: false
4
+ Tag: cp312-cp312-manylinux_2_17_x86_64
5
+ Tag: cp312-cp312-manylinux2014_x86_64
source/rpds_py-0.30.0.dist-info/licenses/LICENSE ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Copyright (c) 2023 Julian Berman
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy
4
+ of this software and associated documentation files (the "Software"), to deal
5
+ in the Software without restriction, including without limitation the rights
6
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7
+ copies of the Software, and to permit persons to whom the Software is
8
+ furnished to do so, subject to the following conditions:
9
+
10
+ The above copyright notice and this permission notice shall be included in
11
+ all copies or substantial portions of the Software.
12
+
13
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19
+ THE SOFTWARE.
source/safetensors-0.7.0.dist-info/INSTALLER ADDED
@@ -0,0 +1 @@
 
 
1
+ pip
source/safetensors-0.7.0.dist-info/METADATA ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Metadata-Version: 2.4
2
+ Name: safetensors
3
+ Version: 0.7.0
4
+ Classifier: Development Status :: 5 - Production/Stable
5
+ Classifier: Intended Audience :: Developers
6
+ Classifier: Intended Audience :: Education
7
+ Classifier: Intended Audience :: Science/Research
8
+ Classifier: License :: OSI Approved :: Apache Software License
9
+ Classifier: Operating System :: OS Independent
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: Programming Language :: Python :: 3.7
12
+ Classifier: Programming Language :: Python :: 3.8
13
+ Classifier: Programming Language :: Python :: 3.9
14
+ Classifier: Programming Language :: Python :: 3.10
15
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
16
+ Classifier: Typing :: Typed
17
+ Requires-Dist: numpy>=1.21.6 ; extra == 'numpy'
18
+ Requires-Dist: packaging ; extra == 'torch'
19
+ Requires-Dist: safetensors[numpy] ; extra == 'torch'
20
+ Requires-Dist: torch>=1.10 ; extra == 'torch'
21
+ Requires-Dist: safetensors[numpy] ; extra == 'tensorflow'
22
+ Requires-Dist: tensorflow>=2.11.0 ; extra == 'tensorflow'
23
+ Requires-Dist: safetensors[numpy] ; extra == 'pinned-tf'
24
+ Requires-Dist: tensorflow==2.18.0 ; extra == 'pinned-tf'
25
+ Requires-Dist: safetensors[numpy] ; extra == 'jax'
26
+ Requires-Dist: flax>=0.6.3 ; extra == 'jax'
27
+ Requires-Dist: jax>=0.3.25 ; extra == 'jax'
28
+ Requires-Dist: jaxlib>=0.3.25 ; extra == 'jax'
29
+ Requires-Dist: mlx>=0.0.9 ; extra == 'mlx'
30
+ Requires-Dist: safetensors[numpy] ; extra == 'paddlepaddle'
31
+ Requires-Dist: paddlepaddle>=2.4.1 ; extra == 'paddlepaddle'
32
+ Requires-Dist: ruff ; extra == 'quality'
33
+ Requires-Dist: safetensors[numpy] ; extra == 'testing'
34
+ Requires-Dist: h5py>=3.7.0 ; extra == 'testing'
35
+ Requires-Dist: huggingface-hub>=0.12.1 ; extra == 'testing'
36
+ Requires-Dist: setuptools-rust>=1.5.2 ; extra == 'testing'
37
+ Requires-Dist: pytest>=7.2.0 ; extra == 'testing'
38
+ Requires-Dist: pytest-benchmark>=4.0.0 ; extra == 'testing'
39
+ Requires-Dist: hypothesis>=6.70.2 ; extra == 'testing'
40
+ Requires-Dist: safetensors[numpy] ; extra == 'testingfree'
41
+ Requires-Dist: huggingface-hub>=0.12.1 ; extra == 'testingfree'
42
+ Requires-Dist: setuptools-rust>=1.5.2 ; extra == 'testingfree'
43
+ Requires-Dist: pytest>=7.2.0 ; extra == 'testingfree'
44
+ Requires-Dist: pytest-benchmark>=4.0.0 ; extra == 'testingfree'
45
+ Requires-Dist: hypothesis>=6.70.2 ; extra == 'testingfree'
46
+ Requires-Dist: safetensors[torch] ; extra == 'all'
47
+ Requires-Dist: safetensors[numpy] ; extra == 'all'
48
+ Requires-Dist: safetensors[pinned-tf] ; extra == 'all'
49
+ Requires-Dist: safetensors[jax] ; extra == 'all'
50
+ Requires-Dist: safetensors[paddlepaddle] ; extra == 'all'
51
+ Requires-Dist: safetensors[quality] ; extra == 'all'
52
+ Requires-Dist: safetensors[testing] ; extra == 'all'
53
+ Requires-Dist: safetensors[all] ; extra == 'dev'
54
+ Provides-Extra: numpy
55
+ Provides-Extra: torch
56
+ Provides-Extra: tensorflow
57
+ Provides-Extra: pinned-tf
58
+ Provides-Extra: jax
59
+ Provides-Extra: mlx
60
+ Provides-Extra: paddlepaddle
61
+ Provides-Extra: quality
62
+ Provides-Extra: testing
63
+ Provides-Extra: testingfree
64
+ Provides-Extra: all
65
+ Provides-Extra: dev
66
+ License-File: LICENSE
67
+ Author-email: Nicolas Patry <patry.nicolas@protonmail.com>
68
+ Requires-Python: >=3.9
69
+ Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
70
+ Project-URL: Homepage, https://github.com/huggingface/safetensors
71
+ Project-URL: Source, https://github.com/huggingface/safetensors
72
+
73
+ ## Installation
74
+
75
+ ```
76
+ pip install safetensors
77
+ ```
78
+
79
+
80
+ ## Usage
81
+
82
+ ### Numpy
83
+
84
+ ```python
85
+ from safetensors.numpy import save_file, load_file
86
+ import numpy as np
87
+
88
+ tensors = {
89
+ "a": np.zeros((2, 2)),
90
+ "b": np.zeros((2, 3), dtype=np.uint8)
91
+ }
92
+
93
+ save_file(tensors, "./model.safetensors")
94
+
95
+
96
+ # Now loading
97
+ loaded = load_file("./model.safetensors")
98
+ ```
99
+
100
+ ### Torch
101
+
102
+ ```python
103
+ from safetensors.torch import save_file, load_file
104
+ import torch
105
+
106
+ tensors = {
107
+ "a": torch.zeros((2, 2)),
108
+ "b": torch.zeros((2, 3), dtype=torch.uint8)
109
+ }
110
+
111
+ save_file(tensors, "./model.safetensors")
112
+
113
+
114
+ # Now loading
115
+ loaded = load_file("./model.safetensors")
116
+ ```
117
+
118
+ ### Developing
119
+
120
+ ```
121
+ # inside ./safetensors/bindings/python
122
+ pip install .[dev]
123
+ ```
124
+ Should be enough to install this library locally.
125
+
126
+ ### Testing
127
+
128
+ ```
129
+ # inside ./safetensors/bindings/python
130
+ pip install .[dev]
131
+ pytest -sv tests/
132
+ ```
133
+
source/safetensors-0.7.0.dist-info/RECORD ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ safetensors-0.7.0.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
2
+ safetensors-0.7.0.dist-info/METADATA,sha256=aAkyb78XRE3VIKsZ8KVbehhHbb5SpDBGa79TiZZ8Kqo,4125
3
+ safetensors-0.7.0.dist-info/RECORD,,
4
+ safetensors-0.7.0.dist-info/REQUESTED,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
+ safetensors-0.7.0.dist-info/WHEEL,sha256=EsgGQg7OBGIn-zS1ipDRPjO8C2qSQ0GRrd2xuL_Pyq0,143
6
+ safetensors-0.7.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
7
+ safetensors/__init__.py,sha256=wbzKZlAVgnAsjHmqryuSJCiADvpDZxNGCfj8VzY0At0,194
8
+ safetensors/__init__.pyi,sha256=IpwsrzRWJA2yR8TxMEC3RHgM_5TiDgSFqyvrAxAa15U,4019
9
+ safetensors/__pycache__/__init__.cpython-312.pyc,,
10
+ safetensors/__pycache__/flax.cpython-312.pyc,,
11
+ safetensors/__pycache__/mlx.cpython-312.pyc,,
12
+ safetensors/__pycache__/numpy.cpython-312.pyc,,
13
+ safetensors/__pycache__/paddle.cpython-312.pyc,,
14
+ safetensors/__pycache__/tensorflow.cpython-312.pyc,,
15
+ safetensors/__pycache__/torch.cpython-312.pyc,,
16
+ safetensors/_safetensors_rust.abi3.so,sha256=h9qPRzPm7PhApdjisd6IjK0BLCN1URiRlWfsSAp9fgI,1216632
17
+ safetensors/flax.py,sha256=T59elUqzVDyGYGdR78QzNEuwyAc8KrEO0EuLBSKOnUs,3853
18
+ safetensors/mlx.py,sha256=IR51jRpcJq6epb0Agj8VsxI9xqBS6NjeAJnr-Ny0jJU,3850
19
+ safetensors/numpy.py,sha256=rit_12-IfZtRgip_VLd8nPAcCXyeM2fPrCDZ7OiyxSY,5028
20
+ safetensors/paddle.py,sha256=LrDwqQbwFnQXiY3M601IU7G6FBctX6tyHHK3_UH6lxE,8721
21
+ safetensors/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
22
+ safetensors/tensorflow.py,sha256=AZ-O7-gM-JqTfjczZyCUAHm3Er-GSQnQWaFyY7mAIQc,3903
23
+ safetensors/torch.py,sha256=U0acZVahLsxvqPa1GitMRiaByu6XVbmiBITGtuznBEY,18610
source/safetensors-0.7.0.dist-info/REQUESTED ADDED
File without changes
source/safetensors-0.7.0.dist-info/WHEEL ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ Wheel-Version: 1.0
2
+ Generator: maturin (1.10.2)
3
+ Root-Is-Purelib: false
4
+ Tag: cp38-abi3-manylinux_2_17_x86_64
5
+ Tag: cp38-abi3-manylinux2014_x86_64
source/safetensors-0.7.0.dist-info/licenses/LICENSE ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
source/safetensors/__init__.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ # Re-export this
2
+ from ._safetensors_rust import ( # noqa: F401
3
+ SafetensorError,
4
+ __version__,
5
+ deserialize,
6
+ safe_open,
7
+ _safe_open_handle,
8
+ serialize,
9
+ serialize_file,
10
+ )
source/safetensors/__init__.pyi ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Generated content DO NOT EDIT
2
+ @staticmethod
3
+ def deserialize(bytes):
4
+ """
5
+ Opens a safetensors lazily and returns tensors as asked
6
+
7
+ Args:
8
+ data (`bytes`):
9
+ The byte content of a file
10
+
11
+ Returns:
12
+ (`List[str, Dict[str, Dict[str, any]]]`):
13
+ The deserialized content is like:
14
+ [("tensor_name", {"shape": [2, 3], "dtype": "F32", "data": b"\0\0.." }), (...)]
15
+ """
16
+ pass
17
+
18
+ @staticmethod
19
+ def serialize(tensor_dict, metadata=None):
20
+ """
21
+ Serializes raw data.
22
+
23
+ Args:
24
+ tensor_dict (`Dict[str, Dict[Any]]`):
25
+ The tensor dict is like:
26
+ {"tensor_name": {"dtype": "F32", "shape": [2, 3], "data": b"\0\0"}}
27
+ metadata (`Dict[str, str]`, *optional*):
28
+ The optional purely text annotations
29
+
30
+ Returns:
31
+ (`bytes`):
32
+ The serialized content.
33
+ """
34
+ pass
35
+
36
+ @staticmethod
37
+ def serialize_file(tensor_dict, filename, metadata=None):
38
+ """
39
+ Serializes raw data into file.
40
+
41
+ Args:
42
+ tensor_dict (`Dict[str, Dict[Any]]`):
43
+ The tensor dict is like:
44
+ {"tensor_name": {"dtype": "F32", "shape": [2, 3], "data": b"\0\0"}}
45
+ filename (`str`, or `os.PathLike`):
46
+ The name of the file to write into.
47
+ metadata (`Dict[str, str]`, *optional*):
48
+ The optional purely text annotations
49
+
50
+ Returns:
51
+ (`NoneType`):
52
+ On success return None
53
+ """
54
+ pass
55
+
56
+ class safe_open:
57
+ """
58
+ Opens a safetensors lazily and returns tensors as asked
59
+
60
+ Args:
61
+ filename (`str`, or `os.PathLike`):
62
+ The filename to open
63
+
64
+ framework (`str`):
65
+ The framework you want you tensors in. Supported values:
66
+ `pt`, `tf`, `flax`, `numpy`.
67
+
68
+ device (`str`, defaults to `"cpu"`):
69
+ The device on which you want the tensors.
70
+ """
71
+ def __init__(self, filename, framework, device=...):
72
+ pass
73
+
74
+ def __enter__(self):
75
+ """
76
+ Start the context manager
77
+ """
78
+ pass
79
+
80
+ def __exit__(self, _exc_type, _exc_value, _traceback):
81
+ """
82
+ Exits the context manager
83
+ """
84
+ pass
85
+
86
+ def get_slice(self, name):
87
+ """
88
+ Returns a full slice view object
89
+
90
+ Args:
91
+ name (`str`):
92
+ The name of the tensor you want
93
+
94
+ Returns:
95
+ (`PySafeSlice`):
96
+ A dummy object you can slice into to get a real tensor
97
+ Example:
98
+ ```python
99
+ from safetensors import safe_open
100
+
101
+ with safe_open("model.safetensors", framework="pt", device=0) as f:
102
+ tensor_part = f.get_slice("embedding")[:, ::8]
103
+
104
+ ```
105
+ """
106
+ pass
107
+
108
+ def get_tensor(self, name):
109
+ """
110
+ Returns a full tensor
111
+
112
+ Args:
113
+ name (`str`):
114
+ The name of the tensor you want
115
+
116
+ Returns:
117
+ (`Tensor`):
118
+ The tensor in the framework you opened the file for.
119
+
120
+ Example:
121
+ ```python
122
+ from safetensors import safe_open
123
+
124
+ with safe_open("model.safetensors", framework="pt", device=0) as f:
125
+ tensor = f.get_tensor("embedding")
126
+
127
+ ```
128
+ """
129
+ pass
130
+
131
+ def keys(self):
132
+ """
133
+ Returns the names of the tensors in the file.
134
+
135
+ Returns:
136
+ (`List[str]`):
137
+ The name of the tensors contained in that file
138
+ """
139
+ pass
140
+
141
+ def metadata(self):
142
+ """
143
+ Return the special non tensor information in the header
144
+
145
+ Returns:
146
+ (`Dict[str, str]`):
147
+ The freeform metadata.
148
+ """
149
+ pass
150
+
151
+ def offset_keys(self):
152
+ """
153
+ Returns the names of the tensors in the file, ordered by offset.
154
+
155
+ Returns:
156
+ (`List[str]`):
157
+ The name of the tensors contained in that file
158
+ """
159
+ pass
160
+
161
+ class SafetensorError(Exception):
162
+ """
163
+ Custom Python Exception for Safetensor errors.
164
+ """
source/safetensors/_safetensors_rust.abi3.so ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:87da8f4733e6ecf840a5d8e2b1de888cad012c23755118919567ec480a7d7e02
3
+ size 1216632
source/safetensors/flax.py ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
from typing import Dict, Optional, Union

import numpy as np

import jax.numpy as jnp
from jax import Array
from safetensors import numpy, safe_open


def save(tensors: Dict[str, Array], metadata: Optional[Dict[str, str]] = None) -> bytes:
    """
    Saves a dictionary of tensors into raw bytes in safetensors format.

    Args:
        tensors (`Dict[str, Array]`):
            The incoming tensors. Tensors need to be contiguous and dense.
        metadata (`Dict[str, str]`, *optional*, defaults to `None`):
            Optional text only metadata you might want to save in your header.
            This is purely informative and does not affect tensor loading.

    Returns:
        `bytes`: The raw bytes representing the format

    Example:

    ```python
    from safetensors.flax import save
    from jax import numpy as jnp

    tensors = {"embedding": jnp.zeros((512, 1024)), "attention": jnp.zeros((256, 256))}
    byte_data = save(tensors)
    ```
    """
    np_tensors = _jnp2np(tensors)
    return numpy.save(np_tensors, metadata=metadata)


def save_file(
    tensors: Dict[str, Array],
    filename: Union[str, os.PathLike],
    metadata: Optional[Dict[str, str]] = None,
) -> None:
    """
    Saves a dictionary of tensors into a file in safetensors format.

    Args:
        tensors (`Dict[str, Array]`):
            The incoming tensors. Tensors need to be contiguous and dense.
        filename (`str`, or `os.PathLike`):
            The filename we're saving into.
        metadata (`Dict[str, str]`, *optional*, defaults to `None`):
            Optional text only metadata you might want to save in your header.
            This is purely informative and does not affect tensor loading.

    Returns:
        `None`

    Example:

    ```python
    from safetensors.flax import save_file
    from jax import numpy as jnp

    tensors = {"embedding": jnp.zeros((512, 1024)), "attention": jnp.zeros((256, 256))}
    save_file(tensors, "model.safetensors")
    ```
    """
    np_tensors = _jnp2np(tensors)
    return numpy.save_file(np_tensors, filename, metadata=metadata)


def load(data: bytes) -> Dict[str, Array]:
    """
    Loads a safetensors file into flax format from pure bytes.

    Args:
        data (`bytes`):
            The content of a safetensors file

    Returns:
        `Dict[str, Array]`: dictionary that contains name as key, value as `Array` on cpu

    Example:

    ```python
    from safetensors.flax import load

    file_path = "./my_folder/bert.safetensors"
    with open(file_path, "rb") as f:
        data = f.read()

    loaded = load(data)
    ```
    """
    flat = numpy.load(data)
    return _np2jnp(flat)


def load_file(filename: Union[str, os.PathLike]) -> Dict[str, Array]:
    """
    Loads a safetensors file into flax format.

    Args:
        filename (`str`, or `os.PathLike`):
            The name of the file which contains the tensors

    Returns:
        `Dict[str, Array]`: dictionary that contains name as key, value as `Array`

    Example:

    ```python
    from safetensors.flax import load_file

    file_path = "./my_folder/bert.safetensors"
    loaded = load_file(file_path)
    ```
    """
    result = {}
    # Iterating in offset order keeps file reads sequential.
    with safe_open(filename, framework="flax") as f:
        for k in f.offset_keys():
            result[k] = f.get_tensor(k)
    return result


def _np2jnp(numpy_dict: Dict[str, np.ndarray]) -> Dict[str, Array]:
    # Build a fresh dict: the previous implementation rebound values on the
    # input mapping itself, mutating the caller's dictionary in place.
    return {k: jnp.array(v) for k, v in numpy_dict.items()}


def _jnp2np(jnp_dict: Dict[str, Array]) -> Dict[str, np.ndarray]:
    # Same fix as _np2jnp: `save(tensors)` used to destructively replace the
    # caller's jax arrays with numpy arrays inside their own dict.
    return {k: np.asarray(v) for k, v in jnp_dict.items()}
source/safetensors/mlx.py ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
from typing import Dict, Optional, Union

import numpy as np

import mlx.core as mx
from safetensors import numpy, safe_open


def save(
    tensors: Dict[str, mx.array], metadata: Optional[Dict[str, str]] = None
) -> bytes:
    """
    Saves a dictionary of tensors into raw bytes in safetensors format.

    Args:
        tensors (`Dict[str, mx.array]`):
            The incoming tensors. Tensors need to be contiguous and dense.
        metadata (`Dict[str, str]`, *optional*, defaults to `None`):
            Optional text only metadata you might want to save in your header.
            This is purely informative and does not affect tensor loading.

    Returns:
        `bytes`: The raw bytes representing the format

    Example:

    ```python
    from safetensors.mlx import save
    import mlx.core as mx

    tensors = {"embedding": mx.zeros((512, 1024)), "attention": mx.zeros((256, 256))}
    byte_data = save(tensors)
    ```
    """
    np_tensors = _mx2np(tensors)
    return numpy.save(np_tensors, metadata=metadata)


def save_file(
    tensors: Dict[str, mx.array],
    filename: Union[str, os.PathLike],
    metadata: Optional[Dict[str, str]] = None,
) -> None:
    """
    Saves a dictionary of tensors into a file in safetensors format.

    Args:
        tensors (`Dict[str, mx.array]`):
            The incoming tensors. Tensors need to be contiguous and dense.
        filename (`str`, or `os.PathLike`):
            The filename we're saving into.
        metadata (`Dict[str, str]`, *optional*, defaults to `None`):
            Optional text only metadata you might want to save in your header.
            This is purely informative and does not affect tensor loading.

    Returns:
        `None`

    Example:

    ```python
    from safetensors.mlx import save_file
    import mlx.core as mx

    tensors = {"embedding": mx.zeros((512, 1024)), "attention": mx.zeros((256, 256))}
    save_file(tensors, "model.safetensors")
    ```
    """
    np_tensors = _mx2np(tensors)
    return numpy.save_file(np_tensors, filename, metadata=metadata)


def load(data: bytes) -> Dict[str, mx.array]:
    """
    Loads a safetensors file into MLX format from pure bytes.

    Args:
        data (`bytes`):
            The content of a safetensors file

    Returns:
        `Dict[str, mx.array]`: dictionary that contains name as key, value as `mx.array`

    Example:

    ```python
    from safetensors.mlx import load

    file_path = "./my_folder/bert.safetensors"
    with open(file_path, "rb") as f:
        data = f.read()

    loaded = load(data)
    ```
    """
    flat = numpy.load(data)
    return _np2mx(flat)


def load_file(filename: Union[str, os.PathLike]) -> Dict[str, mx.array]:
    """
    Loads a safetensors file into MLX format.

    Args:
        filename (`str`, or `os.PathLike`):
            The name of the file which contains the tensors

    Returns:
        `Dict[str, mx.array]`: dictionary that contains name as key, value as `mx.array`

    Example:

    ```python
    from safetensors.mlx import load_file

    file_path = "./my_folder/bert.safetensors"
    loaded = load_file(file_path)
    ```
    """
    result = {}
    # Iterating in offset order keeps file reads sequential.
    with safe_open(filename, framework="mlx") as f:
        for k in f.offset_keys():
            result[k] = f.get_tensor(k)
    return result


def _np2mx(numpy_dict: Dict[str, np.ndarray]) -> Dict[str, mx.array]:
    # Build a fresh dict instead of mutating the input in place, matching
    # _mx2np below (the original rebound values on the caller's mapping).
    return {k: mx.array(v) for k, v in numpy_dict.items()}


def _mx2np(mx_dict: Dict[str, mx.array]) -> Dict[str, np.ndarray]:
    return {k: np.asarray(v) for k, v in mx_dict.items()}
source/safetensors/numpy.py ADDED
@@ -0,0 +1,187 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import sys
from typing import Dict, Optional, Union

import numpy as np

from safetensors import deserialize, safe_open, serialize, serialize_file


def _tobytes(tensor: np.ndarray) -> bytes:
    """Serialize the array's buffer as little-endian raw bytes."""
    if _is_little_endian(tensor):
        return tensor.tobytes()
    # The on-disk format is little-endian; swap a copy, never the input.
    return tensor.byteswap(inplace=False).tobytes()


def save(
    tensor_dict: Dict[str, np.ndarray], metadata: Optional[Dict[str, str]] = None
) -> bytes:
    """
    Saves a dictionary of tensors into raw bytes in safetensors format.

    Args:
        tensor_dict (`Dict[str, np.ndarray]`):
            The incoming tensors. Tensors need to be contiguous and dense.
        metadata (`Dict[str, str]`, *optional*, defaults to `None`):
            Optional text only metadata stored in the header; purely
            informative, does not affect tensor loading.

    Returns:
        `bytes`: The raw bytes representing the format

    Example:

    ```python
    from safetensors.numpy import save
    import numpy as np

    tensors = {"embedding": np.zeros((512, 1024)), "attention": np.zeros((256, 256))}
    byte_data = save(tensors)
    ```
    """
    flattened = {
        name: {
            "dtype": arr.dtype.name,
            "shape": arr.shape,
            "data": _tobytes(arr),
        }
        for name, arr in tensor_dict.items()
    }
    return bytes(serialize(flattened, metadata=metadata))


def save_file(
    tensor_dict: Dict[str, np.ndarray],
    filename: Union[str, os.PathLike],
    metadata: Optional[Dict[str, str]] = None,
) -> None:
    """
    Saves a dictionary of tensors into a file in safetensors format.

    Args:
        tensor_dict (`Dict[str, np.ndarray]`):
            The incoming tensors. Tensors need to be contiguous and dense.
        filename (`str`, or `os.PathLike`):
            The filename we're saving into.
        metadata (`Dict[str, str]`, *optional*, defaults to `None`):
            Optional text only metadata stored in the header; purely
            informative, does not affect tensor loading.

    Returns:
        `None`

    Example:

    ```python
    from safetensors.numpy import save_file
    import numpy as np

    tensors = {"embedding": np.zeros((512, 1024)), "attention": np.zeros((256, 256))}
    save_file(tensors, "model.safetensors")
    ```
    """
    flattened = {
        name: {
            "dtype": arr.dtype.name,
            "shape": arr.shape,
            "data": _tobytes(arr),
        }
        for name, arr in tensor_dict.items()
    }
    serialize_file(flattened, filename, metadata=metadata)


def load(data: bytes) -> Dict[str, np.ndarray]:
    """
    Loads a safetensors file into numpy format from pure bytes.

    Args:
        data (`bytes`):
            The content of a safetensors file

    Returns:
        `Dict[str, np.ndarray]`: dictionary that contains name as key, value as `np.ndarray` on cpu

    Example:

    ```python
    from safetensors.numpy import load

    file_path = "./my_folder/bert.safetensors"
    with open(file_path, "rb") as f:
        data = f.read()

    loaded = load(data)
    ```
    """
    return _view2np(deserialize(data))


def load_file(filename: Union[str, os.PathLike]) -> Dict[str, np.ndarray]:
    """
    Loads a safetensors file into numpy format.

    Args:
        filename (`str`, or `os.PathLike`):
            The name of the file which contains the tensors

    Returns:
        `Dict[str, np.ndarray]`: dictionary that contains name as key, value as `np.ndarray`

    Example:

    ```python
    from safetensors.numpy import load_file

    file_path = "./my_folder/bert.safetensors"
    loaded = load_file(file_path)
    ```
    """
    # Read tensors in offset order for sequential file access.
    with safe_open(filename, framework="np") as handle:
        return {key: handle.get_tensor(key) for key in handle.offset_keys()}


# Safetensors dtype tag -> numpy dtype.
_TYPES = {
    "F64": np.float64,
    "F32": np.float32,
    "F16": np.float16,
    "I64": np.int64,
    "U64": np.uint64,
    "I32": np.int32,
    "U32": np.uint32,
    "I16": np.int16,
    "U16": np.uint16,
    "I8": np.int8,
    "U8": np.uint8,
    "BOOL": bool,
    "C64": np.complex64,
}


def _getdtype(dtype_str: str) -> np.dtype:
    """Map a safetensors dtype tag (e.g. "F32") to the numpy dtype."""
    return _TYPES[dtype_str]


def _view2np(safeview) -> Dict[str, np.ndarray]:
    """Materialize a deserialized view into numpy arrays.

    Uses np.frombuffer, so the arrays share the deserialized buffer
    (and are therefore read-only).
    """
    arrays = {}
    for name, info in safeview:
        dtype = _getdtype(info["dtype"])
        arrays[name] = np.frombuffer(info["data"], dtype=dtype).reshape(info["shape"])
    return arrays


def _is_little_endian(tensor: np.ndarray) -> bool:
    """True when the array's bytes are already little-endian (or 1-byte)."""
    order = tensor.dtype.byteorder
    if order in ("<", "|"):
        # "<" explicit little-endian; "|" not applicable (single-byte types).
        return True
    if order == ">":
        return False
    if order == "=":
        return sys.byteorder == "little"
    raise ValueError(f"Unexpected byte order {order}")
source/safetensors/paddle.py ADDED
@@ -0,0 +1,290 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import sys
from typing import Any, Dict, Optional, Union

import numpy as np
import paddle

from safetensors import numpy, deserialize, safe_open, serialize, serialize_file


def _paddle_version_at_least(minimum) -> bool:
    """Numerically compare ``paddle.__version__`` against ``minimum`` (an int tuple).

    The previous check compared version strings lexicographically, which is
    wrong for multi-digit components: ``"3.10.0" >= "3.2.0"`` is False as
    strings. Non-numeric trailing components (rc/dev builds) stop the parse.
    """
    parts = []
    for piece in paddle.__version__.split("."):
        if not piece.isdigit():
            break
        parts.append(int(piece))
    return tuple(parts) >= tuple(minimum)


def save(
    tensors: Dict[str, paddle.Tensor], metadata: Optional[Dict[str, str]] = None
) -> bytes:
    """
    Saves a dictionary of tensors into raw bytes in safetensors format.

    Args:
        tensors (`Dict[str, paddle.Tensor]`):
            The incoming tensors. Tensors need to be contiguous and dense.
        metadata (`Dict[str, str]`, *optional*, defaults to `None`):
            Optional text only metadata stored in the header; purely
            informative, does not affect tensor loading.

    Returns:
        `bytes`: The raw bytes representing the format

    Example:

    ```python
    from safetensors.paddle import save
    import paddle

    tensors = {"embedding": paddle.zeros((512, 1024)), "attention": paddle.zeros((256, 256))}
    byte_data = save(tensors)
    ```
    """
    serialized = serialize(_flatten(tensors), metadata=metadata)
    result = bytes(serialized)
    return result


def save_file(
    tensors: Dict[str, paddle.Tensor],
    filename: Union[str, os.PathLike],
    metadata: Optional[Dict[str, str]] = None,
) -> None:
    """
    Saves a dictionary of tensors into a file in safetensors format.

    Args:
        tensors (`Dict[str, paddle.Tensor]`):
            The incoming tensors. Tensors need to be contiguous and dense.
        filename (`str`, or `os.PathLike`):
            The filename we're saving into.
        metadata (`Dict[str, str]`, *optional*, defaults to `None`):
            Optional text only metadata stored in the header; purely
            informative, does not affect tensor loading.

    Returns:
        `None`

    Example:

    ```python
    from safetensors.paddle import save_file
    import paddle

    tensors = {"embedding": paddle.zeros((512, 1024)), "attention": paddle.zeros((256, 256))}
    save_file(tensors, "model.safetensors")
    ```
    """
    serialize_file(_flatten(tensors), filename, metadata=metadata)


def load(data: bytes, device: str = "cpu") -> Dict[str, paddle.Tensor]:
    """
    Loads a safetensors file into paddle format from pure bytes.

    Args:
        data (`bytes`):
            The content of a safetensors file
        device (`str`, *optional*, defaults to `"cpu"`):
            The device where the tensors need to be located after load.

    Returns:
        `Dict[str, paddle.Tensor]`: dictionary that contains name as key, value as `paddle.Tensor`

    Example:

    ```python
    from safetensors.paddle import load

    file_path = "./my_folder/bert.safetensors"
    with open(file_path, "rb") as f:
        data = f.read()

    loaded = load(data)
    ```
    """
    if _paddle_version_at_least((3, 2)):
        # Paddle >= 3.2.0 can construct tensors directly from raw buffers.
        flat = deserialize(data)
        return _view2paddle(flat, device)
    else:
        # Older paddle: round-trip through numpy.
        flat = numpy.load(data)
        return _np2paddle(flat, device)


def load_file(
    filename: Union[str, os.PathLike], device="cpu"
) -> Dict[str, paddle.Tensor]:
    """
    Loads a safetensors file into paddle format.

    Args:
        filename (`str`, or `os.PathLike`):
            The name of the file which contains the tensors
        device (`Union[Dict[str, any], str]`, *optional*, defaults to `cpu`):
            The device where the tensors need to be located after load.
            available options are all regular paddle device locations

    Returns:
        `Dict[str, paddle.Tensor]`: dictionary that contains name as key, value as `paddle.Tensor`

    Example:

    ```python
    from safetensors.paddle import load_file

    file_path = "./my_folder/bert.safetensors"
    loaded = load_file(file_path)
    ```
    """
    result = {}
    if _paddle_version_at_least((3, 2)):
        with safe_open(filename, framework="paddle", device=device) as f:
            for k in f.offset_keys():
                result[k] = f.get_tensor(k)
    else:
        flat = numpy.load_file(filename)
        result = _np2paddle(flat, device)
    return result


def _np2paddle(
    numpy_dict: Dict[str, np.ndarray], device: str = "cpu"
) -> Dict[str, paddle.Tensor]:
    # Fresh dict: avoid mutating the argument in place.
    return {k: paddle.to_tensor(v, place=device) for k, v in numpy_dict.items()}


def _paddle2np(paddle_dict: Dict[str, paddle.Tensor]) -> Dict[str, np.ndarray]:
    # Fresh dict: avoid mutating the argument in place.
    return {k: v.detach().cpu().numpy() for k, v in paddle_dict.items()}


# paddle dtype -> element width in bytes.
_SIZE = {
    paddle.int64: 8,
    paddle.float32: 4,
    paddle.int32: 4,
    paddle.bfloat16: 2,
    paddle.float16: 2,
    paddle.int16: 2,
    paddle.uint8: 1,
    paddle.int8: 1,
    paddle.bool: 1,
    paddle.float64: 8,
    paddle.float8_e4m3fn: 1,
    paddle.float8_e5m2: 1,
    paddle.complex64: 8,
    # XXX: These are not supported yet in paddle
    # paddle.uint64: 8,
    # paddle.uint32: 4,
    # paddle.uint16: 2,
    # paddle.float8_e8m0: 1,
    # paddle.float4_e2m1_x2: 1,
}

# Safetensors dtype tag -> paddle dtype.
_TYPES = {
    "F64": paddle.float64,
    "F32": paddle.float32,
    "F16": paddle.float16,
    "BF16": paddle.bfloat16,
    "I64": paddle.int64,
    "I32": paddle.int32,
    "I16": paddle.int16,
    "I8": paddle.int8,
    "U8": paddle.uint8,
    "BOOL": paddle.bool,
    "F8_E4M3": paddle.float8_e4m3fn,
    "F8_E5M2": paddle.float8_e5m2,
}

# paddle dtype -> numpy dtype of the same width (used only for byteswapping).
NPDTYPES = {
    paddle.int64: np.int64,
    paddle.float32: np.float32,
    paddle.int32: np.int32,
    # XXX: This is ok because both have the same width
    paddle.bfloat16: np.float16,
    paddle.float16: np.float16,
    paddle.int16: np.int16,
    paddle.uint8: np.uint8,
    paddle.int8: np.int8,
    paddle.bool: bool,
    paddle.float64: np.float64,
    # XXX: This is ok because both have the same width and byteswap is a no-op anyway
    paddle.float8_e4m3fn: np.uint8,
    paddle.float8_e5m2: np.uint8,
}


def _getdtype(dtype_str: str) -> paddle.dtype:
    """Map a safetensors dtype tag (e.g. "F32") to the paddle dtype."""
    return _TYPES[dtype_str]


def _view2paddle(safeview, device) -> Dict[str, paddle.Tensor]:
    """Materialize a deserialized view into paddle tensors on ``device``."""
    result = {}
    for k, v in safeview:
        dtype = _getdtype(v["dtype"])
        if len(v["data"]) == 0:
            # Workaround because frombuffer doesn't accept zero-size tensors
            assert any(x == 0 for x in v["shape"])
            arr = paddle.empty(v["shape"], dtype=dtype)
        else:
            arr = paddle.base.core.frombuffer(v["data"], dtype).reshape(v["shape"])
        if device != "cpu":
            arr = arr.to(device)
        if sys.byteorder == "big":
            # On-disk data is little-endian; swap a copy on big-endian hosts.
            arr = paddle.to_tensor(arr.numpy().byteswap(inplace=False), place=device)
        result[k] = arr

    return result


def _tobytes(tensor: paddle.Tensor, name: str) -> bytes:
    """Return ``tensor``'s raw little-endian bytes without an internal copy."""
    if not tensor.is_contiguous():
        raise ValueError(
            f"You are trying to save a non contiguous tensor: `{name}` which is not allowed. It either means you"
            " are trying to save tensors which are reference of each other in which case it's recommended to save"
            " only the full tensors, and reslice at load time, or simply call `.contiguous()` on your tensor to"
            " pack it before saving."
        )
    if not tensor.place.is_cpu_place():
        # Moving tensor to cpu before saving
        tensor = tensor.cpu()

    import ctypes

    import numpy as np

    # When shape is empty (scalar), np.prod returns a float
    # we need a int for the following calculations
    length = int(np.prod(tensor.shape).item())
    bytes_per_item = _SIZE[tensor.dtype]

    total_bytes = length * bytes_per_item

    ptr = tensor.data_ptr()
    if ptr == 0:
        return b""
    newptr = ctypes.cast(ptr, ctypes.POINTER(ctypes.c_ubyte))
    data = np.ctypeslib.as_array(newptr, (total_bytes,))  # no internal copy
    if sys.byteorder == "big":
        npdtype = NPDTYPES[tensor.dtype]
        # Not in place as that would potentially modify a live running model
        data = data.view(npdtype).byteswap(inplace=False)
    return data.tobytes()


def _flatten(tensors: Dict[str, paddle.Tensor]) -> Dict[str, Dict[str, Any]]:
    """Validate a state dict and flatten it into the serializer's input shape."""
    if not isinstance(tensors, dict):
        raise ValueError(
            f"Expected a dict of [str, paddle.Tensor] but received {type(tensors)}"
        )

    for k, v in tensors.items():
        if not isinstance(v, paddle.Tensor):
            raise ValueError(
                f"Key `{k}` is invalid, expected paddle.Tensor but received {type(v)}"
            )

    return {
        k: {
            "dtype": str(v.dtype).split(".")[-1],
            "shape": v.shape,
            "data": _tobytes(v, k),
        }
        for k, v in tensors.items()
    }
source/safetensors/py.typed ADDED
File without changes
source/safetensors/tensorflow.py ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
from typing import Dict, Optional, Union

import numpy as np
import tensorflow as tf

from safetensors import numpy, safe_open


def save(
    tensors: Dict[str, tf.Tensor], metadata: Optional[Dict[str, str]] = None
) -> bytes:
    """
    Saves a dictionary of tensors into raw bytes in safetensors format.

    Args:
        tensors (`Dict[str, tf.Tensor]`):
            The incoming tensors. Tensors need to be contiguous and dense.
        metadata (`Dict[str, str]`, *optional*, defaults to `None`):
            Optional text only metadata stored in the header; purely
            informative, does not affect tensor loading.

    Returns:
        `bytes`: The raw bytes representing the format

    Example:

    ```python
    from safetensors.tensorflow import save
    import tensorflow as tf

    tensors = {"embedding": tf.zeros((512, 1024)), "attention": tf.zeros((256, 256))}
    byte_data = save(tensors)
    ```
    """
    np_tensors = _tf2np(tensors)
    return numpy.save(np_tensors, metadata=metadata)


def save_file(
    tensors: Dict[str, tf.Tensor],
    filename: Union[str, os.PathLike],
    metadata: Optional[Dict[str, str]] = None,
) -> None:
    """
    Saves a dictionary of tensors into a file in safetensors format.

    Args:
        tensors (`Dict[str, tf.Tensor]`):
            The incoming tensors. Tensors need to be contiguous and dense.
        filename (`str`, or `os.PathLike`):
            The filename we're saving into.
        metadata (`Dict[str, str]`, *optional*, defaults to `None`):
            Optional text only metadata stored in the header; purely
            informative, does not affect tensor loading.

    Returns:
        `None`

    Example:

    ```python
    from safetensors.tensorflow import save_file
    import tensorflow as tf

    tensors = {"embedding": tf.zeros((512, 1024)), "attention": tf.zeros((256, 256))}
    save_file(tensors, "model.safetensors")
    ```
    """
    np_tensors = _tf2np(tensors)
    return numpy.save_file(np_tensors, filename, metadata=metadata)


def load(data: bytes) -> Dict[str, tf.Tensor]:
    """
    Loads a safetensors file into tensorflow format from pure bytes.

    Args:
        data (`bytes`):
            The content of a safetensors file

    Returns:
        `Dict[str, tf.Tensor]`: dictionary that contains name as key, value as `tf.Tensor` on cpu

    Example:

    ```python
    from safetensors.tensorflow import load

    file_path = "./my_folder/bert.safetensors"
    with open(file_path, "rb") as f:
        data = f.read()

    loaded = load(data)
    ```
    """
    flat = numpy.load(data)
    return _np2tf(flat)


def load_file(filename: Union[str, os.PathLike]) -> Dict[str, tf.Tensor]:
    """
    Loads a safetensors file into tensorflow format.

    Args:
        filename (`str`, or `os.PathLike`):
            The name of the file which contains the tensors

    Returns:
        `Dict[str, tf.Tensor]`: dictionary that contains name as key, value as `tf.Tensor`

    Example:

    ```python
    from safetensors.tensorflow import load_file

    file_path = "./my_folder/bert.safetensors"
    loaded = load_file(file_path)
    ```
    """
    result = {}
    # Iterating in offset order keeps file reads sequential.
    with safe_open(filename, framework="tf") as f:
        for k in f.offset_keys():
            result[k] = f.get_tensor(k)
    return result


def _np2tf(numpy_dict: Dict[str, np.ndarray]) -> Dict[str, tf.Tensor]:
    # Build a fresh dict: the original rebound values on the input mapping,
    # mutating the caller's dictionary in place.
    return {k: tf.convert_to_tensor(v) for k, v in numpy_dict.items()}


def _tf2np(tf_dict: Dict[str, tf.Tensor]) -> Dict[str, np.ndarray]:
    # Same fix: `save(tensors)` used to destructively replace the caller's
    # tf.Tensors with numpy arrays inside their own dict.
    return {k: v.numpy() for k, v in tf_dict.items()}
@@ -0,0 +1,550 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ from collections import defaultdict
4
+ from typing import Any, Dict, List, Optional, Set, Tuple, Union
5
+ from packaging.version import Version
6
+
7
+ import torch
8
+
9
+ from safetensors import deserialize, safe_open, serialize, serialize_file
10
+
11
+
12
def storage_ptr(tensor: torch.Tensor) -> int:
    """Return the data pointer of ``tensor``'s underlying storage.

    Used to detect tensors that share memory; returns 0 when the storage
    exposes no pointer (meta device).
    """
    try:
        return tensor.untyped_storage().data_ptr()
    except Exception:
        # Fallback for torch==1.10
        try:
            return tensor.storage().data_ptr()
        except NotImplementedError:
            # Fallback for meta storage
            return 0
22
+
23
+
24
def _end_ptr(tensor: torch.Tensor) -> int:
    """Return the one-past-the-end address of ``tensor``'s data region."""
    if not tensor.nelement():
        # Empty tensor: the region is zero-length, start == end.
        return tensor.data_ptr()
    # Address of the last element plus that element's byte width.
    return tensor.view(-1)[-1].data_ptr() + _SIZE[tensor.dtype]
30
+
31
+
32
def storage_size(tensor: torch.Tensor) -> int:
    """Return the size in bytes of ``tensor``'s underlying storage."""
    try:
        return tensor.untyped_storage().nbytes()
    except AttributeError:
        # Fallback for torch==1.10
        try:
            return tensor.storage().size() * _SIZE[tensor.dtype]
        except NotImplementedError:
            # Fallback for meta storage
            # On torch >=2.0 this is the tensor size
            return tensor.nelement() * _SIZE[tensor.dtype]
43
+
44
+
45
def _filter_shared_not_shared(
    tensors: List[Set[str]], state_dict: Dict[str, torch.Tensor]
) -> List[Set[str]]:
    """Split candidate groups (same storage) into groups of actually
    overlapping tensors, by comparing each tensor's byte span."""
    result: List[Set[str]] = []
    for group in tensors:
        if len(group) < 2:
            # Nothing to disambiguate.
            result.append(group)
            continue

        # (start address, end address, name), ordered by start.
        spans = sorted(
            (state_dict[name].data_ptr(), _end_ptr(state_dict[name]), name)
            for name in group
        )

        _, prev_stop, first_name = spans[0]
        result.append({first_name})
        for start, stop, name in spans[1:]:
            if start >= prev_stop:
                # Disjoint byte range: starts a new group.
                result.append({name})
            else:
                # Overlaps the previous span: same group.
                result[-1].add(name)
            prev_stop = stop

    return result
70
+
71
+
72
def _find_shared_tensors(state_dict: Dict[str, torch.Tensor]) -> List[Set[str]]:
    """Group state-dict keys whose tensors live in the same storage."""
    by_storage = defaultdict(set)
    for name, tensor in state_dict.items():
        is_meta = tensor.device == torch.device("meta")
        if is_meta or storage_ptr(tensor) == 0 or storage_size(tensor) == 0:
            # Meta/empty storages expose no usable address; skip them.
            continue
        # Keyed on device too: the same pointer value can recur across GPUs.
        by_storage[(tensor.device, storage_ptr(tensor), storage_size(tensor))].add(name)
    groups = sorted(by_storage.values())
    # Same storage does not imply overlap; refine by actual byte spans.
    return _filter_shared_not_shared(groups, state_dict)
85
+
86
+
87
def _is_complete(tensor: torch.Tensor) -> bool:
    """Whether the tensor starts at byte 0 of its storage and covers all of it."""
    if tensor.data_ptr() != storage_ptr(tensor):
        return False
    return tensor.nelement() * _SIZE[tensor.dtype] == storage_size(tensor)
91
+
92
+
93
def _remove_duplicate_names(
    state_dict: Dict[str, torch.Tensor],
    *,
    preferred_names: Optional[List[str]] = None,
    discard_names: Optional[List[str]] = None,
) -> Dict[str, List[str]]:
    """For every group of shared tensors, pick one name to keep.

    Args:
        state_dict: tensors to deduplicate before saving.
        preferred_names: names favored as the kept name when complete.
        discard_names: names avoided as the kept name when possible.

    Returns:
        Mapping of kept name -> list of aliased names to drop.

    Raises:
        RuntimeError: if a shared group contains no tensor covering its
            entire storage, so no single name can represent the group.
    """
    if preferred_names is None:
        preferred_names = []
    preferred_names = set(preferred_names)
    if discard_names is None:
        discard_names = []
    discard_names = set(discard_names)

    shareds = _find_shared_tensors(state_dict)
    to_remove = defaultdict(list)
    for shared in shareds:
        # Only a tensor covering the whole storage can reconstruct every
        # alias of the group at load time.
        complete_names = set(
            [name for name in shared if _is_complete(state_dict[name])]
        )
        if not complete_names:
            raise RuntimeError(
                "Error while trying to find names to remove to save state dict, but found no suitable name to keep"
                f" for saving amongst: {shared}. None is covering the entire storage.Refusing to save/load the model"
                " since you could be storing much more memory than needed. Please refer to"
                " https://huggingface.co/docs/safetensors/torch_shared_tensors for more information. Or open an"
                " issue."
            )

        # Default: deterministic alphabetical choice.
        keep_name = sorted(list(complete_names))[0]

        # Mechanism to preferentially select keys to keep
        # coming from the on-disk file to allow
        # loading models saved with a different choice
        # of keep_name
        preferred = complete_names.difference(discard_names)
        if preferred:
            keep_name = sorted(list(preferred))[0]

        if preferred_names:
            preferred = preferred_names.intersection(complete_names)
            if preferred:
                keep_name = sorted(list(preferred))[0]
        for name in sorted(shared):
            if name != keep_name:
                to_remove[keep_name].append(name)
    return to_remove
139
+
140
+
141
def save_model(
    model: torch.nn.Module,
    filename: str,
    metadata: Optional[Dict[str, str]] = None,
    force_contiguous: bool = True,
):
    """
    Saves a given torch model to specified filename.
    This method exists specifically to avoid tensor sharing issues which are
    not allowed in `safetensors`. [More information on tensor sharing](../torch_shared_tensors)

    Args:
        model (`torch.nn.Module`):
            The model to save on disk.
        filename (`str`):
            The filename location to save the file
        metadata (`Dict[str, str]`, *optional*):
            Extra information to save along with the file.
            Some metadata will be added for each dropped tensors.
            This information will not be enough to recover the entire
            shared structure but might help understanding things
        force_contiguous (`boolean`, *optional*, defaults to True):
            Forcing the state_dict to be saved as contiguous tensors.
            This has no effect on the correctness of the model, but it
            could potentially change performance if the layout of the tensor
            was chosen specifically for that reason.
    """
    state_dict = model.state_dict()
    to_removes = _remove_duplicate_names(state_dict)

    # Work on a copy so the bookkeeping entries added below never mutate
    # the caller's metadata dict.
    if metadata is not None:
        metadata = dict(metadata)
    for kept_name, to_remove_group in to_removes.items():
        for to_remove in to_remove_group:
            if metadata is None:
                metadata = {}

            if to_remove not in metadata:
                # Do not override user data
                metadata[to_remove] = kept_name
            del state_dict[to_remove]
    if force_contiguous:
        state_dict = {k: v.contiguous() for k, v in state_dict.items()}
    try:
        save_file(state_dict, filename, metadata=metadata)
    except ValueError as e:
        msg = str(e)
        msg += " Or use save_model(..., force_contiguous=True), read the docs for potential caveats."
        raise ValueError(msg)
188
+
189
+
190
def load_model(
    model: torch.nn.Module,
    filename: Union[str, os.PathLike],
    strict: bool = True,
    device: Union[str, int] = "cpu",
) -> Tuple[List[str], List[str]]:
    """
    Loads a given filename onto a torch model.
    This method exists specifically to avoid tensor sharing issues which are
    not allowed in `safetensors`. [More information on tensor sharing](../torch_shared_tensors)

    Args:
        model (`torch.nn.Module`):
            The model to load onto.
        filename (`str`, or `os.PathLike`):
            The filename location to load the file from.
        strict (`bool`, *optional*, defaults to True):
            Whether to fail if you're missing keys or having unexpected ones.
            When false, the function simply returns missing and unexpected names.
        device (`Union[str, int]`, *optional*, defaults to `cpu`):
            The device where the tensors need to be located after load.
            available options are all regular torch device locations.

    Returns:
        `(missing, unexpected): (List[str], List[str])`
            `missing` are names in the model which were not modified during loading
            `unexpected` are names that are on the file, but weren't used during
            the load.
    """
    state_dict = load_file(filename, device=device)
    model_state_dict = model.state_dict()
    # The file may have kept different names for shared tensors than the
    # model would; prefer the file's names so aliases reconcile below.
    to_removes = _remove_duplicate_names(
        model_state_dict, preferred_names=state_dict.keys()
    )
    missing, unexpected = model.load_state_dict(state_dict, strict=False)
    missing = set(missing)
    for to_remove_group in to_removes.values():
        for to_remove in to_remove_group:
            # An alias of a loaded tensor is not really missing: its data
            # arrived via the kept name. If it was not reported missing, it
            # was redundantly present in the file, i.e. unexpected.
            if to_remove not in missing:
                unexpected.append(to_remove)
            else:
                missing.remove(to_remove)
    if strict and (missing or unexpected):
        missing_keys = ", ".join([f'"{k}"' for k in sorted(missing)])
        unexpected_keys = ", ".join([f'"{k}"' for k in sorted(unexpected)])
        error = f"Error(s) in loading state_dict for {model.__class__.__name__}:"
        if missing:
            error += f"\n    Missing key(s) in state_dict: {missing_keys}"
        if unexpected:
            error += f"\n    Unexpected key(s) in state_dict: {unexpected_keys}"
        raise RuntimeError(error)
    return missing, unexpected
242
+
243
+
244
def save(
    tensors: Dict[str, torch.Tensor], metadata: Optional[Dict[str, str]] = None
) -> bytes:
    """
    Saves a dictionary of tensors into raw bytes in safetensors format.

    Args:
        tensors (`Dict[str, torch.Tensor]`):
            The incoming tensors. Tensors need to be contiguous and dense.
        metadata (`Dict[str, str]`, *optional*, defaults to `None`):
            Optional text only metadata you might want to save in your header.
            For instance it can be useful to specify more about the underlying
            tensors. This is purely informative and does not affect tensor loading.

    Returns:
        `bytes`: The raw bytes representing the format

    Example:

    ```python
    from safetensors.torch import save
    import torch

    tensors = {"embedding": torch.zeros((512, 1024)), "attention": torch.zeros((256, 256))}
    byte_data = save(tensors)
    ```
    """
    flattened = _flatten(tensors)
    return bytes(serialize(flattened, metadata=metadata))
274
+
275
+
276
def save_file(
    tensors: Dict[str, torch.Tensor],
    filename: Union[str, os.PathLike],
    metadata: Optional[Dict[str, str]] = None,
):
    """
    Saves a dictionary of tensors into raw bytes in safetensors format.

    Args:
        tensors (`Dict[str, torch.Tensor]`):
            The incoming tensors. Tensors need to be contiguous and dense.
        filename (`str`, or `os.PathLike`)):
            The filename we're saving into.
        metadata (`Dict[str, str]`, *optional*, defaults to `None`):
            Optional text only metadata you might want to save in your header.
            For instance it can be useful to specify more about the underlying
            tensors. This is purely informative and does not affect tensor loading.

    Returns:
        `None`

    Example:

    ```python
    from safetensors.torch import save_file
    import torch

    tensors = {"embedding": torch.zeros((512, 1024)), "attention": torch.zeros((256, 256))}
    save_file(tensors, "model.safetensors")
    ```
    """
    flattened = _flatten(tensors)
    serialize_file(flattened, filename, metadata=metadata)
308
+
309
+
310
def load_file(
    filename: Union[str, os.PathLike], device: Union[str, int] = "cpu"
) -> Dict[str, torch.Tensor]:
    """
    Loads a safetensors file into torch format.

    Args:
        filename (`str`, or `os.PathLike`):
            The name of the file which contains the tensors
        device (`Union[str, int]`, *optional*, defaults to `cpu`):
            The device where the tensors need to be located after load.
            available options are all regular torch device locations.

    Returns:
        `Dict[str, torch.Tensor]`: dictionary that contains name as key, value as `torch.Tensor`

    Example:

    ```python
    from safetensors.torch import load_file

    file_path = "./my_folder/bert.safetensors"
    loaded = load_file(file_path)
    ```
    """
    with safe_open(filename, framework="pt", device=device) as handle:
        return {name: handle.get_tensor(name) for name in handle.offset_keys()}
340
+
341
+
342
def load(data: bytes) -> Dict[str, torch.Tensor]:
    """
    Loads a safetensors file into torch format from pure bytes.

    Args:
        data (`bytes`):
            The content of a safetensors file

    Returns:
        `Dict[str, torch.Tensor]`: dictionary that contains name as key, value as `torch.Tensor` on cpu

    Example:

    ```python
    from safetensors.torch import load

    file_path = "./my_folder/bert.safetensors"
    with open(file_path, "rb") as f:
        data = f.read()

    loaded = load(data)
    ```
    """
    return _view2torch(deserialize(data))
367
+
368
+
369
# torch.float8 formats require 2.1; we do not support these dtypes on earlier versions
_float8_e4m3fn = getattr(torch, "float8_e4m3fn", None)
_float8_e5m2 = getattr(torch, "float8_e5m2", None)
_float8_e8m0 = getattr(torch, "float8_e8m0fnu", None)
_float4_e2m1_x2 = getattr(torch, "float4_e2m1fn_x2", None)

# Bytes per element for every supported torch dtype. Optional dtypes that
# this torch build lacks resolve to a `None` key, which is harmless since
# no tensor can carry a missing dtype.
_SIZE = {
    torch.int64: 8,
    torch.float32: 4,
    torch.int32: 4,
    torch.bfloat16: 2,
    torch.float16: 2,
    torch.int16: 2,
    torch.uint8: 1,
    torch.int8: 1,
    torch.bool: 1,
    torch.float64: 8,
    torch.complex64: 8,
    _float8_e4m3fn: 1,
    _float8_e5m2: 1,
    _float8_e8m0: 1,
    _float4_e2m1_x2: 1,
}
# Wide unsigned integer dtypes only exist from torch 2.3 on.
if Version(torch.__version__) >= Version("2.3.0"):
    _SIZE.update(
        {
            torch.uint64: 8,
            torch.uint32: 4,
            torch.uint16: 2,
        }
    )

# safetensors dtype string -> torch dtype, used when deserializing.
_TYPES = {
    "F64": torch.float64,
    "F32": torch.float32,
    "F16": torch.float16,
    "BF16": torch.bfloat16,
    "I64": torch.int64,
    "I32": torch.int32,
    "I16": torch.int16,
    "I8": torch.int8,
    "U8": torch.uint8,
    "BOOL": torch.bool,
    "F8_E4M3": _float8_e4m3fn,
    "F8_E5M2": _float8_e5m2,
    "C64": torch.complex64,
}
if Version(torch.__version__) >= Version("2.3.0"):
    _TYPES.update(
        {
            "U64": torch.uint64,
            "U32": torch.uint32,
            "U16": torch.uint16,
        }
    )
424
+
425
+
426
def _getdtype(dtype_str: str) -> torch.dtype:
    """Map a safetensors dtype string (e.g. ``"F32"``) to the torch dtype."""
    return _TYPES[dtype_str]
428
+
429
+
430
def _view2torch(safeview) -> Dict[str, torch.Tensor]:
    """Convert deserialized (name, view) pairs into a dict of torch tensors."""
    tensors = {}
    for name, view in safeview:
        dtype = _getdtype(view["dtype"])
        shape = view["shape"]
        raw = view["data"]
        if len(raw) == 0:
            # Workaround because frombuffer doesn't accept zero-size tensors:
            # an empty buffer implies at least one zero dimension.
            assert any(dim == 0 for dim in shape)
            tensor = torch.empty(shape, dtype=dtype)
        else:
            tensor = torch.frombuffer(raw, dtype=dtype).reshape(shape)
        if sys.byteorder == "big":
            # Swap via numpy on big-endian hosts (not in place).
            tensor = torch.from_numpy(tensor.numpy().byteswap(inplace=False))
        tensors[name] = tensor

    return tensors
445
+
446
+
447
def _tobytes(tensor: torch.Tensor, name: str) -> bytes:
    """Return the raw bytes of ``tensor``'s data.

    ``name`` is only used in error messages. The tensor must be dense
    (strided layout) and contiguous; it is moved to CPU if needed.
    """
    if tensor.layout != torch.strided:
        raise ValueError(
            f"You are trying to save a sparse tensor: `{name}` which this library does not support."
            " You can make it a dense tensor before saving with `.to_dense()` but be aware this might"
            " make a much larger file than needed."
        )

    if not tensor.is_contiguous():
        raise ValueError(
            f"You are trying to save a non contiguous tensor: `{name}` which is not allowed. It either means you"
            " are trying to save tensors which are reference of each other in which case it's recommended to save"
            " only the full tensors, and reslice at load time, or simply call `.contiguous()` on your tensor to"
            " pack it before saving."
        )
    if tensor.device.type != "cpu":
        # Moving tensor to cpu before saving
        tensor = tensor.to("cpu")

    import ctypes

    import numpy as np

    # When shape is empty (scalar), np.prod returns a float
    # we need a int for the following calculations
    length = int(np.prod(tensor.shape).item())
    bytes_per_item = _SIZE[tensor.dtype]

    total_bytes = length * bytes_per_item

    ptr = tensor.data_ptr()
    if ptr == 0:
        # A null data pointer means there is nothing to copy out.
        return b""
    # Wrap the live tensor memory as a flat uint8 numpy array (zero-copy).
    newptr = ctypes.cast(ptr, ctypes.POINTER(ctypes.c_ubyte))
    data = np.ctypeslib.as_array(newptr, (total_bytes,))  # no internal copy
    if sys.byteorder == "big":
        # On big-endian hosts the bytes must be swapped before writing;
        # each dtype is reinterpreted as a numpy dtype of matching width.
        NPDTYPES = {
            torch.int64: np.int64,
            torch.float32: np.float32,
            torch.int32: np.int32,
            # XXX: This is ok because both have the same width
            torch.bfloat16: np.float16,
            torch.float16: np.float16,
            torch.int16: np.int16,
            torch.uint8: np.uint8,
            torch.int8: np.int8,
            torch.bool: bool,
            torch.float64: np.float64,
            # XXX: This is ok because both have the same width and byteswap is a no-op anyway
            _float8_e4m3fn: np.uint8,
            _float8_e5m2: np.uint8,
            torch.complex64: np.complex64,
        }
        npdtype = NPDTYPES[tensor.dtype]
        # Not in place as that would potentially modify a live running model
        data = data.view(npdtype).byteswap(inplace=False)
    return data.tobytes()
504
+
505
+
506
def _flatten(tensors: Dict[str, torch.Tensor]) -> Dict[str, Dict[str, Any]]:
    """Validate a dict of tensors and convert it to the serializable layout.

    Rejects non-dict inputs, non-tensor values, sparse tensors, and any
    tensors that share memory (which would duplicate data on disk).
    """
    if not isinstance(tensors, dict):
        raise ValueError(
            f"Expected a dict of [str, torch.Tensor] but received {type(tensors)}"
        )

    invalid_tensors = []
    for name, tensor in tensors.items():
        if not isinstance(tensor, torch.Tensor):
            raise ValueError(
                f"Key `{name}` is invalid, expected torch.Tensor but received {type(tensor)}"
            )
        if tensor.layout != torch.strided:
            invalid_tensors.append(name)
    if invalid_tensors:
        raise ValueError(
            f"You are trying to save a sparse tensors: `{invalid_tensors}` which this library does not support."
            " You can make it a dense tensor before saving with `.to_dense()` but be aware this might"
            " make a much larger file than needed."
        )

    failing = [group for group in _find_shared_tensors(tensors) if len(group) > 1]

    if failing:
        raise RuntimeError(
            f"""
            Some tensors share memory, this will lead to duplicate memory on disk and potential differences when loading them again: {failing}.
            A potential way to correctly save your model is to use `save_model`.
            More information at https://huggingface.co/docs/safetensors/torch_shared_tensors
            """
        )

    return {
        name: {
            "dtype": str(tensor.dtype).split(".")[-1],
            "shape": tensor.shape,
            "data": _tobytes(tensor, name),
        }
        for name, tensor in tensors.items()
    }
source/sentencepiece-0.2.1.dist-info/INSTALLER ADDED
@@ -0,0 +1 @@
 
 
1
+ pip
source/sentencepiece-0.2.1.dist-info/METADATA ADDED
@@ -0,0 +1,251 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Metadata-Version: 2.4
2
+ Name: sentencepiece
3
+ Version: 0.2.1
4
+ Summary: Unsupervised text tokenizer and detokenizer.
5
+ Author-email: Taku Kudo <taku@google.com>
6
+ Project-URL: Homepage, https://github.com/google/sentencepiece
7
+ Classifier: Programming Language :: Python :: 3
8
+ Classifier: Development Status :: 5 - Production/Stable
9
+ Classifier: Environment :: Console
10
+ Classifier: Intended Audience :: Developers
11
+ Classifier: Intended Audience :: Science/Research
12
+ Classifier: Operating System :: MacOS :: MacOS X
13
+ Classifier: Operating System :: Microsoft :: Windows
14
+ Classifier: Operating System :: POSIX :: Linux
15
+ Classifier: Programming Language :: Python
16
+ Classifier: Programming Language :: Python :: 3.9
17
+ Classifier: Programming Language :: Python :: 3.10
18
+ Classifier: Programming Language :: Python :: 3.11
19
+ Classifier: Programming Language :: Python :: 3.12
20
+ Classifier: Programming Language :: Python :: 3.13
21
+ Classifier: Programming Language :: Python :: 3.14
22
+ Classifier: Programming Language :: Python :: Free Threading :: 2 - Beta
23
+ Classifier: Topic :: Text Processing :: Linguistic
24
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
25
+ Requires-Python: >=3.9
26
+ Description-Content-Type: text/markdown
27
+ Provides-Extra: test
28
+ Requires-Dist: pytest; extra == "test"
29
+ Provides-Extra: testpaths
30
+ Requires-Dist: test; extra == "testpaths"
31
+
32
+ # SentencePiece Python Wrapper
33
+
34
+ Python wrapper for SentencePiece. This API will offer the encoding, decoding and training of Sentencepiece.
35
+
36
+ ## Build and Install SentencePiece
37
+
38
+ For Linux (x64/i686), macOS, and Windows(win32/x64/arm64) environment, you can simply use pip command to install SentencePiece python module.
39
+
40
+ ```
41
+ % pip install sentencepiece
42
+ ```
43
+
44
+ Before building SentencePiece from source on Linux, ensure that the following dependencies are installed.
45
+
46
+ ```
47
+ % sudo apt update
48
+ % sudo apt install -y cmake pkg-config libsentencepiece-dev
49
+ ```
50
+
51
+ To build and install the Python wrapper from source, try the following commands to build and install wheel package.
52
+
53
+ ```
54
+ % git clone https://github.com/google/sentencepiece.git
55
+ % cd sentencepiece
56
+ % mkdir build
57
+ % cd build
58
+ % cmake .. -DSPM_ENABLE_SHARED=OFF -DCMAKE_INSTALL_PREFIX=./root -DSPM_DISABLE_EMBEDDED_DATA=ON
59
+ % make install
60
+ % cd ../python
61
+ % python setup.py bdist_wheel
62
+ % pip install dist/sentencepiece*.whl
63
+ ```
64
+
65
+ If you don’t have write permission to the global site-packages directory or don’t want to install into it, please try:
66
+
67
+ ```
68
+ % python setup.py install --user
69
+ ```
70
+
71
+ For Windows users who want to build from source, you can build and install the Python wrapper using Visual Studio. First, you need to install the `pwsh.exe` (Powershell 7). Use `winget install --id Microsoft.Powershell --source winget` to install directly. Then open the `Developer PowerShell for VS 2022`, and execute the following commands.
72
+
73
+ ```
74
+ git clone https://github.com/google/sentencepiece.git
75
+ cd sentencepiece
76
+ mkdir build
77
+ cd build
78
+ cmake .. -DSPM_ENABLE_SHARED=OFF -DCMAKE_INSTALL_PREFIX=".\root" -DSPM_DISABLE_EMBEDDED_DATA=ON
79
+ cmake --build . --config Release --target install
80
+ cd ../python
81
+ pip install wheel
82
+ python setup.py bdist_wheel
83
+ Get-ChildItem .\dist\sentencepiece*.whl | ForEach-Object { pip install $_.FullName }
84
+ ```
85
+
86
+ ## Usage
87
+
88
+ See [this google colab page](https://github.com/google/sentencepiece/blob/master/python/sentencepiece_python_module_example.ipynb) to run sentencepiece interactively.
89
+
90
+ ### Segmentation
91
+
92
+ ```
93
+ % python
94
+ >>> import sentencepiece as spm
95
+ >>> sp = spm.SentencePieceProcessor(model_file='test/test_model.model')
96
+
97
+ >>> sp.encode('This is a test')
98
+ [284, 47, 11, 4, 15, 400]
99
+
100
+ >>> sp.encode(['This is a test', 'Hello world'], out_type=int)
101
+ [[284, 47, 11, 4, 15, 400], [151, 88, 21, 887]]
102
+
103
+ >>> sp.encode_as_ids(['This is a test', 'Hello world'])
104
+ [[284, 47, 11, 4, 15, 400], [151, 88, 21, 887]]
105
+
106
+ >>> sp.encode('This is a test', out_type=str)
107
+ ['▁This', '▁is', '▁a', '▁', 't', 'est']
108
+
109
+ >>> sp.encode(['This is a test', 'Hello world'], out_type=str)
110
+ [['▁This', '▁is', '▁a', '▁', 't', 'est'], ['▁He', 'll', 'o', '▁world']]
111
+
112
+ >>> sp.encode_as_pieces(['This is a test', 'Hello world'])
113
+ [['▁This', '▁is', '▁a', '▁', 't', 'est'], ['▁He', 'll', 'o', '▁world']]
114
+
115
+ >>> proto = sp.encode('This is a test', out_type='immutable_proto')
116
+ >>> for n in proto.pieces:
117
+ ... print('piece="{}" surface="{}" id={} begin={} end={}'.format(n.piece, n.surface, n.id, n.begin, n.end))
118
+ ...
119
+ piece="▁This" surface="This" id=284 begin=0 end=4
120
+ piece="▁is" surface=" is" id=47 begin=4 end=7
121
+ piece="▁a" surface=" a" id=11 begin=7 end=9
122
+ piece="▁" surface=" " id=4 begin=9 end=10
123
+ piece="t" surface="t" id=15 begin=10 end=11
124
+ piece="est" surface="est" id=400 begin=11 end=14
125
+
126
+ >>> [[x.id for x in proto.pieces], [x.piece for x in proto.pieces], [x.begin for x in proto.pieces], [x.end for x in proto.pieces]]
127
+ [[284, 47, 11, 4, 15, 400], ['▁This', '▁is', '▁a', '▁', 't', 'est'], [0, 4, 7, 9, 10, 11], [4, 7, 9, 10, 11, 14]]
128
+
129
+ >>> proto2 = sp.encode_as_immutable_proto('This is a test')
130
+ >>> proto2 == proto
131
+ True
132
+
133
+ >>> for _ in range(10):
134
+ ... sp.encode('This is a test', out_type=str, enable_sampling=True, alpha=0.1, nbest_size=-1)
135
+ ...
136
+ ['▁', 'This', '▁', 'is', '▁a', '▁', 't', 'e', 'st']
137
+ ['▁T', 'h', 'i', 's', '▁is', '▁a', '▁', 'te', 's', 't']
138
+ ['▁T', 'h', 'is', '▁', 'is', '▁', 'a', '▁', 't', 'est']
139
+ ['▁', 'This', '▁is', '▁', 'a', '▁', 't', 'e', 'st']
140
+ ['▁', 'This', '▁', 'is', '▁', 'a', '▁', 't', 'e', 's', 't']
141
+ ['▁This', '▁is', '▁a', '▁', 'te', 's', 't']
142
+ ['▁This', '▁is', '▁', 'a', '▁', 't', 'e', 'st']
143
+ ['▁', 'T', 'h', 'is', '▁', 'is', '▁', 'a', '▁', 'te', 'st']
144
+ ['▁', 'This', '▁', 'i', 's', '▁a', '▁', 't', 'e', 'st']
145
+ ['▁This', '▁', 'is', '▁a', '▁', 't', 'est']
146
+
147
+ >>> sp.nbest_encode('This is a test', nbest_size=5, out_type=str)
148
+ [['▁This', '▁is', '▁a', '▁', 't', 'est'],
149
+ ['▁This', '▁is', '▁a', '▁', 'te', 'st'],
150
+ ['▁This', '▁is', '▁a', '▁', 'te', 's', 't'],
151
+ ['▁This', '▁is', '▁a', '▁', 't', 'e', 'st'],
152
+ ['▁This', '▁is', '▁a', '▁', 't', 'es', 't']]
153
+
154
+ >>> sp.sample_encode_and_score('This is a test', num_samples=5, alpha=0.1, out_type=str, wor=True)
155
+ [(['▁This', '▁', 'i', 's', '▁a', '▁', 'te', 's', 't'], -3.043105125427246),
156
+ (['▁This', '▁', 'i', 's', '▁a', '▁', 'te', 'st'], -2.8475849628448486),
157
+ (['▁', 'This', '▁is', '▁', 'a', '▁', 'te', 'st'], -3.043248176574707),
158
+ (['▁', 'This', '▁is', '▁a', '▁', 't', 'e', 'st'], -2.87727689743042),
159
+ (['▁', 'This', '▁', 'i', 's', '▁', 'a', '▁', 't', 'est'], -3.6284031867980957)]
160
+
161
+ >>> sp.decode([284, 47, 11, 4, 15, 400])
162
+ 'This is a test'
163
+
164
+ >>> sp.decode([[284, 47, 11, 4, 15, 400], [151, 88, 21, 887]])
165
+ ['This is a test', 'Hello world']
166
+
167
+ >>> proto = sp.decode([284, 47, 11, 4, 15, 400], out_type='immutable_proto')
168
+ >>> proto.text
169
+ 'This is a test'
170
+
171
+ >>> sp.decode(['▁', 'This', '▁', 'is', '▁a', '▁', 't', 'e', 'st'])
172
+ 'This is a test'
173
+
174
+ >>> sp.decode([['▁This', '▁is', '▁a', '▁', 't', 'est'], ['▁He', 'll', 'o', '▁world']])
175
+ ['This is a test', 'Hello world']
176
+
177
+ >>> sp.get_piece_size()
178
+ 1000
179
+
180
+ >>> sp.id_to_piece(2)
181
+ '</s>'
182
+
183
+ >>> sp.id_to_piece([2, 3, 4])
184
+ ['</s>', '\r', '▁']
185
+
186
+ >>> sp.piece_to_id('<s>')
187
+ 1
188
+
189
+ >>> sp.piece_to_id(['</s>', '\r', '▁'])
190
+ [2, 3, 4]
191
+
192
+ >>> len(sp)
193
+ 1000
194
+
195
+ >>> sp['</s>']
196
+ 2
197
+ ```
198
+
199
+ ### Model Training
200
+
201
+ Training is performed by passing parameters of [spm_train](https://github.com/google/sentencepiece#train-sentencepiece-model) to SentencePieceTrainer.train() function.
202
+
203
+ ```
204
+ >>> import sentencepiece as spm
205
+ >>> spm.SentencePieceTrainer.train(input='test/botchan.txt', model_prefix='m', vocab_size=1000, user_defined_symbols=['foo', 'bar'])
206
+ sentencepiece_trainer.cc(73) LOG(INFO) Starts training with :
207
+ trainer_spec {
208
+ input: test/botchan.txt
209
+ .. snip
210
+ unigram_model_trainer.cc(500) LOG(INFO) EM sub_iter=1 size=1188 obj=10.2839 num_tokens=32182 num_tokens/piece=27.0892
211
+ unigram_model_trainer.cc(500) LOG(INFO) EM sub_iter=0 size=1100 obj=10.4269 num_tokens=33001 num_tokens/piece=30.0009
212
+ unigram_model_trainer.cc(500) LOG(INFO) EM sub_iter=1 size=1100 obj=10.4069 num_tokens=33002 num_tokens/piece=30.0018
213
+ trainer_interface.cc(595) LOG(INFO) Saving model: m.model
214
+ trainer_interface.cc(619) LOG(INFO) Saving vocabs: m.vocab
215
+ >>>
216
+ ```
217
+
218
+ ### Training without local filesystem
219
+
220
+ Sentencepiece trainer can receive any iterable object to feed training sentences. You can also pass a file object (instance with write() method) to emit the output model to any devices. These features are useful to run sentencepiece on environment that have limited access to the local file system (e.g., Google colab.)
221
+
222
+ ```
223
+ import urllib.request
224
+ import io
225
+ import sentencepiece as spm
226
+
227
+ # Loads model from URL as iterator and stores the model to BytesIO.
228
+ model = io.BytesIO()
229
+ with urllib.request.urlopen(
230
+ 'https://raw.githubusercontent.com/google/sentencepiece/master/data/botchan.txt'
231
+ ) as response:
232
+ spm.SentencePieceTrainer.train(
233
+ sentence_iterator=response, model_writer=model, vocab_size=1000)
234
+
235
+ # Serialize the model as file.
236
+ # with open('out.model', 'wb') as f:
237
+ # f.write(model.getvalue())
238
+
239
+ # Directly load the model from serialized model.
240
+ sp = spm.SentencePieceProcessor(model_proto=model.getvalue())
241
+ print(sp.encode('this is test'))
242
+ ```
243
+
244
+ ### Free Threading support
245
+ Experimental support for no-GIL/Free-Threading has been introduced since v0.2.1. For more details, please refer to [this page](https://py-free-threading.github.io/).
246
+ This operates similarly to how [NumPy](https://numpy.org/devdocs/reference/thread_safety.html#free-threaded-python) handles it.
247
+
248
+ The C++ library's const and static methods, e.g., encode(), decode() and train(), are designed to work in a non-GIL environment.
249
+ However, non-const methods, e.g., load(), may have potential data race issues, so please ensure you implement appropriate locks beforehand.
250
+
251
+ While this limitation might be removed in the future, please note that it's not a simple fix, as it would require additional shared locks in C++.
source/sentencepiece-0.2.1.dist-info/RECORD ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ sentencepiece-0.2.1.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
2
+ sentencepiece-0.2.1.dist-info/METADATA,sha256=zgounFmc0GCnvD3_sYvlz8PV8pNTPj7k6HDq7wpVAP0,10003
3
+ sentencepiece-0.2.1.dist-info/RECORD,,
4
+ sentencepiece-0.2.1.dist-info/WHEEL,sha256=1crAxrAH5rUbvWUY1UR0ly3o7KnT1jo0_98V8RY5-FM,152
5
+ sentencepiece-0.2.1.dist-info/top_level.txt,sha256=NIXVKmsq-xdZ5KROMHHV0gkLA3fvbNTb1g1KtgiYFOk,130
6
+ sentencepiece/__init__.py,sha256=NQT0kP3jkjEmRRIyK_jsKCug6L9zQi2vIPTWQin8E2w,49422
7
+ sentencepiece/__pycache__/__init__.cpython-312.pyc,,
8
+ sentencepiece/__pycache__/_version.cpython-312.pyc,,
9
+ sentencepiece/__pycache__/sentencepiece_model_pb2.cpython-312.pyc,,
10
+ sentencepiece/__pycache__/sentencepiece_pb2.cpython-312.pyc,,
11
+ sentencepiece/_sentencepiece.cpython-312-x86_64-linux-gnu.so,sha256=z2sCOJaqHJkCZEXvwrXL07ipfGg53Q_fsrynV_vtlw0,2005912
12
+ sentencepiece/_version.py,sha256=PmcQ2PI2oP8irnLtJLJby2YfW6sBvLAmL-VpABzTqwc,22
13
+ sentencepiece/package_data/nfkc.bin,sha256=UvEQKP-KffPgCdlKi2pU1KihcTLvtMzByaCkHkMr2R4,240008
14
+ sentencepiece/package_data/nfkc_cf.bin,sha256=YIM-wRIBRGZZw1ScGDsY8CTEAHYozGs6TpGuAHaXuCY,247028
15
+ sentencepiece/package_data/nmt_nfkc.bin,sha256=eTcsQTicK5spvBcQF6tUAONS3r1oawJnCkK-xwkBUHQ,240007
16
+ sentencepiece/package_data/nmt_nfkc_cf.bin,sha256=IsKSx29QN5XzDIXXnTCn9XL_9PSeADktLWD0-T6UGh4,247027
17
+ sentencepiece/sentencepiece.i,sha256=Hfv8AHFOJEfDfElVYIhoz29W7rV1VJ0Z13aP7S7ck6M,72647
18
+ sentencepiece/sentencepiece_model_pb2.py,sha256=LawEwmdUiIU1T9HcYu-rNEVTFcwAh9i-qavMMsg9riE,6257
19
+ sentencepiece/sentencepiece_pb2.py,sha256=_ZgnXOkpoScMXbJ-8BMKn2Q97BbMOH9Hz-L7JFMcJro,1753
20
+ sentencepiece/sentencepiece_wrap.cxx,sha256=XlbUFs7s48i3i_nhka9U_b41Xpv_eeSD9U_uxO742Y0,381494
source/sentencepiece-0.2.1.dist-info/WHEEL ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (80.9.0)
3
+ Root-Is-Purelib: false
4
+ Tag: cp312-cp312-manylinux_2_27_x86_64
5
+ Tag: cp312-cp312-manylinux_2_28_x86_64
6
+
source/sentencepiece-0.2.1.dist-info/top_level.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ sentencepiece
2
+ sentencepiece/__init__
3
+ sentencepiece/_version
4
+ sentencepiece/sentencepiece_model_pb2
5
+ sentencepiece/sentencepiece_pb2
source/sentencepiece/__init__.py ADDED
@@ -0,0 +1,1230 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This file was automatically generated by SWIG (https://www.swig.org).
2
+ # Version 4.3.0
3
+ #
4
+ # Do not make changes to this file unless you know what you are doing - modify
5
+ # the SWIG interface file instead.
6
+
7
+ from sys import version_info as _swig_python_version_info
8
+ # Import the low-level C/C++ module
9
+ if __package__ or "." in __name__:
10
+ from . import _sentencepiece
11
+ else:
12
+ import _sentencepiece
13
+
14
+ try:
15
+ import builtins as __builtin__
16
+ except ImportError:
17
+ import __builtin__
18
+
19
def _swig_repr(self):
    """Build the generic SWIG repr string for a proxy object."""
    try:
        description = "proxy of " + self.this.__repr__()
    except __builtin__.Exception:
        # The underlying C object may already be gone; show an empty description.
        description = ""
    cls = self.__class__
    return "<%s.%s; %s >" % (cls.__module__, cls.__name__, description,)
25
+
26
+
27
def _swig_setattr_nondynamic_instance_variable(set):
    """Wrap *set* into a __setattr__ that blocks creation of new instance attributes."""
    def set_instance_attr(self, name, value):
        if name == "this":
            # SWIG bookkeeping slot: always assignable.
            set(self, name, value)
            return
        if name == "thisown":
            # Ownership changes are delegated to the SWIG pointer object.
            self.this.own(value)
            return
        if hasattr(self, name) and isinstance(getattr(type(self), name), property):
            # Existing properties keep working through the wrapped setter.
            set(self, name, value)
            return
        raise AttributeError("You cannot add instance attributes to %s" % self)
    return set_instance_attr
38
+
39
+
40
def _swig_setattr_nondynamic_class_variable(set):
    """Wrap *set* into a metaclass __setattr__ that only rebinds existing, non-property class attributes."""
    def set_class_attr(cls, name, value):
        rebinding_existing = hasattr(cls, name) and not isinstance(getattr(cls, name), property)
        if not rebinding_existing:
            raise AttributeError("You cannot add class attributes to %s" % cls)
        set(cls, name, value)
    return set_class_attr
47
+
48
+
49
def _swig_add_metaclass(metaclass):
    """Class decorator for adding a metaclass to a SWIG wrapped class - a slimmed down version of six.add_metaclass"""
    def wrapper(cls):
        # Rebuild the class through the requested metaclass, preserving its
        # name, bases and a copy of the class namespace.
        namespace = cls.__dict__.copy()
        return metaclass(cls.__name__, cls.__bases__, namespace)
    return wrapper
54
+
55
+
56
class _SwigNonDynamicMeta(type):
    """Meta class to enforce nondynamic attributes (no new attributes) for a class"""
    # Route class-attribute assignment through the SWIG guard so that new
    # class attributes cannot be created at runtime.
    __setattr__ = _swig_setattr_nondynamic_class_variable(type.__setattr__)
59
+
60
+
61
class ImmutableSentencePieceText_ImmutableSentencePiece(object):
    """Read-only view of a single piece (token) inside an ImmutableSentencePieceText.

    Exposes the piece string, its vocabulary id, and the surface span
    [begin, end) in the original text. All data lives in the underlying
    C++ proto; the Python attributes are read-only properties delegating
    into the _sentencepiece extension.
    """

    thisown = property(lambda x: x.this.own(), lambda x, v: x.this.own(v), doc="The membership flag")
    __repr__ = _swig_repr  # rebound to __str__ at the end of the class body

    def __init__(self):
        _sentencepiece.ImmutableSentencePieceText_ImmutableSentencePiece_swiginit(self, _sentencepiece.new_ImmutableSentencePieceText_ImmutableSentencePiece())
    __swig_destroy__ = _sentencepiece.delete_ImmutableSentencePieceText_ImmutableSentencePiece

    # Thin delegations into the C extension; exposed as properties below.
    def _piece(self):
        return _sentencepiece.ImmutableSentencePieceText_ImmutableSentencePiece__piece(self)

    def _surface(self):
        return _sentencepiece.ImmutableSentencePieceText_ImmutableSentencePiece__surface(self)

    def _id(self):
        return _sentencepiece.ImmutableSentencePieceText_ImmutableSentencePiece__id(self)

    def _begin(self):
        return _sentencepiece.ImmutableSentencePieceText_ImmutableSentencePiece__begin(self)

    def _end(self):
        return _sentencepiece.ImmutableSentencePieceText_ImmutableSentencePiece__end(self)

    def _surface_as_bytes(self):
        return _sentencepiece.ImmutableSentencePieceText_ImmutableSentencePiece__surface_as_bytes(self)

    def _piece_as_bytes(self):
        return _sentencepiece.ImmutableSentencePieceText_ImmutableSentencePiece__piece_as_bytes(self)

    piece = property(_piece)
    piece_as_bytes = property(_piece_as_bytes)
    surface = property(_surface)
    surface_as_bytes = property(_surface_as_bytes)
    id = property(_id)
    begin = property(_begin)
    end = property(_end)

    def __str__(self):
        # Proto-text-like rendering of all fields.
        return ('piece: \"{}\"\n'
                'id: {}\n'
                'surface: \"{}\"\n'
                'begin: {}\n'
                'end: {}\n').format(self.piece, self.id, self.surface,
                                    self.begin, self.end)

    def __eq__(self, other):
        # Field-wise comparison; assumes *other* exposes the same attributes
        # (raises AttributeError otherwise).
        return self.piece == other.piece and self.id == other.id and self.surface == other.surface and self.begin == other.begin and self.end == other.end

    def __hash__(self):
        # Hash derived from the string rendering so it is consistent with __eq__.
        return hash(str(self))

    __repr__ = __str__


# Register ImmutableSentencePieceText_ImmutableSentencePiece in _sentencepiece:
_sentencepiece.ImmutableSentencePieceText_ImmutableSentencePiece_swigregister(ImmutableSentencePieceText_ImmutableSentencePiece)
117
class ImmutableSentencePieceText(object):
    """Read-only wrapper around one SentencePieceText proto (a single tokenization).

    Provides the input `text`, the tokenization `score`, and a lazy
    sequence view `pieces` over the individual tokens.
    """

    thisown = property(lambda x: x.this.own(), lambda x, v: x.this.own(v), doc="The membership flag")
    __repr__ = _swig_repr  # rebound to __str__ at the end of the class body

    def __init__(self):
        _sentencepiece.ImmutableSentencePieceText_swiginit(self, _sentencepiece.new_ImmutableSentencePieceText())
    __swig_destroy__ = _sentencepiece.delete_ImmutableSentencePieceText

    # Thin delegations into the C extension.
    def _pieces_size(self):
        return _sentencepiece.ImmutableSentencePieceText__pieces_size(self)

    def _pieces(self, index):
        return _sentencepiece.ImmutableSentencePieceText__pieces(self, index)

    def _text(self):
        return _sentencepiece.ImmutableSentencePieceText__text(self)

    def _score(self):
        return _sentencepiece.ImmutableSentencePieceText__score(self)

    def SerializeAsString(self):
        return _sentencepiece.ImmutableSentencePieceText_SerializeAsString(self)

    def _text_as_bytes(self):
        return _sentencepiece.ImmutableSentencePieceText__text_as_bytes(self)

    text = property(_text)
    text_as_bytes = property(_text_as_bytes)
    score = property(_score)

    class ImmutableSentencePieceIterator:
        """Sequence-like, lazy view over the pieces of the underlying proto."""

        def __init__(self, proto):
            self.proto = proto
            self.len = self.proto._pieces_size()

        def __len__(self):
            return self.len

        def __getitem__(self, index):
            # Slices materialize every piece first, then slice the list;
            # plain indices support one round of negative wrap-around.
            if isinstance(index, slice):
                return [self.proto._pieces(i) for i in range(self.len)][index.start:index.stop:index.step]
            if index < 0:
                index = index + self.len
            if index < 0 or index >= self.len:
                raise IndexError('piece index is out of range')
            return self.proto._pieces(index)

        def __str__(self):
            return '\n'.join(['pieces {{\n{}}}'.format(str(x)) for x in self])

        __repr__ = __str__

    @property
    def pieces(self):
        # Fresh iterator view on every access; the proto itself stays immutable.
        return ImmutableSentencePieceText.ImmutableSentencePieceIterator(self)

    def __eq__(self, other):
        # Protos compare equal iff their serialized forms are byte-identical.
        return self.SerializeAsString() == other.SerializeAsString()

    def __hash__(self):
        return hash(self.SerializeAsString())

    def __str__(self):
        # Proto-text-like rendering: text, score, then every piece.
        return ('text: \"{}\"\n'
                'score: {}\n'
                '{}').format(self.text, self.score,
                             '\n'.join(['pieces {{\n{}}}'.format(str(x)) for x in self.pieces]))

    __repr__ = __str__


# Register ImmutableSentencePieceText in _sentencepiece:
_sentencepiece.ImmutableSentencePieceText_swigregister(ImmutableSentencePieceText)
190
class ImmutableNBestSentencePieceText(object):
    """Read-only wrapper around an NBestSentencePieceText proto.

    Holds the n-best tokenizations of one input; `nbests` is a lazy
    sequence view whose elements are ImmutableSentencePieceText objects.
    """

    thisown = property(lambda x: x.this.own(), lambda x, v: x.this.own(v), doc="The membership flag")
    __repr__ = _swig_repr  # rebound to __str__ at the end of the class body

    def __init__(self):
        _sentencepiece.ImmutableNBestSentencePieceText_swiginit(self, _sentencepiece.new_ImmutableNBestSentencePieceText())
    __swig_destroy__ = _sentencepiece.delete_ImmutableNBestSentencePieceText

    # Thin delegations into the C extension.
    def _nbests_size(self):
        return _sentencepiece.ImmutableNBestSentencePieceText__nbests_size(self)

    def _nbests(self, index):
        return _sentencepiece.ImmutableNBestSentencePieceText__nbests(self, index)

    def SerializeAsString(self):
        return _sentencepiece.ImmutableNBestSentencePieceText_SerializeAsString(self)

    class ImmutableSentencePieceTextIterator:
        """Sequence-like, lazy view over the n-best tokenizations."""

        def __init__(self, proto):
            self.proto = proto
            self.len = self.proto._nbests_size()

        def __len__(self):
            return self.len

        def __getitem__(self, index):
            # Slices materialize all nbests first, then slice the list;
            # plain indices support one round of negative wrap-around.
            if isinstance(index, slice):
                return [self.proto._nbests(i) for i in range(self.len)][index.start:index.stop:index.step]
            if index < 0:
                index = index + self.len
            if index < 0 or index >= self.len:
                raise IndexError('nbests index is out of range')
            return self.proto._nbests(index)

        def __str__(self):
            return '\n'.join(['nbests {{\n{}}}'.format(str(x)) for x in self])

        __repr__ = __str__

    @property
    def nbests(self):
        # Fresh iterator view on every access.
        return ImmutableNBestSentencePieceText.ImmutableSentencePieceTextIterator(self)

    def __eq__(self, other):
        # Protos compare equal iff their serialized forms are byte-identical.
        return self.SerializeAsString() == other.SerializeAsString()

    def __hash__(self):
        return hash(self.SerializeAsString())

    def __str__(self):
        return '\n'.join(['nbests {{\n{}}}'.format(str(x)) for x in self.nbests])

    __repr__ = __str__


# Register ImmutableNBestSentencePieceText in _sentencepiece:
_sentencepiece.ImmutableNBestSentencePieceText_swigregister(ImmutableNBestSentencePieceText)
247
class SentencePieceProcessor(object):
    """SWIG proxy for the C++ sentencepiece SentencePieceProcessor.

    Every method in this section is a thin delegation into the
    _sentencepiece C extension. The user-facing Python conveniences
    (Init, Encode, Decode, ...) are attached to this class further down
    in this module.
    """

    thisown = property(lambda x: x.this.own(), lambda x, v: x.this.own(v), doc="The membership flag")
    __repr__ = _swig_repr

    def __init__(self):
        _sentencepiece.SentencePieceProcessor_swiginit(self, _sentencepiece.new_SentencePieceProcessor())
    __swig_destroy__ = _sentencepiece.delete_SentencePieceProcessor

    # --- model loading and vocabulary configuration ---------------------

    def LoadFromSerializedProto(self, serialized):
        return _sentencepiece.SentencePieceProcessor_LoadFromSerializedProto(self, serialized)

    def SetEncodeExtraOptions(self, extra_option):
        return _sentencepiece.SentencePieceProcessor_SetEncodeExtraOptions(self, extra_option)

    def SetDecodeExtraOptions(self, extra_option):
        return _sentencepiece.SentencePieceProcessor_SetDecodeExtraOptions(self, extra_option)

    def SetVocabulary(self, valid_vocab):
        return _sentencepiece.SentencePieceProcessor_SetVocabulary(self, valid_vocab)

    def ResetVocabulary(self):
        return _sentencepiece.SentencePieceProcessor_ResetVocabulary(self)

    def LoadVocabulary(self, filename, threshold):
        return _sentencepiece.SentencePieceProcessor_LoadVocabulary(self, filename, threshold)

    def CalculateEntropy(self, *args):
        return _sentencepiece.SentencePieceProcessor_CalculateEntropy(self, *args)

    # --- vocabulary introspection ---------------------------------------

    def GetPieceSize(self):
        return _sentencepiece.SentencePieceProcessor_GetPieceSize(self)

    def PieceToId(self, piece):
        return _sentencepiece.SentencePieceProcessor_PieceToId(self, piece)

    def IdToPiece(self, id):
        return _sentencepiece.SentencePieceProcessor_IdToPiece(self, id)

    def GetScore(self, id):
        return _sentencepiece.SentencePieceProcessor_GetScore(self, id)

    def IsUnknown(self, id):
        return _sentencepiece.SentencePieceProcessor_IsUnknown(self, id)

    def IsControl(self, id):
        return _sentencepiece.SentencePieceProcessor_IsControl(self, id)

    def IsUnused(self, id):
        return _sentencepiece.SentencePieceProcessor_IsUnused(self, id)

    def IsByte(self, id):
        return _sentencepiece.SentencePieceProcessor_IsByte(self, id)

    def unk_id(self):
        return _sentencepiece.SentencePieceProcessor_unk_id(self)

    def bos_id(self):
        return _sentencepiece.SentencePieceProcessor_bos_id(self)

    def eos_id(self):
        return _sentencepiece.SentencePieceProcessor_eos_id(self)

    def pad_id(self):
        return _sentencepiece.SentencePieceProcessor_pad_id(self)

    def serialized_model_proto(self):
        return _sentencepiece.SentencePieceProcessor_serialized_model_proto(self)

    def LoadFromFile(self, arg):
        return _sentencepiece.SentencePieceProcessor_LoadFromFile(self, arg)

    # --- low-level encode primitives (wrapped by Encode() below) --------

    def _EncodeAsIds(self, text, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece):
        return _sentencepiece.SentencePieceProcessor__EncodeAsIds(self, text, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece)

    def _EncodeAsPieces(self, text, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece):
        return _sentencepiece.SentencePieceProcessor__EncodeAsPieces(self, text, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece)

    def _EncodeAsSerializedProto(self, text, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece):
        return _sentencepiece.SentencePieceProcessor__EncodeAsSerializedProto(self, text, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece)

    def _EncodeAsImmutableProto(self, text, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece):
        return _sentencepiece.SentencePieceProcessor__EncodeAsImmutableProto(self, text, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece)

    # Batch variants take a list of inputs plus a thread count.

    def _EncodeAsIdsBatch(self, ins, num_threads, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece):
        return _sentencepiece.SentencePieceProcessor__EncodeAsIdsBatch(self, ins, num_threads, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece)

    def _EncodeAsPiecesBatch(self, ins, num_threads, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece):
        return _sentencepiece.SentencePieceProcessor__EncodeAsPiecesBatch(self, ins, num_threads, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece)

    def _EncodeAsSerializedProtoBatch(self, ins, num_threads, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece):
        return _sentencepiece.SentencePieceProcessor__EncodeAsSerializedProtoBatch(self, ins, num_threads, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece)

    def _EncodeAsImmutableProtoBatch(self, ins, num_threads, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece):
        return _sentencepiece.SentencePieceProcessor__EncodeAsImmutableProtoBatch(self, ins, num_threads, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece)

    # --- low-level decode primitives ------------------------------------

    def _DecodeIds(self, ids):
        return _sentencepiece.SentencePieceProcessor__DecodeIds(self, ids)

    def _DecodeIdsAsBytes(self, ids):
        return _sentencepiece.SentencePieceProcessor__DecodeIdsAsBytes(self, ids)

    def _DecodePieces(self, pieces):
        return _sentencepiece.SentencePieceProcessor__DecodePieces(self, pieces)

    def _DecodeIdsAsSerializedProto(self, ids):
        return _sentencepiece.SentencePieceProcessor__DecodeIdsAsSerializedProto(self, ids)

    def _DecodePiecesAsSerializedProto(self, pieces):
        return _sentencepiece.SentencePieceProcessor__DecodePiecesAsSerializedProto(self, pieces)

    def _DecodeIdsAsImmutableProto(self, ids):
        return _sentencepiece.SentencePieceProcessor__DecodeIdsAsImmutableProto(self, ids)

    def _DecodePiecesAsImmutableProto(self, pieces):
        return _sentencepiece.SentencePieceProcessor__DecodePiecesAsImmutableProto(self, pieces)

    def _DecodeIdsBatch(self, ins, num_threads):
        return _sentencepiece.SentencePieceProcessor__DecodeIdsBatch(self, ins, num_threads)

    def _DecodeIdsAsBytesBatch(self, ins, num_threads):
        return _sentencepiece.SentencePieceProcessor__DecodeIdsAsBytesBatch(self, ins, num_threads)

    def _DecodeIdsAsSerializedProtoBatch(self, ins, num_threads):
        return _sentencepiece.SentencePieceProcessor__DecodeIdsAsSerializedProtoBatch(self, ins, num_threads)

    def _DecodeIdsAsImmutableProtoBatch(self, ins, num_threads):
        return _sentencepiece.SentencePieceProcessor__DecodeIdsAsImmutableProtoBatch(self, ins, num_threads)

    def _DecodePiecesBatch(self, ins, num_threads):
        return _sentencepiece.SentencePieceProcessor__DecodePiecesBatch(self, ins, num_threads)

    def _DecodePiecesAsSerializedProtoBatch(self, ins, num_threads):
        return _sentencepiece.SentencePieceProcessor__DecodePiecesAsSerializedProtoBatch(self, ins, num_threads)

    def _DecodePiecesAsImmutableProtoBatch(self, ins, num_threads):
        return _sentencepiece.SentencePieceProcessor__DecodePiecesAsImmutableProtoBatch(self, ins, num_threads)

    # --- low-level n-best encode primitives (wrapped by NBestEncode()) --

    def _NBestEncodeAsIds(self, text, nbest_size, add_bos, add_eos, reverse, emit_unk_piece):
        return _sentencepiece.SentencePieceProcessor__NBestEncodeAsIds(self, text, nbest_size, add_bos, add_eos, reverse, emit_unk_piece)

    def _NBestEncodeAsPieces(self, text, nbest_size, add_bos, add_eos, reverse, emit_unk_piece):
        return _sentencepiece.SentencePieceProcessor__NBestEncodeAsPieces(self, text, nbest_size, add_bos, add_eos, reverse, emit_unk_piece)

    def _NBestEncodeAsSerializedProto(self, text, nbest_size, add_bos, add_eos, reverse, emit_unk_piece):
        return _sentencepiece.SentencePieceProcessor__NBestEncodeAsSerializedProto(self, text, nbest_size, add_bos, add_eos, reverse, emit_unk_piece)

    def _NBestEncodeAsImmutableProto(self, text, nbest_size, add_bos, add_eos, reverse, emit_unk_piece):
        return _sentencepiece.SentencePieceProcessor__NBestEncodeAsImmutableProto(self, text, nbest_size, add_bos, add_eos, reverse, emit_unk_piece)

    # --- low-level sample-and-score primitives (wrapped by SampleEncodeAndScore()) --

    def _SampleEncodeAndScoreAsIds(self, text, num_samples, alpha, wor, include_best, add_bos, add_eos, reverse, emit_unk_piece):
        return _sentencepiece.SentencePieceProcessor__SampleEncodeAndScoreAsIds(self, text, num_samples, alpha, wor, include_best, add_bos, add_eos, reverse, emit_unk_piece)

    def _SampleEncodeAndScoreAsPieces(self, text, num_samples, alpha, wor, include_best, add_bos, add_eos, reverse, emit_unk_piece):
        return _sentencepiece.SentencePieceProcessor__SampleEncodeAndScoreAsPieces(self, text, num_samples, alpha, wor, include_best, add_bos, add_eos, reverse, emit_unk_piece)

    def _SampleEncodeAndScoreAsSerializedProto(self, text, num_samples, alpha, wor, include_best, add_bos, add_eos, reverse, emit_unk_piece):
        return _sentencepiece.SentencePieceProcessor__SampleEncodeAndScoreAsSerializedProto(self, text, num_samples, alpha, wor, include_best, add_bos, add_eos, reverse, emit_unk_piece)

    def _SampleEncodeAndScoreAsImmutableProto(self, text, num_samples, alpha, wor, include_best, add_bos, add_eos, reverse, emit_unk_piece):
        return _sentencepiece.SentencePieceProcessor__SampleEncodeAndScoreAsImmutableProto(self, text, num_samples, alpha, wor, include_best, add_bos, add_eos, reverse, emit_unk_piece)

    # --- normalization and entropy --------------------------------------

    def _Normalize(self, text):
        return _sentencepiece.SentencePieceProcessor__Normalize(self, text)

    def _NormalizeWithOffsets(self, text):
        return _sentencepiece.SentencePieceProcessor__NormalizeWithOffsets(self, text)

    def _CalculateEntropy(self, text, alpha):
        return _sentencepiece.SentencePieceProcessor__CalculateEntropy(self, text, alpha)

    def _CalculateEntropyBatch(self, ins, alpha, num_threads):
        return _sentencepiece.SentencePieceProcessor__CalculateEntropyBatch(self, ins, alpha, num_threads)

    def _OverrideNormalizerSpec(self, args):
        return _sentencepiece.SentencePieceProcessor__OverrideNormalizerSpec(self, args)
422
+
423
  def Init(self,
           model_file=None,
           model_proto=None,
           out_type=int,
           add_bos=False,
           add_eos=False,
           reverse=False,
           emit_unk_piece=False,
           enable_sampling=False,
           nbest_size=-1,
           alpha=0.1,
           num_threads=-1):
    """Initialize SentencePieceProcessor.

    Stores the per-instance encoding defaults and, when a model is
    supplied, loads it immediately.

    Args:
      model_file: The sentencepiece model file path.
      model_proto: The sentencepiece model serialized proto.
      out_type: output type. int or str.
      add_bos: Add <s> to the result (Default = false)
      add_eos: Add </s> to the result (Default = false) <s>/</s> is added after
        reversing (if enabled).
      reverse: Reverses the tokenized sequence (Default = false)
      emit_unk_piece: Emits the unk literal string (Default = false)
      nbest_size: sampling parameters for unigram. Invalid in BPE-Dropout.
        nbest_size = {0,1}: No sampling is performed.
        nbest_size > 1: samples from the nbest_size results.
        nbest_size < 0: assuming that nbest_size is infinite and samples
          from the all hypothesis (lattice) using
          forward-filtering-and-backward-sampling algorithm.
      alpha: Smoothing parameter for unigram sampling, and dropout probability of
        merge operations for BPE-dropout.
      num_threads: number of threads in batch processing (Default = -1, auto-detected)
    """

    # Run the native (SWIG) constructor first, then record the Python-side
    # defaults consulted by Encode()/NBestEncode()/SampleEncodeAndScore().
    _sentencepiece_processor_init_native(self)
    self._out_type = out_type
    self._add_bos = add_bos
    self._add_eos = add_eos
    self._reverse = reverse
    self._emit_unk_piece = emit_unk_piece
    self._enable_sampling = enable_sampling
    self._nbest_size = nbest_size
    self._alpha = alpha
    self._num_threads = num_threads
    # Loading is optional at construction time; Load() can be called later.
    if model_file or model_proto:
      self.Load(model_file=model_file, model_proto=model_proto)
469
+
470
+
471
+ def Encode(self,
472
+ input,
473
+ out_type=None,
474
+ add_bos=None,
475
+ add_eos=None,
476
+ reverse=None,
477
+ emit_unk_piece=None,
478
+ enable_sampling=None,
479
+ nbest_size=None,
480
+ alpha=None,
481
+ num_threads=None):
482
+ """Encode text input to segmented ids or tokens.
483
+
484
+ Args:
485
+ input: input string. accepsts list of string.
486
+ out_type: output type. int or str.
487
+ add_bos: Add <s> to the result (Default = false)
488
+ add_eos: Add </s> to the result (Default = false) <s>/</s> is added after
489
+ reversing (if enabled).
490
+ reverse: Reverses the tokenized sequence (Default = false)
491
+ emit_unk_piece: Emits the unk literal string (Default = false)
492
+ nbest_size: sampling parameters for unigram. Invalid in BPE-Dropout.
493
+ nbest_size = {0,1}: No sampling is performed.
494
+ nbest_size > 1: samples from the nbest_size results.
495
+ nbest_size < 0: assuming that nbest_size is infinite and samples
496
+ from the all hypothesis (lattice) using
497
+ forward-filtering-and-backward-sampling algorithm.
498
+ alpha: Soothing parameter for unigram sampling, and merge probability for
499
+ BPE-dropout (probablity 'p' in BPE-dropout paper).
500
+ num_threads: the number of threads used in the batch processing (Default = -1).
501
+ """
502
+
503
+ if out_type is None:
504
+ out_type = self._out_type
505
+ if add_bos is None:
506
+ add_bos = self._add_bos
507
+ if add_eos is None:
508
+ add_eos = self._add_eos
509
+ if reverse is None:
510
+ reverse = self._reverse
511
+ if emit_unk_piece is None:
512
+ emit_unk_piece = self._emit_unk_piece
513
+ if enable_sampling is None:
514
+ enable_sampling = self._enable_sampling
515
+ if nbest_size is None:
516
+ nbest_size = self._nbest_size
517
+ if alpha is None:
518
+ alpha = self._alpha
519
+ if num_threads is None:
520
+ num_threads = self._num_threads
521
+
522
+ if enable_sampling == True and (nbest_size is None or nbest_size == 0 or
523
+ nbest_size == 1 or alpha is None):
524
+ raise RuntimeError(
525
+ 'When enable_sampling is True, We must specify "nbest_size > 1" or "nbest_size = -1", '
526
+ 'and "alpha". "nbest_size" is enabled only on unigram mode ignored in BPE-dropout. '
527
+ 'when "nbest_size = -1" , this method samples from all candidates on the lattice '
528
+ 'instead of nbest segmentations.'
529
+ )
530
+
531
+ if num_threads is None or type(num_threads) is not int:
532
+ raise RuntimeError('num_threads must be int')
533
+
534
+ if type(input) is list:
535
+ if out_type is int:
536
+ return self._EncodeAsIdsBatch(input, num_threads, enable_sampling, nbest_size,
537
+ alpha, add_bos, add_eos, reverse, emit_unk_piece)
538
+ if out_type is str:
539
+ return self._EncodeAsPiecesBatch(input, num_threads, enable_sampling, nbest_size,
540
+ alpha, add_bos, add_eos, reverse, emit_unk_piece)
541
+ if out_type == 'serialized_proto' or out_type == 'proto':
542
+ return self._EncodeAsSerializedProtoBatch(input, num_threads, enable_sampling, nbest_size,
543
+ alpha, add_bos, add_eos, reverse, emit_unk_piece)
544
+ if out_type == 'immutable_proto':
545
+ return self._EncodeAsImmutableProtoBatch(input, num_threads, enable_sampling, nbest_size,
546
+ alpha, add_bos, add_eos, reverse, emit_unk_piece)
547
+
548
+ if out_type is int:
549
+ return self._EncodeAsIds(input, enable_sampling, nbest_size,
550
+ alpha, add_bos, add_eos, reverse, emit_unk_piece)
551
+ if out_type is str:
552
+ return self._EncodeAsPieces(input, enable_sampling, nbest_size,
553
+ alpha, add_bos, add_eos, reverse, emit_unk_piece)
554
+ if out_type == 'serialized_proto' or out_type == 'proto':
555
+ return self._EncodeAsSerializedProto(input, enable_sampling, nbest_size,
556
+ alpha, add_bos, add_eos, reverse, emit_unk_piece)
557
+ if out_type == 'immutable_proto':
558
+ return self._EncodeAsImmutableProto(input, enable_sampling, nbest_size,
559
+ alpha, add_bos, add_eos, reverse, emit_unk_piece)
560
+
561
+ raise RuntimeError('unknown out_type={}'.format(out_type))
562
+ return None
563
+
564
+
565
  def EncodeAsPieces(self, input, **kwargs):
    """Encode *input* into subword piece strings (out_type=str)."""
    return self.Encode(input=input, out_type=str, **kwargs)


  def EncodeAsIds(self, input, **kwargs):
    """Encode *input* into vocabulary ids (out_type=int)."""
    return self.Encode(input=input, out_type=int, **kwargs)


  def EncodeAsSerializedProto(self, input, **kwargs):
    """Encode *input* into a serialized SentencePieceText proto."""
    return self.Encode(input=input, out_type='serialized_proto', **kwargs)


  def EncodeAsImmutableProto(self, input, **kwargs):
    """Encode *input* into an ImmutableSentencePieceText."""
    return self.Encode(input=input, out_type='immutable_proto', **kwargs)


  def SampleEncodeAsPieces(self, input, nbest_size=None, alpha=None, **kwargs):
    """Sampling variant of EncodeAsPieces (forces enable_sampling=True)."""
    return self.Encode(input=input, nbest_size=nbest_size, alpha=alpha,
                       out_type=str, enable_sampling=True, **kwargs)


  def SampleEncodeAsIds(self, input, nbest_size=None, alpha=None,**kwargs):
    """Sampling variant of EncodeAsIds (forces enable_sampling=True)."""
    return self.Encode(input=input, nbest_size=nbest_size, alpha=alpha,
                       out_type=int, enable_sampling=True, **kwargs)


  def SampleEncodeAsSerializedProto(self, input, nbest_size=None, alpha=None, **kwargs):
    """Sampling variant of EncodeAsSerializedProto (forces enable_sampling=True)."""
    return self.Encode(input=input, nbest_size=nbest_size, alpha=alpha,
                       out_type='serialized_proto', enable_sampling=True, **kwargs)


  def SampleEncodeAsImmutableProto(self, input, nbest_size=None, alpha=None, **kwargs):
    """Sampling variant of EncodeAsImmutableProto (forces enable_sampling=True)."""
    return self.Encode(input=input, nbest_size=nbest_size, alpha=alpha,
                       out_type='immutable_proto', enable_sampling=True, **kwargs)
599
+
600
+
601
  def NBestEncode(self,
                  input,
                  out_type=None,
                  add_bos=None,
                  add_eos=None,
                  reverse=None,
                  emit_unk_piece=None,
                  nbest_size=None):
    """NBestEncode text input to segmented ids or tokens.

    Args:
      input: input string. accepts list of string.
      out_type: output type. int or str.
      add_bos: Add <s> to the result (Default = false)
      add_eos: Add </s> to the result (Default = false) <s>/</s> is added after reversing (if enabled).
      reverse: Reverses the tokenized sequence (Default = false)
      emit_unk_piece: Emits the unk literal string (Default = false)
      nbest_size: nbest size
    """

    # Fall back to the per-instance defaults configured in Init().
    if out_type is None:
      out_type = self._out_type
    if add_bos is None:
      add_bos = self._add_bos
    if add_eos is None:
      add_eos = self._add_eos
    if reverse is None:
      reverse = self._reverse
    if emit_unk_piece is None:
      emit_unk_piece = self._emit_unk_piece
    if nbest_size is None:
      nbest_size = self._nbest_size

    # Non-positive nbest sizes degenerate to the single best tokenization.
    if nbest_size <= 0:
      nbest_size=1

    def _encode(text):
      # Dispatch on the requested output representation.
      if out_type is int:
        return self._NBestEncodeAsIds(text, nbest_size,
                                      add_bos, add_eos, reverse, emit_unk_piece)
      if out_type is str:
        return self._NBestEncodeAsPieces(text, nbest_size,
                                         add_bos, add_eos, reverse, emit_unk_piece)
      if out_type == 'serialized_proto' or out_type == 'proto':
        return self._NBestEncodeAsSerializedProto(text, nbest_size,
                                                  add_bos, add_eos, reverse, emit_unk_piece)
      if out_type == 'immutable_proto':
        return self._NBestEncodeAsImmutableProto(text, nbest_size,
                                                 add_bos, add_eos, reverse, emit_unk_piece)

      raise RuntimeError('unknown out_type')

    # Lists are encoded element-wise (there is no batch API for nbest).
    if type(input) is list:
      return [_encode(n) for n in input]

    return _encode(input)
657
+
658
+
659
  def NBestEncodeAsPieces(self, input, nbest_size=None, **kwargs):
    """NBestEncode returning piece strings (out_type=str)."""
    return self.NBestEncode(input=input, nbest_size=nbest_size,
                            out_type=str, **kwargs)


  def NBestEncodeAsIds(self, input, nbest_size=None, **kwargs):
    """NBestEncode returning vocabulary ids (out_type=int)."""
    return self.NBestEncode(input=input, nbest_size=nbest_size,
                            out_type=int, **kwargs)


  def NBestEncodeAsSerializedProto(self, input, nbest_size=None, **kwargs):
    """NBestEncode returning a serialized NBestSentencePieceText proto."""
    return self.NBestEncode(input=input, nbest_size=nbest_size,
                            out_type='serialized_proto', **kwargs)


  def NBestEncodeAsImmutableProto(self, input, nbest_size=None, **kwargs):
    """NBestEncode returning an ImmutableNBestSentencePieceText."""
    return self.NBestEncode(input=input, nbest_size=nbest_size,
                            out_type='immutable_proto', **kwargs)
677
+
678
+
679
def SampleEncodeAndScore(self,
                         input,
                         out_type=None,
                         add_bos=None,
                         add_eos=None,
                         reverse=None,
                         emit_unk_piece=None,
                         num_samples=None,
                         alpha=None,
                         wor=None,
                         include_best=None):
  """SampleEncodeAndScore text input to segmented ids or tokens.

  Args:
    input: input string. accepts list of string.
    out_type: output type. int or str or 'serialized_proto' or 'immutable_proto'
    add_bos: Add <s> to the result (Default = false)
    add_eos: Add </s> to the result (Default = false) <s>/</s> is added after
      reversing (if enabled).
    reverse: Reverses the tokenized sequence (Default = false)
    emit_unk_piece: Emits the unk literal string (Default = false)
    num_samples: How many samples to return (Default = 1)
    alpha: inverse temperature for sampling
    wor: whether to sample without replacement (Default = false)
    include_best: whether to include the best tokenization, requires wor=True
      (Default = false)

  Raises:
    RuntimeError: if num_samples is not positive, if include_best is requested
      without wor=True, or if out_type is not one of the supported values.
  """
  # Fall back to the processor-level defaults configured at Init() time.
  if out_type is None:
    out_type = self._out_type
  if add_bos is None:
    add_bos = self._add_bos
  if add_eos is None:
    add_eos = self._add_eos
  if reverse is None:
    reverse = self._reverse
  if emit_unk_piece is None:
    emit_unk_piece = self._emit_unk_piece
  if num_samples is None:
    num_samples = 1
  if alpha is None:
    alpha = 1.
  if wor is None:
    wor = False
  if include_best is None:
    include_best = False

  if num_samples <= 0:
    # Fixed message: the parameter is `num_samples` (was 'num_examples').
    raise RuntimeError('num_samples must be positive')

  if include_best and not wor:
    raise RuntimeError('When include_best is True, We must specify "wor = True".')

  def _encode(text):
    # Dispatch to the native sampler matching the requested output type.
    if out_type is int:
      return self._SampleEncodeAndScoreAsIds(text, num_samples, alpha, wor, include_best,
                                             add_bos, add_eos, reverse, emit_unk_piece)
    if out_type is str:
      return self._SampleEncodeAndScoreAsPieces(text, num_samples, alpha, wor, include_best,
                                                add_bos, add_eos, reverse, emit_unk_piece)
    if out_type == 'serialized_proto' or out_type == 'proto':
      return self._SampleEncodeAndScoreAsSerializedProto(text, num_samples, alpha, wor, include_best,
                                                         add_bos, add_eos, reverse, emit_unk_piece)
    if out_type == 'immutable_proto':
      return self._SampleEncodeAndScoreAsImmutableProto(text, num_samples, alpha, wor, include_best,
                                                        add_bos, add_eos, reverse, emit_unk_piece)
    raise RuntimeError('unknown output type')

  # A list input is encoded element-wise.
  if type(input) is list:
    return [_encode(n) for n in input]

  return _encode(input)
754
+
755
+
756
def SampleEncodeAndScoreAsPieces(self, input, num_samples=None, alpha=None, **kwargs):
  """Sample segmentations with scores, returned as surface pieces (str)."""
  return self.SampleEncodeAndScore(
      input=input, num_samples=num_samples, alpha=alpha, out_type=str, **kwargs)
759
+
760
+
761
def SampleEncodeAndScoreAsIds(self, input, num_samples=None, alpha=None, **kwargs):
  """Sample segmentations with scores, returned as vocabulary ids (int)."""
  return self.SampleEncodeAndScore(
      input=input, num_samples=num_samples, alpha=alpha, out_type=int, **kwargs)
764
+
765
+
766
def SampleEncodeAndScoreAsSerializedProto(self, input, num_samples=None, alpha=None, **kwargs):
  """Sample segmentations with scores, serialized as a proto byte string."""
  return self.SampleEncodeAndScore(
      input=input, num_samples=num_samples, alpha=alpha,
      out_type='serialized_proto', **kwargs)
769
+
770
+
771
def SampleEncodeAndScoreAsImmutableProto(self, input, num_samples=None, alpha=None, **kwargs):
  """Sample segmentations with scores, returned as an immutable proto object."""
  return self.SampleEncodeAndScore(
      input=input, num_samples=num_samples, alpha=alpha,
      out_type='immutable_proto', **kwargs)
774
+
775
+
776
def Decode(self, input, out_type=str, num_threads=None):
  """Decode processed id or token sequences.

  Accepts a scalar id/piece, a flat list of ids or pieces, or a nested list
  (a batch of sequences); batch inputs are decoded in parallel.

  Args:
    input: int id, str piece, list of ids/pieces, or list of lists of them.
    out_type: output type. str, bytes or 'serialized_proto' or
      'immutable_proto' (Default = str)
    num_threads: the number of threads used in the batch processing
      (Default = -1, meaning "use hardware concurrency").

  Raises:
    RuntimeError: if num_threads is not an int, or the (out_type, input)
      combination is not supported.
  """
  if num_threads is None:
    num_threads = self._num_threads

  if num_threads is None or type(num_threads) is not int:
    raise RuntimeError('num_threads must be int')

  # Empty input (including an empty list) decodes to the empty string.
  if not input:
    return ''

  _missing = object()  # sentinel: "no handler matched"

  def _route(single_ids, single_pieces, batch_ids, batch_pieces):
    # Shared type dispatch: scalar id/piece, flat list, or nested batch.
    # The element type of the first item decides ids vs. pieces, mirroring
    # the original per-out_type branch ladder.
    if type(input) is int:
      return single_ids([input])
    if type(input) is str:
      return single_pieces([input])
    if type(input) is list:
      if len(input) == 0 or type(input[0]) is int:
        return single_ids(input)
      if type(input[0]) is str:
        return single_pieces(input)
      if type(input[0]) is list:
        if len(input[0]) == 0 or type(input[0][0]) is int:
          return batch_ids(input, num_threads)
        if type(input[0][0]) is str:
          return batch_pieces(input, num_threads)
    return _missing

  if out_type is str:
    result = _route(self._DecodeIds, self._DecodePieces,
                    self._DecodeIdsBatch, self._DecodePiecesBatch)
  elif out_type is bytes:
    # Note: pieces are already str, so only id decoding has a bytes variant.
    result = _route(self._DecodeIdsAsBytes, self._DecodePieces,
                    self._DecodeIdsAsBytesBatch, self._DecodePiecesBatch)
  elif out_type == 'serialized_proto':
    result = _route(self._DecodeIdsAsSerializedProto,
                    self._DecodePiecesAsSerializedProto,
                    self._DecodeIdsAsSerializedProtoBatch,
                    self._DecodePiecesAsSerializedProtoBatch)
  elif out_type == 'immutable_proto':
    result = _route(self._DecodeIdsAsImmutableProto,
                    self._DecodePiecesAsImmutableProto,
                    self._DecodeIdsAsImmutableProtoBatch,
                    self._DecodePiecesAsImmutableProtoBatch)
  else:
    result = _missing

  if result is _missing:
    # (The original had an unreachable `return None` after this raise.)
    raise RuntimeError('unknown output or input type')
  return result
869
+
870
+
871
def DecodePieces(self, input, out_type=str, **kwargs):
  """Backward-compatible alias of Decode() (str output by default)."""
  return self.Decode(input=input, out_type=out_type, **kwargs)
873
+
874
+
875
def DecodeIds(self, input, out_type=str, **kwargs):
  """Backward-compatible alias of Decode() (str output by default)."""
  return self.Decode(input=input, out_type=out_type, **kwargs)
877
+
878
+
879
def DecodePiecesAsSerializedProto(self, input, out_type='serialized_proto', **kwargs):
  """Backward-compatible alias of Decode() with serialized-proto output."""
  return self.Decode(input=input, out_type=out_type, **kwargs)
881
+
882
+
883
def DecodeIdsAsSerializedProto(self, input, out_type='serialized_proto', **kwargs):
  """Backward-compatible alias of Decode() with serialized-proto output."""
  return self.Decode(input=input, out_type=out_type, **kwargs)
885
+
886
+
887
def DecodePiecesAsImmutableProto(self, input, out_type='immutable_proto', **kwargs):
  """Backward-compatible alias of Decode() with immutable-proto output."""
  return self.Decode(input=input, out_type=out_type, **kwargs)
889
+
890
+
891
def DecodeIdsAsImmutableProto(self, input, out_type='immutable_proto', **kwargs):
  """Backward-compatible alias of Decode() with immutable-proto output."""
  return self.Decode(input=input, out_type=out_type, **kwargs)
893
+
894
+
895
def CalculateEntropy(self, input, alpha, num_threads=None):
  """Calculate sentence entropy; a list input is processed as a batch."""
  if type(input) is not list:
    return self._CalculateEntropy(input, alpha)
  if num_threads is None:
    num_threads = self._num_threads
  if num_threads is None or type(num_threads) is not int:
    raise RuntimeError('num_threads must be int')
  return self._CalculateEntropyBatch(input, alpha, num_threads)
905
+
906
+
907
def Normalize(self, input, with_offsets=None):
  """Normalize text; with a truthy `with_offsets`, also return offsets."""
  run = self._NormalizeWithOffsets if with_offsets else self._Normalize
  if type(input) is list:
    return [run(item) for item in input]
  return run(input)
916
+
917
def OverrideNormalizerSpec(self, **kwargs):
  """Override normalizer-spec fields; values are stringified for the native layer."""
  return self._OverrideNormalizerSpec(
      {key: str(value) for key, value in kwargs.items()})
922
+
923
+
924
def piece_size(self):
  """Return the number of pieces in the vocabulary."""
  return self.GetPieceSize()
926
+
927
+
928
def vocab_size(self):
  """Alias of piece_size(): the vocabulary size."""
  return self.GetPieceSize()
930
+
931
+
932
def __getstate__(self):
  """Pickle support: the state is the serialized model proto."""
  return self.serialized_model_proto()
934
+
935
+
936
def __setstate__(self, serialized_model_proto):
  """Pickle support: re-initialize and reload from the serialized proto."""
  self.__init__()
  self.LoadFromSerializedProto(serialized_model_proto)
939
+
940
+
941
def __len__(self):
  """len(sp) is the vocabulary size."""
  return self.GetPieceSize()
943
+
944
+
945
def __getitem__(self, piece):
  """sp[piece] is shorthand for PieceToId(piece)."""
  return self.PieceToId(piece)
947
+
948
+
949
def Load(self, model_file=None, model_proto=None):
  """Override SentencePieceProcessor.Load to support file or proto input.

  Args:
    model_file: The sentencepiece model file path.
    model_proto: The sentencepiece model serialized proto. Either `model_file`
      or `model_proto` must be set.

  Raises:
    RuntimeError: if both `model_file` and `model_proto` are given.
  """
  if model_proto:
    if model_file:
      raise RuntimeError('model_file and model_proto must be exclusive.')
    return self.LoadFromSerializedProto(model_proto)
  return self.LoadFromFile(model_file)
962
+
963
+
964
+ # Register SentencePieceProcessor in _sentencepiece:
965
+ _sentencepiece.SentencePieceProcessor_swigregister(SentencePieceProcessor)
966
+
967
def SetRandomGeneratorSeed(seed):
  """Seed the native random generator used by the sampling encoders."""
  return _sentencepiece.SetRandomGeneratorSeed(seed)
969
+
970
def SetMinLogLevel(v):
  """Set the minimum severity of native C++ log messages that are emitted."""
  return _sentencepiece.SetMinLogLevel(v)
972
class SentencePieceTrainer(object):
  """Static interface to the native SentencePiece trainer.

  Not instantiable; use the static Train() entry point, which accepts either
  a legacy flag string or keyword arguments.
  """

  thisown = property(lambda x: x.this.own(), lambda x, v: x.this.own(v), doc="The membership flag")

  def __init__(self, *args, **kwargs):
    raise AttributeError("No constructor defined")
  __repr__ = _swig_repr

  @staticmethod
  def _TrainFromString(arg):
    return _sentencepiece.SentencePieceTrainer__TrainFromString(arg)

  @staticmethod
  def _TrainFromMap(args):
    return _sentencepiece.SentencePieceTrainer__TrainFromMap(args)

  @staticmethod
  def _TrainFromMap2(args, iter):
    return _sentencepiece.SentencePieceTrainer__TrainFromMap2(args, iter)

  @staticmethod
  def _TrainFromMap3(args):
    return _sentencepiece.SentencePieceTrainer__TrainFromMap3(args)

  @staticmethod
  def _TrainFromMap4(args, iter):
    return _sentencepiece.SentencePieceTrainer__TrainFromMap4(args, iter)

  @staticmethod
  def _Train(arg=None, **kwargs):
    """Train Sentencepiece model. Accept both kwargs and legacy string arg."""
    if arg is not None and type(arg) is str:
      return SentencePieceTrainer._TrainFromString(arg)

    def _encode(value):
      """Encode a flag value to its CSV string form.

      Lists are CSV-escaped so that elements containing commas survive the
      flag-string round trip.  (The dead Python 2 BytesIO branch was removed:
      this package requires Python 3 — it uses importlib.resources.)
      """
      if type(value) is not list:
        return str(value)
      f = StringIO()
      writer = csv.writer(f, lineterminator='')
      writer.writerow([str(v) for v in value])
      return f.getvalue()

    # Split the special control kwargs from the flags forwarded natively.
    sentence_iterator = None
    model_writer = None
    new_kwargs = {}
    for key, value in kwargs.items():
      if key in ['sentence_iterator', 'sentence_reader']:
        sentence_iterator = value
      elif key in ['model_writer']:
        model_writer = value
      else:
        new_kwargs[key] = _encode(value)

    if model_writer:
      if sentence_iterator:
        model_proto = SentencePieceTrainer._TrainFromMap4(new_kwargs,
                                                          sentence_iterator)
      else:
        model_proto = SentencePieceTrainer._TrainFromMap3(new_kwargs)
      model_writer.write(model_proto)
    else:
      if sentence_iterator:
        return SentencePieceTrainer._TrainFromMap2(new_kwargs, sentence_iterator)
      else:
        return SentencePieceTrainer._TrainFromMap(new_kwargs)

    return None

  @staticmethod
  def Train(arg=None, logstream=None, **kwargs):
    """Train a model, optionally redirecting native C++ logging to `logstream`."""
    with _LogStream(ostream=logstream):
      SentencePieceTrainer._Train(arg=arg, **kwargs)
1048
+
1049
+
1050
+ # Register SentencePieceTrainer in _sentencepiece:
1051
+ _sentencepiece.SentencePieceTrainer_swigregister(SentencePieceTrainer)
1052
class SentencePieceNormalizer(object):
  """Python wrapper around the native sentencepiece text normalizer."""

  thisown = property(lambda x: x.this.own(), lambda x, v: x.this.own(v), doc="The membership flag")
  __repr__ = _swig_repr

  def __init__(self):
    _sentencepiece.SentencePieceNormalizer_swiginit(self, _sentencepiece.new_SentencePieceNormalizer())
  __swig_destroy__ = _sentencepiece.delete_SentencePieceNormalizer

  def LoadFromSerializedProto(self, serialized):
    return _sentencepiece.SentencePieceNormalizer_LoadFromSerializedProto(self, serialized)

  def LoadFromRuleTSV(self, filename):
    return _sentencepiece.SentencePieceNormalizer_LoadFromRuleTSV(self, filename)

  def LoadFromRuleName(self, name):
    return _sentencepiece.SentencePieceNormalizer_LoadFromRuleName(self, name)

  def serialized_model_proto(self):
    return _sentencepiece.SentencePieceNormalizer_serialized_model_proto(self)

  def LoadFromFile(self, arg):
    return _sentencepiece.SentencePieceNormalizer_LoadFromFile(self, arg)

  def _Normalize(self, text):
    return _sentencepiece.SentencePieceNormalizer__Normalize(self, text)

  def _NormalizeWithOffsets(self, text):
    return _sentencepiece.SentencePieceNormalizer__NormalizeWithOffsets(self, text)

  def _SetProtoField(self, name, value):
    return _sentencepiece.SentencePieceNormalizer__SetProtoField(self, name, value)

  def Init(self,
           model_file=None,
           model_proto=None,
           rule_tsv=None,
           rule_name=None,
           add_dummy_prefix=False,
           escape_whitespaces=False,
           remove_extra_whitespaces=False):
    """Initialize SentencePieceNormalizer.

    Exactly one of model_file / model_proto / rule_tsv / rule_name must be
    given (checked in that priority order).

    Args:
      model_file: The sentencepiece model file path.
      model_proto: The sentencepiece model serialized proto.
      rule_tsv: The normalization rule file in TSV format.
      rule_name: Pre-defined normalization name.
      add_dummy_prefix: add dummy prefix.
      escape_whitespaces: escape whitespaces.
      remove_extra_whitespaces: remove extra whitespaces.
    """
    _sentencepiece_normalizer_init_native(self)

    if model_file:
      loaded = self.LoadFromFile(model_file)
    elif model_proto:
      loaded = self.LoadFromSerializedProto(model_proto)
    elif rule_tsv:
      loaded = self.LoadFromRuleTSV(rule_tsv)
    elif rule_name:
      loaded = self.LoadFromRuleName(rule_name)
    else:
      raise RuntimeError('no model is specified')

    # Only apply the spec overrides once a model was actually loaded.
    if loaded:
      self._SetProtoField('add_dummy_prefix', add_dummy_prefix)
      self._SetProtoField('escape_whitespaces', escape_whitespaces)
      self._SetProtoField('remove_extra_whitespaces', remove_extra_whitespaces)

  def Normalize(self, input, with_offsets=None):
    """Normalize text; with a truthy `with_offsets`, also return offsets."""
    run = self._NormalizeWithOffsets if with_offsets else self._Normalize
    if type(input) is list:
      return [run(item) for item in input]
    return run(input)

  def __getstate__(self):
    """Pickle support: the state is the serialized model proto."""
    return self.serialized_model_proto()

  def __setstate__(self, serialized_model_proto):
    """Pickle support: re-initialize and reload from the serialized proto."""
    self.__init__()
    self.LoadFromSerializedProto(serialized_model_proto)
1140
+
1141
+
1142
+ # Register SentencePieceNormalizer in _sentencepiece:
1143
+ _sentencepiece.SentencePieceNormalizer_swigregister(SentencePieceNormalizer)
1144
+
1145
def SetDataDir(data_dir):
  """Tell the native library where the bundled normalization data lives."""
  return _sentencepiece.SetDataDir(data_dir)
1147
+
1148
+
1149
+ import re
1150
+ import csv
1151
+ import sys
1152
+ import os
1153
+ import importlib.resources
1154
+ from io import StringIO
1155
+ from io import BytesIO
1156
+
1157
+
1158
def _add_snake_case(classname):
  """Attach snake_case aliases for every CamelCase attribute of `classname`."""
  aliases = {}
  for attr_name, attr_value in classname.__dict__.items():
    if re.match(r'^[A-Z]+', attr_name):
      # FooBar -> foo_bar; NBest is special-cased to read as one word.
      snake_name = re.sub(r'(?<!^)(?=[A-Z])', '_',
                          attr_name).lower().replace('n_best', 'nbest')
      aliases[snake_name] = attr_value
  for alias, value in aliases.items():
    setattr(classname, alias, value)
1169
+
1170
+
1171
def _batchnize(classname, name):
  """Replace `classname.name` with a wrapper that also accepts a list arg."""
  original = getattr(classname, name, None)

  def _checked_call(obj, item):
    # Reject out-of-range integer ids before touching the native layer.
    if type(item) is int and not (0 <= item < obj.piece_size()):
      raise IndexError('piece id is out of range.')
    return original(obj, item)

  def _dispatch(self, arg):
    if type(arg) is list:
      return [_checked_call(self, item) for item in arg]
    return _checked_call(self, arg)

  setattr(classname, name, _dispatch)
1186
+
1187
+
1188
# Preserve the native SWIG constructors, then swap in the keyword-rich Init()
# entry points as __init__ so `SentencePieceProcessor(model_file=...)` works.
_sentencepiece_processor_init_native = SentencePieceProcessor.__init__
_sentencepiece_normalizer_init_native = SentencePieceNormalizer.__init__
setattr(SentencePieceProcessor, '__init__', SentencePieceProcessor.Init)
setattr(SentencePieceNormalizer, '__init__', SentencePieceNormalizer.Init)

# Common tokenizer-style aliases.
SentencePieceProcessor.Tokenize = SentencePieceProcessor.Encode
SentencePieceProcessor.Detokenize = SentencePieceProcessor.Decode

# Wrap the scalar piece/id accessors so they transparently accept lists.
for m in [
    'PieceToId', 'IdToPiece', 'GetScore', 'IsUnknown', 'IsControl', 'IsUnused',
    'IsByte'
]:
  _batchnize(SentencePieceProcessor, m)

# Expose snake_case aliases for every CamelCase public method/function.
_add_snake_case(SentencePieceProcessor)
_add_snake_case(SentencePieceTrainer)
_add_snake_case(SentencePieceNormalizer)
set_random_generator_seed = SetRandomGeneratorSeed
set_min_log_level = SetMinLogLevel

from ._version import __version__

# Point the native library at the bundled normalization rule data.
SetDataDir(os.path.join(str(importlib.resources.files('sentencepiece')), 'package_data'))
1211
+
1212
class _LogStream(object):
  """Context manager that redirects stderr (fd-level) into `ostream`.

  With ostream=None it is a complete no-op, so callers can pass an optional
  logstream argument straight through.  The fd-level dup2 is required because
  the native C++ library writes its logs directly to the stderr descriptor.
  """

  def __init__(self, ostream=None):
    self.ostream = ostream
    if self.ostream is not None:
      self.orig_stream_fileno = sys.stderr.fileno()

  def __enter__(self):
    if self.ostream is not None:
      # Keep a duplicate of the real stderr fd, then splice our stream in.
      self.orig_stream_dup = os.dup(self.orig_stream_fileno)
      os.dup2(self.ostream.fileno(), self.orig_stream_fileno)

  def __exit__(self, type, value, traceback):
    if self.ostream is not None:
      # Restore the original stderr fd and close the capture stream.
      os.close(self.orig_stream_fileno)
      os.dup2(self.orig_stream_dup, self.orig_stream_fileno)
      os.close(self.orig_stream_dup)
      self.ostream.close()
1229
+
1230
+
source/sentencepiece/_sentencepiece.cpython-312-x86_64-linux-gnu.so ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cf6b023896aa1c99026445efc2b5cbd3b8a97c6839dd0fdfb2bca757fbed970d
3
+ size 2005912
source/sentencepiece/_version.py ADDED
@@ -0,0 +1 @@
 
 
1
+ __version__ = '0.2.1'
source/sentencepiece/package_data/nfkc.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:52f11028ff8a7df3e009d94a8b6a54d4a8a17132efb4ccc1c9a0a41e432bd91e
3
+ size 240008
source/sentencepiece/package_data/nfkc_cf.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:60833ec11201446659c3549c183b18f024c4007628cc6b3a4e91ae007697b826
3
+ size 247028
source/sentencepiece/package_data/nmt_nfkc.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:79372c41389c2b9b29bc171017ab5400e352debd686b02670a42bec709015074
3
+ size 240007
source/sentencepiece/package_data/nmt_nfkc_cf.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:22c292c76f503795f30c85d79d30a7f572fff4f49e00392d2d60f4f93e941a1e
3
+ size 247027
source/sentencepiece/sentencepiece.i ADDED
@@ -0,0 +1,2013 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ %module sentencepiece
2
+ %include exception.i
3
+
4
+ %{
5
+
6
+ #include <atomic>
7
+ #include <iostream>
8
+ #include <algorithm>
9
+ #include <functional>
10
+ #include <limits>
11
+ #include <cmath>
12
+ #include <thread>
13
+ #include <vector>
14
+ #include <sentencepiece_processor.h>
15
+ #include <sentencepiece_trainer.h>
16
+
17
+ namespace {
18
+ PyObject* kUnicodeInput = reinterpret_cast<PyObject* >(0x1);
19
+ PyObject* kByteInput = reinterpret_cast<PyObject* >(0x2);
20
+
21
+ using BytesArray = std::vector<sentencepiece::util::bytes>;
22
+
23
// Drop our reference to a SWIG result object unless it is one of the sentinel
// markers (kUnicodeInput / kByteInput) that merely tag the input encoding and
// are not real PyObjects.
inline void ReleaseResultObject(PyObject *obj) {
  if (obj != nullptr && obj != kUnicodeInput && obj != kByteInput) {
    Py_XDECREF(obj);
  }
}
28
+
29
// Lightweight, zero-copy view over a Python `str` or `bytes` argument.
// For `str` it borrows the UTF-8 buffer cached on the unicode object; for
// `bytes` it borrows the raw buffer.  The view is only valid while the
// original PyObject stays alive.  input_type() records which kind was seen so
// results can be rendered back in the same representation.
class PyInputString {
 public:
  explicit PyInputString(PyObject* obj) {
    if (PyUnicode_Check(obj)) {
      str_ = const_cast<char *>(PyUnicode_AsUTF8AndSize(obj, &size_));
      input_type_ = kUnicodeInput;
    } else if (PyBytes_Check(obj)) {
      PyBytes_AsStringAndSize(obj, &str_, &size_);
      input_type_ = kByteInput;
    } else {
      // Unsupported argument type; IsAvalable() reports false.
      str_ = nullptr;
    }
  }
  absl::string_view str() const { return absl::string_view(data(), size()); }
  const char* data() const { return str_; }
  Py_ssize_t size() const { return size_; }
  bool IsAvalable() const { return str_ != nullptr; }  // [sic] spelling kept: public API
  PyObject *input_type() const { return input_type_; }

  // True when the result should be rendered back as unicode (`str`).
  static bool IsUnicode(PyObject *resultobj) {
    return (resultobj == nullptr || resultobj == kUnicodeInput);
  }

 private:
  PyObject* input_type_ = nullptr;
  char* str_ = nullptr;
  Py_ssize_t size_ = 0;
};
57
+
58
// Build the Python result string, mirroring the input representation tracked
// in `resultobj`: unicode input (or none) yields `str`, bytes input yields
// `bytes`.
PyObject* MakePyOutputString(const std::string& output,
                             PyObject *resultobj) {
  if (PyInputString::IsUnicode(resultobj)) {
    return PyUnicode_FromStringAndSize(output.data(), output.size());
  }
  return PyBytes_FromStringAndSize(output.data(), output.size());
}
65
+
66
// Serialized protos are always returned as `bytes`, regardless of input type.
PyObject* MakePyOutputBytes(const sentencepiece::util::bytes& output) {
  return PyBytes_FromStringAndSize(output.data(), output.size());
}
69
+
70
+ int ToSwigError(sentencepiece::util::StatusCode code) {
71
+ switch (code) {
72
+ case sentencepiece::util::StatusCode::kNotFound:
73
+ return SWIG_IOError;
74
+ case sentencepiece::util::StatusCode::kOutOfRange:
75
+ return SWIG_IndexError;
76
+ case sentencepiece::util::StatusCode::kInvalidArgument:
77
+ return SWIG_SyntaxError;
78
+ default:
79
+ return SWIG_RuntimeError;
80
+ }
81
+ return SWIG_RuntimeError;
82
+ }
83
+
84
// Adapts a Python iterator of str/bytes sentences into the SentenceIterator
// interface consumed by the native trainer.  Trailing '\r'/'\n' characters
// are stripped from each sentence.  Non-string items set an error status that
// the trainer surfaces to the caller.
class PySentenceIterator : public sentencepiece::SentenceIterator {
  public:
  PySentenceIterator(PyObject *iter) : iter_(iter) {
    // Prime the first element so done()/value() are valid immediately.
    item_ = PyIter_Next(iter_);
    CopyValue();
  }

  ~PySentenceIterator() {
   // Py_XDECREF(iter_);
  }

  bool done() const override {
    return item_ == nullptr;
  }

  void Next() override {
    item_ = PyIter_Next(iter_);
    CopyValue();
  }

  const std::string &value() const override {
    return value_;
  }

  sentencepiece::util::Status status() const override {
    return status_;
  }

  private:
   // Copy the current item into value_, trimming trailing CR/LF bytes.
   // The new reference returned by PyIter_Next is released here; afterwards
   // item_ is only compared against nullptr (in done()), never dereferenced.
   void CopyValue() {
     if (item_ == nullptr) return;
     const PyInputString ustring(item_);
     if (ustring.IsAvalable()) {
       const char *data = ustring.data();
       size_t size = ustring.size();
       while (size > 0) {
         if (data[size - 1] == '\r' || data[size - 1] == '\n')
           --size;
         else
           break;
       }
       value_.assign(data, size);
     } else {
       status_ = sentencepiece::util::Status(sentencepiece::util::StatusCode::kInternal,
                                             "Not a string.");
     }
     Py_XDECREF(item_);
   }
   PyObject *iter_ = nullptr;
   PyObject *item_ = nullptr;
   std::string value_;
   sentencepiece::util::Status status_;
};
137
+
138
// Post-process an id sequence: optional reverse first, then BOS/EOS insertion
// (so the markers end up at the outer ends of the *reversed* sequence).
// emit_unk_piece is accepted for signature parity but has no effect on ids.
inline void RewriteIds(const sentencepiece::SentencePieceProcessor &sp,
                       std::vector<int> *ids,
                       bool add_bos, bool add_eos, bool reverse, bool emit_unk_piece) {
  if (!add_bos && !add_eos && !reverse) return;
  if (reverse) std::reverse(ids->begin(), ids->end());
  if (add_bos) ids->insert(ids->begin(), sp.bos_id());
  if (add_eos) ids->push_back(sp.eos_id());
}
146
+
147
// Post-process a piece sequence: optional reverse, BOS/EOS piece insertion,
// and optionally replace every piece that maps to the unknown id with the
// literal unk piece string.
inline void RewriteIds(const sentencepiece::SentencePieceProcessor &sp,
                       std::vector<std::string> *pieces,
                       bool add_bos, bool add_eos, bool reverse, bool emit_unk_piece) {
  if (!add_bos && !add_eos && !reverse && !emit_unk_piece) return;
  if (reverse) std::reverse(pieces->begin(), pieces->end());
  if (add_bos) pieces->insert(pieces->begin(), sp.IdToPiece(sp.bos_id()))
;
  if (add_eos) pieces->push_back(sp.IdToPiece(sp.eos_id()));
  if (emit_unk_piece) {
    const auto &unk = sp.IdToPiece(sp.unk_id());
    for (auto &piece : *pieces) {
      const int id = sp.PieceToId(piece);
      if (id == sp.unk_id()) {
        piece = unk;
      }
    }
  }
}
164
+
165
// The serialized-proto output cannot be post-processed: reject any rewrite
// option with an Unimplemented status (caught and surfaced by %exception).
inline void RewriteIds(const sentencepiece::SentencePieceProcessor &sp,
                       sentencepiece::util::bytes *proto,
                       bool add_bos, bool add_eos, bool reverse, bool emit_unk_piece) {
  if (add_bos || add_eos || reverse || emit_unk_piece) {
    throw sentencepiece::util::Status(
        sentencepiece::util::StatusCode::kUnimplemented,
        "add_bos, add_eos, reverse, and emit_unk_piece is not supported in proto API");
  }
}
174
+
175
// Same restriction for the immutable-proto output: rewrite options are not
// supported and raise an Unimplemented status.
inline void RewriteIds(const sentencepiece::SentencePieceProcessor &sp,
                       sentencepiece::ImmutableSentencePieceText *proto,
                       bool add_bos, bool add_eos, bool reverse, bool emit_unk_piece) {
  if (add_bos || add_eos || reverse || emit_unk_piece) {
    throw sentencepiece::util::Status(
        sentencepiece::util::StatusCode::kUnimplemented,
        "add_bos, add_eos, reverse, and emit_unk_piece is not supported in proto API");
  }
}
184
+
185
// Validate that every id is inside [0, num_pieces); out-of-range ids raise an
// OutOfRange status (surfaced as IndexError on the Python side).
inline void CheckIds(const std::vector<int> &ids, int num_pieces) {
  for (int id : ids) {
    if (id < 0 || id >= num_pieces) {
      throw sentencepiece::util::Status(
          sentencepiece::util::StatusCode::kOutOfRange,
          "piece id is out of range.");
    }
  }
}
194
+
195
// Piece (string) inputs need no range check; overload kept so templated
// callers can call CheckIds unconditionally.
inline void CheckIds(const std::vector<absl::string_view> &ids, int num_pieces) {}
196
+
197
// Apply the per-sequence id range check to every sequence of a batch.
inline void CheckIdsBatch(const std::vector<std::vector<int>> &ids, int num_pieces) {
  for (const auto &v : ids) CheckIds(v, num_pieces);
}
200
+
201
// No-op by default: only the immutable proto result types carry byte spans
// that must be converted to unicode character spans for Python consumers.
template <typename T>
inline void ConvertToUnicodeSpans(T *proto) {}

template <>
inline void ConvertToUnicodeSpans(sentencepiece::ImmutableSentencePieceText *proto) {
  proto->ConvertToUnicodeSpans();
}

template <>
inline void ConvertToUnicodeSpans(sentencepiece::ImmutableNBestSentencePieceText *proto) {
  proto->ConvertToUnicodeSpans();
}
213
+
214
// Minimal "pool": each Schedule() call (beyond the tiny-batch case) spawns a
// real std::thread immediately; all threads are joined in the destructor, so
// scoping the pool acts as the batch barrier.  For batches smaller than two
// requests the closure runs inline to avoid thread start-up overhead.
class ThreadPool {
 public:
  explicit ThreadPool(size_t request_size) :
   request_size_(request_size) {}

  virtual ~ThreadPool() {
    // Joining here is what guarantees all scheduled work has finished.
    for (auto &task : tasks_) {
      task.join();
    }
  }

  void Schedule(std::function<void()> closure) {
    static constexpr size_t kMinThreadSize = 2;
    if (request_size_ < kMinThreadSize) {
      closure();  // run synchronously on the caller's thread
    } else {
      tasks_.emplace_back(closure);  // spawns a worker thread now
    }
  }

 private:
  size_t request_size_ = 0;
  std::vector<std::thread> tasks_;
};
238
+
239
// Resolve the effective thread count: a negative request means "use hardware
// concurrency"; the result is then clamped to [1, min(#inputs, 256)] so we
// never spawn more threads than there are work items.
template <typename T>
inline void InitNumThreads(const std::vector<T> &ins, int *num_threads) {
  if (*num_threads < 0) {
    *num_threads = std::thread::hardware_concurrency();
  }
  *num_threads = std::max<int>(1,
                               std::min<int>({*num_threads,
                                              static_cast<int>(ins.size()), 256}));
}
248
+
249
+ #define DEFINE_ENCODE_BATCH_FUNC_IMPL(FuncName, InType, OutType) \
250
+ std::vector<OutType> outs(ins.size()); \
251
+ InitNumThreads(ins, &num_threads); \
252
+ { \
253
+ ThreadPool pool(ins.size()); \
254
+ std::atomic<size_t> index = 0; \
255
+ for (int n = 0; n < num_threads; ++n) { \
256
+ pool.Schedule([&]() { \
257
+ size_t i = 0; \
258
+ while ((i = std::atomic_fetch_add(&index, 1)) < outs.size()) { \
259
+ auto out = enable_sampling ? \
260
+ self->Sample##FuncName(ins[i], \
261
+ nbest_size, alpha) : \
262
+ self->FuncName(ins[i]); \
263
+ RewriteIds(*self, &out, add_bos, add_eos, reverse, \
264
+ emit_unk_piece); \
265
+ ConvertToUnicodeSpans(&out); \
266
+ outs[i] = std::move(out); \
267
+ } \
268
+ }); \
269
+ } \
270
+ } \
271
+ return outs;
272
+
273
// Body shared by the _Decode*Batch wrappers below.  Same worker scheme as
// DEFINE_ENCODE_BATCH_FUNC_IMPL (atomic work-stealing index, ThreadPool
// joined at end of scope), but without the sampling/RewriteIds step.
#define DEFINE_DECODE_BATCH_FUNC_IMPL(FuncName, InType, OutType)        \
  std::vector<OutType> outs(ins.size());                                \
  InitNumThreads(ins, &num_threads);                                    \
  {                                                                     \
    std::atomic<size_t> index = 0;                                      \
    ThreadPool pool(ins.size());                                        \
    for (int n = 0; n < num_threads; ++n) {                             \
      pool.Schedule([&]() {                                             \
        size_t i = 0;                                                   \
        while ((i = std::atomic_fetch_add(&index, 1)) < outs.size()) {  \
          auto out = self->FuncName(ins[i]);                            \
          ConvertToUnicodeSpans(&out);                                  \
          outs[i] = std::move(out);                                     \
        }                                                               \
      });                                                               \
    }                                                                   \
  }                                                                     \
  return outs;
291
+
292
+ } // namespace
293
+ %}
294
+
295
%init %{
// On free-threaded CPython builds (PEP 703), declare that this extension
// module does not require the GIL to be re-enabled for it.
#ifdef Py_GIL_DISABLED
  PyUnstable_Module_SetGIL(m, Py_MOD_GIL_NOT_USED);
#endif
%}
300
+
301
// Wraps every generated wrapper function: run the call, post-process the
// SWIG result via ReleaseResultObject (defined earlier in this file), and
// translate sentencepiece::util::Status errors into Python exceptions.
%exception {
  try {
    $action
    ReleaseResultObject(resultobj);
  }
  catch (const sentencepiece::util::Status &status) {
    SWIG_exception(ToSwigError(status.code()), status.ToString().c_str());
  }
}
310
+
311
// Map uint32_t onto SWIG's unsigned int typemaps.
%apply unsigned int { uint32_t }

// ---------------------------------------------------------------------------
// Hide C++-only types and raw accessors from the generated Python API.
%ignore sentencepiece::util::Status;
%ignore sentencepiece::util::StatusCode;
%ignore absl::string_view;
%ignore std::string_view;
%ignore sentencepiece::SentencePieceText;
%ignore sentencepiece::NormalizerSpec;
%ignore sentencepiece::TrainerSpec;
%ignore sentencepiece::SentencePieceProcessor::status;
%ignore sentencepiece::ImmutableSentencePieceText::mutable_proto;
%ignore sentencepiece::ImmutableSentencePieceText::pieces() const;
%ignore sentencepiece::ImmutableSentencePieceText::ConvertToUnicodeSpans;
%ignore sentencepiece::ImmutableNBestSentencePieceText::mutable_proto;
%ignore sentencepiece::ImmutableNBestSentencePieceText::nbests() const;
%ignore sentencepiece::ImmutableNBestSentencePieceText::ConvertToUnicodeSpans;

// Hide the raw C++ encode/decode entry points; the Python-facing variants
// are provided as %extend wrappers further down in this file.
%ignore sentencepiece::SentencePieceProcessor::Encode;
%ignore sentencepiece::SentencePieceProcessor::SampleEncode;
%ignore sentencepiece::SentencePieceProcessor::NBestEncode;
%ignore sentencepiece::SentencePieceProcessor::SampleEncodeAndScore;
%ignore sentencepiece::SentencePieceProcessor::Decode;

%ignore sentencepiece::SentencePieceProcessor::EncodeAsPieces;
%ignore sentencepiece::SentencePieceProcessor::EncodeAsIds;
%ignore sentencepiece::SentencePieceProcessor::SampleEncodeAsIds;
%ignore sentencepiece::SentencePieceProcessor::SampleEncodeAsPieces;
%ignore sentencepiece::SentencePieceProcessor::NBestEncodeAsIds;
%ignore sentencepiece::SentencePieceProcessor::NBestEncodeAsPieces;
%ignore sentencepiece::SentencePieceProcessor::SampleEncodeAndScoreAsIds;
%ignore sentencepiece::SentencePieceProcessor::SampleEncodeAndScoreAsPieces;
%ignore sentencepiece::SentencePieceProcessor::DecodeIds;
%ignore sentencepiece::SentencePieceProcessor::DecodePieces;

%ignore sentencepiece::SentencePieceProcessor::EncodeAsSerializedProto;
%ignore sentencepiece::SentencePieceProcessor::SampleEncodeAsSerializedProto;
%ignore sentencepiece::SentencePieceProcessor::NBestEncodeAsSerializedProto;
%ignore sentencepiece::SentencePieceProcessor::SampleEncodeAndScoreAsSerializedProto;
%ignore sentencepiece::SentencePieceProcessor::DecodePiecesAsSerializedProto;
%ignore sentencepiece::SentencePieceProcessor::DecodeIdsAsSerializedProto;

%ignore sentencepiece::SentencePieceProcessor::EncodeAsImmutableProto;
%ignore sentencepiece::SentencePieceProcessor::SampleEncodeAsImmutableProto;
%ignore sentencepiece::SentencePieceProcessor::NBestEncodeAsImmutableProto;
%ignore sentencepiece::SentencePieceProcessor::SampleEncodeAndScoreAsImmutableProto;
%ignore sentencepiece::SentencePieceProcessor::DecodePiecesAsImmutableProto;
%ignore sentencepiece::SentencePieceProcessor::DecodeIdsAsImmutableProto;

%ignore sentencepiece::SentencePieceProcessor::Normalize;
%ignore sentencepiece::SentencePieceProcessor::NormalizeWithOffsets;

// Hide model loading/configuration internals and trainer plumbing.
%ignore sentencepiece::SentencePieceProcessor::model_proto;
%ignore sentencepiece::SentencePieceProcessor::mutable_normalizer_spec;
%ignore sentencepiece::SentencePieceProcessor::Load;
%ignore sentencepiece::SentencePieceProcessor::LoadOrDie;
%ignore sentencepiece::SentencePieceProcessor::SetModel;
%ignore sentencepiece::SentencePieceProcessor::SetNormalizer;
%ignore sentencepiece::pretokenizer::PretokenizerForTrainingInterface;
%ignore sentencepiece::SentenceIterator;
%ignore sentencepiece::ConvertToUnicodeSpans;
%ignore sentencepiece::SentencePieceTrainer::Train;
%ignore sentencepiece::SentencePieceTrainer::GetNormalizerSpec;
%ignore sentencepiece::SentencePieceTrainer::PopulateNormalizerSpec;
%ignore sentencepiece::SentencePieceTrainer::MergeSpecsFromArgs;
%ignore sentencepiece::SentencePieceTrainer::SetProtoField;
%ignore sentencepiece::SentencePieceTrainer::PopulateModelTypeFromString;
%ignore sentencepiece::SentencePieceTrainer::PieceProcecssor;
%ignore sentencepiece::SentencePieceTrainer::SetPretokenizerForTraining;
%ignore sentencepiece::SentencePieceTrainer::GetPretokenizerForTraining;
%ignore sentencepiece::SentencePieceTrainer::SetDataDir;
%ignore sentencepiece::ConvertToUnicodeAlignment;

%ignore sentencepiece::SentencePieceNormalizer::Load;
%ignore sentencepiece::SentencePieceNormalizer::Normalize;
%ignore sentencepiece::SentencePieceNormalizer::mutable_normalizer_spec;

%ignore sentencepiece::io::LoadModelProto;
%ignore sentencepiece::io::SaveModelProto;
389
+
390
%extend sentencepiece::SentencePieceProcessor {
  // Renamed wrapper for Load(): the C++ Load overloads are %ignore'd above,
  // so Python sees LoadFromFile instead.
  sentencepiece::util::Status LoadFromFile(absl::string_view arg) {
    return $self->Load(arg);
  }
394
+
395
  /////////////////////////////////////////////////////////////////////////////
  // EncodeAs* (Single request)
  //
  // Each helper picks deterministic encoding or subword sampling (driven by
  // enable_sampling/nbest_size/alpha), then applies the common id rewriting
  // (BOS/EOS, reversal, unk-piece emission) via RewriteIds.

  // Encode to token ids.
  std::vector<int> _EncodeAsIds(absl::string_view text,
                                bool enable_sampling,
                                int nbest_size, float alpha,
                                bool add_bos, bool add_eos, bool reverse,
                                bool emit_unk_piece) const {
    auto ids = enable_sampling ?
               $self->SampleEncodeAsIds(text, nbest_size, alpha) :
               $self->EncodeAsIds(text);
    RewriteIds(*$self, &ids, add_bos, add_eos, reverse, emit_unk_piece);
    return ids;
  }

  // Encode to surface piece strings.
  std::vector<std::string> _EncodeAsPieces(absl::string_view text,
                                           bool enable_sampling,
                                           int nbest_size, float alpha,
                                           bool add_bos, bool add_eos, bool reverse,
                                           bool emit_unk_piece) const {
    auto pieces = enable_sampling ?
                  $self->SampleEncodeAsPieces(text, nbest_size, alpha) :
                  $self->EncodeAsPieces(text);
    RewriteIds(*$self, &pieces, add_bos, add_eos, reverse, emit_unk_piece);
    return pieces;
  }

  // Encode to a serialized SentencePieceText proto (raw bytes).
  // RewriteIds on a util::bytes* presumably only validates the flags —
  // confirm against the RewriteIds overloads defined earlier in this file.
  sentencepiece::util::bytes _EncodeAsSerializedProto(absl::string_view text,
                                                      bool enable_sampling,
                                                      int nbest_size, float alpha,
                                                      bool add_bos, bool add_eos, bool reverse,
                                                      bool emit_unk_piece) const {
    auto proto = enable_sampling ?
                 $self->SampleEncodeAsSerializedProto(text, nbest_size, alpha) :
                 $self->EncodeAsSerializedProto(text);
    RewriteIds(*$self, &proto, add_bos, add_eos, reverse, emit_unk_piece);
    return proto;
  }

  // Encode to an immutable proto object.
  // NOTE(review): here ConvertToUnicodeSpans() runs *before* RewriteIds,
  // while the batch macro runs RewriteIds first — confirm the ordering is
  // intentional.
  sentencepiece::ImmutableSentencePieceText
  _EncodeAsImmutableProto(absl::string_view text,
                          bool enable_sampling,
                          int nbest_size, float alpha,
                          bool add_bos, bool add_eos, bool reverse,
                          bool emit_unk_piece) const {
    auto proto = enable_sampling ?
                 $self->SampleEncodeAsImmutableProto(text, nbest_size, alpha) :
                 $self->EncodeAsImmutableProto(text);
    proto.ConvertToUnicodeSpans();
    RewriteIds(*$self, &proto, add_bos, add_eos, reverse, emit_unk_piece);
    return proto;
  }
446
+
447
  /////////////////////////////////////////////////////////////////////////////
  // EncodeAs* (Batch request)
  //
  // Multithreaded variants over a vector of inputs; all of the per-item work
  // is generated by DEFINE_ENCODE_BATCH_FUNC_IMPL (see the macro definition
  // above for the scheduling scheme).

  std::vector<std::vector<int>> _EncodeAsIdsBatch(
      const std::vector<absl::string_view> &ins, int num_threads,
      bool enable_sampling, int nbest_size, float alpha,
      bool add_bos, bool add_eos, bool reverse,
      bool emit_unk_piece) const {
    DEFINE_ENCODE_BATCH_FUNC_IMPL(EncodeAsIds,
                                  absl::string_view, std::vector<int>);
  }

  std::vector<std::vector<std::string>> _EncodeAsPiecesBatch(
      const std::vector<absl::string_view> &ins, int num_threads,
      bool enable_sampling, int nbest_size, float alpha,
      bool add_bos, bool add_eos, bool reverse,
      bool emit_unk_piece) const {
    DEFINE_ENCODE_BATCH_FUNC_IMPL(EncodeAsPieces,
                                  absl::string_view, std::vector<std::string>);
  }

  BytesArray _EncodeAsSerializedProtoBatch(
      const std::vector<absl::string_view> &ins, int num_threads,
      bool enable_sampling, int nbest_size, float alpha,
      bool add_bos, bool add_eos, bool reverse,
      bool emit_unk_piece) const {
    DEFINE_ENCODE_BATCH_FUNC_IMPL(EncodeAsSerializedProto,
                                  absl::string_view,
                                  sentencepiece::util::bytes);
  }

  std::vector<sentencepiece::ImmutableSentencePieceText>
  _EncodeAsImmutableProtoBatch(
      const std::vector<absl::string_view> &ins, int num_threads,
      bool enable_sampling, int nbest_size, float alpha,
      bool add_bos, bool add_eos, bool reverse,
      bool emit_unk_piece) const {
    DEFINE_ENCODE_BATCH_FUNC_IMPL(EncodeAsImmutableProto,
                                  absl::string_view,
                                  sentencepiece::ImmutableSentencePieceText);
  }
487
+
488
  /////////////////////////////////////////////////////////////////////////////
  // DecodeAs* (Single request)
  //
  // Id-based variants validate ids against the vocabulary size via CheckIds
  // before delegating to the processor.

  std::string _DecodeIds(const std::vector<int> &ids) const {
    CheckIds(ids, $self->GetPieceSize());
    return $self->DecodeIds(ids);
  }

  // Same decode, but surfaced to Python as raw bytes instead of str.
  sentencepiece::util::bytes _DecodeIdsAsBytes(const std::vector<int> &ids) const {
    CheckIds(ids, $self->GetPieceSize());
    return $self->DecodeIds(ids);
  }

  // NOTE(review): unlike its siblings this performs no CheckIds call —
  // confirm whether piece validation is intentionally skipped here.
  std::string _DecodePieces(const std::vector<absl::string_view> &pieces) const {
    return $self->DecodePieces(pieces);
  }

  sentencepiece::util::bytes _DecodeIdsAsSerializedProto(
      const std::vector<int> &ids) const {
    CheckIds(ids, $self->GetPieceSize());
    return $self->DecodeIdsAsSerializedProto(ids);
  }

  sentencepiece::util::bytes _DecodePiecesAsSerializedProto(
      const std::vector<absl::string_view> &pieces) const {
    CheckIds(pieces, $self->GetPieceSize());
    return $self->DecodePiecesAsSerializedProto(pieces);
  }

  sentencepiece::ImmutableSentencePieceText _DecodeIdsAsImmutableProto(
      const std::vector<int> &ids) const {
    CheckIds(ids, $self->GetPieceSize());
    auto proto = $self->DecodeIdsAsImmutableProto(ids);
    proto.ConvertToUnicodeSpans();
    return proto;
  }

  sentencepiece::ImmutableSentencePieceText _DecodePiecesAsImmutableProto(
      const std::vector<absl::string_view> &pieces) const {
    CheckIds(pieces, $self->GetPieceSize());
    auto proto= $self->DecodePiecesAsImmutableProto(pieces);
    proto.ConvertToUnicodeSpans();
    return proto;
  }
531
+
532
  /////////////////////////////////////////////////////////////////////////////
  // DecodeAs* (Batch request)
  //
  // Multithreaded variants generated by DEFINE_DECODE_BATCH_FUNC_IMPL.
  // Id batches are validated up front with CheckIdsBatch; piece batches are
  // not validated here (mirrors _DecodePieces above — confirm intent).

  std::vector<std::string> _DecodeIdsBatch(
      const std::vector<std::vector<int>> &ins, int num_threads) const {
    CheckIdsBatch(ins, $self->GetPieceSize());
    DEFINE_DECODE_BATCH_FUNC_IMPL(DecodeIds, int, std::string);
  }

  BytesArray _DecodeIdsAsBytesBatch(
      const std::vector<std::vector<int>> &ins, int num_threads) const {
    CheckIdsBatch(ins, $self->GetPieceSize());
    DEFINE_DECODE_BATCH_FUNC_IMPL(DecodeIds, int, std::string);
  }

  BytesArray _DecodeIdsAsSerializedProtoBatch(
      const std::vector<std::vector<int>> &ins, int num_threads) const {
    CheckIdsBatch(ins, $self->GetPieceSize());
    DEFINE_DECODE_BATCH_FUNC_IMPL(DecodeIdsAsSerializedProto, int,
                                  sentencepiece::util::bytes);
  }

  std::vector<sentencepiece::ImmutableSentencePieceText>
  _DecodeIdsAsImmutableProtoBatch(
      const std::vector<std::vector<int>> &ins, int num_threads) const {
    CheckIdsBatch(ins, $self->GetPieceSize());
    DEFINE_DECODE_BATCH_FUNC_IMPL(DecodeIdsAsImmutableProto, int,
                                  sentencepiece::ImmutableSentencePieceText);
  }

  std::vector<std::string> _DecodePiecesBatch(
      const std::vector<std::vector<absl::string_view>> &ins, int num_threads) const {
    DEFINE_DECODE_BATCH_FUNC_IMPL(DecodePieces, std::string, std::string);
  }

  BytesArray _DecodePiecesAsSerializedProtoBatch(
      const std::vector<std::vector<absl::string_view>> &ins, int num_threads) const {
    DEFINE_DECODE_BATCH_FUNC_IMPL(DecodePiecesAsSerializedProto, std::string,
                                  sentencepiece::util::bytes);
  }

  std::vector<sentencepiece::ImmutableSentencePieceText>
  _DecodePiecesAsImmutableProtoBatch(
      const std::vector<std::vector<absl::string_view>> &ins, int num_threads) const {
    DEFINE_DECODE_BATCH_FUNC_IMPL(DecodePiecesAsImmutableProto, std::string,
                                  sentencepiece::ImmutableSentencePieceText);
  }
578
+
579
  ////////////////////////////////////////////////////////////////////////////
  // NBestEncodeAs* (Single request)
  //
  // Returns the nbest_size best segmentations; id/piece variants apply
  // RewriteIds to each hypothesis.

  std::vector<std::vector<int>>
  _NBestEncodeAsIds(absl::string_view text,
                    int nbest_size,
                    bool add_bos, bool add_eos, bool reverse,
                    bool emit_unk_piece) const {
    auto idss = $self->NBestEncodeAsIds(text, nbest_size);
    for (auto &ids : idss) {
      RewriteIds(*$self, &ids, add_bos, add_eos, reverse, emit_unk_piece);
    }
    return idss;
  }

  std::vector<std::vector<std::string>>
  _NBestEncodeAsPieces(absl::string_view text,
                       int nbest_size,
                       bool add_bos, bool add_eos, bool reverse,
                       bool emit_unk_piece) const {
    auto piecess = $self->NBestEncodeAsPieces(text, nbest_size);
    for (auto &pieces : piecess) {
      RewriteIds(*$self, &pieces, add_bos, add_eos, reverse, emit_unk_piece);
    }
    return piecess;
  }

  // Proto variants call RewriteIds with a null pointer — presumably a
  // flag-validation-only overload (rewriting is unsupported on serialized
  // output); confirm against the RewriteIds overloads earlier in this file.
  sentencepiece::util::bytes
  _NBestEncodeAsSerializedProto(absl::string_view text,
                                int nbest_size,
                                bool add_bos, bool add_eos, bool reverse,
                                bool emit_unk_piece) const {
    RewriteIds(*$self, static_cast<sentencepiece::util::bytes *>(nullptr),
               add_bos, add_eos, reverse, emit_unk_piece);
    return $self->NBestEncodeAsSerializedProto(text, nbest_size);
  }

  sentencepiece::ImmutableNBestSentencePieceText
  _NBestEncodeAsImmutableProto(absl::string_view text,
                               int nbest_size,
                               bool add_bos, bool add_eos, bool reverse,
                               bool emit_unk_piece) const {
    RewriteIds(*$self, static_cast<sentencepiece::ImmutableSentencePieceText *>(nullptr),
               add_bos, add_eos, reverse, emit_unk_piece);
    auto proto = $self->NBestEncodeAsImmutableProto(text, nbest_size);
    proto.ConvertToUnicodeSpans();
    return proto;
  }
626
+
627
+
628
  /////////////////////////////////////////////////////////////////////////////
  // SampleEncodeAndScoreAs* (Single request)
  //
  // Draws num_samples segmentations (alpha = inverse temperature; wor =
  // without replacement; include_best adds the Viterbi segmentation) and
  // returns each with its score.

  std::vector<std::pair<std::vector<int>, float>>
  _SampleEncodeAndScoreAsIds(absl::string_view text,
                             int num_samples, float alpha, bool wor,
                             bool include_best,
                             bool add_bos, bool add_eos, bool reverse,
                             bool emit_unk_piece) const {
    auto idss = $self->SampleEncodeAndScoreAsIds(text, num_samples,
                                                 alpha, wor, include_best);
    for (auto &ids : idss) {
      RewriteIds(*$self, &ids.first, add_bos, add_eos, reverse, emit_unk_piece);
    }
    return idss;
  }

  std::vector<std::pair<std::vector<std::string>, float>>
  _SampleEncodeAndScoreAsPieces(absl::string_view text,
                                int num_samples, float alpha, bool wor,
                                bool include_best,
                                bool add_bos, bool add_eos, bool reverse,
                                bool emit_unk_piece) const {
    auto piecess = $self->SampleEncodeAndScoreAsPieces(text, num_samples,
                                                       alpha, wor, include_best);
    for (auto &pieces : piecess) {
      RewriteIds(*$self, &pieces.first, add_bos, add_eos, reverse, emit_unk_piece);
    }
    return piecess;
  }

  // As with the NBest proto variants, the nullptr RewriteIds call
  // presumably only validates the rewrite flags — confirm.
  sentencepiece::util::bytes
  _SampleEncodeAndScoreAsSerializedProto(absl::string_view text,
                                         int num_samples, float alpha, bool wor,
                                         bool include_best,
                                         bool add_bos, bool add_eos, bool reverse,
                                         bool emit_unk_piece) const {
    RewriteIds(*$self, static_cast<sentencepiece::util::bytes *>(nullptr),
               add_bos, add_eos, reverse, emit_unk_piece);
    return $self->SampleEncodeAndScoreAsSerializedProto(text, num_samples,
                                                        alpha, wor, include_best);
  }

  sentencepiece::ImmutableNBestSentencePieceText
  _SampleEncodeAndScoreAsImmutableProto(absl::string_view text,
                                        int num_samples, float alpha, bool wor,
                                        bool include_best,
                                        bool add_bos, bool add_eos, bool reverse,
                                        bool emit_unk_piece) const {
    RewriteIds(*$self, static_cast<sentencepiece::util::bytes *>(nullptr),
               add_bos, add_eos, reverse, emit_unk_piece);
    auto proto = $self->SampleEncodeAndScoreAsImmutableProto(text, num_samples,
                                                             alpha, wor, include_best);
    proto.ConvertToUnicodeSpans();
    return proto;
  }
683
+
684
  // Normalize: applies the model's text normalization only (no tokenization).
  std::string _Normalize(absl::string_view text) {
    return $self->Normalize(text);
  }

  // Normalization plus byte-offset alignment from normalized to input text.
  // Errors from Normalize are deliberately ignored; on failure the result
  // may be left empty/partial.
  std::pair<std::string, std::vector<size_t>> _NormalizeWithOffsets(absl::string_view text) {
    std::pair<std::string, std::vector<size_t>> result;
    $self->Normalize(text, &result.first, &result.second).IgnoreError();
    return result;
  }

  // Calculate Entropy of the segmentation lattice for `text`.
  float _CalculateEntropy(absl::string_view text, float alpha) {
    return $self->CalculateEntropy(text, alpha);
  }

  // Batch entropy over `ins`, parallelized with the same atomic-index /
  // ThreadPool scheme as the batch encode/decode macros.
  std::vector<float> _CalculateEntropyBatch(const std::vector<absl::string_view> &ins,
                                            float alpha, int num_threads) {
    std::vector<float> outs(ins.size());
    InitNumThreads(ins, &num_threads);
    {
      ThreadPool pool(ins.size());
      std::atomic<size_t> index = 0;
      for (int n = 0; n < num_threads; ++n) {
        pool.Schedule([&]() {
          size_t i = 0;
          while ((i = std::atomic_fetch_add(&index, 1)) < outs.size()) {
            outs[i] = self->CalculateEntropy(ins[i], alpha);
          }
        });
      }
    }
    return outs;
  }

  // Applies key/value overrides to the processor's normalizer spec via the
  // trainer's proto-field setter; stops at the first failing key and
  // returns that error status.
  sentencepiece::util::Status _OverrideNormalizerSpec(
      const std::unordered_map<std::string, std::string> &args) {
    sentencepiece::util::Status status;
    for (const auto &[key, value] : args) {
      status = sentencepiece::SentencePieceTrainer::SetProtoField(
          key, value,
          $self->mutable_normalizer_spec());
      if (!status.ok()) return status;
    }
    return status;
  }
731
+
732
%pythoncode {
  def Init(self,
           model_file=None,
           model_proto=None,
           out_type=int,
           add_bos=False,
           add_eos=False,
           reverse=False,
           emit_unk_piece=False,
           enable_sampling=False,
           nbest_size=-1,
           alpha=0.1,
           num_threads=-1):
    """Initialize SentencePieceProcessor.

    Args:
      model_file: The sentencepiece model file path.
      model_proto: The sentencepiece model serialized proto.
      out_type: output type. int or str.
      add_bos: Add <s> to the result (Default = false)
      add_eos: Add </s> to the result (Default = false) <s>/</s> is added after
        reversing (if enabled).
      reverse: Reverses the tokenized sequence (Default = false)
      emit_unk_piece: Emits the unk literal string (Default = false)
      nbest_size: sampling parameters for unigram. Invalid in BPE-Dropout.
        nbest_size = {0,1}: No sampling is performed.
        nbest_size > 1: samples from the nbest_size results.
        nbest_size < 0: assuming that nbest_size is infinite and samples
          from the all hypothesis (lattice) using
          forward-filtering-and-backward-sampling algorithm.
      alpha: Smoothing parameter for unigram sampling, and dropout probability of
        merge operations for BPE-dropout.
      num_threads: number of threads in batch processing (Default = -1, auto-detected)
    """

    _sentencepiece_processor_init_native(self)
    # Per-processor defaults; Encode()/NBestEncode()/Decode() fall back to
    # these whenever the corresponding keyword argument is left as None.
    self._out_type = out_type
    self._add_bos = add_bos
    self._add_eos = add_eos
    self._reverse = reverse
    self._emit_unk_piece = emit_unk_piece
    self._enable_sampling = enable_sampling
    self._nbest_size = nbest_size
    self._alpha = alpha
    self._num_threads = num_threads
    if model_file or model_proto:
      self.Load(model_file=model_file, model_proto=model_proto)
779
+
780
+
781
def Encode(self,
           input,
           out_type=None,
           add_bos=None,
           add_eos=None,
           reverse=None,
           emit_unk_piece=None,
           enable_sampling=None,
           nbest_size=None,
           alpha=None,
           num_threads=None):
  """Encode text input to segmented ids or tokens.

  Args:
    input: input string. Accepts a list of strings for batch processing.
    out_type: output type. int, str, 'serialized_proto' (alias 'proto'),
      or 'immutable_proto'.
    add_bos: Add <s> to the result (Default = false)
    add_eos: Add </s> to the result (Default = false) <s>/</s> is added after
      reversing (if enabled).
    reverse: Reverses the tokenized sequence (Default = false)
    emit_unk_piece: Emits the unk literal string (Default = false)
    nbest_size: sampling parameters for unigram. Invalid in BPE-Dropout.
      nbest_size = {0,1}: No sampling is performed.
      nbest_size > 1: samples from the nbest_size results.
      nbest_size < 0: assuming that nbest_size is infinite and samples
        from the all hypothesis (lattice) using
        forward-filtering-and-backward-sampling algorithm.
    alpha: Smoothing parameter for unigram sampling, and merge probability
      for BPE-dropout (probability 'p' in the BPE-dropout paper).
    num_threads: the number of threads used in the batch processing
      (Default = -1, auto-detected).

  Returns:
    The encoded result; a list of per-input results when `input` is a list.

  Raises:
    RuntimeError: if the sampling parameters are inconsistent, if
      num_threads is not an int, or if out_type is not recognized.
  """

  # Fall back to the per-processor defaults captured in Init() for every
  # argument the caller left unset.
  if out_type is None:
    out_type = self._out_type
  if add_bos is None:
    add_bos = self._add_bos
  if add_eos is None:
    add_eos = self._add_eos
  if reverse is None:
    reverse = self._reverse
  if emit_unk_piece is None:
    emit_unk_piece = self._emit_unk_piece
  if enable_sampling is None:
    enable_sampling = self._enable_sampling
  if nbest_size is None:
    nbest_size = self._nbest_size
  if alpha is None:
    alpha = self._alpha
  if num_threads is None:
    num_threads = self._num_threads

  # `== True` kept deliberately: truthy non-True values (other than 1) must
  # not trigger this validation, matching historical behavior.
  if enable_sampling == True and (nbest_size is None or nbest_size == 0 or
                                  nbest_size == 1 or alpha is None):
    raise RuntimeError(
        'When enable_sampling is True, We must specify "nbest_size > 1" or "nbest_size = -1", '
        'and "alpha". "nbest_size" is enabled only on unigram mode ignored in BPE-dropout. '
        'when "nbest_size = -1" , this method samples from all candidates on the lattice '
        'instead of nbest segmentations.'
    )

  if num_threads is None or type(num_threads) is not int:
    raise RuntimeError('num_threads must be int')

  # Batch path: dispatch on out_type to the multithreaded C++ wrappers.
  if type(input) is list:
    if out_type is int:
      return self._EncodeAsIdsBatch(input, num_threads, enable_sampling, nbest_size,
                                    alpha, add_bos, add_eos, reverse, emit_unk_piece)
    if out_type is str:
      return self._EncodeAsPiecesBatch(input, num_threads, enable_sampling, nbest_size,
                                       alpha, add_bos, add_eos, reverse, emit_unk_piece)
    if out_type == 'serialized_proto' or out_type == 'proto':
      return self._EncodeAsSerializedProtoBatch(input, num_threads, enable_sampling, nbest_size,
                                                alpha, add_bos, add_eos, reverse, emit_unk_piece)
    if out_type == 'immutable_proto':
      return self._EncodeAsImmutableProtoBatch(input, num_threads, enable_sampling, nbest_size,
                                               alpha, add_bos, add_eos, reverse, emit_unk_piece)

  # Single-request path.
  if out_type is int:
    return self._EncodeAsIds(input, enable_sampling, nbest_size,
                             alpha, add_bos, add_eos, reverse, emit_unk_piece)
  if out_type is str:
    return self._EncodeAsPieces(input, enable_sampling, nbest_size,
                                alpha, add_bos, add_eos, reverse, emit_unk_piece)
  if out_type == 'serialized_proto' or out_type == 'proto':
    return self._EncodeAsSerializedProto(input, enable_sampling, nbest_size,
                                         alpha, add_bos, add_eos, reverse, emit_unk_piece)
  if out_type == 'immutable_proto':
    return self._EncodeAsImmutableProto(input, enable_sampling, nbest_size,
                                        alpha, add_bos, add_eos, reverse, emit_unk_piece)

  # Fix: removed an unreachable `return None` that followed this raise.
  raise RuntimeError('unknown out_type={}'.format(out_type))
873
+
874
+
875
  # Thin convenience wrappers around Encode(): each pins out_type (and, for
  # the Sample* variants, enable_sampling=True) and forwards everything else.

  def EncodeAsPieces(self, input, **kwargs):
    return self.Encode(input=input, out_type=str, **kwargs)

  def EncodeAsIds(self, input, **kwargs):
    return self.Encode(input=input, out_type=int, **kwargs)

  def EncodeAsSerializedProto(self, input, **kwargs):
    return self.Encode(input=input, out_type='serialized_proto', **kwargs)

  def EncodeAsImmutableProto(self, input, **kwargs):
    return self.Encode(input=input, out_type='immutable_proto', **kwargs)

  def SampleEncodeAsPieces(self, input, nbest_size=None, alpha=None, **kwargs):
    return self.Encode(input=input, nbest_size=nbest_size, alpha=alpha,
                       out_type=str, enable_sampling=True, **kwargs)

  def SampleEncodeAsIds(self, input, nbest_size=None, alpha=None,**kwargs):
    return self.Encode(input=input, nbest_size=nbest_size, alpha=alpha,
                       out_type=int, enable_sampling=True, **kwargs)

  def SampleEncodeAsSerializedProto(self, input, nbest_size=None, alpha=None, **kwargs):
    return self.Encode(input=input, nbest_size=nbest_size, alpha=alpha,
                       out_type='serialized_proto', enable_sampling=True, **kwargs)

  def SampleEncodeAsImmutableProto(self, input, nbest_size=None, alpha=None, **kwargs):
    return self.Encode(input=input, nbest_size=nbest_size, alpha=alpha,
                       out_type='immutable_proto', enable_sampling=True, **kwargs)
909
+
910
+
911
  def NBestEncode(self,
                  input,
                  out_type=None,
                  add_bos=None,
                  add_eos=None,
                  reverse=None,
                  emit_unk_piece=None,
                  nbest_size=None):
    """NBestEncode text input to segmented ids or tokens.

    Args:
      input: input string. Accepts a list of strings.
      out_type: output type. int or str.
      add_bos: Add <s> to the result (Default = false)
      add_eos: Add </s> to the result (Default = false) <s>/</s> is added after reversing (if enabled).
      reverse: Reverses the tokenized sequence (Default = false)
      emit_unk_piece: Emits the unk literal string (Default = false)
      nbest_size: nbest size
    """

    # Fall back to processor-level defaults captured in Init().
    if out_type is None:
      out_type = self._out_type
    if add_bos is None:
      add_bos = self._add_bos
    if add_eos is None:
      add_eos = self._add_eos
    if reverse is None:
      reverse = self._reverse
    if emit_unk_piece is None:
      emit_unk_piece = self._emit_unk_piece
    if nbest_size is None:
      nbest_size = self._nbest_size

    # A non-positive nbest_size degenerates to the single best segmentation.
    if nbest_size <= 0:
      nbest_size=1

    def _encode(text):
      # Dispatch on the requested output representation.
      if out_type is int:
        return self._NBestEncodeAsIds(text, nbest_size,
                                      add_bos, add_eos, reverse, emit_unk_piece)
      if out_type is str:
        return self._NBestEncodeAsPieces(text, nbest_size,
                                         add_bos, add_eos, reverse, emit_unk_piece)
      if out_type == 'serialized_proto' or out_type == 'proto':
        return self._NBestEncodeAsSerializedProto(text, nbest_size,
                                                  add_bos, add_eos, reverse, emit_unk_piece)
      if out_type == 'immutable_proto':
        return self._NBestEncodeAsImmutableProto(text, nbest_size,
                                                 add_bos, add_eos, reverse, emit_unk_piece)

      raise RuntimeError('unknown out_type')

    # Batch inputs are handled sequentially, one _encode per item.
    if type(input) is list:
      return [_encode(n) for n in input]

    return _encode(input)
967
+
968
+
969
  # Thin convenience wrappers around NBestEncode(), one per output type.

  def NBestEncodeAsPieces(self, input, nbest_size=None, **kwargs):
    return self.NBestEncode(input=input, nbest_size=nbest_size,
                            out_type=str, **kwargs)

  def NBestEncodeAsIds(self, input, nbest_size=None, **kwargs):
    return self.NBestEncode(input=input, nbest_size=nbest_size,
                            out_type=int, **kwargs)

  def NBestEncodeAsSerializedProto(self, input, nbest_size=None, **kwargs):
    return self.NBestEncode(input=input, nbest_size=nbest_size,
                            out_type='serialized_proto', **kwargs)

  def NBestEncodeAsImmutableProto(self, input, nbest_size=None, **kwargs):
    return self.NBestEncode(input=input, nbest_size=nbest_size,
                            out_type='immutable_proto', **kwargs)
987
+
988
+
989
def SampleEncodeAndScore(self,
                         input,
                         out_type=None,
                         add_bos=None,
                         add_eos=None,
                         reverse=None,
                         emit_unk_piece=None,
                         num_samples=None,
                         alpha=None,
                         wor=None,
                         include_best=None):
  """SampleEncodeAndScore text input to segmented ids or tokens.

  Args:
    input: input string. Accepts a list of strings.
    out_type: output type. int or str or 'serialized_proto' or 'immutable_proto'
    add_bos: Add <s> to the result (Default = false)
    add_eos: Add </s> to the result (Default = false) <s>/</s> is added after reversing (if enabled).
    reverse: Reverses the tokenized sequence (Default = false)
    emit_unk_piece: Emits the unk literal string (Default = false)
    num_samples: How many samples to return (Default = 1)
    alpha: inverse temperature for sampling
    wor: whether to sample without replacement (Default = false)
    include_best: whether to include the best tokenization, requires wor=True (Default = false)

  Raises:
    RuntimeError: if num_samples is not positive, include_best is requested
      without wor, or out_type is not recognized.
  """

  # Fall back to processor-level defaults captured in Init().
  if out_type is None:
    out_type = self._out_type
  if add_bos is None:
    add_bos = self._add_bos
  if add_eos is None:
    add_eos = self._add_eos
  if reverse is None:
    reverse = self._reverse
  if emit_unk_piece is None:
    emit_unk_piece = self._emit_unk_piece
  # Sampling-specific arguments use fixed fallbacks rather than
  # processor-level defaults.
  if num_samples is None:
    num_samples = 1
  if alpha is None:
    alpha = 1.
  if wor is None:
    wor = False
  if include_best is None:
    include_best = False

  if num_samples <= 0:
    # Fix: the message used to say 'num_examples', which is not the name of
    # any parameter of this method.
    raise RuntimeError('num_samples must be positive')

  if include_best and not wor:
    raise RuntimeError('When include_best is True, We must specify "wor = True".')

  def _encode(text):
    # Dispatch on the requested output representation.
    if out_type is int:
      return self._SampleEncodeAndScoreAsIds(text, num_samples, alpha, wor, include_best,
                                             add_bos, add_eos, reverse, emit_unk_piece)
    if out_type is str:
      return self._SampleEncodeAndScoreAsPieces(text, num_samples, alpha, wor, include_best,
                                                add_bos, add_eos, reverse, emit_unk_piece)

    if out_type == 'serialized_proto' or out_type == 'proto':
      return self._SampleEncodeAndScoreAsSerializedProto(text, num_samples, alpha, wor, include_best,
                                                         add_bos, add_eos, reverse, emit_unk_piece)

    if out_type == 'immutable_proto':
      return self._SampleEncodeAndScoreAsImmutableProto(text, num_samples, alpha, wor, include_best,
                                                        add_bos, add_eos, reverse, emit_unk_piece)

    raise RuntimeError('unknown output type')

  # Batch inputs are handled sequentially, one _encode per item.
  if type(input) is list:
    return [_encode(n) for n in input]

  return _encode(input)
1064
+
1065
+
1066
  # Thin convenience wrappers around SampleEncodeAndScore(), one per output
  # type.

  def SampleEncodeAndScoreAsPieces(self, input, num_samples=None, alpha=None, **kwargs):
    return self.SampleEncodeAndScore(input=input, num_samples=num_samples, alpha=alpha,
                                     out_type=str, **kwargs)

  def SampleEncodeAndScoreAsIds(self, input, num_samples=None, alpha=None, **kwargs):
    return self.SampleEncodeAndScore(input=input, num_samples=num_samples, alpha=alpha,
                                     out_type=int, **kwargs)

  def SampleEncodeAndScoreAsSerializedProto(self, input, num_samples=None, alpha=None, **kwargs):
    return self.SampleEncodeAndScore(input=input, num_samples=num_samples, alpha=alpha,
                                     out_type='serialized_proto', **kwargs)

  def SampleEncodeAndScoreAsImmutableProto(self, input, num_samples=None, alpha=None, **kwargs):
    return self.SampleEncodeAndScore(input=input, num_samples=num_samples, alpha=alpha,
                                     out_type='immutable_proto', **kwargs)
1084
+
1085
+
1086
+ def Decode(self, input, out_type=str, num_threads=None):
1087
+ """Decode processed id or token sequences.
1088
+
1089
+ Args:
1090
+ out_type: output type. str, bytes or 'serialized_proto' or 'immutable_proto' (Default = str)
1091
+ num_threads: the number of threads used in the batch processing (Default = -1).
1092
+ """
1093
+
1094
+ if num_threads is None:
1095
+ num_threads = self._num_threads
1096
+
1097
+ if num_threads is None or type(num_threads) is not int:
1098
+ raise RuntimeError('num_threads must be int')
1099
+
1100
+ if not input:
1101
+ return ''
1102
+
1103
+ if out_type is str:
1104
+ if type(input) is int:
1105
+ return self._DecodeIds([input])
1106
+ if type(input) is str:
1107
+ return self._DecodePieces([input])
1108
+
1109
+ if type(input) is list:
1110
+ if len(input) == 0 or type(input[0]) is int:
1111
+ return self._DecodeIds(input)
1112
+ if type(input[0]) is str:
1113
+ return self._DecodePieces(input)
1114
+
1115
+ if type(input[0]) is list:
1116
+ if len(input[0]) == 0 or type(input[0][0]) is int:
1117
+ return self._DecodeIdsBatch(input, num_threads)
1118
+ if type(input[0][0]) is str:
1119
+ return self._DecodePiecesBatch(input, num_threads)
1120
+
1121
+ if out_type is bytes:
1122
+ if type(input) is int:
1123
+ return self._DecodeIdsAsBytes([input])
1124
+ if type(input) is str:
1125
+ return self._DecodePieces([input])
1126
+
1127
+ if type(input) is list:
1128
+ if len(input) == 0 or type(input[0]) is int:
1129
+ return self._DecodeIdsAsBytes(input)
1130
+ if type(input[0]) is str:
1131
+ return self._DecodePieces(input)
1132
+
1133
+ if type(input[0]) is list:
1134
+ if len(input[0]) == 0 or type(input[0][0]) is int:
1135
+ return self._DecodeIdsAsBytesBatch(input, num_threads)
1136
+ if type(input[0][0]) is str:
1137
+ return self._DecodePiecesBatch(input, num_threads)
1138
+
1139
+ if out_type == 'serialized_proto':
1140
+ if type(input) is int:
1141
+ return self._DecodeIdsAsSerializedProto([input])
1142
+ if type(input) is str:
1143
+ return self._DecodePiecesAsSerializedProto([input])
1144
+
1145
+ if type(input) is list:
1146
+ if len(input) == 0 or type(input[0]) is int:
1147
+ return self._DecodeIdsAsSerializedProto(input)
1148
+ if type(input[0]) is str:
1149
+ return self._DecodePiecesAsSerializedProto(input)
1150
+
1151
+ if type(input[0]) is list:
1152
+ if len(input[0]) == 0 or type(input[0][0]) is int:
1153
+ return self._DecodeIdsAsSerializedProtoBatch(input, num_threads)
1154
+ if type(input[0][0]) is str:
1155
+ return self._DecodePiecesAsSerializedProtoBatch(input, num_threads)
1156
+
1157
+
1158
+ if out_type == 'immutable_proto':
1159
+ if type(input) is int:
1160
+ return self._DecodeIdsAsImmutableProto([input])
1161
+ if type(input) is str:
1162
+ return self._DecodePiecesAsImmutableProto([input])
1163
+
1164
+ if type(input) is list:
1165
+ if len(input) == 0 or type(input[0]) is int:
1166
+ return self._DecodeIdsAsImmutableProto(input)
1167
+ if type(input[0]) is str:
1168
+ return self._DecodePiecesAsImmutableProto(input)
1169
+
1170
+ if type(input[0]) is list:
1171
+ if len(input[0]) == 0 or type(input[0][0]) is int:
1172
+ return self._DecodeIdsAsImmutableProtoBatch(input, num_threads)
1173
+ if type(input[0][0]) is str:
1174
+ return self._DecodePiecesAsImmutableProtoBatch(input, num_threads)
1175
+
1176
+
1177
+ raise RuntimeError('unknown output or input type')
1178
+ return None
1179
+
1180
+
1181
+ def DecodePieces(self, input, out_type=str, **kwargs):
1182
+ return self.Decode(input=input, out_type=out_type, **kwargs)
1183
+
1184
+
1185
+ def DecodeIds(self, input, out_type=str, **kwargs):
1186
+ return self.Decode(input=input, out_type=out_type, **kwargs)
1187
+
1188
+
1189
+ def DecodePiecesAsSerializedProto(self, input, out_type='serialized_proto', **kwargs):
1190
+ return self.Decode(input=input, out_type=out_type, **kwargs)
1191
+
1192
+
1193
+ def DecodeIdsAsSerializedProto(self, input, out_type='serialized_proto', **kwargs):
1194
+ return self.Decode(input=input, out_type=out_type, **kwargs)
1195
+
1196
+
1197
+ def DecodePiecesAsImmutableProto(self, input, out_type='immutable_proto', **kwargs):
1198
+ return self.Decode(input=input, out_type=out_type, **kwargs)
1199
+
1200
+
1201
+ def DecodeIdsAsImmutableProto(self, input, out_type='immutable_proto', **kwargs):
1202
+ return self.Decode(input=input, out_type=out_type, **kwargs)
1203
+
1204
+
1205
+ def CalculateEntropy(self, input, alpha, num_threads=None):
1206
+ """Calculate sentence entropy"""
1207
+ if type(input) is list:
1208
+ if num_threads is None:
1209
+ num_threads = self._num_threads
1210
+ if num_threads is None or type(num_threads) is not int:
1211
+ raise RuntimeError('num_threads must be int')
1212
+ return self._CalculateEntropyBatch(input, alpha, num_threads)
1213
+
1214
+ return self._CalculateEntropy(input, alpha)
1215
+
1216
+
1217
+ def Normalize(self, input, with_offsets=None):
1218
+ def _normalize(text):
1219
+ if with_offsets:
1220
+ return self._NormalizeWithOffsets(text)
1221
+ return self._Normalize(text)
1222
+
1223
+ if type(input) is list:
1224
+ return [_normalize(x) for x in input]
1225
+ return _normalize(input)
1226
+
1227
+ def OverrideNormalizerSpec(self, **kwargs):
1228
+ new_kwargs = {}
1229
+ for key, value in kwargs.items():
1230
+ new_kwargs[key] = str(value)
1231
+ return self._OverrideNormalizerSpec(new_kwargs)
1232
+
1233
+
1234
+ def piece_size(self):
1235
+ return self.GetPieceSize()
1236
+
1237
+
1238
+ def vocab_size(self):
1239
+ return self.GetPieceSize()
1240
+
1241
+
1242
+ def __getstate__(self):
1243
+ return self.serialized_model_proto()
1244
+
1245
+
1246
+ def __setstate__(self, serialized_model_proto):
1247
+ self.__init__()
1248
+ self.LoadFromSerializedProto(serialized_model_proto)
1249
+
1250
+
1251
+ def __len__(self):
1252
+ return self.GetPieceSize()
1253
+
1254
+
1255
+ def __getitem__(self, piece):
1256
+ return self.PieceToId(piece)
1257
+
1258
+
1259
+ def Load(self, model_file=None, model_proto=None):
1260
+ """Overwride SentencePieceProcessor.Load to support both model_file and model_proto.
1261
+
1262
+ Args:
1263
+ model_file: The sentencepiece model file path.
1264
+ model_proto: The sentencepiece model serialized proto. Either `model_file`
1265
+ or `model_proto` must be set.
1266
+ """
1267
+ if model_file and model_proto:
1268
+ raise RuntimeError('model_file and model_proto must be exclusive.')
1269
+ if model_proto:
1270
+ return self.LoadFromSerializedProto(model_proto)
1271
+ return self.LoadFromFile(model_file)
1272
+ }
1273
+ }
1274
+
1275
+ %extend sentencepiece::SentencePieceTrainer {
1276
+ static void _TrainFromString(absl::string_view arg) {
1277
+ const auto _status = sentencepiece::SentencePieceTrainer::Train(arg);
1278
+ if (!_status.ok()) throw _status;
1279
+ return;
1280
+ }
1281
+
1282
+ static void _TrainFromMap(const std::unordered_map<std::string, std::string> &args) {
1283
+ const auto _status = sentencepiece::SentencePieceTrainer::Train(args);
1284
+ if (!_status.ok()) throw _status;
1285
+ return;
1286
+ }
1287
+
1288
+ static void _TrainFromMap2(const std::unordered_map<std::string, std::string> &args,
1289
+ SentenceIterator *iter) {
1290
+ const auto _status = sentencepiece::SentencePieceTrainer::Train(args, iter);
1291
+ if (!_status.ok()) throw _status;
1292
+ return;
1293
+ }
1294
+
1295
+ static sentencepiece::util::bytes _TrainFromMap3(const std::unordered_map<std::string, std::string> &args) {
1296
+ sentencepiece::util::bytes model_proto;
1297
+ const auto _status = sentencepiece::SentencePieceTrainer::Train(args, nullptr, &model_proto);
1298
+ if (!_status.ok()) throw _status;
1299
+ return model_proto;
1300
+ }
1301
+
1302
+ static sentencepiece::util::bytes _TrainFromMap4(const std::unordered_map<std::string, std::string> &args,
1303
+ SentenceIterator *iter) {
1304
+ sentencepiece::util::bytes model_proto;
1305
+ const auto _status = sentencepiece::SentencePieceTrainer::Train(args, iter, &model_proto);
1306
+ if (!_status.ok()) throw _status;
1307
+ return model_proto;
1308
+ }
1309
+
1310
+ %pythoncode {
1311
+ @staticmethod
1312
+ def _Train(arg=None, **kwargs):
1313
+ """Train Sentencepiece model. Accept both kwargs and legacy string arg."""
1314
+ if arg is not None and type(arg) is str:
1315
+ return SentencePieceTrainer._TrainFromString(arg)
1316
+
1317
+ def _encode(value):
1318
+ """Encode value to CSV.."""
1319
+ if type(value) is list:
1320
+ if sys.version_info[0] == 3:
1321
+ f = StringIO()
1322
+ else:
1323
+ f = BytesIO()
1324
+ writer = csv.writer(f, lineterminator='')
1325
+ writer.writerow([str(v) for v in value])
1326
+ return f.getvalue()
1327
+ else:
1328
+ return str(value)
1329
+
1330
+ sentence_iterator = None
1331
+ model_writer = None
1332
+ new_kwargs = {}
1333
+ for key, value in kwargs.items():
1334
+ if key in ['sentence_iterator', 'sentence_reader']:
1335
+ sentence_iterator = value
1336
+ elif key in ['model_writer']:
1337
+ model_writer = value
1338
+ else:
1339
+ new_kwargs[key] = _encode(value)
1340
+
1341
+ if model_writer:
1342
+ if sentence_iterator:
1343
+ model_proto = SentencePieceTrainer._TrainFromMap4(new_kwargs,
1344
+ sentence_iterator)
1345
+ else:
1346
+ model_proto = SentencePieceTrainer._TrainFromMap3(new_kwargs)
1347
+ model_writer.write(model_proto)
1348
+ else:
1349
+ if sentence_iterator:
1350
+ return SentencePieceTrainer._TrainFromMap2(new_kwargs, sentence_iterator)
1351
+ else:
1352
+ return SentencePieceTrainer._TrainFromMap(new_kwargs)
1353
+
1354
+ return None
1355
+
1356
+ @staticmethod
1357
+ def Train(arg=None, logstream=None, **kwargs):
1358
+ with _LogStream(ostream=logstream):
1359
+ SentencePieceTrainer._Train(arg=arg, **kwargs)
1360
+ }
1361
+ }
1362
+
1363
+ %extend sentencepiece::SentencePieceNormalizer {
1364
+ sentencepiece::util::Status LoadFromFile(absl::string_view arg) {
1365
+ return $self->Load(arg);
1366
+ }
1367
+
1368
+ std::string _Normalize(absl::string_view text) {
1369
+ std::string result;
1370
+ const auto _status = $self->Normalize(text, &result);
1371
+ if (!_status.ok()) throw _status;
1372
+ return result;
1373
+ }
1374
+
1375
+ std::pair<std::string, std::vector<size_t>> _NormalizeWithOffsets(absl::string_view text) {
1376
+ std::pair<std::string, std::vector<size_t>> result;
1377
+ const auto _status = $self->Normalize(text, &result.first, &result.second);
1378
+ if (!_status.ok()) throw _status;
1379
+ return result;
1380
+ }
1381
+
1382
+ void _SetProtoField(absl::string_view name, bool value) {
1383
+ sentencepiece::SentencePieceTrainer::SetProtoField(
1384
+ name,
1385
+ value ? "1" : "0",
1386
+ $self->mutable_normalizer_spec()).IgnoreError();
1387
+ }
1388
+
1389
+ %pythoncode %{
1390
+ def Init(self,
1391
+ model_file=None,
1392
+ model_proto=None,
1393
+ rule_tsv=None,
1394
+ rule_name=None,
1395
+ add_dummy_prefix=False,
1396
+ escape_whitespaces=False,
1397
+ remove_extra_whitespaces=False):
1398
+ """Initialzie sentencePieceNormalizer.
1399
+
1400
+ Args:
1401
+ model_file: The sentencepiece model file path.
1402
+ model_proto: The sentencepiece model serialized proto.
1403
+ rule_tsv: The normalization rule file in TSV format.
1404
+ rule_name: Pre-defined normalization name.
1405
+ add_dummy_prefix: add dummy prefix.
1406
+ escape_whitespaces: escape whitespaces.
1407
+ remove_extra_whitespaces: remove extra whitespaces.
1408
+ """
1409
+
1410
+ _sentencepiece_normalizer_init_native(self)
1411
+
1412
+ if model_file:
1413
+ status = self.LoadFromFile(model_file)
1414
+ elif model_proto:
1415
+ status = self.LoadFromSerializedProto(model_proto)
1416
+ elif rule_tsv:
1417
+ status = self.LoadFromRuleTSV(rule_tsv)
1418
+ elif rule_name:
1419
+ status = self.LoadFromRuleName(rule_name)
1420
+ else:
1421
+ raise RuntimeError('no model is specified')
1422
+
1423
+ if status:
1424
+ self._SetProtoField('add_dummy_prefix', add_dummy_prefix)
1425
+ self._SetProtoField('escape_whitespaces', escape_whitespaces)
1426
+ self._SetProtoField('remove_extra_whitespaces', remove_extra_whitespaces)
1427
+
1428
+ def Normalize(self, input, with_offsets=None):
1429
+ def _normalize(text):
1430
+ if with_offsets:
1431
+ return self._NormalizeWithOffsets(text)
1432
+ return self._Normalize(text)
1433
+
1434
+ if type(input) is list:
1435
+ return [_normalize(x) for x in input]
1436
+ return _normalize(input)
1437
+
1438
+
1439
+ def __getstate__(self):
1440
+ return self.serialized_model_proto()
1441
+
1442
+
1443
+ def __setstate__(self, serialized_model_proto):
1444
+ self.__init__()
1445
+ self.LoadFromSerializedProto(serialized_model_proto)
1446
+ %}
1447
+ }
1448
+
1449
+ %extend sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece {
1450
+ const sentencepiece::util::bytes& _surface_as_bytes() const {
1451
+ return $self->surface();
1452
+ }
1453
+
1454
+ const sentencepiece::util::bytes& _piece_as_bytes() const {
1455
+ return $self->piece();
1456
+ }
1457
+
1458
+ %rename(_piece) piece;
1459
+ %rename(_piece_as_bytes) piece_as_bytes;
1460
+ %rename(_id) id;
1461
+ %rename(_surface) surface;
1462
+ %rename(_surface_as_bytes) surface_as_bytes;
1463
+ %rename(_begin) begin;
1464
+ %rename(_end) end;
1465
+
1466
+ %pythoncode %{
1467
+ piece = property(_piece)
1468
+ piece_as_bytes = property(_piece_as_bytes)
1469
+ surface = property(_surface)
1470
+ surface_as_bytes = property(_surface_as_bytes)
1471
+ id = property(_id)
1472
+ begin = property(_begin)
1473
+ end = property(_end)
1474
+
1475
+ def __str__(self):
1476
+ return ('piece: \"{}\"\n'
1477
+ 'id: {}\n'
1478
+ 'surface: \"{}\"\n'
1479
+ 'begin: {}\n'
1480
+ 'end: {}\n').format(self.piece, self.id, self.surface,
1481
+ self.begin, self.end)
1482
+
1483
+ def __eq__(self, other):
1484
+ return self.piece == other.piece and self.id == other.id and self.surface == other.surface and self.begin == other.begin and self.end == other.end
1485
+
1486
+ def __hash__(self):
1487
+ return hash(str(self))
1488
+
1489
+ __repr__ = __str__
1490
+ %}
1491
+ }
1492
+
1493
+ %extend sentencepiece::ImmutableSentencePieceText {
1494
+ const sentencepiece::util::bytes& _text_as_bytes() const {
1495
+ return $self->text();
1496
+ }
1497
+
1498
+ %rename(_text) text;
1499
+ %rename(_text_as_bytes) text_as_bytes;
1500
+ %rename(_score) score;
1501
+ %rename(_pieces) pieces;
1502
+ %rename(_pieces_size) pieces_size;
1503
+
1504
+ %pythoncode %{
1505
+ text = property(_text)
1506
+ text_as_bytes = property(_text_as_bytes)
1507
+ score = property(_score)
1508
+
1509
+ class ImmutableSentencePieceIterator:
1510
+ def __init__(self, proto):
1511
+ self.proto = proto
1512
+ self.len = self.proto._pieces_size()
1513
+
1514
+ def __len__(self):
1515
+ return self.len
1516
+
1517
+ def __getitem__(self, index):
1518
+ if isinstance(index, slice):
1519
+ return [self.proto._pieces(i) for i in range(self.len)][index.start:index.stop:index.step]
1520
+ if index < 0:
1521
+ index = index + self.len
1522
+ if index < 0 or index >= self.len:
1523
+ raise IndexError('piece index is out of range')
1524
+ return self.proto._pieces(index)
1525
+
1526
+ def __str__(self):
1527
+ return '\n'.join(['pieces {{\n{}}}'.format(str(x)) for x in self])
1528
+
1529
+ __repr__ = __str__
1530
+
1531
+ @property
1532
+ def pieces(self):
1533
+ return ImmutableSentencePieceText.ImmutableSentencePieceIterator(self)
1534
+
1535
+ def __eq__(self, other):
1536
+ return self.SerializeAsString() == other.SerializeAsString()
1537
+
1538
+ def __hash__(self):
1539
+ return hash(self.SerializeAsString())
1540
+
1541
+ def __str__(self):
1542
+ return ('text: \"{}\"\n'
1543
+ 'score: {}\n'
1544
+ '{}').format(self.text, self.score,
1545
+ '\n'.join(['pieces {{\n{}}}'.format(str(x)) for x in self.pieces]))
1546
+
1547
+ __repr__ = __str__
1548
+ %}
1549
+ }
1550
+
1551
+ %extend sentencepiece::ImmutableNBestSentencePieceText {
1552
+ %rename(_nbests) nbests;
1553
+ %rename(_nbests_size) nbests_size;
1554
+
1555
+ %pythoncode %{
1556
+ class ImmutableSentencePieceTextIterator:
1557
+ def __init__(self, proto):
1558
+ self.proto = proto
1559
+ self.len = self.proto._nbests_size()
1560
+
1561
+ def __len__(self):
1562
+ return self.len
1563
+
1564
+ def __getitem__(self, index):
1565
+ if isinstance(index, slice):
1566
+ return [self.proto._nbests(i) for i in range(self.len)][index.start:index.stop:index.step]
1567
+ if index < 0:
1568
+ index = index + self.len
1569
+ if index < 0 or index >= self.len:
1570
+ raise IndexError('nbests index is out of range')
1571
+ return self.proto._nbests(index)
1572
+
1573
+ def __str__(self):
1574
+ return '\n'.join(['nbests {{\n{}}}'.format(str(x)) for x in self])
1575
+
1576
+ __repr__ = __str__
1577
+
1578
+ @property
1579
+ def nbests(self):
1580
+ return ImmutableNBestSentencePieceText.ImmutableSentencePieceTextIterator(self)
1581
+
1582
+ def __eq__(self, other):
1583
+ return self.SerializeAsString() == other.SerializeAsString()
1584
+
1585
+ def __hash__(self):
1586
+ return hash(self.SerializeAsString())
1587
+
1588
+ def __str__(self):
1589
+ return '\n'.join(['nbests {{\n{}}}'.format(str(x)) for x in self.nbests])
1590
+
1591
+ __repr__ = __str__
1592
+ %}
1593
+ }
1594
+
1595
+ %typemap(out) std::vector<int> {
1596
+ $result = PyList_New($1.size());
1597
+ for (size_t i = 0; i < $1.size(); ++i) {
1598
+ PyList_SET_ITEM($result, i, PyInt_FromLong(static_cast<long>($1[i])));
1599
+ }
1600
+ }
1601
+
1602
+ %typemap(out) std::vector<float> {
1603
+ $result = PyList_New($1.size());
1604
+ for (size_t i = 0; i < $1.size(); ++i) {
1605
+ PyList_SET_ITEM($result, i, PyFloat_FromDouble(static_cast<double>($1[i])));
1606
+ }
1607
+ }
1608
+
1609
+ %typemap(out) std::vector<std::vector<int>> {
1610
+ $result = PyList_New($1.size());
1611
+ for (size_t i = 0; i < $1.size(); ++i) {
1612
+ PyObject *obj = PyList_New($1[i].size());
1613
+ for (size_t j = 0; j < $1[i].size(); ++j) {
1614
+ PyList_SET_ITEM(obj, j, PyInt_FromLong(static_cast<long>($1[i][j])));
1615
+ }
1616
+ PyList_SET_ITEM($result, i, obj);
1617
+ }
1618
+ }
1619
+
1620
+ %typemap(out) std::vector<std::string> {
1621
+ PyObject *input_type = resultobj;
1622
+ $result = PyList_New($1.size());
1623
+ for (size_t i = 0; i < $1.size(); ++i) {
1624
+ PyList_SET_ITEM($result, i, MakePyOutputString($1[i], input_type));
1625
+ }
1626
+ }
1627
+
1628
+ %typemap(out) BytesArray {
1629
+ $result = PyList_New($1.size());
1630
+ for (size_t i = 0; i < $1.size(); ++i) {
1631
+ PyList_SET_ITEM($result, i, MakePyOutputBytes($1[i]));
1632
+ }
1633
+ }
1634
+
1635
+ %typemap(out) std::vector<std::vector<std::string>> {
1636
+ PyObject *input_type = resultobj;
1637
+ $result = PyList_New($1.size());
1638
+ for (size_t i = 0; i < $1.size(); ++i) {
1639
+ PyObject *obj = PyList_New($1[i].size());
1640
+ for (size_t j = 0; j < $1[i].size(); ++j) {
1641
+ PyList_SET_ITEM(obj, j, MakePyOutputString($1[i][j], input_type));
1642
+ }
1643
+ PyList_SET_ITEM($result, i, obj);
1644
+ }
1645
+ }
1646
+
1647
+ %typemap(out) sentencepiece::util::bytes {
1648
+ $result = MakePyOutputBytes($1);
1649
+ }
1650
+
1651
+ %typemap(out) const sentencepiece::util::bytes& {
1652
+ $result = MakePyOutputBytes(*$1);
1653
+ }
1654
+
1655
+ %typemap(out) std::string {
1656
+ PyObject *input_type = resultobj;
1657
+ $result = MakePyOutputString($1, input_type);
1658
+ }
1659
+
1660
+ %typemap(out) const std::string& {
1661
+ PyObject *input_type = resultobj;
1662
+ $result = MakePyOutputString(*$1, input_type);
1663
+ }
1664
+
1665
+ %typemap(out) sentencepiece::util::Status {
1666
+ if (!$1.ok()) {
1667
+ SWIG_exception(ToSwigError($1.code()), $1.ToString().c_str());
1668
+ }
1669
+ $result = SWIG_From_bool($1.ok());}
1670
+
1671
+
1672
+ %typemap(in) const std::string & {
1673
+ const PyInputString ustring($input);
1674
+ if (!ustring.IsAvalable()) {
1675
+ PyErr_SetString(PyExc_TypeError, "not a string");
1676
+ SWIG_fail;
1677
+ }
1678
+ resultobj = ustring.input_type();
1679
+ $1 = new std::string(ustring.data(), ustring.size());
1680
+ }
1681
+
1682
+ %typemap(typecheck) absl::string_view = char *;
1683
+
1684
+ %typemap(in) absl::string_view {
1685
+ const PyInputString ustring($input);
1686
+ if (!ustring.IsAvalable()) {
1687
+ PyErr_SetString(PyExc_TypeError, "not a string");
1688
+ SWIG_fail;
1689
+ }
1690
+ resultobj = ustring.input_type();
1691
+ $1 = ustring.str();
1692
+ }
1693
+
1694
+ %typemap(in) const std::vector<absl::string_view>& {
1695
+ std::vector<absl::string_view> *out = nullptr;
1696
+ if (PyList_Check($input)) {
1697
+ const size_t size = PyList_Size($input);
1698
+ out = new std::vector<absl::string_view>(size);
1699
+ for (size_t i = 0; i < size; ++i) {
1700
+ const PyInputString ustring(PyList_GetItem($input, i));
1701
+ if (ustring.IsAvalable()) {
1702
+ (*out)[i] = ustring.str();
1703
+ } else {
1704
+ PyErr_SetString(PyExc_TypeError, "list must contain strings");
1705
+ SWIG_fail;
1706
+ }
1707
+ resultobj = ustring.input_type();
1708
+ }
1709
+ } else {
1710
+ PyErr_SetString(PyExc_TypeError, "not a list");
1711
+ SWIG_fail;
1712
+ }
1713
+ $1 = out;
1714
+ }
1715
+
1716
+ %typemap(in) const std::vector<int>& {
1717
+ std::vector<int> *out = nullptr;
1718
+ if (PyList_Check($input)) {
1719
+ const size_t size = PyList_Size($input);
1720
+ out = new std::vector<int>(size);
1721
+ for (size_t i = 0; i < size; ++i) {
1722
+ PyObject *o = PyList_GetItem($input, i);
1723
+ if (PyInt_Check(o)) {
1724
+ (*out)[i] = static_cast<int>(PyInt_AsLong(o));
1725
+ } else {
1726
+ PyErr_SetString(PyExc_TypeError,"list must contain integers");
1727
+ SWIG_fail;
1728
+ }
1729
+ }
1730
+ } else {
1731
+ PyErr_SetString(PyExc_TypeError,"not a list");
1732
+ SWIG_fail;
1733
+ }
1734
+ $1 = out;
1735
+ }
1736
+
1737
+ %typemap(in) const std::vector<std::vector<absl::string_view>>& {
1738
+ std::vector<std::vector<absl::string_view>> *out = nullptr;
1739
+ if (PyList_Check($input)) {
1740
+ const size_t size = PyList_Size($input);
1741
+ out = new std::vector<std::vector<absl::string_view>>(size);
1742
+ for (size_t i = 0; i < size; ++i) {
1743
+ PyObject *o = PyList_GetItem($input, i);
1744
+ if (PyList_Check(o)) {
1745
+ const size_t size2 = PyList_Size(o);
1746
+ (*out)[i].resize(size2);
1747
+ for (size_t j = 0; j < size2; ++j) {
1748
+ const PyInputString ustring(PyList_GetItem(o, j));
1749
+ if (ustring.IsAvalable()) {
1750
+ (*out)[i][j] = ustring.str();
1751
+ } else {
1752
+ PyErr_SetString(PyExc_TypeError,"list must contain integers");
1753
+ SWIG_fail;
1754
+ }
1755
+ resultobj = ustring.input_type();
1756
+ }
1757
+ } else {
1758
+ PyErr_SetString(PyExc_TypeError,"not a list");
1759
+ SWIG_fail;
1760
+ }
1761
+ }
1762
+ } else {
1763
+ PyErr_SetString(PyExc_TypeError,"not a list");
1764
+ SWIG_fail;
1765
+ }
1766
+ $1 = out;
1767
+ }
1768
+
1769
+ %typemap(in) const std::vector<std::vector<int>>& {
1770
+ std::vector<std::vector<int>> *out = nullptr;
1771
+ if (PyList_Check($input)) {
1772
+ const size_t size = PyList_Size($input);
1773
+ out = new std::vector<std::vector<int>>(size);
1774
+ for (size_t i = 0; i < size; ++i) {
1775
+ PyObject *o = PyList_GetItem($input, i);
1776
+ if (PyList_Check(o)) {
1777
+ const size_t size2 = PyList_Size(o);
1778
+ (*out)[i].resize(size2);
1779
+ for (size_t j = 0; j < size2; ++j) {
1780
+ PyObject *o2 = PyList_GetItem(o, j);
1781
+ if (PyInt_Check(o2)) {
1782
+ (*out)[i][j] = static_cast<int>(PyInt_AsLong(o2));
1783
+ } else {
1784
+ PyErr_SetString(PyExc_TypeError, "list must contain strings");
1785
+ SWIG_fail;
1786
+ }
1787
+ }
1788
+ } else {
1789
+ PyErr_SetString(PyExc_TypeError, "not a list");
1790
+ SWIG_fail;
1791
+ }
1792
+ }
1793
+ } else {
1794
+ PyErr_SetString(PyExc_TypeError,"not a list");
1795
+ SWIG_fail;
1796
+ }
1797
+ $1 = out;
1798
+ }
1799
+
1800
+ %typemap(in) const std::unordered_map<std::string, std::string> & {
1801
+ std::unordered_map<std::string, std::string> *out = nullptr;
1802
+ if (PyDict_Check($input)) {
1803
+ PyObject *key, *value;
1804
+ Py_ssize_t pos = 0;
1805
+ out = new std::unordered_map<std::string, std::string>;
1806
+ while (PyDict_Next($input, &pos, &key, &value)) {
1807
+ const PyInputString key_ustring(key);
1808
+ const PyInputString value_ustring(value);
1809
+ if (key_ustring.IsAvalable() && value_ustring.IsAvalable()) {
1810
+ out->emplace(std::string(key_ustring.data(), key_ustring.size()),
1811
+ std::string(value_ustring.data(), value_ustring.size()));
1812
+ } else {
1813
+ PyErr_SetString(PyExc_TypeError, "map must contain strings.");
1814
+ SWIG_fail;
1815
+ }
1816
+ resultobj = key_ustring.input_type();
1817
+ }
1818
+ } else {
1819
+ PyErr_SetString(PyExc_TypeError, "not a dictionary");
1820
+ SWIG_fail;
1821
+ }
1822
+ $1 = out;
1823
+ }
1824
+
1825
+ %typemap(out) std::vector<std::pair<std::vector<std::string>, float>> {
1826
+ PyObject *input_type = resultobj;
1827
+ $result = PyList_New($1.size());
1828
+ for (size_t i = 0; i < $1.size(); ++i) {
1829
+ PyObject *obj = PyList_New($1[i].first.size());
1830
+ for (size_t j = 0; j < $1[i].first.size(); ++j) {
1831
+ PyList_SET_ITEM(obj, j, MakePyOutputString($1[i].first[j], input_type));
1832
+ }
1833
+ PyList_SET_ITEM($result, i, PyTuple_Pack(2, obj, PyFloat_FromDouble(static_cast<double>($1[i].second))));
1834
+ }
1835
+ }
1836
+
1837
+ %typemap(out) std::vector<std::pair<std::vector<int>, float>> {
1838
+ $result = PyList_New($1.size());
1839
+ for (size_t i = 0; i < $1.size(); ++i) {
1840
+ PyObject *obj = PyList_New($1[i].first.size());
1841
+ for (size_t j = 0; j < $1[i].first.size(); ++j) {
1842
+ PyList_SET_ITEM(obj, j, PyInt_FromLong(static_cast<long>($1[i].first[j])));
1843
+ }
1844
+ PyList_SET_ITEM($result, i, PyTuple_Pack(2, obj, PyFloat_FromDouble(static_cast<double>($1[i].second))));
1845
+ }
1846
+ }
1847
+
1848
+ %typemap(out) std::vector<sentencepiece::ImmutableSentencePieceText> {
1849
+ $result = PyList_New($1.size());
1850
+ for (size_t i = 0; i < $1.size(); ++i) {
1851
+ PyObject *obj = SWIG_NewPointerObj(new sentencepiece::ImmutableSentencePieceText($1.at(i)), SWIGTYPE_p_sentencepiece__ImmutableSentencePieceText, SWIG_POINTER_OWN | 0);
1852
+ PyList_SET_ITEM($result, i, obj);
1853
+ }
1854
+ }
1855
+
1856
+ // Types for normalized string and offset
1857
+ %typemap(out) std::pair<std::string, std::vector<size_t>> {
1858
+ PyObject *input_type = resultobj;
1859
+ if (PyInputString::IsUnicode(input_type)) {
1860
+ sentencepiece::ConvertToUnicodeAlignment(arg2, $1.first, &$1.second);
1861
+ }
1862
+ PyObject *obj = PyList_New($1.second.size());
1863
+ for (size_t i = 0; i < $1.second.size(); ++i) {
1864
+ PyList_SET_ITEM(obj, i, PyInt_FromLong(static_cast<long>($1.second[i])));
1865
+ }
1866
+ $result = PyTuple_Pack(2, MakePyOutputString($1.first, input_type), obj);
1867
+ }
1868
+
1869
+ %typemap(in) sentencepiece::SentenceIterator * {
1870
+ sentencepiece::SentenceIterator *out = nullptr;
1871
+ if (PyIter_Check($input)) {
1872
+ out = new PySentenceIterator($input);
1873
+ } else {
1874
+ PyErr_SetString(PyExc_TypeError, "not a iterator");
1875
+ SWIG_fail;
1876
+ }
1877
+ $1 = out;
1878
+ }
1879
+
1880
+ %typemap(freearg) const std::string& {
1881
+ delete $1;
1882
+ }
1883
+
1884
+ %typemap(freearg) const std::vector<std::string>& {
1885
+ delete $1;
1886
+ }
1887
+
1888
+ %typemap(freearg) const std::vector<absl::string_view>& {
1889
+ delete $1;
1890
+ }
1891
+
1892
+ %typemap(freearg) const std::vector<std::vector<std::string>>& {
1893
+ delete $1;
1894
+ }
1895
+
1896
+ %typemap(freearg) const std::vector<int>& {
1897
+ delete $1;
1898
+ }
1899
+
1900
+ %typemap(freearg) const std::vector<float>& {
1901
+ delete $1;
1902
+ }
1903
+
1904
+ %typemap(freearg) const std::vector<std::vector<int>>& {
1905
+ delete $1;
1906
+ }
1907
+
1908
+ %typemap(freearg) const std::unordered_map<std::string, std::string> & {
1909
+ delete $1;
1910
+ }
1911
+
1912
+ %typemap(freearg) sentencepiece::SentenceIterator * {
1913
+ delete $1;
1914
+ }
1915
+
1916
+ %typemap(freearg) sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece {
1917
+ delete $1;
1918
+ }
1919
+
1920
+ %typemap(freearg) sentencepiece::ImmutableSentencePieceText {
1921
+ delete $1;
1922
+ }
1923
+
1924
+ %typemap(freearg) sentencepiece::ImmutableNBestSentencePieceText {
1925
+ delete $1;
1926
+ }
1927
+
1928
+ %include <sentencepiece_processor.h>
1929
+ %include <sentencepiece_trainer.h>
1930
+
1931
+ %pythoncode %{
1932
+
1933
+ import re
1934
+ import csv
1935
+ import sys
1936
+ import os
1937
+ import importlib.resources
1938
+ from io import StringIO
1939
+ from io import BytesIO
1940
+
1941
+
1942
+ def _add_snake_case(classname):
1943
+ """Added snake_cased method from CammelCased method."""
1944
+
1945
+ snake_map = {}
1946
+ for k, v in classname.__dict__.items():
1947
+ if re.match(r'^[A-Z]+', k):
1948
+ snake = re.sub(r'(?<!^)(?=[A-Z])', '_',
1949
+ k).lower().replace('n_best', 'nbest')
1950
+ snake_map[snake] = v
1951
+ for k, v in snake_map.items():
1952
+ setattr(classname, k, v)
1953
+
1954
+
1955
+ def _batchnize(classname, name):
1956
+ """Enables batch request for the method classname.name."""
1957
+ func = getattr(classname, name, None)
1958
+ def _func(v, n):
1959
+ if type(n) is int and (n < 0 or n >= v.piece_size()):
1960
+ raise IndexError('piece id is out of range.')
1961
+ return func(v, n)
1962
+
1963
+ def _batched_func(self, arg):
1964
+ if type(arg) is list:
1965
+ return [_func(self, n) for n in arg]
1966
+ else:
1967
+ return _func(self, arg)
1968
+
1969
+ setattr(classname, name, _batched_func)
1970
+
1971
+
1972
+ _sentencepiece_processor_init_native = SentencePieceProcessor.__init__
1973
+ _sentencepiece_normalizer_init_native = SentencePieceNormalizer.__init__
1974
+ setattr(SentencePieceProcessor, '__init__', SentencePieceProcessor.Init)
1975
+ setattr(SentencePieceNormalizer, '__init__', SentencePieceNormalizer.Init)
1976
+
1977
+ SentencePieceProcessor.Tokenize = SentencePieceProcessor.Encode
1978
+ SentencePieceProcessor.Detokenize = SentencePieceProcessor.Decode
1979
+
1980
+ for m in [
1981
+ 'PieceToId', 'IdToPiece', 'GetScore', 'IsUnknown', 'IsControl', 'IsUnused',
1982
+ 'IsByte'
1983
+ ]:
1984
+ _batchnize(SentencePieceProcessor, m)
1985
+
1986
+ _add_snake_case(SentencePieceProcessor)
1987
+ _add_snake_case(SentencePieceTrainer)
1988
+ _add_snake_case(SentencePieceNormalizer)
1989
+ set_random_generator_seed = SetRandomGeneratorSeed
1990
+ set_min_log_level = SetMinLogLevel
1991
+
1992
+ from ._version import __version__
1993
+
1994
+ SetDataDir(os.path.join(str(importlib.resources.files('sentencepiece')), 'package_data'))
1995
+
1996
+ class _LogStream(object):
1997
+ def __init__(self, ostream=None):
1998
+ self.ostream = ostream
1999
+ if self.ostream is not None:
2000
+ self.orig_stream_fileno = sys.stderr.fileno()
2001
+
2002
+ def __enter__(self):
2003
+ if self.ostream is not None:
2004
+ self.orig_stream_dup = os.dup(self.orig_stream_fileno)
2005
+ os.dup2(self.ostream.fileno(), self.orig_stream_fileno)
2006
+
2007
+ def __exit__(self, type, value, traceback):
2008
+ if self.ostream is not None:
2009
+ os.close(self.orig_stream_fileno)
2010
+ os.dup2(self.orig_stream_dup, self.orig_stream_fileno)
2011
+ os.close(self.orig_stream_dup)
2012
+ self.ostream.close()
2013
+ %}
source/sentencepiece/sentencepiece_model_pb2.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ # Generated by the protocol buffer compiler. DO NOT EDIT!
3
+ # source: sentencepiece_model.proto
4
+ """Generated protocol buffer code."""
5
+ from google.protobuf.internal import builder as _builder
6
+ from google.protobuf import descriptor as _descriptor
7
+ from google.protobuf import descriptor_pool as _descriptor_pool
8
+ from google.protobuf import symbol_database as _symbol_database
9
+ # @@protoc_insertion_point(imports)
10
+
11
+ _sym_db = _symbol_database.Default()
12
+
13
+
14
+
15
+
16
+ DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x19sentencepiece_model.proto\x12\rsentencepiece\"\x80\x0c\n\x0bTrainerSpec\x12\r\n\x05input\x18\x01 \x03(\t\x12\x14\n\x0cinput_format\x18\x07 \x01(\t\x12\x14\n\x0cmodel_prefix\x18\x02 \x01(\t\x12\x41\n\nmodel_type\x18\x03 \x01(\x0e\x32$.sentencepiece.TrainerSpec.ModelType:\x07UNIGRAM\x12\x18\n\nvocab_size\x18\x04 \x01(\x05:\x04\x38\x30\x30\x30\x12\x17\n\x0f\x61\x63\x63\x65pt_language\x18\x05 \x03(\t\x12 \n\x15self_test_sample_size\x18\x06 \x01(\x05:\x01\x30\x12*\n\x1b\x65nable_differential_privacy\x18\x32 \x01(\x08:\x05\x66\x61lse\x12+\n differential_privacy_noise_level\x18\x33 \x01(\x02:\x01\x30\x12\x32\n\'differential_privacy_clipping_threshold\x18\x34 \x01(\x04:\x01\x30\x12\"\n\x12\x63haracter_coverage\x18\n \x01(\x02:\x06\x30.9995\x12\x1e\n\x13input_sentence_size\x18\x0b \x01(\x04:\x01\x30\x12$\n\x16shuffle_input_sentence\x18\x13 \x01(\x08:\x04true\x12 \n\x14mining_sentence_size\x18\x0c \x01(\x05\x42\x02\x18\x01\x12\"\n\x16training_sentence_size\x18\r \x01(\x05\x42\x02\x18\x01\x12(\n\x17seed_sentencepiece_size\x18\x0e \x01(\x05:\x07\x31\x30\x30\x30\x30\x30\x30\x12\x1e\n\x10shrinking_factor\x18\x0f \x01(\x02:\x04\x30.75\x12!\n\x13max_sentence_length\x18\x12 \x01(\x05:\x04\x34\x31\x39\x32\x12\x17\n\x0bnum_threads\x18\x10 \x01(\x05:\x02\x31\x36\x12\x1d\n\x12num_sub_iterations\x18\x11 \x01(\x05:\x01\x32\x12$\n\x18max_sentencepiece_length\x18\x14 \x01(\x05:\x02\x31\x36\x12%\n\x17split_by_unicode_script\x18\x15 \x01(\x08:\x04true\x12\x1d\n\x0fsplit_by_number\x18\x17 \x01(\x08:\x04true\x12!\n\x13split_by_whitespace\x18\x16 \x01(\x08:\x04true\x12)\n\x1atreat_whitespace_as_suffix\x18\x18 \x01(\x08:\x05\x66\x61lse\x12+\n\x1c\x61llow_whitespace_only_pieces\x18\x1a \x01(\x08:\x05\x66\x61lse\x12\x1b\n\x0csplit_digits\x18\x19 \x01(\x08:\x05\x66\x61lse\x12#\n\x19pretokenization_delimiter\x18\x35 \x01(\t:\x00\x12\x17\n\x0f\x63ontrol_symbols\x18\x1e \x03(\t\x12\x1c\n\x14user_defined_symbols\x18\x1f 
\x03(\t\x12\x16\n\x0erequired_chars\x18$ \x01(\t\x12\x1c\n\rbyte_fallback\x18# \x01(\x08:\x05\x66\x61lse\x12+\n\x1dvocabulary_output_piece_score\x18 \x01(\x08:\x04true\x12\x1e\n\x10hard_vocab_limit\x18! \x01(\x08:\x04true\x12\x1c\n\ruse_all_vocab\x18\" \x01(\x08:\x05\x66\x61lse\x12\x11\n\x06unk_id\x18( \x01(\x05:\x01\x30\x12\x11\n\x06\x62os_id\x18) \x01(\x05:\x01\x31\x12\x11\n\x06\x65os_id\x18* \x01(\x05:\x01\x32\x12\x12\n\x06pad_id\x18+ \x01(\x05:\x02-1\x12\x18\n\tunk_piece\x18- \x01(\t:\x05<unk>\x12\x16\n\tbos_piece\x18. \x01(\t:\x03<s>\x12\x17\n\teos_piece\x18/ \x01(\t:\x04</s>\x12\x18\n\tpad_piece\x18\x30 \x01(\t:\x05<pad>\x12\x1a\n\x0bunk_surface\x18, \x01(\t:\x05 \xe2\x81\x87 \x12+\n\x1ctrain_extremely_large_corpus\x18\x31 \x01(\x08:\x05\x66\x61lse\"5\n\tModelType\x12\x0b\n\x07UNIGRAM\x10\x01\x12\x07\n\x03\x42PE\x10\x02\x12\x08\n\x04WORD\x10\x03\x12\x08\n\x04\x43HAR\x10\x04*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02\"\xd1\x01\n\x0eNormalizerSpec\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x1c\n\x14precompiled_charsmap\x18\x02 \x01(\x0c\x12\x1e\n\x10\x61\x64\x64_dummy_prefix\x18\x03 \x01(\x08:\x04true\x12&\n\x18remove_extra_whitespaces\x18\x04 \x01(\x08:\x04true\x12 \n\x12\x65scape_whitespaces\x18\x05 \x01(\x08:\x04true\x12\x1e\n\x16normalization_rule_tsv\x18\x06 \x01(\t*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02\"y\n\x0cSelfTestData\x12\x33\n\x07samples\x18\x01 \x03(\x0b\x32\".sentencepiece.SelfTestData.Sample\x1a)\n\x06Sample\x12\r\n\x05input\x18\x01 \x01(\t\x12\x10\n\x08\x65xpected\x18\x02 \x01(\t*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02\"\xfe\x03\n\nModelProto\x12\x37\n\x06pieces\x18\x01 \x03(\x0b\x32\'.sentencepiece.ModelProto.SentencePiece\x12\x30\n\x0ctrainer_spec\x18\x02 \x01(\x0b\x32\x1a.sentencepiece.TrainerSpec\x12\x36\n\x0fnormalizer_spec\x18\x03 \x01(\x0b\x32\x1d.sentencepiece.NormalizerSpec\x12\x33\n\x0eself_test_data\x18\x04 \x01(\x0b\x32\x1b.sentencepiece.SelfTestData\x12\x38\n\x11\x64\x65normalizer_spec\x18\x05 
\x01(\x0b\x32\x1d.sentencepiece.NormalizerSpec\x1a\xd2\x01\n\rSentencePiece\x12\r\n\x05piece\x18\x01 \x01(\t\x12\r\n\x05score\x18\x02 \x01(\x02\x12\x42\n\x04type\x18\x03 \x01(\x0e\x32,.sentencepiece.ModelProto.SentencePiece.Type:\x06NORMAL\"T\n\x04Type\x12\n\n\x06NORMAL\x10\x01\x12\x0b\n\x07UNKNOWN\x10\x02\x12\x0b\n\x07\x43ONTROL\x10\x03\x12\x10\n\x0cUSER_DEFINED\x10\x04\x12\x08\n\x04\x42YTE\x10\x06\x12\n\n\x06UNUSED\x10\x05*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02\x42\x02H\x03')
17
+
18
+ _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, globals())
19
+ _builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'sentencepiece_model_pb2', globals())
20
+ if _descriptor._USE_C_DESCRIPTORS == False:
21
+
22
+ DESCRIPTOR._options = None
23
+ DESCRIPTOR._serialized_options = b'H\003'
24
+ _TRAINERSPEC.fields_by_name['mining_sentence_size']._options = None
25
+ _TRAINERSPEC.fields_by_name['mining_sentence_size']._serialized_options = b'\030\001'
26
+ _TRAINERSPEC.fields_by_name['training_sentence_size']._options = None
27
+ _TRAINERSPEC.fields_by_name['training_sentence_size']._serialized_options = b'\030\001'
28
+ _TRAINERSPEC._serialized_start=45
29
+ _TRAINERSPEC._serialized_end=1581
30
+ _TRAINERSPEC_MODELTYPE._serialized_start=1517
31
+ _TRAINERSPEC_MODELTYPE._serialized_end=1570
32
+ _NORMALIZERSPEC._serialized_start=1584
33
+ _NORMALIZERSPEC._serialized_end=1793
34
+ _SELFTESTDATA._serialized_start=1795
35
+ _SELFTESTDATA._serialized_end=1916
36
+ _SELFTESTDATA_SAMPLE._serialized_start=1864
37
+ _SELFTESTDATA_SAMPLE._serialized_end=1905
38
+ _MODELPROTO._serialized_start=1919
39
+ _MODELPROTO._serialized_end=2429
40
+ _MODELPROTO_SENTENCEPIECE._serialized_start=2208
41
+ _MODELPROTO_SENTENCEPIECE._serialized_end=2418
42
+ _MODELPROTO_SENTENCEPIECE_TYPE._serialized_start=2323
43
+ _MODELPROTO_SENTENCEPIECE_TYPE._serialized_end=2407
44
+ # @@protoc_insertion_point(module_scope)
source/sentencepiece/sentencepiece_pb2.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ # Generated by the protocol buffer compiler. DO NOT EDIT!
3
+ # source: sentencepiece.proto
4
+ """Generated protocol buffer code."""
5
+ from google.protobuf.internal import builder as _builder
6
+ from google.protobuf import descriptor as _descriptor
7
+ from google.protobuf import descriptor_pool as _descriptor_pool
8
+ from google.protobuf import symbol_database as _symbol_database
9
+ # @@protoc_insertion_point(imports)
10
+
11
+ _sym_db = _symbol_database.Default()
12
+
13
+
14
+
15
+
16
+ DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x13sentencepiece.proto\x12\rsentencepiece\"\xdf\x01\n\x11SentencePieceText\x12\x0c\n\x04text\x18\x01 \x01(\t\x12>\n\x06pieces\x18\x02 \x03(\x0b\x32..sentencepiece.SentencePieceText.SentencePiece\x12\r\n\x05score\x18\x03 \x01(\x02\x1a\x62\n\rSentencePiece\x12\r\n\x05piece\x18\x01 \x01(\t\x12\n\n\x02id\x18\x02 \x01(\r\x12\x0f\n\x07surface\x18\x03 \x01(\t\x12\r\n\x05\x62\x65gin\x18\x04 \x01(\r\x12\x0b\n\x03\x65nd\x18\x05 \x01(\r*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02\"J\n\x16NBestSentencePieceText\x12\x30\n\x06nbests\x18\x01 \x03(\x0b\x32 .sentencepiece.SentencePieceTextB\x02H\x03')
17
+
18
+ _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, globals())
19
+ _builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'sentencepiece_pb2', globals())
20
+ if _descriptor._USE_C_DESCRIPTORS == False:
21
+
22
+ DESCRIPTOR._options = None
23
+ DESCRIPTOR._serialized_options = b'H\003'
24
+ _SENTENCEPIECETEXT._serialized_start=39
25
+ _SENTENCEPIECETEXT._serialized_end=262
26
+ _SENTENCEPIECETEXT_SENTENCEPIECE._serialized_start=153
27
+ _SENTENCEPIECETEXT_SENTENCEPIECE._serialized_end=251
28
+ _NBESTSENTENCEPIECETEXT._serialized_start=264
29
+ _NBESTSENTENCEPIECETEXT._serialized_end=338
30
+ # @@protoc_insertion_point(module_scope)
source/sentencepiece/sentencepiece_wrap.cxx ADDED
The diff for this file is too large to render. See raw diff
 
source/sentry_sdk-2.53.0.dist-info/INSTALLER ADDED
@@ -0,0 +1 @@
 
 
1
+ pip
source/sentry_sdk-2.53.0.dist-info/METADATA ADDED
@@ -0,0 +1,268 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Metadata-Version: 2.4
2
+ Name: sentry-sdk
3
+ Version: 2.53.0
4
+ Summary: Python client for Sentry (https://sentry.io)
5
+ Home-page: https://github.com/getsentry/sentry-python
6
+ Author: Sentry Team and Contributors
7
+ Author-email: hello@sentry.io
8
+ License: MIT
9
+ Project-URL: Documentation, https://docs.sentry.io/platforms/python/
10
+ Project-URL: Changelog, https://github.com/getsentry/sentry-python/blob/master/CHANGELOG.md
11
+ Classifier: Development Status :: 5 - Production/Stable
12
+ Classifier: Environment :: Web Environment
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: License :: OSI Approved :: BSD License
15
+ Classifier: Operating System :: OS Independent
16
+ Classifier: Programming Language :: Python
17
+ Classifier: Programming Language :: Python :: 3
18
+ Classifier: Programming Language :: Python :: 3.6
19
+ Classifier: Programming Language :: Python :: 3.7
20
+ Classifier: Programming Language :: Python :: 3.8
21
+ Classifier: Programming Language :: Python :: 3.9
22
+ Classifier: Programming Language :: Python :: 3.10
23
+ Classifier: Programming Language :: Python :: 3.11
24
+ Classifier: Programming Language :: Python :: 3.12
25
+ Classifier: Programming Language :: Python :: 3.13
26
+ Classifier: Programming Language :: Python :: 3.14
27
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
28
+ Requires-Python: >=3.6
29
+ Description-Content-Type: text/markdown
30
+ License-File: LICENSE
31
+ Requires-Dist: urllib3>=1.26.11
32
+ Requires-Dist: certifi
33
+ Provides-Extra: aiohttp
34
+ Requires-Dist: aiohttp>=3.5; extra == "aiohttp"
35
+ Provides-Extra: anthropic
36
+ Requires-Dist: anthropic>=0.16; extra == "anthropic"
37
+ Provides-Extra: arq
38
+ Requires-Dist: arq>=0.23; extra == "arq"
39
+ Provides-Extra: asyncpg
40
+ Requires-Dist: asyncpg>=0.23; extra == "asyncpg"
41
+ Provides-Extra: beam
42
+ Requires-Dist: apache-beam>=2.12; extra == "beam"
43
+ Provides-Extra: bottle
44
+ Requires-Dist: bottle>=0.12.13; extra == "bottle"
45
+ Provides-Extra: celery
46
+ Requires-Dist: celery>=3; extra == "celery"
47
+ Provides-Extra: celery-redbeat
48
+ Requires-Dist: celery-redbeat>=2; extra == "celery-redbeat"
49
+ Provides-Extra: chalice
50
+ Requires-Dist: chalice>=1.16.0; extra == "chalice"
51
+ Provides-Extra: clickhouse-driver
52
+ Requires-Dist: clickhouse-driver>=0.2.0; extra == "clickhouse-driver"
53
+ Provides-Extra: django
54
+ Requires-Dist: django>=1.8; extra == "django"
55
+ Provides-Extra: falcon
56
+ Requires-Dist: falcon>=1.4; extra == "falcon"
57
+ Provides-Extra: fastapi
58
+ Requires-Dist: fastapi>=0.79.0; extra == "fastapi"
59
+ Provides-Extra: flask
60
+ Requires-Dist: flask>=0.11; extra == "flask"
61
+ Requires-Dist: blinker>=1.1; extra == "flask"
62
+ Requires-Dist: markupsafe; extra == "flask"
63
+ Provides-Extra: grpcio
64
+ Requires-Dist: grpcio>=1.21.1; extra == "grpcio"
65
+ Requires-Dist: protobuf>=3.8.0; extra == "grpcio"
66
+ Provides-Extra: http2
67
+ Requires-Dist: httpcore[http2]==1.*; extra == "http2"
68
+ Provides-Extra: httpx
69
+ Requires-Dist: httpx>=0.16.0; extra == "httpx"
70
+ Provides-Extra: huey
71
+ Requires-Dist: huey>=2; extra == "huey"
72
+ Provides-Extra: huggingface-hub
73
+ Requires-Dist: huggingface_hub>=0.22; extra == "huggingface-hub"
74
+ Provides-Extra: langchain
75
+ Requires-Dist: langchain>=0.0.210; extra == "langchain"
76
+ Provides-Extra: langgraph
77
+ Requires-Dist: langgraph>=0.6.6; extra == "langgraph"
78
+ Provides-Extra: launchdarkly
79
+ Requires-Dist: launchdarkly-server-sdk>=9.8.0; extra == "launchdarkly"
80
+ Provides-Extra: litellm
81
+ Requires-Dist: litellm>=1.77.5; extra == "litellm"
82
+ Provides-Extra: litestar
83
+ Requires-Dist: litestar>=2.0.0; extra == "litestar"
84
+ Provides-Extra: loguru
85
+ Requires-Dist: loguru>=0.5; extra == "loguru"
86
+ Provides-Extra: mcp
87
+ Requires-Dist: mcp>=1.15.0; extra == "mcp"
88
+ Provides-Extra: openai
89
+ Requires-Dist: openai>=1.0.0; extra == "openai"
90
+ Requires-Dist: tiktoken>=0.3.0; extra == "openai"
91
+ Provides-Extra: openfeature
92
+ Requires-Dist: openfeature-sdk>=0.7.1; extra == "openfeature"
93
+ Provides-Extra: opentelemetry
94
+ Requires-Dist: opentelemetry-distro>=0.35b0; extra == "opentelemetry"
95
+ Provides-Extra: opentelemetry-experimental
96
+ Requires-Dist: opentelemetry-distro; extra == "opentelemetry-experimental"
97
+ Provides-Extra: opentelemetry-otlp
98
+ Requires-Dist: opentelemetry-distro[otlp]>=0.35b0; extra == "opentelemetry-otlp"
99
+ Provides-Extra: pure-eval
100
+ Requires-Dist: pure_eval; extra == "pure-eval"
101
+ Requires-Dist: executing; extra == "pure-eval"
102
+ Requires-Dist: asttokens; extra == "pure-eval"
103
+ Provides-Extra: pydantic-ai
104
+ Requires-Dist: pydantic-ai>=1.0.0; extra == "pydantic-ai"
105
+ Provides-Extra: pymongo
106
+ Requires-Dist: pymongo>=3.1; extra == "pymongo"
107
+ Provides-Extra: pyspark
108
+ Requires-Dist: pyspark>=2.4.4; extra == "pyspark"
109
+ Provides-Extra: quart
110
+ Requires-Dist: quart>=0.16.1; extra == "quart"
111
+ Requires-Dist: blinker>=1.1; extra == "quart"
112
+ Provides-Extra: rq
113
+ Requires-Dist: rq>=0.6; extra == "rq"
114
+ Provides-Extra: sanic
115
+ Requires-Dist: sanic>=0.8; extra == "sanic"
116
+ Provides-Extra: sqlalchemy
117
+ Requires-Dist: sqlalchemy>=1.2; extra == "sqlalchemy"
118
+ Provides-Extra: starlette
119
+ Requires-Dist: starlette>=0.19.1; extra == "starlette"
120
+ Provides-Extra: starlite
121
+ Requires-Dist: starlite>=1.48; extra == "starlite"
122
+ Provides-Extra: statsig
123
+ Requires-Dist: statsig>=0.55.3; extra == "statsig"
124
+ Provides-Extra: tornado
125
+ Requires-Dist: tornado>=6; extra == "tornado"
126
+ Provides-Extra: unleash
127
+ Requires-Dist: UnleashClient>=6.0.1; extra == "unleash"
128
+ Provides-Extra: google-genai
129
+ Requires-Dist: google-genai>=1.29.0; extra == "google-genai"
130
+ Dynamic: author
131
+ Dynamic: author-email
132
+ Dynamic: classifier
133
+ Dynamic: description
134
+ Dynamic: description-content-type
135
+ Dynamic: home-page
136
+ Dynamic: license
137
+ Dynamic: license-file
138
+ Dynamic: project-url
139
+ Dynamic: provides-extra
140
+ Dynamic: requires-dist
141
+ Dynamic: requires-python
142
+ Dynamic: summary
143
+
144
+ <a href="https://sentry.io/?utm_source=github&utm_medium=logo" target="_blank">
145
+ <img src="https://sentry-brand.storage.googleapis.com/github-banners/github-sdk-python.png" alt="Sentry for Python">
146
+ </a>
147
+ <div align="center">
148
+
149
+ _Bad software is everywhere, and we're tired of it. Sentry is on a mission to help developers write better software faster, so we can get back to enjoying technology. If you want to join us
150
+ [<kbd>**Check out our open positions**</kbd>](https://sentry.io/careers/)_.
151
+
152
+ [![Discord](https://img.shields.io/discord/621778831602221064?logo=discord&labelColor=%20%235462eb&logoColor=%20%23f5f5f5&color=%20%235462eb)](https://discord.com/invite/Ww9hbqr)
153
+ [![X Follow](https://img.shields.io/twitter/follow/sentry?label=sentry&style=social)](https://x.com/intent/follow?screen_name=sentry)
154
+ [![PyPi page link -- version](https://img.shields.io/pypi/v/sentry-sdk.svg)](https://pypi.python.org/pypi/sentry-sdk)
155
+ <img src="https://img.shields.io/badge/python-3.7 | 3.8 | 3.9 | 3.10 | 3.11 | 3.12 | 3.13 | 3.14-blue.svg" alt="python">
156
+ [![Build Status](https://github.com/getsentry/sentry-python/actions/workflows/ci.yml/badge.svg)](https://github.com/getsentry/sentry-python/actions/workflows/ci.yml)
157
+
158
+ <br/>
159
+
160
+ </div>
161
+
162
+
163
+ # Official Sentry SDK for Python
164
+
165
+ Welcome to the official Python SDK for **[Sentry](http://sentry.io/)**.
166
+
167
+
168
+ ## 📦 Getting Started
169
+
170
+ ### Prerequisites
171
+
172
+ You need a Sentry [account](https://sentry.io/signup/) and [project](https://docs.sentry.io/product/projects/).
173
+
174
+ ### Installation
175
+
176
+ Getting Sentry into your project is straightforward. Just run this command in your terminal:
177
+
178
+ ```bash
179
+ pip install --upgrade sentry-sdk
180
+ ```
181
+
182
+ ### Basic Configuration
183
+
184
+ Here's a quick configuration example to get Sentry up and running:
185
+
186
+ ```python
187
+ import sentry_sdk
188
+
189
+ sentry_sdk.init(
190
+ "https://12927b5f211046b575ee51fd8b1ac34f@o1.ingest.sentry.io/1", # Your DSN here
191
+
192
+ # Set traces_sample_rate to 1.0 to capture 100%
193
+ # of traces for performance monitoring.
194
+ traces_sample_rate=1.0,
195
+ )
196
+ ```
197
+
198
+ With this configuration, Sentry will monitor for exceptions and performance issues.
199
+
200
+ ### Quick Usage Example
201
+
202
+ To generate some events that will show up in Sentry, you can log messages or capture errors:
203
+
204
+ ```python
205
+ import sentry_sdk
206
+ sentry_sdk.init(...) # same as above
207
+
208
+ sentry_sdk.capture_message("Hello Sentry!") # You'll see this in your Sentry dashboard.
209
+
210
+ raise ValueError("Oops, something went wrong!") # This will create an error event in Sentry.
211
+ ```
212
+
213
+
214
+ ## 📚 Documentation
215
+
216
+ For more details on advanced usage, integrations, and customization, check out the full documentation on [https://docs.sentry.io](https://docs.sentry.io/).
217
+
218
+
219
+ ## 🧩 Integrations
220
+
221
+ Sentry integrates with a ton of popular Python libraries and frameworks, including [FastAPI](https://docs.sentry.io/platforms/python/integrations/fastapi/), [Django](https://docs.sentry.io/platforms/python/integrations/django/), [Celery](https://docs.sentry.io/platforms/python/integrations/celery/), [OpenAI](https://docs.sentry.io/platforms/python/integrations/openai/) and many, many more. Check out the [full list of integrations](https://docs.sentry.io/platforms/python/integrations/) to get the full picture.
222
+
223
+
224
+ ## 🚧 Migrating Between Versions?
225
+
226
+ ### From `1.x` to `2.x`
227
+
228
+ If you're using the older `1.x` version of the SDK, now's the time to upgrade to `2.x`. It includes significant upgrades and new features. Check our [migration guide](https://docs.sentry.io/platforms/python/migration/1.x-to-2.x) for assistance.
229
+
230
+ ### From `raven-python`
231
+
232
+ Using the legacy `raven-python` client? It's now in maintenance mode, and we recommend migrating to the new SDK for an improved experience. Get all the details in our [migration guide](https://docs.sentry.io/platforms/python/migration/raven-to-sentry-sdk/).
233
+
234
+
235
+ ## 🙌 Want to Contribute?
236
+
237
+ We'd love your help in improving the Sentry SDK! Whether it's fixing bugs, adding features, writing new integrations, or enhancing documentation, every contribution is valuable.
238
+
239
+ For details on how to contribute, please read our [contribution guide](CONTRIBUTING.md) and explore the [open issues](https://github.com/getsentry/sentry-python/issues).
240
+
241
+
242
+ ## 🛟 Need Help?
243
+
244
+ If you encounter issues or need help setting up or configuring the SDK, don't hesitate to reach out to the [Sentry Community on Discord](https://discord.com/invite/Ww9hbqr). There are a ton of great people there ready to help!
245
+
246
+
247
+ ## 🔗 Resources
248
+
249
+ Here are all resources to help you make the most of Sentry:
250
+
251
+ - [Documentation](https://docs.sentry.io/platforms/python/) - Official documentation to get started.
252
+ - [Discord](https://discord.com/invite/Ww9hbqr) - Join our Discord community.
253
+ - [X/Twitter](https://x.com/intent/follow?screen_name=sentry) - Follow us on X (Twitter) for updates.
254
+ - [Stack Overflow](https://stackoverflow.com/questions/tagged/sentry) - Questions and answers related to Sentry.
255
+
256
+ <a name="license"></a>
257
+ ## 📃 License
258
+
259
+ The SDK is open-source and available under the MIT license. Check out the [LICENSE](LICENSE) file for more information.
260
+
261
+
262
+ ## 😘 Contributors
263
+
264
+ Thanks to everyone who has helped improve the SDK!
265
+
266
+ <a href="https://github.com/getsentry/sentry-python/graphs/contributors">
267
+ <img src="https://contributors-img.web.app/image?repo=getsentry/sentry-python" />
268
+ </a>
source/sentry_sdk-2.53.0.dist-info/RECORD ADDED
@@ -0,0 +1,386 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ sentry_sdk-2.53.0.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
2
+ sentry_sdk-2.53.0.dist-info/METADATA,sha256=rdAnlprh5icT8lL9Qh0lJQw2I6rQOBYbmwG-1XWwXNA,10815
3
+ sentry_sdk-2.53.0.dist-info/RECORD,,
4
+ sentry_sdk-2.53.0.dist-info/WHEEL,sha256=uC7DnXjtoKy23CNUXRq6ohLsm8FbS2C_ECfYYzqZHVY,109
5
+ sentry_sdk-2.53.0.dist-info/entry_points.txt,sha256=qacZEz40UspQZD1IukCXykx0JtImqGDOctS5KfOLTko,91
6
+ sentry_sdk-2.53.0.dist-info/licenses/LICENSE,sha256=KhQNZg9GKBL6KQvHQNBGMxJsXsRdhLebVp4Sew7t3Qs,1093
7
+ sentry_sdk-2.53.0.dist-info/top_level.txt,sha256=XrQz30XE9FKXSY_yGLrd9bsv2Rk390GTDJOSujYaMxI,11
8
+ sentry_sdk/__init__.py,sha256=cnZoQ9y329brs-cdzIVtxbO1-o9AIrKk8VTVyZNJs1A,1410
9
+ sentry_sdk/__pycache__/__init__.cpython-312.pyc,,
10
+ sentry_sdk/__pycache__/_batcher.cpython-312.pyc,,
11
+ sentry_sdk/__pycache__/_compat.cpython-312.pyc,,
12
+ sentry_sdk/__pycache__/_init_implementation.cpython-312.pyc,,
13
+ sentry_sdk/__pycache__/_log_batcher.cpython-312.pyc,,
14
+ sentry_sdk/__pycache__/_lru_cache.cpython-312.pyc,,
15
+ sentry_sdk/__pycache__/_metrics_batcher.cpython-312.pyc,,
16
+ sentry_sdk/__pycache__/_queue.cpython-312.pyc,,
17
+ sentry_sdk/__pycache__/_span_batcher.cpython-312.pyc,,
18
+ sentry_sdk/__pycache__/_types.cpython-312.pyc,,
19
+ sentry_sdk/__pycache__/_werkzeug.cpython-312.pyc,,
20
+ sentry_sdk/__pycache__/api.cpython-312.pyc,,
21
+ sentry_sdk/__pycache__/attachments.cpython-312.pyc,,
22
+ sentry_sdk/__pycache__/client.cpython-312.pyc,,
23
+ sentry_sdk/__pycache__/consts.cpython-312.pyc,,
24
+ sentry_sdk/__pycache__/debug.cpython-312.pyc,,
25
+ sentry_sdk/__pycache__/envelope.cpython-312.pyc,,
26
+ sentry_sdk/__pycache__/feature_flags.cpython-312.pyc,,
27
+ sentry_sdk/__pycache__/hub.cpython-312.pyc,,
28
+ sentry_sdk/__pycache__/logger.cpython-312.pyc,,
29
+ sentry_sdk/__pycache__/metrics.cpython-312.pyc,,
30
+ sentry_sdk/__pycache__/monitor.cpython-312.pyc,,
31
+ sentry_sdk/__pycache__/scope.cpython-312.pyc,,
32
+ sentry_sdk/__pycache__/scrubber.cpython-312.pyc,,
33
+ sentry_sdk/__pycache__/serializer.cpython-312.pyc,,
34
+ sentry_sdk/__pycache__/session.cpython-312.pyc,,
35
+ sentry_sdk/__pycache__/sessions.cpython-312.pyc,,
36
+ sentry_sdk/__pycache__/spotlight.cpython-312.pyc,,
37
+ sentry_sdk/__pycache__/traces.cpython-312.pyc,,
38
+ sentry_sdk/__pycache__/tracing.cpython-312.pyc,,
39
+ sentry_sdk/__pycache__/tracing_utils.cpython-312.pyc,,
40
+ sentry_sdk/__pycache__/transport.cpython-312.pyc,,
41
+ sentry_sdk/__pycache__/types.cpython-312.pyc,,
42
+ sentry_sdk/__pycache__/utils.cpython-312.pyc,,
43
+ sentry_sdk/__pycache__/worker.cpython-312.pyc,,
44
+ sentry_sdk/_batcher.py,sha256=3Elbey1gpWzYSo-Dr5xxP00QpYcRYb8FGYqei9U_mdE,4004
45
+ sentry_sdk/_compat.py,sha256=tSI9LTAlSw7x_putvg7qg5SSdo7W1EwpbYiB_godUMQ,3065
46
+ sentry_sdk/_init_implementation.py,sha256=xualjTVpMB8XbJUX-30eAnuiyPfDeoeZoer3iIl0mH4,2491
47
+ sentry_sdk/_log_batcher.py,sha256=6Y3VNP6a65t1gzND9Rz9356xj2ZXOI1rB8Wq2De7Cpc,1827
48
+ sentry_sdk/_lru_cache.py,sha256=YSnCmL1qArB6_dsfcwKZgO4eJhBak87HqjqbUzBNQ7A,1167
49
+ sentry_sdk/_metrics_batcher.py,sha256=c7aw1sG7rldcS4XrpKAX59AGMXZ74Vx9Ja7peQo7MPc,1273
50
+ sentry_sdk/_queue.py,sha256=bluPs51jz8d3w74aSolOZYc3jnwmyTGPKvKxv7Z2LL0,11244
51
+ sentry_sdk/_span_batcher.py,sha256=EscEhZvzyJIby2taeOfRL7m6hm2RwptdmXwk0b8k36o,4406
52
+ sentry_sdk/_types.py,sha256=bcDjVByhfqJoYB2deda3oZBYv73gRanJ3WfFM2WagXM,11018
53
+ sentry_sdk/_werkzeug.py,sha256=vQL3Z_q3OBQ8vEby2ktiY7Ey8szMDWZsUe-BGJ9A3oY,3716
54
+ sentry_sdk/ai/__init__.py,sha256=L2EAYEx9075dUVZGEB_CBxSgwaLeYLFWhBYfIEeBDRg,218
55
+ sentry_sdk/ai/__pycache__/__init__.cpython-312.pyc,,
56
+ sentry_sdk/ai/__pycache__/_openai_completions_api.cpython-312.pyc,,
57
+ sentry_sdk/ai/__pycache__/_openai_responses_api.cpython-312.pyc,,
58
+ sentry_sdk/ai/__pycache__/monitoring.cpython-312.pyc,,
59
+ sentry_sdk/ai/__pycache__/utils.cpython-312.pyc,,
60
+ sentry_sdk/ai/_openai_completions_api.py,sha256=MeLmyCCd-PQLWRN4RHgCm7bK6QXBjU74Df0ZGHDDDCg,1821
61
+ sentry_sdk/ai/_openai_responses_api.py,sha256=xN2fbP2hNibB-kW0mbA7L9s_g_JrYhyra7V4S9xF8kU,656
62
+ sentry_sdk/ai/monitoring.py,sha256=PRYQXdKh2CdIttIKcCWhQKebGHlBHn6b0lKWAmhK7MQ,5539
63
+ sentry_sdk/ai/utils.py,sha256=VtyghTEHI_3KRrEltYpLlIOO5FX5eD_Og1l7sqA9mx4,23837
64
+ sentry_sdk/api.py,sha256=p4ABg2gX0Lx5T6nbi-D8LIo-p3lB37IcI91I_bEoJYg,14668
65
+ sentry_sdk/attachments.py,sha256=fvmTzYvG2a2mjNzTzcja28ize87kEEZMtXwK1rmXpBc,3023
66
+ sentry_sdk/client.py,sha256=jhdrYnioKRqwZy66VgqrwGNyTewajkd1stPU0iZqZLA,38867
67
+ sentry_sdk/consts.py,sha256=ndv79s9CYhXd5N1S5a_F6MiO2lAimC2k69aSZYQU2kM,55042
68
+ sentry_sdk/crons/__init__.py,sha256=3Zt6g1-pZZ12uRKKsC8QLm3XgJ4K1VYxgVpNNUygOZY,221
69
+ sentry_sdk/crons/__pycache__/__init__.cpython-312.pyc,,
70
+ sentry_sdk/crons/__pycache__/api.cpython-312.pyc,,
71
+ sentry_sdk/crons/__pycache__/consts.cpython-312.pyc,,
72
+ sentry_sdk/crons/__pycache__/decorator.cpython-312.pyc,,
73
+ sentry_sdk/crons/api.py,sha256=gwtNjHZxZy51piEwYrnrYg3bZkifrdkSRlDAZYC30Kg,1679
74
+ sentry_sdk/crons/consts.py,sha256=dXqJk5meBSu5rjlGpqAOlkpACnuUi7svQnAFoy1ZNUU,87
75
+ sentry_sdk/crons/decorator.py,sha256=4fYEHYlhdtKyhzJr1HZZ7g2ierEJ3ULZe4N7a3jqbYE,3834
76
+ sentry_sdk/debug.py,sha256=63f8uUowpbZ75E8q-r_8Tdymsv9jXZn1VMh287VcWhk,959
77
+ sentry_sdk/envelope.py,sha256=q0CtNI9IgYPTlviQyp8HB8aeIxLVcklMIqaQ-LU_HpU,9652
78
+ sentry_sdk/feature_flags.py,sha256=J9G-nnQiXimYbhkPNDGz_CjAccYlbXS9KgybExBZjUI,2136
79
+ sentry_sdk/hub.py,sha256=sVkOrpjU0cNhxOFtWfxA8Nc_3T8hiGH-mVFZa4tX1Us,25147
80
+ sentry_sdk/integrations/__init__.py,sha256=U52rckcGzXoe73On-T70R64feEIg1ir1O1NSSkQvHzg,12674
81
+ sentry_sdk/integrations/__pycache__/__init__.cpython-312.pyc,,
82
+ sentry_sdk/integrations/__pycache__/_asgi_common.cpython-312.pyc,,
83
+ sentry_sdk/integrations/__pycache__/_wsgi_common.cpython-312.pyc,,
84
+ sentry_sdk/integrations/__pycache__/aiohttp.cpython-312.pyc,,
85
+ sentry_sdk/integrations/__pycache__/anthropic.cpython-312.pyc,,
86
+ sentry_sdk/integrations/__pycache__/argv.cpython-312.pyc,,
87
+ sentry_sdk/integrations/__pycache__/ariadne.cpython-312.pyc,,
88
+ sentry_sdk/integrations/__pycache__/arq.cpython-312.pyc,,
89
+ sentry_sdk/integrations/__pycache__/asgi.cpython-312.pyc,,
90
+ sentry_sdk/integrations/__pycache__/asyncio.cpython-312.pyc,,
91
+ sentry_sdk/integrations/__pycache__/asyncpg.cpython-312.pyc,,
92
+ sentry_sdk/integrations/__pycache__/atexit.cpython-312.pyc,,
93
+ sentry_sdk/integrations/__pycache__/aws_lambda.cpython-312.pyc,,
94
+ sentry_sdk/integrations/__pycache__/beam.cpython-312.pyc,,
95
+ sentry_sdk/integrations/__pycache__/boto3.cpython-312.pyc,,
96
+ sentry_sdk/integrations/__pycache__/bottle.cpython-312.pyc,,
97
+ sentry_sdk/integrations/__pycache__/chalice.cpython-312.pyc,,
98
+ sentry_sdk/integrations/__pycache__/clickhouse_driver.cpython-312.pyc,,
99
+ sentry_sdk/integrations/__pycache__/cloud_resource_context.cpython-312.pyc,,
100
+ sentry_sdk/integrations/__pycache__/cohere.cpython-312.pyc,,
101
+ sentry_sdk/integrations/__pycache__/dedupe.cpython-312.pyc,,
102
+ sentry_sdk/integrations/__pycache__/dramatiq.cpython-312.pyc,,
103
+ sentry_sdk/integrations/__pycache__/excepthook.cpython-312.pyc,,
104
+ sentry_sdk/integrations/__pycache__/executing.cpython-312.pyc,,
105
+ sentry_sdk/integrations/__pycache__/falcon.cpython-312.pyc,,
106
+ sentry_sdk/integrations/__pycache__/fastapi.cpython-312.pyc,,
107
+ sentry_sdk/integrations/__pycache__/flask.cpython-312.pyc,,
108
+ sentry_sdk/integrations/__pycache__/gcp.cpython-312.pyc,,
109
+ sentry_sdk/integrations/__pycache__/gnu_backtrace.cpython-312.pyc,,
110
+ sentry_sdk/integrations/__pycache__/gql.cpython-312.pyc,,
111
+ sentry_sdk/integrations/__pycache__/graphene.cpython-312.pyc,,
112
+ sentry_sdk/integrations/__pycache__/httpx.cpython-312.pyc,,
113
+ sentry_sdk/integrations/__pycache__/huey.cpython-312.pyc,,
114
+ sentry_sdk/integrations/__pycache__/huggingface_hub.cpython-312.pyc,,
115
+ sentry_sdk/integrations/__pycache__/langchain.cpython-312.pyc,,
116
+ sentry_sdk/integrations/__pycache__/langgraph.cpython-312.pyc,,
117
+ sentry_sdk/integrations/__pycache__/launchdarkly.cpython-312.pyc,,
118
+ sentry_sdk/integrations/__pycache__/litellm.cpython-312.pyc,,
119
+ sentry_sdk/integrations/__pycache__/litestar.cpython-312.pyc,,
120
+ sentry_sdk/integrations/__pycache__/logging.cpython-312.pyc,,
121
+ sentry_sdk/integrations/__pycache__/loguru.cpython-312.pyc,,
122
+ sentry_sdk/integrations/__pycache__/mcp.cpython-312.pyc,,
123
+ sentry_sdk/integrations/__pycache__/modules.cpython-312.pyc,,
124
+ sentry_sdk/integrations/__pycache__/openai.cpython-312.pyc,,
125
+ sentry_sdk/integrations/__pycache__/openfeature.cpython-312.pyc,,
126
+ sentry_sdk/integrations/__pycache__/otlp.cpython-312.pyc,,
127
+ sentry_sdk/integrations/__pycache__/pure_eval.cpython-312.pyc,,
128
+ sentry_sdk/integrations/__pycache__/pymongo.cpython-312.pyc,,
129
+ sentry_sdk/integrations/__pycache__/pyramid.cpython-312.pyc,,
130
+ sentry_sdk/integrations/__pycache__/quart.cpython-312.pyc,,
131
+ sentry_sdk/integrations/__pycache__/ray.cpython-312.pyc,,
132
+ sentry_sdk/integrations/__pycache__/rq.cpython-312.pyc,,
133
+ sentry_sdk/integrations/__pycache__/rust_tracing.cpython-312.pyc,,
134
+ sentry_sdk/integrations/__pycache__/sanic.cpython-312.pyc,,
135
+ sentry_sdk/integrations/__pycache__/serverless.cpython-312.pyc,,
136
+ sentry_sdk/integrations/__pycache__/socket.cpython-312.pyc,,
137
+ sentry_sdk/integrations/__pycache__/sqlalchemy.cpython-312.pyc,,
138
+ sentry_sdk/integrations/__pycache__/starlette.cpython-312.pyc,,
139
+ sentry_sdk/integrations/__pycache__/starlite.cpython-312.pyc,,
140
+ sentry_sdk/integrations/__pycache__/statsig.cpython-312.pyc,,
141
+ sentry_sdk/integrations/__pycache__/stdlib.cpython-312.pyc,,
142
+ sentry_sdk/integrations/__pycache__/strawberry.cpython-312.pyc,,
143
+ sentry_sdk/integrations/__pycache__/sys_exit.cpython-312.pyc,,
144
+ sentry_sdk/integrations/__pycache__/threading.cpython-312.pyc,,
145
+ sentry_sdk/integrations/__pycache__/tornado.cpython-312.pyc,,
146
+ sentry_sdk/integrations/__pycache__/trytond.cpython-312.pyc,,
147
+ sentry_sdk/integrations/__pycache__/typer.cpython-312.pyc,,
148
+ sentry_sdk/integrations/__pycache__/unleash.cpython-312.pyc,,
149
+ sentry_sdk/integrations/__pycache__/unraisablehook.cpython-312.pyc,,
150
+ sentry_sdk/integrations/__pycache__/wsgi.cpython-312.pyc,,
151
+ sentry_sdk/integrations/_asgi_common.py,sha256=qY2nH21YGtQ1EufzdrovHtGZ-ZaMbSzYNKCo8z-kD44,3145
152
+ sentry_sdk/integrations/_wsgi_common.py,sha256=ffyvbYSYtVNo8wkzx7tt1T7anuMiu2ixApZmC0O9NH4,7282
153
+ sentry_sdk/integrations/aiohttp.py,sha256=vQrqPm_IVgvgQo4PLEyDNrvA5oloz1xALOFtVDTtOpE,12930
154
+ sentry_sdk/integrations/anthropic.py,sha256=CDLZf73v8HBj-OxDzJHlgmecXFHeIIYKoem2kdHmKP4,21222
155
+ sentry_sdk/integrations/argv.py,sha256=3dyTYcNVhx-EoCxzZI8-JO1mxydBo_zHxIKHpLJgJcc,877
156
+ sentry_sdk/integrations/ariadne.py,sha256=u1X6QEcXfwBXa4mmHkgcj8XMoowwBGIWQQYAsItpjwg,5792
157
+ sentry_sdk/integrations/arq.py,sha256=3hs8Y41X3rjxCd769CHjUBYQe3YgULtbX6WQQB5-kwQ,7906
158
+ sentry_sdk/integrations/asgi.py,sha256=AoJcWDRkmm5vqSMH44V2zx00BiT9Kzqfo3pd3Eyi6D8,12687
159
+ sentry_sdk/integrations/asyncio.py,sha256=nszg5HkxWLoTzfS3EZ9hBzsCC7JM8JKOBFXbNXnCDpI,6994
160
+ sentry_sdk/integrations/asyncpg.py,sha256=T0Ijpln-xoQCuY0AS15dYZK2_1XCdKYT1J9EfdSQURo,6578
161
+ sentry_sdk/integrations/atexit.py,sha256=eUcgC6f7WoqJadxDbn76dBv6zoS97kgyxshjGBNK1us,1567
162
+ sentry_sdk/integrations/aws_lambda.py,sha256=HDvJk2EpOjS0_2wkTXc9lQr2w-huT7v3j4_-Pxq0KPw,17895
163
+ sentry_sdk/integrations/beam.py,sha256=j-BzEzAHXk12HJ8izUyIOaM02d29BZUpYldBMCq8vdI,5092
164
+ sentry_sdk/integrations/boto3.py,sha256=4HLGehOoU8rc-85yL5ATUUZL9x_TGYjPkNYHOcT4Z2w,4341
165
+ sentry_sdk/integrations/bottle.py,sha256=Im8xqoQvrygqKHsK-rtaX41gOaY_hxb4anjgJnZQGu4,6404
166
+ sentry_sdk/integrations/celery/__init__.py,sha256=LTWUX6mEsf93fITkcLUV1exm7Rk12DY6JUGKH_gPXCI,18497
167
+ sentry_sdk/integrations/celery/__pycache__/__init__.cpython-312.pyc,,
168
+ sentry_sdk/integrations/celery/__pycache__/beat.cpython-312.pyc,,
169
+ sentry_sdk/integrations/celery/__pycache__/utils.cpython-312.pyc,,
170
+ sentry_sdk/integrations/celery/beat.py,sha256=RCG2yAyJSyWtzDGw6B1M_w3TXDvPnznAt1dhfFcLBDI,8839
171
+ sentry_sdk/integrations/celery/utils.py,sha256=KyaM868RGak0JeFHkQVYtR1CkKRNpSD4_bbpisF5jqo,1152
172
+ sentry_sdk/integrations/chalice.py,sha256=nd_J4MnPZzoVdH4M5HV_-ljeS8fbxe_5DXmKJm6zqSI,4663
173
+ sentry_sdk/integrations/clickhouse_driver.py,sha256=mNYW99LlIOCk45aUeELY9cuHTfBorhwV0rGn2c_25EY,5905
174
+ sentry_sdk/integrations/cloud_resource_context.py,sha256=J_3Zd9czyI2NuWy09BEwGKzw80rQlVlJ1CQqTvJ5m18,7638
175
+ sentry_sdk/integrations/cohere.py,sha256=QxgvN7fy385-EwUPYFup98UiMxxg_zrE8h0yCfI5iT8,9495
176
+ sentry_sdk/integrations/dedupe.py,sha256=EApcjbAp_uEmsOSb0m64JNGaKUEqB46YCYX8AgTDO24,1903
177
+ sentry_sdk/integrations/django/__init__.py,sha256=T2N0ZnN4aI4AGJvoyNtYBBXtAKPrN3suuQPQi1Ks_r8,26421
178
+ sentry_sdk/integrations/django/__pycache__/__init__.cpython-312.pyc,,
179
+ sentry_sdk/integrations/django/__pycache__/asgi.cpython-312.pyc,,
180
+ sentry_sdk/integrations/django/__pycache__/caching.cpython-312.pyc,,
181
+ sentry_sdk/integrations/django/__pycache__/middleware.cpython-312.pyc,,
182
+ sentry_sdk/integrations/django/__pycache__/signals_handlers.cpython-312.pyc,,
183
+ sentry_sdk/integrations/django/__pycache__/tasks.cpython-312.pyc,,
184
+ sentry_sdk/integrations/django/__pycache__/templates.cpython-312.pyc,,
185
+ sentry_sdk/integrations/django/__pycache__/transactions.cpython-312.pyc,,
186
+ sentry_sdk/integrations/django/__pycache__/views.cpython-312.pyc,,
187
+ sentry_sdk/integrations/django/asgi.py,sha256=xi4ceuIgjFsvyBwzGOwHj_siLHqeYHuL5oLfPqDxPKQ,8436
188
+ sentry_sdk/integrations/django/caching.py,sha256=gXRDSZigV1SETmxDUe8GA53Tha2fo0ZAFOofuorQD30,6980
189
+ sentry_sdk/integrations/django/middleware.py,sha256=kSN4ORVUuO8Yvt0sC6YS8br2gk6gLmBwysIFsN9z2js,5917
190
+ sentry_sdk/integrations/django/signals_handlers.py,sha256=E1r1sYzKQHko6fD8bPC7_rXGH5kBVDwmE2sutWuGAwQ,3062
191
+ sentry_sdk/integrations/django/tasks.py,sha256=Bu_5jxvFNcAk9N8PedxhWP5Fq-Mj8zVRYdDRsd-6eJA,1151
192
+ sentry_sdk/integrations/django/templates.py,sha256=NZuSVDU0srkF_La5BpEIGqonCbK-qLKMMI3bqj2za3w,5695
193
+ sentry_sdk/integrations/django/transactions.py,sha256=zjsJgVFJZX-c5H_I-TXeLrvrw3OvgAWgmxYO5nTdhfQ,4918
194
+ sentry_sdk/integrations/django/views.py,sha256=Cz2H_yeZkPixOksVoi5urp6Xeq5p0YdDsBWz5aEJan4,3256
195
+ sentry_sdk/integrations/dramatiq.py,sha256=OIq0lIUbabt9widLKGkyjhvx9MKXNF9atP1gTsLheuQ,7438
196
+ sentry_sdk/integrations/excepthook.py,sha256=lhbIwr1eIj3k8shtmUGDOWI7Qr-OaT5kbgeHj-F217g,2371
197
+ sentry_sdk/integrations/executing.py,sha256=O5R4pw999PIisy7uksrMkRx0SsmfBFKI1BEpI3cTDz4,1981
198
+ sentry_sdk/integrations/falcon.py,sha256=zWKFTzoOY92BryGKsjKDAhVxFDhyop4G02js6OlmIXY,9277
199
+ sentry_sdk/integrations/fastapi.py,sha256=eebHCt2FiR7YQCaRJVOnXurltMZ4phRCZaL6BXSEpVI,4485
200
+ sentry_sdk/integrations/flask.py,sha256=KGG2Jbcw9SdwADJLg5GmBnVq3VywK7qQwdPv97RldmA,8519
201
+ sentry_sdk/integrations/gcp.py,sha256=Fao0nrrBpiG8_GHygzh6VvmM8bpJq0mkvgTwu5LT1a4,8400
202
+ sentry_sdk/integrations/gnu_backtrace.py,sha256=XND4pkVeprhDdvPgOi8pBgQQ9g3CtPA-wJK0e0ex1XE,2771
203
+ sentry_sdk/integrations/google_genai/__init__.py,sha256=Z-NX8wKGkSwlQupCcCgsarGV3qtpmeiT_wGNqpxweSM,14093
204
+ sentry_sdk/integrations/google_genai/__pycache__/__init__.cpython-312.pyc,,
205
+ sentry_sdk/integrations/google_genai/__pycache__/consts.cpython-312.pyc,,
206
+ sentry_sdk/integrations/google_genai/__pycache__/streaming.cpython-312.pyc,,
207
+ sentry_sdk/integrations/google_genai/__pycache__/utils.cpython-312.pyc,,
208
+ sentry_sdk/integrations/google_genai/consts.py,sha256=nqHKKSyGixrSoozA06BGVBFaUCsvZlvGoubUZGI1kB8,559
209
+ sentry_sdk/integrations/google_genai/streaming.py,sha256=JqhTcAX3DtEsZ033AXsnA0KAzQe_jqca-7cztTKDcRI,5416
210
+ sentry_sdk/integrations/google_genai/utils.py,sha256=WCTyevkQF2CdGpdlidONJSWSdDmnqEa0dcaiRX2luuk,34850
211
+ sentry_sdk/integrations/gql.py,sha256=oLssOmHYQ4e0OQ1ITer8TynvyLXtDUoM9X29-IsVAkI,5003
212
+ sentry_sdk/integrations/graphene.py,sha256=11XRvAsSgvCLcfF1xvdSeAf0vvJm72hfcE3toAS8jZI,5066
213
+ sentry_sdk/integrations/grpc/__init__.py,sha256=0Y3neOWwES1PsWJsluPF_2_VatFlVvg7IUjTYYdCs-U,6253
214
+ sentry_sdk/integrations/grpc/__pycache__/__init__.cpython-312.pyc,,
215
+ sentry_sdk/integrations/grpc/__pycache__/client.cpython-312.pyc,,
216
+ sentry_sdk/integrations/grpc/__pycache__/consts.cpython-312.pyc,,
217
+ sentry_sdk/integrations/grpc/__pycache__/server.cpython-312.pyc,,
218
+ sentry_sdk/integrations/grpc/aio/__init__.py,sha256=2rgrliowpPfLLw40_2YU6ixSzIu_3f8NN3TRplzc8S8,141
219
+ sentry_sdk/integrations/grpc/aio/__pycache__/__init__.cpython-312.pyc,,
220
+ sentry_sdk/integrations/grpc/aio/__pycache__/client.cpython-312.pyc,,
221
+ sentry_sdk/integrations/grpc/aio/__pycache__/server.cpython-312.pyc,,
222
+ sentry_sdk/integrations/grpc/aio/client.py,sha256=2GNrTWJt4iZ9dK-M3_0-sVWlUcfqKItHZI5u-uzesUM,3507
223
+ sentry_sdk/integrations/grpc/aio/server.py,sha256=cRBvz9GOJemvNOsCB9ymTUX70nhjljfzvX0bfR8Wc_o,3954
224
+ sentry_sdk/integrations/grpc/client.py,sha256=s7yLcbYgnq1-bdh6jvSOv6NwKTx0m9tLPt0q4qKI7kw,3404
225
+ sentry_sdk/integrations/grpc/consts.py,sha256=NpsN5gKWDmtGtVK_L5HscgFZBHqjOpmLJLGKyh8GZBA,31
226
+ sentry_sdk/integrations/grpc/server.py,sha256=h6vEibthsZB6hWNmqT50UlBaddEZSdbXio_rYhighW0,2470
227
+ sentry_sdk/integrations/httpx.py,sha256=MxoT_D8n902awI2hq-lHpxXkOvNwGZFuG9ap4l4VxL4,5315
228
+ sentry_sdk/integrations/huey.py,sha256=GOslGxuYfrdPLqx_4MWOH_tdwwnbD72SkTdc-P8uF80,5348
229
+ sentry_sdk/integrations/huggingface_hub.py,sha256=axCqdDlmYn7wANHgVBo3MDv9QgBr0MWj3lBYhPJJX7E,14966
230
+ sentry_sdk/integrations/langchain.py,sha256=Oa-YJ2WoHX8Huyrzs4aQ2xlaupFR5Ft8FC4_0b_zvBs,42903
231
+ sentry_sdk/integrations/langgraph.py,sha256=xb9InjYjd4jAQECWddl30QGRoHRbbhoYBeHscsM7C1A,13301
232
+ sentry_sdk/integrations/launchdarkly.py,sha256=RSTCAUOJCo6Npr4V3-7-vSOaZ531Jj1LNU5IftXL6_w,1918
233
+ sentry_sdk/integrations/litellm.py,sha256=1CLM3CBdIv0YZSloDPCfemIdlufNG41eWvCBRG_AFow,11550
234
+ sentry_sdk/integrations/litestar.py,sha256=oJIz1vv2JgIT_AriKXDTQvQjkMOmwKfT68DFrCfdJ-8,11698
235
+ sentry_sdk/integrations/logging.py,sha256=eLjza7xDrfBszOH4gDHxPtSM3WfiWsw9WGR3GUfnA-A,13739
236
+ sentry_sdk/integrations/loguru.py,sha256=2UaLLzXVEOV0cYxOU5n9GpkmNtC_bA1KzCzEnrqIilE,6505
237
+ sentry_sdk/integrations/mcp.py,sha256=4t7cVGwqjoqyNSd4HrwzjfM7fw2B5HYWRJoT6wOw6AA,22192
238
+ sentry_sdk/integrations/modules.py,sha256=7kmS9T-werRkKN0w51skyDmTMuZbdz9L28RgXPsoDeU,786
239
+ sentry_sdk/integrations/openai.py,sha256=8UquHKCu17tS6Afs90HLDa4mpx-7ErRl4nUh1OKwdbE,33167
240
+ sentry_sdk/integrations/openai_agents/__init__.py,sha256=0FHafAmsKZKNtijv3ZRxS9b2xoUvfHbbgFuWZ3eSbRo,10709
241
+ sentry_sdk/integrations/openai_agents/__pycache__/__init__.cpython-312.pyc,,
242
+ sentry_sdk/integrations/openai_agents/__pycache__/consts.cpython-312.pyc,,
243
+ sentry_sdk/integrations/openai_agents/__pycache__/utils.cpython-312.pyc,,
244
+ sentry_sdk/integrations/openai_agents/consts.py,sha256=PTb3vlqkuMPktu21ALK72o5WMIX4-cewTEiTRdHKFdQ,38
245
+ sentry_sdk/integrations/openai_agents/patches/__init__.py,sha256=62qvLBPGajoxGi4C41eBOztIiB8MGRpmP2i3GSD_22k,383
246
+ sentry_sdk/integrations/openai_agents/patches/__pycache__/__init__.cpython-312.pyc,,
247
+ sentry_sdk/integrations/openai_agents/patches/__pycache__/agent_run.cpython-312.pyc,,
248
+ sentry_sdk/integrations/openai_agents/patches/__pycache__/error_tracing.cpython-312.pyc,,
249
+ sentry_sdk/integrations/openai_agents/patches/__pycache__/models.cpython-312.pyc,,
250
+ sentry_sdk/integrations/openai_agents/patches/__pycache__/runner.cpython-312.pyc,,
251
+ sentry_sdk/integrations/openai_agents/patches/__pycache__/tools.cpython-312.pyc,,
252
+ sentry_sdk/integrations/openai_agents/patches/agent_run.py,sha256=OhLQLEB7LQEMRs6BkQ0jfyY6QaZ-lhP5yZ_b44KTRV0,8257
253
+ sentry_sdk/integrations/openai_agents/patches/error_tracing.py,sha256=Fzs4oAyGWnmbaeLMMRnr9m6F6M2N9Cm0i4GSPsJbNqI,2237
254
+ sentry_sdk/integrations/openai_agents/patches/models.py,sha256=mKUbs01jhNIs06fKgP6nraRB-5m_UvCGkvK0Xu4cuDg,7278
255
+ sentry_sdk/integrations/openai_agents/patches/runner.py,sha256=gJLrC334MrfcSCxG93__iOh3jcROGWw4YKHXcHJ5dYY,6837
256
+ sentry_sdk/integrations/openai_agents/patches/tools.py,sha256=9I7GOIY-X3WyegTMMq-KIbs5DQ3Qg5JHHtIrEU3eivA,2823
257
+ sentry_sdk/integrations/openai_agents/spans/__init__.py,sha256=uzqInMjqfA_MtdTJoRRH10Z4cVtVESbi7qc5sC_zr8U,393
258
+ sentry_sdk/integrations/openai_agents/spans/__pycache__/__init__.cpython-312.pyc,,
259
+ sentry_sdk/integrations/openai_agents/spans/__pycache__/agent_workflow.cpython-312.pyc,,
260
+ sentry_sdk/integrations/openai_agents/spans/__pycache__/ai_client.cpython-312.pyc,,
261
+ sentry_sdk/integrations/openai_agents/spans/__pycache__/execute_tool.cpython-312.pyc,,
262
+ sentry_sdk/integrations/openai_agents/spans/__pycache__/handoff.cpython-312.pyc,,
263
+ sentry_sdk/integrations/openai_agents/spans/__pycache__/invoke_agent.cpython-312.pyc,,
264
+ sentry_sdk/integrations/openai_agents/spans/agent_workflow.py,sha256=lqr87hbaV1GCXX40cGQLSlf3U6V-nmD2zj5kr2w9kmk,459
265
+ sentry_sdk/integrations/openai_agents/spans/ai_client.py,sha256=RbJNDn9DXM4NJLHJBIodRp1GK-kQmclYCD5DgOV2WTI,2265
266
+ sentry_sdk/integrations/openai_agents/spans/execute_tool.py,sha256=ePm9PMnEzoYZW8M04dnDRINV7dRnDKtHa4YFn1InqZY,1612
267
+ sentry_sdk/integrations/openai_agents/spans/handoff.py,sha256=icCbs6HOFrVXY4V1LILCoyV-qxNntIBhgx8AFHW6MGo,723
268
+ sentry_sdk/integrations/openai_agents/spans/invoke_agent.py,sha256=VWB1HmR3oyoTqi3ZrHIHY37ageQnPsap5C81bLcxrOA,3592
269
+ sentry_sdk/integrations/openai_agents/utils.py,sha256=MvbPG-dh5UJzE50y-Q9xUeSuuu_Ot642SvD1H0hIcuY,8910
270
+ sentry_sdk/integrations/openfeature.py,sha256=w6bYjrHJdjIpziEvEBocCbxdryO2eDZGGOoc7g8jNEU,1102
271
+ sentry_sdk/integrations/opentelemetry/__init__.py,sha256=emNL5aAq_NhK0PZmfX_g4GIdvBS6nHqGrjrIgrdC5m8,229
272
+ sentry_sdk/integrations/opentelemetry/__pycache__/__init__.cpython-312.pyc,,
273
+ sentry_sdk/integrations/opentelemetry/__pycache__/consts.cpython-312.pyc,,
274
+ sentry_sdk/integrations/opentelemetry/__pycache__/integration.cpython-312.pyc,,
275
+ sentry_sdk/integrations/opentelemetry/__pycache__/propagator.cpython-312.pyc,,
276
+ sentry_sdk/integrations/opentelemetry/__pycache__/span_processor.cpython-312.pyc,,
277
+ sentry_sdk/integrations/opentelemetry/consts.py,sha256=6yhH65VVzzIjpU8MRYItLKkskkBsLlbsO-_2MULlEHQ,275
278
+ sentry_sdk/integrations/opentelemetry/integration.py,sha256=VPQlj-tDBgjSRXOxtrxeFpv9Mc_OqX7cC997gMy4CkY,1742
279
+ sentry_sdk/integrations/opentelemetry/propagator.py,sha256=ofVK4IJ9zhSN0cRmw_Cr15IHK0XckmBy8WqA_oaGa5Q,3973
280
+ sentry_sdk/integrations/opentelemetry/span_processor.py,sha256=YK-k88VMoJNrOEXMqYaVM_Y-FywN2hQjdrodLeqGegw,13222
281
+ sentry_sdk/integrations/otlp.py,sha256=5h0hFY19myYSg_buSAutomC_8_BfRtDyHyLE9XD_A8A,7713
282
+ sentry_sdk/integrations/pure_eval.py,sha256=SLN-omknLQWyeH1_3v3Qi-puFCAliYKQqi0wdDt3Keg,4560
283
+ sentry_sdk/integrations/pydantic_ai/__init__.py,sha256=YrUxvNgPIznxzVaniyLTorkkTmlYyosAbA37tFOTR68,1465
284
+ sentry_sdk/integrations/pydantic_ai/__pycache__/__init__.cpython-312.pyc,,
285
+ sentry_sdk/integrations/pydantic_ai/__pycache__/consts.cpython-312.pyc,,
286
+ sentry_sdk/integrations/pydantic_ai/__pycache__/utils.cpython-312.pyc,,
287
+ sentry_sdk/integrations/pydantic_ai/consts.py,sha256=fxOQ5n_Do8EqqqxtOJm5zyvhQmOV75HACNrt_-zGngs,36
288
+ sentry_sdk/integrations/pydantic_ai/patches/__init__.py,sha256=_RHvjc3436KSwPjzrAdnyascgggxg5e0MQpdHhmiS-U,229
289
+ sentry_sdk/integrations/pydantic_ai/patches/__pycache__/__init__.cpython-312.pyc,,
290
+ sentry_sdk/integrations/pydantic_ai/patches/__pycache__/agent_run.cpython-312.pyc,,
291
+ sentry_sdk/integrations/pydantic_ai/patches/__pycache__/graph_nodes.cpython-312.pyc,,
292
+ sentry_sdk/integrations/pydantic_ai/patches/__pycache__/model_request.cpython-312.pyc,,
293
+ sentry_sdk/integrations/pydantic_ai/patches/__pycache__/tools.cpython-312.pyc,,
294
+ sentry_sdk/integrations/pydantic_ai/patches/agent_run.py,sha256=iGafWMXefgs_1HQfV_rK4Pe5wJex4vLbAdX-O83fxQQ,7345
295
+ sentry_sdk/integrations/pydantic_ai/patches/graph_nodes.py,sha256=TDkIjRUxGGt0UbC3FzgaCJNJN1s682IXoVFO07biL6E,3793
296
+ sentry_sdk/integrations/pydantic_ai/patches/model_request.py,sha256=15uGRv0UXjZcEc9b6Jzx7o9yhToUQfd4KRLZm_OWMm8,1240
297
+ sentry_sdk/integrations/pydantic_ai/patches/tools.py,sha256=soSA-ugtq-EdkkbfXV_mAPC-AfEaCM_6E_c1kvg_IQk,3822
298
+ sentry_sdk/integrations/pydantic_ai/spans/__init__.py,sha256=dTUjvkw7VMOAiSasuAq37q_njvANsUlgfZxgXRKJDDo,243
299
+ sentry_sdk/integrations/pydantic_ai/spans/__pycache__/__init__.cpython-312.pyc,,
300
+ sentry_sdk/integrations/pydantic_ai/spans/__pycache__/ai_client.cpython-312.pyc,,
301
+ sentry_sdk/integrations/pydantic_ai/spans/__pycache__/execute_tool.cpython-312.pyc,,
302
+ sentry_sdk/integrations/pydantic_ai/spans/__pycache__/invoke_agent.cpython-312.pyc,,
303
+ sentry_sdk/integrations/pydantic_ai/spans/__pycache__/utils.cpython-312.pyc,,
304
+ sentry_sdk/integrations/pydantic_ai/spans/ai_client.py,sha256=aAEjW2eh317ZFQJwJi2_jJHnWaKtdWdJFkuaU-BFbj8,10508
305
+ sentry_sdk/integrations/pydantic_ai/spans/execute_tool.py,sha256=0k-eIN1tltSOQf2kFdUMGKbyWhKYTNZE8dJF9vkOfnE,1542
306
+ sentry_sdk/integrations/pydantic_ai/spans/invoke_agent.py,sha256=Mbq_7NkqD5oyjY-_AOhblliYNT_Ij6z-MEMfZl4xyyY,5742
307
+ sentry_sdk/integrations/pydantic_ai/spans/utils.py,sha256=VSWLcgjrUwNSR_31TFM5BRG0EsGrc3X8BWIxM4v0HZI,1763
308
+ sentry_sdk/integrations/pydantic_ai/utils.py,sha256=OWFa15_jQ7H4gHfMAUQHmSVclLVrF_DBb7heaPZ89HY,7091
309
+ sentry_sdk/integrations/pymongo.py,sha256=whIHxnsiz245UNT7yKMP1OqfAsBdC7ITy7kuice0uwE,6275
310
+ sentry_sdk/integrations/pyramid.py,sha256=EzK6W2VB4DdDfV_VXnp5ERibM27mVK-JGib3k9Khc4E,7208
311
+ sentry_sdk/integrations/quart.py,sha256=4GTx6E7auhLeYXyY8WoloqSghbeU14Jcz-gfDL4o3aE,7258
312
+ sentry_sdk/integrations/ray.py,sha256=8zLUo2ZFl9KEt88X-yJYVVoDphJUTWRZRT0I5OQ1PCk,5884
313
+ sentry_sdk/integrations/redis/__init__.py,sha256=TzeEWrYcDO9u3q5WXT8J9Zk5Ft9nrWW1PsE3i0ULGUg,1662
314
+ sentry_sdk/integrations/redis/__pycache__/__init__.cpython-312.pyc,,
315
+ sentry_sdk/integrations/redis/__pycache__/_async_common.cpython-312.pyc,,
316
+ sentry_sdk/integrations/redis/__pycache__/_sync_common.cpython-312.pyc,,
317
+ sentry_sdk/integrations/redis/__pycache__/consts.cpython-312.pyc,,
318
+ sentry_sdk/integrations/redis/__pycache__/rb.cpython-312.pyc,,
319
+ sentry_sdk/integrations/redis/__pycache__/redis.cpython-312.pyc,,
320
+ sentry_sdk/integrations/redis/__pycache__/redis_cluster.cpython-312.pyc,,
321
+ sentry_sdk/integrations/redis/__pycache__/redis_py_cluster_legacy.cpython-312.pyc,,
322
+ sentry_sdk/integrations/redis/__pycache__/utils.cpython-312.pyc,,
323
+ sentry_sdk/integrations/redis/_async_common.py,sha256=V05_CgSyWBKBLQ4Udkg4LT5ovPAnP5iqsvfeqsmLEwU,4058
324
+ sentry_sdk/integrations/redis/_sync_common.py,sha256=3seE5lTc2fDNLXutpe41Qv-3pdRVbLHFGCQaT9OXzvc,3776
325
+ sentry_sdk/integrations/redis/consts.py,sha256=y2f-FJ7TIkzto01tyjXvbKVSVELVkjZaxj3FG5DZ0hA,480
326
+ sentry_sdk/integrations/redis/modules/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
327
+ sentry_sdk/integrations/redis/modules/__pycache__/__init__.cpython-312.pyc,,
328
+ sentry_sdk/integrations/redis/modules/__pycache__/caches.cpython-312.pyc,,
329
+ sentry_sdk/integrations/redis/modules/__pycache__/queries.cpython-312.pyc,,
330
+ sentry_sdk/integrations/redis/modules/caches.py,sha256=cV64P6UWHXolvTtog5jQFHtR4XFd5IqA5vVYXc8xO2E,4020
331
+ sentry_sdk/integrations/redis/modules/queries.py,sha256=lRwn72AUvk5a29n43UEpx6l9ix_xC2htHcl4cHKsdcw,1941
332
+ sentry_sdk/integrations/redis/rb.py,sha256=we3oh4fEhiirglutAllZe55sAoz6vKtaQt1tvLPGoX4,791
333
+ sentry_sdk/integrations/redis/redis.py,sha256=NaaefF_0poC8-9citN6etyAC6WOabxBVOdH_J8FJ7ec,1684
334
+ sentry_sdk/integrations/redis/redis_cluster.py,sha256=rBbj1WRa_-NAR6tTQHKQMF40WjvNb6o6FoojImIE03Y,3530
335
+ sentry_sdk/integrations/redis/redis_py_cluster_legacy.py,sha256=YREwCLBkx_SDVkzNLGNcgCr4yi_mdomlwgKlVU8bgSU,1570
336
+ sentry_sdk/integrations/redis/utils.py,sha256=i818myPF61yl4EIS-t3ptyYCa2IrsaAWeFR0cqS0QNY,3954
337
+ sentry_sdk/integrations/rq.py,sha256=wyerb5iiAJbekL5-kdOpwQzQiJiWIRlX49il1aEnUWg,5278
338
+ sentry_sdk/integrations/rust_tracing.py,sha256=p3aFy8Gs8Dgvvlv9PSrbl7JDYmA5zus-fTichRzTiLM,9101
339
+ sentry_sdk/integrations/sanic.py,sha256=OJ570xdV4luxJY9_b4gutVf8gQhNpqyZHxyT3evydLM,12700
340
+ sentry_sdk/integrations/serverless.py,sha256=eZSd0NPYBm3ZGdYFmDYlXDtAD_Pe_aUGV77bCzJhoEI,1621
341
+ sentry_sdk/integrations/socket.py,sha256=fj-4yzw7uecKDa4Jl_8AN_Nb9KSIvCh3iR8oqaLsYaQ,3164
342
+ sentry_sdk/integrations/spark/__init__.py,sha256=oOewMErnZk2rzNvIlZO6URxQexu9bUJuSLM2m_zECy8,208
343
+ sentry_sdk/integrations/spark/__pycache__/__init__.cpython-312.pyc,,
344
+ sentry_sdk/integrations/spark/__pycache__/spark_driver.cpython-312.pyc,,
345
+ sentry_sdk/integrations/spark/__pycache__/spark_worker.cpython-312.pyc,,
346
+ sentry_sdk/integrations/spark/spark_driver.py,sha256=mYX1ohjUD7wrc7F16OhvNyyOGqgQ7QM6bFzfBb_MUZY,8896
347
+ sentry_sdk/integrations/spark/spark_worker.py,sha256=PtRzOBjrG0__Au-Uidd8cO5zpx-EJY_D13H9CG900RE,3632
348
+ sentry_sdk/integrations/sqlalchemy.py,sha256=Q6JQxDcwo2U-3ggB_wUhQZps1gzzXz6-Yf20dcCMUWU,4347
349
+ sentry_sdk/integrations/starlette.py,sha256=1cSqN0IARPGi6kLitW93ASlTJIAFFUxsyWnS8QMMzJY,26130
350
+ sentry_sdk/integrations/starlite.py,sha256=GC_FaL-cyjeUtsRIJ29jMR4HlYXRwI-JRYy7GFn5QAI,10451
351
+ sentry_sdk/integrations/statsig.py,sha256=ceSjGymN4Fqdnk8Tz-L5teHz0eNOJtptB_W0Y1VZu2o,1214
352
+ sentry_sdk/integrations/stdlib.py,sha256=2yN0_v-fokZwgfnsKBvcIdl4tQ-QpLUnoQzYyOX_gH4,9376
353
+ sentry_sdk/integrations/strawberry.py,sha256=cTY1L8ihyR5mCiF2H5ZEM59t0BMwNmWqwNkTIuIrFUA,14120
354
+ sentry_sdk/integrations/sys_exit.py,sha256=nLbBnbyPjUgyhsI1VkmTxVAOgdlenf-dFtIqzFTZNOI,2408
355
+ sentry_sdk/integrations/threading.py,sha256=jUsmnGEPoi6XGhuLZP3pWKJCDksul-e7uLwAyPK_vUI,7107
356
+ sentry_sdk/integrations/tornado.py,sha256=jh_tm6UR-m-OPFlTKD4eOYEsyex2Lm9rw7_UpZ-1dCQ,7212
357
+ sentry_sdk/integrations/trytond.py,sha256=FjAKIKDTRmEc1JUFsHkcZhslCOD7RsT-pKYoSg1Qyuk,1750
358
+ sentry_sdk/integrations/typer.py,sha256=hvLL7llTverbhjdC8eU5r0ps39XVf2qI27C35VWyXrE,1833
359
+ sentry_sdk/integrations/unleash.py,sha256=Gz3qAKB0fTtOgzCOJV24ocW1NbdcDkkqijrWDE6Oi7E,1045
360
+ sentry_sdk/integrations/unraisablehook.py,sha256=zThW8f6uFGFfwpPHFXq_pLbnaQRT-1tmltyOPVfTPzM,1717
361
+ sentry_sdk/integrations/wsgi.py,sha256=w6BKB-aCaGKl6tfWlKL2SlifY4ZErQaEH5oCXg_bcFg,10503
362
+ sentry_sdk/logger.py,sha256=3hfleFTl9n6ZlZeb-F8jVVMGoae_SU8fOZd0N6HWbuA,2683
363
+ sentry_sdk/metrics.py,sha256=z8futlRltjsPeTkU_We4HAanxeopDaWtfJTkAumwL68,1470
364
+ sentry_sdk/monitor.py,sha256=YRmomlJySjuuCHLJPgGMhRbHXun651He5QZXUvKWZp0,3443
365
+ sentry_sdk/profiler/__init__.py,sha256=3PI3bHk9RSkkOXZKN84DDedk_7M65EiqqaIGo-DYs0E,1291
366
+ sentry_sdk/profiler/__pycache__/__init__.cpython-312.pyc,,
367
+ sentry_sdk/profiler/__pycache__/continuous_profiler.cpython-312.pyc,,
368
+ sentry_sdk/profiler/__pycache__/transaction_profiler.cpython-312.pyc,,
369
+ sentry_sdk/profiler/__pycache__/utils.cpython-312.pyc,,
370
+ sentry_sdk/profiler/continuous_profiler.py,sha256=lRZa2yF81ByIh1yQVHdsK-Ur5jRMR6aKJuPCkuHn0pI,22550
371
+ sentry_sdk/profiler/transaction_profiler.py,sha256=AJicxLx2aPcnTBm5E6GGx_34j-y2yD907_OsHtN6xyQ,27248
372
+ sentry_sdk/profiler/utils.py,sha256=Y-GN2SoAKa12nxwUQRvI5LTYEWfIkhOvoCPG6B4K_N0,6397
373
+ sentry_sdk/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
374
+ sentry_sdk/scope.py,sha256=HtA_ivY_Syuw2sxXiXCgp1PsRg46CetAuVeth_slfzk,68615
375
+ sentry_sdk/scrubber.py,sha256=pEJECHaWzg0vf93S_uYe9PgStTImDXjLFu0pzIJ6XAM,5953
376
+ sentry_sdk/serializer.py,sha256=W3HrcoJ9iMzHa6j9Nmu5Y7Qopbkon3ovxuPxRM5R4pU,13199
377
+ sentry_sdk/session.py,sha256=ajYuxKt5cKfxl_OjcF0-g5IpowrJhvMe9rjTq1CaQac,5274
378
+ sentry_sdk/sessions.py,sha256=-k0OGB28DoWbb7nIW4ML8R_fiF5nLnboWqObnfyZnWQ,8932
379
+ sentry_sdk/spotlight.py,sha256=0bJrMpoT47xS2LJzpDfUZ5z5YEhdUla78r0YDtVm2Zo,12191
380
+ sentry_sdk/traces.py,sha256=1oVGyMonl10vIrFokfD5UBHXQoPIu4rP2rtnKL3Ju6Y,4568
381
+ sentry_sdk/tracing.py,sha256=OkIHt-BYfYhLe9MyEcTUlgN_n0CjqQn-BJ0K_9gnxaY,50512
382
+ sentry_sdk/tracing_utils.py,sha256=aItaWdi76Q0-HGsILrkNesctXp76BkriTol-jXSZjfI,43300
383
+ sentry_sdk/transport.py,sha256=P21HMCcD58PJfRNPA7Y39hlMqIFK5RsPCMwsy-QzXAQ,32141
384
+ sentry_sdk/types.py,sha256=A92AqvfrGQZ9KY6FaUjKfL9F1HK7Ui3heQilVzfzYCs,1269
385
+ sentry_sdk/utils.py,sha256=j6aRWZoMqiiN3Z_woIvw81E-rypx-CBxg5cQGmBbOIQ,65101
386
+ sentry_sdk/worker.py,sha256=djRCygOJFRmdXBS4lRSF4RTljVZ47BWEal85WlTD0uo,4257
source/sentry_sdk-2.53.0.dist-info/WHEEL ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.0)
3
+ Root-Is-Purelib: true
4
+ Tag: py2-none-any
5
+ Tag: py3-none-any
6
+
source/sentry_sdk-2.53.0.dist-info/entry_points.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ [opentelemetry_propagator]
2
+ sentry = sentry_sdk.integrations.opentelemetry:SentryPropagator
source/sentry_sdk-2.53.0.dist-info/licenses/LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2018 Functional Software, Inc. dba Sentry
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
source/sentry_sdk-2.53.0.dist-info/top_level.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ sentry_sdk
source/sentry_sdk/__init__.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Public entry point of the Sentry SDK: re-exports the user-facing API."""

from sentry_sdk import profiler
from sentry_sdk import metrics
from sentry_sdk.scope import Scope
from sentry_sdk.transport import Transport, HttpTransport
from sentry_sdk.client import Client

# Star import deliberately pulls the whole user-facing API surface into this
# namespace; the names are re-declared in __all__ below.
from sentry_sdk.api import *  # noqa
from sentry_sdk.consts import VERSION

# Explicit public API of the package.
__all__ = [  # noqa
    "Hub",
    "Scope",
    "Client",
    "Transport",
    "HttpTransport",
    "VERSION",
    "integrations",
    # From sentry_sdk.api
    "init",
    "add_attachment",
    "add_breadcrumb",
    "capture_event",
    "capture_exception",
    "capture_message",
    "configure_scope",
    "continue_trace",
    "flush",
    "get_baggage",
    "get_client",
    "get_global_scope",
    "get_isolation_scope",
    "get_current_scope",
    "get_current_span",
    "get_traceparent",
    "is_initialized",
    "isolation_scope",
    "last_event_id",
    "new_scope",
    "push_scope",
    "set_context",
    "set_extra",
    "set_level",
    "set_measurement",
    "set_tag",
    "set_tags",
    "set_user",
    "start_span",
    "start_transaction",
    "trace",
    "monitor",
    "logger",
    "metrics",
    "profiler",
    "start_session",
    "end_session",
    "set_transaction_name",
    "update_current_span",
]

# Initialize the debug support after everything is loaded
from sentry_sdk.debug import init_debug_support

init_debug_support()
# Remove the helper from the module namespace so it is not part of the API.
del init_debug_support

# circular imports
from sentry_sdk.hub import Hub
source/sentry_sdk/_batcher.py ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import random
3
+ import threading
4
+ from datetime import datetime, timezone
5
+ from typing import TYPE_CHECKING, TypeVar, Generic
6
+
7
+ from sentry_sdk.utils import format_timestamp, safe_repr, serialize_attribute
8
+ from sentry_sdk.envelope import Envelope, Item, PayloadRef
9
+
10
+ if TYPE_CHECKING:
11
+ from typing import Optional, Callable, Any
12
+
13
+ T = TypeVar("T")
14
+
15
+
16
class Batcher(Generic[T]):
    """Buffers items of type ``T`` and periodically ships them in envelopes.

    A daemon flusher thread wakes up roughly every ``FLUSH_WAIT_TIME`` seconds
    (plus jitter) and sends the buffered items via ``capture_func``.
    Subclasses are expected to set ``TYPE``/``CONTENT_TYPE`` and implement
    ``_to_transport_format`` (and optionally ``_record_lost``).
    """

    # Request an early flush once the buffer reaches this many items.
    MAX_BEFORE_FLUSH = 100
    # Drop (and record as lost) new items once the buffer reaches this size.
    MAX_BEFORE_DROP = 1_000
    # Base number of seconds the flusher sleeps between flushes.
    FLUSH_WAIT_TIME = 5.0

    # Envelope item type / content type; overridden by subclasses.
    TYPE = ""
    CONTENT_TYPE = ""

    def __init__(
        self,
        capture_func: "Callable[[Envelope], None]",
        record_lost_func: "Callable[..., None]",
    ) -> None:
        # capture_func sends a finished envelope; record_lost_func is stored
        # for subclasses to report client-side data loss (see _record_lost).
        self._buffer: "list[T]" = []
        self._capture_func = capture_func
        self._record_lost_func = record_lost_func
        self._running = True
        # Guards _buffer and the flusher-thread bookkeeping below.
        self._lock = threading.Lock()

        # Set to wake the flusher early (buffer full) or to shut it down.
        self._flush_event: "threading.Event" = threading.Event()

        self._flusher: "Optional[threading.Thread]" = None
        # PID that owns the current flusher thread; used to detect forks.
        self._flusher_pid: "Optional[int]" = None

    def _ensure_thread(self) -> bool:
        """For forking processes we might need to restart this thread.
        This ensures that our process actually has that thread running.
        """
        if not self._running:
            return False

        pid = os.getpid()
        # Fast path: the flusher already belongs to this process.
        if self._flusher_pid == pid:
            return True

        with self._lock:
            # Recheck to make sure another thread didn't get here and start the
            # the flusher in the meantime
            if self._flusher_pid == pid:
                return True

            self._flusher_pid = pid

            self._flusher = threading.Thread(target=self._flush_loop)
            self._flusher.daemon = True

            try:
                self._flusher.start()
            except RuntimeError:
                # Unfortunately at this point the interpreter is in a state that no
                # longer allows us to spawn a thread and we have to bail.
                self._running = False
                return False

        return True

    def _flush_loop(self) -> None:
        # Daemon-thread body: wait (with up to 1s of random jitter to spread
        # out flushes across processes), then flush whatever accumulated.
        while self._running:
            self._flush_event.wait(self.FLUSH_WAIT_TIME + random.random())
            self._flush_event.clear()
            self._flush()

    def add(self, item: "T") -> None:
        """Append *item* to the buffer, dropping it if the buffer is full."""
        if not self._ensure_thread() or self._flusher is None:
            return None

        with self._lock:
            if len(self._buffer) >= self.MAX_BEFORE_DROP:
                self._record_lost(item)
                return None

            self._buffer.append(item)
            # Buffer is getting large: wake the flusher early.
            if len(self._buffer) >= self.MAX_BEFORE_FLUSH:
                self._flush_event.set()

    def kill(self) -> None:
        """Stop the flusher thread (does not join it; the loop exits on its own)."""
        if self._flusher is None:
            return

        self._running = False
        # Wake the flusher so it observes _running == False promptly.
        self._flush_event.set()
        self._flusher = None

    def flush(self) -> None:
        """Synchronously flush the current buffer from the calling thread."""
        self._flush()

    def _add_to_envelope(self, envelope: "Envelope") -> None:
        # Serialize the whole buffer as a single envelope item.
        # NOTE: reads self._buffer without taking the lock; the only caller
        # (_flush) already holds it.
        envelope.add_item(
            Item(
                type=self.TYPE,
                content_type=self.CONTENT_TYPE,
                headers={
                    "item_count": len(self._buffer),
                },
                payload=PayloadRef(
                    json={
                        "items": [
                            self._to_transport_format(item) for item in self._buffer
                        ]
                    }
                ),
            )
        )

    def _flush(self) -> "Optional[Envelope]":
        """Drain the buffer into an envelope and send it.

        Returns the sent envelope, or None if the buffer was empty.
        """
        envelope = Envelope(
            headers={"sent_at": format_timestamp(datetime.now(timezone.utc))}
        )
        with self._lock:
            if len(self._buffer) == 0:
                return None

            self._add_to_envelope(envelope)
            self._buffer.clear()

        # Send outside the lock so a slow transport doesn't block producers.
        self._capture_func(envelope)
        return envelope

    def _record_lost(self, item: "T") -> None:
        # Hook for subclasses: report *item* as dropped (client discard).
        pass

    @staticmethod
    def _to_transport_format(item: "T") -> "Any":
        # Hook for subclasses: convert *item* to its JSON-serializable form.
        pass
source/sentry_sdk/_compat.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+
3
+ from typing import TYPE_CHECKING
4
+
5
+ if TYPE_CHECKING:
6
+ from typing import Any
7
+ from typing import TypeVar
8
+
9
+ T = TypeVar("T")
10
+
11
+
12
# Feature-detection flags for the running interpreter version.
# Tuple comparison generalizes the previous `major == 3 and minor >= N`
# checks, which would wrongly report False on any future major version bump;
# on every Python 3.x interpreter the values are identical.
PY37 = sys.version_info >= (3, 7)
PY38 = sys.version_info >= (3, 8)
PY310 = sys.version_info >= (3, 10)
PY311 = sys.version_info >= (3, 11)
16
+
17
+
18
def with_metaclass(meta: "Any", *bases: "Any") -> "Any":
    """Return a dummy base class whose subclassing invokes *meta*.

    Deriving from the returned temporary class causes the derived class to be
    constructed as ``meta(name, bases, namespace)`` instead — the classic
    py2/py3-compatible way of declaring a metaclass.
    """

    class _Intercept(type):
        # Triggered when the temporary class is subclassed: build the real
        # class with the requested metaclass and the captured bases.
        def __new__(
            shim: "Any", clsname: "Any", _tmp_bases: "Any", namespace: "Any"
        ) -> "Any":
            return meta(clsname, bases, namespace)

    return type.__new__(_Intercept, "temporary_class", (), {})
24
+
25
+
26
def check_uwsgi_thread_support() -> bool:
    """Return True when the environment has thread support the SDK can use.

    Outside of uWSGI this is always True. Under uWSGI we check two things:

    1. uWSGI doesn't run in threaded mode by default -- issue a warning if
       that's the case.

    2. Additionally, if uWSGI is running in preforking mode (default), it needs
       the --py-call-uwsgi-fork-hooks option for the SDK to work properly. This
       is because any background threads spawned before the main process is
       forked are NOT CLEANED UP IN THE CHILDREN BY DEFAULT even if
       --enable-threads is on. One has to explicitly provide
       --py-call-uwsgi-fork-hooks to force uWSGI to run regular cpython
       after-fork hooks that take care of cleaning up stale thread data.
    """
    try:
        from uwsgi import opt  # type: ignore
    except ImportError:
        # Not running under uWSGI at all.
        return True

    from sentry_sdk.consts import FALSE_VALUES

    def enabled(option: str) -> bool:
        """Return whether the given uWSGI option is set to a truthy value."""
        value = opt.get(option, False)
        if isinstance(value, bool):
            return value

        if isinstance(value, bytes):
            try:
                value = value.decode()
            except Exception:
                pass

        # Fix: coerce to bool so a raw falsy value (e.g. b"" or "") is never
        # returned from a function annotated as returning bool.
        return bool(value) and str(value).lower() not in FALSE_VALUES

    # When `threads` is passed in as a uwsgi option,
    # `enable-threads` is implied on.
    threads_enabled = "threads" in opt or enabled("enable-threads")
    fork_hooks_on = enabled("py-call-uwsgi-fork-hooks")
    lazy_mode = enabled("lazy-apps") or enabled("lazy")

    if lazy_mode and not threads_enabled:
        from warnings import warn

        warn(
            Warning(
                "IMPORTANT: "
                "We detected the use of uWSGI without thread support. "
                "This might lead to unexpected issues. "
                'Please run uWSGI with "--enable-threads" for full support.'
            )
        )

        return False

    elif not lazy_mode and (not threads_enabled or not fork_hooks_on):
        from warnings import warn

        warn(
            Warning(
                "IMPORTANT: "
                "We detected the use of uWSGI in preforking mode without "
                "thread support. This might lead to crashing workers. "
                'Please run uWSGI with both "--enable-threads" and '
                '"--py-call-uwsgi-fork-hooks" for full support.'
            )
        )

        return False

    return True
source/sentry_sdk/_init_implementation.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import warnings
2
+
3
+ from typing import TYPE_CHECKING
4
+
5
+ import sentry_sdk
6
+
7
+ if TYPE_CHECKING:
8
+ from typing import Any, ContextManager, Optional
9
+
10
+ import sentry_sdk.consts
11
+
12
+
13
+ class _InitGuard:
14
+ _CONTEXT_MANAGER_DEPRECATION_WARNING_MESSAGE = (
15
+ "Using the return value of sentry_sdk.init as a context manager "
16
+ "and manually calling the __enter__ and __exit__ methods on the "
17
+ "return value are deprecated. We are no longer maintaining this "
18
+ "functionality, and we will remove it in the next major release."
19
+ )
20
+
21
+ def __init__(self, client: "sentry_sdk.Client") -> None:
22
+ self._client = client
23
+
24
+ def __enter__(self) -> "_InitGuard":
25
+ warnings.warn(
26
+ self._CONTEXT_MANAGER_DEPRECATION_WARNING_MESSAGE,
27
+ stacklevel=2,
28
+ category=DeprecationWarning,
29
+ )
30
+
31
+ return self
32
+
33
+ def __exit__(self, exc_type: "Any", exc_value: "Any", tb: "Any") -> None:
34
+ warnings.warn(
35
+ self._CONTEXT_MANAGER_DEPRECATION_WARNING_MESSAGE,
36
+ stacklevel=2,
37
+ category=DeprecationWarning,
38
+ )
39
+
40
+ c = self._client
41
+ if c is not None:
42
+ c.close()
43
+
44
+
45
+ def _check_python_deprecations() -> None:
46
+ # Since we're likely to deprecate Python versions in the future, I'm keeping
47
+ # this handy function around. Use this to detect the Python version used and
48
+ # to output logger.warning()s if it's deprecated.
49
+ pass
50
+
51
+
52
def _init(*args: "Optional[str]", **kwargs: "Any") -> "ContextManager[Any]":
    """Initializes the SDK and optionally integrations.

    This takes the same arguments as the client constructor.
    """
    # Build the client, bind it to the global scope, then hand back the
    # (deprecated) context-manager guard around it.
    client = sentry_sdk.Client(*args, **kwargs)
    sentry_sdk.get_global_scope().set_client(client)
    _check_python_deprecations()
    return _InitGuard(client)
62
+
63
+
64
# At runtime `init` is simply an alias for `_init`; the class branch below
# exists only for static analyzers and is never executed.
if TYPE_CHECKING:
    # Make mypy, PyCharm and other static analyzers think `init` is a type to
    # have nicer autocompletion for params.
    #
    # Use `ClientConstructor` to define the argument types of `init` and
    # `ContextManager[Any]` to tell static analyzers about the return type.

    class init(sentry_sdk.consts.ClientConstructor, _InitGuard):  # noqa: N801
        pass

else:
    # Alias `init` for actual usage. Go through the lambda indirection to throw
    # PyCharm off of the weakly typed signature (it would otherwise discover
    # both the weakly typed signature of `_init` and our faked `init` type).

    init = (lambda: _init)()
source/sentry_sdk/_log_batcher.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import TYPE_CHECKING
2
+
3
+ from sentry_sdk._batcher import Batcher
4
+ from sentry_sdk.utils import serialize_attribute
5
+ from sentry_sdk.envelope import Envelope, Item, PayloadRef
6
+
7
+ if TYPE_CHECKING:
8
+ from typing import Any
9
+ from sentry_sdk._types import Log
10
+
11
+
12
class LogBatcher(Batcher["Log"]):
    """Batcher specialization that buffers and serializes Sentry log items."""

    MAX_BEFORE_FLUSH = 100
    MAX_BEFORE_DROP = 1_000
    FLUSH_WAIT_TIME = 5.0

    TYPE = "log"
    CONTENT_TYPE = "application/vnd.sentry.items.log+json"

    @staticmethod
    def _to_transport_format(item: "Log") -> "Any":
        """Convert a Log into the dict shape the transport expects.

        NOTE: mutates `item["attributes"]` in place to mirror the severity
        fields (matching the original behavior).
        """
        attributes = item["attributes"]
        # Mirror severity into the attributes unless the caller already set them.
        attributes.setdefault("sentry.severity_number", item["severity_number"])
        attributes.setdefault("sentry.severity_text", item["severity_text"])

        return {
            # time_unix_nano is nanoseconds; transport wants float seconds.
            "timestamp": int(item["time_unix_nano"]) / 1.0e9,
            "trace_id": item.get("trace_id", "00000000-0000-0000-0000-000000000000"),
            "span_id": item.get("span_id"),
            "level": str(item["severity_text"]),
            "body": str(item["body"]),
            "attributes": {
                key: serialize_attribute(value) for key, value in attributes.items()
            },
        }

    def _record_lost(self, item: "Log") -> None:
        """Report the dropped log to the lost-event callback without sending it."""
        # The envelope item is built only so its payload can be accounted for.
        envelope_item = Item(
            type=self.TYPE,
            content_type=self.CONTENT_TYPE,
            headers={
                "item_count": 1,
            },
            payload=PayloadRef(json={"items": [self._to_transport_format(item)]}),
        )

        self._record_lost_func(
            reason="queue_overflow",
            data_category="log_item",
            item=envelope_item,
            quantity=1,
        )