Add source batch 9/11
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +3 -0
- source/rpds_py-0.30.0.dist-info/INSTALLER +1 -0
- source/rpds_py-0.30.0.dist-info/METADATA +99 -0
- source/rpds_py-0.30.0.dist-info/RECORD +10 -0
- source/rpds_py-0.30.0.dist-info/WHEEL +5 -0
- source/rpds_py-0.30.0.dist-info/licenses/LICENSE +19 -0
- source/safetensors-0.7.0.dist-info/INSTALLER +1 -0
- source/safetensors-0.7.0.dist-info/METADATA +133 -0
- source/safetensors-0.7.0.dist-info/RECORD +23 -0
- source/safetensors-0.7.0.dist-info/REQUESTED +0 -0
- source/safetensors-0.7.0.dist-info/WHEEL +5 -0
- source/safetensors-0.7.0.dist-info/licenses/LICENSE +201 -0
- source/safetensors/__init__.py +10 -0
- source/safetensors/__init__.pyi +164 -0
- source/safetensors/_safetensors_rust.abi3.so +3 -0
- source/safetensors/flax.py +138 -0
- source/safetensors/mlx.py +140 -0
- source/safetensors/numpy.py +187 -0
- source/safetensors/paddle.py +290 -0
- source/safetensors/py.typed +0 -0
- source/safetensors/tensorflow.py +139 -0
- source/safetensors/torch.py +550 -0
- source/sentencepiece-0.2.1.dist-info/INSTALLER +1 -0
- source/sentencepiece-0.2.1.dist-info/METADATA +251 -0
- source/sentencepiece-0.2.1.dist-info/RECORD +20 -0
- source/sentencepiece-0.2.1.dist-info/WHEEL +6 -0
- source/sentencepiece-0.2.1.dist-info/top_level.txt +5 -0
- source/sentencepiece/__init__.py +1230 -0
- source/sentencepiece/_sentencepiece.cpython-312-x86_64-linux-gnu.so +3 -0
- source/sentencepiece/_version.py +1 -0
- source/sentencepiece/package_data/nfkc.bin +3 -0
- source/sentencepiece/package_data/nfkc_cf.bin +3 -0
- source/sentencepiece/package_data/nmt_nfkc.bin +3 -0
- source/sentencepiece/package_data/nmt_nfkc_cf.bin +3 -0
- source/sentencepiece/sentencepiece.i +2013 -0
- source/sentencepiece/sentencepiece_model_pb2.py +44 -0
- source/sentencepiece/sentencepiece_pb2.py +30 -0
- source/sentencepiece/sentencepiece_wrap.cxx +0 -0
- source/sentry_sdk-2.53.0.dist-info/INSTALLER +1 -0
- source/sentry_sdk-2.53.0.dist-info/METADATA +268 -0
- source/sentry_sdk-2.53.0.dist-info/RECORD +386 -0
- source/sentry_sdk-2.53.0.dist-info/WHEEL +6 -0
- source/sentry_sdk-2.53.0.dist-info/entry_points.txt +2 -0
- source/sentry_sdk-2.53.0.dist-info/licenses/LICENSE +21 -0
- source/sentry_sdk-2.53.0.dist-info/top_level.txt +1 -0
- source/sentry_sdk/__init__.py +67 -0
- source/sentry_sdk/_batcher.py +139 -0
- source/sentry_sdk/_compat.py +94 -0
- source/sentry_sdk/_init_implementation.py +79 -0
- source/sentry_sdk/_log_batcher.py +56 -0
.gitattributes
CHANGED
|
@@ -246,3 +246,6 @@ source/ray/thirdparty_files/psutil/_psutil_linux.abi3.so filter=lfs diff=lfs mer
|
|
| 246 |
source/regex/_regex.cpython-312-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
|
| 247 |
source/rignore/rignore.cpython-312-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
|
| 248 |
source/rpds/rpds.cpython-312-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
| 246 |
source/regex/_regex.cpython-312-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
|
| 247 |
source/rignore/rignore.cpython-312-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
|
| 248 |
source/rpds/rpds.cpython-312-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
|
| 249 |
+
source/safetensors/_safetensors_rust.abi3.so filter=lfs diff=lfs merge=lfs -text
|
| 250 |
+
source/sentencepiece/_sentencepiece.cpython-312-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
|
| 251 |
+
source/tiktoken/_tiktoken.cpython-312-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
|
source/rpds_py-0.30.0.dist-info/INSTALLER
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
pip
|
source/rpds_py-0.30.0.dist-info/METADATA
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Metadata-Version: 2.4
|
| 2 |
+
Name: rpds-py
|
| 3 |
+
Version: 0.30.0
|
| 4 |
+
Classifier: Development Status :: 3 - Alpha
|
| 5 |
+
Classifier: Intended Audience :: Developers
|
| 6 |
+
Classifier: Operating System :: OS Independent
|
| 7 |
+
Classifier: Programming Language :: Rust
|
| 8 |
+
Classifier: Programming Language :: Python :: 3.10
|
| 9 |
+
Classifier: Programming Language :: Python :: 3.11
|
| 10 |
+
Classifier: Programming Language :: Python :: 3.12
|
| 11 |
+
Classifier: Programming Language :: Python :: 3.13
|
| 12 |
+
Classifier: Programming Language :: Python :: 3.14
|
| 13 |
+
Classifier: Programming Language :: Python :: 3
|
| 14 |
+
Classifier: Programming Language :: Python :: Implementation :: CPython
|
| 15 |
+
Classifier: Programming Language :: Python :: Implementation :: PyPy
|
| 16 |
+
License-File: LICENSE
|
| 17 |
+
Summary: Python bindings to Rust's persistent data structures (rpds)
|
| 18 |
+
Keywords: data structures,rust,persistent
|
| 19 |
+
Author-email: Julian Berman <Julian+rpds@GrayVines.com>
|
| 20 |
+
License-Expression: MIT
|
| 21 |
+
Requires-Python: >=3.10
|
| 22 |
+
Description-Content-Type: text/x-rst; charset=UTF-8
|
| 23 |
+
Project-URL: Documentation, https://rpds.readthedocs.io/
|
| 24 |
+
Project-URL: Homepage, https://github.com/crate-py/rpds
|
| 25 |
+
Project-URL: Issues, https://github.com/crate-py/rpds/issues/
|
| 26 |
+
Project-URL: Funding, https://github.com/sponsors/Julian
|
| 27 |
+
Project-URL: Tidelift, https://tidelift.com/subscription/pkg/pypi-rpds-py?utm_source=pypi-rpds-py&utm_medium=referral&utm_campaign=pypi-link
|
| 28 |
+
Project-URL: Source, https://github.com/crate-py/rpds
|
| 29 |
+
Project-URL: Upstream, https://github.com/orium/rpds
|
| 30 |
+
|
| 31 |
+
===========
|
| 32 |
+
``rpds.py``
|
| 33 |
+
===========
|
| 34 |
+
|
| 35 |
+
|PyPI| |Pythons| |CI|
|
| 36 |
+
|
| 37 |
+
.. |PyPI| image:: https://img.shields.io/pypi/v/rpds-py.svg
|
| 38 |
+
:alt: PyPI version
|
| 39 |
+
:target: https://pypi.org/project/rpds-py/
|
| 40 |
+
|
| 41 |
+
.. |Pythons| image:: https://img.shields.io/pypi/pyversions/rpds-py.svg
|
| 42 |
+
:alt: Supported Python versions
|
| 43 |
+
:target: https://pypi.org/project/rpds-py/
|
| 44 |
+
|
| 45 |
+
.. |CI| image:: https://github.com/crate-py/rpds/workflows/CI/badge.svg
|
| 46 |
+
:alt: Build status
|
| 47 |
+
:target: https://github.com/crate-py/rpds/actions?query=workflow%3ACI
|
| 48 |
+
|
| 49 |
+
.. |ReadTheDocs| image:: https://readthedocs.org/projects/referencing/badge/?version=stable&style=flat
|
| 50 |
+
:alt: ReadTheDocs status
|
| 51 |
+
:target: https://referencing.readthedocs.io/en/stable/
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
Python bindings to the `Rust rpds crate <https://docs.rs/rpds/>`_ for persistent data structures.
|
| 55 |
+
|
| 56 |
+
What's here is quite minimal (in transparency, it was written initially to support replacing ``pyrsistent`` in the `referencing library <https://github.com/python-jsonschema/referencing>`_).
|
| 57 |
+
If you see something missing (which is very likely), a PR is definitely welcome to add it.
|
| 58 |
+
|
| 59 |
+
Installation
|
| 60 |
+
------------
|
| 61 |
+
|
| 62 |
+
The distribution on PyPI is named ``rpds.py`` (equivalently ``rpds-py``), and thus can be installed via e.g.:
|
| 63 |
+
|
| 64 |
+
.. code:: sh
|
| 65 |
+
|
| 66 |
+
$ pip install rpds-py
|
| 67 |
+
|
| 68 |
+
Note that if you install ``rpds-py`` from source, you will need a Rust toolchain installed, as it is a build-time dependency.
|
| 69 |
+
An example of how to do so in a ``Dockerfile`` can be found `here <https://github.com/bowtie-json-schema/bowtie/blob/e77fd93598cb6e7dc1b8b1f53c00e5aa410c201a/implementations/python-jsonschema/Dockerfile#L1-L8>`_.
|
| 70 |
+
|
| 71 |
+
If you believe you are on a common platform which should have wheels built (i.e. and not need to compile from source), feel free to file an issue or pull request modifying the GitHub action used here to build wheels via ``maturin``.
|
| 72 |
+
|
| 73 |
+
Usage
|
| 74 |
+
-----
|
| 75 |
+
|
| 76 |
+
Methods in general are named similarly to their ``rpds`` counterparts (rather than ``pyrsistent``\ 's conventions, though probably a full drop-in ``pyrsistent``\ -compatible wrapper module is a good addition at some point).
|
| 77 |
+
|
| 78 |
+
.. code:: python
|
| 79 |
+
|
| 80 |
+
>>> from rpds import HashTrieMap, HashTrieSet, List
|
| 81 |
+
|
| 82 |
+
>>> m = HashTrieMap({"foo": "bar", "baz": "quux"})
|
| 83 |
+
>>> m.insert("spam", 37) == HashTrieMap({"foo": "bar", "baz": "quux", "spam": 37})
|
| 84 |
+
True
|
| 85 |
+
>>> m.remove("foo") == HashTrieMap({"baz": "quux"})
|
| 86 |
+
True
|
| 87 |
+
|
| 88 |
+
>>> s = HashTrieSet({"foo", "bar", "baz", "quux"})
|
| 89 |
+
>>> s.insert("spam") == HashTrieSet({"foo", "bar", "baz", "quux", "spam"})
|
| 90 |
+
True
|
| 91 |
+
>>> s.remove("foo") == HashTrieSet({"bar", "baz", "quux"})
|
| 92 |
+
True
|
| 93 |
+
|
| 94 |
+
>>> L = List([1, 3, 5])
|
| 95 |
+
>>> L.push_front(-1) == List([-1, 1, 3, 5])
|
| 96 |
+
True
|
| 97 |
+
>>> L.rest == List([3, 5])
|
| 98 |
+
True
|
| 99 |
+
|
source/rpds_py-0.30.0.dist-info/RECORD
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
rpds/__init__.py,sha256=w3MgXW7lpTCICw0KXbw20QX573_kbsEnWIeMsCAugvM,99
|
| 2 |
+
rpds/__init__.pyi,sha256=am7x6oMa_pu_kv1NlolqJbPr6_UvCvoyxGKrDGSMKEk,2602
|
| 3 |
+
rpds/__pycache__/__init__.cpython-312.pyc,,
|
| 4 |
+
rpds/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
| 5 |
+
rpds/rpds.cpython-312-x86_64-linux-gnu.so,sha256=YnSRYU7s221B-NVp0g5gVF1LPFzVnkzK3CdalA33MzY,1060936
|
| 6 |
+
rpds_py-0.30.0.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
|
| 7 |
+
rpds_py-0.30.0.dist-info/METADATA,sha256=zAHQl925x-B6KclVPujdNc-SDoX7V097DCPWgTinFr8,4145
|
| 8 |
+
rpds_py-0.30.0.dist-info/RECORD,,
|
| 9 |
+
rpds_py-0.30.0.dist-info/WHEEL,sha256=m2ROzCpH5Kw6bN_3jKfw80jyQS9OqSulcWBhBkC07PU,147
|
| 10 |
+
rpds_py-0.30.0.dist-info/licenses/LICENSE,sha256=MU5Okb47qpPA-0vMyeTpfNZD64ObBlr5IXgsIXX-mQk,1057
|
source/rpds_py-0.30.0.dist-info/WHEEL
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Wheel-Version: 1.0
|
| 2 |
+
Generator: maturin (1.10.2)
|
| 3 |
+
Root-Is-Purelib: false
|
| 4 |
+
Tag: cp312-cp312-manylinux_2_17_x86_64
|
| 5 |
+
Tag: cp312-cp312-manylinux2014_x86_64
|
source/rpds_py-0.30.0.dist-info/licenses/LICENSE
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Copyright (c) 2023 Julian Berman
|
| 2 |
+
|
| 3 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
| 4 |
+
of this software and associated documentation files (the "Software"), to deal
|
| 5 |
+
in the Software without restriction, including without limitation the rights
|
| 6 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
| 7 |
+
copies of the Software, and to permit persons to whom the Software is
|
| 8 |
+
furnished to do so, subject to the following conditions:
|
| 9 |
+
|
| 10 |
+
The above copyright notice and this permission notice shall be included in
|
| 11 |
+
all copies or substantial portions of the Software.
|
| 12 |
+
|
| 13 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
| 14 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
| 15 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
| 16 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
| 17 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
| 18 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
| 19 |
+
THE SOFTWARE.
|
source/safetensors-0.7.0.dist-info/INSTALLER
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
pip
|
source/safetensors-0.7.0.dist-info/METADATA
ADDED
|
@@ -0,0 +1,133 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Metadata-Version: 2.4
|
| 2 |
+
Name: safetensors
|
| 3 |
+
Version: 0.7.0
|
| 4 |
+
Classifier: Development Status :: 5 - Production/Stable
|
| 5 |
+
Classifier: Intended Audience :: Developers
|
| 6 |
+
Classifier: Intended Audience :: Education
|
| 7 |
+
Classifier: Intended Audience :: Science/Research
|
| 8 |
+
Classifier: License :: OSI Approved :: Apache Software License
|
| 9 |
+
Classifier: Operating System :: OS Independent
|
| 10 |
+
Classifier: Programming Language :: Python :: 3
|
| 11 |
+
Classifier: Programming Language :: Python :: 3.7
|
| 12 |
+
Classifier: Programming Language :: Python :: 3.8
|
| 13 |
+
Classifier: Programming Language :: Python :: 3.9
|
| 14 |
+
Classifier: Programming Language :: Python :: 3.10
|
| 15 |
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
| 16 |
+
Classifier: Typing :: Typed
|
| 17 |
+
Requires-Dist: numpy>=1.21.6 ; extra == 'numpy'
|
| 18 |
+
Requires-Dist: packaging ; extra == 'torch'
|
| 19 |
+
Requires-Dist: safetensors[numpy] ; extra == 'torch'
|
| 20 |
+
Requires-Dist: torch>=1.10 ; extra == 'torch'
|
| 21 |
+
Requires-Dist: safetensors[numpy] ; extra == 'tensorflow'
|
| 22 |
+
Requires-Dist: tensorflow>=2.11.0 ; extra == 'tensorflow'
|
| 23 |
+
Requires-Dist: safetensors[numpy] ; extra == 'pinned-tf'
|
| 24 |
+
Requires-Dist: tensorflow==2.18.0 ; extra == 'pinned-tf'
|
| 25 |
+
Requires-Dist: safetensors[numpy] ; extra == 'jax'
|
| 26 |
+
Requires-Dist: flax>=0.6.3 ; extra == 'jax'
|
| 27 |
+
Requires-Dist: jax>=0.3.25 ; extra == 'jax'
|
| 28 |
+
Requires-Dist: jaxlib>=0.3.25 ; extra == 'jax'
|
| 29 |
+
Requires-Dist: mlx>=0.0.9 ; extra == 'mlx'
|
| 30 |
+
Requires-Dist: safetensors[numpy] ; extra == 'paddlepaddle'
|
| 31 |
+
Requires-Dist: paddlepaddle>=2.4.1 ; extra == 'paddlepaddle'
|
| 32 |
+
Requires-Dist: ruff ; extra == 'quality'
|
| 33 |
+
Requires-Dist: safetensors[numpy] ; extra == 'testing'
|
| 34 |
+
Requires-Dist: h5py>=3.7.0 ; extra == 'testing'
|
| 35 |
+
Requires-Dist: huggingface-hub>=0.12.1 ; extra == 'testing'
|
| 36 |
+
Requires-Dist: setuptools-rust>=1.5.2 ; extra == 'testing'
|
| 37 |
+
Requires-Dist: pytest>=7.2.0 ; extra == 'testing'
|
| 38 |
+
Requires-Dist: pytest-benchmark>=4.0.0 ; extra == 'testing'
|
| 39 |
+
Requires-Dist: hypothesis>=6.70.2 ; extra == 'testing'
|
| 40 |
+
Requires-Dist: safetensors[numpy] ; extra == 'testingfree'
|
| 41 |
+
Requires-Dist: huggingface-hub>=0.12.1 ; extra == 'testingfree'
|
| 42 |
+
Requires-Dist: setuptools-rust>=1.5.2 ; extra == 'testingfree'
|
| 43 |
+
Requires-Dist: pytest>=7.2.0 ; extra == 'testingfree'
|
| 44 |
+
Requires-Dist: pytest-benchmark>=4.0.0 ; extra == 'testingfree'
|
| 45 |
+
Requires-Dist: hypothesis>=6.70.2 ; extra == 'testingfree'
|
| 46 |
+
Requires-Dist: safetensors[torch] ; extra == 'all'
|
| 47 |
+
Requires-Dist: safetensors[numpy] ; extra == 'all'
|
| 48 |
+
Requires-Dist: safetensors[pinned-tf] ; extra == 'all'
|
| 49 |
+
Requires-Dist: safetensors[jax] ; extra == 'all'
|
| 50 |
+
Requires-Dist: safetensors[paddlepaddle] ; extra == 'all'
|
| 51 |
+
Requires-Dist: safetensors[quality] ; extra == 'all'
|
| 52 |
+
Requires-Dist: safetensors[testing] ; extra == 'all'
|
| 53 |
+
Requires-Dist: safetensors[all] ; extra == 'dev'
|
| 54 |
+
Provides-Extra: numpy
|
| 55 |
+
Provides-Extra: torch
|
| 56 |
+
Provides-Extra: tensorflow
|
| 57 |
+
Provides-Extra: pinned-tf
|
| 58 |
+
Provides-Extra: jax
|
| 59 |
+
Provides-Extra: mlx
|
| 60 |
+
Provides-Extra: paddlepaddle
|
| 61 |
+
Provides-Extra: quality
|
| 62 |
+
Provides-Extra: testing
|
| 63 |
+
Provides-Extra: testingfree
|
| 64 |
+
Provides-Extra: all
|
| 65 |
+
Provides-Extra: dev
|
| 66 |
+
License-File: LICENSE
|
| 67 |
+
Author-email: Nicolas Patry <patry.nicolas@protonmail.com>
|
| 68 |
+
Requires-Python: >=3.9
|
| 69 |
+
Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
|
| 70 |
+
Project-URL: Homepage, https://github.com/huggingface/safetensors
|
| 71 |
+
Project-URL: Source, https://github.com/huggingface/safetensors
|
| 72 |
+
|
| 73 |
+
## Installation
|
| 74 |
+
|
| 75 |
+
```
|
| 76 |
+
pip install safetensors
|
| 77 |
+
```
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
## Usage
|
| 81 |
+
|
| 82 |
+
### Numpy
|
| 83 |
+
|
| 84 |
+
```python
|
| 85 |
+
from safetensors.numpy import save_file, load_file
|
| 86 |
+
import numpy as np
|
| 87 |
+
|
| 88 |
+
tensors = {
|
| 89 |
+
"a": np.zeros((2, 2)),
|
| 90 |
+
"b": np.zeros((2, 3), dtype=np.uint8)
|
| 91 |
+
}
|
| 92 |
+
|
| 93 |
+
save_file(tensors, "./model.safetensors")
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
# Now loading
|
| 97 |
+
loaded = load_file("./model.safetensors")
|
| 98 |
+
```
|
| 99 |
+
|
| 100 |
+
### Torch
|
| 101 |
+
|
| 102 |
+
```python
|
| 103 |
+
from safetensors.torch import save_file, load_file
|
| 104 |
+
import torch
|
| 105 |
+
|
| 106 |
+
tensors = {
|
| 107 |
+
"a": torch.zeros((2, 2)),
|
| 108 |
+
"b": torch.zeros((2, 3), dtype=torch.uint8)
|
| 109 |
+
}
|
| 110 |
+
|
| 111 |
+
save_file(tensors, "./model.safetensors")
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
# Now loading
|
| 115 |
+
loaded = load_file("./model.safetensors")
|
| 116 |
+
```
|
| 117 |
+
|
| 118 |
+
### Developing
|
| 119 |
+
|
| 120 |
+
```
|
| 121 |
+
# inside ./safetensors/bindings/python
|
| 122 |
+
pip install .[dev]
|
| 123 |
+
```
|
| 124 |
+
Should be enough to install this library locally.
|
| 125 |
+
|
| 126 |
+
### Testing
|
| 127 |
+
|
| 128 |
+
```
|
| 129 |
+
# inside ./safetensors/bindings/python
|
| 130 |
+
pip install .[dev]
|
| 131 |
+
pytest -sv tests/
|
| 132 |
+
```
|
| 133 |
+
|
source/safetensors-0.7.0.dist-info/RECORD
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
safetensors-0.7.0.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
|
| 2 |
+
safetensors-0.7.0.dist-info/METADATA,sha256=aAkyb78XRE3VIKsZ8KVbehhHbb5SpDBGa79TiZZ8Kqo,4125
|
| 3 |
+
safetensors-0.7.0.dist-info/RECORD,,
|
| 4 |
+
safetensors-0.7.0.dist-info/REQUESTED,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
| 5 |
+
safetensors-0.7.0.dist-info/WHEEL,sha256=EsgGQg7OBGIn-zS1ipDRPjO8C2qSQ0GRrd2xuL_Pyq0,143
|
| 6 |
+
safetensors-0.7.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
| 7 |
+
safetensors/__init__.py,sha256=wbzKZlAVgnAsjHmqryuSJCiADvpDZxNGCfj8VzY0At0,194
|
| 8 |
+
safetensors/__init__.pyi,sha256=IpwsrzRWJA2yR8TxMEC3RHgM_5TiDgSFqyvrAxAa15U,4019
|
| 9 |
+
safetensors/__pycache__/__init__.cpython-312.pyc,,
|
| 10 |
+
safetensors/__pycache__/flax.cpython-312.pyc,,
|
| 11 |
+
safetensors/__pycache__/mlx.cpython-312.pyc,,
|
| 12 |
+
safetensors/__pycache__/numpy.cpython-312.pyc,,
|
| 13 |
+
safetensors/__pycache__/paddle.cpython-312.pyc,,
|
| 14 |
+
safetensors/__pycache__/tensorflow.cpython-312.pyc,,
|
| 15 |
+
safetensors/__pycache__/torch.cpython-312.pyc,,
|
| 16 |
+
safetensors/_safetensors_rust.abi3.so,sha256=h9qPRzPm7PhApdjisd6IjK0BLCN1URiRlWfsSAp9fgI,1216632
|
| 17 |
+
safetensors/flax.py,sha256=T59elUqzVDyGYGdR78QzNEuwyAc8KrEO0EuLBSKOnUs,3853
|
| 18 |
+
safetensors/mlx.py,sha256=IR51jRpcJq6epb0Agj8VsxI9xqBS6NjeAJnr-Ny0jJU,3850
|
| 19 |
+
safetensors/numpy.py,sha256=rit_12-IfZtRgip_VLd8nPAcCXyeM2fPrCDZ7OiyxSY,5028
|
| 20 |
+
safetensors/paddle.py,sha256=LrDwqQbwFnQXiY3M601IU7G6FBctX6tyHHK3_UH6lxE,8721
|
| 21 |
+
safetensors/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
| 22 |
+
safetensors/tensorflow.py,sha256=AZ-O7-gM-JqTfjczZyCUAHm3Er-GSQnQWaFyY7mAIQc,3903
|
| 23 |
+
safetensors/torch.py,sha256=U0acZVahLsxvqPa1GitMRiaByu6XVbmiBITGtuznBEY,18610
|
source/safetensors-0.7.0.dist-info/REQUESTED
ADDED
|
File without changes
|
source/safetensors-0.7.0.dist-info/WHEEL
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Wheel-Version: 1.0
|
| 2 |
+
Generator: maturin (1.10.2)
|
| 3 |
+
Root-Is-Purelib: false
|
| 4 |
+
Tag: cp38-abi3-manylinux_2_17_x86_64
|
| 5 |
+
Tag: cp38-abi3-manylinux2014_x86_64
|
source/safetensors-0.7.0.dist-info/licenses/LICENSE
ADDED
|
@@ -0,0 +1,201 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Apache License
|
| 2 |
+
Version 2.0, January 2004
|
| 3 |
+
http://www.apache.org/licenses/
|
| 4 |
+
|
| 5 |
+
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
| 6 |
+
|
| 7 |
+
1. Definitions.
|
| 8 |
+
|
| 9 |
+
"License" shall mean the terms and conditions for use, reproduction,
|
| 10 |
+
and distribution as defined by Sections 1 through 9 of this document.
|
| 11 |
+
|
| 12 |
+
"Licensor" shall mean the copyright owner or entity authorized by
|
| 13 |
+
the copyright owner that is granting the License.
|
| 14 |
+
|
| 15 |
+
"Legal Entity" shall mean the union of the acting entity and all
|
| 16 |
+
other entities that control, are controlled by, or are under common
|
| 17 |
+
control with that entity. For the purposes of this definition,
|
| 18 |
+
"control" means (i) the power, direct or indirect, to cause the
|
| 19 |
+
direction or management of such entity, whether by contract or
|
| 20 |
+
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
| 21 |
+
outstanding shares, or (iii) beneficial ownership of such entity.
|
| 22 |
+
|
| 23 |
+
"You" (or "Your") shall mean an individual or Legal Entity
|
| 24 |
+
exercising permissions granted by this License.
|
| 25 |
+
|
| 26 |
+
"Source" form shall mean the preferred form for making modifications,
|
| 27 |
+
including but not limited to software source code, documentation
|
| 28 |
+
source, and configuration files.
|
| 29 |
+
|
| 30 |
+
"Object" form shall mean any form resulting from mechanical
|
| 31 |
+
transformation or translation of a Source form, including but
|
| 32 |
+
not limited to compiled object code, generated documentation,
|
| 33 |
+
and conversions to other media types.
|
| 34 |
+
|
| 35 |
+
"Work" shall mean the work of authorship, whether in Source or
|
| 36 |
+
Object form, made available under the License, as indicated by a
|
| 37 |
+
copyright notice that is included in or attached to the work
|
| 38 |
+
(an example is provided in the Appendix below).
|
| 39 |
+
|
| 40 |
+
"Derivative Works" shall mean any work, whether in Source or Object
|
| 41 |
+
form, that is based on (or derived from) the Work and for which the
|
| 42 |
+
editorial revisions, annotations, elaborations, or other modifications
|
| 43 |
+
represent, as a whole, an original work of authorship. For the purposes
|
| 44 |
+
of this License, Derivative Works shall not include works that remain
|
| 45 |
+
separable from, or merely link (or bind by name) to the interfaces of,
|
| 46 |
+
the Work and Derivative Works thereof.
|
| 47 |
+
|
| 48 |
+
"Contribution" shall mean any work of authorship, including
|
| 49 |
+
the original version of the Work and any modifications or additions
|
| 50 |
+
to that Work or Derivative Works thereof, that is intentionally
|
| 51 |
+
submitted to Licensor for inclusion in the Work by the copyright owner
|
| 52 |
+
or by an individual or Legal Entity authorized to submit on behalf of
|
| 53 |
+
the copyright owner. For the purposes of this definition, "submitted"
|
| 54 |
+
means any form of electronic, verbal, or written communication sent
|
| 55 |
+
to the Licensor or its representatives, including but not limited to
|
| 56 |
+
communication on electronic mailing lists, source code control systems,
|
| 57 |
+
and issue tracking systems that are managed by, or on behalf of, the
|
| 58 |
+
Licensor for the purpose of discussing and improving the Work, but
|
| 59 |
+
excluding communication that is conspicuously marked or otherwise
|
| 60 |
+
designated in writing by the copyright owner as "Not a Contribution."
|
| 61 |
+
|
| 62 |
+
"Contributor" shall mean Licensor and any individual or Legal Entity
|
| 63 |
+
on behalf of whom a Contribution has been received by Licensor and
|
| 64 |
+
subsequently incorporated within the Work.
|
| 65 |
+
|
| 66 |
+
2. Grant of Copyright License. Subject to the terms and conditions of
|
| 67 |
+
this License, each Contributor hereby grants to You a perpetual,
|
| 68 |
+
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
| 69 |
+
copyright license to reproduce, prepare Derivative Works of,
|
| 70 |
+
publicly display, publicly perform, sublicense, and distribute the
|
| 71 |
+
Work and such Derivative Works in Source or Object form.
|
| 72 |
+
|
| 73 |
+
3. Grant of Patent License. Subject to the terms and conditions of
|
| 74 |
+
this License, each Contributor hereby grants to You a perpetual,
|
| 75 |
+
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
| 76 |
+
(except as stated in this section) patent license to make, have made,
|
| 77 |
+
use, offer to sell, sell, import, and otherwise transfer the Work,
|
| 78 |
+
where such license applies only to those patent claims licensable
|
| 79 |
+
by such Contributor that are necessarily infringed by their
|
| 80 |
+
Contribution(s) alone or by combination of their Contribution(s)
|
| 81 |
+
with the Work to which such Contribution(s) was submitted. If You
|
| 82 |
+
institute patent litigation against any entity (including a
|
| 83 |
+
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
| 84 |
+
or a Contribution incorporated within the Work constitutes direct
|
| 85 |
+
or contributory patent infringement, then any patent licenses
|
| 86 |
+
granted to You under this License for that Work shall terminate
|
| 87 |
+
as of the date such litigation is filed.
|
| 88 |
+
|
| 89 |
+
4. Redistribution. You may reproduce and distribute copies of the
|
| 90 |
+
Work or Derivative Works thereof in any medium, with or without
|
| 91 |
+
modifications, and in Source or Object form, provided that You
|
| 92 |
+
meet the following conditions:
|
| 93 |
+
|
| 94 |
+
(a) You must give any other recipients of the Work or
|
| 95 |
+
Derivative Works a copy of this License; and
|
| 96 |
+
|
| 97 |
+
(b) You must cause any modified files to carry prominent notices
|
| 98 |
+
stating that You changed the files; and
|
| 99 |
+
|
| 100 |
+
(c) You must retain, in the Source form of any Derivative Works
|
| 101 |
+
that You distribute, all copyright, patent, trademark, and
|
| 102 |
+
attribution notices from the Source form of the Work,
|
| 103 |
+
excluding those notices that do not pertain to any part of
|
| 104 |
+
the Derivative Works; and
|
| 105 |
+
|
| 106 |
+
(d) If the Work includes a "NOTICE" text file as part of its
|
| 107 |
+
distribution, then any Derivative Works that You distribute must
|
| 108 |
+
include a readable copy of the attribution notices contained
|
| 109 |
+
within such NOTICE file, excluding those notices that do not
|
| 110 |
+
pertain to any part of the Derivative Works, in at least one
|
| 111 |
+
of the following places: within a NOTICE text file distributed
|
| 112 |
+
as part of the Derivative Works; within the Source form or
|
| 113 |
+
documentation, if provided along with the Derivative Works; or,
|
| 114 |
+
within a display generated by the Derivative Works, if and
|
| 115 |
+
wherever such third-party notices normally appear. The contents
|
| 116 |
+
of the NOTICE file are for informational purposes only and
|
| 117 |
+
do not modify the License. You may add Your own attribution
|
| 118 |
+
notices within Derivative Works that You distribute, alongside
|
| 119 |
+
or as an addendum to the NOTICE text from the Work, provided
|
| 120 |
+
that such additional attribution notices cannot be construed
|
| 121 |
+
as modifying the License.
|
| 122 |
+
|
| 123 |
+
You may add Your own copyright statement to Your modifications and
|
| 124 |
+
may provide additional or different license terms and conditions
|
| 125 |
+
for use, reproduction, or distribution of Your modifications, or
|
| 126 |
+
for any such Derivative Works as a whole, provided Your use,
|
| 127 |
+
reproduction, and distribution of the Work otherwise complies with
|
| 128 |
+
the conditions stated in this License.
|
| 129 |
+
|
| 130 |
+
5. Submission of Contributions. Unless You explicitly state otherwise,
|
| 131 |
+
any Contribution intentionally submitted for inclusion in the Work
|
| 132 |
+
by You to the Licensor shall be under the terms and conditions of
|
| 133 |
+
this License, without any additional terms or conditions.
|
| 134 |
+
Notwithstanding the above, nothing herein shall supersede or modify
|
| 135 |
+
the terms of any separate license agreement you may have executed
|
| 136 |
+
with Licensor regarding such Contributions.
|
| 137 |
+
|
| 138 |
+
6. Trademarks. This License does not grant permission to use the trade
|
| 139 |
+
names, trademarks, service marks, or product names of the Licensor,
|
| 140 |
+
except as required for reasonable and customary use in describing the
|
| 141 |
+
origin of the Work and reproducing the content of the NOTICE file.
|
| 142 |
+
|
| 143 |
+
7. Disclaimer of Warranty. Unless required by applicable law or
|
| 144 |
+
agreed to in writing, Licensor provides the Work (and each
|
| 145 |
+
Contributor provides its Contributions) on an "AS IS" BASIS,
|
| 146 |
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
| 147 |
+
implied, including, without limitation, any warranties or conditions
|
| 148 |
+
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
| 149 |
+
PARTICULAR PURPOSE. You are solely responsible for determining the
|
| 150 |
+
appropriateness of using or redistributing the Work and assume any
|
| 151 |
+
risks associated with Your exercise of permissions under this License.
|
| 152 |
+
|
| 153 |
+
8. Limitation of Liability. In no event and under no legal theory,
|
| 154 |
+
whether in tort (including negligence), contract, or otherwise,
|
| 155 |
+
unless required by applicable law (such as deliberate and grossly
|
| 156 |
+
negligent acts) or agreed to in writing, shall any Contributor be
|
| 157 |
+
liable to You for damages, including any direct, indirect, special,
|
| 158 |
+
incidental, or consequential damages of any character arising as a
|
| 159 |
+
result of this License or out of the use or inability to use the
|
| 160 |
+
Work (including but not limited to damages for loss of goodwill,
|
| 161 |
+
work stoppage, computer failure or malfunction, or any and all
|
| 162 |
+
other commercial damages or losses), even if such Contributor
|
| 163 |
+
has been advised of the possibility of such damages.
|
| 164 |
+
|
| 165 |
+
9. Accepting Warranty or Additional Liability. While redistributing
|
| 166 |
+
the Work or Derivative Works thereof, You may choose to offer,
|
| 167 |
+
and charge a fee for, acceptance of support, warranty, indemnity,
|
| 168 |
+
or other liability obligations and/or rights consistent with this
|
| 169 |
+
License. However, in accepting such obligations, You may act only
|
| 170 |
+
on Your own behalf and on Your sole responsibility, not on behalf
|
| 171 |
+
of any other Contributor, and only if You agree to indemnify,
|
| 172 |
+
defend, and hold each Contributor harmless for any liability
|
| 173 |
+
incurred by, or claims asserted against, such Contributor by reason
|
| 174 |
+
of your accepting any such warranty or additional liability.
|
| 175 |
+
|
| 176 |
+
END OF TERMS AND CONDITIONS
|
| 177 |
+
|
| 178 |
+
APPENDIX: How to apply the Apache License to your work.
|
| 179 |
+
|
| 180 |
+
To apply the Apache License to your work, attach the following
|
| 181 |
+
boilerplate notice, with the fields enclosed by brackets "[]"
|
| 182 |
+
replaced with your own identifying information. (Don't include
|
| 183 |
+
the brackets!) The text should be enclosed in the appropriate
|
| 184 |
+
comment syntax for the file format. We also recommend that a
|
| 185 |
+
file or class name and description of purpose be included on the
|
| 186 |
+
same "printed page" as the copyright notice for easier
|
| 187 |
+
identification within third-party archives.
|
| 188 |
+
|
| 189 |
+
Copyright [yyyy] [name of copyright owner]
|
| 190 |
+
|
| 191 |
+
Licensed under the Apache License, Version 2.0 (the "License");
|
| 192 |
+
you may not use this file except in compliance with the License.
|
| 193 |
+
You may obtain a copy of the License at
|
| 194 |
+
|
| 195 |
+
http://www.apache.org/licenses/LICENSE-2.0
|
| 196 |
+
|
| 197 |
+
Unless required by applicable law or agreed to in writing, software
|
| 198 |
+
distributed under the License is distributed on an "AS IS" BASIS,
|
| 199 |
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 200 |
+
See the License for the specific language governing permissions and
|
| 201 |
+
limitations under the License.
|
source/safetensors/__init__.py
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Re-export this
|
| 2 |
+
from ._safetensors_rust import ( # noqa: F401
|
| 3 |
+
SafetensorError,
|
| 4 |
+
__version__,
|
| 5 |
+
deserialize,
|
| 6 |
+
safe_open,
|
| 7 |
+
_safe_open_handle,
|
| 8 |
+
serialize,
|
| 9 |
+
serialize_file,
|
| 10 |
+
)
|
source/safetensors/__init__.pyi
ADDED
|
@@ -0,0 +1,164 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Generated content DO NOT EDIT
# Stub signature only — the implementation is provided by the compiled
# extension (re-exported from `_safetensors_rust` per the package __init__).
@staticmethod
def deserialize(bytes):
    """
    Opens a safetensors lazily and returns tensors as asked

    Args:
        bytes (`bytes`):
            The byte content of a file

    Returns:
        (`List[str, Dict[str, Dict[str, any]]]`):
            The deserialized content is like:
                [("tensor_name", {"shape": [2, 3], "dtype": "F32", "data": b"\0\0.." }), (...)]
    """
    pass
|
| 17 |
+
|
| 18 |
+
# Stub signature only — implemented natively in the Rust extension.
@staticmethod
def serialize(tensor_dict, metadata=None):
    """
    Serializes raw data.

    Args:
        tensor_dict (`Dict[str, Dict[Any]]`):
            The tensor dict is like:
                {"tensor_name": {"dtype": "F32", "shape": [2, 3], "data": b"\0\0"}}
        metadata (`Dict[str, str]`, *optional*):
            The optional purely text annotations

    Returns:
        (`bytes`):
            The serialized content.
    """
    pass
|
| 35 |
+
|
| 36 |
+
# Stub signature only — implemented natively in the Rust extension.
@staticmethod
def serialize_file(tensor_dict, filename, metadata=None):
    """
    Serializes raw data into file.

    Args:
        tensor_dict (`Dict[str, Dict[Any]]`):
            The tensor dict is like:
                {"tensor_name": {"dtype": "F32", "shape": [2, 3], "data": b"\0\0"}}
        filename (`str`, or `os.PathLike`):
            The name of the file to write into.
        metadata (`Dict[str, str]`, *optional*):
            The optional purely text annotations

    Returns:
        (`NoneType`):
            On success return None
    """
    pass
|
| 55 |
+
|
| 56 |
+
# Stub class only — the real implementation lives in the compiled extension.
class safe_open:
    """
    Opens a safetensors lazily and returns tensors as asked

    Args:
        filename (`str`, or `os.PathLike`):
            The filename to open

        framework (`str`):
            The framework you want you tensors in. Supported values:
            `pt`, `tf`, `flax`, `numpy`.

        device (`str`, defaults to `"cpu"`):
            The device on which you want the tensors.
    """

    # `device=...` (Ellipsis) is generated-stub shorthand for the default
    # documented above ("cpu").
    def __init__(self, filename, framework, device=...):
        pass

    def __enter__(self):
        """
        Start the context manager
        """
        pass

    def __exit__(self, _exc_type, _exc_value, _traceback):
        """
        Exits the context manager
        """
        pass

    def get_slice(self, name):
        """
        Returns a full slice view object

        Args:
            name (`str`):
                The name of the tensor you want

        Returns:
            (`PySafeSlice`):
                A dummy object you can slice into to get a real tensor
        Example:
        ```python
        from safetensors import safe_open

        with safe_open("model.safetensors", framework="pt", device=0) as f:
            tensor_part = f.get_slice("embedding")[:, ::8]

        ```
        """
        pass

    def get_tensor(self, name):
        """
        Returns a full tensor

        Args:
            name (`str`):
                The name of the tensor you want

        Returns:
            (`Tensor`):
                The tensor in the framework you opened the file for.

        Example:
        ```python
        from safetensors import safe_open

        with safe_open("model.safetensors", framework="pt", device=0) as f:
            tensor = f.get_tensor("embedding")

        ```
        """
        pass

    def keys(self):
        """
        Returns the names of the tensors in the file.

        Returns:
            (`List[str]`):
                The name of the tensors contained in that file
        """
        pass

    def metadata(self):
        """
        Return the special non tensor information in the header

        Returns:
            (`Dict[str, str]`):
                The freeform metadata.
        """
        pass

    def offset_keys(self):
        """
        Returns the names of the tensors in the file, ordered by offset.

        Returns:
            (`List[str]`):
                The name of the tensors contained in that file
        """
        pass
|
| 160 |
+
|
| 161 |
+
# NOTE(review): presumably raised by the Rust extension on malformed files or
# serialization failures — confirm against the extension's error handling.
class SafetensorError(Exception):
    """
    Custom Python Exception for Safetensor errors.
    """
|
source/safetensors/_safetensors_rust.abi3.so
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:87da8f4733e6ecf840a5d8e2b1de888cad012c23755118919567ec480a7d7e02
|
| 3 |
+
size 1216632
|
source/safetensors/flax.py
ADDED
|
@@ -0,0 +1,138 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from typing import Dict, Optional, Union
|
| 3 |
+
|
| 4 |
+
import numpy as np
|
| 5 |
+
|
| 6 |
+
import jax.numpy as jnp
|
| 7 |
+
from jax import Array
|
| 8 |
+
from safetensors import numpy, safe_open
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def save(tensors: Dict[str, Array], metadata: Optional[Dict[str, str]] = None) -> bytes:
    """
    Serialize a dictionary of jax arrays to raw safetensors bytes.

    Args:
        tensors (`Dict[str, Array]`):
            The incoming tensors. Tensors need to be contiguous and dense.
        metadata (`Dict[str, str]`, *optional*, defaults to `None`):
            Optional free-form text annotations stored in the header; purely
            informative, never affects tensor loading.

    Returns:
        `bytes`: The raw bytes representing the format

    Example:

    ```python
    from safetensors.flax import save
    from jax import numpy as jnp

    tensors = {"embedding": jnp.zeros((512, 1024)), "attention": jnp.zeros((256, 256))}
    byte_data = save(tensors)
    ```
    """
    # Convert to numpy first, then delegate to the shared numpy serializer.
    return numpy.save(_jnp2np(tensors), metadata=metadata)
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def save_file(
    tensors: Dict[str, Array],
    filename: Union[str, os.PathLike],
    metadata: Optional[Dict[str, str]] = None,
) -> None:
    """
    Serialize a dictionary of jax arrays into a safetensors file.

    Args:
        tensors (`Dict[str, Array]`):
            The incoming tensors. Tensors need to be contiguous and dense.
        filename (`str`, or `os.PathLike`)):
            The filename we're saving into.
        metadata (`Dict[str, str]`, *optional*, defaults to `None`):
            Optional free-form text annotations stored in the header; purely
            informative, never affects tensor loading.

    Returns:
        `None`

    Example:

    ```python
    from safetensors.flax import save_file
    from jax import numpy as jnp

    tensors = {"embedding": jnp.zeros((512, 1024)), "attention": jnp.zeros((256, 256))}
    save_file(tensors, "model.safetensors")
    ```
    """
    converted = _jnp2np(tensors)
    return numpy.save_file(converted, filename, metadata=metadata)
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
def load(data: bytes) -> Dict[str, Array]:
    """
    Deserialize safetensors bytes into a dict of jax arrays.

    Args:
        data (`bytes`):
            The content of a safetensors file

    Returns:
        `Dict[str, Array]`: dictionary that contains name as key, value as `Array` on cpu

    Example:

    ```python
    from safetensors.flax import load

    file_path = "./my_folder/bert.safetensors"
    with open(file_path, "rb") as f:
        data = f.read()

    loaded = load(data)
    ```
    """
    as_numpy = numpy.load(data)
    return _np2jnp(as_numpy)
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
def load_file(filename: Union[str, os.PathLike]) -> Dict[str, Array]:
    """
    Load a safetensors file as a dict of jax arrays.

    Args:
        filename (`str`, or `os.PathLike`)):
            The name of the file which contains the tensors

    Returns:
        `Dict[str, Array]`: dictionary that contains name as key, value as `Array`

    Example:

    ```python
    from safetensors.flax import load_file

    file_path = "./my_folder/bert.safetensors"
    loaded = load_file(file_path)
    ```
    """
    # Read in offset order so the file is traversed sequentially.
    with safe_open(filename, framework="flax") as f:
        return {name: f.get_tensor(name) for name in f.offset_keys()}
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
def _np2jnp(numpy_dict: Dict[str, np.ndarray]) -> Dict[str, Array]:
|
| 130 |
+
for k, v in numpy_dict.items():
|
| 131 |
+
numpy_dict[k] = jnp.array(v)
|
| 132 |
+
return numpy_dict
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
def _jnp2np(jnp_dict: Dict[str, Array]) -> Dict[str, np.array]:
|
| 136 |
+
for k, v in jnp_dict.items():
|
| 137 |
+
jnp_dict[k] = np.asarray(v)
|
| 138 |
+
return jnp_dict
|
source/safetensors/mlx.py
ADDED
|
@@ -0,0 +1,140 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from typing import Dict, Optional, Union
|
| 3 |
+
|
| 4 |
+
import numpy as np
|
| 5 |
+
|
| 6 |
+
import mlx.core as mx
|
| 7 |
+
from safetensors import numpy, safe_open
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def save(
    tensors: Dict[str, mx.array], metadata: Optional[Dict[str, str]] = None
) -> bytes:
    """
    Serialize a dictionary of MLX arrays to raw safetensors bytes.

    Args:
        tensors (`Dict[str, mx.array]`):
            The incoming tensors. Tensors need to be contiguous and dense.
        metadata (`Dict[str, str]`, *optional*, defaults to `None`):
            Optional free-form text annotations stored in the header; purely
            informative, never affects tensor loading.

    Returns:
        `bytes`: The raw bytes representing the format

    Example:

    ```python
    from safetensors.mlx import save
    import mlx.core as mx

    tensors = {"embedding": mx.zeros((512, 1024)), "attention": mx.zeros((256, 256))}
    byte_data = save(tensors)
    ```
    """
    # Convert to numpy first, then delegate to the shared numpy serializer.
    return numpy.save(_mx2np(tensors), metadata=metadata)
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def save_file(
    tensors: Dict[str, mx.array],
    filename: Union[str, os.PathLike],
    metadata: Optional[Dict[str, str]] = None,
) -> None:
    """
    Serialize a dictionary of MLX arrays into a safetensors file.

    Args:
        tensors (`Dict[str, mx.array]`):
            The incoming tensors. Tensors need to be contiguous and dense.
        filename (`str`, or `os.PathLike`)):
            The filename we're saving into.
        metadata (`Dict[str, str]`, *optional*, defaults to `None`):
            Optional free-form text annotations stored in the header; purely
            informative, never affects tensor loading.

    Returns:
        `None`

    Example:

    ```python
    from safetensors.mlx import save_file
    import mlx.core as mx

    tensors = {"embedding": mx.zeros((512, 1024)), "attention": mx.zeros((256, 256))}
    save_file(tensors, "model.safetensors")
    ```
    """
    converted = _mx2np(tensors)
    return numpy.save_file(converted, filename, metadata=metadata)
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
def load(data: bytes) -> Dict[str, mx.array]:
    """
    Deserialize safetensors bytes into a dict of MLX arrays.

    Args:
        data (`bytes`):
            The content of a safetensors file

    Returns:
        `Dict[str, mx.array]`: dictionary that contains name as key, value as `mx.array`

    Example:

    ```python
    from safetensors.mlx import load

    file_path = "./my_folder/bert.safetensors"
    with open(file_path, "rb") as f:
        data = f.read()

    loaded = load(data)
    ```
    """
    as_numpy = numpy.load(data)
    return _np2mx(as_numpy)
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
def load_file(filename: Union[str, os.PathLike]) -> Dict[str, mx.array]:
    """
    Loads a safetensors file into MLX format.

    Args:
        filename (`str`, or `os.PathLike`)):
            The name of the file which contains the tensors

    Returns:
        `Dict[str, mx.array]`: dictionary that contains name as key, value as `mx.array`

    Example:

    ```python
    from safetensors.mlx import load_file

    file_path = "./my_folder/bert.safetensors"
    loaded = load_file(file_path)
    ```
    """
    # Tensors are read in offset order so the file is traversed sequentially.
    result = {}
    with safe_open(filename, framework="mlx") as f:
        for k in f.offset_keys():
            result[k] = f.get_tensor(k)
    return result
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
def _np2mx(numpy_dict: Dict[str, np.ndarray]) -> Dict[str, mx.array]:
    """Convert a dict of numpy arrays to `mx.array` values.

    Returns a new dict instead of mutating the caller's input in place,
    matching the non-mutating style of the sibling `_mx2np`.
    """
    return {name: mx.array(value) for name, value in numpy_dict.items()}
|
| 134 |
+
|
| 135 |
+
|
| 136 |
+
def _mx2np(mx_dict: Dict[str, mx.array]) -> Dict[str, np.array]:
    """Convert a dict of `mx.array` values to numpy arrays (input left untouched)."""
    return {name: np.asarray(value) for name, value in mx_dict.items()}
|
source/safetensors/numpy.py
ADDED
|
@@ -0,0 +1,187 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import sys
|
| 3 |
+
from typing import Dict, Optional, Union
|
| 4 |
+
|
| 5 |
+
import numpy as np
|
| 6 |
+
|
| 7 |
+
from safetensors import deserialize, safe_open, serialize, serialize_file
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def _tobytes(tensor: np.ndarray) -> bytes:
    """Return the raw buffer of `tensor`, byte-swapped to little-endian when needed."""
    if _is_little_endian(tensor):
        return tensor.tobytes()
    # Big-endian storage: swap to little-endian (the safetensors on-disk order)
    # without modifying the caller's array.
    return tensor.byteswap(inplace=False).tobytes()
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def save(
    tensor_dict: Dict[str, np.ndarray], metadata: Optional[Dict[str, str]] = None
) -> bytes:
    """
    Serialize a dictionary of numpy arrays to raw safetensors bytes.

    Args:
        tensor_dict (`Dict[str, np.ndarray]`):
            The incoming tensors. Tensors need to be contiguous and dense.
        metadata (`Dict[str, str]`, *optional*, defaults to `None`):
            Optional free-form text annotations stored in the header; purely
            informative, never affects tensor loading.

    Returns:
        `bytes`: The raw bytes representing the format

    Example:

    ```python
    from safetensors.numpy import save
    import numpy as np

    tensors = {"embedding": np.zeros((512, 1024)), "attention": np.zeros((256, 256))}
    byte_data = save(tensors)
    ```
    """
    # The native serializer takes {name: {"dtype", "shape", "data"}} entries.
    prepared = {
        name: {"dtype": arr.dtype.name, "shape": arr.shape, "data": _tobytes(arr)}
        for name, arr in tensor_dict.items()
    }
    return bytes(serialize(prepared, metadata=metadata))
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
def save_file(
    tensor_dict: Dict[str, np.ndarray],
    filename: Union[str, os.PathLike],
    metadata: Optional[Dict[str, str]] = None,
) -> None:
    """
    Serialize a dictionary of numpy arrays into a safetensors file.

    Args:
        tensor_dict (`Dict[str, np.ndarray]`):
            The incoming tensors. Tensors need to be contiguous and dense.
        filename (`str`, or `os.PathLike`)):
            The filename we're saving into.
        metadata (`Dict[str, str]`, *optional*, defaults to `None`):
            Optional free-form text annotations stored in the header; purely
            informative, never affects tensor loading.

    Returns:
        `None`

    Example:

    ```python
    from safetensors.numpy import save_file
    import numpy as np

    tensors = {"embedding": np.zeros((512, 1024)), "attention": np.zeros((256, 256))}
    save_file(tensors, "model.safetensors")
    ```
    """
    # The native serializer takes {name: {"dtype", "shape", "data"}} entries.
    prepared = {
        name: {"dtype": arr.dtype.name, "shape": arr.shape, "data": _tobytes(arr)}
        for name, arr in tensor_dict.items()
    }
    serialize_file(prepared, filename, metadata=metadata)
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
def load(data: bytes) -> Dict[str, np.ndarray]:
    """
    Deserialize safetensors bytes into a dict of numpy arrays.

    Args:
        data (`bytes`):
            The content of a safetensors file

    Returns:
        `Dict[str, np.ndarray]`: dictionary that contains name as key, value as `np.ndarray` on cpu

    Example:

    ```python
    from safetensors.numpy import load

    file_path = "./my_folder/bert.safetensors"
    with open(file_path, "rb") as f:
        data = f.read()

    loaded = load(data)
    ```
    """
    return _view2np(deserialize(data))
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
def load_file(filename: Union[str, os.PathLike]) -> Dict[str, np.ndarray]:
    """
    Load a safetensors file as a dict of numpy arrays.

    Args:
        filename (`str`, or `os.PathLike`)):
            The name of the file which contains the tensors

    Returns:
        `Dict[str, np.ndarray]`: dictionary that contains name as key, value as `np.ndarray`

    Example:

    ```python
    from safetensors.numpy import load_file

    file_path = "./my_folder/bert.safetensors"
    loaded = load_file(file_path)
    ```
    """
    # Read in offset order so the file is traversed sequentially.
    with safe_open(filename, framework="np") as f:
        return {name: f.get_tensor(name) for name in f.offset_keys()}
|
| 142 |
+
|
| 143 |
+
|
| 144 |
+
# Mapping from safetensors header dtype strings to numpy scalar types,
# used by `_getdtype` when materializing deserialized views.
# NOTE(review): "BF16" is absent — numpy has no native bfloat16; confirm how
# the loader is expected to behave on files containing BF16 tensors.
_TYPES = {
    "F64": np.float64,
    "F32": np.float32,
    "F16": np.float16,
    "I64": np.int64,
    "U64": np.uint64,
    "I32": np.int32,
    "U32": np.uint32,
    "I16": np.int16,
    "U16": np.uint16,
    "I8": np.int8,
    "U8": np.uint8,
    "BOOL": bool,
    "C64": np.complex64,
}
|
| 159 |
+
|
| 160 |
+
|
| 161 |
+
def _getdtype(dtype_str: str) -> np.dtype:
    """Translate a safetensors dtype string (e.g. "F32") to its numpy type.

    Raises:
        KeyError: if `dtype_str` is not listed in `_TYPES`.
    """
    return _TYPES[dtype_str]
|
| 163 |
+
|
| 164 |
+
|
| 165 |
+
def _view2np(safeview) -> Dict[str, np.ndarray]:
    """Materialize a deserialized safetensors view into named numpy arrays."""
    # Each entry is (name, {"dtype": str, "shape": list, "data": bytes}).
    return {
        name: np.frombuffer(info["data"], dtype=_getdtype(info["dtype"])).reshape(
            info["shape"]
        )
        for name, info in safeview
    }
|
| 172 |
+
|
| 173 |
+
|
| 174 |
+
def _is_little_endian(tensor: np.ndarray) -> bool:
|
| 175 |
+
byteorder = tensor.dtype.byteorder
|
| 176 |
+
if byteorder == "=":
|
| 177 |
+
if sys.byteorder == "little":
|
| 178 |
+
return True
|
| 179 |
+
else:
|
| 180 |
+
return False
|
| 181 |
+
elif byteorder == "|":
|
| 182 |
+
return True
|
| 183 |
+
elif byteorder == "<":
|
| 184 |
+
return True
|
| 185 |
+
elif byteorder == ">":
|
| 186 |
+
return False
|
| 187 |
+
raise ValueError(f"Unexpected byte order {byteorder}")
|
source/safetensors/paddle.py
ADDED
|
@@ -0,0 +1,290 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import sys
|
| 3 |
+
from typing import Any, Dict, Optional, Union
|
| 4 |
+
|
| 5 |
+
import numpy as np
|
| 6 |
+
import paddle
|
| 7 |
+
|
| 8 |
+
from safetensors import numpy, deserialize, safe_open, serialize, serialize_file
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def save(
    tensors: Dict[str, paddle.Tensor], metadata: Optional[Dict[str, str]] = None
) -> bytes:
    """
    Serialize a dictionary of paddle tensors to raw safetensors bytes.

    Args:
        tensors (`Dict[str, paddle.Tensor]`):
            The incoming tensors. Tensors need to be contiguous and dense.
        metadata (`Dict[str, str]`, *optional*, defaults to `None`):
            Optional free-form text annotations stored in the header; purely
            informative, never affects tensor loading.

    Returns:
        `bytes`: The raw bytes representing the format

    Example:

    ```python
    from safetensors.paddle import save
    import paddle

    tensors = {"embedding": paddle.zeros((512, 1024)), "attention": paddle.zeros((256, 256))}
    byte_data = save(tensors)
    ```
    """
    flattened = _flatten(tensors)
    return bytes(serialize(flattened, metadata=metadata))
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def save_file(
    tensors: Dict[str, paddle.Tensor],
    filename: Union[str, os.PathLike],
    metadata: Optional[Dict[str, str]] = None,
) -> None:
    """
    Serialize a dictionary of paddle tensors into a safetensors file.

    Args:
        tensors (`Dict[str, paddle.Tensor]`):
            The incoming tensors. Tensors need to be contiguous and dense.
        filename (`str`, or `os.PathLike`)):
            The filename we're saving into.
        metadata (`Dict[str, str]`, *optional*, defaults to `None`):
            Optional free-form text annotations stored in the header; purely
            informative, never affects tensor loading.

    Returns:
        `None`

    Example:

    ```python
    from safetensors.paddle import save_file
    import paddle

    tensors = {"embedding": paddle.zeros((512, 1024)), "attention": paddle.zeros((256, 256))}
    save_file(tensors, "model.safetensors")
    ```
    """
    flattened = _flatten(tensors)
    serialize_file(flattened, filename, metadata=metadata)
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
def load(data: bytes, device: str = "cpu") -> Dict[str, paddle.Tensor]:
    """
    Loads a safetensors file into paddle format from pure bytes.

    Args:
        data (`bytes`):
            The content of a safetensors file
        device (`str`, *optional*, defaults to `"cpu"`):
            The device where the tensors need to be located after load.

    Returns:
        `Dict[str, paddle.Tensor]`: dictionary that contains name as key, value as `paddle.Tensor` on cpu

    Example:

    ```python
    from safetensors.paddle import load

    file_path = "./my_folder/bert.safetensors"
    with open(file_path, "rb") as f:
        data = f.read()

    loaded = load(data)
    ```
    """

    def _version_at_least(minimum):
        # Plain string comparison of versions mis-orders multi-digit
        # components (e.g. "3.10.0" < "3.2.0" lexicographically), so compare
        # the leading numeric parts of each component instead.
        parts = []
        for chunk in paddle.__version__.split(".")[: len(minimum)]:
            digits = ""
            for ch in chunk:
                if not ch.isdigit():
                    break
                digits += ch
            parts.append(int(digits) if digits else 0)
        return tuple(parts) >= minimum

    if _version_at_least((3, 2, 0)):
        # Recent paddle: native zero-copy deserialization path.
        return _view2paddle(deserialize(data), device)
    # Older paddle: go through numpy as an intermediate representation.
    return _np2paddle(numpy.load(data), device)
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
def load_file(
    filename: Union[str, os.PathLike], device="cpu"
) -> Dict[str, paddle.Tensor]:
    """
    Loads a safetensors file into paddle format.

    Args:
        filename (`str`, or `os.PathLike`)):
            The name of the file which contains the tensors
        device (`Union[Dict[str, any], str]`, *optional*, defaults to `cpu`):
            The device where the tensors need to be located after load.
            available options are all regular paddle device locations

    Returns:
        `Dict[str, paddle.Tensor]`: dictionary that contains name as key, value as `paddle.Tensor`

    Example:

    ```python
    from safetensors.paddle import load_file

    file_path = "./my_folder/bert.safetensors"
    loaded = load_file(file_path)
    ```
    """

    def _version_at_least(minimum):
        # Plain string comparison of versions mis-orders multi-digit
        # components (e.g. "3.10.0" < "3.2.0" lexicographically), so compare
        # the leading numeric parts of each component instead.
        parts = []
        for chunk in paddle.__version__.split(".")[: len(minimum)]:
            digits = ""
            for ch in chunk:
                if not ch.isdigit():
                    break
                digits += ch
            parts.append(int(digits) if digits else 0)
        return tuple(parts) >= minimum

    if _version_at_least((3, 2, 0)):
        # Recent paddle: read tensors natively, in offset order.
        result = {}
        with safe_open(filename, framework="paddle", device=device) as f:
            for k in f.offset_keys():
                result[k] = f.get_tensor(k)
        return result
    # Older paddle: load via numpy, then convert.
    return _np2paddle(numpy.load_file(filename), device)
|
| 142 |
+
|
| 143 |
+
|
| 144 |
+
def _np2paddle(
    numpy_dict: Dict[str, np.ndarray], device: str = "cpu"
) -> Dict[str, paddle.Tensor]:
    """Convert every numpy array in *numpy_dict* to a paddle tensor, in place."""
    for name in numpy_dict:
        numpy_dict[name] = paddle.to_tensor(numpy_dict[name], place=device)
    return numpy_dict
|
| 150 |
+
|
| 151 |
+
|
| 152 |
+
def _paddle2np(paddle_dict: Dict[str, paddle.Tensor]) -> Dict[str, np.array]:
    """Convert every paddle tensor in *paddle_dict* to a cpu numpy array, in place."""
    for name in paddle_dict:
        paddle_dict[name] = paddle_dict[name].detach().cpu().numpy()
    return paddle_dict
|
| 156 |
+
|
| 157 |
+
|
| 158 |
+
# Width in bytes of each supported paddle dtype (used to size raw buffers).
_SIZE = {
    paddle.int64: 8,
    paddle.float32: 4,
    paddle.int32: 4,
    paddle.bfloat16: 2,
    paddle.float16: 2,
    paddle.int16: 2,
    paddle.uint8: 1,
    paddle.int8: 1,
    paddle.bool: 1,
    paddle.float64: 8,
    paddle.float8_e4m3fn: 1,
    paddle.float8_e5m2: 1,
    paddle.complex64: 8,
    # XXX: These are not supported yet in paddle
    # paddle.uint64: 8,
    # paddle.uint32: 4,
    # paddle.uint16: 2,
    # paddle.float8_e8m0: 1,
    # paddle.float4_e2m1_x2: 1,
}

# safetensors on-disk dtype tag -> paddle dtype.
_TYPES = {
    "F64": paddle.float64,
    "F32": paddle.float32,
    "F16": paddle.float16,
    "BF16": paddle.bfloat16,
    "I64": paddle.int64,
    "I32": paddle.int32,
    "I16": paddle.int16,
    "I8": paddle.int8,
    "U8": paddle.uint8,
    "BOOL": paddle.bool,
    "F8_E4M3": paddle.float8_e4m3fn,
    "F8_E5M2": paddle.float8_e5m2,
}

# paddle dtype -> numpy dtype of the same width; only used for byteswapping
# on big-endian hosts, so an exact type match is not required.
NPDTYPES = {
    paddle.int64: np.int64,
    paddle.float32: np.float32,
    paddle.int32: np.int32,
    # XXX: This is ok because both have the same width
    paddle.bfloat16: np.float16,
    paddle.float16: np.float16,
    paddle.int16: np.int16,
    paddle.uint8: np.uint8,
    paddle.int8: np.int8,
    paddle.bool: bool,
    paddle.float64: np.float64,
    # XXX: This is ok because both have the same width and byteswap is a no-op anyway
    paddle.float8_e4m3fn: np.uint8,
    paddle.float8_e5m2: np.uint8,
}


def _getdtype(dtype_str: str) -> paddle.dtype:
    """Map a safetensors dtype tag (e.g. "F32") to the paddle dtype."""
    return _TYPES[dtype_str]
|
| 215 |
+
|
| 216 |
+
|
| 217 |
+
def _view2paddle(safeview, device) -> Dict[str, paddle.Tensor]:
    """Convert a deserialized safetensors view into paddle tensors.

    ``safeview`` iterates over ``(name, info)`` pairs where ``info`` carries
    the raw ``data`` bytes plus the ``dtype`` and ``shape`` header fields.
    """
    result = {}
    for k, v in safeview:
        dtype = _getdtype(v["dtype"])
        if len(v["data"]) == 0:
            # Workaround because frombuffer doesn't accept zero-size tensors
            assert any(x == 0 for x in v["shape"])
            arr = paddle.empty(v["shape"], dtype=dtype)
        else:
            # Zero-copy view over the serialized bytes, then reshape.
            arr = paddle.base.core.frombuffer(v["data"], dtype).reshape(v["shape"])
        if device != "cpu":
            arr = arr.to(device)
        if sys.byteorder == "big":
            # On-disk data is little-endian; swap on big-endian hosts.
            # Note: this runs after the device move and rebuilds the tensor
            # from a swapped numpy copy directly on `device`.
            arr = paddle.to_tensor(arr.numpy().byteswap(inplace=False), place=device)
        result[k] = arr

    return result
|
| 234 |
+
|
| 235 |
+
|
| 236 |
+
def _tobytes(tensor: paddle.Tensor, name: str) -> bytes:
    """Return the raw bytes backing *tensor* (moved to cpu if needed).

    Raises:
        ValueError: if the tensor is not contiguous; views/slices must be
            packed with ``.contiguous()`` before saving.
    """
    if not tensor.is_contiguous():
        raise ValueError(
            f"You are trying to save a non contiguous tensor: `{name}` which is not allowed. It either means you"
            " are trying to save tensors which are reference of each other in which case it's recommended to save"
            " only the full tensors, and reslice at load time, or simply call `.contiguous()` on your tensor to"
            " pack it before saving."
        )
    if not tensor.place.is_cpu_place():
        # Moving tensor to cpu before saving
        tensor = tensor.cpu()

    import ctypes

    import numpy as np

    # When shape is empty (scalar), np.prod returns a float
    # we need a int for the following calculations
    length = int(np.prod(tensor.shape).item())
    bytes_per_item = _SIZE[tensor.dtype]

    total_bytes = length * bytes_per_item

    ptr = tensor.data_ptr()
    if ptr == 0:
        # Zero-size tensors have no backing allocation.
        return b""
    # Reinterpret the tensor's memory as a flat byte array without copying.
    newptr = ctypes.cast(ptr, ctypes.POINTER(ctypes.c_ubyte))
    data = np.ctypeslib.as_array(newptr, (total_bytes,))  # no internal copy
    if sys.byteorder == "big":
        npdtype = NPDTYPES[tensor.dtype]
        # Not in place as that would potentially modify a live running model
        data = data.view(npdtype).byteswap(inplace=False)
    return data.tobytes()
|
| 269 |
+
|
| 270 |
+
|
| 271 |
+
def _flatten(tensors: Dict[str, paddle.Tensor]) -> Dict[str, Dict[str, Any]]:
    """Turn a name->tensor dict into the header/bytes layout the Rust serializer expects.

    Validates every entry first, then emits ``{"dtype", "shape", "data"}``
    records keyed by tensor name.
    """
    if not isinstance(tensors, dict):
        raise ValueError(
            f"Expected a dict of [str, paddle.Tensor] but received {type(tensors)}"
        )

    for k, v in tensors.items():
        if not isinstance(v, paddle.Tensor):
            raise ValueError(
                f"Key `{k}` is invalid, expected paddle.Tensor but received {type(v)}"
            )

    flattened = {}
    for k, v in tensors.items():
        flattened[k] = {
            "dtype": str(v.dtype).split(".")[-1],
            "shape": v.shape,
            "data": _tobytes(v, k),
        }
    return flattened
|
source/safetensors/py.typed
ADDED
|
File without changes
|
source/safetensors/tensorflow.py
ADDED
|
@@ -0,0 +1,139 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from typing import Dict, Optional, Union
|
| 3 |
+
|
| 4 |
+
import numpy as np
|
| 5 |
+
import tensorflow as tf
|
| 6 |
+
|
| 7 |
+
from safetensors import numpy, safe_open
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def save(
    tensors: Dict[str, tf.Tensor], metadata: Optional[Dict[str, str]] = None
) -> bytes:
    """
    Serializes a dictionary of tensorflow tensors into raw safetensors bytes.

    Args:
        tensors (`Dict[str, tf.Tensor]`):
            The incoming tensors. Tensors need to be contiguous and dense.
        metadata (`Dict[str, str]`, *optional*, defaults to `None`):
            Optional text-only metadata stored in the header. Purely
            informative; it does not affect tensor loading.

    Returns:
        `bytes`: The raw bytes representing the format

    Example:

    ```python
    from safetensors.tensorflow import save
    import tensorflow as tf

    tensors = {"embedding": tf.zeros((512, 1024)), "attention": tf.zeros((256, 256))}
    byte_data = save(tensors)
    ```
    """
    # Route through the numpy backend after converting each tensor.
    return numpy.save(_tf2np(tensors), metadata=metadata)
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def save_file(
    tensors: Dict[str, tf.Tensor],
    filename: Union[str, os.PathLike],
    metadata: Optional[Dict[str, str]] = None,
) -> None:
    """
    Serializes a dictionary of tensorflow tensors into a safetensors file.

    Args:
        tensors (`Dict[str, tf.Tensor]`):
            The incoming tensors. Tensors need to be contiguous and dense.
        filename (`str`, or `os.PathLike`):
            The filename we're saving into.
        metadata (`Dict[str, str]`, *optional*, defaults to `None`):
            Optional text-only metadata stored in the header. Purely
            informative; it does not affect tensor loading.

    Returns:
        `None`

    Example:

    ```python
    from safetensors.tensorflow import save_file
    import tensorflow as tf

    tensors = {"embedding": tf.zeros((512, 1024)), "attention": tf.zeros((256, 256))}
    save_file(tensors, "model.safetensors")
    ```
    """
    # Route through the numpy backend after converting each tensor.
    return numpy.save_file(_tf2np(tensors), filename, metadata=metadata)
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
def load(data: bytes) -> Dict[str, tf.Tensor]:
    """
    Loads a safetensors file into tensorflow format from pure bytes.

    Args:
        data (`bytes`):
            The content of a safetensors file

    Returns:
        `Dict[str, tf.Tensor]`: dictionary that contains name as key, value as `tf.Tensor` on cpu

    Example:

    ```python
    from safetensors.tensorflow import load

    file_path = "./my_folder/bert.safetensors"
    with open(file_path, "rb") as f:
        data = f.read()

    loaded = load(data)
    ```
    """
    return _np2tf(numpy.load(data))
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
def load_file(filename: Union[str, os.PathLike]) -> Dict[str, tf.Tensor]:
    """
    Loads a safetensors file into tensorflow format.

    Args:
        filename (`str`, or `os.PathLike`):
            The name of the file which contains the tensors

    Returns:
        `Dict[str, tf.Tensor]`: dictionary that contains name as key, value as `tf.Tensor`

    Example:

    ```python
    from safetensors.tensorflow import load_file

    file_path = "./my_folder/bert.safetensors"
    loaded = load_file(file_path)
    ```
    """
    with safe_open(filename, framework="tf") as f:
        return {name: f.get_tensor(name) for name in f.offset_keys()}
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
def _np2tf(numpy_dict: Dict[str, np.ndarray]) -> Dict[str, tf.Tensor]:
    """Convert every numpy array in *numpy_dict* to a tf.Tensor, in place."""
    for name in numpy_dict:
        numpy_dict[name] = tf.convert_to_tensor(numpy_dict[name])
    return numpy_dict
|
| 134 |
+
|
| 135 |
+
|
| 136 |
+
def _tf2np(tf_dict: Dict[str, tf.Tensor]) -> Dict[str, np.array]:
    """Convert every tf.Tensor in *tf_dict* to a numpy array, in place."""
    for name in tf_dict:
        tf_dict[name] = tf_dict[name].numpy()
    return tf_dict
|
source/safetensors/torch.py
ADDED
|
@@ -0,0 +1,550 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import sys
|
| 3 |
+
from collections import defaultdict
|
| 4 |
+
from typing import Any, Dict, List, Optional, Set, Tuple, Union
|
| 5 |
+
from packaging.version import Version
|
| 6 |
+
|
| 7 |
+
import torch
|
| 8 |
+
|
| 9 |
+
from safetensors import deserialize, safe_open, serialize, serialize_file
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def storage_ptr(tensor: torch.Tensor) -> int:
    """Return the data pointer of *tensor*'s underlying storage.

    Probes the modern ``untyped_storage()`` API first, then the legacy
    ``storage()`` accessor, and finally returns 0 when the storage cannot
    be materialized (meta tensors).
    """
    try:
        return tensor.untyped_storage().data_ptr()
    except Exception:
        # Fallback for torch==1.10
        try:
            return tensor.storage().data_ptr()
        except NotImplementedError:
            # Fallback for meta storage
            return 0
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def _end_ptr(tensor: torch.Tensor) -> int:
    """Address one past the last byte of *tensor*'s data (its start for empties)."""
    if not tensor.nelement():
        return tensor.data_ptr()
    # Pointer of the last element plus one element width.
    return tensor.view(-1)[-1].data_ptr() + _SIZE[tensor.dtype]
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def storage_size(tensor: torch.Tensor) -> int:
    """Return the size in bytes of *tensor*'s underlying storage.

    Tries the modern ``untyped_storage()`` API first, then the legacy
    ``storage()`` accessor, and finally falls back to the tensor's own
    element count for meta storage.
    """
    try:
        return tensor.untyped_storage().nbytes()
    except AttributeError:
        # Fallback for torch==1.10
        try:
            return tensor.storage().size() * _SIZE[tensor.dtype]
        except NotImplementedError:
            # Fallback for meta storage
            # On torch >=2.0 this is the tensor size
            return tensor.nelement() * _SIZE[tensor.dtype]
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
def _filter_shared_not_shared(
    tensors: List[Set[str]], state_dict: Dict[str, torch.Tensor]
) -> List[Set[str]]:
    """Split storage-sharing candidate groups into truly overlapping ones.

    Two tensors can live in the same storage without overlapping byte
    ranges (e.g. disjoint slices of one buffer); those are not really
    shared and are separated into their own groups here.
    """
    filtered_tensors = []
    for shared in tensors:
        if len(shared) < 2:
            # A singleton group cannot overlap anything.
            filtered_tensors.append(shared)
            continue

        # Collect each tensor's byte range within the shared storage.
        areas = []
        for name in shared:
            tensor = state_dict[name]
            areas.append((tensor.data_ptr(), _end_ptr(tensor), name))
        # Sorting by start address turns overlap detection into one sweep.
        areas.sort()

        _, last_stop, last_name = areas[0]
        filtered_tensors.append({last_name})
        for start, stop, name in areas[1:]:
            if start >= last_stop:
                # Disjoint range: starts a new group.
                filtered_tensors.append({name})
            else:
                # Overlaps the previous range: genuinely shared bytes.
                filtered_tensors[-1].add(name)
            last_stop = stop

    return filtered_tensors
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
def _find_shared_tensors(state_dict: Dict[str, torch.Tensor]) -> List[Set[str]]:
    """Group state-dict names whose tensors overlap in the same storage."""
    by_storage = defaultdict(set)
    for name, tensor in state_dict.items():
        on_meta = tensor.device == torch.device("meta")
        if on_meta or storage_ptr(tensor) == 0 or storage_size(tensor) == 0:
            continue
        # Need to add device as key because of multiple GPU.
        by_storage[(tensor.device, storage_ptr(tensor), storage_size(tensor))].add(name)
    groups = list(sorted(by_storage.values()))
    return _filter_shared_not_shared(groups, state_dict)
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
def _is_complete(tensor: torch.Tensor) -> bool:
    """True when *tensor* exactly covers its whole backing storage."""
    starts_at_origin = tensor.data_ptr() == storage_ptr(tensor)
    covers_storage = tensor.nelement() * _SIZE[tensor.dtype] == storage_size(tensor)
    return starts_at_origin and covers_storage
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
def _remove_duplicate_names(
    state_dict: Dict[str, torch.Tensor],
    *,
    preferred_names: Optional[List[str]] = None,
    discard_names: Optional[List[str]] = None,
) -> Dict[str, List[str]]:
    """For each group of storage-sharing tensors, choose one name to keep.

    Returns ``{kept_name: [names_to_drop, ...]}``. The kept name must be
    "complete" (cover its whole storage); *discard_names* lose ties and
    *preferred_names* win them, in that order of application.

    Raises:
        RuntimeError: if a shared group contains no complete tensor, since
            dropping any name could then lose data.
    """
    if preferred_names is None:
        preferred_names = []
    preferred_names = set(preferred_names)
    if discard_names is None:
        discard_names = []
    discard_names = set(discard_names)

    shareds = _find_shared_tensors(state_dict)
    to_remove = defaultdict(list)
    for shared in shareds:
        complete_names = set(
            [name for name in shared if _is_complete(state_dict[name])]
        )
        if not complete_names:
            raise RuntimeError(
                "Error while trying to find names to remove to save state dict, but found no suitable name to keep"
                f" for saving amongst: {shared}. None is covering the entire storage.Refusing to save/load the model"
                " since you could be storing much more memory than needed. Please refer to"
                " https://huggingface.co/docs/safetensors/torch_shared_tensors for more information. Or open an"
                " issue."
            )

        # Deterministic default: alphabetically-first complete name.
        keep_name = sorted(list(complete_names))[0]

        # Mechanism to preferentially select keys to keep
        # coming from the on-disk file to allow
        # loading models saved with a different choice
        # of keep_name
        preferred = complete_names.difference(discard_names)
        if preferred:
            keep_name = sorted(list(preferred))[0]

        if preferred_names:
            preferred = preferred_names.intersection(complete_names)
            if preferred:
                keep_name = sorted(list(preferred))[0]
        # Everything else in the group is an alias slated for removal.
        for name in sorted(shared):
            if name != keep_name:
                to_remove[keep_name].append(name)
    return to_remove
|
| 139 |
+
|
| 140 |
+
|
| 141 |
+
def save_model(
    model: torch.nn.Module,
    filename: str,
    metadata: Optional[Dict[str, str]] = None,
    force_contiguous: bool = True,
):
    """
    Saves a given torch model to specified filename.
    This method exists specifically to avoid tensor sharing issues which are
    not allowed in `safetensors`. [More information on tensor sharing](../torch_shared_tensors)

    Args:
        model (`torch.nn.Module`):
            The model to save on disk.
        filename (`str`):
            The filename location to save the file
        metadata (`Dict[str, str]`, *optional*):
            Extra information to save along with the file.
            Some metadata will be added for each dropped tensors.
            This information will not be enough to recover the entire
            shared structure but might help understanding things
        force_contiguous (`boolean`, *optional*, defaults to True):
            Forcing the state_dict to be saved as contiguous tensors.
            This has no effect on the correctness of the model, but it
            could potentially change performance if the layout of the tensor
            was chosen specifically for that reason.
    """
    state_dict = model.state_dict()
    # Drop aliases of shared tensors; only one name per storage is saved.
    to_removes = _remove_duplicate_names(state_dict)

    for kept_name, to_remove_group in to_removes.items():
        for to_remove in to_remove_group:
            if metadata is None:
                metadata = {}

            if to_remove not in metadata:
                # Do not override user data
                # Record which kept tensor the dropped alias pointed at.
                metadata[to_remove] = kept_name
            del state_dict[to_remove]
    if force_contiguous:
        state_dict = {k: v.contiguous() for k, v in state_dict.items()}
    try:
        save_file(state_dict, filename, metadata=metadata)
    except ValueError as e:
        # Append the contiguity hint to the original error text.
        msg = str(e)
        msg += " Or use save_model(..., force_contiguous=True), read the docs for potential caveats."
        raise ValueError(msg)
|
| 188 |
+
|
| 189 |
+
|
| 190 |
+
def load_model(
    model: torch.nn.Module,
    filename: Union[str, os.PathLike],
    strict: bool = True,
    device: Union[str, int] = "cpu",
) -> Tuple[List[str], List[str]]:
    """
    Loads a given filename onto a torch model.
    This method exists specifically to avoid tensor sharing issues which are
    not allowed in `safetensors`. [More information on tensor sharing](../torch_shared_tensors)

    Args:
        model (`torch.nn.Module`):
            The model to load onto.
        filename (`str`, or `os.PathLike`):
            The filename location to load the file from.
        strict (`bool`, *optional*, defaults to True):
            Whether to fail if you're missing keys or having unexpected ones.
            When false, the function simply returns missing and unexpected names.
        device (`Union[str, int]`, *optional*, defaults to `cpu`):
            The device where the tensors need to be located after load.
            available options are all regular torch device locations.

    Returns:
        `(missing, unexpected): (List[str], List[str])`
            `missing` are names in the model which were not modified during loading
            `unexpected` are names that are on the file, but weren't used during
            the load.
    """
    state_dict = load_file(filename, device=device)
    model_state_dict = model.state_dict()
    # Aliases dropped at save time are expected to be "missing" on disk;
    # prefer keeping the names that the file actually contains.
    to_removes = _remove_duplicate_names(
        model_state_dict, preferred_names=state_dict.keys()
    )
    missing, unexpected = model.load_state_dict(state_dict, strict=False)
    missing = set(missing)
    # Reclassify the known shared-tensor aliases so they are not reported
    # as spurious missing keys.
    for to_remove_group in to_removes.values():
        for to_remove in to_remove_group:
            if to_remove not in missing:
                unexpected.append(to_remove)
            else:
                missing.remove(to_remove)
    if strict and (missing or unexpected):
        missing_keys = ", ".join([f'"{k}"' for k in sorted(missing)])
        unexpected_keys = ", ".join([f'"{k}"' for k in sorted(unexpected)])
        error = f"Error(s) in loading state_dict for {model.__class__.__name__}:"
        if missing:
            error += f"\n    Missing key(s) in state_dict: {missing_keys}"
        if unexpected:
            error += f"\n    Unexpected key(s) in state_dict: {unexpected_keys}"
        raise RuntimeError(error)
    return missing, unexpected
|
| 242 |
+
|
| 243 |
+
|
| 244 |
+
def save(
    tensors: Dict[str, torch.Tensor], metadata: Optional[Dict[str, str]] = None
) -> bytes:
    """
    Serializes a dictionary of torch tensors into raw safetensors bytes.

    Args:
        tensors (`Dict[str, torch.Tensor]`):
            The incoming tensors. Tensors need to be contiguous and dense.
        metadata (`Dict[str, str]`, *optional*, defaults to `None`):
            Optional text-only metadata stored in the header. Purely
            informative; it does not affect tensor loading.

    Returns:
        `bytes`: The raw bytes representing the format

    Example:

    ```python
    from safetensors.torch import save
    import torch

    tensors = {"embedding": torch.zeros((512, 1024)), "attention": torch.zeros((256, 256))}
    byte_data = save(tensors)
    ```
    """
    return bytes(serialize(_flatten(tensors), metadata=metadata))
|
| 274 |
+
|
| 275 |
+
|
| 276 |
+
def save_file(
|
| 277 |
+
tensors: Dict[str, torch.Tensor],
|
| 278 |
+
filename: Union[str, os.PathLike],
|
| 279 |
+
metadata: Optional[Dict[str, str]] = None,
|
| 280 |
+
):
|
| 281 |
+
"""
|
| 282 |
+
Saves a dictionary of tensors into raw bytes in safetensors format.
|
| 283 |
+
|
| 284 |
+
Args:
|
| 285 |
+
tensors (`Dict[str, torch.Tensor]`):
|
| 286 |
+
The incoming tensors. Tensors need to be contiguous and dense.
|
| 287 |
+
filename (`str`, or `os.PathLike`)):
|
| 288 |
+
The filename we're saving into.
|
| 289 |
+
metadata (`Dict[str, str]`, *optional*, defaults to `None`):
|
| 290 |
+
Optional text only metadata you might want to save in your header.
|
| 291 |
+
For instance it can be useful to specify more about the underlying
|
| 292 |
+
tensors. This is purely informative and does not affect tensor loading.
|
| 293 |
+
|
| 294 |
+
Returns:
|
| 295 |
+
`None`
|
| 296 |
+
|
| 297 |
+
Example:
|
| 298 |
+
|
| 299 |
+
```python
|
| 300 |
+
from safetensors.torch import save_file
|
| 301 |
+
import torch
|
| 302 |
+
|
| 303 |
+
tensors = {"embedding": torch.zeros((512, 1024)), "attention": torch.zeros((256, 256))}
|
| 304 |
+
save_file(tensors, "model.safetensors")
|
| 305 |
+
```
|
| 306 |
+
"""
|
| 307 |
+
serialize_file(_flatten(tensors), filename, metadata=metadata)
|
| 308 |
+
|
| 309 |
+
|
| 310 |
+
def load_file(
    filename: Union[str, os.PathLike], device: Union[str, int] = "cpu"
) -> Dict[str, torch.Tensor]:
    """
    Loads a safetensors file into torch format.

    Args:
        filename (`str`, or `os.PathLike`):
            The name of the file which contains the tensors
        device (`Union[str, int]`, *optional*, defaults to `cpu`):
            The device where the tensors need to be located after load.
            available options are all regular torch device locations.

    Returns:
        `Dict[str, torch.Tensor]`: dictionary that contains name as key, value as `torch.Tensor`

    Example:

    ```python
    from safetensors.torch import load_file

    file_path = "./my_folder/bert.safetensors"
    loaded = load_file(file_path)
    ```
    """
    with safe_open(filename, framework="pt", device=device) as f:
        return {name: f.get_tensor(name) for name in f.offset_keys()}
|
| 340 |
+
|
| 341 |
+
|
| 342 |
+
def load(data: bytes) -> Dict[str, torch.Tensor]:
    """
    Loads a safetensors file into torch format from pure bytes.

    Args:
        data (`bytes`):
            The content of a safetensors file

    Returns:
        `Dict[str, torch.Tensor]`: dictionary that contains name as key, value as `torch.Tensor` on cpu

    Example:

    ```python
    from safetensors.torch import load

    file_path = "./my_folder/bert.safetensors"
    with open(file_path, "rb") as f:
        data = f.read()

    loaded = load(data)
    ```
    """
    return _view2torch(deserialize(data))
|
| 367 |
+
|
| 368 |
+
|
| 369 |
+
# torch.float8 formats require 2.1; we do not support these dtypes on earlier versions
|
| 370 |
+
_float8_e4m3fn = getattr(torch, "float8_e4m3fn", None)
|
| 371 |
+
_float8_e5m2 = getattr(torch, "float8_e5m2", None)
|
| 372 |
+
_float8_e8m0 = getattr(torch, "float8_e8m0fnu", None)
|
| 373 |
+
_float4_e2m1_x2 = getattr(torch, "float4_e2m1fn_x2", None)
|
| 374 |
+
|
| 375 |
+
_SIZE = {
|
| 376 |
+
torch.int64: 8,
|
| 377 |
+
torch.float32: 4,
|
| 378 |
+
torch.int32: 4,
|
| 379 |
+
torch.bfloat16: 2,
|
| 380 |
+
torch.float16: 2,
|
| 381 |
+
torch.int16: 2,
|
| 382 |
+
torch.uint8: 1,
|
| 383 |
+
torch.int8: 1,
|
| 384 |
+
torch.bool: 1,
|
| 385 |
+
torch.float64: 8,
|
| 386 |
+
torch.complex64: 8,
|
| 387 |
+
_float8_e4m3fn: 1,
|
| 388 |
+
_float8_e5m2: 1,
|
| 389 |
+
_float8_e8m0: 1,
|
| 390 |
+
_float4_e2m1_x2: 1,
|
| 391 |
+
}
|
| 392 |
+
if Version(torch.__version__) >= Version("2.3.0"):
|
| 393 |
+
_SIZE.update(
|
| 394 |
+
{
|
| 395 |
+
torch.uint64: 8,
|
| 396 |
+
torch.uint32: 4,
|
| 397 |
+
torch.uint16: 2,
|
| 398 |
+
}
|
| 399 |
+
)
|
| 400 |
+
|
| 401 |
+
_TYPES = {
|
| 402 |
+
"F64": torch.float64,
|
| 403 |
+
"F32": torch.float32,
|
| 404 |
+
"F16": torch.float16,
|
| 405 |
+
"BF16": torch.bfloat16,
|
| 406 |
+
"I64": torch.int64,
|
| 407 |
+
"I32": torch.int32,
|
| 408 |
+
"I16": torch.int16,
|
| 409 |
+
"I8": torch.int8,
|
| 410 |
+
"U8": torch.uint8,
|
| 411 |
+
"BOOL": torch.bool,
|
| 412 |
+
"F8_E4M3": _float8_e4m3fn,
|
| 413 |
+
"F8_E5M2": _float8_e5m2,
|
| 414 |
+
"C64": torch.complex64,
|
| 415 |
+
}
|
| 416 |
+
if Version(torch.__version__) >= Version("2.3.0"):
|
| 417 |
+
_TYPES.update(
|
| 418 |
+
{
|
| 419 |
+
"U64": torch.uint64,
|
| 420 |
+
"U32": torch.uint32,
|
| 421 |
+
"U16": torch.uint16,
|
| 422 |
+
}
|
| 423 |
+
)
|
| 424 |
+
|
| 425 |
+
|
| 426 |
+
def _getdtype(dtype_str: str) -> torch.dtype:
    """Map a safetensors dtype string (e.g. ``"F32"``) to the torch dtype.

    Raises:
        KeyError: if `dtype_str` is not a known safetensors dtype string.
        ValueError: if the dtype string is known but the corresponding torch
            dtype is unavailable in the installed torch build (the `_TYPES`
            table maps such entries to None, e.g. float8 before torch 2.1).
            Previously this case silently returned None, which surfaced later
            as an opaque failure in `torch.frombuffer`.
    """
    dtype = _TYPES[dtype_str]
    if dtype is None:
        raise ValueError(
            f"Dtype `{dtype_str}` is not supported by the installed version of"
            " torch; upgrading torch is required to load this tensor."
        )
    return dtype
|
| 428 |
+
|
| 429 |
+
|
| 430 |
+
def _view2torch(safeview) -> Dict[str, torch.Tensor]:
    """Convert a deserialized safetensors view into a dict of torch tensors.

    `safeview` yields ``(name, info)`` pairs where ``info`` carries the
    ``"dtype"`` string, the ``"shape"`` list and the raw ``"data"`` bytes.
    """
    tensors: Dict[str, torch.Tensor] = {}
    for name, info in safeview:
        torch_dtype = _getdtype(info["dtype"])
        raw = info["data"]
        shape = info["shape"]
        if len(raw) == 0:
            # torch.frombuffer rejects empty buffers, so build the
            # zero-sized tensor directly instead.
            assert any(dim == 0 for dim in shape)
            tensor = torch.empty(shape, dtype=torch_dtype)
        else:
            tensor = torch.frombuffer(raw, dtype=torch_dtype).reshape(shape)
        if sys.byteorder == "big":
            # Serialized data is little-endian; swap (out of place) on
            # big-endian hosts.
            tensor = torch.from_numpy(tensor.numpy().byteswap(inplace=False))
        tensors[name] = tensor
    return tensors
|
| 445 |
+
|
| 446 |
+
|
| 447 |
+
def _tobytes(tensor: torch.Tensor, name: str) -> bytes:
    """Return the raw storage bytes of `tensor`, always little-endian.

    `name` is used only in error messages. Sparse and non-contiguous tensors
    are rejected because they cannot be serialized as a flat buffer; non-CPU
    tensors are copied to CPU first. On big-endian hosts the bytes are
    byte-swapped out of place so the stored form is little-endian.
    """
    if tensor.layout != torch.strided:
        raise ValueError(
            f"You are trying to save a sparse tensor: `{name}` which this library does not support."
            " You can make it a dense tensor before saving with `.to_dense()` but be aware this might"
            " make a much larger file than needed."
        )

    if not tensor.is_contiguous():
        raise ValueError(
            f"You are trying to save a non contiguous tensor: `{name}` which is not allowed. It either means you"
            " are trying to save tensors which are reference of each other in which case it's recommended to save"
            " only the full tensors, and reslice at load time, or simply call `.contiguous()` on your tensor to"
            " pack it before saving."
        )
    if tensor.device.type != "cpu":
        # Moving tensor to cpu before saving
        tensor = tensor.to("cpu")

    import ctypes

    import numpy as np

    # When shape is empty (scalar), np.prod returns a float
    # we need a int for the following calculations
    length = int(np.prod(tensor.shape).item())
    bytes_per_item = _SIZE[tensor.dtype]

    total_bytes = length * bytes_per_item

    ptr = tensor.data_ptr()
    if ptr == 0:
        # Null data pointer: no backing storage to read, emit empty bytes.
        return b""
    # Reinterpret the tensor's storage as a flat C byte array without copying.
    newptr = ctypes.cast(ptr, ctypes.POINTER(ctypes.c_ubyte))
    data = np.ctypeslib.as_array(newptr, (total_bytes,))  # no internal copy
    if sys.byteorder == "big":
        # numpy equivalents used only to drive the byteswap; element widths
        # must match the torch dtype's width.
        NPDTYPES = {
            torch.int64: np.int64,
            torch.float32: np.float32,
            torch.int32: np.int32,
            # XXX: This is ok because both have the same width
            torch.bfloat16: np.float16,
            torch.float16: np.float16,
            torch.int16: np.int16,
            torch.uint8: np.uint8,
            torch.int8: np.int8,
            torch.bool: bool,
            torch.float64: np.float64,
            # XXX: This is ok because both have the same width and byteswap is a no-op anyway
            _float8_e4m3fn: np.uint8,
            _float8_e5m2: np.uint8,
            torch.complex64: np.complex64,
        }
        npdtype = NPDTYPES[tensor.dtype]
        # Not in place as that would potentially modify a live running model
        data = data.view(npdtype).byteswap(inplace=False)
    return data.tobytes()
|
| 504 |
+
|
| 505 |
+
|
| 506 |
+
def _flatten(tensors: Dict[str, torch.Tensor]) -> Dict[str, Dict[str, Any]]:
    """Convert a dict of tensors into the dtype/shape/bytes form used for serialization.

    Validates that the input is a ``dict`` of dense ``torch.Tensor`` values and
    that no tensors share memory, then returns per-name metadata plus raw bytes.
    """
    if not isinstance(tensors, dict):
        raise ValueError(
            f"Expected a dict of [str, torch.Tensor] but received {type(tensors)}"
        )

    # Reject non-tensor values up front; collect sparse tensors for one error.
    for key, value in tensors.items():
        if not isinstance(value, torch.Tensor):
            raise ValueError(
                f"Key `{key}` is invalid, expected torch.Tensor but received {type(value)}"
            )

    invalid_tensors = [
        key for key, value in tensors.items() if value.layout != torch.strided
    ]
    if invalid_tensors:
        raise ValueError(
            f"You are trying to save a sparse tensors: `{invalid_tensors}` which this library does not support."
            " You can make it a dense tensor before saving with `.to_dense()` but be aware this might"
            " make a much larger file than needed."
        )

    # Tensors sharing storage would be duplicated on disk and may not round-trip.
    failing = [group for group in _find_shared_tensors(tensors) if len(group) > 1]
    if failing:
        raise RuntimeError(
            f"""
Some tensors share memory, this will lead to duplicate memory on disk and potential differences when loading them again: {failing}.
A potential way to correctly save your model is to use `save_model`.
More information at https://huggingface.co/docs/safetensors/torch_shared_tensors
"""
        )

    return {
        key: {
            # e.g. torch.float32 -> "float32"
            "dtype": str(value.dtype).split(".")[-1],
            "shape": value.shape,
            "data": _tobytes(value, key),
        }
        for key, value in tensors.items()
    }
|
source/sentencepiece-0.2.1.dist-info/INSTALLER
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
pip
|
source/sentencepiece-0.2.1.dist-info/METADATA
ADDED
|
@@ -0,0 +1,251 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Metadata-Version: 2.4
|
| 2 |
+
Name: sentencepiece
|
| 3 |
+
Version: 0.2.1
|
| 4 |
+
Summary: Unsupervised text tokenizer and detokenizer.
|
| 5 |
+
Author-email: Taku Kudo <taku@google.com>
|
| 6 |
+
Project-URL: Homepage, https://github.com/google/sentencepiece
|
| 7 |
+
Classifier: Programming Language :: Python :: 3
|
| 8 |
+
Classifier: Development Status :: 5 - Production/Stable
|
| 9 |
+
Classifier: Environment :: Console
|
| 10 |
+
Classifier: Intended Audience :: Developers
|
| 11 |
+
Classifier: Intended Audience :: Science/Research
|
| 12 |
+
Classifier: Operating System :: MacOS :: MacOS X
|
| 13 |
+
Classifier: Operating System :: Microsoft :: Windows
|
| 14 |
+
Classifier: Operating System :: POSIX :: Linux
|
| 15 |
+
Classifier: Programming Language :: Python
|
| 16 |
+
Classifier: Programming Language :: Python :: 3.9
|
| 17 |
+
Classifier: Programming Language :: Python :: 3.10
|
| 18 |
+
Classifier: Programming Language :: Python :: 3.11
|
| 19 |
+
Classifier: Programming Language :: Python :: 3.12
|
| 20 |
+
Classifier: Programming Language :: Python :: 3.13
|
| 21 |
+
Classifier: Programming Language :: Python :: 3.14
|
| 22 |
+
Classifier: Programming Language :: Python :: Free Threading :: 2 - Beta
|
| 23 |
+
Classifier: Topic :: Text Processing :: Linguistic
|
| 24 |
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
| 25 |
+
Requires-Python: >=3.9
|
| 26 |
+
Description-Content-Type: text/markdown
|
| 27 |
+
Provides-Extra: test
|
| 28 |
+
Requires-Dist: pytest; extra == "test"
|
| 29 |
+
Provides-Extra: testpaths
|
| 30 |
+
Requires-Dist: test; extra == "testpaths"
|
| 31 |
+
|
| 32 |
+
# SentencePiece Python Wrapper
|
| 33 |
+
|
| 34 |
+
Python wrapper for SentencePiece. This API will offer the encoding, decoding and training of Sentencepiece.
|
| 35 |
+
|
| 36 |
+
## Build and Install SentencePiece
|
| 37 |
+
|
| 38 |
+
For Linux (x64/i686), macOS, and Windows(win32/x64/arm64) environment, you can simply use pip command to install SentencePiece python module.
|
| 39 |
+
|
| 40 |
+
```
|
| 41 |
+
% pip install sentencepiece
|
| 42 |
+
```
|
| 43 |
+
|
| 44 |
+
Before building SentencePiece from source on Linux, ensure that the following dependencies are installed.
|
| 45 |
+
|
| 46 |
+
```
|
| 47 |
+
% sudo apt update
|
| 48 |
+
% sudo apt install -y cmake pkg-config libsentencepiece-dev
|
| 49 |
+
```
|
| 50 |
+
|
| 51 |
+
To build and install the Python wrapper from source, try the following commands to build and install wheel package.
|
| 52 |
+
|
| 53 |
+
```
|
| 54 |
+
% git clone https://github.com/google/sentencepiece.git
|
| 55 |
+
% cd sentencepiece
|
| 56 |
+
% mkdir build
|
| 57 |
+
% cd build
|
| 58 |
+
% cmake .. -DSPM_ENABLE_SHARED=OFF -DCMAKE_INSTALL_PREFIX=./root -DSPM_DISABLE_EMBEDDED_DATA=ON
|
| 59 |
+
% make install
|
| 60 |
+
% cd ../python
|
| 61 |
+
% python setup.py bdist_wheel
|
| 62 |
+
% pip install dist/sentencepiece*.whl
|
| 63 |
+
```
|
| 64 |
+
|
| 65 |
+
If you don’t have write permission to the global site-packages directory or don’t want to install into it, please try:
|
| 66 |
+
|
| 67 |
+
```
|
| 68 |
+
% python setup.py install --user
|
| 69 |
+
```
|
| 70 |
+
|
| 71 |
+
For Windows users who want to build from source, you can build and install the Python wrapper using Visual Studio. First, you need to install the `pwsh.exe` (Powershell 7). Use `winget install --id Microsoft.Powershell --source winget` to install directly. Then open the `Developer PowerShell for VS 2022`, and execute the following commands.
|
| 72 |
+
|
| 73 |
+
```
|
| 74 |
+
git clone https://github.com/google/sentencepiece.git
|
| 75 |
+
cd sentencepiece
|
| 76 |
+
mkdir build
|
| 77 |
+
cd build
|
| 78 |
+
cmake .. -DSPM_ENABLE_SHARED=OFF -DCMAKE_INSTALL_PREFIX=".\root" -DSPM_DISABLE_EMBEDDED_DATA=ON
|
| 79 |
+
cmake --build . --config Release --target install
|
| 80 |
+
cd ../python
|
| 81 |
+
pip install wheel
|
| 82 |
+
python setup.py bdist_wheel
|
| 83 |
+
Get-ChildItem .\dist\sentencepiece*.whl | ForEach-Object { pip install $_.FullName }
|
| 84 |
+
```
|
| 85 |
+
|
| 86 |
+
## Usage
|
| 87 |
+
|
| 88 |
+
See [this google colab page](https://github.com/google/sentencepiece/blob/master/python/sentencepiece_python_module_example.ipynb) to run sentencepiece interactively.
|
| 89 |
+
|
| 90 |
+
### Segmentation
|
| 91 |
+
|
| 92 |
+
```
|
| 93 |
+
% python
|
| 94 |
+
>>> import sentencepiece as spm
|
| 95 |
+
>>> sp = spm.SentencePieceProcessor(model_file='test/test_model.model')
|
| 96 |
+
|
| 97 |
+
>>> sp.encode('This is a test')
|
| 98 |
+
[284, 47, 11, 4, 15, 400]
|
| 99 |
+
|
| 100 |
+
>>> sp.encode(['This is a test', 'Hello world'], out_type=int)
|
| 101 |
+
[[284, 47, 11, 4, 15, 400], [151, 88, 21, 887]]
|
| 102 |
+
|
| 103 |
+
>>> sp.encode_as_ids(['This is a test', 'Hello world'])
|
| 104 |
+
[[284, 47, 11, 4, 15, 400], [151, 88, 21, 887]]
|
| 105 |
+
|
| 106 |
+
>>> sp.encode('This is a test', out_type=str)
|
| 107 |
+
['▁This', '▁is', '▁a', '▁', 't', 'est']
|
| 108 |
+
|
| 109 |
+
>>> sp.encode(['This is a test', 'Hello world'], out_type=str)
|
| 110 |
+
[['▁This', '▁is', '▁a', '▁', 't', 'est'], ['▁He', 'll', 'o', '▁world']]
|
| 111 |
+
|
| 112 |
+
>>> sp.encode_as_pieces(['This is a test', 'Hello world'])
|
| 113 |
+
[['▁This', '▁is', '▁a', '▁', 't', 'est'], ['▁He', 'll', 'o', '▁world']]
|
| 114 |
+
|
| 115 |
+
>>> proto = sp.encode('This is a test', out_type='immutable_proto')
|
| 116 |
+
>>> for n in proto.pieces:
|
| 117 |
+
... print('piece="{}" surface="{}" id={} begin={} end={}'.format(n.piece, n.surface, n.id, n.begin, n.end))
|
| 118 |
+
...
|
| 119 |
+
piece="▁This" surface="This" id=284 begin=0 end=4
|
| 120 |
+
piece="▁is" surface=" is" id=47 begin=4 end=7
|
| 121 |
+
piece="▁a" surface=" a" id=11 begin=7 end=9
|
| 122 |
+
piece="▁" surface=" " id=4 begin=9 end=10
|
| 123 |
+
piece="t" surface="t" id=15 begin=10 end=11
|
| 124 |
+
piece="est" surface="est" id=400 begin=11 end=14
|
| 125 |
+
|
| 126 |
+
>>> [[x.id for x in proto.pieces], [x.piece for x in proto.pieces], [x.begin for x in proto.pieces], [x.end for x in proto.pieces]]
|
| 127 |
+
[[284, 47, 11, 4, 15, 400], ['▁This', '▁is', '▁a', '▁', 't', 'est'], [0, 4, 7, 9, 10, 11], [4, 7, 9, 10, 11, 14]]
|
| 128 |
+
|
| 129 |
+
>>> proto2 = sp.encode_as_immutable_proto('This is a test')
|
| 130 |
+
>>> proto2 == proto
|
| 131 |
+
True
|
| 132 |
+
|
| 133 |
+
>>> for _ in range(10):
|
| 134 |
+
... sp.encode('This is a test', out_type=str, enable_sampling=True, alpha=0.1, nbest_size=-1)
|
| 135 |
+
...
|
| 136 |
+
['▁', 'This', '▁', 'is', '▁a', '▁', 't', 'e', 'st']
|
| 137 |
+
['▁T', 'h', 'i', 's', '▁is', '▁a', '▁', 'te', 's', 't']
|
| 138 |
+
['▁T', 'h', 'is', '▁', 'is', '▁', 'a', '▁', 't', 'est']
|
| 139 |
+
['▁', 'This', '▁is', '▁', 'a', '▁', 't', 'e', 'st']
|
| 140 |
+
['▁', 'This', '▁', 'is', '▁', 'a', '▁', 't', 'e', 's', 't']
|
| 141 |
+
['▁This', '▁is', '▁a', '▁', 'te', 's', 't']
|
| 142 |
+
['▁This', '▁is', '▁', 'a', '▁', 't', 'e', 'st']
|
| 143 |
+
['▁', 'T', 'h', 'is', '▁', 'is', '▁', 'a', '▁', 'te', 'st']
|
| 144 |
+
['▁', 'This', '▁', 'i', 's', '▁a', '▁', 't', 'e', 'st']
|
| 145 |
+
['▁This', '▁', 'is', '▁a', '▁', 't', 'est']
|
| 146 |
+
|
| 147 |
+
>> sp.nbest_encode('This is a test', nbest_size=5, out_type=str)
|
| 148 |
+
[['▁This', '▁is', '▁a', '▁', 't', 'est'],
|
| 149 |
+
['▁This', '▁is', '▁a', '▁', 'te', 'st'],
|
| 150 |
+
['▁This', '▁is', '▁a', '▁', 'te', 's', 't'],
|
| 151 |
+
['▁This', '▁is', '▁a', '▁', 't', 'e', 'st'],
|
| 152 |
+
['▁This', '▁is', '▁a', '▁', 't', 'es', 't']]
|
| 153 |
+
|
| 154 |
+
>>> sp.sample_encode_and_score('This is a test', num_samples=5, alpha=0.1, out_type=str, wor=True)
|
| 155 |
+
[(['▁This', '▁', 'i', 's', '▁a', '▁', 'te', 's', 't'], -3.043105125427246),
|
| 156 |
+
(['▁This', '▁', 'i', 's', '▁a', '▁', 'te', 'st'], -2.8475849628448486),
|
| 157 |
+
(['▁', 'This', '▁is', '▁', 'a', '▁', 'te', 'st'], -3.043248176574707),
|
| 158 |
+
(['▁', 'This', '▁is', '▁a', '▁', 't', 'e', 'st'], -2.87727689743042),
|
| 159 |
+
(['▁', 'This', '▁', 'i', 's', '▁', 'a', '▁', 't', 'est'], -3.6284031867980957)]
|
| 160 |
+
|
| 161 |
+
>>> sp.decode([284, 47, 11, 4, 15, 400])
|
| 162 |
+
'This is a test'
|
| 163 |
+
|
| 164 |
+
>>> sp.decode([[284, 47, 11, 4, 15, 400], [151, 88, 21, 887]])
|
| 165 |
+
['This is a test', 'Hello world']
|
| 166 |
+
|
| 167 |
+
>>> proto = sp.decode([284, 47, 11, 4, 15, 400], out_type='immutable_proto')
|
| 168 |
+
>>> proto.text
|
| 169 |
+
'This is a test'
|
| 170 |
+
|
| 171 |
+
>>> sp.decode(['▁', 'This', '▁', 'is', '▁a', '▁', 't', 'e', 'st'])
|
| 172 |
+
'This is a test'
|
| 173 |
+
|
| 174 |
+
>>> sp.decode([['▁This', '▁is', '▁a', '▁', 't', 'est'], ['▁He', 'll', 'o', '▁world']])
|
| 175 |
+
['This is a test', 'Hello world']
|
| 176 |
+
|
| 177 |
+
>>> sp.get_piece_size()
|
| 178 |
+
1000
|
| 179 |
+
|
| 180 |
+
>>> sp.id_to_piece(2)
|
| 181 |
+
'</s>'
|
| 182 |
+
|
| 183 |
+
>>> sp.id_to_piece([2, 3, 4])
|
| 184 |
+
['</s>', '\r', '▁']
|
| 185 |
+
|
| 186 |
+
>>> sp.piece_to_id('<s>')
|
| 187 |
+
1
|
| 188 |
+
|
| 189 |
+
>>> sp.piece_to_id(['</s>', '\r', '▁'])
|
| 190 |
+
[2, 3, 4]
|
| 191 |
+
|
| 192 |
+
>>> len(sp)
|
| 193 |
+
1000
|
| 194 |
+
|
| 195 |
+
>>> sp['</s>']
|
| 196 |
+
2
|
| 197 |
+
```
|
| 198 |
+
|
| 199 |
+
### Model Training
|
| 200 |
+
|
| 201 |
+
Training is performed by passing parameters of [spm_train](https://github.com/google/sentencepiece#train-sentencepiece-model) to SentencePieceTrainer.train() function.
|
| 202 |
+
|
| 203 |
+
```
|
| 204 |
+
>>> import sentencepiece as spm
|
| 205 |
+
>>> spm.SentencePieceTrainer.train(input='test/botchan.txt', model_prefix='m', vocab_size=1000, user_defined_symbols=['foo', 'bar'])
|
| 206 |
+
sentencepiece_trainer.cc(73) LOG(INFO) Starts training with :
|
| 207 |
+
trainer_spec {
|
| 208 |
+
input: test/botchan.txt
|
| 209 |
+
.. snip
|
| 210 |
+
unigram_model_trainer.cc(500) LOG(INFO) EM sub_iter=1 size=1188 obj=10.2839 num_tokens=32182 num_tokens/piece=27.0892
|
| 211 |
+
unigram_model_trainer.cc(500) LOG(INFO) EM sub_iter=0 size=1100 obj=10.4269 num_tokens=33001 num_tokens/piece=30.0009
|
| 212 |
+
unigram_model_trainer.cc(500) LOG(INFO) EM sub_iter=1 size=1100 obj=10.4069 num_tokens=33002 num_tokens/piece=30.0018
|
| 213 |
+
trainer_interface.cc(595) LOG(INFO) Saving model: m.model
|
| 214 |
+
trainer_interface.cc(619) LOG(INFO) Saving vocabs: m.vocab
|
| 215 |
+
>>>
|
| 216 |
+
```
|
| 217 |
+
|
| 218 |
+
### Training without local filesystem
|
| 219 |
+
|
| 220 |
+
Sentencepiece trainer can receive any iterable object to feed training sentences. You can also pass a file object (an instance with a write() method) to emit the output model to any device. These features are useful for running sentencepiece in environments that have limited access to the local file system (e.g., Google Colab).
|
| 221 |
+
|
| 222 |
+
```
|
| 223 |
+
import urllib.request
|
| 224 |
+
import io
|
| 225 |
+
import sentencepiece as spm
|
| 226 |
+
|
| 227 |
+
# Loads model from URL as iterator and stores the model to BytesIO.
|
| 228 |
+
model = io.BytesIO()
|
| 229 |
+
with urllib.request.urlopen(
|
| 230 |
+
'https://raw.githubusercontent.com/google/sentencepiece/master/data/botchan.txt'
|
| 231 |
+
) as response:
|
| 232 |
+
spm.SentencePieceTrainer.train(
|
| 233 |
+
sentence_iterator=response, model_writer=model, vocab_size=1000)
|
| 234 |
+
|
| 235 |
+
# Serialize the model as file.
|
| 236 |
+
# with open('out.model', 'wb') as f:
|
| 237 |
+
# f.write(model.getvalue())
|
| 238 |
+
|
| 239 |
+
# Directly load the model from serialized model.
|
| 240 |
+
sp = spm.SentencePieceProcessor(model_proto=model.getvalue())
|
| 241 |
+
print(sp.encode('this is test'))
|
| 242 |
+
```
|
| 243 |
+
|
| 244 |
+
### Free Threading support
|
| 245 |
+
Experimental support for no-GIL/Free-Threading has been introduced since v0.2.1. For more details, please refer to [this page](https://py-free-threading.github.io/).
|
| 246 |
+
This operates similarly to how [NumPy](https://numpy.org/devdocs/reference/thread_safety.html#free-threaded-python) handles it.
|
| 247 |
+
|
| 248 |
+
The C++ library's const and static methods, e.g., encode(), decode() and train(), are designed to work in a non-GIL environment.
|
| 249 |
+
However, non-const methods, e.g., load(), may have potential data race issues, so please ensure you implement appropriate locks beforehand.
|
| 250 |
+
|
| 251 |
+
While this limitation might be removed in the future, please note that it's not a simple fix, as it would require additional shared locks in C++.
|
source/sentencepiece-0.2.1.dist-info/RECORD
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
sentencepiece-0.2.1.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
|
| 2 |
+
sentencepiece-0.2.1.dist-info/METADATA,sha256=zgounFmc0GCnvD3_sYvlz8PV8pNTPj7k6HDq7wpVAP0,10003
|
| 3 |
+
sentencepiece-0.2.1.dist-info/RECORD,,
|
| 4 |
+
sentencepiece-0.2.1.dist-info/WHEEL,sha256=1crAxrAH5rUbvWUY1UR0ly3o7KnT1jo0_98V8RY5-FM,152
|
| 5 |
+
sentencepiece-0.2.1.dist-info/top_level.txt,sha256=NIXVKmsq-xdZ5KROMHHV0gkLA3fvbNTb1g1KtgiYFOk,130
|
| 6 |
+
sentencepiece/__init__.py,sha256=NQT0kP3jkjEmRRIyK_jsKCug6L9zQi2vIPTWQin8E2w,49422
|
| 7 |
+
sentencepiece/__pycache__/__init__.cpython-312.pyc,,
|
| 8 |
+
sentencepiece/__pycache__/_version.cpython-312.pyc,,
|
| 9 |
+
sentencepiece/__pycache__/sentencepiece_model_pb2.cpython-312.pyc,,
|
| 10 |
+
sentencepiece/__pycache__/sentencepiece_pb2.cpython-312.pyc,,
|
| 11 |
+
sentencepiece/_sentencepiece.cpython-312-x86_64-linux-gnu.so,sha256=z2sCOJaqHJkCZEXvwrXL07ipfGg53Q_fsrynV_vtlw0,2005912
|
| 12 |
+
sentencepiece/_version.py,sha256=PmcQ2PI2oP8irnLtJLJby2YfW6sBvLAmL-VpABzTqwc,22
|
| 13 |
+
sentencepiece/package_data/nfkc.bin,sha256=UvEQKP-KffPgCdlKi2pU1KihcTLvtMzByaCkHkMr2R4,240008
|
| 14 |
+
sentencepiece/package_data/nfkc_cf.bin,sha256=YIM-wRIBRGZZw1ScGDsY8CTEAHYozGs6TpGuAHaXuCY,247028
|
| 15 |
+
sentencepiece/package_data/nmt_nfkc.bin,sha256=eTcsQTicK5spvBcQF6tUAONS3r1oawJnCkK-xwkBUHQ,240007
|
| 16 |
+
sentencepiece/package_data/nmt_nfkc_cf.bin,sha256=IsKSx29QN5XzDIXXnTCn9XL_9PSeADktLWD0-T6UGh4,247027
|
| 17 |
+
sentencepiece/sentencepiece.i,sha256=Hfv8AHFOJEfDfElVYIhoz29W7rV1VJ0Z13aP7S7ck6M,72647
|
| 18 |
+
sentencepiece/sentencepiece_model_pb2.py,sha256=LawEwmdUiIU1T9HcYu-rNEVTFcwAh9i-qavMMsg9riE,6257
|
| 19 |
+
sentencepiece/sentencepiece_pb2.py,sha256=_ZgnXOkpoScMXbJ-8BMKn2Q97BbMOH9Hz-L7JFMcJro,1753
|
| 20 |
+
sentencepiece/sentencepiece_wrap.cxx,sha256=XlbUFs7s48i3i_nhka9U_b41Xpv_eeSD9U_uxO742Y0,381494
|
source/sentencepiece-0.2.1.dist-info/WHEEL
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Wheel-Version: 1.0
|
| 2 |
+
Generator: setuptools (80.9.0)
|
| 3 |
+
Root-Is-Purelib: false
|
| 4 |
+
Tag: cp312-cp312-manylinux_2_27_x86_64
|
| 5 |
+
Tag: cp312-cp312-manylinux_2_28_x86_64
|
| 6 |
+
|
source/sentencepiece-0.2.1.dist-info/top_level.txt
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
sentencepiece
|
| 2 |
+
sentencepiece/__init__
|
| 3 |
+
sentencepiece/_version
|
| 4 |
+
sentencepiece/sentencepiece_model_pb2
|
| 5 |
+
sentencepiece/sentencepiece_pb2
|
source/sentencepiece/__init__.py
ADDED
|
@@ -0,0 +1,1230 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# This file was automatically generated by SWIG (https://www.swig.org).
|
| 2 |
+
# Version 4.3.0
|
| 3 |
+
#
|
| 4 |
+
# Do not make changes to this file unless you know what you are doing - modify
|
| 5 |
+
# the SWIG interface file instead.
|
| 6 |
+
|
| 7 |
+
from sys import version_info as _swig_python_version_info
|
| 8 |
+
# Import the low-level C/C++ module
|
| 9 |
+
if __package__ or "." in __name__:
|
| 10 |
+
from . import _sentencepiece
|
| 11 |
+
else:
|
| 12 |
+
import _sentencepiece
|
| 13 |
+
|
| 14 |
+
try:
|
| 15 |
+
import builtins as __builtin__
|
| 16 |
+
except ImportError:
|
| 17 |
+
import __builtin__
|
| 18 |
+
|
| 19 |
+
def _swig_repr(self):
|
| 20 |
+
try:
|
| 21 |
+
strthis = "proxy of " + self.this.__repr__()
|
| 22 |
+
except __builtin__.Exception:
|
| 23 |
+
strthis = ""
|
| 24 |
+
return "<%s.%s; %s >" % (self.__class__.__module__, self.__class__.__name__, strthis,)
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def _swig_setattr_nondynamic_instance_variable(set):
|
| 28 |
+
def set_instance_attr(self, name, value):
|
| 29 |
+
if name == "this":
|
| 30 |
+
set(self, name, value)
|
| 31 |
+
elif name == "thisown":
|
| 32 |
+
self.this.own(value)
|
| 33 |
+
elif hasattr(self, name) and isinstance(getattr(type(self), name), property):
|
| 34 |
+
set(self, name, value)
|
| 35 |
+
else:
|
| 36 |
+
raise AttributeError("You cannot add instance attributes to %s" % self)
|
| 37 |
+
return set_instance_attr
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def _swig_setattr_nondynamic_class_variable(set):
|
| 41 |
+
def set_class_attr(cls, name, value):
|
| 42 |
+
if hasattr(cls, name) and not isinstance(getattr(cls, name), property):
|
| 43 |
+
set(cls, name, value)
|
| 44 |
+
else:
|
| 45 |
+
raise AttributeError("You cannot add class attributes to %s" % cls)
|
| 46 |
+
return set_class_attr
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
def _swig_add_metaclass(metaclass):
|
| 50 |
+
"""Class decorator for adding a metaclass to a SWIG wrapped class - a slimmed down version of six.add_metaclass"""
|
| 51 |
+
def wrapper(cls):
|
| 52 |
+
return metaclass(cls.__name__, cls.__bases__, cls.__dict__.copy())
|
| 53 |
+
return wrapper
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
class _SwigNonDynamicMeta(type):
    """Meta class to enforce nondynamic attributes (no new attributes) for a class"""
    # Route class-level attribute assignment through the guarded setter so
    # that only existing, non-property class attributes can be rebound.
    __setattr__ = _swig_setattr_nondynamic_class_variable(type.__setattr__)
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
class ImmutableSentencePieceText_ImmutableSentencePiece(object):
    """Read-only proxy for one piece (token) inside an immutable
    SentencePieceText proto: the piece string, its vocabulary id, the
    matched surface string and its begin/end values.

    All state lives in the wrapped C++ object; the Python methods only
    forward to the `_sentencepiece` extension.
    """

    # Standard SWIG ownership flag for the underlying C++ object.
    thisown = property(lambda x: x.this.own(), lambda x, v: x.this.own(v), doc="The membership flag")
    __repr__ = _swig_repr  # NOTE: replaced by `__repr__ = __str__` below.

    def __init__(self):
        _sentencepiece.ImmutableSentencePieceText_ImmutableSentencePiece_swiginit(self, _sentencepiece.new_ImmutableSentencePieceText_ImmutableSentencePiece())
    __swig_destroy__ = _sentencepiece.delete_ImmutableSentencePieceText_ImmutableSentencePiece

    # Thin wrappers over the C extension accessors; exposed as read-only
    # properties further below.
    def _piece(self):
        return _sentencepiece.ImmutableSentencePieceText_ImmutableSentencePiece__piece(self)

    def _surface(self):
        return _sentencepiece.ImmutableSentencePieceText_ImmutableSentencePiece__surface(self)

    def _id(self):
        return _sentencepiece.ImmutableSentencePieceText_ImmutableSentencePiece__id(self)

    # begin/end: presumably offsets of `surface` in the input text — the
    # semantics are defined by the C++ proto and cannot be confirmed here.
    def _begin(self):
        return _sentencepiece.ImmutableSentencePieceText_ImmutableSentencePiece__begin(self)

    def _end(self):
        return _sentencepiece.ImmutableSentencePieceText_ImmutableSentencePiece__end(self)

    def _surface_as_bytes(self):
        return _sentencepiece.ImmutableSentencePieceText_ImmutableSentencePiece__surface_as_bytes(self)

    def _piece_as_bytes(self):
        return _sentencepiece.ImmutableSentencePieceText_ImmutableSentencePiece__piece_as_bytes(self)

    # Public read-only views of the accessors above.
    piece = property(_piece)
    piece_as_bytes = property(_piece_as_bytes)
    surface = property(_surface)
    surface_as_bytes = property(_surface_as_bytes)
    id = property(_id)
    begin = property(_begin)
    end = property(_end)

    def __str__(self):
        # Text-proto-like rendering of the five fields.
        return ('piece: \"{}\"\n'
                'id: {}\n'
                'surface: \"{}\"\n'
                'begin: {}\n'
                'end: {}\n').format(self.piece, self.id, self.surface,
                                    self.begin, self.end)

    def __eq__(self, other):
        # Field-wise equality; the hash below derives from the string form,
        # so equal pieces hash equal.
        return self.piece == other.piece and self.id == other.id and self.surface == other.surface and self.begin == other.begin and self.end == other.end

    def __hash__(self):
        return hash(str(self))

    __repr__ = __str__


# Register ImmutableSentencePieceText_ImmutableSentencePiece in _sentencepiece:
_sentencepiece.ImmutableSentencePieceText_ImmutableSentencePiece_swigregister(ImmutableSentencePieceText_ImmutableSentencePiece)
|
| 117 |
+
class ImmutableSentencePieceText(object):
    """Immutable result of an encode/decode call: the text, its score and
    the sequence of pieces (exposed lazily through the `pieces` property).
    """

    # Standard SWIG ownership flag for the underlying C++ object.
    thisown = property(lambda x: x.this.own(), lambda x, v: x.this.own(v), doc="The membership flag")
    __repr__ = _swig_repr  # NOTE: replaced by `__repr__ = __str__` below.

    def __init__(self):
        _sentencepiece.ImmutableSentencePieceText_swiginit(self, _sentencepiece.new_ImmutableSentencePieceText())
    __swig_destroy__ = _sentencepiece.delete_ImmutableSentencePieceText

    # Thin wrappers over the C extension accessors.
    def _pieces_size(self):
        return _sentencepiece.ImmutableSentencePieceText__pieces_size(self)

    def _pieces(self, index):
        return _sentencepiece.ImmutableSentencePieceText__pieces(self, index)

    def _text(self):
        return _sentencepiece.ImmutableSentencePieceText__text(self)

    def _score(self):
        return _sentencepiece.ImmutableSentencePieceText__score(self)

    def SerializeAsString(self):
        return _sentencepiece.ImmutableSentencePieceText_SerializeAsString(self)

    def _text_as_bytes(self):
        return _sentencepiece.ImmutableSentencePieceText__text_as_bytes(self)

    text = property(_text)
    text_as_bytes = property(_text_as_bytes)
    score = property(_score)

    class ImmutableSentencePieceIterator:
        """Sequence-like lazy view over the pieces of a proto.

        Supports len(), integer indexing (including negatives) and
        slicing. Note that slicing materializes the full piece list
        before taking the slice.
        """

        def __init__(self, proto):
            self.proto = proto
            # Length is fixed at creation: the proto is immutable.
            self.len = self.proto._pieces_size()

        def __len__(self):
            return self.len

        def __getitem__(self, index):
            if isinstance(index, slice):
                return [self.proto._pieces(i) for i in range(self.len)][index.start:index.stop:index.step]
            if index < 0:
                index = index + self.len
            if index < 0 or index >= self.len:
                raise IndexError('piece index is out of range')
            return self.proto._pieces(index)

        def __str__(self):
            return '\n'.join(['pieces {{\n{}}}'.format(str(x)) for x in self])

        __repr__ = __str__

    @property
    def pieces(self):
        # Fresh iterator view over this proto's pieces.
        return ImmutableSentencePieceText.ImmutableSentencePieceIterator(self)

    def __eq__(self, other):
        # Equality and hash are defined via the serialized proto bytes.
        return self.SerializeAsString() == other.SerializeAsString()

    def __hash__(self):
        return hash(self.SerializeAsString())

    def __str__(self):
        return ('text: \"{}\"\n'
                'score: {}\n'
                '{}').format(self.text, self.score,
                             '\n'.join(['pieces {{\n{}}}'.format(str(x)) for x in self.pieces]))

    __repr__ = __str__


# Register ImmutableSentencePieceText in _sentencepiece:
_sentencepiece.ImmutableSentencePieceText_swigregister(ImmutableSentencePieceText)
|
| 190 |
+
class ImmutableNBestSentencePieceText(object):
    """Immutable n-best encoding result: a sequence of
    ImmutableSentencePieceText candidates (exposed lazily through the
    `nbests` property).
    """

    # Standard SWIG ownership flag for the underlying C++ object.
    thisown = property(lambda x: x.this.own(), lambda x, v: x.this.own(v), doc="The membership flag")
    __repr__ = _swig_repr  # NOTE: replaced by `__repr__ = __str__` below.

    def __init__(self):
        _sentencepiece.ImmutableNBestSentencePieceText_swiginit(self, _sentencepiece.new_ImmutableNBestSentencePieceText())
    __swig_destroy__ = _sentencepiece.delete_ImmutableNBestSentencePieceText

    # Thin wrappers over the C extension accessors.
    def _nbests_size(self):
        return _sentencepiece.ImmutableNBestSentencePieceText__nbests_size(self)

    def _nbests(self, index):
        return _sentencepiece.ImmutableNBestSentencePieceText__nbests(self, index)

    def SerializeAsString(self):
        return _sentencepiece.ImmutableNBestSentencePieceText_SerializeAsString(self)

    class ImmutableSentencePieceTextIterator:
        """Sequence-like lazy view over the n-best candidates.

        Supports len(), integer indexing (including negatives) and
        slicing. Note that slicing materializes the full list before
        taking the slice.
        """

        def __init__(self, proto):
            self.proto = proto
            # Length is fixed at creation: the proto is immutable.
            self.len = self.proto._nbests_size()

        def __len__(self):
            return self.len

        def __getitem__(self, index):
            if isinstance(index, slice):
                return [self.proto._nbests(i) for i in range(self.len)][index.start:index.stop:index.step]
            if index < 0:
                index = index + self.len
            if index < 0 or index >= self.len:
                raise IndexError('nbests index is out of range')
            return self.proto._nbests(index)

        def __str__(self):
            return '\n'.join(['nbests {{\n{}}}'.format(str(x)) for x in self])

        __repr__ = __str__

    @property
    def nbests(self):
        # Fresh iterator view over this proto's candidates.
        return ImmutableNBestSentencePieceText.ImmutableSentencePieceTextIterator(self)

    def __eq__(self, other):
        # Equality and hash are defined via the serialized proto bytes.
        return self.SerializeAsString() == other.SerializeAsString()

    def __hash__(self):
        return hash(self.SerializeAsString())

    def __str__(self):
        return '\n'.join(['nbests {{\n{}}}'.format(str(x)) for x in self.nbests])

    __repr__ = __str__


# Register ImmutableNBestSentencePieceText in _sentencepiece:
_sentencepiece.ImmutableNBestSentencePieceText_swigregister(ImmutableNBestSentencePieceText)
|
| 247 |
+
class SentencePieceProcessor(object):
    """SWIG proxy for the C++ sentencepiece SentencePieceProcessor.

    Construct, then load a model (see Init/LoadFromFile below) before
    encoding or decoding. The class continues past this point.
    """

    # Standard SWIG ownership flag for the underlying C++ object.
    thisown = property(lambda x: x.this.own(), lambda x, v: x.this.own(v), doc="The membership flag")
    __repr__ = _swig_repr

    def __init__(self):
        _sentencepiece.SentencePieceProcessor_swiginit(self, _sentencepiece.new_SentencePieceProcessor())
    __swig_destroy__ = _sentencepiece.delete_SentencePieceProcessor
|
| 254 |
+
|
| 255 |
+
    # --- Model / vocabulary management: thin wrappers over the C extension ---

    def LoadFromSerializedProto(self, serialized):
        return _sentencepiece.SentencePieceProcessor_LoadFromSerializedProto(self, serialized)

    def SetEncodeExtraOptions(self, extra_option):
        return _sentencepiece.SentencePieceProcessor_SetEncodeExtraOptions(self, extra_option)

    def SetDecodeExtraOptions(self, extra_option):
        return _sentencepiece.SentencePieceProcessor_SetDecodeExtraOptions(self, extra_option)

    def SetVocabulary(self, valid_vocab):
        return _sentencepiece.SentencePieceProcessor_SetVocabulary(self, valid_vocab)

    def ResetVocabulary(self):
        return _sentencepiece.SentencePieceProcessor_ResetVocabulary(self)

    def LoadVocabulary(self, filename, threshold):
        return _sentencepiece.SentencePieceProcessor_LoadVocabulary(self, filename, threshold)

    def CalculateEntropy(self, *args):
        # Overloaded on the C++ side; SWIG dispatches on the arguments.
        return _sentencepiece.SentencePieceProcessor_CalculateEntropy(self, *args)

    # --- Piece/id introspection ---

    def GetPieceSize(self):
        return _sentencepiece.SentencePieceProcessor_GetPieceSize(self)

    def PieceToId(self, piece):
        return _sentencepiece.SentencePieceProcessor_PieceToId(self, piece)

    def IdToPiece(self, id):
        return _sentencepiece.SentencePieceProcessor_IdToPiece(self, id)

    def GetScore(self, id):
        return _sentencepiece.SentencePieceProcessor_GetScore(self, id)

    def IsUnknown(self, id):
        return _sentencepiece.SentencePieceProcessor_IsUnknown(self, id)

    def IsControl(self, id):
        return _sentencepiece.SentencePieceProcessor_IsControl(self, id)

    def IsUnused(self, id):
        return _sentencepiece.SentencePieceProcessor_IsUnused(self, id)

    def IsByte(self, id):
        return _sentencepiece.SentencePieceProcessor_IsByte(self, id)

    # --- Special token ids ---

    def unk_id(self):
        return _sentencepiece.SentencePieceProcessor_unk_id(self)

    def bos_id(self):
        return _sentencepiece.SentencePieceProcessor_bos_id(self)

    def eos_id(self):
        return _sentencepiece.SentencePieceProcessor_eos_id(self)

    def pad_id(self):
        return _sentencepiece.SentencePieceProcessor_pad_id(self)

    def serialized_model_proto(self):
        return _sentencepiece.SentencePieceProcessor_serialized_model_proto(self)
|
| 314 |
+
|
| 315 |
+
    def LoadFromFile(self, arg):
        return _sentencepiece.SentencePieceProcessor_LoadFromFile(self, arg)

    # --- Internal encode entry points (one text each) ---
    # These are wrapped by the public Encode()/EncodeAs*() helpers defined
    # later in the class, which fill in the defaults.

    def _EncodeAsIds(self, text, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece):
        return _sentencepiece.SentencePieceProcessor__EncodeAsIds(self, text, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece)

    def _EncodeAsPieces(self, text, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece):
        return _sentencepiece.SentencePieceProcessor__EncodeAsPieces(self, text, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece)

    def _EncodeAsSerializedProto(self, text, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece):
        return _sentencepiece.SentencePieceProcessor__EncodeAsSerializedProto(self, text, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece)

    def _EncodeAsImmutableProto(self, text, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece):
        return _sentencepiece.SentencePieceProcessor__EncodeAsImmutableProto(self, text, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece)

    # --- Internal encode entry points (batch; `ins` is a list of texts) ---

    def _EncodeAsIdsBatch(self, ins, num_threads, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece):
        return _sentencepiece.SentencePieceProcessor__EncodeAsIdsBatch(self, ins, num_threads, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece)

    def _EncodeAsPiecesBatch(self, ins, num_threads, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece):
        return _sentencepiece.SentencePieceProcessor__EncodeAsPiecesBatch(self, ins, num_threads, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece)

    def _EncodeAsSerializedProtoBatch(self, ins, num_threads, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece):
        return _sentencepiece.SentencePieceProcessor__EncodeAsSerializedProtoBatch(self, ins, num_threads, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece)

    def _EncodeAsImmutableProtoBatch(self, ins, num_threads, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece):
        return _sentencepiece.SentencePieceProcessor__EncodeAsImmutableProtoBatch(self, ins, num_threads, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece)

    # --- Internal decode entry points (single input) ---

    def _DecodeIds(self, ids):
        return _sentencepiece.SentencePieceProcessor__DecodeIds(self, ids)

    def _DecodeIdsAsBytes(self, ids):
        return _sentencepiece.SentencePieceProcessor__DecodeIdsAsBytes(self, ids)

    def _DecodePieces(self, pieces):
        return _sentencepiece.SentencePieceProcessor__DecodePieces(self, pieces)

    def _DecodeIdsAsSerializedProto(self, ids):
        return _sentencepiece.SentencePieceProcessor__DecodeIdsAsSerializedProto(self, ids)

    def _DecodePiecesAsSerializedProto(self, pieces):
        return _sentencepiece.SentencePieceProcessor__DecodePiecesAsSerializedProto(self, pieces)

    def _DecodeIdsAsImmutableProto(self, ids):
        return _sentencepiece.SentencePieceProcessor__DecodeIdsAsImmutableProto(self, ids)

    def _DecodePiecesAsImmutableProto(self, pieces):
        return _sentencepiece.SentencePieceProcessor__DecodePiecesAsImmutableProto(self, pieces)

    # --- Internal decode entry points (batch) ---

    def _DecodeIdsBatch(self, ins, num_threads):
        return _sentencepiece.SentencePieceProcessor__DecodeIdsBatch(self, ins, num_threads)

    def _DecodeIdsAsBytesBatch(self, ins, num_threads):
        return _sentencepiece.SentencePieceProcessor__DecodeIdsAsBytesBatch(self, ins, num_threads)

    def _DecodeIdsAsSerializedProtoBatch(self, ins, num_threads):
        return _sentencepiece.SentencePieceProcessor__DecodeIdsAsSerializedProtoBatch(self, ins, num_threads)

    def _DecodeIdsAsImmutableProtoBatch(self, ins, num_threads):
        return _sentencepiece.SentencePieceProcessor__DecodeIdsAsImmutableProtoBatch(self, ins, num_threads)

    def _DecodePiecesBatch(self, ins, num_threads):
        return _sentencepiece.SentencePieceProcessor__DecodePiecesBatch(self, ins, num_threads)

    def _DecodePiecesAsSerializedProtoBatch(self, ins, num_threads):
        return _sentencepiece.SentencePieceProcessor__DecodePiecesAsSerializedProtoBatch(self, ins, num_threads)

    def _DecodePiecesAsImmutableProtoBatch(self, ins, num_threads):
        return _sentencepiece.SentencePieceProcessor__DecodePiecesAsImmutableProtoBatch(self, ins, num_threads)

    # --- Internal n-best encode entry points (wrapped by NBestEncode) ---

    def _NBestEncodeAsIds(self, text, nbest_size, add_bos, add_eos, reverse, emit_unk_piece):
        return _sentencepiece.SentencePieceProcessor__NBestEncodeAsIds(self, text, nbest_size, add_bos, add_eos, reverse, emit_unk_piece)

    def _NBestEncodeAsPieces(self, text, nbest_size, add_bos, add_eos, reverse, emit_unk_piece):
        return _sentencepiece.SentencePieceProcessor__NBestEncodeAsPieces(self, text, nbest_size, add_bos, add_eos, reverse, emit_unk_piece)

    def _NBestEncodeAsSerializedProto(self, text, nbest_size, add_bos, add_eos, reverse, emit_unk_piece):
        return _sentencepiece.SentencePieceProcessor__NBestEncodeAsSerializedProto(self, text, nbest_size, add_bos, add_eos, reverse, emit_unk_piece)

    def _NBestEncodeAsImmutableProto(self, text, nbest_size, add_bos, add_eos, reverse, emit_unk_piece):
        return _sentencepiece.SentencePieceProcessor__NBestEncodeAsImmutableProto(self, text, nbest_size, add_bos, add_eos, reverse, emit_unk_piece)

    # --- Internal sample-encode-and-score entry points ---

    def _SampleEncodeAndScoreAsIds(self, text, num_samples, alpha, wor, include_best, add_bos, add_eos, reverse, emit_unk_piece):
        return _sentencepiece.SentencePieceProcessor__SampleEncodeAndScoreAsIds(self, text, num_samples, alpha, wor, include_best, add_bos, add_eos, reverse, emit_unk_piece)

    def _SampleEncodeAndScoreAsPieces(self, text, num_samples, alpha, wor, include_best, add_bos, add_eos, reverse, emit_unk_piece):
        return _sentencepiece.SentencePieceProcessor__SampleEncodeAndScoreAsPieces(self, text, num_samples, alpha, wor, include_best, add_bos, add_eos, reverse, emit_unk_piece)

    def _SampleEncodeAndScoreAsSerializedProto(self, text, num_samples, alpha, wor, include_best, add_bos, add_eos, reverse, emit_unk_piece):
        return _sentencepiece.SentencePieceProcessor__SampleEncodeAndScoreAsSerializedProto(self, text, num_samples, alpha, wor, include_best, add_bos, add_eos, reverse, emit_unk_piece)

    def _SampleEncodeAndScoreAsImmutableProto(self, text, num_samples, alpha, wor, include_best, add_bos, add_eos, reverse, emit_unk_piece):
        return _sentencepiece.SentencePieceProcessor__SampleEncodeAndScoreAsImmutableProto(self, text, num_samples, alpha, wor, include_best, add_bos, add_eos, reverse, emit_unk_piece)

    # --- Normalization / entropy ---

    def _Normalize(self, text):
        return _sentencepiece.SentencePieceProcessor__Normalize(self, text)

    def _NormalizeWithOffsets(self, text):
        return _sentencepiece.SentencePieceProcessor__NormalizeWithOffsets(self, text)

    def _CalculateEntropy(self, text, alpha):
        return _sentencepiece.SentencePieceProcessor__CalculateEntropy(self, text, alpha)

    def _CalculateEntropyBatch(self, ins, alpha, num_threads):
        return _sentencepiece.SentencePieceProcessor__CalculateEntropyBatch(self, ins, alpha, num_threads)

    def _OverrideNormalizerSpec(self, args):
        return _sentencepiece.SentencePieceProcessor__OverrideNormalizerSpec(self, args)
|
| 422 |
+
|
| 423 |
+
    def Init(self,
             model_file=None,
             model_proto=None,
             out_type=int,
             add_bos=False,
             add_eos=False,
             reverse=False,
             emit_unk_piece=False,
             enable_sampling=False,
             nbest_size=-1,
             alpha=0.1,
             num_threads=-1):
        """Initialize the SentencePieceProcessor and optionally load a model.

        Args:
          model_file: The sentencepiece model file path.
          model_proto: The sentencepiece model serialized proto.
          out_type: Output type. int or str.
          add_bos: Add <s> to the result (Default = false).
          add_eos: Add </s> to the result (Default = false). <s>/</s> is
            added after reversing (if enabled).
          reverse: Reverses the tokenized sequence (Default = false).
          emit_unk_piece: Emits the unk literal string (Default = false).
          enable_sampling: Enable subword sampling (Default = false).
          nbest_size: Sampling parameter for unigram. Invalid in BPE-Dropout.
            nbest_size = {0,1}: No sampling is performed.
            nbest_size > 1: samples from the nbest_size results.
            nbest_size < 0: assuming that nbest_size is infinite and samples
              from all hypotheses (lattice) using the
              forward-filtering-and-backward-sampling algorithm.
          alpha: Smoothing parameter for unigram sampling, and dropout
            probability of merge operations for BPE-dropout.
          num_threads: Number of threads in batch processing
            (Default = -1, auto-detected).
        """

        # _sentencepiece_processor_init_native is defined elsewhere in this
        # module; presumably the captured native SWIG __init__ — TODO confirm.
        _sentencepiece_processor_init_native(self)
        # Per-instance defaults consumed later by Encode()/NBestEncode()/etc.
        # whenever the caller leaves the corresponding argument as None.
        self._out_type = out_type
        self._add_bos = add_bos
        self._add_eos = add_eos
        self._reverse = reverse
        self._emit_unk_piece = emit_unk_piece
        self._enable_sampling = enable_sampling
        self._nbest_size = nbest_size
        self._alpha = alpha
        self._num_threads = num_threads
        # Load immediately when a model source is given (Load is defined
        # elsewhere in this module).
        if model_file or model_proto:
            self.Load(model_file=model_file, model_proto=model_proto)
|
| 469 |
+
|
| 470 |
+
|
| 471 |
+
def Encode(self,
|
| 472 |
+
input,
|
| 473 |
+
out_type=None,
|
| 474 |
+
add_bos=None,
|
| 475 |
+
add_eos=None,
|
| 476 |
+
reverse=None,
|
| 477 |
+
emit_unk_piece=None,
|
| 478 |
+
enable_sampling=None,
|
| 479 |
+
nbest_size=None,
|
| 480 |
+
alpha=None,
|
| 481 |
+
num_threads=None):
|
| 482 |
+
"""Encode text input to segmented ids or tokens.
|
| 483 |
+
|
| 484 |
+
Args:
|
| 485 |
+
input: input string. accepsts list of string.
|
| 486 |
+
out_type: output type. int or str.
|
| 487 |
+
add_bos: Add <s> to the result (Default = false)
|
| 488 |
+
add_eos: Add </s> to the result (Default = false) <s>/</s> is added after
|
| 489 |
+
reversing (if enabled).
|
| 490 |
+
reverse: Reverses the tokenized sequence (Default = false)
|
| 491 |
+
emit_unk_piece: Emits the unk literal string (Default = false)
|
| 492 |
+
nbest_size: sampling parameters for unigram. Invalid in BPE-Dropout.
|
| 493 |
+
nbest_size = {0,1}: No sampling is performed.
|
| 494 |
+
nbest_size > 1: samples from the nbest_size results.
|
| 495 |
+
nbest_size < 0: assuming that nbest_size is infinite and samples
|
| 496 |
+
from the all hypothesis (lattice) using
|
| 497 |
+
forward-filtering-and-backward-sampling algorithm.
|
| 498 |
+
alpha: Soothing parameter for unigram sampling, and merge probability for
|
| 499 |
+
BPE-dropout (probablity 'p' in BPE-dropout paper).
|
| 500 |
+
num_threads: the number of threads used in the batch processing (Default = -1).
|
| 501 |
+
"""
|
| 502 |
+
|
| 503 |
+
if out_type is None:
|
| 504 |
+
out_type = self._out_type
|
| 505 |
+
if add_bos is None:
|
| 506 |
+
add_bos = self._add_bos
|
| 507 |
+
if add_eos is None:
|
| 508 |
+
add_eos = self._add_eos
|
| 509 |
+
if reverse is None:
|
| 510 |
+
reverse = self._reverse
|
| 511 |
+
if emit_unk_piece is None:
|
| 512 |
+
emit_unk_piece = self._emit_unk_piece
|
| 513 |
+
if enable_sampling is None:
|
| 514 |
+
enable_sampling = self._enable_sampling
|
| 515 |
+
if nbest_size is None:
|
| 516 |
+
nbest_size = self._nbest_size
|
| 517 |
+
if alpha is None:
|
| 518 |
+
alpha = self._alpha
|
| 519 |
+
if num_threads is None:
|
| 520 |
+
num_threads = self._num_threads
|
| 521 |
+
|
| 522 |
+
if enable_sampling == True and (nbest_size is None or nbest_size == 0 or
|
| 523 |
+
nbest_size == 1 or alpha is None):
|
| 524 |
+
raise RuntimeError(
|
| 525 |
+
'When enable_sampling is True, We must specify "nbest_size > 1" or "nbest_size = -1", '
|
| 526 |
+
'and "alpha". "nbest_size" is enabled only on unigram mode ignored in BPE-dropout. '
|
| 527 |
+
'when "nbest_size = -1" , this method samples from all candidates on the lattice '
|
| 528 |
+
'instead of nbest segmentations.'
|
| 529 |
+
)
|
| 530 |
+
|
| 531 |
+
if num_threads is None or type(num_threads) is not int:
|
| 532 |
+
raise RuntimeError('num_threads must be int')
|
| 533 |
+
|
| 534 |
+
if type(input) is list:
|
| 535 |
+
if out_type is int:
|
| 536 |
+
return self._EncodeAsIdsBatch(input, num_threads, enable_sampling, nbest_size,
|
| 537 |
+
alpha, add_bos, add_eos, reverse, emit_unk_piece)
|
| 538 |
+
if out_type is str:
|
| 539 |
+
return self._EncodeAsPiecesBatch(input, num_threads, enable_sampling, nbest_size,
|
| 540 |
+
alpha, add_bos, add_eos, reverse, emit_unk_piece)
|
| 541 |
+
if out_type == 'serialized_proto' or out_type == 'proto':
|
| 542 |
+
return self._EncodeAsSerializedProtoBatch(input, num_threads, enable_sampling, nbest_size,
|
| 543 |
+
alpha, add_bos, add_eos, reverse, emit_unk_piece)
|
| 544 |
+
if out_type == 'immutable_proto':
|
| 545 |
+
return self._EncodeAsImmutableProtoBatch(input, num_threads, enable_sampling, nbest_size,
|
| 546 |
+
alpha, add_bos, add_eos, reverse, emit_unk_piece)
|
| 547 |
+
|
| 548 |
+
if out_type is int:
|
| 549 |
+
return self._EncodeAsIds(input, enable_sampling, nbest_size,
|
| 550 |
+
alpha, add_bos, add_eos, reverse, emit_unk_piece)
|
| 551 |
+
if out_type is str:
|
| 552 |
+
return self._EncodeAsPieces(input, enable_sampling, nbest_size,
|
| 553 |
+
alpha, add_bos, add_eos, reverse, emit_unk_piece)
|
| 554 |
+
if out_type == 'serialized_proto' or out_type == 'proto':
|
| 555 |
+
return self._EncodeAsSerializedProto(input, enable_sampling, nbest_size,
|
| 556 |
+
alpha, add_bos, add_eos, reverse, emit_unk_piece)
|
| 557 |
+
if out_type == 'immutable_proto':
|
| 558 |
+
return self._EncodeAsImmutableProto(input, enable_sampling, nbest_size,
|
| 559 |
+
alpha, add_bos, add_eos, reverse, emit_unk_piece)
|
| 560 |
+
|
| 561 |
+
raise RuntimeError('unknown out_type={}'.format(out_type))
|
| 562 |
+
return None
|
| 563 |
+
|
| 564 |
+
|
| 565 |
+
    # Convenience wrappers: pin out_type (and force sampling for the
    # Sample* variants) and forward everything else to Encode().

    def EncodeAsPieces(self, input, **kwargs):
        return self.Encode(input=input, out_type=str, **kwargs)

    def EncodeAsIds(self, input, **kwargs):
        return self.Encode(input=input, out_type=int, **kwargs)

    def EncodeAsSerializedProto(self, input, **kwargs):
        return self.Encode(input=input, out_type='serialized_proto', **kwargs)

    def EncodeAsImmutableProto(self, input, **kwargs):
        return self.Encode(input=input, out_type='immutable_proto', **kwargs)

    def SampleEncodeAsPieces(self, input, nbest_size=None, alpha=None, **kwargs):
        return self.Encode(input=input, nbest_size=nbest_size, alpha=alpha,
                           out_type=str, enable_sampling=True, **kwargs)

    def SampleEncodeAsIds(self, input, nbest_size=None, alpha=None,**kwargs):
        return self.Encode(input=input, nbest_size=nbest_size, alpha=alpha,
                           out_type=int, enable_sampling=True, **kwargs)

    def SampleEncodeAsSerializedProto(self, input, nbest_size=None, alpha=None, **kwargs):
        return self.Encode(input=input, nbest_size=nbest_size, alpha=alpha,
                           out_type='serialized_proto', enable_sampling=True, **kwargs)

    def SampleEncodeAsImmutableProto(self, input, nbest_size=None, alpha=None, **kwargs):
        return self.Encode(input=input, nbest_size=nbest_size, alpha=alpha,
                           out_type='immutable_proto', enable_sampling=True, **kwargs)
|
| 599 |
+
|
| 600 |
+
|
| 601 |
+
    def NBestEncode(self,
                    input,
                    out_type=None,
                    add_bos=None,
                    add_eos=None,
                    reverse=None,
                    emit_unk_piece=None,
                    nbest_size=None):
        """NBestEncode text input to segmented ids or tokens.

        Args:
          input: Input string. Also accepts a list of strings (batch mode).
          out_type: Output type. int or str.
          add_bos: Add <s> to the result (Default = false).
          add_eos: Add </s> to the result (Default = false). <s>/</s> is added after reversing (if enabled).
          reverse: Reverses the tokenized sequence (Default = false).
          emit_unk_piece: Emits the unk literal string (Default = false).
          nbest_size: Number of n-best candidates to return.
        """

        # Fall back to the per-instance defaults captured in Init() for any
        # argument the caller left as None.
        if out_type is None:
            out_type = self._out_type
        if add_bos is None:
            add_bos = self._add_bos
        if add_eos is None:
            add_eos = self._add_eos
        if reverse is None:
            reverse = self._reverse
        if emit_unk_piece is None:
            emit_unk_piece = self._emit_unk_piece
        if nbest_size is None:
            nbest_size = self._nbest_size

        # A non-positive nbest_size (e.g. the Init default of -1) is coerced
        # to a single candidate.
        if nbest_size <= 0:
            nbest_size=1

        def _encode(text):
            # Dispatch a single text to the n-best entry point matching
            # the requested output type.
            if out_type is int:
                return self._NBestEncodeAsIds(text, nbest_size,
                                              add_bos, add_eos, reverse, emit_unk_piece)
            if out_type is str:
                return self._NBestEncodeAsPieces(text, nbest_size,
                                                 add_bos, add_eos, reverse, emit_unk_piece)
            if out_type == 'serialized_proto' or out_type == 'proto':
                return self._NBestEncodeAsSerializedProto(text, nbest_size,
                                                          add_bos, add_eos, reverse, emit_unk_piece)
            if out_type == 'immutable_proto':
                return self._NBestEncodeAsImmutableProto(text, nbest_size,
                                                         add_bos, add_eos, reverse, emit_unk_piece)

            raise RuntimeError('unknown out_type')

        # Batch mode: encode each text independently (no multi-threaded
        # batch entry point for n-best).
        if type(input) is list:
            return [_encode(n) for n in input]

        return _encode(input)
|
| 657 |
+
|
| 658 |
+
|
| 659 |
+
    # Convenience wrappers: pin out_type and forward to NBestEncode().

    def NBestEncodeAsPieces(self, input, nbest_size=None, **kwargs):
        return self.NBestEncode(input=input, nbest_size=nbest_size,
                                out_type=str, **kwargs)

    def NBestEncodeAsIds(self, input, nbest_size=None, **kwargs):
        return self.NBestEncode(input=input, nbest_size=nbest_size,
                                out_type=int, **kwargs)

    def NBestEncodeAsSerializedProto(self, input, nbest_size=None, **kwargs):
        return self.NBestEncode(input=input, nbest_size=nbest_size,
                                out_type='serialized_proto', **kwargs)

    def NBestEncodeAsImmutableProto(self, input, nbest_size=None, **kwargs):
        return self.NBestEncode(input=input, nbest_size=nbest_size,
                                out_type='immutable_proto', **kwargs)
|
| 677 |
+
|
| 678 |
+
|
| 679 |
+
def SampleEncodeAndScore(self,
|
| 680 |
+
input,
|
| 681 |
+
out_type=None,
|
| 682 |
+
add_bos=None,
|
| 683 |
+
add_eos=None,
|
| 684 |
+
reverse=None,
|
| 685 |
+
emit_unk_piece=None,
|
| 686 |
+
num_samples=None,
|
| 687 |
+
alpha=None,
|
| 688 |
+
wor=None,
|
| 689 |
+
include_best=None):
|
| 690 |
+
"""SampleEncodeAndScore text input to segmented ids or tokens.
|
| 691 |
+
|
| 692 |
+
Args:
|
| 693 |
+
input: input string. accepsts list of string.
|
| 694 |
+
out_type: output type. int or str or 'serialized_proto' or 'immutable_proto'
|
| 695 |
+
add_bos: Add <s> to the result (Default = false)
|
| 696 |
+
add_eos: Add </s> to the result (Default = false) <s>/</s> is added after reversing (if enabled).
|
| 697 |
+
reverse: Reverses the tokenized sequence (Default = false)
|
| 698 |
+
emit_unk_piece: Emits the unk literal string (Default = false)
|
| 699 |
+
num_samples: How many samples to return (Default = 1)
|
| 700 |
+
alpha: inverse temperature for sampling
|
| 701 |
+
wor: whether to sample without replacement (Default = false)
|
| 702 |
+
include_best: whether to include the best tokenization, requires wor=True (Default = false)
|
| 703 |
+
"""
|
| 704 |
+
|
| 705 |
+
if out_type is None:
|
| 706 |
+
out_type = self._out_type
|
| 707 |
+
if add_bos is None:
|
| 708 |
+
add_bos = self._add_bos
|
| 709 |
+
if add_eos is None:
|
| 710 |
+
add_eos = self._add_eos
|
| 711 |
+
if reverse is None:
|
| 712 |
+
reverse = self._reverse
|
| 713 |
+
if emit_unk_piece is None:
|
| 714 |
+
emit_unk_piece = self._emit_unk_piece
|
| 715 |
+
if num_samples is None:
|
| 716 |
+
num_samples = 1
|
| 717 |
+
if alpha is None:
|
| 718 |
+
alpha = 1.
|
| 719 |
+
if wor is None:
|
| 720 |
+
wor = False
|
| 721 |
+
if include_best is None:
|
| 722 |
+
include_best = False
|
| 723 |
+
|
| 724 |
+
if num_samples <= 0:
|
| 725 |
+
raise RuntimeError('num_examples must be positive')
|
| 726 |
+
|
| 727 |
+
if include_best and not wor:
|
| 728 |
+
raise RuntimeError('When include_best is True, We must specify "wor = True".')
|
| 729 |
+
|
| 730 |
+
|
| 731 |
+
def _encode(text):
|
| 732 |
+
if out_type is int:
|
| 733 |
+
return self._SampleEncodeAndScoreAsIds(text, num_samples, alpha, wor, include_best,
|
| 734 |
+
add_bos, add_eos, reverse, emit_unk_piece)
|
| 735 |
+
if out_type is str:
|
| 736 |
+
return self._SampleEncodeAndScoreAsPieces(text, num_samples, alpha, wor, include_best,
|
| 737 |
+
add_bos, add_eos, reverse, emit_unk_piece)
|
| 738 |
+
|
| 739 |
+
if out_type == 'serialized_proto' or out_type == 'proto':
|
| 740 |
+
return self._SampleEncodeAndScoreAsSerializedProto(text, num_samples, alpha, wor, include_best,
|
| 741 |
+
add_bos, add_eos, reverse, emit_unk_piece)
|
| 742 |
+
|
| 743 |
+
if out_type == 'immutable_proto':
|
| 744 |
+
return self._SampleEncodeAndScoreAsImmutableProto(text, num_samples, alpha, wor, include_best,
|
| 745 |
+
add_bos, add_eos, reverse, emit_unk_piece)
|
| 746 |
+
|
| 747 |
+
raise RuntimeError('unknown output type')
|
| 748 |
+
|
| 749 |
+
|
| 750 |
+
if type(input) is list:
|
| 751 |
+
return [_encode(n) for n in input]
|
| 752 |
+
|
| 753 |
+
return _encode(input)
|
| 754 |
+
|
| 755 |
+
|
| 756 |
+
def SampleEncodeAndScoreAsPieces(self, input, num_samples=None, alpha=None, **kwargs):
|
| 757 |
+
return self.SampleEncodeAndScore(input=input, num_samples=num_samples, alpha=alpha,
|
| 758 |
+
out_type=str, **kwargs)
|
| 759 |
+
|
| 760 |
+
|
| 761 |
+
def SampleEncodeAndScoreAsIds(self, input, num_samples=None, alpha=None, **kwargs):
|
| 762 |
+
return self.SampleEncodeAndScore(input=input, num_samples=num_samples, alpha=alpha,
|
| 763 |
+
out_type=int, **kwargs)
|
| 764 |
+
|
| 765 |
+
|
| 766 |
+
def SampleEncodeAndScoreAsSerializedProto(self, input, num_samples=None, alpha=None, **kwargs):
|
| 767 |
+
return self.SampleEncodeAndScore(input=input, num_samples=num_samples, alpha=alpha,
|
| 768 |
+
out_type='serialized_proto', **kwargs)
|
| 769 |
+
|
| 770 |
+
|
| 771 |
+
def SampleEncodeAndScoreAsImmutableProto(self, input, num_samples=None, alpha=None, **kwargs):
|
| 772 |
+
return self.SampleEncodeAndScore(input=input, num_samples=num_samples, alpha=alpha,
|
| 773 |
+
out_type='immutable_proto', **kwargs)
|
| 774 |
+
|
| 775 |
+
|
| 776 |
+
  def Decode(self, input, out_type=str, num_threads=None):
    """Decode processed id or token sequences.

    Args:
      out_type: output type. str, bytes or 'serialized_proto' or 'immutable_proto' (Default = str)
      num_threads: the number of threads used in the batch processing (Default = -1).
    """

    if num_threads is None:
      num_threads = self._num_threads

    if num_threads is None or type(num_threads) is not int:
      raise RuntimeError('num_threads must be int')

    # Any falsy input (empty string/list, 0, None) decodes to the empty string.
    if not input:
      return ''

    # Dispatch on (out_type, input shape). The input may be a scalar id or
    # piece, a flat list of ids/pieces, or a list of such lists (a batch).
    # An empty list (inner or outer) is treated as ids.
    if out_type is str:
      if type(input) is int:
        return self._DecodeIds([input])
      if type(input) is str:
        return self._DecodePieces([input])

      if type(input) is list:
        if len(input) == 0 or type(input[0]) is int:
          return self._DecodeIds(input)
        if type(input[0]) is str:
          return self._DecodePieces(input)

        if type(input[0]) is list:
          if len(input[0]) == 0 or type(input[0][0]) is int:
            return self._DecodeIdsBatch(input, num_threads)
          if type(input[0][0]) is str:
            return self._DecodePiecesBatch(input, num_threads)

    if out_type is bytes:
      if type(input) is int:
        return self._DecodeIdsAsBytes([input])
      # NOTE(review): piece-string inputs still go through _DecodePieces here,
      # which returns str rather than bytes — presumably intentional; confirm.
      if type(input) is str:
        return self._DecodePieces([input])

      if type(input) is list:
        if len(input) == 0 or type(input[0]) is int:
          return self._DecodeIdsAsBytes(input)
        if type(input[0]) is str:
          return self._DecodePieces(input)

        if type(input[0]) is list:
          if len(input[0]) == 0 or type(input[0][0]) is int:
            return self._DecodeIdsAsBytesBatch(input, num_threads)
          if type(input[0][0]) is str:
            return self._DecodePiecesBatch(input, num_threads)

    if out_type == 'serialized_proto':
      if type(input) is int:
        return self._DecodeIdsAsSerializedProto([input])
      if type(input) is str:
        return self._DecodePiecesAsSerializedProto([input])

      if type(input) is list:
        if len(input) == 0 or type(input[0]) is int:
          return self._DecodeIdsAsSerializedProto(input)
        if type(input[0]) is str:
          return self._DecodePiecesAsSerializedProto(input)

        if type(input[0]) is list:
          if len(input[0]) == 0 or type(input[0][0]) is int:
            return self._DecodeIdsAsSerializedProtoBatch(input, num_threads)
          if type(input[0][0]) is str:
            return self._DecodePiecesAsSerializedProtoBatch(input, num_threads)

    if out_type == 'immutable_proto':
      if type(input) is int:
        return self._DecodeIdsAsImmutableProto([input])
      if type(input) is str:
        return self._DecodePiecesAsImmutableProto([input])

      if type(input) is list:
        if len(input) == 0 or type(input[0]) is int:
          return self._DecodeIdsAsImmutableProto(input)
        if type(input[0]) is str:
          return self._DecodePiecesAsImmutableProto(input)

        if type(input[0]) is list:
          if len(input[0]) == 0 or type(input[0][0]) is int:
            return self._DecodeIdsAsImmutableProtoBatch(input, num_threads)
          if type(input[0][0]) is str:
            return self._DecodePiecesAsImmutableProtoBatch(input, num_threads)

    raise RuntimeError('unknown output or input type')
    return None  # unreachable: kept from the original source
|
| 869 |
+
|
| 870 |
+
|
| 871 |
+
def DecodePieces(self, input, out_type=str, **kwargs):
|
| 872 |
+
return self.Decode(input=input, out_type=out_type, **kwargs)
|
| 873 |
+
|
| 874 |
+
|
| 875 |
+
def DecodeIds(self, input, out_type=str, **kwargs):
|
| 876 |
+
return self.Decode(input=input, out_type=out_type, **kwargs)
|
| 877 |
+
|
| 878 |
+
|
| 879 |
+
def DecodePiecesAsSerializedProto(self, input, out_type='serialized_proto', **kwargs):
|
| 880 |
+
return self.Decode(input=input, out_type=out_type, **kwargs)
|
| 881 |
+
|
| 882 |
+
|
| 883 |
+
def DecodeIdsAsSerializedProto(self, input, out_type='serialized_proto', **kwargs):
|
| 884 |
+
return self.Decode(input=input, out_type=out_type, **kwargs)
|
| 885 |
+
|
| 886 |
+
|
| 887 |
+
def DecodePiecesAsImmutableProto(self, input, out_type='immutable_proto', **kwargs):
|
| 888 |
+
return self.Decode(input=input, out_type=out_type, **kwargs)
|
| 889 |
+
|
| 890 |
+
|
| 891 |
+
def DecodeIdsAsImmutableProto(self, input, out_type='immutable_proto', **kwargs):
|
| 892 |
+
return self.Decode(input=input, out_type=out_type, **kwargs)
|
| 893 |
+
|
| 894 |
+
|
| 895 |
+
def CalculateEntropy(self, input, alpha, num_threads=None):
|
| 896 |
+
"""Calculate sentence entropy"""
|
| 897 |
+
if type(input) is list:
|
| 898 |
+
if num_threads is None:
|
| 899 |
+
num_threads = self._num_threads
|
| 900 |
+
if num_threads is None or type(num_threads) is not int:
|
| 901 |
+
raise RuntimeError('num_threads must be int')
|
| 902 |
+
return self._CalculateEntropyBatch(input, alpha, num_threads)
|
| 903 |
+
|
| 904 |
+
return self._CalculateEntropy(input, alpha)
|
| 905 |
+
|
| 906 |
+
|
| 907 |
+
def Normalize(self, input, with_offsets=None):
|
| 908 |
+
def _normalize(text):
|
| 909 |
+
if with_offsets:
|
| 910 |
+
return self._NormalizeWithOffsets(text)
|
| 911 |
+
return self._Normalize(text)
|
| 912 |
+
|
| 913 |
+
if type(input) is list:
|
| 914 |
+
return [_normalize(x) for x in input]
|
| 915 |
+
return _normalize(input)
|
| 916 |
+
|
| 917 |
+
def OverrideNormalizerSpec(self, **kwargs):
|
| 918 |
+
new_kwargs = {}
|
| 919 |
+
for key, value in kwargs.items():
|
| 920 |
+
new_kwargs[key] = str(value)
|
| 921 |
+
return self._OverrideNormalizerSpec(new_kwargs)
|
| 922 |
+
|
| 923 |
+
|
| 924 |
+
  def piece_size(self):
    # Alias of GetPieceSize().
    return self.GetPieceSize()
|
| 926 |
+
|
| 927 |
+
|
| 928 |
+
  def vocab_size(self):
    # Alias of GetPieceSize().
    return self.GetPieceSize()
|
| 930 |
+
|
| 931 |
+
|
| 932 |
+
  def __getstate__(self):
    # Pickle support: the whole state is the serialized model proto.
    return self.serialized_model_proto()
|
| 934 |
+
|
| 935 |
+
|
| 936 |
+
  def __setstate__(self, serialized_model_proto):
    # Pickle support: re-create the native object, then restore the model.
    self.__init__()
    self.LoadFromSerializedProto(serialized_model_proto)
|
| 939 |
+
|
| 940 |
+
|
| 941 |
+
  def __len__(self):
    # len(sp) == vocabulary size.
    return self.GetPieceSize()
|
| 943 |
+
|
| 944 |
+
|
| 945 |
+
  def __getitem__(self, piece):
    # sp[piece] == piece-to-id lookup.
    return self.PieceToId(piece)
|
| 947 |
+
|
| 948 |
+
|
| 949 |
+
def Load(self, model_file=None, model_proto=None):
|
| 950 |
+
"""Overwride SentencePieceProcessor.Load to support both model_file and model_proto.
|
| 951 |
+
|
| 952 |
+
Args:
|
| 953 |
+
model_file: The sentencepiece model file path.
|
| 954 |
+
model_proto: The sentencepiece model serialized proto. Either `model_file`
|
| 955 |
+
or `model_proto` must be set.
|
| 956 |
+
"""
|
| 957 |
+
if model_file and model_proto:
|
| 958 |
+
raise RuntimeError('model_file and model_proto must be exclusive.')
|
| 959 |
+
if model_proto:
|
| 960 |
+
return self.LoadFromSerializedProto(model_proto)
|
| 961 |
+
return self.LoadFromFile(model_file)
|
| 962 |
+
|
| 963 |
+
|
| 964 |
+
# Register SentencePieceProcessor in _sentencepiece:
_sentencepiece.SentencePieceProcessor_swigregister(SentencePieceProcessor)

def SetRandomGeneratorSeed(seed):
  # Seed the native random generator (affects sampling encoders).
  return _sentencepiece.SetRandomGeneratorSeed(seed)

def SetMinLogLevel(v):
  # Set the minimum severity of native log messages that are emitted.
  return _sentencepiece.SetMinLogLevel(v)
|
| 972 |
+
class SentencePieceTrainer(object):
  """Static wrapper over the native SentencePiece trainer (no instances)."""

  # SWIG proxy-ownership flag.
  thisown = property(lambda x: x.this.own(), lambda x, v: x.this.own(v), doc="The membership flag")

  def __init__(self, *args, **kwargs):
    # The trainer is used only via its static methods.
    raise AttributeError("No constructor defined")
  __repr__ = _swig_repr

  @staticmethod
  def _TrainFromString(arg):
    # Train from a legacy flag string, e.g. '--input=... --model_prefix=...'.
    return _sentencepiece.SentencePieceTrainer__TrainFromString(arg)

  @staticmethod
  def _TrainFromMap(args):
    # Train from a {flag: value} map; model is written to disk natively.
    return _sentencepiece.SentencePieceTrainer__TrainFromMap(args)

  @staticmethod
  def _TrainFromMap2(args, iter):
    # Train from a map, reading sentences from a Python iterator.
    return _sentencepiece.SentencePieceTrainer__TrainFromMap2(args, iter)

  @staticmethod
  def _TrainFromMap3(args):
    # Train from a map and return the model proto to Python.
    return _sentencepiece.SentencePieceTrainer__TrainFromMap3(args)

  @staticmethod
  def _TrainFromMap4(args, iter):
    # Train from a map with a sentence iterator and return the model proto.
    return _sentencepiece.SentencePieceTrainer__TrainFromMap4(args, iter)

  @staticmethod
  def _Train(arg=None, **kwargs):
    """Train Sentencepiece model. Accept both kwargs and legacy string arg."""
    if arg is not None and type(arg) is str:
      return SentencePieceTrainer._TrainFromString(arg)

    def _encode(value):
      """Encode value to CSV."""
      if type(value) is list:
        # List values are serialized as a single CSV row so the native flag
        # parser can split them back; csv handles quoting/escaping.
        if sys.version_info[0] == 3:
          f = StringIO()
        else:
          # Python 2 legacy branch.
          f = BytesIO()
        writer = csv.writer(f, lineterminator='')
        writer.writerow([str(v) for v in value])
        return f.getvalue()
      else:
        return str(value)

    # Split kwargs into the two special Python-side arguments and the plain
    # training flags that are forwarded (stringified) to the native trainer.
    sentence_iterator = None
    model_writer = None
    new_kwargs = {}
    for key, value in kwargs.items():
      if key in ['sentence_iterator', 'sentence_reader']:
        sentence_iterator = value
      elif key in ['model_writer']:
        model_writer = value
      else:
        new_kwargs[key] = _encode(value)

    if model_writer:
      # Caller supplied a writer: obtain the model proto and hand it over.
      if sentence_iterator:
        model_proto = SentencePieceTrainer._TrainFromMap4(new_kwargs,
                                                          sentence_iterator)
      else:
        model_proto = SentencePieceTrainer._TrainFromMap3(new_kwargs)
      model_writer.write(model_proto)
    else:
      # No writer: the native layer writes the model files itself.
      if sentence_iterator:
        return SentencePieceTrainer._TrainFromMap2(new_kwargs, sentence_iterator)
      else:
        return SentencePieceTrainer._TrainFromMap(new_kwargs)

    return None

  @staticmethod
  def Train(arg=None, logstream=None, **kwargs):
    # Public entry point; optionally redirects native stderr logging into
    # `logstream` for the duration of training.
    with _LogStream(ostream=logstream):
      SentencePieceTrainer._Train(arg=arg, **kwargs)
|
| 1048 |
+
|
| 1049 |
+
|
| 1050 |
+
# Register SentencePieceTrainer in _sentencepiece:
# (links the Python proxy class to its native SWIG type)
_sentencepiece.SentencePieceTrainer_swigregister(SentencePieceTrainer)
|
| 1052 |
+
class SentencePieceNormalizer(object):
  """SWIG proxy for the native text normalizer."""

  # SWIG proxy-ownership flag.
  thisown = property(lambda x: x.this.own(), lambda x, v: x.this.own(v), doc="The membership flag")
  __repr__ = _swig_repr

  def __init__(self):
    # Allocate the underlying native normalizer object.
    _sentencepiece.SentencePieceNormalizer_swiginit(self, _sentencepiece.new_SentencePieceNormalizer())
  __swig_destroy__ = _sentencepiece.delete_SentencePieceNormalizer

  def LoadFromSerializedProto(self, serialized):
    # Load normalizer state from a serialized model proto (bytes).
    return _sentencepiece.SentencePieceNormalizer_LoadFromSerializedProto(self, serialized)

  def LoadFromRuleTSV(self, filename):
    # Load normalization rules from a TSV rule file.
    return _sentencepiece.SentencePieceNormalizer_LoadFromRuleTSV(self, filename)

  def LoadFromRuleName(self, name):
    # Load one of the pre-defined rule sets (e.g. 'nfkc', 'nmt_nfkc').
    return _sentencepiece.SentencePieceNormalizer_LoadFromRuleName(self, name)

  def serialized_model_proto(self):
    # Serialize the current normalizer state (used for pickling).
    return _sentencepiece.SentencePieceNormalizer_serialized_model_proto(self)

  def LoadFromFile(self, arg):
    # Load normalizer state from a sentencepiece model file path.
    return _sentencepiece.SentencePieceNormalizer_LoadFromFile(self, arg)

  def _Normalize(self, text):
    # Native single-string normalization.
    return _sentencepiece.SentencePieceNormalizer__Normalize(self, text)

  def _NormalizeWithOffsets(self, text):
    # Native normalization that also reports character offsets.
    return _sentencepiece.SentencePieceNormalizer__NormalizeWithOffsets(self, text)

  def _SetProtoField(self, name, value):
    # Override a single field of the native normalizer spec proto.
    return _sentencepiece.SentencePieceNormalizer__SetProtoField(self, name, value)

  def Init(self,
           model_file=None,
           model_proto=None,
           rule_tsv=None,
           rule_name=None,
           add_dummy_prefix=False,
           escape_whitespaces=False,
           remove_extra_whitespaces=False):
    """Initialize SentencePieceNormalizer.

    Args:
      model_file: The sentencepiece model file path.
      model_proto: The sentencepiece model serialized proto.
      rule_tsv: The normalization rule file in TSV format.
      rule_name: Pre-defined normalization name.
      add_dummy_prefix: add dummy prefix.
      escape_whitespaces: escape whitespaces.
      remove_extra_whitespaces: remove extra whitespaces.
    """

    # Run the original SWIG constructor first (Init replaces __init__ below).
    _sentencepiece_normalizer_init_native(self)

    # Exactly one model source is used, checked in this priority order.
    if model_file:
      status = self.LoadFromFile(model_file)
    elif model_proto:
      status = self.LoadFromSerializedProto(model_proto)
    elif rule_tsv:
      status = self.LoadFromRuleTSV(rule_tsv)
    elif rule_name:
      status = self.LoadFromRuleName(rule_name)
    else:
      raise RuntimeError('no model is specified')

    # On successful load, apply the whitespace-handling overrides.
    if status:
      self._SetProtoField('add_dummy_prefix', add_dummy_prefix)
      self._SetProtoField('escape_whitespaces', escape_whitespaces)
      self._SetProtoField('remove_extra_whitespaces', remove_extra_whitespaces)

  def Normalize(self, input, with_offsets=None):
    # Normalize a string or a list of strings; with_offsets selects the
    # offset-reporting native variant.
    def _normalize(text):
      if with_offsets:
        return self._NormalizeWithOffsets(text)
      return self._Normalize(text)

    if type(input) is list:
      return [_normalize(x) for x in input]
    return _normalize(input)

  def __getstate__(self):
    # Pickle support: the whole state is the serialized model proto.
    return self.serialized_model_proto()

  def __setstate__(self, serialized_model_proto):
    # Pickle support: re-create the native object, then restore the model.
    self.__init__()
    self.LoadFromSerializedProto(serialized_model_proto)
| 1140 |
+
|
| 1141 |
+
|
| 1142 |
+
# Register SentencePieceNormalizer in _sentencepiece:
_sentencepiece.SentencePieceNormalizer_swigregister(SentencePieceNormalizer)

def SetDataDir(data_dir):
  # Tell the native library where the bundled normalization data files live.
  return _sentencepiece.SetDataDir(data_dir)
|
| 1147 |
+
|
| 1148 |
+
|
| 1149 |
+
import re
|
| 1150 |
+
import csv
|
| 1151 |
+
import sys
|
| 1152 |
+
import os
|
| 1153 |
+
import importlib.resources
|
| 1154 |
+
from io import StringIO
|
| 1155 |
+
from io import BytesIO
|
| 1156 |
+
|
| 1157 |
+
|
| 1158 |
+
def _add_snake_case(classname):
|
| 1159 |
+
"""Added snake_cased method from CammelCased method."""
|
| 1160 |
+
|
| 1161 |
+
snake_map = {}
|
| 1162 |
+
for k, v in classname.__dict__.items():
|
| 1163 |
+
if re.match(r'^[A-Z]+', k):
|
| 1164 |
+
snake = re.sub(r'(?<!^)(?=[A-Z])', '_',
|
| 1165 |
+
k).lower().replace('n_best', 'nbest')
|
| 1166 |
+
snake_map[snake] = v
|
| 1167 |
+
for k, v in snake_map.items():
|
| 1168 |
+
setattr(classname, k, v)
|
| 1169 |
+
|
| 1170 |
+
|
| 1171 |
+
def _batchnize(classname, name):
|
| 1172 |
+
"""Enables batch request for the method classname.name."""
|
| 1173 |
+
func = getattr(classname, name, None)
|
| 1174 |
+
def _func(v, n):
|
| 1175 |
+
if type(n) is int and (n < 0 or n >= v.piece_size()):
|
| 1176 |
+
raise IndexError('piece id is out of range.')
|
| 1177 |
+
return func(v, n)
|
| 1178 |
+
|
| 1179 |
+
def _batched_func(self, arg):
|
| 1180 |
+
if type(arg) is list:
|
| 1181 |
+
return [_func(self, n) for n in arg]
|
| 1182 |
+
else:
|
| 1183 |
+
return _func(self, arg)
|
| 1184 |
+
|
| 1185 |
+
setattr(classname, name, _batched_func)
|
| 1186 |
+
|
| 1187 |
+
|
| 1188 |
+
# Keep references to the SWIG-generated constructors before replacing them
# with the keyword-friendly Init wrappers below; Init calls these natives.
_sentencepiece_processor_init_native = SentencePieceProcessor.__init__
_sentencepiece_normalizer_init_native = SentencePieceNormalizer.__init__
setattr(SentencePieceProcessor, '__init__', SentencePieceProcessor.Init)
setattr(SentencePieceNormalizer, '__init__', SentencePieceNormalizer.Init)

# Convenience aliases.
SentencePieceProcessor.Tokenize = SentencePieceProcessor.Encode
SentencePieceProcessor.Detokenize = SentencePieceProcessor.Decode

# Let the piece/id introspection methods accept lists as well as scalars.
for m in [
    'PieceToId', 'IdToPiece', 'GetScore', 'IsUnknown', 'IsControl', 'IsUnused',
    'IsByte'
]:
  _batchnize(SentencePieceProcessor, m)

# Expose snake_case aliases for the CamelCase public API.
_add_snake_case(SentencePieceProcessor)
_add_snake_case(SentencePieceTrainer)
_add_snake_case(SentencePieceNormalizer)
set_random_generator_seed = SetRandomGeneratorSeed
set_min_log_level = SetMinLogLevel

from ._version import __version__

# Point the native library at the normalization data bundled in the package.
SetDataDir(os.path.join(str(importlib.resources.files('sentencepiece')), 'package_data'))
|
| 1211 |
+
|
| 1212 |
+
class _LogStream(object):
  """Context manager that redirects the process-level stderr file descriptor
  into *ostream* while the block runs, so native (C++) logging can be
  captured. With ostream=None it is a no-op."""

  def __init__(self, ostream=None):
    self.ostream = ostream
    if self.ostream is not None:
      self.orig_stream_fileno = sys.stderr.fileno()

  def __enter__(self):
    if self.ostream is not None:
      # Duplicate the original fd so it can be restored on exit, then point
      # the stderr fd at the caller-supplied stream.
      self.orig_stream_dup = os.dup(self.orig_stream_fileno)
      os.dup2(self.ostream.fileno(), self.orig_stream_fileno)

  def __exit__(self, type, value, traceback):
    if self.ostream is not None:
      # Restore the original stderr fd and release the duplicate.
      os.close(self.orig_stream_fileno)
      os.dup2(self.orig_stream_dup, self.orig_stream_fileno)
      os.close(self.orig_stream_dup)
      # NOTE(review): this closes the caller-supplied stream as well —
      # callers must not reuse it after the with-block.
      self.ostream.close()
|
| 1229 |
+
|
| 1230 |
+
|
source/sentencepiece/_sentencepiece.cpython-312-x86_64-linux-gnu.so
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:cf6b023896aa1c99026445efc2b5cbd3b8a97c6839dd0fdfb2bca757fbed970d
|
| 3 |
+
size 2005912
|
source/sentencepiece/_version.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
__version__ = '0.2.1'
|
source/sentencepiece/package_data/nfkc.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:52f11028ff8a7df3e009d94a8b6a54d4a8a17132efb4ccc1c9a0a41e432bd91e
|
| 3 |
+
size 240008
|
source/sentencepiece/package_data/nfkc_cf.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:60833ec11201446659c3549c183b18f024c4007628cc6b3a4e91ae007697b826
|
| 3 |
+
size 247028
|
source/sentencepiece/package_data/nmt_nfkc.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:79372c41389c2b9b29bc171017ab5400e352debd686b02670a42bec709015074
|
| 3 |
+
size 240007
|
source/sentencepiece/package_data/nmt_nfkc_cf.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:22c292c76f503795f30c85d79d30a7f572fff4f49e00392d2d60f4f93e941a1e
|
| 3 |
+
size 247027
|
source/sentencepiece/sentencepiece.i
ADDED
|
@@ -0,0 +1,2013 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
%module sentencepiece
|
| 2 |
+
%include exception.i
|
| 3 |
+
|
| 4 |
+
%{
|
| 5 |
+
|
| 6 |
+
#include <atomic>
|
| 7 |
+
#include <iostream>
|
| 8 |
+
#include <algorithm>
|
| 9 |
+
#include <functional>
|
| 10 |
+
#include <limits>
|
| 11 |
+
#include <cmath>
|
| 12 |
+
#include <thread>
|
| 13 |
+
#include <vector>
|
| 14 |
+
#include <sentencepiece_processor.h>
|
| 15 |
+
#include <sentencepiece_trainer.h>
|
| 16 |
+
|
| 17 |
+
namespace {
|
| 18 |
+
PyObject* kUnicodeInput = reinterpret_cast<PyObject* >(0x1);
|
| 19 |
+
PyObject* kByteInput = reinterpret_cast<PyObject* >(0x2);
|
| 20 |
+
|
| 21 |
+
using BytesArray = std::vector<sentencepiece::util::bytes>;
|
| 22 |
+
|
| 23 |
+
// Drop our reference to a result-holder object.  The sentinel input-type
// tags are not real Python objects, so they must never be decref'ed.
inline void ReleaseResultObject(PyObject *obj) {
  if (obj == nullptr || obj == kUnicodeInput || obj == kByteInput) return;
  Py_XDECREF(obj);
}
|
| 28 |
+
|
| 29 |
+
class PyInputString {
|
| 30 |
+
public:
|
| 31 |
+
explicit PyInputString(PyObject* obj) {
|
| 32 |
+
if (PyUnicode_Check(obj)) {
|
| 33 |
+
str_ = const_cast<char *>(PyUnicode_AsUTF8AndSize(obj, &size_));
|
| 34 |
+
input_type_ = kUnicodeInput;
|
| 35 |
+
} else if (PyBytes_Check(obj)) {
|
| 36 |
+
PyBytes_AsStringAndSize(obj, &str_, &size_);
|
| 37 |
+
input_type_ = kByteInput;
|
| 38 |
+
} else {
|
| 39 |
+
str_ = nullptr;
|
| 40 |
+
}
|
| 41 |
+
}
|
| 42 |
+
absl::string_view str() const { return absl::string_view(data(), size()); }
|
| 43 |
+
const char* data() const { return str_; }
|
| 44 |
+
Py_ssize_t size() const { return size_; }
|
| 45 |
+
bool IsAvalable() const { return str_ != nullptr; }
|
| 46 |
+
PyObject *input_type() const { return input_type_; }
|
| 47 |
+
|
| 48 |
+
static bool IsUnicode(PyObject *resultobj) {
|
| 49 |
+
return (resultobj == nullptr || resultobj == kUnicodeInput);
|
| 50 |
+
}
|
| 51 |
+
|
| 52 |
+
private:
|
| 53 |
+
PyObject* input_type_ = nullptr;
|
| 54 |
+
char* str_ = nullptr;
|
| 55 |
+
Py_ssize_t size_ = 0;
|
| 56 |
+
};
|
| 57 |
+
|
| 58 |
+
// Build the Python return value for `output`, mirroring the input type that
// was recorded in `resultobj` (str in / str out, bytes in / bytes out).
PyObject* MakePyOutputString(const std::string& output,
                             PyObject *resultobj) {
  return PyInputString::IsUnicode(resultobj)
             ? PyUnicode_FromStringAndSize(output.data(), output.size())
             : PyBytes_FromStringAndSize(output.data(), output.size());
}
|
| 65 |
+
|
| 66 |
+
// Wrap a serialized-proto payload in a new Python `bytes` object.
PyObject* MakePyOutputBytes(const sentencepiece::util::bytes& output) {
  const char* buf = output.data();
  return PyBytes_FromStringAndSize(buf, output.size());
}
|
| 69 |
+
|
| 70 |
+
// Map a sentencepiece status code onto the closest SWIG error category.
// Anything unrecognized becomes a generic runtime error.
int ToSwigError(sentencepiece::util::StatusCode code) {
  switch (code) {
    case sentencepiece::util::StatusCode::kNotFound:
      return SWIG_IOError;
    case sentencepiece::util::StatusCode::kOutOfRange:
      return SWIG_IndexError;
    case sentencepiece::util::StatusCode::kInvalidArgument:
      return SWIG_SyntaxError;
    default:
      return SWIG_RuntimeError;
  }
  // Unreachable trailing `return SWIG_RuntimeError;` removed: every switch
  // path above already returns.
}
|
| 83 |
+
|
| 84 |
+
class PySentenceIterator : public sentencepiece::SentenceIterator {
|
| 85 |
+
public:
|
| 86 |
+
PySentenceIterator(PyObject *iter) : iter_(iter) {
|
| 87 |
+
item_ = PyIter_Next(iter_);
|
| 88 |
+
CopyValue();
|
| 89 |
+
}
|
| 90 |
+
|
| 91 |
+
~PySentenceIterator() {
|
| 92 |
+
// Py_XDECREF(iter_);
|
| 93 |
+
}
|
| 94 |
+
|
| 95 |
+
bool done() const override {
|
| 96 |
+
return item_ == nullptr;
|
| 97 |
+
}
|
| 98 |
+
|
| 99 |
+
void Next() override {
|
| 100 |
+
item_ = PyIter_Next(iter_);
|
| 101 |
+
CopyValue();
|
| 102 |
+
}
|
| 103 |
+
|
| 104 |
+
const std::string &value() const override {
|
| 105 |
+
return value_;
|
| 106 |
+
}
|
| 107 |
+
|
| 108 |
+
sentencepiece::util::Status status() const override {
|
| 109 |
+
return status_;
|
| 110 |
+
}
|
| 111 |
+
|
| 112 |
+
private:
|
| 113 |
+
void CopyValue() {
|
| 114 |
+
if (item_ == nullptr) return;
|
| 115 |
+
const PyInputString ustring(item_);
|
| 116 |
+
if (ustring.IsAvalable()) {
|
| 117 |
+
const char *data = ustring.data();
|
| 118 |
+
size_t size = ustring.size();
|
| 119 |
+
while (size > 0) {
|
| 120 |
+
if (data[size - 1] == '\r' || data[size - 1] == '\n')
|
| 121 |
+
--size;
|
| 122 |
+
else
|
| 123 |
+
break;
|
| 124 |
+
}
|
| 125 |
+
value_.assign(data, size);
|
| 126 |
+
} else {
|
| 127 |
+
status_ = sentencepiece::util::Status(sentencepiece::util::StatusCode::kInternal,
|
| 128 |
+
"Not a string.");
|
| 129 |
+
}
|
| 130 |
+
Py_XDECREF(item_);
|
| 131 |
+
}
|
| 132 |
+
PyObject *iter_ = nullptr;
|
| 133 |
+
PyObject *item_ = nullptr;
|
| 134 |
+
std::string value_;
|
| 135 |
+
sentencepiece::util::Status status_;
|
| 136 |
+
};
|
| 137 |
+
|
| 138 |
+
// Post-process a single id sequence: optional reversal, then BOS/EOS
// insertion.  (emit_unk_piece has no meaning for ids and is ignored.)
inline void RewriteIds(const sentencepiece::SentencePieceProcessor &sp,
                       std::vector<int> *ids,
                       bool add_bos, bool add_eos, bool reverse, bool emit_unk_piece) {
  const bool nothing_to_do = !add_bos && !add_eos && !reverse;
  if (nothing_to_do) return;
  if (reverse) std::reverse(ids->begin(), ids->end());
  if (add_bos) ids->insert(ids->begin(), sp.bos_id());
  if (add_eos) ids->push_back(sp.eos_id());
}
|
| 146 |
+
|
| 147 |
+
// Post-process a piece sequence: optional reversal, BOS/EOS insertion, and
// (optionally) replacing every piece that maps to the unknown id with the
// canonical unknown piece.
inline void RewriteIds(const sentencepiece::SentencePieceProcessor &sp,
                       std::vector<std::string> *pieces,
                       bool add_bos, bool add_eos, bool reverse, bool emit_unk_piece) {
  if (!(add_bos || add_eos || reverse || emit_unk_piece)) return;
  if (reverse) std::reverse(pieces->begin(), pieces->end());
  if (add_bos) pieces->insert(pieces->begin(), sp.IdToPiece(sp.bos_id()));
  if (add_eos) pieces->push_back(sp.IdToPiece(sp.eos_id()));
  if (!emit_unk_piece) return;
  const auto &unk = sp.IdToPiece(sp.unk_id());
  for (auto &piece : *pieces) {
    if (sp.PieceToId(piece) == sp.unk_id()) piece = unk;
  }
}
|
| 164 |
+
|
| 165 |
+
// Serialized-proto results cannot be rewritten after the fact, so any
// rewrite request is rejected up front.
inline void RewriteIds(const sentencepiece::SentencePieceProcessor &sp,
                       sentencepiece::util::bytes *proto,
                       bool add_bos, bool add_eos, bool reverse, bool emit_unk_piece) {
  const bool rewrite_requested = add_bos || add_eos || reverse || emit_unk_piece;
  if (rewrite_requested) {
    throw sentencepiece::util::Status(
        sentencepiece::util::StatusCode::kUnimplemented,
        "add_bos, add_eos, reverse, and emit_unk_piece is not supported in proto API");
  }
}
|
| 174 |
+
|
| 175 |
+
// Immutable-proto results cannot be rewritten after the fact, so any
// rewrite request is rejected up front.
inline void RewriteIds(const sentencepiece::SentencePieceProcessor &sp,
                       sentencepiece::ImmutableSentencePieceText *proto,
                       bool add_bos, bool add_eos, bool reverse, bool emit_unk_piece) {
  const bool rewrite_requested = add_bos || add_eos || reverse || emit_unk_piece;
  if (rewrite_requested) {
    throw sentencepiece::util::Status(
        sentencepiece::util::StatusCode::kUnimplemented,
        "add_bos, add_eos, reverse, and emit_unk_piece is not supported in proto API");
  }
}
|
| 184 |
+
|
| 185 |
+
inline void CheckIds(const std::vector<int> &ids, int num_pieces) {
|
| 186 |
+
for (int id : ids) {
|
| 187 |
+
if (id < 0 || id >= num_pieces) {
|
| 188 |
+
throw sentencepiece::util::Status(
|
| 189 |
+
sentencepiece::util::StatusCode::kOutOfRange,
|
| 190 |
+
"piece id is out of range.");
|
| 191 |
+
}
|
| 192 |
+
}
|
| 193 |
+
}
|
| 194 |
+
|
| 195 |
+
// Pieces need no range validation; this no-op overload exists so the batch
// decode templates compile for both id and piece inputs.
inline void CheckIds(const std::vector<absl::string_view> &ids, int num_pieces) {}
|
| 196 |
+
|
| 197 |
+
inline void CheckIdsBatch(const std::vector<std::vector<int>> &ids, int num_pieces) {
|
| 198 |
+
for (const auto &v : ids) CheckIds(v, num_pieces);
|
| 199 |
+
}
|
| 200 |
+
|
| 201 |
+
// Default: most result types carry no span information; nothing to convert.
template <typename T>
inline void ConvertToUnicodeSpans(T *proto) {}
|
| 203 |
+
|
| 204 |
+
// Convert byte-offset spans to unicode-character spans in a single result.
template <>
inline void ConvertToUnicodeSpans(sentencepiece::ImmutableSentencePieceText *proto) {
  proto->ConvertToUnicodeSpans();
}
|
| 208 |
+
|
| 209 |
+
// Convert byte-offset spans to unicode-character spans in an n-best result.
template <>
inline void ConvertToUnicodeSpans(sentencepiece::ImmutableNBestSentencePieceText *proto) {
  proto->ConvertToUnicodeSpans();
}
|
| 213 |
+
|
| 214 |
+
// Minimal fire-and-forget pool: each Schedule() call may spawn a thread and
// the destructor joins them all.  Requests smaller than kMinThreadSize are
// run inline to avoid thread start-up cost.
class ThreadPool {
 public:
  explicit ThreadPool(size_t request_size) :
    request_size_(request_size) {}

  virtual ~ThreadPool() {
    for (std::thread &worker : tasks_) worker.join();
  }

  void Schedule(std::function<void()> closure) {
    static constexpr size_t kMinThreadSize = 2;
    if (request_size_ >= kMinThreadSize) {
      tasks_.emplace_back(closure);
    } else {
      closure();
    }
  }

 private:
  size_t request_size_ = 0;      // number of items the caller will schedule
  std::vector<std::thread> tasks_;
};
|
| 238 |
+
|
| 239 |
+
// Resolve the requested thread count: negative means "use hardware
// concurrency"; the result is clamped to [1, min(batch size, 256)].
template <typename T>
inline void InitNumThreads(const std::vector<T> &ins, int *num_threads) {
  if (*num_threads < 0) {
    *num_threads = std::thread::hardware_concurrency();
  }
  const int cap = std::min<int>(static_cast<int>(ins.size()), 256);
  *num_threads = std::max<int>(1, std::min<int>(*num_threads, cap));
}
|
| 248 |
+
|
| 249 |
+
// Shared body for batched Encode* wrappers: fan work out over up to
// `num_threads` threads, each claiming input indices from a shared atomic
// counter; every result is rewritten (BOS/EOS/reverse/unk) and span-converted
// in place.  Expects `ins`, `num_threads`, `self`, `enable_sampling`,
// `nbest_size`, `alpha`, `add_bos`, `add_eos`, `reverse`, `emit_unk_piece`
// in the expansion scope.
#define DEFINE_ENCODE_BATCH_FUNC_IMPL(FuncName, InType, OutType)          \
  std::vector<OutType> outs(ins.size());                                  \
  InitNumThreads(ins, &num_threads);                                      \
  {                                                                       \
    ThreadPool pool(ins.size());                                          \
    std::atomic<size_t> index = 0;                                        \
    for (int t = 0; t < num_threads; ++t) {                               \
      pool.Schedule([&]() {                                               \
        size_t i = 0;                                                     \
        while ((i = std::atomic_fetch_add(&index, 1)) < outs.size()) {    \
          auto out = enable_sampling ?                                    \
                     self->Sample##FuncName(ins[i], nbest_size, alpha) :  \
                     self->FuncName(ins[i]);                              \
          RewriteIds(*self, &out, add_bos, add_eos, reverse,              \
                     emit_unk_piece);                                     \
          ConvertToUnicodeSpans(&out);                                    \
          outs[i] = std::move(out);                                       \
        }                                                                 \
      });                                                                 \
    }                                                                     \
  }                                                                       \
  return outs;
|
| 272 |
+
|
| 273 |
+
// Shared body for batched Decode* wrappers: same work-stealing scheme as the
// encode macro, without the sampling/rewrite options.  Expects `ins`,
// `num_threads`, and `self` in the expansion scope.
#define DEFINE_DECODE_BATCH_FUNC_IMPL(FuncName, InType, OutType)        \
  std::vector<OutType> outs(ins.size());                                \
  InitNumThreads(ins, &num_threads);                                    \
  {                                                                     \
    std::atomic<size_t> index = 0;                                      \
    ThreadPool pool(ins.size());                                        \
    for (int t = 0; t < num_threads; ++t) {                             \
      pool.Schedule([&]() {                                             \
        size_t i = 0;                                                   \
        while ((i = std::atomic_fetch_add(&index, 1)) < outs.size()) {  \
          auto out = self->FuncName(ins[i]);                            \
          ConvertToUnicodeSpans(&out);                                  \
          outs[i] = std::move(out);                                     \
        }                                                               \
      });                                                               \
    }                                                                   \
  }                                                                     \
  return outs;
|
| 291 |
+
|
| 292 |
+
} // namespace
|
| 293 |
+
%}
|
| 294 |
+
|
| 295 |
+
%init %{
|
| 296 |
+
#ifdef Py_GIL_DISABLED
|
| 297 |
+
PyUnstable_Module_SetGIL(m, Py_MOD_GIL_NOT_USED);
|
| 298 |
+
#endif
|
| 299 |
+
%}
|
| 300 |
+
|
| 301 |
+
%exception {
|
| 302 |
+
try {
|
| 303 |
+
$action
|
| 304 |
+
ReleaseResultObject(resultobj);
|
| 305 |
+
}
|
| 306 |
+
catch (const sentencepiece::util::Status &status) {
|
| 307 |
+
SWIG_exception(ToSwigError(status.code()), status.ToString().c_str());
|
| 308 |
+
}
|
| 309 |
+
}
|
| 310 |
+
|
| 311 |
+
%apply unsigned int { uint32_t }
|
| 312 |
+
|
| 313 |
+
%ignore sentencepiece::util::Status;
|
| 314 |
+
%ignore sentencepiece::util::StatusCode;
|
| 315 |
+
%ignore absl::string_view;
|
| 316 |
+
%ignore std::string_view;
|
| 317 |
+
%ignore sentencepiece::SentencePieceText;
|
| 318 |
+
%ignore sentencepiece::NormalizerSpec;
|
| 319 |
+
%ignore sentencepiece::TrainerSpec;
|
| 320 |
+
%ignore sentencepiece::SentencePieceProcessor::status;
|
| 321 |
+
%ignore sentencepiece::ImmutableSentencePieceText::mutable_proto;
|
| 322 |
+
%ignore sentencepiece::ImmutableSentencePieceText::pieces() const;
|
| 323 |
+
%ignore sentencepiece::ImmutableSentencePieceText::ConvertToUnicodeSpans;
|
| 324 |
+
%ignore sentencepiece::ImmutableNBestSentencePieceText::mutable_proto;
|
| 325 |
+
%ignore sentencepiece::ImmutableNBestSentencePieceText::nbests() const;
|
| 326 |
+
%ignore sentencepiece::ImmutableNBestSentencePieceText::ConvertToUnicodeSpans;
|
| 327 |
+
|
| 328 |
+
%ignore sentencepiece::SentencePieceProcessor::Encode;
|
| 329 |
+
%ignore sentencepiece::SentencePieceProcessor::SampleEncode;
|
| 330 |
+
%ignore sentencepiece::SentencePieceProcessor::NBestEncode;
|
| 331 |
+
%ignore sentencepiece::SentencePieceProcessor::SampleEncodeAndScore;
|
| 332 |
+
%ignore sentencepiece::SentencePieceProcessor::Decode;
|
| 333 |
+
|
| 334 |
+
%ignore sentencepiece::SentencePieceProcessor::EncodeAsPieces;
|
| 335 |
+
%ignore sentencepiece::SentencePieceProcessor::EncodeAsIds;
|
| 336 |
+
%ignore sentencepiece::SentencePieceProcessor::SampleEncodeAsIds;
|
| 337 |
+
%ignore sentencepiece::SentencePieceProcessor::SampleEncodeAsPieces;
|
| 338 |
+
%ignore sentencepiece::SentencePieceProcessor::NBestEncodeAsIds;
|
| 339 |
+
%ignore sentencepiece::SentencePieceProcessor::NBestEncodeAsPieces;
|
| 340 |
+
%ignore sentencepiece::SentencePieceProcessor::SampleEncodeAndScoreAsIds;
|
| 341 |
+
%ignore sentencepiece::SentencePieceProcessor::SampleEncodeAndScoreAsPieces;
|
| 342 |
+
%ignore sentencepiece::SentencePieceProcessor::DecodeIds;
|
| 343 |
+
%ignore sentencepiece::SentencePieceProcessor::DecodePieces;
|
| 344 |
+
|
| 345 |
+
%ignore sentencepiece::SentencePieceProcessor::EncodeAsSerializedProto;
|
| 346 |
+
%ignore sentencepiece::SentencePieceProcessor::SampleEncodeAsSerializedProto;
|
| 347 |
+
%ignore sentencepiece::SentencePieceProcessor::NBestEncodeAsSerializedProto;
|
| 348 |
+
%ignore sentencepiece::SentencePieceProcessor::SampleEncodeAndScoreAsSerializedProto;
|
| 349 |
+
%ignore sentencepiece::SentencePieceProcessor::DecodePiecesAsSerializedProto;
|
| 350 |
+
%ignore sentencepiece::SentencePieceProcessor::DecodeIdsAsSerializedProto;
|
| 351 |
+
|
| 352 |
+
%ignore sentencepiece::SentencePieceProcessor::EncodeAsImmutableProto;
|
| 353 |
+
%ignore sentencepiece::SentencePieceProcessor::SampleEncodeAsImmutableProto;
|
| 354 |
+
%ignore sentencepiece::SentencePieceProcessor::NBestEncodeAsImmutableProto;
|
| 355 |
+
%ignore sentencepiece::SentencePieceProcessor::SampleEncodeAndScoreAsImmutableProto;
|
| 356 |
+
%ignore sentencepiece::SentencePieceProcessor::DecodePiecesAsImmutableProto;
|
| 357 |
+
%ignore sentencepiece::SentencePieceProcessor::DecodeIdsAsImmutableProto;
|
| 358 |
+
|
| 359 |
+
%ignore sentencepiece::SentencePieceProcessor::Normalize;
|
| 360 |
+
%ignore sentencepiece::SentencePieceProcessor::NormalizeWithOffsets;
|
| 361 |
+
|
| 362 |
+
%ignore sentencepiece::SentencePieceProcessor::model_proto;
|
| 363 |
+
%ignore sentencepiece::SentencePieceProcessor::mutable_normalizer_spec;
|
| 364 |
+
%ignore sentencepiece::SentencePieceProcessor::Load;
|
| 365 |
+
%ignore sentencepiece::SentencePieceProcessor::LoadOrDie;
|
| 366 |
+
%ignore sentencepiece::SentencePieceProcessor::SetModel;
|
| 367 |
+
%ignore sentencepiece::SentencePieceProcessor::SetNormalizer;
|
| 368 |
+
%ignore sentencepiece::pretokenizer::PretokenizerForTrainingInterface;
|
| 369 |
+
%ignore sentencepiece::SentenceIterator;
|
| 370 |
+
%ignore sentencepiece::ConvertToUnicodeSpans;
|
| 371 |
+
%ignore sentencepiece::SentencePieceTrainer::Train;
|
| 372 |
+
%ignore sentencepiece::SentencePieceTrainer::GetNormalizerSpec;
|
| 373 |
+
%ignore sentencepiece::SentencePieceTrainer::PopulateNormalizerSpec;
|
| 374 |
+
%ignore sentencepiece::SentencePieceTrainer::MergeSpecsFromArgs;
|
| 375 |
+
%ignore sentencepiece::SentencePieceTrainer::SetProtoField;
|
| 376 |
+
%ignore sentencepiece::SentencePieceTrainer::PopulateModelTypeFromString;
|
| 377 |
+
%ignore sentencepiece::SentencePieceTrainer::PieceProcecssor;
|
| 378 |
+
%ignore sentencepiece::SentencePieceTrainer::SetPretokenizerForTraining;
|
| 379 |
+
%ignore sentencepiece::SentencePieceTrainer::GetPretokenizerForTraining;
|
| 380 |
+
%ignore sentencepiece::SentencePieceTrainer::SetDataDir;
|
| 381 |
+
%ignore sentencepiece::ConvertToUnicodeAlignment;
|
| 382 |
+
|
| 383 |
+
%ignore sentencepiece::SentencePieceNormalizer::Load;
|
| 384 |
+
%ignore sentencepiece::SentencePieceNormalizer::Normalize;
|
| 385 |
+
%ignore sentencepiece::SentencePieceNormalizer::mutable_normalizer_spec;
|
| 386 |
+
|
| 387 |
+
%ignore sentencepiece::io::LoadModelProto;
|
| 388 |
+
%ignore sentencepiece::io::SaveModelProto;
|
| 389 |
+
|
| 390 |
+
%extend sentencepiece::SentencePieceProcessor {
|
| 391 |
+
sentencepiece::util::Status LoadFromFile(absl::string_view arg) {
|
| 392 |
+
return $self->Load(arg);
|
| 393 |
+
}
|
| 394 |
+
|
| 395 |
+
/////////////////////////////////////////////////////////////////////////////
|
| 396 |
+
// EncodeAs* (Single request)
|
| 397 |
+
std::vector<int> _EncodeAsIds(absl::string_view text,
|
| 398 |
+
bool enable_sampling,
|
| 399 |
+
int nbest_size, float alpha,
|
| 400 |
+
bool add_bos, bool add_eos, bool reverse,
|
| 401 |
+
bool emit_unk_piece) const {
|
| 402 |
+
auto ids = enable_sampling ?
|
| 403 |
+
$self->SampleEncodeAsIds(text, nbest_size, alpha) :
|
| 404 |
+
$self->EncodeAsIds(text);
|
| 405 |
+
RewriteIds(*$self, &ids, add_bos, add_eos, reverse, emit_unk_piece);
|
| 406 |
+
return ids;
|
| 407 |
+
}
|
| 408 |
+
|
| 409 |
+
std::vector<std::string> _EncodeAsPieces(absl::string_view text,
|
| 410 |
+
bool enable_sampling,
|
| 411 |
+
int nbest_size, float alpha,
|
| 412 |
+
bool add_bos, bool add_eos, bool reverse,
|
| 413 |
+
bool emit_unk_piece) const {
|
| 414 |
+
auto pieces = enable_sampling ?
|
| 415 |
+
$self->SampleEncodeAsPieces(text, nbest_size, alpha) :
|
| 416 |
+
$self->EncodeAsPieces(text);
|
| 417 |
+
RewriteIds(*$self, &pieces, add_bos, add_eos, reverse, emit_unk_piece);
|
| 418 |
+
return pieces;
|
| 419 |
+
}
|
| 420 |
+
|
| 421 |
+
sentencepiece::util::bytes _EncodeAsSerializedProto(absl::string_view text,
|
| 422 |
+
bool enable_sampling,
|
| 423 |
+
int nbest_size, float alpha,
|
| 424 |
+
bool add_bos, bool add_eos, bool reverse,
|
| 425 |
+
bool emit_unk_piece) const {
|
| 426 |
+
auto proto = enable_sampling ?
|
| 427 |
+
$self->SampleEncodeAsSerializedProto(text, nbest_size, alpha) :
|
| 428 |
+
$self->EncodeAsSerializedProto(text);
|
| 429 |
+
RewriteIds(*$self, &proto, add_bos, add_eos, reverse, emit_unk_piece);
|
| 430 |
+
return proto;
|
| 431 |
+
}
|
| 432 |
+
|
| 433 |
+
sentencepiece::ImmutableSentencePieceText
|
| 434 |
+
_EncodeAsImmutableProto(absl::string_view text,
|
| 435 |
+
bool enable_sampling,
|
| 436 |
+
int nbest_size, float alpha,
|
| 437 |
+
bool add_bos, bool add_eos, bool reverse,
|
| 438 |
+
bool emit_unk_piece) const {
|
| 439 |
+
auto proto = enable_sampling ?
|
| 440 |
+
$self->SampleEncodeAsImmutableProto(text, nbest_size, alpha) :
|
| 441 |
+
$self->EncodeAsImmutableProto(text);
|
| 442 |
+
proto.ConvertToUnicodeSpans();
|
| 443 |
+
RewriteIds(*$self, &proto, add_bos, add_eos, reverse, emit_unk_piece);
|
| 444 |
+
return proto;
|
| 445 |
+
}
|
| 446 |
+
|
| 447 |
+
/////////////////////////////////////////////////////////////////////////////
|
| 448 |
+
// EncodeAs* (Batch request)
|
| 449 |
+
std::vector<std::vector<int>> _EncodeAsIdsBatch(
|
| 450 |
+
const std::vector<absl::string_view> &ins, int num_threads,
|
| 451 |
+
bool enable_sampling, int nbest_size, float alpha,
|
| 452 |
+
bool add_bos, bool add_eos, bool reverse,
|
| 453 |
+
bool emit_unk_piece) const {
|
| 454 |
+
DEFINE_ENCODE_BATCH_FUNC_IMPL(EncodeAsIds,
|
| 455 |
+
absl::string_view, std::vector<int>);
|
| 456 |
+
}
|
| 457 |
+
|
| 458 |
+
std::vector<std::vector<std::string>> _EncodeAsPiecesBatch(
|
| 459 |
+
const std::vector<absl::string_view> &ins, int num_threads,
|
| 460 |
+
bool enable_sampling, int nbest_size, float alpha,
|
| 461 |
+
bool add_bos, bool add_eos, bool reverse,
|
| 462 |
+
bool emit_unk_piece) const {
|
| 463 |
+
DEFINE_ENCODE_BATCH_FUNC_IMPL(EncodeAsPieces,
|
| 464 |
+
absl::string_view, std::vector<std::string>);
|
| 465 |
+
}
|
| 466 |
+
|
| 467 |
+
BytesArray _EncodeAsSerializedProtoBatch(
|
| 468 |
+
const std::vector<absl::string_view> &ins, int num_threads,
|
| 469 |
+
bool enable_sampling, int nbest_size, float alpha,
|
| 470 |
+
bool add_bos, bool add_eos, bool reverse,
|
| 471 |
+
bool emit_unk_piece) const {
|
| 472 |
+
DEFINE_ENCODE_BATCH_FUNC_IMPL(EncodeAsSerializedProto,
|
| 473 |
+
absl::string_view,
|
| 474 |
+
sentencepiece::util::bytes);
|
| 475 |
+
}
|
| 476 |
+
|
| 477 |
+
std::vector<sentencepiece::ImmutableSentencePieceText>
|
| 478 |
+
_EncodeAsImmutableProtoBatch(
|
| 479 |
+
const std::vector<absl::string_view> &ins, int num_threads,
|
| 480 |
+
bool enable_sampling, int nbest_size, float alpha,
|
| 481 |
+
bool add_bos, bool add_eos, bool reverse,
|
| 482 |
+
bool emit_unk_piece) const {
|
| 483 |
+
DEFINE_ENCODE_BATCH_FUNC_IMPL(EncodeAsImmutableProto,
|
| 484 |
+
absl::string_view,
|
| 485 |
+
sentencepiece::ImmutableSentencePieceText);
|
| 486 |
+
}
|
| 487 |
+
|
| 488 |
+
/////////////////////////////////////////////////////////////////////////////
|
| 489 |
+
// DecodeAs* (Single request)
|
| 490 |
+
std::string _DecodeIds(const std::vector<int> &ids) const {
|
| 491 |
+
CheckIds(ids, $self->GetPieceSize());
|
| 492 |
+
return $self->DecodeIds(ids);
|
| 493 |
+
}
|
| 494 |
+
|
| 495 |
+
sentencepiece::util::bytes _DecodeIdsAsBytes(const std::vector<int> &ids) const {
|
| 496 |
+
CheckIds(ids, $self->GetPieceSize());
|
| 497 |
+
return $self->DecodeIds(ids);
|
| 498 |
+
}
|
| 499 |
+
|
| 500 |
+
std::string _DecodePieces(const std::vector<absl::string_view> &pieces) const {
|
| 501 |
+
return $self->DecodePieces(pieces);
|
| 502 |
+
}
|
| 503 |
+
|
| 504 |
+
sentencepiece::util::bytes _DecodeIdsAsSerializedProto(
|
| 505 |
+
const std::vector<int> &ids) const {
|
| 506 |
+
CheckIds(ids, $self->GetPieceSize());
|
| 507 |
+
return $self->DecodeIdsAsSerializedProto(ids);
|
| 508 |
+
}
|
| 509 |
+
|
| 510 |
+
sentencepiece::util::bytes _DecodePiecesAsSerializedProto(
|
| 511 |
+
const std::vector<absl::string_view> &pieces) const {
|
| 512 |
+
CheckIds(pieces, $self->GetPieceSize());
|
| 513 |
+
return $self->DecodePiecesAsSerializedProto(pieces);
|
| 514 |
+
}
|
| 515 |
+
|
| 516 |
+
sentencepiece::ImmutableSentencePieceText _DecodeIdsAsImmutableProto(
|
| 517 |
+
const std::vector<int> &ids) const {
|
| 518 |
+
CheckIds(ids, $self->GetPieceSize());
|
| 519 |
+
auto proto = $self->DecodeIdsAsImmutableProto(ids);
|
| 520 |
+
proto.ConvertToUnicodeSpans();
|
| 521 |
+
return proto;
|
| 522 |
+
}
|
| 523 |
+
|
| 524 |
+
sentencepiece::ImmutableSentencePieceText _DecodePiecesAsImmutableProto(
|
| 525 |
+
const std::vector<absl::string_view> &pieces) const {
|
| 526 |
+
CheckIds(pieces, $self->GetPieceSize());
|
| 527 |
+
auto proto= $self->DecodePiecesAsImmutableProto(pieces);
|
| 528 |
+
proto.ConvertToUnicodeSpans();
|
| 529 |
+
return proto;
|
| 530 |
+
}
|
| 531 |
+
|
| 532 |
+
/////////////////////////////////////////////////////////////////////////////
|
| 533 |
+
// DecodeAs* (Batch request)
|
| 534 |
+
std::vector<std::string> _DecodeIdsBatch(
|
| 535 |
+
const std::vector<std::vector<int>> &ins, int num_threads) const {
|
| 536 |
+
CheckIdsBatch(ins, $self->GetPieceSize());
|
| 537 |
+
DEFINE_DECODE_BATCH_FUNC_IMPL(DecodeIds, int, std::string);
|
| 538 |
+
}
|
| 539 |
+
|
| 540 |
+
BytesArray _DecodeIdsAsBytesBatch(
|
| 541 |
+
const std::vector<std::vector<int>> &ins, int num_threads) const {
|
| 542 |
+
CheckIdsBatch(ins, $self->GetPieceSize());
|
| 543 |
+
DEFINE_DECODE_BATCH_FUNC_IMPL(DecodeIds, int, std::string);
|
| 544 |
+
}
|
| 545 |
+
|
| 546 |
+
BytesArray _DecodeIdsAsSerializedProtoBatch(
|
| 547 |
+
const std::vector<std::vector<int>> &ins, int num_threads) const {
|
| 548 |
+
CheckIdsBatch(ins, $self->GetPieceSize());
|
| 549 |
+
DEFINE_DECODE_BATCH_FUNC_IMPL(DecodeIdsAsSerializedProto, int,
|
| 550 |
+
sentencepiece::util::bytes);
|
| 551 |
+
}
|
| 552 |
+
|
| 553 |
+
std::vector<sentencepiece::ImmutableSentencePieceText>
|
| 554 |
+
_DecodeIdsAsImmutableProtoBatch(
|
| 555 |
+
const std::vector<std::vector<int>> &ins, int num_threads) const {
|
| 556 |
+
CheckIdsBatch(ins, $self->GetPieceSize());
|
| 557 |
+
DEFINE_DECODE_BATCH_FUNC_IMPL(DecodeIdsAsImmutableProto, int,
|
| 558 |
+
sentencepiece::ImmutableSentencePieceText);
|
| 559 |
+
}
|
| 560 |
+
|
| 561 |
+
std::vector<std::string> _DecodePiecesBatch(
|
| 562 |
+
const std::vector<std::vector<absl::string_view>> &ins, int num_threads) const {
|
| 563 |
+
DEFINE_DECODE_BATCH_FUNC_IMPL(DecodePieces, std::string, std::string);
|
| 564 |
+
}
|
| 565 |
+
|
| 566 |
+
BytesArray _DecodePiecesAsSerializedProtoBatch(
|
| 567 |
+
const std::vector<std::vector<absl::string_view>> &ins, int num_threads) const {
|
| 568 |
+
DEFINE_DECODE_BATCH_FUNC_IMPL(DecodePiecesAsSerializedProto, std::string,
|
| 569 |
+
sentencepiece::util::bytes);
|
| 570 |
+
}
|
| 571 |
+
|
| 572 |
+
std::vector<sentencepiece::ImmutableSentencePieceText>
|
| 573 |
+
_DecodePiecesAsImmutableProtoBatch(
|
| 574 |
+
const std::vector<std::vector<absl::string_view>> &ins, int num_threads) const {
|
| 575 |
+
DEFINE_DECODE_BATCH_FUNC_IMPL(DecodePiecesAsImmutableProto, std::string,
|
| 576 |
+
sentencepiece::ImmutableSentencePieceText);
|
| 577 |
+
}
|
| 578 |
+
|
| 579 |
+
////////////////////////////////////////////////////////////////////////////
|
| 580 |
+
// NBestEncodeAs* (Single request)
|
| 581 |
+
std::vector<std::vector<int>>
|
| 582 |
+
_NBestEncodeAsIds(absl::string_view text,
|
| 583 |
+
int nbest_size,
|
| 584 |
+
bool add_bos, bool add_eos, bool reverse,
|
| 585 |
+
bool emit_unk_piece) const {
|
| 586 |
+
auto idss = $self->NBestEncodeAsIds(text, nbest_size);
|
| 587 |
+
for (auto &ids : idss) {
|
| 588 |
+
RewriteIds(*$self, &ids, add_bos, add_eos, reverse, emit_unk_piece);
|
| 589 |
+
}
|
| 590 |
+
return idss;
|
| 591 |
+
}
|
| 592 |
+
|
| 593 |
+
std::vector<std::vector<std::string>>
|
| 594 |
+
_NBestEncodeAsPieces(absl::string_view text,
|
| 595 |
+
int nbest_size,
|
| 596 |
+
bool add_bos, bool add_eos, bool reverse,
|
| 597 |
+
bool emit_unk_piece) const {
|
| 598 |
+
auto piecess = $self->NBestEncodeAsPieces(text, nbest_size);
|
| 599 |
+
for (auto &pieces : piecess) {
|
| 600 |
+
RewriteIds(*$self, &pieces, add_bos, add_eos, reverse, emit_unk_piece);
|
| 601 |
+
}
|
| 602 |
+
return piecess;
|
| 603 |
+
}
|
| 604 |
+
|
| 605 |
+
sentencepiece::util::bytes
|
| 606 |
+
_NBestEncodeAsSerializedProto(absl::string_view text,
|
| 607 |
+
int nbest_size,
|
| 608 |
+
bool add_bos, bool add_eos, bool reverse,
|
| 609 |
+
bool emit_unk_piece) const {
|
| 610 |
+
RewriteIds(*$self, static_cast<sentencepiece::util::bytes *>(nullptr),
|
| 611 |
+
add_bos, add_eos, reverse, emit_unk_piece);
|
| 612 |
+
return $self->NBestEncodeAsSerializedProto(text, nbest_size);
|
| 613 |
+
}
|
| 614 |
+
|
| 615 |
+
sentencepiece::ImmutableNBestSentencePieceText
|
| 616 |
+
_NBestEncodeAsImmutableProto(absl::string_view text,
|
| 617 |
+
int nbest_size,
|
| 618 |
+
bool add_bos, bool add_eos, bool reverse,
|
| 619 |
+
bool emit_unk_piece) const {
|
| 620 |
+
RewriteIds(*$self, static_cast<sentencepiece::ImmutableSentencePieceText *>(nullptr),
|
| 621 |
+
add_bos, add_eos, reverse, emit_unk_piece);
|
| 622 |
+
auto proto = $self->NBestEncodeAsImmutableProto(text, nbest_size);
|
| 623 |
+
proto.ConvertToUnicodeSpans();
|
| 624 |
+
return proto;
|
| 625 |
+
}
|
| 626 |
+
|
| 627 |
+
|
| 628 |
+
/////////////////////////////////////////////////////////////////////////////
|
| 629 |
+
// SampleEncodeAndScoreAs* (Single request)
|
| 630 |
+
std::vector<std::pair<std::vector<int>, float>>
|
| 631 |
+
_SampleEncodeAndScoreAsIds(absl::string_view text,
|
| 632 |
+
int num_samples, float alpha, bool wor,
|
| 633 |
+
bool include_best,
|
| 634 |
+
bool add_bos, bool add_eos, bool reverse,
|
| 635 |
+
bool emit_unk_piece) const {
|
| 636 |
+
auto idss = $self->SampleEncodeAndScoreAsIds(text, num_samples,
|
| 637 |
+
alpha, wor, include_best);
|
| 638 |
+
for (auto &ids : idss) {
|
| 639 |
+
RewriteIds(*$self, &ids.first, add_bos, add_eos, reverse, emit_unk_piece);
|
| 640 |
+
}
|
| 641 |
+
return idss;
|
| 642 |
+
}
|
| 643 |
+
|
| 644 |
+
std::vector<std::pair<std::vector<std::string>, float>>
|
| 645 |
+
_SampleEncodeAndScoreAsPieces(absl::string_view text,
|
| 646 |
+
int num_samples, float alpha, bool wor,
|
| 647 |
+
bool include_best,
|
| 648 |
+
bool add_bos, bool add_eos, bool reverse,
|
| 649 |
+
bool emit_unk_piece) const {
|
| 650 |
+
auto piecess = $self->SampleEncodeAndScoreAsPieces(text, num_samples,
|
| 651 |
+
alpha, wor, include_best);
|
| 652 |
+
for (auto &pieces : piecess) {
|
| 653 |
+
RewriteIds(*$self, &pieces.first, add_bos, add_eos, reverse, emit_unk_piece);
|
| 654 |
+
}
|
| 655 |
+
return piecess;
|
| 656 |
+
}
|
| 657 |
+
|
| 658 |
+
sentencepiece::util::bytes
|
| 659 |
+
_SampleEncodeAndScoreAsSerializedProto(absl::string_view text,
|
| 660 |
+
int num_samples, float alpha, bool wor,
|
| 661 |
+
bool include_best,
|
| 662 |
+
bool add_bos, bool add_eos, bool reverse,
|
| 663 |
+
bool emit_unk_piece) const {
|
| 664 |
+
RewriteIds(*$self, static_cast<sentencepiece::util::bytes *>(nullptr),
|
| 665 |
+
add_bos, add_eos, reverse, emit_unk_piece);
|
| 666 |
+
return $self->SampleEncodeAndScoreAsSerializedProto(text, num_samples,
|
| 667 |
+
alpha, wor, include_best);
|
| 668 |
+
}
|
| 669 |
+
|
| 670 |
+
sentencepiece::ImmutableNBestSentencePieceText
|
| 671 |
+
_SampleEncodeAndScoreAsImmutableProto(absl::string_view text,
|
| 672 |
+
int num_samples, float alpha, bool wor,
|
| 673 |
+
bool include_best,
|
| 674 |
+
bool add_bos, bool add_eos, bool reverse,
|
| 675 |
+
bool emit_unk_piece) const {
|
| 676 |
+
RewriteIds(*$self, static_cast<sentencepiece::util::bytes *>(nullptr),
|
| 677 |
+
add_bos, add_eos, reverse, emit_unk_piece);
|
| 678 |
+
auto proto = $self->SampleEncodeAndScoreAsImmutableProto(text, num_samples,
|
| 679 |
+
alpha, wor, include_best);
|
| 680 |
+
proto.ConvertToUnicodeSpans();
|
| 681 |
+
return proto;
|
| 682 |
+
}
|
| 683 |
+
|
| 684 |
+
// Normalize
|
| 685 |
+
std::string _Normalize(absl::string_view text) {
|
| 686 |
+
return $self->Normalize(text);
|
| 687 |
+
}
|
| 688 |
+
|
| 689 |
+
std::pair<std::string, std::vector<size_t>> _NormalizeWithOffsets(absl::string_view text) {
|
| 690 |
+
std::pair<std::string, std::vector<size_t>> result;
|
| 691 |
+
$self->Normalize(text, &result.first, &result.second).IgnoreError();
|
| 692 |
+
return result;
|
| 693 |
+
}
|
| 694 |
+
|
| 695 |
+
// Calculate Entropy
|
| 696 |
+
float _CalculateEntropy(absl::string_view text, float alpha) {
|
| 697 |
+
return $self->CalculateEntropy(text, alpha);
|
| 698 |
+
}
|
| 699 |
+
|
| 700 |
+
std::vector<float> _CalculateEntropyBatch(const std::vector<absl::string_view> &ins,
|
| 701 |
+
float alpha, int num_threads) {
|
| 702 |
+
std::vector<float> outs(ins.size());
|
| 703 |
+
InitNumThreads(ins, &num_threads);
|
| 704 |
+
{
|
| 705 |
+
ThreadPool pool(ins.size());
|
| 706 |
+
std::atomic<size_t> index = 0;
|
| 707 |
+
for (int n = 0; n < num_threads; ++n) {
|
| 708 |
+
pool.Schedule([&]() {
|
| 709 |
+
size_t i = 0;
|
| 710 |
+
while ((i = std::atomic_fetch_add(&index, 1)) < outs.size()) {
|
| 711 |
+
outs[i] = self->CalculateEntropy(ins[i], alpha);
|
| 712 |
+
}
|
| 713 |
+
});
|
| 714 |
+
}
|
| 715 |
+
}
|
| 716 |
+
return outs;
|
| 717 |
+
}
|
| 718 |
+
|
| 719 |
+
// override normalizer_spec
|
| 720 |
+
sentencepiece::util::Status _OverrideNormalizerSpec(
|
| 721 |
+
const std::unordered_map<std::string, std::string> &args) {
|
| 722 |
+
sentencepiece::util::Status status;
|
| 723 |
+
for (const auto &[key, value] : args) {
|
| 724 |
+
status = sentencepiece::SentencePieceTrainer::SetProtoField(
|
| 725 |
+
key, value,
|
| 726 |
+
$self->mutable_normalizer_spec());
|
| 727 |
+
if (!status.ok()) return status;
|
| 728 |
+
}
|
| 729 |
+
return status;
|
| 730 |
+
}
|
| 731 |
+
|
| 732 |
+
%pythoncode {
|
| 733 |
+
def Init(self,
|
| 734 |
+
model_file=None,
|
| 735 |
+
model_proto=None,
|
| 736 |
+
out_type=int,
|
| 737 |
+
add_bos=False,
|
| 738 |
+
add_eos=False,
|
| 739 |
+
reverse=False,
|
| 740 |
+
emit_unk_piece=False,
|
| 741 |
+
enable_sampling=False,
|
| 742 |
+
nbest_size=-1,
|
| 743 |
+
alpha=0.1,
|
| 744 |
+
num_threads=-1):
|
| 745 |
+
"""Initialzie sentencepieceProcessor.
|
| 746 |
+
|
| 747 |
+
Args:
|
| 748 |
+
model_file: The sentencepiece model file path.
|
| 749 |
+
model_proto: The sentencepiece model serialized proto.
|
| 750 |
+
out_type: output type. int or str.
|
| 751 |
+
add_bos: Add <s> to the result (Default = false)
|
| 752 |
+
add_eos: Add </s> to the result (Default = false) <s>/</s> is added after
|
| 753 |
+
reversing (if enabled).
|
| 754 |
+
reverse: Reverses the tokenized sequence (Default = false)
|
| 755 |
+
emit_unk_piece: Emits the unk literal string (Default = false)
|
| 756 |
+
nbest_size: sampling parameters for unigram. Invalid in BPE-Dropout.
|
| 757 |
+
nbest_size = {0,1}: No sampling is performed.
|
| 758 |
+
nbest_size > 1: samples from the nbest_size results.
|
| 759 |
+
nbest_size < 0: assuming that nbest_size is infinite and samples
|
| 760 |
+
from the all hypothesis (lattice) using
|
| 761 |
+
forward-filtering-and-backward-sampling algorithm.
|
| 762 |
+
alpha: Soothing parameter for unigram sampling, and dropout probability of
|
| 763 |
+
merge operations for BPE-dropout.
|
| 764 |
+
num_threads: number of threads in batch processing (Default = -1, auto-detected)
|
| 765 |
+
"""
|
| 766 |
+
|
| 767 |
+
_sentencepiece_processor_init_native(self)
|
| 768 |
+
self._out_type = out_type
|
| 769 |
+
self._add_bos = add_bos
|
| 770 |
+
self._add_eos = add_eos
|
| 771 |
+
self._reverse = reverse
|
| 772 |
+
self._emit_unk_piece = emit_unk_piece
|
| 773 |
+
self._enable_sampling = enable_sampling
|
| 774 |
+
self._nbest_size = nbest_size
|
| 775 |
+
self._alpha = alpha
|
| 776 |
+
self._num_threads = num_threads
|
| 777 |
+
if model_file or model_proto:
|
| 778 |
+
self.Load(model_file=model_file, model_proto=model_proto)
|
| 779 |
+
|
| 780 |
+
|
| 781 |
+
def Encode(self,
|
| 782 |
+
input,
|
| 783 |
+
out_type=None,
|
| 784 |
+
add_bos=None,
|
| 785 |
+
add_eos=None,
|
| 786 |
+
reverse=None,
|
| 787 |
+
emit_unk_piece=None,
|
| 788 |
+
enable_sampling=None,
|
| 789 |
+
nbest_size=None,
|
| 790 |
+
alpha=None,
|
| 791 |
+
num_threads=None):
|
| 792 |
+
"""Encode text input to segmented ids or tokens.
|
| 793 |
+
|
| 794 |
+
Args:
|
| 795 |
+
input: input string. accepsts list of string.
|
| 796 |
+
out_type: output type. int or str.
|
| 797 |
+
add_bos: Add <s> to the result (Default = false)
|
| 798 |
+
add_eos: Add </s> to the result (Default = false) <s>/</s> is added after
|
| 799 |
+
reversing (if enabled).
|
| 800 |
+
reverse: Reverses the tokenized sequence (Default = false)
|
| 801 |
+
emit_unk_piece: Emits the unk literal string (Default = false)
|
| 802 |
+
nbest_size: sampling parameters for unigram. Invalid in BPE-Dropout.
|
| 803 |
+
nbest_size = {0,1}: No sampling is performed.
|
| 804 |
+
nbest_size > 1: samples from the nbest_size results.
|
| 805 |
+
nbest_size < 0: assuming that nbest_size is infinite and samples
|
| 806 |
+
from the all hypothesis (lattice) using
|
| 807 |
+
forward-filtering-and-backward-sampling algorithm.
|
| 808 |
+
alpha: Soothing parameter for unigram sampling, and merge probability for
|
| 809 |
+
BPE-dropout (probablity 'p' in BPE-dropout paper).
|
| 810 |
+
num_threads: the number of threads used in the batch processing (Default = -1).
|
| 811 |
+
"""
|
| 812 |
+
|
| 813 |
+
if out_type is None:
|
| 814 |
+
out_type = self._out_type
|
| 815 |
+
if add_bos is None:
|
| 816 |
+
add_bos = self._add_bos
|
| 817 |
+
if add_eos is None:
|
| 818 |
+
add_eos = self._add_eos
|
| 819 |
+
if reverse is None:
|
| 820 |
+
reverse = self._reverse
|
| 821 |
+
if emit_unk_piece is None:
|
| 822 |
+
emit_unk_piece = self._emit_unk_piece
|
| 823 |
+
if enable_sampling is None:
|
| 824 |
+
enable_sampling = self._enable_sampling
|
| 825 |
+
if nbest_size is None:
|
| 826 |
+
nbest_size = self._nbest_size
|
| 827 |
+
if alpha is None:
|
| 828 |
+
alpha = self._alpha
|
| 829 |
+
if num_threads is None:
|
| 830 |
+
num_threads = self._num_threads
|
| 831 |
+
|
| 832 |
+
if enable_sampling == True and (nbest_size is None or nbest_size == 0 or
|
| 833 |
+
nbest_size == 1 or alpha is None):
|
| 834 |
+
raise RuntimeError(
|
| 835 |
+
'When enable_sampling is True, We must specify "nbest_size > 1" or "nbest_size = -1", '
|
| 836 |
+
'and "alpha". "nbest_size" is enabled only on unigram mode ignored in BPE-dropout. '
|
| 837 |
+
'when "nbest_size = -1" , this method samples from all candidates on the lattice '
|
| 838 |
+
'instead of nbest segmentations.'
|
| 839 |
+
)
|
| 840 |
+
|
| 841 |
+
if num_threads is None or type(num_threads) is not int:
|
| 842 |
+
raise RuntimeError('num_threads must be int')
|
| 843 |
+
|
| 844 |
+
if type(input) is list:
|
| 845 |
+
if out_type is int:
|
| 846 |
+
return self._EncodeAsIdsBatch(input, num_threads, enable_sampling, nbest_size,
|
| 847 |
+
alpha, add_bos, add_eos, reverse, emit_unk_piece)
|
| 848 |
+
if out_type is str:
|
| 849 |
+
return self._EncodeAsPiecesBatch(input, num_threads, enable_sampling, nbest_size,
|
| 850 |
+
alpha, add_bos, add_eos, reverse, emit_unk_piece)
|
| 851 |
+
if out_type == 'serialized_proto' or out_type == 'proto':
|
| 852 |
+
return self._EncodeAsSerializedProtoBatch(input, num_threads, enable_sampling, nbest_size,
|
| 853 |
+
alpha, add_bos, add_eos, reverse, emit_unk_piece)
|
| 854 |
+
if out_type == 'immutable_proto':
|
| 855 |
+
return self._EncodeAsImmutableProtoBatch(input, num_threads, enable_sampling, nbest_size,
|
| 856 |
+
alpha, add_bos, add_eos, reverse, emit_unk_piece)
|
| 857 |
+
|
| 858 |
+
if out_type is int:
|
| 859 |
+
return self._EncodeAsIds(input, enable_sampling, nbest_size,
|
| 860 |
+
alpha, add_bos, add_eos, reverse, emit_unk_piece)
|
| 861 |
+
if out_type is str:
|
| 862 |
+
return self._EncodeAsPieces(input, enable_sampling, nbest_size,
|
| 863 |
+
alpha, add_bos, add_eos, reverse, emit_unk_piece)
|
| 864 |
+
if out_type == 'serialized_proto' or out_type == 'proto':
|
| 865 |
+
return self._EncodeAsSerializedProto(input, enable_sampling, nbest_size,
|
| 866 |
+
alpha, add_bos, add_eos, reverse, emit_unk_piece)
|
| 867 |
+
if out_type == 'immutable_proto':
|
| 868 |
+
return self._EncodeAsImmutableProto(input, enable_sampling, nbest_size,
|
| 869 |
+
alpha, add_bos, add_eos, reverse, emit_unk_piece)
|
| 870 |
+
|
| 871 |
+
raise RuntimeError('unknown out_type={}'.format(out_type))
|
| 872 |
+
return None
|
| 873 |
+
|
| 874 |
+
|
| 875 |
+
def EncodeAsPieces(self, input, **kwargs):
|
| 876 |
+
return self.Encode(input=input, out_type=str, **kwargs)
|
| 877 |
+
|
| 878 |
+
|
| 879 |
+
def EncodeAsIds(self, input, **kwargs):
|
| 880 |
+
return self.Encode(input=input, out_type=int, **kwargs)
|
| 881 |
+
|
| 882 |
+
|
| 883 |
+
def EncodeAsSerializedProto(self, input, **kwargs):
|
| 884 |
+
return self.Encode(input=input, out_type='serialized_proto', **kwargs)
|
| 885 |
+
|
| 886 |
+
|
| 887 |
+
def EncodeAsImmutableProto(self, input, **kwargs):
|
| 888 |
+
return self.Encode(input=input, out_type='immutable_proto', **kwargs)
|
| 889 |
+
|
| 890 |
+
|
| 891 |
+
def SampleEncodeAsPieces(self, input, nbest_size=None, alpha=None, **kwargs):
|
| 892 |
+
return self.Encode(input=input, nbest_size=nbest_size, alpha=alpha,
|
| 893 |
+
out_type=str, enable_sampling=True, **kwargs)
|
| 894 |
+
|
| 895 |
+
|
| 896 |
+
def SampleEncodeAsIds(self, input, nbest_size=None, alpha=None,**kwargs):
|
| 897 |
+
return self.Encode(input=input, nbest_size=nbest_size, alpha=alpha,
|
| 898 |
+
out_type=int, enable_sampling=True, **kwargs)
|
| 899 |
+
|
| 900 |
+
|
| 901 |
+
def SampleEncodeAsSerializedProto(self, input, nbest_size=None, alpha=None, **kwargs):
|
| 902 |
+
return self.Encode(input=input, nbest_size=nbest_size, alpha=alpha,
|
| 903 |
+
out_type='serialized_proto', enable_sampling=True, **kwargs)
|
| 904 |
+
|
| 905 |
+
|
| 906 |
+
def SampleEncodeAsImmutableProto(self, input, nbest_size=None, alpha=None, **kwargs):
|
| 907 |
+
return self.Encode(input=input, nbest_size=nbest_size, alpha=alpha,
|
| 908 |
+
out_type='immutable_proto', enable_sampling=True, **kwargs)
|
| 909 |
+
|
| 910 |
+
|
| 911 |
+
def NBestEncode(self,
|
| 912 |
+
input,
|
| 913 |
+
out_type=None,
|
| 914 |
+
add_bos=None,
|
| 915 |
+
add_eos=None,
|
| 916 |
+
reverse=None,
|
| 917 |
+
emit_unk_piece=None,
|
| 918 |
+
nbest_size=None):
|
| 919 |
+
"""NBestEncode text input to segmented ids or tokens.
|
| 920 |
+
|
| 921 |
+
Args:
|
| 922 |
+
input: input string. accepsts list of string.
|
| 923 |
+
out_type: output type. int or str.
|
| 924 |
+
add_bos: Add <s> to the result (Default = false)
|
| 925 |
+
add_eos: Add </s> to the result (Default = false) <s>/</s> is added after reversing (if enabled).
|
| 926 |
+
reverse: Reverses the tokenized sequence (Default = false)
|
| 927 |
+
emit_unk_piece: Emits the unk literal string (Default = false)
|
| 928 |
+
nbest_size: nbest size
|
| 929 |
+
"""
|
| 930 |
+
|
| 931 |
+
if out_type is None:
|
| 932 |
+
out_type = self._out_type
|
| 933 |
+
if add_bos is None:
|
| 934 |
+
add_bos = self._add_bos
|
| 935 |
+
if add_eos is None:
|
| 936 |
+
add_eos = self._add_eos
|
| 937 |
+
if reverse is None:
|
| 938 |
+
reverse = self._reverse
|
| 939 |
+
if emit_unk_piece is None:
|
| 940 |
+
emit_unk_piece = self._emit_unk_piece
|
| 941 |
+
if nbest_size is None:
|
| 942 |
+
nbest_size = self._nbest_size
|
| 943 |
+
|
| 944 |
+
if nbest_size <= 0:
|
| 945 |
+
nbest_size=1
|
| 946 |
+
|
| 947 |
+
def _encode(text):
|
| 948 |
+
if out_type is int:
|
| 949 |
+
return self._NBestEncodeAsIds(text, nbest_size,
|
| 950 |
+
add_bos, add_eos, reverse, emit_unk_piece)
|
| 951 |
+
if out_type is str:
|
| 952 |
+
return self._NBestEncodeAsPieces(text, nbest_size,
|
| 953 |
+
add_bos, add_eos, reverse, emit_unk_piece)
|
| 954 |
+
if out_type == 'serialized_proto' or out_type == 'proto':
|
| 955 |
+
return self._NBestEncodeAsSerializedProto(text, nbest_size,
|
| 956 |
+
add_bos, add_eos, reverse, emit_unk_piece)
|
| 957 |
+
if out_type == 'immutable_proto':
|
| 958 |
+
return self._NBestEncodeAsImmutableProto(text, nbest_size,
|
| 959 |
+
add_bos, add_eos, reverse, emit_unk_piece)
|
| 960 |
+
|
| 961 |
+
raise RuntimeError('unknown out_type')
|
| 962 |
+
|
| 963 |
+
if type(input) is list:
|
| 964 |
+
return [_encode(n) for n in input]
|
| 965 |
+
|
| 966 |
+
return _encode(input)
|
| 967 |
+
|
| 968 |
+
|
| 969 |
+
def NBestEncodeAsPieces(self, input, nbest_size=None, **kwargs):
|
| 970 |
+
return self.NBestEncode(input=input, nbest_size=nbest_size,
|
| 971 |
+
out_type=str, **kwargs)
|
| 972 |
+
|
| 973 |
+
|
| 974 |
+
def NBestEncodeAsIds(self, input, nbest_size=None, **kwargs):
|
| 975 |
+
return self.NBestEncode(input=input, nbest_size=nbest_size,
|
| 976 |
+
out_type=int, **kwargs)
|
| 977 |
+
|
| 978 |
+
|
| 979 |
+
def NBestEncodeAsSerializedProto(self, input, nbest_size=None, **kwargs):
|
| 980 |
+
return self.NBestEncode(input=input, nbest_size=nbest_size,
|
| 981 |
+
out_type='serialized_proto', **kwargs)
|
| 982 |
+
|
| 983 |
+
|
| 984 |
+
def NBestEncodeAsImmutableProto(self, input, nbest_size=None, **kwargs):
|
| 985 |
+
return self.NBestEncode(input=input, nbest_size=nbest_size,
|
| 986 |
+
out_type='immutable_proto', **kwargs)
|
| 987 |
+
|
| 988 |
+
|
| 989 |
+
def SampleEncodeAndScore(self,
|
| 990 |
+
input,
|
| 991 |
+
out_type=None,
|
| 992 |
+
add_bos=None,
|
| 993 |
+
add_eos=None,
|
| 994 |
+
reverse=None,
|
| 995 |
+
emit_unk_piece=None,
|
| 996 |
+
num_samples=None,
|
| 997 |
+
alpha=None,
|
| 998 |
+
wor=None,
|
| 999 |
+
include_best=None):
|
| 1000 |
+
"""SampleEncodeAndScore text input to segmented ids or tokens.
|
| 1001 |
+
|
| 1002 |
+
Args:
|
| 1003 |
+
input: input string. accepsts list of string.
|
| 1004 |
+
out_type: output type. int or str or 'serialized_proto' or 'immutable_proto'
|
| 1005 |
+
add_bos: Add <s> to the result (Default = false)
|
| 1006 |
+
add_eos: Add </s> to the result (Default = false) <s>/</s> is added after reversing (if enabled).
|
| 1007 |
+
reverse: Reverses the tokenized sequence (Default = false)
|
| 1008 |
+
emit_unk_piece: Emits the unk literal string (Default = false)
|
| 1009 |
+
num_samples: How many samples to return (Default = 1)
|
| 1010 |
+
alpha: inverse temperature for sampling
|
| 1011 |
+
wor: whether to sample without replacement (Default = false)
|
| 1012 |
+
include_best: whether to include the best tokenization, requires wor=True (Default = false)
|
| 1013 |
+
"""
|
| 1014 |
+
|
| 1015 |
+
if out_type is None:
|
| 1016 |
+
out_type = self._out_type
|
| 1017 |
+
if add_bos is None:
|
| 1018 |
+
add_bos = self._add_bos
|
| 1019 |
+
if add_eos is None:
|
| 1020 |
+
add_eos = self._add_eos
|
| 1021 |
+
if reverse is None:
|
| 1022 |
+
reverse = self._reverse
|
| 1023 |
+
if emit_unk_piece is None:
|
| 1024 |
+
emit_unk_piece = self._emit_unk_piece
|
| 1025 |
+
if num_samples is None:
|
| 1026 |
+
num_samples = 1
|
| 1027 |
+
if alpha is None:
|
| 1028 |
+
alpha = 1.
|
| 1029 |
+
if wor is None:
|
| 1030 |
+
wor = False
|
| 1031 |
+
if include_best is None:
|
| 1032 |
+
include_best = False
|
| 1033 |
+
|
| 1034 |
+
if num_samples <= 0:
|
| 1035 |
+
raise RuntimeError('num_examples must be positive')
|
| 1036 |
+
|
| 1037 |
+
if include_best and not wor:
|
| 1038 |
+
raise RuntimeError('When include_best is True, We must specify "wor = True".')
|
| 1039 |
+
|
| 1040 |
+
|
| 1041 |
+
def _encode(text):
|
| 1042 |
+
if out_type is int:
|
| 1043 |
+
return self._SampleEncodeAndScoreAsIds(text, num_samples, alpha, wor, include_best,
|
| 1044 |
+
add_bos, add_eos, reverse, emit_unk_piece)
|
| 1045 |
+
if out_type is str:
|
| 1046 |
+
return self._SampleEncodeAndScoreAsPieces(text, num_samples, alpha, wor, include_best,
|
| 1047 |
+
add_bos, add_eos, reverse, emit_unk_piece)
|
| 1048 |
+
|
| 1049 |
+
if out_type == 'serialized_proto' or out_type == 'proto':
|
| 1050 |
+
return self._SampleEncodeAndScoreAsSerializedProto(text, num_samples, alpha, wor, include_best,
|
| 1051 |
+
add_bos, add_eos, reverse, emit_unk_piece)
|
| 1052 |
+
|
| 1053 |
+
if out_type == 'immutable_proto':
|
| 1054 |
+
return self._SampleEncodeAndScoreAsImmutableProto(text, num_samples, alpha, wor, include_best,
|
| 1055 |
+
add_bos, add_eos, reverse, emit_unk_piece)
|
| 1056 |
+
|
| 1057 |
+
raise RuntimeError('unknown output type')
|
| 1058 |
+
|
| 1059 |
+
|
| 1060 |
+
if type(input) is list:
|
| 1061 |
+
return [_encode(n) for n in input]
|
| 1062 |
+
|
| 1063 |
+
return _encode(input)
|
| 1064 |
+
|
| 1065 |
+
|
| 1066 |
+
def SampleEncodeAndScoreAsPieces(self, input, num_samples=None, alpha=None, **kwargs):
|
| 1067 |
+
return self.SampleEncodeAndScore(input=input, num_samples=num_samples, alpha=alpha,
|
| 1068 |
+
out_type=str, **kwargs)
|
| 1069 |
+
|
| 1070 |
+
|
| 1071 |
+
def SampleEncodeAndScoreAsIds(self, input, num_samples=None, alpha=None, **kwargs):
|
| 1072 |
+
return self.SampleEncodeAndScore(input=input, num_samples=num_samples, alpha=alpha,
|
| 1073 |
+
out_type=int, **kwargs)
|
| 1074 |
+
|
| 1075 |
+
|
| 1076 |
+
def SampleEncodeAndScoreAsSerializedProto(self, input, num_samples=None, alpha=None, **kwargs):
|
| 1077 |
+
return self.SampleEncodeAndScore(input=input, num_samples=num_samples, alpha=alpha,
|
| 1078 |
+
out_type='serialized_proto', **kwargs)
|
| 1079 |
+
|
| 1080 |
+
|
| 1081 |
+
def SampleEncodeAndScoreAsImmutableProto(self, input, num_samples=None, alpha=None, **kwargs):
|
| 1082 |
+
return self.SampleEncodeAndScore(input=input, num_samples=num_samples, alpha=alpha,
|
| 1083 |
+
out_type='immutable_proto', **kwargs)
|
| 1084 |
+
|
| 1085 |
+
|
| 1086 |
+
def Decode(self, input, out_type=str, num_threads=None):
|
| 1087 |
+
"""Decode processed id or token sequences.
|
| 1088 |
+
|
| 1089 |
+
Args:
|
| 1090 |
+
out_type: output type. str, bytes or 'serialized_proto' or 'immutable_proto' (Default = str)
|
| 1091 |
+
num_threads: the number of threads used in the batch processing (Default = -1).
|
| 1092 |
+
"""
|
| 1093 |
+
|
| 1094 |
+
if num_threads is None:
|
| 1095 |
+
num_threads = self._num_threads
|
| 1096 |
+
|
| 1097 |
+
if num_threads is None or type(num_threads) is not int:
|
| 1098 |
+
raise RuntimeError('num_threads must be int')
|
| 1099 |
+
|
| 1100 |
+
if not input:
|
| 1101 |
+
return ''
|
| 1102 |
+
|
| 1103 |
+
if out_type is str:
|
| 1104 |
+
if type(input) is int:
|
| 1105 |
+
return self._DecodeIds([input])
|
| 1106 |
+
if type(input) is str:
|
| 1107 |
+
return self._DecodePieces([input])
|
| 1108 |
+
|
| 1109 |
+
if type(input) is list:
|
| 1110 |
+
if len(input) == 0 or type(input[0]) is int:
|
| 1111 |
+
return self._DecodeIds(input)
|
| 1112 |
+
if type(input[0]) is str:
|
| 1113 |
+
return self._DecodePieces(input)
|
| 1114 |
+
|
| 1115 |
+
if type(input[0]) is list:
|
| 1116 |
+
if len(input[0]) == 0 or type(input[0][0]) is int:
|
| 1117 |
+
return self._DecodeIdsBatch(input, num_threads)
|
| 1118 |
+
if type(input[0][0]) is str:
|
| 1119 |
+
return self._DecodePiecesBatch(input, num_threads)
|
| 1120 |
+
|
| 1121 |
+
if out_type is bytes:
|
| 1122 |
+
if type(input) is int:
|
| 1123 |
+
return self._DecodeIdsAsBytes([input])
|
| 1124 |
+
if type(input) is str:
|
| 1125 |
+
return self._DecodePieces([input])
|
| 1126 |
+
|
| 1127 |
+
if type(input) is list:
|
| 1128 |
+
if len(input) == 0 or type(input[0]) is int:
|
| 1129 |
+
return self._DecodeIdsAsBytes(input)
|
| 1130 |
+
if type(input[0]) is str:
|
| 1131 |
+
return self._DecodePieces(input)
|
| 1132 |
+
|
| 1133 |
+
if type(input[0]) is list:
|
| 1134 |
+
if len(input[0]) == 0 or type(input[0][0]) is int:
|
| 1135 |
+
return self._DecodeIdsAsBytesBatch(input, num_threads)
|
| 1136 |
+
if type(input[0][0]) is str:
|
| 1137 |
+
return self._DecodePiecesBatch(input, num_threads)
|
| 1138 |
+
|
| 1139 |
+
if out_type == 'serialized_proto':
|
| 1140 |
+
if type(input) is int:
|
| 1141 |
+
return self._DecodeIdsAsSerializedProto([input])
|
| 1142 |
+
if type(input) is str:
|
| 1143 |
+
return self._DecodePiecesAsSerializedProto([input])
|
| 1144 |
+
|
| 1145 |
+
if type(input) is list:
|
| 1146 |
+
if len(input) == 0 or type(input[0]) is int:
|
| 1147 |
+
return self._DecodeIdsAsSerializedProto(input)
|
| 1148 |
+
if type(input[0]) is str:
|
| 1149 |
+
return self._DecodePiecesAsSerializedProto(input)
|
| 1150 |
+
|
| 1151 |
+
if type(input[0]) is list:
|
| 1152 |
+
if len(input[0]) == 0 or type(input[0][0]) is int:
|
| 1153 |
+
return self._DecodeIdsAsSerializedProtoBatch(input, num_threads)
|
| 1154 |
+
if type(input[0][0]) is str:
|
| 1155 |
+
return self._DecodePiecesAsSerializedProtoBatch(input, num_threads)
|
| 1156 |
+
|
| 1157 |
+
|
| 1158 |
+
if out_type == 'immutable_proto':
|
| 1159 |
+
if type(input) is int:
|
| 1160 |
+
return self._DecodeIdsAsImmutableProto([input])
|
| 1161 |
+
if type(input) is str:
|
| 1162 |
+
return self._DecodePiecesAsImmutableProto([input])
|
| 1163 |
+
|
| 1164 |
+
if type(input) is list:
|
| 1165 |
+
if len(input) == 0 or type(input[0]) is int:
|
| 1166 |
+
return self._DecodeIdsAsImmutableProto(input)
|
| 1167 |
+
if type(input[0]) is str:
|
| 1168 |
+
return self._DecodePiecesAsImmutableProto(input)
|
| 1169 |
+
|
| 1170 |
+
if type(input[0]) is list:
|
| 1171 |
+
if len(input[0]) == 0 or type(input[0][0]) is int:
|
| 1172 |
+
return self._DecodeIdsAsImmutableProtoBatch(input, num_threads)
|
| 1173 |
+
if type(input[0][0]) is str:
|
| 1174 |
+
return self._DecodePiecesAsImmutableProtoBatch(input, num_threads)
|
| 1175 |
+
|
| 1176 |
+
|
| 1177 |
+
raise RuntimeError('unknown output or input type')
|
| 1178 |
+
return None
|
| 1179 |
+
|
| 1180 |
+
|
| 1181 |
+
def DecodePieces(self, input, out_type=str, **kwargs):
|
| 1182 |
+
return self.Decode(input=input, out_type=out_type, **kwargs)
|
| 1183 |
+
|
| 1184 |
+
|
| 1185 |
+
def DecodeIds(self, input, out_type=str, **kwargs):
|
| 1186 |
+
return self.Decode(input=input, out_type=out_type, **kwargs)
|
| 1187 |
+
|
| 1188 |
+
|
| 1189 |
+
def DecodePiecesAsSerializedProto(self, input, out_type='serialized_proto', **kwargs):
|
| 1190 |
+
return self.Decode(input=input, out_type=out_type, **kwargs)
|
| 1191 |
+
|
| 1192 |
+
|
| 1193 |
+
def DecodeIdsAsSerializedProto(self, input, out_type='serialized_proto', **kwargs):
|
| 1194 |
+
return self.Decode(input=input, out_type=out_type, **kwargs)
|
| 1195 |
+
|
| 1196 |
+
|
| 1197 |
+
def DecodePiecesAsImmutableProto(self, input, out_type='immutable_proto', **kwargs):
|
| 1198 |
+
return self.Decode(input=input, out_type=out_type, **kwargs)
|
| 1199 |
+
|
| 1200 |
+
|
| 1201 |
+
def DecodeIdsAsImmutableProto(self, input, out_type='immutable_proto', **kwargs):
|
| 1202 |
+
return self.Decode(input=input, out_type=out_type, **kwargs)
|
| 1203 |
+
|
| 1204 |
+
|
| 1205 |
+
def CalculateEntropy(self, input, alpha, num_threads=None):
|
| 1206 |
+
"""Calculate sentence entropy"""
|
| 1207 |
+
if type(input) is list:
|
| 1208 |
+
if num_threads is None:
|
| 1209 |
+
num_threads = self._num_threads
|
| 1210 |
+
if num_threads is None or type(num_threads) is not int:
|
| 1211 |
+
raise RuntimeError('num_threads must be int')
|
| 1212 |
+
return self._CalculateEntropyBatch(input, alpha, num_threads)
|
| 1213 |
+
|
| 1214 |
+
return self._CalculateEntropy(input, alpha)
|
| 1215 |
+
|
| 1216 |
+
|
| 1217 |
+
def Normalize(self, input, with_offsets=None):
|
| 1218 |
+
def _normalize(text):
|
| 1219 |
+
if with_offsets:
|
| 1220 |
+
return self._NormalizeWithOffsets(text)
|
| 1221 |
+
return self._Normalize(text)
|
| 1222 |
+
|
| 1223 |
+
if type(input) is list:
|
| 1224 |
+
return [_normalize(x) for x in input]
|
| 1225 |
+
return _normalize(input)
|
| 1226 |
+
|
| 1227 |
+
def OverrideNormalizerSpec(self, **kwargs):
|
| 1228 |
+
new_kwargs = {}
|
| 1229 |
+
for key, value in kwargs.items():
|
| 1230 |
+
new_kwargs[key] = str(value)
|
| 1231 |
+
return self._OverrideNormalizerSpec(new_kwargs)
|
| 1232 |
+
|
| 1233 |
+
|
| 1234 |
+
def piece_size(self):
|
| 1235 |
+
return self.GetPieceSize()
|
| 1236 |
+
|
| 1237 |
+
|
| 1238 |
+
def vocab_size(self):
|
| 1239 |
+
return self.GetPieceSize()
|
| 1240 |
+
|
| 1241 |
+
|
| 1242 |
+
def __getstate__(self):
|
| 1243 |
+
return self.serialized_model_proto()
|
| 1244 |
+
|
| 1245 |
+
|
| 1246 |
+
def __setstate__(self, serialized_model_proto):
|
| 1247 |
+
self.__init__()
|
| 1248 |
+
self.LoadFromSerializedProto(serialized_model_proto)
|
| 1249 |
+
|
| 1250 |
+
|
| 1251 |
+
def __len__(self):
|
| 1252 |
+
return self.GetPieceSize()
|
| 1253 |
+
|
| 1254 |
+
|
| 1255 |
+
def __getitem__(self, piece):
|
| 1256 |
+
return self.PieceToId(piece)
|
| 1257 |
+
|
| 1258 |
+
|
| 1259 |
+
def Load(self, model_file=None, model_proto=None):
|
| 1260 |
+
"""Overwride SentencePieceProcessor.Load to support both model_file and model_proto.
|
| 1261 |
+
|
| 1262 |
+
Args:
|
| 1263 |
+
model_file: The sentencepiece model file path.
|
| 1264 |
+
model_proto: The sentencepiece model serialized proto. Either `model_file`
|
| 1265 |
+
or `model_proto` must be set.
|
| 1266 |
+
"""
|
| 1267 |
+
if model_file and model_proto:
|
| 1268 |
+
raise RuntimeError('model_file and model_proto must be exclusive.')
|
| 1269 |
+
if model_proto:
|
| 1270 |
+
return self.LoadFromSerializedProto(model_proto)
|
| 1271 |
+
return self.LoadFromFile(model_file)
|
| 1272 |
+
}
|
| 1273 |
+
}
|
| 1274 |
+
|
| 1275 |
+
%extend sentencepiece::SentencePieceTrainer {
|
| 1276 |
+
static void _TrainFromString(absl::string_view arg) {
|
| 1277 |
+
const auto _status = sentencepiece::SentencePieceTrainer::Train(arg);
|
| 1278 |
+
if (!_status.ok()) throw _status;
|
| 1279 |
+
return;
|
| 1280 |
+
}
|
| 1281 |
+
|
| 1282 |
+
static void _TrainFromMap(const std::unordered_map<std::string, std::string> &args) {
|
| 1283 |
+
const auto _status = sentencepiece::SentencePieceTrainer::Train(args);
|
| 1284 |
+
if (!_status.ok()) throw _status;
|
| 1285 |
+
return;
|
| 1286 |
+
}
|
| 1287 |
+
|
| 1288 |
+
static void _TrainFromMap2(const std::unordered_map<std::string, std::string> &args,
|
| 1289 |
+
SentenceIterator *iter) {
|
| 1290 |
+
const auto _status = sentencepiece::SentencePieceTrainer::Train(args, iter);
|
| 1291 |
+
if (!_status.ok()) throw _status;
|
| 1292 |
+
return;
|
| 1293 |
+
}
|
| 1294 |
+
|
| 1295 |
+
static sentencepiece::util::bytes _TrainFromMap3(const std::unordered_map<std::string, std::string> &args) {
|
| 1296 |
+
sentencepiece::util::bytes model_proto;
|
| 1297 |
+
const auto _status = sentencepiece::SentencePieceTrainer::Train(args, nullptr, &model_proto);
|
| 1298 |
+
if (!_status.ok()) throw _status;
|
| 1299 |
+
return model_proto;
|
| 1300 |
+
}
|
| 1301 |
+
|
| 1302 |
+
static sentencepiece::util::bytes _TrainFromMap4(const std::unordered_map<std::string, std::string> &args,
|
| 1303 |
+
SentenceIterator *iter) {
|
| 1304 |
+
sentencepiece::util::bytes model_proto;
|
| 1305 |
+
const auto _status = sentencepiece::SentencePieceTrainer::Train(args, iter, &model_proto);
|
| 1306 |
+
if (!_status.ok()) throw _status;
|
| 1307 |
+
return model_proto;
|
| 1308 |
+
}
|
| 1309 |
+
|
| 1310 |
+
%pythoncode {
|
| 1311 |
+
@staticmethod
|
| 1312 |
+
def _Train(arg=None, **kwargs):
|
| 1313 |
+
"""Train Sentencepiece model. Accept both kwargs and legacy string arg."""
|
| 1314 |
+
if arg is not None and type(arg) is str:
|
| 1315 |
+
return SentencePieceTrainer._TrainFromString(arg)
|
| 1316 |
+
|
| 1317 |
+
def _encode(value):
|
| 1318 |
+
"""Encode value to CSV.."""
|
| 1319 |
+
if type(value) is list:
|
| 1320 |
+
if sys.version_info[0] == 3:
|
| 1321 |
+
f = StringIO()
|
| 1322 |
+
else:
|
| 1323 |
+
f = BytesIO()
|
| 1324 |
+
writer = csv.writer(f, lineterminator='')
|
| 1325 |
+
writer.writerow([str(v) for v in value])
|
| 1326 |
+
return f.getvalue()
|
| 1327 |
+
else:
|
| 1328 |
+
return str(value)
|
| 1329 |
+
|
| 1330 |
+
sentence_iterator = None
|
| 1331 |
+
model_writer = None
|
| 1332 |
+
new_kwargs = {}
|
| 1333 |
+
for key, value in kwargs.items():
|
| 1334 |
+
if key in ['sentence_iterator', 'sentence_reader']:
|
| 1335 |
+
sentence_iterator = value
|
| 1336 |
+
elif key in ['model_writer']:
|
| 1337 |
+
model_writer = value
|
| 1338 |
+
else:
|
| 1339 |
+
new_kwargs[key] = _encode(value)
|
| 1340 |
+
|
| 1341 |
+
if model_writer:
|
| 1342 |
+
if sentence_iterator:
|
| 1343 |
+
model_proto = SentencePieceTrainer._TrainFromMap4(new_kwargs,
|
| 1344 |
+
sentence_iterator)
|
| 1345 |
+
else:
|
| 1346 |
+
model_proto = SentencePieceTrainer._TrainFromMap3(new_kwargs)
|
| 1347 |
+
model_writer.write(model_proto)
|
| 1348 |
+
else:
|
| 1349 |
+
if sentence_iterator:
|
| 1350 |
+
return SentencePieceTrainer._TrainFromMap2(new_kwargs, sentence_iterator)
|
| 1351 |
+
else:
|
| 1352 |
+
return SentencePieceTrainer._TrainFromMap(new_kwargs)
|
| 1353 |
+
|
| 1354 |
+
return None
|
| 1355 |
+
|
| 1356 |
+
@staticmethod
|
| 1357 |
+
def Train(arg=None, logstream=None, **kwargs):
|
| 1358 |
+
with _LogStream(ostream=logstream):
|
| 1359 |
+
SentencePieceTrainer._Train(arg=arg, **kwargs)
|
| 1360 |
+
}
|
| 1361 |
+
}
|
| 1362 |
+
|
| 1363 |
+
%extend sentencepiece::SentencePieceNormalizer {
|
| 1364 |
+
sentencepiece::util::Status LoadFromFile(absl::string_view arg) {
|
| 1365 |
+
return $self->Load(arg);
|
| 1366 |
+
}
|
| 1367 |
+
|
| 1368 |
+
std::string _Normalize(absl::string_view text) {
|
| 1369 |
+
std::string result;
|
| 1370 |
+
const auto _status = $self->Normalize(text, &result);
|
| 1371 |
+
if (!_status.ok()) throw _status;
|
| 1372 |
+
return result;
|
| 1373 |
+
}
|
| 1374 |
+
|
| 1375 |
+
std::pair<std::string, std::vector<size_t>> _NormalizeWithOffsets(absl::string_view text) {
|
| 1376 |
+
std::pair<std::string, std::vector<size_t>> result;
|
| 1377 |
+
const auto _status = $self->Normalize(text, &result.first, &result.second);
|
| 1378 |
+
if (!_status.ok()) throw _status;
|
| 1379 |
+
return result;
|
| 1380 |
+
}
|
| 1381 |
+
|
| 1382 |
+
void _SetProtoField(absl::string_view name, bool value) {
|
| 1383 |
+
sentencepiece::SentencePieceTrainer::SetProtoField(
|
| 1384 |
+
name,
|
| 1385 |
+
value ? "1" : "0",
|
| 1386 |
+
$self->mutable_normalizer_spec()).IgnoreError();
|
| 1387 |
+
}
|
| 1388 |
+
|
| 1389 |
+
%pythoncode %{
|
| 1390 |
+
def Init(self,
|
| 1391 |
+
model_file=None,
|
| 1392 |
+
model_proto=None,
|
| 1393 |
+
rule_tsv=None,
|
| 1394 |
+
rule_name=None,
|
| 1395 |
+
add_dummy_prefix=False,
|
| 1396 |
+
escape_whitespaces=False,
|
| 1397 |
+
remove_extra_whitespaces=False):
|
| 1398 |
+
"""Initialzie sentencePieceNormalizer.
|
| 1399 |
+
|
| 1400 |
+
Args:
|
| 1401 |
+
model_file: The sentencepiece model file path.
|
| 1402 |
+
model_proto: The sentencepiece model serialized proto.
|
| 1403 |
+
rule_tsv: The normalization rule file in TSV format.
|
| 1404 |
+
rule_name: Pre-defined normalization name.
|
| 1405 |
+
add_dummy_prefix: add dummy prefix.
|
| 1406 |
+
escape_whitespaces: escape whitespaces.
|
| 1407 |
+
remove_extra_whitespaces: remove extra whitespaces.
|
| 1408 |
+
"""
|
| 1409 |
+
|
| 1410 |
+
_sentencepiece_normalizer_init_native(self)
|
| 1411 |
+
|
| 1412 |
+
if model_file:
|
| 1413 |
+
status = self.LoadFromFile(model_file)
|
| 1414 |
+
elif model_proto:
|
| 1415 |
+
status = self.LoadFromSerializedProto(model_proto)
|
| 1416 |
+
elif rule_tsv:
|
| 1417 |
+
status = self.LoadFromRuleTSV(rule_tsv)
|
| 1418 |
+
elif rule_name:
|
| 1419 |
+
status = self.LoadFromRuleName(rule_name)
|
| 1420 |
+
else:
|
| 1421 |
+
raise RuntimeError('no model is specified')
|
| 1422 |
+
|
| 1423 |
+
if status:
|
| 1424 |
+
self._SetProtoField('add_dummy_prefix', add_dummy_prefix)
|
| 1425 |
+
self._SetProtoField('escape_whitespaces', escape_whitespaces)
|
| 1426 |
+
self._SetProtoField('remove_extra_whitespaces', remove_extra_whitespaces)
|
| 1427 |
+
|
| 1428 |
+
def Normalize(self, input, with_offsets=None):
|
| 1429 |
+
def _normalize(text):
|
| 1430 |
+
if with_offsets:
|
| 1431 |
+
return self._NormalizeWithOffsets(text)
|
| 1432 |
+
return self._Normalize(text)
|
| 1433 |
+
|
| 1434 |
+
if type(input) is list:
|
| 1435 |
+
return [_normalize(x) for x in input]
|
| 1436 |
+
return _normalize(input)
|
| 1437 |
+
|
| 1438 |
+
|
| 1439 |
+
def __getstate__(self):
|
| 1440 |
+
return self.serialized_model_proto()
|
| 1441 |
+
|
| 1442 |
+
|
| 1443 |
+
def __setstate__(self, serialized_model_proto):
|
| 1444 |
+
self.__init__()
|
| 1445 |
+
self.LoadFromSerializedProto(serialized_model_proto)
|
| 1446 |
+
%}
|
| 1447 |
+
}
|
| 1448 |
+
|
| 1449 |
+
%extend sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece {
|
| 1450 |
+
const sentencepiece::util::bytes& _surface_as_bytes() const {
|
| 1451 |
+
return $self->surface();
|
| 1452 |
+
}
|
| 1453 |
+
|
| 1454 |
+
const sentencepiece::util::bytes& _piece_as_bytes() const {
|
| 1455 |
+
return $self->piece();
|
| 1456 |
+
}
|
| 1457 |
+
|
| 1458 |
+
%rename(_piece) piece;
|
| 1459 |
+
%rename(_piece_as_bytes) piece_as_bytes;
|
| 1460 |
+
%rename(_id) id;
|
| 1461 |
+
%rename(_surface) surface;
|
| 1462 |
+
%rename(_surface_as_bytes) surface_as_bytes;
|
| 1463 |
+
%rename(_begin) begin;
|
| 1464 |
+
%rename(_end) end;
|
| 1465 |
+
|
| 1466 |
+
%pythoncode %{
|
| 1467 |
+
piece = property(_piece)
|
| 1468 |
+
piece_as_bytes = property(_piece_as_bytes)
|
| 1469 |
+
surface = property(_surface)
|
| 1470 |
+
surface_as_bytes = property(_surface_as_bytes)
|
| 1471 |
+
id = property(_id)
|
| 1472 |
+
begin = property(_begin)
|
| 1473 |
+
end = property(_end)
|
| 1474 |
+
|
| 1475 |
+
def __str__(self):
|
| 1476 |
+
return ('piece: \"{}\"\n'
|
| 1477 |
+
'id: {}\n'
|
| 1478 |
+
'surface: \"{}\"\n'
|
| 1479 |
+
'begin: {}\n'
|
| 1480 |
+
'end: {}\n').format(self.piece, self.id, self.surface,
|
| 1481 |
+
self.begin, self.end)
|
| 1482 |
+
|
| 1483 |
+
def __eq__(self, other):
|
| 1484 |
+
return self.piece == other.piece and self.id == other.id and self.surface == other.surface and self.begin == other.begin and self.end == other.end
|
| 1485 |
+
|
| 1486 |
+
def __hash__(self):
|
| 1487 |
+
return hash(str(self))
|
| 1488 |
+
|
| 1489 |
+
__repr__ = __str__
|
| 1490 |
+
%}
|
| 1491 |
+
}
|
| 1492 |
+
|
| 1493 |
+
%extend sentencepiece::ImmutableSentencePieceText {
|
| 1494 |
+
const sentencepiece::util::bytes& _text_as_bytes() const {
|
| 1495 |
+
return $self->text();
|
| 1496 |
+
}
|
| 1497 |
+
|
| 1498 |
+
%rename(_text) text;
|
| 1499 |
+
%rename(_text_as_bytes) text_as_bytes;
|
| 1500 |
+
%rename(_score) score;
|
| 1501 |
+
%rename(_pieces) pieces;
|
| 1502 |
+
%rename(_pieces_size) pieces_size;
|
| 1503 |
+
|
| 1504 |
+
%pythoncode %{
|
| 1505 |
+
text = property(_text)
|
| 1506 |
+
text_as_bytes = property(_text_as_bytes)
|
| 1507 |
+
score = property(_score)
|
| 1508 |
+
|
| 1509 |
+
class ImmutableSentencePieceIterator:
|
| 1510 |
+
def __init__(self, proto):
|
| 1511 |
+
self.proto = proto
|
| 1512 |
+
self.len = self.proto._pieces_size()
|
| 1513 |
+
|
| 1514 |
+
def __len__(self):
|
| 1515 |
+
return self.len
|
| 1516 |
+
|
| 1517 |
+
def __getitem__(self, index):
|
| 1518 |
+
if isinstance(index, slice):
|
| 1519 |
+
return [self.proto._pieces(i) for i in range(self.len)][index.start:index.stop:index.step]
|
| 1520 |
+
if index < 0:
|
| 1521 |
+
index = index + self.len
|
| 1522 |
+
if index < 0 or index >= self.len:
|
| 1523 |
+
raise IndexError('piece index is out of range')
|
| 1524 |
+
return self.proto._pieces(index)
|
| 1525 |
+
|
| 1526 |
+
def __str__(self):
|
| 1527 |
+
return '\n'.join(['pieces {{\n{}}}'.format(str(x)) for x in self])
|
| 1528 |
+
|
| 1529 |
+
__repr__ = __str__
|
| 1530 |
+
|
| 1531 |
+
@property
|
| 1532 |
+
def pieces(self):
|
| 1533 |
+
return ImmutableSentencePieceText.ImmutableSentencePieceIterator(self)
|
| 1534 |
+
|
| 1535 |
+
def __eq__(self, other):
|
| 1536 |
+
return self.SerializeAsString() == other.SerializeAsString()
|
| 1537 |
+
|
| 1538 |
+
def __hash__(self):
|
| 1539 |
+
return hash(self.SerializeAsString())
|
| 1540 |
+
|
| 1541 |
+
def __str__(self):
|
| 1542 |
+
return ('text: \"{}\"\n'
|
| 1543 |
+
'score: {}\n'
|
| 1544 |
+
'{}').format(self.text, self.score,
|
| 1545 |
+
'\n'.join(['pieces {{\n{}}}'.format(str(x)) for x in self.pieces]))
|
| 1546 |
+
|
| 1547 |
+
__repr__ = __str__
|
| 1548 |
+
%}
|
| 1549 |
+
}
|
| 1550 |
+
|
| 1551 |
+
%extend sentencepiece::ImmutableNBestSentencePieceText {
|
| 1552 |
+
%rename(_nbests) nbests;
|
| 1553 |
+
%rename(_nbests_size) nbests_size;
|
| 1554 |
+
|
| 1555 |
+
%pythoncode %{
|
| 1556 |
+
class ImmutableSentencePieceTextIterator:
|
| 1557 |
+
def __init__(self, proto):
|
| 1558 |
+
self.proto = proto
|
| 1559 |
+
self.len = self.proto._nbests_size()
|
| 1560 |
+
|
| 1561 |
+
def __len__(self):
|
| 1562 |
+
return self.len
|
| 1563 |
+
|
| 1564 |
+
def __getitem__(self, index):
|
| 1565 |
+
if isinstance(index, slice):
|
| 1566 |
+
return [self.proto._nbests(i) for i in range(self.len)][index.start:index.stop:index.step]
|
| 1567 |
+
if index < 0:
|
| 1568 |
+
index = index + self.len
|
| 1569 |
+
if index < 0 or index >= self.len:
|
| 1570 |
+
raise IndexError('nbests index is out of range')
|
| 1571 |
+
return self.proto._nbests(index)
|
| 1572 |
+
|
| 1573 |
+
def __str__(self):
|
| 1574 |
+
return '\n'.join(['nbests {{\n{}}}'.format(str(x)) for x in self])
|
| 1575 |
+
|
| 1576 |
+
__repr__ = __str__
|
| 1577 |
+
|
| 1578 |
+
@property
|
| 1579 |
+
def nbests(self):
|
| 1580 |
+
return ImmutableNBestSentencePieceText.ImmutableSentencePieceTextIterator(self)
|
| 1581 |
+
|
| 1582 |
+
def __eq__(self, other):
|
| 1583 |
+
return self.SerializeAsString() == other.SerializeAsString()
|
| 1584 |
+
|
| 1585 |
+
def __hash__(self):
|
| 1586 |
+
return hash(self.SerializeAsString())
|
| 1587 |
+
|
| 1588 |
+
def __str__(self):
|
| 1589 |
+
return '\n'.join(['nbests {{\n{}}}'.format(str(x)) for x in self.nbests])
|
| 1590 |
+
|
| 1591 |
+
__repr__ = __str__
|
| 1592 |
+
%}
|
| 1593 |
+
}
|
| 1594 |
+
|
| 1595 |
+
%typemap(out) std::vector<int> {
|
| 1596 |
+
$result = PyList_New($1.size());
|
| 1597 |
+
for (size_t i = 0; i < $1.size(); ++i) {
|
| 1598 |
+
PyList_SET_ITEM($result, i, PyInt_FromLong(static_cast<long>($1[i])));
|
| 1599 |
+
}
|
| 1600 |
+
}
|
| 1601 |
+
|
| 1602 |
+
%typemap(out) std::vector<float> {
|
| 1603 |
+
$result = PyList_New($1.size());
|
| 1604 |
+
for (size_t i = 0; i < $1.size(); ++i) {
|
| 1605 |
+
PyList_SET_ITEM($result, i, PyFloat_FromDouble(static_cast<double>($1[i])));
|
| 1606 |
+
}
|
| 1607 |
+
}
|
| 1608 |
+
|
| 1609 |
+
%typemap(out) std::vector<std::vector<int>> {
|
| 1610 |
+
$result = PyList_New($1.size());
|
| 1611 |
+
for (size_t i = 0; i < $1.size(); ++i) {
|
| 1612 |
+
PyObject *obj = PyList_New($1[i].size());
|
| 1613 |
+
for (size_t j = 0; j < $1[i].size(); ++j) {
|
| 1614 |
+
PyList_SET_ITEM(obj, j, PyInt_FromLong(static_cast<long>($1[i][j])));
|
| 1615 |
+
}
|
| 1616 |
+
PyList_SET_ITEM($result, i, obj);
|
| 1617 |
+
}
|
| 1618 |
+
}
|
| 1619 |
+
|
| 1620 |
+
%typemap(out) std::vector<std::string> {
|
| 1621 |
+
PyObject *input_type = resultobj;
|
| 1622 |
+
$result = PyList_New($1.size());
|
| 1623 |
+
for (size_t i = 0; i < $1.size(); ++i) {
|
| 1624 |
+
PyList_SET_ITEM($result, i, MakePyOutputString($1[i], input_type));
|
| 1625 |
+
}
|
| 1626 |
+
}
|
| 1627 |
+
|
| 1628 |
+
%typemap(out) BytesArray {
|
| 1629 |
+
$result = PyList_New($1.size());
|
| 1630 |
+
for (size_t i = 0; i < $1.size(); ++i) {
|
| 1631 |
+
PyList_SET_ITEM($result, i, MakePyOutputBytes($1[i]));
|
| 1632 |
+
}
|
| 1633 |
+
}
|
| 1634 |
+
|
| 1635 |
+
%typemap(out) std::vector<std::vector<std::string>> {
|
| 1636 |
+
PyObject *input_type = resultobj;
|
| 1637 |
+
$result = PyList_New($1.size());
|
| 1638 |
+
for (size_t i = 0; i < $1.size(); ++i) {
|
| 1639 |
+
PyObject *obj = PyList_New($1[i].size());
|
| 1640 |
+
for (size_t j = 0; j < $1[i].size(); ++j) {
|
| 1641 |
+
PyList_SET_ITEM(obj, j, MakePyOutputString($1[i][j], input_type));
|
| 1642 |
+
}
|
| 1643 |
+
PyList_SET_ITEM($result, i, obj);
|
| 1644 |
+
}
|
| 1645 |
+
}
|
| 1646 |
+
|
| 1647 |
+
%typemap(out) sentencepiece::util::bytes {
|
| 1648 |
+
$result = MakePyOutputBytes($1);
|
| 1649 |
+
}
|
| 1650 |
+
|
| 1651 |
+
%typemap(out) const sentencepiece::util::bytes& {
|
| 1652 |
+
$result = MakePyOutputBytes(*$1);
|
| 1653 |
+
}
|
| 1654 |
+
|
| 1655 |
+
%typemap(out) std::string {
|
| 1656 |
+
PyObject *input_type = resultobj;
|
| 1657 |
+
$result = MakePyOutputString($1, input_type);
|
| 1658 |
+
}
|
| 1659 |
+
|
| 1660 |
+
%typemap(out) const std::string& {
|
| 1661 |
+
PyObject *input_type = resultobj;
|
| 1662 |
+
$result = MakePyOutputString(*$1, input_type);
|
| 1663 |
+
}
|
| 1664 |
+
|
| 1665 |
+
%typemap(out) sentencepiece::util::Status {
|
| 1666 |
+
if (!$1.ok()) {
|
| 1667 |
+
SWIG_exception(ToSwigError($1.code()), $1.ToString().c_str());
|
| 1668 |
+
}
|
| 1669 |
+
$result = SWIG_From_bool($1.ok());}
|
| 1670 |
+
|
| 1671 |
+
|
| 1672 |
+
%typemap(in) const std::string & {
|
| 1673 |
+
const PyInputString ustring($input);
|
| 1674 |
+
if (!ustring.IsAvalable()) {
|
| 1675 |
+
PyErr_SetString(PyExc_TypeError, "not a string");
|
| 1676 |
+
SWIG_fail;
|
| 1677 |
+
}
|
| 1678 |
+
resultobj = ustring.input_type();
|
| 1679 |
+
$1 = new std::string(ustring.data(), ustring.size());
|
| 1680 |
+
}
|
| 1681 |
+
|
| 1682 |
+
%typemap(typecheck) absl::string_view = char *;
|
| 1683 |
+
|
| 1684 |
+
%typemap(in) absl::string_view {
|
| 1685 |
+
const PyInputString ustring($input);
|
| 1686 |
+
if (!ustring.IsAvalable()) {
|
| 1687 |
+
PyErr_SetString(PyExc_TypeError, "not a string");
|
| 1688 |
+
SWIG_fail;
|
| 1689 |
+
}
|
| 1690 |
+
resultobj = ustring.input_type();
|
| 1691 |
+
$1 = ustring.str();
|
| 1692 |
+
}
|
| 1693 |
+
|
| 1694 |
+
%typemap(in) const std::vector<absl::string_view>& {
|
| 1695 |
+
std::vector<absl::string_view> *out = nullptr;
|
| 1696 |
+
if (PyList_Check($input)) {
|
| 1697 |
+
const size_t size = PyList_Size($input);
|
| 1698 |
+
out = new std::vector<absl::string_view>(size);
|
| 1699 |
+
for (size_t i = 0; i < size; ++i) {
|
| 1700 |
+
const PyInputString ustring(PyList_GetItem($input, i));
|
| 1701 |
+
if (ustring.IsAvalable()) {
|
| 1702 |
+
(*out)[i] = ustring.str();
|
| 1703 |
+
} else {
|
| 1704 |
+
PyErr_SetString(PyExc_TypeError, "list must contain strings");
|
| 1705 |
+
SWIG_fail;
|
| 1706 |
+
}
|
| 1707 |
+
resultobj = ustring.input_type();
|
| 1708 |
+
}
|
| 1709 |
+
} else {
|
| 1710 |
+
PyErr_SetString(PyExc_TypeError, "not a list");
|
| 1711 |
+
SWIG_fail;
|
| 1712 |
+
}
|
| 1713 |
+
$1 = out;
|
| 1714 |
+
}
|
| 1715 |
+
|
| 1716 |
+
%typemap(in) const std::vector<int>& {
|
| 1717 |
+
std::vector<int> *out = nullptr;
|
| 1718 |
+
if (PyList_Check($input)) {
|
| 1719 |
+
const size_t size = PyList_Size($input);
|
| 1720 |
+
out = new std::vector<int>(size);
|
| 1721 |
+
for (size_t i = 0; i < size; ++i) {
|
| 1722 |
+
PyObject *o = PyList_GetItem($input, i);
|
| 1723 |
+
if (PyInt_Check(o)) {
|
| 1724 |
+
(*out)[i] = static_cast<int>(PyInt_AsLong(o));
|
| 1725 |
+
} else {
|
| 1726 |
+
PyErr_SetString(PyExc_TypeError,"list must contain integers");
|
| 1727 |
+
SWIG_fail;
|
| 1728 |
+
}
|
| 1729 |
+
}
|
| 1730 |
+
} else {
|
| 1731 |
+
PyErr_SetString(PyExc_TypeError,"not a list");
|
| 1732 |
+
SWIG_fail;
|
| 1733 |
+
}
|
| 1734 |
+
$1 = out;
|
| 1735 |
+
}
|
| 1736 |
+
|
| 1737 |
+
%typemap(in) const std::vector<std::vector<absl::string_view>>& {
|
| 1738 |
+
std::vector<std::vector<absl::string_view>> *out = nullptr;
|
| 1739 |
+
if (PyList_Check($input)) {
|
| 1740 |
+
const size_t size = PyList_Size($input);
|
| 1741 |
+
out = new std::vector<std::vector<absl::string_view>>(size);
|
| 1742 |
+
for (size_t i = 0; i < size; ++i) {
|
| 1743 |
+
PyObject *o = PyList_GetItem($input, i);
|
| 1744 |
+
if (PyList_Check(o)) {
|
| 1745 |
+
const size_t size2 = PyList_Size(o);
|
| 1746 |
+
(*out)[i].resize(size2);
|
| 1747 |
+
for (size_t j = 0; j < size2; ++j) {
|
| 1748 |
+
const PyInputString ustring(PyList_GetItem(o, j));
|
| 1749 |
+
if (ustring.IsAvalable()) {
|
| 1750 |
+
(*out)[i][j] = ustring.str();
|
| 1751 |
+
} else {
|
| 1752 |
+
PyErr_SetString(PyExc_TypeError,"list must contain integers");
|
| 1753 |
+
SWIG_fail;
|
| 1754 |
+
}
|
| 1755 |
+
resultobj = ustring.input_type();
|
| 1756 |
+
}
|
| 1757 |
+
} else {
|
| 1758 |
+
PyErr_SetString(PyExc_TypeError,"not a list");
|
| 1759 |
+
SWIG_fail;
|
| 1760 |
+
}
|
| 1761 |
+
}
|
| 1762 |
+
} else {
|
| 1763 |
+
PyErr_SetString(PyExc_TypeError,"not a list");
|
| 1764 |
+
SWIG_fail;
|
| 1765 |
+
}
|
| 1766 |
+
$1 = out;
|
| 1767 |
+
}
|
| 1768 |
+
|
| 1769 |
+
%typemap(in) const std::vector<std::vector<int>>& {
|
| 1770 |
+
std::vector<std::vector<int>> *out = nullptr;
|
| 1771 |
+
if (PyList_Check($input)) {
|
| 1772 |
+
const size_t size = PyList_Size($input);
|
| 1773 |
+
out = new std::vector<std::vector<int>>(size);
|
| 1774 |
+
for (size_t i = 0; i < size; ++i) {
|
| 1775 |
+
PyObject *o = PyList_GetItem($input, i);
|
| 1776 |
+
if (PyList_Check(o)) {
|
| 1777 |
+
const size_t size2 = PyList_Size(o);
|
| 1778 |
+
(*out)[i].resize(size2);
|
| 1779 |
+
for (size_t j = 0; j < size2; ++j) {
|
| 1780 |
+
PyObject *o2 = PyList_GetItem(o, j);
|
| 1781 |
+
if (PyInt_Check(o2)) {
|
| 1782 |
+
(*out)[i][j] = static_cast<int>(PyInt_AsLong(o2));
|
| 1783 |
+
} else {
|
| 1784 |
+
PyErr_SetString(PyExc_TypeError, "list must contain strings");
|
| 1785 |
+
SWIG_fail;
|
| 1786 |
+
}
|
| 1787 |
+
}
|
| 1788 |
+
} else {
|
| 1789 |
+
PyErr_SetString(PyExc_TypeError, "not a list");
|
| 1790 |
+
SWIG_fail;
|
| 1791 |
+
}
|
| 1792 |
+
}
|
| 1793 |
+
} else {
|
| 1794 |
+
PyErr_SetString(PyExc_TypeError,"not a list");
|
| 1795 |
+
SWIG_fail;
|
| 1796 |
+
}
|
| 1797 |
+
$1 = out;
|
| 1798 |
+
}
|
| 1799 |
+
|
| 1800 |
+
%typemap(in) const std::unordered_map<std::string, std::string> & {
|
| 1801 |
+
std::unordered_map<std::string, std::string> *out = nullptr;
|
| 1802 |
+
if (PyDict_Check($input)) {
|
| 1803 |
+
PyObject *key, *value;
|
| 1804 |
+
Py_ssize_t pos = 0;
|
| 1805 |
+
out = new std::unordered_map<std::string, std::string>;
|
| 1806 |
+
while (PyDict_Next($input, &pos, &key, &value)) {
|
| 1807 |
+
const PyInputString key_ustring(key);
|
| 1808 |
+
const PyInputString value_ustring(value);
|
| 1809 |
+
if (key_ustring.IsAvalable() && value_ustring.IsAvalable()) {
|
| 1810 |
+
out->emplace(std::string(key_ustring.data(), key_ustring.size()),
|
| 1811 |
+
std::string(value_ustring.data(), value_ustring.size()));
|
| 1812 |
+
} else {
|
| 1813 |
+
PyErr_SetString(PyExc_TypeError, "map must contain strings.");
|
| 1814 |
+
SWIG_fail;
|
| 1815 |
+
}
|
| 1816 |
+
resultobj = key_ustring.input_type();
|
| 1817 |
+
}
|
| 1818 |
+
} else {
|
| 1819 |
+
PyErr_SetString(PyExc_TypeError, "not a dictionary");
|
| 1820 |
+
SWIG_fail;
|
| 1821 |
+
}
|
| 1822 |
+
$1 = out;
|
| 1823 |
+
}
|
| 1824 |
+
|
| 1825 |
+
%typemap(out) std::vector<std::pair<std::vector<std::string>, float>> {
|
| 1826 |
+
PyObject *input_type = resultobj;
|
| 1827 |
+
$result = PyList_New($1.size());
|
| 1828 |
+
for (size_t i = 0; i < $1.size(); ++i) {
|
| 1829 |
+
PyObject *obj = PyList_New($1[i].first.size());
|
| 1830 |
+
for (size_t j = 0; j < $1[i].first.size(); ++j) {
|
| 1831 |
+
PyList_SET_ITEM(obj, j, MakePyOutputString($1[i].first[j], input_type));
|
| 1832 |
+
}
|
| 1833 |
+
PyList_SET_ITEM($result, i, PyTuple_Pack(2, obj, PyFloat_FromDouble(static_cast<double>($1[i].second))));
|
| 1834 |
+
}
|
| 1835 |
+
}
|
| 1836 |
+
|
| 1837 |
+
%typemap(out) std::vector<std::pair<std::vector<int>, float>> {
|
| 1838 |
+
$result = PyList_New($1.size());
|
| 1839 |
+
for (size_t i = 0; i < $1.size(); ++i) {
|
| 1840 |
+
PyObject *obj = PyList_New($1[i].first.size());
|
| 1841 |
+
for (size_t j = 0; j < $1[i].first.size(); ++j) {
|
| 1842 |
+
PyList_SET_ITEM(obj, j, PyInt_FromLong(static_cast<long>($1[i].first[j])));
|
| 1843 |
+
}
|
| 1844 |
+
PyList_SET_ITEM($result, i, PyTuple_Pack(2, obj, PyFloat_FromDouble(static_cast<double>($1[i].second))));
|
| 1845 |
+
}
|
| 1846 |
+
}
|
| 1847 |
+
|
| 1848 |
+
%typemap(out) std::vector<sentencepiece::ImmutableSentencePieceText> {
|
| 1849 |
+
$result = PyList_New($1.size());
|
| 1850 |
+
for (size_t i = 0; i < $1.size(); ++i) {
|
| 1851 |
+
PyObject *obj = SWIG_NewPointerObj(new sentencepiece::ImmutableSentencePieceText($1.at(i)), SWIGTYPE_p_sentencepiece__ImmutableSentencePieceText, SWIG_POINTER_OWN | 0);
|
| 1852 |
+
PyList_SET_ITEM($result, i, obj);
|
| 1853 |
+
}
|
| 1854 |
+
}
|
| 1855 |
+
|
| 1856 |
+
// Types for normalized string and offset
|
| 1857 |
+
%typemap(out) std::pair<std::string, std::vector<size_t>> {
|
| 1858 |
+
PyObject *input_type = resultobj;
|
| 1859 |
+
if (PyInputString::IsUnicode(input_type)) {
|
| 1860 |
+
sentencepiece::ConvertToUnicodeAlignment(arg2, $1.first, &$1.second);
|
| 1861 |
+
}
|
| 1862 |
+
PyObject *obj = PyList_New($1.second.size());
|
| 1863 |
+
for (size_t i = 0; i < $1.second.size(); ++i) {
|
| 1864 |
+
PyList_SET_ITEM(obj, i, PyInt_FromLong(static_cast<long>($1.second[i])));
|
| 1865 |
+
}
|
| 1866 |
+
$result = PyTuple_Pack(2, MakePyOutputString($1.first, input_type), obj);
|
| 1867 |
+
}
|
| 1868 |
+
|
| 1869 |
+
%typemap(in) sentencepiece::SentenceIterator * {
|
| 1870 |
+
sentencepiece::SentenceIterator *out = nullptr;
|
| 1871 |
+
if (PyIter_Check($input)) {
|
| 1872 |
+
out = new PySentenceIterator($input);
|
| 1873 |
+
} else {
|
| 1874 |
+
PyErr_SetString(PyExc_TypeError, "not a iterator");
|
| 1875 |
+
SWIG_fail;
|
| 1876 |
+
}
|
| 1877 |
+
$1 = out;
|
| 1878 |
+
}
|
| 1879 |
+
|
| 1880 |
+
%typemap(freearg) const std::string& {
|
| 1881 |
+
delete $1;
|
| 1882 |
+
}
|
| 1883 |
+
|
| 1884 |
+
%typemap(freearg) const std::vector<std::string>& {
|
| 1885 |
+
delete $1;
|
| 1886 |
+
}
|
| 1887 |
+
|
| 1888 |
+
%typemap(freearg) const std::vector<absl::string_view>& {
|
| 1889 |
+
delete $1;
|
| 1890 |
+
}
|
| 1891 |
+
|
| 1892 |
+
%typemap(freearg) const std::vector<std::vector<std::string>>& {
|
| 1893 |
+
delete $1;
|
| 1894 |
+
}
|
| 1895 |
+
|
| 1896 |
+
%typemap(freearg) const std::vector<int>& {
|
| 1897 |
+
delete $1;
|
| 1898 |
+
}
|
| 1899 |
+
|
| 1900 |
+
%typemap(freearg) const std::vector<float>& {
|
| 1901 |
+
delete $1;
|
| 1902 |
+
}
|
| 1903 |
+
|
| 1904 |
+
%typemap(freearg) const std::vector<std::vector<int>>& {
|
| 1905 |
+
delete $1;
|
| 1906 |
+
}
|
| 1907 |
+
|
| 1908 |
+
%typemap(freearg) const std::unordered_map<std::string, std::string> & {
|
| 1909 |
+
delete $1;
|
| 1910 |
+
}
|
| 1911 |
+
|
| 1912 |
+
%typemap(freearg) sentencepiece::SentenceIterator * {
|
| 1913 |
+
delete $1;
|
| 1914 |
+
}
|
| 1915 |
+
|
| 1916 |
+
%typemap(freearg) sentencepiece::ImmutableSentencePieceText_ImmutableSentencePiece {
|
| 1917 |
+
delete $1;
|
| 1918 |
+
}
|
| 1919 |
+
|
| 1920 |
+
%typemap(freearg) sentencepiece::ImmutableSentencePieceText {
|
| 1921 |
+
delete $1;
|
| 1922 |
+
}
|
| 1923 |
+
|
| 1924 |
+
%typemap(freearg) sentencepiece::ImmutableNBestSentencePieceText {
|
| 1925 |
+
delete $1;
|
| 1926 |
+
}
|
| 1927 |
+
|
| 1928 |
+
%include <sentencepiece_processor.h>
|
| 1929 |
+
%include <sentencepiece_trainer.h>
|
| 1930 |
+
|
| 1931 |
+
%pythoncode %{
|
| 1932 |
+
|
| 1933 |
+
import re
|
| 1934 |
+
import csv
|
| 1935 |
+
import sys
|
| 1936 |
+
import os
|
| 1937 |
+
import importlib.resources
|
| 1938 |
+
from io import StringIO
|
| 1939 |
+
from io import BytesIO
|
| 1940 |
+
|
| 1941 |
+
|
| 1942 |
+
def _add_snake_case(classname):
|
| 1943 |
+
"""Added snake_cased method from CammelCased method."""
|
| 1944 |
+
|
| 1945 |
+
snake_map = {}
|
| 1946 |
+
for k, v in classname.__dict__.items():
|
| 1947 |
+
if re.match(r'^[A-Z]+', k):
|
| 1948 |
+
snake = re.sub(r'(?<!^)(?=[A-Z])', '_',
|
| 1949 |
+
k).lower().replace('n_best', 'nbest')
|
| 1950 |
+
snake_map[snake] = v
|
| 1951 |
+
for k, v in snake_map.items():
|
| 1952 |
+
setattr(classname, k, v)
|
| 1953 |
+
|
| 1954 |
+
|
| 1955 |
+
def _batchnize(classname, name):
|
| 1956 |
+
"""Enables batch request for the method classname.name."""
|
| 1957 |
+
func = getattr(classname, name, None)
|
| 1958 |
+
def _func(v, n):
|
| 1959 |
+
if type(n) is int and (n < 0 or n >= v.piece_size()):
|
| 1960 |
+
raise IndexError('piece id is out of range.')
|
| 1961 |
+
return func(v, n)
|
| 1962 |
+
|
| 1963 |
+
def _batched_func(self, arg):
|
| 1964 |
+
if type(arg) is list:
|
| 1965 |
+
return [_func(self, n) for n in arg]
|
| 1966 |
+
else:
|
| 1967 |
+
return _func(self, arg)
|
| 1968 |
+
|
| 1969 |
+
setattr(classname, name, _batched_func)
|
| 1970 |
+
|
| 1971 |
+
|
| 1972 |
+
_sentencepiece_processor_init_native = SentencePieceProcessor.__init__
|
| 1973 |
+
_sentencepiece_normalizer_init_native = SentencePieceNormalizer.__init__
|
| 1974 |
+
setattr(SentencePieceProcessor, '__init__', SentencePieceProcessor.Init)
|
| 1975 |
+
setattr(SentencePieceNormalizer, '__init__', SentencePieceNormalizer.Init)
|
| 1976 |
+
|
| 1977 |
+
SentencePieceProcessor.Tokenize = SentencePieceProcessor.Encode
|
| 1978 |
+
SentencePieceProcessor.Detokenize = SentencePieceProcessor.Decode
|
| 1979 |
+
|
| 1980 |
+
for m in [
|
| 1981 |
+
'PieceToId', 'IdToPiece', 'GetScore', 'IsUnknown', 'IsControl', 'IsUnused',
|
| 1982 |
+
'IsByte'
|
| 1983 |
+
]:
|
| 1984 |
+
_batchnize(SentencePieceProcessor, m)
|
| 1985 |
+
|
| 1986 |
+
_add_snake_case(SentencePieceProcessor)
|
| 1987 |
+
_add_snake_case(SentencePieceTrainer)
|
| 1988 |
+
_add_snake_case(SentencePieceNormalizer)
|
| 1989 |
+
set_random_generator_seed = SetRandomGeneratorSeed
|
| 1990 |
+
set_min_log_level = SetMinLogLevel
|
| 1991 |
+
|
| 1992 |
+
from ._version import __version__
|
| 1993 |
+
|
| 1994 |
+
SetDataDir(os.path.join(str(importlib.resources.files('sentencepiece')), 'package_data'))
|
| 1995 |
+
|
| 1996 |
+
class _LogStream(object):
|
| 1997 |
+
def __init__(self, ostream=None):
|
| 1998 |
+
self.ostream = ostream
|
| 1999 |
+
if self.ostream is not None:
|
| 2000 |
+
self.orig_stream_fileno = sys.stderr.fileno()
|
| 2001 |
+
|
| 2002 |
+
def __enter__(self):
|
| 2003 |
+
if self.ostream is not None:
|
| 2004 |
+
self.orig_stream_dup = os.dup(self.orig_stream_fileno)
|
| 2005 |
+
os.dup2(self.ostream.fileno(), self.orig_stream_fileno)
|
| 2006 |
+
|
| 2007 |
+
def __exit__(self, type, value, traceback):
|
| 2008 |
+
if self.ostream is not None:
|
| 2009 |
+
os.close(self.orig_stream_fileno)
|
| 2010 |
+
os.dup2(self.orig_stream_dup, self.orig_stream_fileno)
|
| 2011 |
+
os.close(self.orig_stream_dup)
|
| 2012 |
+
self.ostream.close()
|
| 2013 |
+
%}
|
source/sentencepiece/sentencepiece_model_pb2.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
# Generated by the protocol buffer compiler. DO NOT EDIT!
|
| 3 |
+
# source: sentencepiece_model.proto
|
| 4 |
+
"""Generated protocol buffer code."""
|
| 5 |
+
from google.protobuf.internal import builder as _builder
|
| 6 |
+
from google.protobuf import descriptor as _descriptor
|
| 7 |
+
from google.protobuf import descriptor_pool as _descriptor_pool
|
| 8 |
+
from google.protobuf import symbol_database as _symbol_database
|
| 9 |
+
# @@protoc_insertion_point(imports)
|
| 10 |
+
|
| 11 |
+
_sym_db = _symbol_database.Default()
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x19sentencepiece_model.proto\x12\rsentencepiece\"\x80\x0c\n\x0bTrainerSpec\x12\r\n\x05input\x18\x01 \x03(\t\x12\x14\n\x0cinput_format\x18\x07 \x01(\t\x12\x14\n\x0cmodel_prefix\x18\x02 \x01(\t\x12\x41\n\nmodel_type\x18\x03 \x01(\x0e\x32$.sentencepiece.TrainerSpec.ModelType:\x07UNIGRAM\x12\x18\n\nvocab_size\x18\x04 \x01(\x05:\x04\x38\x30\x30\x30\x12\x17\n\x0f\x61\x63\x63\x65pt_language\x18\x05 \x03(\t\x12 \n\x15self_test_sample_size\x18\x06 \x01(\x05:\x01\x30\x12*\n\x1b\x65nable_differential_privacy\x18\x32 \x01(\x08:\x05\x66\x61lse\x12+\n differential_privacy_noise_level\x18\x33 \x01(\x02:\x01\x30\x12\x32\n\'differential_privacy_clipping_threshold\x18\x34 \x01(\x04:\x01\x30\x12\"\n\x12\x63haracter_coverage\x18\n \x01(\x02:\x06\x30.9995\x12\x1e\n\x13input_sentence_size\x18\x0b \x01(\x04:\x01\x30\x12$\n\x16shuffle_input_sentence\x18\x13 \x01(\x08:\x04true\x12 \n\x14mining_sentence_size\x18\x0c \x01(\x05\x42\x02\x18\x01\x12\"\n\x16training_sentence_size\x18\r \x01(\x05\x42\x02\x18\x01\x12(\n\x17seed_sentencepiece_size\x18\x0e \x01(\x05:\x07\x31\x30\x30\x30\x30\x30\x30\x12\x1e\n\x10shrinking_factor\x18\x0f \x01(\x02:\x04\x30.75\x12!\n\x13max_sentence_length\x18\x12 \x01(\x05:\x04\x34\x31\x39\x32\x12\x17\n\x0bnum_threads\x18\x10 \x01(\x05:\x02\x31\x36\x12\x1d\n\x12num_sub_iterations\x18\x11 \x01(\x05:\x01\x32\x12$\n\x18max_sentencepiece_length\x18\x14 \x01(\x05:\x02\x31\x36\x12%\n\x17split_by_unicode_script\x18\x15 \x01(\x08:\x04true\x12\x1d\n\x0fsplit_by_number\x18\x17 \x01(\x08:\x04true\x12!\n\x13split_by_whitespace\x18\x16 \x01(\x08:\x04true\x12)\n\x1atreat_whitespace_as_suffix\x18\x18 \x01(\x08:\x05\x66\x61lse\x12+\n\x1c\x61llow_whitespace_only_pieces\x18\x1a \x01(\x08:\x05\x66\x61lse\x12\x1b\n\x0csplit_digits\x18\x19 \x01(\x08:\x05\x66\x61lse\x12#\n\x19pretokenization_delimiter\x18\x35 \x01(\t:\x00\x12\x17\n\x0f\x63ontrol_symbols\x18\x1e \x03(\t\x12\x1c\n\x14user_defined_symbols\x18\x1f 
\x03(\t\x12\x16\n\x0erequired_chars\x18$ \x01(\t\x12\x1c\n\rbyte_fallback\x18# \x01(\x08:\x05\x66\x61lse\x12+\n\x1dvocabulary_output_piece_score\x18 \x01(\x08:\x04true\x12\x1e\n\x10hard_vocab_limit\x18! \x01(\x08:\x04true\x12\x1c\n\ruse_all_vocab\x18\" \x01(\x08:\x05\x66\x61lse\x12\x11\n\x06unk_id\x18( \x01(\x05:\x01\x30\x12\x11\n\x06\x62os_id\x18) \x01(\x05:\x01\x31\x12\x11\n\x06\x65os_id\x18* \x01(\x05:\x01\x32\x12\x12\n\x06pad_id\x18+ \x01(\x05:\x02-1\x12\x18\n\tunk_piece\x18- \x01(\t:\x05<unk>\x12\x16\n\tbos_piece\x18. \x01(\t:\x03<s>\x12\x17\n\teos_piece\x18/ \x01(\t:\x04</s>\x12\x18\n\tpad_piece\x18\x30 \x01(\t:\x05<pad>\x12\x1a\n\x0bunk_surface\x18, \x01(\t:\x05 \xe2\x81\x87 \x12+\n\x1ctrain_extremely_large_corpus\x18\x31 \x01(\x08:\x05\x66\x61lse\"5\n\tModelType\x12\x0b\n\x07UNIGRAM\x10\x01\x12\x07\n\x03\x42PE\x10\x02\x12\x08\n\x04WORD\x10\x03\x12\x08\n\x04\x43HAR\x10\x04*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02\"\xd1\x01\n\x0eNormalizerSpec\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x1c\n\x14precompiled_charsmap\x18\x02 \x01(\x0c\x12\x1e\n\x10\x61\x64\x64_dummy_prefix\x18\x03 \x01(\x08:\x04true\x12&\n\x18remove_extra_whitespaces\x18\x04 \x01(\x08:\x04true\x12 \n\x12\x65scape_whitespaces\x18\x05 \x01(\x08:\x04true\x12\x1e\n\x16normalization_rule_tsv\x18\x06 \x01(\t*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02\"y\n\x0cSelfTestData\x12\x33\n\x07samples\x18\x01 \x03(\x0b\x32\".sentencepiece.SelfTestData.Sample\x1a)\n\x06Sample\x12\r\n\x05input\x18\x01 \x01(\t\x12\x10\n\x08\x65xpected\x18\x02 \x01(\t*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02\"\xfe\x03\n\nModelProto\x12\x37\n\x06pieces\x18\x01 \x03(\x0b\x32\'.sentencepiece.ModelProto.SentencePiece\x12\x30\n\x0ctrainer_spec\x18\x02 \x01(\x0b\x32\x1a.sentencepiece.TrainerSpec\x12\x36\n\x0fnormalizer_spec\x18\x03 \x01(\x0b\x32\x1d.sentencepiece.NormalizerSpec\x12\x33\n\x0eself_test_data\x18\x04 \x01(\x0b\x32\x1b.sentencepiece.SelfTestData\x12\x38\n\x11\x64\x65normalizer_spec\x18\x05 
\x01(\x0b\x32\x1d.sentencepiece.NormalizerSpec\x1a\xd2\x01\n\rSentencePiece\x12\r\n\x05piece\x18\x01 \x01(\t\x12\r\n\x05score\x18\x02 \x01(\x02\x12\x42\n\x04type\x18\x03 \x01(\x0e\x32,.sentencepiece.ModelProto.SentencePiece.Type:\x06NORMAL\"T\n\x04Type\x12\n\n\x06NORMAL\x10\x01\x12\x0b\n\x07UNKNOWN\x10\x02\x12\x0b\n\x07\x43ONTROL\x10\x03\x12\x10\n\x0cUSER_DEFINED\x10\x04\x12\x08\n\x04\x42YTE\x10\x06\x12\n\n\x06UNUSED\x10\x05*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02\x42\x02H\x03')
|
| 17 |
+
|
| 18 |
+
_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, globals())
|
| 19 |
+
_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'sentencepiece_model_pb2', globals())
|
| 20 |
+
if _descriptor._USE_C_DESCRIPTORS == False:
|
| 21 |
+
|
| 22 |
+
DESCRIPTOR._options = None
|
| 23 |
+
DESCRIPTOR._serialized_options = b'H\003'
|
| 24 |
+
_TRAINERSPEC.fields_by_name['mining_sentence_size']._options = None
|
| 25 |
+
_TRAINERSPEC.fields_by_name['mining_sentence_size']._serialized_options = b'\030\001'
|
| 26 |
+
_TRAINERSPEC.fields_by_name['training_sentence_size']._options = None
|
| 27 |
+
_TRAINERSPEC.fields_by_name['training_sentence_size']._serialized_options = b'\030\001'
|
| 28 |
+
_TRAINERSPEC._serialized_start=45
|
| 29 |
+
_TRAINERSPEC._serialized_end=1581
|
| 30 |
+
_TRAINERSPEC_MODELTYPE._serialized_start=1517
|
| 31 |
+
_TRAINERSPEC_MODELTYPE._serialized_end=1570
|
| 32 |
+
_NORMALIZERSPEC._serialized_start=1584
|
| 33 |
+
_NORMALIZERSPEC._serialized_end=1793
|
| 34 |
+
_SELFTESTDATA._serialized_start=1795
|
| 35 |
+
_SELFTESTDATA._serialized_end=1916
|
| 36 |
+
_SELFTESTDATA_SAMPLE._serialized_start=1864
|
| 37 |
+
_SELFTESTDATA_SAMPLE._serialized_end=1905
|
| 38 |
+
_MODELPROTO._serialized_start=1919
|
| 39 |
+
_MODELPROTO._serialized_end=2429
|
| 40 |
+
_MODELPROTO_SENTENCEPIECE._serialized_start=2208
|
| 41 |
+
_MODELPROTO_SENTENCEPIECE._serialized_end=2418
|
| 42 |
+
_MODELPROTO_SENTENCEPIECE_TYPE._serialized_start=2323
|
| 43 |
+
_MODELPROTO_SENTENCEPIECE_TYPE._serialized_end=2407
|
| 44 |
+
# @@protoc_insertion_point(module_scope)
|
source/sentencepiece/sentencepiece_pb2.py
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
# Generated by the protocol buffer compiler. DO NOT EDIT!
|
| 3 |
+
# source: sentencepiece.proto
|
| 4 |
+
"""Generated protocol buffer code."""
|
| 5 |
+
from google.protobuf.internal import builder as _builder
|
| 6 |
+
from google.protobuf import descriptor as _descriptor
|
| 7 |
+
from google.protobuf import descriptor_pool as _descriptor_pool
|
| 8 |
+
from google.protobuf import symbol_database as _symbol_database
|
| 9 |
+
# @@protoc_insertion_point(imports)
|
| 10 |
+
|
| 11 |
+
_sym_db = _symbol_database.Default()
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x13sentencepiece.proto\x12\rsentencepiece\"\xdf\x01\n\x11SentencePieceText\x12\x0c\n\x04text\x18\x01 \x01(\t\x12>\n\x06pieces\x18\x02 \x03(\x0b\x32..sentencepiece.SentencePieceText.SentencePiece\x12\r\n\x05score\x18\x03 \x01(\x02\x1a\x62\n\rSentencePiece\x12\r\n\x05piece\x18\x01 \x01(\t\x12\n\n\x02id\x18\x02 \x01(\r\x12\x0f\n\x07surface\x18\x03 \x01(\t\x12\r\n\x05\x62\x65gin\x18\x04 \x01(\r\x12\x0b\n\x03\x65nd\x18\x05 \x01(\r*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02\"J\n\x16NBestSentencePieceText\x12\x30\n\x06nbests\x18\x01 \x03(\x0b\x32 .sentencepiece.SentencePieceTextB\x02H\x03')
|
| 17 |
+
|
| 18 |
+
_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, globals())
|
| 19 |
+
_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'sentencepiece_pb2', globals())
|
| 20 |
+
if _descriptor._USE_C_DESCRIPTORS == False:
|
| 21 |
+
|
| 22 |
+
DESCRIPTOR._options = None
|
| 23 |
+
DESCRIPTOR._serialized_options = b'H\003'
|
| 24 |
+
_SENTENCEPIECETEXT._serialized_start=39
|
| 25 |
+
_SENTENCEPIECETEXT._serialized_end=262
|
| 26 |
+
_SENTENCEPIECETEXT_SENTENCEPIECE._serialized_start=153
|
| 27 |
+
_SENTENCEPIECETEXT_SENTENCEPIECE._serialized_end=251
|
| 28 |
+
_NBESTSENTENCEPIECETEXT._serialized_start=264
|
| 29 |
+
_NBESTSENTENCEPIECETEXT._serialized_end=338
|
| 30 |
+
# @@protoc_insertion_point(module_scope)
|
source/sentencepiece/sentencepiece_wrap.cxx
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
source/sentry_sdk-2.53.0.dist-info/INSTALLER
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
pip
|
source/sentry_sdk-2.53.0.dist-info/METADATA
ADDED
|
@@ -0,0 +1,268 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Metadata-Version: 2.4
|
| 2 |
+
Name: sentry-sdk
|
| 3 |
+
Version: 2.53.0
|
| 4 |
+
Summary: Python client for Sentry (https://sentry.io)
|
| 5 |
+
Home-page: https://github.com/getsentry/sentry-python
|
| 6 |
+
Author: Sentry Team and Contributors
|
| 7 |
+
Author-email: hello@sentry.io
|
| 8 |
+
License: MIT
|
| 9 |
+
Project-URL: Documentation, https://docs.sentry.io/platforms/python/
|
| 10 |
+
Project-URL: Changelog, https://github.com/getsentry/sentry-python/blob/master/CHANGELOG.md
|
| 11 |
+
Classifier: Development Status :: 5 - Production/Stable
|
| 12 |
+
Classifier: Environment :: Web Environment
|
| 13 |
+
Classifier: Intended Audience :: Developers
|
| 14 |
+
Classifier: License :: OSI Approved :: BSD License
|
| 15 |
+
Classifier: Operating System :: OS Independent
|
| 16 |
+
Classifier: Programming Language :: Python
|
| 17 |
+
Classifier: Programming Language :: Python :: 3
|
| 18 |
+
Classifier: Programming Language :: Python :: 3.6
|
| 19 |
+
Classifier: Programming Language :: Python :: 3.7
|
| 20 |
+
Classifier: Programming Language :: Python :: 3.8
|
| 21 |
+
Classifier: Programming Language :: Python :: 3.9
|
| 22 |
+
Classifier: Programming Language :: Python :: 3.10
|
| 23 |
+
Classifier: Programming Language :: Python :: 3.11
|
| 24 |
+
Classifier: Programming Language :: Python :: 3.12
|
| 25 |
+
Classifier: Programming Language :: Python :: 3.13
|
| 26 |
+
Classifier: Programming Language :: Python :: 3.14
|
| 27 |
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
| 28 |
+
Requires-Python: >=3.6
|
| 29 |
+
Description-Content-Type: text/markdown
|
| 30 |
+
License-File: LICENSE
|
| 31 |
+
Requires-Dist: urllib3>=1.26.11
|
| 32 |
+
Requires-Dist: certifi
|
| 33 |
+
Provides-Extra: aiohttp
|
| 34 |
+
Requires-Dist: aiohttp>=3.5; extra == "aiohttp"
|
| 35 |
+
Provides-Extra: anthropic
|
| 36 |
+
Requires-Dist: anthropic>=0.16; extra == "anthropic"
|
| 37 |
+
Provides-Extra: arq
|
| 38 |
+
Requires-Dist: arq>=0.23; extra == "arq"
|
| 39 |
+
Provides-Extra: asyncpg
|
| 40 |
+
Requires-Dist: asyncpg>=0.23; extra == "asyncpg"
|
| 41 |
+
Provides-Extra: beam
|
| 42 |
+
Requires-Dist: apache-beam>=2.12; extra == "beam"
|
| 43 |
+
Provides-Extra: bottle
|
| 44 |
+
Requires-Dist: bottle>=0.12.13; extra == "bottle"
|
| 45 |
+
Provides-Extra: celery
|
| 46 |
+
Requires-Dist: celery>=3; extra == "celery"
|
| 47 |
+
Provides-Extra: celery-redbeat
|
| 48 |
+
Requires-Dist: celery-redbeat>=2; extra == "celery-redbeat"
|
| 49 |
+
Provides-Extra: chalice
|
| 50 |
+
Requires-Dist: chalice>=1.16.0; extra == "chalice"
|
| 51 |
+
Provides-Extra: clickhouse-driver
|
| 52 |
+
Requires-Dist: clickhouse-driver>=0.2.0; extra == "clickhouse-driver"
|
| 53 |
+
Provides-Extra: django
|
| 54 |
+
Requires-Dist: django>=1.8; extra == "django"
|
| 55 |
+
Provides-Extra: falcon
|
| 56 |
+
Requires-Dist: falcon>=1.4; extra == "falcon"
|
| 57 |
+
Provides-Extra: fastapi
|
| 58 |
+
Requires-Dist: fastapi>=0.79.0; extra == "fastapi"
|
| 59 |
+
Provides-Extra: flask
|
| 60 |
+
Requires-Dist: flask>=0.11; extra == "flask"
|
| 61 |
+
Requires-Dist: blinker>=1.1; extra == "flask"
|
| 62 |
+
Requires-Dist: markupsafe; extra == "flask"
|
| 63 |
+
Provides-Extra: grpcio
|
| 64 |
+
Requires-Dist: grpcio>=1.21.1; extra == "grpcio"
|
| 65 |
+
Requires-Dist: protobuf>=3.8.0; extra == "grpcio"
|
| 66 |
+
Provides-Extra: http2
|
| 67 |
+
Requires-Dist: httpcore[http2]==1.*; extra == "http2"
|
| 68 |
+
Provides-Extra: httpx
|
| 69 |
+
Requires-Dist: httpx>=0.16.0; extra == "httpx"
|
| 70 |
+
Provides-Extra: huey
|
| 71 |
+
Requires-Dist: huey>=2; extra == "huey"
|
| 72 |
+
Provides-Extra: huggingface-hub
|
| 73 |
+
Requires-Dist: huggingface_hub>=0.22; extra == "huggingface-hub"
|
| 74 |
+
Provides-Extra: langchain
|
| 75 |
+
Requires-Dist: langchain>=0.0.210; extra == "langchain"
|
| 76 |
+
Provides-Extra: langgraph
|
| 77 |
+
Requires-Dist: langgraph>=0.6.6; extra == "langgraph"
|
| 78 |
+
Provides-Extra: launchdarkly
|
| 79 |
+
Requires-Dist: launchdarkly-server-sdk>=9.8.0; extra == "launchdarkly"
|
| 80 |
+
Provides-Extra: litellm
|
| 81 |
+
Requires-Dist: litellm>=1.77.5; extra == "litellm"
|
| 82 |
+
Provides-Extra: litestar
|
| 83 |
+
Requires-Dist: litestar>=2.0.0; extra == "litestar"
|
| 84 |
+
Provides-Extra: loguru
|
| 85 |
+
Requires-Dist: loguru>=0.5; extra == "loguru"
|
| 86 |
+
Provides-Extra: mcp
|
| 87 |
+
Requires-Dist: mcp>=1.15.0; extra == "mcp"
|
| 88 |
+
Provides-Extra: openai
|
| 89 |
+
Requires-Dist: openai>=1.0.0; extra == "openai"
|
| 90 |
+
Requires-Dist: tiktoken>=0.3.0; extra == "openai"
|
| 91 |
+
Provides-Extra: openfeature
|
| 92 |
+
Requires-Dist: openfeature-sdk>=0.7.1; extra == "openfeature"
|
| 93 |
+
Provides-Extra: opentelemetry
|
| 94 |
+
Requires-Dist: opentelemetry-distro>=0.35b0; extra == "opentelemetry"
|
| 95 |
+
Provides-Extra: opentelemetry-experimental
|
| 96 |
+
Requires-Dist: opentelemetry-distro; extra == "opentelemetry-experimental"
|
| 97 |
+
Provides-Extra: opentelemetry-otlp
|
| 98 |
+
Requires-Dist: opentelemetry-distro[otlp]>=0.35b0; extra == "opentelemetry-otlp"
|
| 99 |
+
Provides-Extra: pure-eval
|
| 100 |
+
Requires-Dist: pure_eval; extra == "pure-eval"
|
| 101 |
+
Requires-Dist: executing; extra == "pure-eval"
|
| 102 |
+
Requires-Dist: asttokens; extra == "pure-eval"
|
| 103 |
+
Provides-Extra: pydantic-ai
|
| 104 |
+
Requires-Dist: pydantic-ai>=1.0.0; extra == "pydantic-ai"
|
| 105 |
+
Provides-Extra: pymongo
|
| 106 |
+
Requires-Dist: pymongo>=3.1; extra == "pymongo"
|
| 107 |
+
Provides-Extra: pyspark
|
| 108 |
+
Requires-Dist: pyspark>=2.4.4; extra == "pyspark"
|
| 109 |
+
Provides-Extra: quart
|
| 110 |
+
Requires-Dist: quart>=0.16.1; extra == "quart"
|
| 111 |
+
Requires-Dist: blinker>=1.1; extra == "quart"
|
| 112 |
+
Provides-Extra: rq
|
| 113 |
+
Requires-Dist: rq>=0.6; extra == "rq"
|
| 114 |
+
Provides-Extra: sanic
|
| 115 |
+
Requires-Dist: sanic>=0.8; extra == "sanic"
|
| 116 |
+
Provides-Extra: sqlalchemy
|
| 117 |
+
Requires-Dist: sqlalchemy>=1.2; extra == "sqlalchemy"
|
| 118 |
+
Provides-Extra: starlette
|
| 119 |
+
Requires-Dist: starlette>=0.19.1; extra == "starlette"
|
| 120 |
+
Provides-Extra: starlite
|
| 121 |
+
Requires-Dist: starlite>=1.48; extra == "starlite"
|
| 122 |
+
Provides-Extra: statsig
|
| 123 |
+
Requires-Dist: statsig>=0.55.3; extra == "statsig"
|
| 124 |
+
Provides-Extra: tornado
|
| 125 |
+
Requires-Dist: tornado>=6; extra == "tornado"
|
| 126 |
+
Provides-Extra: unleash
|
| 127 |
+
Requires-Dist: UnleashClient>=6.0.1; extra == "unleash"
|
| 128 |
+
Provides-Extra: google-genai
|
| 129 |
+
Requires-Dist: google-genai>=1.29.0; extra == "google-genai"
|
| 130 |
+
Dynamic: author
|
| 131 |
+
Dynamic: author-email
|
| 132 |
+
Dynamic: classifier
|
| 133 |
+
Dynamic: description
|
| 134 |
+
Dynamic: description-content-type
|
| 135 |
+
Dynamic: home-page
|
| 136 |
+
Dynamic: license
|
| 137 |
+
Dynamic: license-file
|
| 138 |
+
Dynamic: project-url
|
| 139 |
+
Dynamic: provides-extra
|
| 140 |
+
Dynamic: requires-dist
|
| 141 |
+
Dynamic: requires-python
|
| 142 |
+
Dynamic: summary
|
| 143 |
+
|
| 144 |
+
<a href="https://sentry.io/?utm_source=github&utm_medium=logo" target="_blank">
|
| 145 |
+
<img src="https://sentry-brand.storage.googleapis.com/github-banners/github-sdk-python.png" alt="Sentry for Python">
|
| 146 |
+
</a>
|
| 147 |
+
<div align="center">
|
| 148 |
+
|
| 149 |
+
_Bad software is everywhere, and we're tired of it. Sentry is on a mission to help developers write better software faster, so we can get back to enjoying technology. If you want to join us
|
| 150 |
+
[<kbd>**Check out our open positions**</kbd>](https://sentry.io/careers/)_.
|
| 151 |
+
|
| 152 |
+
[](https://discord.com/invite/Ww9hbqr)
|
| 153 |
+
[](https://x.com/intent/follow?screen_name=sentry)
|
| 154 |
+
[](https://pypi.python.org/pypi/sentry-sdk)
|
| 155 |
+
<img src="https://img.shields.io/badge/python-3.7 | 3.8 | 3.9 | 3.10 | 3.11 | 3.12 | 3.13 | 3.14-blue.svg" alt="python">
|
| 156 |
+
[](https://github.com/getsentry/sentry-python/actions/workflows/ci.yml)
|
| 157 |
+
|
| 158 |
+
<br/>
|
| 159 |
+
|
| 160 |
+
</div>
|
| 161 |
+
|
| 162 |
+
|
| 163 |
+
# Official Sentry SDK for Python
|
| 164 |
+
|
| 165 |
+
Welcome to the official Python SDK for **[Sentry](http://sentry.io/)**.
|
| 166 |
+
|
| 167 |
+
|
| 168 |
+
## 📦 Getting Started
|
| 169 |
+
|
| 170 |
+
### Prerequisites
|
| 171 |
+
|
| 172 |
+
You need a Sentry [account](https://sentry.io/signup/) and [project](https://docs.sentry.io/product/projects/).
|
| 173 |
+
|
| 174 |
+
### Installation
|
| 175 |
+
|
| 176 |
+
Getting Sentry into your project is straightforward. Just run this command in your terminal:
|
| 177 |
+
|
| 178 |
+
```bash
|
| 179 |
+
pip install --upgrade sentry-sdk
|
| 180 |
+
```
|
| 181 |
+
|
| 182 |
+
### Basic Configuration
|
| 183 |
+
|
| 184 |
+
Here's a quick configuration example to get Sentry up and running:
|
| 185 |
+
|
| 186 |
+
```python
|
| 187 |
+
import sentry_sdk
|
| 188 |
+
|
| 189 |
+
sentry_sdk.init(
|
| 190 |
+
"https://12927b5f211046b575ee51fd8b1ac34f@o1.ingest.sentry.io/1", # Your DSN here
|
| 191 |
+
|
| 192 |
+
# Set traces_sample_rate to 1.0 to capture 100%
|
| 193 |
+
# of traces for performance monitoring.
|
| 194 |
+
traces_sample_rate=1.0,
|
| 195 |
+
)
|
| 196 |
+
```
|
| 197 |
+
|
| 198 |
+
With this configuration, Sentry will monitor for exceptions and performance issues.
|
| 199 |
+
|
| 200 |
+
### Quick Usage Example
|
| 201 |
+
|
| 202 |
+
To generate some events that will show up in Sentry, you can log messages or capture errors:
|
| 203 |
+
|
| 204 |
+
```python
|
| 205 |
+
import sentry_sdk
|
| 206 |
+
sentry_sdk.init(...) # same as above
|
| 207 |
+
|
| 208 |
+
sentry_sdk.capture_message("Hello Sentry!") # You'll see this in your Sentry dashboard.
|
| 209 |
+
|
| 210 |
+
raise ValueError("Oops, something went wrong!") # This will create an error event in Sentry.
|
| 211 |
+
```
|
| 212 |
+
|
| 213 |
+
|
| 214 |
+
## 📚 Documentation
|
| 215 |
+
|
| 216 |
+
For more details on advanced usage, integrations, and customization, check out the full documentation on [https://docs.sentry.io](https://docs.sentry.io/).
|
| 217 |
+
|
| 218 |
+
|
| 219 |
+
## 🧩 Integrations
|
| 220 |
+
|
| 221 |
+
Sentry integrates with a ton of popular Python libraries and frameworks, including [FastAPI](https://docs.sentry.io/platforms/python/integrations/fastapi/), [Django](https://docs.sentry.io/platforms/python/integrations/django/), [Celery](https://docs.sentry.io/platforms/python/integrations/celery/), [OpenAI](https://docs.sentry.io/platforms/python/integrations/openai/) and many, many more. Check out the [full list of integrations](https://docs.sentry.io/platforms/python/integrations/) to get the full picture.
|
| 222 |
+
|
| 223 |
+
|
| 224 |
+
## 🚧 Migrating Between Versions?
|
| 225 |
+
|
| 226 |
+
### From `1.x` to `2.x`
|
| 227 |
+
|
| 228 |
+
If you're using the older `1.x` version of the SDK, now's the time to upgrade to `2.x`. It includes significant upgrades and new features. Check our [migration guide](https://docs.sentry.io/platforms/python/migration/1.x-to-2.x) for assistance.
|
| 229 |
+
|
| 230 |
+
### From `raven-python`
|
| 231 |
+
|
| 232 |
+
Using the legacy `raven-python` client? It's now in maintenance mode, and we recommend migrating to the new SDK for an improved experience. Get all the details in our [migration guide](https://docs.sentry.io/platforms/python/migration/raven-to-sentry-sdk/).
|
| 233 |
+
|
| 234 |
+
|
| 235 |
+
## 🙌 Want to Contribute?
|
| 236 |
+
|
| 237 |
+
We'd love your help in improving the Sentry SDK! Whether it's fixing bugs, adding features, writing new integrations, or enhancing documentation, every contribution is valuable.
|
| 238 |
+
|
| 239 |
+
For details on how to contribute, please read our [contribution guide](CONTRIBUTING.md) and explore the [open issues](https://github.com/getsentry/sentry-python/issues).
|
| 240 |
+
|
| 241 |
+
|
| 242 |
+
## 🛟 Need Help?
|
| 243 |
+
|
| 244 |
+
If you encounter issues or need help setting up or configuring the SDK, don't hesitate to reach out to the [Sentry Community on Discord](https://discord.com/invite/Ww9hbqr). There is a ton of great people there ready to help!
|
| 245 |
+
|
| 246 |
+
|
| 247 |
+
## 🔗 Resources
|
| 248 |
+
|
| 249 |
+
Here are all resources to help you make the most of Sentry:
|
| 250 |
+
|
| 251 |
+
- [Documentation](https://docs.sentry.io/platforms/python/) - Official documentation to get started.
|
| 252 |
+
- [Discord](https://discord.com/invite/Ww9hbqr) - Join our Discord community.
|
| 253 |
+
- [X/Twitter](https://x.com/intent/follow?screen_name=sentry) - Follow us on X (Twitter) for updates.
|
| 254 |
+
- [Stack Overflow](https://stackoverflow.com/questions/tagged/sentry) - Questions and answers related to Sentry.
|
| 255 |
+
|
| 256 |
+
<a name="license"></a>
|
| 257 |
+
## 📃 License
|
| 258 |
+
|
| 259 |
+
The SDK is open-source and available under the MIT license. Check out the [LICENSE](LICENSE) file for more information.
|
| 260 |
+
|
| 261 |
+
|
| 262 |
+
## 😘 Contributors
|
| 263 |
+
|
| 264 |
+
Thanks to everyone who has helped improve the SDK!
|
| 265 |
+
|
| 266 |
+
<a href="https://github.com/getsentry/sentry-python/graphs/contributors">
|
| 267 |
+
<img src="https://contributors-img.web.app/image?repo=getsentry/sentry-python" />
|
| 268 |
+
</a>
|
source/sentry_sdk-2.53.0.dist-info/RECORD
ADDED
|
@@ -0,0 +1,386 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
sentry_sdk-2.53.0.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
|
| 2 |
+
sentry_sdk-2.53.0.dist-info/METADATA,sha256=rdAnlprh5icT8lL9Qh0lJQw2I6rQOBYbmwG-1XWwXNA,10815
|
| 3 |
+
sentry_sdk-2.53.0.dist-info/RECORD,,
|
| 4 |
+
sentry_sdk-2.53.0.dist-info/WHEEL,sha256=uC7DnXjtoKy23CNUXRq6ohLsm8FbS2C_ECfYYzqZHVY,109
|
| 5 |
+
sentry_sdk-2.53.0.dist-info/entry_points.txt,sha256=qacZEz40UspQZD1IukCXykx0JtImqGDOctS5KfOLTko,91
|
| 6 |
+
sentry_sdk-2.53.0.dist-info/licenses/LICENSE,sha256=KhQNZg9GKBL6KQvHQNBGMxJsXsRdhLebVp4Sew7t3Qs,1093
|
| 7 |
+
sentry_sdk-2.53.0.dist-info/top_level.txt,sha256=XrQz30XE9FKXSY_yGLrd9bsv2Rk390GTDJOSujYaMxI,11
|
| 8 |
+
sentry_sdk/__init__.py,sha256=cnZoQ9y329brs-cdzIVtxbO1-o9AIrKk8VTVyZNJs1A,1410
|
| 9 |
+
sentry_sdk/__pycache__/__init__.cpython-312.pyc,,
|
| 10 |
+
sentry_sdk/__pycache__/_batcher.cpython-312.pyc,,
|
| 11 |
+
sentry_sdk/__pycache__/_compat.cpython-312.pyc,,
|
| 12 |
+
sentry_sdk/__pycache__/_init_implementation.cpython-312.pyc,,
|
| 13 |
+
sentry_sdk/__pycache__/_log_batcher.cpython-312.pyc,,
|
| 14 |
+
sentry_sdk/__pycache__/_lru_cache.cpython-312.pyc,,
|
| 15 |
+
sentry_sdk/__pycache__/_metrics_batcher.cpython-312.pyc,,
|
| 16 |
+
sentry_sdk/__pycache__/_queue.cpython-312.pyc,,
|
| 17 |
+
sentry_sdk/__pycache__/_span_batcher.cpython-312.pyc,,
|
| 18 |
+
sentry_sdk/__pycache__/_types.cpython-312.pyc,,
|
| 19 |
+
sentry_sdk/__pycache__/_werkzeug.cpython-312.pyc,,
|
| 20 |
+
sentry_sdk/__pycache__/api.cpython-312.pyc,,
|
| 21 |
+
sentry_sdk/__pycache__/attachments.cpython-312.pyc,,
|
| 22 |
+
sentry_sdk/__pycache__/client.cpython-312.pyc,,
|
| 23 |
+
sentry_sdk/__pycache__/consts.cpython-312.pyc,,
|
| 24 |
+
sentry_sdk/__pycache__/debug.cpython-312.pyc,,
|
| 25 |
+
sentry_sdk/__pycache__/envelope.cpython-312.pyc,,
|
| 26 |
+
sentry_sdk/__pycache__/feature_flags.cpython-312.pyc,,
|
| 27 |
+
sentry_sdk/__pycache__/hub.cpython-312.pyc,,
|
| 28 |
+
sentry_sdk/__pycache__/logger.cpython-312.pyc,,
|
| 29 |
+
sentry_sdk/__pycache__/metrics.cpython-312.pyc,,
|
| 30 |
+
sentry_sdk/__pycache__/monitor.cpython-312.pyc,,
|
| 31 |
+
sentry_sdk/__pycache__/scope.cpython-312.pyc,,
|
| 32 |
+
sentry_sdk/__pycache__/scrubber.cpython-312.pyc,,
|
| 33 |
+
sentry_sdk/__pycache__/serializer.cpython-312.pyc,,
|
| 34 |
+
sentry_sdk/__pycache__/session.cpython-312.pyc,,
|
| 35 |
+
sentry_sdk/__pycache__/sessions.cpython-312.pyc,,
|
| 36 |
+
sentry_sdk/__pycache__/spotlight.cpython-312.pyc,,
|
| 37 |
+
sentry_sdk/__pycache__/traces.cpython-312.pyc,,
|
| 38 |
+
sentry_sdk/__pycache__/tracing.cpython-312.pyc,,
|
| 39 |
+
sentry_sdk/__pycache__/tracing_utils.cpython-312.pyc,,
|
| 40 |
+
sentry_sdk/__pycache__/transport.cpython-312.pyc,,
|
| 41 |
+
sentry_sdk/__pycache__/types.cpython-312.pyc,,
|
| 42 |
+
sentry_sdk/__pycache__/utils.cpython-312.pyc,,
|
| 43 |
+
sentry_sdk/__pycache__/worker.cpython-312.pyc,,
|
| 44 |
+
sentry_sdk/_batcher.py,sha256=3Elbey1gpWzYSo-Dr5xxP00QpYcRYb8FGYqei9U_mdE,4004
|
| 45 |
+
sentry_sdk/_compat.py,sha256=tSI9LTAlSw7x_putvg7qg5SSdo7W1EwpbYiB_godUMQ,3065
|
| 46 |
+
sentry_sdk/_init_implementation.py,sha256=xualjTVpMB8XbJUX-30eAnuiyPfDeoeZoer3iIl0mH4,2491
|
| 47 |
+
sentry_sdk/_log_batcher.py,sha256=6Y3VNP6a65t1gzND9Rz9356xj2ZXOI1rB8Wq2De7Cpc,1827
|
| 48 |
+
sentry_sdk/_lru_cache.py,sha256=YSnCmL1qArB6_dsfcwKZgO4eJhBak87HqjqbUzBNQ7A,1167
|
| 49 |
+
sentry_sdk/_metrics_batcher.py,sha256=c7aw1sG7rldcS4XrpKAX59AGMXZ74Vx9Ja7peQo7MPc,1273
|
| 50 |
+
sentry_sdk/_queue.py,sha256=bluPs51jz8d3w74aSolOZYc3jnwmyTGPKvKxv7Z2LL0,11244
|
| 51 |
+
sentry_sdk/_span_batcher.py,sha256=EscEhZvzyJIby2taeOfRL7m6hm2RwptdmXwk0b8k36o,4406
|
| 52 |
+
sentry_sdk/_types.py,sha256=bcDjVByhfqJoYB2deda3oZBYv73gRanJ3WfFM2WagXM,11018
|
| 53 |
+
sentry_sdk/_werkzeug.py,sha256=vQL3Z_q3OBQ8vEby2ktiY7Ey8szMDWZsUe-BGJ9A3oY,3716
|
| 54 |
+
sentry_sdk/ai/__init__.py,sha256=L2EAYEx9075dUVZGEB_CBxSgwaLeYLFWhBYfIEeBDRg,218
|
| 55 |
+
sentry_sdk/ai/__pycache__/__init__.cpython-312.pyc,,
|
| 56 |
+
sentry_sdk/ai/__pycache__/_openai_completions_api.cpython-312.pyc,,
|
| 57 |
+
sentry_sdk/ai/__pycache__/_openai_responses_api.cpython-312.pyc,,
|
| 58 |
+
sentry_sdk/ai/__pycache__/monitoring.cpython-312.pyc,,
|
| 59 |
+
sentry_sdk/ai/__pycache__/utils.cpython-312.pyc,,
|
| 60 |
+
sentry_sdk/ai/_openai_completions_api.py,sha256=MeLmyCCd-PQLWRN4RHgCm7bK6QXBjU74Df0ZGHDDDCg,1821
|
| 61 |
+
sentry_sdk/ai/_openai_responses_api.py,sha256=xN2fbP2hNibB-kW0mbA7L9s_g_JrYhyra7V4S9xF8kU,656
|
| 62 |
+
sentry_sdk/ai/monitoring.py,sha256=PRYQXdKh2CdIttIKcCWhQKebGHlBHn6b0lKWAmhK7MQ,5539
|
| 63 |
+
sentry_sdk/ai/utils.py,sha256=VtyghTEHI_3KRrEltYpLlIOO5FX5eD_Og1l7sqA9mx4,23837
|
| 64 |
+
sentry_sdk/api.py,sha256=p4ABg2gX0Lx5T6nbi-D8LIo-p3lB37IcI91I_bEoJYg,14668
|
| 65 |
+
sentry_sdk/attachments.py,sha256=fvmTzYvG2a2mjNzTzcja28ize87kEEZMtXwK1rmXpBc,3023
|
| 66 |
+
sentry_sdk/client.py,sha256=jhdrYnioKRqwZy66VgqrwGNyTewajkd1stPU0iZqZLA,38867
|
| 67 |
+
sentry_sdk/consts.py,sha256=ndv79s9CYhXd5N1S5a_F6MiO2lAimC2k69aSZYQU2kM,55042
|
| 68 |
+
sentry_sdk/crons/__init__.py,sha256=3Zt6g1-pZZ12uRKKsC8QLm3XgJ4K1VYxgVpNNUygOZY,221
|
| 69 |
+
sentry_sdk/crons/__pycache__/__init__.cpython-312.pyc,,
|
| 70 |
+
sentry_sdk/crons/__pycache__/api.cpython-312.pyc,,
|
| 71 |
+
sentry_sdk/crons/__pycache__/consts.cpython-312.pyc,,
|
| 72 |
+
sentry_sdk/crons/__pycache__/decorator.cpython-312.pyc,,
|
| 73 |
+
sentry_sdk/crons/api.py,sha256=gwtNjHZxZy51piEwYrnrYg3bZkifrdkSRlDAZYC30Kg,1679
|
| 74 |
+
sentry_sdk/crons/consts.py,sha256=dXqJk5meBSu5rjlGpqAOlkpACnuUi7svQnAFoy1ZNUU,87
|
| 75 |
+
sentry_sdk/crons/decorator.py,sha256=4fYEHYlhdtKyhzJr1HZZ7g2ierEJ3ULZe4N7a3jqbYE,3834
|
| 76 |
+
sentry_sdk/debug.py,sha256=63f8uUowpbZ75E8q-r_8Tdymsv9jXZn1VMh287VcWhk,959
|
| 77 |
+
sentry_sdk/envelope.py,sha256=q0CtNI9IgYPTlviQyp8HB8aeIxLVcklMIqaQ-LU_HpU,9652
|
| 78 |
+
sentry_sdk/feature_flags.py,sha256=J9G-nnQiXimYbhkPNDGz_CjAccYlbXS9KgybExBZjUI,2136
|
| 79 |
+
sentry_sdk/hub.py,sha256=sVkOrpjU0cNhxOFtWfxA8Nc_3T8hiGH-mVFZa4tX1Us,25147
|
| 80 |
+
sentry_sdk/integrations/__init__.py,sha256=U52rckcGzXoe73On-T70R64feEIg1ir1O1NSSkQvHzg,12674
|
| 81 |
+
sentry_sdk/integrations/__pycache__/__init__.cpython-312.pyc,,
|
| 82 |
+
sentry_sdk/integrations/__pycache__/_asgi_common.cpython-312.pyc,,
|
| 83 |
+
sentry_sdk/integrations/__pycache__/_wsgi_common.cpython-312.pyc,,
|
| 84 |
+
sentry_sdk/integrations/__pycache__/aiohttp.cpython-312.pyc,,
|
| 85 |
+
sentry_sdk/integrations/__pycache__/anthropic.cpython-312.pyc,,
|
| 86 |
+
sentry_sdk/integrations/__pycache__/argv.cpython-312.pyc,,
|
| 87 |
+
sentry_sdk/integrations/__pycache__/ariadne.cpython-312.pyc,,
|
| 88 |
+
sentry_sdk/integrations/__pycache__/arq.cpython-312.pyc,,
|
| 89 |
+
sentry_sdk/integrations/__pycache__/asgi.cpython-312.pyc,,
|
| 90 |
+
sentry_sdk/integrations/__pycache__/asyncio.cpython-312.pyc,,
|
| 91 |
+
sentry_sdk/integrations/__pycache__/asyncpg.cpython-312.pyc,,
|
| 92 |
+
sentry_sdk/integrations/__pycache__/atexit.cpython-312.pyc,,
|
| 93 |
+
sentry_sdk/integrations/__pycache__/aws_lambda.cpython-312.pyc,,
|
| 94 |
+
sentry_sdk/integrations/__pycache__/beam.cpython-312.pyc,,
|
| 95 |
+
sentry_sdk/integrations/__pycache__/boto3.cpython-312.pyc,,
|
| 96 |
+
sentry_sdk/integrations/__pycache__/bottle.cpython-312.pyc,,
|
| 97 |
+
sentry_sdk/integrations/__pycache__/chalice.cpython-312.pyc,,
|
| 98 |
+
sentry_sdk/integrations/__pycache__/clickhouse_driver.cpython-312.pyc,,
|
| 99 |
+
sentry_sdk/integrations/__pycache__/cloud_resource_context.cpython-312.pyc,,
|
| 100 |
+
sentry_sdk/integrations/__pycache__/cohere.cpython-312.pyc,,
|
| 101 |
+
sentry_sdk/integrations/__pycache__/dedupe.cpython-312.pyc,,
|
| 102 |
+
sentry_sdk/integrations/__pycache__/dramatiq.cpython-312.pyc,,
|
| 103 |
+
sentry_sdk/integrations/__pycache__/excepthook.cpython-312.pyc,,
|
| 104 |
+
sentry_sdk/integrations/__pycache__/executing.cpython-312.pyc,,
|
| 105 |
+
sentry_sdk/integrations/__pycache__/falcon.cpython-312.pyc,,
|
| 106 |
+
sentry_sdk/integrations/__pycache__/fastapi.cpython-312.pyc,,
|
| 107 |
+
sentry_sdk/integrations/__pycache__/flask.cpython-312.pyc,,
|
| 108 |
+
sentry_sdk/integrations/__pycache__/gcp.cpython-312.pyc,,
|
| 109 |
+
sentry_sdk/integrations/__pycache__/gnu_backtrace.cpython-312.pyc,,
|
| 110 |
+
sentry_sdk/integrations/__pycache__/gql.cpython-312.pyc,,
|
| 111 |
+
sentry_sdk/integrations/__pycache__/graphene.cpython-312.pyc,,
|
| 112 |
+
sentry_sdk/integrations/__pycache__/httpx.cpython-312.pyc,,
|
| 113 |
+
sentry_sdk/integrations/__pycache__/huey.cpython-312.pyc,,
|
| 114 |
+
sentry_sdk/integrations/__pycache__/huggingface_hub.cpython-312.pyc,,
|
| 115 |
+
sentry_sdk/integrations/__pycache__/langchain.cpython-312.pyc,,
|
| 116 |
+
sentry_sdk/integrations/__pycache__/langgraph.cpython-312.pyc,,
|
| 117 |
+
sentry_sdk/integrations/__pycache__/launchdarkly.cpython-312.pyc,,
|
| 118 |
+
sentry_sdk/integrations/__pycache__/litellm.cpython-312.pyc,,
|
| 119 |
+
sentry_sdk/integrations/__pycache__/litestar.cpython-312.pyc,,
|
| 120 |
+
sentry_sdk/integrations/__pycache__/logging.cpython-312.pyc,,
|
| 121 |
+
sentry_sdk/integrations/__pycache__/loguru.cpython-312.pyc,,
|
| 122 |
+
sentry_sdk/integrations/__pycache__/mcp.cpython-312.pyc,,
|
| 123 |
+
sentry_sdk/integrations/__pycache__/modules.cpython-312.pyc,,
|
| 124 |
+
sentry_sdk/integrations/__pycache__/openai.cpython-312.pyc,,
|
| 125 |
+
sentry_sdk/integrations/__pycache__/openfeature.cpython-312.pyc,,
|
| 126 |
+
sentry_sdk/integrations/__pycache__/otlp.cpython-312.pyc,,
|
| 127 |
+
sentry_sdk/integrations/__pycache__/pure_eval.cpython-312.pyc,,
|
| 128 |
+
sentry_sdk/integrations/__pycache__/pymongo.cpython-312.pyc,,
|
| 129 |
+
sentry_sdk/integrations/__pycache__/pyramid.cpython-312.pyc,,
|
| 130 |
+
sentry_sdk/integrations/__pycache__/quart.cpython-312.pyc,,
|
| 131 |
+
sentry_sdk/integrations/__pycache__/ray.cpython-312.pyc,,
|
| 132 |
+
sentry_sdk/integrations/__pycache__/rq.cpython-312.pyc,,
|
| 133 |
+
sentry_sdk/integrations/__pycache__/rust_tracing.cpython-312.pyc,,
|
| 134 |
+
sentry_sdk/integrations/__pycache__/sanic.cpython-312.pyc,,
|
| 135 |
+
sentry_sdk/integrations/__pycache__/serverless.cpython-312.pyc,,
|
| 136 |
+
sentry_sdk/integrations/__pycache__/socket.cpython-312.pyc,,
|
| 137 |
+
sentry_sdk/integrations/__pycache__/sqlalchemy.cpython-312.pyc,,
|
| 138 |
+
sentry_sdk/integrations/__pycache__/starlette.cpython-312.pyc,,
|
| 139 |
+
sentry_sdk/integrations/__pycache__/starlite.cpython-312.pyc,,
|
| 140 |
+
sentry_sdk/integrations/__pycache__/statsig.cpython-312.pyc,,
|
| 141 |
+
sentry_sdk/integrations/__pycache__/stdlib.cpython-312.pyc,,
|
| 142 |
+
sentry_sdk/integrations/__pycache__/strawberry.cpython-312.pyc,,
|
| 143 |
+
sentry_sdk/integrations/__pycache__/sys_exit.cpython-312.pyc,,
|
| 144 |
+
sentry_sdk/integrations/__pycache__/threading.cpython-312.pyc,,
|
| 145 |
+
sentry_sdk/integrations/__pycache__/tornado.cpython-312.pyc,,
|
| 146 |
+
sentry_sdk/integrations/__pycache__/trytond.cpython-312.pyc,,
|
| 147 |
+
sentry_sdk/integrations/__pycache__/typer.cpython-312.pyc,,
|
| 148 |
+
sentry_sdk/integrations/__pycache__/unleash.cpython-312.pyc,,
|
| 149 |
+
sentry_sdk/integrations/__pycache__/unraisablehook.cpython-312.pyc,,
|
| 150 |
+
sentry_sdk/integrations/__pycache__/wsgi.cpython-312.pyc,,
|
| 151 |
+
sentry_sdk/integrations/_asgi_common.py,sha256=qY2nH21YGtQ1EufzdrovHtGZ-ZaMbSzYNKCo8z-kD44,3145
|
| 152 |
+
sentry_sdk/integrations/_wsgi_common.py,sha256=ffyvbYSYtVNo8wkzx7tt1T7anuMiu2ixApZmC0O9NH4,7282
|
| 153 |
+
sentry_sdk/integrations/aiohttp.py,sha256=vQrqPm_IVgvgQo4PLEyDNrvA5oloz1xALOFtVDTtOpE,12930
|
| 154 |
+
sentry_sdk/integrations/anthropic.py,sha256=CDLZf73v8HBj-OxDzJHlgmecXFHeIIYKoem2kdHmKP4,21222
|
| 155 |
+
sentry_sdk/integrations/argv.py,sha256=3dyTYcNVhx-EoCxzZI8-JO1mxydBo_zHxIKHpLJgJcc,877
|
| 156 |
+
sentry_sdk/integrations/ariadne.py,sha256=u1X6QEcXfwBXa4mmHkgcj8XMoowwBGIWQQYAsItpjwg,5792
|
| 157 |
+
sentry_sdk/integrations/arq.py,sha256=3hs8Y41X3rjxCd769CHjUBYQe3YgULtbX6WQQB5-kwQ,7906
|
| 158 |
+
sentry_sdk/integrations/asgi.py,sha256=AoJcWDRkmm5vqSMH44V2zx00BiT9Kzqfo3pd3Eyi6D8,12687
|
| 159 |
+
sentry_sdk/integrations/asyncio.py,sha256=nszg5HkxWLoTzfS3EZ9hBzsCC7JM8JKOBFXbNXnCDpI,6994
|
| 160 |
+
sentry_sdk/integrations/asyncpg.py,sha256=T0Ijpln-xoQCuY0AS15dYZK2_1XCdKYT1J9EfdSQURo,6578
|
| 161 |
+
sentry_sdk/integrations/atexit.py,sha256=eUcgC6f7WoqJadxDbn76dBv6zoS97kgyxshjGBNK1us,1567
|
| 162 |
+
sentry_sdk/integrations/aws_lambda.py,sha256=HDvJk2EpOjS0_2wkTXc9lQr2w-huT7v3j4_-Pxq0KPw,17895
|
| 163 |
+
sentry_sdk/integrations/beam.py,sha256=j-BzEzAHXk12HJ8izUyIOaM02d29BZUpYldBMCq8vdI,5092
|
| 164 |
+
sentry_sdk/integrations/boto3.py,sha256=4HLGehOoU8rc-85yL5ATUUZL9x_TGYjPkNYHOcT4Z2w,4341
|
| 165 |
+
sentry_sdk/integrations/bottle.py,sha256=Im8xqoQvrygqKHsK-rtaX41gOaY_hxb4anjgJnZQGu4,6404
|
| 166 |
+
sentry_sdk/integrations/celery/__init__.py,sha256=LTWUX6mEsf93fITkcLUV1exm7Rk12DY6JUGKH_gPXCI,18497
|
| 167 |
+
sentry_sdk/integrations/celery/__pycache__/__init__.cpython-312.pyc,,
|
| 168 |
+
sentry_sdk/integrations/celery/__pycache__/beat.cpython-312.pyc,,
|
| 169 |
+
sentry_sdk/integrations/celery/__pycache__/utils.cpython-312.pyc,,
|
| 170 |
+
sentry_sdk/integrations/celery/beat.py,sha256=RCG2yAyJSyWtzDGw6B1M_w3TXDvPnznAt1dhfFcLBDI,8839
|
| 171 |
+
sentry_sdk/integrations/celery/utils.py,sha256=KyaM868RGak0JeFHkQVYtR1CkKRNpSD4_bbpisF5jqo,1152
|
| 172 |
+
sentry_sdk/integrations/chalice.py,sha256=nd_J4MnPZzoVdH4M5HV_-ljeS8fbxe_5DXmKJm6zqSI,4663
|
| 173 |
+
sentry_sdk/integrations/clickhouse_driver.py,sha256=mNYW99LlIOCk45aUeELY9cuHTfBorhwV0rGn2c_25EY,5905
|
| 174 |
+
sentry_sdk/integrations/cloud_resource_context.py,sha256=J_3Zd9czyI2NuWy09BEwGKzw80rQlVlJ1CQqTvJ5m18,7638
|
| 175 |
+
sentry_sdk/integrations/cohere.py,sha256=QxgvN7fy385-EwUPYFup98UiMxxg_zrE8h0yCfI5iT8,9495
|
| 176 |
+
sentry_sdk/integrations/dedupe.py,sha256=EApcjbAp_uEmsOSb0m64JNGaKUEqB46YCYX8AgTDO24,1903
|
| 177 |
+
sentry_sdk/integrations/django/__init__.py,sha256=T2N0ZnN4aI4AGJvoyNtYBBXtAKPrN3suuQPQi1Ks_r8,26421
|
| 178 |
+
sentry_sdk/integrations/django/__pycache__/__init__.cpython-312.pyc,,
|
| 179 |
+
sentry_sdk/integrations/django/__pycache__/asgi.cpython-312.pyc,,
|
| 180 |
+
sentry_sdk/integrations/django/__pycache__/caching.cpython-312.pyc,,
|
| 181 |
+
sentry_sdk/integrations/django/__pycache__/middleware.cpython-312.pyc,,
|
| 182 |
+
sentry_sdk/integrations/django/__pycache__/signals_handlers.cpython-312.pyc,,
|
| 183 |
+
sentry_sdk/integrations/django/__pycache__/tasks.cpython-312.pyc,,
|
| 184 |
+
sentry_sdk/integrations/django/__pycache__/templates.cpython-312.pyc,,
|
| 185 |
+
sentry_sdk/integrations/django/__pycache__/transactions.cpython-312.pyc,,
|
| 186 |
+
sentry_sdk/integrations/django/__pycache__/views.cpython-312.pyc,,
|
| 187 |
+
sentry_sdk/integrations/django/asgi.py,sha256=xi4ceuIgjFsvyBwzGOwHj_siLHqeYHuL5oLfPqDxPKQ,8436
|
| 188 |
+
sentry_sdk/integrations/django/caching.py,sha256=gXRDSZigV1SETmxDUe8GA53Tha2fo0ZAFOofuorQD30,6980
|
| 189 |
+
sentry_sdk/integrations/django/middleware.py,sha256=kSN4ORVUuO8Yvt0sC6YS8br2gk6gLmBwysIFsN9z2js,5917
|
| 190 |
+
sentry_sdk/integrations/django/signals_handlers.py,sha256=E1r1sYzKQHko6fD8bPC7_rXGH5kBVDwmE2sutWuGAwQ,3062
|
| 191 |
+
sentry_sdk/integrations/django/tasks.py,sha256=Bu_5jxvFNcAk9N8PedxhWP5Fq-Mj8zVRYdDRsd-6eJA,1151
|
| 192 |
+
sentry_sdk/integrations/django/templates.py,sha256=NZuSVDU0srkF_La5BpEIGqonCbK-qLKMMI3bqj2za3w,5695
|
| 193 |
+
sentry_sdk/integrations/django/transactions.py,sha256=zjsJgVFJZX-c5H_I-TXeLrvrw3OvgAWgmxYO5nTdhfQ,4918
|
| 194 |
+
sentry_sdk/integrations/django/views.py,sha256=Cz2H_yeZkPixOksVoi5urp6Xeq5p0YdDsBWz5aEJan4,3256
|
| 195 |
+
sentry_sdk/integrations/dramatiq.py,sha256=OIq0lIUbabt9widLKGkyjhvx9MKXNF9atP1gTsLheuQ,7438
|
| 196 |
+
sentry_sdk/integrations/excepthook.py,sha256=lhbIwr1eIj3k8shtmUGDOWI7Qr-OaT5kbgeHj-F217g,2371
|
| 197 |
+
sentry_sdk/integrations/executing.py,sha256=O5R4pw999PIisy7uksrMkRx0SsmfBFKI1BEpI3cTDz4,1981
|
| 198 |
+
sentry_sdk/integrations/falcon.py,sha256=zWKFTzoOY92BryGKsjKDAhVxFDhyop4G02js6OlmIXY,9277
|
| 199 |
+
sentry_sdk/integrations/fastapi.py,sha256=eebHCt2FiR7YQCaRJVOnXurltMZ4phRCZaL6BXSEpVI,4485
|
| 200 |
+
sentry_sdk/integrations/flask.py,sha256=KGG2Jbcw9SdwADJLg5GmBnVq3VywK7qQwdPv97RldmA,8519
|
| 201 |
+
sentry_sdk/integrations/gcp.py,sha256=Fao0nrrBpiG8_GHygzh6VvmM8bpJq0mkvgTwu5LT1a4,8400
|
| 202 |
+
sentry_sdk/integrations/gnu_backtrace.py,sha256=XND4pkVeprhDdvPgOi8pBgQQ9g3CtPA-wJK0e0ex1XE,2771
|
| 203 |
+
sentry_sdk/integrations/google_genai/__init__.py,sha256=Z-NX8wKGkSwlQupCcCgsarGV3qtpmeiT_wGNqpxweSM,14093
|
| 204 |
+
sentry_sdk/integrations/google_genai/__pycache__/__init__.cpython-312.pyc,,
|
| 205 |
+
sentry_sdk/integrations/google_genai/__pycache__/consts.cpython-312.pyc,,
|
| 206 |
+
sentry_sdk/integrations/google_genai/__pycache__/streaming.cpython-312.pyc,,
|
| 207 |
+
sentry_sdk/integrations/google_genai/__pycache__/utils.cpython-312.pyc,,
|
| 208 |
+
sentry_sdk/integrations/google_genai/consts.py,sha256=nqHKKSyGixrSoozA06BGVBFaUCsvZlvGoubUZGI1kB8,559
|
| 209 |
+
sentry_sdk/integrations/google_genai/streaming.py,sha256=JqhTcAX3DtEsZ033AXsnA0KAzQe_jqca-7cztTKDcRI,5416
|
| 210 |
+
sentry_sdk/integrations/google_genai/utils.py,sha256=WCTyevkQF2CdGpdlidONJSWSdDmnqEa0dcaiRX2luuk,34850
|
| 211 |
+
sentry_sdk/integrations/gql.py,sha256=oLssOmHYQ4e0OQ1ITer8TynvyLXtDUoM9X29-IsVAkI,5003
|
| 212 |
+
sentry_sdk/integrations/graphene.py,sha256=11XRvAsSgvCLcfF1xvdSeAf0vvJm72hfcE3toAS8jZI,5066
|
| 213 |
+
sentry_sdk/integrations/grpc/__init__.py,sha256=0Y3neOWwES1PsWJsluPF_2_VatFlVvg7IUjTYYdCs-U,6253
|
| 214 |
+
sentry_sdk/integrations/grpc/__pycache__/__init__.cpython-312.pyc,,
|
| 215 |
+
sentry_sdk/integrations/grpc/__pycache__/client.cpython-312.pyc,,
|
| 216 |
+
sentry_sdk/integrations/grpc/__pycache__/consts.cpython-312.pyc,,
|
| 217 |
+
sentry_sdk/integrations/grpc/__pycache__/server.cpython-312.pyc,,
|
| 218 |
+
sentry_sdk/integrations/grpc/aio/__init__.py,sha256=2rgrliowpPfLLw40_2YU6ixSzIu_3f8NN3TRplzc8S8,141
|
| 219 |
+
sentry_sdk/integrations/grpc/aio/__pycache__/__init__.cpython-312.pyc,,
|
| 220 |
+
sentry_sdk/integrations/grpc/aio/__pycache__/client.cpython-312.pyc,,
|
| 221 |
+
sentry_sdk/integrations/grpc/aio/__pycache__/server.cpython-312.pyc,,
|
| 222 |
+
sentry_sdk/integrations/grpc/aio/client.py,sha256=2GNrTWJt4iZ9dK-M3_0-sVWlUcfqKItHZI5u-uzesUM,3507
|
| 223 |
+
sentry_sdk/integrations/grpc/aio/server.py,sha256=cRBvz9GOJemvNOsCB9ymTUX70nhjljfzvX0bfR8Wc_o,3954
|
| 224 |
+
sentry_sdk/integrations/grpc/client.py,sha256=s7yLcbYgnq1-bdh6jvSOv6NwKTx0m9tLPt0q4qKI7kw,3404
|
| 225 |
+
sentry_sdk/integrations/grpc/consts.py,sha256=NpsN5gKWDmtGtVK_L5HscgFZBHqjOpmLJLGKyh8GZBA,31
|
| 226 |
+
sentry_sdk/integrations/grpc/server.py,sha256=h6vEibthsZB6hWNmqT50UlBaddEZSdbXio_rYhighW0,2470
|
| 227 |
+
sentry_sdk/integrations/httpx.py,sha256=MxoT_D8n902awI2hq-lHpxXkOvNwGZFuG9ap4l4VxL4,5315
|
| 228 |
+
sentry_sdk/integrations/huey.py,sha256=GOslGxuYfrdPLqx_4MWOH_tdwwnbD72SkTdc-P8uF80,5348
|
| 229 |
+
sentry_sdk/integrations/huggingface_hub.py,sha256=axCqdDlmYn7wANHgVBo3MDv9QgBr0MWj3lBYhPJJX7E,14966
|
| 230 |
+
sentry_sdk/integrations/langchain.py,sha256=Oa-YJ2WoHX8Huyrzs4aQ2xlaupFR5Ft8FC4_0b_zvBs,42903
|
| 231 |
+
sentry_sdk/integrations/langgraph.py,sha256=xb9InjYjd4jAQECWddl30QGRoHRbbhoYBeHscsM7C1A,13301
|
| 232 |
+
sentry_sdk/integrations/launchdarkly.py,sha256=RSTCAUOJCo6Npr4V3-7-vSOaZ531Jj1LNU5IftXL6_w,1918
|
| 233 |
+
sentry_sdk/integrations/litellm.py,sha256=1CLM3CBdIv0YZSloDPCfemIdlufNG41eWvCBRG_AFow,11550
|
| 234 |
+
sentry_sdk/integrations/litestar.py,sha256=oJIz1vv2JgIT_AriKXDTQvQjkMOmwKfT68DFrCfdJ-8,11698
|
| 235 |
+
sentry_sdk/integrations/logging.py,sha256=eLjza7xDrfBszOH4gDHxPtSM3WfiWsw9WGR3GUfnA-A,13739
|
| 236 |
+
sentry_sdk/integrations/loguru.py,sha256=2UaLLzXVEOV0cYxOU5n9GpkmNtC_bA1KzCzEnrqIilE,6505
|
| 237 |
+
sentry_sdk/integrations/mcp.py,sha256=4t7cVGwqjoqyNSd4HrwzjfM7fw2B5HYWRJoT6wOw6AA,22192
|
| 238 |
+
sentry_sdk/integrations/modules.py,sha256=7kmS9T-werRkKN0w51skyDmTMuZbdz9L28RgXPsoDeU,786
|
| 239 |
+
sentry_sdk/integrations/openai.py,sha256=8UquHKCu17tS6Afs90HLDa4mpx-7ErRl4nUh1OKwdbE,33167
|
| 240 |
+
sentry_sdk/integrations/openai_agents/__init__.py,sha256=0FHafAmsKZKNtijv3ZRxS9b2xoUvfHbbgFuWZ3eSbRo,10709
|
| 241 |
+
sentry_sdk/integrations/openai_agents/__pycache__/__init__.cpython-312.pyc,,
|
| 242 |
+
sentry_sdk/integrations/openai_agents/__pycache__/consts.cpython-312.pyc,,
|
| 243 |
+
sentry_sdk/integrations/openai_agents/__pycache__/utils.cpython-312.pyc,,
|
| 244 |
+
sentry_sdk/integrations/openai_agents/consts.py,sha256=PTb3vlqkuMPktu21ALK72o5WMIX4-cewTEiTRdHKFdQ,38
|
| 245 |
+
sentry_sdk/integrations/openai_agents/patches/__init__.py,sha256=62qvLBPGajoxGi4C41eBOztIiB8MGRpmP2i3GSD_22k,383
|
| 246 |
+
sentry_sdk/integrations/openai_agents/patches/__pycache__/__init__.cpython-312.pyc,,
|
| 247 |
+
sentry_sdk/integrations/openai_agents/patches/__pycache__/agent_run.cpython-312.pyc,,
|
| 248 |
+
sentry_sdk/integrations/openai_agents/patches/__pycache__/error_tracing.cpython-312.pyc,,
|
| 249 |
+
sentry_sdk/integrations/openai_agents/patches/__pycache__/models.cpython-312.pyc,,
|
| 250 |
+
sentry_sdk/integrations/openai_agents/patches/__pycache__/runner.cpython-312.pyc,,
|
| 251 |
+
sentry_sdk/integrations/openai_agents/patches/__pycache__/tools.cpython-312.pyc,,
|
| 252 |
+
sentry_sdk/integrations/openai_agents/patches/agent_run.py,sha256=OhLQLEB7LQEMRs6BkQ0jfyY6QaZ-lhP5yZ_b44KTRV0,8257
|
| 253 |
+
sentry_sdk/integrations/openai_agents/patches/error_tracing.py,sha256=Fzs4oAyGWnmbaeLMMRnr9m6F6M2N9Cm0i4GSPsJbNqI,2237
|
| 254 |
+
sentry_sdk/integrations/openai_agents/patches/models.py,sha256=mKUbs01jhNIs06fKgP6nraRB-5m_UvCGkvK0Xu4cuDg,7278
|
| 255 |
+
sentry_sdk/integrations/openai_agents/patches/runner.py,sha256=gJLrC334MrfcSCxG93__iOh3jcROGWw4YKHXcHJ5dYY,6837
|
| 256 |
+
sentry_sdk/integrations/openai_agents/patches/tools.py,sha256=9I7GOIY-X3WyegTMMq-KIbs5DQ3Qg5JHHtIrEU3eivA,2823
|
| 257 |
+
sentry_sdk/integrations/openai_agents/spans/__init__.py,sha256=uzqInMjqfA_MtdTJoRRH10Z4cVtVESbi7qc5sC_zr8U,393
|
| 258 |
+
sentry_sdk/integrations/openai_agents/spans/__pycache__/__init__.cpython-312.pyc,,
|
| 259 |
+
sentry_sdk/integrations/openai_agents/spans/__pycache__/agent_workflow.cpython-312.pyc,,
|
| 260 |
+
sentry_sdk/integrations/openai_agents/spans/__pycache__/ai_client.cpython-312.pyc,,
|
| 261 |
+
sentry_sdk/integrations/openai_agents/spans/__pycache__/execute_tool.cpython-312.pyc,,
|
| 262 |
+
sentry_sdk/integrations/openai_agents/spans/__pycache__/handoff.cpython-312.pyc,,
|
| 263 |
+
sentry_sdk/integrations/openai_agents/spans/__pycache__/invoke_agent.cpython-312.pyc,,
|
| 264 |
+
sentry_sdk/integrations/openai_agents/spans/agent_workflow.py,sha256=lqr87hbaV1GCXX40cGQLSlf3U6V-nmD2zj5kr2w9kmk,459
|
| 265 |
+
sentry_sdk/integrations/openai_agents/spans/ai_client.py,sha256=RbJNDn9DXM4NJLHJBIodRp1GK-kQmclYCD5DgOV2WTI,2265
|
| 266 |
+
sentry_sdk/integrations/openai_agents/spans/execute_tool.py,sha256=ePm9PMnEzoYZW8M04dnDRINV7dRnDKtHa4YFn1InqZY,1612
|
| 267 |
+
sentry_sdk/integrations/openai_agents/spans/handoff.py,sha256=icCbs6HOFrVXY4V1LILCoyV-qxNntIBhgx8AFHW6MGo,723
|
| 268 |
+
sentry_sdk/integrations/openai_agents/spans/invoke_agent.py,sha256=VWB1HmR3oyoTqi3ZrHIHY37ageQnPsap5C81bLcxrOA,3592
|
| 269 |
+
sentry_sdk/integrations/openai_agents/utils.py,sha256=MvbPG-dh5UJzE50y-Q9xUeSuuu_Ot642SvD1H0hIcuY,8910
|
| 270 |
+
sentry_sdk/integrations/openfeature.py,sha256=w6bYjrHJdjIpziEvEBocCbxdryO2eDZGGOoc7g8jNEU,1102
|
| 271 |
+
sentry_sdk/integrations/opentelemetry/__init__.py,sha256=emNL5aAq_NhK0PZmfX_g4GIdvBS6nHqGrjrIgrdC5m8,229
|
| 272 |
+
sentry_sdk/integrations/opentelemetry/__pycache__/__init__.cpython-312.pyc,,
|
| 273 |
+
sentry_sdk/integrations/opentelemetry/__pycache__/consts.cpython-312.pyc,,
|
| 274 |
+
sentry_sdk/integrations/opentelemetry/__pycache__/integration.cpython-312.pyc,,
|
| 275 |
+
sentry_sdk/integrations/opentelemetry/__pycache__/propagator.cpython-312.pyc,,
|
| 276 |
+
sentry_sdk/integrations/opentelemetry/__pycache__/span_processor.cpython-312.pyc,,
|
| 277 |
+
sentry_sdk/integrations/opentelemetry/consts.py,sha256=6yhH65VVzzIjpU8MRYItLKkskkBsLlbsO-_2MULlEHQ,275
|
| 278 |
+
sentry_sdk/integrations/opentelemetry/integration.py,sha256=VPQlj-tDBgjSRXOxtrxeFpv9Mc_OqX7cC997gMy4CkY,1742
|
| 279 |
+
sentry_sdk/integrations/opentelemetry/propagator.py,sha256=ofVK4IJ9zhSN0cRmw_Cr15IHK0XckmBy8WqA_oaGa5Q,3973
|
| 280 |
+
sentry_sdk/integrations/opentelemetry/span_processor.py,sha256=YK-k88VMoJNrOEXMqYaVM_Y-FywN2hQjdrodLeqGegw,13222
|
| 281 |
+
sentry_sdk/integrations/otlp.py,sha256=5h0hFY19myYSg_buSAutomC_8_BfRtDyHyLE9XD_A8A,7713
|
| 282 |
+
sentry_sdk/integrations/pure_eval.py,sha256=SLN-omknLQWyeH1_3v3Qi-puFCAliYKQqi0wdDt3Keg,4560
|
| 283 |
+
sentry_sdk/integrations/pydantic_ai/__init__.py,sha256=YrUxvNgPIznxzVaniyLTorkkTmlYyosAbA37tFOTR68,1465
|
| 284 |
+
sentry_sdk/integrations/pydantic_ai/__pycache__/__init__.cpython-312.pyc,,
|
| 285 |
+
sentry_sdk/integrations/pydantic_ai/__pycache__/consts.cpython-312.pyc,,
|
| 286 |
+
sentry_sdk/integrations/pydantic_ai/__pycache__/utils.cpython-312.pyc,,
|
| 287 |
+
sentry_sdk/integrations/pydantic_ai/consts.py,sha256=fxOQ5n_Do8EqqqxtOJm5zyvhQmOV75HACNrt_-zGngs,36
|
| 288 |
+
sentry_sdk/integrations/pydantic_ai/patches/__init__.py,sha256=_RHvjc3436KSwPjzrAdnyascgggxg5e0MQpdHhmiS-U,229
|
| 289 |
+
sentry_sdk/integrations/pydantic_ai/patches/__pycache__/__init__.cpython-312.pyc,,
|
| 290 |
+
sentry_sdk/integrations/pydantic_ai/patches/__pycache__/agent_run.cpython-312.pyc,,
|
| 291 |
+
sentry_sdk/integrations/pydantic_ai/patches/__pycache__/graph_nodes.cpython-312.pyc,,
|
| 292 |
+
sentry_sdk/integrations/pydantic_ai/patches/__pycache__/model_request.cpython-312.pyc,,
|
| 293 |
+
sentry_sdk/integrations/pydantic_ai/patches/__pycache__/tools.cpython-312.pyc,,
|
| 294 |
+
sentry_sdk/integrations/pydantic_ai/patches/agent_run.py,sha256=iGafWMXefgs_1HQfV_rK4Pe5wJex4vLbAdX-O83fxQQ,7345
|
| 295 |
+
sentry_sdk/integrations/pydantic_ai/patches/graph_nodes.py,sha256=TDkIjRUxGGt0UbC3FzgaCJNJN1s682IXoVFO07biL6E,3793
|
| 296 |
+
sentry_sdk/integrations/pydantic_ai/patches/model_request.py,sha256=15uGRv0UXjZcEc9b6Jzx7o9yhToUQfd4KRLZm_OWMm8,1240
|
| 297 |
+
sentry_sdk/integrations/pydantic_ai/patches/tools.py,sha256=soSA-ugtq-EdkkbfXV_mAPC-AfEaCM_6E_c1kvg_IQk,3822
|
| 298 |
+
sentry_sdk/integrations/pydantic_ai/spans/__init__.py,sha256=dTUjvkw7VMOAiSasuAq37q_njvANsUlgfZxgXRKJDDo,243
|
| 299 |
+
sentry_sdk/integrations/pydantic_ai/spans/__pycache__/__init__.cpython-312.pyc,,
|
| 300 |
+
sentry_sdk/integrations/pydantic_ai/spans/__pycache__/ai_client.cpython-312.pyc,,
|
| 301 |
+
sentry_sdk/integrations/pydantic_ai/spans/__pycache__/execute_tool.cpython-312.pyc,,
|
| 302 |
+
sentry_sdk/integrations/pydantic_ai/spans/__pycache__/invoke_agent.cpython-312.pyc,,
|
| 303 |
+
sentry_sdk/integrations/pydantic_ai/spans/__pycache__/utils.cpython-312.pyc,,
|
| 304 |
+
sentry_sdk/integrations/pydantic_ai/spans/ai_client.py,sha256=aAEjW2eh317ZFQJwJi2_jJHnWaKtdWdJFkuaU-BFbj8,10508
|
| 305 |
+
sentry_sdk/integrations/pydantic_ai/spans/execute_tool.py,sha256=0k-eIN1tltSOQf2kFdUMGKbyWhKYTNZE8dJF9vkOfnE,1542
|
| 306 |
+
sentry_sdk/integrations/pydantic_ai/spans/invoke_agent.py,sha256=Mbq_7NkqD5oyjY-_AOhblliYNT_Ij6z-MEMfZl4xyyY,5742
|
| 307 |
+
sentry_sdk/integrations/pydantic_ai/spans/utils.py,sha256=VSWLcgjrUwNSR_31TFM5BRG0EsGrc3X8BWIxM4v0HZI,1763
|
| 308 |
+
sentry_sdk/integrations/pydantic_ai/utils.py,sha256=OWFa15_jQ7H4gHfMAUQHmSVclLVrF_DBb7heaPZ89HY,7091
|
| 309 |
+
sentry_sdk/integrations/pymongo.py,sha256=whIHxnsiz245UNT7yKMP1OqfAsBdC7ITy7kuice0uwE,6275
|
| 310 |
+
sentry_sdk/integrations/pyramid.py,sha256=EzK6W2VB4DdDfV_VXnp5ERibM27mVK-JGib3k9Khc4E,7208
|
| 311 |
+
sentry_sdk/integrations/quart.py,sha256=4GTx6E7auhLeYXyY8WoloqSghbeU14Jcz-gfDL4o3aE,7258
|
| 312 |
+
sentry_sdk/integrations/ray.py,sha256=8zLUo2ZFl9KEt88X-yJYVVoDphJUTWRZRT0I5OQ1PCk,5884
|
| 313 |
+
sentry_sdk/integrations/redis/__init__.py,sha256=TzeEWrYcDO9u3q5WXT8J9Zk5Ft9nrWW1PsE3i0ULGUg,1662
|
| 314 |
+
sentry_sdk/integrations/redis/__pycache__/__init__.cpython-312.pyc,,
|
| 315 |
+
sentry_sdk/integrations/redis/__pycache__/_async_common.cpython-312.pyc,,
|
| 316 |
+
sentry_sdk/integrations/redis/__pycache__/_sync_common.cpython-312.pyc,,
|
| 317 |
+
sentry_sdk/integrations/redis/__pycache__/consts.cpython-312.pyc,,
|
| 318 |
+
sentry_sdk/integrations/redis/__pycache__/rb.cpython-312.pyc,,
|
| 319 |
+
sentry_sdk/integrations/redis/__pycache__/redis.cpython-312.pyc,,
|
| 320 |
+
sentry_sdk/integrations/redis/__pycache__/redis_cluster.cpython-312.pyc,,
|
| 321 |
+
sentry_sdk/integrations/redis/__pycache__/redis_py_cluster_legacy.cpython-312.pyc,,
|
| 322 |
+
sentry_sdk/integrations/redis/__pycache__/utils.cpython-312.pyc,,
|
| 323 |
+
sentry_sdk/integrations/redis/_async_common.py,sha256=V05_CgSyWBKBLQ4Udkg4LT5ovPAnP5iqsvfeqsmLEwU,4058
|
| 324 |
+
sentry_sdk/integrations/redis/_sync_common.py,sha256=3seE5lTc2fDNLXutpe41Qv-3pdRVbLHFGCQaT9OXzvc,3776
|
| 325 |
+
sentry_sdk/integrations/redis/consts.py,sha256=y2f-FJ7TIkzto01tyjXvbKVSVELVkjZaxj3FG5DZ0hA,480
|
| 326 |
+
sentry_sdk/integrations/redis/modules/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
| 327 |
+
sentry_sdk/integrations/redis/modules/__pycache__/__init__.cpython-312.pyc,,
|
| 328 |
+
sentry_sdk/integrations/redis/modules/__pycache__/caches.cpython-312.pyc,,
|
| 329 |
+
sentry_sdk/integrations/redis/modules/__pycache__/queries.cpython-312.pyc,,
|
| 330 |
+
sentry_sdk/integrations/redis/modules/caches.py,sha256=cV64P6UWHXolvTtog5jQFHtR4XFd5IqA5vVYXc8xO2E,4020
|
| 331 |
+
sentry_sdk/integrations/redis/modules/queries.py,sha256=lRwn72AUvk5a29n43UEpx6l9ix_xC2htHcl4cHKsdcw,1941
|
| 332 |
+
sentry_sdk/integrations/redis/rb.py,sha256=we3oh4fEhiirglutAllZe55sAoz6vKtaQt1tvLPGoX4,791
|
| 333 |
+
sentry_sdk/integrations/redis/redis.py,sha256=NaaefF_0poC8-9citN6etyAC6WOabxBVOdH_J8FJ7ec,1684
|
| 334 |
+
sentry_sdk/integrations/redis/redis_cluster.py,sha256=rBbj1WRa_-NAR6tTQHKQMF40WjvNb6o6FoojImIE03Y,3530
|
| 335 |
+
sentry_sdk/integrations/redis/redis_py_cluster_legacy.py,sha256=YREwCLBkx_SDVkzNLGNcgCr4yi_mdomlwgKlVU8bgSU,1570
|
| 336 |
+
sentry_sdk/integrations/redis/utils.py,sha256=i818myPF61yl4EIS-t3ptyYCa2IrsaAWeFR0cqS0QNY,3954
|
| 337 |
+
sentry_sdk/integrations/rq.py,sha256=wyerb5iiAJbekL5-kdOpwQzQiJiWIRlX49il1aEnUWg,5278
|
| 338 |
+
sentry_sdk/integrations/rust_tracing.py,sha256=p3aFy8Gs8Dgvvlv9PSrbl7JDYmA5zus-fTichRzTiLM,9101
|
| 339 |
+
sentry_sdk/integrations/sanic.py,sha256=OJ570xdV4luxJY9_b4gutVf8gQhNpqyZHxyT3evydLM,12700
|
| 340 |
+
sentry_sdk/integrations/serverless.py,sha256=eZSd0NPYBm3ZGdYFmDYlXDtAD_Pe_aUGV77bCzJhoEI,1621
|
| 341 |
+
sentry_sdk/integrations/socket.py,sha256=fj-4yzw7uecKDa4Jl_8AN_Nb9KSIvCh3iR8oqaLsYaQ,3164
|
| 342 |
+
sentry_sdk/integrations/spark/__init__.py,sha256=oOewMErnZk2rzNvIlZO6URxQexu9bUJuSLM2m_zECy8,208
|
| 343 |
+
sentry_sdk/integrations/spark/__pycache__/__init__.cpython-312.pyc,,
|
| 344 |
+
sentry_sdk/integrations/spark/__pycache__/spark_driver.cpython-312.pyc,,
|
| 345 |
+
sentry_sdk/integrations/spark/__pycache__/spark_worker.cpython-312.pyc,,
|
| 346 |
+
sentry_sdk/integrations/spark/spark_driver.py,sha256=mYX1ohjUD7wrc7F16OhvNyyOGqgQ7QM6bFzfBb_MUZY,8896
|
| 347 |
+
sentry_sdk/integrations/spark/spark_worker.py,sha256=PtRzOBjrG0__Au-Uidd8cO5zpx-EJY_D13H9CG900RE,3632
|
| 348 |
+
sentry_sdk/integrations/sqlalchemy.py,sha256=Q6JQxDcwo2U-3ggB_wUhQZps1gzzXz6-Yf20dcCMUWU,4347
|
| 349 |
+
sentry_sdk/integrations/starlette.py,sha256=1cSqN0IARPGi6kLitW93ASlTJIAFFUxsyWnS8QMMzJY,26130
|
| 350 |
+
sentry_sdk/integrations/starlite.py,sha256=GC_FaL-cyjeUtsRIJ29jMR4HlYXRwI-JRYy7GFn5QAI,10451
|
| 351 |
+
sentry_sdk/integrations/statsig.py,sha256=ceSjGymN4Fqdnk8Tz-L5teHz0eNOJtptB_W0Y1VZu2o,1214
|
| 352 |
+
sentry_sdk/integrations/stdlib.py,sha256=2yN0_v-fokZwgfnsKBvcIdl4tQ-QpLUnoQzYyOX_gH4,9376
|
| 353 |
+
sentry_sdk/integrations/strawberry.py,sha256=cTY1L8ihyR5mCiF2H5ZEM59t0BMwNmWqwNkTIuIrFUA,14120
|
| 354 |
+
sentry_sdk/integrations/sys_exit.py,sha256=nLbBnbyPjUgyhsI1VkmTxVAOgdlenf-dFtIqzFTZNOI,2408
|
| 355 |
+
sentry_sdk/integrations/threading.py,sha256=jUsmnGEPoi6XGhuLZP3pWKJCDksul-e7uLwAyPK_vUI,7107
|
| 356 |
+
sentry_sdk/integrations/tornado.py,sha256=jh_tm6UR-m-OPFlTKD4eOYEsyex2Lm9rw7_UpZ-1dCQ,7212
|
| 357 |
+
sentry_sdk/integrations/trytond.py,sha256=FjAKIKDTRmEc1JUFsHkcZhslCOD7RsT-pKYoSg1Qyuk,1750
|
| 358 |
+
sentry_sdk/integrations/typer.py,sha256=hvLL7llTverbhjdC8eU5r0ps39XVf2qI27C35VWyXrE,1833
|
| 359 |
+
sentry_sdk/integrations/unleash.py,sha256=Gz3qAKB0fTtOgzCOJV24ocW1NbdcDkkqijrWDE6Oi7E,1045
|
| 360 |
+
sentry_sdk/integrations/unraisablehook.py,sha256=zThW8f6uFGFfwpPHFXq_pLbnaQRT-1tmltyOPVfTPzM,1717
|
| 361 |
+
sentry_sdk/integrations/wsgi.py,sha256=w6BKB-aCaGKl6tfWlKL2SlifY4ZErQaEH5oCXg_bcFg,10503
|
| 362 |
+
sentry_sdk/logger.py,sha256=3hfleFTl9n6ZlZeb-F8jVVMGoae_SU8fOZd0N6HWbuA,2683
|
| 363 |
+
sentry_sdk/metrics.py,sha256=z8futlRltjsPeTkU_We4HAanxeopDaWtfJTkAumwL68,1470
|
| 364 |
+
sentry_sdk/monitor.py,sha256=YRmomlJySjuuCHLJPgGMhRbHXun651He5QZXUvKWZp0,3443
|
| 365 |
+
sentry_sdk/profiler/__init__.py,sha256=3PI3bHk9RSkkOXZKN84DDedk_7M65EiqqaIGo-DYs0E,1291
|
| 366 |
+
sentry_sdk/profiler/__pycache__/__init__.cpython-312.pyc,,
|
| 367 |
+
sentry_sdk/profiler/__pycache__/continuous_profiler.cpython-312.pyc,,
|
| 368 |
+
sentry_sdk/profiler/__pycache__/transaction_profiler.cpython-312.pyc,,
|
| 369 |
+
sentry_sdk/profiler/__pycache__/utils.cpython-312.pyc,,
|
| 370 |
+
sentry_sdk/profiler/continuous_profiler.py,sha256=lRZa2yF81ByIh1yQVHdsK-Ur5jRMR6aKJuPCkuHn0pI,22550
|
| 371 |
+
sentry_sdk/profiler/transaction_profiler.py,sha256=AJicxLx2aPcnTBm5E6GGx_34j-y2yD907_OsHtN6xyQ,27248
|
| 372 |
+
sentry_sdk/profiler/utils.py,sha256=Y-GN2SoAKa12nxwUQRvI5LTYEWfIkhOvoCPG6B4K_N0,6397
|
| 373 |
+
sentry_sdk/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
| 374 |
+
sentry_sdk/scope.py,sha256=HtA_ivY_Syuw2sxXiXCgp1PsRg46CetAuVeth_slfzk,68615
|
| 375 |
+
sentry_sdk/scrubber.py,sha256=pEJECHaWzg0vf93S_uYe9PgStTImDXjLFu0pzIJ6XAM,5953
|
| 376 |
+
sentry_sdk/serializer.py,sha256=W3HrcoJ9iMzHa6j9Nmu5Y7Qopbkon3ovxuPxRM5R4pU,13199
|
| 377 |
+
sentry_sdk/session.py,sha256=ajYuxKt5cKfxl_OjcF0-g5IpowrJhvMe9rjTq1CaQac,5274
|
| 378 |
+
sentry_sdk/sessions.py,sha256=-k0OGB28DoWbb7nIW4ML8R_fiF5nLnboWqObnfyZnWQ,8932
|
| 379 |
+
sentry_sdk/spotlight.py,sha256=0bJrMpoT47xS2LJzpDfUZ5z5YEhdUla78r0YDtVm2Zo,12191
|
| 380 |
+
sentry_sdk/traces.py,sha256=1oVGyMonl10vIrFokfD5UBHXQoPIu4rP2rtnKL3Ju6Y,4568
|
| 381 |
+
sentry_sdk/tracing.py,sha256=OkIHt-BYfYhLe9MyEcTUlgN_n0CjqQn-BJ0K_9gnxaY,50512
|
| 382 |
+
sentry_sdk/tracing_utils.py,sha256=aItaWdi76Q0-HGsILrkNesctXp76BkriTol-jXSZjfI,43300
|
| 383 |
+
sentry_sdk/transport.py,sha256=P21HMCcD58PJfRNPA7Y39hlMqIFK5RsPCMwsy-QzXAQ,32141
|
| 384 |
+
sentry_sdk/types.py,sha256=A92AqvfrGQZ9KY6FaUjKfL9F1HK7Ui3heQilVzfzYCs,1269
|
| 385 |
+
sentry_sdk/utils.py,sha256=j6aRWZoMqiiN3Z_woIvw81E-rypx-CBxg5cQGmBbOIQ,65101
|
| 386 |
+
sentry_sdk/worker.py,sha256=djRCygOJFRmdXBS4lRSF4RTljVZ47BWEal85WlTD0uo,4257
|
source/sentry_sdk-2.53.0.dist-info/WHEEL
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Wheel-Version: 1.0
|
| 2 |
+
Generator: setuptools (82.0.0)
|
| 3 |
+
Root-Is-Purelib: true
|
| 4 |
+
Tag: py2-none-any
|
| 5 |
+
Tag: py3-none-any
|
| 6 |
+
|
source/sentry_sdk-2.53.0.dist-info/entry_points.txt
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[opentelemetry_propagator]
|
| 2 |
+
sentry = sentry_sdk.integrations.opentelemetry:SentryPropagator
|
source/sentry_sdk-2.53.0.dist-info/licenses/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
MIT License
|
| 2 |
+
|
| 3 |
+
Copyright (c) 2018 Functional Software, Inc. dba Sentry
|
| 4 |
+
|
| 5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
| 6 |
+
of this software and associated documentation files (the "Software"), to deal
|
| 7 |
+
in the Software without restriction, including without limitation the rights
|
| 8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
| 9 |
+
copies of the Software, and to permit persons to whom the Software is
|
| 10 |
+
furnished to do so, subject to the following conditions:
|
| 11 |
+
|
| 12 |
+
The above copyright notice and this permission notice shall be included in all
|
| 13 |
+
copies or substantial portions of the Software.
|
| 14 |
+
|
| 15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
| 16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
| 17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
| 18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
| 19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
| 20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
| 21 |
+
SOFTWARE.
|
source/sentry_sdk-2.53.0.dist-info/top_level.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
sentry_sdk
|
source/sentry_sdk/__init__.py
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from sentry_sdk import profiler
|
| 2 |
+
from sentry_sdk import metrics
|
| 3 |
+
from sentry_sdk.scope import Scope
|
| 4 |
+
from sentry_sdk.transport import Transport, HttpTransport
|
| 5 |
+
from sentry_sdk.client import Client
|
| 6 |
+
|
| 7 |
+
from sentry_sdk.api import * # noqa
|
| 8 |
+
from sentry_sdk.consts import VERSION
|
| 9 |
+
|
| 10 |
+
__all__ = [ # noqa
|
| 11 |
+
"Hub",
|
| 12 |
+
"Scope",
|
| 13 |
+
"Client",
|
| 14 |
+
"Transport",
|
| 15 |
+
"HttpTransport",
|
| 16 |
+
"VERSION",
|
| 17 |
+
"integrations",
|
| 18 |
+
# From sentry_sdk.api
|
| 19 |
+
"init",
|
| 20 |
+
"add_attachment",
|
| 21 |
+
"add_breadcrumb",
|
| 22 |
+
"capture_event",
|
| 23 |
+
"capture_exception",
|
| 24 |
+
"capture_message",
|
| 25 |
+
"configure_scope",
|
| 26 |
+
"continue_trace",
|
| 27 |
+
"flush",
|
| 28 |
+
"get_baggage",
|
| 29 |
+
"get_client",
|
| 30 |
+
"get_global_scope",
|
| 31 |
+
"get_isolation_scope",
|
| 32 |
+
"get_current_scope",
|
| 33 |
+
"get_current_span",
|
| 34 |
+
"get_traceparent",
|
| 35 |
+
"is_initialized",
|
| 36 |
+
"isolation_scope",
|
| 37 |
+
"last_event_id",
|
| 38 |
+
"new_scope",
|
| 39 |
+
"push_scope",
|
| 40 |
+
"set_context",
|
| 41 |
+
"set_extra",
|
| 42 |
+
"set_level",
|
| 43 |
+
"set_measurement",
|
| 44 |
+
"set_tag",
|
| 45 |
+
"set_tags",
|
| 46 |
+
"set_user",
|
| 47 |
+
"start_span",
|
| 48 |
+
"start_transaction",
|
| 49 |
+
"trace",
|
| 50 |
+
"monitor",
|
| 51 |
+
"logger",
|
| 52 |
+
"metrics",
|
| 53 |
+
"profiler",
|
| 54 |
+
"start_session",
|
| 55 |
+
"end_session",
|
| 56 |
+
"set_transaction_name",
|
| 57 |
+
"update_current_span",
|
| 58 |
+
]
|
| 59 |
+
|
| 60 |
+
# Initialize the debug support after everything is loaded
|
| 61 |
+
from sentry_sdk.debug import init_debug_support
|
| 62 |
+
|
| 63 |
+
init_debug_support()
|
| 64 |
+
del init_debug_support
|
| 65 |
+
|
| 66 |
+
# circular imports
|
| 67 |
+
from sentry_sdk.hub import Hub
|
source/sentry_sdk/_batcher.py
ADDED
|
@@ -0,0 +1,139 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import random
|
| 3 |
+
import threading
|
| 4 |
+
from datetime import datetime, timezone
|
| 5 |
+
from typing import TYPE_CHECKING, TypeVar, Generic
|
| 6 |
+
|
| 7 |
+
from sentry_sdk.utils import format_timestamp, safe_repr, serialize_attribute
|
| 8 |
+
from sentry_sdk.envelope import Envelope, Item, PayloadRef
|
| 9 |
+
|
| 10 |
+
if TYPE_CHECKING:
|
| 11 |
+
from typing import Optional, Callable, Any
|
| 12 |
+
|
| 13 |
+
T = TypeVar("T")
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class Batcher(Generic[T]):
|
| 17 |
+
MAX_BEFORE_FLUSH = 100
|
| 18 |
+
MAX_BEFORE_DROP = 1_000
|
| 19 |
+
FLUSH_WAIT_TIME = 5.0
|
| 20 |
+
|
| 21 |
+
TYPE = ""
|
| 22 |
+
CONTENT_TYPE = ""
|
| 23 |
+
|
| 24 |
+
def __init__(
|
| 25 |
+
self,
|
| 26 |
+
capture_func: "Callable[[Envelope], None]",
|
| 27 |
+
record_lost_func: "Callable[..., None]",
|
| 28 |
+
) -> None:
|
| 29 |
+
self._buffer: "list[T]" = []
|
| 30 |
+
self._capture_func = capture_func
|
| 31 |
+
self._record_lost_func = record_lost_func
|
| 32 |
+
self._running = True
|
| 33 |
+
self._lock = threading.Lock()
|
| 34 |
+
|
| 35 |
+
self._flush_event: "threading.Event" = threading.Event()
|
| 36 |
+
|
| 37 |
+
self._flusher: "Optional[threading.Thread]" = None
|
| 38 |
+
self._flusher_pid: "Optional[int]" = None
|
| 39 |
+
|
| 40 |
+
def _ensure_thread(self) -> bool:
|
| 41 |
+
"""For forking processes we might need to restart this thread.
|
| 42 |
+
This ensures that our process actually has that thread running.
|
| 43 |
+
"""
|
| 44 |
+
if not self._running:
|
| 45 |
+
return False
|
| 46 |
+
|
| 47 |
+
pid = os.getpid()
|
| 48 |
+
if self._flusher_pid == pid:
|
| 49 |
+
return True
|
| 50 |
+
|
| 51 |
+
with self._lock:
|
| 52 |
+
# Recheck to make sure another thread didn't get here and start the
|
| 53 |
+
# the flusher in the meantime
|
| 54 |
+
if self._flusher_pid == pid:
|
| 55 |
+
return True
|
| 56 |
+
|
| 57 |
+
self._flusher_pid = pid
|
| 58 |
+
|
| 59 |
+
self._flusher = threading.Thread(target=self._flush_loop)
|
| 60 |
+
self._flusher.daemon = True
|
| 61 |
+
|
| 62 |
+
try:
|
| 63 |
+
self._flusher.start()
|
| 64 |
+
except RuntimeError:
|
| 65 |
+
# Unfortunately at this point the interpreter is in a state that no
|
| 66 |
+
# longer allows us to spawn a thread and we have to bail.
|
| 67 |
+
self._running = False
|
| 68 |
+
return False
|
| 69 |
+
|
| 70 |
+
return True
|
| 71 |
+
|
| 72 |
+
def _flush_loop(self) -> None:
|
| 73 |
+
while self._running:
|
| 74 |
+
self._flush_event.wait(self.FLUSH_WAIT_TIME + random.random())
|
| 75 |
+
self._flush_event.clear()
|
| 76 |
+
self._flush()
|
| 77 |
+
|
| 78 |
+
def add(self, item: "T") -> None:
|
| 79 |
+
if not self._ensure_thread() or self._flusher is None:
|
| 80 |
+
return None
|
| 81 |
+
|
| 82 |
+
with self._lock:
|
| 83 |
+
if len(self._buffer) >= self.MAX_BEFORE_DROP:
|
| 84 |
+
self._record_lost(item)
|
| 85 |
+
return None
|
| 86 |
+
|
| 87 |
+
self._buffer.append(item)
|
| 88 |
+
if len(self._buffer) >= self.MAX_BEFORE_FLUSH:
|
| 89 |
+
self._flush_event.set()
|
| 90 |
+
|
| 91 |
+
def kill(self) -> None:
|
| 92 |
+
if self._flusher is None:
|
| 93 |
+
return
|
| 94 |
+
|
| 95 |
+
self._running = False
|
| 96 |
+
self._flush_event.set()
|
| 97 |
+
self._flusher = None
|
| 98 |
+
|
| 99 |
+
def flush(self) -> None:
|
| 100 |
+
self._flush()
|
| 101 |
+
|
| 102 |
+
def _add_to_envelope(self, envelope: "Envelope") -> None:
    """Serialize the current buffer into a single item on *envelope*.

    The buffer itself is left untouched; the caller is responsible for
    clearing it and for holding ``self._lock`` while the buffer is read.
    """
    serialized_items = [
        self._to_transport_format(entry) for entry in self._buffer
    ]
    envelope.add_item(
        Item(
            type=self.TYPE,
            content_type=self.CONTENT_TYPE,
            headers={"item_count": len(self._buffer)},
            payload=PayloadRef(json={"items": serialized_items}),
        )
    )
def _flush(self) -> "Optional[Envelope]":
    """Drain the buffer into a fresh envelope and hand it to the capture
    callback.

    Returns the envelope that was sent, or ``None`` when there was
    nothing to flush.  The capture callback is invoked outside the lock.
    """
    outgoing = Envelope(
        headers={"sent_at": format_timestamp(datetime.now(timezone.utc))}
    )
    with self._lock:
        if not self._buffer:
            return None

        self._add_to_envelope(outgoing)
        self._buffer.clear()

    self._capture_func(outgoing)
    return outgoing
def _record_lost(self, item: "T") -> None:
|
| 135 |
+
pass
|
| 136 |
+
|
| 137 |
+
@staticmethod
|
| 138 |
+
def _to_transport_format(item: "T") -> "Any":
|
| 139 |
+
pass
|
source/sentry_sdk/_compat.py
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import sys
|
| 2 |
+
|
| 3 |
+
from typing import TYPE_CHECKING
|
| 4 |
+
|
| 5 |
+
if TYPE_CHECKING:
|
| 6 |
+
from typing import Any
|
| 7 |
+
from typing import TypeVar
|
| 8 |
+
|
| 9 |
+
T = TypeVar("T")
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
# Feature-gate flags: each is True only on CPython/PyPy 3.x with at least
# the given minor release (deliberately False on a hypothetical 4.x).
_MAJOR = sys.version_info[0]
_MINOR = sys.version_info[1]
PY37 = _MAJOR == 3 and _MINOR >= 7
PY38 = _MAJOR == 3 and _MINOR >= 8
PY310 = _MAJOR == 3 and _MINOR >= 10
PY311 = _MAJOR == 3 and _MINOR >= 11
def with_metaclass(meta: "Any", *bases: "Any") -> "Any":
    """Create a base class that applies *meta* to any subclass.

    Same trick as ``six.with_metaclass``: the returned throwaway class
    replaces itself with a *meta*-constructed class (using *bases* as
    the real bases) the moment it is subclassed.
    """

    class _Bridge(type):
        def __new__(mcls: "Any", name: "Any", this_bases: "Any", d: "Any") -> "Any":
            # Drop the temporary bridge bases and build the real class.
            return meta(name, bases, d)

    return type.__new__(_Bridge, "temporary_class", (), {})
def check_uwsgi_thread_support() -> bool:
    """Return True when the runtime is not uWSGI, or when uWSGI is
    configured so background threads actually work; warn and return
    False otherwise.

    We check two things here:

    1. uWSGI doesn't run in threaded mode by default -- issue a warning
       if that's the case.
    2. Additionally, if uWSGI is running in preforking mode (default),
       it needs the --py-call-uwsgi-fork-hooks option for the SDK to
       work properly.  This is because any background threads spawned
       before the main process is forked are NOT CLEANED UP IN THE
       CHILDREN BY DEFAULT even if --enable-threads is on.  One has to
       explicitly provide --py-call-uwsgi-fork-hooks to force uWSGI to
       run regular cpython after-fork hooks that take care of cleaning
       up stale thread data.
    """
    try:
        from uwsgi import opt  # type: ignore
    except ImportError:
        # Not running under uWSGI at all -- nothing to check.
        return True

    from sentry_sdk.consts import FALSE_VALUES

    def opt_truthy(name: str) -> bool:
        raw = opt.get(name, False)
        if isinstance(raw, bool):
            return raw
        if isinstance(raw, bytes):
            try:
                raw = raw.decode()
            except Exception:
                pass
        return raw and str(raw).lower() not in FALSE_VALUES

    # When `threads` is passed in as a uwsgi option,
    # `enable-threads` is implied on.
    threads_enabled = "threads" in opt or opt_truthy("enable-threads")
    fork_hooks_on = opt_truthy("py-call-uwsgi-fork-hooks")
    lazy_mode = opt_truthy("lazy-apps") or opt_truthy("lazy")

    problem = None
    if lazy_mode and not threads_enabled:
        problem = (
            "IMPORTANT: "
            "We detected the use of uWSGI without thread support. "
            "This might lead to unexpected issues. "
            'Please run uWSGI with "--enable-threads" for full support.'
        )
    elif not lazy_mode and (not threads_enabled or not fork_hooks_on):
        problem = (
            "IMPORTANT: "
            "We detected the use of uWSGI in preforking mode without "
            "thread support. This might lead to crashing workers. "
            'Please run uWSGI with both "--enable-threads" and '
            '"--py-call-uwsgi-fork-hooks" for full support.'
        )

    if problem is not None:
        from warnings import warn

        warn(Warning(problem))
        return False

    return True
|
source/sentry_sdk/_init_implementation.py
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import warnings
|
| 2 |
+
|
| 3 |
+
from typing import TYPE_CHECKING
|
| 4 |
+
|
| 5 |
+
import sentry_sdk
|
| 6 |
+
|
| 7 |
+
if TYPE_CHECKING:
|
| 8 |
+
from typing import Any, ContextManager, Optional
|
| 9 |
+
|
| 10 |
+
import sentry_sdk.consts
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class _InitGuard:
|
| 14 |
+
_CONTEXT_MANAGER_DEPRECATION_WARNING_MESSAGE = (
|
| 15 |
+
"Using the return value of sentry_sdk.init as a context manager "
|
| 16 |
+
"and manually calling the __enter__ and __exit__ methods on the "
|
| 17 |
+
"return value are deprecated. We are no longer maintaining this "
|
| 18 |
+
"functionality, and we will remove it in the next major release."
|
| 19 |
+
)
|
| 20 |
+
|
| 21 |
+
def __init__(self, client: "sentry_sdk.Client") -> None:
|
| 22 |
+
self._client = client
|
| 23 |
+
|
| 24 |
+
def __enter__(self) -> "_InitGuard":
|
| 25 |
+
warnings.warn(
|
| 26 |
+
self._CONTEXT_MANAGER_DEPRECATION_WARNING_MESSAGE,
|
| 27 |
+
stacklevel=2,
|
| 28 |
+
category=DeprecationWarning,
|
| 29 |
+
)
|
| 30 |
+
|
| 31 |
+
return self
|
| 32 |
+
|
| 33 |
+
def __exit__(self, exc_type: "Any", exc_value: "Any", tb: "Any") -> None:
|
| 34 |
+
warnings.warn(
|
| 35 |
+
self._CONTEXT_MANAGER_DEPRECATION_WARNING_MESSAGE,
|
| 36 |
+
stacklevel=2,
|
| 37 |
+
category=DeprecationWarning,
|
| 38 |
+
)
|
| 39 |
+
|
| 40 |
+
c = self._client
|
| 41 |
+
if c is not None:
|
| 42 |
+
c.close()
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
def _check_python_deprecations() -> None:
|
| 46 |
+
# Since we're likely to deprecate Python versions in the future, I'm keeping
|
| 47 |
+
# this handy function around. Use this to detect the Python version used and
|
| 48 |
+
# to output logger.warning()s if it's deprecated.
|
| 49 |
+
pass
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
def _init(*args: "Optional[str]", **kwargs: "Any") -> "ContextManager[Any]":
    """Initializes the SDK and optionally integrations.

    This takes the same arguments as the client constructor and binds
    the freshly built client to the global scope.
    """
    client = sentry_sdk.Client(*args, **kwargs)
    sentry_sdk.get_global_scope().set_client(client)
    _check_python_deprecations()
    return _InitGuard(client)
if TYPE_CHECKING:
    # Make mypy, PyCharm and other static analyzers think `init` is a type to
    # have nicer autocompletion for params.
    #
    # Use `ClientConstructor` to define the argument types of `init` and
    # `ContextManager[Any]` to tell static analyzers about the return type.
    # This branch is never executed at runtime (TYPE_CHECKING is False).

    class init(sentry_sdk.consts.ClientConstructor, _InitGuard):  # noqa: N801
        pass

else:
    # Alias `init` for actual usage. Go through the lambda indirection to throw
    # PyCharm off of the weakly typed signature (it would otherwise discover
    # both the weakly typed signature of `_init` and our faked `init` type).

    init = (lambda: _init)()
|
source/sentry_sdk/_log_batcher.py
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import TYPE_CHECKING
|
| 2 |
+
|
| 3 |
+
from sentry_sdk._batcher import Batcher
|
| 4 |
+
from sentry_sdk.utils import serialize_attribute
|
| 5 |
+
from sentry_sdk.envelope import Envelope, Item, PayloadRef
|
| 6 |
+
|
| 7 |
+
if TYPE_CHECKING:
|
| 8 |
+
from typing import Any
|
| 9 |
+
from sentry_sdk._types import Log
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class LogBatcher(Batcher["Log"]):
    """Batches Sentry log records and ships them as ``log`` envelope items."""

    # Flush eagerly at 100 buffered logs, drop beyond 1000, and
    # otherwise flush on a ~5 second cadence.
    MAX_BEFORE_FLUSH = 100
    MAX_BEFORE_DROP = 1_000
    FLUSH_WAIT_TIME = 5.0

    TYPE = "log"
    CONTENT_TYPE = "application/vnd.sentry.items.log+json"

    @staticmethod
    def _to_transport_format(item: "Log") -> "Any":
        """Convert an internal ``Log`` dict into the envelope JSON shape."""
        attributes = item["attributes"]
        # Mirror severity into the attribute map unless already present.
        attributes.setdefault("sentry.severity_number", item["severity_number"])
        attributes.setdefault("sentry.severity_text", item["severity_text"])

        return {
            "timestamp": int(item["time_unix_nano"]) / 1.0e9,
            "trace_id": item.get("trace_id", "00000000-0000-0000-0000-000000000000"),
            "span_id": item.get("span_id"),
            "level": str(item["severity_text"]),
            "body": str(item["body"]),
            "attributes": {
                key: serialize_attribute(value)
                for (key, value) in attributes.items()
            },
        }

    def _record_lost(self, item: "Log") -> None:
        """Report a dropped log to the client-report callback.

        The envelope item is constructed only so the loss can be
        measured; it is never sent.
        """
        dropped = Item(
            type=self.TYPE,
            content_type=self.CONTENT_TYPE,
            headers={"item_count": 1},
            payload=PayloadRef(json={"items": [self._to_transport_format(item)]}),
        )

        self._record_lost_func(
            reason="queue_overflow",
            data_category="log_item",
            item=dropped,
            quantity=1,
        )
|