Add files using upload-large-folder tool
Browse files
This view is limited to 50 files because it contains too many changes. See raw diff
- _cuda_bindings_redirector.py +30 -0
- anyio-4.12.1.dist-info/INSTALLER +1 -0
- anyio-4.12.1.dist-info/METADATA +96 -0
- anyio-4.12.1.dist-info/RECORD +51 -0
- anyio-4.12.1.dist-info/REQUESTED +0 -0
- anyio-4.12.1.dist-info/WHEEL +5 -0
- anyio-4.12.1.dist-info/entry_points.txt +2 -0
- anyio-4.12.1.dist-info/top_level.txt +1 -0
- dataset-metadata.json +9 -0
- datasets/__init__.py +47 -0
- datasets/arrow_dataset.py +0 -0
- datasets/arrow_reader.py +620 -0
- datasets/arrow_writer.py +766 -0
- datasets/builder.py +1866 -0
- datasets/combine.py +223 -0
- datasets/config.py +268 -0
- datasets/data_files.py +807 -0
- datasets/dataset_dict.py +0 -0
- datasets/distributed.py +39 -0
- datasets/exceptions.py +119 -0
- datasets/fingerprint.py +454 -0
- datasets/hub.py +124 -0
- datasets/info.py +430 -0
- datasets/inspect.py +353 -0
- datasets/iterable_dataset.py +0 -0
- datasets/keyhash.py +104 -0
- datasets/load.py +1481 -0
- datasets/naming.py +84 -0
- datasets/search.py +785 -0
- datasets/splits.py +635 -0
- datasets/streaming.py +131 -0
- datasets/table.py +2385 -0
- idna/__init__.py +45 -0
- idna/codec.py +122 -0
- idna/compat.py +15 -0
- idna/core.py +437 -0
- idna/idnadata.py +4309 -0
- idna/intranges.py +57 -0
- idna/package_data.py +1 -0
- idna/py.typed +0 -0
- idna/uts46data.py +0 -0
- importlib_metadata/__init__.py +1191 -0
- importlib_metadata/_adapters.py +136 -0
- importlib_metadata/_collections.py +34 -0
- importlib_metadata/_compat.py +56 -0
- importlib_metadata/_functools.py +136 -0
- importlib_metadata/_itertools.py +171 -0
- importlib_metadata/_meta.py +71 -0
- importlib_metadata/_text.py +99 -0
- importlib_metadata/_typing.py +15 -0
_cuda_bindings_redirector.py
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
| 2 |
+
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
|
| 3 |
+
|
| 4 |
+
import sys
|
| 5 |
+
from types import ModuleType
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
# Make sure 'cuda' is importable as a namespace package
|
| 9 |
+
import cuda
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class LazyCudaModule(ModuleType):
    """Module type that serves the deprecated ``cuda.__version__`` on demand.

    An instance only reaches ``__getattr__`` for names that normal module
    attribute lookup did not find.
    """

    def __getattr__(self, name):
        """Resolve an attribute that normal module lookup did not find.

        Args:
            name: The attribute name being looked up.

        Returns:
            The value of ``cuda.bindings.__version__`` when ``name`` is
            ``'__version__'``.

        Raises:
            AttributeError: For every other name.
        """
        if name == '__version__':
            import warnings

            warnings.warn(
                "accessing cuda.__version__ is deprecated, "
                "please switch to use cuda.bindings.__version__ instead",
                FutureWarning,
                stacklevel=2,
            )
            # This import runs only when __version__ is actually requested.
            from cuda.bindings import __version__

            return __version__

        # Report the module the lookup was made on. The bare ``__name__``
        # global is the name of the file defining this class, not of the
        # patched module. Read from the instance dict so a missing entry
        # cannot re-enter __getattr__.
        module_name = self.__dict__.get("__name__", __name__)
        raise AttributeError(f"module {module_name!r} has no attribute {name!r}")
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
# Patch in LazyCudaModule for `cuda`
|
| 30 |
+
sys.modules['cuda'].__class__ = LazyCudaModule
|
anyio-4.12.1.dist-info/INSTALLER
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
uv
|
anyio-4.12.1.dist-info/METADATA
ADDED
|
@@ -0,0 +1,96 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Metadata-Version: 2.4
|
| 2 |
+
Name: anyio
|
| 3 |
+
Version: 4.12.1
|
| 4 |
+
Summary: High-level concurrency and networking framework on top of asyncio or Trio
|
| 5 |
+
Author-email: Alex Grönholm <alex.gronholm@nextday.fi>
|
| 6 |
+
License-Expression: MIT
|
| 7 |
+
Project-URL: Documentation, https://anyio.readthedocs.io/en/latest/
|
| 8 |
+
Project-URL: Changelog, https://anyio.readthedocs.io/en/stable/versionhistory.html
|
| 9 |
+
Project-URL: Source code, https://github.com/agronholm/anyio
|
| 10 |
+
Project-URL: Issue tracker, https://github.com/agronholm/anyio/issues
|
| 11 |
+
Classifier: Development Status :: 5 - Production/Stable
|
| 12 |
+
Classifier: Intended Audience :: Developers
|
| 13 |
+
Classifier: Framework :: AnyIO
|
| 14 |
+
Classifier: Typing :: Typed
|
| 15 |
+
Classifier: Programming Language :: Python
|
| 16 |
+
Classifier: Programming Language :: Python :: 3
|
| 17 |
+
Classifier: Programming Language :: Python :: 3.9
|
| 18 |
+
Classifier: Programming Language :: Python :: 3.10
|
| 19 |
+
Classifier: Programming Language :: Python :: 3.11
|
| 20 |
+
Classifier: Programming Language :: Python :: 3.12
|
| 21 |
+
Classifier: Programming Language :: Python :: 3.13
|
| 22 |
+
Classifier: Programming Language :: Python :: 3.14
|
| 23 |
+
Requires-Python: >=3.9
|
| 24 |
+
Description-Content-Type: text/x-rst
|
| 25 |
+
License-File: LICENSE
|
| 26 |
+
Requires-Dist: exceptiongroup>=1.0.2; python_version < "3.11"
|
| 27 |
+
Requires-Dist: idna>=2.8
|
| 28 |
+
Requires-Dist: typing_extensions>=4.5; python_version < "3.13"
|
| 29 |
+
Provides-Extra: trio
|
| 30 |
+
Requires-Dist: trio>=0.32.0; python_version >= "3.10" and extra == "trio"
|
| 31 |
+
Requires-Dist: trio>=0.31.0; python_version < "3.10" and extra == "trio"
|
| 32 |
+
Dynamic: license-file
|
| 33 |
+
|
| 34 |
+
.. image:: https://github.com/agronholm/anyio/actions/workflows/test.yml/badge.svg
|
| 35 |
+
:target: https://github.com/agronholm/anyio/actions/workflows/test.yml
|
| 36 |
+
:alt: Build Status
|
| 37 |
+
.. image:: https://coveralls.io/repos/github/agronholm/anyio/badge.svg?branch=master
|
| 38 |
+
:target: https://coveralls.io/github/agronholm/anyio?branch=master
|
| 39 |
+
:alt: Code Coverage
|
| 40 |
+
.. image:: https://readthedocs.org/projects/anyio/badge/?version=latest
|
| 41 |
+
:target: https://anyio.readthedocs.io/en/latest/?badge=latest
|
| 42 |
+
:alt: Documentation
|
| 43 |
+
.. image:: https://badges.gitter.im/gitterHQ/gitter.svg
|
| 44 |
+
:target: https://gitter.im/python-trio/AnyIO
|
| 45 |
+
:alt: Gitter chat
|
| 46 |
+
|
| 47 |
+
AnyIO is an asynchronous networking and concurrency library that works on top of either asyncio_ or
|
| 48 |
+
Trio_. It implements Trio-like `structured concurrency`_ (SC) on top of asyncio and works in harmony
|
| 49 |
+
with the native SC of Trio itself.
|
| 50 |
+
|
| 51 |
+
Applications and libraries written against AnyIO's API will run unmodified on either asyncio_ or
|
| 52 |
+
Trio_. AnyIO can also be adopted into a library or application incrementally – bit by bit, no full
|
| 53 |
+
refactoring necessary. It will blend in with the native libraries of your chosen backend.
|
| 54 |
+
|
| 55 |
+
To find out why you might want to use AnyIO's APIs instead of asyncio's, you can read about it
|
| 56 |
+
`here <https://anyio.readthedocs.io/en/stable/why.html>`_.
|
| 57 |
+
|
| 58 |
+
Documentation
|
| 59 |
+
-------------
|
| 60 |
+
|
| 61 |
+
View full documentation at: https://anyio.readthedocs.io/
|
| 62 |
+
|
| 63 |
+
Features
|
| 64 |
+
--------
|
| 65 |
+
|
| 66 |
+
AnyIO offers the following functionality:
|
| 67 |
+
|
| 68 |
+
* Task groups (nurseries_ in trio terminology)
|
| 69 |
+
* High-level networking (TCP, UDP and UNIX sockets)
|
| 70 |
+
|
| 71 |
+
* `Happy eyeballs`_ algorithm for TCP connections (more robust than that of asyncio on Python
|
| 72 |
+
3.8)
|
| 73 |
+
* async/await style UDP sockets (unlike asyncio where you still have to use Transports and
|
| 74 |
+
Protocols)
|
| 75 |
+
|
| 76 |
+
* A versatile API for byte streams and object streams
|
| 77 |
+
* Inter-task synchronization and communication (locks, conditions, events, semaphores, object
|
| 78 |
+
streams)
|
| 79 |
+
* Worker threads
|
| 80 |
+
* Subprocesses
|
| 81 |
+
* Subinterpreter support for code parallelization (on Python 3.13 and later)
|
| 82 |
+
* Asynchronous file I/O (using worker threads)
|
| 83 |
+
* Signal handling
|
| 84 |
+
* Asynchronous version of the functools_ module
|
| 85 |
+
|
| 86 |
+
AnyIO also comes with its own pytest_ plugin which also supports asynchronous fixtures.
|
| 87 |
+
It even works with the popular Hypothesis_ library.
|
| 88 |
+
|
| 89 |
+
.. _asyncio: https://docs.python.org/3/library/asyncio.html
|
| 90 |
+
.. _Trio: https://github.com/python-trio/trio
|
| 91 |
+
.. _structured concurrency: https://en.wikipedia.org/wiki/Structured_concurrency
|
| 92 |
+
.. _nurseries: https://trio.readthedocs.io/en/stable/reference-core.html#nurseries-and-spawning
|
| 93 |
+
.. _Happy eyeballs: https://en.wikipedia.org/wiki/Happy_Eyeballs
|
| 94 |
+
.. _pytest: https://docs.pytest.org/en/latest/
|
| 95 |
+
.. _functools: https://docs.python.org/3/library/functools.html
|
| 96 |
+
.. _Hypothesis: https://hypothesis.works/
|
anyio-4.12.1.dist-info/RECORD
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
anyio-4.12.1.dist-info/INSTALLER,sha256=5hhM4Q4mYTT9z6QB6PGpUAW81PGNFrYrdXMj4oM_6ak,2
|
| 2 |
+
anyio-4.12.1.dist-info/METADATA,sha256=DfiDab9Tmmcfy802lOLTMEHJQShkOSbopCwqCYbLuJk,4277
|
| 3 |
+
anyio-4.12.1.dist-info/RECORD,,
|
| 4 |
+
anyio-4.12.1.dist-info/REQUESTED,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
| 5 |
+
anyio-4.12.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
| 6 |
+
anyio-4.12.1.dist-info/entry_points.txt,sha256=_d6Yu6uiaZmNe0CydowirE9Cmg7zUL2g08tQpoS3Qvc,39
|
| 7 |
+
anyio-4.12.1.dist-info/licenses/LICENSE,sha256=U2GsncWPLvX9LpsJxoKXwX8ElQkJu8gCO9uC6s8iwrA,1081
|
| 8 |
+
anyio-4.12.1.dist-info/top_level.txt,sha256=QglSMiWX8_5dpoVAEIHdEYzvqFMdSYWmCj6tYw2ITkQ,6
|
| 9 |
+
anyio/__init__.py,sha256=7iDVqMUprUuKNY91FuoKqayAhR-OY136YDPI6P78HHk,6170
|
| 10 |
+
anyio/_backends/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
| 11 |
+
anyio/_backends/_asyncio.py,sha256=xG6qv60mgGnL0mK82dxjH2b8hlkMlJ-x2BqIq3qv70Y,98863
|
| 12 |
+
anyio/_backends/_trio.py,sha256=30Rctb7lm8g63ZHljVPVnj5aH-uK6oQvphjwUBoAzuI,41456
|
| 13 |
+
anyio/_core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
| 14 |
+
anyio/_core/_asyncio_selector_thread.py,sha256=2PdxFM3cs02Kp6BSppbvmRT7q7asreTW5FgBxEsflBo,5626
|
| 15 |
+
anyio/_core/_contextmanagers.py,sha256=YInBCabiEeS-UaP_Jdxa1CaFC71ETPW8HZTHIM8Rsc8,7215
|
| 16 |
+
anyio/_core/_eventloop.py,sha256=c2EdcBX-xnKwxPcC4Pjn3_qG9I-x4IWFO2R9RqCGjM4,6448
|
| 17 |
+
anyio/_core/_exceptions.py,sha256=Y3aq-Wxd7Q2HqwSg7nZPvRsHEuGazv_qeet6gqEBdPk,4407
|
| 18 |
+
anyio/_core/_fileio.py,sha256=uc7t10Vb-If7GbdWM_zFf-ajUe6uek63fSt7IBLlZW0,25731
|
| 19 |
+
anyio/_core/_resources.py,sha256=NbmU5O5UX3xEyACnkmYX28Fmwdl-f-ny0tHym26e0w0,435
|
| 20 |
+
anyio/_core/_signals.py,sha256=mjTBB2hTKNPRlU0IhnijeQedpWOGERDiMjSlJQsFrug,1016
|
| 21 |
+
anyio/_core/_sockets.py,sha256=RBXHcUqZt5gg_-OOfgHVv8uq2FSKk1uVUzTdpjBoI1o,34977
|
| 22 |
+
anyio/_core/_streams.py,sha256=FczFwIgDpnkK0bODWJXMpsUJYdvAD04kaUaGzJU8DK0,1806
|
| 23 |
+
anyio/_core/_subprocesses.py,sha256=EXm5igL7dj55iYkPlbYVAqtbqxJxjU-6OndSTIx9SRg,8047
|
| 24 |
+
anyio/_core/_synchronization.py,sha256=MgVVqFzvt580tHC31LiOcq1G6aryut--xRG4Ff8KwxQ,20869
|
| 25 |
+
anyio/_core/_tasks.py,sha256=pVB7K6AAulzUM8YgXAeqNZG44nSyZ1bYJjH8GznC00I,5435
|
| 26 |
+
anyio/_core/_tempfile.py,sha256=lHb7CW4FyIlpkf5ADAf4VmLHCKwEHF9nxqNyBCFFUiA,19697
|
| 27 |
+
anyio/_core/_testing.py,sha256=u7MPqGXwpTxqI7hclSdNA30z2GH1Nw258uwKvy_RfBg,2340
|
| 28 |
+
anyio/_core/_typedattr.py,sha256=P4ozZikn3-DbpoYcvyghS_FOYAgbmUxeoU8-L_07pZM,2508
|
| 29 |
+
anyio/abc/__init__.py,sha256=6mWhcl_pGXhrgZVHP_TCfMvIXIOp9mroEFM90fYCU_U,2869
|
| 30 |
+
anyio/abc/_eventloop.py,sha256=GlzgB3UJGgG6Kr7olpjOZ-o00PghecXuofVDQ_5611Q,10749
|
| 31 |
+
anyio/abc/_resources.py,sha256=DrYvkNN1hH6Uvv5_5uKySvDsnknGVDe8FCKfko0VtN8,783
|
| 32 |
+
anyio/abc/_sockets.py,sha256=ECTY0jLEF18gryANHR3vFzXzGdZ-xPwELq1QdgOb0Jo,13258
|
| 33 |
+
anyio/abc/_streams.py,sha256=005GKSCXGprxnhucILboSqc2JFovECZk9m3p-qqxXVc,7640
|
| 34 |
+
anyio/abc/_subprocesses.py,sha256=cumAPJTktOQtw63IqG0lDpyZqu_l1EElvQHMiwJgL08,2067
|
| 35 |
+
anyio/abc/_tasks.py,sha256=KC7wrciE48AINOI-AhPutnFhe1ewfP7QnamFlDzqesQ,3721
|
| 36 |
+
anyio/abc/_testing.py,sha256=tBJUzkSfOXJw23fe8qSJ03kJlShOYjjaEyFB6k6MYT8,1821
|
| 37 |
+
anyio/from_thread.py,sha256=L-0w1HxJ6BSb-KuVi57k5Tkc3yzQrx3QK5tAxMPcY-0,19141
|
| 38 |
+
anyio/functools.py,sha256=HWj7GBEmc0Z-mZg3uok7Z7ZJn0rEC_0Pzbt0nYUDaTQ,10973
|
| 39 |
+
anyio/lowlevel.py,sha256=AyKLVK3LaWSoK39LkCKxE4_GDMLKZBNqTrLUgk63y80,5158
|
| 40 |
+
anyio/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
| 41 |
+
anyio/pytest_plugin.py,sha256=3jAFQn0jv_pyoWE2GBBlHaj9sqXj4e8vob0_hgrsXE8,10244
|
| 42 |
+
anyio/streams/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
| 43 |
+
anyio/streams/buffered.py,sha256=2R3PeJhe4EXrdYqz44Y6-Eg9R6DrmlsYrP36Ir43-po,6263
|
| 44 |
+
anyio/streams/file.py,sha256=4WZ7XGz5WNu39FQHvqbe__TQ0HDP9OOhgO1mk9iVpVU,4470
|
| 45 |
+
anyio/streams/memory.py,sha256=F0zwzvFJKAhX_LRZGoKzzqDC2oMM-f-yyTBrEYEGOaU,10740
|
| 46 |
+
anyio/streams/stapled.py,sha256=T8Xqwf8K6EgURPxbt1N4i7A8BAk-gScv-GRhjLXIf_o,4390
|
| 47 |
+
anyio/streams/text.py,sha256=BcVAGJw1VRvtIqnv-o0Rb0pwH7p8vwlvl21xHq522ag,5765
|
| 48 |
+
anyio/streams/tls.py,sha256=Jpxy0Mfbcp1BxHCwE-YjSSFaLnIBbnnwur-excYThs4,15368
|
| 49 |
+
anyio/to_interpreter.py,sha256=_mLngrMy97TMR6VbW4Y6YzDUk9ZuPcQMPlkuyRh3C9k,7100
|
| 50 |
+
anyio/to_process.py,sha256=J7gAA_YOuoHqnpDAf5fm1Qu6kOmTzdFbiDNvnV755vk,9798
|
| 51 |
+
anyio/to_thread.py,sha256=menEgXYmUV7Fjg_9WqCV95P9MAtQS8BzPGGcWB_QnfQ,2687
|
anyio-4.12.1.dist-info/REQUESTED
ADDED
|
File without changes
|
anyio-4.12.1.dist-info/WHEEL
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Wheel-Version: 1.0
|
| 2 |
+
Generator: setuptools (80.9.0)
|
| 3 |
+
Root-Is-Purelib: true
|
| 4 |
+
Tag: py3-none-any
|
| 5 |
+
|
anyio-4.12.1.dist-info/entry_points.txt
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[pytest11]
|
| 2 |
+
anyio = anyio.pytest_plugin
|
anyio-4.12.1.dist-info/top_level.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
anyio
|
dataset-metadata.json
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"title": "mamba-packages",
|
| 3 |
+
"id": "pmsalmankhan/mamba-packages",
|
| 4 |
+
"licenses": [
|
| 5 |
+
{
|
| 6 |
+
"name": "CC0-1.0"
|
| 7 |
+
}
|
| 8 |
+
]
|
| 9 |
+
}
|
datasets/__init__.py
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright 2020 The HuggingFace Datasets Authors and the TensorFlow Datasets Authors.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
|
| 15 |
+
__version__ = "4.3.0"
|
| 16 |
+
|
| 17 |
+
from .arrow_dataset import Column, Dataset
|
| 18 |
+
from .arrow_reader import ReadInstruction
|
| 19 |
+
from .builder import ArrowBasedBuilder, BuilderConfig, DatasetBuilder, GeneratorBasedBuilder
|
| 20 |
+
from .combine import concatenate_datasets, interleave_datasets
|
| 21 |
+
from .dataset_dict import DatasetDict, IterableDatasetDict
|
| 22 |
+
from .download import *
|
| 23 |
+
from .features import *
|
| 24 |
+
from .fingerprint import disable_caching, enable_caching, is_caching_enabled
|
| 25 |
+
from .info import DatasetInfo
|
| 26 |
+
from .inspect import (
|
| 27 |
+
get_dataset_config_info,
|
| 28 |
+
get_dataset_config_names,
|
| 29 |
+
get_dataset_default_config_name,
|
| 30 |
+
get_dataset_infos,
|
| 31 |
+
get_dataset_split_names,
|
| 32 |
+
)
|
| 33 |
+
from .iterable_dataset import IterableColumn, IterableDataset
|
| 34 |
+
from .load import load_dataset, load_dataset_builder, load_from_disk
|
| 35 |
+
from .splits import (
|
| 36 |
+
NamedSplit,
|
| 37 |
+
NamedSplitAll,
|
| 38 |
+
Split,
|
| 39 |
+
SplitBase,
|
| 40 |
+
SplitDict,
|
| 41 |
+
SplitGenerator,
|
| 42 |
+
SplitInfo,
|
| 43 |
+
SubSplitInfo,
|
| 44 |
+
percent,
|
| 45 |
+
)
|
| 46 |
+
from .utils import *
|
| 47 |
+
from .utils import logging
|
datasets/arrow_dataset.py
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
datasets/arrow_reader.py
ADDED
|
@@ -0,0 +1,620 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright 2020 The HuggingFace Datasets Authors and the TensorFlow Datasets Authors.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
|
| 15 |
+
# Lint as: python3
|
| 16 |
+
"""Arrow ArrowReader."""
|
| 17 |
+
|
| 18 |
+
import copy
|
| 19 |
+
import math
|
| 20 |
+
import os
|
| 21 |
+
import re
|
| 22 |
+
from dataclasses import dataclass
|
| 23 |
+
from functools import partial
|
| 24 |
+
from typing import TYPE_CHECKING, Optional, Union
|
| 25 |
+
|
| 26 |
+
import pyarrow as pa
|
| 27 |
+
import pyarrow.parquet as pq
|
| 28 |
+
from tqdm.contrib.concurrent import thread_map
|
| 29 |
+
|
| 30 |
+
from .download.download_config import DownloadConfig # noqa: F401
|
| 31 |
+
from .naming import _split_re, filenames_for_dataset_split
|
| 32 |
+
from .table import InMemoryTable, MemoryMappedTable, Table, concat_tables
|
| 33 |
+
from .utils import logging
|
| 34 |
+
from .utils import tqdm as hf_tqdm
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
if TYPE_CHECKING:
|
| 38 |
+
from .info import DatasetInfo # noqa: F401
|
| 39 |
+
from .splits import Split, SplitInfo # noqa: F401
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
logger = logging.get_logger(__name__)
|
| 43 |
+
|
| 44 |
+
HF_GCP_BASE_URL = "https://storage.googleapis.com/huggingface-nlp/cache/datasets"
|
| 45 |
+
|
| 46 |
+
_SUB_SPEC_RE = re.compile(
|
| 47 |
+
rf"""
|
| 48 |
+
^
|
| 49 |
+
(?P<split>{_split_re[1:-1]})
|
| 50 |
+
(\[
|
| 51 |
+
((?P<from>-?[\d_]+)
|
| 52 |
+
(?P<from_pct>%)?)?
|
| 53 |
+
:
|
| 54 |
+
((?P<to>-?[\d_]+)
|
| 55 |
+
(?P<to_pct>%)?)?
|
| 56 |
+
\])?(\((?P<rounding>[^\)]*)\))?
|
| 57 |
+
$
|
| 58 |
+
""", # remove ^ and $
|
| 59 |
+
re.X,
|
| 60 |
+
)
|
| 61 |
+
|
| 62 |
+
_ADDITION_SEP_RE = re.compile(r"\s*\+\s*")
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
class DatasetNotOnHfGcsError(ConnectionError):
    """The dataset could not be retrieved from the HF Google Cloud Storage."""
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
class MissingFilesOnHfGcsError(ConnectionError):
    """Some files are missing on the HF Google Cloud Storage."""
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
@dataclass(frozen=True)
class FileInstructions:
    """The file instructions associated with a split ReadInstruction.

    Attributes:
        num_examples: `int`, the total number of examples.
        file_instructions: List[dict(filename, skip, take)], the files information.
            The filenames contain the relative path, not absolute.
            skip/take indicate which examples to read in the file: `ds.slice(skip, take)`.
    """

    num_examples: int
    file_instructions: list[dict]
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
def make_file_instructions(
    name: str,
    split_infos: list["SplitInfo"],
    instruction: Union[str, "ReadInstruction"],
    filetype_suffix: Optional[str] = None,
    prefix_path: Optional[str] = None,
) -> FileInstructions:
    """Returns instructions of the split dict.

    Args:
        name (`str`): Name of the dataset.
        split_infos (`list` of `[SplitInfo]`): Dataset splits information.
        instruction ([`ReadInstruction`] or `str`): Reading instruction for a dataset.
        filetype_suffix (`str`, *optional*): Suffix of dataset files, e.g. 'arrow' or 'parquet'.
        prefix_path (`str`, *optional*): Prefix of dataset files, e.g. directory name.

    Returns:
        [`FileInstructions`]

    Raises:
        TypeError: If `name` is not a `str`.
        ValueError: If `name` is an empty string.
    """
    if not isinstance(name, str):
        raise TypeError(f"Expected str 'name', but got: {type(name).__name__}")
    elif not name:
        raise ValueError("Expected non-empty str 'name'")
    name2len = {info.name: info.num_examples for info in split_infos}
    name2shard_lengths = {info.name: info.shard_lengths for info in split_infos}
    name2filenames = {
        info.name: filenames_for_dataset_split(
            path=prefix_path,
            dataset_name=name,
            split=info.name,
            filetype_suffix=filetype_suffix,
            shard_lengths=name2shard_lengths[info.name],
        )
        for info in split_infos
    }
    if not isinstance(instruction, ReadInstruction):
        instruction = ReadInstruction.from_spec(instruction)
    # Create the absolute instruction (per split)
    absolute_instructions = instruction.to_absolute(name2len)

    # For each split, return the files instruction (skip/take)
    file_instructions = []
    num_examples = 0
    for abs_instr in absolute_instructions:
        split_length = name2len[abs_instr.splitname]
        filenames = name2filenames[abs_instr.splitname]
        shard_lengths = name2shard_lengths[abs_instr.splitname]
        # Missing bounds default to the whole split.
        from_ = 0 if abs_instr.from_ is None else abs_instr.from_
        to = split_length if abs_instr.to is None else abs_instr.to
        if shard_lengths is None:  # not sharded
            for filename in filenames:
                take = to - from_
                if take == 0:
                    continue
                num_examples += take
                file_instructions.append({"filename": filename, "skip": from_, "take": take})
        else:  # sharded
            index_start = 0  # Beginning (included) of moving window.
            index_end = 0  # End (excluded) of moving window.
            for filename, shard_length in zip(filenames, shard_lengths):
                index_end += shard_length
                if from_ < index_end and to > index_start:  # There is something to take.
                    skip = from_ - index_start if from_ > index_start else 0
                    # take == -1 means "read to the end of this shard".
                    take = to - index_start - skip if to < index_end else -1
                    if take != 0:
                        file_instructions.append({"filename": filename, "skip": skip, "take": take})
                        num_examples += shard_length - skip if take == -1 else take
                # Always advance the window, including when an empty slice
                # falls inside this shard.
                index_start += shard_length
    return FileInstructions(
        num_examples=num_examples,
        file_instructions=file_instructions,
    )
|
| 165 |
+
|
| 166 |
+
|
| 167 |
+
class BaseReader:
|
| 168 |
+
"""
|
| 169 |
+
Build a Dataset object out of Instruction instance(s).
|
| 170 |
+
"""
|
| 171 |
+
|
| 172 |
+
def __init__(self, path: str, info: Optional["DatasetInfo"]):
|
| 173 |
+
"""Initializes ArrowReader.
|
| 174 |
+
|
| 175 |
+
Args:
|
| 176 |
+
path (str): path where tfrecords are stored.
|
| 177 |
+
info (DatasetInfo): info about the dataset.
|
| 178 |
+
"""
|
| 179 |
+
self._path: str = path
|
| 180 |
+
self._info: Optional["DatasetInfo"] = info
|
| 181 |
+
self._filetype_suffix: Optional[str] = None
|
| 182 |
+
|
| 183 |
+
def _get_table_from_filename(self, filename_skip_take, in_memory=False) -> Table:
|
| 184 |
+
"""Returns a Dataset instance from given (filename, skip, take)."""
|
| 185 |
+
raise NotImplementedError
|
| 186 |
+
|
| 187 |
+
def _read_files(self, files, in_memory=False) -> Table:
|
| 188 |
+
"""Returns Dataset for given file instructions.
|
| 189 |
+
|
| 190 |
+
Args:
|
| 191 |
+
files: List[dict(filename, skip, take)], the files information.
|
| 192 |
+
The filenames contain the absolute path, not relative.
|
| 193 |
+
skip/take indicates which example read in the file: `ds.slice(skip, take)`
|
| 194 |
+
in_memory (bool, default False): Whether to copy the data in-memory.
|
| 195 |
+
"""
|
| 196 |
+
if len(files) == 0 or not all(isinstance(f, dict) for f in files):
|
| 197 |
+
raise ValueError("please provide valid file informations")
|
| 198 |
+
files = copy.deepcopy(files)
|
| 199 |
+
for f in files:
|
| 200 |
+
f["filename"] = os.path.join(self._path, f["filename"])
|
| 201 |
+
|
| 202 |
+
pa_tables = thread_map(
|
| 203 |
+
partial(self._get_table_from_filename, in_memory=in_memory),
|
| 204 |
+
files,
|
| 205 |
+
tqdm_class=hf_tqdm,
|
| 206 |
+
desc="Loading dataset shards",
|
| 207 |
+
# set `disable=None` rather than `disable=False` by default to disable progress bar when no TTY attached
|
| 208 |
+
disable=len(files) <= 16 or None,
|
| 209 |
+
)
|
| 210 |
+
pa_tables = [t for t in pa_tables if len(t) > 0]
|
| 211 |
+
if not pa_tables and (self._info is None or self._info.features is None):
|
| 212 |
+
raise ValueError(
|
| 213 |
+
"Tried to read an empty table. Please specify at least info.features to create an empty table with the right type."
|
| 214 |
+
)
|
| 215 |
+
pa_tables = pa_tables or [InMemoryTable.from_batches([], schema=pa.schema(self._info.features.type))]
|
| 216 |
+
pa_table = concat_tables(pa_tables) if len(pa_tables) != 1 else pa_tables[0]
|
| 217 |
+
return pa_table
|
| 218 |
+
|
| 219 |
+
def get_file_instructions(self, name, instruction, split_infos):
|
| 220 |
+
"""Return list of dict {'filename': str, 'skip': int, 'take': int}"""
|
| 221 |
+
file_instructions = make_file_instructions(
|
| 222 |
+
name, split_infos, instruction, filetype_suffix=self._filetype_suffix, prefix_path=self._path
|
| 223 |
+
)
|
| 224 |
+
files = file_instructions.file_instructions
|
| 225 |
+
return files
|
| 226 |
+
|
| 227 |
+
def read(
|
| 228 |
+
self,
|
| 229 |
+
name,
|
| 230 |
+
instructions,
|
| 231 |
+
split_infos,
|
| 232 |
+
in_memory=False,
|
| 233 |
+
):
|
| 234 |
+
"""Returns Dataset instance(s).
|
| 235 |
+
|
| 236 |
+
Args:
|
| 237 |
+
name (str): name of the dataset.
|
| 238 |
+
instructions (ReadInstruction): instructions to read.
|
| 239 |
+
Instruction can be string and will then be passed to the Instruction
|
| 240 |
+
constructor as it.
|
| 241 |
+
split_infos (list of SplitInfo proto): the available splits for dataset.
|
| 242 |
+
in_memory (bool, default False): Whether to copy the data in-memory.
|
| 243 |
+
|
| 244 |
+
Returns:
|
| 245 |
+
kwargs to build a single Dataset instance.
|
| 246 |
+
"""
|
| 247 |
+
|
| 248 |
+
files = self.get_file_instructions(name, instructions, split_infos)
|
| 249 |
+
if not files:
|
| 250 |
+
msg = f'Instruction "{instructions}" corresponds to no data!'
|
| 251 |
+
raise ValueError(msg)
|
| 252 |
+
return self.read_files(files=files, original_instructions=instructions, in_memory=in_memory)
|
| 253 |
+
|
| 254 |
+
def read_files(
    self,
    files: list[dict],
    original_instructions: Union[None, "ReadInstruction", "Split"] = None,
    in_memory=False,
):
    """Build the Dataset kwargs for a set of file instructions.

    Args:
        files: List[dict(filename, skip, take)], the files information,
            passed as-is to `self._read_files`.
        original_instructions: the instructions originally used to build the
            split; when given, their string form is wrapped in a `Split` and
            stored under the "split" key.
        in_memory (bool, default False): Whether to copy the data in-memory.

    Returns:
        dict with keys "arrow_table", "info" and "split": kwargs to build a
        Dataset instance.
    """
    table = self._read_files(files, in_memory=in_memory)
    named_split = None
    if original_instructions is not None:
        # Imported lazily, only when a human-readable split name is needed.
        from .splits import Split  # noqa

        named_split = Split(str(original_instructions))
    return {"arrow_table": table, "info": self._info, "split": named_split}
|
| 283 |
+
|
| 284 |
+
|
| 285 |
+
class ArrowReader(BaseReader):
    """
    Build a Dataset object out of Instruction instance(s).
    This Reader reads arrow files either memory-mapped (`MemoryMappedTable`)
    or copied in memory (`InMemoryTable`).
    """

    def __init__(self, path: str, info: Optional["DatasetInfo"]):
        """Initializes ArrowReader.

        Args:
            path (str): path where Arrow files are stored.
            info (DatasetInfo): info about the dataset.
        """
        super().__init__(path, info)
        self._filetype_suffix = "arrow"

    def _get_table_from_filename(self, filename_skip_take, in_memory=False) -> Table:
        """Returns the table read from a file instruction, sliced if requested.

        Args:
            filename_skip_take (dict): must contain "filename"; "skip" and
                "take" are optional. A "take" of -1 means "every row after
                `skip`".
            in_memory (bool, default False): Whether to copy the data in-memory.

        Returns:
            The whole table when no effective slice is requested, otherwise
            `table.slice(skip, take)`.
        """
        filename = filename_skip_take["filename"]
        skip = filename_skip_take.get("skip")
        take = filename_skip_take.get("take")
        table = ArrowReader.read_table(filename, in_memory=in_memory)
        # Resolve the -1 sentinel only when there is an offset to subtract:
        # without "skip" no slicing happens below, so the full table is
        # returned instead of failing on `len(table) - None`.
        if take == -1 and skip is not None:
            take = len(table) - skip
        # here we don't want to slice an empty table, or it may segfault
        if skip is not None and take is not None and not (skip == 0 and take == len(table)):
            table = table.slice(skip, take)
        return table

    @staticmethod
    def read_table(filename, in_memory=False) -> Table:
        """
        Read table from file.

        Args:
            filename (str): File name of the table.
            in_memory (bool, default=False): Whether to copy the data in-memory.

        Returns:
            `InMemoryTable` if `in_memory` is true, else `MemoryMappedTable`,
            built with `from_file(filename)`.
        """
        table_cls = InMemoryTable if in_memory else MemoryMappedTable
        return table_cls.from_file(filename)
|
| 330 |
+
|
| 331 |
+
|
| 332 |
+
class ParquetReader(BaseReader):
    """
    Build a Dataset object out of Instruction instance(s).
    This Reader reads parquet files with `pq.read_table(..., memory_map=True)`;
    note that the parquet data itself is still loaded in memory.
    """

    def __init__(self, path: str, info: Optional["DatasetInfo"]):
        """Initializes ParquetReader.

        Args:
            path (str): path where parquet files are stored.
            info (DatasetInfo): info about the dataset.
        """
        super().__init__(path, info)
        self._filetype_suffix = "parquet"

    def _get_table_from_filename(self, filename_skip_take, **kwargs):
        """Returns the table read from a file instruction, sliced if requested.

        Args:
            filename_skip_take (dict): must contain "filename"; "skip" and
                "take" are optional. A "take" of -1 means "every row after
                `skip`", mirroring `ArrowReader._get_table_from_filename`.
            **kwargs: accepted for signature compatibility and ignored.

        Returns:
            The whole table when no effective slice is requested, otherwise
            `pa_table.slice(skip, take)`.
        """
        filename = filename_skip_take["filename"]
        skip = filename_skip_take.get("skip")
        take = filename_skip_take.get("take")
        # Parquet read_table always loads data in memory, independently of memory_map
        pa_table = pq.read_table(filename, memory_map=True)
        # Turn the -1 sentinel into an explicit length so a negative value is
        # never handed to `slice`.
        if take == -1 and skip is not None:
            take = len(pa_table) - skip
        # here we don't want to slice an empty table, or it may segfault
        if skip is not None and take is not None and not (skip == 0 and take == len(pa_table)):
            pa_table = pa_table.slice(skip, take)
        return pa_table
|
| 361 |
+
|
| 362 |
+
|
| 363 |
+
@dataclass(frozen=True)
|
| 364 |
+
class _AbsoluteInstruction:
|
| 365 |
+
"""A machine friendly slice: defined absolute positive boundaries."""
|
| 366 |
+
|
| 367 |
+
splitname: str
|
| 368 |
+
from_: int # uint (starting index).
|
| 369 |
+
to: int # uint (ending index).
|
| 370 |
+
|
| 371 |
+
|
| 372 |
+
@dataclass(frozen=True)
|
| 373 |
+
class _RelativeInstruction:
|
| 374 |
+
"""Represents a single parsed slicing instruction, can use % and negatives."""
|
| 375 |
+
|
| 376 |
+
splitname: str
|
| 377 |
+
from_: Optional[int] = None # int (starting index) or None if no lower boundary.
|
| 378 |
+
to: Optional[int] = None # int (ending index) or None if no upper boundary.
|
| 379 |
+
unit: Optional[str] = None
|
| 380 |
+
rounding: Optional[str] = None
|
| 381 |
+
|
| 382 |
+
def __post_init__(self):
|
| 383 |
+
if self.unit is not None and self.unit not in ["%", "abs"]:
|
| 384 |
+
raise ValueError("unit must be either % or abs")
|
| 385 |
+
if self.rounding is not None and self.rounding not in ["closest", "pct1_dropremainder"]:
|
| 386 |
+
raise ValueError("rounding must be either closest or pct1_dropremainder")
|
| 387 |
+
if self.unit != "%" and self.rounding is not None:
|
| 388 |
+
raise ValueError("It is forbidden to specify rounding if not using percent slicing.")
|
| 389 |
+
if self.unit == "%" and self.from_ is not None and abs(self.from_) > 100:
|
| 390 |
+
raise ValueError("Percent slice boundaries must be > -100 and < 100.")
|
| 391 |
+
if self.unit == "%" and self.to is not None and abs(self.to) > 100:
|
| 392 |
+
raise ValueError("Percent slice boundaries must be > -100 and < 100.")
|
| 393 |
+
# Update via __dict__ due to instance being "frozen"
|
| 394 |
+
self.__dict__["rounding"] = "closest" if self.rounding is None and self.unit == "%" else self.rounding
|
| 395 |
+
|
| 396 |
+
|
| 397 |
+
def _str_to_read_instruction(spec):
    """Build a ReadInstruction from a spec string matched by `_SUB_SPEC_RE`.

    The unit is "%" when either boundary carries a percent marker (the
    "from_pct" / "to_pct" groups), and "abs" otherwise.

    Raises:
        ValueError: if `spec` does not match `_SUB_SPEC_RE`.
    """
    match = _SUB_SPEC_RE.match(spec)
    if not match:
        raise ValueError(f"Unrecognized instruction format: {spec}")
    raw_from = match.group("from")
    raw_to = match.group("to")
    uses_percent = match.group("from_pct") or match.group("to_pct")
    return ReadInstruction(
        split_name=match.group("split"),
        rounding=match.group("rounding"),
        from_=int(raw_from) if raw_from else None,
        to=int(raw_to) if raw_to else None,
        unit="%" if uses_percent else "abs",
    )
|
| 410 |
+
|
| 411 |
+
|
| 412 |
+
def _pct_to_abs_pct1(boundary, num_examples):
|
| 413 |
+
# Using math.trunc here, since -99.5% should give -99%, not -100%.
|
| 414 |
+
if num_examples < 100:
|
| 415 |
+
msg = (
|
| 416 |
+
'Using "pct1_dropremainder" rounding on a split with less than 100 '
|
| 417 |
+
"elements is forbidden: it always results in an empty dataset."
|
| 418 |
+
)
|
| 419 |
+
raise ValueError(msg)
|
| 420 |
+
return boundary * math.trunc(num_examples / 100.0)
|
| 421 |
+
|
| 422 |
+
|
| 423 |
+
def _pct_to_abs_closest(boundary, num_examples):
|
| 424 |
+
return int(round(boundary * num_examples / 100.0))
|
| 425 |
+
|
| 426 |
+
|
| 427 |
+
def _rel_to_abs_instr(rel_instr, name2len):
    """Returns _AbsoluteInstruction instance for given RelativeInstruction.

    Missing boundaries default to the split's start (0) and end
    (num_examples). Percent boundaries are converted with the rounding
    function selected by `rel_instr.rounding`. Negative boundaries are
    counted from the end of the split, and every boundary is finally clamped
    to at most num_examples.

    Args:
        rel_instr: RelativeInstruction instance.
        name2len: dict {split_name: num_examples}.

    Raises:
        ValueError: if the instruction's split is not a key of `name2len`.
    """
    to_abs = _pct_to_abs_closest if rel_instr.rounding == "closest" else _pct_to_abs_pct1
    split_name = rel_instr.splitname
    if split_name not in name2len:
        raise ValueError(f'Unknown split "{split_name}". Should be one of {list(name2len)}.')
    size = name2len[split_name]
    in_percent = rel_instr.unit == "%"

    # First pass: fill in defaults and convert percents (lower bound first).
    bounds = []
    for boundary, default in ((rel_instr.from_, 0), (rel_instr.to, size)):
        if boundary is None:
            bounds.append(default)
        elif in_percent:
            bounds.append(to_abs(boundary, size))
        else:
            bounds.append(boundary)

    # Second pass: resolve negatives from the end, then cap at the split size.
    start, stop = (min(max(size + b, 0) if b < 0 else b, size) for b in bounds)
    return _AbsoluteInstruction(split_name, start, stop)
|
| 454 |
+
|
| 455 |
+
|
| 456 |
+
class ReadInstruction:
    """Reading instruction for a dataset.

    Examples::

      # The following lines are equivalent:
      ds = datasets.load_dataset('mnist', split='test[:33%]')
      ds = datasets.load_dataset('mnist', split=datasets.ReadInstruction.from_spec('test[:33%]'))
      ds = datasets.load_dataset('mnist', split=datasets.ReadInstruction('test', to=33, unit='%'))
      ds = datasets.load_dataset('mnist', split=datasets.ReadInstruction(
          'test', from_=0, to=33, unit='%'))

      # The following lines are equivalent:
      ds = datasets.load_dataset('mnist', split='test[:33%]+train[1:-1]')
      ds = datasets.load_dataset('mnist', split=datasets.ReadInstruction.from_spec(
          'test[:33%]+train[1:-1]'))
      ds = datasets.load_dataset('mnist', split=(
          datasets.ReadInstruction('test', to=33, unit='%') +
          datasets.ReadInstruction('train', from_=1, to=-1, unit='abs')))

      # The following lines are equivalent:
      ds = datasets.load_dataset('mnist', split='test[:33%](pct1_dropremainder)')
      ds = datasets.load_dataset('mnist', split=datasets.ReadInstruction.from_spec(
          'test[:33%](pct1_dropremainder)'))
      ds = datasets.load_dataset('mnist', split=datasets.ReadInstruction(
          'test', from_=0, to=33, unit='%', rounding="pct1_dropremainder"))

      # 10-fold validation:
      tests = datasets.load_dataset(
          'mnist',
          split=[datasets.ReadInstruction('train', from_=k, to=k+10, unit='%')
                 for k in range(0, 100, 10)])
      trains = datasets.load_dataset(
          'mnist',
          split=[datasets.ReadInstruction('train', to=k, unit='%') + datasets.ReadInstruction('train', from_=k+10, unit='%')
                 for k in range(0, 100, 10)])

    """

    def _init(self, relative_instructions):
        # Private initializer, shared by __init__ and the factory method below.
        self._relative_instructions = relative_instructions

    @classmethod
    def _read_instruction_from_relative_instructions(cls, relative_instructions):
        """Returns ReadInstruction obj initialized with relative_instructions."""
        # Use __new__ to bypass __init__ used by public API and not convenient here.
        result = cls.__new__(cls)
        result._init(relative_instructions)  # pylint: disable=protected-access
        return result

    def __init__(self, split_name, rounding=None, from_=None, to=None, unit=None):
        """Initialize ReadInstruction.

        Args:
            split_name (str): name of the split to read. Eg: 'train'.
            rounding (str, optional): The rounding behaviour to use when percent slicing is
                used. Must be left unset unless `unit` is '%' (a ValueError is raised otherwise).
                Possible values:
                 - 'closest' (default): The specified percentages are rounded to the
                     closest value. Use this if you want specified percents to be as
                     much exact as possible.
                 - 'pct1_dropremainder': the specified percentages are treated as
                     multiple of 1%. Use this option if you want consistency. Eg:
                         len(5%) == 5 * len(1%).
                     Using this option, one might not be able to use the full set of
                     examples, if the number of those is not a multiple of 100.
            from_ (int):
            to (int): alternative way of specifying slicing boundaries. If any of
                {from_, to, unit} argument is used, slicing cannot be specified as
                string.
            unit (str): optional, one of:
                '%': to set the slicing unit as percents of the split size.
                'abs': to set the slicing unit as absolute numbers.
        """
        # This constructor is not always called. See factory method
        # `_read_instruction_from_relative_instructions`. Common init instructions
        # MUST be placed in the _init method.
        self._init([_RelativeInstruction(split_name, from_, to, unit, rounding)])

    @classmethod
    def from_spec(cls, spec):
        """Creates a `ReadInstruction` instance out of a string spec.

        Args:
            spec (`str`):
                Split(s) + optional slice(s) to read + optional rounding
                if percents are used as the slicing unit. A slice can be specified,
                using absolute numbers (`int`) or percentages (`int`).

        Examples:

            ```
            test: test split.
            test + validation: test split + validation split.
            test[10:]: test split, minus its first 10 records.
            test[:10%]: first 10% records of test split.
            test[:20%](pct1_dropremainder): first 20% records, rounded with the pct1_dropremainder rounding.
            test[:-5%]+train[40%:60%]: first 95% of test + middle 20% of train.
            ```

        Returns:
            ReadInstruction instance.
        """
        spec = str(spec)  # Need to convert to str in case of NamedSplit instance.
        subs = _ADDITION_SEP_RE.split(spec)
        if not subs:
            raise ValueError(f"No instructions could be built out of {spec}")
        instruction = _str_to_read_instruction(subs[0])
        # Fold the remaining sub-specs onto the first one with `+`.
        return sum((_str_to_read_instruction(sub) for sub in subs[1:]), instruction)

    def to_spec(self):
        """Render the instruction as a string, in the notation shown in `from_spec`.

        Each relative instruction becomes `split[from:to]`, with a `%` suffix
        on percent boundaries and a `(rounding)` suffix when a percent slice
        uses a rounding other than "closest". Instructions are joined with `+`.
        """
        rel_instr_specs = []
        for rel_instr in self._relative_instructions:
            rel_instr_spec = rel_instr.splitname
            if rel_instr.from_ is not None or rel_instr.to is not None:
                unit = "%" if rel_instr.unit == "%" else ""
                from_ = f"{rel_instr.from_}{unit}" if rel_instr.from_ is not None else ""
                to = f"{rel_instr.to}{unit}" if rel_instr.to is not None else ""
                rounding = rel_instr.rounding
                # "closest" is the default, so it is left implicit.
                show_rounding = unit == "%" and rounding is not None and rounding != "closest"
                rounding_str = f"({rounding})" if show_rounding else ""
                rel_instr_spec += f"[{from_}:{to}]" + rounding_str
            rel_instr_specs.append(rel_instr_spec)
        return "+".join(rel_instr_specs)

    def __add__(self, other):
        """Returns a new ReadInstruction obj, result of appending other to self.

        Raises:
            TypeError: if `other` is not a ReadInstruction.
            ValueError: if the first instruction of both operands is a percent
                slice and their rounding values differ.
        """
        if not isinstance(other, ReadInstruction):
            msg = "ReadInstruction can only be added to another ReadInstruction obj."
            raise TypeError(msg)
        self_ris = self._relative_instructions
        other_ris = other._relative_instructions  # pylint: disable=protected-access
        # Rounding only exists for percent slices. An instruction without a
        # unit (no slicing at all) has rounding None and must not be mistaken
        # for a percent slice with a conflicting rounding.
        if (
            self_ris[0].unit == "%"
            and other_ris[0].unit == "%"
            and self_ris[0].rounding != other_ris[0].rounding
        ):
            raise ValueError("It is forbidden to sum ReadInstruction instances with different rounding values.")
        return self._read_instruction_from_relative_instructions(self_ris + other_ris)

    def __str__(self):
        return self.to_spec()

    def __repr__(self):
        return f"ReadInstruction({self._relative_instructions})"

    def to_absolute(self, name2len):
        """Translate instruction into a list of absolute instructions.

        Those absolute instructions are then to be added together.

        Args:
            name2len (`dict`):
                Associating split names to number of examples.

        Returns:
            list of _AbsoluteInstruction instances (corresponds to the + in spec).
        """
        return [_rel_to_abs_instr(rel_instr, name2len) for rel_instr in self._relative_instructions]
|
datasets/arrow_writer.py
ADDED
|
@@ -0,0 +1,766 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright 2020 The HuggingFace Datasets Authors and the TensorFlow Datasets Authors.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 8 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 9 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 10 |
+
# See the License for the specific language governing permissions and
|
| 11 |
+
# limitations under the License.
|
| 12 |
+
|
| 13 |
+
# Lint as: python3
|
| 14 |
+
"""To write records into Arrow and Parquet files."""
|
| 15 |
+
|
| 16 |
+
import json
|
| 17 |
+
import sys
|
| 18 |
+
from collections.abc import Iterable
|
| 19 |
+
from typing import Any, Optional, Union
|
| 20 |
+
|
| 21 |
+
import fsspec
|
| 22 |
+
import numpy as np
|
| 23 |
+
import pyarrow as pa
|
| 24 |
+
import pyarrow.parquet as pq
|
| 25 |
+
from fsspec.core import url_to_fs
|
| 26 |
+
|
| 27 |
+
from . import config
|
| 28 |
+
from .features import Audio, Features, Image, Pdf, Value, Video
|
| 29 |
+
from .features.features import (
|
| 30 |
+
FeatureType,
|
| 31 |
+
List,
|
| 32 |
+
_ArrayXDExtensionType,
|
| 33 |
+
_visit,
|
| 34 |
+
cast_to_python_objects,
|
| 35 |
+
generate_from_arrow_type,
|
| 36 |
+
get_nested_type,
|
| 37 |
+
list_of_np_array_to_pyarrow_listarray,
|
| 38 |
+
numpy_to_pyarrow_listarray,
|
| 39 |
+
to_pyarrow_listarray,
|
| 40 |
+
)
|
| 41 |
+
from .filesystems import is_remote_filesystem
|
| 42 |
+
from .info import DatasetInfo
|
| 43 |
+
from .keyhash import DuplicatedKeysError, KeyHasher
|
| 44 |
+
from .table import array_cast, cast_array_to_feature, embed_table_storage, table_cast
|
| 45 |
+
from .utils import logging
|
| 46 |
+
from .utils.py_utils import asdict, convert_file_size_to_int, first_non_null_non_empty_value
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
logger = logging.get_logger(__name__)
|
| 50 |
+
|
| 51 |
+
type_ = type # keep python's type function
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
def get_arrow_writer_batch_size_from_features(features: Optional[Features]) -> Optional[int]:
    """
    Get the writer_batch_size that defines the maximum record batch size in the arrow files based on configuration values.

    Every feature is visited; Image, Audio, Video and binary `Value` features
    each contribute the matching `config.ARROW_RECORD_BATCH_SIZE_FOR_*_DATASETS`
    limit when that setting is not `None`, and the smallest limit wins.

    Args:
        features (`datasets.Features` or `None`):
            Dataset Features from `datasets`.
    Returns:
        writer_batch_size (`Optional[int]`):
            Writer batch size to pass to a dataset builder.
            `None` when `features` is empty or no limit applies, in which case
            the caller falls back to its own default.
    """
    if not features:
        return None

    # Checked in order; the first matching rule whose setting is set applies.
    rules = (
        (lambda f: isinstance(f, Image), "ARROW_RECORD_BATCH_SIZE_FOR_IMAGE_DATASETS"),
        (lambda f: isinstance(f, Audio), "ARROW_RECORD_BATCH_SIZE_FOR_AUDIO_DATASETS"),
        (lambda f: isinstance(f, Video), "ARROW_RECORD_BATCH_SIZE_FOR_VIDEO_DATASETS"),
        (
            lambda f: isinstance(f, Value) and f.dtype == "binary",
            "ARROW_RECORD_BATCH_SIZE_FOR_BINARY_DATASETS",
        ),
    )
    batch_size = np.inf

    def set_batch_size(feature: FeatureType) -> None:
        nonlocal batch_size
        for matches, setting in rules:
            if not matches(feature):
                continue
            limit = getattr(config, setting)
            if limit is not None:
                batch_size = min(batch_size, limit)
                return

    _visit(features, set_batch_size)

    # Still the initial sentinel object: no feature imposed a limit.
    return None if batch_size is np.inf else batch_size
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
def get_writer_batch_size_from_features(features: Optional[Features]) -> Optional[int]:
    """
    Get the writer_batch_size that defines the maximum row group size in the parquet files based on configuration values.

    Every feature is visited; Image, Audio, Video and binary `Value` features
    each contribute the matching `config.PARQUET_ROW_GROUP_SIZE_FOR_*_DATASETS`
    limit when that setting is not `None`, and the smallest limit wins.
    Smaller row groups help random access to a parquet file, since accessing
    1 row requires to read its entire row group.

    Args:
        features (`datasets.Features` or `None`):
            Dataset Features from `datasets`.
    Returns:
        writer_batch_size (`Optional[int]`):
            Writer batch size to pass to a parquet writer.
            `None` when `features` is empty or no limit applies, in which case
            the caller falls back to its own default.
    """
    if not features:
        return None

    # Checked in order; the first matching rule whose setting is set applies.
    rules = (
        (lambda f: isinstance(f, Image), "PARQUET_ROW_GROUP_SIZE_FOR_IMAGE_DATASETS"),
        (lambda f: isinstance(f, Audio), "PARQUET_ROW_GROUP_SIZE_FOR_AUDIO_DATASETS"),
        (lambda f: isinstance(f, Video), "PARQUET_ROW_GROUP_SIZE_FOR_VIDEO_DATASETS"),
        (
            lambda f: isinstance(f, Value) and f.dtype == "binary",
            "PARQUET_ROW_GROUP_SIZE_FOR_BINARY_DATASETS",
        ),
    )
    batch_size = np.inf

    def set_batch_size(feature: FeatureType) -> None:
        nonlocal batch_size
        for matches, setting in rules:
            if not matches(feature):
                continue
            limit = getattr(config, setting)
            if limit is not None:
                batch_size = min(batch_size, limit)
                return

    _visit(features, set_batch_size)

    # Still the initial sentinel object: no feature imposed a limit.
    return None if batch_size is np.inf else batch_size
|
| 131 |
+
|
| 132 |
+
|
| 133 |
+
def get_writer_batch_size_from_data_size(num_rows: int, num_bytes: int) -> int:
    """
    Get the writer_batch_size that defines the maximum row group size in the parquet files.

    The size is the number of rows that fits in `config.MAX_ROW_GROUP_SIZE`
    bytes at the dataset's average row size, with a floor of 10 rows.
    This allows to optimize random access to parquet file, since accessing 1 row requires
    to read its entire row group.

    Args:
        num_rows (`int`):
            Number of rows in the dataset.
        num_bytes (`int`):
            Number of bytes in the dataset.
            For dataset with external files to embed (image, audio, videos), this can also be an
            estimate from `dataset._estimate_nbytes()`.
    Returns:
        writer_batch_size (`int`):
            Writer batch size to pass to a parquet writer: at least 10 when
            `num_bytes` is positive, otherwise 1.
    """
    if num_bytes > 0:
        max_row_group_bytes = convert_file_size_to_int(config.MAX_ROW_GROUP_SIZE)
        return max(10, num_rows * max_row_group_bytes // num_bytes)
    return 1
|
| 155 |
+
|
| 156 |
+
|
| 157 |
+
class SchemaInferenceError(ValueError):
    """ValueError subclass for schema-inference failures (raise sites are not shown in this part of the file)."""
|
| 159 |
+
|
| 160 |
+
|
| 161 |
+
class TypedSequence:
    """
    Wrapper around raw column data that controls how ``pa.array`` turns it into an Arrow array.

    Compared to calling ``pa.array`` on the data directly, it provides:
    - Extension types such as ``datasets.features.Array2DExtensionType``:
        pyarrow does not build extension arrays on its own; one normally has to call
        ``pa.ExtensionArray.from_storage(type, pa.array(data, type.storage_type))``
        to obtain one.
    - A ``try_type`` parameter usable in place of ``type``:
        When data is transformed we prefer to keep the previous type if the new data allows it.
        For example :func:`datasets.Dataset.map` should not change column types by default.
        If the data does not fit ``try_type``, the type is simply ignored.
    - A clearer error message when a pyarrow array overflows.

    Example::

        from datasets.features import Array2D, Array2DExtensionType, Value
        from datasets.arrow_writer import TypedSequence
        import pyarrow as pa

        arr = pa.array(TypedSequence([1, 2, 3], type=Value("int32")))
        assert arr.type == pa.int32()

        arr = pa.array(TypedSequence([1, 2, 3], try_type=Value("int32")))
        assert arr.type == pa.int32()

        arr = pa.array(TypedSequence(["foo", "bar"], try_type=Value("int32")))
        assert arr.type == pa.string()

        arr = pa.array(TypedSequence([[[1, 2, 3]]], type=Array2D((1, 3), "int64")))
        assert arr.type == Array2DExtensionType((1, 3), "int64")

        table = pa.Table.from_pydict({
            "image": TypedSequence([[[1, 2, 3]]], type=Array2D((1, 3), "int64"))
        })
        assert table["image"].type == Array2DExtensionType((1, 3), "int64")

    """

    def __init__(
        self,
        data: Iterable,
        type: Optional[FeatureType] = None,
        try_type: Optional[FeatureType] = None,
        optimized_int_type: Optional[FeatureType] = None,
    ):
        """Store the data together with the typing strategy to apply at conversion time.

        Args:
            data: values of the column.
            type: feature type that the resulting array must have.
            try_type: feature type to attempt; dropped if the data is not compatible with it.
            optimized_int_type: smaller integer type to attempt for int64 data. Only used when
                neither ``type`` nor ``try_type`` is given.

        Raises:
            ValueError: if both ``type`` and ``try_type`` are provided.
        """
        # `type` is mandatory while `try_type` is best-effort: accepting both would be ambiguous.
        if not (type is None or try_type is None):
            raise ValueError("You cannot specify both type and try_type")
        self.data = data
        self.type = type
        self.try_type = try_type  # silently dropped when the data does not match it
        self.optimized_int_type = optimized_int_type
        # flags describing which best-effort strategies apply
        no_explicit_type = type is None and try_type is None
        self.trying_type = try_type is not None
        self.trying_int_optimization = no_explicit_type and optimized_int_type is not None
        # filled in once the sequence has been converted, see get_inferred_type()
        self._inferred_type = None

    def get_inferred_type(self) -> FeatureType:
        """Return the feature type inferred for this sequence.

        The type is obtained by converting the sequence to an Arrow array and mapping the array
        type back to a feature type. The conversion can be costly, so the result is stored on the
        instance and reused by later calls.

        Returns:
            FeatureType: inferred feature type of the sequence.
        """
        inferred = self._inferred_type
        if inferred is None:
            inferred = generate_from_arrow_type(pa.array(self).type)
            self._inferred_type = inferred
        return inferred

    @staticmethod
    def _infer_custom_type_and_encode(data: Iterable) -> tuple[Iterable, Optional[FeatureType]]:
        """Infer the feature type of custom python objects and encode them for Arrow.

        Only objects that cannot be handed to pyarrow as-is are handled here (PIL images and
        pdfplumber PDFs, alone or in lists). The first non-null, non-empty value decides which
        feature type applies; null entries are kept as ``None``.

        Args:
            data (Iterable): array of data to infer the type, e.g. a list of PIL images.

        Returns:
            Tuple[Iterable, Optional[FeatureType]]: a tuple with:
                - the data, encoded when a custom feature type was recognized
                - the recognized feature type, or None when the data holds no supported
                    custom objects.
        """
        # The optional dependency is only inspected if the user already imported it.
        if config.PIL_AVAILABLE and "PIL" in sys.modules:
            import PIL.Image

            _, sample = first_non_null_non_empty_value(data)
            if isinstance(sample, PIL.Image.Image):
                encoded = [None if item is None else Image().encode_example(item) for item in data]
                return encoded, Image()
            if isinstance(sample, list) and isinstance(sample[0], PIL.Image.Image):
                encoded = [
                    None if items is None else [Image().encode_example(item) for item in items] for items in data
                ]
                return encoded, List(Image())
        if config.PDFPLUMBER_AVAILABLE and "pdfplumber" in sys.modules:
            import pdfplumber

            _, sample = first_non_null_non_empty_value(data)
            if isinstance(sample, pdfplumber.pdf.PDF):
                encoded = [None if item is None else Pdf().encode_example(item) for item in data]
                return encoded, Pdf()
            if isinstance(sample, list) and isinstance(sample[0], pdfplumber.pdf.PDF):
                encoded = [
                    None if items is None else [Pdf().encode_example(item) for item in items] for items in data
                ]
                return encoded, List(Pdf())
        return data, None

    def __arrow_array__(self, type: Optional[pa.DataType] = None):
        """Build the Arrow array for this sequence; invoked by ``pa.array(typed_sequence)``.

        Raises:
            ValueError: if ``pa.array`` was called with an explicit ``type``.
            OverflowError: if pyarrow reports an overflow while building the array.
        """
        if type is not None:
            raise ValueError("TypedSequence is supposed to be used with pa.array(typed_sequence, type=None)")
        del type  # the argument must not be used past this point

        data = self.data
        # custom objects are only auto-detected when the caller gave no type at all
        if self.type is None and self.try_type is None:
            data, self._inferred_type = self._infer_custom_type_and_encode(data)

        if self._inferred_type is not None:
            feature_type = self._inferred_type
        elif self.trying_type:
            feature_type = self.try_type
        else:
            feature_type = self.type
        pa_type = None if feature_type is None else get_nested_type(feature_type)
        optimized_int_pa_type = None
        if self.optimized_int_type is not None:
            optimized_int_pa_type = get_nested_type(self.optimized_int_type)

        def rebuild_without_list_optimization():
            # Last resort for "Could not convert" errors: convert again with list casting
            # optimization disabled, then apply the requested type allowing casts to string.
            rebuilt = pa.array(cast_to_python_objects(data, only_1d_for_numpy=True, optimize_list_casting=False))
            if feature_type is not None:
                rebuilt = cast_array_to_feature(
                    rebuilt, feature_type, allow_primitive_to_str=True, allow_decimal_to_str=True
                )
            return rebuilt

        used_python_objects = False
        try:
            # extension types are built from their storage array
            if isinstance(pa_type, _ArrayXDExtensionType):
                return pa.ExtensionArray.from_storage(pa_type, to_pyarrow_listarray(data, pa_type))

            # numpy inputs have dedicated, faster conversion paths
            if isinstance(data, np.ndarray):
                arr = numpy_to_pyarrow_listarray(data)
            elif isinstance(data, list) and data and isinstance(first_non_null_non_empty_value(data)[1], np.ndarray):
                arr = list_of_np_array_to_pyarrow_listarray(data)
            else:
                used_python_objects = True
                arr = pa.array(cast_to_python_objects(data, only_1d_for_numpy=True))

            if self.trying_int_optimization:
                # shrink int64 data (flat, or nested in up to two list levels) to the optimized type
                if pa.types.is_int64(arr.type):
                    arr = arr.cast(optimized_int_pa_type)
                elif pa.types.is_list(arr.type):
                    value_type = arr.type.value_type
                    if pa.types.is_int64(value_type):
                        arr = array_cast(arr, pa.list_(optimized_int_pa_type))
                    elif pa.types.is_list(value_type) and pa.types.is_int64(value_type.value_type):
                        arr = array_cast(arr, pa.list_(pa.list_(optimized_int_pa_type)))
            elif feature_type is not None:
                # cast_array_to_feature supports custom types like Audio and Image.
                # When merely trying type "string", integers and floats must not be turned into
                # strings; that conversion is reserved for a type the user explicitly required.
                arr = cast_array_to_feature(
                    arr,
                    feature_type,
                    allow_primitive_to_str=not self.trying_type,
                    allow_decimal_to_str=not self.trying_type,
                )
            return arr
        except (
            TypeError,
            pa.lib.ArrowInvalid,
            pa.lib.ArrowNotImplementedError,
        ) as err:  # type errors and overflows
            # an unsupported cast is only tolerated when the type was merely being tried
            if isinstance(err, pa.lib.ArrowNotImplementedError) and not self.trying_type:
                raise

            if self.trying_type:
                try:  # second chance: same conversion, without applying the tried type
                    if isinstance(data, np.ndarray):
                        return numpy_to_pyarrow_listarray(data)
                    elif isinstance(data, list) and data and any(isinstance(value, np.ndarray) for value in data):
                        return list_of_np_array_to_pyarrow_listarray(data)
                    else:
                        used_python_objects = True
                        return pa.array(cast_to_python_objects(data, only_1d_for_numpy=True))
                except pa.lib.ArrowInvalid as retry_err:
                    if "overflow" in str(retry_err):
                        raise OverflowError(
                            f"There was an overflow with type {type_(data)}. Try to reduce writer_batch_size to have batches smaller than 2GB.\n({retry_err})"
                        ) from None
                    elif self.trying_int_optimization and "not in range" in str(retry_err):
                        optimized_int_pa_type_str = np.dtype(optimized_int_pa_type.to_pandas_dtype()).name
                        logger.info(
                            f"Failed to cast a sequence to {optimized_int_pa_type_str}. Falling back to int64."
                        )
                        return arr
                    elif used_python_objects and "Could not convert" in str(retry_err):
                        return rebuild_without_list_optimization()
                    else:
                        raise
            elif "overflow" in str(err):
                raise OverflowError(
                    f"There was an overflow with type {type_(data)}. Try to reduce writer_batch_size to have batches smaller than 2GB.\n({err})"
                ) from None
            elif self.trying_int_optimization and "not in range" in str(err):
                # the optimized integer type is too small: keep the array as it was before the cast
                optimized_int_pa_type_str = np.dtype(optimized_int_pa_type.to_pandas_dtype()).name
                logger.info(f"Failed to cast a sequence to {optimized_int_pa_type_str}. Falling back to int64.")
                return arr
            elif used_python_objects and "Could not convert" in str(err):
                return rebuild_without_list_optimization()
            else:
                raise
|
| 382 |
+
|
| 383 |
+
|
| 384 |
+
class OptimizedTypedSequence(TypedSequence):
    """TypedSequence that tries a compact integer type for well-known tokenizer columns."""

    def __init__(
        self,
        data,
        type: Optional[FeatureType] = None,
        try_type: Optional[FeatureType] = None,
        col: Optional[str] = None,
        optimized_int_type: Optional[FeatureType] = None,
    ):
        """Pick the optimized integer type for the column, then defer to TypedSequence.

        Args:
            data: values of the column.
            type: feature type that the resulting array must have.
            try_type: feature type to attempt; dropped if the data is not compatible with it.
            col: column name, used to look up a default optimized integer type.
            optimized_int_type: explicit optimized integer type. When left to None and neither
                ``type`` nor ``try_type`` is given, the default registered for ``col`` is used
                (None for unknown columns).
        """
        optimized_int_type_by_col = {
            "attention_mask": Value("int8"),  # binary tensor
            "special_tokens_mask": Value("int8"),
            "input_ids": Value("int32"),  # typical vocab size: 0-50k (max ~500k, never > 1M)
            "token_type_ids": Value(
                "int8"
            ),  # binary mask; some (XLNetModel) use an additional token represented by a 2
        }
        # Fall back to the per-column default only when the caller did not choose a type
        # explicitly; previously an explicit `optimized_int_type` was always overwritten here.
        if type is None and try_type is None and optimized_int_type is None:
            optimized_int_type = optimized_int_type_by_col.get(col, None)
        super().__init__(data, type=type, try_type=try_type, optimized_int_type=optimized_int_type)
|
| 404 |
+
|
| 405 |
+
|
| 406 |
+
class ArrowWriter:
    """Buffers examples, rows, batches or tables and writes them to an Arrow stream."""

    def __init__(
        self,
        schema: Optional[pa.Schema] = None,
        features: Optional[Features] = None,
        path: Optional[str] = None,
        stream: Optional[pa.NativeFile] = None,
        fingerprint: Optional[str] = None,
        writer_batch_size: Optional[int] = None,
        hash_salt: Optional[str] = None,
        check_duplicates: Optional[bool] = False,
        disable_nullable: bool = False,
        update_features: bool = False,
        with_metadata: bool = True,
        unit: str = "examples",
        embed_local_files: bool = False,
        storage_options: Optional[dict] = None,
    ):
        """Set up the destination and the typing information of the writer.

        ``features`` takes precedence over ``schema``; when neither is given, both are inferred
        from the first data written. When ``stream`` is not given, ``path`` is opened for
        writing and the resulting stream is owned (and closed) by the writer.

        Raises:
            ValueError: if neither ``path`` nor ``stream`` is provided.
        """
        if path is None and stream is None:
            raise ValueError("At least one of path and stream must be provided.")

        self._schema: Optional[pa.Schema] = None
        self._features: Optional[Features] = None
        if features is not None:
            self._features = features
        elif schema is not None:
            self._schema = schema
            self._features = Features.from_arrow_schema(schema)

        # the salt (e.g. the split name) is mixed into key hashes; default to an empty salt
        self._hasher = KeyHasher(hash_salt if hash_salt is not None else "")

        self._check_duplicates = check_duplicates
        self._disable_nullable = disable_nullable

        if stream is not None:
            # caller-provided stream: never closed by close()
            self._fs = None
            self._path = None
            self.stream = stream
            self._closable_stream = False
        else:
            fs, path = url_to_fs(path, **(storage_options or {}))
            self._fs: fsspec.AbstractFileSystem = fs
            self._path = self._fs.unstrip_protocol(path) if is_remote_filesystem(self._fs) else path
            self.stream = self._fs.open(path, "wb")
            self._closable_stream = True

        self.fingerprint = fingerprint
        self.disable_nullable = disable_nullable
        # explicit value first, then a value derived from the features, then the global default
        batch_size = writer_batch_size
        if not batch_size:
            batch_size = get_arrow_writer_batch_size_from_features(self._features)
        if not batch_size:
            batch_size = config.DEFAULT_MAX_BATCH_SIZE
        self.writer_batch_size = batch_size
        self.update_features = update_features
        self.with_metadata = with_metadata
        self.unit = unit
        self.embed_local_files = embed_local_files

        self._num_examples = 0
        self._num_bytes = 0
        self.current_examples: list[tuple[dict[str, Any], str]] = []
        self.current_rows: list[pa.Table] = []
        self.pa_writer: Optional[pa.RecordBatchStreamWriter] = None
        self.hkey_record = []

    def __len__(self):
        """Return the number of written and staged examples"""
        staged = len(self.current_examples) + len(self.current_rows)
        return self._num_examples + staged

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    def close(self):
        """Close the Arrow writer and, if the writer opened it, the underlying stream."""
        # Closing an already closed writer raises (pyarrow.lib.ArrowInvalid: Invalid operation
        # on closed file), so closing is best-effort.
        if self.pa_writer:  # it might be None
            try:
                self.pa_writer.close()
            except Exception:  # pyarrow.lib.ArrowInvalid, OSError
                pass
        if self._closable_stream and not self.stream.closed:
            self.stream.close()  # This also closes self.pa_writer if it is opened

    def _build_schema(self, inferred_schema: pa.Schema):
        """Combine the declared schema/features with the ones inferred from the data.

        Returns:
            The ``(schema, features)`` pair to write with. Without declared features, the
            inferred ones are used. With ``update_features``, inferred features are used but
            declared features are kept for fields whose Arrow field is unchanged.
        """
        declared_features = self._features
        schema = self.schema
        features = declared_features
        inferred_features = Features.from_arrow_schema(inferred_schema)
        if declared_features is None:
            features = inferred_features
            schema = inferred_features.arrow_schema
        elif self.update_features:  # keep original features if they match, or update them
            declared_fields = {field.name: field for field in declared_features.type}
            for inferred_field in inferred_features.type:
                name = inferred_field.name
                if name in declared_fields and inferred_field == declared_fields[name]:
                    inferred_features[name] = declared_features[name]
            features = inferred_features
            schema = inferred_schema

        if self.disable_nullable:
            schema = pa.schema(pa.field(field.name, field.type, nullable=False) for field in schema)
        if self.with_metadata:
            metadata = self._build_metadata(DatasetInfo(features=features), self.fingerprint)
        else:
            metadata = {}
        schema = schema.with_metadata(metadata)

        return schema, features

    def _build_writer(self, inferred_schema: pa.Schema):
        """Fix the final schema/features and open the Arrow stream writer on the stream."""
        schema, features = self._build_schema(inferred_schema)
        self._schema = schema
        self._features = features
        self.pa_writer = pa.RecordBatchStreamWriter(self.stream, schema)

    @property
    def schema(self):
        """Currently known schema, derived from the features if needed; ``[]`` when unknown."""
        known_schema = self._schema
        if known_schema is None and self._features is not None:
            known_schema = pa.schema(self._features.type)
        if self._disable_nullable and known_schema is not None:
            known_schema = pa.schema(pa.field(field.name, field.type, nullable=False) for field in known_schema)
        if known_schema is None:
            return []
        return known_schema

    @staticmethod
    def _build_metadata(info: DatasetInfo, fingerprint: Optional[str] = None) -> dict[str, str]:
        """Serialize the dataset info (and fingerprint, if any) under the "huggingface" key."""
        info_keys = ["features"]  # we can add support for more DatasetInfo keys in the future
        serialized_info = asdict(info)
        metadata = {"info": {key: serialized_info[key] for key in info_keys}}
        if fingerprint is not None:
            metadata["fingerprint"] = fingerprint
        return {"huggingface": json.dumps(metadata)}

    def _ordered_columns(self, columns) -> list:
        """Order the keys of ``columns``: schema columns first (in schema order), then the others."""
        schema = self.schema
        if not schema:
            return list(columns)
        schema_names = schema.names
        schema_cols = set(schema_names)
        available_cols = columns.keys()  # .keys() preserves the order (unlike set)
        common_cols = [col for col in schema_names if col in available_cols]
        extra_cols = [col for col in available_cols if col not in schema_cols]
        return common_cols + extra_cols

    def write_examples_on_file(self):
        """Flush the staged examples: gather them column by column and write them as one batch."""
        if not self.current_examples:
            return
        arrow_types = (pa.Array, pa.ChunkedArray)
        # preserve the order of the columns; current_examples holds (example, key) tuples
        batch_examples = {}
        for col in self._ordered_columns(self.current_examples[0][0]):
            values = [row[0][col] for row in self.current_examples]
            # Values can be Arrow arrays of 1 element.
            # This can happen in `.map()` when we want to re-write the same Arrow data
            if all(isinstance(value, arrow_types) for value in values):
                chunks = []
                for value in values:
                    if isinstance(value, pa.ChunkedArray):
                        chunks.extend(value.chunks)
                    else:
                        chunks.append(value)
                batch_examples[col] = pa.concat_arrays(chunks)
            else:
                batch_examples[col] = [
                    value.to_pylist()[0] if isinstance(value, arrow_types) else value for value in values
                ]
        self.write_batch(batch_examples=batch_examples)
        self.current_examples = []

    def write_rows_on_file(self):
        """Flush the staged single-row tables: concatenate them and write the resulting table."""
        if not self.current_rows:
            return
        self.write_table(pa.concat_tables(self.current_rows))
        self.current_rows = []

    def write(
        self,
        example: dict[str, Any],
        key: Optional[Union[str, int, bytes]] = None,
        writer_batch_size: Optional[int] = None,
    ):
        """Stage an (Example, Key) pair; staged examples are flushed once a full batch is reached.

        Args:
            example: the Example to add.
            key: Optional, a unique identifier(str, int or bytes) associated with each example
            writer_batch_size: Optional, number of staged examples that triggers a flush.
                Defaults to the writer's own batch size.
        """
        # Keys are only hashed and recorded when duplicate checking is enabled.
        if self._check_duplicates:
            key_hash = self._hasher.hash(key)
            self.current_examples.append((example, key_hash))
            # remember which key produced which hash, to report duplicates
            self.hkey_record.append((key_hash, key))
        else:
            # same tuple layout as above, to keep `self.current_examples` uniform
            self.current_examples.append((example, ""))

        if writer_batch_size is None:
            writer_batch_size = self.writer_batch_size
        if writer_batch_size is None or len(self.current_examples) < writer_batch_size:
            return
        if self._check_duplicates:
            self.check_duplicate_keys()
            # start from an empty record for the next batch
            self.hkey_record = []

        self.write_examples_on_file()

    def check_duplicate_keys(self):
        """Raises error if duplicates found in a batch"""
        seen_hashes = set()
        for key_hash, key in self.hkey_record:
            if key_hash not in seen_hashes:
                seen_hashes.add(key_hash)
                continue
            # report the positions of every example sharing the duplicated hash
            duplicate_key_indices = [
                str(self._num_examples + index)
                for index, (recorded_hash, _) in enumerate(self.hkey_record)
                if recorded_hash == key_hash
            ]

            raise DuplicatedKeysError(key, duplicate_key_indices)

    def write_row(self, row: pa.Table, writer_batch_size: Optional[int] = None):
        """Stage a single-row Table; staged rows are flushed once a full batch is reached.

        Args:
            row: the row to add.
            writer_batch_size: Optional, number of staged rows that triggers a flush.
                Defaults to the writer's own batch size.

        Raises:
            ValueError: if the table does not contain exactly one row.
        """
        if len(row) != 1:
            raise ValueError(f"Only single-row pyarrow tables are allowed but got table with {len(row)} rows.")
        self.current_rows.append(row)
        if writer_batch_size is None:
            writer_batch_size = self.writer_batch_size
        if writer_batch_size is not None and len(self.current_rows) >= writer_batch_size:
            self.write_rows_on_file()

    def write_batch(
        self,
        batch_examples: dict[str, list],
        writer_batch_size: Optional[int] = None,
        try_original_type: Optional[bool] = True,
    ):
        """Write a batch of Example to file.
        Ignores the batch if it appears to be empty,
        preventing a potential schema update of unknown types.

        Args:
            batch_examples: the batch of examples to add.
            writer_batch_size: Optional, forwarded to `write_table`.
            try_original_type: use `try_type` when instantiating OptimizedTypedSequence if `True`, otherwise `try_type = None`.
        """
        if batch_examples and len(next(iter(batch_examples.values()))) == 0:
            return
        # Until the writer exists, `update_features` turns the declared features into mere hints.
        if self.pa_writer is None and self.update_features:
            features, try_features = None, self._features
        else:
            features, try_features = self._features, None
        arrays = []
        inferred_features = Features()
        # preserve the order of the columns
        for col in self._ordered_columns(batch_examples):
            col_values = batch_examples[col]
            col_type = features[col] if features else None
            if isinstance(col_values, (pa.Array, pa.ChunkedArray)):
                if col_type is not None:
                    arrays.append(cast_array_to_feature(col_values, col_type))
                else:
                    arrays.append(col_values)
                inferred_features[col] = generate_from_arrow_type(col_values.type)
                continue
            col_try_type = None
            if try_original_type and try_features is not None and col in try_features:
                col_try_type = try_features[col]
            typed_sequence = OptimizedTypedSequence(col_values, type=col_type, try_type=col_try_type, col=col)
            arrays.append(pa.array(typed_sequence))
            inferred_features[col] = typed_sequence.get_inferred_type()
        schema = self.schema if self.pa_writer is not None else inferred_features.arrow_schema
        pa_table = pa.Table.from_arrays(arrays, schema=schema)
        self.write_table(pa_table, writer_batch_size)

    def write_table(self, pa_table: pa.Table, writer_batch_size: Optional[int] = None):
        """Write a Table to file.

        Args:
            pa_table: the Table to add.
            writer_batch_size: Optional, forwarded to the underlying writer's `write_table`.
                Defaults to the writer's own batch size.
        """
        if writer_batch_size is None:
            writer_batch_size = self.writer_batch_size
        # the first table written decides the schema when it is not fixed yet
        if self.pa_writer is None:
            self._build_writer(inferred_schema=pa_table.schema)
        pa_table = table_cast(pa_table.combine_chunks(), self._schema)
        if self.embed_local_files:
            pa_table = embed_table_storage(pa_table)
        self._num_bytes += pa_table.nbytes
        self._num_examples += pa_table.num_rows
        self.pa_writer.write_table(pa_table, writer_batch_size)

    def finalize(self, close_stream=True):
        """Flush everything still staged and close the Arrow writer.

        Args:
            close_stream: whether to also close the underlying stream.

        Returns:
            The ``(num_examples, num_bytes)`` written.

        Raises:
            SchemaInferenceError: if nothing was written and no schema or features are known.
        """
        self.write_rows_on_file()
        # finalize() can be called with fewer staged examples than writer_batch_size
        if self._check_duplicates:
            self.check_duplicate_keys()
            # start from an empty record for the next batch
            self.hkey_record = []
        self.write_examples_on_file()
        # If schema is known, infer features even if no examples were written
        if self.pa_writer is None:
            known_schema = self.schema
            if known_schema:
                self._build_writer(known_schema)
        if self.pa_writer is None:
            if close_stream:
                self.stream.close()
            raise SchemaInferenceError("Please pass `features` or at least one example when writing data")
        self.pa_writer.close()
        self.pa_writer = None
        if close_stream:
            self.stream.close()
        logger.debug(
            f"Done writing {self._num_examples} {self.unit} in {self._num_bytes} bytes {self._path if self._path else ''}."
        )
        return self._num_examples, self._num_bytes
|
| 745 |
+
|
| 746 |
+
|
| 747 |
+
class ParquetWriter(ArrowWriter):
    """ArrowWriter variant that writes Parquet data instead of an Arrow stream."""

    def __init__(self, *args, use_content_defined_chunking=True, write_page_index=True, **kwargs):
        """Create the writer.

        Args:
            *args, **kwargs: forwarded to `ArrowWriter`.
            use_content_defined_chunking: forwarded to the Parquet writer; `True` selects
                `config.DEFAULT_CDC_OPTIONS`.
            write_page_index: forwarded to the Parquet writer.
        """
        super().__init__(*args, **kwargs)
        # `True` is shorthand for the library-wide default chunking options
        cdc_options = use_content_defined_chunking
        if cdc_options is True:
            cdc_options = config.DEFAULT_CDC_OPTIONS
        self.use_content_defined_chunking = cdc_options
        self.write_page_index = write_page_index

    def _build_writer(self, inferred_schema: pa.Schema):
        """Fix the final schema/features and open the Parquet writer on the stream."""
        schema, features = self._build_schema(inferred_schema)
        self._schema = schema
        self._features = features
        cdc_options = self.use_content_defined_chunking
        self.pa_writer = pq.ParquetWriter(
            self.stream,
            schema,
            use_content_defined_chunking=cdc_options,
            write_page_index=self.write_page_index,
        )
        # record the chunking options in the file metadata unless chunking is disabled
        if cdc_options is not False:
            self.pa_writer.add_key_value_metadata({"content_defined_chunking": json.dumps(cdc_options)})
|
datasets/builder.py
ADDED
|
@@ -0,0 +1,1866 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright 2020 The HuggingFace Datasets Authors and the TensorFlow Datasets Authors.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
|
| 15 |
+
# Lint as: python3
|
| 16 |
+
"""DatasetBuilder base class."""
|
| 17 |
+
|
| 18 |
+
import abc
|
| 19 |
+
import contextlib
|
| 20 |
+
import copy
|
| 21 |
+
import inspect
|
| 22 |
+
import os
|
| 23 |
+
import posixpath
|
| 24 |
+
import shutil
|
| 25 |
+
import textwrap
|
| 26 |
+
import time
|
| 27 |
+
import urllib
|
| 28 |
+
from collections.abc import Iterable, Mapping
|
| 29 |
+
from dataclasses import dataclass
|
| 30 |
+
from functools import partial
|
| 31 |
+
from pathlib import Path
|
| 32 |
+
from typing import TYPE_CHECKING, Optional, Union
|
| 33 |
+
from unittest.mock import patch
|
| 34 |
+
|
| 35 |
+
import fsspec
|
| 36 |
+
from fsspec.core import url_to_fs
|
| 37 |
+
from multiprocess import Pool
|
| 38 |
+
from tqdm.contrib.concurrent import thread_map
|
| 39 |
+
|
| 40 |
+
from . import config, utils
|
| 41 |
+
from .arrow_dataset import Dataset
|
| 42 |
+
from .arrow_reader import (
|
| 43 |
+
ArrowReader,
|
| 44 |
+
ReadInstruction,
|
| 45 |
+
)
|
| 46 |
+
from .arrow_writer import ArrowWriter, ParquetWriter, SchemaInferenceError
|
| 47 |
+
from .data_files import DataFilesDict, DataFilesPatternsDict, sanitize_patterns
|
| 48 |
+
from .dataset_dict import DatasetDict, IterableDatasetDict
|
| 49 |
+
from .download.download_config import DownloadConfig
|
| 50 |
+
from .download.download_manager import DownloadManager, DownloadMode
|
| 51 |
+
from .download.streaming_download_manager import StreamingDownloadManager, xjoin
|
| 52 |
+
from .exceptions import DatasetGenerationCastError, DatasetGenerationError, FileFormatError, ManualDownloadError
|
| 53 |
+
from .features import Features
|
| 54 |
+
from .filesystems import (
|
| 55 |
+
is_remote_filesystem,
|
| 56 |
+
rename,
|
| 57 |
+
)
|
| 58 |
+
from .fingerprint import Hasher
|
| 59 |
+
from .info import DatasetInfo, PostProcessedInfo
|
| 60 |
+
from .iterable_dataset import ArrowExamplesIterable, ExamplesIterable, IterableDataset
|
| 61 |
+
from .keyhash import DuplicatedKeysError
|
| 62 |
+
from .naming import INVALID_WINDOWS_CHARACTERS_IN_PATH, camelcase_to_snakecase
|
| 63 |
+
from .splits import Split, SplitDict, SplitGenerator, SplitInfo
|
| 64 |
+
from .streaming import extend_dataset_builder_for_streaming
|
| 65 |
+
from .table import CastError
|
| 66 |
+
from .utils import logging
|
| 67 |
+
from .utils import tqdm as hf_tqdm
|
| 68 |
+
from .utils._filelock import FileLock
|
| 69 |
+
from .utils.file_utils import is_remote_url
|
| 70 |
+
from .utils.info_utils import VerificationMode, get_size_checksum_dict, verify_checksums, verify_splits
|
| 71 |
+
from .utils.py_utils import (
|
| 72 |
+
classproperty,
|
| 73 |
+
convert_file_size_to_int,
|
| 74 |
+
has_sufficient_disk_space,
|
| 75 |
+
iflatmap_unordered,
|
| 76 |
+
map_nested,
|
| 77 |
+
memoize,
|
| 78 |
+
size_str,
|
| 79 |
+
temporary_assignment,
|
| 80 |
+
)
|
| 81 |
+
from .utils.sharding import _number_of_shards_in_gen_kwargs, _split_gen_kwargs
|
| 82 |
+
from .utils.track import tracked_list
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
if TYPE_CHECKING:
|
| 86 |
+
from .load import DatasetModule
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
logger = logging.get_logger(__name__)
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
class InvalidConfigName(ValueError):
    """Raised when a builder config name contains characters that are invalid in a Windows path."""
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
@dataclass
class BuilderConfig:
    """Base class for `DatasetBuilder` data configuration.

    `DatasetBuilder` subclasses with data configuration options should subclass
    `BuilderConfig` and add their own properties.

    Attributes:
        name (`str`, defaults to `default`):
            The name of the configuration.
        version (`Version` or `str`, defaults to `0.0.0`):
            The version of the configuration.
        data_dir (`str`, *optional*):
            Path to the directory containing the source data.
        data_files (`str` or `Sequence` or `Mapping`, *optional*):
            Path(s) to source data file(s).
        description (`str`, *optional*):
            A human description of the configuration.
    """

    name: str = "default"
    version: Optional[Union[utils.Version, str]] = utils.Version("0.0.0")
    data_dir: Optional[str] = None
    data_files: Optional[Union[DataFilesDict, DataFilesPatternsDict]] = None
    description: Optional[str] = None

    def __post_init__(self):
        """Validate the config name and the type of `data_files`.

        Raises:
            InvalidConfigName: If `name` contains a character that is invalid in a Windows path.
            ValueError: If `data_files` is set but is neither a `DataFilesDict` nor a `DataFilesPatternsDict`.
        """
        # The config name is used to name the cache directory.
        for invalid_char in INVALID_WINDOWS_CHARACTERS_IN_PATH:
            if invalid_char in self.name:
                raise InvalidConfigName(
                    f"Bad characters from black list '{INVALID_WINDOWS_CHARACTERS_IN_PATH}' found in '{self.name}'. "
                    f"They could create issues when creating a directory for this config on Windows filesystem."
                )
        if self.data_files is not None and not isinstance(self.data_files, (DataFilesDict, DataFilesPatternsDict)):
            raise ValueError(f"Expected a DataFilesDict in data_files but got {self.data_files}")

    def __eq__(self, o):
        """Compare two configs on *all* instance attributes, not only the dataclass fields.

        Returns `NotImplemented` for objects that carry no `__dict__` (e.g. `None`, `str`, `int`)
        so that Python falls back to its default comparison instead of raising `AttributeError`.
        """
        # we need to override the default dataclass __eq__ since it doesn't check for
        # other attributes that the ones of the signature.
        if not hasattr(o, "__dict__"):
            return NotImplemented
        if set(self.__dict__.keys()) != set(o.__dict__.keys()):
            return False
        # The (key, value) tuples are compared rather than the bare values so that the
        # tuple comparison's identity shortcut applies to each attribute value.
        return all((k, getattr(self, k)) == (k, getattr(o, k)) for k in self.__dict__.keys())

    def create_config_id(
        self,
        config_kwargs: dict,
        custom_features: Optional[Features] = None,
    ) -> str:
        """
        The config id is used to build the cache directory.
        By default it is equal to the config name.
        However the name of a config is not sufficient to have a unique identifier for the dataset being generated
        since it doesn't take into account:
        - the config kwargs that can be used to overwrite attributes
        - the custom features used to write the dataset
        - the data_files for json/text/csv/pandas datasets

        Therefore the config id is just the config name with an optional suffix based on these.

        Args:
            config_kwargs (`dict`): Keyword arguments used to create or override the config. Not mutated.
            custom_features ([`Features`], *optional*): User-provided features, mixed into the suffix hash.

        Returns:
            `str`: `self.name`, or `self.name + "-" + suffix` when a suffix is needed.
        """
        # Imported explicitly: a bare `import urllib` does not guarantee that the
        # `urllib.parse` submodule has been loaded.
        from urllib.parse import quote_plus

        # Possibly add a suffix to the name to handle custom features/data_files/config_kwargs
        suffix: Optional[str] = None
        config_kwargs_to_add_to_suffix = config_kwargs.copy()
        # name and version are already used to build the cache directory
        config_kwargs_to_add_to_suffix.pop("name", None)
        config_kwargs_to_add_to_suffix.pop("version", None)
        # data dir handling (when specified it points to the manually downloaded data):
        # it was previously ignored before the introduction of config id because we didn't want
        # to change the config name. Now it's fine to take it into account for the config id.
        if "data_dir" in config_kwargs_to_add_to_suffix:
            if config_kwargs_to_add_to_suffix["data_dir"] is None:
                config_kwargs_to_add_to_suffix.pop("data_dir", None)
            else:
                # canonicalize the data dir to avoid two paths to the same location having different
                # hashes
                data_dir = config_kwargs_to_add_to_suffix["data_dir"]
                data_dir = os.path.normpath(data_dir)
                config_kwargs_to_add_to_suffix["data_dir"] = data_dir
        if config_kwargs_to_add_to_suffix:
            # we don't care about the order of the kwargs
            config_kwargs_to_add_to_suffix = {
                k: config_kwargs_to_add_to_suffix[k] for k in sorted(config_kwargs_to_add_to_suffix)
            }
            if all(isinstance(v, (str, bool, int, float)) for v in config_kwargs_to_add_to_suffix.values()):
                # Only simple scalar values get a human-readable "key=value" suffix.
                suffix = ",".join(
                    str(k) + "=" + quote_plus(str(v)) for k, v in config_kwargs_to_add_to_suffix.items()
                )
                if len(suffix) > 32:  # hash if too long
                    suffix = Hasher.hash(config_kwargs_to_add_to_suffix)
            else:
                suffix = Hasher.hash(config_kwargs_to_add_to_suffix)

        if custom_features is not None:
            # Fold the kwargs suffix (if any) and the custom features into a single hash.
            m = Hasher()
            if suffix:
                m.update(suffix)
            m.update(custom_features)
            suffix = m.hexdigest()

        if suffix:
            config_id = self.name + "-" + suffix
            if len(config_id) > config.MAX_DATASET_CONFIG_ID_READABLE_LENGTH:
                config_id = self.name + "-" + Hasher.hash(suffix)
            return config_id
        else:
            return self.name

    def _resolve_data_files(self, base_path: str, download_config: DownloadConfig) -> None:
        """Resolve `data_files` in place when it still holds unresolved patterns.

        Does nothing unless `self.data_files` is a `DataFilesPatternsDict`. When `self.data_dir`
        is set, patterns are resolved relative to `base_path` joined with `data_dir`.
        """
        if isinstance(self.data_files, DataFilesPatternsDict):
            base_path = xjoin(base_path, self.data_dir) if self.data_dir else base_path
            self.data_files = self.data_files.resolve(base_path, download_config)
|
| 208 |
+
|
| 209 |
+
|
| 210 |
+
class DatasetBuilder:
|
| 211 |
+
"""Abstract base class for all datasets.
|
| 212 |
+
|
| 213 |
+
`DatasetBuilder` has 3 key methods:
|
| 214 |
+
|
| 215 |
+
- [`DatasetBuilder.info`]: Documents the dataset, including feature
|
| 216 |
+
names, types, shapes, version, splits, citation, etc.
|
| 217 |
+
- [`DatasetBuilder.download_and_prepare`]: Downloads the source data
|
| 218 |
+
and writes it to disk.
|
| 219 |
+
- [`DatasetBuilder.as_dataset`]: Generates a [`Dataset`].
|
| 220 |
+
|
| 221 |
+
Some `DatasetBuilder`s expose multiple variants of the
|
| 222 |
+
dataset by defining a [`BuilderConfig`] subclass and accepting a
|
| 223 |
+
config object (or name) on construction. Configurable datasets expose a
|
| 224 |
+
pre-defined set of configurations in [`DatasetBuilder.builder_configs`].
|
| 225 |
+
|
| 226 |
+
Args:
|
| 227 |
+
cache_dir (`str`, *optional*):
|
| 228 |
+
Directory to cache data. Defaults to `"~/.cache/huggingface/datasets"`.
|
| 229 |
+
dataset_name (`str`, *optional*):
|
| 230 |
+
Name of the dataset, if different from the builder name. Useful for packaged builders
|
| 231 |
+
like csv, imagefolder, audiofolder, etc. to reflect the difference between datasets
|
| 232 |
+
that use the same packaged builder.
|
| 233 |
+
config_name (`str`, *optional*):
|
| 234 |
+
Name of the dataset configuration.
|
| 235 |
+
It affects the data generated on disk. Different configurations will have their own subdirectories and
|
| 236 |
+
versions.
|
| 237 |
+
If not provided, the default configuration is used (if it exists).
|
| 238 |
+
|
| 239 |
+
<Added version="2.3.0">
|
| 240 |
+
|
| 241 |
+
Parameter `name` was renamed to `config_name`.
|
| 242 |
+
|
| 243 |
+
</Added>
|
| 244 |
+
hash (`str`, *optional*):
|
| 245 |
+
Hash specific to the dataset builder code. Used to update the caching directory when the
|
| 246 |
+
dataset builder code is updated (to avoid reusing old data).
|
| 247 |
+
The typical caching directory (defined in `self._relative_data_dir`) is `name/version/hash/`.
|
| 248 |
+
base_path (`str`, *optional*):
|
| 249 |
+
Base path for relative paths that are used to download files.
|
| 250 |
+
This can be a remote URL.
|
| 251 |
+
features ([`Features`], *optional*):
|
| 252 |
+
Features types to use with this dataset.
|
| 253 |
+
It can be used to change the [`Features`] types of a dataset, for example.
|
| 254 |
+
token (`str` or `bool`, *optional*):
|
| 255 |
+
String or boolean to use as Bearer token for remote files on the
|
| 256 |
+
Datasets Hub. If `True`, will get token from `"~/.huggingface"`.
|
| 257 |
+
repo_id (`str`, *optional*):
|
| 258 |
+
ID of the dataset repository.
|
| 259 |
+
Used to distinguish builders with the same name but not coming from the same namespace, for example "rajpurkar/squad"
|
| 260 |
+
and "lhoestq/squad" repo IDs. In the latter, the builder name would be "lhoestq___squad".
|
| 261 |
+
data_files (`str` or `Sequence` or `Mapping`, *optional*):
|
| 262 |
+
Path(s) to source data file(s).
|
| 263 |
+
For builders like "csv" or "json" that need the user to specify data files. They can be either
|
| 264 |
+
local or remote files. For convenience, you can use a `DataFilesDict`.
|
| 265 |
+
data_dir (`str`, *optional*):
|
| 266 |
+
Path to directory containing source data file(s).
|
| 267 |
+
Use only if `data_files` is not passed, in which case it is equivalent to passing
|
| 268 |
+
`os.path.join(data_dir, "**")` as `data_files`.
|
| 269 |
+
For builders that require manual download, it must be the path to the local directory containing the
|
| 270 |
+
manually downloaded data.
|
| 271 |
+
storage_options (`dict`, *optional*):
|
| 272 |
+
Key/value pairs to be passed on to the dataset file-system backend, if any.
|
| 273 |
+
writer_batch_size (`int`, *optional*):
|
| 274 |
+
Batch size used by the ArrowWriter.
|
| 275 |
+
It defines the number of samples that are kept in memory before writing them
|
| 276 |
+
and also the length of the arrow chunks.
|
| 277 |
+
None means that the ArrowWriter will use its default value.
|
| 278 |
+
**config_kwargs (additional keyword arguments): Keyword arguments to be passed to the corresponding builder
|
| 279 |
+
configuration class, set on the class attribute [`DatasetBuilder.BUILDER_CONFIG_CLASS`]. The builder
|
| 280 |
+
configuration class is [`BuilderConfig`] or a subclass of it.
|
| 281 |
+
"""
|
| 282 |
+
|
| 283 |
+
# Default version
|
| 284 |
+
VERSION = None # Default version set in BuilderConfig
|
| 285 |
+
|
| 286 |
+
# Class for the builder config.
|
| 287 |
+
BUILDER_CONFIG_CLASS = BuilderConfig
|
| 288 |
+
|
| 289 |
+
# Named configurations that modify the data generated by download_and_prepare.
|
| 290 |
+
BUILDER_CONFIGS = []
|
| 291 |
+
|
| 292 |
+
# Optional default config name to be used when name is None
|
| 293 |
+
DEFAULT_CONFIG_NAME = None
|
| 294 |
+
|
| 295 |
+
# Default batch size used by the ArrowWriter
|
| 296 |
+
# It defines the number of samples that are kept in memory before writing them
|
| 297 |
+
# and also the length of the arrow chunks
|
| 298 |
+
# None means that the ArrowWriter will use its default value
|
| 299 |
+
DEFAULT_WRITER_BATCH_SIZE = None
|
| 300 |
+
|
| 301 |
+
def __init__(
    self,
    cache_dir: Optional[str] = None,
    dataset_name: Optional[str] = None,
    config_name: Optional[str] = None,
    hash: Optional[str] = None,
    base_path: Optional[str] = None,
    info: Optional[DatasetInfo] = None,
    features: Optional[Features] = None,
    token: Optional[Union[bool, str]] = None,
    repo_id: Optional[str] = None,
    data_files: Optional[Union[str, list, dict, DataFilesDict]] = None,
    data_dir: Optional[str] = None,
    storage_options: Optional[dict] = None,
    writer_batch_size: Optional[int] = None,
    config_id: Optional[str] = None,
    **config_kwargs,
):
    """Set up the builder: resolve data files, create the config, prefill the info and prepare the cache paths.

    For a local (non-remote) cache root this also creates the root directory and, under a file lock,
    either reloads the dataset info stored in an existing cache directory or removes that directory
    if it is empty.
    """
    # DatasetBuilder name
    self.name: str = camelcase_to_snakecase(self.__module__.split(".")[-1])
    self.hash: Optional[str] = hash
    self.base_path = base_path
    self.token = token
    self.repo_id = repo_id
    self.storage_options = storage_options or {}
    self.dataset_name = camelcase_to_snakecase(dataset_name) if dataset_name else self.name
    # A falsy writer_batch_size (None or 0) falls back to the class-level default.
    self._writer_batch_size = writer_batch_size or self.DEFAULT_WRITER_BATCH_SIZE

    # Anything that is not already a DataFilesDict (str, list, dict) is treated as patterns to resolve.
    if data_files is not None and not isinstance(data_files, DataFilesDict):
        data_files = DataFilesDict.from_patterns(
            sanitize_patterns(data_files),
            base_path=base_path,
            download_config=DownloadConfig(token=token, storage_options=self.storage_options),
        )

    # Prepare config: DatasetConfig contains name, version and description but can be extended by each dataset
    # `features` is only forwarded to the config when the config class accepts it.
    if "features" in inspect.signature(self.BUILDER_CONFIG_CLASS.__init__).parameters and features is not None:
        config_kwargs["features"] = features
    if data_files is not None:
        config_kwargs["data_files"] = data_files
    if data_dir is not None:
        config_kwargs["data_dir"] = data_dir
    self.config_kwargs = config_kwargs
    self.config, self.config_id = self._create_builder_config(
        config_name=config_name,
        custom_features=features,
        config_id=config_id,
        **config_kwargs,
    )

    # prepare info: DatasetInfo are a standardized dataclass across all datasets
    # Prefill datasetinfo
    if info is None:
        info = self._info()
    info.builder_name = self.name
    info.dataset_name = self.dataset_name
    info.config_name = self.config.name
    info.version = self.config.version
    self.info = info
    # update info with user specified infos
    if features is not None:
        self.info.features = features

    # Prepare data dirs:
    # cache_dir can be a remote bucket on GCS or S3
    self._cache_dir_root = str(cache_dir or config.HF_DATASETS_CACHE)
    # "~" is only expanded for local paths; remote URLs are kept verbatim.
    self._cache_dir_root = (
        self._cache_dir_root if is_remote_url(self._cache_dir_root) else os.path.expanduser(self._cache_dir_root)
    )
    # With an explicit cache_dir, downloads go under it; otherwise the configured global downloads path is used.
    self._cache_downloaded_dir = (
        posixpath.join(self._cache_dir_root, config.DOWNLOADED_DATASETS_DIR)
        if cache_dir
        else str(config.DOWNLOADED_DATASETS_PATH)
    )
    self._cache_downloaded_dir = (
        self._cache_downloaded_dir
        if is_remote_url(self._cache_downloaded_dir)
        else os.path.expanduser(self._cache_downloaded_dir)
    )

    # In case there exists a legacy cache directory
    self._legacy_relative_data_dir = None

    self._cache_dir = self._build_cache_dir()
    if not is_remote_url(self._cache_dir_root):
        os.makedirs(self._cache_dir_root, exist_ok=True)
        # One lock file per cache directory, stored at the cache root: the cache dir path is
        # flattened by replacing "/" with "_".
        lock_path = os.path.join(
            self._cache_dir_root, Path(self._cache_dir).as_posix().replace("/", "_") + ".lock"
        )
        with FileLock(lock_path):
            if os.path.exists(self._cache_dir):  # check if data exist
                if len(os.listdir(self._cache_dir)) > 0:
                    if os.path.exists(os.path.join(self._cache_dir, config.DATASET_INFO_FILENAME)):
                        logger.debug("Overwrite dataset info from restored data version if exists.")
                        self.info = DatasetInfo.from_directory(self._cache_dir)
                else:  # dir exists but no data, remove the empty dir as data aren't available anymore
                    logger.warning(
                        f"Old caching folder {self._cache_dir} for dataset {self.dataset_name} exists but no data were found. Removing it. "
                    )
                    os.rmdir(self._cache_dir)

    # Store in the cache by default unless the user specifies a custom output_dir to download_and_prepare
    self._output_dir = self._cache_dir
    self._fs: fsspec.AbstractFileSystem = fsspec.filesystem("file")

    # Set download manager
    self.dl_manager = None

    # Set to True by "datasets-cli test" to generate file checksums for (deprecated) dataset_infos.json independently of verification_mode value.
    self._record_infos = False

    # Set in `.download_and_prepare` once the format of the generated dataset is known
    self._file_format = None

    # Enable streaming (e.g. it patches "open" to work with remote files)
    extend_dataset_builder_for_streaming(self)
|
| 417 |
+
|
| 418 |
+
def __getstate__(self):
    """Return the instance `__dict__` itself (not a copy) as the pickling state."""
    return self.__dict__
|
| 420 |
+
|
| 421 |
+
def __setstate__(self, d):
    """Restore the instance from the state `d` and re-apply the streaming extensions."""
    # The given dict becomes the instance dict directly (no copy is made).
    self.__dict__ = d
    # Re-enable streaming, since patched functions are not kept when pickling
    extend_dataset_builder_for_streaming(self)
|
| 425 |
+
|
| 426 |
+
# Must be set for datasets that use 'data_dir' functionality - the ones
|
| 427 |
+
# that require users to do additional steps to download the data
|
| 428 |
+
# (this is usually due to some external regulations / rules).
|
| 429 |
+
# This field should contain a string with user instructions, including
|
| 430 |
+
# the list of files that should be present. It will be
|
| 431 |
+
# displayed in the dataset documentation.
|
| 432 |
+
@property
def manual_download_instructions(self) -> Optional[str]:
    """User instructions for datasets that require a manual download.

    The base implementation returns `None`; builders that rely on manually
    downloaded data are expected to override it with an instruction string.
    """
    return None
|
| 435 |
+
|
| 436 |
+
def _check_legacy_cache(self) -> Optional[str]:
    """Check for the old cache directory template {cache_dir}/{namespace}___{builder_name} from 2.13

    Returns:
        The legacy cache directory, relative to the cache root, if it exists on disk; `None` otherwise.
    """
    # Only builders from the `datasets` package, with a local cache and the "default" config, are considered.
    if not (
        self.__module__.startswith("datasets.")
        and not is_remote_url(self._cache_dir_root)
        and self.config.name == "default"
    ):
        return None

    from .packaged_modules import _PACKAGED_DATASETS_MODULES

    has_namespace = self.repo_id and self.repo_id.count("/") > 0
    namespace = self.repo_id.split("/")[0] if has_namespace else None
    if self.repo_id is not None:
        legacy_config_name = self.repo_id.replace("/", "--")
    else:
        legacy_config_name = self.dataset_name
    # Keep whatever suffix follows the config name in the current config id.
    id_suffix = self.config_id[len(self.config.name) :]
    legacy_config_id = legacy_config_name + id_suffix
    # Element 1 of the packaged-module entry is used as the hash path component.
    module_hash = _PACKAGED_DATASETS_MODULES.get(self.name, "missing")[1]
    top_level_dir = self.dataset_name if namespace is None else f"{namespace}___{self.dataset_name}"
    relative_dir = posixpath.join(top_level_dir, legacy_config_id, "0.0.0", module_hash)
    if os.path.isdir(posixpath.join(self._cache_dir_root, relative_dir)):
        return relative_dir
    return None
|
| 458 |
+
|
| 459 |
+
def _check_legacy_cache2(self, dataset_module: "DatasetModule") -> Optional[str]:
    """Check for the old cache directory template {cache_dir}/{namespace}___{dataset_name}/{config_name}-xxx from 2.14 and 2.15

    Returns:
        The legacy cache directory, relative to the cache root, if it exists on disk; `None` otherwise.
    """
    # Only builders from the `datasets` package with a local cache are considered...
    if not self.__module__.startswith("datasets.") or is_remote_url(self._cache_dir_root):
        return None
    # ...and only when no config kwargs other than data_files / data_dir were passed.
    if set(self.config_kwargs) - {"data_files", "data_dir"}:
        return None

    from .packaged_modules import _PACKAGED_DATASETS_MODULES_2_15_HASHES
    from .utils._dill import Pickler

    def _hash_with_config_parameters(base_hash: str, config_parameters: dict) -> str:
        """
        Used to update hash of packaged modules which is used for creating unique cache directories to reflect
        different config parameters which are passed in metadata from readme.
        """
        excluded_params = {"config_name", "version", "description"}
        hashed_params = {
            param: value for param, value in sorted(config_parameters.items()) if param not in excluded_params
        }
        hasher = Hasher()
        hasher.update(base_hash)
        hasher.update(hashed_params)
        return hasher.hexdigest()

    has_namespace = self.repo_id and self.repo_id.count("/") > 0
    namespace = self.repo_id.split("/")[0] if has_namespace else None
    # The data_files hash is computed with the pickler's legacy (unsorted dict keys) mode switched on.
    with patch.object(Pickler, "_legacy_no_dict_keys_sorting", True):
        legacy_config_id = self.config.name + "-" + Hasher.hash({"data_files": self.config.data_files})
    legacy_hash = _PACKAGED_DATASETS_MODULES_2_15_HASHES.get(self.name, "missing")
    metadata_configs = dataset_module.builder_configs_parameters.metadata_configs
    if metadata_configs and self.config.name in metadata_configs:
        legacy_hash = _hash_with_config_parameters(legacy_hash, metadata_configs[self.config.name])
    top_level_dir = self.dataset_name if namespace is None else f"{namespace}___{self.dataset_name}"
    relative_dir = posixpath.join(top_level_dir, legacy_config_id, "0.0.0", legacy_hash)
    if os.path.isdir(posixpath.join(self._cache_dir_root, relative_dir)):
        return relative_dir
    return None
|
| 505 |
+
|
| 506 |
+
def _create_builder_config(
    self, config_name=None, custom_features=None, config_id=None, **config_kwargs
) -> tuple[BuilderConfig, str]:
    """Create and validate the `BuilderConfig` and the config id used for caching.

    A pre-defined config is looked up first: by `config_name` when it is a string, otherwise by
    `DEFAULT_CONFIG_NAME`, otherwise the single entry of `BUILDER_CONFIGS`. When no pre-defined
    config applies, a new one is instantiated with `BUILDER_CONFIG_CLASS(**config_kwargs)`.
    When a pre-defined config is used, every non-`None` value in `config_kwargs` overrides the
    matching attribute on a deep copy of that config.

    Args:
        config_name: Name of the requested config, or `None` to fall back to the default.
        custom_features: Forwarded to `builder_config.create_config_id`.
        config_id: Pre-computed config id; computed with `create_config_id` when `None`.
        **config_kwargs: Attribute overrides / constructor arguments for the config.

    Returns:
        A `(builder_config, config_id)` tuple.

    Raises:
        ValueError: If no name is given while several configs exist and no `config_kwargs` are
            passed, if the requested (or default) name is not a pre-defined config, if an override
            key is not an attribute of the config, if the resulting config has no name, if a
            non-custom config differs from the pre-defined config of the same name, or if a
            non-custom config has no version.
    """
    builder_config = None

    # try default config
    if config_name is None and self.BUILDER_CONFIGS:
        if self.DEFAULT_CONFIG_NAME is not None:
            builder_config = self.builder_configs.get(self.DEFAULT_CONFIG_NAME)
            if builder_config is None:
                # Without this guard the log line below would fail with an opaque
                # AttributeError on `None.name`; report the misconfiguration explicitly.
                raise ValueError(
                    f"BuilderConfig '{self.DEFAULT_CONFIG_NAME}' not found. "
                    f"Available: {list(self.builder_configs.keys())}"
                )
            logger.info(f"No config specified, defaulting to: {self.dataset_name}/{builder_config.name}")
        elif len(self.BUILDER_CONFIGS) > 1:
            # Several configs and no default: only acceptable when config_kwargs
            # are given, in which case a config is created on the fly below.
            if not config_kwargs:
                example_of_usage = (
                    f"load_dataset('{self.repo_id or self.dataset_name}', '{self.BUILDER_CONFIGS[0].name}')"
                )
                raise ValueError(
                    "Config name is missing."
                    f"\nPlease pick one among the available configs: {list(self.builder_configs.keys())}"
                    + f"\nExample of usage:\n\t`{example_of_usage}`"
                )
        else:
            builder_config = self.BUILDER_CONFIGS[0]
            logger.info(
                f"No config specified, defaulting to the single config: {self.dataset_name}/{builder_config.name}"
            )

    # try to get config by name
    if isinstance(config_name, str):
        builder_config = self.builder_configs.get(config_name)
        if builder_config is None and self.BUILDER_CONFIGS:
            raise ValueError(
                f"BuilderConfig '{config_name}' not found. Available: {list(self.builder_configs.keys())}"
            )

    # if not using an existing config, then create a new config on the fly
    if not builder_config:
        if config_name is not None:
            config_kwargs["name"] = config_name
        elif self.DEFAULT_CONFIG_NAME and not config_kwargs:
            # Use DEFAULT_CONFIG_NAME only if no config_kwargs are passed
            config_kwargs["name"] = self.DEFAULT_CONFIG_NAME
        if "version" not in config_kwargs and hasattr(self, "VERSION") and self.VERSION:
            config_kwargs["version"] = self.VERSION
        builder_config = self.BUILDER_CONFIG_CLASS(**config_kwargs)

    # otherwise use the config_kwargs to overwrite the attributes
    else:
        # Copy only when something will be overridden, so the pre-defined
        # config object itself is never mutated.
        builder_config = copy.deepcopy(builder_config) if config_kwargs else builder_config
        for key, value in config_kwargs.items():
            if value is not None:
                if not hasattr(builder_config, key):
                    raise ValueError(f"BuilderConfig {builder_config} doesn't have a '{key}' key.")
                setattr(builder_config, key, value)

    if not builder_config.name:
        raise ValueError(f"BuilderConfig must have a name, got {builder_config.name}")

    # resolve data files if needed
    builder_config._resolve_data_files(
        base_path=self.base_path,
        download_config=DownloadConfig(token=self.token, storage_options=self.storage_options),
    )

    # compute the config id that is going to be used for caching
    if config_id is None:
        config_id = builder_config.create_config_id(
            config_kwargs,
            custom_features=custom_features,
        )
    is_custom = (config_id not in self.builder_configs) and config_id != "default"
    if is_custom:
        logger.info(f"Using custom data configuration {config_id}")
    else:
        if (
            builder_config.name in self.builder_configs
            and builder_config != self.builder_configs[builder_config.name]
        ):
            raise ValueError(
                "Cannot name a custom BuilderConfig the same as an available "
                f"BuilderConfig. Change the name. Available BuilderConfigs: {list(self.builder_configs.keys())}"
            )
        if not builder_config.version:
            raise ValueError(f"BuilderConfig {builder_config.name} must have a version")

    return builder_config, config_id
|
| 596 |
+
|
| 597 |
+
@classproperty
@classmethod
@memoize()
def builder_configs(cls) -> dict[str, BuilderConfig]:
    """Map each pre-defined config name in `BUILDER_CONFIGS` to its config object.

    Raises:
        ValueError: If two entries of `BUILDER_CONFIGS` share the same name.
    """
    by_name = {}
    for builder_config in cls.BUILDER_CONFIGS:
        by_name[builder_config.name] = builder_config

    # A name collision overwrites an entry, so the mapping ends up shorter than the list.
    if len(by_name) != len(cls.BUILDER_CONFIGS):
        names = [builder_config.name for builder_config in cls.BUILDER_CONFIGS]
        raise ValueError(f"Names in BUILDER_CONFIGS must not be duplicated. Got {names}")
    return by_name
|
| 607 |
+
|
| 608 |
+
@property
def cache_dir(self):
    """Read-only view of `self._cache_dir`."""
    return self._cache_dir
|
| 611 |
+
|
| 612 |
+
def _use_legacy_cache_dir_if_possible(self, dataset_module: "DatasetModule"):
|
| 613 |
+
# Check for the legacy cache directory template (datasets<3.0.0)
|
| 614 |
+
self._legacy_relative_data_dir = (
|
| 615 |
+
self._check_legacy_cache2(dataset_module) or self._check_legacy_cache() or None
|
| 616 |
+
)
|
| 617 |
+
self._cache_dir = self._build_cache_dir()
|
| 618 |
+
self._output_dir = self._cache_dir
|
| 619 |
+
|
| 620 |
+
def _relative_data_dir(self, with_version=True, with_hash=True) -> str:
|
| 621 |
+
"""Relative path of this dataset in cache_dir:
|
| 622 |
+
Will be:
|
| 623 |
+
self.dataset_name/self.config.version/self.hash/
|
| 624 |
+
or if a repo_id with a namespace has been specified:
|
| 625 |
+
self.namespace___self.dataset_name/self.config.version/self.hash/
|
| 626 |
+
If any of these element is missing or if ``with_version=False`` the corresponding subfolders are dropped.
|
| 627 |
+
"""
|
| 628 |
+
if self._legacy_relative_data_dir is not None and with_version and with_hash:
|
| 629 |
+
return self._legacy_relative_data_dir
|
| 630 |
+
|
| 631 |
+
namespace = self.repo_id.split("/")[0] if self.repo_id and self.repo_id.count("/") > 0 else None
|
| 632 |
+
builder_data_dir = self.dataset_name if namespace is None else f"{namespace}___{self.dataset_name}"
|
| 633 |
+
builder_data_dir = posixpath.join(builder_data_dir, self.config_id)
|
| 634 |
+
if with_version:
|
| 635 |
+
builder_data_dir = posixpath.join(builder_data_dir, str(self.config.version))
|
| 636 |
+
if with_hash and self.hash and isinstance(self.hash, str):
|
| 637 |
+
builder_data_dir = posixpath.join(builder_data_dir, self.hash)
|
| 638 |
+
return builder_data_dir
|
| 639 |
+
|
| 640 |
+
def _build_cache_dir(self):
    """Return the data directory for the current version.

    As a side effect, logs a warning when the most recent version directory found on disk
    differs from `self.config.version`. The check is skipped for remote cache roots.
    """
    cache_root = self._cache_dir_root
    builder_data_dir = posixpath.join(cache_root, self._relative_data_dir(with_version=False))
    version_data_dir = posixpath.join(cache_root, self._relative_data_dir(with_version=True))

    def _other_versions_on_disk():
        """Return `(version, dir_name)` pairs found on disk, newest first."""
        if not os.path.exists(builder_data_dir):
            return []

        found = []
        for entry in os.listdir(builder_data_dir):
            try:
                parsed = utils.Version(entry)
            except ValueError:  # Invalid version (ex: incomplete data dir)
                continue
            found.append((parsed, entry))
        return sorted(found, reverse=True)

    # Check and warn if other versions exist
    if not is_remote_url(builder_data_dir):
        version_dirs = _other_versions_on_disk()
        if version_dirs and version_dirs[0][0] != self.config.version:
            other_version = version_dirs[0][0]
            logger.warning(
                f"Found a different version {str(other_version)} of dataset {self.dataset_name} in "
                f"cache_dir {self._cache_dir_root}. Using currently defined version "
                f"{str(self.config.version)}."
            )

    return version_data_dir
|
| 673 |
+
|
| 674 |
+
@abc.abstractmethod
def _info(self) -> DatasetInfo:
    """Construct the DatasetInfo object. See `DatasetInfo` for details.

    Abstract: concrete builders must override this method; the base
    implementation only raises.

    Warning: This function is only called once and the result is cached for all
    following .info() calls.

    Returns:
        info: (DatasetInfo) The dataset information

    Raises:
        NotImplementedError: Always, in this base implementation.
    """
    raise NotImplementedError
|
| 685 |
+
|
| 686 |
+
@classmethod
def get_imported_module_dir(cls):
    """Return the directory of the file defining the module of this class or subclass."""
    defining_module = inspect.getmodule(cls)
    module_file = inspect.getfile(defining_module)
    return os.path.dirname(module_file)
|
| 690 |
+
|
| 691 |
+
def _rename(self, src: str, dst: str):
    """Rename `src` to `dst` by delegating to the module-level `rename` helper with `self._fs`."""
    rename(self._fs, src, dst)
|
| 693 |
+
|
| 694 |
+
def download_and_prepare(
    self,
    output_dir: Optional[str] = None,
    download_config: Optional[DownloadConfig] = None,
    download_mode: Optional[Union[DownloadMode, str]] = None,
    verification_mode: Optional[Union[VerificationMode, str]] = None,
    dl_manager: Optional[DownloadManager] = None,
    base_path: Optional[str] = None,
    file_format: str = "arrow",
    max_shard_size: Optional[Union[int, str]] = None,
    num_proc: Optional[int] = None,
    storage_options: Optional[dict] = None,
    **download_and_prepare_kwargs,
):
    """Downloads and prepares dataset for reading.

    Args:
        output_dir (`str`, *optional*):
            Output directory for the dataset.
            Default to this builder's `cache_dir`, which is inside `~/.cache/huggingface/datasets` by default.

            <Added version="2.5.0"/>
        download_config (`DownloadConfig`, *optional*):
            Specific download configuration parameters.
        download_mode ([`DownloadMode`] or `str`, *optional*):
            Select the download/generate mode, default to `REUSE_DATASET_IF_EXISTS`.
        verification_mode ([`VerificationMode`] or `str`, defaults to `BASIC_CHECKS`):
            Verification mode determining the checks to run on the downloaded/processed dataset information (checksums/size/splits/...).

            <Added version="2.9.1"/>
        dl_manager (`DownloadManager`, *optional*):
            Specific `DownloadManager` to use.
        base_path (`str`, *optional*):
            Base path for relative paths that are used to download files. This can be a remote url.
            If not specified, the value of the `base_path` attribute (`self.base_path`) will be used instead.
        file_format (`str`, *optional*):
            Format of the data files in which the dataset will be written.
            Supported formats: "arrow", "parquet". Default to "arrow" format.
            If the format is "parquet", then image and audio data are embedded into the Parquet files instead of pointing to local files.

            <Added version="2.5.0"/>
        max_shard_size (`Union[str, int]`, *optional*):
            Maximum number of bytes written per shard, default is "500MB".
            The size is based on uncompressed data size, so in practice your shard files may be smaller than
            `max_shard_size` thanks to Parquet compression for example.

            <Added version="2.5.0"/>
        num_proc (`int`, *optional*, defaults to `None`):
            Number of processes when downloading and generating the dataset locally.
            Multiprocessing is disabled by default.

            <Added version="2.7.0"/>
        storage_options (`dict`, *optional*):
            Key/value pairs to be passed on to the caching file-system backend, if any.

            <Added version="2.5.0"/>
        **download_and_prepare_kwargs (additional keyword arguments): Keyword arguments.

    Raises:
        ValueError: If `file_format` is neither `None`, "arrow" nor "parquet".
        RuntimeError: If the output directory resolves to the filesystem root.
        OSError: If the output directory is local and there is not enough disk space.

    Example:

    Download and prepare the dataset as Arrow files that can be loaded as a Dataset using `builder.as_dataset()`:

    ```py
    >>> from datasets import load_dataset_builder
    >>> builder = load_dataset_builder("cornell-movie-review-data/rotten_tomatoes")
    >>> builder.download_and_prepare()
    ```

    Download and prepare the dataset as sharded Parquet files locally:

    ```py
    >>> from datasets import load_dataset_builder
    >>> builder = load_dataset_builder("cornell-movie-review-data/rotten_tomatoes")
    >>> builder.download_and_prepare("./output_dir", file_format="parquet")
    ```

    Download and prepare the dataset as sharded Parquet files in a cloud storage:

    ```py
    >>> from datasets import load_dataset_builder
    >>> storage_options = {"key": aws_access_key_id, "secret": aws_secret_access_key}
    >>> builder = load_dataset_builder("cornell-movie-review-data/rotten_tomatoes")
    >>> builder.download_and_prepare("s3://my-bucket/my_rotten_tomatoes", storage_options=storage_options, file_format="parquet")
    ```
    """
    output_dir = output_dir if output_dir is not None else self._cache_dir
    # output_dir can be a remote bucket on GCS or S3
    fs, output_dir = url_to_fs(output_dir, **(storage_options or {}))
    self._fs = fs
    # Local paths are stored as-is; remote paths get their protocol prefix back.
    self._output_dir = output_dir if not is_remote_filesystem(self._fs) else self._fs.unstrip_protocol(output_dir)

    # Normalize the modes (enum or str, or None for the default) to enum members.
    download_mode = DownloadMode(download_mode or DownloadMode.REUSE_DATASET_IF_EXISTS)
    verification_mode = VerificationMode(verification_mode or VerificationMode.BASIC_CHECKS)
    base_path = base_path if base_path is not None else self.base_path

    if file_format is not None and file_format not in ["arrow", "parquet"]:
        raise ValueError(f"Unsupported file_format: {file_format}. Expected 'arrow' or 'parquet'")
    self._file_format = file_format

    if self._fs._strip_protocol(self._output_dir) == "":
        # We don't support the root directory, because it has no dirname,
        # and we need a dirname to use a <dirname>.incomplete directory
        # when the dataset is being written
        raise RuntimeError(
            f"Unable to download and prepare the dataset at the root {self._output_dir}. "
            f"Please specify a subdirectory, e.g. '{self._output_dir + self.dataset_name}'"
        )

    # Build a default download manager unless the caller supplied one.
    if dl_manager is None:
        if download_config is None:
            download_config = DownloadConfig(
                cache_dir=self._cache_downloaded_dir,
                force_download=download_mode == DownloadMode.FORCE_REDOWNLOAD,
                force_extract=download_mode == DownloadMode.FORCE_REDOWNLOAD,
                use_etag=False,
                num_proc=num_proc,
                token=self.token,
                storage_options=self.storage_options,
            )  # We don't use etag for data files to speed up the process

        dl_manager = DownloadManager(
            dataset_name=self.dataset_name,
            download_config=download_config,
            data_dir=self.config.data_dir,
            base_path=base_path,
            record_checksums=(self._record_infos or verification_mode == VerificationMode.ALL_CHECKS),
        )

    is_local = not is_remote_filesystem(self._fs)
    self.dl_manager = dl_manager

    # Prevent parallel local disk operations
    if is_local:
        # Create parent directory of the output_dir to put the lock file in there
        Path(self._output_dir).parent.mkdir(parents=True, exist_ok=True)
        # NOTE: lock_path is only bound for local outputs; it is only read under the same condition below.
        lock_path = self._output_dir + "_builder.lock"

    # File locking only with local paths; no file locking on GCS or S3
    with FileLock(lock_path) if is_local else contextlib.nullcontext():
        # Check if the data already exists
        data_exists = self._fs.exists(posixpath.join(self._output_dir, config.DATASET_INFO_FILENAME))
        if data_exists and download_mode == DownloadMode.REUSE_DATASET_IF_EXISTS:
            logger.info(f"Found cached dataset {self.dataset_name} ({self._output_dir})")
            # We need to update the info in case some splits were added in the meantime
            # for example when calling load_dataset from multiple workers.
            self.info = self._load_info()
            self.download_post_processing_resources(dl_manager)
            return

        logger.info(f"Generating dataset {self.dataset_name} ({self._output_dir})")
        if is_local:  # if cache dir is local, check for available space
            if not has_sufficient_disk_space(
                self.info.size_in_bytes or 0, directory=Path(self._output_dir).parent
            ):
                raise OSError(
                    f"Not enough disk space. Needed: {size_str(self.info.size_in_bytes or 0)} (download: {size_str(self.info.download_size or 0)}, generated: {size_str(self.info.dataset_size or 0)}, post-processed: {size_str(self.info.post_processing_size or 0)})"
                )

        @contextlib.contextmanager
        def incomplete_dir(dirname):
            """Create temporary dir for dirname and rename on exit.

            For remote outputs no temporary directory is used: `dirname` is created and
            yielded directly.
            """
            if not is_local:
                self._fs.makedirs(dirname, exist_ok=True)
                yield dirname
            else:
                tmp_dir = dirname + ".incomplete"
                os.makedirs(tmp_dir, exist_ok=True)
                try:
                    yield tmp_dir
                    # Only reached when the with-body finished without raising:
                    # replace any existing directory with the freshly written one.
                    if os.path.isdir(dirname):
                        shutil.rmtree(dirname)
                    # LocalFileSystem.mv does copy + rm, it is more efficient to simply rename a local directory
                    shutil.move(tmp_dir, dirname)
                finally:
                    # On failure the temporary directory still exists; remove it.
                    if os.path.exists(tmp_dir):
                        shutil.rmtree(tmp_dir)

        # Log (at INFO level) what is about to be downloaded/prepared and where, so the
        # user has the information needed to cancel download/preparation if needed.
        # This comes right before the progress bar.
        if self.info.size_in_bytes:
            logger.info(
                f"Downloading and preparing dataset {self.dataset_name}/{self.config.name} "
                f"(download: {size_str(self.info.download_size)}, generated: {size_str(self.info.dataset_size)}, "
                f"post-processed: {size_str(self.info.post_processing_size)}, "
                f"total: {size_str(self.info.size_in_bytes)}) to {self._output_dir}..."
            )
        else:
            _dest = self._fs._strip_protocol(self._output_dir) if is_local else self._output_dir
            logger.info(f"Downloading and preparing dataset {self.dataset_name}/{self.config.name} to {_dest}...")

        self._check_manual_download(dl_manager)

        # Create a tmp dir and rename to self._output_dir on successful exit.
        with incomplete_dir(self._output_dir) as tmp_output_dir:
            # Temporarily assign _output_dir to tmp_data_dir to avoid having to forward
            # it to every sub function.
            with temporary_assignment(self, "_output_dir", tmp_output_dir):
                # Only forward the optional knobs that were explicitly set.
                prepare_split_kwargs = {"file_format": file_format}
                if max_shard_size is not None:
                    prepare_split_kwargs["max_shard_size"] = max_shard_size
                if num_proc is not None:
                    prepare_split_kwargs["num_proc"] = num_proc
                self._download_and_prepare(
                    dl_manager=dl_manager,
                    verification_mode=verification_mode,
                    **prepare_split_kwargs,
                    **download_and_prepare_kwargs,
                )
                # Sync info
                self.info.dataset_size = sum(split.num_bytes for split in self.info.splits.values())
                self.info.download_checksums = dl_manager.get_recorded_sizes_checksums()
                if self.info.download_size is not None:
                    self.info.size_in_bytes = self.info.dataset_size + self.info.download_size
                # Save info
                self._save_info()

        # Download post processing resources
        self.download_post_processing_resources(dl_manager)

        logger.info(
            f"Dataset {self.dataset_name} downloaded and prepared to {self._output_dir}. "
            f"Subsequent calls will reuse this data."
        )
|
| 918 |
+
|
| 919 |
+
def _check_manual_download(self, dl_manager):
    """Fail early when the dataset needs manually downloaded data that was not provided.

    Args:
        dl_manager: Download manager; only its `manual_dir` attribute is read here.

    Raises:
        ManualDownloadError: If `self.manual_download_instructions` is set and
            `dl_manager.manual_dir` is `None`.
    """
    if self.manual_download_instructions is not None and dl_manager.manual_dir is None:
        # The f-string is interpolated before `textwrap.dedent` runs, so the
        # instructions text takes part in the common-indent computation.
        # NOTE(review): the leading whitespace inside this literal is runtime text; confirm
        # it matches the intended message layout.
        raise ManualDownloadError(
            textwrap.dedent(
                f"""\
                The dataset {self.dataset_name} with config {self.config.name} requires manual data.
                Please follow the manual download instructions:
                {self.manual_download_instructions}
                Manual data can be loaded with:
                datasets.load_dataset("{self.repo_id or self.dataset_name}", data_dir="<path/to/manual/data>")"""
            )
        )
|
| 931 |
+
|
| 932 |
+
def _download_and_prepare(self, dl_manager, verification_mode, **prepare_split_kwargs):
    """Downloads and prepares dataset for reading.

    Internal implementation behind `download_and_prepare`, meant to be overridden.
    It obtains the split generators, prepares every split, and records the resulting
    splits and download size on `self.info`.

    Args:
        dl_manager ([`DownloadManager`]):
            `DownloadManager` used to download and cache data.
        verification_mode ([`VerificationMode`]):
            if `ALL_CHECKS`, perform all the verifications including checksums.
            if `BASIC_CHECKS`, do not perform checksums, only perform split tests.
            if `NO_CHECKS`, do not perform any verification.
        prepare_split_kwargs: Additional options, such as `file_format`, `max_shard_size`

    Raises:
        ValueError: If a split generator is named `all` (case-insensitive).
        OSError: If preparing a split raises `OSError`; re-raised with a data-file hint.
        DuplicatedKeysError: If preparing a split reports duplicated keys.
    """
    # Generating data for all splits
    generated_splits = SplitDict(dataset_name=self.dataset_name)
    generators_kwargs = self._make_split_generators_kwargs(prepare_split_kwargs)
    split_generators = self._split_generators(dl_manager, **generators_kwargs)

    # Checksums verification
    if verification_mode == VerificationMode.ALL_CHECKS and dl_manager.record_checksums:
        verify_checksums(
            self.info.download_checksums, dl_manager.get_recorded_sizes_checksums(), "dataset source files"
        )

    # Build splits
    for generator in split_generators:
        split_info = generator.split_info
        if str(split_info.name).lower() == "all":
            raise ValueError(
                "`all` is a special split keyword corresponding to the "
                "union of all splits, so cannot be used as key in "
                "._split_generator()."
            )

        logger.info(f"Generating {split_info.name} split")
        generated_splits.add(split_info)

        try:
            # Prepare split will record examples associated to the split
            self._prepare_split(generator, **prepare_split_kwargs)
        except OSError as err:
            message = (
                "Cannot find data file. "
                + (self.manual_download_instructions or "")
                + "\nOriginal error:\n"
                + str(err)
            )
            raise OSError(message) from None
        # If check_duplicates is set to True , then except DuplicatedKeysError
        except DuplicatedKeysError as err:
            raise DuplicatedKeysError(
                err.key,
                err.duplicate_key_indices,
                fix_msg=f"To avoid duplicate keys, please fix the dataset splits for {self.name}",
            ) from None
        dl_manager.manage_extracted_files()

    if verification_mode in (VerificationMode.BASIC_CHECKS, VerificationMode.ALL_CHECKS):
        verify_splits(self.info.splits, generated_splits)

    # Update the info object with the splits.
    self.info.splits = generated_splits
    self.info.download_size = dl_manager.downloaded_size
|
| 996 |
+
|
| 997 |
+
def download_post_processing_resources(self, dl_manager):
    """Fetch the post-processing resources of every split that are not yet on disk.

    For each split in `self.info.splits` and each `(resource_name, resource_file_name)` pair
    returned by `self._post_processing_resources(split)`, the resource is downloaded with
    `self._download_post_processing_resources` and moved to
    `<self._output_dir>/<resource_file_name>` unless that path already exists.

    Args:
        dl_manager: Download manager forwarded to `_download_post_processing_resources`.

    Raises:
        NotImplementedError: If a resource is declared while `self._fs` is a remote filesystem.
        ValueError: If a resource file name contains `os.sep`.
    """
    for split in self.info.splits or []:
        for resource_name, resource_file_name in self._post_processing_resources(split).items():
            # Checked per resource on purpose: a remote filesystem is only an
            # error when there is actually something to post-process.
            if is_remote_filesystem(self._fs):
                raise NotImplementedError(f"Post processing is not supported on filesystem {self._fs}")
            if os.sep in resource_file_name:
                raise ValueError(f"Resources shouldn't be in a sub-directory: {resource_file_name}")
            resource_path = os.path.join(self._output_dir, resource_file_name)
            if not os.path.exists(resource_path):
                downloaded_resource_path = self._download_post_processing_resources(
                    split, resource_name, dl_manager
                )
                # A falsy return value means nothing was downloaded for this resource.
                if downloaded_resource_path:
                    logger.info(f"Downloaded post-processing resource {resource_name} as {resource_file_name}")
                    shutil.move(downloaded_resource_path, resource_path)
|
| 1012 |
+
|
| 1013 |
+
def _load_info(self) -> DatasetInfo:
    """Read the `DatasetInfo` stored in `self._output_dir`, using the storage options of `self._fs`."""
    return DatasetInfo.from_directory(self._output_dir, storage_options=self._fs.storage_options)
|
| 1015 |
+
|
| 1016 |
+
def _save_info(self):
    """Write `self.info` into `self._output_dir`.

    On a local filesystem the write is guarded by a `FileLock` on
    `<self._output_dir>_info.lock`; on a remote filesystem no lock is taken.
    """
    if is_remote_filesystem(self._fs):
        guard = contextlib.nullcontext()
    else:
        guard = FileLock(self._output_dir + "_info.lock")

    with guard:
        self.info.write_to_directory(self._output_dir, storage_options=self._fs.storage_options)
|
| 1024 |
+
|
| 1025 |
+
def _make_split_generators_kwargs(self, prepare_split_kwargs):
    """Get kwargs for `self._split_generators()` from `prepare_split_kwargs`.

    This implementation ignores its argument and always returns an empty dict.
    NOTE(review): presumably a hook for subclasses to override — confirm.
    """
    # Drop the local reference to make explicit that the argument is unused here.
    del prepare_split_kwargs
    return {}
|
| 1029 |
+
|
| 1030 |
+
def as_dataset(
    self,
    split: Optional[Union[str, Split, list[str], list[Split]]] = None,
    run_post_process=True,
    verification_mode: Optional[Union[VerificationMode, str]] = None,
    in_memory=False,
) -> Union[Dataset, DatasetDict]:
    """Return a Dataset for the specified split.

    Args:
        split (`datasets.Split`):
            Which subset of the data to return. When `None`, every split listed in
            `self.info.splits` is returned.
        run_post_process (`bool`, defaults to `True`):
            Whether to run post-processing dataset transforms and/or add
            indexes.
        verification_mode ([`VerificationMode`] or `str`, defaults to `BASIC_CHECKS`):
            Verification mode determining the checks to run on the
            downloaded/processed dataset information (checksums/size/splits/...).

            <Added version="2.9.1"/>
        in_memory (`bool`, defaults to `False`):
            Whether to copy the data in-memory.

    Returns:
        datasets.Dataset, or a `DatasetDict` when the mapped result is a dict.

    Raises:
        FileFormatError: If the dataset was written in a format other than "arrow".
        NotImplementedError: If the dataset is cached on a remote filesystem.
        FileNotFoundError: If `self._output_dir` does not exist.

    Example:

    ```py
    >>> from datasets import load_dataset_builder
    >>> builder = load_dataset_builder('cornell-movie-review-data/rotten_tomatoes')
    >>> builder.download_and_prepare()
    >>> ds = builder.as_dataset(split='train')
    >>> ds
    Dataset({
        features: ['text', 'label'],
        num_rows: 8530
    })
    ```
    """
    written_format = self._file_format
    if written_format is not None and written_format != "arrow":
        raise FileFormatError('Loading a dataset not written in the "arrow" format is not supported.')
    if is_remote_filesystem(self._fs):
        raise NotImplementedError(f"Loading a dataset cached in a {type(self._fs).__name__} is not supported.")
    if not os.path.exists(self._output_dir):
        raise FileNotFoundError(
            f"Dataset {self.dataset_name}: could not find data in {self._output_dir}. Please make sure to call "
            "builder.download_and_prepare(), or use "
            "datasets.load_dataset() before trying to access the Dataset object."
        )

    split_description = split or ", ".join(self.info.splits)
    logger.debug(f"Constructing Dataset for split {split_description}, from {self._output_dir}")

    # By default, return all splits
    if split is None:
        split = {split_name: split_name for split_name in self.info.splits}

    verification_mode = VerificationMode(verification_mode or VerificationMode.BASIC_CHECKS)

    # Create a dataset for each of the given splits
    build_one = partial(
        self._build_single_dataset,
        run_post_process=run_post_process,
        verification_mode=verification_mode,
        in_memory=in_memory,
    )
    datasets = map_nested(build_one, split, map_tuple=True, disable_tqdm=True)
    return DatasetDict(datasets) if isinstance(datasets, dict) else datasets
|
| 1104 |
+
|
| 1105 |
+
def _build_single_dataset(
    self,
    split: Union[str, ReadInstruction, Split],
    run_post_process: bool,
    verification_mode: VerificationMode,
    in_memory: bool = False,
):
    """as_dataset for a single split.

    Builds the base dataset with `self._as_dataset` and, when `run_post_process` is true,
    applies `self._post_process`. If post-processing returns a dataset, resource sizes and
    checksums are recorded on `self.info`, the info is saved, and the post-processing
    metadata is copied onto the returned dataset.

    Args:
        split: Split to build; anything that is not a `ReadInstruction` is converted to a
            `Split`, with `"all"` expanded to the `+`-joined names of all known splits.
        run_post_process: Whether to run the post-processing step.
        verification_mode: Verification mode read by the checksum-verification branch below.
        in_memory: Forwarded to `self._as_dataset`.

    Returns:
        The base dataset, or the post-processed dataset when `_post_process` returns one.

    Raises:
        ValueError: If a post-processing resource file name contains `os.sep`, or if the
            recorded post-processed features do not match the dataset's features.
    """
    if not isinstance(split, ReadInstruction):
        split = str(split)
        if split == "all":
            split = "+".join(self.info.splits.keys())
        split = Split(split)

    # Build base dataset
    ds = self._as_dataset(
        split=split,
        in_memory=in_memory,
    )
    if run_post_process:
        # Validate all resource file names before building any path.
        for resource_file_name in self._post_processing_resources(split).values():
            if os.sep in resource_file_name:
                raise ValueError(f"Resources shouldn't be in a sub-directory: {resource_file_name}")
        resources_paths = {
            resource_name: os.path.join(self._output_dir, resource_file_name)
            for resource_name, resource_file_name in self._post_processing_resources(split).items()
        }
        post_processed = self._post_process(ds, resources_paths)
        if post_processed is not None:
            ds = post_processed
            recorded_checksums = {}
            # NOTE(review): hard-coded to False, so the verification branch below never runs.
            record_checksums = False
            for resource_name, resource_path in resources_paths.items():
                size_checksum = get_size_checksum_dict(resource_path)
                recorded_checksums[resource_name] = size_checksum
            if verification_mode == VerificationMode.ALL_CHECKS and record_checksums:
                if self.info.post_processed is None or self.info.post_processed.resources_checksums is None:
                    expected_checksums = None
                else:
                    # NOTE(review): looked up with `split` here but stored under `str(split)` below — confirm.
                    expected_checksums = self.info.post_processed.resources_checksums.get(split)
                verify_checksums(expected_checksums, recorded_checksums, "post processing resources")
            # Lazily create the containers that hold post-processing metadata.
            if self.info.post_processed is None:
                self.info.post_processed = PostProcessedInfo()
            if self.info.post_processed.resources_checksums is None:
                self.info.post_processed.resources_checksums = {}
            self.info.post_processed.resources_checksums[str(split)] = recorded_checksums
            # Total size over every split's recorded resources, not only this split's.
            self.info.post_processing_size = sum(
                checksums_dict["num_bytes"]
                for split_checksums_dicts in self.info.post_processed.resources_checksums.values()
                for checksums_dict in split_checksums_dicts.values()
            )
            if self.info.dataset_size is not None and self.info.download_size is not None:
                self.info.size_in_bytes = (
                    self.info.dataset_size + self.info.download_size + self.info.post_processing_size
                )
            self._save_info()
            # Mirror the updated post-processing metadata onto the returned dataset.
            ds._info.post_processed = self.info.post_processed
            ds._info.post_processing_size = self.info.post_processing_size
            ds._info.size_in_bytes = self.info.size_in_bytes
            if self.info.post_processed.features is not None:
                if self.info.post_processed.features.type != ds.features.type:
                    raise ValueError(
                        f"Post-processed features info don't match the dataset:\nGot\n{self.info.post_processed.features}\nbut expected something like\n{ds.features}"
                    )
                else:
                    ds.info.features = self.info.post_processed.features

    return ds
|
| 1173 |
+
|
| 1174 |
+
def _as_dataset(self, split: Union[ReadInstruction, Split] = Split.TRAIN, in_memory: bool = False) -> Dataset:
    """Build the `Dataset` for one split from the already prepared files.

    This is the internal hook behind `as_dataset`: it reads the prepared
    dataset files through an `ArrowReader` and wraps the result in a `Dataset`.

    Args:
        split (`datasets.Split` or `ReadInstruction`, defaults to `Split.TRAIN`):
            Which subset of the data to read.
        in_memory (`bool`, defaults to `False`):
            Forwarded to `ArrowReader.read`; whether to copy the data in-memory.

    Returns:
        `Dataset`
    """
    files_root = self._fs._strip_protocol(self._output_dir)
    # When `_check_legacy_cache()` is truthy the files are looked up under `self.name`
    # instead of `self.dataset_name`.
    name_on_disk = self.dataset_name
    if self._check_legacy_cache():
        name_on_disk = self.name
    reader = ArrowReader(files_root, self.info)
    read_kwargs = reader.read(
        name=name_on_disk,
        instructions=split,
        split_infos=self.info.splits.values(),
        in_memory=in_memory,
    )
    return Dataset(fingerprint=self._get_dataset_fingerprint(split), **read_kwargs)
|
| 1202 |
+
|
| 1203 |
+
def _get_dataset_fingerprint(self, split: Union[ReadInstruction, Split]) -> str:
    """Return the fingerprint of the dataset read for `split`.

    The fingerprint is the hex digest of a `Hasher` fed, in this order, with the
    POSIX form of `self._relative_data_dir()` and with `str(split)`.
    """
    digest = Hasher()
    relative_dir = Path(self._relative_data_dir()).as_posix()
    # str(split) is for example: train, train+test, train[:10%], test[:33%](pct1_dropremainder)
    for part in (relative_dir, str(split)):
        digest.update(part)
    return digest.hexdigest()
|
| 1210 |
+
|
| 1211 |
+
def as_streaming_dataset(
    self,
    split: Optional[str] = None,
    base_path: Optional[str] = None,
) -> Union[dict[str, IterableDataset], IterableDataset]:
    """Create streaming (iterable) datasets from this builder's split generators.

    Args:
        split (`str`, *optional*):
            Name of the split to return. When `None`, every split produced by
            `_split_generators` is returned.
        base_path (`str`, *optional*):
            Base path handed to the `StreamingDownloadManager`; falls back to
            `self.base_path` when falsy.

    Returns:
        One `IterableDataset` when `split` names a split, otherwise an
        `IterableDatasetDict` keyed by split name.

    Raises:
        NotImplementedError: if `is_remote_filesystem(self._fs)` is true.
        ValueError: if `split` is given but is not one of the generated splits.
    """
    if is_remote_filesystem(self._fs):
        raise NotImplementedError(
            f"Loading a streaming dataset cached in a {type(self._fs).__name__} is not supported yet."
        )

    download_config = DownloadConfig(token=self.token, storage_options=self.storage_options)
    dl_manager = StreamingDownloadManager(
        base_path=base_path or self.base_path,
        download_config=download_config,
        dataset_name=self.dataset_name,
        data_dir=self.config.data_dir,
    )
    self._check_manual_download(dl_manager)

    generators_by_name = {}
    for generator in self._split_generators(dl_manager):
        generators_by_name[generator.name] = generator

    # Reject unknown split names up front; `None` means "all splits".
    if split is not None and split not in generators_by_name:
        raise ValueError(f"Bad split: {split}. Available splits: {list(generators_by_name)}")
    selected = generators_by_name if split is None else generators_by_name[split]

    # Create a dataset for each of the selected splits
    streamed = map_nested(self._as_streaming_dataset_single, selected, map_tuple=True)
    return IterableDatasetDict(streamed) if isinstance(streamed, dict) else streamed
|
| 1246 |
+
|
| 1247 |
+
def _as_streaming_dataset_single(
    self,
    splits_generator,
) -> IterableDataset:
    """Wrap the examples iterable of one split generator in an `IterableDataset`.

    Args:
        splits_generator: the split generator to stream; its `name` becomes the
            split of the returned dataset.

    Returns:
        `IterableDataset` carrying `self.info` and the per-repository token mapping.
    """
    examples = self._get_examples_iterable_for_split(splits_generator)
    # add auth to be able to access and decode audio/image files from private repositories.
    auth_tokens = {}
    if self.repo_id:
        auth_tokens[self.repo_id] = self.token
    return IterableDataset(
        examples,
        info=self.info,
        split=splits_generator.name,
        token_per_repo_id=auth_tokens,
    )
|
| 1257 |
+
|
| 1258 |
+
def _post_process(self, dataset: Dataset, resources_paths: Mapping[str, str]) -> Optional[Dataset]:
    """Hook for post-processing a dataset (e.g. running transforms or adding indexes).

    This base implementation performs no post-processing and returns `None`.
    """
    return None
|
| 1261 |
+
|
| 1262 |
+
def _post_processing_resources(self, split: str) -> dict[str, str]:
|
| 1263 |
+
"""Mapping resource_name -> resource_file_name"""
|
| 1264 |
+
return {}
|
| 1265 |
+
|
| 1266 |
+
def _download_post_processing_resources(
    self, split: str, resource_name: str, dl_manager: DownloadManager
) -> Optional[str]:
    """Hook to download one post-processing resource with the download manager.

    Implementations return the downloaded path; this base implementation
    downloads nothing and returns `None`.
    """
    return None
|
| 1271 |
+
|
| 1272 |
+
@abc.abstractmethod
def _split_generators(self, dl_manager: Union[DownloadManager, StreamingDownloadManager]):
    """Declare the dataset splits and how each of them is generated.

    Implementations return a list of `SplitGenerator`s. Each one names a split
    and carries the `gen_kwargs` used to generate it.

    Example:

        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={'file': 'train_data.zip'},
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={'file': 'test_data.zip'},
            ),
        ]

    With the code above, `_generate_examples(file='train_data.zip')` is called
    first to write the train data, then `_generate_examples(file='test_data.zip')`
    to write the test data.

    Datasets are typically split into different subsets to be used at various
    stages of training and evaluation. For datasets without a `VALIDATION`
    split, a fraction of the `TRAIN` data can be used for evaluation while
    iterating on a model, so as not to overfit to the `TEST` data.

    Use the given download manager for downloads and extractions. The
    `DownloadManager` caches downloads, so it is fine for each generator to
    attempt to download the source data. A good practice is to download all
    data in this function and then hand the relevant parts to each split
    through the `gen_kwargs` argument.

    Args:
        dl_manager (`Union[DownloadManager, StreamingDownloadManager]`):
            Download manager to download the data

    Returns:
        `list<SplitGenerator>`.
    """
    raise NotImplementedError()
|
| 1318 |
+
|
| 1319 |
+
@abc.abstractmethod
def _prepare_split(
    self,
    split_generator: SplitGenerator,
    file_format: str = "arrow",
    max_shard_size: Optional[Union[str, int]] = None,
    num_proc: Optional[int] = None,
    **kwargs,
):
    """Generate the examples of one split and record them on disk.

    Args:
        split_generator (`SplitGenerator`):
            Split generator to process
        file_format (`str`, *optional*):
            Format of the data files in which the dataset will be written.
            Supported formats: "arrow", "parquet". Defaults to "arrow".
        max_shard_size (`Union[str, int]`, *optional*):
            Maximum number of bytes written per shard, default is "500MB".
            The size is based on uncompressed data size, so in practice the shard files may be
            smaller than `max_shard_size`, thanks to Parquet compression for example.
        num_proc (`int`, *optional*, defaults to `None`):
            Number of processes when downloading and generating the dataset locally.
            Multiprocessing is disabled by default.

            <Added version="2.7.0"/>
        **kwargs: Additional kwargs forwarded from _download_and_prepare
    """
    raise NotImplementedError()
|
| 1348 |
+
|
| 1349 |
+
def _get_examples_iterable_for_split(self, split_generator: SplitGenerator) -> ExamplesIterable:
    """Return an iterable that generates the examples of a split on the fly.

    Must be overridden; this base implementation always raises.

    Args:
        split_generator (`SplitGenerator`):
            Split generator to process

    Raises:
        NotImplementedError: always, in this base implementation.
    """
    raise NotImplementedError()
|
| 1357 |
+
|
| 1358 |
+
|
| 1359 |
+
class GeneratorBasedBuilder(DatasetBuilder):
    """Base class for datasets with data generation based on dict generators.

    `GeneratorBasedBuilder` is a convenience class that abstracts away much
    of the data writing and reading of `DatasetBuilder`. It expects subclasses to
    implement generators of feature dictionaries across the dataset splits
    (`_split_generators`). See the method docstrings for details.
    """

    @abc.abstractmethod
    def _generate_examples(self, **kwargs):
        """Default function generating examples for each `SplitGenerator`.

        This function preprocesses the examples from the raw data to the
        preprocessed dataset files.
        It is called once for each `SplitGenerator` defined in
        `_split_generators`. The examples yielded here are written to disk.

        Args:
            **kwargs (additional keyword arguments):
                Arguments forwarded from the SplitGenerator.gen_kwargs

        Yields:
            key: `str` or `int`, an example identification key. It is passed to
                the writer together with the example; duplicate keys are
                checked by the writer when `check_duplicate_keys` is enabled.
                Good keys are unique and deterministic, e.g. the image id, or
                the line number if examples are extracted from a text file.
            example: `dict<str feature_name, feature_value>`, a feature dictionary
                ready to be encoded and written to disk. When `self.info.features`
                is set, the example is encoded with
                `self.info.features.encode_example({...})`.
        """
        raise NotImplementedError()

    def _prepare_split(
        self,
        split_generator: SplitGenerator,
        check_duplicate_keys: bool,
        file_format: str = "arrow",
        num_proc: Optional[int] = None,
        max_shard_size: Optional[Union[int, str]] = None,
    ):
        """Generate one split, write it as shard files and give the shards their final names.

        Args:
            split_generator (`SplitGenerator`):
                Split to generate; its `split_info` is updated in place with the
                number of examples and bytes (and the shard lengths when more
                than one shard was written).
            check_duplicate_keys (`bool`):
                Forwarded to the writers as `check_duplicates`.
            file_format (`str`, defaults to `"arrow"`):
                `"parquet"` selects `ParquetWriter`, anything else `ArrowWriter`.
            num_proc (`int`, *optional*):
                Number of worker processes. It is lowered to the number of input
                shards found in `gen_kwargs` when that number is smaller.
            max_shard_size (`int` or `str`, *optional*):
                Size threshold after which a new shard is started. Falls back to
                `config.MAX_SHARD_SIZE` when falsy.
        """
        max_shard_size = convert_file_size_to_int(max_shard_size or config.MAX_SHARD_SIZE)

        if self.info.splits is not None:
            split_info = self.info.splits[split_generator.name]
        else:
            split_info = split_generator.split_info

        # Temporary file name pattern: JJJJJ = job id, SSSSS = shard id within the job,
        # NNNNN = total number of shards (only known once every job is done).
        SUFFIX = "-JJJJJ-SSSSS-of-NNNNN"
        fname = f"{self.dataset_name}-{split_generator.name}{SUFFIX}.{file_format}"
        fpath = posixpath.join(self._output_dir, fname)

        if num_proc and num_proc > 1:
            num_input_shards = _number_of_shards_in_gen_kwargs(split_generator.gen_kwargs)
            if num_input_shards <= 1:
                logger.warning(
                    f"Setting num_proc from {num_proc} back to 1 for the {split_info.name} split to disable multiprocessing as it only contains one shard."
                )
                num_proc = 1
            elif num_input_shards < num_proc:
                logger.warning(
                    f"Setting num_proc from {num_proc} to {num_input_shards} for the {split_info.name} split as it only contains {num_input_shards} shards."
                )
                num_proc = num_input_shards

        pbar = hf_tqdm(
            unit=" examples",
            total=split_info.num_examples,
            desc=f"Generating {split_info.name} split",
        )

        _prepare_split_args = {
            "fpath": fpath,
            "file_format": file_format,
            "max_shard_size": max_shard_size,
            "split_info": split_info,
            "check_duplicate_keys": check_duplicate_keys,
        }

        if num_proc is None or num_proc == 1:
            result = None
            gen_kwargs = split_generator.gen_kwargs
            job_id = 0
            with pbar:
                for job_id, done, content in self._prepare_split_single(
                    gen_kwargs=gen_kwargs, job_id=job_id, **_prepare_split_args
                ):
                    if done:
                        result = content
                    else:
                        pbar.update(content)
            # wrapping everything into lists for consistency with the multiprocessed code path
            assert result is not None, "Failed to retrieve results from prepare_split"
            examples_per_job, bytes_per_job, features_per_job, shards_per_job, shard_lengths_per_job = (
                [item] for item in result
            )
        else:
            kwargs_per_job = [
                {"gen_kwargs": gen_kwargs, "job_id": job_id, **_prepare_split_args}
                for job_id, gen_kwargs in enumerate(
                    _split_gen_kwargs(split_generator.gen_kwargs, max_num_jobs=num_proc)
                )
            ]
            num_jobs = len(kwargs_per_job)

            # Results arrive unordered, so they are stored by job id.
            examples_per_job = [None] * num_jobs
            bytes_per_job = [None] * num_jobs
            features_per_job = [None] * num_jobs
            shards_per_job = [None] * num_jobs
            shard_lengths_per_job = [None] * num_jobs

            with Pool(num_proc) as pool:
                with pbar:
                    for job_id, done, content in iflatmap_unordered(
                        pool, self._prepare_split_single, kwargs_iterable=kwargs_per_job
                    ):
                        if done:
                            # the content is the result of the job
                            (
                                examples_per_job[job_id],
                                bytes_per_job[job_id],
                                features_per_job[job_id],
                                shards_per_job[job_id],
                                shard_lengths_per_job[job_id],
                            ) = content
                        else:
                            # the content is the number of examples progress update
                            pbar.update(content)

            assert None not in examples_per_job, (
                f"Failed to retrieve results from prepare_split: result list {examples_per_job} still contains None - at least one worker failed to return its results"
            )

        total_shards = sum(shards_per_job)
        total_num_examples = sum(examples_per_job)
        total_num_bytes = sum(bytes_per_job)
        features = features_per_job[0]

        split_generator.split_info.num_examples = total_num_examples
        split_generator.split_info.num_bytes = total_num_bytes

        # should rename everything at the end
        logger.debug(f"Renaming {total_shards} shards.")
        if total_shards > 1:
            # use the -SSSSS-of-NNNNN pattern

            # Global id of the first shard of each job, computed once instead of
            # re-summing `shards_per_job[:job_id]` for every single shard.
            first_shard_id_per_job = []
            next_global_shard_id = 0
            for num_shards in shards_per_job:
                first_shard_id_per_job.append(next_global_shard_id)
                next_global_shard_id += num_shards

            def _rename_shard(shard_and_job: tuple[int, int]):
                shard_id, job_id = shard_and_job
                global_shard_id = first_shard_id_per_job[job_id] + shard_id
                self._rename(
                    fpath.replace("SSSSS", f"{shard_id:05d}").replace("JJJJJ", f"{job_id:05d}"),
                    fpath.replace("JJJJJ-SSSSS", f"{global_shard_id:05d}").replace("NNNNN", f"{total_shards:05d}"),
                )

            shards_and_jobs = [
                (shard_id, job_id)
                for job_id, num_shards in enumerate(shards_per_job)
                for shard_id in range(num_shards)
            ]
            thread_map(_rename_shard, shards_and_jobs, disable=True, max_workers=64)

            split_generator.split_info.shard_lengths = [
                shard_length for shard_lengths in shard_lengths_per_job for shard_length in shard_lengths
            ]
        else:
            # don't use any pattern
            shard_id, job_id = 0, 0
            self._rename(
                fpath.replace("SSSSS", f"{shard_id:05d}").replace("JJJJJ", f"{job_id:05d}"),
                fpath.replace(SUFFIX, ""),
            )

        if self.info.features is None:
            self.info.features = features

    def _prepare_split_single(
        self,
        gen_kwargs: dict,
        fpath: str,
        file_format: str,
        max_shard_size: int,
        split_info: SplitInfo,
        check_duplicate_keys: bool,
        job_id: int,
    ) -> Iterable[tuple[int, bool, Union[int, tuple]]]:
        """Run one generation job and write its examples to one or more shard files.

        Yields:
            `(job_id, False, n)` progress updates, where `n` is the number of
            examples written since the previous update, and finally one
            `(job_id, True, result)` item where `result` is the tuple
            `(num_examples, num_bytes, features, num_shards, shard_lengths)`.

        Raises:
            DatasetGenerationError: wraps any exception raised while generating
                or writing the examples.
        """
        generator = self._generate_examples(**gen_kwargs)
        writer_class = ParquetWriter if file_format == "parquet" else ArrowWriter
        embed_local_files = file_format == "parquet"
        shard_lengths = []
        total_num_examples, total_num_bytes = 0, 0

        def _open_shard_writer(features, shard_index: int):
            # Single place that builds a shard writer, so the first shard and the
            # follow-up shards are always configured identically.
            return writer_class(
                features=features,
                path=fpath.replace("SSSSS", f"{shard_index:05d}").replace("JJJJJ", f"{job_id:05d}"),
                writer_batch_size=self._writer_batch_size,
                hash_salt=split_info.name,
                check_duplicates=check_duplicate_keys,
                storage_options=self._fs.storage_options,
                embed_local_files=embed_local_files,
            )

        shard_id = 0
        num_examples_progress_update = 0
        try:
            writer = _open_shard_writer(self.info.features, shard_id)
            try:
                _time = time.time()
                for key, record in generator:
                    # Start a new shard once the current one has grown past the size limit.
                    if max_shard_size is not None and writer._num_bytes > max_shard_size:
                        num_examples, num_bytes = writer.finalize()
                        writer.close()
                        shard_lengths.append(num_examples)
                        total_num_examples += num_examples
                        total_num_bytes += num_bytes
                        shard_id += 1
                        writer = _open_shard_writer(writer._features, shard_id)
                    example = self.info.features.encode_example(record) if self.info.features is not None else record
                    writer.write(example, key)
                    num_examples_progress_update += 1
                    if time.time() > _time + config.PBAR_REFRESH_TIME_INTERVAL:
                        _time = time.time()
                        yield job_id, False, num_examples_progress_update
                        num_examples_progress_update = 0
            finally:
                # Flush the pending progress count and finalize the last shard, even on error.
                yield job_id, False, num_examples_progress_update
                num_shards = shard_id + 1
                num_examples, num_bytes = writer.finalize()
                writer.close()
                shard_lengths.append(num_examples)
                total_num_examples += num_examples
                total_num_bytes += num_bytes
        except Exception as e:
            # Ignore the writer's error for no examples written to the file if this error was caused by the error in _generate_examples before the first example was yielded
            if isinstance(e, SchemaInferenceError) and e.__context__ is not None:
                e = e.__context__
            raise DatasetGenerationError("An error occurred while generating the dataset") from e

        yield job_id, True, (total_num_examples, total_num_bytes, writer._features, num_shards, shard_lengths)

    def _download_and_prepare(self, dl_manager, verification_mode, **prepare_splits_kwargs):
        """Delegate to the parent implementation, enabling duplicate-key checks for BASIC_CHECKS and ALL_CHECKS."""
        super()._download_and_prepare(
            dl_manager,
            verification_mode,
            check_duplicate_keys=verification_mode
            in (VerificationMode.BASIC_CHECKS, VerificationMode.ALL_CHECKS),
            **prepare_splits_kwargs,
        )

    def _get_examples_iterable_for_split(self, split_generator: SplitGenerator) -> ExamplesIterable:
        """Return an `ExamplesIterable` over `_generate_examples` called with the split's `gen_kwargs`."""
        return ExamplesIterable(self._generate_examples, split_generator.gen_kwargs)
|
| 1622 |
+
|
| 1623 |
+
|
| 1624 |
+
class ArrowBasedBuilder(DatasetBuilder):
    """Base class for datasets with data generation based on Arrow loading functions (CSV/JSON/Parquet)."""

    @abc.abstractmethod
    def _generate_tables(self, **kwargs):
        """Default function generating tables for each `SplitGenerator`.

        This function is called once for each `SplitGenerator` defined in
        `_split_generators`. The tables yielded here are written to disk.

        Args:
            **kwargs (additional keyword arguments):
                Arguments forwarded from the SplitGenerator.gen_kwargs

        Yields:
            key: an identification key for the table. It is not used when the
                tables are written to disk by `_prepare_split_single`.
            table: `pyarrow.Table`, a table ready to be written to disk with
                `writer.write_table`.
        """
        raise NotImplementedError()

    def _prepare_split(
        self,
        split_generator: SplitGenerator,
        file_format: str = "arrow",
        num_proc: Optional[int] = None,
        max_shard_size: Optional[Union[str, int]] = None,
    ):
        """Generate one split, write it as shard files and give the shards their final names.

        Args:
            split_generator (`SplitGenerator`):
                Split to generate; its `split_info` is updated in place with the
                number of examples and bytes (and the shard lengths when more
                than one shard was written).
            file_format (`str`, defaults to `"arrow"`):
                `"parquet"` selects `ParquetWriter`, anything else `ArrowWriter`.
            num_proc (`int`, *optional*):
                Number of worker processes. It is lowered to the number of input
                shards found in `gen_kwargs` when that number is smaller.
            max_shard_size (`int` or `str`, *optional*):
                Size threshold after which a new shard is started. Falls back to
                `config.MAX_SHARD_SIZE` when falsy.
        """
        max_shard_size = convert_file_size_to_int(max_shard_size or config.MAX_SHARD_SIZE)

        # Deliberately broad: any failure to look the split up in `self.info.splits`
        # (including `splits` being None) falls back to the generator's own split info.
        try:
            split_info = self.info.splits[split_generator.name]
        except Exception:
            split_info = split_generator.split_info

        # Temporary file name pattern: JJJJJ = job id, SSSSS = shard id within the job,
        # NNNNN = total number of shards (only known once every job is done).
        SUFFIX = "-JJJJJ-SSSSS-of-NNNNN"
        fname = f"{self.dataset_name}-{split_generator.name}{SUFFIX}.{file_format}"
        fpath = posixpath.join(self._output_dir, fname)

        if num_proc and num_proc > 1:
            num_input_shards = _number_of_shards_in_gen_kwargs(split_generator.gen_kwargs)
            if num_input_shards <= 1:
                logger.warning(
                    f"Setting num_proc from {num_proc} back to 1 for the {split_info.name} split to disable multiprocessing as it only contains one shard."
                )
                num_proc = 1
            elif num_input_shards < num_proc:
                logger.warning(
                    f"Setting num_proc from {num_proc} to {num_input_shards} for the {split_info.name} split as it only contains {num_input_shards} shards."
                )
                num_proc = num_input_shards

        pbar = hf_tqdm(
            unit=" examples",
            total=split_info.num_examples,
            desc=f"Generating {split_info.name} split",
        )

        _prepare_split_args = {
            "fpath": fpath,
            "file_format": file_format,
            "max_shard_size": max_shard_size,
        }

        if num_proc is None or num_proc == 1:
            result = None
            gen_kwargs = split_generator.gen_kwargs
            job_id = 0
            with pbar:
                for job_id, done, content in self._prepare_split_single(
                    gen_kwargs=gen_kwargs, job_id=job_id, **_prepare_split_args
                ):
                    if done:
                        result = content
                    else:
                        pbar.update(content)
            # wrapping everything into lists for consistency with the multiprocessed code path
            assert result is not None, "Failed to retrieve results from prepare_split"
            examples_per_job, bytes_per_job, features_per_job, shards_per_job, shard_lengths_per_job = (
                [item] for item in result
            )
        else:
            kwargs_per_job = [
                {"gen_kwargs": gen_kwargs, "job_id": job_id, **_prepare_split_args}
                for job_id, gen_kwargs in enumerate(
                    _split_gen_kwargs(split_generator.gen_kwargs, max_num_jobs=num_proc)
                )
            ]
            num_jobs = len(kwargs_per_job)

            # Results arrive unordered, so they are stored by job id.
            examples_per_job = [None] * num_jobs
            bytes_per_job = [None] * num_jobs
            features_per_job = [None] * num_jobs
            shards_per_job = [None] * num_jobs
            shard_lengths_per_job = [None] * num_jobs

            with Pool(num_proc) as pool:
                with pbar:
                    for job_id, done, content in iflatmap_unordered(
                        pool, self._prepare_split_single, kwargs_iterable=kwargs_per_job
                    ):
                        if done:
                            # the content is the result of the job
                            (
                                examples_per_job[job_id],
                                bytes_per_job[job_id],
                                features_per_job[job_id],
                                shards_per_job[job_id],
                                shard_lengths_per_job[job_id],
                            ) = content
                        else:
                            # the content is the number of examples progress update
                            pbar.update(content)

            assert None not in examples_per_job, (
                f"Failed to retrieve results from prepare_split: result list {examples_per_job} still contains None - at least one worker failed to return its results"
            )

        total_shards = sum(shards_per_job)
        total_num_examples = sum(examples_per_job)
        total_num_bytes = sum(bytes_per_job)
        features = features_per_job[0]

        split_generator.split_info.num_examples = total_num_examples
        split_generator.split_info.num_bytes = total_num_bytes

        # should rename everything at the end
        logger.debug(f"Renaming {total_shards} shards.")
        if total_shards > 1:
            # use the -SSSSS-of-NNNNN pattern

            # Global id of the first shard of each job, computed once instead of
            # re-summing `shards_per_job[:job_id]` for every single shard.
            first_shard_id_per_job = []
            next_global_shard_id = 0
            for num_shards in shards_per_job:
                first_shard_id_per_job.append(next_global_shard_id)
                next_global_shard_id += num_shards

            def _rename_shard(shard_id_and_job: tuple[int, int]):
                shard_id, job_id = shard_id_and_job
                global_shard_id = first_shard_id_per_job[job_id] + shard_id
                self._rename(
                    fpath.replace("SSSSS", f"{shard_id:05d}").replace("JJJJJ", f"{job_id:05d}"),
                    fpath.replace("JJJJJ-SSSSS", f"{global_shard_id:05d}").replace("NNNNN", f"{total_shards:05d}"),
                )

            shard_ids_and_jobs = [
                (shard_id, job_id)
                for job_id, num_shards in enumerate(shards_per_job)
                for shard_id in range(num_shards)
            ]
            thread_map(_rename_shard, shard_ids_and_jobs, disable=True, max_workers=64)

            split_generator.split_info.shard_lengths = [
                shard_length for shard_lengths in shard_lengths_per_job for shard_length in shard_lengths
            ]
        else:
            # don't use any pattern
            shard_id, job_id = 0, 0
            self._rename(
                fpath.replace("SSSSS", f"{shard_id:05d}").replace("JJJJJ", f"{job_id:05d}"),
                fpath.replace(SUFFIX, ""),
            )

        if self.info.features is None:
            self.info.features = features

    def _prepare_split_single(
        self, gen_kwargs: dict, fpath: str, file_format: str, max_shard_size: int, job_id: int
    ) -> Iterable[tuple[int, bool, Union[int, tuple]]]:
        """Run one generation job and write its tables to one or more shard files.

        Yields:
            `(job_id, False, n)` progress updates, where `n` is the number of
            rows written since the previous update, and finally one
            `(job_id, True, result)` item where `result` is the tuple
            `(num_examples, num_bytes, features, num_shards, shard_lengths)`.

        Raises:
            DatasetGenerationCastError: when `writer.write_table` raises a `CastError`.
            DatasetGenerationError: wraps any other exception raised while
                generating or writing the tables.
        """
        gen_kwargs = {k: tracked_list(v) if isinstance(v, list) else v for k, v in gen_kwargs.items()}
        generator = self._generate_tables(**gen_kwargs)
        writer_class = ParquetWriter if file_format == "parquet" else ArrowWriter
        embed_local_files = file_format == "parquet"
        shard_lengths = []
        total_num_examples, total_num_bytes = 0, 0

        def _open_shard_writer(features, shard_index: int):
            # Single place that builds a shard writer, so the first shard and the
            # follow-up shards are always configured identically.
            return writer_class(
                features=features,
                path=fpath.replace("SSSSS", f"{shard_index:05d}").replace("JJJJJ", f"{job_id:05d}"),
                writer_batch_size=self._writer_batch_size,
                storage_options=self._fs.storage_options,
                embed_local_files=embed_local_files,
            )

        shard_id = 0
        num_examples_progress_update = 0
        try:
            writer = _open_shard_writer(self.info.features, shard_id)
            try:
                _time = time.time()
                for _, table in generator:
                    # Start a new shard once the current one has grown past the size limit.
                    if max_shard_size is not None and writer._num_bytes > max_shard_size:
                        num_examples, num_bytes = writer.finalize()
                        writer.close()
                        shard_lengths.append(num_examples)
                        total_num_examples += num_examples
                        total_num_bytes += num_bytes
                        shard_id += 1
                        writer = _open_shard_writer(writer._features, shard_id)
                    try:
                        writer.write_table(table)
                    except CastError as cast_error:
                        raise DatasetGenerationCastError.from_cast_error(
                            cast_error=cast_error,
                            builder_name=self.info.builder_name,
                            gen_kwargs=gen_kwargs,
                            token=self.token,
                        )
                    num_examples_progress_update += len(table)
                    if time.time() > _time + config.PBAR_REFRESH_TIME_INTERVAL:
                        _time = time.time()
                        yield job_id, False, num_examples_progress_update
                        num_examples_progress_update = 0
            finally:
                # Flush the pending progress count and finalize the last shard, even on error.
                yield job_id, False, num_examples_progress_update
                num_shards = shard_id + 1
                num_examples, num_bytes = writer.finalize()
                writer.close()
                shard_lengths.append(num_examples)
                total_num_examples += num_examples
                total_num_bytes += num_bytes
        except Exception as caught:
            e = caught
            # Ignore the writer's error for no examples written to the file if this error was caused by the error in _generate_tables before the first table was yielded
            if isinstance(caught, SchemaInferenceError) and caught.__context__ is not None:
                e = caught.__context__
            if isinstance(e, DatasetGenerationError):
                if e is caught:
                    raise
                # A bare `raise` here would re-raise the writer's SchemaInferenceError
                # instead of the generation error that was unwrapped from its context.
                raise e
            raise DatasetGenerationError("An error occurred while generating the dataset") from e

        yield job_id, True, (total_num_examples, total_num_bytes, writer._features, num_shards, shard_lengths)

    def _get_examples_iterable_for_split(self, split_generator: SplitGenerator) -> ExamplesIterable:
        """Return an `ArrowExamplesIterable` over `_generate_tables` called with the split's `gen_kwargs`."""
        return ArrowExamplesIterable(self._generate_tables, kwargs=split_generator.gen_kwargs)
|
datasets/combine.py
ADDED
|
@@ -0,0 +1,223 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Optional, TypeVar
|
| 2 |
+
|
| 3 |
+
from .arrow_dataset import Dataset, _concatenate_map_style_datasets, _interleave_map_style_datasets
|
| 4 |
+
from .dataset_dict import DatasetDict, IterableDatasetDict
|
| 5 |
+
from .info import DatasetInfo
|
| 6 |
+
from .iterable_dataset import IterableDataset, _concatenate_iterable_datasets, _interleave_iterable_datasets
|
| 7 |
+
from .splits import NamedSplit
|
| 8 |
+
from .utils import logging
|
| 9 |
+
from .utils.py_utils import Literal
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
logger = logging.get_logger(__name__)
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
DatasetType = TypeVar("DatasetType", Dataset, IterableDataset)
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def interleave_datasets(
    datasets: list[DatasetType],
    probabilities: Optional[list[float]] = None,
    seed: Optional[int] = None,
    info: Optional[DatasetInfo] = None,
    split: Optional[NamedSplit] = None,
    stopping_strategy: Literal[
        "first_exhausted", "all_exhausted", "all_exhausted_without_replacement"
    ] = "first_exhausted",
) -> DatasetType:
    """
    Interleave several datasets (sources) into a single dataset.
    The new dataset is constructed by alternating between the sources to get the examples.

    You can use this function on a list of [`Dataset`] objects, or on a list of [`IterableDataset`] objects.

        - If `probabilities` is `None` (default) the new dataset is constructed by cycling between each source to get the examples.
        - If `probabilities` is not `None`, the new dataset is constructed by getting examples from a random source at a time according to the provided probabilities.

    The resulting dataset ends when one of the source datasets runs out of examples, except when
    `stopping_strategy` is `all_exhausted`, in which case the resulting dataset ends when all datasets
    have run out of examples at least one time.

    Note for iterable datasets:

    In a distributed setup or in PyTorch DataLoader workers, the stopping strategy is applied per process.
    Therefore the "first_exhausted" strategy on a sharded iterable dataset can generate fewer samples in total (up to 1 missing sample per subdataset per worker).

    Args:
        datasets (`List[Dataset]` or `List[IterableDataset]`):
            List of datasets to interleave.
        probabilities (`List[float]`, *optional*, defaults to `None`):
            If specified, the new dataset is constructed by sampling
            examples from one source at a time according to these probabilities.
        seed (`int`, *optional*, defaults to `None`):
            The random seed used to choose a source for each example.
        info ([`DatasetInfo`], *optional*):
            Dataset information, like description, citation, etc.
            <Added version="2.4.0"/>
        split ([`NamedSplit`], *optional*):
            Name of the dataset split.
            <Added version="2.4.0"/>
        stopping_strategy (`str`, defaults to `first_exhausted`):
            Three strategies are proposed right now, `first_exhausted`, `all_exhausted` and `all_exhausted_without_replacement`.
            By default, `first_exhausted` is an undersampling strategy, i.e. the dataset construction is stopped as soon as one dataset has run out of samples.
            If the strategy is `all_exhausted`, we use an oversampling strategy, i.e. the dataset construction is stopped as soon as every sample of every dataset has been added at least once.
            When the strategy is `all_exhausted_without_replacement` we make sure that each sample in each dataset is sampled only once.
            Note that if the strategy is `all_exhausted`, the interleaved dataset size can get enormous:
            - with no probabilities, the resulting dataset will have `max_length_datasets*nb_dataset` samples.
            - with given probabilities, the resulting dataset will have more samples if some datasets have really low probability of visiting.

    Returns:
        [`Dataset`] or [`IterableDataset`]: Return type depends on the input `datasets`
        parameter. `Dataset` if the input is a list of `Dataset`, `IterableDataset` if the input is a list of
        `IterableDataset`.

    Raises:
        ValueError: If `datasets` is empty, if an element is not a [`Dataset`] or [`IterableDataset`]
            (including a `DatasetDict`/`IterableDatasetDict`), if map-style and iterable datasets are mixed,
            or if `stopping_strategy` is not one of the supported values.

    Example:

        For regular datasets (map-style):

        ```python
        >>> from datasets import Dataset, interleave_datasets
        >>> d1 = Dataset.from_dict({"a": [0, 1, 2]})
        >>> d2 = Dataset.from_dict({"a": [10, 11, 12]})
        >>> d3 = Dataset.from_dict({"a": [20, 21, 22]})
        >>> dataset = interleave_datasets([d1, d2, d3], probabilities=[0.7, 0.2, 0.1], seed=42, stopping_strategy="all_exhausted")
        >>> dataset["a"]
        [10, 0, 11, 1, 2, 20, 12, 10, 0, 1, 2, 21, 0, 11, 1, 2, 0, 1, 12, 2, 10, 0, 22]
        >>> dataset = interleave_datasets([d1, d2, d3], probabilities=[0.7, 0.2, 0.1], seed=42)
        >>> dataset["a"]
        [10, 0, 11, 1, 2]
        >>> dataset = interleave_datasets([d1, d2, d3])
        >>> dataset["a"]
        [0, 10, 20, 1, 11, 21, 2, 12, 22]
        >>> dataset = interleave_datasets([d1, d2, d3], stopping_strategy="all_exhausted")
        >>> dataset["a"]
        [0, 10, 20, 1, 11, 21, 2, 12, 22]
        >>> d1 = Dataset.from_dict({"a": [0, 1, 2]})
        >>> d2 = Dataset.from_dict({"a": [10, 11, 12, 13]})
        >>> d3 = Dataset.from_dict({"a": [20, 21, 22, 23, 24]})
        >>> dataset = interleave_datasets([d1, d2, d3])
        >>> dataset["a"]
        [0, 10, 20, 1, 11, 21, 2, 12, 22]
        >>> dataset = interleave_datasets([d1, d2, d3], stopping_strategy="all_exhausted")
        >>> dataset["a"]
        [0, 10, 20, 1, 11, 21, 2, 12, 22, 0, 13, 23, 1, 10, 24]
        >>> dataset = interleave_datasets([d1, d2, d3], probabilities=[0.7, 0.2, 0.1], seed=42)
        >>> dataset["a"]
        [10, 0, 11, 1, 2]
        >>> dataset = interleave_datasets([d1, d2, d3], probabilities=[0.7, 0.2, 0.1], seed=42, stopping_strategy="all_exhausted")
        >>> dataset["a"]
        [10, 0, 11, 1, 2, 20, 12, 13, ..., 0, 1, 2, 0, 24]
        ```

        For datasets in streaming mode (iterable):

        ```python
        >>> from datasets import interleave_datasets, load_dataset
        >>> d1 = load_dataset('allenai/c4', 'es', split='train', streaming=True)
        >>> d2 = load_dataset('allenai/c4', 'fr', split='train', streaming=True)
        >>> dataset = interleave_datasets([d1, d2])
        >>> iterator = iter(dataset)
        >>> next(iterator)
        {'text': 'Comprar Zapatillas para niña en chancla con goma por...'}
        >>> next(iterator)
        {'text': 'Le sacre de philippe ier, 23 mai 1059 - Compte Rendu...'}
        ```
    """
    if not datasets:
        raise ValueError("Unable to interleave an empty list of datasets.")
    for i, dataset in enumerate(datasets):
        if not isinstance(dataset, (Dataset, IterableDataset)):
            # Give a targeted hint when the caller passed a dict of splits instead of a single split.
            if isinstance(dataset, (DatasetDict, IterableDatasetDict)):
                if not dataset:
                    raise ValueError(
                        f"Expected a list of Dataset objects or a list of IterableDataset objects, but element at position {i} "
                        "is an empty dataset dictionary."
                    )
                raise ValueError(
                    f"Dataset at position {i} has at least one split: {list(dataset)}\n"
                    f"Please pick one to interleave with the other datasets, for example: dataset['{next(iter(dataset))}']"
                )
            raise ValueError(
                f"Expected a list of Dataset objects or a list of IterableDataset objects, but element at position {i} is a {type(dataset).__name__}."
            )
        if i == 0:
            # The first element fixes the expected type for the remaining elements.
            dataset_type, other_type = (
                (Dataset, IterableDataset) if isinstance(dataset, Dataset) else (IterableDataset, Dataset)
            )
        elif not isinstance(dataset, dataset_type):
            raise ValueError(
                f"Unable to interleave a {dataset_type.__name__} (at position 0) with a {other_type.__name__} (at position {i}). Expected a list of Dataset objects or a list of IterableDataset objects."
            )
    if stopping_strategy not in ("first_exhausted", "all_exhausted", "all_exhausted_without_replacement"):
        raise ValueError(f"{stopping_strategy} is not supported. Please enter a valid stopping_strategy.")
    if dataset_type is Dataset:
        return _interleave_map_style_datasets(
            datasets, probabilities, seed, info=info, split=split, stopping_strategy=stopping_strategy
        )
    else:
        return _interleave_iterable_datasets(
            datasets,
            probabilities,
            seed,
            info=info,
            split=split,
            stopping_strategy=stopping_strategy,
        )
|
| 164 |
+
|
| 165 |
+
|
| 166 |
+
def concatenate_datasets(
    dsets: list[DatasetType],
    info: Optional[DatasetInfo] = None,
    split: Optional[NamedSplit] = None,
    axis: int = 0,
) -> DatasetType:
    """
    Concatenate a list of [`Dataset`] objects, or a list of [`IterableDataset`] objects, into a single dataset.

    Args:
        dsets (`List[datasets.Dataset]` or `List[datasets.IterableDataset]`):
            List of Datasets to concatenate. All elements must be of the same kind
            (all map-style or all iterable).
        info (`DatasetInfo`, *optional*):
            Dataset information, like description, citation, etc.
        split (`NamedSplit`, *optional*):
            Name of the dataset split.
        axis (`{0, 1}`, defaults to `0`):
            Axis to concatenate over, where `0` means over rows (vertically) and `1` means over columns
            (horizontally).

            <Added version="1.6.0"/>

    Returns:
        [`Dataset`] or [`IterableDataset`]: Same kind as the elements of `dsets`.

    Raises:
        ValueError: If `dsets` is empty, if an element is not a [`Dataset`] or [`IterableDataset`]
            (including a `DatasetDict`/`IterableDatasetDict`), or if map-style and iterable datasets are mixed.

    Example:

    ```py
    >>> ds3 = concatenate_datasets([ds1, ds2])
    ```
    """

    if not dsets:
        raise ValueError("Unable to concatenate an empty list of datasets.")
    for i, dataset in enumerate(dsets):
        if not isinstance(dataset, (Dataset, IterableDataset)):
            # Give a targeted hint when the caller passed a dict of splits instead of a single split.
            if isinstance(dataset, (DatasetDict, IterableDatasetDict)):
                if not dataset:
                    raise ValueError(
                        f"Expected a list of Dataset objects or a list of IterableDataset objects, but element at position {i} "
                        "is an empty dataset dictionary."
                    )
                raise ValueError(
                    f"Dataset at position {i} has at least one split: {list(dataset)}\n"
                    f"Please pick one to concatenate with the other datasets, for example: dataset['{next(iter(dataset))}']"
                )
            raise ValueError(
                f"Expected a list of Dataset objects or a list of IterableDataset objects, but element at position {i} is a {type(dataset).__name__}."
            )
        if i == 0:
            # The first element fixes the expected type for the remaining elements.
            dataset_type, other_type = (
                (Dataset, IterableDataset) if isinstance(dataset, Dataset) else (IterableDataset, Dataset)
            )
        elif not isinstance(dataset, dataset_type):
            raise ValueError(
                f"Unable to concatenate a {dataset_type.__name__} (at position 0) with a {other_type.__name__} (at position {i}). Expected a list of Dataset objects or a list of IterableDataset objects."
            )
    if dataset_type is Dataset:
        return _concatenate_map_style_datasets(dsets, info=info, split=split, axis=axis)
    else:
        return _concatenate_iterable_datasets(dsets, info=info, split=split, axis=axis)
|
datasets/config.py
ADDED
|
@@ -0,0 +1,268 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import importlib
import importlib.metadata
import importlib.util  # explicit: `importlib.util.find_spec` is used below and submodules are not loaded by `import importlib`
import logging
import os
import platform
from pathlib import Path
from typing import Optional

from huggingface_hub import constants
from packaging import version


logger = logging.getLogger(__name__.split(".", 1)[0])  # to avoid circular import from .utils.logging

# Datasets
S3_DATASETS_BUCKET_PREFIX = "https://s3.amazonaws.com/datasets.huggingface.co/datasets/datasets"
CLOUDFRONT_DATASETS_DISTRIB_PREFIX = "https://cdn-datasets.huggingface.co/datasets/datasets"
REPO_DATASETS_URL = "https://raw.githubusercontent.com/huggingface/datasets/{revision}/datasets/{path}/{name}"

# Hub
HF_ENDPOINT = os.environ.get("HF_ENDPOINT", "https://huggingface.co")
HUB_DATASETS_URL = HF_ENDPOINT + "/datasets/{repo_id}/resolve/{revision}/{path}"
HUB_DATASETS_HFFS_URL = "hf://datasets/{repo_id}@{revision}/{path}"
HUB_DEFAULT_VERSION = "main"

PY_VERSION = version.parse(platform.python_version())

# General environment variables accepted values for booleans
ENV_VARS_TRUE_VALUES = {"1", "ON", "YES", "TRUE"}
ENV_VARS_FALSE_VALUES = {"0", "OFF", "NO", "FALSE"}
ENV_VARS_TRUE_AND_AUTO_VALUES = ENV_VARS_TRUE_VALUES.union({"AUTO"})
ENV_VARS_FALSE_AND_AUTO_VALUES = ENV_VARS_FALSE_VALUES.union({"AUTO"})


# Imports
# Versions of the distributions looked up unconditionally; a missing distribution raises
# `importlib.metadata.PackageNotFoundError` at import time.
DILL_VERSION = version.parse(importlib.metadata.version("dill"))
FSSPEC_VERSION = version.parse(importlib.metadata.version("fsspec"))
PANDAS_VERSION = version.parse(importlib.metadata.version("pandas"))
PYARROW_VERSION = version.parse(importlib.metadata.version("pyarrow"))
HF_HUB_VERSION = version.parse(importlib.metadata.version("huggingface_hub"))

# Framework switches are upper-cased so they can be compared against the ENV_VARS_* sets above.
USE_TF = os.environ.get("USE_TF", "AUTO").upper()
USE_TORCH = os.environ.get("USE_TORCH", "AUTO").upper()
USE_JAX = os.environ.get("USE_JAX", "AUTO").upper()

TORCH_VERSION = "N/A"
TORCH_AVAILABLE = False

# PyTorch is probed unless USE_TORCH is outside the true/auto values or USE_TF is explicitly true.
if USE_TORCH in ENV_VARS_TRUE_AND_AUTO_VALUES and USE_TF not in ENV_VARS_TRUE_VALUES:
    TORCH_AVAILABLE = importlib.util.find_spec("torch") is not None
    if TORCH_AVAILABLE:
        try:
            TORCH_VERSION = version.parse(importlib.metadata.version("torch"))
            logger.debug(f"PyTorch version {TORCH_VERSION} available.")
        except importlib.metadata.PackageNotFoundError:
            # Module is importable but has no distribution metadata: keep TORCH_VERSION as "N/A".
            pass
else:
    logger.info("Disabling PyTorch because USE_TF is set")

POLARS_VERSION = "N/A"
POLARS_AVAILABLE = importlib.util.find_spec("polars") is not None

if POLARS_AVAILABLE:
    try:
        POLARS_VERSION = version.parse(importlib.metadata.version("polars"))
        logger.debug(f"Polars version {POLARS_VERSION} available.")
    except importlib.metadata.PackageNotFoundError:
        pass


DUCKDB_VERSION = "N/A"
DUCKDB_AVAILABLE = importlib.util.find_spec("duckdb") is not None

if DUCKDB_AVAILABLE:
    try:
        DUCKDB_VERSION = version.parse(importlib.metadata.version("duckdb"))
        logger.debug(f"Duckdb version {DUCKDB_VERSION} available.")
    except importlib.metadata.PackageNotFoundError:
        pass

TF_VERSION = "N/A"
TF_AVAILABLE = False

# TensorFlow is probed unless USE_TF is outside the true/auto values or USE_TORCH is explicitly true.
if USE_TF in ENV_VARS_TRUE_AND_AUTO_VALUES and USE_TORCH not in ENV_VARS_TRUE_VALUES:
    TF_AVAILABLE = importlib.util.find_spec("tensorflow") is not None
    if TF_AVAILABLE:
        # For the metadata, we have to look for both tensorflow and tensorflow-cpu
        for package in [
            "tensorflow",
            "tensorflow-cpu",
            "tensorflow-gpu",
            "tf-nightly",
            "tf-nightly-cpu",
            "tf-nightly-gpu",
            "intel-tensorflow",
            "tensorflow-rocm",
            "tensorflow-macos",
        ]:
            try:
                TF_VERSION = version.parse(importlib.metadata.version(package))
            except importlib.metadata.PackageNotFoundError:
                continue
            else:
                # Stop at the first distribution name that resolves.
                break
        else:
            # None of the known distribution names is installed.
            TF_AVAILABLE = False
    if TF_AVAILABLE:
        if TF_VERSION.major < 2:
            logger.info(f"TensorFlow found but with version {TF_VERSION}. `datasets` requires version 2 minimum.")
            TF_AVAILABLE = False
        else:
            logger.info(f"TensorFlow version {TF_VERSION} available.")
else:
    logger.info("Disabling Tensorflow because USE_TORCH is set")


JAX_VERSION = "N/A"
JAX_AVAILABLE = False

if USE_JAX in ENV_VARS_TRUE_AND_AUTO_VALUES:
    # Both `jax` and `jaxlib` must be importable.
    JAX_AVAILABLE = importlib.util.find_spec("jax") is not None and importlib.util.find_spec("jaxlib") is not None
    if JAX_AVAILABLE:
        try:
            JAX_VERSION = version.parse(importlib.metadata.version("jax"))
            logger.info(f"JAX version {JAX_VERSION} available.")
        except importlib.metadata.PackageNotFoundError:
            pass
else:
    logger.info("Disabling JAX because USE_JAX is set to False")


# Optional tools for data loading
SQLALCHEMY_AVAILABLE = importlib.util.find_spec("sqlalchemy") is not None

# Optional tools for feature decoding
PIL_AVAILABLE = importlib.util.find_spec("PIL") is not None
IS_OPUS_SUPPORTED = True
IS_MP3_SUPPORTED = True
TORCHCODEC_AVAILABLE = importlib.util.find_spec("torchcodec") is not None
TORCHVISION_AVAILABLE = importlib.util.find_spec("torchvision") is not None
PDFPLUMBER_AVAILABLE = importlib.util.find_spec("pdfplumber") is not None

# Optional compression tools
RARFILE_AVAILABLE = importlib.util.find_spec("rarfile") is not None
ZSTANDARD_AVAILABLE = importlib.util.find_spec("zstandard") is not None
LZ4_AVAILABLE = importlib.util.find_spec("lz4") is not None
PY7ZR_AVAILABLE = importlib.util.find_spec("py7zr") is not None

# Cache location
DEFAULT_XDG_CACHE_HOME = "~/.cache"
XDG_CACHE_HOME = os.getenv("XDG_CACHE_HOME", DEFAULT_XDG_CACHE_HOME)
DEFAULT_HF_CACHE_HOME = os.path.join(XDG_CACHE_HOME, "huggingface")
HF_CACHE_HOME = os.path.expanduser(os.getenv("HF_HOME", DEFAULT_HF_CACHE_HOME))

DEFAULT_HF_DATASETS_CACHE = os.path.join(HF_CACHE_HOME, "datasets")
HF_DATASETS_CACHE = Path(os.getenv("HF_DATASETS_CACHE", DEFAULT_HF_DATASETS_CACHE))

DEFAULT_HF_MODULES_CACHE = os.path.join(HF_CACHE_HOME, "modules")
HF_MODULES_CACHE = Path(os.getenv("HF_MODULES_CACHE", DEFAULT_HF_MODULES_CACHE))

DOWNLOADED_DATASETS_DIR = "downloads"
DEFAULT_DOWNLOADED_DATASETS_PATH = os.path.join(HF_DATASETS_CACHE, DOWNLOADED_DATASETS_DIR)
DOWNLOADED_DATASETS_PATH = Path(os.getenv("HF_DATASETS_DOWNLOADED_DATASETS_PATH", DEFAULT_DOWNLOADED_DATASETS_PATH))

EXTRACTED_DATASETS_DIR = "extracted"
# NOTE: the default is derived from DEFAULT_DOWNLOADED_DATASETS_PATH, not from the
# (possibly overridden) DOWNLOADED_DATASETS_PATH.
DEFAULT_EXTRACTED_DATASETS_PATH = os.path.join(DEFAULT_DOWNLOADED_DATASETS_PATH, EXTRACTED_DATASETS_DIR)
EXTRACTED_DATASETS_PATH = Path(os.getenv("HF_DATASETS_EXTRACTED_DATASETS_PATH", DEFAULT_EXTRACTED_DATASETS_PATH))

# Download count for the website
HF_UPDATE_DOWNLOAD_COUNTS = (
    os.environ.get("HF_UPDATE_DOWNLOAD_COUNTS", "AUTO").upper() in ENV_VARS_TRUE_AND_AUTO_VALUES
)

# For downloads and to check remote files metadata
HF_DATASETS_MULTITHREADING_MAX_WORKERS = 16

# Dataset viewer API
USE_PARQUET_EXPORT = True

# Batch size constants. For more info, see:
# https://github.com/apache/arrow/blob/master/docs/source/cpp/arrays.rst#size-limitations-and-recommendations)
DEFAULT_MAX_BATCH_SIZE = 1000

DEFAULT_CDC_OPTIONS = {"min_chunk_size": 256 * 1024, "max_chunk_size": 1024 * 1024, "norm_level": 0}

# Size of the preloaded record batch in `Dataset.__iter__`
ARROW_READER_BATCH_SIZE_IN_DATASET_ITER = 10

# Max uncompressed shard size in bytes (e.g. to shard parquet datasets in push_to_hub or download_and_prepare)
MAX_SHARD_SIZE = "500MB"

# Max uncompressed row group size in bytes (e.g. for parquet files in push_to_hub or download_and_prepare)
MAX_ROW_GROUP_SIZE = "100MB"

# Parquet configuration
PARQUET_ROW_GROUP_SIZE_FOR_AUDIO_DATASETS = None
PARQUET_ROW_GROUP_SIZE_FOR_IMAGE_DATASETS = None
PARQUET_ROW_GROUP_SIZE_FOR_BINARY_DATASETS = None
PARQUET_ROW_GROUP_SIZE_FOR_VIDEO_DATASETS = None

# Arrow configuration
ARROW_RECORD_BATCH_SIZE_FOR_AUDIO_DATASETS = 100
ARROW_RECORD_BATCH_SIZE_FOR_IMAGE_DATASETS = 100
ARROW_RECORD_BATCH_SIZE_FOR_BINARY_DATASETS = 100
ARROW_RECORD_BATCH_SIZE_FOR_VIDEO_DATASETS = 10

# Offline mode
# HF_DATASETS_OFFLINE, when set, takes precedence over huggingface_hub's own offline constant.
_offline = os.environ.get("HF_DATASETS_OFFLINE")
HF_HUB_OFFLINE = constants.HF_HUB_OFFLINE if _offline is None else _offline.upper() in ENV_VARS_TRUE_VALUES
HF_DATASETS_OFFLINE = HF_HUB_OFFLINE  # kept for backward-compatibility

# Here, `True` will disable progress bars globally without possibility of enabling it
# programmatically. `False` will enable them without possibility of disabling them.
# If environment variable is not set (None), then the user is free to enable/disable
# them programmatically.
# TL;DR: env variable has priority over code
__HF_DATASETS_DISABLE_PROGRESS_BARS = os.environ.get("HF_DATASETS_DISABLE_PROGRESS_BARS")
HF_DATASETS_DISABLE_PROGRESS_BARS: Optional[bool] = (
    __HF_DATASETS_DISABLE_PROGRESS_BARS.upper() in ENV_VARS_TRUE_VALUES
    if __HF_DATASETS_DISABLE_PROGRESS_BARS is not None
    else None
)

# In-memory
DEFAULT_IN_MEMORY_MAX_SIZE = 0  # Disabled
IN_MEMORY_MAX_SIZE = float(os.environ.get("HF_DATASETS_IN_MEMORY_MAX_SIZE", DEFAULT_IN_MEMORY_MAX_SIZE))

# File names
DATASET_ARROW_FILENAME = "dataset.arrow"
DATASET_INDICES_FILENAME = "indices.arrow"
DATASET_STATE_JSON_FILENAME = "state.json"
DATASET_INFO_FILENAME = "dataset_info.json"
DATASETDICT_INFOS_FILENAME = "dataset_infos.json"
LICENSE_FILENAME = "LICENSE"
DATASETDICT_JSON_FILENAME = "dataset_dict.json"
METADATA_CONFIGS_FIELD = "configs"
REPOCARD_FILENAME = "README.md"
REPOYAML_FILENAME = ".huggingface.yaml"

MODULE_NAME_FOR_DYNAMIC_MODULES = "datasets_modules"

MAX_DATASET_CONFIG_ID_READABLE_LENGTH = 255

# Temporary cache directory prefix
TEMP_CACHE_DIR_PREFIX = "hf_datasets-"

# Streaming
STREAMING_READ_MAX_RETRIES = 20
STREAMING_READ_RETRY_INTERVAL = 5
STREAMING_OPEN_MAX_RETRIES = 20
STREAMING_OPEN_RETRY_INTERVAL = 5

# Datasets repositories exploration
DATA_FILES_MAX_NUMBER_FOR_MODULE_INFERENCE = 200
GLOBBED_DATA_FILES_MAX_NUMBER_FOR_MODULE_INFERENCE = 10
ARCHIVED_DATA_FILES_MAX_NUMBER_FOR_MODULE_INFERENCE = 200

# Async map functions
MAX_NUM_RUNNING_ASYNC_MAP_FUNCTIONS_IN_PARALLEL = 1000

# Progress bars
PBAR_REFRESH_TIME_INTERVAL = 0.05  # 20 progress updates per sec

# Maximum number of uploaded files per commit
UPLOADS_MAX_NUMBER_PER_COMMIT = 50

# Backward compatibility
MAX_TABLE_NBYTES_FOR_PICKLING = 4 << 30
|
datasets/data_files.py
ADDED
|
@@ -0,0 +1,807 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import re
|
| 3 |
+
from functools import partial
|
| 4 |
+
from glob import has_magic
|
| 5 |
+
from pathlib import Path, PurePath
|
| 6 |
+
from typing import Callable, Optional, Union
|
| 7 |
+
|
| 8 |
+
import huggingface_hub
|
| 9 |
+
from fsspec.core import url_to_fs
|
| 10 |
+
from huggingface_hub import HfFileSystem
|
| 11 |
+
from packaging import version
|
| 12 |
+
from tqdm.contrib.concurrent import thread_map
|
| 13 |
+
|
| 14 |
+
from . import config
|
| 15 |
+
from .download import DownloadConfig
|
| 16 |
+
from .naming import _split_re
|
| 17 |
+
from .splits import Split
|
| 18 |
+
from .utils import logging
|
| 19 |
+
from .utils import tqdm as hf_tqdm
|
| 20 |
+
from .utils.file_utils import _prepare_path_and_storage_options, is_local_path, is_relative_path, xbasename, xjoin
|
| 21 |
+
from .utils.py_utils import string_to_dict
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
# Origin metadata attached to one data file. As produced in this module it is either
# (repo_id, revision) for files on the Hugging Face Hub, a 1-tuple holding the
# stringified ETag / mtime for other files, or an empty tuple when none is available.
SingleOriginMetadata = Union[tuple[str, str], tuple[str], tuple[()]]


# Split name used when the user passes patterns without naming a split.
SANITIZED_DEFAULT_SPLIT = str(Split.TRAIN)


# Module-level logger.
logger = logging.get_logger(__name__)
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
class Url(str):
    """Plain ``str`` subclass; it adds no behavior of its own."""
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
class EmptyDatasetError(FileNotFoundError):
    """Raised when a directory or repository contains no data files."""
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
# Glob template for sharded split files such as data/train-00000-of-00001.parquet.
# "{split}" is replaced by a split name, or by "*" when discovering which splits exist.
SPLIT_PATTERN_SHARDED = "data/{split}-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*"

# Keywords that identify each default split in a file or directory name.
SPLIT_KEYWORDS = {
    Split.TRAIN: ["train", "training"],
    Split.VALIDATION: ["validation", "valid", "dev", "val"],
    Split.TEST: ["test", "testing", "eval", "evaluation"],
}
# Characters that may delimit a keyword; substituted for "{sep}" inside a glob character class.
NON_WORDS_CHARS = "-._ 0-9"
# The base glob templates are selected according to the installed fsspec version.
if config.FSSPEC_VERSION < version.parse("2023.9.0"):
    KEYWORDS_IN_FILENAME_BASE_PATTERNS = ["**[{sep}/]{keyword}[{sep}]*", "{keyword}[{sep}]*"]
    KEYWORDS_IN_DIR_NAME_BASE_PATTERNS = [
        "{keyword}/**",
        "{keyword}[{sep}]*/**",
        "**[{sep}/]{keyword}/**",
        "**[{sep}/]{keyword}[{sep}]*/**",
    ]
elif config.FSSPEC_VERSION < version.parse("2023.12.0"):
    KEYWORDS_IN_FILENAME_BASE_PATTERNS = ["**/*[{sep}/]{keyword}[{sep}]*", "{keyword}[{sep}]*"]
    KEYWORDS_IN_DIR_NAME_BASE_PATTERNS = [
        "{keyword}/**/*",
        "{keyword}[{sep}]*/**/*",
        "**/*[{sep}/]{keyword}/**/*",
        "**/*[{sep}/]{keyword}[{sep}]*/**/*",
    ]
else:
    KEYWORDS_IN_FILENAME_BASE_PATTERNS = ["**/{keyword}[{sep}]*", "**/*[{sep}]{keyword}[{sep}]*"]
    KEYWORDS_IN_DIR_NAME_BASE_PATTERNS = [
        "**/{keyword}/**",
        "**/{keyword}[{sep}]*/**",
        "**/*[{sep}]{keyword}/**",
        "**/*[{sep}]{keyword}[{sep}]*/**",
    ]

# Splits for which default patterns are generated, in this order.
DEFAULT_SPLITS = [Split.TRAIN, Split.VALIDATION, Split.TEST]
# split -> concrete glob patterns where the keyword appears in the file name.
DEFAULT_PATTERNS_SPLIT_IN_FILENAME = {
    split: [
        pattern.format(keyword=keyword, sep=NON_WORDS_CHARS)
        for keyword in SPLIT_KEYWORDS[split]
        for pattern in KEYWORDS_IN_FILENAME_BASE_PATTERNS
    ]
    for split in DEFAULT_SPLITS
}
# split -> concrete glob patterns where the keyword appears in a directory name.
DEFAULT_PATTERNS_SPLIT_IN_DIR_NAME = {
    split: [
        pattern.format(keyword=keyword, sep=NON_WORDS_CHARS)
        for keyword in SPLIT_KEYWORDS[split]
        for pattern in KEYWORDS_IN_DIR_NAME_BASE_PATTERNS
    ]
    for split in DEFAULT_SPLITS
}


# Catch-all: every file goes to the train split.
DEFAULT_PATTERNS_ALL = {
    Split.TRAIN: ["**"],
}

# Split-templated patterns, tried first by _get_data_files_patterns.
ALL_SPLIT_PATTERNS = [SPLIT_PATTERN_SHARDED]
# Default pattern dictionaries, tried in this order by _get_data_files_patterns.
ALL_DEFAULT_PATTERNS = [
    DEFAULT_PATTERNS_SPLIT_IN_DIR_NAME,
    DEFAULT_PATTERNS_SPLIT_IN_FILENAME,
    DEFAULT_PATTERNS_ALL,
]
# Characters whose presence makes contains_wildcards() return True.
WILDCARD_CHARACTERS = "*[]"
# Base names skipped by resolve_pattern unless the pattern's own base name is one of them.
FILES_TO_IGNORE = [
    "README.md",
    "config.json",
    "dataset_info.json",
    "dataset_infos.json",
    "dummy_data.zip",
    "dataset_dict.json",
]
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
def contains_wildcards(pattern: str) -> bool:
    """Return True if ``pattern`` contains at least one character from ``WILDCARD_CHARACTERS``."""
    for candidate in WILDCARD_CHARACTERS:
        if candidate in pattern:
            return True
    return False
|
| 116 |
+
|
| 117 |
+
|
| 118 |
+
def sanitize_patterns(patterns: Union[dict, list, str]) -> dict[str, Union[list[str], "DataFilesList"]]:
    """
    Normalize user-supplied ``data_files`` patterns into a ``split_name -> patterns`` dictionary.

    Accepted inputs:
        - a dict: keys are converted to ``str`` and non-list values are wrapped in a list;
        - a single string: becomes the only pattern of the default split ("train");
        - a list of strings: becomes the patterns of the default split;
        - a list of ``{"split": ..., "path": ...}`` dicts, where "path" is a string or a list;
        - any other iterable: converted to a list and processed again.

    Returns:
        patterns: dictionary of split_name -> list of patterns

    Raises:
        ValueError: if a list mixes dict and non-dict entries, if a dict entry is not exactly
            ``{"split", "path"}`` with a str/list path, or if a split appears more than once.
    """
    if isinstance(patterns, dict):
        sanitized = {}
        for split_name, split_patterns in patterns.items():
            sanitized[str(split_name)] = split_patterns if isinstance(split_patterns, list) else [split_patterns]
        return sanitized
    if isinstance(patterns, str):
        return {SANITIZED_DEFAULT_SPLIT: [patterns]}
    if not isinstance(patterns, list):
        # Any other iterable (tuple, set, ...) is handled as a list.
        return sanitize_patterns(list(patterns))
    if not any(isinstance(entry, dict) for entry in patterns):
        return {SANITIZED_DEFAULT_SPLIT: patterns}

    # At least one entry is a dict, so every entry must be a {"split": ..., "path": ...} dict.
    for entry in patterns:
        is_valid_entry = (
            isinstance(entry, dict)
            and len(entry) == 2
            and "split" in entry
            and isinstance(entry.get("path"), (str, list))
        )
        if not is_valid_entry:
            raise ValueError(
                f"Expected each split to have a 'path' key which can be a string or a list of strings, but got {entry}"
            )
    splits = [entry["split"] for entry in patterns]
    if len(set(splits)) != len(splits):
        raise ValueError(f"Some splits are duplicated in data_files: {splits}")
    sanitized = {}
    for entry in patterns:
        path = entry["path"]
        sanitized[str(entry["split"])] = path if isinstance(path, list) else [path]
    return sanitized
|
| 154 |
+
|
| 155 |
+
|
| 156 |
+
def _is_inside_unrequested_special_dir(matched_rel_path: str, pattern: str) -> bool:
|
| 157 |
+
"""
|
| 158 |
+
When a path matches a pattern, we additionally check if it's inside a special directory
|
| 159 |
+
we ignore by default (if it starts with a double underscore).
|
| 160 |
+
|
| 161 |
+
Users can still explicitly request a filepath inside such a directory if "__pycache__" is
|
| 162 |
+
mentioned explicitly in the requested pattern.
|
| 163 |
+
|
| 164 |
+
Some examples:
|
| 165 |
+
|
| 166 |
+
base directory:
|
| 167 |
+
|
| 168 |
+
./
|
| 169 |
+
└── __pycache__
|
| 170 |
+
└── b.txt
|
| 171 |
+
|
| 172 |
+
>>> _is_inside_unrequested_special_dir("__pycache__/b.txt", "**")
|
| 173 |
+
True
|
| 174 |
+
>>> _is_inside_unrequested_special_dir("__pycache__/b.txt", "*/b.txt")
|
| 175 |
+
True
|
| 176 |
+
>>> _is_inside_unrequested_special_dir("__pycache__/b.txt", "__pycache__/*")
|
| 177 |
+
False
|
| 178 |
+
>>> _is_inside_unrequested_special_dir("__pycache__/b.txt", "__*/*")
|
| 179 |
+
False
|
| 180 |
+
"""
|
| 181 |
+
# We just need to check if every special directories from the path is present explicitly in the pattern.
|
| 182 |
+
# Since we assume that the path matches the pattern, it's equivalent to counting that both
|
| 183 |
+
# the parent path and the parent pattern have the same number of special directories.
|
| 184 |
+
data_dirs_to_ignore_in_path = [part for part in PurePath(matched_rel_path).parent.parts if part.startswith("__")]
|
| 185 |
+
data_dirs_to_ignore_in_pattern = [part for part in PurePath(pattern).parent.parts if part.startswith("__")]
|
| 186 |
+
return len(data_dirs_to_ignore_in_path) != len(data_dirs_to_ignore_in_pattern)
|
| 187 |
+
|
| 188 |
+
|
| 189 |
+
def _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(matched_rel_path: str, pattern: str) -> bool:
|
| 190 |
+
"""
|
| 191 |
+
When a path matches a pattern, we additionally check if it's a hidden file or if it's inside
|
| 192 |
+
a hidden directory we ignore by default, i.e. if the file name or a parent directory name starts with a dot.
|
| 193 |
+
|
| 194 |
+
Users can still explicitly request a filepath that is hidden or is inside a hidden directory
|
| 195 |
+
if the hidden part is mentioned explicitly in the requested pattern.
|
| 196 |
+
|
| 197 |
+
Some examples:
|
| 198 |
+
|
| 199 |
+
base directory:
|
| 200 |
+
|
| 201 |
+
./
|
| 202 |
+
└── .hidden_file.txt
|
| 203 |
+
|
| 204 |
+
>>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_file.txt", "**")
|
| 205 |
+
True
|
| 206 |
+
>>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_file.txt", ".*")
|
| 207 |
+
False
|
| 208 |
+
|
| 209 |
+
base directory:
|
| 210 |
+
|
| 211 |
+
./
|
| 212 |
+
└── .hidden_dir
|
| 213 |
+
└── a.txt
|
| 214 |
+
|
| 215 |
+
>>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/a.txt", "**")
|
| 216 |
+
True
|
| 217 |
+
>>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/a.txt", ".*/*")
|
| 218 |
+
False
|
| 219 |
+
>>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/a.txt", ".hidden_dir/*")
|
| 220 |
+
False
|
| 221 |
+
|
| 222 |
+
base directory:
|
| 223 |
+
|
| 224 |
+
./
|
| 225 |
+
└── .hidden_dir
|
| 226 |
+
└── .hidden_file.txt
|
| 227 |
+
|
| 228 |
+
>>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/.hidden_file.txt", "**")
|
| 229 |
+
True
|
| 230 |
+
>>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/.hidden_file.txt", ".*/*")
|
| 231 |
+
True
|
| 232 |
+
>>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/.hidden_file.txt", ".*/.*")
|
| 233 |
+
False
|
| 234 |
+
>>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/.hidden_file.txt", ".hidden_dir/*")
|
| 235 |
+
True
|
| 236 |
+
>>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/.hidden_file.txt", ".hidden_dir/.*")
|
| 237 |
+
False
|
| 238 |
+
"""
|
| 239 |
+
# We just need to check if every hidden part from the path is present explicitly in the pattern.
|
| 240 |
+
# Since we assume that the path matches the pattern, it's equivalent to counting that both
|
| 241 |
+
# the path and the pattern have the same number of hidden parts.
|
| 242 |
+
hidden_directories_in_path = [
|
| 243 |
+
part for part in PurePath(matched_rel_path).parts if part.startswith(".") and not set(part) == {"."}
|
| 244 |
+
]
|
| 245 |
+
hidden_directories_in_pattern = [
|
| 246 |
+
part for part in PurePath(pattern).parts if part.startswith(".") and not set(part) == {"."}
|
| 247 |
+
]
|
| 248 |
+
return len(hidden_directories_in_path) != len(hidden_directories_in_pattern)
|
| 249 |
+
|
| 250 |
+
|
| 251 |
+
def _get_data_files_patterns(pattern_resolver: Callable[[str], list[str]]) -> dict[str, list[str]]:
    """
    Find the default data files patterns of a directory or repository.

    The supported patterns are tried in order and the first ones that resolve to a non-empty
    list of data files are returned: first the split-templated patterns in ALL_SPLIT_PATTERNS
    (i.e. SPLIT_PATTERN_SHARDED), then each dictionary of ALL_DEFAULT_PATTERNS.

    Args:
        pattern_resolver: callable returning the data files that match a pattern. It may raise
            FileNotFoundError, which is treated as "no match".

    Returns:
        Dictionary of split name -> list of patterns.

    Raises:
        ValueError: if a split name extracted from a sharded file name doesn't match ``_split_re``.
        FileNotFoundError: if none of the supported patterns matches any data file.
    """
    # First check the split patterns like data/{split}-00000-of-00001.parquet
    for split_pattern in ALL_SPLIT_PATTERNS:
        pattern = split_pattern.replace("{split}", "*")
        try:
            data_files = pattern_resolver(pattern)
        except FileNotFoundError:
            continue
        if len(data_files) == 0:
            continue

        # Recover the split names from the base names of the matched files.
        splits: set[str] = set()
        for data_file in data_files:
            name_parts = string_to_dict(xbasename(data_file), xbasename(split_pattern))
            assert name_parts is not None
            splits.add(name_parts["split"])

        for split in splits:
            if not re.match(_split_re, split):
                raise ValueError(f"Split name should match '{_split_re}'' but got '{splits}'.")

        # Default splits come first in their canonical order, the others follow alphabetically.
        default_split_names = {str(split) for split in DEFAULT_SPLITS}
        known_splits = [str(split) for split in DEFAULT_SPLITS if split in splits]
        other_splits = sorted(splits - default_split_names)
        return {split: [split_pattern.format(split=split)] for split in known_splits + other_splits}

    # Then check the default patterns based on train/valid/test splits
    for patterns_dict in ALL_DEFAULT_PATTERNS:
        non_empty_splits = []
        for split, patterns in patterns_dict.items():
            for pattern in patterns:
                try:
                    data_files = pattern_resolver(pattern)
                except FileNotFoundError:
                    continue
                if len(data_files) == 0:
                    continue
                # One matching pattern is enough to keep the split.
                non_empty_splits.append(split)
                break
        if non_empty_splits:
            return {split: patterns_dict[split] for split in non_empty_splits}

    # `pattern` is the last pattern that was tried above.
    raise FileNotFoundError(f"Couldn't resolve pattern {pattern} with resolver {pattern_resolver}")
|
| 293 |
+
|
| 294 |
+
|
| 295 |
+
def resolve_pattern(
    pattern: str,
    base_path: str,
    allowed_extensions: Optional[list[str]] = None,
    download_config: Optional[DownloadConfig] = None,
) -> list[str]:
    """
    Resolve the paths and URLs of the data files from the pattern passed by the user.

    You can use patterns to resolve multiple local files. Here are a few examples:
    - *.csv to match all the CSV files at the first level
    - **.csv to match all the CSV files at any level
    - data/* to match all the files inside "data"
    - data/** to match all the files inside "data" and its subdirectories

    The patterns are resolved using the fsspec glob. In fsspec>=2023.12.0 this is equivalent to
    Python's glob.glob, Path.glob, Path.match and fnmatch where ** is unsupported with a prefix/suffix
    other than a forward slash /.

    More generally:
    - '*' matches any character except a forward-slash (to match just the file or directory name)
    - '**' matches any character including a forward-slash /

    Hidden files and directories (i.e. whose names start with a dot) are ignored, unless they are
    explicitly requested. The same applies to special directories that start with a double
    underscore like "__pycache__". You can still include one if the pattern explicitly mentions it:
    - to include a hidden file: "*/.hidden.txt" or "*/.*"
    - to include a hidden directory: ".hidden/*" or ".*/*"
    - to include a special directory: "__special__/*" or "__*/*"

    Files whose base name is listed in FILES_TO_IGNORE are skipped as well, unless the base name
    of the pattern is exactly that file name.

    Example::

        >>> from datasets.data_files import resolve_pattern
        >>> base_path = "."
        >>> resolve_pattern("docs/**/*.py", base_path)
        ['/Users/mariosasko/Desktop/projects/datasets/docs/source/_config.py']

    Args:
        pattern (str): Unix pattern or paths or URLs of the data files to resolve.
            The paths can be absolute or relative to base_path.
            Remote filesystems using fsspec are supported, e.g. with the hf:// protocol.
        base_path (str): Base path to use when resolving relative paths.
        allowed_extensions (Optional[list], optional): White-list of file extensions to use. Defaults to None (all extensions).
            For example: allowed_extensions=[".csv", ".json", ".txt", ".parquet"]
        download_config ([`DownloadConfig`], *optional*): Specific download configuration parameters.

    Returns:
        List[str]: List of paths or URLs to the local or remote files that match the patterns.

    Raises:
        FileNotFoundError: if no file matches the pattern (with an allowed extension, if any are given).
    """
    if is_relative_path(pattern):
        pattern = xjoin(base_path, pattern)
    elif is_local_path(pattern):
        base_path = os.path.splitdrive(pattern)[0] + os.sep
    else:
        base_path = ""
    pattern, storage_options = _prepare_path_and_storage_options(pattern, download_config=download_config)
    fs, fs_pattern = url_to_fs(pattern, **storage_options)
    files_to_ignore = set(FILES_TO_IGNORE) - {xbasename(pattern)}

    # fs.protocol is either a single string or a sequence of aliases; use the first alias.
    if isinstance(fs.protocol, str):
        protocol = fs.protocol
    else:
        protocol = fs.protocol[0]
    protocol_prefix = "" if protocol == "file" else protocol + "://"
    # 10 times faster glob with detail=True (ignores costly info like lastCommit)
    glob_kwargs = {"expand_info": False} if protocol == "hf" else {}

    # ignore .ipynb and __pycache__, but keep /../
    matched_paths = []
    for filepath, info in fs.glob(pattern, detail=True, **glob_kwargs).items():
        is_file = info["type"] == "file" or (info.get("islink") and os.path.isfile(os.path.realpath(filepath)))
        if not is_file:
            continue
        if xbasename(filepath) in files_to_ignore:
            continue
        if _is_inside_unrequested_special_dir(filepath, fs_pattern):
            continue
        if _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(filepath, fs_pattern):
            continue
        if filepath.startswith(protocol_prefix):
            matched_paths.append(filepath)
        else:
            matched_paths.append(protocol_prefix + filepath)

    if allowed_extensions is None:
        out = matched_paths
    else:
        # A file is kept if any of its dot-separated suffixes is allowed (e.g. ".csv" in "a.csv.gz").
        out = []
        for filepath in matched_paths:
            suffixes = xbasename(filepath).split(".")[1:]
            if any("." + suffix in allowed_extensions for suffix in suffixes):
                out.append(filepath)
        if len(out) < len(matched_paths):
            invalid_matched_files = list(set(matched_paths) - set(out))
            logger.info(
                f"Some files matched the pattern '{pattern}' but don't have valid data file extensions: {invalid_matched_files}"
            )

    if not out:
        error_msg = f"Unable to find '{pattern}'"
        if allowed_extensions is not None:
            error_msg += f" with any supported extension {list(allowed_extensions)}"
        raise FileNotFoundError(error_msg)
    return out
|
| 385 |
+
|
| 386 |
+
|
| 387 |
+
def get_data_patterns(base_path: str, download_config: Optional[DownloadConfig] = None) -> dict[str, list[str]]:
    """
    Infer the default data files patterns of the directory at ``base_path``.

    All the supported patterns are tested and the first ones that resolve to a non-empty
    list of data files are returned. In order, SPLIT_PATTERN_SHARDED is tested first,
    then the patterns in ALL_DEFAULT_PATTERNS.

    Some examples of supported layouts:

    Input:

        my_dataset_repository/
        ├── README.md
        └── dataset.csv

    Output:

        {'train': ['**']}

    Input:

        my_dataset_repository/
        ├── README.md
        ├── train.csv
        └── test.csv

        my_dataset_repository/
        ├── README.md
        └── data/
            ├── train.csv
            └── test.csv

        my_dataset_repository/
        ├── README.md
        ├── train_0.csv
        ├── train_1.csv
        ├── train_2.csv
        ├── train_3.csv
        ├── test_0.csv
        └── test_1.csv

    Output:

        {'train': ['**/train[-._ 0-9]*', '**/*[-._ 0-9]train[-._ 0-9]*', '**/training[-._ 0-9]*', '**/*[-._ 0-9]training[-._ 0-9]*'],
         'test': ['**/test[-._ 0-9]*', '**/*[-._ 0-9]test[-._ 0-9]*', '**/testing[-._ 0-9]*', '**/*[-._ 0-9]testing[-._ 0-9]*', ...]}

    Input:

        my_dataset_repository/
        ├── README.md
        └── data/
            ├── train/
            │   ├── shard_0.csv
            │   ├── shard_1.csv
            │   ├── shard_2.csv
            │   └── shard_3.csv
            └── test/
                ├── shard_0.csv
                └── shard_1.csv

    Output:

        {'train': ['**/train/**', '**/train[-._ 0-9]*/**', '**/*[-._ 0-9]train/**', '**/*[-._ 0-9]train[-._ 0-9]*/**', ...],
         'test': ['**/test/**', '**/test[-._ 0-9]*/**', '**/*[-._ 0-9]test/**', '**/*[-._ 0-9]test[-._ 0-9]*/**', ...]}

    Input:

        my_dataset_repository/
        ├── README.md
        └── data/
            ├── train-00000-of-00003.csv
            ├── train-00001-of-00003.csv
            ├── train-00002-of-00003.csv
            ├── test-00000-of-00001.csv
            ├── random-00000-of-00003.csv
            ├── random-00001-of-00003.csv
            └── random-00002-of-00003.csv

    Output:

        {'train': ['data/train-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*'],
         'test': ['data/test-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*'],
         'random': ['data/random-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*']}

    Raises:
        EmptyDatasetError: if none of the supported patterns matches any data file.
    """

    def _resolve(pattern: str) -> list[str]:
        # Resolve every candidate pattern relative to the same base path and download config.
        return resolve_pattern(pattern, base_path=base_path, download_config=download_config)

    try:
        data_patterns = _get_data_files_patterns(_resolve)
    except FileNotFoundError:
        raise EmptyDatasetError(f"The directory at {base_path} doesn't contain any data files") from None
    return data_patterns
|
| 476 |
+
|
| 477 |
+
|
| 478 |
+
def _get_single_origin_metadata(
    data_file: str,
    download_config: Optional[DownloadConfig] = None,
) -> SingleOriginMetadata:
    """
    Return the origin metadata of one data file.

    Args:
        data_file (str): Path or URL of the data file.
        download_config ([`DownloadConfig`], *optional*): Download configuration; it is passed to
            `_prepare_path_and_storage_options`, and its `token` is used when the file is
            addressed through a `config.HF_ENDPOINT` URL.

    Returns:
        - ``(repo_id, revision)`` for files on the Hugging Face Hub (``hf://`` paths or
          URLs that start with ``config.HF_ENDPOINT``),
        - a 1-tuple with the stringified "ETag", "etag" or "mtime" of the file otherwise,
        - an empty tuple if the filesystem reports none of those keys.
    """
    data_file, storage_options = _prepare_path_and_storage_options(data_file, download_config=download_config)
    fs, *_ = url_to_fs(data_file, **storage_options)
    if isinstance(fs, HfFileSystem):
        resolved_path = fs.resolve_path(data_file)
        return resolved_path.repo_id, resolved_path.revision
    elif data_file.startswith(config.HF_ENDPOINT):
        # download_config is optional: don't dereference it when the caller didn't pass one.
        token = download_config.token if download_config is not None else None
        hffs = HfFileSystem(endpoint=config.HF_ENDPOINT, token=token)
        # Turn "<endpoint>/<repo>/resolve/<revision>/<path>" into "hf://<repo>@<revision>/<path>".
        data_file = "hf://" + data_file[len(config.HF_ENDPOINT) + 1 :].replace("/resolve/", "@", 1)
        resolved_path = hffs.resolve_path(data_file)
        return resolved_path.repo_id, resolved_path.revision
    info = fs.info(data_file)
    # s3fs uses "ETag", gcsfs uses "etag", and for local we simply check mtime
    for key in ["ETag", "etag", "mtime"]:
        if key in info:
            return (str(info[key]),)
    return ()
|
| 498 |
+
|
| 499 |
+
|
| 500 |
+
def _get_origin_metadata(
    data_files: list[str],
    download_config: Optional[DownloadConfig] = None,
    max_workers: Optional[int] = None,
) -> list[SingleOriginMetadata]:
    """
    Return the origin metadata of every data file, in the same order as ``data_files``.

    Args:
        data_files (list[str]): Paths or URLs of the data files.
        download_config ([`DownloadConfig`], *optional*): Forwarded to `_get_single_origin_metadata`.
        max_workers (int, *optional*): Number of threads for the multithreaded path.
            Defaults to ``config.HF_DATASETS_MULTITHREADING_MAX_WORKERS``.
    """
    if max_workers is None:
        max_workers = config.HF_DATASETS_MULTITHREADING_MAX_WORKERS
    # set `disable=None` rather than `disable=False` by default to disable progress bar when no TTY attached
    hide_progress_bar = len(data_files) <= 16 or None
    only_hf_files = all("hf://" in data_file for data_file in data_files)

    if not only_hf_files:
        fetch_one = partial(_get_single_origin_metadata, download_config=download_config)
        return thread_map(
            fetch_one,
            data_files,
            max_workers=max_workers,
            tqdm_class=hf_tqdm,
            desc="Resolving data files",
            disable=hide_progress_bar,
        )

    # No need for multithreading here since the origin metadata of HF files
    # is (repo_id, revision) and is cached after first .info() call.
    progress = hf_tqdm(data_files, desc="Resolving data files", disable=hide_progress_bar)
    return [_get_single_origin_metadata(data_file, download_config=download_config) for data_file in progress]
|
| 527 |
+
|
| 528 |
+
|
| 529 |
+
class DataFilesList(list[str]):
    """
    List of data files (absolute local paths or URLs).
    It has two construction methods given the user's data files patterns:
    - ``from_hf_repo``: resolve patterns inside a dataset repository
    - ``from_local_or_remote``: resolve patterns from a local path

    Moreover, DataFilesList has an additional attribute ``origin_metadata``.
    It can store:
    - the last modified time of local files
    - ETag of remote files
    - commit sha of a dataset repository

    Thanks to this additional attribute, it is possible to hash the list
    and get a different hash if and only if at least one file changed.
    This is useful for caching Dataset objects that are obtained from a list of data files.
    """

    def __init__(self, data_files: list[str], origin_metadata: list[SingleOriginMetadata]) -> None:
        """Store the data files in the list itself and keep ``origin_metadata`` as an attribute."""
        super().__init__(data_files)
        self.origin_metadata = origin_metadata

    def __add__(self, other: "DataFilesList") -> "DataFilesList":
        """Concatenate two lists of data files together with their origin metadata."""
        return DataFilesList([*self, *other], self.origin_metadata + other.origin_metadata)

    @classmethod
    def from_hf_repo(
        cls,
        patterns: list[str],
        dataset_info: huggingface_hub.hf_api.DatasetInfo,
        base_path: Optional[str] = None,
        allowed_extensions: Optional[list[str]] = None,
        download_config: Optional[DownloadConfig] = None,
    ) -> "DataFilesList":
        """Resolve ``patterns`` inside the dataset repository described by ``dataset_info``.

        The base path is ``hf://datasets/<id>@<sha>/<base_path>`` with trailing slashes removed.
        """
        base_path = f"hf://datasets/{dataset_info.id}@{dataset_info.sha}/{base_path or ''}".rstrip("/")
        return cls.from_patterns(
            patterns, base_path=base_path, allowed_extensions=allowed_extensions, download_config=download_config
        )

    @classmethod
    def from_local_or_remote(
        cls,
        patterns: list[str],
        base_path: Optional[str] = None,
        allowed_extensions: Optional[list[str]] = None,
        download_config: Optional[DownloadConfig] = None,
    ) -> "DataFilesList":
        """Resolve ``patterns`` relative to ``base_path`` (the current working directory by default)."""
        base_path = base_path if base_path is not None else Path().resolve().as_posix()
        return cls.from_patterns(
            patterns, base_path=base_path, allowed_extensions=allowed_extensions, download_config=download_config
        )

    @classmethod
    def from_patterns(
        cls,
        patterns: list[str],
        base_path: Optional[str] = None,
        allowed_extensions: Optional[list[str]] = None,
        download_config: Optional[DownloadConfig] = None,
    ) -> "DataFilesList":
        """Resolve every pattern with `resolve_pattern` and fetch the origin metadata of the matches.

        A pattern with glob magic that matches nothing is skipped; a plain path that
        cannot be found re-raises the FileNotFoundError.
        """
        base_path = base_path if base_path is not None else Path().resolve().as_posix()
        data_files = []
        for pattern in patterns:
            try:
                data_files.extend(
                    resolve_pattern(
                        pattern,
                        base_path=base_path,
                        allowed_extensions=allowed_extensions,
                        download_config=download_config,
                    )
                )
            except FileNotFoundError:
                if not has_magic(pattern):
                    raise
        origin_metadata = _get_origin_metadata(data_files, download_config=download_config)
        return cls(data_files, origin_metadata)

    def filter(
        self, *, extensions: Optional[list[str]] = None, file_names: Optional[list[str]] = None
    ) -> "DataFilesList":
        """Return a new list with the data files that match one of the extensions or file names.

        Args:
            extensions: Extensions to keep (e.g. ``[".csv"]``). A file also matches when the
                extension is followed by further suffixes (e.g. ``a.csv.gz``).
            file_names: File names to keep; they are matched against the end of each path.

        Returns:
            A new DataFilesList. Without any filter, all the files are kept. When the origin
            metadata has one entry per file, it is filtered together with the files so that
            both stay aligned; otherwise it is passed through unchanged.
        """
        patterns = []
        if extensions:
            ext_pattern = "|".join(re.escape(ext) for ext in extensions)
            patterns.append(re.compile(f".*({ext_pattern})(\\..+)?$"))
        if file_names:
            fn_pattern = "|".join(re.escape(fn) for fn in file_names)
            patterns.append(re.compile(rf".*[\/]?({fn_pattern})$"))
        if not patterns:
            return DataFilesList(list(self), origin_metadata=self.origin_metadata)

        keep_flags = [any(pattern.match(data_file) for pattern in patterns) for data_file in self]
        kept_files = [data_file for data_file, keep in zip(self, keep_flags) if keep]
        origin_metadata = self.origin_metadata
        # Keep the metadata in sync with the files; only possible when it is index-aligned.
        if origin_metadata is not None and len(origin_metadata) == len(self):
            origin_metadata = [metadata for metadata, keep in zip(origin_metadata, keep_flags) if keep]
        return DataFilesList(kept_files, origin_metadata=origin_metadata)
|
| 624 |
+
|
| 625 |
+
|
| 626 |
+
class DataFilesDict(dict[str, DataFilesList]):
|
| 627 |
+
"""
|
| 628 |
+
Dict of split_name -> list of data files (absolute local paths or URLs).
|
| 629 |
+
It has two construction methods given the user's data files patterns :
|
| 630 |
+
- ``from_hf_repo``: resolve patterns inside a dataset repository
|
| 631 |
+
- ``from_local_or_remote``: resolve patterns from a local path
|
| 632 |
+
|
| 633 |
+
Moreover, each list is a DataFilesList. It is possible to hash the dictionary
|
| 634 |
+
and get a different hash if and only if at least one file changed.
|
| 635 |
+
For more info, see [`DataFilesList`].
|
| 636 |
+
|
| 637 |
+
This is useful for caching Dataset objects that are obtained from a list of data files.
|
| 638 |
+
|
| 639 |
+
Changing the order of the keys of this dictionary also doesn't change its hash.
|
| 640 |
+
"""
|
| 641 |
+
|
| 642 |
+
@classmethod
|
| 643 |
+
def from_local_or_remote(
|
| 644 |
+
cls,
|
| 645 |
+
patterns: dict[str, Union[list[str], DataFilesList]],
|
| 646 |
+
base_path: Optional[str] = None,
|
| 647 |
+
allowed_extensions: Optional[list[str]] = None,
|
| 648 |
+
download_config: Optional[DownloadConfig] = None,
|
| 649 |
+
) -> "DataFilesDict":
|
| 650 |
+
out = cls()
|
| 651 |
+
for key, patterns_for_key in patterns.items():
|
| 652 |
+
out[key] = (
|
| 653 |
+
patterns_for_key
|
| 654 |
+
if isinstance(patterns_for_key, DataFilesList)
|
| 655 |
+
else DataFilesList.from_local_or_remote(
|
| 656 |
+
patterns_for_key,
|
| 657 |
+
base_path=base_path,
|
| 658 |
+
allowed_extensions=allowed_extensions,
|
| 659 |
+
download_config=download_config,
|
| 660 |
+
)
|
| 661 |
+
)
|
| 662 |
+
return out
|
| 663 |
+
|
| 664 |
+
@classmethod
|
| 665 |
+
def from_hf_repo(
|
| 666 |
+
cls,
|
| 667 |
+
patterns: dict[str, Union[list[str], DataFilesList]],
|
| 668 |
+
dataset_info: huggingface_hub.hf_api.DatasetInfo,
|
| 669 |
+
base_path: Optional[str] = None,
|
| 670 |
+
allowed_extensions: Optional[list[str]] = None,
|
| 671 |
+
download_config: Optional[DownloadConfig] = None,
|
| 672 |
+
) -> "DataFilesDict":
|
| 673 |
+
out = cls()
|
| 674 |
+
for key, patterns_for_key in patterns.items():
|
| 675 |
+
out[key] = (
|
| 676 |
+
patterns_for_key
|
| 677 |
+
if isinstance(patterns_for_key, DataFilesList)
|
| 678 |
+
else DataFilesList.from_hf_repo(
|
| 679 |
+
patterns_for_key,
|
| 680 |
+
dataset_info=dataset_info,
|
| 681 |
+
base_path=base_path,
|
| 682 |
+
allowed_extensions=allowed_extensions,
|
| 683 |
+
download_config=download_config,
|
| 684 |
+
)
|
| 685 |
+
)
|
| 686 |
+
return out
|
| 687 |
+
|
| 688 |
+
@classmethod
|
| 689 |
+
def from_patterns(
|
| 690 |
+
cls,
|
| 691 |
+
patterns: dict[str, Union[list[str], DataFilesList]],
|
| 692 |
+
base_path: Optional[str] = None,
|
| 693 |
+
allowed_extensions: Optional[list[str]] = None,
|
| 694 |
+
download_config: Optional[DownloadConfig] = None,
|
| 695 |
+
) -> "DataFilesDict":
|
| 696 |
+
out = cls()
|
| 697 |
+
for key, patterns_for_key in patterns.items():
|
| 698 |
+
out[key] = (
|
| 699 |
+
patterns_for_key
|
| 700 |
+
if isinstance(patterns_for_key, DataFilesList)
|
| 701 |
+
else DataFilesList.from_patterns(
|
| 702 |
+
patterns_for_key,
|
| 703 |
+
base_path=base_path,
|
| 704 |
+
allowed_extensions=allowed_extensions,
|
| 705 |
+
download_config=download_config,
|
| 706 |
+
)
|
| 707 |
+
)
|
| 708 |
+
return out
|
| 709 |
+
|
| 710 |
+
def filter(
    self, *, extensions: Optional[list[str]] = None, file_names: Optional[list[str]] = None
) -> "DataFilesDict":
    """Return a new mapping of the same type whose values are the filtered values of this one.

    Each value's own `filter(extensions=..., file_names=...)` method is called with
    the keyword arguments given here; this mapping is left untouched.
    """
    filtered = type(self)()
    for name, files in self.items():
        filtered[name] = files.filter(extensions=extensions, file_names=file_names)
    return filtered
|
| 717 |
+
|
| 718 |
+
|
| 719 |
+
class DataFilesPatternsList(list[str]):
    """
    List of data files patterns (absolute local paths or URLs).
    For each pattern there should also be a list of allowed extensions
    to keep, or a None to keep all the files for the pattern.
    """

    def __init__(
        self,
        patterns: list[str],
        allowed_extensions: list[Optional[list[str]]],
    ):
        """Store `patterns` as the list content and `allowed_extensions` as a parallel list.

        `allowed_extensions[i]` applies to `patterns[i]`. The two lists are paired
        with `zip` in `resolve`, so extra entries on either side are ignored there.
        """
        super().__init__(patterns)
        self.allowed_extensions = allowed_extensions

    def __add__(self, other):
        """Concatenate two pattern lists, keeping patterns and allowed extensions aligned.

        `other` must expose an `allowed_extensions` attribute.
        """
        # The result is a list of *patterns*, so it has to be a DataFilesPatternsList:
        # DataFilesList expects origin metadata, not allowed extensions, as second argument.
        return DataFilesPatternsList([*self, *other], self.allowed_extensions + other.allowed_extensions)

    @classmethod
    def from_patterns(
        cls, patterns: list[str], allowed_extensions: Optional[list[str]] = None
    ) -> "DataFilesPatternsList":
        """Create a pattern list in which every pattern shares the same `allowed_extensions`."""
        return cls(patterns, [allowed_extensions] * len(patterns))

    def resolve(
        self,
        base_path: Optional[str],
        download_config: Optional["DownloadConfig"] = None,
    ) -> "DataFilesList":
        """Resolve every pattern into data files and return them as a `DataFilesList`.

        Args:
            base_path: Base path passed to `resolve_pattern`. When None, the resolved
                current working directory is used.
            download_config: Forwarded to `resolve_pattern` and `_get_origin_metadata`.

        Raises:
            FileNotFoundError: If a pattern without glob magic matches nothing.
        """
        base_path = base_path if base_path is not None else Path().resolve().as_posix()
        data_files = []
        for pattern, allowed_extensions in zip(self, self.allowed_extensions):
            try:
                data_files.extend(
                    resolve_pattern(
                        pattern,
                        base_path=base_path,
                        allowed_extensions=allowed_extensions,
                        download_config=download_config,
                    )
                )
            except FileNotFoundError:
                # A glob pattern that matches nothing is tolerated; an explicit path must exist.
                if not has_magic(pattern):
                    raise
        origin_metadata = _get_origin_metadata(data_files, download_config=download_config)
        return DataFilesList(data_files, origin_metadata)

    def filter_extensions(self, extensions: list[str]) -> "DataFilesPatternsList":
        """Return a new pattern list whose allowed extensions are extended with `extensions`.

        An entry that is None (no restriction yet) becomes a copy of `extensions`
        instead of raising a TypeError on `None + list`.
        """
        return DataFilesPatternsList(
            self,
            [
                list(extensions) if allowed_extensions is None else allowed_extensions + extensions
                for allowed_extensions in self.allowed_extensions
            ],
        )
|
| 770 |
+
|
| 771 |
+
|
| 772 |
+
class DataFilesPatternsDict(dict[str, DataFilesPatternsList]):
    """
    Dict of split_name -> list of data files patterns (absolute local paths or URLs).
    """

    @classmethod
    def from_patterns(
        cls, patterns: dict[str, list[str]], allowed_extensions: Optional[list[str]] = None
    ) -> "DataFilesPatternsDict":
        """Build the dict from raw patterns, wrapping plain lists in `DataFilesPatternsList`.

        Values that are already `DataFilesPatternsList` objects are stored unchanged.
        """
        result = cls()
        for split_name, split_patterns in patterns.items():
            if isinstance(split_patterns, DataFilesPatternsList):
                result[split_name] = split_patterns
            else:
                result[split_name] = DataFilesPatternsList.from_patterns(
                    split_patterns,
                    allowed_extensions=allowed_extensions,
                )
        return result

    def resolve(
        self,
        base_path: str,
        download_config: Optional[DownloadConfig] = None,
    ) -> "DataFilesDict":
        """Resolve every pattern list and collect the results in a `DataFilesDict`."""
        resolved = DataFilesDict()
        for split_name, patterns_list in self.items():
            resolved[split_name] = patterns_list.resolve(base_path, download_config)
        return resolved

    def filter_extensions(self, extensions: list[str]) -> "DataFilesPatternsDict":
        """Return a new dict of the same type with `filter_extensions` applied to each value."""
        filtered = type(self)()
        for split_name, patterns_list in self.items():
            filtered[split_name] = patterns_list.filter_extensions(extensions)
        return filtered
|
datasets/dataset_dict.py
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
datasets/distributed.py
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import TypeVar
|
| 2 |
+
|
| 3 |
+
from .arrow_dataset import Dataset, _split_by_node_map_style_dataset
|
| 4 |
+
from .iterable_dataset import IterableDataset, _split_by_node_iterable_dataset
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
# Constrained type variable: a function annotated with it returns the same kind of
# dataset (`Dataset` or `IterableDataset`) that it receives.
DatasetType = TypeVar("DatasetType", Dataset, IterableDataset)
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def split_dataset_by_node(dataset: DatasetType, rank: int, world_size: int) -> DatasetType:
    """
    Split a dataset for the node at rank `rank` in a pool of nodes of size `world_size`.

    For map-style datasets:

    Each node is assigned a chunk of data, e.g. rank 0 is given the first chunk of the dataset.
    To maximize data loading throughput, chunks are made of contiguous data on disk if possible.

    For iterable datasets:

    If the dataset has a number of shards that is a multiple of `world_size` (i.e. if `dataset.num_shards % world_size == 0`),
    then the shards are evenly assigned across the nodes, which is the most optimized.
    Otherwise, each node keeps 1 example out of `world_size`, skipping the other examples.

    Args:
        dataset ([`Dataset`] or [`IterableDataset`]):
            The dataset to split by node.
        rank (`int`):
            Rank of the current node.
        world_size (`int`):
            Total number of nodes.

    Returns:
        [`Dataset`] or [`IterableDataset`]: The dataset to be used on the node at rank `rank`.
    """
    # Anything that is not a map-style `Dataset` is handled by the iterable implementation.
    if not isinstance(dataset, Dataset):
        return _split_by_node_iterable_dataset(dataset, rank=rank, world_size=world_size)
    return _split_by_node_map_style_dataset(dataset, rank=rank, world_size=world_size)
|
datasets/exceptions.py
ADDED
|
@@ -0,0 +1,119 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# SPDX-License-Identifier: Apache-2.0
|
| 2 |
+
# Copyright 2023 The HuggingFace Authors.
|
| 3 |
+
from typing import Any, Optional, Union
|
| 4 |
+
|
| 5 |
+
from huggingface_hub import HfFileSystem
|
| 6 |
+
|
| 7 |
+
from . import config
|
| 8 |
+
from .table import CastError
|
| 9 |
+
from .utils.track import TrackedIterableFromGenerator, tracked_list, tracked_str
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class DatasetsError(Exception):
    """Root of the exception hierarchy defined by this library."""
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class DefunctDatasetError(DatasetsError):
    """The dataset is defunct."""
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
class FileNotFoundDatasetsError(DatasetsError, FileNotFoundError):
    """A `FileNotFoundError` raised by this library (also a `DatasetsError`)."""
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
class DataFilesNotFoundError(FileNotFoundDatasetsError):
    """No data files, or no supported data files, were found."""
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
class DatasetNotFoundError(FileNotFoundDatasetsError):
    """The requested dataset could not be found.

    Raised when trying to access either:
    - a dataset that does not exist, or
    - a private/gated dataset while the user is not authenticated.
    """
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
class DatasetBuildError(DatasetsError):
    """Common base class of the dataset build errors defined in this module."""
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
class ManualDownloadError(DatasetBuildError):
    """Dataset build error related to manual downloads (meaning inferred from the name; confirm at raise sites)."""
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
class FileFormatError(DatasetBuildError):
    """Dataset build error related to a file format (meaning inferred from the name; confirm at raise sites)."""
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
class DatasetGenerationError(DatasetBuildError):
    """Dataset build error raised for failures during dataset generation; base of `DatasetGenerationCastError`."""
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
class DatasetGenerationCastError(DatasetGenerationError):
    """Dataset generation error built from a `CastError`, with a message naming the data involved."""

    @classmethod
    def from_cast_error(
        cls,
        cast_error: CastError,
        builder_name: str,
        gen_kwargs: dict[str, Any],
        token: Optional[Union[bool, str]],
    ) -> "DatasetGenerationCastError":
        """Create the error from `cast_error`, describing which tracked inputs were being processed.

        Args:
            cast_error: The cast error; its `details()` text is embedded in the message.
            builder_name: Name of the dataset builder, used in the message.
            gen_kwargs: Generation kwargs; only tracked values (`tracked_str`, `tracked_list`,
                `TrackedIterableFromGenerator`) contribute to the message.
            token: Token passed to `HfFileSystem` when resolving `hf://` paths.
        """
        explanation = f"\n\nAll the data files must have the same columns, but at some point {cast_error.details()}"
        tracked_containers = (tracked_list, TrackedIterableFromGenerator)
        origins: list[str] = []
        for value in gen_kwargs.values():
            if not isinstance(value, (tracked_str, *tracked_containers)):
                continue
            # Walk down nested tracked containers to the last item that was yielded.
            while isinstance(value, tracked_containers) and value.last_item is not None:
                value = value.last_item
            if isinstance(value, tracked_str):
                value = value.get_origin()
            if isinstance(value, str) and value.startswith("hf://"):
                resolved = HfFileSystem(endpoint=config.HF_ENDPOINT, token=token).resolve_path(value)
                value = "hf://" + resolved.unresolve()
                revision_marker = "@" + resolved.revision
                if revision_marker in value:
                    # Move the revision out of the path and into a readable suffix.
                    value = value.replace(revision_marker, "", 1) + f" (at revision {resolved.revision})"
            origins.append(str(value))
        if origins:
            explanation += f"\n\nThis happened while the {builder_name} dataset builder was generating data using\n\n{', '.join(origins)}"
        advice = "\n\nPlease either edit the data files to have matching columns, or separate them into different configurations (see docs at https://hf.co/docs/hub/datasets-manual-configuration#multiple-configurations)"
        return cls("An error occurred while generating the dataset" + explanation + advice)
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
class ChecksumVerificationError(DatasetsError):
    """Base error for failures while verifying the checksums of downloaded files."""
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
class UnexpectedDownloadedFileError(ChecksumVerificationError):
    """Some of the downloaded files were not expected."""
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
class ExpectedMoreDownloadedFilesError(ChecksumVerificationError):
    """Some files that were supposed to be downloaded were not."""
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
class NonMatchingChecksumError(ChecksumVerificationError):
    """The checksum of a downloaded file doesn't match the expected checksum."""
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
class SplitsVerificationError(DatasetsError):
    """Base error for failures while verifying splits."""
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
class UnexpectedSplitsError(SplitsVerificationError):
    """The expected splits of the downloaded file are missing."""
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
class ExpectedMoreSplitsError(SplitsVerificationError):
    """Some of the recorded splits are missing."""
|
| 116 |
+
|
| 117 |
+
|
| 118 |
+
class NonMatchingSplitsSizesError(SplitsVerificationError):
    """The sizes of the splits don't match the expected sizes."""
|
datasets/fingerprint.py
ADDED
|
@@ -0,0 +1,454 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import inspect
|
| 2 |
+
import os
|
| 3 |
+
import random
|
| 4 |
+
import shutil
|
| 5 |
+
import tempfile
|
| 6 |
+
import weakref
|
| 7 |
+
from functools import wraps
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
from typing import TYPE_CHECKING, Any, Callable, Optional, Union
|
| 10 |
+
|
| 11 |
+
import numpy as np
|
| 12 |
+
import xxhash
|
| 13 |
+
|
| 14 |
+
from . import config
|
| 15 |
+
from .naming import INVALID_WINDOWS_CHARACTERS_IN_PATH
|
| 16 |
+
from .utils._dill import dumps
|
| 17 |
+
from .utils.logging import get_logger
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
if TYPE_CHECKING:
|
| 21 |
+
from .arrow_dataset import Dataset
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
# Module-level logger, named after this module.
logger = get_logger(__name__)
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
# Fingerprinting provides one deterministic fingerprint per dataset state.
|
| 28 |
+
# A dataset fingerprint is updated after each transform.
|
| 29 |
+
# Re-running the same transforms on a dataset in a different session results in the same fingerprint.
|
| 30 |
+
# This is possible thanks to a custom hashing function that works with most python objects.
|
| 31 |
+
|
| 32 |
+
# Fingerprinting is the main mechanism that enables caching.
|
| 33 |
+
# The caching mechanism makes it possible to reload an existing cache file if it has already been computed.
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
#################
|
| 37 |
+
# Caching
|
| 38 |
+
#################
|
| 39 |
+
|
| 40 |
+
# Global caching switch, toggled by `enable_caching()` / `disable_caching()`; enabled by default.
_CACHING_ENABLED = True
|
| 41 |
+
# Created lazily by `get_temporary_cache_files_directory()`; None until first requested.
_TEMP_DIR_FOR_TEMP_CACHE_FILES: Optional["_TempCacheDir"] = None
|
| 42 |
+
# Weak set of datasets that have a cache file inside the temporary cache directory;
# created on the first call to `maybe_register_dataset_for_temp_dir_deletion()` made once that directory exists.
_DATASETS_WITH_TABLE_IN_TEMP_DIR: Optional[weakref.WeakSet] = None
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
class _TempCacheDir:
    """
    Temporary directory holding cached Arrow files.

    Its cleanup first releases the datasets that reference Arrow files inside the
    directory and only then deletes the directory, to avoid permission errors on Windows.
    """

    def __init__(self):
        self.name = tempfile.mkdtemp(prefix=config.TEMP_CACHE_DIR_PREFIX)
        # NOTE(review): the finalizer callback is a bound method of `self`; the weakref
        # documentation advises against that because it keeps the object alive.
        self._finalizer = weakref.finalize(self, self._cleanup)

    def _cleanup(self):
        """Release registered datasets, then remove the directory if it still exists."""
        for dataset in get_datasets_with_cache_file_in_temp_dir():
            dataset.__del__()
        if not os.path.exists(self.name):
            return
        try:
            shutil.rmtree(self.name)
        except Exception as err:
            raise OSError(
                f"An error occurred while trying to delete temporary cache directory {self.name}. Please delete it manually."
            ) from err

    def cleanup(self):
        """Run the cleanup now, unless the finalizer has already run or been detached."""
        still_pending = self._finalizer.detach()
        if still_pending:
            self._cleanup()
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
def maybe_register_dataset_for_temp_dir_deletion(dataset):
    """
    Register `dataset` if one of its cache files lives in _TEMP_DIR_FOR_TEMP_CACHE_FILES, so that
    it can be properly deleted before the temporary directory itself is deleted.
    The temporary directory _TEMP_DIR_FOR_TEMP_CACHE_FILES is used when caching is disabled.
    """
    global _DATASETS_WITH_TABLE_IN_TEMP_DIR

    temp_dir = _TEMP_DIR_FOR_TEMP_CACHE_FILES
    if temp_dir is None:
        # No temporary cache directory has been created: nothing to register.
        return

    if _DATASETS_WITH_TABLE_IN_TEMP_DIR is None:
        _DATASETS_WITH_TABLE_IN_TEMP_DIR = weakref.WeakSet()

    temp_root = Path(temp_dir.name)
    for cache_file in dataset.cache_files:
        if temp_root in Path(cache_file["filename"]).parents:
            _DATASETS_WITH_TABLE_IN_TEMP_DIR.add(dataset)
            break
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
def get_datasets_with_cache_file_in_temp_dir():
    """Return the registered datasets as a list (empty if none has ever been registered)."""
    if _DATASETS_WITH_TABLE_IN_TEMP_DIR is None:
        return []
    return list(_DATASETS_WITH_TABLE_IN_TEMP_DIR)
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
def enable_caching():
    """
    Turn caching on.

    When applying transforms on a dataset, the data are stored in cache files.
    With caching enabled, an existing cache file is reloaded if it has already been computed.

    Reloading is possible because cache files are named after the dataset fingerprint, which is
    updated after each transform.

    When caching is disabled, the library no longer reloads cached dataset files when applying
    transforms to the datasets. More precisely, if the caching is disabled:
    - cache files are always recreated
    - cache files are written to a temporary directory that is deleted when session closes
    - cache files are named using a random hash instead of the dataset fingerprint
    - use [`~datasets.Dataset.save_to_disk`] to save a transformed dataset or it will be deleted when session closes
    - caching doesn't affect [`~datasets.load_dataset`]. If you want to regenerate a dataset from scratch you should use
    the `download_mode` parameter in [`~datasets.load_dataset`].
    """
    global _CACHING_ENABLED
    _CACHING_ENABLED = True
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
def disable_caching():
    """
    Turn caching off.

    When applying transforms on a dataset, the data are stored in cache files.
    With caching enabled, an existing cache file is reloaded if it has already been computed.

    Reloading is possible because cache files are named after the dataset fingerprint, which is
    updated after each transform.

    Once caching is disabled, the library no longer reloads cached dataset files when applying
    transforms to the datasets. More precisely, if the caching is disabled:
    - cache files are always recreated
    - cache files are written to a temporary directory that is deleted when session closes
    - cache files are named using a random hash instead of the dataset fingerprint
    - use [`~datasets.Dataset.save_to_disk`] to save a transformed dataset or it will be deleted when session closes
    - caching doesn't affect [`~datasets.load_dataset`]. If you want to regenerate a dataset from scratch you should use
    the `download_mode` parameter in [`~datasets.load_dataset`].
    """
    global _CACHING_ENABLED
    _CACHING_ENABLED = False
|
| 134 |
+
|
| 135 |
+
|
| 136 |
+
def is_caching_enabled() -> bool:
    """
    Tell whether caching is currently enabled.

    When applying transforms on a dataset, the data are stored in cache files.
    With caching enabled, an existing cache file is reloaded if it has already been computed.

    Reloading is possible because cache files are named after the dataset fingerprint, which is
    updated after each transform.

    When caching is disabled, the library no longer reloads cached dataset files when applying
    transforms to the datasets. More precisely, if the caching is disabled:
    - cache files are always recreated
    - cache files are written to a temporary directory that is deleted when session closes
    - cache files are named using a random hash instead of the dataset fingerprint
    - use [`~datasets.Dataset.save_to_disk`] to save a transformed dataset or it will be deleted when session closes
    - caching doesn't affect [`~datasets.load_dataset`]. If you want to regenerate a dataset from scratch you should use
    the `download_mode` parameter in [`~datasets.load_dataset`].

    Returns:
        `bool`: the current value of the module-level caching switch.
    """
    return bool(_CACHING_ENABLED)
|
| 155 |
+
|
| 156 |
+
|
| 157 |
+
def get_temporary_cache_files_directory() -> str:
    """Return the path of a directory that is deleted when session closes (created on first call)."""
    global _TEMP_DIR_FOR_TEMP_CACHE_FILES
    temp_dir = _TEMP_DIR_FOR_TEMP_CACHE_FILES
    if temp_dir is None:
        temp_dir = _TEMP_DIR_FOR_TEMP_CACHE_FILES = _TempCacheDir()
    return temp_dir.name
|
| 163 |
+
|
| 164 |
+
|
| 165 |
+
#################
|
| 166 |
+
# Hashing
|
| 167 |
+
#################
|
| 168 |
+
|
| 169 |
+
|
| 170 |
+
class Hasher:
    """Incremental hasher that accepts arbitrary python objects as inputs."""

    dispatch: dict = {}

    def __init__(self):
        self.m = xxhash.xxh64()

    @classmethod
    def hash_bytes(cls, value: Union[bytes, list[bytes]]) -> str:
        """Return the xxh64 hex digest of one `bytes` object or of a sequence of them fed in order."""
        if isinstance(value, bytes):
            value = [value]
        digest = xxhash.xxh64()
        for chunk in value:
            digest.update(chunk)
        return digest.hexdigest()

    @classmethod
    def hash(cls, value: Any) -> str:
        """Serialize `value` with `dumps` and return the hex digest of the result."""
        serialized = dumps(value)
        return cls.hash_bytes(serialized)

    def update(self, value: Any) -> None:
        """Feed `value` into the running hash, prefixed with a header naming its type."""
        type_header = f"=={type(value)}=="
        # Hash the value before touching the running state, so a failure leaves it unchanged.
        value_digest = self.hash(value)
        self.m.update(type_header.encode("utf8"))
        self.m.update(value_digest.encode("utf-8"))

    def hexdigest(self) -> str:
        """Return the hex digest of everything fed so far."""
        return self.m.hexdigest()
|
| 198 |
+
|
| 199 |
+
|
| 200 |
+
#################
|
| 201 |
+
# Fingerprinting
|
| 202 |
+
#################
|
| 203 |
+
|
| 204 |
+
# Dedicated `random.Random` instance used by `generate_random_fingerprint()`.
fingerprint_rng = random.Random()
|
| 205 |
+
# we show a warning only once when fingerprinting fails to avoid spam
|
| 206 |
+
# Maps a warning identifier to True once that warning has been logged.
fingerprint_warnings: dict[str, bool] = {}
|
| 207 |
+
|
| 208 |
+
|
| 209 |
+
def generate_fingerprint(dataset: "Dataset") -> str:
    """Compute a fingerprint from the dataset's attributes and its cache files' modification times.

    Every entry of `dataset.__dict__` except `_fingerprint` is hashed (name, then value)
    in sorted name order, followed by the mtime of each cache file.
    """
    attributes = dataset.__dict__
    hasher = Hasher()
    for name in sorted(attributes):
        if name != "_fingerprint":
            hasher.update(name)
            hasher.update(attributes[name])
    # hash data files last modification timestamps as well
    for cache_file in dataset.cache_files:
        modified_at = os.path.getmtime(cache_file["filename"])
        hasher.update(modified_at)
    return hasher.hexdigest()
|
| 221 |
+
|
| 222 |
+
|
| 223 |
+
def generate_random_fingerprint(nbits: int = 64) -> str:
    """Return `nbits` random bits from `fingerprint_rng` as lowercase hex, zero-padded to `nbits // 4` digits."""
    hex_width = nbits // 4
    random_bits = fingerprint_rng.getrandbits(nbits)
    return format(random_bits, f"0{hex_width}x")
|
| 225 |
+
|
| 226 |
+
|
| 227 |
+
def _log_unhashable_for_fingerprint(subject: str) -> None:
    """Log that `subject` couldn't be hashed; warn only once while caching is enabled."""
    message = f"{subject} couldn't be hashed properly, a random hash was used instead."
    if not _CACHING_ENABLED:
        logger.info(f"{message} This doesn't affect caching since it's disabled.")
        return
    warning_key = "update_fingerprint_transform_hash_failed"
    if fingerprint_warnings.get(warning_key, False):
        # The full warning was already shown once: stay quiet at INFO level.
        logger.info(message)
        return
    logger.warning(
        f"{message} "
        "Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. "
        "If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. "
        "This warning is only shown once. Subsequent hashing failures won't be shown."
    )
    fingerprint_warnings[warning_key] = True


def update_fingerprint(fingerprint, transform, transform_args):
    """Derive a new fingerprint from the previous one, the transform and its arguments.

    Args:
        fingerprint: The fingerprint before the transform.
        transform: The transform (or its formatted name) to hash.
        transform_args: Mapping of argument name -> value; hashed in sorted key order.

    Returns:
        The hex digest of the combined hash. If the transform or one of its arguments
        cannot be hashed, the failure is logged and a random fingerprint is returned.
    """
    hasher = Hasher()
    hasher.update(fingerprint)
    try:
        hasher.update(transform)
    except Exception:
        # Various errors might be raised here by pickle or dill. `Exception` (rather than a
        # bare `except`) lets KeyboardInterrupt and SystemExit propagate.
        _log_unhashable_for_fingerprint(f"Transform {transform}")
        return generate_random_fingerprint()
    for key in sorted(transform_args):
        hasher.update(key)
        try:
            hasher.update(transform_args[key])
        except Exception:
            _log_unhashable_for_fingerprint(
                f"Parameter '{key}'={transform_args[key]} of the transform {transform}"
            )
            return generate_random_fingerprint()
    return hasher.hexdigest()
|
| 275 |
+
|
| 276 |
+
|
| 277 |
+
def validate_fingerprint(fingerprint: str, max_length=64):
    """
    Make sure the fingerprint is a non-empty string that is not longer than max_length=64 by default,
    so that the fingerprint can be used to name cache files without issues.

    Raises:
        ValueError: If `fingerprint` is not a non-empty string, contains one of
            `INVALID_WINDOWS_CHARACTERS_IN_PATH`, or is longer than `max_length`.
    """
    if not isinstance(fingerprint, str) or not fingerprint:
        raise ValueError(f"Invalid fingerprint '{fingerprint}': it should be a non-empty string.")
    for invalid_char in INVALID_WINDOWS_CHARACTERS_IN_PATH:
        if invalid_char in fingerprint:
            raise ValueError(
                f"Invalid fingerprint. Bad characters from black list '{INVALID_WINDOWS_CHARACTERS_IN_PATH}' found in '{fingerprint}'. "
                "They could create issues when creating cache files."
            )
    if len(fingerprint) > max_length:
        raise ValueError(
            f"Invalid fingerprint. Maximum length is {max_length} but '{fingerprint}' has length {len(fingerprint)}. "
            "It could create issues when creating cache files."
        )
|
| 295 |
+
|
| 296 |
+
|
| 297 |
+
def format_transform_for_fingerprint(func: Callable, version: Optional[str] = None) -> str:
    """
    Return the name under which a transform is hashed into the fingerprint:
    `<module>.<qualname>`, followed by `@<version>` when a version is given.
    """
    qualified_name = "{}.{}".format(func.__module__, func.__qualname__)
    if version is None:
        return qualified_name
    return f"{qualified_name}@{version}"
|
| 305 |
+
|
| 306 |
+
|
| 307 |
+
def _equals_default_for_fingerprint(value: Any, default: Any) -> bool:
    """Tell whether `value` equals `default`, treating an ambiguous comparison as "not equal"."""
    if value is default:
        return True
    try:
        return bool(value == default)
    except (ValueError, TypeError):
        # e.g. array-like values whose `==` result has no single truth value
        return False


def format_kwargs_for_fingerprint(
    func: Callable,
    args: tuple,
    kwargs: dict[str, Any],
    use_kwargs: Optional[list[str]] = None,
    ignore_kwargs: Optional[list[str]] = None,
    randomized_function: bool = False,
) -> dict[str, Any]:
    """
    Format the kwargs of a transform to the format that will be used to update the fingerprint.

    Args:
        func: The transform; its signature is used to name positional arguments and find defaults.
        args: Positional arguments of the call. The first one is assumed to be the dataset and is dropped.
        kwargs: Keyword arguments of the call (not modified).
        use_kwargs: If given, only these argument names are kept.
        ignore_kwargs: If given, these argument names are removed (applied after `use_kwargs`).
        randomized_function: If True and neither "seed" nor "generator" is set, a "generator"
            entry derived from numpy's current global random state is added.

    Returns:
        A new dict of argument name -> value, without the dataset and without arguments equal to their default.
    """
    parameters = inspect.signature(func).parameters
    kwargs_for_fingerprint = kwargs.copy()
    if args:
        # `**kwargs`-style parameters can never receive a positional argument.
        params = [p.name for p in parameters.values() if p.kind != p.VAR_KEYWORD]
        args = args[1:]  # assume the first argument is the dataset
        params = params[1:]
        kwargs_for_fingerprint.update(zip(params, args))
    else:
        del kwargs_for_fingerprint[next(iter(parameters))]  # assume the first key is the dataset

    # keep the right kwargs to be hashed to generate the fingerprint

    if use_kwargs:
        kwargs_for_fingerprint = {k: v for k, v in kwargs_for_fingerprint.items() if k in use_kwargs}
    if ignore_kwargs:
        kwargs_for_fingerprint = {k: v for k, v in kwargs_for_fingerprint.items() if k not in ignore_kwargs}
    if randomized_function:  # randomized functions have `seed` and `generator` parameters
        if kwargs_for_fingerprint.get("seed") is None and kwargs_for_fingerprint.get("generator") is None:
            _, seed, pos, *_ = np.random.get_state()
            seed = seed[pos] if pos < 624 else seed[0]
            kwargs_for_fingerprint["generator"] = np.random.default_rng(seed)

    # remove kwargs that are the default values

    default_values = {p.name: p.default for p in parameters.values() if p.default is not inspect.Parameter.empty}
    for default_varname, default_value in default_values.items():
        if default_varname in kwargs_for_fingerprint and _equals_default_for_fingerprint(
            kwargs_for_fingerprint[default_varname], default_value
        ):
            kwargs_for_fingerprint.pop(default_varname)
    return kwargs_for_fingerprint
|
| 350 |
+
|
| 351 |
+
|
| 352 |
+
def fingerprint_transform(
    inplace: bool,
    use_kwargs: Optional[list[str]] = None,
    ignore_kwargs: Optional[list[str]] = None,
    fingerprint_names: Optional[list[str]] = None,
    randomized_function: bool = False,
    version: Optional[str] = None,
):
    """
    Wrapper for dataset transforms to update the dataset fingerprint using ``update_fingerprint``

    Args:
        inplace (:obj:`bool`): If inplace is True, the fingerprint of the dataset is updated inplace.
            Otherwise, a parameter "new_fingerprint" is passed to the wrapped method that should take care of
            setting the fingerprint of the returned Dataset.
        use_kwargs (:obj:`List[str]`, optional): optional white list of argument names to take into account
            to update the fingerprint. By default all the arguments are used.
        ignore_kwargs (:obj:`List[str]`, optional): optional black list of argument names to leave out
            when updating the fingerprint. Note that ignore_kwargs prevails on use_kwargs.
        fingerprint_names (:obj:`List[str]`, optional, defaults to ["new_fingerprint"]):
            If the dataset transforms is not inplace and returns a DatasetDict, then it can require
            several fingerprints (one per dataset in the DatasetDict). By specifying fingerprint_names,
            one fingerprint named after each element of fingerprint_names is going to be passed.
        randomized_function (:obj:`bool`, defaults to False): If the dataset transform is random and has
            optional parameters "seed" and "generator", then you can set randomized_function to True.
            This way, even if users set "seed" and "generator" to None, then the fingerprint is
            going to be randomly generated depending on numpy's current state. In this case, the
            generator is a ``np.random.default_rng`` seeded from ``np.random.get_state()``
            (see ``format_kwargs_for_fingerprint``).
        version (:obj:`str`, optional): version of the transform. The version is taken into account when
            computing the fingerprint. If a dataset transform changes (or at least if the output data
            that are cached changes), then one should increase the version. If the version stays the
            same, then old cached data could be reused that are not compatible with the new transform.
            It should be in the format "MAJOR.MINOR.PATCH".

    Returns:
        A decorator to apply to the dataset transform.

    Raises:
        ValueError: If ``use_kwargs`` or ``ignore_kwargs`` is neither ``None`` nor a list, if
            ``fingerprint_names`` is given together with ``inplace=True``, or (when decorating) if
            the wrapped function lacks the required fingerprint / "seed" / "generator" names.
    """

    if use_kwargs is not None and not isinstance(use_kwargs, list):
        raise ValueError(f"use_kwargs is supposed to be a list, not {type(use_kwargs)}")

    if ignore_kwargs is not None and not isinstance(ignore_kwargs, list):
        # Report the type of the offending argument (previously reported `use_kwargs`' type).
        raise ValueError(f"ignore_kwargs is supposed to be a list, not {type(ignore_kwargs)}")

    if inplace and fingerprint_names:
        raise ValueError("fingerprint_names are only used when inplace is False")

    fingerprint_names = fingerprint_names if fingerprint_names is not None else ["new_fingerprint"]

    def _fingerprint(func):
        # NOTE: `co_varnames` lists parameters *and* local variables of `func`.
        if not inplace and not all(name in func.__code__.co_varnames for name in fingerprint_names):
            raise ValueError(f"function {func} is missing parameters {fingerprint_names} in signature")

        if randomized_function:  # randomized functions have seed and generator parameters
            if "seed" not in func.__code__.co_varnames:
                raise ValueError(f"'seed' must be in {func}'s signature")
            if "generator" not in func.__code__.co_varnames:
                raise ValueError(f"'generator' must be in {func}'s signature")
        # this call has to be outside the wrapper since __qualname__ changes in multiprocessing
        transform = format_transform_for_fingerprint(func, version=version)

        @wraps(func)
        def wrapper(*args, **kwargs):
            kwargs_for_fingerprint = format_kwargs_for_fingerprint(
                func,
                args,
                kwargs,
                use_kwargs=use_kwargs,
                ignore_kwargs=ignore_kwargs,
                randomized_function=randomized_function,
            )

            # The dataset is assumed to be the first argument, positional or keyword.
            if args:
                dataset: Dataset = args[0]
                args = args[1:]
            else:
                dataset: Dataset = kwargs.pop(next(iter(inspect.signature(func).parameters)))

            # compute new_fingerprint and add it to the args of not in-place transforms
            if inplace:
                new_fingerprint = update_fingerprint(dataset._fingerprint, transform, kwargs_for_fingerprint)
            else:
                for fingerprint_name in fingerprint_names:  # transforms like `train_test_split` have several hashes
                    if kwargs.get(fingerprint_name) is None:
                        # Hash the fingerprint's own name too, so each requested fingerprint differs.
                        kwargs_for_fingerprint["fingerprint_name"] = fingerprint_name
                        kwargs[fingerprint_name] = update_fingerprint(
                            dataset._fingerprint, transform, kwargs_for_fingerprint
                        )
                    else:
                        # A fingerprint supplied by the caller is only validated, never recomputed.
                        validate_fingerprint(kwargs[fingerprint_name])

            # Call actual function
            out = func(dataset, *args, **kwargs)

            # Update fingerprint of in-place transforms + update in-place history of transforms
            if inplace:  # update after calling func so that the fingerprint doesn't change if the function fails
                dataset._fingerprint = new_fingerprint

            return out

        wrapper._decorator_name_ = "fingerprint"
        return wrapper

    return _fingerprint
|
datasets/hub.py
ADDED
|
@@ -0,0 +1,124 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from itertools import chain
|
| 2 |
+
from typing import Optional, Union
|
| 3 |
+
|
| 4 |
+
from huggingface_hub import (
|
| 5 |
+
CommitInfo,
|
| 6 |
+
CommitOperationAdd,
|
| 7 |
+
CommitOperationDelete,
|
| 8 |
+
DatasetCard,
|
| 9 |
+
DatasetCardData,
|
| 10 |
+
HfApi,
|
| 11 |
+
HfFileSystem,
|
| 12 |
+
)
|
| 13 |
+
|
| 14 |
+
import datasets.config
|
| 15 |
+
from datasets.info import DatasetInfosDict
|
| 16 |
+
from datasets.load import load_dataset_builder
|
| 17 |
+
from datasets.utils.metadata import MetadataConfigs
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def delete_from_hub(
    repo_id: str,
    config_name: str,
    revision: Optional[str] = None,
    token: Optional[Union[bool, str]] = None,
) -> CommitInfo:
    """Delete a dataset configuration from a [data-only dataset](repository_structure) on the Hub.

    The deletion is not applied directly: a single commit removing the configuration's data files
    and updating the dataset card is opened as a pull request (`create_pr=True`), and the PR URL
    is printed.

    Args:
        repo_id (`str`): ID of the Hub dataset repository, in the following format: `<user>/<dataset_name>` or
            `<org>/<dataset_name>`.
        config_name (`str`): Name of the dataset configuration.
        revision (`str`, *optional*): Branch to delete the configuration from. Defaults to the `"main"` branch.
        token (`bool` or `str`, *optional*): Authentication token for the Hugging Face Hub.

    Returns:
        `huggingface_hub.CommitInfo`
    """
    operations = []
    # data_files: only delete files that live in this repository
    fs = HfFileSystem(endpoint=datasets.config.HF_ENDPOINT, token=token)
    builder = load_dataset_builder(repo_id, config_name, revision=revision, token=token)
    for data_file in chain.from_iterable(builder.config.data_files.values()):
        data_file_resolved_path = fs.resolve_path(data_file)
        if data_file_resolved_path.repo_id == repo_id:
            operations.append(CommitOperationDelete(path_in_repo=data_file_resolved_path.path_in_repo))
    # README.md (forward the token so the card is loaded with the same credentials as the other calls)
    dataset_card = DatasetCard.load(repo_id, token=token)
    # config_names
    if dataset_card.data.get("config_names", None) and config_name in dataset_card.data["config_names"]:
        dataset_card.data["config_names"].remove(config_name)
    # metadata_configs
    metadata_configs = MetadataConfigs.from_dataset_card_data(dataset_card.data)
    if metadata_configs:
        metadata_configs.pop(config_name, None)
        # Re-serialize the remaining configs into a scratch card to see whether the field survives.
        dataset_card_data = DatasetCardData()
        metadata_configs.to_dataset_card_data(dataset_card_data)
        if datasets.config.METADATA_CONFIGS_FIELD in dataset_card_data:
            dataset_card.data[datasets.config.METADATA_CONFIGS_FIELD] = dataset_card_data[
                datasets.config.METADATA_CONFIGS_FIELD
            ]
        else:
            dataset_card.data.pop(datasets.config.METADATA_CONFIGS_FIELD, None)
    # dataset_info
    dataset_infos: DatasetInfosDict = DatasetInfosDict.from_dataset_card_data(dataset_card.data)
    if dataset_infos:
        dataset_infos.pop(config_name, None)
        dataset_card_data = DatasetCardData()
        dataset_infos.to_dataset_card_data(dataset_card_data)
        if "dataset_info" in dataset_card_data:
            dataset_card.data["dataset_info"] = dataset_card_data["dataset_info"]
        else:
            dataset_card.data.pop("dataset_info", None)
    # Commit: the updated card is added in the same commit as the file deletions
    operations.append(
        CommitOperationAdd(path_in_repo=datasets.config.REPOCARD_FILENAME, path_or_fileobj=str(dataset_card).encode())
    )
    api = HfApi(endpoint=datasets.config.HF_ENDPOINT, token=token)
    commit_info = api.create_commit(
        repo_id,
        operations=operations,
        commit_message=f"Delete '{config_name}' config",
        commit_description=f"Delete '{config_name}' config.",
        token=token,
        repo_type="dataset",
        revision=revision,
        create_pr=True,
    )
    print(f"You can find your PR to delete the dataset config at: {commit_info.pr_url}")
    return commit_info
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
def _delete_files(dataset_id, revision=None, token=None):
    """Delete every file of a Hub dataset repository except `.gitattributes` and `README.md`.

    The legacy `dataset_infos.json` (if present) is deleted first, then each remaining file,
    one `delete_file` call (with its own commit message) per file.

    Args:
        dataset_id: ID of the dataset repository on the Hub.
        revision: Revision to list the files from and to delete them on; passed through to `HfApi`.
        token: Authentication token used to build the `HfApi` client.
    """
    hf_api = HfApi(endpoint=datasets.config.HF_ENDPOINT, token=token)
    # List the files on the same revision the deletions target, so both operate on one branch.
    repo_files = hf_api.list_repo_files(
        dataset_id,
        revision=revision,
        repo_type="dataset",
    )
    if not repo_files:
        return

    has_legacy_json_file = False
    data_files = []
    for filename in repo_files:
        if filename in {".gitattributes", "README.md"}:
            continue
        if filename == "dataset_infos.json":
            has_legacy_json_file = True
        else:
            data_files.append(filename)

    if has_legacy_json_file:
        hf_api.delete_file(
            "dataset_infos.json",
            dataset_id,
            repo_type="dataset",
            revision=revision,
            commit_message="Delete legacy dataset_infos.json",
        )
    for filename in data_files:
        hf_api.delete_file(
            filename,
            dataset_id,
            repo_type="dataset",
            revision=revision,
            commit_message="Delete data file",
        )
|
datasets/info.py
ADDED
|
@@ -0,0 +1,430 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright 2020 The HuggingFace Datasets Authors and the TensorFlow Datasets Authors.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
|
| 15 |
+
# Lint as: python3
|
| 16 |
+
"""DatasetInfo record information we know about a dataset.
|
| 17 |
+
|
| 18 |
+
This includes things that we know about the dataset statically, i.e.:
|
| 19 |
+
- description
|
| 20 |
+
- canonical location
|
| 21 |
+
- does it have validation and tests splits
|
| 22 |
+
- size
|
| 23 |
+
- etc.
|
| 24 |
+
|
| 25 |
+
This also includes the things that can and should be computed once we've
|
| 26 |
+
processed the dataset as well:
|
| 27 |
+
- number of examples (in each split)
|
| 28 |
+
- etc.
|
| 29 |
+
"""
|
| 30 |
+
|
| 31 |
+
import copy
|
| 32 |
+
import dataclasses
|
| 33 |
+
import json
|
| 34 |
+
import os
|
| 35 |
+
import posixpath
|
| 36 |
+
from dataclasses import dataclass
|
| 37 |
+
from pathlib import Path
|
| 38 |
+
from typing import ClassVar, Optional, Union
|
| 39 |
+
|
| 40 |
+
import fsspec
|
| 41 |
+
from fsspec.core import url_to_fs
|
| 42 |
+
from huggingface_hub import DatasetCard, DatasetCardData
|
| 43 |
+
|
| 44 |
+
from . import config
|
| 45 |
+
from .features import Features
|
| 46 |
+
from .splits import SplitDict
|
| 47 |
+
from .utils import Version
|
| 48 |
+
from .utils.logging import get_logger
|
| 49 |
+
from .utils.py_utils import asdict, unique_values
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
logger = get_logger(__name__)
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
@dataclass
class SupervisedKeysData:
    """Pair of names used as `DatasetInfo.supervised_keys`; both default to the empty string."""

    # Input feature for supervised learning.
    input: str = ""
    # Label (output) for supervised learning.
    output: str = ""
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
@dataclass
class DownloadChecksumsEntryData:
    """A single string key / string value pair; both fields default to the empty string."""

    # NOTE(review): presumably one entry of a download-checksums mapping -- confirm against callers.
    key: str = ""
    value: str = ""
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
class MissingCachedSizesConfigError(Exception):
    """Raised when the expected cached sizes of the download file are missing."""
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
class NonMatchingCachedSizesError(Exception):
    """Raised when the prepared split doesn't have the expected sizes."""
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
@dataclass
class PostProcessedInfo:
    """Features and resource checksums attached to a post-processed dataset."""

    features: Optional[Features] = None
    resources_checksums: Optional[dict] = None

    def __post_init__(self):
        """Turn a non-`Features` value of `features` back into a `Features` object."""
        # Happens when the instance is rebuilt from a dict.
        needs_conversion = self.features is not None and not isinstance(self.features, Features)
        if needs_conversion:
            self.features = Features.from_dict(self.features)

    @classmethod
    def from_dict(cls, post_processed_info_dict: dict) -> "PostProcessedInfo":
        """Build an instance from a dict, silently dropping keys that are not dataclass fields."""
        known_fields = {field.name for field in dataclasses.fields(cls)}
        init_kwargs = {}
        for key, value in post_processed_info_dict.items():
            if key in known_fields:
                init_kwargs[key] = value
        return cls(**init_kwargs)
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
@dataclass
class DatasetInfo:
    """Information about a dataset.

    `DatasetInfo` documents datasets, including its name, version, and features.
    See the constructor arguments and properties for a full list.

    Not all fields are known on construction and may be updated later.

    Attributes:
        description (`str`):
            A description of the dataset.
        citation (`str`):
            A BibTeX citation of the dataset.
        homepage (`str`):
            A URL to the official homepage for the dataset.
        license (`str`):
            The dataset's license. It can be the name of the license or a paragraph containing the terms of the license.
        features ([`Features`], *optional*):
            The features used to specify the dataset's column types.
        post_processed (`PostProcessedInfo`, *optional*):
            Information regarding the resources of a possible post-processing of a dataset. For example, it can contain the information of an index.
        supervised_keys (`SupervisedKeysData`, *optional*):
            Specifies the input feature and the label for supervised learning if applicable for the dataset (legacy from TFDS).
        builder_name (`str`, *optional*):
            The name of the `GeneratorBasedBuilder` subclass used to create the dataset. It is also the snake_case version of the dataset builder class name.
        dataset_name (`str`, *optional*):
            For packaged builders, a name that differs from `builder_name`.
        config_name (`str`, *optional*):
            The name of the configuration derived from [`BuilderConfig`].
        version (`str` or [`Version`], *optional*):
            The version of the dataset.
        splits (`dict`, *optional*):
            The mapping between split name and metadata.
        download_checksums (`dict`, *optional*):
            The mapping between the URL to download the dataset's checksums and corresponding metadata.
        download_size (`int`, *optional*):
            The size of the files to download to generate the dataset, in bytes.
        post_processing_size (`int`, *optional*):
            Size of the dataset in bytes after post-processing, if any.
        dataset_size (`int`, *optional*):
            The combined size in bytes of the Arrow tables for all splits.
        size_in_bytes (`int`, *optional*):
            The combined size in bytes of all files associated with the dataset (downloaded files + Arrow files).
        **config_kwargs (additional keyword arguments):
            Keyword arguments to be passed to the [`BuilderConfig`] and used in the [`DatasetBuilder`].
    """

    # Set in the dataset builders
    description: str = dataclasses.field(default_factory=str)
    citation: str = dataclasses.field(default_factory=str)
    homepage: str = dataclasses.field(default_factory=str)
    license: str = dataclasses.field(default_factory=str)
    features: Optional[Features] = None
    post_processed: Optional[PostProcessedInfo] = None
    supervised_keys: Optional[SupervisedKeysData] = None

    # Set later by the builder
    builder_name: Optional[str] = None
    dataset_name: Optional[str] = None  # for packaged builders, to be different from builder_name
    config_name: Optional[str] = None
    version: Optional[Union[str, Version]] = None
    # Set later by `download_and_prepare`
    splits: Optional[dict] = None
    download_checksums: Optional[dict] = None
    download_size: Optional[int] = None
    post_processing_size: Optional[int] = None
    dataset_size: Optional[int] = None
    size_in_bytes: Optional[int] = None

    # Only these fields are written by `_to_yaml_dict`.
    _INCLUDED_INFO_IN_YAML: ClassVar[list[str]] = [
        "config_name",
        "download_size",
        "dataset_size",
        "features",
        "splits",
    ]

    def __post_init__(self):
        """Coerce plain values (dicts, strings, tuples/lists) into their dedicated classes."""
        # Convert back to the correct classes when we reload from dict
        if self.features is not None and not isinstance(self.features, Features):
            self.features = Features.from_dict(self.features)
        if self.post_processed is not None and not isinstance(self.post_processed, PostProcessedInfo):
            self.post_processed = PostProcessedInfo.from_dict(self.post_processed)
        if self.version is not None and not isinstance(self.version, Version):
            if isinstance(self.version, str):
                self.version = Version(self.version)
            else:
                self.version = Version.from_dict(self.version)
        if self.splits is not None and not isinstance(self.splits, SplitDict):
            self.splits = SplitDict.from_split_dict(self.splits)
        if self.supervised_keys is not None and not isinstance(self.supervised_keys, SupervisedKeysData):
            if isinstance(self.supervised_keys, (tuple, list)):
                self.supervised_keys = SupervisedKeysData(*self.supervised_keys)
            else:
                self.supervised_keys = SupervisedKeysData(**self.supervised_keys)

    def write_to_directory(self, dataset_info_dir, pretty_print=False, storage_options: Optional[dict] = None):
        """Write `DatasetInfo` and license (if present) as JSON files to `dataset_info_dir`.

        Args:
            dataset_info_dir (`str`):
                Destination directory.
            pretty_print (`bool`, defaults to `False`):
                If `True`, the JSON will be pretty-printed with the indent level of 4.
            storage_options (`dict`, *optional*):
                Key/value pairs to be passed on to the file-system backend, if any.

                <Added version="2.9.0"/>

        Example:

        ```py
        >>> from datasets import load_dataset
        >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", split="validation")
        >>> ds.info.write_to_directory("/path/to/directory/")
        ```
        """
        fs: fsspec.AbstractFileSystem
        fs, *_ = url_to_fs(dataset_info_dir, **(storage_options or {}))
        with fs.open(posixpath.join(dataset_info_dir, config.DATASET_INFO_FILENAME), "wb") as f:
            self._dump_info(f, pretty_print=pretty_print)
        # The license file is only written when the license string is non-empty.
        if self.license:
            with fs.open(posixpath.join(dataset_info_dir, config.LICENSE_FILENAME), "wb") as f:
                self._dump_license(f)

    def _dump_info(self, file, pretty_print=False):
        """Dump info in `file` file-like object open in bytes mode (to support remote files)"""
        file.write(json.dumps(asdict(self), indent=4 if pretty_print else None).encode("utf-8"))

    def _dump_license(self, file):
        """Dump license in `file` file-like object open in bytes mode (to support remote files)"""
        file.write(self.license.encode("utf-8"))

    @classmethod
    def from_merge(cls, dataset_infos: list["DatasetInfo"]):
        """Merge several `DatasetInfo` objects into one.

        `None` entries are ignored. If all remaining infos are equal, a copy of the first is
        returned. Otherwise only `description`, `citation`, `homepage` and `license` are kept:
        the unique values of each are joined with a blank line; every other field is left at
        its default.
        """
        dataset_infos = [dset_info.copy() for dset_info in dataset_infos if dset_info is not None]

        if len(dataset_infos) > 0 and all(dataset_infos[0] == dset_info for dset_info in dataset_infos):
            # if all dataset_infos are equal we don't need to merge. Just return the first.
            return dataset_infos[0]

        description = "\n\n".join(unique_values(info.description for info in dataset_infos)).strip()
        citation = "\n\n".join(unique_values(info.citation for info in dataset_infos)).strip()
        homepage = "\n\n".join(unique_values(info.homepage for info in dataset_infos)).strip()
        license = "\n\n".join(unique_values(info.license for info in dataset_infos)).strip()

        return cls(
            description=description,
            citation=citation,
            homepage=homepage,
            license=license,
            features=None,
            supervised_keys=None,
        )

    @classmethod
    def from_directory(cls, dataset_info_dir: str, storage_options: Optional[dict] = None) -> "DatasetInfo":
        """Create [`DatasetInfo`] from the JSON file in `dataset_info_dir`.

        This function updates all the dynamically generated fields (num_examples,
        hash, time of creation,...) of the [`DatasetInfo`].

        This will overwrite all previous metadata.

        Args:
            dataset_info_dir (`str`):
                The directory containing the metadata file. This
                should be the root directory of a specific dataset version.
            storage_options (`dict`, *optional*):
                Key/value pairs to be passed on to the file-system backend, if any.

                <Added version="2.9.0"/>

        Raises:
            ValueError: If `dataset_info_dir` is empty or `None`.

        Example:

        ```py
        >>> from datasets import DatasetInfo
        >>> ds_info = DatasetInfo.from_directory("/path/to/directory/")
        ```
        """
        # Validate before the path is handed to fsspec, so an undefined directory always
        # produces this explicit error rather than whatever `url_to_fs` raises for it.
        if not dataset_info_dir:
            raise ValueError("Calling DatasetInfo.from_directory() with undefined dataset_info_dir.")
        fs: fsspec.AbstractFileSystem
        fs, *_ = url_to_fs(dataset_info_dir, **(storage_options or {}))
        logger.debug(f"Loading Dataset info from {dataset_info_dir}")
        with fs.open(posixpath.join(dataset_info_dir, config.DATASET_INFO_FILENAME), "r", encoding="utf-8") as f:
            dataset_info_dict = json.load(f)
        return cls.from_dict(dataset_info_dict)

    @classmethod
    def from_dict(cls, dataset_info_dict: dict) -> "DatasetInfo":
        """Build a `DatasetInfo` from a dict, ignoring keys that are not dataclass fields."""
        field_names = {f.name for f in dataclasses.fields(cls)}
        return cls(**{k: v for k, v in dataset_info_dict.items() if k in field_names})

    def update(self, other_dataset_info: "DatasetInfo", ignore_none=True):
        """Overwrite this info's attributes in place with deep copies of `other_dataset_info`'s.

        Args:
            other_dataset_info (`DatasetInfo`): Source of the new values.
            ignore_none (`bool`, defaults to `True`): If `True`, attributes that are `None` in
                `other_dataset_info` are skipped, so existing values are kept.
        """
        self_dict = self.__dict__
        self_dict.update(
            **{
                k: copy.deepcopy(v)
                for k, v in other_dataset_info.__dict__.items()
                if (v is not None or not ignore_none)
            }
        )

    def copy(self) -> "DatasetInfo":
        """Return a new instance of the same class built from deep copies of all attributes."""
        return self.__class__(**{k: copy.deepcopy(v) for k, v in self.__dict__.items()})

    def _to_yaml_dict(self) -> dict:
        """Return the subset of the info listed in `_INCLUDED_INFO_IN_YAML`, in YAML-friendly form."""
        yaml_dict = {}
        # Iterate over the keys produced by `asdict` (not over the dataclass fields directly):
        # which keys are emitted is decided by that helper.
        dataset_info_dict = asdict(self)
        for key in dataset_info_dict:
            if key in self._INCLUDED_INFO_IN_YAML:
                value = getattr(self, key)
                if hasattr(value, "_to_yaml_list"):  # Features, SplitDict
                    yaml_dict[key] = value._to_yaml_list()
                elif hasattr(value, "_to_yaml_string"):  # Version
                    yaml_dict[key] = value._to_yaml_string()
                else:
                    yaml_dict[key] = value
        return yaml_dict

    @classmethod
    def _from_yaml_dict(cls, yaml_data: dict) -> "DatasetInfo":
        """Build a `DatasetInfo` from its YAML dict form; keys that are not fields are ignored."""
        # Work on a deep copy so the caller's dict is not modified.
        yaml_data = copy.deepcopy(yaml_data)
        if yaml_data.get("features") is not None:
            yaml_data["features"] = Features._from_yaml_list(yaml_data["features"])
        if yaml_data.get("splits") is not None:
            yaml_data["splits"] = SplitDict._from_yaml_list(yaml_data["splits"])
        field_names = {f.name for f in dataclasses.fields(cls)}
        return cls(**{k: v for k, v in yaml_data.items() if k in field_names})
|
| 322 |
+
|
| 323 |
+
|
| 324 |
+
class DatasetInfosDict(dict[str, DatasetInfo]):
    """Mapping from configuration name to its `DatasetInfo`.

    Provides helpers to persist the infos in a dataset directory (YAML block of the
    README file, plus the legacy JSON file when it already exists) and to read them back.
    """

    def write_to_directory(self, dataset_infos_dir, overwrite=False, pretty_print=False) -> None:
        """Write the dataset infos to `dataset_infos_dir`.

        Args:
            dataset_infos_dir: Directory holding the README and, optionally, the legacy JSON infos file.
            overwrite: If `False`, the infos already stored in the directory are loaded first and the
                infos of `self` are merged on top of them. If `True`, only the infos of `self` are used.
                NOTE: `to_dataset_card_data` still merges with entries already present in the README metadata.
            pretty_print: If `True`, the legacy JSON file is indented with 4 spaces.
        """
        dataset_infos_path = os.path.join(dataset_infos_dir, config.DATASETDICT_INFOS_FILENAME)
        dataset_readme_path = os.path.join(dataset_infos_dir, config.REPOCARD_FILENAME)
        # Always work on an instance of this class (never a plain dict) so that
        # `to_dataset_card_data` is available below, including when overwrite=True.
        if overwrite:
            total_dataset_infos = type(self)()
        else:
            total_dataset_infos = self.from_directory(dataset_infos_dir)
        total_dataset_infos.update(self)
        if os.path.exists(dataset_infos_path):
            # for backward compatibility, let's update the JSON file if it exists
            # Serialize before opening the file for writing, so a failure does not leave it truncated.
            dataset_infos_dict = {
                config_name: asdict(dset_info) for config_name, dset_info in total_dataset_infos.items()
            }
            with open(dataset_infos_path, "w", encoding="utf-8") as f:
                json.dump(dataset_infos_dict, f, indent=4 if pretty_print else None)
        # Dump the infos in the YAML part of the README.md file
        if os.path.exists(dataset_readme_path):
            dataset_card = DatasetCard.load(dataset_readme_path)
            dataset_card_data = dataset_card.data
        else:
            dataset_card = None
            dataset_card_data = DatasetCardData()
        if total_dataset_infos:
            total_dataset_infos.to_dataset_card_data(dataset_card_data)
            if dataset_card is None:
                # No README yet: create one that only contains the YAML metadata block
                dataset_card = DatasetCard("---\n" + str(dataset_card_data) + "\n---\n")
            dataset_card.save(Path(dataset_readme_path))

    @classmethod
    def from_directory(cls, dataset_infos_dir) -> "DatasetInfosDict":
        """Load the dataset infos stored in `dataset_infos_dir`.

        The YAML metadata of the README is used when it contains a `dataset_info` field;
        otherwise the legacy JSON infos file is used when present.

        Returns:
            The loaded infos, or an empty instance when the legacy JSON file does not exist.
        """
        logger.debug(f"Loading Dataset Infos from {dataset_infos_dir}")
        # Load the info from the YAML part of README.md
        if os.path.exists(os.path.join(dataset_infos_dir, config.REPOCARD_FILENAME)):
            dataset_card_data = DatasetCard.load(Path(dataset_infos_dir) / config.REPOCARD_FILENAME).data
            if "dataset_info" in dataset_card_data:
                return cls.from_dataset_card_data(dataset_card_data)
        if os.path.exists(os.path.join(dataset_infos_dir, config.DATASETDICT_INFOS_FILENAME)):
            # this is just to have backward compatibility with dataset_infos.json files
            with open(os.path.join(dataset_infos_dir, config.DATASETDICT_INFOS_FILENAME), encoding="utf-8") as f:
                return cls(
                    {
                        config_name: DatasetInfo.from_dict(dataset_info_dict)
                        for config_name, dataset_info_dict in json.load(f).items()
                    }
                )
        else:
            return cls()

    @classmethod
    def from_dataset_card_data(cls, dataset_card_data: DatasetCardData) -> "DatasetInfosDict":
        """Build the infos from the `dataset_info` field of a dataset card metadata.

        The field may be a single struct (one configuration) or a list of structs
        (one per configuration). A missing `config_name` is read as `"default"`.

        Returns:
            The parsed infos, or an empty instance when the field is neither a list nor a dict.
        """
        if isinstance(dataset_card_data.get("dataset_info"), (list, dict)):
            if isinstance(dataset_card_data["dataset_info"], list):
                return cls(
                    {
                        dataset_info_yaml_dict.get("config_name", "default"): DatasetInfo._from_yaml_dict(
                            dataset_info_yaml_dict
                        )
                        for dataset_info_yaml_dict in dataset_card_data["dataset_info"]
                    }
                )
            else:
                dataset_info = DatasetInfo._from_yaml_dict(dataset_card_data["dataset_info"])
                dataset_info.config_name = dataset_card_data["dataset_info"].get("config_name", "default")
                return cls({dataset_info.config_name: dataset_info})
        else:
            return cls()

    def to_dataset_card_data(self, dataset_card_data: DatasetCardData) -> None:
        """Write the infos into the `dataset_info` field of `dataset_card_data`, in place.

        Entries already present in the metadata are kept unless `self` holds an info for the
        same configuration name. A single resulting configuration is stored as a struct
        (its `config_name` is dropped when it is `"default"`); several configurations are
        stored as a list sorted by configuration name. Nothing is done when `self` is empty.
        """
        if self:
            # first get existing metadata info
            if "dataset_info" in dataset_card_data and isinstance(dataset_card_data["dataset_info"], dict):
                dataset_metadata_infos = {
                    dataset_card_data["dataset_info"].get("config_name", "default"): dataset_card_data["dataset_info"]
                }
            elif "dataset_info" in dataset_card_data and isinstance(dataset_card_data["dataset_info"], list):
                # A missing config_name means "default", consistently with `from_dataset_card_data`
                dataset_metadata_infos = {
                    config_metadata.get("config_name", "default"): config_metadata
                    for config_metadata in dataset_card_data["dataset_info"]
                }
            else:
                dataset_metadata_infos = {}
            # update/rewrite existing metadata info with the one to dump
            total_dataset_infos = {
                **dataset_metadata_infos,
                **{config_name: dset_info._to_yaml_dict() for config_name, dset_info in self.items()},
            }
            # the config_name from the dataset_infos_dict takes over the config_name of the DatasetInfo
            for config_name, dset_info_yaml_dict in total_dataset_infos.items():
                dset_info_yaml_dict["config_name"] = config_name
            if len(total_dataset_infos) == 1:
                # use a struct instead of a list of configurations, since there's only one
                dataset_card_data["dataset_info"] = next(iter(total_dataset_infos.values()))
                config_name = dataset_card_data["dataset_info"].pop("config_name", None)
                if config_name != "default":
                    # if config_name is not "default" preserve it and put at the first position
                    dataset_card_data["dataset_info"] = {
                        "config_name": config_name,
                        **dataset_card_data["dataset_info"],
                    }
            else:
                dataset_card_data["dataset_info"] = []
                for config_name, dataset_info_yaml_dict in sorted(total_dataset_infos.items()):
                    # add the config_name field in first position
                    dataset_info_yaml_dict.pop("config_name", None)
                    dataset_info_yaml_dict = {"config_name": config_name, **dataset_info_yaml_dict}
                    dataset_card_data["dataset_info"].append(dataset_info_yaml_dict)
|
datasets/inspect.py
ADDED
|
@@ -0,0 +1,353 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright 2020 The HuggingFace Datasets Authors.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
|
| 15 |
+
# Lint as: python3
|
| 16 |
+
"""List and inspect datasets."""
|
| 17 |
+
|
| 18 |
+
import os
|
| 19 |
+
from collections.abc import Mapping, Sequence
|
| 20 |
+
from typing import Optional, Union
|
| 21 |
+
|
| 22 |
+
from .download.download_config import DownloadConfig
|
| 23 |
+
from .download.download_manager import DownloadMode
|
| 24 |
+
from .download.streaming_download_manager import StreamingDownloadManager
|
| 25 |
+
from .info import DatasetInfo
|
| 26 |
+
from .load import (
|
| 27 |
+
dataset_module_factory,
|
| 28 |
+
get_dataset_builder_class,
|
| 29 |
+
load_dataset_builder,
|
| 30 |
+
)
|
| 31 |
+
from .utils.logging import get_logger
|
| 32 |
+
from .utils.version import Version
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
# Module-level logger, obtained from the package's `get_logger` helper and keyed by this module's name.
logger = get_logger(__name__)
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
class SplitsNotFoundError(ValueError):
    """Raised when the split names of a dataset configuration cannot be determined."""
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def get_dataset_infos(
    path: str,
    data_files: Optional[Union[dict, list, str]] = None,
    download_config: Optional[DownloadConfig] = None,
    download_mode: Optional[Union[DownloadMode, str]] = None,
    revision: Optional[Union[str, Version]] = None,
    token: Optional[Union[bool, str]] = None,
    **config_kwargs,
):
    """Return the meta information of every configuration of a dataset.

    The configuration names are listed first, then the info of each configuration is
    fetched with the same arguments.

    Args:
        path (`str`): path to the dataset repository. Can be either:

            - a local path to the dataset directory containing the data files,
              e.g. `'./dataset/squad'`
            - a dataset identifier on the Hugging Face Hub (list all available datasets and ids
              with [`huggingface_hub.list_datasets`]),
              e.g. `'rajpurkar/squad'`, `'nyu-mll/glue'` or `'openai/webtext'`
        data_files (`Union[Dict, List, str]`, *optional*):
            Data files of the dataset configuration.
        download_config ([`DownloadConfig`], *optional*):
            Specific download configuration parameters.
        download_mode ([`DownloadMode`] or `str`, defaults to `REUSE_DATASET_IF_EXISTS`):
            Download/generate mode.
        revision (`Union[str, datasets.Version]`, *optional*):
            Version of the dataset repository to load the dataset module from.
            Using a version different from the local version of the library might cause
            compatibility issues.
        token (`str` or `bool`, *optional*):
            String or boolean to use as Bearer token for remote files on the Datasets Hub.
            If `True`, or not specified, will get token from `"~/.huggingface"`.
        **config_kwargs (additional keyword arguments):
            Attributes for the builder class, overriding its defaults when supplied.

    Returns:
        `dict`: mapping from configuration name to its [`DatasetInfo`].

    Example:

    ```py
    >>> from datasets import get_dataset_infos
    >>> get_dataset_infos('cornell-movie-review-data/rotten_tomatoes')
    {'default': DatasetInfo(description=..., ...), ...}
    ```
    """
    # Arguments shared by the config-name lookup and by each per-config info lookup
    lookup_kwargs = {
        "path": path,
        "data_files": data_files,
        "download_config": download_config,
        "download_mode": download_mode,
        "revision": revision,
        "token": token,
    }
    infos_by_config = {}
    for config_name in get_dataset_config_names(**lookup_kwargs):
        infos_by_config[config_name] = get_dataset_config_info(
            config_name=config_name, **lookup_kwargs, **config_kwargs
        )
    return infos_by_config
|
| 107 |
+
|
| 108 |
+
|
| 109 |
+
def get_dataset_config_names(
    path: str,
    revision: Optional[Union[str, Version]] = None,
    download_config: Optional[DownloadConfig] = None,
    download_mode: Optional[Union[DownloadMode, str]] = None,
    data_files: Optional[Union[dict, list, str]] = None,
    **download_kwargs,
):
    """List the configuration names available for a dataset.

    When the builder class declares no configuration, a single-element list is returned,
    holding the `config_name` of the dataset module's builder kwargs, or else the builder's
    `DEFAULT_CONFIG_NAME`, or else `"default"`.

    Args:
        path (`str`): path to the dataset repository. Can be either:

            - a local path to the dataset directory containing the data files,
              e.g. `'./dataset/squad'`
            - a dataset identifier on the Hugging Face Hub (list all available datasets and ids
              with [`huggingface_hub.list_datasets`]),
              e.g. `'rajpurkar/squad'`, `'nyu-mll/glue'` or `'openai/webtext'`
        revision (`Union[str, datasets.Version]`, *optional*):
            Version of the dataset repository to load the dataset module from.
            Using a version different from the local version of the library might cause
            compatibility issues.
        download_config ([`DownloadConfig`], *optional*):
            Specific download configuration parameters.
        download_mode ([`DownloadMode`] or `str`, defaults to `REUSE_DATASET_IF_EXISTS`):
            Download/generate mode.
        data_files (`Union[Dict, List, str]`, *optional*):
            Data files of the dataset configuration.
        **download_kwargs (additional keyword arguments):
            Attributes for [`DownloadConfig`], overriding those of `download_config` when supplied,
            for example `token`.

    Returns:
        `list`: the configuration names.

    Example:

    ```py
    >>> from datasets import get_dataset_config_names
    >>> get_dataset_config_names("nyu-mll/glue")
    ['cola',
     'sst2',
     'mrpc',
     'qqp',
     'stsb',
     'mnli',
     'mnli_mismatched',
     'mnli_matched',
     'qnli',
     'rte',
     'wnli',
     'ax']
    ```
    """
    dataset_module = dataset_module_factory(
        path,
        revision=revision,
        download_config=download_config,
        download_mode=download_mode,
        data_files=data_files,
        **download_kwargs,
    )
    builder_cls = get_dataset_builder_class(dataset_module, dataset_name=os.path.basename(path))
    declared_names = list(builder_cls.builder_configs.keys())
    if declared_names:
        return declared_names
    # No declared configuration: fall back to a single name
    return [dataset_module.builder_kwargs.get("config_name", builder_cls.DEFAULT_CONFIG_NAME or "default")]
|
| 173 |
+
|
| 174 |
+
|
| 175 |
+
def get_dataset_default_config_name(
    path: str,
    revision: Optional[Union[str, Version]] = None,
    download_config: Optional[DownloadConfig] = None,
    download_mode: Optional[Union[DownloadMode, str]] = None,
    data_files: Optional[Union[dict, list, str]] = None,
    **download_kwargs,
) -> Optional[str]:
    """Return the default configuration name of a dataset.

    The builder's `DEFAULT_CONFIG_NAME` wins when it is set. Otherwise the result is
    `"default"` when no configuration is declared, the only configuration when exactly one
    is declared, and `None` when several are declared.

    Args:
        path (`str`): path to the dataset repository. Can be either:

            - a local path to the dataset directory containing the data files,
              e.g. `'./dataset/squad'`
            - a dataset identifier on the Hugging Face Hub (list all available datasets and ids
              with [`huggingface_hub.list_datasets`]),
              e.g. `'rajpurkar/squad'`, `'nyu-mll/glue'` or `'openai/webtext'`
        revision (`Union[str, datasets.Version]`, *optional*):
            Version of the dataset repository to load the dataset module from.
            Using a version different from the local version of the library might cause
            compatibility issues.
        download_config ([`DownloadConfig`], *optional*):
            Specific download configuration parameters.
        download_mode ([`DownloadMode`] or `str`, defaults to `REUSE_DATASET_IF_EXISTS`):
            Download/generate mode.
        data_files (`Union[Dict, List, str]`, *optional*):
            Data files of the dataset configuration.
        **download_kwargs (additional keyword arguments):
            Attributes for [`DownloadConfig`], overriding those of `download_config` when supplied,
            for example `token`.

    Returns:
        Optional[str]: the default config name if there is one

    Example:

    ```py
    >>> from datasets import get_dataset_default_config_name
    >>> get_dataset_default_config_name("openbookqa")
    'main'
    ```
    """
    dataset_module = dataset_module_factory(
        path,
        revision=revision,
        download_config=download_config,
        download_mode=download_mode,
        data_files=data_files,
        **download_kwargs,
    )
    builder_cls = get_dataset_builder_class(dataset_module, dataset_name=os.path.basename(path))
    declared_names = list(builder_cls.builder_configs.keys())
    if not declared_names:
        inferred_name = "default"
    elif len(declared_names) == 1:
        inferred_name = declared_names[0]
    else:
        # Several configurations and nothing to single one out
        inferred_name = None
    return builder_cls.DEFAULT_CONFIG_NAME or inferred_name
|
| 235 |
+
|
| 236 |
+
|
| 237 |
+
def get_dataset_config_info(
    path: str,
    config_name: Optional[str] = None,
    data_files: Optional[Union[str, Sequence[str], Mapping[str, Union[str, Sequence[str]]]]] = None,
    download_config: Optional[DownloadConfig] = None,
    download_mode: Optional[Union[DownloadMode, str]] = None,
    revision: Optional[Union[str, Version]] = None,
    token: Optional[Union[bool, str]] = None,
    **config_kwargs,
) -> DatasetInfo:
    """Return the meta information ([`DatasetInfo`]) of one configuration of a dataset.

    When the builder's info has no splits, the split names are obtained from the builder's
    split generators, run with a streaming download manager.

    Args:
        path (`str`): path to the dataset repository. Can be either:

            - a local path to the dataset directory containing the data files,
              e.g. `'./dataset/squad'`
            - a dataset identifier on the Hugging Face Hub (list all available datasets and ids
              with [`huggingface_hub.list_datasets`]),
              e.g. `'rajpurkar/squad'`, `'nyu-mll/glue'` or `'openai/webtext'`
        config_name (`str`, *optional*):
            Name of the dataset configuration.
        data_files (`str` or `Sequence` or `Mapping`, *optional*):
            Path(s) to source data file(s).
        download_config ([`DownloadConfig`], *optional*):
            Specific download configuration parameters. It is copied before being modified.
        download_mode ([`DownloadMode`] or `str`, defaults to `REUSE_DATASET_IF_EXISTS`):
            Download/generate mode.
        revision ([`Version`] or `str`, *optional*):
            Version of the dataset to load, e.g. a commit SHA or a git tag of the dataset repository.
        token (`str` or `bool`, *optional*):
            String or boolean to use as Bearer token for remote files on the Datasets Hub.
            If `True`, or not specified, will get token from `"~/.huggingface"`.
        **config_kwargs (additional keyword arguments):
            Attributes for the builder class, overriding its defaults when supplied.

    Returns:
        [`DatasetInfo`]: the info of the requested configuration.

    Raises:
        SplitsNotFoundError: if listing the split generators fails for any reason.
    """
    builder = load_dataset_builder(
        path,
        name=config_name,
        data_files=data_files,
        download_config=download_config,
        download_mode=download_mode,
        revision=revision,
        token=token,
        **config_kwargs,
    )
    info = builder.info
    if info.splits is not None:
        return info

    # Splits are unknown: work on our own download config so the caller's one is left untouched
    if download_config:
        download_config = download_config.copy()
    else:
        download_config = DownloadConfig()
    if token is not None:
        download_config.token = token
    manual_check_manager = StreamingDownloadManager(base_path=builder.base_path, download_config=download_config)
    builder._check_manual_download(manual_check_manager)
    try:
        split_manager = StreamingDownloadManager(base_path=builder.base_path, download_config=download_config)
        discovered_splits = {}
        for split_generator in builder._split_generators(split_manager):
            split_name = split_generator.name
            discovered_splits[split_name] = {"name": split_name, "dataset_name": path}
        info.splits = discovered_splits
    except Exception as err:
        raise SplitsNotFoundError("The split names could not be parsed from the dataset config.") from err
    return info
|
| 296 |
+
|
| 297 |
+
|
| 298 |
+
def get_dataset_split_names(
    path: str,
    config_name: Optional[str] = None,
    data_files: Optional[Union[str, Sequence[str], Mapping[str, Union[str, Sequence[str]]]]] = None,
    download_config: Optional[DownloadConfig] = None,
    download_mode: Optional[Union[DownloadMode, str]] = None,
    revision: Optional[Union[str, Version]] = None,
    token: Optional[Union[bool, str]] = None,
    **config_kwargs,
):
    """List the split names available for one configuration of a dataset.

    All arguments are forwarded to `get_dataset_config_info`; the names are the keys of the
    returned info's `splits`.

    Args:
        path (`str`): path to the dataset repository. Can be either:

            - a local path to the dataset directory containing the data files,
              e.g. `'./dataset/squad'`
            - a dataset identifier on the Hugging Face Hub (list all available datasets and ids
              with [`huggingface_hub.list_datasets`]),
              e.g. `'rajpurkar/squad'`, `'nyu-mll/glue'` or `'openai/webtext'`
        config_name (`str`, *optional*):
            Name of the dataset configuration.
        data_files (`str` or `Sequence` or `Mapping`, *optional*):
            Path(s) to source data file(s).
        download_config ([`DownloadConfig`], *optional*):
            Specific download configuration parameters.
        download_mode ([`DownloadMode`] or `str`, defaults to `REUSE_DATASET_IF_EXISTS`):
            Download/generate mode.
        revision ([`Version`] or `str`, *optional*):
            Version of the dataset to load, e.g. a commit SHA or a git tag of the dataset repository.
        token (`str` or `bool`, *optional*):
            String or boolean to use as Bearer token for remote files on the Datasets Hub.
            If `True`, or not specified, will get token from `"~/.huggingface"`.
        **config_kwargs (additional keyword arguments):
            Attributes for the builder class, overriding its defaults when supplied.

    Returns:
        `list`: the split names.

    Example:

    ```py
    >>> from datasets import get_dataset_split_names
    >>> get_dataset_split_names('cornell-movie-review-data/rotten_tomatoes')
    ['train', 'validation', 'test']
    ```
    """
    config_info = get_dataset_config_info(
        path,
        config_name=config_name,
        data_files=data_files,
        download_config=download_config,
        download_mode=download_mode,
        revision=revision,
        token=token,
        **config_kwargs,
    )
    return [*config_info.splits.keys()]
|
datasets/iterable_dataset.py
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
datasets/keyhash.py
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright 2020 The HuggingFace Datasets Authors and the TensorFlow Datasets Authors.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
|
| 15 |
+
# Lint as: python3
|
| 16 |
+
|
| 17 |
+
"""
|
| 18 |
+
Hashing function for dataset keys using `hashlib.md5`
|
| 19 |
+
|
| 20 |
+
Requirements for the hash function:
|
| 21 |
+
|
| 22 |
+
- Provides a uniformly distributed hash from random space
|
| 23 |
+
- Adequately fast speed
|
| 24 |
+
- Working with multiple input types (in this case, `str`, `int` or `bytes`)
|
| 25 |
+
- Should be platform independent (generates same hash on different OS and systems)
|
| 26 |
+
|
| 27 |
+
The hashing function provides a unique 128-bit integer hash of the key provided.
|
| 28 |
+
|
| 29 |
+
The split name is being used here as the hash salt to avoid having same hashes
|
| 30 |
+
in different splits due to same keys
|
| 31 |
+
"""
|
| 32 |
+
|
| 33 |
+
from typing import Union
|
| 34 |
+
|
| 35 |
+
from huggingface_hub.utils import insecure_hashlib
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
def _as_bytes(hash_data: Union[str, int, bytes, bytearray]) -> bytes:
|
| 39 |
+
"""
|
| 40 |
+
Returns the input hash_data in its bytes form
|
| 41 |
+
|
| 42 |
+
Args:
|
| 43 |
+
hash_data: the hash salt/key to be converted to bytes
|
| 44 |
+
"""
|
| 45 |
+
if isinstance(hash_data, (bytes, bytearray)):
|
| 46 |
+
# Data already in bytes, returns as it as
|
| 47 |
+
return hash_data
|
| 48 |
+
elif isinstance(hash_data, str):
|
| 49 |
+
# We keep the data as it as for it ot be later encoded to UTF-8
|
| 50 |
+
# However replace `\\` with `/` for Windows compatibility
|
| 51 |
+
hash_data = hash_data.replace("\\", "/")
|
| 52 |
+
elif isinstance(hash_data, int):
|
| 53 |
+
hash_data = str(hash_data)
|
| 54 |
+
else:
|
| 55 |
+
# If data is not of the required type, raise error
|
| 56 |
+
raise InvalidKeyError(hash_data)
|
| 57 |
+
|
| 58 |
+
return hash_data.encode("utf-8")
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
class InvalidKeyError(Exception):
    """Error raised for a key whose datatype cannot be hashed."""

    def __init__(self, hash_data):
        # The three parts are kept as attributes and concatenated into the exception message
        self.prefix = "\nFAILURE TO GENERATE DATASET: Invalid key type detected"
        self.err_msg = f"\nFound Key {hash_data} of type {type(hash_data)}"
        self.suffix = "\nKeys should be either str, int or bytes type"
        full_message = self.prefix + self.err_msg + self.suffix
        super().__init__(full_message)
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
class DuplicatedKeysError(Exception):
    """Raise an error when duplicate key found.

    Args:
        key: the key shared by several examples.
        duplicate_key_indices: indices of the examples generated with `key`; at most the
            first 20 are listed in the message. Items may be of any type (e.g. `int` or `str`).
        fix_msg: optional hint appended on its own line at the end of the message.
    """

    def __init__(self, key, duplicate_key_indices, fix_msg=""):
        self.key = key
        self.duplicate_key_indices = duplicate_key_indices
        self.fix_msg = fix_msg
        self.prefix = "Found multiple examples generated with the same key"
        # Indices are converted with str() because str.join rejects non-string items,
        # which would otherwise replace this error with an unrelated TypeError.
        if len(duplicate_key_indices) <= 20:
            listed = ", ".join(map(str, duplicate_key_indices))
            self.err_msg = f"\nThe examples at index {listed} have the key {key}"
        else:
            listed = ", ".join(map(str, duplicate_key_indices[:20]))
            self.err_msg = f"\nThe examples at index {listed}... ({len(duplicate_key_indices) - 20} more) have the key {key}"
        self.suffix = "\n" + fix_msg if fix_msg else ""
        super().__init__(f"{self.prefix}{self.err_msg}{self.suffix}")
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
class KeyHasher:
    """Hashes dataset keys with md5, salted with a fixed value."""

    def __init__(self, hash_salt: str):
        # md5 state pre-fed with the salt; it is copied for each key and never updated itself
        salt_bytes = _as_bytes(hash_salt)
        self._split_md5 = insecure_hashlib.md5(salt_bytes)

    def hash(self, key: Union[str, int, bytes]) -> int:
        """Return the 128-bit hash of a key.

        Args:
            key: the input key to be hashed (should be str, int or bytes)

        Returns: 128-bit int hash key"""
        salted_md5 = self._split_md5.copy()
        salted_md5.update(_as_bytes(key))
        # Big-endian reading of the digest bytes equals the hexadecimal digest read as an integer
        return int.from_bytes(salted_md5.digest(), byteorder="big")
|
datasets/load.py
ADDED
|
@@ -0,0 +1,1481 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright 2020 The HuggingFace Datasets Authors and the TensorFlow Datasets Authors.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
|
| 15 |
+
# Lint as: python3
|
| 16 |
+
"""Access datasets."""
|
| 17 |
+
|
| 18 |
+
import glob
|
| 19 |
+
import importlib
|
| 20 |
+
import inspect
|
| 21 |
+
import json
|
| 22 |
+
import os
|
| 23 |
+
import posixpath
|
| 24 |
+
from collections import Counter
|
| 25 |
+
from collections.abc import Mapping, Sequence
|
| 26 |
+
from dataclasses import dataclass, field
|
| 27 |
+
from pathlib import Path
|
| 28 |
+
from typing import Any, Optional, Union
|
| 29 |
+
|
| 30 |
+
import fsspec
|
| 31 |
+
import httpx
|
| 32 |
+
import requests
|
| 33 |
+
import yaml
|
| 34 |
+
from fsspec.core import url_to_fs
|
| 35 |
+
from huggingface_hub import DatasetCard, DatasetCardData, HfApi
|
| 36 |
+
from huggingface_hub.utils import (
|
| 37 |
+
EntryNotFoundError,
|
| 38 |
+
GatedRepoError,
|
| 39 |
+
LocalEntryNotFoundError,
|
| 40 |
+
OfflineModeIsEnabled,
|
| 41 |
+
RepositoryNotFoundError,
|
| 42 |
+
RevisionNotFoundError,
|
| 43 |
+
get_session,
|
| 44 |
+
)
|
| 45 |
+
|
| 46 |
+
from . import __version__, config
|
| 47 |
+
from .arrow_dataset import Dataset
|
| 48 |
+
from .builder import BuilderConfig, DatasetBuilder
|
| 49 |
+
from .data_files import (
|
| 50 |
+
DataFilesDict,
|
| 51 |
+
DataFilesList,
|
| 52 |
+
DataFilesPatternsDict,
|
| 53 |
+
EmptyDatasetError,
|
| 54 |
+
get_data_patterns,
|
| 55 |
+
sanitize_patterns,
|
| 56 |
+
)
|
| 57 |
+
from .dataset_dict import DatasetDict, IterableDatasetDict
|
| 58 |
+
from .download.download_config import DownloadConfig
|
| 59 |
+
from .download.download_manager import DownloadMode
|
| 60 |
+
from .download.streaming_download_manager import StreamingDownloadManager, xbasename, xglob, xjoin
|
| 61 |
+
from .exceptions import DataFilesNotFoundError, DatasetNotFoundError
|
| 62 |
+
from .features import Features
|
| 63 |
+
from .features.features import _fix_for_backward_compatible_features
|
| 64 |
+
from .fingerprint import Hasher
|
| 65 |
+
from .info import DatasetInfo, DatasetInfosDict
|
| 66 |
+
from .iterable_dataset import IterableDataset
|
| 67 |
+
from .naming import camelcase_to_snakecase, snakecase_to_camelcase
|
| 68 |
+
from .packaged_modules import (
|
| 69 |
+
_EXTENSION_TO_MODULE,
|
| 70 |
+
_MODULE_TO_EXTENSIONS,
|
| 71 |
+
_MODULE_TO_METADATA_FILE_NAMES,
|
| 72 |
+
_PACKAGED_DATASETS_MODULES,
|
| 73 |
+
)
|
| 74 |
+
from .packaged_modules.folder_based_builder.folder_based_builder import FolderBasedBuilder
|
| 75 |
+
from .splits import Split
|
| 76 |
+
from .utils import _dataset_viewer
|
| 77 |
+
from .utils.file_utils import (
|
| 78 |
+
_raise_if_offline_mode_is_enabled,
|
| 79 |
+
cached_path,
|
| 80 |
+
get_datasets_user_agent,
|
| 81 |
+
is_relative_path,
|
| 82 |
+
relative_to_absolute_path,
|
| 83 |
+
)
|
| 84 |
+
from .utils.hub import hf_dataset_url
|
| 85 |
+
from .utils.info_utils import VerificationMode, is_small_dataset
|
| 86 |
+
from .utils.logging import get_logger
|
| 87 |
+
from .utils.metadata import MetadataConfigs
|
| 88 |
+
from .utils.typing import PathLike
|
| 89 |
+
from .utils.version import Version
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
logger = get_logger(__name__)

# Every file extension that maps to a packaged builder module, plus ".zip":
# zip archives are accepted as data files and their contents are inspected
# separately when inferring which builder module to use.
ALL_ALLOWED_EXTENSIONS = list(_EXTENSION_TO_MODULE.keys()) + [".zip"]
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
class _InitializeConfiguredDatasetBuilder:
    """
    Reconstructor used to unpickle instances of dynamically configured builder classes.

    From https://stackoverflow.com/questions/4647566/pickle-a-dynamically-parameterized-sub-class
    See also ConfiguredDatasetBuilder.__reduce__

    When called with the arguments stored by ``__reduce__``, it re-creates the configured
    builder class and returns an un-initialized instance of it. Subsequent __setstate__
    will be called by pickle to restore the instance state.
    """

    def __call__(self, builder_cls, metadata_configs, default_config_name, name):
        """Return an un-initialized instance of the configured builder class.

        Args:
            builder_cls: Parent builder class to configure.
            metadata_configs: Builder configs forwarded as the configured class's configs.
            default_config_name: Name of the default config, forwarded unchanged.
            name: Dataset name, forwarded as ``dataset_name``.
        """
        # make a simple object which has no complex __init__ (this one will do)
        obj = _InitializeConfiguredDatasetBuilder()
        # swap the class in place so no builder __init__ runs during unpickling
        obj.__class__ = configure_builder_class(
            builder_cls, metadata_configs, default_config_name=default_config_name, dataset_name=name
        )
        return obj
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
def configure_builder_class(
    builder_cls: type[DatasetBuilder],
    builder_configs: list[BuilderConfig],
    default_config_name: Optional[str],
    dataset_name: str,
) -> type[DatasetBuilder]:
    """
    Dynamically create a builder class with custom builder configs parsed from README.md file,
    i.e. set BUILDER_CONFIGS class variable of a builder class to custom configs list.

    Args:
        builder_cls: Builder class to subclass.
        builder_configs: Configs assigned to the subclass's ``BUILDER_CONFIGS``.
        default_config_name: Value assigned to the subclass's ``DEFAULT_CONFIG_NAME``.
        dataset_name: Snake-case dataset name, used to build the subclass's name.

    Returns:
        A new subclass of ``builder_cls`` carrying the given configs.
    """

    class ConfiguredDatasetBuilder(builder_cls):
        BUILDER_CONFIGS = builder_configs
        DEFAULT_CONFIG_NAME = default_config_name

        __module__ = builder_cls.__module__  # so that the actual packaged builder can be imported

        def __reduce__(self):  # to make dynamically created class pickable, see _InitializeConfiguredDatasetBuilder
            # __mro__[0] is this dynamic class; __mro__[1] is the builder class it was derived from
            parent_builder_cls = self.__class__.__mro__[1]
            return (
                _InitializeConfiguredDatasetBuilder(),
                (
                    parent_builder_cls,
                    self.BUILDER_CONFIGS,
                    self.DEFAULT_CONFIG_NAME,
                    self.dataset_name,
                ),
                self.__dict__.copy(),
            )

    # e.g. builder "JSON" + dataset "my_data" -> "JsonMyData" (assuming snakecase_to_camelcase yields "MyData")
    configured_name = f"{builder_cls.__name__.lower().capitalize()}{snakecase_to_camelcase(dataset_name)}"
    ConfiguredDatasetBuilder.__name__ = configured_name
    ConfiguredDatasetBuilder.__qualname__ = configured_name

    return ConfiguredDatasetBuilder
|
| 153 |
+
|
| 154 |
+
|
| 155 |
+
def import_main_class(module_path) -> Optional[type[DatasetBuilder]]:
    """Import a module at module_path and return its main class: a DatasetBuilder

    Abstract builders are skipped. A concrete builder defined in the imported module
    itself wins immediately; otherwise the last concrete builder found in the module
    namespace (e.g. one imported from elsewhere) is returned.

    Args:
        module_path: Dotted module path accepted by ``importlib.import_module``.

    Returns:
        The selected ``DatasetBuilder`` subclass, or ``None`` if the module has none.
    """
    module = importlib.import_module(module_path)
    # Find the main class in our imported module
    module_main_cls = None
    for obj in module.__dict__.values():
        if not (inspect.isclass(obj) and issubclass(obj, DatasetBuilder)):
            continue
        if inspect.isabstract(obj):
            continue
        module_main_cls = obj
        obj_module = inspect.getmodule(obj)
        # stop at the first builder that is defined in this very module
        if obj_module is not None and module == obj_module:
            break

    return module_main_cls
|
| 170 |
+
|
| 171 |
+
|
| 172 |
+
def get_dataset_builder_class(
    dataset_module: "DatasetModule", dataset_name: Optional[str] = None
) -> type[DatasetBuilder]:
    """Return the builder class for a dataset module, configured with its builder configs if any.

    Args:
        dataset_module: Module description; its ``module_path`` is imported to find the builder.
        dataset_name: Dataset name used to name the configured class. Falls back to
            ``dataset_module.builder_kwargs["dataset_name"]`` when not given.

    Returns:
        The imported builder class, or a dynamically configured subclass of it when the
        module carries builder configs.

    Raises:
        ValueError: If builder configs are present but no dataset name can be determined.
    """
    builder_cls = import_main_class(dataset_module.module_path)
    if dataset_module.builder_configs_parameters.builder_configs:
        dataset_name = dataset_name or dataset_module.builder_kwargs.get("dataset_name")
        if dataset_name is None:
            raise ValueError("dataset_name should be specified but got None")
        builder_cls = configure_builder_class(
            builder_cls,
            builder_configs=dataset_module.builder_configs_parameters.builder_configs,
            default_config_name=dataset_module.builder_configs_parameters.default_config_name,
            dataset_name=dataset_name,
        )
    return builder_cls
|
| 187 |
+
|
| 188 |
+
|
| 189 |
+
def increase_load_count(name: str):
    """Update the download count of a dataset.

    Sends a single HEAD request for ``<bucket prefix>/<name>/<name>.py``. Nothing is sent
    when offline mode is enabled or download-count updates are disabled in the config.

    Args:
        name: Dataset name used to build the requested URL.
    """
    if not config.HF_HUB_OFFLINE and config.HF_UPDATE_DOWNLOAD_COUNTS:
        try:
            get_session().head(
                "/".join((config.S3_DATASETS_BUCKET_PREFIX, name, name + ".py")),
                headers={"User-Agent": get_datasets_user_agent()},
                timeout=3,
            )
        except Exception:
            # Best effort only: a failed or timed-out counter ping must never break dataset loading.
            pass
|
| 200 |
+
|
| 201 |
+
|
| 202 |
+
def infer_module_for_data_files_list(
    data_files_list: DataFilesList, download_config: Optional[DownloadConfig] = None
) -> tuple[Optional[str], dict]:
    """Infer module (and builder kwargs) from list of data files.

    It picks the module based on the most common file extension, looking only at the first
    `config.DATA_FILES_MAX_NUMBER_FOR_MODULE_INFERENCE` files. Every dotted suffix of a file
    name is counted (e.g. "a.tar.gz" counts both ".tar" and ".gz"), and extensions coming
    from metadata files rank below all others.
    In case of a draw ".parquet" is the favorite, then ".jsonl", ".json", ".csv", and then
    reverse alphabetical order.

    Args:
        data_files_list (DataFilesList): List of data files.
        download_config (DownloadConfig, optional): Mainly use `token` or `storage_options` to support
            different platforms and auth types. Only used when inspecting ".zip" archives.

    Returns:
        tuple[str, dict[str, Any]]: Tuple with
            - inferred module name
            - dict of builder kwargs
        `(None, {})` is returned when no supported extension is found.
    """
    # Keys are (extension, is_metadata_file) pairs so metadata files can be ranked last.
    extensions_counter = Counter(
        ("." + suffix.lower(), xbasename(filepath) in FolderBasedBuilder.METADATA_FILENAMES)
        for filepath in data_files_list[: config.DATA_FILES_MAX_NUMBER_FOR_MODULE_INFERENCE]
        for suffix in xbasename(filepath).split(".")[1:]
    )
    if extensions_counter:

        def sort_key(ext_count: tuple[tuple[str, bool], int]) -> tuple:
            """Sort by count and set ".parquet" as the favorite in case of a draw, and ignore metadata files"""
            (ext, is_metadata), count = ext_count
            return (not is_metadata, count, ext == ".parquet", ext == ".jsonl", ext == ".json", ext == ".csv", ext)

        # Walk candidates from best to worst and stop at the first one a module can handle.
        for (ext, _), _ in sorted(extensions_counter.items(), key=sort_key, reverse=True):
            if ext in _EXTENSION_TO_MODULE:
                return _EXTENSION_TO_MODULE[ext]
            elif ext == ".zip":
                return infer_module_for_data_files_list_in_archives(data_files_list, download_config=download_config)
    return None, {}
|
| 237 |
+
|
| 238 |
+
|
| 239 |
+
def infer_module_for_data_files_list_in_archives(
    data_files_list: DataFilesList, download_config: Optional[DownloadConfig] = None
) -> tuple[Optional[str], dict]:
    """Infer module (and builder kwargs) from list of archive data files.

    Only ".zip" files are inspected: at most `config.GLOBBED_DATA_FILES_MAX_NUMBER_FOR_MODULE_INFERENCE`
    archives, and at most `config.ARCHIVED_DATA_FILES_MAX_NUMBER_FOR_MODULE_INFERENCE` entries per archive.
    The module is chosen from the single most common extension among those entries.

    Args:
        data_files_list (DataFilesList): List of data files.
        download_config (DownloadConfig, optional): Mainly use `token` or `storage_options` to support
            different platforms and auth types.

    Returns:
        tuple[str, dict[str, Any]]: Tuple with
            - inferred module name
            - dict of builder kwargs
        `(None, {})` is returned when the most common extension is not supported.
    """
    archived_files = []
    archive_files_counter = 0
    for filepath in data_files_list:
        if not str(filepath).endswith(".zip"):
            continue
        archive_files_counter += 1
        if archive_files_counter > config.GLOBBED_DATA_FILES_MAX_NUMBER_FOR_MODULE_INFERENCE:
            break
        extracted = xjoin(StreamingDownloadManager().extract(filepath), "**")
        archived_files += [
            # keep only the part before the first "::" of each globbed path
            f.split("::")[0]
            for f in xglob(extracted, recursive=True, download_config=download_config)[
                : config.ARCHIVED_DATA_FILES_MAX_NUMBER_FOR_MODULE_INFERENCE
            ]
        ]
    extensions_counter = Counter(
        "." + suffix.lower() for filepath in archived_files for suffix in xbasename(filepath).split(".")[1:]
    )
    if extensions_counter:
        most_common = extensions_counter.most_common(1)[0][0]
        if most_common in _EXTENSION_TO_MODULE:
            return _EXTENSION_TO_MODULE[most_common]
    return None, {}
|
| 275 |
+
|
| 276 |
+
|
| 277 |
+
def infer_module_for_data_files(
    data_files: DataFilesDict, path: Optional[str] = None, download_config: Optional[DownloadConfig] = None
) -> tuple[Optional[str], dict[str, Any]]:
    """Infer module (and builder kwargs) from data files. Raise if module names for different splits don't match.

    Args:
        data_files ([`DataFilesDict`]): Dict of list of data files.
        path (str, *optional*): Dataset name or path.
        download_config ([`DownloadConfig`], *optional*):
            Specific download configuration parameters to authenticate on the Hugging Face Hub for private remote files.

    Returns:
        tuple[str, dict[str, Any]]: Tuple with
            - inferred module name
            - builder kwargs

    Raises:
        ValueError: If the splits do not all resolve to the same module and builder kwargs.
        DataFilesNotFoundError: If no supported module can be inferred (including when
            `data_files` has no split at all).
    """
    split_modules = {
        split: infer_module_for_data_files_list(data_files_list, download_config=download_config)
        for split, data_files_list in data_files.items()
    }
    # The default covers an empty `data_files`: it falls through to DataFilesNotFoundError
    # below instead of leaking a bare StopIteration.
    module_name, default_builder_kwargs = next(iter(split_modules.values()), (None, {}))
    if any((module_name, default_builder_kwargs) != split_module for split_module in split_modules.values()):
        raise ValueError(f"Couldn't infer the same data file format for all splits. Got {split_modules}")
    if not module_name:
        raise DataFilesNotFoundError("No (supported) data files found" + (f" in {path}" if path else ""))
    return module_name, default_builder_kwargs
|
| 303 |
+
|
| 304 |
+
|
| 305 |
+
def create_builder_configs_from_metadata_configs(
    module_path: str,
    metadata_configs: MetadataConfigs,
    base_path: Optional[str] = None,
    default_builder_kwargs: Optional[dict[str, Any]] = None,
    download_config: Optional[DownloadConfig] = None,
) -> tuple[list[BuilderConfig], Optional[str]]:
    """Create one builder config per entry of the metadata configs.

    Args:
        module_path: Module path of the builder whose `BUILDER_CONFIG_CLASS` is instantiated.
        metadata_configs: Mapping of config name to config parameters.
        base_path: Root used to resolve each config's `data_dir`. Defaults to "".
        default_builder_kwargs: Default parameters, overridden per config by the config's own parameters.
        download_config: Forwarded to `get_data_patterns` when a config declares no `data_files`.

    Returns:
        Tuple with
            - the list of created builder configs
            - the default config name reported by `metadata_configs`

    Raises:
        EmptyDatasetError: If no data files match the patterns of a config.
    """
    builder_cls = import_main_class(module_path)
    builder_config_cls = builder_cls.BUILDER_CONFIG_CLASS
    default_config_name = metadata_configs.get_default_config_name()
    builder_configs = []
    default_builder_kwargs = {} if default_builder_kwargs is None else default_builder_kwargs

    base_path = base_path if base_path is not None else ""
    for config_name, config_params in metadata_configs.items():
        config_data_files = config_params.get("data_files")
        config_data_dir = config_params.get("data_dir")
        config_base_path = xjoin(base_path, config_data_dir) if config_data_dir else base_path
        try:
            # explicit `data_files` win; otherwise patterns are discovered under the config's base path
            config_patterns = (
                sanitize_patterns(config_data_files)
                if config_data_files is not None
                else get_data_patterns(config_base_path, download_config=download_config)
            )
            config_data_files_dict = DataFilesPatternsDict.from_patterns(
                config_patterns,
                allowed_extensions=ALL_ALLOWED_EXTENSIONS,
            )
        except EmptyDatasetError as e:
            raise EmptyDatasetError(
                f"Dataset at '{base_path}' doesn't contain data files matching the patterns for config '{config_name}',"
                f" check `data_files` and `data_dir` parameters in the `configs` YAML field in README.md. "
            ) from e
        # "default" only marks the default config; any other parameter unknown to the config class is reported
        ignored_params = [
            param for param in config_params if not hasattr(builder_config_cls, param) and param != "default"
        ]
        if ignored_params:
            logger.warning(
                f"Some datasets params were ignored: {ignored_params}. "
                "Make sure to use only valid params for the dataset builder and to have "
                "a up-to-date version of the `datasets` library."
            )
        builder_configs.append(
            builder_config_cls(
                name=config_name,
                data_files=config_data_files_dict,
                data_dir=config_data_dir,
                **{
                    param: value
                    for param, value in {**default_builder_kwargs, **config_params}.items()
                    if hasattr(builder_config_cls, param) and param not in ("default", "data_files", "data_dir")
                },
            )
        )
    return builder_configs, default_config_name
|
| 360 |
+
|
| 361 |
+
|
| 362 |
+
@dataclass
class BuilderConfigsParameters:
    """Dataclass containing objects related to creation of builder configurations from yaml's metadata content.

    Attributes:
        metadata_configs (`MetadataConfigs`, *optional*):
            Configs parsed from yaml's metadata.
        builder_configs (`list[BuilderConfig]`, *optional*):
            List of BuilderConfig objects created from metadata_configs above.
        default_config_name (`str`, *optional*):
            Name of default config taken from yaml's metadata.
    """

    metadata_configs: Optional[MetadataConfigs] = None
    builder_configs: Optional[list[BuilderConfig]] = None
    default_config_name: Optional[str] = None
|
| 378 |
+
|
| 379 |
+
|
| 380 |
+
@dataclass
class DatasetModule:
    """Description of the builder module to use for a dataset.

    Attributes:
        module_path (`str`):
            Importable path of the module that contains the dataset builder class.
        hash (`str`):
            Hash associated with the module.
        builder_kwargs (`dict`):
            Keyword arguments for the builder (e.g. `dataset_name`, `data_files`, `base_path`).
        builder_configs_parameters (`BuilderConfigsParameters`):
            Builder configs and related metadata; an empty instance by default.
        dataset_infos (`DatasetInfosDict`, *optional*):
            Dataset infos attached to the module, if any.
    """

    module_path: str
    hash: str
    builder_kwargs: dict
    builder_configs_parameters: BuilderConfigsParameters = field(default_factory=BuilderConfigsParameters)
    dataset_infos: Optional[DatasetInfosDict] = None
|
| 387 |
+
|
| 388 |
+
|
| 389 |
+
class _DatasetModuleFactory:
    """Base class of the dataset module factories; subclasses implement `get_module`."""

    def get_module(self) -> DatasetModule:
        """Return the `DatasetModule` built by this factory. Must be overridden."""
        raise NotImplementedError
|
| 392 |
+
|
| 393 |
+
|
| 394 |
+
class LocalDatasetModuleFactory(_DatasetModuleFactory):
    """Get the module of a dataset loaded from the user's data files. The dataset builder module to use is inferred
    from the data files extensions."""

    def __init__(
        self,
        path: str,
        data_dir: Optional[str] = None,
        data_files: Optional[Union[str, list, dict]] = None,
        download_mode: Optional[Union[DownloadMode, str]] = None,
    ):
        """
        Args:
            path: Local dataset directory.
            data_dir: Sub-directory of `path` holding the data files; must be relative.
            data_files: Explicit data file patterns; discovered automatically when omitted.
            download_mode: Stored on the instance; not used by `get_module`.

        Raises:
            ValueError: If `data_dir` is an absolute path.
        """
        if data_dir and os.path.isabs(data_dir):
            raise ValueError(f"`data_dir` must be relative to a dataset directory's root: {path}")

        self.path = Path(path).as_posix()
        self.name = Path(path).stem
        self.data_files = data_files
        self.data_dir = data_dir
        self.download_mode = download_mode

    def get_module(self) -> DatasetModule:
        """Build the `DatasetModule` for the local directory.

        Reads the dataset card and the optional standalone YAML file, resolves the data files,
        infers the packaged builder module from their extensions and assembles the builder
        configs, builder kwargs and dataset infos.
        """
        readme_path = os.path.join(self.path, config.REPOCARD_FILENAME)
        standalone_yaml_path = os.path.join(self.path, config.REPOYAML_FILENAME)
        dataset_card_data = DatasetCard.load(readme_path).data if os.path.isfile(readme_path) else DatasetCardData()
        if os.path.exists(standalone_yaml_path):
            with open(standalone_yaml_path, encoding="utf-8") as f:
                standalone_yaml_data = yaml.safe_load(f.read())
            if standalone_yaml_data:
                # values from the standalone YAML file override those of the dataset card
                _dataset_card_data_dict = dataset_card_data.to_dict()
                _dataset_card_data_dict.update(standalone_yaml_data)
                dataset_card_data = DatasetCardData(**_dataset_card_data_dict)
        metadata_configs = MetadataConfigs.from_dataset_card_data(dataset_card_data)
        dataset_infos = DatasetInfosDict.from_dataset_card_data(dataset_card_data)
        # we need a set of data files to find which dataset builder to use
        # because we need to infer module name by files extensions
        base_path = Path(self.path, self.data_dir or "").expanduser().resolve().as_posix()
        if self.data_files is not None:
            patterns = sanitize_patterns(self.data_files)
        elif metadata_configs and not self.data_dir and "data_files" in next(iter(metadata_configs.values())):
            # no explicit files or data_dir: use the `data_files` of the first metadata config
            patterns = sanitize_patterns(next(iter(metadata_configs.values()))["data_files"])
        else:
            patterns = get_data_patterns(base_path)
        data_files = DataFilesDict.from_patterns(
            patterns,
            base_path=base_path,
            allowed_extensions=ALL_ALLOWED_EXTENSIONS,
        )
        module_name, default_builder_kwargs = infer_module_for_data_files(
            data_files=data_files,
            path=self.path,
        )
        # keep only the files the inferred module handles (its extensions and metadata file names)
        data_files = data_files.filter(
            extensions=_MODULE_TO_EXTENSIONS[module_name], file_names=_MODULE_TO_METADATA_FILE_NAMES[module_name]
        )
        module_path, _ = _PACKAGED_DATASETS_MODULES[module_name]
        if metadata_configs:
            builder_configs, default_config_name = create_builder_configs_from_metadata_configs(
                module_path,
                metadata_configs,
                base_path=base_path,
                default_builder_kwargs=default_builder_kwargs,
            )
        else:
            builder_configs: list[BuilderConfig] = [
                import_main_class(module_path).BUILDER_CONFIG_CLASS(
                    data_files=data_files,
                    **default_builder_kwargs,
                )
            ]
            default_config_name = None
        builder_kwargs = {
            "base_path": self.path,
            "dataset_name": camelcase_to_snakecase(Path(self.path).name),
        }
        if self.data_dir:
            builder_kwargs["data_files"] = data_files
        # this file is deprecated and was created automatically in old versions of push_to_hub
        legacy_infos_path = os.path.join(self.path, config.DATASETDICT_INFOS_FILENAME)
        if os.path.isfile(legacy_infos_path):
            with open(legacy_infos_path, encoding="utf-8") as f:
                legacy_dataset_infos = DatasetInfosDict(
                    {
                        config_name: DatasetInfo.from_dict(dataset_info_dict)
                        for config_name, dataset_info_dict in json.load(f).items()
                    }
                )
            if len(legacy_dataset_infos) == 1:
                # old config e.g. named "username--dataset_name"
                legacy_config_name = next(iter(legacy_dataset_infos))
                legacy_dataset_infos["default"] = legacy_dataset_infos.pop(legacy_config_name)
            # infos from the dataset card take precedence over the legacy file
            legacy_dataset_infos.update(dataset_infos)
            dataset_infos = legacy_dataset_infos
        if default_config_name is None and len(dataset_infos) == 1:
            default_config_name = next(iter(dataset_infos))

        module_hash = Hasher.hash({"dataset_infos": dataset_infos, "builder_configs": builder_configs})
        return DatasetModule(
            module_path,
            module_hash,
            builder_kwargs,
            dataset_infos=dataset_infos,
            builder_configs_parameters=BuilderConfigsParameters(
                metadata_configs=metadata_configs,
                builder_configs=builder_configs,
                default_config_name=default_config_name,
            ),
        )
|
| 500 |
+
|
| 501 |
+
|
| 502 |
+
class PackagedDatasetModuleFactory(_DatasetModuleFactory):
    """Get the dataset builder module from the ones that are packaged with the library: csv, json, etc."""

    def __init__(
        self,
        name: str,
        data_dir: Optional[str] = None,
        data_files: Optional[Union[str, list, dict]] = None,
        download_config: Optional[DownloadConfig] = None,
        download_mode: Optional[Union[DownloadMode, str]] = None,
    ):
        """
        Args:
            name: Name of the packaged module; a key of `_PACKAGED_DATASETS_MODULES`.
            data_dir: Directory used as base path for the data files.
            data_files: Explicit data file patterns; discovered under `data_dir` when omitted.
            download_config: Forwarded when resolving the data files.
            download_mode: Stored on the instance; not used by `get_module`.
        """
        self.name = name
        self.data_files = data_files
        self.data_dir = data_dir
        self.download_config = download_config
        self.download_mode = download_mode
        # side effect at construction time: best-effort download-count ping
        increase_load_count(name)

    def get_module(self) -> DatasetModule:
        """Resolve the data files and return the packaged module registered under `self.name`."""
        base_path = Path(self.data_dir or "").expanduser().resolve().as_posix()
        patterns = (
            sanitize_patterns(self.data_files)
            if self.data_files is not None
            else get_data_patterns(base_path, download_config=self.download_config)
        )
        data_files = DataFilesDict.from_patterns(
            patterns,
            download_config=self.download_config,
            base_path=base_path,
        )

        module_path, module_hash = _PACKAGED_DATASETS_MODULES[self.name]

        builder_kwargs = {
            "data_files": data_files,
            "dataset_name": self.name,
        }

        return DatasetModule(module_path, module_hash, builder_kwargs)
|
| 541 |
+
|
| 542 |
+
|
| 543 |
+
class HubDatasetModuleFactory(_DatasetModuleFactory):
    """
    Get the module of a dataset loaded from data files of a dataset repository.
    The dataset builder module to use is inferred from the data files extensions.
    """

    def __init__(
        self,
        name: str,
        commit_hash: str,
        data_dir: Optional[str] = None,
        data_files: Optional[Union[str, list, dict]] = None,
        download_config: Optional[DownloadConfig] = None,
        download_mode: Optional[Union[DownloadMode, str]] = None,
        use_exported_dataset_infos: bool = False,
    ):
        """Store the loading options and call ``increase_load_count(name)``.

        Args:
            name: Repository id of the dataset on the Hub.
            commit_hash: Revision used for every download made by `get_module`.
            data_dir: Optional sub-directory of the repository to load from.
            data_files: Optional explicit data file patterns.
            download_config: Download options; a default `DownloadConfig()` is created when falsy.
            download_mode: Stored as-is; not read by `get_module`.
            use_exported_dataset_infos: Whether `get_module` may also fetch dataset infos through
                `_dataset_viewer` (only when `config.USE_PARQUET_EXPORT` is also truthy).
        """
        self.name = name
        self.commit_hash = commit_hash
        self.data_files = data_files
        self.data_dir = data_dir
        self.download_config = download_config or DownloadConfig()
        self.download_mode = download_mode
        self.use_exported_dataset_infos = use_exported_dataset_infos
        increase_load_count(name)

    def get_module(self) -> DatasetModule:
        """Build the `DatasetModule` for the repository at `self.commit_hash`.

        Steps, in order: load the dataset card (README) and an optional standalone YAML file,
        derive metadata configs and dataset infos from them, optionally merge exported dataset
        infos, resolve the data files, infer the packaged builder from those files, build the
        builder configs, and finally merge the legacy dataset infos file if the repository has one.

        Returns:
            A `DatasetModule` whose hash is `self.commit_hash`.
        """
        # Get the Dataset Card and fix the revision in case there are new commits in the meantime
        api = HfApi(
            endpoint=config.HF_ENDPOINT,
            token=self.download_config.token,
            library_name="datasets",
            library_version=__version__,
            user_agent=get_datasets_user_agent(self.download_config.user_agent),
        )
        try:
            dataset_readme_path = api.hf_hub_download(
                repo_id=self.name,
                filename=config.REPOCARD_FILENAME,
                repo_type="dataset",
                revision=self.commit_hash,
                proxies=self.download_config.proxies,
            )
            dataset_card_data = DatasetCard.load(dataset_readme_path).data
        except EntryNotFoundError:
            # No card file in the repository: continue with empty card data.
            dataset_card_data = DatasetCardData()
        # Work on a copy so the description set below does not leak into self.download_config.
        download_config = self.download_config.copy()
        if download_config.download_desc is None:
            download_config.download_desc = "Downloading standalone yaml"
        try:
            standalone_yaml_path = cached_path(
                hf_dataset_url(self.name, config.REPOYAML_FILENAME, revision=self.commit_hash),
                download_config=download_config,
            )
            with open(standalone_yaml_path, encoding="utf-8") as f:
                standalone_yaml_data = yaml.safe_load(f.read())
                if standalone_yaml_data:
                    # Keys from the standalone YAML override same-named keys of the card.
                    _dataset_card_data_dict = dataset_card_data.to_dict()
                    _dataset_card_data_dict.update(standalone_yaml_data)
                    dataset_card_data = DatasetCardData(**_dataset_card_data_dict)
        except FileNotFoundError:
            # The standalone YAML file is optional.
            pass
        # rstrip("/") removes the trailing slash left when no data_dir is set.
        base_path = f"hf://datasets/{self.name}@{self.commit_hash}/{self.data_dir or ''}".rstrip("/")
        metadata_configs = MetadataConfigs.from_dataset_card_data(dataset_card_data)
        dataset_infos = DatasetInfosDict.from_dataset_card_data(dataset_card_data)
        if config.USE_PARQUET_EXPORT and self.use_exported_dataset_infos:
            try:
                exported_dataset_infos = _dataset_viewer.get_exported_dataset_infos(
                    dataset=self.name, commit_hash=self.commit_hash, token=self.download_config.token
                )
                exported_dataset_infos = DatasetInfosDict(
                    {
                        config_name: DatasetInfo.from_dict(exported_dataset_infos[config_name])
                        for config_name in exported_dataset_infos
                    }
                )
            except _dataset_viewer.DatasetViewerError:
                # Best effort: fall back to the infos derived from the card data only.
                exported_dataset_infos = None
        else:
            exported_dataset_infos = None
        if exported_dataset_infos:
            # Exported infos are the base; infos from the card data overwrite same-named configs.
            exported_dataset_infos.update(dataset_infos)
            dataset_infos = exported_dataset_infos
        # we need a set of data files to find which dataset builder to use
        # because we need to infer module name by files extensions
        if self.data_files is not None:
            patterns = sanitize_patterns(self.data_files)
        elif metadata_configs and not self.data_dir and "data_files" in next(iter(metadata_configs.values())):
            # Only the first metadata config is consulted here, and only when no data_dir is given.
            patterns = sanitize_patterns(next(iter(metadata_configs.values()))["data_files"])
        else:
            patterns = get_data_patterns(base_path, download_config=self.download_config)
        data_files = DataFilesDict.from_patterns(
            patterns,
            base_path=base_path,
            allowed_extensions=ALL_ALLOWED_EXTENSIONS,
            download_config=self.download_config,
        )
        module_name, default_builder_kwargs = infer_module_for_data_files(
            data_files=data_files,
            path=self.name,
            download_config=self.download_config,
        )
        # Keep only the files (and metadata file names) registered for the inferred module.
        data_files = data_files.filter(
            extensions=_MODULE_TO_EXTENSIONS[module_name], file_names=_MODULE_TO_METADATA_FILE_NAMES[module_name]
        )
        module_path, _ = _PACKAGED_DATASETS_MODULES[module_name]
        if metadata_configs:
            builder_configs, default_config_name = create_builder_configs_from_metadata_configs(
                module_path,
                metadata_configs,
                base_path=base_path,
                default_builder_kwargs=default_builder_kwargs,
                download_config=self.download_config,
            )
        else:
            # No metadata configs: a single builder config carrying the resolved data files.
            builder_configs: list[BuilderConfig] = [
                import_main_class(module_path).BUILDER_CONFIG_CLASS(
                    data_files=data_files,
                    **default_builder_kwargs,
                )
            ]
            default_config_name = None
        builder_kwargs = {
            "base_path": hf_dataset_url(self.name, "", revision=self.commit_hash).rstrip("/"),
            "repo_id": self.name,
            "dataset_name": camelcase_to_snakecase(Path(self.name).name),
        }
        if self.data_dir:
            # data_files are passed explicitly to the builder only when a data_dir was requested.
            builder_kwargs["data_files"] = data_files
        download_config = self.download_config.copy()
        if download_config.download_desc is None:
            download_config.download_desc = "Downloading metadata"
        try:
            # this file is deprecated and was created automatically in old versions of push_to_hub
            dataset_infos_path = cached_path(
                hf_dataset_url(self.name, config.DATASETDICT_INFOS_FILENAME, revision=self.commit_hash),
                download_config=download_config,
            )
            with open(dataset_infos_path, encoding="utf-8") as f:
                legacy_dataset_infos = DatasetInfosDict(
                    {
                        config_name: DatasetInfo.from_dict(dataset_info_dict)
                        for config_name, dataset_info_dict in json.load(f).items()
                    }
                )
                if len(legacy_dataset_infos) == 1:
                    # old config e.g. named "username--dataset_name"
                    legacy_config_name = next(iter(legacy_dataset_infos))
                    legacy_dataset_infos["default"] = legacy_dataset_infos.pop(legacy_config_name)
            # Legacy infos are the base; infos gathered above overwrite same-named configs.
            legacy_dataset_infos.update(dataset_infos)
            dataset_infos = legacy_dataset_infos
        except FileNotFoundError:
            # The legacy infos file is optional.
            pass
        if default_config_name is None and len(dataset_infos) == 1:
            # With exactly one known config, use it as the default.
            default_config_name = next(iter(dataset_infos))

        return DatasetModule(
            module_path,
            self.commit_hash,
            builder_kwargs,
            dataset_infos=dataset_infos,
            builder_configs_parameters=BuilderConfigsParameters(
                metadata_configs=metadata_configs,
                builder_configs=builder_configs,
                default_config_name=default_config_name,
            ),
        )
|
| 709 |
+
|
| 710 |
+
|
| 711 |
+
class HubDatasetModuleFactoryWithParquetExport(_DatasetModuleFactory):
    """
    Build the module of a Hub dataset from the parquet files of its parquet export.
    """

    def __init__(
        self,
        name: str,
        commit_hash: str,
        download_config: Optional[DownloadConfig] = None,
    ):
        """Store the repository id, its commit hash and the download options.

        A default ``DownloadConfig()`` is used when ``download_config`` is falsy.
        """
        self.name = name
        self.commit_hash = commit_hash
        self.download_config = download_config or DownloadConfig()
        increase_load_count(name)

    def get_module(self) -> DatasetModule:
        """Return a parquet-based `DatasetModule` described by the exported files and infos."""
        hub_token = self.download_config.token

        parquet_files = _dataset_viewer.get_exported_parquet_files(
            dataset=self.name, commit_hash=self.commit_hash, token=hub_token
        )
        raw_infos = _dataset_viewer.get_exported_dataset_infos(
            dataset=self.name, commit_hash=self.commit_hash, token=hub_token
        )
        infos_by_config = DatasetInfosDict(
            {cfg_name: DatasetInfo.from_dict(raw_infos[cfg_name]) for cfg_name in raw_infos}
        )

        hub_api = HfApi(
            endpoint=config.HF_ENDPOINT,
            token=hub_token,
            library_name="datasets",
            library_version=__version__,
            user_agent=get_datasets_user_agent(self.download_config.user_agent),
        )
        # fix the revision in case there are new commits in the meantime
        parquet_branch_info = hub_api.dataset_info(
            self.name,
            revision="refs/convert/parquet",
            token=hub_token,
            timeout=100.0,
        )
        pinned_parquet_sha = parquet_branch_info.sha

        parquet_metadata_configs = MetadataConfigs._from_exported_parquet_files_and_dataset_infos(
            parquet_commit_hash=pinned_parquet_sha,
            exported_parquet_files=parquet_files,
            dataset_infos=infos_by_config,
        )
        module_path, _unused_hash = _PACKAGED_DATASETS_MODULES["parquet"]
        configs, default_name = create_builder_configs_from_metadata_configs(
            module_path,
            parquet_metadata_configs,
            download_config=self.download_config,
        )
        kwargs_for_builder = {
            "repo_id": self.name,
            "dataset_name": camelcase_to_snakecase(Path(self.name).name),
        }
        configs_parameters = BuilderConfigsParameters(
            metadata_configs=parquet_metadata_configs,
            builder_configs=configs,
            default_config_name=default_name,
        )
        return DatasetModule(
            module_path,
            self.commit_hash,
            kwargs_for_builder,
            dataset_infos=infos_by_config,
            builder_configs_parameters=configs_parameters,
        )
|
| 783 |
+
|
| 784 |
+
|
| 785 |
+
class CachedDatasetModuleFactory(_DatasetModuleFactory):
    """
    Get the module of a dataset that has been loaded once already and cached.
    """

    def __init__(
        self,
        name: str,
        cache_dir: Optional[str] = None,
    ):
        """Store the dataset name and the optional cache directory.

        Args:
            name: Dataset name, either ``"dataset"`` or ``"namespace/dataset"``.
            cache_dir: Cache directory to search; `config.HF_DATASETS_CACHE` is used when falsy.
        """
        self.name = name
        self.cache_dir = cache_dir
        # Kept as an assert so the exception type seen by existing callers does not change.
        assert self.name.count("/") <= 1

    def get_module(self) -> DatasetModule:
        """Return the cache-backed module when the dataset is found in the cache directory.

        Returns:
            A `DatasetModule` pointing at ``datasets.packaged_modules.cache.cache`` with hash
            and version both set to ``"auto"``.

        Raises:
            FileNotFoundError: If no directory exists three levels below the dataset's cache
                directory. The message reports the directory that was actually searched.
        """
        cache_dir = os.path.expanduser(str(self.cache_dir or config.HF_DATASETS_CACHE))
        namespace_and_dataset_name = self.name.split("/")
        namespace_and_dataset_name[-1] = camelcase_to_snakecase(namespace_and_dataset_name[-1])
        cached_relative_path = "___".join(namespace_and_dataset_name)
        cached_datasets_directory_path_root = os.path.join(cache_dir, cached_relative_path)
        # Only the existence of at least one matching directory matters, not the list itself.
        # NOTE(review): the three wildcard levels presumably are config/version/hash - confirm.
        is_cached = any(
            os.path.isdir(cached_directory_path)
            for cached_directory_path in glob.glob(os.path.join(cached_datasets_directory_path_root, "*", "*", "*"))
        )
        if not is_cached:
            # Report the resolved directory: self.cache_dir is None when the default cache is used,
            # which previously produced the misleading message "... is not cached in None".
            raise FileNotFoundError(f"Dataset {self.name} is not cached in {cache_dir}")

        builder_kwargs = {
            "repo_id": self.name,
            "dataset_name": self.name.split("/")[-1],
        }
        warning_msg = f"Using the latest cached version of the dataset since {self.name} couldn't be found on the Hugging Face Hub"
        if config.HF_HUB_OFFLINE:
            warning_msg += " (offline mode is enabled)."
        logger.warning(warning_msg)
        return DatasetModule(
            "datasets.packaged_modules.cache.cache",
            "auto",
            {**builder_kwargs, "version": "auto"},
        )
|
| 825 |
+
|
| 826 |
+
|
| 827 |
+
def dataset_module_factory(
    path: str,
    revision: Optional[Union[str, Version]] = None,
    download_config: Optional[DownloadConfig] = None,
    download_mode: Optional[Union[DownloadMode, str]] = None,
    data_dir: Optional[str] = None,
    data_files: Optional[Union[dict, list, str, DataFilesDict]] = None,
    cache_dir: Optional[str] = None,
    **download_kwargs,
) -> DatasetModule:
    """
    Resolve the dataset module to use for ``path``.

    Dataset scripts (``.py`` files) are not supported: finding one locally or in the Hub repository
    raises a ``RuntimeError``.

    Args:

        path (str): Path or name of the dataset.
            Depending on ``path``, the dataset builder that is used comes from one of the generic dataset builders (JSON, CSV, Parquet, text etc.).

            For local datasets:

            - if ``path`` is a local directory (containing data files only)
              -> load a generic dataset builder (csv, json, text etc.) based on the content of the directory
              e.g. ``'./path/to/directory/with/my/csv/data'``.

            For datasets on the Hugging Face Hub (list all available datasets with ``huggingface_hub.list_datasets()``)

            - if ``path`` is a dataset repository on the HF hub (containing data files only)
              -> load a generic dataset builder (csv, text etc.) based on the content of the repository
              e.g. ``'username/dataset_name'``, a dataset repository on the HF hub containing your data files.

        revision (:class:`~utils.Version` or :obj:`str`, optional): Version of the dataset to load.
            As datasets have their own git repository on the Datasets Hub, the default version "main" corresponds to their "main" branch.
            You can specify a different version than the default "main" by using a commit SHA or a git tag of the dataset repository.
        download_config (:class:`DownloadConfig`, optional): Specific download configuration parameters.
            Note that ``extract_compressed_file``, ``force_extract`` and ``force_download`` are set on this
            object in place.
        download_mode (:class:`DownloadMode` or :obj:`str`, default ``REUSE_DATASET_IF_EXISTS``): Download/generate mode.
        data_dir (:obj:`str`, optional): Directory with the data files. Used only if `data_files` is not specified,
            in which case it's equal to pass `os.path.join(data_dir, "**")` as `data_files`.
        data_files (:obj:`Union[Dict, List, str]`, optional): Defining the data_files of the dataset configuration.
        cache_dir (`str`, *optional*):
            Directory to read/write data. Defaults to `"~/.cache/huggingface/datasets"`.

            <Added version="2.16.0"/>

        **download_kwargs (additional keyword arguments): attributes used to build a ``DownloadConfig()``
            when ``download_config`` is not provided. They are ignored when ``download_config`` is given.

    Returns:
        DatasetModule

    Raises:
        RuntimeError: If ``path`` points to a dataset script, or a local directory contains one.
        FileNotFoundError: If ``path`` is neither a packaged builder name, a local directory, nor a
            relative path with at most one ``/``; or if the Hub lookup failed with a
            ``FileNotFoundError`` and the dataset is not cached either.
        ConnectionError, DatasetNotFoundError and other exceptions: For a Hub ``path``, when the
            lookup fails and `CachedDatasetModuleFactory` cannot serve the dataset from the cache.
    """
    if download_config is None:
        download_config = DownloadConfig(**download_kwargs)
    download_mode = DownloadMode(download_mode or DownloadMode.REUSE_DATASET_IF_EXISTS)
    # NOTE: these assignments mutate the caller-provided download_config, if any.
    download_config.extract_compressed_file = True
    download_config.force_extract = True
    download_config.force_download = download_mode == DownloadMode.FORCE_REDOWNLOAD

    # Last non-empty component of `path`, with a ".py" suffix appended when missing:
    # this is the name a dataset script would have.
    filename = list(filter(lambda x: x, path.replace(os.sep, "/").split("/")))[-1]
    if not filename.endswith(".py"):
        filename = filename + ".py"
    combined_path = os.path.join(path, filename)

    # We have several ways to get a dataset module:
    #
    # - if path is the name of a packaged dataset module
    #   -> use the packaged module (json, csv, etc.)
    #
    # - if path is a python file, or os.path.join(path, filename) is a local python file
    #   -> raise, dataset scripts are no longer supported
    # - if path is a local directory (but no python file)
    #   -> use a packaged module (csv, text etc.) based on content of the directory
    #
    # - if path has at most one "/" and is a dataset repository on the HF hub with a python file
    #   -> raise, dataset scripts are no longer supported
    # - if path has at most one "/" and is a dataset repository on the HF hub without a python file
    #   -> use a packaged module (csv, text etc.) based on content of the repository

    # Try packaged
    if path in _PACKAGED_DATASETS_MODULES:
        return PackagedDatasetModuleFactory(
            path,
            data_dir=data_dir,
            data_files=data_files,
            download_config=download_config,
            download_mode=download_mode,
        ).get_module()
    # Try locally
    elif path.endswith(filename):
        # `filename` always ends with ".py", so this is only true when `path` itself ends with a ".py" name.
        raise RuntimeError(f"Dataset scripts are no longer supported, but found {filename}")
    elif os.path.isfile(combined_path):
        raise RuntimeError(f"Dataset scripts are no longer supported, but found {filename}")
    elif os.path.isdir(path):
        return LocalDatasetModuleFactory(
            path, data_dir=data_dir, data_files=data_files, download_mode=download_mode
        ).get_module()
    # Try remotely
    elif is_relative_path(path) and path.count("/") <= 1:
        try:
            # Get the Dataset Card + get the revision + check authentication all in one call
            # We fix the commit_hash in case there are new commits in the meantime
            api = HfApi(
                endpoint=config.HF_ENDPOINT,
                token=download_config.token,
                library_name="datasets",
                library_version=__version__,
                user_agent=get_datasets_user_agent(download_config.user_agent),
            )
            try:
                _raise_if_offline_mode_is_enabled()
                dataset_readme_path = api.hf_hub_download(
                    repo_id=path,
                    filename=config.REPOCARD_FILENAME,
                    repo_type="dataset",
                    revision=revision,
                    proxies=download_config.proxies,
                )
                # The commit hash is taken from the name of the directory holding the downloaded file.
                commit_hash = os.path.basename(os.path.dirname(dataset_readme_path))
            except LocalEntryNotFoundError as e:
                # Report connectivity-related causes as a ConnectionError; re-raise anything else unchanged.
                if isinstance(
                    e.__cause__,
                    (
                        OfflineModeIsEnabled,
                        requests.exceptions.Timeout,
                        requests.exceptions.ConnectionError,
                        httpx.ConnectError,
                        httpx.TimeoutException,
                    ),
                ):
                    raise ConnectionError(f"Couldn't reach '{path}' on the Hub ({e.__class__.__name__})") from e
                else:
                    raise
            except EntryNotFoundError:
                # The repository has no card file: ask the API for the commit hash instead.
                commit_hash = api.dataset_info(
                    path,
                    revision=revision,
                    timeout=100.0,
                ).sha
            except (
                OfflineModeIsEnabled,
                requests.exceptions.Timeout,
                requests.exceptions.ConnectionError,
                httpx.ConnectError,
                httpx.TimeoutException,
            ) as e:
                raise ConnectionError(f"Couldn't reach '{path}' on the Hub ({e.__class__.__name__})") from e
            except GatedRepoError as e:
                message = f"Dataset '{path}' is a gated dataset on the Hub."
                if e.response.status_code == 401:
                    message += " You must be authenticated to access it."
                elif e.response.status_code == 403:
                    message += f" Visit the dataset page at https://huggingface.co/datasets/{path} to ask for access."
                raise DatasetNotFoundError(message) from e
            except RevisionNotFoundError as e:
                raise DatasetNotFoundError(
                    f"Revision '{revision}' doesn't exist for dataset '{path}' on the Hub."
                ) from e
            except RepositoryNotFoundError as e:
                raise DatasetNotFoundError(f"Dataset '{path}' doesn't exist on the Hub or cannot be accessed.") from e
            try:
                # Probe for a dataset script in the repository: a successful download means one exists.
                api.hf_hub_download(
                    repo_id=path,
                    filename=filename,
                    repo_type="dataset",
                    revision=commit_hash,
                    proxies=download_config.proxies,
                )
                raise RuntimeError(f"Dataset scripts are no longer supported, but found {filename}")
            except EntryNotFoundError:
                # No script found: this is the supported path.
                # Use the infos from the parquet export except in some cases:
                if data_dir or data_files or (revision and revision != "main"):
                    use_exported_dataset_infos = False
                else:
                    use_exported_dataset_infos = True
                return HubDatasetModuleFactory(
                    path,
                    commit_hash=commit_hash,
                    data_dir=data_dir,
                    data_files=data_files,
                    download_config=download_config,
                    download_mode=download_mode,
                    use_exported_dataset_infos=use_exported_dataset_infos,
                ).get_module()
            except GatedRepoError as e:
                message = f"Dataset '{path}' is a gated dataset on the Hub."
                if e.response.status_code == 401:
                    message += " You must be authenticated to access it."
                elif e.response.status_code == 403:
                    message += f" Visit the dataset page at https://huggingface.co/datasets/{path} to ask for access."
                raise DatasetNotFoundError(message) from e
            except RevisionNotFoundError as e:
                raise DatasetNotFoundError(
                    f"Revision '{revision}' doesn't exist for dataset '{path}' on the Hub."
                ) from e
        except Exception as e1:
            # All the attempts failed, before raising the error we should check if the module is already cached
            try:
                return CachedDatasetModuleFactory(path, cache_dir=cache_dir).get_module()
            except Exception:
                # If it's not in the cache, then it doesn't exist.
                # The cache failure itself is discarded (`from None`); the Hub error `e1` is what gets reported.
                if isinstance(e1, OfflineModeIsEnabled):
                    raise ConnectionError(f"Couldn't reach the Hugging Face Hub for dataset '{path}': {e1}") from None
                if isinstance(e1, (DataFilesNotFoundError, DatasetNotFoundError, EmptyDatasetError)):
                    raise e1 from None
                if isinstance(e1, FileNotFoundError):
                    raise FileNotFoundError(
                        f"Couldn't find any data file at {relative_to_absolute_path(path)}. "
                        f"Couldn't find '{path}' on the Hugging Face Hub either: {type(e1).__name__}: {e1}"
                    ) from None
                raise e1 from None
    else:
        raise FileNotFoundError(f"Couldn't find any data file at {relative_to_absolute_path(path)}.")
|
| 1039 |
+
|
| 1040 |
+
|
| 1041 |
+
def load_dataset_builder(
    path: str,
    name: Optional[str] = None,
    data_dir: Optional[str] = None,
    data_files: Optional[Union[str, Sequence[str], Mapping[str, Union[str, Sequence[str]]]]] = None,
    cache_dir: Optional[str] = None,
    features: Optional[Features] = None,
    download_config: Optional[DownloadConfig] = None,
    download_mode: Optional[Union[DownloadMode, str]] = None,
    revision: Optional[Union[str, Version]] = None,
    token: Optional[Union[bool, str]] = None,
    storage_options: Optional[dict] = None,
    **config_kwargs,
) -> DatasetBuilder:
    """Create the dataset builder for `path` without building the dataset.

    The returned builder can be used to:

    - Inspect general information that is required to build a dataset (cache directory, config, dataset info, features, data files, etc.)
    - Download and prepare the dataset as Arrow files in the cache
    - Get a streaming dataset without downloading or caching anything

    The list of datasets is available on the [Hub](https://huggingface.co/datasets) or through [`huggingface_hub.list_datasets`].

    A dataset is a directory that contains some data files in generic formats (JSON, CSV, Parquet, etc.) and possibly
    in a generic structure (Webdataset, ImageFolder, AudioFolder, VideoFolder, etc.)

    Args:

        path (`str`):
            Path or name of the dataset.

            - a dataset repository on the HF hub (list all available datasets with [`huggingface_hub.list_datasets`])
              -> the builder is chosen from the supported files in the repository (csv, json, parquet, etc.)
              e.g. `'username/dataset_name'`, a dataset repository on the HF hub containing the data files.

            - a local directory
              -> the builder is chosen from the supported files in the directory (csv, json, parquet, etc.)
              e.g. `'./path/to/directory/with/my/csv/data'`.

            - the name of a dataset builder, together with `data_files` or `data_dir`
              (available builders are "json", "csv", "parquet", "arrow", "text", "xml", "webdataset", "imagefolder", "audiofolder", "videofolder")
              -> the builder is loaded from the files in `data_files` or `data_dir`
              e.g. `'parquet'`.

        name (`str`, *optional*):
            Name of the dataset configuration.
        data_dir (`str`, *optional*):
            The `data_dir` of the dataset configuration. For the generic builders (csv, text etc.) or the Hub datasets, when `data_files` is `None`,
            this behaves like passing `os.path.join(data_dir, **)` as `data_files`, i.e. it references all the files in the directory.
        data_files (`str` or `Sequence` or `Mapping`, *optional*):
            Path(s) to source data file(s).
        cache_dir (`str`, *optional*):
            Directory to read/write data. Defaults to `"~/.cache/huggingface/datasets"`.
        features ([`Features`], *optional*):
            Features type to use for this dataset.
        download_config ([`DownloadConfig`], *optional*):
            Specific download configuration parameters.
        download_mode ([`DownloadMode`] or `str`, defaults to `REUSE_DATASET_IF_EXISTS`):
            Download/generate mode.
        revision ([`Version`] or `str`, *optional*):
            Version of the dataset to load.
            As datasets have their own git repository on the Datasets Hub, the default version "main" corresponds to their "main" branch.
            A commit SHA or a git tag of the dataset repository selects a different version than the default "main".
        token (`str` or `bool`, *optional*):
            Optional string or boolean to use as Bearer token for remote files on the Datasets Hub.
            If `True`, or not specified, will get token from `"~/.huggingface"`.
        storage_options (`dict`, *optional*, defaults to `None`):
            **Experimental**. Key/value pairs to be passed on to the dataset file-system backend, if any.

            <Added version="2.11.0"/>

        **config_kwargs (additional keyword arguments):
            Keyword arguments forwarded to the [`BuilderConfig`]
            and used in the [`DatasetBuilder`].

    Returns:
        [`DatasetBuilder`]

    Raises:
        ValueError: If `path` names a packaged builder and no data files could be determined.

    Example:

    ```py
    >>> from datasets import load_dataset_builder
    >>> ds_builder = load_dataset_builder('cornell-movie-review-data/rotten_tomatoes')
    >>> ds_builder.info.features
    {'label': ClassLabel(names=['neg', 'pos']),
     'text': Value('string')}
    ```
    """
    download_mode = DownloadMode(download_mode or DownloadMode.REUSE_DATASET_IF_EXISTS)

    # Never modify the caller's DownloadConfig: work on a copy (or a fresh one) before editing it.
    if token is not None:
        if download_config:
            download_config = download_config.copy()
        else:
            download_config = DownloadConfig()
        download_config.token = token
    if storage_options is not None:
        if download_config:
            download_config = download_config.copy()
        else:
            download_config = DownloadConfig()
        download_config.storage_options.update(storage_options)

    if features is not None:
        features = _fix_for_backward_compatible_features(features)

    dataset_module = dataset_module_factory(
        path,
        revision=revision,
        download_config=download_config,
        download_mode=download_mode,
        data_dir=data_dir,
        data_files=data_files,
        cache_dir=cache_dir,
    )

    # Values resolved by the module take precedence over the arguments; they are popped
    # (in place) so that they are not passed twice to the builder constructor below.
    module_kwargs = dataset_module.builder_kwargs
    data_dir = module_kwargs.pop("data_dir", data_dir)
    data_files = module_kwargs.pop("data_files", data_files)
    fallback_config_name = name or dataset_module.builder_configs_parameters.default_config_name
    config_name = module_kwargs.pop("config_name", fallback_config_name)
    dataset_name = module_kwargs.pop("dataset_name", None)

    if dataset_module.dataset_infos:
        info = dataset_module.dataset_infos.get(config_name)
    else:
        info = None

    packaged_builder_without_files = (
        path in _PACKAGED_DATASETS_MODULES
        and data_files is None
        and dataset_module.builder_configs_parameters.builder_configs[0].data_files is None
    )
    if packaged_builder_without_files:
        error_msg = f"Please specify the data files or data directory to load for the {path} dataset builder."
        # NOTE(review): this comparison assumes the mapping's values are bare builder names - confirm.
        matching_extensions = [
            candidate for candidate in _EXTENSION_TO_MODULE if _EXTENSION_TO_MODULE[candidate] == path
        ]
        if matching_extensions:
            error_msg += f'\nFor example `data_files={{"train": "path/to/data/train/*.{matching_extensions[0]}"}}`'
        raise ValueError(error_msg)

    # Get dataset builder class
    builder_cls = get_dataset_builder_class(dataset_module, dataset_name=dataset_name)

    # Instantiate the dataset builder
    builder_instance: DatasetBuilder = builder_cls(
        cache_dir=cache_dir,
        dataset_name=dataset_name,
        config_name=config_name,
        data_dir=data_dir,
        data_files=data_files,
        hash=dataset_module.hash,
        info=info,
        features=features,
        token=token,
        storage_options=storage_options,
        **module_kwargs,
        **config_kwargs,
    )
    builder_instance._use_legacy_cache_dir_if_possible(dataset_module)

    return builder_instance
|
| 1188 |
+
|
| 1189 |
+
|
| 1190 |
+
def load_dataset(
    path: str,
    name: Optional[str] = None,
    data_dir: Optional[str] = None,
    data_files: Optional[Union[str, Sequence[str], Mapping[str, Union[str, Sequence[str]]]]] = None,
    split: Optional[Union[str, Split, list[str], list[Split]]] = None,
    cache_dir: Optional[str] = None,
    features: Optional[Features] = None,
    download_config: Optional[DownloadConfig] = None,
    download_mode: Optional[Union[DownloadMode, str]] = None,
    verification_mode: Optional[Union[VerificationMode, str]] = None,
    keep_in_memory: Optional[bool] = None,
    save_infos: bool = False,
    revision: Optional[Union[str, Version]] = None,
    token: Optional[Union[bool, str]] = None,
    streaming: bool = False,
    num_proc: Optional[int] = None,
    storage_options: Optional[dict] = None,
    **config_kwargs,
) -> Union[DatasetDict, Dataset, IterableDatasetDict, IterableDataset]:
    """Load a dataset from the Hugging Face Hub, or a local dataset.

    You can find the list of datasets on the [Hub](https://huggingface.co/datasets) or with [`huggingface_hub.list_datasets`].

    A dataset is a directory that contains some data files in generic formats (JSON, CSV, Parquet, etc.) and possibly
    in a generic structure (Webdataset, ImageFolder, AudioFolder, VideoFolder, etc.)

    This function does the following under the hood:

        1. Load a dataset builder:

            * Find the most common data format in the dataset and pick its associated builder (JSON, CSV, Parquet, Webdataset, ImageFolder, AudioFolder, etc.)
            * Find which file goes into which split (e.g. train/test) based on file and directory names or on the YAML configuration
            * It is also possible to specify `data_files` manually, and which dataset builder to use (e.g. "parquet").

        2. Run the dataset builder:

            In the general case:

            * Download the data files from the dataset if they are not already available locally or cached.
            * Process the dataset and cache it as typed Arrow tables.

                Arrow tables are arbitrarily long, typed tables which can store nested objects and be mapped to numpy/pandas/python generic types.
                They can be directly accessed from disk, loaded in RAM or even streamed over the web.

            In the streaming case:

            * Don't download or cache anything. Instead, the dataset is lazily loaded and will be streamed on-the-fly when iterating on it.

        3. Return a dataset built from the requested splits in `split` (default: all).

    Args:

        path (`str`):
            Path or name of the dataset.

            - if `path` is a dataset repository on the HF hub (list all available datasets with [`huggingface_hub.list_datasets`])
              -> load the dataset from supported files in the repository (csv, json, parquet, etc.)
              e.g. `'username/dataset_name'`, a dataset repository on the HF hub containing the data files.

            - if `path` is a local directory
              -> load the dataset from supported files in the directory (csv, json, parquet, etc.)
              e.g. `'./path/to/directory/with/my/csv/data'`.

            - if `path` is the name of a dataset builder and `data_files` or `data_dir` is specified
              (available builders are "json", "csv", "parquet", "arrow", "text", "xml", "webdataset", "imagefolder", "audiofolder", "videofolder")
              -> load the dataset from the files in `data_files` or `data_dir`
              e.g. `'parquet'`.

        name (`str`, *optional*):
            Defining the name of the dataset configuration.
        data_dir (`str`, *optional*):
            Defining the `data_dir` of the dataset configuration. If specified for the generic builders (csv, text etc.) or the Hub datasets and `data_files` is `None`,
            the behavior is equal to passing `os.path.join(data_dir, **)` as `data_files` to reference all the files in a directory.
        data_files (`str` or `Sequence` or `Mapping`, *optional*):
            Path(s) to source data file(s). An empty value (e.g. `[]` or `{}`) is rejected with a `ValueError`;
            pass `None` (default) instead.
        split (`Split` or `str`):
            Which split of the data to load.
            If `None`, will return a `dict` with all splits (typically `datasets.Split.TRAIN` and `datasets.Split.TEST`).
            If given, will return a single Dataset.
            Splits can be combined and specified like in tensorflow-datasets.
        cache_dir (`str`, *optional*):
            Directory to read/write data. Defaults to `"~/.cache/huggingface/datasets"`.
        features (`Features`, *optional*):
            Set the features type to use for this dataset.
        download_config ([`DownloadConfig`], *optional*):
            Specific download configuration parameters.
        download_mode ([`DownloadMode`] or `str`, defaults to `REUSE_DATASET_IF_EXISTS`):
            Download/generate mode.
        verification_mode ([`VerificationMode`] or `str`, defaults to `BASIC_CHECKS`):
            Verification mode determining the checks to run on the downloaded/processed dataset information (checksums/size/splits/...).

            <Added version="2.9.1"/>
        keep_in_memory (`bool`, defaults to `None`):
            Whether to copy the dataset in-memory. If `None`, the dataset
            will not be copied in-memory unless explicitly enabled by setting `datasets.config.IN_MEMORY_MAX_SIZE` to
            nonzero. See more details in the [improve performance](../cache#improve-performance) section.
        save_infos (`bool`, defaults to `False`):
            If `True`, the value passed as `verification_mode` is ignored and `ALL_CHECKS` is used instead.
        revision ([`Version`] or `str`, *optional*):
            Version of the dataset to load.
            As datasets have their own git repository on the Datasets Hub, the default version "main" corresponds to their "main" branch.
            You can specify a different version than the default "main" by using a commit SHA or a git tag of the dataset repository.
        token (`str` or `bool`, *optional*):
            Optional string or boolean to use as Bearer token for remote files on the Datasets Hub.
            If `True`, or not specified, will get token from `"~/.huggingface"`.
        streaming (`bool`, defaults to `False`):
            If set to `True`, don't download the data files. Instead, it streams the data progressively while
            iterating on the dataset. An [`IterableDataset`] or [`IterableDatasetDict`] is returned instead in this case.

            Note that streaming works for datasets that use data formats that support being iterated over like txt, csv, jsonl for example.
            Json files may be downloaded completely. Also streaming from remote zip or gzip files is supported but other compressed formats
            like rar and xz are not yet supported. The tgz format doesn't allow streaming.
        num_proc (`int`, *optional*, defaults to `None`):
            Number of processes when downloading and generating the dataset locally.
            Multiprocessing is disabled by default. Cannot be combined with `streaming=True`.

            <Added version="2.7.0"/>
        storage_options (`dict`, *optional*, defaults to `None`):
            **Experimental**. Key/value pairs to be passed on to the dataset file-system backend, if any.

            <Added version="2.11.0"/>
        **config_kwargs (additional keyword arguments):
            Keyword arguments to be passed to the `BuilderConfig`
            and used in the [`DatasetBuilder`].
            A `trust_remote_code` entry is removed before forwarding; if it is truthy an error is logged.

    Returns:
        [`Dataset`] or [`DatasetDict`]:
        - if `split` is not `None`: the dataset requested,
        - if `split` is `None`, a [`~datasets.DatasetDict`] with each split.

        or [`IterableDataset`] or [`IterableDatasetDict`]: if `streaming=True`

        - if `split` is not `None`, the dataset is requested
        - if `split` is `None`, a [`~datasets.streaming.IterableDatasetDict`] with each split.

    Raises:
        ValueError: If `data_files` is empty but not `None`, or if `path` contains the state file
            written by `save_to_disk` (use `load_from_disk` for those directories).
        NotImplementedError: If `streaming=True` is combined with a non-`None` `num_proc`.

    Example:

    Load a dataset from the Hugging Face Hub:

    ```py
    >>> from datasets import load_dataset
    >>> ds = load_dataset('cornell-movie-review-data/rotten_tomatoes', split='train')

    # Load a subset or dataset configuration (here 'sst2')
    >>> from datasets import load_dataset
    >>> ds = load_dataset('nyu-mll/glue', 'sst2', split='train')

    # Manual mapping of data files to splits
    >>> data_files = {'train': 'train.csv', 'test': 'test.csv'}
    >>> ds = load_dataset('namespace/your_dataset_name', data_files=data_files)

    # Manual selection of a directory to load
    >>> ds = load_dataset('namespace/your_dataset_name', data_dir='folder_name')
    ```

    Load a local dataset:

    ```py
    # Load a CSV file
    >>> from datasets import load_dataset
    >>> ds = load_dataset('csv', data_files='path/to/local/my_dataset.csv')

    # Load a JSON file
    >>> from datasets import load_dataset
    >>> ds = load_dataset('json', data_files='path/to/local/my_dataset.json')
    ```

    Load an [`~datasets.IterableDataset`]:

    ```py
    >>> from datasets import load_dataset
    >>> ds = load_dataset('cornell-movie-review-data/rotten_tomatoes', split='train', streaming=True)
    ```

    Load an image dataset with the `ImageFolder` dataset builder:

    ```py
    >>> from datasets import load_dataset
    >>> ds = load_dataset('imagefolder', data_dir='/path/to/images', split='train')
    ```
    """
    # The key is always stripped so it never reaches the builder; only a truthy value is reported.
    if config_kwargs.pop("trust_remote_code", None):
        logger.error(
            "`trust_remote_code` is not supported anymore.\n"
            f"Please check that the Hugging Face dataset '{path}' isn't based on a loading script and remove `trust_remote_code`.\n"
            "If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet."
        )

    # --- Argument validation -------------------------------------------------
    if data_files is not None and not data_files:
        raise ValueError(f"Empty 'data_files': '{data_files}'. It should be either non-empty or None (default).")

    saved_state_file = Path(path, config.DATASET_STATE_JSON_FILENAME)
    if saved_state_file.exists():
        raise ValueError(
            "You are trying to load a dataset that was saved using `save_to_disk`. "
            "Please use `load_from_disk` instead."
        )

    if streaming and num_proc is not None:
        raise NotImplementedError(
            "Loading a streaming dataset in parallel with `num_proc` is not implemented. "
            "To parallelize streaming, you can wrap the dataset with a PyTorch DataLoader using `num_workers` > 1 instead."
        )

    # --- Normalize the mode arguments ----------------------------------------
    download_mode = DownloadMode(download_mode or DownloadMode.REUSE_DATASET_IF_EXISTS)
    if save_infos:
        # `save_infos` overrides whatever verification mode the caller asked for.
        selected_checks = VerificationMode.ALL_CHECKS
    else:
        selected_checks = verification_mode or VerificationMode.BASIC_CHECKS
    verification_mode = VerificationMode(selected_checks)

    # --- Create the dataset builder ------------------------------------------
    builder_instance = load_dataset_builder(
        path=path,
        name=name,
        data_dir=data_dir,
        data_files=data_files,
        cache_dir=cache_dir,
        features=features,
        download_config=download_config,
        download_mode=download_mode,
        revision=revision,
        token=token,
        storage_options=storage_options,
        **config_kwargs,
    )

    # Streaming: hand back an iterable dataset without downloading or preparing anything.
    if streaming:
        return builder_instance.as_streaming_dataset(split=split)

    # --- Download and prepare the data ---------------------------------------
    builder_instance.download_and_prepare(
        download_config=download_config,
        download_mode=download_mode,
        verification_mode=verification_mode,
        num_proc=num_proc,
        storage_options=storage_options,
    )

    # --- Build the dataset for the requested splits --------------------------
    if keep_in_memory is None:
        # No explicit choice from the caller: decide from the prepared dataset size.
        keep_in_memory = is_small_dataset(builder_instance.info.dataset_size)

    return builder_instance.as_dataset(split=split, verification_mode=verification_mode, in_memory=keep_in_memory)
|
| 1432 |
+
|
| 1433 |
+
|
| 1434 |
+
def load_from_disk(
    dataset_path: PathLike, keep_in_memory: Optional[bool] = None, storage_options: Optional[dict] = None
) -> Union[Dataset, DatasetDict]:
    """
    Loads a dataset that was previously saved using [`~Dataset.save_to_disk`] from a dataset directory, or
    from a filesystem using any implementation of `fsspec.spec.AbstractFileSystem`.

    The kind of object to load is decided from the marker files found in `dataset_path`: a directory holding
    both the dataset info file and the dataset state file is loaded as a [`Dataset`]; otherwise a directory
    holding the dataset-dict JSON file is loaded as a [`DatasetDict`].

    Args:
        dataset_path (`path-like`):
            Path (e.g. `"dataset/train"`) or remote URI (e.g. `"s3://my-bucket/dataset/train"`)
            of the [`Dataset`] or [`DatasetDict`] directory where the dataset/dataset-dict will be
            loaded from.
        keep_in_memory (`bool`, defaults to `None`):
            Whether to copy the dataset in-memory. If `None`, the dataset
            will not be copied in-memory unless explicitly enabled by setting `datasets.config.IN_MEMORY_MAX_SIZE` to
            nonzero. See more details in the [improve performance](../cache#improve-performance) section.

        storage_options (`dict`, *optional*):
            Key/value pairs to be passed on to the file-system backend, if any.

            <Added version="2.9.0"/>

    Returns:
        [`Dataset`] or [`DatasetDict`]:
        - If `dataset_path` is a path of a dataset directory: the dataset requested.
        - If `dataset_path` is a path of a dataset dict directory, a [`DatasetDict`] with each split.

    Raises:
        FileNotFoundError: If `dataset_path` does not exist, or exists but holds neither set of marker files.

    Example:

    ```py
    >>> from datasets import load_from_disk
    >>> ds = load_from_disk('path/to/dataset/directory')
    ```
    """
    fs: fsspec.AbstractFileSystem
    fs, *_ = url_to_fs(dataset_path, **(storage_options or {}))

    def _contains_file(filename) -> bool:
        # Paths are joined POSIX-style, then checked on the resolved filesystem.
        return fs.isfile(posixpath.join(dataset_path, filename))

    if not fs.exists(dataset_path):
        raise FileNotFoundError(f"Directory {dataset_path} not found")

    # A single saved `Dataset` needs both its info file and its state file.
    if _contains_file(config.DATASET_INFO_FILENAME) and _contains_file(config.DATASET_STATE_JSON_FILENAME):
        return Dataset.load_from_disk(dataset_path, keep_in_memory=keep_in_memory, storage_options=storage_options)

    if _contains_file(config.DATASETDICT_JSON_FILENAME):
        return DatasetDict.load_from_disk(dataset_path, keep_in_memory=keep_in_memory, storage_options=storage_options)

    raise FileNotFoundError(
        f"Directory {dataset_path} is neither a `Dataset` directory nor a `DatasetDict` directory."
    )
|
datasets/naming.py
ADDED
|
@@ -0,0 +1,84 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright 2020 The HuggingFace Datasets Authors and the TensorFlow Datasets Authors.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
|
| 15 |
+
# Lint as: python3
|
| 16 |
+
"""Utilities for file names."""
|
| 17 |
+
|
| 18 |
+
import itertools
|
| 19 |
+
import os
|
| 20 |
+
import re
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
# A run of uppercase letters followed by an uppercase letter and a lowercase letter,
# e.g. in "HTTPServer" the groups are "HTTP" and "Se".
_uppercase_uppercase_re = re.compile(r"([A-Z]+)([A-Z][a-z])")
# A lowercase letter or digit immediately followed by an uppercase letter,
# e.g. in "myName" the groups are "y" and "N".
_lowercase_uppercase_re = re.compile(r"([a-z\d])([A-Z])")

# An underscore with no other underscore directly before or after it.
_single_underscore_re = re.compile(r"(?<!_)_(?!_)")
# A run of two or more underscores; the capture group makes `split` keep the run in its output.
_multiple_underscores_re = re.compile(r"(_{2,})")

# Pattern (kept as a plain string, not compiled) for split names:
# one or more word characters, optionally followed by dot-separated groups of word characters.
_split_re = r"^\w+(\.\w+)*$"

# NOTE(review): presumably the characters disallowed in Windows path components;
# not referenced by the functions in this module — confirm against importers.
INVALID_WINDOWS_CHARACTERS_IN_PATH = r"<>:/\|?*"
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def camelcase_to_snakecase(name):
    """Return `name` converted from camel-case to snake-case.

    An underscore is inserted at every acronym/word boundary and at every
    lowercase-or-digit/uppercase boundary, then the result is lowercased.
    """
    # First pass: "HTTPServer" -> "HTTP_Server".
    acronyms_separated = _uppercase_uppercase_re.sub(r"\1_\2", name)
    # Second pass: "myName" -> "my_Name".
    words_separated = _lowercase_uppercase_re.sub(r"\1_\2", acronyms_separated)
    return words_separated.lower()
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def snakecase_to_camelcase(name):
    """Return `name` converted from snake-case to camel-case.

    Single underscores are dropped and the words around them are capitalized.
    Runs of two or more underscores are kept verbatim in the output.
    """
    capitalized = []
    for word in _single_underscore_re.split(name):
        # The capturing pattern keeps multi-underscore runs as their own fragments.
        for fragment in _multiple_underscores_re.split(word):
            if fragment != "":
                capitalized.append(fragment.capitalize())
    return "".join(capitalized)
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
def filename_prefix_for_name(name):
    """Return the snake-case filename prefix for the dataset `name`.

    Raises:
        ValueError: If `name` contains a directory component, i.e. it is a path
            rather than a bare dataset name.
    """
    is_bare_name = os.path.basename(name) == name
    if is_bare_name:
        return camelcase_to_snakecase(name)
    raise ValueError(f"Should be a dataset name, not a path: {name}")
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
def filename_prefix_for_split(name, split):
    """Return the filename prefix ``<snake_case_name>-<split>`` for one split of a dataset.

    Args:
        name: Dataset name. Must be a bare name, not a path.
        split: Split name. Must match the module-level `_split_re` pattern in full.

    Returns:
        The snake-case dataset prefix and the split name joined by a dash.

    Raises:
        ValueError: If `name` contains a directory component, or if `split`
            does not match `_split_re`.
    """
    if os.path.basename(name) != name:
        raise ValueError(f"Should be a dataset name, not a path: {name}")
    # `fullmatch` instead of `match`: the pattern ends in `$`, which also matches just
    # before a trailing newline, so `match` accepted names such as "train\n".
    if not re.fullmatch(_split_re, split):
        raise ValueError(f"Split name should match '{_split_re}' but got '{split}'.")
    return f"{filename_prefix_for_name(name)}-{split}"
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
def filepattern_for_dataset_split(dataset_name, split, data_dir, filetype_suffix=None):
    """Return a glob pattern for the files of one dataset split inside `data_dir`.

    The pattern is the split filename prefix, optionally followed by
    ``.<filetype_suffix>``, joined onto `data_dir` and terminated by ``*``.
    """
    stem = filename_prefix_for_split(dataset_name, split)
    if filetype_suffix:
        stem = stem + f".{filetype_suffix}"
    return f"{os.path.join(data_dir, stem)}*"
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
def filenames_for_dataset_split(path, dataset_name, split, filetype_suffix=None, shard_lengths=None):
    """Return the list of file names for one dataset split located under `path`.

    Without `shard_lengths` (or when it is empty) the list holds a single file name.
    Otherwise it holds one name per shard, each carrying a
    ``-<shard_id>-of-<num_shards>`` marker with both numbers zero-padded to five digits.
    A ``.<filetype_suffix>`` extension is appended to every name when `filetype_suffix` is given.
    """
    base = os.path.join(path, filename_prefix_for_split(dataset_name, split))
    extension = f".{filetype_suffix}" if filetype_suffix else ""

    if not shard_lengths:
        return [base + extension]

    # Only the number of shards matters here, not the individual lengths.
    shard_count = len(shard_lengths)
    return [f"{base}-{shard_index:05d}-of-{shard_count:05d}{extension}" for shard_index in range(shard_count)]
|
datasets/search.py
ADDED
|
@@ -0,0 +1,785 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import importlib.util
|
| 2 |
+
import os
|
| 3 |
+
import tempfile
|
| 4 |
+
from pathlib import PurePath
|
| 5 |
+
from typing import TYPE_CHECKING, NamedTuple, Optional, Union
|
| 6 |
+
|
| 7 |
+
import fsspec
|
| 8 |
+
import numpy as np
|
| 9 |
+
|
| 10 |
+
from .features import List
|
| 11 |
+
from .utils import logging
|
| 12 |
+
from .utils import tqdm as hf_tqdm
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
if TYPE_CHECKING:
|
| 16 |
+
from .arrow_dataset import Dataset # noqa: F401
|
| 17 |
+
|
| 18 |
+
try:
|
| 19 |
+
from elasticsearch import Elasticsearch # noqa: F401
|
| 20 |
+
|
| 21 |
+
except ImportError:
|
| 22 |
+
pass
|
| 23 |
+
try:
|
| 24 |
+
import faiss # noqa: F401
|
| 25 |
+
|
| 26 |
+
except ImportError:
|
| 27 |
+
pass
|
| 28 |
+
|
| 29 |
+
# True when the optional `elasticsearch` package can be located by the import system.
_has_elasticsearch = importlib.util.find_spec("elasticsearch") is not None
# True when the optional `faiss` package can be located by the import system.
_has_faiss = importlib.util.find_spec("faiss") is not None


# Module-level logger obtained from the package's own `logging` utilities.
logger = logging.get_logger(__name__)
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
class MissingIndex(Exception):
    # NOTE(review): not raised in this part of the file — presumably signals that a
    # requested index has not been attached; confirm against the raising code.
    pass
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
class SearchResults(NamedTuple):
    """Result of a single-query search: the scores and indices of the retrieved examples."""

    # Retrieval score of each retrieved example.
    scores: list[float]
    # Index of each retrieved example.
    indices: list[int]
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
class BatchedSearchResults(NamedTuple):
    """Result of a batched search: one list of scores and one list of indices per query."""

    # For each query, the retrieval scores of its retrieved examples.
    total_scores: list[list[float]]
    # For each query, the indices of its retrieved examples.
    total_indices: list[list[int]]
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
class NearestExamplesResults(NamedTuple):
    # NOTE(review): not constructed in this part of the file — presumably the scores and
    # the matching examples for a single query; confirm the layout of `examples` against callers.
    scores: list[float]
    examples: dict
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
class BatchedNearestExamplesResults(NamedTuple):
    # NOTE(review): not constructed in this part of the file — presumably the per-query
    # counterpart of `NearestExamplesResults`; confirm against callers.
    total_scores: list[list[float]]
    total_examples: list[dict]
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
class BaseIndex:
    """Base class for indexing.

    Subclasses implement `search`, `save` and `load`; `search_batch` has a default
    implementation that calls `search` once per query.
    """

    def search(self, query, k: int = 10, **kwargs) -> SearchResults:
        """
        To implement.
        This method has to return the scores and the indices of the retrieved examples given a certain query.
        """
        raise NotImplementedError

    def search_batch(self, queries, k: int = 10, **kwargs) -> BatchedSearchResults:
        """Find the nearest examples indices to the query.

        Args:
            queries (`Union[List[str], np.ndarray]`): The queries as a list of strings if `column` is a text index or as a numpy array if `column` is a vector index.
            k (`int`): The number of examples to retrieve per query.
            **kwargs: Extra keyword arguments forwarded unchanged to every `search` call.

        Output:
            total_scores (`List[List[float]]`): The retrieval scores of the retrieved examples per query.
            total_indices (`List[List[int]]`): The indices of the retrieved examples per query.
        """
        total_scores, total_indices = [], []
        for query in queries:
            # Forward the extra keyword arguments: they used to be accepted here but
            # silently dropped, unlike in `search` itself.
            scores, indices = self.search(query, k, **kwargs)
            total_scores.append(scores)
            total_indices.append(indices)
        return BatchedSearchResults(total_scores, total_indices)

    def save(self, file: Union[str, PurePath]):
        """Serialize the index on disk"""
        raise NotImplementedError

    @classmethod
    def load(cls, file: Union[str, PurePath]) -> "BaseIndex":
        """Deserialize the index from disk"""
        raise NotImplementedError
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
class ElasticSearchIndex(BaseIndex):
|
| 99 |
+
"""
|
| 100 |
+
Sparse index using Elasticsearch. It is used to index text and run queries based on BM25 similarity.
|
| 101 |
+
An Elasticsearch server needs to be accessible, and a python client is declared with
|
| 102 |
+
```
|
| 103 |
+
es_client = Elasticsearch([{'host': 'localhost', 'port': '9200'}])
|
| 104 |
+
```
|
| 105 |
+
for example.
|
| 106 |
+
"""
|
| 107 |
+
|
| 108 |
+
def __init__(
|
| 109 |
+
self,
|
| 110 |
+
host: Optional[str] = None,
|
| 111 |
+
port: Optional[int] = None,
|
| 112 |
+
es_client: Optional["Elasticsearch"] = None,
|
| 113 |
+
es_index_name: Optional[str] = None,
|
| 114 |
+
es_index_config: Optional[dict] = None,
|
| 115 |
+
):
|
| 116 |
+
if not _has_elasticsearch:
|
| 117 |
+
raise ImportError(
|
| 118 |
+
"You must install ElasticSearch to use ElasticSearchIndex. To do so you can run `pip install elasticsearch==7.7.1 for example`"
|
| 119 |
+
)
|
| 120 |
+
if es_client is not None and (host is not None or port is not None):
|
| 121 |
+
raise ValueError("Please specify either `es_client` or `(host, port)`, but not both.")
|
| 122 |
+
host = host or "localhost"
|
| 123 |
+
port = port or 9200
|
| 124 |
+
|
| 125 |
+
import elasticsearch.helpers # noqa: F401 - need this to properly load all the es features
|
| 126 |
+
from elasticsearch import Elasticsearch # noqa: F811
|
| 127 |
+
|
| 128 |
+
self.es_client = es_client if es_client is not None else Elasticsearch([{"host": host, "port": str(port)}])
|
| 129 |
+
self.es_index_name = (
|
| 130 |
+
es_index_name
|
| 131 |
+
if es_index_name is not None
|
| 132 |
+
else "huggingface_datasets_" + os.path.basename(tempfile.NamedTemporaryFile().name)
|
| 133 |
+
)
|
| 134 |
+
self.es_index_config = (
|
| 135 |
+
es_index_config
|
| 136 |
+
if es_index_config is not None
|
| 137 |
+
else {
|
| 138 |
+
"settings": {
|
| 139 |
+
"number_of_shards": 1,
|
| 140 |
+
"analysis": {"analyzer": {"stop_standard": {"type": "standard", " stopwords": "_english_"}}},
|
| 141 |
+
},
|
| 142 |
+
"mappings": {"properties": {"text": {"type": "text", "analyzer": "standard", "similarity": "BM25"}}},
|
| 143 |
+
}
|
| 144 |
+
)
|
| 145 |
+
|
| 146 |
+
def add_documents(self, documents: Union[list[str], "Dataset"], column: Optional[str] = None):
|
| 147 |
+
"""
|
| 148 |
+
Add documents to the index.
|
| 149 |
+
If the documents are inside a certain column, you can specify it using the `column` argument.
|
| 150 |
+
"""
|
| 151 |
+
index_name = self.es_index_name
|
| 152 |
+
index_config = self.es_index_config
|
| 153 |
+
self.es_client.indices.create(index=index_name, body=index_config)
|
| 154 |
+
number_of_docs = len(documents)
|
| 155 |
+
progress = hf_tqdm(unit="docs", total=number_of_docs)
|
| 156 |
+
successes = 0
|
| 157 |
+
|
| 158 |
+
def passage_generator():
|
| 159 |
+
if column is not None:
|
| 160 |
+
for i, example in enumerate(documents):
|
| 161 |
+
yield {"text": example[column], "_id": i}
|
| 162 |
+
else:
|
| 163 |
+
for i, example in enumerate(documents):
|
| 164 |
+
yield {"text": example, "_id": i}
|
| 165 |
+
|
| 166 |
+
# create the ES index
|
| 167 |
+
import elasticsearch as es
|
| 168 |
+
|
| 169 |
+
for ok, action in es.helpers.streaming_bulk(
|
| 170 |
+
client=self.es_client,
|
| 171 |
+
index=index_name,
|
| 172 |
+
actions=passage_generator(),
|
| 173 |
+
):
|
| 174 |
+
progress.update(1)
|
| 175 |
+
successes += ok
|
| 176 |
+
if successes != len(documents):
|
| 177 |
+
logger.warning(
|
| 178 |
+
f"Some documents failed to be added to ElasticSearch. Failures: {len(documents) - successes}/{len(documents)}"
|
| 179 |
+
)
|
| 180 |
+
logger.info(f"Indexed {successes:d} documents")
|
| 181 |
+
|
| 182 |
+
def search(self, query: str, k=10, **kwargs) -> SearchResults:
    """Query the ElasticSearch index and return the best matching documents.

    Args:
        query (`str`): The text to search for.
        k (`int`): Maximum number of hits to request.
        **kwargs: Forwarded to the ElasticSearch client's `search` call.

    Output:
        scores: The `_score` of each returned hit.
        indices: The `_id` of each returned hit, converted to `int`.
    """
    text_match = {"multi_match": {"query": query, "fields": ["text"], "type": "cross_fields"}}
    request_body = {"query": text_match, "size": k}
    reply = self.es_client.search(index=self.es_index_name, body=request_body, **kwargs)
    found = reply["hits"]["hits"]
    hit_scores = [entry["_score"] for entry in found]
    hit_ids = [int(entry["_id"]) for entry in found]
    return SearchResults(hit_scores, hit_ids)
|
| 200 |
+
|
| 201 |
+
def search_batch(self, queries, k: int = 10, max_workers=10, **kwargs) -> BatchedSearchResults:
    """Run `search` for every query concurrently using a thread pool.

    Args:
        queries: The queries to run; results are returned in the same order.
        k (`int`): Number of hits to request per query.
        max_workers (`int`): Size of the thread pool.
        **kwargs: Forwarded to `search` for each query.

    Output:
        total_scores: The scores returned by `search` for each query.
        total_indices: The indices returned by `search` for each query.
    """
    from concurrent.futures import ThreadPoolExecutor, as_completed

    n_queries = len(queries)
    scores_by_query = [None] * n_queries
    indices_by_query = [None] * n_queries
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        # Remember which query position every submitted task belongs to.
        position_of = {}
        for position, single_query in enumerate(queries):
            position_of[pool.submit(self.search, single_query, k, **kwargs)] = position
        # Collect results as they finish and store them at their original position.
        for finished in as_completed(position_of):
            position = position_of[finished]
            outcome: SearchResults = finished.result()
            scores_by_query[position] = outcome.scores
            indices_by_query[position] = outcome.indices
    return BatchedSearchResults(total_indices=indices_by_query, total_scores=scores_by_query)
|
| 213 |
+
|
| 214 |
+
|
| 215 |
+
class FaissIndex(BaseIndex):
    """
    Vector (dense) index backed by Faiss.

    Faiss is a library for efficient similarity search and clustering of dense vectors.
    Its algorithms search in sets of vectors of any size, up to ones that possibly do not fit in RAM.
    Further reading:
    - Index types and the string factory: https://github.com/facebookresearch/faiss/wiki/The-index-factory
    - GPU settings: https://github.com/facebookresearch/faiss/wiki/Faiss-on-the-GPU
    """

    def __init__(
        self,
        device: Optional[Union[int, list[int]]] = None,
        string_factory: Optional[str] = None,
        metric_type: Optional[int] = None,
        custom_index: Optional["faiss.Index"] = None,
    ):
        """
        Store the configuration of the Faiss index; the index itself is built lazily in `add_vectors`
        unless `custom_index` is supplied.

        `device` selects the GPU(s) to use (see `_faiss_index_to_device`), `None` keeps the index on CPU.
        `custom_index` is mutually exclusive with both `string_factory` and `device`.
        For `string_factory` see https://github.com/facebookresearch/faiss/wiki/The-index-factory

        Raises:
            ValueError: If `custom_index` is combined with `string_factory` or `device`.
            ImportError: If Faiss is not installed.
        """
        if custom_index is not None:
            if string_factory is not None:
                raise ValueError("Please specify either `string_factory` or `custom_index` but not both.")
            if device is not None:
                raise ValueError(
                    "Cannot pass both 'custom_index' and 'device'. "
                    "Pass 'custom_index' already transferred to the target device instead."
                )
        self.device = device
        self.string_factory = string_factory
        self.metric_type = metric_type
        self.faiss_index = custom_index
        if _has_faiss:
            return
        raise ImportError(
            "You must install Faiss to use FaissIndex. To do so you can run `conda install -c pytorch faiss-cpu` or `conda install -c pytorch faiss-gpu`. "
            "A community supported package is also available on pypi: `pip install faiss-cpu` or `pip install faiss-gpu`. "
            "Note that pip may not have the latest version of FAISS, and thus, some of the latest features and bug fixes may not be available."
        )

    def add_vectors(
        self,
        vectors: Union[np.array, "Dataset"],
        column: Optional[str] = None,
        batch_size: int = 1000,
        train_size: Optional[int] = None,
        faiss_verbose: Optional[bool] = None,
    ):
        """
        Build the index if needed, optionally train it, then add `vectors` in batches.

        When `column` is given, the vectors are read from that column of `vectors`.
        The first `train_size` vectors are used for training; training is skipped when `train_size` is None.
        `faiss_verbose`, when not None, is propagated to the index and its nested components.
        """
        import faiss  # noqa: F811

        if column:
            feature = vectors.features[column]
            if not isinstance(feature, List):
                raise ValueError(
                    f"Wrong feature type for column '{column}'. Expected 1d array, got {vectors.features[column]}"
                )

        def take(window):
            # Slice the input and, for datasets, pick the requested column out of the slice.
            chunk = vectors[window]
            return chunk if column is None else chunk[column]

        # Lazily create the index, inferring the dimension from the first vector.
        if self.faiss_index is None:
            first = vectors[0]
            dim = len(first) if column is None else len(first[column])
            # The metric is only passed positionally when the user specified one.
            metric_args = () if self.metric_type is None else (self.metric_type,)
            if self.string_factory is None:
                cpu_index = faiss.IndexFlat(dim, *metric_args)
            else:
                cpu_index = faiss.index_factory(dim, self.string_factory, *metric_args)

            self.faiss_index = self._faiss_index_to_device(cpu_index, self.device)
            logger.info(f"Created faiss index of type {type(self.faiss_index)}")

        # Propagate verbosity to the index and to whichever nested components it exposes.
        if faiss_verbose is not None:
            self.faiss_index.verbose = faiss_verbose
            for component_name in ("index", "quantizer", "clustering_index"):
                component = getattr(self.faiss_index, component_name, None)
                if component is not None:
                    component.verbose = faiss_verbose

        # Optional training step.
        if train_size is None:
            logger.info("Ignored the training step of the faiss index as `train_size` is None.")
        else:
            train_vecs = take(slice(None, train_size))
            logger.info(f"Training the index with the first {len(train_vecs)} vectors")
            self.faiss_index.train(train_vecs)

        # Feed the vectors batch by batch.
        logger.info(f"Adding {len(vectors)} vectors to the faiss index")
        for start in hf_tqdm(range(0, len(vectors), batch_size)):
            self.faiss_index.add(take(slice(start, start + batch_size)))

    @staticmethod
    def _faiss_index_to_device(index: "faiss.Index", device: Optional[Union[int, list[int]]] = None) -> "faiss.Index":
        """
        Move a faiss index to the requested device and return it.

        `device` may be `None` (leave the index untouched, CPU), a non-negative integer (that GPU id),
        a negative integer (all GPUs), or a list/tuple of GPU ids.

        Raises:
            TypeError: If `device` is none of the above.
        """
        # Nothing to do for CPU.
        if device is None:
            return index

        import faiss  # noqa: F811

        if isinstance(device, int):
            if device >= 0:
                # A single GPU, addressed by its id.
                gpu_resources = faiss.StandardGpuResources()
                return faiss.index_cpu_to_gpu(gpu_resources, device, index)
            # Negative ids mean "use every GPU".
            return faiss.index_cpu_to_all_gpus(index)

        if isinstance(device, (list, tuple)):
            # Explicit selection of GPUs.
            return faiss.index_cpu_to_gpus_list(index, gpus=list(device))

        raise TypeError(
            f"The argument type: {type(device)} is not expected. "
            + "Please pass in either nothing, a positive int, a negative int, or a list of positive ints."
        )

    def search(self, query: np.array, k=10, **kwargs) -> SearchResults:
        """Search the index for the vectors closest to a single query.

        Args:
            query (`np.array`): The query vector, shaped `(N,)` or `(1, N)`.
            k (`int`): The number of examples to retrieve.

        Output:
            scores: The retrieval scores of the retrieved examples (first row of the Faiss result).
            indices: The indices of the retrieved examples, cast to `int`.

        Raises:
            ValueError: If `query` is neither 1D nor 2D with a single row.
        """
        rank = len(query.shape)
        if not (rank == 1 or (rank == 2 and query.shape[0] == 1)):
            raise ValueError("Shape of query is incorrect, it has to be either a 1D array or 2D (1, N)")

        as_batch = query.reshape(1, -1)
        # Make the buffer C-contiguous before handing it to Faiss.
        if not as_batch.flags.c_contiguous:
            as_batch = np.asarray(as_batch, order="C")
        found_scores, found_ids = self.faiss_index.search(as_batch, k, **kwargs)
        return SearchResults(found_scores[0], found_ids[0].astype(int))

    def search_batch(self, queries: np.array, k=10, **kwargs) -> BatchedSearchResults:
        """Search the index for the vectors closest to each query of a batch.

        Args:
            queries (`np.array`): The queries as a 2D numpy array, one query per row.
            k (`int`): The number of examples to retrieve per query.

        Output:
            total_scores: The retrieval scores of the retrieved examples per query.
            total_indices: The indices of the retrieved examples per query, cast to `int`.

        Raises:
            ValueError: If `queries` is not 2D.
        """
        if len(queries.shape) != 2:
            raise ValueError("Shape of query must be 2D")
        # Make the buffer C-contiguous before handing it to Faiss.
        if not queries.flags.c_contiguous:
            queries = np.asarray(queries, order="C")
        found_scores, found_ids = self.faiss_index.search(queries, k, **kwargs)
        return BatchedSearchResults(found_scores, found_ids.astype(int))

    def save(self, file: Union[str, PurePath], storage_options: Optional[dict] = None):
        """Write the Faiss index to `file` (opened through fsspec with `storage_options`)."""
        import faiss  # noqa: F811

        # An index that was sent to GPU(s) is brought back to CPU before being written.
        lives_on_gpu = isinstance(self.device, (int, list, tuple))
        serializable = faiss.index_gpu_to_cpu(self.faiss_index) if lives_on_gpu else self.faiss_index

        with fsspec.open(str(file), "wb", **(storage_options or {})) as sink:
            writer = faiss.BufferedIOWriter(faiss.PyCallbackIOWriter(sink.write))
            faiss.write_index(serializable, writer)

    @classmethod
    def load(
        cls,
        file: Union[str, PurePath],
        device: Optional[Union[int, list[int]]] = None,
        storage_options: Optional[dict] = None,
    ) -> "FaissIndex":
        """Read a Faiss index from `file` (opened through fsspec) and wrap it, moving it to `device`."""
        import faiss  # noqa: F811

        # A FaissIndex instance is only a thin wrapper around the underlying faiss index.
        wrapper = cls(device=device)
        with fsspec.open(str(file), "rb", **(storage_options or {})) as source:
            reader = faiss.BufferedIOReader(faiss.PyCallbackIOReader(source.read))
            raw_index = faiss.read_index(reader)
        wrapper.faiss_index = wrapper._faiss_index_to_device(raw_index, wrapper.device)
        return wrapper
|
| 415 |
+
|
| 416 |
+
|
| 417 |
+
class IndexableMixin:
    """Mixin that gives `datasets.Dataset` the ability to attach and query search indexes."""

    def __init__(self):
        # Attached indexes, keyed by their user-facing name.
        self._indexes: dict[str, BaseIndex] = {}

    def __len__(self):
        raise NotImplementedError

    def __getitem__(self, key):
        raise NotImplementedError

    def is_index_initialized(self, index_name: str) -> bool:
        """Return whether an index is attached under `index_name`."""
        return index_name in self._indexes

    def _check_index_is_initialized(self, index_name: str):
        """Raise `MissingIndex` unless an index is attached under `index_name`."""
        if self.is_index_initialized(index_name):
            return
        raise MissingIndex(
            f"Index with index_name '{index_name}' not initialized yet. Please make sure that you call `add_faiss_index` or `add_elasticsearch_index` first."
        )

    def list_indexes(self) -> list[str]:
        """List the `index_name`/identifiers of all the attached indexes."""
        return list(self._indexes.keys())

    def get_index(self, index_name: str) -> BaseIndex:
        """Return the index attached under `index_name`.

        Args:
            index_name (`str`): Index name.

        Returns:
            [`BaseIndex`]

        Raises:
            MissingIndex: If no index is attached under `index_name`.
        """
        self._check_index_is_initialized(index_name)
        return self._indexes[index_name]

    def add_faiss_index(
        self,
        column: str,
        index_name: Optional[str] = None,
        device: Optional[Union[int, list[int]]] = None,
        string_factory: Optional[str] = None,
        metric_type: Optional[int] = None,
        custom_index: Optional["faiss.Index"] = None,
        batch_size: int = 1000,
        train_size: Optional[int] = None,
        faiss_verbose: bool = False,
    ):
        """Build a Faiss dense index from the vectors of `column` and attach it.

        More information about the `string factory`:
        https://github.com/facebookresearch/faiss/wiki/The-index-factory

        Args:
            column (`str`): The column holding the vectors to index.
            index_name (Optional `str`): Name under which the index is attached; this is the name to pass
                to `.get_nearest_examples` or `.search`. Falls back to `column` when omitted.
            device (Optional `Union[int, List[int]]`): A non-negative integer selects that GPU, a negative
                integer selects all GPUs, a list selects those GPUs. By default the CPU is used.
            string_factory (Optional `str`): Passed to the Faiss index factory to create the index.
                When omitted, a `faiss.IndexFlat` is created.
            metric_type (Optional `int`): Type of metric. Ex: `faiss.METRIC_INNER_PRODUCT` or `faiss.METRIC_L2`.
            custom_index (Optional `faiss.Index`): An already instantiated and configured Faiss index to use.
            batch_size (Optional `int`): Number of vectors added to the index per batch. Default value is 1000.
                <Added version="2.4.0"/>
            train_size (Optional `int`): Number of leading vectors used to train the index, for indexes that
                need a training step.
            faiss_verbose (`bool`, defaults to False): Enable the verbosity of the Faiss index.
        """
        if index_name is None:
            index_name = column
        dense_index = FaissIndex(
            device=device, string_factory=string_factory, metric_type=metric_type, custom_index=custom_index
        )
        dense_index.add_vectors(
            self, column=column, batch_size=batch_size, train_size=train_size, faiss_verbose=faiss_verbose
        )
        self._indexes[index_name] = dense_index

    def add_faiss_index_from_external_arrays(
        self,
        external_arrays: np.array,
        index_name: str,
        device: Optional[Union[int, list[int]]] = None,
        string_factory: Optional[str] = None,
        metric_type: Optional[int] = None,
        custom_index: Optional["faiss.Index"] = None,
        batch_size: int = 1000,
        train_size: Optional[int] = None,
        faiss_verbose: bool = False,
    ):
        """Build a Faiss dense index from `external_arrays` and attach it.

        More information about the `string factory`:
        https://github.com/facebookresearch/faiss/wiki/The-index-factory

        Args:
            external_arrays (`np.array`): Vectors coming from outside the dataset; they are indexed
                instead of the content of a dataset column.
            index_name (`str`): Name under which the index is attached; this is the name to pass
                to `.get_nearest_examples` or `.search`.
            device (Optional `Union[int, List[int]]`): A non-negative integer selects that GPU, a negative
                integer selects all GPUs, a list selects those GPUs. By default the CPU is used.
            string_factory (Optional `str`): Passed to the Faiss index factory to create the index.
                When omitted, a `faiss.IndexFlat` is created.
            metric_type (Optional `int`): Type of metric. Ex: `faiss.METRIC_INNER_PRODUCT` or `faiss.METRIC_L2`.
            custom_index (Optional `faiss.Index`): An already instantiated and configured Faiss index to use.
            batch_size (Optional `int`): Number of vectors added to the index per batch. Default value is 1000.
                <Added version="2.4.0"/>
            train_size (Optional `int`): Number of leading vectors used to train the index, for indexes that
                need a training step.
            faiss_verbose (`bool`, defaults to False): Enable the verbosity of the Faiss index.
        """
        dense_index = FaissIndex(
            device=device, string_factory=string_factory, metric_type=metric_type, custom_index=custom_index
        )
        dense_index.add_vectors(
            external_arrays, column=None, batch_size=batch_size, train_size=train_size, faiss_verbose=faiss_verbose
        )
        self._indexes[index_name] = dense_index

    def save_faiss_index(self, index_name: str, file: Union[str, PurePath], storage_options: Optional[dict] = None):
        """Serialize an attached FaissIndex.

        Args:
            index_name (`str`): Name under which the index is attached.
            file (`str`): Destination path on disk or remote URI (e.g. `"s3://my-bucket/index.faiss"`).
            storage_options (`dict`, *optional*):
                Key/value pairs to be passed on to the file-system backend, if any.

                <Added version="2.11.0"/>

        Raises:
            MissingIndex: If no index is attached under `index_name`.
            ValueError: If the attached index is not a `FaissIndex`.
        """
        target = self.get_index(index_name)
        if not isinstance(target, FaissIndex):
            index = target  # name kept so the message below is produced exactly as before
            raise ValueError(f"Index '{index_name}' is not a FaissIndex but a '{type(index)}'")
        target.save(file, storage_options=storage_options)
        logger.info(f"Saved FaissIndex {index_name} at {file}")

    def load_faiss_index(
        self,
        index_name: str,
        file: Union[str, PurePath],
        device: Optional[Union[int, list[int]]] = None,
        storage_options: Optional[dict] = None,
    ):
        """Deserialize a FaissIndex and attach it under `index_name`.

        For additional configuration, the underlying faiss object is reachable through
        `.get_index(index_name).faiss_index`.

        Args:
            index_name (`str`): Name under which the index is attached; this is the name to pass
                to `.get_nearest_examples` or `.search`.
            file (`str`): Path of the serialized faiss index on disk or remote URI (e.g. `"s3://my-bucket/index.faiss"`).
            device (Optional `Union[int, List[int]]`): A non-negative integer selects that GPU, a negative
                integer selects all GPUs, a list selects those GPUs. By default the CPU is used.
            storage_options (`dict`, *optional*):
                Key/value pairs to be passed on to the file-system backend, if any.

                <Added version="2.11.0"/>

        Raises:
            ValueError: If the number of vectors in the index differs from the dataset length.
        """
        index = FaissIndex.load(file, device=device, storage_options=storage_options)
        sizes_agree = index.faiss_index.ntotal == len(self)
        if not sizes_agree:
            raise ValueError(
                f"Index size should match Dataset size, but Index '{index_name}' at {file} has {index.faiss_index.ntotal} elements while the dataset has {len(self)} examples."
            )
        self._indexes[index_name] = index
        logger.info(f"Loaded FaissIndex {index_name} from {file}")

    def add_elasticsearch_index(
        self,
        column: str,
        index_name: Optional[str] = None,
        host: Optional[str] = None,
        port: Optional[int] = None,
        es_client: Optional["Elasticsearch"] = None,
        es_index_name: Optional[str] = None,
        es_index_config: Optional[dict] = None,
    ):
        """Build an ElasticSearch text index from the documents of `column` and attach it.

        Args:
            column (`str`): The column holding the documents to index.
            index_name (Optional `str`): Name under which the index is attached; this is the name to pass
                to `.get_nearest_examples` or `.search`. Falls back to `column` when omitted.
            host (Optional `str`, defaults to localhost):
                host of where ElasticSearch is running
            port (Optional `int`, defaults to 9200):
                port of where ElasticSearch is running
            es_client (Optional `elasticsearch.Elasticsearch`):
                The elasticsearch client used to create the index if host and port are None.
            es_index_name (Optional `str`): The elasticsearch index name used to create the index.
            es_index_config (Optional `dict`):
                The configuration of the elasticsearch index.
                Default config is:

        Config::

            {
                "settings": {
                    "number_of_shards": 1,
                    "analysis": {"analyzer": {"stop_standard": {"type": "standard", " stopwords": "_english_"}}},
                },
                "mappings": {
                    "properties": {
                        "text": {
                            "type": "text",
                            "analyzer": "standard",
                            "similarity": "BM25"
                        },
                    }
                },
            }
        """
        if index_name is None:
            index_name = column
        text_index = ElasticSearchIndex(
            host=host, port=port, es_client=es_client, es_index_name=es_index_name, es_index_config=es_index_config
        )
        text_index.add_documents(self, column=column)
        self._indexes[index_name] = text_index

    def load_elasticsearch_index(
        self,
        index_name: str,
        es_index_name: str,
        host: Optional[str] = None,
        port: Optional[int] = None,
        es_client: Optional["Elasticsearch"] = None,
        es_index_config: Optional[dict] = None,
    ):
        """Attach an already existing ElasticSearch text index; no documents are added.

        Args:
            index_name (`str`):
                Name under which the index is attached; this is the name to pass to `get_nearest_examples` or `search`.
            es_index_name (`str`):
                The name of elasticsearch index to load.
            host (`str`, *optional*, defaults to `localhost`):
                Host of where ElasticSearch is running.
            port (`int`, *optional*, defaults to `9200`):
                Port of where ElasticSearch is running.
            es_client (`elasticsearch.Elasticsearch`, *optional*):
                The elasticsearch client used to create the index if host and port are `None`.
            es_index_config (`dict`, *optional*):
                The configuration of the elasticsearch index.
                Default config is:
                    ```
                    {
                        "settings": {
                            "number_of_shards": 1,
                            "analysis": {"analyzer": {"stop_standard": {"type": "standard", " stopwords": "_english_"}}},
                        },
                        "mappings": {
                            "properties": {
                                "text": {
                                    "type": "text",
                                    "analyzer": "standard",
                                    "similarity": "BM25"
                                },
                            }
                        },
                    }
                    ```
        """
        text_index = ElasticSearchIndex(
            host=host, port=port, es_client=es_client, es_index_name=es_index_name, es_index_config=es_index_config
        )
        self._indexes[index_name] = text_index

    def drop_index(self, index_name: str):
        """Detach the index registered under `index_name`.

        Args:
            index_name (`str`):
                The `index_name`/identifier of the index.
        """
        del self._indexes[index_name]

    def search(self, index_name: str, query: Union[str, np.array], k: int = 10, **kwargs) -> SearchResults:
        """Run a single query against the index attached under `index_name`.

        Args:
            index_name (`str`):
                The name/identifier of the index.
            query (`Union[str, np.ndarray]`):
                The query as a string if `index_name` is a text index or as a numpy array if `index_name` is a vector index.
            k (`int`):
                The number of examples to retrieve.

        Returns:
            `(scores, indices)`:
                A tuple of `(scores, indices)` where:
                - **scores**: the retrieval scores, from either FAISS or ElasticSearch, of the retrieved examples
                - **indices**: the indices of the retrieved examples

        Raises:
            MissingIndex: If no index is attached under `index_name`.
        """
        self._check_index_is_initialized(index_name)
        backend = self._indexes[index_name]
        return backend.search(query, k, **kwargs)

    def search_batch(
        self, index_name: str, queries: Union[list[str], np.array], k: int = 10, **kwargs
    ) -> BatchedSearchResults:
        """Run a batch of queries against the index attached under `index_name`.

        Args:
            index_name (`str`):
                The `index_name`/identifier of the index.
            queries (`Union[List[str], np.ndarray]`):
                The queries as a list of strings if `index_name` is a text index or as a numpy array if `index_name` is a vector index.
            k (`int`):
                The number of examples to retrieve per query.

        Returns:
            `(total_scores, total_indices)`:
                A tuple of `(total_scores, total_indices)` where:
                - **total_scores**: the retrieval scores, from either FAISS or ElasticSearch, of the retrieved examples per query
                - **total_indices**: the indices of the retrieved examples per query

        Raises:
            MissingIndex: If no index is attached under `index_name`.
        """
        self._check_index_is_initialized(index_name)
        backend = self._indexes[index_name]
        return backend.search_batch(queries, k, **kwargs)

    def get_nearest_examples(
        self, index_name: str, query: Union[str, np.array], k: int = 10, **kwargs
    ) -> NearestExamplesResults:
        """Run a single query and return the matching dataset examples instead of their indices.

        Negative indices returned by the search are discarded, and the scores are truncated
        to the number of remaining indices.

        Args:
            index_name (`str`):
                The index_name/identifier of the index.
            query (`Union[str, np.ndarray]`):
                The query as a string if `index_name` is a text index or as a numpy array if `index_name` is a vector index.
            k (`int`):
                The number of examples to retrieve.

        Returns:
            `(scores, examples)`:
                A tuple of `(scores, examples)` where:
                - **scores**: the retrieval scores, from either FAISS or ElasticSearch, of the retrieved examples
                - **examples**: the retrieved examples, obtained by indexing the dataset with the kept indices

        Raises:
            MissingIndex: If no index is attached under `index_name`.
        """
        self._check_index_is_initialized(index_name)
        found_scores, found_ids = self.search(index_name, query, k, **kwargs)
        valid_ids = [idx for idx in found_ids if idx >= 0]
        kept_scores = found_scores[: len(valid_ids)]
        return NearestExamplesResults(kept_scores, self[valid_ids])

    def get_nearest_examples_batch(
        self, index_name: str, queries: Union[list[str], np.array], k: int = 10, **kwargs
    ) -> BatchedNearestExamplesResults:
        """Run a batch of queries and return the matching dataset examples instead of their indices.

        For every query, negative indices returned by the search are discarded, and the scores
        are truncated to the number of remaining indices.

        Args:
            index_name (`str`):
                The `index_name`/identifier of the index.
            queries (`Union[List[str], np.ndarray]`):
                The queries as a list of strings if `index_name` is a text index or as a numpy array if `index_name` is a vector index.
            k (`int`):
                The number of examples to retrieve per query.

        Returns:
            `(total_scores, total_examples)`:
                A tuple of `(total_scores, total_examples)` where:
                - **total_scores**: the retrieval scores, from either FAISS or ElasticSearch, of the retrieved examples per query
                - **total_examples**: the retrieved examples per query

        Raises:
            MissingIndex: If no index is attached under `index_name`.
        """
        self._check_index_is_initialized(index_name)
        batch_scores, batch_ids = self.search_batch(index_name, queries, k, **kwargs)

        def keep_valid(ids):
            # Negative ids are dropped; only non-negative ids are used to index the dataset.
            return [idx for idx in ids if idx >= 0]

        trimmed_scores = [
            row_scores[: len(keep_valid(row_ids))] for row_scores, row_ids in zip(batch_scores, batch_ids)
        ]
        examples_per_query = [self[keep_valid(row_ids)] for row_ids in batch_ids]
        return BatchedNearestExamplesResults(trimmed_scores, examples_per_query)
|
datasets/splits.py
ADDED
|
@@ -0,0 +1,635 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright 2020 The HuggingFace Datasets Authors and the TensorFlow Datasets Authors.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
|
| 15 |
+
# Lint as: python3
|
| 16 |
+
"""Splits related API."""
|
| 17 |
+
|
| 18 |
+
import abc
|
| 19 |
+
import collections
|
| 20 |
+
import copy
|
| 21 |
+
import dataclasses
|
| 22 |
+
import re
|
| 23 |
+
from dataclasses import dataclass
|
| 24 |
+
from typing import Optional, Union
|
| 25 |
+
|
| 26 |
+
from .arrow_reader import FileInstructions, make_file_instructions
|
| 27 |
+
from .naming import _split_re
|
| 28 |
+
from .utils.py_utils import NonMutableDict, asdict
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
@dataclass
class SplitInfo:
    """Metadata record describing one dataset split.

    `name`, `num_bytes`, `num_examples` and `dataset_name` carry the
    `include_in_asdict_even_if_is_default` metadata flag; `shard_lengths`
    does not.
    """

    name: str = dataclasses.field(
        default="",
        metadata={"include_in_asdict_even_if_is_default": True},
    )
    num_bytes: int = dataclasses.field(
        default=0,
        metadata={"include_in_asdict_even_if_is_default": True},
    )
    num_examples: int = dataclasses.field(
        default=0,
        metadata={"include_in_asdict_even_if_is_default": True},
    )
    shard_lengths: Optional[list[int]] = None

    # Deprecated.
    # Kept for backward compatibility: files such as dataset_infos.json and
    # dataset_info.json must always contain this field, so it is flagged to be
    # emitted by datasets.utils.py_utils.asdict(split_info) even at its default.
    dataset_name: Optional[str] = dataclasses.field(
        default=None,
        metadata={"include_in_asdict_even_if_is_default": True},
    )

    @property
    def file_instructions(self):
        """Returns the list of dict(filename, take, skip)."""
        # `self.dataset_name` is assigned in `SplitDict.add()`.
        return make_file_instructions(
            name=self.dataset_name,
            split_infos=[self],
            instruction=str(self.name),
        ).file_instructions
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
@dataclass
class SubSplitInfo:
    """Thin wrapper exposing information about a sub-split.

    It is what indexing the splits with an instruction string gives back:
    ```
    ds, info = datasets.load_dataset(..., split='train[75%:]', with_info=True)
    info.splits['train[75%:]'].num_examples
    ```
    """

    # Resolved instructions that both properties below read from.
    instructions: FileInstructions

    @property
    def num_examples(self):
        """Returns the number of examples in the subsplit."""
        resolved = self.instructions
        return resolved.num_examples

    @property
    def file_instructions(self):
        """Returns the list of dict(filename, take, skip)."""
        resolved = self.instructions
        return resolved.file_instructions
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
class SplitBase(metaclass=abc.ABCMeta):
    # pylint: disable=line-too-long
    """Abstract base class for Split compositionality.

    See the
    [guide on splits](../loading#slice-splits)
    for more information.

    Composition happens in three stages:
    1) Splits are composed (defined, merged, split, ...) before calling the
       `.as_dataset()` function, using `__add__` and `subsplit`. The result is
       a tree of `SplitBase` objects whose leaves are `NamedSplit` objects.

    ```
    split = datasets.Split.TRAIN + datasets.Split.TEST.subsplit(datasets.percent[:50])
    ```

    2) The `SplitBase` is forwarded to the `.as_dataset()` function to be
       resolved into actual read instructions. This is done by
       `.get_read_instruction()`, which takes the real dataset splits
       (name, number of shards, ...) and parses the tree to return a
       `SplitReadInstruction()` object.

    ```
    read_instruction = split.get_read_instruction(self.info.splits)
    ```

    3) The `SplitReadInstruction` is then used in the `tf.data.Dataset` pipeline
       to define which files to read and how to skip examples within a file.

    """

    # pylint: enable=line-too-long

    @abc.abstractmethod
    def get_read_instruction(self, split_dict):
        """Parse the descriptor tree and compile all read instructions together.

        Args:
            split_dict: `dict`, The `dict[split_name, SplitInfo]` of the dataset

        Returns:
            split_read_instruction: `SplitReadInstruction`
        """
        raise NotImplementedError("Abstract method")

    def __eq__(self, other):
        """Equality: datasets.Split.TRAIN == 'train'."""
        # A composed split never equals a named split or a string; comparing
        # against anything else is explicitly unsupported.
        if not isinstance(other, (NamedSplit, str)):
            raise NotImplementedError("Equality is not implemented between merged/sub splits.")
        return False

    def __ne__(self, other):
        """InEquality: datasets.Split.TRAIN != 'test'."""
        is_equal = self.__eq__(other)
        return not is_equal

    def __add__(self, other):
        """Merging: datasets.Split.TRAIN + datasets.Split.TEST."""
        return _SplitMerged(self, other)

    def subsplit(self, arg=None, k=None, percent=None, weighted=None):  # pylint: disable=redefined-outer-name
        """Divides this split into subsplits.

        Subsplits can be defined in three ways, one per keyword argument:
        `k` (get `k` even subsplits), `percent` (get a slice of the dataset
        with `datasets.percent`), and `weighted` (get subsplits with
        proportions specified by `weighted`).

        Example::

        ```
        # 50% train, 50% test
        train, test = split.subsplit(k=2)
        # 50% train, 25% test, 25% validation
        train, test, validation = split.subsplit(weighted=[2, 1, 1])
        # Extract last 20%
        subsplit = split.subsplit(datasets.percent[-20:])
        ```

        Warning: `k` and `weighted` are converted into whole percents, so
        fractional shares are rounded down and the final subsplit absorbs the
        remainder. For instance:

        ```
        train, test, valid = split.subsplit(k=3)  # 33%, 33%, 34%
        s1, s2, s3, s4 = split.subsplit(weighted=[2, 2, 1, 1])  # 33%, 33%, 16%, 18%
        ```

        Args:
            arg: If no kwargs are given, `arg` will be interpreted as one of
                `k`, `percent`, or `weighted` depending on the type.
                For example:
                ```
                split.subsplit(10)  # Equivalent to split.subsplit(k=10)
                split.subsplit(datasets.percent[:-20])  # percent=datasets.percent[:-20]
                split.subsplit([1, 1, 2])  # weighted=[1, 1, 2]
                ```
            k: `int` If set, subdivide the split into `k` equal parts.
            percent: `datasets.percent slice`, return a single subsplit corresponding to
                a slice of the original split. For example:
                `split.subsplit(datasets.percent[-20:])  # Last 20% of the dataset`.
            weighted: `list[int]`, return a list of subsplits whose proportions match
                the normalized sum of the list. For example:
                `split.subsplit(weighted=[1, 1, 2])  # 25%, 25%, 50%`.

        Returns:
            A subsplit or tuple of subsplits extracted from this split object.

        Raises:
            ValueError: if zero or several arguments are set, if `arg` has an
                unsupported type or an empty value, or if `k` is outside 1..100.
        """
        # Note that the percent kwargs redefine the outer name datasets.percent. This
        # is done for consistency (.subsplit(percent=datasets.percent[:40]))
        truthy_args = [value for value in (arg, k, percent, weighted) if value]
        if len(truthy_args) != 1:
            raise ValueError("Only one argument of subsplit should be set.")

        # Route the positional argument to the keyword it stands for, by type.
        if isinstance(arg, int):
            k = arg
        elif isinstance(arg, slice):
            percent = arg
        elif isinstance(arg, list):
            weighted = arg

        if not k and not percent and not weighted:
            raise ValueError(
                f"Invalid split argument {arg}. Only list, slice and int supported. "
                "One of k, weighted or percent should be set to a non empty value."
            )

        def check_full_coverage(parts):
            # Internal sanity check: once expanded, the slices must enumerate
            # every percent from 0 to 99 exactly once and in order.
            covered = []
            for part in parts:
                covered.extend(range(*part.indices(100)))
            assert covered == list(range(100))

        if k:
            if not 0 < k <= 100:
                raise ValueError(f"Subsplit k should be between 0 and 100, got {k}")
            width = 100 // k
            # The last slice is stretched to 100 so that no percent is dropped.
            slices = [slice(i * width, 100 if i == k - 1 else (i + 1) * width) for i in range(k)]
            check_full_coverage(slices)
            return tuple(_SubSplit(self, part) for part in slices)

        if percent:
            return _SubSplit(self, percent)

        if weighted:
            # Normalize the weights into whole percents (rounded down).
            total = sum(weighted)
            shares = [100 * weight // total for weight in weighted]
            # Lay the shares end to end to obtain one slice per weight.
            slices = []
            cursor = 0
            for share in shares:
                slices.append(slice(cursor, cursor + share))
                cursor += share
            # The last slice is stretched to 100 so that no percent is dropped.
            slices[-1] = slice(slices[-1].start, 100)
            check_full_coverage(slices)
            return tuple(_SubSplit(self, part) for part in slices)

        # Should not be possible
        raise ValueError("Could not determine the split")
|
| 245 |
+
|
| 246 |
+
|
| 247 |
+
# 2 requirements:
|
| 248 |
+
# 1. datasets.percent be sliceable
|
| 249 |
+
# 2. datasets.percent be documented
|
| 250 |
+
#
|
| 251 |
+
# Instances are not documented, so we want datasets.percent to be a class, but to
|
| 252 |
+
# have it be sliceable, we need this metaclass.
|
| 253 |
+
class PercentSliceMeta(type):
    """Metaclass that lets a class itself be subscripted with slice syntax."""

    def __getitem__(cls, slice_value):
        """Return `slice_value` unchanged; raise `ValueError` if it is not a slice."""
        if isinstance(slice_value, slice):
            return slice_value
        raise ValueError(f"datasets.percent should only be called with slice, not {slice_value}")
|
| 258 |
+
|
| 259 |
+
|
| 260 |
+
class PercentSlice(metaclass=PercentSliceMeta):
    # pylint: disable=line-too-long
    """Syntactic sugar for defining slice subsplits: `datasets.percent[75:-5]`.

    Subscripting this class is handled by its metaclass, which hands back the
    slice that was written between the brackets.

    See the
    [guide on splits](../loading#slice-splits)
    for more information.
    """

    # pylint: enable=line-too-long


# Lower-case alias, used as `datasets.percent[...]`.
percent = PercentSlice  # pylint: disable=invalid-name
|
| 274 |
+
|
| 275 |
+
|
| 276 |
+
class _SplitMerged(SplitBase):
    """Composite descriptor standing for the sum of two split descriptors."""

    def __init__(self, split1, split2):
        self._split1, self._split2 = split1, split2

    def get_read_instruction(self, split_dict):
        """Resolve both operands against `split_dict` and add the results."""
        left = self._split1.get_read_instruction(split_dict)
        right = self._split2.get_read_instruction(split_dict)
        return left + right

    def __repr__(self):
        return f"({self._split1!r} + {self._split2!r})"
|
| 290 |
+
|
| 291 |
+
|
| 292 |
+
class _SubSplit(SplitBase):
    """Descriptor for a slice taken out of another split descriptor."""

    def __init__(self, split, slice_value):
        self._split, self._slice_value = split, slice_value

    def get_read_instruction(self, split_dict):
        """Resolve the wrapped split, then index the result with the stored slice."""
        resolved = self._split.get_read_instruction(split_dict)
        return resolved[self._slice_value]

    def __repr__(self):
        bounds = self._slice_value
        # Missing bounds are rendered as empty, as in Python slice syntax.
        start = "" if bounds.start is None else bounds.start
        stop = "" if bounds.stop is None else bounds.stop
        slice_str = f"{start}:{stop}"
        if bounds.step is not None:
            slice_str += f":{bounds.step}"
        return f"{self._split!r}(datasets.percent[{slice_str}])"
|
| 312 |
+
|
| 313 |
+
|
| 314 |
+
class NamedSplit(SplitBase):
    """Descriptor corresponding to a named split (train, test, ...).

    Example:
        Descriptors can be composed with one another using addition or slicing:

        ```py
        split = datasets.Split.TRAIN.subsplit(datasets.percent[0:25]) + datasets.Split.TEST
        ```

        The resulting split will correspond to 25% of the train split merged with
        100% of the test split.

        A split cannot be added twice, so the following will fail:

        ```py
        split = (
                datasets.Split.TRAIN.subsplit(datasets.percent[:25]) +
                datasets.Split.TRAIN.subsplit(datasets.percent[75:])
        )  # Error
        split = datasets.Split.TEST + datasets.Split.ALL  # Error
        ```

        The slices can be applied only one time. So the following are valid:

        ```py
        split = (
                datasets.Split.TRAIN.subsplit(datasets.percent[:25]) +
                datasets.Split.TEST.subsplit(datasets.percent[:50])
        )
        split = (datasets.Split.TRAIN + datasets.Split.TEST).subsplit(datasets.percent[:50])
        ```

        But this is not valid:

        ```py
        train = datasets.Split.TRAIN
        test = datasets.Split.TEST
        split = train.subsplit(datasets.percent[:25]).subsplit(datasets.percent[:25])
        split = (train.subsplit(datasets.percent[:25]) + test).subsplit(datasets.percent[:50])
        ```
    """

    def __init__(self, name):
        self._name = name
        # `name` may be a full instruction such as "train[:10%]+test": validate
        # the bare split name of each "+"-separated part.
        for split_instruction in name.split("+"):
            split_name = split_instruction.partition("[")[0]
            if not re.match(_split_re, split_name):
                raise ValueError(f"Split name should match '{_split_re}' but got '{split_name}'.")

    def __str__(self):
        return self._name

    def __repr__(self):
        return f"NamedSplit({self._name!r})"

    def __eq__(self, other):
        """Equality: datasets.Split.TRAIN == 'train'."""
        if isinstance(other, NamedSplit):
            return self._name == other._name  # pylint: disable=protected-access
        if isinstance(other, SplitBase):
            # Merged / sub splits are never equal to a named split.
            return False
        # Only plain strings are left as candidates for equality.
        return isinstance(other, str) and self._name == other

    def __lt__(self, other):
        return self._name < other._name  # pylint: disable=protected-access

    def __hash__(self):
        return hash(self._name)

    def get_read_instruction(self, split_dict):
        """Wrap the `split_dict` entry stored under this split's name."""
        split_info = split_dict[self._name]
        return SplitReadInstruction(split_info)
|
| 389 |
+
|
| 390 |
+
|
| 391 |
+
class NamedSplitAll(NamedSplit):
    """Split corresponding to the union of all defined dataset splits."""

    def __init__(self):
        super().__init__("all")

    def __repr__(self):
        return "NamedSplitAll()"

    def get_read_instruction(self, split_dict):
        """Merge the read instructions of every split in `split_dict`."""
        per_split = [SplitReadInstruction(split_info) for split_info in split_dict.values()]
        # Fold all per-split instructions into an initially empty one.
        merged = SplitReadInstruction()
        for instruction in per_split:
            merged = merged + instruction
        return merged
|
| 404 |
+
|
| 405 |
+
|
| 406 |
+
class Split:
    # pylint: disable=line-too-long
    """`Enum` for dataset splits.

    Datasets are typically split into different subsets to be used at various
    stages of training and evaluation.

    - `TRAIN`: the training data.
    - `VALIDATION`: the validation data. If present, this is typically used as
      evaluation data while iterating on a model (e.g. changing hyperparameters,
      model architecture, etc.).
    - `TEST`: the testing data. This is the data to report metrics on. Typically
      you do not want to use this during model iteration as you may overfit to it.
    - `ALL`: the union of all defined dataset splits.

    All splits, including compositions inherit from `datasets.SplitBase`.

    See the [guide](../load_hub#splits) on splits for more information.

    Example:

    ```py
    >>> datasets.SplitGenerator(
    ...     name=datasets.Split.TRAIN,
    ...     gen_kwargs={"split_key": "train", "files": dl_manager.download_and_extract(url)},
    ... ),
    ... datasets.SplitGenerator(
    ...     name=datasets.Split.VALIDATION,
    ...     gen_kwargs={"split_key": "validation", "files": dl_manager.download_and_extract(url)},
    ... ),
    ... datasets.SplitGenerator(
    ...     name=datasets.Split.TEST,
    ...     gen_kwargs={"split_key": "test", "files": dl_manager.download_and_extract(url)},
    ... )
    ```
    """

    # pylint: enable=line-too-long
    TRAIN = NamedSplit("train")
    TEST = NamedSplit("test")
    VALIDATION = NamedSplit("validation")
    ALL = NamedSplitAll()

    def __new__(cls, name):
        """Create a custom split with datasets.Split('custom_name')."""
        # "all" is the only name that maps to the dedicated union split.
        if name == "all":
            return NamedSplitAll()
        return NamedSplit(name)
|
| 452 |
+
|
| 453 |
+
|
| 454 |
+
# Pairs a split info with the slice applied to it: like SplitInfo, but with an
# additional `slice_value` field.
SlicedSplitInfo = collections.namedtuple("SlicedSplitInfo", ("split_info", "slice_value"))  # noqa: E231
|
| 462 |
+
|
| 463 |
+
|
| 464 |
+
class SplitReadInstruction:
    """Object containing the reading instruction for the dataset.

    Similarly to `SplitDescriptor` nodes, this object can be composed with itself.
    Unlike them, it is resolved immediately instead of keeping track of a tree:
    every composition yields a single flattened SplitReadInstruction holding
    the splits and the slice to use for each.

    Once resolved, the instructions can be accessed with:

    ```
    read_instructions.get_list_sliced_split_info()  # List of splits to use
    ```

    """

    def __init__(self, split_info=None):
        # Maps split name -> SlicedSplitInfo.
        # NOTE(review): NonMutableDict is expected to reject re-insertion of an
        # existing key using `error_msg` -- confirm in utils.py_utils.
        self._splits = NonMutableDict(error_msg="Overlap between splits. Split {key} has been added with itself.")

        if split_info:
            unsliced = SlicedSplitInfo(split_info=split_info, slice_value=None)
            self.add(unsliced)

    def add(self, sliced_split):
        """Add a SlicedSplitInfo to the read instructions."""
        # TODO(epot): Check that the number of examples per shard % 100 == 0
        # Otherwise the slices value may be unbalanced and not exactly reflect the
        # requested slice.
        split_name = sliced_split.split_info.name
        self._splits[split_name] = sliced_split

    def __add__(self, other):
        """Merging split together."""
        # Will raise error if a split has already been added (NonMutableDict)
        # TODO(epot): If a split is already added but there is no overlap between
        # the slices, should merge the slices (ex: [:10] + [80:])
        merged = SplitReadInstruction()
        for operand in (self, other):
            merged._splits.update(operand._splits)  # pylint: disable=protected-access
        return merged

    def __getitem__(self, slice_value):
        """Sub-splits."""
        sliced = SplitReadInstruction()
        for entry in self._splits.values():
            # A split can only be sliced once.
            if entry.slice_value is not None:
                raise ValueError(f"Trying to slice Split {entry.split_info.name} which has already been sliced")
            sliced.add(SlicedSplitInfo(split_info=entry.split_info, slice_value=slice_value))
        return sliced

    def get_list_sliced_split_info(self):
        """Return the stored SlicedSplitInfo values as a list."""
        return [*self._splits.values()]
|
| 517 |
+
|
| 518 |
+
|
| 519 |
+
class SplitDict(dict):
    """Dictionary of `SplitInfo` objects keyed by split name.

    Indexing with a key that is not stored is delegated to
    `make_file_instructions` and returns a `SubSplitInfo`.
    """

    def __init__(self, *args, dataset_name=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.dataset_name = dataset_name

    def __getitem__(self, key: Union[SplitBase, str]):
        # 1st case: The key exists: `info.splits['train']`
        if str(key) in self:
            return super().__getitem__(str(key))
        # 2nd case: Uses instructions: `info.splits['train[50%]']`
        instructions = make_file_instructions(
            name=self.dataset_name,
            split_infos=self.values(),
            instruction=key,
        )
        return SubSplitInfo(instructions)

    def __setitem__(self, key: Union[SplitBase, str], value: SplitInfo):
        # The key must always agree with the name carried by the SplitInfo.
        if key != value.name:
            raise ValueError(f"Cannot add elem. (key mismatch: '{key}' != '{value.name}')")
        super().__setitem__(key, value)

    def add(self, split_info: SplitInfo):
        """Add the split info.

        The passed `split_info` is modified in place: its `dataset_name` is
        overwritten with this dictionary's `dataset_name`.

        Raises:
            ValueError: if a split with the same name is already present.
        """
        if split_info.name in self:
            raise ValueError(f"Split {split_info.name} already present")
        split_info.dataset_name = self.dataset_name
        super().__setitem__(split_info.name, split_info)

    @property
    def total_num_examples(self):
        """Return the total number of examples."""
        return sum(s.num_examples for s in self.values())

    @classmethod
    def from_split_dict(cls, split_infos: Union[list, dict], dataset_name: Optional[str] = None):
        """Returns a new SplitDict initialized from a Dict or List of `split_infos`.

        Args:
            split_infos: list (or dict, whose values are used) of split infos.
                Each element is either a `SplitInfo` or a dict of `SplitInfo`
                keyword arguments.
            dataset_name: name stored on the new SplitDict. When `None`, it is
                taken from the first element of `split_infos`, if any.
        """
        if isinstance(split_infos, dict):
            split_infos = list(split_infos.values())

        if dataset_name is None and split_infos:
            first = split_infos[0]
            # Elements may be plain dicts or SplitInfo objects (both are accepted
            # by the loop below), so read the name accordingly. Calling `.get` on
            # a SplitInfo raised AttributeError, which notably broke `copy()`
            # whenever `dataset_name` was None.
            if isinstance(first, dict):
                dataset_name = first.get("dataset_name")
            else:
                dataset_name = getattr(first, "dataset_name", None)

        split_dict = cls(dataset_name=dataset_name)

        for split_info in split_infos:
            if isinstance(split_info, dict):
                split_info = SplitInfo(**split_info)
            split_dict.add(split_info)

        return split_dict

    def to_split_dict(self):
        """Returns a list of deep copies of the stored SplitInfo objects.

        Each copy's `name` is set to the key it is stored under.
        """
        out = []
        for split_name, split_info in self.items():
            split_info = copy.deepcopy(split_info)
            split_info.name = split_name
            out.append(split_info)
        return out

    def copy(self):
        """Return a new SplitDict built from deep copies of the stored infos."""
        return SplitDict.from_split_dict(self.to_split_dict(), self.dataset_name)

    def _to_yaml_list(self) -> list:
        out = [asdict(s) for s in self.to_split_dict()]
        for split_info_dict in out:
            # we don't need the shard lengths in YAML, since it depends on max_shard_size and num_proc
            split_info_dict.pop("shard_lengths", None)
            # we don't need the dataset_name attribute that is deprecated
            split_info_dict.pop("dataset_name", None)
        return out

    @classmethod
    def _from_yaml_list(cls, yaml_data: list) -> "SplitDict":
        return cls.from_split_dict(yaml_data)
|
| 599 |
+
|
| 600 |
+
|
| 601 |
+
@dataclass
class SplitGenerator:
    """Defines the split information for the generator.

    Instances are meant to be returned by
    `GeneratorBasedBuilder._split_generators`; see that method for more
    information and a usage example.

    Args:
        name (`str`):
            Name of the `Split` for which the generator will
            create the examples.
        **gen_kwargs (additional keyword arguments):
            Keyword arguments to forward to the `DatasetBuilder._generate_examples` method
            of the builder.

    Example:

    ```py
    >>> datasets.SplitGenerator(
    ...     name=datasets.Split.TRAIN,
    ...     gen_kwargs={"split_key": "train", "files": dl_manager.download_and_extract(url)},
    ... )
    ```
    """

    name: str
    gen_kwargs: dict = dataclasses.field(default_factory=dict)
    # Not an __init__ argument: filled in by __post_init__.
    split_info: SplitInfo = dataclasses.field(init=False)

    def __post_init__(self):
        # Normalise the name to a plain string (it may be given as a NamedSplit).
        split_name = str(self.name)
        self.name = split_name
        # Instantiated only for its validation side effect: raises if the name is invalid.
        NamedSplit(split_name)
        self.split_info = SplitInfo(name=split_name)
|
datasets/streaming.py
ADDED
|
@@ -0,0 +1,131 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import importlib
|
| 2 |
+
from functools import wraps
|
| 3 |
+
from typing import TYPE_CHECKING, Optional
|
| 4 |
+
|
| 5 |
+
from .download.download_config import DownloadConfig
|
| 6 |
+
from .utils.file_utils import (
|
| 7 |
+
xbasename,
|
| 8 |
+
xdirname,
|
| 9 |
+
xet_parse,
|
| 10 |
+
xexists,
|
| 11 |
+
xgetsize,
|
| 12 |
+
xglob,
|
| 13 |
+
xgzip_open,
|
| 14 |
+
xisdir,
|
| 15 |
+
xisfile,
|
| 16 |
+
xjoin,
|
| 17 |
+
xlistdir,
|
| 18 |
+
xnumpy_load,
|
| 19 |
+
xopen,
|
| 20 |
+
xpandas_read_csv,
|
| 21 |
+
xpandas_read_excel,
|
| 22 |
+
xPath,
|
| 23 |
+
xpyarrow_parquet_read_table,
|
| 24 |
+
xrelpath,
|
| 25 |
+
xsio_loadmat,
|
| 26 |
+
xsplit,
|
| 27 |
+
xsplitext,
|
| 28 |
+
xwalk,
|
| 29 |
+
xxml_dom_minidom_parse,
|
| 30 |
+
)
|
| 31 |
+
from .utils.logging import get_logger
|
| 32 |
+
from .utils.patching import patch_submodule
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
logger = get_logger(__name__)
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
if TYPE_CHECKING:
|
| 39 |
+
from .builder import DatasetBuilder
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def extend_module_for_streaming(module_path, download_config: Optional[DownloadConfig] = None):
    """Extend the module to support streaming.

    We patch some functions in the module to use `fsspec` to support data streaming:
    - We use `fsspec.open` to open and read remote files. We patch the module function:
      - `open`
    - We use the "::" hop separator to join paths and navigate remote compressed/archive files. We patch the module
      functions:
      - `os.path.join`
      - `pathlib.Path.joinpath` and `pathlib.Path.__truediv__` (called when using the "/" operator)

    Directory listing (`os.listdir`, `os.walk`, `glob.glob`), the other `os.path` helpers and checks, and the
    `gzip`, `numpy`, `pandas`, `scipy.io`, `xml` and (outside of packaged modules) `pyarrow.parquet` readers
    referenced below are patched in the same way.

    The patched functions are replaced with custom functions defined to work with the
    :class:`~download.streaming_download_manager.StreamingDownloadManager`.

    Args:
        module_path: Path to the module to be extended.
        download_config: Mainly use `token` or `storage_options` to support different platforms and auth types.
            When it is `None` and the module has already been patched, the module keeps the configuration
            it was patched with.
    """

    module = importlib.import_module(module_path)

    # TODO(QL): always update the module to add subsequent new authentication without removing old ones
    already_patched = getattr(module, "_patched_for_streaming", None)
    if already_patched:
        # The object stored on the module is the very one captured by the `wrap_auth`
        # closures during the first patching, so refreshing its fields in place changes
        # the credentials used by the patches that are already installed.
        # Skip the refresh when no config is given: reading `.token` on `None` would raise.
        if isinstance(already_patched, DownloadConfig) and download_config is not None:
            already_patched.token = download_config.token
            already_patched.storage_options = download_config.storage_options
        return

    def wrap_auth(function):
        """Return `function` wrapped so that it always receives this call's `download_config`."""

        @wraps(function)
        def wrapper(*args, **kwargs):
            return function(*args, download_config=download_config, **kwargs)

        wrapper._decorator_name_ = "wrap_auth"
        return wrapper

    # open files in a streaming fashion
    patch_submodule(module, "open", wrap_auth(xopen)).start()
    patch_submodule(module, "os.listdir", wrap_auth(xlistdir)).start()
    patch_submodule(module, "os.walk", wrap_auth(xwalk)).start()
    patch_submodule(module, "glob.glob", wrap_auth(xglob)).start()
    # allow to navigate in remote zip files
    patch_submodule(module, "os.path.join", xjoin).start()
    patch_submodule(module, "os.path.dirname", xdirname).start()
    patch_submodule(module, "os.path.basename", xbasename).start()
    patch_submodule(module, "os.path.relpath", xrelpath).start()
    patch_submodule(module, "os.path.split", xsplit).start()
    patch_submodule(module, "os.path.splitext", xsplitext).start()
    # allow checks on paths
    patch_submodule(module, "os.path.exists", wrap_auth(xexists)).start()
    patch_submodule(module, "os.path.isdir", wrap_auth(xisdir)).start()
    patch_submodule(module, "os.path.isfile", wrap_auth(xisfile)).start()
    patch_submodule(module, "os.path.getsize", wrap_auth(xgetsize)).start()
    patch_submodule(module, "pathlib.Path", xPath).start()
    # file readers
    patch_submodule(module, "gzip.open", wrap_auth(xgzip_open)).start()
    patch_submodule(module, "numpy.load", wrap_auth(xnumpy_load)).start()
    patch_submodule(module, "pandas.read_csv", wrap_auth(xpandas_read_csv), attrs=["__version__"]).start()
    patch_submodule(module, "pandas.read_excel", wrap_auth(xpandas_read_excel), attrs=["__version__"]).start()
    patch_submodule(module, "scipy.io.loadmat", wrap_auth(xsio_loadmat), attrs=["__version__"]).start()
    patch_submodule(module, "xml.etree.ElementTree.parse", wrap_auth(xet_parse)).start()
    patch_submodule(module, "xml.dom.minidom.parse", wrap_auth(xxml_dom_minidom_parse)).start()
    # pyarrow: do not patch pyarrow attribute in packaged modules
    if not module.__name__.startswith("datasets.packaged_modules."):
        patch_submodule(module, "pyarrow.parquet.read_table", wrap_auth(xpyarrow_parquet_read_table)).start()
    # Remember the config so that later calls can detect the patching and refresh it.
    module._patched_for_streaming = download_config
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
def extend_dataset_builder_for_streaming(builder: "DatasetBuilder"):
    """Patch the builder's module, and the modules of its parent builders, for streaming.

    Args:
        builder (:class:`DatasetBuilder`): Dataset builder instance. Its `storage_options` and `token`
            are used to build the `DownloadConfig` handed to every patched module.
    """
    # this extends the open and os.path.join functions for data streaming
    download_config = DownloadConfig(storage_options=builder.storage_options, token=builder.token)
    extend_module_for_streaming(builder.__module__, download_config=download_config)

    # builders can inherit from other builders that might use streaming functionality
    # (for example, ImageFolder and AudioFolder inherit from FolderBuilder which implements examples generation)
    # but these parents builders are not patched automatically as they are not instantiated, so we patch them here
    from .builder import DatasetBuilder

    standard_module = DatasetBuilder.__module__
    # Skip the first MRO entry (the builder's own class, whose module was patched above).
    ancestors = type(builder).__mro__[1:]
    parent_builder_modules = []
    for ancestor in ancestors:
        if not issubclass(ancestor, DatasetBuilder):
            continue
        # Standard builders living in the same module as `DatasetBuilder` are left alone.
        if ancestor.__module__ == standard_module:
            continue
        parent_builder_modules.append(ancestor.__module__)

    for parent_module in parent_builder_modules:
        extend_module_for_streaming(parent_module, download_config=download_config)
|
datasets/table.py
ADDED
|
@@ -0,0 +1,2385 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import copy
|
| 2 |
+
import os
|
| 3 |
+
from collections.abc import Iterator
|
| 4 |
+
from functools import partial
|
| 5 |
+
from itertools import groupby
|
| 6 |
+
from typing import TYPE_CHECKING, Any, Callable, Optional, TypeVar, Union
|
| 7 |
+
|
| 8 |
+
import numpy as np
|
| 9 |
+
import pyarrow as pa
|
| 10 |
+
import pyarrow.compute as pc
|
| 11 |
+
|
| 12 |
+
from .utils.logging import get_logger
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
if TYPE_CHECKING:
|
| 16 |
+
from .features.features import Features, FeatureType
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
logger = get_logger(__name__)
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def inject_arrow_table_documentation(arrow_table_method):
    """Build a decorator that copies documentation from `arrow_table_method` onto a function.

    The decorated function's docstring becomes the docstring of `arrow_table_method`
    followed by its own docstring (if it has one), with every "pyarrow.Table" replaced
    by "Table". When `arrow_table_method` exposes `__annotations__`, that same mapping
    is assigned to the decorated function.

    Args:
        arrow_table_method: Callable whose `__doc__` (and annotations, when present) are reused.

    Returns:
        A decorator that updates the function in place and returns it.
    """

    def wrapper(fn):
        own_doc = "" if fn.__doc__ is None else fn.__doc__
        combined_doc = arrow_table_method.__doc__ + own_doc
        fn.__doc__ = combined_doc.replace("pyarrow.Table", "Table")
        if hasattr(arrow_table_method, "__annotations__"):
            fn.__annotations__ = arrow_table_method.__annotations__
        return fn

    return wrapper
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def _in_memory_arrow_table_from_file(filename: str) -> pa.Table:
    """Read every record batch of the Arrow IPC stream stored at `filename` into a `pa.Table`.

    The file is opened with `pa.input_stream` (as opposed to a memory map).
    """
    reader = pa.ipc.open_stream(pa.input_stream(filename))
    return reader.read_all()
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def _in_memory_arrow_table_from_buffer(buffer: pa.Buffer) -> pa.Table:
    """Read every record batch of the Arrow IPC stream held in `buffer` into a `pa.Table`."""
    reader = pa.ipc.open_stream(pa.BufferReader(buffer))
    return reader.read_all()
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
def _memory_mapped_record_batch_reader_from_file(filename: str) -> pa.RecordBatchStreamReader:
    """Memory-map `filename` and return an Arrow IPC stream reader over the mapping."""
    return pa.ipc.open_stream(pa.memory_map(filename))
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
def read_schema_from_file(filename: str) -> pa.Schema:
    """
    Infer the arrow table schema of a file without loading the whole file into memory.

    The file is memory-mapped and only the schema of its IPC stream is read, which is
    especially useful for very big files.
    """
    with pa.memory_map(filename) as mapped_file:
        return pa.ipc.open_stream(mapped_file).schema
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
def _memory_mapped_arrow_table_from_file(filename: str) -> pa.Table:
    """Read all record batches of the memory-mapped Arrow IPC stream at `filename` into a `pa.Table`."""
    return _memory_mapped_record_batch_reader_from_file(filename).read_all()
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
def _deepcopy(x, memo: dict):
|
| 69 |
+
"""deepcopy a regular class instance"""
|
| 70 |
+
cls = x.__class__
|
| 71 |
+
result = cls.__new__(cls)
|
| 72 |
+
memo[id(x)] = result
|
| 73 |
+
for k, v in x.__dict__.items():
|
| 74 |
+
setattr(result, k, copy.deepcopy(v, memo))
|
| 75 |
+
return result
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
def _interpolation_search(arr: list[int], x: int) -> int:
|
| 79 |
+
"""
|
| 80 |
+
Return the position i of a sorted array so that arr[i] <= x < arr[i+1]
|
| 81 |
+
|
| 82 |
+
Args:
|
| 83 |
+
arr (`List[int]`): non-empty sorted list of integers
|
| 84 |
+
x (`int`): query
|
| 85 |
+
|
| 86 |
+
Returns:
|
| 87 |
+
`int`: the position i so that arr[i] <= x < arr[i+1]
|
| 88 |
+
|
| 89 |
+
Raises:
|
| 90 |
+
`IndexError`: if the array is empty or if the query is outside the array values
|
| 91 |
+
"""
|
| 92 |
+
i, j = 0, len(arr) - 1
|
| 93 |
+
while i < j and arr[i] <= x < arr[j]:
|
| 94 |
+
k = i + ((j - i) * (x - arr[i]) // (arr[j] - arr[i]))
|
| 95 |
+
if arr[k] <= x < arr[k + 1]:
|
| 96 |
+
return k
|
| 97 |
+
elif arr[k] < x:
|
| 98 |
+
i, j = k + 1, j
|
| 99 |
+
else:
|
| 100 |
+
i, j = i, k
|
| 101 |
+
raise IndexError(f"Invalid query '{x}' for size {arr[-1] if len(arr) else 'none'}.")
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
class IndexedTableMixin:
    """Index the non-empty record batches of a pyarrow Table by cumulative row offset.

    The index is used by `fast_gather` and `fast_slice` to find which batch holds a given row.
    """

    def __init__(self, table: pa.Table):
        """Record the schema, the non-empty batches and their cumulative row offsets.

        Args:
            table (`pa.Table`): Table to index.
        """
        self._schema: pa.Schema = table.schema
        non_empty_batches = [batch for batch in table.to_batches() if len(batch) > 0]
        self._batches: list[pa.RecordBatch] = non_empty_batches
        # _offsets[k] is the index of the first row of batch k; the last entry is the row count.
        batch_lengths = [len(batch) for batch in non_empty_batches]
        self._offsets: np.ndarray = np.cumsum([0] + batch_lengths, dtype=np.int64)

    def fast_gather(self, indices: Union[list[int], np.ndarray]) -> pa.Table:
        """
        Create a pa.Table by gathering the records at the specified indices.

        The batch holding each index is found with a single `np.searchsorted` call over
        the batch offsets, then a one-row slice is taken from that batch.

        Args:
            indices (`List[int]` or `np.ndarray`): Row positions to gather.

        Raises:
            `ValueError`: if `indices` is empty
        """
        if not len(indices):
            raise ValueError("Indices must be non-empty")
        owning_batches = np.searchsorted(self._offsets, indices, side="right") - 1
        single_rows = [
            self._batches[batch_idx].slice(row - self._offsets[batch_idx], 1)
            for batch_idx, row in zip(owning_batches, indices)
        ]
        return pa.Table.from_batches(single_rows, schema=self._schema)

    def fast_slice(self, offset=0, length=None) -> pa.Table:
        """
        Slice the Table using interpolation search.

        Meant to behave like `pyarrow.Table.slice`: interpolation search over the batch
        offsets finds the first and last batches to keep, those two are trimmed to the
        requested rows, and the kept batches are assembled into the sliced Table.

        Args:
            offset (`int`, defaults to `0`): Position of the first row to keep.
            length (`int`, *optional*): Number of rows to keep; `None` keeps everything after `offset`.

        Raises:
            `IndexError`: if `offset` is negative
        """
        if offset < 0:
            raise IndexError("Offset must be non-negative")
        num_rows = self._offsets[-1]
        if offset >= num_rows or (length is not None and length <= 0):
            return pa.Table.from_batches([], schema=self._schema)
        first = _interpolation_search(self._offsets, offset)
        if length is None or length + offset >= num_rows:
            kept = self._batches[first:]
        else:
            last = _interpolation_search(self._offsets, offset + length - 1)
            kept = self._batches[first : last + 1]
            # Trim the tail first; when a single batch is kept the head trim below applies on top of it.
            kept[-1] = kept[-1].slice(0, offset + length - self._offsets[last])
        kept[0] = kept[0].slice(offset - self._offsets[first])
        return pa.Table.from_batches(kept, schema=self._schema)
|
| 151 |
+
|
| 152 |
+
|
| 153 |
+
class Table(IndexedTableMixin):
|
| 154 |
+
"""
|
| 155 |
+
Wraps a pyarrow Table by using composition.
|
| 156 |
+
This is the base class for `InMemoryTable`, `MemoryMappedTable` and `ConcatenationTable`.
|
| 157 |
+
|
| 158 |
+
It implements all the basic attributes/methods of the pyarrow Table class except
|
| 159 |
+
the Table transforms: `slice, filter, flatten, combine_chunks, cast, add_column,
|
| 160 |
+
append_column, remove_column, set_column, rename_columns` and `drop`.
|
| 161 |
+
|
| 162 |
+
The implementation of these methods differs for the subclasses.
|
| 163 |
+
"""
|
| 164 |
+
|
| 165 |
+
def __init__(self, table: pa.Table):
    """Index `table` through the parent initializer and keep it as `self.table`.

    Args:
        table (`pa.Table`): The pyarrow Table being wrapped.
    """
    super().__init__(table)
    self.table = table
|
| 168 |
+
|
| 169 |
+
def __deepcopy__(self, memo: dict):
    """Deep-copy the wrapper without copying the underlying arrow data.

    Args:
        memo (`dict`): The memo dictionary of the ongoing deepcopy.
    """
    # arrow tables are immutable, so there's no need to copy self.table
    # moreover calling deepcopy on a pyarrow table seems to make pa.total_allocated_bytes() decrease for some reason
    # by adding it to the memo, self.table won't be copied
    arrow_table = self.table
    memo[id(arrow_table)] = arrow_table
    # same for the recordbatches used by the index: the copy gets a new list holding the same batches
    indexed_batches = self._batches
    memo[id(indexed_batches)] = list(indexed_batches)
    return _deepcopy(self, memo)
|
| 177 |
+
|
| 178 |
+
def validate(self, *args, **kwargs):
    """
    Run validation checks on the wrapped table; an exception is raised if validation fails.

    All arguments are forwarded to the wrapped table's `validate`.
    By default only cheap validation checks are run. Pass `full=True`
    for thorough validation checks (potentially `O(n)`).

    Args:
        full (`bool`, defaults to `False`):
            If `True`, run expensive checks, otherwise cheap checks only.

    Raises:
        `pa.lib.ArrowInvalid`: if validation fails
    """
    arrow_table = self.table
    return arrow_table.validate(*args, **kwargs)
|
| 193 |
+
|
| 194 |
+
def equals(self, *args, **kwargs):
    """
    Check if contents of two tables are equal.

    Any [`~datasets.table.Table`] passed positionally or by keyword is replaced by the
    pyarrow table it wraps before the comparison is delegated to the wrapped table.

    Args:
        other ([`~datasets.table.Table`]):
            Table to compare against.
        check_metadata (`bool`, defaults to `False`):
            Whether schema metadata equality should be checked as well.

    Returns:
        `bool`
    """
    args = tuple(arg.table if isinstance(arg, Table) else arg for arg in args)
    # Iterate over `.items()`: iterating the dict itself yields only its keys, so
    # unpacking each entry into `k, v` would fail as soon as a keyword argument is passed.
    kwargs = {k: v.table if isinstance(v, Table) else v for k, v in kwargs.items()}
    return self.table.equals(*args, **kwargs)
|
| 210 |
+
|
| 211 |
+
def to_batches(self, *args, **kwargs):
    """
    Convert the wrapped table to a list of (contiguous) `RecordBatch` objects.

    All arguments are forwarded to the wrapped table's `to_batches`.

    Args:
        max_chunksize (`int`, defaults to `None`):
            Maximum size for `RecordBatch` chunks. Individual chunks may be
            smaller depending on the chunk layout of individual columns.

    Returns:
        `List[pyarrow.RecordBatch]`
    """
    arrow_table = self.table
    return arrow_table.to_batches(*args, **kwargs)
|
| 224 |
+
|
| 225 |
+
def to_pydict(self, *args, **kwargs):
    """
    Convert the wrapped table to a `dict` or `OrderedDict`.

    All arguments are forwarded to the wrapped table's `to_pydict`.

    Returns:
        `dict`
    """
    arrow_table = self.table
    return arrow_table.to_pydict(*args, **kwargs)
|
| 233 |
+
|
| 234 |
+
def to_pylist(self, *args, **kwargs):
    """
    Convert the wrapped table to a list.

    All arguments are forwarded to the wrapped table's `to_pylist`.

    Returns:
        `list`
    """
    arrow_table = self.table
    return arrow_table.to_pylist(*args, **kwargs)
|
| 242 |
+
|
| 243 |
+
def to_pandas(self, *args, **kwargs):
    """
    Convert to a pandas-compatible NumPy array or DataFrame, as appropriate.

    Every argument is forwarded unchanged to `self.table.to_pandas`.

    Args:
        memory_pool (`MemoryPool`, defaults to `None`):
            Arrow MemoryPool to use for allocations. The default memory pool is used
            if none is passed.
        strings_to_categorical (`bool`, defaults to `False`):
            Encode string (UTF8) and binary types to `pandas.Categorical`.
        categories (`list`, defaults to `empty`):
            List of fields that should be returned as `pandas.Categorical`. Only
            applies to table-like data structures.
        zero_copy_only (`bool`, defaults to `False`):
            Raise an `ArrowException` if this function call would require copying
            the underlying data.
        integer_object_nulls (`bool`, defaults to `False`):
            Cast integers with nulls to objects.
        date_as_object (`bool`, defaults to `True`):
            Cast dates to objects. If `False`, convert to `datetime64[ns]` dtype.
        timestamp_as_object (`bool`, defaults to `False`):
            Cast non-nanosecond timestamps (`np.datetime64`) to objects. This is
            useful for timestamps that don't fit in the normal date range of
            nanosecond timestamps (1678 CE-2262 CE).
            If `False`, all timestamps are converted to `datetime64[ns]` dtype.
        use_threads (`bool`, defaults to `True`):
            Whether to parallelize the conversion using multiple threads.
        deduplicate_objects (`bool`, defaults to `False`):
            Do not create multiple copies of Python objects when created, to save
            on memory use. Conversion will be slower.
        ignore_metadata (`bool`, defaults to `False`):
            If `True`, do not use the 'pandas' metadata to reconstruct the
            DataFrame index, if present.
        safe (`bool`, defaults to `True`):
            For certain data types, a cast is needed in order to store the
            data in a pandas DataFrame or Series (e.g. timestamps are always
            stored as nanoseconds in pandas). This option controls whether it
            is a safe cast or not.
        split_blocks (`bool`, defaults to `False`):
            If `True`, generate one internal "block" for each column when
            creating a pandas.DataFrame from a `RecordBatch` or `Table`. While this
            can temporarily reduce memory, note that various pandas operations
            can trigger "consolidation" which may balloon memory use.
        self_destruct (`bool`, defaults to `False`):
            EXPERIMENTAL: If `True`, attempt to deallocate the originating Arrow
            memory while converting the Arrow object to pandas. Using the
            object after calling `to_pandas` with this option will crash your
            program.
        types_mapper (`function`, defaults to `None`):
            A function mapping a pyarrow DataType to a pandas `ExtensionDtype`.
            This can be used to override the default pandas type for conversion
            of built-in pyarrow types or in absence of `pandas_metadata` in the
            Table schema. The function receives a pyarrow DataType and is
            expected to return a pandas `ExtensionDtype` or `None` if the
            default conversion should be used for that type. If you have
            a dictionary mapping, you can pass `dict.get` as function.

    Returns:
        `pandas.Series` or `pandas.DataFrame`: `pandas.Series` or `pandas.DataFrame` depending on type of object
    """
    arrow_table = self.table
    return arrow_table.to_pandas(*args, **kwargs)
|
| 304 |
+
|
| 305 |
+
def to_string(self, *args, **kwargs):
    """
    Return the result of `self.table.to_string`, forwarding every argument unchanged.
    """
    arrow_table = self.table
    return arrow_table.to_string(*args, **kwargs)
|
| 307 |
+
|
| 308 |
+
def to_reader(self, max_chunksize: Optional[int] = None):
    """
    Expose the Table as a RecordBatchReader.

    This method is zero-copy: the same data is merely made available under a different API.

    Args:
        max_chunksize (`int`, defaults to `None`)
            Maximum size for RecordBatch chunks. Depending on the chunk layout of
            individual columns, individual chunks may be smaller.

    Returns:
        `pyarrow.RecordBatchReader`
    """
    arrow_table = self.table
    return arrow_table.to_reader(max_chunksize=max_chunksize)
|
| 323 |
+
|
| 324 |
+
def field(self, *args, **kwargs):
    """
    Look up a schema field, either by column name or by numeric index.

    Every argument is forwarded unchanged to `self.table.field`.

    Args:
        i (`Union[int, str]`):
            The index or name of the field to retrieve.

    Returns:
        `pyarrow.Field`
    """
    arrow_table = self.table
    return arrow_table.field(*args, **kwargs)
|
| 336 |
+
|
| 337 |
+
def column(self, *args, **kwargs):
    """
    Look up a column, either by column name or by numeric index.

    Every argument is forwarded unchanged to `self.table.column`.

    Args:
        i (`Union[int, str]`):
            The index or name of the column to retrieve.

    Returns:
        `pyarrow.ChunkedArray`
    """
    arrow_table = self.table
    return arrow_table.column(*args, **kwargs)
|
| 349 |
+
|
| 350 |
+
def itercolumns(self, *args, **kwargs):
    """
    Iterate over all columns in their numerical order.

    Every argument is forwarded unchanged to `self.table.itercolumns`.

    Yields:
        `pyarrow.ChunkedArray`
    """
    arrow_table = self.table
    return arrow_table.itercolumns(*args, **kwargs)
|
| 358 |
+
|
| 359 |
+
@property
def schema(self):
    """
    Schema describing the table and its columns (read from `self.table.schema`).

    Returns:
        `pyarrow.Schema`
    """
    arrow_table = self.table
    return arrow_table.schema
|
| 368 |
+
|
| 369 |
+
@property
def columns(self):
    """
    All columns of the table, in numerical order (read from `self.table.columns`).

    Returns:
        `List[pa.ChunkedArray]`
    """
    arrow_table = self.table
    return arrow_table.columns
|
| 378 |
+
|
| 379 |
+
@property
def num_columns(self):
    """
    How many columns this table has (read from `self.table.num_columns`).

    Returns:
        int
    """
    arrow_table = self.table
    return arrow_table.num_columns
|
| 388 |
+
|
| 389 |
+
@property
def num_rows(self):
    """
    How many rows this table has (read from `self.table.num_rows`).

    By definition of a table, every column has this same number of rows.

    Returns:
        int
    """
    arrow_table = self.table
    return arrow_table.num_rows
|
| 401 |
+
|
| 402 |
+
@property
def shape(self):
    """
    Table dimensions as (#rows, #columns), read from `self.table.shape`.

    Returns:
        `(int, int)`: Number of rows and number of columns.
    """
    arrow_table = self.table
    return arrow_table.shape
|
| 411 |
+
|
| 412 |
+
@property
def nbytes(self):
    """
    Total number of bytes consumed by the table's elements (read from `self.table.nbytes`).
    """
    arrow_table = self.table
    return arrow_table.nbytes
|
| 418 |
+
|
| 419 |
+
@property
def column_names(self):
    """
    Names of the columns of the table (read from `self.table.column_names`).
    """
    arrow_table = self.table
    return arrow_table.column_names
|
| 425 |
+
|
| 426 |
+
def __eq__(self, other):
|
| 427 |
+
return self.equals(other)
|
| 428 |
+
|
| 429 |
+
def __getitem__(self, i):
|
| 430 |
+
return self.table[i]
|
| 431 |
+
|
| 432 |
+
def __len__(self):
|
| 433 |
+
return len(self.table)
|
| 434 |
+
|
| 435 |
+
def __repr__(self):
|
| 436 |
+
return self.table.__repr__().replace("pyarrow.Table", self.__class__.__name__)
|
| 437 |
+
|
| 438 |
+
def __str__(self):
|
| 439 |
+
return self.table.__str__().replace("pyarrow.Table", self.__class__.__name__)
|
| 440 |
+
|
| 441 |
+
def slice(self, *args, **kwargs):
    """
    Compute a zero-copy slice of this Table.

    Not implemented here: this version always raises `NotImplementedError`.

    Args:
        offset (`int`, defaults to `0`):
            Offset from start of table to slice.
        length (`int`, defaults to `None`):
            Length of slice (by default, until the end of the table starting from
            offset).

    Returns:
        `datasets.table.Table`
    """
    raise NotImplementedError()
|
| 456 |
+
|
| 457 |
+
def filter(self, *args, **kwargs):
    """
    Select records from a Table. See `pyarrow.compute.filter` for full usage.

    Not implemented here: this version always raises `NotImplementedError`.
    """
    raise NotImplementedError()
|
| 462 |
+
|
| 463 |
+
def flatten(self, *args, **kwargs):
    """
    Flatten this Table: each struct-typed column becomes one column per struct field,
    while other columns are left unchanged.

    Not implemented here: this version always raises `NotImplementedError`.

    Args:
        memory_pool (`MemoryPool`, defaults to `None`):
            For memory allocations, if required, otherwise use default pool.

    Returns:
        `datasets.table.Table`
    """
    raise NotImplementedError()
|
| 476 |
+
|
| 477 |
+
def combine_chunks(self, *args, **kwargs):
    """
    Make a new table by combining the chunks this table has.

    For each column, all the underlying chunks of its `ChunkedArray` are
    concatenated into zero or one chunk.

    Not implemented here: this version always raises `NotImplementedError`.

    Args:
        memory_pool (`MemoryPool`, defaults to `None`):
            For memory allocations, if required, otherwise use default pool.

    Returns:
        `datasets.table.Table`
    """
    raise NotImplementedError()
|
| 492 |
+
|
| 493 |
+
def cast(self, *args, **kwargs):
    """
    Cast table values to another schema.

    Not implemented here: this version always raises `NotImplementedError`.

    Args:
        target_schema (`Schema`):
            Schema to cast to; the names and order of fields must match.
        safe (`bool`, defaults to `True`):
            Check for overflows or other unsafe conversions.

    Returns:
        `datasets.table.Table`
    """
    raise NotImplementedError()
|
| 507 |
+
|
| 508 |
+
def replace_schema_metadata(self, *args, **kwargs):
    """
    EXPERIMENTAL: Create a shallow copy of the table by replacing the schema
    key-value metadata with the indicated new metadata (which may be `None`,
    which deletes any existing metadata).

    Not implemented here: this version always raises `NotImplementedError`.

    Args:
        metadata (`dict`, defaults to `None`):

    Returns:
        `datasets.table.Table`: shallow_copy
    """
    raise NotImplementedError()
|
| 521 |
+
|
| 522 |
+
def add_column(self, *args, **kwargs):
    """
    Add column to Table at position.

    The column is added to a new table that is returned; the original table
    object is left unchanged.

    Not implemented here: this version always raises `NotImplementedError`.

    Args:
        i (`int`):
            Index to place the column at.
        field_ (`Union[str, pyarrow.Field]`):
            If a string is passed then the type is deduced from the column
            data.
        column (`Union[pyarrow.Array, List[pyarrow.Array]]`):
            Column data.

    Returns:
        `datasets.table.Table`: New table with the passed column added.
    """
    raise NotImplementedError()
|
| 542 |
+
|
| 543 |
+
def append_column(self, *args, **kwargs):
    """
    Append column at end of columns.

    Not implemented here: this version always raises `NotImplementedError`.

    Args:
        field_ (`Union[str, pyarrow.Field]`):
            If a string is passed then the type is deduced from the column
            data.
        column (`Union[pyarrow.Array, List[pyarrow.Array]]`):
            Column data.

    Returns:
        `datasets.table.Table`: New table with the passed column added.
    """
    raise NotImplementedError()
|
| 558 |
+
|
| 559 |
+
def remove_column(self, *args, **kwargs):
    """
    Create new Table with the indicated column removed.

    Not implemented here: this version always raises `NotImplementedError`.

    Args:
        i (`int`):
            Index of column to remove.

    Returns:
        `datasets.table.Table`: New table without the column.
    """
    raise NotImplementedError()
|
| 571 |
+
|
| 572 |
+
def set_column(self, *args, **kwargs):
    """
    Replace column in Table at position.

    Not implemented here: this version always raises `NotImplementedError`.

    Args:
        i (`int`):
            Index to place the column at.
        field_ (`Union[str, pyarrow.Field]`):
            If a string is passed then the type is deduced from the column
            data.
        column (`Union[pyarrow.Array, List[pyarrow.Array]]`):
            Column data.

    Returns:
        `datasets.table.Table`: New table with the passed column set.
    """
    raise NotImplementedError()
|
| 589 |
+
|
| 590 |
+
def rename_columns(self, *args, **kwargs):
    """
    Create new table with columns renamed to provided names.

    Not implemented here: this version always raises `NotImplementedError`.
    """
    raise NotImplementedError()
|
| 595 |
+
|
| 596 |
+
def drop(self, *args, **kwargs):
    """
    Drop one or more columns and return a new table.

    Not implemented here: this version always raises `NotImplementedError`.

    Args:
        columns (`List[str]`):
            List of field names referencing existing columns.

    Raises:
        `KeyError` : if any of the passed column names does not exist.

    Returns:
        `datasets.table.Table`: New table without the columns.
    """
    raise NotImplementedError()
|
| 611 |
+
|
| 612 |
+
def select(self, *args, **kwargs):
    """
    Select columns of the table.

    The result is a new table holding the specified columns, with metadata preserved.

    Not implemented here: this version always raises `NotImplementedError`.

    Args:
        columns (:obj:`Union[List[str], List[int]]`):
            The column names or integer indices to select.

    Returns:
        `datasets.table.Table`: table with only a subset of the columns
    """
    raise NotImplementedError()
|
| 626 |
+
|
| 627 |
+
|
| 628 |
+
class TableBlock(Table):
    """
    `TableBlock` is the class of tables allowed inside a `ConcatenationTable`.

    Only `MemoryMappedTable` and `InMemoryTable` are `TableBlock`, because a
    `ConcatenationTable` should never be made out of other `ConcatenationTable` objects.
    """
|
| 636 |
+
|
| 637 |
+
|
| 638 |
+
class InMemoryTable(TableBlock):
    """
    Table block whose data is loaded into the user's RAM.

    Pickling an `InMemoryTable` copies all of its data using memory. The implementation is
    simple: each method applies the matching operation to the underlying pyarrow Table and
    wraps the result.

    A `MemoryMapped` table behaves differently: pickling it doesn't copy all the data in
    memory, and unpickling it reloads the table from the disk instead.

    Use `InMemoryTable` when the data fit in memory; `MemoryMapped` tables are reserved for
    data bigger than memory or for keeping the memory footprint of your application low.
    """

    @classmethod
    def from_file(cls, filename: str):
        """Build the table from `_in_memory_arrow_table_from_file(filename)`."""
        return cls(_in_memory_arrow_table_from_file(filename))

    @classmethod
    def from_buffer(cls, buffer: pa.Buffer):
        """Build the table from `_in_memory_arrow_table_from_buffer(buffer)`."""
        return cls(_in_memory_arrow_table_from_buffer(buffer))

    @classmethod
    def from_pandas(cls, *args, **kwargs):
        """
        Convert pandas.DataFrame to an Arrow Table.

        Column types of the resulting Arrow Table are inferred from the dtypes of the
        pandas.Series in the DataFrame. For non-object Series, the NumPy dtype is
        translated to its Arrow equivalent. For `object` Series, the datatype has to be
        guessed by looking at the Python objects in the Series.

        Be aware that Series of the `object` dtype don't carry enough information to
        always lead to a meaningful Arrow type. When no type can be inferred, e.g.
        because the DataFrame is of length 0 or the Series only contains `None/nan`
        objects, the type is set to null. This behavior can be avoided by constructing
        an explicit schema and passing it to this function.

        Args:
            df (`pandas.DataFrame`):
            schema (`pyarrow.Schema`, *optional*):
                The expected schema of the Arrow Table. This can be used to
                indicate the type of columns if we cannot infer it automatically.
                If passed, the output will have exactly this schema. Columns
                specified in the schema that are not found in the DataFrame columns
                or its index will raise an error. Additional columns or index
                levels in the DataFrame which are not specified in the schema will
                be ignored.
            preserve_index (`bool`, *optional*):
                Whether to store the index as an additional column in the resulting
                `Table`. The default of None will store the index as a column,
                except for RangeIndex which is stored as metadata only. Use
                `preserve_index=True` to force it to be stored as a column.
            nthreads (`int`, defaults to `None` (may use up to system CPU count threads))
                If greater than 1, convert columns to Arrow in parallel using
                indicated number of threads.
            columns (`List[str]`, *optional*):
                List of column to be converted. If `None`, use all columns.
            safe (`bool`, defaults to `True`):
                Check for overflows or other unsafe conversions.

        Returns:
            `datasets.table.Table`:

        Examples:
        ```python
        >>> import pandas as pd
        >>> import pyarrow as pa
        >>> df = pd.DataFrame({
            ... 'int': [1, 2],
            ... 'str': ['a', 'b']
            ... })
        >>> pa.Table.from_pandas(df)
        <pyarrow.lib.Table object at 0x7f05d1fb1b40>
        ```
        """
        arrow_table = pa.Table.from_pandas(*args, **kwargs)
        return cls(arrow_table)

    @classmethod
    def from_arrays(cls, *args, **kwargs):
        """
        Construct a Table from Arrow arrays.

        Args:
            arrays (`List[Union[pyarrow.Array, pyarrow.ChunkedArray]]`):
                Equal-length arrays that should form the table.
            names (`List[str]`, *optional*):
                Names for the table columns. If not passed, schema must be passed.
            schema (`Schema`, defaults to `None`):
                Schema for the created table. If not passed, names must be passed.
            metadata (`Union[dict, Mapping]`, defaults to `None`):
                Optional metadata for the schema (if inferred).

        Returns:
            `datasets.table.Table`
        """
        arrow_table = pa.Table.from_arrays(*args, **kwargs)
        return cls(arrow_table)

    @classmethod
    def from_pydict(cls, *args, **kwargs):
        """
        Construct a Table from Arrow arrays or columns.

        Args:
            mapping (`Union[dict, Mapping]`):
                A mapping of strings to Arrays or Python lists.
            schema (`Schema`, defaults to `None`):
                If not passed, will be inferred from the Mapping values
            metadata (`Union[dict, Mapping]`, defaults to `None`):
                Optional metadata for the schema (if inferred).

        Returns:
            `datasets.table.Table`
        """
        arrow_table = pa.Table.from_pydict(*args, **kwargs)
        return cls(arrow_table)

    @classmethod
    def from_pylist(cls, mapping, *args, **kwargs):
        """
        Construct a Table from list of rows / dictionaries.

        Args:
            mapping (`List[dict]`):
                A mapping of strings to row values.
            schema (`Schema`, defaults to `None`):
                If not passed, will be inferred from the Mapping values
            metadata (`Union[dict, Mapping]`, defaults to `None`):
                Optional metadata for the schema (if inferred).

        Returns:
            `datasets.table.Table`
        """
        arrow_table = pa.Table.from_pylist(mapping, *args, **kwargs)
        return cls(arrow_table)

    @classmethod
    def from_batches(cls, *args, **kwargs):
        """
        Construct a Table from a sequence or iterator of Arrow `RecordBatches`.

        Args:
            batches (`Union[Sequence[pyarrow.RecordBatch], Iterator[pyarrow.RecordBatch]]`):
                Sequence of `RecordBatch` to be converted, all schemas must be equal.
            schema (`Schema`, defaults to `None`):
                If not passed, will be inferred from the first `RecordBatch`.

        Returns:
            `datasets.table.Table`:
        """
        arrow_table = pa.Table.from_batches(*args, **kwargs)
        return cls(arrow_table)

    def slice(self, offset=0, length=None):
        """
        Compute zero-copy slice of this Table.

        Args:
            offset (`int`, defaults to `0`):
                Offset from start of table to slice.
            length (`int`, defaults to `None`):
                Length of slice (default is until end of table starting from
                offset).

        Returns:
            `datasets.table.Table`
        """
        # Use fast slicing here
        sliced = self.fast_slice(offset=offset, length=length)
        return InMemoryTable(sliced)

    def filter(self, *args, **kwargs):
        """
        Select records from a Table. See `pyarrow.compute.filter` for full usage.
        """
        filtered = self.table.filter(*args, **kwargs)
        return InMemoryTable(filtered)

    def flatten(self, *args, **kwargs):
        """
        Flatten this Table. Each column with a struct type is flattened
        into one column per struct field. Other columns are left unchanged.

        Args:
            memory_pool (`MemoryPool`, defaults to `None`):
                For memory allocations, if required, otherwise use default pool.

        Returns:
            `datasets.table.Table`
        """
        flattened = table_flatten(self.table, *args, **kwargs)
        return InMemoryTable(flattened)

    def combine_chunks(self, *args, **kwargs):
        """
        Make a new table by combining the chunks this table has.

        All the underlying chunks in the `ChunkedArray` of each column are
        concatenated into zero or one chunk.

        Args:
            memory_pool (`MemoryPool`, defaults to `None`):
                For memory allocations, if required, otherwise use default pool.

        Returns:
            `datasets.table.Table`
        """
        combined = self.table.combine_chunks(*args, **kwargs)
        return InMemoryTable(combined)

    def cast(self, *args, **kwargs):
        """
        Cast table values to another schema.

        Args:
            target_schema (`Schema`):
                Schema to cast to, the names and order of fields must match.
            safe (`bool`, defaults to `True`):
                Check for overflows or other unsafe conversions.

        Returns:
            `datasets.table.Table`
        """
        casted = table_cast(self.table, *args, **kwargs)
        return InMemoryTable(casted)

    def replace_schema_metadata(self, *args, **kwargs):
        """
        EXPERIMENTAL: Create shallow copy of table by replacing schema
        key-value metadata with the indicated new metadata (which may be `None`,
        which deletes any existing metadata).

        Args:
            metadata (`dict`, defaults to `None`):

        Returns:
            `datasets.table.Table`: shallow_copy
        """
        replaced = self.table.replace_schema_metadata(*args, **kwargs)
        return InMemoryTable(replaced)

    def add_column(self, *args, **kwargs):
        """
        Add column to Table at position.

        A new table is returned with the column added, the original table
        object is left unchanged.

        Args:
            i (`int`):
                Index to place the column at.
            field_ (`Union[str, pyarrow.Field]`):
                If a string is passed then the type is deduced from the column
                data.
            column (`Union[pyarrow.Array, List[pyarrow.Array]]`):
                Column data.

        Returns:
            `datasets.table.Table`: New table with the passed column added.
        """
        extended = self.table.add_column(*args, **kwargs)
        return InMemoryTable(extended)

    def append_column(self, *args, **kwargs):
        """
        Append column at end of columns.

        Args:
            field_ (`Union[str, pyarrow.Field]`):
                If a string is passed then the type is deduced from the column
                data.
            column (`Union[pyarrow.Array, List[pyarrow.Array]]`):
                Column data.

        Returns:
            `datasets.table.Table`:
                New table with the passed column added.
        """
        extended = self.table.append_column(*args, **kwargs)
        return InMemoryTable(extended)

    def remove_column(self, *args, **kwargs):
        """
        Create new Table with the indicated column removed.

        Args:
            i (`int`):
                Index of column to remove.

        Returns:
            `datasets.table.Table`:
                New table without the column.
        """
        reduced = self.table.remove_column(*args, **kwargs)
        return InMemoryTable(reduced)

    def set_column(self, *args, **kwargs):
        """
        Replace column in Table at position.

        Args:
            i (`int`):
                Index to place the column at.
            field_ (`Union[str, pyarrow.Field]`):
                If a string is passed then the type is deduced from the column
                data.
            column (`Union[pyarrow.Array, List[pyarrow.Array]]`):
                Column data.

        Returns:
            `datasets.table.Table`:
                New table with the passed column set.
        """
        updated = self.table.set_column(*args, **kwargs)
        return InMemoryTable(updated)

    def rename_columns(self, *args, **kwargs):
        """
        Create new table with columns renamed to provided names.
        """
        renamed = self.table.rename_columns(*args, **kwargs)
        return InMemoryTable(renamed)

    def drop(self, *args, **kwargs):
        """
        Drop one or more columns and return a new table.

        Args:
            columns (`List[str]`):
                List of field names referencing existing columns.

        Raises:
            `KeyError` : if any of the passed columns name are not existing.

        Returns:
            `datasets.table.Table`:
                New table without the columns.
        """
        reduced = self.table.drop(*args, **kwargs)
        return InMemoryTable(reduced)

    def select(self, *args, **kwargs):
        """
        Select columns of the table.

        Returns a new table with the specified columns, and metadata preserved.

        Args:
            columns (:obj:`Union[List[str], List[int]]`):
                The column names or integer indices to select.

        Returns:
            :class:`datasets.table.Table`: New table with the specified columns, and metadata preserved.
        """
        selected = self.table.select(*args, **kwargs)
        return InMemoryTable(selected)
|
| 983 |
+
|
| 984 |
+
|
| 985 |
+
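# A MemoryMappedTable records "replays" - (method name, args, kwargs) triples - so that the
# transforms applied to a table can be re-applied when the table is reloaded from the disk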
|
| 986 |
+
Replay = tuple[str, tuple, dict]
|
| 987 |
+
|
| 988 |
+
|
| 989 |
+
class MemoryMappedTable(TableBlock):
|
| 990 |
+
"""
|
| 991 |
+
The table is said memory mapped when it doesn't use the user's RAM but loads the data
|
| 992 |
+
from the disk instead.
|
| 993 |
+
|
| 994 |
+
Pickling it doesn't copy the data into memory.
|
| 995 |
+
Instead, only the path to the memory mapped arrow file is pickled, as well as the list
|
| 996 |
+
of transforms to "replay" when reloading the table from the disk.
|
| 997 |
+
|
| 998 |
+
Its implementation requires storing a history of all the transforms that were applied
|
| 999 |
+
to the underlying pyarrow Table, so that they can be "replayed" when reloading the Table
|
| 1000 |
+
from the disk.
|
| 1001 |
+
|
| 1002 |
+
This is different from the `InMemoryTable` table, for which pickling does copy all the
|
| 1003 |
+
data in memory.
|
| 1004 |
+
|
| 1005 |
+
`InMemoryTable` must be used when data fit in memory, while `MemoryMapped` are reserved for
|
| 1006 |
+
data bigger than memory or when you want the memory footprint of your application to
|
| 1007 |
+
stay low.
|
| 1008 |
+
"""
|
| 1009 |
+
|
| 1010 |
+
def __init__(self, table: pa.Table, path: str, replays: Optional[list[Replay]] = None):
    """
    Wrap `table` and remember where it came from and how it was transformed.

    Args:
        table (`pa.Table`):
            Table passed on to the parent initializer.
        path (`str`):
            Path of the file; stored as an absolute path in `self.path`.
        replays (`list[Replay]`, *optional*):
            History of transforms; an empty list is used when `None` is passed.
    """
    super().__init__(table)
    self.path = os.path.abspath(path)
    self.replays: list[Replay] = [] if replays is None else replays
|
| 1014 |
+
|
| 1015 |
+
@classmethod
def from_file(cls, filename: str, replays=None):
    """
    Load the table from `filename` via `_memory_mapped_arrow_table_from_file`, apply
    `replays` to it (if any), and wrap the result together with the filename and replays.
    """
    mmapped = _memory_mapped_arrow_table_from_file(filename)
    replayed = cls._apply_replays(mmapped, replays)
    return cls(replayed, filename, replays)
|
| 1020 |
+
|
| 1021 |
+
def __getstate__(self):
|
| 1022 |
+
return {"path": self.path, "replays": self.replays}
|
| 1023 |
+
|
| 1024 |
+
def __setstate__(self, state):
    """
    Restore the instance from a pickled state: reload the table from `state["path"]`,
    re-apply `state["replays"]` to it, then re-run the initializer with the result.
    """
    path, replays = state["path"], state["replays"]
    reloaded = _memory_mapped_arrow_table_from_file(path)
    reloaded = self._apply_replays(reloaded, replays)
    MemoryMappedTable.__init__(self, reloaded, path=path, replays=replays)
|
| 1030 |
+
|
| 1031 |
+
@staticmethod
|
| 1032 |
+
def _apply_replays(table: pa.Table, replays: Optional[list[Replay]] = None) -> pa.Table:
|
| 1033 |
+
if replays is not None:
|
| 1034 |
+
for name, args, kwargs in replays:
|
| 1035 |
+
if name == "cast":
|
| 1036 |
+
table = table_cast(table, *args, **kwargs)
|
| 1037 |
+
elif name == "flatten":
|
| 1038 |
+
table = table_flatten(table, *args, **kwargs)
|
| 1039 |
+
else:
|
| 1040 |
+
table = getattr(table, name)(*args, **kwargs)
|
| 1041 |
+
return table
|
| 1042 |
+
|
| 1043 |
+
def _append_replay(self, replay: Replay) -> list[Replay]:
|
| 1044 |
+
replays = copy.deepcopy(self.replays)
|
| 1045 |
+
replays.append(replay)
|
| 1046 |
+
return replays
|
| 1047 |
+
|
| 1048 |
+
def slice(self, offset=0, length=None):
    """
    Compute zero-copy slice of this Table.

    The operation is also recorded as a `"slice"` replay on the returned table.

    Args:
        offset (`int`, defaults to `0`):
            Offset from start of table to slice.
        length (`int`, defaults to `None`):
            Length of slice (default is until end of table starting from
            offset).

    Returns:
        `datasets.table.Table`
    """
    recorded = self._append_replay(("slice", (offset, length), {}))
    # Use fast slicing here
    sliced = self.fast_slice(offset=offset, length=length)
    return MemoryMappedTable(sliced, self.path, recorded)
|
| 1066 |
+
|
| 1067 |
+
def filter(self, *args, **kwargs):
    """
    Select records from a Table. See `pyarrow.compute.filter` for full usage.

    The call is also recorded as a `"filter"` replay (with deep-copied arguments) on the returned table.
    """
    call = ("filter", copy.deepcopy(args), copy.deepcopy(kwargs))
    recorded = self._append_replay(call)
    filtered = self.table.filter(*args, **kwargs)
    return MemoryMappedTable(filtered, self.path, recorded)
|
| 1074 |
+
|
| 1075 |
+
def flatten(self, *args, **kwargs):
    """
    Flatten this Table. Each column with a struct type is flattened
    into one column per struct field. Other columns are left unchanged.

    The call is also recorded as a `"flatten"` replay (with deep-copied arguments) on the returned table.

    Args:
        memory_pool (`MemoryPool`, defaults to `None`):
            For memory allocations, if required, otherwise use default pool.

    Returns:
        `datasets.table.Table`
    """
    call = ("flatten", copy.deepcopy(args), copy.deepcopy(kwargs))
    recorded = self._append_replay(call)
    flattened = table_flatten(self.table, *args, **kwargs)
    return MemoryMappedTable(flattened, self.path, recorded)
|
| 1090 |
+
|
| 1091 |
+
def combine_chunks(self, *args, **kwargs):
    """
    Make a new table by combining the chunks this table has.

    All the underlying chunks in the ChunkedArray of each column are
    concatenated into zero or one chunk.

    The call is also recorded as a `"combine_chunks"` replay (with deep-copied arguments)
    on the returned table.

    Args:
        memory_pool (`MemoryPool`, defaults to `None`):
            For memory allocations, if required, otherwise use default pool.

    Returns:
        `datasets.table.Table`
    """
    call = ("combine_chunks", copy.deepcopy(args), copy.deepcopy(kwargs))
    recorded = self._append_replay(call)
    combined = self.table.combine_chunks(*args, **kwargs)
    return MemoryMappedTable(combined, self.path, recorded)
|
| 1108 |
+
|
| 1109 |
+
def cast(self, *args, **kwargs):
    """
    Cast table values to another schema.

    The call is also recorded as a `"cast"` replay (with deep-copied arguments) on the returned table.

    Args:
        target_schema (`Schema`):
            Schema to cast to, the names and order of fields must match.
        safe (`bool`, defaults to `True`):
            Check for overflows or other unsafe conversions.

    Returns:
        `datasets.table.Table`
    """
    call = ("cast", copy.deepcopy(args), copy.deepcopy(kwargs))
    recorded = self._append_replay(call)
    casted = table_cast(self.table, *args, **kwargs)
    return MemoryMappedTable(casted, self.path, recorded)
|
| 1125 |
+
|
| 1126 |
+
def replace_schema_metadata(self, *args, **kwargs):
    """
    EXPERIMENTAL: Create shallow copy of table by replacing schema
    key-value metadata with the indicated new metadata (which may be `None`,
    which deletes any existing metadata).

    The call is also recorded as a `"replace_schema_metadata"` replay (with deep-copied
    arguments) on the returned table.

    Args:
        metadata (`dict`, defaults to `None`):

    Returns:
        `datasets.table.Table`: shallow_copy
    """
    call = ("replace_schema_metadata", copy.deepcopy(args), copy.deepcopy(kwargs))
    recorded = self._append_replay(call)
    updated = self.table.replace_schema_metadata(*args, **kwargs)
    return MemoryMappedTable(updated, self.path, recorded)
|
| 1141 |
+
|
| 1142 |
+
def add_column(self, *args, **kwargs):
    """
    Add column to Table at position.

    A new table is returned with the column added, the original table
    object is left unchanged. The call is also recorded as an `"add_column"`
    replay (with deep-copied arguments) on the returned table.

    Args:
        i (`int`):
            Index to place the column at.
        field_ (`Union[str, pyarrow.Field]`):
            If a string is passed then the type is deduced from the column
            data.
        column (`Union[pyarrow.Array, List[pyarrow.Array]]`):
            Column data.

    Returns:
        `datasets.table.Table`: New table with the passed column added.
    """
    call = ("add_column", copy.deepcopy(args), copy.deepcopy(kwargs))
    recorded = self._append_replay(call)
    extended = self.table.add_column(*args, **kwargs)
    return MemoryMappedTable(extended, self.path, recorded)
|
| 1164 |
+
|
| 1165 |
+
def append_column(self, *args, **kwargs):
    """
    Append column at end of columns.

    The call is also recorded as an `"append_column"` replay (with deep-copied
    arguments) on the returned table.

    Args:
        field_ (`Union[str, pyarrow.Field]`):
            If a string is passed then the type is deduced from the column
            data.
        column (`Union[pyarrow.Array, List[pyarrow.Array]]`):
            Column data.

    Returns:
        `datasets.table.Table`:
            New table with the passed column added.
    """
    call = ("append_column", copy.deepcopy(args), copy.deepcopy(kwargs))
    recorded = self._append_replay(call)
    extended = self.table.append_column(*args, **kwargs)
    return MemoryMappedTable(extended, self.path, recorded)
|
| 1183 |
+
|
| 1184 |
+
def remove_column(self, *args, **kwargs):
    """
    Create new Table with the indicated column removed.

    The call is also recorded as a `"remove_column"` replay (with deep-copied
    arguments) on the returned table.

    Args:
        i (`int`):
            Index of column to remove.

    Returns:
        `datasets.table.Table`:
            New table without the column.
    """
    call = ("remove_column", copy.deepcopy(args), copy.deepcopy(kwargs))
    recorded = self._append_replay(call)
    reduced = self.table.remove_column(*args, **kwargs)
    return MemoryMappedTable(reduced, self.path, recorded)
|
| 1199 |
+
|
| 1200 |
+
def set_column(self, *args, **kwargs):
    """
    Replace column in Table at position.

    The call is also recorded as a `"set_column"` replay (with deep-copied
    arguments) on the returned table.

    Args:
        i (`int`):
            Index to place the column at.
        field_ (`Union[str, pyarrow.Field]`):
            If a string is passed then the type is deduced from the column
            data.
        column (`Union[pyarrow.Array, List[pyarrow.Array]]`):
            Column data.

    Returns:
        `datasets.table.Table`:
            New table with the passed column set.
    """
    call = ("set_column", copy.deepcopy(args), copy.deepcopy(kwargs))
    recorded = self._append_replay(call)
    updated = self.table.set_column(*args, **kwargs)
    return MemoryMappedTable(updated, self.path, recorded)
|
| 1220 |
+
|
| 1221 |
+
def rename_columns(self, *args, **kwargs):
    """
    Create new table with columns renamed to provided names.

    The call is also recorded as a `"rename_columns"` replay (with deep-copied
    arguments) on the returned table.
    """
    call = ("rename_columns", copy.deepcopy(args), copy.deepcopy(kwargs))
    recorded = self._append_replay(call)
    renamed = self.table.rename_columns(*args, **kwargs)
    return MemoryMappedTable(renamed, self.path, recorded)
|
| 1228 |
+
|
| 1229 |
+
def drop(self, *args, **kwargs):
    """
    Drop one or more columns and return a new table.

    The call is also recorded as a `"drop"` replay (with deep-copied
    arguments) on the returned table.

    Args:
        columns (`List[str]`):
            List of field names referencing existing columns.

    Raises:
        `KeyError` : if any of the passed columns name are not existing.

    Returns:
        `datasets.table.Table`:
            New table without the columns.
    """
    call = ("drop", copy.deepcopy(args), copy.deepcopy(kwargs))
    recorded = self._append_replay(call)
    reduced = self.table.drop(*args, **kwargs)
    return MemoryMappedTable(reduced, self.path, recorded)
|
| 1247 |
+
|
| 1248 |
+
def select(self, *args, **kwargs):
    """
    Select columns of the table.

    Returns a new table with the specified columns, and metadata preserved.
    The call is also recorded as a `"select"` replay (with deep-copied
    arguments) on the returned table.

    Args:
        columns (:obj:`Union[List[str], List[int]]`):
            The column names or integer indices to select.

    Returns:
        :class:`datasets.table.Table`: New table with the specified columns, and metadata preserved.
    """
    call = ("select", copy.deepcopy(args), copy.deepcopy(kwargs))
    recorded = self._append_replay(call)
    selected = self.table.select(*args, **kwargs)
    return MemoryMappedTable(selected, self.path, recorded)
|
| 1264 |
+
|
| 1265 |
+
|
| 1266 |
+
# A ConcatenationTable is the concatenation of several tables.
# Its ``blocks`` attribute stores a list of lists of blocks.
# The first axis concatenates the tables along the axis 0 (it appends rows),
# while the second axis concatenates tables along the axis 1 (it appends columns).
#
# ``TableBlockContainer`` is constrained to the three shapes a block collection can take:
# a single block, a list of blocks, or a list of lists of blocks.
TableBlockContainer = TypeVar("TableBlockContainer", TableBlock, list[TableBlock], list[list[TableBlock]])
|
| 1271 |
+
|
| 1272 |
+
|
| 1273 |
+
class ConcatenationTable(Table):
    """
    The table comes from the concatenation of several tables called blocks.
    It enables concatenation on both axis 0 (append rows) and axis 1 (append columns).

    The underlying tables are called "blocks" and can be either `InMemoryTable`
    or `MemoryMappedTable` objects.
    This allows to combine tables that come from memory or that are memory mapped.
    When a `ConcatenationTable` is pickled, then each block is pickled:
    - the `InMemoryTable` objects are pickled by copying all the data in memory.
    - the `MemoryMappedTable` objects are pickled without copying the data into memory.
    Instead, only the path to the memory mapped arrow file is pickled, as well as the list
    of transforms to "replay" when reloading the table from the disk.

    Its implementation requires to store each block separately.
    The `blocks` attribute stores a list of lists of blocks.
    The first axis concatenates the tables along the axis 0 (it appends rows),
    while the second axis concatenates tables along the axis 1 (it appends columns).

    If some columns are missing when concatenating on axis 0, they are filled with null values.
    This is done using `pyarrow.concat_tables(tables, promote_options="default")`.

    You can access the fully combined table by accessing the `ConcatenationTable.table` attribute,
    and the blocks by accessing the `ConcatenationTable.blocks` attribute.
    """

    def __init__(self, table: pa.Table, blocks: list[list[TableBlock]]):
        """
        Args:
            table (`pyarrow.Table`):
                The fully combined table, forwarded to the parent constructor.
            blocks (`list[list[TableBlock]]`):
                Row blocks (outer list) made of column blocks (inner lists).

        Raises:
            `TypeError`: if any element of `blocks` is not a `TableBlock`.
        """
        super().__init__(table)
        self.blocks = blocks
        # Check that all the blocks have the right type.
        # Only InMemoryTable and MemoryMappedTable are allowed.
        for subtables in blocks:
            for subtable in subtables:
                if not isinstance(subtable, TableBlock):
                    raise TypeError(
                        "The blocks of a ConcatenationTable must be InMemoryTable or MemoryMappedTable objects"
                        f", but got {_short_str(subtable)}."
                    )

    def __getstate__(self):
        """Return the pickled state: the blocks plus the schema of the combined table."""
        return {"blocks": self.blocks, "schema": self.table.schema}

    def __setstate__(self, state):
        """Rebuild the combined table from the pickled blocks and realign it with the pickled schema."""
        blocks = state["blocks"]
        schema = state["schema"]
        table = self._concat_blocks_horizontally_and_vertically(blocks)
        if schema is not None and table.schema != schema:
            # We fix the columns by concatenating with an empty table with the right columns
            empty_table = pa.Table.from_batches([], schema=schema)
            # We set promote_options="default" to fill missing columns with null values
            table = pa.concat_tables([table, empty_table], promote_options="default")
        ConcatenationTable.__init__(self, table, blocks=blocks)

    @staticmethod
    def _concat_blocks(blocks: list[Union[TableBlock, pa.Table]], axis: int = 0) -> pa.Table:
        """
        Concatenate blocks along one axis into a single `pyarrow.Table`.

        Args:
            blocks: Objects exposing a `table` attribute (it is used) or raw tables (used as is).
            axis (`int`): `0` to append rows, `1` to append columns.

        Raises:
            `ValueError`: if `axis` is neither 0 nor 1.
        """
        pa_tables = [table.table if hasattr(table, "table") else table for table in blocks]
        if axis == 0:
            # We set promote_options="default" to fill missing columns with null values
            return pa.concat_tables(pa_tables, promote_options="default")
        elif axis == 1:
            # Start from the first table and append the columns of the others one by one.
            for i, table in enumerate(pa_tables):
                if i == 0:
                    pa_table = table
                else:
                    for name, col in zip(table.column_names, table.columns):
                        pa_table = pa_table.append_column(name, col)
            return pa_table
        else:
            raise ValueError("'axis' must be either 0 or 1")

    @classmethod
    def _concat_blocks_horizontally_and_vertically(cls, blocks: list[list[TableBlock]]) -> pa.Table:
        """Concatenate each row block on axis 1, then stack the results on axis 0. Empty row blocks are skipped."""
        pa_tables_to_concat_vertically = []
        for tables in blocks:
            if not tables:
                continue
            pa_table_horizontally_concatenated = cls._concat_blocks(tables, axis=1)
            pa_tables_to_concat_vertically.append(pa_table_horizontally_concatenated)
        return cls._concat_blocks(pa_tables_to_concat_vertically, axis=0)

    @classmethod
    def _merge_blocks(cls, blocks: TableBlockContainer, axis: Optional[int] = None) -> TableBlockContainer:
        """
        Merge consecutive `InMemoryTable` blocks into single `InMemoryTable` blocks.

        With an explicit `axis`, `blocks` is treated as a flat list and each run of consecutive
        in-memory blocks is concatenated along that axis; other blocks are kept as they are.
        With `axis=None`, `blocks` is treated as a list of row blocks: each row block is merged
        on axis 1 and, if every row block is then a single block, the result is merged on axis 0.
        """
        if axis is not None:
            merged_blocks = []
            for is_in_memory, block_group in groupby(blocks, key=lambda x: isinstance(x, InMemoryTable)):
                if is_in_memory:
                    block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
                merged_blocks += list(block_group)
        else:  # both
            merged_blocks = [cls._merge_blocks(row_block, axis=1) for row_block in blocks]
            if all(len(row_block) == 1 for row_block in merged_blocks):
                merged_blocks = cls._merge_blocks(
                    [block for row_block in merged_blocks for block in row_block], axis=0
                )
        return merged_blocks

    @classmethod
    def _consolidate_blocks(cls, blocks: TableBlockContainer) -> TableBlockContainer:
        """Merge in-memory blocks where possible, dispatching on the shape of `blocks` (block, list, or list of lists)."""
        if isinstance(blocks, TableBlock):
            return blocks
        elif isinstance(blocks[0], TableBlock):
            return cls._merge_blocks(blocks, axis=0)
        else:
            return cls._merge_blocks(blocks)

    @classmethod
    def from_blocks(cls, blocks: TableBlockContainer) -> "ConcatenationTable":
        """
        Create a `ConcatenationTable` from a block, a list of blocks, or a list of lists of blocks.

        The blocks are consolidated first; a flat list of blocks is treated as row blocks
        (concatenated on axis 0).
        """
        blocks = cls._consolidate_blocks(blocks)
        if isinstance(blocks, TableBlock):
            table = blocks
            return cls(table.table, [[table]])
        elif isinstance(blocks[0], TableBlock):
            table = cls._concat_blocks(blocks, axis=0)
            blocks = [[t] for t in blocks]
            return cls(table, blocks)
        else:
            table = cls._concat_blocks_horizontally_and_vertically(blocks)
            return cls(table, blocks)

    @classmethod
    def from_tables(cls, tables: list[Union[pa.Table, Table]], axis: int = 0) -> "ConcatenationTable":
        """Create `ConcatenationTable` from list of tables.

        Args:
            tables (list of `Table` or list of `pyarrow.Table`):
                List of tables.
            axis (`{0, 1}`, defaults to `0`, meaning over rows):
                Axis to concatenate over, where `0` means over rows (vertically) and `1` means over columns
                (horizontally).

                <Added version="1.6.0"/>
        """

        def to_blocks(table: Union[pa.Table, Table]) -> list[list[TableBlock]]:
            # Normalize any supported input to a list of row blocks.
            if isinstance(table, pa.Table):
                return [[InMemoryTable(table)]]
            elif isinstance(table, ConcatenationTable):
                return copy.deepcopy(table.blocks)
            else:
                return [[table]]

        def _slice_row_block(row_block: list[TableBlock], length: int) -> tuple[list[TableBlock], list[TableBlock]]:
            # Split every block of the row block into its first `length` rows and the rest.
            sliced = [table.slice(0, length) for table in row_block]
            remainder = [table.slice(length, len(row_block[0]) - length) for table in row_block]
            return sliced, remainder

        def _split_both_like(
            result: list[list[TableBlock]], blocks: list[list[TableBlock]]
        ) -> tuple[list[list[TableBlock]], list[list[TableBlock]]]:
            """
            Make sure each row_block contain the same num_rows to be able to concatenate them on axis=1.

            To do so, we modify both blocks sets to have the same row_blocks boundaries.
            For example, if `result` has 2 row_blocks of 3 rows and `blocks` has 3 row_blocks of 2 rows,
            we modify both to have 4 row_blocks of size 2, 1, 1 and 2:

                    [ x   x   x | x   x   x ]
                  + [ y   y | y   y | y   y ]
                  -----------------------------
                  = [ x   x | x | x | x   x ]
                    [ y   y | y | y | y   y ]

            """
            result, blocks = list(result), list(blocks)
            new_result, new_blocks = [], []
            while result and blocks:
                # we slice the longest row block to save two row blocks of same length
                # and we replace the long row block by its remainder if necessary
                if len(result[0][0]) > len(blocks[0][0]):
                    new_blocks.append(blocks[0])
                    sliced, result[0] = _slice_row_block(result[0], len(blocks.pop(0)[0]))
                    new_result.append(sliced)
                elif len(result[0][0]) < len(blocks[0][0]):
                    new_result.append(result[0])
                    sliced, blocks[0] = _slice_row_block(blocks[0], len(result.pop(0)[0]))
                    new_blocks.append(sliced)
                else:
                    new_result.append(result.pop(0))
                    new_blocks.append(blocks.pop(0))
            if result or blocks:
                raise ValueError("Failed to concatenate on axis=1 because tables don't have the same number of rows")
            return new_result, new_blocks

        def _extend_blocks(
            result: list[list[TableBlock]], blocks: list[list[TableBlock]], axis: int = 0
        ) -> list[list[TableBlock]]:
            if axis == 0:
                result.extend(blocks)
            elif axis == 1:
                # We make sure each row_block have the same num_rows
                result, blocks = _split_both_like(result, blocks)
                for i, row_block in enumerate(blocks):
                    result[i].extend(row_block)
            return result

        blocks = to_blocks(tables[0])
        for table in tables[1:]:
            table_blocks = to_blocks(table)
            blocks = _extend_blocks(blocks, table_blocks, axis=axis)
        return cls.from_blocks(blocks)

    @property
    def _slices(self):
        """Yield one `(offset, length)` pair per row block, where `length` is `len()` of its first block."""
        offset = 0
        for tables in self.blocks:
            length = len(tables[0])
            yield (offset, length)
            offset += length

    def slice(self, offset=0, length=None):
        """
        Compute zero-copy slice of this Table.

        Args:
            offset (`int`, defaults to `0`):
                Offset from start of table to slice.
            length (`int`, defaults to `None`):
                Length of slice (default is until end of table starting from
                offset).

        Returns:
            `datasets.table.Table`
        """
        table = self.table.slice(offset, length=length)
        length = length if length is not None else self.num_rows - offset
        blocks = []
        for tables in self.blocks:
            n_rows = len(tables[0])
            if length == 0:
                break
            elif n_rows <= offset:
                # The slice starts after this row block: skip it.
                offset = offset - n_rows
            elif n_rows <= offset + length:
                # The slice covers this row block up to its end: keep its tail.
                blocks.append([t.slice(offset) for t in tables])
                length, offset = length + offset - n_rows, 0
            else:
                # The slice ends inside this row block.
                blocks.append([t.slice(offset, length) for t in tables])
                length, offset = 0, 0
        return ConcatenationTable(table, blocks)

    def filter(self, mask, *args, **kwargs):
        """
        Select records from a Table. See `pyarrow.compute.filter` for full usage.

        The mask is sliced per row block so that each block is filtered with its own part of it.
        """
        table = self.table.filter(mask, *args, **kwargs)
        blocks = []
        for (offset, length), tables in zip(self._slices, self.blocks):
            submask = mask.slice(offset, length)
            blocks.append([t.filter(submask, *args, **kwargs) for t in tables])
        return ConcatenationTable(table, blocks)

    def flatten(self, *args, **kwargs):
        """
        Flatten this Table. Each column with a struct type is flattened
        into one column per struct field. Other columns are left unchanged.

        Args:
            memory_pool (`MemoryPool`, defaults to `None`):
                For memory allocations, if required, otherwise use default pool.

        Returns:
            `datasets.table.Table`
        """
        table = table_flatten(self.table, *args, **kwargs)
        blocks = []
        for tables in self.blocks:
            blocks.append([t.flatten(*args, **kwargs) for t in tables])
        return ConcatenationTable(table, blocks)

    def combine_chunks(self, *args, **kwargs):
        """
        Make a new table by combining the chunks this table has.

        All the underlying chunks in the `ChunkedArray` of each column are
        concatenated into zero or one chunk.

        Args:
            memory_pool (`MemoryPool`, defaults to `None`):
                For memory allocations, if required, otherwise use default pool.

        Returns:
            `datasets.table.Table`
        """
        table = self.table.combine_chunks(*args, **kwargs)
        blocks = []
        for tables in self.blocks:
            blocks.append([t.combine_chunks(*args, **kwargs) for t in tables])
        return ConcatenationTable(table, blocks)

    def cast(self, target_schema, *args, **kwargs):
        """
        Cast table values to another schema.

        Args:
            target_schema (`Schema`):
                Schema to cast to, the names and order of fields must match.
            safe (`bool`, defaults to `True`):
                Check for overflows or other unsafe conversions.

        Returns:
            `datasets.table.Table`
        """
        from .features import Features

        table = table_cast(self.table, target_schema, *args, **kwargs)
        target_features = Features.from_arrow_schema(target_schema)
        blocks = []
        for subtables in self.blocks:
            new_tables = []
            # Each row block consumes its own copy of the target fields.
            fields = list(target_schema)
            for subtable in subtables:
                # Pick, for every column of this block, the matching target field (and remove it from the pool).
                subfields = []
                for name in subtable.column_names:
                    subfields.append(fields.pop(next(i for i, field in enumerate(fields) if field.name == name)))
                subfeatures = Features({subfield.name: target_features[subfield.name] for subfield in subfields})
                subschema = subfeatures.arrow_schema
                new_tables.append(subtable.cast(subschema, *args, **kwargs))
            blocks.append(new_tables)
        return ConcatenationTable(table, blocks)

    def replace_schema_metadata(self, *args, **kwargs):
        """
        EXPERIMENTAL: Create shallow copy of table by replacing schema
        key-value metadata with the indicated new metadata (which may be `None`,
        which deletes any existing metadata).

        Args:
            metadata (`dict`, defaults to `None`):

        Returns:
            `datasets.table.Table`: shallow_copy
        """
        table = self.table.replace_schema_metadata(*args, **kwargs)
        blocks = []
        for tables in self.blocks:
            blocks.append([t.replace_schema_metadata(*args, **kwargs) for t in tables])
        # Use the updated blocks (not `self.blocks`) so that the blocks carry the same
        # metadata as the combined table.
        return ConcatenationTable(table, blocks)

    def add_column(self, *args, **kwargs):
        """
        Add column to Table at position.

        A new table is returned with the column added, the original table
        object is left unchanged.

        Not implemented for `ConcatenationTable`: this method always raises `NotImplementedError`.

        Args:
            i (`int`):
                Index to place the column at.
            field_ (`Union[str, pyarrow.Field]`):
                If a string is passed then the type is deduced from the column
                data.
            column (`Union[pyarrow.Array, List[pyarrow.Array]]`):
                Column data.

        Returns:
            `datasets.table.Table`: New table with the passed column added.
        """
        raise NotImplementedError()

    def append_column(self, *args, **kwargs):
        """
        Append column at end of columns.

        Not implemented for `ConcatenationTable`: this method always raises `NotImplementedError`.

        Args:
            field_ (`Union[str, pyarrow.Field]`):
                If a string is passed then the type is deduced from the column
                data.
            column (`Union[pyarrow.Array, List[pyarrow.Array]]`):
                Column data.

        Returns:
            `datasets.table.Table`:
                New table with the passed column added.
        """
        raise NotImplementedError()

    def remove_column(self, i, *args, **kwargs):
        """
        Create new Table with the indicated column removed.

        Args:
            i (`int`):
                Index of column to remove.

        Returns:
            `datasets.table.Table`:
                New table without the column.
        """
        table = self.table.remove_column(i, *args, **kwargs)
        # The index refers to the combined table, so blocks are matched by column name instead.
        name = self.table.column_names[i]
        blocks = []
        for tables in self.blocks:
            blocks.append(
                [
                    t.remove_column(t.column_names.index(name), *args, **kwargs) if name in t.column_names else t
                    for t in tables
                ]
            )
        return ConcatenationTable(table, blocks)

    def set_column(self, *args, **kwargs):
        """
        Replace column in Table at position.

        Not implemented for `ConcatenationTable`: this method always raises `NotImplementedError`.

        Args:
            i (`int`):
                Index to place the column at.
            field_ (`Union[str, pyarrow.Field]`):
                If a string is passed then the type is deduced from the column
                data.
            column (`Union[pyarrow.Array, List[pyarrow.Array]]`):
                Column data.

        Returns:
            `datasets.table.Table`:
                New table with the passed column set.
        """
        raise NotImplementedError()

    def rename_columns(self, names, *args, **kwargs):
        """
        Create new table with columns renamed to provided names.
        """
        table = self.table.rename_columns(names, *args, **kwargs)
        # Map old names to new names so that each block renames only its own columns.
        names = dict(zip(self.table.column_names, names))
        blocks = []
        for tables in self.blocks:
            blocks.append(
                [t.rename_columns([names[name] for name in t.column_names], *args, **kwargs) for t in tables]
            )
        return ConcatenationTable(table, blocks)

    def drop(self, columns, *args, **kwargs):
        """
        Drop one or more columns and return a new table.

        Args:
            columns (`List[str]`):
                List of field names referencing existing columns.

        Raises:
            `KeyError` : if any of the passed columns name are not existing.

        Returns:
            `datasets.table.Table`:
                New table without the columns.
        """
        table = self.table.drop(columns, *args, **kwargs)
        blocks = []
        for tables in self.blocks:
            # Each block only drops the requested columns it actually contains.
            blocks.append([t.drop([c for c in columns if c in t.column_names], *args, **kwargs) for t in tables])
        return ConcatenationTable(table, blocks)

    def select(self, columns, *args, **kwargs):
        """
        Select columns of the table.

        Returns a new table with the specified columns, and metadata preserved.

        Args:
            columns (:obj:`Union[List[str], List[int]]`):
                The column names or integer indices to select.

        Returns:
            :class:`datasets.table.Table`: New table with the specified columns, and metadata preserved.
        """
        table = self.table.select(columns, *args, **kwargs)
        # Integer indices refer to the combined table, while blocks are matched by column name:
        # resolve indices to names first, otherwise no block column would ever match an index.
        all_names = self.table.column_names
        names = [all_names[c] if isinstance(c, int) else c for c in columns]
        blocks = []
        for tables in self.blocks:
            blocks.append([t.select([c for c in names if c in t.column_names], *args, **kwargs) for t in tables])
        return ConcatenationTable(table, blocks)
|
| 1744 |
+
|
| 1745 |
+
|
| 1746 |
+
def concat_tables(tables: list[Table], axis: int = 0) -> Table:
    """
    Concatenate tables.

    Args:
        tables (list of `Table`):
            List of tables to be concatenated.
        axis (`{0, 1}`, defaults to `0`, meaning over rows):
            Axis to concatenate over, where `0` means over rows (vertically) and `1` means over columns
            (horizontally).

            <Added version="1.6.0"/>
    Returns:
        `datasets.table.Table`:
            If the number of input tables is > 1, then the returned table is a `datasets.table.ConcatenationTable`.
            Otherwise if there's only one table, it is returned as is.
    """
    # Materialize first so that any iterable can be measured and indexed.
    materialized = [*tables]
    if len(materialized) != 1:
        return ConcatenationTable.from_tables(materialized, axis=axis)
    (only_table,) = materialized
    return only_table
|
| 1767 |
+
|
| 1768 |
+
|
| 1769 |
+
def list_table_cache_files(table: Table) -> list[str]:
    """
    List the cache files backing a table.

    Cache files are used when parts of the table come from the disk via memory mapping.

    Returns:
        `List[str]`:
            A list of paths to the cache files loaded by the table.
    """
    if isinstance(table, ConcatenationTable):
        # Recurse into every block of the concatenation, in block order.
        return [
            path
            for row_of_blocks in table.blocks
            for block in row_of_blocks
            for path in list_table_cache_files(block)
        ]
    if isinstance(table, MemoryMappedTable):
        return [table.path]
    return []
|
| 1788 |
+
|
| 1789 |
+
|
| 1790 |
+
def _wrap_for_chunked_arrays(func):
|
| 1791 |
+
"""Apply the function on each chunk of a `pyarrow.ChunkedArray`, or on the array directly"""
|
| 1792 |
+
|
| 1793 |
+
def wrapper(array, *args, **kwargs):
|
| 1794 |
+
if isinstance(array, pa.ChunkedArray):
|
| 1795 |
+
return pa.chunked_array([func(chunk, *args, **kwargs) for chunk in array.chunks])
|
| 1796 |
+
else:
|
| 1797 |
+
return func(array, *args, **kwargs)
|
| 1798 |
+
|
| 1799 |
+
return wrapper
|
| 1800 |
+
|
| 1801 |
+
|
| 1802 |
+
def _are_list_values_of_length(array: pa.ListArray, length: int) -> bool:
    """Tell whether every sub-list of a `pa.ListArray` has the specified length.

    An array made only of nulls is also accepted.
    """
    lengths_match = pc.equal(array.value_lengths(), length)
    if pc.all(lengths_match).as_py():
        return True
    return array.null_count == len(array)
|
| 1805 |
+
|
| 1806 |
+
|
| 1807 |
+
def _combine_list_array_offsets_with_mask(array: pa.ListArray) -> pa.Array:
    """Add the null bitmap to the offsets of a `pa.ListArray`.

    Args:
        array (`pa.ListArray`):
            List array whose offsets should carry its nulls. Callers in this module also
            pass large-list arrays.

    Returns:
        `pa.Array`: the offsets of `array`, where the start offset of every null entry is
        replaced by a null. The offsets are returned unchanged when `array` has no nulls.
    """
    offsets = array.offsets
    if array.null_count > 0:
        offsets = pa.concat_arrays(
            [
                # Build the null replacements with the offsets' own type (instead of a
                # hard-coded int32) so large lists with 64-bit offsets are handled too.
                pc.replace_with_mask(offsets[:-1], array.is_null(), pa.nulls(len(array), offsets.type)),
                # The final offset closes the last list and has no validity bit of its own.
                offsets[-1:],
            ]
        )
    return offsets
|
| 1818 |
+
|
| 1819 |
+
|
| 1820 |
+
def _storage_type(type: pa.DataType) -> pa.DataType:
    """Convert a (possibly nested) `pa.ExtensionType` to its storage type.

    Struct, list, large list and fixed-size list types are rebuilt recursively so that
    extension types nested inside them are replaced as well.

    Args:
        type (`pa.DataType`):
            Type to convert.

    Returns:
        `pa.DataType`: the equivalent type without extension types; any other type is
        returned as is.
    """
    if isinstance(type, pa.ExtensionType):
        return _storage_type(type.storage_type)
    elif isinstance(type, pa.StructType):
        return pa.struct([pa.field(field.name, _storage_type(field.type)) for field in type])
    elif isinstance(type, pa.ListType):
        return pa.list_(_storage_type(type.value_type))
    elif isinstance(type, pa.LargeListType):
        # Large lists were previously returned untouched, leaving nested extension types in place.
        return pa.large_list(_storage_type(type.value_type))
    elif isinstance(type, pa.FixedSizeListType):
        return pa.list_(_storage_type(type.value_type), type.list_size)
    return type
|
| 1831 |
+
|
| 1832 |
+
|
| 1833 |
+
def _short_str(value: Any) -> str:
|
| 1834 |
+
out = str(value)
|
| 1835 |
+
if len(out) > 3000:
|
| 1836 |
+
out = out[:1500] + "\n...\n" + out[-1500:]
|
| 1837 |
+
return out
|
| 1838 |
+
|
| 1839 |
+
|
| 1840 |
+
@_wrap_for_chunked_arrays
def array_cast(
    array: pa.Array, pa_type: pa.DataType, allow_primitive_to_str: bool = True, allow_decimal_to_str: bool = True
) -> Union[pa.Array, pa.FixedSizeListArray, pa.ListArray, pa.StructArray, pa.ExtensionArray]:
    """Improved version of `pa.Array.cast`

    It supports casting `pa.StructArray` objects to re-order the fields.
    It also lets you control certain aspects of the casting, e.g. whether
    to disable casting primitives (`booleans`, `floats` or `ints`) or
    disable casting decimals to strings.

    Args:
        array (`pa.Array`):
            PyArrow array to cast
        pa_type (`pa.DataType`):
            Target PyArrow type
        allow_primitive_to_str (`bool`, defaults to `True`):
            Whether to allow casting primitives to strings.
            Defaults to `True`.
        allow_decimal_to_str (`bool`, defaults to `True`):
            Whether to allow casting decimals to strings.
            Defaults to `True`.

    Raises:
        `pa.ArrowInvalidError`: if the arrow data casting fails
        `TypeError`: if the target type is not supported, e.g.

            - if a field is missing
            - if casting from primitives to strings and `allow_primitive_to_str` is `False`
            - if casting from decimals to strings and `allow_decimal_to_str` is `False`

    Returns:
        `pyarrow.Array`: the casted array (a `pyarrow.ChunkedArray` when a chunked array
        is passed, since the decorator applies the cast chunk by chunk)
    """
    # Recursive helper that forwards the two string-casting flags to nested casts.
    _c = partial(array_cast, allow_primitive_to_str=allow_primitive_to_str, allow_decimal_to_str=allow_decimal_to_str)
    # Always work on the storage of an extension array.
    if isinstance(array, pa.ExtensionArray):
        array = array.storage
    if isinstance(pa_type, pa.ExtensionType):
        # Cast to the storage type first, then wrap the result in the extension type.
        return pa_type.wrap_array(_c(array, pa_type.storage_type))
    elif array.type == pa_type:
        return array
    elif pa.types.is_struct(array.type):
        # Struct -> struct is only supported when both sides have exactly the same field names;
        # fields are emitted in the order of the target type.
        if pa.types.is_struct(pa_type) and ({field.name for field in pa_type} == {field.name for field in array.type}):
            if array.type.num_fields == 0:
                return array
            arrays = [_c(array.field(field.name), field.type) for field in pa_type]
            return pa.StructArray.from_arrays(arrays, fields=list(pa_type), mask=array.is_null())
    elif pa.types.is_list(array.type) or pa.types.is_large_list(array.type):
        if pa.types.is_fixed_size_list(pa_type):
            # Only possible when every sub-list already has the target size (or the array is all null).
            if _are_list_values_of_length(array, pa_type.list_size):
                if array.null_count > 0:
                    # Ensure each null value in the array translates to [null] * pa_type.list_size in the array's values array
                    array_type = array.type
                    storage_type = _storage_type(array_type)
                    if array_type != storage_type:
                        # Temporarily convert to the storage type to support extension types in the slice operation
                        array = _c(array, storage_type)
                        array = pc.list_slice(array, 0, pa_type.list_size, return_fixed_size_list=True)
                        array = _c(array, array_type)
                    else:
                        array = pc.list_slice(array, 0, pa_type.list_size, return_fixed_size_list=True)
                    array_values = array.values
                    return pa.FixedSizeListArray.from_arrays(
                        _c(array_values, pa_type.value_type), pa_type.list_size, mask=array.is_null()
                    )
                else:
                    # No nulls: take the window of flat values selected by the array's offset and length.
                    array_values = array.values[
                        array.offset * pa_type.list_size : (array.offset + len(array)) * pa_type.list_size
                    ]
                    return pa.FixedSizeListArray.from_arrays(_c(array_values, pa_type.value_type), pa_type.list_size)
        elif pa.types.is_list(pa_type):
            # Merge offsets with the null bitmap to avoid the "Null bitmap with offsets slice not supported" ArrowNotImplementedError
            array_offsets = _combine_list_array_offsets_with_mask(array)
            return pa.ListArray.from_arrays(array_offsets, _c(array.values, pa_type.value_type))
        elif pa.types.is_large_list(pa_type):
            # Merge offsets with the null bitmap to avoid the "Null bitmap with offsets slice not supported" ArrowNotImplementedError
            array_offsets = _combine_list_array_offsets_with_mask(array)
            return pa.LargeListArray.from_arrays(array_offsets, _c(array.values, pa_type.value_type))
    elif pa.types.is_fixed_size_list(array.type):
        if pa.types.is_fixed_size_list(pa_type):
            # Fixed-size -> fixed-size is only handled when the sizes agree; otherwise fall through to the final TypeError.
            if pa_type.list_size == array.type.list_size:
                array_values = array.values[
                    array.offset * array.type.list_size : (array.offset + len(array)) * array.type.list_size
                ]
                return pa.FixedSizeListArray.from_arrays(
                    _c(array_values, pa_type.value_type), pa_type.list_size, mask=array.is_null()
                )
        elif pa.types.is_list(pa_type):
            # Regular offsets: entry i starts at (i + array.offset) * list_size in the flat values.
            array_offsets = (np.arange(len(array) + 1) + array.offset) * array.type.list_size
            return pa.ListArray.from_arrays(array_offsets, _c(array.values, pa_type.value_type), mask=array.is_null())
        elif pa.types.is_large_list(pa_type):
            array_offsets = (np.arange(len(array) + 1) + array.offset) * array.type.list_size
            return pa.LargeListArray.from_arrays(
                array_offsets, _c(array.values, pa_type.value_type), mask=array.is_null()
            )
    else:
        # Non-nested source type: apply the opt-out checks, then delegate to `pa.Array.cast`.
        if pa.types.is_string(pa_type):
            if not allow_primitive_to_str and pa.types.is_primitive(array.type):
                raise TypeError(
                    f"Couldn't cast array of type {_short_str(array.type)} to {_short_str(pa_type)} "
                    f"since allow_primitive_to_str is set to {allow_primitive_to_str} "
                )
            if not allow_decimal_to_str and pa.types.is_decimal(array.type):
                raise TypeError(
                    f"Couldn't cast array of type {_short_str(array.type)} to {_short_str(pa_type)} "
                    f"and allow_decimal_to_str is set to {allow_decimal_to_str}"
                )
        # Casting a non-null array to the null type is refused.
        if pa.types.is_null(pa_type) and not pa.types.is_null(array.type):
            raise TypeError(f"Couldn't cast array of type {_short_str(array.type)} to {_short_str(pa_type)}")
        return array.cast(pa_type)
    # Reached when a nested source type has no compatible branch above.
    raise TypeError(f"Couldn't cast array of type {_short_str(array.type)} to {_short_str(pa_type)}")
|
| 1951 |
+
|
| 1952 |
+
|
| 1953 |
+
@_wrap_for_chunked_arrays
def cast_array_to_feature(
    array: pa.Array, feature: "FeatureType", allow_primitive_to_str: bool = True, allow_decimal_to_str: bool = True
) -> pa.Array:
    """Cast an array to the arrow type that corresponds to the requested feature type.
    For custom features like [`Audio`] or [`Image`], it takes into account the "cast_storage" methods
    they defined to enable casting from other arrow types.

    Args:
        array (`pa.Array`):
            The PyArrow array to cast.
        feature (`datasets.features.FeatureType`):
            The target feature type.
        allow_primitive_to_str (`bool`, defaults to `True`):
            Whether to allow casting primitives to strings.
            Defaults to `True`.
        allow_decimal_to_str (`bool`, defaults to `True`):
            Whether to allow casting decimals to strings.
            Defaults to `True`.

    Raises:
        `pa.ArrowInvalidError`: if the arrow data casting fails
        `TypeError`: if the target type is not supported, e.g.

            - if a field is missing
            - if casting from primitives and `allow_primitive_to_str` is `False`
            - if casting from decimals and `allow_decimal_to_str` is `False`

    Returns:
        array (`pyarrow.Array`): the casted array
    """
    from .features.features import LargeList, List, get_nested_type

    # Recursive helper that forwards the two string-casting flags to nested casts.
    _c = partial(
        cast_array_to_feature,
        allow_primitive_to_str=allow_primitive_to_str,
        allow_decimal_to_str=allow_decimal_to_str,
    )

    # Always work on the storage of an extension array.
    if isinstance(array, pa.ExtensionArray):
        array = array.storage
    # Features that define `cast_storage` take over the whole conversion.
    if hasattr(feature, "cast_storage"):
        return feature.cast_storage(array)

    if pa.types.is_struct(array.type):
        # feature must be a dict
        # The array's fields must be a subset of the feature's keys; keys missing from the array are filled with nulls.
        if isinstance(feature, dict) and (array_fields := {field.name for field in array.type}) <= set(feature):
            null_array = pa.array([None] * len(array))
            arrays = [
                _c(array.field(name) if name in array_fields else null_array, subfeature)
                for name, subfeature in feature.items()
            ]
            return pa.StructArray.from_arrays(arrays, names=list(feature), mask=array.is_null())
    elif pa.types.is_list(array.type) or pa.types.is_large_list(array.type):
        # feature must be either List(subfeature) or LargeList(subfeature)
        if isinstance(feature, LargeList):
            casted_array_values = _c(array.values, feature.feature)
            if pa.types.is_large_list(array.type) and casted_array_values.type == array.values.type:
                # Both array and feature have equal large_list type and values (within the list) type
                return array
            else:
                # Merge offsets with the null bitmap to avoid the "Null bitmap with offsets slice not supported" ArrowNotImplementedError
                array_offsets = _combine_list_array_offsets_with_mask(array)
                return pa.LargeListArray.from_arrays(array_offsets, casted_array_values)
        elif isinstance(feature, List):
            if feature.length > -1:
                # Fixed-length target: only possible when every sub-list already has that length (or the array is all null).
                if _are_list_values_of_length(array, feature.length):
                    if array.null_count > 0:
                        # Ensure each null value in the array translates to [null] * pa_type.list_size in the array's values array
                        array_type = array.type
                        storage_type = _storage_type(array_type)
                        if array_type != storage_type:
                            # Temporarily convert to the storage type to support extension types in the slice operation
                            array = array_cast(
                                array,
                                storage_type,
                                allow_primitive_to_str=allow_primitive_to_str,
                                allow_decimal_to_str=allow_decimal_to_str,
                            )
                            array = pc.list_slice(array, 0, feature.length, return_fixed_size_list=True)
                            array = array_cast(
                                array,
                                array_type,
                                allow_primitive_to_str=allow_primitive_to_str,
                                allow_decimal_to_str=allow_decimal_to_str,
                            )
                        else:
                            array = pc.list_slice(array, 0, feature.length, return_fixed_size_list=True)
                        array_values = array.values
                        casted_array_values = _c(array_values, feature.feature)
                        return pa.FixedSizeListArray.from_arrays(
                            casted_array_values, feature.length, mask=array.is_null()
                        )
                    else:
                        # No nulls: take the window of flat values selected by the array's offset and length.
                        array_values = array.values[
                            array.offset * feature.length : (array.offset + len(array)) * feature.length
                        ]
                        return pa.FixedSizeListArray.from_arrays(_c(array_values, feature.feature), feature.length)
            else:
                casted_array_values = _c(array.values, feature.feature)
                if pa.types.is_list(array.type) and casted_array_values.type == array.values.type:
                    # Both array and feature have equal list type and values (within the list) type
                    return array
                else:
                    # Merge offsets with the null bitmap to avoid the "Null bitmap with offsets slice not supported" ArrowNotImplementedError
                    array_offsets = _combine_list_array_offsets_with_mask(array)
                    return pa.ListArray.from_arrays(array_offsets, casted_array_values)
    elif pa.types.is_fixed_size_list(array.type):
        # feature must be List(subfeature) or LargeList(subfeature)
        if isinstance(feature, LargeList):
            # Regular offsets: entry i starts at (i + array.offset) * list_size in the flat values.
            array_offsets = (np.arange(len(array) + 1) + array.offset) * array.type.list_size
            return pa.LargeListArray.from_arrays(
                array_offsets, _c(array.values, feature.feature), mask=array.is_null()
            )
        elif isinstance(feature, List):
            if feature.length > -1:
                # A different fixed length is not handled here and falls through to the checks below.
                if feature.length == array.type.list_size:
                    array_values = array.values[
                        array.offset * array.type.list_size : (array.offset + len(array)) * array.type.list_size
                    ]
                    casted_array_values = _c(array_values, feature.feature)
                    return pa.FixedSizeListArray.from_arrays(casted_array_values, feature.length, mask=array.is_null())
            else:
                array_offsets = (np.arange(len(array) + 1) + array.offset) * array.type.list_size
                return pa.ListArray.from_arrays(array_offsets, _c(array.values, feature.feature), mask=array.is_null())
    # Fallbacks for everything that did not return above.
    if pa.types.is_null(array.type):
        # A null-typed array is cast to the arrow type derived from the feature.
        return array_cast(
            array,
            get_nested_type(feature),
            allow_primitive_to_str=allow_primitive_to_str,
            allow_decimal_to_str=allow_decimal_to_str,
        )
    elif not isinstance(feature, (List, LargeList, dict)):
        # Non-nested feature: calling it yields the target arrow type.
        return array_cast(
            array,
            feature(),
            allow_primitive_to_str=allow_primitive_to_str,
            allow_decimal_to_str=allow_decimal_to_str,
        )
    raise TypeError(f"Couldn't cast array of type\n{_short_str(array.type)}\nto\n{_short_str(feature)}")
|
| 2093 |
+
|
| 2094 |
+
|
| 2095 |
+
@_wrap_for_chunked_arrays
def embed_array_storage(array: pa.Array, feature: "FeatureType", token_per_repo_id=None):
    """Embed data into an array's storage.
    For custom features like Audio or Image, the "embed_storage" method they define is used
    to embed external data (e.g. an image file) into the array.

    <Added version="2.4.0"/>

    Args:
        array (`pa.Array`):
            The PyArrow array in which to embed data.
        feature (`datasets.features.FeatureType`):
            Array features.
        token_per_repo_id:
            Forwarded unchanged to the features' `embed_storage` methods and to nested calls.

    Raises:
        `TypeError`: if the array type and the feature are not compatible, e.g.

            - if a field is missing

    Returns:
        array (`pyarrow.Array`): the array with embedded data
    """
    from .features import LargeList, List

    # Recursive helper carrying the token mapping along.
    embed = partial(embed_array_storage, token_per_repo_id=token_per_repo_id)

    # Always work on the storage of an extension array.
    if isinstance(array, pa.ExtensionArray):
        array = array.storage
    # Features that know how to embed their own data take over.
    if hasattr(feature, "embed_storage"):
        return feature.embed_storage(array, token_per_repo_id=token_per_repo_id)

    arrow_type = array.type
    if pa.types.is_struct(arrow_type):
        # feature must be a dict
        if isinstance(feature, dict):
            children = [embed(array.field(name), subfeature) for name, subfeature in feature.items()]
            return pa.StructArray.from_arrays(children, names=list(feature), mask=array.is_null())
    elif pa.types.is_list(arrow_type):
        # feature must be List(subfeature) without a fixed length
        # Merge offsets with the null bitmap to avoid the "Null bitmap with offsets slice not supported" ArrowNotImplementedError
        offsets = _combine_list_array_offsets_with_mask(array)
        if isinstance(feature, List) and feature.length == -1:
            return pa.ListArray.from_arrays(offsets, embed(array.values, feature.feature))
    elif pa.types.is_large_list(arrow_type):
        # feature must be LargeList(subfeature)
        # Merge offsets with the null bitmap to avoid the "Null bitmap with offsets slice not supported" ArrowNotImplementedError
        offsets = _combine_list_array_offsets_with_mask(array)
        return pa.LargeListArray.from_arrays(offsets, embed(array.values, feature.feature))
    elif pa.types.is_fixed_size_list(arrow_type):
        # feature must be List(subfeature) with a fixed length
        if isinstance(feature, List) and feature.length > -1:
            size = array.type.list_size
            first = array.offset * size
            last = (array.offset + len(array)) * size
            embedded_values = embed(array.values[first:last], feature.feature)
            return pa.FixedSizeListArray.from_arrays(embedded_values, feature.length, mask=array.is_null())

    # Nested features that did not match the array type above cannot be embedded.
    if isinstance(feature, (List, LargeList, dict)):
        raise TypeError(f"Couldn't embed array of type\n{_short_str(array.type)}\nwith\n{_short_str(feature)}")
    # Non-nested feature without `embed_storage`: nothing to embed.
    return array
|
| 2152 |
+
|
| 2153 |
+
|
| 2154 |
+
class CastError(ValueError):
    """When it's not possible to cast an Arrow table to a specific schema or set of features

    Args:
        *args:
            Positional arguments forwarded to `ValueError` (typically the error message).
        table_column_names (`list[str]`):
            Column names of the table that could not be cast.
        requested_column_names (`list[str]`):
            Column names of the requested schema or features.
    """

    def __init__(self, *args, table_column_names: list[str], requested_column_names: list[str]) -> None:
        super().__init__(*args)
        self.table_column_names = table_column_names
        self.requested_column_names = requested_column_names

    def __reduce__(self):
        # Fix unpickling: TypeError: __init__() missing 2 required keyword-only arguments: 'table_column_names' and 'requested_column_names'
        # The keyword-only arguments are bound with `partial`; the positional arguments are
        # passed along as `self.args` so the error message survives a pickle round-trip.
        return (
            partial(
                CastError,
                table_column_names=self.table_column_names,
                requested_column_names=self.requested_column_names,
            ),
            self.args,
        )

    def details(self):
        """Describe which columns are new and which are missing compared to the requested ones."""
        new_columns = set(self.table_column_names) - set(self.requested_column_names)
        missing_columns = set(self.requested_column_names) - set(self.table_column_names)
        if new_columns and missing_columns:
            return f"there are {len(new_columns)} new columns ({_short_str(new_columns)}) and {len(missing_columns)} missing columns ({_short_str(missing_columns)})."
        elif new_columns:
            return f"there are {len(new_columns)} new columns ({_short_str(new_columns)})"
        else:
            return f"there are {len(missing_columns)} missing columns ({_short_str(missing_columns)})"
|
| 2177 |
+
|
| 2178 |
+
|
| 2179 |
+
def cast_table_to_features(table: pa.Table, features: "Features"):
    """Cast a table to the arrow schema that corresponds to the requested features.

    Args:
        table (`pyarrow.Table`):
            PyArrow table to cast.
        features ([`Features`]):
            Target features.

    Raises:
        `CastError`: if the table's column names and the feature names are not the same set of names.

    Returns:
        table (`pyarrow.Table`): the casted table
    """
    column_names = table.column_names
    if sorted(column_names) != sorted(features):
        raise CastError(
            f"Couldn't cast\n{_short_str(table.schema)}\nto\n{_short_str(features)}\nbecause column names don't match",
            table_column_names=table.column_names,
            requested_column_names=list(features),
        )
    # Columns are emitted in the order of the features, not of the input table.
    casted_columns = []
    for name, feature in features.items():
        casted_columns.append(cast_array_to_feature(table[name], feature))
    return pa.Table.from_arrays(casted_columns, schema=features.arrow_schema)
|
| 2199 |
+
|
| 2200 |
+
|
| 2201 |
+
def cast_table_to_schema(table: pa.Table, schema: pa.Schema):
    """Cast a table to the arrow schema. Different from `cast_table_to_features`, this method can preserve nullability.

    Columns of the schema that are absent from the table are filled with nulls.

    Args:
        table (`pa.Table`):
            PyArrow table to cast.
        schema (`pa.Schema`):
            Target PyArrow schema.

    Raises:
        `CastError`: if the table has columns that are not part of the schema.

    Returns:
        `pa.Table`: the casted table
    """
    from .features import Features

    features = Features.from_arrow_schema(schema)
    present_names = set(table.column_names)
    if not present_names.issubset(schema.names):
        raise CastError(
            f"Couldn't cast\n{_short_str(table.schema)}\nto\n{_short_str(features)}\nbecause column names don't match",
            table_column_names=table.column_names,
            requested_column_names=list(features),
        )
    casted_columns = []
    for name, feature in features.items():
        if name in present_names:
            column = table[name]
        else:
            # Column missing from the table: use an all-null column of the schema's type.
            column = pa.array([None] * len(table), type=schema.field(name).type)
        casted_columns.append(cast_array_to_feature(column, feature))
    return pa.Table.from_arrays(casted_columns, schema=schema)
|
| 2231 |
+
|
| 2232 |
+
|
| 2233 |
+
def embed_table_storage(table: pa.Table, token_per_repo_id=None):
    """Embed external data into a table's storage.

    <Added version="2.4.0"/>

    Args:
        table (`pyarrow.Table`):
            PyArrow table in which to embed data.
        token_per_repo_id:
            Forwarded unchanged to `embed_array_storage`.

    Returns:
        table (`pyarrow.Table`): the table with embedded data
    """
    from .features.features import Features, require_storage_embed

    features = Features.from_arrow_schema(table.schema)
    embedded_columns = []
    for name, feature in features.items():
        column = table[name]
        # Only touch columns whose feature asks for embedding; others are passed through.
        if require_storage_embed(feature):
            column = embed_array_storage(column, feature, token_per_repo_id=token_per_repo_id)
        embedded_columns.append(column)
    return pa.Table.from_arrays(embedded_columns, schema=features.arrow_schema)
|
| 2255 |
+
|
| 2256 |
+
|
| 2257 |
+
def table_cast(table: pa.Table, schema: pa.Schema):
    """Improved version of `pa.Table.cast`.

    It supports casting to feature types stored in the schema metadata.

    Args:
        table (`pyarrow.Table`):
            PyArrow table to cast.
        schema (`pyarrow.Schema`):
            Target PyArrow schema.

    Returns:
        table (`pyarrow.Table`): the casted table
    """
    # Schemas differ: a real cast is needed.
    if table.schema != schema:
        return cast_table_to_schema(table, schema)
    # Same schema but different metadata: only swap the metadata.
    if table.schema.metadata != schema.metadata:
        return table.replace_schema_metadata(schema.metadata)
    # Nothing to change.
    return table
|
| 2277 |
+
|
| 2278 |
+
|
| 2279 |
+
def table_flatten(table: pa.Table):
    """Improved version of `pa.Table.flatten`.

    Like `pa.Table.flatten`, it performs a 1-step flatten of the struct columns into one column per struct field,
    but it also updates the metadata and leaves untouched the columns whose feature has a `flatten` method
    returning the feature itself.

    Args:
        table (`pa.Table`):
            PyArrow table to flatten.

    Returns:
        `Table`: the flattened table
    """
    from .features import Features

    features = Features.from_arrow_schema(table.schema)
    has_unflattenable_feature = any(
        hasattr(subfeature, "flatten") and subfeature.flatten() == subfeature for subfeature in features.values()
    )
    if not has_unflattenable_feature:
        flat_table = table.flatten()
    else:
        # Flatten column by column so the features that must stay whole are kept as is.
        columns = []
        names = []
        for field in table.schema:
            column = table.column(field.name)
            subfeature = features[field.name]
            if pa.types.is_struct(field.type) and (
                not hasattr(subfeature, "flatten") or subfeature.flatten() != subfeature
            ):
                columns.extend(column.flatten())
                names.extend([f"{field.name}.{subfield.name}" for subfield in field.type])
            else:
                columns.append(column)
                names.append(field.name)
        flat_table = pa.Table.from_arrays(
            columns,
            names=names,
        )
    # Preserve complex types in the metadata
    all_flat_features = features.flatten(max_depth=2)
    kept_features = Features({column_name: all_flat_features[column_name] for column_name in flat_table.column_names})
    return flat_table.replace_schema_metadata(kept_features.arrow_schema.metadata)
|
| 2319 |
+
|
| 2320 |
+
|
| 2321 |
+
def table_visitor(table: pa.Table, function: Callable[[pa.Array], None]):
    """Visit all arrays in a table and apply a function to them.

    Args:
        table (`pyarrow.Table`):
            PyArrow table to visit.
        function (`Callable[[pa.Array], None]`):
            Function to apply to each array. It is called with two positional arguments:
            the array and the feature associated with it.
    """
    from .features import Features, LargeList, List

    features = Features.from_arrow_schema(table.schema)

    def _visit(array, feature):
        # Chunked arrays are visited chunk by chunk.
        if isinstance(array, pa.ChunkedArray):
            for chunk in array.chunks:
                _visit(chunk, feature)
            return
        # Always work on the storage of an extension array.
        if isinstance(array, pa.ExtensionArray):
            array = array.storage
        function(array, feature)
        arrow_type = array.type
        if pa.types.is_struct(arrow_type) and not hasattr(feature, "cast_storage"):
            # Descend into the struct fields named by the feature.
            for name, subfeature in feature.items():
                _visit(array.field(name), subfeature)
        elif pa.types.is_list(arrow_type) and isinstance(feature, (LargeList, List)):
            _visit(array.values, feature.feature)

    for column_name, column_feature in features.items():
        _visit(table[column_name], column_feature)
|
| 2351 |
+
|
| 2352 |
+
|
| 2353 |
+
def table_iter(table: Table, batch_size: int, drop_last_batch=False) -> Iterator[pa.Table]:
    """Iterate over sub-tables of size `batch_size`.

    Args:
        table (`pyarrow.Table`):
            PyArrow table to iterate over.
        batch_size (`int`):
            Size of each sub-table to yield.
        drop_last_batch (`bool`, defaults to `False`):
            Drop the last batch if it is smaller than `batch_size`.
    """
    pending = []  # record batches accumulated for the next sub-table
    pending_rows = 0  # total number of rows held in `pending`
    for batch in table.to_reader(max_chunksize=batch_size):
        n_rows = len(batch)
        if n_rows == 0:
            continue
        total = pending_rows + n_rows
        if total < batch_size:
            # Not enough rows yet: keep accumulating.
            pending.append(batch)
            pending_rows = total
        elif total == batch_size:
            # Exactly full: emit and start over.
            pending.append(batch)
            yield pa.Table.from_batches(pending)
            pending = []
            pending_rows = 0
        else:
            # Too many rows: emit a full sub-table and carry the remainder over.
            room = batch_size - pending_rows
            pending.append(batch.slice(0, room))
            yield pa.Table.from_batches(pending)
            leftover = n_rows - room
            pending = [batch.slice(room, leftover)]
            pending_rows = leftover
    if pending and not drop_last_batch:
        yield pa.Table.from_batches(pending)
|
idna/__init__.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .core import (
|
| 2 |
+
IDNABidiError,
|
| 3 |
+
IDNAError,
|
| 4 |
+
InvalidCodepoint,
|
| 5 |
+
InvalidCodepointContext,
|
| 6 |
+
alabel,
|
| 7 |
+
check_bidi,
|
| 8 |
+
check_hyphen_ok,
|
| 9 |
+
check_initial_combiner,
|
| 10 |
+
check_label,
|
| 11 |
+
check_nfc,
|
| 12 |
+
decode,
|
| 13 |
+
encode,
|
| 14 |
+
ulabel,
|
| 15 |
+
uts46_remap,
|
| 16 |
+
valid_contextj,
|
| 17 |
+
valid_contexto,
|
| 18 |
+
valid_label_length,
|
| 19 |
+
valid_string_length,
|
| 20 |
+
)
|
| 21 |
+
from .intranges import intranges_contain
|
| 22 |
+
from .package_data import __version__
|
| 23 |
+
|
| 24 |
+
__all__ = [
|
| 25 |
+
"__version__",
|
| 26 |
+
"IDNABidiError",
|
| 27 |
+
"IDNAError",
|
| 28 |
+
"InvalidCodepoint",
|
| 29 |
+
"InvalidCodepointContext",
|
| 30 |
+
"alabel",
|
| 31 |
+
"check_bidi",
|
| 32 |
+
"check_hyphen_ok",
|
| 33 |
+
"check_initial_combiner",
|
| 34 |
+
"check_label",
|
| 35 |
+
"check_nfc",
|
| 36 |
+
"decode",
|
| 37 |
+
"encode",
|
| 38 |
+
"intranges_contain",
|
| 39 |
+
"ulabel",
|
| 40 |
+
"uts46_remap",
|
| 41 |
+
"valid_contextj",
|
| 42 |
+
"valid_contexto",
|
| 43 |
+
"valid_label_length",
|
| 44 |
+
"valid_string_length",
|
| 45 |
+
]
|
idna/codec.py
ADDED
|
@@ -0,0 +1,122 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import codecs
|
| 2 |
+
import re
|
| 3 |
+
from typing import Any, Optional, Tuple
|
| 4 |
+
|
| 5 |
+
from .core import IDNAError, alabel, decode, encode, ulabel
|
| 6 |
+
|
| 7 |
+
# Matches any one of the four label separators handled by the incremental
# codecs below: U+002E, U+3002, U+FF0E and U+FF61.
_unicode_dots_re = re.compile("[\u002e\u3002\uff0e\uff61]")
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class Codec(codecs.Codec):
    """Stateless codec that forwards to the module-level ``encode``/``decode``."""

    def encode(self, data: str, errors: str = "strict") -> Tuple[bytes, int]:
        """Encode ``data`` and return ``(encoded_bytes, characters_consumed)``.

        Only ``errors="strict"`` is accepted; anything else raises ``IDNAError``.
        Empty input short-circuits to ``(b"", 0)``.
        """
        if errors != "strict":
            raise IDNAError('Unsupported error handling "{}"'.format(errors))

        # Inside the method body ``encode`` resolves to the imported
        # module-level function, not to this method.
        return (encode(data), len(data)) if data else (b"", 0)

    def decode(self, data: bytes, errors: str = "strict") -> Tuple[str, int]:
        """Decode ``data`` and return ``(decoded_text, bytes_consumed)``.

        Only ``errors="strict"`` is accepted; anything else raises ``IDNAError``.
        Empty input short-circuits to ``("", 0)``.
        """
        if errors != "strict":
            raise IDNAError('Unsupported error handling "{}"'.format(errors))

        # As above, ``decode`` is the imported module-level function.
        return (decode(data), len(data)) if data else ("", 0)
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
class IncrementalEncoder(codecs.BufferedIncrementalEncoder):
    """Incremental encoder that converts complete labels and buffers a partial one."""

    def _buffer_encode(self, data: str, errors: str, final: bool) -> Tuple[bytes, int]:
        """Encode as many whole labels of ``data`` as possible.

        Returns ``(encoded_bytes, characters_consumed)``. Unless ``final`` is
        true, a last label not yet followed by a separator is left unconsumed
        so that the next call can complete it.
        """
        if errors != "strict":
            raise IDNAError('Unsupported error handling "{}"'.format(errors))

        if not data:
            return b"", 0

        pieces = _unicode_dots_re.split(data)
        suffix = b""
        if pieces:
            if not pieces[-1]:
                # Input ended on a separator: emit it and drop the empty tail.
                suffix = b"."
                pieces.pop()
            elif not final:
                # The last label may still grow; hold it back for the next call.
                pieces.pop()
                if pieces:
                    suffix = b"."

        encoded = []
        consumed = 0
        for piece in pieces:
            encoded.append(alabel(piece))
            # One separator is counted before every label once something
            # has already been consumed.
            consumed += len(piece) + (1 if consumed else 0)

        # Labels are always joined with U+002E.
        joined = b".".join(encoded) + suffix
        consumed += len(suffix)
        return joined, consumed
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
    """Incremental decoder that converts complete labels and buffers a partial one."""

    def _buffer_decode(self, data: Any, errors: str, final: bool) -> Tuple[str, int]:
        """Decode as many whole labels of ``data`` as possible.

        Returns ``(decoded_text, units_consumed)``. Non-``str`` input is first
        converted with ``str(data, "ascii")``. Unless ``final`` is true, a last
        label not yet followed by a separator is left unconsumed.
        """
        if errors != "strict":
            raise IDNAError('Unsupported error handling "{}"'.format(errors))

        if not data:
            return ("", 0)

        text = data if isinstance(data, str) else str(data, "ascii")

        pieces = _unicode_dots_re.split(text)
        suffix = ""
        if pieces:
            if not pieces[-1]:
                # Input ended on a separator: emit it and drop the empty tail.
                suffix = "."
                pieces.pop()
            elif not final:
                # The last label may still grow; hold it back for the next call.
                pieces.pop()
                if pieces:
                    suffix = "."

        decoded = []
        consumed = 0
        for piece in pieces:
            decoded.append(ulabel(piece))
            # One separator is counted before every label once something
            # has already been consumed.
            consumed += len(piece) + (1 if consumed else 0)

        joined = ".".join(decoded) + suffix
        consumed += len(suffix)
        return (joined, consumed)
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
class StreamWriter(Codec, codecs.StreamWriter):
    """Stream writer built purely from ``Codec`` and ``codecs.StreamWriter``."""
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
class StreamReader(Codec, codecs.StreamReader):
    """Stream reader built purely from ``Codec`` and ``codecs.StreamReader``."""
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
def search_function(name: str) -> Optional[codecs.CodecInfo]:
    """Codec lookup hook: describe the ``idna2008`` codec, ignore other names.

    Returns a ``codecs.CodecInfo`` when ``name`` is exactly ``"idna2008"`` and
    ``None`` for any other name.
    """
    if name == "idna2008":
        return codecs.CodecInfo(
            name=name,
            encode=Codec().encode,
            decode=Codec().decode,  # type: ignore
            incrementalencoder=IncrementalEncoder,
            incrementaldecoder=IncrementalDecoder,
            streamwriter=StreamWriter,
            streamreader=StreamReader,
        )
    return None
|
| 120 |
+
|
| 121 |
+
|
| 122 |
+
# Import-time side effect: install the lookup hook so that the "idna2008"
# codec name is resolved by search_function.
codecs.register(search_function)
|
idna/compat.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Any, Union
|
| 2 |
+
|
| 3 |
+
from .core import decode, encode
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
def ToASCII(label: str) -> bytes:
    """Compatibility wrapper: return ``encode(label)`` unchanged."""
    encoded = encode(label)
    return encoded
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def ToUnicode(label: Union[bytes, bytearray]) -> str:
    """Compatibility wrapper: return ``decode(label)`` unchanged."""
    decoded = decode(label)
    return decoded
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def nameprep(s: Any) -> None:
    """Always raise ``NotImplementedError``; the argument is ignored."""
    message = "IDNA 2008 does not utilise nameprep protocol"
    raise NotImplementedError(message)
|
idna/core.py
ADDED
|
@@ -0,0 +1,437 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import bisect
|
| 2 |
+
import re
|
| 3 |
+
import unicodedata
|
| 4 |
+
from typing import Optional, Union
|
| 5 |
+
|
| 6 |
+
from . import idnadata
|
| 7 |
+
from .intranges import intranges_contain
|
| 8 |
+
|
| 9 |
+
# Combining-class value that valid_contextj compares against when checking
# the character immediately before a joiner.
_virama_combining_class = 9
|
| 10 |
+
# Prefix that alabel() prepends to Punycode output and ulabel() strips
# before Punycode decoding.
_alabel_prefix = b"xn--"
|
| 11 |
+
# Matches any one of the four label separators accepted in non-strict mode
# by encode()/decode(): U+002E, U+3002, U+FF0E and U+FF61.
_unicode_dots_re = re.compile("[\u002e\u3002\uff0e\uff61]")
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class IDNAError(UnicodeError):
    """Root of the exception hierarchy raised for IDNA encoding problems."""


class IDNABidiError(IDNAError):
    """Raised when a label violates the bidirectional-text requirements."""


class InvalidCodepoint(IDNAError):
    """Raised when a label contains a disallowed or unallocated codepoint."""


class InvalidCodepointContext(IDNAError):
    """Raised when a codepoint is not valid in the context where it appears."""
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
def _combining_class(cp: int) -> int:
|
| 39 |
+
v = unicodedata.combining(chr(cp))
|
| 40 |
+
if v == 0:
|
| 41 |
+
if not unicodedata.name(chr(cp)):
|
| 42 |
+
raise ValueError("Unknown character in unicodedata")
|
| 43 |
+
return v
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def _is_script(cp: str, script: str) -> bool:
    """Tell whether character ``cp`` lies in the ranges stored for ``script``."""
    code_point = ord(cp)
    script_ranges = idnadata.scripts[script]
    return intranges_contain(code_point, script_ranges)
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def _punycode(s: str) -> bytes:
|
| 51 |
+
return s.encode("punycode")
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
def _unot(s: int) -> str:
|
| 55 |
+
return "U+{:04X}".format(s)
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
def valid_label_length(label: Union[bytes, str]) -> bool:
    """Return ``True`` when ``label`` has at most 63 items, ``False`` otherwise."""
    return len(label) <= 63
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
def valid_string_length(label: Union[bytes, str], trailing_dot: bool) -> bool:
    """Check the overall length limit: 254 with a trailing dot, 253 without."""
    limit = 254 if trailing_dot else 253
    return len(label) <= limit
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
def check_bidi(label: str, check_ltr: bool = False) -> bool:
    """Validate ``label`` against the six bidi rules implemented below.

    The rules are only applied when the label contains a character of class
    R, AL or AN, or when ``check_ltr`` is true; otherwise the function
    returns ``True`` after confirming every character has a known class.

    Returns:
        ``True`` when the label passes.

    Raises:
        IDNABidiError: On unknown directionality or any rule violation.
    """
    bidi_classes = [unicodedata.bidirectional(ch) for ch in label]

    for position, bidi_class in enumerate(bidi_classes, 1):
        if bidi_class == "":
            # The string likely comes from a newer version of Unicode.
            raise IDNABidiError("Unknown directionality in label {} at position {}".format(repr(label), position))

    contains_rtl = any(bidi_class in ("R", "AL", "AN") for bidi_class in bidi_classes)
    if not (contains_rtl or check_ltr):
        return True

    # Rule 1: the first character fixes the direction of the whole label.
    leading_class = unicodedata.bidirectional(label[0])
    if leading_class in ("R", "AL"):
        rtl = True
    elif leading_class == "L":
        rtl = False
    else:
        raise IDNABidiError("First codepoint in label {} must be directionality L, R or AL".format(repr(label)))

    if rtl:
        # Rules 2 and 3.
        permitted = ("R", "AL", "AN", "EN", "ES", "CS", "ET", "ON", "BN", "NSM")
        terminal = ("R", "AL", "EN", "AN")
        bad_class_message = "Invalid direction for codepoint at position {} in a right-to-left label"
    else:
        # Rules 5 and 6.
        permitted = ("L", "EN", "ES", "CS", "ET", "ON", "BN", "NSM")
        terminal = ("L", "EN")
        bad_class_message = "Invalid direction for codepoint at position {} in a left-to-right label"

    ends_validly = False
    numeral_class: Optional[str] = None
    for position, bidi_class in enumerate(bidi_classes, 1):
        if bidi_class not in permitted:
            raise IDNABidiError(bad_class_message.format(position))

        # Trailing NSM characters leave the verdict of the previous
        # character untouched.
        if bidi_class in terminal:
            ends_validly = True
        elif bidi_class != "NSM":
            ends_validly = False

        # Rule 4: a right-to-left label may use EN or AN numerals, not both.
        if rtl and bidi_class in ("AN", "EN"):
            if not numeral_class:
                numeral_class = bidi_class
            elif numeral_class != bidi_class:
                raise IDNABidiError("Can not mix numeral types in a right-to-left label")

    if not ends_validly:
        raise IDNABidiError("Label ends with illegal codepoint directionality")

    return True
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
def check_initial_combiner(label: str) -> bool:
    """Reject labels whose first character has a general category starting with ``M``.

    Returns ``True`` otherwise; raises ``IDNAError`` on rejection.
    """
    leading_category = unicodedata.category(label[0])
    if leading_category.startswith("M"):
        raise IDNAError("Label begins with an illegal combining character")
    return True
|
| 144 |
+
|
| 145 |
+
|
| 146 |
+
def check_hyphen_ok(label: str) -> bool:
    """Enforce the hyphen restrictions on ``label``.

    Raises ``IDNAError`` if positions 3 and 4 are both hyphens, or if the
    label starts or ends with a hyphen. Returns ``True`` otherwise.
    """
    third_and_fourth = label[2:4]
    if third_and_fourth == "--":
        raise IDNAError("Label has disallowed hyphens in 3rd and 4th position")
    if "-" in (label[0], label[-1]):
        raise IDNAError("Label must not start or end with a hyphen")
    return True
|
| 152 |
+
|
| 153 |
+
|
| 154 |
+
def check_nfc(label: str) -> None:
    """Raise ``IDNAError`` unless ``label`` equals its own NFC normalization."""
    normalized = unicodedata.normalize("NFC", label)
    if normalized != label:
        raise IDNAError("Label must be in Normalization Form C")
|
| 157 |
+
|
| 158 |
+
|
| 159 |
+
def valid_contextj(label: str, pos: int) -> bool:
    """Decide whether the joiner at ``label[pos]`` is acceptable in context.

    Only U+200C and U+200D are handled; any other character yields ``False``.
    Either joiner is accepted directly after a character whose combining
    class equals ``_virama_combining_class``. U+200C is additionally accepted
    when the nearest character before it that is not of joining type T has
    type L or D, and the nearest such character after it has type R or D.
    """
    joiner = ord(label[pos])
    if joiner not in (0x200C, 0x200D):
        return False

    if pos > 0 and _combining_class(ord(label[pos - 1])) == _virama_combining_class:
        return True

    if joiner == 0x200D:
        return False

    def nearest_joins(indices: range, accepted: tuple) -> bool:
        # Skip joining type T; the first other character settles the answer.
        for i in indices:
            joining_type = idnadata.joining_types.get(ord(label[i]))
            if joining_type == ord("T"):
                continue
            return joining_type in accepted
        return False

    if not nearest_joins(range(pos - 1, -1, -1), (ord("L"), ord("D"))):
        return False
    return nearest_joins(range(pos + 1, len(label)), (ord("R"), ord("D")))
|
| 201 |
+
|
| 202 |
+
|
| 203 |
+
def valid_contexto(label: str, pos: int, exception: bool = False) -> bool:
    """Decide whether the character at ``label[pos]`` is acceptable in context.

    Each handled codepoint (or range) has its own rule; characters with no
    rule here yield ``False``. The ``exception`` argument is accepted but not
    consulted.
    """
    code = ord(label[pos])

    if code == 0x00B7:
        # Must sit between two U+006C characters.
        if 0 < pos < len(label) - 1:
            return ord(label[pos - 1]) == 0x006C and ord(label[pos + 1]) == 0x006C
        return False

    if code == 0x0375:
        # Must be followed by a Greek character.
        if len(label) > 1 and pos < len(label) - 1:
            return _is_script(label[pos + 1], "Greek")
        return False

    if code in (0x05F3, 0x05F4):
        # Must be preceded by a Hebrew character.
        if pos > 0:
            return _is_script(label[pos - 1], "Hebrew")
        return False

    if code == 0x30FB:
        # Needs at least one Hiragana, Katakana or Han character in the label.
        return any(
            ch != "\u30fb" and (_is_script(ch, "Hiragana") or _is_script(ch, "Katakana") or _is_script(ch, "Han"))
            for ch in label
        )

    if 0x660 <= code <= 0x669:
        # Must not share a label with any digit in U+06F0..U+06F9.
        return not any(0x6F0 <= ord(ch) <= 0x06F9 for ch in label)

    if 0x6F0 <= code <= 0x6F9:
        # Must not share a label with any digit in U+0660..U+0669.
        return not any(0x660 <= ord(ch) <= 0x0669 for ch in label)

    return False
|
| 243 |
+
|
| 244 |
+
|
| 245 |
+
def check_label(label: Union[str, bytes, bytearray]) -> None:
    """Validate one label; return ``None`` on success, raise on any violation.

    ``bytes``/``bytearray`` input is decoded as UTF-8 first. The label must be
    non-empty and pass ``check_nfc``, ``check_hyphen_ok`` and
    ``check_initial_combiner``. Every character must then be PVALID, or be a
    CONTEXTJ/CONTEXTO character that passes its contextual rule. Finally the
    label must pass ``check_bidi``.

    Raises:
        InvalidCodepoint: A character is in none of the permitted classes.
        InvalidCodepointContext: A CONTEXTJ or CONTEXTO character fails its
            contextual rule.
        IDNAError: The label is empty, a helper check fails, or the character
            next to a joiner is unknown to ``unicodedata``.
    """
    if isinstance(label, (bytes, bytearray)):
        label = label.decode("utf-8")
    if len(label) == 0:
        raise IDNAError("Empty Label")

    check_nfc(label)
    check_hyphen_ok(label)
    check_initial_combiner(label)

    for pos, cp in enumerate(label):
        cp_value = ord(cp)
        if intranges_contain(cp_value, idnadata.codepoint_classes["PVALID"]):
            continue
        elif intranges_contain(cp_value, idnadata.codepoint_classes["CONTEXTJ"]):
            # Only the lookup is guarded. InvalidCodepointContext is itself a
            # ValueError (IDNAError -> UnicodeError -> ValueError), so raising
            # it inside this ``try`` would be caught below and reported with
            # the unrelated "Unknown codepoint" message.
            try:
                joiner_allowed = valid_contextj(label, pos)
            except ValueError as exc:
                raise IDNAError(
                    "Unknown codepoint adjacent to joiner {} at position {} in {}".format(
                        _unot(cp_value), pos + 1, repr(label)
                    )
                ) from exc
            if not joiner_allowed:
                raise InvalidCodepointContext(
                    "Joiner {} not allowed at position {} in {}".format(_unot(cp_value), pos + 1, repr(label))
                )
        elif intranges_contain(cp_value, idnadata.codepoint_classes["CONTEXTO"]):
            if not valid_contexto(label, pos):
                raise InvalidCodepointContext(
                    "Codepoint {} not allowed at position {} in {}".format(_unot(cp_value), pos + 1, repr(label))
                )
        else:
            raise InvalidCodepoint(
                "Codepoint {} at position {} of {} not allowed".format(_unot(cp_value), pos + 1, repr(label))
            )

    check_bidi(label)
|
| 282 |
+
|
| 283 |
+
|
| 284 |
+
def alabel(label: str) -> bytes:
    """Return the ASCII (A-label) form of ``label`` as bytes.

    A label that already encodes to ASCII is validated through ``ulabel`` and
    returned as its ASCII bytes. Otherwise it is checked with ``check_label``
    and returned as ``_alabel_prefix`` plus its Punycode encoding. Raises
    ``IDNAError("Label too long")`` when the result fails
    ``valid_label_length``.
    """
    try:
        ascii_form = label.encode("ascii")
        ulabel(ascii_form)
        if valid_label_length(ascii_form):
            return ascii_form
        raise IDNAError("Label too long")
    except UnicodeEncodeError:
        # Not pure ASCII: fall through to the Punycode path.
        pass

    check_label(label)
    prefixed = _alabel_prefix + _punycode(label)

    if valid_label_length(prefixed):
        return prefixed
    raise IDNAError("Label too long")
|
| 301 |
+
|
| 302 |
+
|
| 303 |
+
def ulabel(label: Union[str, bytes, bytearray]) -> str:
    """Return the Unicode (U-label) form of ``label``.

    A ``str`` that cannot be encoded as ASCII is validated with
    ``check_label`` and returned unchanged. ASCII input is lower-cased; if it
    does not start with ``_alabel_prefix`` it is validated and returned as
    text, otherwise the remainder is Punycode-decoded, validated and returned.

    Raises:
        IDNAError: Empty or hyphen-terminated payload after the prefix,
            undecodable Punycode, or a failed ``check_label``.
    """
    if isinstance(label, (bytes, bytearray)):
        raw = bytes(label)
    else:
        try:
            raw = label.encode("ascii")
        except UnicodeEncodeError:
            check_label(label)
            return label

    raw = raw.lower()
    if not raw.startswith(_alabel_prefix):
        check_label(raw)
        return raw.decode("ascii")

    payload = raw[len(_alabel_prefix) :]
    if not payload:
        raise IDNAError("Malformed A-label, no Punycode eligible content found")
    if payload.decode("ascii")[-1] == "-":
        raise IDNAError("A-label must not end with a hyphen")

    try:
        decoded = payload.decode("punycode")
    except UnicodeError:
        raise IDNAError("Invalid A-label")
    check_label(decoded)
    return decoded
|
| 330 |
+
|
| 331 |
+
|
| 332 |
+
def uts46_remap(domain: str, std3_rules: bool = True, transitional: bool = False) -> str:
    """Re-map the characters in the string according to UTS46 processing.

    Each character is looked up in ``uts46data`` and, depending on its status
    and the two flags, is kept, replaced or dropped (status ``"I"``); any
    other outcome raises ``InvalidCodepoint``. The result is NFC-normalized.
    """
    from .uts46data import uts46data

    pieces = []

    for pos, char in enumerate(domain):
        code_point = ord(char)
        try:
            # The first 256 rows are indexed directly; higher codepoints
            # are located by bisection.
            if code_point < 256:
                row_index = code_point
            else:
                row_index = bisect.bisect_left(uts46data, (code_point, "Z")) - 1
            row = uts46data[row_index]
            status = row[1]
            replacement: Optional[str] = row[2] if len(row) == 3 else None

            keep_as_is = (
                status == "V"
                or (status == "D" and not transitional)
                or (status == "3" and not std3_rules and replacement is None)
            )
            use_mapping = replacement is not None and (
                status == "M" or (status == "3" and not std3_rules) or (status == "D" and transitional)
            )

            if keep_as_is:
                pieces.append(char)
            elif use_mapping:
                pieces.append(replacement)
            elif status != "I":
                # Funnel disallowed characters into the handler below.
                raise IndexError()
        except IndexError:
            raise InvalidCodepoint(
                "Codepoint {} not allowed at position {} in {}".format(_unot(code_point), pos + 1, repr(domain))
            )

    return unicodedata.normalize("NFC", "".join(pieces))
|
| 364 |
+
|
| 365 |
+
|
| 366 |
+
def encode(
    s: Union[str, bytes, bytearray],
    strict: bool = False,
    uts46: bool = False,
    std3_rules: bool = False,
    transitional: bool = False,
) -> bytes:
    """Encode a domain name to its ASCII form, label by label.

    Args:
        s: The domain. Non-``str`` input is first converted with
            ``str(s, "ascii")``.
        strict: Split labels only on ``"."`` instead of on every separator
            matched by ``_unicode_dots_re``.
        uts46: Run ``uts46_remap`` on the input before splitting.
        std3_rules: Forwarded to ``uts46_remap``.
        transitional: Forwarded to ``uts46_remap``.

    Returns:
        The ``alabel`` of each label joined with ``b"."``; a single trailing
        separator in the input is preserved.

    Raises:
        IDNAError: Non-ASCII bytes input, empty domain, empty label, or a
            result that fails ``valid_string_length``.
    """
    if not isinstance(s, str):
        try:
            s = str(s, "ascii")
        except UnicodeDecodeError:
            raise IDNAError("should pass a unicode string to the function rather than a byte string.")
    if uts46:
        s = uts46_remap(s, std3_rules, transitional)

    labels = s.split(".") if strict else _unicode_dots_re.split(s)
    if not labels or labels == [""]:
        raise IDNAError("Empty domain")

    trailing_dot = labels[-1] == ""
    if trailing_dot:
        del labels[-1]

    encoded_labels = []
    for label in labels:
        encoded_label = alabel(label)
        if not encoded_label:
            raise IDNAError("Empty label")
        encoded_labels.append(encoded_label)
    if trailing_dot:
        # An empty final element makes the join emit the trailing dot.
        encoded_labels.append(b"")

    domain = b".".join(encoded_labels)
    if not valid_string_length(domain, trailing_dot):
        raise IDNAError("Domain too long")
    return domain
|
| 403 |
+
|
| 404 |
+
|
| 405 |
+
def decode(
    s: Union[str, bytes, bytearray],
    strict: bool = False,
    uts46: bool = False,
    std3_rules: bool = False,
) -> str:
    """Decode a domain name to its Unicode form, label by label.

    Args:
        s: The domain. Non-``str`` input is first converted with
            ``str(s, "ascii")``.
        strict: Split labels only on ``"."`` instead of on every separator
            matched by ``_unicode_dots_re``.
        uts46: Run ``uts46_remap`` (non-transitional) before splitting.
        std3_rules: Forwarded to ``uts46_remap``.

    Returns:
        The ``ulabel`` of each label joined with ``"."``; a single trailing
        separator in the input is preserved.

    Raises:
        IDNAError: Non-ASCII bytes input, empty domain, or empty label.
    """
    try:
        if not isinstance(s, str):
            s = str(s, "ascii")
    except UnicodeDecodeError:
        raise IDNAError("Invalid ASCII in A-label")
    if uts46:
        s = uts46_remap(s, std3_rules, False)

    labels = s.split(".") if strict else _unicode_dots_re.split(s)
    if not labels or labels == [""]:
        raise IDNAError("Empty domain")

    trailing_dot = not labels[-1]
    if trailing_dot:
        del labels[-1]

    decoded_labels = []
    for label in labels:
        decoded_label = ulabel(label)
        if not decoded_label:
            raise IDNAError("Empty label")
        decoded_labels.append(decoded_label)
    if trailing_dot:
        # An empty final element makes the join emit the trailing dot.
        decoded_labels.append("")
    return ".".join(decoded_labels)
|
idna/idnadata.py
ADDED
|
@@ -0,0 +1,4309 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# This file is automatically generated by tools/idna-data
|
| 2 |
+
|
| 3 |
+
__version__ = "16.0.0"
|
| 4 |
+
|
| 5 |
+
scripts = {
|
| 6 |
+
"Greek": (
|
| 7 |
+
0x37000000374,
|
| 8 |
+
0x37500000378,
|
| 9 |
+
0x37A0000037E,
|
| 10 |
+
0x37F00000380,
|
| 11 |
+
0x38400000385,
|
| 12 |
+
0x38600000387,
|
| 13 |
+
0x3880000038B,
|
| 14 |
+
0x38C0000038D,
|
| 15 |
+
0x38E000003A2,
|
| 16 |
+
0x3A3000003E2,
|
| 17 |
+
0x3F000000400,
|
| 18 |
+
0x1D2600001D2B,
|
| 19 |
+
0x1D5D00001D62,
|
| 20 |
+
0x1D6600001D6B,
|
| 21 |
+
0x1DBF00001DC0,
|
| 22 |
+
0x1F0000001F16,
|
| 23 |
+
0x1F1800001F1E,
|
| 24 |
+
0x1F2000001F46,
|
| 25 |
+
0x1F4800001F4E,
|
| 26 |
+
0x1F5000001F58,
|
| 27 |
+
0x1F5900001F5A,
|
| 28 |
+
0x1F5B00001F5C,
|
| 29 |
+
0x1F5D00001F5E,
|
| 30 |
+
0x1F5F00001F7E,
|
| 31 |
+
0x1F8000001FB5,
|
| 32 |
+
0x1FB600001FC5,
|
| 33 |
+
0x1FC600001FD4,
|
| 34 |
+
0x1FD600001FDC,
|
| 35 |
+
0x1FDD00001FF0,
|
| 36 |
+
0x1FF200001FF5,
|
| 37 |
+
0x1FF600001FFF,
|
| 38 |
+
0x212600002127,
|
| 39 |
+
0xAB650000AB66,
|
| 40 |
+
0x101400001018F,
|
| 41 |
+
0x101A0000101A1,
|
| 42 |
+
0x1D2000001D246,
|
| 43 |
+
),
|
| 44 |
+
"Han": (
|
| 45 |
+
0x2E8000002E9A,
|
| 46 |
+
0x2E9B00002EF4,
|
| 47 |
+
0x2F0000002FD6,
|
| 48 |
+
0x300500003006,
|
| 49 |
+
0x300700003008,
|
| 50 |
+
0x30210000302A,
|
| 51 |
+
0x30380000303C,
|
| 52 |
+
0x340000004DC0,
|
| 53 |
+
0x4E000000A000,
|
| 54 |
+
0xF9000000FA6E,
|
| 55 |
+
0xFA700000FADA,
|
| 56 |
+
0x16FE200016FE4,
|
| 57 |
+
0x16FF000016FF2,
|
| 58 |
+
0x200000002A6E0,
|
| 59 |
+
0x2A7000002B73A,
|
| 60 |
+
0x2B7400002B81E,
|
| 61 |
+
0x2B8200002CEA2,
|
| 62 |
+
0x2CEB00002EBE1,
|
| 63 |
+
0x2EBF00002EE5E,
|
| 64 |
+
0x2F8000002FA1E,
|
| 65 |
+
0x300000003134B,
|
| 66 |
+
0x31350000323B0,
|
| 67 |
+
),
|
| 68 |
+
"Hebrew": (
|
| 69 |
+
0x591000005C8,
|
| 70 |
+
0x5D0000005EB,
|
| 71 |
+
0x5EF000005F5,
|
| 72 |
+
0xFB1D0000FB37,
|
| 73 |
+
0xFB380000FB3D,
|
| 74 |
+
0xFB3E0000FB3F,
|
| 75 |
+
0xFB400000FB42,
|
| 76 |
+
0xFB430000FB45,
|
| 77 |
+
0xFB460000FB50,
|
| 78 |
+
),
|
| 79 |
+
"Hiragana": (
|
| 80 |
+
0x304100003097,
|
| 81 |
+
0x309D000030A0,
|
| 82 |
+
0x1B0010001B120,
|
| 83 |
+
0x1B1320001B133,
|
| 84 |
+
0x1B1500001B153,
|
| 85 |
+
0x1F2000001F201,
|
| 86 |
+
),
|
| 87 |
+
"Katakana": (
|
| 88 |
+
0x30A1000030FB,
|
| 89 |
+
0x30FD00003100,
|
| 90 |
+
0x31F000003200,
|
| 91 |
+
0x32D0000032FF,
|
| 92 |
+
0x330000003358,
|
| 93 |
+
0xFF660000FF70,
|
| 94 |
+
0xFF710000FF9E,
|
| 95 |
+
0x1AFF00001AFF4,
|
| 96 |
+
0x1AFF50001AFFC,
|
| 97 |
+
0x1AFFD0001AFFF,
|
| 98 |
+
0x1B0000001B001,
|
| 99 |
+
0x1B1200001B123,
|
| 100 |
+
0x1B1550001B156,
|
| 101 |
+
0x1B1640001B168,
|
| 102 |
+
),
|
| 103 |
+
}
|
| 104 |
+
joining_types = {
|
| 105 |
+
0xAD: 84,
|
| 106 |
+
0x300: 84,
|
| 107 |
+
0x301: 84,
|
| 108 |
+
0x302: 84,
|
| 109 |
+
0x303: 84,
|
| 110 |
+
0x304: 84,
|
| 111 |
+
0x305: 84,
|
| 112 |
+
0x306: 84,
|
| 113 |
+
0x307: 84,
|
| 114 |
+
0x308: 84,
|
| 115 |
+
0x309: 84,
|
| 116 |
+
0x30A: 84,
|
| 117 |
+
0x30B: 84,
|
| 118 |
+
0x30C: 84,
|
| 119 |
+
0x30D: 84,
|
| 120 |
+
0x30E: 84,
|
| 121 |
+
0x30F: 84,
|
| 122 |
+
0x310: 84,
|
| 123 |
+
0x311: 84,
|
| 124 |
+
0x312: 84,
|
| 125 |
+
0x313: 84,
|
| 126 |
+
0x314: 84,
|
| 127 |
+
0x315: 84,
|
| 128 |
+
0x316: 84,
|
| 129 |
+
0x317: 84,
|
| 130 |
+
0x318: 84,
|
| 131 |
+
0x319: 84,
|
| 132 |
+
0x31A: 84,
|
| 133 |
+
0x31B: 84,
|
| 134 |
+
0x31C: 84,
|
| 135 |
+
0x31D: 84,
|
| 136 |
+
0x31E: 84,
|
| 137 |
+
0x31F: 84,
|
| 138 |
+
0x320: 84,
|
| 139 |
+
0x321: 84,
|
| 140 |
+
0x322: 84,
|
| 141 |
+
0x323: 84,
|
| 142 |
+
0x324: 84,
|
| 143 |
+
0x325: 84,
|
| 144 |
+
0x326: 84,
|
| 145 |
+
0x327: 84,
|
| 146 |
+
0x328: 84,
|
| 147 |
+
0x329: 84,
|
| 148 |
+
0x32A: 84,
|
| 149 |
+
0x32B: 84,
|
| 150 |
+
0x32C: 84,
|
| 151 |
+
0x32D: 84,
|
| 152 |
+
0x32E: 84,
|
| 153 |
+
0x32F: 84,
|
| 154 |
+
0x330: 84,
|
| 155 |
+
0x331: 84,
|
| 156 |
+
0x332: 84,
|
| 157 |
+
0x333: 84,
|
| 158 |
+
0x334: 84,
|
| 159 |
+
0x335: 84,
|
| 160 |
+
0x336: 84,
|
| 161 |
+
0x337: 84,
|
| 162 |
+
0x338: 84,
|
| 163 |
+
0x339: 84,
|
| 164 |
+
0x33A: 84,
|
| 165 |
+
0x33B: 84,
|
| 166 |
+
0x33C: 84,
|
| 167 |
+
0x33D: 84,
|
| 168 |
+
0x33E: 84,
|
| 169 |
+
0x33F: 84,
|
| 170 |
+
0x340: 84,
|
| 171 |
+
0x341: 84,
|
| 172 |
+
0x342: 84,
|
| 173 |
+
0x343: 84,
|
| 174 |
+
0x344: 84,
|
| 175 |
+
0x345: 84,
|
| 176 |
+
0x346: 84,
|
| 177 |
+
0x347: 84,
|
| 178 |
+
0x348: 84,
|
| 179 |
+
0x349: 84,
|
| 180 |
+
0x34A: 84,
|
| 181 |
+
0x34B: 84,
|
| 182 |
+
0x34C: 84,
|
| 183 |
+
0x34D: 84,
|
| 184 |
+
0x34E: 84,
|
| 185 |
+
0x34F: 84,
|
| 186 |
+
0x350: 84,
|
| 187 |
+
0x351: 84,
|
| 188 |
+
0x352: 84,
|
| 189 |
+
0x353: 84,
|
| 190 |
+
0x354: 84,
|
| 191 |
+
0x355: 84,
|
| 192 |
+
0x356: 84,
|
| 193 |
+
0x357: 84,
|
| 194 |
+
0x358: 84,
|
| 195 |
+
0x359: 84,
|
| 196 |
+
0x35A: 84,
|
| 197 |
+
0x35B: 84,
|
| 198 |
+
0x35C: 84,
|
| 199 |
+
0x35D: 84,
|
| 200 |
+
0x35E: 84,
|
| 201 |
+
0x35F: 84,
|
| 202 |
+
0x360: 84,
|
| 203 |
+
0x361: 84,
|
| 204 |
+
0x362: 84,
|
| 205 |
+
0x363: 84,
|
| 206 |
+
0x364: 84,
|
| 207 |
+
0x365: 84,
|
| 208 |
+
0x366: 84,
|
| 209 |
+
0x367: 84,
|
| 210 |
+
0x368: 84,
|
| 211 |
+
0x369: 84,
|
| 212 |
+
0x36A: 84,
|
| 213 |
+
0x36B: 84,
|
| 214 |
+
0x36C: 84,
|
| 215 |
+
0x36D: 84,
|
| 216 |
+
0x36E: 84,
|
| 217 |
+
0x36F: 84,
|
| 218 |
+
0x483: 84,
|
| 219 |
+
0x484: 84,
|
| 220 |
+
0x485: 84,
|
| 221 |
+
0x486: 84,
|
| 222 |
+
0x487: 84,
|
| 223 |
+
0x488: 84,
|
| 224 |
+
0x489: 84,
|
| 225 |
+
0x591: 84,
|
| 226 |
+
0x592: 84,
|
| 227 |
+
0x593: 84,
|
| 228 |
+
0x594: 84,
|
| 229 |
+
0x595: 84,
|
| 230 |
+
0x596: 84,
|
| 231 |
+
0x597: 84,
|
| 232 |
+
0x598: 84,
|
| 233 |
+
0x599: 84,
|
| 234 |
+
0x59A: 84,
|
| 235 |
+
0x59B: 84,
|
| 236 |
+
0x59C: 84,
|
| 237 |
+
0x59D: 84,
|
| 238 |
+
0x59E: 84,
|
| 239 |
+
0x59F: 84,
|
| 240 |
+
0x5A0: 84,
|
| 241 |
+
0x5A1: 84,
|
| 242 |
+
0x5A2: 84,
|
| 243 |
+
0x5A3: 84,
|
| 244 |
+
0x5A4: 84,
|
| 245 |
+
0x5A5: 84,
|
| 246 |
+
0x5A6: 84,
|
| 247 |
+
0x5A7: 84,
|
| 248 |
+
0x5A8: 84,
|
| 249 |
+
0x5A9: 84,
|
| 250 |
+
0x5AA: 84,
|
| 251 |
+
0x5AB: 84,
|
| 252 |
+
0x5AC: 84,
|
| 253 |
+
0x5AD: 84,
|
| 254 |
+
0x5AE: 84,
|
| 255 |
+
0x5AF: 84,
|
| 256 |
+
0x5B0: 84,
|
| 257 |
+
0x5B1: 84,
|
| 258 |
+
0x5B2: 84,
|
| 259 |
+
0x5B3: 84,
|
| 260 |
+
0x5B4: 84,
|
| 261 |
+
0x5B5: 84,
|
| 262 |
+
0x5B6: 84,
|
| 263 |
+
0x5B7: 84,
|
| 264 |
+
0x5B8: 84,
|
| 265 |
+
0x5B9: 84,
|
| 266 |
+
0x5BA: 84,
|
| 267 |
+
0x5BB: 84,
|
| 268 |
+
0x5BC: 84,
|
| 269 |
+
0x5BD: 84,
|
| 270 |
+
0x5BF: 84,
|
| 271 |
+
0x5C1: 84,
|
| 272 |
+
0x5C2: 84,
|
| 273 |
+
0x5C4: 84,
|
| 274 |
+
0x5C5: 84,
|
| 275 |
+
0x5C7: 84,
|
| 276 |
+
0x610: 84,
|
| 277 |
+
0x611: 84,
|
| 278 |
+
0x612: 84,
|
| 279 |
+
0x613: 84,
|
| 280 |
+
0x614: 84,
|
| 281 |
+
0x615: 84,
|
| 282 |
+
0x616: 84,
|
| 283 |
+
0x617: 84,
|
| 284 |
+
0x618: 84,
|
| 285 |
+
0x619: 84,
|
| 286 |
+
0x61A: 84,
|
| 287 |
+
0x61C: 84,
|
| 288 |
+
0x620: 68,
|
| 289 |
+
0x622: 82,
|
| 290 |
+
0x623: 82,
|
| 291 |
+
0x624: 82,
|
| 292 |
+
0x625: 82,
|
| 293 |
+
0x626: 68,
|
| 294 |
+
0x627: 82,
|
| 295 |
+
0x628: 68,
|
| 296 |
+
0x629: 82,
|
| 297 |
+
0x62A: 68,
|
| 298 |
+
0x62B: 68,
|
| 299 |
+
0x62C: 68,
|
| 300 |
+
0x62D: 68,
|
| 301 |
+
0x62E: 68,
|
| 302 |
+
0x62F: 82,
|
| 303 |
+
0x630: 82,
|
| 304 |
+
0x631: 82,
|
| 305 |
+
0x632: 82,
|
| 306 |
+
0x633: 68,
|
| 307 |
+
0x634: 68,
|
| 308 |
+
0x635: 68,
|
| 309 |
+
0x636: 68,
|
| 310 |
+
0x637: 68,
|
| 311 |
+
0x638: 68,
|
| 312 |
+
0x639: 68,
|
| 313 |
+
0x63A: 68,
|
| 314 |
+
0x63B: 68,
|
| 315 |
+
0x63C: 68,
|
| 316 |
+
0x63D: 68,
|
| 317 |
+
0x63E: 68,
|
| 318 |
+
0x63F: 68,
|
| 319 |
+
0x640: 67,
|
| 320 |
+
0x641: 68,
|
| 321 |
+
0x642: 68,
|
| 322 |
+
0x643: 68,
|
| 323 |
+
0x644: 68,
|
| 324 |
+
0x645: 68,
|
| 325 |
+
0x646: 68,
|
| 326 |
+
0x647: 68,
|
| 327 |
+
0x648: 82,
|
| 328 |
+
0x649: 68,
|
| 329 |
+
0x64A: 68,
|
| 330 |
+
0x64B: 84,
|
| 331 |
+
0x64C: 84,
|
| 332 |
+
0x64D: 84,
|
| 333 |
+
0x64E: 84,
|
| 334 |
+
0x64F: 84,
|
| 335 |
+
0x650: 84,
|
| 336 |
+
0x651: 84,
|
| 337 |
+
0x652: 84,
|
| 338 |
+
0x653: 84,
|
| 339 |
+
0x654: 84,
|
| 340 |
+
0x655: 84,
|
| 341 |
+
0x656: 84,
|
| 342 |
+
0x657: 84,
|
| 343 |
+
0x658: 84,
|
| 344 |
+
0x659: 84,
|
| 345 |
+
0x65A: 84,
|
| 346 |
+
0x65B: 84,
|
| 347 |
+
0x65C: 84,
|
| 348 |
+
0x65D: 84,
|
| 349 |
+
0x65E: 84,
|
| 350 |
+
0x65F: 84,
|
| 351 |
+
0x66E: 68,
|
| 352 |
+
0x66F: 68,
|
| 353 |
+
0x670: 84,
|
| 354 |
+
0x671: 82,
|
| 355 |
+
0x672: 82,
|
| 356 |
+
0x673: 82,
|
| 357 |
+
0x675: 82,
|
| 358 |
+
0x676: 82,
|
| 359 |
+
0x677: 82,
|
| 360 |
+
0x678: 68,
|
| 361 |
+
0x679: 68,
|
| 362 |
+
0x67A: 68,
|
| 363 |
+
0x67B: 68,
|
| 364 |
+
0x67C: 68,
|
| 365 |
+
0x67D: 68,
|
| 366 |
+
0x67E: 68,
|
| 367 |
+
0x67F: 68,
|
| 368 |
+
0x680: 68,
|
| 369 |
+
0x681: 68,
|
| 370 |
+
0x682: 68,
|
| 371 |
+
0x683: 68,
|
| 372 |
+
0x684: 68,
|
| 373 |
+
0x685: 68,
|
| 374 |
+
0x686: 68,
|
| 375 |
+
0x687: 68,
|
| 376 |
+
0x688: 82,
|
| 377 |
+
0x689: 82,
|
| 378 |
+
0x68A: 82,
|
| 379 |
+
0x68B: 82,
|
| 380 |
+
0x68C: 82,
|
| 381 |
+
0x68D: 82,
|
| 382 |
+
0x68E: 82,
|
| 383 |
+
0x68F: 82,
|
| 384 |
+
0x690: 82,
|
| 385 |
+
0x691: 82,
|
| 386 |
+
0x692: 82,
|
| 387 |
+
0x693: 82,
|
| 388 |
+
0x694: 82,
|
| 389 |
+
0x695: 82,
|
| 390 |
+
0x696: 82,
|
| 391 |
+
0x697: 82,
|
| 392 |
+
0x698: 82,
|
| 393 |
+
0x699: 82,
|
| 394 |
+
0x69A: 68,
|
| 395 |
+
0x69B: 68,
|
| 396 |
+
0x69C: 68,
|
| 397 |
+
0x69D: 68,
|
| 398 |
+
0x69E: 68,
|
| 399 |
+
0x69F: 68,
|
| 400 |
+
0x6A0: 68,
|
| 401 |
+
0x6A1: 68,
|
| 402 |
+
0x6A2: 68,
|
| 403 |
+
0x6A3: 68,
|
| 404 |
+
0x6A4: 68,
|
| 405 |
+
0x6A5: 68,
|
| 406 |
+
0x6A6: 68,
|
| 407 |
+
0x6A7: 68,
|
| 408 |
+
0x6A8: 68,
|
| 409 |
+
0x6A9: 68,
|
| 410 |
+
0x6AA: 68,
|
| 411 |
+
0x6AB: 68,
|
| 412 |
+
0x6AC: 68,
|
| 413 |
+
0x6AD: 68,
|
| 414 |
+
0x6AE: 68,
|
| 415 |
+
0x6AF: 68,
|
| 416 |
+
0x6B0: 68,
|
| 417 |
+
0x6B1: 68,
|
| 418 |
+
0x6B2: 68,
|
| 419 |
+
0x6B3: 68,
|
| 420 |
+
0x6B4: 68,
|
| 421 |
+
0x6B5: 68,
|
| 422 |
+
0x6B6: 68,
|
| 423 |
+
0x6B7: 68,
|
| 424 |
+
0x6B8: 68,
|
| 425 |
+
0x6B9: 68,
|
| 426 |
+
0x6BA: 68,
|
| 427 |
+
0x6BB: 68,
|
| 428 |
+
0x6BC: 68,
|
| 429 |
+
0x6BD: 68,
|
| 430 |
+
0x6BE: 68,
|
| 431 |
+
0x6BF: 68,
|
| 432 |
+
0x6C0: 82,
|
| 433 |
+
0x6C1: 68,
|
| 434 |
+
0x6C2: 68,
|
| 435 |
+
0x6C3: 82,
|
| 436 |
+
0x6C4: 82,
|
| 437 |
+
0x6C5: 82,
|
| 438 |
+
0x6C6: 82,
|
| 439 |
+
0x6C7: 82,
|
| 440 |
+
0x6C8: 82,
|
| 441 |
+
0x6C9: 82,
|
| 442 |
+
0x6CA: 82,
|
| 443 |
+
0x6CB: 82,
|
| 444 |
+
0x6CC: 68,
|
| 445 |
+
0x6CD: 82,
|
| 446 |
+
0x6CE: 68,
|
| 447 |
+
0x6CF: 82,
|
| 448 |
+
0x6D0: 68,
|
| 449 |
+
0x6D1: 68,
|
| 450 |
+
0x6D2: 82,
|
| 451 |
+
0x6D3: 82,
|
| 452 |
+
0x6D5: 82,
|
| 453 |
+
0x6D6: 84,
|
| 454 |
+
0x6D7: 84,
|
| 455 |
+
0x6D8: 84,
|
| 456 |
+
0x6D9: 84,
|
| 457 |
+
0x6DA: 84,
|
| 458 |
+
0x6DB: 84,
|
| 459 |
+
0x6DC: 84,
|
| 460 |
+
0x6DF: 84,
|
| 461 |
+
0x6E0: 84,
|
| 462 |
+
0x6E1: 84,
|
| 463 |
+
0x6E2: 84,
|
| 464 |
+
0x6E3: 84,
|
| 465 |
+
0x6E4: 84,
|
| 466 |
+
0x6E7: 84,
|
| 467 |
+
0x6E8: 84,
|
| 468 |
+
0x6EA: 84,
|
| 469 |
+
0x6EB: 84,
|
| 470 |
+
0x6EC: 84,
|
| 471 |
+
0x6ED: 84,
|
| 472 |
+
0x6EE: 82,
|
| 473 |
+
0x6EF: 82,
|
| 474 |
+
0x6FA: 68,
|
| 475 |
+
0x6FB: 68,
|
| 476 |
+
0x6FC: 68,
|
| 477 |
+
0x6FF: 68,
|
| 478 |
+
0x70F: 84,
|
| 479 |
+
0x710: 82,
|
| 480 |
+
0x711: 84,
|
| 481 |
+
0x712: 68,
|
| 482 |
+
0x713: 68,
|
| 483 |
+
0x714: 68,
|
| 484 |
+
0x715: 82,
|
| 485 |
+
0x716: 82,
|
| 486 |
+
0x717: 82,
|
| 487 |
+
0x718: 82,
|
| 488 |
+
0x719: 82,
|
| 489 |
+
0x71A: 68,
|
| 490 |
+
0x71B: 68,
|
| 491 |
+
0x71C: 68,
|
| 492 |
+
0x71D: 68,
|
| 493 |
+
0x71E: 82,
|
| 494 |
+
0x71F: 68,
|
| 495 |
+
0x720: 68,
|
| 496 |
+
0x721: 68,
|
| 497 |
+
0x722: 68,
|
| 498 |
+
0x723: 68,
|
| 499 |
+
0x724: 68,
|
| 500 |
+
0x725: 68,
|
| 501 |
+
0x726: 68,
|
| 502 |
+
0x727: 68,
|
| 503 |
+
0x728: 82,
|
| 504 |
+
0x729: 68,
|
| 505 |
+
0x72A: 82,
|
| 506 |
+
0x72B: 68,
|
| 507 |
+
0x72C: 82,
|
| 508 |
+
0x72D: 68,
|
| 509 |
+
0x72E: 68,
|
| 510 |
+
0x72F: 82,
|
| 511 |
+
0x730: 84,
|
| 512 |
+
0x731: 84,
|
| 513 |
+
0x732: 84,
|
| 514 |
+
0x733: 84,
|
| 515 |
+
0x734: 84,
|
| 516 |
+
0x735: 84,
|
| 517 |
+
0x736: 84,
|
| 518 |
+
0x737: 84,
|
| 519 |
+
0x738: 84,
|
| 520 |
+
0x739: 84,
|
| 521 |
+
0x73A: 84,
|
| 522 |
+
0x73B: 84,
|
| 523 |
+
0x73C: 84,
|
| 524 |
+
0x73D: 84,
|
| 525 |
+
0x73E: 84,
|
| 526 |
+
0x73F: 84,
|
| 527 |
+
0x740: 84,
|
| 528 |
+
0x741: 84,
|
| 529 |
+
0x742: 84,
|
| 530 |
+
0x743: 84,
|
| 531 |
+
0x744: 84,
|
| 532 |
+
0x745: 84,
|
| 533 |
+
0x746: 84,
|
| 534 |
+
0x747: 84,
|
| 535 |
+
0x748: 84,
|
| 536 |
+
0x749: 84,
|
| 537 |
+
0x74A: 84,
|
| 538 |
+
0x74D: 82,
|
| 539 |
+
0x74E: 68,
|
| 540 |
+
0x74F: 68,
|
| 541 |
+
0x750: 68,
|
| 542 |
+
0x751: 68,
|
| 543 |
+
0x752: 68,
|
| 544 |
+
0x753: 68,
|
| 545 |
+
0x754: 68,
|
| 546 |
+
0x755: 68,
|
| 547 |
+
0x756: 68,
|
| 548 |
+
0x757: 68,
|
| 549 |
+
0x758: 68,
|
| 550 |
+
0x759: 82,
|
| 551 |
+
0x75A: 82,
|
| 552 |
+
0x75B: 82,
|
| 553 |
+
0x75C: 68,
|
| 554 |
+
0x75D: 68,
|
| 555 |
+
0x75E: 68,
|
| 556 |
+
0x75F: 68,
|
| 557 |
+
0x760: 68,
|
| 558 |
+
0x761: 68,
|
| 559 |
+
0x762: 68,
|
| 560 |
+
0x763: 68,
|
| 561 |
+
0x764: 68,
|
| 562 |
+
0x765: 68,
|
| 563 |
+
0x766: 68,
|
| 564 |
+
0x767: 68,
|
| 565 |
+
0x768: 68,
|
| 566 |
+
0x769: 68,
|
| 567 |
+
0x76A: 68,
|
| 568 |
+
0x76B: 82,
|
| 569 |
+
0x76C: 82,
|
| 570 |
+
0x76D: 68,
|
| 571 |
+
0x76E: 68,
|
| 572 |
+
0x76F: 68,
|
| 573 |
+
0x770: 68,
|
| 574 |
+
0x771: 82,
|
| 575 |
+
0x772: 68,
|
| 576 |
+
0x773: 82,
|
| 577 |
+
0x774: 82,
|
| 578 |
+
0x775: 68,
|
| 579 |
+
0x776: 68,
|
| 580 |
+
0x777: 68,
|
| 581 |
+
0x778: 82,
|
| 582 |
+
0x779: 82,
|
| 583 |
+
0x77A: 68,
|
| 584 |
+
0x77B: 68,
|
| 585 |
+
0x77C: 68,
|
| 586 |
+
0x77D: 68,
|
| 587 |
+
0x77E: 68,
|
| 588 |
+
0x77F: 68,
|
| 589 |
+
0x7A6: 84,
|
| 590 |
+
0x7A7: 84,
|
| 591 |
+
0x7A8: 84,
|
| 592 |
+
0x7A9: 84,
|
| 593 |
+
0x7AA: 84,
|
| 594 |
+
0x7AB: 84,
|
| 595 |
+
0x7AC: 84,
|
| 596 |
+
0x7AD: 84,
|
| 597 |
+
0x7AE: 84,
|
| 598 |
+
0x7AF: 84,
|
| 599 |
+
0x7B0: 84,
|
| 600 |
+
0x7CA: 68,
|
| 601 |
+
0x7CB: 68,
|
| 602 |
+
0x7CC: 68,
|
| 603 |
+
0x7CD: 68,
|
| 604 |
+
0x7CE: 68,
|
| 605 |
+
0x7CF: 68,
|
| 606 |
+
0x7D0: 68,
|
| 607 |
+
0x7D1: 68,
|
| 608 |
+
0x7D2: 68,
|
| 609 |
+
0x7D3: 68,
|
| 610 |
+
0x7D4: 68,
|
| 611 |
+
0x7D5: 68,
|
| 612 |
+
0x7D6: 68,
|
| 613 |
+
0x7D7: 68,
|
| 614 |
+
0x7D8: 68,
|
| 615 |
+
0x7D9: 68,
|
| 616 |
+
0x7DA: 68,
|
| 617 |
+
0x7DB: 68,
|
| 618 |
+
0x7DC: 68,
|
| 619 |
+
0x7DD: 68,
|
| 620 |
+
0x7DE: 68,
|
| 621 |
+
0x7DF: 68,
|
| 622 |
+
0x7E0: 68,
|
| 623 |
+
0x7E1: 68,
|
| 624 |
+
0x7E2: 68,
|
| 625 |
+
0x7E3: 68,
|
| 626 |
+
0x7E4: 68,
|
| 627 |
+
0x7E5: 68,
|
| 628 |
+
0x7E6: 68,
|
| 629 |
+
0x7E7: 68,
|
| 630 |
+
0x7E8: 68,
|
| 631 |
+
0x7E9: 68,
|
| 632 |
+
0x7EA: 68,
|
| 633 |
+
0x7EB: 84,
|
| 634 |
+
0x7EC: 84,
|
| 635 |
+
0x7ED: 84,
|
| 636 |
+
0x7EE: 84,
|
| 637 |
+
0x7EF: 84,
|
| 638 |
+
0x7F0: 84,
|
| 639 |
+
0x7F1: 84,
|
| 640 |
+
0x7F2: 84,
|
| 641 |
+
0x7F3: 84,
|
| 642 |
+
0x7FA: 67,
|
| 643 |
+
0x7FD: 84,
|
| 644 |
+
0x816: 84,
|
| 645 |
+
0x817: 84,
|
| 646 |
+
0x818: 84,
|
| 647 |
+
0x819: 84,
|
| 648 |
+
0x81B: 84,
|
| 649 |
+
0x81C: 84,
|
| 650 |
+
0x81D: 84,
|
| 651 |
+
0x81E: 84,
|
| 652 |
+
0x81F: 84,
|
| 653 |
+
0x820: 84,
|
| 654 |
+
0x821: 84,
|
| 655 |
+
0x822: 84,
|
| 656 |
+
0x823: 84,
|
| 657 |
+
0x825: 84,
|
| 658 |
+
0x826: 84,
|
| 659 |
+
0x827: 84,
|
| 660 |
+
0x829: 84,
|
| 661 |
+
0x82A: 84,
|
| 662 |
+
0x82B: 84,
|
| 663 |
+
0x82C: 84,
|
| 664 |
+
0x82D: 84,
|
| 665 |
+
0x840: 82,
|
| 666 |
+
0x841: 68,
|
| 667 |
+
0x842: 68,
|
| 668 |
+
0x843: 68,
|
| 669 |
+
0x844: 68,
|
| 670 |
+
0x845: 68,
|
| 671 |
+
0x846: 82,
|
| 672 |
+
0x847: 82,
|
| 673 |
+
0x848: 68,
|
| 674 |
+
0x849: 82,
|
| 675 |
+
0x84A: 68,
|
| 676 |
+
0x84B: 68,
|
| 677 |
+
0x84C: 68,
|
| 678 |
+
0x84D: 68,
|
| 679 |
+
0x84E: 68,
|
| 680 |
+
0x84F: 68,
|
| 681 |
+
0x850: 68,
|
| 682 |
+
0x851: 68,
|
| 683 |
+
0x852: 68,
|
| 684 |
+
0x853: 68,
|
| 685 |
+
0x854: 82,
|
| 686 |
+
0x855: 68,
|
| 687 |
+
0x856: 82,
|
| 688 |
+
0x857: 82,
|
| 689 |
+
0x858: 82,
|
| 690 |
+
0x859: 84,
|
| 691 |
+
0x85A: 84,
|
| 692 |
+
0x85B: 84,
|
| 693 |
+
0x860: 68,
|
| 694 |
+
0x862: 68,
|
| 695 |
+
0x863: 68,
|
| 696 |
+
0x864: 68,
|
| 697 |
+
0x865: 68,
|
| 698 |
+
0x867: 82,
|
| 699 |
+
0x868: 68,
|
| 700 |
+
0x869: 82,
|
| 701 |
+
0x86A: 82,
|
| 702 |
+
0x870: 82,
|
| 703 |
+
0x871: 82,
|
| 704 |
+
0x872: 82,
|
| 705 |
+
0x873: 82,
|
| 706 |
+
0x874: 82,
|
| 707 |
+
0x875: 82,
|
| 708 |
+
0x876: 82,
|
| 709 |
+
0x877: 82,
|
| 710 |
+
0x878: 82,
|
| 711 |
+
0x879: 82,
|
| 712 |
+
0x87A: 82,
|
| 713 |
+
0x87B: 82,
|
| 714 |
+
0x87C: 82,
|
| 715 |
+
0x87D: 82,
|
| 716 |
+
0x87E: 82,
|
| 717 |
+
0x87F: 82,
|
| 718 |
+
0x880: 82,
|
| 719 |
+
0x881: 82,
|
| 720 |
+
0x882: 82,
|
| 721 |
+
0x883: 67,
|
| 722 |
+
0x884: 67,
|
| 723 |
+
0x885: 67,
|
| 724 |
+
0x886: 68,
|
| 725 |
+
0x889: 68,
|
| 726 |
+
0x88A: 68,
|
| 727 |
+
0x88B: 68,
|
| 728 |
+
0x88C: 68,
|
| 729 |
+
0x88D: 68,
|
| 730 |
+
0x88E: 82,
|
| 731 |
+
0x897: 84,
|
| 732 |
+
0x898: 84,
|
| 733 |
+
0x899: 84,
|
| 734 |
+
0x89A: 84,
|
| 735 |
+
0x89B: 84,
|
| 736 |
+
0x89C: 84,
|
| 737 |
+
0x89D: 84,
|
| 738 |
+
0x89E: 84,
|
| 739 |
+
0x89F: 84,
|
| 740 |
+
0x8A0: 68,
|
| 741 |
+
0x8A1: 68,
|
| 742 |
+
0x8A2: 68,
|
| 743 |
+
0x8A3: 68,
|
| 744 |
+
0x8A4: 68,
|
| 745 |
+
0x8A5: 68,
|
| 746 |
+
0x8A6: 68,
|
| 747 |
+
0x8A7: 68,
|
| 748 |
+
0x8A8: 68,
|
| 749 |
+
0x8A9: 68,
|
| 750 |
+
0x8AA: 82,
|
| 751 |
+
0x8AB: 82,
|
| 752 |
+
0x8AC: 82,
|
| 753 |
+
0x8AE: 82,
|
| 754 |
+
0x8AF: 68,
|
| 755 |
+
0x8B0: 68,
|
| 756 |
+
0x8B1: 82,
|
| 757 |
+
0x8B2: 82,
|
| 758 |
+
0x8B3: 68,
|
| 759 |
+
0x8B4: 68,
|
| 760 |
+
0x8B5: 68,
|
| 761 |
+
0x8B6: 68,
|
| 762 |
+
0x8B7: 68,
|
| 763 |
+
0x8B8: 68,
|
| 764 |
+
0x8B9: 82,
|
| 765 |
+
0x8BA: 68,
|
| 766 |
+
0x8BB: 68,
|
| 767 |
+
0x8BC: 68,
|
| 768 |
+
0x8BD: 68,
|
| 769 |
+
0x8BE: 68,
|
| 770 |
+
0x8BF: 68,
|
| 771 |
+
0x8C0: 68,
|
| 772 |
+
0x8C1: 68,
|
| 773 |
+
0x8C2: 68,
|
| 774 |
+
0x8C3: 68,
|
| 775 |
+
0x8C4: 68,
|
| 776 |
+
0x8C5: 68,
|
| 777 |
+
0x8C6: 68,
|
| 778 |
+
0x8C7: 68,
|
| 779 |
+
0x8C8: 68,
|
| 780 |
+
0x8CA: 84,
|
| 781 |
+
0x8CB: 84,
|
| 782 |
+
0x8CC: 84,
|
| 783 |
+
0x8CD: 84,
|
| 784 |
+
0x8CE: 84,
|
| 785 |
+
0x8CF: 84,
|
| 786 |
+
0x8D0: 84,
|
| 787 |
+
0x8D1: 84,
|
| 788 |
+
0x8D2: 84,
|
| 789 |
+
0x8D3: 84,
|
| 790 |
+
0x8D4: 84,
|
| 791 |
+
0x8D5: 84,
|
| 792 |
+
0x8D6: 84,
|
| 793 |
+
0x8D7: 84,
|
| 794 |
+
0x8D8: 84,
|
| 795 |
+
0x8D9: 84,
|
| 796 |
+
0x8DA: 84,
|
| 797 |
+
0x8DB: 84,
|
| 798 |
+
0x8DC: 84,
|
| 799 |
+
0x8DD: 84,
|
| 800 |
+
0x8DE: 84,
|
| 801 |
+
0x8DF: 84,
|
| 802 |
+
0x8E0: 84,
|
| 803 |
+
0x8E1: 84,
|
| 804 |
+
0x8E3: 84,
|
| 805 |
+
0x8E4: 84,
|
| 806 |
+
0x8E5: 84,
|
| 807 |
+
0x8E6: 84,
|
| 808 |
+
0x8E7: 84,
|
| 809 |
+
0x8E8: 84,
|
| 810 |
+
0x8E9: 84,
|
| 811 |
+
0x8EA: 84,
|
| 812 |
+
0x8EB: 84,
|
| 813 |
+
0x8EC: 84,
|
| 814 |
+
0x8ED: 84,
|
| 815 |
+
0x8EE: 84,
|
| 816 |
+
0x8EF: 84,
|
| 817 |
+
0x8F0: 84,
|
| 818 |
+
0x8F1: 84,
|
| 819 |
+
0x8F2: 84,
|
| 820 |
+
0x8F3: 84,
|
| 821 |
+
0x8F4: 84,
|
| 822 |
+
0x8F5: 84,
|
| 823 |
+
0x8F6: 84,
|
| 824 |
+
0x8F7: 84,
|
| 825 |
+
0x8F8: 84,
|
| 826 |
+
0x8F9: 84,
|
| 827 |
+
0x8FA: 84,
|
| 828 |
+
0x8FB: 84,
|
| 829 |
+
0x8FC: 84,
|
| 830 |
+
0x8FD: 84,
|
| 831 |
+
0x8FE: 84,
|
| 832 |
+
0x8FF: 84,
|
| 833 |
+
0x900: 84,
|
| 834 |
+
0x901: 84,
|
| 835 |
+
0x902: 84,
|
| 836 |
+
0x93A: 84,
|
| 837 |
+
0x93C: 84,
|
| 838 |
+
0x941: 84,
|
| 839 |
+
0x942: 84,
|
| 840 |
+
0x943: 84,
|
| 841 |
+
0x944: 84,
|
| 842 |
+
0x945: 84,
|
| 843 |
+
0x946: 84,
|
| 844 |
+
0x947: 84,
|
| 845 |
+
0x948: 84,
|
| 846 |
+
0x94D: 84,
|
| 847 |
+
0x951: 84,
|
| 848 |
+
0x952: 84,
|
| 849 |
+
0x953: 84,
|
| 850 |
+
0x954: 84,
|
| 851 |
+
0x955: 84,
|
| 852 |
+
0x956: 84,
|
| 853 |
+
0x957: 84,
|
| 854 |
+
0x962: 84,
|
| 855 |
+
0x963: 84,
|
| 856 |
+
0x981: 84,
|
| 857 |
+
0x9BC: 84,
|
| 858 |
+
0x9C1: 84,
|
| 859 |
+
0x9C2: 84,
|
| 860 |
+
0x9C3: 84,
|
| 861 |
+
0x9C4: 84,
|
| 862 |
+
0x9CD: 84,
|
| 863 |
+
0x9E2: 84,
|
| 864 |
+
0x9E3: 84,
|
| 865 |
+
0x9FE: 84,
|
| 866 |
+
0xA01: 84,
|
| 867 |
+
0xA02: 84,
|
| 868 |
+
0xA3C: 84,
|
| 869 |
+
0xA41: 84,
|
| 870 |
+
0xA42: 84,
|
| 871 |
+
0xA47: 84,
|
| 872 |
+
0xA48: 84,
|
| 873 |
+
0xA4B: 84,
|
| 874 |
+
0xA4C: 84,
|
| 875 |
+
0xA4D: 84,
|
| 876 |
+
0xA51: 84,
|
| 877 |
+
0xA70: 84,
|
| 878 |
+
0xA71: 84,
|
| 879 |
+
0xA75: 84,
|
| 880 |
+
0xA81: 84,
|
| 881 |
+
0xA82: 84,
|
| 882 |
+
0xABC: 84,
|
| 883 |
+
0xAC1: 84,
|
| 884 |
+
0xAC2: 84,
|
| 885 |
+
0xAC3: 84,
|
| 886 |
+
0xAC4: 84,
|
| 887 |
+
0xAC5: 84,
|
| 888 |
+
0xAC7: 84,
|
| 889 |
+
0xAC8: 84,
|
| 890 |
+
0xACD: 84,
|
| 891 |
+
0xAE2: 84,
|
| 892 |
+
0xAE3: 84,
|
| 893 |
+
0xAFA: 84,
|
| 894 |
+
0xAFB: 84,
|
| 895 |
+
0xAFC: 84,
|
| 896 |
+
0xAFD: 84,
|
| 897 |
+
0xAFE: 84,
|
| 898 |
+
0xAFF: 84,
|
| 899 |
+
0xB01: 84,
|
| 900 |
+
0xB3C: 84,
|
| 901 |
+
0xB3F: 84,
|
| 902 |
+
0xB41: 84,
|
| 903 |
+
0xB42: 84,
|
| 904 |
+
0xB43: 84,
|
| 905 |
+
0xB44: 84,
|
| 906 |
+
0xB4D: 84,
|
| 907 |
+
0xB55: 84,
|
| 908 |
+
0xB56: 84,
|
| 909 |
+
0xB62: 84,
|
| 910 |
+
0xB63: 84,
|
| 911 |
+
0xB82: 84,
|
| 912 |
+
0xBC0: 84,
|
| 913 |
+
0xBCD: 84,
|
| 914 |
+
0xC00: 84,
|
| 915 |
+
0xC04: 84,
|
| 916 |
+
0xC3C: 84,
|
| 917 |
+
0xC3E: 84,
|
| 918 |
+
0xC3F: 84,
|
| 919 |
+
0xC40: 84,
|
| 920 |
+
0xC46: 84,
|
| 921 |
+
0xC47: 84,
|
| 922 |
+
0xC48: 84,
|
| 923 |
+
0xC4A: 84,
|
| 924 |
+
0xC4B: 84,
|
| 925 |
+
0xC4C: 84,
|
| 926 |
+
0xC4D: 84,
|
| 927 |
+
0xC55: 84,
|
| 928 |
+
0xC56: 84,
|
| 929 |
+
0xC62: 84,
|
| 930 |
+
0xC63: 84,
|
| 931 |
+
0xC81: 84,
|
| 932 |
+
0xCBC: 84,
|
| 933 |
+
0xCBF: 84,
|
| 934 |
+
0xCC6: 84,
|
| 935 |
+
0xCCC: 84,
|
| 936 |
+
0xCCD: 84,
|
| 937 |
+
0xCE2: 84,
|
| 938 |
+
0xCE3: 84,
|
| 939 |
+
0xD00: 84,
|
| 940 |
+
0xD01: 84,
|
| 941 |
+
0xD3B: 84,
|
| 942 |
+
0xD3C: 84,
|
| 943 |
+
0xD41: 84,
|
| 944 |
+
0xD42: 84,
|
| 945 |
+
0xD43: 84,
|
| 946 |
+
0xD44: 84,
|
| 947 |
+
0xD4D: 84,
|
| 948 |
+
0xD62: 84,
|
| 949 |
+
0xD63: 84,
|
| 950 |
+
0xD81: 84,
|
| 951 |
+
0xDCA: 84,
|
| 952 |
+
0xDD2: 84,
|
| 953 |
+
0xDD3: 84,
|
| 954 |
+
0xDD4: 84,
|
| 955 |
+
0xDD6: 84,
|
| 956 |
+
0xE31: 84,
|
| 957 |
+
0xE34: 84,
|
| 958 |
+
0xE35: 84,
|
| 959 |
+
0xE36: 84,
|
| 960 |
+
0xE37: 84,
|
| 961 |
+
0xE38: 84,
|
| 962 |
+
0xE39: 84,
|
| 963 |
+
0xE3A: 84,
|
| 964 |
+
0xE47: 84,
|
| 965 |
+
0xE48: 84,
|
| 966 |
+
0xE49: 84,
|
| 967 |
+
0xE4A: 84,
|
| 968 |
+
0xE4B: 84,
|
| 969 |
+
0xE4C: 84,
|
| 970 |
+
0xE4D: 84,
|
| 971 |
+
0xE4E: 84,
|
| 972 |
+
0xEB1: 84,
|
| 973 |
+
0xEB4: 84,
|
| 974 |
+
0xEB5: 84,
|
| 975 |
+
0xEB6: 84,
|
| 976 |
+
0xEB7: 84,
|
| 977 |
+
0xEB8: 84,
|
| 978 |
+
0xEB9: 84,
|
| 979 |
+
0xEBA: 84,
|
| 980 |
+
0xEBB: 84,
|
| 981 |
+
0xEBC: 84,
|
| 982 |
+
0xEC8: 84,
|
| 983 |
+
0xEC9: 84,
|
| 984 |
+
0xECA: 84,
|
| 985 |
+
0xECB: 84,
|
| 986 |
+
0xECC: 84,
|
| 987 |
+
0xECD: 84,
|
| 988 |
+
0xECE: 84,
|
| 989 |
+
0xF18: 84,
|
| 990 |
+
0xF19: 84,
|
| 991 |
+
0xF35: 84,
|
| 992 |
+
0xF37: 84,
|
| 993 |
+
0xF39: 84,
|
| 994 |
+
0xF71: 84,
|
| 995 |
+
0xF72: 84,
|
| 996 |
+
0xF73: 84,
|
| 997 |
+
0xF74: 84,
|
| 998 |
+
0xF75: 84,
|
| 999 |
+
0xF76: 84,
|
| 1000 |
+
0xF77: 84,
|
| 1001 |
+
0xF78: 84,
|
| 1002 |
+
0xF79: 84,
|
| 1003 |
+
0xF7A: 84,
|
| 1004 |
+
0xF7B: 84,
|
| 1005 |
+
0xF7C: 84,
|
| 1006 |
+
0xF7D: 84,
|
| 1007 |
+
0xF7E: 84,
|
| 1008 |
+
0xF80: 84,
|
| 1009 |
+
0xF81: 84,
|
| 1010 |
+
0xF82: 84,
|
| 1011 |
+
0xF83: 84,
|
| 1012 |
+
0xF84: 84,
|
| 1013 |
+
0xF86: 84,
|
| 1014 |
+
0xF87: 84,
|
| 1015 |
+
0xF8D: 84,
|
| 1016 |
+
0xF8E: 84,
|
| 1017 |
+
0xF8F: 84,
|
| 1018 |
+
0xF90: 84,
|
| 1019 |
+
0xF91: 84,
|
| 1020 |
+
0xF92: 84,
|
| 1021 |
+
0xF93: 84,
|
| 1022 |
+
0xF94: 84,
|
| 1023 |
+
0xF95: 84,
|
| 1024 |
+
0xF96: 84,
|
| 1025 |
+
0xF97: 84,
|
| 1026 |
+
0xF99: 84,
|
| 1027 |
+
0xF9A: 84,
|
| 1028 |
+
0xF9B: 84,
|
| 1029 |
+
0xF9C: 84,
|
| 1030 |
+
0xF9D: 84,
|
| 1031 |
+
0xF9E: 84,
|
| 1032 |
+
0xF9F: 84,
|
| 1033 |
+
0xFA0: 84,
|
| 1034 |
+
0xFA1: 84,
|
| 1035 |
+
0xFA2: 84,
|
| 1036 |
+
0xFA3: 84,
|
| 1037 |
+
0xFA4: 84,
|
| 1038 |
+
0xFA5: 84,
|
| 1039 |
+
0xFA6: 84,
|
| 1040 |
+
0xFA7: 84,
|
| 1041 |
+
0xFA8: 84,
|
| 1042 |
+
0xFA9: 84,
|
| 1043 |
+
0xFAA: 84,
|
| 1044 |
+
0xFAB: 84,
|
| 1045 |
+
0xFAC: 84,
|
| 1046 |
+
0xFAD: 84,
|
| 1047 |
+
0xFAE: 84,
|
| 1048 |
+
0xFAF: 84,
|
| 1049 |
+
0xFB0: 84,
|
| 1050 |
+
0xFB1: 84,
|
| 1051 |
+
0xFB2: 84,
|
| 1052 |
+
0xFB3: 84,
|
| 1053 |
+
0xFB4: 84,
|
| 1054 |
+
0xFB5: 84,
|
| 1055 |
+
0xFB6: 84,
|
| 1056 |
+
0xFB7: 84,
|
| 1057 |
+
0xFB8: 84,
|
| 1058 |
+
0xFB9: 84,
|
| 1059 |
+
0xFBA: 84,
|
| 1060 |
+
0xFBB: 84,
|
| 1061 |
+
0xFBC: 84,
|
| 1062 |
+
0xFC6: 84,
|
| 1063 |
+
0x102D: 84,
|
| 1064 |
+
0x102E: 84,
|
| 1065 |
+
0x102F: 84,
|
| 1066 |
+
0x1030: 84,
|
| 1067 |
+
0x1032: 84,
|
| 1068 |
+
0x1033: 84,
|
| 1069 |
+
0x1034: 84,
|
| 1070 |
+
0x1035: 84,
|
| 1071 |
+
0x1036: 84,
|
| 1072 |
+
0x1037: 84,
|
| 1073 |
+
0x1039: 84,
|
| 1074 |
+
0x103A: 84,
|
| 1075 |
+
0x103D: 84,
|
| 1076 |
+
0x103E: 84,
|
| 1077 |
+
0x1058: 84,
|
| 1078 |
+
0x1059: 84,
|
| 1079 |
+
0x105E: 84,
|
| 1080 |
+
0x105F: 84,
|
| 1081 |
+
0x1060: 84,
|
| 1082 |
+
0x1071: 84,
|
| 1083 |
+
0x1072: 84,
|
| 1084 |
+
0x1073: 84,
|
| 1085 |
+
0x1074: 84,
|
| 1086 |
+
0x1082: 84,
|
| 1087 |
+
0x1085: 84,
|
| 1088 |
+
0x1086: 84,
|
| 1089 |
+
0x108D: 84,
|
| 1090 |
+
0x109D: 84,
|
| 1091 |
+
0x135D: 84,
|
| 1092 |
+
0x135E: 84,
|
| 1093 |
+
0x135F: 84,
|
| 1094 |
+
0x1712: 84,
|
| 1095 |
+
0x1713: 84,
|
| 1096 |
+
0x1714: 84,
|
| 1097 |
+
0x1732: 84,
|
| 1098 |
+
0x1733: 84,
|
| 1099 |
+
0x1752: 84,
|
| 1100 |
+
0x1753: 84,
|
| 1101 |
+
0x1772: 84,
|
| 1102 |
+
0x1773: 84,
|
| 1103 |
+
0x17B4: 84,
|
| 1104 |
+
0x17B5: 84,
|
| 1105 |
+
0x17B7: 84,
|
| 1106 |
+
0x17B8: 84,
|
| 1107 |
+
0x17B9: 84,
|
| 1108 |
+
0x17BA: 84,
|
| 1109 |
+
0x17BB: 84,
|
| 1110 |
+
0x17BC: 84,
|
| 1111 |
+
0x17BD: 84,
|
| 1112 |
+
0x17C6: 84,
|
| 1113 |
+
0x17C9: 84,
|
| 1114 |
+
0x17CA: 84,
|
| 1115 |
+
0x17CB: 84,
|
| 1116 |
+
0x17CC: 84,
|
| 1117 |
+
0x17CD: 84,
|
| 1118 |
+
0x17CE: 84,
|
| 1119 |
+
0x17CF: 84,
|
| 1120 |
+
0x17D0: 84,
|
| 1121 |
+
0x17D1: 84,
|
| 1122 |
+
0x17D2: 84,
|
| 1123 |
+
0x17D3: 84,
|
| 1124 |
+
0x17DD: 84,
|
| 1125 |
+
0x1807: 68,
|
| 1126 |
+
0x180A: 67,
|
| 1127 |
+
0x180B: 84,
|
| 1128 |
+
0x180C: 84,
|
| 1129 |
+
0x180D: 84,
|
| 1130 |
+
0x180F: 84,
|
| 1131 |
+
0x1820: 68,
|
| 1132 |
+
0x1821: 68,
|
| 1133 |
+
0x1822: 68,
|
| 1134 |
+
0x1823: 68,
|
| 1135 |
+
0x1824: 68,
|
| 1136 |
+
0x1825: 68,
|
| 1137 |
+
0x1826: 68,
|
| 1138 |
+
0x1827: 68,
|
| 1139 |
+
0x1828: 68,
|
| 1140 |
+
0x1829: 68,
|
| 1141 |
+
0x182A: 68,
|
| 1142 |
+
0x182B: 68,
|
| 1143 |
+
0x182C: 68,
|
| 1144 |
+
0x182D: 68,
|
| 1145 |
+
0x182E: 68,
|
| 1146 |
+
0x182F: 68,
|
| 1147 |
+
0x1830: 68,
|
| 1148 |
+
0x1831: 68,
|
| 1149 |
+
0x1832: 68,
|
| 1150 |
+
0x1833: 68,
|
| 1151 |
+
0x1834: 68,
|
| 1152 |
+
0x1835: 68,
|
| 1153 |
+
0x1836: 68,
|
| 1154 |
+
0x1837: 68,
|
| 1155 |
+
0x1838: 68,
|
| 1156 |
+
0x1839: 68,
|
| 1157 |
+
0x183A: 68,
|
| 1158 |
+
0x183B: 68,
|
| 1159 |
+
0x183C: 68,
|
| 1160 |
+
0x183D: 68,
|
| 1161 |
+
0x183E: 68,
|
| 1162 |
+
0x183F: 68,
|
| 1163 |
+
0x1840: 68,
|
| 1164 |
+
0x1841: 68,
|
| 1165 |
+
0x1842: 68,
|
| 1166 |
+
0x1843: 68,
|
| 1167 |
+
0x1844: 68,
|
| 1168 |
+
0x1845: 68,
|
| 1169 |
+
0x1846: 68,
|
| 1170 |
+
0x1847: 68,
|
| 1171 |
+
0x1848: 68,
|
| 1172 |
+
0x1849: 68,
|
| 1173 |
+
0x184A: 68,
|
| 1174 |
+
0x184B: 68,
|
| 1175 |
+
0x184C: 68,
|
| 1176 |
+
0x184D: 68,
|
| 1177 |
+
0x184E: 68,
|
| 1178 |
+
0x184F: 68,
|
| 1179 |
+
0x1850: 68,
|
| 1180 |
+
0x1851: 68,
|
| 1181 |
+
0x1852: 68,
|
| 1182 |
+
0x1853: 68,
|
| 1183 |
+
0x1854: 68,
|
| 1184 |
+
0x1855: 68,
|
| 1185 |
+
0x1856: 68,
|
| 1186 |
+
0x1857: 68,
|
| 1187 |
+
0x1858: 68,
|
| 1188 |
+
0x1859: 68,
|
| 1189 |
+
0x185A: 68,
|
| 1190 |
+
0x185B: 68,
|
| 1191 |
+
0x185C: 68,
|
| 1192 |
+
0x185D: 68,
|
| 1193 |
+
0x185E: 68,
|
| 1194 |
+
0x185F: 68,
|
| 1195 |
+
0x1860: 68,
|
| 1196 |
+
0x1861: 68,
|
| 1197 |
+
0x1862: 68,
|
| 1198 |
+
0x1863: 68,
|
| 1199 |
+
0x1864: 68,
|
| 1200 |
+
0x1865: 68,
|
| 1201 |
+
0x1866: 68,
|
| 1202 |
+
0x1867: 68,
|
| 1203 |
+
0x1868: 68,
|
| 1204 |
+
0x1869: 68,
|
| 1205 |
+
0x186A: 68,
|
| 1206 |
+
0x186B: 68,
|
| 1207 |
+
0x186C: 68,
|
| 1208 |
+
0x186D: 68,
|
| 1209 |
+
0x186E: 68,
|
| 1210 |
+
0x186F: 68,
|
| 1211 |
+
0x1870: 68,
|
| 1212 |
+
0x1871: 68,
|
| 1213 |
+
0x1872: 68,
|
| 1214 |
+
0x1873: 68,
|
| 1215 |
+
0x1874: 68,
|
| 1216 |
+
0x1875: 68,
|
| 1217 |
+
0x1876: 68,
|
| 1218 |
+
0x1877: 68,
|
| 1219 |
+
0x1878: 68,
|
| 1220 |
+
0x1885: 84,
|
| 1221 |
+
0x1886: 84,
|
| 1222 |
+
0x1887: 68,
|
| 1223 |
+
0x1888: 68,
|
| 1224 |
+
0x1889: 68,
|
| 1225 |
+
0x188A: 68,
|
| 1226 |
+
0x188B: 68,
|
| 1227 |
+
0x188C: 68,
|
| 1228 |
+
0x188D: 68,
|
| 1229 |
+
0x188E: 68,
|
| 1230 |
+
0x188F: 68,
|
| 1231 |
+
0x1890: 68,
|
| 1232 |
+
0x1891: 68,
|
| 1233 |
+
0x1892: 68,
|
| 1234 |
+
0x1893: 68,
|
| 1235 |
+
0x1894: 68,
|
| 1236 |
+
0x1895: 68,
|
| 1237 |
+
0x1896: 68,
|
| 1238 |
+
0x1897: 68,
|
| 1239 |
+
0x1898: 68,
|
| 1240 |
+
0x1899: 68,
|
| 1241 |
+
0x189A: 68,
|
| 1242 |
+
0x189B: 68,
|
| 1243 |
+
0x189C: 68,
|
| 1244 |
+
0x189D: 68,
|
| 1245 |
+
0x189E: 68,
|
| 1246 |
+
0x189F: 68,
|
| 1247 |
+
0x18A0: 68,
|
| 1248 |
+
0x18A1: 68,
|
| 1249 |
+
0x18A2: 68,
|
| 1250 |
+
0x18A3: 68,
|
| 1251 |
+
0x18A4: 68,
|
| 1252 |
+
0x18A5: 68,
|
| 1253 |
+
0x18A6: 68,
|
| 1254 |
+
0x18A7: 68,
|
| 1255 |
+
0x18A8: 68,
|
| 1256 |
+
0x18A9: 84,
|
| 1257 |
+
0x18AA: 68,
|
| 1258 |
+
0x1920: 84,
|
| 1259 |
+
0x1921: 84,
|
| 1260 |
+
0x1922: 84,
|
| 1261 |
+
0x1927: 84,
|
| 1262 |
+
0x1928: 84,
|
| 1263 |
+
0x1932: 84,
|
| 1264 |
+
0x1939: 84,
|
| 1265 |
+
0x193A: 84,
|
| 1266 |
+
0x193B: 84,
|
| 1267 |
+
0x1A17: 84,
|
| 1268 |
+
0x1A18: 84,
|
| 1269 |
+
0x1A1B: 84,
|
| 1270 |
+
0x1A56: 84,
|
| 1271 |
+
0x1A58: 84,
|
| 1272 |
+
0x1A59: 84,
|
| 1273 |
+
0x1A5A: 84,
|
| 1274 |
+
0x1A5B: 84,
|
| 1275 |
+
0x1A5C: 84,
|
| 1276 |
+
0x1A5D: 84,
|
| 1277 |
+
0x1A5E: 84,
|
| 1278 |
+
0x1A60: 84,
|
| 1279 |
+
0x1A62: 84,
|
| 1280 |
+
0x1A65: 84,
|
| 1281 |
+
0x1A66: 84,
|
| 1282 |
+
0x1A67: 84,
|
| 1283 |
+
0x1A68: 84,
|
| 1284 |
+
0x1A69: 84,
|
| 1285 |
+
0x1A6A: 84,
|
| 1286 |
+
0x1A6B: 84,
|
| 1287 |
+
0x1A6C: 84,
|
| 1288 |
+
0x1A73: 84,
|
| 1289 |
+
0x1A74: 84,
|
| 1290 |
+
0x1A75: 84,
|
| 1291 |
+
0x1A76: 84,
|
| 1292 |
+
0x1A77: 84,
|
| 1293 |
+
0x1A78: 84,
|
| 1294 |
+
0x1A79: 84,
|
| 1295 |
+
0x1A7A: 84,
|
| 1296 |
+
0x1A7B: 84,
|
| 1297 |
+
0x1A7C: 84,
|
| 1298 |
+
0x1A7F: 84,
|
| 1299 |
+
0x1AB0: 84,
|
| 1300 |
+
0x1AB1: 84,
|
| 1301 |
+
0x1AB2: 84,
|
| 1302 |
+
0x1AB3: 84,
|
| 1303 |
+
0x1AB4: 84,
|
| 1304 |
+
0x1AB5: 84,
|
| 1305 |
+
0x1AB6: 84,
|
| 1306 |
+
0x1AB7: 84,
|
| 1307 |
+
0x1AB8: 84,
|
| 1308 |
+
0x1AB9: 84,
|
| 1309 |
+
0x1ABA: 84,
|
| 1310 |
+
0x1ABB: 84,
|
| 1311 |
+
0x1ABC: 84,
|
| 1312 |
+
0x1ABD: 84,
|
| 1313 |
+
0x1ABE: 84,
|
| 1314 |
+
0x1ABF: 84,
|
| 1315 |
+
0x1AC0: 84,
|
| 1316 |
+
0x1AC1: 84,
|
| 1317 |
+
0x1AC2: 84,
|
| 1318 |
+
0x1AC3: 84,
|
| 1319 |
+
0x1AC4: 84,
|
| 1320 |
+
0x1AC5: 84,
|
| 1321 |
+
0x1AC6: 84,
|
| 1322 |
+
0x1AC7: 84,
|
| 1323 |
+
0x1AC8: 84,
|
| 1324 |
+
0x1AC9: 84,
|
| 1325 |
+
0x1ACA: 84,
|
| 1326 |
+
0x1ACB: 84,
|
| 1327 |
+
0x1ACC: 84,
|
| 1328 |
+
0x1ACD: 84,
|
| 1329 |
+
0x1ACE: 84,
|
| 1330 |
+
0x1B00: 84,
|
| 1331 |
+
0x1B01: 84,
|
| 1332 |
+
0x1B02: 84,
|
| 1333 |
+
0x1B03: 84,
|
| 1334 |
+
0x1B34: 84,
|
| 1335 |
+
0x1B36: 84,
|
| 1336 |
+
0x1B37: 84,
|
| 1337 |
+
0x1B38: 84,
|
| 1338 |
+
0x1B39: 84,
|
| 1339 |
+
0x1B3A: 84,
|
| 1340 |
+
0x1B3C: 84,
|
| 1341 |
+
0x1B42: 84,
|
| 1342 |
+
0x1B6B: 84,
|
| 1343 |
+
0x1B6C: 84,
|
| 1344 |
+
0x1B6D: 84,
|
| 1345 |
+
0x1B6E: 84,
|
| 1346 |
+
0x1B6F: 84,
|
| 1347 |
+
0x1B70: 84,
|
| 1348 |
+
0x1B71: 84,
|
| 1349 |
+
0x1B72: 84,
|
| 1350 |
+
0x1B73: 84,
|
| 1351 |
+
0x1B80: 84,
|
| 1352 |
+
0x1B81: 84,
|
| 1353 |
+
0x1BA2: 84,
|
| 1354 |
+
0x1BA3: 84,
|
| 1355 |
+
0x1BA4: 84,
|
| 1356 |
+
0x1BA5: 84,
|
| 1357 |
+
0x1BA8: 84,
|
| 1358 |
+
0x1BA9: 84,
|
| 1359 |
+
0x1BAB: 84,
|
| 1360 |
+
0x1BAC: 84,
|
| 1361 |
+
0x1BAD: 84,
|
| 1362 |
+
0x1BE6: 84,
|
| 1363 |
+
0x1BE8: 84,
|
| 1364 |
+
0x1BE9: 84,
|
| 1365 |
+
0x1BED: 84,
|
| 1366 |
+
0x1BEF: 84,
|
| 1367 |
+
0x1BF0: 84,
|
| 1368 |
+
0x1BF1: 84,
|
| 1369 |
+
0x1C2C: 84,
|
| 1370 |
+
0x1C2D: 84,
|
| 1371 |
+
0x1C2E: 84,
|
| 1372 |
+
0x1C2F: 84,
|
| 1373 |
+
0x1C30: 84,
|
| 1374 |
+
0x1C31: 84,
|
| 1375 |
+
0x1C32: 84,
|
| 1376 |
+
0x1C33: 84,
|
| 1377 |
+
0x1C36: 84,
|
| 1378 |
+
0x1C37: 84,
|
| 1379 |
+
0x1CD0: 84,
|
| 1380 |
+
0x1CD1: 84,
|
| 1381 |
+
0x1CD2: 84,
|
| 1382 |
+
0x1CD4: 84,
|
| 1383 |
+
0x1CD5: 84,
|
| 1384 |
+
0x1CD6: 84,
|
| 1385 |
+
0x1CD7: 84,
|
| 1386 |
+
0x1CD8: 84,
|
| 1387 |
+
0x1CD9: 84,
|
| 1388 |
+
0x1CDA: 84,
|
| 1389 |
+
0x1CDB: 84,
|
| 1390 |
+
0x1CDC: 84,
|
| 1391 |
+
0x1CDD: 84,
|
| 1392 |
+
0x1CDE: 84,
|
| 1393 |
+
0x1CDF: 84,
|
| 1394 |
+
0x1CE0: 84,
|
| 1395 |
+
0x1CE2: 84,
|
| 1396 |
+
0x1CE3: 84,
|
| 1397 |
+
0x1CE4: 84,
|
| 1398 |
+
0x1CE5: 84,
|
| 1399 |
+
0x1CE6: 84,
|
| 1400 |
+
0x1CE7: 84,
|
| 1401 |
+
0x1CE8: 84,
|
| 1402 |
+
0x1CED: 84,
|
| 1403 |
+
0x1CF4: 84,
|
| 1404 |
+
0x1CF8: 84,
|
| 1405 |
+
0x1CF9: 84,
|
| 1406 |
+
0x1DC0: 84,
|
| 1407 |
+
0x1DC1: 84,
|
| 1408 |
+
0x1DC2: 84,
|
| 1409 |
+
0x1DC3: 84,
|
| 1410 |
+
0x1DC4: 84,
|
| 1411 |
+
0x1DC5: 84,
|
| 1412 |
+
0x1DC6: 84,
|
| 1413 |
+
0x1DC7: 84,
|
| 1414 |
+
0x1DC8: 84,
|
| 1415 |
+
0x1DC9: 84,
|
| 1416 |
+
0x1DCA: 84,
|
| 1417 |
+
0x1DCB: 84,
|
| 1418 |
+
0x1DCC: 84,
|
| 1419 |
+
0x1DCD: 84,
|
| 1420 |
+
0x1DCE: 84,
|
| 1421 |
+
0x1DCF: 84,
|
| 1422 |
+
0x1DD0: 84,
|
| 1423 |
+
0x1DD1: 84,
|
| 1424 |
+
0x1DD2: 84,
|
| 1425 |
+
0x1DD3: 84,
|
| 1426 |
+
0x1DD4: 84,
|
| 1427 |
+
0x1DD5: 84,
|
| 1428 |
+
0x1DD6: 84,
|
| 1429 |
+
0x1DD7: 84,
|
| 1430 |
+
0x1DD8: 84,
|
| 1431 |
+
0x1DD9: 84,
|
| 1432 |
+
0x1DDA: 84,
|
| 1433 |
+
0x1DDB: 84,
|
| 1434 |
+
0x1DDC: 84,
|
| 1435 |
+
0x1DDD: 84,
|
| 1436 |
+
0x1DDE: 84,
|
| 1437 |
+
0x1DDF: 84,
|
| 1438 |
+
0x1DE0: 84,
|
| 1439 |
+
0x1DE1: 84,
|
| 1440 |
+
0x1DE2: 84,
|
| 1441 |
+
0x1DE3: 84,
|
| 1442 |
+
0x1DE4: 84,
|
| 1443 |
+
0x1DE5: 84,
|
| 1444 |
+
0x1DE6: 84,
|
| 1445 |
+
0x1DE7: 84,
|
| 1446 |
+
0x1DE8: 84,
|
| 1447 |
+
0x1DE9: 84,
|
| 1448 |
+
0x1DEA: 84,
|
| 1449 |
+
0x1DEB: 84,
|
| 1450 |
+
0x1DEC: 84,
|
| 1451 |
+
0x1DED: 84,
|
| 1452 |
+
0x1DEE: 84,
|
| 1453 |
+
0x1DEF: 84,
|
| 1454 |
+
0x1DF0: 84,
|
| 1455 |
+
0x1DF1: 84,
|
| 1456 |
+
0x1DF2: 84,
|
| 1457 |
+
0x1DF3: 84,
|
| 1458 |
+
0x1DF4: 84,
|
| 1459 |
+
0x1DF5: 84,
|
| 1460 |
+
0x1DF6: 84,
|
| 1461 |
+
0x1DF7: 84,
|
| 1462 |
+
0x1DF8: 84,
|
| 1463 |
+
0x1DF9: 84,
|
| 1464 |
+
0x1DFA: 84,
|
| 1465 |
+
0x1DFB: 84,
|
| 1466 |
+
0x1DFC: 84,
|
| 1467 |
+
0x1DFD: 84,
|
| 1468 |
+
0x1DFE: 84,
|
| 1469 |
+
0x1DFF: 84,
|
| 1470 |
+
0x200B: 84,
|
| 1471 |
+
0x200D: 67,
|
| 1472 |
+
0x200E: 84,
|
| 1473 |
+
0x200F: 84,
|
| 1474 |
+
0x202A: 84,
|
| 1475 |
+
0x202B: 84,
|
| 1476 |
+
0x202C: 84,
|
| 1477 |
+
0x202D: 84,
|
| 1478 |
+
0x202E: 84,
|
| 1479 |
+
0x2060: 84,
|
| 1480 |
+
0x2061: 84,
|
| 1481 |
+
0x2062: 84,
|
| 1482 |
+
0x2063: 84,
|
| 1483 |
+
0x2064: 84,
|
| 1484 |
+
0x206A: 84,
|
| 1485 |
+
0x206B: 84,
|
| 1486 |
+
0x206C: 84,
|
| 1487 |
+
0x206D: 84,
|
| 1488 |
+
0x206E: 84,
|
| 1489 |
+
0x206F: 84,
|
| 1490 |
+
0x20D0: 84,
|
| 1491 |
+
0x20D1: 84,
|
| 1492 |
+
0x20D2: 84,
|
| 1493 |
+
0x20D3: 84,
|
| 1494 |
+
0x20D4: 84,
|
| 1495 |
+
0x20D5: 84,
|
| 1496 |
+
0x20D6: 84,
|
| 1497 |
+
0x20D7: 84,
|
| 1498 |
+
0x20D8: 84,
|
| 1499 |
+
0x20D9: 84,
|
| 1500 |
+
0x20DA: 84,
|
| 1501 |
+
0x20DB: 84,
|
| 1502 |
+
0x20DC: 84,
|
| 1503 |
+
0x20DD: 84,
|
| 1504 |
+
0x20DE: 84,
|
| 1505 |
+
0x20DF: 84,
|
| 1506 |
+
0x20E0: 84,
|
| 1507 |
+
0x20E1: 84,
|
| 1508 |
+
0x20E2: 84,
|
| 1509 |
+
0x20E3: 84,
|
| 1510 |
+
0x20E4: 84,
|
| 1511 |
+
0x20E5: 84,
|
| 1512 |
+
0x20E6: 84,
|
| 1513 |
+
0x20E7: 84,
|
| 1514 |
+
0x20E8: 84,
|
| 1515 |
+
0x20E9: 84,
|
| 1516 |
+
0x20EA: 84,
|
| 1517 |
+
0x20EB: 84,
|
| 1518 |
+
0x20EC: 84,
|
| 1519 |
+
0x20ED: 84,
|
| 1520 |
+
0x20EE: 84,
|
| 1521 |
+
0x20EF: 84,
|
| 1522 |
+
0x20F0: 84,
|
| 1523 |
+
0x2CEF: 84,
|
| 1524 |
+
0x2CF0: 84,
|
| 1525 |
+
0x2CF1: 84,
|
| 1526 |
+
0x2D7F: 84,
|
| 1527 |
+
0x2DE0: 84,
|
| 1528 |
+
0x2DE1: 84,
|
| 1529 |
+
0x2DE2: 84,
|
| 1530 |
+
0x2DE3: 84,
|
| 1531 |
+
0x2DE4: 84,
|
| 1532 |
+
0x2DE5: 84,
|
| 1533 |
+
0x2DE6: 84,
|
| 1534 |
+
0x2DE7: 84,
|
| 1535 |
+
0x2DE8: 84,
|
| 1536 |
+
0x2DE9: 84,
|
| 1537 |
+
0x2DEA: 84,
|
| 1538 |
+
0x2DEB: 84,
|
| 1539 |
+
0x2DEC: 84,
|
| 1540 |
+
0x2DED: 84,
|
| 1541 |
+
0x2DEE: 84,
|
| 1542 |
+
0x2DEF: 84,
|
| 1543 |
+
0x2DF0: 84,
|
| 1544 |
+
0x2DF1: 84,
|
| 1545 |
+
0x2DF2: 84,
|
| 1546 |
+
0x2DF3: 84,
|
| 1547 |
+
0x2DF4: 84,
|
| 1548 |
+
0x2DF5: 84,
|
| 1549 |
+
0x2DF6: 84,
|
| 1550 |
+
0x2DF7: 84,
|
| 1551 |
+
0x2DF8: 84,
|
| 1552 |
+
0x2DF9: 84,
|
| 1553 |
+
0x2DFA: 84,
|
| 1554 |
+
0x2DFB: 84,
|
| 1555 |
+
0x2DFC: 84,
|
| 1556 |
+
0x2DFD: 84,
|
| 1557 |
+
0x2DFE: 84,
|
| 1558 |
+
0x2DFF: 84,
|
| 1559 |
+
0x302A: 84,
|
| 1560 |
+
0x302B: 84,
|
| 1561 |
+
0x302C: 84,
|
| 1562 |
+
0x302D: 84,
|
| 1563 |
+
0x3099: 84,
|
| 1564 |
+
0x309A: 84,
|
| 1565 |
+
0xA66F: 84,
|
| 1566 |
+
0xA670: 84,
|
| 1567 |
+
0xA671: 84,
|
| 1568 |
+
0xA672: 84,
|
| 1569 |
+
0xA674: 84,
|
| 1570 |
+
0xA675: 84,
|
| 1571 |
+
0xA676: 84,
|
| 1572 |
+
0xA677: 84,
|
| 1573 |
+
0xA678: 84,
|
| 1574 |
+
0xA679: 84,
|
| 1575 |
+
0xA67A: 84,
|
| 1576 |
+
0xA67B: 84,
|
| 1577 |
+
0xA67C: 84,
|
| 1578 |
+
0xA67D: 84,
|
| 1579 |
+
0xA69E: 84,
|
| 1580 |
+
0xA69F: 84,
|
| 1581 |
+
0xA6F0: 84,
|
| 1582 |
+
0xA6F1: 84,
|
| 1583 |
+
0xA802: 84,
|
| 1584 |
+
0xA806: 84,
|
| 1585 |
+
0xA80B: 84,
|
| 1586 |
+
0xA825: 84,
|
| 1587 |
+
0xA826: 84,
|
| 1588 |
+
0xA82C: 84,
|
| 1589 |
+
0xA840: 68,
|
| 1590 |
+
0xA841: 68,
|
| 1591 |
+
0xA842: 68,
|
| 1592 |
+
0xA843: 68,
|
| 1593 |
+
0xA844: 68,
|
| 1594 |
+
0xA845: 68,
|
| 1595 |
+
0xA846: 68,
|
| 1596 |
+
0xA847: 68,
|
| 1597 |
+
0xA848: 68,
|
| 1598 |
+
0xA849: 68,
|
| 1599 |
+
0xA84A: 68,
|
| 1600 |
+
0xA84B: 68,
|
| 1601 |
+
0xA84C: 68,
|
| 1602 |
+
0xA84D: 68,
|
| 1603 |
+
0xA84E: 68,
|
| 1604 |
+
0xA84F: 68,
|
| 1605 |
+
0xA850: 68,
|
| 1606 |
+
0xA851: 68,
|
| 1607 |
+
0xA852: 68,
|
| 1608 |
+
0xA853: 68,
|
| 1609 |
+
0xA854: 68,
|
| 1610 |
+
0xA855: 68,
|
| 1611 |
+
0xA856: 68,
|
| 1612 |
+
0xA857: 68,
|
| 1613 |
+
0xA858: 68,
|
| 1614 |
+
0xA859: 68,
|
| 1615 |
+
0xA85A: 68,
|
| 1616 |
+
0xA85B: 68,
|
| 1617 |
+
0xA85C: 68,
|
| 1618 |
+
0xA85D: 68,
|
| 1619 |
+
0xA85E: 68,
|
| 1620 |
+
0xA85F: 68,
|
| 1621 |
+
0xA860: 68,
|
| 1622 |
+
0xA861: 68,
|
| 1623 |
+
0xA862: 68,
|
| 1624 |
+
0xA863: 68,
|
| 1625 |
+
0xA864: 68,
|
| 1626 |
+
0xA865: 68,
|
| 1627 |
+
0xA866: 68,
|
| 1628 |
+
0xA867: 68,
|
| 1629 |
+
0xA868: 68,
|
| 1630 |
+
0xA869: 68,
|
| 1631 |
+
0xA86A: 68,
|
| 1632 |
+
0xA86B: 68,
|
| 1633 |
+
0xA86C: 68,
|
| 1634 |
+
0xA86D: 68,
|
| 1635 |
+
0xA86E: 68,
|
| 1636 |
+
0xA86F: 68,
|
| 1637 |
+
0xA870: 68,
|
| 1638 |
+
0xA871: 68,
|
| 1639 |
+
0xA872: 76,
|
| 1640 |
+
0xA8C4: 84,
|
| 1641 |
+
0xA8C5: 84,
|
| 1642 |
+
0xA8E0: 84,
|
| 1643 |
+
0xA8E1: 84,
|
| 1644 |
+
0xA8E2: 84,
|
| 1645 |
+
0xA8E3: 84,
|
| 1646 |
+
0xA8E4: 84,
|
| 1647 |
+
0xA8E5: 84,
|
| 1648 |
+
0xA8E6: 84,
|
| 1649 |
+
0xA8E7: 84,
|
| 1650 |
+
0xA8E8: 84,
|
| 1651 |
+
0xA8E9: 84,
|
| 1652 |
+
0xA8EA: 84,
|
| 1653 |
+
0xA8EB: 84,
|
| 1654 |
+
0xA8EC: 84,
|
| 1655 |
+
0xA8ED: 84,
|
| 1656 |
+
0xA8EE: 84,
|
| 1657 |
+
0xA8EF: 84,
|
| 1658 |
+
0xA8F0: 84,
|
| 1659 |
+
0xA8F1: 84,
|
| 1660 |
+
0xA8FF: 84,
|
| 1661 |
+
0xA926: 84,
|
| 1662 |
+
0xA927: 84,
|
| 1663 |
+
0xA928: 84,
|
| 1664 |
+
0xA929: 84,
|
| 1665 |
+
0xA92A: 84,
|
| 1666 |
+
0xA92B: 84,
|
| 1667 |
+
0xA92C: 84,
|
| 1668 |
+
0xA92D: 84,
|
| 1669 |
+
0xA947: 84,
|
| 1670 |
+
0xA948: 84,
|
| 1671 |
+
0xA949: 84,
|
| 1672 |
+
0xA94A: 84,
|
| 1673 |
+
0xA94B: 84,
|
| 1674 |
+
0xA94C: 84,
|
| 1675 |
+
0xA94D: 84,
|
| 1676 |
+
0xA94E: 84,
|
| 1677 |
+
0xA94F: 84,
|
| 1678 |
+
0xA950: 84,
|
| 1679 |
+
0xA951: 84,
|
| 1680 |
+
0xA980: 84,
|
| 1681 |
+
0xA981: 84,
|
| 1682 |
+
0xA982: 84,
|
| 1683 |
+
0xA9B3: 84,
|
| 1684 |
+
0xA9B6: 84,
|
| 1685 |
+
0xA9B7: 84,
|
| 1686 |
+
0xA9B8: 84,
|
| 1687 |
+
0xA9B9: 84,
|
| 1688 |
+
0xA9BC: 84,
|
| 1689 |
+
0xA9BD: 84,
|
| 1690 |
+
0xA9E5: 84,
|
| 1691 |
+
0xAA29: 84,
|
| 1692 |
+
0xAA2A: 84,
|
| 1693 |
+
0xAA2B: 84,
|
| 1694 |
+
0xAA2C: 84,
|
| 1695 |
+
0xAA2D: 84,
|
| 1696 |
+
0xAA2E: 84,
|
| 1697 |
+
0xAA31: 84,
|
| 1698 |
+
0xAA32: 84,
|
| 1699 |
+
0xAA35: 84,
|
| 1700 |
+
0xAA36: 84,
|
| 1701 |
+
0xAA43: 84,
|
| 1702 |
+
0xAA4C: 84,
|
| 1703 |
+
0xAA7C: 84,
|
| 1704 |
+
0xAAB0: 84,
|
| 1705 |
+
0xAAB2: 84,
|
| 1706 |
+
0xAAB3: 84,
|
| 1707 |
+
0xAAB4: 84,
|
| 1708 |
+
0xAAB7: 84,
|
| 1709 |
+
0xAAB8: 84,
|
| 1710 |
+
0xAABE: 84,
|
| 1711 |
+
0xAABF: 84,
|
| 1712 |
+
0xAAC1: 84,
|
| 1713 |
+
0xAAEC: 84,
|
| 1714 |
+
0xAAED: 84,
|
| 1715 |
+
0xAAF6: 84,
|
| 1716 |
+
0xABE5: 84,
|
| 1717 |
+
0xABE8: 84,
|
| 1718 |
+
0xABED: 84,
|
| 1719 |
+
0xFB1E: 84,
|
| 1720 |
+
0xFE00: 84,
|
| 1721 |
+
0xFE01: 84,
|
| 1722 |
+
0xFE02: 84,
|
| 1723 |
+
0xFE03: 84,
|
| 1724 |
+
0xFE04: 84,
|
| 1725 |
+
0xFE05: 84,
|
| 1726 |
+
0xFE06: 84,
|
| 1727 |
+
0xFE07: 84,
|
| 1728 |
+
0xFE08: 84,
|
| 1729 |
+
0xFE09: 84,
|
| 1730 |
+
0xFE0A: 84,
|
| 1731 |
+
0xFE0B: 84,
|
| 1732 |
+
0xFE0C: 84,
|
| 1733 |
+
0xFE0D: 84,
|
| 1734 |
+
0xFE0E: 84,
|
| 1735 |
+
0xFE0F: 84,
|
| 1736 |
+
0xFE20: 84,
|
| 1737 |
+
0xFE21: 84,
|
| 1738 |
+
0xFE22: 84,
|
| 1739 |
+
0xFE23: 84,
|
| 1740 |
+
0xFE24: 84,
|
| 1741 |
+
0xFE25: 84,
|
| 1742 |
+
0xFE26: 84,
|
| 1743 |
+
0xFE27: 84,
|
| 1744 |
+
0xFE28: 84,
|
| 1745 |
+
0xFE29: 84,
|
| 1746 |
+
0xFE2A: 84,
|
| 1747 |
+
0xFE2B: 84,
|
| 1748 |
+
0xFE2C: 84,
|
| 1749 |
+
0xFE2D: 84,
|
| 1750 |
+
0xFE2E: 84,
|
| 1751 |
+
0xFE2F: 84,
|
| 1752 |
+
0xFEFF: 84,
|
| 1753 |
+
0xFFF9: 84,
|
| 1754 |
+
0xFFFA: 84,
|
| 1755 |
+
0xFFFB: 84,
|
| 1756 |
+
0x101FD: 84,
|
| 1757 |
+
0x102E0: 84,
|
| 1758 |
+
0x10376: 84,
|
| 1759 |
+
0x10377: 84,
|
| 1760 |
+
0x10378: 84,
|
| 1761 |
+
0x10379: 84,
|
| 1762 |
+
0x1037A: 84,
|
| 1763 |
+
0x10A01: 84,
|
| 1764 |
+
0x10A02: 84,
|
| 1765 |
+
0x10A03: 84,
|
| 1766 |
+
0x10A05: 84,
|
| 1767 |
+
0x10A06: 84,
|
| 1768 |
+
0x10A0C: 84,
|
| 1769 |
+
0x10A0D: 84,
|
| 1770 |
+
0x10A0E: 84,
|
| 1771 |
+
0x10A0F: 84,
|
| 1772 |
+
0x10A38: 84,
|
| 1773 |
+
0x10A39: 84,
|
| 1774 |
+
0x10A3A: 84,
|
| 1775 |
+
0x10A3F: 84,
|
| 1776 |
+
0x10AC0: 68,
|
| 1777 |
+
0x10AC1: 68,
|
| 1778 |
+
0x10AC2: 68,
|
| 1779 |
+
0x10AC3: 68,
|
| 1780 |
+
0x10AC4: 68,
|
| 1781 |
+
0x10AC5: 82,
|
| 1782 |
+
0x10AC7: 82,
|
| 1783 |
+
0x10AC9: 82,
|
| 1784 |
+
0x10ACA: 82,
|
| 1785 |
+
0x10ACD: 76,
|
| 1786 |
+
0x10ACE: 82,
|
| 1787 |
+
0x10ACF: 82,
|
| 1788 |
+
0x10AD0: 82,
|
| 1789 |
+
0x10AD1: 82,
|
| 1790 |
+
0x10AD2: 82,
|
| 1791 |
+
0x10AD3: 68,
|
| 1792 |
+
0x10AD4: 68,
|
| 1793 |
+
0x10AD5: 68,
|
| 1794 |
+
0x10AD6: 68,
|
| 1795 |
+
0x10AD7: 76,
|
| 1796 |
+
0x10AD8: 68,
|
| 1797 |
+
0x10AD9: 68,
|
| 1798 |
+
0x10ADA: 68,
|
| 1799 |
+
0x10ADB: 68,
|
| 1800 |
+
0x10ADC: 68,
|
| 1801 |
+
0x10ADD: 82,
|
| 1802 |
+
0x10ADE: 68,
|
| 1803 |
+
0x10ADF: 68,
|
| 1804 |
+
0x10AE0: 68,
|
| 1805 |
+
0x10AE1: 82,
|
| 1806 |
+
0x10AE4: 82,
|
| 1807 |
+
0x10AE5: 84,
|
| 1808 |
+
0x10AE6: 84,
|
| 1809 |
+
0x10AEB: 68,
|
| 1810 |
+
0x10AEC: 68,
|
| 1811 |
+
0x10AED: 68,
|
| 1812 |
+
0x10AEE: 68,
|
| 1813 |
+
0x10AEF: 82,
|
| 1814 |
+
0x10B80: 68,
|
| 1815 |
+
0x10B81: 82,
|
| 1816 |
+
0x10B82: 68,
|
| 1817 |
+
0x10B83: 82,
|
| 1818 |
+
0x10B84: 82,
|
| 1819 |
+
0x10B85: 82,
|
| 1820 |
+
0x10B86: 68,
|
| 1821 |
+
0x10B87: 68,
|
| 1822 |
+
0x10B88: 68,
|
| 1823 |
+
0x10B89: 82,
|
| 1824 |
+
0x10B8A: 68,
|
| 1825 |
+
0x10B8B: 68,
|
| 1826 |
+
0x10B8C: 82,
|
| 1827 |
+
0x10B8D: 68,
|
| 1828 |
+
0x10B8E: 82,
|
| 1829 |
+
0x10B8F: 82,
|
| 1830 |
+
0x10B90: 68,
|
| 1831 |
+
0x10B91: 82,
|
| 1832 |
+
0x10BA9: 82,
|
| 1833 |
+
0x10BAA: 82,
|
| 1834 |
+
0x10BAB: 82,
|
| 1835 |
+
0x10BAC: 82,
|
| 1836 |
+
0x10BAD: 68,
|
| 1837 |
+
0x10BAE: 68,
|
| 1838 |
+
0x10D00: 76,
|
| 1839 |
+
0x10D01: 68,
|
| 1840 |
+
0x10D02: 68,
|
| 1841 |
+
0x10D03: 68,
|
| 1842 |
+
0x10D04: 68,
|
| 1843 |
+
0x10D05: 68,
|
| 1844 |
+
0x10D06: 68,
|
| 1845 |
+
0x10D07: 68,
|
| 1846 |
+
0x10D08: 68,
|
| 1847 |
+
0x10D09: 68,
|
| 1848 |
+
0x10D0A: 68,
|
| 1849 |
+
0x10D0B: 68,
|
| 1850 |
+
0x10D0C: 68,
|
| 1851 |
+
0x10D0D: 68,
|
| 1852 |
+
0x10D0E: 68,
|
| 1853 |
+
0x10D0F: 68,
|
| 1854 |
+
0x10D10: 68,
|
| 1855 |
+
0x10D11: 68,
|
| 1856 |
+
0x10D12: 68,
|
| 1857 |
+
0x10D13: 68,
|
| 1858 |
+
0x10D14: 68,
|
| 1859 |
+
0x10D15: 68,
|
| 1860 |
+
0x10D16: 68,
|
| 1861 |
+
0x10D17: 68,
|
| 1862 |
+
0x10D18: 68,
|
| 1863 |
+
0x10D19: 68,
|
| 1864 |
+
0x10D1A: 68,
|
| 1865 |
+
0x10D1B: 68,
|
| 1866 |
+
0x10D1C: 68,
|
| 1867 |
+
0x10D1D: 68,
|
| 1868 |
+
0x10D1E: 68,
|
| 1869 |
+
0x10D1F: 68,
|
| 1870 |
+
0x10D20: 68,
|
| 1871 |
+
0x10D21: 68,
|
| 1872 |
+
0x10D22: 82,
|
| 1873 |
+
0x10D23: 68,
|
| 1874 |
+
0x10D24: 84,
|
| 1875 |
+
0x10D25: 84,
|
| 1876 |
+
0x10D26: 84,
|
| 1877 |
+
0x10D27: 84,
|
| 1878 |
+
0x10D69: 84,
|
| 1879 |
+
0x10D6A: 84,
|
| 1880 |
+
0x10D6B: 84,
|
| 1881 |
+
0x10D6C: 84,
|
| 1882 |
+
0x10D6D: 84,
|
| 1883 |
+
0x10EAB: 84,
|
| 1884 |
+
0x10EAC: 84,
|
| 1885 |
+
0x10EC2: 82,
|
| 1886 |
+
0x10EC3: 68,
|
| 1887 |
+
0x10EC4: 68,
|
| 1888 |
+
0x10EFC: 84,
|
| 1889 |
+
0x10EFD: 84,
|
| 1890 |
+
0x10EFE: 84,
|
| 1891 |
+
0x10EFF: 84,
|
| 1892 |
+
0x10F30: 68,
|
| 1893 |
+
0x10F31: 68,
|
| 1894 |
+
0x10F32: 68,
|
| 1895 |
+
0x10F33: 82,
|
| 1896 |
+
0x10F34: 68,
|
| 1897 |
+
0x10F35: 68,
|
| 1898 |
+
0x10F36: 68,
|
| 1899 |
+
0x10F37: 68,
|
| 1900 |
+
0x10F38: 68,
|
| 1901 |
+
0x10F39: 68,
|
| 1902 |
+
0x10F3A: 68,
|
| 1903 |
+
0x10F3B: 68,
|
| 1904 |
+
0x10F3C: 68,
|
| 1905 |
+
0x10F3D: 68,
|
| 1906 |
+
0x10F3E: 68,
|
| 1907 |
+
0x10F3F: 68,
|
| 1908 |
+
0x10F40: 68,
|
| 1909 |
+
0x10F41: 68,
|
| 1910 |
+
0x10F42: 68,
|
| 1911 |
+
0x10F43: 68,
|
| 1912 |
+
0x10F44: 68,
|
| 1913 |
+
0x10F46: 84,
|
| 1914 |
+
0x10F47: 84,
|
| 1915 |
+
0x10F48: 84,
|
| 1916 |
+
0x10F49: 84,
|
| 1917 |
+
0x10F4A: 84,
|
| 1918 |
+
0x10F4B: 84,
|
| 1919 |
+
0x10F4C: 84,
|
| 1920 |
+
0x10F4D: 84,
|
| 1921 |
+
0x10F4E: 84,
|
| 1922 |
+
0x10F4F: 84,
|
| 1923 |
+
0x10F50: 84,
|
| 1924 |
+
0x10F51: 68,
|
| 1925 |
+
0x10F52: 68,
|
| 1926 |
+
0x10F53: 68,
|
| 1927 |
+
0x10F54: 82,
|
| 1928 |
+
0x10F70: 68,
|
| 1929 |
+
0x10F71: 68,
|
| 1930 |
+
0x10F72: 68,
|
| 1931 |
+
0x10F73: 68,
|
| 1932 |
+
0x10F74: 82,
|
| 1933 |
+
0x10F75: 82,
|
| 1934 |
+
0x10F76: 68,
|
| 1935 |
+
0x10F77: 68,
|
| 1936 |
+
0x10F78: 68,
|
| 1937 |
+
0x10F79: 68,
|
| 1938 |
+
0x10F7A: 68,
|
| 1939 |
+
0x10F7B: 68,
|
| 1940 |
+
0x10F7C: 68,
|
| 1941 |
+
0x10F7D: 68,
|
| 1942 |
+
0x10F7E: 68,
|
| 1943 |
+
0x10F7F: 68,
|
| 1944 |
+
0x10F80: 68,
|
| 1945 |
+
0x10F81: 68,
|
| 1946 |
+
0x10F82: 84,
|
| 1947 |
+
0x10F83: 84,
|
| 1948 |
+
0x10F84: 84,
|
| 1949 |
+
0x10F85: 84,
|
| 1950 |
+
0x10FB0: 68,
|
| 1951 |
+
0x10FB2: 68,
|
| 1952 |
+
0x10FB3: 68,
|
| 1953 |
+
0x10FB4: 82,
|
| 1954 |
+
0x10FB5: 82,
|
| 1955 |
+
0x10FB6: 82,
|
| 1956 |
+
0x10FB8: 68,
|
| 1957 |
+
0x10FB9: 82,
|
| 1958 |
+
0x10FBA: 82,
|
| 1959 |
+
0x10FBB: 68,
|
| 1960 |
+
0x10FBC: 68,
|
| 1961 |
+
0x10FBD: 82,
|
| 1962 |
+
0x10FBE: 68,
|
| 1963 |
+
0x10FBF: 68,
|
| 1964 |
+
0x10FC1: 68,
|
| 1965 |
+
0x10FC2: 82,
|
| 1966 |
+
0x10FC3: 82,
|
| 1967 |
+
0x10FC4: 68,
|
| 1968 |
+
0x10FC9: 82,
|
| 1969 |
+
0x10FCA: 68,
|
| 1970 |
+
0x10FCB: 76,
|
| 1971 |
+
0x11001: 84,
|
| 1972 |
+
0x11038: 84,
|
| 1973 |
+
0x11039: 84,
|
| 1974 |
+
0x1103A: 84,
|
| 1975 |
+
0x1103B: 84,
|
| 1976 |
+
0x1103C: 84,
|
| 1977 |
+
0x1103D: 84,
|
| 1978 |
+
0x1103E: 84,
|
| 1979 |
+
0x1103F: 84,
|
| 1980 |
+
0x11040: 84,
|
| 1981 |
+
0x11041: 84,
|
| 1982 |
+
0x11042: 84,
|
| 1983 |
+
0x11043: 84,
|
| 1984 |
+
0x11044: 84,
|
| 1985 |
+
0x11045: 84,
|
| 1986 |
+
0x11046: 84,
|
| 1987 |
+
0x11070: 84,
|
| 1988 |
+
0x11073: 84,
|
| 1989 |
+
0x11074: 84,
|
| 1990 |
+
0x1107F: 84,
|
| 1991 |
+
0x11080: 84,
|
| 1992 |
+
0x11081: 84,
|
| 1993 |
+
0x110B3: 84,
|
| 1994 |
+
0x110B4: 84,
|
| 1995 |
+
0x110B5: 84,
|
| 1996 |
+
0x110B6: 84,
|
| 1997 |
+
0x110B9: 84,
|
| 1998 |
+
0x110BA: 84,
|
| 1999 |
+
0x110C2: 84,
|
| 2000 |
+
0x11100: 84,
|
| 2001 |
+
0x11101: 84,
|
| 2002 |
+
0x11102: 84,
|
| 2003 |
+
0x11127: 84,
|
| 2004 |
+
0x11128: 84,
|
| 2005 |
+
0x11129: 84,
|
| 2006 |
+
0x1112A: 84,
|
| 2007 |
+
0x1112B: 84,
|
| 2008 |
+
0x1112D: 84,
|
| 2009 |
+
0x1112E: 84,
|
| 2010 |
+
0x1112F: 84,
|
| 2011 |
+
0x11130: 84,
|
| 2012 |
+
0x11131: 84,
|
| 2013 |
+
0x11132: 84,
|
| 2014 |
+
0x11133: 84,
|
| 2015 |
+
0x11134: 84,
|
| 2016 |
+
0x11173: 84,
|
| 2017 |
+
0x11180: 84,
|
| 2018 |
+
0x11181: 84,
|
| 2019 |
+
0x111B6: 84,
|
| 2020 |
+
0x111B7: 84,
|
| 2021 |
+
0x111B8: 84,
|
| 2022 |
+
0x111B9: 84,
|
| 2023 |
+
0x111BA: 84,
|
| 2024 |
+
0x111BB: 84,
|
| 2025 |
+
0x111BC: 84,
|
| 2026 |
+
0x111BD: 84,
|
| 2027 |
+
0x111BE: 84,
|
| 2028 |
+
0x111C9: 84,
|
| 2029 |
+
0x111CA: 84,
|
| 2030 |
+
0x111CB: 84,
|
| 2031 |
+
0x111CC: 84,
|
| 2032 |
+
0x111CF: 84,
|
| 2033 |
+
0x1122F: 84,
|
| 2034 |
+
0x11230: 84,
|
| 2035 |
+
0x11231: 84,
|
| 2036 |
+
0x11234: 84,
|
| 2037 |
+
0x11236: 84,
|
| 2038 |
+
0x11237: 84,
|
| 2039 |
+
0x1123E: 84,
|
| 2040 |
+
0x11241: 84,
|
| 2041 |
+
0x112DF: 84,
|
| 2042 |
+
0x112E3: 84,
|
| 2043 |
+
0x112E4: 84,
|
| 2044 |
+
0x112E5: 84,
|
| 2045 |
+
0x112E6: 84,
|
| 2046 |
+
0x112E7: 84,
|
| 2047 |
+
0x112E8: 84,
|
| 2048 |
+
0x112E9: 84,
|
| 2049 |
+
0x112EA: 84,
|
| 2050 |
+
0x11300: 84,
|
| 2051 |
+
0x11301: 84,
|
| 2052 |
+
0x1133B: 84,
|
| 2053 |
+
0x1133C: 84,
|
| 2054 |
+
0x11340: 84,
|
| 2055 |
+
0x11366: 84,
|
| 2056 |
+
0x11367: 84,
|
| 2057 |
+
0x11368: 84,
|
| 2058 |
+
0x11369: 84,
|
| 2059 |
+
0x1136A: 84,
|
| 2060 |
+
0x1136B: 84,
|
| 2061 |
+
0x1136C: 84,
|
| 2062 |
+
0x11370: 84,
|
| 2063 |
+
0x11371: 84,
|
| 2064 |
+
0x11372: 84,
|
| 2065 |
+
0x11373: 84,
|
| 2066 |
+
0x11374: 84,
|
| 2067 |
+
0x113BB: 84,
|
| 2068 |
+
0x113BC: 84,
|
| 2069 |
+
0x113BD: 84,
|
| 2070 |
+
0x113BE: 84,
|
| 2071 |
+
0x113BF: 84,
|
| 2072 |
+
0x113C0: 84,
|
| 2073 |
+
0x113CE: 84,
|
| 2074 |
+
0x113D0: 84,
|
| 2075 |
+
0x113D2: 84,
|
| 2076 |
+
0x113E1: 84,
|
| 2077 |
+
0x113E2: 84,
|
| 2078 |
+
0x11438: 84,
|
| 2079 |
+
0x11439: 84,
|
| 2080 |
+
0x1143A: 84,
|
| 2081 |
+
0x1143B: 84,
|
| 2082 |
+
0x1143C: 84,
|
| 2083 |
+
0x1143D: 84,
|
| 2084 |
+
0x1143E: 84,
|
| 2085 |
+
0x1143F: 84,
|
| 2086 |
+
0x11442: 84,
|
| 2087 |
+
0x11443: 84,
|
| 2088 |
+
0x11444: 84,
|
| 2089 |
+
0x11446: 84,
|
| 2090 |
+
0x1145E: 84,
|
| 2091 |
+
0x114B3: 84,
|
| 2092 |
+
0x114B4: 84,
|
| 2093 |
+
0x114B5: 84,
|
| 2094 |
+
0x114B6: 84,
|
| 2095 |
+
0x114B7: 84,
|
| 2096 |
+
0x114B8: 84,
|
| 2097 |
+
0x114BA: 84,
|
| 2098 |
+
0x114BF: 84,
|
| 2099 |
+
0x114C0: 84,
|
| 2100 |
+
0x114C2: 84,
|
| 2101 |
+
0x114C3: 84,
|
| 2102 |
+
0x115B2: 84,
|
| 2103 |
+
0x115B3: 84,
|
| 2104 |
+
0x115B4: 84,
|
| 2105 |
+
0x115B5: 84,
|
| 2106 |
+
0x115BC: 84,
|
| 2107 |
+
0x115BD: 84,
|
| 2108 |
+
0x115BF: 84,
|
| 2109 |
+
0x115C0: 84,
|
| 2110 |
+
0x115DC: 84,
|
| 2111 |
+
0x115DD: 84,
|
| 2112 |
+
0x11633: 84,
|
| 2113 |
+
0x11634: 84,
|
| 2114 |
+
0x11635: 84,
|
| 2115 |
+
0x11636: 84,
|
| 2116 |
+
0x11637: 84,
|
| 2117 |
+
0x11638: 84,
|
| 2118 |
+
0x11639: 84,
|
| 2119 |
+
0x1163A: 84,
|
| 2120 |
+
0x1163D: 84,
|
| 2121 |
+
0x1163F: 84,
|
| 2122 |
+
0x11640: 84,
|
| 2123 |
+
0x116AB: 84,
|
| 2124 |
+
0x116AD: 84,
|
| 2125 |
+
0x116B0: 84,
|
| 2126 |
+
0x116B1: 84,
|
| 2127 |
+
0x116B2: 84,
|
| 2128 |
+
0x116B3: 84,
|
| 2129 |
+
0x116B4: 84,
|
| 2130 |
+
0x116B5: 84,
|
| 2131 |
+
0x116B7: 84,
|
| 2132 |
+
0x1171D: 84,
|
| 2133 |
+
0x1171F: 84,
|
| 2134 |
+
0x11722: 84,
|
| 2135 |
+
0x11723: 84,
|
| 2136 |
+
0x11724: 84,
|
| 2137 |
+
0x11725: 84,
|
| 2138 |
+
0x11727: 84,
|
| 2139 |
+
0x11728: 84,
|
| 2140 |
+
0x11729: 84,
|
| 2141 |
+
0x1172A: 84,
|
| 2142 |
+
0x1172B: 84,
|
| 2143 |
+
0x1182F: 84,
|
| 2144 |
+
0x11830: 84,
|
| 2145 |
+
0x11831: 84,
|
| 2146 |
+
0x11832: 84,
|
| 2147 |
+
0x11833: 84,
|
| 2148 |
+
0x11834: 84,
|
| 2149 |
+
0x11835: 84,
|
| 2150 |
+
0x11836: 84,
|
| 2151 |
+
0x11837: 84,
|
| 2152 |
+
0x11839: 84,
|
| 2153 |
+
0x1183A: 84,
|
| 2154 |
+
0x1193B: 84,
|
| 2155 |
+
0x1193C: 84,
|
| 2156 |
+
0x1193E: 84,
|
| 2157 |
+
0x11943: 84,
|
| 2158 |
+
0x119D4: 84,
|
| 2159 |
+
0x119D5: 84,
|
| 2160 |
+
0x119D6: 84,
|
| 2161 |
+
0x119D7: 84,
|
| 2162 |
+
0x119DA: 84,
|
| 2163 |
+
0x119DB: 84,
|
| 2164 |
+
0x119E0: 84,
|
| 2165 |
+
0x11A01: 84,
|
| 2166 |
+
0x11A02: 84,
|
| 2167 |
+
0x11A03: 84,
|
| 2168 |
+
0x11A04: 84,
|
| 2169 |
+
0x11A05: 84,
|
| 2170 |
+
0x11A06: 84,
|
| 2171 |
+
0x11A07: 84,
|
| 2172 |
+
0x11A08: 84,
|
| 2173 |
+
0x11A09: 84,
|
| 2174 |
+
0x11A0A: 84,
|
| 2175 |
+
0x11A33: 84,
|
| 2176 |
+
0x11A34: 84,
|
| 2177 |
+
0x11A35: 84,
|
| 2178 |
+
0x11A36: 84,
|
| 2179 |
+
0x11A37: 84,
|
| 2180 |
+
0x11A38: 84,
|
| 2181 |
+
0x11A3B: 84,
|
| 2182 |
+
0x11A3C: 84,
|
| 2183 |
+
0x11A3D: 84,
|
| 2184 |
+
0x11A3E: 84,
|
| 2185 |
+
0x11A47: 84,
|
| 2186 |
+
0x11A51: 84,
|
| 2187 |
+
0x11A52: 84,
|
| 2188 |
+
0x11A53: 84,
|
| 2189 |
+
0x11A54: 84,
|
| 2190 |
+
0x11A55: 84,
|
| 2191 |
+
0x11A56: 84,
|
| 2192 |
+
0x11A59: 84,
|
| 2193 |
+
0x11A5A: 84,
|
| 2194 |
+
0x11A5B: 84,
|
| 2195 |
+
0x11A8A: 84,
|
| 2196 |
+
0x11A8B: 84,
|
| 2197 |
+
0x11A8C: 84,
|
| 2198 |
+
0x11A8D: 84,
|
| 2199 |
+
0x11A8E: 84,
|
| 2200 |
+
0x11A8F: 84,
|
| 2201 |
+
0x11A90: 84,
|
| 2202 |
+
0x11A91: 84,
|
| 2203 |
+
0x11A92: 84,
|
| 2204 |
+
0x11A93: 84,
|
| 2205 |
+
0x11A94: 84,
|
| 2206 |
+
0x11A95: 84,
|
| 2207 |
+
0x11A96: 84,
|
| 2208 |
+
0x11A98: 84,
|
| 2209 |
+
0x11A99: 84,
|
| 2210 |
+
0x11C30: 84,
|
| 2211 |
+
0x11C31: 84,
|
| 2212 |
+
0x11C32: 84,
|
| 2213 |
+
0x11C33: 84,
|
| 2214 |
+
0x11C34: 84,
|
| 2215 |
+
0x11C35: 84,
|
| 2216 |
+
0x11C36: 84,
|
| 2217 |
+
0x11C38: 84,
|
| 2218 |
+
0x11C39: 84,
|
| 2219 |
+
0x11C3A: 84,
|
| 2220 |
+
0x11C3B: 84,
|
| 2221 |
+
0x11C3C: 84,
|
| 2222 |
+
0x11C3D: 84,
|
| 2223 |
+
0x11C3F: 84,
|
| 2224 |
+
0x11C92: 84,
|
| 2225 |
+
0x11C93: 84,
|
| 2226 |
+
0x11C94: 84,
|
| 2227 |
+
0x11C95: 84,
|
| 2228 |
+
0x11C96: 84,
|
| 2229 |
+
0x11C97: 84,
|
| 2230 |
+
0x11C98: 84,
|
| 2231 |
+
0x11C99: 84,
|
| 2232 |
+
0x11C9A: 84,
|
| 2233 |
+
0x11C9B: 84,
|
| 2234 |
+
0x11C9C: 84,
|
| 2235 |
+
0x11C9D: 84,
|
| 2236 |
+
0x11C9E: 84,
|
| 2237 |
+
0x11C9F: 84,
|
| 2238 |
+
0x11CA0: 84,
|
| 2239 |
+
0x11CA1: 84,
|
| 2240 |
+
0x11CA2: 84,
|
| 2241 |
+
0x11CA3: 84,
|
| 2242 |
+
0x11CA4: 84,
|
| 2243 |
+
0x11CA5: 84,
|
| 2244 |
+
0x11CA6: 84,
|
| 2245 |
+
0x11CA7: 84,
|
| 2246 |
+
0x11CAA: 84,
|
| 2247 |
+
0x11CAB: 84,
|
| 2248 |
+
0x11CAC: 84,
|
| 2249 |
+
0x11CAD: 84,
|
| 2250 |
+
0x11CAE: 84,
|
| 2251 |
+
0x11CAF: 84,
|
| 2252 |
+
0x11CB0: 84,
|
| 2253 |
+
0x11CB2: 84,
|
| 2254 |
+
0x11CB3: 84,
|
| 2255 |
+
0x11CB5: 84,
|
| 2256 |
+
0x11CB6: 84,
|
| 2257 |
+
0x11D31: 84,
|
| 2258 |
+
0x11D32: 84,
|
| 2259 |
+
0x11D33: 84,
|
| 2260 |
+
0x11D34: 84,
|
| 2261 |
+
0x11D35: 84,
|
| 2262 |
+
0x11D36: 84,
|
| 2263 |
+
0x11D3A: 84,
|
| 2264 |
+
0x11D3C: 84,
|
| 2265 |
+
0x11D3D: 84,
|
| 2266 |
+
0x11D3F: 84,
|
| 2267 |
+
0x11D40: 84,
|
| 2268 |
+
0x11D41: 84,
|
| 2269 |
+
0x11D42: 84,
|
| 2270 |
+
0x11D43: 84,
|
| 2271 |
+
0x11D44: 84,
|
| 2272 |
+
0x11D45: 84,
|
| 2273 |
+
0x11D47: 84,
|
| 2274 |
+
0x11D90: 84,
|
| 2275 |
+
0x11D91: 84,
|
| 2276 |
+
0x11D95: 84,
|
| 2277 |
+
0x11D97: 84,
|
| 2278 |
+
0x11EF3: 84,
|
| 2279 |
+
0x11EF4: 84,
|
| 2280 |
+
0x11F00: 84,
|
| 2281 |
+
0x11F01: 84,
|
| 2282 |
+
0x11F36: 84,
|
| 2283 |
+
0x11F37: 84,
|
| 2284 |
+
0x11F38: 84,
|
| 2285 |
+
0x11F39: 84,
|
| 2286 |
+
0x11F3A: 84,
|
| 2287 |
+
0x11F40: 84,
|
| 2288 |
+
0x11F42: 84,
|
| 2289 |
+
0x11F5A: 84,
|
| 2290 |
+
0x13430: 84,
|
| 2291 |
+
0x13431: 84,
|
| 2292 |
+
0x13432: 84,
|
| 2293 |
+
0x13433: 84,
|
| 2294 |
+
0x13434: 84,
|
| 2295 |
+
0x13435: 84,
|
| 2296 |
+
0x13436: 84,
|
| 2297 |
+
0x13437: 84,
|
| 2298 |
+
0x13438: 84,
|
| 2299 |
+
0x13439: 84,
|
| 2300 |
+
0x1343A: 84,
|
| 2301 |
+
0x1343B: 84,
|
| 2302 |
+
0x1343C: 84,
|
| 2303 |
+
0x1343D: 84,
|
| 2304 |
+
0x1343E: 84,
|
| 2305 |
+
0x1343F: 84,
|
| 2306 |
+
0x13440: 84,
|
| 2307 |
+
0x13447: 84,
|
| 2308 |
+
0x13448: 84,
|
| 2309 |
+
0x13449: 84,
|
| 2310 |
+
0x1344A: 84,
|
| 2311 |
+
0x1344B: 84,
|
| 2312 |
+
0x1344C: 84,
|
| 2313 |
+
0x1344D: 84,
|
| 2314 |
+
0x1344E: 84,
|
| 2315 |
+
0x1344F: 84,
|
| 2316 |
+
0x13450: 84,
|
| 2317 |
+
0x13451: 84,
|
| 2318 |
+
0x13452: 84,
|
| 2319 |
+
0x13453: 84,
|
| 2320 |
+
0x13454: 84,
|
| 2321 |
+
0x13455: 84,
|
| 2322 |
+
0x1611E: 84,
|
| 2323 |
+
0x1611F: 84,
|
| 2324 |
+
0x16120: 84,
|
| 2325 |
+
0x16121: 84,
|
| 2326 |
+
0x16122: 84,
|
| 2327 |
+
0x16123: 84,
|
| 2328 |
+
0x16124: 84,
|
| 2329 |
+
0x16125: 84,
|
| 2330 |
+
0x16126: 84,
|
| 2331 |
+
0x16127: 84,
|
| 2332 |
+
0x16128: 84,
|
| 2333 |
+
0x16129: 84,
|
| 2334 |
+
0x1612D: 84,
|
| 2335 |
+
0x1612E: 84,
|
| 2336 |
+
0x1612F: 84,
|
| 2337 |
+
0x16AF0: 84,
|
| 2338 |
+
0x16AF1: 84,
|
| 2339 |
+
0x16AF2: 84,
|
| 2340 |
+
0x16AF3: 84,
|
| 2341 |
+
0x16AF4: 84,
|
| 2342 |
+
0x16B30: 84,
|
| 2343 |
+
0x16B31: 84,
|
| 2344 |
+
0x16B32: 84,
|
| 2345 |
+
0x16B33: 84,
|
| 2346 |
+
0x16B34: 84,
|
| 2347 |
+
0x16B35: 84,
|
| 2348 |
+
0x16B36: 84,
|
| 2349 |
+
0x16F4F: 84,
|
| 2350 |
+
0x16F8F: 84,
|
| 2351 |
+
0x16F90: 84,
|
| 2352 |
+
0x16F91: 84,
|
| 2353 |
+
0x16F92: 84,
|
| 2354 |
+
0x16FE4: 84,
|
| 2355 |
+
0x1BC9D: 84,
|
| 2356 |
+
0x1BC9E: 84,
|
| 2357 |
+
0x1BCA0: 84,
|
| 2358 |
+
0x1BCA1: 84,
|
| 2359 |
+
0x1BCA2: 84,
|
| 2360 |
+
0x1BCA3: 84,
|
| 2361 |
+
0x1CF00: 84,
|
| 2362 |
+
0x1CF01: 84,
|
| 2363 |
+
0x1CF02: 84,
|
| 2364 |
+
0x1CF03: 84,
|
| 2365 |
+
0x1CF04: 84,
|
| 2366 |
+
0x1CF05: 84,
|
| 2367 |
+
0x1CF06: 84,
|
| 2368 |
+
0x1CF07: 84,
|
| 2369 |
+
0x1CF08: 84,
|
| 2370 |
+
0x1CF09: 84,
|
| 2371 |
+
0x1CF0A: 84,
|
| 2372 |
+
0x1CF0B: 84,
|
| 2373 |
+
0x1CF0C: 84,
|
| 2374 |
+
0x1CF0D: 84,
|
| 2375 |
+
0x1CF0E: 84,
|
| 2376 |
+
0x1CF0F: 84,
|
| 2377 |
+
0x1CF10: 84,
|
| 2378 |
+
0x1CF11: 84,
|
| 2379 |
+
0x1CF12: 84,
|
| 2380 |
+
0x1CF13: 84,
|
| 2381 |
+
0x1CF14: 84,
|
| 2382 |
+
0x1CF15: 84,
|
| 2383 |
+
0x1CF16: 84,
|
| 2384 |
+
0x1CF17: 84,
|
| 2385 |
+
0x1CF18: 84,
|
| 2386 |
+
0x1CF19: 84,
|
| 2387 |
+
0x1CF1A: 84,
|
| 2388 |
+
0x1CF1B: 84,
|
| 2389 |
+
0x1CF1C: 84,
|
| 2390 |
+
0x1CF1D: 84,
|
| 2391 |
+
0x1CF1E: 84,
|
| 2392 |
+
0x1CF1F: 84,
|
| 2393 |
+
0x1CF20: 84,
|
| 2394 |
+
0x1CF21: 84,
|
| 2395 |
+
0x1CF22: 84,
|
| 2396 |
+
0x1CF23: 84,
|
| 2397 |
+
0x1CF24: 84,
|
| 2398 |
+
0x1CF25: 84,
|
| 2399 |
+
0x1CF26: 84,
|
| 2400 |
+
0x1CF27: 84,
|
| 2401 |
+
0x1CF28: 84,
|
| 2402 |
+
0x1CF29: 84,
|
| 2403 |
+
0x1CF2A: 84,
|
| 2404 |
+
0x1CF2B: 84,
|
| 2405 |
+
0x1CF2C: 84,
|
| 2406 |
+
0x1CF2D: 84,
|
| 2407 |
+
0x1CF30: 84,
|
| 2408 |
+
0x1CF31: 84,
|
| 2409 |
+
0x1CF32: 84,
|
| 2410 |
+
0x1CF33: 84,
|
| 2411 |
+
0x1CF34: 84,
|
| 2412 |
+
0x1CF35: 84,
|
| 2413 |
+
0x1CF36: 84,
|
| 2414 |
+
0x1CF37: 84,
|
| 2415 |
+
0x1CF38: 84,
|
| 2416 |
+
0x1CF39: 84,
|
| 2417 |
+
0x1CF3A: 84,
|
| 2418 |
+
0x1CF3B: 84,
|
| 2419 |
+
0x1CF3C: 84,
|
| 2420 |
+
0x1CF3D: 84,
|
| 2421 |
+
0x1CF3E: 84,
|
| 2422 |
+
0x1CF3F: 84,
|
| 2423 |
+
0x1CF40: 84,
|
| 2424 |
+
0x1CF41: 84,
|
| 2425 |
+
0x1CF42: 84,
|
| 2426 |
+
0x1CF43: 84,
|
| 2427 |
+
0x1CF44: 84,
|
| 2428 |
+
0x1CF45: 84,
|
| 2429 |
+
0x1CF46: 84,
|
| 2430 |
+
0x1D167: 84,
|
| 2431 |
+
0x1D168: 84,
|
| 2432 |
+
0x1D169: 84,
|
| 2433 |
+
0x1D173: 84,
|
| 2434 |
+
0x1D174: 84,
|
| 2435 |
+
0x1D175: 84,
|
| 2436 |
+
0x1D176: 84,
|
| 2437 |
+
0x1D177: 84,
|
| 2438 |
+
0x1D178: 84,
|
| 2439 |
+
0x1D179: 84,
|
| 2440 |
+
0x1D17A: 84,
|
| 2441 |
+
0x1D17B: 84,
|
| 2442 |
+
0x1D17C: 84,
|
| 2443 |
+
0x1D17D: 84,
|
| 2444 |
+
0x1D17E: 84,
|
| 2445 |
+
0x1D17F: 84,
|
| 2446 |
+
0x1D180: 84,
|
| 2447 |
+
0x1D181: 84,
|
| 2448 |
+
0x1D182: 84,
|
| 2449 |
+
0x1D185: 84,
|
| 2450 |
+
0x1D186: 84,
|
| 2451 |
+
0x1D187: 84,
|
| 2452 |
+
0x1D188: 84,
|
| 2453 |
+
0x1D189: 84,
|
| 2454 |
+
0x1D18A: 84,
|
| 2455 |
+
0x1D18B: 84,
|
| 2456 |
+
0x1D1AA: 84,
|
| 2457 |
+
0x1D1AB: 84,
|
| 2458 |
+
0x1D1AC: 84,
|
| 2459 |
+
0x1D1AD: 84,
|
| 2460 |
+
0x1D242: 84,
|
| 2461 |
+
0x1D243: 84,
|
| 2462 |
+
0x1D244: 84,
|
| 2463 |
+
0x1DA00: 84,
|
| 2464 |
+
0x1DA01: 84,
|
| 2465 |
+
0x1DA02: 84,
|
| 2466 |
+
0x1DA03: 84,
|
| 2467 |
+
0x1DA04: 84,
|
| 2468 |
+
0x1DA05: 84,
|
| 2469 |
+
0x1DA06: 84,
|
| 2470 |
+
0x1DA07: 84,
|
| 2471 |
+
0x1DA08: 84,
|
| 2472 |
+
0x1DA09: 84,
|
| 2473 |
+
0x1DA0A: 84,
|
| 2474 |
+
0x1DA0B: 84,
|
| 2475 |
+
0x1DA0C: 84,
|
| 2476 |
+
0x1DA0D: 84,
|
| 2477 |
+
0x1DA0E: 84,
|
| 2478 |
+
0x1DA0F: 84,
|
| 2479 |
+
0x1DA10: 84,
|
| 2480 |
+
0x1DA11: 84,
|
| 2481 |
+
0x1DA12: 84,
|
| 2482 |
+
0x1DA13: 84,
|
| 2483 |
+
0x1DA14: 84,
|
| 2484 |
+
0x1DA15: 84,
|
| 2485 |
+
0x1DA16: 84,
|
| 2486 |
+
0x1DA17: 84,
|
| 2487 |
+
0x1DA18: 84,
|
| 2488 |
+
0x1DA19: 84,
|
| 2489 |
+
0x1DA1A: 84,
|
| 2490 |
+
0x1DA1B: 84,
|
| 2491 |
+
0x1DA1C: 84,
|
| 2492 |
+
0x1DA1D: 84,
|
| 2493 |
+
0x1DA1E: 84,
|
| 2494 |
+
0x1DA1F: 84,
|
| 2495 |
+
0x1DA20: 84,
|
| 2496 |
+
0x1DA21: 84,
|
| 2497 |
+
0x1DA22: 84,
|
| 2498 |
+
0x1DA23: 84,
|
| 2499 |
+
0x1DA24: 84,
|
| 2500 |
+
0x1DA25: 84,
|
| 2501 |
+
0x1DA26: 84,
|
| 2502 |
+
0x1DA27: 84,
|
| 2503 |
+
0x1DA28: 84,
|
| 2504 |
+
0x1DA29: 84,
|
| 2505 |
+
0x1DA2A: 84,
|
| 2506 |
+
0x1DA2B: 84,
|
| 2507 |
+
0x1DA2C: 84,
|
| 2508 |
+
0x1DA2D: 84,
|
| 2509 |
+
0x1DA2E: 84,
|
| 2510 |
+
0x1DA2F: 84,
|
| 2511 |
+
0x1DA30: 84,
|
| 2512 |
+
0x1DA31: 84,
|
| 2513 |
+
0x1DA32: 84,
|
| 2514 |
+
0x1DA33: 84,
|
| 2515 |
+
0x1DA34: 84,
|
| 2516 |
+
0x1DA35: 84,
|
| 2517 |
+
0x1DA36: 84,
|
| 2518 |
+
0x1DA3B: 84,
|
| 2519 |
+
0x1DA3C: 84,
|
| 2520 |
+
0x1DA3D: 84,
|
| 2521 |
+
0x1DA3E: 84,
|
| 2522 |
+
0x1DA3F: 84,
|
| 2523 |
+
0x1DA40: 84,
|
| 2524 |
+
0x1DA41: 84,
|
| 2525 |
+
0x1DA42: 84,
|
| 2526 |
+
0x1DA43: 84,
|
| 2527 |
+
0x1DA44: 84,
|
| 2528 |
+
0x1DA45: 84,
|
| 2529 |
+
0x1DA46: 84,
|
| 2530 |
+
0x1DA47: 84,
|
| 2531 |
+
0x1DA48: 84,
|
| 2532 |
+
0x1DA49: 84,
|
| 2533 |
+
0x1DA4A: 84,
|
| 2534 |
+
0x1DA4B: 84,
|
| 2535 |
+
0x1DA4C: 84,
|
| 2536 |
+
0x1DA4D: 84,
|
| 2537 |
+
0x1DA4E: 84,
|
| 2538 |
+
0x1DA4F: 84,
|
| 2539 |
+
0x1DA50: 84,
|
| 2540 |
+
0x1DA51: 84,
|
| 2541 |
+
0x1DA52: 84,
|
| 2542 |
+
0x1DA53: 84,
|
| 2543 |
+
0x1DA54: 84,
|
| 2544 |
+
0x1DA55: 84,
|
| 2545 |
+
0x1DA56: 84,
|
| 2546 |
+
0x1DA57: 84,
|
| 2547 |
+
0x1DA58: 84,
|
| 2548 |
+
0x1DA59: 84,
|
| 2549 |
+
0x1DA5A: 84,
|
| 2550 |
+
0x1DA5B: 84,
|
| 2551 |
+
0x1DA5C: 84,
|
| 2552 |
+
0x1DA5D: 84,
|
| 2553 |
+
0x1DA5E: 84,
|
| 2554 |
+
0x1DA5F: 84,
|
| 2555 |
+
0x1DA60: 84,
|
| 2556 |
+
0x1DA61: 84,
|
| 2557 |
+
0x1DA62: 84,
|
| 2558 |
+
0x1DA63: 84,
|
| 2559 |
+
0x1DA64: 84,
|
| 2560 |
+
0x1DA65: 84,
|
| 2561 |
+
0x1DA66: 84,
|
| 2562 |
+
0x1DA67: 84,
|
| 2563 |
+
0x1DA68: 84,
|
| 2564 |
+
0x1DA69: 84,
|
| 2565 |
+
0x1DA6A: 84,
|
| 2566 |
+
0x1DA6B: 84,
|
| 2567 |
+
0x1DA6C: 84,
|
| 2568 |
+
0x1DA75: 84,
|
| 2569 |
+
0x1DA84: 84,
|
| 2570 |
+
0x1DA9B: 84,
|
| 2571 |
+
0x1DA9C: 84,
|
| 2572 |
+
0x1DA9D: 84,
|
| 2573 |
+
0x1DA9E: 84,
|
| 2574 |
+
0x1DA9F: 84,
|
| 2575 |
+
0x1DAA1: 84,
|
| 2576 |
+
0x1DAA2: 84,
|
| 2577 |
+
0x1DAA3: 84,
|
| 2578 |
+
0x1DAA4: 84,
|
| 2579 |
+
0x1DAA5: 84,
|
| 2580 |
+
0x1DAA6: 84,
|
| 2581 |
+
0x1DAA7: 84,
|
| 2582 |
+
0x1DAA8: 84,
|
| 2583 |
+
0x1DAA9: 84,
|
| 2584 |
+
0x1DAAA: 84,
|
| 2585 |
+
0x1DAAB: 84,
|
| 2586 |
+
0x1DAAC: 84,
|
| 2587 |
+
0x1DAAD: 84,
|
| 2588 |
+
0x1DAAE: 84,
|
| 2589 |
+
0x1DAAF: 84,
|
| 2590 |
+
0x1E000: 84,
|
| 2591 |
+
0x1E001: 84,
|
| 2592 |
+
0x1E002: 84,
|
| 2593 |
+
0x1E003: 84,
|
| 2594 |
+
0x1E004: 84,
|
| 2595 |
+
0x1E005: 84,
|
| 2596 |
+
0x1E006: 84,
|
| 2597 |
+
0x1E008: 84,
|
| 2598 |
+
0x1E009: 84,
|
| 2599 |
+
0x1E00A: 84,
|
| 2600 |
+
0x1E00B: 84,
|
| 2601 |
+
0x1E00C: 84,
|
| 2602 |
+
0x1E00D: 84,
|
| 2603 |
+
0x1E00E: 84,
|
| 2604 |
+
0x1E00F: 84,
|
| 2605 |
+
0x1E010: 84,
|
| 2606 |
+
0x1E011: 84,
|
| 2607 |
+
0x1E012: 84,
|
| 2608 |
+
0x1E013: 84,
|
| 2609 |
+
0x1E014: 84,
|
| 2610 |
+
0x1E015: 84,
|
| 2611 |
+
0x1E016: 84,
|
| 2612 |
+
0x1E017: 84,
|
| 2613 |
+
0x1E018: 84,
|
| 2614 |
+
0x1E01B: 84,
|
| 2615 |
+
0x1E01C: 84,
|
| 2616 |
+
0x1E01D: 84,
|
| 2617 |
+
0x1E01E: 84,
|
| 2618 |
+
0x1E01F: 84,
|
| 2619 |
+
0x1E020: 84,
|
| 2620 |
+
0x1E021: 84,
|
| 2621 |
+
0x1E023: 84,
|
| 2622 |
+
0x1E024: 84,
|
| 2623 |
+
0x1E026: 84,
|
| 2624 |
+
0x1E027: 84,
|
| 2625 |
+
0x1E028: 84,
|
| 2626 |
+
0x1E029: 84,
|
| 2627 |
+
0x1E02A: 84,
|
| 2628 |
+
0x1E08F: 84,
|
| 2629 |
+
0x1E130: 84,
|
| 2630 |
+
0x1E131: 84,
|
| 2631 |
+
0x1E132: 84,
|
| 2632 |
+
0x1E133: 84,
|
| 2633 |
+
0x1E134: 84,
|
| 2634 |
+
0x1E135: 84,
|
| 2635 |
+
0x1E136: 84,
|
| 2636 |
+
0x1E2AE: 84,
|
| 2637 |
+
0x1E2EC: 84,
|
| 2638 |
+
0x1E2ED: 84,
|
| 2639 |
+
0x1E2EE: 84,
|
| 2640 |
+
0x1E2EF: 84,
|
| 2641 |
+
0x1E4EC: 84,
|
| 2642 |
+
0x1E4ED: 84,
|
| 2643 |
+
0x1E4EE: 84,
|
| 2644 |
+
0x1E4EF: 84,
|
| 2645 |
+
0x1E5EE: 84,
|
| 2646 |
+
0x1E5EF: 84,
|
| 2647 |
+
0x1E8D0: 84,
|
| 2648 |
+
0x1E8D1: 84,
|
| 2649 |
+
0x1E8D2: 84,
|
| 2650 |
+
0x1E8D3: 84,
|
| 2651 |
+
0x1E8D4: 84,
|
| 2652 |
+
0x1E8D5: 84,
|
| 2653 |
+
0x1E8D6: 84,
|
| 2654 |
+
0x1E900: 68,
|
| 2655 |
+
0x1E901: 68,
|
| 2656 |
+
0x1E902: 68,
|
| 2657 |
+
0x1E903: 68,
|
| 2658 |
+
0x1E904: 68,
|
| 2659 |
+
0x1E905: 68,
|
| 2660 |
+
0x1E906: 68,
|
| 2661 |
+
0x1E907: 68,
|
| 2662 |
+
0x1E908: 68,
|
| 2663 |
+
0x1E909: 68,
|
| 2664 |
+
0x1E90A: 68,
|
| 2665 |
+
0x1E90B: 68,
|
| 2666 |
+
0x1E90C: 68,
|
| 2667 |
+
0x1E90D: 68,
|
| 2668 |
+
0x1E90E: 68,
|
| 2669 |
+
0x1E90F: 68,
|
| 2670 |
+
0x1E910: 68,
|
| 2671 |
+
0x1E911: 68,
|
| 2672 |
+
0x1E912: 68,
|
| 2673 |
+
0x1E913: 68,
|
| 2674 |
+
0x1E914: 68,
|
| 2675 |
+
0x1E915: 68,
|
| 2676 |
+
0x1E916: 68,
|
| 2677 |
+
0x1E917: 68,
|
| 2678 |
+
0x1E918: 68,
|
| 2679 |
+
0x1E919: 68,
|
| 2680 |
+
0x1E91A: 68,
|
| 2681 |
+
0x1E91B: 68,
|
| 2682 |
+
0x1E91C: 68,
|
| 2683 |
+
0x1E91D: 68,
|
| 2684 |
+
0x1E91E: 68,
|
| 2685 |
+
0x1E91F: 68,
|
| 2686 |
+
0x1E920: 68,
|
| 2687 |
+
0x1E921: 68,
|
| 2688 |
+
0x1E922: 68,
|
| 2689 |
+
0x1E923: 68,
|
| 2690 |
+
0x1E924: 68,
|
| 2691 |
+
0x1E925: 68,
|
| 2692 |
+
0x1E926: 68,
|
| 2693 |
+
0x1E927: 68,
|
| 2694 |
+
0x1E928: 68,
|
| 2695 |
+
0x1E929: 68,
|
| 2696 |
+
0x1E92A: 68,
|
| 2697 |
+
0x1E92B: 68,
|
| 2698 |
+
0x1E92C: 68,
|
| 2699 |
+
0x1E92D: 68,
|
| 2700 |
+
0x1E92E: 68,
|
| 2701 |
+
0x1E92F: 68,
|
| 2702 |
+
0x1E930: 68,
|
| 2703 |
+
0x1E931: 68,
|
| 2704 |
+
0x1E932: 68,
|
| 2705 |
+
0x1E933: 68,
|
| 2706 |
+
0x1E934: 68,
|
| 2707 |
+
0x1E935: 68,
|
| 2708 |
+
0x1E936: 68,
|
| 2709 |
+
0x1E937: 68,
|
| 2710 |
+
0x1E938: 68,
|
| 2711 |
+
0x1E939: 68,
|
| 2712 |
+
0x1E93A: 68,
|
| 2713 |
+
0x1E93B: 68,
|
| 2714 |
+
0x1E93C: 68,
|
| 2715 |
+
0x1E93D: 68,
|
| 2716 |
+
0x1E93E: 68,
|
| 2717 |
+
0x1E93F: 68,
|
| 2718 |
+
0x1E940: 68,
|
| 2719 |
+
0x1E941: 68,
|
| 2720 |
+
0x1E942: 68,
|
| 2721 |
+
0x1E943: 68,
|
| 2722 |
+
0x1E944: 84,
|
| 2723 |
+
0x1E945: 84,
|
| 2724 |
+
0x1E946: 84,
|
| 2725 |
+
0x1E947: 84,
|
| 2726 |
+
0x1E948: 84,
|
| 2727 |
+
0x1E949: 84,
|
| 2728 |
+
0x1E94A: 84,
|
| 2729 |
+
0x1E94B: 84,
|
| 2730 |
+
0xE0001: 84,
|
| 2731 |
+
0xE0020: 84,
|
| 2732 |
+
0xE0021: 84,
|
| 2733 |
+
0xE0022: 84,
|
| 2734 |
+
0xE0023: 84,
|
| 2735 |
+
0xE0024: 84,
|
| 2736 |
+
0xE0025: 84,
|
| 2737 |
+
0xE0026: 84,
|
| 2738 |
+
0xE0027: 84,
|
| 2739 |
+
0xE0028: 84,
|
| 2740 |
+
0xE0029: 84,
|
| 2741 |
+
0xE002A: 84,
|
| 2742 |
+
0xE002B: 84,
|
| 2743 |
+
0xE002C: 84,
|
| 2744 |
+
0xE002D: 84,
|
| 2745 |
+
0xE002E: 84,
|
| 2746 |
+
0xE002F: 84,
|
| 2747 |
+
0xE0030: 84,
|
| 2748 |
+
0xE0031: 84,
|
| 2749 |
+
0xE0032: 84,
|
| 2750 |
+
0xE0033: 84,
|
| 2751 |
+
0xE0034: 84,
|
| 2752 |
+
0xE0035: 84,
|
| 2753 |
+
0xE0036: 84,
|
| 2754 |
+
0xE0037: 84,
|
| 2755 |
+
0xE0038: 84,
|
| 2756 |
+
0xE0039: 84,
|
| 2757 |
+
0xE003A: 84,
|
| 2758 |
+
0xE003B: 84,
|
| 2759 |
+
0xE003C: 84,
|
| 2760 |
+
0xE003D: 84,
|
| 2761 |
+
0xE003E: 84,
|
| 2762 |
+
0xE003F: 84,
|
| 2763 |
+
0xE0040: 84,
|
| 2764 |
+
0xE0041: 84,
|
| 2765 |
+
0xE0042: 84,
|
| 2766 |
+
0xE0043: 84,
|
| 2767 |
+
0xE0044: 84,
|
| 2768 |
+
0xE0045: 84,
|
| 2769 |
+
0xE0046: 84,
|
| 2770 |
+
0xE0047: 84,
|
| 2771 |
+
0xE0048: 84,
|
| 2772 |
+
0xE0049: 84,
|
| 2773 |
+
0xE004A: 84,
|
| 2774 |
+
0xE004B: 84,
|
| 2775 |
+
0xE004C: 84,
|
| 2776 |
+
0xE004D: 84,
|
| 2777 |
+
0xE004E: 84,
|
| 2778 |
+
0xE004F: 84,
|
| 2779 |
+
0xE0050: 84,
|
| 2780 |
+
0xE0051: 84,
|
| 2781 |
+
0xE0052: 84,
|
| 2782 |
+
0xE0053: 84,
|
| 2783 |
+
0xE0054: 84,
|
| 2784 |
+
0xE0055: 84,
|
| 2785 |
+
0xE0056: 84,
|
| 2786 |
+
0xE0057: 84,
|
| 2787 |
+
0xE0058: 84,
|
| 2788 |
+
0xE0059: 84,
|
| 2789 |
+
0xE005A: 84,
|
| 2790 |
+
0xE005B: 84,
|
| 2791 |
+
0xE005C: 84,
|
| 2792 |
+
0xE005D: 84,
|
| 2793 |
+
0xE005E: 84,
|
| 2794 |
+
0xE005F: 84,
|
| 2795 |
+
0xE0060: 84,
|
| 2796 |
+
0xE0061: 84,
|
| 2797 |
+
0xE0062: 84,
|
| 2798 |
+
0xE0063: 84,
|
| 2799 |
+
0xE0064: 84,
|
| 2800 |
+
0xE0065: 84,
|
| 2801 |
+
0xE0066: 84,
|
| 2802 |
+
0xE0067: 84,
|
| 2803 |
+
0xE0068: 84,
|
| 2804 |
+
0xE0069: 84,
|
| 2805 |
+
0xE006A: 84,
|
| 2806 |
+
0xE006B: 84,
|
| 2807 |
+
0xE006C: 84,
|
| 2808 |
+
0xE006D: 84,
|
| 2809 |
+
0xE006E: 84,
|
| 2810 |
+
0xE006F: 84,
|
| 2811 |
+
0xE0070: 84,
|
| 2812 |
+
0xE0071: 84,
|
| 2813 |
+
0xE0072: 84,
|
| 2814 |
+
0xE0073: 84,
|
| 2815 |
+
0xE0074: 84,
|
| 2816 |
+
0xE0075: 84,
|
| 2817 |
+
0xE0076: 84,
|
| 2818 |
+
0xE0077: 84,
|
| 2819 |
+
0xE0078: 84,
|
| 2820 |
+
0xE0079: 84,
|
| 2821 |
+
0xE007A: 84,
|
| 2822 |
+
0xE007B: 84,
|
| 2823 |
+
0xE007C: 84,
|
| 2824 |
+
0xE007D: 84,
|
| 2825 |
+
0xE007E: 84,
|
| 2826 |
+
0xE007F: 84,
|
| 2827 |
+
0xE0100: 84,
|
| 2828 |
+
0xE0101: 84,
|
| 2829 |
+
0xE0102: 84,
|
| 2830 |
+
0xE0103: 84,
|
| 2831 |
+
0xE0104: 84,
|
| 2832 |
+
0xE0105: 84,
|
| 2833 |
+
0xE0106: 84,
|
| 2834 |
+
0xE0107: 84,
|
| 2835 |
+
0xE0108: 84,
|
| 2836 |
+
0xE0109: 84,
|
| 2837 |
+
0xE010A: 84,
|
| 2838 |
+
0xE010B: 84,
|
| 2839 |
+
0xE010C: 84,
|
| 2840 |
+
0xE010D: 84,
|
| 2841 |
+
0xE010E: 84,
|
| 2842 |
+
0xE010F: 84,
|
| 2843 |
+
0xE0110: 84,
|
| 2844 |
+
0xE0111: 84,
|
| 2845 |
+
0xE0112: 84,
|
| 2846 |
+
0xE0113: 84,
|
| 2847 |
+
0xE0114: 84,
|
| 2848 |
+
0xE0115: 84,
|
| 2849 |
+
0xE0116: 84,
|
| 2850 |
+
0xE0117: 84,
|
| 2851 |
+
0xE0118: 84,
|
| 2852 |
+
0xE0119: 84,
|
| 2853 |
+
0xE011A: 84,
|
| 2854 |
+
0xE011B: 84,
|
| 2855 |
+
0xE011C: 84,
|
| 2856 |
+
0xE011D: 84,
|
| 2857 |
+
0xE011E: 84,
|
| 2858 |
+
0xE011F: 84,
|
| 2859 |
+
0xE0120: 84,
|
| 2860 |
+
0xE0121: 84,
|
| 2861 |
+
0xE0122: 84,
|
| 2862 |
+
0xE0123: 84,
|
| 2863 |
+
0xE0124: 84,
|
| 2864 |
+
0xE0125: 84,
|
| 2865 |
+
0xE0126: 84,
|
| 2866 |
+
0xE0127: 84,
|
| 2867 |
+
0xE0128: 84,
|
| 2868 |
+
0xE0129: 84,
|
| 2869 |
+
0xE012A: 84,
|
| 2870 |
+
0xE012B: 84,
|
| 2871 |
+
0xE012C: 84,
|
| 2872 |
+
0xE012D: 84,
|
| 2873 |
+
0xE012E: 84,
|
| 2874 |
+
0xE012F: 84,
|
| 2875 |
+
0xE0130: 84,
|
| 2876 |
+
0xE0131: 84,
|
| 2877 |
+
0xE0132: 84,
|
| 2878 |
+
0xE0133: 84,
|
| 2879 |
+
0xE0134: 84,
|
| 2880 |
+
0xE0135: 84,
|
| 2881 |
+
0xE0136: 84,
|
| 2882 |
+
0xE0137: 84,
|
| 2883 |
+
0xE0138: 84,
|
| 2884 |
+
0xE0139: 84,
|
| 2885 |
+
0xE013A: 84,
|
| 2886 |
+
0xE013B: 84,
|
| 2887 |
+
0xE013C: 84,
|
| 2888 |
+
0xE013D: 84,
|
| 2889 |
+
0xE013E: 84,
|
| 2890 |
+
0xE013F: 84,
|
| 2891 |
+
0xE0140: 84,
|
| 2892 |
+
0xE0141: 84,
|
| 2893 |
+
0xE0142: 84,
|
| 2894 |
+
0xE0143: 84,
|
| 2895 |
+
0xE0144: 84,
|
| 2896 |
+
0xE0145: 84,
|
| 2897 |
+
0xE0146: 84,
|
| 2898 |
+
0xE0147: 84,
|
| 2899 |
+
0xE0148: 84,
|
| 2900 |
+
0xE0149: 84,
|
| 2901 |
+
0xE014A: 84,
|
| 2902 |
+
0xE014B: 84,
|
| 2903 |
+
0xE014C: 84,
|
| 2904 |
+
0xE014D: 84,
|
| 2905 |
+
0xE014E: 84,
|
| 2906 |
+
0xE014F: 84,
|
| 2907 |
+
0xE0150: 84,
|
| 2908 |
+
0xE0151: 84,
|
| 2909 |
+
0xE0152: 84,
|
| 2910 |
+
0xE0153: 84,
|
| 2911 |
+
0xE0154: 84,
|
| 2912 |
+
0xE0155: 84,
|
| 2913 |
+
0xE0156: 84,
|
| 2914 |
+
0xE0157: 84,
|
| 2915 |
+
0xE0158: 84,
|
| 2916 |
+
0xE0159: 84,
|
| 2917 |
+
0xE015A: 84,
|
| 2918 |
+
0xE015B: 84,
|
| 2919 |
+
0xE015C: 84,
|
| 2920 |
+
0xE015D: 84,
|
| 2921 |
+
0xE015E: 84,
|
| 2922 |
+
0xE015F: 84,
|
| 2923 |
+
0xE0160: 84,
|
| 2924 |
+
0xE0161: 84,
|
| 2925 |
+
0xE0162: 84,
|
| 2926 |
+
0xE0163: 84,
|
| 2927 |
+
0xE0164: 84,
|
| 2928 |
+
0xE0165: 84,
|
| 2929 |
+
0xE0166: 84,
|
| 2930 |
+
0xE0167: 84,
|
| 2931 |
+
0xE0168: 84,
|
| 2932 |
+
0xE0169: 84,
|
| 2933 |
+
0xE016A: 84,
|
| 2934 |
+
0xE016B: 84,
|
| 2935 |
+
0xE016C: 84,
|
| 2936 |
+
0xE016D: 84,
|
| 2937 |
+
0xE016E: 84,
|
| 2938 |
+
0xE016F: 84,
|
| 2939 |
+
0xE0170: 84,
|
| 2940 |
+
0xE0171: 84,
|
| 2941 |
+
0xE0172: 84,
|
| 2942 |
+
0xE0173: 84,
|
| 2943 |
+
0xE0174: 84,
|
| 2944 |
+
0xE0175: 84,
|
| 2945 |
+
0xE0176: 84,
|
| 2946 |
+
0xE0177: 84,
|
| 2947 |
+
0xE0178: 84,
|
| 2948 |
+
0xE0179: 84,
|
| 2949 |
+
0xE017A: 84,
|
| 2950 |
+
0xE017B: 84,
|
| 2951 |
+
0xE017C: 84,
|
| 2952 |
+
0xE017D: 84,
|
| 2953 |
+
0xE017E: 84,
|
| 2954 |
+
0xE017F: 84,
|
| 2955 |
+
0xE0180: 84,
|
| 2956 |
+
0xE0181: 84,
|
| 2957 |
+
0xE0182: 84,
|
| 2958 |
+
0xE0183: 84,
|
| 2959 |
+
0xE0184: 84,
|
| 2960 |
+
0xE0185: 84,
|
| 2961 |
+
0xE0186: 84,
|
| 2962 |
+
0xE0187: 84,
|
| 2963 |
+
0xE0188: 84,
|
| 2964 |
+
0xE0189: 84,
|
| 2965 |
+
0xE018A: 84,
|
| 2966 |
+
0xE018B: 84,
|
| 2967 |
+
0xE018C: 84,
|
| 2968 |
+
0xE018D: 84,
|
| 2969 |
+
0xE018E: 84,
|
| 2970 |
+
0xE018F: 84,
|
| 2971 |
+
0xE0190: 84,
|
| 2972 |
+
0xE0191: 84,
|
| 2973 |
+
0xE0192: 84,
|
| 2974 |
+
0xE0193: 84,
|
| 2975 |
+
0xE0194: 84,
|
| 2976 |
+
0xE0195: 84,
|
| 2977 |
+
0xE0196: 84,
|
| 2978 |
+
0xE0197: 84,
|
| 2979 |
+
0xE0198: 84,
|
| 2980 |
+
0xE0199: 84,
|
| 2981 |
+
0xE019A: 84,
|
| 2982 |
+
0xE019B: 84,
|
| 2983 |
+
0xE019C: 84,
|
| 2984 |
+
0xE019D: 84,
|
| 2985 |
+
0xE019E: 84,
|
| 2986 |
+
0xE019F: 84,
|
| 2987 |
+
0xE01A0: 84,
|
| 2988 |
+
0xE01A1: 84,
|
| 2989 |
+
0xE01A2: 84,
|
| 2990 |
+
0xE01A3: 84,
|
| 2991 |
+
0xE01A4: 84,
|
| 2992 |
+
0xE01A5: 84,
|
| 2993 |
+
0xE01A6: 84,
|
| 2994 |
+
0xE01A7: 84,
|
| 2995 |
+
0xE01A8: 84,
|
| 2996 |
+
0xE01A9: 84,
|
| 2997 |
+
0xE01AA: 84,
|
| 2998 |
+
0xE01AB: 84,
|
| 2999 |
+
0xE01AC: 84,
|
| 3000 |
+
0xE01AD: 84,
|
| 3001 |
+
0xE01AE: 84,
|
| 3002 |
+
0xE01AF: 84,
|
| 3003 |
+
0xE01B0: 84,
|
| 3004 |
+
0xE01B1: 84,
|
| 3005 |
+
0xE01B2: 84,
|
| 3006 |
+
0xE01B3: 84,
|
| 3007 |
+
0xE01B4: 84,
|
| 3008 |
+
0xE01B5: 84,
|
| 3009 |
+
0xE01B6: 84,
|
| 3010 |
+
0xE01B7: 84,
|
| 3011 |
+
0xE01B8: 84,
|
| 3012 |
+
0xE01B9: 84,
|
| 3013 |
+
0xE01BA: 84,
|
| 3014 |
+
0xE01BB: 84,
|
| 3015 |
+
0xE01BC: 84,
|
| 3016 |
+
0xE01BD: 84,
|
| 3017 |
+
0xE01BE: 84,
|
| 3018 |
+
0xE01BF: 84,
|
| 3019 |
+
0xE01C0: 84,
|
| 3020 |
+
0xE01C1: 84,
|
| 3021 |
+
0xE01C2: 84,
|
| 3022 |
+
0xE01C3: 84,
|
| 3023 |
+
0xE01C4: 84,
|
| 3024 |
+
0xE01C5: 84,
|
| 3025 |
+
0xE01C6: 84,
|
| 3026 |
+
0xE01C7: 84,
|
| 3027 |
+
0xE01C8: 84,
|
| 3028 |
+
0xE01C9: 84,
|
| 3029 |
+
0xE01CA: 84,
|
| 3030 |
+
0xE01CB: 84,
|
| 3031 |
+
0xE01CC: 84,
|
| 3032 |
+
0xE01CD: 84,
|
| 3033 |
+
0xE01CE: 84,
|
| 3034 |
+
0xE01CF: 84,
|
| 3035 |
+
0xE01D0: 84,
|
| 3036 |
+
0xE01D1: 84,
|
| 3037 |
+
0xE01D2: 84,
|
| 3038 |
+
0xE01D3: 84,
|
| 3039 |
+
0xE01D4: 84,
|
| 3040 |
+
0xE01D5: 84,
|
| 3041 |
+
0xE01D6: 84,
|
| 3042 |
+
0xE01D7: 84,
|
| 3043 |
+
0xE01D8: 84,
|
| 3044 |
+
0xE01D9: 84,
|
| 3045 |
+
0xE01DA: 84,
|
| 3046 |
+
0xE01DB: 84,
|
| 3047 |
+
0xE01DC: 84,
|
| 3048 |
+
0xE01DD: 84,
|
| 3049 |
+
0xE01DE: 84,
|
| 3050 |
+
0xE01DF: 84,
|
| 3051 |
+
0xE01E0: 84,
|
| 3052 |
+
0xE01E1: 84,
|
| 3053 |
+
0xE01E2: 84,
|
| 3054 |
+
0xE01E3: 84,
|
| 3055 |
+
0xE01E4: 84,
|
| 3056 |
+
0xE01E5: 84,
|
| 3057 |
+
0xE01E6: 84,
|
| 3058 |
+
0xE01E7: 84,
|
| 3059 |
+
0xE01E8: 84,
|
| 3060 |
+
0xE01E9: 84,
|
| 3061 |
+
0xE01EA: 84,
|
| 3062 |
+
0xE01EB: 84,
|
| 3063 |
+
0xE01EC: 84,
|
| 3064 |
+
0xE01ED: 84,
|
| 3065 |
+
0xE01EE: 84,
|
| 3066 |
+
0xE01EF: 84,
|
| 3067 |
+
}
|
| 3068 |
+
codepoint_classes = {
|
| 3069 |
+
"PVALID": (
|
| 3070 |
+
0x2D0000002E,
|
| 3071 |
+
0x300000003A,
|
| 3072 |
+
0x610000007B,
|
| 3073 |
+
0xDF000000F7,
|
| 3074 |
+
0xF800000100,
|
| 3075 |
+
0x10100000102,
|
| 3076 |
+
0x10300000104,
|
| 3077 |
+
0x10500000106,
|
| 3078 |
+
0x10700000108,
|
| 3079 |
+
0x1090000010A,
|
| 3080 |
+
0x10B0000010C,
|
| 3081 |
+
0x10D0000010E,
|
| 3082 |
+
0x10F00000110,
|
| 3083 |
+
0x11100000112,
|
| 3084 |
+
0x11300000114,
|
| 3085 |
+
0x11500000116,
|
| 3086 |
+
0x11700000118,
|
| 3087 |
+
0x1190000011A,
|
| 3088 |
+
0x11B0000011C,
|
| 3089 |
+
0x11D0000011E,
|
| 3090 |
+
0x11F00000120,
|
| 3091 |
+
0x12100000122,
|
| 3092 |
+
0x12300000124,
|
| 3093 |
+
0x12500000126,
|
| 3094 |
+
0x12700000128,
|
| 3095 |
+
0x1290000012A,
|
| 3096 |
+
0x12B0000012C,
|
| 3097 |
+
0x12D0000012E,
|
| 3098 |
+
0x12F00000130,
|
| 3099 |
+
0x13100000132,
|
| 3100 |
+
0x13500000136,
|
| 3101 |
+
0x13700000139,
|
| 3102 |
+
0x13A0000013B,
|
| 3103 |
+
0x13C0000013D,
|
| 3104 |
+
0x13E0000013F,
|
| 3105 |
+
0x14200000143,
|
| 3106 |
+
0x14400000145,
|
| 3107 |
+
0x14600000147,
|
| 3108 |
+
0x14800000149,
|
| 3109 |
+
0x14B0000014C,
|
| 3110 |
+
0x14D0000014E,
|
| 3111 |
+
0x14F00000150,
|
| 3112 |
+
0x15100000152,
|
| 3113 |
+
0x15300000154,
|
| 3114 |
+
0x15500000156,
|
| 3115 |
+
0x15700000158,
|
| 3116 |
+
0x1590000015A,
|
| 3117 |
+
0x15B0000015C,
|
| 3118 |
+
0x15D0000015E,
|
| 3119 |
+
0x15F00000160,
|
| 3120 |
+
0x16100000162,
|
| 3121 |
+
0x16300000164,
|
| 3122 |
+
0x16500000166,
|
| 3123 |
+
0x16700000168,
|
| 3124 |
+
0x1690000016A,
|
| 3125 |
+
0x16B0000016C,
|
| 3126 |
+
0x16D0000016E,
|
| 3127 |
+
0x16F00000170,
|
| 3128 |
+
0x17100000172,
|
| 3129 |
+
0x17300000174,
|
| 3130 |
+
0x17500000176,
|
| 3131 |
+
0x17700000178,
|
| 3132 |
+
0x17A0000017B,
|
| 3133 |
+
0x17C0000017D,
|
| 3134 |
+
0x17E0000017F,
|
| 3135 |
+
0x18000000181,
|
| 3136 |
+
0x18300000184,
|
| 3137 |
+
0x18500000186,
|
| 3138 |
+
0x18800000189,
|
| 3139 |
+
0x18C0000018E,
|
| 3140 |
+
0x19200000193,
|
| 3141 |
+
0x19500000196,
|
| 3142 |
+
0x1990000019C,
|
| 3143 |
+
0x19E0000019F,
|
| 3144 |
+
0x1A1000001A2,
|
| 3145 |
+
0x1A3000001A4,
|
| 3146 |
+
0x1A5000001A6,
|
| 3147 |
+
0x1A8000001A9,
|
| 3148 |
+
0x1AA000001AC,
|
| 3149 |
+
0x1AD000001AE,
|
| 3150 |
+
0x1B0000001B1,
|
| 3151 |
+
0x1B4000001B5,
|
| 3152 |
+
0x1B6000001B7,
|
| 3153 |
+
0x1B9000001BC,
|
| 3154 |
+
0x1BD000001C4,
|
| 3155 |
+
0x1CE000001CF,
|
| 3156 |
+
0x1D0000001D1,
|
| 3157 |
+
0x1D2000001D3,
|
| 3158 |
+
0x1D4000001D5,
|
| 3159 |
+
0x1D6000001D7,
|
| 3160 |
+
0x1D8000001D9,
|
| 3161 |
+
0x1DA000001DB,
|
| 3162 |
+
0x1DC000001DE,
|
| 3163 |
+
0x1DF000001E0,
|
| 3164 |
+
0x1E1000001E2,
|
| 3165 |
+
0x1E3000001E4,
|
| 3166 |
+
0x1E5000001E6,
|
| 3167 |
+
0x1E7000001E8,
|
| 3168 |
+
0x1E9000001EA,
|
| 3169 |
+
0x1EB000001EC,
|
| 3170 |
+
0x1ED000001EE,
|
| 3171 |
+
0x1EF000001F1,
|
| 3172 |
+
0x1F5000001F6,
|
| 3173 |
+
0x1F9000001FA,
|
| 3174 |
+
0x1FB000001FC,
|
| 3175 |
+
0x1FD000001FE,
|
| 3176 |
+
0x1FF00000200,
|
| 3177 |
+
0x20100000202,
|
| 3178 |
+
0x20300000204,
|
| 3179 |
+
0x20500000206,
|
| 3180 |
+
0x20700000208,
|
| 3181 |
+
0x2090000020A,
|
| 3182 |
+
0x20B0000020C,
|
| 3183 |
+
0x20D0000020E,
|
| 3184 |
+
0x20F00000210,
|
| 3185 |
+
0x21100000212,
|
| 3186 |
+
0x21300000214,
|
| 3187 |
+
0x21500000216,
|
| 3188 |
+
0x21700000218,
|
| 3189 |
+
0x2190000021A,
|
| 3190 |
+
0x21B0000021C,
|
| 3191 |
+
0x21D0000021E,
|
| 3192 |
+
0x21F00000220,
|
| 3193 |
+
0x22100000222,
|
| 3194 |
+
0x22300000224,
|
| 3195 |
+
0x22500000226,
|
| 3196 |
+
0x22700000228,
|
| 3197 |
+
0x2290000022A,
|
| 3198 |
+
0x22B0000022C,
|
| 3199 |
+
0x22D0000022E,
|
| 3200 |
+
0x22F00000230,
|
| 3201 |
+
0x23100000232,
|
| 3202 |
+
0x2330000023A,
|
| 3203 |
+
0x23C0000023D,
|
| 3204 |
+
0x23F00000241,
|
| 3205 |
+
0x24200000243,
|
| 3206 |
+
0x24700000248,
|
| 3207 |
+
0x2490000024A,
|
| 3208 |
+
0x24B0000024C,
|
| 3209 |
+
0x24D0000024E,
|
| 3210 |
+
0x24F000002B0,
|
| 3211 |
+
0x2B9000002C2,
|
| 3212 |
+
0x2C6000002D2,
|
| 3213 |
+
0x2EC000002ED,
|
| 3214 |
+
0x2EE000002EF,
|
| 3215 |
+
0x30000000340,
|
| 3216 |
+
0x34200000343,
|
| 3217 |
+
0x3460000034F,
|
| 3218 |
+
0x35000000370,
|
| 3219 |
+
0x37100000372,
|
| 3220 |
+
0x37300000374,
|
| 3221 |
+
0x37700000378,
|
| 3222 |
+
0x37B0000037E,
|
| 3223 |
+
0x39000000391,
|
| 3224 |
+
0x3AC000003CF,
|
| 3225 |
+
0x3D7000003D8,
|
| 3226 |
+
0x3D9000003DA,
|
| 3227 |
+
0x3DB000003DC,
|
| 3228 |
+
0x3DD000003DE,
|
| 3229 |
+
0x3DF000003E0,
|
| 3230 |
+
0x3E1000003E2,
|
| 3231 |
+
0x3E3000003E4,
|
| 3232 |
+
0x3E5000003E6,
|
| 3233 |
+
0x3E7000003E8,
|
| 3234 |
+
0x3E9000003EA,
|
| 3235 |
+
0x3EB000003EC,
|
| 3236 |
+
0x3ED000003EE,
|
| 3237 |
+
0x3EF000003F0,
|
| 3238 |
+
0x3F3000003F4,
|
| 3239 |
+
0x3F8000003F9,
|
| 3240 |
+
0x3FB000003FD,
|
| 3241 |
+
0x43000000460,
|
| 3242 |
+
0x46100000462,
|
| 3243 |
+
0x46300000464,
|
| 3244 |
+
0x46500000466,
|
| 3245 |
+
0x46700000468,
|
| 3246 |
+
0x4690000046A,
|
| 3247 |
+
0x46B0000046C,
|
| 3248 |
+
0x46D0000046E,
|
| 3249 |
+
0x46F00000470,
|
| 3250 |
+
0x47100000472,
|
| 3251 |
+
0x47300000474,
|
| 3252 |
+
0x47500000476,
|
| 3253 |
+
0x47700000478,
|
| 3254 |
+
0x4790000047A,
|
| 3255 |
+
0x47B0000047C,
|
| 3256 |
+
0x47D0000047E,
|
| 3257 |
+
0x47F00000480,
|
| 3258 |
+
0x48100000482,
|
| 3259 |
+
0x48300000488,
|
| 3260 |
+
0x48B0000048C,
|
| 3261 |
+
0x48D0000048E,
|
| 3262 |
+
0x48F00000490,
|
| 3263 |
+
0x49100000492,
|
| 3264 |
+
0x49300000494,
|
| 3265 |
+
0x49500000496,
|
| 3266 |
+
0x49700000498,
|
| 3267 |
+
0x4990000049A,
|
| 3268 |
+
0x49B0000049C,
|
| 3269 |
+
0x49D0000049E,
|
| 3270 |
+
0x49F000004A0,
|
| 3271 |
+
0x4A1000004A2,
|
| 3272 |
+
0x4A3000004A4,
|
| 3273 |
+
0x4A5000004A6,
|
| 3274 |
+
0x4A7000004A8,
|
| 3275 |
+
0x4A9000004AA,
|
| 3276 |
+
0x4AB000004AC,
|
| 3277 |
+
0x4AD000004AE,
|
| 3278 |
+
0x4AF000004B0,
|
| 3279 |
+
0x4B1000004B2,
|
| 3280 |
+
0x4B3000004B4,
|
| 3281 |
+
0x4B5000004B6,
|
| 3282 |
+
0x4B7000004B8,
|
| 3283 |
+
0x4B9000004BA,
|
| 3284 |
+
0x4BB000004BC,
|
| 3285 |
+
0x4BD000004BE,
|
| 3286 |
+
0x4BF000004C0,
|
| 3287 |
+
0x4C2000004C3,
|
| 3288 |
+
0x4C4000004C5,
|
| 3289 |
+
0x4C6000004C7,
|
| 3290 |
+
0x4C8000004C9,
|
| 3291 |
+
0x4CA000004CB,
|
| 3292 |
+
0x4CC000004CD,
|
| 3293 |
+
0x4CE000004D0,
|
| 3294 |
+
0x4D1000004D2,
|
| 3295 |
+
0x4D3000004D4,
|
| 3296 |
+
0x4D5000004D6,
|
| 3297 |
+
0x4D7000004D8,
|
| 3298 |
+
0x4D9000004DA,
|
| 3299 |
+
0x4DB000004DC,
|
| 3300 |
+
0x4DD000004DE,
|
| 3301 |
+
0x4DF000004E0,
|
| 3302 |
+
0x4E1000004E2,
|
| 3303 |
+
0x4E3000004E4,
|
| 3304 |
+
0x4E5000004E6,
|
| 3305 |
+
0x4E7000004E8,
|
| 3306 |
+
0x4E9000004EA,
|
| 3307 |
+
0x4EB000004EC,
|
| 3308 |
+
0x4ED000004EE,
|
| 3309 |
+
0x4EF000004F0,
|
| 3310 |
+
0x4F1000004F2,
|
| 3311 |
+
0x4F3000004F4,
|
| 3312 |
+
0x4F5000004F6,
|
| 3313 |
+
0x4F7000004F8,
|
| 3314 |
+
0x4F9000004FA,
|
| 3315 |
+
0x4FB000004FC,
|
| 3316 |
+
0x4FD000004FE,
|
| 3317 |
+
0x4FF00000500,
|
| 3318 |
+
0x50100000502,
|
| 3319 |
+
0x50300000504,
|
| 3320 |
+
0x50500000506,
|
| 3321 |
+
0x50700000508,
|
| 3322 |
+
0x5090000050A,
|
| 3323 |
+
0x50B0000050C,
|
| 3324 |
+
0x50D0000050E,
|
| 3325 |
+
0x50F00000510,
|
| 3326 |
+
0x51100000512,
|
| 3327 |
+
0x51300000514,
|
| 3328 |
+
0x51500000516,
|
| 3329 |
+
0x51700000518,
|
| 3330 |
+
0x5190000051A,
|
| 3331 |
+
0x51B0000051C,
|
| 3332 |
+
0x51D0000051E,
|
| 3333 |
+
0x51F00000520,
|
| 3334 |
+
0x52100000522,
|
| 3335 |
+
0x52300000524,
|
| 3336 |
+
0x52500000526,
|
| 3337 |
+
0x52700000528,
|
| 3338 |
+
0x5290000052A,
|
| 3339 |
+
0x52B0000052C,
|
| 3340 |
+
0x52D0000052E,
|
| 3341 |
+
0x52F00000530,
|
| 3342 |
+
0x5590000055A,
|
| 3343 |
+
0x56000000587,
|
| 3344 |
+
0x58800000589,
|
| 3345 |
+
0x591000005BE,
|
| 3346 |
+
0x5BF000005C0,
|
| 3347 |
+
0x5C1000005C3,
|
| 3348 |
+
0x5C4000005C6,
|
| 3349 |
+
0x5C7000005C8,
|
| 3350 |
+
0x5D0000005EB,
|
| 3351 |
+
0x5EF000005F3,
|
| 3352 |
+
0x6100000061B,
|
| 3353 |
+
0x62000000640,
|
| 3354 |
+
0x64100000660,
|
| 3355 |
+
0x66E00000675,
|
| 3356 |
+
0x679000006D4,
|
| 3357 |
+
0x6D5000006DD,
|
| 3358 |
+
0x6DF000006E9,
|
| 3359 |
+
0x6EA000006F0,
|
| 3360 |
+
0x6FA00000700,
|
| 3361 |
+
0x7100000074B,
|
| 3362 |
+
0x74D000007B2,
|
| 3363 |
+
0x7C0000007F6,
|
| 3364 |
+
0x7FD000007FE,
|
| 3365 |
+
0x8000000082E,
|
| 3366 |
+
0x8400000085C,
|
| 3367 |
+
0x8600000086B,
|
| 3368 |
+
0x87000000888,
|
| 3369 |
+
0x8890000088F,
|
| 3370 |
+
0x897000008E2,
|
| 3371 |
+
0x8E300000958,
|
| 3372 |
+
0x96000000964,
|
| 3373 |
+
0x96600000970,
|
| 3374 |
+
0x97100000984,
|
| 3375 |
+
0x9850000098D,
|
| 3376 |
+
0x98F00000991,
|
| 3377 |
+
0x993000009A9,
|
| 3378 |
+
0x9AA000009B1,
|
| 3379 |
+
0x9B2000009B3,
|
| 3380 |
+
0x9B6000009BA,
|
| 3381 |
+
0x9BC000009C5,
|
| 3382 |
+
0x9C7000009C9,
|
| 3383 |
+
0x9CB000009CF,
|
| 3384 |
+
0x9D7000009D8,
|
| 3385 |
+
0x9E0000009E4,
|
| 3386 |
+
0x9E6000009F2,
|
| 3387 |
+
0x9FC000009FD,
|
| 3388 |
+
0x9FE000009FF,
|
| 3389 |
+
0xA0100000A04,
|
| 3390 |
+
0xA0500000A0B,
|
| 3391 |
+
0xA0F00000A11,
|
| 3392 |
+
0xA1300000A29,
|
| 3393 |
+
0xA2A00000A31,
|
| 3394 |
+
0xA3200000A33,
|
| 3395 |
+
0xA3500000A36,
|
| 3396 |
+
0xA3800000A3A,
|
| 3397 |
+
0xA3C00000A3D,
|
| 3398 |
+
0xA3E00000A43,
|
| 3399 |
+
0xA4700000A49,
|
| 3400 |
+
0xA4B00000A4E,
|
| 3401 |
+
0xA5100000A52,
|
| 3402 |
+
0xA5C00000A5D,
|
| 3403 |
+
0xA6600000A76,
|
| 3404 |
+
0xA8100000A84,
|
| 3405 |
+
0xA8500000A8E,
|
| 3406 |
+
0xA8F00000A92,
|
| 3407 |
+
0xA9300000AA9,
|
| 3408 |
+
0xAAA00000AB1,
|
| 3409 |
+
0xAB200000AB4,
|
| 3410 |
+
0xAB500000ABA,
|
| 3411 |
+
0xABC00000AC6,
|
| 3412 |
+
0xAC700000ACA,
|
| 3413 |
+
0xACB00000ACE,
|
| 3414 |
+
0xAD000000AD1,
|
| 3415 |
+
0xAE000000AE4,
|
| 3416 |
+
0xAE600000AF0,
|
| 3417 |
+
0xAF900000B00,
|
| 3418 |
+
0xB0100000B04,
|
| 3419 |
+
0xB0500000B0D,
|
| 3420 |
+
0xB0F00000B11,
|
| 3421 |
+
0xB1300000B29,
|
| 3422 |
+
0xB2A00000B31,
|
| 3423 |
+
0xB3200000B34,
|
| 3424 |
+
0xB3500000B3A,
|
| 3425 |
+
0xB3C00000B45,
|
| 3426 |
+
0xB4700000B49,
|
| 3427 |
+
0xB4B00000B4E,
|
| 3428 |
+
0xB5500000B58,
|
| 3429 |
+
0xB5F00000B64,
|
| 3430 |
+
0xB6600000B70,
|
| 3431 |
+
0xB7100000B72,
|
| 3432 |
+
0xB8200000B84,
|
| 3433 |
+
0xB8500000B8B,
|
| 3434 |
+
0xB8E00000B91,
|
| 3435 |
+
0xB9200000B96,
|
| 3436 |
+
0xB9900000B9B,
|
| 3437 |
+
0xB9C00000B9D,
|
| 3438 |
+
0xB9E00000BA0,
|
| 3439 |
+
0xBA300000BA5,
|
| 3440 |
+
0xBA800000BAB,
|
| 3441 |
+
0xBAE00000BBA,
|
| 3442 |
+
0xBBE00000BC3,
|
| 3443 |
+
0xBC600000BC9,
|
| 3444 |
+
0xBCA00000BCE,
|
| 3445 |
+
0xBD000000BD1,
|
| 3446 |
+
0xBD700000BD8,
|
| 3447 |
+
0xBE600000BF0,
|
| 3448 |
+
0xC0000000C0D,
|
| 3449 |
+
0xC0E00000C11,
|
| 3450 |
+
0xC1200000C29,
|
| 3451 |
+
0xC2A00000C3A,
|
| 3452 |
+
0xC3C00000C45,
|
| 3453 |
+
0xC4600000C49,
|
| 3454 |
+
0xC4A00000C4E,
|
| 3455 |
+
0xC5500000C57,
|
| 3456 |
+
0xC5800000C5B,
|
| 3457 |
+
0xC5D00000C5E,
|
| 3458 |
+
0xC6000000C64,
|
| 3459 |
+
0xC6600000C70,
|
| 3460 |
+
0xC8000000C84,
|
| 3461 |
+
0xC8500000C8D,
|
| 3462 |
+
0xC8E00000C91,
|
| 3463 |
+
0xC9200000CA9,
|
| 3464 |
+
0xCAA00000CB4,
|
| 3465 |
+
0xCB500000CBA,
|
| 3466 |
+
0xCBC00000CC5,
|
| 3467 |
+
0xCC600000CC9,
|
| 3468 |
+
0xCCA00000CCE,
|
| 3469 |
+
0xCD500000CD7,
|
| 3470 |
+
0xCDD00000CDF,
|
| 3471 |
+
0xCE000000CE4,
|
| 3472 |
+
0xCE600000CF0,
|
| 3473 |
+
0xCF100000CF4,
|
| 3474 |
+
0xD0000000D0D,
|
| 3475 |
+
0xD0E00000D11,
|
| 3476 |
+
0xD1200000D45,
|
| 3477 |
+
0xD4600000D49,
|
| 3478 |
+
0xD4A00000D4F,
|
| 3479 |
+
0xD5400000D58,
|
| 3480 |
+
0xD5F00000D64,
|
| 3481 |
+
0xD6600000D70,
|
| 3482 |
+
0xD7A00000D80,
|
| 3483 |
+
0xD8100000D84,
|
| 3484 |
+
0xD8500000D97,
|
| 3485 |
+
0xD9A00000DB2,
|
| 3486 |
+
0xDB300000DBC,
|
| 3487 |
+
0xDBD00000DBE,
|
| 3488 |
+
0xDC000000DC7,
|
| 3489 |
+
0xDCA00000DCB,
|
| 3490 |
+
0xDCF00000DD5,
|
| 3491 |
+
0xDD600000DD7,
|
| 3492 |
+
0xDD800000DE0,
|
| 3493 |
+
0xDE600000DF0,
|
| 3494 |
+
0xDF200000DF4,
|
| 3495 |
+
0xE0100000E33,
|
| 3496 |
+
0xE3400000E3B,
|
| 3497 |
+
0xE4000000E4F,
|
| 3498 |
+
0xE5000000E5A,
|
| 3499 |
+
0xE8100000E83,
|
| 3500 |
+
0xE8400000E85,
|
| 3501 |
+
0xE8600000E8B,
|
| 3502 |
+
0xE8C00000EA4,
|
| 3503 |
+
0xEA500000EA6,
|
| 3504 |
+
0xEA700000EB3,
|
| 3505 |
+
0xEB400000EBE,
|
| 3506 |
+
0xEC000000EC5,
|
| 3507 |
+
0xEC600000EC7,
|
| 3508 |
+
0xEC800000ECF,
|
| 3509 |
+
0xED000000EDA,
|
| 3510 |
+
0xEDE00000EE0,
|
| 3511 |
+
0xF0000000F01,
|
| 3512 |
+
0xF0B00000F0C,
|
| 3513 |
+
0xF1800000F1A,
|
| 3514 |
+
0xF2000000F2A,
|
| 3515 |
+
0xF3500000F36,
|
| 3516 |
+
0xF3700000F38,
|
| 3517 |
+
0xF3900000F3A,
|
| 3518 |
+
0xF3E00000F43,
|
| 3519 |
+
0xF4400000F48,
|
| 3520 |
+
0xF4900000F4D,
|
| 3521 |
+
0xF4E00000F52,
|
| 3522 |
+
0xF5300000F57,
|
| 3523 |
+
0xF5800000F5C,
|
| 3524 |
+
0xF5D00000F69,
|
| 3525 |
+
0xF6A00000F6D,
|
| 3526 |
+
0xF7100000F73,
|
| 3527 |
+
0xF7400000F75,
|
| 3528 |
+
0xF7A00000F81,
|
| 3529 |
+
0xF8200000F85,
|
| 3530 |
+
0xF8600000F93,
|
| 3531 |
+
0xF9400000F98,
|
| 3532 |
+
0xF9900000F9D,
|
| 3533 |
+
0xF9E00000FA2,
|
| 3534 |
+
0xFA300000FA7,
|
| 3535 |
+
0xFA800000FAC,
|
| 3536 |
+
0xFAD00000FB9,
|
| 3537 |
+
0xFBA00000FBD,
|
| 3538 |
+
0xFC600000FC7,
|
| 3539 |
+
0x10000000104A,
|
| 3540 |
+
0x10500000109E,
|
| 3541 |
+
0x10D0000010FB,
|
| 3542 |
+
0x10FD00001100,
|
| 3543 |
+
0x120000001249,
|
| 3544 |
+
0x124A0000124E,
|
| 3545 |
+
0x125000001257,
|
| 3546 |
+
0x125800001259,
|
| 3547 |
+
0x125A0000125E,
|
| 3548 |
+
0x126000001289,
|
| 3549 |
+
0x128A0000128E,
|
| 3550 |
+
0x1290000012B1,
|
| 3551 |
+
0x12B2000012B6,
|
| 3552 |
+
0x12B8000012BF,
|
| 3553 |
+
0x12C0000012C1,
|
| 3554 |
+
0x12C2000012C6,
|
| 3555 |
+
0x12C8000012D7,
|
| 3556 |
+
0x12D800001311,
|
| 3557 |
+
0x131200001316,
|
| 3558 |
+
0x13180000135B,
|
| 3559 |
+
0x135D00001360,
|
| 3560 |
+
0x138000001390,
|
| 3561 |
+
0x13A0000013F6,
|
| 3562 |
+
0x14010000166D,
|
| 3563 |
+
0x166F00001680,
|
| 3564 |
+
0x16810000169B,
|
| 3565 |
+
0x16A0000016EB,
|
| 3566 |
+
0x16F1000016F9,
|
| 3567 |
+
0x170000001716,
|
| 3568 |
+
0x171F00001735,
|
| 3569 |
+
0x174000001754,
|
| 3570 |
+
0x17600000176D,
|
| 3571 |
+
0x176E00001771,
|
| 3572 |
+
0x177200001774,
|
| 3573 |
+
0x1780000017B4,
|
| 3574 |
+
0x17B6000017D4,
|
| 3575 |
+
0x17D7000017D8,
|
| 3576 |
+
0x17DC000017DE,
|
| 3577 |
+
0x17E0000017EA,
|
| 3578 |
+
0x18100000181A,
|
| 3579 |
+
0x182000001879,
|
| 3580 |
+
0x1880000018AB,
|
| 3581 |
+
0x18B0000018F6,
|
| 3582 |
+
0x19000000191F,
|
| 3583 |
+
0x19200000192C,
|
| 3584 |
+
0x19300000193C,
|
| 3585 |
+
0x19460000196E,
|
| 3586 |
+
0x197000001975,
|
| 3587 |
+
0x1980000019AC,
|
| 3588 |
+
0x19B0000019CA,
|
| 3589 |
+
0x19D0000019DA,
|
| 3590 |
+
0x1A0000001A1C,
|
| 3591 |
+
0x1A2000001A5F,
|
| 3592 |
+
0x1A6000001A7D,
|
| 3593 |
+
0x1A7F00001A8A,
|
| 3594 |
+
0x1A9000001A9A,
|
| 3595 |
+
0x1AA700001AA8,
|
| 3596 |
+
0x1AB000001ABE,
|
| 3597 |
+
0x1ABF00001ACF,
|
| 3598 |
+
0x1B0000001B4D,
|
| 3599 |
+
0x1B5000001B5A,
|
| 3600 |
+
0x1B6B00001B74,
|
| 3601 |
+
0x1B8000001BF4,
|
| 3602 |
+
0x1C0000001C38,
|
| 3603 |
+
0x1C4000001C4A,
|
| 3604 |
+
0x1C4D00001C7E,
|
| 3605 |
+
0x1C8A00001C8B,
|
| 3606 |
+
0x1CD000001CD3,
|
| 3607 |
+
0x1CD400001CFB,
|
| 3608 |
+
0x1D0000001D2C,
|
| 3609 |
+
0x1D2F00001D30,
|
| 3610 |
+
0x1D3B00001D3C,
|
| 3611 |
+
0x1D4E00001D4F,
|
| 3612 |
+
0x1D6B00001D78,
|
| 3613 |
+
0x1D7900001D9B,
|
| 3614 |
+
0x1DC000001E00,
|
| 3615 |
+
0x1E0100001E02,
|
| 3616 |
+
0x1E0300001E04,
|
| 3617 |
+
0x1E0500001E06,
|
| 3618 |
+
0x1E0700001E08,
|
| 3619 |
+
0x1E0900001E0A,
|
| 3620 |
+
0x1E0B00001E0C,
|
| 3621 |
+
0x1E0D00001E0E,
|
| 3622 |
+
0x1E0F00001E10,
|
| 3623 |
+
0x1E1100001E12,
|
| 3624 |
+
0x1E1300001E14,
|
| 3625 |
+
0x1E1500001E16,
|
| 3626 |
+
0x1E1700001E18,
|
| 3627 |
+
0x1E1900001E1A,
|
| 3628 |
+
0x1E1B00001E1C,
|
| 3629 |
+
0x1E1D00001E1E,
|
| 3630 |
+
0x1E1F00001E20,
|
| 3631 |
+
0x1E2100001E22,
|
| 3632 |
+
0x1E2300001E24,
|
| 3633 |
+
0x1E2500001E26,
|
| 3634 |
+
0x1E2700001E28,
|
| 3635 |
+
0x1E2900001E2A,
|
| 3636 |
+
0x1E2B00001E2C,
|
| 3637 |
+
0x1E2D00001E2E,
|
| 3638 |
+
0x1E2F00001E30,
|
| 3639 |
+
0x1E3100001E32,
|
| 3640 |
+
0x1E3300001E34,
|
| 3641 |
+
0x1E3500001E36,
|
| 3642 |
+
0x1E3700001E38,
|
| 3643 |
+
0x1E3900001E3A,
|
| 3644 |
+
0x1E3B00001E3C,
|
| 3645 |
+
0x1E3D00001E3E,
|
| 3646 |
+
0x1E3F00001E40,
|
| 3647 |
+
0x1E4100001E42,
|
| 3648 |
+
0x1E4300001E44,
|
| 3649 |
+
0x1E4500001E46,
|
| 3650 |
+
0x1E4700001E48,
|
| 3651 |
+
0x1E4900001E4A,
|
| 3652 |
+
0x1E4B00001E4C,
|
| 3653 |
+
0x1E4D00001E4E,
|
| 3654 |
+
0x1E4F00001E50,
|
| 3655 |
+
0x1E5100001E52,
|
| 3656 |
+
0x1E5300001E54,
|
| 3657 |
+
0x1E5500001E56,
|
| 3658 |
+
0x1E5700001E58,
|
| 3659 |
+
0x1E5900001E5A,
|
| 3660 |
+
0x1E5B00001E5C,
|
| 3661 |
+
0x1E5D00001E5E,
|
| 3662 |
+
0x1E5F00001E60,
|
| 3663 |
+
0x1E6100001E62,
|
| 3664 |
+
0x1E6300001E64,
|
| 3665 |
+
0x1E6500001E66,
|
| 3666 |
+
0x1E6700001E68,
|
| 3667 |
+
0x1E6900001E6A,
|
| 3668 |
+
0x1E6B00001E6C,
|
| 3669 |
+
0x1E6D00001E6E,
|
| 3670 |
+
0x1E6F00001E70,
|
| 3671 |
+
0x1E7100001E72,
|
| 3672 |
+
0x1E7300001E74,
|
| 3673 |
+
0x1E7500001E76,
|
| 3674 |
+
0x1E7700001E78,
|
| 3675 |
+
0x1E7900001E7A,
|
| 3676 |
+
0x1E7B00001E7C,
|
| 3677 |
+
0x1E7D00001E7E,
|
| 3678 |
+
0x1E7F00001E80,
|
| 3679 |
+
0x1E8100001E82,
|
| 3680 |
+
0x1E8300001E84,
|
| 3681 |
+
0x1E8500001E86,
|
| 3682 |
+
0x1E8700001E88,
|
| 3683 |
+
0x1E8900001E8A,
|
| 3684 |
+
0x1E8B00001E8C,
|
| 3685 |
+
0x1E8D00001E8E,
|
| 3686 |
+
0x1E8F00001E90,
|
| 3687 |
+
0x1E9100001E92,
|
| 3688 |
+
0x1E9300001E94,
|
| 3689 |
+
0x1E9500001E9A,
|
| 3690 |
+
0x1E9C00001E9E,
|
| 3691 |
+
0x1E9F00001EA0,
|
| 3692 |
+
0x1EA100001EA2,
|
| 3693 |
+
0x1EA300001EA4,
|
| 3694 |
+
0x1EA500001EA6,
|
| 3695 |
+
0x1EA700001EA8,
|
| 3696 |
+
0x1EA900001EAA,
|
| 3697 |
+
0x1EAB00001EAC,
|
| 3698 |
+
0x1EAD00001EAE,
|
| 3699 |
+
0x1EAF00001EB0,
|
| 3700 |
+
0x1EB100001EB2,
|
| 3701 |
+
0x1EB300001EB4,
|
| 3702 |
+
0x1EB500001EB6,
|
| 3703 |
+
0x1EB700001EB8,
|
| 3704 |
+
0x1EB900001EBA,
|
| 3705 |
+
0x1EBB00001EBC,
|
| 3706 |
+
0x1EBD00001EBE,
|
| 3707 |
+
0x1EBF00001EC0,
|
| 3708 |
+
0x1EC100001EC2,
|
| 3709 |
+
0x1EC300001EC4,
|
| 3710 |
+
0x1EC500001EC6,
|
| 3711 |
+
0x1EC700001EC8,
|
| 3712 |
+
0x1EC900001ECA,
|
| 3713 |
+
0x1ECB00001ECC,
|
| 3714 |
+
0x1ECD00001ECE,
|
| 3715 |
+
0x1ECF00001ED0,
|
| 3716 |
+
0x1ED100001ED2,
|
| 3717 |
+
0x1ED300001ED4,
|
| 3718 |
+
0x1ED500001ED6,
|
| 3719 |
+
0x1ED700001ED8,
|
| 3720 |
+
0x1ED900001EDA,
|
| 3721 |
+
0x1EDB00001EDC,
|
| 3722 |
+
0x1EDD00001EDE,
|
| 3723 |
+
0x1EDF00001EE0,
|
| 3724 |
+
0x1EE100001EE2,
|
| 3725 |
+
0x1EE300001EE4,
|
| 3726 |
+
0x1EE500001EE6,
|
| 3727 |
+
0x1EE700001EE8,
|
| 3728 |
+
0x1EE900001EEA,
|
| 3729 |
+
0x1EEB00001EEC,
|
| 3730 |
+
0x1EED00001EEE,
|
| 3731 |
+
0x1EEF00001EF0,
|
| 3732 |
+
0x1EF100001EF2,
|
| 3733 |
+
0x1EF300001EF4,
|
| 3734 |
+
0x1EF500001EF6,
|
| 3735 |
+
0x1EF700001EF8,
|
| 3736 |
+
0x1EF900001EFA,
|
| 3737 |
+
0x1EFB00001EFC,
|
| 3738 |
+
0x1EFD00001EFE,
|
| 3739 |
+
0x1EFF00001F08,
|
| 3740 |
+
0x1F1000001F16,
|
| 3741 |
+
0x1F2000001F28,
|
| 3742 |
+
0x1F3000001F38,
|
| 3743 |
+
0x1F4000001F46,
|
| 3744 |
+
0x1F5000001F58,
|
| 3745 |
+
0x1F6000001F68,
|
| 3746 |
+
0x1F7000001F71,
|
| 3747 |
+
0x1F7200001F73,
|
| 3748 |
+
0x1F7400001F75,
|
| 3749 |
+
0x1F7600001F77,
|
| 3750 |
+
0x1F7800001F79,
|
| 3751 |
+
0x1F7A00001F7B,
|
| 3752 |
+
0x1F7C00001F7D,
|
| 3753 |
+
0x1FB000001FB2,
|
| 3754 |
+
0x1FB600001FB7,
|
| 3755 |
+
0x1FC600001FC7,
|
| 3756 |
+
0x1FD000001FD3,
|
| 3757 |
+
0x1FD600001FD8,
|
| 3758 |
+
0x1FE000001FE3,
|
| 3759 |
+
0x1FE400001FE8,
|
| 3760 |
+
0x1FF600001FF7,
|
| 3761 |
+
0x214E0000214F,
|
| 3762 |
+
0x218400002185,
|
| 3763 |
+
0x2C3000002C60,
|
| 3764 |
+
0x2C6100002C62,
|
| 3765 |
+
0x2C6500002C67,
|
| 3766 |
+
0x2C6800002C69,
|
| 3767 |
+
0x2C6A00002C6B,
|
| 3768 |
+
0x2C6C00002C6D,
|
| 3769 |
+
0x2C7100002C72,
|
| 3770 |
+
0x2C7300002C75,
|
| 3771 |
+
0x2C7600002C7C,
|
| 3772 |
+
0x2C8100002C82,
|
| 3773 |
+
0x2C8300002C84,
|
| 3774 |
+
0x2C8500002C86,
|
| 3775 |
+
0x2C8700002C88,
|
| 3776 |
+
0x2C8900002C8A,
|
| 3777 |
+
0x2C8B00002C8C,
|
| 3778 |
+
0x2C8D00002C8E,
|
| 3779 |
+
0x2C8F00002C90,
|
| 3780 |
+
0x2C9100002C92,
|
| 3781 |
+
0x2C9300002C94,
|
| 3782 |
+
0x2C9500002C96,
|
| 3783 |
+
0x2C9700002C98,
|
| 3784 |
+
0x2C9900002C9A,
|
| 3785 |
+
0x2C9B00002C9C,
|
| 3786 |
+
0x2C9D00002C9E,
|
| 3787 |
+
0x2C9F00002CA0,
|
| 3788 |
+
0x2CA100002CA2,
|
| 3789 |
+
0x2CA300002CA4,
|
| 3790 |
+
0x2CA500002CA6,
|
| 3791 |
+
0x2CA700002CA8,
|
| 3792 |
+
0x2CA900002CAA,
|
| 3793 |
+
0x2CAB00002CAC,
|
| 3794 |
+
0x2CAD00002CAE,
|
| 3795 |
+
0x2CAF00002CB0,
|
| 3796 |
+
0x2CB100002CB2,
|
| 3797 |
+
0x2CB300002CB4,
|
| 3798 |
+
0x2CB500002CB6,
|
| 3799 |
+
0x2CB700002CB8,
|
| 3800 |
+
0x2CB900002CBA,
|
| 3801 |
+
0x2CBB00002CBC,
|
| 3802 |
+
0x2CBD00002CBE,
|
| 3803 |
+
0x2CBF00002CC0,
|
| 3804 |
+
0x2CC100002CC2,
|
| 3805 |
+
0x2CC300002CC4,
|
| 3806 |
+
0x2CC500002CC6,
|
| 3807 |
+
0x2CC700002CC8,
|
| 3808 |
+
0x2CC900002CCA,
|
| 3809 |
+
0x2CCB00002CCC,
|
| 3810 |
+
0x2CCD00002CCE,
|
| 3811 |
+
0x2CCF00002CD0,
|
| 3812 |
+
0x2CD100002CD2,
|
| 3813 |
+
0x2CD300002CD4,
|
| 3814 |
+
0x2CD500002CD6,
|
| 3815 |
+
0x2CD700002CD8,
|
| 3816 |
+
0x2CD900002CDA,
|
| 3817 |
+
0x2CDB00002CDC,
|
| 3818 |
+
0x2CDD00002CDE,
|
| 3819 |
+
0x2CDF00002CE0,
|
| 3820 |
+
0x2CE100002CE2,
|
| 3821 |
+
0x2CE300002CE5,
|
| 3822 |
+
0x2CEC00002CED,
|
| 3823 |
+
0x2CEE00002CF2,
|
| 3824 |
+
0x2CF300002CF4,
|
| 3825 |
+
0x2D0000002D26,
|
| 3826 |
+
0x2D2700002D28,
|
| 3827 |
+
0x2D2D00002D2E,
|
| 3828 |
+
0x2D3000002D68,
|
| 3829 |
+
0x2D7F00002D97,
|
| 3830 |
+
0x2DA000002DA7,
|
| 3831 |
+
0x2DA800002DAF,
|
| 3832 |
+
0x2DB000002DB7,
|
| 3833 |
+
0x2DB800002DBF,
|
| 3834 |
+
0x2DC000002DC7,
|
| 3835 |
+
0x2DC800002DCF,
|
| 3836 |
+
0x2DD000002DD7,
|
| 3837 |
+
0x2DD800002DDF,
|
| 3838 |
+
0x2DE000002E00,
|
| 3839 |
+
0x2E2F00002E30,
|
| 3840 |
+
0x300500003008,
|
| 3841 |
+
0x302A0000302E,
|
| 3842 |
+
0x303C0000303D,
|
| 3843 |
+
0x304100003097,
|
| 3844 |
+
0x30990000309B,
|
| 3845 |
+
0x309D0000309F,
|
| 3846 |
+
0x30A1000030FB,
|
| 3847 |
+
0x30FC000030FF,
|
| 3848 |
+
0x310500003130,
|
| 3849 |
+
0x31A0000031C0,
|
| 3850 |
+
0x31F000003200,
|
| 3851 |
+
0x340000004DC0,
|
| 3852 |
+
0x4E000000A48D,
|
| 3853 |
+
0xA4D00000A4FE,
|
| 3854 |
+
0xA5000000A60D,
|
| 3855 |
+
0xA6100000A62C,
|
| 3856 |
+
0xA6410000A642,
|
| 3857 |
+
0xA6430000A644,
|
| 3858 |
+
0xA6450000A646,
|
| 3859 |
+
0xA6470000A648,
|
| 3860 |
+
0xA6490000A64A,
|
| 3861 |
+
0xA64B0000A64C,
|
| 3862 |
+
0xA64D0000A64E,
|
| 3863 |
+
0xA64F0000A650,
|
| 3864 |
+
0xA6510000A652,
|
| 3865 |
+
0xA6530000A654,
|
| 3866 |
+
0xA6550000A656,
|
| 3867 |
+
0xA6570000A658,
|
| 3868 |
+
0xA6590000A65A,
|
| 3869 |
+
0xA65B0000A65C,
|
| 3870 |
+
0xA65D0000A65E,
|
| 3871 |
+
0xA65F0000A660,
|
| 3872 |
+
0xA6610000A662,
|
| 3873 |
+
0xA6630000A664,
|
| 3874 |
+
0xA6650000A666,
|
| 3875 |
+
0xA6670000A668,
|
| 3876 |
+
0xA6690000A66A,
|
| 3877 |
+
0xA66B0000A66C,
|
| 3878 |
+
0xA66D0000A670,
|
| 3879 |
+
0xA6740000A67E,
|
| 3880 |
+
0xA67F0000A680,
|
| 3881 |
+
0xA6810000A682,
|
| 3882 |
+
0xA6830000A684,
|
| 3883 |
+
0xA6850000A686,
|
| 3884 |
+
0xA6870000A688,
|
| 3885 |
+
0xA6890000A68A,
|
| 3886 |
+
0xA68B0000A68C,
|
| 3887 |
+
0xA68D0000A68E,
|
| 3888 |
+
0xA68F0000A690,
|
| 3889 |
+
0xA6910000A692,
|
| 3890 |
+
0xA6930000A694,
|
| 3891 |
+
0xA6950000A696,
|
| 3892 |
+
0xA6970000A698,
|
| 3893 |
+
0xA6990000A69A,
|
| 3894 |
+
0xA69B0000A69C,
|
| 3895 |
+
0xA69E0000A6E6,
|
| 3896 |
+
0xA6F00000A6F2,
|
| 3897 |
+
0xA7170000A720,
|
| 3898 |
+
0xA7230000A724,
|
| 3899 |
+
0xA7250000A726,
|
| 3900 |
+
0xA7270000A728,
|
| 3901 |
+
0xA7290000A72A,
|
| 3902 |
+
0xA72B0000A72C,
|
| 3903 |
+
0xA72D0000A72E,
|
| 3904 |
+
0xA72F0000A732,
|
| 3905 |
+
0xA7330000A734,
|
| 3906 |
+
0xA7350000A736,
|
| 3907 |
+
0xA7370000A738,
|
| 3908 |
+
0xA7390000A73A,
|
| 3909 |
+
0xA73B0000A73C,
|
| 3910 |
+
0xA73D0000A73E,
|
| 3911 |
+
0xA73F0000A740,
|
| 3912 |
+
0xA7410000A742,
|
| 3913 |
+
0xA7430000A744,
|
| 3914 |
+
0xA7450000A746,
|
| 3915 |
+
0xA7470000A748,
|
| 3916 |
+
0xA7490000A74A,
|
| 3917 |
+
0xA74B0000A74C,
|
| 3918 |
+
0xA74D0000A74E,
|
| 3919 |
+
0xA74F0000A750,
|
| 3920 |
+
0xA7510000A752,
|
| 3921 |
+
0xA7530000A754,
|
| 3922 |
+
0xA7550000A756,
|
| 3923 |
+
0xA7570000A758,
|
| 3924 |
+
0xA7590000A75A,
|
| 3925 |
+
0xA75B0000A75C,
|
| 3926 |
+
0xA75D0000A75E,
|
| 3927 |
+
0xA75F0000A760,
|
| 3928 |
+
0xA7610000A762,
|
| 3929 |
+
0xA7630000A764,
|
| 3930 |
+
0xA7650000A766,
|
| 3931 |
+
0xA7670000A768,
|
| 3932 |
+
0xA7690000A76A,
|
| 3933 |
+
0xA76B0000A76C,
|
| 3934 |
+
0xA76D0000A76E,
|
| 3935 |
+
0xA76F0000A770,
|
| 3936 |
+
0xA7710000A779,
|
| 3937 |
+
0xA77A0000A77B,
|
| 3938 |
+
0xA77C0000A77D,
|
| 3939 |
+
0xA77F0000A780,
|
| 3940 |
+
0xA7810000A782,
|
| 3941 |
+
0xA7830000A784,
|
| 3942 |
+
0xA7850000A786,
|
| 3943 |
+
0xA7870000A789,
|
| 3944 |
+
0xA78C0000A78D,
|
| 3945 |
+
0xA78E0000A790,
|
| 3946 |
+
0xA7910000A792,
|
| 3947 |
+
0xA7930000A796,
|
| 3948 |
+
0xA7970000A798,
|
| 3949 |
+
0xA7990000A79A,
|
| 3950 |
+
0xA79B0000A79C,
|
| 3951 |
+
0xA79D0000A79E,
|
| 3952 |
+
0xA79F0000A7A0,
|
| 3953 |
+
0xA7A10000A7A2,
|
| 3954 |
+
0xA7A30000A7A4,
|
| 3955 |
+
0xA7A50000A7A6,
|
| 3956 |
+
0xA7A70000A7A8,
|
| 3957 |
+
0xA7A90000A7AA,
|
| 3958 |
+
0xA7AF0000A7B0,
|
| 3959 |
+
0xA7B50000A7B6,
|
| 3960 |
+
0xA7B70000A7B8,
|
| 3961 |
+
0xA7B90000A7BA,
|
| 3962 |
+
0xA7BB0000A7BC,
|
| 3963 |
+
0xA7BD0000A7BE,
|
| 3964 |
+
0xA7BF0000A7C0,
|
| 3965 |
+
0xA7C10000A7C2,
|
| 3966 |
+
0xA7C30000A7C4,
|
| 3967 |
+
0xA7C80000A7C9,
|
| 3968 |
+
0xA7CA0000A7CB,
|
| 3969 |
+
0xA7CD0000A7CE,
|
| 3970 |
+
0xA7D10000A7D2,
|
| 3971 |
+
0xA7D30000A7D4,
|
| 3972 |
+
0xA7D50000A7D6,
|
| 3973 |
+
0xA7D70000A7D8,
|
| 3974 |
+
0xA7D90000A7DA,
|
| 3975 |
+
0xA7DB0000A7DC,
|
| 3976 |
+
0xA7F60000A7F8,
|
| 3977 |
+
0xA7FA0000A828,
|
| 3978 |
+
0xA82C0000A82D,
|
| 3979 |
+
0xA8400000A874,
|
| 3980 |
+
0xA8800000A8C6,
|
| 3981 |
+
0xA8D00000A8DA,
|
| 3982 |
+
0xA8E00000A8F8,
|
| 3983 |
+
0xA8FB0000A8FC,
|
| 3984 |
+
0xA8FD0000A92E,
|
| 3985 |
+
0xA9300000A954,
|
| 3986 |
+
0xA9800000A9C1,
|
| 3987 |
+
0xA9CF0000A9DA,
|
| 3988 |
+
0xA9E00000A9FF,
|
| 3989 |
+
0xAA000000AA37,
|
| 3990 |
+
0xAA400000AA4E,
|
| 3991 |
+
0xAA500000AA5A,
|
| 3992 |
+
0xAA600000AA77,
|
| 3993 |
+
0xAA7A0000AAC3,
|
| 3994 |
+
0xAADB0000AADE,
|
| 3995 |
+
0xAAE00000AAF0,
|
| 3996 |
+
0xAAF20000AAF7,
|
| 3997 |
+
0xAB010000AB07,
|
| 3998 |
+
0xAB090000AB0F,
|
| 3999 |
+
0xAB110000AB17,
|
| 4000 |
+
0xAB200000AB27,
|
| 4001 |
+
0xAB280000AB2F,
|
| 4002 |
+
0xAB300000AB5B,
|
| 4003 |
+
0xAB600000AB69,
|
| 4004 |
+
0xABC00000ABEB,
|
| 4005 |
+
0xABEC0000ABEE,
|
| 4006 |
+
0xABF00000ABFA,
|
| 4007 |
+
0xAC000000D7A4,
|
| 4008 |
+
0xFA0E0000FA10,
|
| 4009 |
+
0xFA110000FA12,
|
| 4010 |
+
0xFA130000FA15,
|
| 4011 |
+
0xFA1F0000FA20,
|
| 4012 |
+
0xFA210000FA22,
|
| 4013 |
+
0xFA230000FA25,
|
| 4014 |
+
0xFA270000FA2A,
|
| 4015 |
+
0xFB1E0000FB1F,
|
| 4016 |
+
0xFE200000FE30,
|
| 4017 |
+
0xFE730000FE74,
|
| 4018 |
+
0x100000001000C,
|
| 4019 |
+
0x1000D00010027,
|
| 4020 |
+
0x100280001003B,
|
| 4021 |
+
0x1003C0001003E,
|
| 4022 |
+
0x1003F0001004E,
|
| 4023 |
+
0x100500001005E,
|
| 4024 |
+
0x10080000100FB,
|
| 4025 |
+
0x101FD000101FE,
|
| 4026 |
+
0x102800001029D,
|
| 4027 |
+
0x102A0000102D1,
|
| 4028 |
+
0x102E0000102E1,
|
| 4029 |
+
0x1030000010320,
|
| 4030 |
+
0x1032D00010341,
|
| 4031 |
+
0x103420001034A,
|
| 4032 |
+
0x103500001037B,
|
| 4033 |
+
0x103800001039E,
|
| 4034 |
+
0x103A0000103C4,
|
| 4035 |
+
0x103C8000103D0,
|
| 4036 |
+
0x104280001049E,
|
| 4037 |
+
0x104A0000104AA,
|
| 4038 |
+
0x104D8000104FC,
|
| 4039 |
+
0x1050000010528,
|
| 4040 |
+
0x1053000010564,
|
| 4041 |
+
0x10597000105A2,
|
| 4042 |
+
0x105A3000105B2,
|
| 4043 |
+
0x105B3000105BA,
|
| 4044 |
+
0x105BB000105BD,
|
| 4045 |
+
0x105C0000105F4,
|
| 4046 |
+
0x1060000010737,
|
| 4047 |
+
0x1074000010756,
|
| 4048 |
+
0x1076000010768,
|
| 4049 |
+
0x1078000010781,
|
| 4050 |
+
0x1080000010806,
|
| 4051 |
+
0x1080800010809,
|
| 4052 |
+
0x1080A00010836,
|
| 4053 |
+
0x1083700010839,
|
| 4054 |
+
0x1083C0001083D,
|
| 4055 |
+
0x1083F00010856,
|
| 4056 |
+
0x1086000010877,
|
| 4057 |
+
0x108800001089F,
|
| 4058 |
+
0x108E0000108F3,
|
| 4059 |
+
0x108F4000108F6,
|
| 4060 |
+
0x1090000010916,
|
| 4061 |
+
0x109200001093A,
|
| 4062 |
+
0x10980000109B8,
|
| 4063 |
+
0x109BE000109C0,
|
| 4064 |
+
0x10A0000010A04,
|
| 4065 |
+
0x10A0500010A07,
|
| 4066 |
+
0x10A0C00010A14,
|
| 4067 |
+
0x10A1500010A18,
|
| 4068 |
+
0x10A1900010A36,
|
| 4069 |
+
0x10A3800010A3B,
|
| 4070 |
+
0x10A3F00010A40,
|
| 4071 |
+
0x10A6000010A7D,
|
| 4072 |
+
0x10A8000010A9D,
|
| 4073 |
+
0x10AC000010AC8,
|
| 4074 |
+
0x10AC900010AE7,
|
| 4075 |
+
0x10B0000010B36,
|
| 4076 |
+
0x10B4000010B56,
|
| 4077 |
+
0x10B6000010B73,
|
| 4078 |
+
0x10B8000010B92,
|
| 4079 |
+
0x10C0000010C49,
|
| 4080 |
+
0x10CC000010CF3,
|
| 4081 |
+
0x10D0000010D28,
|
| 4082 |
+
0x10D3000010D3A,
|
| 4083 |
+
0x10D4000010D50,
|
| 4084 |
+
0x10D6900010D6E,
|
| 4085 |
+
0x10D6F00010D86,
|
| 4086 |
+
0x10E8000010EAA,
|
| 4087 |
+
0x10EAB00010EAD,
|
| 4088 |
+
0x10EB000010EB2,
|
| 4089 |
+
0x10EC200010EC5,
|
| 4090 |
+
0x10EFC00010F1D,
|
| 4091 |
+
0x10F2700010F28,
|
| 4092 |
+
0x10F3000010F51,
|
| 4093 |
+
0x10F7000010F86,
|
| 4094 |
+
0x10FB000010FC5,
|
| 4095 |
+
0x10FE000010FF7,
|
| 4096 |
+
0x1100000011047,
|
| 4097 |
+
0x1106600011076,
|
| 4098 |
+
0x1107F000110BB,
|
| 4099 |
+
0x110C2000110C3,
|
| 4100 |
+
0x110D0000110E9,
|
| 4101 |
+
0x110F0000110FA,
|
| 4102 |
+
0x1110000011135,
|
| 4103 |
+
0x1113600011140,
|
| 4104 |
+
0x1114400011148,
|
| 4105 |
+
0x1115000011174,
|
| 4106 |
+
0x1117600011177,
|
| 4107 |
+
0x11180000111C5,
|
| 4108 |
+
0x111C9000111CD,
|
| 4109 |
+
0x111CE000111DB,
|
| 4110 |
+
0x111DC000111DD,
|
| 4111 |
+
0x1120000011212,
|
| 4112 |
+
0x1121300011238,
|
| 4113 |
+
0x1123E00011242,
|
| 4114 |
+
0x1128000011287,
|
| 4115 |
+
0x1128800011289,
|
| 4116 |
+
0x1128A0001128E,
|
| 4117 |
+
0x1128F0001129E,
|
| 4118 |
+
0x1129F000112A9,
|
| 4119 |
+
0x112B0000112EB,
|
| 4120 |
+
0x112F0000112FA,
|
| 4121 |
+
0x1130000011304,
|
| 4122 |
+
0x113050001130D,
|
| 4123 |
+
0x1130F00011311,
|
| 4124 |
+
0x1131300011329,
|
| 4125 |
+
0x1132A00011331,
|
| 4126 |
+
0x1133200011334,
|
| 4127 |
+
0x113350001133A,
|
| 4128 |
+
0x1133B00011345,
|
| 4129 |
+
0x1134700011349,
|
| 4130 |
+
0x1134B0001134E,
|
| 4131 |
+
0x1135000011351,
|
| 4132 |
+
0x1135700011358,
|
| 4133 |
+
0x1135D00011364,
|
| 4134 |
+
0x113660001136D,
|
| 4135 |
+
0x1137000011375,
|
| 4136 |
+
0x113800001138A,
|
| 4137 |
+
0x1138B0001138C,
|
| 4138 |
+
0x1138E0001138F,
|
| 4139 |
+
0x11390000113B6,
|
| 4140 |
+
0x113B7000113C1,
|
| 4141 |
+
0x113C2000113C3,
|
| 4142 |
+
0x113C5000113C6,
|
| 4143 |
+
0x113C7000113CB,
|
| 4144 |
+
0x113CC000113D4,
|
| 4145 |
+
0x113E1000113E3,
|
| 4146 |
+
0x114000001144B,
|
| 4147 |
+
0x114500001145A,
|
| 4148 |
+
0x1145E00011462,
|
| 4149 |
+
0x11480000114C6,
|
| 4150 |
+
0x114C7000114C8,
|
| 4151 |
+
0x114D0000114DA,
|
| 4152 |
+
0x11580000115B6,
|
| 4153 |
+
0x115B8000115C1,
|
| 4154 |
+
0x115D8000115DE,
|
| 4155 |
+
0x1160000011641,
|
| 4156 |
+
0x1164400011645,
|
| 4157 |
+
0x116500001165A,
|
| 4158 |
+
0x11680000116B9,
|
| 4159 |
+
0x116C0000116CA,
|
| 4160 |
+
0x116D0000116E4,
|
| 4161 |
+
0x117000001171B,
|
| 4162 |
+
0x1171D0001172C,
|
| 4163 |
+
0x117300001173A,
|
| 4164 |
+
0x1174000011747,
|
| 4165 |
+
0x118000001183B,
|
| 4166 |
+
0x118C0000118EA,
|
| 4167 |
+
0x118FF00011907,
|
| 4168 |
+
0x119090001190A,
|
| 4169 |
+
0x1190C00011914,
|
| 4170 |
+
0x1191500011917,
|
| 4171 |
+
0x1191800011936,
|
| 4172 |
+
0x1193700011939,
|
| 4173 |
+
0x1193B00011944,
|
| 4174 |
+
0x119500001195A,
|
| 4175 |
+
0x119A0000119A8,
|
| 4176 |
+
0x119AA000119D8,
|
| 4177 |
+
0x119DA000119E2,
|
| 4178 |
+
0x119E3000119E5,
|
| 4179 |
+
0x11A0000011A3F,
|
| 4180 |
+
0x11A4700011A48,
|
| 4181 |
+
0x11A5000011A9A,
|
| 4182 |
+
0x11A9D00011A9E,
|
| 4183 |
+
0x11AB000011AF9,
|
| 4184 |
+
0x11BC000011BE1,
|
| 4185 |
+
0x11BF000011BFA,
|
| 4186 |
+
0x11C0000011C09,
|
| 4187 |
+
0x11C0A00011C37,
|
| 4188 |
+
0x11C3800011C41,
|
| 4189 |
+
0x11C5000011C5A,
|
| 4190 |
+
0x11C7200011C90,
|
| 4191 |
+
0x11C9200011CA8,
|
| 4192 |
+
0x11CA900011CB7,
|
| 4193 |
+
0x11D0000011D07,
|
| 4194 |
+
0x11D0800011D0A,
|
| 4195 |
+
0x11D0B00011D37,
|
| 4196 |
+
0x11D3A00011D3B,
|
| 4197 |
+
0x11D3C00011D3E,
|
| 4198 |
+
0x11D3F00011D48,
|
| 4199 |
+
0x11D5000011D5A,
|
| 4200 |
+
0x11D6000011D66,
|
| 4201 |
+
0x11D6700011D69,
|
| 4202 |
+
0x11D6A00011D8F,
|
| 4203 |
+
0x11D9000011D92,
|
| 4204 |
+
0x11D9300011D99,
|
| 4205 |
+
0x11DA000011DAA,
|
| 4206 |
+
0x11EE000011EF7,
|
| 4207 |
+
0x11F0000011F11,
|
| 4208 |
+
0x11F1200011F3B,
|
| 4209 |
+
0x11F3E00011F43,
|
| 4210 |
+
0x11F5000011F5B,
|
| 4211 |
+
0x11FB000011FB1,
|
| 4212 |
+
0x120000001239A,
|
| 4213 |
+
0x1248000012544,
|
| 4214 |
+
0x12F9000012FF1,
|
| 4215 |
+
0x1300000013430,
|
| 4216 |
+
0x1344000013456,
|
| 4217 |
+
0x13460000143FB,
|
| 4218 |
+
0x1440000014647,
|
| 4219 |
+
0x161000001613A,
|
| 4220 |
+
0x1680000016A39,
|
| 4221 |
+
0x16A4000016A5F,
|
| 4222 |
+
0x16A6000016A6A,
|
| 4223 |
+
0x16A7000016ABF,
|
| 4224 |
+
0x16AC000016ACA,
|
| 4225 |
+
0x16AD000016AEE,
|
| 4226 |
+
0x16AF000016AF5,
|
| 4227 |
+
0x16B0000016B37,
|
| 4228 |
+
0x16B4000016B44,
|
| 4229 |
+
0x16B5000016B5A,
|
| 4230 |
+
0x16B6300016B78,
|
| 4231 |
+
0x16B7D00016B90,
|
| 4232 |
+
0x16D4000016D6D,
|
| 4233 |
+
0x16D7000016D7A,
|
| 4234 |
+
0x16E6000016E80,
|
| 4235 |
+
0x16F0000016F4B,
|
| 4236 |
+
0x16F4F00016F88,
|
| 4237 |
+
0x16F8F00016FA0,
|
| 4238 |
+
0x16FE000016FE2,
|
| 4239 |
+
0x16FE300016FE5,
|
| 4240 |
+
0x16FF000016FF2,
|
| 4241 |
+
0x17000000187F8,
|
| 4242 |
+
0x1880000018CD6,
|
| 4243 |
+
0x18CFF00018D09,
|
| 4244 |
+
0x1AFF00001AFF4,
|
| 4245 |
+
0x1AFF50001AFFC,
|
| 4246 |
+
0x1AFFD0001AFFF,
|
| 4247 |
+
0x1B0000001B123,
|
| 4248 |
+
0x1B1320001B133,
|
| 4249 |
+
0x1B1500001B153,
|
| 4250 |
+
0x1B1550001B156,
|
| 4251 |
+
0x1B1640001B168,
|
| 4252 |
+
0x1B1700001B2FC,
|
| 4253 |
+
0x1BC000001BC6B,
|
| 4254 |
+
0x1BC700001BC7D,
|
| 4255 |
+
0x1BC800001BC89,
|
| 4256 |
+
0x1BC900001BC9A,
|
| 4257 |
+
0x1BC9D0001BC9F,
|
| 4258 |
+
0x1CCF00001CCFA,
|
| 4259 |
+
0x1CF000001CF2E,
|
| 4260 |
+
0x1CF300001CF47,
|
| 4261 |
+
0x1DA000001DA37,
|
| 4262 |
+
0x1DA3B0001DA6D,
|
| 4263 |
+
0x1DA750001DA76,
|
| 4264 |
+
0x1DA840001DA85,
|
| 4265 |
+
0x1DA9B0001DAA0,
|
| 4266 |
+
0x1DAA10001DAB0,
|
| 4267 |
+
0x1DF000001DF1F,
|
| 4268 |
+
0x1DF250001DF2B,
|
| 4269 |
+
0x1E0000001E007,
|
| 4270 |
+
0x1E0080001E019,
|
| 4271 |
+
0x1E01B0001E022,
|
| 4272 |
+
0x1E0230001E025,
|
| 4273 |
+
0x1E0260001E02B,
|
| 4274 |
+
0x1E08F0001E090,
|
| 4275 |
+
0x1E1000001E12D,
|
| 4276 |
+
0x1E1300001E13E,
|
| 4277 |
+
0x1E1400001E14A,
|
| 4278 |
+
0x1E14E0001E14F,
|
| 4279 |
+
0x1E2900001E2AF,
|
| 4280 |
+
0x1E2C00001E2FA,
|
| 4281 |
+
0x1E4D00001E4FA,
|
| 4282 |
+
0x1E5D00001E5FB,
|
| 4283 |
+
0x1E7E00001E7E7,
|
| 4284 |
+
0x1E7E80001E7EC,
|
| 4285 |
+
0x1E7ED0001E7EF,
|
| 4286 |
+
0x1E7F00001E7FF,
|
| 4287 |
+
0x1E8000001E8C5,
|
| 4288 |
+
0x1E8D00001E8D7,
|
| 4289 |
+
0x1E9220001E94C,
|
| 4290 |
+
0x1E9500001E95A,
|
| 4291 |
+
0x200000002A6E0,
|
| 4292 |
+
0x2A7000002B73A,
|
| 4293 |
+
0x2B7400002B81E,
|
| 4294 |
+
0x2B8200002CEA2,
|
| 4295 |
+
0x2CEB00002EBE1,
|
| 4296 |
+
0x2EBF00002EE5E,
|
| 4297 |
+
0x300000003134B,
|
| 4298 |
+
0x31350000323B0,
|
| 4299 |
+
),
|
| 4300 |
+
"CONTEXTJ": (0x200C0000200E,),
|
| 4301 |
+
"CONTEXTO": (
|
| 4302 |
+
0xB7000000B8,
|
| 4303 |
+
0x37500000376,
|
| 4304 |
+
0x5F3000005F5,
|
| 4305 |
+
0x6600000066A,
|
| 4306 |
+
0x6F0000006FA,
|
| 4307 |
+
0x30FB000030FC,
|
| 4308 |
+
),
|
| 4309 |
+
}
|
idna/intranges.py
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Given a list of integers, made up of (hopefully) a small number of long runs
|
| 3 |
+
of consecutive integers, compute a representation of the form
|
| 4 |
+
((start1, end1), (start2, end2) ...). Then answer the question "was x present
|
| 5 |
+
in the original list?" in time O(log(# runs)).
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import bisect
|
| 9 |
+
from typing import List, Tuple
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def intranges_from_list(list_: List[int]) -> Tuple[int, ...]:
    """Represent a list of integers as a sequence of ranges:
    ((start_0, end_0), (start_1, end_1), ...), such that the original
    integers are exactly those x such that start_i <= x < end_i for some i.

    Ranges are encoded as single integers (start << 32 | end), not as tuples.

    The input may be unsorted and may contain duplicates; duplicates are
    collapsed so that the resulting ranges never overlap. An empty input
    yields an empty tuple.
    """
    # Deduplicate before scanning: a repeated value is neither equal to
    # "previous + 1" nor a real gap, and would otherwise split a run and
    # produce redundant, overlapping ranges.
    sorted_list = sorted(set(list_))
    ranges = []
    run_start = None
    previous = None
    for value in sorted_list:
        if run_start is None:
            run_start = value
        elif value != previous + 1:
            # Gap found: close the current run (end is exclusive).
            ranges.append(_encode_range(run_start, previous + 1))
            run_start = value
        previous = value
    if run_start is not None:
        # Flush the final run.
        ranges.append(_encode_range(run_start, previous + 1))

    return tuple(ranges)
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def _encode_range(start: int, end: int) -> int:
|
| 35 |
+
return (start << 32) | end
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
def _decode_range(r: int) -> Tuple[int, int]:
|
| 39 |
+
return (r >> 32), (r & ((1 << 32) - 1))
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def intranges_contain(int_: int, ranges: Tuple[int, ...]) -> bool:
    """Report whether ``int_`` lies inside one of the encoded ``ranges``.

    ``ranges`` must be sorted ascending (bisect is used for the lookup).
    """
    # Encoding int_ with an end of 0 gives a probe that is <= any encoded
    # range starting at int_, so bisect_left lands on the first range whose
    # start is >= int_.
    probe = _encode_range(int_, 0)
    index = bisect.bisect_left(ranges, probe)
    # The range just before the insertion point starts below int_ and may
    # still extend far enough to cover it (start <= int_ < end).
    if index > 0:
        start, stop = _decode_range(ranges[index - 1])
        if start <= int_ < stop:
            return True
    # Otherwise only a range beginning exactly at int_ can contain it.
    if index < len(ranges):
        start, _ = _decode_range(ranges[index])
        return start == int_
    return False
|
idna/package_data.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
__version__ = "3.11"
|
idna/py.typed
ADDED
|
File without changes
|
idna/uts46data.py
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
importlib_metadata/__init__.py
ADDED
|
@@ -0,0 +1,1191 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
APIs exposing metadata from third-party Python packages.
|
| 3 |
+
|
| 4 |
+
This codebase is shared between importlib.metadata in the stdlib
|
| 5 |
+
and importlib_metadata in PyPI. See
|
| 6 |
+
https://github.com/python/importlib_metadata/wiki/Development-Methodology
|
| 7 |
+
for more detail.
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
from __future__ import annotations
|
| 11 |
+
|
| 12 |
+
import abc
|
| 13 |
+
import collections
|
| 14 |
+
import email
|
| 15 |
+
import functools
|
| 16 |
+
import itertools
|
| 17 |
+
import operator
|
| 18 |
+
import os
|
| 19 |
+
import pathlib
|
| 20 |
+
import posixpath
|
| 21 |
+
import re
|
| 22 |
+
import sys
|
| 23 |
+
import textwrap
|
| 24 |
+
import types
|
| 25 |
+
from collections.abc import Iterable, Mapping
|
| 26 |
+
from contextlib import suppress
|
| 27 |
+
from importlib import import_module
|
| 28 |
+
from importlib.abc import MetaPathFinder
|
| 29 |
+
from itertools import starmap
|
| 30 |
+
from typing import Any
|
| 31 |
+
|
| 32 |
+
from . import _meta
|
| 33 |
+
from ._collections import FreezableDefaultDict, Pair
|
| 34 |
+
from ._compat import (
|
| 35 |
+
NullFinder,
|
| 36 |
+
install,
|
| 37 |
+
)
|
| 38 |
+
from ._functools import method_cache, noop, pass_none, passthrough
|
| 39 |
+
from ._itertools import always_iterable, bucket, unique_everseen
|
| 40 |
+
from ._meta import PackageMetadata, SimplePath
|
| 41 |
+
from ._typing import md_none
|
| 42 |
+
from .compat import py311
|
| 43 |
+
|
| 44 |
+
__all__ = [
|
| 45 |
+
'Distribution',
|
| 46 |
+
'DistributionFinder',
|
| 47 |
+
'PackageMetadata',
|
| 48 |
+
'PackageNotFoundError',
|
| 49 |
+
'SimplePath',
|
| 50 |
+
'distribution',
|
| 51 |
+
'distributions',
|
| 52 |
+
'entry_points',
|
| 53 |
+
'files',
|
| 54 |
+
'metadata',
|
| 55 |
+
'packages_distributions',
|
| 56 |
+
'requires',
|
| 57 |
+
'version',
|
| 58 |
+
]
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
class PackageNotFoundError(ModuleNotFoundError):
    """Raised when no distribution metadata can be found for a package."""

    @property
    def name(self) -> str:  # type: ignore[override]  # make readonly
        # The exception must carry exactly one argument (the requested
        # name); any other arity makes this unpacking fail.
        [requested] = self.args
        return requested

    def __str__(self) -> str:
        return f"No package metadata was found for {self.name}"
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
class Sectioned:
    """
    Minimal, fast parser for sectioned (INI-like) entry point text.

    >>> for item in Sectioned.read(Sectioned._sample):
    ...     print(item)
    Pair(name='sec1', value='# comments ignored')
    Pair(name='sec1', value='a = 1')
    Pair(name='sec1', value='b = 2')
    Pair(name='sec2', value='a = 2')

    >>> res = Sectioned.section_pairs(Sectioned._sample)
    >>> item = next(res)
    >>> item.name
    'sec1'
    >>> item.value
    Pair(name='a', value='1')
    >>> item = next(res)
    >>> item.value
    Pair(name='b', value='2')
    >>> item = next(res)
    >>> item.name
    'sec2'
    >>> item.value
    Pair(name='a', value='2')
    >>> list(res)
    []
    """

    _sample = textwrap.dedent(
        """
        [sec1]
        # comments ignored
        a = 1
        b = 2

        [sec2]
        a = 2
        """
    ).lstrip()

    @classmethod
    def section_pairs(cls, text):
        """
        Lazily produce one Pair per setting line: ``name`` is the section
        and ``value`` is the line parsed by ``Pair.parse``. Blank lines,
        comments and lines preceding any section header are dropped.
        """
        entries = cls.read(text, filter_=cls.valid)
        return (
            entry._replace(value=Pair.parse(entry.value))
            for entry in entries
            if entry.name is not None
        )

    @staticmethod
    def read(text, filter_=None):
        """
        Yield ``Pair(section, line)`` for each stripped line of ``text``
        accepted by ``filter_``; ``[header]`` lines only switch the current
        section (``None`` until the first header is seen).
        """
        stripped = map(str.strip, text.splitlines())
        current = None
        for line in filter(filter_, stripped):
            if line.startswith('[') and line.endswith(']'):
                current = line.strip('[]')
            else:
                yield Pair(current, line)

    @staticmethod
    def valid(line: str):
        """Truthy for a non-empty line that is not a ``#`` comment."""
        if not line:
            return line
        return not line.startswith('#')
|
| 136 |
+
|
| 137 |
+
|
| 138 |
+
class _EntryPointMatch(types.SimpleNamespace):
    # Namespace holding the named groups of EntryPoint.pattern; it is built
    # by EntryPoint._match from match.groupdict().
    module: str
    # attr and extras come from optional regex groups, so they are None when
    # that part is absent from the entry point value.
    attr: str
    extras: str
|
| 142 |
+
|
| 143 |
+
|
| 144 |
+
class EntryPoint:
    """An entry point as defined by Python packaging conventions.

    See `the packaging docs on entry points
    <https://packaging.python.org/specifications/entry-points/>`_
    for more information.

    >>> ep = EntryPoint(
    ...     name=None, group=None, value='package.module:attr [extra1, extra2]')
    >>> ep.module
    'package.module'
    >>> ep.attr
    'attr'
    >>> ep.extras
    ['extra1', 'extra2']

    If the value package or module are not valid identifiers, a
    ValueError is raised on access.

    >>> EntryPoint(name=None, group=None, value='invalid-name').module
    Traceback (most recent call last):
    ...
    ValueError: ('Invalid object reference...invalid-name...
    >>> EntryPoint(name=None, group=None, value='invalid-name').attr
    Traceback (most recent call last):
    ...
    ValueError: ('Invalid object reference...invalid-name...
    >>> EntryPoint(name=None, group=None, value='invalid-name').extras
    Traceback (most recent call last):
    ...
    ValueError: ('Invalid object reference...invalid-name...

    The same thing happens on construction.

    >>> EntryPoint(name=None, group=None, value='invalid-name')
    Traceback (most recent call last):
    ...
    ValueError: ('Invalid object reference...invalid-name...

    """

    pattern = re.compile(
        r'(?P<module>[\w.]+)\s*'
        r'(:\s*(?P<attr>[\w.]+)\s*)?'
        r'((?P<extras>\[.*\])\s*)?$'
    )
    """
    A regular expression describing the syntax for an entry point,
    which might look like:

        - module
        - package.module
        - package.module:attribute
        - package.module:object.attribute
        - package.module:attr [extra1, extra2]

    Other combinations are possible as well.

    The expression is lenient about whitespace around the ':',
    following the attr, and following any extras.
    """

    name: str
    value: str
    group: str

    dist: Distribution | None = None

    def __init__(self, name: str, value: str, group: str) -> None:
        # __setattr__ is disabled to keep instances immutable, so the fields
        # are written straight into the instance dict.
        vars(self).update(name=name, value=value, group=group)
        # Touch the property to validate ``value`` eagerly (raises ValueError).
        self.module

    def load(self) -> Any:
        """Load the entry point from its definition. If only a module
        is indicated by the value, return that module. Otherwise,
        return the named object.
        """
        module = import_module(self.module)
        # ``attr`` may be None or a dotted path; walk it one attribute at a time.
        attrs = filter(None, (self.attr or '').split('.'))
        return functools.reduce(getattr, attrs, module)

    @property
    def module(self) -> str:
        """The module part of the value."""
        return self._match.module

    @property
    def attr(self) -> str:
        """The attribute part of the value (None when no ':attr' is given)."""
        return self._match.attr

    @property
    def extras(self) -> list[str]:
        """The names listed in the bracketed extras section, if any."""
        return re.findall(r'\w+', self._match.extras or '')

    @functools.cached_property
    def _match(self) -> _EntryPointMatch:
        """Parse ``value`` once against ``pattern``.

        :raises ValueError: when ``value`` does not match the pattern.
        """
        match = self.pattern.match(self.value)
        if not match:
            raise ValueError(
                'Invalid object reference. '
                'See https://packaging.python.org'
                '/en/latest/specifications/entry-points/#data-model',
                self.value,
            )
        return _EntryPointMatch(**match.groupdict())

    def _for(self, dist):
        """Attach ``dist`` to this entry point and return self."""
        vars(self).update(dist=dist)
        return self

    def matches(self, **params):
        """
        EntryPoint matches the given parameters.

        >>> ep = EntryPoint(group='foo', name='bar', value='bing:bong [extra1, extra2]')
        >>> ep.matches(group='foo')
        True
        >>> ep.matches(name='bar', value='bing:bong [extra1, extra2]')
        True
        >>> ep.matches(group='foo', name='other')
        False
        >>> ep.matches()
        True
        >>> ep.matches(extras=['extra1', 'extra2'])
        True
        >>> ep.matches(module='bing')
        True
        >>> ep.matches(attr='bong')
        True
        """
        self._disallow_dist(params)
        attrs = (getattr(self, param) for param in params)
        return all(map(operator.eq, params.values(), attrs))

    @staticmethod
    def _disallow_dist(params):
        """
        Querying by dist is not allowed (dist objects are not comparable).
        >>> EntryPoint(name='fan', value='fav', group='fag').matches(dist='foo')
        Traceback (most recent call last):
        ...
        ValueError: "dist" is not suitable for matching...
        """
        if "dist" in params:
            raise ValueError(
                '"dist" is not suitable for matching. '
                "Instead, use Distribution.entry_points.select() on a "
                "located distribution."
            )

    def _key(self):
        """Tuple used for ordering, equality and hashing."""
        return self.name, self.value, self.group

    def __lt__(self, other):
        # Defer to the other operand when it cannot supply a key, instead of
        # failing with AttributeError. Duck typing is kept on purpose so that
        # compatible entry point classes still compare.
        other_key = getattr(other, '_key', None)
        if other_key is None:
            return NotImplemented
        return self._key() < other_key()

    def __eq__(self, other):
        # Same policy as __lt__: ``ep == None`` evaluates to False rather
        # than raising.
        other_key = getattr(other, '_key', None)
        if other_key is None:
            return NotImplemented
        return self._key() == other_key()

    def __setattr__(self, name, value):
        raise AttributeError("EntryPoint objects are immutable.")

    def __repr__(self):
        return (
            f'EntryPoint(name={self.name!r}, value={self.value!r}, '
            f'group={self.group!r})'
        )

    def __hash__(self) -> int:
        return hash(self._key())
|
| 313 |
+
|
| 314 |
+
|
| 315 |
+
class EntryPoints(tuple):
    """
    Immutable tuple of EntryPoint objects that can be filtered by attribute.
    """

    __slots__ = ()

    def __getitem__(self, name: str) -> EntryPoint:  # type: ignore[override]  # Work with str instead of int
        """
        Return the first EntryPoint in self whose name equals ``name``;
        raise KeyError when there is none.
        """
        candidates = iter(self.select(name=name))
        try:
            return next(candidates)
        except StopIteration:
            raise KeyError(name)

    def __repr__(self):
        """
        Show the class name wrapped around the plain tuple repr, making it
        clear that this is not an ordinary tuple.
        """
        cls_name = self.__class__.__name__
        return f'{cls_name}({tuple(self)!r})'

    def select(self, **params) -> EntryPoints:
        """
        Return the entry points in self for which ``matches(**params)``
        holds (typically filtering on group and/or name).
        """
        matching = (entry for entry in self if entry.matches(**params))
        return EntryPoints(matching)

    @property
    def names(self) -> set[str]:
        """
        The distinct names found among the entry points.
        """
        return set(map(operator.attrgetter('name'), self))

    @property
    def groups(self) -> set[str]:
        """
        The distinct groups found among the entry points.
        """
        return set(map(operator.attrgetter('group'), self))

    @classmethod
    def _from_text_for(cls, text, dist):
        """Parse ``text`` and attach ``dist`` to every resulting entry point."""
        parsed = cls._from_text(text)
        return cls(entry._for(dist) for entry in parsed)

    @staticmethod
    def _from_text(text):
        """Lazily build EntryPoint objects from entry point text (None is treated as empty)."""
        sections = Sectioned.section_pairs(text or '')
        return (
            EntryPoint(name=pair.value.name, value=pair.value.value, group=pair.name)
            for pair in sections
        )
|
| 369 |
+
|
| 370 |
+
|
| 371 |
+
class PackagePath(pathlib.PurePosixPath):
    """A POSIX-style path referring to a file inside a package."""

    hash: FileHash | None
    size: int
    dist: Distribution

    def locate(self) -> SimplePath:
        """Resolve this path via the owning distribution's ``locate_file``."""
        owner = self.dist
        return owner.locate_file(self)

    def read_text(self, encoding: str = 'utf-8') -> str:
        """Return the located file's contents decoded with ``encoding``."""
        located = self.locate()
        return located.read_text(encoding=encoding)

    def read_binary(self) -> bytes:
        """Return the located file's contents as bytes."""
        located = self.locate()
        return located.read_bytes()
|
| 387 |
+
|
| 388 |
+
|
| 389 |
+
class FileHash:
    """Hash specification of the form ``mode=value``."""

    def __init__(self, spec: str) -> None:
        # Split on the first '='; with no '=' present the whole spec becomes
        # the mode and the value is the empty string.
        mode, _sep, value = spec.partition('=')
        self.mode = mode
        self.value = value

    def __repr__(self) -> str:
        return f'<FileHash mode: {self.mode} value: {self.value}>'
|
| 395 |
+
|
| 396 |
+
|
| 397 |
+
class Distribution(metaclass=abc.ABCMeta):
|
| 398 |
+
"""
|
| 399 |
+
An abstract Python distribution package.
|
| 400 |
+
|
| 401 |
+
Custom providers may derive from this class and define
|
| 402 |
+
the abstract methods to provide a concrete implementation
|
| 403 |
+
for their environment. Some providers may opt to override
|
| 404 |
+
the default implementation of some properties to bypass
|
| 405 |
+
the file-reading mechanism.
|
| 406 |
+
"""
|
| 407 |
+
|
| 408 |
+
@abc.abstractmethod
|
| 409 |
+
def read_text(self, filename) -> str | None:
|
| 410 |
+
"""Attempt to load metadata file given by the name.
|
| 411 |
+
|
| 412 |
+
Python distribution metadata is organized by blobs of text
|
| 413 |
+
typically represented as "files" in the metadata directory
|
| 414 |
+
(e.g. package-1.0.dist-info). These files include things
|
| 415 |
+
like:
|
| 416 |
+
|
| 417 |
+
- METADATA: The distribution metadata including fields
|
| 418 |
+
like Name and Version and Description.
|
| 419 |
+
- entry_points.txt: A series of entry points as defined in
|
| 420 |
+
`the entry points spec <https://packaging.python.org/en/latest/specifications/entry-points/#file-format>`_.
|
| 421 |
+
- RECORD: A record of files according to
|
| 422 |
+
`this recording spec <https://packaging.python.org/en/latest/specifications/recording-installed-packages/#the-record-file>`_.
|
| 423 |
+
|
| 424 |
+
A package may provide any set of files, including those
|
| 425 |
+
not listed here or none at all.
|
| 426 |
+
|
| 427 |
+
:param filename: The name of the file in the distribution info.
|
| 428 |
+
:return: The text if found, otherwise None.
|
| 429 |
+
"""
|
| 430 |
+
|
| 431 |
+
@abc.abstractmethod
|
| 432 |
+
def locate_file(self, path: str | os.PathLike[str]) -> SimplePath:
|
| 433 |
+
"""
|
| 434 |
+
Given a path to a file in this distribution, return a SimplePath
|
| 435 |
+
to it.
|
| 436 |
+
|
| 437 |
+
This method is used by callers of ``Distribution.files()`` to
|
| 438 |
+
locate files within the distribution. If it's possible for a
|
| 439 |
+
Distribution to represent files in the distribution as
|
| 440 |
+
``SimplePath`` objects, it should implement this method
|
| 441 |
+
to resolve such objects.
|
| 442 |
+
|
| 443 |
+
Some Distribution providers may elect not to resolve SimplePath
|
| 444 |
+
objects within the distribution by raising a
|
| 445 |
+
NotImplementedError, but consumers of such a Distribution would
|
| 446 |
+
be unable to invoke ``Distribution.files()``.
|
| 447 |
+
"""
|
| 448 |
+
|
| 449 |
+
@classmethod
|
| 450 |
+
def from_name(cls, name: str) -> Distribution:
|
| 451 |
+
"""Return the Distribution for the given package name.
|
| 452 |
+
|
| 453 |
+
:param name: The name of the distribution package to search for.
|
| 454 |
+
:return: The Distribution instance (or subclass thereof) for the named
|
| 455 |
+
package, if found.
|
| 456 |
+
:raises PackageNotFoundError: When the named package's distribution
|
| 457 |
+
metadata cannot be found.
|
| 458 |
+
:raises ValueError: When an invalid value is supplied for name.
|
| 459 |
+
"""
|
| 460 |
+
if not name:
|
| 461 |
+
raise ValueError("A distribution name is required.")
|
| 462 |
+
try:
|
| 463 |
+
return next(iter(cls._prefer_valid(cls.discover(name=name))))
|
| 464 |
+
except StopIteration:
|
| 465 |
+
raise PackageNotFoundError(name)
|
| 466 |
+
|
| 467 |
+
@classmethod
|
| 468 |
+
def discover(
|
| 469 |
+
cls, *, context: DistributionFinder.Context | None = None, **kwargs
|
| 470 |
+
) -> Iterable[Distribution]:
|
| 471 |
+
"""Return an iterable of Distribution objects for all packages.
|
| 472 |
+
|
| 473 |
+
Pass a ``context`` or pass keyword arguments for constructing
|
| 474 |
+
a context.
|
| 475 |
+
|
| 476 |
+
:context: A ``DistributionFinder.Context`` object.
|
| 477 |
+
:return: Iterable of Distribution objects for packages matching
|
| 478 |
+
the context.
|
| 479 |
+
"""
|
| 480 |
+
if context and kwargs:
|
| 481 |
+
raise ValueError("cannot accept context and kwargs")
|
| 482 |
+
context = context or DistributionFinder.Context(**kwargs)
|
| 483 |
+
return itertools.chain.from_iterable(
|
| 484 |
+
resolver(context) for resolver in cls._discover_resolvers()
|
| 485 |
+
)
|
| 486 |
+
|
| 487 |
+
@staticmethod
|
| 488 |
+
def _prefer_valid(dists: Iterable[Distribution]) -> Iterable[Distribution]:
|
| 489 |
+
"""
|
| 490 |
+
Prefer (move to the front) distributions that have metadata.
|
| 491 |
+
|
| 492 |
+
Ref python/importlib_resources#489.
|
| 493 |
+
"""
|
| 494 |
+
buckets = bucket(dists, lambda dist: bool(dist.metadata))
|
| 495 |
+
return itertools.chain(buckets[True], buckets[False])
|
| 496 |
+
|
| 497 |
+
@staticmethod
|
| 498 |
+
def at(path: str | os.PathLike[str]) -> Distribution:
|
| 499 |
+
"""Return a Distribution for the indicated metadata path.
|
| 500 |
+
|
| 501 |
+
:param path: a string or path-like object
|
| 502 |
+
:return: a concrete Distribution instance for the path
|
| 503 |
+
"""
|
| 504 |
+
return PathDistribution(pathlib.Path(path))
|
| 505 |
+
|
| 506 |
+
@staticmethod
|
| 507 |
+
def _discover_resolvers():
|
| 508 |
+
"""Search the meta_path for resolvers (MetadataPathFinders)."""
|
| 509 |
+
declared = (
|
| 510 |
+
getattr(finder, 'find_distributions', None) for finder in sys.meta_path
|
| 511 |
+
)
|
| 512 |
+
return filter(None, declared)
|
| 513 |
+
|
| 514 |
+
@property
|
| 515 |
+
def metadata(self) -> _meta.PackageMetadata | None:
|
| 516 |
+
"""Return the parsed metadata for this Distribution.
|
| 517 |
+
|
| 518 |
+
The returned object will have keys that name the various bits of
|
| 519 |
+
metadata per the
|
| 520 |
+
`Core metadata specifications <https://packaging.python.org/en/latest/specifications/core-metadata/#core-metadata>`_.
|
| 521 |
+
|
| 522 |
+
Custom providers may provide the METADATA file or override this
|
| 523 |
+
property.
|
| 524 |
+
"""
|
| 525 |
+
|
| 526 |
+
text = (
|
| 527 |
+
self.read_text('METADATA')
|
| 528 |
+
or self.read_text('PKG-INFO')
|
| 529 |
+
# This last clause is here to support old egg-info files. Its
|
| 530 |
+
# effect is to just end up using the PathDistribution's self._path
|
| 531 |
+
# (which points to the egg-info file) attribute unchanged.
|
| 532 |
+
or self.read_text('')
|
| 533 |
+
)
|
| 534 |
+
return self._assemble_message(text)
|
| 535 |
+
|
| 536 |
+
@staticmethod
|
| 537 |
+
@pass_none
|
| 538 |
+
def _assemble_message(text: str) -> _meta.PackageMetadata:
|
| 539 |
+
# deferred for performance (python/cpython#109829)
|
| 540 |
+
from . import _adapters
|
| 541 |
+
|
| 542 |
+
return _adapters.Message(email.message_from_string(text))
|
| 543 |
+
|
| 544 |
+
@property
|
| 545 |
+
def name(self) -> str:
|
| 546 |
+
"""Return the 'Name' metadata for the distribution package."""
|
| 547 |
+
return md_none(self.metadata)['Name']
|
| 548 |
+
|
| 549 |
+
@property
|
| 550 |
+
def _normalized_name(self):
|
| 551 |
+
"""Return a normalized version of the name."""
|
| 552 |
+
return Prepared.normalize(self.name)
|
| 553 |
+
|
| 554 |
+
@property
|
| 555 |
+
def version(self) -> str:
|
| 556 |
+
"""Return the 'Version' metadata for the distribution package."""
|
| 557 |
+
return md_none(self.metadata)['Version']
|
| 558 |
+
|
| 559 |
+
@property
|
| 560 |
+
def entry_points(self) -> EntryPoints:
|
| 561 |
+
"""
|
| 562 |
+
Return EntryPoints for this distribution.
|
| 563 |
+
|
| 564 |
+
Custom providers may provide the ``entry_points.txt`` file
|
| 565 |
+
or override this property.
|
| 566 |
+
"""
|
| 567 |
+
return EntryPoints._from_text_for(self.read_text('entry_points.txt'), self)
|
| 568 |
+
|
| 569 |
+
@property
def files(self) -> list[PackagePath] | None:
    """Files in this distribution.

    :return: List of PackagePath for this distribution or None

    Result is `None` if the metadata file that enumerates files
    (i.e. RECORD for dist-info, or installed-files.txt or
    SOURCES.txt for egg-info) is missing.
    Result may be empty if the metadata exists but is empty.

    Custom providers are recommended to provide a "RECORD" file (in
    ``read_text``) or override this property to allow for callers to be
    able to resolve filenames provided by the package.
    """

    def build_entry(name, hash=None, size_str=None):
        # One CSV row -> one PackagePath; hash and size columns are optional.
        entry = PackagePath(name)
        entry.hash = None if not hash else FileHash(hash)
        entry.size = None if not size_str else int(size_str)
        entry.dist = self
        return entry

    @pass_none
    def parse_rows(lines):
        # Delay csv import, since Distribution.files is not as widely used
        # as other parts of importlib.metadata
        import csv

        return starmap(build_entry, csv.reader(lines))

    @pass_none
    def keep_existing(package_paths):
        return [path for path in package_paths if path.locate().exists()]

    # The first source that yields a truthy value wins.
    lines = (
        self._read_files_distinfo()
        or self._read_files_egginfo_installed()
        or self._read_files_egginfo_sources()
    )
    return keep_existing(parse_rows(lines))
|
| 611 |
+
|
| 612 |
+
def _read_files_distinfo(self):
    """
    Read the lines of RECORD.

    A falsy result from ``read_text`` (``None`` or an empty string)
    is returned unchanged.
    """
    text = self.read_text('RECORD')
    if not text:
        return text
    return text.splitlines()
|
| 618 |
+
|
| 619 |
+
def _read_files_egginfo_installed(self):
    """
    Read installed-files.txt and return lines in a similar
    CSV-parsable format as RECORD: each file must be placed
    relative to the site-packages directory and must also be
    quoted (since file names can contain literal commas).

    This file is written when the package is installed by pip,
    but it might not be written for other installation methods.
    Assume the file is accurate if it exists.

    Returns ``None`` when the file has no text or when this
    distribution has no ``_path``.
    """
    text = self.read_text('installed-files.txt')
    # Prepend the .egg-info/ subdir to the lines in this file.
    # But this subdir is only available from PathDistribution's
    # self._path.
    subdir = getattr(self, '_path', None)
    if not (text and subdir):
        return

    def relative_posix(name):
        # Resolve the entry against the metadata dir, then express it
        # relative to the directory returned by ``locate_file('')``.
        resolved = py311.relative_fix((subdir / name).resolve())
        anchor = self.locate_file('').resolve()
        return resolved.relative_to(anchor, walk_up=True).as_posix()

    return map('"{}"'.format, map(relative_posix, text.splitlines()))
|
| 646 |
+
|
| 647 |
+
def _read_files_egginfo_sources(self):
    """
    Read SOURCES.txt and return lines in a similar CSV-parsable
    format as RECORD: each file name must be quoted (since it
    might contain literal commas).

    Note that SOURCES.txt is not a reliable source for what
    files are installed by a package. This file is generated
    for a source archive, and the files that are present
    there (e.g. setup.py) may not correctly reflect the files
    that are present after the package has been installed.

    A falsy result from ``read_text`` is returned unchanged.
    """
    text = self.read_text('SOURCES.txt')
    if not text:
        return text
    return map('"{}"'.format, text.splitlines())
|
| 661 |
+
|
| 662 |
+
@property
|
| 663 |
+
def requires(self) -> list[str] | None:
|
| 664 |
+
"""Generated requirements specified for this Distribution"""
|
| 665 |
+
reqs = self._read_dist_info_reqs() or self._read_egg_info_reqs()
|
| 666 |
+
return reqs and list(reqs)
|
| 667 |
+
|
| 668 |
+
def _read_dist_info_reqs(self):
    """Return every 'Requires-Dist' value from this distribution's metadata."""
    meta = self.metadata
    return meta.get_all('Requires-Dist')
|
| 670 |
+
|
| 671 |
+
def _read_egg_info_reqs(self):
    """Convert the text of ``requires.txt`` into requirement strings."""
    source = self.read_text('requires.txt')
    convert = pass_none(self._deps_from_requires_text)
    return convert(source)
|
| 674 |
+
|
| 675 |
+
@classmethod
def _deps_from_requires_text(cls, source):
    """Split ``source`` into sections and convert them to simple requirements."""
    sections = Sectioned.read(source)
    return cls._convert_egg_info_reqs_to_simple_reqs(sections)
|
| 678 |
+
|
| 679 |
+
@staticmethod
def _convert_egg_info_reqs_to_simple_reqs(sections):
    """
    Historically, setuptools would solicit and store 'extra'
    requirements, including those with environment markers,
    in separate sections. More modern tools expect each
    dependency to be defined separately, with any relevant
    extras and environment markers attached directly to that
    requirement. This method converts the former to the
    latter. See _test_deps_from_requires_text for an example.

    Each item of ``sections`` must expose ``name`` (the section
    header, possibly ``None``) and ``value`` (the requirement).
    Yields one requirement string per item.
    """

    def marker_suffix(section_name):
        # A section header has the form "extra:markers"; either half may be empty.
        extra, _, markers = (section_name or '').partition(':')
        if extra and markers:
            markers = f'({markers})'
        conditions = [markers] if markers else []
        if extra:
            conditions.append(f'extra == "{extra}"')
        if not conditions:
            return ''
        return '; ' + ' and '.join(conditions)

    for section in sections:
        # PEP 508 requires a space between the url_spec and the quoted_marker.
        # Ref python/importlib_metadata#357.
        # '@' is uniquely indicative of a url_req.
        gap = ' ' if '@' in section.value else ''
        yield section.value + gap + marker_suffix(section.name)
|
| 713 |
+
|
| 714 |
+
@property
def origin(self):
    """Return the result of loading ``direct_url.json`` via ``_load_json``."""
    return self._load_json('direct_url.json')
|
| 717 |
+
|
| 718 |
+
def _load_json(self, filename):
    """
    Load ``filename`` from this distribution's metadata as JSON.

    Every JSON object is converted to a ``types.SimpleNamespace``.
    """
    # Deferred for performance (python/importlib_metadata#503)
    import json

    def as_namespace(data):
        return types.SimpleNamespace(**data)

    loads = pass_none(json.loads)
    return loads(self.read_text(filename), object_hook=as_namespace)
|
| 726 |
+
|
| 727 |
+
|
| 728 |
+
class DistributionFinder(MetaPathFinder):
    """
    A MetaPathFinder capable of discovering installed distributions.

    Custom providers should implement this interface in order to
    supply metadata.
    """

    class Context:
        """
        Keyword arguments presented by the caller to
        ``distributions()`` or ``Distribution.discover()``
        to narrow the scope of a search for distributions
        in all DistributionFinders.

        Each DistributionFinder may expect any parameters
        and should attempt to honor the canonical
        parameters defined below when appropriate.

        This mechanism gives a custom provider a means to
        solicit additional details from the caller beyond
        "name" and "path" when searching distributions.
        For example, imagine a provider that exposes suites
        of packages in either a "public" or "private" ``realm``.
        A caller may wish to query only for distributions in
        a particular realm and could call
        ``distributions(realm="private")`` to signal to the
        custom provider to only include distributions from that
        realm.
        """

        name = None
        """
        Specific name for which a distribution finder should match.
        A name of ``None`` matches all distributions.
        """

        def __init__(self, **kwargs):
            # Every keyword becomes an instance attribute.
            self.__dict__.update(kwargs)

        @property
        def path(self) -> list[str]:
            """
            The sequence of directory paths that a distribution finder
            should search.

            Typically refers to Python installed package paths such as
            "site-packages" directories and defaults to ``sys.path``.
            """
            try:
                return self.__dict__['path']
            except KeyError:
                return sys.path

    @abc.abstractmethod
    def find_distributions(self, context=Context()) -> Iterable[Distribution]:
        """
        Find distributions.

        Return an iterable of all Distribution instances capable of
        loading the metadata for packages matching the ``context``,
        a DistributionFinder.Context instance.
        """
|
| 788 |
+
|
| 789 |
+
|
| 790 |
+
@passthrough
def _clear_after_fork(cached):
    """Arrange for ``cached.cache_clear`` to run in the child after ``fork``.

    ``FastPath`` caches zip-backed ``pathlib.Path`` objects that retain a
    reference to the parent's open ``ZipFile`` handle. Re-using a cached
    instance in a forked child can therefore resurrect invalid file pointers
    and trigger ``BadZipFile``/``OSError`` failures (python/importlib_metadata#520).
    Registering ``cache_clear`` with ``os.register_at_fork`` keeps each process
    on its own cache.

    When ``os`` has no ``register_at_fork``, ``noop`` is called instead.
    """
    register = getattr(os, 'register_at_fork', noop)
    register(after_in_child=cached.cache_clear)
|
| 802 |
+
|
| 803 |
+
|
| 804 |
+
class FastPath:
    """
    Micro-optimized class for searching a root for children.

    Root is a path on the file system that may contain metadata
    directories either as natural directories or within a zip file.

    >>> FastPath('').children()
    ['...']

    FastPath objects are cached and recycled for any given root.

    >>> FastPath('foobar') is FastPath('foobar')
    True
    """

    # ``lru_cache`` on ``__new__`` is what recycles instances per root;
    # ``_clear_after_fork`` is applied to the cached callable.
    @_clear_after_fork  # type: ignore[misc]
    @functools.lru_cache()
    def __new__(cls, root):
        return super().__new__(cls)

    def __init__(self, root):
        self.root = root

    def joinpath(self, child):
        """Join ``child`` onto the root as a ``pathlib.Path``."""
        return pathlib.Path(self.root, child)

    def children(self):
        """
        Return the names found directly under the root.

        Tries a directory listing first, then a zip listing; any
        exception from either attempt is suppressed and an empty
        list is the final fallback.
        """
        with suppress(Exception):
            # an empty root means the current directory
            return os.listdir(self.root or '.')
        with suppress(Exception):
            return self.zip_children()
        return []

    def zip_children(self):
        """Return the top-level names of the zip file at the root."""
        # deferred for performance (python/importlib_metadata#502)
        from zipp.compat.overlay import zipfile

        zip_path = zipfile.Path(self.root)
        names = zip_path.root.namelist()
        # From now on this instance joins children through the zip path
        # (the instance attribute shadows the ``joinpath`` method).
        self.joinpath = zip_path.joinpath

        # dict.fromkeys de-duplicates the first path component of each name
        # while keeping first-seen order.
        return dict.fromkeys(child.split(posixpath.sep, 1)[0] for child in names)

    def search(self, name):
        """Search the lookup for the root's current mtime for ``name``."""
        return self.lookup(self.mtime).search(name)

    @property
    def mtime(self):
        """
        Modification time of the root.

        If ``os.stat`` raises ``OSError`` the lookup cache is cleared
        and ``None`` is returned implicitly.
        """
        with suppress(OSError):
            return os.stat(self.root).st_mtime
        self.lookup.cache_clear()

    @method_cache
    def lookup(self, mtime):
        # ``mtime`` is unused in the body; presumably it only serves as the
        # cache key for ``method_cache`` -- TODO confirm against method_cache.
        return Lookup(self)
|
| 860 |
+
|
| 861 |
+
|
| 862 |
+
class Lookup:
    """
    A micro-optimized class for searching a (fast) path for metadata.
    """

    def __init__(self, path: FastPath):
        """
        Calculate all of the children representing metadata.

        From the children in the path, calculate early all of the
        children that appear to represent metadata (infos) or legacy
        metadata (eggs).
        """

        def project_name(filename):
            # rpartition is faster than splitext and suitable for this purpose.
            return filename.rpartition(".")[0].partition("-")[0]

        root_name = os.path.basename(path.root).lower()
        root_is_egg = root_name.endswith(".egg")
        self.infos = FreezableDefaultDict(list)
        self.eggs = FreezableDefaultDict(list)

        for child in path.children():
            lowered = child.lower()
            if lowered.endswith((".dist-info", ".egg-info")):
                key = Prepared.normalize(project_name(lowered))
                self.infos[key].append(path.joinpath(child))
            elif root_is_egg and lowered == "egg-info":
                # For an egg, the project name comes from the root's own basename.
                key = Prepared.legacy_normalize(project_name(root_name))
                self.eggs[key].append(path.joinpath(child))

        self.infos.freeze()
        self.eggs.freeze()

    def search(self, prepared: Prepared):
        """
        Yield all infos and eggs matching the Prepared query.

        A falsy ``prepared`` selects every info and egg.
        """
        if prepared:
            infos = self.infos[prepared.normalized]
            eggs = self.eggs[prepared.legacy_normalized]
        else:
            infos = itertools.chain.from_iterable(self.infos.values())
            eggs = itertools.chain.from_iterable(self.eggs.values())
        return itertools.chain(infos, eggs)
|
| 911 |
+
|
| 912 |
+
|
| 913 |
+
class Prepared:
|
| 914 |
+
"""
|
| 915 |
+
A prepared search query for metadata on a possibly-named package.
|
| 916 |
+
|
| 917 |
+
Pre-calculates the normalization to prevent repeated operations.
|
| 918 |
+
|
| 919 |
+
>>> none = Prepared(None)
|
| 920 |
+
>>> none.normalized
|
| 921 |
+
>>> none.legacy_normalized
|
| 922 |
+
>>> bool(none)
|
| 923 |
+
False
|
| 924 |
+
>>> sample = Prepared('Sample__Pkg-name.foo')
|
| 925 |
+
>>> sample.normalized
|
| 926 |
+
'sample_pkg_name_foo'
|
| 927 |
+
>>> sample.legacy_normalized
|
| 928 |
+
'sample__pkg_name.foo'
|
| 929 |
+
>>> bool(sample)
|
| 930 |
+
True
|
| 931 |
+
"""
|
| 932 |
+
|
| 933 |
+
normalized = None
|
| 934 |
+
legacy_normalized = None
|
| 935 |
+
|
| 936 |
+
def __init__(self, name: str | None):
|
| 937 |
+
self.name = name
|
| 938 |
+
if name is None:
|
| 939 |
+
return
|
| 940 |
+
self.normalized = self.normalize(name)
|
| 941 |
+
self.legacy_normalized = self.legacy_normalize(name)
|
| 942 |
+
|
| 943 |
+
@staticmethod
|
| 944 |
+
def normalize(name):
|
| 945 |
+
"""
|
| 946 |
+
PEP 503 normalization plus dashes as underscores.
|
| 947 |
+
"""
|
| 948 |
+
return re.sub(r"[-_.]+", "-", name).lower().replace('-', '_')
|
| 949 |
+
|
| 950 |
+
@staticmethod
|
| 951 |
+
def legacy_normalize(name):
|
| 952 |
+
"""
|
| 953 |
+
Normalize the package name as found in the convention in
|
| 954 |
+
older packaging tools versions and specs.
|
| 955 |
+
"""
|
| 956 |
+
return name.lower().replace('-', '_')
|
| 957 |
+
|
| 958 |
+
def __bool__(self):
|
| 959 |
+
return bool(self.name)
|
| 960 |
+
|
| 961 |
+
|
| 962 |
+
@install
class MetadataPathFinder(NullFinder, DistributionFinder):
    """A degenerate finder for distribution packages on the file system.

    This finder supplies only a find_distributions() method for versions
    of Python that do not have a PathFinder find_distributions().
    """

    @classmethod
    def find_distributions(
        cls, context=DistributionFinder.Context()
    ) -> Iterable[PathDistribution]:
        """
        Find distributions.

        Return an iterable of all Distribution instances capable of
        loading the metadata for packages matching ``context.name``
        (or all names if ``None`` indicated) along the paths in the list
        of directories ``context.path``.
        """
        metadata_dirs = cls._search_paths(context.name, context.path)
        return map(PathDistribution, metadata_dirs)

    @classmethod
    def _search_paths(cls, name, paths):
        """Find metadata directories in paths heuristically."""
        query = Prepared(name)
        per_root = (FastPath(root).search(query) for root in paths)
        return itertools.chain.from_iterable(per_root)

    @classmethod
    def invalidate_caches(cls) -> None:
        """Clear the cache of recycled ``FastPath`` instances."""
        FastPath.__new__.cache_clear()
|
| 996 |
+
|
| 997 |
+
|
| 998 |
+
class PathDistribution(Distribution):
    def __init__(self, path: SimplePath) -> None:
        """Construct a distribution.

        :param path: SimplePath indicating the metadata directory.
        """
        self._path = path

    def read_text(self, filename: str | os.PathLike[str]) -> str | None:
        try:
            return self._path.joinpath(filename).read_text(encoding='utf-8')
        except (
            FileNotFoundError,
            IsADirectoryError,
            KeyError,
            NotADirectoryError,
            PermissionError,
        ):
            # A file that cannot be read for these reasons is reported as None.
            return None

    read_text.__doc__ = Distribution.read_text.__doc__

    def locate_file(self, path: str | os.PathLike[str]) -> SimplePath:
        # Resolved against the parent of the metadata directory.
        base = self._path.parent
        return base / path

    @property
    def _normalized_name(self):
        """
        Performance optimization: where possible, resolve the
        normalized name from the file system path.
        """
        basename = os.path.basename(str(self._path))
        from_path = pass_none(Prepared.normalize)(self._name_from_stem(basename))
        return from_path or super()._normalized_name

    @staticmethod
    def _name_from_stem(stem):
        """
        >>> PathDistribution._name_from_stem('foo-3.0.egg-info')
        'foo'
        >>> PathDistribution._name_from_stem('CherryPy-3.0.dist-info')
        'CherryPy'
        >>> PathDistribution._name_from_stem('face.egg-info')
        'face'
        >>> PathDistribution._name_from_stem('foo.bar')
        """
        filename, ext = os.path.splitext(stem)
        if ext in ('.dist-info', '.egg-info'):
            # The name is everything before the first dash, if any.
            return filename.partition('-')[0]
        return None
|
| 1051 |
+
|
| 1052 |
+
|
| 1053 |
+
def distribution(distribution_name: str) -> Distribution:
    """Get the ``Distribution`` instance for the named package.

    Thin wrapper around ``Distribution.from_name``; any error raised
    there propagates unchanged.

    :param distribution_name: The name of the distribution package as a string.
    :return: A ``Distribution`` instance (or subclass thereof).
    """
    return Distribution.from_name(distribution_name)
|
| 1060 |
+
|
| 1061 |
+
|
| 1062 |
+
def distributions(**kwargs) -> Iterable[Distribution]:
    """Get all ``Distribution`` instances in the current environment.

    :param kwargs: Forwarded unchanged to ``Distribution.discover``.
    :return: An iterable of ``Distribution`` instances.
    """
    return Distribution.discover(**kwargs)
|
| 1068 |
+
|
| 1069 |
+
|
| 1070 |
+
def metadata(distribution_name: str) -> _meta.PackageMetadata | None:
    """Get the metadata for the named package.

    :param distribution_name: The name of the distribution package to query.
    :return: The ``metadata`` attribute of the distribution found by
        ``Distribution.from_name`` (annotated as possibly ``None``).
    """
    dist = Distribution.from_name(distribution_name)
    return dist.metadata
|
| 1077 |
+
|
| 1078 |
+
|
| 1079 |
+
def version(distribution_name: str) -> str:
    """Get the version string for the named package.

    :param distribution_name: The name of the distribution package to query.
    :return: The version string for the package as defined in the package's
        "Version" metadata key.
    """
    dist = distribution(distribution_name)
    return dist.version
|
| 1087 |
+
|
| 1088 |
+
|
| 1089 |
+
# ``unique_everseen`` pre-bound to key on each item's ``_normalized_name``;
# presumably the first distribution seen for a name is the one kept --
# TODO confirm against unique_everseen.
_unique = functools.partial(
    unique_everseen,
    key=operator.attrgetter('_normalized_name'),
)
"""
Wrapper for ``distributions`` to return unique distributions by name.
"""
|
| 1096 |
+
|
| 1097 |
+
|
| 1098 |
+
def entry_points(**params) -> EntryPoints:
    """Return EntryPoint objects for all installed packages.

    Pass selection parameters (group or name) to filter the
    result to entry points matching those properties (see
    EntryPoints.select()).

    :return: EntryPoints for all installed packages.
    """
    unique_dists = _unique(distributions())
    per_dist = (dist.entry_points for dist in unique_dists)
    combined = itertools.chain.from_iterable(per_dist)
    return EntryPoints(combined).select(**params)
|
| 1111 |
+
|
| 1112 |
+
|
| 1113 |
+
def files(distribution_name: str) -> list[PackagePath] | None:
    """Return a list of files for the named package.

    :param distribution_name: The name of the distribution package to query.
    :return: List of files composing the distribution.
    """
    dist = distribution(distribution_name)
    return dist.files
|
| 1120 |
+
|
| 1121 |
+
|
| 1122 |
+
def requires(distribution_name: str) -> list[str] | None:
    """
    Return a list of requirements for the named package.

    :param distribution_name: The name of the distribution package to query.
    :return: The distribution's ``requires`` value: requirement strings
        suitable for packaging.requirement.Requirement.
    """
    dist = distribution(distribution_name)
    return dist.requires
|
| 1130 |
+
|
| 1131 |
+
|
| 1132 |
+
def packages_distributions() -> Mapping[str, list[str]]:
    """
    Return a mapping of top-level packages to their
    distributions.

    >>> import collections.abc
    >>> pkgs = packages_distributions()
    >>> all(isinstance(dist, collections.abc.Sequence) for dist in pkgs.values())
    True
    """
    mapping = {}
    for dist in distributions():
        # Declared top-level names win; inference is the fallback.
        top_levels = _top_level_declared(dist) or _top_level_inferred(dist)
        for pkg in top_levels:
            mapping.setdefault(pkg, []).append(md_none(dist.metadata)['Name'])
    return mapping
|
| 1147 |
+
|
| 1148 |
+
|
| 1149 |
+
def _top_level_declared(dist):
    """Return the whitespace-separated names in ``top_level.txt``, or ``[]``."""
    text = dist.read_text('top_level.txt')
    return text.split() if text else []
|
| 1151 |
+
|
| 1152 |
+
|
| 1153 |
+
def _topmost(name: PackagePath) -> str | None:
    """
    Return the top-most parent as long as there is a parent.

    :param name: A path object exposing ``parts``.
    :return: The first path component when there are at least two
        components, otherwise ``None``. A path with no components
        at all also yields ``None`` instead of failing to unpack.
    """
    parts = name.parts
    return parts[0] if len(parts) > 1 else None
|
| 1159 |
+
|
| 1160 |
+
|
| 1161 |
+
def _get_toplevel_name(name: PackagePath) -> str:
    """
    Infer a possibly importable module name from a name presumed on
    sys.path.

    >>> _get_toplevel_name(PackagePath('foo.py'))
    'foo'
    >>> _get_toplevel_name(PackagePath('foo'))
    'foo'
    >>> _get_toplevel_name(PackagePath('foo.pyc'))
    'foo'
    >>> _get_toplevel_name(PackagePath('foo/__init__.py'))
    'foo'
    >>> _get_toplevel_name(PackagePath('foo.pth'))
    'foo.pth'
    >>> _get_toplevel_name(PackagePath('foo.dist-info'))
    'foo.dist-info'
    """
    # Defer import of inspect for performance (python/cpython#118761)
    import inspect

    top = _topmost(name)
    if top:
        return top
    module = inspect.getmodulename(name)
    return module if module else str(name)
|
| 1183 |
+
|
| 1184 |
+
|
| 1185 |
+
def _top_level_inferred(dist):
    """Infer top-level names from ``dist.files``, dropping any containing a dot."""
    candidates = {_get_toplevel_name(path) for path in always_iterable(dist.files)}
    return filter(lambda candidate: '.' not in candidate, candidates)
|
importlib_metadata/_adapters.py
ADDED
|
@@ -0,0 +1,136 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import email.message
|
| 2 |
+
import email.policy
|
| 3 |
+
import re
|
| 4 |
+
import textwrap
|
| 5 |
+
|
| 6 |
+
from ._text import FoldedCase
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class RawPolicy(email.policy.EmailPolicy):
    def fold(self, name, value):
        """Render ``name: value``, indenting continuation lines by eight spaces."""
        # Indent every line (blank ones included), then strip the first
        # line's indent so it sits directly after the header name.
        indented = textwrap.indent(value, prefix=' ' * 8, predicate=lambda line: True)
        lines = indented.lstrip().splitlines()
        folded = self.linesep.join(lines)
        return f'{name}: {folded}{self.linesep}'
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
class Message(email.message.Message):
    r"""
    Specialized Message subclass to handle metadata naturally.

    Reads values that may have newlines in them and converts the
    payload to the Description.

    >>> msg_text = textwrap.dedent('''
    ...     Name: Foo
    ...     Version: 3.0
    ...     License: blah
    ...             de-blah
    ...     <BLANKLINE>
    ...     First line of description.
    ...     Second line of description.
    ...     <BLANKLINE>
    ...     Fourth line!
    ...     ''').lstrip().replace('<BLANKLINE>', '')
    >>> msg = Message(email.message_from_string(msg_text))
    >>> msg['Description']
    'First line of description.\nSecond line of description.\n\nFourth line!\n'

    Message should render even if values contain newlines.

    >>> print(msg)
    Name: Foo
    Version: 3.0
    License: blah
            de-blah
    Description: First line of description.
            Second line of description.
    <BLANKLINE>
            Fourth line!
    <BLANKLINE>
    <BLANKLINE>
    """

    multiple_use_keys = set(
        map(
            FoldedCase,
            [
                'Classifier',
                'Obsoletes-Dist',
                'Platform',
                'Project-URL',
                'Provides-Dist',
                'Provides-Extra',
                'Requires-Dist',
                'Requires-External',
                'Supported-Platform',
                'Dynamic',
            ],
        )
    )
    """
    Keys that may be indicated multiple times per PEP 566.
    """

    def __new__(cls, orig: email.message.Message):
        """Create an instance carrying a copy of ``orig``'s instance attributes."""
        res = super().__new__(cls)
        vars(res).update(vars(orig))
        return res

    def __init__(self, *args, **kwargs):
        # The parent initializer is not called; state was already copied
        # from ``orig`` in ``__new__``. Only the headers are rewritten.
        self._headers = self._repair_headers()

    # suppress spurious error from mypy
    def __iter__(self):
        return super().__iter__()

    def __getitem__(self, item):
        """
        Override parent behavior to typical dict behavior.

        ``email.message.Message`` will emit None values for missing
        keys. Typical mappings, including this ``Message``, will raise
        a key error for missing keys.

        Ref python/importlib_metadata#371.
        """
        res = super().__getitem__(item)
        if res is None:
            raise KeyError(item)
        return res

    def _repair_headers(self):
        """
        Return the header list with continuation indentation removed.

        A non-empty payload is appended as a ``Description`` header
        and the payload is then reset to an empty string.
        """

        def redent(value):
            "Correct for RFC822 indentation"
            indent = ' ' * 8
            # Only values that have an indented continuation line are touched.
            if not value or '\n' + indent not in value:
                return value
            # Prefix the first line so that dedent strips all lines evenly.
            return textwrap.dedent(indent + value)

        headers = [(key, redent(value)) for key, value in vars(self)['_headers']]
        if self._payload:
            headers.append(('Description', self.get_payload()))
            self.set_payload('')
        return headers

    def as_string(self):
        """Render the message using ``RawPolicy``."""
        return super().as_string(policy=RawPolicy())

    @property
    def json(self):
        """
        Convert PackageMetadata to a JSON-compatible format
        per PEP 0566.
        """

        def transform(key):
            # Multiple-use keys become lists; every other key is a single value.
            value = self.get_all(key) if key in self.multiple_use_keys else self[key]
            if key == 'Keywords':
                value = re.split(r'\s+', value)
            # JSON keys are lower-case with dashes replaced by underscores.
            tk = key.lower().replace('-', '_')
            return tk, value

        return dict(map(transform, map(FoldedCase, self)))
|
importlib_metadata/_collections.py
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import collections
|
| 2 |
+
import typing
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
# from jaraco.collections 3.3
|
| 6 |
+
class FreezableDefaultDict(collections.defaultdict):
    """
    Often it is desirable to prevent the mutation of
    a default dict after its initial construction, such
    as to prevent mutation during iteration.

    >>> dd = FreezableDefaultDict(list)
    >>> dd[0].append('1')
    >>> dd.freeze()
    >>> dd[1]
    []
    >>> len(dd)
    1
    """

    def __missing__(self, key):
        # Before ``freeze`` the regular defaultdict behaviour applies.
        try:
            handler = self._frozen
        except AttributeError:
            handler = super().__missing__
        return handler(key)

    def freeze(self):
        """Make missing keys return a fresh default without storing it."""

        def produce_default(key):
            return self.default_factory()

        self._frozen = produce_default
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
class Pair(typing.NamedTuple):
    name: str
    value: str

    @classmethod
    def parse(cls, text):
        """Split ``text`` on the first ``=`` and strip each side."""
        pieces = text.split("=", 1)
        return cls(*(piece.strip() for piece in pieces))
|
importlib_metadata/_compat.py
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import platform
|
| 2 |
+
import sys
|
| 3 |
+
|
| 4 |
+
__all__ = ['install', 'NullFinder']
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
def install(cls):
    """
    Class decorator for installation on sys.meta_path.

    Adds the backport DistributionFinder to sys.meta_path and
    attempts to disable the finder functionality of the stdlib
    DistributionFinder.

    Returns ``cls`` unchanged.
    """
    finder = cls()
    sys.meta_path.append(finder)
    disable_stdlib_finder()
    return cls
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def disable_stdlib_finder():
    """
    Give the backport primacy for discovering path-based distributions
    by monkey-patching the stdlib O_O.

    See #91 for more background for rationale on this sketchy
    behavior.
    """
    for finder in sys.meta_path:
        from_stdlib = getattr(finder, '__module__', None) == '_frozen_importlib_external'
        if from_stdlib and hasattr(finder, 'find_distributions'):  # pragma: nocover
            del finder.find_distributions
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
class NullFinder:
    """
    A meta path finder ("MetaPathFinder") whose module lookup always
    comes up empty; it may still find distributions.
    """

    @staticmethod
    def find_spec(*args, **kwargs):
        """Accept any arguments and report that no module was found."""
        return
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
def pypy_partial(val):
    """
    Adjust for variable stacklevel on partial under PyPy.

    Returns *val* plus one when running on PyPy and *val* unchanged on
    any other implementation.

    Workaround for #327.
    """
    return val + (platform.python_implementation() == 'PyPy')
|
importlib_metadata/_functools.py
ADDED
|
@@ -0,0 +1,136 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import functools
|
| 2 |
+
import types
|
| 3 |
+
from collections.abc import Callable
|
| 4 |
+
from typing import TypeVar
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
# from jaraco.functools 3.3
|
| 8 |
+
def method_cache(method, cache_wrapper=None):
    """
    Memoize *method* on each instance rather than on the class.

    On the first call through an instance, the method is bound to that
    instance, wrapped with *cache_wrapper* (``functools.lru_cache()`` when
    none is supplied) and stored on the instance under the method's own
    name, so later calls reach the cached bound method directly.

    >>> class MyClass:
    ...     calls = 0
    ...
    ...     @method_cache
    ...     def method(self, value):
    ...         self.calls += 1
    ...         return value

    >>> a = MyClass()
    >>> a.method(3)
    3
    >>> for x in range(75):
    ...     res = a.method(x)
    >>> a.calls
    75

    The observable behavior matches lru_cache, except that every instance
    owns its cache: values cached for one instance never flush those of
    another, and the cached values disappear together with the instance.

    >>> b = MyClass()
    >>> for x in range(35):
    ...     res = b.method(x)
    >>> b.calls
    35
    >>> a.method(0)
    0
    >>> a.calls
    75

    Had method been decorated with ``functools.lru_cache()`` instead,
    a.calls would have been 76 (the cached value of 0 would have been
    flushed by the 'b' instance).

    Clear the cache with ``.cache_clear()``

    >>> a.method.cache_clear()

    Same for a method that hasn't yet been called.

    >>> c = MyClass()
    >>> c.method.cache_clear()

    Another cache wrapper may be supplied:

    >>> cache = functools.lru_cache(maxsize=2)
    >>> MyClass.method2 = method_cache(lambda self: 3, cache_wrapper=cache)
    >>> a = MyClass()
    >>> a.method2()
    3

    Caution - do not subsequently wrap the method with another decorator, such
    as ``@property``, which changes the semantics of the function.

    See also
    http://code.activestate.com/recipes/577452-a-memoize-decorator-for-instance-methods/
    for another implementation and additional justification.
    """
    if not cache_wrapper:
        cache_wrapper = functools.lru_cache()

    def wrapper(self, *args, **kwargs):
        # First call on this instance: shadow the class-level wrapper with
        # a cached bound method stored in the instance's own attributes.
        memoized = cache_wrapper(types.MethodType(method, self))
        setattr(self, method.__name__, memoized)
        return memoized(*args, **kwargs)

    # Let callers clear the cache even before any cache has been created.
    wrapper.cache_clear = lambda: None

    return wrapper
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
# From jaraco.functools 3.3
|
| 91 |
+
def pass_none(func):
    """
    Wrap func so that it is skipped when its first param is None

    >>> print_text = pass_none(print)
    >>> print_text('text')
    text
    >>> print_text(None)
    """

    @functools.wraps(func)
    def wrapper(param, *args, **kwargs):
        # A None first argument short-circuits to None without calling func.
        if param is None:
            return None
        return func(param, *args, **kwargs)

    return wrapper
|
| 107 |
+
|
| 108 |
+
|
| 109 |
+
# From jaraco.functools 4.4
|
| 110 |
+
def noop(*args, **kwargs):
    """
    Accept any arguments, do nothing, and return None.

    >>> noop(1, 2, three=3)
    """
    return None
|
| 116 |
+
|
| 117 |
+
|
| 118 |
+
_T = TypeVar('_T')
|
| 119 |
+
|
| 120 |
+
|
| 121 |
+
# From jaraco.functools 4.4
|
| 122 |
+
def passthrough(func: Callable[..., object]) -> Callable[[_T], _T]:
    """
    Wrap the function so the call always returns its first parameter.

    The wrapped function is still invoked with every argument; only its
    return value is discarded.

    >>> passthrough(print)('3')
    3
    '3'
    """

    @functools.wraps(func)
    def returning_first(first: _T, *extra, **options) -> _T:
        func(first, *extra, **options)
        return first

    return returning_first  # type: ignore[return-value]
|
importlib_metadata/_itertools.py
ADDED
|
@@ -0,0 +1,171 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from collections import defaultdict, deque
|
| 2 |
+
from itertools import filterfalse
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
def unique_everseen(iterable, key=None):
    "Yield each distinct element once, in first-seen order, remembering all elements ever seen."
    # unique_everseen('AAAABBBCCDAABBB') --> A B C D
    # unique_everseen('ABBCcAD', str.lower) --> A B C D
    seen = set()
    for element in iterable:
        # Without a key function the element itself is the identity marker.
        marker = element if key is None else key(element)
        if marker in seen:
            continue
        seen.add(marker)
        yield element
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
# copied from more_itertools 8.8
|
| 24 |
+
def always_iterable(obj, base_type=(str, bytes)):
    """Return an iterator over *obj*, whatever *obj* happens to be.

    An iterable yields its own items::

        >>> obj = (1, 2, 3)
        >>> list(always_iterable(obj))
        [1, 2, 3]

    A non-iterable is wrapped as a single item::

        >>> obj = 1
        >>> list(always_iterable(obj))
        [1]

    ``None`` produces an empty iterator:

        >>> obj = None
        >>> list(always_iterable(None))
        []

    Text and binary strings are treated as single items by default::

        >>> obj = 'foo'
        >>> list(always_iterable(obj))
        ['foo']

    Any object for which ``isinstance(obj, base_type)`` is ``True`` is
    treated as a single item rather than iterated.

        >>> obj = {'a': 1}
        >>> list(always_iterable(obj))  # Iterate over the dict's keys
        ['a']
        >>> list(always_iterable(obj, base_type=dict))  # Treat dicts as a unit
        [{'a': 1}]

    Pass ``base_type=None`` to disable that special handling and iterate
    everything Python considers iterable:

        >>> obj = 'foo'
        >>> list(always_iterable(obj, base_type=None))
        ['f', 'o', 'o']
    """
    if obj is None:
        return iter(())

    treat_as_unit = base_type is not None and isinstance(obj, base_type)
    if not treat_as_unit:
        try:
            return iter(obj)
        except TypeError:
            # Not iterable after all; fall through to the one-item wrapper.
            pass

    return iter((obj,))
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
# Copied from more_itertools 10.3
|
| 78 |
+
class bucket:
    """Split *iterable* into child iterables selected by a *key* function.

    Indexing the bucket with a key value returns an iterator over the items
    whose key equals that value; iterating the bucket yields the key values.

        >>> iterable = ['a1', 'b1', 'c1', 'a2', 'b2', 'c2', 'b3']
        >>> s = bucket(iterable, key=lambda x: x[0])  # Bucket by 1st character
        >>> sorted(list(s))  # Get the keys
        ['a', 'b', 'c']
        >>> a_iterable = s['a']
        >>> next(a_iterable)
        'a1'
        >>> next(a_iterable)
        'a2'
        >>> list(s['b'])
        ['b1', 'b2', 'b3']

    The source iterable is advanced on demand, and items are held in a
    cache until a child iterable consumes them. This may require
    significant storage.

    By default, selecting a bucket to which no items belong exhausts the
    source iterable and caches every value. When a *validator* function is
    given, selected buckets are checked against it instead.

        >>> from itertools import count
        >>> it = count(1, 2)  # Infinite sequence of odd numbers
        >>> key = lambda x: x % 10  # Bucket by last digit
        >>> validator = lambda x: x in {1, 3, 5, 7, 9}  # Odd digits only
        >>> s = bucket(it, key=key, validator=validator)
        >>> 2 in s
        False
        >>> list(s[2])
        []

    """

    def __init__(self, iterable, key, validator=None):
        self._it = iter(iterable)
        self._key = key
        self._cache = defaultdict(deque)
        # Without a validator every key value is acceptable.
        self._validator = validator or (lambda x: True)

    def __contains__(self, value):
        if not self._validator(value):
            return False

        nothing = object()
        first = next(self[value], nothing)
        if first is nothing:
            return False

        # Put the probed item back at the front so no item is lost.
        self._cache[value].appendleft(first)
        return True

    def _get_values(self, value):
        """
        Yield the source items whose key equals *value*.

        Cached matches are emitted first; non-matching items pulled from
        the source along the way are cached under their own key.
        """
        while True:
            queued = self._cache[value]
            if queued:
                # Emit the oldest cached match and evict it from the cache.
                yield queued.popleft()
                continue

            # Nothing cached: advance the source looking for a match,
            # caching the rest.
            for item in self._it:
                item_value = self._key(item)
                if item_value == value:
                    yield item
                    break
                if self._validator(item_value):
                    self._cache[item_value].append(item)
            else:
                # Source exhausted without another match.
                return

    def __iter__(self):
        # Drain the source into the cache, then report the keys seen.
        for item in self._it:
            item_value = self._key(item)
            if not self._validator(item_value):
                continue
            self._cache[item_value].append(item)

        yield from self._cache.keys()

    def __getitem__(self, value):
        if self._validator(value):
            return self._get_values(value)
        return iter(())
|
importlib_metadata/_meta.py
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import os
|
| 4 |
+
from collections.abc import Iterator
|
| 5 |
+
from typing import (
|
| 6 |
+
Any,
|
| 7 |
+
Protocol,
|
| 8 |
+
TypeVar,
|
| 9 |
+
overload,
|
| 10 |
+
)
|
| 11 |
+
|
| 12 |
+
_T = TypeVar("_T")
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class PackageMetadata(Protocol):
    """
    Structural (duck-typed) interface for package metadata objects.

    Declares length, membership, item access and iteration over string
    keys, ``get``/``get_all`` lookups that take a fallback value, and a
    ``json`` property.
    """

    def __len__(self) -> int: ...  # pragma: no cover

    def __contains__(self, item: str) -> bool: ...  # pragma: no cover

    def __getitem__(self, key: str) -> str: ...  # pragma: no cover

    def __iter__(self) -> Iterator[str]: ...  # pragma: no cover

    # ``get`` is typed to return a string or the ``failobj`` fallback,
    # which defaults to None.
    @overload
    def get(
        self, name: str, failobj: None = None
    ) -> str | None: ...  # pragma: no cover

    @overload
    def get(self, name: str, failobj: _T) -> str | _T: ...  # pragma: no cover

    # overload per python/importlib_metadata#435
    @overload
    def get_all(
        self, name: str, failobj: None = None
    ) -> list[Any] | None: ...  # pragma: no cover

    @overload
    def get_all(self, name: str, failobj: _T) -> list[Any] | _T:
        """
        Return all values associated with a possibly multi-valued key.
        """

    @property
    def json(self) -> dict[str, str | list[str]]:
        """
        A JSON-compatible form of the metadata.
        """
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
class SimplePath(Protocol):
    """
    A minimal subset of pathlib.Path required by Distribution.
    """

    # Both joining forms are typed to return another SimplePath.
    def joinpath(
        self, other: str | os.PathLike[str]
    ) -> SimplePath: ...  # pragma: no cover

    def __truediv__(
        self, other: str | os.PathLike[str]
    ) -> SimplePath: ...  # pragma: no cover

    @property
    def parent(self) -> SimplePath: ...  # pragma: no cover

    # Content accessors: text (with an optional encoding) and raw bytes.
    def read_text(self, encoding=None) -> str: ...  # pragma: no cover

    def read_bytes(self) -> bytes: ...  # pragma: no cover

    def exists(self) -> bool: ...  # pragma: no cover
|
importlib_metadata/_text.py
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
|
| 3 |
+
from ._functools import method_cache
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
# from jaraco.text 3.5
|
| 7 |
+
class FoldedCase(str):
    """
    A string that ignores case in comparisons; otherwise it behaves
    just like str.

    >>> s = FoldedCase('hello world')

    >>> s == 'Hello World'
    True

    >>> 'Hello World' == s
    True

    >>> s != 'Hello World'
    False

    >>> s.index('O')
    4

    >>> s.split('O')
    ['hell', ' w', 'rld']

    >>> sorted(map(FoldedCase, ['GAMMA', 'alpha', 'Beta']))
    ['alpha', 'Beta', 'GAMMA']

    Membership in a sequence works in either direction.

    >>> "Hello World" in [s]
    True
    >>> s in ["Hello World"]
    True

    Set inclusion works too, provided both the candidate and the
    elements are folded.

    >>> FoldedCase("Hello World") in {s}
    True
    >>> s in {FoldedCase("Hello World")}
    True

    Substring tests are case-insensitive when the FoldedCase object
    is on the right.

    >>> "hello" in FoldedCase("Hello World")
    True

    They are not when the FoldedCase object is on the left:

    >>> FoldedCase('hello') in 'Hello World'
    False

    For that situation, use in_:

    >>> FoldedCase('hello').in_('Hello World')
    True

    >>> FoldedCase('hello') > FoldedCase('Hello')
    False
    """

    def __lt__(self, other):
        mine, theirs = self.lower(), other.lower()
        return mine < theirs

    def __gt__(self, other):
        mine, theirs = self.lower(), other.lower()
        return mine > theirs

    def __eq__(self, other):
        mine, theirs = self.lower(), other.lower()
        return mine == theirs

    def __ne__(self, other):
        mine, theirs = self.lower(), other.lower()
        return mine != theirs

    def __hash__(self):
        # Hash the folded form so values equal under case folding collide.
        folded = self.lower()
        return hash(folded)

    def __contains__(self, other):
        haystack = super().lower()
        needle = other.lower()
        return haystack.__contains__(needle)

    def in_(self, other):
        "Does self appear in other?"
        container = FoldedCase(other)
        return self in container

    # cache lower since it's likely to be called frequently.
    @method_cache
    def lower(self):
        return super().lower()

    def index(self, sub):
        folded = self.lower()
        return folded.index(sub.lower())

    def split(self, splitter=' ', maxsplit=0):
        # Split on the literal splitter text, ignoring case.
        return re.split(re.escape(splitter), self, maxsplit=maxsplit, flags=re.I)
|
importlib_metadata/_typing.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import functools
|
| 2 |
+
import typing
|
| 3 |
+
|
| 4 |
+
from ._meta import PackageMetadata
|
| 5 |
+
|
| 6 |
+
# Callable that applies ``typing.cast`` with PackageMetadata as the target
# type to its argument; typing.cast returns the value unchanged at runtime.
md_none = functools.partial(typing.cast, PackageMetadata)
"""
Suppress type errors for optional metadata.

Although Distribution.metadata can return None when metadata is corrupt
and thus None, allow callers to assume it's not None and crash if
that's the case.

# python/importlib_metadata#493
"""
|