salmankhanpm committed on
Commit
ffca8fa
·
verified ·
1 Parent(s): 9b44c7b

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. _cuda_bindings_redirector.py +30 -0
  2. anyio-4.12.1.dist-info/INSTALLER +1 -0
  3. anyio-4.12.1.dist-info/METADATA +96 -0
  4. anyio-4.12.1.dist-info/RECORD +51 -0
  5. anyio-4.12.1.dist-info/REQUESTED +0 -0
  6. anyio-4.12.1.dist-info/WHEEL +5 -0
  7. anyio-4.12.1.dist-info/entry_points.txt +2 -0
  8. anyio-4.12.1.dist-info/top_level.txt +1 -0
  9. dataset-metadata.json +9 -0
  10. datasets/__init__.py +47 -0
  11. datasets/arrow_dataset.py +0 -0
  12. datasets/arrow_reader.py +620 -0
  13. datasets/arrow_writer.py +766 -0
  14. datasets/builder.py +1866 -0
  15. datasets/combine.py +223 -0
  16. datasets/config.py +268 -0
  17. datasets/data_files.py +807 -0
  18. datasets/dataset_dict.py +0 -0
  19. datasets/distributed.py +39 -0
  20. datasets/exceptions.py +119 -0
  21. datasets/fingerprint.py +454 -0
  22. datasets/hub.py +124 -0
  23. datasets/info.py +430 -0
  24. datasets/inspect.py +353 -0
  25. datasets/iterable_dataset.py +0 -0
  26. datasets/keyhash.py +104 -0
  27. datasets/load.py +1481 -0
  28. datasets/naming.py +84 -0
  29. datasets/search.py +785 -0
  30. datasets/splits.py +635 -0
  31. datasets/streaming.py +131 -0
  32. datasets/table.py +2385 -0
  33. idna/__init__.py +45 -0
  34. idna/codec.py +122 -0
  35. idna/compat.py +15 -0
  36. idna/core.py +437 -0
  37. idna/idnadata.py +4309 -0
  38. idna/intranges.py +57 -0
  39. idna/package_data.py +1 -0
  40. idna/py.typed +0 -0
  41. idna/uts46data.py +0 -0
  42. importlib_metadata/__init__.py +1191 -0
  43. importlib_metadata/_adapters.py +136 -0
  44. importlib_metadata/_collections.py +34 -0
  45. importlib_metadata/_compat.py +56 -0
  46. importlib_metadata/_functools.py +136 -0
  47. importlib_metadata/_itertools.py +171 -0
  48. importlib_metadata/_meta.py +71 -0
  49. importlib_metadata/_text.py +99 -0
  50. importlib_metadata/_typing.py +15 -0
_cuda_bindings_redirector.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
3
+
4
+ import sys
5
+ from types import ModuleType
6
+
7
+
8
+ # Make sure 'cuda' is importable as a namespace package
9
+ import cuda
10
+
11
+
12
+ class LazyCudaModule(ModuleType):
13
+
14
+ def __getattr__(self, name):
15
+ if name == '__version__':
16
+ import warnings
17
+ warnings.warn(
18
+ "accessing cuda.__version__ is deprecated, " "please switch to use cuda.bindings.__version__ instead",
19
+ FutureWarning,
20
+ stacklevel=2,
21
+ )
22
+ from cuda.bindings import __version__
23
+
24
+ return __version__
25
+
26
+ raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
27
+
28
+
29
+ # Patch in LazyCudaModule for `cuda`
30
+ sys.modules['cuda'].__class__ = LazyCudaModule
anyio-4.12.1.dist-info/INSTALLER ADDED
@@ -0,0 +1 @@
 
 
1
+ uv
anyio-4.12.1.dist-info/METADATA ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Metadata-Version: 2.4
2
+ Name: anyio
3
+ Version: 4.12.1
4
+ Summary: High-level concurrency and networking framework on top of asyncio or Trio
5
+ Author-email: Alex Grönholm <alex.gronholm@nextday.fi>
6
+ License-Expression: MIT
7
+ Project-URL: Documentation, https://anyio.readthedocs.io/en/latest/
8
+ Project-URL: Changelog, https://anyio.readthedocs.io/en/stable/versionhistory.html
9
+ Project-URL: Source code, https://github.com/agronholm/anyio
10
+ Project-URL: Issue tracker, https://github.com/agronholm/anyio/issues
11
+ Classifier: Development Status :: 5 - Production/Stable
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: Framework :: AnyIO
14
+ Classifier: Typing :: Typed
15
+ Classifier: Programming Language :: Python
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.9
18
+ Classifier: Programming Language :: Python :: 3.10
19
+ Classifier: Programming Language :: Python :: 3.11
20
+ Classifier: Programming Language :: Python :: 3.12
21
+ Classifier: Programming Language :: Python :: 3.13
22
+ Classifier: Programming Language :: Python :: 3.14
23
+ Requires-Python: >=3.9
24
+ Description-Content-Type: text/x-rst
25
+ License-File: LICENSE
26
+ Requires-Dist: exceptiongroup>=1.0.2; python_version < "3.11"
27
+ Requires-Dist: idna>=2.8
28
+ Requires-Dist: typing_extensions>=4.5; python_version < "3.13"
29
+ Provides-Extra: trio
30
+ Requires-Dist: trio>=0.32.0; python_version >= "3.10" and extra == "trio"
31
+ Requires-Dist: trio>=0.31.0; python_version < "3.10" and extra == "trio"
32
+ Dynamic: license-file
33
+
34
+ .. image:: https://github.com/agronholm/anyio/actions/workflows/test.yml/badge.svg
35
+ :target: https://github.com/agronholm/anyio/actions/workflows/test.yml
36
+ :alt: Build Status
37
+ .. image:: https://coveralls.io/repos/github/agronholm/anyio/badge.svg?branch=master
38
+ :target: https://coveralls.io/github/agronholm/anyio?branch=master
39
+ :alt: Code Coverage
40
+ .. image:: https://readthedocs.org/projects/anyio/badge/?version=latest
41
+ :target: https://anyio.readthedocs.io/en/latest/?badge=latest
42
+ :alt: Documentation
43
+ .. image:: https://badges.gitter.im/gitterHQ/gitter.svg
44
+ :target: https://gitter.im/python-trio/AnyIO
45
+ :alt: Gitter chat
46
+
47
+ AnyIO is an asynchronous networking and concurrency library that works on top of either asyncio_ or
48
+ Trio_. It implements Trio-like `structured concurrency`_ (SC) on top of asyncio and works in harmony
49
+ with the native SC of Trio itself.
50
+
51
+ Applications and libraries written against AnyIO's API will run unmodified on either asyncio_ or
52
+ Trio_. AnyIO can also be adopted into a library or application incrementally – bit by bit, no full
53
+ refactoring necessary. It will blend in with the native libraries of your chosen backend.
54
+
55
+ To find out why you might want to use AnyIO's APIs instead of asyncio's, you can read about it
56
+ `here <https://anyio.readthedocs.io/en/stable/why.html>`_.
57
+
58
+ Documentation
59
+ -------------
60
+
61
+ View full documentation at: https://anyio.readthedocs.io/
62
+
63
+ Features
64
+ --------
65
+
66
+ AnyIO offers the following functionality:
67
+
68
+ * Task groups (nurseries_ in trio terminology)
69
+ * High-level networking (TCP, UDP and UNIX sockets)
70
+
71
+ * `Happy eyeballs`_ algorithm for TCP connections (more robust than that of asyncio on Python
72
+ 3.8)
73
+ * async/await style UDP sockets (unlike asyncio where you still have to use Transports and
74
+ Protocols)
75
+
76
+ * A versatile API for byte streams and object streams
77
+ * Inter-task synchronization and communication (locks, conditions, events, semaphores, object
78
+ streams)
79
+ * Worker threads
80
+ * Subprocesses
81
+ * Subinterpreter support for code parallelization (on Python 3.13 and later)
82
+ * Asynchronous file I/O (using worker threads)
83
+ * Signal handling
84
+ * Asynchronous version of the functools_ module
85
+
86
+ AnyIO also comes with its own pytest_ plugin which also supports asynchronous fixtures.
87
+ It even works with the popular Hypothesis_ library.
88
+
89
+ .. _asyncio: https://docs.python.org/3/library/asyncio.html
90
+ .. _Trio: https://github.com/python-trio/trio
91
+ .. _structured concurrency: https://en.wikipedia.org/wiki/Structured_concurrency
92
+ .. _nurseries: https://trio.readthedocs.io/en/stable/reference-core.html#nurseries-and-spawning
93
+ .. _Happy eyeballs: https://en.wikipedia.org/wiki/Happy_Eyeballs
94
+ .. _pytest: https://docs.pytest.org/en/latest/
95
+ .. _functools: https://docs.python.org/3/library/functools.html
96
+ .. _Hypothesis: https://hypothesis.works/
anyio-4.12.1.dist-info/RECORD ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ anyio-4.12.1.dist-info/INSTALLER,sha256=5hhM4Q4mYTT9z6QB6PGpUAW81PGNFrYrdXMj4oM_6ak,2
2
+ anyio-4.12.1.dist-info/METADATA,sha256=DfiDab9Tmmcfy802lOLTMEHJQShkOSbopCwqCYbLuJk,4277
3
+ anyio-4.12.1.dist-info/RECORD,,
4
+ anyio-4.12.1.dist-info/REQUESTED,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
+ anyio-4.12.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
6
+ anyio-4.12.1.dist-info/entry_points.txt,sha256=_d6Yu6uiaZmNe0CydowirE9Cmg7zUL2g08tQpoS3Qvc,39
7
+ anyio-4.12.1.dist-info/licenses/LICENSE,sha256=U2GsncWPLvX9LpsJxoKXwX8ElQkJu8gCO9uC6s8iwrA,1081
8
+ anyio-4.12.1.dist-info/top_level.txt,sha256=QglSMiWX8_5dpoVAEIHdEYzvqFMdSYWmCj6tYw2ITkQ,6
9
+ anyio/__init__.py,sha256=7iDVqMUprUuKNY91FuoKqayAhR-OY136YDPI6P78HHk,6170
10
+ anyio/_backends/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
11
+ anyio/_backends/_asyncio.py,sha256=xG6qv60mgGnL0mK82dxjH2b8hlkMlJ-x2BqIq3qv70Y,98863
12
+ anyio/_backends/_trio.py,sha256=30Rctb7lm8g63ZHljVPVnj5aH-uK6oQvphjwUBoAzuI,41456
13
+ anyio/_core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
14
+ anyio/_core/_asyncio_selector_thread.py,sha256=2PdxFM3cs02Kp6BSppbvmRT7q7asreTW5FgBxEsflBo,5626
15
+ anyio/_core/_contextmanagers.py,sha256=YInBCabiEeS-UaP_Jdxa1CaFC71ETPW8HZTHIM8Rsc8,7215
16
+ anyio/_core/_eventloop.py,sha256=c2EdcBX-xnKwxPcC4Pjn3_qG9I-x4IWFO2R9RqCGjM4,6448
17
+ anyio/_core/_exceptions.py,sha256=Y3aq-Wxd7Q2HqwSg7nZPvRsHEuGazv_qeet6gqEBdPk,4407
18
+ anyio/_core/_fileio.py,sha256=uc7t10Vb-If7GbdWM_zFf-ajUe6uek63fSt7IBLlZW0,25731
19
+ anyio/_core/_resources.py,sha256=NbmU5O5UX3xEyACnkmYX28Fmwdl-f-ny0tHym26e0w0,435
20
+ anyio/_core/_signals.py,sha256=mjTBB2hTKNPRlU0IhnijeQedpWOGERDiMjSlJQsFrug,1016
21
+ anyio/_core/_sockets.py,sha256=RBXHcUqZt5gg_-OOfgHVv8uq2FSKk1uVUzTdpjBoI1o,34977
22
+ anyio/_core/_streams.py,sha256=FczFwIgDpnkK0bODWJXMpsUJYdvAD04kaUaGzJU8DK0,1806
23
+ anyio/_core/_subprocesses.py,sha256=EXm5igL7dj55iYkPlbYVAqtbqxJxjU-6OndSTIx9SRg,8047
24
+ anyio/_core/_synchronization.py,sha256=MgVVqFzvt580tHC31LiOcq1G6aryut--xRG4Ff8KwxQ,20869
25
+ anyio/_core/_tasks.py,sha256=pVB7K6AAulzUM8YgXAeqNZG44nSyZ1bYJjH8GznC00I,5435
26
+ anyio/_core/_tempfile.py,sha256=lHb7CW4FyIlpkf5ADAf4VmLHCKwEHF9nxqNyBCFFUiA,19697
27
+ anyio/_core/_testing.py,sha256=u7MPqGXwpTxqI7hclSdNA30z2GH1Nw258uwKvy_RfBg,2340
28
+ anyio/_core/_typedattr.py,sha256=P4ozZikn3-DbpoYcvyghS_FOYAgbmUxeoU8-L_07pZM,2508
29
+ anyio/abc/__init__.py,sha256=6mWhcl_pGXhrgZVHP_TCfMvIXIOp9mroEFM90fYCU_U,2869
30
+ anyio/abc/_eventloop.py,sha256=GlzgB3UJGgG6Kr7olpjOZ-o00PghecXuofVDQ_5611Q,10749
31
+ anyio/abc/_resources.py,sha256=DrYvkNN1hH6Uvv5_5uKySvDsnknGVDe8FCKfko0VtN8,783
32
+ anyio/abc/_sockets.py,sha256=ECTY0jLEF18gryANHR3vFzXzGdZ-xPwELq1QdgOb0Jo,13258
33
+ anyio/abc/_streams.py,sha256=005GKSCXGprxnhucILboSqc2JFovECZk9m3p-qqxXVc,7640
34
+ anyio/abc/_subprocesses.py,sha256=cumAPJTktOQtw63IqG0lDpyZqu_l1EElvQHMiwJgL08,2067
35
+ anyio/abc/_tasks.py,sha256=KC7wrciE48AINOI-AhPutnFhe1ewfP7QnamFlDzqesQ,3721
36
+ anyio/abc/_testing.py,sha256=tBJUzkSfOXJw23fe8qSJ03kJlShOYjjaEyFB6k6MYT8,1821
37
+ anyio/from_thread.py,sha256=L-0w1HxJ6BSb-KuVi57k5Tkc3yzQrx3QK5tAxMPcY-0,19141
38
+ anyio/functools.py,sha256=HWj7GBEmc0Z-mZg3uok7Z7ZJn0rEC_0Pzbt0nYUDaTQ,10973
39
+ anyio/lowlevel.py,sha256=AyKLVK3LaWSoK39LkCKxE4_GDMLKZBNqTrLUgk63y80,5158
40
+ anyio/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
41
+ anyio/pytest_plugin.py,sha256=3jAFQn0jv_pyoWE2GBBlHaj9sqXj4e8vob0_hgrsXE8,10244
42
+ anyio/streams/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
43
+ anyio/streams/buffered.py,sha256=2R3PeJhe4EXrdYqz44Y6-Eg9R6DrmlsYrP36Ir43-po,6263
44
+ anyio/streams/file.py,sha256=4WZ7XGz5WNu39FQHvqbe__TQ0HDP9OOhgO1mk9iVpVU,4470
45
+ anyio/streams/memory.py,sha256=F0zwzvFJKAhX_LRZGoKzzqDC2oMM-f-yyTBrEYEGOaU,10740
46
+ anyio/streams/stapled.py,sha256=T8Xqwf8K6EgURPxbt1N4i7A8BAk-gScv-GRhjLXIf_o,4390
47
+ anyio/streams/text.py,sha256=BcVAGJw1VRvtIqnv-o0Rb0pwH7p8vwlvl21xHq522ag,5765
48
+ anyio/streams/tls.py,sha256=Jpxy0Mfbcp1BxHCwE-YjSSFaLnIBbnnwur-excYThs4,15368
49
+ anyio/to_interpreter.py,sha256=_mLngrMy97TMR6VbW4Y6YzDUk9ZuPcQMPlkuyRh3C9k,7100
50
+ anyio/to_process.py,sha256=J7gAA_YOuoHqnpDAf5fm1Qu6kOmTzdFbiDNvnV755vk,9798
51
+ anyio/to_thread.py,sha256=menEgXYmUV7Fjg_9WqCV95P9MAtQS8BzPGGcWB_QnfQ,2687
anyio-4.12.1.dist-info/REQUESTED ADDED
File without changes
anyio-4.12.1.dist-info/WHEEL ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (80.9.0)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
anyio-4.12.1.dist-info/entry_points.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ [pytest11]
2
+ anyio = anyio.pytest_plugin
anyio-4.12.1.dist-info/top_level.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ anyio
dataset-metadata.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "title": "mamba-packages",
3
+ "id": "pmsalmankhan/mamba-packages",
4
+ "licenses": [
5
+ {
6
+ "name": "CC0-1.0"
7
+ }
8
+ ]
9
+ }
datasets/__init__.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2020 The HuggingFace Datasets Authors and the TensorFlow Datasets Authors.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ __version__ = "4.3.0"
16
+
17
+ from .arrow_dataset import Column, Dataset
18
+ from .arrow_reader import ReadInstruction
19
+ from .builder import ArrowBasedBuilder, BuilderConfig, DatasetBuilder, GeneratorBasedBuilder
20
+ from .combine import concatenate_datasets, interleave_datasets
21
+ from .dataset_dict import DatasetDict, IterableDatasetDict
22
+ from .download import *
23
+ from .features import *
24
+ from .fingerprint import disable_caching, enable_caching, is_caching_enabled
25
+ from .info import DatasetInfo
26
+ from .inspect import (
27
+ get_dataset_config_info,
28
+ get_dataset_config_names,
29
+ get_dataset_default_config_name,
30
+ get_dataset_infos,
31
+ get_dataset_split_names,
32
+ )
33
+ from .iterable_dataset import IterableColumn, IterableDataset
34
+ from .load import load_dataset, load_dataset_builder, load_from_disk
35
+ from .splits import (
36
+ NamedSplit,
37
+ NamedSplitAll,
38
+ Split,
39
+ SplitBase,
40
+ SplitDict,
41
+ SplitGenerator,
42
+ SplitInfo,
43
+ SubSplitInfo,
44
+ percent,
45
+ )
46
+ from .utils import *
47
+ from .utils import logging
datasets/arrow_dataset.py ADDED
The diff for this file is too large to render. See raw diff
 
datasets/arrow_reader.py ADDED
@@ -0,0 +1,620 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2020 The HuggingFace Datasets Authors and the TensorFlow Datasets Authors.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ # Lint as: python3
16
+ """Arrow ArrowReader."""
17
+
18
+ import copy
19
+ import math
20
+ import os
21
+ import re
22
+ from dataclasses import dataclass
23
+ from functools import partial
24
+ from typing import TYPE_CHECKING, Optional, Union
25
+
26
+ import pyarrow as pa
27
+ import pyarrow.parquet as pq
28
+ from tqdm.contrib.concurrent import thread_map
29
+
30
+ from .download.download_config import DownloadConfig # noqa: F401
31
+ from .naming import _split_re, filenames_for_dataset_split
32
+ from .table import InMemoryTable, MemoryMappedTable, Table, concat_tables
33
+ from .utils import logging
34
+ from .utils import tqdm as hf_tqdm
35
+
36
+
37
+ if TYPE_CHECKING:
38
+ from .info import DatasetInfo # noqa: F401
39
+ from .splits import Split, SplitInfo # noqa: F401
40
+
41
+
42
+ logger = logging.get_logger(__name__)
43
+
44
+ HF_GCP_BASE_URL = "https://storage.googleapis.com/huggingface-nlp/cache/datasets"
45
+
46
+ _SUB_SPEC_RE = re.compile(
47
+ rf"""
48
+ ^
49
+ (?P<split>{_split_re[1:-1]})
50
+ (\[
51
+ ((?P<from>-?[\d_]+)
52
+ (?P<from_pct>%)?)?
53
+ :
54
+ ((?P<to>-?[\d_]+)
55
+ (?P<to_pct>%)?)?
56
+ \])?(\((?P<rounding>[^\)]*)\))?
57
+ $
58
+ """, # remove ^ and $
59
+ re.X,
60
+ )
61
+
62
+ _ADDITION_SEP_RE = re.compile(r"\s*\+\s*")
63
+
64
+
65
+ class DatasetNotOnHfGcsError(ConnectionError):
66
+ """When you can't get the dataset from the Hf google cloud storage"""
67
+
68
+ pass
69
+
70
+
71
+ class MissingFilesOnHfGcsError(ConnectionError):
72
+ """When some files are missing on the HF Google Cloud Storage"""
73
+
74
+ pass
75
+
76
+
77
+ @dataclass(frozen=True)
78
+ class FileInstructions:
79
+ """The file instructions associated with a split ReadInstruction.
80
+
81
+ Attributes:
82
+ num_examples: `int`, The total number of examples
83
+ file_instructions: List[dict(filename, skip, take)], the files information.
84
+ The filenames contain the relative path, not absolute.
85
+ skip/take indicate which examples to read in the file: `ds.slice(skip, take)`
86
+ """
87
+
88
+ num_examples: int
89
+ file_instructions: list[dict]
90
+
91
+
92
+ def make_file_instructions(
93
+ name: str,
94
+ split_infos: list["SplitInfo"],
95
+ instruction: Union[str, "ReadInstruction"],
96
+ filetype_suffix: Optional[str] = None,
97
+ prefix_path: Optional[str] = None,
98
+ ) -> FileInstructions:
99
+ """Returns instructions of the split dict.
100
+
101
+ Args:
102
+ name (`str`): Name of the dataset.
103
+ split_infos (`list` of `[SplitInfo]`): Dataset splits information.
104
+ instruction ([`ReadInstruction`] or `str`): Reading instruction for a dataset.
105
+ filetype_suffix (`str`, *optional*): Suffix of dataset files, e.g. 'arrow' or 'parquet'.
106
+ prefix_path (`str`, *optional*): Prefix of dataset files, e.g. directory name.
107
+
108
+ Returns:
109
+ [`FileInstructions`]
110
+ """
111
+ if not isinstance(name, str):
112
+ raise TypeError(f"Expected str 'name', but got: {type(name).__name__}")
113
+ elif not name:
114
+ raise ValueError("Expected non-empty str 'name'")
115
+ name2len = {info.name: info.num_examples for info in split_infos}
116
+ name2shard_lengths = {info.name: info.shard_lengths for info in split_infos}
117
+ name2filenames = {
118
+ info.name: filenames_for_dataset_split(
119
+ path=prefix_path,
120
+ dataset_name=name,
121
+ split=info.name,
122
+ filetype_suffix=filetype_suffix,
123
+ shard_lengths=name2shard_lengths[info.name],
124
+ )
125
+ for info in split_infos
126
+ }
127
+ if not isinstance(instruction, ReadInstruction):
128
+ instruction = ReadInstruction.from_spec(instruction)
129
+ # Create the absolute instruction (per split)
130
+ absolute_instructions = instruction.to_absolute(name2len)
131
+
132
+ # For each split, return the files instruction (skip/take)
133
+ file_instructions = []
134
+ num_examples = 0
135
+ for abs_instr in absolute_instructions:
136
+ split_length = name2len[abs_instr.splitname]
137
+ filenames = name2filenames[abs_instr.splitname]
138
+ shard_lengths = name2shard_lengths[abs_instr.splitname]
139
+ from_ = 0 if abs_instr.from_ is None else abs_instr.from_
140
+ to = split_length if abs_instr.to is None else abs_instr.to
141
+ if shard_lengths is None: # not sharded
142
+ for filename in filenames:
143
+ take = to - from_
144
+ if take == 0:
145
+ continue
146
+ num_examples += take
147
+ file_instructions.append({"filename": filename, "skip": from_, "take": take})
148
+ else: # sharded
149
+ index_start = 0 # Beginning (included) of moving window.
150
+ index_end = 0 # End (excluded) of moving window.
151
+ for filename, shard_length in zip(filenames, shard_lengths):
152
+ index_end += shard_length
153
+ if from_ < index_end and to > index_start: # There is something to take.
154
+ skip = from_ - index_start if from_ > index_start else 0
155
+ take = to - index_start - skip if to < index_end else -1
156
+ if take == 0:
157
+ continue
158
+ file_instructions.append({"filename": filename, "skip": skip, "take": take})
159
+ num_examples += shard_length - skip if take == -1 else take
160
+ index_start += shard_length
161
+ return FileInstructions(
162
+ num_examples=num_examples,
163
+ file_instructions=file_instructions,
164
+ )
165
+
166
+
167
+ class BaseReader:
168
+ """
169
+ Build a Dataset object out of Instruction instance(s).
170
+ """
171
+
172
+ def __init__(self, path: str, info: Optional["DatasetInfo"]):
173
+ """Initializes BaseReader.
174
+
175
+ Args:
176
+ path (str): path where the dataset files are stored.
177
+ info (DatasetInfo): info about the dataset.
178
+ """
179
+ self._path: str = path
180
+ self._info: Optional["DatasetInfo"] = info
181
+ self._filetype_suffix: Optional[str] = None
182
+
183
+ def _get_table_from_filename(self, filename_skip_take, in_memory=False) -> Table:
184
+ """Returns a Dataset instance from given (filename, skip, take)."""
185
+ raise NotImplementedError
186
+
187
+ def _read_files(self, files, in_memory=False) -> Table:
188
+ """Returns Dataset for given file instructions.
189
+
190
+ Args:
191
+ files: List[dict(filename, skip, take)], the files information.
192
+ The filenames contain the absolute path, not relative.
193
+ skip/take indicate which examples to read in the file: `ds.slice(skip, take)`
194
+ in_memory (bool, default False): Whether to copy the data in-memory.
195
+ """
196
+ if len(files) == 0 or not all(isinstance(f, dict) for f in files):
197
+ raise ValueError("please provide valid file informations")
198
+ files = copy.deepcopy(files)
199
+ for f in files:
200
+ f["filename"] = os.path.join(self._path, f["filename"])
201
+
202
+ pa_tables = thread_map(
203
+ partial(self._get_table_from_filename, in_memory=in_memory),
204
+ files,
205
+ tqdm_class=hf_tqdm,
206
+ desc="Loading dataset shards",
207
+ # set `disable=None` rather than `disable=False` by default to disable progress bar when no TTY attached
208
+ disable=len(files) <= 16 or None,
209
+ )
210
+ pa_tables = [t for t in pa_tables if len(t) > 0]
211
+ if not pa_tables and (self._info is None or self._info.features is None):
212
+ raise ValueError(
213
+ "Tried to read an empty table. Please specify at least info.features to create an empty table with the right type."
214
+ )
215
+ pa_tables = pa_tables or [InMemoryTable.from_batches([], schema=pa.schema(self._info.features.type))]
216
+ pa_table = concat_tables(pa_tables) if len(pa_tables) != 1 else pa_tables[0]
217
+ return pa_table
218
+
219
+ def get_file_instructions(self, name, instruction, split_infos):
220
+ """Return list of dict {'filename': str, 'skip': int, 'take': int}"""
221
+ file_instructions = make_file_instructions(
222
+ name, split_infos, instruction, filetype_suffix=self._filetype_suffix, prefix_path=self._path
223
+ )
224
+ files = file_instructions.file_instructions
225
+ return files
226
+
227
+ def read(
228
+ self,
229
+ name,
230
+ instructions,
231
+ split_infos,
232
+ in_memory=False,
233
+ ):
234
+ """Returns Dataset instance(s).
235
+
236
+ Args:
237
+ name (str): name of the dataset.
238
+ instructions (ReadInstruction): instructions to read.
239
+ Instruction can be string and will then be passed to the Instruction
240
+ constructor as is.
241
+ split_infos (list of SplitInfo proto): the available splits for dataset.
242
+ in_memory (bool, default False): Whether to copy the data in-memory.
243
+
244
+ Returns:
245
+ kwargs to build a single Dataset instance.
246
+ """
247
+
248
+ files = self.get_file_instructions(name, instructions, split_infos)
249
+ if not files:
250
+ msg = f'Instruction "{instructions}" corresponds to no data!'
251
+ raise ValueError(msg)
252
+ return self.read_files(files=files, original_instructions=instructions, in_memory=in_memory)
253
+
254
+ def read_files(
255
+ self,
256
+ files: list[dict],
257
+ original_instructions: Union[None, "ReadInstruction", "Split"] = None,
258
+ in_memory=False,
259
+ ):
260
+ """Returns single Dataset instance for the set of file instructions.
261
+
262
+ Args:
263
+ files: List[dict(filename, skip, take)], the files information.
264
+ The filenames contain the relative path, not absolute.
265
+ skip/take indicate which examples to read in the file: `ds.skip().take()`
266
+ original_instructions: store the original instructions used to build the dataset split in the dataset.
267
+ in_memory (bool, default False): Whether to copy the data in-memory.
268
+
269
+ Returns:
270
+ kwargs to build a Dataset instance.
271
+ """
272
+ # Prepend path to filename
273
+ pa_table = self._read_files(files, in_memory=in_memory)
274
+ # If original_instructions is not None, convert it to a human-readable NamedSplit
275
+ if original_instructions is not None:
276
+ from .splits import Split # noqa
277
+
278
+ split = Split(str(original_instructions))
279
+ else:
280
+ split = None
281
+ dataset_kwargs = {"arrow_table": pa_table, "info": self._info, "split": split}
282
+ return dataset_kwargs
283
+
284
+
285
+ class ArrowReader(BaseReader):
286
+ """
287
+ Build a Dataset object out of Instruction instance(s).
288
+ This Reader uses either memory mapping or file descriptors (in-memory) on arrow files.
289
+ """
290
+
291
+ def __init__(self, path: str, info: Optional["DatasetInfo"]):
292
+ """Initializes ArrowReader.
293
+
294
+ Args:
295
+ path (str): path where Arrow files are stored.
296
+ info (DatasetInfo): info about the dataset.
297
+ """
298
+ super().__init__(path, info)
299
+ self._filetype_suffix = "arrow"
300
+
301
+ def _get_table_from_filename(self, filename_skip_take, in_memory=False) -> Table:
302
+ """Returns a Dataset instance from given (filename, skip, take)."""
303
+ filename, skip, take = (
304
+ filename_skip_take["filename"],
305
+ filename_skip_take["skip"] if "skip" in filename_skip_take else None,
306
+ filename_skip_take["take"] if "take" in filename_skip_take else None,
307
+ )
308
+ table = ArrowReader.read_table(filename, in_memory=in_memory)
309
+ if take == -1:
310
+ take = len(table) - skip
311
+ # here we don't want to slice an empty table, or it may segfault
312
+ if skip is not None and take is not None and not (skip == 0 and take == len(table)):
313
+ table = table.slice(skip, take)
314
+ return table
315
+
316
+ @staticmethod
317
+ def read_table(filename, in_memory=False) -> Table:
318
+ """
319
+ Read table from file.
320
+
321
+ Args:
322
+ filename (str): File name of the table.
323
+ in_memory (bool, default=False): Whether to copy the data in-memory.
324
+
325
+ Returns:
326
+ pyarrow.Table
327
+ """
328
+ table_cls = InMemoryTable if in_memory else MemoryMappedTable
329
+ return table_cls.from_file(filename)
330
+
331
+
332
+ class ParquetReader(BaseReader):
333
+ """
334
+ Build a Dataset object out of Instruction instance(s).
335
+ This Reader uses memory mapping on parquet files.
336
+ """
337
+
338
+ def __init__(self, path: str, info: Optional["DatasetInfo"]):
339
+ """Initializes ParquetReader.
340
+
341
+ Args:
342
+ path (str): path where Parquet files are stored.
343
+ info (DatasetInfo): info about the dataset.
344
+ """
345
+ super().__init__(path, info)
346
+ self._filetype_suffix = "parquet"
347
+
348
+ def _get_table_from_filename(self, filename_skip_take, **kwargs):
349
+ """Returns a Dataset instance from given (filename, skip, take)."""
350
+ filename, skip, take = (
351
+ filename_skip_take["filename"],
352
+ filename_skip_take["skip"] if "skip" in filename_skip_take else None,
353
+ filename_skip_take["take"] if "take" in filename_skip_take else None,
354
+ )
355
+ # Parquet read_table always loads data in memory, independently of memory_map
356
+ pa_table = pq.read_table(filename, memory_map=True)
357
+ # here we don't want to slice an empty table, or it may segfault
358
+ if skip is not None and take is not None and not (skip == 0 and take == len(pa_table)):
359
+ pa_table = pa_table.slice(skip, take)
360
+ return pa_table
361
+
362
+
363
+ @dataclass(frozen=True)
364
+ class _AbsoluteInstruction:
365
+ """A machine friendly slice: defined absolute positive boundaries."""
366
+
367
+ splitname: str
368
+ from_: int # uint (starting index).
369
+ to: int # uint (ending index).
370
+
371
+
372
+ @dataclass(frozen=True)
373
+ class _RelativeInstruction:
374
+ """Represents a single parsed slicing instruction, can use % and negatives."""
375
+
376
+ splitname: str
377
+ from_: Optional[int] = None # int (starting index) or None if no lower boundary.
378
+ to: Optional[int] = None # int (ending index) or None if no upper boundary.
379
+ unit: Optional[str] = None
380
+ rounding: Optional[str] = None
381
+
382
+ def __post_init__(self):
383
+ if self.unit is not None and self.unit not in ["%", "abs"]:
384
+ raise ValueError("unit must be either % or abs")
385
+ if self.rounding is not None and self.rounding not in ["closest", "pct1_dropremainder"]:
386
+ raise ValueError("rounding must be either closest or pct1_dropremainder")
387
+ if self.unit != "%" and self.rounding is not None:
388
+ raise ValueError("It is forbidden to specify rounding if not using percent slicing.")
389
+ if self.unit == "%" and self.from_ is not None and abs(self.from_) > 100:
390
+ raise ValueError("Percent slice boundaries must be > -100 and < 100.")
391
+ if self.unit == "%" and self.to is not None and abs(self.to) > 100:
392
+ raise ValueError("Percent slice boundaries must be > -100 and < 100.")
393
+ # Update via __dict__ due to instance being "frozen"
394
+ self.__dict__["rounding"] = "closest" if self.rounding is None and self.unit == "%" else self.rounding
395
+
396
+
397
def _str_to_read_instruction(spec):
    """Build a ReadInstruction from a single sub-spec string such as `test[:33%]`.

    The string is matched against `_SUB_SPEC_RE`; the unit is `"%"` as soon as
    one of the two boundaries is written as a percentage, `"abs"` otherwise.

    Raises:
        ValueError: if `spec` does not match the expected format.
    """
    match = _SUB_SPEC_RE.match(spec)
    if not match:
        raise ValueError(f"Unrecognized instruction format: {spec}")
    lower = match.group("from")
    upper = match.group("to")
    uses_percent = match.group("from_pct") or match.group("to_pct")
    return ReadInstruction(
        split_name=match.group("split"),
        rounding=match.group("rounding"),
        from_=int(lower) if lower else None,
        to=int(upper) if upper else None,
        unit="%" if uses_percent else "abs",
    )
410
+
411
+
412
def _pct_to_abs_pct1(boundary, num_examples):
    """Convert a percent boundary to an absolute index, in whole multiples of 1%.

    The size of "1%" is `num_examples / 100` truncated to an integer, and the
    result is `boundary` times that size.

    Raises:
        ValueError: if the split has fewer than 100 examples.
    """
    if num_examples < 100:
        raise ValueError(
            'Using "pct1_dropremainder" rounding on a split with less than 100 '
            "elements is forbidden: it always results in an empty dataset."
        )
    # Using math.trunc here, since -99.5% should give -99%, not -100%.
    one_percent = math.trunc(num_examples / 100.0)
    return boundary * one_percent
421
+
422
+
423
def _pct_to_abs_closest(boundary, num_examples):
    """Convert a percent boundary to the closest absolute index (built-in `round`)."""
    exact_position = boundary * num_examples / 100.0
    return int(round(exact_position))
425
+
426
+
427
def _rel_to_abs_instr(rel_instr, name2len):
    """Resolve a relative instruction into an `_AbsoluteInstruction`.

    Percent boundaries are converted to indices using the rounding mode of the
    instruction, missing boundaries default to the start / end of the split,
    negative indices count from the end, and both bounds are finally clamped
    to `[0, num_examples]`.

    Args:
        rel_instr: RelativeInstruction instance.
        name2len: dict {split_name: num_examples}.

    Raises:
        ValueError: if the split of `rel_instr` is not a key of `name2len`.
    """
    split = rel_instr.splitname
    if split not in name2len:
        raise ValueError(f'Unknown split "{split}". Should be one of {list(name2len)}.')
    num_examples = name2len[split]

    start, stop = rel_instr.from_, rel_instr.to
    if rel_instr.unit == "%":
        to_abs = _pct_to_abs_closest if rel_instr.rounding == "closest" else _pct_to_abs_pct1
        start = 0 if start is None else to_abs(start, num_examples)
        stop = num_examples if stop is None else to_abs(stop, num_examples)
    else:
        if start is None:
            start = 0
        if stop is None:
            stop = num_examples

    def clamp(index):
        # Negative indices are relative to the end of the split.
        if index < 0:
            index = max(num_examples + index, 0)
        return min(index, num_examples)

    return _AbsoluteInstruction(split, clamp(start), clamp(stop))
454
+
455
+
456
class ReadInstruction:
    """Reading instruction for a dataset.

    Examples::

      # The following lines are equivalent:
      ds = datasets.load_dataset('mnist', split='test[:33%]')
      ds = datasets.load_dataset('mnist', split=datasets.ReadInstruction.from_spec('test[:33%]'))
      ds = datasets.load_dataset('mnist', split=datasets.ReadInstruction('test', to=33, unit='%'))
      ds = datasets.load_dataset('mnist', split=datasets.ReadInstruction(
          'test', from_=0, to=33, unit='%'))

      # The following lines are equivalent:
      ds = datasets.load_dataset('mnist', split='test[:33%]+train[1:-1]')
      ds = datasets.load_dataset('mnist', split=datasets.ReadInstruction.from_spec(
          'test[:33%]+train[1:-1]'))
      ds = datasets.load_dataset('mnist', split=(
          datasets.ReadInstruction('test', to=33, unit='%') +
          datasets.ReadInstruction('train', from_=1, to=-1, unit='abs')))

      # The following lines are equivalent:
      ds = datasets.load_dataset('mnist', split='test[:33%](pct1_dropremainder)')
      ds = datasets.load_dataset('mnist', split=datasets.ReadInstruction.from_spec(
          'test[:33%](pct1_dropremainder)'))
      ds = datasets.load_dataset('mnist', split=datasets.ReadInstruction(
          'test', from_=0, to=33, unit='%', rounding="pct1_dropremainder"))

      # 10-fold validation:
      tests = datasets.load_dataset(
          'mnist',
          [datasets.ReadInstruction('train', from_=k, to=k+10, unit='%')
           for k in range(0, 100, 10)])
      trains = datasets.load_dataset(
          'mnist',
          [datasets.ReadInstruction('train', to=k, unit='%') + datasets.ReadInstruction('train', from_=k+10, unit='%')
           for k in range(0, 100, 10)])

    """

    def _init(self, relative_instructions):
        # Private initializer.
        self._relative_instructions = relative_instructions

    @classmethod
    def _read_instruction_from_relative_instructions(cls, relative_instructions):
        """Returns ReadInstruction obj initialized with relative_instructions."""
        # Use __new__ to bypass __init__ used by public API and not convenient here.
        result = cls.__new__(cls)
        result._init(relative_instructions)  # pylint: disable=protected-access
        return result

    def __init__(self, split_name, rounding=None, from_=None, to=None, unit=None):
        """Initialize ReadInstruction.

        Args:
            split_name (str): name of the split to read. Eg: 'train'.
            rounding (str, optional): The rounding behaviour to use when percent slicing is
                used. Ignored when slicing with absolute indices.
                Possible values:
                 - 'closest' (default): The specified percentages are rounded to the
                     closest value. Use this if you want specified percents to be as
                     much exact as possible.
                 - 'pct1_dropremainder': the specified percentages are treated as
                     multiple of 1%. Use this option if you want consistency. Eg:
                         len(5%) == 5 * len(1%).
                     Using this option, one might not be able to use the full set of
                     examples, if the number of those is not a multiple of 100.
            from_ (int):
            to (int): alternative way of specifying slicing boundaries. If any of
                {from_, to, unit} argument is used, slicing cannot be specified as
                string.
            unit (str): optional, one of:
                '%': to set the slicing unit as percents of the split size.
                'abs': to set the slicing unit as absolute numbers.
        """
        # This constructor is not always called. See factory method
        # `_read_instruction_from_relative_instructions`. Common init instructions
        # MUST be placed in the _init method.
        self._init([_RelativeInstruction(split_name, from_, to, unit, rounding)])

    @classmethod
    def from_spec(cls, spec):
        """Creates a `ReadInstruction` instance out of a string spec.

        Args:
            spec (`str`):
                Split(s) + optional slice(s) to read + optional rounding
                if percents are used as the slicing unit. A slice can be specified,
                using absolute numbers (`int`) or percentages (`int`).

        Examples:

            ```
            test: test split.
            test + validation: test split + validation split.
            test[10:]: test split, minus its first 10 records.
            test[:10%]: first 10% records of test split.
            test[:20%](pct1_dropremainder): first 20% records, rounded with the pct1_dropremainder rounding.
            test[:-5%]+train[40%:60%]: first 95% of test + middle 20% of train.
            ```

        Returns:
            ReadInstruction instance.

        Raises:
            ValueError: if no instruction can be built out of `spec`.
        """
        spec = str(spec)  # Need to convert to str in case of NamedSplit instance.
        subs = _ADDITION_SEP_RE.split(spec)
        if not subs:
            raise ValueError(f"No instructions could be built out of {spec}")
        instruction = _str_to_read_instruction(subs[0])
        return sum((_str_to_read_instruction(sub) for sub in subs[1:]), instruction)

    def to_spec(self):
        """Return the string spec of this instruction, e.g. `test[:33%]+train[1:-1]`.

        A split with no boundary is rendered as its bare name. The rounding suffix
        is only written for percent slices whose rounding is not `closest`.
        """
        rel_instr_specs = []
        for rel_instr in self._relative_instructions:
            rel_instr_spec = rel_instr.splitname
            if rel_instr.from_ is not None or rel_instr.to is not None:
                from_ = rel_instr.from_
                to = rel_instr.to
                unit = rel_instr.unit
                rounding = rel_instr.rounding
                unit = unit if unit == "%" else ""
                from_ = str(from_) + unit if from_ is not None else ""
                to = str(to) + unit if to is not None else ""
                slice_str = f"[{from_}:{to}]"
                rounding_str = (
                    f"({rounding})" if unit == "%" and rounding is not None and rounding != "closest" else ""
                )
                rel_instr_spec += slice_str + rounding_str
            rel_instr_specs.append(rel_instr_spec)
        return "+".join(rel_instr_specs)

    def __add__(self, other):
        """Returns a new ReadInstruction obj, result of appending other to self.

        Raises:
            TypeError: if `other` is not a ReadInstruction.
            ValueError: if the first instruction of both operands uses percent
                slicing with different rounding values.
        """
        if not isinstance(other, ReadInstruction):
            msg = "ReadInstruction can only be added to another ReadInstruction obj."
            raise TypeError(msg)
        self_ris = self._relative_instructions
        other_ris = other._relative_instructions  # pylint: disable=protected-access
        # Rounding only exists for percent slices. An instruction without a unit
        # (whole split) has rounding None and must stay summable with a percent slice.
        if (
            self_ris[0].unit == "%"
            and other_ris[0].unit == "%"
            and self_ris[0].rounding != other_ris[0].rounding
        ):
            raise ValueError("It is forbidden to sum ReadInstruction instances with different rounding values.")
        return self._read_instruction_from_relative_instructions(self_ris + other_ris)

    def __str__(self):
        return self.to_spec()

    def __repr__(self):
        return f"ReadInstruction({self._relative_instructions})"

    def to_absolute(self, name2len):
        """Translate instruction into a list of absolute instructions.

        Those absolute instructions are then to be added together.

        Args:
            name2len (`dict`):
                Associating split names to number of examples.

        Returns:
            list of _AbsoluteInstruction instances (corresponds to the + in spec).
        """
        return [_rel_to_abs_instr(rel_instr, name2len) for rel_instr in self._relative_instructions]
datasets/arrow_writer.py ADDED
@@ -0,0 +1,766 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2020 The HuggingFace Datasets Authors and the TensorFlow Datasets Authors.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # Unless required by applicable law or agreed to in writing, software
8
+ # distributed under the License is distributed on an "AS IS" BASIS,
9
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10
+ # See the License for the specific language governing permissions and
11
+ # limitations under the License.
12
+
13
+ # Lint as: python3
14
+ """To write records into Parquet files."""
15
+
16
+ import json
17
+ import sys
18
+ from collections.abc import Iterable
19
+ from typing import Any, Optional, Union
20
+
21
+ import fsspec
22
+ import numpy as np
23
+ import pyarrow as pa
24
+ import pyarrow.parquet as pq
25
+ from fsspec.core import url_to_fs
26
+
27
+ from . import config
28
+ from .features import Audio, Features, Image, Pdf, Value, Video
29
+ from .features.features import (
30
+ FeatureType,
31
+ List,
32
+ _ArrayXDExtensionType,
33
+ _visit,
34
+ cast_to_python_objects,
35
+ generate_from_arrow_type,
36
+ get_nested_type,
37
+ list_of_np_array_to_pyarrow_listarray,
38
+ numpy_to_pyarrow_listarray,
39
+ to_pyarrow_listarray,
40
+ )
41
+ from .filesystems import is_remote_filesystem
42
+ from .info import DatasetInfo
43
+ from .keyhash import DuplicatedKeysError, KeyHasher
44
+ from .table import array_cast, cast_array_to_feature, embed_table_storage, table_cast
45
+ from .utils import logging
46
+ from .utils.py_utils import asdict, convert_file_size_to_int, first_non_null_non_empty_value
47
+
48
+
49
+ logger = logging.get_logger(__name__)
50
+
51
+ type_ = type # keep python's type function
52
+
53
+
54
def get_arrow_writer_batch_size_from_features(features: Optional[Features]) -> Optional[int]:
    """
    Get the writer_batch_size that defines the maximum record batch size in the arrow files based on configuration values.
    The default value is 100 for image/audio datasets and 10 for videos.
    This allows to avoid overflows in arrow buffers.

    Every feature is visited with `_visit`; the smallest applicable
    `config.ARROW_RECORD_BATCH_SIZE_FOR_*_DATASETS` value wins.

    Args:
        features (`datasets.Features` or `None`):
            Dataset Features from `datasets`.
    Returns:
        writer_batch_size (`Optional[int]`):
            Writer batch size to pass to a dataset builder.
            If `None`, then it will use the `datasets` default, i.e. `datasets.config.DEFAULT_MAX_BATCH_SIZE`.
    """
    if not features:
        return None

    batch_size = np.inf

    def set_batch_size(feature: FeatureType) -> None:
        nonlocal batch_size
        # (predicate, configured limit) pairs, tried in order; the first pair whose
        # predicate holds AND whose limit is configured is applied, then we stop.
        limits_by_kind = (
            (lambda: isinstance(feature, Image), config.ARROW_RECORD_BATCH_SIZE_FOR_IMAGE_DATASETS),
            (lambda: isinstance(feature, Audio), config.ARROW_RECORD_BATCH_SIZE_FOR_AUDIO_DATASETS),
            (lambda: isinstance(feature, Video), config.ARROW_RECORD_BATCH_SIZE_FOR_VIDEO_DATASETS),
            (
                lambda: isinstance(feature, Value) and feature.dtype == "binary",
                config.ARROW_RECORD_BATCH_SIZE_FOR_BINARY_DATASETS,
            ),
        )
        for is_kind, limit in limits_by_kind:
            if is_kind() and limit is not None:
                batch_size = min(batch_size, limit)
                break

    _visit(features, set_batch_size)

    # Still the initial sentinel object: no feature imposed a limit.
    return None if batch_size is np.inf else batch_size
91
+
92
+
93
def get_writer_batch_size_from_features(features: Optional[Features]) -> Optional[int]:
    """
    Get the writer_batch_size that defines the maximum row group size in the parquet files based on configuration values.
    By default these are not set, but it can be helpful to hard set those values in some cases.
    This allows to optimize random access to parquet file, since accessing 1 row requires
    to read its entire row group.

    Every feature is visited with `_visit`; the smallest applicable
    `config.PARQUET_ROW_GROUP_SIZE_FOR_*_DATASETS` value wins.

    Args:
        features (`datasets.Features` or `None`):
            Dataset Features from `datasets`.
    Returns:
        writer_batch_size (`Optional[int]`):
            Writer batch size to pass to a parquet writer.
            If `None`, then it will use the `datasets` default, i.e. aiming for row groups of 100MB.
    """
    if not features:
        return None

    batch_size = np.inf

    def set_batch_size(feature: FeatureType) -> None:
        nonlocal batch_size
        if isinstance(feature, Image) and config.PARQUET_ROW_GROUP_SIZE_FOR_IMAGE_DATASETS is not None:
            limit = config.PARQUET_ROW_GROUP_SIZE_FOR_IMAGE_DATASETS
        elif isinstance(feature, Audio) and config.PARQUET_ROW_GROUP_SIZE_FOR_AUDIO_DATASETS is not None:
            limit = config.PARQUET_ROW_GROUP_SIZE_FOR_AUDIO_DATASETS
        elif isinstance(feature, Video) and config.PARQUET_ROW_GROUP_SIZE_FOR_VIDEO_DATASETS is not None:
            limit = config.PARQUET_ROW_GROUP_SIZE_FOR_VIDEO_DATASETS
        elif (
            isinstance(feature, Value)
            and feature.dtype == "binary"
            and config.PARQUET_ROW_GROUP_SIZE_FOR_BINARY_DATASETS is not None
        ):
            limit = config.PARQUET_ROW_GROUP_SIZE_FOR_BINARY_DATASETS
        else:
            # No configured limit applies to this feature.
            return
        batch_size = min(batch_size, limit)

    _visit(features, set_batch_size)

    # Still the initial sentinel object: no feature imposed a limit.
    return None if batch_size is np.inf else batch_size
131
+
132
+
133
def get_writer_batch_size_from_data_size(num_rows: int, num_bytes: int) -> int:
    """
    Get the writer_batch_size that defines the maximum row group size in the parquet files.
    The default in `datasets` is aiming for row groups of maximum 100MB uncompressed.
    This allows to optimize random access to parquet file, since accessing 1 row requires
    to read its entire row group.

    This can be improved to get optimized size for querying/iterating
    but at least it matches the dataset viewer expectations on HF.

    Args:
        num_rows (`int`):
            Number of rows in the dataset.
        num_bytes (`int`):
            Number of bytes in the dataset.
            For dataset with external files to embed (image, audio, videos), this can also be an
            estimate from `dataset._estimate_nbytes()`.
    Returns:
        writer_batch_size (`int`):
            Writer batch size to pass to a parquet writer: at least 10 rows when
            `num_bytes` is positive, and 1 when it is not.
    """
    if not num_bytes > 0:
        return 1
    max_row_group_bytes = convert_file_size_to_int(config.MAX_ROW_GROUP_SIZE)
    rows_per_group = num_rows * max_row_group_bytes // num_bytes
    return max(10, rows_per_group)
155
+
156
+
157
class SchemaInferenceError(ValueError):
    """`ValueError` subclass dedicated to schema inference failures."""
159
+
160
+
161
class TypedSequence:
    """
    This data container generalizes the typing when instantiating pyarrow arrays, tables or batches.

    More specifically it adds several features:
    - Support extension types like ``datasets.features.Array2DExtensionType``:
        By default pyarrow arrays don't return extension arrays. One has to call
        ``pa.ExtensionArray.from_storage(type, pa.array(data, type.storage_type))``
        in order to get an extension array.
    - Support for ``try_type`` parameter that can be used instead of ``type``:
        When an array is transformed, we like to keep the same type as before if possible.
        For example when calling :func:`datasets.Dataset.map`, we don't want to change the type
        of each column by default.
    - Better error message when a pyarrow array overflows.

    Example::

        from datasets.features import Array2D, Array2DExtensionType, Value
        from datasets.arrow_writer import TypedSequence
        import pyarrow as pa

        arr = pa.array(TypedSequence([1, 2, 3], type=Value("int32")))
        assert arr.type == pa.int32()

        arr = pa.array(TypedSequence([1, 2, 3], try_type=Value("int32")))
        assert arr.type == pa.int32()

        arr = pa.array(TypedSequence(["foo", "bar"], try_type=Value("int32")))
        assert arr.type == pa.string()

        arr = pa.array(TypedSequence([[[1, 2, 3]]], type=Array2D((1, 3), "int64")))
        assert arr.type == Array2DExtensionType((1, 3), "int64")

        table = pa.Table.from_pydict({
            "image": TypedSequence([[[1, 2, 3]]], type=Array2D((1, 3), "int64"))
        })
        assert table["image"].type == Array2DExtensionType((1, 3), "int64")

    """

    def __init__(
        self,
        data: Iterable,
        type: Optional[FeatureType] = None,
        try_type: Optional[FeatureType] = None,
        optimized_int_type: Optional[FeatureType] = None,
    ):
        """Store the data together with the requested typing strategy.

        Args:
            data: the values to convert to an Arrow array.
            type: feature type to enforce. Mutually exclusive with `try_type`.
            try_type: feature type to try; it is ignored if it doesn't match the data.
            optimized_int_type: smaller integer type to try for int64 data. It is only
                attempted when neither `type` nor `try_type` is given.

        Raises:
            ValueError: if both `type` and `try_type` are given.
        """
        if type is not None and try_type is not None:
            raise ValueError("You cannot specify both type and try_type")
        # set attributes
        self.data = data
        self.type = type
        self.try_type = try_type  # is ignored if it doesn't match the data
        self.optimized_int_type = optimized_int_type
        # when trying a type (is ignored if data is not compatible)
        self.trying_type = self.try_type is not None
        self.trying_int_optimization = optimized_int_type is not None and type is None and try_type is None
        # used to get back the inferred type after __arrow_array__() is called once
        self._inferred_type = None

    def get_inferred_type(self) -> FeatureType:
        """Return the inferred feature type.
        This is done by converting the sequence to an Arrow array, and getting the corresponding
        feature type.

        Since building the Arrow array can be expensive, the value of the inferred type is cached
        as soon as pa.array is called on the typed sequence.

        Returns:
            FeatureType: inferred feature type of the sequence.
        """
        if self._inferred_type is None:
            self._inferred_type = generate_from_arrow_type(pa.array(self).type)
        return self._inferred_type

    @staticmethod
    def _infer_custom_type_and_encode(data: Iterable) -> tuple[Iterable, Optional[FeatureType]]:
        """Implement type inference for custom objects like PIL.Image.Image -> Image type.

        This function is only used for custom python objects that can't be directly passed to build
        an Arrow array. In such cases is infers the feature type to use, and it encodes the data so
        that they can be passed to an Arrow array.

        Args:
            data (Iterable): array of data to infer the type, e.g. a list of PIL images.

        Returns:
            Tuple[Iterable, Optional[FeatureType]]: a tuple with:
                - the (possibly encoded) array, if the inferred feature type requires encoding
                - the inferred feature type if the array is made of supported custom objects like
                    PIL images, else None.
        """
        # Only look for PIL / pdfplumber objects if the library has already been imported.
        if config.PIL_AVAILABLE and "PIL" in sys.modules:
            import PIL.Image

            _, non_null_value = first_non_null_non_empty_value(data)
            if isinstance(non_null_value, PIL.Image.Image):
                return [Image().encode_example(value) if value is not None else None for value in data], Image()
            if isinstance(non_null_value, list) and isinstance(non_null_value[0], PIL.Image.Image):
                return [
                    [Image().encode_example(x) for x in value] if value is not None else None for value in data
                ], List(Image())
        if config.PDFPLUMBER_AVAILABLE and "pdfplumber" in sys.modules:
            import pdfplumber

            _, non_null_value = first_non_null_non_empty_value(data)
            if isinstance(non_null_value, pdfplumber.pdf.PDF):
                return [Pdf().encode_example(value) if value is not None else None for value in data], Pdf()
            if isinstance(non_null_value, list) and isinstance(non_null_value[0], pdfplumber.pdf.PDF):
                return [
                    [Pdf().encode_example(x) for x in value] if value is not None else None for value in data
                ], List(Pdf())
        return data, None

    def _recover_from_conversion_error(
        self, error, data, feature, out, optimized_int_pa_type, tried_cast_to_python_objects
    ):
        """Shared recovery path for a failed conversion in `__arrow_array__`.

        Args:
            error: the exception raised by the failed conversion.
            data: the (possibly encoded) data being converted.
            feature: the feature type to apply on the fallback array, or None.
            out: the array built before the failure, if any (returned as is when the
                integer optimization is the step that failed).
            optimized_int_pa_type: the Arrow type of the attempted integer optimization.
            tried_cast_to_python_objects: whether the failure happened after going
                through `cast_to_python_objects`.

        Returns:
            A replacement Arrow array when the error is recoverable.

        Raises:
            OverflowError: if the error message mentions an overflow.
            Exception: `error` itself when no recovery applies.
        """
        message = str(error)
        if "overflow" in message:
            raise OverflowError(
                f"There was an overflow with type {type_(data)}. Try to reduce writer_batch_size to have batches smaller than 2GB.\n({error})"
            ) from None
        if self.trying_int_optimization and "not in range" in message:
            optimized_int_pa_type_str = np.dtype(optimized_int_pa_type.to_pandas_dtype()).name
            logger.info(f"Failed to cast a sequence to {optimized_int_pa_type_str}. Falling back to int64.")
            return out
        if tried_cast_to_python_objects and "Could not convert" in message:
            # Retry without the optimized list casting, then apply the requested feature.
            out = pa.array(cast_to_python_objects(data, only_1d_for_numpy=True, optimize_list_casting=False))
            if feature is not None:
                out = cast_array_to_feature(out, feature, allow_primitive_to_str=True, allow_decimal_to_str=True)
            return out
        raise error

    def __arrow_array__(self, type: Optional[pa.DataType] = None):
        """This function is called when calling pa.array(typed_sequence)

        Raises:
            ValueError: if a `type` is passed to `pa.array`.
            OverflowError: if Arrow reports an overflow while building the array.
        """

        if type is not None:
            raise ValueError("TypedSequence is supposed to be used with pa.array(typed_sequence, type=None)")
        del type  # make sure we don't use it
        data = self.data
        # automatic type inference for custom objects
        if self.type is None and self.try_type is None:
            data, self._inferred_type = self._infer_custom_type_and_encode(data)
        if self._inferred_type is None:
            type = self.try_type if self.trying_type else self.type
        else:
            type = self._inferred_type
        pa_type = get_nested_type(type) if type is not None else None
        optimized_int_pa_type = (
            get_nested_type(self.optimized_int_type) if self.optimized_int_type is not None else None
        )
        trying_cast_to_python_objects = False
        out = None  # defined up front so the recovery path can always reference it
        try:
            # custom pyarrow types
            if isinstance(pa_type, _ArrayXDExtensionType):
                storage = to_pyarrow_listarray(data, pa_type)
                return pa.ExtensionArray.from_storage(pa_type, storage)

            # efficient np array to pyarrow array
            if isinstance(data, np.ndarray):
                out = numpy_to_pyarrow_listarray(data)
            elif isinstance(data, list) and data and isinstance(first_non_null_non_empty_value(data)[1], np.ndarray):
                out = list_of_np_array_to_pyarrow_listarray(data)
            else:
                trying_cast_to_python_objects = True
                out = pa.array(cast_to_python_objects(data, only_1d_for_numpy=True))
            # use smaller integer precisions if possible
            if self.trying_int_optimization:
                if pa.types.is_int64(out.type):
                    out = out.cast(optimized_int_pa_type)
                elif pa.types.is_list(out.type):
                    if pa.types.is_int64(out.type.value_type):
                        out = array_cast(out, pa.list_(optimized_int_pa_type))
                    elif pa.types.is_list(out.type.value_type) and pa.types.is_int64(out.type.value_type.value_type):
                        out = array_cast(out, pa.list_(pa.list_(optimized_int_pa_type)))
            # otherwise we can finally use the user's type
            elif type is not None:
                # We use cast_array_to_feature to support casting to custom types like Audio and Image
                # Also, when trying type "string", we don't want to convert integers or floats to "string".
                # We only do it if trying_type is False - since this is what the user asks for.
                out = cast_array_to_feature(
                    out, type, allow_primitive_to_str=not self.trying_type, allow_decimal_to_str=not self.trying_type
                )
            return out
        except (
            TypeError,
            pa.lib.ArrowInvalid,
            pa.lib.ArrowNotImplementedError,
        ) as e:  # handle type errors and overflows
            # Ignore ArrowNotImplementedError caused by trying type, otherwise re-raise
            if not self.trying_type and isinstance(e, pa.lib.ArrowNotImplementedError):
                raise

            if not self.trying_type:
                return self._recover_from_conversion_error(
                    e, data, type, out, optimized_int_pa_type, trying_cast_to_python_objects
                )

            try:  # second chance: the tried type did not fit, build the array without it
                if isinstance(data, np.ndarray):
                    return numpy_to_pyarrow_listarray(data)
                elif isinstance(data, list) and data and any(isinstance(value, np.ndarray) for value in data):
                    return list_of_np_array_to_pyarrow_listarray(data)
                else:
                    trying_cast_to_python_objects = True
                    return pa.array(cast_to_python_objects(data, only_1d_for_numpy=True))
            except pa.lib.ArrowInvalid as second_error:
                return self._recover_from_conversion_error(
                    second_error, data, type, out, optimized_int_pa_type, trying_cast_to_python_objects
                )
382
+
383
+
384
class OptimizedTypedSequence(TypedSequence):
    """`TypedSequence` that tries a smaller integer type for a few well-known column names."""

    def __init__(
        self,
        data,
        type: Optional[FeatureType] = None,
        try_type: Optional[FeatureType] = None,
        col: Optional[str] = None,
        optimized_int_type: Optional[FeatureType] = None,
    ):
        """Pick the optimized integer type for `col` and defer to `TypedSequence`.

        Args:
            data: the values to convert to an Arrow array.
            type: feature type to enforce.
            try_type: feature type to try.
            col: column name, used to look up a default optimized integer type.
            optimized_int_type: explicit optimized integer type. When it is None and
                neither `type` nor `try_type` is given, the default for `col` (if any)
                is used instead.
        """
        optimized_int_type_by_col = {
            "attention_mask": Value("int8"),  # binary tensor
            "special_tokens_mask": Value("int8"),
            "input_ids": Value("int32"),  # typical vocab size: 0-50k (max ~500k, never > 1M)
            "token_type_ids": Value(
                "int8"
            ),  # binary mask; some (XLNetModel) use an additional token represented by a 2
        }
        # Only fall back to the per-column default when the caller did not choose a type:
        # an explicitly passed optimized_int_type must not be silently discarded.
        if type is None and try_type is None and optimized_int_type is None:
            optimized_int_type = optimized_int_type_by_col.get(col, None)
        super().__init__(data, type=type, try_type=try_type, optimized_int_type=optimized_int_type)
404
+
405
+
406
+ class ArrowWriter:
407
+ """Shuffles and writes Examples to Arrow files."""
408
+
409
def __init__(
    self,
    schema: Optional[pa.Schema] = None,
    features: Optional[Features] = None,
    path: Optional[str] = None,
    stream: Optional[pa.NativeFile] = None,
    fingerprint: Optional[str] = None,
    writer_batch_size: Optional[int] = None,
    hash_salt: Optional[str] = None,
    check_duplicates: Optional[bool] = False,
    disable_nullable: bool = False,
    update_features: bool = False,
    with_metadata: bool = True,
    unit: str = "examples",
    embed_local_files: bool = False,
    storage_options: Optional[dict] = None,
):
    """Set up the writer state and its output stream.

    `features` takes precedence over `schema`; when only `schema` is given the
    features are derived from it. When `stream` is None, `path` is opened for
    binary writing through fsspec (with `storage_options`) and the writer owns
    that stream; a caller-provided `stream` is never closed by the writer.
    `writer_batch_size` falls back to the value derived from the features, then
    to `config.DEFAULT_MAX_BATCH_SIZE`.

    Raises:
        ValueError: if neither `path` nor `stream` is provided.
    """
    if path is None and stream is None:
        raise ValueError("At least one of path and stream must be provided.")

    # Typing information: explicit features win over an explicit schema.
    if features is not None:
        self._features = features
        self._schema = None
    elif schema is not None:
        self._schema: pa.Schema = schema
        self._features = Features.from_arrow_schema(self._schema)
    else:
        self._features = None
        self._schema = None

    # Create KeyHasher instance using split name as hash salt (empty salt if none given)
    self._hasher = KeyHasher(hash_salt) if hash_salt is not None else KeyHasher("")

    self._check_duplicates = check_duplicates
    self._disable_nullable = disable_nullable

    # Output stream: open `path` ourselves unless the caller handed us a stream.
    if stream is not None:
        self._fs = None
        self._path = None
        self.stream = stream
        self._closable_stream = False
    else:
        fs, path = url_to_fs(path, **(storage_options or {}))
        self._fs: fsspec.AbstractFileSystem = fs
        self._path = self._fs.unstrip_protocol(path) if is_remote_filesystem(self._fs) else path
        self.stream = self._fs.open(path, "wb")
        self._closable_stream = True

    self.fingerprint = fingerprint
    self.disable_nullable = disable_nullable
    self.writer_batch_size = (
        writer_batch_size
        or get_arrow_writer_batch_size_from_features(self._features)
        or config.DEFAULT_MAX_BATCH_SIZE
    )
    self.update_features = update_features
    self.with_metadata = with_metadata
    self.unit = unit
    self.embed_local_files = embed_local_files

    # Counters and staging buffers.
    self._num_examples = 0
    self._num_bytes = 0
    self.current_examples: list[tuple[dict[str, Any], str]] = []
    self.current_rows: list[pa.Table] = []
    self.pa_writer: Optional[pa.RecordBatchStreamWriter] = None
    self.hkey_record = []
477
+
478
def __len__(self):
    """Return the number of written examples plus the staged examples and staged row tables."""
    staged = len(self.current_examples) + len(self.current_rows)
    return self._num_examples + staged
481
+
482
def __enter__(self):
    """Enter the `with` block; the writer itself is the context value."""
    return self
484
+
485
def __exit__(self, exc_type, exc_val, exc_tb):
    """Leave the `with` block: close the writer. Exceptions are not suppressed (returns None)."""
    self.close()
487
+
488
def close(self):
    """Close the Arrow writer (best effort), then the stream if this writer opened it."""
    writer = self.pa_writer
    # Try closing if opened; if closed: pyarrow.lib.ArrowInvalid: Invalid operation on closed file
    if writer:  # it might be None
        try:
            writer.close()
        except Exception:  # pyarrow.lib.ArrowInvalid, OSError
            pass
    owns_open_stream = self._closable_stream and not self.stream.closed
    if owns_open_stream:
        self.stream.close()  # This also closes self.pa_writer if it is opened
497
+
498
def _build_schema(self, inferred_schema: pa.Schema):
    """Combine the writer's own typing information with an inferred schema.

    - No features known yet: the inferred features (and their Arrow schema) are used.
    - Features known and `update_features` is set: the inferred schema is used, but
      columns whose inferred field equals the known one keep their original feature.
    - Features known otherwise: the writer's own schema and features are kept.

    Nullability and the metadata are then applied according to `disable_nullable`
    and `with_metadata`.

    Returns:
        The `(schema, features)` pair.
    """
    schema = self.schema
    features = self._features
    inferred_features = Features.from_arrow_schema(inferred_schema)
    if self._features is None:
        features = inferred_features
        schema: pa.Schema = inferred_features.arrow_schema
    elif self.update_features:  # keep original features it they match, or update them
        known_fields = {field.name: field for field in self._features.type}
        for inferred_field in inferred_features.type:
            name = inferred_field.name
            if name in known_fields and inferred_field == known_fields[name]:
                inferred_features[name] = self._features[name]
        features = inferred_features
        schema: pa.Schema = inferred_schema

    if self.disable_nullable:
        schema = pa.schema(pa.field(field.name, field.type, nullable=False) for field in schema)
    if self.with_metadata:
        metadata = self._build_metadata(DatasetInfo(features=features), self.fingerprint)
    else:
        metadata = {}
    schema = schema.with_metadata(metadata)

    return schema, features
524
+
525
def _build_writer(self, inferred_schema: pa.Schema):
    """Fix the final schema/features and open a record-batch stream writer on `self.stream`.

    Args:
        inferred_schema: schema inferred from the data, forwarded to `_build_schema`.
    """
    final_schema, final_features = self._build_schema(inferred_schema)
    self._schema = final_schema
    self._features = final_features
    self.pa_writer = pa.RecordBatchStreamWriter(self.stream, self._schema)
528
+
529
@property
def schema(self):
    """Current Arrow schema, or an empty list when none can be determined yet.

    Uses `self._schema` when it is set, otherwise builds a schema from
    `self._features.type` when features are known. If `self._disable_nullable`
    is set, every field is rebuilt as non-nullable.
    """
    if self._schema is not None:
        resolved = self._schema
    elif self._features is not None:
        resolved = pa.schema(self._features.type)
    else:
        resolved = None
    if self._disable_nullable and resolved is not None:
        resolved = pa.schema(pa.field(field.name, field.type, nullable=False) for field in resolved)
    if resolved is None:
        return []
    return resolved
539
+
540
@staticmethod
def _build_metadata(info: DatasetInfo, fingerprint: Optional[str] = None) -> dict[str, str]:
    """Serialize dataset info (and optionally a fingerprint) into schema metadata.

    Args:
        info: dataset info; only its "features" entry is exported.
        fingerprint: optional fingerprint stored alongside the info.

    Returns:
        A single-entry dict mapping "huggingface" to the JSON-encoded metadata.
    """
    exported_keys = ["features"]  # we can add support for more DatasetInfo keys in the future
    info_dict = asdict(info)
    payload = {"info": {key: info_dict[key] for key in exported_keys}}
    if fingerprint is not None:
        payload["fingerprint"] = fingerprint
    return {"huggingface": json.dumps(payload)}
549
+
550
def write_examples_on_file(self):
    """Flush the pool of staged examples: turn them into one columnar batch and write it.

    Does nothing when no example is staged. The column list is taken from the first
    staged example: schema columns come first (in schema order), followed by any
    extra columns in the order they appear in that example.
    """
    if not self.current_examples:
        return
    first_example = self.current_examples[0][0]
    # preserve the order of the columns
    if self.schema:
        schema_cols = set(self.schema.names)
        examples_cols = first_example.keys()  # .keys() preserves the order (unlike set)
        cols = [col for col in self.schema.names if col in examples_cols]
        cols += [col for col in examples_cols if col not in schema_cols]
    else:
        cols = list(first_example)

    arrow_types = (pa.Array, pa.ChunkedArray)
    batch_examples = {}
    for col in cols:
        # current_examples holds (example, key) tuples, hence row[0].
        # Values may be Arrow arrays of 1 element; this can happen in `.map()`
        # when we want to re-write the same Arrow data.
        values = [row[0][col] for row in self.current_examples]
        if all(isinstance(value, arrow_types) for value in values):
            chunks = []
            for value in values:
                if isinstance(value, pa.ChunkedArray):
                    chunks.extend(value.chunks)
                else:
                    chunks.append(value)
            batch_examples[col] = pa.concat_arrays(chunks)
        else:
            batch_examples[col] = [
                value.to_pylist()[0] if isinstance(value, arrow_types) else value for value in values
            ]
    self.write_batch(batch_examples=batch_examples)
    self.current_examples = []
583
+
584
def write_rows_on_file(self):
    """Flush the pool of staged rows: concatenate the single-row tables and write the result.

    Does nothing when no row is staged.
    """
    pending_rows = self.current_rows
    if not pending_rows:
        return
    self.write_table(pa.concat_tables(pending_rows))
    self.current_rows = []
591
+
592
def write(
    self,
    example: dict[str, Any],
    key: Optional[Union[str, int, bytes]] = None,
    writer_batch_size: Optional[int] = None,
):
    """Add a given (Example,Key) pair to the write-pool of examples which is written to file.

    Args:
        example: the Example to add.
        key: Optional, a unique identifier(str, int or bytes) associated with each example
        writer_batch_size: Optional, pool size that triggers a flush; falls back to
            `self.writer_batch_size` when `None`.
    """
    # Keys are only hashed and recorded when `self._check_duplicates` is enabled
    if self._check_duplicates:
        key_hash = self._hasher.hash(key)
        self.current_examples.append((example, key_hash))
        # Remember (hash, key) so duplicates can be reported with the original key
        self.hkey_record.append((key_hash, key))
    else:
        # Keep the (example, key) tuple shape of `self.current_examples` uniform
        self.current_examples.append((example, ""))

    batch_size = self.writer_batch_size if writer_batch_size is None else writer_batch_size
    if batch_size is None or len(self.current_examples) < batch_size:
        return

    if self._check_duplicates:
        self.check_duplicate_keys()
        # Start a fresh record for the next batch
        self.hkey_record = []

    self.write_examples_on_file()
624
+
625
def check_duplicate_keys(self):
    """Raise `DuplicatedKeysError` if the current batch contains the same key hash twice.

    The error carries the offending key and the indices (as strings, offset by
    `self._num_examples`) of every record in the batch sharing that hash.
    """
    seen_hashes = set()
    for key_hash, key in self.hkey_record:
        if key_hash not in seen_hashes:
            seen_hashes.add(key_hash)
            continue
        duplicate_key_indices = [
            str(self._num_examples + position)
            for position, (other_hash, _) in enumerate(self.hkey_record)
            if other_hash == key_hash
        ]
        raise DuplicatedKeysError(key, duplicate_key_indices)
639
+
640
def write_row(self, row: pa.Table, writer_batch_size: Optional[int] = None):
    """Add a given single-row Table to the write-pool of rows which is written to file.

    Args:
        row: the row to add.
        writer_batch_size: Optional, pool size that triggers a flush; falls back to
            `self.writer_batch_size` when `None`.

    Raises:
        ValueError: if `row` does not contain exactly one row.
    """
    if len(row) != 1:
        raise ValueError(f"Only single-row pyarrow tables are allowed but got table with {len(row)} rows.")
    self.current_rows.append(row)
    batch_size = self.writer_batch_size if writer_batch_size is None else writer_batch_size
    if batch_size is not None and len(self.current_rows) >= batch_size:
        self.write_rows_on_file()
653
+
654
def write_batch(
    self,
    batch_examples: dict[str, list],
    writer_batch_size: Optional[int] = None,
    try_original_type: Optional[bool] = True,
):
    """Write a batch of Example to file.
    Ignores the batch if it appears to be empty,
    preventing a potential schema update of unknown types.

    Args:
        batch_examples: the batch of examples to add.
        writer_batch_size: Optional, forwarded unchanged to `write_table`.
        try_original_type: use `try_type` when instantiating OptimizedTypedSequence if `True`, otherwise `try_type = None`.
    """
    if batch_examples and len(next(iter(batch_examples.values()))) == 0:
        return
    # Until the writer exists, `update_features` lets the data decide the types:
    # the configured features are then only *tried*, not enforced.
    if self.pa_writer is None and self.update_features:
        features, try_features = None, self._features
    else:
        features, try_features = self._features, None

    # preserve the order of the columns
    if self.schema:
        schema_cols = set(self.schema.names)
        batch_cols = batch_examples.keys()  # .keys() preserves the order (unlike set)
        cols = [col for col in self.schema.names if col in batch_cols]
        cols += [col for col in batch_cols if col not in schema_cols]
    else:
        cols = list(batch_examples)

    arrays = []
    inferred_features = Features()
    for col in cols:
        col_values = batch_examples[col]
        col_type = features[col] if features else None
        if isinstance(col_values, (pa.Array, pa.ChunkedArray)):
            # Already Arrow data: cast only when a target feature is known
            if col_type is not None:
                arrays.append(cast_array_to_feature(col_values, col_type))
            else:
                arrays.append(col_values)
            inferred_features[col] = generate_from_arrow_type(col_values.type)
        else:
            if try_features is not None and col in try_features and try_original_type:
                col_try_type = try_features[col]
            else:
                col_try_type = None
            typed_sequence = OptimizedTypedSequence(col_values, type=col_type, try_type=col_try_type, col=col)
            arrays.append(pa.array(typed_sequence))
            inferred_features[col] = typed_sequence.get_inferred_type()

    schema = inferred_features.arrow_schema if self.pa_writer is None else self.schema
    pa_table = pa.Table.from_arrays(arrays, schema=schema)
    self.write_table(pa_table, writer_batch_size)
702
+
703
def write_table(self, pa_table: pa.Table, writer_batch_size: Optional[int] = None):
    """Write a Table to file.

    The writer is created on first use from the table's own schema. The table is
    then chunk-combined, cast to the writer schema and, if `self.embed_local_files`
    is set, passed through `embed_table_storage` before being written. The byte and
    example counters are updated from the table that is actually written.

    Args:
        pa_table: the Table to add.
        writer_batch_size: Optional, batch size handed to the underlying writer;
            falls back to `self.writer_batch_size` when `None`.
    """
    batch_size = self.writer_batch_size if writer_batch_size is None else writer_batch_size
    if self.pa_writer is None:
        self._build_writer(inferred_schema=pa_table.schema)
    casted_table = table_cast(pa_table.combine_chunks(), self._schema)
    if self.embed_local_files:
        casted_table = embed_table_storage(casted_table)
    self._num_bytes += casted_table.nbytes
    self._num_examples += casted_table.num_rows
    self.pa_writer.write_table(casted_table, batch_size)
720
+
721
def finalize(self, close_stream=True):
    """Flush everything that is still staged, close the writer and report totals.

    Args:
        close_stream: whether to also close `self.stream`.

    Returns:
        A `(num_examples, num_bytes)` tuple.

    Raises:
        SchemaInferenceError: if nothing was written and no schema is known, so no
            writer could be created. The stream is still closed first when
            `close_stream` is set.
    """
    self.write_rows_on_file()
    # In case current_examples < writer_batch_size, but user uses finalize()
    if self._check_duplicates:
        self.check_duplicate_keys()
        # Start a fresh record for the next batch
        self.hkey_record = []
    self.write_examples_on_file()
    # If schema is known, infer features even if no examples were written
    if self.pa_writer is None and self.schema:
        self._build_writer(self.schema)

    if self.pa_writer is None:
        if close_stream:
            self.stream.close()
        raise SchemaInferenceError("Please pass `features` or at least one example when writing data")

    self.pa_writer.close()
    self.pa_writer = None
    if close_stream:
        self.stream.close()
    logger.debug(
        f"Done writing {self._num_examples} {self.unit} in {self._num_bytes} bytes {self._path if self._path else ''}."
    )
    return self._num_examples, self._num_bytes
745
+
746
+
747
class ParquetWriter(ArrowWriter):
    """`ArrowWriter` variant whose underlying writer is a `pq.ParquetWriter`.

    Args:
        use_content_defined_chunking: chunking options passed to the Parquet writer.
            `True` is replaced by `config.DEFAULT_CDC_OPTIONS`; any other value is
            passed through unchanged.
        write_page_index: passed through to the Parquet writer.
        *args, **kwargs: forwarded to `ArrowWriter.__init__`.
    """

    def __init__(self, *args, use_content_defined_chunking=True, write_page_index=True, **kwargs):
        super().__init__(*args, **kwargs)
        self.use_content_defined_chunking = (
            config.DEFAULT_CDC_OPTIONS if use_content_defined_chunking is True else use_content_defined_chunking
        )
        self.write_page_index = write_page_index

    def _build_writer(self, inferred_schema: pa.Schema):
        """Fix the final schema/features and open the Parquet writer on `self.stream`.

        Unless chunking is exactly `False`, the chunking options are also recorded
        as JSON under the "content_defined_chunking" key/value metadata entry.
        """
        final_schema, final_features = self._build_schema(inferred_schema)
        self._schema = final_schema
        self._features = final_features
        self.pa_writer = pq.ParquetWriter(
            self.stream,
            self._schema,
            use_content_defined_chunking=self.use_content_defined_chunking,
            write_page_index=self.write_page_index,
        )
        if self.use_content_defined_chunking is False:
            return
        cdc_metadata = {"content_defined_chunking": json.dumps(self.use_content_defined_chunking)}
        self.pa_writer.add_key_value_metadata(cdc_metadata)
datasets/builder.py ADDED
@@ -0,0 +1,1866 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2020 The HuggingFace Datasets Authors and the TensorFlow Datasets Authors.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ # Lint as: python3
16
+ """DatasetBuilder base class."""
17
+
18
+ import abc
19
+ import contextlib
20
+ import copy
21
+ import inspect
22
+ import os
23
+ import posixpath
24
+ import shutil
25
+ import textwrap
26
+ import time
27
+ import urllib
28
+ from collections.abc import Iterable, Mapping
29
+ from dataclasses import dataclass
30
+ from functools import partial
31
+ from pathlib import Path
32
+ from typing import TYPE_CHECKING, Optional, Union
33
+ from unittest.mock import patch
34
+
35
+ import fsspec
36
+ from fsspec.core import url_to_fs
37
+ from multiprocess import Pool
38
+ from tqdm.contrib.concurrent import thread_map
39
+
40
+ from . import config, utils
41
+ from .arrow_dataset import Dataset
42
+ from .arrow_reader import (
43
+ ArrowReader,
44
+ ReadInstruction,
45
+ )
46
+ from .arrow_writer import ArrowWriter, ParquetWriter, SchemaInferenceError
47
+ from .data_files import DataFilesDict, DataFilesPatternsDict, sanitize_patterns
48
+ from .dataset_dict import DatasetDict, IterableDatasetDict
49
+ from .download.download_config import DownloadConfig
50
+ from .download.download_manager import DownloadManager, DownloadMode
51
+ from .download.streaming_download_manager import StreamingDownloadManager, xjoin
52
+ from .exceptions import DatasetGenerationCastError, DatasetGenerationError, FileFormatError, ManualDownloadError
53
+ from .features import Features
54
+ from .filesystems import (
55
+ is_remote_filesystem,
56
+ rename,
57
+ )
58
+ from .fingerprint import Hasher
59
+ from .info import DatasetInfo, PostProcessedInfo
60
+ from .iterable_dataset import ArrowExamplesIterable, ExamplesIterable, IterableDataset
61
+ from .keyhash import DuplicatedKeysError
62
+ from .naming import INVALID_WINDOWS_CHARACTERS_IN_PATH, camelcase_to_snakecase
63
+ from .splits import Split, SplitDict, SplitGenerator, SplitInfo
64
+ from .streaming import extend_dataset_builder_for_streaming
65
+ from .table import CastError
66
+ from .utils import logging
67
+ from .utils import tqdm as hf_tqdm
68
+ from .utils._filelock import FileLock
69
+ from .utils.file_utils import is_remote_url
70
+ from .utils.info_utils import VerificationMode, get_size_checksum_dict, verify_checksums, verify_splits
71
+ from .utils.py_utils import (
72
+ classproperty,
73
+ convert_file_size_to_int,
74
+ has_sufficient_disk_space,
75
+ iflatmap_unordered,
76
+ map_nested,
77
+ memoize,
78
+ size_str,
79
+ temporary_assignment,
80
+ )
81
+ from .utils.sharding import _number_of_shards_in_gen_kwargs, _split_gen_kwargs
82
+ from .utils.track import tracked_list
83
+
84
+
85
+ if TYPE_CHECKING:
86
+ from .load import DatasetModule
87
+
88
+
89
+ logger = logging.get_logger(__name__)
90
+
91
+
92
class InvalidConfigName(ValueError):
    """Raised when a builder config name contains a character that is not allowed in it."""
94
+
95
+
96
@dataclass
class BuilderConfig:
    """Base class for `DatasetBuilder` data configuration.

    `DatasetBuilder` subclasses with data configuration options should subclass
    `BuilderConfig` and add their own properties.

    Attributes:
        name (`str`, defaults to `default`):
            The name of the configuration.
        version (`Version` or `str`, defaults to `0.0.0`):
            The version of the configuration.
        data_dir (`str`, *optional*):
            Path to the directory containing the source data.
        data_files (`str` or `Sequence` or `Mapping`, *optional*):
            Path(s) to source data file(s).
        description (`str`, *optional*):
            A human description of the configuration.
    """

    name: str = "default"
    version: Optional[Union[utils.Version, str]] = utils.Version("0.0.0")
    data_dir: Optional[str] = None
    data_files: Optional[Union[DataFilesDict, DataFilesPatternsDict]] = None
    description: Optional[str] = None

    def __post_init__(self):
        """Validate the config name and the type of `data_files`.

        Raises:
            InvalidConfigName: if the name contains a character listed in
                `INVALID_WINDOWS_CHARACTERS_IN_PATH`.
            ValueError: if `data_files` is set but is neither a `DataFilesDict`
                nor a `DataFilesPatternsDict`.
        """
        # The config name is used to name the cache directory.
        for invalid_char in INVALID_WINDOWS_CHARACTERS_IN_PATH:
            if invalid_char in self.name:
                raise InvalidConfigName(
                    f"Bad characters from black list '{INVALID_WINDOWS_CHARACTERS_IN_PATH}' found in '{self.name}'. "
                    f"They could create issues when creating a directory for this config on Windows filesystem."
                )
        if self.data_files is not None and not isinstance(self.data_files, (DataFilesDict, DataFilesPatternsDict)):
            raise ValueError(f"Expected a DataFilesDict in data_files but got {self.data_files}")

    def __eq__(self, o):
        """Compare every instance attribute, not only the dataclass fields.

        The default dataclass `__eq__` is overridden because it only looks at the
        attributes declared in the signature.
        """
        # Objects without a `__dict__` (e.g. `None`) cannot be compared attribute by
        # attribute; defer to Python's default handling instead of raising AttributeError.
        if not hasattr(o, "__dict__"):
            return NotImplemented
        if set(self.__dict__.keys()) != set(o.__dict__.keys()):
            return False
        return all((k, getattr(self, k)) == (k, getattr(o, k)) for k in self.__dict__.keys())

    def create_config_id(
        self,
        config_kwargs: dict,
        custom_features: Optional[Features] = None,
    ) -> str:
        """
        The config id is used to build the cache directory.
        By default it is equal to the config name.
        However the name of a config is not sufficient to have a unique identifier for the dataset being generated
        since it doesn't take into account:
        - the config kwargs that can be used to overwrite attributes
        - the custom features used to write the dataset
        - the data_files for json/text/csv/pandas datasets

        Therefore the config id is just the config name with an optional suffix based on these.

        Args:
            config_kwargs: keyword arguments used to create the config; not modified.
            custom_features: optional features that are hashed into the suffix.

        Returns:
            The config name, followed by `-<suffix>` when a suffix was produced.
        """
        # `import urllib` alone does not guarantee that the `parse` submodule is loaded,
        # so import the function that is actually needed.
        from urllib.parse import quote_plus

        # Possibly add a suffix to the name to handle custom features/data_files/config_kwargs
        suffix: Optional[str] = None
        config_kwargs_to_add_to_suffix = config_kwargs.copy()
        # name and version are already used to build the cache directory
        config_kwargs_to_add_to_suffix.pop("name", None)
        config_kwargs_to_add_to_suffix.pop("version", None)
        # data dir handling (when specified it points to the manually downloaded data):
        # it was previously ignored before the introduction of config id because we didn't want
        # to change the config name. Now it's fine to take it into account for the config id.
        if "data_dir" in config_kwargs_to_add_to_suffix:
            if config_kwargs_to_add_to_suffix["data_dir"] is None:
                config_kwargs_to_add_to_suffix.pop("data_dir", None)
            else:
                # canonicalize the data dir to avoid two paths to the same location having different
                # hashes
                data_dir = config_kwargs_to_add_to_suffix["data_dir"]
                data_dir = os.path.normpath(data_dir)
                config_kwargs_to_add_to_suffix["data_dir"] = data_dir
        if config_kwargs_to_add_to_suffix:
            # we don't care about the order of the kwargs
            config_kwargs_to_add_to_suffix = {
                k: config_kwargs_to_add_to_suffix[k] for k in sorted(config_kwargs_to_add_to_suffix)
            }
            if all(isinstance(v, (str, bool, int, float)) for v in config_kwargs_to_add_to_suffix.values()):
                suffix = ",".join(
                    str(k) + "=" + quote_plus(str(v)) for k, v in config_kwargs_to_add_to_suffix.items()
                )
                if len(suffix) > 32:  # hash if too long
                    suffix = Hasher.hash(config_kwargs_to_add_to_suffix)
            else:
                suffix = Hasher.hash(config_kwargs_to_add_to_suffix)

        if custom_features is not None:
            m = Hasher()
            if suffix:
                m.update(suffix)
            m.update(custom_features)
            suffix = m.hexdigest()

        if suffix:
            config_id = self.name + "-" + suffix
            if len(config_id) > config.MAX_DATASET_CONFIG_ID_READABLE_LENGTH:
                config_id = self.name + "-" + Hasher.hash(suffix)
            return config_id
        else:
            return self.name

    def _resolve_data_files(self, base_path: str, download_config: DownloadConfig) -> None:
        """Resolve `data_files` in place when it still holds unresolved patterns.

        Nothing happens unless `data_files` is a `DataFilesPatternsDict`. When
        `data_dir` is set, it is joined onto `base_path` before resolving.
        """
        if isinstance(self.data_files, DataFilesPatternsDict):
            base_path = xjoin(base_path, self.data_dir) if self.data_dir else base_path
            self.data_files = self.data_files.resolve(base_path, download_config)
208
+
209
+
210
+ class DatasetBuilder:
211
+ """Abstract base class for all datasets.
212
+
213
+ `DatasetBuilder` has 3 key methods:
214
+
215
+ - [`DatasetBuilder.info`]: Documents the dataset, including feature
216
+ names, types, shapes, version, splits, citation, etc.
217
+ - [`DatasetBuilder.download_and_prepare`]: Downloads the source data
218
+ and writes it to disk.
219
+ - [`DatasetBuilder.as_dataset`]: Generates a [`Dataset`].
220
+
221
+ Some `DatasetBuilder`s expose multiple variants of the
222
+ dataset by defining a [`BuilderConfig`] subclass and accepting a
223
+ config object (or name) on construction. Configurable datasets expose a
224
+ pre-defined set of configurations in [`DatasetBuilder.builder_configs`].
225
+
226
+ Args:
227
+ cache_dir (`str`, *optional*):
228
+ Directory to cache data. Defaults to `"~/.cache/huggingface/datasets"`.
229
+ dataset_name (`str`, *optional*):
230
+ Name of the dataset, if different from the builder name. Useful for packaged builders
231
+ like csv, imagefolder, audiofolder, etc. to reflect the difference between datasets
232
+ that use the same packaged builder.
233
+ config_name (`str`, *optional*):
234
+ Name of the dataset configuration.
235
+ It affects the data generated on disk. Different configurations will have their own subdirectories and
236
+ versions.
237
+ If not provided, the default configuration is used (if it exists).
238
+
239
+ <Added version="2.3.0">
240
+
241
+ Parameter `name` was renamed to `config_name`.
242
+
243
+ </Added>
244
+ hash (`str`, *optional*):
245
+ Hash specific to the dataset builder code. Used to update the caching directory when the
246
+ dataset builder code is updated (to avoid reusing old data).
247
+ The typical caching directory (defined in `self._relative_data_dir`) is `name/version/hash/`.
248
+ base_path (`str`, *optional*):
249
+ Base path for relative paths that are used to download files.
250
+ This can be a remote URL.
251
+ features ([`Features`], *optional*):
252
+ Features types to use with this dataset.
253
+ It can be used to change the [`Features`] types of a dataset, for example.
254
+ token (`str` or `bool`, *optional*):
255
+ String or boolean to use as Bearer token for remote files on the
256
+ Datasets Hub. If `True`, will get token from `"~/.huggingface"`.
257
+ repo_id (`str`, *optional*):
258
+ ID of the dataset repository.
259
+ Used to distinguish builders with the same name but not coming from the same namespace, for example "rajpurkar/squad"
260
+ and "lhoestq/squad" repo IDs. In the latter, the builder name would be "lhoestq___squad".
261
+ data_files (`str` or `Sequence` or `Mapping`, *optional*):
262
+ Path(s) to source data file(s).
263
+ For builders like "csv" or "json" that need the user to specify data files. They can be either
264
+ local or remote files. For convenience, you can use a `DataFilesDict`.
265
+ data_dir (`str`, *optional*):
266
+ Path to directory containing source data file(s).
267
+ Use only if `data_files` is not passed, in which case it is equivalent to passing
268
+ `os.path.join(data_dir, "**")` as `data_files`.
269
+ For builders that require manual download, it must be the path to the local directory containing the
270
+ manually downloaded data.
271
+ storage_options (`dict`, *optional*):
272
+ Key/value pairs to be passed on to the dataset file-system backend, if any.
273
+ writer_batch_size (`int`, *optional*):
274
+ Batch size used by the ArrowWriter.
275
+ It defines the number of samples that are kept in memory before writing them
276
+ and also the length of the arrow chunks.
277
+ None means that the ArrowWriter will use its default value.
278
+ **config_kwargs (additional keyword arguments): Keyword arguments to be passed to the corresponding builder
279
+ configuration class, set on the class attribute [`DatasetBuilder.BUILDER_CONFIG_CLASS`]. The builder
280
+ configuration class is [`BuilderConfig`] or a subclass of it.
281
+ """
282
+
283
+ # Default version
284
+ VERSION = None # Default version set in BuilderConfig
285
+
286
+ # Class for the builder config.
287
+ BUILDER_CONFIG_CLASS = BuilderConfig
288
+
289
+ # Named configurations that modify the data generated by download_and_prepare.
290
+ BUILDER_CONFIGS = []
291
+
292
+ # Optional default config name to be used when name is None
293
+ DEFAULT_CONFIG_NAME = None
294
+
295
+ # Default batch size used by the ArrowWriter
296
+ # It defines the number of samples that are kept in memory before writing them
297
+ # and also the length of the arrow chunks
298
+ # None means that the ArrowWriter will use its default value
299
+ DEFAULT_WRITER_BATCH_SIZE = None
300
+
301
+ def __init__(
302
+ self,
303
+ cache_dir: Optional[str] = None,
304
+ dataset_name: Optional[str] = None,
305
+ config_name: Optional[str] = None,
306
+ hash: Optional[str] = None,
307
+ base_path: Optional[str] = None,
308
+ info: Optional[DatasetInfo] = None,
309
+ features: Optional[Features] = None,
310
+ token: Optional[Union[bool, str]] = None,
311
+ repo_id: Optional[str] = None,
312
+ data_files: Optional[Union[str, list, dict, DataFilesDict]] = None,
313
+ data_dir: Optional[str] = None,
314
+ storage_options: Optional[dict] = None,
315
+ writer_batch_size: Optional[int] = None,
316
+ config_id: Optional[str] = None,
317
+ **config_kwargs,
318
+ ):
319
+ # DatasetBuilder name
320
+ self.name: str = camelcase_to_snakecase(self.__module__.split(".")[-1])
321
+ self.hash: Optional[str] = hash
322
+ self.base_path = base_path
323
+ self.token = token
324
+ self.repo_id = repo_id
325
+ self.storage_options = storage_options or {}
326
+ self.dataset_name = camelcase_to_snakecase(dataset_name) if dataset_name else self.name
327
+ self._writer_batch_size = writer_batch_size or self.DEFAULT_WRITER_BATCH_SIZE
328
+
329
+ if data_files is not None and not isinstance(data_files, DataFilesDict):
330
+ data_files = DataFilesDict.from_patterns(
331
+ sanitize_patterns(data_files),
332
+ base_path=base_path,
333
+ download_config=DownloadConfig(token=token, storage_options=self.storage_options),
334
+ )
335
+
336
+ # Prepare config: DatasetConfig contains name, version and description but can be extended by each dataset
337
+ if "features" in inspect.signature(self.BUILDER_CONFIG_CLASS.__init__).parameters and features is not None:
338
+ config_kwargs["features"] = features
339
+ if data_files is not None:
340
+ config_kwargs["data_files"] = data_files
341
+ if data_dir is not None:
342
+ config_kwargs["data_dir"] = data_dir
343
+ self.config_kwargs = config_kwargs
344
+ self.config, self.config_id = self._create_builder_config(
345
+ config_name=config_name,
346
+ custom_features=features,
347
+ config_id=config_id,
348
+ **config_kwargs,
349
+ )
350
+
351
+ # prepare info: DatasetInfo are a standardized dataclass across all datasets
352
+ # Prefill datasetinfo
353
+ if info is None:
354
+ info = self._info()
355
+ info.builder_name = self.name
356
+ info.dataset_name = self.dataset_name
357
+ info.config_name = self.config.name
358
+ info.version = self.config.version
359
+ self.info = info
360
+ # update info with user specified infos
361
+ if features is not None:
362
+ self.info.features = features
363
+
364
+ # Prepare data dirs:
365
+ # cache_dir can be a remote bucket on GCS or S3
366
+ self._cache_dir_root = str(cache_dir or config.HF_DATASETS_CACHE)
367
+ self._cache_dir_root = (
368
+ self._cache_dir_root if is_remote_url(self._cache_dir_root) else os.path.expanduser(self._cache_dir_root)
369
+ )
370
+ self._cache_downloaded_dir = (
371
+ posixpath.join(self._cache_dir_root, config.DOWNLOADED_DATASETS_DIR)
372
+ if cache_dir
373
+ else str(config.DOWNLOADED_DATASETS_PATH)
374
+ )
375
+ self._cache_downloaded_dir = (
376
+ self._cache_downloaded_dir
377
+ if is_remote_url(self._cache_downloaded_dir)
378
+ else os.path.expanduser(self._cache_downloaded_dir)
379
+ )
380
+
381
+ # In case there exists a legacy cache directory
382
+ self._legacy_relative_data_dir = None
383
+
384
+ self._cache_dir = self._build_cache_dir()
385
+ if not is_remote_url(self._cache_dir_root):
386
+ os.makedirs(self._cache_dir_root, exist_ok=True)
387
+ lock_path = os.path.join(
388
+ self._cache_dir_root, Path(self._cache_dir).as_posix().replace("/", "_") + ".lock"
389
+ )
390
+ with FileLock(lock_path):
391
+ if os.path.exists(self._cache_dir): # check if data exist
392
+ if len(os.listdir(self._cache_dir)) > 0:
393
+ if os.path.exists(os.path.join(self._cache_dir, config.DATASET_INFO_FILENAME)):
394
+ logger.debug("Overwrite dataset info from restored data version if exists.")
395
+ self.info = DatasetInfo.from_directory(self._cache_dir)
396
+ else: # dir exists but no data, remove the empty dir as data aren't available anymore
397
+ logger.warning(
398
+ f"Old caching folder {self._cache_dir} for dataset {self.dataset_name} exists but no data were found. Removing it. "
399
+ )
400
+ os.rmdir(self._cache_dir)
401
+
402
+ # Store in the cache by default unless the user specifies a custom output_dir to download_and_prepare
403
+ self._output_dir = self._cache_dir
404
+ self._fs: fsspec.AbstractFileSystem = fsspec.filesystem("file")
405
+
406
+ # Set download manager
407
+ self.dl_manager = None
408
+
409
+ # Set to True by "datasets-cli test" to generate file checksums for (deprecated) dataset_infos.json independently of verification_mode value.
410
+ self._record_infos = False
411
+
412
+ # Set in `.download_and_prepare` once the format of the generated dataset is known
413
+ self._file_format = None
414
+
415
+ # Enable streaming (e.g. it patches "open" to work with remote files)
416
+ extend_dataset_builder_for_streaming(self)
417
+
418
def __getstate__(self):
    """Return the state to pickle: the instance's own attribute dictionary."""
    state = self.__dict__
    return state
420
+
421
def __setstate__(self, d):
    """Restore the instance from a previously pickled attribute dictionary.

    Args:
        d: The attribute dictionary to install as `self.__dict__`.
    """
    self.__dict__ = d
    # Patched functions are not kept when pickling, so streaming has to be re-enabled
    extend_dataset_builder_for_streaming(self)
425
+
426
@property
def manual_download_instructions(self) -> Optional[str]:
    """Instructions for datasets whose data must be downloaded manually.

    Must be set for datasets that use 'data_dir' functionality - the ones
    that require users to do additional steps to download the data
    (this is usually due to some external regulations / rules).
    This field should contain a string with user instructions, including
    the list of files that should be present. It will be
    displayed in the dataset documentation.

    Returns:
        `None` in this base implementation.
    """
    return None
435
+
436
def _check_legacy_cache(self) -> Optional[str]:
    """Check for the old cache directory template {cache_dir}/{namespace}___{builder_name} from 2.13

    The lookup is only attempted when this builder's module starts with
    "datasets.", the cache root is not a remote URL and the config is named
    "default".

    Returns:
        The legacy data directory, relative to the cache root, when it exists
        on disk; `None` otherwise.
    """
    if not (
        self.__module__.startswith("datasets.")
        and not is_remote_url(self._cache_dir_root)
        and self.config.name == "default"
    ):
        return None

    from .packaged_modules import _PACKAGED_DATASETS_MODULES

    namespace = self.repo_id.split("/")[0] if self.repo_id and self.repo_id.count("/") > 0 else None
    config_name = self.repo_id.replace("/", "--") if self.repo_id is not None else self.dataset_name
    # Keep the suffix of the current config id, but swap the config name prefix for the legacy one
    config_id = config_name + self.config_id[len(self.config.name) :]
    # The registry entries are indexed with [1] to get the hash, so the fallback must be
    # indexable the same way: indexing the bare string "missing" would yield "i".
    legacy_hash = _PACKAGED_DATASETS_MODULES.get(self.name, (None, "missing"))[1]
    legacy_relative_data_dir = posixpath.join(
        self.dataset_name if namespace is None else f"{namespace}___{self.dataset_name}",
        config_id,
        "0.0.0",
        legacy_hash,
    )
    legacy_cache_dir = posixpath.join(self._cache_dir_root, legacy_relative_data_dir)
    if os.path.isdir(legacy_cache_dir):
        return legacy_relative_data_dir
    return None
458
+
459
def _check_legacy_cache2(self, dataset_module: "DatasetModule") -> Optional[str]:
    """Check for the old cache directory template {cache_dir}/{namespace}___{dataset_name}/{config_name}-xxx from 2.14 and 2.15

    The lookup is only attempted when this builder's module starts with
    "datasets.", the cache root is not a remote URL and no config kwargs other
    than "data_files" / "data_dir" were passed.

    Args:
        dataset_module: Module whose `builder_configs_parameters.metadata_configs`
            is consulted for parameters of the current config.

    Returns:
        The legacy data directory, relative to the cache root, when it exists
        on disk; `None` otherwise.
    """
    if not (
        self.__module__.startswith("datasets.")
        and not is_remote_url(self._cache_dir_root)
        and not (set(self.config_kwargs) - {"data_files", "data_dir"})
    ):
        return None

    from .packaged_modules import _PACKAGED_DATASETS_MODULES_2_15_HASHES
    from .utils._dill import Pickler

    def update_hash_with_config_parameters(base_hash: str, config_parameters: dict) -> str:
        """
        Used to update hash of packaged modules which is used for creating unique cache directories to reflect
        different config parameters which are passed in metadata from readme.
        """
        excluded_params = {"config_name", "version", "description"}
        hashed_params = {}
        for param, value in sorted(config_parameters.items()):
            if param not in excluded_params:
                hashed_params[param] = value
        hasher = Hasher()
        hasher.update(base_hash)
        hasher.update(hashed_params)
        return hasher.hexdigest()

    if self.repo_id and self.repo_id.count("/") > 0:
        namespace = self.repo_id.split("/")[0]
    else:
        namespace = None
    # NOTE(review): the extent of this `with` block was reconstructed from a view that lost
    # indentation; only the config id computation is assumed to run under the patch - confirm.
    with patch.object(Pickler, "_legacy_no_dict_keys_sorting", True):
        config_id = self.config.name + "-" + Hasher.hash({"data_files": self.config.data_files})
    legacy_hash = _PACKAGED_DATASETS_MODULES_2_15_HASHES.get(self.name, "missing")
    metadata_configs = dataset_module.builder_configs_parameters.metadata_configs
    if metadata_configs and self.config.name in metadata_configs:
        legacy_hash = update_hash_with_config_parameters(legacy_hash, metadata_configs[self.config.name])

    if namespace is None:
        dataset_dir = self.dataset_name
    else:
        dataset_dir = f"{namespace}___{self.dataset_name}"
    legacy_relative_data_dir = posixpath.join(dataset_dir, config_id, "0.0.0", legacy_hash)
    legacy_cache_dir = posixpath.join(self._cache_dir_root, legacy_relative_data_dir)
    if os.path.isdir(legacy_cache_dir):
        return legacy_relative_data_dir
    return None
505
+
506
def _create_builder_config(
    self, config_name=None, custom_features=None, config_id=None, **config_kwargs
) -> tuple[BuilderConfig, str]:
    """Create and validate BuilderConfig object as well as a unique config id for this config.

    Args:
        config_name: Name of a pre-defined config, or the name to give to a config created on the fly.
        custom_features: Forwarded to `create_config_id` when the config id has to be computed.
        config_id: Config id to use as is; computed from the config when `None`.
        **config_kwargs: Override the default kwargs of the config.

    Returns:
        The builder config and its config id.

    Raises:
        ValueError: If there are multiple builder configs and neither `config_name`,
            `DEFAULT_CONFIG_NAME` nor config kwargs are given, if the requested name or an
            overridden attribute doesn't exist, if the config has no name or version, or if a
            custom config reuses the name of a pre-defined one.
    """
    builder_config = None

    # Without an explicit name, fall back on one of the pre-defined configs
    if config_name is None and self.BUILDER_CONFIGS:
        if self.DEFAULT_CONFIG_NAME is not None:
            builder_config = self.builder_configs.get(self.DEFAULT_CONFIG_NAME)
            logger.info(f"No config specified, defaulting to: {self.dataset_name}/{builder_config.name}")
        elif len(self.BUILDER_CONFIGS) > 1:
            # Several candidates: only config kwargs can disambiguate (a config is then created below)
            if not config_kwargs:
                example_of_usage = (
                    f"load_dataset('{self.repo_id or self.dataset_name}', '{self.BUILDER_CONFIGS[0].name}')"
                )
                raise ValueError(
                    "Config name is missing."
                    f"\nPlease pick one among the available configs: {list(self.builder_configs.keys())}"
                    + f"\nExample of usage:\n\t`{example_of_usage}`"
                )
        else:
            builder_config = self.BUILDER_CONFIGS[0]
            logger.info(
                f"No config specified, defaulting to the single config: {self.dataset_name}/{builder_config.name}"
            )

    # Look the config up by name
    if isinstance(config_name, str):
        builder_config = self.builder_configs.get(config_name)
        if builder_config is None and self.BUILDER_CONFIGS:
            raise ValueError(
                f"BuilderConfig '{config_name}' not found. Available: {list(self.builder_configs.keys())}"
            )

    if builder_config:
        # An existing config was found: overwrite its attributes with the config_kwargs,
        # working on a copy so that the pre-defined config is left untouched
        if config_kwargs:
            builder_config = copy.deepcopy(builder_config)
        for key, value in config_kwargs.items():
            if value is None:
                continue
            if not hasattr(builder_config, key):
                raise ValueError(f"BuilderConfig {builder_config} doesn't have a '{key}' key.")
            setattr(builder_config, key, value)
    else:
        # No existing config: create a new one on the fly
        if config_name is not None:
            config_kwargs["name"] = config_name
        elif self.DEFAULT_CONFIG_NAME and not config_kwargs:
            # Use DEFAULT_CONFIG_NAME only if no config_kwargs are passed
            config_kwargs["name"] = self.DEFAULT_CONFIG_NAME
        if "version" not in config_kwargs and hasattr(self, "VERSION") and self.VERSION:
            config_kwargs["version"] = self.VERSION
        builder_config = self.BUILDER_CONFIG_CLASS(**config_kwargs)

    if not builder_config.name:
        raise ValueError(f"BuilderConfig must have a name, got {builder_config.name}")

    # Resolve data files if needed
    download_config = DownloadConfig(token=self.token, storage_options=self.storage_options)
    builder_config._resolve_data_files(base_path=self.base_path, download_config=download_config)

    # Compute the config id that is going to be used for caching
    if config_id is None:
        config_id = builder_config.create_config_id(
            config_kwargs,
            custom_features=custom_features,
        )
    is_custom = (config_id not in self.builder_configs) and config_id != "default"
    if is_custom:
        logger.info(f"Using custom data configuration {config_id}")
        return builder_config, config_id

    # A non-custom config must not silently differ from the pre-defined config of the same name
    if (
        builder_config.name in self.builder_configs
        and builder_config != self.builder_configs[builder_config.name]
    ):
        raise ValueError(
            "Cannot name a custom BuilderConfig the same as an available "
            f"BuilderConfig. Change the name. Available BuilderConfigs: {list(self.builder_configs.keys())}"
        )
    if not builder_config.version:
        raise ValueError(f"BuilderConfig {builder_config.name} must have a version")

    return builder_config, config_id
596
+
597
@classproperty
@classmethod
@memoize()
def builder_configs(cls) -> dict[str, BuilderConfig]:
    """Dictionary of pre-defined configurations for this builder class.

    Returns:
        A mapping from config name to the config declared in `BUILDER_CONFIGS`.

    Raises:
        ValueError: If two configs in `BUILDER_CONFIGS` share the same name.
    """
    names = [config.name for config in cls.BUILDER_CONFIGS]
    configs = dict(zip(names, cls.BUILDER_CONFIGS))
    # Duplicated names collapse into a single key, which makes the mapping shorter than the list
    if len(configs) != len(cls.BUILDER_CONFIGS):
        raise ValueError(f"Names in BUILDER_CONFIGS must not be duplicated. Got {names}")
    return configs
607
+
608
@property
def cache_dir(self):
    """Read-only access to the builder's `_cache_dir` attribute."""
    current_cache_dir = self._cache_dir
    return current_cache_dir
611
+
612
def _use_legacy_cache_dir_if_possible(self, dataset_module: "DatasetModule"):
    """Switch to a legacy cache directory (datasets<3.0.0) when one is found.

    `_check_legacy_cache2` is tried first and `_check_legacy_cache` is only
    consulted when it finds nothing. The cache and output directories are
    rebuilt afterwards in both cases.

    Args:
        dataset_module: Forwarded to `_check_legacy_cache2`.
    """
    legacy_dir = self._check_legacy_cache2(dataset_module)
    if not legacy_dir:
        legacy_dir = self._check_legacy_cache()
    # Any falsy result is normalized to None
    self._legacy_relative_data_dir = legacy_dir or None
    self._cache_dir = self._build_cache_dir()
    self._output_dir = self._cache_dir
619
+
620
def _relative_data_dir(self, with_version=True, with_hash=True) -> str:
    """Relative path of this dataset in cache_dir.

    Will be:
        self.dataset_name/self.config_id/self.config.version/self.hash/
    or if a repo_id with a namespace has been specified:
        self.namespace___self.dataset_name/self.config_id/self.config.version/self.hash/

    The version subfolder is dropped when ``with_version=False``. The hash subfolder is
    dropped when ``with_hash=False`` or when ``self.hash`` is not a non-empty string.
    When a legacy relative data dir has been set, it is returned as is, but only if both
    the version and the hash are requested.
    """
    legacy_dir = self._legacy_relative_data_dir
    if legacy_dir is not None and with_version and with_hash:
        return legacy_dir

    if self.repo_id and self.repo_id.count("/") > 0:
        namespace = self.repo_id.split("/")[0]
        top_level_dir = f"{namespace}___{self.dataset_name}"
    else:
        top_level_dir = self.dataset_name

    path_parts = [top_level_dir, self.config_id]
    if with_version:
        path_parts.append(str(self.config.version))
    if with_hash and self.hash and isinstance(self.hash, str):
        path_parts.append(self.hash)
    return posixpath.join(*path_parts)
639
+
640
def _build_cache_dir(self):
    """Return the data directory for the current version.

    For a non-remote cache root, a warning is logged when the highest version
    found on disk for this dataset differs from the currently defined one.
    """
    builder_data_dir = posixpath.join(self._cache_dir_root, self._relative_data_dir(with_version=False))
    version_data_dir = posixpath.join(self._cache_dir_root, self._relative_data_dir(with_version=True))

    # Other versions can only be listed on a local disk
    if is_remote_url(builder_data_dir):
        return version_data_dir

    versions_on_disk = []
    if os.path.exists(builder_data_dir):
        for dir_name in os.listdir(builder_data_dir):
            try:
                parsed_version = utils.Version(dir_name)
            except ValueError:  # Invalid version (ex: incomplete data dir)
                continue
            versions_on_disk.append((parsed_version, dir_name))
        # Highest version first
        versions_on_disk.sort(reverse=True)

    if versions_on_disk:
        other_version = versions_on_disk[0][0]
        if other_version != self.config.version:
            warn_msg = (
                f"Found a different version {str(other_version)} of dataset {self.dataset_name} in "
                f"cache_dir {self._cache_dir_root}. Using currently defined version "
                f"{str(self.config.version)}."
            )
            logger.warning(warn_msg)

    return version_data_dir
673
+
674
@abc.abstractmethod
def _info(self) -> DatasetInfo:
    """Construct the DatasetInfo object. See `DatasetInfo` for details.

    Abstract: this base implementation always raises.

    Warning: This function is only called once and the result is cached for all
    following .info() calls.

    Returns:
        info: (DatasetInfo) The dataset information

    Raises:
        NotImplementedError: Always, in this base implementation.
    """
    raise NotImplementedError
685
+
686
@classmethod
def get_imported_module_dir(cls):
    """Return the path of the module of this class or subclass.

    Returns:
        The directory containing the file of the module that `cls` belongs to.
    """
    owning_module = inspect.getmodule(cls)
    module_file = inspect.getfile(owning_module)
    return os.path.dirname(module_file)
690
+
691
def _rename(self, src: str, dst: str):
    """Rename `src` to `dst` using this builder's filesystem (`self._fs`).

    Args:
        src: Path to rename.
        dst: New path.
    """
    filesystem = self._fs
    rename(filesystem, src, dst)
693
+
694
def download_and_prepare(
    self,
    output_dir: Optional[str] = None,
    download_config: Optional[DownloadConfig] = None,
    download_mode: Optional[Union[DownloadMode, str]] = None,
    verification_mode: Optional[Union[VerificationMode, str]] = None,
    dl_manager: Optional[DownloadManager] = None,
    base_path: Optional[str] = None,
    file_format: str = "arrow",
    max_shard_size: Optional[Union[int, str]] = None,
    num_proc: Optional[int] = None,
    storage_options: Optional[dict] = None,
    **download_and_prepare_kwargs,
):
    """Downloads and prepares dataset for reading.

    Args:
        output_dir (`str`, *optional*):
            Output directory for the dataset.
            Default to this builder's `cache_dir`, which is inside `~/.cache/huggingface/datasets` by default.

            <Added version="2.5.0"/>
        download_config (`DownloadConfig`, *optional*):
            Specific download configuration parameters.
        download_mode ([`DownloadMode`] or `str`, *optional*):
            Select the download/generate mode, default to `REUSE_DATASET_IF_EXISTS`.
        verification_mode ([`VerificationMode`] or `str`, defaults to `BASIC_CHECKS`):
            Verification mode determining the checks to run on the downloaded/processed dataset information (checksums/size/splits/...).

            <Added version="2.9.1"/>
        dl_manager (`DownloadManager`, *optional*):
            Specific `DownloadManager` to use.
        base_path (`str`, *optional*):
            Base path for relative paths that are used to download files. This can be a remote url.
            If not specified, the value of the `base_path` attribute (`self.base_path`) will be used instead.
        file_format (`str`, *optional*):
            Format of the data files in which the dataset will be written.
            Supported formats: "arrow", "parquet". Default to "arrow" format.
            If the format is "parquet", then image and audio data are embedded into the Parquet files instead of pointing to local files.

            <Added version="2.5.0"/>
        max_shard_size (`Union[str, int]`, *optional*):
            Maximum number of bytes written per shard, default is "500MB".
            The size is based on uncompressed data size, so in practice your shard files may be smaller than
            `max_shard_size` thanks to Parquet compression for example.

            <Added version="2.5.0"/>
        num_proc (`int`, *optional*, defaults to `None`):
            Number of processes when downloading and generating the dataset locally.
            Multiprocessing is disabled by default.

            <Added version="2.7.0"/>
        storage_options (`dict`, *optional*):
            Key/value pairs to be passed on to the caching file-system backend, if any.

            <Added version="2.5.0"/>
        **download_and_prepare_kwargs (additional keyword arguments): Keyword arguments.

    Raises:
        ValueError: If `file_format` is neither "arrow" nor "parquet".
        RuntimeError: If the output directory is the root of the filesystem.
        OSError: If the output is local and there is not enough disk space.

    Example:

    Download and prepare the dataset as Arrow files that can be loaded as a Dataset using `builder.as_dataset()`:

    ```py
    >>> from datasets import load_dataset_builder
    >>> builder = load_dataset_builder("cornell-movie-review-data/rotten_tomatoes")
    >>> builder.download_and_prepare()
    ```

    Download and prepare the dataset as sharded Parquet files locally:

    ```py
    >>> from datasets import load_dataset_builder
    >>> builder = load_dataset_builder("cornell-movie-review-data/rotten_tomatoes")
    >>> builder.download_and_prepare("./output_dir", file_format="parquet")
    ```

    Download and prepare the dataset as sharded Parquet files in a cloud storage:

    ```py
    >>> from datasets import load_dataset_builder
    >>> storage_options = {"key": aws_access_key_id, "secret": aws_secret_access_key}
    >>> builder = load_dataset_builder("cornell-movie-review-data/rotten_tomatoes")
    >>> builder.download_and_prepare("s3://my-bucket/my_rotten_tomatoes", storage_options=storage_options, file_format="parquet")
    ```
    """
    if output_dir is None:
        output_dir = self._cache_dir
    # output_dir can be a remote bucket on GCS or S3
    fs, output_dir = url_to_fs(output_dir, **(storage_options or {}))
    self._fs = fs
    if is_remote_filesystem(self._fs):
        self._output_dir = self._fs.unstrip_protocol(output_dir)
    else:
        self._output_dir = output_dir

    download_mode = DownloadMode(download_mode or DownloadMode.REUSE_DATASET_IF_EXISTS)
    verification_mode = VerificationMode(verification_mode or VerificationMode.BASIC_CHECKS)
    if base_path is None:
        base_path = self.base_path

    if file_format is not None and file_format not in ["arrow", "parquet"]:
        raise ValueError(f"Unsupported file_format: {file_format}. Expected 'arrow' or 'parquet'")
    self._file_format = file_format

    if self._fs._strip_protocol(self._output_dir) == "":
        # We don't support the root directory, because it has no dirname,
        # and we need a dirname to use a <dirname>.incomplete directory
        # when the dataset is being written
        raise RuntimeError(
            f"Unable to download and prepare the dataset at the root {self._output_dir}. "
            f"Please specify a subdirectory, e.g. '{self._output_dir + self.dataset_name}'"
        )

    if dl_manager is None:
        if download_config is None:
            force_redownload = download_mode == DownloadMode.FORCE_REDOWNLOAD
            # We don't use etag for data files to speed up the process
            download_config = DownloadConfig(
                cache_dir=self._cache_downloaded_dir,
                force_download=force_redownload,
                force_extract=force_redownload,
                use_etag=False,
                num_proc=num_proc,
                token=self.token,
                storage_options=self.storage_options,
            )

        dl_manager = DownloadManager(
            dataset_name=self.dataset_name,
            download_config=download_config,
            data_dir=self.config.data_dir,
            base_path=base_path,
            record_checksums=(self._record_infos or verification_mode == VerificationMode.ALL_CHECKS),
        )

    is_local = not is_remote_filesystem(self._fs)
    self.dl_manager = dl_manager

    # Prevent parallel local disk operations.
    # File locking only with local paths; no file locking on GCS or S3
    if is_local:
        # Create parent directory of the output_dir to put the lock file in there
        Path(self._output_dir).parent.mkdir(parents=True, exist_ok=True)
        builder_lock = FileLock(self._output_dir + "_builder.lock")
    else:
        builder_lock = contextlib.nullcontext()

    with builder_lock:
        # Check if the data already exists
        info_path = posixpath.join(self._output_dir, config.DATASET_INFO_FILENAME)
        data_exists = self._fs.exists(info_path)
        if data_exists and download_mode == DownloadMode.REUSE_DATASET_IF_EXISTS:
            logger.info(f"Found cached dataset {self.dataset_name} ({self._output_dir})")
            # We need to update the info in case some splits were added in the meantime
            # for example when calling load_dataset from multiple workers.
            self.info = self._load_info()
            self.download_post_processing_resources(dl_manager)
            return

        logger.info(f"Generating dataset {self.dataset_name} ({self._output_dir})")
        # If the output dir is local, check for available space
        if is_local and not has_sufficient_disk_space(
            self.info.size_in_bytes or 0, directory=Path(self._output_dir).parent
        ):
            raise OSError(
                f"Not enough disk space. Needed: {size_str(self.info.size_in_bytes or 0)} (download: {size_str(self.info.download_size or 0)}, generated: {size_str(self.info.dataset_size or 0)}, post-processed: {size_str(self.info.post_processing_size or 0)})"
            )

        @contextlib.contextmanager
        def incomplete_dir(dirname):
            """Create temporary dir for dirname and rename on exit."""
            if not is_local:
                # Remote outputs are written in place
                self._fs.makedirs(dirname, exist_ok=True)
                yield dirname
                return

            tmp_dir = dirname + ".incomplete"
            os.makedirs(tmp_dir, exist_ok=True)
            try:
                yield tmp_dir
                # Only reached when the body succeeded: replace any previous output
                if os.path.isdir(dirname):
                    shutil.rmtree(dirname)
                # LocalFileSystem.mv does copy + rm, it is more efficient to simply rename a local directory
                shutil.move(tmp_dir, dirname)
            finally:
                # Still present only if the body failed before the move
                if os.path.exists(tmp_dir):
                    shutil.rmtree(tmp_dir)

        # Log what is about to be downloaded and prepared, with the sizes when they are known.
        if self.info.size_in_bytes:
            logger.info(
                f"Downloading and preparing dataset {self.dataset_name}/{self.config.name} "
                f"(download: {size_str(self.info.download_size)}, generated: {size_str(self.info.dataset_size)}, "
                f"post-processed: {size_str(self.info.post_processing_size)}, "
                f"total: {size_str(self.info.size_in_bytes)}) to {self._output_dir}..."
            )
        else:
            _dest = self._fs._strip_protocol(self._output_dir) if is_local else self._output_dir
            logger.info(f"Downloading and preparing dataset {self.dataset_name}/{self.config.name} to {_dest}...")

        self._check_manual_download(dl_manager)

        # Create a tmp dir and rename to self._output_dir on successful exit.
        # _output_dir is temporarily assigned to the tmp dir to avoid having to forward
        # it to every sub function.
        with incomplete_dir(self._output_dir) as tmp_output_dir, temporary_assignment(
            self, "_output_dir", tmp_output_dir
        ):
            prepare_split_kwargs = {"file_format": file_format}
            if max_shard_size is not None:
                prepare_split_kwargs["max_shard_size"] = max_shard_size
            if num_proc is not None:
                prepare_split_kwargs["num_proc"] = num_proc
            self._download_and_prepare(
                dl_manager=dl_manager,
                verification_mode=verification_mode,
                **prepare_split_kwargs,
                **download_and_prepare_kwargs,
            )
            # Sync info
            self.info.dataset_size = sum(split.num_bytes for split in self.info.splits.values())
            self.info.download_checksums = dl_manager.get_recorded_sizes_checksums()
            if self.info.download_size is not None:
                self.info.size_in_bytes = self.info.dataset_size + self.info.download_size
            # Save info
            self._save_info()

        # Download post processing resources
        self.download_post_processing_resources(dl_manager)

        logger.info(
            f"Dataset {self.dataset_name} downloaded and prepared to {self._output_dir}. "
            f"Subsequent calls will reuse this data."
        )
918
+
919
def _check_manual_download(self, dl_manager):
    """Fail early when the dataset needs manual data that was not provided.

    Args:
        dl_manager: Download manager whose `manual_dir` is inspected.

    Raises:
        ManualDownloadError: If `manual_download_instructions` is set while
            `dl_manager.manual_dir` is `None`.
    """
    if self.manual_download_instructions is None or dl_manager.manual_dir is not None:
        return

    message = textwrap.dedent(
        f"""\
        The dataset {self.dataset_name} with config {self.config.name} requires manual data.
        Please follow the manual download instructions:
         {self.manual_download_instructions}
        Manual data can be loaded with:
         datasets.load_dataset("{self.repo_id or self.dataset_name}", data_dir="<path/to/manual/data>")"""
    )
    raise ManualDownloadError(message)
931
+
932
def _download_and_prepare(self, dl_manager, verification_mode, **prepare_split_kwargs):
    """Downloads and prepares dataset for reading.

    This is the internal implementation to overwrite called when user calls
    `download_and_prepare`. It should download all required data and generate
    the pre-processed datasets files.

    Args:
        dl_manager ([`DownloadManager`]):
            `DownloadManager` used to download and cache data.
        verification_mode ([`VerificationMode`]):
            if `ALL_CHECKS`, perform all the verifications including checksums.
            if `BASIC_CHECKS`, do not perform checksums, only perform split tests.
            if `NO_CHECKS`, do not perform any verification.
        prepare_split_kwargs: Additional options, such as `file_format`, `max_shard_size`

    Raises:
        ValueError: If a split generator is named `all`.
        OSError: If preparing a split raises an `OSError`.
        DuplicatedKeysError: If preparing a split raises a `DuplicatedKeysError`.
    """
    # Generating data for all splits
    split_dict = SplitDict(dataset_name=self.dataset_name)
    split_generators_kwargs = self._make_split_generators_kwargs(prepare_split_kwargs)
    split_generators = self._split_generators(dl_manager, **split_generators_kwargs)

    # Checksums verification
    if verification_mode == VerificationMode.ALL_CHECKS and dl_manager.record_checksums:
        recorded_checksums = dl_manager.get_recorded_sizes_checksums()
        verify_checksums(self.info.download_checksums, recorded_checksums, "dataset source files")

    # Build splits
    for split_generator in split_generators:
        split_info = split_generator.split_info
        if str(split_info.name).lower() == "all":
            raise ValueError(
                "`all` is a special split keyword corresponding to the "
                "union of all splits, so cannot be used as key in "
                "._split_generator()."
            )

        logger.info(f"Generating {split_info.name} split")
        split_dict.add(split_info)

        try:
            # Prepare split will record examples associated to the split
            self._prepare_split(split_generator, **prepare_split_kwargs)
        except OSError as e:
            raise OSError(
                "Cannot find data file. "
                + (self.manual_download_instructions or "")
                + "\nOriginal error:\n"
                + str(e)
            ) from None
        # If check_duplicates is set to True , then except DuplicatedKeysError
        except DuplicatedKeysError as e:
            raise DuplicatedKeysError(
                e.key,
                e.duplicate_key_indices,
                fix_msg=f"To avoid duplicate keys, please fix the dataset splits for {self.name}",
            ) from None
        dl_manager.manage_extracted_files()

    if verification_mode == VerificationMode.BASIC_CHECKS or verification_mode == VerificationMode.ALL_CHECKS:
        verify_splits(self.info.splits, split_dict)

    # Update the info object with the splits.
    self.info.splits = split_dict
    self.info.download_size = dl_manager.downloaded_size
996
+
997
def download_post_processing_resources(self, dl_manager):
    """Download the post-processing resources that are missing from the output directory.

    Args:
        dl_manager: Download manager forwarded to `_download_post_processing_resources`.

    Raises:
        NotImplementedError: If a resource is needed while the filesystem is remote.
        ValueError: If a resource file name contains a path separator.
    """
    for split in self.info.splits or []:
        resources = self._post_processing_resources(split)
        for resource_name, resource_file_name in resources.items():
            if is_remote_filesystem(self._fs):
                raise NotImplementedError(f"Post processing is not supported on filesystem {self._fs}")
            if os.sep in resource_file_name:
                raise ValueError(f"Resources shouldn't be in a sub-directory: {resource_file_name}")
            resource_path = os.path.join(self._output_dir, resource_file_name)
            if os.path.exists(resource_path):
                # Already available, nothing to download
                continue
            downloaded_resource_path = self._download_post_processing_resources(split, resource_name, dl_manager)
            if downloaded_resource_path:
                logger.info(f"Downloaded post-processing resource {resource_name} as {resource_file_name}")
                shutil.move(downloaded_resource_path, resource_path)
1012
+
1013
def _load_info(self) -> DatasetInfo:
    """Load the `DatasetInfo` stored in the output directory.

    The storage options of this builder's filesystem are forwarded to the loader.
    """
    storage_options = self._fs.storage_options
    return DatasetInfo.from_directory(self._output_dir, storage_options=storage_options)
1015
+
1016
def _save_info(self):
    """Write `self.info` to the output directory.

    On a non-remote filesystem the write is guarded by a `FileLock` on
    `<output_dir>_info.lock`; on a remote filesystem no lock is taken.
    """
    if is_remote_filesystem(self._fs):
        info_lock = contextlib.nullcontext()
    else:
        info_lock = FileLock(self._output_dir + "_info.lock")
    with info_lock:
        self.info.write_to_directory(self._output_dir, storage_options=self._fs.storage_options)
1024
+
1025
def _make_split_generators_kwargs(self, prepare_split_kwargs):
    """Get kwargs for `self._split_generators()` from `prepare_split_kwargs`.

    This implementation ignores `prepare_split_kwargs` and returns a new empty dict.
    """
    # The argument is intentionally unused here
    del prepare_split_kwargs
    split_generators_kwargs = {}
    return split_generators_kwargs
1029
+
1030
def as_dataset(
    self,
    split: Optional[Union[str, Split, list[str], list[Split]]] = None,
    run_post_process=True,
    verification_mode: Optional[Union[VerificationMode, str]] = None,
    in_memory=False,
) -> Union[Dataset, DatasetDict]:
    """Return a Dataset for the specified split.

    Args:
        split (`datasets.Split`):
            Which subset of the data to return.
            When `None`, all the splits listed in `self.info.splits` are returned.
        run_post_process (`bool`, defaults to `True`):
            Whether to run post-processing dataset transforms and/or add
            indexes.
        verification_mode ([`VerificationMode`] or `str`, defaults to `BASIC_CHECKS`):
            Verification mode determining the checks to run on the
            downloaded/processed dataset information (checksums/size/splits/...).

            <Added version="2.9.1"/>
        in_memory (`bool`, defaults to `False`):
            Whether to copy the data in-memory.

    Returns:
        datasets.Dataset, or a `DatasetDict` when the built datasets form a dict
        (for instance when all splits are requested).

    Raises:
        FileFormatError: If the dataset was prepared in a format other than "arrow".
        NotImplementedError: If the dataset is stored on a remote filesystem.
        FileNotFoundError: If the output directory doesn't exist.

    Example:

    ```py
    >>> from datasets import load_dataset_builder
    >>> builder = load_dataset_builder('cornell-movie-review-data/rotten_tomatoes')
    >>> builder.download_and_prepare()
    >>> ds = builder.as_dataset(split='train')
    >>> ds
    Dataset({
        features: ['text', 'label'],
        num_rows: 8530
    })
    ```
    """
    if self._file_format is not None and self._file_format != "arrow":
        raise FileFormatError('Loading a dataset not written in the "arrow" format is not supported.')
    if is_remote_filesystem(self._fs):
        raise NotImplementedError(f"Loading a dataset cached in a {type(self._fs).__name__} is not supported.")
    if not os.path.exists(self._output_dir):
        raise FileNotFoundError(
            f"Dataset {self.dataset_name}: could not find data in {self._output_dir}. Please make sure to call "
            "builder.download_and_prepare(), or use "
            "datasets.load_dataset() before trying to access the Dataset object."
        )

    logger.debug(f"Constructing Dataset for split {split or ', '.join(self.info.splits)}, from {self._output_dir}")

    # By default, return all splits
    if split is None:
        split = {s: s for s in self.info.splits}

    verification_mode = VerificationMode(verification_mode or VerificationMode.BASIC_CHECKS)

    # Create a dataset for each of the given splits
    build_one_split = partial(
        self._build_single_dataset,
        run_post_process=run_post_process,
        verification_mode=verification_mode,
        in_memory=in_memory,
    )
    datasets = map_nested(build_one_split, split, map_tuple=True, disable_tqdm=True)
    if isinstance(datasets, dict):
        return DatasetDict(datasets)
    return datasets
1104
+
1105
def _build_single_dataset(
    self,
    split: Union[str, ReadInstruction, Split],
    run_post_process: bool,
    verification_mode: VerificationMode,
    in_memory: bool = False,
):
    """Build the dataset of a single split and optionally post-process it.

    Args:
        split: split to load. Anything that is not a `ReadInstruction` is converted
            to a `Split`; the special name `"all"` expands to every split of
            `self.info.splits` joined with `"+"`.
        run_post_process: whether to call `self._post_process` on the base dataset.
        verification_mode: verification mode used for the post-processing resources.
        in_memory: forwarded to `self._as_dataset`.

    Returns:
        The base dataset, or the post-processed one when `_post_process` returns
        something other than `None`.

    Raises:
        ValueError: if a post-processing resource file name contains `os.sep`, or if
            the recorded post-processed features don't match the dataset features.
    """
    if not isinstance(split, ReadInstruction):
        split = str(split)
        if split == "all":
            split = "+".join(self.info.splits.keys())
        split = Split(split)

    # Base dataset, read from the prepared files
    ds = self._as_dataset(split=split, in_memory=in_memory)
    if not run_post_process:
        return ds

    # Resource files are expected directly inside the output directory
    for resource_file_name in self._post_processing_resources(split).values():
        if os.sep in resource_file_name:
            raise ValueError(f"Resources shouldn't be in a sub-directory: {resource_file_name}")
    resources_paths = {}
    for resource_name, resource_file_name in self._post_processing_resources(split).items():
        resources_paths[resource_name] = os.path.join(self._output_dir, resource_file_name)

    post_processed = self._post_process(ds, resources_paths)
    if post_processed is None:
        # Nothing was post-processed: the base dataset is returned untouched
        return ds
    ds = post_processed

    recorded_checksums = {}
    # Hard-coded to False, so the verification branch below never runs
    record_checksums = False
    for resource_name, resource_path in resources_paths.items():
        recorded_checksums[resource_name] = get_size_checksum_dict(resource_path)
    if verification_mode == VerificationMode.ALL_CHECKS and record_checksums:
        if self.info.post_processed is None or self.info.post_processed.resources_checksums is None:
            expected_checksums = None
        else:
            expected_checksums = self.info.post_processed.resources_checksums.get(split)
        verify_checksums(expected_checksums, recorded_checksums, "post processing resources")

    # Record the checksums of this split's resources in the builder info
    if self.info.post_processed is None:
        self.info.post_processed = PostProcessedInfo()
    if self.info.post_processed.resources_checksums is None:
        self.info.post_processed.resources_checksums = {}
    self.info.post_processed.resources_checksums[str(split)] = recorded_checksums

    # Total size of the resources recorded so far, across all splits
    self.info.post_processing_size = sum(
        checksums_dict["num_bytes"]
        for split_checksums_dicts in self.info.post_processed.resources_checksums.values()
        for checksums_dict in split_checksums_dicts.values()
    )
    if self.info.dataset_size is not None and self.info.download_size is not None:
        self.info.size_in_bytes = (
            self.info.dataset_size + self.info.download_size + self.info.post_processing_size
        )
    self._save_info()

    # Mirror the updated builder info onto the returned dataset
    ds._info.post_processed = self.info.post_processed
    ds._info.post_processing_size = self.info.post_processing_size
    ds._info.size_in_bytes = self.info.size_in_bytes
    if self.info.post_processed.features is not None:
        if self.info.post_processed.features.type != ds.features.type:
            raise ValueError(
                f"Post-processed features info don't match the dataset:\nGot\n{self.info.post_processed.features}\nbut expected something like\n{ds.features}"
            )
        ds.info.features = self.info.post_processed.features

    return ds
1173
+
1174
def _as_dataset(self, split: Union[ReadInstruction, Split] = Split.TRAIN, in_memory: bool = False) -> Dataset:
    """Construct a `Dataset` from the prepared files.

    Internal implementation behind `as_dataset`; subclasses may override it.
    It reads the files found in the output directory through an `ArrowReader`.

    Args:
        split (`datasets.Split`):
            Which subset of the data to read.
        in_memory (`bool`, defaults to `False`):
            Whether to copy the data in-memory.

    Returns:
        `Dataset`
    """
    local_dir = self._fs._strip_protocol(self._output_dir)
    name = self.dataset_name
    if self._check_legacy_cache():
        # legacy caches are read using the builder name instead of the dataset name
        name = self.name
    reader = ArrowReader(local_dir, self.info)
    dataset_kwargs = reader.read(
        name=name,
        instructions=split,
        split_infos=self.info.splits.values(),
        in_memory=in_memory,
    )
    return Dataset(fingerprint=self._get_dataset_fingerprint(split), **dataset_kwargs)
1202
+
1203
def _get_dataset_fingerprint(self, split: Union[ReadInstruction, Split]) -> str:
    """Return the fingerprint of the dataset for the given split.

    The fingerprint is the hash of the relative data directory (in POSIX form)
    followed by the string form of the split spec, for example: train, train+test,
    train[:10%], test[:33%](pct1_dropremainder).
    """
    hasher = Hasher()
    relative_dir = Path(self._relative_data_dir()).as_posix()
    hasher.update(relative_dir)
    split_spec = str(split)
    hasher.update(split_spec)
    return hasher.hexdigest()
1210
+
1211
def as_streaming_dataset(
    self,
    split: Optional[str] = None,
    base_path: Optional[str] = None,
) -> Union[dict[str, IterableDataset], IterableDataset]:
    """Build streaming dataset(s) from the split generators.

    Args:
        split (`str`, *optional*):
            Name of the split to stream. When `None`, all the splits are returned.
        base_path (`str`, *optional*):
            Base path given to the streaming download manager; falls back to
            `self.base_path` when not provided.

    Returns:
        An `IterableDataset` for a single split, or an `IterableDatasetDict`
        when the result for the requested splits is a dict.

    Raises:
        NotImplementedError: if the builder's filesystem is remote.
        ValueError: if `split` is not one of the available split names.
    """
    if is_remote_filesystem(self._fs):
        raise NotImplementedError(
            f"Loading a streaming dataset cached in a {type(self._fs).__name__} is not supported yet."
        )

    download_config = DownloadConfig(token=self.token, storage_options=self.storage_options)
    dl_manager = StreamingDownloadManager(
        base_path=base_path or self.base_path,
        download_config=download_config,
        dataset_name=self.dataset_name,
        data_dir=self.config.data_dir,
    )
    self._check_manual_download(dl_manager)
    splits_generators = {sg.name: sg for sg in self._split_generators(dl_manager)}

    if split is None:
        # By default, return all splits
        splits_generator = splits_generators
    else:
        if split not in splits_generators:
            raise ValueError(f"Bad split: {split}. Available splits: {list(splits_generators)}")
        splits_generator = splits_generators[split]

    # Create a dataset for each of the given splits
    datasets = map_nested(
        self._as_streaming_dataset_single,
        splits_generator,
        map_tuple=True,
    )
    return IterableDatasetDict(datasets) if isinstance(datasets, dict) else datasets
1246
+
1247
def _as_streaming_dataset_single(
    self,
    splits_generator,
) -> IterableDataset:
    """Wrap the examples iterable of one split generator into an `IterableDataset`."""
    ex_iterable = self._get_examples_iterable_for_split(splits_generator)
    # add auth to be able to access and decode audio/image files from private repositories.
    if self.repo_id:
        token_per_repo_id = {self.repo_id: self.token}
    else:
        token_per_repo_id = {}
    return IterableDataset(
        ex_iterable,
        info=self.info,
        split=splits_generator.name,
        token_per_repo_id=token_per_repo_id,
    )
1257
+
1258
def _post_process(self, dataset: Dataset, resources_paths: Mapping[str, str]) -> Optional[Dataset]:
    """Run dataset transforms or add indexes.

    This default implementation does nothing and returns `None`.
    """
    return None
1261
+
1262
+ def _post_processing_resources(self, split: str) -> dict[str, str]:
1263
+ """Mapping resource_name -> resource_file_name"""
1264
+ return {}
1265
+
1266
def _download_post_processing_resources(
    self, split: str, resource_name: str, dl_manager: DownloadManager
) -> Optional[str]:
    """Download the resource using the download manager and return the downloaded path.

    This default implementation downloads nothing and returns `None`.
    """
    return None
1271
+
1272
@abc.abstractmethod
def _split_generators(self, dl_manager: Union[DownloadManager, StreamingDownloadManager]):
    """Declare the dataset splits and how each one is generated.

    Implementations return a list of `SplitGenerator`s defining how to generate
    data and what splits to use.

    Example:

        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={'file': 'train_data.zip'},
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={'file': 'test_data.zip'},
            ),
        ]

    With the code above, `_generate_examples(file='train_data.zip')` is called
    first to write the train data, then `_generate_examples(file='test_data.zip')`
    to write the test data.

    Datasets are typically split into different subsets to be used at various
    stages of training and evaluation. For datasets without a `VALIDATION` split,
    a fraction of the `TRAIN` data can be used for evaluation while iterating on
    a model, so as not to overfit to the `TEST` data.

    Use the given download manager for downloads and extractions. The
    `DownloadManager` caches downloads, so it is fine to have each generator
    attempt to download the source data. A good practice is to download all data
    in this function, and then distribute the relevant parts to each split with
    the `gen_kwargs` argument.

    Args:
        dl_manager (`Union[DownloadManager, StreamingDownloadManager]`):
            Download manager to download the data

    Returns:
        `list<SplitGenerator>`.
    """
    raise NotImplementedError()
1318
+
1319
@abc.abstractmethod
def _prepare_split(
    self,
    split_generator: SplitGenerator,
    file_format: str = "arrow",
    max_shard_size: Optional[Union[str, int]] = None,
    num_proc: Optional[int] = None,
    **kwargs,
):
    """Generate the examples of a split and record them on disk.

    Args:
        split_generator (`SplitGenerator`):
            Split generator to process
        file_format (`str`, *optional*):
            Format of the data files in which the dataset will be written.
            Supported formats: "arrow", "parquet". Default to "arrow" format.
        max_shard_size (`Union[str, int]`, *optional*):
            Maximum number of bytes written per shard, default is "500MB".
            The size is based on uncompressed data size, so in practice the shard files may be smaller than
            `max_shard_size` thanks to Parquet compression for example.
        num_proc (`int`, *optional*, defaults to `None`):
            Number of processes when downloading and generating the dataset locally.
            Multiprocessing is disabled by default.

            <Added version="2.7.0"/>
        **kwargs: Additional kwargs forwarded from _download_and_prepare
    """
    raise NotImplementedError()
1348
+
1349
def _get_examples_iterable_for_split(self, split_generator: SplitGenerator) -> ExamplesIterable:
    """Return an iterable that generates the examples of a split on the fly.

    This base implementation is not available and always raises.

    Args:
        split_generator (`SplitGenerator`):
            Split generator to process

    Raises:
        NotImplementedError: always.
    """
    raise NotImplementedError()
1357
+
1358
+
1359
class GeneratorBasedBuilder(DatasetBuilder):
    """Base class for datasets with data generation based on dict generators.

    `GeneratorBasedBuilder` is a convenience class that abstracts away much
    of the data writing and reading of `DatasetBuilder`. It expects subclasses to
    implement generators of feature dictionaries across the dataset splits
    (`_split_generators`). See the method docstrings for details.
    """

    @abc.abstractmethod
    def _generate_examples(self, **kwargs):
        """Default function generating examples for each `SplitGenerator`.

        This function preprocesses the examples from the raw data to the preprocessed
        dataset files.
        This function is called once for each `SplitGenerator` defined in
        `_split_generators`. The examples yielded here will be written on
        disk.

        Args:
            **kwargs (additional keyword arguments):
                Arguments forwarded from the SplitGenerator.gen_kwargs

        Yields:
            key: `str` or `int`, a unique deterministic example identification key.
                * Unique: An error will be raised if two examples are yielded with the
                same key.
                * Deterministic: When generating the dataset twice, the same example
                should have the same key.
                Good keys can be the image id, or line number if examples are extracted
                from a text file.
                The key will be hashed and sorted to shuffle examples deterministically,
                such that generating the dataset multiple times keeps examples in the
                same order.
            example: `dict<str feature_name, feature_value>`, a feature dictionary
                ready to be encoded and written to disk. The example will be
                encoded with `self.info.features.encode_example({...})`.
        """
        raise NotImplementedError()

    def _prepare_split(
        self,
        split_generator: SplitGenerator,
        check_duplicate_keys: bool,
        file_format: str = "arrow",
        num_proc: Optional[int] = None,
        max_shard_size: Optional[Union[int, str]] = None,
    ):
        """Generate the examples of one split and write them to shard files.

        The work is done by `_prepare_split_single`, either in this process or in
        a pool of `num_proc` workers. Shards are first written under temporary
        names containing the job id and the per-job shard id, then renamed once
        the total number of shards is known.

        Args:
            split_generator (`SplitGenerator`):
                Split generator to process. Its `split_info` is updated with the
                number of examples and bytes written (and the shard lengths when
                there is more than one shard).
            check_duplicate_keys (`bool`):
                Forwarded to the writer as `check_duplicates`.
            file_format (`str`, defaults to `"arrow"`):
                Used as the file extension; `"parquet"` selects `ParquetWriter`,
                anything else `ArrowWriter`.
            num_proc (`int`, *optional*):
                Number of worker processes. It is lowered to the number of input
                shards found in `gen_kwargs` when that number is smaller.
            max_shard_size (`Union[int, str]`, *optional*):
                Size threshold above which a new shard is started. Falls back to
                `config.MAX_SHARD_SIZE`.
        """
        max_shard_size = convert_file_size_to_int(max_shard_size or config.MAX_SHARD_SIZE)

        if self.info.splits is not None:
            split_info = self.info.splits[split_generator.name]
        else:
            split_info = split_generator.split_info

        # Temporary file name pattern: JJJJJ = job id, SSSSS = shard id within the job,
        # NNNNN = total number of shards (only known at the end)
        SUFFIX = "-JJJJJ-SSSSS-of-NNNNN"
        fname = f"{self.dataset_name}-{split_generator.name}{SUFFIX}.{file_format}"
        fpath = posixpath.join(self._output_dir, fname)

        if num_proc and num_proc > 1:
            num_input_shards = _number_of_shards_in_gen_kwargs(split_generator.gen_kwargs)
            if num_input_shards <= 1:
                logger.warning(
                    f"Setting num_proc from {num_proc} back to 1 for the {split_info.name} split to disable multiprocessing as it only contains one shard."
                )
                num_proc = 1
            elif num_input_shards < num_proc:
                logger.warning(
                    f"Setting num_proc from {num_proc} to {num_input_shards} for the {split_info.name} split as it only contains {num_input_shards} shards."
                )
                num_proc = num_input_shards

        pbar = hf_tqdm(
            unit=" examples",
            total=split_info.num_examples,
            desc=f"Generating {split_info.name} split",
        )

        _prepare_split_args = {
            "fpath": fpath,
            "file_format": file_format,
            "max_shard_size": max_shard_size,
            "split_info": split_info,
            "check_duplicate_keys": check_duplicate_keys,
        }

        if num_proc is None or num_proc == 1:
            result = None
            gen_kwargs = split_generator.gen_kwargs
            job_id = 0
            with pbar:
                for job_id, done, content in self._prepare_split_single(
                    gen_kwargs=gen_kwargs, job_id=job_id, **_prepare_split_args
                ):
                    if done:
                        result = content
                    else:
                        pbar.update(content)
            # wrapping everything into lists for consistency with the multiprocessed code path
            assert result is not None, "Failed to retrieve results from prepare_split"
            examples_per_job, bytes_per_job, features_per_job, shards_per_job, shard_lengths_per_job = (
                [item] for item in result
            )
        else:
            kwargs_per_job = [
                {"gen_kwargs": gen_kwargs, "job_id": job_id, **_prepare_split_args}
                for job_id, gen_kwargs in enumerate(
                    _split_gen_kwargs(split_generator.gen_kwargs, max_num_jobs=num_proc)
                )
            ]
            num_jobs = len(kwargs_per_job)

            examples_per_job = [None] * num_jobs
            bytes_per_job = [None] * num_jobs
            features_per_job = [None] * num_jobs
            shards_per_job = [None] * num_jobs
            shard_lengths_per_job = [None] * num_jobs

            with Pool(num_proc) as pool:
                with pbar:
                    for job_id, done, content in iflatmap_unordered(
                        pool, self._prepare_split_single, kwargs_iterable=kwargs_per_job
                    ):
                        if done:
                            # the content is the result of the job
                            (
                                examples_per_job[job_id],
                                bytes_per_job[job_id],
                                features_per_job[job_id],
                                shards_per_job[job_id],
                                shard_lengths_per_job[job_id],
                            ) = content
                        else:
                            # the content is the number of examples progress update
                            pbar.update(content)

            assert None not in examples_per_job, (
                f"Failed to retrieve results from prepare_split: result list {examples_per_job} still contains None - at least one worker failed to return its results"
            )

        total_shards = sum(shards_per_job)
        total_num_examples = sum(examples_per_job)
        total_num_bytes = sum(bytes_per_job)
        features = features_per_job[0]

        split_generator.split_info.num_examples = total_num_examples
        split_generator.split_info.num_bytes = total_num_bytes

        # should rename everything at the end
        logger.debug(f"Renaming {total_shards} shards.")
        if total_shards > 1:
            # use the -SSSSS-of-NNNNN pattern

            # Global index of the first shard of each job, computed once instead of
            # re-summing `shards_per_job[:job_id]` for every single shard
            first_global_shard_id_per_job = []
            num_shards_so_far = 0
            for num_shards in shards_per_job:
                first_global_shard_id_per_job.append(num_shards_so_far)
                num_shards_so_far += num_shards

            def _rename_shard(shard_and_job: tuple[int, int]):
                shard_id, job_id = shard_and_job
                global_shard_id = first_global_shard_id_per_job[job_id] + shard_id
                self._rename(
                    fpath.replace("SSSSS", f"{shard_id:05d}").replace("JJJJJ", f"{job_id:05d}"),
                    fpath.replace("JJJJJ-SSSSS", f"{global_shard_id:05d}").replace("NNNNN", f"{total_shards:05d}"),
                )

            shards_and_jobs = [
                (shard_id, job_id)
                for job_id, num_shards in enumerate(shards_per_job)
                for shard_id in range(num_shards)
            ]
            thread_map(_rename_shard, shards_and_jobs, disable=True, max_workers=64)

            split_generator.split_info.shard_lengths = [
                shard_length for shard_lengths in shard_lengths_per_job for shard_length in shard_lengths
            ]
        else:
            # don't use any pattern
            shard_id, job_id = 0, 0
            self._rename(
                fpath.replace("SSSSS", f"{shard_id:05d}").replace("JJJJJ", f"{job_id:05d}"),
                fpath.replace(SUFFIX, ""),
            )

        if self.info.features is None:
            self.info.features = features

    def _prepare_split_single(
        self,
        gen_kwargs: dict,
        fpath: str,
        file_format: str,
        max_shard_size: int,
        split_info: SplitInfo,
        check_duplicate_keys: bool,
        job_id: int,
    ) -> Iterable[tuple[int, bool, Union[int, tuple]]]:
        """Write the examples of one job to shard files, reporting progress.

        Yields:
            `(job_id, done, content)` tuples. While `done` is `False`, `content` is
            the number of examples written since the previous progress update. The
            last tuple has `done=True` and `content` is
            `(num_examples, num_bytes, features, num_shards, shard_lengths)`.

        Raises:
            DatasetGenerationError: if anything fails while generating or writing.
        """
        generator = self._generate_examples(**gen_kwargs)
        writer_class = ParquetWriter if file_format == "parquet" else ArrowWriter
        embed_local_files = file_format == "parquet"
        shard_lengths = []
        total_num_examples, total_num_bytes = 0, 0

        shard_id = 0
        num_examples_progress_update = 0
        try:
            writer = writer_class(
                features=self.info.features,
                path=fpath.replace("SSSSS", f"{shard_id:05d}").replace("JJJJJ", f"{job_id:05d}"),
                writer_batch_size=self._writer_batch_size,
                hash_salt=split_info.name,
                check_duplicates=check_duplicate_keys,
                storage_options=self._fs.storage_options,
                embed_local_files=embed_local_files,
            )
            try:
                _time = time.time()
                for key, record in generator:
                    # Start a new shard once the current one exceeds the size limit
                    if max_shard_size is not None and writer._num_bytes > max_shard_size:
                        num_examples, num_bytes = writer.finalize()
                        writer.close()
                        shard_lengths.append(num_examples)
                        total_num_examples += num_examples
                        total_num_bytes += num_bytes
                        shard_id += 1
                        writer = writer_class(
                            features=writer._features,
                            path=fpath.replace("SSSSS", f"{shard_id:05d}").replace("JJJJJ", f"{job_id:05d}"),
                            writer_batch_size=self._writer_batch_size,
                            hash_salt=split_info.name,
                            check_duplicates=check_duplicate_keys,
                            storage_options=self._fs.storage_options,
                            embed_local_files=embed_local_files,
                        )
                    example = self.info.features.encode_example(record) if self.info.features is not None else record
                    writer.write(example, key)
                    num_examples_progress_update += 1
                    # Progress updates are throttled to one per refresh interval
                    if time.time() > _time + config.PBAR_REFRESH_TIME_INTERVAL:
                        _time = time.time()
                        yield job_id, False, num_examples_progress_update
                        num_examples_progress_update = 0
            finally:
                # Report the remaining progress, then finalize the last shard
                yield job_id, False, num_examples_progress_update
                num_shards = shard_id + 1
                num_examples, num_bytes = writer.finalize()
                writer.close()
                shard_lengths.append(num_examples)
                total_num_examples += num_examples
                total_num_bytes += num_bytes
        except Exception as e:
            # Ignore the writer's error for no examples written to the file if this error was caused by the error in _generate_examples before the first example was yielded
            if isinstance(e, SchemaInferenceError) and e.__context__ is not None:
                e = e.__context__
            raise DatasetGenerationError("An error occurred while generating the dataset") from e

        yield job_id, True, (total_num_examples, total_num_bytes, writer._features, num_shards, shard_lengths)

    def _download_and_prepare(self, dl_manager, verification_mode, **prepare_splits_kwargs):
        """Run the parent implementation, checking duplicate keys for BASIC_CHECKS and ALL_CHECKS."""
        super()._download_and_prepare(
            dl_manager,
            verification_mode,
            check_duplicate_keys=verification_mode == VerificationMode.BASIC_CHECKS
            or verification_mode == VerificationMode.ALL_CHECKS,
            **prepare_splits_kwargs,
        )

    def _get_examples_iterable_for_split(self, split_generator: SplitGenerator) -> ExamplesIterable:
        """Return an `ExamplesIterable` over `_generate_examples` called with the split's `gen_kwargs`."""
        return ExamplesIterable(self._generate_examples, split_generator.gen_kwargs)
1622
+
1623
+
1624
class ArrowBasedBuilder(DatasetBuilder):
    """Base class for datasets with data generation based on Arrow loading functions (CSV/JSON/Parquet)."""

    @abc.abstractmethod
    def _generate_tables(self, **kwargs):
        """Default function generating tables for each `SplitGenerator`.

        This function preprocesses the raw data into tables that are written to
        the preprocessed dataset files.
        This function is called once for each `SplitGenerator` defined in
        `_split_generators`. The tables yielded here will be written on
        disk.

        Args:
            **kwargs (additional keyword arguments):
                Arguments forwarded from the SplitGenerator.gen_kwargs

        Yields:
            key: an identification key for the table. `_prepare_split_single`
                ignores it when writing the tables.
            table: `pyarrow.Table`, a feature table
                ready to be encoded and written to disk.
        """
        raise NotImplementedError()

    def _prepare_split(
        self,
        split_generator: SplitGenerator,
        file_format: str = "arrow",
        num_proc: Optional[int] = None,
        max_shard_size: Optional[Union[str, int]] = None,
    ):
        """Generate the tables of one split and write them to shard files.

        The work is done by `_prepare_split_single`, either in this process or in
        a pool of `num_proc` workers. Shards are first written under temporary
        names containing the job id and the per-job shard id, then renamed once
        the total number of shards is known.

        Args:
            split_generator (`SplitGenerator`):
                Split generator to process. Its `split_info` is updated with the
                number of examples and bytes written (and the shard lengths when
                there is more than one shard).
            file_format (`str`, defaults to `"arrow"`):
                Used as the file extension; `"parquet"` selects `ParquetWriter`,
                anything else `ArrowWriter`.
            num_proc (`int`, *optional*):
                Number of worker processes. It is lowered to the number of input
                shards found in `gen_kwargs` when that number is smaller.
            max_shard_size (`Union[str, int]`, *optional*):
                Size threshold above which a new shard is started. Falls back to
                `config.MAX_SHARD_SIZE`.
        """
        max_shard_size = convert_file_size_to_int(max_shard_size or config.MAX_SHARD_SIZE)

        # Any failure to look the split up falls back to the generator's own split info
        try:
            split_info = self.info.splits[split_generator.name]
        except Exception:
            split_info = split_generator.split_info

        # Temporary file name pattern: JJJJJ = job id, SSSSS = shard id within the job,
        # NNNNN = total number of shards (only known at the end)
        SUFFIX = "-JJJJJ-SSSSS-of-NNNNN"
        fname = f"{self.dataset_name}-{split_generator.name}{SUFFIX}.{file_format}"
        fpath = posixpath.join(self._output_dir, fname)

        if num_proc and num_proc > 1:
            num_input_shards = _number_of_shards_in_gen_kwargs(split_generator.gen_kwargs)
            if num_input_shards <= 1:
                logger.warning(
                    f"Setting num_proc from {num_proc} back to 1 for the {split_info.name} split to disable multiprocessing as it only contains one shard."
                )
                num_proc = 1
            elif num_input_shards < num_proc:
                logger.warning(
                    f"Setting num_proc from {num_proc} to {num_input_shards} for the {split_info.name} split as it only contains {num_input_shards} shards."
                )
                num_proc = num_input_shards

        pbar = hf_tqdm(
            unit=" examples",
            total=split_info.num_examples,
            desc=f"Generating {split_info.name} split",
        )

        _prepare_split_args = {
            "fpath": fpath,
            "file_format": file_format,
            "max_shard_size": max_shard_size,
        }

        if num_proc is None or num_proc == 1:
            result = None
            gen_kwargs = split_generator.gen_kwargs
            job_id = 0
            with pbar:
                for job_id, done, content in self._prepare_split_single(
                    gen_kwargs=gen_kwargs, job_id=job_id, **_prepare_split_args
                ):
                    if done:
                        result = content
                    else:
                        pbar.update(content)
            # wrapping everything into lists for consistency with the multiprocessed code path
            assert result is not None, "Failed to retrieve results from prepare_split"
            examples_per_job, bytes_per_job, features_per_job, shards_per_job, shard_lengths_per_job = (
                [item] for item in result
            )
        else:
            kwargs_per_job = [
                {"gen_kwargs": gen_kwargs, "job_id": job_id, **_prepare_split_args}
                for job_id, gen_kwargs in enumerate(
                    _split_gen_kwargs(split_generator.gen_kwargs, max_num_jobs=num_proc)
                )
            ]
            num_jobs = len(kwargs_per_job)

            examples_per_job = [None] * num_jobs
            bytes_per_job = [None] * num_jobs
            features_per_job = [None] * num_jobs
            shards_per_job = [None] * num_jobs
            shard_lengths_per_job = [None] * num_jobs

            with Pool(num_proc) as pool:
                with pbar:
                    for job_id, done, content in iflatmap_unordered(
                        pool, self._prepare_split_single, kwargs_iterable=kwargs_per_job
                    ):
                        if done:
                            # the content is the result of the job
                            (
                                examples_per_job[job_id],
                                bytes_per_job[job_id],
                                features_per_job[job_id],
                                shards_per_job[job_id],
                                shard_lengths_per_job[job_id],
                            ) = content
                        else:
                            # the content is the number of examples progress update
                            pbar.update(content)

            assert None not in examples_per_job, (
                f"Failed to retrieve results from prepare_split: result list {examples_per_job} still contains None - at least one worker failed to return its results"
            )

        total_shards = sum(shards_per_job)
        total_num_examples = sum(examples_per_job)
        total_num_bytes = sum(bytes_per_job)
        features = features_per_job[0]

        split_generator.split_info.num_examples = total_num_examples
        split_generator.split_info.num_bytes = total_num_bytes

        # should rename everything at the end
        logger.debug(f"Renaming {total_shards} shards.")
        if total_shards > 1:
            # use the -SSSSS-of-NNNNN pattern

            # Global index of the first shard of each job, computed once instead of
            # re-summing `shards_per_job[:job_id]` for every single shard
            first_global_shard_id_per_job = []
            num_shards_so_far = 0
            for num_shards in shards_per_job:
                first_global_shard_id_per_job.append(num_shards_so_far)
                num_shards_so_far += num_shards

            def _rename_shard(shard_id_and_job: tuple[int, int]):
                shard_id, job_id = shard_id_and_job
                global_shard_id = first_global_shard_id_per_job[job_id] + shard_id
                self._rename(
                    fpath.replace("SSSSS", f"{shard_id:05d}").replace("JJJJJ", f"{job_id:05d}"),
                    fpath.replace("JJJJJ-SSSSS", f"{global_shard_id:05d}").replace("NNNNN", f"{total_shards:05d}"),
                )

            shard_ids_and_jobs = [
                (shard_id, job_id)
                for job_id, num_shards in enumerate(shards_per_job)
                for shard_id in range(num_shards)
            ]
            thread_map(_rename_shard, shard_ids_and_jobs, disable=True, max_workers=64)

            split_generator.split_info.shard_lengths = [
                shard_length for shard_lengths in shard_lengths_per_job for shard_length in shard_lengths
            ]
        else:
            # don't use any pattern
            shard_id, job_id = 0, 0
            self._rename(
                fpath.replace("SSSSS", f"{shard_id:05d}").replace("JJJJJ", f"{job_id:05d}"),
                fpath.replace(SUFFIX, ""),
            )

        if self.info.features is None:
            self.info.features = features

    def _prepare_split_single(
        self, gen_kwargs: dict, fpath: str, file_format: str, max_shard_size: int, job_id: int
    ) -> Iterable[tuple[int, bool, Union[int, tuple]]]:
        """Write the tables of one job to shard files, reporting progress.

        Yields:
            `(job_id, done, content)` tuples. While `done` is `False`, `content` is
            the number of rows written since the previous progress update. The
            last tuple has `done=True` and `content` is
            `(num_examples, num_bytes, features, num_shards, shard_lengths)`.

        Raises:
            DatasetGenerationCastError: if a table cannot be cast by the writer.
            DatasetGenerationError: if anything else fails while generating or writing.
        """
        # List arguments are wrapped in `tracked_list` before being given to the generator
        gen_kwargs = {k: tracked_list(v) if isinstance(v, list) else v for k, v in gen_kwargs.items()}
        generator = self._generate_tables(**gen_kwargs)
        writer_class = ParquetWriter if file_format == "parquet" else ArrowWriter
        embed_local_files = file_format == "parquet"
        shard_lengths = []
        total_num_examples, total_num_bytes = 0, 0

        shard_id = 0
        num_examples_progress_update = 0
        try:
            writer = writer_class(
                features=self.info.features,
                path=fpath.replace("SSSSS", f"{shard_id:05d}").replace("JJJJJ", f"{job_id:05d}"),
                writer_batch_size=self._writer_batch_size,
                storage_options=self._fs.storage_options,
                embed_local_files=embed_local_files,
            )
            try:
                _time = time.time()
                for _, table in generator:
                    # Start a new shard once the current one exceeds the size limit
                    if max_shard_size is not None and writer._num_bytes > max_shard_size:
                        num_examples, num_bytes = writer.finalize()
                        writer.close()
                        shard_lengths.append(num_examples)
                        total_num_examples += num_examples
                        total_num_bytes += num_bytes
                        shard_id += 1
                        writer = writer_class(
                            features=writer._features,
                            path=fpath.replace("SSSSS", f"{shard_id:05d}").replace("JJJJJ", f"{job_id:05d}"),
                            writer_batch_size=self._writer_batch_size,
                            storage_options=self._fs.storage_options,
                            embed_local_files=embed_local_files,
                        )
                    try:
                        writer.write_table(table)
                    except CastError as cast_error:
                        raise DatasetGenerationCastError.from_cast_error(
                            cast_error=cast_error,
                            builder_name=self.info.builder_name,
                            gen_kwargs=gen_kwargs,
                            token=self.token,
                        )
                    num_examples_progress_update += len(table)
                    # Progress updates are throttled to one per refresh interval
                    if time.time() > _time + config.PBAR_REFRESH_TIME_INTERVAL:
                        _time = time.time()
                        yield job_id, False, num_examples_progress_update
                        num_examples_progress_update = 0
            finally:
                # Report the remaining progress, then finalize the last shard
                yield job_id, False, num_examples_progress_update
                num_shards = shard_id + 1
                num_examples, num_bytes = writer.finalize()
                writer.close()
                shard_lengths.append(num_examples)
                total_num_examples += num_examples
                total_num_bytes += num_bytes
        except Exception as e:
            # Ignore the writer's error for no examples written to the file if this error was caused by the error in _generate_tables before the first table was yielded
            if isinstance(e, SchemaInferenceError) and e.__context__ is not None:
                e = e.__context__
            if isinstance(e, DatasetGenerationError):
                # `e` may have been replaced by the underlying error just above, so it is
                # raised explicitly: a bare `raise` would re-raise the exception that was
                # caught (the SchemaInferenceError) instead of the unwrapped one.
                raise e
            raise DatasetGenerationError("An error occurred while generating the dataset") from e

        yield job_id, True, (total_num_examples, total_num_bytes, writer._features, num_shards, shard_lengths)

    def _get_examples_iterable_for_split(self, split_generator: SplitGenerator) -> ExamplesIterable:
        """Return an `ArrowExamplesIterable` over `_generate_tables` called with the split's `gen_kwargs`."""
        return ArrowExamplesIterable(self._generate_tables, kwargs=split_generator.gen_kwargs)
datasets/combine.py ADDED
@@ -0,0 +1,223 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Optional, TypeVar
2
+
3
+ from .arrow_dataset import Dataset, _concatenate_map_style_datasets, _interleave_map_style_datasets
4
+ from .dataset_dict import DatasetDict, IterableDatasetDict
5
+ from .info import DatasetInfo
6
+ from .iterable_dataset import IterableDataset, _concatenate_iterable_datasets, _interleave_iterable_datasets
7
+ from .splits import NamedSplit
8
+ from .utils import logging
9
+ from .utils.py_utils import Literal
10
+
11
+
12
# Module-level logger, named after this module.
logger = logging.get_logger(__name__)


# Constrained type variable: it resolves to either `Dataset` or `IterableDataset`, which lets the
# functions below declare that they return the same kind of dataset they were given.
DatasetType = TypeVar("DatasetType", Dataset, IterableDataset)
16
+
17
+
18
def interleave_datasets(
    datasets: list[DatasetType],
    probabilities: Optional[list[float]] = None,
    seed: Optional[int] = None,
    info: Optional[DatasetInfo] = None,
    split: Optional[NamedSplit] = None,
    stopping_strategy: Literal[
        "first_exhausted", "all_exhausted", "all_exhausted_without_replacement"
    ] = "first_exhausted",
) -> DatasetType:
    """
    Interleave several datasets (sources) into a single dataset.
    The new dataset is built by alternating between the sources to pick the examples.

    The input must be either a list of [`Dataset`] objects or a list of [`IterableDataset`] objects
    (the two kinds cannot be mixed).

    - If `probabilities` is `None` (default), the new dataset is constructed by cycling between each source to get the examples.
    - If `probabilities` is not `None`, the new dataset is constructed by getting examples from a random source at a time according to the provided probabilities.

    When the resulting dataset ends is decided by `stopping_strategy`: by default it ends as soon as one of the
    sources runs out of examples; with `"all_exhausted"` it ends once every source has run out of examples at least one time.

    Note for iterable datasets:

    In a distributed setup or in PyTorch DataLoader workers, the stopping strategy is applied per process.
    Therefore the "first_exhausted" strategy on a sharded iterable dataset can generate fewer samples in total (up to 1 missing sample per subdataset per worker).

    Args:
        datasets (`List[Dataset]` or `List[IterableDataset]`):
            List of datasets to interleave.
        probabilities (`List[float]`, *optional*, defaults to `None`):
            If specified, the new dataset is constructed by sampling
            examples from one source at a time according to these probabilities.
        seed (`int`, *optional*, defaults to `None`):
            The random seed used to choose a source for each example.
        info ([`DatasetInfo`], *optional*):
            Dataset information, like description, citation, etc.
            <Added version="2.4.0"/>
        split ([`NamedSplit`], *optional*):
            Name of the dataset split.
            <Added version="2.4.0"/>
        stopping_strategy (`str`, defaults to `first_exhausted`):
            Three strategies are available: `first_exhausted`, `all_exhausted` and `all_exhausted_without_replacement`.
            By default, `first_exhausted` is an undersampling strategy, i.e. the dataset construction is stopped as soon as one dataset has run out of samples.
            If the strategy is `all_exhausted`, we use an oversampling strategy, i.e. the dataset construction is stopped as soon as every sample of every dataset has been added at least once.
            When the strategy is `all_exhausted_without_replacement` we make sure that each sample in each dataset is sampled only once.
            Note that if the strategy is `all_exhausted`, the interleaved dataset size can get enormous:
            - with no probabilities, the resulting dataset will have `max_length_datasets*nb_dataset` samples.
            - with given probabilities, the resulting dataset will have more samples if some datasets have really low probability of visiting.

    Returns:
        [`Dataset`] or [`IterableDataset`]: Return type depends on the input `datasets`
        parameter. `Dataset` if the input is a list of `Dataset`, `IterableDataset` if the input is a list of
        `IterableDataset`.

    Raises:
        ValueError: If `datasets` is empty, if an element is neither a `Dataset` nor an `IterableDataset`
            (dataset dictionaries get a dedicated message), if the two kinds are mixed,
            or if `stopping_strategy` is not one of the supported values.

    Example:

        For regular datasets (map-style):

        ```python
        >>> from datasets import Dataset, interleave_datasets
        >>> d1 = Dataset.from_dict({"a": [0, 1, 2]})
        >>> d2 = Dataset.from_dict({"a": [10, 11, 12]})
        >>> d3 = Dataset.from_dict({"a": [20, 21, 22]})
        >>> dataset = interleave_datasets([d1, d2, d3], probabilities=[0.7, 0.2, 0.1], seed=42, stopping_strategy="all_exhausted")
        >>> dataset["a"]
        [10, 0, 11, 1, 2, 20, 12, 10, 0, 1, 2, 21, 0, 11, 1, 2, 0, 1, 12, 2, 10, 0, 22]
        >>> dataset = interleave_datasets([d1, d2, d3], probabilities=[0.7, 0.2, 0.1], seed=42)
        >>> dataset["a"]
        [10, 0, 11, 1, 2]
        >>> dataset = interleave_datasets([d1, d2, d3])
        >>> dataset["a"]
        [0, 10, 20, 1, 11, 21, 2, 12, 22]
        >>> dataset = interleave_datasets([d1, d2, d3], stopping_strategy="all_exhausted")
        >>> dataset["a"]
        [0, 10, 20, 1, 11, 21, 2, 12, 22]
        >>> d1 = Dataset.from_dict({"a": [0, 1, 2]})
        >>> d2 = Dataset.from_dict({"a": [10, 11, 12, 13]})
        >>> d3 = Dataset.from_dict({"a": [20, 21, 22, 23, 24]})
        >>> dataset = interleave_datasets([d1, d2, d3])
        >>> dataset["a"]
        [0, 10, 20, 1, 11, 21, 2, 12, 22]
        >>> dataset = interleave_datasets([d1, d2, d3], stopping_strategy="all_exhausted")
        >>> dataset["a"]
        [0, 10, 20, 1, 11, 21, 2, 12, 22, 0, 13, 23, 1, 10, 24]
        >>> dataset = interleave_datasets([d1, d2, d3], probabilities=[0.7, 0.2, 0.1], seed=42)
        >>> dataset["a"]
        [10, 0, 11, 1, 2]
        >>> dataset = interleave_datasets([d1, d2, d3], probabilities=[0.7, 0.2, 0.1], seed=42, stopping_strategy="all_exhausted")
        >>> dataset["a"]
        [10, 0, 11, 1, 2, 20, 12, 13, ..., 0, 1, 2, 0, 24]
        ```

        For datasets in streaming mode (iterable):

        ```python
        >>> from datasets import interleave_datasets, load_dataset
        >>> d1 = load_dataset('allenai/c4', 'es', split='train', streaming=True)
        >>> d2 = load_dataset('allenai/c4', 'fr', split='train', streaming=True)
        >>> dataset = interleave_datasets([d1, d2])
        >>> iterator = iter(dataset)
        >>> next(iterator)
        {'text': 'Comprar Zapatillas para niña en chancla con goma por...'}
        >>> next(iterator)
        {'text': 'Le sacre de philippe ier, 23 mai 1059 - Compte Rendu...'}
        ```
    """
    from .arrow_dataset import Dataset
    from .iterable_dataset import IterableDataset

    if not datasets:
        raise ValueError("Unable to interleave an empty list of datasets.")

    for i, dataset in enumerate(datasets):
        if isinstance(dataset, (Dataset, IterableDataset)):
            if i == 0:
                # The first element fixes the kind that every other element must share.
                if isinstance(dataset, Dataset):
                    dataset_type, other_type = Dataset, IterableDataset
                else:
                    dataset_type, other_type = IterableDataset, Dataset
            elif not isinstance(dataset, dataset_type):
                raise ValueError(
                    f"Unable to interleave a {dataset_type.__name__} (at position 0) with a {other_type.__name__} (at position {i}). Expected a list of Dataset objects or a list of IterableDataset objects."
                )
            continue

        # From here on the element is not a dataset: pick the most helpful error message.
        if isinstance(dataset, (DatasetDict, IterableDatasetDict)):
            if dataset:
                raise ValueError(
                    f"Dataset at position {i} has at least one split: {list(dataset)}\n"
                    f"Please pick one to interleave with the other datasets, for example: dataset['{next(iter(dataset))}']"
                )
            raise ValueError(
                f"Expected a list of Dataset objects or a list of IterableDataset objects, but element at position {i} "
                "is an empty dataset dictionary."
            )
        raise ValueError(
            f"Expected a list of Dataset objects or a list of IterableDataset objects, but element at position {i} is a {type(dataset).__name__}."
        )

    supported_strategies = ("first_exhausted", "all_exhausted", "all_exhausted_without_replacement")
    if stopping_strategy not in supported_strategies:
        raise ValueError(f"{stopping_strategy} is not supported. Please enter a valid stopping_strategy.")

    # Dispatch to the implementation matching the kind of the inputs.
    if dataset_type is Dataset:
        interleave = _interleave_map_style_datasets
    else:
        interleave = _interleave_iterable_datasets
    return interleave(
        datasets,
        probabilities,
        seed,
        info=info,
        split=split,
        stopping_strategy=stopping_strategy,
    )
164
+
165
+
166
def concatenate_datasets(
    dsets: list[DatasetType],
    info: Optional[DatasetInfo] = None,
    split: Optional[NamedSplit] = None,
    axis: int = 0,
) -> DatasetType:
    """
    Converts a list of [`Dataset`] with the same schema into a single [`Dataset`].

    A list of [`IterableDataset`] objects is accepted as well, in which case an [`IterableDataset`] is returned.
    The two kinds cannot be mixed in the same list.

    Args:
        dsets (`List[datasets.Dataset]` or `List[datasets.IterableDataset]`):
            List of Datasets to concatenate.
        info (`DatasetInfo`, *optional*):
            Dataset information, like description, citation, etc.
        split (`NamedSplit`, *optional*):
            Name of the dataset split.
        axis (`{0, 1}`, defaults to `0`):
            Axis to concatenate over, where `0` means over rows (vertically) and `1` means over columns
            (horizontally).

            <Added version="1.6.0"/>

    Returns:
        [`Dataset`] or [`IterableDataset`]: Same kind as the elements of `dsets`.

    Raises:
        ValueError: If `dsets` is empty, if an element is neither a `Dataset` nor an `IterableDataset`
            (dataset dictionaries get a dedicated message), or if the two kinds are mixed.

    Example:

    ```py
    >>> ds3 = concatenate_datasets([ds1, ds2])
    ```
    """

    if not dsets:
        raise ValueError("Unable to concatenate an empty list of datasets.")
    for i, dataset in enumerate(dsets):
        if not isinstance(dataset, (Dataset, IterableDataset)):
            if isinstance(dataset, (DatasetDict, IterableDatasetDict)):
                if not dataset:
                    raise ValueError(
                        f"Expected a list of Dataset objects or a list of IterableDataset objects, but element at position {i} "
                        "is an empty dataset dictionary."
                    )
                # A dataset dictionary holds several splits: the caller has to choose one of them.
                raise ValueError(
                    f"Dataset at position {i} has at least one split: {list(dataset)}\n"
                    f"Please pick one to concatenate with the other datasets, for example: dataset['{next(iter(dataset))}']"
                )
            raise ValueError(
                f"Expected a list of Dataset objects or a list of IterableDataset objects, but element at position {i} is a {type(dataset).__name__}."
            )
        if i == 0:
            # The first element fixes the kind that every other element must share.
            dataset_type, other_type = (
                (Dataset, IterableDataset) if isinstance(dataset, Dataset) else (IterableDataset, Dataset)
            )
        elif not isinstance(dataset, dataset_type):
            raise ValueError(
                f"Unable to concatenate a {dataset_type.__name__} (at position 0) with a {other_type.__name__} (at position {i}). Expected a list of Dataset objects or a list of IterableDataset objects."
            )
    # Dispatch to the implementation matching the kind of the inputs.
    if dataset_type is Dataset:
        return _concatenate_map_style_datasets(dsets, info=info, split=split, axis=axis)
    else:
        return _concatenate_iterable_datasets(dsets, info=info, split=split, axis=axis)
datasets/config.py ADDED
@@ -0,0 +1,268 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import importlib
2
+ import importlib.metadata
3
+ import logging
4
+ import os
5
+ import platform
6
+ from pathlib import Path
7
+ from typing import Optional
8
+
9
+ from huggingface_hub import constants
10
+ from packaging import version
11
+
12
+
13
# The logger is looked up with the standard library `logging` module, under the first dotted component
# of this module's name, instead of going through the package's own logging helpers.
logger = logging.getLogger(__name__.split(".", 1)[0])  # to avoid circular import from .utils.logging

# Datasets
S3_DATASETS_BUCKET_PREFIX = "https://s3.amazonaws.com/datasets.huggingface.co/datasets/datasets"
CLOUDFRONT_DATASETS_DISTRIB_PREFIX = "https://cdn-datasets.huggingface.co/datasets/datasets"
REPO_DATASETS_URL = "https://raw.githubusercontent.com/huggingface/datasets/{revision}/datasets/{path}/{name}"

# Hub
# The endpoint can be overridden with the HF_ENDPOINT environment variable.
HF_ENDPOINT = os.environ.get("HF_ENDPOINT", "https://huggingface.co")
HUB_DATASETS_URL = HF_ENDPOINT + "/datasets/{repo_id}/resolve/{revision}/{path}"
HUB_DATASETS_HFFS_URL = "hf://datasets/{repo_id}@{revision}/{path}"
HUB_DEFAULT_VERSION = "main"

# Version of the running Python interpreter, parsed so it can be compared with other versions.
PY_VERSION = version.parse(platform.python_version())

# General environment variables accepted values for booleans
# (values read from the environment below are upper-cased before being compared with these sets)
ENV_VARS_TRUE_VALUES = {"1", "ON", "YES", "TRUE"}
ENV_VARS_FALSE_VALUES = {"0", "OFF", "NO", "FALSE"}
ENV_VARS_TRUE_AND_AUTO_VALUES = ENV_VARS_TRUE_VALUES.union({"AUTO"})
ENV_VARS_FALSE_AND_AUTO_VALUES = ENV_VARS_FALSE_VALUES.union({"AUTO"})


# Imports
# Installed versions of these packages, read from their distribution metadata.
# `importlib.metadata.version` raises `PackageNotFoundError` if a distribution is missing,
# and that error is not caught here.
DILL_VERSION = version.parse(importlib.metadata.version("dill"))
FSSPEC_VERSION = version.parse(importlib.metadata.version("fsspec"))
PANDAS_VERSION = version.parse(importlib.metadata.version("pandas"))
PYARROW_VERSION = version.parse(importlib.metadata.version("pyarrow"))
HF_HUB_VERSION = version.parse(importlib.metadata.version("huggingface_hub"))
41
+
42
# User preference for each framework, read from the environment and upper-cased.
# "AUTO" (the default) means "use it if it can be found".
USE_TF = os.environ.get("USE_TF", "AUTO").upper()
USE_TORCH = os.environ.get("USE_TORCH", "AUTO").upper()
USE_JAX = os.environ.get("USE_JAX", "AUTO").upper()

# NOTE(review): `importlib.util` is used throughout this section, but only `importlib` and
# `importlib.metadata` are imported explicitly at the top of the file - confirm that the submodule is
# always loaded by the time this code runs, or add `import importlib.util`.

TORCH_VERSION = "N/A"
TORCH_AVAILABLE = False

# PyTorch is only looked up when USE_TORCH is true/auto and USE_TF is not explicitly true.
if USE_TORCH in ENV_VARS_TRUE_AND_AUTO_VALUES and USE_TF not in ENV_VARS_TRUE_VALUES:
    TORCH_AVAILABLE = importlib.util.find_spec("torch") is not None
    if TORCH_AVAILABLE:
        try:
            TORCH_VERSION = version.parse(importlib.metadata.version("torch"))
            logger.debug(f"PyTorch version {TORCH_VERSION} available.")
        except importlib.metadata.PackageNotFoundError:
            # The module can be imported but has no distribution metadata: TORCH_VERSION stays "N/A".
            pass
else:
    # NOTE(review): this branch is also reached when USE_TORCH itself is set to a non-true,
    # non-auto value, in which case the message below does not describe the actual reason.
    logger.info("Disabling PyTorch because USE_TF is set")

POLARS_VERSION = "N/A"
POLARS_AVAILABLE = importlib.util.find_spec("polars") is not None

if POLARS_AVAILABLE:
    try:
        POLARS_VERSION = version.parse(importlib.metadata.version("polars"))
        logger.debug(f"Polars version {POLARS_VERSION} available.")
    except importlib.metadata.PackageNotFoundError:
        # No distribution metadata: POLARS_VERSION stays "N/A".
        pass


DUCKDB_VERSION = "N/A"
DUCKDB_AVAILABLE = importlib.util.find_spec("duckdb") is not None

if DUCKDB_AVAILABLE:
    try:
        DUCKDB_VERSION = version.parse(importlib.metadata.version("duckdb"))
        logger.debug(f"Duckdb version {DUCKDB_VERSION} available.")
    except importlib.metadata.PackageNotFoundError:
        # No distribution metadata: DUCKDB_VERSION stays "N/A".
        pass

TF_VERSION = "N/A"
TF_AVAILABLE = False

# TensorFlow is only looked up when USE_TF is true/auto and USE_TORCH is not explicitly true.
if USE_TF in ENV_VARS_TRUE_AND_AUTO_VALUES and USE_TORCH not in ENV_VARS_TRUE_VALUES:
    TF_AVAILABLE = importlib.util.find_spec("tensorflow") is not None
    if TF_AVAILABLE:
        # For the metadata, we have to try every distribution name under which the `tensorflow`
        # module may have been installed; the first one that is found provides the version.
        for package in [
            "tensorflow",
            "tensorflow-cpu",
            "tensorflow-gpu",
            "tf-nightly",
            "tf-nightly-cpu",
            "tf-nightly-gpu",
            "intel-tensorflow",
            "tensorflow-rocm",
            "tensorflow-macos",
        ]:
            try:
                TF_VERSION = version.parse(importlib.metadata.version(package))
            except importlib.metadata.PackageNotFoundError:
                continue
            else:
                break
        else:
            # None of the known distributions is installed: treat TensorFlow as unavailable.
            TF_AVAILABLE = False
    if TF_AVAILABLE:
        if TF_VERSION.major < 2:
            logger.info(f"TensorFlow found but with version {TF_VERSION}. `datasets` requires version 2 minimum.")
            TF_AVAILABLE = False
        else:
            logger.info(f"TensorFlow version {TF_VERSION} available.")
else:
    # NOTE(review): this branch is also reached when USE_TF itself is set to a non-true,
    # non-auto value, in which case the message below does not describe the actual reason.
    logger.info("Disabling Tensorflow because USE_TORCH is set")


JAX_VERSION = "N/A"
JAX_AVAILABLE = False

if USE_JAX in ENV_VARS_TRUE_AND_AUTO_VALUES:
    # Both `jax` and `jaxlib` have to be importable.
    JAX_AVAILABLE = importlib.util.find_spec("jax") is not None and importlib.util.find_spec("jaxlib") is not None
    if JAX_AVAILABLE:
        try:
            JAX_VERSION = version.parse(importlib.metadata.version("jax"))
            logger.info(f"JAX version {JAX_VERSION} available.")
        except importlib.metadata.PackageNotFoundError:
            # No distribution metadata: JAX_VERSION stays "N/A".
            pass
else:
    logger.info("Disabling JAX because USE_JAX is set to False")
130
+
131
+
132
# Optional tools for data loading
SQLALCHEMY_AVAILABLE = importlib.util.find_spec("sqlalchemy") is not None

# Optional tools for feature decoding
PIL_AVAILABLE = importlib.util.find_spec("PIL") is not None
IS_OPUS_SUPPORTED = True
IS_MP3_SUPPORTED = True
TORCHCODEC_AVAILABLE = importlib.util.find_spec("torchcodec") is not None
TORCHVISION_AVAILABLE = importlib.util.find_spec("torchvision") is not None
PDFPLUMBER_AVAILABLE = importlib.util.find_spec("pdfplumber") is not None

# Optional compression tools
RARFILE_AVAILABLE = importlib.util.find_spec("rarfile") is not None
ZSTANDARD_AVAILABLE = importlib.util.find_spec("zstandard") is not None
LZ4_AVAILABLE = importlib.util.find_spec("lz4") is not None
PY7ZR_AVAILABLE = importlib.util.find_spec("py7zr") is not None

# Cache location
# Each location has a computed default that an environment variable can override.
DEFAULT_XDG_CACHE_HOME = "~/.cache"
XDG_CACHE_HOME = os.getenv("XDG_CACHE_HOME", DEFAULT_XDG_CACHE_HOME)
DEFAULT_HF_CACHE_HOME = os.path.join(XDG_CACHE_HOME, "huggingface")
# `expanduser` resolves a leading "~" (e.g. the one in DEFAULT_XDG_CACHE_HOME).
HF_CACHE_HOME = os.path.expanduser(os.getenv("HF_HOME", DEFAULT_HF_CACHE_HOME))

DEFAULT_HF_DATASETS_CACHE = os.path.join(HF_CACHE_HOME, "datasets")
HF_DATASETS_CACHE = Path(os.getenv("HF_DATASETS_CACHE", DEFAULT_HF_DATASETS_CACHE))

DEFAULT_HF_MODULES_CACHE = os.path.join(HF_CACHE_HOME, "modules")
HF_MODULES_CACHE = Path(os.getenv("HF_MODULES_CACHE", DEFAULT_HF_MODULES_CACHE))

DOWNLOADED_DATASETS_DIR = "downloads"
DEFAULT_DOWNLOADED_DATASETS_PATH = os.path.join(HF_DATASETS_CACHE, DOWNLOADED_DATASETS_DIR)
DOWNLOADED_DATASETS_PATH = Path(os.getenv("HF_DATASETS_DOWNLOADED_DATASETS_PATH", DEFAULT_DOWNLOADED_DATASETS_PATH))

EXTRACTED_DATASETS_DIR = "extracted"
# NOTE(review): the default is built from DEFAULT_DOWNLOADED_DATASETS_PATH, not from
# DOWNLOADED_DATASETS_PATH, so overriding HF_DATASETS_DOWNLOADED_DATASETS_PATH alone does not move
# the extraction directory - confirm this is intended.
DEFAULT_EXTRACTED_DATASETS_PATH = os.path.join(DEFAULT_DOWNLOADED_DATASETS_PATH, EXTRACTED_DATASETS_DIR)
EXTRACTED_DATASETS_PATH = Path(os.getenv("HF_DATASETS_EXTRACTED_DATASETS_PATH", DEFAULT_EXTRACTED_DATASETS_PATH))

# Download count for the website
# Enabled unless the environment variable is set to something other than a true/auto value.
HF_UPDATE_DOWNLOAD_COUNTS = (
    os.environ.get("HF_UPDATE_DOWNLOAD_COUNTS", "AUTO").upper() in ENV_VARS_TRUE_AND_AUTO_VALUES
)

# For downloads and to check remote files metadata
HF_DATASETS_MULTITHREADING_MAX_WORKERS = 16

# Dataset viewer API
USE_PARQUET_EXPORT = True

# Batch size constants. For more info, see:
# https://github.com/apache/arrow/blob/master/docs/source/cpp/arrays.rst#size-limitations-and-recommendations)
DEFAULT_MAX_BATCH_SIZE = 1000

# 256 * 1024 and 1024 * 1024 are 256 KiB and 1 MiB respectively.
DEFAULT_CDC_OPTIONS = {"min_chunk_size": 256 * 1024, "max_chunk_size": 1024 * 1024, "norm_level": 0}

# Size of the preloaded record batch in `Dataset.__iter__`
ARROW_READER_BATCH_SIZE_IN_DATASET_ITER = 10

# Max uncompressed shard size in bytes (e.g. to shard parquet datasets in push_to_hub or download_and_prepare)
MAX_SHARD_SIZE = "500MB"

# Max uncompressed row group size in bytes (e.g. for parquet files in push_to_hub or download_and_prepare)
MAX_ROW_GROUP_SIZE = "100MB"

# Parquet configuration
PARQUET_ROW_GROUP_SIZE_FOR_AUDIO_DATASETS = None
PARQUET_ROW_GROUP_SIZE_FOR_IMAGE_DATASETS = None
PARQUET_ROW_GROUP_SIZE_FOR_BINARY_DATASETS = None
PARQUET_ROW_GROUP_SIZE_FOR_VIDEO_DATASETS = None

# Arrow configuration
ARROW_RECORD_BATCH_SIZE_FOR_AUDIO_DATASETS = 100
ARROW_RECORD_BATCH_SIZE_FOR_IMAGE_DATASETS = 100
ARROW_RECORD_BATCH_SIZE_FOR_BINARY_DATASETS = 100
ARROW_RECORD_BATCH_SIZE_FOR_VIDEO_DATASETS = 10

# Offline mode
# HF_DATASETS_OFFLINE, when set, takes precedence over the value coming from huggingface_hub.
_offline = os.environ.get("HF_DATASETS_OFFLINE")
HF_HUB_OFFLINE = constants.HF_HUB_OFFLINE if _offline is None else _offline.upper() in ENV_VARS_TRUE_VALUES
HF_DATASETS_OFFLINE = HF_HUB_OFFLINE  # kept for backward-compatibility

# Here, `True` will disable progress bars globally without possibility of enabling it
# programmatically. `False` will enable them without possibility of disabling them.
# If environment variable is not set (None), then the user is free to enable/disable
# them programmatically.
# TL;DR: env variable has priority over code
__HF_DATASETS_DISABLE_PROGRESS_BARS = os.environ.get("HF_DATASETS_DISABLE_PROGRESS_BARS")
HF_DATASETS_DISABLE_PROGRESS_BARS: Optional[bool] = (
    __HF_DATASETS_DISABLE_PROGRESS_BARS.upper() in ENV_VARS_TRUE_VALUES
    if __HF_DATASETS_DISABLE_PROGRESS_BARS is not None
    else None
)

# In-memory
DEFAULT_IN_MEMORY_MAX_SIZE = 0  # Disabled
# `float(...)` raises ValueError if the environment variable is set to a non-numeric value.
IN_MEMORY_MAX_SIZE = float(os.environ.get("HF_DATASETS_IN_MEMORY_MAX_SIZE", DEFAULT_IN_MEMORY_MAX_SIZE))

# File names
DATASET_ARROW_FILENAME = "dataset.arrow"
DATASET_INDICES_FILENAME = "indices.arrow"
DATASET_STATE_JSON_FILENAME = "state.json"
DATASET_INFO_FILENAME = "dataset_info.json"
DATASETDICT_INFOS_FILENAME = "dataset_infos.json"
LICENSE_FILENAME = "LICENSE"
DATASETDICT_JSON_FILENAME = "dataset_dict.json"
METADATA_CONFIGS_FIELD = "configs"
REPOCARD_FILENAME = "README.md"
REPOYAML_FILENAME = ".huggingface.yaml"

MODULE_NAME_FOR_DYNAMIC_MODULES = "datasets_modules"

MAX_DATASET_CONFIG_ID_READABLE_LENGTH = 255

# Temporary cache directory prefix
TEMP_CACHE_DIR_PREFIX = "hf_datasets-"

# Streaming
STREAMING_READ_MAX_RETRIES = 20
STREAMING_READ_RETRY_INTERVAL = 5
STREAMING_OPEN_MAX_RETRIES = 20
STREAMING_OPEN_RETRY_INTERVAL = 5

# Datasets repositories exploration
DATA_FILES_MAX_NUMBER_FOR_MODULE_INFERENCE = 200
GLOBBED_DATA_FILES_MAX_NUMBER_FOR_MODULE_INFERENCE = 10
ARCHIVED_DATA_FILES_MAX_NUMBER_FOR_MODULE_INFERENCE = 200

# Async map functions
MAX_NUM_RUNNING_ASYNC_MAP_FUNCTIONS_IN_PARALLEL = 1000

# Progress bars
PBAR_REFRESH_TIME_INTERVAL = 0.05  # 20 progress updates per sec

# Maximum number of uploaded files per commit
UPLOADS_MAX_NUMBER_PER_COMMIT = 50

# Backward compatibility
# 4 << 30 equals 4 * 2**30.
MAX_TABLE_NBYTES_FOR_PICKLING = 4 << 30
datasets/data_files.py ADDED
@@ -0,0 +1,807 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ from functools import partial
4
+ from glob import has_magic
5
+ from pathlib import Path, PurePath
6
+ from typing import Callable, Optional, Union
7
+
8
+ import huggingface_hub
9
+ from fsspec.core import url_to_fs
10
+ from huggingface_hub import HfFileSystem
11
+ from packaging import version
12
+ from tqdm.contrib.concurrent import thread_map
13
+
14
+ from . import config
15
+ from .download import DownloadConfig
16
+ from .naming import _split_re
17
+ from .splits import Split
18
+ from .utils import logging
19
+ from .utils import tqdm as hf_tqdm
20
+ from .utils.file_utils import _prepare_path_and_storage_options, is_local_path, is_relative_path, xbasename, xjoin
21
+ from .utils.py_utils import string_to_dict
22
+
23
+
24
# Type alias: a tuple holding two strings, one string, or nothing at all.
SingleOriginMetadata = Union[tuple[str, str], tuple[str], tuple[()]]


# String form of the `train` split; `sanitize_patterns` uses it as the key when no split name is given.
SANITIZED_DEFAULT_SPLIT = str(Split.TRAIN)


# Module-level logger, named after this module.
logger = logging.get_logger(__name__)
31
+
32
+
33
class Url(str):
    """Plain `str` subclass that adds no behaviour of its own (presumably a marker type for URL strings)."""
35
+
36
+
37
class EmptyDatasetError(FileNotFoundError):
    """`FileNotFoundError` subclass that adds no behaviour of its own; callers may catch either type."""
39
+
40
+
41
# Glob template for sharded files: "data/<split>-NNNNN-of-NNNNN<anything>.<anything>", where each N is a digit.
SPLIT_PATTERN_SHARDED = "data/{split}-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*"

# Keywords that identify each default split in a file or directory name.
SPLIT_KEYWORDS = {
    Split.TRAIN: ["train", "training"],
    Split.VALIDATION: ["validation", "valid", "dev", "val"],
    Split.TEST: ["test", "testing", "eval", "evaluation"],
}
# Characters substituted for `{sep}` inside the `[...]` character classes of the base patterns below.
NON_WORDS_CHARS = "-._ 0-9"
# The base glob patterns depend on the installed fsspec version
# (presumably because its glob semantics changed across these releases - TODO confirm).
if config.FSSPEC_VERSION < version.parse("2023.9.0"):
    KEYWORDS_IN_FILENAME_BASE_PATTERNS = ["**[{sep}/]{keyword}[{sep}]*", "{keyword}[{sep}]*"]
    KEYWORDS_IN_DIR_NAME_BASE_PATTERNS = [
        "{keyword}/**",
        "{keyword}[{sep}]*/**",
        "**[{sep}/]{keyword}/**",
        "**[{sep}/]{keyword}[{sep}]*/**",
    ]
elif config.FSSPEC_VERSION < version.parse("2023.12.0"):
    KEYWORDS_IN_FILENAME_BASE_PATTERNS = ["**/*[{sep}/]{keyword}[{sep}]*", "{keyword}[{sep}]*"]
    KEYWORDS_IN_DIR_NAME_BASE_PATTERNS = [
        "{keyword}/**/*",
        "{keyword}[{sep}]*/**/*",
        "**/*[{sep}/]{keyword}/**/*",
        "**/*[{sep}/]{keyword}[{sep}]*/**/*",
    ]
else:
    KEYWORDS_IN_FILENAME_BASE_PATTERNS = ["**/{keyword}[{sep}]*", "**/*[{sep}]{keyword}[{sep}]*"]
    KEYWORDS_IN_DIR_NAME_BASE_PATTERNS = [
        "**/{keyword}/**",
        "**/{keyword}[{sep}]*/**",
        "**/*[{sep}]{keyword}/**",
        "**/*[{sep}]{keyword}[{sep}]*/**",
    ]

DEFAULT_SPLITS = [Split.TRAIN, Split.VALIDATION, Split.TEST]
# For each default split: every filename base pattern, instantiated with each keyword of that split.
DEFAULT_PATTERNS_SPLIT_IN_FILENAME = {
    split: [
        pattern.format(keyword=keyword, sep=NON_WORDS_CHARS)
        for keyword in SPLIT_KEYWORDS[split]
        for pattern in KEYWORDS_IN_FILENAME_BASE_PATTERNS
    ]
    for split in DEFAULT_SPLITS
}
# Same construction, using the directory-name base patterns.
DEFAULT_PATTERNS_SPLIT_IN_DIR_NAME = {
    split: [
        pattern.format(keyword=keyword, sep=NON_WORDS_CHARS)
        for keyword in SPLIT_KEYWORDS[split]
        for pattern in KEYWORDS_IN_DIR_NAME_BASE_PATTERNS
    ]
    for split in DEFAULT_SPLITS
}


# Catch-all: the "**" pattern, assigned to the train split.
DEFAULT_PATTERNS_ALL = {
    Split.TRAIN: ["**"],
}

ALL_SPLIT_PATTERNS = [SPLIT_PATTERN_SHARDED]
ALL_DEFAULT_PATTERNS = [
    DEFAULT_PATTERNS_SPLIT_IN_DIR_NAME,
    DEFAULT_PATTERNS_SPLIT_IN_FILENAME,
    DEFAULT_PATTERNS_ALL,
]
# Characters whose presence makes `contains_wildcards` report a pattern as containing wildcards.
WILDCARD_CHARACTERS = "*[]"
FILES_TO_IGNORE = [
    "README.md",
    "config.json",
    "dataset_info.json",
    "dataset_infos.json",
    "dummy_data.zip",
    "dataset_dict.json",
]
112
+
113
+
114
def contains_wildcards(pattern: str) -> bool:
    """Return `True` if `pattern` contains at least one of the characters in `WILDCARD_CHARACTERS`."""
    for wildcard_character in WILDCARD_CHARACTERS:
        if wildcard_character in pattern:
            return True
    return False
116
+
117
+
118
def sanitize_patterns(patterns: Union[dict, list, str]) -> dict[str, Union[list[str], "DataFilesList"]]:
    """
    Normalize the user-provided `data_files` patterns into a dictionary.

    Accepted inputs:

    - a `dict` mapping split names to a pattern or a list of patterns,
    - a single `str` pattern, assigned to the default split ("train"),
    - a `list` of patterns, assigned to the default split ("train"),
    - a `list` of `{"split": ..., "path": ...}` dictionaries, where `path` is a string or a list of strings,
    - any other iterable, which is converted to a list and processed again.

    Returns:
        patterns: dictionary of split_name -> list of patterns (paths or urls)

    Raises:
        ValueError: If a list contains a dictionary but some element is not a two-key dictionary with a
            "split" key and a string or list "path", or if the same split appears more than once.
    """
    if isinstance(patterns, dict):
        sanitized = {}
        for key, value in patterns.items():
            # Values that are already lists are kept as they are; anything else is wrapped.
            sanitized[str(key)] = value if isinstance(value, list) else [value]
        return sanitized

    if isinstance(patterns, str):
        return {SANITIZED_DEFAULT_SPLIT: [patterns]}

    if not isinstance(patterns, list):
        return sanitize_patterns(list(patterns))

    if not any(isinstance(pattern, dict) for pattern in patterns):
        return {SANITIZED_DEFAULT_SPLIT: patterns}

    # As soon as one element is a dictionary, every element must be a well-formed split entry.
    for pattern in patterns:
        is_split_entry = (
            isinstance(pattern, dict)
            and len(pattern) == 2
            and "split" in pattern
            and isinstance(pattern.get("path"), (str, list))
        )
        if not is_split_entry:
            raise ValueError(
                f"Expected each split to have a 'path' key which can be a string or a list of strings, but got {pattern}"
            )

    splits = [pattern["split"] for pattern in patterns]
    if len(set(splits)) != len(splits):
        raise ValueError(f"Some splits are duplicated in data_files: {splits}")

    sanitized = {}
    for pattern in patterns:
        path = pattern["path"]
        sanitized[str(pattern["split"])] = path if isinstance(path, list) else [path]
    return sanitized
154
+
155
+
156
+ def _is_inside_unrequested_special_dir(matched_rel_path: str, pattern: str) -> bool:
157
+ """
158
+ When a path matches a pattern, we additionally check if it's inside a special directory
159
+ we ignore by default (if it starts with a double underscore).
160
+
161
+ Users can still explicitly request a filepath inside such a directory if "__pycache__" is
162
+ mentioned explicitly in the requested pattern.
163
+
164
+ Some examples:
165
+
166
+ base directory:
167
+
168
+ ./
169
+ └── __pycache__
170
+ └── b.txt
171
+
172
+ >>> _is_inside_unrequested_special_dir("__pycache__/b.txt", "**")
173
+ True
174
+ >>> _is_inside_unrequested_special_dir("__pycache__/b.txt", "*/b.txt")
175
+ True
176
+ >>> _is_inside_unrequested_special_dir("__pycache__/b.txt", "__pycache__/*")
177
+ False
178
+ >>> _is_inside_unrequested_special_dir("__pycache__/b.txt", "__*/*")
179
+ False
180
+ """
181
+ # We just need to check if every special directories from the path is present explicitly in the pattern.
182
+ # Since we assume that the path matches the pattern, it's equivalent to counting that both
183
+ # the parent path and the parent pattern have the same number of special directories.
184
+ data_dirs_to_ignore_in_path = [part for part in PurePath(matched_rel_path).parent.parts if part.startswith("__")]
185
+ data_dirs_to_ignore_in_pattern = [part for part in PurePath(pattern).parent.parts if part.startswith("__")]
186
+ return len(data_dirs_to_ignore_in_path) != len(data_dirs_to_ignore_in_pattern)
187
+
188
+
189
+ def _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(matched_rel_path: str, pattern: str) -> bool:
190
+ """
191
+ When a path matches a pattern, we additionally check if it's a hidden file or if it's inside
192
+ a hidden directory we ignore by default, i.e. if the file name or a parent directory name starts with a dot.
193
+
194
+ Users can still explicitly request a filepath that is hidden or is inside a hidden directory
195
+ if the hidden part is mentioned explicitly in the requested pattern.
196
+
197
+ Some examples:
198
+
199
+ base directory:
200
+
201
+ ./
202
+ └── .hidden_file.txt
203
+
204
+ >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_file.txt", "**")
205
+ True
206
+ >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_file.txt", ".*")
207
+ False
208
+
209
+ base directory:
210
+
211
+ ./
212
+ └── .hidden_dir
213
+ └── a.txt
214
+
215
+ >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/a.txt", "**")
216
+ True
217
+ >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/a.txt", ".*/*")
218
+ False
219
+ >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/a.txt", ".hidden_dir/*")
220
+ False
221
+
222
+ base directory:
223
+
224
+ ./
225
+ └── .hidden_dir
226
+ └── .hidden_file.txt
227
+
228
+ >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/.hidden_file.txt", "**")
229
+ True
230
+ >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/.hidden_file.txt", ".*/*")
231
+ True
232
+ >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/.hidden_file.txt", ".*/.*")
233
+ False
234
+ >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/.hidden_file.txt", ".hidden_dir/*")
235
+ True
236
+ >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/.hidden_file.txt", ".hidden_dir/.*")
237
+ False
238
+ """
239
+ # We just need to check if every hidden part from the path is present explicitly in the pattern.
240
+ # Since we assume that the path matches the pattern, it's equivalent to counting that both
241
+ # the path and the pattern have the same number of hidden parts.
242
+ hidden_directories_in_path = [
243
+ part for part in PurePath(matched_rel_path).parts if part.startswith(".") and not set(part) == {"."}
244
+ ]
245
+ hidden_directories_in_pattern = [
246
+ part for part in PurePath(pattern).parts if part.startswith(".") and not set(part) == {"."}
247
+ ]
248
+ return len(hidden_directories_in_path) != len(hidden_directories_in_pattern)
249
+
250
+
251
def _get_data_files_patterns(pattern_resolver: Callable[[str], list[str]]) -> dict[str, list[str]]:
    """
    Get the default pattern from a directory or repository by testing all the supported patterns.
    The first patterns to return a non-empty list of data files is returned.

    In order, it first tests the split patterns in ALL_SPLIT_PATTERNS, otherwise it tests the patterns
    in ALL_DEFAULT_PATTERNS.

    Args:
        pattern_resolver (Callable[[str], list[str]]): Function that resolves one pattern to the list of
            matching data files. It may raise FileNotFoundError when nothing matches; such patterns are skipped.

    Returns:
        dict[str, list[str]]: Mapping from split name to the list of patterns to use for that split.

    Raises:
        ValueError: If a split name extracted from a matched file name does not match `_split_re`.
        FileNotFoundError: If none of the supported patterns resolves to at least one data file.
    """
    # Last pattern that was tried; only used to build the final error message.
    # Initialized so that the final `raise` never hits an unbound local if both pattern tables are empty.
    pattern = None
    # first check the split patterns like data/{split}-00000-of-00001.parquet
    for split_pattern in ALL_SPLIT_PATTERNS:
        pattern = split_pattern.replace("{split}", "*")
        try:
            data_files = pattern_resolver(pattern)
        except FileNotFoundError:
            continue
        if len(data_files) > 0:
            splits: set[str] = set()
            for p in data_files:
                # Recover the value of "{split}" by matching the file name against the pattern's file name
                p_parts = string_to_dict(xbasename(p), xbasename(split_pattern))
                assert p_parts is not None
                splits.add(p_parts["split"])

            if any(not re.match(_split_re, split) for split in splits):
                raise ValueError(f"Split name should match '{_split_re}' but got '{splits}'.")
            # Default splits come first (in their canonical order), then the other splits alphabetically
            sorted_splits = [str(split) for split in DEFAULT_SPLITS if split in splits] + sorted(
                splits - {str(split) for split in DEFAULT_SPLITS}
            )
            return {split: [split_pattern.format(split=split)] for split in sorted_splits}
    # then check the default patterns based on train/valid/test splits
    for patterns_dict in ALL_DEFAULT_PATTERNS:
        non_empty_splits = []
        for split, patterns in patterns_dict.items():
            for pattern in patterns:
                try:
                    data_files = pattern_resolver(pattern)
                except FileNotFoundError:
                    continue
                if len(data_files) > 0:
                    # One matching pattern is enough to keep the split
                    non_empty_splits.append(split)
                    break
        if non_empty_splits:
            return {split: patterns_dict[split] for split in non_empty_splits}
    raise FileNotFoundError(f"Couldn't resolve pattern {pattern} with resolver {pattern_resolver}")
293
+
294
+
295
def resolve_pattern(
    pattern: str,
    base_path: str,
    allowed_extensions: Optional[list[str]] = None,
    download_config: Optional[DownloadConfig] = None,
) -> list[str]:
    """
    Resolve the paths and URLs of the data files from the pattern passed by the user.

    You can use patterns to resolve multiple local files. Here are a few examples:
    - *.csv to match all the CSV files at the first level
    - **.csv to match all the CSV files at any level
    - data/* to match all the files inside "data"
    - data/** to match all the files inside "data" and its subdirectories

    The patterns are resolved using the fsspec glob. In fsspec>=2023.12.0 this is equivalent to
    Python's glob.glob, Path.glob, Path.match and fnmatch where ** is unsupported with a prefix/suffix
    other than a forward slash /.

    More generally:
    - '*' matches any character except a forward-slash (to match just the file or directory name)
    - '**' matches any character including a forward-slash /

    Hidden files and directories (i.e. whose names start with a dot) are ignored, unless they are explicitly requested.
    The same applies to special directories that start with a double underscore like "__pycache__".
    You can still include one if the pattern explicitly mentions it:
    - to include a hidden file: "*/.hidden.txt" or "*/.*"
    - to include a hidden directory: ".hidden/*" or ".*/*"
    - to include a special directory: "__special__/*" or "__*/*"

    Example::

        >>> from datasets.data_files import resolve_pattern
        >>> base_path = "."
        >>> resolve_pattern("docs/**/*.py", base_path)
        ['/Users/mariosasko/Desktop/projects/datasets/docs/source/_config.py']

    Args:
        pattern (str): Unix pattern or paths or URLs of the data files to resolve.
            The paths can be absolute or relative to base_path.
            Remote filesystems using fsspec are supported, e.g. with the hf:// protocol.
        base_path (str): Base path to use when resolving relative paths.
        allowed_extensions (Optional[list], optional): White-list of file extensions to use. Defaults to None (all extensions).
            For example: allowed_extensions=[".csv", ".json", ".txt", ".parquet"]
        download_config ([`DownloadConfig`], *optional*): Specific download configuration parameters.

    Returns:
        List[str]: List of paths or URLs to the local or remote files that match the patterns.

    Raises:
        FileNotFoundError: If no file matches the pattern (with one of the allowed extensions, if given).
    """
    if is_relative_path(pattern):
        pattern = xjoin(base_path, pattern)
    elif is_local_path(pattern):
        base_path = os.path.splitdrive(pattern)[0] + os.sep
    else:
        base_path = ""
    pattern, storage_options = _prepare_path_and_storage_options(pattern, download_config=download_config)
    fs, fs_pattern = url_to_fs(pattern, **storage_options)
    # A file from FILES_TO_IGNORE is kept when the pattern names it explicitly
    ignored_names = set(FILES_TO_IGNORE) - {xbasename(pattern)}
    protocol = fs.protocol if isinstance(fs.protocol, str) else fs.protocol[0]
    protocol_prefix = "" if protocol == "file" else protocol + "://"
    # 10 times faster glob with detail=True (ignores costly info like lastCommit)
    glob_kwargs = {"expand_info": False} if protocol == "hf" else {}

    # ignore .ipynb and __pycache__, but keep /../
    matched_paths = []
    for filepath, info in fs.glob(pattern, detail=True, **glob_kwargs).items():
        # Keep regular files, and links that point to an existing regular file
        is_file = info["type"] == "file" or (info.get("islink") and os.path.isfile(os.path.realpath(filepath)))
        if not is_file:
            continue
        if xbasename(filepath) in ignored_names:
            continue
        if _is_inside_unrequested_special_dir(filepath, fs_pattern):
            continue
        if _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(filepath, fs_pattern):
            continue
        # Make sure non-local paths carry their protocol prefix
        matched_paths.append(filepath if filepath.startswith(protocol_prefix) else protocol_prefix + filepath)

    if allowed_extensions is None:
        out = matched_paths
    else:
        # A file is kept if any of its dot-separated suffixes is allowed (e.g. "a.csv.gz" matches ".csv")
        out = [
            matched_path
            for matched_path in matched_paths
            if any("." + suffix in allowed_extensions for suffix in xbasename(matched_path).split(".")[1:])
        ]
        if len(out) < len(matched_paths):
            invalid_matched_files = list(set(matched_paths) - set(out))
            logger.info(
                f"Some files matched the pattern '{pattern}' but don't have valid data file extensions: {invalid_matched_files}"
            )
    if not out:
        error_msg = f"Unable to find '{pattern}'"
        if allowed_extensions is not None:
            error_msg += f" with any supported extension {list(allowed_extensions)}"
        raise FileNotFoundError(error_msg)
    return out
385
+
386
+
387
def get_data_patterns(base_path: str, download_config: Optional[DownloadConfig] = None) -> dict[str, list[str]]:
    """
    Get the default pattern from a directory testing all the supported patterns.
    The first patterns to return a non-empty list of data files is returned.

    Patterns are resolved relative to `base_path` with `resolve_pattern`, and the search order is the one
    implemented by `_get_data_files_patterns`.

    Some examples of supported patterns:

    Input:

        my_dataset_repository/
        ├── README.md
        └── dataset.csv

    Output:

        {'train': ['**']}

    Input:

        my_dataset_repository/
        ├── README.md
        ├── train.csv
        └── test.csv

        my_dataset_repository/
        ├── README.md
        └── data/
            ├── train.csv
            └── test.csv

        my_dataset_repository/
        ├── README.md
        ├── train_0.csv
        ├── train_1.csv
        ├── train_2.csv
        ├── train_3.csv
        ├── test_0.csv
        └── test_1.csv

    Output:

        {'train': ['**/train[-._ 0-9]*', '**/*[-._ 0-9]train[-._ 0-9]*', '**/training[-._ 0-9]*', '**/*[-._ 0-9]training[-._ 0-9]*'],
         'test': ['**/test[-._ 0-9]*', '**/*[-._ 0-9]test[-._ 0-9]*', '**/testing[-._ 0-9]*', '**/*[-._ 0-9]testing[-._ 0-9]*', ...]}

    Input:

        my_dataset_repository/
        ├── README.md
        └── data/
            ├── train/
            │   ├── shard_0.csv
            │   ├── shard_1.csv
            │   ├── shard_2.csv
            │   └── shard_3.csv
            └── test/
                ├── shard_0.csv
                └── shard_1.csv

    Output:

        {'train': ['**/train/**', '**/train[-._ 0-9]*/**', '**/*[-._ 0-9]train/**', '**/*[-._ 0-9]train[-._ 0-9]*/**', ...],
         'test': ['**/test/**', '**/test[-._ 0-9]*/**', '**/*[-._ 0-9]test/**', '**/*[-._ 0-9]test[-._ 0-9]*/**', ...]}

    Input:

        my_dataset_repository/
        ├── README.md
        └── data/
            ├── train-00000-of-00003.csv
            ├── train-00001-of-00003.csv
            ├── train-00002-of-00003.csv
            ├── test-00000-of-00001.csv
            ├── random-00000-of-00003.csv
            ├── random-00001-of-00003.csv
            └── random-00002-of-00003.csv

    Output:

        {'train': ['data/train-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*'],
         'test': ['data/test-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*'],
         'random': ['data/random-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*']}

    Args:
        base_path (str): Directory (or repository path) in which the patterns are tested.
        download_config ([`DownloadConfig`], *optional*): Specific download configuration parameters.

    Returns:
        dict[str, list[str]]: Mapping from split name to the patterns to use for that split.

    Raises:
        EmptyDatasetError: If none of the supported patterns matches a data file.
    """
    pattern_resolver = partial(resolve_pattern, base_path=base_path, download_config=download_config)
    try:
        data_patterns = _get_data_files_patterns(pattern_resolver)
    except FileNotFoundError:
        # Hide the low-level "couldn't resolve pattern" error behind a dataset-level one
        raise EmptyDatasetError(f"The directory at {base_path} doesn't contain any data files") from None
    return data_patterns
476
+
477
+
478
def _get_single_origin_metadata(
    data_file: str,
    download_config: Optional[DownloadConfig] = None,
) -> SingleOriginMetadata:
    """
    Get the origin metadata of a single data file.

    Args:
        data_file (str): Path or URL of the data file.
        download_config ([`DownloadConfig`], *optional*): Specific download configuration parameters.
            May be None, in which case no explicit token is passed to the Hugging Face filesystem.

    Returns:
        SingleOriginMetadata:
            - `(repo_id, revision)` for files resolved through `HfFileSystem` or located under `config.HF_ENDPOINT`,
            - a 1-tuple with the stringified "ETag", "etag" or "mtime" entry of `fs.info(data_file)` otherwise,
            - an empty tuple if none of these entries is available.
    """
    data_file, storage_options = _prepare_path_and_storage_options(data_file, download_config=download_config)
    fs, *_ = url_to_fs(data_file, **storage_options)
    if isinstance(fs, HfFileSystem):
        resolved_path = fs.resolve_path(data_file)
        return resolved_path.repo_id, resolved_path.revision
    elif data_file.startswith(config.HF_ENDPOINT):
        # `download_config` is optional: don't crash on `None.token`, just pass no explicit token.
        token = download_config.token if download_config is not None else None
        hffs = HfFileSystem(endpoint=config.HF_ENDPOINT, token=token)
        # Turn "<endpoint>/<repo>/resolve/<revision>/<path>" into "hf://<repo>@<revision>/<path>"
        data_file = "hf://" + data_file[len(config.HF_ENDPOINT) + 1 :].replace("/resolve/", "@", 1)
        resolved_path = hffs.resolve_path(data_file)
        return resolved_path.repo_id, resolved_path.revision
    info = fs.info(data_file)
    # s3fs uses "ETag", gcsfs uses "etag", and for local we simply check mtime
    for key in ["ETag", "etag", "mtime"]:
        if key in info:
            return (str(info[key]),)
    return ()
498
+
499
+
500
def _get_origin_metadata(
    data_files: list[str],
    download_config: Optional[DownloadConfig] = None,
    max_workers: Optional[int] = None,
) -> list[SingleOriginMetadata]:
    """
    Get the origin metadata of every data file, in the same order as `data_files`.

    Args:
        data_files (list[str]): Paths or URLs of the data files.
        download_config ([`DownloadConfig`], *optional*): Specific download configuration parameters.
        max_workers (int, *optional*): Number of threads to use.
            Defaults to `config.HF_DATASETS_MULTITHREADING_MAX_WORKERS`.

    Returns:
        list[SingleOriginMetadata]: One origin metadata tuple per data file.
    """
    if max_workers is None:
        max_workers = config.HF_DATASETS_MULTITHREADING_MAX_WORKERS
    get_metadata = partial(_get_single_origin_metadata, download_config=download_config)
    # The progress bar is hidden for 16 files or fewer.
    # Otherwise use `disable=None` rather than `disable=False` to disable the progress bar when no TTY is attached.
    disable_progress_bar = len(data_files) <= 16 or None
    if all("hf://" in data_file for data_file in data_files):
        # No need for multithreading here since the origin metadata of HF files
        # is (repo_id, revision) and is cached after first .info() call.
        progress = hf_tqdm(data_files, desc="Resolving data files", disable=disable_progress_bar)
        return [get_metadata(data_file) for data_file in progress]
    return thread_map(
        get_metadata,
        data_files,
        max_workers=max_workers,
        tqdm_class=hf_tqdm,
        desc="Resolving data files",
        disable=disable_progress_bar,
    )
527
+
528
+
529
class DataFilesList(list[str]):
    """
    List of data files (absolute local paths or URLs).

    Construction methods given the user's data files patterns:
    - ``from_hf_repo``: resolve patterns inside a dataset repository
    - ``from_local_or_remote``: resolve patterns from a local path
    - ``from_patterns``: resolve patterns relative to an arbitrary base path (used by the two above)

    Moreover, DataFilesList has an additional attribute ``origin_metadata``.
    It can store:
    - the last modified time of local files
    - ETag of remote files
    - commit sha of a dataset repository

    Thanks to this additional attribute, it is possible to hash the list
    and get a different hash if and only if at least one file changed.
    This is useful for caching Dataset objects that are obtained from a list of data files.
    """

    def __init__(self, data_files: list[str], origin_metadata: list[SingleOriginMetadata]) -> None:
        super().__init__(data_files)
        self.origin_metadata = origin_metadata

    def __add__(self, other: "DataFilesList") -> "DataFilesList":
        """Concatenate two lists of data files together with their origin metadata."""
        combined_files = [*self, *other]
        combined_metadata = self.origin_metadata + other.origin_metadata
        return DataFilesList(combined_files, combined_metadata)

    @classmethod
    def from_hf_repo(
        cls,
        patterns: list[str],
        dataset_info: huggingface_hub.hf_api.DatasetInfo,
        base_path: Optional[str] = None,
        allowed_extensions: Optional[list[str]] = None,
        download_config: Optional[DownloadConfig] = None,
    ) -> "DataFilesList":
        """Resolve `patterns` inside the dataset repository described by `dataset_info`, pinned at its sha."""
        repo_base_path = f"hf://datasets/{dataset_info.id}@{dataset_info.sha}/{base_path or ''}".rstrip("/")
        return cls.from_patterns(
            patterns, base_path=repo_base_path, allowed_extensions=allowed_extensions, download_config=download_config
        )

    @classmethod
    def from_local_or_remote(
        cls,
        patterns: list[str],
        base_path: Optional[str] = None,
        allowed_extensions: Optional[list[str]] = None,
        download_config: Optional[DownloadConfig] = None,
    ) -> "DataFilesList":
        """Resolve `patterns` relative to `base_path` (the current working directory by default)."""
        if base_path is None:
            base_path = Path().resolve().as_posix()
        return cls.from_patterns(
            patterns, base_path=base_path, allowed_extensions=allowed_extensions, download_config=download_config
        )

    @classmethod
    def from_patterns(
        cls,
        patterns: list[str],
        base_path: Optional[str] = None,
        allowed_extensions: Optional[list[str]] = None,
        download_config: Optional[DownloadConfig] = None,
    ) -> "DataFilesList":
        """
        Resolve every pattern with `resolve_pattern` and collect the origin metadata of the matched files.

        A pattern that matches nothing is skipped when it contains glob magic; a plain path that cannot be
        found re-raises the FileNotFoundError.
        """
        if base_path is None:
            base_path = Path().resolve().as_posix()
        resolved_files: list[str] = []
        for pattern in patterns:
            try:
                matches = resolve_pattern(
                    pattern,
                    base_path=base_path,
                    allowed_extensions=allowed_extensions,
                    download_config=download_config,
                )
            except FileNotFoundError:
                if has_magic(pattern):
                    continue
                raise
            resolved_files.extend(matches)
        origin_metadata = _get_origin_metadata(resolved_files, download_config=download_config)
        return cls(resolved_files, origin_metadata)

    def filter(
        self, *, extensions: Optional[list[str]] = None, file_names: Optional[list[str]] = None
    ) -> "DataFilesList":
        """
        Return a new list with the data files that match one of `extensions` or one of `file_names`.

        Without any criterion, a copy of the list is returned.
        NOTE(review): `origin_metadata` is passed through unfiltered, so it may be longer than the
        returned list; confirm that this is intended.
        """
        matchers = []
        if extensions:
            ext_pattern = "|".join(re.escape(ext) for ext in extensions)
            # The extension may be followed by further suffixes (e.g. ".csv.gz")
            matchers.append(re.compile(f".*({ext_pattern})(\\..+)?$"))
        if file_names:
            fn_pattern = "|".join(re.escape(fn) for fn in file_names)
            matchers.append(re.compile(rf".*[\/]?({fn_pattern})$"))
        if not matchers:
            return DataFilesList(list(self), origin_metadata=self.origin_metadata)
        kept_files = [data_file for data_file in self if any(matcher.match(data_file) for matcher in matchers)]
        return DataFilesList(kept_files, origin_metadata=self.origin_metadata)
624
+
625
+
626
class DataFilesDict(dict[str, DataFilesList]):
    """
    Dict of split_name -> list of data files (absolute local paths or URLs).

    Construction methods given the user's data files patterns:
    - ``from_hf_repo``: resolve patterns inside a dataset repository
    - ``from_local_or_remote``: resolve patterns from a local path
    - ``from_patterns``: resolve patterns relative to an arbitrary base path

    In all of them, values that are already a [`DataFilesList`] are stored as they are.

    Moreover, each list is a DataFilesList. It is possible to hash the dictionary
    and get a different hash if and only if at least one file changed.
    For more info, see [`DataFilesList`].

    This is useful for caching Dataset objects that are obtained from a list of data files.

    Changing the order of the keys of this dictionary also doesn't change its hash.
    """

    @classmethod
    def from_local_or_remote(
        cls,
        patterns: dict[str, Union[list[str], DataFilesList]],
        base_path: Optional[str] = None,
        allowed_extensions: Optional[list[str]] = None,
        download_config: Optional[DownloadConfig] = None,
    ) -> "DataFilesDict":
        """Resolve the patterns of each split with `DataFilesList.from_local_or_remote`."""
        resolved = cls()
        for split_name, split_patterns in patterns.items():
            if isinstance(split_patterns, DataFilesList):
                resolved[split_name] = split_patterns
            else:
                resolved[split_name] = DataFilesList.from_local_or_remote(
                    split_patterns,
                    base_path=base_path,
                    allowed_extensions=allowed_extensions,
                    download_config=download_config,
                )
        return resolved

    @classmethod
    def from_hf_repo(
        cls,
        patterns: dict[str, Union[list[str], DataFilesList]],
        dataset_info: huggingface_hub.hf_api.DatasetInfo,
        base_path: Optional[str] = None,
        allowed_extensions: Optional[list[str]] = None,
        download_config: Optional[DownloadConfig] = None,
    ) -> "DataFilesDict":
        """Resolve the patterns of each split with `DataFilesList.from_hf_repo`."""
        resolved = cls()
        for split_name, split_patterns in patterns.items():
            if isinstance(split_patterns, DataFilesList):
                resolved[split_name] = split_patterns
            else:
                resolved[split_name] = DataFilesList.from_hf_repo(
                    split_patterns,
                    dataset_info=dataset_info,
                    base_path=base_path,
                    allowed_extensions=allowed_extensions,
                    download_config=download_config,
                )
        return resolved

    @classmethod
    def from_patterns(
        cls,
        patterns: dict[str, Union[list[str], DataFilesList]],
        base_path: Optional[str] = None,
        allowed_extensions: Optional[list[str]] = None,
        download_config: Optional[DownloadConfig] = None,
    ) -> "DataFilesDict":
        """Resolve the patterns of each split with `DataFilesList.from_patterns`."""
        resolved = cls()
        for split_name, split_patterns in patterns.items():
            if isinstance(split_patterns, DataFilesList):
                resolved[split_name] = split_patterns
            else:
                resolved[split_name] = DataFilesList.from_patterns(
                    split_patterns,
                    base_path=base_path,
                    allowed_extensions=allowed_extensions,
                    download_config=download_config,
                )
        return resolved

    def filter(
        self, *, extensions: Optional[list[str]] = None, file_names: Optional[list[str]] = None
    ) -> "DataFilesDict":
        """Return a new dict of the same type where every split's list has been filtered with `DataFilesList.filter`."""
        filtered = type(self)()
        for split_name, split_files in self.items():
            filtered[split_name] = split_files.filter(extensions=extensions, file_names=file_names)
        return filtered
717
+
718
+
719
class DataFilesPatternsList(list[str]):
    """
    List of data files patterns (absolute local paths or URLs).
    For each pattern there should also be a list of allowed extensions
    to keep, or a None to keep all the files for the pattern.
    """

    def __init__(
        self,
        patterns: list[str],
        allowed_extensions: list[Optional[list[str]]],
    ):
        super().__init__(patterns)
        # One entry per pattern, aligned by position with the patterns themselves
        self.allowed_extensions = allowed_extensions

    def __add__(self, other: "DataFilesPatternsList") -> "DataFilesPatternsList":
        """Concatenate two lists of patterns together with their per-pattern allowed extensions."""
        # Must build a DataFilesPatternsList: a DataFilesList would store the allowed
        # extensions as `origin_metadata` and lose `allowed_extensions`/`resolve`.
        return DataFilesPatternsList([*self, *other], self.allowed_extensions + other.allowed_extensions)

    @classmethod
    def from_patterns(
        cls, patterns: list[str], allowed_extensions: Optional[list[str]] = None
    ) -> "DataFilesPatternsList":
        """Build a list where every pattern shares the same `allowed_extensions` (None keeps all files)."""
        return cls(patterns, [allowed_extensions] * len(patterns))

    def resolve(
        self,
        base_path: str,
        download_config: Optional[DownloadConfig] = None,
    ) -> "DataFilesList":
        """
        Resolve every pattern (with its own allowed extensions) into a [`DataFilesList`].

        A pattern that matches nothing is skipped when it contains glob magic; a plain path that cannot be
        found re-raises the FileNotFoundError.
        """
        base_path = base_path if base_path is not None else Path().resolve().as_posix()
        data_files = []
        for pattern, allowed_extensions in zip(self, self.allowed_extensions):
            try:
                data_files.extend(
                    resolve_pattern(
                        pattern,
                        base_path=base_path,
                        allowed_extensions=allowed_extensions,
                        download_config=download_config,
                    )
                )
            except FileNotFoundError:
                if not has_magic(pattern):
                    raise
        origin_metadata = _get_origin_metadata(data_files, download_config=download_config)
        return DataFilesList(data_files, origin_metadata)

    def filter_extensions(self, extensions: list[str]) -> "DataFilesPatternsList":
        """
        Return a new list with the same patterns where `extensions` is appended to each pattern's allowed extensions.

        NOTE(review): this raises a TypeError for patterns whose allowed extensions are None; confirm whether
        None ("keep all files") is expected to reach this method.
        """
        return DataFilesPatternsList(
            self, [allowed_extensions + extensions for allowed_extensions in self.allowed_extensions]
        )
770
+
771
+
772
class DataFilesPatternsDict(dict[str, DataFilesPatternsList]):
    """
    Dict of split_name -> list of data files patterns (absolute local paths or URLs).
    """

    @classmethod
    def from_patterns(
        cls, patterns: dict[str, list[str]], allowed_extensions: Optional[list[str]] = None
    ) -> "DataFilesPatternsDict":
        """
        Build the dict from plain lists of patterns, all sharing the same `allowed_extensions`.

        Values that are already a [`DataFilesPatternsList`] are stored as they are.
        """
        patterns_dict = cls()
        for split_name, split_patterns in patterns.items():
            if isinstance(split_patterns, DataFilesPatternsList):
                patterns_dict[split_name] = split_patterns
            else:
                patterns_dict[split_name] = DataFilesPatternsList.from_patterns(
                    split_patterns,
                    allowed_extensions=allowed_extensions,
                )
        return patterns_dict

    def resolve(
        self,
        base_path: str,
        download_config: Optional[DownloadConfig] = None,
    ) -> "DataFilesDict":
        """Resolve the patterns of every split into a [`DataFilesDict`]."""
        resolved = DataFilesDict()
        for split_name, split_patterns in self.items():
            resolved[split_name] = split_patterns.resolve(base_path, download_config)
        return resolved

    def filter_extensions(self, extensions: list[str]) -> "DataFilesPatternsDict":
        """Return a new dict of the same type with `filter_extensions(extensions)` applied to every split."""
        filtered = type(self)()
        for split_name, split_patterns in self.items():
            filtered[split_name] = split_patterns.filter_extensions(extensions)
        return filtered
datasets/dataset_dict.py ADDED
The diff for this file is too large to render. See raw diff
 
datasets/distributed.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import TypeVar
2
+
3
+ from .arrow_dataset import Dataset, _split_by_node_map_style_dataset
4
+ from .iterable_dataset import IterableDataset, _split_by_node_iterable_dataset
5
+
6
+
7
+ DatasetType = TypeVar("DatasetType", Dataset, IterableDataset)
8
+
9
+
10
def split_dataset_by_node(dataset: DatasetType, rank: int, world_size: int) -> DatasetType:
    """
    Split a dataset for the node at rank `rank` in a pool of nodes of size `world_size`.

    For map-style datasets:

    Each node is assigned a chunk of data, e.g. rank 0 is given the first chunk of the dataset.
    To maximize data loading throughput, chunks are made of contiguous data on disk if possible.

    For iterable datasets:

    If the dataset has a number of shards that is a multiple of `world_size` (i.e. if `dataset.num_shards % world_size == 0`),
    then the shards are evenly assigned across the nodes, which is the most optimized.
    Otherwise, each node keeps 1 example out of `world_size`, skipping the other examples.

    Args:
        dataset ([`Dataset`] or [`IterableDataset`]):
            The dataset to split by node.
        rank (`int`):
            Rank of the current node.
        world_size (`int`):
            Total number of nodes.

    Returns:
        [`Dataset`] or [`IterableDataset`]: The dataset to be used on the node at rank `rank`.
    """
    # Anything that is not a map-style `Dataset` is handled by the iterable implementation
    split_by_node = (
        _split_by_node_map_style_dataset if isinstance(dataset, Dataset) else _split_by_node_iterable_dataset
    )
    return split_by_node(dataset, rank=rank, world_size=world_size)
datasets/exceptions.py ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ # Copyright 2023 The HuggingFace Authors.
3
+ from typing import Any, Optional, Union
4
+
5
+ from huggingface_hub import HfFileSystem
6
+
7
+ from . import config
8
+ from .table import CastError
9
+ from .utils.track import TrackedIterableFromGenerator, tracked_list, tracked_str
10
+
11
+
12
class DatasetsError(Exception):
    """Root of the exception hierarchy of this library."""


class DefunctDatasetError(DatasetsError):
    """The dataset is defunct."""


class FileNotFoundDatasetsError(DatasetsError, FileNotFoundError):
    """FileNotFoundError raised by this library (can be caught as either `DatasetsError` or `FileNotFoundError`)."""


class DataFilesNotFoundError(FileNotFoundDatasetsError):
    """No (supported) data files were found."""


class DatasetNotFoundError(FileNotFoundDatasetsError):
    """The dataset was not found.

    Raised when trying to access:
    - a missing dataset, or
    - a private/gated dataset while the user is not authenticated.
    """


class DatasetBuildError(DatasetsError):
    """Base class of the dataset build errors defined below."""


class ManualDownloadError(DatasetBuildError):
    """Dataset build error related to manual downloads (the raise sites are not part of this module)."""


class FileFormatError(DatasetBuildError):
    """Dataset build error related to file formats (the raise sites are not part of this module)."""


class DatasetGenerationError(DatasetBuildError):
    """Dataset build error raised for failures of the dataset generation."""
51
+
52
+
53
class DatasetGenerationCastError(DatasetGenerationError):
    """Dataset generation error built from a `CastError`, i.e. when data files don't have matching columns."""

    @classmethod
    def from_cast_error(
        cls,
        cast_error: CastError,
        builder_name: str,
        gen_kwargs: dict[str, Any],
        token: Optional[Union[bool, str]],
    ) -> "DatasetGenerationCastError":
        """
        Build the error with a message that explains the cast failure and names the data being processed.

        Args:
            cast_error (CastError): The cast error; its `details()` are included in the message.
            builder_name (str): Name of the dataset builder, used in the message.
            gen_kwargs (dict[str, Any]): Generation kwargs. Only tracked values (`tracked_str`, `tracked_list`,
                `TrackedIterableFromGenerator`) are reported, using the last item they yielded.
            token (bool or str, *optional*): Token passed to `HfFileSystem` to resolve `hf://` paths.

        Returns:
            DatasetGenerationCastError: The error, ready to be raised.
        """
        explanation_message = (
            f"\n\nAll the data files must have the same columns, but at some point {cast_error.details()}"
        )
        formatted_tracked_gen_kwargs: list[str] = []
        for gen_kwarg in gen_kwargs.values():
            if not isinstance(gen_kwarg, (tracked_str, tracked_list, TrackedIterableFromGenerator)):
                continue
            # Walk down nested tracked containers to the last item that was being processed
            while (
                isinstance(gen_kwarg, (tracked_list, TrackedIterableFromGenerator)) and gen_kwarg.last_item is not None
            ):
                gen_kwarg = gen_kwarg.last_item
            if isinstance(gen_kwarg, tracked_str):
                gen_kwarg = gen_kwarg.get_origin()
            if isinstance(gen_kwarg, str) and gen_kwarg.startswith("hf://"):
                resolved_path = HfFileSystem(endpoint=config.HF_ENDPOINT, token=token).resolve_path(gen_kwarg)
                gen_kwarg = "hf://" + resolved_path.unresolve()
                # Only hf:// paths have a resolved revision to report, so this check lives inside the
                # hf:// branch, where `resolved_path` is guaranteed to be bound to the current kwarg.
                if "@" + resolved_path.revision in gen_kwarg:
                    gen_kwarg = (
                        gen_kwarg.replace("@" + resolved_path.revision, "", 1)
                        + f" (at revision {resolved_path.revision})"
                    )
            formatted_tracked_gen_kwargs.append(str(gen_kwarg))
        if formatted_tracked_gen_kwargs:
            explanation_message += f"\n\nThis happened while the {builder_name} dataset builder was generating data using\n\n{', '.join(formatted_tracked_gen_kwargs)}"
        help_message = "\n\nPlease either edit the data files to have matching columns, or separate them into different configurations (see docs at https://hf.co/docs/hub/datasets-manual-configuration#multiple-configurations)"
        return cls("An error occurred while generating the dataset" + explanation_message + help_message)
88
+
89
+
90
class ChecksumVerificationError(DatasetsError):
    """Base error for failures of the checksum verification of downloaded files."""


class UnexpectedDownloadedFileError(ChecksumVerificationError):
    """Some of the downloaded files were not expected."""


class ExpectedMoreDownloadedFilesError(ChecksumVerificationError):
    """Some files that were supposed to be downloaded are missing."""


class NonMatchingChecksumError(ChecksumVerificationError):
    """The checksum of a downloaded file doesn't match the expected checksum."""


class SplitsVerificationError(DatasetsError):
    """Base error for failures of the splits verification."""


class UnexpectedSplitsError(SplitsVerificationError):
    """The expected splits of the downloaded file are missing.

    NOTE(review): the class name suggests that *unexpected* splits were found; confirm the wording
    against the raise sites.
    """


class ExpectedMoreSplitsError(SplitsVerificationError):
    """Some recorded splits are missing."""


class NonMatchingSplitsSizesError(SplitsVerificationError):
    """The sizes of the splits don't match the expected sizes."""
datasets/fingerprint.py ADDED
@@ -0,0 +1,454 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import inspect
2
+ import os
3
+ import random
4
+ import shutil
5
+ import tempfile
6
+ import weakref
7
+ from functools import wraps
8
+ from pathlib import Path
9
+ from typing import TYPE_CHECKING, Any, Callable, Optional, Union
10
+
11
+ import numpy as np
12
+ import xxhash
13
+
14
+ from . import config
15
+ from .naming import INVALID_WINDOWS_CHARACTERS_IN_PATH
16
+ from .utils._dill import dumps
17
+ from .utils.logging import get_logger
18
+
19
+
20
+ if TYPE_CHECKING:
21
+ from .arrow_dataset import Dataset
22
+
23
+
24
+ logger = get_logger(__name__)
25
+
26
+
27
+ # Fingerprinting allows to have one deterministic fingerprint per dataset state.
28
+ # A dataset fingerprint is updated after each transform.
29
+ # Re-running the same transforms on a dataset in a different session results in the same fingerprint.
30
+ # This is possible thanks to a custom hashing function that works with most python objects.
31
+
32
+ # Fingerprinting is the main mechanism that enables caching.
33
+ # The caching mechanism allows to reload an existing cache file if it's already been computed.
34
+
35
+
36
+ #################
37
+ # Caching
38
+ #################
39
+
40
+ _CACHING_ENABLED = True
41
+ _TEMP_DIR_FOR_TEMP_CACHE_FILES: Optional["_TempCacheDir"] = None
42
+ _DATASETS_WITH_TABLE_IN_TEMP_DIR: Optional[weakref.WeakSet] = None
43
+
44
+
45
class _TempCacheDir:
    """
    Temporary directory holding cached Arrow files.

    Its cleanup first releases the datasets that still reference Arrow files in the directory and only
    then deletes the directory itself, which avoids permission errors on Windows.
    """

    def __init__(self):
        self.name = tempfile.mkdtemp(prefix=config.TEMP_CACHE_DIR_PREFIX)
        self._finalizer = weakref.finalize(self, self._cleanup)

    def _cleanup(self):
        # Free the references to the Arrow files before touching the directory.
        for dataset in get_datasets_with_cache_file_in_temp_dir():
            dataset.__del__()
        if not os.path.exists(self.name):
            return
        try:
            shutil.rmtree(self.name)
        except Exception as err:
            raise OSError(
                f"An error occurred while trying to delete temporary cache directory {self.name}. Please delete it manually."
            ) from err

    def cleanup(self):
        """Run the cleanup now; does nothing if the finalizer is no longer alive."""
        finalizer_was_alive = self._finalizer.detach()
        if finalizer_was_alive:
            self._cleanup()
69
+
70
+
71
def maybe_register_dataset_for_temp_dir_deletion(dataset):
    """
    Register ``dataset`` if at least one of its cache files lives in _TEMP_DIR_FOR_TEMP_CACHE_FILES.

    Registered datasets are released before the temporary directory is deleted.
    The temporary directory _TEMP_DIR_FOR_TEMP_CACHE_FILES is used when caching is disabled;
    when it has not been created yet, this function does nothing.
    """
    global _DATASETS_WITH_TABLE_IN_TEMP_DIR

    if _TEMP_DIR_FOR_TEMP_CACHE_FILES is None:
        return

    if _DATASETS_WITH_TABLE_IN_TEMP_DIR is None:
        _DATASETS_WITH_TABLE_IN_TEMP_DIR = weakref.WeakSet()

    temp_dir = Path(_TEMP_DIR_FOR_TEMP_CACHE_FILES.name)
    for cache_file in dataset.cache_files:
        if temp_dir in Path(cache_file["filename"]).parents:
            _DATASETS_WITH_TABLE_IN_TEMP_DIR.add(dataset)
            break
88
+
89
+
90
def get_datasets_with_cache_file_in_temp_dir():
    """Return a list of the datasets registered as having a cache file in the temporary directory."""
    if _DATASETS_WITH_TABLE_IN_TEMP_DIR is None:
        return []
    return list(_DATASETS_WITH_TABLE_IN_TEMP_DIR)
92
+
93
+
94
def enable_caching():
    """
    Turn the caching mechanism on.

    When transforms are applied to a dataset, the resulting data are stored in cache files.
    Cache files are named using the dataset fingerprint, which is updated after each transform,
    so an existing cache file can be reloaded if it has already been computed.

    For reference, while caching is disabled:
    - cache files are always recreated
    - cache files are written to a temporary directory that is deleted when session closes
    - cache files are named using a random hash instead of the dataset fingerprint
    - use [`~datasets.Dataset.save_to_disk`] to save a transformed dataset or it will be deleted when session closes
    - caching doesn't affect [`~datasets.load_dataset`]. If you want to regenerate a dataset from scratch you should use
    the `download_mode` parameter in [`~datasets.load_dataset`].
    """
    global _CACHING_ENABLED
    _CACHING_ENABLED = True
113
+
114
+
115
def disable_caching():
    """
    Turn the caching mechanism off.

    When transforms are applied to a dataset, the resulting data are stored in cache files.
    With caching enabled, those files are named using the dataset fingerprint (updated after each
    transform) and can be reloaded if they have already been computed.

    Once caching is disabled, cached dataset files are no longer reloaded when applying transforms:
    - cache files are always recreated
    - cache files are written to a temporary directory that is deleted when session closes
    - cache files are named using a random hash instead of the dataset fingerprint
    - use [`~datasets.Dataset.save_to_disk`] to save a transformed dataset or it will be deleted when session closes
    - caching doesn't affect [`~datasets.load_dataset`]. If you want to regenerate a dataset from scratch you should use
    the `download_mode` parameter in [`~datasets.load_dataset`].
    """
    global _CACHING_ENABLED
    _CACHING_ENABLED = False
134
+
135
+
136
def is_caching_enabled() -> bool:
    """
    Tell whether the caching mechanism is currently enabled.

    When transforms are applied to a dataset, the resulting data are stored in cache files.
    With caching enabled, those files are named using the dataset fingerprint (updated after each
    transform) and can be reloaded if they have already been computed.

    If caching is disabled, cached dataset files are not reloaded when applying transforms:
    - cache files are always recreated
    - cache files are written to a temporary directory that is deleted when session closes
    - cache files are named using a random hash instead of the dataset fingerprint
    - use [`~datasets.Dataset.save_to_disk`] to save a transformed dataset or it will be deleted when session closes
    - caching doesn't affect [`~datasets.load_dataset`]. If you want to regenerate a dataset from scratch you should use
    the `download_mode` parameter in [`~datasets.load_dataset`].

    Returns:
        `bool`: the truth value of the module-level caching flag.
    """
    caching_flag = _CACHING_ENABLED
    return bool(caching_flag)
155
+
156
+
157
def get_temporary_cache_files_directory() -> str:
    """Return the path of a directory that is deleted when session closes.

    The directory is created on the first call and the same one is returned afterwards.
    """
    global _TEMP_DIR_FOR_TEMP_CACHE_FILES
    temp_dir = _TEMP_DIR_FOR_TEMP_CACHE_FILES
    if temp_dir is None:
        temp_dir = _TEMP_DIR_FOR_TEMP_CACHE_FILES = _TempCacheDir()
    return temp_dir.name
163
+
164
+
165
+ #################
166
+ # Hashing
167
+ #################
168
+
169
+
170
class Hasher:
    """Incremental xxh64-based hasher that accepts arbitrary python objects as inputs."""

    dispatch: dict = {}

    def __init__(self):
        self.m = xxhash.xxh64()

    @classmethod
    def hash_bytes(cls, value: Union[bytes, list[bytes]]) -> str:
        """Return the hex digest of a bytes object, or of several bytes objects fed in order."""
        chunks = [value] if isinstance(value, bytes) else value
        digest = xxhash.xxh64()
        for chunk in chunks:
            digest.update(chunk)
        return digest.hexdigest()

    @classmethod
    def hash(cls, value: Any) -> str:
        """Return the hex digest of ``value`` after serializing it with ``dumps``."""
        serialized = dumps(value)
        return cls.hash_bytes(serialized)

    def update(self, value: Any) -> None:
        """Mix the type of ``value`` and the hash of ``value`` into the running digest."""
        type_header = f"=={type(value)}=="
        # Hash the value before touching the running digest, so a hashing failure leaves it untouched.
        value_digest = self.hash(value)
        for piece in (type_header, value_digest):
            self.m.update(piece.encode("utf-8"))

    def hexdigest(self) -> str:
        """Return the hex digest of everything passed to ``update`` so far."""
        return self.m.hexdigest()
198
+
199
+
200
+ #################
201
+ # Fingerprinting
202
+ #################
203
+
204
+ fingerprint_rng = random.Random()
205
+ # we show a warning only once when fingerprinting fails to avoid spam
206
+ fingerprint_warnings: dict[str, bool] = {}
207
+
208
+
209
def generate_fingerprint(dataset: "Dataset") -> str:
    """Compute a fingerprint from the dataset's attributes and its cache files' modification times.

    Every entry of ``dataset.__dict__`` except ``_fingerprint`` is hashed, in sorted key order.
    """
    hasher = Hasher()
    attributes = dataset.__dict__
    for name in sorted(attributes):
        if name != "_fingerprint":
            hasher.update(name)
            hasher.update(attributes[name])
    # hash data files last modification timestamps as well
    for cache_file in dataset.cache_files:
        last_modified = os.path.getmtime(cache_file["filename"])
        hasher.update(last_modified)
    return hasher.hexdigest()
221
+
222
+
223
def generate_random_fingerprint(nbits: int = 64) -> str:
    """Return ``nbits`` random bits as a zero-padded lowercase hex string of ``nbits // 4`` characters."""
    random_bits = fingerprint_rng.getrandbits(nbits)
    hex_width = nbits // 4
    return f"{random_bits:0{hex_width}x}"
225
+
226
+
227
def _log_unhashable_for_fingerprint(subject: str) -> None:
    """Log that ``subject`` couldn't be hashed and that a random hash was used instead.

    With caching enabled, the first failure is logged as a warning (with guidance) and later ones
    at info level, to avoid spam. With caching disabled, the failure is always logged at info level.
    """
    base_message = f"{subject} couldn't be hashed properly, a random hash was used instead."
    if not _CACHING_ENABLED:
        logger.info(f"{base_message} This doesn't affect caching since it's disabled.")
    elif not fingerprint_warnings.get("update_fingerprint_transform_hash_failed", False):
        logger.warning(
            f"{base_message} "
            "Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. "
            "If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. "
            "This warning is only shown once. Subsequent hashing failures won't be shown."
        )
        fingerprint_warnings["update_fingerprint_transform_hash_failed"] = True
    else:
        logger.info(base_message)


def update_fingerprint(fingerprint, transform, transform_args):
    """Compute the fingerprint obtained by applying ``transform`` with ``transform_args`` to ``fingerprint``.

    Args:
        fingerprint: Fingerprint of the dataset before the transform.
        transform: Object identifying the transform; it is hashed as is.
        transform_args: Mapping from argument name to value; entries are hashed in sorted key order.

    Returns:
        The hex digest combining the three inputs, or a random fingerprint if the transform or one
        of its arguments can't be hashed.
    """
    hasher = Hasher()
    hasher.update(fingerprint)
    try:
        hasher.update(transform)
    except Exception:  # various errors might raise here from pickle or dill
        # `Exception` rather than a bare `except`, so KeyboardInterrupt/SystemExit still propagate.
        _log_unhashable_for_fingerprint(f"Transform {transform}")
        return generate_random_fingerprint()
    for key in sorted(transform_args):
        hasher.update(key)
        try:
            hasher.update(transform_args[key])
        except Exception:  # various errors might raise here from pickle or dill
            _log_unhashable_for_fingerprint(
                f"Parameter '{key}'={transform_args[key]} of the transform {transform}"
            )
            return generate_random_fingerprint()
    return hasher.hexdigest()
275
+
276
+
277
def validate_fingerprint(fingerprint: str, max_length=64):
    """
    Make sure the fingerprint can be used to name cache files without issues.

    Args:
        fingerprint: Value to check. It must be a non-empty string.
        max_length: Maximum accepted length of the fingerprint (64 by default).

    Raises:
        ValueError: If the fingerprint is not a non-empty string, contains one of
            INVALID_WINDOWS_CHARACTERS_IN_PATH, or is longer than ``max_length``.
    """
    if not isinstance(fingerprint, str) or not fingerprint:
        raise ValueError(f"Invalid fingerprint '{fingerprint}': it should be a non-empty string.")
    if any(invalid_char in fingerprint for invalid_char in INVALID_WINDOWS_CHARACTERS_IN_PATH):
        raise ValueError(
            f"Invalid fingerprint. Bad characters from black list '{INVALID_WINDOWS_CHARACTERS_IN_PATH}' found in '{fingerprint}'. "
            "They could create issues when creating cache files."
        )
    if len(fingerprint) > max_length:
        raise ValueError(
            f"Invalid fingerprint. Maximum length is {max_length} but '{fingerprint}' has length {len(fingerprint)}. "
            "It could create issues when creating cache files."
        )
295
+
296
+
297
def format_transform_for_fingerprint(func: Callable, version: Optional[str] = None) -> str:
    """
    Build the string that identifies a transform when updating the fingerprint.

    The result is ``<module>.<qualname>`` of ``func``, followed by ``@<version>`` when a version is given.
    """
    qualified_name = f"{func.__module__}.{func.__qualname__}"
    return qualified_name if version is None else f"{qualified_name}@{version}"
305
+
306
+
307
def format_kwargs_for_fingerprint(
    func: Callable,
    args: tuple,
    kwargs: dict[str, Any],
    use_kwargs: Optional[list[str]] = None,
    ignore_kwargs: Optional[list[str]] = None,
    randomized_function: bool = False,
) -> dict[str, Any]:
    """
    Format the kwargs of a transform to the format that will be used to update the fingerprint.

    Args:
        func: The transform. Its first parameter is assumed to be the dataset.
        args: Positional arguments of the call (dataset first, if passed positionally).
        kwargs: Keyword arguments of the call. This dict is not modified.
        use_kwargs: If set (and non-empty), only these argument names are kept.
        ignore_kwargs: If set, these argument names are dropped. Applied after ``use_kwargs``.
        randomized_function: If True and neither "seed" nor "generator" is set, a "generator" seeded
            from numpy's global random state is added.

    Returns:
        A new dict mapping argument names to values, without the dataset and without the
        arguments that are equal to their default value.
    """
    parameters = inspect.signature(func).parameters
    kwargs_for_fingerprint = kwargs.copy()
    if args:
        # `**kwargs` can never receive a positional argument, so it must not be paired with one.
        params = [p.name for p in parameters.values() if p.kind != p.VAR_KEYWORD]
        args = args[1:]  # assume the first argument is the dataset
        params = params[1:]
        kwargs_for_fingerprint.update(zip(params, args))
    else:
        del kwargs_for_fingerprint[next(iter(parameters))]  # assume the first key is the dataset

    # keep the right kwargs to be hashed to generate the fingerprint

    if use_kwargs:
        kwargs_for_fingerprint = {k: v for k, v in kwargs_for_fingerprint.items() if k in use_kwargs}
    if ignore_kwargs:
        kwargs_for_fingerprint = {k: v for k, v in kwargs_for_fingerprint.items() if k not in ignore_kwargs}
    if randomized_function:  # randomized functions have `seed` and `generator` parameters
        if kwargs_for_fingerprint.get("seed") is None and kwargs_for_fingerprint.get("generator") is None:
            _, seed, pos, *_ = np.random.get_state()
            seed = seed[pos] if pos < 624 else seed[0]
            kwargs_for_fingerprint["generator"] = np.random.default_rng(seed)

    # remove kwargs that are the default values

    # Identity check: `!=` would be delegated to the default value, which fails for array-like defaults.
    default_values = {p.name: p.default for p in parameters.values() if p.default is not inspect.Parameter.empty}
    for default_varname, default_value in default_values.items():
        if default_varname in kwargs_for_fingerprint and kwargs_for_fingerprint[default_varname] == default_value:
            kwargs_for_fingerprint.pop(default_varname)
    return kwargs_for_fingerprint
350
+
351
+
352
def fingerprint_transform(
    inplace: bool,
    use_kwargs: Optional[list[str]] = None,
    ignore_kwargs: Optional[list[str]] = None,
    fingerprint_names: Optional[list[str]] = None,
    randomized_function: bool = False,
    version: Optional[str] = None,
):
    """
    Wrapper for dataset transforms to update the dataset fingerprint using ``update_fingerprint``

    Args:
        inplace (:obj:`bool`): If inplace is True, the fingerprint of the dataset is updated inplace.
            Otherwise, a parameter "new_fingerprint" is passed to the wrapped method that should take care of
            setting the fingerprint of the returned Dataset.
        use_kwargs (:obj:`List[str]`, optional): optional white list of argument names to take into account
            to update the fingerprint. By default all the arguments are used.
        ignore_kwargs (:obj:`List[str]`, optional): optional black list of argument names to ignore
            when updating the fingerprint. Note that ignore_kwargs prevails on use_kwargs.
        fingerprint_names (:obj:`List[str]`, optional, defaults to ["new_fingerprint"]):
            If the dataset transforms is not inplace and returns a DatasetDict, then it can require
            several fingerprints (one per dataset in the DatasetDict). By specifying fingerprint_names,
            one fingerprint named after each element of fingerprint_names is going to be passed.
        randomized_function (:obj:`bool`, defaults to False): If the dataset transform is random and has
            optional parameters "seed" and "generator", then you can set randomized_function to True.
            This way, even if users set "seed" and "generator" to None, then the fingerprint is
            going to be randomly generated depending on numpy's current state. In this case, the
            generator is a ``np.random.default_rng`` seeded from numpy's global random state.
        version (:obj:`str`, optional): version of the transform. The version is taken into account when
            computing the fingerprint. If a dataset transform changes (or at least if the output data
            that are cached changes), then one should increase the version. If the version stays the
            same, then old cached data could be reused that are not compatible with the new transform.
            It should be in the format "MAJOR.MINOR.PATCH".

    Returns:
        A decorator to apply to the transform.

    Raises:
        ValueError: If ``use_kwargs`` or ``ignore_kwargs`` is not a list, or if ``fingerprint_names``
            is given together with ``inplace=True``. The returned decorator raises ValueError if the
            decorated function lacks the required names ("seed"/"generator", fingerprint names).
    """

    if use_kwargs is not None and not isinstance(use_kwargs, list):
        raise ValueError(f"use_kwargs is supposed to be a list, not {type(use_kwargs)}")

    if ignore_kwargs is not None and not isinstance(ignore_kwargs, list):
        raise ValueError(f"ignore_kwargs is supposed to be a list, not {type(ignore_kwargs)}")

    if inplace and fingerprint_names:
        raise ValueError("fingerprint_names are only used when inplace is False")

    fingerprint_names = fingerprint_names if fingerprint_names is not None else ["new_fingerprint"]

    def _fingerprint(func):
        if not inplace and not all(name in func.__code__.co_varnames for name in fingerprint_names):
            raise ValueError(f"function {func} is missing parameters {fingerprint_names} in signature")

        if randomized_function:  # randomized function have seed and generator parameters
            if "seed" not in func.__code__.co_varnames:
                raise ValueError(f"'seed' must be in {func}'s signature")
            if "generator" not in func.__code__.co_varnames:
                raise ValueError(f"'generator' must be in {func}'s signature")
        # this call has to be outside the wrapper since __qualname__ changes in multiprocessing
        transform = format_transform_for_fingerprint(func, version=version)

        @wraps(func)
        def wrapper(*args, **kwargs):
            kwargs_for_fingerprint = format_kwargs_for_fingerprint(
                func,
                args,
                kwargs,
                use_kwargs=use_kwargs,
                ignore_kwargs=ignore_kwargs,
                randomized_function=randomized_function,
            )

            # the dataset is the first positional argument, or else the first parameter passed by keyword
            if args:
                dataset: Dataset = args[0]
                args = args[1:]
            else:
                dataset: Dataset = kwargs.pop(next(iter(inspect.signature(func).parameters)))

            # compute new_fingerprint and add it to the args of not in-place transforms
            if inplace:
                new_fingerprint = update_fingerprint(dataset._fingerprint, transform, kwargs_for_fingerprint)
            else:
                for fingerprint_name in fingerprint_names:  # transforms like `train_test_split` have several hashes
                    if kwargs.get(fingerprint_name) is None:
                        # including the name makes each generated fingerprint different
                        kwargs_for_fingerprint["fingerprint_name"] = fingerprint_name
                        kwargs[fingerprint_name] = update_fingerprint(
                            dataset._fingerprint, transform, kwargs_for_fingerprint
                        )
                    else:
                        validate_fingerprint(kwargs[fingerprint_name])

            # Call actual function

            out = func(dataset, *args, **kwargs)

            # Update fingerprint of in-place transforms + update in-place history of transforms

            if inplace:  # update after calling func so that the fingerprint doesn't change if the function fails
                dataset._fingerprint = new_fingerprint

            return out

        wrapper._decorator_name_ = "fingerprint"
        return wrapper

    return _fingerprint
datasets/hub.py ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from itertools import chain
2
+ from typing import Optional, Union
3
+
4
+ from huggingface_hub import (
5
+ CommitInfo,
6
+ CommitOperationAdd,
7
+ CommitOperationDelete,
8
+ DatasetCard,
9
+ DatasetCardData,
10
+ HfApi,
11
+ HfFileSystem,
12
+ )
13
+
14
+ import datasets.config
15
+ from datasets.info import DatasetInfosDict
16
+ from datasets.load import load_dataset_builder
17
+ from datasets.utils.metadata import MetadataConfigs
18
+
19
+
20
def delete_from_hub(
    repo_id: str,
    config_name: str,
    revision: Optional[str] = None,
    token: Optional[Union[bool, str]] = None,
) -> CommitInfo:
    """Delete a dataset configuration from a [data-only dataset](repository_structure) on the Hub.

    The deletion is not applied directly: a pull request is opened that removes the data files of the
    configuration hosted in `repo_id` and updates the dataset card (`config_names`, metadata configs
    and `dataset_info`).

    Args:
        repo_id (`str`): ID of the Hub dataset repository, in the following format: `<user>/<dataset_name>` or
            `<org>/<dataset_name>`.
        config_name (`str`): Name of the dataset configuration.
        revision (`str`, *optional*): Branch to delete the configuration from. Defaults to the `"main"` branch.
        token (`bool` or `str`, *optional*): Authentication token for the Hugging Face Hub.

    Returns:
        `huggingface_hub.CommitInfo`
    """
    operations = []
    # data_files
    fs = HfFileSystem(endpoint=datasets.config.HF_ENDPOINT, token=token)
    builder = load_dataset_builder(repo_id, config_name, revision=revision, token=token)
    for data_file in chain(*builder.config.data_files.values()):
        data_file_resolved_path = fs.resolve_path(data_file)
        # only delete files hosted in this repository
        if data_file_resolved_path.repo_id == repo_id:
            operations.append(CommitOperationDelete(path_in_repo=data_file_resolved_path.path_in_repo))
    # README.md
    # Forward the token like every other Hub call here, so the card of a private/gated repo can be read.
    # NOTE(review): the card is loaded without specifying `revision` - confirm this is intended
    # when `revision` is not the default branch.
    dataset_card = DatasetCard.load(repo_id, token=token)
    # config_names
    if dataset_card.data.get("config_names", None) and config_name in dataset_card.data["config_names"]:
        dataset_card.data["config_names"].remove(config_name)
    # metadata_configs
    metadata_configs = MetadataConfigs.from_dataset_card_data(dataset_card.data)
    if metadata_configs:
        _ = metadata_configs.pop(config_name, None)
        dataset_card_data = DatasetCardData()
        metadata_configs.to_dataset_card_data(dataset_card_data)
        if datasets.config.METADATA_CONFIGS_FIELD in dataset_card_data:
            dataset_card.data[datasets.config.METADATA_CONFIGS_FIELD] = dataset_card_data[
                datasets.config.METADATA_CONFIGS_FIELD
            ]
        else:
            # nothing was written back for this field: drop it from the card
            _ = dataset_card.data.pop(datasets.config.METADATA_CONFIGS_FIELD, None)
    # dataset_info
    dataset_infos: DatasetInfosDict = DatasetInfosDict.from_dataset_card_data(dataset_card.data)
    if dataset_infos:
        _ = dataset_infos.pop(config_name, None)
        dataset_card_data = DatasetCardData()
        dataset_infos.to_dataset_card_data(dataset_card_data)
        if "dataset_info" in dataset_card_data:
            dataset_card.data["dataset_info"] = dataset_card_data["dataset_info"]
        else:
            _ = dataset_card.data.pop("dataset_info", None)
    # Commit
    operations.append(
        CommitOperationAdd(path_in_repo=datasets.config.REPOCARD_FILENAME, path_or_fileobj=str(dataset_card).encode())
    )
    api = HfApi(endpoint=datasets.config.HF_ENDPOINT, token=token)
    commit_info = api.create_commit(
        repo_id,
        operations=operations,
        commit_message=f"Delete '{config_name}' config",
        commit_description=f"Delete '{config_name}' config.",
        token=token,
        repo_type="dataset",
        revision=revision,
        create_pr=True,
    )
    print(f"You can find your PR to delete the dataset config at: {commit_info.pr_url}")
    return commit_info
90
+
91
+
92
def _delete_files(dataset_id, revision=None, token=None):
    """Delete all the files of a Hub dataset repository except `.gitattributes` and `README.md`.

    The legacy `dataset_infos.json` is deleted first (if present), then every other file, with one
    commit per deleted file.

    Args:
        dataset_id: ID of the dataset repository on the Hub.
        revision: Revision whose files are listed and deleted. `None` is passed through to the Hub client.
        token: Authentication token for the Hugging Face Hub.
    """
    hf_api = HfApi(endpoint=datasets.config.HF_ENDPOINT, token=token)
    # List the files of the revision the deletions are applied to; otherwise the listing could
    # name files that don't exist on `revision` and miss the ones that do.
    repo_files = hf_api.list_repo_files(
        dataset_id,
        revision=revision,
        repo_type="dataset",
    )
    if not repo_files:
        return

    kept_files = {".gitattributes", "README.md"}
    legacy_json_filename = "dataset_infos.json"
    data_files = [
        filename
        for filename in repo_files
        if filename not in kept_files and filename != legacy_json_filename
    ]
    if legacy_json_filename in repo_files:
        hf_api.delete_file(
            "dataset_infos.json",
            dataset_id,
            repo_type="dataset",
            revision=revision,
            commit_message="Delete legacy dataset_infos.json",
        )
    for filename in data_files:
        hf_api.delete_file(
            filename,
            dataset_id,
            repo_type="dataset",
            revision=revision,
            commit_message="Delete data file",
        )
datasets/info.py ADDED
@@ -0,0 +1,430 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2020 The HuggingFace Datasets Authors and the TensorFlow Datasets Authors.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ # Lint as: python3
16
+ """DatasetInfo record information we know about a dataset.
17
+
18
+ This includes things that we know about the dataset statically, i.e.:
19
+ - description
20
+ - canonical location
21
+ - does it have validation and tests splits
22
+ - size
23
+ - etc.
24
+
25
+ This also includes the things that can and should be computed once we've
26
+ processed the dataset as well:
27
+ - number of examples (in each split)
28
+ - etc.
29
+ """
30
+
31
+ import copy
32
+ import dataclasses
33
+ import json
34
+ import os
35
+ import posixpath
36
+ from dataclasses import dataclass
37
+ from pathlib import Path
38
+ from typing import ClassVar, Optional, Union
39
+
40
+ import fsspec
41
+ from fsspec.core import url_to_fs
42
+ from huggingface_hub import DatasetCard, DatasetCardData
43
+
44
+ from . import config
45
+ from .features import Features
46
+ from .splits import SplitDict
47
+ from .utils import Version
48
+ from .utils.logging import get_logger
49
+ from .utils.py_utils import asdict, unique_values
50
+
51
+
52
+ logger = get_logger(__name__)
53
+
54
+
55
@dataclass
class SupervisedKeysData:
    """Pair of names (`input`, `output`), both defaulting to the empty string."""

    input: str = ""
    output: str = ""
59
+
60
+
61
@dataclass
class DownloadChecksumsEntryData:
    """Single `key`/`value` pair of strings, both defaulting to the empty string."""

    key: str = ""
    value: str = ""
65
+
66
+
67
class MissingCachedSizesConfigError(Exception):
    """Raised when the expected cached sizes of the download file are missing."""
69
+
70
+
71
class NonMatchingCachedSizesError(Exception):
    """Raised when the prepared split doesn't have the expected sizes."""
73
+
74
+
75
@dataclass
class PostProcessedInfo:
    """Features and resource checksums recorded for a post-processed dataset."""

    features: Optional[Features] = None
    resources_checksums: Optional[dict] = None

    def __post_init__(self):
        # When reloaded from a dict, `features` is not a `Features` instance yet: convert it back.
        needs_conversion = self.features is not None and not isinstance(self.features, Features)
        if needs_conversion:
            self.features = Features.from_dict(self.features)

    @classmethod
    def from_dict(cls, post_processed_info_dict: dict) -> "PostProcessedInfo":
        """Build an instance from a dict, ignoring the keys that are not fields of the dataclass."""
        known_fields = {field.name for field in dataclasses.fields(cls)}
        init_kwargs = {}
        for key, value in post_processed_info_dict.items():
            if key in known_fields:
                init_kwargs[key] = value
        return cls(**init_kwargs)
89
+
90
+
91
+ @dataclass
92
+ class DatasetInfo:
93
+ """Information about a dataset.
94
+
95
+ `DatasetInfo` documents datasets, including its name, version, and features.
96
+ See the constructor arguments and properties for a full list.
97
+
98
+ Not all fields are known on construction and may be updated later.
99
+
100
+ Attributes:
101
+ description (`str`):
102
+ A description of the dataset.
103
+ citation (`str`):
104
+ A BibTeX citation of the dataset.
105
+ homepage (`str`):
106
+ A URL to the official homepage for the dataset.
107
+ license (`str`):
108
+ The dataset's license. It can be the name of the license or a paragraph containing the terms of the license.
109
+ features ([`Features`], *optional*):
110
+ The features used to specify the dataset's column types.
111
+ post_processed (`PostProcessedInfo`, *optional*):
112
+ Information regarding the resources of a possible post-processing of a dataset. For example, it can contain the information of an index.
113
+ supervised_keys (`SupervisedKeysData`, *optional*):
114
+ Specifies the input feature and the label for supervised learning if applicable for the dataset (legacy from TFDS).
115
+ builder_name (`str`, *optional*):
116
+ The name of the `GeneratorBasedBuilder` subclass used to create the dataset. It is also the snake_case version of the dataset builder class name.
117
+ config_name (`str`, *optional*):
118
+ The name of the configuration derived from [`BuilderConfig`].
119
+ version (`str` or [`Version`], *optional*):
120
+ The version of the dataset.
121
+ splits (`dict`, *optional*):
122
+ The mapping between split name and metadata.
123
+ download_checksums (`dict`, *optional*):
124
+ The mapping between the URL to download the dataset's checksums and corresponding metadata.
125
+ download_size (`int`, *optional*):
126
+ The size of the files to download to generate the dataset, in bytes.
127
+ post_processing_size (`int`, *optional*):
128
+ Size of the dataset in bytes after post-processing, if any.
129
+ dataset_size (`int`, *optional*):
130
+ The combined size in bytes of the Arrow tables for all splits.
131
+ size_in_bytes (`int`, *optional*):
132
+ The combined size in bytes of all files associated with the dataset (downloaded files + Arrow files).
133
+ **config_kwargs (additional keyword arguments):
134
+ Keyword arguments to be passed to the [`BuilderConfig`] and used in the [`DatasetBuilder`].
135
+ """
136
+
137
+ # Set in the dataset builders
138
+ description: str = dataclasses.field(default_factory=str)
139
+ citation: str = dataclasses.field(default_factory=str)
140
+ homepage: str = dataclasses.field(default_factory=str)
141
+ license: str = dataclasses.field(default_factory=str)
142
+ features: Optional[Features] = None
143
+ post_processed: Optional[PostProcessedInfo] = None
144
+ supervised_keys: Optional[SupervisedKeysData] = None
145
+
146
+ # Set later by the builder
147
+ builder_name: Optional[str] = None
148
+ dataset_name: Optional[str] = None # for packaged builders, to be different from builder_name
149
+ config_name: Optional[str] = None
150
+ version: Optional[Union[str, Version]] = None
151
+ # Set later by `download_and_prepare`
152
+ splits: Optional[dict] = None
153
+ download_checksums: Optional[dict] = None
154
+ download_size: Optional[int] = None
155
+ post_processing_size: Optional[int] = None
156
+ dataset_size: Optional[int] = None
157
+ size_in_bytes: Optional[int] = None
158
+
159
+ _INCLUDED_INFO_IN_YAML: ClassVar[list[str]] = [
160
+ "config_name",
161
+ "download_size",
162
+ "dataset_size",
163
+ "features",
164
+ "splits",
165
+ ]
166
+
167
+ def __post_init__(self):
168
+ # Convert back to the correct classes when we reload from dict
169
+ if self.features is not None and not isinstance(self.features, Features):
170
+ self.features = Features.from_dict(self.features)
171
+ if self.post_processed is not None and not isinstance(self.post_processed, PostProcessedInfo):
172
+ self.post_processed = PostProcessedInfo.from_dict(self.post_processed)
173
+ if self.version is not None and not isinstance(self.version, Version):
174
+ if isinstance(self.version, str):
175
+ self.version = Version(self.version)
176
+ else:
177
+ self.version = Version.from_dict(self.version)
178
+ if self.splits is not None and not isinstance(self.splits, SplitDict):
179
+ self.splits = SplitDict.from_split_dict(self.splits)
180
+ if self.supervised_keys is not None and not isinstance(self.supervised_keys, SupervisedKeysData):
181
+ if isinstance(self.supervised_keys, (tuple, list)):
182
+ self.supervised_keys = SupervisedKeysData(*self.supervised_keys)
183
+ else:
184
+ self.supervised_keys = SupervisedKeysData(**self.supervised_keys)
185
+
186
def write_to_directory(self, dataset_info_dir, pretty_print=False, storage_options: Optional[dict] = None):
    """Write `DatasetInfo` and license (if present) as JSON files to `dataset_info_dir`.

    The info file is always written; the license file is written only when
    `self.license` is non-empty.

    Args:
        dataset_info_dir (`str`):
            Destination directory.
        pretty_print (`bool`, defaults to `False`):
            If `True`, the JSON will be pretty-printed with the indent level of 4.
        storage_options (`dict`, *optional*):
            Key/value pairs to be passed on to the file-system backend, if any.

            <Added version="2.9.0"/>

    Example:

    ```py
    >>> from datasets import load_dataset
    >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", split="validation")
    >>> ds.info.write_to_directory("/path/to/directory/")
    ```
    """
    fs: fsspec.AbstractFileSystem
    fs, *_unused = url_to_fs(dataset_info_dir, **(storage_options or {}))

    # Files are opened in bytes mode so that remote file systems are supported too.
    info_path = posixpath.join(dataset_info_dir, config.DATASET_INFO_FILENAME)
    with fs.open(info_path, "wb") as info_file:
        self._dump_info(info_file, pretty_print=pretty_print)

    if not self.license:
        return
    license_path = posixpath.join(dataset_info_dir, config.LICENSE_FILENAME)
    with fs.open(license_path, "wb") as license_file:
        self._dump_license(license_file)
214
+
215
def _dump_info(self, file, pretty_print=False):
    """Write this info to `file` as UTF-8 encoded JSON.

    `file` must be a file-like object opened in bytes mode (to support remote files).
    The JSON is indented by 4 spaces when `pretty_print` is true, compact otherwise.
    """
    indent = 4 if pretty_print else None
    serialized = json.dumps(asdict(self), indent=indent)
    file.write(serialized.encode("utf-8"))
218
+
219
+ def _dump_license(self, file):
220
+ """Dump license in `file` file-like object open in bytes mode (to support remote files)"""
221
+ file.write(self.license.encode("utf-8"))
222
+
223
@classmethod
def from_merge(cls, dataset_infos: list["DatasetInfo"]):
    """Build a single info out of several ones.

    `None` entries are ignored and the remaining infos are copied first. If all of
    them compare equal, the first copy is returned as is. Otherwise a new info is
    created whose `description`, `citation`, `homepage` and `license` are the values
    yielded by `unique_values` over the inputs, joined by blank lines and stripped;
    `features` and `supervised_keys` are reset to `None`.
    """
    infos = [info.copy() for info in dataset_infos if info is not None]

    # Identical infos need no merging: hand back the (copied) first one.
    if infos and all(infos[0] == other for other in infos):
        return infos[0]

    def _merged(field_name):
        values = unique_values(getattr(info, field_name) for info in infos)
        return "\n\n".join(values).strip()

    return cls(
        description=_merged("description"),
        citation=_merged("citation"),
        homepage=_merged("homepage"),
        license=_merged("license"),
        features=None,
        supervised_keys=None,
    )
246
+
247
@classmethod
def from_directory(cls, dataset_info_dir: str, storage_options: Optional[dict] = None) -> "DatasetInfo":
    """Create [`DatasetInfo`] from the JSON file in `dataset_info_dir`.

    The file named `config.DATASET_INFO_FILENAME` inside the directory is parsed as
    JSON and handed to `cls.from_dict`.

    Args:
        dataset_info_dir (`str`):
            The directory containing the metadata file. This
            should be the root directory of a specific dataset version.
        storage_options (`dict`, *optional*):
            Key/value pairs to be passed on to the file-system backend, if any.

            <Added version="2.9.0"/>

    Raises:
        ValueError: If `dataset_info_dir` is empty or `None`.

    Example:

    ```py
    >>> from datasets import DatasetInfo
    >>> ds_info = DatasetInfo.from_directory("/path/to/directory/")
    ```
    """
    # Validate first, so that an undefined directory fails with the intended
    # ValueError instead of whatever the file-system lookup does with an empty value.
    if not dataset_info_dir:
        raise ValueError("Calling DatasetInfo.from_directory() with undefined dataset_info_dir.")
    fs: fsspec.AbstractFileSystem
    fs, *_ = url_to_fs(dataset_info_dir, **(storage_options or {}))
    logger.debug(f"Loading Dataset info from {dataset_info_dir}")
    with fs.open(posixpath.join(dataset_info_dir, config.DATASET_INFO_FILENAME), "r", encoding="utf-8") as f:
        dataset_info_dict = json.load(f)
    return cls.from_dict(dataset_info_dict)
280
+
281
@classmethod
def from_dict(cls, dataset_info_dict: dict) -> "DatasetInfo":
    """Instantiate the class from a dict, ignoring keys that are not dataclass fields."""
    known_fields = {field.name for field in dataclasses.fields(cls)}
    init_kwargs = {}
    for key, value in dataset_info_dict.items():
        if key in known_fields:
            init_kwargs[key] = value
    return cls(**init_kwargs)
285
+
286
def update(self, other_dataset_info: "DatasetInfo", ignore_none=True):
    """Overwrite attributes of this info in place with deep copies of another info's.

    Args:
        other_dataset_info (`DatasetInfo`):
            The info whose attributes are copied over.
        ignore_none (`bool`, defaults to `True`):
            If `True`, attributes that are `None` on `other_dataset_info` are skipped,
            so they never erase a value already set here.
    """
    incoming = {}
    for name, value in other_dataset_info.__dict__.items():
        if ignore_none and value is None:
            continue
        # Deep copy so the two infos never share mutable state.
        incoming[name] = copy.deepcopy(value)
    self.__dict__.update(**incoming)
295
+
296
def copy(self) -> "DatasetInfo":
    """Return a new instance of the same class built from deep copies of all attributes."""
    # Imported locally so the lookup never depends on what the name `copy`
    # happens to be bound to in the enclosing scope.
    from copy import deepcopy

    cloned_attrs = {}
    for name, value in self.__dict__.items():
        cloned_attrs[name] = deepcopy(value)
    return self.__class__(**cloned_attrs)
298
+
299
def _to_yaml_dict(self) -> dict:
    """Build the dict that represents this info in YAML metadata.

    Only the keys of `asdict(self)` that are listed in `_INCLUDED_INFO_IN_YAML` are
    kept. Values exposing `_to_yaml_list` or `_to_yaml_string` are converted with
    that method; any other value is stored unchanged.
    """

    def _render(value):
        if hasattr(value, "_to_yaml_list"):  # Features, SplitDict
            return value._to_yaml_list()
        if hasattr(value, "_to_yaml_string"):  # Version
            return value._to_yaml_string()
        return value

    return {
        name: _render(getattr(self, name)) for name in asdict(self) if name in self._INCLUDED_INFO_IN_YAML
    }
312
+
313
+ @classmethod
314
+ def _from_yaml_dict(cls, yaml_data: dict) -> "DatasetInfo":
315
+ yaml_data = copy.deepcopy(yaml_data)
316
+ if yaml_data.get("features") is not None:
317
+ yaml_data["features"] = Features._from_yaml_list(yaml_data["features"])
318
+ if yaml_data.get("splits") is not None:
319
+ yaml_data["splits"] = SplitDict._from_yaml_list(yaml_data["splits"])
320
+ field_names = {f.name for f in dataclasses.fields(cls)}
321
+ return cls(**{k: v for k, v in yaml_data.items() if k in field_names})
322
+
323
+
324
class DatasetInfosDict(dict[str, DatasetInfo]):
    """Mapping from config name to [`DatasetInfo`], with README/JSON (de)serialization helpers."""

    def write_to_directory(self, dataset_infos_dir, overwrite=False, pretty_print=False) -> None:
        """Write the infos to the YAML part of the README in `dataset_infos_dir`.

        Args:
            dataset_infos_dir (`str`):
                Directory holding the README file (and possibly the legacy JSON file).
            overwrite (`bool`, defaults to `False`):
                If `False`, infos already stored in the directory are loaded first and
                then updated with the ones of this dict. If `True`, only the infos of
                this dict are used.
            pretty_print (`bool`, defaults to `False`):
                If `True`, the legacy JSON file is written with an indent level of 4.
        """
        dataset_infos_path = os.path.join(dataset_infos_dir, config.DATASETDICT_INFOS_FILENAME)
        dataset_readme_path = os.path.join(dataset_infos_dir, config.REPOCARD_FILENAME)
        # Always start from an instance of this class (never a plain dict), because
        # `to_dataset_card_data` is called on the result below.
        total_dataset_infos = self.__class__() if overwrite else self.from_directory(dataset_infos_dir)
        total_dataset_infos.update(self)
        if os.path.exists(dataset_infos_path):
            # for backward compatibility, let's update the JSON file if it exists
            with open(dataset_infos_path, "w", encoding="utf-8") as f:
                dataset_infos_dict = {
                    config_name: asdict(dset_info) for config_name, dset_info in total_dataset_infos.items()
                }
                json.dump(dataset_infos_dict, f, indent=4 if pretty_print else None)
        # Dump the infos in the YAML part of the README.md file
        if os.path.exists(dataset_readme_path):
            dataset_card = DatasetCard.load(dataset_readme_path)
            dataset_card_data = dataset_card.data
        else:
            dataset_card = None
            dataset_card_data = DatasetCardData()
        if total_dataset_infos:
            total_dataset_infos.to_dataset_card_data(dataset_card_data)
        dataset_card = (
            DatasetCard("---\n" + str(dataset_card_data) + "\n---\n") if dataset_card is None else dataset_card
        )
        dataset_card.save(Path(dataset_readme_path))

    @classmethod
    def from_directory(cls, dataset_infos_dir) -> "DatasetInfosDict":
        """Load the infos stored in `dataset_infos_dir`.

        The `dataset_info` entry of the README YAML metadata is used when present;
        otherwise the legacy JSON file is read if it exists. An empty instance is
        returned when neither source is available.
        """
        logger.debug(f"Loading Dataset Infos from {dataset_infos_dir}")
        # Load the info from the YAML part of README.md
        if os.path.exists(os.path.join(dataset_infos_dir, config.REPOCARD_FILENAME)):
            dataset_card_data = DatasetCard.load(Path(dataset_infos_dir) / config.REPOCARD_FILENAME).data
            if "dataset_info" in dataset_card_data:
                return cls.from_dataset_card_data(dataset_card_data)
        if os.path.exists(os.path.join(dataset_infos_dir, config.DATASETDICT_INFOS_FILENAME)):
            # this is just to have backward compatibility with dataset_infos.json files
            with open(os.path.join(dataset_infos_dir, config.DATASETDICT_INFOS_FILENAME), encoding="utf-8") as f:
                return cls(
                    {
                        config_name: DatasetInfo.from_dict(dataset_info_dict)
                        for config_name, dataset_info_dict in json.load(f).items()
                    }
                )
        else:
            return cls()

    @classmethod
    def from_dataset_card_data(cls, dataset_card_data: DatasetCardData) -> "DatasetInfosDict":
        """Build the infos from the `dataset_info` entry of a dataset card.

        The entry may be a list (one item per config) or a single dict. A config without
        a `config_name` is registered under `"default"`. Any other value, including a
        missing entry, yields an empty instance.
        """
        if isinstance(dataset_card_data.get("dataset_info"), (list, dict)):
            if isinstance(dataset_card_data["dataset_info"], list):
                return cls(
                    {
                        dataset_info_yaml_dict.get("config_name", "default"): DatasetInfo._from_yaml_dict(
                            dataset_info_yaml_dict
                        )
                        for dataset_info_yaml_dict in dataset_card_data["dataset_info"]
                    }
                )
            else:
                dataset_info = DatasetInfo._from_yaml_dict(dataset_card_data["dataset_info"])
                dataset_info.config_name = dataset_card_data["dataset_info"].get("config_name", "default")
                return cls({dataset_info.config_name: dataset_info})
        else:
            return cls()

    def to_dataset_card_data(self, dataset_card_data: DatasetCardData) -> None:
        """Merge the infos of this dict into `dataset_card_data["dataset_info"]`, in place.

        Metadata already present in the card is kept, except for configs that also
        exist in this dict, which are rewritten. Nothing happens if this dict is empty.
        A single resulting config is stored as a struct (its `config_name` is omitted
        when it is `"default"`); several configs are stored as a list sorted by config
        name, each with `config_name` in first position.
        """
        if self:
            # first get existing metadata info
            if "dataset_info" in dataset_card_data and isinstance(dataset_card_data["dataset_info"], dict):
                dataset_metadata_infos = {
                    dataset_card_data["dataset_info"].get("config_name", "default"): dataset_card_data["dataset_info"]
                }
            elif "dataset_info" in dataset_card_data and isinstance(dataset_card_data["dataset_info"], list):
                # An entry without `config_name` is treated as "default", consistently
                # with the dict case above and with `from_dataset_card_data`.
                dataset_metadata_infos = {
                    config_metadata.get("config_name", "default"): config_metadata
                    for config_metadata in dataset_card_data["dataset_info"]
                }
            else:
                dataset_metadata_infos = {}
            # update/rewrite existing metadata info with the one to dump
            total_dataset_infos = {
                **dataset_metadata_infos,
                **{config_name: dset_info._to_yaml_dict() for config_name, dset_info in self.items()},
            }
            # the config_name from the dataset_infos_dict takes over the config_name of the DatasetInfo
            for config_name, dset_info_yaml_dict in total_dataset_infos.items():
                dset_info_yaml_dict["config_name"] = config_name
            if len(total_dataset_infos) == 1:
                # use a struct instead of a list of configurations, since there's only one
                dataset_card_data["dataset_info"] = next(iter(total_dataset_infos.values()))
                config_name = dataset_card_data["dataset_info"].pop("config_name", None)
                if config_name != "default":
                    # if config_name is not "default" preserve it and put at the first position
                    dataset_card_data["dataset_info"] = {
                        "config_name": config_name,
                        **dataset_card_data["dataset_info"],
                    }
            else:
                dataset_card_data["dataset_info"] = []
                for config_name, dataset_info_yaml_dict in sorted(total_dataset_infos.items()):
                    # add the config_name field in first position
                    dataset_info_yaml_dict.pop("config_name", None)
                    dataset_info_yaml_dict = {"config_name": config_name, **dataset_info_yaml_dict}
                    dataset_card_data["dataset_info"].append(dataset_info_yaml_dict)
datasets/inspect.py ADDED
@@ -0,0 +1,353 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2020 The HuggingFace Datasets Authors.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ # Lint as: python3
16
+ """List and inspect datasets."""
17
+
18
+ import os
19
+ from collections.abc import Mapping, Sequence
20
+ from typing import Optional, Union
21
+
22
+ from .download.download_config import DownloadConfig
23
+ from .download.download_manager import DownloadMode
24
+ from .download.streaming_download_manager import StreamingDownloadManager
25
+ from .info import DatasetInfo
26
+ from .load import (
27
+ dataset_module_factory,
28
+ get_dataset_builder_class,
29
+ load_dataset_builder,
30
+ )
31
+ from .utils.logging import get_logger
32
+ from .utils.version import Version
33
+
34
+
35
+ logger = get_logger(__name__)
36
+
37
+
38
class SplitsNotFoundError(ValueError):
    """Raised when the split names of a dataset config cannot be determined."""
40
+
41
+
42
def get_dataset_infos(
    path: str,
    data_files: Optional[Union[dict, list, str]] = None,
    download_config: Optional[DownloadConfig] = None,
    download_mode: Optional[Union[DownloadMode, str]] = None,
    revision: Optional[Union[str, Version]] = None,
    token: Optional[Union[bool, str]] = None,
    **config_kwargs,
):
    """Get the meta information about a dataset, returned as a dict mapping config name to [`DatasetInfo`].

    The config names are listed first, then the info of each config is fetched in turn.

    Args:
        path (`str`): path to the dataset repository. Can be either:

            - a local path to the dataset directory containing the data files,
                e.g. `'./dataset/squad'`
            - a dataset identifier on the Hugging Face Hub (list all available datasets and ids with [`huggingface_hub.list_datasets`]),
                e.g. `'rajpurkar/squad'`, `'nyu-mll/glue'` or `'openai/webtext'`
        revision (`Union[str, datasets.Version]`, *optional*):
            If specified, the dataset module will be loaded from the datasets repository at this version.
            By default:
            - it is set to the local version of the lib.
            - it will also try to load it from the main branch if it's not available at the local version of the lib.
            Specifying a version that is different from your local version of the lib might cause compatibility issues.
        download_config ([`DownloadConfig`], *optional*):
            Specific download configuration parameters.
        download_mode ([`DownloadMode`] or `str`, defaults to `REUSE_DATASET_IF_EXISTS`):
            Download/generate mode.
        data_files (`Union[Dict, List, str]`, *optional*):
            Defining the data_files of the dataset configuration.
        token (`str` or `bool`, *optional*):
            Optional string or boolean to use as Bearer token for remote files on the Datasets Hub.
            If `True`, or not specified, will get token from `"~/.huggingface"`.
        **config_kwargs (additional keyword arguments):
            Optional attributes for builder class which will override the attributes if supplied.
            They are only forwarded when fetching each config's info, not when listing config names.

    Example:

    ```py
    >>> from datasets import get_dataset_infos
    >>> get_dataset_infos('cornell-movie-review-data/rotten_tomatoes')
    {'default': DatasetInfo(description="Movie Review Dataset.\nThis is a dataset of containing 5,331 positive and 5,331 negative processed\nsentences from Rotten Tomatoes movie reviews...), ...}
    ```
    """
    # Arguments shared by the config listing and by every per-config lookup.
    shared_kwargs = {
        "path": path,
        "data_files": data_files,
        "download_config": download_config,
        "download_mode": download_mode,
        "revision": revision,
        "token": token,
    }
    infos = {}
    for config_name in get_dataset_config_names(**shared_kwargs):
        infos[config_name] = get_dataset_config_info(config_name=config_name, **shared_kwargs, **config_kwargs)
    return infos
107
+
108
+
109
def get_dataset_config_names(
    path: str,
    revision: Optional[Union[str, Version]] = None,
    download_config: Optional[DownloadConfig] = None,
    download_mode: Optional[Union[DownloadMode, str]] = None,
    data_files: Optional[Union[dict, list, str]] = None,
    **download_kwargs,
):
    """Get the list of available config names for a particular dataset.

    When the builder class declares no configs, a single-element list is returned
    instead: the `config_name` of the module's builder kwargs if set, else the
    builder's `DEFAULT_CONFIG_NAME`, else `"default"`.

    Args:
        path (`str`): path to the dataset repository. Can be either:

            - a local path to the dataset directory containing the data files,
                e.g. `'./dataset/squad'`
            - a dataset identifier on the Hugging Face Hub (list all available datasets and ids with [`huggingface_hub.list_datasets`]),
                e.g. `'rajpurkar/squad'`, `'nyu-mll/glue'` or `'openai/webtext'`
        revision (`Union[str, datasets.Version]`, *optional*):
            If specified, the dataset module will be loaded from the datasets repository at this version.
            By default:
            - it is set to the local version of the lib.
            - it will also try to load it from the main branch if it's not available at the local version of the lib.
            Specifying a version that is different from your local version of the lib might cause compatibility issues.
        download_config ([`DownloadConfig`], *optional*):
            Specific download configuration parameters.
        download_mode ([`DownloadMode`] or `str`, defaults to `REUSE_DATASET_IF_EXISTS`):
            Download/generate mode.
        data_files (`Union[Dict, List, str]`, *optional*):
            Defining the data_files of the dataset configuration.
        **download_kwargs (additional keyword arguments):
            Optional attributes for [`DownloadConfig`] which will override the attributes in `download_config` if supplied,
            for example `token`.

    Example:

    ```py
    >>> from datasets import get_dataset_config_names
    >>> get_dataset_config_names("nyu-mll/glue")
    ['cola',
     'sst2',
     'mrpc',
     'qqp',
     'stsb',
     'mnli',
     'mnli_mismatched',
     'mnli_matched',
     'qnli',
     'rte',
     'wnli',
     'ax']
    ```
    """
    dataset_module = dataset_module_factory(
        path,
        revision=revision,
        download_config=download_config,
        download_mode=download_mode,
        data_files=data_files,
        **download_kwargs,
    )
    builder_cls = get_dataset_builder_class(dataset_module, dataset_name=os.path.basename(path))
    declared_names = list(builder_cls.builder_configs.keys())
    if declared_names:
        return declared_names
    # No declared config: fall back to a single name.
    fallback_name = builder_cls.DEFAULT_CONFIG_NAME or "default"
    return [dataset_module.builder_kwargs.get("config_name", fallback_name)]
173
+
174
+
175
def get_dataset_default_config_name(
    path: str,
    revision: Optional[Union[str, Version]] = None,
    download_config: Optional[DownloadConfig] = None,
    download_mode: Optional[Union[DownloadMode, str]] = None,
    data_files: Optional[Union[dict, list, str]] = None,
    **download_kwargs,
) -> Optional[str]:
    """Get the default config name for a particular dataset.
    Can return None only if the dataset has multiple configurations and no default configuration.

    The builder's `DEFAULT_CONFIG_NAME` wins when it is set. Otherwise the only
    declared config is the default, and `"default"` is used when no config is declared.

    Args:
        path (`str`): path to the dataset repository. Can be either:

            - a local path to the dataset directory containing the data files,
                e.g. `'./dataset/squad'`
            - a dataset identifier on the Hugging Face Hub (list all available datasets and ids with [`huggingface_hub.list_datasets`]),
                e.g. `'rajpurkar/squad'`, `'nyu-mll/glue'` or `'openai/webtext'`
        revision (`Union[str, datasets.Version]`, *optional*):
            If specified, the dataset module will be loaded from the datasets repository at this version.
            By default:
            - it is set to the local version of the lib.
            - it will also try to load it from the main branch if it's not available at the local version of the lib.
            Specifying a version that is different from your local version of the lib might cause compatibility issues.
        download_config ([`DownloadConfig`], *optional*):
            Specific download configuration parameters.
        download_mode ([`DownloadMode`] or `str`, defaults to `REUSE_DATASET_IF_EXISTS`):
            Download/generate mode.
        data_files (`Union[Dict, List, str]`, *optional*):
            Defining the data_files of the dataset configuration.
        **download_kwargs (additional keyword arguments):
            Optional attributes for [`DownloadConfig`] which will override the attributes in `download_config` if supplied,
            for example `token`.

    Returns:
        Optional[str]: the default config name if there is one

    Example:

    ```py
    >>> from datasets import get_dataset_default_config_name
    >>> get_dataset_default_config_name("openbookqa")
    'main'
    ```
    """
    dataset_module = dataset_module_factory(
        path,
        revision=revision,
        download_config=download_config,
        download_mode=download_mode,
        data_files=data_files,
        **download_kwargs,
    )
    builder_cls = get_dataset_builder_class(dataset_module, dataset_name=os.path.basename(path))
    declared_names = list(builder_cls.builder_configs.keys())
    if not declared_names:
        inferred_name = "default"
    elif len(declared_names) == 1:
        inferred_name = declared_names[0]
    else:
        # Several configs and nothing to pick one of them.
        inferred_name = None
    return builder_cls.DEFAULT_CONFIG_NAME or inferred_name
235
+
236
+
237
def get_dataset_config_info(
    path: str,
    config_name: Optional[str] = None,
    data_files: Optional[Union[str, Sequence[str], Mapping[str, Union[str, Sequence[str]]]]] = None,
    download_config: Optional[DownloadConfig] = None,
    download_mode: Optional[Union[DownloadMode, str]] = None,
    revision: Optional[Union[str, Version]] = None,
    token: Optional[Union[bool, str]] = None,
    **config_kwargs,
) -> DatasetInfo:
    """Get the meta information ([`DatasetInfo`]) about a dataset for a particular config.

    If the builder's info has no splits, they are filled in from the builder's split
    generators, run with a [`StreamingDownloadManager`]; each split then only carries
    its `name` and `dataset_name`.

    Args:
        path (`str`): path to the dataset repository. Can be either:

            - a local path to the dataset directory containing the data files,
                e.g. `'./dataset/squad'`
            - a dataset identifier on the Hugging Face Hub (list all available datasets and ids with [`huggingface_hub.list_datasets`]),
                e.g. `'rajpurkar/squad'`, `'nyu-mll/glue'` or `'openai/webtext'`
        config_name (`str`, *optional*):
            Defining the name of the dataset configuration.
        data_files (`str` or `Sequence` or `Mapping`, *optional*):
            Path(s) to source data file(s).
        download_config ([`DownloadConfig`], *optional*):
            Specific download configuration parameters.
        download_mode ([`DownloadMode`] or `str`, defaults to `REUSE_DATASET_IF_EXISTS`):
            Download/generate mode.
        revision ([`Version`] or `str`, *optional*):
            Version of the dataset to load.
            As datasets have their own git repository on the Datasets Hub, the default version "main" corresponds to their "main" branch.
            You can specify a different version than the default "main" by using a commit SHA or a git tag of the dataset repository.
        token (`str` or `bool`, *optional*):
            Optional string or boolean to use as Bearer token for remote files on the Datasets Hub.
            If `True`, or not specified, will get token from `"~/.huggingface"`.
        **config_kwargs (additional keyword arguments):
            Optional attributes for builder class which will override the attributes if supplied.

    Raises:
        SplitsNotFoundError: If running the split generators fails for any reason.
    """
    builder = load_dataset_builder(
        path,
        name=config_name,
        data_files=data_files,
        download_config=download_config,
        download_mode=download_mode,
        revision=revision,
        token=token,
        **config_kwargs,
    )
    info = builder.info
    if info.splits is not None:
        return info

    # Work on a private download config so the caller's object is never modified.
    download_config = download_config.copy() if download_config else DownloadConfig()
    if token is not None:
        download_config.token = token

    def _new_streaming_manager():
        return StreamingDownloadManager(base_path=builder.base_path, download_config=download_config)

    builder._check_manual_download(_new_streaming_manager())
    try:
        discovered_splits = {}
        for split_generator in builder._split_generators(_new_streaming_manager()):
            split_name = split_generator.name
            discovered_splits[split_name] = {"name": split_name, "dataset_name": path}
        info.splits = discovered_splits
    except Exception as err:
        raise SplitsNotFoundError("The split names could not be parsed from the dataset config.") from err
    return info
296
+
297
+
298
def get_dataset_split_names(
    path: str,
    config_name: Optional[str] = None,
    data_files: Optional[Union[str, Sequence[str], Mapping[str, Union[str, Sequence[str]]]]] = None,
    download_config: Optional[DownloadConfig] = None,
    download_mode: Optional[Union[DownloadMode, str]] = None,
    revision: Optional[Union[str, Version]] = None,
    token: Optional[Union[bool, str]] = None,
    **config_kwargs,
):
    """Get the list of available splits for a particular config and dataset.

    All arguments are forwarded to [`get_dataset_config_info`]; the keys of the
    resulting `splits` are returned as a list.

    Args:
        path (`str`): path to the dataset repository. Can be either:

            - a local path to the dataset directory containing the data files,
                e.g. `'./dataset/squad'`
            - a dataset identifier on the Hugging Face Hub (list all available datasets and ids with [`huggingface_hub.list_datasets`]),
                e.g. `'rajpurkar/squad'`, `'nyu-mll/glue'` or `'openai/webtext'`
        config_name (`str`, *optional*):
            Defining the name of the dataset configuration.
        data_files (`str` or `Sequence` or `Mapping`, *optional*):
            Path(s) to source data file(s).
        download_config ([`DownloadConfig`], *optional*):
            Specific download configuration parameters.
        download_mode ([`DownloadMode`] or `str`, defaults to `REUSE_DATASET_IF_EXISTS`):
            Download/generate mode.
        revision ([`Version`] or `str`, *optional*):
            Version of the dataset to load.
            As datasets have their own git repository on the Datasets Hub, the default version "main" corresponds to their "main" branch.
            You can specify a different version than the default "main" by using a commit SHA or a git tag of the dataset repository.
        token (`str` or `bool`, *optional*):
            Optional string or boolean to use as Bearer token for remote files on the Datasets Hub.
            If `True`, or not specified, will get token from `"~/.huggingface"`.
        **config_kwargs (additional keyword arguments):
            Optional attributes for builder class which will override the attributes if supplied.

    Example:

    ```py
    >>> from datasets import get_dataset_split_names
    >>> get_dataset_split_names('cornell-movie-review-data/rotten_tomatoes')
    ['train', 'validation', 'test']
    ```
    """
    splits = get_dataset_config_info(
        path,
        config_name=config_name,
        data_files=data_files,
        download_config=download_config,
        download_mode=download_mode,
        revision=revision,
        token=token,
        **config_kwargs,
    ).splits
    return [split_name for split_name in splits.keys()]
datasets/iterable_dataset.py ADDED
The diff for this file is too large to render. See raw diff
 
datasets/keyhash.py ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2020 The HuggingFace Datasets Authors and the TensorFlow Datasets Authors.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ # Lint as: python3
16
+
17
+ """
18
+ Hashing function for dataset keys using `hashlib.md5`
19
+
20
+ Requirements for the hash function:
21
+
22
+ - Provides a uniformly distributed hash from random space
23
+ - Adequately fast speed
24
+ - Working with multiple input types (in this case, `str`, `int` or `bytes`)
25
+ - Should be platform independent (generates same hash on different OS and systems)
26
+
27
+ The hashing function provides a unique 128-bit integer hash of the key provided.
28
+
29
+ The split name is being used here as the hash salt to avoid having same hashes
30
+ in different splits due to same keys
31
+ """
32
+
33
+ from typing import Union
34
+
35
+ from huggingface_hub.utils import insecure_hashlib
36
+
37
+
38
+ def _as_bytes(hash_data: Union[str, int, bytes, bytearray]) -> bytes:
39
+ """
40
+ Returns the input hash_data in its bytes form
41
+
42
+ Args:
43
+ hash_data: the hash salt/key to be converted to bytes
44
+ """
45
+ if isinstance(hash_data, (bytes, bytearray)):
46
+ # Data already in bytes, returns as it as
47
+ return hash_data
48
+ elif isinstance(hash_data, str):
49
+ # We keep the data as it as for it ot be later encoded to UTF-8
50
+ # However replace `\\` with `/` for Windows compatibility
51
+ hash_data = hash_data.replace("\\", "/")
52
+ elif isinstance(hash_data, int):
53
+ hash_data = str(hash_data)
54
+ else:
55
+ # If data is not of the required type, raise error
56
+ raise InvalidKeyError(hash_data)
57
+
58
+ return hash_data.encode("utf-8")
59
+
60
+
61
class InvalidKeyError(Exception):
    """Error raised for a key whose type cannot be hashed.

    The message is assembled from three parts that stay available as the
    `prefix`, `err_msg` and `suffix` attributes.
    """

    def __init__(self, hash_data):
        self.prefix = "\nFAILURE TO GENERATE DATASET: Invalid key type detected"
        self.err_msg = f"\nFound Key {hash_data} of type {type(hash_data)}"
        self.suffix = "\nKeys should be either str, int or bytes type"
        message = "".join((self.prefix, self.err_msg, self.suffix))
        super().__init__(message)
69
+
70
+
71
class DuplicatedKeysError(Exception):
    """Raise an error when duplicate key found.

    Args:
        key: the key that was generated more than once.
        duplicate_key_indices: indices of the examples sharing `key`. Items may be
            of any type (e.g. `int` or `str`); they are converted with `str` for the
            message. At most the first 20 are listed.
        fix_msg (`str`, defaults to `""`): optional hint appended on its own line.
    """

    def __init__(self, key, duplicate_key_indices, fix_msg=""):
        self.key = key
        self.duplicate_key_indices = duplicate_key_indices
        self.fix_msg = fix_msg
        self.prefix = "Found multiple examples generated with the same key"
        # `str.join` only accepts strings, so stringify every index first:
        # integer indices must not turn this error into a TypeError.
        if len(duplicate_key_indices) <= 20:
            listed = ", ".join(map(str, duplicate_key_indices))
            self.err_msg = f"\nThe examples at index {listed} have the key {key}"
        else:
            listed = ", ".join(map(str, duplicate_key_indices[:20]))
            hidden_count = len(duplicate_key_indices) - 20
            self.err_msg = f"\nThe examples at index {listed}... ({hidden_count} more) have the key {key}"
        self.suffix = "\n" + fix_msg if fix_msg else ""
        super().__init__(f"{self.prefix}{self.err_msg}{self.suffix}")
85
+
86
+
87
class KeyHasher:
    """Salted md5 hasher for dataset keys.

    The md5 state is seeded once with the salt; every call to `hash` works on a copy
    of that state, so hashing one key never affects the hash of another.
    """

    def __init__(self, hash_salt: str):
        salt_bytes = _as_bytes(hash_salt)
        self._split_md5 = insecure_hashlib.md5(salt_bytes)

    def hash(self, key: Union[str, int, bytes]) -> int:
        """Returns 128-bits unique hash of input key

        Args:
          key: the input key to be hashed (should be str, int or bytes)

        Returns: 128-bit int hash key"""
        salted_md5 = self._split_md5.copy()
        salted_md5.update(_as_bytes(key))
        # The big-endian integer value of the digest equals the value of its hexadecimal form
        return int.from_bytes(salted_md5.digest(), "big")
datasets/load.py ADDED
@@ -0,0 +1,1481 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2020 The HuggingFace Datasets Authors and the TensorFlow Datasets Authors.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ # Lint as: python3
16
+ """Access datasets."""
17
+
18
+ import glob
19
+ import importlib
20
+ import inspect
21
+ import json
22
+ import os
23
+ import posixpath
24
+ from collections import Counter
25
+ from collections.abc import Mapping, Sequence
26
+ from dataclasses import dataclass, field
27
+ from pathlib import Path
28
+ from typing import Any, Optional, Union
29
+
30
+ import fsspec
31
+ import httpx
32
+ import requests
33
+ import yaml
34
+ from fsspec.core import url_to_fs
35
+ from huggingface_hub import DatasetCard, DatasetCardData, HfApi
36
+ from huggingface_hub.utils import (
37
+ EntryNotFoundError,
38
+ GatedRepoError,
39
+ LocalEntryNotFoundError,
40
+ OfflineModeIsEnabled,
41
+ RepositoryNotFoundError,
42
+ RevisionNotFoundError,
43
+ get_session,
44
+ )
45
+
46
+ from . import __version__, config
47
+ from .arrow_dataset import Dataset
48
+ from .builder import BuilderConfig, DatasetBuilder
49
+ from .data_files import (
50
+ DataFilesDict,
51
+ DataFilesList,
52
+ DataFilesPatternsDict,
53
+ EmptyDatasetError,
54
+ get_data_patterns,
55
+ sanitize_patterns,
56
+ )
57
+ from .dataset_dict import DatasetDict, IterableDatasetDict
58
+ from .download.download_config import DownloadConfig
59
+ from .download.download_manager import DownloadMode
60
+ from .download.streaming_download_manager import StreamingDownloadManager, xbasename, xglob, xjoin
61
+ from .exceptions import DataFilesNotFoundError, DatasetNotFoundError
62
+ from .features import Features
63
+ from .features.features import _fix_for_backward_compatible_features
64
+ from .fingerprint import Hasher
65
+ from .info import DatasetInfo, DatasetInfosDict
66
+ from .iterable_dataset import IterableDataset
67
+ from .naming import camelcase_to_snakecase, snakecase_to_camelcase
68
+ from .packaged_modules import (
69
+ _EXTENSION_TO_MODULE,
70
+ _MODULE_TO_EXTENSIONS,
71
+ _MODULE_TO_METADATA_FILE_NAMES,
72
+ _PACKAGED_DATASETS_MODULES,
73
+ )
74
+ from .packaged_modules.folder_based_builder.folder_based_builder import FolderBasedBuilder
75
+ from .splits import Split
76
+ from .utils import _dataset_viewer
77
+ from .utils.file_utils import (
78
+ _raise_if_offline_mode_is_enabled,
79
+ cached_path,
80
+ get_datasets_user_agent,
81
+ is_relative_path,
82
+ relative_to_absolute_path,
83
+ )
84
+ from .utils.hub import hf_dataset_url
85
+ from .utils.info_utils import VerificationMode, is_small_dataset
86
+ from .utils.logging import get_logger
87
+ from .utils.metadata import MetadataConfigs
88
+ from .utils.typing import PathLike
89
+ from .utils.version import Version
90
+
91
+
92
logger = get_logger(__name__)

# Every extension handled by a packaged module, plus ".zip" archives
ALL_ALLOWED_EXTENSIONS = [*_EXTENSION_TO_MODULE, ".zip"]
95
+
96
+
97
class _InitializeConfiguredDatasetBuilder:
    """
    Unpickling helper for dynamically configured builder classes.

    From https://stackoverflow.com/questions/4647566/pickle-a-dynamically-parameterized-sub-class
    See also ConfiguredDatasetBuilder.__reduce__

    Calling an instance rebuilds the configured class with `configure_builder_class`
    and returns an un-initialized object of that class. Pickle then restores the
    object's state through the usual __setstate__ mechanism.
    """

    def __call__(self, builder_cls, metadata_configs, default_config_name, name):
        configured_cls = configure_builder_class(
            builder_cls, metadata_configs, default_config_name=default_config_name, dataset_name=name
        )
        # start from a simple object which has no complex __init__ (this class will do),
        # then swap its class for the configured builder class
        placeholder = _InitializeConfiguredDatasetBuilder()
        placeholder.__class__ = configured_cls
        return placeholder
113
+
114
+
115
def configure_builder_class(
    builder_cls: type[DatasetBuilder],
    builder_configs: list[BuilderConfig],
    default_config_name: Optional[str],
    dataset_name: str,
) -> type[DatasetBuilder]:
    """
    Dynamically create a subclass of `builder_cls` carrying custom builder configs
    (e.g. parsed from a README.md file).

    The subclass sets the `BUILDER_CONFIGS` and `DEFAULT_CONFIG_NAME` class variables,
    keeps the parent's `__module__`, and is named
    `<Capitalized lowercase parent name><CamelCase dataset_name>`.
    """

    class ConfiguredDatasetBuilder(builder_cls):
        BUILDER_CONFIGS = builder_configs
        DEFAULT_CONFIG_NAME = default_config_name

        __module__ = builder_cls.__module__  # so that the actual packaged builder can be imported

        def __reduce__(self):
            # makes the dynamically created class picklable, see _InitializeConfiguredDatasetBuilder
            parent_builder_cls = self.__class__.__mro__[1]
            rebuild_args = (
                parent_builder_cls,
                self.BUILDER_CONFIGS,
                self.DEFAULT_CONFIG_NAME,
                self.dataset_name,
            )
            return (_InitializeConfiguredDatasetBuilder(), rebuild_args, self.__dict__.copy())

    configured_name = f"{builder_cls.__name__.lower().capitalize()}{snakecase_to_camelcase(dataset_name)}"
    ConfiguredDatasetBuilder.__name__ = configured_name
    ConfiguredDatasetBuilder.__qualname__ = configured_name

    return ConfiguredDatasetBuilder
153
+
154
+
155
def import_main_class(module_path) -> Optional[type[DatasetBuilder]]:
    """Import the module at `module_path` and return its main class: a DatasetBuilder.

    Abstract builders are ignored. The first concrete builder defined in the module
    itself wins; otherwise the last concrete builder found in the module namespace
    is returned, or `None` when there is none.
    """
    module = importlib.import_module(module_path)
    main_cls = None
    for candidate in vars(module).values():
        if not (inspect.isclass(candidate) and issubclass(candidate, DatasetBuilder)):
            continue
        if inspect.isabstract(candidate):
            continue
        main_cls = candidate
        defining_module = inspect.getmodule(candidate)
        # stop at the first builder that is defined in this module rather than imported into it
        if defining_module is not None and module == defining_module:
            break

    return main_cls
170
+
171
+
172
def get_dataset_builder_class(
    dataset_module: "DatasetModule", dataset_name: Optional[str] = None
) -> type[DatasetBuilder]:
    """Return the builder class for `dataset_module`.

    When the module carries builder configs, the imported builder class is wrapped with
    `configure_builder_class`; the dataset name is then taken from `dataset_name` or,
    failing that, from `dataset_module.builder_kwargs["dataset_name"]`.

    Raises:
        ValueError: if builder configs are present and no dataset name can be determined.
    """
    builder_cls = import_main_class(dataset_module.module_path)
    configs_parameters = dataset_module.builder_configs_parameters
    if not configs_parameters.builder_configs:
        return builder_cls

    dataset_name = dataset_name or dataset_module.builder_kwargs.get("dataset_name")
    if dataset_name is None:
        raise ValueError("dataset_name should be specified but got None")
    return configure_builder_class(
        builder_cls,
        builder_configs=configs_parameters.builder_configs,
        default_config_name=configs_parameters.default_config_name,
        dataset_name=dataset_name,
    )
187
+
188
+
189
def increase_load_count(name: str):
    """Update the download count of a dataset.

    Sends a HEAD request (3 second timeout) to the dataset script URL under
    `config.S3_DATASETS_BUCKET_PREFIX`. Nothing is sent when `config.HF_HUB_OFFLINE`
    is set or `config.HF_UPDATE_DOWNLOAD_COUNTS` is unset. This is best-effort:
    any exception is ignored.
    """
    if config.HF_HUB_OFFLINE or not config.HF_UPDATE_DOWNLOAD_COUNTS:
        return
    try:
        ping_url = "/".join((config.S3_DATASETS_BUCKET_PREFIX, name, name + ".py"))
        request_headers = {"User-Agent": get_datasets_user_agent()}
        get_session().head(ping_url, headers=request_headers, timeout=3)
    except Exception:
        # counting downloads must never break dataset loading
        pass
200
+
201
+
202
def infer_module_for_data_files_list(
    data_files_list: DataFilesList, download_config: Optional[DownloadConfig] = None
) -> tuple[Optional[str], dict]:
    """Infer module (and builder kwargs) from list of data files.

    Every dot-separated suffix of each file name is counted, looking only at the first
    `config.DATA_FILES_MAX_NUMBER_FOR_MODULE_INFERENCE` files. Extensions are then tried
    from best to worst: non-metadata files first, then highest count, then ".parquet",
    ".jsonl", ".json", ".csv", then reverse alphabetical order. The first extension with
    a known module wins; a ".zip" extension delegates to the archive-based inference.

    Args:
        data_files_list (DataFilesList): List of data files.
        download_config (`DownloadConfig`, *optional*): Mainly use `token` or `storage_options` to support different platforms and auth types.

    Returns:
        tuple[str, dict[str, Any]]: Tuple with
            - inferred module name
            - dict of builder kwargs
        `(None, {})` when no extension matches.
    """
    sampled_files = data_files_list[: config.DATA_FILES_MAX_NUMBER_FOR_MODULE_INFERENCE]
    extensions_counter = Counter()
    for filepath in sampled_files:
        basename = xbasename(filepath)
        is_metadata_file = basename in FolderBasedBuilder.METADATA_FILENAMES
        for suffix in basename.split(".")[1:]:
            extensions_counter["." + suffix.lower(), is_metadata_file] += 1
    if not extensions_counter:
        return None, {}

    def ranking(ext_count: tuple[tuple[str, bool], int]) -> tuple:
        """Rank by count, prefer ".parquet" in case of a draw, and push metadata files last"""
        (ext, is_metadata), count = ext_count
        return (not is_metadata, count, ext == ".parquet", ext == ".jsonl", ext == ".json", ext == ".csv", ext)

    ranked = sorted(extensions_counter.items(), key=ranking, reverse=True)
    for (ext, _is_metadata), _count in ranked:
        if ext in _EXTENSION_TO_MODULE:
            return _EXTENSION_TO_MODULE[ext]
        if ext == ".zip":
            return infer_module_for_data_files_list_in_archives(data_files_list, download_config=download_config)
    return None, {}
237
+
238
+
239
def infer_module_for_data_files_list_in_archives(
    data_files_list: DataFilesList, download_config: Optional[DownloadConfig] = None
) -> tuple[Optional[str], dict]:
    """Infer module (and builder kwargs) from list of archive data files.

    Only ".zip" files are inspected, up to
    `config.GLOBBED_DATA_FILES_MAX_NUMBER_FOR_MODULE_INFERENCE` of them, and at most
    `config.ARCHIVED_DATA_FILES_MAX_NUMBER_FOR_MODULE_INFERENCE` globbed entries are kept
    per archive. The module of the most common extension among those entries is returned.

    Args:
        data_files_list (DataFilesList): List of data files.
        download_config (`DownloadConfig`, *optional*): Mainly use `token` or `storage_options` to support different platforms and auth types.

    Returns:
        tuple[str, dict[str, Any]]: Tuple with
            - inferred module name
            - dict of builder kwargs
        `(None, {})` when the most common extension has no known module.
    """
    archived_files = []
    zip_count = 0
    for filepath in data_files_list:
        if not str(filepath).endswith(".zip"):
            continue
        zip_count += 1
        if zip_count > config.GLOBBED_DATA_FILES_MAX_NUMBER_FOR_MODULE_INFERENCE:
            break
        glob_pattern = xjoin(StreamingDownloadManager().extract(filepath), "**")
        members = xglob(glob_pattern, recursive=True, download_config=download_config)
        members = members[: config.ARCHIVED_DATA_FILES_MAX_NUMBER_FOR_MODULE_INFERENCE]
        # keep only the part before the first "::" of each globbed path
        archived_files.extend(member.split("::")[0] for member in members)

    extensions_counter = Counter()
    for archived_file in archived_files:
        for suffix in xbasename(archived_file).split(".")[1:]:
            extensions_counter["." + suffix.lower()] += 1
    if not extensions_counter:
        return None, {}

    most_common = extensions_counter.most_common(1)[0][0]
    if most_common in _EXTENSION_TO_MODULE:
        return _EXTENSION_TO_MODULE[most_common]
    return None, {}
275
+
276
+
277
def infer_module_for_data_files(
    data_files: DataFilesDict, path: Optional[str] = None, download_config: Optional[DownloadConfig] = None
) -> tuple[Optional[str], dict[str, Any]]:
    """Infer module (and builder kwargs) from data files. Raise if module names for different splits don't match.

    Args:
        data_files ([`DataFilesDict`]): Dict of list of data files.
        path (str, *optional*): Dataset name or path.
        download_config ([`DownloadConfig`], *optional*):
            Specific download configuration parameters to authenticate on the Hugging Face Hub for private remote files.

    Returns:
        tuple[str, dict[str, Any]]: Tuple with
            - inferred module name
            - builder kwargs

    Raises:
        ValueError: if the splits don't all resolve to the same module and builder kwargs.
        DataFilesNotFoundError: if `data_files` has no split, or no module could be inferred.
    """
    not_found_message = "No (supported) data files found" + (f" in {path}" if path else "")
    split_modules = {
        split: infer_module_for_data_files_list(data_files_list, download_config=download_config)
        for split, data_files_list in data_files.items()
    }
    if not split_modules:
        # Without any split, `next(iter(...))` below would leak a bare StopIteration to the caller.
        raise DataFilesNotFoundError(not_found_message)
    module_name, default_builder_kwargs = next(iter(split_modules.values()))
    if any((module_name, default_builder_kwargs) != split_module for split_module in split_modules.values()):
        raise ValueError(f"Couldn't infer the same data file format for all splits. Got {split_modules}")
    if not module_name:
        raise DataFilesNotFoundError(not_found_message)
    return module_name, default_builder_kwargs
303
+
304
+
305
def create_builder_configs_from_metadata_configs(
    module_path: str,
    metadata_configs: MetadataConfigs,
    base_path: Optional[str] = None,
    default_builder_kwargs: Optional[dict[str, Any]] = None,
    download_config: Optional[DownloadConfig] = None,
) -> tuple[list[BuilderConfig], str]:
    """Create one builder config per entry of `metadata_configs`.

    Args:
        module_path (`str`): Import path of the builder module; its main class provides `BUILDER_CONFIG_CLASS`.
        metadata_configs (`MetadataConfigs`): Mapping of config name to config parameters.
        base_path (`str`, *optional*): Root used to resolve data files. `None` is treated as `""`.
        default_builder_kwargs (`dict[str, Any]`, *optional*): Default config parameters; the parameters of each
            metadata config take precedence over them.
        download_config ([`DownloadConfig`], *optional*): Passed on when data patterns must be discovered.

    Returns:
        tuple: Tuple with
            - list of builder configs, in the iteration order of `metadata_configs`
            - default config name as returned by `metadata_configs.get_default_config_name()`

    Raises:
        EmptyDatasetError: if no data files match the patterns of one of the configs.
    """
    builder_cls = import_main_class(module_path)
    builder_config_cls = builder_cls.BUILDER_CONFIG_CLASS
    default_config_name = metadata_configs.get_default_config_name()
    builder_configs = []
    default_builder_kwargs = {} if default_builder_kwargs is None else default_builder_kwargs

    base_path = base_path if base_path is not None else ""
    for config_name, config_params in metadata_configs.items():
        config_data_files = config_params.get("data_files")
        config_data_dir = config_params.get("data_dir")
        config_base_path = xjoin(base_path, config_data_dir) if config_data_dir else base_path
        try:
            # explicit `data_files` win; otherwise patterns are discovered under the config base path
            config_patterns = (
                sanitize_patterns(config_data_files)
                if config_data_files is not None
                else get_data_patterns(config_base_path, download_config=download_config)
            )
            config_data_files_dict = DataFilesPatternsDict.from_patterns(
                config_patterns,
                allowed_extensions=ALL_ALLOWED_EXTENSIONS,
            )
        except EmptyDatasetError as e:
            raise EmptyDatasetError(
                f"Dataset at '{base_path}' doesn't contain data files matching the patterns for config '{config_name}',"
                f" check `data_files` and `data_dir` parameters in the `configs` YAML field in README.md. "
            ) from e
        # parameters the config class has no attribute for are dropped, with a warning
        ignored_params = [
            param for param in config_params if not hasattr(builder_config_cls, param) and param != "default"
        ]
        if ignored_params:
            logger.warning(
                f"Some datasets params were ignored: {ignored_params}. "
                "Make sure to use only valid params for the dataset builder and to have "
                "an up-to-date version of the `datasets` library."
            )
        builder_configs.append(
            builder_config_cls(
                name=config_name,
                data_files=config_data_files_dict,
                data_dir=config_data_dir,
                **{
                    param: value
                    for param, value in {**default_builder_kwargs, **config_params}.items()
                    if hasattr(builder_config_cls, param) and param not in ("default", "data_files", "data_dir")
                },
            )
        )
    return builder_configs, default_config_name
360
+
361
+
362
@dataclass
class BuilderConfigsParameters:
    """Bundle of the objects used to create builder configurations from yaml metadata.

    All fields default to `None`.

    Attributes:
        metadata_configs (`MetadataConfigs`, *optional*):
            Configs parsed from yaml's metadata.
        builder_configs (`list[BuilderConfig]`, *optional*):
            BuilderConfig objects created from `metadata_configs`.
        default_config_name (`str`, *optional*):
            Name of the default config taken from yaml's metadata.
    """

    metadata_configs: Optional[MetadataConfigs] = None
    builder_configs: Optional[list[BuilderConfig]] = None
    default_config_name: Optional[str] = None
378
+
379
+
380
@dataclass
class DatasetModule:
    """Description of a resolved dataset module, as produced by the module factories.

    Attributes:
        module_path (`str`): Import path of the builder module.
        hash (`str`): Hash identifying the module.
        builder_kwargs (`dict`): Keyword arguments associated with the builder.
        builder_configs_parameters (`BuilderConfigsParameters`): Builder configs data; a fresh
            empty instance by default.
        dataset_infos (`DatasetInfosDict`, *optional*): Dataset infos, if any.
    """

    module_path: str
    hash: str
    builder_kwargs: dict
    builder_configs_parameters: BuilderConfigsParameters = field(default_factory=BuilderConfigsParameters)
    dataset_infos: Optional[DatasetInfosDict] = None
387
+
388
+
389
class _DatasetModuleFactory:
    """Base class of the dataset module factories: subclasses implement `get_module`."""

    def get_module(self) -> DatasetModule:
        """Build and return the `DatasetModule`; must be overridden by subclasses."""
        raise NotImplementedError
392
+
393
+
394
class LocalDatasetModuleFactory(_DatasetModuleFactory):
    """Get the module of a dataset loaded from the user's data files. The dataset builder module to use is inferred
    from the data files extensions.

    Args:
        path (`str`): Path of the local dataset directory.
        data_dir (`str`, *optional*): Sub-directory containing the data; must be relative to `path`.
        data_files (`str`, `list` or `dict`, *optional*): Explicit data file patterns.
        download_mode (`DownloadMode` or `str`, *optional*): Stored on the instance; not read by `get_module`.

    Raises:
        ValueError: if `data_dir` is an absolute path.
    """

    def __init__(
        self,
        path: str,
        data_dir: Optional[str] = None,
        data_files: Optional[Union[str, list, dict]] = None,
        download_mode: Optional[Union[DownloadMode, str]] = None,
    ):
        if data_dir and os.path.isabs(data_dir):
            raise ValueError(f"`data_dir` must be relative to a dataset directory's root: {path}")

        self.path = Path(path).as_posix()
        self.name = Path(path).stem
        self.data_files = data_files
        self.data_dir = data_dir
        self.download_mode = download_mode

    def get_module(self) -> DatasetModule:
        """Resolve data files and metadata of the local dataset into a `DatasetModule`.

        The module hash is computed from the dataset infos and the builder configs.
        """
        readme_path = os.path.join(self.path, config.REPOCARD_FILENAME)
        standalone_yaml_path = os.path.join(self.path, config.REPOYAML_FILENAME)
        # card metadata comes from the README when present, else an empty card
        dataset_card_data = DatasetCard.load(readme_path).data if os.path.isfile(readme_path) else DatasetCardData()
        if os.path.exists(standalone_yaml_path):
            with open(standalone_yaml_path, encoding="utf-8") as f:
                standalone_yaml_data = yaml.safe_load(f.read())
                if standalone_yaml_data:
                    # standalone yaml values override the ones from the README card
                    _dataset_card_data_dict = dataset_card_data.to_dict()
                    _dataset_card_data_dict.update(standalone_yaml_data)
                    dataset_card_data = DatasetCardData(**_dataset_card_data_dict)
        metadata_configs = MetadataConfigs.from_dataset_card_data(dataset_card_data)
        dataset_infos = DatasetInfosDict.from_dataset_card_data(dataset_card_data)
        # we need a set of data files to find which dataset builder to use
        # because we need to infer module name by files extensions
        base_path = Path(self.path, self.data_dir or "").expanduser().resolve().as_posix()
        # pattern priority: explicit `data_files`, then the first metadata config's `data_files`
        # (only when no `data_dir` was given), then patterns discovered under `base_path`
        if self.data_files is not None:
            patterns = sanitize_patterns(self.data_files)
        elif metadata_configs and not self.data_dir and "data_files" in next(iter(metadata_configs.values())):
            patterns = sanitize_patterns(next(iter(metadata_configs.values()))["data_files"])
        else:
            patterns = get_data_patterns(base_path)
        data_files = DataFilesDict.from_patterns(
            patterns,
            base_path=base_path,
            allowed_extensions=ALL_ALLOWED_EXTENSIONS,
        )
        module_name, default_builder_kwargs = infer_module_for_data_files(
            data_files=data_files,
            path=self.path,
        )
        # keep only the files (and metadata file names) registered for the inferred module
        data_files = data_files.filter(
            extensions=_MODULE_TO_EXTENSIONS[module_name], file_names=_MODULE_TO_METADATA_FILE_NAMES[module_name]
        )
        module_path, _ = _PACKAGED_DATASETS_MODULES[module_name]
        if metadata_configs:
            builder_configs, default_config_name = create_builder_configs_from_metadata_configs(
                module_path,
                metadata_configs,
                base_path=base_path,
                default_builder_kwargs=default_builder_kwargs,
            )
        else:
            # no metadata configs: a single config built from the resolved data files
            builder_configs: list[BuilderConfig] = [
                import_main_class(module_path).BUILDER_CONFIG_CLASS(
                    data_files=data_files,
                    **default_builder_kwargs,
                )
            ]
            default_config_name = None
        builder_kwargs = {
            "base_path": self.path,
            "dataset_name": camelcase_to_snakecase(Path(self.path).name),
        }
        if self.data_dir:
            builder_kwargs["data_files"] = data_files
        # this file is deprecated and was created automatically in old versions of push_to_hub
        if os.path.isfile(os.path.join(self.path, config.DATASETDICT_INFOS_FILENAME)):
            with open(os.path.join(self.path, config.DATASETDICT_INFOS_FILENAME), encoding="utf-8") as f:
                legacy_dataset_infos = DatasetInfosDict(
                    {
                        config_name: DatasetInfo.from_dict(dataset_info_dict)
                        for config_name, dataset_info_dict in json.load(f).items()
                    }
                )
                if len(legacy_dataset_infos) == 1:
                    # old config e.g. named "username--dataset_name"
                    legacy_config_name = next(iter(legacy_dataset_infos))
                    legacy_dataset_infos["default"] = legacy_dataset_infos.pop(legacy_config_name)
            # infos from the card take precedence over the legacy ones
            legacy_dataset_infos.update(dataset_infos)
            dataset_infos = legacy_dataset_infos
        # a single dataset info entry names the default config when none was set
        if default_config_name is None and len(dataset_infos) == 1:
            default_config_name = next(iter(dataset_infos))

        hash = Hasher.hash({"dataset_infos": dataset_infos, "builder_configs": builder_configs})
        return DatasetModule(
            module_path,
            hash,
            builder_kwargs,
            dataset_infos=dataset_infos,
            builder_configs_parameters=BuilderConfigsParameters(
                metadata_configs=metadata_configs,
                builder_configs=builder_configs,
                default_config_name=default_config_name,
            ),
        )
500
+
501
+
502
class PackagedDatasetModuleFactory(_DatasetModuleFactory):
    """Get the dataset builder module from the ones that are packaged with the library: csv, json, etc.

    Creating the factory also calls `increase_load_count` with the module name.
    """

    def __init__(
        self,
        name: str,
        data_dir: Optional[str] = None,
        data_files: Optional[Union[str, list, dict]] = None,
        download_config: Optional[DownloadConfig] = None,
        download_mode: Optional[Union[DownloadMode, str]] = None,
    ):
        self.name = name
        self.data_files = data_files
        self.data_dir = data_dir
        self.download_config = download_config
        self.download_mode = download_mode
        increase_load_count(name)

    def get_module(self) -> DatasetModule:
        """Resolve the data files and return the packaged module registered under `self.name`."""
        base_path = Path(self.data_dir or "").expanduser().resolve().as_posix()
        # explicit data files win; otherwise patterns are discovered under the base path
        if self.data_files is None:
            patterns = get_data_patterns(base_path, download_config=self.download_config)
        else:
            patterns = sanitize_patterns(self.data_files)
        resolved_data_files = DataFilesDict.from_patterns(
            patterns,
            download_config=self.download_config,
            base_path=base_path,
        )

        module_path, module_hash = _PACKAGED_DATASETS_MODULES[self.name]

        return DatasetModule(
            module_path,
            module_hash,
            {"data_files": resolved_data_files, "dataset_name": self.name},
        )
541
+
542
+
543
class HubDatasetModuleFactory(_DatasetModuleFactory):
    """
    Get the module of a dataset loaded from data files of a dataset repository.
    The dataset builder module to use is inferred from the data files extensions.

    Args:
        name (`str`): Repository id of the dataset.
        commit_hash (`str`): Revision used for every file access; also used as the module hash.
        data_dir (`str`, *optional*): Sub-directory of the repository containing the data.
        data_files (`str`, `list` or `dict`, *optional*): Explicit data file patterns.
        download_config ([`DownloadConfig`], *optional*): Defaults to a new `DownloadConfig()`.
        download_mode (`DownloadMode` or `str`, *optional*): Stored on the instance; not read by `get_module`.
        use_exported_dataset_infos (`bool`, defaults to `False`): Whether to also fetch the exported
            dataset infos (only done when `config.USE_PARQUET_EXPORT` is set).
    """

    def __init__(
        self,
        name: str,
        commit_hash: str,
        data_dir: Optional[str] = None,
        data_files: Optional[Union[str, list, dict]] = None,
        download_config: Optional[DownloadConfig] = None,
        download_mode: Optional[Union[DownloadMode, str]] = None,
        use_exported_dataset_infos: bool = False,
    ):
        self.name = name
        self.commit_hash = commit_hash
        self.data_files = data_files
        self.data_dir = data_dir
        self.download_config = download_config or DownloadConfig()
        self.download_mode = download_mode
        self.use_exported_dataset_infos = use_exported_dataset_infos
        increase_load_count(name)

    def get_module(self) -> DatasetModule:
        """Resolve data files and metadata of the repository into a `DatasetModule`."""
        # Get the Dataset Card and fix the revision in case there are new commits in the meantime
        api = HfApi(
            endpoint=config.HF_ENDPOINT,
            token=self.download_config.token,
            library_name="datasets",
            library_version=__version__,
            user_agent=get_datasets_user_agent(self.download_config.user_agent),
        )
        try:
            dataset_readme_path = api.hf_hub_download(
                repo_id=self.name,
                filename=config.REPOCARD_FILENAME,
                repo_type="dataset",
                revision=self.commit_hash,
                proxies=self.download_config.proxies,
            )
            dataset_card_data = DatasetCard.load(dataset_readme_path).data
        except EntryNotFoundError:
            # no README in the repository: start from an empty card
            dataset_card_data = DatasetCardData()
        download_config = self.download_config.copy()
        if download_config.download_desc is None:
            download_config.download_desc = "Downloading standalone yaml"
        try:
            standalone_yaml_path = cached_path(
                hf_dataset_url(self.name, config.REPOYAML_FILENAME, revision=self.commit_hash),
                download_config=download_config,
            )
            with open(standalone_yaml_path, encoding="utf-8") as f:
                standalone_yaml_data = yaml.safe_load(f.read())
                if standalone_yaml_data:
                    # standalone yaml values override the ones from the README card
                    _dataset_card_data_dict = dataset_card_data.to_dict()
                    _dataset_card_data_dict.update(standalone_yaml_data)
                    dataset_card_data = DatasetCardData(**_dataset_card_data_dict)
        except FileNotFoundError:
            # the standalone yaml file is optional
            pass
        base_path = f"hf://datasets/{self.name}@{self.commit_hash}/{self.data_dir or ''}".rstrip("/")
        metadata_configs = MetadataConfigs.from_dataset_card_data(dataset_card_data)
        dataset_infos = DatasetInfosDict.from_dataset_card_data(dataset_card_data)
        if config.USE_PARQUET_EXPORT and self.use_exported_dataset_infos:
            try:
                exported_dataset_infos = _dataset_viewer.get_exported_dataset_infos(
                    dataset=self.name, commit_hash=self.commit_hash, token=self.download_config.token
                )
                exported_dataset_infos = DatasetInfosDict(
                    {
                        config_name: DatasetInfo.from_dict(exported_dataset_infos[config_name])
                        for config_name in exported_dataset_infos
                    }
                )
            except _dataset_viewer.DatasetViewerError:
                exported_dataset_infos = None
        else:
            exported_dataset_infos = None
        if exported_dataset_infos:
            # infos from the card take precedence over the exported ones
            exported_dataset_infos.update(dataset_infos)
            dataset_infos = exported_dataset_infos
        # we need a set of data files to find which dataset builder to use
        # because we need to infer module name by files extensions
        # pattern priority: explicit `data_files`, then the first metadata config's `data_files`
        # (only when no `data_dir` was given), then patterns discovered under `base_path`
        if self.data_files is not None:
            patterns = sanitize_patterns(self.data_files)
        elif metadata_configs and not self.data_dir and "data_files" in next(iter(metadata_configs.values())):
            patterns = sanitize_patterns(next(iter(metadata_configs.values()))["data_files"])
        else:
            patterns = get_data_patterns(base_path, download_config=self.download_config)
        data_files = DataFilesDict.from_patterns(
            patterns,
            base_path=base_path,
            allowed_extensions=ALL_ALLOWED_EXTENSIONS,
            download_config=self.download_config,
        )
        module_name, default_builder_kwargs = infer_module_for_data_files(
            data_files=data_files,
            path=self.name,
            download_config=self.download_config,
        )
        # keep only the files (and metadata file names) registered for the inferred module
        data_files = data_files.filter(
            extensions=_MODULE_TO_EXTENSIONS[module_name], file_names=_MODULE_TO_METADATA_FILE_NAMES[module_name]
        )
        module_path, _ = _PACKAGED_DATASETS_MODULES[module_name]
        if metadata_configs:
            builder_configs, default_config_name = create_builder_configs_from_metadata_configs(
                module_path,
                metadata_configs,
                base_path=base_path,
                default_builder_kwargs=default_builder_kwargs,
                download_config=self.download_config,
            )
        else:
            # no metadata configs: a single config built from the resolved data files
            builder_configs: list[BuilderConfig] = [
                import_main_class(module_path).BUILDER_CONFIG_CLASS(
                    data_files=data_files,
                    **default_builder_kwargs,
                )
            ]
            default_config_name = None
        builder_kwargs = {
            "base_path": hf_dataset_url(self.name, "", revision=self.commit_hash).rstrip("/"),
            "repo_id": self.name,
            "dataset_name": camelcase_to_snakecase(Path(self.name).name),
        }
        if self.data_dir:
            builder_kwargs["data_files"] = data_files
        download_config = self.download_config.copy()
        if download_config.download_desc is None:
            download_config.download_desc = "Downloading metadata"
        try:
            # this file is deprecated and was created automatically in old versions of push_to_hub
            dataset_infos_path = cached_path(
                hf_dataset_url(self.name, config.DATASETDICT_INFOS_FILENAME, revision=self.commit_hash),
                download_config=download_config,
            )
            with open(dataset_infos_path, encoding="utf-8") as f:
                legacy_dataset_infos = DatasetInfosDict(
                    {
                        config_name: DatasetInfo.from_dict(dataset_info_dict)
                        for config_name, dataset_info_dict in json.load(f).items()
                    }
                )
                if len(legacy_dataset_infos) == 1:
                    # old config e.g. named "username--dataset_name"
                    legacy_config_name = next(iter(legacy_dataset_infos))
                    legacy_dataset_infos["default"] = legacy_dataset_infos.pop(legacy_config_name)
            # infos gathered so far take precedence over the legacy ones
            legacy_dataset_infos.update(dataset_infos)
            dataset_infos = legacy_dataset_infos
        except FileNotFoundError:
            # the legacy infos file is optional
            pass
        # a single dataset info entry names the default config when none was set
        if default_config_name is None and len(dataset_infos) == 1:
            default_config_name = next(iter(dataset_infos))

        return DatasetModule(
            module_path,
            self.commit_hash,
            builder_kwargs,
            dataset_infos=dataset_infos,
            builder_configs_parameters=BuilderConfigsParameters(
                metadata_configs=metadata_configs,
                builder_configs=builder_configs,
                default_config_name=default_config_name,
            ),
        )
709
+
710
+
711
class HubDatasetModuleFactoryWithParquetExport(_DatasetModuleFactory):
    """
    Build the dataset module of a Hub dataset repository from its parquet export.

    The data files and dataset infos are obtained through the `_dataset_viewer` helpers,
    and the resulting module always uses the packaged "parquet" builder.
    """

    def __init__(
        self,
        name: str,
        commit_hash: str,
        download_config: Optional[DownloadConfig] = None,
    ):
        self.name = name
        self.commit_hash = commit_hash
        # Fall back to a default configuration when none (or a falsy one) is given.
        self.download_config = download_config or DownloadConfig()
        increase_load_count(name)

    def get_module(self) -> DatasetModule:
        """Return the `DatasetModule` describing the exported parquet files of `self.name`."""
        token = self.download_config.token

        exported_parquet_files = _dataset_viewer.get_exported_parquet_files(
            dataset=self.name, commit_hash=self.commit_hash, token=token
        )
        exported_dataset_infos = _dataset_viewer.get_exported_dataset_infos(
            dataset=self.name, commit_hash=self.commit_hash, token=token
        )
        dataset_infos = DatasetInfosDict(
            {
                config_name: DatasetInfo.from_dict(exported_dataset_infos[config_name])
                for config_name in exported_dataset_infos
            }
        )

        # Fix the revision of the parquet branch in case there are new commits in the meantime.
        api = HfApi(
            endpoint=config.HF_ENDPOINT,
            token=token,
            library_name="datasets",
            library_version=__version__,
            user_agent=get_datasets_user_agent(self.download_config.user_agent),
        )
        parquet_branch_info = api.dataset_info(
            self.name,
            revision="refs/convert/parquet",
            token=token,
            timeout=100.0,
        )
        parquet_commit_hash = parquet_branch_info.sha

        metadata_configs = MetadataConfigs._from_exported_parquet_files_and_dataset_infos(
            parquet_commit_hash=parquet_commit_hash,
            exported_parquet_files=exported_parquet_files,
            dataset_infos=dataset_infos,
        )

        # Only the module path of the packaged parquet builder is needed here.
        module_path, _unused_hash = _PACKAGED_DATASETS_MODULES["parquet"]
        builder_configs, default_config_name = create_builder_configs_from_metadata_configs(
            module_path,
            metadata_configs,
            download_config=self.download_config,
        )

        configs_parameters = BuilderConfigsParameters(
            metadata_configs=metadata_configs,
            builder_configs=builder_configs,
            default_config_name=default_config_name,
        )
        builder_kwargs = {
            "repo_id": self.name,
            "dataset_name": camelcase_to_snakecase(Path(self.name).name),
        }
        return DatasetModule(
            module_path,
            self.commit_hash,
            builder_kwargs,
            dataset_infos=dataset_infos,
            builder_configs_parameters=configs_parameters,
        )
783
+
784
+
785
class CachedDatasetModuleFactory(_DatasetModuleFactory):
    """
    Get the module of a dataset that has been loaded once already and cached.

    Args:
        name (`str`): Dataset name, either `"dataset_name"` or `"namespace/dataset_name"`
            (at most one `/`).
        cache_dir (`str`, *optional*): Cache directory to look into.
            Defaults to `config.HF_DATASETS_CACHE` when not set.
    """

    def __init__(
        self,
        name: str,
        cache_dir: Optional[str] = None,
    ):
        self.name = name
        self.cache_dir = cache_dir
        # Names with more than one "/" cannot be mapped to a cache directory by `get_module`.
        assert self.name.count("/") <= 1

    def get_module(self) -> DatasetModule:
        """Return a `DatasetModule` backed by the cache builder if a cached copy exists.

        Returns:
            `DatasetModule`: module pointing to `datasets.packaged_modules.cache.cache`
            with `hash` and `version` both set to `"auto"`.

        Raises:
            FileNotFoundError: if no cached directory is found for the dataset.
        """
        cache_dir = os.path.expanduser(str(self.cache_dir or config.HF_DATASETS_CACHE))
        # The cache directory name is "<namespace>___<snake_case_dataset_name>".
        namespace_and_dataset_name = self.name.split("/")
        namespace_and_dataset_name[-1] = camelcase_to_snakecase(namespace_and_dataset_name[-1])
        cached_relative_path = "___".join(namespace_and_dataset_name)
        cached_datasets_directory_path_root = os.path.join(cache_dir, cached_relative_path)
        # Cached datasets are looked up exactly three directory levels below the root.
        cached_directory_paths = [
            cached_directory_path
            for cached_directory_path in glob.glob(os.path.join(cached_datasets_directory_path_root, "*", "*", "*"))
            if os.path.isdir(cached_directory_path)
        ]
        if not cached_directory_paths:
            # Report the directory that was actually searched: `self.cache_dir` is None
            # whenever the default cache location is used, which made the message unhelpful.
            raise FileNotFoundError(f"Dataset {self.name} is not cached in {cache_dir}")

        builder_kwargs = {
            "repo_id": self.name,
            "dataset_name": self.name.split("/")[-1],
        }
        warning_msg = f"Using the latest cached version of the dataset since {self.name} couldn't be found on the Hugging Face Hub"
        if config.HF_HUB_OFFLINE:
            warning_msg += " (offline mode is enabled)."
        logger.warning(warning_msg)
        return DatasetModule(
            "datasets.packaged_modules.cache.cache",
            "auto",
            {**builder_kwargs, "version": "auto"},
        )
825
+
826
+
827
def dataset_module_factory(
    path: str,
    revision: Optional[Union[str, Version]] = None,
    download_config: Optional[DownloadConfig] = None,
    download_mode: Optional[Union[DownloadMode, str]] = None,
    data_dir: Optional[str] = None,
    data_files: Optional[Union[dict, list, str, DataFilesDict]] = None,
    cache_dir: Optional[str] = None,
    **download_kwargs,
) -> DatasetModule:
    """
    Resolve `path` to a dataset module (packaged builder, local directory, Hub repository or cache).

    Args:

        path (str): Path or name of the dataset.
            Depending on ``path``, the dataset builder that is used comes from one of the generic dataset builders (JSON, CSV, Parquet, text etc.).

            For local datasets:

            - if ``path`` is a local directory (containing data files only)
              -> load a generic dataset builder (csv, json, text etc.) based on the content of the directory
              e.g. ``'./path/to/directory/with/my/csv/data'``.

            For datasets on the Hugging Face Hub (list all available datasets with ``huggingface_hub.list_datasets()``)

            - if ``path`` is a dataset repository on the HF hub (containing data files only)
              -> load a generic dataset builder (csv, text etc.) based on the content of the repository
              e.g. ``'username/dataset_name'``, a dataset repository on the HF hub containing your data files.

        revision (:class:`~utils.Version` or :obj:`str`, optional): Version of the dataset to load.
            As datasets have their own git repository on the Datasets Hub, the default version "main" corresponds to their "main" branch.
            You can specify a different version than the default "main" by using a commit SHA or a git tag of the dataset repository.
        download_config (:class:`DownloadConfig`, optional): Specific download configuration parameters.
            Note that the object passed here is modified in place (`extract_compressed_file`, `force_extract`
            and `force_download` are overwritten).
        download_mode (:class:`DownloadMode` or :obj:`str`, default ``REUSE_DATASET_IF_EXISTS``): Download/generate mode.
        data_dir (:obj:`str`, optional): Directory with the data files. Used only if `data_files` is not specified,
            in which case it's equal to pass `os.path.join(data_dir, "**")` as `data_files`.
        data_files (:obj:`Union[Dict, List, str]`, optional): Defining the data_files of the dataset configuration.
        cache_dir (`str`, *optional*):
            Directory to read/write data. Defaults to `"~/.cache/huggingface/datasets"`.

            <Added version="2.16.0"/>

        **download_kwargs (additional keyword arguments): attributes used to build a new DownloadConfig().
            They are only used when `download_config` is `None`; they are ignored otherwise.

    Returns:
        DatasetModule

    Raises:
        RuntimeError: if `path` points to a dataset script (``.py`` file), locally or on the Hub.
        ConnectionError: if the Hub cannot be reached and the dataset is not in the cache.
        DatasetNotFoundError: if the repository or revision doesn't exist, or the dataset is gated,
            and the dataset is not in the cache.
        FileNotFoundError: if no data file can be found for `path`.
    """
    if download_config is None:
        download_config = DownloadConfig(**download_kwargs)
    download_mode = DownloadMode(download_mode or DownloadMode.REUSE_DATASET_IF_EXISTS)
    download_config.extract_compressed_file = True
    download_config.force_extract = True
    download_config.force_download = download_mode == DownloadMode.FORCE_REDOWNLOAD

    # Name of the dataset script that would correspond to `path`: its last non-empty component, with a ".py" suffix
    filename = list(filter(lambda x: x, path.replace(os.sep, "/").split("/")))[-1]
    if not filename.endswith(".py"):
        filename = filename + ".py"
    combined_path = os.path.join(path, filename)

    # We have several ways to get a dataset builder:
    #
    # - if path is the name of a packaged dataset module
    #   -> use the packaged module (json, csv, etc.)
    #
    # - if path is a python file, or a local directory containing a python file named after it
    #   -> raise an error: dataset scripts are no longer supported
    # - if path is a local directory (but no python file)
    #   -> use a packaged module (csv, text etc.) based on content of the directory
    #
    # - if path has at most one "/" and is dataset repository on the HF hub with a python file
    #   -> raise an error: dataset scripts are no longer supported
    # - if path has at most one "/" and is dataset repository on the HF hub without a python file
    #   -> use a packaged module (csv, text etc.) based on content of the repository

    # Try packaged
    if path in _PACKAGED_DATASETS_MODULES:
        return PackagedDatasetModuleFactory(
            path,
            data_dir=data_dir,
            data_files=data_files,
            download_config=download_config,
            download_mode=download_mode,
        ).get_module()
    # Try locally
    elif path.endswith(filename):
        raise RuntimeError(f"Dataset scripts are no longer supported, but found {filename}")
    elif os.path.isfile(combined_path):
        raise RuntimeError(f"Dataset scripts are no longer supported, but found {filename}")
    elif os.path.isdir(path):
        return LocalDatasetModuleFactory(
            path, data_dir=data_dir, data_files=data_files, download_mode=download_mode
        ).get_module()
    # Try remotely
    elif is_relative_path(path) and path.count("/") <= 1:
        try:
            # Get the Dataset Card + get the revision + check authentication all in one call
            # We fix the commit_hash in case there are new commits in the meantime
            api = HfApi(
                endpoint=config.HF_ENDPOINT,
                token=download_config.token,
                library_name="datasets",
                library_version=__version__,
                user_agent=get_datasets_user_agent(download_config.user_agent),
            )
            try:
                _raise_if_offline_mode_is_enabled()
                dataset_readme_path = api.hf_hub_download(
                    repo_id=path,
                    filename=config.REPOCARD_FILENAME,
                    repo_type="dataset",
                    revision=revision,
                    proxies=download_config.proxies,
                )
                # The commit hash is taken from the name of the directory containing the downloaded file
                commit_hash = os.path.basename(os.path.dirname(dataset_readme_path))
            except LocalEntryNotFoundError as e:
                # Only translate to ConnectionError when the underlying cause is a connectivity problem
                if isinstance(
                    e.__cause__,
                    (
                        OfflineModeIsEnabled,
                        requests.exceptions.Timeout,
                        requests.exceptions.ConnectionError,
                        httpx.ConnectError,
                        httpx.TimeoutException,
                    ),
                ):
                    raise ConnectionError(f"Couldn't reach '{path}' on the Hub ({e.__class__.__name__})") from e
                else:
                    raise
            except EntryNotFoundError:
                # The repository has no dataset card: get the commit hash from the repository info instead
                commit_hash = api.dataset_info(
                    path,
                    revision=revision,
                    timeout=100.0,
                ).sha
            except (
                OfflineModeIsEnabled,
                requests.exceptions.Timeout,
                requests.exceptions.ConnectionError,
                httpx.ConnectError,
                httpx.TimeoutException,
            ) as e:
                raise ConnectionError(f"Couldn't reach '{path}' on the Hub ({e.__class__.__name__})") from e
            except GatedRepoError as e:
                message = f"Dataset '{path}' is a gated dataset on the Hub."
                if e.response.status_code == 401:
                    message += " You must be authenticated to access it."
                elif e.response.status_code == 403:
                    message += f" Visit the dataset page at https://huggingface.co/datasets/{path} to ask for access."
                raise DatasetNotFoundError(message) from e
            except RevisionNotFoundError as e:
                raise DatasetNotFoundError(
                    f"Revision '{revision}' doesn't exist for dataset '{path}' on the Hub."
                ) from e
            except RepositoryNotFoundError as e:
                raise DatasetNotFoundError(f"Dataset '{path}' doesn't exist on the Hub or cannot be accessed.") from e
            try:
                # If a dataset script exists in the repository the download succeeds and we raise;
                # the expected path is the EntryNotFoundError branch below
                api.hf_hub_download(
                    repo_id=path,
                    filename=filename,
                    repo_type="dataset",
                    revision=commit_hash,
                    proxies=download_config.proxies,
                )
                raise RuntimeError(f"Dataset scripts are no longer supported, but found {filename}")
            except EntryNotFoundError:
                # Use the infos from the parquet export except in some cases:
                if data_dir or data_files or (revision and revision != "main"):
                    use_exported_dataset_infos = False
                else:
                    use_exported_dataset_infos = True
                return HubDatasetModuleFactory(
                    path,
                    commit_hash=commit_hash,
                    data_dir=data_dir,
                    data_files=data_files,
                    download_config=download_config,
                    download_mode=download_mode,
                    use_exported_dataset_infos=use_exported_dataset_infos,
                ).get_module()
            except GatedRepoError as e:
                message = f"Dataset '{path}' is a gated dataset on the Hub."
                if e.response.status_code == 401:
                    message += " You must be authenticated to access it."
                elif e.response.status_code == 403:
                    message += f" Visit the dataset page at https://huggingface.co/datasets/{path} to ask for access."
                raise DatasetNotFoundError(message) from e
            except RevisionNotFoundError as e:
                raise DatasetNotFoundError(
                    f"Revision '{revision}' doesn't exist for dataset '{path}' on the Hub."
                ) from e
        except Exception as e1:
            # All the attempts failed, before raising the error we should check if the module is already cached
            try:
                return CachedDatasetModuleFactory(path, cache_dir=cache_dir).get_module()
            except Exception:
                # If it's not in the cache, then it doesn't exist.
                # `from None` hides the cache lookup failure so that only the original error is reported.
                if isinstance(e1, OfflineModeIsEnabled):
                    raise ConnectionError(f"Couldn't reach the Hugging Face Hub for dataset '{path}': {e1}") from None
                if isinstance(e1, (DataFilesNotFoundError, DatasetNotFoundError, EmptyDatasetError)):
                    raise e1 from None
                if isinstance(e1, FileNotFoundError):
                    raise FileNotFoundError(
                        f"Couldn't find any data file at {relative_to_absolute_path(path)}. "
                        f"Couldn't find '{path}' on the Hugging Face Hub either: {type(e1).__name__}: {e1}"
                    ) from None
                raise e1 from None
    else:
        raise FileNotFoundError(f"Couldn't find any data file at {relative_to_absolute_path(path)}.")
1039
+
1040
+
1041
def load_dataset_builder(
    path: str,
    name: Optional[str] = None,
    data_dir: Optional[str] = None,
    data_files: Optional[Union[str, Sequence[str], Mapping[str, Union[str, Sequence[str]]]]] = None,
    cache_dir: Optional[str] = None,
    features: Optional[Features] = None,
    download_config: Optional[DownloadConfig] = None,
    download_mode: Optional[Union[DownloadMode, str]] = None,
    revision: Optional[Union[str, Version]] = None,
    token: Optional[Union[bool, str]] = None,
    storage_options: Optional[dict] = None,
    **config_kwargs,
) -> DatasetBuilder:
    """Load a dataset builder which can be used to:

    - Inspect general information that is required to build a dataset (cache directory, config, dataset info, features, data files, etc.)
    - Download and prepare the dataset as Arrow files in the cache
    - Get a streaming dataset without downloading or caching anything

    You can find the list of datasets on the [Hub](https://huggingface.co/datasets) or with [`huggingface_hub.list_datasets`].

    A dataset is a directory that contains some data files in generic formats (JSON, CSV, Parquet, etc.) and possibly
    in a generic structure (Webdataset, ImageFolder, AudioFolder, VideoFolder, etc.)

    Args:

        path (`str`):
            Path or name of the dataset.

            - if `path` is a dataset repository on the HF hub (list all available datasets with [`huggingface_hub.list_datasets`])
              -> load the dataset builder from supported files in the repository (csv, json, parquet, etc.)
              e.g. `'username/dataset_name'`, a dataset repository on the HF hub containing the data files.

            - if `path` is a local directory
              -> load the dataset builder from supported files in the directory (csv, json, parquet, etc.)
              e.g. `'./path/to/directory/with/my/csv/data'`.

            - if `path` is the name of a dataset builder and `data_files` or `data_dir` is specified
              (available builders are "json", "csv", "parquet", "arrow", "text", "xml", "webdataset", "imagefolder", "audiofolder", "videofolder")
              -> load the dataset builder from the files in `data_files` or `data_dir`
              e.g. `'parquet'`.

        name (`str`, *optional*):
            Name of the dataset configuration. When omitted, the default configuration of the dataset module is used.
        data_dir (`str`, *optional*):
            The `data_dir` of the dataset configuration. If specified for the generic builders (csv, text etc.) or the Hub datasets and `data_files` is `None`,
            the behavior is equal to passing `os.path.join(data_dir, **)` as `data_files` to reference all the files in a directory.
        data_files (`str` or `Sequence` or `Mapping`, *optional*):
            Path(s) to source data file(s).
        cache_dir (`str`, *optional*):
            Directory to read/write data. Defaults to `"~/.cache/huggingface/datasets"`.
        features ([`Features`], *optional*):
            Set the features type to use for this dataset.
        download_config ([`DownloadConfig`], *optional*):
            Specific download configuration parameters.
        download_mode ([`DownloadMode`] or `str`, defaults to `REUSE_DATASET_IF_EXISTS`):
            Download/generate mode.
        revision ([`Version`] or `str`, *optional*):
            Version of the dataset to load.
            As datasets have their own git repository on the Datasets Hub, the default version "main" corresponds to their "main" branch.
            You can specify a different version than the default "main" by using a commit SHA or a git tag of the dataset repository.
        token (`str` or `bool`, *optional*):
            Optional string or boolean to use as Bearer token for remote files on the Datasets Hub.
            If `True`, or not specified, will get token from `"~/.huggingface"`.
        storage_options (`dict`, *optional*, defaults to `None`):
            **Experimental**. Key/value pairs to be passed on to the dataset file-system backend, if any.

            <Added version="2.11.0"/>

        **config_kwargs (additional keyword arguments):
            Keyword arguments to be passed to the [`BuilderConfig`]
            and used in the [`DatasetBuilder`].

    Returns:
        [`DatasetBuilder`]

    Raises:
        ValueError: if `path` is a packaged builder name and no data files could be determined.

    Example:

    ```py
    >>> from datasets import load_dataset_builder
    >>> ds_builder = load_dataset_builder('cornell-movie-review-data/rotten_tomatoes')
    >>> ds_builder.info.features
    {'label': ClassLabel(names=['neg', 'pos']),
     'text': Value('string')}
    ```
    """
    download_mode = DownloadMode(download_mode or DownloadMode.REUSE_DATASET_IF_EXISTS)

    # Work on a copy (or a fresh config) before storing the token / storage options in it.
    if token is not None:
        if download_config:
            download_config = download_config.copy()
        else:
            download_config = DownloadConfig()
        download_config.token = token
    if storage_options is not None:
        if download_config:
            download_config = download_config.copy()
        else:
            download_config = DownloadConfig()
        download_config.storage_options.update(storage_options)
    if features is not None:
        features = _fix_for_backward_compatible_features(features)

    dataset_module = dataset_module_factory(
        path,
        revision=revision,
        download_config=download_config,
        download_mode=download_mode,
        data_dir=data_dir,
        data_files=data_files,
        cache_dir=cache_dir,
    )

    # Values provided by the dataset module take precedence over the function arguments.
    builder_kwargs = dataset_module.builder_kwargs
    configs_parameters = dataset_module.builder_configs_parameters
    data_dir = builder_kwargs.pop("data_dir", data_dir)
    data_files = builder_kwargs.pop("data_files", data_files)
    config_name = builder_kwargs.pop("config_name", name or configs_parameters.default_config_name)
    dataset_name = builder_kwargs.pop("dataset_name", None)
    if dataset_module.dataset_infos:
        info = dataset_module.dataset_infos.get(config_name)
    else:
        info = None

    # A packaged builder (e.g. "csv") is useless without any data files to read.
    is_packaged_builder = path in _PACKAGED_DATASETS_MODULES
    if is_packaged_builder and data_files is None and configs_parameters.builder_configs[0].data_files is None:
        error_msg = f"Please specify the data files or data directory to load for the {path} dataset builder."
        example_extensions = [extension for extension, module in _EXTENSION_TO_MODULE.items() if module == path]
        if example_extensions:
            error_msg += f'\nFor example `data_files={{"train": "path/to/data/train/*.{example_extensions[0]}"}}`'
        raise ValueError(error_msg)

    # Get the dataset builder class and instantiate it
    builder_cls = get_dataset_builder_class(dataset_module, dataset_name=dataset_name)
    builder_instance: DatasetBuilder = builder_cls(
        cache_dir=cache_dir,
        dataset_name=dataset_name,
        config_name=config_name,
        data_dir=data_dir,
        data_files=data_files,
        hash=dataset_module.hash,
        info=info,
        features=features,
        token=token,
        storage_options=storage_options,
        **builder_kwargs,
        **config_kwargs,
    )
    builder_instance._use_legacy_cache_dir_if_possible(dataset_module)
    return builder_instance
1188
+
1189
+
1190
def load_dataset(
    path: str,
    name: Optional[str] = None,
    data_dir: Optional[str] = None,
    data_files: Optional[Union[str, Sequence[str], Mapping[str, Union[str, Sequence[str]]]]] = None,
    split: Optional[Union[str, Split, list[str], list[Split]]] = None,
    cache_dir: Optional[str] = None,
    features: Optional[Features] = None,
    download_config: Optional[DownloadConfig] = None,
    download_mode: Optional[Union[DownloadMode, str]] = None,
    verification_mode: Optional[Union[VerificationMode, str]] = None,
    keep_in_memory: Optional[bool] = None,
    save_infos: bool = False,
    revision: Optional[Union[str, Version]] = None,
    token: Optional[Union[bool, str]] = None,
    streaming: bool = False,
    num_proc: Optional[int] = None,
    storage_options: Optional[dict] = None,
    **config_kwargs,
) -> Union[DatasetDict, Dataset, IterableDatasetDict, IterableDataset]:
    """Load a dataset from the Hugging Face Hub, or a local dataset.

    You can find the list of datasets on the [Hub](https://huggingface.co/datasets) or with [`huggingface_hub.list_datasets`].

    A dataset is a directory that contains some data files in generic formats (JSON, CSV, Parquet, etc.) and possibly
    in a generic structure (Webdataset, ImageFolder, AudioFolder, VideoFolder, etc.)

    This function does the following under the hood:

        1. Load a dataset builder:

            * Find the most common data format in the dataset and pick its associated builder (JSON, CSV, Parquet, Webdataset, ImageFolder, AudioFolder, etc.)
            * Find which file goes into which split (e.g. train/test) based on file and directory names or on the YAML configuration
            * It is also possible to specify `data_files` manually, and which dataset builder to use (e.g. "parquet").

        2. Run the dataset builder:

            In the general case:

            * Download the data files from the dataset if they are not already available locally or cached.
            * Process and cache the dataset in typed Arrow tables for caching.

                Arrow table are arbitrarily long, typed tables which can store nested objects and be mapped to numpy/pandas/python generic types.
                They can be directly accessed from disk, loaded in RAM or even streamed over the web.

            In the streaming case:

            * Don't download or cache anything. Instead, the dataset is lazily loaded and will be streamed on-the-fly when iterating on it.

        3. Return a dataset built from the requested splits in `split` (default: all).

    Args:

        path (`str`):
            Path or name of the dataset.

            - if `path` is a dataset repository on the HF hub (list all available datasets with [`huggingface_hub.list_datasets`])
              -> load the dataset from supported files in the repository (csv, json, parquet, etc.)
              e.g. `'username/dataset_name'`, a dataset repository on the HF hub containing the data files.

            - if `path` is a local directory
              -> load the dataset from supported files in the directory (csv, json, parquet, etc.)
              e.g. `'./path/to/directory/with/my/csv/data'`.

            - if `path` is the name of a dataset builder and `data_files` or `data_dir` is specified
              (available builders are "json", "csv", "parquet", "arrow", "text", "xml", "webdataset", "imagefolder", "audiofolder", "videofolder")
              -> load the dataset from the files in `data_files` or `data_dir`
              e.g. `'parquet'`.

        name (`str`, *optional*):
            Defining the name of the dataset configuration.
        data_dir (`str`, *optional*):
            Defining the `data_dir` of the dataset configuration. If specified for the generic builders (csv, text etc.) or the Hub datasets and `data_files` is `None`,
            the behavior is equal to passing `os.path.join(data_dir, **)` as `data_files` to reference all the files in a directory.
        data_files (`str` or `Sequence` or `Mapping`, *optional*):
            Path(s) to source data file(s). Must be either non-empty or `None`.
        split (`Split` or `str`):
            Which split of the data to load.
            If `None`, will return a `dict` with all splits (typically `datasets.Split.TRAIN` and `datasets.Split.TEST`).
            If given, will return a single Dataset.
            Splits can be combined and specified like in tensorflow-datasets.
        cache_dir (`str`, *optional*):
            Directory to read/write data. Defaults to `"~/.cache/huggingface/datasets"`.
        features (`Features`, *optional*):
            Set the features type to use for this dataset.
        download_config ([`DownloadConfig`], *optional*):
            Specific download configuration parameters.
        download_mode ([`DownloadMode`] or `str`, defaults to `REUSE_DATASET_IF_EXISTS`):
            Download/generate mode.
        verification_mode ([`VerificationMode`] or `str`, defaults to `BASIC_CHECKS`):
            Verification mode determining the checks to run on the downloaded/processed dataset information (checksums/size/splits/...).

            <Added version="2.9.1"/>
        keep_in_memory (`bool`, defaults to `None`):
            Whether to copy the dataset in-memory. If `None`, the dataset
            will not be copied in-memory unless explicitly enabled by setting `datasets.config.IN_MEMORY_MAX_SIZE` to
            nonzero. See more details in the [improve performance](../cache#improve-performance) section.
        save_infos (`bool`, defaults to `False`):
            If `True`, `verification_mode` is ignored and `ALL_CHECKS` is used instead.
        revision ([`Version`] or `str`, *optional*):
            Version of the dataset to load.
            As datasets have their own git repository on the Datasets Hub, the default version "main" corresponds to their "main" branch.
            You can specify a different version than the default "main" by using a commit SHA or a git tag of the dataset repository.
        token (`str` or `bool`, *optional*):
            Optional string or boolean to use as Bearer token for remote files on the Datasets Hub.
            If `True`, or not specified, will get token from `"~/.huggingface"`.
        streaming (`bool`, defaults to `False`):
            If set to `True`, don't download the data files. Instead, it streams the data progressively while
            iterating on the dataset. An [`IterableDataset`] or [`IterableDatasetDict`] is returned instead in this case.

            Note that streaming works for datasets that use data formats that support being iterated over like txt, csv, jsonl for example.
            Json files may be downloaded completely. Also streaming from remote zip or gzip files is supported but other compressed formats
            like rar and xz are not yet supported. The tgz format doesn't allow streaming.
        num_proc (`int`, *optional*, defaults to `None`):
            Number of processes when downloading and generating the dataset locally.
            Multiprocessing is disabled by default. Cannot be combined with `streaming=True`.

            <Added version="2.7.0"/>
        storage_options (`dict`, *optional*, defaults to `None`):
            **Experimental**. Key/value pairs to be passed on to the dataset file-system backend, if any.

            <Added version="2.11.0"/>
        **config_kwargs (additional keyword arguments):
            Keyword arguments to be passed to the `BuilderConfig`
            and used in the [`DatasetBuilder`].
            A `trust_remote_code` entry is removed before forwarding; an error is logged if it is truthy.

    Returns:
        [`Dataset`] or [`DatasetDict`]:
        - if `split` is not `None`: the dataset requested,
        - if `split` is `None`, a [`~datasets.DatasetDict`] with each split.

        or [`IterableDataset`] or [`IterableDatasetDict`]: if `streaming=True`

        - if `split` is not `None`, the dataset is requested
        - if `split` is `None`, a [`~datasets.streaming.IterableDatasetDict`] with each split.

    Raises:
        ValueError: if `data_files` is empty but not `None`, or if `path` contains a dataset saved with `save_to_disk`.
        NotImplementedError: if both `streaming=True` and `num_proc` are given.

    Example:

    Load a dataset from the Hugging Face Hub:

    ```py
    >>> from datasets import load_dataset
    >>> ds = load_dataset('cornell-movie-review-data/rotten_tomatoes', split='train')

    # Load a subset or dataset configuration (here 'sst2')
    >>> from datasets import load_dataset
    >>> ds = load_dataset('nyu-mll/glue', 'sst2', split='train')

    # Manual mapping of data files to splits
    >>> data_files = {'train': 'train.csv', 'test': 'test.csv'}
    >>> ds = load_dataset('namespace/your_dataset_name', data_files=data_files)

    # Manual selection of a directory to load
    >>> ds = load_dataset('namespace/your_dataset_name', data_dir='folder_name')
    ```

    Load a local dataset:

    ```py
    # Load a CSV file
    >>> from datasets import load_dataset
    >>> ds = load_dataset('csv', data_files='path/to/local/my_dataset.csv')

    # Load a JSON file
    >>> from datasets import load_dataset
    >>> ds = load_dataset('json', data_files='path/to/local/my_dataset.json')
    ```

    Load an [`~datasets.IterableDataset`]:

    ```py
    >>> from datasets import load_dataset
    >>> ds = load_dataset('cornell-movie-review-data/rotten_tomatoes', split='train', streaming=True)
    ```

    Load an image dataset with the `ImageFolder` dataset builder:

    ```py
    >>> from datasets import load_dataset
    >>> ds = load_dataset('imagefolder', data_dir='/path/to/images', split='train')
    ```
    """
    # `trust_remote_code` is always dropped from the config kwargs; only a truthy value is reported.
    if config_kwargs.pop("trust_remote_code", False):
        logger.error(
            "`trust_remote_code` is not supported anymore.\n"
            f"Please check that the Hugging Face dataset '{path}' isn't based on a loading script and remove `trust_remote_code`.\n"
            "If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet."
        )

    # Validate the arguments before doing any work
    if data_files is not None and not data_files:
        raise ValueError(f"Empty 'data_files': '{data_files}'. It should be either non-empty or None (default).")
    if Path(path, config.DATASET_STATE_JSON_FILENAME).exists():
        raise ValueError(
            "You are trying to load a dataset that was saved using `save_to_disk`. "
            "Please use `load_from_disk` instead."
        )
    if streaming and num_proc is not None:
        raise NotImplementedError(
            "Loading a streaming dataset in parallel with `num_proc` is not implemented. "
            "To parallelize streaming, you can wrap the dataset with a PyTorch DataLoader using `num_workers` > 1 instead."
        )

    download_mode = DownloadMode(download_mode or DownloadMode.REUSE_DATASET_IF_EXISTS)
    if save_infos:
        requested_verification_mode = VerificationMode.ALL_CHECKS
    else:
        requested_verification_mode = verification_mode or VerificationMode.BASIC_CHECKS
    verification_mode = VerificationMode(requested_verification_mode)

    # Create a dataset builder
    builder_instance = load_dataset_builder(
        path=path,
        name=name,
        data_dir=data_dir,
        data_files=data_files,
        cache_dir=cache_dir,
        features=features,
        download_config=download_config,
        download_mode=download_mode,
        revision=revision,
        token=token,
        storage_options=storage_options,
        **config_kwargs,
    )

    # Streaming: nothing is downloaded or prepared, return the iterable dataset right away
    if streaming:
        return builder_instance.as_streaming_dataset(split=split)

    # Download and prepare data
    builder_instance.download_and_prepare(
        download_config=download_config,
        download_mode=download_mode,
        verification_mode=verification_mode,
        num_proc=num_proc,
        storage_options=storage_options,
    )

    # Build dataset for splits; without an explicit choice, the dataset size decides on in-memory loading
    if keep_in_memory is None:
        keep_in_memory = is_small_dataset(builder_instance.info.dataset_size)
    return builder_instance.as_dataset(split=split, verification_mode=verification_mode, in_memory=keep_in_memory)
1432
+
1433
+
1434
def load_from_disk(
    dataset_path: PathLike, keep_in_memory: Optional[bool] = None, storage_options: Optional[dict] = None
) -> Union[Dataset, DatasetDict]:
    """
    Load a dataset previously saved with [`~Dataset.save_to_disk`], either from a local dataset directory
    or from any filesystem implementing `fsspec.spec.AbstractFileSystem`.

    Args:
        dataset_path (`path-like`):
            Path (e.g. `"dataset/train"`) or remote URI (e.g. `"s3://my-bucket/dataset/train"`)
            of the [`Dataset`] or [`DatasetDict`] directory where the dataset/dataset-dict will be
            loaded from.
        keep_in_memory (`bool`, defaults to `None`):
            Whether to copy the dataset in-memory. If `None`, the dataset
            will not be copied in-memory unless explicitly enabled by setting `datasets.config.IN_MEMORY_MAX_SIZE` to
            nonzero. See more details in the [improve performance](../cache#improve-performance) section.

        storage_options (`dict`, *optional*):
            Key/value pairs to be passed on to the file-system backend, if any.

            <Added version="2.9.0"/>

    Returns:
        [`Dataset`] or [`DatasetDict`]:
        - If `dataset_path` is a path of a dataset directory: the dataset requested.
        - If `dataset_path` is a path of a dataset dict directory, a [`DatasetDict`] with each split.

    Raises:
        FileNotFoundError: if `dataset_path` doesn't exist, or is neither a `Dataset` nor a `DatasetDict` directory.

    Example:

    ```py
    >>> from datasets import load_from_disk
    >>> ds = load_from_disk('path/to/dataset/directory')
    ```
    """
    fs: fsspec.AbstractFileSystem = url_to_fs(dataset_path, **(storage_options or {}))[0]
    if not fs.exists(dataset_path):
        raise FileNotFoundError(f"Directory {dataset_path} not found")

    dataset_info_file = posixpath.join(dataset_path, config.DATASET_INFO_FILENAME)
    dataset_state_file = posixpath.join(dataset_path, config.DATASET_STATE_JSON_FILENAME)
    dataset_dict_file = posixpath.join(dataset_path, config.DATASETDICT_JSON_FILENAME)

    # A `Dataset` directory holds both an info file and a state file.
    if fs.isfile(dataset_info_file) and fs.isfile(dataset_state_file):
        return Dataset.load_from_disk(dataset_path, keep_in_memory=keep_in_memory, storage_options=storage_options)
    # A `DatasetDict` directory is identified by its dataset dict JSON file.
    if fs.isfile(dataset_dict_file):
        return DatasetDict.load_from_disk(dataset_path, keep_in_memory=keep_in_memory, storage_options=storage_options)
    raise FileNotFoundError(
        f"Directory {dataset_path} is neither a `Dataset` directory nor a `DatasetDict` directory."
    )
datasets/naming.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2020 The HuggingFace Datasets Authors and the TensorFlow Datasets Authors.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ # Lint as: python3
16
+ """Utilities for file names."""
17
+
18
+ import itertools
19
+ import os
20
+ import re
21
+
22
+
23
# A run of capitals followed by a capital+lowercase pair, e.g. splits "HTMLParser" between "HTML" and "Parser".
_uppercase_uppercase_re = re.compile(r"([A-Z]+)([A-Z][a-z])")
# A lowercase letter or digit immediately followed by a capital, e.g. "aB" or "1B".
_lowercase_uppercase_re = re.compile(r"([a-z\d])([A-Z])")

# An underscore with no underscore directly before or after it.
_single_underscore_re = re.compile(r"(?<!_)_(?!_)")
# Two or more consecutive underscores; the group is captured so `split` keeps the run in its output.
_multiple_underscores_re = re.compile(r"(_{2,})")

# Pattern string (not compiled) that split names are matched against: word characters, optionally dot-separated.
_split_re = r"^\w+(\.\w+)*$"

# Not referenced in this part of the file; by its name, the characters that may not appear in Windows paths.
INVALID_WINDOWS_CHARACTERS_IN_PATH = r"<>:/\|?*"
32
+
33
+
34
def camelcase_to_snakecase(name):
    """Return `name` converted from camel-case to snake-case."""
    # First break acronym/word boundaries ("HTMLParser" -> "HTML_Parser"),
    # then lowercase-or-digit/uppercase boundaries ("myName" -> "my_Name").
    acronyms_split = _uppercase_uppercase_re.sub(r"\1_\2", name)
    return _lowercase_uppercase_re.sub(r"\1_\2", acronyms_split).lower()
39
+
40
+
41
def snakecase_to_camelcase(name):
    """Return `name` converted from snake-case to camel-case.

    Single underscores are dropped; runs of two or more underscores are kept as they are.
    """
    capitalized = []
    for chunk in _single_underscore_re.split(name):
        # Splitting on a captured group keeps the underscore runs as their own pieces.
        for piece in _multiple_underscores_re.split(chunk):
            if piece:
                capitalized.append(piece.capitalize())
    return "".join(capitalized)
46
+
47
+
48
def filename_prefix_for_name(name):
    """Return the snake-case file-name prefix for a dataset name.

    Raises:
        ValueError: If `name` contains a directory component.
    """
    if os.path.basename(name) == name:
        return camelcase_to_snakecase(name)
    raise ValueError(f"Should be a dataset name, not a path: {name}")
52
+
53
+
54
def filename_prefix_for_split(name, split):
    """Return the file-name prefix `<snake_case_name>-<split>` for one split of a dataset.

    Args:
        name: Dataset name; must not contain a directory component.
        split: Split name; must match `_split_re`.

    Raises:
        ValueError: If `name` is a path, or `split` does not match `_split_re`.
    """
    if os.path.basename(name) != name:
        raise ValueError(f"Should be a dataset name, not a path: {name}")
    if not re.match(_split_re, split):
        # The original message closed the pattern with two quotes ('...''); fixed to one.
        raise ValueError(f"Split name should match '{_split_re}' but got '{split}'.")
    return f"{filename_prefix_for_name(name)}-{split}"
60
+
61
+
62
def filepattern_for_dataset_split(dataset_name, split, data_dir, filetype_suffix=None):
    """Return a glob pattern (ending in `*`) for the files of one dataset split inside `data_dir`."""
    stem = filename_prefix_for_split(dataset_name, split)
    extension = f".{filetype_suffix}" if filetype_suffix else ""
    return os.path.join(data_dir, stem + extension) + "*"
68
+
69
+
70
def filenames_for_dataset_split(path, dataset_name, split, filetype_suffix=None, shard_lengths=None):
    """Return the list of file names for one dataset split under `path`.

    With a non-empty `shard_lengths`, one `-XXXXX-of-YYYYY` name is produced per shard;
    otherwise the list holds a single un-sharded name. `filetype_suffix`, when given,
    is appended after a dot.
    """
    base = os.path.join(path, filename_prefix_for_split(dataset_name, split))
    extension = f".{filetype_suffix}" if filetype_suffix else ""
    if not shard_lengths:
        return [base + extension]
    total = len(shard_lengths)
    return [f"{base}-{shard:05d}-of-{total:05d}{extension}" for shard in range(total)]
datasets/search.py ADDED
@@ -0,0 +1,785 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import importlib.util
2
+ import os
3
+ import tempfile
4
+ from pathlib import PurePath
5
+ from typing import TYPE_CHECKING, NamedTuple, Optional, Union
6
+
7
+ import fsspec
8
+ import numpy as np
9
+
10
+ from .features import List
11
+ from .utils import logging
12
+ from .utils import tqdm as hf_tqdm
13
+
14
+
15
+ if TYPE_CHECKING:
16
+ from .arrow_dataset import Dataset # noqa: F401
17
+
18
+ try:
19
+ from elasticsearch import Elasticsearch # noqa: F401
20
+
21
+ except ImportError:
22
+ pass
23
+ try:
24
+ import faiss # noqa: F401
25
+
26
+ except ImportError:
27
+ pass
28
+
29
+ _has_elasticsearch = importlib.util.find_spec("elasticsearch") is not None
30
+ _has_faiss = importlib.util.find_spec("faiss") is not None
31
+
32
+
33
+ logger = logging.get_logger(__name__)
34
+
35
+
36
class MissingIndex(Exception):
    """Raised when an index is requested under a name that has not been attached."""
38
+
39
+
40
class SearchResults(NamedTuple):
    """Result of a single-query search: scores and indices of the retrieved examples."""

    # One retrieval score per hit.
    scores: list[float]
    # Index of each retrieved example, in the same order as `scores`.
    indices: list[int]
43
+
44
+
45
class BatchedSearchResults(NamedTuple):
    """Result of a multi-query search: one list of scores and one list of indices per query."""

    # Per-query retrieval scores.
    total_scores: list[list[float]]
    # Per-query indices of the retrieved examples.
    total_indices: list[list[int]]
48
+
49
+
50
class NearestExamplesResults(NamedTuple):
    """Result of a single-query nearest-examples lookup."""

    # One retrieval score per retrieved example.
    scores: list[float]
    # The retrieved examples as a dict (its layout is not defined in this part of the file).
    examples: dict
53
+
54
+
55
class BatchedNearestExamplesResults(NamedTuple):
    """Result of a multi-query nearest-examples lookup: one entry per query."""

    # Per-query retrieval scores.
    total_scores: list[list[float]]
    # Per-query retrieved examples, one dict per query.
    total_examples: list[dict]
58
+
59
+
60
class BaseIndex:
    """Base class for indexing"""

    def search(self, query, k: int = 10, **kwargs) -> SearchResults:
        """
        To implement.
        This method has to return the scores and the indices of the retrieved examples given a certain query.
        """
        raise NotImplementedError

    def search_batch(self, queries, k: int = 10, **kwargs) -> BatchedSearchResults:
        """Find the nearest examples indices to each query by calling `search` once per query.

        Args:
            queries (`Union[List[str], np.ndarray]`): The queries as a list of strings if `column` is a text index or as a numpy array if `column` is a vector index.
            k (`int`): The number of examples to retrieve per query.
            **kwargs: Extra keyword arguments, forwarded unchanged to every `search` call.

        Output:
            total_scores (`List[List[float]]`): The retrieval scores of the retrieved examples per query.
            total_indices (`List[List[int]]`): The indices of the retrieved examples per query.
        """
        total_scores, total_indices = [], []
        for query in queries:
            # Forward kwargs: previously they were accepted here but silently dropped.
            scores, indices = self.search(query, k, **kwargs)
            total_scores.append(scores)
            total_indices.append(indices)
        return BatchedSearchResults(total_scores, total_indices)

    def save(self, file: Union[str, PurePath]):
        """Serialize the index on disk"""
        raise NotImplementedError

    @classmethod
    def load(cls, file: Union[str, PurePath]) -> "BaseIndex":
        """Deserialize the index from disk"""
        raise NotImplementedError
96
+
97
+
98
class ElasticSearchIndex(BaseIndex):
    """
    Sparse index using Elasticsearch. It is used to index text and run queries based on BM25 similarity.
    An Elasticsearch server needs to be accessible, and a python client is declared with
    ```
    es_client = Elasticsearch([{'host': 'localhost', 'port': '9200'}])
    ```
    for example.
    """

    def __init__(
        self,
        host: Optional[str] = None,
        port: Optional[int] = None,
        es_client: Optional["Elasticsearch"] = None,
        es_index_name: Optional[str] = None,
        es_index_config: Optional[dict] = None,
    ):
        """Set up the client, the index name and the index configuration.

        Args:
            host (`str`, *optional*): Server host; `"localhost"` when falsy. Must not be combined with `es_client`.
            port (`int`, *optional*): Server port; `9200` when falsy. Must not be combined with `es_client`.
            es_client (`elasticsearch.Elasticsearch`, *optional*): Ready-made client used instead of `(host, port)`.
            es_index_name (`str`, *optional*): Elasticsearch index name; a random
                `huggingface_datasets_*` name is generated when omitted.
            es_index_config (`dict`, *optional*): Index settings/mappings; a BM25 text mapping is used when omitted.

        Raises:
            ImportError: If the `elasticsearch` package is not installed.
            ValueError: If `es_client` is given together with `host` or `port`.
        """
        if not _has_elasticsearch:
            raise ImportError(
                "You must install ElasticSearch to use ElasticSearchIndex. To do so you can run `pip install elasticsearch==7.7.1 for example`"
            )
        if es_client is not None and (host is not None or port is not None):
            raise ValueError("Please specify either `es_client` or `(host, port)`, but not both.")
        host = host or "localhost"
        port = port or 9200

        import elasticsearch.helpers  # noqa: F401 - need this to properly load all the es features
        from elasticsearch import Elasticsearch  # noqa: F811

        self.es_client = es_client if es_client is not None else Elasticsearch([{"host": host, "port": str(port)}])
        if es_index_name is None:
            # Only the random file name is needed; the context manager closes (and removes)
            # the temporary file right away instead of leaving that to garbage collection.
            with tempfile.NamedTemporaryFile() as tmp_file:
                es_index_name = "huggingface_datasets_" + os.path.basename(tmp_file.name)
        self.es_index_name = es_index_name
        # NOTE(review): the key `" stopwords"` below has a leading space, which looks like a typo
        # for `"stopwords"`. Left unchanged because the server-side effect cannot be verified here;
        # the mapping uses the "standard" analyzer, not "stop_standard".
        self.es_index_config = (
            es_index_config
            if es_index_config is not None
            else {
                "settings": {
                    "number_of_shards": 1,
                    "analysis": {"analyzer": {"stop_standard": {"type": "standard", " stopwords": "_english_"}}},
                },
                "mappings": {"properties": {"text": {"type": "text", "analyzer": "standard", "similarity": "BM25"}}},
            }
        )

    def add_documents(self, documents: Union[list[str], "Dataset"], column: Optional[str] = None):
        """
        Add documents to the index.
        If the documents are inside a certain column, you can specify it using the `column` argument.

        The Elasticsearch index is created first, then every document is sent with its position
        in `documents` as `_id`. A warning is logged if fewer documents succeed than were given.
        """
        index_name = self.es_index_name
        index_config = self.es_index_config
        self.es_client.indices.create(index=index_name, body=index_config)
        number_of_docs = len(documents)
        progress = hf_tqdm(unit="docs", total=number_of_docs)
        successes = 0

        def passage_generator():
            if column is not None:
                for i, example in enumerate(documents):
                    yield {"text": example[column], "_id": i}
            else:
                for i, example in enumerate(documents):
                    yield {"text": example, "_id": i}

        # Bulk-index the documents into the index created above.
        from elasticsearch.helpers import streaming_bulk

        try:
            for ok, _action in streaming_bulk(
                client=self.es_client,
                index=index_name,
                actions=passage_generator(),
            ):
                progress.update(1)
                successes += ok
        finally:
            # Always release the progress bar, even if indexing raises.
            progress.close()
        if successes != number_of_docs:
            logger.warning(
                f"Some documents failed to be added to ElasticSearch. Failures: {number_of_docs - successes}/{number_of_docs}"
            )
        logger.info(f"Indexed {successes:d} documents")

    def search(self, query: str, k=10, **kwargs) -> SearchResults:
        """Find the nearest examples indices to the query.

        Args:
            query (`str`): The query as a string.
            k (`int`): The number of examples to retrieve.
            **kwargs: Extra keyword arguments passed to the client's `search` call.

        Output:
            scores (`List[float]`): The retrieval scores of the retrieved examples.
            indices (`List[int]`): The indices of the retrieved examples.
        """
        response = self.es_client.search(
            index=self.es_index_name,
            body={"query": {"multi_match": {"query": query, "fields": ["text"], "type": "cross_fields"}}, "size": k},
            **kwargs,
        )
        hits = response["hits"]["hits"]
        return SearchResults([hit["_score"] for hit in hits], [int(hit["_id"]) for hit in hits])

    def search_batch(self, queries, k: int = 10, max_workers=10, **kwargs) -> BatchedSearchResults:
        """Run `search` for every query on a thread pool and return the results in query order.

        Args:
            queries: The queries; each one is passed to `search`.
            k (`int`): The number of examples to retrieve per query.
            max_workers (`int`): Size of the thread pool.
            **kwargs: Extra keyword arguments forwarded to every `search` call.

        Output:
            total_scores (`List[List[float]]`): The retrieval scores of the retrieved examples per query.
            total_indices (`List[List[int]]`): The indices of the retrieved examples per query.
        """
        import concurrent.futures

        total_scores, total_indices = [None] * len(queries), [None] * len(queries)
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
            future_to_index = {executor.submit(self.search, query, k, **kwargs): i for i, query in enumerate(queries)}
            # Futures finish in arbitrary order; the stored position puts each result back in its slot.
            for future in concurrent.futures.as_completed(future_to_index):
                index = future_to_index[future]
                results: SearchResults = future.result()
                total_scores[index] = results.scores
                total_indices[index] = results.indices
        return BatchedSearchResults(total_indices=total_indices, total_scores=total_scores)
213
+
214
+
215
class FaissIndex(BaseIndex):
    """
    Dense index using Faiss. It is used to index vectors.
    Faiss is a library for efficient similarity search and clustering of dense vectors.
    It contains algorithms that search in sets of vectors of any size, up to ones that possibly do not fit in RAM.
    You can find more information about Faiss here:
    - For index types and the string factory: https://github.com/facebookresearch/faiss/wiki/The-index-factory
    - For GPU settings: https://github.com/facebookresearch/faiss/wiki/Faiss-on-the-GPU
    """

    def __init__(
        self,
        device: Optional[Union[int, list[int]]] = None,
        string_factory: Optional[str] = None,
        metric_type: Optional[int] = None,
        custom_index: Optional["faiss.Index"] = None,
    ):
        """
        Create a Dense index using Faiss. You can specify `device` if you want to run it on GPU (`device` must be the GPU index).
        You can find more information about Faiss here:
        - For `string factory`: https://github.com/facebookresearch/faiss/wiki/The-index-factory

        Raises:
            ValueError: If `custom_index` is combined with `string_factory` or with `device`.
            ImportError: If the `faiss` package is not installed.
        """
        if string_factory is not None and custom_index is not None:
            raise ValueError("Please specify either `string_factory` or `custom_index` but not both.")
        if device is not None and custom_index is not None:
            raise ValueError(
                "Cannot pass both 'custom_index' and 'device'. "
                "Pass 'custom_index' already transferred to the target device instead."
            )
        self.device = device
        self.string_factory = string_factory
        self.metric_type = metric_type
        self.faiss_index = custom_index
        if not _has_faiss:
            raise ImportError(
                "You must install Faiss to use FaissIndex. To do so you can run `conda install -c pytorch faiss-cpu` or `conda install -c pytorch faiss-gpu`. "
                "A community supported package is also available on pypi: `pip install faiss-cpu` or `pip install faiss-gpu`. "
                "Note that pip may not have the latest version of FAISS, and thus, some of the latest features and bug fixes may not be available."
            )

    def add_vectors(
        self,
        vectors: Union[np.ndarray, "Dataset"],
        column: Optional[str] = None,
        batch_size: int = 1000,
        train_size: Optional[int] = None,
        faiss_verbose: Optional[bool] = None,
    ):
        """
        Add vectors to the index.
        If the arrays are inside a certain column, you can specify it using the `column` argument.

        Args:
            vectors: The vectors, or a dataset holding them in `column`.
            column (`str`, *optional*): Column of `vectors` that contains the vectors.
            batch_size (`int`): Number of vectors added to the index per call.
            train_size (`int`, *optional*): If set, the first `train_size` vectors are used to train the index;
                if `None`, the training step is skipped.
            faiss_verbose (`bool`, *optional*): If set, assigned to the `verbose` attribute of the index and of
                its `index`, `quantizer` and `clustering_index` sub-objects when they exist.

        Raises:
            ValueError: If `column` is given and its feature type is not `List`.
        """
        import faiss  # noqa: F811

        if column and not isinstance(vectors.features[column], List):
            raise ValueError(
                f"Wrong feature type for column '{column}'. Expected 1d array, got {vectors.features[column]}"
            )

        # Create index
        if self.faiss_index is None:
            # The dimension is taken from the first vector.
            size = len(vectors[0]) if column is None else len(vectors[0][column])
            if self.string_factory is not None:
                if self.metric_type is None:
                    index = faiss.index_factory(size, self.string_factory)
                else:
                    index = faiss.index_factory(size, self.string_factory, self.metric_type)
            else:
                if self.metric_type is None:
                    index = faiss.IndexFlat(size)
                else:
                    index = faiss.IndexFlat(size, self.metric_type)

            self.faiss_index = self._faiss_index_to_device(index, self.device)
            logger.info(f"Created faiss index of type {type(self.faiss_index)}")

        # Set verbosity level
        if faiss_verbose is not None:
            self.faiss_index.verbose = faiss_verbose
            if hasattr(self.faiss_index, "index") and self.faiss_index.index is not None:
                self.faiss_index.index.verbose = faiss_verbose
            if hasattr(self.faiss_index, "quantizer") and self.faiss_index.quantizer is not None:
                self.faiss_index.quantizer.verbose = faiss_verbose
            if hasattr(self.faiss_index, "clustering_index") and self.faiss_index.clustering_index is not None:
                self.faiss_index.clustering_index.verbose = faiss_verbose

        # Train
        if train_size is not None:
            train_vecs = vectors[:train_size] if column is None else vectors[:train_size][column]
            logger.info(f"Training the index with the first {len(train_vecs)} vectors")
            self.faiss_index.train(train_vecs)
        else:
            logger.info("Ignored the training step of the faiss index as `train_size` is None.")

        # Add vectors
        logger.info(f"Adding {len(vectors)} vectors to the faiss index")
        for i in hf_tqdm(range(0, len(vectors), batch_size)):
            vecs = vectors[i : i + batch_size] if column is None else vectors[i : i + batch_size][column]
            self.faiss_index.add(vecs)

    @staticmethod
    def _faiss_index_to_device(index: "faiss.Index", device: Optional[Union[int, list[int]]] = None) -> "faiss.Index":
        """
        Sends a faiss index to a device.
        A device can either be a non-negative integer (GPU id), a negative integer (all GPUs),
        or a list/tuple of GPU ids (select GPUs to use), or `None` for CPU.

        Raises:
            TypeError: If `device` is none of the above.
        """

        # If device is not specified, then it runs on CPU.
        if device is None:
            return index

        import faiss  # noqa: F811

        # If the device id is given as an integer
        if isinstance(device, int):
            # Non-negative integers are directly mapped to GPU ids
            if device > -1:
                faiss_res = faiss.StandardGpuResources()
                index = faiss.index_cpu_to_gpu(faiss_res, device, index)
            # And negative integers mean using all GPUs
            else:
                index = faiss.index_cpu_to_all_gpus(index)
        # Device ids given as a list mean mapping to those devices specified.
        elif isinstance(device, (list, tuple)):
            index = faiss.index_cpu_to_gpus_list(index, gpus=list(device))
        else:
            raise TypeError(
                f"The argument type: {type(device)} is not expected. "
                + "Please pass in either nothing, a positive int, a negative int, or a list of positive ints."
            )

        return index

    def search(self, query: np.ndarray, k=10, **kwargs) -> SearchResults:
        """Find the nearest examples indices to the query.

        Args:
            query (`np.ndarray`): The query as a numpy array, either 1D or 2D with shape `(1, N)`.
            k (`int`): The number of examples to retrieve.
            **kwargs: Extra keyword arguments passed to the faiss index `search` call.

        Output:
            scores: The retrieval scores of the retrieved examples (first row of the faiss result).
            indices: The indices of the retrieved examples, cast to `int`.

        Raises:
            ValueError: If `query` is neither 1D nor of shape `(1, N)`.
        """
        if len(query.shape) != 1 and (len(query.shape) != 2 or query.shape[0] != 1):
            raise ValueError("Shape of query is incorrect, it has to be either a 1D array or 2D (1, N)")

        queries = query.reshape(1, -1)
        if not queries.flags.c_contiguous:
            queries = np.asarray(queries, order="C")
        scores, indices = self.faiss_index.search(queries, k, **kwargs)
        return SearchResults(scores[0], indices[0].astype(int))

    def search_batch(self, queries: np.ndarray, k=10, **kwargs) -> BatchedSearchResults:
        """Find the nearest examples indices to the queries.

        Args:
            queries (`np.ndarray`): The queries as a 2D numpy array, one query per row.
            k (`int`): The number of examples to retrieve.
            **kwargs: Extra keyword arguments passed to the faiss index `search` call.

        Output:
            total_scores: The retrieval scores of the retrieved examples per query.
            total_indices: The indices of the retrieved examples per query, cast to `int`.

        Raises:
            ValueError: If `queries` is not 2D.
        """
        if len(queries.shape) != 2:
            raise ValueError("Shape of query must be 2D")
        if not queries.flags.c_contiguous:
            queries = np.asarray(queries, order="C")
        scores, indices = self.faiss_index.search(queries, k, **kwargs)
        return BatchedSearchResults(scores, indices.astype(int))

    def save(self, file: Union[str, PurePath], storage_options: Optional[dict] = None):
        """Serialize the FaissIndex on disk

        Args:
            file: Destination path or URI, opened through `fsspec`.
            storage_options (`dict`, *optional*): Keyword arguments passed to `fsspec.open`.
        """
        import faiss  # noqa: F811

        # An index that was sent to GPU(s) is converted back to a CPU index before writing.
        if self.device is not None and isinstance(self.device, (int, list, tuple)):
            index = faiss.index_gpu_to_cpu(self.faiss_index)
        else:
            index = self.faiss_index

        with fsspec.open(str(file), "wb", **(storage_options or {})) as f:
            faiss.write_index(index, faiss.BufferedIOWriter(faiss.PyCallbackIOWriter(f.write)))

    @classmethod
    def load(
        cls,
        file: Union[str, PurePath],
        device: Optional[Union[int, list[int]]] = None,
        storage_options: Optional[dict] = None,
    ) -> "FaissIndex":
        """Deserialize the FaissIndex from disk

        Args:
            file: Source path or URI, opened through `fsspec`.
            device: Device the loaded index is sent to; `None` keeps it on CPU.
            storage_options (`dict`, *optional*): Keyword arguments passed to `fsspec.open`.
        """
        import faiss  # noqa: F811

        # Instances of FaissIndex is essentially just a wrapper for faiss indices.
        faiss_index = cls(device=device)
        with fsspec.open(str(file), "rb", **(storage_options or {})) as f:
            index = faiss.read_index(faiss.BufferedIOReader(faiss.PyCallbackIOReader(f.read)))
        faiss_index.faiss_index = faiss_index._faiss_index_to_device(index, faiss_index.device)
        return faiss_index
415
+
416
+
417
+ class IndexableMixin:
418
+ """Add indexing features to `datasets.Dataset`"""
419
+
420
def __init__(self):
    """Start with no index attached."""
    # Maps each index name to the index object registered under it.
    self._indexes: dict[str, BaseIndex] = dict()
422
+
423
def __len__(self):
    """Not implemented here; always raises `NotImplementedError`."""
    raise NotImplementedError
425
+
426
def __getitem__(self, key):
    """Not implemented here; always raises `NotImplementedError`."""
    raise NotImplementedError
428
+
429
def is_index_initialized(self, index_name: str) -> bool:
    """Return `True` if an index is attached under `index_name`, `False` otherwise."""
    attached = self._indexes
    return index_name in attached
431
+
432
def _check_index_is_initialized(self, index_name: str):
    """Raise `MissingIndex` unless an index is attached under `index_name`."""
    if self.is_index_initialized(index_name):
        return
    raise MissingIndex(
        f"Index with index_name '{index_name}' not initialized yet. Please make sure that you call `add_faiss_index` or `add_elasticsearch_index` first."
    )
437
+
438
def list_indexes(self) -> list[str]:
    """List the `index_name`/identifiers of all the attached indexes."""
    return list(self._indexes.keys())
441
+
442
def get_index(self, index_name: str) -> BaseIndex:
    """Return the index attached under `index_name`.

    Args:
        index_name (`str`): Index name.

    Returns:
        [`BaseIndex`]

    Raises:
        MissingIndex: If no index is attached under `index_name`.
    """
    self._check_index_is_initialized(index_name)
    requested = self._indexes[index_name]
    return requested
453
+
454
def add_faiss_index(
    self,
    column: str,
    index_name: Optional[str] = None,
    device: Optional[Union[int, list[int]]] = None,
    string_factory: Optional[str] = None,
    metric_type: Optional[int] = None,
    custom_index: Optional["faiss.Index"] = None,
    batch_size: int = 1000,
    train_size: Optional[int] = None,
    faiss_verbose: bool = False,
):
    """Build a dense Faiss index from the vectors of `column` and attach it to this object.

    More information about Faiss:
    - For `string factory`: https://github.com/facebookresearch/faiss/wiki/The-index-factory

    Args:
        column (`str`): The column of the vectors to add to the index.
        index_name (Optional `str`): Name under which the index is attached. Defaults to `column`.
        device (Optional `Union[int, List[int]]`): Non-negative integer: id of the GPU to use. Negative integer: use all GPUs.
            List of GPU ids: run only on those GPUs. By default the CPU is used.
        string_factory (Optional `str`): Passed to the Faiss index factory to create the index.
            When omitted, `FaissIndex` builds a `faiss.IndexFlat`.
        metric_type (Optional `int`): Type of metric. Ex: `faiss.METRIC_INNER_PRODUCT` or `faiss.METRIC_L2`.
        custom_index (Optional `faiss.Index`): Custom Faiss index that you already have instantiated and configured for your needs.
        batch_size (Optional `int`): Size of the batch to use while adding vectors to the FaissIndex. Default value is 1000.
            <Added version="2.4.0"/>
        train_size (Optional `int`): If the index needs a training step, specifies how many vectors will be used to train the index.
        faiss_verbose (`bool`, defaults to False): Enable the verbosity of the Faiss index.
    """
    target_name = column if index_name is None else index_name
    dense_index = FaissIndex(
        device=device,
        string_factory=string_factory,
        metric_type=metric_type,
        custom_index=custom_index,
    )
    # `self` is handed over as the vector source; the vectors are read from `column`.
    dense_index.add_vectors(
        self,
        column=column,
        batch_size=batch_size,
        train_size=train_size,
        faiss_verbose=faiss_verbose,
    )
    self._indexes[target_name] = dense_index
494
+
495
def add_faiss_index_from_external_arrays(
    self,
    external_arrays: np.array,
    index_name: str,
    device: Optional[Union[int, list[int]]] = None,
    string_factory: Optional[str] = None,
    metric_type: Optional[int] = None,
    custom_index: Optional["faiss.Index"] = None,
    batch_size: int = 1000,
    train_size: Optional[int] = None,
    faiss_verbose: bool = False,
):
    """Build a dense Faiss index from `external_arrays` and attach it to this object.

    More information about Faiss:
    - For `string factory`: https://github.com/facebookresearch/faiss/wiki/The-index-factory

    Args:
        external_arrays (`np.array`): Vectors coming from outside this object; the Faiss index is
            built from them rather than from one of its columns.
        index_name (`str`): Name under which the index is attached.
        device (Optional `Union[int, List[int]]`): Non-negative integer: id of the GPU to use. Negative integer: use all GPUs.
            List of GPU ids: run only on those GPUs. By default the CPU is used.
        string_factory (Optional `str`): Passed to the Faiss index factory to create the index.
            When omitted, `FaissIndex` builds a `faiss.IndexFlat`.
        metric_type (Optional `int`): Type of metric. Ex: `faiss.METRIC_INNER_PRODUCT` or `faiss.METRIC_L2`.
        custom_index (Optional `faiss.Index`): Custom Faiss index that you already have instantiated and configured for your needs.
        batch_size (Optional `int`): Size of the batch to use while adding vectors to the FaissIndex. Default value is 1000.
            <Added version="2.4.0"/>
        train_size (Optional `int`): If the index needs a training step, specifies how many vectors will be used to train the index.
        faiss_verbose (`bool`, defaults to False): Enable the verbosity of the Faiss index.
    """
    dense_index = FaissIndex(
        device=device,
        string_factory=string_factory,
        metric_type=metric_type,
        custom_index=custom_index,
    )
    dense_index.add_vectors(
        external_arrays,
        column=None,
        batch_size=batch_size,
        train_size=train_size,
        faiss_verbose=faiss_verbose,
    )
    self._indexes[index_name] = dense_index
534
+
535
def save_faiss_index(self, index_name: str, file: Union[str, PurePath], storage_options: Optional[dict] = None):
    """Write the FaissIndex attached under `index_name` to `file`.

    Args:
        index_name (`str`): Name under which the index is attached.
        file (`str`): The path to the serialized faiss index on disk or remote URI (e.g. `"s3://my-bucket/index.faiss"`).
        storage_options (`dict`, *optional*):
            Key/value pairs to be passed on to the file-system backend, if any.

            <Added version="2.11.0"/>

    Raises:
        ValueError: If the index attached under `index_name` is not a `FaissIndex`.
    """
    target = self.get_index(index_name)
    if isinstance(target, FaissIndex):
        target.save(file, storage_options=storage_options)
        logger.info(f"Saved FaissIndex {index_name} at {file}")
        return
    raise ValueError(f"Index '{index_name}' is not a FaissIndex but a '{type(target)}'")
552
+
553
def load_faiss_index(
    self,
    index_name: str,
    file: Union[str, PurePath],
    device: Optional[Union[int, list[int]]] = None,
    storage_options: Optional[dict] = None,
):
    """Read a FaissIndex from `file` and attach it under `index_name`.

    For additional configuration, the underlying faiss object is available as
    `.get_index(index_name).faiss_index`.

    Args:
        index_name (`str`): Name under which the index is attached.
        file (`str`): The path to the serialized faiss index on disk or remote URI (e.g. `"s3://my-bucket/index.faiss"`).
        device (Optional `Union[int, List[int]]`): Non-negative integer: id of the GPU to use. Negative integer: use all GPUs.
            List of GPU ids: run only on those GPUs. By default the CPU is used.
        storage_options (`dict`, *optional*):
            Key/value pairs to be passed on to the file-system backend, if any.

            <Added version="2.11.0"/>

    Raises:
        ValueError: If the number of elements in the loaded index differs from `len(self)`.
    """
    index = FaissIndex.load(file, device=device, storage_options=storage_options)
    # The index is only attached when it has exactly one entry per example.
    if index.faiss_index.ntotal == len(self):
        self._indexes[index_name] = index
        logger.info(f"Loaded FaissIndex {index_name} from {file}")
        return
    raise ValueError(
        f"Index size should match Dataset size, but Index '{index_name}' at {file} has {index.faiss_index.ntotal} elements while the dataset has {len(self)} examples."
    )
584
+
585
def add_elasticsearch_index(
    self,
    column: str,
    index_name: Optional[str] = None,
    host: Optional[str] = None,
    port: Optional[int] = None,
    es_client: Optional["Elasticsearch"] = None,
    es_index_name: Optional[str] = None,
    es_index_config: Optional[dict] = None,
):
    """Build an ElasticSearch text index from the documents of `column` and attach it to this object.

    Args:
        column (`str`): The column of the documents to add to the index.
        index_name (Optional `str`): Name under which the index is attached. Defaults to `column`.
        host (Optional `str`, defaults to localhost):
            Host where ElasticSearch is running.
        port (Optional `int`, defaults to 9200):
            Port where ElasticSearch is running.
        es_client (Optional `elasticsearch.Elasticsearch`):
            The elasticsearch client used to create the index if host and port are None.
        es_index_name (Optional `str`): The elasticsearch index name used to create the index.
        es_index_config (Optional `dict`):
            The configuration of the elasticsearch index.
            Default config is:

            Config::

                {
                    "settings": {
                        "number_of_shards": 1,
                        "analysis": {"analyzer": {"stop_standard": {"type": "standard", " stopwords": "_english_"}}},
                    },
                    "mappings": {
                        "properties": {
                            "text": {
                                "type": "text",
                                "analyzer": "standard",
                                "similarity": "BM25"
                            },
                        }
                    },
                }
    """
    target_name = column if index_name is None else index_name
    text_index = ElasticSearchIndex(
        host=host,
        port=port,
        es_client=es_client,
        es_index_name=es_index_name,
        es_index_config=es_index_config,
    )
    # `self` is handed over as the document source; the text is read from `column`.
    text_index.add_documents(self, column=column)
    self._indexes[target_name] = text_index
636
+
637
def load_elasticsearch_index(
    self,
    index_name: str,
    es_index_name: str,
    host: Optional[str] = None,
    port: Optional[int] = None,
    es_client: Optional["Elasticsearch"] = None,
    es_index_config: Optional[dict] = None,
):
    """Register an already existing ElasticSearch text index on this object.

    Unlike `add_elasticsearch_index`, no documents are added here: the index object is
    only created and stored under `index_name`.

    Args:
        index_name (`str`):
            The `index_name`/identifier of the index. This is the index name that is used
            to call `get_nearest` or `search`.
        es_index_name (`str`):
            The name of elasticsearch index to load.
        host (`str`, *optional*, defaults to `localhost`):
            Host of where ElasticSearch is running.
        port (`int`, *optional*, defaults to `9200`):
            Port of where ElasticSearch is running.
        es_client (`elasticsearch.Elasticsearch`, *optional*):
            The elasticsearch client used to create the index if host and port are `None`.
        es_index_config (`dict`, *optional*):
            The configuration of the elasticsearch index.
            Default config is:
            ```
            {
                "settings": {
                    "number_of_shards": 1,
                    "analysis": {"analyzer": {"stop_standard": {"type": "standard", " stopwords": "_english_"}}},
                },
                "mappings": {
                    "properties": {
                        "text": {
                            "type": "text",
                            "analyzer": "standard",
                            "similarity": "BM25"
                        },
                    }
                },
            }
            ```
    """
    existing_index = ElasticSearchIndex(
        host=host,
        port=port,
        es_client=es_client,
        es_index_name=es_index_name,
        es_index_config=es_index_config,
    )
    self._indexes[index_name] = existing_index
683
+
684
def drop_index(self, index_name: str):
    """Remove a registered index from this object.

    Args:
        index_name (`str`):
            The `index_name`/identifier of the index to remove.

    Raises:
        KeyError: If no index is registered under `index_name`.
    """
    registered = self._indexes
    del registered[index_name]
692
+
693
def search(self, index_name: str, query: Union[str, np.ndarray], k: int = 10, **kwargs) -> SearchResults:
    """Find the nearest examples indices in the dataset to the query.

    Args:
        index_name (`str`):
            The name/identifier of the index.
        query (`Union[str, np.ndarray]`):
            The query as a string if `index_name` is a text index or as a numpy array if `index_name` is a vector index.
        k (`int`):
            The number of examples to retrieve.
        **kwargs:
            Forwarded unchanged to the underlying index's `search` method.

    Returns:
        `(scores, indices)`:
            A tuple of `(scores, indices)` where:
            - **scores** (`List[float]`): the retrieval scores from either FAISS (`IndexFlatL2` by default) or ElasticSearch of the retrieved examples
            - **indices** (`List[int]`): the indices of the retrieved examples
    """
    # Fail early with the dedicated check rather than a bare KeyError on the lookup below.
    self._check_index_is_initialized(index_name)
    return self._indexes[index_name].search(query, k, **kwargs)
712
+
713
def search_batch(
    self, index_name: str, queries: Union[list[str], np.ndarray], k: int = 10, **kwargs
) -> BatchedSearchResults:
    """Find the nearest examples indices in the dataset to each query of a batch.

    Args:
        index_name (`str`):
            The `index_name`/identifier of the index.
        queries (`Union[List[str], np.ndarray]`):
            The queries as a list of strings if `index_name` is a text index or as a numpy array if `index_name` is a vector index.
        k (`int`):
            The number of examples to retrieve per query.
        **kwargs:
            Forwarded unchanged to the underlying index's `search_batch` method.

    Returns:
        `(total_scores, total_indices)`:
            A tuple of `(total_scores, total_indices)` where:
            - **total_scores** (`List[List[float]]`): the retrieval scores from either FAISS (`IndexFlatL2` by default) or ElasticSearch of the retrieved examples per query
            - **total_indices** (`List[List[int]]`): the indices of the retrieved examples per query
    """
    # Fail early with the dedicated check rather than a bare KeyError on the lookup below.
    self._check_index_is_initialized(index_name)
    return self._indexes[index_name].search_batch(queries, k, **kwargs)
734
+
735
def get_nearest_examples(
    self, index_name: str, query: Union[str, np.ndarray], k: int = 10, **kwargs
) -> NearestExamplesResults:
    """Find the nearest examples in the dataset to the query.

    Args:
        index_name (`str`):
            The index_name/identifier of the index.
        query (`Union[str, np.ndarray]`):
            The query as a string if `index_name` is a text index or as a numpy array if `index_name` is a vector index.
        k (`int`):
            The number of examples to retrieve.
        **kwargs:
            Forwarded unchanged to `search`.

    Returns:
        `(scores, examples)`:
            A tuple of `(scores, examples)` where:
            - **scores** (`List[float]`): the retrieval scores from either FAISS (`IndexFlatL2` by default) or ElasticSearch of the retrieved examples
            - **examples** (`dict`): the retrieved examples
    """
    self._check_index_is_initialized(index_name)
    scores, indices = self.search(index_name, query, k, **kwargs)
    # Negative indices are dropped before fetching examples.
    top_indices = [i for i in indices if i >= 0]
    # NOTE(review): truncating `scores` to the number of kept indices assumes the
    # negative entries only appear at the end of `indices` - confirm against the index backends.
    return NearestExamplesResults(scores[: len(top_indices)], self[top_indices])
758
+
759
def get_nearest_examples_batch(
    self, index_name: str, queries: Union[list[str], np.ndarray], k: int = 10, **kwargs
) -> BatchedNearestExamplesResults:
    """Find the nearest examples in the dataset to each query of a batch.

    Args:
        index_name (`str`):
            The `index_name`/identifier of the index.
        queries (`Union[List[str], np.ndarray]`):
            The queries as a list of strings if `index_name` is a text index or as a numpy array if `index_name` is a vector index.
        k (`int`):
            The number of examples to retrieve per query.
        **kwargs:
            Forwarded unchanged to `search_batch`.

    Returns:
        `(total_scores, total_examples)`:
            A tuple of `(total_scores, total_examples)` where:
            - **total_scores** (`List[List[float]]`): the retrieval scores from either FAISS (`IndexFlatL2` by default) or ElasticSearch of the retrieved examples per query
            - **total_examples** (`List[dict]`): the retrieved examples per query
    """
    self._check_index_is_initialized(index_name)
    total_scores, total_indices = self.search_batch(index_name, queries, k, **kwargs)
    # Filter the negative indices once per query and reuse the result for both outputs.
    kept_indices = [[i for i in indices_i if i >= 0] for indices_i in total_indices]
    total_scores = [scores_i[: len(kept_i)] for scores_i, kept_i in zip(total_scores, kept_indices)]
    total_samples = [self[kept_i] for kept_i in kept_indices]
    return BatchedNearestExamplesResults(total_scores, total_samples)
datasets/splits.py ADDED
@@ -0,0 +1,635 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2020 The HuggingFace Datasets Authors and the TensorFlow Datasets Authors.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ # Lint as: python3
16
+ """Splits related API."""
17
+
18
+ import abc
19
+ import collections
20
+ import copy
21
+ import dataclasses
22
+ import re
23
+ from dataclasses import dataclass
24
+ from typing import Optional, Union
25
+
26
+ from .arrow_reader import FileInstructions, make_file_instructions
27
+ from .naming import _split_re
28
+ from .utils.py_utils import NonMutableDict, asdict
29
+
30
+
31
@dataclass
class SplitInfo:
    """Metadata record for a single named split (name, byte size, example count, shards)."""

    # These three fields carry metadata asking `asdict` to keep them even at their default value.
    name: str = dataclasses.field(default="", metadata={"include_in_asdict_even_if_is_default": True})
    num_bytes: int = dataclasses.field(default=0, metadata={"include_in_asdict_even_if_is_default": True})
    num_examples: int = dataclasses.field(default=0, metadata={"include_in_asdict_even_if_is_default": True})
    shard_lengths: Optional[list[int]] = None

    # Deprecated.
    # Kept for backward compatibility: files such as dataset_infos.json and dataset_info.json
    # must always contain this field, so it is always included in the output of
    # datasets.utils.py_utils.asdict(split_info).
    dataset_name: Optional[str] = dataclasses.field(
        default=None, metadata={"include_in_asdict_even_if_is_default": True}
    )

    @property
    def file_instructions(self):
        """Returns the list of dict(filename, take, skip)."""
        # `self.dataset_name` is assigned in `SplitDict.add()`.
        resolved = make_file_instructions(
            name=self.dataset_name,
            split_infos=[self],
            instruction=str(self.name),
        )
        return resolved.file_instructions
56
+
57
+
58
@dataclass
class SubSplitInfo:
    """Read-only view over the file instructions resolved for a sub split.

    It exposes info on the subsplit:
    ```
    ds, info = datasets.load_dataset(..., split='train[75%:]', with_info=True)
    info.splits['train[75%:]'].num_examples
    ```
    """

    instructions: FileInstructions

    @property
    def num_examples(self):
        """Returns the number of example in the subsplit."""
        resolved = self.instructions
        return resolved.num_examples

    @property
    def file_instructions(self):
        """Returns the list of dict(filename, take, skip)."""
        resolved = self.instructions
        return resolved.file_instructions
79
+
80
+
81
class SplitBase(metaclass=abc.ABCMeta):
    # pylint: disable=line-too-long
    """Abstract base class for Split compositionality.

    See the
    [guide on splits](../loading#slice-splits)
    for more information.

    There are three parts to the composition:
        1) The splits are composed (defined, merged, split,...) together before
            calling the `.as_dataset()` function. This is done with the `__add__`,
            `__getitem__`, which return a tree of `SplitBase` (whose leaf
            are the `NamedSplit` objects)

        ```
            split = datasets.Split.TRAIN + datasets.Split.TEST.subsplit(datasets.percent[:50])
        ```

        2) The `SplitBase` is forwarded to the `.as_dataset()` function
            to be resolved into actual read instruction. This is done by the
            `.get_read_instruction()` method which takes the real dataset splits
            (name, number of shards,...) and parse the tree to return a
            `SplitReadInstruction()` object

        ```
            read_instruction = split.get_read_instruction(self.info.splits)
        ```

        3) The `SplitReadInstruction` is then used in the `tf.data.Dataset` pipeline
            to define which files to read and how to skip examples within file.

    """

    # pylint: enable=line-too-long

    @abc.abstractmethod
    def get_read_instruction(self, split_dict):
        """Parse the descriptor tree and compile all read instructions together.

        Args:
            split_dict: `dict`, The `dict[split_name, SplitInfo]` of the dataset

        Returns:
            split_read_instruction: `SplitReadInstruction`
        """
        raise NotImplementedError("Abstract method")

    def __eq__(self, other):
        """Equality: datasets.Split.TRAIN == 'train'.

        A merged/sub split never equals a named split or a string; comparing it with
        anything else is unsupported and raises `NotImplementedError`.
        """
        if isinstance(other, (NamedSplit, str)):
            return False
        raise NotImplementedError("Equality is not implemented between merged/sub splits.")

    def __ne__(self, other):
        """InEquality: datasets.Split.TRAIN != 'test'."""
        return not self.__eq__(other)

    def __add__(self, other):
        """Merging: datasets.Split.TRAIN + datasets.Split.TEST."""
        return _SplitMerged(self, other)

    def subsplit(self, arg=None, k=None, percent=None, weighted=None):  # pylint: disable=redefined-outer-name
        """Divides this split into subsplits.

        There are 3 ways to define subsplits, which correspond to the 3
        arguments `k` (get `k` even subsplits), `percent` (get a slice of the
        dataset with `datasets.percent`), and `weighted` (get subsplits with proportions
        specified by `weighted`).

        Example::

        ```
        # 50% train, 50% test
        train, test = split.subsplit(k=2)
        # 50% train, 25% test, 25% validation
        train, test, validation = split.subsplit(weighted=[2, 1, 1])
        # Extract last 20%
        subsplit = split.subsplit(datasets.percent[-20:])
        ```

        Warning: k and weighted will be converted into percent which mean that
        values below the percent will be rounded up or down. The final split may be
        bigger to deal with remainders. For instance:

        ```
        train, test, valid = split.subsplit(k=3)  # 33%, 33%, 34%
        s1, s2, s3, s4 = split.subsplit(weighted=[2, 2, 1, 1])  # 33%, 33%, 16%, 18%
        ```

        Args:
            arg: If no kwargs are given, `arg` will be interpreted as one of
                `k`, `percent`, or `weighted` depending on the type.
                For example:
                ```
                split.subsplit(10)  # Equivalent to split.subsplit(k=10)
                split.subsplit(datasets.percent[:-20])  # percent=datasets.percent[:-20]
                split.subsplit([1, 1, 2])  # weighted=[1, 1, 2]
                ```
            k: `int` If set, subdivide the split into `k` equal parts.
            percent: `datasets.percent slice`, return a single subsplit corresponding to
                a slice of the original split. For example:
                `split.subsplit(datasets.percent[-20:])  # Last 20% of the dataset`.
            weighted: `list[int]`, return a list of subsplits whose proportions match
                the normalized sum of the list. For example:
                `split.subsplit(weighted=[1, 1, 2])  # 25%, 25%, 50%`.
                Weights must be non-negative with a positive sum.

        Returns:
            A subsplit or list of subsplits extracted from this split object.

        Raises:
            ValueError: If zero or several of the arguments are set, if `arg` has an
                unsupported type, if `k` is outside `1..100`, or if `weighted`
                contains a negative value or sums to zero.
        """
        # Note that the percent kwargs redefine the outer name datasets.percent. This
        # is done for consistency (.subsplit(percent=datasets.percent[:40]))
        if sum(bool(x) for x in (arg, k, percent, weighted)) != 1:
            raise ValueError("Only one argument of subsplit should be set.")

        # Auto deduce which keyword the positional `arg` stands for.
        if isinstance(arg, int):
            k = arg
        elif isinstance(arg, slice):
            percent = arg
        elif isinstance(arg, list):
            weighted = arg

        if not (k or percent or weighted):
            raise ValueError(
                f"Invalid split argument {arg}. Only list, slice and int supported. "
                "One of k, weighted or percent should be set to a non empty value."
            )

        def assert_slices_coverage(slices):
            # Ensure that the expanded slices cover every percent exactly once, in order.
            # Raised explicitly (instead of `assert`) so the check also runs under `python -O`.
            covered = [p for s in slices for p in range(*s.indices(100))]
            if covered != list(range(100)):
                raise AssertionError("Subsplit slices do not cover the whole [0, 100) percent range.")

        if k:
            if not 0 < k <= 100:
                raise ValueError(f"Subsplit k should be between 0 and 100, got {k}")
            shift = 100 // k
            slices = [slice(i * shift, (i + 1) * shift) for i in range(k)]
            # Round up last element to ensure all elements are taken
            slices[-1] = slice(slices[-1].start, 100)
            # Internal check to ensure full coverage
            assert_slices_coverage(slices)
            return tuple(_SubSplit(self, s) for s in slices)
        elif percent:
            return _SubSplit(self, percent)
        elif weighted:
            # Reject weights that cannot be normalized: a zero sum would divide by zero
            # and a negative weight would produce overlapping or reversed slices.
            total = sum(weighted)
            if total <= 0 or any(w < 0 for w in weighted):
                raise ValueError(
                    f"Subsplit weighted should contain non-negative values with a positive sum, got {weighted}"
                )
            # Normalize the weighted sum
            weighted = [100 * x // total for x in weighted]
            # Create the slice for each of the elements
            start = 0
            stop = 0
            slices = []
            for v in weighted:
                stop += v
                slices.append(slice(start, stop))
                start = stop
            # Round up last element to ensure all elements are taken
            slices[-1] = slice(slices[-1].start, 100)
            # Internal check to ensure full coverage
            assert_slices_coverage(slices)
            return tuple(_SubSplit(self, s) for s in slices)
        else:
            # Should not be possible
            raise ValueError("Could not determine the split")
245
+
246
+
247
+ # 2 requirements:
248
+ # 1. datasets.percent be sliceable
249
+ # 2. datasets.percent be documented
250
+ #
251
+ # Instances are not documented, so we want datasets.percent to be a class, but to
252
+ # have it be sliceable, we need this metaclass.
253
class PercentSliceMeta(type):
    """Metaclass that lets a class be subscripted with a slice, returning that slice."""

    def __getitem__(cls, slice_value):
        # Anything that is not written with the `a:b[:c]` syntax is rejected.
        if isinstance(slice_value, slice):
            return slice_value
        raise ValueError(f"datasets.percent should only be called with slice, not {slice_value}")
258
+
259
+
260
class PercentSlice(metaclass=PercentSliceMeta):
    # pylint: disable=line-too-long
    """Syntactic sugar for defining slice subsplits: `datasets.percent[75:-5]`.

    Subscripting the class itself is handled by its metaclass.

    See the
    [guide on splits](../loading#slice-splits)
    for more information.
    """

    # pylint: enable=line-too-long


# Lower-case alias so the class can be used as `datasets.percent[...]`.
percent = PercentSlice  # pylint: disable=invalid-name
274
+
275
+
276
class _SplitMerged(SplitBase):
    """Node of the split tree standing for the sum of two split descriptors."""

    def __init__(self, split1, split2):
        self._split1, self._split2 = split1, split2

    def get_read_instruction(self, split_dict):
        # Resolve the left operand first, then the right one, and merge the results.
        left = self._split1.get_read_instruction(split_dict)
        right = self._split2.get_read_instruction(split_dict)
        return left + right

    def __repr__(self):
        return f"({self._split1!r} + {self._split2!r})"
290
+
291
+
292
class _SubSplit(SplitBase):
    """Node of the split tree standing for a slice of another split descriptor."""

    def __init__(self, split, slice_value):
        self._split = split
        self._slice_value = slice_value

    def get_read_instruction(self, split_dict):
        # Resolve the wrapped split, then apply the stored slice to the result.
        resolved = self._split.get_read_instruction(split_dict)
        return resolved[self._slice_value]

    def __repr__(self):
        bounds = self._slice_value
        # Missing bounds are rendered as empty strings, as in the `a:b` slice syntax.
        start = "" if bounds.start is None else bounds.start
        stop = "" if bounds.stop is None else bounds.stop
        slice_str = f"{start}:{stop}"
        if bounds.step is not None:
            slice_str += f":{bounds.step}"
        return f"{self._split!r}(datasets.percent[{slice_str}])"
312
+
313
+
314
class NamedSplit(SplitBase):
    """Descriptor corresponding to a named split (train, test, ...).

    Example:
        Each descriptor can be composed with other using addition or slice:

            ```py
            split = datasets.Split.TRAIN.subsplit(datasets.percent[0:25]) + datasets.Split.TEST
            ```

        The resulting split will correspond to 25% of the train split merged with
        100% of the test split.

        A split cannot be added twice, so the following will fail:

            ```py
            split = (
                    datasets.Split.TRAIN.subsplit(datasets.percent[:25]) +
                    datasets.Split.TRAIN.subsplit(datasets.percent[75:])
            )  # Error
            split = datasets.Split.TEST + datasets.Split.ALL  # Error
            ```

        The slices can be applied only one time. So the following are valid:

            ```py
            split = (
                    datasets.Split.TRAIN.subsplit(datasets.percent[:25]) +
                    datasets.Split.TEST.subsplit(datasets.percent[:50])
            )
            split = (datasets.Split.TRAIN + datasets.Split.TEST).subsplit(datasets.percent[:50])
            ```

        But this is not valid:

            ```py
            train = datasets.Split.TRAIN
            test = datasets.Split.TEST
            split = train.subsplit(datasets.percent[:25]).subsplit(datasets.percent[:25])
            split = (train.subsplit(datasets.percent[:25]) + test).subsplit(datasets.percent[:50])
            ```
    """

    def __init__(self, name):
        """Store `name` after validating every split name it mentions.

        `name` may be a composed instruction: it is cut on `+`, and for each part only
        the text before the first `[` is checked against `_split_re`.

        Raises:
            ValueError: If one of the extracted split names does not match `_split_re`.
        """
        self._name = name
        split_names_from_instruction = [split_instruction.split("[")[0] for split_instruction in name.split("+")]
        for split_name in split_names_from_instruction:
            if not re.match(_split_re, split_name):
                raise ValueError(f"Split name should match '{_split_re}' but got '{split_name}'.")

    def __str__(self):
        return self._name

    def __repr__(self):
        return f"NamedSplit({self._name!r})"

    def __eq__(self, other):
        """Equality: datasets.Split.TRAIN == 'train'."""
        if isinstance(other, NamedSplit):
            return self._name == other._name  # pylint: disable=protected-access
        elif isinstance(other, SplitBase):
            return False
        elif isinstance(other, str):  # Other should be string
            return self._name == other
        else:
            return False

    def __lt__(self, other):
        """Order named splits by name; plain strings are accepted like in `__eq__`."""
        if isinstance(other, NamedSplit):
            return self._name < other._name  # pylint: disable=protected-access
        if isinstance(other, str):
            return self._name < other
        # Let Python try the reflected operation and raise TypeError if there is none,
        # instead of failing with an AttributeError on `other._name`.
        return NotImplemented

    def __hash__(self):
        # Hash like the underlying name so a NamedSplit and its string are interchangeable as dict keys.
        return hash(self._name)

    def get_read_instruction(self, split_dict):
        return SplitReadInstruction(split_dict[self._name])
389
+
390
+
391
class NamedSplitAll(NamedSplit):
    """Named split `"all"`: resolves to every split defined for the dataset, merged."""

    def __init__(self):
        super().__init__("all")

    def __repr__(self):
        return "NamedSplitAll()"

    def get_read_instruction(self, split_dict):
        # Start from an empty instruction and fold each split of the dataset into it.
        merged = SplitReadInstruction()
        for split_info in split_dict.values():
            merged = merged + SplitReadInstruction(split_info)
        return merged
404
+
405
+
406
class Split:
    # pylint: disable=line-too-long
    """`Enum` for dataset splits.

    Datasets are typically split into different subsets to be used at various
    stages of training and evaluation.

    - `TRAIN`: the training data.
    - `VALIDATION`: the validation data. If present, this is typically used as
      evaluation data while iterating on a model (e.g. changing hyperparameters,
      model architecture, etc.).
    - `TEST`: the testing data. This is the data to report metrics on. Typically
      you do not want to use this during model iteration as you may overfit to it.
    - `ALL`: the union of all defined dataset splits.

    All splits, including compositions inherit from `datasets.SplitBase`.

    See the [guide](../load_hub#splits) on splits for more information.

    Example:

    ```py
    >>> datasets.SplitGenerator(
    ...     name=datasets.Split.TRAIN,
    ...     gen_kwargs={"split_key": "train", "files": dl_manager.download_and_extract(url)},
    ... ),
    ... datasets.SplitGenerator(
    ...     name=datasets.Split.VALIDATION,
    ...     gen_kwargs={"split_key": "validation", "files": dl_manager.download_and_extract(url)},
    ... ),
    ... datasets.SplitGenerator(
    ...     name=datasets.Split.TEST,
    ...     gen_kwargs={"split_key": "test", "files": dl_manager.download_and_extract(url)},
    ... )
    ```
    """

    # pylint: enable=line-too-long
    TRAIN = NamedSplit("train")
    TEST = NamedSplit("test")
    VALIDATION = NamedSplit("validation")
    ALL = NamedSplitAll()

    def __new__(cls, name):
        """Create a custom split with datasets.Split('custom_name')."""
        # Calling the class never yields a `Split` instance: it returns a named split descriptor.
        if name == "all":
            return NamedSplitAll()
        return NamedSplit(name)
452
+
453
+
454
# Pairs a split info with the slice applied to it (`slice_value` is None when unsliced).
SlicedSplitInfo = collections.namedtuple("SlicedSplitInfo", ("split_info", "slice_value"))
462
+
463
+
464
class SplitReadInstruction:
    """Flattened reading instruction for a dataset.

    Similarly to `SplitDescriptor` nodes, this object can be composed with itself,
    but the resolution happens instantaneously, instead of keeping track of the
    tree, such as all instructions are compiled and flattened in a single
    SplitReadInstruction object containing the list of files and slice to use.

    Once resolved, the instructions can be accessed with:

    ```
    read_instructions.get_list_sliced_split_info()  # List of splits to use
    ```

    """

    def __init__(self, split_info=None):
        # The error message below is the one used when the same split name is stored twice.
        self._splits = NonMutableDict(error_msg="Overlap between splits. Split {key} has been added with itself.")

        if split_info:
            unsliced = SlicedSplitInfo(split_info=split_info, slice_value=None)
            self.add(unsliced)

    def add(self, sliced_split):
        """Add a SlicedSplitInfo the read instructions."""
        # TODO(epot): Check that the number of examples per shard % 100 == 0
        # Otherwise the slices value may be unbalanced and not exactly reflect the
        # requested slice.
        split_name = sliced_split.split_info.name
        self._splits[split_name] = sliced_split

    def __add__(self, other):
        """Merging split together."""
        # Will raise error if a split has already be added (NonMutableDict)
        # TODO(epot): If a split is already added but there is no overlap between
        # the slices, should merge the slices (ex: [:10] + [80:])
        merged = SplitReadInstruction()
        for operand in (self, other):
            merged._splits.update(operand._splits)  # pylint: disable=protected-access
        return merged

    def __getitem__(self, slice_value):
        """Sub-splits."""
        # Will raise an error if a split has already been sliced
        sliced = SplitReadInstruction()
        for entry in self._splits.values():
            if entry.slice_value is not None:
                raise ValueError(f"Trying to slice Split {entry.split_info.name} which has already been sliced")
            # Rebuild the entry with the requested slice in place of the empty one.
            fields = dict(entry._asdict(), slice_value=slice_value)
            sliced.add(SlicedSplitInfo(**fields))
        return sliced

    def get_list_sliced_split_info(self):
        """Return the stored SlicedSplitInfo entries as a list."""
        return list(self._splits.values())
517
+
518
+
519
class SplitDict(dict):
    """Mapping from split name to `SplitInfo`, tagged with an optional dataset name."""

    def __init__(self, *args, dataset_name=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.dataset_name = dataset_name

    def __getitem__(self, key: Union[SplitBase, str]):
        """Return the stored `SplitInfo`, or a `SubSplitInfo` resolved from an instruction."""
        # 1st case: The key exists: `info.splits['train']`
        if str(key) in self:
            return super().__getitem__(str(key))
        # 2nd case: Uses instructions: `info.splits['train[50%]']`
        instructions = make_file_instructions(
            name=self.dataset_name,
            split_infos=self.values(),
            instruction=key,
        )
        return SubSplitInfo(instructions)

    def __setitem__(self, key: Union[SplitBase, str], value: SplitInfo):
        """Store `value` under `key`; the key has to equal `value.name`.

        Raises:
            ValueError: If `key` differs from `value.name`.
        """
        if key != value.name:
            raise ValueError(f"Cannot add elem. (key mismatch: '{key}' != '{value.name}')")
        super().__setitem__(key, value)

    def add(self, split_info: SplitInfo):
        """Add the split info.

        The split info's `dataset_name` is overwritten with this dict's `dataset_name`.

        Raises:
            ValueError: If a split with the same name is already present.
        """
        if split_info.name in self:
            raise ValueError(f"Split {split_info.name} already present")
        split_info.dataset_name = self.dataset_name
        super().__setitem__(split_info.name, split_info)

    @property
    def total_num_examples(self):
        """Return the total number of examples."""
        return sum(s.num_examples for s in self.values())

    @classmethod
    def from_split_dict(cls, split_infos: Union[list, dict], dataset_name: Optional[str] = None):
        """Returns a new SplitDict initialized from a Dict or List of `split_infos`.

        Each element may be a plain dict of `SplitInfo` fields or a `SplitInfo` object.
        When `dataset_name` is not given it is taken from the first element, if any.
        """
        if isinstance(split_infos, dict):
            split_infos = list(split_infos.values())

        if dataset_name is None and split_infos:
            first = split_infos[0]
            # The first element can be a `SplitInfo` (e.g. when called from `copy()`),
            # which has no `.get`: read the attribute instead of failing.
            if isinstance(first, dict):
                dataset_name = first.get("dataset_name")
            else:
                dataset_name = getattr(first, "dataset_name", None)

        split_dict = cls(dataset_name=dataset_name)

        for split_info in split_infos:
            if isinstance(split_info, dict):
                split_info = SplitInfo(**split_info)
            split_dict.add(split_info)

        return split_dict

    def to_split_dict(self):
        """Returns a list of deep-copied `SplitInfo`, each named after its key."""
        out = []
        for split_name, split_info in self.items():
            split_info = copy.deepcopy(split_info)
            split_info.name = split_name
            out.append(split_info)
        return out

    def copy(self):
        """Return a new SplitDict built from deep copies of the stored split infos."""
        return SplitDict.from_split_dict(self.to_split_dict(), self.dataset_name)

    def _to_yaml_list(self) -> list:
        """Return the split infos as a list of dicts stripped of YAML-irrelevant keys."""
        out = [asdict(s) for s in self.to_split_dict()]
        for split_info_dict in out:
            # we don't need the shard lengths in YAML, since it depends on max_shard_size and num_proc
            split_info_dict.pop("shard_lengths", None)
            # we don't need the dataset_name attribute that is deprecated
            split_info_dict.pop("dataset_name", None)
        return out

    @classmethod
    def _from_yaml_list(cls, yaml_data: list) -> "SplitDict":
        """Build a SplitDict from the list produced by `_to_yaml_list`."""
        return cls.from_split_dict(yaml_data)
599
+
600
+
601
@dataclass
class SplitGenerator:
    """Describes one split to be produced by a builder's generator.

    This should be used as returned value of
    `GeneratorBasedBuilder._split_generators`.
    See `GeneratorBasedBuilder._split_generators` for more info and example
    of usage.

    Args:
        name (`str`):
            Name of the `Split` for which the generator will
            create the examples.
        gen_kwargs (`dict`):
            Keyword arguments to forward to the `DatasetBuilder._generate_examples` method
            of the builder.

    Example:

    ```py
    >>> datasets.SplitGenerator(
    ...     name=datasets.Split.TRAIN,
    ...     gen_kwargs={"split_key": "train", "files": dl_manager.download_and_extract(url)},
    ... )
    ```
    """

    name: str
    gen_kwargs: dict = dataclasses.field(default_factory=dict)
    # Not an `__init__` argument: filled in by `__post_init__` from `name`.
    split_info: SplitInfo = dataclasses.field(init=False)

    def __post_init__(self):
        split_name = str(self.name)  # Make sure we convert NamedSplits in strings
        self.name = split_name
        NamedSplit(split_name)  # check that it's a valid split name
        self.split_info = SplitInfo(name=split_name)
datasets/streaming.py ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import importlib
2
+ from functools import wraps
3
+ from typing import TYPE_CHECKING, Optional
4
+
5
+ from .download.download_config import DownloadConfig
6
+ from .utils.file_utils import (
7
+ xbasename,
8
+ xdirname,
9
+ xet_parse,
10
+ xexists,
11
+ xgetsize,
12
+ xglob,
13
+ xgzip_open,
14
+ xisdir,
15
+ xisfile,
16
+ xjoin,
17
+ xlistdir,
18
+ xnumpy_load,
19
+ xopen,
20
+ xpandas_read_csv,
21
+ xpandas_read_excel,
22
+ xPath,
23
+ xpyarrow_parquet_read_table,
24
+ xrelpath,
25
+ xsio_loadmat,
26
+ xsplit,
27
+ xsplitext,
28
+ xwalk,
29
+ xxml_dom_minidom_parse,
30
+ )
31
+ from .utils.logging import get_logger
32
+ from .utils.patching import patch_submodule
33
+
34
+
35
+ logger = get_logger(__name__)
36
+
37
+
38
+ if TYPE_CHECKING:
39
+ from .builder import DatasetBuilder
40
+
41
+
42
def extend_module_for_streaming(module_path, download_config: Optional[DownloadConfig] = None):
    """Extend the module to support streaming.

    We patch some functions in the module to use `fsspec` to support data streaming:
    - We use `fsspec.open` to open and read remote files. We patch the module function:
      - `open`
    - We use the "::" hop separator to join paths and navigate remote compressed/archive files. We patch the module
      functions:
      - `os.path.join`
      - `pathlib.Path.joinpath` and `pathlib.Path.__truediv__` (called when using the "/" operator)

    In addition, the following names are patched in the module: `os.listdir`, `os.walk`, `glob.glob`,
    `os.path.dirname`, `os.path.basename`, `os.path.relpath`, `os.path.split`, `os.path.splitext`,
    `os.path.exists`, `os.path.isdir`, `os.path.isfile`, `os.path.getsize`, `pathlib.Path`, `gzip.open`,
    `numpy.load`, `pandas.read_csv`, `pandas.read_excel`, `scipy.io.loadmat`, `xml.etree.ElementTree.parse`,
    `xml.dom.minidom.parse` and, except for modules under `datasets.packaged_modules.`,
    `pyarrow.parquet.read_table`.

    The patched functions are replaced with custom functions defined to work with the
    :class:`~download.streaming_download_manager.StreamingDownloadManager`.

    Calling this function again on an already patched module does not re-apply the patches: it only refreshes
    the `token` and `storage_options` of the stored download config (when a new config is given).

    Args:
        module_path: Path to the module to be extended.
        download_config: Mainly use `token` or `storage_options` to support different platforms and auth types.
    """

    module = importlib.import_module(module_path)

    # TODO(QL): always update the module to add subsequent new authentication without removing old ones
    patched_config = getattr(module, "_patched_for_streaming", None)
    if patched_config:
        # The stored config is the very object captured by the `wrap_auth` wrappers installed by the first call,
        # so updating it in place changes what the already-installed patches pass along.
        # `download_config` is optional: without the `None` check this branch raised AttributeError.
        if isinstance(patched_config, DownloadConfig) and download_config is not None:
            patched_config.token = download_config.token
            patched_config.storage_options = download_config.storage_options
        return

    def wrap_auth(function):
        """Return `function` wrapped so that `download_config` is always passed as a keyword argument."""

        @wraps(function)
        def wrapper(*args, **kwargs):
            return function(*args, download_config=download_config, **kwargs)

        wrapper._decorator_name_ = "wrap_auth"
        return wrapper

    # open files in a streaming fashion
    patch_submodule(module, "open", wrap_auth(xopen)).start()
    patch_submodule(module, "os.listdir", wrap_auth(xlistdir)).start()
    patch_submodule(module, "os.walk", wrap_auth(xwalk)).start()
    patch_submodule(module, "glob.glob", wrap_auth(xglob)).start()
    # allow to navigate in remote zip files
    patch_submodule(module, "os.path.join", xjoin).start()
    patch_submodule(module, "os.path.dirname", xdirname).start()
    patch_submodule(module, "os.path.basename", xbasename).start()
    patch_submodule(module, "os.path.relpath", xrelpath).start()
    patch_submodule(module, "os.path.split", xsplit).start()
    patch_submodule(module, "os.path.splitext", xsplitext).start()
    # allow checks on paths
    patch_submodule(module, "os.path.exists", wrap_auth(xexists)).start()
    patch_submodule(module, "os.path.isdir", wrap_auth(xisdir)).start()
    patch_submodule(module, "os.path.isfile", wrap_auth(xisfile)).start()
    patch_submodule(module, "os.path.getsize", wrap_auth(xgetsize)).start()
    patch_submodule(module, "pathlib.Path", xPath).start()
    # file readers
    patch_submodule(module, "gzip.open", wrap_auth(xgzip_open)).start()
    patch_submodule(module, "numpy.load", wrap_auth(xnumpy_load)).start()
    patch_submodule(module, "pandas.read_csv", wrap_auth(xpandas_read_csv), attrs=["__version__"]).start()
    patch_submodule(module, "pandas.read_excel", wrap_auth(xpandas_read_excel), attrs=["__version__"]).start()
    patch_submodule(module, "scipy.io.loadmat", wrap_auth(xsio_loadmat), attrs=["__version__"]).start()
    patch_submodule(module, "xml.etree.ElementTree.parse", wrap_auth(xet_parse)).start()
    patch_submodule(module, "xml.dom.minidom.parse", wrap_auth(xxml_dom_minidom_parse)).start()
    # pyarrow: do not patch pyarrow attribute in packaged modules
    if not module.__name__.startswith("datasets.packaged_modules."):
        patch_submodule(module, "pyarrow.parquet.read_table", wrap_auth(xpyarrow_parquet_read_table)).start()
    # Remember the config used for patching; a truthy value marks the module as already patched
    module._patched_for_streaming = download_config
108
+
109
+
110
def extend_dataset_builder_for_streaming(builder: "DatasetBuilder"):
    """Patch the builder's module, and the modules of its parent builders, for streaming.

    Args:
        builder (:class:`DatasetBuilder`): Dataset builder instance.
    """
    # One download config, built from the builder's credentials, is shared by every patched module
    download_config = DownloadConfig(storage_options=builder.storage_options, token=builder.token)
    # this extends the open and os.path.join functions for data streaming
    extend_module_for_streaming(builder.__module__, download_config=download_config)

    # builders can inherit from other builders that might use streaming functionality
    # (for example, ImageFolder and AudioFolder inherit from FolderBuilder which implements examples generation)
    # but these parents builders are not patched automatically as they are not instantiated, so we patch them here
    from .builder import DatasetBuilder

    standard_builder_module = DatasetBuilder.__module__
    # skip the first MRO entry: it is the builder's own class, whose module was patched above
    for ancestor in type(builder).__mro__[1:]:
        if not issubclass(ancestor, DatasetBuilder):
            continue
        # standard builders from datasets.builder are not patched
        if ancestor.__module__ == standard_builder_module:
            continue
        extend_module_for_streaming(ancestor.__module__, download_config=download_config)
datasets/table.py ADDED
@@ -0,0 +1,2385 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ import os
3
+ from collections.abc import Iterator
4
+ from functools import partial
5
+ from itertools import groupby
6
+ from typing import TYPE_CHECKING, Any, Callable, Optional, TypeVar, Union
7
+
8
+ import numpy as np
9
+ import pyarrow as pa
10
+ import pyarrow.compute as pc
11
+
12
+ from .utils.logging import get_logger
13
+
14
+
15
+ if TYPE_CHECKING:
16
+ from .features.features import Features, FeatureType
17
+
18
+
19
+ logger = get_logger(__name__)
20
+
21
+
22
def inject_arrow_table_documentation(arrow_table_method):
    """Build a decorator that reuses the documentation of `arrow_table_method`.

    The decorated function's docstring becomes the docstring of `arrow_table_method` followed by its own
    docstring (if any), with every occurrence of "pyarrow.Table" replaced by "Table". When
    `arrow_table_method` exposes `__annotations__`, they are assigned to the decorated function as well.

    A missing docstring on either side is treated as an empty string, so the decorator also works when
    docstrings are unavailable (for instance when they have been stripped).

    Args:
        arrow_table_method: The callable whose docstring and annotations are reused.

    Returns:
        A decorator that updates the function in place and returns that same function.
    """

    def wrapper(fn):
        inherited_doc = arrow_table_method.__doc__ or ""
        own_doc = fn.__doc__ or ""
        fn.__doc__ = (inherited_doc + own_doc).replace("pyarrow.Table", "Table")
        if hasattr(arrow_table_method, "__annotations__"):
            fn.__annotations__ = arrow_table_method.__annotations__
        return fn

    return wrapper
31
+
32
+
33
def _in_memory_arrow_table_from_file(filename: str) -> pa.Table:
    """Read the whole Arrow IPC stream stored in `filename` and return it as a table.

    Args:
        filename (`str`): Path of the file containing the Arrow IPC stream.

    Returns:
        `pyarrow.Table`: the table obtained by reading every record batch of the stream.
    """
    # The table is fully read before leaving the `with` block, so the input stream can be
    # closed deterministically instead of waiting for garbage collection.
    with pa.input_stream(filename) as in_memory_stream:
        opened_stream = pa.ipc.open_stream(in_memory_stream)
        pa_table = opened_stream.read_all()
    return pa_table
38
+
39
+
40
def _in_memory_arrow_table_from_buffer(buffer: pa.Buffer) -> pa.Table:
    """Read the whole Arrow IPC stream contained in `buffer` and return it as a table."""
    reader = pa.ipc.open_stream(pa.BufferReader(buffer))
    return reader.read_all()
45
+
46
+
47
def _memory_mapped_record_batch_reader_from_file(filename: str) -> pa.RecordBatchStreamReader:
    """Open `filename` as a memory map and return an Arrow IPC stream reader over it."""
    return pa.ipc.open_stream(pa.memory_map(filename))
50
+
51
+
52
def read_schema_from_file(filename: str) -> pa.Schema:
    """
    Infer arrow table schema from file without loading whole file into memory.
    Useful especially while having very big files.

    The file is memory-mapped for the duration of the call and only the schema of its IPC stream is read.
    """
    with pa.memory_map(filename) as mapped_file:
        return pa.ipc.open_stream(mapped_file).schema
60
+
61
+
62
def _memory_mapped_arrow_table_from_file(filename: str) -> pa.Table:
    """Return the table read from the memory-mapped Arrow IPC stream stored in `filename`."""
    return _memory_mapped_record_batch_reader_from_file(filename).read_all()
66
+
67
+
68
+ def _deepcopy(x, memo: dict):
69
+ """deepcopy a regular class instance"""
70
+ cls = x.__class__
71
+ result = cls.__new__(cls)
72
+ memo[id(x)] = result
73
+ for k, v in x.__dict__.items():
74
+ setattr(result, k, copy.deepcopy(v, memo))
75
+ return result
76
+
77
+
78
+ def _interpolation_search(arr: list[int], x: int) -> int:
79
+ """
80
+ Return the position i of a sorted array so that arr[i] <= x < arr[i+1]
81
+
82
+ Args:
83
+ arr (`List[int]`): non-empty sorted list of integers
84
+ x (`int`): query
85
+
86
+ Returns:
87
+ `int`: the position i so that arr[i] <= x < arr[i+1]
88
+
89
+ Raises:
90
+ `IndexError`: if the array is empty or if the query is outside the array values
91
+ """
92
+ i, j = 0, len(arr) - 1
93
+ while i < j and arr[i] <= x < arr[j]:
94
+ k = i + ((j - i) * (x - arr[i]) // (arr[j] - arr[i]))
95
+ if arr[k] <= x < arr[k + 1]:
96
+ return k
97
+ elif arr[k] < x:
98
+ i, j = k + 1, j
99
+ else:
100
+ i, j = i, k
101
+ raise IndexError(f"Invalid query '{x}' for size {arr[-1] if len(arr) else 'none'}.")
102
+
103
+
104
class IndexedTableMixin:
    """Index over the non-empty record batches of a pyarrow table.

    Keeps the table schema, its non-empty record batches and the cumulative row offsets of
    those batches, which `fast_gather` and `fast_slice` use to locate rows.
    """

    def __init__(self, table: pa.Table):
        non_empty_batches = [batch for batch in table.to_batches() if len(batch) > 0]
        self._schema: pa.Schema = table.schema
        self._batches: list[pa.RecordBatch] = non_empty_batches
        # _offsets[k] is the index of the first row of batch k; the last entry is the total row count
        self._offsets: np.ndarray = np.cumsum([0, *map(len, non_empty_batches)], dtype=np.int64)

    def fast_gather(self, indices: Union[list[int], np.ndarray]) -> pa.Table:
        """
        Create a pa.Table by gathering the records at the specified indices. Should be faster
        than pa.concat_tables(table.fast_slice(int(i) % table.num_rows, 1) for i in indices) since NumPy can compute
        the binary searches in parallel, highly optimized C
        """
        if not len(indices):
            raise ValueError("Indices must be non-empty")
        # for each requested row, find the batch that contains it
        batch_indices = np.searchsorted(self._offsets, indices, side="right") - 1
        single_rows = []
        for batch_idx, row_idx in zip(batch_indices, indices):
            single_rows.append(self._batches[batch_idx].slice(row_idx - self._offsets[batch_idx], 1))
        return pa.Table.from_batches(single_rows, schema=self._schema)

    def fast_slice(self, offset=0, length=None) -> pa.Table:
        """
        Slice the Table using interpolation search.
        The behavior is the same as `pyarrow.Table.slice` but it's significantly faster.

        Interpolation search is used to find the start and end indexes of the batches we want to keep.
        The batches to keep are then concatenated to form the sliced Table.
        """
        if offset < 0:
            raise IndexError("Offset must be non-negative")
        total_rows = self._offsets[-1]
        if offset >= total_rows or (length is not None and length <= 0):
            return pa.Table.from_batches([], schema=self._schema)
        first = _interpolation_search(self._offsets, offset)
        if length is None or length + offset >= total_rows:
            # keep everything from the first batch onwards (list slicing gives a new list)
            kept = self._batches[first:]
        else:
            stop = offset + length
            last = _interpolation_search(self._offsets, stop - 1)
            kept = self._batches[first : last + 1]
            # trim the tail of the last batch before trimming the head of the first one
            kept[-1] = kept[-1].slice(0, stop - self._offsets[last])
        kept[0] = kept[0].slice(offset - self._offsets[first])
        return pa.Table.from_batches(kept, schema=self._schema)
151
+
152
+
153
class Table(IndexedTableMixin):
    """
    Wraps a pyarrow Table by using composition.
    This is the base class for `InMemoryTable`, `MemoryMappedTable` and `ConcatenationTable`.

    It implements all the basic attributes/methods of the pyarrow Table class except
    the Table transforms: `slice, filter, flatten, combine_chunks, cast, add_column,
    append_column, remove_column, set_column, rename_columns` and `drop`.

    The implementation of these methods differs for the subclasses.
    """

    def __init__(self, table: pa.Table):
        super().__init__(table)
        self.table = table

    def __deepcopy__(self, memo: dict):
        # arrow tables are immutable, so there's no need to copy self.table
        # moreover calling deepcopy on a pyarrow table seems to make pa.total_allocated_bytes() decrease for some reason
        # by adding it to the memo, self.table won't be copied
        memo[id(self.table)] = self.table
        # same for the recordbatches used by the index
        memo[id(self._batches)] = list(self._batches)
        return _deepcopy(self, memo)

    def validate(self, *args, **kwargs):
        """
        Perform validation checks. An exception is raised if validation fails.

        By default only cheap validation checks are run. Pass `full=True`
        for thorough validation checks (potentially `O(n)`).

        Args:
            full (`bool`, defaults to `False`):
                If `True`, run expensive checks, otherwise cheap checks only.

        Raises:
            `pa.lib.ArrowInvalid`: if validation fails
        """
        return self.table.validate(*args, **kwargs)

    def equals(self, *args, **kwargs):
        """
        Check if contents of two tables are equal.

        Any argument that is a [`~datasets.table.Table`], whether passed positionally or by keyword,
        is replaced by the pyarrow table it wraps before the comparison is delegated to the
        wrapped pyarrow table.

        Args:
            other ([`~datasets.table.Table`]):
                Table to compare against.
            check_metadata (`bool`, defaults to `False`):
                Whether schema metadata equality should be checked as well.

        Returns:
            `bool`
        """
        args = tuple(arg.table if isinstance(arg, Table) else arg for arg in args)
        # iterate over the (name, value) pairs: iterating over the dict itself only yields the names
        kwargs = {k: v.table if isinstance(v, Table) else v for k, v in kwargs.items()}
        return self.table.equals(*args, **kwargs)

    def to_batches(self, *args, **kwargs):
        """
        Convert Table to list of (contiguous) `RecordBatch` objects.

        Args:
            max_chunksize (`int`, defaults to `None`):
                Maximum size for `RecordBatch` chunks. Individual chunks may be
                smaller depending on the chunk layout of individual columns.

        Returns:
            `List[pyarrow.RecordBatch]`
        """
        return self.table.to_batches(*args, **kwargs)

    def to_pydict(self, *args, **kwargs):
        """
        Convert the Table to a `dict` or `OrderedDict`.

        Returns:
            `dict`
        """
        return self.table.to_pydict(*args, **kwargs)

    def to_pylist(self, *args, **kwargs):
        """
        Convert the Table to a list

        Returns:
            `list`
        """
        return self.table.to_pylist(*args, **kwargs)

    def to_pandas(self, *args, **kwargs):
        """
        Convert to a pandas-compatible NumPy array or DataFrame, as appropriate.

        Args:
            memory_pool (`MemoryPool`, defaults to `None`):
                Arrow MemoryPool to use for allocations. Uses the default memory
                pool if not passed.
            strings_to_categorical (`bool`, defaults to `False`):
                Encode string (UTF8) and binary types to `pandas.Categorical`.
            categories (`list`, defaults to `empty`):
                List of fields that should be returned as `pandas.Categorical`. Only
                applies to table-like data structures.
            zero_copy_only (`bool`, defaults to `False`):
                Raise an `ArrowException` if this function call would require copying
                the underlying data.
            integer_object_nulls (`bool`, defaults to `False`):
                Cast integers with nulls to objects.
            date_as_object (`bool`, defaults to `True`):
                Cast dates to objects. If `False`, convert to `datetime64[ns]` dtype.
            timestamp_as_object (`bool`, defaults to `False`):
                Cast non-nanosecond timestamps (`np.datetime64`) to objects. This is
                useful if you have timestamps that don't fit in the normal date
                range of nanosecond timestamps (1678 CE-2262 CE).
                If `False`, all timestamps are converted to `datetime64[ns]` dtype.
            use_threads (`bool`, defaults to `True`):
                Whether to parallelize the conversion using multiple threads.
            deduplicate_objects (`bool`, defaults to `False`):
                Do not create multiple copies of Python objects when created, to save
                on memory use. Conversion will be slower.
            ignore_metadata (`bool`, defaults to `False`):
                If `True`, do not use the 'pandas' metadata to reconstruct the
                DataFrame index, if present.
            safe (`bool`, defaults to `True`):
                For certain data types, a cast is needed in order to store the
                data in a pandas DataFrame or Series (e.g. timestamps are always
                stored as nanoseconds in pandas). This option controls whether it
                is a safe cast or not.
            split_blocks (`bool`, defaults to `False`):
                If `True`, generate one internal "block" for each column when
                creating a pandas.DataFrame from a `RecordBatch` or `Table`. While this
                can temporarily reduce memory note that various pandas operations
                can trigger "consolidation" which may balloon memory use.
            self_destruct (`bool`, defaults to `False`):
                EXPERIMENTAL: If `True`, attempt to deallocate the originating Arrow
                memory while converting the Arrow object to pandas. If you use the
                object after calling `to_pandas` with this option it will crash your
                program.
            types_mapper (`function`, defaults to `None`):
                A function mapping a pyarrow DataType to a pandas `ExtensionDtype`.
                This can be used to override the default pandas type for conversion
                of built-in pyarrow types or in absence of `pandas_metadata` in the
                Table schema. The function receives a pyarrow DataType and is
                expected to return a pandas `ExtensionDtype` or `None` if the
                default conversion should be used for that type. If you have
                a dictionary mapping, you can pass `dict.get` as function.

        Returns:
            `pandas.Series` or `pandas.DataFrame`: `pandas.Series` or `pandas.DataFrame` depending on type of object
        """
        return self.table.to_pandas(*args, **kwargs)

    def to_string(self, *args, **kwargs):
        """
        Return the result of `to_string` of the wrapped pyarrow table.

        All arguments are forwarded unchanged to the wrapped table.
        """
        return self.table.to_string(*args, **kwargs)

    def to_reader(self, max_chunksize: Optional[int] = None):
        """
        Convert the Table to a RecordBatchReader.

        Note that this method is zero-copy, it merely exposes the same data under a different API.

        Args:
            max_chunksize (`int`, defaults to `None`):
                Maximum size for RecordBatch chunks. Individual chunks may be smaller depending
                on the chunk layout of individual columns.

        Returns:
            `pyarrow.RecordBatchReader`
        """
        return self.table.to_reader(max_chunksize=max_chunksize)

    def field(self, *args, **kwargs):
        """
        Select a schema field by its column name or numeric index.

        Args:
            i (`Union[int, str]`):
                The index or name of the field to retrieve.

        Returns:
            `pyarrow.Field`
        """
        return self.table.field(*args, **kwargs)

    def column(self, *args, **kwargs):
        """
        Select a column by its column name, or numeric index.

        Args:
            i (`Union[int, str]`):
                The index or name of the column to retrieve.

        Returns:
            `pyarrow.ChunkedArray`
        """
        return self.table.column(*args, **kwargs)

    def itercolumns(self, *args, **kwargs):
        """
        Iterator over all columns in their numerical order.

        Yields:
            `pyarrow.ChunkedArray`
        """
        return self.table.itercolumns(*args, **kwargs)

    @property
    def schema(self):
        """
        Schema of the table and its columns.

        Returns:
            `pyarrow.Schema`
        """
        return self.table.schema

    @property
    def columns(self):
        """
        List of all columns in numerical order.

        Returns:
            `List[pa.ChunkedArray]`
        """
        return self.table.columns

    @property
    def num_columns(self):
        """
        Number of columns in this table.

        Returns:
            int
        """
        return self.table.num_columns

    @property
    def num_rows(self):
        """
        Number of rows in this table.

        Due to the definition of a table, all columns have the same number of
        rows.

        Returns:
            int
        """
        return self.table.num_rows

    @property
    def shape(self):
        """
        Dimensions of the table: (#rows, #columns).

        Returns:
            `(int, int)`: Number of rows and number of columns.
        """
        return self.table.shape

    @property
    def nbytes(self):
        """
        Total number of bytes consumed by the elements of the table.
        """
        return self.table.nbytes

    @property
    def column_names(self):
        """
        Names of the table's columns.
        """
        return self.table.column_names

    def __eq__(self, other):
        return self.equals(other)

    def __getitem__(self, i):
        return self.table[i]

    def __len__(self):
        return len(self.table)

    def __repr__(self):
        # show the wrapper class name in place of "pyarrow.Table" in the pyarrow representation
        return self.table.__repr__().replace("pyarrow.Table", self.__class__.__name__)

    def __str__(self):
        return self.table.__str__().replace("pyarrow.Table", self.__class__.__name__)

    def slice(self, *args, **kwargs):
        """
        Compute zero-copy slice of this Table.

        Not implemented in this base class.

        Args:
            offset (`int`, defaults to `0`):
                Offset from start of table to slice.
            length (`int`, defaults to `None`):
                Length of slice (default is until end of table starting from
                offset).

        Returns:
            `datasets.table.Table`
        """
        raise NotImplementedError()

    def filter(self, *args, **kwargs):
        """
        Select records from a Table. See `pyarrow.compute.filter` for full usage.

        Not implemented in this base class.
        """
        raise NotImplementedError()

    def flatten(self, *args, **kwargs):
        """
        Flatten this Table. Each column with a struct type is flattened
        into one column per struct field. Other columns are left unchanged.

        Not implemented in this base class.

        Args:
            memory_pool (`MemoryPool`, defaults to `None`):
                For memory allocations, if required, otherwise use default pool.

        Returns:
            `datasets.table.Table`
        """
        raise NotImplementedError()

    def combine_chunks(self, *args, **kwargs):
        """
        Make a new table by combining the chunks this table has.

        All the underlying chunks in the `ChunkedArray` of each column are
        concatenated into zero or one chunk.

        Not implemented in this base class.

        Args:
            memory_pool (`MemoryPool`, defaults to `None`):
                For memory allocations, if required, otherwise use default pool.

        Returns:
            `datasets.table.Table`
        """
        raise NotImplementedError()

    def cast(self, *args, **kwargs):
        """
        Cast table values to another schema.

        Not implemented in this base class.

        Args:
            target_schema (`Schema`):
                Schema to cast to, the names and order of fields must match.
            safe (`bool`, defaults to `True`):
                Check for overflows or other unsafe conversions.

        Returns:
            `datasets.table.Table`
        """
        raise NotImplementedError()

    def replace_schema_metadata(self, *args, **kwargs):
        """
        EXPERIMENTAL: Create shallow copy of table by replacing schema
        key-value metadata with the indicated new metadata (which may be `None`,
        which deletes any existing metadata).

        Not implemented in this base class.

        Args:
            metadata (`dict`, defaults to `None`):

        Returns:
            `datasets.table.Table`: shallow_copy
        """
        raise NotImplementedError()

    def add_column(self, *args, **kwargs):
        """
        Add column to Table at position.

        A new table is returned with the column added, the original table
        object is left unchanged.

        Not implemented in this base class.

        Args:
            i (`int`):
                Index to place the column at.
            field_ (`Union[str, pyarrow.Field]`):
                If a string is passed then the type is deduced from the column
                data.
            column (`Union[pyarrow.Array, List[pyarrow.Array]]`):
                Column data.

        Returns:
            `datasets.table.Table`: New table with the passed column added.
        """
        raise NotImplementedError()

    def append_column(self, *args, **kwargs):
        """
        Append column at end of columns.

        Not implemented in this base class.

        Args:
            field_ (`Union[str, pyarrow.Field]`):
                If a string is passed then the type is deduced from the column
                data.
            column (`Union[pyarrow.Array, List[pyarrow.Array]]`):
                Column data.

        Returns:
            `datasets.table.Table`: New table with the passed column added.
        """
        raise NotImplementedError()

    def remove_column(self, *args, **kwargs):
        """
        Create new Table with the indicated column removed.

        Not implemented in this base class.

        Args:
            i (`int`):
                Index of column to remove.

        Returns:
            `datasets.table.Table`: New table without the column.
        """
        raise NotImplementedError()

    def set_column(self, *args, **kwargs):
        """
        Replace column in Table at position.

        Not implemented in this base class.

        Args:
            i (`int`):
                Index to place the column at.
            field_ (`Union[str, pyarrow.Field]`):
                If a string is passed then the type is deduced from the column
                data.
            column (`Union[pyarrow.Array, List[pyarrow.Array]]`):
                Column data.

        Returns:
            `datasets.table.Table`: New table with the passed column set.
        """
        raise NotImplementedError()

    def rename_columns(self, *args, **kwargs):
        """
        Create new table with columns renamed to provided names.

        Not implemented in this base class.
        """
        raise NotImplementedError()

    def drop(self, *args, **kwargs):
        """
        Drop one or more columns and return a new table.

        Not implemented in this base class.

        Args:
            columns (`List[str]`):
                List of field names referencing existing columns.

        Raises:
            `KeyError` : if any of the passed columns name are not existing.

        Returns:
            `datasets.table.Table`: New table without the columns.
        """
        raise NotImplementedError()

    def select(self, *args, **kwargs):
        """
        Select columns of the table.

        Returns a new table with the specified columns, and metadata preserved.

        Not implemented in this base class.

        Args:
            columns (:obj:`Union[List[str], List[int]]`):
                The column names or integer indices to select.

        Returns:
            `datasets.table.Table`: table with only a subset of the columns
        """
        raise NotImplementedError()
626
+
627
+
628
class TableBlock(Table):
    """
    `TableBlock` is the only kind of table allowed inside a `ConcatenationTable`.
    Only `MemoryMappedTable` and `InMemoryTable` are `TableBlock`.
    This is because we don't want a `ConcatenationTable` made out of other `ConcatenationTable` objects.
    """
636
+
637
+
638
class InMemoryTable(TableBlock):
    """
    Table block whose data lives in the user's RAM.

    Each method simply calls the matching method of the wrapped pyarrow Table and
    wraps the result in a new `InMemoryTable`. Pickling an `InMemoryTable` copies
    all of its data.

    A `MemoryMappedTable` behaves differently: pickling it does not copy the data,
    and unpickling reloads the table from the disk instead.

    Use `InMemoryTable` when the data fit in memory; keep `MemoryMappedTable` for
    data bigger than memory or when the application's memory footprint must stay low.
    """

    @classmethod
    def from_file(cls, filename: str):
        """Build a table from `filename` using `_in_memory_arrow_table_from_file`."""
        return cls(_in_memory_arrow_table_from_file(filename))

    @classmethod
    def from_buffer(cls, buffer: pa.Buffer):
        """Build a table from `buffer` using `_in_memory_arrow_table_from_buffer`."""
        return cls(_in_memory_arrow_table_from_buffer(buffer))

    @classmethod
    def from_pandas(cls, *args, **kwargs):
        """
        Convert a pandas.DataFrame to a table.

        All arguments are forwarded to `pyarrow.Table.from_pandas`.

        Column types are inferred from the dtypes of the DataFrame's Series. For
        non-object Series the NumPy dtype is translated to its Arrow equivalent;
        for `object` Series the type is guessed from the Python objects they hold.

        An `object` Series does not always carry enough information to produce a
        meaningful Arrow type. When no type can be inferred (e.g. the DataFrame has
        length 0, or the Series only contains `None/nan`), the type is set to null.
        Pass an explicit schema to avoid this.

        Args:
            df (`pandas.DataFrame`):
                The DataFrame to convert.
            schema (`pyarrow.Schema`, *optional*):
                Expected schema of the Arrow Table, useful when column types cannot
                be inferred automatically. If passed, the output has exactly this
                schema. Columns named in the schema but missing from the DataFrame
                columns or index raise an error; DataFrame columns or index levels
                absent from the schema are ignored.
            preserve_index (`bool`, *optional*):
                Whether to store the index as an additional column. The default
                `None` stores the index as a column, except for a RangeIndex which
                is stored as metadata only. Use `preserve_index=True` to force it
                to be stored as a column.
            nthreads (`int`, defaults to `None` (may use up to system CPU count threads)):
                If greater than 1, convert columns to Arrow in parallel using the
                indicated number of threads.
            columns (`List[str]`, *optional*):
                Columns to convert. If `None`, use all columns.
            safe (`bool`, defaults to `True`):
                Check for overflows or other unsafe conversions.

        Returns:
            `datasets.table.Table`:

        Examples:
        ```python
        >>> import pandas as pd
        >>> import pyarrow as pa
        >>> df = pd.DataFrame({
            ...     'int': [1, 2],
            ...     'str': ['a', 'b']
            ... })
        >>> pa.Table.from_pandas(df)
        <pyarrow.lib.Table object at 0x7f05d1fb1b40>
        ```
        """
        pa_table = pa.Table.from_pandas(*args, **kwargs)
        return cls(pa_table)

    @classmethod
    def from_arrays(cls, *args, **kwargs):
        """
        Build a table from Arrow arrays.

        All arguments are forwarded to `pyarrow.Table.from_arrays`.

        Args:
            arrays (`List[Union[pyarrow.Array, pyarrow.ChunkedArray]]`):
                Equal-length arrays that should form the table.
            names (`List[str]`, *optional*):
                Names for the table columns. If not passed, `schema` must be passed.
            schema (`Schema`, defaults to `None`):
                Schema for the created table. If not passed, `names` must be passed.
            metadata (`Union[dict, Mapping]`, defaults to `None`):
                Optional metadata for the schema (if inferred).

        Returns:
            `datasets.table.Table`
        """
        pa_table = pa.Table.from_arrays(*args, **kwargs)
        return cls(pa_table)

    @classmethod
    def from_pydict(cls, *args, **kwargs):
        """
        Build a table from a mapping of column names to Arrow arrays or Python lists.

        All arguments are forwarded to `pyarrow.Table.from_pydict`.

        Args:
            mapping (`Union[dict, Mapping]`):
                A mapping of strings to Arrays or Python lists.
            schema (`Schema`, defaults to `None`):
                If not passed, it is inferred from the mapping values.
            metadata (`Union[dict, Mapping]`, defaults to `None`):
                Optional metadata for the schema (if inferred).

        Returns:
            `datasets.table.Table`
        """
        pa_table = pa.Table.from_pydict(*args, **kwargs)
        return cls(pa_table)

    @classmethod
    def from_pylist(cls, mapping, *args, **kwargs):
        """
        Build a table from a list of rows, each row being a dictionary.

        All arguments are forwarded to `pyarrow.Table.from_pylist`.

        Args:
            mapping (`List[dict]`):
                The rows; each dictionary maps column names to row values.
            schema (`Schema`, defaults to `None`):
                If not passed, it is inferred from the row values.
            metadata (`Union[dict, Mapping]`, defaults to `None`):
                Optional metadata for the schema (if inferred).

        Returns:
            `datasets.table.Table`
        """
        pa_table = pa.Table.from_pylist(mapping, *args, **kwargs)
        return cls(pa_table)

    @classmethod
    def from_batches(cls, *args, **kwargs):
        """
        Build a table from a sequence or iterator of Arrow `RecordBatches`.

        All arguments are forwarded to `pyarrow.Table.from_batches`.

        Args:
            batches (`Union[Sequence[pyarrow.RecordBatch], Iterator[pyarrow.RecordBatch]]`):
                Sequence of `RecordBatch` to be converted, all schemas must be equal.
            schema (`Schema`, defaults to `None`):
                If not passed, it is inferred from the first `RecordBatch`.

        Returns:
            `datasets.table.Table`:
        """
        pa_table = pa.Table.from_batches(*args, **kwargs)
        return cls(pa_table)

    def slice(self, offset=0, length=None):
        """
        Compute a zero-copy slice of this table.

        Args:
            offset (`int`, defaults to `0`):
                Offset from start of table to slice.
            length (`int`, defaults to `None`):
                Length of slice (default is until end of table starting from
                offset).

        Returns:
            `datasets.table.Table`
        """
        # Slicing goes through `fast_slice` rather than the pyarrow `slice` method
        sliced = self.fast_slice(offset=offset, length=length)
        return InMemoryTable(sliced)

    def filter(self, *args, **kwargs):
        """
        Select records from the table. See `pyarrow.compute.filter` for full usage.
        """
        filtered = self.table.filter(*args, **kwargs)
        return InMemoryTable(filtered)

    def flatten(self, *args, **kwargs):
        """
        Flatten this table: every struct-typed column becomes one column per
        struct field, and the other columns are left unchanged.

        Args:
            memory_pool (`MemoryPool`, defaults to `None`):
                For memory allocations, if required, otherwise use default pool.

        Returns:
            `datasets.table.Table`
        """
        flattened = table_flatten(self.table, *args, **kwargs)
        return InMemoryTable(flattened)

    def combine_chunks(self, *args, **kwargs):
        """
        Make a new table by combining the chunks this table has.

        The chunks of each column's `ChunkedArray` are concatenated into zero
        or one chunk.

        Args:
            memory_pool (`MemoryPool`, defaults to `None`):
                For memory allocations, if required, otherwise use default pool.

        Returns:
            `datasets.table.Table`
        """
        combined = self.table.combine_chunks(*args, **kwargs)
        return InMemoryTable(combined)

    def cast(self, *args, **kwargs):
        """
        Cast table values to another schema.

        Args:
            target_schema (`Schema`):
                Schema to cast to, the names and order of fields must match.
            safe (`bool`, defaults to `True`):
                Check for overflows or other unsafe conversions.

        Returns:
            `datasets.table.Table`
        """
        casted = table_cast(self.table, *args, **kwargs)
        return InMemoryTable(casted)

    def replace_schema_metadata(self, *args, **kwargs):
        """
        EXPERIMENTAL: Create a shallow copy of the table whose schema key-value
        metadata is replaced with the given metadata (which may be `None`, in
        which case any existing metadata is deleted).

        Args:
            metadata (`dict`, defaults to `None`):

        Returns:
            `datasets.table.Table`: shallow_copy
        """
        updated = self.table.replace_schema_metadata(*args, **kwargs)
        return InMemoryTable(updated)

    def add_column(self, *args, **kwargs):
        """
        Add a column to the table at the given position.

        A new table is returned with the column added; the original table
        object is left unchanged.

        Args:
            i (`int`):
                Index to place the column at.
            field_ (`Union[str, pyarrow.Field]`):
                If a string is passed then the type is deduced from the column
                data.
            column (`Union[pyarrow.Array, List[pyarrow.Array]]`):
                Column data.

        Returns:
            `datasets.table.Table`: New table with the passed column added.
        """
        updated = self.table.add_column(*args, **kwargs)
        return InMemoryTable(updated)

    def append_column(self, *args, **kwargs):
        """
        Append a column after the existing columns.

        Args:
            field_ (`Union[str, pyarrow.Field]`):
                If a string is passed then the type is deduced from the column
                data.
            column (`Union[pyarrow.Array, List[pyarrow.Array]]`):
                Column data.

        Returns:
            `datasets.table.Table`:
                New table with the passed column added.
        """
        updated = self.table.append_column(*args, **kwargs)
        return InMemoryTable(updated)

    def remove_column(self, *args, **kwargs):
        """
        Create a new table with the indicated column removed.

        Args:
            i (`int`):
                Index of column to remove.

        Returns:
            `datasets.table.Table`:
                New table without the column.
        """
        updated = self.table.remove_column(*args, **kwargs)
        return InMemoryTable(updated)

    def set_column(self, *args, **kwargs):
        """
        Replace the column at the given position.

        Args:
            i (`int`):
                Index to place the column at.
            field_ (`Union[str, pyarrow.Field]`):
                If a string is passed then the type is deduced from the column
                data.
            column (`Union[pyarrow.Array, List[pyarrow.Array]]`):
                Column data.

        Returns:
            `datasets.table.Table`:
                New table with the passed column set.
        """
        updated = self.table.set_column(*args, **kwargs)
        return InMemoryTable(updated)

    def rename_columns(self, *args, **kwargs):
        """
        Create a new table with columns renamed to the provided names.
        """
        renamed = self.table.rename_columns(*args, **kwargs)
        return InMemoryTable(renamed)

    def drop(self, *args, **kwargs):
        """
        Drop one or more columns and return a new table.

        Args:
            columns (`List[str]`):
                List of field names referencing existing columns.

        Raises:
            `KeyError` : if any of the passed column names does not exist.

        Returns:
            `datasets.table.Table`:
                New table without the columns.
        """
        remaining = self.table.drop(*args, **kwargs)
        return InMemoryTable(remaining)

    def select(self, *args, **kwargs):
        """
        Select columns of the table.

        Returns a new table with the specified columns, and metadata preserved.

        Args:
            columns (:obj:`Union[List[str], List[int]]`):
                The column names or integer indices to select.

        Returns:
            :class:`datasets.table.Table`: New table with the specified columns, and metadata preserved.
        """
        selected = self.table.select(*args, **kwargs)
        return InMemoryTable(selected)
983
+
984
+
985
+ # A replay is a (method name, args, kwargs) triple. The MemoryMappedTable records replays so that the same transforms can be re-applied when the table is reloaded from the disk
986
+ Replay = tuple[str, tuple, dict]
987
+
988
+
989
class MemoryMappedTable(TableBlock):
    """
    Table block backed by a memory-mapped Arrow file: the data is loaded from the
    disk instead of occupying the user's RAM.

    Pickling it does not copy the data into memory. Only the path to the memory
    mapped arrow file is pickled, together with the list of transforms to "replay"
    when the table is reloaded from the disk.

    To make this possible, the history of every transform applied to the underlying
    pyarrow Table is stored in `replays`, so the transforms can be re-applied after
    reloading.

    An `InMemoryTable` behaves differently: pickling it copies all the data.

    Use `InMemoryTable` when the data fit in memory; keep `MemoryMappedTable` for
    data bigger than memory or when the application's memory footprint must stay low.
    """

    def __init__(self, table: pa.Table, path: str, replays: Optional[list[Replay]] = None):
        """
        Args:
            table (`pyarrow.Table`): the table, with `replays` already applied.
            path (`str`): path of the backing file; stored as an absolute path.
            replays (`list[Replay]`, *optional*): transforms already applied to
                `table`. Defaults to an empty list.
        """
        super().__init__(table)
        self.path = os.path.abspath(path)
        self.replays: list[Replay] = [] if replays is None else replays

    @classmethod
    def from_file(cls, filename: str, replays=None):
        """Load `filename` with `_memory_mapped_arrow_table_from_file`, then apply `replays` to it."""
        loaded = _memory_mapped_arrow_table_from_file(filename)
        replayed = cls._apply_replays(loaded, replays)
        return cls(replayed, filename, replays)

    def __getstate__(self):
        # Only the file location and the transform history are pickled, not the data.
        return dict(path=self.path, replays=self.replays)

    def __setstate__(self, state):
        # Reload the table from the pickled path, then re-apply the recorded transforms.
        path, replays = state["path"], state["replays"]
        reloaded = _memory_mapped_arrow_table_from_file(path)
        reloaded = self._apply_replays(reloaded, replays)
        MemoryMappedTable.__init__(self, reloaded, path=path, replays=replays)

    @staticmethod
    def _apply_replays(table: pa.Table, replays: Optional[list[Replay]] = None) -> pa.Table:
        """
        Apply each `(name, args, kwargs)` replay to `table`, in order, and return the result.

        "cast" and "flatten" go through the module-level `table_cast` and
        `table_flatten` helpers; any other name is looked up as a method of the
        pyarrow table. `replays=None` leaves the table untouched.
        """
        if replays is None:
            return table
        for name, args, kwargs in replays:
            if name == "flatten":
                table = table_flatten(table, *args, **kwargs)
            elif name == "cast":
                table = table_cast(table, *args, **kwargs)
            else:
                method = getattr(table, name)
                table = method(*args, **kwargs)
        return table

    def _append_replay(self, replay: Replay) -> list[Replay]:
        """Return a deep copy of `self.replays` with `replay` added at the end (`self.replays` is not modified)."""
        extended = copy.deepcopy(self.replays)
        extended.append(replay)
        return extended

    def _replays_after(self, name, args, kwargs):
        """Return the replay history extended with a call to `name`, deep-copying `args` and `kwargs`."""
        entry = (name, copy.deepcopy(args), copy.deepcopy(kwargs))
        return self._append_replay(entry)

    def slice(self, offset=0, length=None):
        """
        Compute a zero-copy slice of this table.

        Args:
            offset (`int`, defaults to `0`):
                Offset from start of table to slice.
            length (`int`, defaults to `None`):
                Length of slice (default is until end of table starting from
                offset).

        Returns:
            `datasets.table.Table`
        """
        replays = self._append_replay(("slice", (offset, length), {}))
        # Slicing goes through `fast_slice` rather than the pyarrow `slice` method
        sliced = self.fast_slice(offset=offset, length=length)
        return MemoryMappedTable(sliced, self.path, replays)

    def filter(self, *args, **kwargs):
        """
        Select records from the table. See `pyarrow.compute.filter` for full usage.
        """
        replays = self._replays_after("filter", args, kwargs)
        return MemoryMappedTable(self.table.filter(*args, **kwargs), self.path, replays)

    def flatten(self, *args, **kwargs):
        """
        Flatten this table: every struct-typed column becomes one column per
        struct field, and the other columns are left unchanged.

        Args:
            memory_pool (`MemoryPool`, defaults to `None`):
                For memory allocations, if required, otherwise use default pool.

        Returns:
            `datasets.table.Table`
        """
        replays = self._replays_after("flatten", args, kwargs)
        return MemoryMappedTable(table_flatten(self.table, *args, **kwargs), self.path, replays)

    def combine_chunks(self, *args, **kwargs):
        """
        Make a new table by combining the chunks this table has.

        The chunks of each column's ChunkedArray are concatenated into zero
        or one chunk.

        Args:
            memory_pool (`MemoryPool`, defaults to `None`):
                For memory allocations, if required, otherwise use default pool.

        Returns:
            `datasets.table.Table`
        """
        replays = self._replays_after("combine_chunks", args, kwargs)
        return MemoryMappedTable(self.table.combine_chunks(*args, **kwargs), self.path, replays)

    def cast(self, *args, **kwargs):
        """
        Cast table values to another schema.

        Args:
            target_schema (`Schema`):
                Schema to cast to, the names and order of fields must match.
            safe (`bool`, defaults to `True`):
                Check for overflows or other unsafe conversions.

        Returns:
            `datasets.table.Table`
        """
        replays = self._replays_after("cast", args, kwargs)
        return MemoryMappedTable(table_cast(self.table, *args, **kwargs), self.path, replays)

    def replace_schema_metadata(self, *args, **kwargs):
        """
        EXPERIMENTAL: Create a shallow copy of the table whose schema key-value
        metadata is replaced with the given metadata (which may be `None`, in
        which case any existing metadata is deleted).

        Args:
            metadata (`dict`, defaults to `None`):

        Returns:
            `datasets.table.Table`: shallow_copy
        """
        replays = self._replays_after("replace_schema_metadata", args, kwargs)
        return MemoryMappedTable(self.table.replace_schema_metadata(*args, **kwargs), self.path, replays)

    def add_column(self, *args, **kwargs):
        """
        Add a column to the table at the given position.

        A new table is returned with the column added; the original table
        object is left unchanged.

        Args:
            i (`int`):
                Index to place the column at.
            field_ (`Union[str, pyarrow.Field]`):
                If a string is passed then the type is deduced from the column
                data.
            column (`Union[pyarrow.Array, List[pyarrow.Array]]`):
                Column data.

        Returns:
            `datasets.table.Table`: New table with the passed column added.
        """
        replays = self._replays_after("add_column", args, kwargs)
        return MemoryMappedTable(self.table.add_column(*args, **kwargs), self.path, replays)

    def append_column(self, *args, **kwargs):
        """
        Append a column after the existing columns.

        Args:
            field_ (`Union[str, pyarrow.Field]`):
                If a string is passed then the type is deduced from the column
                data.
            column (`Union[pyarrow.Array, List[pyarrow.Array]]`):
                Column data.

        Returns:
            `datasets.table.Table`:
                New table with the passed column added.
        """
        replays = self._replays_after("append_column", args, kwargs)
        return MemoryMappedTable(self.table.append_column(*args, **kwargs), self.path, replays)

    def remove_column(self, *args, **kwargs):
        """
        Create a new table with the indicated column removed.

        Args:
            i (`int`):
                Index of column to remove.

        Returns:
            `datasets.table.Table`:
                New table without the column.
        """
        replays = self._replays_after("remove_column", args, kwargs)
        return MemoryMappedTable(self.table.remove_column(*args, **kwargs), self.path, replays)

    def set_column(self, *args, **kwargs):
        """
        Replace the column at the given position.

        Args:
            i (`int`):
                Index to place the column at.
            field_ (`Union[str, pyarrow.Field]`):
                If a string is passed then the type is deduced from the column
                data.
            column (`Union[pyarrow.Array, List[pyarrow.Array]]`):
                Column data.

        Returns:
            `datasets.table.Table`:
                New table with the passed column set.
        """
        replays = self._replays_after("set_column", args, kwargs)
        return MemoryMappedTable(self.table.set_column(*args, **kwargs), self.path, replays)

    def rename_columns(self, *args, **kwargs):
        """
        Create a new table with columns renamed to the provided names.
        """
        replays = self._replays_after("rename_columns", args, kwargs)
        return MemoryMappedTable(self.table.rename_columns(*args, **kwargs), self.path, replays)

    def drop(self, *args, **kwargs):
        """
        Drop one or more columns and return a new table.

        Args:
            columns (`List[str]`):
                List of field names referencing existing columns.

        Raises:
            `KeyError` : if any of the passed column names does not exist.

        Returns:
            `datasets.table.Table`:
                New table without the columns.
        """
        replays = self._replays_after("drop", args, kwargs)
        return MemoryMappedTable(self.table.drop(*args, **kwargs), self.path, replays)

    def select(self, *args, **kwargs):
        """
        Select columns of the table.

        Returns a new table with the specified columns, and metadata preserved.

        Args:
            columns (:obj:`Union[List[str], List[int]]`):
                The column names or integer indices to select.

        Returns:
            :class:`datasets.table.Table`: New table with the specified columns, and metadata preserved.
        """
        replays = self._replays_after("select", args, kwargs)
        return MemoryMappedTable(self.table.select(*args, **kwargs), self.path, replays)
1264
+
1265
+
1266
+ # A ConcatenationTable is the concatenation of several tables.
1267
+ # The ``blocks`` attribute stores a list of lists of blocks.
1268
+ # The first axis concatenates the tables along the axis 0 (it appends rows),
1269
+ # while the second axis concatenates tables along the axis 1 (it appends columns).
1270
+ TableBlockContainer = TypeVar("TableBlockContainer", TableBlock, list[TableBlock], list[list[TableBlock]])
1271
+
1272
+
1273
+ class ConcatenationTable(Table):
1274
+ """
1275
+ The table comes from the concatenation of several tables called blocks.
1276
+ It enables concatenation on both axis 0 (append rows) and axis 1 (append columns).
1277
+
1278
+ The underlying tables are called "blocks" and can be either `InMemoryTable`
1279
+ or `MemoryMappedTable` objects.
1280
+ This allows to combine tables that come from memory or that are memory mapped.
1281
+ When a `ConcatenationTable` is pickled, then each block is pickled:
1282
+ - the `InMemoryTable` objects are pickled by copying all the data in memory.
1283
+ - the MemoryMappedTable objects are pickled without copying the data into memory.
1284
+ Instead, only the path to the memory mapped arrow file is pickled, as well as the list
1285
+ of transforms to "replays" when reloading the table from the disk.
1286
+
1287
+ Its implementation requires to store each block separately.
1288
+ The `blocks` attribute stores a list of lists of blocks.
1289
+ The first axis concatenates the tables along the axis 0 (it appends rows),
1290
+ while the second axis concatenates tables along the axis 1 (it appends columns).
1291
+
1292
+ If some columns are missing when concatenating on axis 0, they are filled with null values.
1293
+ This is done using `pyarrow.concat_tables(tables, promote_options="default")`.
1294
+
1295
+ You can access the fully combined table by accessing the `ConcatenationTable.table` attribute,
1296
+ and the blocks by accessing the `ConcatenationTable.blocks` attribute.
1297
+ """
1298
+
1299
def __init__(self, table: pa.Table, blocks: list[list[TableBlock]]):
    """
    Args:
        table (`pyarrow.Table`): the fully combined table.
        blocks (`list[list[TableBlock]]`): the blocks `table` is made of; stored as-is.

    Raises:
        TypeError: if any element of `blocks` is not a `TableBlock`.
    """
    super().__init__(table)
    self.blocks = blocks
    # Every block must be a TableBlock, i.e. an InMemoryTable or a MemoryMappedTable.
    for block in (candidate for row_block in blocks for candidate in row_block):
        if isinstance(block, TableBlock):
            continue
        raise TypeError(
            "The blocks of a ConcatenationTable must be InMemoryTable or MemoryMappedTable objects"
            f", but got {_short_str(block)}."
        )
1311
+
1312
def __getstate__(self):
    # Pickle the blocks plus the schema of the combined table, not the combined table itself.
    return dict(blocks=self.blocks, schema=self.table.schema)
1314
+
1315
def __setstate__(self, state):
    """Rebuild the combined table from the pickled blocks and restore the pickled schema."""
    blocks, schema = state["blocks"], state["schema"]
    table = self._concat_blocks_horizontally_and_vertically(blocks)
    schema_differs = schema is not None and table.schema != schema
    if schema_differs:
        # Concatenating with an empty table that has the expected schema fixes the columns;
        # promote_options="default" fills the missing columns with null values.
        schema_only_table = pa.Table.from_batches([], schema=schema)
        table = pa.concat_tables([table, schema_only_table], promote_options="default")
    ConcatenationTable.__init__(self, table, blocks=blocks)
1325
+
1326
@staticmethod
def _concat_blocks(blocks: list[Union[TableBlock, pa.Table]], axis: int = 0) -> pa.Table:
    """
    Concatenate blocks into a single pyarrow Table.

    Args:
        blocks (`list[Union[TableBlock, pa.Table]]`):
            Tables to concatenate. Objects exposing a `table` attribute are
            unwrapped to that attribute; anything else is used as-is.
        axis (`int`, defaults to `0`):
            `0` appends rows, `1` appends columns.

    Returns:
        `pyarrow.Table`: the concatenated table.

    Raises:
        ValueError: if `axis` is neither 0 nor 1, or if `axis=1` and `blocks` is empty.
    """
    pa_tables = [block.table if hasattr(block, "table") else block for block in blocks]
    if axis == 0:
        # We set promote_options="default" to fill missing columns with null values
        return pa.concat_tables(pa_tables, promote_options="default")
    if axis == 1:
        if not pa_tables:
            # Without this guard the result variable would never be bound (UnboundLocalError).
            raise ValueError("Cannot concatenate an empty list of tables on axis=1")
        pa_table, *others = pa_tables
        # Append the columns of every following table, one by one, to the first table.
        for other in others:
            for name, col in zip(other.column_names, other.columns):
                pa_table = pa_table.append_column(name, col)
        return pa_table
    raise ValueError("'axis' must be either 0 or 1")
1342
+
1343
@classmethod
def _concat_blocks_horizontally_and_vertically(cls, blocks: list[list[TableBlock]]) -> pa.Table:
    """
    Combine a 2D arrangement of blocks into one pyarrow Table.

    Each non-empty row of blocks is first concatenated on axis 1 (columns); the
    resulting tables are then concatenated on axis 0 (rows). Empty rows are skipped.
    """
    row_tables = [cls._concat_blocks(row_block, axis=1) for row_block in blocks if row_block]
    return cls._concat_blocks(row_tables, axis=0)
1352
+
1353
@classmethod
def _merge_blocks(cls, blocks: TableBlockContainer, axis: Optional[int] = None) -> TableBlockContainer:
    """
    Merge consecutive `InMemoryTable` blocks into single `InMemoryTable` blocks.

    Args:
        blocks: a list of blocks when `axis` is given, or a list of lists of
            blocks when `axis` is `None`.
        axis (`int`, *optional*): axis along which consecutive in-memory blocks
            are concatenated. With `None`, each row is merged on axis 1, and if
            every merged row is left with exactly one block, those blocks are
            then merged on axis 0 (the result is then a flat list of blocks).

    Returns:
        The merged blocks; blocks that are not `InMemoryTable` are kept unchanged.
    """
    if axis is None:  # merge on both axes
        merged_rows = [cls._merge_blocks(row_block, axis=1) for row_block in blocks]
        if any(len(row_block) != 1 for row_block in merged_rows):
            return merged_rows
        single_blocks = [block for row_block in merged_rows for block in row_block]
        return cls._merge_blocks(single_blocks, axis=0)

    merged = []
    # groupby yields runs of consecutive blocks sharing the same "is in-memory" flag
    for in_memory, run in groupby(blocks, key=lambda block: isinstance(block, InMemoryTable)):
        run = list(run)
        if in_memory:
            merged.append(InMemoryTable(cls._concat_blocks(run, axis=axis)))
        else:
            merged.extend(run)
    return merged
1368
+
1369
@classmethod
def _consolidate_blocks(cls, blocks: TableBlockContainer) -> TableBlockContainer:
    """
    Merge the blocks of `blocks` according to its shape.

    A single `TableBlock` is returned unchanged; a list whose first element is a
    `TableBlock` is merged on axis 0; anything else is treated as a list of lists
    and merged on both axes.
    """
    if isinstance(blocks, TableBlock):
        return blocks
    if isinstance(blocks[0], TableBlock):
        return cls._merge_blocks(blocks, axis=0)
    return cls._merge_blocks(blocks)
1377
+
1378
@classmethod
def from_blocks(cls, blocks: TableBlockContainer) -> "ConcatenationTable":
    """
    Build a `ConcatenationTable` from a block, a list of blocks, or a list of lists of blocks.

    The blocks are consolidated first. A list of blocks is concatenated over rows,
    each block becoming its own row of the `blocks` attribute.
    """
    consolidated = cls._consolidate_blocks(blocks)
    if isinstance(consolidated, TableBlock):
        return cls(consolidated.table, [[consolidated]])
    if isinstance(consolidated[0], TableBlock):
        pa_table = cls._concat_blocks(consolidated, axis=0)
        return cls(pa_table, [[block] for block in consolidated])
    pa_table = cls._concat_blocks_horizontally_and_vertically(consolidated)
    return cls(pa_table, consolidated)
1391
+
1392
@classmethod
def from_tables(cls, tables: list[Union[pa.Table, Table]], axis: int = 0) -> "ConcatenationTable":
    """Create `ConcatenationTable` from list of tables.

    Args:
        tables (list of `Table` or list of `pyarrow.Table`):
            List of tables.
        axis (`{0, 1}`, defaults to `0`, meaning over rows):
            Axis to concatenate over, where `0` means over rows (vertically) and `1` means over columns
            (horizontally).

            <Added version="1.6.0"/>

    Raises:
        ValueError: if several tables are passed and `axis` is neither 0 nor 1, or if
            `axis=1` and the tables don't have the same number of rows.
    """

    def to_blocks(table: Union[pa.Table, Table]) -> list[list[TableBlock]]:
        # Normalize any supported table into a list of rows of blocks.
        if isinstance(table, pa.Table):
            return [[InMemoryTable(table)]]
        elif isinstance(table, ConcatenationTable):
            # Copy so that extending the blocks below never touches the input table.
            return copy.deepcopy(table.blocks)
        else:
            return [[table]]

    def _slice_row_block(row_block: list[TableBlock], length: int) -> tuple[list[TableBlock], list[TableBlock]]:
        # Split every block of the row into its first `length` rows and the rest.
        sliced = [table.slice(0, length) for table in row_block]
        remainder = [table.slice(length, len(row_block[0]) - length) for table in row_block]
        return sliced, remainder

    def _split_both_like(
        result: list[list[TableBlock]], blocks: list[list[TableBlock]]
    ) -> tuple[list[list[TableBlock]], list[list[TableBlock]]]:
        """
        Make sure each row_block contain the same num_rows to be able to concatenate them on axis=1.

        To do so, we modify both blocks sets to have the same row_blocks boundaries.
        For example, if `result` has 2 row_blocks of 3 rows and `blocks` has 3 row_blocks of 2 rows,
        we modify both to have 4 row_blocks of size 2, 1, 1 and 2:

                [ x   x   x | x   x   x ]
            +   [ y   y | y   y | y   y ]
            -----------------------------
            =   [ x   x | x | x | x   x ]
                [ y   y | y | y | y   y ]

        """
        result, blocks = list(result), list(blocks)
        new_result, new_blocks = [], []
        while result and blocks:
            # we slice the longest row block to save two row blocks of same length
            # and we replace the long row block by its remainder if necessary
            if len(result[0][0]) > len(blocks[0][0]):
                new_blocks.append(blocks[0])
                sliced, result[0] = _slice_row_block(result[0], len(blocks.pop(0)[0]))
                new_result.append(sliced)
            elif len(result[0][0]) < len(blocks[0][0]):
                new_result.append(result[0])
                sliced, blocks[0] = _slice_row_block(blocks[0], len(result.pop(0)[0]))
                new_blocks.append(sliced)
            else:
                new_result.append(result.pop(0))
                new_blocks.append(blocks.pop(0))
        if result or blocks:
            raise ValueError("Failed to concatenate on axis=1 because tables don't have the same number of rows")
        return new_result, new_blocks

    def _extend_blocks(
        result: list[list[TableBlock]], blocks: list[list[TableBlock]], axis: int = 0
    ) -> list[list[TableBlock]]:
        if axis == 0:
            result.extend(blocks)
        elif axis == 1:
            # We make sure each row_block have the same num_rows
            result, blocks = _split_both_like(result, blocks)
            # Both lists now have the same number of row blocks, so they can be paired up.
            for result_row, row_block in zip(result, blocks):
                result_row.extend(row_block)
        else:
            # An unknown axis used to be ignored, which silently dropped every table after the first.
            raise ValueError("'axis' must be either 0 or 1")
        return result

    blocks = to_blocks(tables[0])
    for table in tables[1:]:
        table_blocks = to_blocks(table)
        blocks = _extend_blocks(blocks, table_blocks, axis=axis)
    return cls.from_blocks(blocks)
1473
+
1474
@property
def _slices(self):
    """Yield one `(offset, length)` pair per row of blocks, where `offset` is the running total of the previous lengths."""
    start = 0
    for row_block in self.blocks:
        # The row's length is taken from its first block.
        n_rows = len(row_block[0])
        yield (start, n_rows)
        start += n_rows
1481
+
1482
def slice(self, offset=0, length=None):
    """
    Compute a zero-copy slice of this table.

    The combined table and every overlapping block are sliced consistently, so
    that the returned table's blocks still match its combined table.

    Args:
        offset (`int`, defaults to `0`):
            Offset from start of table to slice.
        length (`int`, defaults to `None`):
            Length of slice (default is until end of table starting from
            offset).

    Returns:
        `datasets.table.Table`
    """
    sliced_table = self.table.slice(offset, length=length)
    # `remaining` counts the rows still to collect; `start` is the offset into the current row of blocks.
    remaining = self.num_rows - offset if length is None else length
    start = offset
    sliced_blocks = []
    for row_block in self.blocks:
        n_rows = len(row_block[0])
        if remaining == 0:
            break
        if n_rows <= start:
            # The slice starts after this row of blocks: skip it entirely.
            start -= n_rows
            continue
        if n_rows <= start + remaining:
            # The slice covers this row of blocks up to its end.
            sliced_blocks.append([block.slice(start) for block in row_block])
            remaining, start = remaining + start - n_rows, 0
            continue
        # The slice ends inside this row of blocks.
        sliced_blocks.append([block.slice(start, remaining) for block in row_block])
        remaining, start = 0, 0
    return ConcatenationTable(sliced_table, sliced_blocks)
1512
+
1513
def filter(self, mask, *args, **kwargs):
    """
    Select records from the table. See `pyarrow.compute.filter` for full usage.

    The mask is applied to the combined table, and the matching slice of the
    mask is applied to every block of each row of blocks.
    """
    filtered_table = self.table.filter(mask, *args, **kwargs)
    # One sub-mask per row of blocks, cut out of `mask` at that row's (offset, length).
    row_masks = (mask.slice(start, n_rows) for start, n_rows in self._slices)
    filtered_blocks = [
        [block.filter(row_mask, *args, **kwargs) for block in row_block]
        for row_mask, row_block in zip(row_masks, self.blocks)
    ]
    return ConcatenationTable(filtered_table, filtered_blocks)
1523
+
1524
def flatten(self, *args, **kwargs):
    """
    Flatten this Table. Each column with a struct type is flattened
    into one column per struct field. Other columns are left unchanged.

    The wrapped table goes through `table_flatten`, and every table of every
    row block is flattened with its own `flatten` method.

    Args:
        memory_pool (`MemoryPool`, defaults to `None`):
            For memory allocations, if required, otherwise use default pool.

    Returns:
        `datasets.table.Table`
    """
    flat_table = table_flatten(self.table, *args, **kwargs)
    flat_blocks = [[subtable.flatten(*args, **kwargs) for subtable in row_block] for row_block in self.blocks]
    return ConcatenationTable(flat_table, flat_blocks)
1541
+
1542
def combine_chunks(self, *args, **kwargs):
    """
    Make a new table by combining the chunks this table has.

    All the underlying chunks in the `ChunkedArray` of each column are
    concatenated into zero or one chunk. The same operation is applied to
    the wrapped table and to every table of every row block.

    Args:
        memory_pool (`MemoryPool`, defaults to `None`):
            For memory allocations, if required, otherwise use default pool.

    Returns:
        `datasets.table.Table`
    """
    combined_table = self.table.combine_chunks(*args, **kwargs)
    combined_blocks = [
        [subtable.combine_chunks(*args, **kwargs) for subtable in row_block] for row_block in self.blocks
    ]
    return ConcatenationTable(combined_table, combined_blocks)
1561
+
1562
def cast(self, target_schema, *args, **kwargs):
    """
    Cast table values to another schema.

    The wrapped table is cast with `table_cast`. Each table of a row block is cast to
    the sub-schema made of the target fields that match its own column names.

    Args:
        target_schema (`Schema`):
            Schema to cast to, the names and order of fields must match.
        safe (`bool`, defaults to `True`):
            Check for overflows or other unsafe conversions.

    Returns:
        `datasets.table.Table`
    """
    from .features import Features

    casted_table = table_cast(self.table, target_schema, *args, **kwargs)
    target_features = Features.from_arrow_schema(target_schema)
    casted_blocks = []
    for row_block in self.blocks:
        # Target fields that no table of this row block has claimed yet
        remaining_fields = list(target_schema)
        casted_row_block = []
        for subtable in row_block:
            claimed_fields = []
            for column_name in subtable.column_names:
                # Take the first remaining field with that name, and remove it from the pool
                position = next(i for i, field in enumerate(remaining_fields) if field.name == column_name)
                claimed_fields.append(remaining_fields.pop(position))
            subfeatures = Features({field.name: target_features[field.name] for field in claimed_fields})
            casted_row_block.append(subtable.cast(subfeatures.arrow_schema, *args, **kwargs))
        casted_blocks.append(casted_row_block)
    return ConcatenationTable(casted_table, casted_blocks)
1592
+
1593
def replace_schema_metadata(self, *args, **kwargs):
    """
    EXPERIMENTAL: Create shallow copy of table by replacing schema
    key-value metadata with the indicated new metadata (which may be `None`,
    which deletes any existing metadata).

    The metadata is replaced on the wrapped table and on every table of every row block,
    so that the blocks of the returned table agree with its wrapped table.

    Args:
        metadata (`dict`, defaults to `None`):

    Returns:
        `datasets.table.Table`: shallow_copy
    """
    table = self.table.replace_schema_metadata(*args, **kwargs)
    blocks = [[t.replace_schema_metadata(*args, **kwargs) for t in tables] for tables in self.blocks]
    # Return the updated blocks: passing `self.blocks` here would keep the old metadata on the blocks
    return ConcatenationTable(table, blocks)
1610
+
1611
def add_column(self, *args, **kwargs):
    """
    Add column to Table at position.

    This operation is not implemented for this table type: calling it always fails.

    Args:
        i (`int`):
            Index to place the column at.
        field_ (`Union[str, pyarrow.Field]`):
            If a string is passed then the type is deduced from the column
            data.
        column (`Union[pyarrow.Array, List[pyarrow.Array]]`):
            Column data.

    Raises:
        `NotImplementedError`: always.
    """
    raise NotImplementedError
1631
+
1632
def append_column(self, *args, **kwargs):
    """
    Append column at end of columns.

    This operation is not implemented for this table type: calling it always fails.

    Args:
        field_ (`Union[str, pyarrow.Field]`):
            If a string is passed then the type is deduced from the column
            data.
        column (`Union[pyarrow.Array, List[pyarrow.Array]]`):
            Column data.

    Raises:
        `NotImplementedError`: always.
    """
    raise NotImplementedError
1648
+
1649
def remove_column(self, i, *args, **kwargs):
    """
    Create new Table with the indicated column removed.

    The column is removed by index from the wrapped table. In the row blocks, it is
    looked up by name and removed from the tables that contain it; the other tables
    are kept as they are.

    Args:
        i (`int`):
            Index of column to remove.

    Returns:
        `datasets.table.Table`:
            New table without the column.
    """
    new_table = self.table.remove_column(i, *args, **kwargs)
    removed_name = self.table.column_names[i]

    def _without_column(subtable):
        names = subtable.column_names
        if removed_name not in names:
            return subtable
        return subtable.remove_column(names.index(removed_name), *args, **kwargs)

    new_blocks = [[_without_column(subtable) for subtable in row_block] for row_block in self.blocks]
    return ConcatenationTable(new_table, new_blocks)
1672
+
1673
def set_column(self, *args, **kwargs):
    """
    Replace column in Table at position.

    This operation is not implemented for this table type: calling it always fails.

    Args:
        i (`int`):
            Index to place the column at.
        field_ (`Union[str, pyarrow.Field]`):
            If a string is passed then the type is deduced from the column
            data.
        column (`Union[pyarrow.Array, List[pyarrow.Array]]`):
            Column data.

    Raises:
        `NotImplementedError`: always.
    """
    raise NotImplementedError
1691
+
1692
def rename_columns(self, names, *args, **kwargs):
    """
    Create new table with columns renamed to provided names.

    `names` is paired positionally with the current column names of the wrapped table,
    and the resulting old-to-new mapping is used to rename the columns of every table
    of every row block.
    """
    renamed_table = self.table.rename_columns(names, *args, **kwargs)
    new_name_of = dict(zip(self.table.column_names, names))
    renamed_blocks = []
    for row_block in self.blocks:
        renamed_row_block = []
        for subtable in row_block:
            subtable_names = [new_name_of[old_name] for old_name in subtable.column_names]
            renamed_row_block.append(subtable.rename_columns(subtable_names, *args, **kwargs))
        renamed_blocks.append(renamed_row_block)
    return ConcatenationTable(renamed_table, renamed_blocks)
1704
+
1705
def drop(self, columns, *args, **kwargs):
    """
    Drop one or more columns and return a new table.

    All the columns are dropped from the wrapped table. Each table of a row block
    only drops the requested columns that it actually contains.

    Args:
        columns (`List[str]`):
            List of field names referencing existing columns.

    Raises:
        `KeyError` : if any of the passed columns name are not existing.

    Returns:
        `datasets.table.Table`:
            New table without the columns.
    """
    new_table = self.table.drop(columns, *args, **kwargs)
    new_blocks = []
    for row_block in self.blocks:
        new_row_block = []
        for subtable in row_block:
            to_drop = [column for column in columns if column in subtable.column_names]
            new_row_block.append(subtable.drop(to_drop, *args, **kwargs))
        new_blocks.append(new_row_block)
    return ConcatenationTable(new_table, new_blocks)
1725
+
1726
def select(self, columns, *args, **kwargs):
    """
    Select columns of the table.

    Returns a new table with the specified columns, and metadata preserved.

    The selection is applied as is to the wrapped table. For the row blocks, integer
    indices are first resolved to column names of the wrapped table, then each table
    of a row block selects the requested columns that it contains.

    Args:
        columns (:obj:`Union[List[str], List[int]]`):
            The column names or integer indices to select.

    Returns:
        :class:`datasets.table.Table`: New table with the specified columns, and metadata preserved.
    """
    table = self.table.select(columns, *args, **kwargs)
    # A position in the full table means nothing for a block that only holds a subset of the columns,
    # and an integer is never found in `column_names`: resolve the indices to names before filtering.
    all_names = self.table.column_names
    names = [all_names[c] if isinstance(c, int) else c for c in columns]
    blocks = []
    for tables in self.blocks:
        blocks.append([t.select([name for name in names if name in t.column_names], *args, **kwargs) for t in tables])
    return ConcatenationTable(table, blocks)
1744
+
1745
+
1746
def concat_tables(tables: list[Table], axis: int = 0) -> Table:
    """
    Concatenate tables.

    Args:
        tables (list of `Table`):
            List of tables to be concatenated.
        axis (`{0, 1}`, defaults to `0`, meaning over rows):
            Axis to concatenate over, where `0` means over rows (vertically) and `1` means over columns
            (horizontally).

            <Added version="1.6.0"/>
    Returns:
        `datasets.table.Table`:
            A single input table is returned as is. Any other number of tables is handed to
            `ConcatenationTable.from_tables`.
    """
    tables = list(tables)
    if len(tables) != 1:
        return ConcatenationTable.from_tables(tables, axis=axis)
    (only_table,) = tables
    return only_table
1767
+
1768
+
1769
def list_table_cache_files(table: Table) -> list[str]:
    """
    Get the cache files that are loaded by the table.
    Cache file are used when parts of the table come from the disk via memory mapping.

    A `ConcatenationTable` is explored recursively through its blocks, a `MemoryMappedTable`
    contributes its `path`, and any other table contributes nothing.

    Returns:
        `List[str]`:
            A list of paths to the cache files loaded by the table.
    """
    if isinstance(table, ConcatenationTable):
        return [
            path
            for row_block in table.blocks
            for subtable in row_block
            for path in list_table_cache_files(subtable)
        ]
    if isinstance(table, MemoryMappedTable):
        return [table.path]
    return []
1788
+
1789
+
1790
def _wrap_for_chunked_arrays(func):
    """Apply the function on each chunk of a `pyarrow.ChunkedArray`, or on the array directly

    The returned wrapper forwards the extra positional and keyword arguments to `func`,
    and rebuilds a chunked array from the per-chunk results when the input is chunked.
    """

    def wrapper(array, *args, **kwargs):
        if not isinstance(array, pa.ChunkedArray):
            return func(array, *args, **kwargs)
        processed_chunks = [func(chunk, *args, **kwargs) for chunk in array.chunks]
        return pa.chunked_array(processed_chunks)

    return wrapper
1800
+
1801
+
1802
def _are_list_values_of_length(array: pa.ListArray, length: int) -> bool:
    """Check if all the sub-lists of a `pa.ListArray` have the specified length.

    An array that only contains nulls is also accepted.
    """
    lengths_match = pc.all(pc.equal(array.value_lengths(), length)).as_py()
    return lengths_match or array.null_count == len(array)
1805
+
1806
+
1807
def _combine_list_array_offsets_with_mask(array: pa.ListArray) -> pa.Array:
    """Add the null bitmap to the offsets of a `pa.ListArray`.

    When the array has nulls, the start offset of every null entry is replaced with a null.
    The last offset is always kept unchanged.
    """
    offsets = array.offsets
    if array.null_count <= 0:
        return offsets
    null_offsets = pa.nulls(len(array), pa.int32())
    masked_starts = pc.replace_with_mask(offsets[:-1], array.is_null(), null_offsets)
    return pa.concat_arrays([masked_starts, offsets[-1:]])
1818
+
1819
+
1820
def _storage_type(type: pa.DataType) -> pa.DataType:
    """Convert a (possibly nested) `pa.ExtensionType` to its storage type.

    Struct, list and fixed-size list types are rebuilt recursively from the storage
    types of their children. Any other type is returned unchanged.
    """
    if isinstance(type, pa.ExtensionType):
        return _storage_type(type.storage_type)
    if isinstance(type, pa.StructType):
        storage_fields = [pa.field(field.name, _storage_type(field.type)) for field in type]
        return pa.struct(storage_fields)
    if isinstance(type, pa.ListType):
        return pa.list_(_storage_type(type.value_type))
    if isinstance(type, pa.FixedSizeListType):
        return pa.list_(_storage_type(type.value_type), type.list_size)
    return type
1831
+
1832
+
1833
def _short_str(value: Any) -> str:
    """Return `str(value)`, shortened to its first and last 1500 characters when longer than 3000."""
    text = str(value)
    if len(text) <= 3000:
        return text
    return text[:1500] + "\n...\n" + text[-1500:]
1838
+
1839
+
1840
@_wrap_for_chunked_arrays
def array_cast(
    array: pa.Array, pa_type: pa.DataType, allow_primitive_to_str: bool = True, allow_decimal_to_str: bool = True
) -> Union[pa.Array, pa.FixedSizeListArray, pa.ListArray, pa.StructArray, pa.ExtensionArray]:
    """Improved version of `pa.Array.cast`

    It supports casting `pa.StructArray` objects to re-order the fields.
    It also let you control certain aspects of the casting, e.g. whether
    to disable casting primitives (`booleans`, `floats` or `ints`) or
    disable casting decimals to strings.

    The cast is dispatched on the type of the input array (struct, list / large list,
    fixed-size list, or anything else) and recurses on the child arrays with the same options.
    A combination of source and target types that no branch handles ends up in the final `TypeError`.

    Args:
        array (`pa.Array`):
            PyArrow array to cast
        pa_type (`pa.DataType`):
            Target PyArrow type
        allow_primitive_to_str (`bool`, defaults to `True`):
            Whether to allow casting primitives to strings.
            Defaults to `True`.
        allow_decimal_to_str (`bool`, defaults to `True`):
            Whether to allow casting decimals to strings.
            Defaults to `True`.

    Raises:
        `pa.ArrowInvalidError`: if the arrow data casting fails
        `TypeError`: if the target type is not supported according, e.g.

            - if a field is missing
            - if casting from primitives to strings and `allow_primitive_to_str` is `False`
            - if casting from decimals to strings and `allow_decimal_to_str` is `False`

    Returns:
        `pyarrow.Array`: the casted array
    """
    # Recursive call that carries the same casting options
    _c = partial(array_cast, allow_primitive_to_str=allow_primitive_to_str, allow_decimal_to_str=allow_decimal_to_str)
    # Extension arrays are always handled through their storage
    if isinstance(array, pa.ExtensionArray):
        array = array.storage
    if isinstance(pa_type, pa.ExtensionType):
        # Cast to the storage type of the target, then wrap the result in the extension type
        return pa_type.wrap_array(_c(array, pa_type.storage_type))
    elif array.type == pa_type:
        return array
    elif pa.types.is_struct(array.type):
        # Struct to struct is only supported when both sides have the same set of field names
        if pa.types.is_struct(pa_type) and ({field.name for field in pa_type} == {field.name for field in array.type}):
            if array.type.num_fields == 0:
                return array
            # Fields are taken by name in the order of the target type
            arrays = [_c(array.field(field.name), field.type) for field in pa_type]
            return pa.StructArray.from_arrays(arrays, fields=list(pa_type), mask=array.is_null())
    elif pa.types.is_list(array.type) or pa.types.is_large_list(array.type):
        if pa.types.is_fixed_size_list(pa_type):
            # Only possible if every sub-list already has the target size
            if _are_list_values_of_length(array, pa_type.list_size):
                if array.null_count > 0:
                    # Ensure each null value in the array translates to [null] * pa_type.list_size in the array's values array
                    array_type = array.type
                    storage_type = _storage_type(array_type)
                    if array_type != storage_type:
                        # Temporarily convert to the storage type to support extension types in the slice operation
                        array = _c(array, storage_type)
                        array = pc.list_slice(array, 0, pa_type.list_size, return_fixed_size_list=True)
                        array = _c(array, array_type)
                    else:
                        array = pc.list_slice(array, 0, pa_type.list_size, return_fixed_size_list=True)
                    array_values = array.values
                    return pa.FixedSizeListArray.from_arrays(
                        _c(array_values, pa_type.value_type), pa_type.list_size, mask=array.is_null()
                    )
                else:
                    # No nulls: keep only the values that belong to this (possibly sliced) array
                    array_values = array.values[
                        array.offset * pa_type.list_size : (array.offset + len(array)) * pa_type.list_size
                    ]
                    return pa.FixedSizeListArray.from_arrays(_c(array_values, pa_type.value_type), pa_type.list_size)
        elif pa.types.is_list(pa_type):
            # Merge offsets with the null bitmap to avoid the "Null bitmap with offsets slice not supported" ArrowNotImplementedError
            array_offsets = _combine_list_array_offsets_with_mask(array)
            return pa.ListArray.from_arrays(array_offsets, _c(array.values, pa_type.value_type))
        elif pa.types.is_large_list(pa_type):
            # Merge offsets with the null bitmap to avoid the "Null bitmap with offsets slice not supported" ArrowNotImplementedError
            array_offsets = _combine_list_array_offsets_with_mask(array)
            return pa.LargeListArray.from_arrays(array_offsets, _c(array.values, pa_type.value_type))
    elif pa.types.is_fixed_size_list(array.type):
        if pa.types.is_fixed_size_list(pa_type):
            # Fixed-size to fixed-size is only supported for equal list sizes
            if pa_type.list_size == array.type.list_size:
                # Keep only the values that belong to this (possibly sliced) array
                array_values = array.values[
                    array.offset * array.type.list_size : (array.offset + len(array)) * array.type.list_size
                ]
                return pa.FixedSizeListArray.from_arrays(
                    _c(array_values, pa_type.value_type), pa_type.list_size, mask=array.is_null()
                )
        elif pa.types.is_list(pa_type):
            # Offsets are regularly spaced by the list size, starting at the offset of the array
            array_offsets = (np.arange(len(array) + 1) + array.offset) * array.type.list_size
            return pa.ListArray.from_arrays(array_offsets, _c(array.values, pa_type.value_type), mask=array.is_null())
        elif pa.types.is_large_list(pa_type):
            # Offsets are regularly spaced by the list size, starting at the offset of the array
            array_offsets = (np.arange(len(array) + 1) + array.offset) * array.type.list_size
            return pa.LargeListArray.from_arrays(
                array_offsets, _c(array.values, pa_type.value_type), mask=array.is_null()
            )
    else:
        # Non-nested input: check the casts that callers may have disabled, then defer to pyarrow
        if pa.types.is_string(pa_type):
            if not allow_primitive_to_str and pa.types.is_primitive(array.type):
                raise TypeError(
                    f"Couldn't cast array of type {_short_str(array.type)} to {_short_str(pa_type)} "
                    f"since allow_primitive_to_str is set to {allow_primitive_to_str} "
                )
            if not allow_decimal_to_str and pa.types.is_decimal(array.type):
                raise TypeError(
                    f"Couldn't cast array of type {_short_str(array.type)} to {_short_str(pa_type)} "
                    f"and allow_decimal_to_str is set to {allow_decimal_to_str}"
                )
        # Casting a non-null array to the null type is refused
        if pa.types.is_null(pa_type) and not pa.types.is_null(array.type):
            raise TypeError(f"Couldn't cast array of type {_short_str(array.type)} to {_short_str(pa_type)}")
        return array.cast(pa_type)
    # Reached when a nested branch above did not return
    raise TypeError(f"Couldn't cast array of type {_short_str(array.type)} to {_short_str(pa_type)}")
1951
+
1952
+
1953
@_wrap_for_chunked_arrays
def cast_array_to_feature(
    array: pa.Array, feature: "FeatureType", allow_primitive_to_str: bool = True, allow_decimal_to_str: bool = True
) -> pa.Array:
    """Cast an array to the arrow type that corresponds to the requested feature type.
    For custom features like [`Audio`] or [`Image`], it takes into account the "cast_storage" methods
    they defined to enable casting from other arrow types.

    The cast is dispatched on the type of the input array (struct, list / large list, fixed-size list)
    and recurses on the child arrays with the same options. When no nested branch returns, null arrays
    and non-nested features are handed to `array_cast`, and anything else ends up in the final `TypeError`.

    Args:
        array (`pa.Array`):
            The PyArrow array to cast.
        feature (`datasets.features.FeatureType`):
            The target feature type.
        allow_primitive_to_str (`bool`, defaults to `True`):
            Whether to allow casting primitives to strings.
            Defaults to `True`.
        allow_decimal_to_str (`bool`, defaults to `True`):
            Whether to allow casting decimals to strings.
            Defaults to `True`.

    Raises:
        `pa.ArrowInvalidError`: if the arrow data casting fails
        `TypeError`: if the target type is not supported according, e.g.

            - if a field is missing
            - if casting from primitives and `allow_primitive_to_str` is `False`
            - if casting from decimals and `allow_decimal_to_str` is `False`

    Returns:
        array (`pyarrow.Array`): the casted array
    """
    from .features.features import LargeList, List, get_nested_type

    # Recursive call that carries the same casting options
    _c = partial(
        cast_array_to_feature,
        allow_primitive_to_str=allow_primitive_to_str,
        allow_decimal_to_str=allow_decimal_to_str,
    )

    # Extension arrays are always handled through their storage
    if isinstance(array, pa.ExtensionArray):
        array = array.storage
    # Features that define "cast_storage" are in charge of their own casting
    if hasattr(feature, "cast_storage"):
        return feature.cast_storage(array)

    if pa.types.is_struct(array.type):
        # feature must be a dict
        # The fields of the array must all be keys of the feature; keys of the feature that are
        # missing from the array are filled with nulls
        if isinstance(feature, dict) and (array_fields := {field.name for field in array.type}) <= set(feature):
            null_array = pa.array([None] * len(array))
            arrays = [
                _c(array.field(name) if name in array_fields else null_array, subfeature)
                for name, subfeature in feature.items()
            ]
            return pa.StructArray.from_arrays(arrays, names=list(feature), mask=array.is_null())
    elif pa.types.is_list(array.type) or pa.types.is_large_list(array.type):
        # feature must be either List(subfeature) or LargeList(subfeature)
        if isinstance(feature, LargeList):
            casted_array_values = _c(array.values, feature.feature)
            if pa.types.is_large_list(array.type) and casted_array_values.type == array.values.type:
                # Both array and feature have equal large_list type and values (within the list) type
                return array
            else:
                # Merge offsets with the null bitmap to avoid the "Null bitmap with offsets slice not supported" ArrowNotImplementedError
                array_offsets = _combine_list_array_offsets_with_mask(array)
                return pa.LargeListArray.from_arrays(array_offsets, casted_array_values)
        elif isinstance(feature, List):
            if feature.length > -1:
                # Fixed-length target: only possible if every sub-list already has the target length
                if _are_list_values_of_length(array, feature.length):
                    if array.null_count > 0:
                        # Ensure each null value in the array translates to [null] * feature.length in the array's values array
                        array_type = array.type
                        storage_type = _storage_type(array_type)
                        if array_type != storage_type:
                            # Temporarily convert to the storage type to support extension types in the slice operation
                            array = array_cast(
                                array,
                                storage_type,
                                allow_primitive_to_str=allow_primitive_to_str,
                                allow_decimal_to_str=allow_decimal_to_str,
                            )
                            array = pc.list_slice(array, 0, feature.length, return_fixed_size_list=True)
                            array = array_cast(
                                array,
                                array_type,
                                allow_primitive_to_str=allow_primitive_to_str,
                                allow_decimal_to_str=allow_decimal_to_str,
                            )
                        else:
                            array = pc.list_slice(array, 0, feature.length, return_fixed_size_list=True)
                        array_values = array.values
                        casted_array_values = _c(array_values, feature.feature)
                        return pa.FixedSizeListArray.from_arrays(
                            casted_array_values, feature.length, mask=array.is_null()
                        )
                    else:
                        # No nulls: keep only the values that belong to this (possibly sliced) array
                        array_values = array.values[
                            array.offset * feature.length : (array.offset + len(array)) * feature.length
                        ]
                        return pa.FixedSizeListArray.from_arrays(_c(array_values, feature.feature), feature.length)
            else:
                casted_array_values = _c(array.values, feature.feature)
                if pa.types.is_list(array.type) and casted_array_values.type == array.values.type:
                    # Both array and feature have equal list type and values (within the list) type
                    return array
                else:
                    # Merge offsets with the null bitmap to avoid the "Null bitmap with offsets slice not supported" ArrowNotImplementedError
                    array_offsets = _combine_list_array_offsets_with_mask(array)
                    return pa.ListArray.from_arrays(array_offsets, casted_array_values)
    elif pa.types.is_fixed_size_list(array.type):
        # feature must be either List(subfeature) or LargeList(subfeature)
        if isinstance(feature, LargeList):
            # Offsets are regularly spaced by the list size, starting at the offset of the array
            array_offsets = (np.arange(len(array) + 1) + array.offset) * array.type.list_size
            return pa.LargeListArray.from_arrays(
                array_offsets, _c(array.values, feature.feature), mask=array.is_null()
            )
        elif isinstance(feature, List):
            if feature.length > -1:
                # Fixed-size to fixed-length is only supported for equal sizes
                if feature.length == array.type.list_size:
                    # Keep only the values that belong to this (possibly sliced) array
                    array_values = array.values[
                        array.offset * array.type.list_size : (array.offset + len(array)) * array.type.list_size
                    ]
                    casted_array_values = _c(array_values, feature.feature)
                    return pa.FixedSizeListArray.from_arrays(casted_array_values, feature.length, mask=array.is_null())
            else:
                # Offsets are regularly spaced by the list size, starting at the offset of the array
                array_offsets = (np.arange(len(array) + 1) + array.offset) * array.type.list_size
                return pa.ListArray.from_arrays(array_offsets, _c(array.values, feature.feature), mask=array.is_null())
    # Reached when no nested branch above returned
    if pa.types.is_null(array.type):
        # Null arrays are cast to the arrow type computed from the feature
        return array_cast(
            array,
            get_nested_type(feature),
            allow_primitive_to_str=allow_primitive_to_str,
            allow_decimal_to_str=allow_decimal_to_str,
        )
    elif not isinstance(feature, (List, LargeList, dict)):
        # Non-nested feature: calling it gives the target type passed to `array_cast`
        return array_cast(
            array,
            feature(),
            allow_primitive_to_str=allow_primitive_to_str,
            allow_decimal_to_str=allow_decimal_to_str,
        )
    raise TypeError(f"Couldn't cast array of type\n{_short_str(array.type)}\nto\n{_short_str(feature)}")
2093
+
2094
+
2095
@_wrap_for_chunked_arrays
def embed_array_storage(array: pa.Array, feature: "FeatureType", token_per_repo_id=None):
    """Embed data into an arrays's storage.
    For custom features like Audio or Image, it takes into account the "embed_storage" methods
    they define to embed external data (e.g. an image file) into an array.

    Nested arrays (struct, list, large list, fixed-size list) are rebuilt from their embedded children.
    An array whose feature is not nested and has no "embed_storage" method is returned as is.

    <Added version="2.4.0"/>

    Args:
        array (`pa.Array`):
            The PyArrow array in which to embed data.
        feature (`datasets.features.FeatureType`):
            Array features.
        token_per_repo_id (defaults to `None`):
            Forwarded unchanged to the "embed_storage" method of the features and to the recursive calls.

    Raises:
        `TypeError`: if the target type is not supported according, e.g.

            - if a field is missing

    Returns:
        array (`pyarrow.Array`): the array with embedded data
    """
    from .features import LargeList, List

    # Recursive call that carries the same `token_per_repo_id`
    _e = partial(embed_array_storage, token_per_repo_id=token_per_repo_id)

    # Extension arrays are always handled through their storage
    if isinstance(array, pa.ExtensionArray):
        array = array.storage
    if hasattr(feature, "embed_storage"):
        return feature.embed_storage(array, token_per_repo_id=token_per_repo_id)
    elif pa.types.is_struct(array.type):
        # feature must be a dict
        if isinstance(feature, dict):
            arrays = [_e(array.field(name), subfeature) for name, subfeature in feature.items()]
            return pa.StructArray.from_arrays(arrays, names=list(feature), mask=array.is_null())
    elif pa.types.is_list(array.type):
        # feature must be List(subfeature)
        # Merge offsets with the null bitmap to avoid the "Null bitmap with offsets slice not supported" ArrowNotImplementedError
        array_offsets = _combine_list_array_offsets_with_mask(array)
        if isinstance(feature, List) and feature.length == -1:
            return pa.ListArray.from_arrays(array_offsets, _e(array.values, feature.feature))
    elif pa.types.is_large_list(array.type):
        # feature must be LargeList(subfeature)
        # Merge offsets with the null bitmap to avoid the "Null bitmap with offsets slice not supported" ArrowNotImplementedError
        array_offsets = _combine_list_array_offsets_with_mask(array)
        return pa.LargeListArray.from_arrays(array_offsets, _e(array.values, feature.feature))
    elif pa.types.is_fixed_size_list(array.type):
        # feature must be List(subfeature)
        if isinstance(feature, List) and feature.length > -1:
            # Keep only the values that belong to this (possibly sliced) array
            array_values = array.values[
                array.offset * array.type.list_size : (array.offset + len(array)) * array.type.list_size
            ]
            embedded_array_values = _e(array_values, feature.feature)
            return pa.FixedSizeListArray.from_arrays(embedded_array_values, feature.length, mask=array.is_null())
    # Reached when no branch above returned: non-nested features have nothing to embed
    if not isinstance(feature, (List, LargeList, dict)):
        return array
    raise TypeError(f"Couldn't embed array of type\n{_short_str(array.type)}\nwith\n{_short_str(feature)}")
2152
+
2153
+
2154
class CastError(ValueError):
    """When it's not possible to cast an Arrow table to a specific schema or set of features

    Args:
        *args:
            Positional arguments forwarded to `ValueError` (typically the error message).
        table_column_names (`list[str]`):
            Column names of the table that could not be cast.
        requested_column_names (`list[str]`):
            Column names that were requested by the target schema or features.
    """

    def __init__(self, *args, table_column_names: list[str], requested_column_names: list[str]) -> None:
        super().__init__(*args)
        self.table_column_names = table_column_names
        self.requested_column_names = requested_column_names

    def __reduce__(self):
        # Fix unpickling: TypeError: __init__() missing 2 required keyword-only arguments: 'table_column_names' and 'requested_column_names'
        # The keyword-only arguments are bound with `partial`, and `self.args` is passed as the positional
        # arguments so that the error message survives the round trip.
        return partial(
            CastError, table_column_names=self.table_column_names, requested_column_names=self.requested_column_names
        ), self.args

    def details(self):
        """Return a sentence that lists the new and/or missing columns of the table compared to the request."""
        new_columns = set(self.table_column_names) - set(self.requested_column_names)
        missing_columns = set(self.requested_column_names) - set(self.table_column_names)
        if new_columns and missing_columns:
            return f"there are {len(new_columns)} new columns ({_short_str(new_columns)}) and {len(missing_columns)} missing columns ({_short_str(missing_columns)})."
        elif new_columns:
            return f"there are {len(new_columns)} new columns ({_short_str(new_columns)})"
        else:
            return f"there are {len(missing_columns)} missing columns ({_short_str(missing_columns)})"
2177
+
2178
+
2179
def cast_table_to_features(table: pa.Table, features: "Features"):
    """Cast a table to the arrow schema that corresponds to the requested features.

    The table and the features must have exactly the same column names, in any order.

    Args:
        table (`pyarrow.Table`):
            PyArrow table to cast.
        features ([`Features`]):
            Target features.

    Raises:
        `CastError`: if the column names of the table and of the features don't match.

    Returns:
        table (`pyarrow.Table`): the casted table
    """
    names_match = sorted(table.column_names) == sorted(features)
    if not names_match:
        raise CastError(
            f"Couldn't cast\n{_short_str(table.schema)}\nto\n{_short_str(features)}\nbecause column names don't match",
            table_column_names=table.column_names,
            requested_column_names=list(features),
        )
    casted_columns = []
    for name, feature in features.items():
        casted_columns.append(cast_array_to_feature(table[name], feature))
    return pa.Table.from_arrays(casted_columns, schema=features.arrow_schema)
2199
+
2200
+
2201
def cast_table_to_schema(table: pa.Table, schema: pa.Schema):
    """Cast a table to the arrow schema. Different from `cast_table_to_features`, this method can preserve nullability.

    The columns of the table must all be part of the schema. Columns of the schema that
    are missing from the table are created as all-null columns of the requested type.

    Args:
        table (`pa.Table`):
            PyArrow table to cast.
        schema (`pa.Schema`):
            Target PyArrow schema.

    Raises:
        `CastError`: if the table has columns that are not in the schema.

    Returns:
        `pa.Table`: the casted table
    """
    from .features import Features

    features = Features.from_arrow_schema(schema)
    present_columns = set(table.column_names)
    if not present_columns <= set(schema.names):
        raise CastError(
            f"Couldn't cast\n{_short_str(table.schema)}\nto\n{_short_str(features)}\nbecause column names don't match",
            table_column_names=table.column_names,
            requested_column_names=list(features),
        )

    def _column_or_nulls(name):
        if name in present_columns:
            return table[name]
        return pa.array([None] * len(table), type=schema.field(name).type)

    casted_columns = [cast_array_to_feature(_column_or_nulls(name), feature) for name, feature in features.items()]
    return pa.Table.from_arrays(casted_columns, schema=schema)
2231
+
2232
+
2233
def embed_table_storage(table: pa.Table, token_per_repo_id=None):
    """Embed external data into a table's storage.

    Only the columns whose feature requires it (according to `require_storage_embed`)
    go through `embed_array_storage`; the other columns are reused unchanged.

    <Added version="2.4.0"/>

    Args:
        table (`pyarrow.Table`):
            PyArrow table in which to embed data.
        token_per_repo_id (defaults to `None`):
            Forwarded unchanged to `embed_array_storage`.

    Returns:
        table (`pyarrow.Table`): the table with embedded data
    """
    from .features.features import Features, require_storage_embed

    features = Features.from_arrow_schema(table.schema)
    embedded_columns = []
    for name, feature in features.items():
        if require_storage_embed(feature):
            column = embed_array_storage(table[name], feature, token_per_repo_id=token_per_repo_id)
        else:
            column = table[name]
        embedded_columns.append(column)
    return pa.Table.from_arrays(embedded_columns, schema=features.arrow_schema)
2255
+
2256
+
2257
def table_cast(table: pa.Table, schema: pa.Schema):
    """Improved version of `pa.Table.cast`.

    It supports casting to feature types stored in the schema metadata.

    A table whose schema differs from the target is cast with `cast_table_to_schema`.
    When only the schema metadata differs, the metadata is replaced. Otherwise the
    table is returned as is.

    Args:
        table (`pyarrow.Table`):
            PyArrow table to cast.
        schema (`pyarrow.Schema`):
            Target PyArrow schema.

    Returns:
        table (`pyarrow.Table`): the casted table
    """
    if table.schema != schema:
        return cast_table_to_schema(table, schema)
    if table.schema.metadata != schema.metadata:
        return table.replace_schema_metadata(schema.metadata)
    return table
2277
+
2278
+
2279
def table_flatten(table: pa.Table):
    """Improved version of `pa.Table.flatten`.

    It behaves as `pa.Table.flatten` in a sense it does 1-step flatten of the columns with a struct type into one column per struct field,
    but updates the metadata and skips decodable features unless the `decode` attribute of these features is set to False.

    Args:
        table (`pa.Table`):
            PyArrow table to flatten.

    Returns:
        `Table`: the flattened table
    """
    from .features import Features

    features = Features.from_arrow_schema(table.schema)
    # A feature whose `flatten()` returns itself must be kept as a single column
    has_unflattenable_feature = any(
        hasattr(subfeature, "flatten") and subfeature.flatten() == subfeature for subfeature in features.values()
    )
    if not has_unflattenable_feature:
        flat_table = table.flatten()
    else:
        columns = []
        column_names = []
        for field in table.schema:
            column = table.column(field.name)
            subfeature = features[field.name]
            expand_struct = pa.types.is_struct(field.type) and (
                not hasattr(subfeature, "flatten") or subfeature.flatten() != subfeature
            )
            if expand_struct:
                columns.extend(column.flatten())
                column_names.extend([f"{field.name}.{subfield.name}" for subfield in field.type])
            else:
                columns.append(column)
                column_names.append(field.name)
        flat_table = pa.Table.from_arrays(columns, names=column_names)
    # Preserve complex types in the metadata
    all_flat_features = features.flatten(max_depth=2)
    kept_flat_features = Features({name: all_flat_features[name] for name in flat_table.column_names})
    return flat_table.replace_schema_metadata(kept_flat_features.arrow_schema.metadata)
2319
+
2320
+
2321
def table_visitor(table: pa.Table, function: Callable[..., None]):
    """Visit all arrays in a table and apply a function to them.

    Every column is walked chunk by chunk. `function` is called on each (storage) array
    together with the feature describing it, then the walk recurses into struct fields
    and into the values of list and large-list arrays.

    Args:
        table (`pyarrow.Table`):
            PyArrow table to visit.
        function (`Callable[[pa.Array, feature], None]`):
            Function to apply to each array. It is called with two positional
            arguments: the array and its matching feature.
    """
    from .features import Features, LargeList, List

    features = Features.from_arrow_schema(table.schema)

    def _visit(array, feature):
        if isinstance(array, pa.ChunkedArray):
            for chunk in array.chunks:
                _visit(chunk, feature)
            return
        if isinstance(array, pa.ExtensionArray):
            # Visit the underlying storage rather than the extension wrapper.
            array = array.storage
        function(array, feature)
        if pa.types.is_struct(array.type) and not hasattr(feature, "cast_storage"):
            for name, subfeature in feature.items():
                _visit(array.field(name), subfeature)
        elif pa.types.is_list(array.type) or pa.types.is_large_list(array.type):
            # `is_list` alone is False for large-list arrays, which made the LargeList
            # case unreachable; both list flavours expose their items through `.values`.
            if isinstance(feature, (LargeList, List)):
                _visit(array.values, feature.feature)

    for name, feature in features.items():
        _visit(table[name], feature)
2351
+
2352
+
2353
def table_iter(table: Table, batch_size: int, drop_last_batch=False) -> Iterator[pa.Table]:
    """Yield consecutive sub-tables of `batch_size` rows.

    Args:
        table (`pyarrow.Table`):
            PyArrow table to iterate over.
        batch_size (`int`):
            Size of each sub-table to yield.
        drop_last_batch (`bool`, defaults to `False`):
            Drop the last batch if it is smaller than `batch_size`.
    """
    pending = []
    pending_rows = 0
    for chunk in table.to_reader(max_chunksize=batch_size):
        rows = len(chunk)
        if rows == 0:
            continue
        missing = batch_size - pending_rows
        if rows < missing:
            # Still not enough rows for a full batch: keep accumulating.
            pending.append(chunk)
            pending_rows += rows
        elif rows == missing:
            pending.append(chunk)
            yield pa.Table.from_batches(pending)
            pending = []
            pending_rows = 0
        else:
            # The chunk overflows the batch: emit the head, carry the tail over.
            pending.append(chunk.slice(0, missing))
            yield pa.Table.from_batches(pending)
            leftover = rows - missing
            pending = [chunk.slice(missing, leftover)]
            pending_rows = leftover
    if pending and not drop_last_batch:
        yield pa.Table.from_batches(pending)
idna/__init__.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .core import (
2
+ IDNABidiError,
3
+ IDNAError,
4
+ InvalidCodepoint,
5
+ InvalidCodepointContext,
6
+ alabel,
7
+ check_bidi,
8
+ check_hyphen_ok,
9
+ check_initial_combiner,
10
+ check_label,
11
+ check_nfc,
12
+ decode,
13
+ encode,
14
+ ulabel,
15
+ uts46_remap,
16
+ valid_contextj,
17
+ valid_contexto,
18
+ valid_label_length,
19
+ valid_string_length,
20
+ )
21
+ from .intranges import intranges_contain
22
+ from .package_data import __version__
23
+
24
+ __all__ = [
25
+ "__version__",
26
+ "IDNABidiError",
27
+ "IDNAError",
28
+ "InvalidCodepoint",
29
+ "InvalidCodepointContext",
30
+ "alabel",
31
+ "check_bidi",
32
+ "check_hyphen_ok",
33
+ "check_initial_combiner",
34
+ "check_label",
35
+ "check_nfc",
36
+ "decode",
37
+ "encode",
38
+ "intranges_contain",
39
+ "ulabel",
40
+ "uts46_remap",
41
+ "valid_contextj",
42
+ "valid_contexto",
43
+ "valid_label_length",
44
+ "valid_string_length",
45
+ ]
idna/codec.py ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import codecs
2
+ import re
3
+ from typing import Any, Optional, Tuple
4
+
5
+ from .core import IDNAError, alabel, decode, encode, ulabel
6
+
7
+ _unicode_dots_re = re.compile("[\u002e\u3002\uff0e\uff61]")
8
+
9
+
10
class Codec(codecs.Codec):
    """Stateless codec delegating to the module-level `encode` / `decode` functions."""

    def encode(self, data: str, errors: str = "strict") -> Tuple[bytes, int]:
        """Encode `data`; return the encoded bytes and the input length consumed."""
        if errors != "strict":
            raise IDNAError('Unsupported error handling "{}"'.format(errors))

        # Empty input short-circuits without calling the encoder.
        return (encode(data), len(data)) if data else (b"", 0)

    def decode(self, data: bytes, errors: str = "strict") -> Tuple[str, int]:
        """Decode `data`; return the decoded text and the input length consumed."""
        if errors != "strict":
            raise IDNAError('Unsupported error handling "{}"'.format(errors))

        # Empty input short-circuits without calling the decoder.
        return (decode(data), len(data)) if data else ("", 0)
28
+
29
+
30
class IncrementalEncoder(codecs.BufferedIncrementalEncoder):
    """Incremental encoder that converts complete labels and buffers a trailing partial one."""

    def _buffer_encode(self, data: str, errors: str, final: bool) -> Tuple[bytes, int]:
        """Encode the complete labels of `data`; return the bytes and the characters consumed."""
        if errors != "strict":
            raise IDNAError('Unsupported error handling "{}"'.format(errors))

        if not data:
            return b"", 0

        labels = _unicode_dots_re.split(data)
        suffix = b""
        if labels:
            if not labels[-1]:
                # Input ended on a separator: emit it after the encoded labels.
                suffix = b"."
                labels.pop()
            elif not final:
                # Keep potentially unfinished label until the next call
                labels.pop()
                if labels:
                    suffix = b"."

        encoded = []
        consumed = 0
        for label in labels:
            encoded.append(alabel(label))
            # Count one separator in front of every label once something was consumed.
            consumed += len(label) + (1 if consumed else 0)

        # Join with U+002E
        return b".".join(encoded) + suffix, consumed + len(suffix)
62
+
63
+
64
class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
    """Incremental decoder that converts complete labels and buffers a trailing partial one."""

    def _buffer_decode(self, data: Any, errors: str, final: bool) -> Tuple[str, int]:
        """Decode the complete labels of `data`; return the text and the length consumed."""
        if errors != "strict":
            raise IDNAError('Unsupported error handling "{}"'.format(errors))

        if not data:
            return ("", 0)

        text = data if isinstance(data, str) else str(data, "ascii")

        labels = _unicode_dots_re.split(text)
        suffix = ""
        if labels:
            if not labels[-1]:
                # Input ended on a separator: emit it after the decoded labels.
                suffix = "."
                labels.pop()
            elif not final:
                # Keep potentially unfinished label until the next call
                labels.pop()
                if labels:
                    suffix = "."

        decoded = []
        consumed = 0
        for label in labels:
            decoded.append(ulabel(label))
            # Count one separator in front of every label once something was consumed.
            consumed += len(label) + (1 if consumed else 0)

        return (".".join(decoded) + suffix, consumed + len(suffix))
98
+
99
+
100
class StreamWriter(Codec, codecs.StreamWriter):
    """Stream writer combining `Codec` with `codecs.StreamWriter`; adds nothing of its own."""
102
+
103
+
104
class StreamReader(Codec, codecs.StreamReader):
    """Stream reader combining `Codec` with `codecs.StreamReader`; adds nothing of its own."""
106
+
107
+
108
def search_function(name: str) -> Optional[codecs.CodecInfo]:
    """Codec lookup hook: return the codec info for "idna2008", `None` for any other name."""
    if name == "idna2008":
        return codecs.CodecInfo(
            name=name,
            encode=Codec().encode,
            decode=Codec().decode,  # type: ignore
            incrementalencoder=IncrementalEncoder,
            incrementaldecoder=IncrementalDecoder,
            streamwriter=StreamWriter,
            streamreader=StreamReader,
        )
    return None
120
+
121
+
122
# Import-time side effect: make the "idna2008" codec discoverable through `codecs`.
codecs.register(search_function)
idna/compat.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Any, Union
2
+
3
+ from .core import decode, encode
4
+
5
+
6
def ToASCII(label: str) -> bytes:
    """Compatibility wrapper: return `encode(label)`."""
    ascii_form = encode(label)
    return ascii_form
8
+
9
+
10
def ToUnicode(label: Union[bytes, bytearray]) -> str:
    """Compatibility wrapper: return `decode(label)`."""
    unicode_form = decode(label)
    return unicode_form
12
+
13
+
14
def nameprep(s: Any) -> None:
    """Always raise `NotImplementedError`; kept only so the name exists."""
    message = "IDNA 2008 does not utilise nameprep protocol"
    raise NotImplementedError(message)
idna/core.py ADDED
@@ -0,0 +1,437 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import bisect
2
+ import re
3
+ import unicodedata
4
+ from typing import Optional, Union
5
+
6
+ from . import idnadata
7
+ from .intranges import intranges_contain
8
+
9
+ _virama_combining_class = 9
10
+ _alabel_prefix = b"xn--"
11
+ _unicode_dots_re = re.compile("[\u002e\u3002\uff0e\uff61]")
12
+
13
+
14
class IDNAError(UnicodeError):
    """Root of the IDNA exception hierarchy; raised for any IDNA-encoding problem."""
18
+
19
+
20
class IDNABidiError(IDNAError):
    """Raised when a label does not satisfy the bidirectional requirements."""
24
+
25
+
26
class InvalidCodepoint(IDNAError):
    """Raised when a disallowed or unallocated codepoint is used."""
30
+
31
+
32
class InvalidCodepointContext(IDNAError):
    """Raised when a codepoint is not valid in the context where it appears."""
36
+
37
+
38
+ def _combining_class(cp: int) -> int:
39
+ v = unicodedata.combining(chr(cp))
40
+ if v == 0:
41
+ if not unicodedata.name(chr(cp)):
42
+ raise ValueError("Unknown character in unicodedata")
43
+ return v
44
+
45
+
46
def _is_script(cp: str, script: str) -> bool:
    """Tell whether character `cp` lies in the codepoint ranges listed for `script`."""
    script_ranges = idnadata.scripts[script]
    return intranges_contain(ord(cp), script_ranges)
48
+
49
+
50
+ def _punycode(s: str) -> bytes:
51
+ return s.encode("punycode")
52
+
53
+
54
+ def _unot(s: int) -> str:
55
+ return "U+{:04X}".format(s)
56
+
57
+
58
def valid_label_length(label: Union[bytes, str]) -> bool:
    """Return True when `label` has at most 63 elements."""
    return len(label) <= 63
62
+
63
+
64
def valid_string_length(label: Union[bytes, str], trailing_dot: bool) -> bool:
    """Return True when `label` fits in 253 elements, or 254 when `trailing_dot` is set."""
    limit = 254 if trailing_dot else 253
    return len(label) <= limit
68
+
69
+
70
def check_bidi(label: str, check_ltr: bool = False) -> bool:
    """Validate `label` against the six bidi rules enforced below.

    Args:
        label: the label to check.
        check_ltr: also apply the rules when the label contains no R, AL or AN character.

    Returns:
        True when the label passes (or when the rules do not apply).

    Raises:
        IDNABidiError: if a character has no known directionality or a rule is violated.
    """
    directions = []
    for idx, cp in enumerate(label, 1):
        direction = unicodedata.bidirectional(cp)
        if direction == "":
            # String likely comes from a newer version of Unicode
            raise IDNABidiError("Unknown directionality in label {} at position {}".format(repr(label), idx))
        directions.append(direction)

    # Bidi rules should only be applied if string contains RTL characters
    has_rtl = any(direction in ("R", "AL", "AN") for direction in directions)
    if not has_rtl and not check_ltr:
        return True

    # Bidi rule 1
    first_direction = unicodedata.bidirectional(label[0])
    if first_direction in ("R", "AL"):
        rtl = True
    elif first_direction == "L":
        rtl = False
    else:
        raise IDNABidiError("First codepoint in label {} must be directionality L, R or AL".format(repr(label)))

    rtl_allowed = ("R", "AL", "AN", "EN", "ES", "CS", "ET", "ON", "BN", "NSM")
    ltr_allowed = ("L", "EN", "ES", "CS", "ET", "ON", "BN", "NSM")

    valid_ending = False
    number_type = None
    for idx, direction in enumerate(directions, 1):
        if rtl:
            # Bidi rule 2
            if direction not in rtl_allowed:
                raise IDNABidiError("Invalid direction for codepoint at position {} in a right-to-left label".format(idx))
            # Bidi rule 3
            if direction in ("R", "AL", "EN", "AN"):
                valid_ending = True
            elif direction != "NSM":
                valid_ending = False
            # Bidi rule 4
            if direction in ("AN", "EN"):
                if not number_type:
                    number_type = direction
                elif number_type != direction:
                    raise IDNABidiError("Can not mix numeral types in a right-to-left label")
        else:
            # Bidi rule 5
            if direction not in ltr_allowed:
                raise IDNABidiError("Invalid direction for codepoint at position {} in a left-to-right label".format(idx))
            # Bidi rule 6
            if direction in ("L", "EN"):
                valid_ending = True
            elif direction != "NSM":
                valid_ending = False

    if not valid_ending:
        raise IDNABidiError("Label ends with illegal codepoint directionality")

    return True
138
+
139
+
140
def check_initial_combiner(label: str) -> bool:
    """Return True unless the first character is in a Unicode "M" (mark) category.

    Raises:
        IDNAError: if the label starts with a combining character.
    """
    first_category = unicodedata.category(label[0])
    if first_category.startswith("M"):
        raise IDNAError("Label begins with an illegal combining character")
    return True
144
+
145
+
146
def check_hyphen_ok(label: str) -> bool:
    """Return True when the hyphens in `label` are acceptably placed.

    Raises:
        IDNAError: if positions 3-4 hold "--", or the label starts or ends with "-".
    """
    if label[2:4] == "--":
        raise IDNAError("Label has disallowed hyphens in 3rd and 4th position")
    first, last = label[0], label[-1]
    if "-" in (first, last):
        raise IDNAError("Label must not start or end with a hyphen")
    return True
152
+
153
+
154
def check_nfc(label: str) -> None:
    """Raise `IDNAError` unless `label` is unchanged by NFC normalization."""
    normalized = unicodedata.normalize("NFC", label)
    if normalized != label:
        raise IDNAError("Label must be in Normalization Form C")
157
+
158
+
159
def valid_contextj(label: str, pos: int) -> bool:
    """Check the contextual rule for the joiner at `label[pos]`.

    U+200C and U+200D are accepted right after a character whose combining class is
    the virama class. U+200C is additionally accepted when, skipping characters of
    joining type T, the nearest character before it has joining type L or D and the
    nearest one after it has joining type R or D. Any other codepoint yields False.
    """
    joiner = ord(label[pos])
    if joiner not in (0x200C, 0x200D):
        return False

    if pos > 0 and _combining_class(ord(label[pos - 1])) == _virama_combining_class:
        return True

    if joiner == 0x200D:
        return False

    def _nearest_joining_type(indices):
        # Joining type of the first character that is not of type T, or None.
        for i in indices:
            joining_type = idnadata.joining_types.get(ord(label[i]))
            if joining_type != ord("T"):
                return joining_type
        return None

    if _nearest_joining_type(range(pos - 1, -1, -1)) not in (ord("L"), ord("D")):
        return False
    return _nearest_joining_type(range(pos + 1, len(label))) in (ord("R"), ord("D"))
201
+
202
+
203
def valid_contexto(label: str, pos: int, exception: bool = False) -> bool:
    """Check the contextual rule for the codepoint at `label[pos]`.

    Handles U+00B7, U+0375, U+05F3/U+05F4, U+30FB and the digit ranges
    U+0660..U+0669 and U+06F0..U+06F9; any other codepoint yields False.
    The `exception` argument is accepted but not used.
    """
    code = ord(label[pos])
    last_index = len(label) - 1

    if code == 0x00B7:
        # Must sit between two U+006C characters.
        return 0 < pos < last_index and ord(label[pos - 1]) == 0x006C and ord(label[pos + 1]) == 0x006C

    if code == 0x0375:
        # Must be followed by a Greek character.
        if pos < last_index and len(label) > 1:
            return _is_script(label[pos + 1], "Greek")
        return False

    if code in (0x05F3, 0x05F4):
        # Must be preceded by a Hebrew character.
        if pos > 0:
            return _is_script(label[pos - 1], "Hebrew")
        return False

    if code == 0x30FB:
        # Needs at least one Hiragana, Katakana or Han character in the label.
        return any(
            cp != "\u30fb" and (_is_script(cp, "Hiragana") or _is_script(cp, "Katakana") or _is_script(cp, "Han"))
            for cp in label
        )

    if 0x660 <= code <= 0x669:
        # Must not be mixed with digits from U+06F0..U+06F9.
        return not any(0x6F0 <= ord(cp) <= 0x06F9 for cp in label)

    if 0x6F0 <= code <= 0x6F9:
        # Must not be mixed with digits from U+0660..U+0669.
        return not any(0x660 <= ord(cp) <= 0x0669 for cp in label)

    return False
243
+
244
+
245
def check_label(label: Union[str, bytes, bytearray]) -> None:
    """Validate a single label, raising on the first problem found.

    Bytes input is decoded as UTF-8 first. The label must be non-empty, pass the NFC,
    hyphen and initial-combiner checks, consist only of PVALID codepoints or of
    CONTEXTJ/CONTEXTO codepoints that satisfy their contextual rule, and pass the
    bidi check.

    Raises:
        IDNAError: empty label, failed NFC/hyphen/combiner check, or an unknown
            character next to a joiner.
        InvalidCodepointContext: a CONTEXTJ/CONTEXTO codepoint in an invalid context.
        InvalidCodepoint: a codepoint in none of the allowed classes.
        IDNABidiError: the bidi check failed.
    """
    if isinstance(label, (bytes, bytearray)):
        label = label.decode("utf-8")
    if len(label) == 0:
        raise IDNAError("Empty Label")

    check_nfc(label)
    check_hyphen_ok(label)
    check_initial_combiner(label)

    for pos, cp in enumerate(label):
        cp_value = ord(cp)
        if intranges_contain(cp_value, idnadata.codepoint_classes["PVALID"]):
            continue
        elif intranges_contain(cp_value, idnadata.codepoint_classes["CONTEXTJ"]):
            # Only the lookup is guarded. InvalidCodepointContext derives from
            # UnicodeError, itself a ValueError, so raising it inside the `try` would
            # be caught below and reported as an "Unknown codepoint" IDNAError.
            try:
                joiner_ok = valid_contextj(label, pos)
            except ValueError as exc:
                raise IDNAError(
                    "Unknown codepoint adjacent to joiner {} at position {} in {}".format(
                        _unot(cp_value), pos + 1, repr(label)
                    )
                ) from exc
            if not joiner_ok:
                raise InvalidCodepointContext(
                    "Joiner {} not allowed at position {} in {}".format(_unot(cp_value), pos + 1, repr(label))
                )
        elif intranges_contain(cp_value, idnadata.codepoint_classes["CONTEXTO"]):
            if not valid_contexto(label, pos):
                raise InvalidCodepointContext(
                    "Codepoint {} not allowed at position {} in {}".format(_unot(cp_value), pos + 1, repr(label))
                )
        else:
            raise InvalidCodepoint(
                "Codepoint {} at position {} of {} not allowed".format(_unot(cp_value), pos + 1, repr(label))
            )

    check_bidi(label)
282
+
283
+
284
def alabel(label: str) -> bytes:
    """Return the A-label (ASCII bytes) for `label`.

    An ASCII-only label is validated through `ulabel` and returned as-is. Any other
    label is validated with `check_label` and returned as the A-label prefix followed
    by its punycode form.

    Raises:
        IDNAError: if the label is invalid or the result exceeds the label length limit.
    """
    try:
        ascii_bytes = label.encode("ascii")
        ulabel(ascii_bytes)
        if not valid_label_length(ascii_bytes):
            raise IDNAError("Label too long")
        return ascii_bytes
    except UnicodeEncodeError:
        # Not pure ASCII: fall through to the punycode path.
        pass

    check_label(label)
    encoded = _alabel_prefix + _punycode(label)
    if valid_label_length(encoded):
        return encoded
    raise IDNAError("Label too long")
301
+
302
+
303
def ulabel(label: Union[str, bytes, bytearray]) -> str:
    """Return the U-label (text form) for `label`.

    A non-ASCII string is validated and returned unchanged. ASCII input is lowercased;
    without the A-label prefix it is validated and returned as text, with the prefix
    the remainder is punycode-decoded and validated.

    Raises:
        IDNAError: if the label is malformed or fails validation.
    """
    if isinstance(label, (bytes, bytearray)):
        raw = bytes(label)
    else:
        try:
            raw = label.encode("ascii")
        except UnicodeEncodeError:
            check_label(label)
            return label

    raw = raw.lower()
    if not raw.startswith(_alabel_prefix):
        check_label(raw)
        return raw.decode("ascii")

    raw = raw[len(_alabel_prefix) :]
    if not raw:
        raise IDNAError("Malformed A-label, no Punycode eligible content found")
    if raw.decode("ascii")[-1] == "-":
        raise IDNAError("A-label must not end with a hyphen")

    try:
        label = raw.decode("punycode")
    except UnicodeError:
        raise IDNAError("Invalid A-label")
    check_label(label)
    return label
330
+
331
+
332
def uts46_remap(domain: str, std3_rules: bool = True, transitional: bool = False) -> str:
    """Re-map the characters in the string according to UTS46 processing.

    Each character is looked up in the UTS46 table and, depending on its status and on
    `std3_rules` / `transitional`, is kept, replaced or dropped. The result is
    NFC-normalized.

    Raises:
        InvalidCodepoint: if a character is not covered by the table or its status
            does not allow it.
    """
    from .uts46data import uts46data

    pieces = []
    for pos, char in enumerate(domain):
        code_point = ord(char)
        try:
            # The first 256 rows are indexed directly; later rows are found by bisection.
            if code_point < 256:
                row_index = code_point
            else:
                row_index = bisect.bisect_left(uts46data, (code_point, "Z")) - 1
            uts46row = uts46data[row_index]
            status = uts46row[1]
            replacement = uts46row[2] if len(uts46row) == 3 else None
            if (
                status == "V"
                or (status == "D" and not transitional)
                or (status == "3" and not std3_rules and replacement is None)
            ):
                pieces.append(char)
            elif replacement is not None and (
                status == "M" or (status == "3" and not std3_rules) or (status == "D" and transitional)
            ):
                pieces.append(replacement)
            elif status != "I":
                # Routed through the handler below, like a failed table lookup.
                raise IndexError()
        except IndexError:
            raise InvalidCodepoint(
                "Codepoint {} not allowed at position {} in {}".format(_unot(code_point), pos + 1, repr(domain))
            )

    return unicodedata.normalize("NFC", "".join(pieces))
364
+
365
+
366
def encode(
    s: Union[str, bytes, bytearray],
    strict: bool = False,
    uts46: bool = False,
    std3_rules: bool = False,
    transitional: bool = False,
) -> bytes:
    """Encode a domain name into its dot-joined A-label form.

    Args:
        s: the domain; bytes input must be ASCII.
        strict: split labels on "." only instead of every separator in `_unicode_dots_re`.
        uts46: run `uts46_remap` on the domain first.
        std3_rules: forwarded to `uts46_remap`.
        transitional: forwarded to `uts46_remap`.

    Raises:
        IDNAError: for non-ASCII bytes, an empty domain or label, an invalid label,
            or a result that is too long.
    """
    if not isinstance(s, str):
        try:
            s = str(s, "ascii")
        except UnicodeDecodeError:
            raise IDNAError("should pass a unicode string to the function rather than a byte string.")
    if uts46:
        s = uts46_remap(s, std3_rules, transitional)

    labels = s.split(".") if strict else _unicode_dots_re.split(s)
    if not labels or labels == [""]:
        raise IDNAError("Empty domain")

    # A final empty label means the domain ended with a separator.
    trailing_dot = labels[-1] == ""
    if trailing_dot:
        del labels[-1]

    encoded_labels = []
    for label in labels:
        encoded = alabel(label)
        if not encoded:
            raise IDNAError("Empty label")
        encoded_labels.append(encoded)
    if trailing_dot:
        encoded_labels.append(b"")

    domain_bytes = b".".join(encoded_labels)
    if not valid_string_length(domain_bytes, trailing_dot):
        raise IDNAError("Domain too long")
    return domain_bytes
403
+
404
+
405
def decode(
    s: Union[str, bytes, bytearray],
    strict: bool = False,
    uts46: bool = False,
    std3_rules: bool = False,
) -> str:
    """Decode a domain name into its dot-joined U-label form.

    Args:
        s: the domain; bytes input must be ASCII.
        strict: split labels on "." only instead of every separator in `_unicode_dots_re`.
        uts46: run `uts46_remap` (non-transitional) on the domain first.
        std3_rules: forwarded to `uts46_remap`.

    Raises:
        IDNAError: for non-ASCII bytes, an empty domain or label, or an invalid label.
    """
    try:
        if not isinstance(s, str):
            s = str(s, "ascii")
    except UnicodeDecodeError:
        raise IDNAError("Invalid ASCII in A-label")
    if uts46:
        s = uts46_remap(s, std3_rules, False)

    labels = s.split(".") if strict else _unicode_dots_re.split(s)
    if not labels or labels == [""]:
        raise IDNAError("Empty domain")

    # A final empty label means the domain ended with a separator.
    trailing_dot = not labels[-1]
    if trailing_dot:
        del labels[-1]

    decoded_labels = []
    for label in labels:
        decoded = ulabel(label)
        if not decoded:
            raise IDNAError("Empty label")
        decoded_labels.append(decoded)
    if trailing_dot:
        decoded_labels.append("")
    return ".".join(decoded_labels)
idna/idnadata.py ADDED
@@ -0,0 +1,4309 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This file is automatically generated by tools/idna-data
2
+
3
+ __version__ = "16.0.0"
4
+
5
+ scripts = {
6
+ "Greek": (
7
+ 0x37000000374,
8
+ 0x37500000378,
9
+ 0x37A0000037E,
10
+ 0x37F00000380,
11
+ 0x38400000385,
12
+ 0x38600000387,
13
+ 0x3880000038B,
14
+ 0x38C0000038D,
15
+ 0x38E000003A2,
16
+ 0x3A3000003E2,
17
+ 0x3F000000400,
18
+ 0x1D2600001D2B,
19
+ 0x1D5D00001D62,
20
+ 0x1D6600001D6B,
21
+ 0x1DBF00001DC0,
22
+ 0x1F0000001F16,
23
+ 0x1F1800001F1E,
24
+ 0x1F2000001F46,
25
+ 0x1F4800001F4E,
26
+ 0x1F5000001F58,
27
+ 0x1F5900001F5A,
28
+ 0x1F5B00001F5C,
29
+ 0x1F5D00001F5E,
30
+ 0x1F5F00001F7E,
31
+ 0x1F8000001FB5,
32
+ 0x1FB600001FC5,
33
+ 0x1FC600001FD4,
34
+ 0x1FD600001FDC,
35
+ 0x1FDD00001FF0,
36
+ 0x1FF200001FF5,
37
+ 0x1FF600001FFF,
38
+ 0x212600002127,
39
+ 0xAB650000AB66,
40
+ 0x101400001018F,
41
+ 0x101A0000101A1,
42
+ 0x1D2000001D246,
43
+ ),
44
+ "Han": (
45
+ 0x2E8000002E9A,
46
+ 0x2E9B00002EF4,
47
+ 0x2F0000002FD6,
48
+ 0x300500003006,
49
+ 0x300700003008,
50
+ 0x30210000302A,
51
+ 0x30380000303C,
52
+ 0x340000004DC0,
53
+ 0x4E000000A000,
54
+ 0xF9000000FA6E,
55
+ 0xFA700000FADA,
56
+ 0x16FE200016FE4,
57
+ 0x16FF000016FF2,
58
+ 0x200000002A6E0,
59
+ 0x2A7000002B73A,
60
+ 0x2B7400002B81E,
61
+ 0x2B8200002CEA2,
62
+ 0x2CEB00002EBE1,
63
+ 0x2EBF00002EE5E,
64
+ 0x2F8000002FA1E,
65
+ 0x300000003134B,
66
+ 0x31350000323B0,
67
+ ),
68
+ "Hebrew": (
69
+ 0x591000005C8,
70
+ 0x5D0000005EB,
71
+ 0x5EF000005F5,
72
+ 0xFB1D0000FB37,
73
+ 0xFB380000FB3D,
74
+ 0xFB3E0000FB3F,
75
+ 0xFB400000FB42,
76
+ 0xFB430000FB45,
77
+ 0xFB460000FB50,
78
+ ),
79
+ "Hiragana": (
80
+ 0x304100003097,
81
+ 0x309D000030A0,
82
+ 0x1B0010001B120,
83
+ 0x1B1320001B133,
84
+ 0x1B1500001B153,
85
+ 0x1F2000001F201,
86
+ ),
87
+ "Katakana": (
88
+ 0x30A1000030FB,
89
+ 0x30FD00003100,
90
+ 0x31F000003200,
91
+ 0x32D0000032FF,
92
+ 0x330000003358,
93
+ 0xFF660000FF70,
94
+ 0xFF710000FF9E,
95
+ 0x1AFF00001AFF4,
96
+ 0x1AFF50001AFFC,
97
+ 0x1AFFD0001AFFF,
98
+ 0x1B0000001B001,
99
+ 0x1B1200001B123,
100
+ 0x1B1550001B156,
101
+ 0x1B1640001B168,
102
+ ),
103
+ }
104
+ joining_types = {
105
+ 0xAD: 84,
106
+ 0x300: 84,
107
+ 0x301: 84,
108
+ 0x302: 84,
109
+ 0x303: 84,
110
+ 0x304: 84,
111
+ 0x305: 84,
112
+ 0x306: 84,
113
+ 0x307: 84,
114
+ 0x308: 84,
115
+ 0x309: 84,
116
+ 0x30A: 84,
117
+ 0x30B: 84,
118
+ 0x30C: 84,
119
+ 0x30D: 84,
120
+ 0x30E: 84,
121
+ 0x30F: 84,
122
+ 0x310: 84,
123
+ 0x311: 84,
124
+ 0x312: 84,
125
+ 0x313: 84,
126
+ 0x314: 84,
127
+ 0x315: 84,
128
+ 0x316: 84,
129
+ 0x317: 84,
130
+ 0x318: 84,
131
+ 0x319: 84,
132
+ 0x31A: 84,
133
+ 0x31B: 84,
134
+ 0x31C: 84,
135
+ 0x31D: 84,
136
+ 0x31E: 84,
137
+ 0x31F: 84,
138
+ 0x320: 84,
139
+ 0x321: 84,
140
+ 0x322: 84,
141
+ 0x323: 84,
142
+ 0x324: 84,
143
+ 0x325: 84,
144
+ 0x326: 84,
145
+ 0x327: 84,
146
+ 0x328: 84,
147
+ 0x329: 84,
148
+ 0x32A: 84,
149
+ 0x32B: 84,
150
+ 0x32C: 84,
151
+ 0x32D: 84,
152
+ 0x32E: 84,
153
+ 0x32F: 84,
154
+ 0x330: 84,
155
+ 0x331: 84,
156
+ 0x332: 84,
157
+ 0x333: 84,
158
+ 0x334: 84,
159
+ 0x335: 84,
160
+ 0x336: 84,
161
+ 0x337: 84,
162
+ 0x338: 84,
163
+ 0x339: 84,
164
+ 0x33A: 84,
165
+ 0x33B: 84,
166
+ 0x33C: 84,
167
+ 0x33D: 84,
168
+ 0x33E: 84,
169
+ 0x33F: 84,
170
+ 0x340: 84,
171
+ 0x341: 84,
172
+ 0x342: 84,
173
+ 0x343: 84,
174
+ 0x344: 84,
175
+ 0x345: 84,
176
+ 0x346: 84,
177
+ 0x347: 84,
178
+ 0x348: 84,
179
+ 0x349: 84,
180
+ 0x34A: 84,
181
+ 0x34B: 84,
182
+ 0x34C: 84,
183
+ 0x34D: 84,
184
+ 0x34E: 84,
185
+ 0x34F: 84,
186
+ 0x350: 84,
187
+ 0x351: 84,
188
+ 0x352: 84,
189
+ 0x353: 84,
190
+ 0x354: 84,
191
+ 0x355: 84,
192
+ 0x356: 84,
193
+ 0x357: 84,
194
+ 0x358: 84,
195
+ 0x359: 84,
196
+ 0x35A: 84,
197
+ 0x35B: 84,
198
+ 0x35C: 84,
199
+ 0x35D: 84,
200
+ 0x35E: 84,
201
+ 0x35F: 84,
202
+ 0x360: 84,
203
+ 0x361: 84,
204
+ 0x362: 84,
205
+ 0x363: 84,
206
+ 0x364: 84,
207
+ 0x365: 84,
208
+ 0x366: 84,
209
+ 0x367: 84,
210
+ 0x368: 84,
211
+ 0x369: 84,
212
+ 0x36A: 84,
213
+ 0x36B: 84,
214
+ 0x36C: 84,
215
+ 0x36D: 84,
216
+ 0x36E: 84,
217
+ 0x36F: 84,
218
+ 0x483: 84,
219
+ 0x484: 84,
220
+ 0x485: 84,
221
+ 0x486: 84,
222
+ 0x487: 84,
223
+ 0x488: 84,
224
+ 0x489: 84,
225
+ 0x591: 84,
226
+ 0x592: 84,
227
+ 0x593: 84,
228
+ 0x594: 84,
229
+ 0x595: 84,
230
+ 0x596: 84,
231
+ 0x597: 84,
232
+ 0x598: 84,
233
+ 0x599: 84,
234
+ 0x59A: 84,
235
+ 0x59B: 84,
236
+ 0x59C: 84,
237
+ 0x59D: 84,
238
+ 0x59E: 84,
239
+ 0x59F: 84,
240
+ 0x5A0: 84,
241
+ 0x5A1: 84,
242
+ 0x5A2: 84,
243
+ 0x5A3: 84,
244
+ 0x5A4: 84,
245
+ 0x5A5: 84,
246
+ 0x5A6: 84,
247
+ 0x5A7: 84,
248
+ 0x5A8: 84,
249
+ 0x5A9: 84,
250
+ 0x5AA: 84,
251
+ 0x5AB: 84,
252
+ 0x5AC: 84,
253
+ 0x5AD: 84,
254
+ 0x5AE: 84,
255
+ 0x5AF: 84,
256
+ 0x5B0: 84,
257
+ 0x5B1: 84,
258
+ 0x5B2: 84,
259
+ 0x5B3: 84,
260
+ 0x5B4: 84,
261
+ 0x5B5: 84,
262
+ 0x5B6: 84,
263
+ 0x5B7: 84,
264
+ 0x5B8: 84,
265
+ 0x5B9: 84,
266
+ 0x5BA: 84,
267
+ 0x5BB: 84,
268
+ 0x5BC: 84,
269
+ 0x5BD: 84,
270
+ 0x5BF: 84,
271
+ 0x5C1: 84,
272
+ 0x5C2: 84,
273
+ 0x5C4: 84,
274
+ 0x5C5: 84,
275
+ 0x5C7: 84,
276
+ 0x610: 84,
277
+ 0x611: 84,
278
+ 0x612: 84,
279
+ 0x613: 84,
280
+ 0x614: 84,
281
+ 0x615: 84,
282
+ 0x616: 84,
283
+ 0x617: 84,
284
+ 0x618: 84,
285
+ 0x619: 84,
286
+ 0x61A: 84,
287
+ 0x61C: 84,
288
+ 0x620: 68,
289
+ 0x622: 82,
290
+ 0x623: 82,
291
+ 0x624: 82,
292
+ 0x625: 82,
293
+ 0x626: 68,
294
+ 0x627: 82,
295
+ 0x628: 68,
296
+ 0x629: 82,
297
+ 0x62A: 68,
298
+ 0x62B: 68,
299
+ 0x62C: 68,
300
+ 0x62D: 68,
301
+ 0x62E: 68,
302
+ 0x62F: 82,
303
+ 0x630: 82,
304
+ 0x631: 82,
305
+ 0x632: 82,
306
+ 0x633: 68,
307
+ 0x634: 68,
308
+ 0x635: 68,
309
+ 0x636: 68,
310
+ 0x637: 68,
311
+ 0x638: 68,
312
+ 0x639: 68,
313
+ 0x63A: 68,
314
+ 0x63B: 68,
315
+ 0x63C: 68,
316
+ 0x63D: 68,
317
+ 0x63E: 68,
318
+ 0x63F: 68,
319
+ 0x640: 67,
320
+ 0x641: 68,
321
+ 0x642: 68,
322
+ 0x643: 68,
323
+ 0x644: 68,
324
+ 0x645: 68,
325
+ 0x646: 68,
326
+ 0x647: 68,
327
+ 0x648: 82,
328
+ 0x649: 68,
329
+ 0x64A: 68,
330
+ 0x64B: 84,
331
+ 0x64C: 84,
332
+ 0x64D: 84,
333
+ 0x64E: 84,
334
+ 0x64F: 84,
335
+ 0x650: 84,
336
+ 0x651: 84,
337
+ 0x652: 84,
338
+ 0x653: 84,
339
+ 0x654: 84,
340
+ 0x655: 84,
341
+ 0x656: 84,
342
+ 0x657: 84,
343
+ 0x658: 84,
344
+ 0x659: 84,
345
+ 0x65A: 84,
346
+ 0x65B: 84,
347
+ 0x65C: 84,
348
+ 0x65D: 84,
349
+ 0x65E: 84,
350
+ 0x65F: 84,
351
+ 0x66E: 68,
352
+ 0x66F: 68,
353
+ 0x670: 84,
354
+ 0x671: 82,
355
+ 0x672: 82,
356
+ 0x673: 82,
357
+ 0x675: 82,
358
+ 0x676: 82,
359
+ 0x677: 82,
360
+ 0x678: 68,
361
+ 0x679: 68,
362
+ 0x67A: 68,
363
+ 0x67B: 68,
364
+ 0x67C: 68,
365
+ 0x67D: 68,
366
+ 0x67E: 68,
367
+ 0x67F: 68,
368
+ 0x680: 68,
369
+ 0x681: 68,
370
+ 0x682: 68,
371
+ 0x683: 68,
372
+ 0x684: 68,
373
+ 0x685: 68,
374
+ 0x686: 68,
375
+ 0x687: 68,
376
+ 0x688: 82,
377
+ 0x689: 82,
378
+ 0x68A: 82,
379
+ 0x68B: 82,
380
+ 0x68C: 82,
381
+ 0x68D: 82,
382
+ 0x68E: 82,
383
+ 0x68F: 82,
384
+ 0x690: 82,
385
+ 0x691: 82,
386
+ 0x692: 82,
387
+ 0x693: 82,
388
+ 0x694: 82,
389
+ 0x695: 82,
390
+ 0x696: 82,
391
+ 0x697: 82,
392
+ 0x698: 82,
393
+ 0x699: 82,
394
+ 0x69A: 68,
395
+ 0x69B: 68,
396
+ 0x69C: 68,
397
+ 0x69D: 68,
398
+ 0x69E: 68,
399
+ 0x69F: 68,
400
+ 0x6A0: 68,
401
+ 0x6A1: 68,
402
+ 0x6A2: 68,
403
+ 0x6A3: 68,
404
+ 0x6A4: 68,
405
+ 0x6A5: 68,
406
+ 0x6A6: 68,
407
+ 0x6A7: 68,
408
+ 0x6A8: 68,
409
+ 0x6A9: 68,
410
+ 0x6AA: 68,
411
+ 0x6AB: 68,
412
+ 0x6AC: 68,
413
+ 0x6AD: 68,
414
+ 0x6AE: 68,
415
+ 0x6AF: 68,
416
+ 0x6B0: 68,
417
+ 0x6B1: 68,
418
+ 0x6B2: 68,
419
+ 0x6B3: 68,
420
+ 0x6B4: 68,
421
+ 0x6B5: 68,
422
+ 0x6B6: 68,
423
+ 0x6B7: 68,
424
+ 0x6B8: 68,
425
+ 0x6B9: 68,
426
+ 0x6BA: 68,
427
+ 0x6BB: 68,
428
+ 0x6BC: 68,
429
+ 0x6BD: 68,
430
+ 0x6BE: 68,
431
+ 0x6BF: 68,
432
+ 0x6C0: 82,
433
+ 0x6C1: 68,
434
+ 0x6C2: 68,
435
+ 0x6C3: 82,
436
+ 0x6C4: 82,
437
+ 0x6C5: 82,
438
+ 0x6C6: 82,
439
+ 0x6C7: 82,
440
+ 0x6C8: 82,
441
+ 0x6C9: 82,
442
+ 0x6CA: 82,
443
+ 0x6CB: 82,
444
+ 0x6CC: 68,
445
+ 0x6CD: 82,
446
+ 0x6CE: 68,
447
+ 0x6CF: 82,
448
+ 0x6D0: 68,
449
+ 0x6D1: 68,
450
+ 0x6D2: 82,
451
+ 0x6D3: 82,
452
+ 0x6D5: 82,
453
+ 0x6D6: 84,
454
+ 0x6D7: 84,
455
+ 0x6D8: 84,
456
+ 0x6D9: 84,
457
+ 0x6DA: 84,
458
+ 0x6DB: 84,
459
+ 0x6DC: 84,
460
+ 0x6DF: 84,
461
+ 0x6E0: 84,
462
+ 0x6E1: 84,
463
+ 0x6E2: 84,
464
+ 0x6E3: 84,
465
+ 0x6E4: 84,
466
+ 0x6E7: 84,
467
+ 0x6E8: 84,
468
+ 0x6EA: 84,
469
+ 0x6EB: 84,
470
+ 0x6EC: 84,
471
+ 0x6ED: 84,
472
+ 0x6EE: 82,
473
+ 0x6EF: 82,
474
+ 0x6FA: 68,
475
+ 0x6FB: 68,
476
+ 0x6FC: 68,
477
+ 0x6FF: 68,
478
+ 0x70F: 84,
479
+ 0x710: 82,
480
+ 0x711: 84,
481
+ 0x712: 68,
482
+ 0x713: 68,
483
+ 0x714: 68,
484
+ 0x715: 82,
485
+ 0x716: 82,
486
+ 0x717: 82,
487
+ 0x718: 82,
488
+ 0x719: 82,
489
+ 0x71A: 68,
490
+ 0x71B: 68,
491
+ 0x71C: 68,
492
+ 0x71D: 68,
493
+ 0x71E: 82,
494
+ 0x71F: 68,
495
+ 0x720: 68,
496
+ 0x721: 68,
497
+ 0x722: 68,
498
+ 0x723: 68,
499
+ 0x724: 68,
500
+ 0x725: 68,
501
+ 0x726: 68,
502
+ 0x727: 68,
503
+ 0x728: 82,
504
+ 0x729: 68,
505
+ 0x72A: 82,
506
+ 0x72B: 68,
507
+ 0x72C: 82,
508
+ 0x72D: 68,
509
+ 0x72E: 68,
510
+ 0x72F: 82,
511
+ 0x730: 84,
512
+ 0x731: 84,
513
+ 0x732: 84,
514
+ 0x733: 84,
515
+ 0x734: 84,
516
+ 0x735: 84,
517
+ 0x736: 84,
518
+ 0x737: 84,
519
+ 0x738: 84,
520
+ 0x739: 84,
521
+ 0x73A: 84,
522
+ 0x73B: 84,
523
+ 0x73C: 84,
524
+ 0x73D: 84,
525
+ 0x73E: 84,
526
+ 0x73F: 84,
527
+ 0x740: 84,
528
+ 0x741: 84,
529
+ 0x742: 84,
530
+ 0x743: 84,
531
+ 0x744: 84,
532
+ 0x745: 84,
533
+ 0x746: 84,
534
+ 0x747: 84,
535
+ 0x748: 84,
536
+ 0x749: 84,
537
+ 0x74A: 84,
538
+ 0x74D: 82,
539
+ 0x74E: 68,
540
+ 0x74F: 68,
541
+ 0x750: 68,
542
+ 0x751: 68,
543
+ 0x752: 68,
544
+ 0x753: 68,
545
+ 0x754: 68,
546
+ 0x755: 68,
547
+ 0x756: 68,
548
+ 0x757: 68,
549
+ 0x758: 68,
550
+ 0x759: 82,
551
+ 0x75A: 82,
552
+ 0x75B: 82,
553
+ 0x75C: 68,
554
+ 0x75D: 68,
555
+ 0x75E: 68,
556
+ 0x75F: 68,
557
+ 0x760: 68,
558
+ 0x761: 68,
559
+ 0x762: 68,
560
+ 0x763: 68,
561
+ 0x764: 68,
562
+ 0x765: 68,
563
+ 0x766: 68,
564
+ 0x767: 68,
565
+ 0x768: 68,
566
+ 0x769: 68,
567
+ 0x76A: 68,
568
+ 0x76B: 82,
569
+ 0x76C: 82,
570
+ 0x76D: 68,
571
+ 0x76E: 68,
572
+ 0x76F: 68,
573
+ 0x770: 68,
574
+ 0x771: 82,
575
+ 0x772: 68,
576
+ 0x773: 82,
577
+ 0x774: 82,
578
+ 0x775: 68,
579
+ 0x776: 68,
580
+ 0x777: 68,
581
+ 0x778: 82,
582
+ 0x779: 82,
583
+ 0x77A: 68,
584
+ 0x77B: 68,
585
+ 0x77C: 68,
586
+ 0x77D: 68,
587
+ 0x77E: 68,
588
+ 0x77F: 68,
589
+ 0x7A6: 84,
590
+ 0x7A7: 84,
591
+ 0x7A8: 84,
592
+ 0x7A9: 84,
593
+ 0x7AA: 84,
594
+ 0x7AB: 84,
595
+ 0x7AC: 84,
596
+ 0x7AD: 84,
597
+ 0x7AE: 84,
598
+ 0x7AF: 84,
599
+ 0x7B0: 84,
600
+ 0x7CA: 68,
601
+ 0x7CB: 68,
602
+ 0x7CC: 68,
603
+ 0x7CD: 68,
604
+ 0x7CE: 68,
605
+ 0x7CF: 68,
606
+ 0x7D0: 68,
607
+ 0x7D1: 68,
608
+ 0x7D2: 68,
609
+ 0x7D3: 68,
610
+ 0x7D4: 68,
611
+ 0x7D5: 68,
612
+ 0x7D6: 68,
613
+ 0x7D7: 68,
614
+ 0x7D8: 68,
615
+ 0x7D9: 68,
616
+ 0x7DA: 68,
617
+ 0x7DB: 68,
618
+ 0x7DC: 68,
619
+ 0x7DD: 68,
620
+ 0x7DE: 68,
621
+ 0x7DF: 68,
622
+ 0x7E0: 68,
623
+ 0x7E1: 68,
624
+ 0x7E2: 68,
625
+ 0x7E3: 68,
626
+ 0x7E4: 68,
627
+ 0x7E5: 68,
628
+ 0x7E6: 68,
629
+ 0x7E7: 68,
630
+ 0x7E8: 68,
631
+ 0x7E9: 68,
632
+ 0x7EA: 68,
633
+ 0x7EB: 84,
634
+ 0x7EC: 84,
635
+ 0x7ED: 84,
636
+ 0x7EE: 84,
637
+ 0x7EF: 84,
638
+ 0x7F0: 84,
639
+ 0x7F1: 84,
640
+ 0x7F2: 84,
641
+ 0x7F3: 84,
642
+ 0x7FA: 67,
643
+ 0x7FD: 84,
644
+ 0x816: 84,
645
+ 0x817: 84,
646
+ 0x818: 84,
647
+ 0x819: 84,
648
+ 0x81B: 84,
649
+ 0x81C: 84,
650
+ 0x81D: 84,
651
+ 0x81E: 84,
652
+ 0x81F: 84,
653
+ 0x820: 84,
654
+ 0x821: 84,
655
+ 0x822: 84,
656
+ 0x823: 84,
657
+ 0x825: 84,
658
+ 0x826: 84,
659
+ 0x827: 84,
660
+ 0x829: 84,
661
+ 0x82A: 84,
662
+ 0x82B: 84,
663
+ 0x82C: 84,
664
+ 0x82D: 84,
665
+ 0x840: 82,
666
+ 0x841: 68,
667
+ 0x842: 68,
668
+ 0x843: 68,
669
+ 0x844: 68,
670
+ 0x845: 68,
671
+ 0x846: 82,
672
+ 0x847: 82,
673
+ 0x848: 68,
674
+ 0x849: 82,
675
+ 0x84A: 68,
676
+ 0x84B: 68,
677
+ 0x84C: 68,
678
+ 0x84D: 68,
679
+ 0x84E: 68,
680
+ 0x84F: 68,
681
+ 0x850: 68,
682
+ 0x851: 68,
683
+ 0x852: 68,
684
+ 0x853: 68,
685
+ 0x854: 82,
686
+ 0x855: 68,
687
+ 0x856: 82,
688
+ 0x857: 82,
689
+ 0x858: 82,
690
+ 0x859: 84,
691
+ 0x85A: 84,
692
+ 0x85B: 84,
693
+ 0x860: 68,
694
+ 0x862: 68,
695
+ 0x863: 68,
696
+ 0x864: 68,
697
+ 0x865: 68,
698
+ 0x867: 82,
699
+ 0x868: 68,
700
+ 0x869: 82,
701
+ 0x86A: 82,
702
+ 0x870: 82,
703
+ 0x871: 82,
704
+ 0x872: 82,
705
+ 0x873: 82,
706
+ 0x874: 82,
707
+ 0x875: 82,
708
+ 0x876: 82,
709
+ 0x877: 82,
710
+ 0x878: 82,
711
+ 0x879: 82,
712
+ 0x87A: 82,
713
+ 0x87B: 82,
714
+ 0x87C: 82,
715
+ 0x87D: 82,
716
+ 0x87E: 82,
717
+ 0x87F: 82,
718
+ 0x880: 82,
719
+ 0x881: 82,
720
+ 0x882: 82,
721
+ 0x883: 67,
722
+ 0x884: 67,
723
+ 0x885: 67,
724
+ 0x886: 68,
725
+ 0x889: 68,
726
+ 0x88A: 68,
727
+ 0x88B: 68,
728
+ 0x88C: 68,
729
+ 0x88D: 68,
730
+ 0x88E: 82,
731
+ 0x897: 84,
732
+ 0x898: 84,
733
+ 0x899: 84,
734
+ 0x89A: 84,
735
+ 0x89B: 84,
736
+ 0x89C: 84,
737
+ 0x89D: 84,
738
+ 0x89E: 84,
739
+ 0x89F: 84,
740
+ 0x8A0: 68,
741
+ 0x8A1: 68,
742
+ 0x8A2: 68,
743
+ 0x8A3: 68,
744
+ 0x8A4: 68,
745
+ 0x8A5: 68,
746
+ 0x8A6: 68,
747
+ 0x8A7: 68,
748
+ 0x8A8: 68,
749
+ 0x8A9: 68,
750
+ 0x8AA: 82,
751
+ 0x8AB: 82,
752
+ 0x8AC: 82,
753
+ 0x8AE: 82,
754
+ 0x8AF: 68,
755
+ 0x8B0: 68,
756
+ 0x8B1: 82,
757
+ 0x8B2: 82,
758
+ 0x8B3: 68,
759
+ 0x8B4: 68,
760
+ 0x8B5: 68,
761
+ 0x8B6: 68,
762
+ 0x8B7: 68,
763
+ 0x8B8: 68,
764
+ 0x8B9: 82,
765
+ 0x8BA: 68,
766
+ 0x8BB: 68,
767
+ 0x8BC: 68,
768
+ 0x8BD: 68,
769
+ 0x8BE: 68,
770
+ 0x8BF: 68,
771
+ 0x8C0: 68,
772
+ 0x8C1: 68,
773
+ 0x8C2: 68,
774
+ 0x8C3: 68,
775
+ 0x8C4: 68,
776
+ 0x8C5: 68,
777
+ 0x8C6: 68,
778
+ 0x8C7: 68,
779
+ 0x8C8: 68,
780
+ 0x8CA: 84,
781
+ 0x8CB: 84,
782
+ 0x8CC: 84,
783
+ 0x8CD: 84,
784
+ 0x8CE: 84,
785
+ 0x8CF: 84,
786
+ 0x8D0: 84,
787
+ 0x8D1: 84,
788
+ 0x8D2: 84,
789
+ 0x8D3: 84,
790
+ 0x8D4: 84,
791
+ 0x8D5: 84,
792
+ 0x8D6: 84,
793
+ 0x8D7: 84,
794
+ 0x8D8: 84,
795
+ 0x8D9: 84,
796
+ 0x8DA: 84,
797
+ 0x8DB: 84,
798
+ 0x8DC: 84,
799
+ 0x8DD: 84,
800
+ 0x8DE: 84,
801
+ 0x8DF: 84,
802
+ 0x8E0: 84,
803
+ 0x8E1: 84,
804
+ 0x8E3: 84,
805
+ 0x8E4: 84,
806
+ 0x8E5: 84,
807
+ 0x8E6: 84,
808
+ 0x8E7: 84,
809
+ 0x8E8: 84,
810
+ 0x8E9: 84,
811
+ 0x8EA: 84,
812
+ 0x8EB: 84,
813
+ 0x8EC: 84,
814
+ 0x8ED: 84,
815
+ 0x8EE: 84,
816
+ 0x8EF: 84,
817
+ 0x8F0: 84,
818
+ 0x8F1: 84,
819
+ 0x8F2: 84,
820
+ 0x8F3: 84,
821
+ 0x8F4: 84,
822
+ 0x8F5: 84,
823
+ 0x8F6: 84,
824
+ 0x8F7: 84,
825
+ 0x8F8: 84,
826
+ 0x8F9: 84,
827
+ 0x8FA: 84,
828
+ 0x8FB: 84,
829
+ 0x8FC: 84,
830
+ 0x8FD: 84,
831
+ 0x8FE: 84,
832
+ 0x8FF: 84,
833
+ 0x900: 84,
834
+ 0x901: 84,
835
+ 0x902: 84,
836
+ 0x93A: 84,
837
+ 0x93C: 84,
838
+ 0x941: 84,
839
+ 0x942: 84,
840
+ 0x943: 84,
841
+ 0x944: 84,
842
+ 0x945: 84,
843
+ 0x946: 84,
844
+ 0x947: 84,
845
+ 0x948: 84,
846
+ 0x94D: 84,
847
+ 0x951: 84,
848
+ 0x952: 84,
849
+ 0x953: 84,
850
+ 0x954: 84,
851
+ 0x955: 84,
852
+ 0x956: 84,
853
+ 0x957: 84,
854
+ 0x962: 84,
855
+ 0x963: 84,
856
+ 0x981: 84,
857
+ 0x9BC: 84,
858
+ 0x9C1: 84,
859
+ 0x9C2: 84,
860
+ 0x9C3: 84,
861
+ 0x9C4: 84,
862
+ 0x9CD: 84,
863
+ 0x9E2: 84,
864
+ 0x9E3: 84,
865
+ 0x9FE: 84,
866
+ 0xA01: 84,
867
+ 0xA02: 84,
868
+ 0xA3C: 84,
869
+ 0xA41: 84,
870
+ 0xA42: 84,
871
+ 0xA47: 84,
872
+ 0xA48: 84,
873
+ 0xA4B: 84,
874
+ 0xA4C: 84,
875
+ 0xA4D: 84,
876
+ 0xA51: 84,
877
+ 0xA70: 84,
878
+ 0xA71: 84,
879
+ 0xA75: 84,
880
+ 0xA81: 84,
881
+ 0xA82: 84,
882
+ 0xABC: 84,
883
+ 0xAC1: 84,
884
+ 0xAC2: 84,
885
+ 0xAC3: 84,
886
+ 0xAC4: 84,
887
+ 0xAC5: 84,
888
+ 0xAC7: 84,
889
+ 0xAC8: 84,
890
+ 0xACD: 84,
891
+ 0xAE2: 84,
892
+ 0xAE3: 84,
893
+ 0xAFA: 84,
894
+ 0xAFB: 84,
895
+ 0xAFC: 84,
896
+ 0xAFD: 84,
897
+ 0xAFE: 84,
898
+ 0xAFF: 84,
899
+ 0xB01: 84,
900
+ 0xB3C: 84,
901
+ 0xB3F: 84,
902
+ 0xB41: 84,
903
+ 0xB42: 84,
904
+ 0xB43: 84,
905
+ 0xB44: 84,
906
+ 0xB4D: 84,
907
+ 0xB55: 84,
908
+ 0xB56: 84,
909
+ 0xB62: 84,
910
+ 0xB63: 84,
911
+ 0xB82: 84,
912
+ 0xBC0: 84,
913
+ 0xBCD: 84,
914
+ 0xC00: 84,
915
+ 0xC04: 84,
916
+ 0xC3C: 84,
917
+ 0xC3E: 84,
918
+ 0xC3F: 84,
919
+ 0xC40: 84,
920
+ 0xC46: 84,
921
+ 0xC47: 84,
922
+ 0xC48: 84,
923
+ 0xC4A: 84,
924
+ 0xC4B: 84,
925
+ 0xC4C: 84,
926
+ 0xC4D: 84,
927
+ 0xC55: 84,
928
+ 0xC56: 84,
929
+ 0xC62: 84,
930
+ 0xC63: 84,
931
+ 0xC81: 84,
932
+ 0xCBC: 84,
933
+ 0xCBF: 84,
934
+ 0xCC6: 84,
935
+ 0xCCC: 84,
936
+ 0xCCD: 84,
937
+ 0xCE2: 84,
938
+ 0xCE3: 84,
939
+ 0xD00: 84,
940
+ 0xD01: 84,
941
+ 0xD3B: 84,
942
+ 0xD3C: 84,
943
+ 0xD41: 84,
944
+ 0xD42: 84,
945
+ 0xD43: 84,
946
+ 0xD44: 84,
947
+ 0xD4D: 84,
948
+ 0xD62: 84,
949
+ 0xD63: 84,
950
+ 0xD81: 84,
951
+ 0xDCA: 84,
952
+ 0xDD2: 84,
953
+ 0xDD3: 84,
954
+ 0xDD4: 84,
955
+ 0xDD6: 84,
956
+ 0xE31: 84,
957
+ 0xE34: 84,
958
+ 0xE35: 84,
959
+ 0xE36: 84,
960
+ 0xE37: 84,
961
+ 0xE38: 84,
962
+ 0xE39: 84,
963
+ 0xE3A: 84,
964
+ 0xE47: 84,
965
+ 0xE48: 84,
966
+ 0xE49: 84,
967
+ 0xE4A: 84,
968
+ 0xE4B: 84,
969
+ 0xE4C: 84,
970
+ 0xE4D: 84,
971
+ 0xE4E: 84,
972
+ 0xEB1: 84,
973
+ 0xEB4: 84,
974
+ 0xEB5: 84,
975
+ 0xEB6: 84,
976
+ 0xEB7: 84,
977
+ 0xEB8: 84,
978
+ 0xEB9: 84,
979
+ 0xEBA: 84,
980
+ 0xEBB: 84,
981
+ 0xEBC: 84,
982
+ 0xEC8: 84,
983
+ 0xEC9: 84,
984
+ 0xECA: 84,
985
+ 0xECB: 84,
986
+ 0xECC: 84,
987
+ 0xECD: 84,
988
+ 0xECE: 84,
989
+ 0xF18: 84,
990
+ 0xF19: 84,
991
+ 0xF35: 84,
992
+ 0xF37: 84,
993
+ 0xF39: 84,
994
+ 0xF71: 84,
995
+ 0xF72: 84,
996
+ 0xF73: 84,
997
+ 0xF74: 84,
998
+ 0xF75: 84,
999
+ 0xF76: 84,
1000
+ 0xF77: 84,
1001
+ 0xF78: 84,
1002
+ 0xF79: 84,
1003
+ 0xF7A: 84,
1004
+ 0xF7B: 84,
1005
+ 0xF7C: 84,
1006
+ 0xF7D: 84,
1007
+ 0xF7E: 84,
1008
+ 0xF80: 84,
1009
+ 0xF81: 84,
1010
+ 0xF82: 84,
1011
+ 0xF83: 84,
1012
+ 0xF84: 84,
1013
+ 0xF86: 84,
1014
+ 0xF87: 84,
1015
+ 0xF8D: 84,
1016
+ 0xF8E: 84,
1017
+ 0xF8F: 84,
1018
+ 0xF90: 84,
1019
+ 0xF91: 84,
1020
+ 0xF92: 84,
1021
+ 0xF93: 84,
1022
+ 0xF94: 84,
1023
+ 0xF95: 84,
1024
+ 0xF96: 84,
1025
+ 0xF97: 84,
1026
+ 0xF99: 84,
1027
+ 0xF9A: 84,
1028
+ 0xF9B: 84,
1029
+ 0xF9C: 84,
1030
+ 0xF9D: 84,
1031
+ 0xF9E: 84,
1032
+ 0xF9F: 84,
1033
+ 0xFA0: 84,
1034
+ 0xFA1: 84,
1035
+ 0xFA2: 84,
1036
+ 0xFA3: 84,
1037
+ 0xFA4: 84,
1038
+ 0xFA5: 84,
1039
+ 0xFA6: 84,
1040
+ 0xFA7: 84,
1041
+ 0xFA8: 84,
1042
+ 0xFA9: 84,
1043
+ 0xFAA: 84,
1044
+ 0xFAB: 84,
1045
+ 0xFAC: 84,
1046
+ 0xFAD: 84,
1047
+ 0xFAE: 84,
1048
+ 0xFAF: 84,
1049
+ 0xFB0: 84,
1050
+ 0xFB1: 84,
1051
+ 0xFB2: 84,
1052
+ 0xFB3: 84,
1053
+ 0xFB4: 84,
1054
+ 0xFB5: 84,
1055
+ 0xFB6: 84,
1056
+ 0xFB7: 84,
1057
+ 0xFB8: 84,
1058
+ 0xFB9: 84,
1059
+ 0xFBA: 84,
1060
+ 0xFBB: 84,
1061
+ 0xFBC: 84,
1062
+ 0xFC6: 84,
1063
+ 0x102D: 84,
1064
+ 0x102E: 84,
1065
+ 0x102F: 84,
1066
+ 0x1030: 84,
1067
+ 0x1032: 84,
1068
+ 0x1033: 84,
1069
+ 0x1034: 84,
1070
+ 0x1035: 84,
1071
+ 0x1036: 84,
1072
+ 0x1037: 84,
1073
+ 0x1039: 84,
1074
+ 0x103A: 84,
1075
+ 0x103D: 84,
1076
+ 0x103E: 84,
1077
+ 0x1058: 84,
1078
+ 0x1059: 84,
1079
+ 0x105E: 84,
1080
+ 0x105F: 84,
1081
+ 0x1060: 84,
1082
+ 0x1071: 84,
1083
+ 0x1072: 84,
1084
+ 0x1073: 84,
1085
+ 0x1074: 84,
1086
+ 0x1082: 84,
1087
+ 0x1085: 84,
1088
+ 0x1086: 84,
1089
+ 0x108D: 84,
1090
+ 0x109D: 84,
1091
+ 0x135D: 84,
1092
+ 0x135E: 84,
1093
+ 0x135F: 84,
1094
+ 0x1712: 84,
1095
+ 0x1713: 84,
1096
+ 0x1714: 84,
1097
+ 0x1732: 84,
1098
+ 0x1733: 84,
1099
+ 0x1752: 84,
1100
+ 0x1753: 84,
1101
+ 0x1772: 84,
1102
+ 0x1773: 84,
1103
+ 0x17B4: 84,
1104
+ 0x17B5: 84,
1105
+ 0x17B7: 84,
1106
+ 0x17B8: 84,
1107
+ 0x17B9: 84,
1108
+ 0x17BA: 84,
1109
+ 0x17BB: 84,
1110
+ 0x17BC: 84,
1111
+ 0x17BD: 84,
1112
+ 0x17C6: 84,
1113
+ 0x17C9: 84,
1114
+ 0x17CA: 84,
1115
+ 0x17CB: 84,
1116
+ 0x17CC: 84,
1117
+ 0x17CD: 84,
1118
+ 0x17CE: 84,
1119
+ 0x17CF: 84,
1120
+ 0x17D0: 84,
1121
+ 0x17D1: 84,
1122
+ 0x17D2: 84,
1123
+ 0x17D3: 84,
1124
+ 0x17DD: 84,
1125
+ 0x1807: 68,
1126
+ 0x180A: 67,
1127
+ 0x180B: 84,
1128
+ 0x180C: 84,
1129
+ 0x180D: 84,
1130
+ 0x180F: 84,
1131
+ 0x1820: 68,
1132
+ 0x1821: 68,
1133
+ 0x1822: 68,
1134
+ 0x1823: 68,
1135
+ 0x1824: 68,
1136
+ 0x1825: 68,
1137
+ 0x1826: 68,
1138
+ 0x1827: 68,
1139
+ 0x1828: 68,
1140
+ 0x1829: 68,
1141
+ 0x182A: 68,
1142
+ 0x182B: 68,
1143
+ 0x182C: 68,
1144
+ 0x182D: 68,
1145
+ 0x182E: 68,
1146
+ 0x182F: 68,
1147
+ 0x1830: 68,
1148
+ 0x1831: 68,
1149
+ 0x1832: 68,
1150
+ 0x1833: 68,
1151
+ 0x1834: 68,
1152
+ 0x1835: 68,
1153
+ 0x1836: 68,
1154
+ 0x1837: 68,
1155
+ 0x1838: 68,
1156
+ 0x1839: 68,
1157
+ 0x183A: 68,
1158
+ 0x183B: 68,
1159
+ 0x183C: 68,
1160
+ 0x183D: 68,
1161
+ 0x183E: 68,
1162
+ 0x183F: 68,
1163
+ 0x1840: 68,
1164
+ 0x1841: 68,
1165
+ 0x1842: 68,
1166
+ 0x1843: 68,
1167
+ 0x1844: 68,
1168
+ 0x1845: 68,
1169
+ 0x1846: 68,
1170
+ 0x1847: 68,
1171
+ 0x1848: 68,
1172
+ 0x1849: 68,
1173
+ 0x184A: 68,
1174
+ 0x184B: 68,
1175
+ 0x184C: 68,
1176
+ 0x184D: 68,
1177
+ 0x184E: 68,
1178
+ 0x184F: 68,
1179
+ 0x1850: 68,
1180
+ 0x1851: 68,
1181
+ 0x1852: 68,
1182
+ 0x1853: 68,
1183
+ 0x1854: 68,
1184
+ 0x1855: 68,
1185
+ 0x1856: 68,
1186
+ 0x1857: 68,
1187
+ 0x1858: 68,
1188
+ 0x1859: 68,
1189
+ 0x185A: 68,
1190
+ 0x185B: 68,
1191
+ 0x185C: 68,
1192
+ 0x185D: 68,
1193
+ 0x185E: 68,
1194
+ 0x185F: 68,
1195
+ 0x1860: 68,
1196
+ 0x1861: 68,
1197
+ 0x1862: 68,
1198
+ 0x1863: 68,
1199
+ 0x1864: 68,
1200
+ 0x1865: 68,
1201
+ 0x1866: 68,
1202
+ 0x1867: 68,
1203
+ 0x1868: 68,
1204
+ 0x1869: 68,
1205
+ 0x186A: 68,
1206
+ 0x186B: 68,
1207
+ 0x186C: 68,
1208
+ 0x186D: 68,
1209
+ 0x186E: 68,
1210
+ 0x186F: 68,
1211
+ 0x1870: 68,
1212
+ 0x1871: 68,
1213
+ 0x1872: 68,
1214
+ 0x1873: 68,
1215
+ 0x1874: 68,
1216
+ 0x1875: 68,
1217
+ 0x1876: 68,
1218
+ 0x1877: 68,
1219
+ 0x1878: 68,
1220
+ 0x1885: 84,
1221
+ 0x1886: 84,
1222
+ 0x1887: 68,
1223
+ 0x1888: 68,
1224
+ 0x1889: 68,
1225
+ 0x188A: 68,
1226
+ 0x188B: 68,
1227
+ 0x188C: 68,
1228
+ 0x188D: 68,
1229
+ 0x188E: 68,
1230
+ 0x188F: 68,
1231
+ 0x1890: 68,
1232
+ 0x1891: 68,
1233
+ 0x1892: 68,
1234
+ 0x1893: 68,
1235
+ 0x1894: 68,
1236
+ 0x1895: 68,
1237
+ 0x1896: 68,
1238
+ 0x1897: 68,
1239
+ 0x1898: 68,
1240
+ 0x1899: 68,
1241
+ 0x189A: 68,
1242
+ 0x189B: 68,
1243
+ 0x189C: 68,
1244
+ 0x189D: 68,
1245
+ 0x189E: 68,
1246
+ 0x189F: 68,
1247
+ 0x18A0: 68,
1248
+ 0x18A1: 68,
1249
+ 0x18A2: 68,
1250
+ 0x18A3: 68,
1251
+ 0x18A4: 68,
1252
+ 0x18A5: 68,
1253
+ 0x18A6: 68,
1254
+ 0x18A7: 68,
1255
+ 0x18A8: 68,
1256
+ 0x18A9: 84,
1257
+ 0x18AA: 68,
1258
+ 0x1920: 84,
1259
+ 0x1921: 84,
1260
+ 0x1922: 84,
1261
+ 0x1927: 84,
1262
+ 0x1928: 84,
1263
+ 0x1932: 84,
1264
+ 0x1939: 84,
1265
+ 0x193A: 84,
1266
+ 0x193B: 84,
1267
+ 0x1A17: 84,
1268
+ 0x1A18: 84,
1269
+ 0x1A1B: 84,
1270
+ 0x1A56: 84,
1271
+ 0x1A58: 84,
1272
+ 0x1A59: 84,
1273
+ 0x1A5A: 84,
1274
+ 0x1A5B: 84,
1275
+ 0x1A5C: 84,
1276
+ 0x1A5D: 84,
1277
+ 0x1A5E: 84,
1278
+ 0x1A60: 84,
1279
+ 0x1A62: 84,
1280
+ 0x1A65: 84,
1281
+ 0x1A66: 84,
1282
+ 0x1A67: 84,
1283
+ 0x1A68: 84,
1284
+ 0x1A69: 84,
1285
+ 0x1A6A: 84,
1286
+ 0x1A6B: 84,
1287
+ 0x1A6C: 84,
1288
+ 0x1A73: 84,
1289
+ 0x1A74: 84,
1290
+ 0x1A75: 84,
1291
+ 0x1A76: 84,
1292
+ 0x1A77: 84,
1293
+ 0x1A78: 84,
1294
+ 0x1A79: 84,
1295
+ 0x1A7A: 84,
1296
+ 0x1A7B: 84,
1297
+ 0x1A7C: 84,
1298
+ 0x1A7F: 84,
1299
+ 0x1AB0: 84,
1300
+ 0x1AB1: 84,
1301
+ 0x1AB2: 84,
1302
+ 0x1AB3: 84,
1303
+ 0x1AB4: 84,
1304
+ 0x1AB5: 84,
1305
+ 0x1AB6: 84,
1306
+ 0x1AB7: 84,
1307
+ 0x1AB8: 84,
1308
+ 0x1AB9: 84,
1309
+ 0x1ABA: 84,
1310
+ 0x1ABB: 84,
1311
+ 0x1ABC: 84,
1312
+ 0x1ABD: 84,
1313
+ 0x1ABE: 84,
1314
+ 0x1ABF: 84,
1315
+ 0x1AC0: 84,
1316
+ 0x1AC1: 84,
1317
+ 0x1AC2: 84,
1318
+ 0x1AC3: 84,
1319
+ 0x1AC4: 84,
1320
+ 0x1AC5: 84,
1321
+ 0x1AC6: 84,
1322
+ 0x1AC7: 84,
1323
+ 0x1AC8: 84,
1324
+ 0x1AC9: 84,
1325
+ 0x1ACA: 84,
1326
+ 0x1ACB: 84,
1327
+ 0x1ACC: 84,
1328
+ 0x1ACD: 84,
1329
+ 0x1ACE: 84,
1330
+ 0x1B00: 84,
1331
+ 0x1B01: 84,
1332
+ 0x1B02: 84,
1333
+ 0x1B03: 84,
1334
+ 0x1B34: 84,
1335
+ 0x1B36: 84,
1336
+ 0x1B37: 84,
1337
+ 0x1B38: 84,
1338
+ 0x1B39: 84,
1339
+ 0x1B3A: 84,
1340
+ 0x1B3C: 84,
1341
+ 0x1B42: 84,
1342
+ 0x1B6B: 84,
1343
+ 0x1B6C: 84,
1344
+ 0x1B6D: 84,
1345
+ 0x1B6E: 84,
1346
+ 0x1B6F: 84,
1347
+ 0x1B70: 84,
1348
+ 0x1B71: 84,
1349
+ 0x1B72: 84,
1350
+ 0x1B73: 84,
1351
+ 0x1B80: 84,
1352
+ 0x1B81: 84,
1353
+ 0x1BA2: 84,
1354
+ 0x1BA3: 84,
1355
+ 0x1BA4: 84,
1356
+ 0x1BA5: 84,
1357
+ 0x1BA8: 84,
1358
+ 0x1BA9: 84,
1359
+ 0x1BAB: 84,
1360
+ 0x1BAC: 84,
1361
+ 0x1BAD: 84,
1362
+ 0x1BE6: 84,
1363
+ 0x1BE8: 84,
1364
+ 0x1BE9: 84,
1365
+ 0x1BED: 84,
1366
+ 0x1BEF: 84,
1367
+ 0x1BF0: 84,
1368
+ 0x1BF1: 84,
1369
+ 0x1C2C: 84,
1370
+ 0x1C2D: 84,
1371
+ 0x1C2E: 84,
1372
+ 0x1C2F: 84,
1373
+ 0x1C30: 84,
1374
+ 0x1C31: 84,
1375
+ 0x1C32: 84,
1376
+ 0x1C33: 84,
1377
+ 0x1C36: 84,
1378
+ 0x1C37: 84,
1379
+ 0x1CD0: 84,
1380
+ 0x1CD1: 84,
1381
+ 0x1CD2: 84,
1382
+ 0x1CD4: 84,
1383
+ 0x1CD5: 84,
1384
+ 0x1CD6: 84,
1385
+ 0x1CD7: 84,
1386
+ 0x1CD8: 84,
1387
+ 0x1CD9: 84,
1388
+ 0x1CDA: 84,
1389
+ 0x1CDB: 84,
1390
+ 0x1CDC: 84,
1391
+ 0x1CDD: 84,
1392
+ 0x1CDE: 84,
1393
+ 0x1CDF: 84,
1394
+ 0x1CE0: 84,
1395
+ 0x1CE2: 84,
1396
+ 0x1CE3: 84,
1397
+ 0x1CE4: 84,
1398
+ 0x1CE5: 84,
1399
+ 0x1CE6: 84,
1400
+ 0x1CE7: 84,
1401
+ 0x1CE8: 84,
1402
+ 0x1CED: 84,
1403
+ 0x1CF4: 84,
1404
+ 0x1CF8: 84,
1405
+ 0x1CF9: 84,
1406
+ 0x1DC0: 84,
1407
+ 0x1DC1: 84,
1408
+ 0x1DC2: 84,
1409
+ 0x1DC3: 84,
1410
+ 0x1DC4: 84,
1411
+ 0x1DC5: 84,
1412
+ 0x1DC6: 84,
1413
+ 0x1DC7: 84,
1414
+ 0x1DC8: 84,
1415
+ 0x1DC9: 84,
1416
+ 0x1DCA: 84,
1417
+ 0x1DCB: 84,
1418
+ 0x1DCC: 84,
1419
+ 0x1DCD: 84,
1420
+ 0x1DCE: 84,
1421
+ 0x1DCF: 84,
1422
+ 0x1DD0: 84,
1423
+ 0x1DD1: 84,
1424
+ 0x1DD2: 84,
1425
+ 0x1DD3: 84,
1426
+ 0x1DD4: 84,
1427
+ 0x1DD5: 84,
1428
+ 0x1DD6: 84,
1429
+ 0x1DD7: 84,
1430
+ 0x1DD8: 84,
1431
+ 0x1DD9: 84,
1432
+ 0x1DDA: 84,
1433
+ 0x1DDB: 84,
1434
+ 0x1DDC: 84,
1435
+ 0x1DDD: 84,
1436
+ 0x1DDE: 84,
1437
+ 0x1DDF: 84,
1438
+ 0x1DE0: 84,
1439
+ 0x1DE1: 84,
1440
+ 0x1DE2: 84,
1441
+ 0x1DE3: 84,
1442
+ 0x1DE4: 84,
1443
+ 0x1DE5: 84,
1444
+ 0x1DE6: 84,
1445
+ 0x1DE7: 84,
1446
+ 0x1DE8: 84,
1447
+ 0x1DE9: 84,
1448
+ 0x1DEA: 84,
1449
+ 0x1DEB: 84,
1450
+ 0x1DEC: 84,
1451
+ 0x1DED: 84,
1452
+ 0x1DEE: 84,
1453
+ 0x1DEF: 84,
1454
+ 0x1DF0: 84,
1455
+ 0x1DF1: 84,
1456
+ 0x1DF2: 84,
1457
+ 0x1DF3: 84,
1458
+ 0x1DF4: 84,
1459
+ 0x1DF5: 84,
1460
+ 0x1DF6: 84,
1461
+ 0x1DF7: 84,
1462
+ 0x1DF8: 84,
1463
+ 0x1DF9: 84,
1464
+ 0x1DFA: 84,
1465
+ 0x1DFB: 84,
1466
+ 0x1DFC: 84,
1467
+ 0x1DFD: 84,
1468
+ 0x1DFE: 84,
1469
+ 0x1DFF: 84,
1470
+ 0x200B: 84,
1471
+ 0x200D: 67,
1472
+ 0x200E: 84,
1473
+ 0x200F: 84,
1474
+ 0x202A: 84,
1475
+ 0x202B: 84,
1476
+ 0x202C: 84,
1477
+ 0x202D: 84,
1478
+ 0x202E: 84,
1479
+ 0x2060: 84,
1480
+ 0x2061: 84,
1481
+ 0x2062: 84,
1482
+ 0x2063: 84,
1483
+ 0x2064: 84,
1484
+ 0x206A: 84,
1485
+ 0x206B: 84,
1486
+ 0x206C: 84,
1487
+ 0x206D: 84,
1488
+ 0x206E: 84,
1489
+ 0x206F: 84,
1490
+ 0x20D0: 84,
1491
+ 0x20D1: 84,
1492
+ 0x20D2: 84,
1493
+ 0x20D3: 84,
1494
+ 0x20D4: 84,
1495
+ 0x20D5: 84,
1496
+ 0x20D6: 84,
1497
+ 0x20D7: 84,
1498
+ 0x20D8: 84,
1499
+ 0x20D9: 84,
1500
+ 0x20DA: 84,
1501
+ 0x20DB: 84,
1502
+ 0x20DC: 84,
1503
+ 0x20DD: 84,
1504
+ 0x20DE: 84,
1505
+ 0x20DF: 84,
1506
+ 0x20E0: 84,
1507
+ 0x20E1: 84,
1508
+ 0x20E2: 84,
1509
+ 0x20E3: 84,
1510
+ 0x20E4: 84,
1511
+ 0x20E5: 84,
1512
+ 0x20E6: 84,
1513
+ 0x20E7: 84,
1514
+ 0x20E8: 84,
1515
+ 0x20E9: 84,
1516
+ 0x20EA: 84,
1517
+ 0x20EB: 84,
1518
+ 0x20EC: 84,
1519
+ 0x20ED: 84,
1520
+ 0x20EE: 84,
1521
+ 0x20EF: 84,
1522
+ 0x20F0: 84,
1523
+ 0x2CEF: 84,
1524
+ 0x2CF0: 84,
1525
+ 0x2CF1: 84,
1526
+ 0x2D7F: 84,
1527
+ 0x2DE0: 84,
1528
+ 0x2DE1: 84,
1529
+ 0x2DE2: 84,
1530
+ 0x2DE3: 84,
1531
+ 0x2DE4: 84,
1532
+ 0x2DE5: 84,
1533
+ 0x2DE6: 84,
1534
+ 0x2DE7: 84,
1535
+ 0x2DE8: 84,
1536
+ 0x2DE9: 84,
1537
+ 0x2DEA: 84,
1538
+ 0x2DEB: 84,
1539
+ 0x2DEC: 84,
1540
+ 0x2DED: 84,
1541
+ 0x2DEE: 84,
1542
+ 0x2DEF: 84,
1543
+ 0x2DF0: 84,
1544
+ 0x2DF1: 84,
1545
+ 0x2DF2: 84,
1546
+ 0x2DF3: 84,
1547
+ 0x2DF4: 84,
1548
+ 0x2DF5: 84,
1549
+ 0x2DF6: 84,
1550
+ 0x2DF7: 84,
1551
+ 0x2DF8: 84,
1552
+ 0x2DF9: 84,
1553
+ 0x2DFA: 84,
1554
+ 0x2DFB: 84,
1555
+ 0x2DFC: 84,
1556
+ 0x2DFD: 84,
1557
+ 0x2DFE: 84,
1558
+ 0x2DFF: 84,
1559
+ 0x302A: 84,
1560
+ 0x302B: 84,
1561
+ 0x302C: 84,
1562
+ 0x302D: 84,
1563
+ 0x3099: 84,
1564
+ 0x309A: 84,
1565
+ 0xA66F: 84,
1566
+ 0xA670: 84,
1567
+ 0xA671: 84,
1568
+ 0xA672: 84,
1569
+ 0xA674: 84,
1570
+ 0xA675: 84,
1571
+ 0xA676: 84,
1572
+ 0xA677: 84,
1573
+ 0xA678: 84,
1574
+ 0xA679: 84,
1575
+ 0xA67A: 84,
1576
+ 0xA67B: 84,
1577
+ 0xA67C: 84,
1578
+ 0xA67D: 84,
1579
+ 0xA69E: 84,
1580
+ 0xA69F: 84,
1581
+ 0xA6F0: 84,
1582
+ 0xA6F1: 84,
1583
+ 0xA802: 84,
1584
+ 0xA806: 84,
1585
+ 0xA80B: 84,
1586
+ 0xA825: 84,
1587
+ 0xA826: 84,
1588
+ 0xA82C: 84,
1589
+ 0xA840: 68,
1590
+ 0xA841: 68,
1591
+ 0xA842: 68,
1592
+ 0xA843: 68,
1593
+ 0xA844: 68,
1594
+ 0xA845: 68,
1595
+ 0xA846: 68,
1596
+ 0xA847: 68,
1597
+ 0xA848: 68,
1598
+ 0xA849: 68,
1599
+ 0xA84A: 68,
1600
+ 0xA84B: 68,
1601
+ 0xA84C: 68,
1602
+ 0xA84D: 68,
1603
+ 0xA84E: 68,
1604
+ 0xA84F: 68,
1605
+ 0xA850: 68,
1606
+ 0xA851: 68,
1607
+ 0xA852: 68,
1608
+ 0xA853: 68,
1609
+ 0xA854: 68,
1610
+ 0xA855: 68,
1611
+ 0xA856: 68,
1612
+ 0xA857: 68,
1613
+ 0xA858: 68,
1614
+ 0xA859: 68,
1615
+ 0xA85A: 68,
1616
+ 0xA85B: 68,
1617
+ 0xA85C: 68,
1618
+ 0xA85D: 68,
1619
+ 0xA85E: 68,
1620
+ 0xA85F: 68,
1621
+ 0xA860: 68,
1622
+ 0xA861: 68,
1623
+ 0xA862: 68,
1624
+ 0xA863: 68,
1625
+ 0xA864: 68,
1626
+ 0xA865: 68,
1627
+ 0xA866: 68,
1628
+ 0xA867: 68,
1629
+ 0xA868: 68,
1630
+ 0xA869: 68,
1631
+ 0xA86A: 68,
1632
+ 0xA86B: 68,
1633
+ 0xA86C: 68,
1634
+ 0xA86D: 68,
1635
+ 0xA86E: 68,
1636
+ 0xA86F: 68,
1637
+ 0xA870: 68,
1638
+ 0xA871: 68,
1639
+ 0xA872: 76,
1640
+ 0xA8C4: 84,
1641
+ 0xA8C5: 84,
1642
+ 0xA8E0: 84,
1643
+ 0xA8E1: 84,
1644
+ 0xA8E2: 84,
1645
+ 0xA8E3: 84,
1646
+ 0xA8E4: 84,
1647
+ 0xA8E5: 84,
1648
+ 0xA8E6: 84,
1649
+ 0xA8E7: 84,
1650
+ 0xA8E8: 84,
1651
+ 0xA8E9: 84,
1652
+ 0xA8EA: 84,
1653
+ 0xA8EB: 84,
1654
+ 0xA8EC: 84,
1655
+ 0xA8ED: 84,
1656
+ 0xA8EE: 84,
1657
+ 0xA8EF: 84,
1658
+ 0xA8F0: 84,
1659
+ 0xA8F1: 84,
1660
+ 0xA8FF: 84,
1661
+ 0xA926: 84,
1662
+ 0xA927: 84,
1663
+ 0xA928: 84,
1664
+ 0xA929: 84,
1665
+ 0xA92A: 84,
1666
+ 0xA92B: 84,
1667
+ 0xA92C: 84,
1668
+ 0xA92D: 84,
1669
+ 0xA947: 84,
1670
+ 0xA948: 84,
1671
+ 0xA949: 84,
1672
+ 0xA94A: 84,
1673
+ 0xA94B: 84,
1674
+ 0xA94C: 84,
1675
+ 0xA94D: 84,
1676
+ 0xA94E: 84,
1677
+ 0xA94F: 84,
1678
+ 0xA950: 84,
1679
+ 0xA951: 84,
1680
+ 0xA980: 84,
1681
+ 0xA981: 84,
1682
+ 0xA982: 84,
1683
+ 0xA9B3: 84,
1684
+ 0xA9B6: 84,
1685
+ 0xA9B7: 84,
1686
+ 0xA9B8: 84,
1687
+ 0xA9B9: 84,
1688
+ 0xA9BC: 84,
1689
+ 0xA9BD: 84,
1690
+ 0xA9E5: 84,
1691
+ 0xAA29: 84,
1692
+ 0xAA2A: 84,
1693
+ 0xAA2B: 84,
1694
+ 0xAA2C: 84,
1695
+ 0xAA2D: 84,
1696
+ 0xAA2E: 84,
1697
+ 0xAA31: 84,
1698
+ 0xAA32: 84,
1699
+ 0xAA35: 84,
1700
+ 0xAA36: 84,
1701
+ 0xAA43: 84,
1702
+ 0xAA4C: 84,
1703
+ 0xAA7C: 84,
1704
+ 0xAAB0: 84,
1705
+ 0xAAB2: 84,
1706
+ 0xAAB3: 84,
1707
+ 0xAAB4: 84,
1708
+ 0xAAB7: 84,
1709
+ 0xAAB8: 84,
1710
+ 0xAABE: 84,
1711
+ 0xAABF: 84,
1712
+ 0xAAC1: 84,
1713
+ 0xAAEC: 84,
1714
+ 0xAAED: 84,
1715
+ 0xAAF6: 84,
1716
+ 0xABE5: 84,
1717
+ 0xABE8: 84,
1718
+ 0xABED: 84,
1719
+ 0xFB1E: 84,
1720
+ 0xFE00: 84,
1721
+ 0xFE01: 84,
1722
+ 0xFE02: 84,
1723
+ 0xFE03: 84,
1724
+ 0xFE04: 84,
1725
+ 0xFE05: 84,
1726
+ 0xFE06: 84,
1727
+ 0xFE07: 84,
1728
+ 0xFE08: 84,
1729
+ 0xFE09: 84,
1730
+ 0xFE0A: 84,
1731
+ 0xFE0B: 84,
1732
+ 0xFE0C: 84,
1733
+ 0xFE0D: 84,
1734
+ 0xFE0E: 84,
1735
+ 0xFE0F: 84,
1736
+ 0xFE20: 84,
1737
+ 0xFE21: 84,
1738
+ 0xFE22: 84,
1739
+ 0xFE23: 84,
1740
+ 0xFE24: 84,
1741
+ 0xFE25: 84,
1742
+ 0xFE26: 84,
1743
+ 0xFE27: 84,
1744
+ 0xFE28: 84,
1745
+ 0xFE29: 84,
1746
+ 0xFE2A: 84,
1747
+ 0xFE2B: 84,
1748
+ 0xFE2C: 84,
1749
+ 0xFE2D: 84,
1750
+ 0xFE2E: 84,
1751
+ 0xFE2F: 84,
1752
+ 0xFEFF: 84,
1753
+ 0xFFF9: 84,
1754
+ 0xFFFA: 84,
1755
+ 0xFFFB: 84,
1756
+ 0x101FD: 84,
1757
+ 0x102E0: 84,
1758
+ 0x10376: 84,
1759
+ 0x10377: 84,
1760
+ 0x10378: 84,
1761
+ 0x10379: 84,
1762
+ 0x1037A: 84,
1763
+ 0x10A01: 84,
1764
+ 0x10A02: 84,
1765
+ 0x10A03: 84,
1766
+ 0x10A05: 84,
1767
+ 0x10A06: 84,
1768
+ 0x10A0C: 84,
1769
+ 0x10A0D: 84,
1770
+ 0x10A0E: 84,
1771
+ 0x10A0F: 84,
1772
+ 0x10A38: 84,
1773
+ 0x10A39: 84,
1774
+ 0x10A3A: 84,
1775
+ 0x10A3F: 84,
1776
+ 0x10AC0: 68,
1777
+ 0x10AC1: 68,
1778
+ 0x10AC2: 68,
1779
+ 0x10AC3: 68,
1780
+ 0x10AC4: 68,
1781
+ 0x10AC5: 82,
1782
+ 0x10AC7: 82,
1783
+ 0x10AC9: 82,
1784
+ 0x10ACA: 82,
1785
+ 0x10ACD: 76,
1786
+ 0x10ACE: 82,
1787
+ 0x10ACF: 82,
1788
+ 0x10AD0: 82,
1789
+ 0x10AD1: 82,
1790
+ 0x10AD2: 82,
1791
+ 0x10AD3: 68,
1792
+ 0x10AD4: 68,
1793
+ 0x10AD5: 68,
1794
+ 0x10AD6: 68,
1795
+ 0x10AD7: 76,
1796
+ 0x10AD8: 68,
1797
+ 0x10AD9: 68,
1798
+ 0x10ADA: 68,
1799
+ 0x10ADB: 68,
1800
+ 0x10ADC: 68,
1801
+ 0x10ADD: 82,
1802
+ 0x10ADE: 68,
1803
+ 0x10ADF: 68,
1804
+ 0x10AE0: 68,
1805
+ 0x10AE1: 82,
1806
+ 0x10AE4: 82,
1807
+ 0x10AE5: 84,
1808
+ 0x10AE6: 84,
1809
+ 0x10AEB: 68,
1810
+ 0x10AEC: 68,
1811
+ 0x10AED: 68,
1812
+ 0x10AEE: 68,
1813
+ 0x10AEF: 82,
1814
+ 0x10B80: 68,
1815
+ 0x10B81: 82,
1816
+ 0x10B82: 68,
1817
+ 0x10B83: 82,
1818
+ 0x10B84: 82,
1819
+ 0x10B85: 82,
1820
+ 0x10B86: 68,
1821
+ 0x10B87: 68,
1822
+ 0x10B88: 68,
1823
+ 0x10B89: 82,
1824
+ 0x10B8A: 68,
1825
+ 0x10B8B: 68,
1826
+ 0x10B8C: 82,
1827
+ 0x10B8D: 68,
1828
+ 0x10B8E: 82,
1829
+ 0x10B8F: 82,
1830
+ 0x10B90: 68,
1831
+ 0x10B91: 82,
1832
+ 0x10BA9: 82,
1833
+ 0x10BAA: 82,
1834
+ 0x10BAB: 82,
1835
+ 0x10BAC: 82,
1836
+ 0x10BAD: 68,
1837
+ 0x10BAE: 68,
1838
+ 0x10D00: 76,
1839
+ 0x10D01: 68,
1840
+ 0x10D02: 68,
1841
+ 0x10D03: 68,
1842
+ 0x10D04: 68,
1843
+ 0x10D05: 68,
1844
+ 0x10D06: 68,
1845
+ 0x10D07: 68,
1846
+ 0x10D08: 68,
1847
+ 0x10D09: 68,
1848
+ 0x10D0A: 68,
1849
+ 0x10D0B: 68,
1850
+ 0x10D0C: 68,
1851
+ 0x10D0D: 68,
1852
+ 0x10D0E: 68,
1853
+ 0x10D0F: 68,
1854
+ 0x10D10: 68,
1855
+ 0x10D11: 68,
1856
+ 0x10D12: 68,
1857
+ 0x10D13: 68,
1858
+ 0x10D14: 68,
1859
+ 0x10D15: 68,
1860
+ 0x10D16: 68,
1861
+ 0x10D17: 68,
1862
+ 0x10D18: 68,
1863
+ 0x10D19: 68,
1864
+ 0x10D1A: 68,
1865
+ 0x10D1B: 68,
1866
+ 0x10D1C: 68,
1867
+ 0x10D1D: 68,
1868
+ 0x10D1E: 68,
1869
+ 0x10D1F: 68,
1870
+ 0x10D20: 68,
1871
+ 0x10D21: 68,
1872
+ 0x10D22: 82,
1873
+ 0x10D23: 68,
1874
+ 0x10D24: 84,
1875
+ 0x10D25: 84,
1876
+ 0x10D26: 84,
1877
+ 0x10D27: 84,
1878
+ 0x10D69: 84,
1879
+ 0x10D6A: 84,
1880
+ 0x10D6B: 84,
1881
+ 0x10D6C: 84,
1882
+ 0x10D6D: 84,
1883
+ 0x10EAB: 84,
1884
+ 0x10EAC: 84,
1885
+ 0x10EC2: 82,
1886
+ 0x10EC3: 68,
1887
+ 0x10EC4: 68,
1888
+ 0x10EFC: 84,
1889
+ 0x10EFD: 84,
1890
+ 0x10EFE: 84,
1891
+ 0x10EFF: 84,
1892
+ 0x10F30: 68,
1893
+ 0x10F31: 68,
1894
+ 0x10F32: 68,
1895
+ 0x10F33: 82,
1896
+ 0x10F34: 68,
1897
+ 0x10F35: 68,
1898
+ 0x10F36: 68,
1899
+ 0x10F37: 68,
1900
+ 0x10F38: 68,
1901
+ 0x10F39: 68,
1902
+ 0x10F3A: 68,
1903
+ 0x10F3B: 68,
1904
+ 0x10F3C: 68,
1905
+ 0x10F3D: 68,
1906
+ 0x10F3E: 68,
1907
+ 0x10F3F: 68,
1908
+ 0x10F40: 68,
1909
+ 0x10F41: 68,
1910
+ 0x10F42: 68,
1911
+ 0x10F43: 68,
1912
+ 0x10F44: 68,
1913
+ 0x10F46: 84,
1914
+ 0x10F47: 84,
1915
+ 0x10F48: 84,
1916
+ 0x10F49: 84,
1917
+ 0x10F4A: 84,
1918
+ 0x10F4B: 84,
1919
+ 0x10F4C: 84,
1920
+ 0x10F4D: 84,
1921
+ 0x10F4E: 84,
1922
+ 0x10F4F: 84,
1923
+ 0x10F50: 84,
1924
+ 0x10F51: 68,
1925
+ 0x10F52: 68,
1926
+ 0x10F53: 68,
1927
+ 0x10F54: 82,
1928
+ 0x10F70: 68,
1929
+ 0x10F71: 68,
1930
+ 0x10F72: 68,
1931
+ 0x10F73: 68,
1932
+ 0x10F74: 82,
1933
+ 0x10F75: 82,
1934
+ 0x10F76: 68,
1935
+ 0x10F77: 68,
1936
+ 0x10F78: 68,
1937
+ 0x10F79: 68,
1938
+ 0x10F7A: 68,
1939
+ 0x10F7B: 68,
1940
+ 0x10F7C: 68,
1941
+ 0x10F7D: 68,
1942
+ 0x10F7E: 68,
1943
+ 0x10F7F: 68,
1944
+ 0x10F80: 68,
1945
+ 0x10F81: 68,
1946
+ 0x10F82: 84,
1947
+ 0x10F83: 84,
1948
+ 0x10F84: 84,
1949
+ 0x10F85: 84,
1950
+ 0x10FB0: 68,
1951
+ 0x10FB2: 68,
1952
+ 0x10FB3: 68,
1953
+ 0x10FB4: 82,
1954
+ 0x10FB5: 82,
1955
+ 0x10FB6: 82,
1956
+ 0x10FB8: 68,
1957
+ 0x10FB9: 82,
1958
+ 0x10FBA: 82,
1959
+ 0x10FBB: 68,
1960
+ 0x10FBC: 68,
1961
+ 0x10FBD: 82,
1962
+ 0x10FBE: 68,
1963
+ 0x10FBF: 68,
1964
+ 0x10FC1: 68,
1965
+ 0x10FC2: 82,
1966
+ 0x10FC3: 82,
1967
+ 0x10FC4: 68,
1968
+ 0x10FC9: 82,
1969
+ 0x10FCA: 68,
1970
+ 0x10FCB: 76,
1971
+ 0x11001: 84,
1972
+ 0x11038: 84,
1973
+ 0x11039: 84,
1974
+ 0x1103A: 84,
1975
+ 0x1103B: 84,
1976
+ 0x1103C: 84,
1977
+ 0x1103D: 84,
1978
+ 0x1103E: 84,
1979
+ 0x1103F: 84,
1980
+ 0x11040: 84,
1981
+ 0x11041: 84,
1982
+ 0x11042: 84,
1983
+ 0x11043: 84,
1984
+ 0x11044: 84,
1985
+ 0x11045: 84,
1986
+ 0x11046: 84,
1987
+ 0x11070: 84,
1988
+ 0x11073: 84,
1989
+ 0x11074: 84,
1990
+ 0x1107F: 84,
1991
+ 0x11080: 84,
1992
+ 0x11081: 84,
1993
+ 0x110B3: 84,
1994
+ 0x110B4: 84,
1995
+ 0x110B5: 84,
1996
+ 0x110B6: 84,
1997
+ 0x110B9: 84,
1998
+ 0x110BA: 84,
1999
+ 0x110C2: 84,
2000
+ 0x11100: 84,
2001
+ 0x11101: 84,
2002
+ 0x11102: 84,
2003
+ 0x11127: 84,
2004
+ 0x11128: 84,
2005
+ 0x11129: 84,
2006
+ 0x1112A: 84,
2007
+ 0x1112B: 84,
2008
+ 0x1112D: 84,
2009
+ 0x1112E: 84,
2010
+ 0x1112F: 84,
2011
+ 0x11130: 84,
2012
+ 0x11131: 84,
2013
+ 0x11132: 84,
2014
+ 0x11133: 84,
2015
+ 0x11134: 84,
2016
+ 0x11173: 84,
2017
+ 0x11180: 84,
2018
+ 0x11181: 84,
2019
+ 0x111B6: 84,
2020
+ 0x111B7: 84,
2021
+ 0x111B8: 84,
2022
+ 0x111B9: 84,
2023
+ 0x111BA: 84,
2024
+ 0x111BB: 84,
2025
+ 0x111BC: 84,
2026
+ 0x111BD: 84,
2027
+ 0x111BE: 84,
2028
+ 0x111C9: 84,
2029
+ 0x111CA: 84,
2030
+ 0x111CB: 84,
2031
+ 0x111CC: 84,
2032
+ 0x111CF: 84,
2033
+ 0x1122F: 84,
2034
+ 0x11230: 84,
2035
+ 0x11231: 84,
2036
+ 0x11234: 84,
2037
+ 0x11236: 84,
2038
+ 0x11237: 84,
2039
+ 0x1123E: 84,
2040
+ 0x11241: 84,
2041
+ 0x112DF: 84,
2042
+ 0x112E3: 84,
2043
+ 0x112E4: 84,
2044
+ 0x112E5: 84,
2045
+ 0x112E6: 84,
2046
+ 0x112E7: 84,
2047
+ 0x112E8: 84,
2048
+ 0x112E9: 84,
2049
+ 0x112EA: 84,
2050
+ 0x11300: 84,
2051
+ 0x11301: 84,
2052
+ 0x1133B: 84,
2053
+ 0x1133C: 84,
2054
+ 0x11340: 84,
2055
+ 0x11366: 84,
2056
+ 0x11367: 84,
2057
+ 0x11368: 84,
2058
+ 0x11369: 84,
2059
+ 0x1136A: 84,
2060
+ 0x1136B: 84,
2061
+ 0x1136C: 84,
2062
+ 0x11370: 84,
2063
+ 0x11371: 84,
2064
+ 0x11372: 84,
2065
+ 0x11373: 84,
2066
+ 0x11374: 84,
2067
+ 0x113BB: 84,
2068
+ 0x113BC: 84,
2069
+ 0x113BD: 84,
2070
+ 0x113BE: 84,
2071
+ 0x113BF: 84,
2072
+ 0x113C0: 84,
2073
+ 0x113CE: 84,
2074
+ 0x113D0: 84,
2075
+ 0x113D2: 84,
2076
+ 0x113E1: 84,
2077
+ 0x113E2: 84,
2078
+ 0x11438: 84,
2079
+ 0x11439: 84,
2080
+ 0x1143A: 84,
2081
+ 0x1143B: 84,
2082
+ 0x1143C: 84,
2083
+ 0x1143D: 84,
2084
+ 0x1143E: 84,
2085
+ 0x1143F: 84,
2086
+ 0x11442: 84,
2087
+ 0x11443: 84,
2088
+ 0x11444: 84,
2089
+ 0x11446: 84,
2090
+ 0x1145E: 84,
2091
+ 0x114B3: 84,
2092
+ 0x114B4: 84,
2093
+ 0x114B5: 84,
2094
+ 0x114B6: 84,
2095
+ 0x114B7: 84,
2096
+ 0x114B8: 84,
2097
+ 0x114BA: 84,
2098
+ 0x114BF: 84,
2099
+ 0x114C0: 84,
2100
+ 0x114C2: 84,
2101
+ 0x114C3: 84,
2102
+ 0x115B2: 84,
2103
+ 0x115B3: 84,
2104
+ 0x115B4: 84,
2105
+ 0x115B5: 84,
2106
+ 0x115BC: 84,
2107
+ 0x115BD: 84,
2108
+ 0x115BF: 84,
2109
+ 0x115C0: 84,
2110
+ 0x115DC: 84,
2111
+ 0x115DD: 84,
2112
+ 0x11633: 84,
2113
+ 0x11634: 84,
2114
+ 0x11635: 84,
2115
+ 0x11636: 84,
2116
+ 0x11637: 84,
2117
+ 0x11638: 84,
2118
+ 0x11639: 84,
2119
+ 0x1163A: 84,
2120
+ 0x1163D: 84,
2121
+ 0x1163F: 84,
2122
+ 0x11640: 84,
2123
+ 0x116AB: 84,
2124
+ 0x116AD: 84,
2125
+ 0x116B0: 84,
2126
+ 0x116B1: 84,
2127
+ 0x116B2: 84,
2128
+ 0x116B3: 84,
2129
+ 0x116B4: 84,
2130
+ 0x116B5: 84,
2131
+ 0x116B7: 84,
2132
+ 0x1171D: 84,
2133
+ 0x1171F: 84,
2134
+ 0x11722: 84,
2135
+ 0x11723: 84,
2136
+ 0x11724: 84,
2137
+ 0x11725: 84,
2138
+ 0x11727: 84,
2139
+ 0x11728: 84,
2140
+ 0x11729: 84,
2141
+ 0x1172A: 84,
2142
+ 0x1172B: 84,
2143
+ 0x1182F: 84,
2144
+ 0x11830: 84,
2145
+ 0x11831: 84,
2146
+ 0x11832: 84,
2147
+ 0x11833: 84,
2148
+ 0x11834: 84,
2149
+ 0x11835: 84,
2150
+ 0x11836: 84,
2151
+ 0x11837: 84,
2152
+ 0x11839: 84,
2153
+ 0x1183A: 84,
2154
+ 0x1193B: 84,
2155
+ 0x1193C: 84,
2156
+ 0x1193E: 84,
2157
+ 0x11943: 84,
2158
+ 0x119D4: 84,
2159
+ 0x119D5: 84,
2160
+ 0x119D6: 84,
2161
+ 0x119D7: 84,
2162
+ 0x119DA: 84,
2163
+ 0x119DB: 84,
2164
+ 0x119E0: 84,
2165
+ 0x11A01: 84,
2166
+ 0x11A02: 84,
2167
+ 0x11A03: 84,
2168
+ 0x11A04: 84,
2169
+ 0x11A05: 84,
2170
+ 0x11A06: 84,
2171
+ 0x11A07: 84,
2172
+ 0x11A08: 84,
2173
+ 0x11A09: 84,
2174
+ 0x11A0A: 84,
2175
+ 0x11A33: 84,
2176
+ 0x11A34: 84,
2177
+ 0x11A35: 84,
2178
+ 0x11A36: 84,
2179
+ 0x11A37: 84,
2180
+ 0x11A38: 84,
2181
+ 0x11A3B: 84,
2182
+ 0x11A3C: 84,
2183
+ 0x11A3D: 84,
2184
+ 0x11A3E: 84,
2185
+ 0x11A47: 84,
2186
+ 0x11A51: 84,
2187
+ 0x11A52: 84,
2188
+ 0x11A53: 84,
2189
+ 0x11A54: 84,
2190
+ 0x11A55: 84,
2191
+ 0x11A56: 84,
2192
+ 0x11A59: 84,
2193
+ 0x11A5A: 84,
2194
+ 0x11A5B: 84,
2195
+ 0x11A8A: 84,
2196
+ 0x11A8B: 84,
2197
+ 0x11A8C: 84,
2198
+ 0x11A8D: 84,
2199
+ 0x11A8E: 84,
2200
+ 0x11A8F: 84,
2201
+ 0x11A90: 84,
2202
+ 0x11A91: 84,
2203
+ 0x11A92: 84,
2204
+ 0x11A93: 84,
2205
+ 0x11A94: 84,
2206
+ 0x11A95: 84,
2207
+ 0x11A96: 84,
2208
+ 0x11A98: 84,
2209
+ 0x11A99: 84,
2210
+ 0x11C30: 84,
2211
+ 0x11C31: 84,
2212
+ 0x11C32: 84,
2213
+ 0x11C33: 84,
2214
+ 0x11C34: 84,
2215
+ 0x11C35: 84,
2216
+ 0x11C36: 84,
2217
+ 0x11C38: 84,
2218
+ 0x11C39: 84,
2219
+ 0x11C3A: 84,
2220
+ 0x11C3B: 84,
2221
+ 0x11C3C: 84,
2222
+ 0x11C3D: 84,
2223
+ 0x11C3F: 84,
2224
+ 0x11C92: 84,
2225
+ 0x11C93: 84,
2226
+ 0x11C94: 84,
2227
+ 0x11C95: 84,
2228
+ 0x11C96: 84,
2229
+ 0x11C97: 84,
2230
+ 0x11C98: 84,
2231
+ 0x11C99: 84,
2232
+ 0x11C9A: 84,
2233
+ 0x11C9B: 84,
2234
+ 0x11C9C: 84,
2235
+ 0x11C9D: 84,
2236
+ 0x11C9E: 84,
2237
+ 0x11C9F: 84,
2238
+ 0x11CA0: 84,
2239
+ 0x11CA1: 84,
2240
+ 0x11CA2: 84,
2241
+ 0x11CA3: 84,
2242
+ 0x11CA4: 84,
2243
+ 0x11CA5: 84,
2244
+ 0x11CA6: 84,
2245
+ 0x11CA7: 84,
2246
+ 0x11CAA: 84,
2247
+ 0x11CAB: 84,
2248
+ 0x11CAC: 84,
2249
+ 0x11CAD: 84,
2250
+ 0x11CAE: 84,
2251
+ 0x11CAF: 84,
2252
+ 0x11CB0: 84,
2253
+ 0x11CB2: 84,
2254
+ 0x11CB3: 84,
2255
+ 0x11CB5: 84,
2256
+ 0x11CB6: 84,
2257
+ 0x11D31: 84,
2258
+ 0x11D32: 84,
2259
+ 0x11D33: 84,
2260
+ 0x11D34: 84,
2261
+ 0x11D35: 84,
2262
+ 0x11D36: 84,
2263
+ 0x11D3A: 84,
2264
+ 0x11D3C: 84,
2265
+ 0x11D3D: 84,
2266
+ 0x11D3F: 84,
2267
+ 0x11D40: 84,
2268
+ 0x11D41: 84,
2269
+ 0x11D42: 84,
2270
+ 0x11D43: 84,
2271
+ 0x11D44: 84,
2272
+ 0x11D45: 84,
2273
+ 0x11D47: 84,
2274
+ 0x11D90: 84,
2275
+ 0x11D91: 84,
2276
+ 0x11D95: 84,
2277
+ 0x11D97: 84,
2278
+ 0x11EF3: 84,
2279
+ 0x11EF4: 84,
2280
+ 0x11F00: 84,
2281
+ 0x11F01: 84,
2282
+ 0x11F36: 84,
2283
+ 0x11F37: 84,
2284
+ 0x11F38: 84,
2285
+ 0x11F39: 84,
2286
+ 0x11F3A: 84,
2287
+ 0x11F40: 84,
2288
+ 0x11F42: 84,
2289
+ 0x11F5A: 84,
2290
+ 0x13430: 84,
2291
+ 0x13431: 84,
2292
+ 0x13432: 84,
2293
+ 0x13433: 84,
2294
+ 0x13434: 84,
2295
+ 0x13435: 84,
2296
+ 0x13436: 84,
2297
+ 0x13437: 84,
2298
+ 0x13438: 84,
2299
+ 0x13439: 84,
2300
+ 0x1343A: 84,
2301
+ 0x1343B: 84,
2302
+ 0x1343C: 84,
2303
+ 0x1343D: 84,
2304
+ 0x1343E: 84,
2305
+ 0x1343F: 84,
2306
+ 0x13440: 84,
2307
+ 0x13447: 84,
2308
+ 0x13448: 84,
2309
+ 0x13449: 84,
2310
+ 0x1344A: 84,
2311
+ 0x1344B: 84,
2312
+ 0x1344C: 84,
2313
+ 0x1344D: 84,
2314
+ 0x1344E: 84,
2315
+ 0x1344F: 84,
2316
+ 0x13450: 84,
2317
+ 0x13451: 84,
2318
+ 0x13452: 84,
2319
+ 0x13453: 84,
2320
+ 0x13454: 84,
2321
+ 0x13455: 84,
2322
+ 0x1611E: 84,
2323
+ 0x1611F: 84,
2324
+ 0x16120: 84,
2325
+ 0x16121: 84,
2326
+ 0x16122: 84,
2327
+ 0x16123: 84,
2328
+ 0x16124: 84,
2329
+ 0x16125: 84,
2330
+ 0x16126: 84,
2331
+ 0x16127: 84,
2332
+ 0x16128: 84,
2333
+ 0x16129: 84,
2334
+ 0x1612D: 84,
2335
+ 0x1612E: 84,
2336
+ 0x1612F: 84,
2337
+ 0x16AF0: 84,
2338
+ 0x16AF1: 84,
2339
+ 0x16AF2: 84,
2340
+ 0x16AF3: 84,
2341
+ 0x16AF4: 84,
2342
+ 0x16B30: 84,
2343
+ 0x16B31: 84,
2344
+ 0x16B32: 84,
2345
+ 0x16B33: 84,
2346
+ 0x16B34: 84,
2347
+ 0x16B35: 84,
2348
+ 0x16B36: 84,
2349
+ 0x16F4F: 84,
2350
+ 0x16F8F: 84,
2351
+ 0x16F90: 84,
2352
+ 0x16F91: 84,
2353
+ 0x16F92: 84,
2354
+ 0x16FE4: 84,
2355
+ 0x1BC9D: 84,
2356
+ 0x1BC9E: 84,
2357
+ 0x1BCA0: 84,
2358
+ 0x1BCA1: 84,
2359
+ 0x1BCA2: 84,
2360
+ 0x1BCA3: 84,
2361
+ 0x1CF00: 84,
2362
+ 0x1CF01: 84,
2363
+ 0x1CF02: 84,
2364
+ 0x1CF03: 84,
2365
+ 0x1CF04: 84,
2366
+ 0x1CF05: 84,
2367
+ 0x1CF06: 84,
2368
+ 0x1CF07: 84,
2369
+ 0x1CF08: 84,
2370
+ 0x1CF09: 84,
2371
+ 0x1CF0A: 84,
2372
+ 0x1CF0B: 84,
2373
+ 0x1CF0C: 84,
2374
+ 0x1CF0D: 84,
2375
+ 0x1CF0E: 84,
2376
+ 0x1CF0F: 84,
2377
+ 0x1CF10: 84,
2378
+ 0x1CF11: 84,
2379
+ 0x1CF12: 84,
2380
+ 0x1CF13: 84,
2381
+ 0x1CF14: 84,
2382
+ 0x1CF15: 84,
2383
+ 0x1CF16: 84,
2384
+ 0x1CF17: 84,
2385
+ 0x1CF18: 84,
2386
+ 0x1CF19: 84,
2387
+ 0x1CF1A: 84,
2388
+ 0x1CF1B: 84,
2389
+ 0x1CF1C: 84,
2390
+ 0x1CF1D: 84,
2391
+ 0x1CF1E: 84,
2392
+ 0x1CF1F: 84,
2393
+ 0x1CF20: 84,
2394
+ 0x1CF21: 84,
2395
+ 0x1CF22: 84,
2396
+ 0x1CF23: 84,
2397
+ 0x1CF24: 84,
2398
+ 0x1CF25: 84,
2399
+ 0x1CF26: 84,
2400
+ 0x1CF27: 84,
2401
+ 0x1CF28: 84,
2402
+ 0x1CF29: 84,
2403
+ 0x1CF2A: 84,
2404
+ 0x1CF2B: 84,
2405
+ 0x1CF2C: 84,
2406
+ 0x1CF2D: 84,
2407
+ 0x1CF30: 84,
2408
+ 0x1CF31: 84,
2409
+ 0x1CF32: 84,
2410
+ 0x1CF33: 84,
2411
+ 0x1CF34: 84,
2412
+ 0x1CF35: 84,
2413
+ 0x1CF36: 84,
2414
+ 0x1CF37: 84,
2415
+ 0x1CF38: 84,
2416
+ 0x1CF39: 84,
2417
+ 0x1CF3A: 84,
2418
+ 0x1CF3B: 84,
2419
+ 0x1CF3C: 84,
2420
+ 0x1CF3D: 84,
2421
+ 0x1CF3E: 84,
2422
+ 0x1CF3F: 84,
2423
+ 0x1CF40: 84,
2424
+ 0x1CF41: 84,
2425
+ 0x1CF42: 84,
2426
+ 0x1CF43: 84,
2427
+ 0x1CF44: 84,
2428
+ 0x1CF45: 84,
2429
+ 0x1CF46: 84,
2430
+ 0x1D167: 84,
2431
+ 0x1D168: 84,
2432
+ 0x1D169: 84,
2433
+ 0x1D173: 84,
2434
+ 0x1D174: 84,
2435
+ 0x1D175: 84,
2436
+ 0x1D176: 84,
2437
+ 0x1D177: 84,
2438
+ 0x1D178: 84,
2439
+ 0x1D179: 84,
2440
+ 0x1D17A: 84,
2441
+ 0x1D17B: 84,
2442
+ 0x1D17C: 84,
2443
+ 0x1D17D: 84,
2444
+ 0x1D17E: 84,
2445
+ 0x1D17F: 84,
2446
+ 0x1D180: 84,
2447
+ 0x1D181: 84,
2448
+ 0x1D182: 84,
2449
+ 0x1D185: 84,
2450
+ 0x1D186: 84,
2451
+ 0x1D187: 84,
2452
+ 0x1D188: 84,
2453
+ 0x1D189: 84,
2454
+ 0x1D18A: 84,
2455
+ 0x1D18B: 84,
2456
+ 0x1D1AA: 84,
2457
+ 0x1D1AB: 84,
2458
+ 0x1D1AC: 84,
2459
+ 0x1D1AD: 84,
2460
+ 0x1D242: 84,
2461
+ 0x1D243: 84,
2462
+ 0x1D244: 84,
2463
+ 0x1DA00: 84,
2464
+ 0x1DA01: 84,
2465
+ 0x1DA02: 84,
2466
+ 0x1DA03: 84,
2467
+ 0x1DA04: 84,
2468
+ 0x1DA05: 84,
2469
+ 0x1DA06: 84,
2470
+ 0x1DA07: 84,
2471
+ 0x1DA08: 84,
2472
+ 0x1DA09: 84,
2473
+ 0x1DA0A: 84,
2474
+ 0x1DA0B: 84,
2475
+ 0x1DA0C: 84,
2476
+ 0x1DA0D: 84,
2477
+ 0x1DA0E: 84,
2478
+ 0x1DA0F: 84,
2479
+ 0x1DA10: 84,
2480
+ 0x1DA11: 84,
2481
+ 0x1DA12: 84,
2482
+ 0x1DA13: 84,
2483
+ 0x1DA14: 84,
2484
+ 0x1DA15: 84,
2485
+ 0x1DA16: 84,
2486
+ 0x1DA17: 84,
2487
+ 0x1DA18: 84,
2488
+ 0x1DA19: 84,
2489
+ 0x1DA1A: 84,
2490
+ 0x1DA1B: 84,
2491
+ 0x1DA1C: 84,
2492
+ 0x1DA1D: 84,
2493
+ 0x1DA1E: 84,
2494
+ 0x1DA1F: 84,
2495
+ 0x1DA20: 84,
2496
+ 0x1DA21: 84,
2497
+ 0x1DA22: 84,
2498
+ 0x1DA23: 84,
2499
+ 0x1DA24: 84,
2500
+ 0x1DA25: 84,
2501
+ 0x1DA26: 84,
2502
+ 0x1DA27: 84,
2503
+ 0x1DA28: 84,
2504
+ 0x1DA29: 84,
2505
+ 0x1DA2A: 84,
2506
+ 0x1DA2B: 84,
2507
+ 0x1DA2C: 84,
2508
+ 0x1DA2D: 84,
2509
+ 0x1DA2E: 84,
2510
+ 0x1DA2F: 84,
2511
+ 0x1DA30: 84,
2512
+ 0x1DA31: 84,
2513
+ 0x1DA32: 84,
2514
+ 0x1DA33: 84,
2515
+ 0x1DA34: 84,
2516
+ 0x1DA35: 84,
2517
+ 0x1DA36: 84,
2518
+ 0x1DA3B: 84,
2519
+ 0x1DA3C: 84,
2520
+ 0x1DA3D: 84,
2521
+ 0x1DA3E: 84,
2522
+ 0x1DA3F: 84,
2523
+ 0x1DA40: 84,
2524
+ 0x1DA41: 84,
2525
+ 0x1DA42: 84,
2526
+ 0x1DA43: 84,
2527
+ 0x1DA44: 84,
2528
+ 0x1DA45: 84,
2529
+ 0x1DA46: 84,
2530
+ 0x1DA47: 84,
2531
+ 0x1DA48: 84,
2532
+ 0x1DA49: 84,
2533
+ 0x1DA4A: 84,
2534
+ 0x1DA4B: 84,
2535
+ 0x1DA4C: 84,
2536
+ 0x1DA4D: 84,
2537
+ 0x1DA4E: 84,
2538
+ 0x1DA4F: 84,
2539
+ 0x1DA50: 84,
2540
+ 0x1DA51: 84,
2541
+ 0x1DA52: 84,
2542
+ 0x1DA53: 84,
2543
+ 0x1DA54: 84,
2544
+ 0x1DA55: 84,
2545
+ 0x1DA56: 84,
2546
+ 0x1DA57: 84,
2547
+ 0x1DA58: 84,
2548
+ 0x1DA59: 84,
2549
+ 0x1DA5A: 84,
2550
+ 0x1DA5B: 84,
2551
+ 0x1DA5C: 84,
2552
+ 0x1DA5D: 84,
2553
+ 0x1DA5E: 84,
2554
+ 0x1DA5F: 84,
2555
+ 0x1DA60: 84,
2556
+ 0x1DA61: 84,
2557
+ 0x1DA62: 84,
2558
+ 0x1DA63: 84,
2559
+ 0x1DA64: 84,
2560
+ 0x1DA65: 84,
2561
+ 0x1DA66: 84,
2562
+ 0x1DA67: 84,
2563
+ 0x1DA68: 84,
2564
+ 0x1DA69: 84,
2565
+ 0x1DA6A: 84,
2566
+ 0x1DA6B: 84,
2567
+ 0x1DA6C: 84,
2568
+ 0x1DA75: 84,
2569
+ 0x1DA84: 84,
2570
+ 0x1DA9B: 84,
2571
+ 0x1DA9C: 84,
2572
+ 0x1DA9D: 84,
2573
+ 0x1DA9E: 84,
2574
+ 0x1DA9F: 84,
2575
+ 0x1DAA1: 84,
2576
+ 0x1DAA2: 84,
2577
+ 0x1DAA3: 84,
2578
+ 0x1DAA4: 84,
2579
+ 0x1DAA5: 84,
2580
+ 0x1DAA6: 84,
2581
+ 0x1DAA7: 84,
2582
+ 0x1DAA8: 84,
2583
+ 0x1DAA9: 84,
2584
+ 0x1DAAA: 84,
2585
+ 0x1DAAB: 84,
2586
+ 0x1DAAC: 84,
2587
+ 0x1DAAD: 84,
2588
+ 0x1DAAE: 84,
2589
+ 0x1DAAF: 84,
2590
+ 0x1E000: 84,
2591
+ 0x1E001: 84,
2592
+ 0x1E002: 84,
2593
+ 0x1E003: 84,
2594
+ 0x1E004: 84,
2595
+ 0x1E005: 84,
2596
+ 0x1E006: 84,
2597
+ 0x1E008: 84,
2598
+ 0x1E009: 84,
2599
+ 0x1E00A: 84,
2600
+ 0x1E00B: 84,
2601
+ 0x1E00C: 84,
2602
+ 0x1E00D: 84,
2603
+ 0x1E00E: 84,
2604
+ 0x1E00F: 84,
2605
+ 0x1E010: 84,
2606
+ 0x1E011: 84,
2607
+ 0x1E012: 84,
2608
+ 0x1E013: 84,
2609
+ 0x1E014: 84,
2610
+ 0x1E015: 84,
2611
+ 0x1E016: 84,
2612
+ 0x1E017: 84,
2613
+ 0x1E018: 84,
2614
+ 0x1E01B: 84,
2615
+ 0x1E01C: 84,
2616
+ 0x1E01D: 84,
2617
+ 0x1E01E: 84,
2618
+ 0x1E01F: 84,
2619
+ 0x1E020: 84,
2620
+ 0x1E021: 84,
2621
+ 0x1E023: 84,
2622
+ 0x1E024: 84,
2623
+ 0x1E026: 84,
2624
+ 0x1E027: 84,
2625
+ 0x1E028: 84,
2626
+ 0x1E029: 84,
2627
+ 0x1E02A: 84,
2628
+ 0x1E08F: 84,
2629
+ 0x1E130: 84,
2630
+ 0x1E131: 84,
2631
+ 0x1E132: 84,
2632
+ 0x1E133: 84,
2633
+ 0x1E134: 84,
2634
+ 0x1E135: 84,
2635
+ 0x1E136: 84,
2636
+ 0x1E2AE: 84,
2637
+ 0x1E2EC: 84,
2638
+ 0x1E2ED: 84,
2639
+ 0x1E2EE: 84,
2640
+ 0x1E2EF: 84,
2641
+ 0x1E4EC: 84,
2642
+ 0x1E4ED: 84,
2643
+ 0x1E4EE: 84,
2644
+ 0x1E4EF: 84,
2645
+ 0x1E5EE: 84,
2646
+ 0x1E5EF: 84,
2647
+ 0x1E8D0: 84,
2648
+ 0x1E8D1: 84,
2649
+ 0x1E8D2: 84,
2650
+ 0x1E8D3: 84,
2651
+ 0x1E8D4: 84,
2652
+ 0x1E8D5: 84,
2653
+ 0x1E8D6: 84,
2654
+ 0x1E900: 68,
2655
+ 0x1E901: 68,
2656
+ 0x1E902: 68,
2657
+ 0x1E903: 68,
2658
+ 0x1E904: 68,
2659
+ 0x1E905: 68,
2660
+ 0x1E906: 68,
2661
+ 0x1E907: 68,
2662
+ 0x1E908: 68,
2663
+ 0x1E909: 68,
2664
+ 0x1E90A: 68,
2665
+ 0x1E90B: 68,
2666
+ 0x1E90C: 68,
2667
+ 0x1E90D: 68,
2668
+ 0x1E90E: 68,
2669
+ 0x1E90F: 68,
2670
+ 0x1E910: 68,
2671
+ 0x1E911: 68,
2672
+ 0x1E912: 68,
2673
+ 0x1E913: 68,
2674
+ 0x1E914: 68,
2675
+ 0x1E915: 68,
2676
+ 0x1E916: 68,
2677
+ 0x1E917: 68,
2678
+ 0x1E918: 68,
2679
+ 0x1E919: 68,
2680
+ 0x1E91A: 68,
2681
+ 0x1E91B: 68,
2682
+ 0x1E91C: 68,
2683
+ 0x1E91D: 68,
2684
+ 0x1E91E: 68,
2685
+ 0x1E91F: 68,
2686
+ 0x1E920: 68,
2687
+ 0x1E921: 68,
2688
+ 0x1E922: 68,
2689
+ 0x1E923: 68,
2690
+ 0x1E924: 68,
2691
+ 0x1E925: 68,
2692
+ 0x1E926: 68,
2693
+ 0x1E927: 68,
2694
+ 0x1E928: 68,
2695
+ 0x1E929: 68,
2696
+ 0x1E92A: 68,
2697
+ 0x1E92B: 68,
2698
+ 0x1E92C: 68,
2699
+ 0x1E92D: 68,
2700
+ 0x1E92E: 68,
2701
+ 0x1E92F: 68,
2702
+ 0x1E930: 68,
2703
+ 0x1E931: 68,
2704
+ 0x1E932: 68,
2705
+ 0x1E933: 68,
2706
+ 0x1E934: 68,
2707
+ 0x1E935: 68,
2708
+ 0x1E936: 68,
2709
+ 0x1E937: 68,
2710
+ 0x1E938: 68,
2711
+ 0x1E939: 68,
2712
+ 0x1E93A: 68,
2713
+ 0x1E93B: 68,
2714
+ 0x1E93C: 68,
2715
+ 0x1E93D: 68,
2716
+ 0x1E93E: 68,
2717
+ 0x1E93F: 68,
2718
+ 0x1E940: 68,
2719
+ 0x1E941: 68,
2720
+ 0x1E942: 68,
2721
+ 0x1E943: 68,
2722
+ 0x1E944: 84,
2723
+ 0x1E945: 84,
2724
+ 0x1E946: 84,
2725
+ 0x1E947: 84,
2726
+ 0x1E948: 84,
2727
+ 0x1E949: 84,
2728
+ 0x1E94A: 84,
2729
+ 0x1E94B: 84,
2730
+ 0xE0001: 84,
2731
+ 0xE0020: 84,
2732
+ 0xE0021: 84,
2733
+ 0xE0022: 84,
2734
+ 0xE0023: 84,
2735
+ 0xE0024: 84,
2736
+ 0xE0025: 84,
2737
+ 0xE0026: 84,
2738
+ 0xE0027: 84,
2739
+ 0xE0028: 84,
2740
+ 0xE0029: 84,
2741
+ 0xE002A: 84,
2742
+ 0xE002B: 84,
2743
+ 0xE002C: 84,
2744
+ 0xE002D: 84,
2745
+ 0xE002E: 84,
2746
+ 0xE002F: 84,
2747
+ 0xE0030: 84,
2748
+ 0xE0031: 84,
2749
+ 0xE0032: 84,
2750
+ 0xE0033: 84,
2751
+ 0xE0034: 84,
2752
+ 0xE0035: 84,
2753
+ 0xE0036: 84,
2754
+ 0xE0037: 84,
2755
+ 0xE0038: 84,
2756
+ 0xE0039: 84,
2757
+ 0xE003A: 84,
2758
+ 0xE003B: 84,
2759
+ 0xE003C: 84,
2760
+ 0xE003D: 84,
2761
+ 0xE003E: 84,
2762
+ 0xE003F: 84,
2763
+ 0xE0040: 84,
2764
+ 0xE0041: 84,
2765
+ 0xE0042: 84,
2766
+ 0xE0043: 84,
2767
+ 0xE0044: 84,
2768
+ 0xE0045: 84,
2769
+ 0xE0046: 84,
2770
+ 0xE0047: 84,
2771
+ 0xE0048: 84,
2772
+ 0xE0049: 84,
2773
+ 0xE004A: 84,
2774
+ 0xE004B: 84,
2775
+ 0xE004C: 84,
2776
+ 0xE004D: 84,
2777
+ 0xE004E: 84,
2778
+ 0xE004F: 84,
2779
+ 0xE0050: 84,
2780
+ 0xE0051: 84,
2781
+ 0xE0052: 84,
2782
+ 0xE0053: 84,
2783
+ 0xE0054: 84,
2784
+ 0xE0055: 84,
2785
+ 0xE0056: 84,
2786
+ 0xE0057: 84,
2787
+ 0xE0058: 84,
2788
+ 0xE0059: 84,
2789
+ 0xE005A: 84,
2790
+ 0xE005B: 84,
2791
+ 0xE005C: 84,
2792
+ 0xE005D: 84,
2793
+ 0xE005E: 84,
2794
+ 0xE005F: 84,
2795
+ 0xE0060: 84,
2796
+ 0xE0061: 84,
2797
+ 0xE0062: 84,
2798
+ 0xE0063: 84,
2799
+ 0xE0064: 84,
2800
+ 0xE0065: 84,
2801
+ 0xE0066: 84,
2802
+ 0xE0067: 84,
2803
+ 0xE0068: 84,
2804
+ 0xE0069: 84,
2805
+ 0xE006A: 84,
2806
+ 0xE006B: 84,
2807
+ 0xE006C: 84,
2808
+ 0xE006D: 84,
2809
+ 0xE006E: 84,
2810
+ 0xE006F: 84,
2811
+ 0xE0070: 84,
2812
+ 0xE0071: 84,
2813
+ 0xE0072: 84,
2814
+ 0xE0073: 84,
2815
+ 0xE0074: 84,
2816
+ 0xE0075: 84,
2817
+ 0xE0076: 84,
2818
+ 0xE0077: 84,
2819
+ 0xE0078: 84,
2820
+ 0xE0079: 84,
2821
+ 0xE007A: 84,
2822
+ 0xE007B: 84,
2823
+ 0xE007C: 84,
2824
+ 0xE007D: 84,
2825
+ 0xE007E: 84,
2826
+ 0xE007F: 84,
2827
+ 0xE0100: 84,
2828
+ 0xE0101: 84,
2829
+ 0xE0102: 84,
2830
+ 0xE0103: 84,
2831
+ 0xE0104: 84,
2832
+ 0xE0105: 84,
2833
+ 0xE0106: 84,
2834
+ 0xE0107: 84,
2835
+ 0xE0108: 84,
2836
+ 0xE0109: 84,
2837
+ 0xE010A: 84,
2838
+ 0xE010B: 84,
2839
+ 0xE010C: 84,
2840
+ 0xE010D: 84,
2841
+ 0xE010E: 84,
2842
+ 0xE010F: 84,
2843
+ 0xE0110: 84,
2844
+ 0xE0111: 84,
2845
+ 0xE0112: 84,
2846
+ 0xE0113: 84,
2847
+ 0xE0114: 84,
2848
+ 0xE0115: 84,
2849
+ 0xE0116: 84,
2850
+ 0xE0117: 84,
2851
+ 0xE0118: 84,
2852
+ 0xE0119: 84,
2853
+ 0xE011A: 84,
2854
+ 0xE011B: 84,
2855
+ 0xE011C: 84,
2856
+ 0xE011D: 84,
2857
+ 0xE011E: 84,
2858
+ 0xE011F: 84,
2859
+ 0xE0120: 84,
2860
+ 0xE0121: 84,
2861
+ 0xE0122: 84,
2862
+ 0xE0123: 84,
2863
+ 0xE0124: 84,
2864
+ 0xE0125: 84,
2865
+ 0xE0126: 84,
2866
+ 0xE0127: 84,
2867
+ 0xE0128: 84,
2868
+ 0xE0129: 84,
2869
+ 0xE012A: 84,
2870
+ 0xE012B: 84,
2871
+ 0xE012C: 84,
2872
+ 0xE012D: 84,
2873
+ 0xE012E: 84,
2874
+ 0xE012F: 84,
2875
+ 0xE0130: 84,
2876
+ 0xE0131: 84,
2877
+ 0xE0132: 84,
2878
+ 0xE0133: 84,
2879
+ 0xE0134: 84,
2880
+ 0xE0135: 84,
2881
+ 0xE0136: 84,
2882
+ 0xE0137: 84,
2883
+ 0xE0138: 84,
2884
+ 0xE0139: 84,
2885
+ 0xE013A: 84,
2886
+ 0xE013B: 84,
2887
+ 0xE013C: 84,
2888
+ 0xE013D: 84,
2889
+ 0xE013E: 84,
2890
+ 0xE013F: 84,
2891
+ 0xE0140: 84,
2892
+ 0xE0141: 84,
2893
+ 0xE0142: 84,
2894
+ 0xE0143: 84,
2895
+ 0xE0144: 84,
2896
+ 0xE0145: 84,
2897
+ 0xE0146: 84,
2898
+ 0xE0147: 84,
2899
+ 0xE0148: 84,
2900
+ 0xE0149: 84,
2901
+ 0xE014A: 84,
2902
+ 0xE014B: 84,
2903
+ 0xE014C: 84,
2904
+ 0xE014D: 84,
2905
+ 0xE014E: 84,
2906
+ 0xE014F: 84,
2907
+ 0xE0150: 84,
2908
+ 0xE0151: 84,
2909
+ 0xE0152: 84,
2910
+ 0xE0153: 84,
2911
+ 0xE0154: 84,
2912
+ 0xE0155: 84,
2913
+ 0xE0156: 84,
2914
+ 0xE0157: 84,
2915
+ 0xE0158: 84,
2916
+ 0xE0159: 84,
2917
+ 0xE015A: 84,
2918
+ 0xE015B: 84,
2919
+ 0xE015C: 84,
2920
+ 0xE015D: 84,
2921
+ 0xE015E: 84,
2922
+ 0xE015F: 84,
2923
+ 0xE0160: 84,
2924
+ 0xE0161: 84,
2925
+ 0xE0162: 84,
2926
+ 0xE0163: 84,
2927
+ 0xE0164: 84,
2928
+ 0xE0165: 84,
2929
+ 0xE0166: 84,
2930
+ 0xE0167: 84,
2931
+ 0xE0168: 84,
2932
+ 0xE0169: 84,
2933
+ 0xE016A: 84,
2934
+ 0xE016B: 84,
2935
+ 0xE016C: 84,
2936
+ 0xE016D: 84,
2937
+ 0xE016E: 84,
2938
+ 0xE016F: 84,
2939
+ 0xE0170: 84,
2940
+ 0xE0171: 84,
2941
+ 0xE0172: 84,
2942
+ 0xE0173: 84,
2943
+ 0xE0174: 84,
2944
+ 0xE0175: 84,
2945
+ 0xE0176: 84,
2946
+ 0xE0177: 84,
2947
+ 0xE0178: 84,
2948
+ 0xE0179: 84,
2949
+ 0xE017A: 84,
2950
+ 0xE017B: 84,
2951
+ 0xE017C: 84,
2952
+ 0xE017D: 84,
2953
+ 0xE017E: 84,
2954
+ 0xE017F: 84,
2955
+ 0xE0180: 84,
2956
+ 0xE0181: 84,
2957
+ 0xE0182: 84,
2958
+ 0xE0183: 84,
2959
+ 0xE0184: 84,
2960
+ 0xE0185: 84,
2961
+ 0xE0186: 84,
2962
+ 0xE0187: 84,
2963
+ 0xE0188: 84,
2964
+ 0xE0189: 84,
2965
+ 0xE018A: 84,
2966
+ 0xE018B: 84,
2967
+ 0xE018C: 84,
2968
+ 0xE018D: 84,
2969
+ 0xE018E: 84,
2970
+ 0xE018F: 84,
2971
+ 0xE0190: 84,
2972
+ 0xE0191: 84,
2973
+ 0xE0192: 84,
2974
+ 0xE0193: 84,
2975
+ 0xE0194: 84,
2976
+ 0xE0195: 84,
2977
+ 0xE0196: 84,
2978
+ 0xE0197: 84,
2979
+ 0xE0198: 84,
2980
+ 0xE0199: 84,
2981
+ 0xE019A: 84,
2982
+ 0xE019B: 84,
2983
+ 0xE019C: 84,
2984
+ 0xE019D: 84,
2985
+ 0xE019E: 84,
2986
+ 0xE019F: 84,
2987
+ 0xE01A0: 84,
2988
+ 0xE01A1: 84,
2989
+ 0xE01A2: 84,
2990
+ 0xE01A3: 84,
2991
+ 0xE01A4: 84,
2992
+ 0xE01A5: 84,
2993
+ 0xE01A6: 84,
2994
+ 0xE01A7: 84,
2995
+ 0xE01A8: 84,
2996
+ 0xE01A9: 84,
2997
+ 0xE01AA: 84,
2998
+ 0xE01AB: 84,
2999
+ 0xE01AC: 84,
3000
+ 0xE01AD: 84,
3001
+ 0xE01AE: 84,
3002
+ 0xE01AF: 84,
3003
+ 0xE01B0: 84,
3004
+ 0xE01B1: 84,
3005
+ 0xE01B2: 84,
3006
+ 0xE01B3: 84,
3007
+ 0xE01B4: 84,
3008
+ 0xE01B5: 84,
3009
+ 0xE01B6: 84,
3010
+ 0xE01B7: 84,
3011
+ 0xE01B8: 84,
3012
+ 0xE01B9: 84,
3013
+ 0xE01BA: 84,
3014
+ 0xE01BB: 84,
3015
+ 0xE01BC: 84,
3016
+ 0xE01BD: 84,
3017
+ 0xE01BE: 84,
3018
+ 0xE01BF: 84,
3019
+ 0xE01C0: 84,
3020
+ 0xE01C1: 84,
3021
+ 0xE01C2: 84,
3022
+ 0xE01C3: 84,
3023
+ 0xE01C4: 84,
3024
+ 0xE01C5: 84,
3025
+ 0xE01C6: 84,
3026
+ 0xE01C7: 84,
3027
+ 0xE01C8: 84,
3028
+ 0xE01C9: 84,
3029
+ 0xE01CA: 84,
3030
+ 0xE01CB: 84,
3031
+ 0xE01CC: 84,
3032
+ 0xE01CD: 84,
3033
+ 0xE01CE: 84,
3034
+ 0xE01CF: 84,
3035
+ 0xE01D0: 84,
3036
+ 0xE01D1: 84,
3037
+ 0xE01D2: 84,
3038
+ 0xE01D3: 84,
3039
+ 0xE01D4: 84,
3040
+ 0xE01D5: 84,
3041
+ 0xE01D6: 84,
3042
+ 0xE01D7: 84,
3043
+ 0xE01D8: 84,
3044
+ 0xE01D9: 84,
3045
+ 0xE01DA: 84,
3046
+ 0xE01DB: 84,
3047
+ 0xE01DC: 84,
3048
+ 0xE01DD: 84,
3049
+ 0xE01DE: 84,
3050
+ 0xE01DF: 84,
3051
+ 0xE01E0: 84,
3052
+ 0xE01E1: 84,
3053
+ 0xE01E2: 84,
3054
+ 0xE01E3: 84,
3055
+ 0xE01E4: 84,
3056
+ 0xE01E5: 84,
3057
+ 0xE01E6: 84,
3058
+ 0xE01E7: 84,
3059
+ 0xE01E8: 84,
3060
+ 0xE01E9: 84,
3061
+ 0xE01EA: 84,
3062
+ 0xE01EB: 84,
3063
+ 0xE01EC: 84,
3064
+ 0xE01ED: 84,
3065
+ 0xE01EE: 84,
3066
+ 0xE01EF: 84,
3067
+ }
3068
+ codepoint_classes = {
3069
+ "PVALID": (
3070
+ 0x2D0000002E,
3071
+ 0x300000003A,
3072
+ 0x610000007B,
3073
+ 0xDF000000F7,
3074
+ 0xF800000100,
3075
+ 0x10100000102,
3076
+ 0x10300000104,
3077
+ 0x10500000106,
3078
+ 0x10700000108,
3079
+ 0x1090000010A,
3080
+ 0x10B0000010C,
3081
+ 0x10D0000010E,
3082
+ 0x10F00000110,
3083
+ 0x11100000112,
3084
+ 0x11300000114,
3085
+ 0x11500000116,
3086
+ 0x11700000118,
3087
+ 0x1190000011A,
3088
+ 0x11B0000011C,
3089
+ 0x11D0000011E,
3090
+ 0x11F00000120,
3091
+ 0x12100000122,
3092
+ 0x12300000124,
3093
+ 0x12500000126,
3094
+ 0x12700000128,
3095
+ 0x1290000012A,
3096
+ 0x12B0000012C,
3097
+ 0x12D0000012E,
3098
+ 0x12F00000130,
3099
+ 0x13100000132,
3100
+ 0x13500000136,
3101
+ 0x13700000139,
3102
+ 0x13A0000013B,
3103
+ 0x13C0000013D,
3104
+ 0x13E0000013F,
3105
+ 0x14200000143,
3106
+ 0x14400000145,
3107
+ 0x14600000147,
3108
+ 0x14800000149,
3109
+ 0x14B0000014C,
3110
+ 0x14D0000014E,
3111
+ 0x14F00000150,
3112
+ 0x15100000152,
3113
+ 0x15300000154,
3114
+ 0x15500000156,
3115
+ 0x15700000158,
3116
+ 0x1590000015A,
3117
+ 0x15B0000015C,
3118
+ 0x15D0000015E,
3119
+ 0x15F00000160,
3120
+ 0x16100000162,
3121
+ 0x16300000164,
3122
+ 0x16500000166,
3123
+ 0x16700000168,
3124
+ 0x1690000016A,
3125
+ 0x16B0000016C,
3126
+ 0x16D0000016E,
3127
+ 0x16F00000170,
3128
+ 0x17100000172,
3129
+ 0x17300000174,
3130
+ 0x17500000176,
3131
+ 0x17700000178,
3132
+ 0x17A0000017B,
3133
+ 0x17C0000017D,
3134
+ 0x17E0000017F,
3135
+ 0x18000000181,
3136
+ 0x18300000184,
3137
+ 0x18500000186,
3138
+ 0x18800000189,
3139
+ 0x18C0000018E,
3140
+ 0x19200000193,
3141
+ 0x19500000196,
3142
+ 0x1990000019C,
3143
+ 0x19E0000019F,
3144
+ 0x1A1000001A2,
3145
+ 0x1A3000001A4,
3146
+ 0x1A5000001A6,
3147
+ 0x1A8000001A9,
3148
+ 0x1AA000001AC,
3149
+ 0x1AD000001AE,
3150
+ 0x1B0000001B1,
3151
+ 0x1B4000001B5,
3152
+ 0x1B6000001B7,
3153
+ 0x1B9000001BC,
3154
+ 0x1BD000001C4,
3155
+ 0x1CE000001CF,
3156
+ 0x1D0000001D1,
3157
+ 0x1D2000001D3,
3158
+ 0x1D4000001D5,
3159
+ 0x1D6000001D7,
3160
+ 0x1D8000001D9,
3161
+ 0x1DA000001DB,
3162
+ 0x1DC000001DE,
3163
+ 0x1DF000001E0,
3164
+ 0x1E1000001E2,
3165
+ 0x1E3000001E4,
3166
+ 0x1E5000001E6,
3167
+ 0x1E7000001E8,
3168
+ 0x1E9000001EA,
3169
+ 0x1EB000001EC,
3170
+ 0x1ED000001EE,
3171
+ 0x1EF000001F1,
3172
+ 0x1F5000001F6,
3173
+ 0x1F9000001FA,
3174
+ 0x1FB000001FC,
3175
+ 0x1FD000001FE,
3176
+ 0x1FF00000200,
3177
+ 0x20100000202,
3178
+ 0x20300000204,
3179
+ 0x20500000206,
3180
+ 0x20700000208,
3181
+ 0x2090000020A,
3182
+ 0x20B0000020C,
3183
+ 0x20D0000020E,
3184
+ 0x20F00000210,
3185
+ 0x21100000212,
3186
+ 0x21300000214,
3187
+ 0x21500000216,
3188
+ 0x21700000218,
3189
+ 0x2190000021A,
3190
+ 0x21B0000021C,
3191
+ 0x21D0000021E,
3192
+ 0x21F00000220,
3193
+ 0x22100000222,
3194
+ 0x22300000224,
3195
+ 0x22500000226,
3196
+ 0x22700000228,
3197
+ 0x2290000022A,
3198
+ 0x22B0000022C,
3199
+ 0x22D0000022E,
3200
+ 0x22F00000230,
3201
+ 0x23100000232,
3202
+ 0x2330000023A,
3203
+ 0x23C0000023D,
3204
+ 0x23F00000241,
3205
+ 0x24200000243,
3206
+ 0x24700000248,
3207
+ 0x2490000024A,
3208
+ 0x24B0000024C,
3209
+ 0x24D0000024E,
3210
+ 0x24F000002B0,
3211
+ 0x2B9000002C2,
3212
+ 0x2C6000002D2,
3213
+ 0x2EC000002ED,
3214
+ 0x2EE000002EF,
3215
+ 0x30000000340,
3216
+ 0x34200000343,
3217
+ 0x3460000034F,
3218
+ 0x35000000370,
3219
+ 0x37100000372,
3220
+ 0x37300000374,
3221
+ 0x37700000378,
3222
+ 0x37B0000037E,
3223
+ 0x39000000391,
3224
+ 0x3AC000003CF,
3225
+ 0x3D7000003D8,
3226
+ 0x3D9000003DA,
3227
+ 0x3DB000003DC,
3228
+ 0x3DD000003DE,
3229
+ 0x3DF000003E0,
3230
+ 0x3E1000003E2,
3231
+ 0x3E3000003E4,
3232
+ 0x3E5000003E6,
3233
+ 0x3E7000003E8,
3234
+ 0x3E9000003EA,
3235
+ 0x3EB000003EC,
3236
+ 0x3ED000003EE,
3237
+ 0x3EF000003F0,
3238
+ 0x3F3000003F4,
3239
+ 0x3F8000003F9,
3240
+ 0x3FB000003FD,
3241
+ 0x43000000460,
3242
+ 0x46100000462,
3243
+ 0x46300000464,
3244
+ 0x46500000466,
3245
+ 0x46700000468,
3246
+ 0x4690000046A,
3247
+ 0x46B0000046C,
3248
+ 0x46D0000046E,
3249
+ 0x46F00000470,
3250
+ 0x47100000472,
3251
+ 0x47300000474,
3252
+ 0x47500000476,
3253
+ 0x47700000478,
3254
+ 0x4790000047A,
3255
+ 0x47B0000047C,
3256
+ 0x47D0000047E,
3257
+ 0x47F00000480,
3258
+ 0x48100000482,
3259
+ 0x48300000488,
3260
+ 0x48B0000048C,
3261
+ 0x48D0000048E,
3262
+ 0x48F00000490,
3263
+ 0x49100000492,
3264
+ 0x49300000494,
3265
+ 0x49500000496,
3266
+ 0x49700000498,
3267
+ 0x4990000049A,
3268
+ 0x49B0000049C,
3269
+ 0x49D0000049E,
3270
+ 0x49F000004A0,
3271
+ 0x4A1000004A2,
3272
+ 0x4A3000004A4,
3273
+ 0x4A5000004A6,
3274
+ 0x4A7000004A8,
3275
+ 0x4A9000004AA,
3276
+ 0x4AB000004AC,
3277
+ 0x4AD000004AE,
3278
+ 0x4AF000004B0,
3279
+ 0x4B1000004B2,
3280
+ 0x4B3000004B4,
3281
+ 0x4B5000004B6,
3282
+ 0x4B7000004B8,
3283
+ 0x4B9000004BA,
3284
+ 0x4BB000004BC,
3285
+ 0x4BD000004BE,
3286
+ 0x4BF000004C0,
3287
+ 0x4C2000004C3,
3288
+ 0x4C4000004C5,
3289
+ 0x4C6000004C7,
3290
+ 0x4C8000004C9,
3291
+ 0x4CA000004CB,
3292
+ 0x4CC000004CD,
3293
+ 0x4CE000004D0,
3294
+ 0x4D1000004D2,
3295
+ 0x4D3000004D4,
3296
+ 0x4D5000004D6,
3297
+ 0x4D7000004D8,
3298
+ 0x4D9000004DA,
3299
+ 0x4DB000004DC,
3300
+ 0x4DD000004DE,
3301
+ 0x4DF000004E0,
3302
+ 0x4E1000004E2,
3303
+ 0x4E3000004E4,
3304
+ 0x4E5000004E6,
3305
+ 0x4E7000004E8,
3306
+ 0x4E9000004EA,
3307
+ 0x4EB000004EC,
3308
+ 0x4ED000004EE,
3309
+ 0x4EF000004F0,
3310
+ 0x4F1000004F2,
3311
+ 0x4F3000004F4,
3312
+ 0x4F5000004F6,
3313
+ 0x4F7000004F8,
3314
+ 0x4F9000004FA,
3315
+ 0x4FB000004FC,
3316
+ 0x4FD000004FE,
3317
+ 0x4FF00000500,
3318
+ 0x50100000502,
3319
+ 0x50300000504,
3320
+ 0x50500000506,
3321
+ 0x50700000508,
3322
+ 0x5090000050A,
3323
+ 0x50B0000050C,
3324
+ 0x50D0000050E,
3325
+ 0x50F00000510,
3326
+ 0x51100000512,
3327
+ 0x51300000514,
3328
+ 0x51500000516,
3329
+ 0x51700000518,
3330
+ 0x5190000051A,
3331
+ 0x51B0000051C,
3332
+ 0x51D0000051E,
3333
+ 0x51F00000520,
3334
+ 0x52100000522,
3335
+ 0x52300000524,
3336
+ 0x52500000526,
3337
+ 0x52700000528,
3338
+ 0x5290000052A,
3339
+ 0x52B0000052C,
3340
+ 0x52D0000052E,
3341
+ 0x52F00000530,
3342
+ 0x5590000055A,
3343
+ 0x56000000587,
3344
+ 0x58800000589,
3345
+ 0x591000005BE,
3346
+ 0x5BF000005C0,
3347
+ 0x5C1000005C3,
3348
+ 0x5C4000005C6,
3349
+ 0x5C7000005C8,
3350
+ 0x5D0000005EB,
3351
+ 0x5EF000005F3,
3352
+ 0x6100000061B,
3353
+ 0x62000000640,
3354
+ 0x64100000660,
3355
+ 0x66E00000675,
3356
+ 0x679000006D4,
3357
+ 0x6D5000006DD,
3358
+ 0x6DF000006E9,
3359
+ 0x6EA000006F0,
3360
+ 0x6FA00000700,
3361
+ 0x7100000074B,
3362
+ 0x74D000007B2,
3363
+ 0x7C0000007F6,
3364
+ 0x7FD000007FE,
3365
+ 0x8000000082E,
3366
+ 0x8400000085C,
3367
+ 0x8600000086B,
3368
+ 0x87000000888,
3369
+ 0x8890000088F,
3370
+ 0x897000008E2,
3371
+ 0x8E300000958,
3372
+ 0x96000000964,
3373
+ 0x96600000970,
3374
+ 0x97100000984,
3375
+ 0x9850000098D,
3376
+ 0x98F00000991,
3377
+ 0x993000009A9,
3378
+ 0x9AA000009B1,
3379
+ 0x9B2000009B3,
3380
+ 0x9B6000009BA,
3381
+ 0x9BC000009C5,
3382
+ 0x9C7000009C9,
3383
+ 0x9CB000009CF,
3384
+ 0x9D7000009D8,
3385
+ 0x9E0000009E4,
3386
+ 0x9E6000009F2,
3387
+ 0x9FC000009FD,
3388
+ 0x9FE000009FF,
3389
+ 0xA0100000A04,
3390
+ 0xA0500000A0B,
3391
+ 0xA0F00000A11,
3392
+ 0xA1300000A29,
3393
+ 0xA2A00000A31,
3394
+ 0xA3200000A33,
3395
+ 0xA3500000A36,
3396
+ 0xA3800000A3A,
3397
+ 0xA3C00000A3D,
3398
+ 0xA3E00000A43,
3399
+ 0xA4700000A49,
3400
+ 0xA4B00000A4E,
3401
+ 0xA5100000A52,
3402
+ 0xA5C00000A5D,
3403
+ 0xA6600000A76,
3404
+ 0xA8100000A84,
3405
+ 0xA8500000A8E,
3406
+ 0xA8F00000A92,
3407
+ 0xA9300000AA9,
3408
+ 0xAAA00000AB1,
3409
+ 0xAB200000AB4,
3410
+ 0xAB500000ABA,
3411
+ 0xABC00000AC6,
3412
+ 0xAC700000ACA,
3413
+ 0xACB00000ACE,
3414
+ 0xAD000000AD1,
3415
+ 0xAE000000AE4,
3416
+ 0xAE600000AF0,
3417
+ 0xAF900000B00,
3418
+ 0xB0100000B04,
3419
+ 0xB0500000B0D,
3420
+ 0xB0F00000B11,
3421
+ 0xB1300000B29,
3422
+ 0xB2A00000B31,
3423
+ 0xB3200000B34,
3424
+ 0xB3500000B3A,
3425
+ 0xB3C00000B45,
3426
+ 0xB4700000B49,
3427
+ 0xB4B00000B4E,
3428
+ 0xB5500000B58,
3429
+ 0xB5F00000B64,
3430
+ 0xB6600000B70,
3431
+ 0xB7100000B72,
3432
+ 0xB8200000B84,
3433
+ 0xB8500000B8B,
3434
+ 0xB8E00000B91,
3435
+ 0xB9200000B96,
3436
+ 0xB9900000B9B,
3437
+ 0xB9C00000B9D,
3438
+ 0xB9E00000BA0,
3439
+ 0xBA300000BA5,
3440
+ 0xBA800000BAB,
3441
+ 0xBAE00000BBA,
3442
+ 0xBBE00000BC3,
3443
+ 0xBC600000BC9,
3444
+ 0xBCA00000BCE,
3445
+ 0xBD000000BD1,
3446
+ 0xBD700000BD8,
3447
+ 0xBE600000BF0,
3448
+ 0xC0000000C0D,
3449
+ 0xC0E00000C11,
3450
+ 0xC1200000C29,
3451
+ 0xC2A00000C3A,
3452
+ 0xC3C00000C45,
3453
+ 0xC4600000C49,
3454
+ 0xC4A00000C4E,
3455
+ 0xC5500000C57,
3456
+ 0xC5800000C5B,
3457
+ 0xC5D00000C5E,
3458
+ 0xC6000000C64,
3459
+ 0xC6600000C70,
3460
+ 0xC8000000C84,
3461
+ 0xC8500000C8D,
3462
+ 0xC8E00000C91,
3463
+ 0xC9200000CA9,
3464
+ 0xCAA00000CB4,
3465
+ 0xCB500000CBA,
3466
+ 0xCBC00000CC5,
3467
+ 0xCC600000CC9,
3468
+ 0xCCA00000CCE,
3469
+ 0xCD500000CD7,
3470
+ 0xCDD00000CDF,
3471
+ 0xCE000000CE4,
3472
+ 0xCE600000CF0,
3473
+ 0xCF100000CF4,
3474
+ 0xD0000000D0D,
3475
+ 0xD0E00000D11,
3476
+ 0xD1200000D45,
3477
+ 0xD4600000D49,
3478
+ 0xD4A00000D4F,
3479
+ 0xD5400000D58,
3480
+ 0xD5F00000D64,
3481
+ 0xD6600000D70,
3482
+ 0xD7A00000D80,
3483
+ 0xD8100000D84,
3484
+ 0xD8500000D97,
3485
+ 0xD9A00000DB2,
3486
+ 0xDB300000DBC,
3487
+ 0xDBD00000DBE,
3488
+ 0xDC000000DC7,
3489
+ 0xDCA00000DCB,
3490
+ 0xDCF00000DD5,
3491
+ 0xDD600000DD7,
3492
+ 0xDD800000DE0,
3493
+ 0xDE600000DF0,
3494
+ 0xDF200000DF4,
3495
+ 0xE0100000E33,
3496
+ 0xE3400000E3B,
3497
+ 0xE4000000E4F,
3498
+ 0xE5000000E5A,
3499
+ 0xE8100000E83,
3500
+ 0xE8400000E85,
3501
+ 0xE8600000E8B,
3502
+ 0xE8C00000EA4,
3503
+ 0xEA500000EA6,
3504
+ 0xEA700000EB3,
3505
+ 0xEB400000EBE,
3506
+ 0xEC000000EC5,
3507
+ 0xEC600000EC7,
3508
+ 0xEC800000ECF,
3509
+ 0xED000000EDA,
3510
+ 0xEDE00000EE0,
3511
+ 0xF0000000F01,
3512
+ 0xF0B00000F0C,
3513
+ 0xF1800000F1A,
3514
+ 0xF2000000F2A,
3515
+ 0xF3500000F36,
3516
+ 0xF3700000F38,
3517
+ 0xF3900000F3A,
3518
+ 0xF3E00000F43,
3519
+ 0xF4400000F48,
3520
+ 0xF4900000F4D,
3521
+ 0xF4E00000F52,
3522
+ 0xF5300000F57,
3523
+ 0xF5800000F5C,
3524
+ 0xF5D00000F69,
3525
+ 0xF6A00000F6D,
3526
+ 0xF7100000F73,
3527
+ 0xF7400000F75,
3528
+ 0xF7A00000F81,
3529
+ 0xF8200000F85,
3530
+ 0xF8600000F93,
3531
+ 0xF9400000F98,
3532
+ 0xF9900000F9D,
3533
+ 0xF9E00000FA2,
3534
+ 0xFA300000FA7,
3535
+ 0xFA800000FAC,
3536
+ 0xFAD00000FB9,
3537
+ 0xFBA00000FBD,
3538
+ 0xFC600000FC7,
3539
+ 0x10000000104A,
3540
+ 0x10500000109E,
3541
+ 0x10D0000010FB,
3542
+ 0x10FD00001100,
3543
+ 0x120000001249,
3544
+ 0x124A0000124E,
3545
+ 0x125000001257,
3546
+ 0x125800001259,
3547
+ 0x125A0000125E,
3548
+ 0x126000001289,
3549
+ 0x128A0000128E,
3550
+ 0x1290000012B1,
3551
+ 0x12B2000012B6,
3552
+ 0x12B8000012BF,
3553
+ 0x12C0000012C1,
3554
+ 0x12C2000012C6,
3555
+ 0x12C8000012D7,
3556
+ 0x12D800001311,
3557
+ 0x131200001316,
3558
+ 0x13180000135B,
3559
+ 0x135D00001360,
3560
+ 0x138000001390,
3561
+ 0x13A0000013F6,
3562
+ 0x14010000166D,
3563
+ 0x166F00001680,
3564
+ 0x16810000169B,
3565
+ 0x16A0000016EB,
3566
+ 0x16F1000016F9,
3567
+ 0x170000001716,
3568
+ 0x171F00001735,
3569
+ 0x174000001754,
3570
+ 0x17600000176D,
3571
+ 0x176E00001771,
3572
+ 0x177200001774,
3573
+ 0x1780000017B4,
3574
+ 0x17B6000017D4,
3575
+ 0x17D7000017D8,
3576
+ 0x17DC000017DE,
3577
+ 0x17E0000017EA,
3578
+ 0x18100000181A,
3579
+ 0x182000001879,
3580
+ 0x1880000018AB,
3581
+ 0x18B0000018F6,
3582
+ 0x19000000191F,
3583
+ 0x19200000192C,
3584
+ 0x19300000193C,
3585
+ 0x19460000196E,
3586
+ 0x197000001975,
3587
+ 0x1980000019AC,
3588
+ 0x19B0000019CA,
3589
+ 0x19D0000019DA,
3590
+ 0x1A0000001A1C,
3591
+ 0x1A2000001A5F,
3592
+ 0x1A6000001A7D,
3593
+ 0x1A7F00001A8A,
3594
+ 0x1A9000001A9A,
3595
+ 0x1AA700001AA8,
3596
+ 0x1AB000001ABE,
3597
+ 0x1ABF00001ACF,
3598
+ 0x1B0000001B4D,
3599
+ 0x1B5000001B5A,
3600
+ 0x1B6B00001B74,
3601
+ 0x1B8000001BF4,
3602
+ 0x1C0000001C38,
3603
+ 0x1C4000001C4A,
3604
+ 0x1C4D00001C7E,
3605
+ 0x1C8A00001C8B,
3606
+ 0x1CD000001CD3,
3607
+ 0x1CD400001CFB,
3608
+ 0x1D0000001D2C,
3609
+ 0x1D2F00001D30,
3610
+ 0x1D3B00001D3C,
3611
+ 0x1D4E00001D4F,
3612
+ 0x1D6B00001D78,
3613
+ 0x1D7900001D9B,
3614
+ 0x1DC000001E00,
3615
+ 0x1E0100001E02,
3616
+ 0x1E0300001E04,
3617
+ 0x1E0500001E06,
3618
+ 0x1E0700001E08,
3619
+ 0x1E0900001E0A,
3620
+ 0x1E0B00001E0C,
3621
+ 0x1E0D00001E0E,
3622
+ 0x1E0F00001E10,
3623
+ 0x1E1100001E12,
3624
+ 0x1E1300001E14,
3625
+ 0x1E1500001E16,
3626
+ 0x1E1700001E18,
3627
+ 0x1E1900001E1A,
3628
+ 0x1E1B00001E1C,
3629
+ 0x1E1D00001E1E,
3630
+ 0x1E1F00001E20,
3631
+ 0x1E2100001E22,
3632
+ 0x1E2300001E24,
3633
+ 0x1E2500001E26,
3634
+ 0x1E2700001E28,
3635
+ 0x1E2900001E2A,
3636
+ 0x1E2B00001E2C,
3637
+ 0x1E2D00001E2E,
3638
+ 0x1E2F00001E30,
3639
+ 0x1E3100001E32,
3640
+ 0x1E3300001E34,
3641
+ 0x1E3500001E36,
3642
+ 0x1E3700001E38,
3643
+ 0x1E3900001E3A,
3644
+ 0x1E3B00001E3C,
3645
+ 0x1E3D00001E3E,
3646
+ 0x1E3F00001E40,
3647
+ 0x1E4100001E42,
3648
+ 0x1E4300001E44,
3649
+ 0x1E4500001E46,
3650
+ 0x1E4700001E48,
3651
+ 0x1E4900001E4A,
3652
+ 0x1E4B00001E4C,
3653
+ 0x1E4D00001E4E,
3654
+ 0x1E4F00001E50,
3655
+ 0x1E5100001E52,
3656
+ 0x1E5300001E54,
3657
+ 0x1E5500001E56,
3658
+ 0x1E5700001E58,
3659
+ 0x1E5900001E5A,
3660
+ 0x1E5B00001E5C,
3661
+ 0x1E5D00001E5E,
3662
+ 0x1E5F00001E60,
3663
+ 0x1E6100001E62,
3664
+ 0x1E6300001E64,
3665
+ 0x1E6500001E66,
3666
+ 0x1E6700001E68,
3667
+ 0x1E6900001E6A,
3668
+ 0x1E6B00001E6C,
3669
+ 0x1E6D00001E6E,
3670
+ 0x1E6F00001E70,
3671
+ 0x1E7100001E72,
3672
+ 0x1E7300001E74,
3673
+ 0x1E7500001E76,
3674
+ 0x1E7700001E78,
3675
+ 0x1E7900001E7A,
3676
+ 0x1E7B00001E7C,
3677
+ 0x1E7D00001E7E,
3678
+ 0x1E7F00001E80,
3679
+ 0x1E8100001E82,
3680
+ 0x1E8300001E84,
3681
+ 0x1E8500001E86,
3682
+ 0x1E8700001E88,
3683
+ 0x1E8900001E8A,
3684
+ 0x1E8B00001E8C,
3685
+ 0x1E8D00001E8E,
3686
+ 0x1E8F00001E90,
3687
+ 0x1E9100001E92,
3688
+ 0x1E9300001E94,
3689
+ 0x1E9500001E9A,
3690
+ 0x1E9C00001E9E,
3691
+ 0x1E9F00001EA0,
3692
+ 0x1EA100001EA2,
3693
+ 0x1EA300001EA4,
3694
+ 0x1EA500001EA6,
3695
+ 0x1EA700001EA8,
3696
+ 0x1EA900001EAA,
3697
+ 0x1EAB00001EAC,
3698
+ 0x1EAD00001EAE,
3699
+ 0x1EAF00001EB0,
3700
+ 0x1EB100001EB2,
3701
+ 0x1EB300001EB4,
3702
+ 0x1EB500001EB6,
3703
+ 0x1EB700001EB8,
3704
+ 0x1EB900001EBA,
3705
+ 0x1EBB00001EBC,
3706
+ 0x1EBD00001EBE,
3707
+ 0x1EBF00001EC0,
3708
+ 0x1EC100001EC2,
3709
+ 0x1EC300001EC4,
3710
+ 0x1EC500001EC6,
3711
+ 0x1EC700001EC8,
3712
+ 0x1EC900001ECA,
3713
+ 0x1ECB00001ECC,
3714
+ 0x1ECD00001ECE,
3715
+ 0x1ECF00001ED0,
3716
+ 0x1ED100001ED2,
3717
+ 0x1ED300001ED4,
3718
+ 0x1ED500001ED6,
3719
+ 0x1ED700001ED8,
3720
+ 0x1ED900001EDA,
3721
+ 0x1EDB00001EDC,
3722
+ 0x1EDD00001EDE,
3723
+ 0x1EDF00001EE0,
3724
+ 0x1EE100001EE2,
3725
+ 0x1EE300001EE4,
3726
+ 0x1EE500001EE6,
3727
+ 0x1EE700001EE8,
3728
+ 0x1EE900001EEA,
3729
+ 0x1EEB00001EEC,
3730
+ 0x1EED00001EEE,
3731
+ 0x1EEF00001EF0,
3732
+ 0x1EF100001EF2,
3733
+ 0x1EF300001EF4,
3734
+ 0x1EF500001EF6,
3735
+ 0x1EF700001EF8,
3736
+ 0x1EF900001EFA,
3737
+ 0x1EFB00001EFC,
3738
+ 0x1EFD00001EFE,
3739
+ 0x1EFF00001F08,
3740
+ 0x1F1000001F16,
3741
+ 0x1F2000001F28,
3742
+ 0x1F3000001F38,
3743
+ 0x1F4000001F46,
3744
+ 0x1F5000001F58,
3745
+ 0x1F6000001F68,
3746
+ 0x1F7000001F71,
3747
+ 0x1F7200001F73,
3748
+ 0x1F7400001F75,
3749
+ 0x1F7600001F77,
3750
+ 0x1F7800001F79,
3751
+ 0x1F7A00001F7B,
3752
+ 0x1F7C00001F7D,
3753
+ 0x1FB000001FB2,
3754
+ 0x1FB600001FB7,
3755
+ 0x1FC600001FC7,
3756
+ 0x1FD000001FD3,
3757
+ 0x1FD600001FD8,
3758
+ 0x1FE000001FE3,
3759
+ 0x1FE400001FE8,
3760
+ 0x1FF600001FF7,
3761
+ 0x214E0000214F,
3762
+ 0x218400002185,
3763
+ 0x2C3000002C60,
3764
+ 0x2C6100002C62,
3765
+ 0x2C6500002C67,
3766
+ 0x2C6800002C69,
3767
+ 0x2C6A00002C6B,
3768
+ 0x2C6C00002C6D,
3769
+ 0x2C7100002C72,
3770
+ 0x2C7300002C75,
3771
+ 0x2C7600002C7C,
3772
+ 0x2C8100002C82,
3773
+ 0x2C8300002C84,
3774
+ 0x2C8500002C86,
3775
+ 0x2C8700002C88,
3776
+ 0x2C8900002C8A,
3777
+ 0x2C8B00002C8C,
3778
+ 0x2C8D00002C8E,
3779
+ 0x2C8F00002C90,
3780
+ 0x2C9100002C92,
3781
+ 0x2C9300002C94,
3782
+ 0x2C9500002C96,
3783
+ 0x2C9700002C98,
3784
+ 0x2C9900002C9A,
3785
+ 0x2C9B00002C9C,
3786
+ 0x2C9D00002C9E,
3787
+ 0x2C9F00002CA0,
3788
+ 0x2CA100002CA2,
3789
+ 0x2CA300002CA4,
3790
+ 0x2CA500002CA6,
3791
+ 0x2CA700002CA8,
3792
+ 0x2CA900002CAA,
3793
+ 0x2CAB00002CAC,
3794
+ 0x2CAD00002CAE,
3795
+ 0x2CAF00002CB0,
3796
+ 0x2CB100002CB2,
3797
+ 0x2CB300002CB4,
3798
+ 0x2CB500002CB6,
3799
+ 0x2CB700002CB8,
3800
+ 0x2CB900002CBA,
3801
+ 0x2CBB00002CBC,
3802
+ 0x2CBD00002CBE,
3803
+ 0x2CBF00002CC0,
3804
+ 0x2CC100002CC2,
3805
+ 0x2CC300002CC4,
3806
+ 0x2CC500002CC6,
3807
+ 0x2CC700002CC8,
3808
+ 0x2CC900002CCA,
3809
+ 0x2CCB00002CCC,
3810
+ 0x2CCD00002CCE,
3811
+ 0x2CCF00002CD0,
3812
+ 0x2CD100002CD2,
3813
+ 0x2CD300002CD4,
3814
+ 0x2CD500002CD6,
3815
+ 0x2CD700002CD8,
3816
+ 0x2CD900002CDA,
3817
+ 0x2CDB00002CDC,
3818
+ 0x2CDD00002CDE,
3819
+ 0x2CDF00002CE0,
3820
+ 0x2CE100002CE2,
3821
+ 0x2CE300002CE5,
3822
+ 0x2CEC00002CED,
3823
+ 0x2CEE00002CF2,
3824
+ 0x2CF300002CF4,
3825
+ 0x2D0000002D26,
3826
+ 0x2D2700002D28,
3827
+ 0x2D2D00002D2E,
3828
+ 0x2D3000002D68,
3829
+ 0x2D7F00002D97,
3830
+ 0x2DA000002DA7,
3831
+ 0x2DA800002DAF,
3832
+ 0x2DB000002DB7,
3833
+ 0x2DB800002DBF,
3834
+ 0x2DC000002DC7,
3835
+ 0x2DC800002DCF,
3836
+ 0x2DD000002DD7,
3837
+ 0x2DD800002DDF,
3838
+ 0x2DE000002E00,
3839
+ 0x2E2F00002E30,
3840
+ 0x300500003008,
3841
+ 0x302A0000302E,
3842
+ 0x303C0000303D,
3843
+ 0x304100003097,
3844
+ 0x30990000309B,
3845
+ 0x309D0000309F,
3846
+ 0x30A1000030FB,
3847
+ 0x30FC000030FF,
3848
+ 0x310500003130,
3849
+ 0x31A0000031C0,
3850
+ 0x31F000003200,
3851
+ 0x340000004DC0,
3852
+ 0x4E000000A48D,
3853
+ 0xA4D00000A4FE,
3854
+ 0xA5000000A60D,
3855
+ 0xA6100000A62C,
3856
+ 0xA6410000A642,
3857
+ 0xA6430000A644,
3858
+ 0xA6450000A646,
3859
+ 0xA6470000A648,
3860
+ 0xA6490000A64A,
3861
+ 0xA64B0000A64C,
3862
+ 0xA64D0000A64E,
3863
+ 0xA64F0000A650,
3864
+ 0xA6510000A652,
3865
+ 0xA6530000A654,
3866
+ 0xA6550000A656,
3867
+ 0xA6570000A658,
3868
+ 0xA6590000A65A,
3869
+ 0xA65B0000A65C,
3870
+ 0xA65D0000A65E,
3871
+ 0xA65F0000A660,
3872
+ 0xA6610000A662,
3873
+ 0xA6630000A664,
3874
+ 0xA6650000A666,
3875
+ 0xA6670000A668,
3876
+ 0xA6690000A66A,
3877
+ 0xA66B0000A66C,
3878
+ 0xA66D0000A670,
3879
+ 0xA6740000A67E,
3880
+ 0xA67F0000A680,
3881
+ 0xA6810000A682,
3882
+ 0xA6830000A684,
3883
+ 0xA6850000A686,
3884
+ 0xA6870000A688,
3885
+ 0xA6890000A68A,
3886
+ 0xA68B0000A68C,
3887
+ 0xA68D0000A68E,
3888
+ 0xA68F0000A690,
3889
+ 0xA6910000A692,
3890
+ 0xA6930000A694,
3891
+ 0xA6950000A696,
3892
+ 0xA6970000A698,
3893
+ 0xA6990000A69A,
3894
+ 0xA69B0000A69C,
3895
+ 0xA69E0000A6E6,
3896
+ 0xA6F00000A6F2,
3897
+ 0xA7170000A720,
3898
+ 0xA7230000A724,
3899
+ 0xA7250000A726,
3900
+ 0xA7270000A728,
3901
+ 0xA7290000A72A,
3902
+ 0xA72B0000A72C,
3903
+ 0xA72D0000A72E,
3904
+ 0xA72F0000A732,
3905
+ 0xA7330000A734,
3906
+ 0xA7350000A736,
3907
+ 0xA7370000A738,
3908
+ 0xA7390000A73A,
3909
+ 0xA73B0000A73C,
3910
+ 0xA73D0000A73E,
3911
+ 0xA73F0000A740,
3912
+ 0xA7410000A742,
3913
+ 0xA7430000A744,
3914
+ 0xA7450000A746,
3915
+ 0xA7470000A748,
3916
+ 0xA7490000A74A,
3917
+ 0xA74B0000A74C,
3918
+ 0xA74D0000A74E,
3919
+ 0xA74F0000A750,
3920
+ 0xA7510000A752,
3921
+ 0xA7530000A754,
3922
+ 0xA7550000A756,
3923
+ 0xA7570000A758,
3924
+ 0xA7590000A75A,
3925
+ 0xA75B0000A75C,
3926
+ 0xA75D0000A75E,
3927
+ 0xA75F0000A760,
3928
+ 0xA7610000A762,
3929
+ 0xA7630000A764,
3930
+ 0xA7650000A766,
3931
+ 0xA7670000A768,
3932
+ 0xA7690000A76A,
3933
+ 0xA76B0000A76C,
3934
+ 0xA76D0000A76E,
3935
+ 0xA76F0000A770,
3936
+ 0xA7710000A779,
3937
+ 0xA77A0000A77B,
3938
+ 0xA77C0000A77D,
3939
+ 0xA77F0000A780,
3940
+ 0xA7810000A782,
3941
+ 0xA7830000A784,
3942
+ 0xA7850000A786,
3943
+ 0xA7870000A789,
3944
+ 0xA78C0000A78D,
3945
+ 0xA78E0000A790,
3946
+ 0xA7910000A792,
3947
+ 0xA7930000A796,
3948
+ 0xA7970000A798,
3949
+ 0xA7990000A79A,
3950
+ 0xA79B0000A79C,
3951
+ 0xA79D0000A79E,
3952
+ 0xA79F0000A7A0,
3953
+ 0xA7A10000A7A2,
3954
+ 0xA7A30000A7A4,
3955
+ 0xA7A50000A7A6,
3956
+ 0xA7A70000A7A8,
3957
+ 0xA7A90000A7AA,
3958
+ 0xA7AF0000A7B0,
3959
+ 0xA7B50000A7B6,
3960
+ 0xA7B70000A7B8,
3961
+ 0xA7B90000A7BA,
3962
+ 0xA7BB0000A7BC,
3963
+ 0xA7BD0000A7BE,
3964
+ 0xA7BF0000A7C0,
3965
+ 0xA7C10000A7C2,
3966
+ 0xA7C30000A7C4,
3967
+ 0xA7C80000A7C9,
3968
+ 0xA7CA0000A7CB,
3969
+ 0xA7CD0000A7CE,
3970
+ 0xA7D10000A7D2,
3971
+ 0xA7D30000A7D4,
3972
+ 0xA7D50000A7D6,
3973
+ 0xA7D70000A7D8,
3974
+ 0xA7D90000A7DA,
3975
+ 0xA7DB0000A7DC,
3976
+ 0xA7F60000A7F8,
3977
+ 0xA7FA0000A828,
3978
+ 0xA82C0000A82D,
3979
+ 0xA8400000A874,
3980
+ 0xA8800000A8C6,
3981
+ 0xA8D00000A8DA,
3982
+ 0xA8E00000A8F8,
3983
+ 0xA8FB0000A8FC,
3984
+ 0xA8FD0000A92E,
3985
+ 0xA9300000A954,
3986
+ 0xA9800000A9C1,
3987
+ 0xA9CF0000A9DA,
3988
+ 0xA9E00000A9FF,
3989
+ 0xAA000000AA37,
3990
+ 0xAA400000AA4E,
3991
+ 0xAA500000AA5A,
3992
+ 0xAA600000AA77,
3993
+ 0xAA7A0000AAC3,
3994
+ 0xAADB0000AADE,
3995
+ 0xAAE00000AAF0,
3996
+ 0xAAF20000AAF7,
3997
+ 0xAB010000AB07,
3998
+ 0xAB090000AB0F,
3999
+ 0xAB110000AB17,
4000
+ 0xAB200000AB27,
4001
+ 0xAB280000AB2F,
4002
+ 0xAB300000AB5B,
4003
+ 0xAB600000AB69,
4004
+ 0xABC00000ABEB,
4005
+ 0xABEC0000ABEE,
4006
+ 0xABF00000ABFA,
4007
+ 0xAC000000D7A4,
4008
+ 0xFA0E0000FA10,
4009
+ 0xFA110000FA12,
4010
+ 0xFA130000FA15,
4011
+ 0xFA1F0000FA20,
4012
+ 0xFA210000FA22,
4013
+ 0xFA230000FA25,
4014
+ 0xFA270000FA2A,
4015
+ 0xFB1E0000FB1F,
4016
+ 0xFE200000FE30,
4017
+ 0xFE730000FE74,
4018
+ 0x100000001000C,
4019
+ 0x1000D00010027,
4020
+ 0x100280001003B,
4021
+ 0x1003C0001003E,
4022
+ 0x1003F0001004E,
4023
+ 0x100500001005E,
4024
+ 0x10080000100FB,
4025
+ 0x101FD000101FE,
4026
+ 0x102800001029D,
4027
+ 0x102A0000102D1,
4028
+ 0x102E0000102E1,
4029
+ 0x1030000010320,
4030
+ 0x1032D00010341,
4031
+ 0x103420001034A,
4032
+ 0x103500001037B,
4033
+ 0x103800001039E,
4034
+ 0x103A0000103C4,
4035
+ 0x103C8000103D0,
4036
+ 0x104280001049E,
4037
+ 0x104A0000104AA,
4038
+ 0x104D8000104FC,
4039
+ 0x1050000010528,
4040
+ 0x1053000010564,
4041
+ 0x10597000105A2,
4042
+ 0x105A3000105B2,
4043
+ 0x105B3000105BA,
4044
+ 0x105BB000105BD,
4045
+ 0x105C0000105F4,
4046
+ 0x1060000010737,
4047
+ 0x1074000010756,
4048
+ 0x1076000010768,
4049
+ 0x1078000010781,
4050
+ 0x1080000010806,
4051
+ 0x1080800010809,
4052
+ 0x1080A00010836,
4053
+ 0x1083700010839,
4054
+ 0x1083C0001083D,
4055
+ 0x1083F00010856,
4056
+ 0x1086000010877,
4057
+ 0x108800001089F,
4058
+ 0x108E0000108F3,
4059
+ 0x108F4000108F6,
4060
+ 0x1090000010916,
4061
+ 0x109200001093A,
4062
+ 0x10980000109B8,
4063
+ 0x109BE000109C0,
4064
+ 0x10A0000010A04,
4065
+ 0x10A0500010A07,
4066
+ 0x10A0C00010A14,
4067
+ 0x10A1500010A18,
4068
+ 0x10A1900010A36,
4069
+ 0x10A3800010A3B,
4070
+ 0x10A3F00010A40,
4071
+ 0x10A6000010A7D,
4072
+ 0x10A8000010A9D,
4073
+ 0x10AC000010AC8,
4074
+ 0x10AC900010AE7,
4075
+ 0x10B0000010B36,
4076
+ 0x10B4000010B56,
4077
+ 0x10B6000010B73,
4078
+ 0x10B8000010B92,
4079
+ 0x10C0000010C49,
4080
+ 0x10CC000010CF3,
4081
+ 0x10D0000010D28,
4082
+ 0x10D3000010D3A,
4083
+ 0x10D4000010D50,
4084
+ 0x10D6900010D6E,
4085
+ 0x10D6F00010D86,
4086
+ 0x10E8000010EAA,
4087
+ 0x10EAB00010EAD,
4088
+ 0x10EB000010EB2,
4089
+ 0x10EC200010EC5,
4090
+ 0x10EFC00010F1D,
4091
+ 0x10F2700010F28,
4092
+ 0x10F3000010F51,
4093
+ 0x10F7000010F86,
4094
+ 0x10FB000010FC5,
4095
+ 0x10FE000010FF7,
4096
+ 0x1100000011047,
4097
+ 0x1106600011076,
4098
+ 0x1107F000110BB,
4099
+ 0x110C2000110C3,
4100
+ 0x110D0000110E9,
4101
+ 0x110F0000110FA,
4102
+ 0x1110000011135,
4103
+ 0x1113600011140,
4104
+ 0x1114400011148,
4105
+ 0x1115000011174,
4106
+ 0x1117600011177,
4107
+ 0x11180000111C5,
4108
+ 0x111C9000111CD,
4109
+ 0x111CE000111DB,
4110
+ 0x111DC000111DD,
4111
+ 0x1120000011212,
4112
+ 0x1121300011238,
4113
+ 0x1123E00011242,
4114
+ 0x1128000011287,
4115
+ 0x1128800011289,
4116
+ 0x1128A0001128E,
4117
+ 0x1128F0001129E,
4118
+ 0x1129F000112A9,
4119
+ 0x112B0000112EB,
4120
+ 0x112F0000112FA,
4121
+ 0x1130000011304,
4122
+ 0x113050001130D,
4123
+ 0x1130F00011311,
4124
+ 0x1131300011329,
4125
+ 0x1132A00011331,
4126
+ 0x1133200011334,
4127
+ 0x113350001133A,
4128
+ 0x1133B00011345,
4129
+ 0x1134700011349,
4130
+ 0x1134B0001134E,
4131
+ 0x1135000011351,
4132
+ 0x1135700011358,
4133
+ 0x1135D00011364,
4134
+ 0x113660001136D,
4135
+ 0x1137000011375,
4136
+ 0x113800001138A,
4137
+ 0x1138B0001138C,
4138
+ 0x1138E0001138F,
4139
+ 0x11390000113B6,
4140
+ 0x113B7000113C1,
4141
+ 0x113C2000113C3,
4142
+ 0x113C5000113C6,
4143
+ 0x113C7000113CB,
4144
+ 0x113CC000113D4,
4145
+ 0x113E1000113E3,
4146
+ 0x114000001144B,
4147
+ 0x114500001145A,
4148
+ 0x1145E00011462,
4149
+ 0x11480000114C6,
4150
+ 0x114C7000114C8,
4151
+ 0x114D0000114DA,
4152
+ 0x11580000115B6,
4153
+ 0x115B8000115C1,
4154
+ 0x115D8000115DE,
4155
+ 0x1160000011641,
4156
+ 0x1164400011645,
4157
+ 0x116500001165A,
4158
+ 0x11680000116B9,
4159
+ 0x116C0000116CA,
4160
+ 0x116D0000116E4,
4161
+ 0x117000001171B,
4162
+ 0x1171D0001172C,
4163
+ 0x117300001173A,
4164
+ 0x1174000011747,
4165
+ 0x118000001183B,
4166
+ 0x118C0000118EA,
4167
+ 0x118FF00011907,
4168
+ 0x119090001190A,
4169
+ 0x1190C00011914,
4170
+ 0x1191500011917,
4171
+ 0x1191800011936,
4172
+ 0x1193700011939,
4173
+ 0x1193B00011944,
4174
+ 0x119500001195A,
4175
+ 0x119A0000119A8,
4176
+ 0x119AA000119D8,
4177
+ 0x119DA000119E2,
4178
+ 0x119E3000119E5,
4179
+ 0x11A0000011A3F,
4180
+ 0x11A4700011A48,
4181
+ 0x11A5000011A9A,
4182
+ 0x11A9D00011A9E,
4183
+ 0x11AB000011AF9,
4184
+ 0x11BC000011BE1,
4185
+ 0x11BF000011BFA,
4186
+ 0x11C0000011C09,
4187
+ 0x11C0A00011C37,
4188
+ 0x11C3800011C41,
4189
+ 0x11C5000011C5A,
4190
+ 0x11C7200011C90,
4191
+ 0x11C9200011CA8,
4192
+ 0x11CA900011CB7,
4193
+ 0x11D0000011D07,
4194
+ 0x11D0800011D0A,
4195
+ 0x11D0B00011D37,
4196
+ 0x11D3A00011D3B,
4197
+ 0x11D3C00011D3E,
4198
+ 0x11D3F00011D48,
4199
+ 0x11D5000011D5A,
4200
+ 0x11D6000011D66,
4201
+ 0x11D6700011D69,
4202
+ 0x11D6A00011D8F,
4203
+ 0x11D9000011D92,
4204
+ 0x11D9300011D99,
4205
+ 0x11DA000011DAA,
4206
+ 0x11EE000011EF7,
4207
+ 0x11F0000011F11,
4208
+ 0x11F1200011F3B,
4209
+ 0x11F3E00011F43,
4210
+ 0x11F5000011F5B,
4211
+ 0x11FB000011FB1,
4212
+ 0x120000001239A,
4213
+ 0x1248000012544,
4214
+ 0x12F9000012FF1,
4215
+ 0x1300000013430,
4216
+ 0x1344000013456,
4217
+ 0x13460000143FB,
4218
+ 0x1440000014647,
4219
+ 0x161000001613A,
4220
+ 0x1680000016A39,
4221
+ 0x16A4000016A5F,
4222
+ 0x16A6000016A6A,
4223
+ 0x16A7000016ABF,
4224
+ 0x16AC000016ACA,
4225
+ 0x16AD000016AEE,
4226
+ 0x16AF000016AF5,
4227
+ 0x16B0000016B37,
4228
+ 0x16B4000016B44,
4229
+ 0x16B5000016B5A,
4230
+ 0x16B6300016B78,
4231
+ 0x16B7D00016B90,
4232
+ 0x16D4000016D6D,
4233
+ 0x16D7000016D7A,
4234
+ 0x16E6000016E80,
4235
+ 0x16F0000016F4B,
4236
+ 0x16F4F00016F88,
4237
+ 0x16F8F00016FA0,
4238
+ 0x16FE000016FE2,
4239
+ 0x16FE300016FE5,
4240
+ 0x16FF000016FF2,
4241
+ 0x17000000187F8,
4242
+ 0x1880000018CD6,
4243
+ 0x18CFF00018D09,
4244
+ 0x1AFF00001AFF4,
4245
+ 0x1AFF50001AFFC,
4246
+ 0x1AFFD0001AFFF,
4247
+ 0x1B0000001B123,
4248
+ 0x1B1320001B133,
4249
+ 0x1B1500001B153,
4250
+ 0x1B1550001B156,
4251
+ 0x1B1640001B168,
4252
+ 0x1B1700001B2FC,
4253
+ 0x1BC000001BC6B,
4254
+ 0x1BC700001BC7D,
4255
+ 0x1BC800001BC89,
4256
+ 0x1BC900001BC9A,
4257
+ 0x1BC9D0001BC9F,
4258
+ 0x1CCF00001CCFA,
4259
+ 0x1CF000001CF2E,
4260
+ 0x1CF300001CF47,
4261
+ 0x1DA000001DA37,
4262
+ 0x1DA3B0001DA6D,
4263
+ 0x1DA750001DA76,
4264
+ 0x1DA840001DA85,
4265
+ 0x1DA9B0001DAA0,
4266
+ 0x1DAA10001DAB0,
4267
+ 0x1DF000001DF1F,
4268
+ 0x1DF250001DF2B,
4269
+ 0x1E0000001E007,
4270
+ 0x1E0080001E019,
4271
+ 0x1E01B0001E022,
4272
+ 0x1E0230001E025,
4273
+ 0x1E0260001E02B,
4274
+ 0x1E08F0001E090,
4275
+ 0x1E1000001E12D,
4276
+ 0x1E1300001E13E,
4277
+ 0x1E1400001E14A,
4278
+ 0x1E14E0001E14F,
4279
+ 0x1E2900001E2AF,
4280
+ 0x1E2C00001E2FA,
4281
+ 0x1E4D00001E4FA,
4282
+ 0x1E5D00001E5FB,
4283
+ 0x1E7E00001E7E7,
4284
+ 0x1E7E80001E7EC,
4285
+ 0x1E7ED0001E7EF,
4286
+ 0x1E7F00001E7FF,
4287
+ 0x1E8000001E8C5,
4288
+ 0x1E8D00001E8D7,
4289
+ 0x1E9220001E94C,
4290
+ 0x1E9500001E95A,
4291
+ 0x200000002A6E0,
4292
+ 0x2A7000002B73A,
4293
+ 0x2B7400002B81E,
4294
+ 0x2B8200002CEA2,
4295
+ 0x2CEB00002EBE1,
4296
+ 0x2EBF00002EE5E,
4297
+ 0x300000003134B,
4298
+ 0x31350000323B0,
4299
+ ),
4300
+ "CONTEXTJ": (0x200C0000200E,),
4301
+ "CONTEXTO": (
4302
+ 0xB7000000B8,
4303
+ 0x37500000376,
4304
+ 0x5F3000005F5,
4305
+ 0x6600000066A,
4306
+ 0x6F0000006FA,
4307
+ 0x30FB000030FC,
4308
+ ),
4309
+ }
idna/intranges.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Given a list of integers, made up of (hopefully) a small number of long runs
3
+ of consecutive integers, compute a representation of the form
4
+ ((start1, end1), (start2, end2) ...). Then answer the question "was x present
5
+ in the original list?" in time O(log(# runs)).
6
+ """
7
+
8
+ import bisect
9
+ from typing import List, Tuple
10
+
11
+
12
def intranges_from_list(list_: List[int]) -> Tuple[int, ...]:
    """Represent a list of integers as a sorted sequence of encoded ranges.

    The result describes ((start_0, end_0), (start_1, end_1), ...) such that
    the original integers are exactly those x with start_i <= x < end_i for
    some i. Each range is stored as a single integer (start << 32 | end)
    rather than as a tuple.

    The input may be unsorted and may contain repeated values; repeats are
    collapsed so that the resulting ranges never overlap. An empty input
    yields an empty tuple.
    """
    # Deduplicate first: a repeated value would otherwise break a run and
    # produce overlapping ranges (e.g. [1, 1, 2] -> (1, 2) and (1, 3)).
    values = sorted(set(list_))
    if not values:
        return ()

    ranges = []
    run_start = previous = values[0]
    for value in values[1:]:
        if value != previous + 1:
            # Gap found: close the current run (the end bound is exclusive).
            ranges.append(_encode_range(run_start, previous + 1))
            run_start = value
        previous = value
    # Close the final run, which the loop never terminates itself.
    ranges.append(_encode_range(run_start, previous + 1))

    return tuple(ranges)


def _encode_range(start: int, end: int) -> int:
    """Pack a half-open range into one integer: start in the high bits, end in the low 32.

    `end` is expected to fit in 32 bits; larger values would spill into the
    bits that hold `start`.
    """
    return (start << 32) | end
36
+
37
+
38
def _decode_range(r: int) -> Tuple[int, int]:
    """Split an encoded range back into its (start, end) pair.

    The start is taken from the bits above the low 32; the end is the low
    32 bits.
    """
    low_32_bits = (1 << 32) - 1
    start = r >> 32
    end = r & low_32_bits
    return start, end
40
+
41
+
42
def intranges_contain(int_: int, ranges: Tuple[int, ...]) -> bool:
    """Report whether `int_` lies inside any of the encoded `ranges`.

    `ranges` must be sorted in ascending order (a binary search is used).
    """
    # Search for the encoding of (int_, 0): bisect_left returns the first
    # index whose entry is not smaller than that probe.
    probe = _encode_range(int_, 0)
    index = bisect.bisect_left(ranges, probe)
    # The entry just before the insertion point sorts below the probe, so
    # its start does not exceed int_; it contains int_ when
    # start <= int_ < end.
    if index:
        lower, upper = _decode_range(ranges[index - 1])
        if lower <= int_ < upper:
            return True
    # Nothing at or after the insertion point means no range can start at int_.
    if index == len(ranges):
        return False
    # Otherwise the entry at the insertion point matches only when it
    # begins exactly at int_.
    return _decode_range(ranges[index])[0] == int_
idna/package_data.py ADDED
@@ -0,0 +1 @@
 
 
1
# Release version string of the idna package (this file is idna/package_data.py).
__version__ = "3.11"
idna/py.typed ADDED
File without changes
idna/uts46data.py ADDED
The diff for this file is too large to render. See raw diff
 
importlib_metadata/__init__.py ADDED
@@ -0,0 +1,1191 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ APIs exposing metadata from third-party Python packages.
3
+
4
+ This codebase is shared between importlib.metadata in the stdlib
5
+ and importlib_metadata in PyPI. See
6
+ https://github.com/python/importlib_metadata/wiki/Development-Methodology
7
+ for more detail.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import abc
13
+ import collections
14
+ import email
15
+ import functools
16
+ import itertools
17
+ import operator
18
+ import os
19
+ import pathlib
20
+ import posixpath
21
+ import re
22
+ import sys
23
+ import textwrap
24
+ import types
25
+ from collections.abc import Iterable, Mapping
26
+ from contextlib import suppress
27
+ from importlib import import_module
28
+ from importlib.abc import MetaPathFinder
29
+ from itertools import starmap
30
+ from typing import Any
31
+
32
+ from . import _meta
33
+ from ._collections import FreezableDefaultDict, Pair
34
+ from ._compat import (
35
+ NullFinder,
36
+ install,
37
+ )
38
+ from ._functools import method_cache, noop, pass_none, passthrough
39
+ from ._itertools import always_iterable, bucket, unique_everseen
40
+ from ._meta import PackageMetadata, SimplePath
41
+ from ._typing import md_none
42
+ from .compat import py311
43
+
44
# Public API of this module: the names exported by a star-import.
__all__ = [
    'Distribution',
    'DistributionFinder',
    'PackageMetadata',
    'PackageNotFoundError',
    'SimplePath',
    'distribution',
    'distributions',
    'entry_points',
    'files',
    'metadata',
    'packages_distributions',
    'requires',
    'version',
]
59
+
60
+
61
class PackageNotFoundError(ModuleNotFoundError):
    """Raised when no metadata can be located for a package."""

    @property
    def name(self) -> str:  # type: ignore[override] # make readonly
        # The exception is expected to carry exactly one positional
        # argument (the package name); unpacking raises ValueError otherwise.
        [package] = self.args
        return package

    def __str__(self) -> str:
        missing = self.name
        return f"No package metadata was found for {missing}"
+
72
+
73
+ class Sectioned:
74
+ """
75
+ A simple entry point config parser for performance
76
+
77
+ >>> for item in Sectioned.read(Sectioned._sample):
78
+ ... print(item)
79
+ Pair(name='sec1', value='# comments ignored')
80
+ Pair(name='sec1', value='a = 1')
81
+ Pair(name='sec1', value='b = 2')
82
+ Pair(name='sec2', value='a = 2')
83
+
84
+ >>> res = Sectioned.section_pairs(Sectioned._sample)
85
+ >>> item = next(res)
86
+ >>> item.name
87
+ 'sec1'
88
+ >>> item.value
89
+ Pair(name='a', value='1')
90
+ >>> item = next(res)
91
+ >>> item.value
92
+ Pair(name='b', value='2')
93
+ >>> item = next(res)
94
+ >>> item.name
95
+ 'sec2'
96
+ >>> item.value
97
+ Pair(name='a', value='2')
98
+ >>> list(res)
99
+ []
100
+ """
101
+
102
+ _sample = textwrap.dedent(
103
+ """
104
+ [sec1]
105
+ # comments ignored
106
+ a = 1
107
+ b = 2
108
+
109
+ [sec2]
110
+ a = 2
111
+ """
112
+ ).lstrip()
113
+
114
+ @classmethod
115
+ def section_pairs(cls, text):
116
+ return (
117
+ section._replace(value=Pair.parse(section.value))
118
+ for section in cls.read(text, filter_=cls.valid)
119
+ if section.name is not None
120
+ )
121
+
122
+ @staticmethod
123
+ def read(text, filter_=None):
124
+ lines = filter(filter_, map(str.strip, text.splitlines()))
125
+ name = None
126
+ for value in lines:
127
+ section_match = value.startswith('[') and value.endswith(']')
128
+ if section_match:
129
+ name = value.strip('[]')
130
+ continue
131
+ yield Pair(name, value)
132
+
133
+ @staticmethod
134
+ def valid(line: str):
135
+ return line and not line.startswith('#')
136
+
137
+
138
+ class _EntryPointMatch(types.SimpleNamespace):
139
+ module: str
140
+ attr: str
141
+ extras: str
142
+
143
+
144
+ class EntryPoint:
145
+ """An entry point as defined by Python packaging conventions.
146
+
147
+ See `the packaging docs on entry points
148
+ <https://packaging.python.org/specifications/entry-points/>`_
149
+ for more information.
150
+
151
+ >>> ep = EntryPoint(
152
+ ... name=None, group=None, value='package.module:attr [extra1, extra2]')
153
+ >>> ep.module
154
+ 'package.module'
155
+ >>> ep.attr
156
+ 'attr'
157
+ >>> ep.extras
158
+ ['extra1', 'extra2']
159
+
160
+ If the value package or module are not valid identifiers, a
161
+ ValueError is raised on access.
162
+
163
+ >>> EntryPoint(name=None, group=None, value='invalid-name').module
164
+ Traceback (most recent call last):
165
+ ...
166
+ ValueError: ('Invalid object reference...invalid-name...
167
+ >>> EntryPoint(name=None, group=None, value='invalid-name').attr
168
+ Traceback (most recent call last):
169
+ ...
170
+ ValueError: ('Invalid object reference...invalid-name...
171
+ >>> EntryPoint(name=None, group=None, value='invalid-name').extras
172
+ Traceback (most recent call last):
173
+ ...
174
+ ValueError: ('Invalid object reference...invalid-name...
175
+
176
+ The same thing happens on construction.
177
+
178
+ >>> EntryPoint(name=None, group=None, value='invalid-name')
179
+ Traceback (most recent call last):
180
+ ...
181
+ ValueError: ('Invalid object reference...invalid-name...
182
+
183
+ """
184
+
185
+ pattern = re.compile(
186
+ r'(?P<module>[\w.]+)\s*'
187
+ r'(:\s*(?P<attr>[\w.]+)\s*)?'
188
+ r'((?P<extras>\[.*\])\s*)?$'
189
+ )
190
+ """
191
+ A regular expression describing the syntax for an entry point,
192
+ which might look like:
193
+
194
+ - module
195
+ - package.module
196
+ - package.module:attribute
197
+ - package.module:object.attribute
198
+ - package.module:attr [extra1, extra2]
199
+
200
+ Other combinations are possible as well.
201
+
202
+ The expression is lenient about whitespace around the ':',
203
+ following the attr, and following any extras.
204
+ """
205
+
206
+ name: str
207
+ value: str
208
+ group: str
209
+
210
+ dist: Distribution | None = None
211
+
212
def __init__(self, name: str, value: str, group: str) -> None:
    """Record the three defining fields and validate ``value`` eagerly.

    :raises ValueError: If ``value`` cannot be parsed as an object reference.
    """
    # Populate the instance dict directly; ordinary attribute assignment
    # is rejected by ``__setattr__``.
    fields = vars(self)
    fields.update(name=name, value=value, group=group)
    # Reading the property forces ``value`` to be parsed now, so an invalid
    # reference fails at construction instead of on a later access.
    self.module
215
+
216
def load(self) -> Any:
    """Import and return the object this entry point refers to.

    The module named by ``self.module`` is imported. When ``self.attr``
    is empty or ``None`` the module itself is returned; otherwise each
    dotted component of ``self.attr`` is resolved with ``getattr`` in turn.
    """
    target = import_module(self.module)
    dotted = self.attr or ''
    for component in dotted.split('.'):
        # Splitting an empty string yields [''], so skip blank components.
        if component:
            target = getattr(target, component)
    return target
224
+
225
@property
def module(self) -> str:
    """The ``module`` group of the parsed ``value``."""
    parsed = self._match
    return parsed.module
228
+
229
@property
def attr(self) -> str:
    """The ``attr`` group of the parsed ``value``.

    The group is optional in ``pattern``, so this is ``None`` when the
    value names only a module.
    """
    parsed = self._match
    return parsed.attr
232
+
233
@property
def extras(self) -> list[str]:
    """The extras named in the bracketed suffix of ``value``, as a list.

    Returns an empty list when the value has no extras section.
    """
    bracketed = self._match.extras
    text = bracketed if bracketed else ''
    return re.findall(r'\w+', text)
236
+
237
@functools.cached_property
def _match(self) -> _EntryPointMatch:
    """Parse ``self.value`` with ``self.pattern`` and cache the result.

    :return: The named groups of the match wrapped in ``_EntryPointMatch``.
    :raises ValueError: If ``value`` does not match ``pattern``; the
        offending value is passed as the second exception argument.
    """
    found = self.pattern.match(self.value)
    if found:
        return _EntryPointMatch(**found.groupdict())
    raise ValueError(
        'Invalid object reference. '
        'See https://packaging.python.org'
        '/en/latest/specifications/entry-points/#data-model',
        self.value,
    )
248
+
249
def _for(self, dist):
    """Attach ``dist`` to this entry point and return ``self``."""
    # Write through the instance dict; ``__setattr__`` rejects assignment.
    vars(self)['dist'] = dist
    return self
252
+
253
def matches(self, **params):
    """
    Report whether every given parameter equals the attribute of the
    same name on this EntryPoint.

    >>> ep = EntryPoint(group='foo', name='bar', value='bing:bong [extra1, extra2]')
    >>> ep.matches(group='foo')
    True
    >>> ep.matches(name='bar', value='bing:bong [extra1, extra2]')
    True
    >>> ep.matches(group='foo', name='other')
    False
    >>> ep.matches()
    True
    >>> ep.matches(extras=['extra1', 'extra2'])
    True
    >>> ep.matches(module='bing')
    True
    >>> ep.matches(attr='bong')
    True
    """
    self._disallow_dist(params)
    # Stops at the first mismatch; with no params the result is True.
    return all(
        expected == getattr(self, key) for key, expected in params.items()
    )
276
+
277
@staticmethod
def _disallow_dist(params):
    """
    Reject a query on "dist" (dist objects are not comparable).

    >>> EntryPoint(name='fan', value='fav', group='fag').matches(dist='foo')
    Traceback (most recent call last):
    ...
    ValueError: "dist" is not suitable for matching...
    """
    if "dist" not in params:
        return
    raise ValueError(
        '"dist" is not suitable for matching. '
        "Instead, use Distribution.entry_points.select() on a "
        "located distribution."
    )
292
+
293
def _key(self):
    """Return the ``(name, value, group)`` tuple used to compare and hash."""
    return self.name, self.value, self.group

def __lt__(self, other):
    """Order entry points by their ``_key()`` tuples.

    Returns ``NotImplemented`` when ``other`` exposes no ``_key``, so that
    Python raises its usual ``TypeError`` for unsupported operands rather
    than an ``AttributeError`` from inside this method.
    """
    other_key = getattr(other, '_key', None)
    if other_key is None:
        return NotImplemented
    return self._key() < other_key()

def __eq__(self, other):
    """Compare entry points by their ``_key()`` tuples.

    Returns ``NotImplemented`` when ``other`` exposes no ``_key``, so that
    comparison with an unrelated object evaluates to ``False`` instead of
    raising ``AttributeError``.
    """
    other_key = getattr(other, '_key', None)
    if other_key is None:
        return NotImplemented
    return self._key() == other_key()
301
+
302
def __setattr__(self, name, value):
    """Refuse every attribute assignment; instances are immutable."""
    message = "EntryPoint objects are immutable."
    raise AttributeError(message)
304
+
305
def __repr__(self):
    """Return a constructor-style repr showing name, value and group."""
    head = f'EntryPoint(name={self.name!r}, value={self.value!r}, '
    tail = f'group={self.group!r})'
    return head + tail
310
+
311
def __hash__(self) -> int:
    """Hash the same ``_key()`` tuple that equality compares."""
    identity = self._key()
    return hash(identity)
313
+
314
+
315
class EntryPoints(tuple):
    """
    An immutable collection of selectable EntryPoint objects.
    """

    __slots__ = ()

    def __getitem__(self, name: str) -> EntryPoint:  # type: ignore[override]  # Work with str instead of int
        """
        Get the EntryPoint in self matching name.

        :param name: The entry point name to look up.
        :return: The first entry point whose ``name`` matches.
        :raises KeyError: When no entry point has that name.
        """
        try:
            return next(iter(self.select(name=name)))
        except StopIteration:
            # The exhausted iterator is an implementation detail; suppress
            # it so callers see a plain KeyError without chained context.
            raise KeyError(name) from None

    def __repr__(self):
        """
        Repr with classname and tuple constructor to
        signal that we deviate from regular tuple behavior.
        """
        return f'{self.__class__.__name__}({tuple(self)!r})'

    def select(self, **params) -> EntryPoints:
        """
        Select entry points from self that match the
        given parameters (typically group and/or name).

        Each keyword is forwarded to ``EntryPoint.matches``.
        """
        return EntryPoints(ep for ep in self if ep.matches(**params))

    @property
    def names(self) -> set[str]:
        """
        Return the set of all names of all entry points.
        """
        return {ep.name for ep in self}

    @property
    def groups(self) -> set[str]:
        """
        Return the set of all groups of all entry points.
        """
        return {ep.group for ep in self}

    @classmethod
    def _from_text_for(cls, text, dist):
        """Parse ``text`` and attach ``dist`` to every resulting entry point."""
        return cls(ep._for(dist) for ep in cls._from_text(text))

    @staticmethod
    def _from_text(text):
        """Lazily build EntryPoint objects from sectioned text.

        ``None`` or empty text yields no entry points. The section name
        becomes the group; each pair's name and value become the entry
        point's name and value.
        """
        return (
            EntryPoint(name=item.value.name, value=item.value.value, group=item.name)
            for item in Sectioned.section_pairs(text or '')
        )
369
+
370
+
371
class PackagePath(pathlib.PurePosixPath):
    """A reference to a path in a package"""

    # These attributes are assigned after construction by the code that
    # builds the path.
    hash: FileHash | None
    size: int
    dist: Distribution

    def read_text(self, encoding: str = 'utf-8') -> str:
        """Return the located file's content decoded with ``encoding``."""
        target = self.locate()
        return target.read_text(encoding=encoding)

    def read_binary(self) -> bytes:
        """Return the located file's raw bytes."""
        target = self.locate()
        return target.read_bytes()

    def locate(self) -> SimplePath:
        """Return a path-like object for this path"""
        owner = self.dist
        return owner.locate_file(self)
387
+
388
+
389
class FileHash:
    """A ``mode=value`` hash specification split into its two parts."""

    def __init__(self, spec: str) -> None:
        """Split ``spec`` at the first '='.

        Text before the separator becomes ``mode`` and text after it
        becomes ``value``. Without an '=' the whole spec is the mode and
        the value is empty.
        """
        mode, _sep, value = spec.partition('=')
        self.mode = mode
        self.value = value

    def __repr__(self) -> str:
        return f'<FileHash mode: {self.mode} value: {self.value}>'
395
+
396
+
397
class Distribution(metaclass=abc.ABCMeta):
    """
    An abstract Python distribution package.

    Custom providers may derive from this class and define
    the abstract methods to provide a concrete implementation
    for their environment. Some providers may opt to override
    the default implementation of some properties to bypass
    the file-reading mechanism.
    """

    @abc.abstractmethod
    def read_text(self, filename) -> str | None:
        """Attempt to load metadata file given by the name.

        Python distribution metadata is organized by blobs of text
        typically represented as "files" in the metadata directory
        (e.g. package-1.0.dist-info). These files include things
        like:

        - METADATA: The distribution metadata including fields
          like Name and Version and Description.
        - entry_points.txt: A series of entry points as defined in
          `the entry points spec <https://packaging.python.org/en/latest/specifications/entry-points/#file-format>`_.
        - RECORD: A record of files according to
          `this recording spec <https://packaging.python.org/en/latest/specifications/recording-installed-packages/#the-record-file>`_.

        A package may provide any set of files, including those
        not listed here or none at all.

        :param filename: The name of the file in the distribution info.
        :return: The text if found, otherwise None.
        """

    @abc.abstractmethod
    def locate_file(self, path: str | os.PathLike[str]) -> SimplePath:
        """
        Given a path to a file in this distribution, return a SimplePath
        to it.

        This method is used by callers of ``Distribution.files()`` to
        locate files within the distribution. If it's possible for a
        Distribution to represent files in the distribution as
        ``SimplePath`` objects, it should implement this method
        to resolve such objects.

        Some Distribution providers may elect not to resolve SimplePath
        objects within the distribution by raising a
        NotImplementedError, but consumers of such a Distribution would
        be unable to invoke ``Distribution.files()``.
        """

    @classmethod
    def from_name(cls, name: str) -> Distribution:
        """Return the Distribution for the given package name.

        :param name: The name of the distribution package to search for.
        :return: The Distribution instance (or subclass thereof) for the named
            package, if found.
        :raises PackageNotFoundError: When the named package's distribution
            metadata cannot be found.
        :raises ValueError: When an invalid value is supplied for name.
        """
        if not name:
            raise ValueError("A distribution name is required.")
        try:
            return next(iter(cls._prefer_valid(cls.discover(name=name))))
        except StopIteration:
            # The exhausted iterator is an implementation detail; suppress
            # it so the traceback shows only the PackageNotFoundError.
            raise PackageNotFoundError(name) from None

    @classmethod
    def discover(
        cls, *, context: DistributionFinder.Context | None = None, **kwargs
    ) -> Iterable[Distribution]:
        """Return an iterable of Distribution objects for all packages.

        Pass a ``context`` or pass keyword arguments for constructing
        a context.

        :context: A ``DistributionFinder.Context`` object.
        :return: Iterable of Distribution objects for packages matching
          the context.
        :raises ValueError: When both ``context`` and keyword arguments
          are supplied.
        """
        if context and kwargs:
            raise ValueError("cannot accept context and kwargs")
        context = context or DistributionFinder.Context(**kwargs)
        return itertools.chain.from_iterable(
            resolver(context) for resolver in cls._discover_resolvers()
        )

    @staticmethod
    def _prefer_valid(dists: Iterable[Distribution]) -> Iterable[Distribution]:
        """
        Prefer (move to the front) distributions that have metadata.

        Ref python/importlib_resources#489.
        """
        buckets = bucket(dists, lambda dist: bool(dist.metadata))
        return itertools.chain(buckets[True], buckets[False])

    @staticmethod
    def at(path: str | os.PathLike[str]) -> Distribution:
        """Return a Distribution for the indicated metadata path.

        :param path: a string or path-like object
        :return: a concrete Distribution instance for the path
        """
        return PathDistribution(pathlib.Path(path))

    @staticmethod
    def _discover_resolvers():
        """Search the meta_path for resolvers (MetadataPathFinders)."""
        declared = (
            getattr(finder, 'find_distributions', None) for finder in sys.meta_path
        )
        # Finders without a find_distributions attribute contribute None.
        return filter(None, declared)

    @property
    def metadata(self) -> _meta.PackageMetadata | None:
        """Return the parsed metadata for this Distribution.

        The returned object will have keys that name the various bits of
        metadata per the
        `Core metadata specifications <https://packaging.python.org/en/latest/specifications/core-metadata/#core-metadata>`_.

        Custom providers may provide the METADATA file or override this
        property.
        """

        text = (
            self.read_text('METADATA')
            or self.read_text('PKG-INFO')
            # This last clause is here to support old egg-info files.  Its
            # effect is to just end up using the PathDistribution's self._path
            # (which points to the egg-info file) attribute unchanged.
            or self.read_text('')
        )
        return self._assemble_message(text)

    @staticmethod
    @pass_none
    def _assemble_message(text: str) -> _meta.PackageMetadata:
        """Parse ``text`` as an email-style message and wrap it."""
        # deferred for performance (python/cpython#109829)
        from . import _adapters

        return _adapters.Message(email.message_from_string(text))

    @property
    def name(self) -> str:
        """Return the 'Name' metadata for the distribution package."""
        return md_none(self.metadata)['Name']

    @property
    def _normalized_name(self):
        """Return a normalized version of the name."""
        return Prepared.normalize(self.name)

    @property
    def version(self) -> str:
        """Return the 'Version' metadata for the distribution package."""
        return md_none(self.metadata)['Version']

    @property
    def entry_points(self) -> EntryPoints:
        """
        Return EntryPoints for this distribution.

        Custom providers may provide the ``entry_points.txt`` file
        or override this property.
        """
        return EntryPoints._from_text_for(self.read_text('entry_points.txt'), self)

    @property
    def files(self) -> list[PackagePath] | None:
        """Files in this distribution.

        :return: List of PackagePath for this distribution or None

        Result is `None` if the metadata file that enumerates files
        (i.e. RECORD for dist-info, or installed-files.txt or
        SOURCES.txt for egg-info) is missing.
        Result may be empty if the metadata exists but is empty.

        Custom providers are recommended to provide a "RECORD" file (in
        ``read_text``) or override this property to allow for callers to be
        able to resolve filenames provided by the package.
        """

        def make_file(name, hash=None, size_str=None):
            # Build one PackagePath from the columns of a CSV row.
            result = PackagePath(name)
            result.hash = FileHash(hash) if hash else None
            result.size = int(size_str) if size_str else None
            result.dist = self
            return result

        @pass_none
        def make_files(lines):
            # Delay csv import, since Distribution.files is not as widely used
            # as other parts of importlib.metadata
            import csv

            return starmap(make_file, csv.reader(lines))

        @pass_none
        def skip_missing_files(package_paths):
            # Keep only entries whose located file actually exists.
            return list(filter(lambda path: path.locate().exists(), package_paths))

        # The first source that yields lines wins.
        return skip_missing_files(
            make_files(
                self._read_files_distinfo()
                or self._read_files_egginfo_installed()
                or self._read_files_egginfo_sources()
            )
        )

    def _read_files_distinfo(self):
        """
        Read the lines of RECORD.
        """
        text = self.read_text('RECORD')
        return text and text.splitlines()

    def _read_files_egginfo_installed(self):
        """
        Read installed-files.txt and return lines in a similar
        CSV-parsable format as RECORD: each file must be placed
        relative to the site-packages directory and must also be
        quoted (since file names can contain literal commas).

        This file is written when the package is installed by pip,
        but it might not be written for other installation methods.
        Assume the file is accurate if it exists.
        """
        text = self.read_text('installed-files.txt')
        # Prepend the .egg-info/ subdir to the lines in this file.
        # But this subdir is only available from PathDistribution's
        # self._path.
        subdir = getattr(self, '_path', None)
        if not text or not subdir:
            return

        paths = (
            py311
            .relative_fix((subdir / name).resolve())
            .relative_to(self.locate_file('').resolve(), walk_up=True)
            .as_posix()
            for name in text.splitlines()
        )
        return map('"{}"'.format, paths)

    def _read_files_egginfo_sources(self):
        """
        Read SOURCES.txt and return lines in a similar CSV-parsable
        format as RECORD: each file name must be quoted (since it
        might contain literal commas).

        Note that SOURCES.txt is not a reliable source for what
        files are installed by a package. This file is generated
        for a source archive, and the files that are present
        there (e.g. setup.py) may not correctly reflect the files
        that are present after the package has been installed.
        """
        text = self.read_text('SOURCES.txt')
        return text and map('"{}"'.format, text.splitlines())

    @property
    def requires(self) -> list[str] | None:
        """Generated requirements specified for this Distribution"""
        reqs = self._read_dist_info_reqs() or self._read_egg_info_reqs()
        return reqs and list(reqs)

    def _read_dist_info_reqs(self):
        """Return the 'Requires-Dist' values, or None without metadata."""
        # ``metadata`` is None when no metadata file could be read; report
        # "no requirements here" so ``requires`` can fall back to egg-info
        # instead of failing with AttributeError.
        parsed = self.metadata
        if parsed is None:
            return None
        return parsed.get_all('Requires-Dist')

    def _read_egg_info_reqs(self):
        """Return requirements converted from requires.txt, if present."""
        source = self.read_text('requires.txt')
        return pass_none(self._deps_from_requires_text)(source)

    @classmethod
    def _deps_from_requires_text(cls, source):
        """Convert the sectioned text of requires.txt to requirements."""
        return cls._convert_egg_info_reqs_to_simple_reqs(Sectioned.read(source))

    @staticmethod
    def _convert_egg_info_reqs_to_simple_reqs(sections):
        """
        Historically, setuptools would solicit and store 'extra'
        requirements, including those with environment markers,
        in separate sections. More modern tools expect each
        dependency to be defined separately, with any relevant
        extras and environment markers attached directly to that
        requirement. This method converts the former to the
        latter. See _test_deps_from_requires_text for an example.
        """

        def make_condition(name):
            return name and f'extra == "{name}"'

        def quoted_marker(section):
            section = section or ''
            extra, sep, markers = section.partition(':')
            if extra and markers:
                markers = f'({markers})'
            conditions = list(filter(None, [markers, make_condition(extra)]))
            return '; ' + ' and '.join(conditions) if conditions else ''

        def url_req_space(req):
            """
            PEP 508 requires a space between the url_spec and the quoted_marker.
            Ref python/importlib_metadata#357.
            """
            # '@' is uniquely indicative of a url_req.
            return ' ' * ('@' in req)

        for section in sections:
            space = url_req_space(section.value)
            yield section.value + space + quoted_marker(section.name)

    @property
    def origin(self):
        """Return the parsed content of direct_url.json, or None if absent."""
        return self._load_json('direct_url.json')

    def _load_json(self, filename):
        """Parse the named metadata file as JSON into SimpleNamespace objects."""
        # Deferred for performance (python/importlib_metadata#503)
        import json

        return pass_none(json.loads)(
            self.read_text(filename),
            object_hook=lambda data: types.SimpleNamespace(**data),
        )
726
+
727
+
728
class DistributionFinder(MetaPathFinder):
    """
    A MetaPathFinder capable of discovering installed distributions.

    Custom providers should implement this interface in order to
    supply metadata.
    """

    class Context:
        """
        Keyword arguments presented by the caller to
        ``distributions()`` or ``Distribution.discover()``
        to narrow the scope of a search for distributions
        in all DistributionFinders.

        Each DistributionFinder may expect any parameters
        and should attempt to honor the canonical
        parameters defined below when appropriate.

        This mechanism gives a custom provider a means to
        solicit additional details from the caller beyond
        "name" and "path" when searching distributions.
        For example, imagine a provider that exposes suites
        of packages in either a "public" or "private" ``realm``.
        A caller may wish to query only for distributions in
        a particular realm and could call
        ``distributions(realm="private")`` to signal to the
        custom provider to only include distributions from that
        realm.
        """

        # Specific name for which a distribution finder should match.
        # A name of ``None`` matches all distributions.
        name = None

        def __init__(self, **kwargs):
            """Store every keyword argument in the instance dict."""
            # Going through the dict lets a ``path`` keyword be stored even
            # though ``path`` is a read-only property on the class.
            self.__dict__.update(kwargs)

        @property
        def path(self) -> list[str]:
            """
            The sequence of directory path that a distribution finder
            should search.

            Typically refers to Python installed package paths such as
            "site-packages" directories and defaults to ``sys.path``.
            """
            stored = vars(self)
            if 'path' in stored:
                return stored['path']
            return sys.path

    @abc.abstractmethod
    def find_distributions(self, context=Context()) -> Iterable[Distribution]:
        """
        Find distributions.

        Return an iterable of all Distribution instances capable of
        loading the metadata for packages matching the ``context``,
        a DistributionFinder.Context instance.
        """
788
+
789
+
790
@passthrough
def _clear_after_fork(cached):
    """Ensure ``cached`` clears its cached state after ``fork`` when supported.

    ``FastPath`` caches zip-backed ``pathlib.Path`` objects that retain a
    reference to the parent's open ``ZipFile`` handle. Re-using a cached
    instance in a forked child can therefore resurrect invalid file pointers
    and trigger ``BadZipFile``/``OSError`` failures (python/importlib_metadata#520).
    Registering ``cache_clear`` with ``os.register_at_fork`` keeps each process
    on its own cache.
    """
    # Where ``os.register_at_fork`` is missing, ``noop`` is called instead.
    register = getattr(os, 'register_at_fork', noop)
    register(after_in_child=cached.cache_clear)
802
+
803
+
804
class FastPath:
    """
    Micro-optimized class for searching a root for children.

    Root is a path on the file system that may contain metadata
    directories either as natural directories or within a zip file.

    >>> FastPath('').children()
    ['...']

    FastPath objects are cached and recycled for any given root.

    >>> FastPath('foobar') is FastPath('foobar')
    True
    """

    @_clear_after_fork  # type: ignore[misc]
    @functools.lru_cache()
    def __new__(cls, root):
        # Cached on (cls, root), so equal roots share one instance.
        return super().__new__(cls)

    def __init__(self, root):
        self.root = root

    def joinpath(self, child):
        """Return ``child`` joined onto the root as a ``pathlib.Path``."""
        return pathlib.Path(self.root, child)

    def children(self):
        """List the root's entries, trying a directory first, then a zip.

        Any exception from either attempt is suppressed; an empty list is
        returned when both fail.
        """
        directory = self.root or '.'
        with suppress(Exception):
            return os.listdir(directory)
        with suppress(Exception):
            return self.zip_children()
        return []

    def zip_children(self):
        """Return the top-level names inside the zip file at the root."""
        # deferred for performance (python/importlib_metadata#502)
        from zipp.compat.overlay import zipfile

        archive = zipfile.Path(self.root)
        members = archive.root.namelist()
        # From here on, children are joined relative to the zip archive.
        self.joinpath = archive.joinpath

        first_segments = (
            member.split(posixpath.sep, 1)[0] for member in members
        )
        # dict.fromkeys removes duplicates while keeping first-seen order.
        return dict.fromkeys(first_segments)

    def search(self, name):
        """Search the lookup built for the root's current mtime."""
        current = self.lookup(self.mtime)
        return current.search(name)

    @property
    def mtime(self):
        """The root's modification time, or ``None`` if ``os.stat`` fails.

        On failure the ``lookup`` cache is cleared first.
        """
        with suppress(OSError):
            return os.stat(self.root).st_mtime
        self.lookup.cache_clear()
        return None

    @method_cache
    def lookup(self, mtime):
        # ``mtime`` only serves as the cache key.
        return Lookup(self)
860
+
861
+
862
class Lookup:
    """
    A micro-optimized class for searching a (fast) path for metadata.
    """

    def __init__(self, path: FastPath):
        """
        Scan ``path.children()`` once and bucket the metadata entries.

        Children ending in ``.dist-info`` or ``.egg-info`` are stored in
        ``infos`` under their normalized project name. A child named
        ``egg-info`` inside a root whose name ends in ``.egg`` is stored
        in ``eggs`` under the legacy-normalized name taken from the root.
        ``freeze()`` is called on both mappings once the scan is done.
        """

        root_name = os.path.basename(path.root).lower()
        root_is_egg = root_name.endswith(".egg")
        self.infos = FreezableDefaultDict(list)
        self.eggs = FreezableDefaultDict(list)

        for entry in path.children():
            lowered = entry.lower()
            if lowered.endswith((".dist-info", ".egg-info")):
                # rpartition is faster than splitext and suitable for this purpose.
                stem = lowered.rpartition(".")[0]
                project = stem.partition("-")[0]
                key = Prepared.normalize(project)
                self.infos[key].append(path.joinpath(entry))
            elif root_is_egg and lowered == "egg-info":
                stem = root_name.rpartition(".")[0]
                project = stem.partition("-")[0]
                key = Prepared.legacy_normalize(project)
                self.eggs[key].append(path.joinpath(entry))

        self.infos.freeze()
        self.eggs.freeze()

    def search(self, prepared: Prepared):
        """
        Return an iterator over all infos, then all eggs, matching the
        Prepared query. A falsy query matches every entry.
        """
        if prepared:
            infos = self.infos[prepared.normalized]
            eggs = self.eggs[prepared.legacy_normalized]
        else:
            infos = itertools.chain.from_iterable(self.infos.values())
            eggs = itertools.chain.from_iterable(self.eggs.values())
        return itertools.chain(infos, eggs)
911
+
912
+
913
class Prepared:
    """
    A prepared search query for metadata on a possibly-named package.

    Pre-calculates the normalization to prevent repeated operations.

    >>> none = Prepared(None)
    >>> none.normalized
    >>> none.legacy_normalized
    >>> bool(none)
    False
    >>> sample = Prepared('Sample__Pkg-name.foo')
    >>> sample.normalized
    'sample_pkg_name_foo'
    >>> sample.legacy_normalized
    'sample__pkg_name.foo'
    >>> bool(sample)
    True
    """

    # Class-level defaults, used when the query has no name.
    normalized = None
    legacy_normalized = None

    def __init__(self, name: str | None):
        """Store ``name`` and, unless it is ``None``, both normalized forms."""
        self.name = name
        if name is not None:
            self.normalized = self.normalize(name)
            self.legacy_normalized = self.legacy_normalize(name)

    @staticmethod
    def normalize(name):
        """
        PEP 503 normalization plus dashes as underscores.
        """
        collapsed = re.sub(r"[-_.]+", "-", name)
        lowered = collapsed.lower()
        return lowered.replace('-', '_')

    @staticmethod
    def legacy_normalize(name):
        """
        Normalize the package name as found in the convention in
        older packaging tools versions and specs.
        """
        lowered = name.lower()
        return lowered.replace('-', '_')

    def __bool__(self):
        """A query is truthy only when it carries a non-empty name."""
        return bool(self.name)
960
+
961
+
962
@install
class MetadataPathFinder(NullFinder, DistributionFinder):
    """A degenerate finder for distribution packages on the file system.

    This finder supplies only a find_distributions() method for versions
    of Python that do not have a PathFinder find_distributions().
    """

    @classmethod
    def find_distributions(
        cls, context=DistributionFinder.Context()
    ) -> Iterable[PathDistribution]:
        """
        Find distributions.

        Return an iterable of all Distribution instances capable of
        loading the metadata for packages matching ``context.name``
        (or all names if ``None`` indicated) along the paths in the list
        of directories ``context.path``.
        """
        metadata_paths = cls._search_paths(context.name, context.path)
        return map(PathDistribution, metadata_paths)

    @classmethod
    def _search_paths(cls, name, paths):
        """Find metadata directories in paths heuristically."""
        query = Prepared(name)
        roots = map(FastPath, paths)
        return itertools.chain.from_iterable(
            root.search(query) for root in roots
        )

    @classmethod
    def invalidate_caches(cls) -> None:
        """Drop the cached ``FastPath`` instances."""
        FastPath.__new__.cache_clear()
996
+
997
+
998
class PathDistribution(Distribution):
    def __init__(self, path: SimplePath) -> None:
        """Construct a distribution.

        :param path: SimplePath indicating the metadata directory.
        """
        self._path = path

    def read_text(self, filename: str | os.PathLike[str]) -> str | None:
        # Errors that mean "this file is not available" give None.
        unavailable = (
            FileNotFoundError,
            IsADirectoryError,
            KeyError,
            NotADirectoryError,
            PermissionError,
        )
        with suppress(*unavailable):
            target = self._path.joinpath(filename)
            return target.read_text(encoding='utf-8')

        return None

    # Reuse the abstract method's documentation.
    read_text.__doc__ = Distribution.read_text.__doc__

    def locate_file(self, path: str | os.PathLike[str]) -> SimplePath:
        # Files are resolved relative to the metadata directory's parent.
        container = self._path.parent
        return container / path

    @property
    def _normalized_name(self):
        """
        Performance optimization: where possible, resolve the
        normalized name from the file system path.
        """
        stem = os.path.basename(str(self._path))
        from_path = pass_none(Prepared.normalize)(self._name_from_stem(stem))
        # Fall back to the metadata-derived name when the path gives none.
        return from_path or super()._normalized_name

    @staticmethod
    def _name_from_stem(stem):
        """
        >>> PathDistribution._name_from_stem('foo-3.0.egg-info')
        'foo'
        >>> PathDistribution._name_from_stem('CherryPy-3.0.dist-info')
        'CherryPy'
        >>> PathDistribution._name_from_stem('face.egg-info')
        'face'
        >>> PathDistribution._name_from_stem('foo.bar')
        """
        base, suffix = os.path.splitext(stem)
        if suffix in ('.dist-info', '.egg-info'):
            # The project name is everything before the first '-'.
            return base.partition('-')[0]
        return None
1051
+
1052
+
1053
def distribution(distribution_name: str) -> Distribution:
    """Look up the ``Distribution`` for the named package.

    :param distribution_name: The name of the distribution package as a string.
    :return: A ``Distribution`` instance (or subclass thereof).
    """
    found = Distribution.from_name(distribution_name)
    return found
1060
+
1061
+
1062
def distributions(**kwargs) -> Iterable[Distribution]:
    """Discover ``Distribution`` instances in the current environment.

    Keyword arguments are forwarded to ``Distribution.discover``.

    :return: An iterable of ``Distribution`` instances.
    """
    discovered = Distribution.discover(**kwargs)
    return discovered
1068
+
1069
+
1070
def metadata(distribution_name: str) -> _meta.PackageMetadata | None:
    """Return the parsed metadata of the named package.

    :param distribution_name: The name of the distribution package to query.
    :return: A PackageMetadata containing the parsed metadata.
    """
    dist = Distribution.from_name(distribution_name)
    return dist.metadata
1077
+
1078
+
1079
def version(distribution_name: str) -> str:
    """Return the version string of the named package.

    :param distribution_name: The name of the distribution package to query.
    :return: The version string for the package as defined in the package's
        "Version" metadata key.
    """
    dist = distribution(distribution_name)
    return dist.version
1087
+
1088
+
1089
# Applied to the result of ``distributions()``: passes the iterable through
# ``unique_everseen`` keyed on each distribution's ``_normalized_name``.
_unique = functools.partial(
    unique_everseen,
    key=operator.attrgetter('_normalized_name'),
)
1096
+
1097
+
1098
def entry_points(**params) -> EntryPoints:
    """Collect EntryPoint objects from all installed packages.

    Pass selection parameters (group or name) to filter the
    result to entry points matching those properties (see
    EntryPoints.select()).

    :return: EntryPoints for all installed packages.
    """
    # Only one distribution per normalized name contributes entry points.
    unique_dists = _unique(distributions())
    collected = EntryPoints(
        ep for dist in unique_dists for ep in dist.entry_points
    )
    return collected.select(**params)
1111
+
1112
+
1113
def files(distribution_name: str) -> list[PackagePath] | None:
    """List the files of the named package.

    :param distribution_name: The name of the distribution package to query.
    :return: List of files composing the distribution.
    """
    dist = distribution(distribution_name)
    return dist.files
1120
+
1121
+
1122
def requires(distribution_name: str) -> list[str] | None:
    """
    List the requirements of the named package.

    :return: An iterable of requirements, suitable for
        packaging.requirement.Requirement.
    """
    dist = distribution(distribution_name)
    return dist.requires
1130
+
1131
+
1132
def packages_distributions() -> Mapping[str, list[str]]:
    """
    Map each top-level package name to the names of the
    distributions that provide it.

    >>> import collections.abc
    >>> pkgs = packages_distributions()
    >>> all(isinstance(dist, collections.abc.Sequence) for dist in pkgs.values())
    True
    """
    providers = collections.defaultdict(list)
    for dist in distributions():
        # Declared top-level names win; inference from the file list is
        # used only when nothing is declared.
        top_levels = _top_level_declared(dist) or _top_level_inferred(dist)
        for package in top_levels:
            dist_name = md_none(dist.metadata)['Name']
            providers[package].append(dist_name)
    return dict(providers)
1147
+
1148
+
1149
def _top_level_declared(dist):
    """Return the whitespace-separated names listed in ``top_level.txt``.

    A missing or empty file gives an empty list.
    """
    text = dist.read_text('top_level.txt')
    declared = text if text else ''
    return declared.split()
1151
+
1152
+
1153
def _topmost(name: PackagePath) -> str | None:
    """
    Return the top-most parent as long as there is a parent.

    :param name: A path whose ``parts`` are inspected.
    :return: The first path component when the path has more than one
        component; ``None`` for a single-component path or for a path with
        no components at all (such as an empty path).
    """
    parts = name.parts
    # Require at least two components. This also covers an empty ``parts``
    # tuple, which star-unpacking would reject with ValueError.
    return parts[0] if len(parts) > 1 else None
1159
+
1160
+
1161
def _get_toplevel_name(name: PackagePath) -> str:
    """
    Guess a possibly importable module name from a path presumed to be
    on sys.path.

    >>> _get_toplevel_name(PackagePath('foo.py'))
    'foo'
    >>> _get_toplevel_name(PackagePath('foo'))
    'foo'
    >>> _get_toplevel_name(PackagePath('foo.pyc'))
    'foo'
    >>> _get_toplevel_name(PackagePath('foo/__init__.py'))
    'foo'
    >>> _get_toplevel_name(PackagePath('foo.pth'))
    'foo.pth'
    >>> _get_toplevel_name(PackagePath('foo.dist-info'))
    'foo.dist-info'
    """
    # Defer import of inspect for performance (python/cpython#118761)
    import inspect

    # A path with a parent directory is named after its first component.
    parent = _topmost(name)
    if parent:
        return parent
    # Otherwise strip a recognised module suffix, if there is one.
    module_name = inspect.getmodulename(name)
    if module_name:
        return module_name
    return str(name)
1183
+
1184
+
1185
def _top_level_inferred(dist):
    """Infer top-level names from the distribution's file list.

    Each file is reduced to a candidate name with ``_get_toplevel_name``;
    candidates containing a '.' are dropped.
    """
    candidates = {
        _get_toplevel_name(path) for path in always_iterable(dist.files)
    }

    def has_no_dot(candidate):
        return '.' not in candidate

    return filter(has_no_dot, candidates)
importlib_metadata/_adapters.py ADDED
@@ -0,0 +1,136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import email.message
2
+ import email.policy
3
+ import re
4
+ import textwrap
5
+
6
+ from ._text import FoldedCase
7
+
8
+
9
class RawPolicy(email.policy.EmailPolicy):
    """
    Email policy whose header folding indents every continuation line
    of a value by eight spaces.
    """

    def fold(self, name, value):
        # Indent every line (blank ones included), then drop the leading
        # whitespace so the first line sits directly after "name: ".
        indented = textwrap.indent(value, prefix=' ' * 8, predicate=lambda line: True)
        body = self.linesep.join(indented.lstrip().splitlines())
        return f'{name}: {body}{self.linesep}'
18
+
19
+
20
class Message(email.message.Message):
    r"""
    Metadata-aware ``email.message.Message`` subclass.

    Header values may contain newlines, and any message payload is
    exposed as the ``Description`` header.

    >>> msg_text = textwrap.dedent('''
    ...     Name: Foo
    ...     Version: 3.0
    ...     License: blah
    ...             de-blah
    ...     <BLANKLINE>
    ...     First line of description.
    ...     Second line of description.
    ...     <BLANKLINE>
    ...     Fourth line!
    ...     ''').lstrip().replace('<BLANKLINE>', '')
    >>> msg = Message(email.message_from_string(msg_text))
    >>> msg['Description']
    'First line of description.\nSecond line of description.\n\nFourth line!\n'

    Message should render even if values contain newlines.

    >>> print(msg)
    Name: Foo
    Version: 3.0
    License: blah
            de-blah
    Description: First line of description.
            Second line of description.
    <BLANKLINE>
            Fourth line!
    <BLANKLINE>
    <BLANKLINE>
    """

    # Keys that may be indicated multiple times per PEP 566.
    multiple_use_keys = {
        FoldedCase(key)
        for key in (
            'Classifier',
            'Obsoletes-Dist',
            'Platform',
            'Project-URL',
            'Provides-Dist',
            'Provides-Extra',
            'Requires-Dist',
            'Requires-External',
            'Supported-Platform',
            'Dynamic',
        )
    }

    def __new__(cls, orig: email.message.Message):
        # Adopt the state of the already-parsed message wholesale.
        instance = super().__new__(cls)
        vars(instance).update(vars(orig))
        return instance

    def __init__(self, *args, **kwargs):
        # State was copied in __new__; only the headers need repairing.
        self._headers = self._repair_headers()

    # suppress spurious error from mypy
    def __iter__(self):
        return super().__iter__()

    def __getitem__(self, item):
        """
        Look up *item* with ordinary mapping semantics.

        ``email.message.Message`` yields None for a missing key; this
        class raises ``KeyError`` instead, as typical mappings do.

        Ref python/importlib_metadata#371.
        """
        found = super().__getitem__(item)
        if found is not None:
            return found
        raise KeyError(item)

    def _repair_headers(self):
        """
        Return the header list with RFC822 indentation removed from
        values and with any payload appended as ``Description``.
        """
        indent = ' ' * 8

        def redent(value):
            "Correct for RFC822 indentation"
            needs_dedent = bool(value) and ('\n' + indent) in value
            return textwrap.dedent(indent + value) if needs_dedent else value

        repaired = []
        for key, value in vars(self)['_headers']:
            repaired.append((key, redent(value)))
        if self._payload:
            # Move the body into a Description header and clear the payload.
            repaired.append(('Description', self.get_payload()))
            self.set_payload('')
        return repaired

    def as_string(self):
        raw = RawPolicy()
        return super().as_string(policy=raw)

    @property
    def json(self):
        """
        Convert PackageMetadata to a JSON-compatible format
        per PEP 0566.
        """

        def transform(key):
            if key in self.multiple_use_keys:
                value = self.get_all(key)
            else:
                value = self[key]
            if key == 'Keywords':
                value = re.split(r'\s+', value)
            json_key = key.lower().replace('-', '_')
            return json_key, value

        return dict(transform(FoldedCase(key)) for key in self)
importlib_metadata/_collections.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import collections
2
+ import typing
3
+
4
+
5
+ # from jaraco.collections 3.3
6
class FreezableDefaultDict(collections.defaultdict):
    """
    A defaultdict that can be frozen: once frozen, looking up a
    missing key still returns a default value but no longer stores it,
    which prevents mutation (for example during iteration).

    >>> dd = FreezableDefaultDict(list)
    >>> dd[0].append('1')
    >>> dd.freeze()
    >>> dd[1]
    []
    >>> len(dd)
    1
    """

    def __missing__(self, key):
        # Until freeze() installs ``_frozen``, defer to defaultdict.
        fallback = super().__missing__
        handler = getattr(self, '_frozen', fallback)
        return handler(key)

    def freeze(self):
        def produce_default(key):
            # Build a default without inserting it into the mapping.
            return self.default_factory()

        self._frozen = produce_default
26
+
27
+
28
class Pair(typing.NamedTuple):
    """A ``name``/``value`` pair of strings."""

    name: str
    value: str

    @classmethod
    def parse(cls, text):
        """
        Split *text* on the first ``=`` and build a Pair from the
        whitespace-stripped pieces.
        """
        pieces = [piece.strip() for piece in text.split("=", 1)]
        return cls(*pieces)
importlib_metadata/_compat.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import platform
2
+ import sys
3
+
4
+ __all__ = ['install', 'NullFinder']
5
+
6
+
7
def install(cls):
    """
    Class decorator that registers an instance of *cls* on sys.meta_path.

    The backport DistributionFinder is appended to sys.meta_path, then
    an attempt is made to disable the finder functionality of the
    stdlib DistributionFinder. The class itself is returned unchanged.
    """
    finder = cls()
    sys.meta_path.append(finder)
    disable_stdlib_finder()
    return cls
18
+
19
+
20
def disable_stdlib_finder():
    """
    Give the backport primacy for discovering path-based distributions
    by monkey-patching the stdlib O_O.

    Every finder on sys.meta_path whose ``__module__`` is
    ``_frozen_importlib_external`` and that has a ``find_distributions``
    attribute has that attribute deleted.

    See #91 for more background for rationale on this sketchy
    behavior.
    """
    stdlib_module = '_frozen_importlib_external'
    for finder in sys.meta_path:  # pragma: nocover
        from_stdlib = getattr(finder, '__module__', None) == stdlib_module
        if from_stdlib and hasattr(finder, 'find_distributions'):
            del finder.find_distributions
36
+
37
+
38
class NullFinder:
    """
    A "Finder" (aka "MetaPathFinder") whose module lookup always comes
    up empty; subclasses may still find distributions.
    """

    @staticmethod
    def find_spec(*args, **kwargs):
        """Return None for every request, whatever the arguments."""
        return None
47
+
48
+
49
def pypy_partial(val):
    """
    Return *val* adjusted for the variable stacklevel of partial under
    PyPy: one is added when running on PyPy, nothing otherwise.

    Workaround for #327.
    """
    return val + (platform.python_implementation() == 'PyPy')
importlib_metadata/_functools.py ADDED
@@ -0,0 +1,136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import functools
2
+ import types
3
+ from collections.abc import Callable
4
+ from typing import TypeVar
5
+
6
+
7
+ # from jaraco.functools 3.3
8
def method_cache(method, cache_wrapper=None):
    """
    Wrap lru_cache to support storing the cache data in the object instances.

    Abstracts the common paradigm where the method explicitly saves an
    underscore-prefixed protected property on first call and returns that
    subsequently.

    >>> class MyClass:
    ...     calls = 0
    ...
    ...     @method_cache
    ...     def method(self, value):
    ...         self.calls += 1
    ...         return value

    >>> a = MyClass()
    >>> a.method(3)
    3
    >>> for x in range(75):
    ...     res = a.method(x)
    >>> a.calls
    75

    Note that the apparent behavior will be exactly like that of lru_cache
    except that the cache is stored on each instance, so values in one
    instance will not flush values from another, and when an instance is
    deleted, so are the cached values for that instance.

    >>> b = MyClass()
    >>> for x in range(35):
    ...     res = b.method(x)
    >>> b.calls
    35
    >>> a.method(0)
    0
    >>> a.calls
    75

    Note that if method had been decorated with ``functools.lru_cache()``,
    a.calls would have been 76 (due to the cached value of 0 having been
    flushed by the 'b' instance).

    Clear the cache with ``.cache_clear()``

    >>> a.method.cache_clear()

    Same for a method that hasn't yet been called.

    >>> c = MyClass()
    >>> c.method.cache_clear()

    Another cache wrapper may be supplied:

    >>> cache = functools.lru_cache(maxsize=2)
    >>> MyClass.method2 = method_cache(lambda self: 3, cache_wrapper=cache)
    >>> a = MyClass()
    >>> a.method2()
    3

    The decorated method keeps the name and docstring of the original:

    >>> MyClass.method.__name__
    'method'

    Caution - do not subsequently wrap the method with another decorator, such
    as ``@property``, which changes the semantics of the function.

    See also
    http://code.activestate.com/recipes/577452-a-memoize-decorator-for-instance-methods/
    for another implementation and additional justification.
    """
    cache_wrapper = cache_wrapper or functools.lru_cache()

    # Preserve the wrapped method's metadata (__name__, __doc__, ...) so
    # introspection and documentation tools see the real method.
    @functools.wraps(method)
    def wrapper(self, *args, **kwargs):
        # it's the first call, replace the method with a cached, bound method
        bound_method = types.MethodType(method, self)
        cached_method = cache_wrapper(bound_method)
        # The instance attribute shadows this wrapper on later lookups.
        setattr(self, method.__name__, cached_method)
        return cached_method(*args, **kwargs)

    # Support cache clear even before cache has been created.
    wrapper.cache_clear = lambda: None

    return wrapper
88
+
89
+
90
+ # From jaraco.functools 3.3
91
def pass_none(func):
    """
    Return a version of *func* that is skipped (yielding None) whenever
    its first argument is None.

    >>> print_text = pass_none(print)
    >>> print_text('text')
    text
    >>> print_text(None)
    """

    @functools.wraps(func)
    def guarded(param, *args, **kwargs):
        if param is None:
            return None
        return func(param, *args, **kwargs)

    return guarded
107
+
108
+
109
+ # From jaraco.functools 4.4
110
def noop(*args, **kwargs):
    """
    Accept any arguments, do nothing, and return None.

    >>> noop(1, 2, three=3)
    """
    return None
116
+
117
+
118
_T = TypeVar('_T')


# From jaraco.functools 4.4
def passthrough(func: Callable[..., object]) -> Callable[[_T], _T]:
    """
    Return a wrapper that calls *func* for its effect, discards the
    result, and hands back the first argument.

    >>> passthrough(print)('3')
    3
    '3'
    """

    @functools.wraps(func)
    def forward_first(first: _T, *args, **kwargs) -> _T:
        func(first, *args, **kwargs)
        return first

    return forward_first  # type: ignore[return-value]
importlib_metadata/_itertools.py ADDED
@@ -0,0 +1,171 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from collections import defaultdict, deque
2
+ from itertools import filterfalse
3
+
4
+
5
def unique_everseen(iterable, key=None):
    "Yield each distinct element once, in first-seen order, remembering all seen."
    # unique_everseen('AAAABBBCCDAABBB') --> A B C D
    # unique_everseen('ABBCcAD', str.lower) --> A B C D
    observed = set()
    if key is None:
        for item in iterable:
            if item in observed:
                continue
            observed.add(item)
            yield item
        return
    for item in iterable:
        marker = key(item)
        if marker in observed:
            continue
        observed.add(marker)
        yield item
21
+
22
+
23
+ # copied from more_itertools 8.8
24
def always_iterable(obj, base_type=(str, bytes)):
    """Return an iterator over *obj*, wrapping it first when necessary.

    If *obj* is iterable, return an iterator over its items::

        >>> obj = (1, 2, 3)
        >>> list(always_iterable(obj))
        [1, 2, 3]

    If *obj* is not iterable, return a one-item iterable containing *obj*::

        >>> obj = 1
        >>> list(always_iterable(obj))
        [1]

    If *obj* is ``None``, return an empty iterable:

        >>> obj = None
        >>> list(always_iterable(None))
        []

    By default, binary and text strings are not considered iterable::

        >>> obj = 'foo'
        >>> list(always_iterable(obj))
        ['foo']

    If *base_type* is set, objects for which ``isinstance(obj, base_type)``
    returns ``True`` won't be considered iterable.

        >>> obj = {'a': 1}
        >>> list(always_iterable(obj))  # Iterate over the dict's keys
        ['a']
        >>> list(always_iterable(obj, base_type=dict))  # Treat dicts as a unit
        [{'a': 1}]

    Set *base_type* to ``None`` to avoid any special handling and treat objects
    Python considers iterable as iterable:

        >>> obj = 'foo'
        >>> list(always_iterable(obj, base_type=None))
        ['f', 'o', 'o']
    """
    if obj is None:
        return iter(())

    treat_as_unit = base_type is not None and isinstance(obj, base_type)
    if not treat_as_unit:
        try:
            return iter(obj)
        except TypeError:
            # Not iterable after all; fall through and wrap it.
            pass

    return iter((obj,))
75
+
76
+
77
+ # Copied from more_itertools 10.3
78
class bucket:
    """Wrap *iterable* and return an object that buckets the iterable into
    child iterables based on a *key* function.

        >>> iterable = ['a1', 'b1', 'c1', 'a2', 'b2', 'c2', 'b3']
        >>> s = bucket(iterable, key=lambda x: x[0])  # Bucket by 1st character
        >>> sorted(list(s))  # Get the keys
        ['a', 'b', 'c']
        >>> a_iterable = s['a']
        >>> next(a_iterable)
        'a1'
        >>> next(a_iterable)
        'a2'
        >>> list(s['b'])
        ['b1', 'b2', 'b3']

    The original iterable will be advanced and its items will be cached until
    they are used by the child iterables. This may require significant storage.

    By default, attempting to select a bucket to which no items belong will
    exhaust the iterable and cache all values.
    If you specify a *validator* function, selected buckets will instead be
    checked against it.

        >>> from itertools import count
        >>> it = count(1, 2)  # Infinite sequence of odd numbers
        >>> key = lambda x: x % 10  # Bucket by last digit
        >>> validator = lambda x: x in {1, 3, 5, 7, 9}  # Odd digits only
        >>> s = bucket(it, key=key, validator=validator)
        >>> 2 in s
        False
        >>> list(s[2])
        []

    """

    def __init__(self, iterable, key, validator=None):
        if not validator:
            # Without a validator every bucket value is acceptable.
            validator = lambda x: True
        self._it = iter(iterable)
        self._key = key
        self._cache = defaultdict(deque)
        self._validator = validator

    def __contains__(self, value):
        if not self._validator(value):
            return False

        missing = object()
        first = next(self[value], missing)
        if first is missing:
            return False

        # Put the probed item back at the front so it is not lost.
        self._cache[value].appendleft(first)
        return True

    def _get_values(self, value):
        """
        Generate the parent iterator's items whose key equals *value*.
        Items belonging to other (valid) buckets are stored in the
        local cache as they are encountered.
        """
        while True:
            pending = self._cache[value]
            if pending:
                # Serve cached matches first, evicting them as they go.
                yield pending.popleft()
                continue
            # Nothing cached: advance the parent iterator until a match
            # turns up, caching everything else on the way.
            for candidate in self._it:
                group = self._key(candidate)
                if group == value:
                    yield candidate
                    break
                if self._validator(group):
                    self._cache[group].append(candidate)
            else:
                # Parent iterator exhausted without another match.
                return

    def __iter__(self):
        # Drain the parent iterator into the cache, then report the keys.
        for candidate in self._it:
            group = self._key(candidate)
            if self._validator(group):
                self._cache[group].append(candidate)

        yield from self._cache

    def __getitem__(self, value):
        if self._validator(value):
            return self._get_values(value)
        return iter(())
importlib_metadata/_meta.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ from collections.abc import Iterator
5
+ from typing import (
6
+ Any,
7
+ Protocol,
8
+ TypeVar,
9
+ overload,
10
+ )
11
+
12
+ _T = TypeVar("_T")
13
+
14
+
15
class PackageMetadata(Protocol):
    """
    Structural interface for package metadata objects.

    Declares sized, membership, item-access and iteration operations
    over string keys, ``get``/``get_all`` lookups that accept a
    fallback object, and a ``json`` property.
    """

    def __len__(self) -> int: ...  # pragma: no cover

    def __contains__(self, item: str) -> bool: ...  # pragma: no cover

    def __getitem__(self, key: str) -> str: ...  # pragma: no cover

    def __iter__(self) -> Iterator[str]: ...  # pragma: no cover

    # With no fallback supplied the result type is ``str | None``.
    @overload
    def get(
        self, name: str, failobj: None = None
    ) -> str | None: ...  # pragma: no cover

    # With a fallback supplied the result may be that fallback's type.
    @overload
    def get(self, name: str, failobj: _T) -> str | _T: ...  # pragma: no cover

    # overload per python/importlib_metadata#435
    @overload
    def get_all(
        self, name: str, failobj: None = None
    ) -> list[Any] | None: ...  # pragma: no cover

    @overload
    def get_all(self, name: str, failobj: _T) -> list[Any] | _T:
        """
        Return all values associated with a possibly multi-valued key.
        """

    @property
    def json(self) -> dict[str, str | list[str]]:
        """
        A JSON-compatible form of the metadata.
        """
49
+
50
+
51
class SimplePath(Protocol):
    """
    A minimal subset of pathlib.Path required by Distribution.

    Covers joining (``joinpath`` and the ``/`` operator), the
    ``parent`` property, reading as text or bytes, and an existence
    check.
    """

    def joinpath(
        self, other: str | os.PathLike[str]
    ) -> SimplePath: ...  # pragma: no cover

    # Operator form of joining: ``path / other``.
    def __truediv__(
        self, other: str | os.PathLike[str]
    ) -> SimplePath: ...  # pragma: no cover

    @property
    def parent(self) -> SimplePath: ...  # pragma: no cover

    def read_text(self, encoding=None) -> str: ...  # pragma: no cover

    def read_bytes(self) -> bytes: ...  # pragma: no cover

    def exists(self) -> bool: ...  # pragma: no cover
importlib_metadata/_text.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+
3
+ from ._functools import method_cache
4
+
5
+
6
+ # from jaraco.text 3.5
7
class FoldedCase(str):
    """
    A case insensitive string class; behaves just like str
    except compares equal when the only variation is case.

    >>> s = FoldedCase('hello world')

    >>> s == 'Hello World'
    True

    >>> 'Hello World' == s
    True

    >>> s != 'Hello World'
    False

    >>> s.index('O')
    4

    >>> s.split('O')
    ['hell', ' w', 'rld']

    >>> sorted(map(FoldedCase, ['GAMMA', 'alpha', 'Beta']))
    ['alpha', 'Beta', 'GAMMA']

    Sequence membership is straightforward.

    >>> "Hello World" in [s]
    True
    >>> s in ["Hello World"]
    True

    You may test for set inclusion, but candidate and elements
    must both be folded.

    >>> FoldedCase("Hello World") in {s}
    True
    >>> s in {FoldedCase("Hello World")}
    True

    String inclusion works as long as the FoldedCase object
    is on the right.

    >>> "hello" in FoldedCase("Hello World")
    True

    But not if the FoldedCase object is on the left:

    >>> FoldedCase('hello') in 'Hello World'
    False

    In that case, use in_:

    >>> FoldedCase('hello').in_('Hello World')
    True

    >>> FoldedCase('hello') > FoldedCase('Hello')
    False

    Comparison with a non-string is simply unequal rather than an error:

    >>> s == 42
    False
    >>> s in [42, 'HELLO WORLD']
    True
    """

    # The rich comparisons return NotImplemented for non-string operands
    # so Python can fall back to its default handling instead of failing
    # with AttributeError on ``other.lower()``.

    def __lt__(self, other):
        if not isinstance(other, str):
            return NotImplemented
        return self.lower() < other.lower()

    def __gt__(self, other):
        if not isinstance(other, str):
            return NotImplemented
        return self.lower() > other.lower()

    def __eq__(self, other):
        if not isinstance(other, str):
            return NotImplemented
        return self.lower() == other.lower()

    def __ne__(self, other):
        if not isinstance(other, str):
            return NotImplemented
        return self.lower() != other.lower()

    def __hash__(self):
        # Hash the folded form so values equal under folding hash alike.
        return hash(self.lower())

    def __contains__(self, other):
        return super().lower().__contains__(other.lower())

    def in_(self, other):
        "Does self appear in other?"
        return self in FoldedCase(other)

    # cache lower since it's likely to be called frequently.
    @method_cache
    def lower(self):
        return super().lower()

    def index(self, sub):
        return self.lower().index(sub.lower())

    def split(self, splitter=' ', maxsplit=0):
        # Case-insensitive split on the literal splitter text.
        pattern = re.compile(re.escape(splitter), re.I)
        return pattern.split(self, maxsplit)
importlib_metadata/_typing.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import functools
2
+ import typing
3
+
4
+ from ._meta import PackageMetadata
5
+
6
# ``typing.cast`` pre-bound to the PackageMetadata type: ``md_none(value)``
# hands *value* back unchanged at runtime and only affects static typing.
md_none = functools.partial(typing.cast, PackageMetadata)
"""
Suppress type errors for optional metadata.

Although Distribution.metadata can return None when metadata is corrupt
and thus None, allow callers to assume it's not None and crash if
that's the case.

# python/importlib_metadata#493
"""