salmankhanpm committed on Mar 19

Commit

ffca8fa

verified ·

1 Parent(s): 9b44c7b

Add files using upload-large-folder tool

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

_cuda_bindings_redirector.py +30 -0
anyio-4.12.1.dist-info/INSTALLER +1 -0
anyio-4.12.1.dist-info/METADATA +96 -0
anyio-4.12.1.dist-info/RECORD +51 -0
anyio-4.12.1.dist-info/REQUESTED +0 -0
anyio-4.12.1.dist-info/WHEEL +5 -0
anyio-4.12.1.dist-info/entry_points.txt +2 -0
anyio-4.12.1.dist-info/top_level.txt +1 -0
dataset-metadata.json +9 -0
datasets/__init__.py +47 -0
datasets/arrow_dataset.py +0 -0
datasets/arrow_reader.py +620 -0
datasets/arrow_writer.py +766 -0
datasets/builder.py +1866 -0
datasets/combine.py +223 -0
datasets/config.py +268 -0
datasets/data_files.py +807 -0
datasets/dataset_dict.py +0 -0
datasets/distributed.py +39 -0
datasets/exceptions.py +119 -0
datasets/fingerprint.py +454 -0
datasets/hub.py +124 -0
datasets/info.py +430 -0
datasets/inspect.py +353 -0
datasets/iterable_dataset.py +0 -0
datasets/keyhash.py +104 -0
datasets/load.py +1481 -0
datasets/naming.py +84 -0
datasets/search.py +785 -0
datasets/splits.py +635 -0
datasets/streaming.py +131 -0
datasets/table.py +2385 -0
idna/__init__.py +45 -0
idna/codec.py +122 -0
idna/compat.py +15 -0
idna/core.py +437 -0
idna/idnadata.py +4309 -0
idna/intranges.py +57 -0
idna/package_data.py +1 -0
idna/py.typed +0 -0
idna/uts46data.py +0 -0
importlib_metadata/__init__.py +1191 -0
importlib_metadata/_adapters.py +136 -0
importlib_metadata/_collections.py +34 -0
importlib_metadata/_compat.py +56 -0
importlib_metadata/_functools.py +136 -0
importlib_metadata/_itertools.py +171 -0
importlib_metadata/_meta.py +71 -0
importlib_metadata/_text.py +99 -0
importlib_metadata/_typing.py +15 -0

_cuda_bindings_redirector.py ADDED Viewed

	@@ -0,0 +1,30 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
+import sys
+from types import ModuleType
+# Make sure 'cuda' is importable as a namespace package
+import cuda
+class LazyCudaModule(ModuleType):
+    def __getattr__(self, name):
+        if name == '__version__':
+            import warnings
+            warnings.warn(
+                "accessing cuda.__version__ is deprecated, " "please switch to use cuda.bindings.__version__ instead",
+                FutureWarning,
+                stacklevel=2,
+            )
+            from cuda.bindings import __version__
+            return __version__
+        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
+# Patch in LazyCudaModule for `cuda`: reassigning __class__ on the already-imported module
+# object routes failed attribute lookups on `cuda` through LazyCudaModule.__getattr__.
+sys.modules['cuda'].__class__ = LazyCudaModule

anyio-4.12.1.dist-info/INSTALLER ADDED Viewed

	@@ -0,0 +1 @@


1	+ uv

anyio-4.12.1.dist-info/METADATA ADDED Viewed

	@@ -0,0 +1,96 @@

+Metadata-Version: 2.4
+Name: anyio
+Version: 4.12.1
+Summary: High-level concurrency and networking framework on top of asyncio or Trio
+Author-email: Alex Grönholm <alex.gronholm@nextday.fi>
+License-Expression: MIT
+Project-URL: Documentation, https://anyio.readthedocs.io/en/latest/
+Project-URL: Changelog, https://anyio.readthedocs.io/en/stable/versionhistory.html
+Project-URL: Source code, https://github.com/agronholm/anyio
+Project-URL: Issue tracker, https://github.com/agronholm/anyio/issues
+Classifier: Development Status :: 5 - Production/Stable
+Classifier: Intended Audience :: Developers
+Classifier: Framework :: AnyIO
+Classifier: Typing :: Typed
+Classifier: Programming Language :: Python
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
+Classifier: Programming Language :: Python :: 3.14
+Requires-Python: >=3.9
+Description-Content-Type: text/x-rst
+License-File: LICENSE
+Requires-Dist: exceptiongroup>=1.0.2; python_version < "3.11"
+Requires-Dist: idna>=2.8
+Requires-Dist: typing_extensions>=4.5; python_version < "3.13"
+Provides-Extra: trio
+Requires-Dist: trio>=0.32.0; python_version >= "3.10" and extra == "trio"
+Requires-Dist: trio>=0.31.0; python_version < "3.10" and extra == "trio"
+Dynamic: license-file
+.. image:: https://github.com/agronholm/anyio/actions/workflows/test.yml/badge.svg
+  :target: https://github.com/agronholm/anyio/actions/workflows/test.yml
+  :alt: Build Status
+.. image:: https://coveralls.io/repos/github/agronholm/anyio/badge.svg?branch=master
+  :target: https://coveralls.io/github/agronholm/anyio?branch=master
+  :alt: Code Coverage
+.. image:: https://readthedocs.org/projects/anyio/badge/?version=latest
+  :target: https://anyio.readthedocs.io/en/latest/?badge=latest
+  :alt: Documentation
+.. image:: https://badges.gitter.im/gitterHQ/gitter.svg
+  :target: https://gitter.im/python-trio/AnyIO
+  :alt: Gitter chat
+AnyIO is an asynchronous networking and concurrency library that works on top of either asyncio_ or
+Trio_. It implements Trio-like `structured concurrency`_ (SC) on top of asyncio and works in harmony
+with the native SC of Trio itself.
+Applications and libraries written against AnyIO's API will run unmodified on either asyncio_ or
+Trio_. AnyIO can also be adopted into a library or application incrementally – bit by bit, no full
+refactoring necessary. It will blend in with the native libraries of your chosen backend.
+To find out why you might want to use AnyIO's APIs instead of asyncio's, you can read about it
+`here <https://anyio.readthedocs.io/en/stable/why.html>`_.
+Documentation
+-------------
+View full documentation at: https://anyio.readthedocs.io/
+Features
+--------
+AnyIO offers the following functionality:
+* Task groups (nurseries_ in trio terminology)
+* High-level networking (TCP, UDP and UNIX sockets)
+  * `Happy eyeballs`_ algorithm for TCP connections (more robust than that of asyncio on Python
+    3.8)
+  * async/await style UDP sockets (unlike asyncio where you still have to use Transports and
+    Protocols)
+* A versatile API for byte streams and object streams
+* Inter-task synchronization and communication (locks, conditions, events, semaphores, object
+  streams)
+* Worker threads
+* Subprocesses
+* Subinterpreter support for code parallelization (on Python 3.13 and later)
+* Asynchronous file I/O (using worker threads)
+* Signal handling
+* Asynchronous version of the functools_ module
+AnyIO also comes with its own pytest_ plugin which also supports asynchronous fixtures.
+It even works with the popular Hypothesis_ library.
+.. _asyncio: https://docs.python.org/3/library/asyncio.html
+.. _Trio: https://github.com/python-trio/trio
+.. _structured concurrency: https://en.wikipedia.org/wiki/Structured_concurrency
+.. _nurseries: https://trio.readthedocs.io/en/stable/reference-core.html#nurseries-and-spawning
+.. _Happy eyeballs: https://en.wikipedia.org/wiki/Happy_Eyeballs
+.. _pytest: https://docs.pytest.org/en/latest/
+.. _functools: https://docs.python.org/3/library/functools.html
+.. _Hypothesis: https://hypothesis.works/

anyio-4.12.1.dist-info/RECORD ADDED Viewed

	@@ -0,0 +1,51 @@

+anyio-4.12.1.dist-info/INSTALLER,sha256=5hhM4Q4mYTT9z6QB6PGpUAW81PGNFrYrdXMj4oM_6ak,2
+anyio-4.12.1.dist-info/METADATA,sha256=DfiDab9Tmmcfy802lOLTMEHJQShkOSbopCwqCYbLuJk,4277
+anyio-4.12.1.dist-info/RECORD,,
+anyio-4.12.1.dist-info/REQUESTED,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+anyio-4.12.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+anyio-4.12.1.dist-info/entry_points.txt,sha256=_d6Yu6uiaZmNe0CydowirE9Cmg7zUL2g08tQpoS3Qvc,39
+anyio-4.12.1.dist-info/licenses/LICENSE,sha256=U2GsncWPLvX9LpsJxoKXwX8ElQkJu8gCO9uC6s8iwrA,1081
+anyio-4.12.1.dist-info/top_level.txt,sha256=QglSMiWX8_5dpoVAEIHdEYzvqFMdSYWmCj6tYw2ITkQ,6
+anyio/__init__.py,sha256=7iDVqMUprUuKNY91FuoKqayAhR-OY136YDPI6P78HHk,6170
+anyio/_backends/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+anyio/_backends/_asyncio.py,sha256=xG6qv60mgGnL0mK82dxjH2b8hlkMlJ-x2BqIq3qv70Y,98863
+anyio/_backends/_trio.py,sha256=30Rctb7lm8g63ZHljVPVnj5aH-uK6oQvphjwUBoAzuI,41456
+anyio/_core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+anyio/_core/_asyncio_selector_thread.py,sha256=2PdxFM3cs02Kp6BSppbvmRT7q7asreTW5FgBxEsflBo,5626
+anyio/_core/_contextmanagers.py,sha256=YInBCabiEeS-UaP_Jdxa1CaFC71ETPW8HZTHIM8Rsc8,7215
+anyio/_core/_eventloop.py,sha256=c2EdcBX-xnKwxPcC4Pjn3_qG9I-x4IWFO2R9RqCGjM4,6448
+anyio/_core/_exceptions.py,sha256=Y3aq-Wxd7Q2HqwSg7nZPvRsHEuGazv_qeet6gqEBdPk,4407
+anyio/_core/_fileio.py,sha256=uc7t10Vb-If7GbdWM_zFf-ajUe6uek63fSt7IBLlZW0,25731
+anyio/_core/_resources.py,sha256=NbmU5O5UX3xEyACnkmYX28Fmwdl-f-ny0tHym26e0w0,435
+anyio/_core/_signals.py,sha256=mjTBB2hTKNPRlU0IhnijeQedpWOGERDiMjSlJQsFrug,1016
+anyio/_core/_sockets.py,sha256=RBXHcUqZt5gg_-OOfgHVv8uq2FSKk1uVUzTdpjBoI1o,34977
+anyio/_core/_streams.py,sha256=FczFwIgDpnkK0bODWJXMpsUJYdvAD04kaUaGzJU8DK0,1806
+anyio/_core/_subprocesses.py,sha256=EXm5igL7dj55iYkPlbYVAqtbqxJxjU-6OndSTIx9SRg,8047
+anyio/_core/_synchronization.py,sha256=MgVVqFzvt580tHC31LiOcq1G6aryut--xRG4Ff8KwxQ,20869
+anyio/_core/_tasks.py,sha256=pVB7K6AAulzUM8YgXAeqNZG44nSyZ1bYJjH8GznC00I,5435
+anyio/_core/_tempfile.py,sha256=lHb7CW4FyIlpkf5ADAf4VmLHCKwEHF9nxqNyBCFFUiA,19697
+anyio/_core/_testing.py,sha256=u7MPqGXwpTxqI7hclSdNA30z2GH1Nw258uwKvy_RfBg,2340
+anyio/_core/_typedattr.py,sha256=P4ozZikn3-DbpoYcvyghS_FOYAgbmUxeoU8-L_07pZM,2508
+anyio/abc/__init__.py,sha256=6mWhcl_pGXhrgZVHP_TCfMvIXIOp9mroEFM90fYCU_U,2869
+anyio/abc/_eventloop.py,sha256=GlzgB3UJGgG6Kr7olpjOZ-o00PghecXuofVDQ_5611Q,10749
+anyio/abc/_resources.py,sha256=DrYvkNN1hH6Uvv5_5uKySvDsnknGVDe8FCKfko0VtN8,783
+anyio/abc/_sockets.py,sha256=ECTY0jLEF18gryANHR3vFzXzGdZ-xPwELq1QdgOb0Jo,13258
+anyio/abc/_streams.py,sha256=005GKSCXGprxnhucILboSqc2JFovECZk9m3p-qqxXVc,7640
+anyio/abc/_subprocesses.py,sha256=cumAPJTktOQtw63IqG0lDpyZqu_l1EElvQHMiwJgL08,2067
+anyio/abc/_tasks.py,sha256=KC7wrciE48AINOI-AhPutnFhe1ewfP7QnamFlDzqesQ,3721
+anyio/abc/_testing.py,sha256=tBJUzkSfOXJw23fe8qSJ03kJlShOYjjaEyFB6k6MYT8,1821
+anyio/from_thread.py,sha256=L-0w1HxJ6BSb-KuVi57k5Tkc3yzQrx3QK5tAxMPcY-0,19141
+anyio/functools.py,sha256=HWj7GBEmc0Z-mZg3uok7Z7ZJn0rEC_0Pzbt0nYUDaTQ,10973
+anyio/lowlevel.py,sha256=AyKLVK3LaWSoK39LkCKxE4_GDMLKZBNqTrLUgk63y80,5158
+anyio/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+anyio/pytest_plugin.py,sha256=3jAFQn0jv_pyoWE2GBBlHaj9sqXj4e8vob0_hgrsXE8,10244
+anyio/streams/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+anyio/streams/buffered.py,sha256=2R3PeJhe4EXrdYqz44Y6-Eg9R6DrmlsYrP36Ir43-po,6263
+anyio/streams/file.py,sha256=4WZ7XGz5WNu39FQHvqbe__TQ0HDP9OOhgO1mk9iVpVU,4470
+anyio/streams/memory.py,sha256=F0zwzvFJKAhX_LRZGoKzzqDC2oMM-f-yyTBrEYEGOaU,10740
+anyio/streams/stapled.py,sha256=T8Xqwf8K6EgURPxbt1N4i7A8BAk-gScv-GRhjLXIf_o,4390
+anyio/streams/text.py,sha256=BcVAGJw1VRvtIqnv-o0Rb0pwH7p8vwlvl21xHq522ag,5765
+anyio/streams/tls.py,sha256=Jpxy0Mfbcp1BxHCwE-YjSSFaLnIBbnnwur-excYThs4,15368
+anyio/to_interpreter.py,sha256=_mLngrMy97TMR6VbW4Y6YzDUk9ZuPcQMPlkuyRh3C9k,7100
+anyio/to_process.py,sha256=J7gAA_YOuoHqnpDAf5fm1Qu6kOmTzdFbiDNvnV755vk,9798
+anyio/to_thread.py,sha256=menEgXYmUV7Fjg_9WqCV95P9MAtQS8BzPGGcWB_QnfQ,2687

anyio-4.12.1.dist-info/REQUESTED ADDED Viewed

File without changes

anyio-4.12.1.dist-info/WHEEL ADDED Viewed

	@@ -0,0 +1,5 @@

+Wheel-Version: 1.0
+Generator: setuptools (80.9.0)
+Root-Is-Purelib: true
+Tag: py3-none-any

anyio-4.12.1.dist-info/entry_points.txt ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ [pytest11]
2	+ anyio = anyio.pytest_plugin

anyio-4.12.1.dist-info/top_level.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ anyio

dataset-metadata.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+  "title": "mamba-packages",
+  "id": "pmsalmankhan/mamba-packages",
+  "licenses": [
+    {
+      "name": "CC0-1.0"
+    }
+  ]
+}

datasets/__init__.py ADDED Viewed

	@@ -0,0 +1,47 @@

+# Copyright 2020 The HuggingFace Datasets Authors and the TensorFlow Datasets Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+__version__ = "4.3.0"
+from .arrow_dataset import Column, Dataset
+from .arrow_reader import ReadInstruction
+from .builder import ArrowBasedBuilder, BuilderConfig, DatasetBuilder, GeneratorBasedBuilder
+from .combine import concatenate_datasets, interleave_datasets
+from .dataset_dict import DatasetDict, IterableDatasetDict
+from .download import *
+from .features import *
+from .fingerprint import disable_caching, enable_caching, is_caching_enabled
+from .info import DatasetInfo
+from .inspect import (
+    get_dataset_config_info,
+    get_dataset_config_names,
+    get_dataset_default_config_name,
+    get_dataset_infos,
+    get_dataset_split_names,
+)
+from .iterable_dataset import IterableColumn, IterableDataset
+from .load import load_dataset, load_dataset_builder, load_from_disk
+from .splits import (
+    NamedSplit,
+    NamedSplitAll,
+    Split,
+    SplitBase,
+    SplitDict,
+    SplitGenerator,
+    SplitInfo,
+    SubSplitInfo,
+    percent,
+)
+from .utils import *
+from .utils import logging

datasets/arrow_dataset.py ADDED Viewed

The diff for this file is too large to render. See raw diff

datasets/arrow_reader.py ADDED Viewed

	@@ -0,0 +1,620 @@

+# Copyright 2020 The HuggingFace Datasets Authors and the TensorFlow Datasets Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Lint as: python3
+"""Arrow ArrowReader."""
+import copy
+import math
+import os
+import re
+from dataclasses import dataclass
+from functools import partial
+from typing import TYPE_CHECKING, Optional, Union
+import pyarrow as pa
+import pyarrow.parquet as pq
+from tqdm.contrib.concurrent import thread_map
+from .download.download_config import DownloadConfig  # noqa: F401
+from .naming import _split_re, filenames_for_dataset_split
+from .table import InMemoryTable, MemoryMappedTable, Table, concat_tables
+from .utils import logging
+from .utils import tqdm as hf_tqdm
+if TYPE_CHECKING:
+    from .info import DatasetInfo  # noqa: F401
+    from .splits import Split, SplitInfo  # noqa: F401
+# Module-level logger named after this module.
+logger = logging.get_logger(__name__)
+HF_GCP_BASE_URL = "https://storage.googleapis.com/huggingface-nlp/cache/datasets"
+# Matches one sub-instruction such as "test[:33%]" or "test[:33%](closest)" and exposes the
+# named groups split / from / from_pct / to / to_pct / rounding. Compiled with re.X, so
+# whitespace inside the pattern is ignored.
+_SUB_SPEC_RE = re.compile(
+    rf"""
+^
+ (?P<split>{_split_re[1:-1]})
+ (\[
+    ((?P<from>-?[\d_]+)
+     (?P<from_pct>%)?)?
+    :
+    ((?P<to>-?[\d_]+)
+     (?P<to_pct>%)?)?
+ \])?(\((?P<rounding>[^\)]*)\))?
+$
+""",  # `_split_re[1:-1]` above removes the leading ^ and trailing $ of the imported pattern
+    re.X,
+)
+# Separator between sub-instructions: a "+" with optional whitespace on both sides.
+_ADDITION_SEP_RE = re.compile(r"\s*\+\s*")
+class DatasetNotOnHfGcsError(ConnectionError):
+    """When you can't get the dataset from the Hf google cloud storage"""
+    pass
+class MissingFilesOnHfGcsError(ConnectionError):
+    """When some files are missing on the Hf oogle cloud storage"""
+    pass
+@dataclass(frozen=True)
+class FileInstructions:
+    """The file instructions associated with a split ReadInstruction.
+    Attributes:
+        num_examples: `int`, the total number of examples covered by the file instructions.
+        file_instructions: List[dict(filename, skip, take)], the files information.
+            The filenames contain the relative path, not absolute.
+            skip/take indicate which examples to read in the file: `ds.slice(skip, take)`.
+            `make_file_instructions` uses take == -1 to mean "up to the end of the file".
+    """
+    num_examples: int
+    file_instructions: list[dict]
+def make_file_instructions(
+    name: str,
+    split_infos: list["SplitInfo"],
+    instruction: Union[str, "ReadInstruction"],
+    filetype_suffix: Optional[str] = None,
+    prefix_path: Optional[str] = None,
+) -> FileInstructions:
+    """Returns instructions of the split dict.
+    Args:
+        name (`str`): Name of the dataset.
+        split_infos (`list` of `[SplitInfo]`): Dataset splits information.
+        instruction ([`ReadInstruction`] or `str`): Reading instruction for a dataset.
+        filetype_suffix (`str`, *optional*): Suffix of dataset files, e.g. 'arrow' or 'parquet'.
+        prefix_path (`str`, *optional*): Prefix of dataset files, e.g. directory name.
+    Returns:
+        [`FileInstructions`]
+    """
+    if not isinstance(name, str):
+        raise TypeError(f"Expected str 'name', but got: {type(name).__name__}")
+    elif not name:
+        raise ValueError("Expected non-empty str 'name'")
+    name2len = {info.name: info.num_examples for info in split_infos}
+    name2shard_lengths = {info.name: info.shard_lengths for info in split_infos}
+    name2filenames = {
+        info.name: filenames_for_dataset_split(
+            path=prefix_path,
+            dataset_name=name,
+            split=info.name,
+            filetype_suffix=filetype_suffix,
+            shard_lengths=name2shard_lengths[info.name],
+        )
+        for info in split_infos
+    }
+    if not isinstance(instruction, ReadInstruction):
+        instruction = ReadInstruction.from_spec(instruction)
+    # Create the absolute instruction (per split)
+    absolute_instructions = instruction.to_absolute(name2len)
+    # For each split, return the files instruction (skip/take)
+    file_instructions = []
+    num_examples = 0
+    for abs_instr in absolute_instructions:
+        split_length = name2len[abs_instr.splitname]
+        filenames = name2filenames[abs_instr.splitname]
+        shard_lengths = name2shard_lengths[abs_instr.splitname]
+        from_ = 0 if abs_instr.from_ is None else abs_instr.from_
+        to = split_length if abs_instr.to is None else abs_instr.to
+        if shard_lengths is None:  # not sharded
+            for filename in filenames:
+                take = to - from_
+                if take == 0:
+                    continue
+                num_examples += take
+                file_instructions.append({"filename": filename, "skip": from_, "take": take})
+        else:  # sharded
+            index_start = 0  # Beginning (included) of moving window.
+            index_end = 0  # End (excluded) of moving window.
+            for filename, shard_length in zip(filenames, shard_lengths):
+                index_end += shard_length
+                if from_ < index_end and to > index_start:  # There is something to take.
+                    skip = from_ - index_start if from_ > index_start else 0
+                    take = to - index_start - skip if to < index_end else -1
+                    if take == 0:
+                        continue
+                    file_instructions.append({"filename": filename, "skip": skip, "take": take})
+                    num_examples += shard_length - skip if take == -1 else take
+                index_start += shard_length
+    return FileInstructions(
+        num_examples=num_examples,
+        file_instructions=file_instructions,
+    )
+class BaseReader:
+    """
+    Build a Dataset object out of Instruction instance(s).
+    The base class leaves `_filetype_suffix` as None and does not implement
+    `_get_table_from_filename`.
+    """
+    def __init__(self, path: str, info: Optional["DatasetInfo"]):
+        """Initializes BaseReader.
+        Args:
+            path (str): path where the dataset files are stored.
+            info (DatasetInfo): info about the dataset.
+        """
+        self._path: str = path
+        self._info: Optional["DatasetInfo"] = info
+        self._filetype_suffix: Optional[str] = None
+    def _get_table_from_filename(self, filename_skip_take, in_memory=False) -> Table:
+        """Returns a Table from given dict(filename, skip, take). Not implemented in the base class."""
+        raise NotImplementedError
+    def _read_files(self, files, in_memory=False) -> Table:
+        """Returns a single Table for given file instructions.
+        Args:
+            files: List[dict(filename, skip, take)], the files information.
+                Each filename is joined onto `self._path` before being read.
+                skip/take indicates which example read in the file: `ds.slice(skip, take)`
+            in_memory (bool, default False): Whether to copy the data in-memory.
+        Raises:
+            ValueError: if `files` is empty or contains an entry that is not a dict, or if
+                nothing could be read and no `info.features` is available to build an empty table.
+        """
+        if len(files) == 0 or not all(isinstance(f, dict) for f in files):
+            raise ValueError("please provide valid file informations")
+        # Work on a deep copy so that the caller's dicts are not modified by the path join below.
+        files = copy.deepcopy(files)
+        for f in files:
+            f["filename"] = os.path.join(self._path, f["filename"])
+        pa_tables = thread_map(
+            partial(self._get_table_from_filename, in_memory=in_memory),
+            files,
+            tqdm_class=hf_tqdm,
+            desc="Loading dataset shards",
+            # set `disable=None` rather than `disable=False` by default to disable progress bar when no TTY attached
+            disable=len(files) <= 16 or None,
+        )
+        # Drop the empty tables before concatenating.
+        pa_tables = [t for t in pa_tables if len(t) > 0]
+        if not pa_tables and (self._info is None or self._info.features is None):
+            raise ValueError(
+                "Tried to read an empty table. Please specify at least info.features to create an empty table with the right type."
+            )
+        # Nothing was read: fall back to an empty table built from the schema of info.features.
+        pa_tables = pa_tables or [InMemoryTable.from_batches([], schema=pa.schema(self._info.features.type))]
+        # A single table is returned as is, without going through concat_tables.
+        pa_table = concat_tables(pa_tables) if len(pa_tables) != 1 else pa_tables[0]
+        return pa_table
+    def get_file_instructions(self, name, instruction, split_infos):
+        """Return list of dict {'filename': str, 'skip': int, 'take': int}"""
+        file_instructions = make_file_instructions(
+            name, split_infos, instruction, filetype_suffix=self._filetype_suffix, prefix_path=self._path
+        )
+        files = file_instructions.file_instructions
+        return files
+    def read(
+        self,
+        name,
+        instructions,
+        split_infos,
+        in_memory=False,
+    ):
+        """Returns Dataset instance(s).
+        Args:
+            name (str): name of the dataset.
+            instructions (ReadInstruction): instructions to read.
+                Instruction can be string and will then be passed to the Instruction
+                constructor as is.
+            split_infos (list of SplitInfo proto): the available splits for dataset.
+            in_memory (bool, default False): Whether to copy the data in-memory.
+        Returns:
+             kwargs to build a single Dataset instance.
+        Raises:
+            ValueError: if the instructions do not select any file.
+        """
+        files = self.get_file_instructions(name, instructions, split_infos)
+        if not files:
+            msg = f'Instruction "{instructions}" corresponds to no data!'
+            raise ValueError(msg)
+        return self.read_files(files=files, original_instructions=instructions, in_memory=in_memory)
+    def read_files(
+        self,
+        files: list[dict],
+        original_instructions: Union[None, "ReadInstruction", "Split"] = None,
+        in_memory=False,
+    ):
+        """Returns single Dataset instance for the set of file instructions.
+        Args:
+            files: List[dict(filename, skip, take)], the files information.
+                The filenames contain the relative path, not absolute.
+                skip/take indicates which example read in the file: `ds.skip().take()`
+            original_instructions: store the original instructions used to build the dataset split in the dataset.
+            in_memory (bool, default False): Whether to copy the data in-memory.
+        Returns:
+            kwargs to build a Dataset instance.
+        """
+        # The path is prepended to each filename inside `_read_files`
+        pa_table = self._read_files(files, in_memory=in_memory)
+        # If original_instructions is not None, convert it to a human-readable NamedSplit
+        if original_instructions is not None:
+            from .splits import Split  # noqa
+            split = Split(str(original_instructions))
+        else:
+            split = None
+        dataset_kwargs = {"arrow_table": pa_table, "info": self._info, "split": split}
+        return dataset_kwargs
+class ArrowReader(BaseReader):
+    """
+    Build a Dataset object out of Instruction instance(s).
+    This Reader uses either memory mapping or file descriptors (in-memory) on arrow files.
+    """
+    def __init__(self, path: str, info: Optional["DatasetInfo"]):
+        """Initializes ArrowReader.
+        Args:
+            path (str): path where Arrow files are stored.
+            info (DatasetInfo): info about the dataset.
+        """
+        super().__init__(path, info)
+        self._filetype_suffix = "arrow"
+    def _get_table_from_filename(self, filename_skip_take, in_memory=False) -> Table:
+        """Returns a Dataset instance from given (filename, skip, take)."""
+        filename, skip, take = (
+            filename_skip_take["filename"],
+            filename_skip_take["skip"] if "skip" in filename_skip_take else None,
+            filename_skip_take["take"] if "take" in filename_skip_take else None,
+        )
+        table = ArrowReader.read_table(filename, in_memory=in_memory)
+        if take == -1:
+            take = len(table) - skip
+        # here we don't want to slice an empty table, or it may segfault
+        if skip is not None and take is not None and not (skip == 0 and take == len(table)):
+            table = table.slice(skip, take)
+        return table
+    @staticmethod
+    def read_table(filename, in_memory=False) -> Table:
+        """
+        Read table from file.
+        Args:
+            filename (str): File name of the table.
+            in_memory (bool, default=False): Whether to copy the data in-memory.
+        Returns:
+            pyarrow.Table
+        """
+        table_cls = InMemoryTable if in_memory else MemoryMappedTable
+        return table_cls.from_file(filename)
+class ParquetReader(BaseReader):
+    """
+    Build a Dataset object out of Instruction instance(s).
+    This Reader uses memory mapping on parquet files.
+    """
+    def __init__(self, path: str, info: Optional["DatasetInfo"]):
+        """Initializes ParquetReader.
+        Args:
+            path (str): path where tfrecords are stored.
+            info (DatasetInfo): info about the dataset.
+        """
+        super().__init__(path, info)
+        self._filetype_suffix = "parquet"
+    def _get_table_from_filename(self, filename_skip_take, **kwargs):
+        """Returns a Dataset instance from given (filename, skip, take)."""
+        filename, skip, take = (
+            filename_skip_take["filename"],
+            filename_skip_take["skip"] if "skip" in filename_skip_take else None,
+            filename_skip_take["take"] if "take" in filename_skip_take else None,
+        )
+        # Parquet read_table always loads data in memory, independently of memory_map
+        pa_table = pq.read_table(filename, memory_map=True)
+        # here we don't want to slice an empty table, or it may segfault
+        if skip is not None and take is not None and not (skip == 0 and take == len(pa_table)):
+            pa_table = pa_table.slice(skip, take)
+        return pa_table
+@dataclass(frozen=True)
+class _AbsoluteInstruction:
+    """A machine friendly slice: defined absolute positive boundaries.
+    Built by `_rel_to_abs_instr`, which clamps both boundaries to [0, num_examples].
+    """
+    splitname: str  # name of the split the slice applies to.
+    from_: int  # uint (starting index).
+    to: int  # uint (ending index).
+@dataclass(frozen=True)
+class _RelativeInstruction:
+    """Represents a single parsed slicing instruction, can use % and negatives."""
+    splitname: str
+    from_: Optional[int] = None  # int (starting index) or None if no lower boundary.
+    to: Optional[int] = None  # int (ending index) or None if no upper boundary.
+    unit: Optional[str] = None
+    rounding: Optional[str] = None
+    def __post_init__(self):
+        if self.unit is not None and self.unit not in ["%", "abs"]:
+            raise ValueError("unit must be either % or abs")
+        if self.rounding is not None and self.rounding not in ["closest", "pct1_dropremainder"]:
+            raise ValueError("rounding must be either closest or pct1_dropremainder")
+        if self.unit != "%" and self.rounding is not None:
+            raise ValueError("It is forbidden to specify rounding if not using percent slicing.")
+        if self.unit == "%" and self.from_ is not None and abs(self.from_) > 100:
+            raise ValueError("Percent slice boundaries must be > -100 and < 100.")
+        if self.unit == "%" and self.to is not None and abs(self.to) > 100:
+            raise ValueError("Percent slice boundaries must be > -100 and < 100.")
+        # Update via __dict__ due to instance being "frozen"
+        self.__dict__["rounding"] = "closest" if self.rounding is None and self.unit == "%" else self.rounding
+def _str_to_read_instruction(spec):
+    """Returns ReadInstruction for given string."""
+    res = _SUB_SPEC_RE.match(spec)
+    if not res:
+        raise ValueError(f"Unrecognized instruction format: {spec}")
+    unit = "%" if res.group("from_pct") or res.group("to_pct") else "abs"
+    return ReadInstruction(
+        split_name=res.group("split"),
+        rounding=res.group("rounding"),
+        from_=int(res.group("from")) if res.group("from") else None,
+        to=int(res.group("to")) if res.group("to") else None,
+        unit=unit,
+    )
+def _pct_to_abs_pct1(boundary, num_examples):
+    # Using math.trunc here, since -99.5% should give -99%, not -100%.
+    if num_examples < 100:
+        msg = (
+            'Using "pct1_dropremainder" rounding on a split with less than 100 '
+            "elements is forbidden: it always results in an empty dataset."
+        )
+        raise ValueError(msg)
+    return boundary * math.trunc(num_examples / 100.0)
+def _pct_to_abs_closest(boundary, num_examples):
+    return int(round(boundary * num_examples / 100.0))
+def _rel_to_abs_instr(rel_instr, name2len):
+    """Returns _AbsoluteInstruction instance for given RelativeInstruction.
+    Args:
+        rel_instr: RelativeInstruction instance.
+        name2len: dict {split_name: num_examples}.
+    """
+    pct_to_abs = _pct_to_abs_closest if rel_instr.rounding == "closest" else _pct_to_abs_pct1
+    split = rel_instr.splitname
+    if split not in name2len:
+        raise ValueError(f'Unknown split "{split}". Should be one of {list(name2len)}.')
+    num_examples = name2len[split]
+    from_ = rel_instr.from_
+    to = rel_instr.to
+    if rel_instr.unit == "%":
+        from_ = 0 if from_ is None else pct_to_abs(from_, num_examples)
+        to = num_examples if to is None else pct_to_abs(to, num_examples)
+    else:
+        from_ = 0 if from_ is None else from_
+        to = num_examples if to is None else to
+    if from_ < 0:
+        from_ = max(num_examples + from_, 0)
+    if to < 0:
+        to = max(num_examples + to, 0)
+    from_ = min(from_, num_examples)
+    to = min(to, num_examples)
+    return _AbsoluteInstruction(split, from_, to)
class ReadInstruction:
    """Reading instruction for a dataset.

    Examples::

      # The following lines are equivalent:
      ds = datasets.load_dataset('mnist', split='test[:33%]')
      ds = datasets.load_dataset('mnist', split=datasets.ReadInstruction.from_spec('test[:33%]'))
      ds = datasets.load_dataset('mnist', split=datasets.ReadInstruction('test', to=33, unit='%'))
      ds = datasets.load_dataset('mnist', split=datasets.ReadInstruction(
          'test', from_=0, to=33, unit='%'))

      # The following lines are equivalent:
      ds = datasets.load_dataset('mnist', split='test[:33%]+train[1:-1]')
      ds = datasets.load_dataset('mnist', split=datasets.ReadInstruction.from_spec(
          'test[:33%]+train[1:-1]'))
      ds = datasets.load_dataset('mnist', split=(
          datasets.ReadInstruction('test', to=33, unit='%') +
          datasets.ReadInstruction('train', from_=1, to=-1, unit='abs')))

      # The following lines are equivalent:
      ds = datasets.load_dataset('mnist', split='test[:33%](pct1_dropremainder)')
      ds = datasets.load_dataset('mnist', split=datasets.ReadInstruction.from_spec(
          'test[:33%](pct1_dropremainder)'))
      ds = datasets.load_dataset('mnist', split=datasets.ReadInstruction(
          'test', from_=0, to=33, unit='%', rounding="pct1_dropremainder"))

      # 10-fold validation:
      tests = datasets.load_dataset(
          'mnist',
          [datasets.ReadInstruction('train', from_=k, to=k+10, unit='%')
          for k in range(0, 100, 10)])
      trains = datasets.load_dataset(
          'mnist',
          [datasets.ReadInstruction('train', to=k, unit='%') + datasets.ReadInstruction('train', from_=k+10, unit='%')
          for k in range(0, 100, 10)])
    """

    def _init(self, relative_instructions):
        # Private initializer shared by __init__ and the factory classmethod.
        self._relative_instructions = relative_instructions

    @classmethod
    def _read_instruction_from_relative_instructions(cls, relative_instructions):
        """Returns ReadInstruction obj initialized with relative_instructions."""
        # __new__ bypasses the public __init__, whose signature is not convenient here.
        instance = cls.__new__(cls)
        instance._init(relative_instructions)  # pylint: disable=protected-access
        return instance

    def __init__(self, split_name, rounding=None, from_=None, to=None, unit=None):
        """Initialize ReadInstruction.

        Args:
            split_name (str): name of the split to read. Eg: 'train'.
            rounding (str, optional): The rounding behaviour to use when percent slicing is
                used. Ignored when slicing with absolute indices.
                Possible values:
                 - 'closest' (default): The specified percentages are rounded to the
                     closest value. Use this if you want specified percents to be as
                     much exact as possible.
                 - 'pct1_dropremainder': the specified percentages are treated as
                     multiple of 1%. Use this option if you want consistency. Eg:
                         len(5%) == 5 * len(1%).
                     Using this option, one might not be able to use the full set of
                     examples, if the number of those is not a multiple of 100.
            from_ (int): lower slicing boundary, see `to`.
            to (int): alternative way of specifying slicing boundaries. If any of
                {from_, to, unit} argument is used, slicing cannot be specified as
                string.
            unit (str): optional, one of:
                '%': to set the slicing unit as percents of the split size.
                'abs': to set the slicing unit as absolute numbers.
        """
        # This constructor is not the only entry point: the factory method
        # `_read_instruction_from_relative_instructions` skips it, so any
        # common initialization MUST live in `_init`.
        single_instruction = _RelativeInstruction(split_name, from_, to, unit, rounding)
        self._init([single_instruction])

    @classmethod
    def from_spec(cls, spec):
        """Creates a `ReadInstruction` instance out of a string spec.

        Args:
            spec (`str`):
                Split(s) + optional slice(s) to read + optional rounding
                if percents are used as the slicing unit. A slice can be specified,
                using absolute numbers (`int`) or percentages (`int`).

        Examples:

            ```
            test: test split.
            test + validation: test split + validation split.
            test[10:]: test split, minus its first 10 records.
            test[:10%]: first 10% records of test split.
            test[:20%](pct1_dropremainder): first 20% records, rounded with the pct1_dropremainder rounding.
            test[:-5%]+train[40%:60%]: first 95% of test + middle 20% of train.
            ```

        Returns:
            ReadInstruction instance.
        """
        spec = str(spec)  # `spec` may be a NamedSplit instance rather than a str.
        subs = _ADDITION_SEP_RE.split(spec)
        if not subs:
            raise ValueError(f"No instructions could be built out of {spec}")
        # Parse the first sub-spec, then append the remaining ones with `+`.
        combined = _str_to_read_instruction(subs[0])
        for sub in subs[1:]:
            combined = combined + _str_to_read_instruction(sub)
        return combined

    def to_spec(self):
        """Serialize the instruction back into its string spec form."""
        pieces = []
        for rel in self._relative_instructions:
            piece = rel.splitname
            if rel.from_ is None and rel.to is None:
                # Whole split: nothing to append after the split name.
                pieces.append(piece)
                continue
            # Only the percent unit is spelled out; absolute indices have no suffix.
            suffix = rel.unit if rel.unit == "%" else ""
            lower = "" if rel.from_ is None else str(rel.from_) + suffix
            upper = "" if rel.to is None else str(rel.to) + suffix
            piece += f"[{lower}:{upper}]"
            # The default 'closest' rounding is implicit in a spec string.
            if suffix == "%" and rel.rounding is not None and rel.rounding != "closest":
                piece += f"({rel.rounding})"
            pieces.append(piece)
        return "+".join(pieces)

    def __add__(self, other):
        """Returns a new ReadInstruction obj, result of appending other to self."""
        if not isinstance(other, ReadInstruction):
            raise TypeError("ReadInstruction can only be added to another ReadInstruction obj.")
        mine = self._relative_instructions
        theirs = other._relative_instructions  # pylint: disable=protected-access
        # Rounding is only compared between the first instruction of each side,
        # and only when neither of them uses absolute indices.
        if mine[0].unit != "abs" and theirs[0].unit != "abs":
            if mine[0].rounding != theirs[0].rounding:
                raise ValueError("It is forbidden to sum ReadInstruction instances with different rounding values.")
        return self._read_instruction_from_relative_instructions(mine + theirs)

    def __str__(self):
        return self.to_spec()

    def __repr__(self):
        return f"ReadInstruction({self._relative_instructions})"

    def to_absolute(self, name2len):
        """Translate instruction into a list of absolute instructions.

        Those absolute instructions are then to be added together.

        Args:
            name2len (`dict`):
                Associating split names to number of examples.

        Returns:
            list of _AbsoluteInstruction instances (corresponds to the + in spec).
        """
        absolute_instructions = []
        for rel in self._relative_instructions:
            absolute_instructions.append(_rel_to_abs_instr(rel, name2len))
        return absolute_instructions

datasets/arrow_writer.py ADDED Viewed

	@@ -0,0 +1,766 @@

+# Copyright 2020 The HuggingFace Datasets Authors and the TensorFlow Datasets Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Lint as: python3
+"""To write records into Parquet files."""
+import json
+import sys
+from collections.abc import Iterable
+from typing import Any, Optional, Union
+import fsspec
+import numpy as np
+import pyarrow as pa
+import pyarrow.parquet as pq
+from fsspec.core import url_to_fs
+from . import config
+from .features import Audio, Features, Image, Pdf, Value, Video
+from .features.features import (
+    FeatureType,
+    List,
+    _ArrayXDExtensionType,
+    _visit,
+    cast_to_python_objects,
+    generate_from_arrow_type,
+    get_nested_type,
+    list_of_np_array_to_pyarrow_listarray,
+    numpy_to_pyarrow_listarray,
+    to_pyarrow_listarray,
+)
+from .filesystems import is_remote_filesystem
+from .info import DatasetInfo
+from .keyhash import DuplicatedKeysError, KeyHasher
+from .table import array_cast, cast_array_to_feature, embed_table_storage, table_cast
+from .utils import logging
+from .utils.py_utils import asdict, convert_file_size_to_int, first_non_null_non_empty_value
# Module-level logger, named after this module.
logger = logging.get_logger(__name__)
# Alias for the builtin `type`: several callables in this module take a
# parameter that is itself named `type`, which shadows the builtin locally.
type_ = type  # keep python's type function
def get_arrow_writer_batch_size_from_features(features: Optional[Features]) -> Optional[int]:
    """
    Get the writer_batch_size that defines the maximum record batch size in the arrow files based on configuration values.
    The limits are read from `config.ARROW_RECORD_BATCH_SIZE_FOR_{IMAGE,AUDIO,VIDEO,BINARY}_DATASETS`;
    a limit set to `None` is ignored.
    This allows to avoid overflows in arrow buffers.

    Args:
        features (`datasets.Features` or `None`):
            Dataset Features from `datasets`.

    Returns:
        writer_batch_size (`Optional[int]`):
            Writer batch size to pass to a dataset builder: the smallest limit among the
            Image / Audio / Video / binary `Value` features found in `features`.
            If `None`, then it will use the `datasets` default, i.e. `datasets.config.DEFAULT_MAX_BATCH_SIZE`.
    """
    if not features:
        return None

    # Every applicable limit is collected here; an empty list means "no limit",
    # which avoids comparing a float sentinel by identity.
    limits: list = []

    def set_batch_size(feature: FeatureType) -> None:
        if isinstance(feature, Image) and config.ARROW_RECORD_BATCH_SIZE_FOR_IMAGE_DATASETS is not None:
            limits.append(config.ARROW_RECORD_BATCH_SIZE_FOR_IMAGE_DATASETS)
        elif isinstance(feature, Audio) and config.ARROW_RECORD_BATCH_SIZE_FOR_AUDIO_DATASETS is not None:
            limits.append(config.ARROW_RECORD_BATCH_SIZE_FOR_AUDIO_DATASETS)
        elif isinstance(feature, Video) and config.ARROW_RECORD_BATCH_SIZE_FOR_VIDEO_DATASETS is not None:
            limits.append(config.ARROW_RECORD_BATCH_SIZE_FOR_VIDEO_DATASETS)
        elif (
            isinstance(feature, Value)
            and feature.dtype == "binary"
            and config.ARROW_RECORD_BATCH_SIZE_FOR_BINARY_DATASETS is not None
        ):
            limits.append(config.ARROW_RECORD_BATCH_SIZE_FOR_BINARY_DATASETS)

    # `_visit` calls `set_batch_size` on the (nested) features.
    _visit(features, set_batch_size)
    return min(limits) if limits else None
def get_writer_batch_size_from_features(features: Optional[Features]) -> Optional[int]:
    """
    Get the writer_batch_size that defines the maximum row group size in the parquet files based on configuration values.
    The limits are read from `config.PARQUET_ROW_GROUP_SIZE_FOR_{IMAGE,AUDIO,VIDEO,BINARY}_DATASETS`;
    a limit set to `None` is ignored.
    This allows to optimize random access to parquet file, since accessing 1 row requires
    to read its entire row group.

    Args:
        features (`datasets.Features` or `None`):
            Dataset Features from `datasets`.

    Returns:
        writer_batch_size (`Optional[int]`):
            Writer batch size to pass to a parquet writer: the smallest limit among the
            Image / Audio / Video / binary `Value` features found in `features`.
            If `None`, then it will use the `datasets` default, i.e. aiming for row groups of 100MB.
    """
    if not features:
        return None

    # Every applicable limit is collected here; an empty list means "no limit",
    # which avoids comparing a float sentinel by identity.
    limits: list = []

    def set_batch_size(feature: FeatureType) -> None:
        if isinstance(feature, Image) and config.PARQUET_ROW_GROUP_SIZE_FOR_IMAGE_DATASETS is not None:
            limits.append(config.PARQUET_ROW_GROUP_SIZE_FOR_IMAGE_DATASETS)
        elif isinstance(feature, Audio) and config.PARQUET_ROW_GROUP_SIZE_FOR_AUDIO_DATASETS is not None:
            limits.append(config.PARQUET_ROW_GROUP_SIZE_FOR_AUDIO_DATASETS)
        elif isinstance(feature, Video) and config.PARQUET_ROW_GROUP_SIZE_FOR_VIDEO_DATASETS is not None:
            limits.append(config.PARQUET_ROW_GROUP_SIZE_FOR_VIDEO_DATASETS)
        elif (
            isinstance(feature, Value)
            and feature.dtype == "binary"
            and config.PARQUET_ROW_GROUP_SIZE_FOR_BINARY_DATASETS is not None
        ):
            limits.append(config.PARQUET_ROW_GROUP_SIZE_FOR_BINARY_DATASETS)

    # `_visit` calls `set_batch_size` on the (nested) features.
    _visit(features, set_batch_size)
    return min(limits) if limits else None
def get_writer_batch_size_from_data_size(num_rows: int, num_bytes: int) -> int:
    """
    Get the writer_batch_size that defines the maximum row group size in the parquet files.
    The target is row groups of at most `config.MAX_ROW_GROUP_SIZE` uncompressed bytes.
    This allows to optimize random access to parquet file, since accessing 1 row requires
    to read its entire row group.

    This can be improved to get optimized size for querying/iterating
    but at least it matches the dataset viewer expectations on HF.

    Args:
        num_rows (`int`):
            Number of rows in the dataset.
        num_bytes (`int`):
            Number of bytes in the dataset.
            For dataset with external files to embed (image, audio, videos), this can also be an
            estimate from `dataset._estimate_nbytes()`.

    Returns:
        writer_batch_size (`int`):
            Writer batch size to pass to a parquet writer: `1` when `num_bytes` is not
            positive, otherwise at least `10`.
    """
    if not num_bytes > 0:
        # No size information to scale from.
        return 1
    max_row_group_bytes = convert_file_size_to_int(config.MAX_ROW_GROUP_SIZE)
    # Rows per group = rows * (bytes per group / total bytes), floored, never below 10.
    rows_per_group = num_rows * max_row_group_bytes // num_bytes
    return max(10, rows_per_group)
class SchemaInferenceError(ValueError):
    """`ValueError` subclass for schema inference failures.

    NOTE(review): the raise sites are outside this part of the file; confirm
    the exact conditions there.
    """
class TypedSequence:
    """
    This data container generalizes the typing when instantiating pyarrow arrays, tables or batches.

    More specifically it adds several features:
    - Support extension types like ``datasets.features.Array2DExtensionType``:
        By default pyarrow arrays don't return extension arrays. One has to call
        ``pa.ExtensionArray.from_storage(type, pa.array(data, type.storage_type))``
        in order to get an extension array.
    - Support for ``try_type`` parameter that can be used instead of ``type``:
        When an array is transformed, we like to keep the same type as before if possible.
        For example when calling :func:`datasets.Dataset.map`, we don't want to change the type
        of each column by default.
    - Better error message when a pyarrow array overflows.

    Example::

        from datasets.features import Array2D, Array2DExtensionType, Value
        from datasets.arrow_writer import TypedSequence
        import pyarrow as pa

        arr = pa.array(TypedSequence([1, 2, 3], type=Value("int32")))
        assert arr.type == pa.int32()

        arr = pa.array(TypedSequence([1, 2, 3], try_type=Value("int32")))
        assert arr.type == pa.int32()

        arr = pa.array(TypedSequence(["foo", "bar"], try_type=Value("int32")))
        assert arr.type == pa.string()

        arr = pa.array(TypedSequence([[[1, 2, 3]]], type=Array2D((1, 3), "int64")))
        assert arr.type == Array2DExtensionType((1, 3), "int64")

        table = pa.Table.from_pydict({
            "image": TypedSequence([[[1, 2, 3]]], type=Array2D((1, 3), "int64"))
        })
        assert table["image"].type == Array2DExtensionType((1, 3), "int64")
    """

    def __init__(
        self,
        data: Iterable,
        type: Optional[FeatureType] = None,
        try_type: Optional[FeatureType] = None,
        optimized_int_type: Optional[FeatureType] = None,
    ):
        """Store the data and the typing hints.

        Args:
            data: values to convert to an Arrow array.
            type: feature type the data must be cast to.
            try_type: feature type to attempt; ignored if the data is not compatible.
            optimized_int_type: smaller integer type to attempt, only when neither
                ``type`` nor ``try_type`` is given.

        Raises:
            ValueError: if both ``type`` and ``try_type`` are given.
        """
        # `type` and `try_type` are mutually exclusive.
        if not (type is None or try_type is None):
            raise ValueError("You cannot specify both type and try_type")
        self.data = data
        self.type = type
        self.try_type = try_type  # is ignored if it doesn't match the data
        self.optimized_int_type = optimized_int_type
        # Flags describing which "soft" conversions will be attempted.
        self.trying_type = try_type is not None
        self.trying_int_optimization = not (
            optimized_int_type is None or type is not None or try_type is not None
        )
        # Filled in once __arrow_array__() or get_inferred_type() has run.
        self._inferred_type = None

    def get_inferred_type(self) -> FeatureType:
        """Return the inferred feature type.

        This is done by converting the sequence to an Arrow array, and getting the corresponding
        feature type. The result is cached on the instance, so the conversion is skipped when a
        type has already been inferred.

        Returns:
            FeatureType: inferred feature type of the sequence.
        """
        if self._inferred_type is not None:
            return self._inferred_type
        arrow_type = pa.array(self).type
        self._inferred_type = generate_from_arrow_type(arrow_type)
        return self._inferred_type

    @staticmethod
    def _infer_custom_type_and_encode(data: Iterable) -> tuple[Iterable, Optional[FeatureType]]:
        """Implement type inference for custom objects like PIL.Image.Image -> Image type.

        This function is only used for custom python objects that can't be directly passed to build
        an Arrow array. In such cases it infers the feature type to use, and it encodes the data so
        that they can be passed to an Arrow array.

        Args:
            data (Iterable): array of data to infer the type, e.g. a list of PIL images.

        Returns:
            Tuple[Iterable, Optional[FeatureType]]: a tuple with:
                - the (possibly encoded) array, if the inferred feature type requires encoding
                - the inferred feature type if the array is made of supported custom objects
                    (PIL images, pdfplumber PDFs, or lists of those), else None.
        """
        # Only look for PIL objects if PIL has already been imported by the caller.
        if config.PIL_AVAILABLE and "PIL" in sys.modules:
            import PIL.Image

            _, sample = first_non_null_non_empty_value(data)
            if isinstance(sample, PIL.Image.Image):
                encoded = [None if value is None else Image().encode_example(value) for value in data]
                return encoded, Image()
            if isinstance(sample, list) and isinstance(sample[0], PIL.Image.Image):
                encoded = [
                    None if value is None else [Image().encode_example(x) for x in value] for value in data
                ]
                return encoded, List(Image())
        # Same logic for pdfplumber documents.
        if config.PDFPLUMBER_AVAILABLE and "pdfplumber" in sys.modules:
            import pdfplumber

            _, sample = first_non_null_non_empty_value(data)
            if isinstance(sample, pdfplumber.pdf.PDF):
                encoded = [None if value is None else Pdf().encode_example(value) for value in data]
                return encoded, Pdf()
            if isinstance(sample, list) and isinstance(sample[0], pdfplumber.pdf.PDF):
                encoded = [
                    None if value is None else [Pdf().encode_example(x) for x in value] for value in data
                ]
                return encoded, List(Pdf())
        return data, None

    def __arrow_array__(self, type: Optional[pa.DataType] = None):
        """This function is called when calling pa.array(typed_sequence)"""
        if type is not None:
            raise ValueError("TypedSequence is supposed to be used with pa.array(typed_sequence, type=None)")
        del type  # make sure we don't use it
        data = self.data
        # Automatic type inference for custom objects, only when the user gave no type at all.
        if self.type is None and self.try_type is None:
            data, self._inferred_type = self._infer_custom_type_and_encode(data)
        # An inferred type wins; otherwise use try_type (if trying) or type.
        if self._inferred_type is not None:
            feature_type = self._inferred_type
        elif self.trying_type:
            feature_type = self.try_type
        else:
            feature_type = self.type
        pa_type = None if feature_type is None else get_nested_type(feature_type)
        optimized_int_pa_type = (
            None if self.optimized_int_type is None else get_nested_type(self.optimized_int_type)
        )
        # Remembers whether the generic python-object path was used, for error recovery below.
        trying_cast_to_python_objects = False
        try:
            # Custom pyarrow extension types are built from their storage array.
            if isinstance(pa_type, _ArrayXDExtensionType):
                storage = to_pyarrow_listarray(data, pa_type)
                return pa.ExtensionArray.from_storage(pa_type, storage)

            # Efficient numpy -> pyarrow conversions when possible.
            if isinstance(data, np.ndarray):
                out = numpy_to_pyarrow_listarray(data)
            elif isinstance(data, list) and data and isinstance(first_non_null_non_empty_value(data)[1], np.ndarray):
                out = list_of_np_array_to_pyarrow_listarray(data)
            else:
                trying_cast_to_python_objects = True
                out = pa.array(cast_to_python_objects(data, only_1d_for_numpy=True))

            if self.trying_int_optimization:
                # Use smaller integer precisions if possible (int64, list<int64>, list<list<int64>>).
                out_type = out.type
                if pa.types.is_int64(out_type):
                    out = out.cast(optimized_int_pa_type)
                elif pa.types.is_list(out_type):
                    item_type = out_type.value_type
                    if pa.types.is_int64(item_type):
                        out = array_cast(out, pa.list_(optimized_int_pa_type))
                    elif pa.types.is_list(item_type) and pa.types.is_int64(item_type.value_type):
                        out = array_cast(out, pa.list_(pa.list_(optimized_int_pa_type)))
            elif feature_type is not None:
                # cast_array_to_feature supports casting to custom types like Audio and Image.
                # When only *trying* a type, integers or floats must not be converted to "string";
                # that conversion is allowed only if the user explicitly asked for the type.
                allow_to_str = not self.trying_type
                out = cast_array_to_feature(
                    out, feature_type, allow_primitive_to_str=allow_to_str, allow_decimal_to_str=allow_to_str
                )
            return out
        except (
            TypeError,
            pa.lib.ArrowInvalid,
            pa.lib.ArrowNotImplementedError,
        ) as e:  # handle type errors and overflows
            if not self.trying_type:
                # ArrowNotImplementedError is only tolerated when it comes from trying a type.
                if isinstance(e, pa.lib.ArrowNotImplementedError):
                    raise
                error_text = str(e)
                if "overflow" in error_text:
                    raise OverflowError(
                        f"There was an overflow with type {type_(data)}. Try to reduce writer_batch_size to have batches smaller than 2GB.\n({e})"
                    ) from None
                if self.trying_int_optimization and "not in range" in error_text:
                    optimized_int_pa_type_str = np.dtype(optimized_int_pa_type.to_pandas_dtype()).name
                    logger.info(f"Failed to cast a sequence to {optimized_int_pa_type_str}. Falling back to int64.")
                    return out
                if trying_cast_to_python_objects and "Could not convert" in error_text:
                    # Retry without the optimized list casting.
                    out = pa.array(cast_to_python_objects(data, only_1d_for_numpy=True, optimize_list_casting=False))
                    if feature_type is not None:
                        out = cast_array_to_feature(
                            out, feature_type, allow_primitive_to_str=True, allow_decimal_to_str=True
                        )
                    return out
                raise

            # try_type did not fit the data: second chance without applying any type.
            try:
                if isinstance(data, np.ndarray):
                    return numpy_to_pyarrow_listarray(data)
                elif isinstance(data, list) and data and any(isinstance(value, np.ndarray) for value in data):
                    return list_of_np_array_to_pyarrow_listarray(data)
                else:
                    trying_cast_to_python_objects = True
                    return pa.array(cast_to_python_objects(data, only_1d_for_numpy=True))
            except pa.lib.ArrowInvalid as retry_error:
                retry_text = str(retry_error)
                if "overflow" in retry_text:
                    raise OverflowError(
                        f"There was an overflow with type {type_(data)}. Try to reduce writer_batch_size to have batches smaller than 2GB.\n({retry_error})"
                    ) from None
                elif self.trying_int_optimization and "not in range" in retry_text:
                    optimized_int_pa_type_str = np.dtype(optimized_int_pa_type.to_pandas_dtype()).name
                    logger.info(
                        f"Failed to cast a sequence to {optimized_int_pa_type_str}. Falling back to int64."
                    )
                    return out
                elif trying_cast_to_python_objects and "Could not convert" in retry_text:
                    out = pa.array(
                        cast_to_python_objects(data, only_1d_for_numpy=True, optimize_list_casting=False)
                    )
                    if feature_type is not None:
                        out = cast_array_to_feature(
                            out, feature_type, allow_primitive_to_str=True, allow_decimal_to_str=True
                        )
                    return out
                else:
                    raise
class OptimizedTypedSequence(TypedSequence):
    """`TypedSequence` that picks a compact integer type for well-known column names.

    When neither ``type`` nor ``try_type`` is given, the integer type to try is looked
    up from ``col`` (``attention_mask``, ``special_tokens_mask``, ``input_ids``,
    ``token_type_ids``); unknown columns get no optimization.
    """

    def __init__(
        self,
        data,
        type: Optional[FeatureType] = None,
        try_type: Optional[FeatureType] = None,
        col: Optional[str] = None,
        optimized_int_type: Optional[FeatureType] = None,
    ):
        compact_int_type_by_col = {
            # binary tensor
            "attention_mask": Value("int8"),
            "special_tokens_mask": Value("int8"),
            # typical vocab size: 0-50k (max ~500k, never > 1M)
            "input_ids": Value("int32"),
            # binary mask; some (XLNetModel) use an additional token represented by a 2
            "token_type_ids": Value("int8"),
        }
        untyped = type is None and try_type is None
        if untyped:
            # NOTE(review): this replaces any `optimized_int_type` passed by the caller
            # with the column lookup (possibly None) - confirm this is intended.
            optimized_int_type = compact_int_type_by_col.get(col)
        super().__init__(data, type=type, try_type=try_type, optimized_int_type=optimized_int_type)
+class ArrowWriter:
+    """Shuffles and writes Examples to Arrow files."""
+    def __init__(
+        self,
+        schema: Optional[pa.Schema] = None,
+        features: Optional[Features] = None,
+        path: Optional[str] = None,
+        stream: Optional[pa.NativeFile] = None,
+        fingerprint: Optional[str] = None,
+        writer_batch_size: Optional[int] = None,
+        hash_salt: Optional[str] = None,
+        check_duplicates: Optional[bool] = False,
+        disable_nullable: bool = False,
+        update_features: bool = False,
+        with_metadata: bool = True,
+        unit: str = "examples",
+        embed_local_files: bool = False,
+        storage_options: Optional[dict] = None,
+    ):
+        if path is None and stream is None:
+            raise ValueError("At least one of path and stream must be provided.")
+        if features is not None:
+            self._features = features
+            self._schema = None
+        elif schema is not None:
+            self._schema: pa.Schema = schema
+            self._features = Features.from_arrow_schema(self._schema)
+        else:
+            self._features = None
+            self._schema = None
+        if hash_salt is not None:
+            # Create KeyHasher instance using split name as hash salt
+            self._hasher = KeyHasher(hash_salt)
+        else:
+            self._hasher = KeyHasher("")
+        self._check_duplicates = check_duplicates
+        self._disable_nullable = disable_nullable
+        if stream is None:
+            fs, path = url_to_fs(path, **(storage_options or {}))
+            self._fs: fsspec.AbstractFileSystem = fs
+            self._path = path if not is_remote_filesystem(self._fs) else self._fs.unstrip_protocol(path)
+            self.stream = self._fs.open(path, "wb")
+            self._closable_stream = True
+        else:
+            self._fs = None
+            self._path = None
+            self.stream = stream
+            self._closable_stream = False
+        self.fingerprint = fingerprint
+        self.disable_nullable = disable_nullable
+        self.writer_batch_size = (
+            writer_batch_size
+            or get_arrow_writer_batch_size_from_features(self._features)
+            or config.DEFAULT_MAX_BATCH_SIZE
+        )
+        self.update_features = update_features
+        self.with_metadata = with_metadata
+        self.unit = unit
+        self.embed_local_files = embed_local_files
+        self._num_examples = 0
+        self._num_bytes = 0
+        self.current_examples: list[tuple[dict[str, Any], str]] = []
+        self.current_rows: list[pa.Table] = []
+        self.pa_writer: Optional[pa.RecordBatchStreamWriter] = None
+        self.hkey_record = []
+    def __len__(self):
+        """Return the number of writed and staged examples"""
+        return self._num_examples + len(self.current_examples) + len(self.current_rows)
+    def __enter__(self):
+        return self
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.close()
+    def close(self):
+        # Try closing if opened; if closed: pyarrow.lib.ArrowInvalid: Invalid operation on closed file
+        if self.pa_writer:  # it might be None
+            try:
+                self.pa_writer.close()
+            except Exception:  # pyarrow.lib.ArrowInvalid, OSError
+                pass
+        if self._closable_stream and not self.stream.closed:
+            self.stream.close()  # This also closes self.pa_writer if it is opened
+    def _build_schema(self, inferred_schema: pa.Schema):
+        schema = self.schema
+        features = self._features
+        inferred_features = Features.from_arrow_schema(inferred_schema)
+        if self._features is not None:
+            if self.update_features:  # keep original features it they match, or update them
+                fields = {field.name: field for field in self._features.type}
+                for inferred_field in inferred_features.type:
+                    name = inferred_field.name
+                    if name in fields:
+                        if inferred_field == fields[name]:
+                            inferred_features[name] = self._features[name]
+                features = inferred_features
+                schema: pa.Schema = inferred_schema
+        else:
+            features = inferred_features
+            schema: pa.Schema = inferred_features.arrow_schema
+        if self.disable_nullable:
+            schema = pa.schema(pa.field(field.name, field.type, nullable=False) for field in schema)
+        if self.with_metadata:
+            schema = schema.with_metadata(self._build_metadata(DatasetInfo(features=features), self.fingerprint))
+        else:
+            schema = schema.with_metadata({})
+        return schema, features
+    def _build_writer(self, inferred_schema: pa.Schema):
+        self._schema, self._features = self._build_schema(inferred_schema)
+        self.pa_writer = pa.RecordBatchStreamWriter(self.stream, self._schema)
+    @property
+    def schema(self):
+        _schema = (
+            self._schema
+            if self._schema is not None
+            else (pa.schema(self._features.type) if self._features is not None else None)
+        )
+        if self._disable_nullable and _schema is not None:
+            _schema = pa.schema(pa.field(field.name, field.type, nullable=False) for field in _schema)
+        return _schema if _schema is not None else []
+    @staticmethod
+    def _build_metadata(info: DatasetInfo, fingerprint: Optional[str] = None) -> dict[str, str]:
+        info_keys = ["features"]  # we can add support for more DatasetInfo keys in the future
+        info_as_dict = asdict(info)
+        metadata = {}
+        metadata["info"] = {key: info_as_dict[key] for key in info_keys}
+        if fingerprint is not None:
+            metadata["fingerprint"] = fingerprint
+        return {"huggingface": json.dumps(metadata)}
+    def write_examples_on_file(self):
+        """Write stored examples from the write-pool of examples. It makes a table out of the examples and write it."""
+        if not self.current_examples:
+            return
+        # preserve the order the columns
+        if self.schema:
+            schema_cols = set(self.schema.names)
+            examples_cols = self.current_examples[0][0].keys()  # .keys() preserves the order (unlike set)
+            common_cols = [col for col in self.schema.names if col in examples_cols]
+            extra_cols = [col for col in examples_cols if col not in schema_cols]
+            cols = common_cols + extra_cols
+        else:
+            cols = list(self.current_examples[0][0])
+        batch_examples = {}
+        for col in cols:
+            # We use row[0][col] since current_examples contains (example, key) tuples.
+            # Moreover, examples could be Arrow arrays of 1 element.
+            # This can happen in `.map()` when we want to re-write the same Arrow data
+            if all(isinstance(row[0][col], (pa.Array, pa.ChunkedArray)) for row in self.current_examples):
+                arrays = [row[0][col] for row in self.current_examples]
+                arrays = [
+                    chunk
+                    for array in arrays
+                    for chunk in (array.chunks if isinstance(array, pa.ChunkedArray) else [array])
+                ]
+                batch_examples[col] = pa.concat_arrays(arrays)
+            else:
+                batch_examples[col] = [
+                    row[0][col].to_pylist()[0] if isinstance(row[0][col], (pa.Array, pa.ChunkedArray)) else row[0][col]
+                    for row in self.current_examples
+                ]
+        self.write_batch(batch_examples=batch_examples)
+        self.current_examples = []
+    def write_rows_on_file(self):
+        """Write stored rows from the write-pool of rows. It concatenates the single-row tables and it writes the resulting table."""
+        if not self.current_rows:
+            return
+        table = pa.concat_tables(self.current_rows)
+        self.write_table(table)
+        self.current_rows = []
+    def write(
+        self,
+        example: dict[str, Any],
+        key: Optional[Union[str, int, bytes]] = None,
+        writer_batch_size: Optional[int] = None,
+    ):
+        """Add a given (Example,Key) pair to the write-pool of examples which is written to file.
+        Args:
+            example: the Example to add.
+            key: Optional, a unique identifier(str, int or bytes) associated with each example
+        """
+        # Utilize the keys and duplicate checking when `self._check_duplicates` is passed True
+        if self._check_duplicates:
+            # Create unique hash from key and store as (key, example) pairs
+            hash = self._hasher.hash(key)
+            self.current_examples.append((example, hash))
+            # Maintain record of keys and their respective hashes for checking duplicates
+            self.hkey_record.append((hash, key))
+        else:
+            # Store example as a tuple so as to keep the structure of `self.current_examples` uniform
+            self.current_examples.append((example, ""))
+        if writer_batch_size is None:
+            writer_batch_size = self.writer_batch_size
+        if writer_batch_size is not None and len(self.current_examples) >= writer_batch_size:
+            if self._check_duplicates:
+                self.check_duplicate_keys()
+                # Re-initializing to empty list for next batch
+                self.hkey_record = []
+            self.write_examples_on_file()
+    def check_duplicate_keys(self):
+        """Raises error if duplicates found in a batch"""
+        tmp_record = set()
+        for hash, key in self.hkey_record:
+            if hash in tmp_record:
+                duplicate_key_indices = [
+                    str(self._num_examples + index)
+                    for index, (duplicate_hash, _) in enumerate(self.hkey_record)
+                    if duplicate_hash == hash
+                ]
+                raise DuplicatedKeysError(key, duplicate_key_indices)
+            else:
+                tmp_record.add(hash)
+    def write_row(self, row: pa.Table, writer_batch_size: Optional[int] = None):
+        """Add a given single-row Table to the write-pool of rows which is written to file.
+        Args:
+            row: the row to add.
+        """
+        if len(row) != 1:
+            raise ValueError(f"Only single-row pyarrow tables are allowed but got table with {len(row)} rows.")
+        self.current_rows.append(row)
+        if writer_batch_size is None:
+            writer_batch_size = self.writer_batch_size
+        if writer_batch_size is not None and len(self.current_rows) >= writer_batch_size:
+            self.write_rows_on_file()
+    def write_batch(
+        self,
+        batch_examples: dict[str, list],
+        writer_batch_size: Optional[int] = None,
+        try_original_type: Optional[bool] = True,
+    ):
+        """Write a batch of Example to file.
+        Ignores the batch if it appears to be empty,
+        preventing a potential schema update of unknown types.
+        Args:
+            batch_examples: the batch of examples to add.
+            writer_batch_size: forwarded unchanged to `write_table`.
+            try_original_type: use `try_type` when instantiating OptimizedTypedSequence if `True`, otherwise `try_type = None`.
+        """
+        # Only the first column is inspected to decide whether the batch is empty.
+        if batch_examples and len(next(iter(batch_examples.values()))) == 0:
+            return
+        # While no writer exists yet and `update_features` is set, the declared features are
+        # only passed as `try_type` hints; otherwise they are passed as the enforced `type`.
+        features = None if self.pa_writer is None and self.update_features else self._features
+        try_features = self._features if self.pa_writer is None and self.update_features else None
+        arrays = []
+        inferred_features = Features()
+        # preserve the order of the columns
+        if self.schema:
+            schema_cols = set(self.schema.names)
+            batch_cols = batch_examples.keys()  # .keys() preserves the order (unlike set)
+            # Schema columns first (in schema order), then the columns only found in the batch.
+            common_cols = [col for col in self.schema.names if col in batch_cols]
+            extra_cols = [col for col in batch_cols if col not in schema_cols]
+            cols = common_cols + extra_cols
+        else:
+            cols = list(batch_examples)
+        for col in cols:
+            col_values = batch_examples[col]
+            col_type = features[col] if features else None
+            if isinstance(col_values, (pa.Array, pa.ChunkedArray)):
+                # Already Arrow data: cast it to the declared feature if there is one, else keep it as is.
+                array = cast_array_to_feature(col_values, col_type) if col_type is not None else col_values
+                arrays.append(array)
+                inferred_features[col] = generate_from_arrow_type(col_values.type)
+            else:
+                col_try_type = (
+                    try_features[col]
+                    if try_features is not None and col in try_features and try_original_type
+                    else None
+                )
+                typed_sequence = OptimizedTypedSequence(col_values, type=col_type, try_type=col_try_type, col=col)
+                arrays.append(pa.array(typed_sequence))
+                inferred_features[col] = typed_sequence.get_inferred_type()
+        # Without a writer the table carries the inferred schema (`write_table` builds the writer
+        # from the table's schema); once a writer exists the table uses the writer's schema.
+        schema = inferred_features.arrow_schema if self.pa_writer is None else self.schema
+        pa_table = pa.Table.from_arrays(arrays, schema=schema)
+        self.write_table(pa_table, writer_batch_size)
+    def write_table(self, pa_table: pa.Table, writer_batch_size: Optional[int] = None):
+        """Write a Table to file.
+        Args:
+            example: the Table to add.
+        """
+        if writer_batch_size is None:
+            writer_batch_size = self.writer_batch_size
+        if self.pa_writer is None:
+            self._build_writer(inferred_schema=pa_table.schema)
+        pa_table = pa_table.combine_chunks()
+        pa_table = table_cast(pa_table, self._schema)
+        if self.embed_local_files:
+            pa_table = embed_table_storage(pa_table)
+        self._num_bytes += pa_table.nbytes
+        self._num_examples += pa_table.num_rows
+        self.pa_writer.write_table(pa_table, writer_batch_size)
+    def finalize(self, close_stream=True):
+        self.write_rows_on_file()
+        # In case current_examples < writer_batch_size, but user uses finalize()
+        if self._check_duplicates:
+            self.check_duplicate_keys()
+            # Re-initializing to empty list for next batch
+            self.hkey_record = []
+        self.write_examples_on_file()
+        # If schema is known, infer features even if no examples were written
+        if self.pa_writer is None and self.schema:
+            self._build_writer(self.schema)
+        if self.pa_writer is not None:
+            self.pa_writer.close()
+            self.pa_writer = None
+            if close_stream:
+                self.stream.close()
+        else:
+            if close_stream:
+                self.stream.close()
+            raise SchemaInferenceError("Please pass `features` or at least one example when writing data")
+        logger.debug(
+            f"Done writing {self._num_examples} {self.unit} in {self._num_bytes} bytes {self._path if self._path else ''}."
+        )
+        return self._num_examples, self._num_bytes
+class ParquetWriter(ArrowWriter):
+    def __init__(self, *args, use_content_defined_chunking=True, write_page_index=True, **kwargs):
+        super().__init__(*args, **kwargs)
+        if use_content_defined_chunking is True:
+            use_content_defined_chunking = config.DEFAULT_CDC_OPTIONS
+        self.use_content_defined_chunking = use_content_defined_chunking
+        self.write_page_index = write_page_index
+    def _build_writer(self, inferred_schema: pa.Schema):
+        self._schema, self._features = self._build_schema(inferred_schema)
+        self.pa_writer = pq.ParquetWriter(
+            self.stream,
+            self._schema,
+            use_content_defined_chunking=self.use_content_defined_chunking,
+            write_page_index=self.write_page_index,
+        )
+        if self.use_content_defined_chunking is not False:
+            self.pa_writer.add_key_value_metadata(
+                {"content_defined_chunking": json.dumps(self.use_content_defined_chunking)}
+            )

datasets/builder.py ADDED Viewed

	@@ -0,0 +1,1866 @@

+# Copyright 2020 The HuggingFace Datasets Authors and the TensorFlow Datasets Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Lint as: python3
+"""DatasetBuilder base class."""
+import abc
+import contextlib
+import copy
+import inspect
+import os
+import posixpath
+import shutil
+import textwrap
+import time
+import urllib
+from collections.abc import Iterable, Mapping
+from dataclasses import dataclass
+from functools import partial
+from pathlib import Path
+from typing import TYPE_CHECKING, Optional, Union
+from unittest.mock import patch
+import fsspec
+from fsspec.core import url_to_fs
+from multiprocess import Pool
+from tqdm.contrib.concurrent import thread_map
+from . import config, utils
+from .arrow_dataset import Dataset
+from .arrow_reader import (
+    ArrowReader,
+    ReadInstruction,
+)
+from .arrow_writer import ArrowWriter, ParquetWriter, SchemaInferenceError
+from .data_files import DataFilesDict, DataFilesPatternsDict, sanitize_patterns
+from .dataset_dict import DatasetDict, IterableDatasetDict
+from .download.download_config import DownloadConfig
+from .download.download_manager import DownloadManager, DownloadMode
+from .download.streaming_download_manager import StreamingDownloadManager, xjoin
+from .exceptions import DatasetGenerationCastError, DatasetGenerationError, FileFormatError, ManualDownloadError
+from .features import Features
+from .filesystems import (
+    is_remote_filesystem,
+    rename,
+)
+from .fingerprint import Hasher
+from .info import DatasetInfo, PostProcessedInfo
+from .iterable_dataset import ArrowExamplesIterable, ExamplesIterable, IterableDataset
+from .keyhash import DuplicatedKeysError
+from .naming import INVALID_WINDOWS_CHARACTERS_IN_PATH, camelcase_to_snakecase
+from .splits import Split, SplitDict, SplitGenerator, SplitInfo
+from .streaming import extend_dataset_builder_for_streaming
+from .table import CastError
+from .utils import logging
+from .utils import tqdm as hf_tqdm
+from .utils._filelock import FileLock
+from .utils.file_utils import is_remote_url
+from .utils.info_utils import VerificationMode, get_size_checksum_dict, verify_checksums, verify_splits
+from .utils.py_utils import (
+    classproperty,
+    convert_file_size_to_int,
+    has_sufficient_disk_space,
+    iflatmap_unordered,
+    map_nested,
+    memoize,
+    size_str,
+    temporary_assignment,
+)
+from .utils.sharding import _number_of_shards_in_gen_kwargs, _split_gen_kwargs
+from .utils.track import tracked_list
+if TYPE_CHECKING:
+    from .load import DatasetModule
+logger = logging.get_logger(__name__)
+class InvalidConfigName(ValueError):
+    """Raised by `BuilderConfig.__post_init__` when the config name contains a character from `INVALID_WINDOWS_CHARACTERS_IN_PATH`."""
+    pass
+@dataclass
+class BuilderConfig:
+    """Base class for `DatasetBuilder` data configuration.
+    `DatasetBuilder` subclasses with data configuration options should subclass
+    `BuilderConfig` and add their own properties.
+    Attributes:
+        name (`str`, defaults to `default`):
+            The name of the configuration.
+        version (`Version` or `str`, defaults to `0.0.0`):
+            The version of the configuration.
+        data_dir (`str`, *optional*):
+            Path to the directory containing the source data.
+        data_files (`str` or `Sequence` or `Mapping`, *optional*):
+            Path(s) to source data file(s).
+        description (`str`, *optional*):
+            A human description of the configuration.
+    """
+    name: str = "default"
+    version: Optional[Union[utils.Version, str]] = utils.Version("0.0.0")
+    data_dir: Optional[str] = None
+    data_files: Optional[Union[DataFilesDict, DataFilesPatternsDict]] = None
+    description: Optional[str] = None
+    def __post_init__(self):
+        # The config name is used to name the cache directory.
+        for invalid_char in INVALID_WINDOWS_CHARACTERS_IN_PATH:
+            if invalid_char in self.name:
+                raise InvalidConfigName(
+                    f"Bad characters from black list '{INVALID_WINDOWS_CHARACTERS_IN_PATH}' found in '{self.name}'. "
+                    f"They could create issues when creating a directory for this config on Windows filesystem."
+                )
+        if self.data_files is not None and not isinstance(self.data_files, (DataFilesDict, DataFilesPatternsDict)):
+            raise ValueError(f"Expected a DataFilesDict in data_files but got {self.data_files}")
+    def __eq__(self, o):
+        # we need to override the default dataclass __eq__ since it doesn't check for
+        # other attributes that the ones of the signature.
+        if set(self.__dict__.keys()) != set(o.__dict__.keys()):
+            return False
+        return all((k, getattr(self, k)) == (k, getattr(o, k)) for k in self.__dict__.keys())
+    def create_config_id(
+        self,
+        config_kwargs: dict,
+        custom_features: Optional[Features] = None,
+    ) -> str:
+        """
+        The config id is used to build the cache directory.
+        By default it is equal to the config name.
+        However the name of a config is not sufficient to have a unique identifier for the dataset being generated
+        since it doesn't take into account:
+        - the config kwargs that can be used to overwrite attributes
+        - the custom features used to write the dataset
+        - the data_files for json/text/csv/pandas datasets
+        Therefore the config id is just the config name with an optional suffix based on these.
+        """
+        # Possibly add a suffix to the name to handle custom features/data_files/config_kwargs
+        suffix: Optional[str] = None
+        config_kwargs_to_add_to_suffix = config_kwargs.copy()
+        # name and version are already used to build the cache directory
+        config_kwargs_to_add_to_suffix.pop("name", None)
+        config_kwargs_to_add_to_suffix.pop("version", None)
+        # data dir handling (when specified it points to the manually downloaded data):
+        # it was previously ignored before the introduction of config id because we didn't want
+        # to change the config name. Now it's fine to take it into account for the config id.
+        # config_kwargs_to_add_to_suffix.pop("data_dir", None)
+        if "data_dir" in config_kwargs_to_add_to_suffix:
+            if config_kwargs_to_add_to_suffix["data_dir"] is None:
+                config_kwargs_to_add_to_suffix.pop("data_dir", None)
+            else:
+                # canonicalize the data dir to avoid two paths to the same location having different
+                # hashes
+                data_dir = config_kwargs_to_add_to_suffix["data_dir"]
+                data_dir = os.path.normpath(data_dir)
+                config_kwargs_to_add_to_suffix["data_dir"] = data_dir
+        if config_kwargs_to_add_to_suffix:
+            # we don't care about the order of the kwargs
+            config_kwargs_to_add_to_suffix = {
+                k: config_kwargs_to_add_to_suffix[k] for k in sorted(config_kwargs_to_add_to_suffix)
+            }
+            if all(isinstance(v, (str, bool, int, float)) for v in config_kwargs_to_add_to_suffix.values()):
+                suffix = ",".join(
+                    str(k) + "=" + urllib.parse.quote_plus(str(v)) for k, v in config_kwargs_to_add_to_suffix.items()
+                )
+                if len(suffix) > 32:  # hash if too long
+                    suffix = Hasher.hash(config_kwargs_to_add_to_suffix)
+            else:
+                suffix = Hasher.hash(config_kwargs_to_add_to_suffix)
+        if custom_features is not None:
+            m = Hasher()
+            if suffix:
+                m.update(suffix)
+            m.update(custom_features)
+            suffix = m.hexdigest()
+        if suffix:
+            config_id = self.name + "-" + suffix
+            if len(config_id) > config.MAX_DATASET_CONFIG_ID_READABLE_LENGTH:
+                config_id = self.name + "-" + Hasher.hash(suffix)
+            return config_id
+        else:
+            return self.name
+    def _resolve_data_files(self, base_path: str, download_config: DownloadConfig) -> None:
+        if isinstance(self.data_files, DataFilesPatternsDict):
+            base_path = xjoin(base_path, self.data_dir) if self.data_dir else base_path
+            self.data_files = self.data_files.resolve(base_path, download_config)
+class DatasetBuilder:
+    """Abstract base class for all datasets.
+    `DatasetBuilder` has 3 key methods:
+        - [`DatasetBuilder.info`]: Documents the dataset, including feature
+          names, types, shapes, version, splits, citation, etc.
+        - [`DatasetBuilder.download_and_prepare`]: Downloads the source data
+          and writes it to disk.
+        - [`DatasetBuilder.as_dataset`]: Generates a [`Dataset`].
+    Some `DatasetBuilder`s expose multiple variants of the
+    dataset by defining a [`BuilderConfig`] subclass and accepting a
+    config object (or name) on construction. Configurable datasets expose a
+    pre-defined set of configurations in [`DatasetBuilder.builder_configs`].
+    Args:
+        cache_dir (`str`, *optional*):
+            Directory to cache data. Defaults to `"~/.cache/huggingface/datasets"`.
+        dataset_name (`str`, *optional*):
+            Name of the dataset, if different from the builder name. Useful for packaged builders
+            like csv, imagefolder, audiofolder, etc. to reflect the difference between datasets
+            that use the same packaged builder.
+        config_name (`str`, *optional*):
+            Name of the dataset configuration.
+            It affects the data generated on disk. Different configurations will have their own subdirectories and
+            versions.
+            If not provided, the default configuration is used (if it exists).
+            <Added version="2.3.0">
+            Parameter `name` was renamed to `config_name`.
+            </Added>
+        hash (`str`, *optional*):
+            Hash specific to the dataset builder code. Used to update the caching directory when the
+            dataset builder code is updated (to avoid reusing old data).
+            The typical caching directory (defined in `self._relative_data_dir`) is `name/version/hash/`.
+        base_path (`str`, *optional*):
+            Base path for relative paths that are used to download files.
+            This can be a remote URL.
+        features ([`Features`], *optional*):
+            Features types to use with this dataset.
+            It can be used to change the [`Features`] types of a dataset, for example.
+        token (`str` or `bool`, *optional*):
+            String or boolean to use as Bearer token for remote files on the
+            Datasets Hub. If `True`, will get token from `"~/.huggingface"`.
+        repo_id (`str`, *optional*):
+            ID of the dataset repository.
+            Used to distinguish builders with the same name but not coming from the same namespace, for example "rajpurkar/squad"
+            and "lhoestq/squad" repo IDs. In the latter, the builder name would be "lhoestq___squad".
+        data_files (`str` or `Sequence` or `Mapping`, *optional*):
+            Path(s) to source data file(s).
+            For builders like "csv" or "json" that need the user to specify data files. They can be either
+            local or remote files. For convenience, you can use a `DataFilesDict`.
+        data_dir (`str`, *optional*):
+            Path to directory containing source data file(s).
+            Use only if `data_files` is not passed, in which case it is equivalent to passing
+            `os.path.join(data_dir, "**")` as `data_files`.
+            For builders that require manual download, it must be the path to the local directory containing the
+            manually downloaded data.
+        storage_options (`dict`, *optional*):
+            Key/value pairs to be passed on to the dataset file-system backend, if any.
+        writer_batch_size (`int`, *optional*):
+            Batch size used by the ArrowWriter.
+            It defines the number of samples that are kept in memory before writing them
+            and also the length of the arrow chunks.
+            None means that the ArrowWriter will use its default value.
+        **config_kwargs (additional keyword arguments): Keyword arguments to be passed to the corresponding builder
+            configuration class, set on the class attribute [`DatasetBuilder.BUILDER_CONFIG_CLASS`]. The builder
+            configuration class is [`BuilderConfig`] or a subclass of it.
+    """
+    # Default version
+    VERSION = None  # Default version set in BuilderConfig
+    # Class for the builder config.
+    BUILDER_CONFIG_CLASS = BuilderConfig
+    # Named configurations that modify the data generated by download_and_prepare.
+    BUILDER_CONFIGS = []
+    # Optional default config name to be used when name is None
+    DEFAULT_CONFIG_NAME = None
+    # Default batch size used by the ArrowWriter
+    # It defines the number of samples that are kept in memory before writing them
+    # and also the length of the arrow chunks
+    # None means that the ArrowWriter will use its default value
+    DEFAULT_WRITER_BATCH_SIZE = None
+    def __init__(
+        self,
+        cache_dir: Optional[str] = None,
+        dataset_name: Optional[str] = None,
+        config_name: Optional[str] = None,
+        hash: Optional[str] = None,
+        base_path: Optional[str] = None,
+        info: Optional[DatasetInfo] = None,
+        features: Optional[Features] = None,
+        token: Optional[Union[bool, str]] = None,
+        repo_id: Optional[str] = None,
+        data_files: Optional[Union[str, list, dict, DataFilesDict]] = None,
+        data_dir: Optional[str] = None,
+        storage_options: Optional[dict] = None,
+        writer_batch_size: Optional[int] = None,
+        config_id: Optional[str] = None,
+        **config_kwargs,
+    ):
+        """Set up the builder: config, dataset info and cache directories.
+        Most arguments are described in the class docstring. Additionally:
+            info: a `DatasetInfo` to use instead of the one returned by `self._info()`.
+            config_id: forwarded to `self._create_builder_config`.
+        """
+        # DatasetBuilder name
+        self.name: str = camelcase_to_snakecase(self.__module__.split(".")[-1])
+        self.hash: Optional[str] = hash
+        self.base_path = base_path
+        self.token = token
+        self.repo_id = repo_id
+        self.storage_options = storage_options or {}
+        self.dataset_name = camelcase_to_snakecase(dataset_name) if dataset_name else self.name
+        self._writer_batch_size = writer_batch_size or self.DEFAULT_WRITER_BATCH_SIZE
+        # Anything that is not already a DataFilesDict is treated as patterns and resolved here
+        if data_files is not None and not isinstance(data_files, DataFilesDict):
+            data_files = DataFilesDict.from_patterns(
+                sanitize_patterns(data_files),
+                base_path=base_path,
+                download_config=DownloadConfig(token=token, storage_options=self.storage_options),
+            )
+        # Prepare config: DatasetConfig contains name, version and description but can be extended by each dataset
+        # `features` is only forwarded to the config when the config class accepts it
+        if "features" in inspect.signature(self.BUILDER_CONFIG_CLASS.__init__).parameters and features is not None:
+            config_kwargs["features"] = features
+        if data_files is not None:
+            config_kwargs["data_files"] = data_files
+        if data_dir is not None:
+            config_kwargs["data_dir"] = data_dir
+        self.config_kwargs = config_kwargs
+        self.config, self.config_id = self._create_builder_config(
+            config_name=config_name,
+            custom_features=features,
+            config_id=config_id,
+            **config_kwargs,
+        )
+        # prepare info: DatasetInfo are a standardized dataclass across all datasets
+        # Prefill datasetinfo
+        if info is None:
+            info = self._info()
+        info.builder_name = self.name
+        info.dataset_name = self.dataset_name
+        info.config_name = self.config.name
+        info.version = self.config.version
+        self.info = info
+        # update info with user specified infos
+        if features is not None:
+            self.info.features = features
+        # Prepare data dirs:
+        # cache_dir can be a remote bucket on GCS or S3
+        self._cache_dir_root = str(cache_dir or config.HF_DATASETS_CACHE)
+        # `~` is only expanded for local paths
+        self._cache_dir_root = (
+            self._cache_dir_root if is_remote_url(self._cache_dir_root) else os.path.expanduser(self._cache_dir_root)
+        )
+        # Downloads go under the custom cache root when one was given, else to the configured default path
+        self._cache_downloaded_dir = (
+            posixpath.join(self._cache_dir_root, config.DOWNLOADED_DATASETS_DIR)
+            if cache_dir
+            else str(config.DOWNLOADED_DATASETS_PATH)
+        )
+        self._cache_downloaded_dir = (
+            self._cache_downloaded_dir
+            if is_remote_url(self._cache_downloaded_dir)
+            else os.path.expanduser(self._cache_downloaded_dir)
+        )
+        # In case there exists a legacy cache directory
+        self._legacy_relative_data_dir = None
+        self._cache_dir = self._build_cache_dir()
+        if not is_remote_url(self._cache_dir_root):
+            os.makedirs(self._cache_dir_root, exist_ok=True)
+            # The lock file lives in the cache root and is named after the flattened cache dir path
+            lock_path = os.path.join(
+                self._cache_dir_root, Path(self._cache_dir).as_posix().replace("/", "_") + ".lock"
+            )
+            with FileLock(lock_path):
+                if os.path.exists(self._cache_dir):  # check if data exist
+                    if len(os.listdir(self._cache_dir)) > 0:
+                        if os.path.exists(os.path.join(self._cache_dir, config.DATASET_INFO_FILENAME)):
+                            logger.debug("Overwrite dataset info from restored data version if exists.")
+                            self.info = DatasetInfo.from_directory(self._cache_dir)
+                    else:  # dir exists but no data, remove the empty dir as data aren't available anymore
+                        logger.warning(
+                            f"Old caching folder {self._cache_dir} for dataset {self.dataset_name} exists but no data were found. Removing it. "
+                        )
+                        os.rmdir(self._cache_dir)
+        # Store in the cache by default unless the user specifies a custom output_dir to download_and_prepare
+        self._output_dir = self._cache_dir
+        self._fs: fsspec.AbstractFileSystem = fsspec.filesystem("file")
+        # Set download manager
+        self.dl_manager = None
+        # Set to True by "datasets-cli test" to generate file checksums for (deprecated) dataset_infos.json independently of verification_mode value.
+        self._record_infos = False
+        # Set in `.download_and_prepare` once the format of the generated dataset is known
+        self._file_format = None
+        # Enable streaming (e.g. it patches "open" to work with remote files)
+        extend_dataset_builder_for_streaming(self)
+    def __getstate__(self):
+        return self.__dict__
+    def __setstate__(self, d):
+        """Restore the instance from the attribute dict `d` produced by `__getstate__`."""
+        self.__dict__ = d
+        # Re-enable streaming, since patched functions are not kept when pickling
+        extend_dataset_builder_for_streaming(self)
+    # Must be set for datasets that use 'data_dir' functionality - the ones
+    # that require users to do additional steps to download the data
+    # (this is usually due to some external regulations / rules).
+    # This field should contain a string with user instructions, including
+    # the list of files that should be present. It will be
+    # displayed in the dataset documentation.
+    @property
+    def manual_download_instructions(self) -> Optional[str]:
+        """User instructions for datasets that require a manual download; `None` in this base implementation."""
+        return None
+    def _check_legacy_cache(self) -> Optional[str]:
+        """Check for the old cache directory template {cache_dir}/{namespace}___{builder_name} from 2.13"""
+        if (
+            self.__module__.startswith("datasets.")
+            and not is_remote_url(self._cache_dir_root)
+            and self.config.name == "default"
+        ):
+            from .packaged_modules import _PACKAGED_DATASETS_MODULES
+            namespace = self.repo_id.split("/")[0] if self.repo_id and self.repo_id.count("/") > 0 else None
+            config_name = self.repo_id.replace("/", "--") if self.repo_id is not None else self.dataset_name
+            config_id = config_name + self.config_id[len(self.config.name) :]
+            hash = _PACKAGED_DATASETS_MODULES.get(self.name, "missing")[1]
+            legacy_relative_data_dir = posixpath.join(
+                self.dataset_name if namespace is None else f"{namespace}___{self.dataset_name}",
+                config_id,
+                "0.0.0",
+                hash,
+            )
+            legacy_cache_dir = posixpath.join(self._cache_dir_root, legacy_relative_data_dir)
+            if os.path.isdir(legacy_cache_dir):
+                return legacy_relative_data_dir
+    def _check_legacy_cache2(self, dataset_module: "DatasetModule") -> Optional[str]:
+        """Check for the old cache directory template {cache_dir}/{namespace}___{dataset_name}/{config_name}-xxx from 2.14 and 2.15"""
+        if (
+            self.__module__.startswith("datasets.")
+            and not is_remote_url(self._cache_dir_root)
+            and not (set(self.config_kwargs) - {"data_files", "data_dir"})
+        ):
+            from .packaged_modules import _PACKAGED_DATASETS_MODULES_2_15_HASHES
+            from .utils._dill import Pickler
+            def update_hash_with_config_parameters(hash: str, config_parameters: dict) -> str:
+                """
+                Used to update hash of packaged modules which is used for creating unique cache directories to reflect
+                different config parameters which are passed in metadata from readme.
+                """
+                params_to_exclude = {"config_name", "version", "description"}
+                params_to_add_to_hash = {
+                    param: value
+                    for param, value in sorted(config_parameters.items())
+                    if param not in params_to_exclude
+                }
+                m = Hasher()
+                m.update(hash)
+                m.update(params_to_add_to_hash)
+                return m.hexdigest()
+            namespace = self.repo_id.split("/")[0] if self.repo_id and self.repo_id.count("/") > 0 else None
+            with patch.object(Pickler, "_legacy_no_dict_keys_sorting", True):
+                config_id = self.config.name + "-" + Hasher.hash({"data_files": self.config.data_files})
+            hash = _PACKAGED_DATASETS_MODULES_2_15_HASHES.get(self.name, "missing")
+            if (
+                dataset_module.builder_configs_parameters.metadata_configs
+                and self.config.name in dataset_module.builder_configs_parameters.metadata_configs
+            ):
+                hash = update_hash_with_config_parameters(
+                    hash, dataset_module.builder_configs_parameters.metadata_configs[self.config.name]
+                )
+            legacy_relative_data_dir = posixpath.join(
+                self.dataset_name if namespace is None else f"{namespace}___{self.dataset_name}",
+                config_id,
+                "0.0.0",
+                hash,
+            )
+            legacy_cache_dir = posixpath.join(self._cache_dir_root, legacy_relative_data_dir)
+            if os.path.isdir(legacy_cache_dir):
+                return legacy_relative_data_dir
+    def _create_builder_config(
+        self, config_name=None, custom_features=None, config_id=None, **config_kwargs
+    ) -> tuple[BuilderConfig, str]:
+        """Create and validate BuilderConfig object as well as a unique config id for this config.
+        Raises ValueError if there are multiple builder configs, no config_kwargs are passed, and config_name and
+        DEFAULT_CONFIG_NAME are None.
+        config_kwargs override the defaults kwargs in config
+        Args:
+            config_name: Name of a pre-defined config to use, or name given to the config created on the fly.
+            custom_features: Forwarded to `BuilderConfig.create_config_id` when the config id is computed.
+            config_id: Config id to return as is; when `None` it is computed from the config and `config_kwargs`.
+            **config_kwargs: Attributes used to create the config, or to override the ones of a pre-defined config.
+        Returns:
+            The builder config and the config id.
+        Raises:
+            ValueError: If the config name is missing or unknown, if a kwarg is not an attribute of the
+                pre-defined config, if the config has no name, or, for a non-custom config id, if the config
+                differs from the pre-defined config of the same name or has no version.
+        """
+        builder_config = None
+        # try default config
+        if config_name is None and self.BUILDER_CONFIGS:
+            if self.DEFAULT_CONFIG_NAME is not None:
+                # NOTE(review): `.get` returns None if DEFAULT_CONFIG_NAME is not the name of one of the
+                # BUILDER_CONFIGS, in which case the log line below fails on `.name` - confirm this cannot happen
+                builder_config = self.builder_configs.get(self.DEFAULT_CONFIG_NAME)
+                logger.info(f"No config specified, defaulting to: {self.dataset_name}/{builder_config.name}")
+            else:
+                if len(self.BUILDER_CONFIGS) > 1:
+                    # With config_kwargs, no error is raised here and a config is created on the fly below
+                    if not config_kwargs:
+                        example_of_usage = (
+                            f"load_dataset('{self.repo_id or self.dataset_name}', '{self.BUILDER_CONFIGS[0].name}')"
+                        )
+                        raise ValueError(
+                            "Config name is missing."
+                            f"\nPlease pick one among the available configs: {list(self.builder_configs.keys())}"
+                            + f"\nExample of usage:\n\t`{example_of_usage}`"
+                        )
+                else:
+                    builder_config = self.BUILDER_CONFIGS[0]
+                    logger.info(
+                        f"No config specified, defaulting to the single config: {self.dataset_name}/{builder_config.name}"
+                    )
+        # try to get config by name
+        if isinstance(config_name, str):
+            builder_config = self.builder_configs.get(config_name)
+            # An unknown name is only an error when the builder defines configs
+            if builder_config is None and self.BUILDER_CONFIGS:
+                raise ValueError(
+                    f"BuilderConfig '{config_name}' not found. Available: {list(self.builder_configs.keys())}"
+                )
+        # if not using an existing config, then create a new config on the fly
+        if not builder_config:
+            if config_name is not None:
+                config_kwargs["name"] = config_name
+            elif self.DEFAULT_CONFIG_NAME and not config_kwargs:
+                # Use DEFAULT_CONFIG_NAME only if no config_kwargs are passed
+                config_kwargs["name"] = self.DEFAULT_CONFIG_NAME
+            if "version" not in config_kwargs and hasattr(self, "VERSION") and self.VERSION:
+                config_kwargs["version"] = self.VERSION
+            builder_config = self.BUILDER_CONFIG_CLASS(**config_kwargs)
+        # otherwise use the config_kwargs to overwrite the attributes
+        else:
+            # Work on a copy so that the pre-defined config is not modified by the overrides
+            builder_config = copy.deepcopy(builder_config) if config_kwargs else builder_config
+            for key, value in config_kwargs.items():
+                if value is not None:
+                    if not hasattr(builder_config, key):
+                        raise ValueError(f"BuilderConfig {builder_config} doesn't have a '{key}' key.")
+                    setattr(builder_config, key, value)
+        if not builder_config.name:
+            raise ValueError(f"BuilderConfig must have a name, got {builder_config.name}")
+        # resolve data files if needed
+        builder_config._resolve_data_files(
+            base_path=self.base_path,
+            download_config=DownloadConfig(token=self.token, storage_options=self.storage_options),
+        )
+        # compute the config id that is going to be used for caching
+        if config_id is None:
+            config_id = builder_config.create_config_id(
+                config_kwargs,
+                custom_features=custom_features,
+            )
+        # A config id is custom when it is neither the name of a pre-defined config nor "default"
+        is_custom = (config_id not in self.builder_configs) and config_id != "default"
+        if is_custom:
+            logger.info(f"Using custom data configuration {config_id}")
+        else:
+            if (
+                builder_config.name in self.builder_configs
+                and builder_config != self.builder_configs[builder_config.name]
+            ):
+                raise ValueError(
+                    "Cannot name a custom BuilderConfig the same as an available "
+                    f"BuilderConfig. Change the name. Available BuilderConfigs: {list(self.builder_configs.keys())}"
+                )
+            if not builder_config.version:
+                raise ValueError(f"BuilderConfig {builder_config.name} must have a version")
+        return builder_config, config_id
+    @classproperty
+    @classmethod
+    @memoize()
+    def builder_configs(cls) -> dict[str, BuilderConfig]:
+        """Dictionary of pre-defined configurations for this builder class."""
+        configs = {config.name: config for config in cls.BUILDER_CONFIGS}
+        if len(configs) != len(cls.BUILDER_CONFIGS):
+            names = [config.name for config in cls.BUILDER_CONFIGS]
+            raise ValueError(f"Names in BUILDER_CONFIGS must not be duplicated. Got {names}")
+        return configs
+    @property
+    def cache_dir(self):
+        """Cache directory of this builder (read-only view of `self._cache_dir`)."""
+        return self._cache_dir
+    def _use_legacy_cache_dir_if_possible(self, dataset_module: "DatasetModule"):
+        # Check for the legacy cache directory template (datasets<3.0.0)
+        self._legacy_relative_data_dir = (
+            self._check_legacy_cache2(dataset_module) or self._check_legacy_cache() or None
+        )
+        self._cache_dir = self._build_cache_dir()
+        self._output_dir = self._cache_dir
+    def _relative_data_dir(self, with_version=True, with_hash=True) -> str:
+        """Relative path of this dataset in cache_dir:
+        Will be:
+            self.dataset_name/self.config.version/self.hash/
+        or if a repo_id with a namespace has been specified:
+            self.namespace___self.dataset_name/self.config.version/self.hash/
+        If any of these element is missing or if ``with_version=False`` the corresponding subfolders are dropped.
+        """
+        if self._legacy_relative_data_dir is not None and with_version and with_hash:
+            return self._legacy_relative_data_dir
+        namespace = self.repo_id.split("/")[0] if self.repo_id and self.repo_id.count("/") > 0 else None
+        builder_data_dir = self.dataset_name if namespace is None else f"{namespace}___{self.dataset_name}"
+        builder_data_dir = posixpath.join(builder_data_dir, self.config_id)
+        if with_version:
+            builder_data_dir = posixpath.join(builder_data_dir, str(self.config.version))
+        if with_hash and self.hash and isinstance(self.hash, str):
+            builder_data_dir = posixpath.join(builder_data_dir, self.hash)
+        return builder_data_dir
+    def _build_cache_dir(self):
+        """Return the data directory for the current version."""
+        builder_data_dir = posixpath.join(self._cache_dir_root, self._relative_data_dir(with_version=False))
+        version_data_dir = posixpath.join(self._cache_dir_root, self._relative_data_dir(with_version=True))
+        def _other_versions_on_disk():
+            """Returns previous versions on disk."""
+            if not os.path.exists(builder_data_dir):
+                return []
+            version_dirnames = []
+            for dir_name in os.listdir(builder_data_dir):
+                try:
+                    version_dirnames.append((utils.Version(dir_name), dir_name))
+                except ValueError:  # Invalid version (ex: incomplete data dir)
+                    pass
+            version_dirnames.sort(reverse=True)
+            return version_dirnames
+        # Check and warn if other versions exist
+        if not is_remote_url(builder_data_dir):
+            version_dirs = _other_versions_on_disk()
+            if version_dirs:
+                other_version = version_dirs[0][0]
+                if other_version != self.config.version:
+                    warn_msg = (
+                        f"Found a different version {str(other_version)} of dataset {self.dataset_name} in "
+                        f"cache_dir {self._cache_dir_root}. Using currently defined version "
+                        f"{str(self.config.version)}."
+                    )
+                    logger.warning(warn_msg)
+        return version_data_dir
+    @abc.abstractmethod
+    def _info(self) -> DatasetInfo:
+        """Construct the DatasetInfo object. See `DatasetInfo` for details.
+        Abstract method: this base implementation only raises NotImplementedError.
+        Warning: This function is only called once and the result is cached for all
+        following .info() calls.
+        Returns:
+            info: (DatasetInfo) The dataset information
+        """
+        raise NotImplementedError
+    @classmethod
+    def get_imported_module_dir(cls):
+        """Return the path of the module of this class or subclass."""
+        return os.path.dirname(inspect.getfile(inspect.getmodule(cls)))
+    def _rename(self, src: str, dst: str):
+        """Rename `src` to `dst` on this builder's filesystem (`self._fs`)."""
+        rename(self._fs, src, dst)
+    def download_and_prepare(
+        self,
+        output_dir: Optional[str] = None,
+        download_config: Optional[DownloadConfig] = None,
+        download_mode: Optional[Union[DownloadMode, str]] = None,
+        verification_mode: Optional[Union[VerificationMode, str]] = None,
+        dl_manager: Optional[DownloadManager] = None,
+        base_path: Optional[str] = None,
+        file_format: str = "arrow",
+        max_shard_size: Optional[Union[int, str]] = None,
+        num_proc: Optional[int] = None,
+        storage_options: Optional[dict] = None,
+        **download_and_prepare_kwargs,
+    ):
+        """Downloads and prepares dataset for reading.
+        Args:
+            output_dir (`str`, *optional*):
+                Output directory for the dataset.
+                Default to this builder's `cache_dir`, which is inside `~/.cache/huggingface/datasets` by default.
+                <Added version="2.5.0"/>
+            download_config (`DownloadConfig`, *optional*):
+                Specific download configuration parameters.
+            download_mode ([`DownloadMode`] or `str`, *optional*):
+                Select the download/generate mode, default to `REUSE_DATASET_IF_EXISTS`.
+            verification_mode ([`VerificationMode`] or `str`, defaults to `BASIC_CHECKS`):
+                Verification mode determining the checks to run on the downloaded/processed dataset information (checksums/size/splits/...).
+                <Added version="2.9.1"/>
+            dl_manager (`DownloadManager`, *optional*):
+                Specific `DownloadManager` to use.
+            base_path (`str`, *optional*):
+                Base path for relative paths that are used to download files. This can be a remote url.
+                If not specified, the value of the `base_path` attribute (`self.base_path`) will be used instead.
+            file_format (`str`, *optional*):
+                Format of the data files in which the dataset will be written.
+                Supported formats: "arrow", "parquet". Default to "arrow" format.
+                If the format is "parquet", then image and audio data are embedded into the Parquet files instead of pointing to local files.
+                <Added version="2.5.0"/>
+            max_shard_size (`Union[str, int]`, *optional*):
+                Maximum number of bytes written per shard, default is "500MB".
+                The size is based on uncompressed data size, so in practice your shard files may be smaller than
+                `max_shard_size` thanks to Parquet compression for example.
+                <Added version="2.5.0"/>
+            num_proc (`int`, *optional*, defaults to `None`):
+                Number of processes when downloading and generating the dataset locally.
+                Multiprocessing is disabled by default.
+                <Added version="2.7.0"/>
+            storage_options (`dict`, *optional*):
+                Key/value pairs to be passed on to the caching file-system backend, if any.
+                <Added version="2.5.0"/>
+            **download_and_prepare_kwargs (additional keyword arguments): Keyword arguments.
+        Raises:
+            ValueError: If `file_format` is neither `None`, "arrow" nor "parquet".
+            RuntimeError: If the output directory is the root of the filesystem.
+            OSError: If the output filesystem is local and there is not enough disk space.
+            ManualDownloadError: If the dataset requires manual data and the download manager has no manual dir.
+        Example:
+        Download and prepare the dataset as Arrow files that can be loaded as a Dataset using `builder.as_dataset()`:
+        ```py
+        >>> from datasets import load_dataset_builder
+        >>> builder = load_dataset_builder("cornell-movie-review-data/rotten_tomatoes")
+        >>> builder.download_and_prepare()
+        ```
+        Download and prepare the dataset as sharded Parquet files locally:
+        ```py
+        >>> from datasets import load_dataset_builder
+        >>> builder = load_dataset_builder("cornell-movie-review-data/rotten_tomatoes")
+        >>> builder.download_and_prepare("./output_dir", file_format="parquet")
+        ```
+        Download and prepare the dataset as sharded Parquet files in a cloud storage:
+        ```py
+        >>> from datasets import load_dataset_builder
+        >>> storage_options = {"key": aws_access_key_id, "secret": aws_secret_access_key}
+        >>> builder = load_dataset_builder("cornell-movie-review-data/rotten_tomatoes")
+        >>> builder.download_and_prepare("s3://my-bucket/my_rotten_tomatoes", storage_options=storage_options, file_format="parquet")
+        ```
+        """
+        output_dir = output_dir if output_dir is not None else self._cache_dir
+        # output_dir can be a remote bucket on GCS or S3
+        fs, output_dir = url_to_fs(output_dir, **(storage_options or {}))
+        self._fs = fs
+        # Keep the protocol in the output dir when the filesystem is remote
+        self._output_dir = output_dir if not is_remote_filesystem(self._fs) else self._fs.unstrip_protocol(output_dir)
+        download_mode = DownloadMode(download_mode or DownloadMode.REUSE_DATASET_IF_EXISTS)
+        verification_mode = VerificationMode(verification_mode or VerificationMode.BASIC_CHECKS)
+        base_path = base_path if base_path is not None else self.base_path
+        if file_format is not None and file_format not in ["arrow", "parquet"]:
+            raise ValueError(f"Unsupported file_format: {file_format}. Expected 'arrow' or 'parquet'")
+        self._file_format = file_format
+        if self._fs._strip_protocol(self._output_dir) == "":
+            # We don't support the root directory, because it has no dirname,
+            # and we need a dirname to use a <dirname>.incomplete directory
+            # when the dataset is being written
+            raise RuntimeError(
+                f"Unable to download and prepare the dataset at the root {self._output_dir}. "
+                f"Please specify a subdirectory, e.g. '{self._output_dir + self.dataset_name}'"
+            )
+        # Build a default download manager (and download config) unless the caller provided one
+        if dl_manager is None:
+            if download_config is None:
+                download_config = DownloadConfig(
+                    cache_dir=self._cache_downloaded_dir,
+                    force_download=download_mode == DownloadMode.FORCE_REDOWNLOAD,
+                    force_extract=download_mode == DownloadMode.FORCE_REDOWNLOAD,
+                    use_etag=False,
+                    num_proc=num_proc,
+                    token=self.token,
+                    storage_options=self.storage_options,
+                )  # We don't use etag for data files to speed up the process
+            dl_manager = DownloadManager(
+                dataset_name=self.dataset_name,
+                download_config=download_config,
+                data_dir=self.config.data_dir,
+                base_path=base_path,
+                record_checksums=(self._record_infos or verification_mode == VerificationMode.ALL_CHECKS),
+            )
+        is_local = not is_remote_filesystem(self._fs)
+        self.dl_manager = dl_manager
+        # Prevent parallel local disk operations
+        if is_local:
+            # Create parent directory of the output_dir to put the lock file in there
+            Path(self._output_dir).parent.mkdir(parents=True, exist_ok=True)
+            lock_path = self._output_dir + "_builder.lock"
+        # File locking only with local paths; no file locking on GCS or S3
+        # (`lock_path` is only defined, and only read, when `is_local` is True)
+        with FileLock(lock_path) if is_local else contextlib.nullcontext():
+            # Check if the data already exists
+            data_exists = self._fs.exists(posixpath.join(self._output_dir, config.DATASET_INFO_FILENAME))
+            if data_exists and download_mode == DownloadMode.REUSE_DATASET_IF_EXISTS:
+                logger.info(f"Found cached dataset {self.dataset_name} ({self._output_dir})")
+                # We need to update the info in case some splits were added in the meantime
+                # for example when calling load_dataset from multiple workers.
+                self.info = self._load_info()
+                self.download_post_processing_resources(dl_manager)
+                return
+            logger.info(f"Generating dataset {self.dataset_name} ({self._output_dir})")
+            if is_local:  # if cache dir is local, check for available space
+                if not has_sufficient_disk_space(
+                    self.info.size_in_bytes or 0, directory=Path(self._output_dir).parent
+                ):
+                    raise OSError(
+                        f"Not enough disk space. Needed: {size_str(self.info.size_in_bytes or 0)} (download: {size_str(self.info.download_size or 0)}, generated: {size_str(self.info.dataset_size or 0)}, post-processed: {size_str(self.info.post_processing_size or 0)})"
+                    )
+            @contextlib.contextmanager
+            def incomplete_dir(dirname):
+                """Create temporary dir for dirname and rename on exit."""
+                if not is_local:
+                    # On a remote filesystem the data is written directly to `dirname`
+                    self._fs.makedirs(dirname, exist_ok=True)
+                    yield dirname
+                else:
+                    tmp_dir = dirname + ".incomplete"
+                    os.makedirs(tmp_dir, exist_ok=True)
+                    try:
+                        yield tmp_dir
+                        # Only reached if the body of the `with` block did not raise
+                        if os.path.isdir(dirname):
+                            shutil.rmtree(dirname)
+                        # LocalFileSystem.mv does copy + rm, it is more efficient to simply rename a local directory
+                        shutil.move(tmp_dir, dirname)
+                    finally:
+                        # Remove the temporary dir if it is still there (e.g. after an error)
+                        if os.path.exists(tmp_dir):
+                            shutil.rmtree(tmp_dir)
+            # Log where the dataset is going to be prepared, with the expected sizes when they are known,
+            # so that this information is available before the download/preparation starts.
+            if self.info.size_in_bytes:
+                logger.info(
+                    f"Downloading and preparing dataset {self.dataset_name}/{self.config.name} "
+                    f"(download: {size_str(self.info.download_size)}, generated: {size_str(self.info.dataset_size)}, "
+                    f"post-processed: {size_str(self.info.post_processing_size)}, "
+                    f"total: {size_str(self.info.size_in_bytes)}) to {self._output_dir}..."
+                )
+            else:
+                _dest = self._fs._strip_protocol(self._output_dir) if is_local else self._output_dir
+                logger.info(f"Downloading and preparing dataset {self.dataset_name}/{self.config.name} to {_dest}...")
+            self._check_manual_download(dl_manager)
+            # Create a tmp dir and rename to self._output_dir on successful exit.
+            with incomplete_dir(self._output_dir) as tmp_output_dir:
+                # Temporarily assign _output_dir to tmp_data_dir to avoid having to forward
+                # it to every sub function.
+                with temporary_assignment(self, "_output_dir", tmp_output_dir):
+                    # max_shard_size and num_proc are only forwarded when they are set
+                    prepare_split_kwargs = {"file_format": file_format}
+                    if max_shard_size is not None:
+                        prepare_split_kwargs["max_shard_size"] = max_shard_size
+                    if num_proc is not None:
+                        prepare_split_kwargs["num_proc"] = num_proc
+                    self._download_and_prepare(
+                        dl_manager=dl_manager,
+                        verification_mode=verification_mode,
+                        **prepare_split_kwargs,
+                        **download_and_prepare_kwargs,
+                    )
+                    # Sync info
+                    self.info.dataset_size = sum(split.num_bytes for split in self.info.splits.values())
+                    self.info.download_checksums = dl_manager.get_recorded_sizes_checksums()
+                    if self.info.download_size is not None:
+                        self.info.size_in_bytes = self.info.dataset_size + self.info.download_size
+                    # Save info
+                    self._save_info()
+            # Download post processing resources
+            self.download_post_processing_resources(dl_manager)
+            logger.info(
+                f"Dataset {self.dataset_name} downloaded and prepared to {self._output_dir}. "
+                f"Subsequent calls will reuse this data."
+            )
+    def _check_manual_download(self, dl_manager):
+        if self.manual_download_instructions is not None and dl_manager.manual_dir is None:
+            raise ManualDownloadError(
+                textwrap.dedent(
+                    f"""\
+                    The dataset {self.dataset_name} with config {self.config.name} requires manual data.
+                    Please follow the manual download instructions:
+                     {self.manual_download_instructions}
+                    Manual data can be loaded with:
+                     datasets.load_dataset("{self.repo_id or self.dataset_name}", data_dir="<path/to/manual/data>")"""
+                )
+            )
+    def _download_and_prepare(self, dl_manager, verification_mode, **prepare_split_kwargs):
+        """Downloads and prepares dataset for reading.
+        This is the internal implementation to overwrite called when user calls
+        `download_and_prepare`. It should download all required data and generate
+        the pre-processed datasets files.
+        Args:
+            dl_manager ([`DownloadManager`]):
+                `DownloadManager` used to download and cache data.
+            verification_mode ([`VerificationMode`]):
+                if `ALL_CHECKS`, perform all the verifications including checksums.
+                if `BASIC_CHECKS`, do not perform checksums, only perform split tests.
+                if `NO_CHECKS`, do not perform any verification.
+            prepare_split_kwargs: Additional options, such as `file_format`, `max_shard_size`
+        """
+        # Generating data for all splits
+        split_dict = SplitDict(dataset_name=self.dataset_name)
+        split_generators_kwargs = self._make_split_generators_kwargs(prepare_split_kwargs)
+        split_generators = self._split_generators(dl_manager, **split_generators_kwargs)
+        # Checksums verification
+        if verification_mode == VerificationMode.ALL_CHECKS and dl_manager.record_checksums:
+            verify_checksums(
+                self.info.download_checksums, dl_manager.get_recorded_sizes_checksums(), "dataset source files"
+            )
+        # Build splits
+        for split_generator in split_generators:
+            if str(split_generator.split_info.name).lower() == "all":
+                raise ValueError(
+                    "`all` is a special split keyword corresponding to the "
+                    "union of all splits, so cannot be used as key in "
+                    "._split_generator()."
+                )
+            logger.info(f"Generating {split_generator.split_info.name} split")
+            split_dict.add(split_generator.split_info)
+            try:
+                # Prepare split will record examples associated to the split
+                self._prepare_split(split_generator, **prepare_split_kwargs)
+            except OSError as e:
+                raise OSError(
+                    "Cannot find data file. "
+                    + (self.manual_download_instructions or "")
+                    + "\nOriginal error:\n"
+                    + str(e)
+                ) from None
+            # If check_duplicates is set to True , then except DuplicatedKeysError
+            except DuplicatedKeysError as e:
+                raise DuplicatedKeysError(
+                    e.key,
+                    e.duplicate_key_indices,
+                    fix_msg=f"To avoid duplicate keys, please fix the dataset splits for {self.name}",
+                ) from None
+            dl_manager.manage_extracted_files()
+        if verification_mode == VerificationMode.BASIC_CHECKS or verification_mode == VerificationMode.ALL_CHECKS:
+            verify_splits(self.info.splits, split_dict)
+        # Update the info object with the splits.
+        self.info.splits = split_dict
+        self.info.download_size = dl_manager.downloaded_size
+    def download_post_processing_resources(self, dl_manager):
+        for split in self.info.splits or []:
+            for resource_name, resource_file_name in self._post_processing_resources(split).items():
+                if not not is_remote_filesystem(self._fs):
+                    raise NotImplementedError(f"Post processing is not supported on filesystem {self._fs}")
+                if os.sep in resource_file_name:
+                    raise ValueError(f"Resources shouldn't be in a sub-directory: {resource_file_name}")
+                resource_path = os.path.join(self._output_dir, resource_file_name)
+                if not os.path.exists(resource_path):
+                    downloaded_resource_path = self._download_post_processing_resources(
+                        split, resource_name, dl_manager
+                    )
+                    if downloaded_resource_path:
+                        logger.info(f"Downloaded post-processing resource {resource_name} as {resource_file_name}")
+                        shutil.move(downloaded_resource_path, resource_path)
+    def _load_info(self) -> DatasetInfo:
+        """Load the `DatasetInfo` from the output directory, using the storage options of `self._fs`."""
+        return DatasetInfo.from_directory(self._output_dir, storage_options=self._fs.storage_options)
+    def _save_info(self):
+        file_lock = (
+            FileLock(self._output_dir + "_info.lock")
+            if not is_remote_filesystem(self._fs)
+            else contextlib.nullcontext()
+        )
+        with file_lock:
+            self.info.write_to_directory(self._output_dir, storage_options=self._fs.storage_options)
+    def _make_split_generators_kwargs(self, prepare_split_kwargs):
+        """Get kwargs for `self._split_generators()` from `prepare_split_kwargs`.
+        This implementation ignores `prepare_split_kwargs` and returns an empty dict.
+        """
+        # The argument is unused here: only the local name is deleted, the caller's dict is left untouched
+        del prepare_split_kwargs
+        return {}
+    def as_dataset(
+        self,
+        split: Optional[Union[str, Split, list[str], list[Split]]] = None,
+        run_post_process=True,
+        verification_mode: Optional[Union[VerificationMode, str]] = None,
+        in_memory=False,
+    ) -> Union[Dataset, DatasetDict]:
+        """Return a Dataset for the specified split.
+        Args:
+            split (`datasets.Split`):
+                Which subset of the data to return.
+            run_post_process (`bool`, defaults to `True`):
+                Whether to run post-processing dataset transforms and/or add
+                indexes.
+            verification_mode ([`VerificationMode`] or `str`, defaults to `BASIC_CHECKS`):
+                Verification mode determining the checks to run on the
+                downloaded/processed dataset information (checksums/size/splits/...).
+                <Added version="2.9.1"/>
+            in_memory (`bool`, defaults to `False`):
+                Whether to copy the data in-memory.
+        Returns:
+            datasets.Dataset
+        Example:
+        ```py
+        >>> from datasets import load_dataset_builder
+        >>> builder = load_dataset_builder('cornell-movie-review-data/rotten_tomatoes')
+        >>> builder.download_and_prepare()
+        >>> ds = builder.as_dataset(split='train')
+        >>> ds
+        Dataset({
+            features: ['text', 'label'],
+            num_rows: 8530
+        })
+        ```
+        """
+        if self._file_format is not None and self._file_format != "arrow":
+            raise FileFormatError('Loading a dataset not written in the "arrow" format is not supported.')
+        if is_remote_filesystem(self._fs):
+            raise NotImplementedError(f"Loading a dataset cached in a {type(self._fs).__name__} is not supported.")
+        if not os.path.exists(self._output_dir):
+            raise FileNotFoundError(
+                f"Dataset {self.dataset_name}: could not find data in {self._output_dir}. Please make sure to call "
+                "builder.download_and_prepare(), or use "
+                "datasets.load_dataset() before trying to access the Dataset object."
+            )
+        logger.debug(f"Constructing Dataset for split {split or ', '.join(self.info.splits)}, from {self._output_dir}")
+        # By default, return all splits
+        if split is None:
+            split = {s: s for s in self.info.splits}
+        verification_mode = VerificationMode(verification_mode or VerificationMode.BASIC_CHECKS)
+        # Create a dataset for each of the given splits
+        datasets = map_nested(
+            partial(
+                self._build_single_dataset,
+                run_post_process=run_post_process,
+                verification_mode=verification_mode,
+                in_memory=in_memory,
+            ),
+            split,
+            map_tuple=True,
+            disable_tqdm=True,
+        )
+        if isinstance(datasets, dict):
+            datasets = DatasetDict(datasets)
+        return datasets
+    def _build_single_dataset(
+        self,
+        split: Union[str, ReadInstruction, Split],
+        run_post_process: bool,
+        verification_mode: VerificationMode,
+        in_memory: bool = False,
+    ):
+        """Build the dataset for a single split, optionally post-processing it.
+        This is the per-split implementation of `as_dataset`.
+        Args:
+            split (`str`, `ReadInstruction` or `Split`):
+                Split to load. Anything that is not a `ReadInstruction` is converted with `str()`
+                and wrapped in a `Split`; the special value `"all"` is expanded to every split of
+                `self.info.splits` joined with `+`.
+            run_post_process (`bool`):
+                Whether to call `self._post_process` on the base dataset.
+            verification_mode (`VerificationMode`):
+                Verification mode consulted before verifying the post-processing resource checksums.
+            in_memory (`bool`, defaults to `False`):
+                Forwarded to `self._as_dataset`.
+        Returns:
+            The dataset built by `self._as_dataset`, or the dataset returned by `self._post_process`
+            when post-processing is enabled and returns something other than `None`.
+        Raises:
+            ValueError: If a post-processing resource file name contains `os.sep`, or if the features
+                type recorded in `self.info.post_processed` differs from the dataset features type.
+        """
+        # Normalize everything that is not already a ReadInstruction into a Split.
+        if not isinstance(split, ReadInstruction):
+            split = str(split)
+            if split == "all":
+                # "all" is shorthand for the concatenation of every known split.
+                split = "+".join(self.info.splits.keys())
+            split = Split(split)
+        # Build base dataset
+        ds = self._as_dataset(
+            split=split,
+            in_memory=in_memory,
+        )
+        if run_post_process:
+            # Resource file names are joined directly onto the output directory below,
+            # so names containing a path separator are rejected first.
+            for resource_file_name in self._post_processing_resources(split).values():
+                if os.sep in resource_file_name:
+                    raise ValueError(f"Resources shouldn't be in a sub-directory: {resource_file_name}")
+            resources_paths = {
+                resource_name: os.path.join(self._output_dir, resource_file_name)
+                for resource_name, resource_file_name in self._post_processing_resources(split).items()
+            }
+            post_processed = self._post_process(ds, resources_paths)
+            # A `None` result keeps the base dataset and skips all the bookkeeping below.
+            if post_processed is not None:
+                ds = post_processed
+                recorded_checksums = {}
+                # NOTE(review): this flag is never set to True in this method, so the checksum
+                # verification guarded by it below never runs - confirm whether that is intended.
+                record_checksums = False
+                for resource_name, resource_path in resources_paths.items():
+                    size_checksum = get_size_checksum_dict(resource_path)
+                    recorded_checksums[resource_name] = size_checksum
+                if verification_mode == VerificationMode.ALL_CHECKS and record_checksums:
+                    if self.info.post_processed is None or self.info.post_processed.resources_checksums is None:
+                        expected_checksums = None
+                    else:
+                        expected_checksums = self.info.post_processed.resources_checksums.get(split)
+                    verify_checksums(expected_checksums, recorded_checksums, "post processing resources")
+                # Lazily create the containers that hold the post-processing metadata.
+                if self.info.post_processed is None:
+                    self.info.post_processed = PostProcessedInfo()
+                if self.info.post_processed.resources_checksums is None:
+                    self.info.post_processed.resources_checksums = {}
+                self.info.post_processed.resources_checksums[str(split)] = recorded_checksums
+                # The post-processing size is recomputed over the resources of every recorded split,
+                # not only the split handled by this call.
+                self.info.post_processing_size = sum(
+                    checksums_dict["num_bytes"]
+                    for split_checksums_dicts in self.info.post_processed.resources_checksums.values()
+                    for checksums_dict in split_checksums_dicts.values()
+                )
+                # The total size is only refreshed when both of the other sizes are known.
+                if self.info.dataset_size is not None and self.info.download_size is not None:
+                    self.info.size_in_bytes = (
+                        self.info.dataset_size + self.info.download_size + self.info.post_processing_size
+                    )
+                # Persist the updated info, then mirror it onto the returned dataset.
+                self._save_info()
+                ds._info.post_processed = self.info.post_processed
+                ds._info.post_processing_size = self.info.post_processing_size
+                ds._info.size_in_bytes = self.info.size_in_bytes
+                if self.info.post_processed.features is not None:
+                    if self.info.post_processed.features.type != ds.features.type:
+                        raise ValueError(
+                            f"Post-processed features info don't match the dataset:\nGot\n{self.info.post_processed.features}\nbut expected something like\n{ds.features}"
+                        )
+                    else:
+                        ds.info.features = self.info.post_processed.features
+        return ds
+    def _as_dataset(self, split: Union[ReadInstruction, Split] = Split.TRAIN, in_memory: bool = False) -> Dataset:
+        """Constructs a `Dataset`.
+        This is the internal implementation to overwrite called when user calls
+        `as_dataset`. It should read the pre-processed datasets files and generate
+        the `Dataset` object.
+        Args:
+            split (`datasets.Split`):
+                which subset of the data to read.
+            in_memory (`bool`, defaults to `False`):
+                Whether to copy the data in-memory.
+        Returns:
+            `Dataset`
+        """
+        cache_dir = self._fs._strip_protocol(self._output_dir)
+        dataset_name = self.dataset_name
+        if self._check_legacy_cache():
+            dataset_name = self.name
+        dataset_kwargs = ArrowReader(cache_dir, self.info).read(
+            name=dataset_name,
+            instructions=split,
+            split_infos=self.info.splits.values(),
+            in_memory=in_memory,
+        )
+        fingerprint = self._get_dataset_fingerprint(split)
+        return Dataset(fingerprint=fingerprint, **dataset_kwargs)
+    def _get_dataset_fingerprint(self, split: Union[ReadInstruction, Split]) -> str:
+        """The dataset fingerprint is the hash of the relative directory dataset_name/config_name/version/hash, as well as the split specs."""
+        hasher = Hasher()
+        hasher.update(Path(self._relative_data_dir()).as_posix())
+        hasher.update(str(split))  # for example: train, train+test, train[:10%], test[:33%](pct1_dropremainder)
+        fingerprint = hasher.hexdigest()
+        return fingerprint
+    def as_streaming_dataset(
+        self,
+        split: Optional[str] = None,
+        base_path: Optional[str] = None,
+    ) -> Union[dict[str, IterableDataset], IterableDataset]:
+        if is_remote_filesystem(self._fs):
+            raise NotImplementedError(
+                f"Loading a streaming dataset cached in a {type(self._fs).__name__} is not supported yet."
+            )
+        dl_manager = StreamingDownloadManager(
+            base_path=base_path or self.base_path,
+            download_config=DownloadConfig(token=self.token, storage_options=self.storage_options),
+            dataset_name=self.dataset_name,
+            data_dir=self.config.data_dir,
+        )
+        self._check_manual_download(dl_manager)
+        splits_generators = {sg.name: sg for sg in self._split_generators(dl_manager)}
+        # By default, return all splits
+        if split is None:
+            splits_generator = splits_generators
+        elif split in splits_generators:
+            splits_generator = splits_generators[split]
+        else:
+            raise ValueError(f"Bad split: {split}. Available splits: {list(splits_generators)}")
+        # Create a dataset for each of the given splits
+        datasets = map_nested(
+            self._as_streaming_dataset_single,
+            splits_generator,
+            map_tuple=True,
+        )
+        if isinstance(datasets, dict):
+            datasets = IterableDatasetDict(datasets)
+        return datasets
+    def _as_streaming_dataset_single(
+        self,
+        splits_generator,
+    ) -> IterableDataset:
+        ex_iterable = self._get_examples_iterable_for_split(splits_generator)
+        # add auth to be able to access and decode audio/image files from private repositories.
+        token_per_repo_id = {self.repo_id: self.token} if self.repo_id else {}
+        return IterableDataset(
+            ex_iterable, info=self.info, split=splits_generator.name, token_per_repo_id=token_per_repo_id
+        )
+    def _post_process(self, dataset: Dataset, resources_paths: Mapping[str, str]) -> Optional[Dataset]:
+        """Run dataset transforms or add indexes.
+        This base implementation does nothing and returns `None`, in which case
+        `_build_single_dataset` keeps the dataset it built unchanged.
+        Args:
+            dataset (`Dataset`):
+                The dataset built for the split.
+            resources_paths (`Mapping[str, str]`):
+                Mapping resource_name -> path of the resource file inside the output directory.
+        Returns:
+            The post-processed `Dataset`, or `None` when there is nothing to post-process.
+        """
+        return None
+    def _post_processing_resources(self, split: str) -> dict[str, str]:
+        """Mapping resource_name -> resource_file_name for the given split.
+        This base implementation declares no resources. `_build_single_dataset` rejects
+        file names that contain `os.sep`, so returned names must not point into a sub-directory.
+        """
+        return {}
+    def _download_post_processing_resources(
+        self, split: str, resource_name: str, dl_manager: DownloadManager
+    ) -> Optional[str]:
+        """Download the resource using the download manager and return the downloaded path.
+        This base implementation downloads nothing and returns `None`.
+        """
+        return None
+    @abc.abstractmethod
+    def _split_generators(self, dl_manager: Union[DownloadManager, StreamingDownloadManager]):
+        """Specify feature dictionary generators and dataset splits.
+        This function returns a list of `SplitGenerator`s defining how to generate
+        data and what splits to use.
+        Example:
+            return [
+                    datasets.SplitGenerator(
+                            name=datasets.Split.TRAIN,
+                            gen_kwargs={'file': 'train_data.zip'},
+                    ),
+                    datasets.SplitGenerator(
+                            name=datasets.Split.TEST,
+                            gen_kwargs={'file': 'test_data.zip'},
+                    ),
+            ]
+        The above code will first call `_generate_examples(file='train_data.zip')`
+        to write the train data, then `_generate_examples(file='test_data.zip')` to
+        write the test data.
+        Datasets are typically split into different subsets to be used at various
+        stages of training and evaluation.
+        Note that for datasets without a `VALIDATION` split, you can use a
+        fraction of the `TRAIN` data for evaluation as you iterate on your model
+        so as not to overfit to the `TEST` data.
+        For downloads and extractions, use the given `dl_manager`.
+        Note that the `DownloadManager` caches downloads, so it is fine to have each
+        generator attempt to download the source data.
+        A good practice is to download all data in this function, and then
+        distribute the relevant parts to each split with the `gen_kwargs` argument.
+        Args:
+            dl_manager (`Union[DownloadManager, StreamingDownloadManager]`):
+                Download manager to download the data
+        Returns:
+            `list<SplitGenerator>`.
+        Raises:
+            NotImplementedError: Always, in this abstract base implementation.
+        """
+        raise NotImplementedError()
+    @abc.abstractmethod
+    def _prepare_split(
+        self,
+        split_generator: SplitGenerator,
+        file_format: str = "arrow",
+        max_shard_size: Optional[Union[str, int]] = None,
+        num_proc: Optional[int] = None,
+        **kwargs,
+    ):
+        """Generate the examples and record them on disk.
+        Args:
+            split_generator (`SplitGenerator`):
+                Split generator to process
+            file_format (`str`, *optional*):
+                Format of the data files in which the dataset will be written.
+                Supported formats: "arrow", "parquet". Defaults to the "arrow" format.
+            max_shard_size (`Union[str, int]`, *optional*):
+                Maximum number of bytes written per shard, default is "500MB".
+                The size is based on uncompressed data size, so in practice your shard files may be smaller than
+                `max_shard_size` thanks to Parquet compression for example.
+            num_proc (`int`, *optional*, defaults to `None`):
+                Number of processes when downloading and generating the dataset locally.
+                Multiprocessing is disabled by default.
+                <Added version="2.7.0"/>
+            **kwargs: Additional kwargs forwarded from _download_and_prepare
+        Raises:
+            NotImplementedError: Always, in this abstract base implementation.
+        """
+        raise NotImplementedError()
+    def _get_examples_iterable_for_split(self, split_generator: SplitGenerator) -> ExamplesIterable:
+        """Generate the examples on the fly.
+        Used by `_as_streaming_dataset_single` to obtain the iterable wrapped by the streaming dataset.
+        Args:
+            split_generator (`SplitGenerator`):
+                Split generator to process
+        Returns:
+            `ExamplesIterable`
+        Raises:
+            NotImplementedError: Always, in this base implementation.
+        """
+        raise NotImplementedError()
+class GeneratorBasedBuilder(DatasetBuilder):
+    """Base class for datasets with data generation based on dict generators.
+    `GeneratorBasedBuilder` is a convenience class that abstracts away much
+    of the data writing and reading of `DatasetBuilder`. It expects subclasses to
+    implement generators of feature dictionaries across the dataset splits
+    (`_split_generators`). See the method docstrings for details.
+    """
+    @abc.abstractmethod
+    def _generate_examples(self, **kwargs):
+        """Default function generating examples for each `SplitGenerator`.
+        This function preprocess the examples from the raw data to the preprocessed
+        dataset files.
+        This function is called once for each `SplitGenerator` defined in
+        `_split_generators`. The examples yielded here will be written on
+        disk.
+        Args:
+            **kwargs (additional keyword arguments):
+                Arguments forwarded from the SplitGenerator.gen_kwargs
+        Yields:
+            key: `str` or `int`, a unique deterministic example identification key.
+                * Unique: An error will be raised if two examples are yield with the
+                    same key.
+                * Deterministic: When generating the dataset twice, the same example
+                    should have the same key.
+                Good keys can be the image id, or line number if examples are extracted
+                from a text file.
+                The key will be hashed and sorted to shuffle examples deterministically,
+                such as generating the dataset multiple times keep examples in the
+                same order.
+            example: `dict<str feature_name, feature_value>`, a feature dictionary
+                ready to be encoded and written to disk. The example will be
+                encoded with `self.info.features.encode_example({...})`.
+        """
+        raise NotImplementedError()
+    def _prepare_split(
+        self,
+        split_generator: SplitGenerator,
+        check_duplicate_keys: bool,
+        file_format="arrow",
+        num_proc: Optional[int] = None,
+        max_shard_size: Optional[Union[int, str]] = None,
+    ):
+        max_shard_size = convert_file_size_to_int(max_shard_size or config.MAX_SHARD_SIZE)
+        if self.info.splits is not None:
+            split_info = self.info.splits[split_generator.name]
+        else:
+            split_info = split_generator.split_info
+        SUFFIX = "-JJJJJ-SSSSS-of-NNNNN"
+        fname = f"{self.dataset_name}-{split_generator.name}{SUFFIX}.{file_format}"
+        fpath = posixpath.join(self._output_dir, fname)
+        if num_proc and num_proc > 1:
+            num_input_shards = _number_of_shards_in_gen_kwargs(split_generator.gen_kwargs)
+            if num_input_shards <= 1:
+                logger.warning(
+                    f"Setting num_proc from {num_proc} back to 1 for the {split_info.name} split to disable multiprocessing as it only contains one shard."
+                )
+                num_proc = 1
+            elif num_input_shards < num_proc:
+                logger.warning(
+                    f"Setting num_proc from {num_proc} to {num_input_shards} for the {split_info.name} split as it only contains {num_input_shards} shards."
+                )
+                num_proc = num_input_shards
+        pbar = hf_tqdm(
+            unit=" examples",
+            total=split_info.num_examples,
+            desc=f"Generating {split_info.name} split",
+        )
+        _prepare_split_args = {
+            "fpath": fpath,
+            "file_format": file_format,
+            "max_shard_size": max_shard_size,
+            "split_info": split_info,
+            "check_duplicate_keys": check_duplicate_keys,
+        }
+        if num_proc is None or num_proc == 1:
+            result = None
+            gen_kwargs = split_generator.gen_kwargs
+            job_id = 0
+            with pbar:
+                for job_id, done, content in self._prepare_split_single(
+                    gen_kwargs=gen_kwargs, job_id=job_id, **_prepare_split_args
+                ):
+                    if done:
+                        result = content
+                    else:
+                        pbar.update(content)
+            # wrapping everything into lists for consistency with the multiprocessed code path
+            assert result is not None, "Failed to retrieve results from prepare_split"
+            examples_per_job, bytes_per_job, features_per_job, shards_per_job, shard_lengths_per_job = (
+                [item] for item in result
+            )
+        else:
+            kwargs_per_job = [
+                {"gen_kwargs": gen_kwargs, "job_id": job_id, **_prepare_split_args}
+                for job_id, gen_kwargs in enumerate(
+                    _split_gen_kwargs(split_generator.gen_kwargs, max_num_jobs=num_proc)
+                )
+            ]
+            num_jobs = len(kwargs_per_job)
+            examples_per_job = [None] * num_jobs
+            bytes_per_job = [None] * num_jobs
+            features_per_job = [None] * num_jobs
+            shards_per_job = [None] * num_jobs
+            shard_lengths_per_job = [None] * num_jobs
+            with Pool(num_proc) as pool:
+                with pbar:
+                    for job_id, done, content in iflatmap_unordered(
+                        pool, self._prepare_split_single, kwargs_iterable=kwargs_per_job
+                    ):
+                        if done:
+                            # the content is the result of the job
+                            (
+                                examples_per_job[job_id],
+                                bytes_per_job[job_id],
+                                features_per_job[job_id],
+                                shards_per_job[job_id],
+                                shard_lengths_per_job[job_id],
+                            ) = content
+                        else:
+                            # the content is the number of examples progress update
+                            pbar.update(content)
+            assert None not in examples_per_job, (
+                f"Failed to retrieve results from prepare_split: result list {examples_per_job} still contains None - at least one worker failed to return its results"
+            )
+        total_shards = sum(shards_per_job)
+        total_num_examples = sum(examples_per_job)
+        total_num_bytes = sum(bytes_per_job)
+        features = features_per_job[0]
+        split_generator.split_info.num_examples = total_num_examples
+        split_generator.split_info.num_bytes = total_num_bytes
+        # should rename everything at the end
+        logger.debug(f"Renaming {total_shards} shards.")
+        if total_shards > 1:
+            # use the -SSSSS-of-NNNNN pattern
+            def _rename_shard(shard_and_job: tuple[int]):
+                shard_id, job_id = shard_and_job
+                global_shard_id = sum(shards_per_job[:job_id]) + shard_id
+                self._rename(
+                    fpath.replace("SSSSS", f"{shard_id:05d}").replace("JJJJJ", f"{job_id:05d}"),
+                    fpath.replace("JJJJJ-SSSSS", f"{global_shard_id:05d}").replace("NNNNN", f"{total_shards:05d}"),
+                )
+            shards_and_jobs = [
+                (shard_id, job_id)
+                for job_id, num_shards in enumerate(shards_per_job)
+                for shard_id in range(num_shards)
+            ]
+            thread_map(_rename_shard, shards_and_jobs, disable=True, max_workers=64)
+            split_generator.split_info.shard_lengths = [
+                shard_length for shard_lengths in shard_lengths_per_job for shard_length in shard_lengths
+            ]
+        else:
+            # don't use any pattern
+            shard_id, job_id = 0, 0
+            self._rename(
+                fpath.replace("SSSSS", f"{shard_id:05d}").replace("JJJJJ", f"{job_id:05d}"),
+                fpath.replace(SUFFIX, ""),
+            )
+        if self.info.features is None:
+            self.info.features = features
+    def _prepare_split_single(
+        self,
+        gen_kwargs: dict,
+        fpath: str,
+        file_format: str,
+        max_shard_size: int,
+        split_info: SplitInfo,
+        check_duplicate_keys: bool,
+        job_id: int,
+    ) -> Iterable[tuple[int, bool, Union[int, tuple]]]:
+        generator = self._generate_examples(**gen_kwargs)
+        writer_class = ParquetWriter if file_format == "parquet" else ArrowWriter
+        embed_local_files = file_format == "parquet"
+        shard_lengths = []
+        total_num_examples, total_num_bytes = 0, 0
+        shard_id = 0
+        num_examples_progress_update = 0
+        try:
+            writer = writer_class(
+                features=self.info.features,
+                path=fpath.replace("SSSSS", f"{shard_id:05d}").replace("JJJJJ", f"{job_id:05d}"),
+                writer_batch_size=self._writer_batch_size,
+                hash_salt=split_info.name,
+                check_duplicates=check_duplicate_keys,
+                storage_options=self._fs.storage_options,
+                embed_local_files=embed_local_files,
+            )
+            try:
+                _time = time.time()
+                for key, record in generator:
+                    if max_shard_size is not None and writer._num_bytes > max_shard_size:
+                        num_examples, num_bytes = writer.finalize()
+                        writer.close()
+                        shard_lengths.append(num_examples)
+                        total_num_examples += num_examples
+                        total_num_bytes += num_bytes
+                        shard_id += 1
+                        writer = writer_class(
+                            features=writer._features,
+                            path=fpath.replace("SSSSS", f"{shard_id:05d}").replace("JJJJJ", f"{job_id:05d}"),
+                            writer_batch_size=self._writer_batch_size,
+                            hash_salt=split_info.name,
+                            check_duplicates=check_duplicate_keys,
+                            storage_options=self._fs.storage_options,
+                            embed_local_files=embed_local_files,
+                        )
+                    example = self.info.features.encode_example(record) if self.info.features is not None else record
+                    writer.write(example, key)
+                    num_examples_progress_update += 1
+                    if time.time() > _time + config.PBAR_REFRESH_TIME_INTERVAL:
+                        _time = time.time()
+                        yield job_id, False, num_examples_progress_update
+                        num_examples_progress_update = 0
+            finally:
+                yield job_id, False, num_examples_progress_update
+                num_shards = shard_id + 1
+                num_examples, num_bytes = writer.finalize()
+                writer.close()
+                shard_lengths.append(num_examples)
+                total_num_examples += num_examples
+                total_num_bytes += num_bytes
+        except Exception as e:
+            # Ignore the writer's error for no examples written to the file if this error was caused by the error in _generate_examples before the first example was yielded
+            if isinstance(e, SchemaInferenceError) and e.__context__ is not None:
+                e = e.__context__
+            raise DatasetGenerationError("An error occurred while generating the dataset") from e
+        yield job_id, True, (total_num_examples, total_num_bytes, writer._features, num_shards, shard_lengths)
+    def _download_and_prepare(self, dl_manager, verification_mode, **prepare_splits_kwargs):
+        super()._download_and_prepare(
+            dl_manager,
+            verification_mode,
+            check_duplicate_keys=verification_mode == VerificationMode.BASIC_CHECKS
+            or verification_mode == VerificationMode.ALL_CHECKS,
+            **prepare_splits_kwargs,
+        )
+    def _get_examples_iterable_for_split(self, split_generator: SplitGenerator) -> ExamplesIterable:
+        return ExamplesIterable(self._generate_examples, split_generator.gen_kwargs)
+class ArrowBasedBuilder(DatasetBuilder):
+    """Base class for datasets with data generation based on Arrow loading functions (CSV/JSON/Parquet)."""
+    @abc.abstractmethod
+    def _generate_tables(self, **kwargs):
+        """Default function generating tables for each `SplitGenerator`.
+        This function preprocesses the examples from the raw data to the preprocessed
+        dataset files.
+        This function is called once for each `SplitGenerator` defined in
+        `_split_generators`. The tables yielded here will be written on
+        disk.
+        Args:
+            **kwargs (additional keyword arguments):
+                Arguments forwarded from the SplitGenerator.gen_kwargs
+        Yields:
+            key: `str` or `int`, an identification key for the yielded table.
+                Good keys are unique and deterministic (when generating the dataset
+                twice, the same table should have the same key).
+                NOTE(review): `_prepare_split_single` in this class iterates with
+                `for _, table in generator` and discards the key - confirm whether any
+                other consumer relies on key uniqueness.
+            example: `pyarrow.Table`, a feature table
+                ready to be encoded and written to disk.
+        """
+        raise NotImplementedError()
+    def _prepare_split(
+        self,
+        split_generator: SplitGenerator,
+        file_format: str = "arrow",
+        num_proc: Optional[int] = None,
+        max_shard_size: Optional[Union[str, int]] = None,
+    ):
+        """Generate the tables of one split and write them as shard files in the output directory.
+        The work is delegated to `_prepare_split_single`, either in-process or spread over a
+        process pool. Shards are first written under a temporary name containing the placeholders
+        `JJJJJ` (job id), `SSSSS` (shard id within the job) and `NNNNN` (total number of shards),
+        then renamed once every job is done. On completion `split_generator.split_info` receives
+        the number of examples and bytes (and the shard lengths when there are several shards),
+        and `self.info.features` is set from the first job if it was still `None`.
+        Args:
+            split_generator (`SplitGenerator`):
+                Split generator to process.
+            file_format (`str`, defaults to `"arrow"`):
+                Extension of the written files, also forwarded to `_prepare_split_single`.
+            num_proc (`int`, *optional*):
+                Number of processes. It is lowered to the number of input shards found in
+                `split_generator.gen_kwargs` when that number is smaller.
+            max_shard_size (`int` or `str`, *optional*):
+                Maximum shard size, defaults to `config.MAX_SHARD_SIZE`.
+        Raises:
+            AssertionError: If a job finished without reporting its result.
+        """
+        max_shard_size = convert_file_size_to_int(max_shard_size or config.MAX_SHARD_SIZE)
+        # Prefer the split info recorded in `self.info`; on any lookup failure (including
+        # `self.info.splits` being `None`) fall back to the generator's own split info.
+        try:
+            split_info = self.info.splits[split_generator.name]
+        except Exception:
+            split_info = split_generator.split_info
+        SUFFIX = "-JJJJJ-SSSSS-of-NNNNN"
+        fname = f"{self.dataset_name}-{split_generator.name}{SUFFIX}.{file_format}"
+        fpath = posixpath.join(self._output_dir, fname)
+        # Never start more processes than there are input shards to distribute.
+        if num_proc and num_proc > 1:
+            num_input_shards = _number_of_shards_in_gen_kwargs(split_generator.gen_kwargs)
+            if num_input_shards <= 1:
+                logger.warning(
+                    f"Setting num_proc from {num_proc} back to 1 for the {split_info.name} split to disable multiprocessing as it only contains one shard."
+                )
+                num_proc = 1
+            elif num_input_shards < num_proc:
+                logger.warning(
+                    f"Setting num_proc from {num_proc} to {num_input_shards} for the {split_info.name} split as it only contains {num_input_shards} shards."
+                )
+                num_proc = num_input_shards
+        pbar = hf_tqdm(
+            unit=" examples",
+            total=split_info.num_examples,
+            desc=f"Generating {split_info.name} split",
+        )
+        # Arguments shared by every job; `gen_kwargs` and `job_id` are added per job.
+        _prepare_split_args = {
+            "fpath": fpath,
+            "file_format": file_format,
+            "max_shard_size": max_shard_size,
+        }
+        if num_proc is None or num_proc == 1:
+            result = None
+            gen_kwargs = split_generator.gen_kwargs
+            job_id = 0
+            with pbar:
+                for job_id, done, content in self._prepare_split_single(
+                    gen_kwargs=gen_kwargs, job_id=job_id, **_prepare_split_args
+                ):
+                    if done:
+                        result = content
+                    else:
+                        pbar.update(content)
+            # wrapping everything into lists for consistency with the multiprocessed code path
+            assert result is not None, "Failed to retrieve results from prepare_split"
+            examples_per_job, bytes_per_job, features_per_job, shards_per_job, shard_lengths_per_job = (
+                [item] for item in result
+            )
+        else:
+            kwargs_per_job = [
+                {"gen_kwargs": gen_kwargs, "job_id": job_id, **_prepare_split_args}
+                for job_id, gen_kwargs in enumerate(
+                    _split_gen_kwargs(split_generator.gen_kwargs, max_num_jobs=num_proc)
+                )
+            ]
+            num_jobs = len(kwargs_per_job)
+            # Results arrive unordered, so they are stored by job id.
+            examples_per_job = [None] * num_jobs
+            bytes_per_job = [None] * num_jobs
+            features_per_job = [None] * num_jobs
+            shards_per_job = [None] * num_jobs
+            shard_lengths_per_job = [None] * num_jobs
+            with Pool(num_proc) as pool:
+                with pbar:
+                    for job_id, done, content in iflatmap_unordered(
+                        pool, self._prepare_split_single, kwargs_iterable=kwargs_per_job
+                    ):
+                        if done:
+                            # the content is the result of the job
+                            (
+                                examples_per_job[job_id],
+                                bytes_per_job[job_id],
+                                features_per_job[job_id],
+                                shards_per_job[job_id],
+                                shard_lengths_per_job[job_id],
+                            ) = content
+                        else:
+                            # the content is the number of examples progress update
+                            pbar.update(content)
+            assert None not in examples_per_job, (
+                f"Failed to retrieve results from prepare_split: result list {examples_per_job} still contains None - at least one worker failed to return its results"
+            )
+        total_shards = sum(shards_per_job)
+        total_num_examples = sum(examples_per_job)
+        total_num_bytes = sum(bytes_per_job)
+        features = features_per_job[0]
+        split_generator.split_info.num_examples = total_num_examples
+        split_generator.split_info.num_bytes = total_num_bytes
+        # should rename everything at the end
+        logger.debug(f"Renaming {total_shards} shards.")
+        if total_shards > 1:
+            # use the -SSSSS-of-NNNNN pattern
+            # NOTE(review): the argument is a (shard_id, job_id) pair, whereas the annotation
+            # `tuple[int]` denotes a 1-tuple; `tuple[int, int]` would describe it accurately.
+            def _rename_shard(shard_id_and_job: tuple[int]):
+                shard_id, job_id = shard_id_and_job
+                # Shards are numbered globally: all shards of the earlier jobs come first.
+                global_shard_id = sum(shards_per_job[:job_id]) + shard_id
+                self._rename(
+                    fpath.replace("SSSSS", f"{shard_id:05d}").replace("JJJJJ", f"{job_id:05d}"),
+                    fpath.replace("JJJJJ-SSSSS", f"{global_shard_id:05d}").replace("NNNNN", f"{total_shards:05d}"),
+                )
+            shard_ids_and_jobs = [
+                (shard_id, job_id)
+                for job_id, num_shards in enumerate(shards_per_job)
+                for shard_id in range(num_shards)
+            ]
+            thread_map(_rename_shard, shard_ids_and_jobs, disable=True, max_workers=64)
+            split_generator.split_info.shard_lengths = [
+                shard_length for shard_lengths in shard_lengths_per_job for shard_length in shard_lengths
+            ]
+        else:
+            # don't use any pattern
+            shard_id, job_id = 0, 0
+            self._rename(
+                fpath.replace("SSSSS", f"{shard_id:05d}").replace("JJJJJ", f"{job_id:05d}"),
+                fpath.replace(SUFFIX, ""),
+            )
+        if self.info.features is None:
+            self.info.features = features
+    def _prepare_split_single(
+        self, gen_kwargs: dict, fpath: str, file_format: str, max_shard_size: int, job_id: int
+    ) -> Iterable[tuple[int, bool, Union[int, tuple]]]:
+        """
+        Write the tables produced by `self._generate_tables(**gen_kwargs)` to one or more shard files.
+        This is a generator. It yields `(job_id, done, content)` tuples:
+            - `(job_id, False, n)`: progress update, `n` examples were written since the previous update.
+            - `(job_id, True, (total_num_examples, total_num_bytes, features, num_shards, shard_lengths))`:
+              final result, yielded once after the last shard has been finalized.
+        Args:
+            gen_kwargs (`dict`):
+                Keyword arguments forwarded to `self._generate_tables`. List values are wrapped in `tracked_list`.
+            fpath (`str`):
+                Output path template. "SSSSS" is replaced by the zero-padded shard id and "JJJJJ" by the zero-padded job id.
+            file_format (`str`):
+                `"parquet"` selects `ParquetWriter`, any other value selects `ArrowWriter`.
+            max_shard_size (`int`):
+                When not `None`, a new shard is started before writing the next table as soon as the current
+                writer's `_num_bytes` exceeds this value.
+            job_id (`int`):
+                Identifier of this job; it is echoed as the first element of every yielded tuple.
+        Raises:
+            DatasetGenerationError: For exceptions raised while generating or writing the tables
+                (the original exception is attached as the cause).
+            DatasetGenerationCastError: Built from a `CastError` raised by `writer.write_table`
+                (presumably a `DatasetGenerationError` subclass, so it is re-raised unchanged - confirm).
+        """
+        # Wrap list arguments so that their consumption can be tracked (used when reporting errors).
+        gen_kwargs = {k: tracked_list(v) if isinstance(v, list) else v for k, v in gen_kwargs.items()}
+        generator = self._generate_tables(**gen_kwargs)
+        writer_class = ParquetWriter if file_format == "parquet" else ArrowWriter
+        # Local files are only embedded when writing parquet.
+        embed_local_files = file_format == "parquet"
+        shard_lengths = []
+        total_num_examples, total_num_bytes = 0, 0
+        shard_id = 0
+        # Number of examples written since the last progress tuple was yielded.
+        num_examples_progress_update = 0
+        try:
+            writer = writer_class(
+                features=self.info.features,
+                path=fpath.replace("SSSSS", f"{shard_id:05d}").replace("JJJJJ", f"{job_id:05d}"),
+                writer_batch_size=self._writer_batch_size,
+                storage_options=self._fs.storage_options,
+                embed_local_files=embed_local_files,
+            )
+            try:
+                _time = time.time()
+                for _, table in generator:
+                    # The size check happens before writing, so a shard can exceed max_shard_size by up to one table.
+                    if max_shard_size is not None and writer._num_bytes > max_shard_size:
+                        num_examples, num_bytes = writer.finalize()
+                        writer.close()
+                        shard_lengths.append(num_examples)
+                        total_num_examples += num_examples
+                        total_num_bytes += num_bytes
+                        shard_id += 1
+                        # Reuse the features of the previous writer so that all shards share the same schema.
+                        writer = writer_class(
+                            features=writer._features,
+                            path=fpath.replace("SSSSS", f"{shard_id:05d}").replace("JJJJJ", f"{job_id:05d}"),
+                            writer_batch_size=self._writer_batch_size,
+                            storage_options=self._fs.storage_options,
+                            embed_local_files=embed_local_files,
+                        )
+                    try:
+                        writer.write_table(table)
+                    except CastError as cast_error:
+                        raise DatasetGenerationCastError.from_cast_error(
+                            cast_error=cast_error,
+                            builder_name=self.info.builder_name,
+                            gen_kwargs=gen_kwargs,
+                            token=self.token,
+                        )
+                    num_examples_progress_update += len(table)
+                    # Throttle progress updates to at most one per PBAR_REFRESH_TIME_INTERVAL seconds.
+                    if time.time() > _time + config.PBAR_REFRESH_TIME_INTERVAL:
+                        _time = time.time()
+                        yield job_id, False, num_examples_progress_update
+                        num_examples_progress_update = 0
+            finally:
+                # Flush the remaining progress count, then finalize the last (possibly only) shard.
+                # NOTE(review): yielding inside `finally` raises RuntimeError if the generator is closed
+                # while suspended at an earlier yield - callers are expected to exhaust it.
+                yield job_id, False, num_examples_progress_update
+                num_shards = shard_id + 1
+                num_examples, num_bytes = writer.finalize()
+                writer.close()
+                shard_lengths.append(num_examples)
+                total_num_examples += num_examples
+                total_num_bytes += num_bytes
+        except Exception as e:
+            # Ignore the writer's error for no examples written to the file if this error was caused by the error in _generate_examples before the first example was yielded
+            if isinstance(e, SchemaInferenceError) and e.__context__ is not None:
+                e = e.__context__
+            # Errors that are already dataset generation errors are propagated as they are.
+            if isinstance(e, DatasetGenerationError):
+                raise
+            raise DatasetGenerationError("An error occurred while generating the dataset") from e
+        # Final result of this job.
+        yield job_id, True, (total_num_examples, total_num_bytes, writer._features, num_shards, shard_lengths)
+    def _get_examples_iterable_for_split(self, split_generator: SplitGenerator) -> ExamplesIterable:
+        return ArrowExamplesIterable(self._generate_tables, kwargs=split_generator.gen_kwargs)

datasets/combine.py ADDED Viewed

	@@ -0,0 +1,223 @@

+from typing import Optional, TypeVar
+from .arrow_dataset import Dataset, _concatenate_map_style_datasets, _interleave_map_style_datasets
+from .dataset_dict import DatasetDict, IterableDatasetDict
+from .info import DatasetInfo
+from .iterable_dataset import IterableDataset, _concatenate_iterable_datasets, _interleave_iterable_datasets
+from .splits import NamedSplit
+from .utils import logging
+from .utils.py_utils import Literal
+logger = logging.get_logger(__name__)
+DatasetType = TypeVar("DatasetType", Dataset, IterableDataset)
+def interleave_datasets(
+    datasets: list[DatasetType],
+    probabilities: Optional[list[float]] = None,
+    seed: Optional[int] = None,
+    info: Optional[DatasetInfo] = None,
+    split: Optional[NamedSplit] = None,
+    stopping_strategy: Literal[
+        "first_exhausted", "all_exhausted", "all_exhausted_without_replacement"
+    ] = "first_exhausted",
+) -> DatasetType:
+    """
+    Interleave several datasets (sources) into a single dataset.
+    The new dataset is constructed by alternating between the sources to get the examples.
+    You can use this function on a list of [`Dataset`] objects, or on a list of [`IterableDataset`] objects.
+        - If `probabilities` is `None` (default) the new dataset is constructed by cycling between each source to get the examples.
+        - If `probabilities` is not `None`, the new dataset is constructed by getting examples from a random source at a time according to the provided probabilities.
+    The resulting dataset ends when one of the source datasets runs out of examples except when `oversampling` is `True`,
+    in which case, the resulting dataset ends when all datasets have ran out of examples at least one time.
+    Note for iterable datasets:
+    In a distributed setup or in PyTorch DataLoader workers, the stopping strategy is applied per process.
+    Therefore the "first_exhausted" strategy on an sharded iterable dataset can generate less samples in total (up to 1 missing sample per subdataset per worker).
+    Args:
+        datasets (`List[Dataset]` or `List[IterableDataset]`):
+            List of datasets to interleave.
+        probabilities (`List[float]`, *optional*, defaults to `None`):
+            If specified, the new dataset is constructed by sampling
+            examples from one source at a time according to these probabilities.
+        seed (`int`, *optional*, defaults to `None`):
+            The random seed used to choose a source for each example.
+        info ([`DatasetInfo`], *optional*):
+            Dataset information, like description, citation, etc.
+            <Added version="2.4.0"/>
+        split ([`NamedSplit`], *optional*):
+            Name of the dataset split.
+            <Added version="2.4.0"/>
+        stopping_strategy (`str`, defaults to `first_exhausted`):
+            Three strategies are proposed right now, `first_exhausted`, `all_exhausted` and `all_exhausted_without_replacement`.
+            By default, `first_exhausted` is an undersampling strategy, i.e the dataset construction is stopped as soon as one dataset has ran out of samples.
+            If the strategy is `all_exhausted`,  we use an oversampling strategy, i.e the dataset construction is stopped as soon as every samples of every dataset has been added at least once.
+            When strategy is `all_exhausted_without_replacement` we make sure that each sample in each dataset is sampled only once.
+            Note that if the strategy is `all_exhausted`, the interleaved dataset size can get enormous:
+            - with no probabilities, the resulting dataset will have `max_length_datasets*nb_dataset` samples.
+            - with given probabilities, the resulting dataset will have more samples if some datasets have really low probability of visiting.
+    Returns:
+        [`Dataset`] or [`IterableDataset`]: Return type depends on the input `datasets`
+        parameter. `Dataset` if the input is a list of `Dataset`, `IterableDataset` if the input is a list of
+        `IterableDataset`.
+    Example:
+        For regular datasets (map-style):
+        ```python
+        >>> from datasets import Dataset, interleave_datasets
+        >>> d1 = Dataset.from_dict({"a": [0, 1, 2]})
+        >>> d2 = Dataset.from_dict({"a": [10, 11, 12]})
+        >>> d3 = Dataset.from_dict({"a": [20, 21, 22]})
+        >>> dataset = interleave_datasets([d1, d2, d3], probabilities=[0.7, 0.2, 0.1], seed=42, stopping_strategy="all_exhausted")
+        >>> dataset["a"]
+        [10, 0, 11, 1, 2, 20, 12, 10, 0, 1, 2, 21, 0, 11, 1, 2, 0, 1, 12, 2, 10, 0, 22]
+        >>> dataset = interleave_datasets([d1, d2, d3], probabilities=[0.7, 0.2, 0.1], seed=42)
+        >>> dataset["a"]
+        [10, 0, 11, 1, 2]
+        >>> dataset = interleave_datasets([d1, d2, d3])
+        >>> dataset["a"]
+        [0, 10, 20, 1, 11, 21, 2, 12, 22]
+        >>> dataset = interleave_datasets([d1, d2, d3], stopping_strategy="all_exhausted")
+        >>> dataset["a"]
+        [0, 10, 20, 1, 11, 21, 2, 12, 22]
+        >>> d1 = Dataset.from_dict({"a": [0, 1, 2]})
+        >>> d2 = Dataset.from_dict({"a": [10, 11, 12, 13]})
+        >>> d3 = Dataset.from_dict({"a": [20, 21, 22, 23, 24]})
+        >>> dataset = interleave_datasets([d1, d2, d3])
+        >>> dataset["a"]
+        [0, 10, 20, 1, 11, 21, 2, 12, 22]
+        >>> dataset = interleave_datasets([d1, d2, d3], stopping_strategy="all_exhausted")
+        >>> dataset["a"]
+        [0, 10, 20, 1, 11, 21, 2, 12, 22, 0, 13, 23, 1, 10, 24]
+        >>> dataset = interleave_datasets([d1, d2, d3], probabilities=[0.7, 0.2, 0.1], seed=42)
+        >>> dataset["a"]
+        [10, 0, 11, 1, 2]
+        >>> dataset = interleave_datasets([d1, d2, d3], probabilities=[0.7, 0.2, 0.1], seed=42, stopping_strategy="all_exhausted")
+        >>> dataset["a"]
+        [10, 0, 11, 1, 2, 20, 12, 13, ..., 0, 1, 2, 0, 24]
+        For datasets in streaming mode (iterable):
+        >>> from datasets import interleave_datasets
+        >>> d1 = load_dataset('allenai/c4', 'es', split='train', streaming=True)
+        >>> d2 = load_dataset('allenai/c4', 'fr', split='train', streaming=True)
+        >>> dataset = interleave_datasets([d1, d2])
+        >>> iterator = iter(dataset)
+        >>> next(iterator)
+        {'text': 'Comprar Zapatillas para niña en chancla con goma por...'}
+        >>> next(iterator)
+        {'text': 'Le sacre de philippe ier, 23 mai 1059 - Compte Rendu...'
+        ```
+    """
+    from .arrow_dataset import Dataset
+    from .iterable_dataset import IterableDataset
+    if not datasets:
+        raise ValueError("Unable to interleave an empty list of datasets.")
+    for i, dataset in enumerate(datasets):
+        if not isinstance(dataset, (Dataset, IterableDataset)):
+            if isinstance(dataset, (DatasetDict, IterableDatasetDict)):
+                if not dataset:
+                    raise ValueError(
+                        f"Expected a list of Dataset objects or a list of IterableDataset objects, but element at position {i} "
+                        "is an empty dataset dictionary."
+                    )
+                raise ValueError(
+                    f"Dataset at position {i} has at least one split: {list(dataset)}\n"
+                    f"Please pick one to interleave with the other datasets, for example: dataset['{next(iter(dataset))}']"
+                )
+            raise ValueError(
+                f"Expected a list of Dataset objects or a list of IterableDataset objects, but element at position {i} is a {type(dataset).__name__}."
+            )
+        if i == 0:
+            dataset_type, other_type = (
+                (Dataset, IterableDataset) if isinstance(dataset, Dataset) else (IterableDataset, Dataset)
+            )
+        elif not isinstance(dataset, dataset_type):
+            raise ValueError(
+                f"Unable to interleave a {dataset_type.__name__} (at position 0) with a {other_type.__name__} (at position {i}). Expected a list of Dataset objects or a list of IterableDataset objects."
+            )
+    if stopping_strategy not in ["first_exhausted", "all_exhausted", "all_exhausted_without_replacement"]:
+        raise ValueError(f"{stopping_strategy} is not supported. Please enter a valid stopping_strategy.")
+    if dataset_type is Dataset:
+        return _interleave_map_style_datasets(
+            datasets, probabilities, seed, info=info, split=split, stopping_strategy=stopping_strategy
+        )
+    else:
+        return _interleave_iterable_datasets(
+            datasets,
+            probabilities,
+            seed,
+            info=info,
+            split=split,
+            stopping_strategy=stopping_strategy,
+        )
+def concatenate_datasets(
+    dsets: list[DatasetType],
+    info: Optional[DatasetInfo] = None,
+    split: Optional[NamedSplit] = None,
+    axis: int = 0,
+) -> DatasetType:
+    """
+    Converts a list of [`Dataset`] with the same schema into a single [`Dataset`].
+    Args:
+        dsets (`List[datasets.Dataset]`):
+            List of Datasets to concatenate.
+        info (`DatasetInfo`, *optional*):
+            Dataset information, like description, citation, etc.
+        split (`NamedSplit`, *optional*):
+            Name of the dataset split.
+        axis (`{0, 1}`, defaults to `0`):
+            Axis to concatenate over, where `0` means over rows (vertically) and `1` means over columns
+            (horizontally).
+            <Added version="1.6.0"/>
+    Example:
+    ```py
+    >>> ds3 = concatenate_datasets([ds1, ds2])
+    ```
+    """
+    if not dsets:
+        raise ValueError("Unable to concatenate an empty list of datasets.")
+    for i, dataset in enumerate(dsets):
+        if not isinstance(dataset, (Dataset, IterableDataset)):
+            if isinstance(dataset, (DatasetDict, IterableDatasetDict)):
+                if not dataset:
+                    raise ValueError(
+                        f"Expected a list of Dataset objects or a list of IterableDataset objects, but element at position {i} "
+                        "is an empty dataset dictionary."
+                    )
+                raise ValueError(
+                    f"Dataset at position {i} has at least one split: {list(dataset)}\n"
+                    f"Please pick one to interleave with the other datasets, for example: dataset['{next(iter(dataset))}']"
+                )
+            raise ValueError(
+                f"Expected a list of Dataset objects or a list of IterableDataset objects, but element at position {i} is a {type(dataset).__name__}."
+            )
+        if i == 0:
+            dataset_type, other_type = (
+                (Dataset, IterableDataset) if isinstance(dataset, Dataset) else (IterableDataset, Dataset)
+            )
+        elif not isinstance(dataset, dataset_type):
+            raise ValueError(
+                f"Unable to interleave a {dataset_type.__name__} (at position 0) with a {other_type.__name__} (at position {i}). Expected a list of Dataset objects or a list of IterableDataset objects."
+            )
+    if dataset_type is Dataset:
+        return _concatenate_map_style_datasets(dsets, info=info, split=split, axis=axis)
+    else:
+        return _concatenate_iterable_datasets(dsets, info=info, split=split, axis=axis)

datasets/config.py ADDED Viewed

	@@ -0,0 +1,268 @@

+import importlib
+import importlib.metadata
+import logging
+import os
+import platform
+from pathlib import Path
+from typing import Optional
+from huggingface_hub import constants
+from packaging import version
+# Use the stdlib logger named after the top-level package (the part of __name__ before the first dot).
+logger = logging.getLogger(__name__.split(".", 1)[0])  # to avoid circular import from .utils.logging
+# Datasets
+S3_DATASETS_BUCKET_PREFIX = "https://s3.amazonaws.com/datasets.huggingface.co/datasets/datasets"
+CLOUDFRONT_DATASETS_DISTRIB_PREFIX = "https://cdn-datasets.huggingface.co/datasets/datasets"
+REPO_DATASETS_URL = "https://raw.githubusercontent.com/huggingface/datasets/{revision}/datasets/{path}/{name}"
+# Hub
+# The endpoint can be overridden with the HF_ENDPOINT environment variable.
+HF_ENDPOINT = os.environ.get("HF_ENDPOINT", "https://huggingface.co")
+HUB_DATASETS_URL = HF_ENDPOINT + "/datasets/{repo_id}/resolve/{revision}/{path}"
+HUB_DATASETS_HFFS_URL = "hf://datasets/{repo_id}@{revision}/{path}"
+HUB_DEFAULT_VERSION = "main"
+PY_VERSION = version.parse(platform.python_version())
+# General environment variables accepted values for booleans
+# (environment values are upper-cased before being compared to these sets)
+ENV_VARS_TRUE_VALUES = {"1", "ON", "YES", "TRUE"}
+ENV_VARS_FALSE_VALUES = {"0", "OFF", "NO", "FALSE"}
+ENV_VARS_TRUE_AND_AUTO_VALUES = ENV_VARS_TRUE_VALUES.union({"AUTO"})
+ENV_VARS_FALSE_AND_AUTO_VALUES = ENV_VARS_FALSE_VALUES.union({"AUTO"})
+# Imports
+# Versions of the installed distributions. `importlib.metadata.version` raises
+# PackageNotFoundError if a distribution is missing, so these lookups fail at import time in that case.
+DILL_VERSION = version.parse(importlib.metadata.version("dill"))
+FSSPEC_VERSION = version.parse(importlib.metadata.version("fsspec"))
+PANDAS_VERSION = version.parse(importlib.metadata.version("pandas"))
+PYARROW_VERSION = version.parse(importlib.metadata.version("pyarrow"))
+HF_HUB_VERSION = version.parse(importlib.metadata.version("huggingface_hub"))
+# Framework switches, "AUTO" when the environment variable is not set.
+USE_TF = os.environ.get("USE_TF", "AUTO").upper()
+USE_TORCH = os.environ.get("USE_TORCH", "AUTO").upper()
+USE_JAX = os.environ.get("USE_JAX", "AUTO").upper()
+# PyTorch is looked up only if USE_TORCH is true or AUTO and USE_TF is not explicitly true.
+# NOTE(review): `importlib.util` is used below although only `importlib` and `importlib.metadata`
+# are imported explicitly - this relies on the submodule being loaded already; confirm.
+TORCH_VERSION = "N/A"
+TORCH_AVAILABLE = False
+if USE_TORCH in ENV_VARS_TRUE_AND_AUTO_VALUES and USE_TF not in ENV_VARS_TRUE_VALUES:
+    TORCH_AVAILABLE = importlib.util.find_spec("torch") is not None
+    if TORCH_AVAILABLE:
+        try:
+            TORCH_VERSION = version.parse(importlib.metadata.version("torch"))
+            logger.debug(f"PyTorch version {TORCH_VERSION} available.")
+        except importlib.metadata.PackageNotFoundError:
+            # The module can be imported but has no distribution metadata: keep "N/A" as version.
+            pass
+else:
+    # NOTE: this branch is also reached when USE_TORCH itself is set to a non-true value.
+    logger.info("Disabling PyTorch because USE_TF is set")
+# Polars and DuckDB are optional and detected unconditionally.
+POLARS_VERSION = "N/A"
+POLARS_AVAILABLE = importlib.util.find_spec("polars") is not None
+if POLARS_AVAILABLE:
+    try:
+        POLARS_VERSION = version.parse(importlib.metadata.version("polars"))
+        logger.debug(f"Polars version {POLARS_VERSION} available.")
+    except importlib.metadata.PackageNotFoundError:
+        pass
+DUCKDB_VERSION = "N/A"
+DUCKDB_AVAILABLE = importlib.util.find_spec("duckdb") is not None
+if DUCKDB_AVAILABLE:
+    try:
+        DUCKDB_VERSION = version.parse(importlib.metadata.version("duckdb"))
+        logger.debug(f"Duckdb version {DUCKDB_VERSION} available.")
+    except importlib.metadata.PackageNotFoundError:
+        pass
+# TensorFlow is looked up only if USE_TF is true or AUTO and USE_TORCH is not explicitly true.
+TF_VERSION = "N/A"
+TF_AVAILABLE = False
+if USE_TF in ENV_VARS_TRUE_AND_AUTO_VALUES and USE_TORCH not in ENV_VARS_TRUE_VALUES:
+    TF_AVAILABLE = importlib.util.find_spec("tensorflow") is not None
+    if TF_AVAILABLE:
+        # The `tensorflow` module can be provided by several distributions:
+        # the version of the first one found in the list below is used.
+        for package in [
+            "tensorflow",
+            "tensorflow-cpu",
+            "tensorflow-gpu",
+            "tf-nightly",
+            "tf-nightly-cpu",
+            "tf-nightly-gpu",
+            "intel-tensorflow",
+            "tensorflow-rocm",
+            "tensorflow-macos",
+        ]:
+            try:
+                TF_VERSION = version.parse(importlib.metadata.version(package))
+            except importlib.metadata.PackageNotFoundError:
+                continue
+            else:
+                break
+        else:
+            # None of the known distributions is installed: consider TensorFlow unavailable.
+            TF_AVAILABLE = False
+    if TF_AVAILABLE:
+        # Only TensorFlow 2 and above is supported.
+        if TF_VERSION.major < 2:
+            logger.info(f"TensorFlow found but with version {TF_VERSION}. `datasets` requires version 2 minimum.")
+            TF_AVAILABLE = False
+        else:
+            logger.info(f"TensorFlow version {TF_VERSION} available.")
+else:
+    # NOTE: this branch is also reached when USE_TF itself is set to a non-true value.
+    logger.info("Disabling Tensorflow because USE_TORCH is set")
+# JAX requires both the `jax` and the `jaxlib` modules to be importable.
+JAX_VERSION = "N/A"
+JAX_AVAILABLE = False
+if USE_JAX in ENV_VARS_TRUE_AND_AUTO_VALUES:
+    JAX_AVAILABLE = importlib.util.find_spec("jax") is not None and importlib.util.find_spec("jaxlib") is not None
+    if JAX_AVAILABLE:
+        try:
+            JAX_VERSION = version.parse(importlib.metadata.version("jax"))
+            logger.info(f"JAX version {JAX_VERSION} available.")
+        except importlib.metadata.PackageNotFoundError:
+            pass
+else:
+    logger.info("Disabling JAX because USE_JAX is set to False")
+# Optional tools for data loading
+SQLALCHEMY_AVAILABLE = importlib.util.find_spec("sqlalchemy") is not None
+# Optional tools for feature decoding
+PIL_AVAILABLE = importlib.util.find_spec("PIL") is not None
+# These two flags are hard-coded to True here (no detection is performed in this module).
+IS_OPUS_SUPPORTED = True
+IS_MP3_SUPPORTED = True
+TORCHCODEC_AVAILABLE = importlib.util.find_spec("torchcodec") is not None
+TORCHVISION_AVAILABLE = importlib.util.find_spec("torchvision") is not None
+PDFPLUMBER_AVAILABLE = importlib.util.find_spec("pdfplumber") is not None
+# Optional compression tools
+RARFILE_AVAILABLE = importlib.util.find_spec("rarfile") is not None
+ZSTANDARD_AVAILABLE = importlib.util.find_spec("zstandard") is not None
+LZ4_AVAILABLE = importlib.util.find_spec("lz4") is not None
+PY7ZR_AVAILABLE = importlib.util.find_spec("py7zr") is not None
+# Cache location
+# Each location has a default derived from the previous one and can be overridden by an environment variable:
+# XDG_CACHE_HOME -> HF_HOME -> HF_DATASETS_CACHE / HF_MODULES_CACHE -> downloads -> extracted.
+DEFAULT_XDG_CACHE_HOME = "~/.cache"
+XDG_CACHE_HOME = os.getenv("XDG_CACHE_HOME", DEFAULT_XDG_CACHE_HOME)
+DEFAULT_HF_CACHE_HOME = os.path.join(XDG_CACHE_HOME, "huggingface")
+# "~" is expanded here, so the paths derived from HF_CACHE_HOME are already expanded.
+HF_CACHE_HOME = os.path.expanduser(os.getenv("HF_HOME", DEFAULT_HF_CACHE_HOME))
+DEFAULT_HF_DATASETS_CACHE = os.path.join(HF_CACHE_HOME, "datasets")
+HF_DATASETS_CACHE = Path(os.getenv("HF_DATASETS_CACHE", DEFAULT_HF_DATASETS_CACHE))
+DEFAULT_HF_MODULES_CACHE = os.path.join(HF_CACHE_HOME, "modules")
+HF_MODULES_CACHE = Path(os.getenv("HF_MODULES_CACHE", DEFAULT_HF_MODULES_CACHE))
+DOWNLOADED_DATASETS_DIR = "downloads"
+DEFAULT_DOWNLOADED_DATASETS_PATH = os.path.join(HF_DATASETS_CACHE, DOWNLOADED_DATASETS_DIR)
+DOWNLOADED_DATASETS_PATH = Path(os.getenv("HF_DATASETS_DOWNLOADED_DATASETS_PATH", DEFAULT_DOWNLOADED_DATASETS_PATH))
+EXTRACTED_DATASETS_DIR = "extracted"
+# NOTE(review): the default below is derived from DEFAULT_DOWNLOADED_DATASETS_PATH and not from
+# DOWNLOADED_DATASETS_PATH, so overriding the downloads location doesn't move the extracted directory - confirm intended.
+DEFAULT_EXTRACTED_DATASETS_PATH = os.path.join(DEFAULT_DOWNLOADED_DATASETS_PATH, EXTRACTED_DATASETS_DIR)
+EXTRACTED_DATASETS_PATH = Path(os.getenv("HF_DATASETS_EXTRACTED_DATASETS_PATH", DEFAULT_EXTRACTED_DATASETS_PATH))
+# Download count for the website
+# Enabled when the variable is unset ("AUTO") or set to a true value.
+HF_UPDATE_DOWNLOAD_COUNTS = (
+    os.environ.get("HF_UPDATE_DOWNLOAD_COUNTS", "AUTO").upper() in ENV_VARS_TRUE_AND_AUTO_VALUES
+)
+# For downloads and to check remote files metadata
+HF_DATASETS_MULTITHREADING_MAX_WORKERS = 16
+# Dataset viewer API
+USE_PARQUET_EXPORT = True
+# Batch size constants. For more info, see:
+# https://github.com/apache/arrow/blob/master/docs/source/cpp/arrays.rst#size-limitations-and-recommendations
+DEFAULT_MAX_BATCH_SIZE = 1000
+# Chunk sizes below are 256 KiB (min) and 1 MiB (max).
+DEFAULT_CDC_OPTIONS = {"min_chunk_size": 256 * 1024, "max_chunk_size": 1024 * 1024, "norm_level": 0}
+# Size of the preloaded record batch in `Dataset.__iter__`
+ARROW_READER_BATCH_SIZE_IN_DATASET_ITER = 10
+# Max uncompressed shard size in bytes (e.g. to shard parquet datasets in push_to_hub or download_and_prepare)
+# Stored as a human-readable string.
+MAX_SHARD_SIZE = "500MB"
+# Max uncompressed row group size in bytes (e.g. for parquet files in push_to_hub or download_and_prepare)
+# Stored as a human-readable string.
+MAX_ROW_GROUP_SIZE = "100MB"
+# Parquet configuration
+PARQUET_ROW_GROUP_SIZE_FOR_AUDIO_DATASETS = None
+PARQUET_ROW_GROUP_SIZE_FOR_IMAGE_DATASETS = None
+PARQUET_ROW_GROUP_SIZE_FOR_BINARY_DATASETS = None
+PARQUET_ROW_GROUP_SIZE_FOR_VIDEO_DATASETS = None
+# Arrow configuration
+ARROW_RECORD_BATCH_SIZE_FOR_AUDIO_DATASETS = 100
+ARROW_RECORD_BATCH_SIZE_FOR_IMAGE_DATASETS = 100
+ARROW_RECORD_BATCH_SIZE_FOR_BINARY_DATASETS = 100
+ARROW_RECORD_BATCH_SIZE_FOR_VIDEO_DATASETS = 10
+# Offline mode
+# When the HF_DATASETS_OFFLINE environment variable is set, it takes precedence over
+# the HF_HUB_OFFLINE value of huggingface_hub.
+_offline = os.environ.get("HF_DATASETS_OFFLINE")
+HF_HUB_OFFLINE = constants.HF_HUB_OFFLINE if _offline is None else _offline.upper() in ENV_VARS_TRUE_VALUES
+HF_DATASETS_OFFLINE = HF_HUB_OFFLINE  # kept for backward-compatibility
+# Here, `True` will disable progress bars globally without possibility of enabling it
+# programmatically. `False` will enable them without possibility of disabling them.
+# If environment variable is not set (None), then the user is free to enable/disable
+# them programmatically.
+# TL;DR: env variable has priority over code
+__HF_DATASETS_DISABLE_PROGRESS_BARS = os.environ.get("HF_DATASETS_DISABLE_PROGRESS_BARS")
+HF_DATASETS_DISABLE_PROGRESS_BARS: Optional[bool] = (
+    __HF_DATASETS_DISABLE_PROGRESS_BARS.upper() in ENV_VARS_TRUE_VALUES
+    if __HF_DATASETS_DISABLE_PROGRESS_BARS is not None
+    else None
+)
+# In-memory
+DEFAULT_IN_MEMORY_MAX_SIZE = 0  # Disabled
+# The environment value is converted with float(), a non-numeric value raises ValueError at import time.
+IN_MEMORY_MAX_SIZE = float(os.environ.get("HF_DATASETS_IN_MEMORY_MAX_SIZE", DEFAULT_IN_MEMORY_MAX_SIZE))
+# File names
+DATASET_ARROW_FILENAME = "dataset.arrow"
+DATASET_INDICES_FILENAME = "indices.arrow"
+DATASET_STATE_JSON_FILENAME = "state.json"
+DATASET_INFO_FILENAME = "dataset_info.json"
+DATASETDICT_INFOS_FILENAME = "dataset_infos.json"
+LICENSE_FILENAME = "LICENSE"
+DATASETDICT_JSON_FILENAME = "dataset_dict.json"
+METADATA_CONFIGS_FIELD = "configs"
+REPOCARD_FILENAME = "README.md"
+REPOYAML_FILENAME = ".huggingface.yaml"
+MODULE_NAME_FOR_DYNAMIC_MODULES = "datasets_modules"
+MAX_DATASET_CONFIG_ID_READABLE_LENGTH = 255
+# Temporary cache directory prefix
+TEMP_CACHE_DIR_PREFIX = "hf_datasets-"
+# Streaming
+# NOTE(review): the retry intervals are presumably expressed in seconds - confirm against the code using them.
+STREAMING_READ_MAX_RETRIES = 20
+STREAMING_READ_RETRY_INTERVAL = 5
+STREAMING_OPEN_MAX_RETRIES = 20
+STREAMING_OPEN_RETRY_INTERVAL = 5
+# Datasets repositories exploration
+DATA_FILES_MAX_NUMBER_FOR_MODULE_INFERENCE = 200
+GLOBBED_DATA_FILES_MAX_NUMBER_FOR_MODULE_INFERENCE = 10
+ARCHIVED_DATA_FILES_MAX_NUMBER_FOR_MODULE_INFERENCE = 200
+# Async map functions
+MAX_NUM_RUNNING_ASYNC_MAP_FUNCTIONS_IN_PARALLEL = 1000
+# Progress bars
+PBAR_REFRESH_TIME_INTERVAL = 0.05  # 20 progress updates per sec
+# Maximum number of uploaded files per commit
+UPLOADS_MAX_NUMBER_PER_COMMIT = 50
+# Backward compatibility
+MAX_TABLE_NBYTES_FOR_PICKLING = 4 << 30  # 4 GiB

datasets/data_files.py ADDED Viewed

	@@ -0,0 +1,807 @@

+import os
+import re
+from functools import partial
+from glob import has_magic
+from pathlib import Path, PurePath
+from typing import Callable, Optional, Union
+import huggingface_hub
+from fsspec.core import url_to_fs
+from huggingface_hub import HfFileSystem
+from packaging import version
+from tqdm.contrib.concurrent import thread_map
+from . import config
+from .download import DownloadConfig
+from .naming import _split_re
+from .splits import Split
+from .utils import logging
+from .utils import tqdm as hf_tqdm
+from .utils.file_utils import _prepare_path_and_storage_options, is_local_path, is_relative_path, xbasename, xjoin
+from .utils.py_utils import string_to_dict
+# Type alias: a tuple of zero, one or two strings.
+SingleOriginMetadata = Union[tuple[str, str], tuple[str], tuple[()]]
+# Name of the split used by `sanitize_patterns` when the user doesn't specify any split.
+SANITIZED_DEFAULT_SPLIT = str(Split.TRAIN)
+logger = logging.get_logger(__name__)
+class Url(str):
+    """A `str` subclass that adds no behavior; presumably used to mark strings that are URLs (confirm against callers)."""
+    pass
+class EmptyDatasetError(FileNotFoundError):
+    """A `FileNotFoundError` subclass that adds no behavior; presumably raised when no data files are found (confirm against callers)."""
+    pass
+# Glob pattern of sharded data files: data/{split}-SSSSS-of-NNNNN*.* where SSSSS and NNNNN are five digits each.
+SPLIT_PATTERN_SHARDED = "data/{split}-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*"
+# Keywords that identify each default split in file names and directory names.
+SPLIT_KEYWORDS = {
+    Split.TRAIN: ["train", "training"],
+    Split.VALIDATION: ["validation", "valid", "dev", "val"],
+    Split.TEST: ["test", "testing", "eval", "evaluation"],
+}
+# Characters allowed around a keyword; substituted for `{sep}` inside the `[...]` classes of the patterns below.
+NON_WORDS_CHARS = "-._ 0-9"
+# The base patterns are written differently depending on the installed fsspec version
+# (presumably because the glob semantics of "**" changed in fsspec 2023.9.0 and 2023.12.0 - confirm).
+if config.FSSPEC_VERSION < version.parse("2023.9.0"):
+    KEYWORDS_IN_FILENAME_BASE_PATTERNS = ["**[{sep}/]{keyword}[{sep}]*", "{keyword}[{sep}]*"]
+    KEYWORDS_IN_DIR_NAME_BASE_PATTERNS = [
+        "{keyword}/**",
+        "{keyword}[{sep}]*/**",
+        "**[{sep}/]{keyword}/**",
+        "**[{sep}/]{keyword}[{sep}]*/**",
+    ]
+elif config.FSSPEC_VERSION < version.parse("2023.12.0"):
+    KEYWORDS_IN_FILENAME_BASE_PATTERNS = ["**/*[{sep}/]{keyword}[{sep}]*", "{keyword}[{sep}]*"]
+    KEYWORDS_IN_DIR_NAME_BASE_PATTERNS = [
+        "{keyword}/**/*",
+        "{keyword}[{sep}]*/**/*",
+        "**/*[{sep}/]{keyword}/**/*",
+        "**/*[{sep}/]{keyword}[{sep}]*/**/*",
+    ]
+else:
+    KEYWORDS_IN_FILENAME_BASE_PATTERNS = ["**/{keyword}[{sep}]*", "**/*[{sep}]{keyword}[{sep}]*"]
+    KEYWORDS_IN_DIR_NAME_BASE_PATTERNS = [
+        "**/{keyword}/**",
+        "**/{keyword}[{sep}]*/**",
+        "**/*[{sep}]{keyword}/**",
+        "**/*[{sep}]{keyword}[{sep}]*/**",
+    ]
+DEFAULT_SPLITS = [Split.TRAIN, Split.VALIDATION, Split.TEST]
+# split -> list of patterns, one per (keyword, base pattern) pair, for split keywords found in file names.
+DEFAULT_PATTERNS_SPLIT_IN_FILENAME = {
+    split: [
+        pattern.format(keyword=keyword, sep=NON_WORDS_CHARS)
+        for keyword in SPLIT_KEYWORDS[split]
+        for pattern in KEYWORDS_IN_FILENAME_BASE_PATTERNS
+    ]
+    for split in DEFAULT_SPLITS
+}
+# Same as above, for split keywords found in directory names.
+DEFAULT_PATTERNS_SPLIT_IN_DIR_NAME = {
+    split: [
+        pattern.format(keyword=keyword, sep=NON_WORDS_CHARS)
+        for keyword in SPLIT_KEYWORDS[split]
+        for pattern in KEYWORDS_IN_DIR_NAME_BASE_PATTERNS
+    ]
+    for split in DEFAULT_SPLITS
+}
+# Catch-all: every file is assigned to the train split.
+DEFAULT_PATTERNS_ALL = {
+    Split.TRAIN: ["**"],
+}
+ALL_SPLIT_PATTERNS = [SPLIT_PATTERN_SHARDED]
+# NOTE(review): presumably tried in this order, from the most specific to the catch-all - confirm against the code using it.
+ALL_DEFAULT_PATTERNS = [
+    DEFAULT_PATTERNS_SPLIT_IN_DIR_NAME,
+    DEFAULT_PATTERNS_SPLIT_IN_FILENAME,
+    DEFAULT_PATTERNS_ALL,
+]
+# Characters that make `contains_wildcards` consider a pattern as a glob pattern.
+WILDCARD_CHARACTERS = "*[]"
+# NOTE(review): presumably file names excluded from the resolved data files - confirm against the code using it.
+FILES_TO_IGNORE = [
+    "README.md",
+    "config.json",
+    "dataset_info.json",
+    "dataset_infos.json",
+    "dummy_data.zip",
+    "dataset_dict.json",
+]
+def contains_wildcards(pattern: str) -> bool:
+    return any(wildcard_character in pattern for wildcard_character in WILDCARD_CHARACTERS)
+def sanitize_patterns(patterns: Union[dict, list, str]) -> dict[str, Union[list[str], "DataFilesList"]]:
+    """
+    Take the data_files patterns from the user, and format them into a dictionary.
+    Each key is the name of the split, and each value is a list of data files patterns (paths or urls).
+    The default split is "train".
+    Returns:
+        patterns: dictionary of split_name -> list of patterns
+    """
+    if isinstance(patterns, dict):
+        return {str(key): value if isinstance(value, list) else [value] for key, value in patterns.items()}
+    elif isinstance(patterns, str):
+        return {SANITIZED_DEFAULT_SPLIT: [patterns]}
+    elif isinstance(patterns, list):
+        if any(isinstance(pattern, dict) for pattern in patterns):
+            for pattern in patterns:
+                if not (
+                    isinstance(pattern, dict)
+                    and len(pattern) == 2
+                    and "split" in pattern
+                    and isinstance(pattern.get("path"), (str, list))
+                ):
+                    raise ValueError(
+                        f"Expected each split to have a 'path' key which can be a string or a list of strings, but got {pattern}"
+                    )
+            splits = [pattern["split"] for pattern in patterns]
+            if len(set(splits)) != len(splits):
+                raise ValueError(f"Some splits are duplicated in data_files: {splits}")
+            return {
+                str(pattern["split"]): pattern["path"] if isinstance(pattern["path"], list) else [pattern["path"]]
+                for pattern in patterns
+            }
+        else:
+            return {SANITIZED_DEFAULT_SPLIT: patterns}
+    else:
+        return sanitize_patterns(list(patterns))
+def _is_inside_unrequested_special_dir(matched_rel_path: str, pattern: str) -> bool:
+    """
+    When a path matches a pattern, we additionally check if it's inside a special directory
+    we ignore by default (if it starts with a double underscore).
+    Users can still explicitly request a filepath inside such a directory if "__pycache__" is
+    mentioned explicitly in the requested pattern.
+    Some examples:
+    base directory:
+        ./
+        └── __pycache__
+            └── b.txt
+    >>> _is_inside_unrequested_special_dir("__pycache__/b.txt", "**")
+    True
+    >>> _is_inside_unrequested_special_dir("__pycache__/b.txt", "*/b.txt")
+    True
+    >>> _is_inside_unrequested_special_dir("__pycache__/b.txt", "__pycache__/*")
+    False
+    >>> _is_inside_unrequested_special_dir("__pycache__/b.txt", "__*/*")
+    False
+    """
+    # We just need to check if every special directories from the path is present explicitly in the pattern.
+    # Since we assume that the path matches the pattern, it's equivalent to counting that both
+    # the parent path and the parent pattern have the same number of special directories.
+    data_dirs_to_ignore_in_path = [part for part in PurePath(matched_rel_path).parent.parts if part.startswith("__")]
+    data_dirs_to_ignore_in_pattern = [part for part in PurePath(pattern).parent.parts if part.startswith("__")]
+    return len(data_dirs_to_ignore_in_path) != len(data_dirs_to_ignore_in_pattern)
+def _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(matched_rel_path: str, pattern: str) -> bool:
+    """
+    When a path matches a pattern, we additionally check if it's a hidden file or if it's inside
+    a hidden directory we ignore by default, i.e. if the file name or a parent directory name starts with a dot.
+    Users can still explicitly request a filepath that is hidden or is inside a hidden directory
+    if the hidden part is mentioned explicitly in the requested pattern.
+    Some examples:
+    base directory:
+        ./
+        └── .hidden_file.txt
+    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_file.txt", "**")
+    True
+    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_file.txt", ".*")
+    False
+    base directory:
+        ./
+        └── .hidden_dir
+            └── a.txt
+    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/a.txt", "**")
+    True
+    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/a.txt", ".*/*")
+    False
+    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/a.txt", ".hidden_dir/*")
+    False
+    base directory:
+        ./
+        └── .hidden_dir
+            └── .hidden_file.txt
+    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/.hidden_file.txt", "**")
+    True
+    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/.hidden_file.txt", ".*/*")
+    True
+    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/.hidden_file.txt", ".*/.*")
+    False
+    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/.hidden_file.txt", ".hidden_dir/*")
+    True
+    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/.hidden_file.txt", ".hidden_dir/.*")
+    False
+    """
+    # We just need to check if every hidden part from the path is present explicitly in the pattern.
+    # Since we assume that the path matches the pattern, it's equivalent to counting that both
+    # the path and the pattern have the same number of hidden parts.
+    hidden_directories_in_path = [
+        part for part in PurePath(matched_rel_path).parts if part.startswith(".") and not set(part) == {"."}
+    ]
+    hidden_directories_in_pattern = [
+        part for part in PurePath(pattern).parts if part.startswith(".") and not set(part) == {"."}
+    ]
+    return len(hidden_directories_in_path) != len(hidden_directories_in_pattern)
+def _get_data_files_patterns(pattern_resolver: Callable[[str], list[str]]) -> dict[str, list[str]]:
+    """
+    Get the default pattern from a directory or repository by testing all the supported patterns.
+    The first patterns to return a non-empty list of data files is returned.
+    In order, it first tests if SPLIT_PATTERN_SHARDED works, otherwise it tests the patterns in ALL_DEFAULT_PATTERNS.
+    """
+    # first check the split patterns like data/{split}-00000-of-00001.parquet
+    for split_pattern in ALL_SPLIT_PATTERNS:
+        pattern = split_pattern.replace("{split}", "*")
+        try:
+            data_files = pattern_resolver(pattern)
+        except FileNotFoundError:
+            continue
+        if len(data_files) > 0:
+            splits: set[str] = set()
+            for p in data_files:
+                p_parts = string_to_dict(xbasename(p), xbasename(split_pattern))
+                assert p_parts is not None
+                splits.add(p_parts["split"])
+            if any(not re.match(_split_re, split) for split in splits):
+                raise ValueError(f"Split name should match '{_split_re}'' but got '{splits}'.")
+            sorted_splits = [str(split) for split in DEFAULT_SPLITS if split in splits] + sorted(
+                splits - {str(split) for split in DEFAULT_SPLITS}
+            )
+            return {split: [split_pattern.format(split=split)] for split in sorted_splits}
+    # then check the default patterns based on train/valid/test splits
+    for patterns_dict in ALL_DEFAULT_PATTERNS:
+        non_empty_splits = []
+        for split, patterns in patterns_dict.items():
+            for pattern in patterns:
+                try:
+                    data_files = pattern_resolver(pattern)
+                except FileNotFoundError:
+                    continue
+                if len(data_files) > 0:
+                    non_empty_splits.append(split)
+                    break
+        if non_empty_splits:
+            return {split: patterns_dict[split] for split in non_empty_splits}
+    raise FileNotFoundError(f"Couldn't resolve pattern {pattern} with resolver {pattern_resolver}")
+def resolve_pattern(
+    pattern: str,
+    base_path: str,
+    allowed_extensions: Optional[list[str]] = None,
+    download_config: Optional[DownloadConfig] = None,
+) -> list[str]:
+    """
+    Resolve the paths and URLs of the data files from the pattern passed by the user.
+    You can use patterns to resolve multiple local files. Here are a few examples:
+    - *.csv to match all the CSV files at the first level
+    - **.csv to match all the CSV files at any level
+    - data/* to match all the files inside "data"
+    - data/** to match all the files inside "data" and its subdirectories
+    The patterns are resolved using the fsspec glob. In fsspec>=2023.12.0 this is equivalent to
+    Python's glob.glob, Path.glob, Path.match and fnmatch where ** is unsupported with a prefix/suffix
+    other than a forward slash /.
+    More generally:
+    - '*' matches any character except a forward-slash (to match just the file or directory name)
+    - '**' matches any character including a forward-slash /
+    Hidden files and directories (i.e. whose names start with a dot) are ignored, unless they are explicitly requested.
+    The same applies to special directories that start with a double underscore like "__pycache__".
+    You can still include one if the pattern explicitly mentions it:
+    - to include a hidden file: "*/.hidden.txt" or "*/.*"
+    - to include a hidden directory: ".hidden/*" or ".*/*"
+    - to include a special directory: "__special__/*" or "__*/*"
+    Example::
+        >>> from datasets.data_files import resolve_pattern
+        >>> base_path = "."
+        >>> resolve_pattern("docs/**/*.py", base_path)
+        ['/Users/mariosasko/Desktop/projects/datasets/docs/source/_config.py']
+    Args:
+        pattern (str): Unix pattern or paths or URLs of the data files to resolve.
+            The paths can be absolute or relative to base_path.
+            Remote filesystems using fsspec are supported, e.g. with the hf:// protocol.
+        base_path (str): Base path to use when resolving relative paths.
+        allowed_extensions (Optional[list], optional): White-list of file extensions to use. Defaults to None (all extensions).
+            For example: allowed_extensions=[".csv", ".json", ".txt", ".parquet"]
+        download_config ([`DownloadConfig`], *optional*): Specific download configuration parameters.
+    Returns:
+        List[str]: List of paths or URLs to the local or remote files that match the patterns.
+    Raises:
+        FileNotFoundError: If no file is left after matching and filtering.
+    """
+    # Relative patterns are anchored at base_path. Otherwise base_path is overwritten:
+    # the drive root for an absolute local path, an empty string for anything else.
+    if is_relative_path(pattern):
+        pattern = xjoin(base_path, pattern)
+    elif is_local_path(pattern):
+        base_path = os.path.splitdrive(pattern)[0] + os.sep
+    else:
+        base_path = ""
+    pattern, storage_options = _prepare_path_and_storage_options(pattern, download_config=download_config)
+    fs, fs_pattern = url_to_fs(pattern, **storage_options)
+    # A file name from FILES_TO_IGNORE is still accepted when it is the basename of the pattern itself.
+    files_to_ignore = set(FILES_TO_IGNORE) - {xbasename(pattern)}
+    # fs.protocol is either a single string or a sequence of names; the first name is used in the latter case.
+    protocol = fs.protocol if isinstance(fs.protocol, str) else fs.protocol[0]
+    # Local results stay plain paths; any other protocol gets its "<protocol>://" prefix (re)attached below.
+    protocol_prefix = protocol + "://" if protocol != "file" else ""
+    glob_kwargs = {}
+    if protocol == "hf":
+        # 10 times faster glob with detail=True (ignores costly info like lastCommit)
+        glob_kwargs["expand_info"] = False
+    # Keep regular files (or links whose real path is a file) that are not in the ignore list and
+    # that are neither in an unrequested special directory nor hidden without being requested.
+    matched_paths = [
+        filepath if filepath.startswith(protocol_prefix) else protocol_prefix + filepath
+        for filepath, info in fs.glob(pattern, detail=True, **glob_kwargs).items()
+        if (info["type"] == "file" or (info.get("islink") and os.path.isfile(os.path.realpath(filepath))))
+        and (xbasename(filepath) not in files_to_ignore)
+        and not _is_inside_unrequested_special_dir(filepath, fs_pattern)
+        and not _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(filepath, fs_pattern)
+    ]  # ignore .ipynb and __pycache__, but keep /../
+    if allowed_extensions is not None:
+        # A file is kept as soon as ANY dot-separated suffix of its basename is allowed,
+        # so that e.g. "data.csv.gz" is accepted when ".csv" is allowed.
+        out = [
+            filepath
+            for filepath in matched_paths
+            if any("." + suffix in allowed_extensions for suffix in xbasename(filepath).split(".")[1:])
+        ]
+        if len(out) < len(matched_paths):
+            invalid_matched_files = list(set(matched_paths) - set(out))
+            logger.info(
+                f"Some files matched the pattern '{pattern}' but don't have valid data file extensions: {invalid_matched_files}"
+            )
+    else:
+        out = matched_paths
+    if not out:
+        error_msg = f"Unable to find '{pattern}'"
+        if allowed_extensions is not None:
+            error_msg += f" with any supported extension {list(allowed_extensions)}"
+        raise FileNotFoundError(error_msg)
+    return out
+def get_data_patterns(base_path: str, download_config: Optional[DownloadConfig] = None) -> dict[str, list[str]]:
+    """
+    Get the default pattern from a directory testing all the supported patterns.
+    The first patterns to return a non-empty list of data files is returned.
+    Some examples of supported patterns:
+    Input:
+        my_dataset_repository/
+        ├── README.md
+        └── dataset.csv
+    Output:
+        {'train': ['**']}
+    Input:
+        my_dataset_repository/
+        ├── README.md
+        ├── train.csv
+        └── test.csv
+        my_dataset_repository/
+        ├── README.md
+        └── data/
+            ├── train.csv
+            └── test.csv
+        my_dataset_repository/
+        ├── README.md
+        ├── train_0.csv
+        ├── train_1.csv
+        ├── train_2.csv
+        ├── train_3.csv
+        ├── test_0.csv
+        └── test_1.csv
+    Output:
+        {'train': ['**/train[-._ 0-9]*', '**/*[-._ 0-9]train[-._ 0-9]*', '**/training[-._ 0-9]*', '**/*[-._ 0-9]training[-._ 0-9]*'],
+         'test': ['**/test[-._ 0-9]*', '**/*[-._ 0-9]test[-._ 0-9]*', '**/testing[-._ 0-9]*', '**/*[-._ 0-9]testing[-._ 0-9]*', ...]}
+    Input:
+        my_dataset_repository/
+        ├── README.md
+        └── data/
+            ├── train/
+            │   ├── shard_0.csv
+            │   ├── shard_1.csv
+            │   ├── shard_2.csv
+            │   └── shard_3.csv
+            └── test/
+                ├── shard_0.csv
+                └── shard_1.csv
+    Output:
+        {'train': ['**/train/**', '**/train[-._ 0-9]*/**', '**/*[-._ 0-9]train/**', '**/*[-._ 0-9]train[-._ 0-9]*/**', ...],
+         'test': ['**/test/**', '**/test[-._ 0-9]*/**', '**/*[-._ 0-9]test/**', '**/*[-._ 0-9]test[-._ 0-9]*/**', ...]}
+    Input:
+        my_dataset_repository/
+        ├── README.md
+        └── data/
+            ├── train-00000-of-00003.csv
+            ├── train-00001-of-00003.csv
+            ├── train-00002-of-00003.csv
+            ├── test-00000-of-00001.csv
+            ├── random-00000-of-00003.csv
+            ├── random-00001-of-00003.csv
+            └── random-00002-of-00003.csv
+    Output:
+        {'train': ['data/train-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*'],
+         'test': ['data/test-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*'],
+         'random': ['data/random-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*']}
+    In order, it first tests if SPLIT_PATTERN_SHARDED works, otherwise it tests the patterns in ALL_DEFAULT_PATTERNS.
+    """
+    resolver = partial(resolve_pattern, base_path=base_path, download_config=download_config)
+    try:
+        return _get_data_files_patterns(resolver)
+    except FileNotFoundError:
+        raise EmptyDatasetError(f"The directory at {base_path} doesn't contain any data files") from None
+def _get_single_origin_metadata(
+    data_file: str,
+    download_config: Optional[DownloadConfig] = None,
+) -> SingleOriginMetadata:
+    data_file, storage_options = _prepare_path_and_storage_options(data_file, download_config=download_config)
+    fs, *_ = url_to_fs(data_file, **storage_options)
+    if isinstance(fs, HfFileSystem):
+        resolved_path = fs.resolve_path(data_file)
+        return resolved_path.repo_id, resolved_path.revision
+    elif data_file.startswith(config.HF_ENDPOINT):
+        hffs = HfFileSystem(endpoint=config.HF_ENDPOINT, token=download_config.token)
+        data_file = "hf://" + data_file[len(config.HF_ENDPOINT) + 1 :].replace("/resolve/", "@", 1)
+        resolved_path = hffs.resolve_path(data_file)
+        return resolved_path.repo_id, resolved_path.revision
+    info = fs.info(data_file)
+    # s3fs uses "ETag", gcsfs uses "etag", and for local we simply check mtime
+    for key in ["ETag", "etag", "mtime"]:
+        if key in info:
+            return (str(info[key]),)
+    return ()
+def _get_origin_metadata(
+    data_files: list[str],
+    download_config: Optional[DownloadConfig] = None,
+    max_workers: Optional[int] = None,
+) -> list[SingleOriginMetadata]:
+    max_workers = max_workers if max_workers is not None else config.HF_DATASETS_MULTITHREADING_MAX_WORKERS
+    if all("hf://" in data_file for data_file in data_files):
+        # No need for multithreading here since the origin metadata of HF files
+        # is (repo_id, revision) and is cached after first .info() call.
+        return [
+            _get_single_origin_metadata(data_file, download_config=download_config)
+            for data_file in hf_tqdm(
+                data_files,
+                desc="Resolving data files",
+                # set `disable=None` rather than `disable=False` by default to disable progress bar when no TTY attached
+                disable=len(data_files) <= 16 or None,
+            )
+        ]
+    return thread_map(
+        partial(_get_single_origin_metadata, download_config=download_config),
+        data_files,
+        max_workers=max_workers,
+        tqdm_class=hf_tqdm,
+        desc="Resolving data files",
+        # set `disable=None` rather than `disable=False` by default to disable progress bar when no TTY attached
+        disable=len(data_files) <= 16 or None,
+    )
+class DataFilesList(list[str]):
+    """
+    List of data files (absolute local paths or URLs).
+    It has two construction methods given the user's data files patterns:
+    - ``from_hf_repo``: resolve patterns inside a dataset repository
+    - ``from_local_or_remote``: resolve patterns from a local path
+    Moreover, DataFilesList has an additional attribute ``origin_metadata``.
+    It can store:
+    - the last modified time of local files
+    - ETag of remote files
+    - commit sha of a dataset repository
+    Thanks to this additional attribute, it is possible to hash the list
+    and get a different hash if and only if at least one file changed.
+    This is useful for caching Dataset objects that are obtained from a list of data files.
+    """
+    def __init__(self, data_files: list[str], origin_metadata: list[SingleOriginMetadata]) -> None:
+        super().__init__(data_files)
+        self.origin_metadata = origin_metadata
+    def __add__(self, other: "DataFilesList") -> "DataFilesList":
+        return DataFilesList([*self, *other], self.origin_metadata + other.origin_metadata)
+    @classmethod
+    def from_hf_repo(
+        cls,
+        patterns: list[str],
+        dataset_info: huggingface_hub.hf_api.DatasetInfo,
+        base_path: Optional[str] = None,
+        allowed_extensions: Optional[list[str]] = None,
+        download_config: Optional[DownloadConfig] = None,
+    ) -> "DataFilesList":
+        base_path = f"hf://datasets/{dataset_info.id}@{dataset_info.sha}/{base_path or ''}".rstrip("/")
+        return cls.from_patterns(
+            patterns, base_path=base_path, allowed_extensions=allowed_extensions, download_config=download_config
+        )
+    @classmethod
+    def from_local_or_remote(
+        cls,
+        patterns: list[str],
+        base_path: Optional[str] = None,
+        allowed_extensions: Optional[list[str]] = None,
+        download_config: Optional[DownloadConfig] = None,
+    ) -> "DataFilesList":
+        base_path = base_path if base_path is not None else Path().resolve().as_posix()
+        return cls.from_patterns(
+            patterns, base_path=base_path, allowed_extensions=allowed_extensions, download_config=download_config
+        )
+    @classmethod
+    def from_patterns(
+        cls,
+        patterns: list[str],
+        base_path: Optional[str] = None,
+        allowed_extensions: Optional[list[str]] = None,
+        download_config: Optional[DownloadConfig] = None,
+    ) -> "DataFilesList":
+        base_path = base_path if base_path is not None else Path().resolve().as_posix()
+        data_files = []
+        for pattern in patterns:
+            try:
+                data_files.extend(
+                    resolve_pattern(
+                        pattern,
+                        base_path=base_path,
+                        allowed_extensions=allowed_extensions,
+                        download_config=download_config,
+                    )
+                )
+            except FileNotFoundError:
+                if not has_magic(pattern):
+                    raise
+        origin_metadata = _get_origin_metadata(data_files, download_config=download_config)
+        return cls(data_files, origin_metadata)
+    def filter(
+        self, *, extensions: Optional[list[str]] = None, file_names: Optional[list[str]] = None
+    ) -> "DataFilesList":
+        patterns = []
+        if extensions:
+            ext_pattern = "|".join(re.escape(ext) for ext in extensions)
+            patterns.append(re.compile(f".*({ext_pattern})(\\..+)?$"))
+        if file_names:
+            fn_pattern = "|".join(re.escape(fn) for fn in file_names)
+            patterns.append(re.compile(rf".*[\/]?({fn_pattern})$"))
+        if patterns:
+            return DataFilesList(
+                [data_file for data_file in self if any(pattern.match(data_file) for pattern in patterns)],
+                origin_metadata=self.origin_metadata,
+            )
+        else:
+            return DataFilesList(list(self), origin_metadata=self.origin_metadata)
+class DataFilesDict(dict[str, DataFilesList]):
+    """
+    Dict of split_name -> list of data files (absolute local paths or URLs).
+    It has two construction methods given the user's data files patterns :
+    - ``from_hf_repo``: resolve patterns inside a dataset repository
+    - ``from_local_or_remote``: resolve patterns from a local path
+    Moreover, each list is a DataFilesList. It is possible to hash the dictionary
+    and get a different hash if and only if at least one file changed.
+    For more info, see [`DataFilesList`].
+    This is useful for caching Dataset objects that are obtained from a list of data files.
+    Changing the order of the keys of this dictionary also doesn't change its hash.
+    """
+    @classmethod
+    def from_local_or_remote(
+        cls,
+        patterns: dict[str, Union[list[str], DataFilesList]],
+        base_path: Optional[str] = None,
+        allowed_extensions: Optional[list[str]] = None,
+        download_config: Optional[DownloadConfig] = None,
+    ) -> "DataFilesDict":
+        out = cls()
+        for key, patterns_for_key in patterns.items():
+            out[key] = (
+                patterns_for_key
+                if isinstance(patterns_for_key, DataFilesList)
+                else DataFilesList.from_local_or_remote(
+                    patterns_for_key,
+                    base_path=base_path,
+                    allowed_extensions=allowed_extensions,
+                    download_config=download_config,
+                )
+            )
+        return out
+    @classmethod
+    def from_hf_repo(
+        cls,
+        patterns: dict[str, Union[list[str], DataFilesList]],
+        dataset_info: huggingface_hub.hf_api.DatasetInfo,
+        base_path: Optional[str] = None,
+        allowed_extensions: Optional[list[str]] = None,
+        download_config: Optional[DownloadConfig] = None,
+    ) -> "DataFilesDict":
+        out = cls()
+        for key, patterns_for_key in patterns.items():
+            out[key] = (
+                patterns_for_key
+                if isinstance(patterns_for_key, DataFilesList)
+                else DataFilesList.from_hf_repo(
+                    patterns_for_key,
+                    dataset_info=dataset_info,
+                    base_path=base_path,
+                    allowed_extensions=allowed_extensions,
+                    download_config=download_config,
+                )
+            )
+        return out
+    @classmethod
+    def from_patterns(
+        cls,
+        patterns: dict[str, Union[list[str], DataFilesList]],
+        base_path: Optional[str] = None,
+        allowed_extensions: Optional[list[str]] = None,
+        download_config: Optional[DownloadConfig] = None,
+    ) -> "DataFilesDict":
+        out = cls()
+        for key, patterns_for_key in patterns.items():
+            out[key] = (
+                patterns_for_key
+                if isinstance(patterns_for_key, DataFilesList)
+                else DataFilesList.from_patterns(
+                    patterns_for_key,
+                    base_path=base_path,
+                    allowed_extensions=allowed_extensions,
+                    download_config=download_config,
+                )
+            )
+        return out
+    def filter(
+        self, *, extensions: Optional[list[str]] = None, file_names: Optional[list[str]] = None
+    ) -> "DataFilesDict":
+        out = type(self)()
+        for key, data_files_list in self.items():
+            out[key] = data_files_list.filter(extensions=extensions, file_names=file_names)
+        return out
+class DataFilesPatternsList(list[str]):
+    """
+    List of data files patterns (absolute local paths or URLs).
+    For each pattern there should also be a list of allowed extensions
+    to keep, or a None ot keep all the files for the pattern.
+    """
+    def __init__(
+        self,
+        patterns: list[str],
+        allowed_extensions: list[Optional[list[str]]],
+    ):
+        super().__init__(patterns)
+        self.allowed_extensions = allowed_extensions
+    def __add__(self, other):
+        return DataFilesList([*self, *other], self.allowed_extensions + other.allowed_extensions)
+    @classmethod
+    def from_patterns(
+        cls, patterns: list[str], allowed_extensions: Optional[list[str]] = None
+    ) -> "DataFilesPatternsList":
+        return cls(patterns, [allowed_extensions] * len(patterns))
+    def resolve(
+        self,
+        base_path: str,
+        download_config: Optional[DownloadConfig] = None,
+    ) -> "DataFilesList":
+        base_path = base_path if base_path is not None else Path().resolve().as_posix()
+        data_files = []
+        for pattern, allowed_extensions in zip(self, self.allowed_extensions):
+            try:
+                data_files.extend(
+                    resolve_pattern(
+                        pattern,
+                        base_path=base_path,
+                        allowed_extensions=allowed_extensions,
+                        download_config=download_config,
+                    )
+                )
+            except FileNotFoundError:
+                if not has_magic(pattern):
+                    raise
+        origin_metadata = _get_origin_metadata(data_files, download_config=download_config)
+        return DataFilesList(data_files, origin_metadata)
+    def filter_extensions(self, extensions: list[str]) -> "DataFilesPatternsList":
+        return DataFilesPatternsList(
+            self, [allowed_extensions + extensions for allowed_extensions in self.allowed_extensions]
+        )
+class DataFilesPatternsDict(dict[str, DataFilesPatternsList]):
+    """
+    Dict of split_name -> list of data files patterns (absolute local paths or URLs).
+    """
+    @classmethod
+    def from_patterns(
+        cls, patterns: dict[str, list[str]], allowed_extensions: Optional[list[str]] = None
+    ) -> "DataFilesPatternsDict":
+        out = cls()
+        for key, patterns_for_key in patterns.items():
+            out[key] = (
+                patterns_for_key
+                if isinstance(patterns_for_key, DataFilesPatternsList)
+                else DataFilesPatternsList.from_patterns(
+                    patterns_for_key,
+                    allowed_extensions=allowed_extensions,
+                )
+            )
+        return out
+    def resolve(
+        self,
+        base_path: str,
+        download_config: Optional[DownloadConfig] = None,
+    ) -> "DataFilesDict":
+        out = DataFilesDict()
+        for key, data_files_patterns_list in self.items():
+            out[key] = data_files_patterns_list.resolve(base_path, download_config)
+        return out
+    def filter_extensions(self, extensions: list[str]) -> "DataFilesPatternsDict":
+        out = type(self)()
+        for key, data_files_patterns_list in self.items():
+            out[key] = data_files_patterns_list.filter_extensions(extensions)
+        return out

datasets/dataset_dict.py ADDED Viewed

The diff for this file is too large to render. See raw diff

datasets/distributed.py ADDED Viewed

	@@ -0,0 +1,39 @@

+from typing import TypeVar
+from .arrow_dataset import Dataset, _split_by_node_map_style_dataset
+from .iterable_dataset import IterableDataset, _split_by_node_iterable_dataset
+# Constrained type variable: lets split_dataset_by_node declare that it returns the same
+# kind of dataset (Dataset or IterableDataset) as the one it receives.
+DatasetType = TypeVar("DatasetType", Dataset, IterableDataset)
+def split_dataset_by_node(dataset: DatasetType, rank: int, world_size: int) -> DatasetType:
+    """
+    Split a dataset for the node at rank `rank` in a pool of nodes of size `world_size`.
+    For map-style datasets:
+    Each node is assigned a chunk of data, e.g. rank 0 is given the first chunk of the dataset.
+    To maximize data loading throughput, chunks are made of contiguous data on disk if possible.
+    For iterable datasets:
+    If the dataset has a number of shards that is a factor of `world_size` (i.e. if `dataset.num_shards % world_size == 0`),
+    then the shards are evenly assigned across the nodes, which is the most optimized.
+    Otherwise, each node keeps 1 example out of `world_size`, skipping the other examples.
+    Args:
+        dataset ([`Dataset`] or [`IterableDataset`]):
+            The dataset to split by node.
+        rank (`int`):
+            Rank of the current node.
+        world_size (`int`):
+            Total number of nodes.
+    Returns:
+        [`Dataset`] or [`IterableDataset`]: The dataset to be used on the node at rank `rank`.
+    """
+    if isinstance(dataset, Dataset):
+        return _split_by_node_map_style_dataset(dataset, rank=rank, world_size=world_size)
+    else:
+        return _split_by_node_iterable_dataset(dataset, rank=rank, world_size=world_size)

datasets/exceptions.py ADDED Viewed

	@@ -0,0 +1,119 @@

+# SPDX-License-Identifier: Apache-2.0
+# Copyright 2023 The HuggingFace Authors.
+from typing import Any, Optional, Union
+from huggingface_hub import HfFileSystem
+from . import config
+from .table import CastError
+from .utils.track import TrackedIterableFromGenerator, tracked_list, tracked_str
+class DatasetsError(Exception):
+    """Base class for exceptions in this library."""
+class DefunctDatasetError(DatasetsError):
+    """The dataset is defunct."""
+class FileNotFoundDatasetsError(DatasetsError, FileNotFoundError):
+    """FileNotFoundError raised by this library."""
+class DataFilesNotFoundError(FileNotFoundDatasetsError):
+    """No (supported) data files found."""
+class DatasetNotFoundError(FileNotFoundDatasetsError):
+    """Dataset not found.
+    Raised when trying to access:
+    - a missing dataset, or
+    - a private/gated dataset and the user is not authenticated.
+    """
+class DatasetBuildError(DatasetsError):
+    """Common base class of ManualDownloadError, FileFormatError and DatasetGenerationError."""
+    pass
+class ManualDownloadError(DatasetBuildError):
+    # NOTE(review): presumably raised when the data has to be downloaded manually by the user;
+    # the raising sites are outside this module - confirm.
+    pass
+class FileFormatError(DatasetBuildError):
+    # NOTE(review): presumably raised when a data file doesn't have the expected format;
+    # the raising sites are outside this module - confirm.
+    pass
+class DatasetGenerationError(DatasetBuildError):
+    """Build error that is also the base class of DatasetGenerationCastError."""
+    pass
+class DatasetGenerationCastError(DatasetGenerationError):
+    @classmethod
+    def from_cast_error(
+        cls,
+        cast_error: CastError,
+        builder_name: str,
+        gen_kwargs: dict[str, Any],
+        token: Optional[Union[bool, str]],
+    ) -> "DatasetGenerationCastError":
+        explanation_message = (
+            f"\n\nAll the data files must have the same columns, but at some point {cast_error.details()}"
+        )
+        formatted_tracked_gen_kwargs: list[str] = []
+        for gen_kwarg in gen_kwargs.values():
+            if not isinstance(gen_kwarg, (tracked_str, tracked_list, TrackedIterableFromGenerator)):
+                continue
+            while (
+                isinstance(gen_kwarg, (tracked_list, TrackedIterableFromGenerator)) and gen_kwarg.last_item is not None
+            ):
+                gen_kwarg = gen_kwarg.last_item
+            if isinstance(gen_kwarg, tracked_str):
+                gen_kwarg = gen_kwarg.get_origin()
+            if isinstance(gen_kwarg, str) and gen_kwarg.startswith("hf://"):
+                resolved_path = HfFileSystem(endpoint=config.HF_ENDPOINT, token=token).resolve_path(gen_kwarg)
+                gen_kwarg = "hf://" + resolved_path.unresolve()
+                if "@" + resolved_path.revision in gen_kwarg:
+                    gen_kwarg = (
+                        gen_kwarg.replace("@" + resolved_path.revision, "", 1)
+                        + f" (at revision {resolved_path.revision})"
+                    )
+            formatted_tracked_gen_kwargs.append(str(gen_kwarg))
+        if formatted_tracked_gen_kwargs:
+            explanation_message += f"\n\nThis happened while the {builder_name} dataset builder was generating data using\n\n{', '.join(formatted_tracked_gen_kwargs)}"
+        help_message = "\n\nPlease either edit the data files to have matching columns, or separate them into different configurations (see docs at https://hf.co/docs/hub/datasets-manual-configuration#multiple-configurations)"
+        return cls("An error occurred while generating the dataset" + explanation_message + help_message)
+class ChecksumVerificationError(DatasetsError):
+    """Error raised during checksums verifications of downloaded files."""
+class UnexpectedDownloadedFileError(ChecksumVerificationError):
+    """Some downloaded files were not expected."""
+class ExpectedMoreDownloadedFilesError(ChecksumVerificationError):
+    """Some files were supposed to be downloaded but were not."""
+class NonMatchingChecksumError(ChecksumVerificationError):
+    """The downloaded file checksum don't match the expected checksum."""
+class SplitsVerificationError(DatasetsError):
+    """Base error for failures in the verification of the dataset splits."""
+class UnexpectedSplitsError(SplitsVerificationError):
+    """The expected splits of the downloaded file are missing."""
+    # NOTE(review): the class name suggests splits that were *not* expected, while the docstring
+    # describes missing ones - confirm the wording against the places where this error is raised.
+class ExpectedMoreSplitsError(SplitsVerificationError):
+    """Splits verification error: some recorded splits are missing."""
+class NonMatchingSplitsSizesError(SplitsVerificationError):
+    """Splits verification error: the splits sizes don't match the expected splits sizes."""

datasets/fingerprint.py ADDED Viewed

	@@ -0,0 +1,454 @@

+import inspect
+import os
+import random
+import shutil
+import tempfile
+import weakref
+from functools import wraps
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, Callable, Optional, Union
+import numpy as np
+import xxhash
+from . import config
+from .naming import INVALID_WINDOWS_CHARACTERS_IN_PATH
+from .utils._dill import dumps
+from .utils.logging import get_logger
+if TYPE_CHECKING:
+    from .arrow_dataset import Dataset
+logger = get_logger(__name__)
+# Fingerprinting provides one deterministic fingerprint per dataset state.
+# A dataset fingerprint is updated after each transform.
+# Re-running the same transforms on a dataset in a different session results in the same fingerprint.
+# This is possible thanks to a custom hashing function that works with most Python objects.
+# Fingerprinting is the main mechanism that enables caching.
+# The caching mechanism makes it possible to reload an existing cache file if it has already been computed.
+#################
+# Caching
+#################
+# Global caching switch: toggled by `enable_caching()` / `disable_caching()`, read by `is_caching_enabled()`.
+_CACHING_ENABLED = True
+# Created lazily by `get_temporary_cache_files_directory()`; stays None until a temporary directory is requested.
+_TEMP_DIR_FOR_TEMP_CACHE_FILES: Optional["_TempCacheDir"] = None
+# Weak set of the datasets registered by `maybe_register_dataset_for_temp_dir_deletion()`; created on first use.
+_DATASETS_WITH_TABLE_IN_TEMP_DIR: Optional[weakref.WeakSet] = None
+class _TempCacheDir:
+    """
+    A temporary directory for storing cached Arrow files with a cleanup that frees references to the Arrow files
+    before deleting the directory itself to avoid permission errors on Windows.
+    """
+    def __init__(self):
+        self.name = tempfile.mkdtemp(prefix=config.TEMP_CACHE_DIR_PREFIX)
+        self._finalizer = weakref.finalize(self, self._cleanup)
+    def _cleanup(self):
+        for dset in get_datasets_with_cache_file_in_temp_dir():
+            dset.__del__()
+        if os.path.exists(self.name):
+            try:
+                shutil.rmtree(self.name)
+            except Exception as e:
+                raise OSError(
+                    f"An error occurred while trying to delete temporary cache directory {self.name}. Please delete it manually."
+                ) from e
+    def cleanup(self):
+        if self._finalizer.detach():
+            self._cleanup()
+def maybe_register_dataset_for_temp_dir_deletion(dataset):
+    """
+    This function registers the datasets that have cache files in _TEMP_DIR_FOR_TEMP_CACHE_FILES in order
+    to properly delete them before deleting the temporary directory.
+    The temporary directory _TEMP_DIR_FOR_TEMP_CACHE_FILES is used when caching is disabled.
+    """
+    if _TEMP_DIR_FOR_TEMP_CACHE_FILES is None:
+        return
+    global _DATASETS_WITH_TABLE_IN_TEMP_DIR
+    if _DATASETS_WITH_TABLE_IN_TEMP_DIR is None:
+        _DATASETS_WITH_TABLE_IN_TEMP_DIR = weakref.WeakSet()
+    if any(
+        Path(_TEMP_DIR_FOR_TEMP_CACHE_FILES.name) in Path(cache_file["filename"]).parents
+        for cache_file in dataset.cache_files
+    ):
+        _DATASETS_WITH_TABLE_IN_TEMP_DIR.add(dataset)
+def get_datasets_with_cache_file_in_temp_dir():
+    return list(_DATASETS_WITH_TABLE_IN_TEMP_DIR) if _DATASETS_WITH_TABLE_IN_TEMP_DIR is not None else []
+def enable_caching():
+    """
+    Enable caching by setting the module-level `_CACHING_ENABLED` flag to `True`.
+    When applying transforms on a dataset, the data are stored in cache files.
+    The caching mechanism makes it possible to reload an existing cache file if it has already been computed.
+    Reloading a dataset is possible since the cache files are named using the dataset fingerprint, which is updated
+    after each transform.
+    If disabled, the library will no longer reload cached datasets files when applying transforms to the datasets.
+    More precisely, if the caching is disabled:
+    - cache files are always recreated
+    - cache files are written to a temporary directory that is deleted when session closes
+    - cache files are named using a random hash instead of the dataset fingerprint
+    - use [`~datasets.Dataset.save_to_disk`] to save a transformed dataset or it will be deleted when session closes
+    - caching doesn't affect [`~datasets.load_dataset`]. If you want to regenerate a dataset from scratch you should use
+    the `download_mode` parameter in [`~datasets.load_dataset`].
+    """
+    global _CACHING_ENABLED
+    _CACHING_ENABLED = True
+def disable_caching():
+    """
+    Disable caching by setting the module-level `_CACHING_ENABLED` flag to `False`.
+    When applying transforms on a dataset, the data are stored in cache files.
+    The caching mechanism makes it possible to reload an existing cache file if it has already been computed.
+    Reloading a dataset is possible since the cache files are named using the dataset fingerprint, which is updated
+    after each transform.
+    If disabled, the library will no longer reload cached datasets files when applying transforms to the datasets.
+    More precisely, if the caching is disabled:
+    - cache files are always recreated
+    - cache files are written to a temporary directory that is deleted when session closes
+    - cache files are named using a random hash instead of the dataset fingerprint
+    - use [`~datasets.Dataset.save_to_disk`] to save a transformed dataset or it will be deleted when session closes
+    - caching doesn't affect [`~datasets.load_dataset`]. If you want to regenerate a dataset from scratch you should use
+    the `download_mode` parameter in [`~datasets.load_dataset`].
+    """
+    global _CACHING_ENABLED
+    _CACHING_ENABLED = False
+def is_caching_enabled() -> bool:
+    """
+    When applying transforms on a dataset, the data are stored in cache files.
+    The caching mechanism allows to reload an existing cache file if it's already been computed.
+    Reloading a dataset is possible since the cache files are named using the dataset fingerprint, which is updated
+    after each transform.
+    If disabled, the library will no longer reload cached datasets files when applying transforms to the datasets.
+    More precisely, if the caching is disabled:
+    - cache files are always recreated
+    - cache files are written to a temporary directory that is deleted when session closes
+    - cache files are named using a random hash instead of the dataset fingerprint
+    - use [`~datasets.Dataset.save_to_disk`]] to save a transformed dataset or it will be deleted when session closes
+    - caching doesn't affect [`~datasets.load_dataset`]. If you want to regenerate a dataset from scratch you should use
+    the `download_mode` parameter in [`~datasets.load_dataset`].
+    """
+    global _CACHING_ENABLED
+    return bool(_CACHING_ENABLED)
+def get_temporary_cache_files_directory() -> str:
+    """Return a directory that is deleted when session closes."""
+    global _TEMP_DIR_FOR_TEMP_CACHE_FILES
+    if _TEMP_DIR_FOR_TEMP_CACHE_FILES is None:
+        _TEMP_DIR_FOR_TEMP_CACHE_FILES = _TempCacheDir()
+    return _TEMP_DIR_FOR_TEMP_CACHE_FILES.name
+#################
+# Hashing
+#################
+class Hasher:
+    """Hasher that accepts python objects as inputs."""
+    dispatch: dict = {}
+    def __init__(self):
+        self.m = xxhash.xxh64()
+    @classmethod
+    def hash_bytes(cls, value: Union[bytes, list[bytes]]) -> str:
+        value = [value] if isinstance(value, bytes) else value
+        m = xxhash.xxh64()
+        for x in value:
+            m.update(x)
+        return m.hexdigest()
+    @classmethod
+    def hash(cls, value: Any) -> str:
+        return cls.hash_bytes(dumps(value))
+    def update(self, value: Any) -> None:
+        header_for_update = f"=={type(value)}=="
+        value_for_update = self.hash(value)
+        self.m.update(header_for_update.encode("utf8"))
+        self.m.update(value_for_update.encode("utf-8"))
+    def hexdigest(self) -> str:
+        return self.m.hexdigest()
+#################
+# Fingerprinting
+#################
+# Dedicated random number generator used by `generate_random_fingerprint()`
+fingerprint_rng = random.Random()
+# we show a warning only once when fingerprinting fails to avoid spam
+fingerprint_warnings: dict[str, bool] = {}
+def generate_fingerprint(dataset: "Dataset") -> str:
+    state = dataset.__dict__
+    hasher = Hasher()
+    for key in sorted(state):
+        if key == "_fingerprint":
+            continue
+        hasher.update(key)
+        hasher.update(state[key])
+    # hash data files last modification timestamps as well
+    for cache_file in dataset.cache_files:
+        hasher.update(os.path.getmtime(cache_file["filename"]))
+    return hasher.hexdigest()
+def generate_random_fingerprint(nbits: int = 64) -> str:
+    return f"{fingerprint_rng.getrandbits(nbits):0{nbits // 4}x}"
+def update_fingerprint(fingerprint, transform, transform_args):
+    global fingerprint_warnings
+    hasher = Hasher()
+    hasher.update(fingerprint)
+    try:
+        hasher.update(transform)
+    except:  # noqa various errors might raise here from pickle or dill
+        if _CACHING_ENABLED:
+            if not fingerprint_warnings.get("update_fingerprint_transform_hash_failed", False):
+                logger.warning(
+                    f"Transform {transform} couldn't be hashed properly, a random hash was used instead. "
+                    "Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. "
+                    "If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. "
+                    "This warning is only shown once. Subsequent hashing failures won't be shown."
+                )
+                fingerprint_warnings["update_fingerprint_transform_hash_failed"] = True
+            else:
+                logger.info(f"Transform {transform} couldn't be hashed properly, a random hash was used instead.")
+        else:
+            logger.info(
+                f"Transform {transform} couldn't be hashed properly, a random hash was used instead. This doesn't affect caching since it's disabled."
+            )
+        return generate_random_fingerprint()
+    for key in sorted(transform_args):
+        hasher.update(key)
+        try:
+            hasher.update(transform_args[key])
+        except:  # noqa various errors might raise here from pickle or dill
+            if _CACHING_ENABLED:
+                if not fingerprint_warnings.get("update_fingerprint_transform_hash_failed", False):
+                    logger.warning(
+                        f"Parameter '{key}'={transform_args[key]} of the transform {transform} couldn't be hashed properly, a random hash was used instead. "
+                        "Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. "
+                        "If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. "
+                        "This warning is only shown once. Subsequent hashing failures won't be shown."
+                    )
+                    fingerprint_warnings["update_fingerprint_transform_hash_failed"] = True
+                else:
+                    logger.info(
+                        f"Parameter '{key}'={transform_args[key]} of the transform {transform} couldn't be hashed properly, a random hash was used instead."
+                    )
+            else:
+                logger.info(
+                    f"Parameter '{key}'={transform_args[key]} of the transform {transform} couldn't be hashed properly, a random hash was used instead. This doesn't affect caching since it's disabled."
+                )
+            return generate_random_fingerprint()
+    return hasher.hexdigest()
+def validate_fingerprint(fingerprint: str, max_length=64):
+    """
+    Make sure the fingerprint is a non-empty string that is not longer that max_length=64 by default,
+    so that the fingerprint can be used to name cache files without issues.
+    """
+    if not isinstance(fingerprint, str) or not fingerprint:
+        raise ValueError(f"Invalid fingerprint '{fingerprint}': it should be a non-empty string.")
+    for invalid_char in INVALID_WINDOWS_CHARACTERS_IN_PATH:
+        if invalid_char in fingerprint:
+            raise ValueError(
+                f"Invalid fingerprint. Bad characters from black list '{INVALID_WINDOWS_CHARACTERS_IN_PATH}' found in '{fingerprint}'. "
+                f"They could create issues when creating cache files."
+            )
+    if len(fingerprint) > max_length:
+        raise ValueError(
+            f"Invalid fingerprint. Maximum lenth is {max_length} but '{fingerprint}' has length {len(fingerprint)}."
+            "It could create issues when creating cache files."
+        )
+def format_transform_for_fingerprint(func: Callable, version: Optional[str] = None) -> str:
+    """
+    Format a transform to the format that will be used to update the fingerprint.
+    """
+    transform = f"{func.__module__}.{func.__qualname__}"
+    if version is not None:
+        transform += f"@{version}"
+    return transform
+def format_kwargs_for_fingerprint(
+    func: Callable,
+    args: tuple,
+    kwargs: dict[str, Any],
+    use_kwargs: Optional[list[str]] = None,
+    ignore_kwargs: Optional[list[str]] = None,
+    randomized_function: bool = False,
+) -> dict[str, Any]:
+    """
+    Format the kwargs of a transform to the format that will be used to update the fingerprint.
+    """
+    kwargs_for_fingerprint = kwargs.copy()
+    if args:
+        params = [p.name for p in inspect.signature(func).parameters.values() if p != p.VAR_KEYWORD]
+        args = args[1:]  # assume the first argument is the dataset
+        params = params[1:]
+        kwargs_for_fingerprint.update(zip(params, args))
+    else:
+        del kwargs_for_fingerprint[
+            next(iter(inspect.signature(func).parameters))
+        ]  # assume the first key is the dataset
+    # keep the right kwargs to be hashed to generate the fingerprint
+    if use_kwargs:
+        kwargs_for_fingerprint = {k: v for k, v in kwargs_for_fingerprint.items() if k in use_kwargs}
+    if ignore_kwargs:
+        kwargs_for_fingerprint = {k: v for k, v in kwargs_for_fingerprint.items() if k not in ignore_kwargs}
+    if randomized_function:  # randomized functions have `seed` and `generator` parameters
+        if kwargs_for_fingerprint.get("seed") is None and kwargs_for_fingerprint.get("generator") is None:
+            _, seed, pos, *_ = np.random.get_state()
+            seed = seed[pos] if pos < 624 else seed[0]
+            kwargs_for_fingerprint["generator"] = np.random.default_rng(seed)
+    # remove kwargs that are the default values
+    default_values = {
+        p.name: p.default for p in inspect.signature(func).parameters.values() if p.default != inspect._empty
+    }
+    for default_varname, default_value in default_values.items():
+        if default_varname in kwargs_for_fingerprint and kwargs_for_fingerprint[default_varname] == default_value:
+            kwargs_for_fingerprint.pop(default_varname)
+    return kwargs_for_fingerprint
+def fingerprint_transform(
+    inplace: bool,
+    use_kwargs: Optional[list[str]] = None,
+    ignore_kwargs: Optional[list[str]] = None,
+    fingerprint_names: Optional[list[str]] = None,
+    randomized_function: bool = False,
+    version: Optional[str] = None,
+):
+    """
+    Wrapper for dataset transforms to update the dataset fingerprint using ``update_fingerprint``
+    Args:
+        inplace (:obj:`bool`):  If inplace is True, the fingerprint of the dataset is updated inplace.
+            Otherwise, a parameter "new_fingerprint" is passed to the wrapped method that should take care of
+            setting the fingerprint of the returned Dataset.
+        use_kwargs (:obj:`List[str]`, optional): optional white list of argument names to take into account
+            to update the fingerprint to the wrapped method that should take care of
+            setting the fingerprint of the returned Dataset. By default all the arguments are used.
+        ignore_kwargs (:obj:`List[str]`, optional): optional black list of argument names to take into account
+            to update the fingerprint. Note that ignore_kwargs prevails on use_kwargs.
+        fingerprint_names (:obj:`List[str]`, optional, defaults to ["new_fingerprint"]):
+            If the dataset transforms is not inplace and returns a DatasetDict, then it can require
+            several fingerprints (one per dataset in the DatasetDict). By specifying fingerprint_names,
+            one fingerprint named after each element of fingerprint_names is going to be passed.
+        randomized_function (:obj:`bool`, defaults to False): If the dataset transform is random and has
+            optional parameters "seed" and "generator", then you can set randomized_function to True.
+            This way, even if users set "seed" and "generator" to None, then the fingerprint is
+            going to be randomly generated depending on numpy's current state. In this case, the
+            generator is set to np.random.default_rng(np.random.get_state()[1][0]).
+        version (:obj:`str`, optional): version of the transform. The version is taken into account when
+            computing the fingerprint. If a datase transform changes (or at least if the output data
+            that are cached changes), then one should increase the version. If the version stays the
+            same, then old cached data could be reused that are not compatible with the new transform.
+            It should be in the format "MAJOR.MINOR.PATCH".
+    """
+    if use_kwargs is not None and not isinstance(use_kwargs, list):
+        raise ValueError(f"use_kwargs is supposed to be a list, not {type(use_kwargs)}")
+    if ignore_kwargs is not None and not isinstance(ignore_kwargs, list):
+        raise ValueError(f"ignore_kwargs is supposed to be a list, not {type(use_kwargs)}")
+    if inplace and fingerprint_names:
+        raise ValueError("fingerprint_names are only used when inplace is False")
+    fingerprint_names = fingerprint_names if fingerprint_names is not None else ["new_fingerprint"]
+    def _fingerprint(func):
+        if not inplace and not all(name in func.__code__.co_varnames for name in fingerprint_names):
+            raise ValueError(f"function {func} is missing parameters {fingerprint_names} in signature")
+        if randomized_function:  # randomized function have seed and generator parameters
+            if "seed" not in func.__code__.co_varnames:
+                raise ValueError(f"'seed' must be in {func}'s signature")
+            if "generator" not in func.__code__.co_varnames:
+                raise ValueError(f"'generator' must be in {func}'s signature")
+        # this call has to be outside the wrapper or since __qualname__ changes in multiprocessing
+        transform = format_transform_for_fingerprint(func, version=version)
+        @wraps(func)
+        def wrapper(*args, **kwargs):
+            kwargs_for_fingerprint = format_kwargs_for_fingerprint(
+                func,
+                args,
+                kwargs,
+                use_kwargs=use_kwargs,
+                ignore_kwargs=ignore_kwargs,
+                randomized_function=randomized_function,
+            )
+            if args:
+                dataset: Dataset = args[0]
+                args = args[1:]
+            else:
+                dataset: Dataset = kwargs.pop(next(iter(inspect.signature(func).parameters)))
+            # compute new_fingerprint and add it to the args of not in-place transforms
+            if inplace:
+                new_fingerprint = update_fingerprint(dataset._fingerprint, transform, kwargs_for_fingerprint)
+            else:
+                for fingerprint_name in fingerprint_names:  # transforms like `train_test_split` have several hashes
+                    if kwargs.get(fingerprint_name) is None:
+                        kwargs_for_fingerprint["fingerprint_name"] = fingerprint_name
+                        kwargs[fingerprint_name] = update_fingerprint(
+                            dataset._fingerprint, transform, kwargs_for_fingerprint
+                        )
+                    else:
+                        validate_fingerprint(kwargs[fingerprint_name])
+            # Call actual function
+            out = func(dataset, *args, **kwargs)
+            # Update fingerprint of in-place transforms + update in-place history of transforms
+            if inplace:  # update after calling func so that the fingerprint doesn't change if the function fails
+                dataset._fingerprint = new_fingerprint
+            return out
+        wrapper._decorator_name_ = "fingerprint"
+        return wrapper
+    return _fingerprint

datasets/hub.py ADDED Viewed

	@@ -0,0 +1,124 @@

+from itertools import chain
+from typing import Optional, Union
+from huggingface_hub import (
+    CommitInfo,
+    CommitOperationAdd,
+    CommitOperationDelete,
+    DatasetCard,
+    DatasetCardData,
+    HfApi,
+    HfFileSystem,
+)
+import datasets.config
+from datasets.info import DatasetInfosDict
+from datasets.load import load_dataset_builder
+from datasets.utils.metadata import MetadataConfigs
+def delete_from_hub(
+    repo_id: str,
+    config_name: str,
+    revision: Optional[str] = None,
+    token: Optional[Union[bool, str]] = None,
+) -> CommitInfo:
+    """Delete a dataset configuration from a [data-only dataset](repository_structure) on the Hub.
+    Args:
+        repo_id (`str`): ID of the Hub dataset repository, in the following format: `<user>/<dataset_name>` or
+            `<org>/<dataset_name>`.
+        config_name (`str`): Name of the dataset configuration.
+        revision (`str`, *optional*): Branch to delete the configuration from. Defaults to the `"main"` branch.
+        token (`bool` or `str`, *optional*): Authentication token for the Hugging Face Hub.
+    Returns:
+        `huggingface_hub.CommitInfo`
+    """
+    operations = []
+    # data_files
+    fs = HfFileSystem(endpoint=datasets.config.HF_ENDPOINT, token=token)
+    builder = load_dataset_builder(repo_id, config_name, revision=revision, token=token)
+    for data_file in chain(*builder.config.data_files.values()):
+        data_file_resolved_path = fs.resolve_path(data_file)
+        if data_file_resolved_path.repo_id == repo_id:
+            operations.append(CommitOperationDelete(path_in_repo=data_file_resolved_path.path_in_repo))
+    # README.md
+    dataset_card = DatasetCard.load(repo_id)
+    # config_names
+    if dataset_card.data.get("config_names", None) and config_name in dataset_card.data["config_names"]:
+        dataset_card.data["config_names"].remove(config_name)
+    # metadata_configs
+    metadata_configs = MetadataConfigs.from_dataset_card_data(dataset_card.data)
+    if metadata_configs:
+        _ = metadata_configs.pop(config_name, None)
+        dataset_card_data = DatasetCardData()
+        metadata_configs.to_dataset_card_data(dataset_card_data)
+        if datasets.config.METADATA_CONFIGS_FIELD in dataset_card_data:
+            dataset_card.data[datasets.config.METADATA_CONFIGS_FIELD] = dataset_card_data[
+                datasets.config.METADATA_CONFIGS_FIELD
+            ]
+        else:
+            _ = dataset_card.data.pop(datasets.config.METADATA_CONFIGS_FIELD, None)
+    # dataset_info
+    dataset_infos: DatasetInfosDict = DatasetInfosDict.from_dataset_card_data(dataset_card.data)
+    if dataset_infos:
+        _ = dataset_infos.pop(config_name, None)
+        dataset_card_data = DatasetCardData()
+        dataset_infos.to_dataset_card_data(dataset_card_data)
+        if "dataset_info" in dataset_card_data:
+            dataset_card.data["dataset_info"] = dataset_card_data["dataset_info"]
+        else:
+            _ = dataset_card.data.pop("dataset_info", None)
+    # Commit
+    operations.append(
+        CommitOperationAdd(path_in_repo=datasets.config.REPOCARD_FILENAME, path_or_fileobj=str(dataset_card).encode())
+    )
+    api = HfApi(endpoint=datasets.config.HF_ENDPOINT, token=token)
+    commit_info = api.create_commit(
+        repo_id,
+        operations=operations,
+        commit_message=f"Delete '{config_name}' config",
+        commit_description=f"Delete '{config_name}' config.",
+        token=token,
+        repo_type="dataset",
+        revision=revision,
+        create_pr=True,
+    )
+    print(f"You can find your PR to delete the dataset config at: {commit_info.pr_url}")
+    return commit_info
+def _delete_files(dataset_id, revision=None, token=None):
+    hf_api = HfApi(endpoint=datasets.config.HF_ENDPOINT, token=token)
+    repo_files = hf_api.list_repo_files(
+        dataset_id,
+        repo_type="dataset",
+    )
+    if repo_files:
+        legacy_json_file = []
+        data_files = []
+        for filename in repo_files:
+            if filename in {".gitattributes", "README.md"}:
+                continue
+            elif filename == "dataset_infos.json":
+                legacy_json_file.append(filename)
+            else:
+                data_files.append(filename)
+        if legacy_json_file:
+            hf_api.delete_file(
+                "dataset_infos.json",
+                dataset_id,
+                repo_type="dataset",
+                revision=revision,
+                commit_message="Delete legacy dataset_infos.json",
+            )
+        if data_files:
+            for filename in data_files:
+                hf_api.delete_file(
+                    filename,
+                    dataset_id,
+                    repo_type="dataset",
+                    revision=revision,
+                    commit_message="Delete data file",
+                )

datasets/info.py ADDED Viewed

	@@ -0,0 +1,430 @@

+# Copyright 2020 The HuggingFace Datasets Authors and the TensorFlow Datasets Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Lint as: python3
+"""DatasetInfo record information we know about a dataset.
+This includes things that we know about the dataset statically, i.e.:
+ - description
+ - canonical location
+ - does it have validation and tests splits
+ - size
+ - etc.
+This also includes the things that can and should be computed once we've
+processed the dataset as well:
+ - number of examples (in each split)
+ - etc.
+"""
+import copy
+import dataclasses
+import json
+import os
+import posixpath
+from dataclasses import dataclass
+from pathlib import Path
+from typing import ClassVar, Optional, Union
+import fsspec
+from fsspec.core import url_to_fs
+from huggingface_hub import DatasetCard, DatasetCardData
+from . import config
+from .features import Features
+from .splits import SplitDict
+from .utils import Version
+from .utils.logging import get_logger
+from .utils.py_utils import asdict, unique_values
+logger = get_logger(__name__)
+@dataclass
+class SupervisedKeysData:
+    """Names of the input feature and of the label feature for supervised learning (see `DatasetInfo.supervised_keys`)."""
+    input: str = ""
+    output: str = ""
+@dataclass
+class DownloadChecksumsEntryData:
+    # A single key/value pair of strings, both empty by default.
+    # NOTE(review): presumably one entry of `DatasetInfo.download_checksums` - confirm against usage.
+    key: str = ""
+    value: str = ""
+class MissingCachedSizesConfigError(Exception):
+    """The expected cached sizes of the downloaded file are missing."""
+class NonMatchingCachedSizesError(Exception):
+    """The prepared split doesn't have the expected sizes."""
+@dataclass
+class PostProcessedInfo:
+    features: Optional[Features] = None
+    resources_checksums: Optional[dict] = None
+    def __post_init__(self):
+        # Convert back to the correct classes when we reload from dict
+        if self.features is not None and not isinstance(self.features, Features):
+            self.features = Features.from_dict(self.features)
+    @classmethod
+    def from_dict(cls, post_processed_info_dict: dict) -> "PostProcessedInfo":
+        field_names = {f.name for f in dataclasses.fields(cls)}
+        return cls(**{k: v for k, v in post_processed_info_dict.items() if k in field_names})
+@dataclass
+class DatasetInfo:
+    """Information about a dataset.
+    `DatasetInfo` documents datasets, including its name, version, and features.
+    See the constructor arguments and properties for a full list.
+    Not all fields are known on construction and may be updated later.
+    Attributes:
+        description (`str`):
+            A description of the dataset.
+        citation (`str`):
+            A BibTeX citation of the dataset.
+        homepage (`str`):
+            A URL to the official homepage for the dataset.
+        license (`str`):
+            The dataset's license. It can be the name of the license or a paragraph containing the terms of the license.
+        features ([`Features`], *optional*):
+            The features used to specify the dataset's column types.
+        post_processed (`PostProcessedInfo`, *optional*):
+            Information regarding the resources of a possible post-processing of a dataset. For example, it can contain the information of an index.
+        supervised_keys (`SupervisedKeysData`, *optional*):
+            Specifies the input feature and the label for supervised learning if applicable for the dataset (legacy from TFDS).
+        builder_name (`str`, *optional*):
+            The name of the `GeneratorBasedBuilder` subclass used to create the dataset. It is also the snake_case version of the dataset builder class name.
+        config_name (`str`, *optional*):
+            The name of the configuration derived from [`BuilderConfig`].
+        version (`str` or [`Version`], *optional*):
+            The version of the dataset.
+        splits (`dict`, *optional*):
+            The mapping between split name and metadata.
+        download_checksums (`dict`, *optional*):
+            The mapping between the URL to download the dataset's checksums and corresponding metadata.
+        download_size (`int`, *optional*):
+            The size of the files to download to generate the dataset, in bytes.
+        post_processing_size (`int`, *optional*):
+            Size of the dataset in bytes after post-processing, if any.
+        dataset_size (`int`, *optional*):
+            The combined size in bytes of the Arrow tables for all splits.
+        size_in_bytes (`int`, *optional*):
+            The combined size in bytes of all files associated with the dataset (downloaded files + Arrow files).
+        **config_kwargs (additional keyword arguments):
+            Keyword arguments to be passed to the [`BuilderConfig`] and used in the [`DatasetBuilder`].
+    """
+    # Set in the dataset builders
+    # The four text fields use `default_factory=str`, so they default to an empty string rather than `None`.
+    description: str = dataclasses.field(default_factory=str)
+    citation: str = dataclasses.field(default_factory=str)
+    homepage: str = dataclasses.field(default_factory=str)
+    license: str = dataclasses.field(default_factory=str)
+    # The next three fields may be given as plain data; `__post_init__` converts them to the annotated classes.
+    features: Optional[Features] = None
+    post_processed: Optional[PostProcessedInfo] = None
+    supervised_keys: Optional[SupervisedKeysData] = None
+    # Set later by the builder
+    builder_name: Optional[str] = None
+    dataset_name: Optional[str] = None  # for packaged builders, to be different from builder_name
+    config_name: Optional[str] = None
+    version: Optional[Union[str, Version]] = None
+    # Set later by `download_and_prepare`
+    splits: Optional[dict] = None
+    download_checksums: Optional[dict] = None
+    download_size: Optional[int] = None
+    post_processing_size: Optional[int] = None
+    dataset_size: Optional[int] = None
+    size_in_bytes: Optional[int] = None
+    # Names of the fields that `_to_yaml_dict` keeps when building the YAML representation;
+    # every other field is left out of it.
+    _INCLUDED_INFO_IN_YAML: ClassVar[list[str]] = [
+        "config_name",
+        "download_size",
+        "dataset_size",
+        "features",
+        "splits",
+    ]
+    def __post_init__(self):
+        # Convert back to the correct classes when we reload from dict
+        if self.features is not None and not isinstance(self.features, Features):
+            self.features = Features.from_dict(self.features)
+        if self.post_processed is not None and not isinstance(self.post_processed, PostProcessedInfo):
+            self.post_processed = PostProcessedInfo.from_dict(self.post_processed)
+        if self.version is not None and not isinstance(self.version, Version):
+            if isinstance(self.version, str):
+                self.version = Version(self.version)
+            else:
+                self.version = Version.from_dict(self.version)
+        if self.splits is not None and not isinstance(self.splits, SplitDict):
+            self.splits = SplitDict.from_split_dict(self.splits)
+        if self.supervised_keys is not None and not isinstance(self.supervised_keys, SupervisedKeysData):
+            if isinstance(self.supervised_keys, (tuple, list)):
+                self.supervised_keys = SupervisedKeysData(*self.supervised_keys)
+            else:
+                self.supervised_keys = SupervisedKeysData(**self.supervised_keys)
+    def write_to_directory(self, dataset_info_dir, pretty_print=False, storage_options: Optional[dict] = None):
+        """Write `DatasetInfo` and license (if present) as JSON files to `dataset_info_dir`.
+        Args:
+            dataset_info_dir (`str`):
+                Destination directory.
+            pretty_print (`bool`, defaults to `False`):
+                If `True`, the JSON will be pretty-printed with the indent level of 4.
+            storage_options (`dict`, *optional*):
+                Key/value pairs to be passed on to the file-system backend, if any.
+                <Added version="2.9.0"/>
+        Example:
+        ```py
+        >>> from datasets import load_dataset
+        >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", split="validation")
+        >>> ds.info.write_to_directory("/path/to/directory/")
+        ```
+        """
+        fs: fsspec.AbstractFileSystem
+        fs, *_ = url_to_fs(dataset_info_dir, **(storage_options or {}))
+        with fs.open(posixpath.join(dataset_info_dir, config.DATASET_INFO_FILENAME), "wb") as f:
+            self._dump_info(f, pretty_print=pretty_print)
+        if self.license:
+            with fs.open(posixpath.join(dataset_info_dir, config.LICENSE_FILENAME), "wb") as f:
+                self._dump_license(f)
+    def _dump_info(self, file, pretty_print=False):
+        """Dump info in `file` file-like object open in bytes mode (to support remote files)"""
+        file.write(json.dumps(asdict(self), indent=4 if pretty_print else None).encode("utf-8"))
+    def _dump_license(self, file):
+        """Dump license in `file` file-like object open in bytes mode (to support remote files)"""
+        file.write(self.license.encode("utf-8"))
+    @classmethod
+    def from_merge(cls, dataset_infos: list["DatasetInfo"]):
+        dataset_infos = [dset_info.copy() for dset_info in dataset_infos if dset_info is not None]
+        if len(dataset_infos) > 0 and all(dataset_infos[0] == dset_info for dset_info in dataset_infos):
+            # if all dataset_infos are equal we don't need to merge. Just return the first.
+            return dataset_infos[0]
+        description = "\n\n".join(unique_values(info.description for info in dataset_infos)).strip()
+        citation = "\n\n".join(unique_values(info.citation for info in dataset_infos)).strip()
+        homepage = "\n\n".join(unique_values(info.homepage for info in dataset_infos)).strip()
+        license = "\n\n".join(unique_values(info.license for info in dataset_infos)).strip()
+        features = None
+        supervised_keys = None
+        return cls(
+            description=description,
+            citation=citation,
+            homepage=homepage,
+            license=license,
+            features=features,
+            supervised_keys=supervised_keys,
+        )
+    @classmethod
+    def from_directory(cls, dataset_info_dir: str, storage_options: Optional[dict] = None) -> "DatasetInfo":
+        """Create [`DatasetInfo`] from the JSON file in `dataset_info_dir`.
+        This function updates all the dynamically generated fields (num_examples,
+        hash, time of creation,...) of the [`DatasetInfo`].
+        This will overwrite all previous metadata.
+        Args:
+            dataset_info_dir (`str`):
+                The directory containing the metadata file. This
+                should be the root directory of a specific dataset version.
+            storage_options (`dict`, *optional*):
+                Key/value pairs to be passed on to the file-system backend, if any.
+                <Added version="2.9.0"/>
+        Example:
+        ```py
+        >>> from datasets import DatasetInfo
+        >>> ds_info = DatasetInfo.from_directory("/path/to/directory/")
+        ```
+        """
+        fs: fsspec.AbstractFileSystem
+        fs, *_ = url_to_fs(dataset_info_dir, **(storage_options or {}))
+        logger.debug(f"Loading Dataset info from {dataset_info_dir}")
+        if not dataset_info_dir:
+            raise ValueError("Calling DatasetInfo.from_directory() with undefined dataset_info_dir.")
+        with fs.open(posixpath.join(dataset_info_dir, config.DATASET_INFO_FILENAME), "r", encoding="utf-8") as f:
+            dataset_info_dict = json.load(f)
+        return cls.from_dict(dataset_info_dict)
+    @classmethod
+    def from_dict(cls, dataset_info_dict: dict) -> "DatasetInfo":
+        field_names = {f.name for f in dataclasses.fields(cls)}
+        return cls(**{k: v for k, v in dataset_info_dict.items() if k in field_names})
+    def update(self, other_dataset_info: "DatasetInfo", ignore_none=True):
+        self_dict = self.__dict__
+        self_dict.update(
+            **{
+                k: copy.deepcopy(v)
+                for k, v in other_dataset_info.__dict__.items()
+                if (v is not None or not ignore_none)
+            }
+        )
+    def copy(self) -> "DatasetInfo":
+        return self.__class__(**{k: copy.deepcopy(v) for k, v in self.__dict__.items()})
+    def _to_yaml_dict(self) -> dict:
+        yaml_dict = {}
+        dataset_info_dict = asdict(self)
+        for key in dataset_info_dict:
+            if key in self._INCLUDED_INFO_IN_YAML:
+                value = getattr(self, key)
+                if hasattr(value, "_to_yaml_list"):  # Features, SplitDict
+                    yaml_dict[key] = value._to_yaml_list()
+                elif hasattr(value, "_to_yaml_string"):  # Version
+                    yaml_dict[key] = value._to_yaml_string()
+                else:
+                    yaml_dict[key] = value
+        return yaml_dict
+    @classmethod
+    def _from_yaml_dict(cls, yaml_data: dict) -> "DatasetInfo":
+        yaml_data = copy.deepcopy(yaml_data)
+        if yaml_data.get("features") is not None:
+            yaml_data["features"] = Features._from_yaml_list(yaml_data["features"])
+        if yaml_data.get("splits") is not None:
+            yaml_data["splits"] = SplitDict._from_yaml_list(yaml_data["splits"])
+        field_names = {f.name for f in dataclasses.fields(cls)}
+        return cls(**{k: v for k, v in yaml_data.items() if k in field_names})
+class DatasetInfosDict(dict[str, DatasetInfo]):
+    def write_to_directory(self, dataset_infos_dir, overwrite=False, pretty_print=False) -> None:
+        total_dataset_infos = {}
+        dataset_infos_path = os.path.join(dataset_infos_dir, config.DATASETDICT_INFOS_FILENAME)
+        dataset_readme_path = os.path.join(dataset_infos_dir, config.REPOCARD_FILENAME)
+        if not overwrite:
+            total_dataset_infos = self.from_directory(dataset_infos_dir)
+        total_dataset_infos.update(self)
+        if os.path.exists(dataset_infos_path):
+            # for backward compatibility, let's update the JSON file if it exists
+            with open(dataset_infos_path, "w", encoding="utf-8") as f:
+                dataset_infos_dict = {
+                    config_name: asdict(dset_info) for config_name, dset_info in total_dataset_infos.items()
+                }
+                json.dump(dataset_infos_dict, f, indent=4 if pretty_print else None)
+        # Dump the infos in the YAML part of the README.md file
+        if os.path.exists(dataset_readme_path):
+            dataset_card = DatasetCard.load(dataset_readme_path)
+            dataset_card_data = dataset_card.data
+        else:
+            dataset_card = None
+            dataset_card_data = DatasetCardData()
+        if total_dataset_infos:
+            total_dataset_infos.to_dataset_card_data(dataset_card_data)
+            dataset_card = (
+                DatasetCard("---\n" + str(dataset_card_data) + "\n---\n") if dataset_card is None else dataset_card
+            )
+            dataset_card.save(Path(dataset_readme_path))
+    @classmethod
+    def from_directory(cls, dataset_infos_dir) -> "DatasetInfosDict":
+        logger.debug(f"Loading Dataset Infos from {dataset_infos_dir}")
+        # Load the info from the YAML part of README.md
+        if os.path.exists(os.path.join(dataset_infos_dir, config.REPOCARD_FILENAME)):
+            dataset_card_data = DatasetCard.load(Path(dataset_infos_dir) / config.REPOCARD_FILENAME).data
+            if "dataset_info" in dataset_card_data:
+                return cls.from_dataset_card_data(dataset_card_data)
+        if os.path.exists(os.path.join(dataset_infos_dir, config.DATASETDICT_INFOS_FILENAME)):
+            # this is just to have backward compatibility with dataset_infos.json files
+            with open(os.path.join(dataset_infos_dir, config.DATASETDICT_INFOS_FILENAME), encoding="utf-8") as f:
+                return cls(
+                    {
+                        config_name: DatasetInfo.from_dict(dataset_info_dict)
+                        for config_name, dataset_info_dict in json.load(f).items()
+                    }
+                )
+        else:
+            return cls()
+    @classmethod
+    def from_dataset_card_data(cls, dataset_card_data: DatasetCardData) -> "DatasetInfosDict":
+        if isinstance(dataset_card_data.get("dataset_info"), (list, dict)):
+            if isinstance(dataset_card_data["dataset_info"], list):
+                return cls(
+                    {
+                        dataset_info_yaml_dict.get("config_name", "default"): DatasetInfo._from_yaml_dict(
+                            dataset_info_yaml_dict
+                        )
+                        for dataset_info_yaml_dict in dataset_card_data["dataset_info"]
+                    }
+                )
+            else:
+                dataset_info = DatasetInfo._from_yaml_dict(dataset_card_data["dataset_info"])
+                dataset_info.config_name = dataset_card_data["dataset_info"].get("config_name", "default")
+                return cls({dataset_info.config_name: dataset_info})
+        else:
+            return cls()
+    def to_dataset_card_data(self, dataset_card_data: DatasetCardData) -> None:
+        if self:
+            # first get existing metadata info
+            if "dataset_info" in dataset_card_data and isinstance(dataset_card_data["dataset_info"], dict):
+                dataset_metadata_infos = {
+                    dataset_card_data["dataset_info"].get("config_name", "default"): dataset_card_data["dataset_info"]
+                }
+            elif "dataset_info" in dataset_card_data and isinstance(dataset_card_data["dataset_info"], list):
+                dataset_metadata_infos = {
+                    config_metadata["config_name"]: config_metadata
+                    for config_metadata in dataset_card_data["dataset_info"]
+                }
+            else:
+                dataset_metadata_infos = {}
+            # update/rewrite existing metadata info with the one to dump
+            total_dataset_infos = {
+                **dataset_metadata_infos,
+                **{config_name: dset_info._to_yaml_dict() for config_name, dset_info in self.items()},
+            }
+            # the config_name from the dataset_infos_dict takes over the config_name of the DatasetInfo
+            for config_name, dset_info_yaml_dict in total_dataset_infos.items():
+                dset_info_yaml_dict["config_name"] = config_name
+            if len(total_dataset_infos) == 1:
+                # use a struct instead of a list of configurations, since there's only one
+                dataset_card_data["dataset_info"] = next(iter(total_dataset_infos.values()))
+                config_name = dataset_card_data["dataset_info"].pop("config_name", None)
+                if config_name != "default":
+                    # if config_name is not "default" preserve it and put at the first position
+                    dataset_card_data["dataset_info"] = {
+                        "config_name": config_name,
+                        **dataset_card_data["dataset_info"],
+                    }
+            else:
+                dataset_card_data["dataset_info"] = []
+                for config_name, dataset_info_yaml_dict in sorted(total_dataset_infos.items()):
+                    # add the config_name field in first position
+                    dataset_info_yaml_dict.pop("config_name", None)
+                    dataset_info_yaml_dict = {"config_name": config_name, **dataset_info_yaml_dict}
+                    dataset_card_data["dataset_info"].append(dataset_info_yaml_dict)

datasets/inspect.py ADDED Viewed

	@@ -0,0 +1,353 @@

+# Copyright 2020 The HuggingFace Datasets Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Lint as: python3
+"""List and inspect datasets."""
+import os
+from collections.abc import Mapping, Sequence
+from typing import Optional, Union
+from .download.download_config import DownloadConfig
+from .download.download_manager import DownloadMode
+from .download.streaming_download_manager import StreamingDownloadManager
+from .info import DatasetInfo
+from .load import (
+    dataset_module_factory,
+    get_dataset_builder_class,
+    load_dataset_builder,
+)
+from .utils.logging import get_logger
+from .utils.version import Version
+logger = get_logger(__name__)
+class SplitsNotFoundError(ValueError):
+    pass
+def get_dataset_infos(
+    path: str,
+    data_files: Optional[Union[dict, list, str]] = None,
+    download_config: Optional[DownloadConfig] = None,
+    download_mode: Optional[Union[DownloadMode, str]] = None,
+    revision: Optional[Union[str, Version]] = None,
+    token: Optional[Union[bool, str]] = None,
+    **config_kwargs,
+):
+    """Get the meta information about a dataset, returned as a dict mapping config name to DatasetInfoDict.
+    Args:
+        path (`str`): path to the dataset repository. Can be either:
+            - a local path to the dataset directory containing the data files,
+                e.g. `'./dataset/squad'`
+            - a dataset identifier on the Hugging Face Hub (list all available datasets and ids with [`huggingface_hub.list_datasets`]),
+                e.g. `'rajpurkar/squad'`, `'nyu-mll/glue'` or``'openai/webtext'`
+        revision (`Union[str, datasets.Version]`, *optional*):
+            If specified, the dataset module will be loaded from the datasets repository at this version.
+            By default:
+            - it is set to the local version of the lib.
+            - it will also try to load it from the main branch if it's not available at the local version of the lib.
+            Specifying a version that is different from your local version of the lib might cause compatibility issues.
+        download_config ([`DownloadConfig`], *optional*):
+            Specific download configuration parameters.
+        download_mode ([`DownloadMode`] or `str`, defaults to `REUSE_DATASET_IF_EXISTS`):
+            Download/generate mode.
+        data_files (`Union[Dict, List, str]`, *optional*):
+            Defining the data_files of the dataset configuration.
+        token (`str` or `bool`, *optional*):
+            Optional string or boolean to use as Bearer token for remote files on the Datasets Hub.
+            If `True`, or not specified, will get token from `"~/.huggingface"`.
+        **config_kwargs (additional keyword arguments):
+            Optional attributes for builder class which will override the attributes if supplied.
+    Example:
+    ```py
+    >>> from datasets import get_dataset_infos
+    >>> get_dataset_infos('cornell-movie-review-data/rotten_tomatoes')
+    {'default': DatasetInfo(description="Movie Review Dataset.\nThis is a dataset of containing 5,331 positive and 5,331 negative processed\nsentences from Rotten Tomatoes movie reviews...), ...}
+    ```
+    """
+    config_names = get_dataset_config_names(
+        path=path,
+        revision=revision,
+        download_config=download_config,
+        download_mode=download_mode,
+        data_files=data_files,
+        token=token,
+    )
+    return {
+        config_name: get_dataset_config_info(
+            path=path,
+            config_name=config_name,
+            data_files=data_files,
+            download_config=download_config,
+            download_mode=download_mode,
+            revision=revision,
+            token=token,
+            **config_kwargs,
+        )
+        for config_name in config_names
+    }
+def get_dataset_config_names(
+    path: str,
+    revision: Optional[Union[str, Version]] = None,
+    download_config: Optional[DownloadConfig] = None,
+    download_mode: Optional[Union[DownloadMode, str]] = None,
+    data_files: Optional[Union[dict, list, str]] = None,
+    **download_kwargs,
+):
+    """Get the list of available config names for a particular dataset.
+    Args:
+        path (`str`): path to the dataset repository. Can be either:
+            - a local path to the dataset directory containing the data files,
+                e.g. `'./dataset/squad'`
+            - a dataset identifier on the Hugging Face Hub (list all available datasets and ids with [`huggingface_hub.list_datasets`]),
+                e.g. `'rajpurkar/squad'`, `'nyu-mll/glue'` or``'openai/webtext'`
+        revision (`Union[str, datasets.Version]`, *optional*):
+            If specified, the dataset module will be loaded from the datasets repository at this version.
+            By default:
+            - it is set to the local version of the lib.
+            - it will also try to load it from the main branch if it's not available at the local version of the lib.
+            Specifying a version that is different from your local version of the lib might cause compatibility issues.
+        download_config ([`DownloadConfig`], *optional*):
+            Specific download configuration parameters.
+        download_mode ([`DownloadMode`] or `str`, defaults to `REUSE_DATASET_IF_EXISTS`):
+            Download/generate mode.
+        data_files (`Union[Dict, List, str]`, *optional*):
+            Defining the data_files of the dataset configuration.
+        **download_kwargs (additional keyword arguments):
+            Optional attributes for [`DownloadConfig`] which will override the attributes in `download_config` if supplied,
+            for example `token`.
+    Example:
+    ```py
+    >>> from datasets import get_dataset_config_names
+    >>> get_dataset_config_names("nyu-mll/glue")
+    ['cola',
+     'sst2',
+     'mrpc',
+     'qqp',
+     'stsb',
+     'mnli',
+     'mnli_mismatched',
+     'mnli_matched',
+     'qnli',
+     'rte',
+     'wnli',
+     'ax']
+    ```
+    """
+    dataset_module = dataset_module_factory(
+        path,
+        revision=revision,
+        download_config=download_config,
+        download_mode=download_mode,
+        data_files=data_files,
+        **download_kwargs,
+    )
+    builder_cls = get_dataset_builder_class(dataset_module, dataset_name=os.path.basename(path))
+    return list(builder_cls.builder_configs.keys()) or [
+        dataset_module.builder_kwargs.get("config_name", builder_cls.DEFAULT_CONFIG_NAME or "default")
+    ]
+def get_dataset_default_config_name(
+    path: str,
+    revision: Optional[Union[str, Version]] = None,
+    download_config: Optional[DownloadConfig] = None,
+    download_mode: Optional[Union[DownloadMode, str]] = None,
+    data_files: Optional[Union[dict, list, str]] = None,
+    **download_kwargs,
+) -> Optional[str]:
+    """Get the default config name for a particular dataset.
+    Can return None only if the dataset has multiple configurations and no default configuration.
+    Args:
+        path (`str`): path to the dataset repository. Can be either:
+            - a local path to the dataset directory containing the data files,
+                e.g. `'./dataset/squad'`
+            - a dataset identifier on the Hugging Face Hub (list all available datasets and ids with [`huggingface_hub.list_datasets`]),
+                e.g. `'rajpurkar/squad'`, `'nyu-mll/glue'` or``'openai/webtext'`
+        revision (`Union[str, datasets.Version]`, *optional*):
+            If specified, the dataset module will be loaded from the datasets repository at this version.
+            By default:
+            - it is set to the local version of the lib.
+            - it will also try to load it from the main branch if it's not available at the local version of the lib.
+            Specifying a version that is different from your local version of the lib might cause compatibility issues.
+        download_config ([`DownloadConfig`], *optional*):
+            Specific download configuration parameters.
+        download_mode ([`DownloadMode`] or `str`, defaults to `REUSE_DATASET_IF_EXISTS`):
+            Download/generate mode.
+        data_files (`Union[Dict, List, str]`, *optional*):
+            Defining the data_files of the dataset configuration.
+        **download_kwargs (additional keyword arguments):
+            Optional attributes for [`DownloadConfig`] which will override the attributes in `download_config` if supplied,
+            for example `token`.
+    Returns:
+        Optional[str]: the default config name if there is one
+    Example:
+    ```py
+    >>> from datasets import get_dataset_default_config_name
+    >>> get_dataset_default_config_name("openbookqa")
+    'main'
+    ```
+    """
+    dataset_module = dataset_module_factory(
+        path,
+        revision=revision,
+        download_config=download_config,
+        download_mode=download_mode,
+        data_files=data_files,
+        **download_kwargs,
+    )
+    builder_cls = get_dataset_builder_class(dataset_module, dataset_name=os.path.basename(path))
+    builder_configs = list(builder_cls.builder_configs.keys())
+    if builder_configs:
+        default_config_name = builder_configs[0] if len(builder_configs) == 1 else None
+    else:
+        default_config_name = "default"
+    return builder_cls.DEFAULT_CONFIG_NAME or default_config_name
+def get_dataset_config_info(
+    path: str,
+    config_name: Optional[str] = None,
+    data_files: Optional[Union[str, Sequence[str], Mapping[str, Union[str, Sequence[str]]]]] = None,
+    download_config: Optional[DownloadConfig] = None,
+    download_mode: Optional[Union[DownloadMode, str]] = None,
+    revision: Optional[Union[str, Version]] = None,
+    token: Optional[Union[bool, str]] = None,
+    **config_kwargs,
+) -> DatasetInfo:
+    """Get the meta information (DatasetInfo) about a dataset for a particular config
+    Args:
+        path (`str`): path to the dataset repository. Can be either:
+            - a local path to the dataset directory containing the data files,
+                e.g. `'./dataset/squad'`
+            - a dataset identifier on the Hugging Face Hub (list all available datasets and ids with [`huggingface_hub.list_datasets`]),
+                e.g. `'rajpurkar/squad'`, `'nyu-mll/glue'` or``'openai/webtext'`
+        config_name (:obj:`str`, optional): Defining the name of the dataset configuration.
+        data_files (:obj:`str` or :obj:`Sequence` or :obj:`Mapping`, optional): Path(s) to source data file(s).
+        download_config (:class:`~download.DownloadConfig`, optional): Specific download configuration parameters.
+        download_mode (:class:`DownloadMode` or :obj:`str`, default ``REUSE_DATASET_IF_EXISTS``): Download/generate mode.
+        revision (:class:`~utils.Version` or :obj:`str`, optional): Version of the dataset to load.
+            As datasets have their own git repository on the Datasets Hub, the default version "main" corresponds to their "main" branch.
+            You can specify a different version than the default "main" by using a commit SHA or a git tag of the dataset repository.
+        token (``str`` or :obj:`bool`, optional): Optional string or boolean to use as Bearer token for remote files on the Datasets Hub.
+            If True, or not specified, will get token from `"~/.huggingface"`.
+        **config_kwargs (additional keyword arguments): optional attributes for builder class which will override the attributes if supplied.
+    """
+    builder = load_dataset_builder(
+        path,
+        name=config_name,
+        data_files=data_files,
+        download_config=download_config,
+        download_mode=download_mode,
+        revision=revision,
+        token=token,
+        **config_kwargs,
+    )
+    info = builder.info
+    if info.splits is None:
+        download_config = download_config.copy() if download_config else DownloadConfig()
+        if token is not None:
+            download_config.token = token
+        builder._check_manual_download(
+            StreamingDownloadManager(base_path=builder.base_path, download_config=download_config)
+        )
+        try:
+            info.splits = {
+                split_generator.name: {"name": split_generator.name, "dataset_name": path}
+                for split_generator in builder._split_generators(
+                    StreamingDownloadManager(base_path=builder.base_path, download_config=download_config)
+                )
+            }
+        except Exception as err:
+            raise SplitsNotFoundError("The split names could not be parsed from the dataset config.") from err
+    return info
+def get_dataset_split_names(
+    path: str,
+    config_name: Optional[str] = None,
+    data_files: Optional[Union[str, Sequence[str], Mapping[str, Union[str, Sequence[str]]]]] = None,
+    download_config: Optional[DownloadConfig] = None,
+    download_mode: Optional[Union[DownloadMode, str]] = None,
+    revision: Optional[Union[str, Version]] = None,
+    token: Optional[Union[bool, str]] = None,
+    **config_kwargs,
+):
+    """Get the list of available splits for a particular config and dataset.
+    Args:
+        path (`str`): path to the dataset repository. Can be either:
+            - a local path to the dataset directory containing the data files,
+                e.g. `'./dataset/squad'`
+            - a dataset identifier on the Hugging Face Hub (list all available datasets and ids with [`huggingface_hub.list_datasets`]),
+                e.g. `'rajpurkar/squad'`, `'nyu-mll/glue'` or``'openai/webtext'`
+        config_name (`str`, *optional*):
+            Defining the name of the dataset configuration.
+        data_files (`str` or `Sequence` or `Mapping`, *optional*):
+            Path(s) to source data file(s).
+        download_config ([`DownloadConfig`], *optional*):
+            Specific download configuration parameters.
+        download_mode ([`DownloadMode`] or `str`, defaults to `REUSE_DATASET_IF_EXISTS`):
+            Download/generate mode.
+        revision ([`Version`] or `str`, *optional*):
+            Version of the dataset to load.
+            As datasets have their own git repository on the Datasets Hub, the default version "main" corresponds to their "main" branch.
+            You can specify a different version than the default "main" by using a commit SHA or a git tag of the dataset repository.
+        token (`str` or `bool`, *optional*):
+            Optional string or boolean to use as Bearer token for remote files on the Datasets Hub.
+            If `True`, or not specified, will get token from `"~/.huggingface"`.
+        **config_kwargs (additional keyword arguments):
+            Optional attributes for builder class which will override the attributes if supplied.
+    Example:
+    ```py
+    >>> from datasets import get_dataset_split_names
+    >>> get_dataset_split_names('cornell-movie-review-data/rotten_tomatoes')
+    ['train', 'validation', 'test']
+    ```
+    """
+    info = get_dataset_config_info(
+        path,
+        config_name=config_name,
+        data_files=data_files,
+        download_config=download_config,
+        download_mode=download_mode,
+        revision=revision,
+        token=token,
+        **config_kwargs,
+    )
+    return list(info.splits.keys())

datasets/iterable_dataset.py ADDED Viewed

The diff for this file is too large to render. See raw diff

datasets/keyhash.py ADDED Viewed

	@@ -0,0 +1,104 @@

+# Copyright 2020 The HuggingFace Datasets Authors and the TensorFlow Datasets Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Lint as: python3
+"""
+Hashing function for dataset keys using `hashlib.md5`
+Requirements for the hash function:
+- Provides a uniformly distributed hash from random space
+- Adequately fast speed
+- Working with multiple input types (in this case, `str`, `int` or `bytes`)
+- Should be platform independent (generates same hash on different OS and systems)
+The hashing function provides a unique 128-bit integer hash of the key provided.
+The split name is being used here as the hash salt to avoid having same hashes
+in different splits due to same keys
+"""
+from typing import Union
+from huggingface_hub.utils import insecure_hashlib
+def _as_bytes(hash_data: Union[str, int, bytes, bytearray]) -> bytes:
+    """
+    Returns the input hash_data in its bytes form
+    Args:
+    hash_data: the hash salt/key to be converted to bytes
+    """
+    if isinstance(hash_data, (bytes, bytearray)):
+        # Data already in bytes, returns as it as
+        return hash_data
+    elif isinstance(hash_data, str):
+        # We keep the data as it as for it ot be later encoded to UTF-8
+        # However replace `\\` with `/` for Windows compatibility
+        hash_data = hash_data.replace("\\", "/")
+    elif isinstance(hash_data, int):
+        hash_data = str(hash_data)
+    else:
+        # If data is not of the required type, raise error
+        raise InvalidKeyError(hash_data)
+    return hash_data.encode("utf-8")
+class InvalidKeyError(Exception):
+    """Raises an error when given key is of invalid datatype."""
+    def __init__(self, hash_data):
+        self.prefix = "\nFAILURE TO GENERATE DATASET: Invalid key type detected"
+        self.err_msg = f"\nFound Key {hash_data} of type {type(hash_data)}"
+        self.suffix = "\nKeys should be either str, int or bytes type"
+        super().__init__(f"{self.prefix}{self.err_msg}{self.suffix}")
+class DuplicatedKeysError(Exception):
+    """Raise an error when duplicate key found."""
+    def __init__(self, key, duplicate_key_indices, fix_msg=""):
+        self.key = key
+        self.duplicate_key_indices = duplicate_key_indices
+        self.fix_msg = fix_msg
+        self.prefix = "Found multiple examples generated with the same key"
+        if len(duplicate_key_indices) <= 20:
+            self.err_msg = f"\nThe examples at index {', '.join(duplicate_key_indices)} have the key {key}"
+        else:
+            self.err_msg = f"\nThe examples at index {', '.join(duplicate_key_indices[:20])}... ({len(duplicate_key_indices) - 20} more) have the key {key}"
+        self.suffix = "\n" + fix_msg if fix_msg else ""
+        super().__init__(f"{self.prefix}{self.err_msg}{self.suffix}")
+class KeyHasher:
+    """KeyHasher class for providing hash using md5"""
+    def __init__(self, hash_salt: str):
+        self._split_md5 = insecure_hashlib.md5(_as_bytes(hash_salt))
+    def hash(self, key: Union[str, int, bytes]) -> int:
+        """Returns 128-bits unique hash of input key
+        Args:
+        key: the input key to be hashed (should be str, int or bytes)
+        Returns: 128-bit int hash key"""
+        md5 = self._split_md5.copy()
+        byte_key = _as_bytes(key)
+        md5.update(byte_key)
+        # Convert to integer with hexadecimal conversion
+        return int(md5.hexdigest(), 16)

datasets/load.py ADDED Viewed

	@@ -0,0 +1,1481 @@

+# Copyright 2020 The HuggingFace Datasets Authors and the TensorFlow Datasets Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Lint as: python3
+"""Access datasets."""
+import glob
+import importlib
+import inspect
+import json
+import os
+import posixpath
+from collections import Counter
+from collections.abc import Mapping, Sequence
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any, Optional, Union
+import fsspec
+import httpx
+import requests
+import yaml
+from fsspec.core import url_to_fs
+from huggingface_hub import DatasetCard, DatasetCardData, HfApi
+from huggingface_hub.utils import (
+    EntryNotFoundError,
+    GatedRepoError,
+    LocalEntryNotFoundError,
+    OfflineModeIsEnabled,
+    RepositoryNotFoundError,
+    RevisionNotFoundError,
+    get_session,
+)
+from . import __version__, config
+from .arrow_dataset import Dataset
+from .builder import BuilderConfig, DatasetBuilder
+from .data_files import (
+    DataFilesDict,
+    DataFilesList,
+    DataFilesPatternsDict,
+    EmptyDatasetError,
+    get_data_patterns,
+    sanitize_patterns,
+)
+from .dataset_dict import DatasetDict, IterableDatasetDict
+from .download.download_config import DownloadConfig
+from .download.download_manager import DownloadMode
+from .download.streaming_download_manager import StreamingDownloadManager, xbasename, xglob, xjoin
+from .exceptions import DataFilesNotFoundError, DatasetNotFoundError
+from .features import Features
+from .features.features import _fix_for_backward_compatible_features
+from .fingerprint import Hasher
+from .info import DatasetInfo, DatasetInfosDict
+from .iterable_dataset import IterableDataset
+from .naming import camelcase_to_snakecase, snakecase_to_camelcase
+from .packaged_modules import (
+    _EXTENSION_TO_MODULE,
+    _MODULE_TO_EXTENSIONS,
+    _MODULE_TO_METADATA_FILE_NAMES,
+    _PACKAGED_DATASETS_MODULES,
+)
+from .packaged_modules.folder_based_builder.folder_based_builder import FolderBasedBuilder
+from .splits import Split
+from .utils import _dataset_viewer
+from .utils.file_utils import (
+    _raise_if_offline_mode_is_enabled,
+    cached_path,
+    get_datasets_user_agent,
+    is_relative_path,
+    relative_to_absolute_path,
+)
+from .utils.hub import hf_dataset_url
+from .utils.info_utils import VerificationMode, is_small_dataset
+from .utils.logging import get_logger
+from .utils.metadata import MetadataConfigs
+from .utils.typing import PathLike
+from .utils.version import Version
+# Module-level logger
+logger = get_logger(__name__)
+# Every extension that maps to a packaged module, plus ".zip" archives
+ALL_ALLOWED_EXTENSIONS = list(_EXTENSION_TO_MODULE.keys()) + [".zip"]
+class _InitializeConfiguredDatasetBuilder:
+    """
+    From https://stackoverflow.com/questions/4647566/pickle-a-dynamically-parameterized-sub-class
+    See also ConfiguredDatasetBuilder.__reduce__
+    When called with the param value as the only argument, returns an
+    un-initialized instance of the parameterized class. Subsequent __setstate__
+    will be called by pickle.
+    """
+    def __call__(self, builder_cls, metadata_configs, default_config_name, name):
+        # make a simple object which has no complex __init__ (this one will do)
+        obj = _InitializeConfiguredDatasetBuilder()
+        obj.__class__ = configure_builder_class(
+            builder_cls, metadata_configs, default_config_name=default_config_name, dataset_name=name
+        )
+        return obj
+def configure_builder_class(
+    builder_cls: type[DatasetBuilder],
+    builder_configs: list[BuilderConfig],
+    default_config_name: Optional[str],
+    dataset_name: str,
+) -> type[DatasetBuilder]:
+    """
+    Dynamically create a builder class with custom builder configs parsed from README.md file,
+    i.e. set BUILDER_CONFIGS class variable of a builder class to custom configs list.
+    """
+    class ConfiguredDatasetBuilder(builder_cls):
+        BUILDER_CONFIGS = builder_configs
+        DEFAULT_CONFIG_NAME = default_config_name
+        __module__ = builder_cls.__module__  # so that the actual packaged builder can be imported
+        def __reduce__(self):  # to make dynamically created class pickable, see _InitializeParameterizedDatasetBuilder
+            parent_builder_cls = self.__class__.__mro__[1]
+            return (
+                _InitializeConfiguredDatasetBuilder(),
+                (
+                    parent_builder_cls,
+                    self.BUILDER_CONFIGS,
+                    self.DEFAULT_CONFIG_NAME,
+                    self.dataset_name,
+                ),
+                self.__dict__.copy(),
+            )
+    ConfiguredDatasetBuilder.__name__ = (
+        f"{builder_cls.__name__.lower().capitalize()}{snakecase_to_camelcase(dataset_name)}"
+    )
+    ConfiguredDatasetBuilder.__qualname__ = (
+        f"{builder_cls.__name__.lower().capitalize()}{snakecase_to_camelcase(dataset_name)}"
+    )
+    return ConfiguredDatasetBuilder
+def import_main_class(module_path) -> Optional[type[DatasetBuilder]]:
+    """Import a module at module_path and return its main class: a DatasetBuilder"""
+    module = importlib.import_module(module_path)
+    # Find the main class in our imported module
+    module_main_cls = None
+    for name, obj in module.__dict__.items():
+        if inspect.isclass(obj) and issubclass(obj, DatasetBuilder):
+            if inspect.isabstract(obj):
+                continue
+            module_main_cls = obj
+            obj_module = inspect.getmodule(obj)
+            if obj_module is not None and module == obj_module:
+                break
+    return module_main_cls
+def get_dataset_builder_class(
+    dataset_module: "DatasetModule", dataset_name: Optional[str] = None
+) -> type[DatasetBuilder]:
+    builder_cls = import_main_class(dataset_module.module_path)
+    if dataset_module.builder_configs_parameters.builder_configs:
+        dataset_name = dataset_name or dataset_module.builder_kwargs.get("dataset_name")
+        if dataset_name is None:
+            raise ValueError("dataset_name should be specified but got None")
+        builder_cls = configure_builder_class(
+            builder_cls,
+            builder_configs=dataset_module.builder_configs_parameters.builder_configs,
+            default_config_name=dataset_module.builder_configs_parameters.default_config_name,
+            dataset_name=dataset_name,
+        )
+    return builder_cls
+def increase_load_count(name: str):
+    """Update the download count of a dataset."""
+    if not config.HF_HUB_OFFLINE and config.HF_UPDATE_DOWNLOAD_COUNTS:
+        try:
+            get_session().head(
+                "/".join((config.S3_DATASETS_BUCKET_PREFIX, name, name + ".py")),
+                headers={"User-Agent": get_datasets_user_agent()},
+                timeout=3,
+            )
+        except Exception:
+            pass
+def infer_module_for_data_files_list(
+    data_files_list: DataFilesList, download_config: Optional[DownloadConfig] = None
+) -> tuple[Optional[str], dict]:
+    """Infer module (and builder kwargs) from list of data files.
+    It picks the module based on the most common file extension.
+    In case of a draw ".parquet" is the favorite, and then alphabetical order.
+    Args:
+        data_files_list (DataFilesList): List of data files.
+        download_config (bool or str, optional): Mainly use `token` or `storage_options` to support different platforms and auth types.
+    Returns:
+        tuple[str, dict[str, Any]]: Tuple with
+            - inferred module name
+            - dict of builder kwargs
+    """
+    extensions_counter = Counter(
+        ("." + suffix.lower(), xbasename(filepath) in FolderBasedBuilder.METADATA_FILENAMES)
+        for filepath in data_files_list[: config.DATA_FILES_MAX_NUMBER_FOR_MODULE_INFERENCE]
+        for suffix in xbasename(filepath).split(".")[1:]
+    )
+    if extensions_counter:
+        def sort_key(ext_count: tuple[tuple[str, bool], int]) -> tuple[int, bool]:
+            """Sort by count and set ".parquet" as the favorite in case of a draw, and ignore metadata files"""
+            (ext, is_metadata), count = ext_count
+            return (not is_metadata, count, ext == ".parquet", ext == ".jsonl", ext == ".json", ext == ".csv", ext)
+        for (ext, _), _ in sorted(extensions_counter.items(), key=sort_key, reverse=True):
+            if ext in _EXTENSION_TO_MODULE:
+                return _EXTENSION_TO_MODULE[ext]
+            elif ext == ".zip":
+                return infer_module_for_data_files_list_in_archives(data_files_list, download_config=download_config)
+    return None, {}
+def infer_module_for_data_files_list_in_archives(
+    data_files_list: DataFilesList, download_config: Optional[DownloadConfig] = None
+) -> tuple[Optional[str], dict]:
+    """Infer module (and builder kwargs) from list of archive data files.
+    Args:
+        data_files_list (DataFilesList): List of data files.
+        download_config (bool or str, optional): Mainly use `token` or `storage_options` to support different platforms and auth types.
+    Returns:
+        tuple[str, dict[str, Any]]: Tuple with
+            - inferred module name
+            - dict of builder kwargs
+    """
+    archived_files = []
+    archive_files_counter = 0
+    for filepath in data_files_list:
+        if str(filepath).endswith(".zip"):
+            archive_files_counter += 1
+            if archive_files_counter > config.GLOBBED_DATA_FILES_MAX_NUMBER_FOR_MODULE_INFERENCE:
+                break
+            extracted = xjoin(StreamingDownloadManager().extract(filepath), "**")
+            archived_files += [
+                f.split("::")[0]
+                for f in xglob(extracted, recursive=True, download_config=download_config)[
+                    : config.ARCHIVED_DATA_FILES_MAX_NUMBER_FOR_MODULE_INFERENCE
+                ]
+            ]
+    extensions_counter = Counter(
+        "." + suffix.lower() for filepath in archived_files for suffix in xbasename(filepath).split(".")[1:]
+    )
+    if extensions_counter:
+        most_common = extensions_counter.most_common(1)[0][0]
+        if most_common in _EXTENSION_TO_MODULE:
+            return _EXTENSION_TO_MODULE[most_common]
+    return None, {}
+def infer_module_for_data_files(
+    data_files: DataFilesDict, path: Optional[str] = None, download_config: Optional[DownloadConfig] = None
+) -> tuple[Optional[str], dict[str, Any]]:
+    """Infer module (and builder kwargs) from data files. Raise if module names for different splits don't match.
+    Args:
+        data_files ([`DataFilesDict`]): Dict of list of data files.
+        path (str, *optional*): Dataset name or path.
+        download_config ([`DownloadConfig`], *optional*):
+            Specific download configuration parameters to authenticate on the Hugging Face Hub for private remote files.
+    Returns:
+        tuple[str, dict[str, Any]]: Tuple with
+            - inferred module name
+            - builder kwargs
+    """
+    split_modules = {
+        split: infer_module_for_data_files_list(data_files_list, download_config=download_config)
+        for split, data_files_list in data_files.items()
+    }
+    module_name, default_builder_kwargs = next(iter(split_modules.values()))
+    if any((module_name, default_builder_kwargs) != split_module for split_module in split_modules.values()):
+        raise ValueError(f"Couldn't infer the same data file format for all splits. Got {split_modules}")
+    if not module_name:
+        raise DataFilesNotFoundError("No (supported) data files found" + (f" in {path}" if path else ""))
+    return module_name, default_builder_kwargs
+def create_builder_configs_from_metadata_configs(
+    module_path: str,
+    metadata_configs: MetadataConfigs,
+    base_path: Optional[str] = None,
+    default_builder_kwargs: dict[str, Any] = None,
+    download_config: Optional[DownloadConfig] = None,
+) -> tuple[list[BuilderConfig], str]:
+    builder_cls = import_main_class(module_path)
+    builder_config_cls = builder_cls.BUILDER_CONFIG_CLASS
+    default_config_name = metadata_configs.get_default_config_name()
+    builder_configs = []
+    default_builder_kwargs = {} if default_builder_kwargs is None else default_builder_kwargs
+    base_path = base_path if base_path is not None else ""
+    for config_name, config_params in metadata_configs.items():
+        config_data_files = config_params.get("data_files")
+        config_data_dir = config_params.get("data_dir")
+        config_base_path = xjoin(base_path, config_data_dir) if config_data_dir else base_path
+        try:
+            config_patterns = (
+                sanitize_patterns(config_data_files)
+                if config_data_files is not None
+                else get_data_patterns(config_base_path, download_config=download_config)
+            )
+            config_data_files_dict = DataFilesPatternsDict.from_patterns(
+                config_patterns,
+                allowed_extensions=ALL_ALLOWED_EXTENSIONS,
+            )
+        except EmptyDatasetError as e:
+            raise EmptyDatasetError(
+                f"Dataset at '{base_path}' doesn't contain data files matching the patterns for config '{config_name}',"
+                f" check `data_files` and `data_fir` parameters in the `configs` YAML field in README.md. "
+            ) from e
+        ignored_params = [
+            param for param in config_params if not hasattr(builder_config_cls, param) and param != "default"
+        ]
+        if ignored_params:
+            logger.warning(
+                f"Some datasets params were ignored: {ignored_params}. "
+                "Make sure to use only valid params for the dataset builder and to have "
+                "a up-to-date version of the `datasets` library."
+            )
+        builder_configs.append(
+            builder_config_cls(
+                name=config_name,
+                data_files=config_data_files_dict,
+                data_dir=config_data_dir,
+                **{
+                    param: value
+                    for param, value in {**default_builder_kwargs, **config_params}.items()
+                    if hasattr(builder_config_cls, param) and param not in ("default", "data_files", "data_dir")
+                },
+            )
+        )
+    return builder_configs, default_config_name
+@dataclass
+class BuilderConfigsParameters:
+    """Dataclass containing objects related to creation of builder configurations from yaml's metadata content.
+    All attributes default to None.
+    Attributes:
+        metadata_configs (`MetadataConfigs`, *optional*):
+            Configs parsed from yaml's metadata.
+        builder_configs (`list[BuilderConfig]`, *optional*):
+            List of BuilderConfig objects created from metadata_configs above.
+        default_config_name (`str`, *optional*):
+            Name of default config taken from yaml's metadata.
+    """
+    metadata_configs: Optional[MetadataConfigs] = None
+    builder_configs: Optional[list[BuilderConfig]] = None
+    default_config_name: Optional[str] = None
+# Description of a resolved dataset module, as returned by the `get_module` method of the factories
+@dataclass
+class DatasetModule:
+    # Import path of the module that contains the dataset builder class
+    module_path: str
+    # Hash associated with the module
+    hash: str
+    # Keyword arguments attached to the module (e.g. "dataset_name", "base_path", "data_files")
+    builder_kwargs: dict
+    # Builder configs and the metadata configs they were created from; empty by default
+    builder_configs_parameters: BuilderConfigsParameters = field(default_factory=BuilderConfigsParameters)
+    # Dataset infos by config name, if any
+    dataset_infos: Optional[DatasetInfosDict] = None
+class _DatasetModuleFactory:
+    """Base class of the dataset module factories. Subclasses must override `get_module`."""
+    def get_module(self) -> DatasetModule:
+        """Return the resolved `DatasetModule`. Not implemented in the base class."""
+        raise NotImplementedError
+class LocalDatasetModuleFactory(_DatasetModuleFactory):
+    """Get the module of a dataset loaded from the user's data files. The dataset builder module to use is inferred
+    from the data files extensions.
+    Raises:
+        ValueError: at construction, if `data_dir` is an absolute path.
+    """
+    def __init__(
+        self,
+        path: str,
+        data_dir: Optional[str] = None,
+        data_files: Optional[Union[str, list, dict]] = None,
+        download_mode: Optional[Union[DownloadMode, str]] = None,
+    ):
+        # `data_dir` is joined to the dataset root in `get_module`, so it has to be relative
+        if data_dir and os.path.isabs(data_dir):
+            raise ValueError(f"`data_dir` must be relative to a dataset directory's root: {path}")
+        self.path = Path(path).as_posix()
+        self.name = Path(path).stem
+        self.data_files = data_files
+        self.data_dir = data_dir
+        # Stored as given; `get_module` below doesn't read it
+        self.download_mode = download_mode
+    def get_module(self) -> DatasetModule:
+        """Resolve the packaged builder module, builder configs and dataset infos of the local dataset directory."""
+        readme_path = os.path.join(self.path, config.REPOCARD_FILENAME)
+        standalone_yaml_path = os.path.join(self.path, config.REPOYAML_FILENAME)
+        # Fall back to empty card data if the directory has no dataset card file
+        dataset_card_data = DatasetCard.load(readme_path).data if os.path.isfile(readme_path) else DatasetCardData()
+        if os.path.exists(standalone_yaml_path):
+            with open(standalone_yaml_path, encoding="utf-8") as f:
+                standalone_yaml_data = yaml.safe_load(f.read())
+                if standalone_yaml_data:
+                    # Keys of the standalone yaml file override those of the dataset card
+                    _dataset_card_data_dict = dataset_card_data.to_dict()
+                    _dataset_card_data_dict.update(standalone_yaml_data)
+                    dataset_card_data = DatasetCardData(**_dataset_card_data_dict)
+        metadata_configs = MetadataConfigs.from_dataset_card_data(dataset_card_data)
+        dataset_infos = DatasetInfosDict.from_dataset_card_data(dataset_card_data)
+        # we need a set of data files to find which dataset builder to use
+        # because we need to infer module name by files extensions
+        base_path = Path(self.path, self.data_dir or "").expanduser().resolve().as_posix()
+        # Patterns priority: explicit `data_files`, then the `data_files` of the first metadata config
+        # (only when no `data_dir` is given), then patterns discovered under `base_path`
+        if self.data_files is not None:
+            patterns = sanitize_patterns(self.data_files)
+        elif metadata_configs and not self.data_dir and "data_files" in next(iter(metadata_configs.values())):
+            patterns = sanitize_patterns(next(iter(metadata_configs.values()))["data_files"])
+        else:
+            patterns = get_data_patterns(base_path)
+        data_files = DataFilesDict.from_patterns(
+            patterns,
+            base_path=base_path,
+            allowed_extensions=ALL_ALLOWED_EXTENSIONS,
+        )
+        module_name, default_builder_kwargs = infer_module_for_data_files(
+            data_files=data_files,
+            path=self.path,
+        )
+        # Keep only the files with the extensions or metadata file names of the inferred module
+        data_files = data_files.filter(
+            extensions=_MODULE_TO_EXTENSIONS[module_name], file_names=_MODULE_TO_METADATA_FILE_NAMES[module_name]
+        )
+        module_path, _ = _PACKAGED_DATASETS_MODULES[module_name]
+        if metadata_configs:
+            builder_configs, default_config_name = create_builder_configs_from_metadata_configs(
+                module_path,
+                metadata_configs,
+                base_path=base_path,
+                default_builder_kwargs=default_builder_kwargs,
+            )
+        else:
+            # No metadata configs: a single builder config made from the resolved data files
+            builder_configs: list[BuilderConfig] = [
+                import_main_class(module_path).BUILDER_CONFIG_CLASS(
+                    data_files=data_files,
+                    **default_builder_kwargs,
+                )
+            ]
+            default_config_name = None
+        builder_kwargs = {
+            "base_path": self.path,
+            "dataset_name": camelcase_to_snakecase(Path(self.path).name),
+        }
+        # The resolved data files are only passed to the builder kwargs when a `data_dir` was given
+        if self.data_dir:
+            builder_kwargs["data_files"] = data_files
+        # this file is deprecated and was created automatically in old versions of push_to_hub
+        if os.path.isfile(os.path.join(self.path, config.DATASETDICT_INFOS_FILENAME)):
+            with open(os.path.join(self.path, config.DATASETDICT_INFOS_FILENAME), encoding="utf-8") as f:
+                legacy_dataset_infos = DatasetInfosDict(
+                    {
+                        config_name: DatasetInfo.from_dict(dataset_info_dict)
+                        for config_name, dataset_info_dict in json.load(f).items()
+                    }
+                )
+                if len(legacy_dataset_infos) == 1:
+                    # old config e.g. named "username--dataset_name"
+                    legacy_config_name = next(iter(legacy_dataset_infos))
+                    legacy_dataset_infos["default"] = legacy_dataset_infos.pop(legacy_config_name)
+            # Infos from the dataset card take precedence over the legacy ones
+            legacy_dataset_infos.update(dataset_infos)
+            dataset_infos = legacy_dataset_infos
+        # With exactly one config in the infos, that config becomes the default
+        if default_config_name is None and len(dataset_infos) == 1:
+            default_config_name = next(iter(dataset_infos))
+        hash = Hasher.hash({"dataset_infos": dataset_infos, "builder_configs": builder_configs})
+        return DatasetModule(
+            module_path,
+            hash,
+            builder_kwargs,
+            dataset_infos=dataset_infos,
+            builder_configs_parameters=BuilderConfigsParameters(
+                metadata_configs=metadata_configs,
+                builder_configs=builder_configs,
+                default_config_name=default_config_name,
+            ),
+        )
+class PackagedDatasetModuleFactory(_DatasetModuleFactory):
+    """Get the dataset builder module from the ones that are packaged with the library: csv, json, etc."""
+    def __init__(
+        self,
+        name: str,
+        data_dir: Optional[str] = None,
+        data_files: Optional[Union[str, list, dict]] = None,
+        download_config: Optional[DownloadConfig] = None,
+        download_mode: Optional[Union[DownloadMode, str]] = None,
+    ):
+        self.name = name
+        self.data_files = data_files
+        self.data_dir = data_dir
+        self.download_config = download_config
+        self.download_mode = download_mode
+        increase_load_count(name)
+    def get_module(self) -> DatasetModule:
+        base_path = Path(self.data_dir or "").expanduser().resolve().as_posix()
+        patterns = (
+            sanitize_patterns(self.data_files)
+            if self.data_files is not None
+            else get_data_patterns(base_path, download_config=self.download_config)
+        )
+        data_files = DataFilesDict.from_patterns(
+            patterns,
+            download_config=self.download_config,
+            base_path=base_path,
+        )
+        module_path, hash = _PACKAGED_DATASETS_MODULES[self.name]
+        builder_kwargs = {
+            "data_files": data_files,
+            "dataset_name": self.name,
+        }
+        return DatasetModule(module_path, hash, builder_kwargs)
+class HubDatasetModuleFactory(_DatasetModuleFactory):
+    """
+    Get the module of a dataset loaded from data files of a dataset repository.
+    The dataset builder module to use is inferred from the data files extensions.
+    """
+    def __init__(
+        self,
+        name: str,
+        commit_hash: str,
+        data_dir: Optional[str] = None,
+        data_files: Optional[Union[str, list, dict]] = None,
+        download_config: Optional[DownloadConfig] = None,
+        download_mode: Optional[Union[DownloadMode, str]] = None,
+        use_exported_dataset_infos: bool = False,
+    ):
+        self.name = name
+        self.commit_hash = commit_hash
+        self.data_files = data_files
+        self.data_dir = data_dir
+        # Always hold a DownloadConfig so that `get_module` can read its attributes
+        self.download_config = download_config or DownloadConfig()
+        # Stored as given; `get_module` below doesn't read it
+        self.download_mode = download_mode
+        self.use_exported_dataset_infos = use_exported_dataset_infos
+        increase_load_count(name)
+    def get_module(self) -> DatasetModule:
+        """Resolve the packaged builder module, builder configs and dataset infos of the repository at `commit_hash`."""
+        # Get the Dataset Card and fix the revision in case there are new commits in the meantime
+        api = HfApi(
+            endpoint=config.HF_ENDPOINT,
+            token=self.download_config.token,
+            library_name="datasets",
+            library_version=__version__,
+            user_agent=get_datasets_user_agent(self.download_config.user_agent),
+        )
+        try:
+            dataset_readme_path = api.hf_hub_download(
+                repo_id=self.name,
+                filename=config.REPOCARD_FILENAME,
+                repo_type="dataset",
+                revision=self.commit_hash,
+                proxies=self.download_config.proxies,
+            )
+            dataset_card_data = DatasetCard.load(dataset_readme_path).data
+        except EntryNotFoundError:
+            # No dataset card in the repository: continue with empty card data
+            dataset_card_data = DatasetCardData()
+        # Work on a copy so that the description set here doesn't leak into self.download_config
+        download_config = self.download_config.copy()
+        if download_config.download_desc is None:
+            download_config.download_desc = "Downloading standalone yaml"
+        try:
+            standalone_yaml_path = cached_path(
+                hf_dataset_url(self.name, config.REPOYAML_FILENAME, revision=self.commit_hash),
+                download_config=download_config,
+            )
+            with open(standalone_yaml_path, encoding="utf-8") as f:
+                standalone_yaml_data = yaml.safe_load(f.read())
+                if standalone_yaml_data:
+                    # Keys of the standalone yaml file override those of the dataset card
+                    _dataset_card_data_dict = dataset_card_data.to_dict()
+                    _dataset_card_data_dict.update(standalone_yaml_data)
+                    dataset_card_data = DatasetCardData(**_dataset_card_data_dict)
+        except FileNotFoundError:
+            # The standalone yaml file is optional
+            pass
+        base_path = f"hf://datasets/{self.name}@{self.commit_hash}/{self.data_dir or ''}".rstrip("/")
+        metadata_configs = MetadataConfigs.from_dataset_card_data(dataset_card_data)
+        dataset_infos = DatasetInfosDict.from_dataset_card_data(dataset_card_data)
+        # Exported infos are only requested when both the config flag and the factory option allow it
+        if config.USE_PARQUET_EXPORT and self.use_exported_dataset_infos:
+            try:
+                exported_dataset_infos = _dataset_viewer.get_exported_dataset_infos(
+                    dataset=self.name, commit_hash=self.commit_hash, token=self.download_config.token
+                )
+                exported_dataset_infos = DatasetInfosDict(
+                    {
+                        config_name: DatasetInfo.from_dict(exported_dataset_infos[config_name])
+                        for config_name in exported_dataset_infos
+                    }
+                )
+            except _dataset_viewer.DatasetViewerError:
+                exported_dataset_infos = None
+        else:
+            exported_dataset_infos = None
+        if exported_dataset_infos:
+            # Infos from the dataset card take precedence over the exported ones
+            exported_dataset_infos.update(dataset_infos)
+            dataset_infos = exported_dataset_infos
+        # we need a set of data files to find which dataset builder to use
+        # because we need to infer module name by files extensions
+        # Patterns priority: explicit `data_files`, then the `data_files` of the first metadata config
+        # (only when no `data_dir` is given), then patterns discovered under `base_path`
+        if self.data_files is not None:
+            patterns = sanitize_patterns(self.data_files)
+        elif metadata_configs and not self.data_dir and "data_files" in next(iter(metadata_configs.values())):
+            patterns = sanitize_patterns(next(iter(metadata_configs.values()))["data_files"])
+        else:
+            patterns = get_data_patterns(base_path, download_config=self.download_config)
+        data_files = DataFilesDict.from_patterns(
+            patterns,
+            base_path=base_path,
+            allowed_extensions=ALL_ALLOWED_EXTENSIONS,
+            download_config=self.download_config,
+        )
+        module_name, default_builder_kwargs = infer_module_for_data_files(
+            data_files=data_files,
+            path=self.name,
+            download_config=self.download_config,
+        )
+        # Keep only the files with the extensions or metadata file names of the inferred module
+        data_files = data_files.filter(
+            extensions=_MODULE_TO_EXTENSIONS[module_name], file_names=_MODULE_TO_METADATA_FILE_NAMES[module_name]
+        )
+        module_path, _ = _PACKAGED_DATASETS_MODULES[module_name]
+        if metadata_configs:
+            builder_configs, default_config_name = create_builder_configs_from_metadata_configs(
+                module_path,
+                metadata_configs,
+                base_path=base_path,
+                default_builder_kwargs=default_builder_kwargs,
+                download_config=self.download_config,
+            )
+        else:
+            # No metadata configs: a single builder config made from the resolved data files
+            builder_configs: list[BuilderConfig] = [
+                import_main_class(module_path).BUILDER_CONFIG_CLASS(
+                    data_files=data_files,
+                    **default_builder_kwargs,
+                )
+            ]
+            default_config_name = None
+        builder_kwargs = {
+            "base_path": hf_dataset_url(self.name, "", revision=self.commit_hash).rstrip("/"),
+            "repo_id": self.name,
+            "dataset_name": camelcase_to_snakecase(Path(self.name).name),
+        }
+        # The resolved data files are only passed to the builder kwargs when a `data_dir` was given
+        if self.data_dir:
+            builder_kwargs["data_files"] = data_files
+        # Fresh copy again, this time with a description for the metadata download
+        download_config = self.download_config.copy()
+        if download_config.download_desc is None:
+            download_config.download_desc = "Downloading metadata"
+        try:
+            # this file is deprecated and was created automatically in old versions of push_to_hub
+            dataset_infos_path = cached_path(
+                hf_dataset_url(self.name, config.DATASETDICT_INFOS_FILENAME, revision=self.commit_hash),
+                download_config=download_config,
+            )
+            with open(dataset_infos_path, encoding="utf-8") as f:
+                legacy_dataset_infos = DatasetInfosDict(
+                    {
+                        config_name: DatasetInfo.from_dict(dataset_info_dict)
+                        for config_name, dataset_info_dict in json.load(f).items()
+                    }
+                )
+                if len(legacy_dataset_infos) == 1:
+                    # old config e.g. named "username--dataset_name"
+                    legacy_config_name = next(iter(legacy_dataset_infos))
+                    legacy_dataset_infos["default"] = legacy_dataset_infos.pop(legacy_config_name)
+            # Infos gathered so far take precedence over the legacy ones
+            legacy_dataset_infos.update(dataset_infos)
+            dataset_infos = legacy_dataset_infos
+        except FileNotFoundError:
+            # The legacy infos file is optional
+            pass
+        # With exactly one config in the infos, that config becomes the default
+        if default_config_name is None and len(dataset_infos) == 1:
+            default_config_name = next(iter(dataset_infos))
+        # The commit hash is used as the module hash
+        return DatasetModule(
+            module_path,
+            self.commit_hash,
+            builder_kwargs,
+            dataset_infos=dataset_infos,
+            builder_configs_parameters=BuilderConfigsParameters(
+                metadata_configs=metadata_configs,
+                builder_configs=builder_configs,
+                default_config_name=default_config_name,
+            ),
+        )
+class HubDatasetModuleFactoryWithParquetExport(_DatasetModuleFactory):
+    """
+    Get the module of a dataset loaded from parquet files of a dataset repository parquet export.
+    """
+    def __init__(
+        self,
+        name: str,
+        commit_hash: str,
+        download_config: Optional[DownloadConfig] = None,
+    ):
+        self.name = name
+        self.commit_hash = commit_hash
+        self.download_config = download_config or DownloadConfig()
+        increase_load_count(name)
+    def get_module(self) -> DatasetModule:
+        exported_parquet_files = _dataset_viewer.get_exported_parquet_files(
+            dataset=self.name, commit_hash=self.commit_hash, token=self.download_config.token
+        )
+        exported_dataset_infos = _dataset_viewer.get_exported_dataset_infos(
+            dataset=self.name, commit_hash=self.commit_hash, token=self.download_config.token
+        )
+        dataset_infos = DatasetInfosDict(
+            {
+                config_name: DatasetInfo.from_dict(exported_dataset_infos[config_name])
+                for config_name in exported_dataset_infos
+            }
+        )
+        parquet_commit_hash = (
+            HfApi(
+                endpoint=config.HF_ENDPOINT,
+                token=self.download_config.token,
+                library_name="datasets",
+                library_version=__version__,
+                user_agent=get_datasets_user_agent(self.download_config.user_agent),
+            )
+            .dataset_info(
+                self.name,
+                revision="refs/convert/parquet",
+                token=self.download_config.token,
+                timeout=100.0,
+            )
+            .sha
+        )  # fix the revision in case there are new commits in the meantime
+        metadata_configs = MetadataConfigs._from_exported_parquet_files_and_dataset_infos(
+            parquet_commit_hash=parquet_commit_hash,
+            exported_parquet_files=exported_parquet_files,
+            dataset_infos=dataset_infos,
+        )
+        module_path, _ = _PACKAGED_DATASETS_MODULES["parquet"]
+        builder_configs, default_config_name = create_builder_configs_from_metadata_configs(
+            module_path,
+            metadata_configs,
+            download_config=self.download_config,
+        )
+        builder_kwargs = {
+            "repo_id": self.name,
+            "dataset_name": camelcase_to_snakecase(Path(self.name).name),
+        }
+        return DatasetModule(
+            module_path,
+            self.commit_hash,
+            builder_kwargs,
+            dataset_infos=dataset_infos,
+            builder_configs_parameters=BuilderConfigsParameters(
+                metadata_configs=metadata_configs,
+                builder_configs=builder_configs,
+                default_config_name=default_config_name,
+            ),
+        )
+class CachedDatasetModuleFactory(_DatasetModuleFactory):
+    """
+    Get the module of a dataset that has been loaded once already and cached.
+    """
+    def __init__(
+        self,
+        name: str,
+        cache_dir: Optional[str] = None,
+    ):
+        self.name = name
+        self.cache_dir = cache_dir
+        assert self.name.count("/") <= 1
+    def get_module(self) -> DatasetModule:
+        cache_dir = os.path.expanduser(str(self.cache_dir or config.HF_DATASETS_CACHE))
+        namespace_and_dataset_name = self.name.split("/")
+        namespace_and_dataset_name[-1] = camelcase_to_snakecase(namespace_and_dataset_name[-1])
+        cached_relative_path = "___".join(namespace_and_dataset_name)
+        cached_datasets_directory_path_root = os.path.join(cache_dir, cached_relative_path)
+        cached_directory_paths = [
+            cached_directory_path
+            for cached_directory_path in glob.glob(os.path.join(cached_datasets_directory_path_root, "*", "*", "*"))
+            if os.path.isdir(cached_directory_path)
+        ]
+        if cached_directory_paths:
+            builder_kwargs = {
+                "repo_id": self.name,
+                "dataset_name": self.name.split("/")[-1],
+            }
+            warning_msg = f"Using the latest cached version of the dataset since {self.name} couldn't be found on the Hugging Face Hub"
+            if config.HF_HUB_OFFLINE:
+                warning_msg += " (offline mode is enabled)."
+            logger.warning(warning_msg)
+            return DatasetModule(
+                "datasets.packaged_modules.cache.cache",
+                "auto",
+                {**builder_kwargs, "version": "auto"},
+            )
+        raise FileNotFoundError(f"Dataset {self.name} is not cached in {self.cache_dir}")
+def dataset_module_factory(
+    path: str,
+    revision: Optional[Union[str, Version]] = None,
+    download_config: Optional[DownloadConfig] = None,
+    download_mode: Optional[Union[DownloadMode, str]] = None,
+    data_dir: Optional[str] = None,
+    data_files: Optional[Union[dict, list, str, DataFilesDict]] = None,
+    cache_dir: Optional[str] = None,
+    **download_kwargs,
+) -> DatasetModule:
+    """
+    Resolve ``path`` to a :class:`DatasetModule`, i.e. the generic dataset builder to use and its arguments.
+    ``path`` is tried, in order, as: the name of a packaged builder, a local directory, a dataset repository
+    on the Hugging Face Hub. If the Hub lookup fails for any reason, a previously cached version of the dataset
+    is used when one exists.
+    Dataset loading scripts are no longer supported: a ``RuntimeError`` is raised when one is found.
+    Args:
+        path (str): Path or name of the dataset.
+            Depending on ``path``, the dataset builder that is used comes from one of the generic dataset builders (JSON, CSV, Parquet, text etc.).
+            For local datasets:
+            - if ``path`` is a local directory (containing data files only)
+              -> load a generic dataset builder (csv, json, text etc.) based on the content of the directory
+              e.g. ``'./path/to/directory/with/my/csv/data'``.
+            For datasets on the Hugging Face Hub (list all available datasets with ``huggingface_hub.list_datasets()``)
+            - if ``path`` is a dataset repository on the HF hub (containing data files only)
+              -> load a generic dataset builder (csv, text etc.) based on the content of the repository
+              e.g. ``'username/dataset_name'``, a dataset repository on the HF hub containing your data files.
+        revision (:class:`~utils.Version` or :obj:`str`, optional): Version of the dataset to load.
+            As datasets have their own git repository on the Datasets Hub, the default version "main" corresponds to their "main" branch.
+            You can specify a different version than the default "main" by using a commit SHA or a git tag of the dataset repository.
+        download_config (:class:`DownloadConfig`, optional): Specific download configuration parameters.
+            Note that the object passed in is modified in place (``extract_compressed_file``, ``force_extract``
+            and ``force_download`` are overwritten).
+        download_mode (:class:`DownloadMode` or :obj:`str`, default ``REUSE_DATASET_IF_EXISTS``): Download/generate mode.
+        data_dir (:obj:`str`, optional): Directory with the data files. Used only if `data_files` is not specified,
+            in which case it's equal to pass `os.path.join(data_dir, "**")` as `data_files`.
+        data_files (:obj:`Union[Dict, List, str]`, optional): Defining the data_files of the dataset configuration.
+        cache_dir (`str`, *optional*):
+            Directory to read/write data. Defaults to `"~/.cache/huggingface/datasets"`.
+            In this function it is only used to look for a cached dataset when the Hub lookup fails.
+            <Added version="2.16.0"/>
+        **download_kwargs (additional keyword arguments): optional attributes used to build a ``DownloadConfig()``
+            when ``download_config`` is not supplied. They are ignored when ``download_config`` is given.
+    Returns:
+        DatasetModule
+    Raises:
+        RuntimeError: if a dataset loading script is found for ``path``.
+        FileNotFoundError: if ``path`` is neither a packaged builder, a local directory nor a candidate Hub repository,
+            or if the Hub lookup failed with a ``FileNotFoundError`` and the dataset is not cached.
+        ConnectionError: if the Hub cannot be reached (or offline mode is enabled) and the dataset is not cached.
+        DatasetNotFoundError: if the repository or revision doesn't exist or is gated, and the dataset is not cached.
+    """
+    if download_config is None:
+        download_config = DownloadConfig(**download_kwargs)
+    download_mode = DownloadMode(download_mode or DownloadMode.REUSE_DATASET_IF_EXISTS)
+    download_config.extract_compressed_file = True
+    download_config.force_extract = True
+    download_config.force_download = download_mode == DownloadMode.FORCE_REDOWNLOAD
+    # Name a loading script would have: the last non-empty component of `path`, with a ".py" suffix.
+    filename = list(filter(lambda x: x, path.replace(os.sep, "/").split("/")))[-1]
+    if not filename.endswith(".py"):
+        filename = filename + ".py"
+    combined_path = os.path.join(path, filename)
+    # We have several ways to get a dataset builder:
+    #
+    # - if path is the name of a packaged dataset module
+    #   -> use the packaged module (json, csv, etc.)
+    #
+    # - if path is a local python file, or a local directory containing a python file named after it
+    #   -> raise: dataset scripts are no longer supported
+    # - if path is a local directory (but no python file)
+    #   -> use a packaged module (csv, text etc.) based on content of the directory
+    #
+    # - if path has at most one "/" and is dataset repository on the HF hub with a python file
+    #   -> raise: dataset scripts are no longer supported
+    # - if path has at most one "/" and is dataset repository on the HF hub without a python file
+    #   -> use a packaged module (csv, text etc.) based on content of the repository
+    # Try packaged
+    if path in _PACKAGED_DATASETS_MODULES:
+        return PackagedDatasetModuleFactory(
+            path,
+            data_dir=data_dir,
+            data_files=data_files,
+            download_config=download_config,
+            download_mode=download_mode,
+        ).get_module()
+    # Try locally
+    # `filename` always ends with ".py", so this branch means `path` itself points to a python file
+    elif path.endswith(filename):
+        raise RuntimeError(f"Dataset scripts are no longer supported, but found {filename}")
+    elif os.path.isfile(combined_path):
+        raise RuntimeError(f"Dataset scripts are no longer supported, but found {filename}")
+    elif os.path.isdir(path):
+        return LocalDatasetModuleFactory(
+            path, data_dir=data_dir, data_files=data_files, download_mode=download_mode
+        ).get_module()
+    # Try remotely
+    elif is_relative_path(path) and path.count("/") <= 1:
+        try:
+            # Get the Dataset Card + get the revision + check authentication all in one call
+            # We fix the commit_hash in case there are new commits in the meantime
+            api = HfApi(
+                endpoint=config.HF_ENDPOINT,
+                token=download_config.token,
+                library_name="datasets",
+                library_version=__version__,
+                user_agent=get_datasets_user_agent(download_config.user_agent),
+            )
+            try:
+                _raise_if_offline_mode_is_enabled()
+                dataset_readme_path = api.hf_hub_download(
+                    repo_id=path,
+                    filename=config.REPOCARD_FILENAME,
+                    repo_type="dataset",
+                    revision=revision,
+                    proxies=download_config.proxies,
+                )
+                # The commit hash is taken from the name of the directory that contains the downloaded dataset card
+                commit_hash = os.path.basename(os.path.dirname(dataset_readme_path))
+            except LocalEntryNotFoundError as e:
+                # Connectivity problems are reported as ConnectionError, anything else is re-raised unchanged
+                if isinstance(
+                    e.__cause__,
+                    (
+                        OfflineModeIsEnabled,
+                        requests.exceptions.Timeout,
+                        requests.exceptions.ConnectionError,
+                        httpx.ConnectError,
+                        httpx.TimeoutException,
+                    ),
+                ):
+                    raise ConnectionError(f"Couldn't reach '{path}' on the Hub ({e.__class__.__name__})") from e
+                else:
+                    raise
+            except EntryNotFoundError:
+                # The repository has no dataset card: get the commit hash from the repository info instead
+                commit_hash = api.dataset_info(
+                    path,
+                    revision=revision,
+                    timeout=100.0,
+                ).sha
+            except (
+                OfflineModeIsEnabled,
+                requests.exceptions.Timeout,
+                requests.exceptions.ConnectionError,
+                httpx.ConnectError,
+                httpx.TimeoutException,
+            ) as e:
+                raise ConnectionError(f"Couldn't reach '{path}' on the Hub ({e.__class__.__name__})") from e
+            except GatedRepoError as e:
+                message = f"Dataset '{path}' is a gated dataset on the Hub."
+                if e.response.status_code == 401:
+                    message += " You must be authenticated to access it."
+                elif e.response.status_code == 403:
+                    message += f" Visit the dataset page at https://huggingface.co/datasets/{path} to ask for access."
+                raise DatasetNotFoundError(message) from e
+            except RevisionNotFoundError as e:
+                raise DatasetNotFoundError(
+                    f"Revision '{revision}' doesn't exist for dataset '{path}' on the Hub."
+                ) from e
+            except RepositoryNotFoundError as e:
+                raise DatasetNotFoundError(f"Dataset '{path}' doesn't exist on the Hub or cannot be accessed.") from e
+            try:
+                # Check whether the repository contains a loading script: a successful download means it does.
+                # The expected outcome for a supported dataset is EntryNotFoundError.
+                api.hf_hub_download(
+                    repo_id=path,
+                    filename=filename,
+                    repo_type="dataset",
+                    revision=commit_hash,
+                    proxies=download_config.proxies,
+                )
+                raise RuntimeError(f"Dataset scripts are no longer supported, but found {filename}")
+            except EntryNotFoundError:
+                # Use the infos from the parquet export except in some cases:
+                if data_dir or data_files or (revision and revision != "main"):
+                    use_exported_dataset_infos = False
+                else:
+                    use_exported_dataset_infos = True
+                return HubDatasetModuleFactory(
+                    path,
+                    commit_hash=commit_hash,
+                    data_dir=data_dir,
+                    data_files=data_files,
+                    download_config=download_config,
+                    download_mode=download_mode,
+                    use_exported_dataset_infos=use_exported_dataset_infos,
+                ).get_module()
+            except GatedRepoError as e:
+                message = f"Dataset '{path}' is a gated dataset on the Hub."
+                if e.response.status_code == 401:
+                    message += " You must be authenticated to access it."
+                elif e.response.status_code == 403:
+                    message += f" Visit the dataset page at https://huggingface.co/datasets/{path} to ask for access."
+                raise DatasetNotFoundError(message) from e
+            except RevisionNotFoundError as e:
+                raise DatasetNotFoundError(
+                    f"Revision '{revision}' doesn't exist for dataset '{path}' on the Hub."
+                ) from e
+        except Exception as e1:
+            # All the attempts failed, before raising the error we should check if the module is already cached
+            # Note that this also covers the RuntimeError raised above when a loading script is found on the Hub
+            try:
+                return CachedDatasetModuleFactory(path, cache_dir=cache_dir).get_module()
+            except Exception:
+                # If it's not in the cache, then it doesn't exist.
+                if isinstance(e1, OfflineModeIsEnabled):
+                    raise ConnectionError(f"Couldn't reach the Hugging Face Hub for dataset '{path}': {e1}") from None
+                if isinstance(e1, (DataFilesNotFoundError, DatasetNotFoundError, EmptyDatasetError)):
+                    raise e1 from None
+                if isinstance(e1, FileNotFoundError):
+                    raise FileNotFoundError(
+                        f"Couldn't find any data file at {relative_to_absolute_path(path)}. "
+                        f"Couldn't find '{path}' on the Hugging Face Hub either: {type(e1).__name__}: {e1}"
+                    ) from None
+                raise e1 from None
+    else:
+        raise FileNotFoundError(f"Couldn't find any data file at {relative_to_absolute_path(path)}.")
+def load_dataset_builder(
+    path: str,
+    name: Optional[str] = None,
+    data_dir: Optional[str] = None,
+    data_files: Optional[Union[str, Sequence[str], Mapping[str, Union[str, Sequence[str]]]]] = None,
+    cache_dir: Optional[str] = None,
+    features: Optional[Features] = None,
+    download_config: Optional[DownloadConfig] = None,
+    download_mode: Optional[Union[DownloadMode, str]] = None,
+    revision: Optional[Union[str, Version]] = None,
+    token: Optional[Union[bool, str]] = None,
+    storage_options: Optional[dict] = None,
+    **config_kwargs,
+) -> DatasetBuilder:
+    """Load a dataset builder which can be used to:
+    - Inspect general information that is required to build a dataset (cache directory, config, dataset info, features, data files, etc.)
+    - Download and prepare the dataset as Arrow files in the cache
+    - Get a streaming dataset without downloading or caching anything
+    You can find the list of datasets on the [Hub](https://huggingface.co/datasets) or with [`huggingface_hub.list_datasets`].
+    A dataset is a directory that contains some data files in generic formats (JSON, CSV, Parquet, etc.) and possibly
+    in a generic structure (Webdataset, ImageFolder, AudioFolder, VideoFolder, etc.)
+    Args:
+        path (`str`):
+            Path or name of the dataset.
+            - if `path` is a dataset repository on the HF hub (list all available datasets with [`huggingface_hub.list_datasets`])
+              -> load the dataset builder from supported files in the repository (csv, json, parquet, etc.)
+              e.g. `'username/dataset_name'`, a dataset repository on the HF hub containing the data files.
+            - if `path` is a local directory
+              -> load the dataset builder from supported files in the directory (csv, json, parquet, etc.)
+              e.g. `'./path/to/directory/with/my/csv/data'`.
+            - if `path` is the name of a dataset builder and `data_files` or `data_dir` is specified
+              (available builders are "json", "csv", "parquet", "arrow", "text", "xml", "webdataset", "imagefolder", "audiofolder", "videofolder")
+              -> load the dataset builder from the files in `data_files` or `data_dir`
+              e.g. `'parquet'`.
+        name (`str`, *optional*):
+            Defining the name of the dataset configuration.
+        data_dir (`str`, *optional*):
+            Defining the `data_dir` of the dataset configuration. If specified for the generic builders (csv, text etc.) or the Hub datasets and `data_files` is `None`,
+            the behavior is equal to passing `os.path.join(data_dir, **)` as `data_files` to reference all the files in a directory.
+        data_files (`str` or `Sequence` or `Mapping`, *optional*):
+            Path(s) to source data file(s).
+        cache_dir (`str`, *optional*):
+            Directory to read/write data. Defaults to `"~/.cache/huggingface/datasets"`.
+        features ([`Features`], *optional*):
+            Set the features type to use for this dataset.
+        download_config ([`DownloadConfig`], *optional*):
+            Specific download configuration parameters.
+        download_mode ([`DownloadMode`] or `str`, defaults to `REUSE_DATASET_IF_EXISTS`):
+            Download/generate mode.
+        revision ([`Version`] or `str`, *optional*):
+            Version of the dataset to load.
+            As datasets have their own git repository on the Datasets Hub, the default version "main" corresponds to their "main" branch.
+            You can specify a different version than the default "main" by using a commit SHA or a git tag of the dataset repository.
+        token (`str` or `bool`, *optional*):
+            Optional string or boolean to use as Bearer token for remote files on the Datasets Hub.
+            If `True`, or not specified, will get token from `"~/.huggingface"`.
+        storage_options (`dict`, *optional*, defaults to `None`):
+            **Experimental**. Key/value pairs to be passed on to the dataset file-system backend, if any.
+            <Added version="2.11.0"/>
+        **config_kwargs (additional keyword arguments):
+            Keyword arguments to be passed to the [`BuilderConfig`]
+            and used in the [`DatasetBuilder`].
+    Returns:
+        [`DatasetBuilder`]
+    Example:
+    ```py
+    >>> from datasets import load_dataset_builder
+    >>> ds_builder = load_dataset_builder('cornell-movie-review-data/rotten_tomatoes')
+    >>> ds_builder.info.features
+    {'label': ClassLabel(names=['neg', 'pos']),
+     'text': Value('string')}
+    ```
+    """
+    download_mode = DownloadMode(download_mode or DownloadMode.REUSE_DATASET_IF_EXISTS)
+    if token is not None:
+        download_config = download_config.copy() if download_config else DownloadConfig()
+        download_config.token = token
+    if storage_options is not None:
+        download_config = download_config.copy() if download_config else DownloadConfig()
+        download_config.storage_options.update(storage_options)
+    if features is not None:
+        features = _fix_for_backward_compatible_features(features)
+    dataset_module = dataset_module_factory(
+        path,
+        revision=revision,
+        download_config=download_config,
+        download_mode=download_mode,
+        data_dir=data_dir,
+        data_files=data_files,
+        cache_dir=cache_dir,
+    )
+    # Get dataset builder class
+    builder_kwargs = dataset_module.builder_kwargs
+    data_dir = builder_kwargs.pop("data_dir", data_dir)
+    data_files = builder_kwargs.pop("data_files", data_files)
+    config_name = builder_kwargs.pop(
+        "config_name", name or dataset_module.builder_configs_parameters.default_config_name
+    )
+    dataset_name = builder_kwargs.pop("dataset_name", None)
+    info = dataset_module.dataset_infos.get(config_name) if dataset_module.dataset_infos else None
+    if (
+        path in _PACKAGED_DATASETS_MODULES
+        and data_files is None
+        and dataset_module.builder_configs_parameters.builder_configs[0].data_files is None
+    ):
+        error_msg = f"Please specify the data files or data directory to load for the {path} dataset builder."
+        example_extensions = [
+            extension for extension in _EXTENSION_TO_MODULE if _EXTENSION_TO_MODULE[extension] == path
+        ]
+        if example_extensions:
+            error_msg += f'\nFor example `data_files={{"train": "path/to/data/train/*.{example_extensions[0]}"}}`'
+        raise ValueError(error_msg)
+    builder_cls = get_dataset_builder_class(dataset_module, dataset_name=dataset_name)
+    # Instantiate the dataset builder
+    builder_instance: DatasetBuilder = builder_cls(
+        cache_dir=cache_dir,
+        dataset_name=dataset_name,
+        config_name=config_name,
+        data_dir=data_dir,
+        data_files=data_files,
+        hash=dataset_module.hash,
+        info=info,
+        features=features,
+        token=token,
+        storage_options=storage_options,
+        **builder_kwargs,
+        **config_kwargs,
+    )
+    builder_instance._use_legacy_cache_dir_if_possible(dataset_module)
+    return builder_instance
+def load_dataset(
+    path: str,
+    name: Optional[str] = None,
+    data_dir: Optional[str] = None,
+    data_files: Optional[Union[str, Sequence[str], Mapping[str, Union[str, Sequence[str]]]]] = None,
+    split: Optional[Union[str, Split, list[str], list[Split]]] = None,
+    cache_dir: Optional[str] = None,
+    features: Optional[Features] = None,
+    download_config: Optional[DownloadConfig] = None,
+    download_mode: Optional[Union[DownloadMode, str]] = None,
+    verification_mode: Optional[Union[VerificationMode, str]] = None,
+    keep_in_memory: Optional[bool] = None,
+    save_infos: bool = False,
+    revision: Optional[Union[str, Version]] = None,
+    token: Optional[Union[bool, str]] = None,
+    streaming: bool = False,
+    num_proc: Optional[int] = None,
+    storage_options: Optional[dict] = None,
+    **config_kwargs,
+) -> Union[DatasetDict, Dataset, IterableDatasetDict, IterableDataset]:
+    """Load a dataset from the Hugging Face Hub, or a local dataset.
+    You can find the list of datasets on the [Hub](https://huggingface.co/datasets) or with [`huggingface_hub.list_datasets`].
+    A dataset is a directory that contains some data files in generic formats (JSON, CSV, Parquet, etc.) and possibly
+    in a generic structure (Webdataset, ImageFolder, AudioFolder, VideoFolder, etc.)
+    This function does the following under the hood:
+        1. Load a dataset builder:
+            * Find the most common data format in the dataset and pick its associated builder (JSON, CSV, Parquet, Webdataset, ImageFolder, AudioFolder, etc.)
+            * Find which file goes into which split (e.g. train/test) based on file and directory names or on the YAML configuration
+            * It is also possible to specify `data_files` manually, and which dataset builder to use (e.g. "parquet").
+        2. Run the dataset builder:
+            In the general case:
+            * Download the data files from the dataset if they are not already available locally or cached.
+            * Process and cache the dataset in typed Arrow tables for caching.
+                Arrow table are arbitrarily long, typed tables which can store nested objects and be mapped to numpy/pandas/python generic types.
+                They can be directly accessed from disk, loaded in RAM or even streamed over the web.
+            In the streaming case:
+            * Don't download or cache anything. Instead, the dataset is lazily loaded and will be streamed on-the-fly when iterating on it.
+        3. Return a dataset built from the requested splits in `split` (default: all).
+    Args:
+        path (`str`):
+            Path or name of the dataset.
+            - if `path` is a dataset repository on the HF hub (list all available datasets with [`huggingface_hub.list_datasets`])
+              -> load the dataset from supported files in the repository (csv, json, parquet, etc.)
+              e.g. `'username/dataset_name'`, a dataset repository on the HF hub containing the data files.
+            - if `path` is a local directory
+              -> load the dataset from supported files in the directory (csv, json, parquet, etc.)
+              e.g. `'./path/to/directory/with/my/csv/data'`.
+            - if `path` is the name of a dataset builder and `data_files` or `data_dir` is specified
+              (available builders are "json", "csv", "parquet", "arrow", "text", "xml", "webdataset", "imagefolder", "audiofolder", "videofolder")
+              -> load the dataset from the files in `data_files` or `data_dir`
+              e.g. `'parquet'`.
+        name (`str`, *optional*):
+            Defining the name of the dataset configuration.
+        data_dir (`str`, *optional*):
+            Defining the `data_dir` of the dataset configuration. If specified for the generic builders (csv, text etc.) or the Hub datasets and `data_files` is `None`,
+            the behavior is equal to passing `os.path.join(data_dir, **)` as `data_files` to reference all the files in a directory.
+        data_files (`str` or `Sequence` or `Mapping`, *optional*):
+            Path(s) to source data file(s).
+        split (`Split` or `str`):
+            Which split of the data to load.
+            If `None`, will return a `dict` with all splits (typically `datasets.Split.TRAIN` and `datasets.Split.TEST`).
+            If given, will return a single Dataset.
+            Splits can be combined and specified like in tensorflow-datasets.
+        cache_dir (`str`, *optional*):
+            Directory to read/write data. Defaults to `"~/.cache/huggingface/datasets"`.
+        features (`Features`, *optional*):
+            Set the features type to use for this dataset.
+        download_config ([`DownloadConfig`], *optional*):
+            Specific download configuration parameters.
+        download_mode ([`DownloadMode`] or `str`, defaults to `REUSE_DATASET_IF_EXISTS`):
+            Download/generate mode.
+        verification_mode ([`VerificationMode`] or `str`, defaults to `BASIC_CHECKS`):
+            Verification mode determining the checks to run on the downloaded/processed dataset information (checksums/size/splits/...).
+            <Added version="2.9.1"/>
+        keep_in_memory (`bool`, defaults to `None`):
+            Whether to copy the dataset in-memory. If `None`, the dataset
+            will not be copied in-memory unless explicitly enabled by setting `datasets.config.IN_MEMORY_MAX_SIZE` to
+            nonzero. See more details in the [improve performance](../cache#improve-performance) section.
+        revision ([`Version`] or `str`, *optional*):
+            Version of the dataset to load.
+            As datasets have their own git repository on the Datasets Hub, the default version "main" corresponds to their "main" branch.
+            You can specify a different version than the default "main" by using a commit SHA or a git tag of the dataset repository.
+        token (`str` or `bool`, *optional*):
+            Optional string or boolean to use as Bearer token for remote files on the Datasets Hub.
+            If `True`, or not specified, will get token from `"~/.huggingface"`.
+        streaming (`bool`, defaults to `False`):
+            If set to `True`, don't download the data files. Instead, it streams the data progressively while
+            iterating on the dataset. An [`IterableDataset`] or [`IterableDatasetDict`] is returned instead in this case.
+            Note that streaming works for datasets that use data formats that support being iterated over like txt, csv, jsonl for example.
+            Json files may be downloaded completely. Also streaming from remote zip or gzip files is supported but other compressed formats
+            like rar and xz are not yet supported. The tgz format doesn't allow streaming.
+        num_proc (`int`, *optional*, defaults to `None`):
+            Number of processes when downloading and generating the dataset locally.
+            Multiprocessing is disabled by default.
+            <Added version="2.7.0"/>
+        storage_options (`dict`, *optional*, defaults to `None`):
+            **Experimental**. Key/value pairs to be passed on to the dataset file-system backend, if any.
+            <Added version="2.11.0"/>
+        **config_kwargs (additional keyword arguments):
+            Keyword arguments to be passed to the `BuilderConfig`
+            and used in the [`DatasetBuilder`].
+    Returns:
+        [`Dataset`] or [`DatasetDict`]:
+        - if `split` is not `None`: the dataset requested,
+        - if `split` is `None`, a [`~datasets.DatasetDict`] with each split.
+        or [`IterableDataset`] or [`IterableDatasetDict`]: if `streaming=True`
+        - if `split` is not `None`, the dataset is requested
+        - if `split` is `None`, a [`~datasets.streaming.IterableDatasetDict`] with each split.
+    Example:
+    Load a dataset from the Hugging Face Hub:
+    ```py
+    >>> from datasets import load_dataset
+    >>> ds = load_dataset('cornell-movie-review-data/rotten_tomatoes', split='train')
+    # Load a subset or dataset configuration (here 'sst2')
+    >>> from datasets import load_dataset
+    >>> ds = load_dataset('nyu-mll/glue', 'sst2', split='train')
+    # Manual mapping of data files to splits
+    >>> data_files = {'train': 'train.csv', 'test': 'test.csv'}
+    >>> ds = load_dataset('namespace/your_dataset_name', data_files=data_files)
+    # Manual selection of a directory to load
+    >>> ds = load_dataset('namespace/your_dataset_name', data_dir='folder_name')
+    ```
+    Load a local dataset:
+    ```py
+    # Load a CSV file
+    >>> from datasets import load_dataset
+    >>> ds = load_dataset('csv', data_files='path/to/local/my_dataset.csv')
+    # Load a JSON file
+    >>> from datasets import load_dataset
+    >>> ds = load_dataset('json', data_files='path/to/local/my_dataset.json')
+    ```
+    Load an [`~datasets.IterableDataset`]:
+    ```py
+    >>> from datasets import load_dataset
+    >>> ds = load_dataset('cornell-movie-review-data/rotten_tomatoes', split='train', streaming=True)
+    ```
+    Load an image dataset with the `ImageFolder` dataset builder:
+    ```py
+    >>> from datasets import load_dataset
+    >>> ds = load_dataset('imagefolder', data_dir='/path/to/images', split='train')
+    ```
+    """
+    if "trust_remote_code" in config_kwargs:
+        if config_kwargs.pop("trust_remote_code"):
+            logger.error(
+                "`trust_remote_code` is not supported anymore.\n"
+                f"Please check that the Hugging Face dataset '{path}' isn't based on a loading script and remove `trust_remote_code`.\n"
+                "If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet."
+            )
+    if data_files is not None and not data_files:
+        raise ValueError(f"Empty 'data_files': '{data_files}'. It should be either non-empty or None (default).")
+    if Path(path, config.DATASET_STATE_JSON_FILENAME).exists():
+        raise ValueError(
+            "You are trying to load a dataset that was saved using `save_to_disk`. "
+            "Please use `load_from_disk` instead."
+        )
+    if streaming and num_proc is not None:
+        raise NotImplementedError(
+            "Loading a streaming dataset in parallel with `num_proc` is not implemented. "
+            "To parallelize streaming, you can wrap the dataset with a PyTorch DataLoader using `num_workers` > 1 instead."
+        )
+    download_mode = DownloadMode(download_mode or DownloadMode.REUSE_DATASET_IF_EXISTS)
+    verification_mode = VerificationMode(
+        (verification_mode or VerificationMode.BASIC_CHECKS) if not save_infos else VerificationMode.ALL_CHECKS
+    )
+    # Create a dataset builder
+    builder_instance = load_dataset_builder(
+        path=path,
+        name=name,
+        data_dir=data_dir,
+        data_files=data_files,
+        cache_dir=cache_dir,
+        features=features,
+        download_config=download_config,
+        download_mode=download_mode,
+        revision=revision,
+        token=token,
+        storage_options=storage_options,
+        **config_kwargs,
+    )
+    # Return iterable dataset in case of streaming
+    if streaming:
+        return builder_instance.as_streaming_dataset(split=split)
+    # Download and prepare data
+    builder_instance.download_and_prepare(
+        download_config=download_config,
+        download_mode=download_mode,
+        verification_mode=verification_mode,
+        num_proc=num_proc,
+        storage_options=storage_options,
+    )
+    # Build dataset for splits
+    keep_in_memory = (
+        keep_in_memory if keep_in_memory is not None else is_small_dataset(builder_instance.info.dataset_size)
+    )
+    ds = builder_instance.as_dataset(split=split, verification_mode=verification_mode, in_memory=keep_in_memory)
+    return ds
+def load_from_disk(
+    dataset_path: PathLike, keep_in_memory: Optional[bool] = None, storage_options: Optional[dict] = None
+) -> Union[Dataset, DatasetDict]:
+    """
+    Loads a dataset that was previously saved using [`~Dataset.save_to_disk`] from a dataset directory, or
+    from a filesystem using any implementation of `fsspec.spec.AbstractFileSystem`.
+    Args:
+        dataset_path (`path-like`):
+            Path (e.g. `"dataset/train"`) or remote URI (e.g. `"s3://my-bucket/dataset/train"`)
+            of the [`Dataset`] or [`DatasetDict`] directory where the dataset/dataset-dict will be
+            loaded from.
+        keep_in_memory (`bool`, defaults to `None`):
+            Whether to copy the dataset in-memory. If `None`, the dataset
+            will not be copied in-memory unless explicitly enabled by setting `datasets.config.IN_MEMORY_MAX_SIZE` to
+            nonzero. See more details in the [improve performance](../cache#improve-performance) section.
+        storage_options (`dict`, *optional*):
+            Key/value pairs to be passed on to the file-system backend, if any.
+            <Added version="2.9.0"/>
+    Returns:
+        [`Dataset`] or [`DatasetDict`]:
+        - If `dataset_path` is a path of a dataset directory: the dataset requested.
+        - If `dataset_path` is a path of a dataset dict directory, a [`DatasetDict`] with each split.
+    Example:
+    ```py
+    >>> from datasets import load_from_disk
+    >>> ds = load_from_disk('path/to/dataset/directory')
+    ```
+    """
+    fs: fsspec.AbstractFileSystem
+    fs, *_ = url_to_fs(dataset_path, **(storage_options or {}))
+    if not fs.exists(dataset_path):
+        raise FileNotFoundError(f"Directory {dataset_path} not found")
+    if fs.isfile(posixpath.join(dataset_path, config.DATASET_INFO_FILENAME)) and fs.isfile(
+        posixpath.join(dataset_path, config.DATASET_STATE_JSON_FILENAME)
+    ):
+        return Dataset.load_from_disk(dataset_path, keep_in_memory=keep_in_memory, storage_options=storage_options)
+    elif fs.isfile(posixpath.join(dataset_path, config.DATASETDICT_JSON_FILENAME)):
+        return DatasetDict.load_from_disk(dataset_path, keep_in_memory=keep_in_memory, storage_options=storage_options)
+    else:
+        raise FileNotFoundError(
+            f"Directory {dataset_path} is neither a `Dataset` directory nor a `DatasetDict` directory."
+        )

datasets/naming.py ADDED Viewed

	@@ -0,0 +1,84 @@

+# Copyright 2020 The HuggingFace Datasets Authors and the TensorFlow Datasets Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Lint as: python3
+"""Utilities for file names."""
+import itertools
+import os
+import re
+# A run of capitals followed by a capitalized word, e.g. "HTTPServer" -> groups "HTTP" and "Se".
+_uppercase_uppercase_re = re.compile(r"([A-Z]+)([A-Z][a-z])")
+# A lowercase letter or digit immediately followed by a capital, e.g. "myData" -> groups "y" and "D".
+_lowercase_uppercase_re = re.compile(r"([a-z\d])([A-Z])")
+# An underscore with no underscore directly before or after it.
+_single_underscore_re = re.compile(r"(?<!_)_(?!_)")
+# A run of two or more underscores; the capture group makes `split` keep the run in its result.
+_multiple_underscores_re = re.compile(r"(_{2,})")
+# Pattern source (a plain string, not a compiled pattern) for split names:
+# word characters, optionally separated by single dots.
+_split_re = r"^\w+(\.\w+)*$"
+# Characters treated as invalid in Windows paths.
+INVALID_WINDOWS_CHARACTERS_IN_PATH = r"<>:/\|?*"
+def camelcase_to_snakecase(name):
+    """Convert camel-case string to snake-case."""
+    name = _uppercase_uppercase_re.sub(r"\1_\2", name)
+    name = _lowercase_uppercase_re.sub(r"\1_\2", name)
+    return name.lower()
+def snakecase_to_camelcase(name):
+    """Convert snake-case string to camel-case string."""
+    name = _single_underscore_re.split(name)
+    name = [_multiple_underscores_re.split(n) for n in name]
+    return "".join(n.capitalize() for n in itertools.chain.from_iterable(name) if n != "")
+def filename_prefix_for_name(name):
+    if os.path.basename(name) != name:
+        raise ValueError(f"Should be a dataset name, not a path: {name}")
+    return camelcase_to_snakecase(name)
+def filename_prefix_for_split(name, split):
+    if os.path.basename(name) != name:
+        raise ValueError(f"Should be a dataset name, not a path: {name}")
+    if not re.match(_split_re, split):
+        raise ValueError(f"Split name should match '{_split_re}'' but got '{split}'.")
+    return f"{filename_prefix_for_name(name)}-{split}"
+def filepattern_for_dataset_split(dataset_name, split, data_dir, filetype_suffix=None):
+    prefix = filename_prefix_for_split(dataset_name, split)
+    if filetype_suffix:
+        prefix += f".{filetype_suffix}"
+    filepath = os.path.join(data_dir, prefix)
+    return f"{filepath}*"
+def filenames_for_dataset_split(path, dataset_name, split, filetype_suffix=None, shard_lengths=None):
+    prefix = filename_prefix_for_split(dataset_name, split)
+    prefix = os.path.join(path, prefix)
+    if shard_lengths:
+        num_shards = len(shard_lengths)
+        filenames = [f"{prefix}-{shard_id:05d}-of-{num_shards:05d}" for shard_id in range(num_shards)]
+        if filetype_suffix:
+            filenames = [filename + f".{filetype_suffix}" for filename in filenames]
+        return filenames
+    else:
+        filename = prefix
+        if filetype_suffix:
+            filename += f".{filetype_suffix}"
+        return [filename]

datasets/search.py ADDED Viewed

	@@ -0,0 +1,785 @@

+import importlib.util
+import os
+import tempfile
+from pathlib import PurePath
+from typing import TYPE_CHECKING, NamedTuple, Optional, Union
+import fsspec
+import numpy as np
+from .features import List
+from .utils import logging
+from .utils import tqdm as hf_tqdm
+if TYPE_CHECKING:
+    from .arrow_dataset import Dataset  # noqa: F401
+    try:
+        from elasticsearch import Elasticsearch  # noqa: F401
+    except ImportError:
+        pass
+    try:
+        import faiss  # noqa: F401
+    except ImportError:
+        pass
+# Detect the optional backends without importing them: `find_spec` returns None when a package is absent.
+_has_elasticsearch = importlib.util.find_spec("elasticsearch") is not None
+_has_faiss = importlib.util.find_spec("faiss") is not None
+# Module-level logger named after this module.
+logger = logging.get_logger(__name__)
+class MissingIndex(Exception):
+    """Raised when an index is requested under a name that has not been attached yet."""
+    pass
+class SearchResults(NamedTuple):
+    """Result of a single-query search: retrieval scores and the matching example indices."""
+    scores: list[float]
+    indices: list[int]
+class BatchedSearchResults(NamedTuple):
+    """Result of a batched search: one list of scores and one list of indices per query."""
+    total_scores: list[list[float]]
+    total_indices: list[list[int]]
+class NearestExamplesResults(NamedTuple):
+    """Result of a single-query nearest-examples lookup: scores plus the examples as a dict."""
+    scores: list[float]
+    examples: dict
+class BatchedNearestExamplesResults(NamedTuple):
+    """Result of a batched nearest-examples lookup: per-query scores and per-query example dicts."""
+    total_scores: list[list[float]]
+    total_examples: list[dict]
+class BaseIndex:
+    """Base class for indexing"""
+    def search(self, query, k: int = 10, **kwargs) -> SearchResults:
+        """
+        To implement.
+        This method has to return the scores and the indices of the retrieved examples given a certain query.
+        """
+        raise NotImplementedError
+    def search_batch(self, queries, k: int = 10, **kwargs) -> BatchedSearchResults:
+        """Find the nearest examples indices to the query.
+        Args:
+            queries (`Union[List[str], np.ndarray]`): The queries as a list of strings if `column` is a text index or as a numpy array if `column` is a vector index.
+            k (`int`): The number of examples to retrieve per query.
+        Output:
+            total_scores (`List[List[float]`): The retrieval scores of the retrieved examples per query.
+            total_indices (`List[List[int]]`): The indices of the retrieved examples per query.
+        """
+        total_scores, total_indices = [], []
+        for query in queries:
+            scores, indices = self.search(query, k)
+            total_scores.append(scores)
+            total_indices.append(indices)
+        return BatchedSearchResults(total_scores, total_indices)
+    def save(self, file: Union[str, PurePath]):
+        """Serialize the index on disk"""
+        raise NotImplementedError
+    @classmethod
+    def load(cls, file: Union[str, PurePath]) -> "BaseIndex":
+        """Deserialize the index from disk"""
+        raise NotImplementedError
+class ElasticSearchIndex(BaseIndex):
+    """
+    Sparse index using Elasticsearch. It is used to index text and run queries based on BM25 similarity.
+    An Elasticsearch server needs to be accessible, and a python client is declared with
+    ```
+    es_client = Elasticsearch([{'host': 'localhost', 'port': '9200'}])
+    ```
+    for example.
+    """
+    def __init__(
+        self,
+        host: Optional[str] = None,
+        port: Optional[int] = None,
+        es_client: Optional["Elasticsearch"] = None,
+        es_index_name: Optional[str] = None,
+        es_index_config: Optional[dict] = None,
+    ):
+        if not _has_elasticsearch:
+            raise ImportError(
+                "You must install ElasticSearch to use ElasticSearchIndex. To do so you can run `pip install elasticsearch==7.7.1 for example`"
+            )
+        if es_client is not None and (host is not None or port is not None):
+            raise ValueError("Please specify either `es_client` or `(host, port)`, but not both.")
+        host = host or "localhost"
+        port = port or 9200
+        import elasticsearch.helpers  # noqa: F401 - need this to properly load all the es features
+        from elasticsearch import Elasticsearch  # noqa: F811
+        self.es_client = es_client if es_client is not None else Elasticsearch([{"host": host, "port": str(port)}])
+        self.es_index_name = (
+            es_index_name
+            if es_index_name is not None
+            else "huggingface_datasets_" + os.path.basename(tempfile.NamedTemporaryFile().name)
+        )
+        self.es_index_config = (
+            es_index_config
+            if es_index_config is not None
+            else {
+                "settings": {
+                    "number_of_shards": 1,
+                    "analysis": {"analyzer": {"stop_standard": {"type": "standard", " stopwords": "_english_"}}},
+                },
+                "mappings": {"properties": {"text": {"type": "text", "analyzer": "standard", "similarity": "BM25"}}},
+            }
+        )
+    def add_documents(self, documents: Union[list[str], "Dataset"], column: Optional[str] = None):
+        """
+        Add documents to the index.
+        If the documents are inside a certain column, you can specify it using the `column` argument.
+        """
+        index_name = self.es_index_name
+        index_config = self.es_index_config
+        self.es_client.indices.create(index=index_name, body=index_config)
+        number_of_docs = len(documents)
+        progress = hf_tqdm(unit="docs", total=number_of_docs)
+        successes = 0
+        def passage_generator():
+            if column is not None:
+                for i, example in enumerate(documents):
+                    yield {"text": example[column], "_id": i}
+            else:
+                for i, example in enumerate(documents):
+                    yield {"text": example, "_id": i}
+        # create the ES index
+        import elasticsearch as es
+        for ok, action in es.helpers.streaming_bulk(
+            client=self.es_client,
+            index=index_name,
+            actions=passage_generator(),
+        ):
+            progress.update(1)
+            successes += ok
+        if successes != len(documents):
+            logger.warning(
+                f"Some documents failed to be added to ElasticSearch. Failures: {len(documents) - successes}/{len(documents)}"
+            )
+        logger.info(f"Indexed {successes:d} documents")
+    def search(self, query: str, k=10, **kwargs) -> SearchResults:
+        """Find the nearest examples indices to the query.
+        Args:
+            query (`str`): The query as a string.
+            k (`int`): The number of examples to retrieve.
+        Output:
+            scores (`List[List[float]`): The retrieval scores of the retrieved examples.
+            indices (`List[List[int]]`): The indices of the retrieved examples.
+        """
+        response = self.es_client.search(
+            index=self.es_index_name,
+            body={"query": {"multi_match": {"query": query, "fields": ["text"], "type": "cross_fields"}}, "size": k},
+            **kwargs,
+        )
+        hits = response["hits"]["hits"]
+        return SearchResults([hit["_score"] for hit in hits], [int(hit["_id"]) for hit in hits])
+    def search_batch(self, queries, k: int = 10, max_workers=10, **kwargs) -> BatchedSearchResults:
+        import concurrent.futures
+        total_scores, total_indices = [None] * len(queries), [None] * len(queries)
+        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
+            future_to_index = {executor.submit(self.search, query, k, **kwargs): i for i, query in enumerate(queries)}
+            for future in concurrent.futures.as_completed(future_to_index):
+                index = future_to_index[future]
+                results: SearchResults = future.result()
+                total_scores[index] = results.scores
+                total_indices[index] = results.indices
+        return BatchedSearchResults(total_indices=total_indices, total_scores=total_scores)
+class FaissIndex(BaseIndex):
+    """
+    Dense index using Faiss. It is used to index vectors.
+    Faiss is a library for efficient similarity search and clustering of dense vectors.
+    It contains algorithms that search in sets of vectors of any size, up to ones that possibly do not fit in RAM.
+    You can find more information about Faiss here:
+    - For index types and the string factory: https://github.com/facebookresearch/faiss/wiki/The-index-factory
+    - For GPU settings: https://github.com/facebookresearch/faiss/wiki/Faiss-on-the-GPU
+    """
+    def __init__(
+        self,
+        device: Optional[Union[int, list[int]]] = None,
+        string_factory: Optional[str] = None,
+        metric_type: Optional[int] = None,
+        custom_index: Optional["faiss.Index"] = None,
+    ):
+        """
+        Create a Dense index using Faiss. You can specify `device` if you want to run it on GPU (`device` must be the GPU index).
+        You can find more information about Faiss here:
+        - For `string factory`: https://github.com/facebookresearch/faiss/wiki/The-index-factory
+        Raises:
+            ValueError: If `custom_index` is combined with `string_factory` or with `device`.
+            ImportError: If the `faiss` package is not installed.
+        """
+        if string_factory is not None and custom_index is not None:
+            raise ValueError("Please specify either `string_factory` or `custom_index` but not both.")
+        if device is not None and custom_index is not None:
+            raise ValueError(
+                "Cannot pass both 'custom_index' and 'device'. "
+                "Pass 'custom_index' already transferred to the target device instead."
+            )
+        self.device = device
+        self.string_factory = string_factory
+        self.metric_type = metric_type
+        # Stays None until `add_vectors` builds an index, unless a custom index was supplied.
+        self.faiss_index = custom_index
+        if not _has_faiss:
+            raise ImportError(
+                "You must install Faiss to use FaissIndex. To do so you can run `conda install -c pytorch faiss-cpu` or `conda install -c pytorch faiss-gpu`. "
+                "A community supported package is also available on pypi: `pip install faiss-cpu` or `pip install faiss-gpu`. "
+                "Note that pip may not have the latest version of FAISS, and thus, some of the latest features and bug fixes may not be available."
+            )
+    def add_vectors(
+        self,
+        vectors: Union[np.array, "Dataset"],
+        column: Optional[str] = None,
+        batch_size: int = 1000,
+        train_size: Optional[int] = None,
+        faiss_verbose: Optional[bool] = None,
+    ):
+        """
+        Add vectors to the index.
+        If the arrays are inside a certain column, you can specify it using the `column` argument.
+        The index is created on first use (from `string_factory` if set, else `faiss.IndexFlat`),
+        optionally trained on the first `train_size` vectors, then filled in batches of `batch_size`.
+        Raises:
+            ValueError: If `column` is given and its feature type is not `List`.
+        """
+        import faiss  # noqa: F811
+        if column and not isinstance(vectors.features[column], List):
+            raise ValueError(
+                f"Wrong feature type for column '{column}'. Expected 1d array, got {vectors.features[column]}"
+            )
+        # Create index
+        if self.faiss_index is None:
+            # The vector dimension is read from the first vector.
+            size = len(vectors[0]) if column is None else len(vectors[0][column])
+            if self.string_factory is not None:
+                # The metric is only passed when set, so faiss applies its own default otherwise.
+                if self.metric_type is None:
+                    index = faiss.index_factory(size, self.string_factory)
+                else:
+                    index = faiss.index_factory(size, self.string_factory, self.metric_type)
+            else:
+                if self.metric_type is None:
+                    index = faiss.IndexFlat(size)
+                else:
+                    index = faiss.IndexFlat(size, self.metric_type)
+            self.faiss_index = self._faiss_index_to_device(index, self.device)
+            logger.info(f"Created faiss index of type {type(self.faiss_index)}")
+        # Set verbosity level
+        if faiss_verbose is not None:
+            self.faiss_index.verbose = faiss_verbose
+            # Propagate the flag to the nested components that exist on this index.
+            if hasattr(self.faiss_index, "index") and self.faiss_index.index is not None:
+                self.faiss_index.index.verbose = faiss_verbose
+            if hasattr(self.faiss_index, "quantizer") and self.faiss_index.quantizer is not None:
+                self.faiss_index.quantizer.verbose = faiss_verbose
+            if hasattr(self.faiss_index, "clustering_index") and self.faiss_index.clustering_index is not None:
+                self.faiss_index.clustering_index.verbose = faiss_verbose
+        # Train
+        if train_size is not None:
+            train_vecs = vectors[:train_size] if column is None else vectors[:train_size][column]
+            logger.info(f"Training the index with the first {len(train_vecs)} vectors")
+            self.faiss_index.train(train_vecs)
+        else:
+            logger.info("Ignored the training step of the faiss index as `train_size` is None.")
+        # Add vectors
+        logger.info(f"Adding {len(vectors)} vectors to the faiss index")
+        for i in hf_tqdm(range(0, len(vectors), batch_size)):
+            vecs = vectors[i : i + batch_size] if column is None else vectors[i : i + batch_size][column]
+            self.faiss_index.add(vecs)
+    @staticmethod
+    def _faiss_index_to_device(index: "faiss.Index", device: Optional[Union[int, list[int]]] = None) -> "faiss.Index":
+        """
+        Sends a faiss index to a device.
+        A device can either be a non-negative integer (GPU id), a negative integer (all GPUs),
+            or a list/tuple of GPU ids (select GPUs to use), or `None` for CPU.
+        Raises:
+            TypeError: If `device` is none of the above.
+        """
+        # If device is not specified, then it runs on CPU.
+        if device is None:
+            return index
+        import faiss  # noqa: F811
+        # If the device id is given as an integer
+        if isinstance(device, int):
+            # Non-negative integers are directly mapped to GPU ids
+            if device > -1:
+                faiss_res = faiss.StandardGpuResources()
+                index = faiss.index_cpu_to_gpu(faiss_res, device, index)
+            # And negative integers mean using all GPUs
+            else:
+                index = faiss.index_cpu_to_all_gpus(index)
+        # Device ids given as a list mean mapping to those devices specified.
+        elif isinstance(device, (list, tuple)):
+            index = faiss.index_cpu_to_gpus_list(index, gpus=list(device))
+        else:
+            raise TypeError(
+                f"The argument type: {type(device)} is not expected. "
+                + "Please pass in either nothing, a positive int, a negative int, or a list of positive ints."
+            )
+        return index
+    def search(self, query: np.array, k=10, **kwargs) -> SearchResults:
+        """Find the nearest examples indices to the query.
+        Args:
+            query (`np.array`): The query as a numpy array, either 1D or 2D with shape (1, N).
+            k (`int`): The number of examples to retrieve.
+            **kwargs: Extra keyword arguments passed to the faiss index `search` call.
+        Output:
+            scores: The retrieval scores of the retrieved examples (first row of the faiss result).
+            indices: The indices of the retrieved examples, cast to `int`.
+        Raises:
+            ValueError: If `query` is neither 1D nor of shape (1, N).
+        """
+        if len(query.shape) != 1 and (len(query.shape) != 2 or query.shape[0] != 1):
+            raise ValueError("Shape of query is incorrect, it has to be either a 1D array or 2D (1, N)")
+        # Query the index with a single-row 2D array.
+        queries = query.reshape(1, -1)
+        if not queries.flags.c_contiguous:
+            queries = np.asarray(queries, order="C")
+        scores, indices = self.faiss_index.search(queries, k, **kwargs)
+        return SearchResults(scores[0], indices[0].astype(int))
+    def search_batch(self, queries: np.array, k=10, **kwargs) -> BatchedSearchResults:
+        """Find the nearest examples indices to the queries.
+        Args:
+            queries (`np.array`): The queries as a 2D numpy array, one row per query.
+            k (`int`): The number of examples to retrieve.
+            **kwargs: Extra keyword arguments passed to the faiss index `search` call.
+        Output:
+            total_scores: The retrieval scores of the retrieved examples per query.
+            total_indices: The indices of the retrieved examples per query, cast to `int`.
+        Raises:
+            ValueError: If `queries` is not 2D.
+        """
+        if len(queries.shape) != 2:
+            raise ValueError("Shape of query must be 2D")
+        if not queries.flags.c_contiguous:
+            queries = np.asarray(queries, order="C")
+        scores, indices = self.faiss_index.search(queries, k, **kwargs)
+        return BatchedSearchResults(scores, indices.astype(int))
+    def save(self, file: Union[str, PurePath], storage_options: Optional[dict] = None):
+        """Serialize the FaissIndex on disk
+        Args:
+            file: Destination path or URI, opened through `fsspec`.
+            storage_options (`dict`, *optional*): Key/value pairs passed to `fsspec.open`.
+        """
+        import faiss  # noqa: F811
+        # An index that was sent to GPU(s) is converted back to a CPU index before being written.
+        if self.device is not None and isinstance(self.device, (int, list, tuple)):
+            index = faiss.index_gpu_to_cpu(self.faiss_index)
+        else:
+            index = self.faiss_index
+        with fsspec.open(str(file), "wb", **(storage_options or {})) as f:
+            faiss.write_index(index, faiss.BufferedIOWriter(faiss.PyCallbackIOWriter(f.write)))
+    @classmethod
+    def load(
+        cls,
+        file: Union[str, PurePath],
+        device: Optional[Union[int, list[int]]] = None,
+        storage_options: Optional[dict] = None,
+    ) -> "FaissIndex":
+        """Deserialize the FaissIndex from disk
+        Args:
+            file: Source path or URI, opened through `fsspec`.
+            device: Target device for the loaded index; see `_faiss_index_to_device`.
+            storage_options (`dict`, *optional*): Key/value pairs passed to `fsspec.open`.
+        """
+        import faiss  # noqa: F811
+        # A FaissIndex instance is essentially just a wrapper around a faiss index.
+        faiss_index = cls(device=device)
+        with fsspec.open(str(file), "rb", **(storage_options or {})) as f:
+            index = faiss.read_index(faiss.BufferedIOReader(faiss.PyCallbackIOReader(f.read)))
+        faiss_index.faiss_index = faiss_index._faiss_index_to_device(index, faiss_index.device)
+        return faiss_index
+class IndexableMixin:
+    """Add indexing features to `datasets.Dataset`"""
+    def __init__(self):
+        """Start with no index attached."""
+        # Maps an index name to the index object attached under that name.
+        self._indexes: dict[str, BaseIndex] = {}
+    def __len__(self):
+        """Not implemented here; presumably provided by the class using this mixin."""
+        raise NotImplementedError
+    def __getitem__(self, key):
+        """Not implemented here; presumably provided by the class using this mixin."""
+        raise NotImplementedError
+    def is_index_initialized(self, index_name: str) -> bool:
+        """Return whether an index is attached under `index_name`."""
+        return index_name in self._indexes
+    def _check_index_is_initialized(self, index_name: str):
+        if not self.is_index_initialized(index_name):
+            raise MissingIndex(
+                f"Index with index_name '{index_name}' not initialized yet. Please make sure that you call `add_faiss_index` or `add_elasticsearch_index` first."
+            )
+    def list_indexes(self) -> list[str]:
+        """List the `colindex_nameumns`/identifiers of all the attached indexes."""
+        return list(self._indexes)
+    def get_index(self, index_name: str) -> BaseIndex:
+        """List the `index_name`/identifiers of all the attached indexes.
+        Args:
+            index_name (`str`): Index name.
+        Returns:
+            [`BaseIndex`]
+        """
+        self._check_index_is_initialized(index_name)
+        return self._indexes[index_name]
+    def add_faiss_index(
+        self,
+        column: str,
+        index_name: Optional[str] = None,
+        device: Optional[Union[int, list[int]]] = None,
+        string_factory: Optional[str] = None,
+        metric_type: Optional[int] = None,
+        custom_index: Optional["faiss.Index"] = None,
+        batch_size: int = 1000,
+        train_size: Optional[int] = None,
+        faiss_verbose: bool = False,
+    ):
+        """Add a dense index using Faiss for fast retrieval.
+        The index is created using the vectors of the specified column.
+        You can specify `device` if you want to run it on GPU (`device` must be the GPU index, see more below).
+        You can find more information about Faiss here:
+        - For `string factory`: https://github.com/facebookresearch/faiss/wiki/The-index-factory
+        Args:
+            column (`str`): The column of the vectors to add to the index.
+            index_name (Optional `str`): The index_name/identifier of the index. This is the index_name that is used to call `.get_nearest` or `.search`.
+                By default it corresponds to `column`.
+            device (Optional `Union[int, List[int]]`): If positive integer, this is the index of the GPU to use. If negative integer, use all GPUs.
+                If a list of positive integers is passed in, run only on those GPUs. By default it uses the CPU.
+            string_factory (Optional `str`): This is passed to the index factory of Faiss to create the index. Default index class is IndexFlatIP.
+            metric_type (Optional `int`): Type of metric. Ex: `faiss.METRIC_INNER_PRODUCT` or `faiss.METRIC_L2`.
+            custom_index (Optional `faiss.Index`): Custom Faiss index that you already have instantiated and configured for your needs.
+            batch_size (Optional `int`): Size of the batch to use while adding vectors to the FaissIndex. Default value is 1000.
+                <Added version="2.4.0"/>
+            train_size (Optional `int`): If the index needs a training step, specifies how many vectors will be used to train the index.
+            faiss_verbose (`bool`, defaults to False): Enable the verbosity of the Faiss index.
+        """
+        index_name = index_name if index_name is not None else column
+        faiss_index = FaissIndex(
+            device=device, string_factory=string_factory, metric_type=metric_type, custom_index=custom_index
+        )
+        faiss_index.add_vectors(
+            self, column=column, batch_size=batch_size, train_size=train_size, faiss_verbose=faiss_verbose
+        )
+        self._indexes[index_name] = faiss_index
+    def add_faiss_index_from_external_arrays(
+        self,
+        external_arrays: np.array,
+        index_name: str,
+        device: Optional[Union[int, list[int]]] = None,
+        string_factory: Optional[str] = None,
+        metric_type: Optional[int] = None,
+        custom_index: Optional["faiss.Index"] = None,
+        batch_size: int = 1000,
+        train_size: Optional[int] = None,
+        faiss_verbose: bool = False,
+    ):
+        """Add a dense index using Faiss for fast retrieval.
+        The index is created using the vectors of `external_arrays`.
+        You can specify `device` if you want to run it on GPU (`device` must be the GPU index).
+        You can find more information about Faiss here:
+        - For `string factory`: https://github.com/facebookresearch/faiss/wiki/The-index-factory
+        Args:
+            external_arrays (`np.array`): If you want to use arrays from outside the lib for the index, you can set `external_arrays`.
+                It will use `external_arrays` to create the Faiss index instead of the arrays in the given `column`.
+            index_name (`str`): The index_name/identifier of the index. This is the index_name that is used to call `.get_nearest` or `.search`.
+            device (Optional `Union[int, List[int]]`): If positive integer, this is the index of the GPU to use. If negative integer, use all GPUs.
+                If a list of positive integers is passed in, run only on those GPUs. By default it uses the CPU.
+            string_factory (Optional `str`): This is passed to the index factory of Faiss to create the index. Default index class is IndexFlatIP.
+            metric_type (Optional `int`): Type of metric. Ex: `faiss.METRIC_INNER_PRODUCT` or `faiss.METRIC_L2`.
+            custom_index (Optional `faiss.Index`): Custom Faiss index that you already have instantiated and configured for your needs.
+            batch_size (Optional `int`): Size of the batch to use while adding vectors to the FaissIndex. Default value is 1000.
+                <Added version="2.4.0"/>
+            train_size (Optional `int`): If the index needs a training step, specifies how many vectors will be used to train the index.
+            faiss_verbose (`bool`, defaults to False): Enable the verbosity of the Faiss index.
+        """
+        faiss_index = FaissIndex(
+            device=device, string_factory=string_factory, metric_type=metric_type, custom_index=custom_index
+        )
+        faiss_index.add_vectors(
+            external_arrays, column=None, batch_size=batch_size, train_size=train_size, faiss_verbose=faiss_verbose
+        )
+        self._indexes[index_name] = faiss_index
+    def save_faiss_index(self, index_name: str, file: Union[str, PurePath], storage_options: Optional[dict] = None):
+        """Save a FaissIndex on disk.
+        Args:
+            index_name (`str`): The index_name/identifier of the index. This is the index_name that is used to call `.get_nearest` or `.search`.
+            file (`str`): The path to the serialized faiss index on disk or remote URI (e.g. `"s3://my-bucket/index.faiss"`).
+            storage_options (`dict`, *optional*):
+                Key/value pairs to be passed on to the file-system backend, if any.
+                <Added version="2.11.0"/>
+        """
+        index = self.get_index(index_name)
+        if not isinstance(index, FaissIndex):
+            raise ValueError(f"Index '{index_name}' is not a FaissIndex but a '{type(index)}'")
+        index.save(file, storage_options=storage_options)
+        logger.info(f"Saved FaissIndex {index_name} at {file}")
+    def load_faiss_index(
+        self,
+        index_name: str,
+        file: Union[str, PurePath],
+        device: Optional[Union[int, list[int]]] = None,
+        storage_options: Optional[dict] = None,
+    ):
+        """Load a FaissIndex from disk.
+        If you want to do additional configurations, you can have access to the faiss index object by doing
+        `.get_index(index_name).faiss_index` to make it fit your needs.
+        Args:
+            index_name (`str`): The index_name/identifier of the index. This is the index_name that is used to
+                call `.get_nearest` or `.search`.
+            file (`str`): The path to the serialized faiss index on disk or remote URI (e.g. `"s3://my-bucket/index.faiss"`).
+            device (Optional `Union[int, List[int]]`): If positive integer, this is the index of the GPU to use. If negative integer, use all GPUs.
+                If a list of positive integers is passed in, run only on those GPUs. By default it uses the CPU.
+            storage_options (`dict`, *optional*):
+                Key/value pairs to be passed on to the file-system backend, if any.
+                <Added version="2.11.0"/>
+        """
+        index = FaissIndex.load(file, device=device, storage_options=storage_options)
+        if index.faiss_index.ntotal != len(self):
+            raise ValueError(
+                f"Index size should match Dataset size, but Index '{index_name}' at {file} has {index.faiss_index.ntotal} elements while the dataset has {len(self)} examples."
+            )
+        self._indexes[index_name] = index
+        logger.info(f"Loaded FaissIndex {index_name} from {file}")
+    def add_elasticsearch_index(
+        self,
+        column: str,
+        index_name: Optional[str] = None,
+        host: Optional[str] = None,
+        port: Optional[int] = None,
+        es_client: Optional["Elasticsearch"] = None,
+        es_index_name: Optional[str] = None,
+        es_index_config: Optional[dict] = None,
+    ):
+        """Add a text index using ElasticSearch for fast retrieval.
+        Args:
+            column (`str`): The column of the documents to add to the index.
+            index_name (Optional `str`): The index_name/identifier of the index. This is the index name that is used to call `.get_nearest` or `.search`.
+                By default it corresponds to `column`.
+            host (Optional `str`, defaults to localhost):
+                host of where ElasticSearch is running
+            port (Optional `str`, defaults to 9200):
+                port of where ElasticSearch is running
+            es_client (Optional `elasticsearch.Elasticsearch`):
+                The elasticsearch client used to create the index if host and port are None.
+            es_index_name (Optional `str`): The elasticsearch index name used to create the index.
+            es_index_config (Optional `dict`):
+                The configuration of the elasticsearch index.
+                Default config is:
+        Config::
+            {
+                "settings": {
+                    "number_of_shards": 1,
+                    "analysis": {"analyzer": {"stop_standard": {"type": "standard", " stopwords": "_english_"}}},
+                },
+                "mappings": {
+                    "properties": {
+                        "text": {
+                            "type": "text",
+                            "analyzer": "standard",
+                            "similarity": "BM25"
+                        },
+                    }
+                },
+            }
+        """
+        index_name = index_name if index_name is not None else column
+        es_index = ElasticSearchIndex(
+            host=host, port=port, es_client=es_client, es_index_name=es_index_name, es_index_config=es_index_config
+        )
+        es_index.add_documents(self, column=column)
+        self._indexes[index_name] = es_index
+    def load_elasticsearch_index(
+        self,
+        index_name: str,
+        es_index_name: str,
+        host: Optional[str] = None,
+        port: Optional[int] = None,
+        es_client: Optional["Elasticsearch"] = None,
+        es_index_config: Optional[dict] = None,
+    ):
+        """Load an existing text index using ElasticSearch for fast retrieval.
+        Args:
+            index_name (`str`):
+                The `index_name`/identifier of the index. This is the index name that is used to call `get_nearest` or `search`.
+            es_index_name (`str`):
+                The name of elasticsearch index to load.
+            host (`str`, *optional*, defaults to `localhost`):
+                Host of where ElasticSearch is running.
+            port (`str`, *optional*, defaults to `9200`):
+                Port of where ElasticSearch is running.
+            es_client (`elasticsearch.Elasticsearch`, *optional*):
+                The elasticsearch client used to create the index if host and port are `None`.
+            es_index_config (`dict`, *optional*):
+                The configuration of the elasticsearch index.
+                Default config is:
+                    ```
+                    {
+                        "settings": {
+                            "number_of_shards": 1,
+                            "analysis": {"analyzer": {"stop_standard": {"type": "standard", " stopwords": "_english_"}}},
+                        },
+                        "mappings": {
+                            "properties": {
+                                "text": {
+                                    "type": "text",
+                                    "analyzer": "standard",
+                                    "similarity": "BM25"
+                                },
+                            }
+                        },
+                    }
+                    ```
+        """
+        self._indexes[index_name] = ElasticSearchIndex(
+            host=host, port=port, es_client=es_client, es_index_name=es_index_name, es_index_config=es_index_config
+        )
+    def drop_index(self, index_name: str):
+        """Drop the index with the specified column.
+        Args:
+            index_name (`str`):
+                The `index_name`/identifier of the index.
+        """
+        del self._indexes[index_name]
+    def search(self, index_name: str, query: Union[str, np.array], k: int = 10, **kwargs) -> SearchResults:
+        """Find the nearest examples indices in the dataset to the query.
+        Args:
+            index_name (`str`):
+                The name/identifier of the index.
+            query (`Union[str, np.ndarray]`):
+                The query as a string if `index_name` is a text index or as a numpy array if `index_name` is a vector index.
+            k (`int`):
+                The number of examples to retrieve.
+        Returns:
+            `(scores, indices)`:
+                A tuple of `(scores, indices)` where:
+                - **scores** (`List[List[float]`): the retrieval scores from either FAISS (`IndexFlatL2` by default) or ElasticSearch of the retrieved examples
+                - **indices** (`List[List[int]]`): the indices of the retrieved examples
+        """
+        self._check_index_is_initialized(index_name)
+        return self._indexes[index_name].search(query, k, **kwargs)
+    def search_batch(
+        self, index_name: str, queries: Union[list[str], np.array], k: int = 10, **kwargs
+    ) -> BatchedSearchResults:
+        """Find the nearest examples indices in the dataset to the query.
+        Args:
+            index_name (`str`):
+                The `index_name`/identifier of the index.
+            queries (`Union[List[str], np.ndarray]`):
+                The queries as a list of strings if `index_name` is a text index or as a numpy array if `index_name` is a vector index.
+            k (`int`):
+                The number of examples to retrieve per query.
+        Returns:
+            `(total_scores, total_indices)`:
+                A tuple of `(total_scores, total_indices)` where:
+                - **total_scores** (`List[List[float]`): the retrieval scores from either FAISS (`IndexFlatL2` by default) or ElasticSearch of the retrieved examples per query
+                - **total_indices** (`List[List[int]]`): the indices of the retrieved examples per query
+        """
+        self._check_index_is_initialized(index_name)
+        return self._indexes[index_name].search_batch(queries, k, **kwargs)
+    def get_nearest_examples(
+        self, index_name: str, query: Union[str, np.array], k: int = 10, **kwargs
+    ) -> NearestExamplesResults:
+        """Find the nearest examples in the dataset to the query.
+        Args:
+            index_name (`str`):
+                The index_name/identifier of the index.
+            query (`Union[str, np.ndarray]`):
+                The query as a string if `index_name` is a text index or as a numpy array if `index_name` is a vector index.
+            k (`int`):
+                The number of examples to retrieve.
+        Returns:
+            `(scores, examples)`:
+                A tuple of `(scores, examples)` where:
+                - **scores** (`List[float]`): the retrieval scores from either FAISS (`IndexFlatL2` by default) or ElasticSearch of the retrieved examples
+                - **examples** (`dict`): the retrieved examples
+        """
+        self._check_index_is_initialized(index_name)
+        scores, indices = self.search(index_name, query, k, **kwargs)
+        top_indices = [i for i in indices if i >= 0]
+        return NearestExamplesResults(scores[: len(top_indices)], self[top_indices])
+    def get_nearest_examples_batch(
+        self, index_name: str, queries: Union[list[str], np.array], k: int = 10, **kwargs
+    ) -> BatchedNearestExamplesResults:
+        """Find the nearest examples in the dataset to the query.
+        Args:
+            index_name (`str`):
+                The `index_name`/identifier of the index.
+            queries (`Union[List[str], np.ndarray]`):
+                The queries as a list of strings if `index_name` is a text index or as a numpy array if `index_name` is a vector index.
+            k (`int`):
+                The number of examples to retrieve per query.
+        Returns:
+            `(total_scores, total_examples)`:
+                A tuple of `(total_scores, total_examples)` where:
+                - **total_scores** (`List[List[float]`): the retrieval scores from either FAISS (`IndexFlatL2` by default) or ElasticSearch of the retrieved examples per query
+                - **total_examples** (`List[dict]`): the retrieved examples per query
+        """
+        self._check_index_is_initialized(index_name)
+        total_scores, total_indices = self.search_batch(index_name, queries, k, **kwargs)
+        total_scores = [
+            scores_i[: len([i for i in indices_i if i >= 0])]
+            for scores_i, indices_i in zip(total_scores, total_indices)
+        ]
+        total_samples = [self[[i for i in indices if i >= 0]] for indices in total_indices]
+        return BatchedNearestExamplesResults(total_scores, total_samples)

datasets/splits.py ADDED Viewed

	@@ -0,0 +1,635 @@

+# Copyright 2020 The HuggingFace Datasets Authors and the TensorFlow Datasets Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Lint as: python3
+"""Splits related API."""
+import abc
+import collections
+import copy
+import dataclasses
+import re
+from dataclasses import dataclass
+from typing import Optional, Union
+from .arrow_reader import FileInstructions, make_file_instructions
+from .naming import _split_re
+from .utils.py_utils import NonMutableDict, asdict
+@dataclass
+class SplitInfo:
+    name: str = dataclasses.field(default="", metadata={"include_in_asdict_even_if_is_default": True})
+    num_bytes: int = dataclasses.field(default=0, metadata={"include_in_asdict_even_if_is_default": True})
+    num_examples: int = dataclasses.field(default=0, metadata={"include_in_asdict_even_if_is_default": True})
+    shard_lengths: Optional[list[int]] = None
+    # Deprecated
+    # For backward compatibility, this field needs to always be included in files like
+    # dataset_infos.json and dataset_info.json files
+    # To do so, we always include it in the output of datasets.utils.py_utils.asdict(split_info)
+    dataset_name: Optional[str] = dataclasses.field(
+        default=None, metadata={"include_in_asdict_even_if_is_default": True}
+    )
+    @property
+    def file_instructions(self):
+        """Returns the list of dict(filename, take, skip)."""
+        # `self.dataset_name` is assigned in `SplitDict.add()`.
+        instructions = make_file_instructions(
+            name=self.dataset_name,
+            split_infos=[self],
+            instruction=str(self.name),
+        )
+        return instructions.file_instructions
+@dataclass
+class SubSplitInfo:
+    """Wrapper around a sub split info.
+    This class expose info on the subsplit:
+    ```
+    ds, info = datasets.load_dataset(..., split='train[75%:]', with_info=True)
+    info.splits['train[75%:]'].num_examples
+    ```
+    """
+    instructions: FileInstructions
+    @property
+    def num_examples(self):
+        """Returns the number of example in the subsplit."""
+        return self.instructions.num_examples
+    @property
+    def file_instructions(self):
+        """Returns the list of dict(filename, take, skip)."""
+        return self.instructions.file_instructions
+class SplitBase(metaclass=abc.ABCMeta):
+    # pylint: disable=line-too-long
+    """Abstract base class for Split compositionality.
+    See the
+    [guide on splits](../loading#slice-splits)
+    for more information.
+    There are three parts to the composition:
+        1) The splits are composed (defined, merged, split,...) together before
+             calling the `.as_dataset()` function. This is done with the `__add__`,
+             `__getitem__`, which return a tree of `SplitBase` (whose leaf
+             are the `NamedSplit` objects)
+        ```
+        split = datasets.Split.TRAIN + datasets.Split.TEST.subsplit(datasets.percent[:50])
+        ```
+        2) The `SplitBase` is forwarded to the `.as_dataset()` function
+             to be resolved into actual read instruction. This is done by the
+             `.get_read_instruction()` method which takes the real dataset splits
+             (name, number of shards,...) and parse the tree to return a
+             `SplitReadInstruction()` object
+        ```
+        read_instruction = split.get_read_instruction(self.info.splits)
+        ```
+        3) The `SplitReadInstruction` is then used in the `tf.data.Dataset` pipeline
+             to define which files to read and how to skip examples within file.
+    """
+    # pylint: enable=line-too-long
+    @abc.abstractmethod
+    def get_read_instruction(self, split_dict):
+        """Parse the descriptor tree and compile all read instructions together.
+        Args:
+            split_dict: `dict`, The `dict[split_name, SplitInfo]` of the dataset
+        Returns:
+            split_read_instruction: `SplitReadInstruction`
+        """
+        raise NotImplementedError("Abstract method")
+    def __eq__(self, other):
+        """Equality: datasets.Split.TRAIN == 'train'."""
+        if isinstance(other, (NamedSplit, str)):
+            return False
+        raise NotImplementedError("Equality is not implemented between merged/sub splits.")
+    def __ne__(self, other):
+        """InEquality: datasets.Split.TRAIN != 'test'."""
+        return not self.__eq__(other)
+    def __add__(self, other):
+        """Merging: datasets.Split.TRAIN + datasets.Split.TEST."""
+        return _SplitMerged(self, other)
+    def subsplit(self, arg=None, k=None, percent=None, weighted=None):  # pylint: disable=redefined-outer-name
+        """Divides this split into subsplits.
+        There are 3 ways to define subsplits, which correspond to the 3
+        arguments `k` (get `k` even subsplits), `percent` (get a slice of the
+        dataset with `datasets.percent`), and `weighted` (get subsplits with proportions
+        specified by `weighted`).
+        Example::
+        ```
+        # 50% train, 50% test
+        train, test = split.subsplit(k=2)
+        # 50% train, 25% test, 25% validation
+        train, test, validation = split.subsplit(weighted=[2, 1, 1])
+        # Extract last 20%
+        subsplit = split.subsplit(datasets.percent[-20:])
+        ```
+        Warning: k and weighted will be converted into percent which mean that
+        values below the percent will be rounded up or down. The final split may be
+        bigger to deal with remainders. For instance:
+        ```
+        train, test, valid = split.subsplit(k=3)  # 33%, 33%, 34%
+        s1, s2, s3, s4 = split.subsplit(weighted=[2, 2, 1, 1])  # 33%, 33%, 16%, 18%
+        ```
+        Args:
+            arg: If no kwargs are given, `arg` will be interpreted as one of
+                `k`, `percent`, or `weighted` depending on the type.
+                For example:
+                ```
+                split.subsplit(10)  # Equivalent to split.subsplit(k=10)
+                split.subsplit(datasets.percent[:-20])  # percent=datasets.percent[:-20]
+                split.subsplit([1, 1, 2])  # weighted=[1, 1, 2]
+                ```
+            k: `int` If set, subdivide the split into `k` equal parts.
+            percent: `datasets.percent slice`, return a single subsplit corresponding to
+                a slice of the original split. For example:
+                `split.subsplit(datasets.percent[-20:])  # Last 20% of the dataset`.
+            weighted: `list[int]`, return a list of subsplits whose proportions match
+                the normalized sum of the list. For example:
+                `split.subsplit(weighted=[1, 1, 2])  # 25%, 25%, 50%`.
+        Returns:
+            A subsplit or list of subsplits extracted from this split object.
+        """
+        # Note that the percent kwargs redefine the outer name datasets.percent. This
+        # is done for consistency (.subsplit(percent=datasets.percent[:40]))
+        if sum(bool(x) for x in (arg, k, percent, weighted)) != 1:
+            raise ValueError("Only one argument of subsplit should be set.")
+        # Auto deduce k
+        if isinstance(arg, int):
+            k = arg
+        elif isinstance(arg, slice):
+            percent = arg
+        elif isinstance(arg, list):
+            weighted = arg
+        if not (k or percent or weighted):
+            raise ValueError(
+                f"Invalid split argument {arg}. Only list, slice and int supported. "
+                "One of k, weighted or percent should be set to a non empty value."
+            )
+        def assert_slices_coverage(slices):
+            # Ensure that the expended slices cover all percents.
+            assert sum((list(range(*s.indices(100))) for s in slices), []) == list(range(100))
+        if k:
+            if not 0 < k <= 100:
+                raise ValueError(f"Subsplit k should be between 0 and 100, got {k}")
+            shift = 100 // k
+            slices = [slice(i * shift, (i + 1) * shift) for i in range(k)]
+            # Round up last element to ensure all elements are taken
+            slices[-1] = slice(slices[-1].start, 100)
+            # Internal check to ensure full coverage
+            assert_slices_coverage(slices)
+            return tuple(_SubSplit(self, s) for s in slices)
+        elif percent:
+            return _SubSplit(self, percent)
+        elif weighted:
+            # Normalize the weighted sum
+            total = sum(weighted)
+            weighted = [100 * x // total for x in weighted]
+            # Create the slice for each of the elements
+            start = 0
+            stop = 0
+            slices = []
+            for v in weighted:
+                stop += v
+                slices.append(slice(start, stop))
+                start = stop
+            # Round up last element to ensure all elements are taken
+            slices[-1] = slice(slices[-1].start, 100)
+            # Internal check to ensure full coverage
+            assert_slices_coverage(slices)
+            return tuple(_SubSplit(self, s) for s in slices)
+        else:
+            # Should not be possible
+            raise ValueError("Could not determine the split")
+# 2 requirements:
+# 1. datasets.percent be sliceable
+# 2. datasets.percent be documented
+#
+# Instances are not documented, so we want datasets.percent to be a class, but to
+# have it be sliceable, we need this metaclass.
+class PercentSliceMeta(type):
+    def __getitem__(cls, slice_value):
+        if not isinstance(slice_value, slice):
+            raise ValueError(f"datasets.percent should only be called with slice, not {slice_value}")
+        return slice_value
+class PercentSlice(metaclass=PercentSliceMeta):
+    # pylint: disable=line-too-long
+    """Syntactic sugar for defining slice subsplits: `datasets.percent[75:-5]`.
+    See the
+    [guide on splits](../loading#slice-splits)
+    for more information.
+    """
+    # pylint: enable=line-too-long
+    pass
+percent = PercentSlice  # pylint: disable=invalid-name
+class _SplitMerged(SplitBase):
+    """Represent two split descriptors merged together."""
+    def __init__(self, split1, split2):
+        self._split1 = split1
+        self._split2 = split2
+    def get_read_instruction(self, split_dict):
+        read_instruction1 = self._split1.get_read_instruction(split_dict)
+        read_instruction2 = self._split2.get_read_instruction(split_dict)
+        return read_instruction1 + read_instruction2
+    def __repr__(self):
+        return f"({repr(self._split1)} + {repr(self._split2)})"
+class _SubSplit(SplitBase):
+    """Represent a sub split of a split descriptor."""
+    def __init__(self, split, slice_value):
+        self._split = split
+        self._slice_value = slice_value
+    def get_read_instruction(self, split_dict):
+        return self._split.get_read_instruction(split_dict)[self._slice_value]
+    def __repr__(self):
+        slice_str = "{start}:{stop}"
+        if self._slice_value.step is not None:
+            slice_str += ":{step}"
+        slice_str = slice_str.format(
+            start="" if self._slice_value.start is None else self._slice_value.start,
+            stop="" if self._slice_value.stop is None else self._slice_value.stop,
+            step=self._slice_value.step,
+        )
+        return f"{repr(self._split)}(datasets.percent[{slice_str}])"
+class NamedSplit(SplitBase):
+    """Descriptor corresponding to a named split (train, test, ...).
+    Example:
+        Each descriptor can be composed with other using addition or slice:
+            ```py
+            split = datasets.Split.TRAIN.subsplit(datasets.percent[0:25]) + datasets.Split.TEST
+            ```
+        The resulting split will correspond to 25% of the train split merged with
+        100% of the test split.
+        A split cannot be added twice, so the following will fail:
+            ```py
+            split = (
+                    datasets.Split.TRAIN.subsplit(datasets.percent[:25]) +
+                    datasets.Split.TRAIN.subsplit(datasets.percent[75:])
+            )  # Error
+            split = datasets.Split.TEST + datasets.Split.ALL  # Error
+            ```
+        The slices can be applied only one time. So the following are valid:
+            ```py
+            split = (
+                    datasets.Split.TRAIN.subsplit(datasets.percent[:25]) +
+                    datasets.Split.TEST.subsplit(datasets.percent[:50])
+            )
+            split = (datasets.Split.TRAIN + datasets.Split.TEST).subsplit(datasets.percent[:50])
+            ```
+        But this is not valid:
+            ```py
+            train = datasets.Split.TRAIN
+            test = datasets.Split.TEST
+            split = train.subsplit(datasets.percent[:25]).subsplit(datasets.percent[:25])
+            split = (train.subsplit(datasets.percent[:25]) + test).subsplit(datasets.percent[:50])
+            ```
+    """
+    def __init__(self, name):
+        self._name = name
+        split_names_from_instruction = [split_instruction.split("[")[0] for split_instruction in name.split("+")]
+        for split_name in split_names_from_instruction:
+            if not re.match(_split_re, split_name):
+                raise ValueError(f"Split name should match '{_split_re}' but got '{split_name}'.")
+    def __str__(self):
+        return self._name
+    def __repr__(self):
+        return f"NamedSplit({self._name!r})"
+    def __eq__(self, other):
+        """Equality: datasets.Split.TRAIN == 'train'."""
+        if isinstance(other, NamedSplit):
+            return self._name == other._name  # pylint: disable=protected-access
+        elif isinstance(other, SplitBase):
+            return False
+        elif isinstance(other, str):  # Other should be string
+            return self._name == other
+        else:
+            return False
+    def __lt__(self, other):
+        return self._name < other._name  # pylint: disable=protected-access
+    def __hash__(self):
+        return hash(self._name)
+    def get_read_instruction(self, split_dict):
+        return SplitReadInstruction(split_dict[self._name])
+class NamedSplitAll(NamedSplit):
+    """Split corresponding to the union of all defined dataset splits."""
+    def __init__(self):
+        super().__init__("all")
+    def __repr__(self):
+        return "NamedSplitAll()"
+    def get_read_instruction(self, split_dict):
+        # Merge all dataset split together
+        read_instructions = [SplitReadInstruction(s) for s in split_dict.values()]
+        return sum(read_instructions, SplitReadInstruction())
+class Split:
+    # pylint: disable=line-too-long
+    """`Enum` for dataset splits.
+    Datasets are typically split into different subsets to be used at various
+    stages of training and evaluation.
+    - `TRAIN`: the training data.
+    - `VALIDATION`: the validation data. If present, this is typically used as
+      evaluation data while iterating on a model (e.g. changing hyperparameters,
+      model architecture, etc.).
+    - `TEST`: the testing data. This is the data to report metrics on. Typically
+      you do not want to use this during model iteration as you may overfit to it.
+    - `ALL`: the union of all defined dataset splits.
+    All splits, including compositions inherit from `datasets.SplitBase`.
+    See the [guide](../load_hub#splits) on splits for more information.
+    Example:
+    ```py
+    >>> datasets.SplitGenerator(
+    ...     name=datasets.Split.TRAIN,
+    ...     gen_kwargs={"split_key": "train", "files": dl_manager.download_and extract(url)},
+    ... ),
+    ... datasets.SplitGenerator(
+    ...     name=datasets.Split.VALIDATION,
+    ...     gen_kwargs={"split_key": "validation", "files": dl_manager.download_and extract(url)},
+    ... ),
+    ... datasets.SplitGenerator(
+    ...     name=datasets.Split.TEST,
+    ...     gen_kwargs={"split_key": "test", "files": dl_manager.download_and extract(url)},
+    ... )
+    ```
+    """
+    # pylint: enable=line-too-long
+    TRAIN = NamedSplit("train")
+    TEST = NamedSplit("test")
+    VALIDATION = NamedSplit("validation")
+    ALL = NamedSplitAll()
+    def __new__(cls, name):
+        """Create a custom split with datasets.Split('custom_name')."""
+        return NamedSplitAll() if name == "all" else NamedSplit(name)
+# Similar to SplitInfo, but contain an additional slice info
+SlicedSplitInfo = collections.namedtuple(
+    "SlicedSplitInfo",
+    [
+        "split_info",
+        "slice_value",
+    ],
+)  # noqa: E231
class SplitReadInstruction:
    """Flattened reading instruction for a dataset.

    Like `SplitDescriptor` nodes, instances can be composed with one another.
    The composition is resolved immediately instead of being recorded as a
    tree: every operation yields a single `SplitReadInstruction` holding the
    splits (and their slice) to use.

    Once resolved, the instructions can be accessed with:

    ```
    read_instructions.get_list_sliced_split_info()  # List of splits to use
    ```
    """

    def __init__(self, split_info=None):
        # Maps a split name to its SlicedSplitInfo.
        self._splits = NonMutableDict(error_msg="Overlap between splits. Split {key} has been added with itself.")
        if not split_info:
            return
        self.add(SlicedSplitInfo(split_info=split_info, slice_value=None))

    def add(self, sliced_split):
        """Register a `SlicedSplitInfo` under the name of the split it wraps."""
        # TODO(epot): Check that the number of examples per shard % 100 == 0
        # Otherwise the slices value may be unbalanced and not exactly reflect the
        # requested slice.
        split_name = sliced_split.split_info.name
        self._splits[split_name] = sliced_split

    def __add__(self, other):
        """Return a new instruction holding the splits of both operands."""
        # The NonMutableDict raises if the same split shows up on both sides.
        # TODO(epot): If a split is already added but there is no overlap between
        # the slices, should merge the slices (ex: [:10] + [80:])
        merged = SplitReadInstruction()
        for operand in (self, other):
            merged._splits.update(operand._splits)  # pylint: disable=protected-access
        return merged

    def __getitem__(self, slice_value):
        """Return a new instruction where every split carries `slice_value`.

        Raises:
            `ValueError`: if one of the splits has already been sliced.
        """
        sliced = SplitReadInstruction()
        for entry in self._splits.values():
            if entry.slice_value is not None:
                raise ValueError(f"Trying to slice Split {entry.split_info.name} which has already been sliced")
            sliced.add(SlicedSplitInfo(**{**entry._asdict(), "slice_value": slice_value}))
        return sliced

    def get_list_sliced_split_info(self):
        """Return the registered `SlicedSplitInfo` entries as a list."""
        return [*self._splits.values()]
class SplitDict(dict):
    """Split info object: a `dict` mapping split names to their `SplitInfo`."""

    def __init__(self, *args, dataset_name=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.dataset_name = dataset_name

    def __getitem__(self, key: Union[SplitBase, str]):
        """Return the `SplitInfo` stored under `key`, or a `SubSplitInfo` built from an instruction."""
        # 1st case: The key exists: `info.splits['train']`
        if str(key) in self:
            return super().__getitem__(str(key))
        # 2nd case: Uses instructions: `info.splits['train[50%]']`
        instructions = make_file_instructions(
            name=self.dataset_name,
            split_infos=self.values(),
            instruction=key,
        )
        return SubSplitInfo(instructions)

    def __setitem__(self, key: Union[SplitBase, str], value: SplitInfo):
        """Store `value` under `key`.

        Raises:
            `ValueError`: if `key` differs from `value.name`.
        """
        if key != value.name:
            raise ValueError(f"Cannot add elem. (key mismatch: '{key}' != '{value.name}')")
        super().__setitem__(key, value)

    def add(self, split_info: SplitInfo):
        """Add the split info.

        The split's `dataset_name` is overwritten with the one of this `SplitDict`.

        Raises:
            `ValueError`: if a split with the same name is already present.
        """
        if split_info.name in self:
            raise ValueError(f"Split {split_info.name} already present")
        split_info.dataset_name = self.dataset_name
        super().__setitem__(split_info.name, split_info)

    @property
    def total_num_examples(self):
        """Return the total number of examples."""
        return sum(s.num_examples for s in self.values())

    @classmethod
    def from_split_dict(cls, split_infos: Union[list, dict], dataset_name: Optional[str] = None):
        """Returns a new SplitDict initialized from a Dict or List of `split_infos`.

        Args:
            split_infos (`list` or `dict`):
                Split infos, each given either as a `dict` of `SplitInfo` keyword arguments
                or as an already-built `SplitInfo`. For a `dict` input only the values are used.
            dataset_name (`str`, *optional*):
                Name stored on the new `SplitDict`. When omitted, it is taken from the first
                split info, if any.
        """
        if isinstance(split_infos, dict):
            split_infos = list(split_infos.values())

        if dataset_name is None and split_infos:
            first = split_infos[0]
            # Entries are not necessarily dicts: `copy()` passes SplitInfo objects, which have
            # no `.get`, so read the attribute instead in that case.
            if isinstance(first, dict):
                dataset_name = first.get("dataset_name")
            else:
                dataset_name = getattr(first, "dataset_name", None)

        split_dict = cls(dataset_name=dataset_name)

        for split_info in split_infos:
            if isinstance(split_info, dict):
                split_info = SplitInfo(**split_info)
            split_dict.add(split_info)

        return split_dict

    def to_split_dict(self):
        """Returns a list of deep-copied `SplitInfo`, each named after the key it is stored under."""
        out = []
        for split_name, split_info in self.items():
            split_info = copy.deepcopy(split_info)
            split_info.name = split_name
            out.append(split_info)
        return out

    def copy(self):
        """Return a new `SplitDict` holding deep copies of the split infos."""
        return SplitDict.from_split_dict(self.to_split_dict(), self.dataset_name)

    def _to_yaml_list(self) -> list:
        out = [asdict(s) for s in self.to_split_dict()]
        for split_info_dict in out:
            # we don't need the shard lengths in YAML, since it depends on max_shard_size and num_proc
            split_info_dict.pop("shard_lengths", None)
            # we don't need the dataset_name attribute that is deprecated
            split_info_dict.pop("dataset_name", None)
        return out

    @classmethod
    def _from_yaml_list(cls, yaml_data: list) -> "SplitDict":
        return cls.from_split_dict(yaml_data)
@dataclass
class SplitGenerator:
    """Describes one split that a builder's generator has to produce.

    Instances are what `GeneratorBasedBuilder._split_generators` is expected
    to return. See `GeneratorBasedBuilder._split_generators` for more info
    and an example of usage.

    Args:
        name (`str`):
            Name of the `Split` for which the generator will
            create the examples.
        **gen_kwargs (additional keyword arguments):
            Keyword arguments to forward to the `DatasetBuilder._generate_examples` method
            of the builder.

    Example:

    ```py
    >>> datasets.SplitGenerator(
    ...     name=datasets.Split.TRAIN,
    ...     gen_kwargs={"split_key": "train", "files": dl_manager.download_and_extract(url)},
    ... )
    ```
    """

    name: str
    gen_kwargs: dict = dataclasses.field(default_factory=dict)
    # Not an `__init__` argument: filled in by `__post_init__` from the split name.
    split_info: SplitInfo = dataclasses.field(init=False)

    def __post_init__(self):
        # NamedSplit instances are accepted as names, but the name is stored as a plain string.
        split_name = str(self.name)
        self.name = split_name
        # Instantiated only to check that it's a valid split name; the result is discarded.
        NamedSplit(split_name)
        self.split_info = SplitInfo(name=split_name)

datasets/streaming.py ADDED Viewed

	@@ -0,0 +1,131 @@

+import importlib
+from functools import wraps
+from typing import TYPE_CHECKING, Optional
+from .download.download_config import DownloadConfig
+from .utils.file_utils import (
+    xbasename,
+    xdirname,
+    xet_parse,
+    xexists,
+    xgetsize,
+    xglob,
+    xgzip_open,
+    xisdir,
+    xisfile,
+    xjoin,
+    xlistdir,
+    xnumpy_load,
+    xopen,
+    xpandas_read_csv,
+    xpandas_read_excel,
+    xPath,
+    xpyarrow_parquet_read_table,
+    xrelpath,
+    xsio_loadmat,
+    xsplit,
+    xsplitext,
+    xwalk,
+    xxml_dom_minidom_parse,
+)
+from .utils.logging import get_logger
+from .utils.patching import patch_submodule
+logger = get_logger(__name__)
+if TYPE_CHECKING:
+    from .builder import DatasetBuilder
def extend_module_for_streaming(module_path, download_config: Optional[DownloadConfig] = None):
    """Extend the module to support streaming.

    We patch some functions in the module to use `fsspec` to support data streaming:
    - We use `fsspec.open` to open and read remote files. We patch the module function:
      - `open`
    - We use the "::" hop separator to join paths and navigate remote compressed/archive files. We patch the module
      functions:
      - `os.path.join`
      - `pathlib.Path.joinpath` and `pathlib.Path.__truediv__` (called when using the "/" operator)

    The patched functions are replaced with custom functions defined to work with the
    :class:`~download.streaming_download_manager.StreamingDownloadManager`.

    If the module has already been patched with a `DownloadConfig`, nothing is patched again: only the
    `token` and `storage_options` of that stored config are refreshed from `download_config`. In that case
    a `download_config` of `None` leaves the stored config untouched.

    Args:
        module_path: Path to the module to be extended.
        download_config: Mainly use `token` or `storage_options` to support different platforms and auth types.
    """

    module = importlib.import_module(module_path)

    # TODO(QL): always update the module to add subsequent new authentication without removing old ones
    patched_with = getattr(module, "_patched_for_streaming", None)
    if patched_with:
        # The wrappers installed by the first call close over this very config object,
        # so updating it in place is enough to change the credentials they use.
        # `download_config` defaults to None: there is nothing to copy from in that case.
        if download_config is not None and isinstance(patched_with, DownloadConfig):
            patched_with.token = download_config.token
            patched_with.storage_options = download_config.storage_options
        return

    def wrap_auth(function):
        """Wrap `function` so that it is always called with `download_config=download_config`."""

        @wraps(function)
        def wrapper(*args, **kwargs):
            return function(*args, download_config=download_config, **kwargs)

        wrapper._decorator_name_ = "wrap_auth"
        return wrapper

    # open files in a streaming fashion
    patch_submodule(module, "open", wrap_auth(xopen)).start()
    patch_submodule(module, "os.listdir", wrap_auth(xlistdir)).start()
    patch_submodule(module, "os.walk", wrap_auth(xwalk)).start()
    patch_submodule(module, "glob.glob", wrap_auth(xglob)).start()
    # allow to navigate in remote zip files
    patch_submodule(module, "os.path.join", xjoin).start()
    patch_submodule(module, "os.path.dirname", xdirname).start()
    patch_submodule(module, "os.path.basename", xbasename).start()
    patch_submodule(module, "os.path.relpath", xrelpath).start()
    patch_submodule(module, "os.path.split", xsplit).start()
    patch_submodule(module, "os.path.splitext", xsplitext).start()
    # allow checks on paths
    patch_submodule(module, "os.path.exists", wrap_auth(xexists)).start()
    patch_submodule(module, "os.path.isdir", wrap_auth(xisdir)).start()
    patch_submodule(module, "os.path.isfile", wrap_auth(xisfile)).start()
    patch_submodule(module, "os.path.getsize", wrap_auth(xgetsize)).start()
    patch_submodule(module, "pathlib.Path", xPath).start()
    # file readers
    patch_submodule(module, "gzip.open", wrap_auth(xgzip_open)).start()
    patch_submodule(module, "numpy.load", wrap_auth(xnumpy_load)).start()
    patch_submodule(module, "pandas.read_csv", wrap_auth(xpandas_read_csv), attrs=["__version__"]).start()
    patch_submodule(module, "pandas.read_excel", wrap_auth(xpandas_read_excel), attrs=["__version__"]).start()
    patch_submodule(module, "scipy.io.loadmat", wrap_auth(xsio_loadmat), attrs=["__version__"]).start()
    patch_submodule(module, "xml.etree.ElementTree.parse", wrap_auth(xet_parse)).start()
    patch_submodule(module, "xml.dom.minidom.parse", wrap_auth(xxml_dom_minidom_parse)).start()
    # pyarrow: do not patch pyarrow attribute in packaged modules
    if not module.__name__.startswith("datasets.packaged_modules."):
        patch_submodule(module, "pyarrow.parquet.read_table", wrap_auth(xpyarrow_parquet_read_table)).start()
    # Remember the config used for patching; the early-return branch above reads it on later calls.
    module._patched_for_streaming = download_config
def extend_dataset_builder_for_streaming(builder: "DatasetBuilder"):
    """Patch for streaming the module of a dataset builder and the modules of its parent builders.

    Args:
        builder (:class:`DatasetBuilder`): Dataset builder instance.
    """
    # The same config (built from the builder's credentials) is used for every module patched below.
    download_config = DownloadConfig(storage_options=builder.storage_options, token=builder.token)

    # this extends the open and os.path.join functions for data streaming
    extend_module_for_streaming(builder.__module__, download_config=download_config)

    # builders can inherit from other builders that might use streaming functionality
    # (for example, ImageFolder and AudioFolder inherit from FolderBuilder which implements examples generation)
    # but these parents builders are not patched automatically as they are not instantiated, so we patch them here
    from .builder import DatasetBuilder

    parent_builder_modules = []
    # `__mro__[1:]` skips the builder's own class, whose module has just been patched
    for ancestor in type(builder).__mro__[1:]:
        if not issubclass(ancestor, DatasetBuilder):
            continue
        # standard builders from datasets.builder are left alone
        if ancestor.__module__ == DatasetBuilder.__module__:
            continue
        parent_builder_modules.append(ancestor.__module__)

    for parent_module in parent_builder_modules:
        extend_module_for_streaming(parent_module, download_config=download_config)

datasets/table.py ADDED Viewed

	@@ -0,0 +1,2385 @@

+import copy
+import os
+from collections.abc import Iterator
+from functools import partial
+from itertools import groupby
+from typing import TYPE_CHECKING, Any, Callable, Optional, TypeVar, Union
+import numpy as np
+import pyarrow as pa
+import pyarrow.compute as pc
+from .utils.logging import get_logger
+if TYPE_CHECKING:
+    from .features.features import Features, FeatureType
+logger = get_logger(__name__)
def inject_arrow_table_documentation(arrow_table_method):
    """Return a decorator that makes a function inherit the documentation of `arrow_table_method`.

    The decorated function's docstring becomes the docstring of `arrow_table_method` followed by
    its own, with every occurrence of "pyarrow.Table" replaced by "Table". If `arrow_table_method`
    has an `__annotations__` attribute, it is assigned to the decorated function as well.

    Args:
        arrow_table_method: Callable whose docstring (and annotations, if any) are injected.

    Returns:
        A decorator that updates the function in place and returns it.
    """

    def wrapper(fn):
        # Either docstring may be None (no docstring written, or stripped by `python -OO`);
        # treat it as empty instead of failing on `None + str`.
        inherited_doc = arrow_table_method.__doc__ or ""
        own_doc = fn.__doc__ or ""
        fn.__doc__ = (inherited_doc + own_doc).replace("pyarrow.Table", "Table")
        if hasattr(arrow_table_method, "__annotations__"):
            fn.__annotations__ = arrow_table_method.__annotations__
        return fn

    return wrapper
def _in_memory_arrow_table_from_file(filename: str) -> pa.Table:
    """Open `filename` with `pa.input_stream`, read it as an Arrow IPC stream and return the whole table."""
    reader = pa.ipc.open_stream(pa.input_stream(filename))
    return reader.read_all()
def _in_memory_arrow_table_from_buffer(buffer: pa.Buffer) -> pa.Table:
    """Read the Arrow IPC stream held in `buffer` and return the whole table."""
    reader = pa.ipc.open_stream(pa.BufferReader(buffer))
    return reader.read_all()
def _memory_mapped_record_batch_reader_from_file(filename: str) -> pa.RecordBatchStreamReader:
    """Memory-map `filename` and return an Arrow IPC stream reader over the mapping."""
    return pa.ipc.open_stream(pa.memory_map(filename))
def read_schema_from_file(filename: str) -> pa.Schema:
    """
    Infer arrow table schema from file without loading whole file into memory.
    Useful especially while having very big files.

    The file is memory-mapped only for the duration of the call.
    """
    with pa.memory_map(filename) as mmap_source:
        reader = pa.ipc.open_stream(mmap_source)
        return reader.schema
def _memory_mapped_arrow_table_from_file(filename: str) -> pa.Table:
    """Return the table read from the memory-mapped Arrow IPC stream stored in `filename`."""
    return _memory_mapped_record_batch_reader_from_file(filename).read_all()
+def _deepcopy(x, memo: dict):
+    """deepcopy a regular class instance"""
+    cls = x.__class__
+    result = cls.__new__(cls)
+    memo[id(x)] = result
+    for k, v in x.__dict__.items():
+        setattr(result, k, copy.deepcopy(v, memo))
+    return result
+def _interpolation_search(arr: list[int], x: int) -> int:
+    """
+    Return the position i of a sorted array so that arr[i] <= x < arr[i+1]
+    Args:
+        arr (`List[int]`): non-empty sorted list of integers
+        x (`int`): query
+    Returns:
+        `int`: the position i so that arr[i] <= x < arr[i+1]
+    Raises:
+        `IndexError`: if the array is empty or if the query is outside the array values
+    """
+    i, j = 0, len(arr) - 1
+    while i < j and arr[i] <= x < arr[j]:
+        k = i + ((j - i) * (x - arr[i]) // (arr[j] - arr[i]))
+        if arr[k] <= x < arr[k + 1]:
+            return k
+        elif arr[k] < x:
+            i, j = k + 1, j
+        else:
+            i, j = i, k
+    raise IndexError(f"Invalid query '{x}' for size {arr[-1] if len(arr) else 'none'}.")
class IndexedTableMixin:
    """Mixin keeping the non-empty record batches of a table together with their cumulative row offsets."""

    def __init__(self, table: pa.Table):
        self._schema: pa.Schema = table.schema
        # Empty batches are dropped from the index.
        non_empty_batches = [batch for batch in table.to_batches() if len(batch) > 0]
        self._batches: list[pa.RecordBatch] = non_empty_batches
        # _offsets[k] is the index of the first row of batch k; the last entry is the total row count.
        batch_lengths = [len(batch) for batch in non_empty_batches]
        self._offsets: np.ndarray = np.cumsum([0] + batch_lengths, dtype=np.int64)

    def fast_gather(self, indices: Union[list[int], np.ndarray]) -> pa.Table:
        """
        Create a pa.Table by gathering the records at the specified indices. Should be faster
        than pa.concat_tables(table.fast_slice(int(i) % table.num_rows, 1) for i in indices) since NumPy can compute
        the binary searches in parallel, highly optimized C

        Raises:
            `ValueError`: if `indices` is empty
        """
        if not len(indices):
            raise ValueError("Indices must be non-empty")
        # For each requested row, the index of the batch that contains it.
        batch_indices = np.searchsorted(self._offsets, indices, side="right") - 1
        single_row_batches = []
        for batch_idx, row_idx in zip(batch_indices, indices):
            position_in_batch = row_idx - self._offsets[batch_idx]
            single_row_batches.append(self._batches[batch_idx].slice(position_in_batch, 1))
        return pa.Table.from_batches(single_row_batches, schema=self._schema)

    def fast_slice(self, offset=0, length=None) -> pa.Table:
        """
        Slice the Table using interpolation search.
        The behavior is the same as `pyarrow.Table.slice` but it's significantly faster.

        Interpolation search is used to find the start and end indexes of the batches we want to keep.
        The batches to keep are then concatenated to form the sliced Table.

        Raises:
            `IndexError`: if `offset` is negative
        """
        if offset < 0:
            raise IndexError("Offset must be non-negative")
        total_rows = self._offsets[-1]
        if offset >= total_rows or (length is not None and length <= 0):
            return pa.Table.from_batches([], schema=self._schema)
        first = _interpolation_search(self._offsets, offset)
        if length is None or length + offset >= total_rows:
            # The slice runs until the end of the table: keep every batch from `first` on.
            kept = self._batches[first:]
        else:
            last = _interpolation_search(self._offsets, offset + length - 1)
            kept = self._batches[first : last + 1]
            # Trim the rows of the last batch that fall after the end of the slice.
            kept[-1] = kept[-1].slice(0, offset + length - self._offsets[last])
        # Trim the rows of the first batch that fall before the start of the slice.
        kept[0] = kept[0].slice(offset - self._offsets[first])
        return pa.Table.from_batches(kept, schema=self._schema)
+class Table(IndexedTableMixin):
+    """
+    Wraps a pyarrow Table by using composition.
+    This is the base class for `InMemoryTable`, `MemoryMappedTable` and `ConcatenationTable`.
+    It implements all the basic attributes/methods of the pyarrow Table class except
+    the Table transforms: `slice, filter, flatten, combine_chunks, cast, add_column,
+    append_column, remove_column, set_column, rename_columns` and `drop`.
+    The implementation of these methods differs for the subclasses.
+    """
+    def __init__(self, table: pa.Table):
+        super().__init__(table)
+        self.table = table
+    def __deepcopy__(self, memo: dict):
+        # arrow tables are immutable, so there's no need to copy self.table
+        # moreover calling deepcopy on a pyarrow table seems to make pa.total_allocated_bytes() decrease for some reason
+        # by adding it to the memo, self.table won't be copied
+        memo[id(self.table)] = self.table
+        # same for the recordbatches used by the index
+        memo[id(self._batches)] = list(self._batches)
+        return _deepcopy(self, memo)
+    def validate(self, *args, **kwargs):
+        """
+        Perform validation checks.  An exception is raised if validation fails.
+        By default only cheap validation checks are run.  Pass `full=True`
+        for thorough validation checks (potentially `O(n)`).
+        Args:
+            full (`bool`, defaults to `False`):
+                If `True`, run expensive checks, otherwise cheap checks only.
+        Raises:
+            `pa.lib.ArrowInvalid`: if validation fails
+        """
+        return self.table.validate(*args, **kwargs)
+    def equals(self, *args, **kwargs):
+        """
+        Check if contents of two tables are equal.
+        Args:
+            other ([`~datasets.table.Table`]):
+                Table to compare against.
+            check_metadata `bool`, defaults to `False`):
+                Whether schema metadata equality should be checked as well.
+        Returns:
+            `bool`
+        """
+        args = tuple(arg.table if isinstance(arg, Table) else arg for arg in args)
+        kwargs = {k: v.table if isinstance(v, Table) else v for k, v in kwargs}
+        return self.table.equals(*args, **kwargs)
+    def to_batches(self, *args, **kwargs):
+        """
+        Convert Table to list of (contiguous) `RecordBatch` objects.
+        Args:
+            max_chunksize (`int`, defaults to `None`):
+                Maximum size for `RecordBatch` chunks. Individual chunks may be
+                smaller depending on the chunk layout of individual columns.
+        Returns:
+            `List[pyarrow.RecordBatch]`
+        """
+        return self.table.to_batches(*args, **kwargs)
+    def to_pydict(self, *args, **kwargs):
+        """
+        Convert the Table to a `dict` or `OrderedDict`.
+        Returns:
+            `dict`
+        """
+        return self.table.to_pydict(*args, **kwargs)
+    def to_pylist(self, *args, **kwargs):
+        """
+        Convert the Table to a list
+        Returns:
+            `list`
+        """
+        return self.table.to_pylist(*args, **kwargs)
+    def to_pandas(self, *args, **kwargs):
+        """
+        Convert to a pandas-compatible NumPy array or DataFrame, as appropriate.
+        Args:
+            memory_pool (`MemoryPool`, defaults to `None`):
+                Arrow MemoryPool to use for allocations. Uses the default memory
+                pool is not passed.
+            strings_to_categorical (`bool`, defaults to `False`):
+                Encode string (UTF8) and binary types to `pandas.Categorical`.
+            categories (`list`, defaults to `empty`):
+                List of fields that should be returned as `pandas.Categorical`. Only
+                applies to table-like data structures.
+            zero_copy_only (`bool`, defaults to `False`):
+                Raise an `ArrowException` if this function call would require copying
+                the underlying data.
+            integer_object_nulls (`bool`, defaults to `False`):
+                Cast integers with nulls to objects.
+            date_as_object (`bool`, defaults to `True`):
+                Cast dates to objects. If `False`, convert to `datetime64[ns]` dtype.
+            timestamp_as_object (`bool`, defaults to `False`):
+                Cast non-nanosecond timestamps (`np.datetime64`) to objects. This is
+                useful if you have timestamps that don't fit in the normal date
+                range of nanosecond timestamps (1678 CE-2262 CE).
+                If `False`, all timestamps are converted to `datetime64[ns]` dtype.
+            use_threads (`bool`, defaults to `True`):
+                Whether to parallelize the conversion using multiple threads.
+            deduplicate_objects (`bool`, defaults to `False`):
+                Do not create multiple copies Python objects when created, to save
+                on memory use. Conversion will be slower.
+            ignore_metadata (`bool`, defaults to `False`):
+                If `True`, do not use the 'pandas' metadata to reconstruct the
+                DataFrame index, if present.
+            safe (`bool`, defaults to `True`):
+                For certain data types, a cast is needed in order to store the
+                data in a pandas DataFrame or Series (e.g. timestamps are always
+                stored as nanoseconds in pandas). This option controls whether it
+                is a safe cast or not.
+            split_blocks (`bool`, defaults to `False`):
+                If `True`, generate one internal "block" for each column when
+                creating a pandas.DataFrame from a `RecordBatch` or `Table`. While this
+                can temporarily reduce memory note that various pandas operations
+                can trigger "consolidation" which may balloon memory use.
+            self_destruct (`bool`, defaults to `False`):
+                EXPERIMENTAL: If `True`, attempt to deallocate the originating Arrow
+                memory while converting the Arrow object to pandas. If you use the
+                object after calling `to_pandas` with this option it will crash your
+                program.
+            types_mapper (`function`, defaults to `None`):
+                A function mapping a pyarrow DataType to a pandas `ExtensionDtype`.
+                This can be used to override the default pandas type for conversion
+                of built-in pyarrow types or in absence of `pandas_metadata` in the
+                Table schema. The function receives a pyarrow DataType and is
+                expected to return a pandas `ExtensionDtype` or `None` if the
+                default conversion should be used for that type. If you have
+                a dictionary mapping, you can pass `dict.get` as function.
+        Returns:
+            `pandas.Series` or `pandas.DataFrame`: `pandas.Series` or `pandas.DataFrame` depending on type of object
+        """
+        return self.table.to_pandas(*args, **kwargs)
+    def to_string(self, *args, **kwargs):
+        return self.table.to_string(*args, **kwargs)
+    def to_reader(self, max_chunksize: Optional[int] = None):
+        """
+        Convert the Table to a RecordBatchReader.
+        Note that this method is zero-copy, it merely exposes the same data under a different API.
+        Args:
+            max_chunksize (`int`, defaults to `None`)
+                Maximum size for RecordBatch chunks. Individual chunks may be smaller depending
+                on the chunk layout of individual columns.
+        Returns:
+            `pyarrow.RecordBatchReader`
+        """
+        return self.table.to_reader(max_chunksize=max_chunksize)
+    def field(self, *args, **kwargs):
+        """
+        Select a schema field by its column name or numeric index.
+        Args:
+            i (`Union[int, str]`):
+                The index or name of the field to retrieve.
+        Returns:
+            `pyarrow.Field`
+        """
+        return self.table.field(*args, **kwargs)
+    def column(self, *args, **kwargs):
+        """
+        Select a column by its column name, or numeric index.
+        Args:
+            i (`Union[int, str]`):
+                The index or name of the column to retrieve.
+        Returns:
+            `pyarrow.ChunkedArray`
+        """
+        return self.table.column(*args, **kwargs)
+    def itercolumns(self, *args, **kwargs):
+        """
+        Iterator over all columns in their numerical order.
+        Yields:
+            `pyarrow.ChunkedArray`
+        """
+        return self.table.itercolumns(*args, **kwargs)
+    @property
+    def schema(self):
+        """
+        Schema of the table and its columns.
+        Returns:
+            `pyarrow.Schema`
+        """
+        return self.table.schema
+    @property
+    def columns(self):
+        """
+        List of all columns in numerical order.
+        Returns:
+            `List[pa.ChunkedArray]`
+        """
+        return self.table.columns
+    @property
+    def num_columns(self):
+        """
+        Number of columns in this table.
+        Returns:
+            int
+        """
+        return self.table.num_columns
+    @property
+    def num_rows(self):
+        """
+        Number of rows in this table.
+        Due to the definition of a table, all columns have the same number of
+        rows.
+        Returns:
+            int
+        """
+        return self.table.num_rows
+    @property
+    def shape(self):
+        """
+        Dimensions of the table: (#rows, #columns).
+        Returns:
+            `(int, int)`: Number of rows and number of columns.
+        """
+        return self.table.shape
+    @property
+    def nbytes(self):
+        """
+        Total number of bytes consumed by the elements of the table.
+        """
+        return self.table.nbytes
+    @property
+    def column_names(self):
+        """
+        Names of the table's columns.
+        """
+        return self.table.column_names
+    def __eq__(self, other):
+        return self.equals(other)
+    def __getitem__(self, i):
+        return self.table[i]
+    def __len__(self):
+        return len(self.table)
+    def __repr__(self):
+        return self.table.__repr__().replace("pyarrow.Table", self.__class__.__name__)
+    def __str__(self):
+        return self.table.__str__().replace("pyarrow.Table", self.__class__.__name__)
+    def slice(self, *args, **kwargs):
+        """
+        Compute zero-copy slice of this Table.
+        Args:
+            offset (`int`, defaults to `0`):
+                Offset from start of table to slice.
+            length (`int`, defaults to `None`):
+                Length of slice (default is until end of table starting from
+                offset).
+        Returns:
+            `datasets.table.Table`
+        """
+        raise NotImplementedError()
+    def filter(self, *args, **kwargs):
+        """
+        Select records from a Table. See `pyarrow.compute.filter` for full usage.
+        """
+        raise NotImplementedError()
+    def flatten(self, *args, **kwargs):
+        """
+        Flatten this Table.  Each column with a struct type is flattened
+        into one column per struct field.  Other columns are left unchanged.
+        Args:
+            memory_pool (`MemoryPool`, defaults to `None`):
+                For memory allocations, if required, otherwise use default pool.
+        Returns:
+            `datasets.table.Table`
+        """
+        raise NotImplementedError()
+    def combine_chunks(self, *args, **kwargs):
+        """
+        Make a new table by combining the chunks this table has.
+        All the underlying chunks in the `ChunkedArray` of each column are
+        concatenated into zero or one chunk.
+        Args:
+            memory_pool (`MemoryPool`, defaults to `None`):
+                For memory allocations, if required, otherwise use default pool.
+        Returns:
+            `datasets.table.Table`
+        """
+        raise NotImplementedError()
+    def cast(self, *args, **kwargs):
+        """
+        Cast table values to another schema.
+        Args:
+            target_schema (`Schema`):
+                Schema to cast to, the names and order of fields must match.
+            safe (`bool`, defaults to `True`):
+                Check for overflows or other unsafe conversions.
+        Returns:
+            `datasets.table.Table`
+        """
+        raise NotImplementedError()
+    def replace_schema_metadata(self, *args, **kwargs):
+        """
+        EXPERIMENTAL: Create shallow copy of table by replacing schema
+        key-value metadata with the indicated new metadata (which may be None,
+        which deletes any existing metadata
+        Args:
+            metadata (`dict`, defaults to `None`):
+        Returns:
+            `datasets.table.Table`: shallow_copy
+        """
+        raise NotImplementedError()
+    def add_column(self, *args, **kwargs):
+        """
+        Add column to Table at position.
+        A new table is returned with the column added, the original table
+        object is left unchanged.
+        Args:
+            i (`int`):
+                Index to place the column at.
+            field_ (`Union[str, pyarrow.Field]`):
+                If a string is passed then the type is deduced from the column
+                data.
+            column (`Union[pyarrow.Array, List[pyarrow.Array]]`):
+                Column data.
+        Returns:
+            `datasets.table.Table`: New table with the passed column added.
+        """
+        raise NotImplementedError()
+    def append_column(self, *args, **kwargs):
+        """
+        Append column at end of columns.
+        Args:
+            field_ (`Union[str, pyarrow.Field]`):
+                If a string is passed then the type is deduced from the column
+                data.
+            column (`Union[pyarrow.Array, List[pyarrow.Array]]`):
+                Column data.
+        Returns:
+            `datasets.table.Table`:  New table with the passed column added.
+        """
+        raise NotImplementedError()
+    def remove_column(self, *args, **kwargs):
+        """
+        Create new Table with the indicated column removed.
+        Args:
+            i (`int`):
+                Index of column to remove.
+        Returns:
+            `datasets.table.Table`: New table without the column.
+        """
+        raise NotImplementedError()
+    def set_column(self, *args, **kwargs):
+        """
+        Replace column in Table at position.
+        Args:
+            i (`int`):
+                Index to place the column at.
+            field_ (`Union[str, pyarrow.Field]`):
+                If a string is passed then the type is deduced from the column
+                data.
+            column (`Union[pyarrow.Array, List[pyarrow.Array]]`):
+                Column data.
+        Returns:
+            `datasets.table.Table`: New table with the passed column set.
+        """
+        raise NotImplementedError()
+    def rename_columns(self, *args, **kwargs):
+        """
+        Create new table with columns renamed to provided names.
+        This base implementation is a placeholder: it ignores its arguments and
+        unconditionally raises `NotImplementedError`. The concrete table classes of this
+        file (e.g. `InMemoryTable`, `MemoryMappedTable`) define their own `rename_columns`.
+        Raises:
+            `NotImplementedError`: always, in this base implementation.
+        """
+        raise NotImplementedError()
+    def drop(self, *args, **kwargs):
+        """
+        Drop one or more columns and return a new table.
+        This base implementation is a placeholder: it ignores its arguments and
+        unconditionally raises `NotImplementedError`. The concrete table classes of this
+        file (e.g. `InMemoryTable`, `MemoryMappedTable`) define their own `drop`.
+        Args:
+            columns (`List[str]`):
+                List of field names referencing existing columns.
+        Raises:
+            `KeyError` : if any of the passed column names does not exist
+                (documented contract of the concrete implementations).
+            `NotImplementedError`: always, in this base implementation.
+        Returns:
+            `datasets.table.Table`: New table without the columns.
+        """
+        raise NotImplementedError()
+    def select(self, *args, **kwargs):
+        """
+        Select columns of the table.
+        Returns a new table with the specified columns, and metadata preserved.
+        This base implementation is a placeholder: it ignores its arguments and
+        unconditionally raises `NotImplementedError`. The concrete table classes of this
+        file (e.g. `InMemoryTable`, `MemoryMappedTable`) define their own `select`.
+        Args:
+            columns (:obj:`Union[List[str], List[int]]`):
+                The column names or integer indices to select.
+        Returns:
+            `datasets.table.Table`: table with only a subset of the columns
+        Raises:
+            `NotImplementedError`: always, in this base implementation.
+        """
+        raise NotImplementedError()
+class TableBlock(Table):
+    """
+    `TableBlock` is the allowed class inside a `ConcanetationTable`.
+    Only `MemoryMappedTable` and `InMemoryTable` are `TableBlock`.
+    This is because we don't want a `ConcanetationTable` made out of other `ConcanetationTables`.
+    """
+    pass
+class InMemoryTable(TableBlock):
+    """
+    The table is said in-memory when it is loaded into the user's RAM.
+    Pickling it does copy all the data using memory.
+    Its implementation is simple and uses the underlying pyarrow Table methods directly.
+    This is different from the `MemoryMapped` table, for which pickling doesn't copy all the
+    data in memory. For a `MemoryMapped`, unpickling instead reloads the table from the disk.
+    `InMemoryTable` must be used when data fit in memory, while `MemoryMapped` are reserved for
+    data bigger than memory or when you want the memory footprint of your application to
+    stay low.
+    """
+    @classmethod
+    def from_file(cls, filename: str):
+        table = _in_memory_arrow_table_from_file(filename)
+        return cls(table)
+    @classmethod
+    def from_buffer(cls, buffer: pa.Buffer):
+        table = _in_memory_arrow_table_from_buffer(buffer)
+        return cls(table)
+    @classmethod
+    def from_pandas(cls, *args, **kwargs):
+        """
+        Convert pandas.DataFrame to an Arrow Table.
+        The column types in the resulting Arrow Table are inferred from the
+        dtypes of the pandas.Series in the DataFrame. In the case of non-object
+        Series, the NumPy dtype is translated to its Arrow equivalent. In the
+        case of `object`, we need to guess the datatype by looking at the
+        Python objects in this Series.
+        Be aware that Series of the `object` dtype don't carry enough
+        information to always lead to a meaningful Arrow type. In the case that
+        we cannot infer a type, e.g. because the DataFrame is of length 0 or
+        the Series only contains `None/nan` objects, the type is set to
+        null. This behavior can be avoided by constructing an explicit schema
+        and passing it to this function.
+        Args:
+            df (`pandas.DataFrame`):
+            schema (`pyarrow.Schema`, *optional*):
+                The expected schema of the Arrow Table. This can be used to
+                indicate the type of columns if we cannot infer it automatically.
+                If passed, the output will have exactly this schema. Columns
+                specified in the schema that are not found in the DataFrame columns
+                or its index will raise an error. Additional columns or index
+                levels in the DataFrame which are not specified in the schema will
+                be ignored.
+            preserve_index (`bool`, *optional*):
+                Whether to store the index as an additional column in the resulting
+                `Table`. The default of None will store the index as a column,
+                except for RangeIndex which is stored as metadata only. Use
+                `preserve_index=True` to force it to be stored as a column.
+            nthreads (`int`, defaults to `None` (may use up to system CPU count threads))
+                If greater than 1, convert columns to Arrow in parallel using
+                indicated number of threads.
+            columns (`List[str]`, *optional*):
+               List of column to be converted. If `None`, use all columns.
+            safe (`bool`, defaults to `True`):
+               Check for overflows or other unsafe conversions,
+        Returns:
+            `datasets.table.Table`:
+        Examples:
+        ```python
+        >>> import pandas as pd
+        >>> import pyarrow as pa
+        >>> df = pd.DataFrame({
+            ...     'int': [1, 2],
+            ...     'str': ['a', 'b']
+            ... })
+        >>> pa.Table.from_pandas(df)
+        <pyarrow.lib.Table object at 0x7f05d1fb1b40>
+        ```
+        """
+        return cls(pa.Table.from_pandas(*args, **kwargs))
+    @classmethod
+    def from_arrays(cls, *args, **kwargs):
+        """
+        Construct a Table from Arrow arrays.
+        Args:
+            arrays (`List[Union[pyarrow.Array, pyarrow.ChunkedArray]]`):
+                Equal-length arrays that should form the table.
+            names (`List[str]`, *optional*):
+                Names for the table columns. If not passed, schema must be passed.
+            schema (`Schema`, defaults to `None`):
+                Schema for the created table. If not passed, names must be passed.
+            metadata (`Union[dict, Mapping]`, defaults to `None`):
+                Optional metadata for the schema (if inferred).
+        Returns:
+            `datasets.table.Table`
+        """
+        return cls(pa.Table.from_arrays(*args, **kwargs))
+    @classmethod
+    def from_pydict(cls, *args, **kwargs):
+        """
+        Construct a Table from Arrow arrays or columns.
+        Args:
+            mapping (`Union[dict, Mapping]`):
+                A mapping of strings to Arrays or Python lists.
+            schema (`Schema`, defaults to `None`):
+                If not passed, will be inferred from the Mapping values
+            metadata (`Union[dict, Mapping]`, defaults to `None`):
+                Optional metadata for the schema (if inferred).
+        Returns:
+            `datasets.table.Table`
+        """
+        return cls(pa.Table.from_pydict(*args, **kwargs))
+    @classmethod
+    def from_pylist(cls, mapping, *args, **kwargs):
+        """
+        Construct a Table from list of rows / dictionaries.
+        Args:
+            mapping (`List[dict]`):
+                A mapping of strings to row values.
+            schema (`Schema`, defaults to `None`):
+                If not passed, will be inferred from the Mapping values
+            metadata (`Union[dict, Mapping]`, defaults to `None`):
+                Optional metadata for the schema (if inferred).
+        Returns:
+            `datasets.table.Table`
+        """
+        return cls(pa.Table.from_pylist(mapping, *args, **kwargs))
+    @classmethod
+    def from_batches(cls, *args, **kwargs):
+        """
+        Construct a Table from a sequence or iterator of Arrow `RecordBatches`.
+        Args:
+            batches (`Union[Sequence[pyarrow.RecordBatch], Iterator[pyarrow.RecordBatch]]`):
+                Sequence of `RecordBatch` to be converted, all schemas must be equal.
+            schema (`Schema`, defaults to `None`):
+                If not passed, will be inferred from the first `RecordBatch`.
+        Returns:
+            `datasets.table.Table`:
+        """
+        return cls(pa.Table.from_batches(*args, **kwargs))
+    def slice(self, offset=0, length=None):
+        """
+        Compute zero-copy slice of this Table.
+        Args:
+            offset (`int`, defaults to `0`):
+                Offset from start of table to slice.
+            length (`int`, defaults to `None`):
+                Length of slice (default is until end of table starting from
+                offset).
+        Returns:
+            `datasets.table.Table`
+        """
+        # Use fast slicing here
+        return InMemoryTable(self.fast_slice(offset=offset, length=length))
+    def filter(self, *args, **kwargs):
+        """
+        Select records from a Table. See `pyarrow.compute.filter` for full usage.
+        """
+        return InMemoryTable(self.table.filter(*args, **kwargs))
+    def flatten(self, *args, **kwargs):
+        """
+        Flatten this Table.  Each column with a struct type is flattened
+        into one column per struct field.  Other columns are left unchanged.
+        Args:
+            memory_pool (`MemoryPool`, defaults to `None`):
+                For memory allocations, if required, otherwise use default pool.
+        Returns:
+            `datasets.table.Table`
+        """
+        return InMemoryTable(table_flatten(self.table, *args, **kwargs))
+    def combine_chunks(self, *args, **kwargs):
+        """
+        Make a new table by combining the chunks this table has.
+        All the underlying chunks in the `ChunkedArray` of each column are
+        concatenated into zero or one chunk.
+        Args:
+            memory_pool (`MemoryPool`, defaults to `None`):
+                For memory allocations, if required, otherwise use default pool.
+        Returns:
+            `datasets.table.Table`
+        """
+        return InMemoryTable(self.table.combine_chunks(*args, **kwargs))
+    def cast(self, *args, **kwargs):
+        """
+        Cast table values to another schema.
+        Args:
+            target_schema (`Schema`):
+                Schema to cast to, the names and order of fields must match.
+            safe (`bool`, defaults to `True`):
+                Check for overflows or other unsafe conversions.
+        Returns:
+            `datasets.table.Table`
+        """
+        return InMemoryTable(table_cast(self.table, *args, **kwargs))
+    def replace_schema_metadata(self, *args, **kwargs):
+        """
+        EXPERIMENTAL: Create shallow copy of table by replacing schema
+        key-value metadata with the indicated new metadata (which may be `None`,
+        which deletes any existing metadata).
+        Args:
+            metadata (`dict`, defaults to `None`):
+        Returns:
+            `datasets.table.Table`: shallow_copy
+        """
+        return InMemoryTable(self.table.replace_schema_metadata(*args, **kwargs))
+    def add_column(self, *args, **kwargs):
+        """
+        Add column to Table at position.
+        A new table is returned with the column added, the original table
+        object is left unchanged.
+        Args:
+            i (`int`):
+                Index to place the column at.
+            field_ (`Union[str, pyarrow.Field]`):
+                If a string is passed then the type is deduced from the column
+                data.
+            column (`Union[pyarrow.Array, List[pyarrow.Array]]`):
+                Column data.
+        Returns:
+            `datasets.table.Table`: New table with the passed column added.
+        """
+        return InMemoryTable(self.table.add_column(*args, **kwargs))
+    def append_column(self, *args, **kwargs):
+        """
+        Append column at end of columns.
+        Args:
+            field_ (`Union[str, pyarrow.Field]`):
+                If a string is passed then the type is deduced from the column
+                data.
+            column (`Union[pyarrow.Array, List[pyarrow.Array]]`):
+                Column data.
+        Returns:
+            `datasets.table.Table`:
+                New table with the passed column added.
+        """
+        return InMemoryTable(self.table.append_column(*args, **kwargs))
+    def remove_column(self, *args, **kwargs):
+        """
+        Create new Table with the indicated column removed.
+        Args:
+            i (`int`):
+                Index of column to remove.
+        Returns:
+            `datasets.table.Table`:
+                New table without the column.
+        """
+        return InMemoryTable(self.table.remove_column(*args, **kwargs))
+    def set_column(self, *args, **kwargs):
+        """
+        Replace column in Table at position.
+        Args:
+            i (`int`):
+                Index to place the column at.
+            field_ (`Union[str, pyarrow.Field]`):
+                If a string is passed then the type is deduced from the column
+                data.
+            column (`Union[pyarrow.Array, List[pyarrow.Array]]`):
+                Column data.
+        Returns:
+            `datasets.table.Table`:
+                New table with the passed column set.
+        """
+        return InMemoryTable(self.table.set_column(*args, **kwargs))
+    def rename_columns(self, *args, **kwargs):
+        """
+        Create new table with columns renamed to provided names.
+        """
+        return InMemoryTable(self.table.rename_columns(*args, **kwargs))
+    def drop(self, *args, **kwargs):
+        """
+        Drop one or more columns and return a new table.
+        Args:
+            columns (`List[str]`):
+                List of field names referencing existing columns.
+        Raises:
+            `KeyError` : if any of the passed columns name are not existing.
+        Returns:
+            `datasets.table.Table`:
+                New table without the columns.
+        """
+        return InMemoryTable(self.table.drop(*args, **kwargs))
+    def select(self, *args, **kwargs):
+        """
+        Select columns of the table.
+        Returns a new table with the specified columns, and metadata preserved.
+        Args:
+            columns (:obj:`Union[List[str], List[int]]`):
+                The column names or integer indices to select.
+        Returns:
+            :class:`datasets.table.Table`: New table with the specified columns, and metadata preserved.
+        """
+        return InMemoryTable(self.table.select(*args, **kwargs))
+# The MemoryMappedTable needs replays to properly reload tables from the disk.
+# A replay is a `(method_name, args, kwargs)` tuple describing one transform to re-apply
+# to the pyarrow Table once it has been memory mapped again.
+Replay = tuple[str, tuple, dict]
+class MemoryMappedTable(TableBlock):
+    """
+    The table is said memory mapped when it doesn't use the user's RAM but loads the data
+    from the disk instead.
+    Pickling it doesn't copy the data into memory.
+    Instead, only the path to the memory mapped arrow file is pickled, as well as the list
+    of transforms to "replay" when reloading the table from the disk.
+    Its implementation requires to store an history of all the transforms that were applied
+    to the underlying pyarrow Table, so that they can be "replayed" when reloading the Table
+    from the disk.
+    This is different from the `InMemoryTable` table, for which pickling does copy all the
+    data in memory.
+    `InMemoryTable` must be used when data fit in memory, while `MemoryMapped` are reserved for
+    data bigger than memory or when you want the memory footprint of your application to
+    stay low.
+    """
+    def __init__(self, table: pa.Table, path: str, replays: Optional[list[Replay]] = None):
+        super().__init__(table)
+        self.path = os.path.abspath(path)
+        self.replays: list[Replay] = replays if replays is not None else []
+    @classmethod
+    def from_file(cls, filename: str, replays=None):
+        table = _memory_mapped_arrow_table_from_file(filename)
+        table = cls._apply_replays(table, replays)
+        return cls(table, filename, replays)
+    def __getstate__(self):
+        return {"path": self.path, "replays": self.replays}
+    def __setstate__(self, state):
+        path = state["path"]
+        replays = state["replays"]
+        table = _memory_mapped_arrow_table_from_file(path)
+        table = self._apply_replays(table, replays)
+        MemoryMappedTable.__init__(self, table, path=path, replays=replays)
+    @staticmethod
+    def _apply_replays(table: pa.Table, replays: Optional[list[Replay]] = None) -> pa.Table:
+        if replays is not None:
+            for name, args, kwargs in replays:
+                if name == "cast":
+                    table = table_cast(table, *args, **kwargs)
+                elif name == "flatten":
+                    table = table_flatten(table, *args, **kwargs)
+                else:
+                    table = getattr(table, name)(*args, **kwargs)
+        return table
+    def _append_replay(self, replay: Replay) -> list[Replay]:
+        replays = copy.deepcopy(self.replays)
+        replays.append(replay)
+        return replays
+    def slice(self, offset=0, length=None):
+        """
+        Compute zero-copy slice of this Table.
+        Args:
+            offset (`int`, defaults to `0`):
+                Offset from start of table to slice.
+            length (`int`, defaults to `None`):
+                Length of slice (default is until end of table starting from
+                offset).
+        Returns:
+            `datasets.table.Table`
+        """
+        replay = ("slice", (offset, length), {})
+        replays = self._append_replay(replay)
+        # Use fast slicing here
+        return MemoryMappedTable(self.fast_slice(offset=offset, length=length), self.path, replays)
+    def filter(self, *args, **kwargs):
+        """
+        Select records from a Table. See `pyarrow.compute.filter` for full usage.
+        """
+        replay = ("filter", copy.deepcopy(args), copy.deepcopy(kwargs))
+        replays = self._append_replay(replay)
+        return MemoryMappedTable(self.table.filter(*args, **kwargs), self.path, replays)
+    def flatten(self, *args, **kwargs):
+        """
+        Flatten this Table.  Each column with a struct type is flattened
+        into one column per struct field.  Other columns are left unchanged.
+        Args:
+            memory_pool (`MemoryPool`, defaults to `None`):
+                For memory allocations, if required, otherwise use default pool.
+        Returns:
+            `datasets.table.Table`
+        """
+        replay = ("flatten", copy.deepcopy(args), copy.deepcopy(kwargs))
+        replays = self._append_replay(replay)
+        return MemoryMappedTable(table_flatten(self.table, *args, **kwargs), self.path, replays)
+    def combine_chunks(self, *args, **kwargs):
+        """
+        Make a new table by combining the chunks this table has.
+        All the underlying chunks in the ChunkedArray of each column are
+        concatenated into zero or one chunk.
+        Args:
+            memory_pool (`MemoryPool`, defaults to `None`):
+                For memory allocations, if required, otherwise use default pool.
+        Returns:
+            `datasets.table.Table`
+        """
+        replay = ("combine_chunks", copy.deepcopy(args), copy.deepcopy(kwargs))
+        replays = self._append_replay(replay)
+        return MemoryMappedTable(self.table.combine_chunks(*args, **kwargs), self.path, replays)
+    def cast(self, *args, **kwargs):
+        """
+        Cast table values to another schema
+        Args:
+            target_schema (`Schema`):
+                Schema to cast to, the names and order of fields must match.
+            safe (`bool`, defaults to `True`):
+                Check for overflows or other unsafe conversions.
+        Returns:
+            `datasets.table.Table`
+        """
+        replay = ("cast", copy.deepcopy(args), copy.deepcopy(kwargs))
+        replays = self._append_replay(replay)
+        return MemoryMappedTable(table_cast(self.table, *args, **kwargs), self.path, replays)
+    def replace_schema_metadata(self, *args, **kwargs):
+        """
+        EXPERIMENTAL: Create shallow copy of table by replacing schema
+        key-value metadata with the indicated new metadata (which may be None,
+        which deletes any existing metadata.
+        Args:
+            metadata (`dict`, defaults to `None`):
+        Returns:
+            `datasets.table.Table`: shallow_copy
+        """
+        replay = ("replace_schema_metadata", copy.deepcopy(args), copy.deepcopy(kwargs))
+        replays = self._append_replay(replay)
+        return MemoryMappedTable(self.table.replace_schema_metadata(*args, **kwargs), self.path, replays)
+    def add_column(self, *args, **kwargs):
+        """
+        Add column to Table at position.
+        A new table is returned with the column added, the original table
+        object is left unchanged.
+        Args:
+            i (`int`):
+                Index to place the column at.
+            field_ (`Union[str, pyarrow.Field]`):
+                If a string is passed then the type is deduced from the column
+                data.
+            column (`Union[pyarrow.Array, List[pyarrow.Array]]`):
+                Column data.
+        Returns:
+            `datasets.table.Table`: New table with the passed column added.
+        """
+        replay = ("add_column", copy.deepcopy(args), copy.deepcopy(kwargs))
+        replays = self._append_replay(replay)
+        return MemoryMappedTable(self.table.add_column(*args, **kwargs), self.path, replays)
+    def append_column(self, *args, **kwargs):
+        """
+        Append column at end of columns.
+        Args:
+            field_ (`Union[str, pyarrow.Field]`):
+                If a string is passed then the type is deduced from the column
+                data.
+            column (`Union[pyarrow.Array, List[pyarrow.Array]]`):
+                Column data.
+        Returns:
+            `datasets.table.Table`:
+                New table with the passed column added.
+        """
+        replay = ("append_column", copy.deepcopy(args), copy.deepcopy(kwargs))
+        replays = self._append_replay(replay)
+        return MemoryMappedTable(self.table.append_column(*args, **kwargs), self.path, replays)
+    def remove_column(self, *args, **kwargs):
+        """
+        Create new Table with the indicated column removed.
+        Args:
+            i (`int`):
+                Index of column to remove.
+        Returns:
+            `datasets.table.Table`:
+                New table without the column.
+        """
+        replay = ("remove_column", copy.deepcopy(args), copy.deepcopy(kwargs))
+        replays = self._append_replay(replay)
+        return MemoryMappedTable(self.table.remove_column(*args, **kwargs), self.path, replays)
+    def set_column(self, *args, **kwargs):
+        """
+        Replace column in Table at position.
+        Args:
+            i (`int`):
+                Index to place the column at.
+            field_ (`Union[str, pyarrow.Field]`):
+                If a string is passed then the type is deduced from the column
+                data.
+            column (`Union[pyarrow.Array, List[pyarrow.Array]]`):
+                Column data.
+        Returns:
+            `datasets.table.Table`:
+                New table with the passed column set.
+        """
+        replay = ("set_column", copy.deepcopy(args), copy.deepcopy(kwargs))
+        replays = self._append_replay(replay)
+        return MemoryMappedTable(self.table.set_column(*args, **kwargs), self.path, replays)
+    def rename_columns(self, *args, **kwargs):
+        """
+        Create new table with columns renamed to provided names.
+        """
+        replay = ("rename_columns", copy.deepcopy(args), copy.deepcopy(kwargs))
+        replays = self._append_replay(replay)
+        return MemoryMappedTable(self.table.rename_columns(*args, **kwargs), self.path, replays)
+    def drop(self, *args, **kwargs):
+        """
+        Drop one or more columns and return a new table.
+        Args:
+            columns (`List[str]`):
+                List of field names referencing existing columns.
+        Raises:
+            `KeyError` : if any of the passed columns name are not existing.
+        Returns:
+            `datasets.table.Table`:
+                New table without the columns.
+        """
+        replay = ("drop", copy.deepcopy(args), copy.deepcopy(kwargs))
+        replays = self._append_replay(replay)
+        return MemoryMappedTable(self.table.drop(*args, **kwargs), self.path, replays)
+    def select(self, *args, **kwargs):
+        """
+        Select columns of the table.
+        Returns a new table with the specified columns, and metadata preserved.
+        Args:
+            columns (:obj:`Union[List[str], List[int]]`):
+                The column names or integer indices to select.
+        Returns:
+            :class:`datasets.table.Table`: New table with the specified columns, and metadata preserved.
+        """
+        replay = ("select", copy.deepcopy(args), copy.deepcopy(kwargs))
+        replays = self._append_replay(replay)
+        return MemoryMappedTable(self.table.select(*args, **kwargs), self.path, replays)
+# A ConcatenationTable is the concatenation of several tables.
+# The ``blocks`` attribute stores a list of lists of blocks.
+# The first axis concatenates the tables along the axis 0 (it appends rows),
+# while the second axis concatenates tables along the axis 1 (it appends columns).
+# ``TableBlockContainer`` is constrained to the three shapes blocks can be given in:
+# a single block, a list of blocks, or a list of lists of blocks.
+TableBlockContainer = TypeVar("TableBlockContainer", TableBlock, list[TableBlock], list[list[TableBlock]])
+class ConcatenationTable(Table):
+    """
+    The table comes from the concatenation of several tables called blocks.
+    It enables concatenation on both axis 0 (append rows) and axis 1 (append columns).
+    The underlying tables are called "blocks" and can be either `InMemoryTable`
+    or `MemoryMappedTable` objects.
+    This allows to combine tables that come from memory or that are memory mapped.
+    When a `ConcatenationTable` is pickled, then each block is pickled:
+    - the `InMemoryTable` objects are pickled by copying all the data in memory.
+    - the MemoryMappedTable objects are pickled without copying the data into memory.
+    Instead, only the path to the memory mapped arrow file is pickled, as well as the list
+    of transforms to "replays" when reloading the table from the disk.
+    Its implementation requires to store each block separately.
+    The `blocks` attributes stores a list of list of blocks.
+    The first axis concatenates the tables along the axis 0 (it appends rows),
+    while the second axis concatenates tables along the axis 1 (it appends columns).
+    If some columns are missing when concatenating on axis 0, they are filled with null values.
+    This is done using `pyarrow.concat_tables(tables, promote_options="default")`.
+    You can access the fully combined table by accessing the `ConcatenationTable.table` attribute,
+    and the blocks by accessing the `ConcatenationTable.blocks` attribute.
+    """
+    def __init__(self, table: pa.Table, blocks: list[list[TableBlock]]):
+        super().__init__(table)
+        self.blocks = blocks
+        # Check that all the blocks have the right type.
+        # Only InMemoryTable and MemoryMappedTable are allowed.
+        for subtables in blocks:
+            for subtable in subtables:
+                if not isinstance(subtable, TableBlock):
+                    raise TypeError(
+                        "The blocks of a ConcatenationTable must be InMemoryTable or MemoryMappedTable objects"
+                        f", but got {_short_str(subtable)}."
+                    )
+    def __getstate__(self):
+        return {"blocks": self.blocks, "schema": self.table.schema}
+    def __setstate__(self, state):
+        blocks = state["blocks"]
+        schema = state["schema"]
+        table = self._concat_blocks_horizontally_and_vertically(blocks)
+        if schema is not None and table.schema != schema:
+            # We fix the columns by concatenating with an empty table with the right columns
+            empty_table = pa.Table.from_batches([], schema=schema)
+            # We set promote_options="default" to fill missing columns with null values
+            table = pa.concat_tables([table, empty_table], promote_options="default")
+        ConcatenationTable.__init__(self, table, blocks=blocks)
+    @staticmethod
+    def _concat_blocks(blocks: list[Union[TableBlock, pa.Table]], axis: int = 0) -> pa.Table:
+        pa_tables = [table.table if hasattr(table, "table") else table for table in blocks]
+        if axis == 0:
+            # We set promote_options="default" to fill missing columns with null values
+            return pa.concat_tables(pa_tables, promote_options="default")
+        elif axis == 1:
+            for i, table in enumerate(pa_tables):
+                if i == 0:
+                    pa_table = table
+                else:
+                    for name, col in zip(table.column_names, table.columns):
+                        pa_table = pa_table.append_column(name, col)
+            return pa_table
+        else:
+            raise ValueError("'axis' must be either 0 or 1")
+    @classmethod
+    def _concat_blocks_horizontally_and_vertically(cls, blocks: list[list[TableBlock]]) -> pa.Table:
+        pa_tables_to_concat_vertically = []
+        for i, tables in enumerate(blocks):
+            if not tables:
+                continue
+            pa_table_horizontally_concatenated = cls._concat_blocks(tables, axis=1)
+            pa_tables_to_concat_vertically.append(pa_table_horizontally_concatenated)
+        return cls._concat_blocks(pa_tables_to_concat_vertically, axis=0)
+    @classmethod
+    def _merge_blocks(cls, blocks: TableBlockContainer, axis: Optional[int] = None) -> TableBlockContainer:
+        if axis is not None:
+            merged_blocks = []
+            for is_in_memory, block_group in groupby(blocks, key=lambda x: isinstance(x, InMemoryTable)):
+                if is_in_memory:
+                    block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
+                merged_blocks += list(block_group)
+        else:  # both
+            merged_blocks = [cls._merge_blocks(row_block, axis=1) for row_block in blocks]
+            if all(len(row_block) == 1 for row_block in merged_blocks):
+                merged_blocks = cls._merge_blocks(
+                    [block for row_block in merged_blocks for block in row_block], axis=0
+                )
+        return merged_blocks
+    @classmethod
+    def _consolidate_blocks(cls, blocks: TableBlockContainer) -> TableBlockContainer:
+        if isinstance(blocks, TableBlock):
+            return blocks
+        elif isinstance(blocks[0], TableBlock):
+            return cls._merge_blocks(blocks, axis=0)
+        else:
+            return cls._merge_blocks(blocks)
+    @classmethod
+    def from_blocks(cls, blocks: TableBlockContainer) -> "ConcatenationTable":
+        blocks = cls._consolidate_blocks(blocks)
+        if isinstance(blocks, TableBlock):
+            table = blocks
+            return cls(table.table, [[table]])
+        elif isinstance(blocks[0], TableBlock):
+            table = cls._concat_blocks(blocks, axis=0)
+            blocks = [[t] for t in blocks]
+            return cls(table, blocks)
+        else:
+            table = cls._concat_blocks_horizontally_and_vertically(blocks)
+            return cls(table, blocks)
+    @classmethod
+    def from_tables(cls, tables: list[Union[pa.Table, Table]], axis: int = 0) -> "ConcatenationTable":
+        """Create `ConcatenationTable` from list of tables.
+        Each input table is first converted to a list of row blocks (a list of lists of `TableBlock`).
+        The row blocks are then accumulated table after table along `axis`, and the accumulated blocks
+        are passed to `from_blocks`.
+        Args:
+            tables (list of `Table` or list of `pyarrow.Table`):
+                List of tables. It is indexed at position 0, so it must not be empty.
+            axis (`{0, 1}`, defaults to `0`, meaning over rows):
+                Axis to concatenate over, where `0` means over rows (vertically) and `1` means over columns
+                (horizontally).
+                <Added version="1.6.0"/>
+        Returns:
+            `ConcatenationTable`: the table returned by `from_blocks` for the accumulated blocks.
+        Raises:
+            `ValueError`: if `axis=1` and the tables don't have the same number of rows.
+        """
+        # Convert one input table to row blocks.
+        def to_blocks(table: Union[pa.Table, Table]) -> list[list[TableBlock]]:
+            if isinstance(table, pa.Table):
+                # raw pyarrow tables are wrapped in an InMemoryTable
+                return [[InMemoryTable(table)]]
+            elif isinstance(table, ConcatenationTable):
+                # The blocks are copied: the lists are extended in place below
+                # (presumably the copy is there to leave the input table's blocks untouched).
+                return copy.deepcopy(table.blocks)
+            else:
+                return [[table]]
+        # Split every table of a row block into its first `length` rows and the remaining rows.
+        def _slice_row_block(row_block: list[TableBlock], length: int) -> tuple[list[TableBlock], list[TableBlock]]:
+            sliced = [table.slice(0, length) for table in row_block]
+            remainder = [table.slice(length, len(row_block[0]) - length) for table in row_block]
+            return sliced, remainder
+        def _split_both_like(
+            result: list[list[TableBlock]], blocks: list[list[TableBlock]]
+        ) -> tuple[list[list[TableBlock]], list[list[TableBlock]]]:
+            """
+            Make sure each row_block contain the same num_rows to be able to concatenate them on axis=1.
+            To do so, we modify both blocks sets to have the same row_blocks boundaries.
+            For example, if `result` has 2 row_blocks of 3 rows and `blocks` has 3 row_blocks of 2 rows,
+            we modify both to have 4 row_blocks of size 2, 1, 1 and 2:
+                    [ x   x   x | x   x   x ]
+                +   [ y   y | y   y | y   y ]
+                -----------------------------
+                =   [ x   x | x | x | x   x ]
+                    [ y   y | y | y | y   y ]
+            """
+            # Work on shallow copies of the outer lists since row blocks are popped and replaced below.
+            result, blocks = list(result), list(blocks)
+            new_result, new_blocks = [], []
+            while result and blocks:
+                # we slice the longest row block to save two row blocks of same length
+                # and we replace the long row block by its remainder if necessary
+                if len(result[0][0]) > len(blocks[0][0]):
+                    new_blocks.append(blocks[0])
+                    sliced, result[0] = _slice_row_block(result[0], len(blocks.pop(0)[0]))
+                    new_result.append(sliced)
+                elif len(result[0][0]) < len(blocks[0][0]):
+                    new_result.append(result[0])
+                    sliced, blocks[0] = _slice_row_block(blocks[0], len(result.pop(0)[0]))
+                    new_blocks.append(sliced)
+                else:
+                    new_result.append(result.pop(0))
+                    new_blocks.append(blocks.pop(0))
+            # One side still has row blocks left while the other is exhausted: the row counts differ.
+            if result or blocks:
+                raise ValueError("Failed to concatenate on axis=1 because tables don't have the same number of rows")
+            return new_result, new_blocks
+        # Add the row blocks of one more table to the accumulated `result`.
+        # For any `axis` other than 0 or 1, `result` is returned unchanged.
+        def _extend_blocks(
+            result: list[list[TableBlock]], blocks: list[list[TableBlock]], axis: int = 0
+        ) -> list[list[TableBlock]]:
+            if axis == 0:
+                # vertical concatenation: the new row blocks are appended (in place) after the existing ones
+                result.extend(blocks)
+            elif axis == 1:
+                # We make sure each row_block have the same num_rows
+                result, blocks = _split_both_like(result, blocks)
+                # horizontal concatenation: each row block receives the tables of the aligned new row block
+                for i, row_block in enumerate(blocks):
+                    result[i].extend(row_block)
+            return result
+        blocks = to_blocks(tables[0])
+        for table in tables[1:]:
+            table_blocks = to_blocks(table)
+            blocks = _extend_blocks(blocks, table_blocks, axis=axis)
+        return cls.from_blocks(blocks)
+    @property
+    def _slices(self):
+        offset = 0
+        for tables in self.blocks:
+            length = len(tables[0])
+            yield (offset, length)
+            offset += length
+    def slice(self, offset=0, length=None):
+        """
+        Compute zero-copy slice of this Table.
+        The underlying table is sliced directly. The row blocks are then walked in order, and only the
+        (possibly sliced) row blocks that overlap the requested rows are kept.
+        Args:
+            offset (`int`, defaults to `0`):
+                Offset from start of table to slice.
+            length (`int`, defaults to `None`):
+                Length of slice (default is until end of table starting from
+                offset).
+        Returns:
+            `datasets.table.Table`
+        """
+        table = self.table.slice(offset, length=length)
+        # From here on, `length` is the number of rows still to collect from the blocks
+        # and `offset` is relative to the start of the current row block.
+        length = length if length is not None else self.num_rows - offset
+        blocks = []
+        for tables in self.blocks:
+            n_rows = len(tables[0])
+            if length == 0:
+                # nothing left to collect
+                break
+            elif n_rows <= offset:
+                # this row block ends before the slice starts: skip it
+                offset = offset - n_rows
+            elif n_rows <= offset + length:
+                # the slice reaches the end of this row block: keep everything from `offset`
+                blocks.append([t.slice(offset) for t in tables])
+                length, offset = length + offset - n_rows, 0
+            else:
+                # the slice ends inside this row block: keep `length` rows from `offset` and stop collecting
+                blocks.append([t.slice(offset, length) for t in tables])
+                length, offset = 0, 0
+        return ConcatenationTable(table, blocks)
+    def filter(self, mask, *args, **kwargs):
+        """
+        Select records from a Table. See `pyarrow.compute.filter` for full usage.
+        """
+        table = self.table.filter(mask, *args, **kwargs)
+        blocks = []
+        for (offset, length), tables in zip(self._slices, self.blocks):
+            submask = mask.slice(offset, length)
+            blocks.append([t.filter(submask, *args, **kwargs) for t in tables])
+        return ConcatenationTable(table, blocks)
+    def flatten(self, *args, **kwargs):
+        """
+        Flatten this Table.  Each column with a struct type is flattened
+        into one column per struct field.  Other columns are left unchanged.
+        Args:
+            memory_pool (`MemoryPool`, defaults to `None`):
+                For memory allocations, if required, otherwise use default pool.
+        Returns:
+            `datasets.table.Table`
+        """
+        table = table_flatten(self.table, *args, **kwargs)
+        blocks = []
+        for tables in self.blocks:
+            blocks.append([t.flatten(*args, **kwargs) for t in tables])
+        return ConcatenationTable(table, blocks)
+    def combine_chunks(self, *args, **kwargs):
+        """
+        Make a new table by combining the chunks this table has.
+        All the underlying chunks in the `ChunkedArray` of each column are
+        concatenated into zero or one chunk.
+        Args:
+            memory_pool (`MemoryPool`, defaults to `None`):
+                For memory allocations, if required, otherwise use default pool.
+        Returns:
+            `datasets.table.Table`
+        """
+        table = self.table.combine_chunks(*args, **kwargs)
+        blocks = []
+        for tables in self.blocks:
+            blocks.append([t.combine_chunks(*args, **kwargs) for t in tables])
+        return ConcatenationTable(table, blocks)
+    def cast(self, target_schema, *args, **kwargs):
+        """
+        Cast table values to another schema.
+        The full table is cast with `table_cast`, and each table of each row block is cast to the
+        sub-schema made of the target fields that match its own columns.
+        Args:
+            target_schema (`Schema`):
+                Schema to cast to, the names and order of fields must match.
+            safe (`bool`, defaults to `True`):
+                Check for overflows or other unsafe conversions.
+        Returns:
+            `datasets.table.Table`
+        """
+        from .features import Features
+        table = table_cast(self.table, target_schema, *args, **kwargs)
+        target_features = Features.from_arrow_schema(target_schema)
+        blocks = []
+        for subtables in self.blocks:
+            new_tables = []
+            # Fresh list of the target fields for every row block: a field is popped once a subtable claims it.
+            fields = list(target_schema)
+            for subtable in subtables:
+                subfields = []
+                for name in subtable.column_names:
+                    # Take the first remaining target field with this name
+                    # (`next` raises `StopIteration` if no remaining field has it).
+                    subfields.append(fields.pop(next(i for i, field in enumerate(fields) if field.name == name)))
+                # Build the sub-schema from the target features of the columns of this subtable.
+                subfeatures = Features({subfield.name: target_features[subfield.name] for subfield in subfields})
+                subschema = subfeatures.arrow_schema
+                new_tables.append(subtable.cast(subschema, *args, **kwargs))
+            blocks.append(new_tables)
+        return ConcatenationTable(table, blocks)
+    def replace_schema_metadata(self, *args, **kwargs):
+        """
+        EXPERIMENTAL: Create shallow copy of table by replacing schema
+        key-value metadata with the indicated new metadata (which may be `None`,
+        which deletes any existing metadata).
+        Args:
+            metadata (`dict`, defaults to `None`):
+        Returns:
+            `datasets.table.Table`: shallow_copy
+        """
+        table = self.table.replace_schema_metadata(*args, **kwargs)
+        blocks = []
+        for tables in self.blocks:
+            blocks.append([t.replace_schema_metadata(*args, **kwargs) for t in tables])
+        return ConcatenationTable(table, self.blocks)
+    def add_column(self, *args, **kwargs):
+        """
+        Add column to Table at position.
+        A new table is returned with the column added, the original table
+        object is left unchanged.
+        Args:
+            i (`int`):
+                Index to place the column at.
+            field_ (`Union[str, pyarrow.Field]`):
+                If a string is passed then the type is deduced from the column
+                data.
+            column (`Union[pyarrow.Array, List[pyarrow.Array]]`):
+                Column data.
+        Returns:
+            `datasets.table.Table`: New table with the passed column added.
+        """
+        raise NotImplementedError()
+    def append_column(self, *args, **kwargs):
+        """
+        Append column at end of columns.
+        Args:
+            field_ (`Union[str, pyarrow.Field]`):
+                If a string is passed then the type is deduced from the column
+                data.
+            column (`Union[pyarrow.Array, List[pyarrow.Array]]`):
+                Column data.
+        Returns:
+            `datasets.table.Table`:
+                New table with the passed column added.
+        """
+        raise NotImplementedError()
+    def remove_column(self, i, *args, **kwargs):
+        """
+        Create new Table with the indicated column removed.
+        Args:
+            i (`int`):
+                Index of column to remove.
+        Returns:
+            `datasets.table.Table`:
+                New table without the column.
+        """
+        table = self.table.remove_column(i, *args, **kwargs)
+        name = self.table.column_names[i]
+        blocks = []
+        for tables in self.blocks:
+            blocks.append(
+                [
+                    t.remove_column(t.column_names.index(name), *args, **kwargs) if name in t.column_names else t
+                    for t in tables
+                ]
+            )
+        return ConcatenationTable(table, blocks)
+    def set_column(self, *args, **kwargs):
+        """
+        Replace column in Table at position.
+        Args:
+            i (`int`):
+                Index to place the column at.
+            field_ (`Union[str, pyarrow.Field]`):
+                If a string is passed then the type is deduced from the column
+                data.
+            column (`Union[pyarrow.Array, List[pyarrow.Array]]`):
+                Column data.
+        Returns:
+            `datasets.table.Table`:
+                New table with the passed column set.
+        """
+        raise NotImplementedError()
+    def rename_columns(self, names, *args, **kwargs):
+        """
+        Create new table with columns renamed to provided names.
+        """
+        table = self.table.rename_columns(names, *args, **kwargs)
+        names = dict(zip(self.table.column_names, names))
+        blocks = []
+        for tables in self.blocks:
+            blocks.append(
+                [t.rename_columns([names[name] for name in t.column_names], *args, **kwargs) for t in tables]
+            )
+        return ConcatenationTable(table, blocks)
+    def drop(self, columns, *args, **kwargs):
+        """
+        Drop one or more columns and return a new table.
+        Args:
+            columns (`List[str]`):
+                List of field names referencing existing columns.
+        Raises:
+            `KeyError` : if any of the passed columns name are not existing.
+        Returns:
+            `datasets.table.Table`:
+                New table without the columns.
+        """
+        table = self.table.drop(columns, *args, **kwargs)
+        blocks = []
+        for tables in self.blocks:
+            blocks.append([t.drop([c for c in columns if c in t.column_names], *args, **kwargs) for t in tables])
+        return ConcatenationTable(table, blocks)
+    def select(self, columns, *args, **kwargs):
+        """
+        Select columns of the table.
+        Returns a new table with the specified columns, and metadata preserved.
+        Args:
+            columns (:obj:`Union[List[str], List[int]]`):
+                The column names or integer indices to select.
+        Returns:
+            :class:`datasets.table.Table`: New table with the specified columns, and metadata preserved.
+        """
+        table = self.table.select(columns, *args, **kwargs)
+        blocks = []
+        for tables in self.blocks:
+            blocks.append([t.select([c for c in columns if c in t.column_names], *args, **kwargs) for t in tables])
+        return ConcatenationTable(table, blocks)
+def concat_tables(tables: list[Table], axis: int = 0) -> Table:
+    """
+    Concatenate tables.
+    Args:
+        tables (list of `Table`):
+            List of tables to be concatenated.
+        axis (`{0, 1}`, defaults to `0`, meaning over rows):
+            Axis to concatenate over, where `0` means over rows (vertically) and `1` means over columns
+            (horizontally).
+            <Added version="1.6.0"/>
+    Returns:
+        `datasets.table.Table`:
+            If the number of input tables is > 1, then the returned table is a `datasets.table.ConcatenationTable`.
+            Otherwise if there's only one table, it is returned as is.
+    """
+    tables = list(tables)
+    if len(tables) == 1:
+        return tables[0]
+    return ConcatenationTable.from_tables(tables, axis=axis)
+def list_table_cache_files(table: Table) -> list[str]:
+    """
+    Get the cache files that are loaded by the table.
+    Cache file are used when parts of the table come from the disk via memory mapping.
+    Returns:
+        `List[str]`:
+            A list of paths to the cache files loaded by the table.
+    """
+    if isinstance(table, ConcatenationTable):
+        cache_files = []
+        for subtables in table.blocks:
+            for subtable in subtables:
+                cache_files += list_table_cache_files(subtable)
+        return cache_files
+    elif isinstance(table, MemoryMappedTable):
+        return [table.path]
+    else:
+        return []
+def _wrap_for_chunked_arrays(func):
+    """Apply the function on each chunk of a `pyarrow.ChunkedArray`, or on the array directly"""
+    def wrapper(array, *args, **kwargs):
+        if isinstance(array, pa.ChunkedArray):
+            return pa.chunked_array([func(chunk, *args, **kwargs) for chunk in array.chunks])
+        else:
+            return func(array, *args, **kwargs)
+    return wrapper
+def _are_list_values_of_length(array: pa.ListArray, length: int) -> bool:
+    """Check if all the sub-lists of a `pa.ListArray` have the specified length."""
+    return pc.all(pc.equal(array.value_lengths(), length)).as_py() or array.null_count == len(array)
+def _combine_list_array_offsets_with_mask(array: pa.ListArray) -> pa.Array:
+    """Add the null bitmap to the offsets of a `pa.ListArray`."""
+    offsets = array.offsets
+    if array.null_count > 0:
+        offsets = pa.concat_arrays(
+            [
+                pc.replace_with_mask(offsets[:-1], array.is_null(), pa.nulls(len(array), pa.int32())),
+                offsets[-1:],
+            ]
+        )
+    return offsets
+def _storage_type(type: pa.DataType) -> pa.DataType:
+    """Convert a (possibly nested) `pa.ExtensionType` to its storage type."""
+    if isinstance(type, pa.ExtensionType):
+        return _storage_type(type.storage_type)
+    elif isinstance(type, pa.StructType):
+        return pa.struct([pa.field(field.name, _storage_type(field.type)) for field in type])
+    elif isinstance(type, pa.ListType):
+        return pa.list_(_storage_type(type.value_type))
+    elif isinstance(type, pa.FixedSizeListType):
+        return pa.list_(_storage_type(type.value_type), type.list_size)
+    return type
+def _short_str(value: Any) -> str:
+    out = str(value)
+    if len(out) > 3000:
+        out = out[:1500] + "\n...\n" + out[-1500:]
+    return out
+@_wrap_for_chunked_arrays
+def array_cast(
+    array: pa.Array, pa_type: pa.DataType, allow_primitive_to_str: bool = True, allow_decimal_to_str: bool = True
+) -> Union[pa.Array, pa.FixedSizeListArray, pa.ListArray, pa.StructArray, pa.ExtensionArray]:
+    """Improved version of `pa.Array.cast`
+    It supports casting `pa.StructArray` objects to re-order the fields.
+    It also let you control certain aspects of the casting, e.g. whether
+    to disable casting primitives (`booleans`, `floats` or `ints`) or
+    disable casting decimals to strings.
+    Struct, list, large list and fixed-size list arrays are rebuilt from their recursively casted
+    children; every other array is passed to `pa.Array.cast`.
+    Through the `_wrap_for_chunked_arrays` decorator, a `pa.ChunkedArray` is casted chunk by chunk.
+    Args:
+        array (`pa.Array`):
+            PyArrow array to cast
+        pa_type (`pa.DataType`):
+            Target PyArrow type
+        allow_primitive_to_str (`bool`, defaults to `True`):
+            Whether to allow casting primitives to strings.
+            Defaults to `True`.
+        allow_decimal_to_str (`bool`, defaults to `True`):
+            Whether to allow casting decimals to strings.
+            Defaults to `True`.
+    Raises:
+        `pa.ArrowInvalidError`: if the arrow data casting fails
+        `TypeError`: if the target type is not supported, e.g.
+            - if a field is missing
+            - if casting from primitives to strings and `allow_primitive_to_str` is `False`
+            - if casting from decimals to strings and `allow_decimal_to_str` is `False`
+            - if casting a non-null array to the null type
+    Returns:
+        `pyarrow.Array`: the casted array (a `pyarrow.ChunkedArray` if the input is chunked)
+    """
+    # Recursive call that forwards the two casting options.
+    _c = partial(array_cast, allow_primitive_to_str=allow_primitive_to_str, allow_decimal_to_str=allow_decimal_to_str)
+    # Extension arrays are casted from their storage.
+    if isinstance(array, pa.ExtensionArray):
+        array = array.storage
+    if isinstance(pa_type, pa.ExtensionType):
+        # Cast to the storage type of the extension type, then wrap the result.
+        return pa_type.wrap_array(_c(array, pa_type.storage_type))
+    elif array.type == pa_type:
+        # Already of the requested type.
+        return array
+    elif pa.types.is_struct(array.type):
+        # Struct to struct is only supported when both have the same set of field names;
+        # the fields are taken in the order of the target type.
+        if pa.types.is_struct(pa_type) and ({field.name for field in pa_type} == {field.name for field in array.type}):
+            if array.type.num_fields == 0:
+                return array
+            arrays = [_c(array.field(field.name), field.type) for field in pa_type]
+            return pa.StructArray.from_arrays(arrays, fields=list(pa_type), mask=array.is_null())
+    elif pa.types.is_list(array.type) or pa.types.is_large_list(array.type):
+        if pa.types.is_fixed_size_list(pa_type):
+            # Only possible if the sub-lists all have the target size (or if the array only contains nulls).
+            if _are_list_values_of_length(array, pa_type.list_size):
+                if array.null_count > 0:
+                    # Ensure each null value in the array translates to [null] * pa_type.list_size in the array's values array
+                    array_type = array.type
+                    storage_type = _storage_type(array_type)
+                    if array_type != storage_type:
+                        # Temporarily convert to the storage type to support extension types in the slice operation
+                        array = _c(array, storage_type)
+                        array = pc.list_slice(array, 0, pa_type.list_size, return_fixed_size_list=True)
+                        array = _c(array, array_type)
+                    else:
+                        array = pc.list_slice(array, 0, pa_type.list_size, return_fixed_size_list=True)
+                    array_values = array.values
+                    return pa.FixedSizeListArray.from_arrays(
+                        _c(array_values, pa_type.value_type), pa_type.list_size, mask=array.is_null()
+                    )
+                else:
+                    # No nulls: keep only the values that belong to this (possibly sliced) array.
+                    array_values = array.values[
+                        array.offset * pa_type.list_size : (array.offset + len(array)) * pa_type.list_size
+                    ]
+                    return pa.FixedSizeListArray.from_arrays(_c(array_values, pa_type.value_type), pa_type.list_size)
+        elif pa.types.is_list(pa_type):
+            # Merge offsets with the null bitmap to avoid the "Null bitmap with offsets slice not supported" ArrowNotImplementedError
+            array_offsets = _combine_list_array_offsets_with_mask(array)
+            return pa.ListArray.from_arrays(array_offsets, _c(array.values, pa_type.value_type))
+        elif pa.types.is_large_list(pa_type):
+            # Merge offsets with the null bitmap to avoid the "Null bitmap with offsets slice not supported" ArrowNotImplementedError
+            array_offsets = _combine_list_array_offsets_with_mask(array)
+            return pa.LargeListArray.from_arrays(array_offsets, _c(array.values, pa_type.value_type))
+    elif pa.types.is_fixed_size_list(array.type):
+        if pa.types.is_fixed_size_list(pa_type):
+            # Fixed-size list to fixed-size list is only supported for equal list sizes.
+            if pa_type.list_size == array.type.list_size:
+                # Keep only the values that belong to this (possibly sliced) array.
+                array_values = array.values[
+                    array.offset * array.type.list_size : (array.offset + len(array)) * array.type.list_size
+                ]
+                return pa.FixedSizeListArray.from_arrays(
+                    _c(array_values, pa_type.value_type), pa_type.list_size, mask=array.is_null()
+                )
+        elif pa.types.is_list(pa_type):
+            # The offsets are multiples of the list size, shifted by the offset of the array.
+            array_offsets = (np.arange(len(array) + 1) + array.offset) * array.type.list_size
+            return pa.ListArray.from_arrays(array_offsets, _c(array.values, pa_type.value_type), mask=array.is_null())
+        elif pa.types.is_large_list(pa_type):
+            # The offsets are multiples of the list size, shifted by the offset of the array.
+            array_offsets = (np.arange(len(array) + 1) + array.offset) * array.type.list_size
+            return pa.LargeListArray.from_arrays(
+                array_offsets, _c(array.values, pa_type.value_type), mask=array.is_null()
+            )
+    else:
+        # Neither struct nor list: check the casting options, then rely on `pa.Array.cast`.
+        if pa.types.is_string(pa_type):
+            if not allow_primitive_to_str and pa.types.is_primitive(array.type):
+                raise TypeError(
+                    f"Couldn't cast array of type {_short_str(array.type)} to {_short_str(pa_type)} "
+                    f"since allow_primitive_to_str is set to {allow_primitive_to_str} "
+                )
+            if not allow_decimal_to_str and pa.types.is_decimal(array.type):
+                raise TypeError(
+                    f"Couldn't cast array of type {_short_str(array.type)} to {_short_str(pa_type)} "
+                    f"and allow_decimal_to_str is set to {allow_decimal_to_str}"
+                )
+        if pa.types.is_null(pa_type) and not pa.types.is_null(array.type):
+            raise TypeError(f"Couldn't cast array of type {_short_str(array.type)} to {_short_str(pa_type)}")
+        return array.cast(pa_type)
+    # Reached when a struct or list branch above found no supported conversion.
+    raise TypeError(f"Couldn't cast array of type {_short_str(array.type)} to {_short_str(pa_type)}")
+@_wrap_for_chunked_arrays
+def cast_array_to_feature(
+    array: pa.Array, feature: "FeatureType", allow_primitive_to_str: bool = True, allow_decimal_to_str: bool = True
+) -> pa.Array:
+    """Cast an array to the arrow type that corresponds to the requested feature type.
+    For custom features like [`Audio`] or [`Image`], it takes into account the "cast_storage" methods
+    they defined to enable casting from other arrow types.
+    Through the `_wrap_for_chunked_arrays` decorator, a `pa.ChunkedArray` is casted chunk by chunk.
+    Args:
+        array (`pa.Array`):
+            The PyArrow array to cast.
+        feature (`datasets.features.FeatureType`):
+            The target feature type.
+        allow_primitive_to_str (`bool`, defaults to `True`):
+            Whether to allow casting primitives to strings.
+            Defaults to `True`.
+        allow_decimal_to_str (`bool`, defaults to `True`):
+            Whether to allow casting decimals to strings.
+            Defaults to `True`.
+    Raises:
+        `pa.ArrowInvalidError`: if the arrow data casting fails
+        `TypeError`: if the target type is not supported, e.g.
+            - if a field is missing
+            - if casting from primitives and `allow_primitive_to_str` is `False`
+            - if casting from decimals and `allow_decimal_to_str` is `False`
+    Returns:
+        array (`pyarrow.Array`): the casted array
+    """
+    from .features.features import LargeList, List, get_nested_type
+    # Recursive call that forwards the two casting options.
+    _c = partial(
+        cast_array_to_feature,
+        allow_primitive_to_str=allow_primitive_to_str,
+        allow_decimal_to_str=allow_decimal_to_str,
+    )
+    # Extension arrays are casted from their storage.
+    if isinstance(array, pa.ExtensionArray):
+        array = array.storage
+    # Features that define `cast_storage` handle the cast themselves.
+    if hasattr(feature, "cast_storage"):
+        return feature.cast_storage(array)
+    if pa.types.is_struct(array.type):
+        # feature must be a dict
+        # The fields of the array must all be keys of the feature; keys missing from the array are filled with nulls.
+        if isinstance(feature, dict) and (array_fields := {field.name for field in array.type}) <= set(feature):
+            null_array = pa.array([None] * len(array))
+            arrays = [
+                _c(array.field(name) if name in array_fields else null_array, subfeature)
+                for name, subfeature in feature.items()
+            ]
+            return pa.StructArray.from_arrays(arrays, names=list(feature), mask=array.is_null())
+    elif pa.types.is_list(array.type) or pa.types.is_large_list(array.type):
+        # feature must be either List(subfeature) or LargeList(subfeature)
+        if isinstance(feature, LargeList):
+            casted_array_values = _c(array.values, feature.feature)
+            if pa.types.is_large_list(array.type) and casted_array_values.type == array.values.type:
+                # Both array and feature have equal large_list type and values (within the list) type
+                return array
+            else:
+                # Merge offsets with the null bitmap to avoid the "Null bitmap with offsets slice not supported" ArrowNotImplementedError
+                array_offsets = _combine_list_array_offsets_with_mask(array)
+                return pa.LargeListArray.from_arrays(array_offsets, casted_array_values)
+        elif isinstance(feature, List):
+            # A length greater than -1 targets a fixed-size list.
+            if feature.length > -1:
+                # Only possible if the sub-lists all have the target length (or if the array only contains nulls).
+                if _are_list_values_of_length(array, feature.length):
+                    if array.null_count > 0:
+                        # Ensure each null value in the array translates to [null] * feature.length in the array's values array
+                        array_type = array.type
+                        storage_type = _storage_type(array_type)
+                        if array_type != storage_type:
+                            # Temporarily convert to the storage type to support extension types in the slice operation
+                            array = array_cast(
+                                array,
+                                storage_type,
+                                allow_primitive_to_str=allow_primitive_to_str,
+                                allow_decimal_to_str=allow_decimal_to_str,
+                            )
+                            array = pc.list_slice(array, 0, feature.length, return_fixed_size_list=True)
+                            array = array_cast(
+                                array,
+                                array_type,
+                                allow_primitive_to_str=allow_primitive_to_str,
+                                allow_decimal_to_str=allow_decimal_to_str,
+                            )
+                        else:
+                            array = pc.list_slice(array, 0, feature.length, return_fixed_size_list=True)
+                        array_values = array.values
+                        casted_array_values = _c(array_values, feature.feature)
+                        return pa.FixedSizeListArray.from_arrays(
+                            casted_array_values, feature.length, mask=array.is_null()
+                        )
+                    else:
+                        # No nulls: keep only the values that belong to this (possibly sliced) array.
+                        array_values = array.values[
+                            array.offset * feature.length : (array.offset + len(array)) * feature.length
+                        ]
+                        return pa.FixedSizeListArray.from_arrays(_c(array_values, feature.feature), feature.length)
+            else:
+                casted_array_values = _c(array.values, feature.feature)
+                if pa.types.is_list(array.type) and casted_array_values.type == array.values.type:
+                    # Both array and feature have equal list type and values (within the list) type
+                    return array
+                else:
+                    # Merge offsets with the null bitmap to avoid the "Null bitmap with offsets slice not supported" ArrowNotImplementedError
+                    array_offsets = _combine_list_array_offsets_with_mask(array)
+                    return pa.ListArray.from_arrays(array_offsets, casted_array_values)
+    elif pa.types.is_fixed_size_list(array.type):
+        # feature must be either List(subfeature) or LargeList(subfeature)
+        if isinstance(feature, LargeList):
+            # The offsets are multiples of the list size, shifted by the offset of the array.
+            array_offsets = (np.arange(len(array) + 1) + array.offset) * array.type.list_size
+            return pa.LargeListArray.from_arrays(
+                array_offsets, _c(array.values, feature.feature), mask=array.is_null()
+            )
+        elif isinstance(feature, List):
+            if feature.length > -1:
+                # Fixed-size list to fixed-size list is only supported for equal sizes.
+                if feature.length == array.type.list_size:
+                    # Keep only the values that belong to this (possibly sliced) array.
+                    array_values = array.values[
+                        array.offset * array.type.list_size : (array.offset + len(array)) * array.type.list_size
+                    ]
+                    casted_array_values = _c(array_values, feature.feature)
+                    return pa.FixedSizeListArray.from_arrays(casted_array_values, feature.length, mask=array.is_null())
+            else:
+                # The offsets are multiples of the list size, shifted by the offset of the array.
+                array_offsets = (np.arange(len(array) + 1) + array.offset) * array.type.list_size
+                return pa.ListArray.from_arrays(array_offsets, _c(array.values, feature.feature), mask=array.is_null())
+    # Reached when no branch above returned: null arrays are casted to the nested arrow type of the feature,
+    # and features that are not a list, large list or dict are called to get the target type for `array_cast`.
+    if pa.types.is_null(array.type):
+        return array_cast(
+            array,
+            get_nested_type(feature),
+            allow_primitive_to_str=allow_primitive_to_str,
+            allow_decimal_to_str=allow_decimal_to_str,
+        )
+    elif not isinstance(feature, (List, LargeList, dict)):
+        return array_cast(
+            array,
+            feature(),
+            allow_primitive_to_str=allow_primitive_to_str,
+            allow_decimal_to_str=allow_decimal_to_str,
+        )
+    raise TypeError(f"Couldn't cast array of type\n{_short_str(array.type)}\nto\n{_short_str(feature)}")
+@_wrap_for_chunked_arrays
+def embed_array_storage(array: pa.Array, feature: "FeatureType", token_per_repo_id=None):
+    """Embed data into an arrays's storage.
+    For custom features like Audio or Image, it takes into account the "embed_storage" methods
+    they define to embed external data (e.g. an image file) into an array.
+    Struct and list arrays are rebuilt from their recursively embedded children.
+    Through the `_wrap_for_chunked_arrays` decorator, a `pa.ChunkedArray` is processed chunk by chunk.
+    <Added version="2.4.0"/>
+    Args:
+        array (`pa.Array`):
+            The PyArrow array in which to embed data.
+        feature (`datasets.features.FeatureType`):
+            Array features.
+        token_per_repo_id (defaults to `None`):
+            Forwarded as is to the "embed_storage" methods and to the recursive calls.
+    Raises:
+        `TypeError`: if the target type is not supported, e.g.
+            - if a field is missing
+    Returns:
+         array (`pyarrow.Array`): the array with embedded data
+    """
+    from .features import LargeList, List
+    # Recursive call that forwards `token_per_repo_id`.
+    _e = partial(embed_array_storage, token_per_repo_id=token_per_repo_id)
+    # Extension arrays are processed from their storage.
+    if isinstance(array, pa.ExtensionArray):
+        array = array.storage
+    # Features that define `embed_storage` handle the embedding themselves.
+    if hasattr(feature, "embed_storage"):
+        return feature.embed_storage(array, token_per_repo_id=token_per_repo_id)
+    elif pa.types.is_struct(array.type):
+        # feature must be a dict
+        if isinstance(feature, dict):
+            arrays = [_e(array.field(name), subfeature) for name, subfeature in feature.items()]
+            return pa.StructArray.from_arrays(arrays, names=list(feature), mask=array.is_null())
+    elif pa.types.is_list(array.type):
+        # feature must be List(subfeature) with no fixed length
+        # Merge offsets with the null bitmap to avoid the "Null bitmap with offsets slice not supported" ArrowNotImplementedError
+        array_offsets = _combine_list_array_offsets_with_mask(array)
+        if isinstance(feature, List) and feature.length == -1:
+            return pa.ListArray.from_arrays(array_offsets, _e(array.values, feature.feature))
+    elif pa.types.is_large_list(array.type):
+        # feature must be LargeList(subfeature)
+        # NOTE(review): unlike the other branches, `feature.feature` is read here without an isinstance check - confirm this is intended
+        # Merge offsets with the null bitmap to avoid the "Null bitmap with offsets slice not supported" ArrowNotImplementedError
+        array_offsets = _combine_list_array_offsets_with_mask(array)
+        return pa.LargeListArray.from_arrays(array_offsets, _e(array.values, feature.feature))
+    elif pa.types.is_fixed_size_list(array.type):
+        # feature must be List(subfeature) with a fixed length
+        if isinstance(feature, List) and feature.length > -1:
+            # Keep only the values that belong to this (possibly sliced) array.
+            array_values = array.values[
+                array.offset * array.type.list_size : (array.offset + len(array)) * array.type.list_size
+            ]
+            embedded_array_values = _e(array_values, feature.feature)
+            return pa.FixedSizeListArray.from_arrays(embedded_array_values, feature.length, mask=array.is_null())
+    # Reached when no branch above returned: features that are not a list, large list or dict have nothing
+    # to embed and the array is returned as is; anything else is an unsupported combination.
+    if not isinstance(feature, (List, LargeList, dict)):
+        return array
+    raise TypeError(f"Couldn't embed array of type\n{_short_str(array.type)}\nwith\n{_short_str(feature)}")
+class CastError(ValueError):
+    """When it's not possible to cast an Arrow table to a specific schema or set of features.
+    Besides the usual exception arguments it records the column names of the table and the
+    column names that were requested, so that `details()` can describe the difference.
+    """
+    def __init__(self, *args, table_column_names: list[str], requested_column_names: list[str]) -> None:
+        super().__init__(*args)
+        # Column names of the table that could not be cast
+        self.table_column_names = table_column_names
+        # Column names of the requested schema / features
+        self.requested_column_names = requested_column_names
+    def __reduce__(self):
+        # Fix unpickling: TypeError: __init__() missing 2 required keyword-only arguments: 'table_column_names' and 'requested_column_names'
+        # The positional args (the error message) are passed along as well, otherwise the
+        # unpickled exception would come back with an empty message.
+        return partial(
+            CastError, table_column_names=self.table_column_names, requested_column_names=self.requested_column_names
+        ), self.args
+    def details(self):
+        """Return a short sentence listing the new and/or missing columns."""
+        new_columns = set(self.table_column_names) - set(self.requested_column_names)
+        missing_columns = set(self.requested_column_names) - set(self.table_column_names)
+        if new_columns and missing_columns:
+            return f"there are {len(new_columns)} new columns ({_short_str(new_columns)}) and {len(missing_columns)} missing columns ({_short_str(missing_columns)})."
+        elif new_columns:
+            return f"there are {len(new_columns)} new columns ({_short_str(new_columns)})"
+        else:
+            return f"there are {len(missing_columns)} missing columns ({_short_str(missing_columns)})"
+def cast_table_to_features(table: pa.Table, features: "Features"):
+    """Cast a table to the arrow schema derived from the requested features.
+    Args:
+        table (`pyarrow.Table`):
+            PyArrow table to cast.
+        features ([`Features`]):
+            Target features.
+    Raises:
+        `CastError`: if the table and the features do not have exactly the same column names
+    Returns:
+        table (`pyarrow.Table`): the casted table
+    """
+    requested_names = list(features)
+    # Column order is irrelevant here, only the set of names has to match
+    if sorted(table.column_names) != sorted(requested_names):
+        raise CastError(
+            f"Couldn't cast\n{_short_str(table.schema)}\nto\n{_short_str(features)}\nbecause column names don't match",
+            table_column_names=table.column_names,
+            requested_column_names=requested_names,
+        )
+    casted_columns = []
+    for column_name, feature in features.items():
+        casted_columns.append(cast_array_to_feature(table[column_name], feature))
+    return pa.Table.from_arrays(casted_columns, schema=features.arrow_schema)
+def cast_table_to_schema(table: pa.Table, schema: pa.Schema):
+    """Cast a table to the arrow schema. Different from `cast_table_to_features`, this method can preserve nullability.
+    Columns that exist in `schema` but not in `table` are filled with nulls before casting.
+    Args:
+        table (`pa.Table`):
+            PyArrow table to cast.
+        schema (`pa.Schema`):
+            Target PyArrow schema.
+    Raises:
+        `CastError`: if the table contains a column that is not part of the schema
+    Returns:
+        `pa.Table`: the casted table
+    """
+    from .features import Features
+    features = Features.from_arrow_schema(schema)
+    present_columns = set(table.column_names)
+    if not present_columns.issubset(schema.names):
+        raise CastError(
+            f"Couldn't cast\n{_short_str(table.schema)}\nto\n{_short_str(features)}\nbecause column names don't match",
+            table_column_names=table.column_names,
+            requested_column_names=list(features),
+        )
+    def _source_column(column_name):
+        # A column missing from the table is replaced by an all-null array of the schema's type
+        if column_name in present_columns:
+            return table[column_name]
+        return pa.array([None] * len(table), type=schema.field(column_name).type)
+    casted_columns = [
+        cast_array_to_feature(_source_column(column_name), feature) for column_name, feature in features.items()
+    ]
+    return pa.Table.from_arrays(casted_columns, schema=schema)
+def embed_table_storage(table: pa.Table, token_per_repo_id=None):
+    """Embed external data into a table's storage.
+    Only the columns for which `require_storage_embed` is true go through `embed_array_storage`;
+    the other columns are reused as they are.
+    <Added version="2.4.0"/>
+    Args:
+        table (`pyarrow.Table`):
+            PyArrow table in which to embed data.
+        token_per_repo_id (optional):
+            Passed through unchanged to `embed_array_storage`.
+    Returns:
+        table (`pyarrow.Table`): the table with embedded data
+    """
+    from .features.features import Features, require_storage_embed
+    features = Features.from_arrow_schema(table.schema)
+    embedded_columns = []
+    for column_name, feature in features.items():
+        if require_storage_embed(feature):
+            column = embed_array_storage(table[column_name], feature, token_per_repo_id=token_per_repo_id)
+        else:
+            column = table[column_name]
+        embedded_columns.append(column)
+    return pa.Table.from_arrays(embedded_columns, schema=features.arrow_schema)
+def table_cast(table: pa.Table, schema: pa.Schema):
+    """Improved version of `pa.Table.cast`.
+    It supports casting to feature types stored in the schema metadata.
+    Args:
+        table (`pyarrow.Table`):
+            PyArrow table to cast.
+        schema (`pyarrow.Schema`):
+            Target PyArrow schema.
+    Returns:
+        table (`pyarrow.Table`): the casted table; the input table itself when both the
+            schema and its metadata already match
+    """
+    current_schema = table.schema
+    if current_schema != schema:
+        return cast_table_to_schema(table, schema)
+    # Schemas compare equal: at most the metadata has to be swapped
+    if table.schema.metadata != schema.metadata:
+        return table.replace_schema_metadata(schema.metadata)
+    return table
+def table_flatten(table: pa.Table):
+    """Improved version of `pa.Table.flatten`.
+    Like `pa.Table.flatten` it performs a 1-step flatten of struct columns into one column per
+    struct field, but it also updates the metadata and leaves alone the struct columns whose
+    feature has a `flatten()` method returning the feature itself.
+    Args:
+        table (`pa.Table`):
+            PyArrow table to flatten.
+    Returns:
+        `Table`: the flattened table
+    """
+    from .features import Features
+    features = Features.from_arrow_schema(table.schema)
+    # Is there at least one feature that asks to be kept as a single column?
+    keep_some_column_whole = False
+    for subfeature in features.values():
+        if hasattr(subfeature, "flatten") and subfeature.flatten() == subfeature:
+            keep_some_column_whole = True
+            break
+    if not keep_some_column_whole:
+        flat_table = table.flatten()
+    else:
+        columns = []
+        column_names = []
+        for field in table.schema:
+            column = table.column(field.name)
+            subfeature = features[field.name]
+            expand = pa.types.is_struct(field.type) and (
+                not hasattr(subfeature, "flatten") or subfeature.flatten() != subfeature
+            )
+            if not expand:
+                columns.append(column)
+                column_names.append(field.name)
+                continue
+            columns.extend(column.flatten())
+            column_names.extend([f"{field.name}.{subfield.name}" for subfield in field.type])
+        flat_table = pa.Table.from_arrays(columns, names=column_names)
+    # Preserve complex types in the metadata
+    all_flat_features = features.flatten(max_depth=2)
+    kept_features = Features({name: all_flat_features[name] for name in flat_table.column_names})
+    return flat_table.replace_schema_metadata(kept_features.arrow_schema.metadata)
+def table_visitor(table: pa.Table, function: Callable[[pa.Array, "FeatureType"], None]):
+    """Visit all arrays in a table and apply a function to them.
+    Chunked arrays are visited chunk by chunk, extension arrays through their storage, and the
+    visit recurses into struct fields and into the values of list and large-list arrays.
+    Args:
+        table (`pyarrow.Table`):
+            PyArrow table to visit.
+        function (`Callable[[pa.Array, FeatureType], None]`):
+            Function to apply to each array; it receives the array and its matching feature.
+    """
+    from .features import Features, LargeList, List
+    features = Features.from_arrow_schema(table.schema)
+    def _visit(array, feature):
+        if isinstance(array, pa.ChunkedArray):
+            for chunk in array.chunks:
+                _visit(chunk, feature)
+        else:
+            if isinstance(array, pa.ExtensionArray):
+                array = array.storage
+            function(array, feature)
+            # Structs backing a feature with `cast_storage` are not descended into
+            if pa.types.is_struct(array.type) and not hasattr(feature, "cast_storage"):
+                for name, subfeature in feature.items():
+                    _visit(array.field(name), subfeature)
+            # Large-list arrays are accepted too: checking only `is_list` meant that the
+            # values behind a `LargeList` feature were never visited.
+            elif pa.types.is_list(array.type) or pa.types.is_large_list(array.type):
+                if isinstance(feature, (LargeList, List)):
+                    _visit(array.values, feature.feature)
+    for name, feature in features.items():
+        _visit(table[name], feature)
+def table_iter(table: Table, batch_size: int, drop_last_batch=False) -> Iterator[pa.Table]:
+    """Iterate over sub-tables of size `batch_size`.
+    Record batches read from the table are buffered until `batch_size` rows are available;
+    a batch that overshoots is split and its remainder starts the next buffer.
+    Args:
+        table (`Table`):
+            Table to iterate over.
+        batch_size (`int`):
+            Size of each sub-table to yield.
+        drop_last_batch (`bool`, defaults to `False`):
+            Drop the last batch if it is smaller than `batch_size`.
+    """
+    pending = []
+    pending_rows = 0
+    for batch in table.to_reader(max_chunksize=batch_size):
+        batch_rows = len(batch)
+        if batch_rows == 0:
+            continue
+        total_rows = pending_rows + batch_rows
+        if total_rows < batch_size:
+            # Not enough rows yet: keep buffering
+            pending.append(batch)
+            pending_rows = total_rows
+            continue
+        if total_rows == batch_size:
+            pending.append(batch)
+            yield pa.Table.from_batches(pending)
+            pending = []
+            pending_rows = 0
+            continue
+        # Too many rows: emit exactly `batch_size` and carry the remainder over
+        rows_to_take = batch_size - pending_rows
+        rows_left = batch_rows - rows_to_take
+        pending.append(batch.slice(0, rows_to_take))
+        yield pa.Table.from_batches(pending)
+        pending = [batch.slice(rows_to_take, rows_left)]
+        pending_rows = rows_left
+    if pending and not drop_last_batch:
+        yield pa.Table.from_batches(pending)

idna/__init__.py ADDED Viewed

	@@ -0,0 +1,45 @@

+from .core import (
+    IDNABidiError,
+    IDNAError,
+    InvalidCodepoint,
+    InvalidCodepointContext,
+    alabel,
+    check_bidi,
+    check_hyphen_ok,
+    check_initial_combiner,
+    check_label,
+    check_nfc,
+    decode,
+    encode,
+    ulabel,
+    uts46_remap,
+    valid_contextj,
+    valid_contexto,
+    valid_label_length,
+    valid_string_length,
+)
+from .intranges import intranges_contain
+from .package_data import __version__
+# Public API of the package: the names imported above from `.core`, `.intranges` and
+# `.package_data`.
+__all__ = [
+    "__version__",
+    "IDNABidiError",
+    "IDNAError",
+    "InvalidCodepoint",
+    "InvalidCodepointContext",
+    "alabel",
+    "check_bidi",
+    "check_hyphen_ok",
+    "check_initial_combiner",
+    "check_label",
+    "check_nfc",
+    "decode",
+    "encode",
+    "intranges_contain",
+    "ulabel",
+    "uts46_remap",
+    "valid_contextj",
+    "valid_contexto",
+    "valid_label_length",
+    "valid_string_length",
+]

idna/codec.py ADDED Viewed

	@@ -0,0 +1,122 @@

+import codecs
+import re
+from typing import Any, Optional, Tuple
+from .core import IDNAError, alabel, decode, encode, ulabel
+# Matches any one of the label separators U+002E, U+3002, U+FF0E and U+FF61; the incremental
+# encoder and decoder below split their input on it.
+_unicode_dots_re = re.compile("[\u002e\u3002\uff0e\uff61]")
+class Codec(codecs.Codec):
+    """Stateless codec that delegates to the module-level `encode` and `decode` functions.
+    Only the "strict" error handling is supported.
+    """
+    def encode(self, data: str, errors: str = "strict") -> Tuple[bytes, int]:
+        """Return the encoded form of `data` together with the length of the input consumed."""
+        if errors != "strict":
+            raise IDNAError('Unsupported error handling "{}"'.format(errors))
+        if data:
+            return encode(data), len(data)
+        # Empty input short-circuits without calling the encoder
+        return b"", 0
+    def decode(self, data: bytes, errors: str = "strict") -> Tuple[str, int]:
+        """Return the decoded form of `data` together with the length of the input consumed."""
+        if errors != "strict":
+            raise IDNAError('Unsupported error handling "{}"'.format(errors))
+        if data:
+            return decode(data), len(data)
+        # Empty input short-circuits without calling the decoder
+        return "", 0
+class IncrementalEncoder(codecs.BufferedIncrementalEncoder):
+    """Incremental encoder that converts complete labels with `alabel` as they become available."""
+    def _buffer_encode(self, data: str, errors: str, final: bool) -> Tuple[bytes, int]:
+        """Encode the complete labels found in `data`.
+        Returns the encoded bytes and the number of input characters they account for.
+        """
+        if errors != "strict":
+            raise IDNAError('Unsupported error handling "{}"'.format(errors))
+        if not data:
+            return b"", 0
+        parts = _unicode_dots_re.split(data)
+        suffix = b""
+        if parts:
+            ends_with_dot = not parts[-1]
+            if ends_with_dot or not final:
+                # Either drop the empty piece after a trailing dot, or
+                # keep potentially unfinished label until the next call
+                parts.pop()
+                if ends_with_dot or parts:
+                    suffix = b"."
+        encoded_labels = []
+        consumed = 0
+        for part in parts:
+            encoded_labels.append(alabel(part))
+            # One extra character for the separator before every label but the first
+            if consumed:
+                consumed += 1
+            consumed += len(part)
+        # Join with U+002E
+        encoded = b".".join(encoded_labels) + suffix
+        consumed += len(suffix)
+        return encoded, consumed
+class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
+    """Incremental decoder that converts complete labels with `ulabel` as they become available."""
+    def _buffer_decode(self, data: Any, errors: str, final: bool) -> Tuple[str, int]:
+        """Decode the complete labels found in `data`.
+        Returns the decoded text and the number of input characters it accounts for.
+        """
+        if errors != "strict":
+            raise IDNAError('Unsupported error handling "{}"'.format(errors))
+        if not data:
+            return ("", 0)
+        # Anything that is not already text is interpreted as ASCII
+        if not isinstance(data, str):
+            data = str(data, "ascii")
+        parts = _unicode_dots_re.split(data)
+        suffix = ""
+        if parts:
+            ends_with_dot = not parts[-1]
+            if ends_with_dot or not final:
+                # Either drop the empty piece after a trailing dot, or
+                # keep potentially unfinished label until the next call
+                parts.pop()
+                if ends_with_dot or parts:
+                    suffix = "."
+        decoded_labels = []
+        consumed = 0
+        for part in parts:
+            decoded_labels.append(ulabel(part))
+            # One extra character for the separator before every label but the first
+            if consumed:
+                consumed += 1
+            consumed += len(part)
+        decoded = ".".join(decoded_labels) + suffix
+        consumed += len(suffix)
+        return (decoded, consumed)
+class StreamWriter(Codec, codecs.StreamWriter):
+    """Stream writer that encodes with `Codec`; it adds nothing of its own."""
+class StreamReader(Codec, codecs.StreamReader):
+    """Stream reader that decodes with `Codec`; it adds nothing of its own."""
+def search_function(name: str) -> Optional[codecs.CodecInfo]:
+    """Codec search hook: describe the codec for the name "idna2008", return `None` otherwise."""
+    if name == "idna2008":
+        return codecs.CodecInfo(
+            name=name,
+            encode=Codec().encode,
+            decode=Codec().decode,  # type: ignore
+            incrementalencoder=IncrementalEncoder,
+            incrementaldecoder=IncrementalDecoder,
+            streamwriter=StreamWriter,
+            streamreader=StreamReader,
+        )
+    # Any other name is not ours
+    return None
+# Runs when the module is imported: afterwards codec lookups for the name "idna2008" are answered
+# by `search_function`.
+codecs.register(search_function)

idna/compat.py ADDED Viewed

	@@ -0,0 +1,15 @@

+from typing import Any, Union
+from .core import decode, encode
+def ToASCII(label: str) -> bytes:
+    """Return `encode(label)`; thin wrapper around `.core.encode`."""
+    encoded = encode(label)
+    return encoded
+def ToUnicode(label: Union[bytes, bytearray]) -> str:
+    """Return `decode(label)`; thin wrapper around `.core.decode`."""
+    decoded = decode(label)
+    return decoded
+def nameprep(s: Any) -> None:
+    """Always raise `NotImplementedError`; the argument is ignored."""
+    raise NotImplementedError("IDNA 2008 does not utilise nameprep protocol")

idna/core.py ADDED Viewed

	@@ -0,0 +1,437 @@

+import bisect
+import re
+import unicodedata
+from typing import Optional, Union
+from . import idnadata
+from .intranges import intranges_contain
+# Combining class that `valid_contextj` expects from the character preceding a joiner.
+_virama_combining_class = 9
+# Prefix that `alabel` puts in front of the Punycode output and that `ulabel` strips again.
+_alabel_prefix = b"xn--"
+# Matches any one of the label separators U+002E, U+3002, U+FF0E and U+FF61; `encode` and `decode`
+# split on it unless `strict` is set.
+_unicode_dots_re = re.compile("[\u002e\u3002\uff0e\uff61]")
+class IDNAError(UnicodeError):
+    """Root of the exception hierarchy raised for IDNA-encoding related problems."""
+class IDNABidiError(IDNAError):
+    """Raised when a label does not satisfy the bidirectional requirements."""
+class InvalidCodepoint(IDNAError):
+    """Raised when a label uses a disallowed or unallocated codepoint."""
+class InvalidCodepointContext(IDNAError):
+    """Raised when a codepoint is not valid in the context in which it is used."""
+def _combining_class(cp: int) -> int:
+    """Return the canonical combining class of the code point `cp`.
+    Raises:
+        ValueError: if the combining class is 0 and `unicodedata` has no name for the character.
+    """
+    char = chr(cp)
+    v = unicodedata.combining(char)
+    # `unicodedata.name` without a default raises its own ValueError for unnamed characters,
+    # which made the explicit check unreachable; with a default the intended message is raised.
+    if v == 0 and not unicodedata.name(char, ""):
+        raise ValueError("Unknown character in unicodedata")
+    return v
+def _is_script(cp: str, script: str) -> bool:
+    """Tell whether the character `cp` falls in the ranges stored for `script` in `idnadata.scripts`."""
+    code_point = ord(cp)
+    script_ranges = idnadata.scripts[script]
+    return intranges_contain(code_point, script_ranges)
+def _punycode(s: str) -> bytes:
+    """Return `s` encoded with Python's "punycode" codec."""
+    encoded = s.encode("punycode")
+    return encoded
+def _unot(s: int) -> str:
+    """Format the code point `s` as `U+` followed by at least four uppercase hex digits."""
+    return "U+" + format(s, "04X")
+def valid_label_length(label: Union[bytes, str]) -> bool:
+    """Return True when `label` is at most 63 units long."""
+    return not len(label) > 63
+def valid_string_length(label: Union[bytes, str], trailing_dot: bool) -> bool:
+    """Return True when `label` fits in 253 units, or 254 when `trailing_dot` is set."""
+    limit = 254 if trailing_dot else 253
+    return not len(label) > limit
+def check_bidi(label: str, check_ltr: bool = False) -> bool:
+    """Check `label` against the six Bidi rules and return True when it passes.
+    The rules are skipped (and True is returned) when the label holds no character of
+    directionality R, AL or AN, unless `check_ltr` is set.
+    Raises:
+        IDNABidiError: if a character has no known directionality or a rule is violated.
+    """
+    # Bidi rules should only be applied if string contains RTL characters
+    has_rtl_char = False
+    for position, char in enumerate(label, 1):
+        bidi_class = unicodedata.bidirectional(char)
+        if bidi_class == "":
+            # String likely comes from a newer version of Unicode
+            raise IDNABidiError("Unknown directionality in label {} at position {}".format(repr(label), position))
+        if bidi_class in ("R", "AL", "AN"):
+            has_rtl_char = True
+    if not (has_rtl_char or check_ltr):
+        return True
+    # Bidi rule 1: the first character decides the direction of the whole label
+    first_class = unicodedata.bidirectional(label[0])
+    if first_class in ("R", "AL"):
+        rtl = True
+    elif first_class == "L":
+        rtl = False
+    else:
+        raise IDNABidiError("First codepoint in label {} must be directionality L, R or AL".format(repr(label)))
+    if rtl:
+        # Bidi rules 2 and 3
+        permitted = ("R", "AL", "AN", "EN", "ES", "CS", "ET", "ON", "BN", "NSM")
+        closers = ("R", "AL", "EN", "AN")
+        violation = "Invalid direction for codepoint at position {} in a right-to-left label"
+    else:
+        # Bidi rules 5 and 6
+        permitted = ("L", "EN", "ES", "CS", "ET", "ON", "BN", "NSM")
+        closers = ("L", "EN")
+        violation = "Invalid direction for codepoint at position {} in a left-to-right label"
+    ends_properly = False
+    numeral_class = None
+    for position, char in enumerate(label, 1):
+        bidi_class = unicodedata.bidirectional(char)
+        if bidi_class not in permitted:
+            raise IDNABidiError(violation.format(position))
+        # NSM characters leave the ending state as it is
+        if bidi_class in closers:
+            ends_properly = True
+        elif bidi_class != "NSM":
+            ends_properly = False
+        # Bidi rule 4: a right-to-left label sticks to a single numeral type
+        if rtl and bidi_class in ("AN", "EN"):
+            if not numeral_class:
+                numeral_class = bidi_class
+            elif numeral_class != bidi_class:
+                raise IDNABidiError("Can not mix numeral types in a right-to-left label")
+    if not ends_properly:
+        raise IDNABidiError("Label ends with illegal codepoint directionality")
+    return True
+def check_initial_combiner(label: str) -> bool:
+    """Return True unless the first character's general category starts with "M".
+    Raises:
+        IDNAError: if the label begins with such a character.
+    """
+    major_category = unicodedata.category(label[0])[0]
+    if major_category != "M":
+        return True
+    raise IDNAError("Label begins with an illegal combining character")
+def check_hyphen_ok(label: str) -> bool:
+    """Return True when the hyphens of `label` are acceptably placed.
+    Raises:
+        IDNAError: if the 3rd and 4th characters are both hyphens, or if the label starts or
+            ends with a hyphen.
+    """
+    third_and_fourth = label[2:4]
+    if third_and_fourth == "--":
+        raise IDNAError("Label has disallowed hyphens in 3rd and 4th position")
+    first_char, last_char = label[0], label[-1]
+    if "-" in (first_char, last_char):
+        raise IDNAError("Label must not start or end with a hyphen")
+    return True
+def check_nfc(label: str) -> None:
+    """Raise `IDNAError` unless `label` is unchanged by NFC normalization."""
+    normalized = unicodedata.normalize("NFC", label)
+    if normalized != label:
+        raise IDNAError("Label must be in Normalization Form C")
+def valid_contextj(label: str, pos: int) -> bool:
+    """Decide whether the joiner at `label[pos]` (U+200C or U+200D) is allowed there.
+    Either joiner is accepted right after a character of combining class
+    `_virama_combining_class`. U+200C is additionally accepted when, skipping characters of
+    joining type T, the nearest character before it has joining type L or D and the nearest one
+    after it has joining type R or D. Any other codepoint yields False.
+    """
+    cp_value = ord(label[pos])
+    if cp_value not in (0x200C, 0x200D):
+        return False
+    if pos > 0 and _combining_class(ord(label[pos - 1])) == _virama_combining_class:
+        return True
+    if cp_value == 0x200D:
+        return False
+    def _nearest_joins(indices, accepted_types):
+        # Skip the characters of joining type T; the first other character decides
+        for index in indices:
+            joining_type = idnadata.joining_types.get(ord(label[index]))
+            if joining_type == ord("T"):
+                continue
+            return joining_type in accepted_types
+        return False
+    if not _nearest_joins(range(pos - 1, -1, -1), (ord("L"), ord("D"))):
+        return False
+    return _nearest_joins(range(pos + 1, len(label)), (ord("R"), ord("D")))
+def valid_contexto(label: str, pos: int, exception: bool = False) -> bool:
+    """Decide whether the codepoint at `label[pos]` is allowed in its context.
+    The handled codepoints are U+00B7, U+0375, U+05F3, U+05F4, U+30FB and the digit ranges
+    U+0660..U+0669 and U+06F0..U+06F9; any other codepoint yields False.
+    The `exception` argument is accepted but not used.
+    """
+    cp_value = ord(label[pos])
+    if cp_value == 0x00B7:
+        # Must sit between two U+006C characters
+        return 0 < pos < len(label) - 1 and ord(label[pos - 1]) == 0x006C and ord(label[pos + 1]) == 0x006C
+    if cp_value == 0x0375:
+        # The following character must be Greek
+        if pos < len(label) - 1 and len(label) > 1:
+            return _is_script(label[pos + 1], "Greek")
+        return False
+    if cp_value in (0x05F3, 0x05F4):
+        # The preceding character must be Hebrew
+        if pos > 0:
+            return _is_script(label[pos - 1], "Hebrew")
+        return False
+    if cp_value == 0x30FB:
+        # At least one other character must be Hiragana, Katakana or Han
+        return any(
+            cp != "\u30fb" and (_is_script(cp, "Hiragana") or _is_script(cp, "Katakana") or _is_script(cp, "Han"))
+            for cp in label
+        )
+    if 0x660 <= cp_value <= 0x669:
+        # Must not be mixed with digits from U+06F0..U+06F9
+        return not any(0x6F0 <= ord(cp) <= 0x06F9 for cp in label)
+    if 0x6F0 <= cp_value <= 0x6F9:
+        # Must not be mixed with digits from U+0660..U+0669
+        return not any(0x660 <= ord(cp) <= 0x0669 for cp in label)
+    return False
+def check_label(label: Union[str, bytes, bytearray]) -> None:
+    """Validate a single label; return None when it is acceptable.
+    Bytes input is decoded as UTF-8 first. The label then goes through `check_nfc`,
+    `check_hyphen_ok` and `check_initial_combiner`, every codepoint is checked against the
+    PVALID / CONTEXTJ / CONTEXTO classes of `idnadata`, and finally `check_bidi` is applied.
+    Raises:
+        IDNAError: if the label is empty, if one of the checks above fails, or if a codepoint
+            next to a joiner is unknown.
+        InvalidCodepointContext: if a CONTEXTJ or CONTEXTO codepoint is not allowed where it is.
+        InvalidCodepoint: if a codepoint belongs to none of the three classes.
+    """
+    if isinstance(label, (bytes, bytearray)):
+        label = label.decode("utf-8")
+    if len(label) == 0:
+        raise IDNAError("Empty Label")
+    check_nfc(label)
+    check_hyphen_ok(label)
+    check_initial_combiner(label)
+    for pos, cp in enumerate(label):
+        cp_value = ord(cp)
+        if intranges_contain(cp_value, idnadata.codepoint_classes["PVALID"]):
+            continue
+        elif intranges_contain(cp_value, idnadata.codepoint_classes["CONTEXTJ"]):
+            # Only the lookup is guarded. IDNAError derives from UnicodeError, which is itself a
+            # ValueError, so raising InvalidCodepointContext inside the `try` would be caught by
+            # the handler and reported as an unknown codepoint instead.
+            try:
+                joiner_allowed = valid_contextj(label, pos)
+            except ValueError as exc:
+                raise IDNAError(
+                    "Unknown codepoint adjacent to joiner {} at position {} in {}".format(
+                        _unot(cp_value), pos + 1, repr(label)
+                    )
+                ) from exc
+            if not joiner_allowed:
+                raise InvalidCodepointContext(
+                    "Joiner {} not allowed at position {} in {}".format(_unot(cp_value), pos + 1, repr(label))
+                )
+        elif intranges_contain(cp_value, idnadata.codepoint_classes["CONTEXTO"]):
+            if not valid_contexto(label, pos):
+                raise InvalidCodepointContext(
+                    "Codepoint {} not allowed at position {} in {}".format(_unot(cp_value), pos + 1, repr(label))
+                )
+        else:
+            raise InvalidCodepoint(
+                "Codepoint {} at position {} of {} not allowed".format(_unot(cp_value), pos + 1, repr(label))
+            )
+    check_bidi(label)
+def alabel(label: str) -> bytes:
+    """Return the ASCII form of `label`.
+    A label that is already ASCII is validated through `ulabel` and returned as bytes; any
+    other label is validated with `check_label` and returned as `_alabel_prefix` followed by
+    its Punycode form.
+    Raises:
+        IDNAError: if the result does not pass `valid_label_length`, or if validation fails.
+    """
+    try:
+        ascii_label = label.encode("ascii")
+        ulabel(ascii_label)
+        if valid_label_length(ascii_label):
+            return ascii_label
+        raise IDNAError("Label too long")
+    except UnicodeEncodeError:
+        # Not pure ASCII: fall through to the Punycode path
+        pass
+    check_label(label)
+    prefixed = _alabel_prefix + _punycode(label)
+    if valid_label_length(prefixed):
+        return prefixed
+    raise IDNAError("Label too long")
+def ulabel(label: Union[str, bytes, bytearray]) -> str:
+    """Return the text form of `label`.
+    Text that is not pure ASCII is validated with `check_label` and returned unchanged. ASCII
+    input is lowercased; when it starts with `_alabel_prefix` the rest is decoded from
+    Punycode and validated, otherwise the lowercased label itself is validated and returned.
+    Raises:
+        IDNAError: for an empty or hyphen-terminated payload after the prefix, for an
+            undecodable payload, or when validation fails.
+    """
+    if isinstance(label, (bytes, bytearray)):
+        raw = bytes(label)
+    else:
+        try:
+            raw = label.encode("ascii")
+        except UnicodeEncodeError:
+            check_label(label)
+            return label
+    raw = raw.lower()
+    if not raw.startswith(_alabel_prefix):
+        check_label(raw)
+        return raw.decode("ascii")
+    payload = raw[len(_alabel_prefix) :]
+    if not payload:
+        raise IDNAError("Malformed A-label, no Punycode eligible content found")
+    if payload.decode("ascii")[-1] == "-":
+        raise IDNAError("A-label must not end with a hyphen")
+    try:
+        decoded = payload.decode("punycode")
+    except UnicodeError:
+        raise IDNAError("Invalid A-label")
+    check_label(decoded)
+    return decoded
+def uts46_remap(domain: str, std3_rules: bool = True, transitional: bool = False) -> str:
+    """Re-map the characters in the string according to UTS46 processing.
+    Each character is looked up in `uts46data` and, depending on the status of its row, is kept,
+    replaced by the row's replacement text, or dropped (status "I"). The result is NFC-normalized.
+    Raises:
+        InvalidCodepoint: if a character has no usable row or its status permits none of the above.
+    """
+    from .uts46data import uts46data
+    pieces = []
+    for pos, char in enumerate(domain):
+        code_point = ord(char)
+        try:
+            # The first 256 code points are indexed directly, the others through bisection
+            if code_point < 256:
+                row_index = code_point
+            else:
+                row_index = bisect.bisect_left(uts46data, (code_point, "Z")) - 1
+            uts46row = uts46data[row_index]
+            status = uts46row[1]
+            replacement = uts46row[2] if len(uts46row) == 3 else None
+            keep_char = (
+                status == "V"
+                or (status == "D" and not transitional)
+                or (status == "3" and not std3_rules and replacement is None)
+            )
+            if keep_char:
+                pieces.append(char)
+            elif replacement is not None and (
+                status == "M" or (status == "3" and not std3_rules) or (status == "D" and transitional)
+            ):
+                pieces.append(replacement)
+            elif status != "I":
+                # Handled below together with a failed table lookup
+                raise IndexError()
+        except IndexError:
+            raise InvalidCodepoint(
+                "Codepoint {} not allowed at position {} in {}".format(_unot(code_point), pos + 1, repr(domain))
+            )
+    return unicodedata.normalize("NFC", "".join(pieces))
+def encode(
+    s: Union[str, bytes, bytearray],
+    strict: bool = False,
+    uts46: bool = False,
+    std3_rules: bool = False,
+    transitional: bool = False,
+) -> bytes:
+    """Encode a domain name label by label with `alabel` and join the results with ".".
+    Non-text input is first decoded as ASCII. With `uts46` the text goes through `uts46_remap`.
+    With `strict` only "." separates labels, otherwise every separator matched by
+    `_unicode_dots_re` does. A trailing separator is preserved in the output.
+    Raises:
+        IDNAError: for non-ASCII bytes input, an empty domain, an empty label, or a result that
+            does not pass `valid_string_length`.
+    """
+    if not isinstance(s, str):
+        try:
+            s = str(s, "ascii")
+        except UnicodeDecodeError:
+            raise IDNAError("should pass a unicode string to the function rather than a byte string.")
+    if uts46:
+        s = uts46_remap(s, std3_rules, transitional)
+    labels = s.split(".") if strict else _unicode_dots_re.split(s)
+    if not labels or labels == [""]:
+        raise IDNAError("Empty domain")
+    trailing_dot = labels[-1] == ""
+    if trailing_dot:
+        labels.pop()
+    encoded_labels = []
+    for label in labels:
+        encoded_label = alabel(label)
+        if not encoded_label:
+            raise IDNAError("Empty label")
+        encoded_labels.append(encoded_label)
+    if trailing_dot:
+        # The empty last element makes the join end with a dot
+        encoded_labels.append(b"")
+    encoded_domain = b".".join(encoded_labels)
+    if valid_string_length(encoded_domain, trailing_dot):
+        return encoded_domain
+    raise IDNAError("Domain too long")
+def decode(
+    s: Union[str, bytes, bytearray],
+    strict: bool = False,
+    uts46: bool = False,
+    std3_rules: bool = False,
+) -> str:
+    """Decode a domain name label by label with `ulabel` and join the results with ".".
+    Non-text input is first decoded as ASCII. With `uts46` the text goes through `uts46_remap`.
+    With `strict` only "." separates labels, otherwise every separator matched by
+    `_unicode_dots_re` does. A trailing separator is preserved in the output.
+    Raises:
+        IDNAError: for non-ASCII bytes input, an empty domain or an empty label.
+    """
+    try:
+        if not isinstance(s, str):
+            s = str(s, "ascii")
+    except UnicodeDecodeError:
+        raise IDNAError("Invalid ASCII in A-label")
+    if uts46:
+        s = uts46_remap(s, std3_rules, False)
+    labels = s.split(".") if strict else _unicode_dots_re.split(s)
+    if not labels or labels == [""]:
+        raise IDNAError("Empty domain")
+    trailing_dot = not labels[-1]
+    if trailing_dot:
+        labels.pop()
+    decoded_labels = []
+    for label in labels:
+        decoded_label = ulabel(label)
+        if not decoded_label:
+            raise IDNAError("Empty label")
+        decoded_labels.append(decoded_label)
+    if trailing_dot:
+        # The empty last element makes the join end with a dot
+        decoded_labels.append("")
+    return ".".join(decoded_labels)

idna/idnadata.py ADDED Viewed

	@@ -0,0 +1,4309 @@

+# This file is automatically generated by tools/idna-data
+__version__ = "16.0.0"
+scripts = {
+    "Greek": (
+        0x37000000374,
+        0x37500000378,
+        0x37A0000037E,
+        0x37F00000380,
+        0x38400000385,
+        0x38600000387,
+        0x3880000038B,
+        0x38C0000038D,
+        0x38E000003A2,
+        0x3A3000003E2,
+        0x3F000000400,
+        0x1D2600001D2B,
+        0x1D5D00001D62,
+        0x1D6600001D6B,
+        0x1DBF00001DC0,
+        0x1F0000001F16,
+        0x1F1800001F1E,
+        0x1F2000001F46,
+        0x1F4800001F4E,
+        0x1F5000001F58,
+        0x1F5900001F5A,
+        0x1F5B00001F5C,
+        0x1F5D00001F5E,
+        0x1F5F00001F7E,
+        0x1F8000001FB5,
+        0x1FB600001FC5,
+        0x1FC600001FD4,
+        0x1FD600001FDC,
+        0x1FDD00001FF0,
+        0x1FF200001FF5,
+        0x1FF600001FFF,
+        0x212600002127,
+        0xAB650000AB66,
+        0x101400001018F,
+        0x101A0000101A1,
+        0x1D2000001D246,
+    ),
+    "Han": (
+        0x2E8000002E9A,
+        0x2E9B00002EF4,
+        0x2F0000002FD6,
+        0x300500003006,
+        0x300700003008,
+        0x30210000302A,
+        0x30380000303C,
+        0x340000004DC0,
+        0x4E000000A000,
+        0xF9000000FA6E,
+        0xFA700000FADA,
+        0x16FE200016FE4,
+        0x16FF000016FF2,
+        0x200000002A6E0,
+        0x2A7000002B73A,
+        0x2B7400002B81E,
+        0x2B8200002CEA2,
+        0x2CEB00002EBE1,
+        0x2EBF00002EE5E,
+        0x2F8000002FA1E,
+        0x300000003134B,
+        0x31350000323B0,
+    ),
+    "Hebrew": (
+        0x591000005C8,
+        0x5D0000005EB,
+        0x5EF000005F5,
+        0xFB1D0000FB37,
+        0xFB380000FB3D,
+        0xFB3E0000FB3F,
+        0xFB400000FB42,
+        0xFB430000FB45,
+        0xFB460000FB50,
+    ),
+    "Hiragana": (
+        0x304100003097,
+        0x309D000030A0,
+        0x1B0010001B120,
+        0x1B1320001B133,
+        0x1B1500001B153,
+        0x1F2000001F201,
+    ),
+    "Katakana": (
+        0x30A1000030FB,
+        0x30FD00003100,
+        0x31F000003200,
+        0x32D0000032FF,
+        0x330000003358,
+        0xFF660000FF70,
+        0xFF710000FF9E,
+        0x1AFF00001AFF4,
+        0x1AFF50001AFFC,
+        0x1AFFD0001AFFF,
+        0x1B0000001B001,
+        0x1B1200001B123,
+        0x1B1550001B156,
+        0x1B1640001B168,
+    ),
+}
+joining_types = {
+    0xAD: 84,
+    0x300: 84,
+    0x301: 84,
+    0x302: 84,
+    0x303: 84,
+    0x304: 84,
+    0x305: 84,
+    0x306: 84,
+    0x307: 84,
+    0x308: 84,
+    0x309: 84,
+    0x30A: 84,
+    0x30B: 84,
+    0x30C: 84,
+    0x30D: 84,
+    0x30E: 84,
+    0x30F: 84,
+    0x310: 84,
+    0x311: 84,
+    0x312: 84,
+    0x313: 84,
+    0x314: 84,
+    0x315: 84,
+    0x316: 84,
+    0x317: 84,
+    0x318: 84,
+    0x319: 84,
+    0x31A: 84,
+    0x31B: 84,
+    0x31C: 84,
+    0x31D: 84,
+    0x31E: 84,
+    0x31F: 84,
+    0x320: 84,
+    0x321: 84,
+    0x322: 84,
+    0x323: 84,
+    0x324: 84,
+    0x325: 84,
+    0x326: 84,
+    0x327: 84,
+    0x328: 84,
+    0x329: 84,
+    0x32A: 84,
+    0x32B: 84,
+    0x32C: 84,
+    0x32D: 84,
+    0x32E: 84,
+    0x32F: 84,
+    0x330: 84,
+    0x331: 84,
+    0x332: 84,
+    0x333: 84,
+    0x334: 84,
+    0x335: 84,
+    0x336: 84,
+    0x337: 84,
+    0x338: 84,
+    0x339: 84,
+    0x33A: 84,
+    0x33B: 84,
+    0x33C: 84,
+    0x33D: 84,
+    0x33E: 84,
+    0x33F: 84,
+    0x340: 84,
+    0x341: 84,
+    0x342: 84,
+    0x343: 84,
+    0x344: 84,
+    0x345: 84,
+    0x346: 84,
+    0x347: 84,
+    0x348: 84,
+    0x349: 84,
+    0x34A: 84,
+    0x34B: 84,
+    0x34C: 84,
+    0x34D: 84,
+    0x34E: 84,
+    0x34F: 84,
+    0x350: 84,
+    0x351: 84,
+    0x352: 84,
+    0x353: 84,
+    0x354: 84,
+    0x355: 84,
+    0x356: 84,
+    0x357: 84,
+    0x358: 84,
+    0x359: 84,
+    0x35A: 84,
+    0x35B: 84,
+    0x35C: 84,
+    0x35D: 84,
+    0x35E: 84,
+    0x35F: 84,
+    0x360: 84,
+    0x361: 84,
+    0x362: 84,
+    0x363: 84,
+    0x364: 84,
+    0x365: 84,
+    0x366: 84,
+    0x367: 84,
+    0x368: 84,
+    0x369: 84,
+    0x36A: 84,
+    0x36B: 84,
+    0x36C: 84,
+    0x36D: 84,
+    0x36E: 84,
+    0x36F: 84,
+    0x483: 84,
+    0x484: 84,
+    0x485: 84,
+    0x486: 84,
+    0x487: 84,
+    0x488: 84,
+    0x489: 84,
+    0x591: 84,
+    0x592: 84,
+    0x593: 84,
+    0x594: 84,
+    0x595: 84,
+    0x596: 84,
+    0x597: 84,
+    0x598: 84,
+    0x599: 84,
+    0x59A: 84,
+    0x59B: 84,
+    0x59C: 84,
+    0x59D: 84,
+    0x59E: 84,
+    0x59F: 84,
+    0x5A0: 84,
+    0x5A1: 84,
+    0x5A2: 84,
+    0x5A3: 84,
+    0x5A4: 84,
+    0x5A5: 84,
+    0x5A6: 84,
+    0x5A7: 84,
+    0x5A8: 84,
+    0x5A9: 84,
+    0x5AA: 84,
+    0x5AB: 84,
+    0x5AC: 84,
+    0x5AD: 84,
+    0x5AE: 84,
+    0x5AF: 84,
+    0x5B0: 84,
+    0x5B1: 84,
+    0x5B2: 84,
+    0x5B3: 84,
+    0x5B4: 84,
+    0x5B5: 84,
+    0x5B6: 84,
+    0x5B7: 84,
+    0x5B8: 84,
+    0x5B9: 84,
+    0x5BA: 84,
+    0x5BB: 84,
+    0x5BC: 84,
+    0x5BD: 84,
+    0x5BF: 84,
+    0x5C1: 84,
+    0x5C2: 84,
+    0x5C4: 84,
+    0x5C5: 84,
+    0x5C7: 84,
+    0x610: 84,
+    0x611: 84,
+    0x612: 84,
+    0x613: 84,
+    0x614: 84,
+    0x615: 84,
+    0x616: 84,
+    0x617: 84,
+    0x618: 84,
+    0x619: 84,
+    0x61A: 84,
+    0x61C: 84,
+    0x620: 68,
+    0x622: 82,
+    0x623: 82,
+    0x624: 82,
+    0x625: 82,
+    0x626: 68,
+    0x627: 82,
+    0x628: 68,
+    0x629: 82,
+    0x62A: 68,
+    0x62B: 68,
+    0x62C: 68,
+    0x62D: 68,
+    0x62E: 68,
+    0x62F: 82,
+    0x630: 82,
+    0x631: 82,
+    0x632: 82,
+    0x633: 68,
+    0x634: 68,
+    0x635: 68,
+    0x636: 68,
+    0x637: 68,
+    0x638: 68,
+    0x639: 68,
+    0x63A: 68,
+    0x63B: 68,
+    0x63C: 68,
+    0x63D: 68,
+    0x63E: 68,
+    0x63F: 68,
+    0x640: 67,
+    0x641: 68,
+    0x642: 68,
+    0x643: 68,
+    0x644: 68,
+    0x645: 68,
+    0x646: 68,
+    0x647: 68,
+    0x648: 82,
+    0x649: 68,
+    0x64A: 68,
+    0x64B: 84,
+    0x64C: 84,
+    0x64D: 84,
+    0x64E: 84,
+    0x64F: 84,
+    0x650: 84,
+    0x651: 84,
+    0x652: 84,
+    0x653: 84,
+    0x654: 84,
+    0x655: 84,
+    0x656: 84,
+    0x657: 84,
+    0x658: 84,
+    0x659: 84,
+    0x65A: 84,
+    0x65B: 84,
+    0x65C: 84,
+    0x65D: 84,
+    0x65E: 84,
+    0x65F: 84,
+    0x66E: 68,
+    0x66F: 68,
+    0x670: 84,
+    0x671: 82,
+    0x672: 82,
+    0x673: 82,
+    0x675: 82,
+    0x676: 82,
+    0x677: 82,
+    0x678: 68,
+    0x679: 68,
+    0x67A: 68,
+    0x67B: 68,
+    0x67C: 68,
+    0x67D: 68,
+    0x67E: 68,
+    0x67F: 68,
+    0x680: 68,
+    0x681: 68,
+    0x682: 68,
+    0x683: 68,
+    0x684: 68,
+    0x685: 68,
+    0x686: 68,
+    0x687: 68,
+    0x688: 82,
+    0x689: 82,
+    0x68A: 82,
+    0x68B: 82,
+    0x68C: 82,
+    0x68D: 82,
+    0x68E: 82,
+    0x68F: 82,
+    0x690: 82,
+    0x691: 82,
+    0x692: 82,
+    0x693: 82,
+    0x694: 82,
+    0x695: 82,
+    0x696: 82,
+    0x697: 82,
+    0x698: 82,
+    0x699: 82,
+    0x69A: 68,
+    0x69B: 68,
+    0x69C: 68,
+    0x69D: 68,
+    0x69E: 68,
+    0x69F: 68,
+    0x6A0: 68,
+    0x6A1: 68,
+    0x6A2: 68,
+    0x6A3: 68,
+    0x6A4: 68,
+    0x6A5: 68,
+    0x6A6: 68,
+    0x6A7: 68,
+    0x6A8: 68,
+    0x6A9: 68,
+    0x6AA: 68,
+    0x6AB: 68,
+    0x6AC: 68,
+    0x6AD: 68,
+    0x6AE: 68,
+    0x6AF: 68,
+    0x6B0: 68,
+    0x6B1: 68,
+    0x6B2: 68,
+    0x6B3: 68,
+    0x6B4: 68,
+    0x6B5: 68,
+    0x6B6: 68,
+    0x6B7: 68,
+    0x6B8: 68,
+    0x6B9: 68,
+    0x6BA: 68,
+    0x6BB: 68,
+    0x6BC: 68,
+    0x6BD: 68,
+    0x6BE: 68,
+    0x6BF: 68,
+    0x6C0: 82,
+    0x6C1: 68,
+    0x6C2: 68,
+    0x6C3: 82,
+    0x6C4: 82,
+    0x6C5: 82,
+    0x6C6: 82,
+    0x6C7: 82,
+    0x6C8: 82,
+    0x6C9: 82,
+    0x6CA: 82,
+    0x6CB: 82,
+    0x6CC: 68,
+    0x6CD: 82,
+    0x6CE: 68,
+    0x6CF: 82,
+    0x6D0: 68,
+    0x6D1: 68,
+    0x6D2: 82,
+    0x6D3: 82,
+    0x6D5: 82,
+    0x6D6: 84,
+    0x6D7: 84,
+    0x6D8: 84,
+    0x6D9: 84,
+    0x6DA: 84,
+    0x6DB: 84,
+    0x6DC: 84,
+    0x6DF: 84,
+    0x6E0: 84,
+    0x6E1: 84,
+    0x6E2: 84,
+    0x6E3: 84,
+    0x6E4: 84,
+    0x6E7: 84,
+    0x6E8: 84,
+    0x6EA: 84,
+    0x6EB: 84,
+    0x6EC: 84,
+    0x6ED: 84,
+    0x6EE: 82,
+    0x6EF: 82,
+    0x6FA: 68,
+    0x6FB: 68,
+    0x6FC: 68,
+    0x6FF: 68,
+    0x70F: 84,
+    0x710: 82,
+    0x711: 84,
+    0x712: 68,
+    0x713: 68,
+    0x714: 68,
+    0x715: 82,
+    0x716: 82,
+    0x717: 82,
+    0x718: 82,
+    0x719: 82,
+    0x71A: 68,
+    0x71B: 68,
+    0x71C: 68,
+    0x71D: 68,
+    0x71E: 82,
+    0x71F: 68,
+    0x720: 68,
+    0x721: 68,
+    0x722: 68,
+    0x723: 68,
+    0x724: 68,
+    0x725: 68,
+    0x726: 68,
+    0x727: 68,
+    0x728: 82,
+    0x729: 68,
+    0x72A: 82,
+    0x72B: 68,
+    0x72C: 82,
+    0x72D: 68,
+    0x72E: 68,
+    0x72F: 82,
+    0x730: 84,
+    0x731: 84,
+    0x732: 84,
+    0x733: 84,
+    0x734: 84,
+    0x735: 84,
+    0x736: 84,
+    0x737: 84,
+    0x738: 84,
+    0x739: 84,
+    0x73A: 84,
+    0x73B: 84,
+    0x73C: 84,
+    0x73D: 84,
+    0x73E: 84,
+    0x73F: 84,
+    0x740: 84,
+    0x741: 84,
+    0x742: 84,
+    0x743: 84,
+    0x744: 84,
+    0x745: 84,
+    0x746: 84,
+    0x747: 84,
+    0x748: 84,
+    0x749: 84,
+    0x74A: 84,
+    0x74D: 82,
+    0x74E: 68,
+    0x74F: 68,
+    0x750: 68,
+    0x751: 68,
+    0x752: 68,
+    0x753: 68,
+    0x754: 68,
+    0x755: 68,
+    0x756: 68,
+    0x757: 68,
+    0x758: 68,
+    0x759: 82,
+    0x75A: 82,
+    0x75B: 82,
+    0x75C: 68,
+    0x75D: 68,
+    0x75E: 68,
+    0x75F: 68,
+    0x760: 68,
+    0x761: 68,
+    0x762: 68,
+    0x763: 68,
+    0x764: 68,
+    0x765: 68,
+    0x766: 68,
+    0x767: 68,
+    0x768: 68,
+    0x769: 68,
+    0x76A: 68,
+    0x76B: 82,
+    0x76C: 82,
+    0x76D: 68,
+    0x76E: 68,
+    0x76F: 68,
+    0x770: 68,
+    0x771: 82,
+    0x772: 68,
+    0x773: 82,
+    0x774: 82,
+    0x775: 68,
+    0x776: 68,
+    0x777: 68,
+    0x778: 82,
+    0x779: 82,
+    0x77A: 68,
+    0x77B: 68,
+    0x77C: 68,
+    0x77D: 68,
+    0x77E: 68,
+    0x77F: 68,
+    0x7A6: 84,
+    0x7A7: 84,
+    0x7A8: 84,
+    0x7A9: 84,
+    0x7AA: 84,
+    0x7AB: 84,
+    0x7AC: 84,
+    0x7AD: 84,
+    0x7AE: 84,
+    0x7AF: 84,
+    0x7B0: 84,
+    0x7CA: 68,
+    0x7CB: 68,
+    0x7CC: 68,
+    0x7CD: 68,
+    0x7CE: 68,
+    0x7CF: 68,
+    0x7D0: 68,
+    0x7D1: 68,
+    0x7D2: 68,
+    0x7D3: 68,
+    0x7D4: 68,
+    0x7D5: 68,
+    0x7D6: 68,
+    0x7D7: 68,
+    0x7D8: 68,
+    0x7D9: 68,
+    0x7DA: 68,
+    0x7DB: 68,
+    0x7DC: 68,
+    0x7DD: 68,
+    0x7DE: 68,
+    0x7DF: 68,
+    0x7E0: 68,
+    0x7E1: 68,
+    0x7E2: 68,
+    0x7E3: 68,
+    0x7E4: 68,
+    0x7E5: 68,
+    0x7E6: 68,
+    0x7E7: 68,
+    0x7E8: 68,
+    0x7E9: 68,
+    0x7EA: 68,
+    0x7EB: 84,
+    0x7EC: 84,
+    0x7ED: 84,
+    0x7EE: 84,
+    0x7EF: 84,
+    0x7F0: 84,
+    0x7F1: 84,
+    0x7F2: 84,
+    0x7F3: 84,
+    0x7FA: 67,
+    0x7FD: 84,
+    0x816: 84,
+    0x817: 84,
+    0x818: 84,
+    0x819: 84,
+    0x81B: 84,
+    0x81C: 84,
+    0x81D: 84,
+    0x81E: 84,
+    0x81F: 84,
+    0x820: 84,
+    0x821: 84,
+    0x822: 84,
+    0x823: 84,
+    0x825: 84,
+    0x826: 84,
+    0x827: 84,
+    0x829: 84,
+    0x82A: 84,
+    0x82B: 84,
+    0x82C: 84,
+    0x82D: 84,
+    0x840: 82,
+    0x841: 68,
+    0x842: 68,
+    0x843: 68,
+    0x844: 68,
+    0x845: 68,
+    0x846: 82,
+    0x847: 82,
+    0x848: 68,
+    0x849: 82,
+    0x84A: 68,
+    0x84B: 68,
+    0x84C: 68,
+    0x84D: 68,
+    0x84E: 68,
+    0x84F: 68,
+    0x850: 68,
+    0x851: 68,
+    0x852: 68,
+    0x853: 68,
+    0x854: 82,
+    0x855: 68,
+    0x856: 82,
+    0x857: 82,
+    0x858: 82,
+    0x859: 84,
+    0x85A: 84,
+    0x85B: 84,
+    0x860: 68,
+    0x862: 68,
+    0x863: 68,
+    0x864: 68,
+    0x865: 68,
+    0x867: 82,
+    0x868: 68,
+    0x869: 82,
+    0x86A: 82,
+    0x870: 82,
+    0x871: 82,
+    0x872: 82,
+    0x873: 82,
+    0x874: 82,
+    0x875: 82,
+    0x876: 82,
+    0x877: 82,
+    0x878: 82,
+    0x879: 82,
+    0x87A: 82,
+    0x87B: 82,
+    0x87C: 82,
+    0x87D: 82,
+    0x87E: 82,
+    0x87F: 82,
+    0x880: 82,
+    0x881: 82,
+    0x882: 82,
+    0x883: 67,
+    0x884: 67,
+    0x885: 67,
+    0x886: 68,
+    0x889: 68,
+    0x88A: 68,
+    0x88B: 68,
+    0x88C: 68,
+    0x88D: 68,
+    0x88E: 82,
+    0x897: 84,
+    0x898: 84,
+    0x899: 84,
+    0x89A: 84,
+    0x89B: 84,
+    0x89C: 84,
+    0x89D: 84,
+    0x89E: 84,
+    0x89F: 84,
+    0x8A0: 68,
+    0x8A1: 68,
+    0x8A2: 68,
+    0x8A3: 68,
+    0x8A4: 68,
+    0x8A5: 68,
+    0x8A6: 68,
+    0x8A7: 68,
+    0x8A8: 68,
+    0x8A9: 68,
+    0x8AA: 82,
+    0x8AB: 82,
+    0x8AC: 82,
+    0x8AE: 82,
+    0x8AF: 68,
+    0x8B0: 68,
+    0x8B1: 82,
+    0x8B2: 82,
+    0x8B3: 68,
+    0x8B4: 68,
+    0x8B5: 68,
+    0x8B6: 68,
+    0x8B7: 68,
+    0x8B8: 68,
+    0x8B9: 82,
+    0x8BA: 68,
+    0x8BB: 68,
+    0x8BC: 68,
+    0x8BD: 68,
+    0x8BE: 68,
+    0x8BF: 68,
+    0x8C0: 68,
+    0x8C1: 68,
+    0x8C2: 68,
+    0x8C3: 68,
+    0x8C4: 68,
+    0x8C5: 68,
+    0x8C6: 68,
+    0x8C7: 68,
+    0x8C8: 68,
+    0x8CA: 84,
+    0x8CB: 84,
+    0x8CC: 84,
+    0x8CD: 84,
+    0x8CE: 84,
+    0x8CF: 84,
+    0x8D0: 84,
+    0x8D1: 84,
+    0x8D2: 84,
+    0x8D3: 84,
+    0x8D4: 84,
+    0x8D5: 84,
+    0x8D6: 84,
+    0x8D7: 84,
+    0x8D8: 84,
+    0x8D9: 84,
+    0x8DA: 84,
+    0x8DB: 84,
+    0x8DC: 84,
+    0x8DD: 84,
+    0x8DE: 84,
+    0x8DF: 84,
+    0x8E0: 84,
+    0x8E1: 84,
+    0x8E3: 84,
+    0x8E4: 84,
+    0x8E5: 84,
+    0x8E6: 84,
+    0x8E7: 84,
+    0x8E8: 84,
+    0x8E9: 84,
+    0x8EA: 84,
+    0x8EB: 84,
+    0x8EC: 84,
+    0x8ED: 84,
+    0x8EE: 84,
+    0x8EF: 84,
+    0x8F0: 84,
+    0x8F1: 84,
+    0x8F2: 84,
+    0x8F3: 84,
+    0x8F4: 84,
+    0x8F5: 84,
+    0x8F6: 84,
+    0x8F7: 84,
+    0x8F8: 84,
+    0x8F9: 84,
+    0x8FA: 84,
+    0x8FB: 84,
+    0x8FC: 84,
+    0x8FD: 84,
+    0x8FE: 84,
+    0x8FF: 84,
+    0x900: 84,
+    0x901: 84,
+    0x902: 84,
+    0x93A: 84,
+    0x93C: 84,
+    0x941: 84,
+    0x942: 84,
+    0x943: 84,
+    0x944: 84,
+    0x945: 84,
+    0x946: 84,
+    0x947: 84,
+    0x948: 84,
+    0x94D: 84,
+    0x951: 84,
+    0x952: 84,
+    0x953: 84,
+    0x954: 84,
+    0x955: 84,
+    0x956: 84,
+    0x957: 84,
+    0x962: 84,
+    0x963: 84,
+    0x981: 84,
+    0x9BC: 84,
+    0x9C1: 84,
+    0x9C2: 84,
+    0x9C3: 84,
+    0x9C4: 84,
+    0x9CD: 84,
+    0x9E2: 84,
+    0x9E3: 84,
+    0x9FE: 84,
+    0xA01: 84,
+    0xA02: 84,
+    0xA3C: 84,
+    0xA41: 84,
+    0xA42: 84,
+    0xA47: 84,
+    0xA48: 84,
+    0xA4B: 84,
+    0xA4C: 84,
+    0xA4D: 84,
+    0xA51: 84,
+    0xA70: 84,
+    0xA71: 84,
+    0xA75: 84,
+    0xA81: 84,
+    0xA82: 84,
+    0xABC: 84,
+    0xAC1: 84,
+    0xAC2: 84,
+    0xAC3: 84,
+    0xAC4: 84,
+    0xAC5: 84,
+    0xAC7: 84,
+    0xAC8: 84,
+    0xACD: 84,
+    0xAE2: 84,
+    0xAE3: 84,
+    0xAFA: 84,
+    0xAFB: 84,
+    0xAFC: 84,
+    0xAFD: 84,
+    0xAFE: 84,
+    0xAFF: 84,
+    0xB01: 84,
+    0xB3C: 84,
+    0xB3F: 84,
+    0xB41: 84,
+    0xB42: 84,
+    0xB43: 84,
+    0xB44: 84,
+    0xB4D: 84,
+    0xB55: 84,
+    0xB56: 84,
+    0xB62: 84,
+    0xB63: 84,
+    0xB82: 84,
+    0xBC0: 84,
+    0xBCD: 84,
+    0xC00: 84,
+    0xC04: 84,
+    0xC3C: 84,
+    0xC3E: 84,
+    0xC3F: 84,
+    0xC40: 84,
+    0xC46: 84,
+    0xC47: 84,
+    0xC48: 84,
+    0xC4A: 84,
+    0xC4B: 84,
+    0xC4C: 84,
+    0xC4D: 84,
+    0xC55: 84,
+    0xC56: 84,
+    0xC62: 84,
+    0xC63: 84,
+    0xC81: 84,
+    0xCBC: 84,
+    0xCBF: 84,
+    0xCC6: 84,
+    0xCCC: 84,
+    0xCCD: 84,
+    0xCE2: 84,
+    0xCE3: 84,
+    0xD00: 84,
+    0xD01: 84,
+    0xD3B: 84,
+    0xD3C: 84,
+    0xD41: 84,
+    0xD42: 84,
+    0xD43: 84,
+    0xD44: 84,
+    0xD4D: 84,
+    0xD62: 84,
+    0xD63: 84,
+    0xD81: 84,
+    0xDCA: 84,
+    0xDD2: 84,
+    0xDD3: 84,
+    0xDD4: 84,
+    0xDD6: 84,
+    0xE31: 84,
+    0xE34: 84,
+    0xE35: 84,
+    0xE36: 84,
+    0xE37: 84,
+    0xE38: 84,
+    0xE39: 84,
+    0xE3A: 84,
+    0xE47: 84,
+    0xE48: 84,
+    0xE49: 84,
+    0xE4A: 84,
+    0xE4B: 84,
+    0xE4C: 84,
+    0xE4D: 84,
+    0xE4E: 84,
+    0xEB1: 84,
+    0xEB4: 84,
+    0xEB5: 84,
+    0xEB6: 84,
+    0xEB7: 84,
+    0xEB8: 84,
+    0xEB9: 84,
+    0xEBA: 84,
+    0xEBB: 84,
+    0xEBC: 84,
+    0xEC8: 84,
+    0xEC9: 84,
+    0xECA: 84,
+    0xECB: 84,
+    0xECC: 84,
+    0xECD: 84,
+    0xECE: 84,
+    0xF18: 84,
+    0xF19: 84,
+    0xF35: 84,
+    0xF37: 84,
+    0xF39: 84,
+    0xF71: 84,
+    0xF72: 84,
+    0xF73: 84,
+    0xF74: 84,
+    0xF75: 84,
+    0xF76: 84,
+    0xF77: 84,
+    0xF78: 84,
+    0xF79: 84,
+    0xF7A: 84,
+    0xF7B: 84,
+    0xF7C: 84,
+    0xF7D: 84,
+    0xF7E: 84,
+    0xF80: 84,
+    0xF81: 84,
+    0xF82: 84,
+    0xF83: 84,
+    0xF84: 84,
+    0xF86: 84,
+    0xF87: 84,
+    0xF8D: 84,
+    0xF8E: 84,
+    0xF8F: 84,
+    0xF90: 84,
+    0xF91: 84,
+    0xF92: 84,
+    0xF93: 84,
+    0xF94: 84,
+    0xF95: 84,
+    0xF96: 84,
+    0xF97: 84,
+    0xF99: 84,
+    0xF9A: 84,
+    0xF9B: 84,
+    0xF9C: 84,
+    0xF9D: 84,
+    0xF9E: 84,
+    0xF9F: 84,
+    0xFA0: 84,
+    0xFA1: 84,
+    0xFA2: 84,
+    0xFA3: 84,
+    0xFA4: 84,
+    0xFA5: 84,
+    0xFA6: 84,
+    0xFA7: 84,
+    0xFA8: 84,
+    0xFA9: 84,
+    0xFAA: 84,
+    0xFAB: 84,
+    0xFAC: 84,
+    0xFAD: 84,
+    0xFAE: 84,
+    0xFAF: 84,
+    0xFB0: 84,
+    0xFB1: 84,
+    0xFB2: 84,
+    0xFB3: 84,
+    0xFB4: 84,
+    0xFB5: 84,
+    0xFB6: 84,
+    0xFB7: 84,
+    0xFB8: 84,
+    0xFB9: 84,
+    0xFBA: 84,
+    0xFBB: 84,
+    0xFBC: 84,
+    0xFC6: 84,
+    0x102D: 84,
+    0x102E: 84,
+    0x102F: 84,
+    0x1030: 84,
+    0x1032: 84,
+    0x1033: 84,
+    0x1034: 84,
+    0x1035: 84,
+    0x1036: 84,
+    0x1037: 84,
+    0x1039: 84,
+    0x103A: 84,
+    0x103D: 84,
+    0x103E: 84,
+    0x1058: 84,
+    0x1059: 84,
+    0x105E: 84,
+    0x105F: 84,
+    0x1060: 84,
+    0x1071: 84,
+    0x1072: 84,
+    0x1073: 84,
+    0x1074: 84,
+    0x1082: 84,
+    0x1085: 84,
+    0x1086: 84,
+    0x108D: 84,
+    0x109D: 84,
+    0x135D: 84,
+    0x135E: 84,
+    0x135F: 84,
+    0x1712: 84,
+    0x1713: 84,
+    0x1714: 84,
+    0x1732: 84,
+    0x1733: 84,
+    0x1752: 84,
+    0x1753: 84,
+    0x1772: 84,
+    0x1773: 84,
+    0x17B4: 84,
+    0x17B5: 84,
+    0x17B7: 84,
+    0x17B8: 84,
+    0x17B9: 84,
+    0x17BA: 84,
+    0x17BB: 84,
+    0x17BC: 84,
+    0x17BD: 84,
+    0x17C6: 84,
+    0x17C9: 84,
+    0x17CA: 84,
+    0x17CB: 84,
+    0x17CC: 84,
+    0x17CD: 84,
+    0x17CE: 84,
+    0x17CF: 84,
+    0x17D0: 84,
+    0x17D1: 84,
+    0x17D2: 84,
+    0x17D3: 84,
+    0x17DD: 84,
+    0x1807: 68,
+    0x180A: 67,
+    0x180B: 84,
+    0x180C: 84,
+    0x180D: 84,
+    0x180F: 84,
+    0x1820: 68,
+    0x1821: 68,
+    0x1822: 68,
+    0x1823: 68,
+    0x1824: 68,
+    0x1825: 68,
+    0x1826: 68,
+    0x1827: 68,
+    0x1828: 68,
+    0x1829: 68,
+    0x182A: 68,
+    0x182B: 68,
+    0x182C: 68,
+    0x182D: 68,
+    0x182E: 68,
+    0x182F: 68,
+    0x1830: 68,
+    0x1831: 68,
+    0x1832: 68,
+    0x1833: 68,
+    0x1834: 68,
+    0x1835: 68,
+    0x1836: 68,
+    0x1837: 68,
+    0x1838: 68,
+    0x1839: 68,
+    0x183A: 68,
+    0x183B: 68,
+    0x183C: 68,
+    0x183D: 68,
+    0x183E: 68,
+    0x183F: 68,
+    0x1840: 68,
+    0x1841: 68,
+    0x1842: 68,
+    0x1843: 68,
+    0x1844: 68,
+    0x1845: 68,
+    0x1846: 68,
+    0x1847: 68,
+    0x1848: 68,
+    0x1849: 68,
+    0x184A: 68,
+    0x184B: 68,
+    0x184C: 68,
+    0x184D: 68,
+    0x184E: 68,
+    0x184F: 68,
+    0x1850: 68,
+    0x1851: 68,
+    0x1852: 68,
+    0x1853: 68,
+    0x1854: 68,
+    0x1855: 68,
+    0x1856: 68,
+    0x1857: 68,
+    0x1858: 68,
+    0x1859: 68,
+    0x185A: 68,
+    0x185B: 68,
+    0x185C: 68,
+    0x185D: 68,
+    0x185E: 68,
+    0x185F: 68,
+    0x1860: 68,
+    0x1861: 68,
+    0x1862: 68,
+    0x1863: 68,
+    0x1864: 68,
+    0x1865: 68,
+    0x1866: 68,
+    0x1867: 68,
+    0x1868: 68,
+    0x1869: 68,
+    0x186A: 68,
+    0x186B: 68,
+    0x186C: 68,
+    0x186D: 68,
+    0x186E: 68,
+    0x186F: 68,
+    0x1870: 68,
+    0x1871: 68,
+    0x1872: 68,
+    0x1873: 68,
+    0x1874: 68,
+    0x1875: 68,
+    0x1876: 68,
+    0x1877: 68,
+    0x1878: 68,
+    0x1885: 84,
+    0x1886: 84,
+    0x1887: 68,
+    0x1888: 68,
+    0x1889: 68,
+    0x188A: 68,
+    0x188B: 68,
+    0x188C: 68,
+    0x188D: 68,
+    0x188E: 68,
+    0x188F: 68,
+    0x1890: 68,
+    0x1891: 68,
+    0x1892: 68,
+    0x1893: 68,
+    0x1894: 68,
+    0x1895: 68,
+    0x1896: 68,
+    0x1897: 68,
+    0x1898: 68,
+    0x1899: 68,
+    0x189A: 68,
+    0x189B: 68,
+    0x189C: 68,
+    0x189D: 68,
+    0x189E: 68,
+    0x189F: 68,
+    0x18A0: 68,
+    0x18A1: 68,
+    0x18A2: 68,
+    0x18A3: 68,
+    0x18A4: 68,
+    0x18A5: 68,
+    0x18A6: 68,
+    0x18A7: 68,
+    0x18A8: 68,
+    0x18A9: 84,
+    0x18AA: 68,
+    0x1920: 84,
+    0x1921: 84,
+    0x1922: 84,
+    0x1927: 84,
+    0x1928: 84,
+    0x1932: 84,
+    0x1939: 84,
+    0x193A: 84,
+    0x193B: 84,
+    0x1A17: 84,
+    0x1A18: 84,
+    0x1A1B: 84,
+    0x1A56: 84,
+    0x1A58: 84,
+    0x1A59: 84,
+    0x1A5A: 84,
+    0x1A5B: 84,
+    0x1A5C: 84,
+    0x1A5D: 84,
+    0x1A5E: 84,
+    0x1A60: 84,
+    0x1A62: 84,
+    0x1A65: 84,
+    0x1A66: 84,
+    0x1A67: 84,
+    0x1A68: 84,
+    0x1A69: 84,
+    0x1A6A: 84,
+    0x1A6B: 84,
+    0x1A6C: 84,
+    0x1A73: 84,
+    0x1A74: 84,
+    0x1A75: 84,
+    0x1A76: 84,
+    0x1A77: 84,
+    0x1A78: 84,
+    0x1A79: 84,
+    0x1A7A: 84,
+    0x1A7B: 84,
+    0x1A7C: 84,
+    0x1A7F: 84,
+    0x1AB0: 84,
+    0x1AB1: 84,
+    0x1AB2: 84,
+    0x1AB3: 84,
+    0x1AB4: 84,
+    0x1AB5: 84,
+    0x1AB6: 84,
+    0x1AB7: 84,
+    0x1AB8: 84,
+    0x1AB9: 84,
+    0x1ABA: 84,
+    0x1ABB: 84,
+    0x1ABC: 84,
+    0x1ABD: 84,
+    0x1ABE: 84,
+    0x1ABF: 84,
+    0x1AC0: 84,
+    0x1AC1: 84,
+    0x1AC2: 84,
+    0x1AC3: 84,
+    0x1AC4: 84,
+    0x1AC5: 84,
+    0x1AC6: 84,
+    0x1AC7: 84,
+    0x1AC8: 84,
+    0x1AC9: 84,
+    0x1ACA: 84,
+    0x1ACB: 84,
+    0x1ACC: 84,
+    0x1ACD: 84,
+    0x1ACE: 84,
+    0x1B00: 84,
+    0x1B01: 84,
+    0x1B02: 84,
+    0x1B03: 84,
+    0x1B34: 84,
+    0x1B36: 84,
+    0x1B37: 84,
+    0x1B38: 84,
+    0x1B39: 84,
+    0x1B3A: 84,
+    0x1B3C: 84,
+    0x1B42: 84,
+    0x1B6B: 84,
+    0x1B6C: 84,
+    0x1B6D: 84,
+    0x1B6E: 84,
+    0x1B6F: 84,
+    0x1B70: 84,
+    0x1B71: 84,
+    0x1B72: 84,
+    0x1B73: 84,
+    0x1B80: 84,
+    0x1B81: 84,
+    0x1BA2: 84,
+    0x1BA3: 84,
+    0x1BA4: 84,
+    0x1BA5: 84,
+    0x1BA8: 84,
+    0x1BA9: 84,
+    0x1BAB: 84,
+    0x1BAC: 84,
+    0x1BAD: 84,
+    0x1BE6: 84,
+    0x1BE8: 84,
+    0x1BE9: 84,
+    0x1BED: 84,
+    0x1BEF: 84,
+    0x1BF0: 84,
+    0x1BF1: 84,
+    0x1C2C: 84,
+    0x1C2D: 84,
+    0x1C2E: 84,
+    0x1C2F: 84,
+    0x1C30: 84,
+    0x1C31: 84,
+    0x1C32: 84,
+    0x1C33: 84,
+    0x1C36: 84,
+    0x1C37: 84,
+    0x1CD0: 84,
+    0x1CD1: 84,
+    0x1CD2: 84,
+    0x1CD4: 84,
+    0x1CD5: 84,
+    0x1CD6: 84,
+    0x1CD7: 84,
+    0x1CD8: 84,
+    0x1CD9: 84,
+    0x1CDA: 84,
+    0x1CDB: 84,
+    0x1CDC: 84,
+    0x1CDD: 84,
+    0x1CDE: 84,
+    0x1CDF: 84,
+    0x1CE0: 84,
+    0x1CE2: 84,
+    0x1CE3: 84,
+    0x1CE4: 84,
+    0x1CE5: 84,
+    0x1CE6: 84,
+    0x1CE7: 84,
+    0x1CE8: 84,
+    0x1CED: 84,
+    0x1CF4: 84,
+    0x1CF8: 84,
+    0x1CF9: 84,
+    0x1DC0: 84,
+    0x1DC1: 84,
+    0x1DC2: 84,
+    0x1DC3: 84,
+    0x1DC4: 84,
+    0x1DC5: 84,
+    0x1DC6: 84,
+    0x1DC7: 84,
+    0x1DC8: 84,
+    0x1DC9: 84,
+    0x1DCA: 84,
+    0x1DCB: 84,
+    0x1DCC: 84,
+    0x1DCD: 84,
+    0x1DCE: 84,
+    0x1DCF: 84,
+    0x1DD0: 84,
+    0x1DD1: 84,
+    0x1DD2: 84,
+    0x1DD3: 84,
+    0x1DD4: 84,
+    0x1DD5: 84,
+    0x1DD6: 84,
+    0x1DD7: 84,
+    0x1DD8: 84,
+    0x1DD9: 84,
+    0x1DDA: 84,
+    0x1DDB: 84,
+    0x1DDC: 84,
+    0x1DDD: 84,
+    0x1DDE: 84,
+    0x1DDF: 84,
+    0x1DE0: 84,
+    0x1DE1: 84,
+    0x1DE2: 84,
+    0x1DE3: 84,
+    0x1DE4: 84,
+    0x1DE5: 84,
+    0x1DE6: 84,
+    0x1DE7: 84,
+    0x1DE8: 84,
+    0x1DE9: 84,
+    0x1DEA: 84,
+    0x1DEB: 84,
+    0x1DEC: 84,
+    0x1DED: 84,
+    0x1DEE: 84,
+    0x1DEF: 84,
+    0x1DF0: 84,
+    0x1DF1: 84,
+    0x1DF2: 84,
+    0x1DF3: 84,
+    0x1DF4: 84,
+    0x1DF5: 84,
+    0x1DF6: 84,
+    0x1DF7: 84,
+    0x1DF8: 84,
+    0x1DF9: 84,
+    0x1DFA: 84,
+    0x1DFB: 84,
+    0x1DFC: 84,
+    0x1DFD: 84,
+    0x1DFE: 84,
+    0x1DFF: 84,
+    0x200B: 84,
+    0x200D: 67,
+    0x200E: 84,
+    0x200F: 84,
+    0x202A: 84,
+    0x202B: 84,
+    0x202C: 84,
+    0x202D: 84,
+    0x202E: 84,
+    0x2060: 84,
+    0x2061: 84,
+    0x2062: 84,
+    0x2063: 84,
+    0x2064: 84,
+    0x206A: 84,
+    0x206B: 84,
+    0x206C: 84,
+    0x206D: 84,
+    0x206E: 84,
+    0x206F: 84,
+    0x20D0: 84,
+    0x20D1: 84,
+    0x20D2: 84,
+    0x20D3: 84,
+    0x20D4: 84,
+    0x20D5: 84,
+    0x20D6: 84,
+    0x20D7: 84,
+    0x20D8: 84,
+    0x20D9: 84,
+    0x20DA: 84,
+    0x20DB: 84,
+    0x20DC: 84,
+    0x20DD: 84,
+    0x20DE: 84,
+    0x20DF: 84,
+    0x20E0: 84,
+    0x20E1: 84,
+    0x20E2: 84,
+    0x20E3: 84,
+    0x20E4: 84,
+    0x20E5: 84,
+    0x20E6: 84,
+    0x20E7: 84,
+    0x20E8: 84,
+    0x20E9: 84,
+    0x20EA: 84,
+    0x20EB: 84,
+    0x20EC: 84,
+    0x20ED: 84,
+    0x20EE: 84,
+    0x20EF: 84,
+    0x20F0: 84,
+    0x2CEF: 84,
+    0x2CF0: 84,
+    0x2CF1: 84,
+    0x2D7F: 84,
+    0x2DE0: 84,
+    0x2DE1: 84,
+    0x2DE2: 84,
+    0x2DE3: 84,
+    0x2DE4: 84,
+    0x2DE5: 84,
+    0x2DE6: 84,
+    0x2DE7: 84,
+    0x2DE8: 84,
+    0x2DE9: 84,
+    0x2DEA: 84,
+    0x2DEB: 84,
+    0x2DEC: 84,
+    0x2DED: 84,
+    0x2DEE: 84,
+    0x2DEF: 84,
+    0x2DF0: 84,
+    0x2DF1: 84,
+    0x2DF2: 84,
+    0x2DF3: 84,
+    0x2DF4: 84,
+    0x2DF5: 84,
+    0x2DF6: 84,
+    0x2DF7: 84,
+    0x2DF8: 84,
+    0x2DF9: 84,
+    0x2DFA: 84,
+    0x2DFB: 84,
+    0x2DFC: 84,
+    0x2DFD: 84,
+    0x2DFE: 84,
+    0x2DFF: 84,
+    0x302A: 84,
+    0x302B: 84,
+    0x302C: 84,
+    0x302D: 84,
+    0x3099: 84,
+    0x309A: 84,
+    0xA66F: 84,
+    0xA670: 84,
+    0xA671: 84,
+    0xA672: 84,
+    0xA674: 84,
+    0xA675: 84,
+    0xA676: 84,
+    0xA677: 84,
+    0xA678: 84,
+    0xA679: 84,
+    0xA67A: 84,
+    0xA67B: 84,
+    0xA67C: 84,
+    0xA67D: 84,
+    0xA69E: 84,
+    0xA69F: 84,
+    0xA6F0: 84,
+    0xA6F1: 84,
+    0xA802: 84,
+    0xA806: 84,
+    0xA80B: 84,
+    0xA825: 84,
+    0xA826: 84,
+    0xA82C: 84,
+    0xA840: 68,
+    0xA841: 68,
+    0xA842: 68,
+    0xA843: 68,
+    0xA844: 68,
+    0xA845: 68,
+    0xA846: 68,
+    0xA847: 68,
+    0xA848: 68,
+    0xA849: 68,
+    0xA84A: 68,
+    0xA84B: 68,
+    0xA84C: 68,
+    0xA84D: 68,
+    0xA84E: 68,
+    0xA84F: 68,
+    0xA850: 68,
+    0xA851: 68,
+    0xA852: 68,
+    0xA853: 68,
+    0xA854: 68,
+    0xA855: 68,
+    0xA856: 68,
+    0xA857: 68,
+    0xA858: 68,
+    0xA859: 68,
+    0xA85A: 68,
+    0xA85B: 68,
+    0xA85C: 68,
+    0xA85D: 68,
+    0xA85E: 68,
+    0xA85F: 68,
+    0xA860: 68,
+    0xA861: 68,
+    0xA862: 68,
+    0xA863: 68,
+    0xA864: 68,
+    0xA865: 68,
+    0xA866: 68,
+    0xA867: 68,
+    0xA868: 68,
+    0xA869: 68,
+    0xA86A: 68,
+    0xA86B: 68,
+    0xA86C: 68,
+    0xA86D: 68,
+    0xA86E: 68,
+    0xA86F: 68,
+    0xA870: 68,
+    0xA871: 68,
+    0xA872: 76,
+    0xA8C4: 84,
+    0xA8C5: 84,
+    0xA8E0: 84,
+    0xA8E1: 84,
+    0xA8E2: 84,
+    0xA8E3: 84,
+    0xA8E4: 84,
+    0xA8E5: 84,
+    0xA8E6: 84,
+    0xA8E7: 84,
+    0xA8E8: 84,
+    0xA8E9: 84,
+    0xA8EA: 84,
+    0xA8EB: 84,
+    0xA8EC: 84,
+    0xA8ED: 84,
+    0xA8EE: 84,
+    0xA8EF: 84,
+    0xA8F0: 84,
+    0xA8F1: 84,
+    0xA8FF: 84,
+    0xA926: 84,
+    0xA927: 84,
+    0xA928: 84,
+    0xA929: 84,
+    0xA92A: 84,
+    0xA92B: 84,
+    0xA92C: 84,
+    0xA92D: 84,
+    0xA947: 84,
+    0xA948: 84,
+    0xA949: 84,
+    0xA94A: 84,
+    0xA94B: 84,
+    0xA94C: 84,
+    0xA94D: 84,
+    0xA94E: 84,
+    0xA94F: 84,
+    0xA950: 84,
+    0xA951: 84,
+    0xA980: 84,
+    0xA981: 84,
+    0xA982: 84,
+    0xA9B3: 84,
+    0xA9B6: 84,
+    0xA9B7: 84,
+    0xA9B8: 84,
+    0xA9B9: 84,
+    0xA9BC: 84,
+    0xA9BD: 84,
+    0xA9E5: 84,
+    0xAA29: 84,
+    0xAA2A: 84,
+    0xAA2B: 84,
+    0xAA2C: 84,
+    0xAA2D: 84,
+    0xAA2E: 84,
+    0xAA31: 84,
+    0xAA32: 84,
+    0xAA35: 84,
+    0xAA36: 84,
+    0xAA43: 84,
+    0xAA4C: 84,
+    0xAA7C: 84,
+    0xAAB0: 84,
+    0xAAB2: 84,
+    0xAAB3: 84,
+    0xAAB4: 84,
+    0xAAB7: 84,
+    0xAAB8: 84,
+    0xAABE: 84,
+    0xAABF: 84,
+    0xAAC1: 84,
+    0xAAEC: 84,
+    0xAAED: 84,
+    0xAAF6: 84,
+    0xABE5: 84,
+    0xABE8: 84,
+    0xABED: 84,
+    0xFB1E: 84,
+    0xFE00: 84,
+    0xFE01: 84,
+    0xFE02: 84,
+    0xFE03: 84,
+    0xFE04: 84,
+    0xFE05: 84,
+    0xFE06: 84,
+    0xFE07: 84,
+    0xFE08: 84,
+    0xFE09: 84,
+    0xFE0A: 84,
+    0xFE0B: 84,
+    0xFE0C: 84,
+    0xFE0D: 84,
+    0xFE0E: 84,
+    0xFE0F: 84,
+    0xFE20: 84,
+    0xFE21: 84,
+    0xFE22: 84,
+    0xFE23: 84,
+    0xFE24: 84,
+    0xFE25: 84,
+    0xFE26: 84,
+    0xFE27: 84,
+    0xFE28: 84,
+    0xFE29: 84,
+    0xFE2A: 84,
+    0xFE2B: 84,
+    0xFE2C: 84,
+    0xFE2D: 84,
+    0xFE2E: 84,
+    0xFE2F: 84,
+    0xFEFF: 84,
+    0xFFF9: 84,
+    0xFFFA: 84,
+    0xFFFB: 84,
+    0x101FD: 84,
+    0x102E0: 84,
+    0x10376: 84,
+    0x10377: 84,
+    0x10378: 84,
+    0x10379: 84,
+    0x1037A: 84,
+    0x10A01: 84,
+    0x10A02: 84,
+    0x10A03: 84,
+    0x10A05: 84,
+    0x10A06: 84,
+    0x10A0C: 84,
+    0x10A0D: 84,
+    0x10A0E: 84,
+    0x10A0F: 84,
+    0x10A38: 84,
+    0x10A39: 84,
+    0x10A3A: 84,
+    0x10A3F: 84,
+    0x10AC0: 68,
+    0x10AC1: 68,
+    0x10AC2: 68,
+    0x10AC3: 68,
+    0x10AC4: 68,
+    0x10AC5: 82,
+    0x10AC7: 82,
+    0x10AC9: 82,
+    0x10ACA: 82,
+    0x10ACD: 76,
+    0x10ACE: 82,
+    0x10ACF: 82,
+    0x10AD0: 82,
+    0x10AD1: 82,
+    0x10AD2: 82,
+    0x10AD3: 68,
+    0x10AD4: 68,
+    0x10AD5: 68,
+    0x10AD6: 68,
+    0x10AD7: 76,
+    0x10AD8: 68,
+    0x10AD9: 68,
+    0x10ADA: 68,
+    0x10ADB: 68,
+    0x10ADC: 68,
+    0x10ADD: 82,
+    0x10ADE: 68,
+    0x10ADF: 68,
+    0x10AE0: 68,
+    0x10AE1: 82,
+    0x10AE4: 82,
+    0x10AE5: 84,
+    0x10AE6: 84,
+    0x10AEB: 68,
+    0x10AEC: 68,
+    0x10AED: 68,
+    0x10AEE: 68,
+    0x10AEF: 82,
+    0x10B80: 68,
+    0x10B81: 82,
+    0x10B82: 68,
+    0x10B83: 82,
+    0x10B84: 82,
+    0x10B85: 82,
+    0x10B86: 68,
+    0x10B87: 68,
+    0x10B88: 68,
+    0x10B89: 82,
+    0x10B8A: 68,
+    0x10B8B: 68,
+    0x10B8C: 82,
+    0x10B8D: 68,
+    0x10B8E: 82,
+    0x10B8F: 82,
+    0x10B90: 68,
+    0x10B91: 82,
+    0x10BA9: 82,
+    0x10BAA: 82,
+    0x10BAB: 82,
+    0x10BAC: 82,
+    0x10BAD: 68,
+    0x10BAE: 68,
+    0x10D00: 76,
+    0x10D01: 68,
+    0x10D02: 68,
+    0x10D03: 68,
+    0x10D04: 68,
+    0x10D05: 68,
+    0x10D06: 68,
+    0x10D07: 68,
+    0x10D08: 68,
+    0x10D09: 68,
+    0x10D0A: 68,
+    0x10D0B: 68,
+    0x10D0C: 68,
+    0x10D0D: 68,
+    0x10D0E: 68,
+    0x10D0F: 68,
+    0x10D10: 68,
+    0x10D11: 68,
+    0x10D12: 68,
+    0x10D13: 68,
+    0x10D14: 68,
+    0x10D15: 68,
+    0x10D16: 68,
+    0x10D17: 68,
+    0x10D18: 68,
+    0x10D19: 68,
+    0x10D1A: 68,
+    0x10D1B: 68,
+    0x10D1C: 68,
+    0x10D1D: 68,
+    0x10D1E: 68,
+    0x10D1F: 68,
+    0x10D20: 68,
+    0x10D21: 68,
+    0x10D22: 82,
+    0x10D23: 68,
+    0x10D24: 84,
+    0x10D25: 84,
+    0x10D26: 84,
+    0x10D27: 84,
+    0x10D69: 84,
+    0x10D6A: 84,
+    0x10D6B: 84,
+    0x10D6C: 84,
+    0x10D6D: 84,
+    0x10EAB: 84,
+    0x10EAC: 84,
+    0x10EC2: 82,
+    0x10EC3: 68,
+    0x10EC4: 68,
+    0x10EFC: 84,
+    0x10EFD: 84,
+    0x10EFE: 84,
+    0x10EFF: 84,
+    0x10F30: 68,
+    0x10F31: 68,
+    0x10F32: 68,
+    0x10F33: 82,
+    0x10F34: 68,
+    0x10F35: 68,
+    0x10F36: 68,
+    0x10F37: 68,
+    0x10F38: 68,
+    0x10F39: 68,
+    0x10F3A: 68,
+    0x10F3B: 68,
+    0x10F3C: 68,
+    0x10F3D: 68,
+    0x10F3E: 68,
+    0x10F3F: 68,
+    0x10F40: 68,
+    0x10F41: 68,
+    0x10F42: 68,
+    0x10F43: 68,
+    0x10F44: 68,
+    0x10F46: 84,
+    0x10F47: 84,
+    0x10F48: 84,
+    0x10F49: 84,
+    0x10F4A: 84,
+    0x10F4B: 84,
+    0x10F4C: 84,
+    0x10F4D: 84,
+    0x10F4E: 84,
+    0x10F4F: 84,
+    0x10F50: 84,
+    0x10F51: 68,
+    0x10F52: 68,
+    0x10F53: 68,
+    0x10F54: 82,
+    0x10F70: 68,
+    0x10F71: 68,
+    0x10F72: 68,
+    0x10F73: 68,
+    0x10F74: 82,
+    0x10F75: 82,
+    0x10F76: 68,
+    0x10F77: 68,
+    0x10F78: 68,
+    0x10F79: 68,
+    0x10F7A: 68,
+    0x10F7B: 68,
+    0x10F7C: 68,
+    0x10F7D: 68,
+    0x10F7E: 68,
+    0x10F7F: 68,
+    0x10F80: 68,
+    0x10F81: 68,
+    0x10F82: 84,
+    0x10F83: 84,
+    0x10F84: 84,
+    0x10F85: 84,
+    0x10FB0: 68,
+    0x10FB2: 68,
+    0x10FB3: 68,
+    0x10FB4: 82,
+    0x10FB5: 82,
+    0x10FB6: 82,
+    0x10FB8: 68,
+    0x10FB9: 82,
+    0x10FBA: 82,
+    0x10FBB: 68,
+    0x10FBC: 68,
+    0x10FBD: 82,
+    0x10FBE: 68,
+    0x10FBF: 68,
+    0x10FC1: 68,
+    0x10FC2: 82,
+    0x10FC3: 82,
+    0x10FC4: 68,
+    0x10FC9: 82,
+    0x10FCA: 68,
+    0x10FCB: 76,
+    0x11001: 84,
+    0x11038: 84,
+    0x11039: 84,
+    0x1103A: 84,
+    0x1103B: 84,
+    0x1103C: 84,
+    0x1103D: 84,
+    0x1103E: 84,
+    0x1103F: 84,
+    0x11040: 84,
+    0x11041: 84,
+    0x11042: 84,
+    0x11043: 84,
+    0x11044: 84,
+    0x11045: 84,
+    0x11046: 84,
+    0x11070: 84,
+    0x11073: 84,
+    0x11074: 84,
+    0x1107F: 84,
+    0x11080: 84,
+    0x11081: 84,
+    0x110B3: 84,
+    0x110B4: 84,
+    0x110B5: 84,
+    0x110B6: 84,
+    0x110B9: 84,
+    0x110BA: 84,
+    0x110C2: 84,
+    0x11100: 84,
+    0x11101: 84,
+    0x11102: 84,
+    0x11127: 84,
+    0x11128: 84,
+    0x11129: 84,
+    0x1112A: 84,
+    0x1112B: 84,
+    0x1112D: 84,
+    0x1112E: 84,
+    0x1112F: 84,
+    0x11130: 84,
+    0x11131: 84,
+    0x11132: 84,
+    0x11133: 84,
+    0x11134: 84,
+    0x11173: 84,
+    0x11180: 84,
+    0x11181: 84,
+    0x111B6: 84,
+    0x111B7: 84,
+    0x111B8: 84,
+    0x111B9: 84,
+    0x111BA: 84,
+    0x111BB: 84,
+    0x111BC: 84,
+    0x111BD: 84,
+    0x111BE: 84,
+    0x111C9: 84,
+    0x111CA: 84,
+    0x111CB: 84,
+    0x111CC: 84,
+    0x111CF: 84,
+    0x1122F: 84,
+    0x11230: 84,
+    0x11231: 84,
+    0x11234: 84,
+    0x11236: 84,
+    0x11237: 84,
+    0x1123E: 84,
+    0x11241: 84,
+    0x112DF: 84,
+    0x112E3: 84,
+    0x112E4: 84,
+    0x112E5: 84,
+    0x112E6: 84,
+    0x112E7: 84,
+    0x112E8: 84,
+    0x112E9: 84,
+    0x112EA: 84,
+    0x11300: 84,
+    0x11301: 84,
+    0x1133B: 84,
+    0x1133C: 84,
+    0x11340: 84,
+    0x11366: 84,
+    0x11367: 84,
+    0x11368: 84,
+    0x11369: 84,
+    0x1136A: 84,
+    0x1136B: 84,
+    0x1136C: 84,
+    0x11370: 84,
+    0x11371: 84,
+    0x11372: 84,
+    0x11373: 84,
+    0x11374: 84,
+    0x113BB: 84,
+    0x113BC: 84,
+    0x113BD: 84,
+    0x113BE: 84,
+    0x113BF: 84,
+    0x113C0: 84,
+    0x113CE: 84,
+    0x113D0: 84,
+    0x113D2: 84,
+    0x113E1: 84,
+    0x113E2: 84,
+    0x11438: 84,
+    0x11439: 84,
+    0x1143A: 84,
+    0x1143B: 84,
+    0x1143C: 84,
+    0x1143D: 84,
+    0x1143E: 84,
+    0x1143F: 84,
+    0x11442: 84,
+    0x11443: 84,
+    0x11444: 84,
+    0x11446: 84,
+    0x1145E: 84,
+    0x114B3: 84,
+    0x114B4: 84,
+    0x114B5: 84,
+    0x114B6: 84,
+    0x114B7: 84,
+    0x114B8: 84,
+    0x114BA: 84,
+    0x114BF: 84,
+    0x114C0: 84,
+    0x114C2: 84,
+    0x114C3: 84,
+    0x115B2: 84,
+    0x115B3: 84,
+    0x115B4: 84,
+    0x115B5: 84,
+    0x115BC: 84,
+    0x115BD: 84,
+    0x115BF: 84,
+    0x115C0: 84,
+    0x115DC: 84,
+    0x115DD: 84,
+    0x11633: 84,
+    0x11634: 84,
+    0x11635: 84,
+    0x11636: 84,
+    0x11637: 84,
+    0x11638: 84,
+    0x11639: 84,
+    0x1163A: 84,
+    0x1163D: 84,
+    0x1163F: 84,
+    0x11640: 84,
+    0x116AB: 84,
+    0x116AD: 84,
+    0x116B0: 84,
+    0x116B1: 84,
+    0x116B2: 84,
+    0x116B3: 84,
+    0x116B4: 84,
+    0x116B5: 84,
+    0x116B7: 84,
+    0x1171D: 84,
+    0x1171F: 84,
+    0x11722: 84,
+    0x11723: 84,
+    0x11724: 84,
+    0x11725: 84,
+    0x11727: 84,
+    0x11728: 84,
+    0x11729: 84,
+    0x1172A: 84,
+    0x1172B: 84,
+    0x1182F: 84,
+    0x11830: 84,
+    0x11831: 84,
+    0x11832: 84,
+    0x11833: 84,
+    0x11834: 84,
+    0x11835: 84,
+    0x11836: 84,
+    0x11837: 84,
+    0x11839: 84,
+    0x1183A: 84,
+    0x1193B: 84,
+    0x1193C: 84,
+    0x1193E: 84,
+    0x11943: 84,
+    0x119D4: 84,
+    0x119D5: 84,
+    0x119D6: 84,
+    0x119D7: 84,
+    0x119DA: 84,
+    0x119DB: 84,
+    0x119E0: 84,
+    0x11A01: 84,
+    0x11A02: 84,
+    0x11A03: 84,
+    0x11A04: 84,
+    0x11A05: 84,
+    0x11A06: 84,
+    0x11A07: 84,
+    0x11A08: 84,
+    0x11A09: 84,
+    0x11A0A: 84,
+    0x11A33: 84,
+    0x11A34: 84,
+    0x11A35: 84,
+    0x11A36: 84,
+    0x11A37: 84,
+    0x11A38: 84,
+    0x11A3B: 84,
+    0x11A3C: 84,
+    0x11A3D: 84,
+    0x11A3E: 84,
+    0x11A47: 84,
+    0x11A51: 84,
+    0x11A52: 84,
+    0x11A53: 84,
+    0x11A54: 84,
+    0x11A55: 84,
+    0x11A56: 84,
+    0x11A59: 84,
+    0x11A5A: 84,
+    0x11A5B: 84,
+    0x11A8A: 84,
+    0x11A8B: 84,
+    0x11A8C: 84,
+    0x11A8D: 84,
+    0x11A8E: 84,
+    0x11A8F: 84,
+    0x11A90: 84,
+    0x11A91: 84,
+    0x11A92: 84,
+    0x11A93: 84,
+    0x11A94: 84,
+    0x11A95: 84,
+    0x11A96: 84,
+    0x11A98: 84,
+    0x11A99: 84,
+    0x11C30: 84,
+    0x11C31: 84,
+    0x11C32: 84,
+    0x11C33: 84,
+    0x11C34: 84,
+    0x11C35: 84,
+    0x11C36: 84,
+    0x11C38: 84,
+    0x11C39: 84,
+    0x11C3A: 84,
+    0x11C3B: 84,
+    0x11C3C: 84,
+    0x11C3D: 84,
+    0x11C3F: 84,
+    0x11C92: 84,
+    0x11C93: 84,
+    0x11C94: 84,
+    0x11C95: 84,
+    0x11C96: 84,
+    0x11C97: 84,
+    0x11C98: 84,
+    0x11C99: 84,
+    0x11C9A: 84,
+    0x11C9B: 84,
+    0x11C9C: 84,
+    0x11C9D: 84,
+    0x11C9E: 84,
+    0x11C9F: 84,
+    0x11CA0: 84,
+    0x11CA1: 84,
+    0x11CA2: 84,
+    0x11CA3: 84,
+    0x11CA4: 84,
+    0x11CA5: 84,
+    0x11CA6: 84,
+    0x11CA7: 84,
+    0x11CAA: 84,
+    0x11CAB: 84,
+    0x11CAC: 84,
+    0x11CAD: 84,
+    0x11CAE: 84,
+    0x11CAF: 84,
+    0x11CB0: 84,
+    0x11CB2: 84,
+    0x11CB3: 84,
+    0x11CB5: 84,
+    0x11CB6: 84,
+    0x11D31: 84,
+    0x11D32: 84,
+    0x11D33: 84,
+    0x11D34: 84,
+    0x11D35: 84,
+    0x11D36: 84,
+    0x11D3A: 84,
+    0x11D3C: 84,
+    0x11D3D: 84,
+    0x11D3F: 84,
+    0x11D40: 84,
+    0x11D41: 84,
+    0x11D42: 84,
+    0x11D43: 84,
+    0x11D44: 84,
+    0x11D45: 84,
+    0x11D47: 84,
+    0x11D90: 84,
+    0x11D91: 84,
+    0x11D95: 84,
+    0x11D97: 84,
+    0x11EF3: 84,
+    0x11EF4: 84,
+    0x11F00: 84,
+    0x11F01: 84,
+    0x11F36: 84,
+    0x11F37: 84,
+    0x11F38: 84,
+    0x11F39: 84,
+    0x11F3A: 84,
+    0x11F40: 84,
+    0x11F42: 84,
+    0x11F5A: 84,
+    0x13430: 84,
+    0x13431: 84,
+    0x13432: 84,
+    0x13433: 84,
+    0x13434: 84,
+    0x13435: 84,
+    0x13436: 84,
+    0x13437: 84,
+    0x13438: 84,
+    0x13439: 84,
+    0x1343A: 84,
+    0x1343B: 84,
+    0x1343C: 84,
+    0x1343D: 84,
+    0x1343E: 84,
+    0x1343F: 84,
+    0x13440: 84,
+    0x13447: 84,
+    0x13448: 84,
+    0x13449: 84,
+    0x1344A: 84,
+    0x1344B: 84,
+    0x1344C: 84,
+    0x1344D: 84,
+    0x1344E: 84,
+    0x1344F: 84,
+    0x13450: 84,
+    0x13451: 84,
+    0x13452: 84,
+    0x13453: 84,
+    0x13454: 84,
+    0x13455: 84,
+    0x1611E: 84,
+    0x1611F: 84,
+    0x16120: 84,
+    0x16121: 84,
+    0x16122: 84,
+    0x16123: 84,
+    0x16124: 84,
+    0x16125: 84,
+    0x16126: 84,
+    0x16127: 84,
+    0x16128: 84,
+    0x16129: 84,
+    0x1612D: 84,
+    0x1612E: 84,
+    0x1612F: 84,
+    0x16AF0: 84,
+    0x16AF1: 84,
+    0x16AF2: 84,
+    0x16AF3: 84,
+    0x16AF4: 84,
+    0x16B30: 84,
+    0x16B31: 84,
+    0x16B32: 84,
+    0x16B33: 84,
+    0x16B34: 84,
+    0x16B35: 84,
+    0x16B36: 84,
+    0x16F4F: 84,
+    0x16F8F: 84,
+    0x16F90: 84,
+    0x16F91: 84,
+    0x16F92: 84,
+    0x16FE4: 84,
+    0x1BC9D: 84,
+    0x1BC9E: 84,
+    0x1BCA0: 84,
+    0x1BCA1: 84,
+    0x1BCA2: 84,
+    0x1BCA3: 84,
+    0x1CF00: 84,
+    0x1CF01: 84,
+    0x1CF02: 84,
+    0x1CF03: 84,
+    0x1CF04: 84,
+    0x1CF05: 84,
+    0x1CF06: 84,
+    0x1CF07: 84,
+    0x1CF08: 84,
+    0x1CF09: 84,
+    0x1CF0A: 84,
+    0x1CF0B: 84,
+    0x1CF0C: 84,
+    0x1CF0D: 84,
+    0x1CF0E: 84,
+    0x1CF0F: 84,
+    0x1CF10: 84,
+    0x1CF11: 84,
+    0x1CF12: 84,
+    0x1CF13: 84,
+    0x1CF14: 84,
+    0x1CF15: 84,
+    0x1CF16: 84,
+    0x1CF17: 84,
+    0x1CF18: 84,
+    0x1CF19: 84,
+    0x1CF1A: 84,
+    0x1CF1B: 84,
+    0x1CF1C: 84,
+    0x1CF1D: 84,
+    0x1CF1E: 84,
+    0x1CF1F: 84,
+    0x1CF20: 84,
+    0x1CF21: 84,
+    0x1CF22: 84,
+    0x1CF23: 84,
+    0x1CF24: 84,
+    0x1CF25: 84,
+    0x1CF26: 84,
+    0x1CF27: 84,
+    0x1CF28: 84,
+    0x1CF29: 84,
+    0x1CF2A: 84,
+    0x1CF2B: 84,
+    0x1CF2C: 84,
+    0x1CF2D: 84,
+    0x1CF30: 84,
+    0x1CF31: 84,
+    0x1CF32: 84,
+    0x1CF33: 84,
+    0x1CF34: 84,
+    0x1CF35: 84,
+    0x1CF36: 84,
+    0x1CF37: 84,
+    0x1CF38: 84,
+    0x1CF39: 84,
+    0x1CF3A: 84,
+    0x1CF3B: 84,
+    0x1CF3C: 84,
+    0x1CF3D: 84,
+    0x1CF3E: 84,
+    0x1CF3F: 84,
+    0x1CF40: 84,
+    0x1CF41: 84,
+    0x1CF42: 84,
+    0x1CF43: 84,
+    0x1CF44: 84,
+    0x1CF45: 84,
+    0x1CF46: 84,
+    0x1D167: 84,
+    0x1D168: 84,
+    0x1D169: 84,
+    0x1D173: 84,
+    0x1D174: 84,
+    0x1D175: 84,
+    0x1D176: 84,
+    0x1D177: 84,
+    0x1D178: 84,
+    0x1D179: 84,
+    0x1D17A: 84,
+    0x1D17B: 84,
+    0x1D17C: 84,
+    0x1D17D: 84,
+    0x1D17E: 84,
+    0x1D17F: 84,
+    0x1D180: 84,
+    0x1D181: 84,
+    0x1D182: 84,
+    0x1D185: 84,
+    0x1D186: 84,
+    0x1D187: 84,
+    0x1D188: 84,
+    0x1D189: 84,
+    0x1D18A: 84,
+    0x1D18B: 84,
+    0x1D1AA: 84,
+    0x1D1AB: 84,
+    0x1D1AC: 84,
+    0x1D1AD: 84,
+    0x1D242: 84,
+    0x1D243: 84,
+    0x1D244: 84,
+    0x1DA00: 84,
+    0x1DA01: 84,
+    0x1DA02: 84,
+    0x1DA03: 84,
+    0x1DA04: 84,
+    0x1DA05: 84,
+    0x1DA06: 84,
+    0x1DA07: 84,
+    0x1DA08: 84,
+    0x1DA09: 84,
+    0x1DA0A: 84,
+    0x1DA0B: 84,
+    0x1DA0C: 84,
+    0x1DA0D: 84,
+    0x1DA0E: 84,
+    0x1DA0F: 84,
+    0x1DA10: 84,
+    0x1DA11: 84,
+    0x1DA12: 84,
+    0x1DA13: 84,
+    0x1DA14: 84,
+    0x1DA15: 84,
+    0x1DA16: 84,
+    0x1DA17: 84,
+    0x1DA18: 84,
+    0x1DA19: 84,
+    0x1DA1A: 84,
+    0x1DA1B: 84,
+    0x1DA1C: 84,
+    0x1DA1D: 84,
+    0x1DA1E: 84,
+    0x1DA1F: 84,
+    0x1DA20: 84,
+    0x1DA21: 84,
+    0x1DA22: 84,
+    0x1DA23: 84,
+    0x1DA24: 84,
+    0x1DA25: 84,
+    0x1DA26: 84,
+    0x1DA27: 84,
+    0x1DA28: 84,
+    0x1DA29: 84,
+    0x1DA2A: 84,
+    0x1DA2B: 84,
+    0x1DA2C: 84,
+    0x1DA2D: 84,
+    0x1DA2E: 84,
+    0x1DA2F: 84,
+    0x1DA30: 84,
+    0x1DA31: 84,
+    0x1DA32: 84,
+    0x1DA33: 84,
+    0x1DA34: 84,
+    0x1DA35: 84,
+    0x1DA36: 84,
+    0x1DA3B: 84,
+    0x1DA3C: 84,
+    0x1DA3D: 84,
+    0x1DA3E: 84,
+    0x1DA3F: 84,
+    0x1DA40: 84,
+    0x1DA41: 84,
+    0x1DA42: 84,
+    0x1DA43: 84,
+    0x1DA44: 84,
+    0x1DA45: 84,
+    0x1DA46: 84,
+    0x1DA47: 84,
+    0x1DA48: 84,
+    0x1DA49: 84,
+    0x1DA4A: 84,
+    0x1DA4B: 84,
+    0x1DA4C: 84,
+    0x1DA4D: 84,
+    0x1DA4E: 84,
+    0x1DA4F: 84,
+    0x1DA50: 84,
+    0x1DA51: 84,
+    0x1DA52: 84,
+    0x1DA53: 84,
+    0x1DA54: 84,
+    0x1DA55: 84,
+    0x1DA56: 84,
+    0x1DA57: 84,
+    0x1DA58: 84,
+    0x1DA59: 84,
+    0x1DA5A: 84,
+    0x1DA5B: 84,
+    0x1DA5C: 84,
+    0x1DA5D: 84,
+    0x1DA5E: 84,
+    0x1DA5F: 84,
+    0x1DA60: 84,
+    0x1DA61: 84,
+    0x1DA62: 84,
+    0x1DA63: 84,
+    0x1DA64: 84,
+    0x1DA65: 84,
+    0x1DA66: 84,
+    0x1DA67: 84,
+    0x1DA68: 84,
+    0x1DA69: 84,
+    0x1DA6A: 84,
+    0x1DA6B: 84,
+    0x1DA6C: 84,
+    0x1DA75: 84,
+    0x1DA84: 84,
+    0x1DA9B: 84,
+    0x1DA9C: 84,
+    0x1DA9D: 84,
+    0x1DA9E: 84,
+    0x1DA9F: 84,
+    0x1DAA1: 84,
+    0x1DAA2: 84,
+    0x1DAA3: 84,
+    0x1DAA4: 84,
+    0x1DAA5: 84,
+    0x1DAA6: 84,
+    0x1DAA7: 84,
+    0x1DAA8: 84,
+    0x1DAA9: 84,
+    0x1DAAA: 84,
+    0x1DAAB: 84,
+    0x1DAAC: 84,
+    0x1DAAD: 84,
+    0x1DAAE: 84,
+    0x1DAAF: 84,
+    0x1E000: 84,
+    0x1E001: 84,
+    0x1E002: 84,
+    0x1E003: 84,
+    0x1E004: 84,
+    0x1E005: 84,
+    0x1E006: 84,
+    0x1E008: 84,
+    0x1E009: 84,
+    0x1E00A: 84,
+    0x1E00B: 84,
+    0x1E00C: 84,
+    0x1E00D: 84,
+    0x1E00E: 84,
+    0x1E00F: 84,
+    0x1E010: 84,
+    0x1E011: 84,
+    0x1E012: 84,
+    0x1E013: 84,
+    0x1E014: 84,
+    0x1E015: 84,
+    0x1E016: 84,
+    0x1E017: 84,
+    0x1E018: 84,
+    0x1E01B: 84,
+    0x1E01C: 84,
+    0x1E01D: 84,
+    0x1E01E: 84,
+    0x1E01F: 84,
+    0x1E020: 84,
+    0x1E021: 84,
+    0x1E023: 84,
+    0x1E024: 84,
+    0x1E026: 84,
+    0x1E027: 84,
+    0x1E028: 84,
+    0x1E029: 84,
+    0x1E02A: 84,
+    0x1E08F: 84,
+    0x1E130: 84,
+    0x1E131: 84,
+    0x1E132: 84,
+    0x1E133: 84,
+    0x1E134: 84,
+    0x1E135: 84,
+    0x1E136: 84,
+    0x1E2AE: 84,
+    0x1E2EC: 84,
+    0x1E2ED: 84,
+    0x1E2EE: 84,
+    0x1E2EF: 84,
+    0x1E4EC: 84,
+    0x1E4ED: 84,
+    0x1E4EE: 84,
+    0x1E4EF: 84,
+    0x1E5EE: 84,
+    0x1E5EF: 84,
+    0x1E8D0: 84,
+    0x1E8D1: 84,
+    0x1E8D2: 84,
+    0x1E8D3: 84,
+    0x1E8D4: 84,
+    0x1E8D5: 84,
+    0x1E8D6: 84,
+    0x1E900: 68,
+    0x1E901: 68,
+    0x1E902: 68,
+    0x1E903: 68,
+    0x1E904: 68,
+    0x1E905: 68,
+    0x1E906: 68,
+    0x1E907: 68,
+    0x1E908: 68,
+    0x1E909: 68,
+    0x1E90A: 68,
+    0x1E90B: 68,
+    0x1E90C: 68,
+    0x1E90D: 68,
+    0x1E90E: 68,
+    0x1E90F: 68,
+    0x1E910: 68,
+    0x1E911: 68,
+    0x1E912: 68,
+    0x1E913: 68,
+    0x1E914: 68,
+    0x1E915: 68,
+    0x1E916: 68,
+    0x1E917: 68,
+    0x1E918: 68,
+    0x1E919: 68,
+    0x1E91A: 68,
+    0x1E91B: 68,
+    0x1E91C: 68,
+    0x1E91D: 68,
+    0x1E91E: 68,
+    0x1E91F: 68,
+    0x1E920: 68,
+    0x1E921: 68,
+    0x1E922: 68,
+    0x1E923: 68,
+    0x1E924: 68,
+    0x1E925: 68,
+    0x1E926: 68,
+    0x1E927: 68,
+    0x1E928: 68,
+    0x1E929: 68,
+    0x1E92A: 68,
+    0x1E92B: 68,
+    0x1E92C: 68,
+    0x1E92D: 68,
+    0x1E92E: 68,
+    0x1E92F: 68,
+    0x1E930: 68,
+    0x1E931: 68,
+    0x1E932: 68,
+    0x1E933: 68,
+    0x1E934: 68,
+    0x1E935: 68,
+    0x1E936: 68,
+    0x1E937: 68,
+    0x1E938: 68,
+    0x1E939: 68,
+    0x1E93A: 68,
+    0x1E93B: 68,
+    0x1E93C: 68,
+    0x1E93D: 68,
+    0x1E93E: 68,
+    0x1E93F: 68,
+    0x1E940: 68,
+    0x1E941: 68,
+    0x1E942: 68,
+    0x1E943: 68,
+    0x1E944: 84,
+    0x1E945: 84,
+    0x1E946: 84,
+    0x1E947: 84,
+    0x1E948: 84,
+    0x1E949: 84,
+    0x1E94A: 84,
+    0x1E94B: 84,
+    0xE0001: 84,
+    0xE0020: 84,
+    0xE0021: 84,
+    0xE0022: 84,
+    0xE0023: 84,
+    0xE0024: 84,
+    0xE0025: 84,
+    0xE0026: 84,
+    0xE0027: 84,
+    0xE0028: 84,
+    0xE0029: 84,
+    0xE002A: 84,
+    0xE002B: 84,
+    0xE002C: 84,
+    0xE002D: 84,
+    0xE002E: 84,
+    0xE002F: 84,
+    0xE0030: 84,
+    0xE0031: 84,
+    0xE0032: 84,
+    0xE0033: 84,
+    0xE0034: 84,
+    0xE0035: 84,
+    0xE0036: 84,
+    0xE0037: 84,
+    0xE0038: 84,
+    0xE0039: 84,
+    0xE003A: 84,
+    0xE003B: 84,
+    0xE003C: 84,
+    0xE003D: 84,
+    0xE003E: 84,
+    0xE003F: 84,
+    0xE0040: 84,
+    0xE0041: 84,
+    0xE0042: 84,
+    0xE0043: 84,
+    0xE0044: 84,
+    0xE0045: 84,
+    0xE0046: 84,
+    0xE0047: 84,
+    0xE0048: 84,
+    0xE0049: 84,
+    0xE004A: 84,
+    0xE004B: 84,
+    0xE004C: 84,
+    0xE004D: 84,
+    0xE004E: 84,
+    0xE004F: 84,
+    0xE0050: 84,
+    0xE0051: 84,
+    0xE0052: 84,
+    0xE0053: 84,
+    0xE0054: 84,
+    0xE0055: 84,
+    0xE0056: 84,
+    0xE0057: 84,
+    0xE0058: 84,
+    0xE0059: 84,
+    0xE005A: 84,
+    0xE005B: 84,
+    0xE005C: 84,
+    0xE005D: 84,
+    0xE005E: 84,
+    0xE005F: 84,
+    0xE0060: 84,
+    0xE0061: 84,
+    0xE0062: 84,
+    0xE0063: 84,
+    0xE0064: 84,
+    0xE0065: 84,
+    0xE0066: 84,
+    0xE0067: 84,
+    0xE0068: 84,
+    0xE0069: 84,
+    0xE006A: 84,
+    0xE006B: 84,
+    0xE006C: 84,
+    0xE006D: 84,
+    0xE006E: 84,
+    0xE006F: 84,
+    0xE0070: 84,
+    0xE0071: 84,
+    0xE0072: 84,
+    0xE0073: 84,
+    0xE0074: 84,
+    0xE0075: 84,
+    0xE0076: 84,
+    0xE0077: 84,
+    0xE0078: 84,
+    0xE0079: 84,
+    0xE007A: 84,
+    0xE007B: 84,
+    0xE007C: 84,
+    0xE007D: 84,
+    0xE007E: 84,
+    0xE007F: 84,
+    0xE0100: 84,
+    0xE0101: 84,
+    0xE0102: 84,
+    0xE0103: 84,
+    0xE0104: 84,
+    0xE0105: 84,
+    0xE0106: 84,
+    0xE0107: 84,
+    0xE0108: 84,
+    0xE0109: 84,
+    0xE010A: 84,
+    0xE010B: 84,
+    0xE010C: 84,
+    0xE010D: 84,
+    0xE010E: 84,
+    0xE010F: 84,
+    0xE0110: 84,
+    0xE0111: 84,
+    0xE0112: 84,
+    0xE0113: 84,
+    0xE0114: 84,
+    0xE0115: 84,
+    0xE0116: 84,
+    0xE0117: 84,
+    0xE0118: 84,
+    0xE0119: 84,
+    0xE011A: 84,
+    0xE011B: 84,
+    0xE011C: 84,
+    0xE011D: 84,
+    0xE011E: 84,
+    0xE011F: 84,
+    0xE0120: 84,
+    0xE0121: 84,
+    0xE0122: 84,
+    0xE0123: 84,
+    0xE0124: 84,
+    0xE0125: 84,
+    0xE0126: 84,
+    0xE0127: 84,
+    0xE0128: 84,
+    0xE0129: 84,
+    0xE012A: 84,
+    0xE012B: 84,
+    0xE012C: 84,
+    0xE012D: 84,
+    0xE012E: 84,
+    0xE012F: 84,
+    0xE0130: 84,
+    0xE0131: 84,
+    0xE0132: 84,
+    0xE0133: 84,
+    0xE0134: 84,
+    0xE0135: 84,
+    0xE0136: 84,
+    0xE0137: 84,
+    0xE0138: 84,
+    0xE0139: 84,
+    0xE013A: 84,
+    0xE013B: 84,
+    0xE013C: 84,
+    0xE013D: 84,
+    0xE013E: 84,
+    0xE013F: 84,
+    0xE0140: 84,
+    0xE0141: 84,
+    0xE0142: 84,
+    0xE0143: 84,
+    0xE0144: 84,
+    0xE0145: 84,
+    0xE0146: 84,
+    0xE0147: 84,
+    0xE0148: 84,
+    0xE0149: 84,
+    0xE014A: 84,
+    0xE014B: 84,
+    0xE014C: 84,
+    0xE014D: 84,
+    0xE014E: 84,
+    0xE014F: 84,
+    0xE0150: 84,
+    0xE0151: 84,
+    0xE0152: 84,
+    0xE0153: 84,
+    0xE0154: 84,
+    0xE0155: 84,
+    0xE0156: 84,
+    0xE0157: 84,
+    0xE0158: 84,
+    0xE0159: 84,
+    0xE015A: 84,
+    0xE015B: 84,
+    0xE015C: 84,
+    0xE015D: 84,
+    0xE015E: 84,
+    0xE015F: 84,
+    0xE0160: 84,
+    0xE0161: 84,
+    0xE0162: 84,
+    0xE0163: 84,
+    0xE0164: 84,
+    0xE0165: 84,
+    0xE0166: 84,
+    0xE0167: 84,
+    0xE0168: 84,
+    0xE0169: 84,
+    0xE016A: 84,
+    0xE016B: 84,
+    0xE016C: 84,
+    0xE016D: 84,
+    0xE016E: 84,
+    0xE016F: 84,
+    0xE0170: 84,
+    0xE0171: 84,
+    0xE0172: 84,
+    0xE0173: 84,
+    0xE0174: 84,
+    0xE0175: 84,
+    0xE0176: 84,
+    0xE0177: 84,
+    0xE0178: 84,
+    0xE0179: 84,
+    0xE017A: 84,
+    0xE017B: 84,
+    0xE017C: 84,
+    0xE017D: 84,
+    0xE017E: 84,
+    0xE017F: 84,
+    0xE0180: 84,
+    0xE0181: 84,
+    0xE0182: 84,
+    0xE0183: 84,
+    0xE0184: 84,
+    0xE0185: 84,
+    0xE0186: 84,
+    0xE0187: 84,
+    0xE0188: 84,
+    0xE0189: 84,
+    0xE018A: 84,
+    0xE018B: 84,
+    0xE018C: 84,
+    0xE018D: 84,
+    0xE018E: 84,
+    0xE018F: 84,
+    0xE0190: 84,
+    0xE0191: 84,
+    0xE0192: 84,
+    0xE0193: 84,
+    0xE0194: 84,
+    0xE0195: 84,
+    0xE0196: 84,
+    0xE0197: 84,
+    0xE0198: 84,
+    0xE0199: 84,
+    0xE019A: 84,
+    0xE019B: 84,
+    0xE019C: 84,
+    0xE019D: 84,
+    0xE019E: 84,
+    0xE019F: 84,
+    0xE01A0: 84,
+    0xE01A1: 84,
+    0xE01A2: 84,
+    0xE01A3: 84,
+    0xE01A4: 84,
+    0xE01A5: 84,
+    0xE01A6: 84,
+    0xE01A7: 84,
+    0xE01A8: 84,
+    0xE01A9: 84,
+    0xE01AA: 84,
+    0xE01AB: 84,
+    0xE01AC: 84,
+    0xE01AD: 84,
+    0xE01AE: 84,
+    0xE01AF: 84,
+    0xE01B0: 84,
+    0xE01B1: 84,
+    0xE01B2: 84,
+    0xE01B3: 84,
+    0xE01B4: 84,
+    0xE01B5: 84,
+    0xE01B6: 84,
+    0xE01B7: 84,
+    0xE01B8: 84,
+    0xE01B9: 84,
+    0xE01BA: 84,
+    0xE01BB: 84,
+    0xE01BC: 84,
+    0xE01BD: 84,
+    0xE01BE: 84,
+    0xE01BF: 84,
+    0xE01C0: 84,
+    0xE01C1: 84,
+    0xE01C2: 84,
+    0xE01C3: 84,
+    0xE01C4: 84,
+    0xE01C5: 84,
+    0xE01C6: 84,
+    0xE01C7: 84,
+    0xE01C8: 84,
+    0xE01C9: 84,
+    0xE01CA: 84,
+    0xE01CB: 84,
+    0xE01CC: 84,
+    0xE01CD: 84,
+    0xE01CE: 84,
+    0xE01CF: 84,
+    0xE01D0: 84,
+    0xE01D1: 84,
+    0xE01D2: 84,
+    0xE01D3: 84,
+    0xE01D4: 84,
+    0xE01D5: 84,
+    0xE01D6: 84,
+    0xE01D7: 84,
+    0xE01D8: 84,
+    0xE01D9: 84,
+    0xE01DA: 84,
+    0xE01DB: 84,
+    0xE01DC: 84,
+    0xE01DD: 84,
+    0xE01DE: 84,
+    0xE01DF: 84,
+    0xE01E0: 84,
+    0xE01E1: 84,
+    0xE01E2: 84,
+    0xE01E3: 84,
+    0xE01E4: 84,
+    0xE01E5: 84,
+    0xE01E6: 84,
+    0xE01E7: 84,
+    0xE01E8: 84,
+    0xE01E9: 84,
+    0xE01EA: 84,
+    0xE01EB: 84,
+    0xE01EC: 84,
+    0xE01ED: 84,
+    0xE01EE: 84,
+    0xE01EF: 84,
+}
+codepoint_classes = {
+    "PVALID": (
+        0x2D0000002E,
+        0x300000003A,
+        0x610000007B,
+        0xDF000000F7,
+        0xF800000100,
+        0x10100000102,
+        0x10300000104,
+        0x10500000106,
+        0x10700000108,
+        0x1090000010A,
+        0x10B0000010C,
+        0x10D0000010E,
+        0x10F00000110,
+        0x11100000112,
+        0x11300000114,
+        0x11500000116,
+        0x11700000118,
+        0x1190000011A,
+        0x11B0000011C,
+        0x11D0000011E,
+        0x11F00000120,
+        0x12100000122,
+        0x12300000124,
+        0x12500000126,
+        0x12700000128,
+        0x1290000012A,
+        0x12B0000012C,
+        0x12D0000012E,
+        0x12F00000130,
+        0x13100000132,
+        0x13500000136,
+        0x13700000139,
+        0x13A0000013B,
+        0x13C0000013D,
+        0x13E0000013F,
+        0x14200000143,
+        0x14400000145,
+        0x14600000147,
+        0x14800000149,
+        0x14B0000014C,
+        0x14D0000014E,
+        0x14F00000150,
+        0x15100000152,
+        0x15300000154,
+        0x15500000156,
+        0x15700000158,
+        0x1590000015A,
+        0x15B0000015C,
+        0x15D0000015E,
+        0x15F00000160,
+        0x16100000162,
+        0x16300000164,
+        0x16500000166,
+        0x16700000168,
+        0x1690000016A,
+        0x16B0000016C,
+        0x16D0000016E,
+        0x16F00000170,
+        0x17100000172,
+        0x17300000174,
+        0x17500000176,
+        0x17700000178,
+        0x17A0000017B,
+        0x17C0000017D,
+        0x17E0000017F,
+        0x18000000181,
+        0x18300000184,
+        0x18500000186,
+        0x18800000189,
+        0x18C0000018E,
+        0x19200000193,
+        0x19500000196,
+        0x1990000019C,
+        0x19E0000019F,
+        0x1A1000001A2,
+        0x1A3000001A4,
+        0x1A5000001A6,
+        0x1A8000001A9,
+        0x1AA000001AC,
+        0x1AD000001AE,
+        0x1B0000001B1,
+        0x1B4000001B5,
+        0x1B6000001B7,
+        0x1B9000001BC,
+        0x1BD000001C4,
+        0x1CE000001CF,
+        0x1D0000001D1,
+        0x1D2000001D3,
+        0x1D4000001D5,
+        0x1D6000001D7,
+        0x1D8000001D9,
+        0x1DA000001DB,
+        0x1DC000001DE,
+        0x1DF000001E0,
+        0x1E1000001E2,
+        0x1E3000001E4,
+        0x1E5000001E6,
+        0x1E7000001E8,
+        0x1E9000001EA,
+        0x1EB000001EC,
+        0x1ED000001EE,
+        0x1EF000001F1,
+        0x1F5000001F6,
+        0x1F9000001FA,
+        0x1FB000001FC,
+        0x1FD000001FE,
+        0x1FF00000200,
+        0x20100000202,
+        0x20300000204,
+        0x20500000206,
+        0x20700000208,
+        0x2090000020A,
+        0x20B0000020C,
+        0x20D0000020E,
+        0x20F00000210,
+        0x21100000212,
+        0x21300000214,
+        0x21500000216,
+        0x21700000218,
+        0x2190000021A,
+        0x21B0000021C,
+        0x21D0000021E,
+        0x21F00000220,
+        0x22100000222,
+        0x22300000224,
+        0x22500000226,
+        0x22700000228,
+        0x2290000022A,
+        0x22B0000022C,
+        0x22D0000022E,
+        0x22F00000230,
+        0x23100000232,
+        0x2330000023A,
+        0x23C0000023D,
+        0x23F00000241,
+        0x24200000243,
+        0x24700000248,
+        0x2490000024A,
+        0x24B0000024C,
+        0x24D0000024E,
+        0x24F000002B0,
+        0x2B9000002C2,
+        0x2C6000002D2,
+        0x2EC000002ED,
+        0x2EE000002EF,
+        0x30000000340,
+        0x34200000343,
+        0x3460000034F,
+        0x35000000370,
+        0x37100000372,
+        0x37300000374,
+        0x37700000378,
+        0x37B0000037E,
+        0x39000000391,
+        0x3AC000003CF,
+        0x3D7000003D8,
+        0x3D9000003DA,
+        0x3DB000003DC,
+        0x3DD000003DE,
+        0x3DF000003E0,
+        0x3E1000003E2,
+        0x3E3000003E4,
+        0x3E5000003E6,
+        0x3E7000003E8,
+        0x3E9000003EA,
+        0x3EB000003EC,
+        0x3ED000003EE,
+        0x3EF000003F0,
+        0x3F3000003F4,
+        0x3F8000003F9,
+        0x3FB000003FD,
+        0x43000000460,
+        0x46100000462,
+        0x46300000464,
+        0x46500000466,
+        0x46700000468,
+        0x4690000046A,
+        0x46B0000046C,
+        0x46D0000046E,
+        0x46F00000470,
+        0x47100000472,
+        0x47300000474,
+        0x47500000476,
+        0x47700000478,
+        0x4790000047A,
+        0x47B0000047C,
+        0x47D0000047E,
+        0x47F00000480,
+        0x48100000482,
+        0x48300000488,
+        0x48B0000048C,
+        0x48D0000048E,
+        0x48F00000490,
+        0x49100000492,
+        0x49300000494,
+        0x49500000496,
+        0x49700000498,
+        0x4990000049A,
+        0x49B0000049C,
+        0x49D0000049E,
+        0x49F000004A0,
+        0x4A1000004A2,
+        0x4A3000004A4,
+        0x4A5000004A6,
+        0x4A7000004A8,
+        0x4A9000004AA,
+        0x4AB000004AC,
+        0x4AD000004AE,
+        0x4AF000004B0,
+        0x4B1000004B2,
+        0x4B3000004B4,
+        0x4B5000004B6,
+        0x4B7000004B8,
+        0x4B9000004BA,
+        0x4BB000004BC,
+        0x4BD000004BE,
+        0x4BF000004C0,
+        0x4C2000004C3,
+        0x4C4000004C5,
+        0x4C6000004C7,
+        0x4C8000004C9,
+        0x4CA000004CB,
+        0x4CC000004CD,
+        0x4CE000004D0,
+        0x4D1000004D2,
+        0x4D3000004D4,
+        0x4D5000004D6,
+        0x4D7000004D8,
+        0x4D9000004DA,
+        0x4DB000004DC,
+        0x4DD000004DE,
+        0x4DF000004E0,
+        0x4E1000004E2,
+        0x4E3000004E4,
+        0x4E5000004E6,
+        0x4E7000004E8,
+        0x4E9000004EA,
+        0x4EB000004EC,
+        0x4ED000004EE,
+        0x4EF000004F0,
+        0x4F1000004F2,
+        0x4F3000004F4,
+        0x4F5000004F6,
+        0x4F7000004F8,
+        0x4F9000004FA,
+        0x4FB000004FC,
+        0x4FD000004FE,
+        0x4FF00000500,
+        0x50100000502,
+        0x50300000504,
+        0x50500000506,
+        0x50700000508,
+        0x5090000050A,
+        0x50B0000050C,
+        0x50D0000050E,
+        0x50F00000510,
+        0x51100000512,
+        0x51300000514,
+        0x51500000516,
+        0x51700000518,
+        0x5190000051A,
+        0x51B0000051C,
+        0x51D0000051E,
+        0x51F00000520,
+        0x52100000522,
+        0x52300000524,
+        0x52500000526,
+        0x52700000528,
+        0x5290000052A,
+        0x52B0000052C,
+        0x52D0000052E,
+        0x52F00000530,
+        0x5590000055A,
+        0x56000000587,
+        0x58800000589,
+        0x591000005BE,
+        0x5BF000005C0,
+        0x5C1000005C3,
+        0x5C4000005C6,
+        0x5C7000005C8,
+        0x5D0000005EB,
+        0x5EF000005F3,
+        0x6100000061B,
+        0x62000000640,
+        0x64100000660,
+        0x66E00000675,
+        0x679000006D4,
+        0x6D5000006DD,
+        0x6DF000006E9,
+        0x6EA000006F0,
+        0x6FA00000700,
+        0x7100000074B,
+        0x74D000007B2,
+        0x7C0000007F6,
+        0x7FD000007FE,
+        0x8000000082E,
+        0x8400000085C,
+        0x8600000086B,
+        0x87000000888,
+        0x8890000088F,
+        0x897000008E2,
+        0x8E300000958,
+        0x96000000964,
+        0x96600000970,
+        0x97100000984,
+        0x9850000098D,
+        0x98F00000991,
+        0x993000009A9,
+        0x9AA000009B1,
+        0x9B2000009B3,
+        0x9B6000009BA,
+        0x9BC000009C5,
+        0x9C7000009C9,
+        0x9CB000009CF,
+        0x9D7000009D8,
+        0x9E0000009E4,
+        0x9E6000009F2,
+        0x9FC000009FD,
+        0x9FE000009FF,
+        0xA0100000A04,
+        0xA0500000A0B,
+        0xA0F00000A11,
+        0xA1300000A29,
+        0xA2A00000A31,
+        0xA3200000A33,
+        0xA3500000A36,
+        0xA3800000A3A,
+        0xA3C00000A3D,
+        0xA3E00000A43,
+        0xA4700000A49,
+        0xA4B00000A4E,
+        0xA5100000A52,
+        0xA5C00000A5D,
+        0xA6600000A76,
+        0xA8100000A84,
+        0xA8500000A8E,
+        0xA8F00000A92,
+        0xA9300000AA9,
+        0xAAA00000AB1,
+        0xAB200000AB4,
+        0xAB500000ABA,
+        0xABC00000AC6,
+        0xAC700000ACA,
+        0xACB00000ACE,
+        0xAD000000AD1,
+        0xAE000000AE4,
+        0xAE600000AF0,
+        0xAF900000B00,
+        0xB0100000B04,
+        0xB0500000B0D,
+        0xB0F00000B11,
+        0xB1300000B29,
+        0xB2A00000B31,
+        0xB3200000B34,
+        0xB3500000B3A,
+        0xB3C00000B45,
+        0xB4700000B49,
+        0xB4B00000B4E,
+        0xB5500000B58,
+        0xB5F00000B64,
+        0xB6600000B70,
+        0xB7100000B72,
+        0xB8200000B84,
+        0xB8500000B8B,
+        0xB8E00000B91,
+        0xB9200000B96,
+        0xB9900000B9B,
+        0xB9C00000B9D,
+        0xB9E00000BA0,
+        0xBA300000BA5,
+        0xBA800000BAB,
+        0xBAE00000BBA,
+        0xBBE00000BC3,
+        0xBC600000BC9,
+        0xBCA00000BCE,
+        0xBD000000BD1,
+        0xBD700000BD8,
+        0xBE600000BF0,
+        0xC0000000C0D,
+        0xC0E00000C11,
+        0xC1200000C29,
+        0xC2A00000C3A,
+        0xC3C00000C45,
+        0xC4600000C49,
+        0xC4A00000C4E,
+        0xC5500000C57,
+        0xC5800000C5B,
+        0xC5D00000C5E,
+        0xC6000000C64,
+        0xC6600000C70,
+        0xC8000000C84,
+        0xC8500000C8D,
+        0xC8E00000C91,
+        0xC9200000CA9,
+        0xCAA00000CB4,
+        0xCB500000CBA,
+        0xCBC00000CC5,
+        0xCC600000CC9,
+        0xCCA00000CCE,
+        0xCD500000CD7,
+        0xCDD00000CDF,
+        0xCE000000CE4,
+        0xCE600000CF0,
+        0xCF100000CF4,
+        0xD0000000D0D,
+        0xD0E00000D11,
+        0xD1200000D45,
+        0xD4600000D49,
+        0xD4A00000D4F,
+        0xD5400000D58,
+        0xD5F00000D64,
+        0xD6600000D70,
+        0xD7A00000D80,
+        0xD8100000D84,
+        0xD8500000D97,
+        0xD9A00000DB2,
+        0xDB300000DBC,
+        0xDBD00000DBE,
+        0xDC000000DC7,
+        0xDCA00000DCB,
+        0xDCF00000DD5,
+        0xDD600000DD7,
+        0xDD800000DE0,
+        0xDE600000DF0,
+        0xDF200000DF4,
+        0xE0100000E33,
+        0xE3400000E3B,
+        0xE4000000E4F,
+        0xE5000000E5A,
+        0xE8100000E83,
+        0xE8400000E85,
+        0xE8600000E8B,
+        0xE8C00000EA4,
+        0xEA500000EA6,
+        0xEA700000EB3,
+        0xEB400000EBE,
+        0xEC000000EC5,
+        0xEC600000EC7,
+        0xEC800000ECF,
+        0xED000000EDA,
+        0xEDE00000EE0,
+        0xF0000000F01,
+        0xF0B00000F0C,
+        0xF1800000F1A,
+        0xF2000000F2A,
+        0xF3500000F36,
+        0xF3700000F38,
+        0xF3900000F3A,
+        0xF3E00000F43,
+        0xF4400000F48,
+        0xF4900000F4D,
+        0xF4E00000F52,
+        0xF5300000F57,
+        0xF5800000F5C,
+        0xF5D00000F69,
+        0xF6A00000F6D,
+        0xF7100000F73,
+        0xF7400000F75,
+        0xF7A00000F81,
+        0xF8200000F85,
+        0xF8600000F93,
+        0xF9400000F98,
+        0xF9900000F9D,
+        0xF9E00000FA2,
+        0xFA300000FA7,
+        0xFA800000FAC,
+        0xFAD00000FB9,
+        0xFBA00000FBD,
+        0xFC600000FC7,
+        0x10000000104A,
+        0x10500000109E,
+        0x10D0000010FB,
+        0x10FD00001100,
+        0x120000001249,
+        0x124A0000124E,
+        0x125000001257,
+        0x125800001259,
+        0x125A0000125E,
+        0x126000001289,
+        0x128A0000128E,
+        0x1290000012B1,
+        0x12B2000012B6,
+        0x12B8000012BF,
+        0x12C0000012C1,
+        0x12C2000012C6,
+        0x12C8000012D7,
+        0x12D800001311,
+        0x131200001316,
+        0x13180000135B,
+        0x135D00001360,
+        0x138000001390,
+        0x13A0000013F6,
+        0x14010000166D,
+        0x166F00001680,
+        0x16810000169B,
+        0x16A0000016EB,
+        0x16F1000016F9,
+        0x170000001716,
+        0x171F00001735,
+        0x174000001754,
+        0x17600000176D,
+        0x176E00001771,
+        0x177200001774,
+        0x1780000017B4,
+        0x17B6000017D4,
+        0x17D7000017D8,
+        0x17DC000017DE,
+        0x17E0000017EA,
+        0x18100000181A,
+        0x182000001879,
+        0x1880000018AB,
+        0x18B0000018F6,
+        0x19000000191F,
+        0x19200000192C,
+        0x19300000193C,
+        0x19460000196E,
+        0x197000001975,
+        0x1980000019AC,
+        0x19B0000019CA,
+        0x19D0000019DA,
+        0x1A0000001A1C,
+        0x1A2000001A5F,
+        0x1A6000001A7D,
+        0x1A7F00001A8A,
+        0x1A9000001A9A,
+        0x1AA700001AA8,
+        0x1AB000001ABE,
+        0x1ABF00001ACF,
+        0x1B0000001B4D,
+        0x1B5000001B5A,
+        0x1B6B00001B74,
+        0x1B8000001BF4,
+        0x1C0000001C38,
+        0x1C4000001C4A,
+        0x1C4D00001C7E,
+        0x1C8A00001C8B,
+        0x1CD000001CD3,
+        0x1CD400001CFB,
+        0x1D0000001D2C,
+        0x1D2F00001D30,
+        0x1D3B00001D3C,
+        0x1D4E00001D4F,
+        0x1D6B00001D78,
+        0x1D7900001D9B,
+        0x1DC000001E00,
+        0x1E0100001E02,
+        0x1E0300001E04,
+        0x1E0500001E06,
+        0x1E0700001E08,
+        0x1E0900001E0A,
+        0x1E0B00001E0C,
+        0x1E0D00001E0E,
+        0x1E0F00001E10,
+        0x1E1100001E12,
+        0x1E1300001E14,
+        0x1E1500001E16,
+        0x1E1700001E18,
+        0x1E1900001E1A,
+        0x1E1B00001E1C,
+        0x1E1D00001E1E,
+        0x1E1F00001E20,
+        0x1E2100001E22,
+        0x1E2300001E24,
+        0x1E2500001E26,
+        0x1E2700001E28,
+        0x1E2900001E2A,
+        0x1E2B00001E2C,
+        0x1E2D00001E2E,
+        0x1E2F00001E30,
+        0x1E3100001E32,
+        0x1E3300001E34,
+        0x1E3500001E36,
+        0x1E3700001E38,
+        0x1E3900001E3A,
+        0x1E3B00001E3C,
+        0x1E3D00001E3E,
+        0x1E3F00001E40,
+        0x1E4100001E42,
+        0x1E4300001E44,
+        0x1E4500001E46,
+        0x1E4700001E48,
+        0x1E4900001E4A,
+        0x1E4B00001E4C,
+        0x1E4D00001E4E,
+        0x1E4F00001E50,
+        0x1E5100001E52,
+        0x1E5300001E54,
+        0x1E5500001E56,
+        0x1E5700001E58,
+        0x1E5900001E5A,
+        0x1E5B00001E5C,
+        0x1E5D00001E5E,
+        0x1E5F00001E60,
+        0x1E6100001E62,
+        0x1E6300001E64,
+        0x1E6500001E66,
+        0x1E6700001E68,
+        0x1E6900001E6A,
+        0x1E6B00001E6C,
+        0x1E6D00001E6E,
+        0x1E6F00001E70,
+        0x1E7100001E72,
+        0x1E7300001E74,
+        0x1E7500001E76,
+        0x1E7700001E78,
+        0x1E7900001E7A,
+        0x1E7B00001E7C,
+        0x1E7D00001E7E,
+        0x1E7F00001E80,
+        0x1E8100001E82,
+        0x1E8300001E84,
+        0x1E8500001E86,
+        0x1E8700001E88,
+        0x1E8900001E8A,
+        0x1E8B00001E8C,
+        0x1E8D00001E8E,
+        0x1E8F00001E90,
+        0x1E9100001E92,
+        0x1E9300001E94,
+        0x1E9500001E9A,
+        0x1E9C00001E9E,
+        0x1E9F00001EA0,
+        0x1EA100001EA2,
+        0x1EA300001EA4,
+        0x1EA500001EA6,
+        0x1EA700001EA8,
+        0x1EA900001EAA,
+        0x1EAB00001EAC,
+        0x1EAD00001EAE,
+        0x1EAF00001EB0,
+        0x1EB100001EB2,
+        0x1EB300001EB4,
+        0x1EB500001EB6,
+        0x1EB700001EB8,
+        0x1EB900001EBA,
+        0x1EBB00001EBC,
+        0x1EBD00001EBE,
+        0x1EBF00001EC0,
+        0x1EC100001EC2,
+        0x1EC300001EC4,
+        0x1EC500001EC6,
+        0x1EC700001EC8,
+        0x1EC900001ECA,
+        0x1ECB00001ECC,
+        0x1ECD00001ECE,
+        0x1ECF00001ED0,
+        0x1ED100001ED2,
+        0x1ED300001ED4,
+        0x1ED500001ED6,
+        0x1ED700001ED8,
+        0x1ED900001EDA,
+        0x1EDB00001EDC,
+        0x1EDD00001EDE,
+        0x1EDF00001EE0,
+        0x1EE100001EE2,
+        0x1EE300001EE4,
+        0x1EE500001EE6,
+        0x1EE700001EE8,
+        0x1EE900001EEA,
+        0x1EEB00001EEC,
+        0x1EED00001EEE,
+        0x1EEF00001EF0,
+        0x1EF100001EF2,
+        0x1EF300001EF4,
+        0x1EF500001EF6,
+        0x1EF700001EF8,
+        0x1EF900001EFA,
+        0x1EFB00001EFC,
+        0x1EFD00001EFE,
+        0x1EFF00001F08,
+        0x1F1000001F16,
+        0x1F2000001F28,
+        0x1F3000001F38,
+        0x1F4000001F46,
+        0x1F5000001F58,
+        0x1F6000001F68,
+        0x1F7000001F71,
+        0x1F7200001F73,
+        0x1F7400001F75,
+        0x1F7600001F77,
+        0x1F7800001F79,
+        0x1F7A00001F7B,
+        0x1F7C00001F7D,
+        0x1FB000001FB2,
+        0x1FB600001FB7,
+        0x1FC600001FC7,
+        0x1FD000001FD3,
+        0x1FD600001FD8,
+        0x1FE000001FE3,
+        0x1FE400001FE8,
+        0x1FF600001FF7,
+        0x214E0000214F,
+        0x218400002185,
+        0x2C3000002C60,
+        0x2C6100002C62,
+        0x2C6500002C67,
+        0x2C6800002C69,
+        0x2C6A00002C6B,
+        0x2C6C00002C6D,
+        0x2C7100002C72,
+        0x2C7300002C75,
+        0x2C7600002C7C,
+        0x2C8100002C82,
+        0x2C8300002C84,
+        0x2C8500002C86,
+        0x2C8700002C88,
+        0x2C8900002C8A,
+        0x2C8B00002C8C,
+        0x2C8D00002C8E,
+        0x2C8F00002C90,
+        0x2C9100002C92,
+        0x2C9300002C94,
+        0x2C9500002C96,
+        0x2C9700002C98,
+        0x2C9900002C9A,
+        0x2C9B00002C9C,
+        0x2C9D00002C9E,
+        0x2C9F00002CA0,
+        0x2CA100002CA2,
+        0x2CA300002CA4,
+        0x2CA500002CA6,
+        0x2CA700002CA8,
+        0x2CA900002CAA,
+        0x2CAB00002CAC,
+        0x2CAD00002CAE,
+        0x2CAF00002CB0,
+        0x2CB100002CB2,
+        0x2CB300002CB4,
+        0x2CB500002CB6,
+        0x2CB700002CB8,
+        0x2CB900002CBA,
+        0x2CBB00002CBC,
+        0x2CBD00002CBE,
+        0x2CBF00002CC0,
+        0x2CC100002CC2,
+        0x2CC300002CC4,
+        0x2CC500002CC6,
+        0x2CC700002CC8,
+        0x2CC900002CCA,
+        0x2CCB00002CCC,
+        0x2CCD00002CCE,
+        0x2CCF00002CD0,
+        0x2CD100002CD2,
+        0x2CD300002CD4,
+        0x2CD500002CD6,
+        0x2CD700002CD8,
+        0x2CD900002CDA,
+        0x2CDB00002CDC,
+        0x2CDD00002CDE,
+        0x2CDF00002CE0,
+        0x2CE100002CE2,
+        0x2CE300002CE5,
+        0x2CEC00002CED,
+        0x2CEE00002CF2,
+        0x2CF300002CF4,
+        0x2D0000002D26,
+        0x2D2700002D28,
+        0x2D2D00002D2E,
+        0x2D3000002D68,
+        0x2D7F00002D97,
+        0x2DA000002DA7,
+        0x2DA800002DAF,
+        0x2DB000002DB7,
+        0x2DB800002DBF,
+        0x2DC000002DC7,
+        0x2DC800002DCF,
+        0x2DD000002DD7,
+        0x2DD800002DDF,
+        0x2DE000002E00,
+        0x2E2F00002E30,
+        0x300500003008,
+        0x302A0000302E,
+        0x303C0000303D,
+        0x304100003097,
+        0x30990000309B,
+        0x309D0000309F,
+        0x30A1000030FB,
+        0x30FC000030FF,
+        0x310500003130,
+        0x31A0000031C0,
+        0x31F000003200,
+        0x340000004DC0,
+        0x4E000000A48D,
+        0xA4D00000A4FE,
+        0xA5000000A60D,
+        0xA6100000A62C,
+        0xA6410000A642,
+        0xA6430000A644,
+        0xA6450000A646,
+        0xA6470000A648,
+        0xA6490000A64A,
+        0xA64B0000A64C,
+        0xA64D0000A64E,
+        0xA64F0000A650,
+        0xA6510000A652,
+        0xA6530000A654,
+        0xA6550000A656,
+        0xA6570000A658,
+        0xA6590000A65A,
+        0xA65B0000A65C,
+        0xA65D0000A65E,
+        0xA65F0000A660,
+        0xA6610000A662,
+        0xA6630000A664,
+        0xA6650000A666,
+        0xA6670000A668,
+        0xA6690000A66A,
+        0xA66B0000A66C,
+        0xA66D0000A670,
+        0xA6740000A67E,
+        0xA67F0000A680,
+        0xA6810000A682,
+        0xA6830000A684,
+        0xA6850000A686,
+        0xA6870000A688,
+        0xA6890000A68A,
+        0xA68B0000A68C,
+        0xA68D0000A68E,
+        0xA68F0000A690,
+        0xA6910000A692,
+        0xA6930000A694,
+        0xA6950000A696,
+        0xA6970000A698,
+        0xA6990000A69A,
+        0xA69B0000A69C,
+        0xA69E0000A6E6,
+        0xA6F00000A6F2,
+        0xA7170000A720,
+        0xA7230000A724,
+        0xA7250000A726,
+        0xA7270000A728,
+        0xA7290000A72A,
+        0xA72B0000A72C,
+        0xA72D0000A72E,
+        0xA72F0000A732,
+        0xA7330000A734,
+        0xA7350000A736,
+        0xA7370000A738,
+        0xA7390000A73A,
+        0xA73B0000A73C,
+        0xA73D0000A73E,
+        0xA73F0000A740,
+        0xA7410000A742,
+        0xA7430000A744,
+        0xA7450000A746,
+        0xA7470000A748,
+        0xA7490000A74A,
+        0xA74B0000A74C,
+        0xA74D0000A74E,
+        0xA74F0000A750,
+        0xA7510000A752,
+        0xA7530000A754,
+        0xA7550000A756,
+        0xA7570000A758,
+        0xA7590000A75A,
+        0xA75B0000A75C,
+        0xA75D0000A75E,
+        0xA75F0000A760,
+        0xA7610000A762,
+        0xA7630000A764,
+        0xA7650000A766,
+        0xA7670000A768,
+        0xA7690000A76A,
+        0xA76B0000A76C,
+        0xA76D0000A76E,
+        0xA76F0000A770,
+        0xA7710000A779,
+        0xA77A0000A77B,
+        0xA77C0000A77D,
+        0xA77F0000A780,
+        0xA7810000A782,
+        0xA7830000A784,
+        0xA7850000A786,
+        0xA7870000A789,
+        0xA78C0000A78D,
+        0xA78E0000A790,
+        0xA7910000A792,
+        0xA7930000A796,
+        0xA7970000A798,
+        0xA7990000A79A,
+        0xA79B0000A79C,
+        0xA79D0000A79E,
+        0xA79F0000A7A0,
+        0xA7A10000A7A2,
+        0xA7A30000A7A4,
+        0xA7A50000A7A6,
+        0xA7A70000A7A8,
+        0xA7A90000A7AA,
+        0xA7AF0000A7B0,
+        0xA7B50000A7B6,
+        0xA7B70000A7B8,
+        0xA7B90000A7BA,
+        0xA7BB0000A7BC,
+        0xA7BD0000A7BE,
+        0xA7BF0000A7C0,
+        0xA7C10000A7C2,
+        0xA7C30000A7C4,
+        0xA7C80000A7C9,
+        0xA7CA0000A7CB,
+        0xA7CD0000A7CE,
+        0xA7D10000A7D2,
+        0xA7D30000A7D4,
+        0xA7D50000A7D6,
+        0xA7D70000A7D8,
+        0xA7D90000A7DA,
+        0xA7DB0000A7DC,
+        0xA7F60000A7F8,
+        0xA7FA0000A828,
+        0xA82C0000A82D,
+        0xA8400000A874,
+        0xA8800000A8C6,
+        0xA8D00000A8DA,
+        0xA8E00000A8F8,
+        0xA8FB0000A8FC,
+        0xA8FD0000A92E,
+        0xA9300000A954,
+        0xA9800000A9C1,
+        0xA9CF0000A9DA,
+        0xA9E00000A9FF,
+        0xAA000000AA37,
+        0xAA400000AA4E,
+        0xAA500000AA5A,
+        0xAA600000AA77,
+        0xAA7A0000AAC3,
+        0xAADB0000AADE,
+        0xAAE00000AAF0,
+        0xAAF20000AAF7,
+        0xAB010000AB07,
+        0xAB090000AB0F,
+        0xAB110000AB17,
+        0xAB200000AB27,
+        0xAB280000AB2F,
+        0xAB300000AB5B,
+        0xAB600000AB69,
+        0xABC00000ABEB,
+        0xABEC0000ABEE,
+        0xABF00000ABFA,
+        0xAC000000D7A4,
+        0xFA0E0000FA10,
+        0xFA110000FA12,
+        0xFA130000FA15,
+        0xFA1F0000FA20,
+        0xFA210000FA22,
+        0xFA230000FA25,
+        0xFA270000FA2A,
+        0xFB1E0000FB1F,
+        0xFE200000FE30,
+        0xFE730000FE74,
+        0x100000001000C,
+        0x1000D00010027,
+        0x100280001003B,
+        0x1003C0001003E,
+        0x1003F0001004E,
+        0x100500001005E,
+        0x10080000100FB,
+        0x101FD000101FE,
+        0x102800001029D,
+        0x102A0000102D1,
+        0x102E0000102E1,
+        0x1030000010320,
+        0x1032D00010341,
+        0x103420001034A,
+        0x103500001037B,
+        0x103800001039E,
+        0x103A0000103C4,
+        0x103C8000103D0,
+        0x104280001049E,
+        0x104A0000104AA,
+        0x104D8000104FC,
+        0x1050000010528,
+        0x1053000010564,
+        0x10597000105A2,
+        0x105A3000105B2,
+        0x105B3000105BA,
+        0x105BB000105BD,
+        0x105C0000105F4,
+        0x1060000010737,
+        0x1074000010756,
+        0x1076000010768,
+        0x1078000010781,
+        0x1080000010806,
+        0x1080800010809,
+        0x1080A00010836,
+        0x1083700010839,
+        0x1083C0001083D,
+        0x1083F00010856,
+        0x1086000010877,
+        0x108800001089F,
+        0x108E0000108F3,
+        0x108F4000108F6,
+        0x1090000010916,
+        0x109200001093A,
+        0x10980000109B8,
+        0x109BE000109C0,
+        0x10A0000010A04,
+        0x10A0500010A07,
+        0x10A0C00010A14,
+        0x10A1500010A18,
+        0x10A1900010A36,
+        0x10A3800010A3B,
+        0x10A3F00010A40,
+        0x10A6000010A7D,
+        0x10A8000010A9D,
+        0x10AC000010AC8,
+        0x10AC900010AE7,
+        0x10B0000010B36,
+        0x10B4000010B56,
+        0x10B6000010B73,
+        0x10B8000010B92,
+        0x10C0000010C49,
+        0x10CC000010CF3,
+        0x10D0000010D28,
+        0x10D3000010D3A,
+        0x10D4000010D50,
+        0x10D6900010D6E,
+        0x10D6F00010D86,
+        0x10E8000010EAA,
+        0x10EAB00010EAD,
+        0x10EB000010EB2,
+        0x10EC200010EC5,
+        0x10EFC00010F1D,
+        0x10F2700010F28,
+        0x10F3000010F51,
+        0x10F7000010F86,
+        0x10FB000010FC5,
+        0x10FE000010FF7,
+        0x1100000011047,
+        0x1106600011076,
+        0x1107F000110BB,
+        0x110C2000110C3,
+        0x110D0000110E9,
+        0x110F0000110FA,
+        0x1110000011135,
+        0x1113600011140,
+        0x1114400011148,
+        0x1115000011174,
+        0x1117600011177,
+        0x11180000111C5,
+        0x111C9000111CD,
+        0x111CE000111DB,
+        0x111DC000111DD,
+        0x1120000011212,
+        0x1121300011238,
+        0x1123E00011242,
+        0x1128000011287,
+        0x1128800011289,
+        0x1128A0001128E,
+        0x1128F0001129E,
+        0x1129F000112A9,
+        0x112B0000112EB,
+        0x112F0000112FA,
+        0x1130000011304,
+        0x113050001130D,
+        0x1130F00011311,
+        0x1131300011329,
+        0x1132A00011331,
+        0x1133200011334,
+        0x113350001133A,
+        0x1133B00011345,
+        0x1134700011349,
+        0x1134B0001134E,
+        0x1135000011351,
+        0x1135700011358,
+        0x1135D00011364,
+        0x113660001136D,
+        0x1137000011375,
+        0x113800001138A,
+        0x1138B0001138C,
+        0x1138E0001138F,
+        0x11390000113B6,
+        0x113B7000113C1,
+        0x113C2000113C3,
+        0x113C5000113C6,
+        0x113C7000113CB,
+        0x113CC000113D4,
+        0x113E1000113E3,
+        0x114000001144B,
+        0x114500001145A,
+        0x1145E00011462,
+        0x11480000114C6,
+        0x114C7000114C8,
+        0x114D0000114DA,
+        0x11580000115B6,
+        0x115B8000115C1,
+        0x115D8000115DE,
+        0x1160000011641,
+        0x1164400011645,
+        0x116500001165A,
+        0x11680000116B9,
+        0x116C0000116CA,
+        0x116D0000116E4,
+        0x117000001171B,
+        0x1171D0001172C,
+        0x117300001173A,
+        0x1174000011747,
+        0x118000001183B,
+        0x118C0000118EA,
+        0x118FF00011907,
+        0x119090001190A,
+        0x1190C00011914,
+        0x1191500011917,
+        0x1191800011936,
+        0x1193700011939,
+        0x1193B00011944,
+        0x119500001195A,
+        0x119A0000119A8,
+        0x119AA000119D8,
+        0x119DA000119E2,
+        0x119E3000119E5,
+        0x11A0000011A3F,
+        0x11A4700011A48,
+        0x11A5000011A9A,
+        0x11A9D00011A9E,
+        0x11AB000011AF9,
+        0x11BC000011BE1,
+        0x11BF000011BFA,
+        0x11C0000011C09,
+        0x11C0A00011C37,
+        0x11C3800011C41,
+        0x11C5000011C5A,
+        0x11C7200011C90,
+        0x11C9200011CA8,
+        0x11CA900011CB7,
+        0x11D0000011D07,
+        0x11D0800011D0A,
+        0x11D0B00011D37,
+        0x11D3A00011D3B,
+        0x11D3C00011D3E,
+        0x11D3F00011D48,
+        0x11D5000011D5A,
+        0x11D6000011D66,
+        0x11D6700011D69,
+        0x11D6A00011D8F,
+        0x11D9000011D92,
+        0x11D9300011D99,
+        0x11DA000011DAA,
+        0x11EE000011EF7,
+        0x11F0000011F11,
+        0x11F1200011F3B,
+        0x11F3E00011F43,
+        0x11F5000011F5B,
+        0x11FB000011FB1,
+        0x120000001239A,
+        0x1248000012544,
+        0x12F9000012FF1,
+        0x1300000013430,
+        0x1344000013456,
+        0x13460000143FB,
+        0x1440000014647,
+        0x161000001613A,
+        0x1680000016A39,
+        0x16A4000016A5F,
+        0x16A6000016A6A,
+        0x16A7000016ABF,
+        0x16AC000016ACA,
+        0x16AD000016AEE,
+        0x16AF000016AF5,
+        0x16B0000016B37,
+        0x16B4000016B44,
+        0x16B5000016B5A,
+        0x16B6300016B78,
+        0x16B7D00016B90,
+        0x16D4000016D6D,
+        0x16D7000016D7A,
+        0x16E6000016E80,
+        0x16F0000016F4B,
+        0x16F4F00016F88,
+        0x16F8F00016FA0,
+        0x16FE000016FE2,
+        0x16FE300016FE5,
+        0x16FF000016FF2,
+        0x17000000187F8,
+        0x1880000018CD6,
+        0x18CFF00018D09,
+        0x1AFF00001AFF4,
+        0x1AFF50001AFFC,
+        0x1AFFD0001AFFF,
+        0x1B0000001B123,
+        0x1B1320001B133,
+        0x1B1500001B153,
+        0x1B1550001B156,
+        0x1B1640001B168,
+        0x1B1700001B2FC,
+        0x1BC000001BC6B,
+        0x1BC700001BC7D,
+        0x1BC800001BC89,
+        0x1BC900001BC9A,
+        0x1BC9D0001BC9F,
+        0x1CCF00001CCFA,
+        0x1CF000001CF2E,
+        0x1CF300001CF47,
+        0x1DA000001DA37,
+        0x1DA3B0001DA6D,
+        0x1DA750001DA76,
+        0x1DA840001DA85,
+        0x1DA9B0001DAA0,
+        0x1DAA10001DAB0,
+        0x1DF000001DF1F,
+        0x1DF250001DF2B,
+        0x1E0000001E007,
+        0x1E0080001E019,
+        0x1E01B0001E022,
+        0x1E0230001E025,
+        0x1E0260001E02B,
+        0x1E08F0001E090,
+        0x1E1000001E12D,
+        0x1E1300001E13E,
+        0x1E1400001E14A,
+        0x1E14E0001E14F,
+        0x1E2900001E2AF,
+        0x1E2C00001E2FA,
+        0x1E4D00001E4FA,
+        0x1E5D00001E5FB,
+        0x1E7E00001E7E7,
+        0x1E7E80001E7EC,
+        0x1E7ED0001E7EF,
+        0x1E7F00001E7FF,
+        0x1E8000001E8C5,
+        0x1E8D00001E8D7,
+        0x1E9220001E94C,
+        0x1E9500001E95A,
+        0x200000002A6E0,
+        0x2A7000002B73A,
+        0x2B7400002B81E,
+        0x2B8200002CEA2,
+        0x2CEB00002EBE1,
+        0x2EBF00002EE5E,
+        0x300000003134B,
+        0x31350000323B0,
+    ),
+    "CONTEXTJ": (0x200C0000200E,),
+    "CONTEXTO": (
+        0xB7000000B8,
+        0x37500000376,
+        0x5F3000005F5,
+        0x6600000066A,
+        0x6F0000006FA,
+        0x30FB000030FC,
+    ),
+}

idna/intranges.py ADDED Viewed

	@@ -0,0 +1,57 @@

+"""
+Given a list of integers, made up of (hopefully) a small number of long runs
+of consecutive integers, compute a representation of the form
+((start1, end1), (start2, end2) ...). Then answer the question "was x present
+in the original list?" in time O(log(# runs)).
+"""
+import bisect
+from typing import List, Tuple
+def intranges_from_list(list_: List[int]) -> Tuple[int, ...]:
+    """Represent a list of integers as a sequence of ranges:
+    ((start_0, end_0), (start_1, end_1), ...), such that the original
+    integers are exactly those x such that start_i <= x < end_i for some i.
+    Ranges are encoded as single integers (start << 32 | end), not as tuples.
+    """
+    sorted_list = sorted(list_)
+    ranges = []
+    last_write = -1
+    for i in range(len(sorted_list)):
+        if i + 1 < len(sorted_list):
+            if sorted_list[i] == sorted_list[i + 1] - 1:
+                continue
+        current_range = sorted_list[last_write + 1 : i + 1]
+        ranges.append(_encode_range(current_range[0], current_range[-1] + 1))
+        last_write = i
+    return tuple(ranges)
+def _encode_range(start: int, end: int) -> int:
+    return (start << 32) | end
+def _decode_range(r: int) -> Tuple[int, int]:
+    return (r >> 32), (r & ((1 << 32) - 1))
+def intranges_contain(int_: int, ranges: Tuple[int, ...]) -> bool:
+    """Determine if `int_` falls into one of the ranges in `ranges`."""
+    tuple_ = _encode_range(int_, 0)
+    pos = bisect.bisect_left(ranges, tuple_)
+    # we could be immediately ahead of a tuple (start, end)
+    # with start < int_ <= end
+    if pos > 0:
+        left, right = _decode_range(ranges[pos - 1])
+        if left <= int_ < right:
+            return True
+    # or we could be immediately behind a tuple (int_, end)
+    if pos < len(ranges):
+        left, _ = _decode_range(ranges[pos])
+        if left == int_:
+            return True
+    return False

idna/package_data.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ __version__ = "3.11"

idna/py.typed ADDED Viewed

File without changes

idna/uts46data.py ADDED Viewed

The diff for this file is too large to render. See raw diff

importlib_metadata/__init__.py ADDED Viewed

	@@ -0,0 +1,1191 @@

+"""
+APIs exposing metadata from third-party Python packages.
+This codebase is shared between importlib.metadata in the stdlib
+and importlib_metadata in PyPI. See
+https://github.com/python/importlib_metadata/wiki/Development-Methodology
+for more detail.
+"""
+from __future__ import annotations
+import abc
+import collections
+import email
+import functools
+import itertools
+import operator
+import os
+import pathlib
+import posixpath
+import re
+import sys
+import textwrap
+import types
+from collections.abc import Iterable, Mapping
+from contextlib import suppress
+from importlib import import_module
+from importlib.abc import MetaPathFinder
+from itertools import starmap
+from typing import Any
+from . import _meta
+from ._collections import FreezableDefaultDict, Pair
+from ._compat import (
+    NullFinder,
+    install,
+)
+from ._functools import method_cache, noop, pass_none, passthrough
+from ._itertools import always_iterable, bucket, unique_everseen
+from ._meta import PackageMetadata, SimplePath
+from ._typing import md_none
+from .compat import py311
+# Names exported by ``from <this module> import *``; everything else in the
+# module is left out of the wildcard import.
+__all__ = [
+    'Distribution',
+    'DistributionFinder',
+    'PackageMetadata',
+    'PackageNotFoundError',
+    'SimplePath',
+    'distribution',
+    'distributions',
+    'entry_points',
+    'files',
+    'metadata',
+    'packages_distributions',
+    'requires',
+    'version',
+]
+class PackageNotFoundError(ModuleNotFoundError):
+    """The package was not found."""
+    def __str__(self) -> str:
+        return f"No package metadata was found for {self.name}"
+    @property
+    def name(self) -> str:  # type: ignore[override] # make readonly
+        (name,) = self.args
+        return name
+class Sectioned:
+    """
+    A simple entry point config parser for performance
+    >>> for item in Sectioned.read(Sectioned._sample):
+    ...     print(item)
+    Pair(name='sec1', value='# comments ignored')
+    Pair(name='sec1', value='a = 1')
+    Pair(name='sec1', value='b = 2')
+    Pair(name='sec2', value='a = 2')
+    >>> res = Sectioned.section_pairs(Sectioned._sample)
+    >>> item = next(res)
+    >>> item.name
+    'sec1'
+    >>> item.value
+    Pair(name='a', value='1')
+    >>> item = next(res)
+    >>> item.value
+    Pair(name='b', value='2')
+    >>> item = next(res)
+    >>> item.name
+    'sec2'
+    >>> item.value
+    Pair(name='a', value='2')
+    >>> list(res)
+    []
+    """
+    _sample = textwrap.dedent(
+        """
+        [sec1]
+        # comments ignored
+        a = 1
+        b = 2
+        [sec2]
+        a = 2
+        """
+    ).lstrip()
+    @classmethod
+    def section_pairs(cls, text):
+        return (
+            section._replace(value=Pair.parse(section.value))
+            for section in cls.read(text, filter_=cls.valid)
+            if section.name is not None
+        )
+    @staticmethod
+    def read(text, filter_=None):
+        lines = filter(filter_, map(str.strip, text.splitlines()))
+        name = None
+        for value in lines:
+            section_match = value.startswith('[') and value.endswith(']')
+            if section_match:
+                name = value.strip('[]')
+                continue
+            yield Pair(name, value)
+    @staticmethod
+    def valid(line: str):
+        return line and not line.startswith('#')
+class _EntryPointMatch(types.SimpleNamespace):
+    """Namespace for the named groups of ``EntryPoint.pattern``; built in
+    ``EntryPoint._match`` from ``match.groupdict()``."""
+    module: str
+    # NOTE(review): ``attr`` and ``extras`` come from optional groups in the
+    # pattern and callers guard them with ``or ''``, so they are presumably
+    # None when absent despite the ``str`` annotation.
+    attr: str
+    extras: str
+class EntryPoint:
+    """An entry point as defined by Python packaging conventions.
+    See `the packaging docs on entry points
+    <https://packaging.python.org/specifications/entry-points/>`_
+    for more information.
+    >>> ep = EntryPoint(
+    ...     name=None, group=None, value='package.module:attr [extra1, extra2]')
+    >>> ep.module
+    'package.module'
+    >>> ep.attr
+    'attr'
+    >>> ep.extras
+    ['extra1', 'extra2']
+    If the value package or module are not valid identifiers, a
+    ValueError is raised on access.
+    >>> EntryPoint(name=None, group=None, value='invalid-name').module
+    Traceback (most recent call last):
+    ...
+    ValueError: ('Invalid object reference...invalid-name...
+    >>> EntryPoint(name=None, group=None, value='invalid-name').attr
+    Traceback (most recent call last):
+    ...
+    ValueError: ('Invalid object reference...invalid-name...
+    >>> EntryPoint(name=None, group=None, value='invalid-name').extras
+    Traceback (most recent call last):
+    ...
+    ValueError: ('Invalid object reference...invalid-name...
+    The same thing happens on construction.
+    >>> EntryPoint(name=None, group=None, value='invalid-name')
+    Traceback (most recent call last):
+    ...
+    ValueError: ('Invalid object reference...invalid-name...
+    """
+    pattern = re.compile(
+        r'(?P<module>[\w.]+)\s*'
+        r'(:\s*(?P<attr>[\w.]+)\s*)?'
+        r'((?P<extras>\[.*\])\s*)?$'
+    )
+    """
+    A regular expression describing the syntax for an entry point,
+    which might look like:
+        - module
+        - package.module
+        - package.module:attribute
+        - package.module:object.attribute
+        - package.module:attr [extra1, extra2]
+    Other combinations are possible as well.
+    The expression is lenient about whitespace around the ':',
+    following the attr, and following any extras.
+    """
+    name: str
+    value: str
+    group: str
+    dist: Distribution | None = None
+    def __init__(self, name: str, value: str, group: str) -> None:
+        vars(self).update(name=name, value=value, group=group)
+        self.module
+    def load(self) -> Any:
+        """Load the entry point from its definition. If only a module
+        is indicated by the value, return that module. Otherwise,
+        return the named object.
+        """
+        module = import_module(self.module)
+        attrs = filter(None, (self.attr or '').split('.'))
+        return functools.reduce(getattr, attrs, module)
+    @property
+    def module(self) -> str:
+        return self._match.module
+    @property
+    def attr(self) -> str:
+        return self._match.attr
+    @property
+    def extras(self) -> list[str]:
+        return re.findall(r'\w+', self._match.extras or '')
+    @functools.cached_property
+    def _match(self) -> _EntryPointMatch:
+        match = self.pattern.match(self.value)
+        if not match:
+            raise ValueError(
+                'Invalid object reference. '
+                'See https://packaging.python.org'
+                '/en/latest/specifications/entry-points/#data-model',
+                self.value,
+            )
+        return _EntryPointMatch(**match.groupdict())
+    def _for(self, dist):
+        vars(self).update(dist=dist)
+        return self
+    def matches(self, **params):
+        """
+        EntryPoint matches the given parameters.
+        >>> ep = EntryPoint(group='foo', name='bar', value='bing:bong [extra1, extra2]')
+        >>> ep.matches(group='foo')
+        True
+        >>> ep.matches(name='bar', value='bing:bong [extra1, extra2]')
+        True
+        >>> ep.matches(group='foo', name='other')
+        False
+        >>> ep.matches()
+        True
+        >>> ep.matches(extras=['extra1', 'extra2'])
+        True
+        >>> ep.matches(module='bing')
+        True
+        >>> ep.matches(attr='bong')
+        True
+        """
+        self._disallow_dist(params)
+        attrs = (getattr(self, param) for param in params)
+        return all(map(operator.eq, params.values(), attrs))
+    @staticmethod
+    def _disallow_dist(params):
+        """
+        Querying by dist is not allowed (dist objects are not comparable).
+        >>> EntryPoint(name='fan', value='fav', group='fag').matches(dist='foo')
+        Traceback (most recent call last):
+        ...
+        ValueError: "dist" is not suitable for matching...
+        """
+        if "dist" in params:
+            raise ValueError(
+                '"dist" is not suitable for matching. '
+                "Instead, use Distribution.entry_points.select() on a "
+                "located distribution."
+            )
+    def _key(self):
+        return self.name, self.value, self.group
+    def __lt__(self, other):
+        return self._key() < other._key()
+    def __eq__(self, other):
+        return self._key() == other._key()
+    def __setattr__(self, name, value):
+        raise AttributeError("EntryPoint objects are immutable.")
+    def __repr__(self):
+        return (
+            f'EntryPoint(name={self.name!r}, value={self.value!r}, '
+            f'group={self.group!r})'
+        )
+    def __hash__(self) -> int:
+        return hash(self._key())
+class EntryPoints(tuple):
+    """
+    An immutable collection of selectable EntryPoint objects.
+    """
+    __slots__ = ()
+    def __getitem__(self, name: str) -> EntryPoint:  # type: ignore[override] # Work with str instead of int
+        """
+        Get the EntryPoint in self matching name.
+        """
+        try:
+            return next(iter(self.select(name=name)))
+        except StopIteration:
+            raise KeyError(name)
+    def __repr__(self):
+        """
+        Repr with classname and tuple constructor to
+        signal that we deviate from regular tuple behavior.
+        """
+        return '%s(%r)' % (self.__class__.__name__, tuple(self))
+    def select(self, **params) -> EntryPoints:
+        """
+        Select entry points from self that match the
+        given parameters (typically group and/or name).
+        """
+        return EntryPoints(ep for ep in self if ep.matches(**params))
+    @property
+    def names(self) -> set[str]:
+        """
+        Return the set of all names of all entry points.
+        """
+        return {ep.name for ep in self}
+    @property
+    def groups(self) -> set[str]:
+        """
+        Return the set of all groups of all entry points.
+        """
+        return {ep.group for ep in self}
+    @classmethod
+    def _from_text_for(cls, text, dist):
+        return cls(ep._for(dist) for ep in cls._from_text(text))
+    @staticmethod
+    def _from_text(text):
+        return (
+            EntryPoint(name=item.value.name, value=item.value.value, group=item.name)
+            for item in Sectioned.section_pairs(text or '')
+        )
+class PackagePath(pathlib.PurePosixPath):
+    """A reference to a path in a package"""
+    hash: FileHash | None
+    size: int
+    dist: Distribution
+    def read_text(self, encoding: str = 'utf-8') -> str:
+        return self.locate().read_text(encoding=encoding)
+    def read_binary(self) -> bytes:
+        return self.locate().read_bytes()
+    def locate(self) -> SimplePath:
+        """Return a path-like object for this path"""
+        return self.dist.locate_file(self)
+class FileHash:
+    def __init__(self, spec: str) -> None:
+        self.mode, _, self.value = spec.partition('=')
+    def __repr__(self) -> str:
+        return f'<FileHash mode: {self.mode} value: {self.value}>'
+class Distribution(metaclass=abc.ABCMeta):
+    """
+    An abstract Python distribution package.
+    Custom providers may derive from this class and define
+    the abstract methods to provide a concrete implementation
+    for their environment. Some providers may opt to override
+    the default implementation of some properties to bypass
+    the file-reading mechanism.
+    """
+    # Abstract hook: the body is only the docstring, so every concrete
+    # Distribution subclass has to supply its own implementation.
+    @abc.abstractmethod
+    def read_text(self, filename) -> str | None:
+        """Attempt to load metadata file given by the name.
+        Python distribution metadata is organized by blobs of text
+        typically represented as "files" in the metadata directory
+        (e.g. package-1.0.dist-info). These files include things
+        like:
+        - METADATA: The distribution metadata including fields
+          like Name and Version and Description.
+        - entry_points.txt: A series of entry points as defined in
+          `the entry points spec <https://packaging.python.org/en/latest/specifications/entry-points/#file-format>`_.
+        - RECORD: A record of files according to
+          `this recording spec <https://packaging.python.org/en/latest/specifications/recording-installed-packages/#the-record-file>`_.
+        A package may provide any set of files, including those
+        not listed here or none at all.
+        :param filename: The name of the file in the distribution info.
+        :return: The text if found, otherwise None.
+        """
+    # Abstract hook: the body is only the docstring, so every concrete
+    # Distribution subclass has to supply its own implementation.
+    @abc.abstractmethod
+    def locate_file(self, path: str | os.PathLike[str]) -> SimplePath:
+        """
+        Given a path to a file in this distribution, return a SimplePath
+        to it.
+        This method is used by callers of ``Distribution.files()`` to
+        locate files within the distribution. If it's possible for a
+        Distribution to represent files in the distribution as
+        ``SimplePath`` objects, it should implement this method
+        to resolve such objects.
+        Some Distribution providers may elect not to resolve SimplePath
+        objects within the distribution by raising a
+        NotImplementedError, but consumers of such a Distribution would
+        be unable to invoke ``Distribution.files()``.
+        :param path: The path of a file in this distribution, as a string
+            or path-like object.
+        :return: A ``SimplePath`` for that file.
+        """
+    @classmethod
+    def from_name(cls, name: str) -> Distribution:
+        """Return the Distribution for the given package name.
+        :param name: The name of the distribution package to search for.
+        :return: The Distribution instance (or subclass thereof) for the named
+            package, if found.
+        :raises PackageNotFoundError: When the named package's distribution
+            metadata cannot be found.
+        :raises ValueError: When an invalid value is supplied for name.
+        """
+        if not name:
+            raise ValueError("A distribution name is required.")
+        try:
+            return next(iter(cls._prefer_valid(cls.discover(name=name))))
+        except StopIteration:
+            raise PackageNotFoundError(name)
+    @classmethod
+    def discover(
+        cls, *, context: DistributionFinder.Context | None = None, **kwargs
+    ) -> Iterable[Distribution]:
+        """Return an iterable of Distribution objects for all packages.
+        Pass a ``context`` or pass keyword arguments for constructing
+        a context.
+        :context: A ``DistributionFinder.Context`` object.
+        :return: Iterable of Distribution objects for packages matching
+          the context.
+        """
+        if context and kwargs:
+            raise ValueError("cannot accept context and kwargs")
+        context = context or DistributionFinder.Context(**kwargs)
+        return itertools.chain.from_iterable(
+            resolver(context) for resolver in cls._discover_resolvers()
+        )
+    @staticmethod
+    def _prefer_valid(dists: Iterable[Distribution]) -> Iterable[Distribution]:
+        """
+        Prefer (move to the front) distributions that have metadata.
+        Ref python/importlib_resources#489.
+        """
+        buckets = bucket(dists, lambda dist: bool(dist.metadata))
+        return itertools.chain(buckets[True], buckets[False])
+    @staticmethod
+    def at(path: str | os.PathLike[str]) -> Distribution:
+        """Return a Distribution for the indicated metadata path.
+        :param path: a string or path-like object
+        :return: a concrete Distribution instance for the path
+        """
+        return PathDistribution(pathlib.Path(path))
+    @staticmethod
+    def _discover_resolvers():
+        """Search the meta_path for resolvers (MetadataPathFinders)."""
+        declared = (
+            getattr(finder, 'find_distributions', None) for finder in sys.meta_path
+        )
+        return filter(None, declared)
+    @property
+    def metadata(self) -> _meta.PackageMetadata | None:
+        """Return the parsed metadata for this Distribution.
+        The returned object will have keys that name the various bits of
+        metadata per the
+        `Core metadata specifications <https://packaging.python.org/en/latest/specifications/core-metadata/#core-metadata>`_.
+        Custom providers may provide the METADATA file or override this
+        property.
+        """
+        text = (
+            self.read_text('METADATA')
+            or self.read_text('PKG-INFO')
+            # This last clause is here to support old egg-info files.  Its
+            # effect is to just end up using the PathDistribution's self._path
+            # (which points to the egg-info file) attribute unchanged.
+            or self.read_text('')
+        )
+        return self._assemble_message(text)
+    @staticmethod
+    @pass_none
+    def _assemble_message(text: str) -> _meta.PackageMetadata:
+        # deferred for performance (python/cpython#109829)
+        from . import _adapters
+        return _adapters.Message(email.message_from_string(text))
+    @property
+    def name(self) -> str:
+        """Return the 'Name' metadata for the distribution package."""
+        return md_none(self.metadata)['Name']
+    @property
+    def _normalized_name(self):
+        """Return a normalized version of the name."""
+        return Prepared.normalize(self.name)
+    @property
+    def version(self) -> str:
+        """Return the 'Version' metadata for the distribution package."""
+        return md_none(self.metadata)['Version']
+    @property
+    def entry_points(self) -> EntryPoints:
+        """
+        Return EntryPoints for this distribution.
+        Custom providers may provide the ``entry_points.txt`` file
+        or override this property.
+        """
+        return EntryPoints._from_text_for(self.read_text('entry_points.txt'), self)
+    @property
+    def files(self) -> list[PackagePath] | None:
+        """Files in this distribution.
+        :return: List of PackagePath for this distribution or None
+        Result is `None` if the metadata file that enumerates files
+        (i.e. RECORD for dist-info, or installed-files.txt or
+        SOURCES.txt for egg-info) is missing.
+        Result may be empty if the metadata exists but is empty.
+        Custom providers are recommended to provide a "RECORD" file (in
+        ``read_text``) or override this property to allow for callers to be
+        able to resolve filenames provided by the package.
+        """
+        def make_file(name, hash=None, size_str=None):
+            result = PackagePath(name)
+            result.hash = FileHash(hash) if hash else None
+            result.size = int(size_str) if size_str else None
+            result.dist = self
+            return result
+        @pass_none
+        def make_files(lines):
+            # Delay csv import, since Distribution.files is not as widely used
+            # as other parts of importlib.metadata
+            import csv
+            return starmap(make_file, csv.reader(lines))
+        @pass_none
+        def skip_missing_files(package_paths):
+            return list(filter(lambda path: path.locate().exists(), package_paths))
+        return skip_missing_files(
+            make_files(
+                self._read_files_distinfo()
+                or self._read_files_egginfo_installed()
+                or self._read_files_egginfo_sources()
+            )
+        )
+    def _read_files_distinfo(self):
+        """
+        Read the lines of RECORD.
+        """
+        text = self.read_text('RECORD')
+        return text and text.splitlines()
+    def _read_files_egginfo_installed(self):
+        """
+        Read installed-files.txt and return lines in a similar
+        CSV-parsable format as RECORD: each file must be placed
+        relative to the site-packages directory and must also be
+        quoted (since file names can contain literal commas).
+        This file is written when the package is installed by pip,
+        but it might not be written for other installation methods.
+        Assume the file is accurate if it exists.
+        """
+        text = self.read_text('installed-files.txt')
+        # Prepend the .egg-info/ subdir to the lines in this file.
+        # But this subdir is only available from PathDistribution's
+        # self._path.
+        subdir = getattr(self, '_path', None)
+        if not text or not subdir:
+            return
+        paths = (
+            py311
+            .relative_fix((subdir / name).resolve())
+            .relative_to(self.locate_file('').resolve(), walk_up=True)
+            .as_posix()
+            for name in text.splitlines()
+        )
+        return map('"{}"'.format, paths)
+    def _read_files_egginfo_sources(self):
+        """
+        Read SOURCES.txt and return lines in a similar CSV-parsable
+        format as RECORD: each file name must be quoted (since it
+        might contain literal commas).
+        Note that SOURCES.txt is not a reliable source for what
+        files are installed by a package. This file is generated
+        for a source archive, and the files that are present
+        there (e.g. setup.py) may not correctly reflect the files
+        that are present after the package has been installed.
+        """
+        text = self.read_text('SOURCES.txt')
+        return text and map('"{}"'.format, text.splitlines())
+    @property
+    def requires(self) -> list[str] | None:
+        """Generated requirements specified for this Distribution"""
+        reqs = self._read_dist_info_reqs() or self._read_egg_info_reqs()
+        return reqs and list(reqs)
+    def _read_dist_info_reqs(self):
+        return self.metadata.get_all('Requires-Dist')
+    def _read_egg_info_reqs(self):
+        source = self.read_text('requires.txt')
+        return pass_none(self._deps_from_requires_text)(source)
+    @classmethod
+    def _deps_from_requires_text(cls, source):
+        return cls._convert_egg_info_reqs_to_simple_reqs(Sectioned.read(source))
+    @staticmethod
+    def _convert_egg_info_reqs_to_simple_reqs(sections):
+        """
+        Historically, setuptools would solicit and store 'extra'
+        requirements, including those with environment markers,
+        in separate sections. More modern tools expect each
+        dependency to be defined separately, with any relevant
+        extras and environment markers attached directly to that
+        requirement. This method converts the former to the
+        latter. See _test_deps_from_requires_text for an example.
+        """
+        def make_condition(name):
+            return name and f'extra == "{name}"'
+        def quoted_marker(section):
+            section = section or ''
+            extra, sep, markers = section.partition(':')
+            if extra and markers:
+                markers = f'({markers})'
+            conditions = list(filter(None, [markers, make_condition(extra)]))
+            return '; ' + ' and '.join(conditions) if conditions else ''
+        def url_req_space(req):
+            """
+            PEP 508 requires a space between the url_spec and the quoted_marker.
+            Ref python/importlib_metadata#357.
+            """
+            # '@' is uniquely indicative of a url_req.
+            return ' ' * ('@' in req)
+        for section in sections:
+            space = url_req_space(section.value)
+            yield section.value + space + quoted_marker(section.name)
+    @property
+    def origin(self):
+        return self._load_json('direct_url.json')
+    def _load_json(self, filename):
+        # Deferred for performance (python/importlib_metadata#503)
+        import json
+        return pass_none(json.loads)(
+            self.read_text(filename),
+            object_hook=lambda data: types.SimpleNamespace(**data),
+        )
+class DistributionFinder(MetaPathFinder):
+    """
+    A MetaPathFinder capable of discovering installed distributions.
+    Custom providers should implement this interface in order to
+    supply metadata.
+    """
+    class Context:
+        """
+        Keyword arguments presented by the caller to
+        ``distributions()`` or ``Distribution.discover()``
+        to narrow the scope of a search for distributions
+        in all DistributionFinders.
+        Each DistributionFinder may expect any parameters
+        and should attempt to honor the canonical
+        parameters defined below when appropriate.
+        This mechanism gives a custom provider a means to
+        solicit additional details from the caller beyond
+        "name" and "path" when searching distributions.
+        For example, imagine a provider that exposes suites
+        of packages in either a "public" or "private" ``realm``.
+        A caller may wish to query only for distributions in
+        a particular realm and could call
+        ``distributions(realm="private")`` to signal to the
+        custom provider to only include distributions from that
+        realm.
+        """
+        name = None
+        """
+        Specific name for which a distribution finder should match.
+        A name of ``None`` matches all distributions.
+        """
+        def __init__(self, **kwargs):
+            vars(self).update(kwargs)
+        @property
+        def path(self) -> list[str]:
+            """
+            The sequence of directory path that a distribution finder
+            should search.
+            Typically refers to Python installed package paths such as
+            "site-packages" directories and defaults to ``sys.path``.
+            """
+            return vars(self).get('path', sys.path)
+    @abc.abstractmethod
+    def find_distributions(self, context=Context()) -> Iterable[Distribution]:
+        """
+        Find distributions.
+        Return an iterable of all Distribution instances capable of
+        loading the metadata for packages matching the ``context``,
+        a DistributionFinder.Context instance.
+        """
+@passthrough
+def _clear_after_fork(cached):
+    """Ensure ``func`` clears cached state after ``fork`` when supported.
+    ``FastPath`` caches zip-backed ``pathlib.Path`` objects that retain a
+    reference to the parent's open ``ZipFile`` handle. Re-using a cached
+    instance in a forked child can therefore resurrect invalid file pointers
+    and trigger ``BadZipFile``/``OSError`` failures (python/importlib_metadata#520).
+    Registering ``cache_clear`` with ``os.register_at_fork`` keeps each process
+    on its own cache.
+    """
+    getattr(os, 'register_at_fork', noop)(after_in_child=cached.cache_clear)
+class FastPath:
+    """
+    Micro-optimized class for searching a root for children.
+    Root is a path on the file system that may contain metadata
+    directories either as natural directories or within a zip file.
+    >>> FastPath('').children()
+    ['...']
+    FastPath objects are cached and recycled for any given root.
+    >>> FastPath('foobar') is FastPath('foobar')
+    True
+    """
+    @_clear_after_fork  # type: ignore[misc]
+    @functools.lru_cache()
+    def __new__(cls, root):
+        return super().__new__(cls)
+    def __init__(self, root):
+        self.root = root
+    def joinpath(self, child):
+        return pathlib.Path(self.root, child)
+    def children(self):
+        with suppress(Exception):
+            return os.listdir(self.root or '.')
+        with suppress(Exception):
+            return self.zip_children()
+        return []
+    def zip_children(self):
+        # deferred for performance (python/importlib_metadata#502)
+        from zipp.compat.overlay import zipfile
+        zip_path = zipfile.Path(self.root)
+        names = zip_path.root.namelist()
+        self.joinpath = zip_path.joinpath
+        return dict.fromkeys(child.split(posixpath.sep, 1)[0] for child in names)
+    def search(self, name):
+        return self.lookup(self.mtime).search(name)
+    @property
+    def mtime(self):
+        with suppress(OSError):
+            return os.stat(self.root).st_mtime
+        self.lookup.cache_clear()
+    @method_cache
+    def lookup(self, mtime):
+        return Lookup(self)
+class Lookup:
+    """
+    A micro-optimized class for searching a (fast) path for metadata.
+    """
+    def __init__(self, path: FastPath):
+        """
+        Calculate all of the children representing metadata.
+        From the children in the path, calculate early all of the
+        children that appear to represent metadata (infos) or legacy
+        metadata (eggs).
+        """
+        base = os.path.basename(path.root).lower()
+        base_is_egg = base.endswith(".egg")
+        self.infos = FreezableDefaultDict(list)
+        self.eggs = FreezableDefaultDict(list)
+        for child in path.children():
+            low = child.lower()
+            if low.endswith((".dist-info", ".egg-info")):
+                # rpartition is faster than splitext and suitable for this purpose.
+                name = low.rpartition(".")[0].partition("-")[0]
+                normalized = Prepared.normalize(name)
+                self.infos[normalized].append(path.joinpath(child))
+            elif base_is_egg and low == "egg-info":
+                name = base.rpartition(".")[0].partition("-")[0]
+                legacy_normalized = Prepared.legacy_normalize(name)
+                self.eggs[legacy_normalized].append(path.joinpath(child))
+        self.infos.freeze()
+        self.eggs.freeze()
+    def search(self, prepared: Prepared):
+        """
+        Yield all infos and eggs matching the Prepared query.
+        """
+        infos = (
+            self.infos[prepared.normalized]
+            if prepared
+            else itertools.chain.from_iterable(self.infos.values())
+        )
+        eggs = (
+            self.eggs[prepared.legacy_normalized]
+            if prepared
+            else itertools.chain.from_iterable(self.eggs.values())
+        )
+        return itertools.chain(infos, eggs)
+class Prepared:
+    """
+    A prepared search query for metadata on a possibly-named package.
+    Pre-calculates the normalization to prevent repeated operations.
+    >>> none = Prepared(None)
+    >>> none.normalized
+    >>> none.legacy_normalized
+    >>> bool(none)
+    False
+    >>> sample = Prepared('Sample__Pkg-name.foo')
+    >>> sample.normalized
+    'sample_pkg_name_foo'
+    >>> sample.legacy_normalized
+    'sample__pkg_name.foo'
+    >>> bool(sample)
+    True
+    """
+    normalized = None
+    legacy_normalized = None
+    def __init__(self, name: str | None):
+        self.name = name
+        if name is None:
+            return
+        self.normalized = self.normalize(name)
+        self.legacy_normalized = self.legacy_normalize(name)
+    @staticmethod
+    def normalize(name):
+        """
+        PEP 503 normalization plus dashes as underscores.
+        """
+        return re.sub(r"[-_.]+", "-", name).lower().replace('-', '_')
+    @staticmethod
+    def legacy_normalize(name):
+        """
+        Normalize the package name as found in the convention in
+        older packaging tools versions and specs.
+        """
+        return name.lower().replace('-', '_')
+    def __bool__(self):
+        return bool(self.name)
+@install
+class MetadataPathFinder(NullFinder, DistributionFinder):
+    """A degenerate finder for distribution packages on the file system.
+    This finder supplies only a find_distributions() method for versions
+    of Python that do not have a PathFinder find_distributions().
+    """
+    @classmethod
+    def find_distributions(
+        cls, context=DistributionFinder.Context()
+    ) -> Iterable[PathDistribution]:
+        """
+        Find distributions.
+        Return an iterable of all Distribution instances capable of
+        loading the metadata for packages matching ``context.name``
+        (or all names if ``None`` indicated) along the paths in the list
+        of directories ``context.path``.
+        """
+        found = cls._search_paths(context.name, context.path)
+        return map(PathDistribution, found)
+    @classmethod
+    def _search_paths(cls, name, paths):
+        """Find metadata directories in paths heuristically."""
+        prepared = Prepared(name)
+        return itertools.chain.from_iterable(
+            path.search(prepared) for path in map(FastPath, paths)
+        )
+    @classmethod
+    def invalidate_caches(cls) -> None:
+        FastPath.__new__.cache_clear()
+class PathDistribution(Distribution):
+    def __init__(self, path: SimplePath) -> None:
+        """Construct a distribution.
+        :param path: SimplePath indicating the metadata directory.
+        """
+        self._path = path
+    def read_text(self, filename: str | os.PathLike[str]) -> str | None:
+        with suppress(
+            FileNotFoundError,
+            IsADirectoryError,
+            KeyError,
+            NotADirectoryError,
+            PermissionError,
+        ):
+            return self._path.joinpath(filename).read_text(encoding='utf-8')
+        return None
+    read_text.__doc__ = Distribution.read_text.__doc__
+    def locate_file(self, path: str | os.PathLike[str]) -> SimplePath:
+        return self._path.parent / path
+    @property
+    def _normalized_name(self):
+        """
+        Performance optimization: where possible, resolve the
+        normalized name from the file system path.
+        """
+        stem = os.path.basename(str(self._path))
+        return (
+            pass_none(Prepared.normalize)(self._name_from_stem(stem))
+            or super()._normalized_name
+        )
+    @staticmethod
+    def _name_from_stem(stem):
+        """
+        >>> PathDistribution._name_from_stem('foo-3.0.egg-info')
+        'foo'
+        >>> PathDistribution._name_from_stem('CherryPy-3.0.dist-info')
+        'CherryPy'
+        >>> PathDistribution._name_from_stem('face.egg-info')
+        'face'
+        >>> PathDistribution._name_from_stem('foo.bar')
+        """
+        filename, ext = os.path.splitext(stem)
+        if ext not in ('.dist-info', '.egg-info'):
+            return
+        name, sep, rest = filename.partition('-')
+        return name
+def distribution(distribution_name: str) -> Distribution:
+    """Get the ``Distribution`` instance for the named package.
+    :param distribution_name: The name of the distribution package as a string.
+    :return: A ``Distribution`` instance (or subclass thereof).
+    """
+    return Distribution.from_name(distribution_name)
+def distributions(**kwargs) -> Iterable[Distribution]:
+    """Get all ``Distribution`` instances in the current environment.
+    :return: An iterable of ``Distribution`` instances.
+    """
+    return Distribution.discover(**kwargs)
+def metadata(distribution_name: str) -> _meta.PackageMetadata | None:
+    """Get the metadata for the named package.
+    :param distribution_name: The name of the distribution package to query.
+    :return: A PackageMetadata containing the parsed metadata.
+    """
+    return Distribution.from_name(distribution_name).metadata
+def version(distribution_name: str) -> str:
+    """Get the version string for the named package.
+    :param distribution_name: The name of the distribution package to query.
+    :return: The version string for the package as defined in the package's
+        "Version" metadata key.
+    """
+    return distribution(distribution_name).version
+_unique = functools.partial(
+    unique_everseen,
+    key=operator.attrgetter('_normalized_name'),
+)
+"""
+Wrapper for ``distributions`` to return unique distributions by name.
+"""
+def entry_points(**params) -> EntryPoints:
+    """Return EntryPoint objects for all installed packages.
+    Pass selection parameters (group or name) to filter the
+    result to entry points matching those properties (see
+    EntryPoints.select()).
+    :return: EntryPoints for all installed packages.
+    """
+    eps = itertools.chain.from_iterable(
+        dist.entry_points for dist in _unique(distributions())
+    )
+    return EntryPoints(eps).select(**params)
+def files(distribution_name: str) -> list[PackagePath] | None:
+    """Return a list of files for the named package.
+    :param distribution_name: The name of the distribution package to query.
+    :return: List of files composing the distribution.
+    """
+    return distribution(distribution_name).files
+def requires(distribution_name: str) -> list[str] | None:
+    """
+    Return a list of requirements for the named package.
+    :return: An iterable of requirements, suitable for
+        packaging.requirement.Requirement.
+    """
+    return distribution(distribution_name).requires
+def packages_distributions() -> Mapping[str, list[str]]:
+    """
+    Return a mapping of top-level packages to their
+    distributions.
+    >>> import collections.abc
+    >>> pkgs = packages_distributions()
+    >>> all(isinstance(dist, collections.abc.Sequence) for dist in pkgs.values())
+    True
+    """
+    pkg_to_dist = collections.defaultdict(list)
+    for dist in distributions():
+        for pkg in _top_level_declared(dist) or _top_level_inferred(dist):
+            pkg_to_dist[pkg].append(md_none(dist.metadata)['Name'])
+    return dict(pkg_to_dist)
+def _top_level_declared(dist):
+    return (dist.read_text('top_level.txt') or '').split()
+def _topmost(name: PackagePath) -> str | None:
+    """
+    Return the top-most parent as long as there is a parent.
+    """
+    top, *rest = name.parts
+    return top if rest else None
+def _get_toplevel_name(name: PackagePath) -> str:
+    """
+    Infer a possibly importable module name from a name presumed on
+    sys.path.
+    >>> _get_toplevel_name(PackagePath('foo.py'))
+    'foo'
+    >>> _get_toplevel_name(PackagePath('foo'))
+    'foo'
+    >>> _get_toplevel_name(PackagePath('foo.pyc'))
+    'foo'
+    >>> _get_toplevel_name(PackagePath('foo/__init__.py'))
+    'foo'
+    >>> _get_toplevel_name(PackagePath('foo.pth'))
+    'foo.pth'
+    >>> _get_toplevel_name(PackagePath('foo.dist-info'))
+    'foo.dist-info'
+    """
+    # Defer import of inspect for performance (python/cpython#118761)
+    import inspect
+    return _topmost(name) or inspect.getmodulename(name) or str(name)
+def _top_level_inferred(dist):
+    opt_names = set(map(_get_toplevel_name, always_iterable(dist.files)))
+    def importable_name(name):
+        return '.' not in name
+    return filter(importable_name, opt_names)

importlib_metadata/_adapters.py ADDED Viewed

	@@ -0,0 +1,136 @@

+import email.message
+import email.policy
+import re
+import textwrap
+from ._text import FoldedCase
+class RawPolicy(email.policy.EmailPolicy):
+    def fold(self, name, value):
+        folded = self.linesep.join(
+            textwrap
+            .indent(value, prefix=' ' * 8, predicate=lambda line: True)
+            .lstrip()
+            .splitlines()
+        )
+        return f'{name}: {folded}{self.linesep}'
+class Message(email.message.Message):
+    r"""
+    Specialized Message subclass to handle metadata naturally.
+    Reads values that may have newlines in them and converts the
+    payload to the Description.
+    >>> msg_text = textwrap.dedent('''
+    ...     Name: Foo
+    ...     Version: 3.0
+    ...     License: blah
+    ...             de-blah
+    ...     <BLANKLINE>
+    ...     First line of description.
+    ...     Second line of description.
+    ...     <BLANKLINE>
+    ...     Fourth line!
+    ...     ''').lstrip().replace('<BLANKLINE>', '')
+    >>> msg = Message(email.message_from_string(msg_text))
+    >>> msg['Description']
+    'First line of description.\nSecond line of description.\n\nFourth line!\n'
+    Message should render even if values contain newlines.
+    >>> print(msg)
+    Name: Foo
+    Version: 3.0
+    License: blah
+            de-blah
+    Description: First line of description.
+            Second line of description.
+    <BLANKLINE>
+            Fourth line!
+    <BLANKLINE>
+    <BLANKLINE>
+    """
+    multiple_use_keys = set(
+        map(
+            FoldedCase,
+            [
+                'Classifier',
+                'Obsoletes-Dist',
+                'Platform',
+                'Project-URL',
+                'Provides-Dist',
+                'Provides-Extra',
+                'Requires-Dist',
+                'Requires-External',
+                'Supported-Platform',
+                'Dynamic',
+            ],
+        )
+    )
+    """
+    Keys that may be indicated multiple times per PEP 566.
+    """
+    def __new__(cls, orig: email.message.Message):
+        res = super().__new__(cls)
+        vars(res).update(vars(orig))
+        return res
+    def __init__(self, *args, **kwargs):
+        self._headers = self._repair_headers()
+    # suppress spurious error from mypy
+    def __iter__(self):
+        return super().__iter__()
+    def __getitem__(self, item):
+        """
+        Override parent behavior to typical dict behavior.
+        ``email.message.Message`` will emit None values for missing
+        keys. Typical mappings, including this ``Message``, will raise
+        a key error for missing keys.
+        Ref python/importlib_metadata#371.
+        """
+        res = super().__getitem__(item)
+        if res is None:
+            raise KeyError(item)
+        return res
+    def _repair_headers(self):
+        def redent(value):
+            "Correct for RFC822 indentation"
+            indent = ' ' * 8
+            if not value or '\n' + indent not in value:
+                return value
+            return textwrap.dedent(indent + value)
+        headers = [(key, redent(value)) for key, value in vars(self)['_headers']]
+        if self._payload:
+            headers.append(('Description', self.get_payload()))
+            self.set_payload('')
+        return headers
+    def as_string(self):
+        return super().as_string(policy=RawPolicy())
+    @property
+    def json(self):
+        """
+        Convert PackageMetadata to a JSON-compatible format
+        per PEP 0566.
+        """
+        def transform(key):
+            value = self.get_all(key) if key in self.multiple_use_keys else self[key]
+            if key == 'Keywords':
+                value = re.split(r'\s+', value)
+            tk = key.lower().replace('-', '_')
+            return tk, value
+        return dict(map(transform, map(FoldedCase, self)))

importlib_metadata/_collections.py ADDED Viewed

	@@ -0,0 +1,34 @@

+import collections
+import typing
+# from jaraco.collections 3.3
+class FreezableDefaultDict(collections.defaultdict):
+    """
+    Often it is desirable to prevent the mutation of
+    a default dict after its initial construction, such
+    as to prevent mutation during iteration.
+    >>> dd = FreezableDefaultDict(list)
+    >>> dd[0].append('1')
+    >>> dd.freeze()
+    >>> dd[1]
+    []
+    >>> len(dd)
+    1
+    """
+    def __missing__(self, key):
+        return getattr(self, '_frozen', super().__missing__)(key)
+    def freeze(self):
+        self._frozen = lambda key: self.default_factory()
+class Pair(typing.NamedTuple):
+    name: str
+    value: str
+    @classmethod
+    def parse(cls, text):
+        return cls(*map(str.strip, text.split("=", 1)))

importlib_metadata/_compat.py ADDED Viewed

	@@ -0,0 +1,56 @@

+import platform
+import sys
+__all__ = ['install', 'NullFinder']
+def install(cls):
+    """
+    Class decorator for installation on sys.meta_path.
+    Adds the backport DistributionFinder to sys.meta_path and
+    attempts to disable the finder functionality of the stdlib
+    DistributionFinder.
+    """
+    sys.meta_path.append(cls())
+    disable_stdlib_finder()
+    return cls
+def disable_stdlib_finder():
+    """
+    Give the backport primacy for discovering path-based distributions
+    by monkey-patching the stdlib O_O.
+    See #91 for more background for rationale on this sketchy
+    behavior.
+    """
+    def matches(finder):
+        return getattr(
+            finder, '__module__', None
+        ) == '_frozen_importlib_external' and hasattr(finder, 'find_distributions')
+    for finder in filter(matches, sys.meta_path):  # pragma: nocover
+        del finder.find_distributions
+class NullFinder:
+    """
+    A "Finder" (aka "MetaPathFinder") that never finds any modules,
+    but may find distributions.
+    """
+    @staticmethod
+    def find_spec(*args, **kwargs):
+        return None
+def pypy_partial(val):
+    """
+    Adjust for variable stacklevel on partial under PyPy.
+    Workaround for #327.
+    """
+    is_pypy = platform.python_implementation() == 'PyPy'
+    return val + is_pypy

importlib_metadata/_functools.py ADDED Viewed

	@@ -0,0 +1,136 @@

+import functools
+import types
+from collections.abc import Callable
+from typing import TypeVar
+# from jaraco.functools 3.3
+def method_cache(method, cache_wrapper=None):
+    """
+    Wrap lru_cache to support storing the cache data in the object instances.
+    Abstracts the common paradigm where the method explicitly saves an
+    underscore-prefixed protected property on first call and returns that
+    subsequently.
+    >>> class MyClass:
+    ...     calls = 0
+    ...
+    ...     @method_cache
+    ...     def method(self, value):
+    ...         self.calls += 1
+    ...         return value
+    >>> a = MyClass()
+    >>> a.method(3)
+    3
+    >>> for x in range(75):
+    ...     res = a.method(x)
+    >>> a.calls
+    75
+    Note that the apparent behavior will be exactly like that of lru_cache
+    except that the cache is stored on each instance, so values in one
+    instance will not flush values from another, and when an instance is
+    deleted, so are the cached values for that instance.
+    >>> b = MyClass()
+    >>> for x in range(35):
+    ...     res = b.method(x)
+    >>> b.calls
+    35
+    >>> a.method(0)
+    0
+    >>> a.calls
+    75
+    Note that if method had been decorated with ``functools.lru_cache()``,
+    a.calls would have been 76 (due to the cached value of 0 having been
+    flushed by the 'b' instance).
+    Clear the cache with ``.cache_clear()``
+    >>> a.method.cache_clear()
+    Same for a method that hasn't yet been called.
+    >>> c = MyClass()
+    >>> c.method.cache_clear()
+    Another cache wrapper may be supplied:
+    >>> cache = functools.lru_cache(maxsize=2)
+    >>> MyClass.method2 = method_cache(lambda self: 3, cache_wrapper=cache)
+    >>> a = MyClass()
+    >>> a.method2()
+    3
+    Caution - do not subsequently wrap the method with another decorator, such
+    as ``@property``, which changes the semantics of the function.
+    See also
+    http://code.activestate.com/recipes/577452-a-memoize-decorator-for-instance-methods/
+    for another implementation and additional justification.
+    """
+    cache_wrapper = cache_wrapper or functools.lru_cache()
+    def wrapper(self, *args, **kwargs):
+        # it's the first call, replace the method with a cached, bound method
+        bound_method = types.MethodType(method, self)
+        cached_method = cache_wrapper(bound_method)
+        setattr(self, method.__name__, cached_method)
+        return cached_method(*args, **kwargs)
+    # Support cache clear even before cache has been created.
+    wrapper.cache_clear = lambda: None
+    return wrapper
+# From jaraco.functools 3.3
+def pass_none(func):
+    """
+    Wrap func so it's not called if its first param is None
+    >>> print_text = pass_none(print)
+    >>> print_text('text')
+    text
+    >>> print_text(None)
+    """
+    @functools.wraps(func)
+    def wrapper(param, *args, **kwargs):
+        if param is not None:
+            return func(param, *args, **kwargs)
+    return wrapper
+# From jaraco.functools 4.4
+def noop(*args, **kwargs):
+    """
+    A no-operation function that does nothing.
+    >>> noop(1, 2, three=3)
+    """
+_T = TypeVar('_T')
+# From jaraco.functools 4.4
+def passthrough(func: Callable[..., object]) -> Callable[[_T], _T]:
+    """
+    Wrap the function to always return the first parameter.
+    >>> passthrough(print)('3')
+    3
+    '3'
+    """
+    @functools.wraps(func)
+    def wrapper(first: _T, *args, **kwargs) -> _T:
+        func(first, *args, **kwargs)
+        return first
+    return wrapper  # type: ignore[return-value]

importlib_metadata/_itertools.py ADDED Viewed

	@@ -0,0 +1,171 @@

+from collections import defaultdict, deque
+from itertools import filterfalse
+def unique_everseen(iterable, key=None):
+    "List unique elements, preserving order. Remember all elements ever seen."
+    # unique_everseen('AAAABBBCCDAABBB') --> A B C D
+    # unique_everseen('ABBCcAD', str.lower) --> A B C D
+    seen = set()
+    seen_add = seen.add
+    if key is None:
+        for element in filterfalse(seen.__contains__, iterable):
+            seen_add(element)
+            yield element
+    else:
+        for element in iterable:
+            k = key(element)
+            if k not in seen:
+                seen_add(k)
+                yield element
+# copied from more_itertools 8.8
+def always_iterable(obj, base_type=(str, bytes)):
+    """If *obj* is iterable, return an iterator over its items::
+        >>> obj = (1, 2, 3)
+        >>> list(always_iterable(obj))
+        [1, 2, 3]
+    If *obj* is not iterable, return a one-item iterable containing *obj*::
+        >>> obj = 1
+        >>> list(always_iterable(obj))
+        [1]
+    If *obj* is ``None``, return an empty iterable:
+        >>> obj = None
+        >>> list(always_iterable(None))
+        []
+    By default, binary and text strings are not considered iterable::
+        >>> obj = 'foo'
+        >>> list(always_iterable(obj))
+        ['foo']
+    If *base_type* is set, objects for which ``isinstance(obj, base_type)``
+    returns ``True`` won't be considered iterable.
+        >>> obj = {'a': 1}
+        >>> list(always_iterable(obj))  # Iterate over the dict's keys
+        ['a']
+        >>> list(always_iterable(obj, base_type=dict))  # Treat dicts as a unit
+        [{'a': 1}]
+    Set *base_type* to ``None`` to avoid any special handling and treat objects
+    Python considers iterable as iterable:
+        >>> obj = 'foo'
+        >>> list(always_iterable(obj, base_type=None))
+        ['f', 'o', 'o']
+    """
+    if obj is None:
+        return iter(())
+    if (base_type is not None) and isinstance(obj, base_type):
+        return iter((obj,))
+    try:
+        return iter(obj)
+    except TypeError:
+        return iter((obj,))
+# Copied from more_itertools 10.3
+class bucket:
+    """Wrap *iterable* and return an object that buckets the iterable into
+    child iterables based on a *key* function.
+        >>> iterable = ['a1', 'b1', 'c1', 'a2', 'b2', 'c2', 'b3']
+        >>> s = bucket(iterable, key=lambda x: x[0])  # Bucket by 1st character
+        >>> sorted(list(s))  # Get the keys
+        ['a', 'b', 'c']
+        >>> a_iterable = s['a']
+        >>> next(a_iterable)
+        'a1'
+        >>> next(a_iterable)
+        'a2'
+        >>> list(s['b'])
+        ['b1', 'b2', 'b3']
+    The original iterable will be advanced and its items will be cached until
+    they are used by the child iterables. This may require significant storage.
+    By default, attempting to select a bucket to which no items belong  will
+    exhaust the iterable and cache all values.
+    If you specify a *validator* function, selected buckets will instead be
+    checked against it.
+        >>> from itertools import count
+        >>> it = count(1, 2)  # Infinite sequence of odd numbers
+        >>> key = lambda x: x % 10  # Bucket by last digit
+        >>> validator = lambda x: x in {1, 3, 5, 7, 9}  # Odd digits only
+        >>> s = bucket(it, key=key, validator=validator)
+        >>> 2 in s
+        False
+        >>> list(s[2])
+        []
+    """
+    def __init__(self, iterable, key, validator=None):
+        self._it = iter(iterable)
+        self._key = key
+        self._cache = defaultdict(deque)
+        self._validator = validator or (lambda x: True)
+    def __contains__(self, value):
+        if not self._validator(value):
+            return False
+        try:
+            item = next(self[value])
+        except StopIteration:
+            return False
+        else:
+            self._cache[value].appendleft(item)
+        return True
+    def _get_values(self, value):
+        """
+        Helper to yield items from the parent iterator that match *value*.
+        Items that don't match are stored in the local cache as they
+        are encountered.
+        """
+        while True:
+            # If we've cached some items that match the target value, emit
+            # the first one and evict it from the cache.
+            if self._cache[value]:
+                yield self._cache[value].popleft()
+            # Otherwise we need to advance the parent iterator to search for
+            # a matching item, caching the rest.
+            else:
+                while True:
+                    try:
+                        item = next(self._it)
+                    except StopIteration:
+                        return
+                    item_value = self._key(item)
+                    if item_value == value:
+                        yield item
+                        break
+                    elif self._validator(item_value):
+                        self._cache[item_value].append(item)
+    def __iter__(self):
+        for item in self._it:
+            item_value = self._key(item)
+            if self._validator(item_value):
+                self._cache[item_value].append(item)
+        yield from self._cache.keys()
+    def __getitem__(self, value):
+        if not self._validator(value):
+            return iter(())
+        return self._get_values(value)

importlib_metadata/_meta.py ADDED Viewed

	@@ -0,0 +1,71 @@

+from __future__ import annotations
+import os
+from collections.abc import Iterator
+from typing import (
+    Any,
+    Protocol,
+    TypeVar,
+    overload,
+)
+_T = TypeVar("_T")
+class PackageMetadata(Protocol):
+    def __len__(self) -> int: ...  # pragma: no cover
+    def __contains__(self, item: str) -> bool: ...  # pragma: no cover
+    def __getitem__(self, key: str) -> str: ...  # pragma: no cover
+    def __iter__(self) -> Iterator[str]: ...  # pragma: no cover
+    @overload
+    def get(
+        self, name: str, failobj: None = None
+    ) -> str | None: ...  # pragma: no cover
+    @overload
+    def get(self, name: str, failobj: _T) -> str | _T: ...  # pragma: no cover
+    # overload per python/importlib_metadata#435
+    @overload
+    def get_all(
+        self, name: str, failobj: None = None
+    ) -> list[Any] | None: ...  # pragma: no cover
+    @overload
+    def get_all(self, name: str, failobj: _T) -> list[Any] | _T:
+        """
+        Return all values associated with a possibly multi-valued key.
+        """
+    @property
+    def json(self) -> dict[str, str | list[str]]:
+        """
+        A JSON-compatible form of the metadata.
+        """
+class SimplePath(Protocol):
+    """
+    A minimal subset of pathlib.Path required by Distribution.
+    """
+    def joinpath(
+        self, other: str | os.PathLike[str]
+    ) -> SimplePath: ...  # pragma: no cover
+    def __truediv__(
+        self, other: str | os.PathLike[str]
+    ) -> SimplePath: ...  # pragma: no cover
+    @property
+    def parent(self) -> SimplePath: ...  # pragma: no cover
+    def read_text(self, encoding=None) -> str: ...  # pragma: no cover
+    def read_bytes(self) -> bytes: ...  # pragma: no cover
+    def exists(self) -> bool: ...  # pragma: no cover

importlib_metadata/_text.py ADDED Viewed

	@@ -0,0 +1,99 @@

+import re
+from ._functools import method_cache
+# from jaraco.text 3.5
+class FoldedCase(str):
+    """
+    A case insensitive string class; behaves just like str
+    except compares equal when the only variation is case.
+    >>> s = FoldedCase('hello world')
+    >>> s == 'Hello World'
+    True
+    >>> 'Hello World' == s
+    True
+    >>> s != 'Hello World'
+    False
+    >>> s.index('O')
+    4
+    >>> s.split('O')
+    ['hell', ' w', 'rld']
+    >>> sorted(map(FoldedCase, ['GAMMA', 'alpha', 'Beta']))
+    ['alpha', 'Beta', 'GAMMA']
+    Sequence membership is straightforward.
+    >>> "Hello World" in [s]
+    True
+    >>> s in ["Hello World"]
+    True
+    You may test for set inclusion, but candidate and elements
+    must both be folded.
+    >>> FoldedCase("Hello World") in {s}
+    True
+    >>> s in {FoldedCase("Hello World")}
+    True
+    String inclusion works as long as the FoldedCase object
+    is on the right.
+    >>> "hello" in FoldedCase("Hello World")
+    True
+    But not if the FoldedCase object is on the left:
+    >>> FoldedCase('hello') in 'Hello World'
+    False
+    In that case, use in_:
+    >>> FoldedCase('hello').in_('Hello World')
+    True
+    >>> FoldedCase('hello') > FoldedCase('Hello')
+    False
+    """
+    def __lt__(self, other):
+        return self.lower() < other.lower()
+    def __gt__(self, other):
+        return self.lower() > other.lower()
+    def __eq__(self, other):
+        return self.lower() == other.lower()
+    def __ne__(self, other):
+        return self.lower() != other.lower()
+    def __hash__(self):
+        return hash(self.lower())
+    def __contains__(self, other):
+        return super().lower().__contains__(other.lower())
+    def in_(self, other):
+        "Does self appear in other?"
+        return self in FoldedCase(other)
+    # cache lower since it's likely to be called frequently.
+    @method_cache
+    def lower(self):
+        return super().lower()
+    def index(self, sub):
+        return self.lower().index(sub.lower())
+    def split(self, splitter=' ', maxsplit=0):
+        pattern = re.compile(re.escape(splitter), re.I)
+        return pattern.split(self, maxsplit)

importlib_metadata/_typing.py ADDED Viewed

	@@ -0,0 +1,15 @@

+import functools
+import typing
+from ._meta import PackageMetadata
+# ``md_none(x)`` is shorthand for ``typing.cast(PackageMetadata, x)``: the
+# value passes through unchanged at runtime and is only re-labelled as
+# PackageMetadata for static type checkers.
+md_none = functools.partial(typing.cast, PackageMetadata)
+"""
+Suppress type errors for optional metadata.
+Although Distribution.metadata can return None when metadata is corrupt
+and thus None, allow callers to assume it's not None and crash if
+that's the case.
+# python/importlib_metadata#493
+"""