diff --git a/_cuda_bindings_redirector.py b/_cuda_bindings_redirector.py new file mode 100644 index 0000000000000000000000000000000000000000..13b3c04cf138be492013dde800e9dcc9e90b32be --- /dev/null +++ b/_cuda_bindings_redirector.py @@ -0,0 +1,30 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + +import sys +from types import ModuleType + + +# Make sure 'cuda' is importable as a namespace package +import cuda + + +class LazyCudaModule(ModuleType): + + def __getattr__(self, name): + if name == '__version__': + import warnings + warnings.warn( + "accessing cuda.__version__ is deprecated, " "please switch to use cuda.bindings.__version__ instead", + FutureWarning, + stacklevel=2, + ) + from cuda.bindings import __version__ + + return __version__ + + raise AttributeError(f"module {__name__!r} has no attribute {name!r}") + + +# Patch in LazyCudaModule for `cuda` +sys.modules['cuda'].__class__ = LazyCudaModule diff --git a/anyio-4.12.1.dist-info/INSTALLER b/anyio-4.12.1.dist-info/INSTALLER new file mode 100644 index 0000000000000000000000000000000000000000..5c69047b2eb8235994febeeae1da4a82365a240a --- /dev/null +++ b/anyio-4.12.1.dist-info/INSTALLER @@ -0,0 +1 @@ +uv \ No newline at end of file diff --git a/anyio-4.12.1.dist-info/METADATA b/anyio-4.12.1.dist-info/METADATA new file mode 100644 index 0000000000000000000000000000000000000000..dbeb1989ba35da34a15e015e0225304dcd140331 --- /dev/null +++ b/anyio-4.12.1.dist-info/METADATA @@ -0,0 +1,96 @@ +Metadata-Version: 2.4 +Name: anyio +Version: 4.12.1 +Summary: High-level concurrency and networking framework on top of asyncio or Trio +Author-email: Alex Grönholm +License-Expression: MIT +Project-URL: Documentation, https://anyio.readthedocs.io/en/latest/ +Project-URL: Changelog, https://anyio.readthedocs.io/en/stable/versionhistory.html +Project-URL: Source code, https://github.com/agronholm/anyio 
+Project-URL: Issue tracker, https://github.com/agronholm/anyio/issues +Classifier: Development Status :: 5 - Production/Stable +Classifier: Intended Audience :: Developers +Classifier: Framework :: AnyIO +Classifier: Typing :: Typed +Classifier: Programming Language :: Python +Classifier: Programming Language :: Python :: 3 +Classifier: Programming Language :: Python :: 3.9 +Classifier: Programming Language :: Python :: 3.10 +Classifier: Programming Language :: Python :: 3.11 +Classifier: Programming Language :: Python :: 3.12 +Classifier: Programming Language :: Python :: 3.13 +Classifier: Programming Language :: Python :: 3.14 +Requires-Python: >=3.9 +Description-Content-Type: text/x-rst +License-File: LICENSE +Requires-Dist: exceptiongroup>=1.0.2; python_version < "3.11" +Requires-Dist: idna>=2.8 +Requires-Dist: typing_extensions>=4.5; python_version < "3.13" +Provides-Extra: trio +Requires-Dist: trio>=0.32.0; python_version >= "3.10" and extra == "trio" +Requires-Dist: trio>=0.31.0; python_version < "3.10" and extra == "trio" +Dynamic: license-file + +.. image:: https://github.com/agronholm/anyio/actions/workflows/test.yml/badge.svg + :target: https://github.com/agronholm/anyio/actions/workflows/test.yml + :alt: Build Status +.. image:: https://coveralls.io/repos/github/agronholm/anyio/badge.svg?branch=master + :target: https://coveralls.io/github/agronholm/anyio?branch=master + :alt: Code Coverage +.. image:: https://readthedocs.org/projects/anyio/badge/?version=latest + :target: https://anyio.readthedocs.io/en/latest/?badge=latest + :alt: Documentation +.. image:: https://badges.gitter.im/gitterHQ/gitter.svg + :target: https://gitter.im/python-trio/AnyIO + :alt: Gitter chat + +AnyIO is an asynchronous networking and concurrency library that works on top of either asyncio_ or +Trio_. It implements Trio-like `structured concurrency`_ (SC) on top of asyncio and works in harmony +with the native SC of Trio itself. 
+ +Applications and libraries written against AnyIO's API will run unmodified on either asyncio_ or +Trio_. AnyIO can also be adopted into a library or application incrementally – bit by bit, no full +refactoring necessary. It will blend in with the native libraries of your chosen backend. + +To find out why you might want to use AnyIO's APIs instead of asyncio's, you can read about it +`here <https://anyio.readthedocs.io/en/stable/why.html>`_. + +Documentation +------------- + +View full documentation at: https://anyio.readthedocs.io/ + +Features +-------- + +AnyIO offers the following functionality: + +* Task groups (nurseries_ in trio terminology) +* High-level networking (TCP, UDP and UNIX sockets) + + * `Happy eyeballs`_ algorithm for TCP connections (more robust than that of asyncio on Python + 3.8) + * async/await style UDP sockets (unlike asyncio where you still have to use Transports and + Protocols) + +* A versatile API for byte streams and object streams +* Inter-task synchronization and communication (locks, conditions, events, semaphores, object + streams) +* Worker threads +* Subprocesses +* Subinterpreter support for code parallelization (on Python 3.13 and later) +* Asynchronous file I/O (using worker threads) +* Signal handling +* Asynchronous version of the functools_ module + +AnyIO also comes with its own pytest_ plugin which also supports asynchronous fixtures. +It even works with the popular Hypothesis_ library. + +.. _asyncio: https://docs.python.org/3/library/asyncio.html +.. _Trio: https://github.com/python-trio/trio +.. _structured concurrency: https://en.wikipedia.org/wiki/Structured_concurrency +.. _nurseries: https://trio.readthedocs.io/en/stable/reference-core.html#nurseries-and-spawning +.. _Happy eyeballs: https://en.wikipedia.org/wiki/Happy_Eyeballs +.. _pytest: https://docs.pytest.org/en/latest/ +.. _functools: https://docs.python.org/3/library/functools.html +.. 
_Hypothesis: https://hypothesis.works/ diff --git a/anyio-4.12.1.dist-info/RECORD b/anyio-4.12.1.dist-info/RECORD new file mode 100644 index 0000000000000000000000000000000000000000..0fc88aae05cd1a615f9ea88f51c59b41f9b2bb44 --- /dev/null +++ b/anyio-4.12.1.dist-info/RECORD @@ -0,0 +1,51 @@ +anyio-4.12.1.dist-info/INSTALLER,sha256=5hhM4Q4mYTT9z6QB6PGpUAW81PGNFrYrdXMj4oM_6ak,2 +anyio-4.12.1.dist-info/METADATA,sha256=DfiDab9Tmmcfy802lOLTMEHJQShkOSbopCwqCYbLuJk,4277 +anyio-4.12.1.dist-info/RECORD,, +anyio-4.12.1.dist-info/REQUESTED,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 +anyio-4.12.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91 +anyio-4.12.1.dist-info/entry_points.txt,sha256=_d6Yu6uiaZmNe0CydowirE9Cmg7zUL2g08tQpoS3Qvc,39 +anyio-4.12.1.dist-info/licenses/LICENSE,sha256=U2GsncWPLvX9LpsJxoKXwX8ElQkJu8gCO9uC6s8iwrA,1081 +anyio-4.12.1.dist-info/top_level.txt,sha256=QglSMiWX8_5dpoVAEIHdEYzvqFMdSYWmCj6tYw2ITkQ,6 +anyio/__init__.py,sha256=7iDVqMUprUuKNY91FuoKqayAhR-OY136YDPI6P78HHk,6170 +anyio/_backends/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 +anyio/_backends/_asyncio.py,sha256=xG6qv60mgGnL0mK82dxjH2b8hlkMlJ-x2BqIq3qv70Y,98863 +anyio/_backends/_trio.py,sha256=30Rctb7lm8g63ZHljVPVnj5aH-uK6oQvphjwUBoAzuI,41456 +anyio/_core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 +anyio/_core/_asyncio_selector_thread.py,sha256=2PdxFM3cs02Kp6BSppbvmRT7q7asreTW5FgBxEsflBo,5626 +anyio/_core/_contextmanagers.py,sha256=YInBCabiEeS-UaP_Jdxa1CaFC71ETPW8HZTHIM8Rsc8,7215 +anyio/_core/_eventloop.py,sha256=c2EdcBX-xnKwxPcC4Pjn3_qG9I-x4IWFO2R9RqCGjM4,6448 +anyio/_core/_exceptions.py,sha256=Y3aq-Wxd7Q2HqwSg7nZPvRsHEuGazv_qeet6gqEBdPk,4407 +anyio/_core/_fileio.py,sha256=uc7t10Vb-If7GbdWM_zFf-ajUe6uek63fSt7IBLlZW0,25731 +anyio/_core/_resources.py,sha256=NbmU5O5UX3xEyACnkmYX28Fmwdl-f-ny0tHym26e0w0,435 +anyio/_core/_signals.py,sha256=mjTBB2hTKNPRlU0IhnijeQedpWOGERDiMjSlJQsFrug,1016 
+anyio/_core/_sockets.py,sha256=RBXHcUqZt5gg_-OOfgHVv8uq2FSKk1uVUzTdpjBoI1o,34977 +anyio/_core/_streams.py,sha256=FczFwIgDpnkK0bODWJXMpsUJYdvAD04kaUaGzJU8DK0,1806 +anyio/_core/_subprocesses.py,sha256=EXm5igL7dj55iYkPlbYVAqtbqxJxjU-6OndSTIx9SRg,8047 +anyio/_core/_synchronization.py,sha256=MgVVqFzvt580tHC31LiOcq1G6aryut--xRG4Ff8KwxQ,20869 +anyio/_core/_tasks.py,sha256=pVB7K6AAulzUM8YgXAeqNZG44nSyZ1bYJjH8GznC00I,5435 +anyio/_core/_tempfile.py,sha256=lHb7CW4FyIlpkf5ADAf4VmLHCKwEHF9nxqNyBCFFUiA,19697 +anyio/_core/_testing.py,sha256=u7MPqGXwpTxqI7hclSdNA30z2GH1Nw258uwKvy_RfBg,2340 +anyio/_core/_typedattr.py,sha256=P4ozZikn3-DbpoYcvyghS_FOYAgbmUxeoU8-L_07pZM,2508 +anyio/abc/__init__.py,sha256=6mWhcl_pGXhrgZVHP_TCfMvIXIOp9mroEFM90fYCU_U,2869 +anyio/abc/_eventloop.py,sha256=GlzgB3UJGgG6Kr7olpjOZ-o00PghecXuofVDQ_5611Q,10749 +anyio/abc/_resources.py,sha256=DrYvkNN1hH6Uvv5_5uKySvDsnknGVDe8FCKfko0VtN8,783 +anyio/abc/_sockets.py,sha256=ECTY0jLEF18gryANHR3vFzXzGdZ-xPwELq1QdgOb0Jo,13258 +anyio/abc/_streams.py,sha256=005GKSCXGprxnhucILboSqc2JFovECZk9m3p-qqxXVc,7640 +anyio/abc/_subprocesses.py,sha256=cumAPJTktOQtw63IqG0lDpyZqu_l1EElvQHMiwJgL08,2067 +anyio/abc/_tasks.py,sha256=KC7wrciE48AINOI-AhPutnFhe1ewfP7QnamFlDzqesQ,3721 +anyio/abc/_testing.py,sha256=tBJUzkSfOXJw23fe8qSJ03kJlShOYjjaEyFB6k6MYT8,1821 +anyio/from_thread.py,sha256=L-0w1HxJ6BSb-KuVi57k5Tkc3yzQrx3QK5tAxMPcY-0,19141 +anyio/functools.py,sha256=HWj7GBEmc0Z-mZg3uok7Z7ZJn0rEC_0Pzbt0nYUDaTQ,10973 +anyio/lowlevel.py,sha256=AyKLVK3LaWSoK39LkCKxE4_GDMLKZBNqTrLUgk63y80,5158 +anyio/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 +anyio/pytest_plugin.py,sha256=3jAFQn0jv_pyoWE2GBBlHaj9sqXj4e8vob0_hgrsXE8,10244 +anyio/streams/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 +anyio/streams/buffered.py,sha256=2R3PeJhe4EXrdYqz44Y6-Eg9R6DrmlsYrP36Ir43-po,6263 +anyio/streams/file.py,sha256=4WZ7XGz5WNu39FQHvqbe__TQ0HDP9OOhgO1mk9iVpVU,4470 
+anyio/streams/memory.py,sha256=F0zwzvFJKAhX_LRZGoKzzqDC2oMM-f-yyTBrEYEGOaU,10740 +anyio/streams/stapled.py,sha256=T8Xqwf8K6EgURPxbt1N4i7A8BAk-gScv-GRhjLXIf_o,4390 +anyio/streams/text.py,sha256=BcVAGJw1VRvtIqnv-o0Rb0pwH7p8vwlvl21xHq522ag,5765 +anyio/streams/tls.py,sha256=Jpxy0Mfbcp1BxHCwE-YjSSFaLnIBbnnwur-excYThs4,15368 +anyio/to_interpreter.py,sha256=_mLngrMy97TMR6VbW4Y6YzDUk9ZuPcQMPlkuyRh3C9k,7100 +anyio/to_process.py,sha256=J7gAA_YOuoHqnpDAf5fm1Qu6kOmTzdFbiDNvnV755vk,9798 +anyio/to_thread.py,sha256=menEgXYmUV7Fjg_9WqCV95P9MAtQS8BzPGGcWB_QnfQ,2687 diff --git a/anyio-4.12.1.dist-info/REQUESTED b/anyio-4.12.1.dist-info/REQUESTED new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/anyio-4.12.1.dist-info/WHEEL b/anyio-4.12.1.dist-info/WHEEL new file mode 100644 index 0000000000000000000000000000000000000000..e7fa31b6f3f78deb1022c1f7927f07d4d16da822 --- /dev/null +++ b/anyio-4.12.1.dist-info/WHEEL @@ -0,0 +1,5 @@ +Wheel-Version: 1.0 +Generator: setuptools (80.9.0) +Root-Is-Purelib: true +Tag: py3-none-any + diff --git a/anyio-4.12.1.dist-info/entry_points.txt b/anyio-4.12.1.dist-info/entry_points.txt new file mode 100644 index 0000000000000000000000000000000000000000..44dd9bdc3039122cc98014c1439ca254313fd014 --- /dev/null +++ b/anyio-4.12.1.dist-info/entry_points.txt @@ -0,0 +1,2 @@ +[pytest11] +anyio = anyio.pytest_plugin diff --git a/anyio-4.12.1.dist-info/top_level.txt b/anyio-4.12.1.dist-info/top_level.txt new file mode 100644 index 0000000000000000000000000000000000000000..c77c069ecc9b7f8b1f97dbcfec905725db0253a8 --- /dev/null +++ b/anyio-4.12.1.dist-info/top_level.txt @@ -0,0 +1 @@ +anyio diff --git a/dataset-metadata.json b/dataset-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..ec6a041e46e73f43853c910450a0129fc749e982 --- /dev/null +++ b/dataset-metadata.json @@ -0,0 +1,9 @@ +{ + "title": "mamba-packages", + "id": "pmsalmankhan/mamba-packages", + 
"licenses": [ + { + "name": "CC0-1.0" + } + ] +} \ No newline at end of file diff --git a/datasets/__init__.py b/datasets/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..2beccf63bda85745e146db06f1fa9d57e2b4a86e --- /dev/null +++ b/datasets/__init__.py @@ -0,0 +1,47 @@ +# Copyright 2020 The HuggingFace Datasets Authors and the TensorFlow Datasets Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +__version__ = "4.3.0" + +from .arrow_dataset import Column, Dataset +from .arrow_reader import ReadInstruction +from .builder import ArrowBasedBuilder, BuilderConfig, DatasetBuilder, GeneratorBasedBuilder +from .combine import concatenate_datasets, interleave_datasets +from .dataset_dict import DatasetDict, IterableDatasetDict +from .download import * +from .features import * +from .fingerprint import disable_caching, enable_caching, is_caching_enabled +from .info import DatasetInfo +from .inspect import ( + get_dataset_config_info, + get_dataset_config_names, + get_dataset_default_config_name, + get_dataset_infos, + get_dataset_split_names, +) +from .iterable_dataset import IterableColumn, IterableDataset +from .load import load_dataset, load_dataset_builder, load_from_disk +from .splits import ( + NamedSplit, + NamedSplitAll, + Split, + SplitBase, + SplitDict, + SplitGenerator, + SplitInfo, + SubSplitInfo, + percent, +) +from .utils import * +from .utils import logging diff --git a/datasets/arrow_dataset.py b/datasets/arrow_dataset.py 
new file mode 100644 index 0000000000000000000000000000000000000000..43301d2304133f2e51137123dda2c45583f74716 --- /dev/null +++ b/datasets/arrow_dataset.py @@ -0,0 +1,6823 @@ +# Copyright 2020 The HuggingFace Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Lint as: python3 +"""Simple Dataset wrapping an Arrow Table.""" + +import asyncio +import contextlib +import copy +import fnmatch +import glob +import inspect +import itertools +import json +import math +import os +import posixpath +import random +import re +import shutil +import string +import sys +import tempfile +import time +import warnings +import weakref +from collections import Counter, defaultdict +from collections.abc import Iterable, Iterator, Mapping +from collections.abc import Sequence as Sequence_ +from copy import deepcopy +from functools import partial, wraps +from io import BytesIO +from math import ceil, floor +from pathlib import Path +from random import sample +from typing import ( + TYPE_CHECKING, + Any, + BinaryIO, + Callable, + Optional, + Union, + overload, +) + +import fsspec +import numpy as np +import pandas as pd +import pyarrow as pa +import pyarrow.compute as pc +from fsspec.core import url_to_fs +from huggingface_hub import ( + CommitInfo, + CommitOperationAdd, + CommitOperationDelete, + DatasetCard, + DatasetCardData, + HfApi, +) +from huggingface_hub.hf_api import RepoFile +from huggingface_hub.utils import HfHubHTTPError, RepositoryNotFoundError +from multiprocess 
import Pool +from tqdm.contrib.concurrent import thread_map + +from . import config +from .arrow_reader import ArrowReader +from .arrow_writer import ArrowWriter, OptimizedTypedSequence +from .data_files import sanitize_patterns +from .download.streaming_download_manager import xgetsize +from .features import Audio, ClassLabel, Features, Image, List, Value, Video +from .features.features import ( + FeatureType, + _align_features, + _check_if_features_can_be_aligned, + _fix_for_backward_compatible_features, + generate_from_arrow_type, + pandas_types_mapper, + require_decoding, +) +from .filesystems import is_remote_filesystem +from .fingerprint import ( + fingerprint_transform, + format_kwargs_for_fingerprint, + format_transform_for_fingerprint, + generate_fingerprint, + generate_random_fingerprint, + get_temporary_cache_files_directory, + is_caching_enabled, + maybe_register_dataset_for_temp_dir_deletion, + update_fingerprint, + validate_fingerprint, +) +from .formatting import format_table, get_format_type_from_alias, get_formatter, query_table +from .formatting.formatting import LazyDict, _is_range_contiguous +from .info import DatasetInfo, DatasetInfosDict +from .naming import _split_re +from .search import IndexableMixin +from .splits import NamedSplit, Split, SplitDict, SplitInfo +from .table import ( + InMemoryTable, + MemoryMappedTable, + Table, + _memory_mapped_record_batch_reader_from_file, + cast_array_to_feature, + concat_tables, + embed_table_storage, + list_table_cache_files, + table_cast, + table_iter, + table_visitor, +) +from .utils import logging +from .utils import tqdm as hf_tqdm +from .utils.file_utils import estimate_dataset_size +from .utils.info_utils import is_small_dataset +from .utils.metadata import MetadataConfigs +from .utils.py_utils import ( + Literal, + asdict, + convert_file_size_to_int, + glob_pattern_to_regex, + iflatmap_unordered, + string_to_dict, +) +from .utils.stratify import stratified_shuffle_split_generate_indices +from 
.utils.tf_utils import dataset_to_tf, minimal_tf_collate_fn, multiprocess_dataset_to_tf +from .utils.typing import ListLike, PathLike + + +if TYPE_CHECKING: + import sqlite3 + + import polars as pl + import pyspark + import sqlalchemy + + from .dataset_dict import DatasetDict + from .iterable_dataset import IterableDataset + +logger = logging.get_logger(__name__) + +PUSH_TO_HUB_WITHOUT_METADATA_CONFIGS_SPLIT_PATTERN_SHARDED = ( + "data/{split}-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.parquet" +) + + +class DatasetInfoMixin: + """This base class exposes some attributes of DatasetInfo + at the base level of the Dataset for easy access. + """ + + def __init__(self, info: DatasetInfo, split: Optional[NamedSplit]): + self._info = info + self._split = split + + @property + def info(self): + """[`~datasets.DatasetInfo`] object containing all the metadata in the dataset.""" + return self._info + + @property + def split(self): + """[`~datasets.NamedSplit`] object corresponding to a named dataset split.""" + return self._split + + @property + def builder_name(self) -> str: + return self._info.builder_name + + @property + def citation(self) -> str: + return self._info.citation + + @property + def config_name(self) -> str: + return self._info.config_name + + @property + def dataset_size(self) -> Optional[int]: + return self._info.dataset_size + + @property + def description(self) -> str: + return self._info.description + + @property + def download_checksums(self) -> Optional[dict]: + return self._info.download_checksums + + @property + def download_size(self) -> Optional[int]: + return self._info.download_size + + @property + def features(self) -> Optional[Features]: + return self._info.features.copy() if self._info.features is not None else None + + @property + def homepage(self) -> Optional[str]: + return self._info.homepage + + @property + def license(self) -> Optional[str]: + return self._info.license + + @property + def size_in_bytes(self) -> Optional[int]: 
+ return self._info.size_in_bytes + + @property + def supervised_keys(self): + return self._info.supervised_keys + + @property + def version(self): + return self._info.version + + +class TensorflowDatasetMixin: + _TF_DATASET_REFS = set() + + @staticmethod + def _get_output_signature( + dataset: "Dataset", + collate_fn: Callable, + collate_fn_args: dict, + cols_to_retain: Optional[list[str]] = None, + batch_size: Optional[int] = None, + num_test_batches: int = 20, + ): + """Private method used by `to_tf_dataset()` to find the shapes and dtypes of samples from this dataset + after being passed through the collate_fn. Tensorflow needs an exact signature for tf.numpy_function, so + the only way to do this is to run test batches - the collator may add or rename columns, so we can't figure + it out just by inspecting the dataset. + + Args: + dataset (`Dataset`): Dataset to load samples from. + collate_fn(`bool`): Shuffle the dataset order when loading. Recommended True for training, False for + validation/evaluation. + collate_fn(`Callable`): A function or callable object (such as a `DataCollator`) that will collate + lists of samples into a batch. + collate_fn_args (`Dict`): A `dict` of keyword arguments to be passed to the + `collate_fn`. + batch_size (`int`, optional): The size of batches loaded from the dataset. Used for shape inference. + Can be None, which indicates that batch sizes can be variable. + num_test_batches (`int`): The number of batches to load from the dataset for shape inference. 
+ + Returns: + `dict`: Dict mapping column names to tf.Tensorspec objects + `dict`: Dict mapping column names to np.dtype objects + """ + if config.TF_AVAILABLE: + import tensorflow as tf + else: + raise ImportError("Called a Tensorflow-specific function but Tensorflow is not installed.") + + if len(dataset) == 0: + raise ValueError("Unable to get the output signature because the dataset is empty.") + if batch_size is not None: + batch_size = min(len(dataset), batch_size) + test_batch_size = 1 + + if cols_to_retain is not None: + cols_to_retain = list(set(cols_to_retain + ["label_ids", "label", "labels"])) + + test_batches = [] + for _ in range(num_test_batches): + indices = sample(range(len(dataset)), test_batch_size) + test_batch = dataset[indices] + if cols_to_retain is not None: + test_batch = {key: value for key, value in test_batch.items() if key in cols_to_retain} + test_batch = [{key: value[i] for key, value in test_batch.items()} for i in range(test_batch_size)] + test_batch = collate_fn(test_batch, **collate_fn_args) + test_batches.append(test_batch) + + tf_columns_to_signatures = {} + np_columns_to_dtypes = {} + for column in test_batches[0].keys(): + raw_arrays = [batch[column] for batch in test_batches] + # In case the collate_fn returns something strange + np_arrays = [] + for array in raw_arrays: + if isinstance(array, np.ndarray): + np_arrays.append(array) + elif isinstance(array, tf.Tensor): + np_arrays.append(array.numpy()) + else: + np_arrays.append(np.array(array)) + + if np.issubdtype(np_arrays[0].dtype, np.integer) or np_arrays[0].dtype == bool: + tf_dtype = tf.int64 + np_dtype = np.int64 + elif np.issubdtype(np_arrays[0].dtype, np.number): + tf_dtype = tf.float32 + np_dtype = np.float32 + elif np_arrays[0].dtype.kind == "U": # Unicode strings + np_dtype = np.str_ + tf_dtype = tf.string + else: + raise RuntimeError( + f"Unrecognized array dtype {np_arrays[0].dtype}. \n" + "Nested types and image/audio types are not supported yet." 
+ ) + shapes = [array.shape for array in np_arrays] + static_shape = [] + for dim in range(len(shapes[0])): + sizes = {shape[dim] for shape in shapes} + if dim == 0: + static_shape.append(batch_size) + continue + if len(sizes) == 1: # This dimension looks constant + static_shape.append(sizes.pop()) + else: # Use None for variable dimensions + static_shape.append(None) + tf_columns_to_signatures[column] = tf.TensorSpec(shape=static_shape, dtype=tf_dtype) + np_columns_to_dtypes[column] = np_dtype + + return tf_columns_to_signatures, np_columns_to_dtypes + + def to_tf_dataset( + self, + batch_size: Optional[int] = None, + columns: Optional[Union[str, list[str]]] = None, + shuffle: bool = False, + collate_fn: Optional[Callable] = None, + drop_remainder: bool = False, + collate_fn_args: Optional[dict[str, Any]] = None, + label_cols: Optional[Union[str, list[str]]] = None, + prefetch: bool = True, + num_workers: int = 0, + num_test_batches: int = 20, + ): + """Create a `tf.data.Dataset` from the underlying Dataset. This `tf.data.Dataset` will load and collate batches from + the Dataset, and is suitable for passing to methods like `model.fit()` or `model.predict()`. The dataset will yield + `dicts` for both inputs and labels unless the `dict` would contain only a single key, in which case a raw + `tf.Tensor` is yielded instead. + + Args: + batch_size (`int`, *optional*): + Size of batches to load from the dataset. Defaults to `None`, which implies that the dataset won't be + batched, but the returned dataset can be batched later with `tf_dataset.batch(batch_size)`. + columns (`List[str]` or `str`, *optional*): + Dataset column(s) to load in the `tf.data.Dataset`. + Column names that are created by the `collate_fn` and that do not exist in the original dataset can be used. + shuffle(`bool`, defaults to `False`): + Shuffle the dataset order when loading. Recommended `True` for training, `False` for + validation/evaluation. 
+ drop_remainder(`bool`, defaults to `False`): + Drop the last incomplete batch when loading. Ensures + that all batches yielded by the dataset will have the same length on the batch dimension. + collate_fn(`Callable`, *optional*): + A function or callable object (such as a `DataCollator`) that will collate + lists of samples into a batch. + collate_fn_args (`Dict`, *optional*): + An optional `dict` of keyword arguments to be passed to the + `collate_fn`. + label_cols (`List[str]` or `str`, defaults to `None`): + Dataset column(s) to load as labels. + Note that many models compute loss internally rather than letting Keras do it, in which case + passing the labels here is optional, as long as they're in the input `columns`. + prefetch (`bool`, defaults to `True`): + Whether to run the dataloader in a separate thread and maintain + a small buffer of batches for training. Improves performance by allowing data to be loaded in the + background while the model is training. + num_workers (`int`, defaults to `0`): + Number of workers to use for loading the dataset. + num_test_batches (`int`, defaults to `20`): + Number of batches to use to infer the output signature of the dataset. + The higher this number, the more accurate the signature will be, but the longer it will take to + create the dataset. + + Returns: + `tf.data.Dataset` + + Example: + + ```py + >>> ds_train = ds["train"].to_tf_dataset( + ... columns=['input_ids', 'token_type_ids', 'attention_mask', 'label'], + ... shuffle=True, + ... batch_size=16, + ... collate_fn=data_collator, + ... 
) + ``` + """ + if config.TF_AVAILABLE: + import tensorflow as tf + else: + raise ImportError("Called a Tensorflow-specific function but Tensorflow is not installed.") + + if (isinstance(columns, list) and len(columns) == 1) or ( + isinstance(label_cols, list) and len(label_cols) == 1 + ): + warnings.warn( + "The output of `to_tf_dataset` will change when a passing single element list for `labels` or " + "`columns` in the next datasets version. To return a tuple structure rather than dict, pass a " + "single string.\n" + "Old behaviour: columns=['a'], labels=['labels'] -> (tf.Tensor, tf.Tensor) \n" + " : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor) \n" + "New behaviour: columns=['a'],labels=['labels'] -> ({'a': tf.Tensor}, {'labels': tf.Tensor}) \n" + " : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor) ", + FutureWarning, + ) + + if isinstance(tf.distribute.get_strategy(), tf.distribute.TPUStrategy): + logger.warning( + "Note that to_tf_dataset() loads the data with a generator rather than a full tf.data " + "pipeline and is not compatible with remote TPU connections. If you encounter errors, please " + "try using a TPU VM or, if your data can fit in memory, loading it into memory as a dict of " + "Tensors instead of streaming with to_tf_dataset()." 
+ ) + + if collate_fn is None: + # Set a very simple default collator that just stacks things together + collate_fn = minimal_tf_collate_fn + if collate_fn_args is None: + collate_fn_args = {} + if label_cols and not columns: + raise ValueError("Cannot specify label_cols without specifying columns!") + if label_cols is None: + label_cols = [] + elif isinstance(label_cols, str): + label_cols = [label_cols] + if len(set(label_cols)) < len(label_cols): + raise ValueError("List of label_cols contains duplicates.") + if columns: + if isinstance(columns, str): + columns = [columns] + if len(set(columns)) < len(columns): + raise ValueError("List of columns contains duplicates.") + cols_to_retain = list(set(columns + label_cols)) + else: + cols_to_retain = None # Indicates keeping all valid columns + columns = [] + + if self.format["type"] not in ["custom", "numpy"]: + dataset = self.with_format("numpy") + else: + dataset = self + + # TODO(Matt, QL): deprecate the retention of label_ids and label + + output_signature, columns_to_np_types = dataset._get_output_signature( + dataset, + collate_fn=collate_fn, + collate_fn_args=collate_fn_args, + cols_to_retain=cols_to_retain, + batch_size=batch_size if drop_remainder else None, + num_test_batches=num_test_batches, + ) + + if "labels" in output_signature: + if ("label_ids" in columns or "label" in columns) and "labels" not in columns: + columns = [col for col in columns if col not in ["label_ids", "label"]] + ["labels"] + if ("label_ids" in label_cols or "label" in label_cols) and "labels" not in label_cols: + label_cols = [col for col in label_cols if col not in ["label_ids", "label"]] + ["labels"] + + for col in columns: + if col not in output_signature: + raise ValueError(f"Column {col} not found in dataset!") + + for col in label_cols: + if col not in output_signature: + raise ValueError(f"Label column {col} not found in dataset!") + + if num_workers == 0: + tf_dataset = dataset_to_tf( + dataset=dataset, + 
cols_to_retain=cols_to_retain, + collate_fn=collate_fn, + collate_fn_args=collate_fn_args, + columns_to_np_types=columns_to_np_types, + output_signature=output_signature, + shuffle=shuffle, + batch_size=batch_size, + drop_remainder=drop_remainder, + ) + elif num_workers > 0: + if batch_size is None: + raise NotImplementedError( + "`batch_size` must be specified when using multiple workers, as unbatched multiprocessing " + "is not supported yet. Please provide a `batch_size` if `num_workers` is greater than 0." + ) + tf_dataset = multiprocess_dataset_to_tf( + dataset=dataset, + cols_to_retain=cols_to_retain, + collate_fn=collate_fn, + collate_fn_args=collate_fn_args, + columns_to_np_types=columns_to_np_types, + output_signature=output_signature, + shuffle=shuffle, + batch_size=batch_size, + drop_remainder=drop_remainder, + num_workers=num_workers, + ) + else: + raise ValueError("num_workers must be >= 0") + + def split_features_and_labels(input_batch): + # TODO(Matt, QL): deprecate returning the dict content when there's only one key + features = {key: tensor for key, tensor in input_batch.items() if key in columns} + labels = {key: tensor for key, tensor in input_batch.items() if key in label_cols} + if len(features) == 1: + features = list(features.values())[0] + if len(labels) == 1: + labels = list(labels.values())[0] + if isinstance(labels, dict) and len(labels) == 0: + return features + else: + return features, labels + + if cols_to_retain is not None: + tf_dataset = tf_dataset.map(split_features_and_labels) + + if prefetch: + tf_dataset = tf_dataset.prefetch(tf.data.experimental.AUTOTUNE) + + # Remove a reference to the open Arrow file on delete + def cleanup_callback(ref): + dataset.__del__() + self._TF_DATASET_REFS.remove(ref) + + self._TF_DATASET_REFS.add(weakref.ref(tf_dataset, cleanup_callback)) + + return tf_dataset + + +class DatasetTransformationNotAllowedError(Exception): + pass + + +def transmit_format(func): + """Wrapper for dataset transforms that 
recreate a new Dataset to transmit the format of the original dataset to the new dataset""" + + @wraps(func) + def wrapper(*args, **kwargs): + if args: + self: "Dataset" = args[0] + args = args[1:] + else: + self: "Dataset" = kwargs.pop("self") + # don't use self.format since it returns a list of columns for 'columns' even if self_format_columns is None + unformatted_columns = set(self.column_names) - set(self._format_columns or []) + self_format = { + "type": self._format_type, + "format_kwargs": self._format_kwargs, + "columns": self._format_columns, + "output_all_columns": self._output_all_columns, + } + # apply actual function + out: Union["Dataset", "DatasetDict"] = func(self, *args, **kwargs) + datasets: list["Dataset"] = list(out.values()) if isinstance(out, dict) else [out] + # re-apply format to the output + for dataset in datasets: + new_format = self_format.copy() + if new_format["columns"] is not None: # new formatted columns = (columns - previously unformatted columns) + # sort the columns to have a deterministic list of columns that we can compare with `out_format` + new_format["columns"] = sorted(set(dataset.column_names) - unformatted_columns) + out_format = { + "type": dataset._format_type, + "format_kwargs": dataset._format_kwargs, + "columns": sorted(dataset._format_columns) if dataset._format_columns is not None else None, + "output_all_columns": dataset._output_all_columns, + } + if out_format != new_format: + fingerprint = dataset._fingerprint + dataset.set_format(**new_format) + dataset._fingerprint = fingerprint + return out + + wrapper._decorator_name_ = "transmit_format" + return wrapper + + +def update_metadata_with_features(table: Table, features: Features): + """To be used in dataset transforms that modify the features of the dataset, in order to update the features stored in the metadata of its schema.""" + features = Features({col_name: features[col_name] for col_name in table.column_names}) + if table.schema.metadata is None or 
class Column(Sequence_):
    """
    Sequence-like view over a single column of a [`Dataset`] or of another `Column`.

    Example:

    Iterate on the texts of the "text" column of a dataset:

    ```python
    for text in dataset["text"]:
        ...
    ```

    Indexing with a string selects a nested column:

    ```python
    for source in dataset["metadata"]["source"]:
        ...
    ```
    """

    def __init__(self, source: Union["Dataset", "Column"], column_name: str):
        """Bind the view to `source` and `column_name`.

        Raises:
            ValueError: if `source.features` is not a dict or has no `column_name` entry.
        """
        self.source = source
        self.column_name = column_name
        available = source.features
        if isinstance(available, dict) and column_name in available:
            self.features = available[column_name]
        else:
            raise ValueError(f"Column '{column_name}' doesn't exist.")

    def __iter__(self) -> Iterator[Any]:
        """Yield the value of this column for every example of the source."""
        rows = self.source
        if isinstance(rows, Dataset):
            # For a dataset, iterate over a single-column selection of it.
            rows = rows._fast_select_column(self.column_name)
        for row in rows:
            yield row[self.column_name]

    def __getitem__(self, key: Union[int, str, list[int]]) -> Any:
        """Return a nested `Column` for a string key, otherwise the value(s) at `key`."""
        if isinstance(key, str):
            return Column(self, key)
        name = self.column_name
        if isinstance(self.source, Dataset):
            return self.source._fast_select_column(name)[key][name]
        if isinstance(key, int):
            return self.source[key][name]
        # Any other key is expected to make the parent column return several items.
        return [item[name] for item in self.source[key]]

    def __len__(self) -> int:
        """Return the length of the underlying source."""
        return len(self.source)

    def __repr__(self):
        # Only the first five values are shown.
        return f"Column({list(self[:5])!r})"

    def __str__(self):
        # Only the first five values are shown.
        return f"Column({list(self[:5])!s})"

    def __eq__(self, value):
        """Compare element-wise with another `Column`, or delegate to `value == list(self)`."""
        if not isinstance(value, Column):
            return value == list(self)
        return list(self) == list(value)
Optional[str] = None + self._format_kwargs: dict = {} + self._format_columns: Optional[list] = None + self._output_all_columns: bool = False + self._fingerprint: str = fingerprint + + # Read metadata + + if self._data.schema.metadata is not None and b"huggingface" in self._data.schema.metadata: + metadata = json.loads(self._data.schema.metadata[b"huggingface"].decode()) + if ( + "fingerprint" in metadata and self._fingerprint is None + ): # try to load fingerprint from the arrow file metadata + self._fingerprint = metadata["fingerprint"] + + # Infer features if None + inferred_features = Features.from_arrow_schema(arrow_table.schema) + if self.info.features is None: + self.info.features = inferred_features + else: # make sure the nested columns are in the right order + try: + self.info.features = self.info.features.reorder_fields_as(inferred_features) + except ValueError as e: + raise ValueError( + f"{e}\nThe 'source' features come from dataset_info.json, and the 'target' ones are those of the dataset arrow file." 
@property
def features(self) -> Features:
    """Return the dataset features, which are never `None` for a `Dataset`.

    Raises:
        ValueError: if the parent class reports `None` features (already ruled out in `__init__`).
    """
    resolved = super().features
    if resolved is not None:
        return resolved
    raise ValueError("Features can't be None in a Dataset object")
@classmethod
def from_buffer(
    cls,
    buffer: pa.Buffer,
    info: Optional[DatasetInfo] = None,
    split: Optional[NamedSplit] = None,
    indices_buffer: Optional[pa.Buffer] = None,
) -> "Dataset":
    """Instantiate a Dataset backed by an Arrow buffer.

    Args:
        buffer (`pyarrow.Buffer`):
            Arrow buffer holding the data table.
        info (`DatasetInfo`, *optional*):
            Dataset information, like description, citation, etc.
        split (`NamedSplit`, *optional*):
            Name of the dataset split.
        indices_buffer (`pyarrow.Buffer`, *optional*):
            Arrow buffer holding the indices table. When `None`, the dataset is created without
            an indices table.

    Returns:
        [`Dataset`]
    """
    table = InMemoryTable.from_buffer(buffer)

    # The indices must be read from `indices_buffer`. Reading them from `buffer` (as the previous
    # code did) re-parsed the data table and passed it off as the indices mapping.
    if indices_buffer is not None:
        indices_table = InMemoryTable.from_buffer(indices_buffer)
    else:
        indices_table = None

    return cls(table, info=info, split=split, indices_table=indices_table)
+ + Be aware that Series of the `object` dtype don't carry enough information to always lead to a meaningful Arrow + type. In the case that we cannot infer a type, e.g. because the DataFrame is of length 0 or the Series only + contains `None/nan` objects, the type is set to `null`. This behavior can be avoided by constructing explicit + features and passing it to this function. + + Important: a dataset created with from_pandas() lives in memory + and therefore doesn't have an associated cache directory. + This may change in the future, but in the meantime if you + want to reduce memory usage you should write it back on disk + and reload using e.g. save_to_disk / load_from_disk. + + Args: + df (`pandas.DataFrame`): + Dataframe that contains the dataset. + features ([`Features`], *optional*): + Dataset features. + info (`DatasetInfo`, *optional*): + Dataset information, like description, citation, etc. + split (`NamedSplit`, *optional*): + Name of the dataset split. + preserve_index (`bool`, *optional*): + Whether to store the index as an additional column in the resulting Dataset. + The default of `None` will store the index as a column, except for `RangeIndex` which is stored as metadata only. + Use `preserve_index=True` to force it to be stored as a column. 
@classmethod
def from_polars(
    cls,
    df: "pl.DataFrame",
    features: Optional[Features] = None,
    info: Optional[DatasetInfo] = None,
    split: Optional[NamedSplit] = None,
) -> "Dataset":
    """
    Create a [`Dataset`] from the Arrow table obtained with `df.to_arrow()`.

    According to the original documentation of this method, the conversion is mostly zero copy;
    `CategoricalType` is listed as a data type that does copy.

    Args:
        df (`polars.DataFrame`): DataFrame to convert to Arrow Table
        features (`Features`, optional): Dataset features. Falls back to `info.features` when omitted.
        info (`DatasetInfo`, optional): Dataset information, like description, citation, etc.
        split (`NamedSplit`, optional): Name of the dataset split.

    Raises:
        ValueError: if both `features` and `info` are given and `info.features` differs from `features`.

    Examples:
    ```py
    >>> ds = Dataset.from_polars(df)
    ```
    """
    if features is None:
        # No explicit features: use the ones carried by `info`, if any.
        if info is not None:
            features = info.features
    elif info is not None and info.features != features:
        raise ValueError(
            f"Features specified in `features` and `info.features` can't be different:\n{features}\n{info.features}"
        )

    if features is not None:
        features = _fix_for_backward_compatible_features(features)

    if info is None:
        info = DatasetInfo()
    info.features = features

    table = InMemoryTable(df.to_arrow())
    if features is not None:
        # Casting after the conversion is more expensive than converting with a schema,
        # but it is needed to support conversions such as str to Audio.
        table = table.cast(features.arrow_schema)
    return cls(table, info=info, split=split)
@classmethod
def from_list(
    cls,
    mapping: list[dict],
    features: Optional[Features] = None,
    info: Optional[DatasetInfo] = None,
    split: Optional[NamedSplit] = None,
) -> "Dataset":
    """
    Convert a list of dicts to a `pyarrow.Table` to create a [`Dataset`].

    The rows are pivoted into a dict of columns and handed to `from_dict`. The keys of the
    first entry determine the dataset columns, regardless of what is passed to `features`;
    a key missing from a later row yields `None` for that row. An empty list produces a
    dataset built from an empty mapping.

    Important: a dataset created with from_list() lives in memory
    and therefore doesn't have an associated cache directory.
    This may change in the future, but in the meantime if you
    want to reduce memory usage you should write it back on disk
    and reload using e.g. save_to_disk / load_from_disk.

    Args:
        mapping (`List[dict]`): A list of mappings of strings to row values.
        features (`Features`, optional): Dataset features.
        info (`DatasetInfo`, optional): Dataset information, like description, citation, etc.
        split (`NamedSplit`, optional): Name of the dataset split.

    Returns:
        [`Dataset`]
    """
    # `from_dict` is reused instead of `InMemoryTable.from_pylist`, for simplicity and for
    # consistency with respect to OptimizedTypedSequence.
    if not mapping:
        columns = {}
    else:
        columns = {key: [row.get(key) for row in mapping] for key in mapping[0]}
    return cls.from_dict(columns, features, info, split)
@staticmethod
def from_generator(
    generator: Callable,
    features: Optional[Features] = None,
    cache_dir: str = None,
    keep_in_memory: bool = False,
    gen_kwargs: Optional[dict] = None,
    num_proc: Optional[int] = None,
    split: NamedSplit = Split.TRAIN,
    fingerprint: Optional[str] = None,
    **kwargs,
):
    """Create a Dataset from a generator.

    All arguments are forwarded to `GeneratorDatasetInputStream`, whose `read()` result is returned.

    Args:
        generator (`Callable`):
            A generator function that `yields` examples.
        features ([`Features`], *optional*):
            Dataset features.
        cache_dir (`str`, *optional*, defaults to `"~/.cache/huggingface/datasets"`):
            Directory to cache data.
        keep_in_memory (`bool`, defaults to `False`):
            Whether to copy the data in-memory.
        gen_kwargs (`dict`, *optional*):
            Keyword arguments to be passed to the `generator` callable.
            You can define a sharded dataset by passing the list of shards in `gen_kwargs` and setting
            `num_proc` greater than 1.
        num_proc (`int`, *optional*, defaults to `None`):
            Number of processes when downloading and generating the dataset locally.
            This is helpful if the dataset is made of multiple files. Multiprocessing is disabled by default.
            If `num_proc` is greater than one, then all list values in `gen_kwargs` must be the same length.
            These values will be split between calls to the generator. The number of shards will be the
            minimum of the shortest list in `gen_kwargs` and `num_proc`.
        split ([`NamedSplit`], defaults to `Split.TRAIN`):
            Split name to be assigned to the dataset.
        fingerprint (`str`, *optional*):
            Fingerprint that will be used to generate dataset ID.
            By default `fingerprint` is generated by hashing the generator function and all the args,
            which can be slow if it uses large objects like AI models.
        **kwargs (additional keyword arguments):
            Keyword arguments to be passed to [`GeneratorConfig`].

    Returns:
        [`Dataset`]

    Example:

    ```py
    >>> def gen():
    ...     yield {"text": "Good", "label": 0}
    ...     yield {"text": "Bad", "label": 1}
    ...
    >>> ds = Dataset.from_generator(gen)
    ```

    ```py
    >>> def gen(shards):
    ...     for shard in shards:
    ...         with open(shard) as f:
    ...             for line in f:
    ...                 yield {"line": line}
    ...
    >>> shards = [f"data{i}.txt" for i in range(32)]
    >>> ds = Dataset.from_generator(gen, gen_kwargs={"shards": shards})
    ```
    """
    # Imported here rather than at module level.
    from .io.generator import GeneratorDatasetInputStream

    stream = GeneratorDatasetInputStream(
        generator=generator,
        features=features,
        cache_dir=cache_dir,
        keep_in_memory=keep_in_memory,
        gen_kwargs=gen_kwargs,
        num_proc=num_proc,
        split=split,
        fingerprint=fingerprint,
        **kwargs,
    )
    return stream.read()
@staticmethod
def from_parquet(
    path_or_paths: Union[PathLike, list[PathLike]],
    split: Optional[NamedSplit] = None,
    features: Optional[Features] = None,
    cache_dir: str = None,
    keep_in_memory: bool = False,
    columns: Optional[list[str]] = None,
    num_proc: Optional[int] = None,
    **kwargs,
):
    """Create Dataset from Parquet file(s).

    All arguments are forwarded to `ParquetDatasetReader`, whose `read()` result is returned.

    Args:
        path_or_paths (`path-like` or list of `path-like`):
            Path(s) of the Parquet file(s).
        split (`NamedSplit`, *optional*):
            Split name to be assigned to the dataset.
        features (`Features`, *optional*):
            Dataset features.
        cache_dir (`str`, *optional*, defaults to `"~/.cache/huggingface/datasets"`):
            Directory to cache data.
        keep_in_memory (`bool`, defaults to `False`):
            Whether to copy the data in-memory.
        columns (`List[str]`, *optional*):
            If not `None`, only these columns will be read from the file.
            A column name may be a prefix of a nested field, e.g. 'a' will select
            'a.b', 'a.c', and 'a.d.e'.
        num_proc (`int`, *optional*, defaults to `None`):
            Number of processes when downloading and generating the dataset locally.
            This is helpful if the dataset is made of multiple files. Multiprocessing is disabled by default.
        **kwargs (additional keyword arguments):
            Keyword arguments to be passed to [`ParquetConfig`].

    Returns:
        [`Dataset`]

    Example:

    ```py
    >>> ds = Dataset.from_parquet('path/to/dataset.parquet')
    ```
    """
    # Dynamic import to avoid circular dependency
    from .io.parquet import ParquetDatasetReader

    reader = ParquetDatasetReader(
        path_or_paths,
        split=split,
        features=features,
        cache_dir=cache_dir,
        keep_in_memory=keep_in_memory,
        columns=columns,
        num_proc=num_proc,
        **kwargs,
    )
    return reader.read()
@staticmethod
def from_spark(
    df: "pyspark.sql.DataFrame",
    split: Optional[NamedSplit] = None,
    features: Optional[Features] = None,
    keep_in_memory: bool = False,
    cache_dir: str = None,
    working_dir: str = None,
    load_from_cache_file: bool = True,
    **kwargs,
):
    """Create a Dataset from Spark DataFrame. Dataset downloading is distributed over Spark workers.

    The arguments are forwarded to `SparkDatasetReader` (with `streaming=False`), whose `read()`
    result is returned.

    Args:
        df (`pyspark.sql.DataFrame`):
            The DataFrame containing the desired data.
        split (`NamedSplit`, *optional*):
            Split name to be assigned to the dataset.
        features (`Features`, *optional*):
            Dataset features.
        keep_in_memory (`bool`, defaults to `False`):
            Whether to copy the data in-memory.
        cache_dir (`str`, *optional*, defaults to `"~/.cache/huggingface/datasets"`):
            Directory to cache data. When using a multi-node Spark cluster, the cache_dir must be
            accessible to both workers and the driver.
        working_dir (`str`, *optional*):
            Intermediate directory for each Spark worker to write data to before moving it to
            `cache_dir`. Setting a non-NFS intermediate directory may improve performance.
        load_from_cache_file (`bool`, defaults to `True`):
            Whether to load the dataset from the cache if possible.
        **kwargs (additional keyword arguments):
            Extra keyword arguments forwarded to `SparkDatasetReader`.

    Returns:
        [`Dataset`]

    Raises:
        OSError: when running on Windows (`sys.platform == "win32"`).

    Example:

    ```py
    >>> df = spark.createDataFrame(
    >>>     data=[[1, "Elia"], [2, "Teo"], [3, "Fang"]],
    >>>     columns=["id", "name"],
    >>> )
    >>> ds = Dataset.from_spark(df)
    ```
    """
    # Dynamic import to avoid circular dependency
    from .io.spark import SparkDatasetReader

    running_on_windows = sys.platform == "win32"
    if running_on_windows:
        raise OSError("Dataset.from_spark is not currently supported on Windows")

    reader = SparkDatasetReader(
        df,
        split=split,
        features=features,
        streaming=False,
        cache_dir=cache_dir,
        keep_in_memory=keep_in_memory,
        working_dir=working_dir,
        load_from_cache_file=load_from_cache_file,
        **kwargs,
    )
    return reader.read()
+ + Returns: + [`Dataset`] + + Example: + + ```py + >>> # Fetch a database table + >>> ds = Dataset.from_sql("test_data", "postgres:///db_name") + >>> # Execute a SQL query on the table + >>> ds = Dataset.from_sql("SELECT sentence FROM test_data", "postgres:///db_name") + >>> # Use a Selectable object to specify the query + >>> from sqlalchemy import select, text + >>> stmt = select([text("sentence")]).select_from(text("test_data")) + >>> ds = Dataset.from_sql(stmt, "postgres:///db_name") + ``` + + > [!TIP] + > The returned dataset can only be cached if `con` is specified as URI string. + """ + from .io.sql import SqlDatasetReader + + return SqlDatasetReader( + sql, + con, + features=features, + cache_dir=cache_dir, + keep_in_memory=keep_in_memory, + **kwargs, + ).read() + + def __setstate__(self, state): + self.__dict__.update(state) + maybe_register_dataset_for_temp_dir_deletion(self) + return self + + def __del__(self): + if hasattr(self, "_data"): + del self._data + if hasattr(self, "_indices"): + del self._indices + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + # Here `del` is used to del the pyarrow tables. This properly closes the files used for memory mapped tables + self.__del__() + + def save_to_disk( + self, + dataset_path: PathLike, + max_shard_size: Optional[Union[str, int]] = None, + num_shards: Optional[int] = None, + num_proc: Optional[int] = None, + storage_options: Optional[dict] = None, + ): + """ + Saves a dataset to a dataset directory, or in a filesystem using any implementation of `fsspec.spec.AbstractFileSystem`. + + For [`Image`], [`Audio`] and [`Video`] data: + + All the Image(), Audio() and Video() data are stored in the arrow files. + If you want to store paths or urls, please use the Value("string") type. + + Args: + dataset_path (`path-like`): + Path (e.g. `dataset/train`) or remote URI (e.g. `s3://my-bucket/dataset/train`) + of the dataset directory where the dataset will be saved to. 
+ max_shard_size (`int` or `str`, *optional*, defaults to `"500MB"`): + The maximum size of the dataset shards to be saved to the filesystem. If expressed as a string, needs to be digits followed by a unit + (like `"50MB"`). + num_shards (`int`, *optional*): + Number of shards to write. By default the number of shards depends on `max_shard_size` and `num_proc`. + + + num_proc (`int`, *optional*): + Number of processes when downloading and generating the dataset locally. + Multiprocessing is disabled by default. + + + storage_options (`dict`, *optional*): + Key/value pairs to be passed on to the file-system backend, if any. + + + + Example: + + ```py + >>> ds.save_to_disk("path/to/dataset/directory") + >>> ds.save_to_disk("path/to/dataset/directory", max_shard_size="1GB") + >>> ds.save_to_disk("path/to/dataset/directory", num_shards=1024) + ``` + """ + if max_shard_size is not None and num_shards is not None: + raise ValueError( + "Failed to push_to_hub: please specify either max_shard_size or num_shards, but not both." + ) + if self.list_indexes(): + raise ValueError("please remove all the indexes using `dataset.drop_index` before saving a dataset") + + if num_shards is None: + dataset_nbytes = self._estimate_nbytes() + max_shard_size = convert_file_size_to_int(max_shard_size or config.MAX_SHARD_SIZE) + num_shards = int(dataset_nbytes / max_shard_size) + 1 + num_shards = max(num_shards, num_proc or 1) + + fs: fsspec.AbstractFileSystem + fs, _ = url_to_fs(dataset_path, **(storage_options or {})) + + if not is_remote_filesystem(fs): + parent_cache_files_paths = { + Path(cache_filename["filename"]).resolve().parent for cache_filename in self.cache_files + } + # Check that the dataset doesn't overwrite itself. It can cause a permission error on Windows and a segfault on linux. 
+ if Path(dataset_path).expanduser().resolve() in parent_cache_files_paths: + raise PermissionError( + f"Tried to overwrite {Path(dataset_path).expanduser().resolve()} but a dataset can't overwrite itself." + ) + + fs.makedirs(dataset_path, exist_ok=True) + + # Get json serializable state + state = { + key: self.__dict__[key] + for key in [ + "_fingerprint", + "_format_columns", + "_format_kwargs", + "_format_type", + "_output_all_columns", + ] + } + state["_split"] = str(self.split) if self.split is not None else self.split + state["_data_files"] = [ + {"filename": f"data-{shard_idx:05d}-of-{num_shards:05d}.arrow"} for shard_idx in range(num_shards) + ] + for k in state["_format_kwargs"].keys(): + try: + json.dumps(state["_format_kwargs"][k]) + except TypeError as e: + raise TypeError( + str(e) + f"\nThe format kwargs must be JSON serializable, but key '{k}' isn't." + ) from None + # Get json serializable dataset info + dataset_info = asdict(self._info) + + shards_done = 0 + pbar = hf_tqdm( + unit=" examples", + total=len(self), + desc=f"Saving the dataset ({shards_done}/{num_shards} shards)", + ) + kwargs_per_job = ( + { + "job_id": shard_idx, + "shard": self.shard(num_shards=num_shards, index=shard_idx, contiguous=True), + "fpath": posixpath.join(dataset_path, f"data-{shard_idx:05d}-of-{num_shards:05d}.arrow"), + "storage_options": storage_options, + } + for shard_idx in range(num_shards) + ) + shard_lengths = [None] * num_shards + shard_sizes = [None] * num_shards + if num_proc is not None and num_proc >= 1: + with Pool(num_proc) as pool: + with pbar: + for job_id, done, content in iflatmap_unordered( + pool, Dataset._save_to_disk_single, kwargs_iterable=kwargs_per_job + ): + if done: + shards_done += 1 + pbar.set_description(f"Saving the dataset ({shards_done}/{num_shards} shards)") + logger.debug(f"Finished writing shard number {job_id} of {num_shards}.") + shard_lengths[job_id], shard_sizes[job_id] = content + else: + pbar.update(content) + else: + with 
@staticmethod
def _save_to_disk_single(job_id: int, shard: "Dataset", fpath: str, storage_options: Optional[dict]):
    """Write one shard to an Arrow file, yielding progress updates along the way.

    This is a generator. Every item it yields is a 3-tuple `(job_id, done, content)`:

    - `(job_id, False, n)`: `n` examples were written since the previous update.
    - `(job_id, True, (num_examples, num_bytes))`: the shard is complete; `content` is the
      pair returned by `writer.finalize()`.

    Args:
        job_id (`int`):
            Identifier echoed back in every yielded tuple so the consumer can tell shards apart.
        shard (`Dataset`):
            The slice of the dataset to write.
        fpath (`str`):
            Destination path handed to the `ArrowWriter`.
        storage_options (`dict`, *optional*):
            Forwarded unchanged to the `ArrowWriter`.

    Yields:
        `tuple`: `(job_id, done, content)` as described above.
    """
    batch_size = config.DEFAULT_MAX_BATCH_SIZE

    # Examples written since the last progress tuple was yielded.
    num_examples_progress_update = 0
    writer = ArrowWriter(
        features=shard.features,
        path=fpath,
        storage_options=storage_options,
        embed_local_files=True,
    )
    try:
        _time = time.time()
        for pa_table in shard.with_format("arrow").iter(batch_size):
            writer.write_table(pa_table)
            num_examples_progress_update += len(pa_table)
            # Throttle progress reports to at most one per refresh interval.
            if time.time() > _time + config.PBAR_REFRESH_TIME_INTERVAL:
                _time = time.time()
                yield job_id, False, num_examples_progress_update
                num_examples_progress_update = 0
    finally:
        # Runs whether or not the loop raised: flush the remaining count, then finalize
        # and close the writer so the file handle is always released.
        yield job_id, False, num_examples_progress_update
        num_examples, num_bytes = writer.finalize()
        writer.close()

    # Final "done" message carrying the shard totals.
    yield job_id, True, (num_examples, num_bytes)
@staticmethod
def load_from_disk(
    dataset_path: PathLike,
    keep_in_memory: Optional[bool] = None,
    storage_options: Optional[dict] = None,
) -> "Dataset":
    """
    Loads a dataset that was previously saved using [`save_to_disk`] from a dataset directory, or from a
    filesystem using any implementation of `fsspec.spec.AbstractFileSystem`.

    Args:
        dataset_path (`path-like`):
            Path (e.g. `"dataset/train"`) or remote URI (e.g. `"s3://my-bucket/dataset/train"`)
            of the dataset directory where the dataset will be loaded from.
        keep_in_memory (`bool`, defaults to `None`):
            Whether to copy the dataset in-memory. If `None`, the
            dataset will not be copied in-memory unless explicitly enabled by setting
            `datasets.config.IN_MEMORY_MAX_SIZE` to nonzero. See more details in the
            [improve performance](../cache#improve-performance) section.
        storage_options (`dict`, *optional*):
            Key/value pairs to be passed on to the file-system backend, if any.

    Returns:
        [`Dataset`]: The dataset stored in the `dataset_path` directory.

    Raises:
        `FileNotFoundError`: If the dataset info file and/or the dataset state file is missing from
            `dataset_path`. When the directory holds a dataset dict instead, the message points to
            `datasets.load_from_disk` / `DatasetDict.load_from_disk`.

    Example:

    ```py
    >>> ds = load_from_disk("path/to/dataset/directory")
    ```
    """
    fs: fsspec.AbstractFileSystem
    fs, dataset_path = url_to_fs(dataset_path, **(storage_options or {}))

    dest_dataset_path = dataset_path
    dataset_dict_json_path = posixpath.join(dest_dataset_path, config.DATASETDICT_JSON_FILENAME)
    dataset_state_json_path = posixpath.join(dest_dataset_path, config.DATASET_STATE_JSON_FILENAME)
    dataset_info_path = posixpath.join(dest_dataset_path, config.DATASET_INFO_FILENAME)

    # Probe the three marker files once; the branches below only differ in which ones are missing
    # and whether the directory looks like a `DatasetDict` (more specific error message).
    dataset_dict_is_file = fs.isfile(dataset_dict_json_path)
    dataset_info_is_file = fs.isfile(dataset_info_path)
    dataset_state_is_file = fs.isfile(dataset_state_json_path)
    if not dataset_info_is_file and not dataset_state_is_file:
        if dataset_dict_is_file:
            raise FileNotFoundError(
                f"No such files: '{dataset_info_path}', nor '{dataset_state_json_path}' found. Expected to load a `Dataset` object, but got a `DatasetDict`. Please use either `datasets.load_from_disk` or `DatasetDict.load_from_disk` instead."
            )
        raise FileNotFoundError(
            f"No such files: '{dataset_info_path}', nor '{dataset_state_json_path}' found. Expected to load a `Dataset` object but provided path is not a `Dataset`."
        )
    if not dataset_info_is_file:
        if dataset_dict_is_file:
            raise FileNotFoundError(
                f"No such file: '{dataset_info_path}' found. Expected to load a `Dataset` object, but got a `DatasetDict`. Please use either `datasets.load_from_disk` or `DatasetDict.load_from_disk` instead."
            )
        raise FileNotFoundError(
            f"No such file: '{dataset_info_path}'. Expected to load a `Dataset` object but provided path is not a `Dataset`."
        )
    if not dataset_state_is_file:
        if dataset_dict_is_file:
            raise FileNotFoundError(
                f"No such file: '{dataset_state_json_path}' found. Expected to load a `Dataset` object, but got a `DatasetDict`. Please use either `datasets.load_from_disk` or `DatasetDict.load_from_disk` instead."
            )
        raise FileNotFoundError(
            f"No such file: '{dataset_state_json_path}'. Expected to load a `Dataset` object but provided path is not a `Dataset`."
        )

    # copies file from filesystem if it is remote filesystem to local filesystem and modifies dataset_path to temp directory containing local copies
    if is_remote_filesystem(fs):
        src_dataset_path = dest_dataset_path
        dest_dataset_path = Dataset._build_local_temp_path(src_dataset_path)
        fs.download(src_dataset_path, dest_dataset_path.as_posix(), recursive=True)
        # Re-point the JSON paths at the freshly downloaded local copies.
        dataset_state_json_path = posixpath.join(dest_dataset_path, config.DATASET_STATE_JSON_FILENAME)
        dataset_info_path = posixpath.join(dest_dataset_path, config.DATASET_INFO_FILENAME)

    # From here on everything is read with the local `open`, not through `fs`.
    with open(dataset_state_json_path, encoding="utf-8") as state_file:
        state = json.load(state_file)
    with open(dataset_info_path, encoding="utf-8") as dataset_info_file:
        dataset_info = DatasetInfo.from_dict(json.load(dataset_info_file))

    dataset_size = estimate_dataset_size(
        Path(dest_dataset_path, data_file["filename"]) for data_file in state["_data_files"]
    )
    # An explicit `keep_in_memory` wins; otherwise decide from the estimated size.
    keep_in_memory = keep_in_memory if keep_in_memory is not None else is_small_dataset(dataset_size)
    table_cls = InMemoryTable if keep_in_memory else MemoryMappedTable

    arrow_table = concat_tables(
        thread_map(
            table_cls.from_file,
            [posixpath.join(dest_dataset_path, data_file["filename"]) for data_file in state["_data_files"]],
            tqdm_class=hf_tqdm,
            desc="Loading dataset from disk",
            # set `disable=None` rather than `disable=False` by default to disable progress bar when no TTY attached
            disable=len(state["_data_files"]) <= 16 or None,
        )
    )

    split = state["_split"]
    split = Split(split) if split is not None else split

    dataset = Dataset(
        arrow_table=arrow_table,
        info=dataset_info,
        split=split,
        fingerprint=state["_fingerprint"],
    )

    # Restore the formatting that was active when the dataset was saved.
    format = {
        "type": state["_format_type"],
        "format_kwargs": state["_format_kwargs"],
        "columns": state["_format_columns"],
        "output_all_columns": state["_output_all_columns"],
    }
    dataset = dataset.with_format(**format)

    return dataset
@property
def cache_files(self) -> list[dict]:
    """List the cache files behind the dataset's Arrow table(s).

    The files of the data table come first, followed by the files of the indices
    table when the dataset has one. Each entry is a dict with a single
    `"filename"` key.

    Example:

    ```py
    >>> from datasets import load_dataset
    >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", split="validation")
    >>> ds.cache_files
    [{'filename': '/root/.cache/huggingface/datasets/rotten_tomatoes_movie_review/default/1.0.0/40d411e45a6ce3484deed7cc15b82a53dad9a72aafd9f86f8f227134bec5ca46/rotten_tomatoes_movie_review-validation.arrow'}]
    ```
    """
    backing_tables = [self._data]
    if self._indices is not None:
        backing_tables.append(self._indices)

    filenames = []
    for table in backing_tables:
        filenames += list_table_cache_files(table)

    return [{"filename": filename} for filename in filenames]
@property
def shape(self) -> tuple[int, int]:
    """Shape of the dataset as `(number of rows, number of columns)`.

    When an indices mapping is set, the row count is the number of selected
    indices rather than the number of rows of the underlying table.

    Example:

    ```py
    >>> from datasets import load_dataset
    >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", split="validation")
    >>> ds.shape
    (1066, 2)
    ```
    """
    indices = self._indices
    if indices is None:
        # No indices mapping: the table's own shape is the dataset's shape.
        return self._data.shape
    return (indices.num_rows, self._data.num_columns)
+ + Example: + + ```py + >>> from datasets import load_dataset + >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", split="validation") + >>> ds.unique('label') + [1, 0] + ``` + """ + if column not in self._data.column_names: + raise ValueError(f"Column ({column}) not in table columns ({self._data.column_names}).") + + if self._indices is not None and self._indices.num_rows != self._data.num_rows: + dataset = self.flatten_indices() + else: + dataset = self + + return dataset._data.column(column).unique().to_pylist() + + def class_encode_column(self, column: str, include_nulls: bool = False) -> "Dataset": + """Casts the given column as [`~datasets.features.ClassLabel`] and updates the table. + + Args: + column (`str`): + The name of the column to cast (list all the column names with [`~datasets.Dataset.column_names`]) + include_nulls (`bool`, defaults to `False`): + Whether to include null values in the class labels. If `True`, the null values will be encoded as the `"None"` class label. + + + + Example: + + ```py + >>> from datasets import load_dataset + >>> ds = load_dataset("boolq", split="validation") + >>> ds.features + {'answer': Value('bool'), + 'passage': Value('string'), + 'question': Value('string')} + >>> ds = ds.class_encode_column('answer') + >>> ds.features + {'answer': ClassLabel(num_classes=2, names=['False', 'True']), + 'passage': Value('string'), + 'question': Value('string')} + ``` + """ + # Sanity checks + if column not in self._data.column_names: + raise ValueError(f"Column ({column}) not in table columns ({self._data.column_names}).") + src_feat = self._info.features[column] + if not isinstance(src_feat, Value): + raise ValueError( + f"Class encoding is only supported for {Value.__name__} column, and column {column} is {type(src_feat).__name__}." 
@fingerprint_transform(inplace=False)
def flatten(self, new_fingerprint: Optional[str] = None, max_depth=16) -> "Dataset":
    """Flatten the table.
    Each column with a struct type is flattened into one column per struct field.
    Other columns are left unchanged.

    Args:
        new_fingerprint (`str`, *optional*):
            The new fingerprint of the dataset after transform.
            If `None`, the new fingerprint is computed using a hash of the previous fingerprint, and the transform arguments.
        max_depth (`int`, defaults to `16`):
            Bound on the flattening work: at most `max_depth - 1` flattening passes are applied to the
            table, and the value is also forwarded to `Features.flatten`.

    Returns:
        [`Dataset`]: A copy of the dataset with flattened columns.

    Example:

    ```py
    >>> from datasets import load_dataset
    >>> ds = load_dataset("rajpurkar/squad", split="train")
    >>> ds.features
    {'id': Value('string'),
     'title': Value('string'),
     'context': Value('string'),
     'question': Value('string'),
     'answers': {'text': List(Value('string')),
      'answer_start': List(Value('int32'))}}
    >>> ds = ds.flatten()
    >>> ds
    Dataset({
        features: ['id', 'title', 'context', 'question', 'answers.text', 'answers.answer_start'],
        num_rows: 87599
    })
    ```
    """
    dataset = copy.deepcopy(self)
    # Pre-bind `depth`: with `max_depth <= 1` the loop below never runs, and the log
    # statement at the end would otherwise raise UnboundLocalError.
    depth = 1
    for depth in range(1, max_depth):
        # Each pass unnests one level of structs; stop as soon as none are left.
        if any(isinstance(field.type, pa.StructType) for field in dataset._data.schema):
            dataset._data = dataset._data.flatten()
        else:
            break
    dataset.info.features = self._info.features.flatten(max_depth=max_depth)
    # Re-key the features in the table's column order so both stay aligned.
    dataset.info.features = Features({col: dataset.info.features[col] for col in dataset.data.column_names})
    dataset._data = update_metadata_with_features(dataset._data, dataset.features)
    logger.info(f"Flattened dataset from depth {depth} to depth {1 if depth + 1 < max_depth else 'unknown'}.")
    dataset._fingerprint = new_fingerprint
    return dataset
+ If `batch_size <= 0` or `batch_size == None` then provide the full dataset as a single batch to cast. + keep_in_memory (`bool`, defaults to `False`): + Whether to copy the data in-memory. + load_from_cache_file (`bool`, defaults to `True` if caching is enabled): + If a cache file storing the current computation from `function` + can be identified, use it instead of recomputing. + cache_file_name (`str`, *optional*, defaults to `None`): + Provide the name of a path for the cache file. It is used to store the + results of the computation instead of the automatically generated cache file name. + writer_batch_size (`int`, defaults to `1000`): + Number of rows per write operation for the cache file writer. + This value is a good trade-off between memory usage during the processing, and processing speed. + Higher value makes the processing do fewer lookups, lower value consume less temporary memory while running [`~datasets.Dataset.map`]. + num_proc (`int`, *optional*, defaults to `None`): + Number of processes for multiprocessing. By default it doesn't + use multiprocessing. + + Returns: + [`Dataset`]: A copy of the dataset with casted features. 
+ + Example: + + ```py + >>> from datasets import load_dataset, ClassLabel, Value + >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", split="validation") + >>> ds.features + {'label': ClassLabel(names=['neg', 'pos']), + 'text': Value('string')} + >>> new_features = ds.features.copy() + >>> new_features['label'] = ClassLabel(names=['bad', 'good']) + >>> new_features['text'] = Value('large_string') + >>> ds = ds.cast(new_features) + >>> ds.features + {'label': ClassLabel(names=['bad', 'good']), + 'text': Value('large_string')} + ``` + """ + if sorted(features) != sorted(self._data.column_names): + raise ValueError( + f"The columns in features ({list(features)}) must be identical " + f"as the columns in the dataset: {self._data.column_names}" + ) + + features = _fix_for_backward_compatible_features(features) + schema = features.arrow_schema + format = self.format + dataset = self.with_format("arrow") + # capture the PyArrow version here to make the lambda serializable on Windows + dataset = dataset.map( + partial(table_cast, schema=schema), + batched=True, + batch_size=batch_size, + keep_in_memory=keep_in_memory, + load_from_cache_file=load_from_cache_file, + cache_file_name=cache_file_name, + writer_batch_size=writer_batch_size, + num_proc=num_proc, + features=features, + desc="Casting the dataset", + ) + dataset = dataset.with_format(**format) + return dataset + + @fingerprint_transform(inplace=False) + def cast_column(self, column: str, feature: FeatureType, new_fingerprint: Optional[str] = None) -> "Dataset": + """Cast column to feature for decoding. + + Args: + column (`str`): + Column name. + feature (`FeatureType`): + Target feature. + new_fingerprint (`str`, *optional*): + The new fingerprint of the dataset after transform. + If `None`, the new fingerprint is computed using a hash of the previous fingerprint, and the transform arguments. 
@transmit_format
@fingerprint_transform(inplace=False)
def remove_columns(self, column_names: Union[str, list[str]], new_fingerprint: Optional[str] = None) -> "Dataset":
    """
    Remove one or several column(s) in the dataset and the features associated to them.

    You can also remove a column using [`~datasets.Dataset.map`] with `remove_columns` but the present method
    doesn't copy the data of the remaining columns and is thus faster.

    Args:
        column_names (`Union[str, List[str]]`):
            Name of the column(s) to remove.
        new_fingerprint (`str`, *optional*):
            The new fingerprint of the dataset after transform.
            If `None`, the new fingerprint is computed using a hash of the previous fingerprint, and the transform arguments.

    Returns:
        [`Dataset`]: A copy of the dataset object without the columns to remove.

    Raises:
        `ValueError`: If any of `column_names` is not a column of the dataset.

    Example:

    ```py
    >>> from datasets import load_dataset
    >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", split="validation")
    >>> ds = ds.remove_columns('label')
    >>> ds
    Dataset({
        features: ['text'],
        num_rows: 1066
    })
    >>> ds = ds.remove_columns(column_names=ds.column_names)  # Removing all the columns returns an empty dataset with the `num_rows` property set to 0
    >>> ds
    Dataset({
        features: [],
        num_rows: 0
    })
    ```
    """
    if isinstance(column_names, str):
        column_names = [column_names]

    # Validate before copying (as `select_columns` does) so a bad column name
    # fails fast without paying for a deep copy that would be thrown away.
    missing_columns = set(column_names) - set(self._data.column_names)
    if missing_columns:
        raise ValueError(
            f"Column name {list(missing_columns)} not in the dataset. "
            f"Current columns in the dataset: {self._data.column_names}"
        )

    dataset = copy.deepcopy(self)
    for column_name in column_names:
        del dataset._info.features[column_name]

    dataset._data = dataset._data.drop(column_names)
    # Keep the schema metadata embedded in the table in sync with the trimmed features.
    dataset._data = update_metadata_with_features(dataset._data, dataset.features)
    dataset._fingerprint = new_fingerprint
    return dataset
@fingerprint_transform(inplace=False)
def rename_columns(self, column_mapping: dict[str, str], new_fingerprint: Optional[str] = None) -> "Dataset":
    """
    Rename several columns in the dataset, and move the features associated to the original columns under
    the new column names.

    Args:
        column_mapping (`Dict[str, str]`):
            A mapping of columns to rename to their new names
        new_fingerprint (`str`, *optional*):
            The new fingerprint of the dataset after transform.
            If `None`, the new fingerprint is computed using a hash of the previous fingerprint, and the transform arguments.

    Returns:
        [`Dataset`]: A copy of the dataset with renamed columns

    Raises:
        `ValueError`: If a key of `column_mapping` is not a column of the dataset, if two columns are
            mapped to the same new name, or if a new name is empty.

    Example:

    ```py
    >>> from datasets import load_dataset
    >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", split="validation")
    >>> ds = ds.rename_columns({'text': 'text_new', 'label': 'label_new'})
    >>> ds
    Dataset({
        features: ['text_new', 'label_new'],
        num_rows: 1066
    })
    ```
    """
    # All validation runs before the deep copy so invalid mappings fail fast
    # without copying the dataset first.
    extra_columns = set(column_mapping.keys()) - set(self.column_names)
    if extra_columns:
        raise ValueError(
            f"Original column names {extra_columns} not in the dataset. "
            f"Current columns in the dataset: {self._data.column_names}"
        )

    number_of_duplicates_in_new_columns = len(column_mapping.values()) - len(set(column_mapping.values()))
    if number_of_duplicates_in_new_columns != 0:
        raise ValueError(
            "New column names must all be different, but this column mapping "
            f"has {number_of_duplicates_in_new_columns} duplicates"
        )

    empty_new_columns = [new_col for new_col in column_mapping.values() if not new_col]
    if empty_new_columns:
        raise ValueError(f"New column names {empty_new_columns} are empty.")

    def rename(columns):
        # Columns absent from the mapping keep their current name.
        return [column_mapping.get(col, col) for col in columns]

    dataset = copy.deepcopy(self)

    new_column_names = rename(self._data.column_names)
    if self._format_columns is not None:
        # The formatted-column selection must follow the renamed columns.
        dataset._format_columns = rename(self._format_columns)

    dataset._info.features = Features(
        {column_mapping.get(col, col): feature for col, feature in (self._info.features or {}).items()}
    )

    dataset._data = dataset._data.rename_columns(new_column_names)
    dataset._data = update_metadata_with_features(dataset._data, dataset.features)
    dataset._fingerprint = new_fingerprint
    return dataset
def __len__(self) -> int:
    """Number of rows in the dataset (same value as the `num_rows` property).

    Example:

    ```py
    >>> from datasets import load_dataset
    >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", split="validation")
    >>> len(ds)
    1066
    ```
    """
    return self.num_rows
@contextlib.contextmanager
def formatted_as(
    self,
    type: Optional[str] = None,
    columns: Optional[list] = None,
    output_all_columns: bool = False,
    **format_kwargs,
):
    """To be used in a `with` statement. Temporarily set the `__getitem__` return format (type and columns).

    The format that was active on entry is put back when the `with` block exits,
    whether it exits normally or through an exception.

    Args:
        type (`str`, *optional*):
            Either output type selected in `[None, 'numpy', 'torch', 'tensorflow', 'jax', 'arrow', 'pandas', 'polars']`.
            `None` means `__getitem__` returns python objects (default).
        columns (`List[str]`, *optional*):
            Columns to format in the output.
            `None` means `__getitem__` returns all columns (default).
        output_all_columns (`bool`, defaults to `False`):
            Keep un-formatted columns as well in the output (as python objects).
        **format_kwargs (additional keyword arguments):
            Keywords arguments passed to the convert function like `np.array`, `torch.tensor` or `tensorflow.ragged.constant`.
    """
    # Snapshot the current format, in `set_format` positional order, before touching anything.
    previous_positional = (self._format_type, self._format_columns, self._output_all_columns)
    previous_kwargs = self._format_kwargs
    try:
        self.set_format(type, columns, output_all_columns, **format_kwargs)
        yield
    finally:
        self.set_format(*previous_positional, **previous_kwargs)
+ + It is possible to call [`~datasets.Dataset.map`] after calling `set_format`. Since `map` may add new columns, then the list of formatted columns + gets updated. In this case, if you apply `map` on a dataset to add a new column, then this column will be formatted as: + + ``` + new formatted columns = (all columns - previously unformatted columns) + ``` + + Example: + + ```py + >>> from datasets import load_dataset + >>> from transformers import AutoTokenizer + >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", split="validation") + >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased") + >>> ds = ds.map(lambda x: tokenizer(x['text'], truncation=True, padding=True), batched=True) + >>> ds.set_format(type='numpy', columns=['text', 'label']) + >>> ds.format + {'type': 'numpy', + 'format_kwargs': {}, + 'columns': ['text', 'label'], + 'output_all_columns': False} + ``` + """ + format_kwargs.update(format_kwargs.pop("format_kwargs", {})) # allow to use self.set_format(**self.format) + + # Check that the format_type and format_kwargs are valid and make it possible to have a Formatter + type = get_format_type_from_alias(type) + get_formatter(type, features=self._info.features, **format_kwargs) + + # Check filter column + if isinstance(columns, str): + columns = [columns] + if isinstance(columns, tuple): + columns = list(columns) + if columns is not None: + missing_columns = set(columns) - set(self._data.column_names) + if missing_columns: + raise ValueError( + f"Columns {list(missing_columns)} not in the dataset. 
Current columns in the dataset: {self._data.column_names}" + ) + if columns is not None: + columns = columns.copy() # Ensures modifications made to the list after this call don't cause bugs + + self._format_type = type + self._format_kwargs = format_kwargs + self._format_columns = columns + self._output_all_columns = output_all_columns + logger.debug( + "Set __getitem__(key) output type to %s for %s columns " + " (when key is int or slice) and %s output other (un-formatted) columns.", + "python objects" if type is None else type, + "no" if columns is None else str(columns), + "do" if output_all_columns else "don't", + ) + + def reset_format(self): + """Reset `__getitem__` return format to python objects and all columns. + + Same as `self.set_format()` + + Example: + + ```py + >>> from datasets import load_dataset + >>> from transformers import AutoTokenizer + >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", split="validation") + >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased") + >>> ds = ds.map(lambda x: tokenizer(x['text'], truncation=True, padding=True), batched=True) + >>> ds.set_format(type='numpy', columns=['input_ids', 'token_type_ids', 'attention_mask', 'label']) + >>> ds.format + {'columns': ['input_ids', 'token_type_ids', 'attention_mask', 'label'], + 'format_kwargs': {}, + 'output_all_columns': False, + 'type': 'numpy'} + >>> ds.reset_format() + >>> ds.format + {'columns': ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'], + 'format_kwargs': {}, + 'output_all_columns': False, + 'type': None} + ``` + """ + self.set_format() + + def set_transform( + self, + transform: Optional[Callable], + columns: Optional[list] = None, + output_all_columns: bool = False, + ): + """Set `__getitem__` return format using this transform. The transform is applied on-the-fly on batches when `__getitem__` is called. + As [`~datasets.Dataset.set_format`], this can be reset using [`~datasets.Dataset.reset_format`]. 
+ + Args: + transform (`Callable`, *optional*): + User-defined formatting transform, replaces the format defined by [`~datasets.Dataset.set_format`]. + A formatting function is a callable that takes a batch (as a `dict`) as input and returns a batch. + This function is applied right before returning the objects in `__getitem__`. + columns (`List[str]`, *optional*): + Columns to format in the output. + If specified, then the input batch of the transform only contains those columns. + output_all_columns (`bool`, defaults to `False`): + Keep un-formatted columns as well in the output (as python objects). + If set to True, then the other un-formatted columns are kept with the output of the transform. + + Example: + + ```py + >>> from datasets import load_dataset + >>> from transformers import AutoTokenizer + >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", split="validation") + >>> tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased') + >>> def encode(batch): + ... return tokenizer(batch['text'], padding=True, truncation=True, return_tensors='pt') + >>> ds.set_transform(encode) + >>> ds[0] + {'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1]), + 'input_ids': tensor([ 101, 29353, 2135, 15102, 1996, 9428, 20868, 2890, 8663, 6895, + 20470, 2571, 3663, 2090, 4603, 3017, 3008, 1998, 2037, 24211, + 5637, 1998, 11690, 2336, 1012, 102]), + 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0])} + ``` + """ + self.set_format("custom", columns=columns, output_all_columns=output_all_columns, transform=transform) + + def with_format( + self, + type: Optional[str] = None, + columns: Optional[list] = None, + output_all_columns: bool = False, + **format_kwargs, + ): + """Set `__getitem__` return format (type and columns). The data formatting is applied on-the-fly. 
+ The format `type` (for example "numpy") is used to format batches when using `__getitem__`. + + It's also possible to use custom transforms for formatting using [`~datasets.Dataset.with_transform`]. + + Contrary to [`~datasets.Dataset.set_format`], `with_format` returns a new [`Dataset`] object. + + Args: + type (`str`, *optional*): + Either output type selected in `[None, 'numpy', 'torch', 'tensorflow', 'jax', 'arrow', 'pandas', 'polars']`. + `None` means `__getitem__` returns python objects (default). + columns (`List[str]`, *optional*): + Columns to format in the output. + `None` means `__getitem__` returns all columns (default). + output_all_columns (`bool`, defaults to `False`): + Keep un-formatted columns as well in the output (as python objects). + **format_kwargs (additional keyword arguments): + Keywords arguments passed to the convert function like `np.array`, `torch.tensor` or `tensorflow.ragged.constant`. + + Example: + + ```py + >>> from datasets import load_dataset + >>> from transformers import AutoTokenizer + >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", split="validation") + >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased") + >>> ds = ds.map(lambda x: tokenizer(x['text'], truncation=True, padding=True), batched=True) + >>> ds.format + {'columns': ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'], + 'format_kwargs': {}, + 'output_all_columns': False, + 'type': None} + >>> ds = ds.with_format("torch") + >>> ds.format + {'columns': ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'], + 'format_kwargs': {}, + 'output_all_columns': False, + 'type': 'torch'} + >>> ds[0] + {'text': 'compassionately explores the seemingly irreconcilable situation between conservative christian parents and their estranged gay and lesbian children .', + 'label': tensor(1), + 'input_ids': tensor([ 101, 18027, 16310, 16001, 1103, 9321, 178, 11604, 7235, 6617, + 1742, 2165, 2820, 1206, 6588, 22572, 12937, 
1811, 2153, 1105, + 1147, 12890, 19587, 6463, 1105, 15026, 1482, 119, 102, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0]), + 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), + 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])} + ``` + """ + dataset = copy.deepcopy(self) + dataset.set_format(type=type, columns=columns, output_all_columns=output_all_columns, **format_kwargs) + return dataset + + def with_transform( + self, + transform: Optional[Callable], + columns: Optional[list] = None, + output_all_columns: bool = False, + ): + """Set `__getitem__` return format using this transform. The transform is applied on-the-fly on batches when `__getitem__` is called. + + As [`~datasets.Dataset.set_format`], this can be reset using [`~datasets.Dataset.reset_format`]. + + Contrary to [`~datasets.Dataset.set_transform`], `with_transform` returns a new [`Dataset`] object. + + Args: + transform (`Callable`, `optional`): + User-defined formatting transform, replaces the format defined by [`~datasets.Dataset.set_format`]. + A formatting function is a callable that takes a batch (as a `dict`) as input and returns a batch. + This function is applied right before returning the objects in `__getitem__`. + columns (`List[str]`, `optional`): + Columns to format in the output. + If specified, then the input batch of the transform only contains those columns. 
+ output_all_columns (`bool`, defaults to `False`): + Keep un-formatted columns as well in the output (as python objects). + If set to `True`, then the other un-formatted columns are kept with the output of the transform. + + Example: + + ```py + >>> from datasets import load_dataset + >>> from transformers import AutoTokenizer + >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", split="validation") + >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased") + >>> def encode(example): + ... return tokenizer(example["text"], padding=True, truncation=True, return_tensors='pt') + >>> ds = ds.with_transform(encode) + >>> ds[0] + {'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1]), + 'input_ids': tensor([ 101, 18027, 16310, 16001, 1103, 9321, 178, 11604, 7235, 6617, + 1742, 2165, 2820, 1206, 6588, 22572, 12937, 1811, 2153, 1105, + 1147, 12890, 19587, 6463, 1105, 15026, 1482, 119, 102]), + 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0])} + ``` + """ + dataset = copy.deepcopy(self) + dataset.set_transform(transform=transform, columns=columns, output_all_columns=output_all_columns) + return dataset + + def _getitem(self, key: Union[int, slice, str, ListLike[int]], **kwargs) -> Union[dict, list]: + """ + Can be used to index columns (by string names) or rows (by integer, slice, or list-like of integer indices) + """ + if isinstance(key, bool): + raise TypeError("dataset index must be int, str, slice or collection of int, not bool") + format_type = kwargs["format_type"] if "format_type" in kwargs else self._format_type + format_columns = kwargs["format_columns"] if "format_columns" in kwargs else self._format_columns + output_all_columns = ( + kwargs["output_all_columns"] if "output_all_columns" in kwargs else self._output_all_columns + ) + format_kwargs = kwargs["format_kwargs"] if "format_kwargs" in kwargs else 
self._format_kwargs + format_kwargs = format_kwargs if format_kwargs is not None else {} + formatter = get_formatter(format_type, features=self._info.features, **format_kwargs) + pa_subtable = query_table(self._data, key, indices=self._indices) + formatted_output = format_table( + pa_subtable, key, formatter=formatter, format_columns=format_columns, output_all_columns=output_all_columns + ) + return formatted_output + + @overload + def __getitem__(self, key: Union[int, slice, Iterable[int]]) -> dict: # noqa: F811 + ... + + @overload + def __getitem__(self, key: str) -> list: # noqa: F811 + ... + + def __getitem__(self, key): # noqa: F811 + """Can be used to index columns (by string names) or rows (by integer index or iterable of indices or bools).""" + if isinstance(key, str): + if self._format_type is None or self._format_type not in ("arrow", "pandas", "polars"): + return Column(self, key) + return self._getitem(key) + + def __getitems__(self, keys: list) -> list: + """Can be used to get a batch using a list of integers indices.""" + batch = self.__getitem__(keys) + n_examples = len(batch[next(iter(batch))]) + return [{col: array[i] for col, array in batch.items()} for i in range(n_examples)] + + def cleanup_cache_files(self) -> int: + """Clean up all cache files in the dataset cache directory, excepted the currently used cache file if there is + one. + + Be careful when running this command that no other process is currently using other cache files. + + Returns: + `int`: Number of removed files. 
+ + Example: + + ```py + >>> from datasets import load_dataset + >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", split="validation") + >>> ds.cleanup_cache_files() + 10 + ``` + """ + current_cache_files = [os.path.abspath(cache_file["filename"]) for cache_file in self.cache_files] + if not current_cache_files: + return 0 + cache_directory = os.path.dirname(current_cache_files[0]) + logger.info(f"Listing files in {cache_directory}") + files: list[str] = os.listdir(cache_directory) + files_to_remove = [] + for f_name in files: + full_name = os.path.abspath(os.path.join(cache_directory, f_name)) + if f_name.startswith("cache-") and f_name.endswith(".arrow"): + if full_name in current_cache_files: + logger.info(f"Keeping currently used cache file at {full_name}") + continue + files_to_remove.append(full_name) + for file_path in files_to_remove: + logger.info(f"Removing {file_path}") + os.remove(file_path) + return len(files_to_remove) + + def _get_cache_file_path(self, fingerprint): + if is_caching_enabled() and self.cache_files: + cache_file_name = "cache-" + fingerprint + ".arrow" + cache_directory = os.path.dirname(self.cache_files[0]["filename"]) + else: + cache_file_name = "cache-" + generate_random_fingerprint() + ".arrow" + cache_directory = get_temporary_cache_files_directory() + cache_file_path = os.path.join(cache_directory, cache_file_name) + return cache_file_path + + @transmit_format + def map( + self, + function: Optional[Callable] = None, + with_indices: bool = False, + with_rank: bool = False, + input_columns: Optional[Union[str, list[str]]] = None, + batched: bool = False, + batch_size: Optional[int] = 1000, + drop_last_batch: bool = False, + remove_columns: Optional[Union[str, list[str]]] = None, + keep_in_memory: bool = False, + load_from_cache_file: Optional[bool] = None, + cache_file_name: Optional[str] = None, + writer_batch_size: Optional[int] = 1000, + features: Optional[Features] = None, + disable_nullable: bool = False, + 
fn_kwargs: Optional[dict] = None, + num_proc: Optional[int] = None, + suffix_template: str = "_{rank:05d}_of_{num_proc:05d}", + new_fingerprint: Optional[str] = None, + desc: Optional[str] = None, + try_original_type: Optional[bool] = True, + ) -> "Dataset": + """ + Apply a function to all the examples in the table (individually or in batches) and update the table. + If your function returns a column that already exists, then it overwrites it. + + You can specify whether the function should be batched or not with the `batched` parameter: + + - If batched is `False`, then the function takes 1 example in and should return 1 example. + An example is a dictionary, e.g. `{"text": "Hello there !"}`. + - If batched is `True` and `batch_size` is 1, then the function takes a batch of 1 example as input and can return a batch with 1 or more examples. + A batch is a dictionary, e.g. a batch of 1 example is `{"text": ["Hello there !"]}`. + - If batched is `True` and `batch_size` is `n > 1`, then the function takes a batch of `n` examples as input and can return a batch with `n` examples, or with an arbitrary number of examples. + Note that the last batch may have less than `n` examples. + A batch is a dictionary, e.g. a batch of `n` examples is `{"text": ["Hello there !"] * n}`. + + If the function is asynchronous, then `map` will run your function in parallel, with up to one thousand simultaneous calls. + It is recommended to use a `asyncio.Semaphore` in your function if you want to set a maximum number of operations that can run at the same time. 
+ + Args: + function (`Callable`): Function with one of the following signatures: + + - `function(example: Dict[str, Any]) -> Dict[str, Any]` if `batched=False` and `with_indices=False` and `with_rank=False` + - `function(example: Dict[str, Any], *extra_args) -> Dict[str, Any]` if `batched=False` and `with_indices=True` and/or `with_rank=True` (one extra arg for each) + - `function(batch: Dict[str, List]) -> Dict[str, List]` if `batched=True` and `with_indices=False` and `with_rank=False` + - `function(batch: Dict[str, List], *extra_args) -> Dict[str, List]` if `batched=True` and `with_indices=True` and/or `with_rank=True` (one extra arg for each) + + For advanced usage, the function can also return a `pyarrow.Table`. + If the function is asynchronous, then `map` will run your function in parallel. + Moreover if your function returns nothing (`None`), then `map` will run your function and return the dataset unchanged. + If no function is provided, default to identity function: `lambda x: x`. + with_indices (`bool`, defaults to `False`): + Provide example indices to `function`. Note that in this case the + signature of `function` should be `def function(example, idx[, rank]): ...`. + with_rank (`bool`, defaults to `False`): + Provide process rank to `function`. Note that in this case the + signature of `function` should be `def function(example[, idx], rank): ...`. + input_columns (`Optional[Union[str, List[str]]]`, defaults to `None`): + The columns to be passed into `function` + as positional arguments. If `None`, a `dict` mapping to all formatted columns is passed as one argument. + batched (`bool`, defaults to `False`): + Provide batch of examples to `function`. + batch_size (`int`, *optional*, defaults to `1000`): + Number of examples per batch provided to `function` if `batched=True`. + If `batch_size <= 0` or `batch_size == None`, provide the full dataset as a single batch to `function`. 
+ drop_last_batch (`bool`, defaults to `False`): + Whether a last batch smaller than the batch_size should be + dropped instead of being processed by the function. + remove_columns (`Optional[Union[str, List[str]]]`, defaults to `None`): + Remove a selection of columns while doing the mapping. + Columns will be removed before updating the examples with the output of `function`, i.e. if `function` is adding + columns with names in `remove_columns`, these columns will be kept. + keep_in_memory (`bool`, defaults to `False`): + Keep the dataset in memory instead of writing it to a cache file. + load_from_cache_file (`Optional[bool]`, defaults to `True` if caching is enabled): + If a cache file storing the current computation from `function` + can be identified, use it instead of recomputing. + cache_file_name (`str`, *optional*, defaults to `None`): + Provide the name of a path for the cache file. It is used to store the + results of the computation instead of the automatically generated cache file name. + writer_batch_size (`int`, defaults to `1000`): + Number of rows per write operation for the cache file writer. + This value is a good trade-off between memory usage during the processing, and processing speed. + Higher value makes the processing do fewer lookups, lower value consume less temporary memory while running `map`. + features (`Optional[datasets.Features]`, defaults to `None`): + Use a specific Features to store the cache file + instead of the automatically generated one. + disable_nullable (`bool`, defaults to `False`): + Disallow null values in the table. + fn_kwargs (`Dict`, *optional*, defaults to `None`): + Keyword arguments to be passed to `function`. + num_proc (`int`, *optional*, defaults to `None`): + The number of processes to use for multiprocessing. + - If `None` or `0`, no multiprocessing is used and the operation runs in the main process. + - If greater than `1`, one or multiple worker processes are used to process data in parallel. 
+ Note: The function passed to `map()` must be picklable for multiprocessing to work correctly + (i.e., prefer functions defined at the top level of a module, not inside another function or class). + suffix_template (`str`): + If `cache_file_name` is specified, then this suffix + will be added at the end of the base name of each. Defaults to `"_{rank:05d}_of_{num_proc:05d}"`. For example, if `cache_file_name` is "processed.arrow", then for + `rank=1` and `num_proc=4`, the resulting file would be `"processed_00001_of_00004.arrow"` for the default suffix. + new_fingerprint (`str`, *optional*, defaults to `None`): + The new fingerprint of the dataset after transform. + If `None`, the new fingerprint is computed using a hash of the previous fingerprint, and the transform arguments. + desc (`str`, *optional*, defaults to `None`): + Meaningful description to be displayed alongside with the progress bar while mapping examples. + try_original_type (`Optional[bool]`, defaults to `True`): + Try to keep the types of the original columns (e.g. int32 -> int32). + Set to False if you want to always infer new types. + + Example: + + ```py + >>> from datasets import load_dataset + >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", split="validation") + >>> def add_prefix(example): + ... example["text"] = "Review: " + example["text"] + ... 
return example + >>> ds = ds.map(add_prefix) + >>> ds[0:3]["text"] + ['Review: compassionately explores the seemingly irreconcilable situation between conservative christian parents and their estranged gay and lesbian children .', + 'Review: the soundtrack alone is worth the price of admission .', + 'Review: rodriguez does a splendid job of racial profiling hollywood style--casting excellent latin actors of all ages--a trend long overdue .'] + + # process a batch of examples + >>> ds = ds.map(lambda example: tokenizer(example["text"]), batched=True) + # set number of processors + >>> ds = ds.map(add_prefix, num_proc=4) + ``` + """ + if keep_in_memory and cache_file_name is not None: + raise ValueError("Please use either `keep_in_memory` or `cache_file_name` but not both.") + + if num_proc == 0: + num_proc = None + elif num_proc is not None and num_proc < 0: + raise ValueError("num_proc must be >= 0 or None.") + + string_formatter = string.Formatter() + fields = {field_name for _, field_name, _, _ in string_formatter.parse(suffix_template) if field_name} + if fields != {"rank", "num_proc"}: + raise ValueError(f"suffix_template must contain exactly the fields 'rank' and 'num_proc', got: {fields}") + + # If the array is empty we do nothing (but we make sure to handle an empty indices mapping and remove the requested columns anyway) + if len(self) == 0: + if self._indices is not None: # empty indices mapping + self = Dataset( + self.data.slice(0, 0), + info=self.info.copy(), + split=self.split, + fingerprint=new_fingerprint, + ) + if remove_columns: + return self.remove_columns(remove_columns) + else: + return self + + if function is None: + function = lambda x: x # noqa: E731 + + if isinstance(input_columns, str): + input_columns = [input_columns] + + if input_columns is not None: + missing_columns = set(input_columns) - set(self._data.column_names) + if missing_columns: + raise ValueError( + f"Input column {list(missing_columns)} not in the dataset. 
Current columns in the dataset: {self._data.column_names}" + ) + + if isinstance(remove_columns, str): + remove_columns = [remove_columns] + + if remove_columns is not None: + missing_columns = set(remove_columns) - set(self._data.column_names) + if missing_columns: + raise ValueError( + f"Column to remove {list(missing_columns)} not in the dataset. Current columns in the dataset: {self._data.column_names}" + ) + + load_from_cache_file = load_from_cache_file if load_from_cache_file is not None else is_caching_enabled() + + if fn_kwargs is None: + fn_kwargs = {} + + if features is not None: + features = _fix_for_backward_compatible_features(features) + + if num_proc is not None and num_proc > len(self): + num_proc = len(self) + logger.warning( + f"num_proc must be <= {len(self)}. Reducing num_proc to {num_proc} for dataset of size {len(self)}." + ) + + dataset_kwargs = { + "shard": self, + "function": function, + "with_indices": with_indices, + "with_rank": with_rank, + "input_columns": input_columns, + "batched": batched, + "batch_size": batch_size, + "drop_last_batch": drop_last_batch, + "remove_columns": remove_columns, + "keep_in_memory": keep_in_memory, + "writer_batch_size": writer_batch_size, + "features": features, + "disable_nullable": disable_nullable, + "fn_kwargs": fn_kwargs, + "try_original_type": try_original_type, + } + + if new_fingerprint is None: + # we create a unique hash from the function, + # current dataset file and the mapping args + transform = format_transform_for_fingerprint(Dataset._map_single) + kwargs_for_fingerprint = format_kwargs_for_fingerprint(Dataset._map_single, (), dataset_kwargs) + kwargs_for_fingerprint["fingerprint_name"] = "new_fingerprint" + new_fingerprint = update_fingerprint(self._fingerprint, transform, kwargs_for_fingerprint) + else: + validate_fingerprint(new_fingerprint) + dataset_kwargs["new_fingerprint"] = new_fingerprint + + if self.cache_files: + if cache_file_name is None: + cache_file_name = 
self._get_cache_file_path(new_fingerprint) + dataset_kwargs["cache_file_name"] = cache_file_name + + if cache_file_name is not None: + cache_file_prefix, cache_file_ext = os.path.splitext(cache_file_name) + if not cache_file_ext: + raise ValueError(f"Expected cache_file_name to have an extension, but got: {cache_file_name}") + else: + cache_file_prefix = cache_file_ext = None + + def load_processed_shard_from_cache(shard_kwargs: dict[str, Any]) -> Dataset: + """Load a processed shard from cache if it exists, otherwise throw an error.""" + shard = shard_kwargs["shard"] + # Check if we've already cached this computation (indexed by a hash) + if shard_kwargs["cache_file_name"] is not None: + if os.path.exists(shard_kwargs["cache_file_name"]) and load_from_cache_file: + info = shard.info.copy() + info.features = features + return Dataset.from_file(shard_kwargs["cache_file_name"], info=info, split=shard.split) + raise NonExistentDatasetError + + existing_cache_file_map: dict[int, list[str]] = defaultdict(list) + if cache_file_name is not None: + if os.path.exists(cache_file_name): + existing_cache_file_map[1] = [cache_file_name] + + assert cache_file_prefix is not None and cache_file_ext is not None + cache_file_with_suffix_pattern = cache_file_prefix + suffix_template + cache_file_ext + + for cache_file in glob.iglob(f"{cache_file_prefix}*{cache_file_ext}"): + suffix_variable_map = string_to_dict( + Path(cache_file).as_posix(), Path(cache_file_with_suffix_pattern).as_posix() + ) + if suffix_variable_map is not None: + file_num_proc = int(suffix_variable_map["num_proc"]) + existing_cache_file_map[file_num_proc].append(cache_file) + + num_shards = num_proc or 1 + if existing_cache_file_map: + # to avoid remapping when a different num_proc is given than when originally cached, update num_shards to + # what was used originally + + def select_existing_cache_files(mapped_num_proc: int) -> tuple[float, ...]: + percent_missing = (mapped_num_proc - 
len(existing_cache_file_map[mapped_num_proc])) / mapped_num_proc + num_shards_diff = abs(mapped_num_proc - num_shards) + return ( + percent_missing, # choose the most complete set of existing cache files + num_shards_diff, # then choose the mapped_num_proc closest to the current num_proc + mapped_num_proc, # finally, choose whichever mapped_num_proc is lower + ) + + num_shards = min(existing_cache_file_map, key=select_existing_cache_files) + + existing_cache_files = existing_cache_file_map[num_shards] + + def format_cache_file_name( + cache_file_name: Optional[str], + rank: Union[int, Literal["*"]], # noqa: F722 + ) -> Optional[str]: + if not cache_file_name: + return cache_file_name + + assert cache_file_prefix is not None and cache_file_ext is not None + + if isinstance(rank, int): + cache_file_name = ( + cache_file_prefix + suffix_template.format(rank=rank, num_proc=num_shards) + cache_file_ext + ) + if not os.path.exists(cache_file_name): + process_name = ( + "Main process" if num_proc is None or num_proc == 1 else f"Process #{rank % num_shards + 1}" + ) + logger.info(f"{process_name} will write at {cache_file_name}") + else: + # TODO: this assumes the format_spec of rank in suffix_template + cache_file_name = ( + cache_file_prefix + + suffix_template.replace("{rank:05d}", "{rank}").format(rank=rank, num_proc=num_shards) + + cache_file_ext + ) + return cache_file_name + + def format_new_fingerprint(new_fingerprint: str, rank: int) -> str: + new_fingerprint = new_fingerprint + suffix_template.format(rank=rank, num_proc=num_shards) + validate_fingerprint(new_fingerprint) + return new_fingerprint + + if num_proc is not None and num_proc >= 1: + prev_env = deepcopy(os.environ) + # check if parallelism if off + # from https://github.com/huggingface/tokenizers/blob/bb668bc439dc34389b71dbb8ce0c597f15707b53/tokenizers/src/utils/parallelism.rs#L22 + if prev_env.get("TOKENIZERS_PARALLELISM", "false").lower() not in ( + "", + "off", + "false", + "f", + "no", + "n", + "0", 
+ ): + logger.warning("Setting TOKENIZERS_PARALLELISM=false for forked processes.") + os.environ["TOKENIZERS_PARALLELISM"] = "false" + else: + prev_env = os.environ + + kwargs_per_job: list[Optional[dict[str, Any]]] + if num_shards == 1: + shards = [self] + kwargs_per_job = [dataset_kwargs] + else: + shards = [ + self.shard(num_shards=num_shards, index=rank, contiguous=True, keep_in_memory=keep_in_memory) + for rank in range(num_shards) + ] + kwargs_per_job = [ + { + **dataset_kwargs, + "shard": shards[rank], + "cache_file_name": format_cache_file_name(cache_file_name, rank), + "rank": rank, + "offset": sum(len(s) for s in shards[:rank]), + "new_fingerprint": format_new_fingerprint(new_fingerprint, rank), + } + for rank in range(num_shards) + ] + + transformed_shards: list[Optional[Dataset]] = [None] * num_shards + for rank in range(num_shards): + try: + job_kwargs = kwargs_per_job[rank] + assert job_kwargs is not None + transformed_shards[rank] = load_processed_shard_from_cache(job_kwargs) + kwargs_per_job[rank] = None + except NonExistentDatasetError: + pass + + if unprocessed_kwargs_per_job := [kwargs for kwargs in kwargs_per_job if kwargs is not None]: + if len(unprocessed_kwargs_per_job) != num_shards: + logger.info( + f"Reprocessing {len(unprocessed_kwargs_per_job)}/{num_shards} shards because some of them were " + "missing from the cache." 
+ ) + + pbar_total = len(self) + pbar_initial = len(existing_cache_files) * pbar_total // num_shards + if batched and drop_last_batch: + batch_size = batch_size or 1 + pbar_initial = pbar_initial // num_shards // batch_size * num_shards * batch_size + pbar_total = pbar_total // num_shards // batch_size * num_shards * batch_size + + with hf_tqdm( + unit=" examples", + initial=pbar_initial, + total=pbar_total, + desc=(desc or "Map") + (f" (num_proc={num_proc})" if num_proc is not None and num_proc >= 1 else ""), + ) as pbar: + shards_done = 0 + + def check_if_shard_done(rank: Optional[int], done: bool, content: Union[Dataset, int]) -> None: + nonlocal shards_done + if done: + shards_done += 1 + logger.debug(f"Finished processing shard number {rank} of {num_shards}.") + assert isinstance(content, Dataset) + transformed_shards[rank or 0] = content + else: + assert isinstance(content, int) + pbar.update(content) + + if num_proc is not None and num_proc >= 1: + with Pool(num_proc) as pool: + os.environ = prev_env + logger.info(f"Spawning {num_proc} processes") + + for rank, done, content in iflatmap_unordered( + pool, Dataset._map_single, kwargs_iterable=unprocessed_kwargs_per_job + ): + check_if_shard_done(rank, done, content) + + pool.close() + pool.join() + else: + for unprocessed_kwargs in unprocessed_kwargs_per_job: + for rank, done, content in Dataset._map_single(**unprocessed_kwargs): + check_if_shard_done(rank, done, content) + + # Avoids PermissionError on Windows (the error: https://github.com/huggingface/datasets/actions/runs/4026734820/jobs/6921621805) + for job_kwargs in unprocessed_kwargs_per_job: + if "shard" in job_kwargs: + del job_kwargs["shard"] + else: + logger.info(f"Loading cached processed dataset at {format_cache_file_name(cache_file_name, '*')}") + + all_transformed_shards = [shard for shard in transformed_shards if shard is not None] + if len(transformed_shards) != len(all_transformed_shards): + raise ValueError( + f"Failed to retrieve results 
@staticmethod
def _map_single(
    shard: "Dataset",
    function: Optional[Callable] = None,
    with_indices: bool = False,
    with_rank: bool = False,
    input_columns: Optional[list[str]] = None,
    batched: bool = False,
    batch_size: Optional[int] = 1000,
    drop_last_batch: bool = False,
    remove_columns: Optional[list[str]] = None,
    keep_in_memory: bool = False,
    cache_file_name: Optional[str] = None,
    writer_batch_size: Optional[int] = 1000,
    features: Optional[Features] = None,
    disable_nullable: bool = False,
    fn_kwargs: Optional[dict] = None,
    new_fingerprint: Optional[str] = None,
    rank: Optional[int] = None,
    offset: int = 0,
    try_original_type: Optional[bool] = True,
) -> Iterable[tuple[Optional[int], bool, Union[int, "Dataset"]]]:
    """Apply a function to all the elements of one shard (individually or in batches)
    and write the results to a new table (if the function does update examples).

    This is a generator: it reports progress while it runs and hands back the
    resulting dataset as its last item.

    Args:
        shard (`datasets.Dataset`): Dataset to map the transform on.
        function (`Callable`): with one of the following signatures:
            - `function(example: Dict[str, Any]) -> Dict[str, Any]` if `batched=False` and `with_indices=False` and `with_rank=False`
            - `function(example: Dict[str, Any], *extra_args) -> Dict[str, Any]` if `batched=False` and `with_indices=True` and/or `with_rank=True` (one extra arg for each)
            - `function(batch: Dict[str, List]) -> Dict[str, List]` if `batched=True` and `with_indices=False` and `with_rank=False`
            - `function(batch: Dict[str, List], *extra_args) -> Dict[str, List]` if `batched=True` and `with_indices=True` and/or `with_rank=True` (one extra arg for each)

            For advanced usage, the function can also return a `pyarrow.Table`.
            Moreover if your function returns nothing (`None`), then `map` will run your function and return the dataset unchanged.
            Coroutine functions are supported: they are scheduled as tasks on an event loop.
        with_indices (`bool`, defaults to `False`): Provide example indices to `function`. Note that in this case the signature of `function` should be `def function(example, idx[, rank]): ...`.
        with_rank (`bool`, defaults to `False`): Provide process rank to `function`. Note that in this case the signature of `function` should be `def function(example[, idx], rank): ...`.
        input_columns (`Optional[List[str]]`, defaults to `None`): The columns to be passed into `function` as
            positional arguments. If `None`, a dict mapping to all formatted columns is passed as one argument.
        batched (`bool`, defaults to `False`): Provide batch of examples to `function`.
        batch_size (`int`, optional, defaults to `1000`): Number of examples per batch provided to `function` if `batched=True`.
            `batch_size <= 0` or `batch_size == None`: Provide the full shard as a single batch to `function`.
        drop_last_batch (`bool`, defaults to `False`): Whether a last batch smaller than the batch_size should be
            dropped instead of being processed by the function.
        remove_columns (`Optional[List[str]]`, defaults to `None`): Remove a selection of columns while doing the mapping.
            Columns will be removed before updating the examples with the output of `function`, i.e. if `function` is adding
            columns with names in `remove_columns`, these columns will be kept.
        keep_in_memory (`bool`, defaults to `False`): Keep the dataset in memory instead of writing it to a cache file.
        cache_file_name (`str`, optional, defaults to `None`): Provide the name of a path for the cache file. It is used to store the
            results of the computation. If `None`, the result is written to an in-memory buffer.
        writer_batch_size (`int`, defaults to `1000`): Number of rows per write operation for the cache file writer.
            This value is a good trade-off between memory usage during the processing, and processing speed.
            Higher value makes the processing do fewer lookups, lower value consume less temporary memory while running `.map()`.
        features (`Optional[datasets.Features]`, defaults to `None`): Use a specific Features to store the cache file
            instead of the automatically generated one.
        disable_nullable (`bool`, defaults to `False`): Disallow null values in the table.
        fn_kwargs (`Dict`, optional, defaults to `None`): Keyword arguments to be passed to `function`.
        new_fingerprint (`str`, optional, defaults to `None`): The fingerprint handed to the writer for the transformed data.
        rank (`int`, optional, defaults to `None`): If specified, this is the process rank when doing multiprocessing.
            It is passed to `function` when `with_rank=True` and echoed back in every yielded tuple.
        offset (`int`, defaults to 0): If specified, this is an offset applied to the indices passed to `function` if `with_indices=True`.
        try_original_type (`Optional[bool]`, defaults to `True`):
            Try to keep the types of the original columns (e.g. int32 -> int32).
            Set to False if you want to always infer new types.

    Yields:
        `(rank, done, content)` tuples. While processing, `done` is `False` and `content` is the number of
        examples processed since the previous progress tuple. The last tuple has `done=True` and `content`
        is the resulting `Dataset` (the input `shard` itself if `function` did not return any data).

    Raises:
        TypeError: If `function` returns a value of an unsupported type.
        DatasetTransformationNotAllowedError: In batched mode, if the shard has attached indexes and
            `function` changes the number of examples.
    """
    if fn_kwargs is None:
        fn_kwargs = {}

    # If we do batch computation but no batch size is provided, default to the full dataset
    if batched and (batch_size is None or batch_size <= 0):
        batch_size = shard.num_rows

    # `prepare_outputs` sets this flag on every call: True if the map function
    # returned something, False if it returned None.
    # While it is falsy, nothing is written and no new arrow table is created.

    update_data = None

    format_kwargs = shard._format_kwargs.copy()
    # Lazy formatting is only available for the default format (None/python)
    if not input_columns and shard._format_type is None:
        format_kwargs["lazy"] = True
    input_formatter = get_formatter(
        shard._format_type,
        features=shard.features,
        **format_kwargs,
    )

    # With attached indexes, a batched function must return as many examples as it received
    check_same_num_examples = batched and len(shard.list_indexes()) > 0

    def validate_function_output(processed_inputs):
        """Validate output of the map function."""
        allowed_processed_inputs_types = (Mapping, pa.Table, pd.DataFrame)
        # Optional frameworks are only considered if the user has already imported them
        if config.POLARS_AVAILABLE and "polars" in sys.modules:
            import polars as pl

            allowed_processed_inputs_types += (pl.DataFrame,)
        if processed_inputs is not None and not isinstance(processed_inputs, allowed_processed_inputs_types):
            raise TypeError(
                f"Provided `function` which is applied to all elements of table returns a variable of type {type(processed_inputs)}. Make sure provided `function` returns a variable of type `dict` (or a pyarrow table) to update the dataset or `None` if you are only interested in side effects."
            )
        if batched and isinstance(processed_inputs, Mapping):
            # In batched mode every value of the returned mapping must be a column-like container
            allowed_batch_return_types = (list, np.ndarray, pd.Series)
            if config.POLARS_AVAILABLE and "polars" in sys.modules:
                import polars as pl

                allowed_batch_return_types += (pl.Series, pl.DataFrame)
            if config.TF_AVAILABLE and "tensorflow" in sys.modules:
                import tensorflow as tf

                allowed_batch_return_types += (tf.Tensor,)
            if config.TORCH_AVAILABLE and "torch" in sys.modules:
                import torch

                allowed_batch_return_types += (torch.Tensor,)
            if config.JAX_AVAILABLE and "jax" in sys.modules:
                import jax.numpy as jnp

                allowed_batch_return_types += (jnp.ndarray,)
            all_dict_values_are_lists = all(
                isinstance(value, allowed_batch_return_types) for value in processed_inputs.values()
            )
            if all_dict_values_are_lists is False:
                raise TypeError(
                    f"Provided `function` which is applied to all elements of table returns a `dict` of types {[type(x) for x in processed_inputs.values()]}. When using `batched=True`, make sure provided `function` returns a `dict` of types like `{allowed_batch_return_types}`."
                )

    def prepare_inputs(pa_inputs, indices, offset=0):
        """Format one arrow example/batch and build the arguments to call `function` with.

        Returns the formatted inputs, the positional args, the extra positional
        args (indices and/or rank) and the keyword args.
        """
        inputs = format_table(
            pa_inputs,
            0 if not batched else range(pa_inputs.num_rows),
            format_columns=input_columns,
            formatter=input_formatter,
        )
        fn_args = [inputs] if input_columns is None else [inputs[col] for col in input_columns]
        if offset == 0:
            effective_indices = indices
        else:
            # `indices` is a list of ints in batched mode and a single int otherwise
            effective_indices = [i + offset for i in indices] if isinstance(indices, list) else indices + offset
        additional_args = ()
        if with_indices:
            additional_args += (effective_indices,)
        if with_rank:
            additional_args += (rank,)
        return inputs, fn_args, additional_args, fn_kwargs

    def prepare_outputs(pa_inputs, inputs, processed_inputs):
        """Validate the output of `function` and merge it with the input columns.

        Also records in `update_data` whether `function` returned anything.
        Returns `None` when there is nothing to write.
        """
        nonlocal update_data
        if not (update_data := (processed_inputs is not None)):
            return None
        if isinstance(processed_inputs, LazyDict):
            # Keep only the entries that are not listed in `keys_to_format`
            processed_inputs = {
                k: v for k, v in processed_inputs.data.items() if k not in processed_inputs.keys_to_format
            }
            returned_lazy_dict = True
        else:
            returned_lazy_dict = False
        validate_function_output(processed_inputs)
        if shard._format_type or input_columns:
            # TODO(QL, MS): ideally the behavior should be the same even if the dataset is formatted (may require major release)
            inputs_to_merge = dict(zip(pa_inputs.column_names, pa_inputs.itercolumns()))
        elif isinstance(inputs, LazyDict):
            # For keys still listed in `keys_to_format`, fall back to the arrow column
            inputs_to_merge = {
                k: (v if k not in inputs.keys_to_format else pa_inputs[k]) for k, v in inputs.data.items()
            }
        else:
            inputs_to_merge = inputs
        if remove_columns is not None:
            for column in remove_columns:
                # `function` can modify input in-place causing column to be already removed.
                if column in inputs_to_merge:
                    inputs_to_merge.pop(column)
                if returned_lazy_dict and column in processed_inputs:
                    processed_inputs.pop(column)
        if check_same_num_examples:
            input_num_examples = len(pa_inputs)
            # The length of the first returned column is taken as the number of output examples
            processed_inputs_num_examples = len(processed_inputs[next(iter(processed_inputs.keys()))])
            if input_num_examples != processed_inputs_num_examples:
                raise DatasetTransformationNotAllowedError(
                    "Using `.map` in batched mode on a dataset with attached indexes is allowed only if it doesn't create or remove existing examples. You can first run `.drop_index() to remove your index and then re-add it."
                ) from None
        if isinstance(inputs, Mapping) and isinstance(processed_inputs, Mapping):
            # The .map() transform *updates* the dataset:
            # the output dictionary contains both the input data and the output data.
            # The output dictionary may contain Arrow values from `inputs_to_merge` so that we can re-write them efficiently.
            return {**inputs_to_merge, **processed_inputs}
        else:
            return processed_inputs

    def apply_function(pa_inputs, indices, offset=0):
        """Utility to apply the function on a selection of columns."""
        inputs, fn_args, additional_args, fn_kwargs = prepare_inputs(pa_inputs, indices, offset=offset)
        processed_inputs = function(*fn_args, *additional_args, **fn_kwargs)
        return prepare_outputs(pa_inputs, inputs, processed_inputs)

    async def async_apply_function(pa_inputs, indices, offset=0):
        """Utility to apply the function on a selection of columns. Same code but async"""
        inputs, fn_args, additional_args, fn_kwargs = prepare_inputs(pa_inputs, indices, offset=offset)
        processed_inputs = await function(*fn_args, *additional_args, **fn_kwargs)
        return prepare_outputs(pa_inputs, inputs, processed_inputs)

    def init_buffer_and_writer():
        """Create the writer, returning `(buf_writer, writer, tmp_file)`.

        Exactly one of `buf_writer` (in-memory output) and `tmp_file` (on-disk output) is not `None`.
        """
        # Prepare output buffer and batched writer in memory or on file if we update the table
        writer_features = features
        if writer_features is None:
            # No explicit features: start from the shard's features and let the writer update them
            writer_features = shard.features
            update_features = True
        else:
            update_features = False
        if keep_in_memory or cache_file_name is None:
            buf_writer = pa.BufferOutputStream()
            tmp_file = None
            writer = ArrowWriter(
                features=writer_features,
                stream=buf_writer,
                writer_batch_size=writer_batch_size,
                update_features=update_features,
                fingerprint=new_fingerprint,
                disable_nullable=disable_nullable,
            )
        else:
            buf_writer = None
            logger.info(f"Caching processed dataset at {cache_file_name}")
            cache_dir = os.path.dirname(cache_file_name)
            os.makedirs(cache_dir, exist_ok=True)
            # Write to a temporary file in the cache directory; it is moved to
            # `cache_file_name` only once processing has succeeded.
            tmp_file = tempfile.NamedTemporaryFile("wb", dir=cache_dir, delete=False)
            writer = ArrowWriter(
                features=writer_features,
                path=tmp_file.name,
                writer_batch_size=writer_batch_size,
                update_features=update_features,
                fingerprint=new_fingerprint,
                disable_nullable=disable_nullable,
            )
        return buf_writer, writer, tmp_file

    # Pending asyncio tasks (only used when `function` is a coroutine function)
    tasks: list[asyncio.Task] = []
    if inspect.iscoroutinefunction(function):
        # Reuse the running event loop if there is one, otherwise create a new one
        try:
            loop = asyncio.get_running_loop()
        except RuntimeError:
            loop = asyncio.new_event_loop()
    else:
        loop = None

    def iter_outputs(shard_iterable):
        """Yield `(index_or_indices, output)` for each item of `shard_iterable`, in input order."""
        nonlocal tasks, loop
        if inspect.iscoroutinefunction(function):
            # `indices` and `tasks` are kept in lockstep: indices[k] belongs to tasks[k]
            indices: Union[list[int], list[list[int]]] = []
            for i, example in shard_iterable:
                indices.append(i)
                tasks.append(loop.create_task(async_apply_function(example, i, offset=offset)))
                # keep the total active tasks under a certain number
                if len(tasks) >= config.MAX_NUM_RUNNING_ASYNC_MAP_FUNCTIONS_IN_PARALLEL:
                    done, pending = loop.run_until_complete(
                        asyncio.wait(tasks, return_when=asyncio.FIRST_COMPLETED)
                    )
                    while tasks and len(pending) >= config.MAX_NUM_RUNNING_ASYNC_MAP_FUNCTIONS_IN_PARALLEL:
                        done, pending = loop.run_until_complete(
                            asyncio.wait(tasks, return_when=asyncio.FIRST_COMPLETED)
                        )
                # yield finished tasks
                # (only from the front of the queue, so outputs keep the input order)
                while tasks and tasks[0].done():
                    yield indices.pop(0), tasks.pop(0).result()
            # Input exhausted: wait for the remaining tasks one by one, still in order
            while tasks:
                yield indices[0], loop.run_until_complete(tasks[0])
                indices.pop(0), tasks.pop(0)
        else:
            for i, example in shard_iterable:
                yield i, apply_function(example, i, offset=offset)

    # Number of examples processed since the last progress tuple was yielded
    num_examples_progress_update = 0
    # If `update_data` is True after processing the first example/batch, initialize these resources with `init_buffer_and_writer`
    buf_writer, writer, tmp_file = None, None, None

    # Check if Polars is available and import it if so
    if config.POLARS_AVAILABLE and "polars" in sys.modules:
        import polars as pl

    # Optionally initialize the writer as a context manager
    with contextlib.ExitStack() as stack:
        try:
            arrow_formatted_shard = shard.with_format("arrow")

            # Loop over single examples or batches and write to buffer/file if examples are to be updated
            if not batched:
                shard_iterable = enumerate(arrow_formatted_shard)
            else:
                # With `drop_last_batch`, stop the generated index lists at the last full batch
                num_rows = len(shard) if not drop_last_batch else len(shard) // batch_size * batch_size
                shard_iterable = zip(
                    (list(range(i, min(i + batch_size, num_rows))) for i in range(0, num_rows, batch_size)),
                    arrow_formatted_shard.iter(batch_size, drop_last_batch=drop_last_batch),
                )
            if not batched:
                _time = time.time()
                for i, example in iter_outputs(shard_iterable):
                    if update_data:
                        # The writer is created when the item with index 0 is seen.
                        # NOTE(review): if `function` returns None for index 0 but data for a
                        # later index, `writer` is still None below - confirm this cannot happen.
                        if i == 0:
                            buf_writer, writer, tmp_file = init_buffer_and_writer()
                            stack.enter_context(writer)
                        if isinstance(example, pa.Table):
                            writer.write_row(example)
                        elif isinstance(example, pd.DataFrame):
                            writer.write_row(pa.Table.from_pandas(example))
                        elif (
                            config.POLARS_AVAILABLE
                            and "polars" in sys.modules
                            and isinstance(example, pl.DataFrame)
                        ):
                            writer.write_row(example.to_arrow())
                        else:
                            writer.write(example)
                    num_examples_progress_update += 1
                    # Report progress at most once per refresh interval
                    if time.time() > _time + config.PBAR_REFRESH_TIME_INTERVAL:
                        _time = time.time()
                        yield rank, False, num_examples_progress_update
                        num_examples_progress_update = 0
            else:
                _time = time.time()
                for i, batch in iter_outputs(shard_iterable):
                    # In batched mode `i` is the list of indices of the examples in the batch
                    num_examples_in_batch = len(i)
                    if update_data:
                        # The writer is created when the batch starting at index 0 is seen
                        if i and i[0] == 0:
                            buf_writer, writer, tmp_file = init_buffer_and_writer()
                            stack.enter_context(writer)
                        if isinstance(batch, pa.Table):
                            writer.write_table(batch)
                        elif isinstance(batch, pd.DataFrame):
                            writer.write_table(pa.Table.from_pandas(batch))
                        elif (
                            config.POLARS_AVAILABLE and "polars" in sys.modules and isinstance(batch, pl.DataFrame)
                        ):
                            writer.write_table(batch.to_arrow())
                        else:
                            writer.write_batch(batch, try_original_type=try_original_type)
                    num_examples_progress_update += num_examples_in_batch
                    # Report progress at most once per refresh interval
                    if time.time() > _time + config.PBAR_REFRESH_TIME_INTERVAL:
                        _time = time.time()
                        yield rank, False, num_examples_progress_update
                        num_examples_progress_update = 0
            if update_data and writer is not None:
                writer.finalize()  # close_stream=bool(buf_writer is None))  # We only close if we are writing in a file
        except (Exception, KeyboardInterrupt):
            # Flush the pending progress count, then clean up before re-raising
            yield rank, False, num_examples_progress_update
            if update_data:
                if writer is not None:
                    writer.finalize()
                if tmp_file is not None:
                    # Remove the partially written temporary cache file
                    tmp_file.close()
                    if os.path.exists(tmp_file.name):
                        os.remove(tmp_file.name)
            if loop:
                # Cancel the async tasks that are still pending and wait for them to settle
                logger.debug(f"Canceling {len(tasks)} async tasks.")
                for task in tasks:
                    task.cancel(msg="KeyboardInterrupt")
                try:
                    loop.run_until_complete(asyncio.gather(*tasks))
                except (asyncio.CancelledError, ValueError):
                    logger.debug("Tasks canceled.")
            raise

    # Flush the progress accumulated since the last progress tuple
    yield rank, False, num_examples_progress_update
    if update_data and tmp_file is not None:
        # Processing succeeded: move the temporary file to its final location
        tmp_file.close()
        shutil.move(tmp_file.name, cache_file_name)
        # `os.umask` returns the previous mask, so set-then-restore reads the current umask;
        # it is used to give the cache file the default permissions for new files.
        umask = os.umask(0o666)
        os.umask(umask)
        os.chmod(cache_file_name, 0o666 & ~umask)

    if update_data:
        # Create new Dataset from buffer or file
        info = shard.info.copy()
        info.features = writer._features
        if buf_writer is None:
            yield rank, True, Dataset.from_file(cache_file_name, info=info, split=shard.split)
        else:
            yield rank, True, Dataset.from_buffer(buf_writer.getvalue(), info=info, split=shard.split)
    else:
        # `function` returned no data: hand back the input shard unchanged
        yield rank, True, shard
@transmit_format
@fingerprint_transform(
    inplace=False, ignore_kwargs=["load_from_cache_file", "cache_file_name", "desc"], version="2.0.1"
)
def filter(
    self,
    function: Optional[Callable] = None,
    with_indices: bool = False,
    with_rank: bool = False,
    input_columns: Optional[Union[str, list[str]]] = None,
    batched: bool = False,
    batch_size: Optional[int] = 1000,
    keep_in_memory: bool = False,
    load_from_cache_file: Optional[bool] = None,
    cache_file_name: Optional[str] = None,
    writer_batch_size: Optional[int] = 1000,
    fn_kwargs: Optional[dict] = None,
    num_proc: Optional[int] = None,
    suffix_template: str = "_{rank:05d}_of_{num_proc:05d}",
    new_fingerprint: Optional[str] = None,
    desc: Optional[str] = None,
) -> "Dataset":
    """Keep only the examples for which `function` returns `True`.

    The predicate is evaluated through a batched `map` call that collects the indices of the
    examples to keep; the result is a copy of this dataset using those indices as its indices mapping.

    If the function is asynchronous, then `filter` will run your function in parallel, with up to one
    thousand simultaneous calls (configurable). It is recommended to use a `asyncio.Semaphore` in your
    function if you want to set a maximum number of operations that can run at the same time.

    Args:
        function (`Callable`): Predicate with one of the following signatures:

            - `function(example: Dict[str, Any]) -> bool` if `batched=False` and `with_indices=False` and `with_rank=False`
            - `function(example: Dict[str, Any], *extra_args) -> bool` if `batched=False` and `with_indices=True` and/or `with_rank=True` (one extra arg for each)
            - `function(batch: Dict[str, List]) -> List[bool]` if `batched=True` and `with_indices=False` and `with_rank=False`
            - `function(batch: Dict[str, List], *extra_args) -> List[bool]` if `batched=True` and `with_indices=True` and/or `with_rank=True` (one extra arg for each)

            If the function is asynchronous, then `filter` will run your function in parallel.
            If no function is provided, defaults to an always `True` function: `lambda x: True`.
        with_indices (`bool`, defaults to `False`):
            Provide example indices to `function`. Note that in this case the
            signature of `function` should be `def function(example, idx[, rank]): ...`.
        with_rank (`bool`, defaults to `False`):
            Provide process rank to `function`. Note that in this case the
            signature of `function` should be `def function(example[, idx], rank): ...`.
        input_columns (`str` or `List[str]`, *optional*):
            The columns to be passed into `function` as
            positional arguments. If `None`, a `dict` mapping to all formatted columns is passed as one argument.
        batched (`bool`, defaults to `False`):
            Provide batch of examples to `function`.
        batch_size (`int`, *optional*, defaults to `1000`):
            Number of examples per batch provided to `function` if
            `batched = True`. If `batched = False`, one example per batch is passed to `function`.
            If `batch_size <= 0` or `batch_size == None`, provide the full dataset as a single batch to `function`.
            For an asynchronous, non-batched `function` the underlying batch size is forced to `1`.
        keep_in_memory (`bool`, defaults to `False`):
            Keep the dataset in memory instead of writing it to a cache file.
        load_from_cache_file (`Optional[bool]`, defaults to `True` if caching is enabled):
            If a cache file storing the current computation from `function`
            can be identified, use it instead of recomputing.
        cache_file_name (`str`, *optional*):
            Provide the name of a path for the cache file. It is used to store the
            results of the computation instead of the automatically generated cache file name.
        writer_batch_size (`int`, defaults to `1000`):
            Number of rows per write operation for the cache file writer.
            This value is a good trade-off between memory usage during the processing, and processing speed.
            Higher value makes the processing do fewer lookups, lower value consume less temporary memory while running `map`.
        fn_kwargs (`dict`, *optional*):
            Keyword arguments to be passed to `function`.
        num_proc (`int`, *optional*, defaults to `None`):
            The number of processes to use for multiprocessing.
            - If `None` or `0`, no multiprocessing is used and the operation runs in the main process.
            - If greater than `1`, one or multiple worker processes are used to process data in parallel.
            Note: The function passed to `map()` must be picklable for multiprocessing to work correctly
            (i.e., prefer functions defined at the top level of a module, not inside another function or class).
        suffix_template (`str`):
            If `cache_file_name` is specified, then this suffix will be added at the end of the base name of each.
            For example, if `cache_file_name` is `"processed.arrow"`, then for `rank = 1` and `num_proc = 4`,
            the resulting file would be `"processed_00001_of_00004.arrow"` for the default suffix (default
            `_{rank:05d}_of_{num_proc:05d}`).
        new_fingerprint (`str`, *optional*):
            The new fingerprint of the dataset after transform.
            If `None`, the new fingerprint is computed using a hash of the previous fingerprint, and the transform arguments.
        desc (`str`, *optional*, defaults to `None`):
            Meaningful description to be displayed alongside with the progress bar while filtering examples.

    Returns:
        [`Dataset`]: The filtered dataset. An empty dataset is returned as is.

    Raises:
        DatasetTransformationNotAllowedError: If the dataset has attached indexes.

    Example:

    ```py
    >>> from datasets import load_dataset
    >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", split="validation")
    >>> ds = ds.filter(lambda x: x["label"] == 1)
    >>> ds
    Dataset({
        features: ['text', 'label'],
        num_rows: 533
    })
    ```

    """
    if len(self.list_indexes()) > 0:
        raise DatasetTransformationNotAllowedError(
            "Using `.filter` on a dataset with attached indexes is not allowed. You can first run `.drop_index() to remove your index and then re-add it.`"
        )

    if function is None:
        function = lambda x: True  # noqa: E731

    # Nothing to filter
    if len(self) == 0:
        return self

    if inspect.iscoroutinefunction(function):
        mask_to_indices = async_get_indices_from_mask_function
        # The underlying map() is normally batched for throughput, but a non-batched
        # coroutine is mapped one example at a time so that the calls can run in parallel.
        if not batched:
            batch_size = 1
    else:
        mask_to_indices = get_indices_from_mask_function

    # Wrap the predicate into a batched function that returns the indices of the kept examples
    index_collector = partial(
        mask_to_indices,
        function,
        batched,
        with_indices,
        with_rank,
        input_columns,
        self._indices,
    )
    kept_indices = self.map(
        function=index_collector,
        with_indices=True,
        with_rank=True,
        features=Features({"indices": Value("uint64")}),
        batched=True,
        batch_size=batch_size,
        remove_columns=self.column_names,
        keep_in_memory=keep_in_memory,
        load_from_cache_file=load_from_cache_file,
        cache_file_name=cache_file_name,
        writer_batch_size=writer_batch_size,
        fn_kwargs=fn_kwargs,
        num_proc=num_proc,
        suffix_template=suffix_template,
        new_fingerprint=new_fingerprint,
        input_columns=input_columns,
        desc=desc or "Filter",
    )

    # The filtered dataset is a copy of this one that uses the collected indices as its mapping
    filtered = copy.deepcopy(self)
    filtered._indices = kept_indices.data
    filtered._fingerprint = new_fingerprint
    return filtered
self, + keep_in_memory: bool = False, + cache_file_name: Optional[str] = None, + writer_batch_size: Optional[int] = 1000, + features: Optional[Features] = None, + disable_nullable: bool = False, + num_proc: Optional[int] = None, + new_fingerprint: Optional[str] = None, + ) -> "Dataset": + """Create and cache a new Dataset by flattening the indices mapping. + + Args: + keep_in_memory (`bool`, defaults to `False`): + Keep the dataset in memory instead of writing it to a cache file. + cache_file_name (`str`, *optional*, default `None`): + Provide the name of a path for the cache file. It is used to store the + results of the computation instead of the automatically generated cache file name. + writer_batch_size (`int`, defaults to `1000`): + Number of rows per write operation for the cache file writer. + This value is a good trade-off between memory usage during the processing, and processing speed. + Higher value makes the processing do fewer lookups, lower value consume less temporary memory while running `map`. + features (`Optional[datasets.Features]`, defaults to `None`): + Use a specific [`Features`] to store the cache file + instead of the automatically generated one. + disable_nullable (`bool`, defaults to `False`): + Allow null values in the table. + num_proc (`int`, optional, default `None`): + Max number of processes when generating cache. Already cached shards are loaded sequentially + new_fingerprint (`str`, *optional*, defaults to `None`): + The new fingerprint of the dataset after transform. 
def _new_dataset_with_indices(
    self,
    indices_cache_file_name: Optional[str] = None,
    indices_buffer: Optional[pa.Buffer] = None,
    fingerprint: Optional[str] = None,
) -> "Dataset":
    """Build a new Dataset that shares this dataset's data but uses the given indices mapping.

    The mapping is read from `indices_cache_file_name` when it is given, otherwise from `indices_buffer`.

    Raises:
        ValueError: If neither an indices file nor an indices buffer is given, or if `fingerprint` is `None`.
    """
    from_file = indices_cache_file_name is not None

    if not from_file and indices_buffer is None:
        raise ValueError("At least one of indices_cache_file_name or indices_buffer must be provided.")

    if fingerprint is None:
        raise ValueError("please specify a fingerprint for the dataset with indices")

    # The cache file takes precedence over the buffer when both are given
    mapping = (
        MemoryMappedTable.from_file(indices_cache_file_name)
        if from_file
        else InMemoryTable.from_buffer(indices_buffer)
    )

    # The new Dataset gets its own copy of the info
    return Dataset(
        self._data,
        info=self.info.copy(),
        split=self.split,
        indices_table=mapping,
        fingerprint=fingerprint,
    )
+ If the indices correspond to a contiguous range, the Arrow table is simply sliced. + However passing a list of indices that are not contiguous creates indices mapping, which is much less efficient, + but still faster than recreating an Arrow table made of the requested rows. + keep_in_memory (`bool`, defaults to `False`): + Keep the indices mapping in memory instead of writing it to a cache file. + indices_cache_file_name (`str`, *optional*, defaults to `None`): + Provide the name of a path for the cache file. It is used to store the + indices mapping instead of the automatically generated cache file name. + writer_batch_size (`int`, defaults to `1000`): + Number of rows per write operation for the cache file writer. + This value is a good trade-off between memory usage during the processing, and processing speed. + Higher value makes the processing do fewer lookups, lower value consume less temporary memory while running `map`. + new_fingerprint (`str`, *optional*, defaults to `None`): + The new fingerprint of the dataset after transform. + If `None`, the new fingerprint is computed using a hash of the previous fingerprint, and the transform arguments. + + Example: + + ```py + >>> from datasets import load_dataset + >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", split="validation") + >>> ds = ds.select(range(4)) + >>> ds + Dataset({ + features: ['text', 'label'], + num_rows: 4 + }) + ``` + """ + if keep_in_memory and indices_cache_file_name is not None: + raise ValueError("Please use either `keep_in_memory` or `indices_cache_file_name` but not both.") + + if len(self.list_indexes()) > 0: + raise DatasetTransformationNotAllowedError( + "Using `.select` on a dataset with attached indexes is not allowed. You can first run `.drop_index() to remove your index and then re-add it." 
@transmit_format
@fingerprint_transform(inplace=False)
def _select_contiguous(
    self,
    start: int,
    length: int,
    new_fingerprint: Optional[str] = None,
) -> "Dataset":
    """Create a new dataset made of the `length` rows that start at row `start`.

    Without an indices mapping (or for an empty selection) the data table is sliced;
    otherwise the indices mapping is sliced and the data table is kept as is.

    Args:
        start (`int`): start index.
        length (`int`): length of the slice to select.
        new_fingerprint (`str`, optional, default `None`): the new fingerprint of the dataset after transform.
            If `None`, the new fingerprint is computed using a hash of the previous fingerprint, and the transform arguments

    Raises:
        DatasetTransformationNotAllowedError: If the dataset has attached indexes.

    Example:

    ```py
    >>> from datasets import load_dataset
    >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", split="validation")
    >>> ds._select_contiguous(0, 4)
    Dataset({
        features: ['text', 'label'],
        num_rows: 4
    })
    ```
    """
    if len(self.list_indexes()) > 0:
        raise DatasetTransformationNotAllowedError(
            "Using `.select` on a dataset with attached indexes is not allowed. You can first run `.drop_index() to remove your index and then re-add it."
        )

    total_rows = len(self)
    # An empty dataset is returned untouched
    if total_rows == 0:
        return self

    # Validate the first and the last requested position
    for position in (start, start + length - 1):
        _check_valid_indices_value(position, total_rows)

    if self._indices is None or length == 0:
        return Dataset(
            self.data.slice(start, length),
            info=self.info.copy(),
            split=self.split,
            fingerprint=new_fingerprint,
        )

    return Dataset(
        self.data,
        info=self.info.copy(),
        split=self.split,
        indices_table=self._indices.slice(start, length),
        fingerprint=new_fingerprint,
    )
+ indices_cache_file_name (`str`, optional, default `None`): Provide the name of a path for the cache file. It is used to store the + indices mapping instead of the automatically generated cache file name. + writer_batch_size (`int`, default `1000`): Number of rows per write operation for the cache file writer. + This value is a good trade-off between memory usage during the processing, and processing speed. + Higher value makes the processing do fewer lookups, lower value consume less temporary memory while running `.map()`. + new_fingerprint (`str`, optional, default `None`): the new fingerprint of the dataset after transform. + If `None`, the new fingerprint is computed using a hash of the previous fingerprint, and the transform arguments + + Example: + + ```py + >>> from datasets import load_dataset + >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", split="validation") + >>> ds._select_with_indices_mapping(range(4)) + Dataset({ + features: ['text', 'label'], + num_rows: 4 + }) + ``` + """ + if keep_in_memory and indices_cache_file_name is not None: + raise ValueError("Please use either `keep_in_memory` or `indices_cache_file_name` but not both.") + + if len(self.list_indexes()) > 0: + raise DatasetTransformationNotAllowedError( + "Using `.select` on a dataset with attached indexes is not allowed. You can first run `.drop_index() to remove your index and then re-add it." 
+ ) + + # If the array is empty we do nothing + if len(self) == 0: + return self + + # Prepare the writer for our indices arrow table + if keep_in_memory or indices_cache_file_name is None: + buf_writer = pa.BufferOutputStream() + tmp_file = None + writer = ArrowWriter( + stream=buf_writer, writer_batch_size=writer_batch_size, fingerprint=new_fingerprint, unit="indices" + ) + else: + buf_writer = None + logger.info(f"Caching indices mapping at {indices_cache_file_name}") + cache_dir = os.path.dirname(indices_cache_file_name) + os.makedirs(cache_dir, exist_ok=True) + tmp_file = tempfile.NamedTemporaryFile("wb", dir=cache_dir, delete=False) + writer = ArrowWriter( + path=tmp_file.name, writer_batch_size=writer_batch_size, fingerprint=new_fingerprint, unit="indices" + ) + + indices = indices if isinstance(indices, list) else list(indices) + + size = len(self) + if indices: + _check_valid_indices_value(int(max(indices)), size=size) + _check_valid_indices_value(int(min(indices)), size=size) + else: + return self._select_contiguous(0, 0, new_fingerprint=new_fingerprint) + + indices_array = pa.array(indices, type=pa.uint64()) + # Check if we need to convert indices + if self._indices is not None: + indices_array = self._indices.column(0).take(indices_array) + + indices_table = pa.Table.from_arrays([indices_array], names=["indices"]) + + with writer: + try: + writer.write_table(indices_table) + writer.finalize() # close_stream=bool(buf_writer is None)) We only close if we are writing in a file + except (Exception, KeyboardInterrupt): + if tmp_file is not None: + tmp_file.close() + if os.path.exists(tmp_file.name): + os.remove(tmp_file.name) + raise + + if tmp_file is not None: + tmp_file.close() + shutil.move(tmp_file.name, indices_cache_file_name) + umask = os.umask(0o666) + os.umask(umask) + os.chmod(indices_cache_file_name, 0o666 & ~umask) + + # Return new Dataset object + if buf_writer is None: + return self._new_dataset_with_indices( + 
indices_cache_file_name=indices_cache_file_name, fingerprint=new_fingerprint + ) + else: + return self._new_dataset_with_indices(indices_buffer=buf_writer.getvalue(), fingerprint=new_fingerprint) + + def skip(self, n: int) -> "Dataset": + """ + Create a new [`Dataset`] that skips the first `n` elements. + + Args: + n (`int`): + Number of elements to skip. + + Example: + + ```py + >>> from datasets import load_dataset + >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", split="train") + >>> list(ds.take(3)) + [{'label': 1, + 'text': 'the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .'}, + {'label': 1, + 'text': 'the gorgeously elaborate continuation of " the lord of the rings " trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson\'s expanded vision of j . r . r . tolkien\'s middle-earth .'}, + {'label': 1, 'text': 'effective but too-tepid biopic'}] + >>> ds = ds.skip(1) + >>> list(ds.take(3)) + [{'label': 1, + 'text': 'the gorgeously elaborate continuation of " the lord of the rings " trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson\'s expanded vision of j . r . r . tolkien\'s middle-earth .'}, + {'label': 1, 'text': 'effective but too-tepid biopic'}, + {'label': 1, + 'text': 'if you sometimes like to go to the movies to have fun , wasabi is a good place to start .'}] + ``` + """ + return self.select(range(n, len(self))) + + def repeat(self, num_times: int) -> "Dataset": + """ + Create a new [`Dataset`] that repeats the underlying dataset `num_times` times. + + Like itertools.repeat, repeating once just returns the full dataset. + + Args: + num_times (`int`): + Number of times to repeat the dataset. 
+ + Example: + ```py + >>> from datasets import load_dataset + >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", split="train") + >>> ds = ds.take(2).repeat(2) + >>> list(ds) + [{'label': 1, + 'text': 'the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .'}, + {'label': 1, + 'text': 'the gorgeously elaborate continuation of " the lord of the rings " trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson\'s expanded vision of j . r . r . tolkien\'s middle-earth .'}, + {'label': 1, 'text': 'effective but too-tepid biopic'}, + {'label': 1, + 'text': 'the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .'}, + {'label': 1, + 'text': 'the gorgeously elaborate continuation of " the lord of the rings " trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson\'s expanded vision of j . r . r . tolkien\'s middle-earth .'}, + {'label': 1, 'text': 'effective but too-tepid biopic'}] + ``` + """ + if num_times is None: + raise ValueError("Map style datasets do not support indefinite repetition.") + return _concatenate_map_style_datasets([self] * num_times) if num_times > 0 else self.select([]) + + def take(self, n: int) -> "Dataset": + """ + Create a new [`Dataset`] with only the first `n` elements. + + Args: + n (`int`): + Number of elements to take. 
+ + Example: + + ```py + >>> from datasets import load_dataset + >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", split="train") + >>> small_ds = ds.take(2) + >>> list(small_ds) + [{'label': 1, + 'text': 'the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .'}, + {'label': 1, + 'text': 'the gorgeously elaborate continuation of " the lord of the rings " trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson\'s expanded vision of j . r . r . tolkien\'s middle-earth .'}] + ``` + """ + return self.select(range(n)) + + @transmit_format + @fingerprint_transform(inplace=False, ignore_kwargs=["load_from_cache_file", "indices_cache_file_name"]) + def sort( + self, + column_names: Union[str, Sequence_[str]], + reverse: Union[bool, Sequence_[bool]] = False, + null_placement: str = "at_end", + keep_in_memory: bool = False, + load_from_cache_file: Optional[bool] = None, + indices_cache_file_name: Optional[str] = None, + writer_batch_size: Optional[int] = 1000, + new_fingerprint: Optional[str] = None, + ) -> "Dataset": + """Create a new dataset sorted according to a single or multiple columns. + + Args: + column_names (`Union[str, Sequence[str]]`): + Column name(s) to sort by. + reverse (`Union[bool, Sequence[bool]]`, defaults to `False`): + If `True`, sort by descending order rather than ascending. If a single bool is provided, + the value is applied to the sorting of all column names. Otherwise a list of bools with the + same length and order as column_names must be provided. + null_placement (`str`, defaults to `at_end`): + Put `None` values at the beginning if `at_start` or `first` or at the end if `at_end` or `last` + + + keep_in_memory (`bool`, defaults to `False`): + Keep the sorted indices in memory instead of writing it to a cache file. 
+ load_from_cache_file (`Optional[bool]`, defaults to `True` if caching is enabled): + If a cache file storing the sorted indices + can be identified, use it instead of recomputing. + indices_cache_file_name (`str`, *optional*, defaults to `None`): + Provide the name of a path for the cache file. It is used to store the + sorted indices instead of the automatically generated cache file name. + writer_batch_size (`int`, defaults to `1000`): + Number of rows per write operation for the cache file writer. + Higher value gives smaller cache files, lower value consume less temporary memory. + new_fingerprint (`str`, *optional*, defaults to `None`): + The new fingerprint of the dataset after transform. + If `None`, the new fingerprint is computed using a hash of the previous fingerprint, and the transform arguments + + Example: + + ```py + >>> from datasets import load_dataset + >>> ds = load_dataset('cornell-movie-review-data/rotten_tomatoes', split='validation') + >>> ds['label'][:10] + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1] + >>> sorted_ds = ds.sort('label') + >>> sorted_ds['label'][:10] + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0] + >>> another_sorted_ds = ds.sort(['label', 'text'], reverse=[True, False]) + >>> another_sorted_ds['label'][:10] + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1] + ``` + """ + if len(self.list_indexes()) > 0: + raise DatasetTransformationNotAllowedError( + "Using `.sort` on a dataset with attached indexes is not allowed. You can first run `.drop_index() to remove your index and then re-add it." + ) + # If the array is empty we do nothing + if len(self) == 0: + return self + + # Check proper format of and for duplicates in column_names + if isinstance(column_names, str): + column_names = [column_names] + + # Check proper format and length of reverse + if not isinstance(reverse, bool): + if len(reverse) != len(column_names): + raise ValueError( + "Parameter 'reverse' should be either a boolean or a list of booleans with the same length as 'column_names'." 
+ ) + else: + reverse = [reverse] * len(column_names) + + # Check whether column name(s) exist in dataset + for column in column_names: + if not isinstance(column, str) or column not in self._data.column_names: + raise ValueError( + f"Column '{column}' not found in the dataset. Please provide a column selected in: {self._data.column_names}" + ) + + # Change null_placement to conform to pyarrow's sort_indices() while ensuring backwards compatibility + if null_placement not in ["at_start", "at_end"]: + if null_placement == "first": + null_placement = "at_start" + elif null_placement == "last": + null_placement = "at_end" + else: + raise ValueError( + f"null_placement '{null_placement}' is an invalid parameter value. Must be either 'last', 'at_end', 'first' or 'at_start'." + ) + + load_from_cache_file = load_from_cache_file if load_from_cache_file is not None else is_caching_enabled() + + # Check if we've already cached this computation (indexed by a hash) + if self.cache_files: + if indices_cache_file_name is None: + # we create a unique hash from the function, current dataset file and the mapping args + indices_cache_file_name = self._get_cache_file_path(new_fingerprint) + if os.path.exists(indices_cache_file_name) and load_from_cache_file: + logger.info(f"Loading cached sorted indices for dataset at {indices_cache_file_name}") + return self._new_dataset_with_indices( + fingerprint=new_fingerprint, indices_cache_file_name=indices_cache_file_name + ) + + sort_table = query_table( + table=self._data, + key=slice(0, len(self)), + indices=self._indices, + ) + + sort_keys = [ + (col, "ascending" if not col_reverse else "descending") for col, col_reverse in zip(column_names, reverse) + ] + + indices = pc.sort_indices(sort_table, sort_keys=sort_keys, null_placement=null_placement) + + return self.select( + indices=indices, + keep_in_memory=keep_in_memory, + indices_cache_file_name=indices_cache_file_name, + writer_batch_size=writer_batch_size, + 
new_fingerprint=new_fingerprint, + ) + + @transmit_format + @fingerprint_transform( + inplace=False, randomized_function=True, ignore_kwargs=["load_from_cache_file", "indices_cache_file_name"] + ) + def shuffle( + self, + seed: Optional[int] = None, + generator: Optional[np.random.Generator] = None, + keep_in_memory: bool = False, + load_from_cache_file: Optional[bool] = None, + indices_cache_file_name: Optional[str] = None, + writer_batch_size: Optional[int] = 1000, + new_fingerprint: Optional[str] = None, + ) -> "Dataset": + """Create a new Dataset where the rows are shuffled. + + Currently shuffling uses numpy random generators. + You can either supply a NumPy BitGenerator to use, or a seed to initiate NumPy's default random generator (PCG64). + + Shuffling takes the list of indices `[0:len(my_dataset)]` and shuffles it to create an indices mapping. + However as soon as your [`Dataset`] has an indices mapping, the speed can become 10x slower. + This is because there is an extra step to get the row index to read using the indices mapping, and most importantly, you aren't reading contiguous chunks of data anymore. + To restore the speed, you'd need to rewrite the entire dataset on your disk again using [`Dataset.flatten_indices`], which removes the indices mapping. + This may take a lot of time depending of the size of your dataset though: + + ```python + my_dataset[0] # fast + my_dataset = my_dataset.shuffle(seed=42) + my_dataset[0] # up to 10x slower + my_dataset = my_dataset.flatten_indices() # rewrite the shuffled dataset on disk as contiguous chunks of data + my_dataset[0] # fast again + ``` + + In this case, we recommend switching to an [`IterableDataset`] and leveraging its fast approximate shuffling method [`IterableDataset.shuffle`]. 
+ It only shuffles the shards order and adds a shuffle buffer to your dataset, which keeps the speed of your dataset optimal: + + ```python + my_iterable_dataset = my_dataset.to_iterable_dataset(num_shards=128) + for example in enumerate(my_iterable_dataset): # fast + pass + + shuffled_iterable_dataset = my_iterable_dataset.shuffle(seed=42, buffer_size=100) + + for example in enumerate(shuffled_iterable_dataset): # as fast as before + pass + ``` + + Args: + seed (`int`, *optional*): + A seed to initialize the default BitGenerator if `generator=None`. + If `None`, then fresh, unpredictable entropy will be pulled from the OS. + If an `int` or `array_like[ints]` is passed, then it will be passed to SeedSequence to derive the initial BitGenerator state. + generator (`numpy.random.Generator`, *optional*): + Numpy random Generator to use to compute the permutation of the dataset rows. + If `generator=None` (default), uses `np.random.default_rng` (the default BitGenerator (PCG64) of NumPy). + keep_in_memory (`bool`, defaults to `False`): + Keep the shuffled indices in memory instead of writing them to a cache file. + load_from_cache_file (`Optional[bool]`, defaults to `True` if caching is enabled): + If a cache file storing the shuffled indices + can be identified, use it instead of recomputing. + indices_cache_file_name (`str`, *optional*): + Provide the name of a path for the cache file. It is used to store the + shuffled indices instead of the automatically generated cache file name. + writer_batch_size (`int`, defaults to `1000`): + Number of rows per write operation for the cache file writer. + This value is a good trade-off between memory usage during the processing, and processing speed. + Higher value makes the processing do fewer lookups, lower value consumes less temporary memory while running `map`. + new_fingerprint (`str`, *optional*, defaults to `None`): + The new fingerprint of the dataset after transform.
+ If `None`, the new fingerprint is computed using a hash of the previous fingerprint, and the transform arguments. + + Example: + + ```py + >>> from datasets import load_dataset + >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", split="validation") + >>> ds['label'][:10] + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1] + + # set a seed + >>> shuffled_ds = ds.shuffle(seed=42) + >>> shuffled_ds['label'][:10] + [1, 0, 1, 1, 0, 0, 0, 0, 0, 0] + ``` + """ + if len(self.list_indexes()) > 0: + raise DatasetTransformationNotAllowedError( + "Using `.shuffle` on a dataset with attached indexes is not allowed. You can first run `.drop_index() to remove your index and then re-add it." + ) + # If the array is empty we do nothing + if len(self) == 0: + return self + + if keep_in_memory and indices_cache_file_name is not None: + raise ValueError("Please use either `keep_in_memory` or `indices_cache_file_name` but not both.") + + if seed is not None and generator is not None: + raise ValueError("Both `seed` and `generator` were provided. 
Please specify just one of them.") + + if generator is not None and not isinstance(generator, np.random.Generator): + raise ValueError("The provided generator must be an instance of numpy.random.Generator") + + load_from_cache_file = load_from_cache_file if load_from_cache_file is not None else is_caching_enabled() + + if generator is None: + if seed is None: + _, seed, pos, *_ = np.random.get_state() + seed = seed[pos] if pos < 624 else seed[0] + _ = np.random.random() # do 1 step of rng + generator = np.random.default_rng(seed) + + # Check if we've already cached this computation (indexed by a hash) + if self.cache_files: + if indices_cache_file_name is None: + # we create a unique hash from the function, current dataset file and the mapping args + indices_cache_file_name = self._get_cache_file_path(new_fingerprint) + if os.path.exists(indices_cache_file_name) and load_from_cache_file: + logger.info(f"Loading cached shuffled indices for dataset at {indices_cache_file_name}") + return self._new_dataset_with_indices( + fingerprint=new_fingerprint, indices_cache_file_name=indices_cache_file_name + ) + + permutation = generator.permutation(len(self)) + + return self.select( + indices=permutation, + keep_in_memory=keep_in_memory, + indices_cache_file_name=indices_cache_file_name if not keep_in_memory else None, + writer_batch_size=writer_batch_size, + new_fingerprint=new_fingerprint, + ) + + @transmit_format + @fingerprint_transform( + inplace=False, + randomized_function=True, + fingerprint_names=["train_new_fingerprint", "test_new_fingerprint"], + ignore_kwargs=["load_from_cache_file", "train_indices_cache_file_name", "test_indices_cache_file_name"], + ) + def train_test_split( + self, + test_size: Union[float, int, None] = None, + train_size: Union[float, int, None] = None, + shuffle: bool = True, + stratify_by_column: Optional[str] = None, + seed: Optional[int] = None, + generator: Optional[np.random.Generator] = None, + keep_in_memory: bool = False, + 
load_from_cache_file: Optional[bool] = None, + train_indices_cache_file_name: Optional[str] = None, + test_indices_cache_file_name: Optional[str] = None, + writer_batch_size: Optional[int] = 1000, + train_new_fingerprint: Optional[str] = None, + test_new_fingerprint: Optional[str] = None, + ) -> "DatasetDict": + """Return a dictionary ([`datasets.DatasetDict`]) with two random train and test subsets (`train` and `test` `Dataset` splits). + Splits are created from the dataset according to `test_size`, `train_size` and `shuffle`. + + This method is similar to scikit-learn `train_test_split`. + + Args: + test_size (`Union[float, int, None]`, *optional*): + Size of the test split + If `float`, should be between `0.0` and `1.0` and represent the proportion of the dataset to include in the test split. + If `int`, represents the absolute number of test samples. + If `None`, the value is set to the complement of the train size. + If `train_size` is also `None`, it will be set to `0.25`. + train_size (`Union[float, int, None]`, *optional*): + Size of the train split + If `float`, should be between `0.0` and `1.0` and represent the proportion of the dataset to include in the train split. + If `int`, represents the absolute number of train samples. + If `None`, the value is automatically set to the complement of the test size. + shuffle (`bool`, *optional*, defaults to `True`): + Whether or not to shuffle the data before splitting. + stratify_by_column (`str`, *optional*, defaults to `None`): + The column name of labels to be used to perform stratified split of data. + seed (`int`, *optional*): + A seed to initialize the default BitGenerator if `generator=None`. + If `None`, then fresh, unpredictable entropy will be pulled from the OS. + If an `int` or `array_like[ints]` is passed, then it will be passed to SeedSequence to derive the initial BitGenerator state. 
+ generator (`numpy.random.Generator`, *optional*): + Numpy random Generator to use to compute the permutation of the dataset rows. + If `generator=None` (default), uses `np.random.default_rng` (the default BitGenerator (PCG64) of NumPy). + keep_in_memory (`bool`, defaults to `False`): + Keep the split indices in memory instead of writing them to a cache file. + load_from_cache_file (`Optional[bool]`, defaults to `True` if caching is enabled): + If a cache file storing the split indices + can be identified, use it instead of recomputing. + train_indices_cache_file_name (`str`, *optional*): + Provide the name of a path for the cache file. It is used to store the + train split indices instead of the automatically generated cache file name. + test_indices_cache_file_name (`str`, *optional*): + Provide the name of a path for the cache file. It is used to store the + test split indices instead of the automatically generated cache file name. + writer_batch_size (`int`, defaults to `1000`): + Number of rows per write operation for the cache file writer. + This value is a good trade-off between memory usage during the processing, and processing speed. + Higher value makes the processing do fewer lookups, lower value consumes less temporary memory while running `map`. + train_new_fingerprint (`str`, *optional*, defaults to `None`): + The new fingerprint of the train set after transform. + If `None`, the new fingerprint is computed using a hash of the previous fingerprint, and the transform arguments + test_new_fingerprint (`str`, *optional*, defaults to `None`): + The new fingerprint of the test set after transform.
+ If `None`, the new fingerprint is computed using a hash of the previous fingerprint, and the transform arguments + + Example: + + ```py + >>> from datasets import load_dataset + >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", split="validation") + >>> ds = ds.train_test_split(test_size=0.2, shuffle=True) + DatasetDict({ + train: Dataset({ + features: ['text', 'label'], + num_rows: 852 + }) + test: Dataset({ + features: ['text', 'label'], + num_rows: 214 + }) + }) + + # set a seed + >>> ds = ds.train_test_split(test_size=0.2, seed=42) + + # stratified split + >>> ds = load_dataset("imdb",split="train") + Dataset({ + features: ['text', 'label'], + num_rows: 25000 + }) + >>> ds = ds.train_test_split(test_size=0.2, stratify_by_column="label") + DatasetDict({ + train: Dataset({ + features: ['text', 'label'], + num_rows: 20000 + }) + test: Dataset({ + features: ['text', 'label'], + num_rows: 5000 + }) + }) + ``` + """ + from .dataset_dict import DatasetDict # import here because of circular dependency + + if len(self.list_indexes()) > 0: + raise DatasetTransformationNotAllowedError( + "Using `.train_test_split` on a dataset with attached indexes is not allowed. You can first run `.drop_index() to remove your index and then re-add it." + ) + # If the array is empty we do nothing + if len(self) == 0: + return DatasetDict({"train": self, "test": self}) + + if test_size is None and train_size is None: + test_size = 0.25 + + # Safety checks similar to scikit-learn's ones. 
+ # (adapted from https://github.com/scikit-learn/scikit-learn/blob/fd237278e895b42abe8d8d09105cbb82dc2cbba7/sklearn/model_selection/_split.py#L1750) + n_samples = len(self) + if ( + isinstance(test_size, int) + and (test_size >= n_samples or test_size <= 0) + or isinstance(test_size, float) + and (test_size <= 0 or test_size >= 1) + ): + raise ValueError( + f"test_size={test_size} should be either positive and smaller " + f"than the number of samples {n_samples} or a float in the (0, 1) range" + ) + + if ( + isinstance(train_size, int) + and (train_size >= n_samples or train_size <= 0) + or isinstance(train_size, float) + and (train_size <= 0 or train_size >= 1) + ): + raise ValueError( + f"train_size={train_size} should be either positive and smaller " + f"than the number of samples {n_samples} or a float in the (0, 1) range" + ) + + if train_size is not None and not isinstance(train_size, (int, float)): + raise ValueError(f"Invalid value for train_size: {train_size} of type {type(train_size)}") + if test_size is not None and not isinstance(test_size, (int, float)): + raise ValueError(f"Invalid value for test_size: {test_size} of type {type(test_size)}") + + if isinstance(train_size, float) and isinstance(test_size, float) and train_size + test_size > 1: + raise ValueError( + f"The sum of test_size and train_size = {train_size + test_size}, should be in the (0, 1)" + " range. Reduce test_size and/or train_size." 
+ ) + + if isinstance(test_size, float): + n_test = ceil(test_size * n_samples) + elif isinstance(test_size, int): + n_test = float(test_size) + + if isinstance(train_size, float): + n_train = floor(train_size * n_samples) + elif isinstance(train_size, int): + n_train = float(train_size) + + if train_size is None: + n_train = n_samples - n_test + elif test_size is None: + n_test = n_samples - n_train + + if n_train + n_test > n_samples: + raise ValueError( + f"The sum of train_size and test_size = {n_train + n_test}, " + "should be smaller than the number of " + f"samples {n_samples}. Reduce test_size and/or " + "train_size." + ) + + n_train, n_test = int(n_train), int(n_test) + + if n_train == 0: + raise ValueError( + f"With n_samples={n_samples}, test_size={test_size} and train_size={train_size}, the " + "resulting train set will be empty. Adjust any of the " + "aforementioned parameters." + ) + + load_from_cache_file = load_from_cache_file if load_from_cache_file is not None else is_caching_enabled() + + if generator is None and shuffle is True: + if seed is None: + _, seed, pos, *_ = np.random.get_state() + seed = seed[pos] if pos < 624 else seed[0] + _ = np.random.random() # do 1 step of rng + generator = np.random.default_rng(seed) + + # Check if we've already cached this computation (indexed by a hash) + if self.cache_files: + if train_indices_cache_file_name is None or test_indices_cache_file_name is None: + # we create a unique hash from the function, current dataset file and the mapping args + + if train_indices_cache_file_name is None: + train_indices_cache_file_name = self._get_cache_file_path(train_new_fingerprint) + if test_indices_cache_file_name is None: + test_indices_cache_file_name = self._get_cache_file_path(test_new_fingerprint) + if ( + os.path.exists(train_indices_cache_file_name) + and os.path.exists(test_indices_cache_file_name) + and load_from_cache_file + ): + logger.info( + f"Loading cached split indices for dataset at 
{train_indices_cache_file_name} and {test_indices_cache_file_name}" + ) + return DatasetDict( + { + "train": self._new_dataset_with_indices( + fingerprint=train_new_fingerprint, indices_cache_file_name=train_indices_cache_file_name + ), + "test": self._new_dataset_with_indices( + fingerprint=test_new_fingerprint, indices_cache_file_name=test_indices_cache_file_name + ), + } + ) + if not shuffle: + if stratify_by_column is not None: + raise ValueError("Stratified train/test split is not implemented for `shuffle=False`") + train_indices = np.arange(n_train) + test_indices = np.arange(n_train, n_train + n_test) + else: + # stratified partition + if stratify_by_column is not None: + if stratify_by_column not in self._info.features.keys(): + raise ValueError(f"Key {stratify_by_column} not found in {self._info.features.keys()}") + if not isinstance(self._info.features[stratify_by_column], ClassLabel): + raise ValueError( + f"Stratifying by column is only supported for {ClassLabel.__name__} column, and column {stratify_by_column} is {type(self._info.features[stratify_by_column]).__name__}." + ) + try: + train_indices, test_indices = next( + stratified_shuffle_split_generate_indices( + self.with_format("numpy")[stratify_by_column], n_train, n_test, rng=generator + ) + ) + except Exception as error: + if str(error) == "Minimum class count error": + raise ValueError( + f"The least populated class in {stratify_by_column} column has only 1" + " member, which is too few. The minimum" + " number of groups for any class cannot" + " be less than 2." 
def shard(
    self,
    num_shards: int,
    index: int,
    contiguous: bool = True,
    keep_in_memory: bool = False,
    indices_cache_file_name: Optional[str] = None,
    writer_batch_size: Optional[int] = 1000,
) -> "Dataset":
    """Select shard number `index` out of `num_shards` deterministic pieces of the dataset.

    With `contiguous=True` (the default) the dataset is cut into consecutive chunks. If
    `len(dataset) % num_shards == r`, the first `r` shards hold `len(dataset) // num_shards + 1` rows
    and the remaining shards hold `len(dataset) // num_shards` rows, so
    `datasets.concatenate_datasets([dset.shard(n, i) for i in range(n)])` yields the rows in their
    original order.

    With `contiguous=False` the shard holds every row whose position modulo `num_shards` equals `index`.

    Note: `num_shards` should be less than or equal to `len(dataset)`.

    Be sure to shard before using any randomizing operator (such as `shuffle`).
    It is best if the shard operator is used early in the dataset pipeline.

    Args:
        num_shards (`int`):
            How many shards to split the dataset into.
        index (`int`):
            Which shard to select and return.
        contiguous: (`bool`, defaults to `True`):
            Whether to select contiguous blocks of indices for shards.
        keep_in_memory (`bool`, defaults to `False`):
            Keep the dataset in memory instead of writing it to a cache file.
        indices_cache_file_name (`str`, *optional*):
            Provide the name of a path for the cache file. It is used to store the
            indices of each shard instead of the automatically generated cache file name.
        writer_batch_size (`int`, defaults to `1000`):
            This only concerns the indices mapping.
            Number of indices per write operation for the cache file writer.

    Returns:
        [`Dataset`]: the result of `self.select` applied to the indices of the requested shard.

    Raises:
        ValueError: if `index` is not in `[0, num_shards - 1]`.

    Example:

    ```py
    >>> from datasets import load_dataset
    >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", split="validation")
    >>> ds
    Dataset({
        features: ['text', 'label'],
        num_rows: 1066
    })
    >>> ds = ds.shard(num_shards=2, index=0)
    >>> ds
    Dataset({
        features: ['text', 'label'],
        num_rows: 533
    })
    ```
    """
    if index < 0 or index >= num_shards:
        raise ValueError("index should be in [0, num_shards-1]")

    num_rows = len(self)
    if contiguous:
        # The first `remainder` shards each take one extra row.
        base, remainder = divmod(num_rows, num_shards)
        first = base * index + min(index, remainder)
        last = first + base + int(index < remainder)
        indices = range(first, last)
    else:
        # Strided selection: index, index + num_shards, index + 2 * num_shards, ...
        indices = np.arange(index, num_rows, num_shards)

    return self.select(
        indices=indices,
        keep_in_memory=keep_in_memory,
        indices_cache_file_name=indices_cache_file_name,
        writer_batch_size=writer_batch_size,
    )
def to_dict(self, batch_size: Optional[int] = None, batched: bool = False) -> Union[dict, Iterator[dict]]:
    """Returns the dataset as a Python dict. Can also return a generator for large datasets.

    Args:
        batch_size (`int`, *optional*): The size (number of rows) of the batches if `batched` is `True`.
            Defaults to `datasets.config.DEFAULT_MAX_BATCH_SIZE`.
        batched (`bool`):
            Set to `True` to return a generator that yields the dataset as batches
            of `batch_size` rows. Defaults to `False` (returns the whole datasets once).

    Returns:
        `dict` or `Iterator[dict]`

    Example:

    ```py
    >>> ds.to_dict()
    ```
    """
    if not batched:
        return query_table(
            table=self._data,
            key=slice(0, len(self)),
            indices=self._indices,
        ).to_pydict()

    # `batched` and `batch_size` used to be accepted but ignored; honor them with the same
    # slicing scheme `to_pandas` uses so both exporters batch identically.
    batch_size = batch_size if batch_size else config.DEFAULT_MAX_BATCH_SIZE
    return (
        query_table(
            table=self._data,
            key=slice(offset, offset + batch_size),
            indices=self._indices,
        ).to_pydict()
        for offset in range(0, len(self), batch_size)
    )
def to_pandas(
    self, batch_size: Optional[int] = None, batched: bool = False
) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]:
    """Convert the dataset to a `pandas.DataFrame`, or to a generator of DataFrames for large datasets.

    Args:
        batch_size (`int`, *optional*):
            The size (number of rows) of the batches if `batched` is `True`.
            Defaults to `datasets.config.DEFAULT_MAX_BATCH_SIZE`.
        batched (`bool`):
            Set to `True` to return a generator that yields the dataset as batches
            of `batch_size` rows. Defaults to `False` (returns the whole datasets once).

    Returns:
        `pandas.DataFrame` or `Iterator[pandas.DataFrame]`

    Example:

    ```py
    >>> ds.to_pandas()
    ```
    """
    if batched:
        # A missing (or zero) batch size falls back to the library-wide default.
        rows_per_batch = batch_size or config.DEFAULT_MAX_BATCH_SIZE
        return (
            query_table(
                table=self._data,
                key=slice(start, start + rows_per_batch),
                indices=self._indices,
            ).to_pandas(types_mapper=pandas_types_mapper)
            for start in range(0, len(self), rows_per_batch)
        )

    whole_table = query_table(
        table=self._data,
        key=slice(0, len(self)),
        indices=self._indices,
    )
    return whole_table.to_pandas(types_mapper=pandas_types_mapper)
def to_parquet(
    self,
    path_or_buf: Union[PathLike, BinaryIO],
    batch_size: Optional[int] = None,
    storage_options: Optional[dict] = None,
    **parquet_writer_kwargs,
) -> int:
    """Write the dataset out in Parquet format.

    Args:
        path_or_buf (`PathLike` or `FileOrBuffer`):
            Either a path to a file (e.g. `file.parquet`), a remote URI (e.g. `hf://datasets/username/my_dataset_name/data.parquet`),
            or a BinaryIO, where the dataset will be saved to in the specified format.
        batch_size (`int`, *optional*):
            Size of the batch to load in memory and write at once.
            By default it aims for row groups with maximum uncompressed byte size of "100MB",
            defined by `datasets.config.MAX_ROW_GROUP_SIZE`.
        storage_options (`dict`, *optional*):
            Key/value pairs to be passed on to the file-system backend, if any.
        **parquet_writer_kwargs (additional keyword arguments):
            Parameters to pass to PyArrow's `pyarrow.parquet.ParquetWriter`.

    Returns:
        `int`: The number of characters or bytes written.

    Example:

    ```py
    >>> ds.to_parquet("path/to/dataset/directory")
    ```
    """
    # Imported inside the method to avoid a circular dependency between modules.
    from .io.parquet import ParquetDatasetWriter

    writer = ParquetDatasetWriter(
        self,
        path_or_buf,
        batch_size=batch_size,
        storage_options=storage_options,
        **parquet_writer_kwargs,
    )
    return writer.write()
def _estimate_nbytes(self) -> int:
    """Estimate the dataset size in bytes, accounting for media files referenced by path.

    The estimate starts from `self.data.nbytes`. If any feature requires decoding, the first
    1000 rows are inspected: for `Audio`, `Image` and `Video` columns, the size of every file that
    is referenced by `path` without embedded `bytes` is added and the size of the `path` column is
    subtracted. That extra size is then extrapolated to the whole underlying table. When an indices
    mapping exists, the total is scaled by the fraction of rows the mapping keeps.

    Returns:
        `int`: the approximate size in bytes. Empty datasets yield an estimate without raising
        `ZeroDivisionError`.
    """
    dataset_nbytes = self.data.nbytes

    # Find decodable columns, because if there are any, we need to
    # adjust the dataset size computation (needed for sharding) to account for possible external files
    decodable_columns = [
        k for k, v in self._info.features.items() if require_decoding(v, ignore_decode_attribute=True)
    ]

    if decodable_columns:
        # Approximate the space needed to store the bytes from the external files by analyzing the first 1000 examples
        extra_nbytes = 0

        def extra_nbytes_visitor(array, feature):
            nonlocal extra_nbytes
            if isinstance(feature, (Audio, Image, Video)):
                for x in array.to_pylist():
                    if x is not None and x["bytes"] is None and x["path"] is not None:
                        extra_nbytes += xgetsize(x["path"])
                extra_nbytes -= array.field("path").nbytes

        table = self.with_format("arrow")[:1000]
        num_sampled = len(table)
        # An empty sample (empty dataset or empty indices mapping) gives nothing to extrapolate
        # from, and dividing by its length would raise ZeroDivisionError.
        if num_sampled:
            table_visitor(table, extra_nbytes_visitor)
            dataset_nbytes += extra_nbytes * len(self.data) // num_sampled

    if self._indices is not None:
        num_rows = len(self.data)
        # A zero-row table has no rows to scale; report zero instead of dividing by zero.
        dataset_nbytes = dataset_nbytes * len(self._indices) // num_rows if num_rows else 0
    return dataset_nbytes
[`datasets.Dataset`]. + This is equivalent to loading a dataset in streaming mode with [`datasets.load_dataset`], but much faster since the data is streamed from local files. + + Contrary to map-style datasets, iterable datasets are lazy and can only be iterated over (e.g. using a for loop). + Since they are read sequentially in training loops, iterable datasets are much faster than map-style datasets. + All the transformations applied to iterable datasets like filtering or processing are done on-the-fly when you start iterating over the dataset. + + Still, it is possible to shuffle an iterable dataset using [`datasets.IterableDataset.shuffle`]. + This is a fast approximate shuffling that works best if you have multiple shards and if you specify a buffer size that is big enough. + + To get the best speed performance, make sure your dataset doesn't have an indices mapping. + If this is the case, the data are not read contiguously, which can be slow sometimes. + You can use `ds = ds.flatten_indices()` to write your dataset in contiguous chunks of data and have optimal speed before switching to an iterable dataset. + + Args: + num_shards (`int`, default to `1`): + Number of shards to define when instantiating the iterable dataset. This is especially useful for big datasets to be able to shuffle properly, + and also to enable fast parallel loading using a PyTorch DataLoader or in distributed setups for example. + Shards are defined using [`datasets.Dataset.shard`]: it simply slices the data without writing anything on disk. + + Returns: + [`datasets.IterableDataset`] + + Example: + + Basic usage: + ```python + >>> ids = ds.to_iterable_dataset() + >>> for example in ids: + ... pass + ``` + + With lazy filtering and processing: + ```python + >>> ids = ds.to_iterable_dataset() + >>> ids = ids.filter(filter_fn).map(process_fn) # will filter and process on-the-fly when you start iterating over the iterable dataset + >>> for example in ids: + ... 
pass + ``` + + With sharding to enable efficient shuffling: + ```python + >>> ids = ds.to_iterable_dataset(num_shards=64) # the dataset is split into 64 shards to be iterated over + >>> ids = ids.shuffle(buffer_size=10_000) # will shuffle the shards order and use a shuffle buffer for fast approximate shuffling when you start iterating + >>> for example in ids: + ... pass + ``` + + With a PyTorch DataLoader: + ```python + >>> import torch + >>> ids = ds.to_iterable_dataset(num_shards=64) + >>> ids = ids.filter(filter_fn).map(process_fn) + >>> dataloader = torch.utils.data.DataLoader(ids, num_workers=4) # will assign 64 / 4 = 16 shards to each worker to load, filter and process when you start iterating + >>> for example in dataloader: + ... pass + ``` + + With a PyTorch DataLoader and shuffling: + ```python + >>> import torch + >>> ids = ds.to_iterable_dataset(num_shards=64) + >>> ids = ids.shuffle(buffer_size=10_000) # will shuffle the shards order and use a shuffle buffer when you start iterating + >>> dataloader = torch.utils.data.DataLoader(ids, num_workers=4) # will assign 64 / 4 = 16 shards from the shuffled list of shards to each worker when you start iterating + >>> for example in dataloader: + ... pass + ``` + + In a distributed setup like PyTorch DDP with a PyTorch DataLoader and shuffling + ```python + >>> from datasets.distributed import split_dataset_by_node + >>> ids = ds.to_iterable_dataset(num_shards=512) + >>> ids = ids.shuffle(buffer_size=10_000, seed=42) # will shuffle the shards order and use a shuffle buffer when you start iterating + >>> ids = split_dataset_by_node(ids, world_size=8, rank=0) # will keep only 512 / 8 = 64 shards from the shuffled lists of shards when you start iterating + >>> dataloader = torch.utils.data.DataLoader(ids, num_workers=4) # will assign 64 / 4 = 16 shards from this node's list of shards to each worker when you start iterating + >>> for example in dataloader: + ... 
pass + ``` + + With shuffling and multiple epochs: + ```python + >>> ids = ds.to_iterable_dataset(num_shards=64) + >>> ids = ids.shuffle(buffer_size=10_000, seed=42) # will shuffle the shards order and use a shuffle buffer when you start iterating + >>> for epoch in range(n_epochs): + ... ids.set_epoch(epoch) # will use effective_seed = seed + epoch to shuffle the shards and for the shuffle buffer when you start iterating + ... for example in ids: + ... pass + ``` + Feel free to also use [`IterableDataset.set_epoch`] when using a PyTorch DataLoader or in distributed setups. + """ + from .iterable_dataset import ArrowExamplesIterable, IterableDataset + + if self._format_type is not None: + if self._format_kwargs or ( + self._format_columns is not None and set(self._format_columns) != set(self.column_names) + ): + raise NotImplementedError( + "Converting a formatted dataset with kwargs or selected columns to a formatted iterable dataset is not implemented yet. Please run `my_dataset = my_dataset.with_format(None)` before calling to_iterable_dataset" + ) + if num_shards > len(self): + raise ValueError( + f"Unable to shard a dataset of size {len(self)} into {num_shards} shards (the number of shards exceeds the number of samples)." + ) + if self._indices is not None: + logger.info( + "Converting an Arrow dataset to iterable but it has an indices mapping that can make it slower. " + "You can use `ds = ds.flatten_indices()` to write your dataset in contiguous chunks of data and have optimal speed." 
+ ) + shards = ( + [copy.deepcopy(self)] + if num_shards == 1 + else [ + self.shard(num_shards=num_shards, index=shard_idx, contiguous=True) for shard_idx in range(num_shards) + ] + ) + ex_iterable = ArrowExamplesIterable( + Dataset._generate_tables_from_shards, + kwargs={"shards": shards, "batch_size": config.DEFAULT_MAX_BATCH_SIZE}, + ) + ds = IterableDataset(ex_iterable, info=DatasetInfo(features=self.features)) + if self._format_type: + ds = ds.with_format(self._format_type) + return ds + + def _push_parquet_shards_to_hub_single( + self, + job_id: int, + num_jobs: int, + repo_id: str, + data_dir: str, + split: str, + token: Optional[str], + revision: Optional[str], + create_pr: Optional[bool], + num_shards: int, + embed_external_files: bool, + writer_batch_size: int, + ): + div = num_shards // num_jobs + mod = num_shards % num_jobs + start = div * job_id + min(job_id, mod) + end = start + div + (1 if job_id < mod else 0) + + index_shards = ( + (start + i, self.shard(num_shards=end - start, index=i, contiguous=True)) for i in range(end - start) + ) + + api = HfApi(endpoint=config.HF_ENDPOINT, token=token) + + uploaded_size = 0 + additions: list[CommitOperationAdd] = [] + for index, shard in index_shards: + if embed_external_files: + format = shard.format + shard = shard.with_format("arrow") + shard = shard.map( + embed_table_storage, + batched=True, + batch_size=writer_batch_size, + keep_in_memory=True, + ) + shard = shard.with_format(**format) + shard_path_in_repo = f"{data_dir}/{split}-{index:05d}-of-{num_shards:05d}.parquet" + buffer = BytesIO() + shard.to_parquet(buffer, batch_size=writer_batch_size) + parquet_content = buffer.getvalue() + uploaded_size += len(parquet_content) + del buffer + shard_addition = CommitOperationAdd(path_in_repo=shard_path_in_repo, path_or_fileobj=parquet_content) + api.preupload_lfs_files( + repo_id=repo_id, + additions=[shard_addition], + repo_type="dataset", + revision=revision, + create_pr=create_pr, + ) + 
def _push_parquet_shards_to_hub(
    self,
    repo_id: str,
    data_dir: str,
    split: str,
    token: Optional[str],
    revision: Optional[str],
    create_pr: Optional[bool],
    max_shard_size: Optional[Union[int, str]],
    num_shards: Optional[int],
    embed_external_files: bool,
    num_proc: Optional[int],
) -> tuple[list[CommitOperationAdd], int, int]:
    """Pushes the dataset shards as Parquet files to the hub.

    The dataset is split into one contiguous slice per job, and each job uploads its share of the
    `num_shards` Parquet files through `_push_parquet_shards_to_hub_single`. When `num_shards` is
    `None` it is derived from the estimated dataset size and `max_shard_size` (falling back to
    `config.MAX_SHARD_SIZE`), and is at least `num_proc`.

    Returns:
        additions (`List[CommitOperation]`): list of the `CommitOperationAdd` of the uploaded shards
        uploaded_size (`int`): number of uploaded bytes to the repository
        dataset_nbytes (`int`): approximate size in bytes of the uploaded dataset after uncompression
    """
    from .arrow_writer import get_writer_batch_size_from_data_size, get_writer_batch_size_from_features

    dataset_nbytes = self._estimate_nbytes()
    writer_batch_size = get_writer_batch_size_from_features(self.features) or get_writer_batch_size_from_data_size(
        len(self), dataset_nbytes
    )

    # Find decodable columns, because if there are any, we need to:
    # embed the bytes from the files in the shards
    decodable_columns = (
        [k for k, v in self._info.features.items() if require_decoding(v, ignore_decode_attribute=True)]
        if embed_external_files
        else []
    )
    embed_external_files = embed_external_files and bool(decodable_columns)

    if num_shards is None:
        max_shard_size = convert_file_size_to_int(max_shard_size or config.MAX_SHARD_SIZE)
        num_shards = int(dataset_nbytes / max_shard_size) + 1
        num_shards = max(num_shards, num_proc or 1)

    additions: list[CommitOperationAdd] = []

    # Never create more jobs than shards: a job that is assigned zero shards uploads nothing, so
    # the slice of rows handed to it would be silently dropped when the caller passes
    # `num_shards < num_proc`. Clamping to at least 1 also keeps a non-positive `num_proc`
    # on the single-job path instead of producing an empty job list.
    num_jobs = max(1, min(num_proc or 1, num_shards))
    kwargs_iterable = [
        {
            "self": self.shard(num_shards=num_jobs, index=job_id, contiguous=True),
            "job_id": job_id,
            "num_jobs": num_jobs,
            "repo_id": repo_id,
            "data_dir": data_dir,
            "split": split,
            "token": token,
            "revision": revision,
            "create_pr": create_pr,
            "num_shards": num_shards,
            "embed_external_files": embed_external_files,
            "writer_batch_size": writer_batch_size,
        }
        for job_id in range(num_jobs)
    ]
    desc = "Uploading the dataset shards"
    desc += f" (num_proc={num_proc})" if num_proc is not None and num_proc >= 1 else ""
    pbar = hf_tqdm(
        unit=" shards",
        total=num_shards,
        desc=desc,
    )
    try:
        with contextlib.nullcontext() if num_proc is None or num_proc < 1 else Pool(num_proc) as pool:
            update_stream = (
                Dataset._push_parquet_shards_to_hub_single(**kwargs_iterable[0])
                if pool is None
                else iflatmap_unordered(
                    pool,
                    Dataset._push_parquet_shards_to_hub_single,
                    kwargs_iterable=kwargs_iterable,
                )
            )
            # Each job yields (job_id, False, n_done) progress ticks and one final
            # (job_id, True, additions) message carrying its commit operations.
            for job_id, done, content in update_stream:
                if not done:
                    pbar.update(content)
                else:
                    additions += content
    finally:
        # Release the progress bar even when an upload fails part-way through.
        pbar.close()

    uploaded_size = sum(addition.upload_info.size for addition in additions)
    return additions, uploaded_size, dataset_nbytes
+ + Args: + repo_id (`str`): + The ID of the repository to push to in the following format: `/` or + `/`. Also accepts ``, which will default to the namespace + of the logged-in user. + config_name (`str`, defaults to "default"): + The configuration name (or subset) of a dataset. Defaults to "default". + set_default (`bool`, *optional*): + Whether to set this configuration as the default one. Otherwise, the default configuration is the one + named "default". + split (`str`, *optional*): + The name of the split that will be given to that dataset. Defaults to `self.split`. + data_dir (`str`, *optional*): + Directory name that will contain the uploaded data files. Defaults to the `config_name` if different + from "default", else "data". + + + commit_message (`str`, *optional*): + Message to commit while pushing. Will default to `"Upload dataset"`. + commit_description (`str`, *optional*): + Description of the commit that will be created. + Additionally, description of the PR if a PR is created (`create_pr` is True). + + + private (`bool`, *optional*): + Whether to make the repo private. If `None` (default), the repo will be public unless the + organization's default is private. This value is ignored if the repo already exists. + token (`str`, *optional*): + An optional authentication token for the Hugging Face Hub. If no token is passed, will default + to the token saved locally when logging in with `huggingface-cli login`. Will raise an error + if no token is passed and the user is not logged-in. + revision (`str`, *optional*): + Branch to push the uploaded files to. Defaults to the `"main"` branch. + + + create_pr (`bool`, *optional*, defaults to `False`): + Whether to create a PR with the uploaded files or directly commit. + + + max_shard_size (`int` or `str`, *optional*, defaults to `"500MB"`): + The maximum size of the dataset shards to be uploaded to the hub. If expressed as a string, needs to be digits followed by + a unit (like `"5MB"`). 
+ num_shards (`int`, *optional*): + Number of shards to write. By default, the number of shards depends on `max_shard_size`. + + + embed_external_files (`bool`, defaults to `True`): + Whether to embed file bytes in the shards. + In particular, this will do the following before the push for the fields of type: + + - [`Audio`] and [`Image`]: remove local path information and embed file content in the Parquet files. + num_proc (`int`, *optional*, defaults to `None`): + Number of processes when preparing and uploading the dataset. + This is helpful if the dataset is made of many samples or media files to embed. + Multiprocessing is disabled by default. + + + + Return: + huggingface_hub.CommitInfo + + Example: + + ```python + >>> dataset.push_to_hub("/") + >>> dataset_dict.push_to_hub("/", private=True) + >>> dataset.push_to_hub("/", max_shard_size="1GB") + >>> dataset.push_to_hub("/", num_shards=1024) + ``` + + If your dataset has multiple splits (e.g. train/validation/test): + + ```python + >>> train_dataset.push_to_hub("/", split="train") + >>> val_dataset.push_to_hub("/", split="validation") + >>> # later + >>> dataset = load_dataset("/") + >>> train_dataset = dataset["train"] + >>> val_dataset = dataset["validation"] + ``` + + If you want to add a new configuration (or subset) to a dataset (e.g. if the dataset has multiple tasks/versions/languages): + + ```python + >>> english_dataset.push_to_hub("/", "en") + >>> french_dataset.push_to_hub("/", "fr") + >>> # later + >>> english_dataset = load_dataset("/", "en") + >>> french_dataset = load_dataset("/", "fr") + ``` + """ + if "Video(" in str(self.features): + raise NotImplementedError( + "push_to_hub is not implemented for video datasets, instead you should upload the video files " + "using e.g. the huggingface_hub library and optionally upload a metadata.csv or metadata.jsonl " + "file containing other information like video captions, features or labels. 
More information " + "at https://huggingface.co/docs/datasets/main/en/video_load#videofolder" + ) + if config_name == "data": + raise ValueError("`config_name` cannot be 'data'. Please, choose another name for configuration.") + + if max_shard_size is not None and num_shards is not None: + raise ValueError( + "Failed to push_to_hub: please specify either max_shard_size or num_shards, but not both." + ) + + if split is None: + split = str(self.split) if self.split is not None else "train" + + if not re.match(_split_re, split): + raise ValueError(f"Split name should match '{_split_re}' but got '{split}'.") + + api = HfApi(endpoint=config.HF_ENDPOINT, token=token) + + try: + repo_id = api.repo_info(repo_id, repo_type="dataset").id + except RepositoryNotFoundError: + repo_url = api.create_repo( + repo_id, + repo_type="dataset", + private=private, + exist_ok=True, + ) + repo_id = repo_url.repo_id + + if revision is not None and not revision.startswith("refs/pr/"): + # We do not call create_branch for a PR reference: 400 Bad Request + api.create_branch(repo_id, branch=revision, repo_type="dataset", exist_ok=True) + + if not data_dir: + data_dir = config_name if config_name != "default" else "data" # for backward compatibility + + additions, uploaded_size, dataset_nbytes = self._push_parquet_shards_to_hub( + repo_id=repo_id, + data_dir=data_dir, + split=split, + token=token, + revision=revision, + max_shard_size=max_shard_size, + num_shards=num_shards, + create_pr=create_pr, + embed_external_files=embed_external_files, + num_proc=num_proc, + ) + + def get_deletions_and_dataset_card() -> tuple[str, list[CommitOperationDelete], str, Optional[str]]: + parent_commit = api.repo_info(repo_id, repo_type="dataset", revision=revision).sha + + # Check if the repo already has a README.md and/or a dataset_infos.json to update them with the new split info (size and pattern) + # and delete old split shards (if they exist) + repo_with_dataset_card, repo_with_dataset_infos = False, False 
+ deletions: list[CommitOperationDelete] = [] + deleted_size = 0 + repo_splits: list[str] = [] # use a list to keep the order of the splits + repo_files_to_add = [addition.path_in_repo for addition in additions] + for repo_file in api.list_repo_tree( + repo_id=repo_id, revision=parent_commit, repo_type="dataset", token=token, recursive=True + ): + if not isinstance(repo_file, RepoFile): + continue + if repo_file.rfilename == config.REPOCARD_FILENAME: + repo_with_dataset_card = True + elif repo_file.rfilename == config.DATASETDICT_INFOS_FILENAME: + repo_with_dataset_infos = True + elif ( + repo_file.rfilename.startswith(f"{data_dir}/{split}-") + and repo_file.rfilename not in repo_files_to_add + ): + deletions.append(CommitOperationDelete(path_in_repo=repo_file.rfilename)) + deleted_size += repo_file.size + elif fnmatch.fnmatch( + repo_file.rfilename, + PUSH_TO_HUB_WITHOUT_METADATA_CONFIGS_SPLIT_PATTERN_SHARDED.replace("{split}", "*"), + ): + pattern = glob_pattern_to_regex(PUSH_TO_HUB_WITHOUT_METADATA_CONFIGS_SPLIT_PATTERN_SHARDED) + split_pattern_fields = string_to_dict(repo_file.rfilename, pattern) + assert split_pattern_fields is not None + repo_split = split_pattern_fields["split"] + if repo_split not in repo_splits: + repo_splits.append(repo_split) + + organization, dataset_name = repo_id.split("/") if "/" in repo_id else (None, repo_id) + info_to_dump = self.info.copy() + info_to_dump.download_checksums = None + info_to_dump.download_size = uploaded_size + info_to_dump.dataset_size = dataset_nbytes + info_to_dump.size_in_bytes = uploaded_size + dataset_nbytes + info_to_dump.config_name = config_name + info_to_dump.splits = SplitDict( + {split: SplitInfo(split, num_bytes=dataset_nbytes, num_examples=len(self), dataset_name=dataset_name)} + ) + # get the info from the README to update them + if repo_with_dataset_card: + dataset_card_path = api.hf_hub_download( + repo_id, config.REPOCARD_FILENAME, repo_type="dataset", revision=parent_commit + ) + dataset_card = 
DatasetCard.load(Path(dataset_card_path)) + dataset_card_data = dataset_card.data + metadata_configs = MetadataConfigs.from_dataset_card_data(dataset_card_data) + dataset_infos: DatasetInfosDict = DatasetInfosDict.from_dataset_card_data(dataset_card_data) + if dataset_infos and config_name in dataset_infos: + repo_info = dataset_infos[config_name] + else: + repo_info = None + # get the deprecated dataset_infos.json to update them + elif repo_with_dataset_infos: + dataset_card = None + dataset_card_data = DatasetCardData() + metadata_configs = MetadataConfigs() + dataset_infos_path = api.hf_hub_download( + repo_id, config.DATASETDICT_INFOS_FILENAME, repo_type="dataset", revision=parent_commit + ) + with open(dataset_infos_path, encoding="utf-8") as f: + dataset_infos: dict = json.load(f) + dataset_info = dataset_infos.get(config_name, None) if dataset_infos else None + repo_info = DatasetInfo.from_dict(dataset_info) if dataset_info else None + else: + dataset_card = None + dataset_card_data = DatasetCardData() + metadata_configs = MetadataConfigs() + repo_info = None + # update the total info to dump from existing info + if repo_info is not None: + logger.info("Updating downloaded metadata with the new split.") + if repo_info.splits and list(repo_info.splits) != [split]: + if self._info.features != repo_info.features: + raise ValueError( + f"Features of the new split don't match the features of the existing splits on the hub: {self._info.features} != {repo_info.features}" + ) + + if split in repo_info.splits: + repo_info.download_size -= deleted_size + repo_info.dataset_size -= repo_info.splits.get(split, SplitInfo()).num_bytes or 0 + + repo_info.download_checksums = None + repo_info.download_size = (repo_info.download_size or 0) + uploaded_size + repo_info.dataset_size = (repo_info.dataset_size or 0) + dataset_nbytes + repo_info.size_in_bytes = repo_info.download_size + repo_info.dataset_size + repo_info.splits.pop(split, None) + repo_info.splits[split] = 
SplitInfo( + split, num_bytes=dataset_nbytes, num_examples=len(self), dataset_name=dataset_name + ) + info_to_dump = repo_info + # create the metadata configs if it was uploaded with push_to_hub before metadata configs existed + if not metadata_configs and repo_splits: + default_metadata_configs_to_dump = { + "data_files": [{"split": split, "path": f"data/{split}-*"} for split in repo_splits] + } + MetadataConfigs({"default": default_metadata_configs_to_dump}).to_dataset_card_data(dataset_card_data) + # update the metadata configs + if config_name in metadata_configs: + metadata_config = metadata_configs[config_name] + if "data_files" in metadata_config: + data_files_to_dump = sanitize_patterns(metadata_config["data_files"]) + else: + data_files_to_dump = {} + # add the new split + data_files_to_dump[split] = [f"{data_dir}/{split}-*"] + metadata_config_to_dump = { + "data_files": [ + { + "split": _split, + "path": _pattern[0] if len(_pattern) == 1 else _pattern, + } + for _split, _pattern in data_files_to_dump.items() + ] + } + else: + metadata_config_to_dump = {"data_files": [{"split": split, "path": f"{data_dir}/{split}-*"}]} + configs_to_dump = {config_name: metadata_config_to_dump} + if set_default and config_name != "default": + if metadata_configs: + current_default_config_name = metadata_configs.get_default_config_name() + if current_default_config_name == "default": + raise ValueError( + "There exists a configuration named 'default'. To set a different configuration as default, " + "rename the 'default' one first." 
+ ) + if current_default_config_name: + _ = metadata_configs[current_default_config_name].pop("default") + configs_to_dump[current_default_config_name] = metadata_configs[current_default_config_name] + metadata_config_to_dump["default"] = True + # push to the deprecated dataset_infos.json + if repo_with_dataset_infos: + dataset_infos_path = api.hf_hub_download( + repo_id, config.DATASETDICT_INFOS_FILENAME, repo_type="dataset", revision=parent_commit + ) + with open(dataset_infos_path, encoding="utf-8") as f: + dataset_infos: dict = json.load(f) + dataset_infos[config_name] = asdict(info_to_dump) + new_dataset_infos = json.dumps(dataset_infos, indent=4) + else: + new_dataset_infos = None + # push to README + DatasetInfosDict({config_name: info_to_dump}).to_dataset_card_data(dataset_card_data) + MetadataConfigs(configs_to_dump).to_dataset_card_data(dataset_card_data) + new_dataset_card = ( + DatasetCard(f"---\n{dataset_card_data}\n---\n") if dataset_card is None else dataset_card + ) + return parent_commit, deletions, new_dataset_card, new_dataset_infos + + commit_message = commit_message if commit_message is not None else "Upload dataset" + if len(additions) > config.UPLOADS_MAX_NUMBER_PER_COMMIT: + logger.info( + f"Number of files to upload is larger than {config.UPLOADS_MAX_NUMBER_PER_COMMIT}. Splitting the push into multiple commits." 
+ ) + num_commits = math.ceil(len(additions) / config.UPLOADS_MAX_NUMBER_PER_COMMIT) + for i in range(0, num_commits): + operations = additions[ + i * config.UPLOADS_MAX_NUMBER_PER_COMMIT : (i + 1) * config.UPLOADS_MAX_NUMBER_PER_COMMIT + ] + for retry, sleep_time in enumerate(itertools.chain(range(10), itertools.repeat(30)), start=1): + # We need to retry if another commit happens at the same time + sleep_time *= 1 + random.random() + try: + commit_info = api.create_commit( + repo_id, + operations=operations, + commit_message=commit_message + f" (part {i:05d}-of-{num_commits:05d})", + commit_description=commit_description, + repo_type="dataset", + revision=revision, + create_pr=create_pr, + ) + except HfHubHTTPError as err: + if ( + err.__context__ + and isinstance(err.__context__, HfHubHTTPError) + and err.__context__.response.status_code == 409 + ): + # 409 is Conflict (another commit is in progress) + time.sleep(sleep_time) + logger.info( + f"Retrying intermediate commit for {repo_id}, {config_name} ({retry}/n with status_code {err.__context__.response.status_code})" + ) + continue + else: + raise + break + logger.info( + f"Commit #{i + 1} completed" + + (f" (still {num_commits - i - 1} to go)" if num_commits - i - 1 else "") + + "." 
+ ) + last_commit_additions = [] + else: + last_commit_additions = additions + + for retry, sleep_time in enumerate(itertools.chain(range(10), itertools.repeat(30)), start=1): + # We need to retry if there was a commit in between in case it touched the dataset card data + sleep_time *= 1 + random.random() + parent_commit, deletions, dataset_card, dataset_infos = get_deletions_and_dataset_card() + dataset_card_additions = [] + if dataset_infos: + dataset_card_additions.append( + CommitOperationAdd( + path_in_repo=config.DATASETDICT_INFOS_FILENAME, + path_or_fileobj=dataset_infos.encode("utf-8"), + ) + ) + dataset_card_additions.append( + CommitOperationAdd(path_in_repo=config.REPOCARD_FILENAME, path_or_fileobj=str(dataset_card).encode()) + ) + try: + commit_info = api.create_commit( + repo_id, + operations=last_commit_additions + dataset_card_additions + deletions, + commit_message=commit_message, + commit_description=commit_description, + repo_type="dataset", + revision=revision, + create_pr=create_pr, + parent_commit=parent_commit, + ) + except HfHubHTTPError as err: + if ( + err.__context__ + and isinstance(err.__context__, HfHubHTTPError) + and err.__context__.response.status_code in (412, 409) + ): + # 412 is Precondition failed (parent_commit isn't satisfied) + # 409 is Conflict (another commit is in progress) + time.sleep(sleep_time) + logger.info( + f"Retrying commit for {repo_id}, {config_name} ({retry}/n with status_code {err.__context__.response.status_code})" + ) + continue + else: + raise + break + + return commit_info + + @transmit_format + @fingerprint_transform(inplace=False) + def add_column( + self, name: str, column: Union[list, np.ndarray], new_fingerprint: str, feature: Optional[FeatureType] = None + ): + """Add column to Dataset. + + + + Args: + name (`str`): + Column name. + column (`list` or `np.array`): + Column data to be added. + feature (`FeatureType` or `None`, defaults to `None`): + Column datatype. 
def add_faiss_index(
    self,
    column: str,
    index_name: Optional[str] = None,
    device: Optional[Union[int, list[int]]] = None,
    string_factory: Optional[str] = None,
    metric_type: Optional[int] = None,
    custom_index: Optional["faiss.Index"] = None,  # noqa: F821
    batch_size: int = 1000,
    train_size: Optional[int] = None,
    faiss_verbose: bool = False,
    dtype=np.float32,
):
    """Add a dense index using Faiss for fast retrieval.
    By default the index is done over the vectors of the specified column.
    You can specify `device` if you want to run it on GPU (`device` must be the GPU index).
    You can find more information about Faiss here:

    - For [string factory](https://github.com/facebookresearch/faiss/wiki/The-index-factory)

    Args:
        column (`str`):
            The column of the vectors to add to the index.
        index_name (`str`, *optional*):
            The `index_name`/identifier of the index.
            This is the `index_name` that is used to call [`~datasets.Dataset.get_nearest_examples`] or [`~datasets.Dataset.search`].
            By default it corresponds to `column`.
        device (`Union[int, List[int]]`, *optional*):
            If positive integer, this is the index of the GPU to use. If negative integer, use all GPUs.
            If a list of positive integers is passed in, run only on those GPUs. By default it uses the CPU.
        string_factory (`str`, *optional*):
            This is passed to the index factory of Faiss to create the index.
            Default index class is `IndexFlat`.
        metric_type (`int`, *optional*):
            Type of metric. Ex: `faiss.METRIC_INNER_PRODUCT` or `faiss.METRIC_L2`.
        custom_index (`faiss.Index`, *optional*):
            Custom Faiss index that you already have instantiated and configured for your needs.
        batch_size (`int`):
            Size of the batch to use while adding vectors to the `FaissIndex`. Default value is `1000`.
        train_size (`int`, *optional*):
            If the index needs a training step, specifies how many vectors will be used to train the index.
        faiss_verbose (`bool`, defaults to `False`):
            Enable the verbosity of the Faiss index.
        dtype (`data-type`):
            The dtype of the numpy arrays that are indexed.
            Default is `np.float32`.

    Returns:
        The dataset itself (`self`), so that calls can be chained.

    Example:

    ```python
    >>> ds = datasets.load_dataset('crime_and_punish', split='train')
    >>> ds_with_embeddings = ds.map(lambda example: {'embeddings': embed(example['line'])})
    >>> ds_with_embeddings.add_faiss_index(column='embeddings')
    >>> # query
    >>> scores, retrieved_examples = ds_with_embeddings.get_nearest_examples('embeddings', embed('my new query'), k=10)
    >>> # save index
    >>> ds_with_embeddings.save_faiss_index('embeddings', 'my_index.faiss')

    >>> ds = datasets.load_dataset('crime_and_punish', split='train')
    >>> # load index
    >>> ds.load_faiss_index('embeddings', 'my_index.faiss')
    >>> # query
    >>> scores, retrieved_examples = ds.get_nearest_examples('embeddings', embed('my new query'), k=10)
    ```
    """
    # Only for the duration of the indexing, expose `column` as numpy arrays of the requested
    # dtype; the previous format is restored when the context manager exits.
    with self.formatted_as(type="numpy", columns=[column], dtype=dtype):
        super().add_faiss_index(
            column=column,
            index_name=index_name,
            device=device,
            string_factory=string_factory,
            metric_type=metric_type,
            custom_index=custom_index,
            batch_size=batch_size,
            train_size=train_size,
            faiss_verbose=faiss_verbose,
        )
    return self
+ It will use `external_arrays` to create the Faiss index instead of the arrays in the given `column`. + index_name (`str`): + The `index_name`/identifier of the index. + This is the `index_name` that is used to call [`~datasets.Dataset.get_nearest_examples`] or [`~datasets.Dataset.search`]. + device (Optional `Union[int, List[int]]`, *optional*): + If positive integer, this is the index of the GPU to use. If negative integer, use all GPUs. + If a list of positive integers is passed in, run only on those GPUs. By default it uses the CPU. + string_factory (`str`, *optional*): + This is passed to the index factory of Faiss to create the index. + Default index class is `IndexFlat`. + metric_type (`int`, *optional*): + Type of metric. Ex: `faiss.faiss.METRIC_INNER_PRODUCT` or `faiss.METRIC_L2`. + custom_index (`faiss.Index`, *optional*): + Custom Faiss index that you already have instantiated and configured for your needs. + batch_size (`int`, *optional*): + Size of the batch to use while adding vectors to the FaissIndex. Default value is 1000. + + train_size (`int`, *optional*): + If the index needs a training step, specifies how many vectors will be used to train the index. + faiss_verbose (`bool`, defaults to False): + Enable the verbosity of the Faiss index. + dtype (`numpy.dtype`): + The dtype of the numpy arrays that are indexed. Default is np.float32. 
def add_elasticsearch_index(
    self,
    column: str,
    index_name: Optional[str] = None,
    host: Optional[str] = None,
    port: Optional[int] = None,
    es_client: Optional["elasticsearch.Elasticsearch"] = None,  # noqa: F821
    es_index_name: Optional[str] = None,
    es_index_config: Optional[dict] = None,
):
    """Build an ElasticSearch text index over one column. The dataset is modified in-place.

    Args:
        column (`str`):
            The column of the documents to add to the index.
        index_name (`str`, *optional*):
            The `index_name`/identifier of the index.
            This is the index name that is used to call [`~Dataset.get_nearest_examples`] or [`~Dataset.search`].
            By default it corresponds to `column`.
        host (`str`, *optional*, defaults to `localhost`):
            Host of where ElasticSearch is running.
        port (`int`, *optional*, defaults to `9200`):
            Port of where ElasticSearch is running.
        es_client (`elasticsearch.Elasticsearch`, *optional*):
            The elasticsearch client used to create the index if host and port are `None`.
        es_index_name (`str`, *optional*):
            The elasticsearch index name used to create the index.
        es_index_config (`dict`, *optional*):
            The configuration of the elasticsearch index.
            Default config is:
                ```
                {
                    "settings": {
                        "number_of_shards": 1,
                        "analysis": {"analyzer": {"stop_standard": {"type": "standard", " stopwords": "_english_"}}},
                    },
                    "mappings": {
                        "properties": {
                            "text": {
                                "type": "text",
                                "analyzer": "standard",
                                "similarity": "BM25"
                            },
                        }
                    },
                }
                ```

    Returns:
        The dataset itself (`self`).

    Example:

    ```python
    >>> es_client = elasticsearch.Elasticsearch()
    >>> ds = datasets.load_dataset('crime_and_punish', split='train')
    >>> ds.add_elasticsearch_index(column='line', es_client=es_client, es_index_name="my_es_index")
    >>> scores, retrieved_examples = ds.get_nearest_examples('line', 'my new query', k=10)
    ```
    """
    # Everything is forwarded untouched to the parent implementation.
    index_options = {
        "column": column,
        "index_name": index_name,
        "host": host,
        "port": port,
        "es_client": es_client,
        "es_index_name": es_index_name,
        "es_index_config": es_index_config,
    }
    # Temporarily drop any custom output format and restrict the view to `column`
    # while the parent class builds the index.
    with self.formatted_as(type=None, columns=[column]):
        super().add_elasticsearch_index(**index_options)
    return self
def align_labels_with_mapping(self, label2id: dict, label_column: str) -> "Dataset":
    """Align the dataset's label ID and label name mapping to match an input `label2id` mapping.
    This is useful when you want to ensure that a model's predicted labels are aligned with the dataset.
    The alignment is done using the lowercase label names.

    Args:
        label2id (`dict`):
            The label name to ID mapping to align the dataset with.
        label_column (`str`):
            The column name of labels to align on.

    Returns:
        [`Dataset`]: the result of mapping every label ID of `label_column` to the ID given by
        `label2id`, with the column feature replaced by a `ClassLabel` built from the names of
        `label2id` ordered by ID.

    Raises:
        ValueError: If `label_column` is not a column of the dataset, or if its feature is neither
            a `ClassLabel` nor a `Sequence` of `ClassLabel`.
        KeyError: While mapping, if a (lowercased) label name of the dataset is missing from `label2id`.

    Example:

    ```python
    >>> # dataset with mapping {'entailment': 0, 'neutral': 1, 'contradiction': 2}
    >>> ds = load_dataset("nyu-mll/glue", "mnli", split="train")
    >>> # mapping to align with
    >>> label2id = {'CONTRADICTION': 0, 'NEUTRAL': 1, 'ENTAILMENT': 2}
    >>> ds_aligned = ds.align_labels_with_mapping(label2id, "label")
    ```

    """
    # Sanity checks
    if label_column not in self._data.column_names:
        raise ValueError(f"Column ({label_column}) not in table columns ({self._data.column_names}).")

    label_feature = self._info.features[label_column]
    is_class_label = isinstance(label_feature, ClassLabel)
    if not (
        is_class_label or (isinstance(label_feature, Sequence) and isinstance(label_feature.feature, ClassLabel))
    ):
        raise ValueError(
            f"Aligning labels with a mapping is only supported for {ClassLabel.__name__} column or {Sequence.__name__} column with the inner type {ClassLabel.__name__}, and column {label_feature} is of type {type(label_feature).__name__}."
        )

    # Sort input mapping by ID value to ensure the label names are aligned
    label2id = dict(sorted(label2id.items(), key=lambda item: item[1]))
    label_names = list(label2id.keys())
    # Some label mappings use uppercase label names so we lowercase them during alignment
    label2id = {k.lower(): v for k, v in label2id.items()}
    int2str_function = label_feature.int2str if is_class_label else label_feature.feature.int2str

    def align_label_id(label_id):
        # Missing labels stay missing; everything else goes old ID -> lowercase name -> new ID
        if label_id is None:
            return None
        return label2id[int2str_function(label_id).lower()]

    if is_class_label:

        def process_label_ids(batch):
            batch[label_column] = [align_label_id(label_id) for label_id in batch[label_column]]
            return batch

    else:

        def process_label_ids(batch):
            batch[label_column] = [
                # a null sequence has nothing to align, keep it as is
                None if seq is None else [align_label_id(label_id) for label_id in seq]
                for seq in batch[label_column]
            ]
            return batch

    # Work on a copy of the feature mapping: assigning into `self.features` directly would
    # relabel the column on this dataset's own metadata while its data still holds the old IDs.
    # NOTE(review): assumes `self.features` hands back the live mapping rather than a copy - confirm.
    features = Features(self.features)
    aligned_class_label = ClassLabel(num_classes=len(label_names), names=label_names)
    features[label_column] = aligned_class_label if is_class_label else List(aligned_class_label)
    return self.map(process_label_ids, features=features, batched=True, desc="Aligning the labels")
+ + *New in version 1.6.0* + + Example: + + ```py + >>> ds3 = _concatenate_map_style_datasets([ds1, ds2]) + ``` + """ + # Ignore datasets with no rows + if any(dset.num_rows > 0 for dset in dsets): + dsets = [dset for dset in dsets if dset.num_rows > 0] + else: + # Return first dataset if all datasets are empty + return dsets[0] + + # Perform checks (and a potential cast if axis=0) + if axis == 0: + _check_if_features_can_be_aligned([dset.features for dset in dsets]) + else: + if not all(dset.num_rows == dsets[0].num_rows for dset in dsets): + raise ValueError("Number of rows must match for all datasets") + _check_column_names([col_name for dset in dsets for col_name in dset._data.column_names]) + + # Find common format or reset format + format = dsets[0].format + if any(dset.format != format for dset in dsets): + format = {} + logger.info("Some of the datasets have disparate format. Resetting the format of the concatenated dataset.") + + def apply_offset_to_indices_table(table, offset): + if offset == 0: + return table + else: + array = table["indices"] + new_array = pc.add(array, pa.scalar(offset, type=pa.uint64())) + return InMemoryTable.from_arrays([new_array], names=["indices"]) + + # Concatenate indices if they exist + if any(dset._indices is not None for dset in dsets): + if axis == 0: + # Datasets with no indices tables are replaced with a dataset with an indices table in memory. + # Applying an offset to an indices table also brings the table in memory. 
def _interleave_map_style_datasets(
    datasets: list["Dataset"],
    probabilities: Optional[list[float]] = None,
    seed: Optional[int] = None,
    info: Optional[DatasetInfo] = None,
    split: Optional[NamedSplit] = None,
    stopping_strategy: Literal[
        "first_exhausted", "all_exhausted", "all_exhausted_without_replacement"
    ] = "first_exhausted",
    **kwargs,
) -> "Dataset":
    """
    Interleave several map-style datasets (sources) into a single map-style dataset.
    The new dataset is constructed by alternating between the sources to get the examples.
    If `probabilities = None` (default) the new dataset is constructed by cycling between each source to get the examples.
    If `probabilities` is not `None`, the new dataset is constructed by getting examples from a random source at a time according to the provided probabilities.

    Args:
        datasets (`List[Dataset]`): list of datasets to interleave
        probabilities (`List[float]`, optional, default None): If specified, the new dataset is constructed by sampling
            examples from one source at a time according to these probabilities.
        seed (`int`, optional, default None): The random seed used to choose a source for each example.
        info (:class:`DatasetInfo`, optional): Dataset information, like description, citation, etc.
        split (:class:`NamedSplit`, optional): Name of the dataset split.
        stopping_strategy (`str`, defaults to `first_exhausted`):
            Three strategies are proposed right now.
            By default, `first_exhausted` is an undersampling strategy, i.e the dataset construction is stopped as soon as one dataset has ran out of samples.
            If the strategy is `all_exhausted`, we use an oversampling strategy, i.e the dataset construction is stopped as soon as every samples of every dataset has been added at least once.
            When strategy is `all_exhausted_without_replacement` we make sure that each sample in each dataset is sampled only once:
            a source that has ran out of samples is skipped until every source is exhausted.
            Note that if the strategy is `all_exhausted`, the interleaved dataset size can get enormous:
            - with no probabilities, the resulting dataset will have max_length_datasets*nb_dataset samples.
            - with given probabilities, the resulting dataset will have more samples if some datasets have really low probability of visiting.
        **kwargs (additional keyword arguments): Keyword arguments to be passed to :meth:`datasets.Datasets.select` when selecting the indices used to interleave the datasets.

    Output:
        :class:`datasets.Dataset`

    Raises:
        ValueError: If `stopping_strategy` is not one of the supported strategies.
    """
    if stopping_strategy not in ["first_exhausted", "all_exhausted", "all_exhausted_without_replacement"]:
        raise ValueError(
            f"{stopping_strategy} stopping strategy in `interleave_datasets` is not implemented yet with a list of {type(datasets[0])}"
        )

    # To interleave the datasets, we concatenate them and then we re-order the indices
    concatenated_datasets = _concatenate_map_style_datasets(datasets, info=info, split=split)

    # Let's now build the indices to pass to .select()
    lengths = [len(dset) for dset in datasets]
    # offsets[i] is the position of the first example of the i-th source in the concatenated dataset
    offsets = np.cumsum([0] + lengths[:-1])

    # "first_exhausted" is an undersampling situation, "all_exhausted" an oversampling one,
    # and "all_exhausted_without_replacement" visits every example exactly once
    oversampling = stopping_strategy == "all_exhausted"
    without_replacement = stopping_strategy == "all_exhausted_without_replacement"

    if probabilities is None and without_replacement:
        # Cycling between the sources, each example being used exactly once
        # Example:: If lengths of the datasets are [3, 4, 5]
        # Then the resulting indices should be [0, 3, 7, 1, 4, 8, 2, 5, 9, 6, 10, 11]
        # Note that a source is simply skipped once it has ran out of examples

        # This case used to fall through to the undersampling branch below, which silently
        # dropped every example past the length of the shortest source.
        indices = [
            int(offsets[source_idx]) + position
            for position in range(max(lengths))
            for source_idx, length in enumerate(lengths)
            if position < length
        ]
    elif probabilities is None and not oversampling:
        # Undersampling situation with cycling between each sources
        # Example:: If lengths of the datasets are [3, 4, 5]
        # Then the resulting indices should be [0, 3, 7, 1, 4, 8, 2, 5, 9]
        # Note that we only have 3 examples per dataset since the first dataset ran out of examples

        # Reasoning behind the following operation: keeping the min_length first indices of each dataset
        # while offsetting in order to correspond to the right indices of the concatenated dataset
        # and flattening to effectively interleave the datasets
        indices = (offsets.reshape(1, -1) + np.arange(min(lengths)).reshape(-1, 1)).flatten().tolist()
    elif probabilities is None:
        # Oversampling situation with cycling between each sources
        # Then the resulting indices should be [0, 3, 7, 1, 4, 8, 2, 5, 9, 0, 6, 10, 1, 3, 11]
        # Note that we have 5 examples per dataset with a rolling window since the longest dataset has 5 samples

        # Reasoning behind the following operation: for each dataset indices (i.e column) repeat the indices to have max_length indices per dataset
        # For example, if the max_length is 5 and the i-th dataset has 3 samples, the i-th column will be [0,1,2,0,1]
        indices = np.mod(np.arange(max(lengths)).reshape(-1, 1), np.array(lengths).reshape(1, -1))

        # We have to keep the indices to their respective dataset offsets and to flatten to effectively interleave the datasets
        indices = (indices + offsets).flatten().tolist()

    else:
        # boolean array indicating if at index i if the dataset_i has been fully exhausted
        is_exhausted = np.full(len(lengths), False)

        # if undersampling ("first_exhausted"), we stop as soon as one dataset is exhausted
        # otherwise we stop as soon as every dataset is exhausted, i.e as soon as every samples of every dataset has been visited at least once
        bool_strategy_func = np.all if (oversampling or without_replacement) else np.any

        def iter_random_indices():
            """Get an infinite iterator that randomly samples the index of the source to pick examples from."""
            rng = np.random.default_rng(seed)
            while True:
                yield from (int(i) for i in rng.choice(len(datasets), size=1000, p=probabilities))

        current_index = [0] * len(datasets)
        indices = []
        for source_idx in iter_random_indices():
            # If no oversampling, we stop as soon as a dataset has ran out of examples (np.any)
            # Otherwise, we stop as soon as every dataset has ran out of examples (np.all)
            if bool_strategy_func(is_exhausted):
                # the stopping condition was reached, let's stop
                break

            # let's add the example at the current index of the `source_idx`-th dataset
            # For without replacement sampling we additionally need to make sure the current source is not exhausted to not oversample.
            if not without_replacement or not is_exhausted[source_idx]:
                indices.append(current_index[source_idx] + offsets[source_idx])
                current_index[source_idx] += 1

            # we've ran out of examples for the current dataset, let's update our boolean array and bring the current_index back to 0
            if current_index[source_idx] >= lengths[source_idx]:
                is_exhausted[source_idx] = True
                # We don't want to reset the iterator when stopping strategy is without replacement.
                if not without_replacement:
                    current_index[source_idx] = 0

    return concatenated_datasets.select(indices, **kwargs)
# This is outside Dataset.filter as it needs to be picklable for multiprocessing


def get_indices_from_mask_function(
    function: Callable,
    batched: bool,
    with_indices: bool,
    with_rank: bool,
    input_columns: Optional[Union[str, list[str]]],
    indices_mapping: Optional[Table] = None,
    *args,
    **fn_kwargs,
):
    """Run a filter predicate on one batch and return the indices it keeps.

    Args:
        function: The predicate. In batched mode it is called once with the whole
            batch and must return one boolean per example (a list, or a
            `pa.Array` / `pa.ChunkedArray`, which is converted with `to_pylist()`).
            Otherwise it is called once per example.
        batched: Whether `function` takes the whole batch at once.
        with_indices: Whether to pass the example index (or, in batched mode, the
            list of indices) to `function` after the inputs.
        with_rank: Whether to pass the rank to `function` as the last positional
            argument.
        input_columns: If `None`, the single input is a dict of columns and each
            example is passed as a dict. Otherwise there is one input per column
            and each example is passed as separate positional values.
        indices_mapping: Optional table whose first column is used to translate
            the kept indices (via `take`).
        *args: The inputs, followed by the batch indices and then the rank.
        **fn_kwargs: Extra keyword arguments forwarded to `function`.

    Returns:
        `dict`: `{"indices": [...]}` with the indices of the kept examples.
    """
    # The last two positional arguments are always the indices and the rank.
    *inputs, indices, rank = args
    rank_args = (rank,) if with_rank else ()
    if batched:
        index_args = (indices,) if with_indices else ()
        mask = function(*inputs, *index_args, *rank_args, **fn_kwargs)
        if isinstance(mask, (pa.Array, pa.ChunkedArray)):
            mask = mask.to_pylist()
    else:
        # we get batched data (to return less data than input) but `function` only accepts one example
        # therefore we need to call `function` on each example of the batch to get the mask
        mask = []
        if input_columns is None:
            # inputs only contains a batch of examples
            batch: dict = inputs[0]
            column_names = list(batch.keys())
            # A batch without any column holds no example: nothing to evaluate.
            num_examples = len(batch[column_names[0]]) if column_names else 0
            for i in range(num_examples):
                example = {key: batch[key][i] for key in column_names}
                index_args = (indices[i],) if with_indices else ()
                mask.append(function(example, *index_args, *rank_args, **fn_kwargs))
        else:
            # inputs is a list of columns
            columns: list[list] = inputs
            num_examples = len(columns[0])
            for i in range(num_examples):
                values = [column[i] for column in columns]
                index_args = (indices[i],) if with_indices else ()
                mask.append(function(*values, *index_args, *rank_args, **fn_kwargs))
    indices_array = [i for i, to_keep in zip(indices, mask) if to_keep]
    if indices_mapping is not None:
        indices_array = pa.array(indices_array, type=pa.uint64())
        indices_array = indices_mapping.column(0).take(indices_array)
        indices_array = indices_array.to_pylist()
    return {"indices": indices_array}
# Matches one sub-spec of a split instruction, e.g. "train", "test[10:]",
# "test[:10%]" or "test[:33%](pct1_dropremainder)".
# The named groups are the ones read by `_str_to_read_instruction`:
# split, from, from_pct, to, to_pct and rounding.
_SUB_SPEC_RE = re.compile(
    rf"""
^
    (?P<split>{_split_re[1:-1]})
    (\[
        ((?P<from>-?[\d_]+)
        (?P<from_pct>%)?)?
        :
        ((?P<to>-?[\d_]+)
        (?P<to_pct>%)?)?
    \])?(\((?P<rounding>[^\)]*)\))?
$
""",  # remove ^ and $
    re.X,
)
+ + Returns: + [`FileInstructions`] + """ + if not isinstance(name, str): + raise TypeError(f"Expected str 'name', but got: {type(name).__name__}") + elif not name: + raise ValueError("Expected non-empty str 'name'") + name2len = {info.name: info.num_examples for info in split_infos} + name2shard_lengths = {info.name: info.shard_lengths for info in split_infos} + name2filenames = { + info.name: filenames_for_dataset_split( + path=prefix_path, + dataset_name=name, + split=info.name, + filetype_suffix=filetype_suffix, + shard_lengths=name2shard_lengths[info.name], + ) + for info in split_infos + } + if not isinstance(instruction, ReadInstruction): + instruction = ReadInstruction.from_spec(instruction) + # Create the absolute instruction (per split) + absolute_instructions = instruction.to_absolute(name2len) + + # For each split, return the files instruction (skip/take) + file_instructions = [] + num_examples = 0 + for abs_instr in absolute_instructions: + split_length = name2len[abs_instr.splitname] + filenames = name2filenames[abs_instr.splitname] + shard_lengths = name2shard_lengths[abs_instr.splitname] + from_ = 0 if abs_instr.from_ is None else abs_instr.from_ + to = split_length if abs_instr.to is None else abs_instr.to + if shard_lengths is None: # not sharded + for filename in filenames: + take = to - from_ + if take == 0: + continue + num_examples += take + file_instructions.append({"filename": filename, "skip": from_, "take": take}) + else: # sharded + index_start = 0 # Beginning (included) of moving window. + index_end = 0 # End (excluded) of moving window. + for filename, shard_length in zip(filenames, shard_lengths): + index_end += shard_length + if from_ < index_end and to > index_start: # There is something to take. 
class BaseReader:
    """
    Build a Dataset object out of Instruction instance(s).

    Subclasses set `_filetype_suffix` and implement `_get_table_from_filename`.
    """

    def __init__(self, path: str, info: Optional["DatasetInfo"]):
        """Initializes the reader.

        Args:
            path (str): directory that the file names of the file instructions are joined to.
            info (DatasetInfo): info about the dataset.
        """
        self._path: str = path
        self._info: Optional["DatasetInfo"] = info
        self._filetype_suffix: Optional[str] = None

    def _get_table_from_filename(self, filename_skip_take, in_memory=False) -> Table:
        """Returns a table from given (filename, skip, take). Must be implemented by subclasses."""
        raise NotImplementedError

    def _read_files(self, files, in_memory=False) -> Table:
        """Returns the table for given file instructions.

        Args:
            files: List[dict(filename, skip, take)], the files information.
                Each filename is joined to the reader's path before being read.
                skip/take indicates which example read in the file: `ds.slice(skip, take)`
            in_memory (bool, default False): Whether to copy the data in-memory.

        Raises:
            ValueError: if `files` is empty or contains something that is not a dict, or if
                all the tables are empty and no `info.features` is available to type an empty table.
        """
        if len(files) == 0 or any(not isinstance(entry, dict) for entry in files):
            raise ValueError("please provide valid file informations")
        # Work on a copy so that the caller's instructions are left untouched.
        resolved_files = copy.deepcopy(files)
        for entry in resolved_files:
            entry["filename"] = os.path.join(self._path, entry["filename"])

        load_table = partial(self._get_table_from_filename, in_memory=in_memory)
        # set `disable=None` rather than `disable=False` by default to disable progress bar when no TTY attached
        disable_progress_bar = True if len(resolved_files) <= 16 else None
        loaded_tables = thread_map(
            load_table,
            resolved_files,
            tqdm_class=hf_tqdm,
            desc="Loading dataset shards",
            disable=disable_progress_bar,
        )
        non_empty_tables = [table for table in loaded_tables if len(table) > 0]
        if not non_empty_tables:
            if self._info is None or self._info.features is None:
                raise ValueError(
                    "Tried to read an empty table. Please specify at least info.features to create an empty table with the right type."
                )
            # Everything was empty: build an empty table that still carries the expected schema.
            non_empty_tables = [InMemoryTable.from_batches([], schema=pa.schema(self._info.features.type))]
        if len(non_empty_tables) == 1:
            return non_empty_tables[0]
        return concat_tables(non_empty_tables)

    def get_file_instructions(self, name, instruction, split_infos):
        """Return list of dict {'filename': str, 'skip': int, 'take': int}"""
        instructions = make_file_instructions(
            name, split_infos, instruction, filetype_suffix=self._filetype_suffix, prefix_path=self._path
        )
        return instructions.file_instructions

    def read(
        self,
        name,
        instructions,
        split_infos,
        in_memory=False,
    ):
        """Returns Dataset instance(s).

        Args:
            name (str): name of the dataset.
            instructions (ReadInstruction): instructions to read.
                Instruction can be string and will then be passed to the Instruction
                constructor as it.
            split_infos (list of SplitInfo proto): the available splits for dataset.
            in_memory (bool, default False): Whether to copy the data in-memory.

        Returns:
             kwargs to build a single Dataset instance.

        Raises:
            ValueError: if the instructions select no file.
        """
        files = self.get_file_instructions(name, instructions, split_infos)
        if files:
            return self.read_files(files=files, original_instructions=instructions, in_memory=in_memory)
        msg = f'Instruction "{instructions}" corresponds to no data!'
        raise ValueError(msg)

    def read_files(
        self,
        files: list[dict],
        original_instructions: Union[None, "ReadInstruction", "Split"] = None,
        in_memory=False,
    ):
        """Returns single Dataset instance for the set of file instructions.

        Args:
            files: List[dict(filename, skip, take)], the files information.
                The filenames contains the relative path, not absolute.
                skip/take indicates which example read in the file: `ds.skip().take()`
            original_instructions: store the original instructions used to build the dataset split in the dataset.
            in_memory (bool, default False): Whether to copy the data in-memory.

        Returns:
            kwargs to build a Dataset instance.
        """
        # The reader's path is prepended to each filename inside `_read_files`
        pa_table = self._read_files(files, in_memory=in_memory)
        split = None
        # If original_instructions is not None, convert it to a human-readable NamedSplit
        if original_instructions is not None:
            from .splits import Split  # noqa

            split = Split(str(original_instructions))
        return {"arrow_table": pa_table, "info": self._info, "split": split}
+ """ + super().__init__(path, info) + self._filetype_suffix = "arrow" + + def _get_table_from_filename(self, filename_skip_take, in_memory=False) -> Table: + """Returns a Dataset instance from given (filename, skip, take).""" + filename, skip, take = ( + filename_skip_take["filename"], + filename_skip_take["skip"] if "skip" in filename_skip_take else None, + filename_skip_take["take"] if "take" in filename_skip_take else None, + ) + table = ArrowReader.read_table(filename, in_memory=in_memory) + if take == -1: + take = len(table) - skip + # here we don't want to slice an empty table, or it may segfault + if skip is not None and take is not None and not (skip == 0 and take == len(table)): + table = table.slice(skip, take) + return table + + @staticmethod + def read_table(filename, in_memory=False) -> Table: + """ + Read table from file. + + Args: + filename (str): File name of the table. + in_memory (bool, default=False): Whether to copy the data in-memory. + + Returns: + pyarrow.Table + """ + table_cls = InMemoryTable if in_memory else MemoryMappedTable + return table_cls.from_file(filename) + + +class ParquetReader(BaseReader): + """ + Build a Dataset object out of Instruction instance(s). + This Reader uses memory mapping on parquet files. + """ + + def __init__(self, path: str, info: Optional["DatasetInfo"]): + """Initializes ParquetReader. + + Args: + path (str): path where tfrecords are stored. + info (DatasetInfo): info about the dataset. 
+ """ + super().__init__(path, info) + self._filetype_suffix = "parquet" + + def _get_table_from_filename(self, filename_skip_take, **kwargs): + """Returns a Dataset instance from given (filename, skip, take).""" + filename, skip, take = ( + filename_skip_take["filename"], + filename_skip_take["skip"] if "skip" in filename_skip_take else None, + filename_skip_take["take"] if "take" in filename_skip_take else None, + ) + # Parquet read_table always loads data in memory, independently of memory_map + pa_table = pq.read_table(filename, memory_map=True) + # here we don't want to slice an empty table, or it may segfault + if skip is not None and take is not None and not (skip == 0 and take == len(pa_table)): + pa_table = pa_table.slice(skip, take) + return pa_table + + +@dataclass(frozen=True) +class _AbsoluteInstruction: + """A machine friendly slice: defined absolute positive boundaries.""" + + splitname: str + from_: int # uint (starting index). + to: int # uint (ending index). + + +@dataclass(frozen=True) +class _RelativeInstruction: + """Represents a single parsed slicing instruction, can use % and negatives.""" + + splitname: str + from_: Optional[int] = None # int (starting index) or None if no lower boundary. + to: Optional[int] = None # int (ending index) or None if no upper boundary. 
+ unit: Optional[str] = None + rounding: Optional[str] = None + + def __post_init__(self): + if self.unit is not None and self.unit not in ["%", "abs"]: + raise ValueError("unit must be either % or abs") + if self.rounding is not None and self.rounding not in ["closest", "pct1_dropremainder"]: + raise ValueError("rounding must be either closest or pct1_dropremainder") + if self.unit != "%" and self.rounding is not None: + raise ValueError("It is forbidden to specify rounding if not using percent slicing.") + if self.unit == "%" and self.from_ is not None and abs(self.from_) > 100: + raise ValueError("Percent slice boundaries must be > -100 and < 100.") + if self.unit == "%" and self.to is not None and abs(self.to) > 100: + raise ValueError("Percent slice boundaries must be > -100 and < 100.") + # Update via __dict__ due to instance being "frozen" + self.__dict__["rounding"] = "closest" if self.rounding is None and self.unit == "%" else self.rounding + + +def _str_to_read_instruction(spec): + """Returns ReadInstruction for given string.""" + res = _SUB_SPEC_RE.match(spec) + if not res: + raise ValueError(f"Unrecognized instruction format: {spec}") + unit = "%" if res.group("from_pct") or res.group("to_pct") else "abs" + return ReadInstruction( + split_name=res.group("split"), + rounding=res.group("rounding"), + from_=int(res.group("from")) if res.group("from") else None, + to=int(res.group("to")) if res.group("to") else None, + unit=unit, + ) + + +def _pct_to_abs_pct1(boundary, num_examples): + # Using math.trunc here, since -99.5% should give -99%, not -100%. + if num_examples < 100: + msg = ( + 'Using "pct1_dropremainder" rounding on a split with less than 100 ' + "elements is forbidden: it always results in an empty dataset." 
class ReadInstruction:
    """Reading instruction for a dataset.

    An instance holds a list of relative instructions (one per split slice);
    adding two instances concatenates their lists.

    Examples::

      # The following lines are equivalent:
      ds = datasets.load_dataset('mnist', split='test[:33%]')
      ds = datasets.load_dataset('mnist', split=datasets.ReadInstruction.from_spec('test[:33%]'))
      ds = datasets.load_dataset('mnist', split=datasets.ReadInstruction('test', to=33, unit='%'))
      ds = datasets.load_dataset('mnist', split=datasets.ReadInstruction(
          'test', from_=0, to=33, unit='%'))

      # The following lines are equivalent:
      ds = datasets.load_dataset('mnist', split='test[:33%]+train[1:-1]')
      ds = datasets.load_dataset('mnist', split=datasets.ReadInstruction.from_spec(
          'test[:33%]+train[1:-1]'))
      ds = datasets.load_dataset('mnist', split=(
          datasets.ReadInstruction('test', to=33, unit='%') +
          datasets.ReadInstruction('train', from_=1, to=-1, unit='abs')))

      # The following lines are equivalent:
      ds = datasets.load_dataset('mnist', split='test[:33%](pct1_dropremainder)')
      ds = datasets.load_dataset('mnist', split=datasets.ReadInstruction.from_spec(
          'test[:33%](pct1_dropremainder)'))
      ds = datasets.load_dataset('mnist', split=datasets.ReadInstruction(
          'test', from_=0, to=33, unit='%', rounding="pct1_dropremainder"))

      # 10-fold validation:
      tests = datasets.load_dataset(
          'mnist',
          [datasets.ReadInstruction('train', from_=k, to=k+10, unit='%')
          for k in range(0, 100, 10)])
      trains = datasets.load_dataset(
          'mnist',
          [datasets.ReadInstruction('train', to=k, unit='%') + datasets.ReadInstruction('train', from_=k+10, unit='%')
          for k in range(0, 100, 10)])

    """

    def _init(self, relative_instructions):
        # Private initializer, shared by `__init__` and the factory below.
        self._relative_instructions = relative_instructions

    @classmethod
    def _read_instruction_from_relative_instructions(cls, relative_instructions):
        """Returns ReadInstruction obj initialized with relative_instructions."""
        # Go through __new__ so that the public __init__ (which takes split parameters) is not run.
        instance = cls.__new__(cls)
        instance._init(relative_instructions)  # pylint: disable=protected-access
        return instance

    def __init__(self, split_name, rounding=None, from_=None, to=None, unit=None):
        """Initialize ReadInstruction.

        Args:
            split_name (str): name of the split to read. Eg: 'train'.
            rounding (str, optional): The rounding behaviour to use when percent slicing is
                used. Ignored when slicing with absolute indices.
                Possible values:
                 - 'closest' (default): The specified percentages are rounded to the
                     closest value. Use this if you want specified percents to be as
                     much exact as possible.
                 - 'pct1_dropremainder': the specified percentages are treated as
                     multiple of 1%. Use this option if you want consistency. Eg:
                         len(5%) == 5 * len(1%).
                     Using this option, one might not be able to use the full set of
                     examples, if the number of those is not a multiple of 100.
            from_ (int):
            to (int): alternative way of specifying slicing boundaries. If any of
                {from_, to, unit} argument is used, slicing cannot be specified as
                string.
            unit (str): optional, one of:
                '%': to set the slicing unit as percents of the split size.
                'abs': to set the slicing unit as absolute numbers.
        """
        # This constructor is not always called. See factory method
        # `_read_instruction_from_relative_instructions`. Common init instructions
        # MUST be placed in the _init method.
        single_instruction = _RelativeInstruction(split_name, from_, to, unit, rounding)
        self._init([single_instruction])

    @classmethod
    def from_spec(cls, spec):
        """Creates a `ReadInstruction` instance out of a string spec.

        Args:
            spec (`str`):
                Split(s) + optional slice(s) to read + optional rounding
                if percents are used as the slicing unit. A slice can be specified,
                using absolute numbers (`int`) or percentages (`int`).

        Examples:

            ```
            test: test split.
            test + validation: test split + validation split.
            test[10:]: test split, minus its first 10 records.
            test[:10%]: first 10% records of test split.
            test[:20%](pct1_dropremainder): first 20% records, rounded with the pct1_dropremainder rounding.
            test[:-5%]+train[40%:60%]: first 95% of test + middle 20% of train.
            ```

        Returns:
            ReadInstruction instance.
        """
        spec = str(spec)  # Need to convert to str in case of NamedSplit instance.
        sub_specs = _ADDITION_SEP_RE.split(spec)
        if not sub_specs:
            raise ValueError(f"No instructions could be built out of {spec}")
        # Parse the first sub-spec, then append the others one by one with `+`.
        combined = _str_to_read_instruction(sub_specs[0])
        for sub_spec in sub_specs[1:]:
            combined = combined + _str_to_read_instruction(sub_spec)
        return combined

    def to_spec(self):
        """Returns the string spec of this instruction, with sub-specs joined by "+"."""
        sub_specs = []
        for rel in self._relative_instructions:
            sub_spec = rel.splitname
            if not (rel.from_ is None and rel.to is None):
                suffix = "%" if rel.unit == "%" else ""
                lower = "" if rel.from_ is None else str(rel.from_) + suffix
                upper = "" if rel.to is None else str(rel.to) + suffix
                sub_spec += f"[{lower}:{upper}]"
                # "closest" is not written out.
                if suffix == "%" and rel.rounding is not None and rel.rounding != "closest":
                    sub_spec += f"({rel.rounding})"
            sub_specs.append(sub_spec)
        return "+".join(sub_specs)

    def __add__(self, other):
        """Returns a new ReadInstruction obj, result of appending other to self."""
        if not isinstance(other, ReadInstruction):
            msg = "ReadInstruction can only be added to another ReadInstruction obj."
            raise TypeError(msg)
        mine = self._relative_instructions
        theirs = other._relative_instructions  # pylint: disable=protected-access
        # Only the first instruction of each operand is compared.
        if mine[0].unit != "abs" and theirs[0].unit != "abs":
            if mine[0].rounding != theirs[0].rounding:
                raise ValueError("It is forbidden to sum ReadInstruction instances with different rounding values.")
        return self._read_instruction_from_relative_instructions(mine + theirs)

    def __str__(self):
        return self.to_spec()

    def __repr__(self):
        return f"ReadInstruction({self._relative_instructions})"

    def to_absolute(self, name2len):
        """Translate instruction into a list of absolute instructions.

        Those absolute instructions are then to be added together.

        Args:
            name2len (`dict`):
                Associating split names to number of examples.

        Returns:
            list of _AbsoluteInstruction instances (corresponds to the `+` in spec).
        """
        absolute_instructions = []
        for rel in self._relative_instructions:
            absolute_instructions.append(_rel_to_abs_instr(rel, name2len))
        return absolute_instructions
def get_arrow_writer_batch_size_from_features(features: Optional[Features]) -> Optional[int]:
    """
    Get the writer_batch_size that defines the maximum record batch size in the arrow files based on configuration values.
    The result is the smallest of the `config.ARROW_RECORD_BATCH_SIZE_FOR_*_DATASETS` values that are set
    and whose feature type (image, audio, video, binary value) appears in `features`.
    This allows to avoid overflows in arrow buffers.

    Args:
        features (`datasets.Features` or `None`):
            Dataset Features from `datasets`.
    Returns:
        writer_batch_size (`Optional[int]`):
            Writer batch size to pass to a dataset builder.
            If `None`, then it will use the `datasets` default, i.e. `datasets.config.DEFAULT_MAX_BATCH_SIZE`.
    """
    if not features:
        return None

    # `np.inf` is the "no limit found yet" marker; every configured limit is folded in with `min`.
    batch_size = np.inf

    def set_batch_size(feature: FeatureType) -> None:
        nonlocal batch_size
        is_binary_value = isinstance(feature, Value) and feature.dtype == "binary"
        if isinstance(feature, Image) and config.ARROW_RECORD_BATCH_SIZE_FOR_IMAGE_DATASETS is not None:
            limit = config.ARROW_RECORD_BATCH_SIZE_FOR_IMAGE_DATASETS
        elif isinstance(feature, Audio) and config.ARROW_RECORD_BATCH_SIZE_FOR_AUDIO_DATASETS is not None:
            limit = config.ARROW_RECORD_BATCH_SIZE_FOR_AUDIO_DATASETS
        elif isinstance(feature, Video) and config.ARROW_RECORD_BATCH_SIZE_FOR_VIDEO_DATASETS is not None:
            limit = config.ARROW_RECORD_BATCH_SIZE_FOR_VIDEO_DATASETS
        elif is_binary_value and config.ARROW_RECORD_BATCH_SIZE_FOR_BINARY_DATASETS is not None:
            limit = config.ARROW_RECORD_BATCH_SIZE_FOR_BINARY_DATASETS
        else:
            # Not a feature type with a configured limit: leave the current value alone.
            return
        batch_size = min(batch_size, limit)

    _visit(features, set_batch_size)

    if batch_size is np.inf:
        # No limit was folded in.
        return None
    return batch_size
def get_writer_batch_size_from_data_size(num_rows: int, num_bytes: int) -> int:
    """
    Get the writer_batch_size that defines the maximum row group size in the parquet files.
    The default in `datasets` is aiming for row groups of maximum 100MB uncompressed.
    This allows to optimize random access to parquet file, since accessing 1 row requires
    to read its entire row group.

    This can be improved to get optimized size for querying/iterating
    but at least it matches the dataset viewer expectations on HF.

    Args:
        num_rows (`int`):
            Number of rows in the dataset.
        num_bytes (`int`):
            Number of bytes in the dataset.
            For dataset with external files to embed (image, audio, videos), this can also be an
            estimate from `dataset._estimate_nbytes()`.
    Returns:
        writer_batch_size (`int`):
            Writer batch size to pass to a parquet writer: at least 10 when `num_bytes` is
            positive, and 1 otherwise.
    """
    if not num_bytes > 0:
        # Without a positive size there is nothing to divide by.
        return 1
    max_row_group_bytes = convert_file_size_to_int(config.MAX_ROW_GROUP_SIZE)
    # Rows that fit in one row group at the average row size, never below 10.
    return max(10, num_rows * max_row_group_bytes // num_bytes)
None and try_type is not None: + raise ValueError("You cannot specify both type and try_type") + # set attributes + self.data = data + self.type = type + self.try_type = try_type # is ignored if it doesn't match the data + self.optimized_int_type = optimized_int_type + # when trying a type (is ignored if data is not compatible) + self.trying_type = self.try_type is not None + self.trying_int_optimization = optimized_int_type is not None and type is None and try_type is None + # used to get back the inferred type after __arrow_array__() is called once + self._inferred_type = None + + def get_inferred_type(self) -> FeatureType: + """Return the inferred feature type. + This is done by converting the sequence to an Arrow array, and getting the corresponding + feature type. + + Since building the Arrow array can be expensive, the value of the inferred type is cached + as soon as pa.array is called on the typed sequence. + + Returns: + FeatureType: inferred feature type of the sequence. + """ + if self._inferred_type is None: + self._inferred_type = generate_from_arrow_type(pa.array(self).type) + return self._inferred_type + + @staticmethod + def _infer_custom_type_and_encode(data: Iterable) -> tuple[Iterable, Optional[FeatureType]]: + """Implement type inference for custom objects like PIL.Image.Image -> Image type. + + This function is only used for custom python objects that can't be directly passed to build + an Arrow array. In such cases is infers the feature type to use, and it encodes the data so + that they can be passed to an Arrow array. + + Args: + data (Iterable): array of data to infer the type, e.g. a list of PIL images. + + Returns: + Tuple[Iterable, Optional[FeatureType]]: a tuple with: + - the (possibly encoded) array, if the inferred feature type requires encoding + - the inferred feature type if the array is made of supported custom objects like + PIL images, else None. 
+ """ + if config.PIL_AVAILABLE and "PIL" in sys.modules: + import PIL.Image + + non_null_idx, non_null_value = first_non_null_non_empty_value(data) + if isinstance(non_null_value, PIL.Image.Image): + return [Image().encode_example(value) if value is not None else None for value in data], Image() + if isinstance(non_null_value, list) and isinstance(non_null_value[0], PIL.Image.Image): + return [ + [Image().encode_example(x) for x in value] if value is not None else None for value in data + ], List(Image()) + if config.PDFPLUMBER_AVAILABLE and "pdfplumber" in sys.modules: + import pdfplumber + + non_null_idx, non_null_value = first_non_null_non_empty_value(data) + if isinstance(non_null_value, pdfplumber.pdf.PDF): + return [Pdf().encode_example(value) if value is not None else None for value in data], Pdf() + if isinstance(non_null_value, list) and isinstance(non_null_value[0], pdfplumber.pdf.PDF): + return [ + [Pdf().encode_example(x) for x in value] if value is not None else None for value in data + ], List(Pdf()) + return data, None + + def __arrow_array__(self, type: Optional[pa.DataType] = None): + """This function is called when calling pa.array(typed_sequence)""" + + if type is not None: + raise ValueError("TypedSequence is supposed to be used with pa.array(typed_sequence, type=None)") + del type # make sure we don't use it + data = self.data + # automatic type inference for custom objects + if self.type is None and self.try_type is None: + data, self._inferred_type = self._infer_custom_type_and_encode(data) + if self._inferred_type is None: + type = self.try_type if self.trying_type else self.type + else: + type = self._inferred_type + pa_type = get_nested_type(type) if type is not None else None + optimized_int_pa_type = ( + get_nested_type(self.optimized_int_type) if self.optimized_int_type is not None else None + ) + trying_cast_to_python_objects = False + try: + # custom pyarrow types + if isinstance(pa_type, _ArrayXDExtensionType): + storage = 
to_pyarrow_listarray(data, pa_type) + return pa.ExtensionArray.from_storage(pa_type, storage) + + # efficient np array to pyarrow array + if isinstance(data, np.ndarray): + out = numpy_to_pyarrow_listarray(data) + elif isinstance(data, list) and data and isinstance(first_non_null_non_empty_value(data)[1], np.ndarray): + out = list_of_np_array_to_pyarrow_listarray(data) + else: + trying_cast_to_python_objects = True + out = pa.array(cast_to_python_objects(data, only_1d_for_numpy=True)) + # use smaller integer precisions if possible + if self.trying_int_optimization: + if pa.types.is_int64(out.type): + out = out.cast(optimized_int_pa_type) + elif pa.types.is_list(out.type): + if pa.types.is_int64(out.type.value_type): + out = array_cast(out, pa.list_(optimized_int_pa_type)) + elif pa.types.is_list(out.type.value_type) and pa.types.is_int64(out.type.value_type.value_type): + out = array_cast(out, pa.list_(pa.list_(optimized_int_pa_type))) + # otherwise we can finally use the user's type + elif type is not None: + # We use cast_array_to_feature to support casting to custom types like Audio and Image + # Also, when trying type "string", we don't want to convert integers or floats to "string". + # We only do it if trying_type is False - since this is what the user asks for. 
+ out = cast_array_to_feature( + out, type, allow_primitive_to_str=not self.trying_type, allow_decimal_to_str=not self.trying_type + ) + return out + except ( + TypeError, + pa.lib.ArrowInvalid, + pa.lib.ArrowNotImplementedError, + ) as e: # handle type errors and overflows + # Ignore ArrowNotImplementedError caused by trying type, otherwise re-raise + if not self.trying_type and isinstance(e, pa.lib.ArrowNotImplementedError): + raise + + if self.trying_type: + try: # second chance + if isinstance(data, np.ndarray): + return numpy_to_pyarrow_listarray(data) + elif isinstance(data, list) and data and any(isinstance(value, np.ndarray) for value in data): + return list_of_np_array_to_pyarrow_listarray(data) + else: + trying_cast_to_python_objects = True + return pa.array(cast_to_python_objects(data, only_1d_for_numpy=True)) + except pa.lib.ArrowInvalid as e: + if "overflow" in str(e): + raise OverflowError( + f"There was an overflow with type {type_(data)}. Try to reduce writer_batch_size to have batches smaller than 2GB.\n({e})" + ) from None + elif self.trying_int_optimization and "not in range" in str(e): + optimized_int_pa_type_str = np.dtype(optimized_int_pa_type.to_pandas_dtype()).name + logger.info( + f"Failed to cast a sequence to {optimized_int_pa_type_str}. Falling back to int64." + ) + return out + elif trying_cast_to_python_objects and "Could not convert" in str(e): + out = pa.array( + cast_to_python_objects(data, only_1d_for_numpy=True, optimize_list_casting=False) + ) + if type is not None: + out = cast_array_to_feature( + out, type, allow_primitive_to_str=True, allow_decimal_to_str=True + ) + return out + else: + raise + elif "overflow" in str(e): + raise OverflowError( + f"There was an overflow with type {type_(data)}. 
Try to reduce writer_batch_size to have batches smaller than 2GB.\n({e})" + ) from None + elif self.trying_int_optimization and "not in range" in str(e): + optimized_int_pa_type_str = np.dtype(optimized_int_pa_type.to_pandas_dtype()).name + logger.info(f"Failed to cast a sequence to {optimized_int_pa_type_str}. Falling back to int64.") + return out + elif trying_cast_to_python_objects and "Could not convert" in str(e): + out = pa.array(cast_to_python_objects(data, only_1d_for_numpy=True, optimize_list_casting=False)) + if type is not None: + out = cast_array_to_feature(out, type, allow_primitive_to_str=True, allow_decimal_to_str=True) + return out + else: + raise + + +class OptimizedTypedSequence(TypedSequence): + def __init__( + self, + data, + type: Optional[FeatureType] = None, + try_type: Optional[FeatureType] = None, + col: Optional[str] = None, + optimized_int_type: Optional[FeatureType] = None, + ): + optimized_int_type_by_col = { + "attention_mask": Value("int8"), # binary tensor + "special_tokens_mask": Value("int8"), + "input_ids": Value("int32"), # typical vocab size: 0-50k (max ~500k, never > 1M) + "token_type_ids": Value( + "int8" + ), # binary mask; some (XLNetModel) use an additional token represented by a 2 + } + if type is None and try_type is None: + optimized_int_type = optimized_int_type_by_col.get(col, None) + super().__init__(data, type=type, try_type=try_type, optimized_int_type=optimized_int_type) + + +class ArrowWriter: + """Shuffles and writes Examples to Arrow files.""" + + def __init__( + self, + schema: Optional[pa.Schema] = None, + features: Optional[Features] = None, + path: Optional[str] = None, + stream: Optional[pa.NativeFile] = None, + fingerprint: Optional[str] = None, + writer_batch_size: Optional[int] = None, + hash_salt: Optional[str] = None, + check_duplicates: Optional[bool] = False, + disable_nullable: bool = False, + update_features: bool = False, + with_metadata: bool = True, + unit: str = "examples", + embed_local_files: 
bool = False, + storage_options: Optional[dict] = None, + ): + if path is None and stream is None: + raise ValueError("At least one of path and stream must be provided.") + if features is not None: + self._features = features + self._schema = None + elif schema is not None: + self._schema: pa.Schema = schema + self._features = Features.from_arrow_schema(self._schema) + else: + self._features = None + self._schema = None + + if hash_salt is not None: + # Create KeyHasher instance using split name as hash salt + self._hasher = KeyHasher(hash_salt) + else: + self._hasher = KeyHasher("") + + self._check_duplicates = check_duplicates + self._disable_nullable = disable_nullable + + if stream is None: + fs, path = url_to_fs(path, **(storage_options or {})) + self._fs: fsspec.AbstractFileSystem = fs + self._path = path if not is_remote_filesystem(self._fs) else self._fs.unstrip_protocol(path) + self.stream = self._fs.open(path, "wb") + self._closable_stream = True + else: + self._fs = None + self._path = None + self.stream = stream + self._closable_stream = False + + self.fingerprint = fingerprint + self.disable_nullable = disable_nullable + self.writer_batch_size = ( + writer_batch_size + or get_arrow_writer_batch_size_from_features(self._features) + or config.DEFAULT_MAX_BATCH_SIZE + ) + self.update_features = update_features + self.with_metadata = with_metadata + self.unit = unit + self.embed_local_files = embed_local_files + + self._num_examples = 0 + self._num_bytes = 0 + self.current_examples: list[tuple[dict[str, Any], str]] = [] + self.current_rows: list[pa.Table] = [] + self.pa_writer: Optional[pa.RecordBatchStreamWriter] = None + self.hkey_record = [] + + def __len__(self): + """Return the number of writed and staged examples""" + return self._num_examples + len(self.current_examples) + len(self.current_rows) + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.close() + + def close(self): + # Try closing if opened; if 
closed: pyarrow.lib.ArrowInvalid: Invalid operation on closed file + if self.pa_writer: # it might be None + try: + self.pa_writer.close() + except Exception: # pyarrow.lib.ArrowInvalid, OSError + pass + if self._closable_stream and not self.stream.closed: + self.stream.close() # This also closes self.pa_writer if it is opened + + def _build_schema(self, inferred_schema: pa.Schema): + schema = self.schema + features = self._features + inferred_features = Features.from_arrow_schema(inferred_schema) + if self._features is not None: + if self.update_features: # keep original features it they match, or update them + fields = {field.name: field for field in self._features.type} + for inferred_field in inferred_features.type: + name = inferred_field.name + if name in fields: + if inferred_field == fields[name]: + inferred_features[name] = self._features[name] + features = inferred_features + schema: pa.Schema = inferred_schema + else: + features = inferred_features + schema: pa.Schema = inferred_features.arrow_schema + + if self.disable_nullable: + schema = pa.schema(pa.field(field.name, field.type, nullable=False) for field in schema) + if self.with_metadata: + schema = schema.with_metadata(self._build_metadata(DatasetInfo(features=features), self.fingerprint)) + else: + schema = schema.with_metadata({}) + + return schema, features + + def _build_writer(self, inferred_schema: pa.Schema): + self._schema, self._features = self._build_schema(inferred_schema) + self.pa_writer = pa.RecordBatchStreamWriter(self.stream, self._schema) + + @property + def schema(self): + _schema = ( + self._schema + if self._schema is not None + else (pa.schema(self._features.type) if self._features is not None else None) + ) + if self._disable_nullable and _schema is not None: + _schema = pa.schema(pa.field(field.name, field.type, nullable=False) for field in _schema) + return _schema if _schema is not None else [] + + @staticmethod + def _build_metadata(info: DatasetInfo, fingerprint: 
Optional[str] = None) -> dict[str, str]: + info_keys = ["features"] # we can add support for more DatasetInfo keys in the future + info_as_dict = asdict(info) + metadata = {} + metadata["info"] = {key: info_as_dict[key] for key in info_keys} + if fingerprint is not None: + metadata["fingerprint"] = fingerprint + return {"huggingface": json.dumps(metadata)} + + def write_examples_on_file(self): + """Write stored examples from the write-pool of examples. It makes a table out of the examples and write it.""" + if not self.current_examples: + return + # preserve the order the columns + if self.schema: + schema_cols = set(self.schema.names) + examples_cols = self.current_examples[0][0].keys() # .keys() preserves the order (unlike set) + common_cols = [col for col in self.schema.names if col in examples_cols] + extra_cols = [col for col in examples_cols if col not in schema_cols] + cols = common_cols + extra_cols + else: + cols = list(self.current_examples[0][0]) + batch_examples = {} + for col in cols: + # We use row[0][col] since current_examples contains (example, key) tuples. + # Moreover, examples could be Arrow arrays of 1 element. + # This can happen in `.map()` when we want to re-write the same Arrow data + if all(isinstance(row[0][col], (pa.Array, pa.ChunkedArray)) for row in self.current_examples): + arrays = [row[0][col] for row in self.current_examples] + arrays = [ + chunk + for array in arrays + for chunk in (array.chunks if isinstance(array, pa.ChunkedArray) else [array]) + ] + batch_examples[col] = pa.concat_arrays(arrays) + else: + batch_examples[col] = [ + row[0][col].to_pylist()[0] if isinstance(row[0][col], (pa.Array, pa.ChunkedArray)) else row[0][col] + for row in self.current_examples + ] + self.write_batch(batch_examples=batch_examples) + self.current_examples = [] + + def write_rows_on_file(self): + """Write stored rows from the write-pool of rows. 
It concatenates the single-row tables and it writes the resulting table.""" + if not self.current_rows: + return + table = pa.concat_tables(self.current_rows) + self.write_table(table) + self.current_rows = [] + + def write( + self, + example: dict[str, Any], + key: Optional[Union[str, int, bytes]] = None, + writer_batch_size: Optional[int] = None, + ): + """Add a given (Example,Key) pair to the write-pool of examples which is written to file. + + Args: + example: the Example to add. + key: Optional, a unique identifier(str, int or bytes) associated with each example + """ + # Utilize the keys and duplicate checking when `self._check_duplicates` is passed True + if self._check_duplicates: + # Create unique hash from key and store as (key, example) pairs + hash = self._hasher.hash(key) + self.current_examples.append((example, hash)) + # Maintain record of keys and their respective hashes for checking duplicates + self.hkey_record.append((hash, key)) + else: + # Store example as a tuple so as to keep the structure of `self.current_examples` uniform + self.current_examples.append((example, "")) + + if writer_batch_size is None: + writer_batch_size = self.writer_batch_size + if writer_batch_size is not None and len(self.current_examples) >= writer_batch_size: + if self._check_duplicates: + self.check_duplicate_keys() + # Re-initializing to empty list for next batch + self.hkey_record = [] + + self.write_examples_on_file() + + def check_duplicate_keys(self): + """Raises error if duplicates found in a batch""" + tmp_record = set() + for hash, key in self.hkey_record: + if hash in tmp_record: + duplicate_key_indices = [ + str(self._num_examples + index) + for index, (duplicate_hash, _) in enumerate(self.hkey_record) + if duplicate_hash == hash + ] + + raise DuplicatedKeysError(key, duplicate_key_indices) + else: + tmp_record.add(hash) + + def write_row(self, row: pa.Table, writer_batch_size: Optional[int] = None): + """Add a given single-row Table to the write-pool of 
rows which is written to file. + + Args: + row: the row to add. + """ + if len(row) != 1: + raise ValueError(f"Only single-row pyarrow tables are allowed but got table with {len(row)} rows.") + self.current_rows.append(row) + if writer_batch_size is None: + writer_batch_size = self.writer_batch_size + if writer_batch_size is not None and len(self.current_rows) >= writer_batch_size: + self.write_rows_on_file() + + def write_batch( + self, + batch_examples: dict[str, list], + writer_batch_size: Optional[int] = None, + try_original_type: Optional[bool] = True, + ): + """Write a batch of Example to file. + Ignores the batch if it appears to be empty, + preventing a potential schema update of unknown types. + + Args: + batch_examples: the batch of examples to add. + try_original_type: use `try_type` when instantiating OptimizedTypedSequence if `True`, otherwise `try_type = None`. + """ + if batch_examples and len(next(iter(batch_examples.values()))) == 0: + return + features = None if self.pa_writer is None and self.update_features else self._features + try_features = self._features if self.pa_writer is None and self.update_features else None + arrays = [] + inferred_features = Features() + # preserve the order the columns + if self.schema: + schema_cols = set(self.schema.names) + batch_cols = batch_examples.keys() # .keys() preserves the order (unlike set) + common_cols = [col for col in self.schema.names if col in batch_cols] + extra_cols = [col for col in batch_cols if col not in schema_cols] + cols = common_cols + extra_cols + else: + cols = list(batch_examples) + for col in cols: + col_values = batch_examples[col] + col_type = features[col] if features else None + if isinstance(col_values, (pa.Array, pa.ChunkedArray)): + array = cast_array_to_feature(col_values, col_type) if col_type is not None else col_values + arrays.append(array) + inferred_features[col] = generate_from_arrow_type(col_values.type) + else: + col_try_type = ( + try_features[col] + if try_features 
is not None and col in try_features and try_original_type + else None + ) + typed_sequence = OptimizedTypedSequence(col_values, type=col_type, try_type=col_try_type, col=col) + arrays.append(pa.array(typed_sequence)) + inferred_features[col] = typed_sequence.get_inferred_type() + schema = inferred_features.arrow_schema if self.pa_writer is None else self.schema + pa_table = pa.Table.from_arrays(arrays, schema=schema) + self.write_table(pa_table, writer_batch_size) + + def write_table(self, pa_table: pa.Table, writer_batch_size: Optional[int] = None): + """Write a Table to file. + + Args: + example: the Table to add. + """ + if writer_batch_size is None: + writer_batch_size = self.writer_batch_size + if self.pa_writer is None: + self._build_writer(inferred_schema=pa_table.schema) + pa_table = pa_table.combine_chunks() + pa_table = table_cast(pa_table, self._schema) + if self.embed_local_files: + pa_table = embed_table_storage(pa_table) + self._num_bytes += pa_table.nbytes + self._num_examples += pa_table.num_rows + self.pa_writer.write_table(pa_table, writer_batch_size) + + def finalize(self, close_stream=True): + self.write_rows_on_file() + # In case current_examples < writer_batch_size, but user uses finalize() + if self._check_duplicates: + self.check_duplicate_keys() + # Re-initializing to empty list for next batch + self.hkey_record = [] + self.write_examples_on_file() + # If schema is known, infer features even if no examples were written + if self.pa_writer is None and self.schema: + self._build_writer(self.schema) + if self.pa_writer is not None: + self.pa_writer.close() + self.pa_writer = None + if close_stream: + self.stream.close() + else: + if close_stream: + self.stream.close() + raise SchemaInferenceError("Please pass `features` or at least one example when writing data") + logger.debug( + f"Done writing {self._num_examples} {self.unit} in {self._num_bytes} bytes {self._path if self._path else ''}." 
+ ) + return self._num_examples, self._num_bytes + + +class ParquetWriter(ArrowWriter): + def __init__(self, *args, use_content_defined_chunking=True, write_page_index=True, **kwargs): + super().__init__(*args, **kwargs) + if use_content_defined_chunking is True: + use_content_defined_chunking = config.DEFAULT_CDC_OPTIONS + self.use_content_defined_chunking = use_content_defined_chunking + self.write_page_index = write_page_index + + def _build_writer(self, inferred_schema: pa.Schema): + self._schema, self._features = self._build_schema(inferred_schema) + self.pa_writer = pq.ParquetWriter( + self.stream, + self._schema, + use_content_defined_chunking=self.use_content_defined_chunking, + write_page_index=self.write_page_index, + ) + if self.use_content_defined_chunking is not False: + self.pa_writer.add_key_value_metadata( + {"content_defined_chunking": json.dumps(self.use_content_defined_chunking)} + ) diff --git a/datasets/builder.py b/datasets/builder.py new file mode 100644 index 0000000000000000000000000000000000000000..b88aa0bf8f99fa0c833b736022f12c18b8c09c63 --- /dev/null +++ b/datasets/builder.py @@ -0,0 +1,1866 @@ +# Copyright 2020 The HuggingFace Datasets Authors and the TensorFlow Datasets Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# Lint as: python3 +"""DatasetBuilder base class.""" + +import abc +import contextlib +import copy +import inspect +import os +import posixpath +import shutil +import textwrap +import time +import urllib +from collections.abc import Iterable, Mapping +from dataclasses import dataclass +from functools import partial +from pathlib import Path +from typing import TYPE_CHECKING, Optional, Union +from unittest.mock import patch + +import fsspec +from fsspec.core import url_to_fs +from multiprocess import Pool +from tqdm.contrib.concurrent import thread_map + +from . import config, utils +from .arrow_dataset import Dataset +from .arrow_reader import ( + ArrowReader, + ReadInstruction, +) +from .arrow_writer import ArrowWriter, ParquetWriter, SchemaInferenceError +from .data_files import DataFilesDict, DataFilesPatternsDict, sanitize_patterns +from .dataset_dict import DatasetDict, IterableDatasetDict +from .download.download_config import DownloadConfig +from .download.download_manager import DownloadManager, DownloadMode +from .download.streaming_download_manager import StreamingDownloadManager, xjoin +from .exceptions import DatasetGenerationCastError, DatasetGenerationError, FileFormatError, ManualDownloadError +from .features import Features +from .filesystems import ( + is_remote_filesystem, + rename, +) +from .fingerprint import Hasher +from .info import DatasetInfo, PostProcessedInfo +from .iterable_dataset import ArrowExamplesIterable, ExamplesIterable, IterableDataset +from .keyhash import DuplicatedKeysError +from .naming import INVALID_WINDOWS_CHARACTERS_IN_PATH, camelcase_to_snakecase +from .splits import Split, SplitDict, SplitGenerator, SplitInfo +from .streaming import extend_dataset_builder_for_streaming +from .table import CastError +from .utils import logging +from .utils import tqdm as hf_tqdm +from .utils._filelock import FileLock +from .utils.file_utils import is_remote_url +from .utils.info_utils import VerificationMode, get_size_checksum_dict, 
verify_checksums, verify_splits +from .utils.py_utils import ( + classproperty, + convert_file_size_to_int, + has_sufficient_disk_space, + iflatmap_unordered, + map_nested, + memoize, + size_str, + temporary_assignment, +) +from .utils.sharding import _number_of_shards_in_gen_kwargs, _split_gen_kwargs +from .utils.track import tracked_list + + +if TYPE_CHECKING: + from .load import DatasetModule + + +logger = logging.get_logger(__name__) + + +class InvalidConfigName(ValueError): + pass + + +@dataclass +class BuilderConfig: + """Base class for `DatasetBuilder` data configuration. + + `DatasetBuilder` subclasses with data configuration options should subclass + `BuilderConfig` and add their own properties. + + Attributes: + name (`str`, defaults to `default`): + The name of the configuration. + version (`Version` or `str`, defaults to `0.0.0`): + The version of the configuration. + data_dir (`str`, *optional*): + Path to the directory containing the source data. + data_files (`str` or `Sequence` or `Mapping`, *optional*): + Path(s) to source data file(s). + description (`str`, *optional*): + A human description of the configuration. + """ + + name: str = "default" + version: Optional[Union[utils.Version, str]] = utils.Version("0.0.0") + data_dir: Optional[str] = None + data_files: Optional[Union[DataFilesDict, DataFilesPatternsDict]] = None + description: Optional[str] = None + + def __post_init__(self): + # The config name is used to name the cache directory. + for invalid_char in INVALID_WINDOWS_CHARACTERS_IN_PATH: + if invalid_char in self.name: + raise InvalidConfigName( + f"Bad characters from black list '{INVALID_WINDOWS_CHARACTERS_IN_PATH}' found in '{self.name}'. " + f"They could create issues when creating a directory for this config on Windows filesystem." 
+ ) + if self.data_files is not None and not isinstance(self.data_files, (DataFilesDict, DataFilesPatternsDict)): + raise ValueError(f"Expected a DataFilesDict in data_files but got {self.data_files}") + + def __eq__(self, o): + # we need to override the default dataclass __eq__ since it doesn't check for + # other attributes that the ones of the signature. + if set(self.__dict__.keys()) != set(o.__dict__.keys()): + return False + return all((k, getattr(self, k)) == (k, getattr(o, k)) for k in self.__dict__.keys()) + + def create_config_id( + self, + config_kwargs: dict, + custom_features: Optional[Features] = None, + ) -> str: + """ + The config id is used to build the cache directory. + By default it is equal to the config name. + However the name of a config is not sufficient to have a unique identifier for the dataset being generated + since it doesn't take into account: + - the config kwargs that can be used to overwrite attributes + - the custom features used to write the dataset + - the data_files for json/text/csv/pandas datasets + + Therefore the config id is just the config name with an optional suffix based on these. + """ + # Possibly add a suffix to the name to handle custom features/data_files/config_kwargs + suffix: Optional[str] = None + config_kwargs_to_add_to_suffix = config_kwargs.copy() + # name and version are already used to build the cache directory + config_kwargs_to_add_to_suffix.pop("name", None) + config_kwargs_to_add_to_suffix.pop("version", None) + # data dir handling (when specified it points to the manually downloaded data): + # it was previously ignored before the introduction of config id because we didn't want + # to change the config name. Now it's fine to take it into account for the config id. 
+ # config_kwargs_to_add_to_suffix.pop("data_dir", None) + if "data_dir" in config_kwargs_to_add_to_suffix: + if config_kwargs_to_add_to_suffix["data_dir"] is None: + config_kwargs_to_add_to_suffix.pop("data_dir", None) + else: + # canonicalize the data dir to avoid two paths to the same location having different + # hashes + data_dir = config_kwargs_to_add_to_suffix["data_dir"] + data_dir = os.path.normpath(data_dir) + config_kwargs_to_add_to_suffix["data_dir"] = data_dir + if config_kwargs_to_add_to_suffix: + # we don't care about the order of the kwargs + config_kwargs_to_add_to_suffix = { + k: config_kwargs_to_add_to_suffix[k] for k in sorted(config_kwargs_to_add_to_suffix) + } + if all(isinstance(v, (str, bool, int, float)) for v in config_kwargs_to_add_to_suffix.values()): + suffix = ",".join( + str(k) + "=" + urllib.parse.quote_plus(str(v)) for k, v in config_kwargs_to_add_to_suffix.items() + ) + if len(suffix) > 32: # hash if too long + suffix = Hasher.hash(config_kwargs_to_add_to_suffix) + else: + suffix = Hasher.hash(config_kwargs_to_add_to_suffix) + + if custom_features is not None: + m = Hasher() + if suffix: + m.update(suffix) + m.update(custom_features) + suffix = m.hexdigest() + + if suffix: + config_id = self.name + "-" + suffix + if len(config_id) > config.MAX_DATASET_CONFIG_ID_READABLE_LENGTH: + config_id = self.name + "-" + Hasher.hash(suffix) + return config_id + else: + return self.name + + def _resolve_data_files(self, base_path: str, download_config: DownloadConfig) -> None: + if isinstance(self.data_files, DataFilesPatternsDict): + base_path = xjoin(base_path, self.data_dir) if self.data_dir else base_path + self.data_files = self.data_files.resolve(base_path, download_config) + + +class DatasetBuilder: + """Abstract base class for all datasets. + + `DatasetBuilder` has 3 key methods: + + - [`DatasetBuilder.info`]: Documents the dataset, including feature + names, types, shapes, version, splits, citation, etc. 
+ - [`DatasetBuilder.download_and_prepare`]: Downloads the source data + and writes it to disk. + - [`DatasetBuilder.as_dataset`]: Generates a [`Dataset`]. + + Some `DatasetBuilder`s expose multiple variants of the + dataset by defining a [`BuilderConfig`] subclass and accepting a + config object (or name) on construction. Configurable datasets expose a + pre-defined set of configurations in [`DatasetBuilder.builder_configs`]. + + Args: + cache_dir (`str`, *optional*): + Directory to cache data. Defaults to `"~/.cache/huggingface/datasets"`. + dataset_name (`str`, *optional*): + Name of the dataset, if different from the builder name. Useful for packaged builders + like csv, imagefolder, audiofolder, etc. to reflect the difference between datasets + that use the same packaged builder. + config_name (`str`, *optional*): + Name of the dataset configuration. + It affects the data generated on disk. Different configurations will have their own subdirectories and + versions. + If not provided, the default configuration is used (if it exists). + + + + Parameter `name` was renamed to `config_name`. + + + hash (`str`, *optional*): + Hash specific to the dataset builder code. Used to update the caching directory when the + dataset builder code is updated (to avoid reusing old data). + The typical caching directory (defined in `self._relative_data_dir`) is `name/version/hash/`. + base_path (`str`, *optional*): + Base path for relative paths that are used to download files. + This can be a remote URL. + features ([`Features`], *optional*): + Features types to use with this dataset. + It can be used to change the [`Features`] types of a dataset, for example. + token (`str` or `bool`, *optional*): + String or boolean to use as Bearer token for remote files on the + Datasets Hub. If `True`, will get token from `"~/.huggingface"`. + repo_id (`str`, *optional*): + ID of the dataset repository. 
+ Used to distinguish builders with the same name but not coming from the same namespace, for example "rajpurkar/squad" + and "lhoestq/squad" repo IDs. In the latter, the builder name would be "lhoestq___squad". + data_files (`str` or `Sequence` or `Mapping`, *optional*): + Path(s) to source data file(s). + For builders like "csv" or "json" that need the user to specify data files. They can be either + local or remote files. For convenience, you can use a `DataFilesDict`. + data_dir (`str`, *optional*): + Path to directory containing source data file(s). + Use only if `data_files` is not passed, in which case it is equivalent to passing + `os.path.join(data_dir, "**")` as `data_files`. + For builders that require manual download, it must be the path to the local directory containing the + manually downloaded data. + storage_options (`dict`, *optional*): + Key/value pairs to be passed on to the dataset file-system backend, if any. + writer_batch_size (`int`, *optional*): + Batch size used by the ArrowWriter. + It defines the number of samples that are kept in memory before writing them + and also the length of the arrow chunks. + None means that the ArrowWriter will use its default value. + **config_kwargs (additional keyword arguments): Keyword arguments to be passed to the corresponding builder + configuration class, set on the class attribute [`DatasetBuilder.BUILDER_CONFIG_CLASS`]. The builder + configuration class is [`BuilderConfig`] or a subclass of it. + """ + + # Default version + VERSION = None # Default version set in BuilderConfig + + # Class for the builder config. + BUILDER_CONFIG_CLASS = BuilderConfig + + # Named configurations that modify the data generated by download_and_prepare. 
# Class-level list of the pre-defined BuilderConfig objects of this builder (empty by default)
BUILDER_CONFIGS = []

# Optional default config name to be used when name is None
DEFAULT_CONFIG_NAME = None

# Default batch size used by the ArrowWriter
# It defines the number of samples that are kept in memory before writing them
# and also the length of the arrow chunks
# None means that the ArrowWriter will use its default value
DEFAULT_WRITER_BATCH_SIZE = None

def __init__(
    self,
    cache_dir: Optional[str] = None,
    dataset_name: Optional[str] = None,
    config_name: Optional[str] = None,
    hash: Optional[str] = None,
    base_path: Optional[str] = None,
    info: Optional[DatasetInfo] = None,
    features: Optional[Features] = None,
    token: Optional[Union[bool, str]] = None,
    repo_id: Optional[str] = None,
    data_files: Optional[Union[str, list, dict, DataFilesDict]] = None,
    data_dir: Optional[str] = None,
    storage_options: Optional[dict] = None,
    writer_batch_size: Optional[int] = None,
    config_id: Optional[str] = None,
    **config_kwargs,
):
    """Initialize the builder.

    In order, this: stores the basic attributes, turns `data_files` into a
    `DataFilesDict` when needed, creates the builder config and its config id,
    fills in the `DatasetInfo`, computes the cache directories and, for a local
    cache, reloads the info from an existing cache directory (or removes that
    directory when it is empty), and finally enables streaming on the builder.

    Args:
        info ([`DatasetInfo`], *optional*): Info to use instead of the one returned by `self._info()`.
        config_id (`str`, *optional*): Config id to use instead of the computed one.

    The remaining parameters are described in the class docstring.
    """
    # DatasetBuilder name
    self.name: str = camelcase_to_snakecase(self.__module__.split(".")[-1])
    self.hash: Optional[str] = hash
    self.base_path = base_path
    self.token = token
    self.repo_id = repo_id
    self.storage_options = storage_options or {}
    self.dataset_name = camelcase_to_snakecase(dataset_name) if dataset_name else self.name
    self._writer_batch_size = writer_batch_size or self.DEFAULT_WRITER_BATCH_SIZE

    # Raw patterns (str/list/dict) are sanitized and resolved into a DataFilesDict
    if data_files is not None and not isinstance(data_files, DataFilesDict):
        data_files = DataFilesDict.from_patterns(
            sanitize_patterns(data_files),
            base_path=base_path,
            download_config=DownloadConfig(token=token, storage_options=self.storage_options),
        )

    # Prepare config: DatasetConfig contains name, version and description but can be extended by each dataset
    # `features` is only forwarded when the config class accepts it
    if "features" in inspect.signature(self.BUILDER_CONFIG_CLASS.__init__).parameters and features is not None:
        config_kwargs["features"] = features
    if data_files is not None:
        config_kwargs["data_files"] = data_files
    if data_dir is not None:
        config_kwargs["data_dir"] = data_dir
    self.config_kwargs = config_kwargs
    self.config, self.config_id = self._create_builder_config(
        config_name=config_name,
        custom_features=features,
        config_id=config_id,
        **config_kwargs,
    )

    # prepare info: DatasetInfo are a standardized dataclass across all datasets
    # Prefill datasetinfo
    if info is None:
        info = self._info()
    info.builder_name = self.name
    info.dataset_name = self.dataset_name
    info.config_name = self.config.name
    info.version = self.config.version
    self.info = info
    # update info with user specified infos
    if features is not None:
        self.info.features = features

    # Prepare data dirs:
    # cache_dir can be a remote bucket on GCS or S3
    self._cache_dir_root = str(cache_dir or config.HF_DATASETS_CACHE)
    # `~` is only expanded for local paths
    self._cache_dir_root = (
        self._cache_dir_root if is_remote_url(self._cache_dir_root) else os.path.expanduser(self._cache_dir_root)
    )
    # Downloads go under the user-provided cache_dir when one is given, else to the default downloads path
    self._cache_downloaded_dir = (
        posixpath.join(self._cache_dir_root, config.DOWNLOADED_DATASETS_DIR)
        if cache_dir
        else str(config.DOWNLOADED_DATASETS_PATH)
    )
    self._cache_downloaded_dir = (
        self._cache_downloaded_dir
        if is_remote_url(self._cache_downloaded_dir)
        else os.path.expanduser(self._cache_downloaded_dir)
    )

    # In case there exists a legacy cache directory
    self._legacy_relative_data_dir = None

    self._cache_dir = self._build_cache_dir()
    if not is_remote_url(self._cache_dir_root):
        os.makedirs(self._cache_dir_root, exist_ok=True)
        # One lock file per cache dir, stored at the root of the cache
        lock_path = os.path.join(
            self._cache_dir_root, Path(self._cache_dir).as_posix().replace("/", "_") + ".lock"
        )
        with FileLock(lock_path):
            if os.path.exists(self._cache_dir):  # check if data exist
                if len(os.listdir(self._cache_dir)) > 0:
                    if os.path.exists(os.path.join(self._cache_dir, config.DATASET_INFO_FILENAME)):
                        logger.debug("Overwrite dataset info from restored data version if exists.")
                        self.info = DatasetInfo.from_directory(self._cache_dir)
                else:  # dir exists but no data, remove the empty dir as data aren't available anymore
                    logger.warning(
                        f"Old caching folder {self._cache_dir} for dataset {self.dataset_name} exists but no data were found. Removing it. "
                    )
                    os.rmdir(self._cache_dir)

    # Store in the cache by default unless the user specifies a custom output_dir to download_and_prepare
    self._output_dir = self._cache_dir
    self._fs: fsspec.AbstractFileSystem = fsspec.filesystem("file")

    # Set download manager
    self.dl_manager = None

    # Set to True by "datasets-cli test" to generate file checksums for (deprecated) dataset_infos.json independently of verification_mode value.
    self._record_infos = False

    # Set in `.download_and_prepare` once the format of the generated dataset is known
    self._file_format = None

    # Enable streaming (e.g. it patches "open" to work with remote files)
    extend_dataset_builder_for_streaming(self)

def __getstate__(self):
    """Return the instance ``__dict__`` as the pickled state."""
    return self.__dict__

def __setstate__(self, d):
    """Restore the instance ``__dict__`` from `d` and re-enable streaming."""
    self.__dict__ = d
    # Re-enable streaming, since patched functions are not kept when pickling
    extend_dataset_builder_for_streaming(self)

# Must be set for datasets that use 'data_dir' functionality - the ones
# that require users to do additional steps to download the data
# (this is usually due to some external regulations / rules).
# This field should contain a string with user instructions, including
# the list of files that should be present. It will be
# displayed in the dataset documentation.
@property
def manual_download_instructions(self) -> Optional[str]:
    """Instructions for manually downloading the data; `None` in this base implementation."""
    return None

def _check_legacy_cache(self) -> Optional[str]:
    """Check for the old cache directory template {cache_dir}/{namespace}___{builder_name} from 2.13"""
    # Only builders from the `datasets` package, with a local cache and the "default" config, are concerned
    if not (
        self.__module__.startswith("datasets.")
        and not is_remote_url(self._cache_dir_root)
        and self.config.name == "default"
    ):
        return None

    from .packaged_modules import _PACKAGED_DATASETS_MODULES

    has_namespace = self.repo_id and self.repo_id.count("/") > 0
    namespace = self.repo_id.split("/")[0] if has_namespace else None
    legacy_config_name = self.repo_id.replace("/", "--") if self.repo_id is not None else self.dataset_name
    # keep the suffix of the current config id (everything after the config name)
    legacy_config_id = legacy_config_name + self.config_id[len(self.config.name) :]
    module_hash = _PACKAGED_DATASETS_MODULES.get(self.name, "missing")[1]
    top_level_dir = self.dataset_name if namespace is None else f"{namespace}___{self.dataset_name}"
    legacy_relative_data_dir = posixpath.join(top_level_dir, legacy_config_id, "0.0.0", module_hash)
    if os.path.isdir(posixpath.join(self._cache_dir_root, legacy_relative_data_dir)):
        return legacy_relative_data_dir
    return None

def _check_legacy_cache2(self, dataset_module: "DatasetModule") -> Optional[str]:
    """Check for the old cache directory template {cache_dir}/{namespace}___{dataset_name}/{config_name}-xxx from 2.14 and 2.15"""
    # Only builders from the `datasets` package, with a local cache and no config kwargs
    # other than data_files/data_dir, are concerned
    if not (
        self.__module__.startswith("datasets.")
        and not is_remote_url(self._cache_dir_root)
        and not (set(self.config_kwargs) - {"data_files", "data_dir"})
    ):
        return None

    from .packaged_modules import _PACKAGED_DATASETS_MODULES_2_15_HASHES
    from .utils._dill import Pickler

    def update_hash_with_config_parameters(base_hash: str, config_parameters: dict) -> str:
        """
        Used to update hash of packaged modules which is used for creating unique cache directories to reflect
        different config parameters which are passed in metadata from readme.
        """
        params_to_exclude = {"config_name", "version", "description"}
        params_to_add_to_hash = {
            param: value for param, value in sorted(config_parameters.items()) if param not in params_to_exclude
        }
        m = Hasher()
        m.update(base_hash)
        m.update(params_to_add_to_hash)
        return m.hexdigest()

    has_namespace = self.repo_id and self.repo_id.count("/") > 0
    namespace = self.repo_id.split("/")[0] if has_namespace else None
    # NOTE(review): the extent of this `with` block was reconstructed from whitespace-collapsed
    # text; confirm that only the config id hash is computed under the legacy pickling patch.
    with patch.object(Pickler, "_legacy_no_dict_keys_sorting", True):
        legacy_config_id = self.config.name + "-" + Hasher.hash({"data_files": self.config.data_files})
    module_hash = _PACKAGED_DATASETS_MODULES_2_15_HASHES.get(self.name, "missing")
    metadata_configs = dataset_module.builder_configs_parameters.metadata_configs
    if metadata_configs and self.config.name in metadata_configs:
        module_hash = update_hash_with_config_parameters(module_hash, metadata_configs[self.config.name])
    top_level_dir = self.dataset_name if namespace is None else f"{namespace}___{self.dataset_name}"
    legacy_relative_data_dir = posixpath.join(top_level_dir, legacy_config_id, "0.0.0", module_hash)
    if os.path.isdir(posixpath.join(self._cache_dir_root, legacy_relative_data_dir)):
        return legacy_relative_data_dir
    return None
def _create_builder_config(
    self, config_name=None, custom_features=None, config_id=None, **config_kwargs
) -> tuple[BuilderConfig, str]:
    """Create and validate BuilderConfig object as well as a unique config id for this config.

    config_kwargs override the defaults kwargs in config.

    Args:
        config_name (`str`, *optional*): Name of a pre-defined config, or name given to a config created on the fly.
        custom_features (*optional*): Features forwarded to `create_config_id`.
        config_id (`str`, *optional*): Config id to use instead of the computed one.
        **config_kwargs: Attributes used to create the config, or to override those of a pre-defined one.

    Returns:
        `tuple[BuilderConfig, str]`: The builder config and its config id.

    Raises:
        ValueError: If there are multiple builder configs and neither `config_name`, `DEFAULT_CONFIG_NAME`
            nor `config_kwargs` is given, if `config_name` or `DEFAULT_CONFIG_NAME` doesn't match any
            pre-defined config, if a config kwarg is not an attribute of the selected config, if the
            config has no name, if a modified config reuses the name of a pre-defined one, or if a
            non-custom config has no version.
    """
    builder_config = None

    # try default config
    if config_name is None and self.BUILDER_CONFIGS:
        if self.DEFAULT_CONFIG_NAME is not None:
            builder_config = self.builder_configs.get(self.DEFAULT_CONFIG_NAME)
            if builder_config is None:
                # Fail with an explicit message instead of an AttributeError on `None.name` below
                raise ValueError(
                    f"DEFAULT_CONFIG_NAME '{self.DEFAULT_CONFIG_NAME}' not found. "
                    f"Available: {list(self.builder_configs.keys())}"
                )
            logger.info(f"No config specified, defaulting to: {self.dataset_name}/{builder_config.name}")
        else:
            if len(self.BUILDER_CONFIGS) > 1:
                # with config_kwargs, a new config is created on the fly further down
                if not config_kwargs:
                    example_of_usage = (
                        f"load_dataset('{self.repo_id or self.dataset_name}', '{self.BUILDER_CONFIGS[0].name}')"
                    )
                    raise ValueError(
                        "Config name is missing."
                        f"\nPlease pick one among the available configs: {list(self.builder_configs.keys())}"
                        + f"\nExample of usage:\n\t`{example_of_usage}`"
                    )
            else:
                builder_config = self.BUILDER_CONFIGS[0]
                logger.info(
                    f"No config specified, defaulting to the single config: {self.dataset_name}/{builder_config.name}"
                )

    # try to get config by name
    if isinstance(config_name, str):
        builder_config = self.builder_configs.get(config_name)
        if builder_config is None and self.BUILDER_CONFIGS:
            raise ValueError(
                f"BuilderConfig '{config_name}' not found. Available: {list(self.builder_configs.keys())}"
            )

    # if not using an existing config, then create a new config on the fly
    if not builder_config:
        if config_name is not None:
            config_kwargs["name"] = config_name
        elif self.DEFAULT_CONFIG_NAME and not config_kwargs:
            # Use DEFAULT_CONFIG_NAME only if no config_kwargs are passed
            config_kwargs["name"] = self.DEFAULT_CONFIG_NAME
        if "version" not in config_kwargs and hasattr(self, "VERSION") and self.VERSION:
            config_kwargs["version"] = self.VERSION
        builder_config = self.BUILDER_CONFIG_CLASS(**config_kwargs)

    # otherwise use the config_kwargs to overwrite the attributes
    else:
        # copy first so the pre-defined config object is never modified
        builder_config = copy.deepcopy(builder_config) if config_kwargs else builder_config
        for key, value in config_kwargs.items():
            if value is not None:
                if not hasattr(builder_config, key):
                    raise ValueError(f"BuilderConfig {builder_config} doesn't have a '{key}' key.")
                setattr(builder_config, key, value)

    if not builder_config.name:
        raise ValueError(f"BuilderConfig must have a name, got {builder_config.name}")

    # resolve data files if needed
    builder_config._resolve_data_files(
        base_path=self.base_path,
        download_config=DownloadConfig(token=self.token, storage_options=self.storage_options),
    )

    # compute the config id that is going to be used for caching
    if config_id is None:
        config_id = builder_config.create_config_id(
            config_kwargs,
            custom_features=custom_features,
        )
    is_custom = (config_id not in self.builder_configs) and config_id != "default"
    if is_custom:
        logger.info(f"Using custom data configuration {config_id}")
    else:
        if (
            builder_config.name in self.builder_configs
            and builder_config != self.builder_configs[builder_config.name]
        ):
            raise ValueError(
                "Cannot name a custom BuilderConfig the same as an available "
                f"BuilderConfig. Change the name. Available BuilderConfigs: {list(self.builder_configs.keys())}"
            )
        if not builder_config.version:
            raise ValueError(f"BuilderConfig {builder_config.name} must have a version")

    return builder_config, config_id
@classproperty
@classmethod
@memoize()
def builder_configs(cls) -> dict[str, BuilderConfig]:
    """Dictionary of pre-defined configurations for this builder class."""
    names = [builder_config.name for builder_config in cls.BUILDER_CONFIGS]
    configs_by_name = dict(zip(names, cls.BUILDER_CONFIGS))
    # duplicated names collapse into a single key, so the sizes differ
    if len(configs_by_name) != len(cls.BUILDER_CONFIGS):
        raise ValueError(f"Names in BUILDER_CONFIGS must not be duplicated. Got {names}")
    return configs_by_name

@property
def cache_dir(self):
    """The cache directory of this builder (``self._cache_dir``)."""
    return self._cache_dir

def _use_legacy_cache_dir_if_possible(self, dataset_module: "DatasetModule"):
    """Switch the cache and output directories to a legacy cache directory when one is found."""
    # Check for the legacy cache directory template (datasets<3.0.0)
    legacy_dir = self._check_legacy_cache2(dataset_module)
    if not legacy_dir:
        legacy_dir = self._check_legacy_cache()
    self._legacy_relative_data_dir = legacy_dir or None
    self._cache_dir = self._build_cache_dir()
    self._output_dir = self._cache_dir

def _relative_data_dir(self, with_version=True, with_hash=True) -> str:
    """Relative path of this dataset in cache_dir:
    Will be:
        self.dataset_name/self.config.version/self.hash/
    or if a repo_id with a namespace has been specified:
        self.namespace___self.dataset_name/self.config.version/self.hash/
    If any of these element is missing or if ``with_version=False`` the corresponding subfolders are dropped.
    """
    use_legacy = self._legacy_relative_data_dir is not None and with_version and with_hash
    if use_legacy:
        return self._legacy_relative_data_dir

    has_namespace = self.repo_id and self.repo_id.count("/") > 0
    if has_namespace:
        namespace = self.repo_id.split("/")[0]
        top_level_dir = f"{namespace}___{self.dataset_name}"
    else:
        top_level_dir = self.dataset_name

    path_parts = [top_level_dir, self.config_id]
    if with_version:
        path_parts.append(str(self.config.version))
    if with_hash and self.hash and isinstance(self.hash, str):
        path_parts.append(self.hash)
    return posixpath.join(*path_parts)
def _build_cache_dir(self):
    """Return the data directory for the current version.

    For a local cache, a warning is logged when the highest version found on disk
    differs from the version of the current config.
    """
    builder_data_dir = posixpath.join(self._cache_dir_root, self._relative_data_dir(with_version=False))
    version_data_dir = posixpath.join(self._cache_dir_root, self._relative_data_dir(with_version=True))

    # Check and warn if other versions exist
    if is_remote_url(builder_data_dir):
        return version_data_dir

    versions_on_disk = []
    if os.path.exists(builder_data_dir):
        for entry in os.listdir(builder_data_dir):
            try:
                parsed_version = utils.Version(entry)
            except ValueError:  # Invalid version (ex: incomplete data dir)
                continue
            versions_on_disk.append((parsed_version, entry))
        versions_on_disk.sort(reverse=True)

    if versions_on_disk:
        latest_on_disk = versions_on_disk[0][0]
        if latest_on_disk != self.config.version:
            logger.warning(
                f"Found a different version {str(latest_on_disk)} of dataset {self.dataset_name} in "
                f"cache_dir {self._cache_dir_root}. Using currently defined version "
                f"{str(self.config.version)}."
            )

    return version_data_dir

@abc.abstractmethod
def _info(self) -> DatasetInfo:
    """Construct the DatasetInfo object. See `DatasetInfo` for details.

    Warning: This function is only called once and the result is cached for all
    following .info() calls.

    Returns:
        info: (DatasetInfo) The dataset information
    """
    raise NotImplementedError
@classmethod
def get_imported_module_dir(cls):
    """Return the path of the module of this class or subclass."""
    return os.path.dirname(inspect.getfile(inspect.getmodule(cls)))

def _rename(self, src: str, dst: str):
    """Rename `src` to `dst` on the builder's filesystem (``self._fs``)."""
    rename(self._fs, src, dst)

def download_and_prepare(
    self,
    output_dir: Optional[str] = None,
    download_config: Optional[DownloadConfig] = None,
    download_mode: Optional[Union[DownloadMode, str]] = None,
    verification_mode: Optional[Union[VerificationMode, str]] = None,
    dl_manager: Optional[DownloadManager] = None,
    base_path: Optional[str] = None,
    file_format: str = "arrow",
    max_shard_size: Optional[Union[int, str]] = None,
    num_proc: Optional[int] = None,
    storage_options: Optional[dict] = None,
    **download_and_prepare_kwargs,
):
    """Downloads and prepares dataset for reading.

    Args:
        output_dir (`str`, *optional*):
            Output directory for the dataset.
            Default to this builder's `cache_dir`, which is inside `~/.cache/huggingface/datasets` by default.
        download_config (`DownloadConfig`, *optional*):
            Specific download configuration parameters.
        download_mode ([`DownloadMode`] or `str`, *optional*):
            Select the download/generate mode, default to `REUSE_DATASET_IF_EXISTS`.
        verification_mode ([`VerificationMode`] or `str`, defaults to `BASIC_CHECKS`):
            Verification mode determining the checks to run on the downloaded/processed dataset information (checksums/size/splits/...).
        dl_manager (`DownloadManager`, *optional*):
            Specific `DownloadManager` to use.
        base_path (`str`, *optional*):
            Base path for relative paths that are used to download files. This can be a remote url.
            If not specified, the value of the `base_path` attribute (`self.base_path`) will be used instead.
        file_format (`str`, *optional*):
            Format of the data files in which the dataset will be written.
            Supported formats: "arrow", "parquet". Default to "arrow" format.
            If the format is "parquet", then image and audio data are embedded into the Parquet files instead of pointing to local files.
        max_shard_size (`Union[str, int]`, *optional*):
            Maximum number of bytes written per shard, default is "500MB".
            The size is based on uncompressed data size, so in practice your shard files may be smaller than
            `max_shard_size` thanks to Parquet compression for example.
        num_proc (`int`, *optional*, defaults to `None`):
            Number of processes when downloading and generating the dataset locally.
            Multiprocessing is disabled by default.
        storage_options (`dict`, *optional*):
            Key/value pairs to be passed on to the caching file-system backend, if any.
        **download_and_prepare_kwargs (additional keyword arguments): Keyword arguments.

    Raises:
        ValueError: If `file_format` is neither "arrow" nor "parquet".
        RuntimeError: If the output directory is the root of the filesystem.
        OSError: If there is not enough disk space for a local output directory.

    Example:

    Download and prepare the dataset as Arrow files that can be loaded as a Dataset using `builder.as_dataset()`:

    ```py
    >>> from datasets import load_dataset_builder
    >>> builder = load_dataset_builder("cornell-movie-review-data/rotten_tomatoes")
    >>> builder.download_and_prepare()
    ```

    Download and prepare the dataset as sharded Parquet files locally:

    ```py
    >>> from datasets import load_dataset_builder
    >>> builder = load_dataset_builder("cornell-movie-review-data/rotten_tomatoes")
    >>> builder.download_and_prepare("./output_dir", file_format="parquet")
    ```

    Download and prepare the dataset as sharded Parquet files in a cloud storage:

    ```py
    >>> from datasets import load_dataset_builder
    >>> storage_options = {"key": aws_access_key_id, "secret": aws_secret_access_key}
    >>> builder = load_dataset_builder("cornell-movie-review-data/rotten_tomatoes")
    >>> builder.download_and_prepare("s3://my-bucket/my_rotten_tomatoes", storage_options=storage_options, file_format="parquet")
    ```
    """
    output_dir = output_dir if output_dir is not None else self._cache_dir
    # output_dir can be a remote bucket on GCS or S3
    fs, output_dir = url_to_fs(output_dir, **(storage_options or {}))
    self._fs = fs
    # For a remote filesystem, put the protocol back in front of the path
    self._output_dir = output_dir if not is_remote_filesystem(self._fs) else self._fs.unstrip_protocol(output_dir)

    download_mode = DownloadMode(download_mode or DownloadMode.REUSE_DATASET_IF_EXISTS)
    verification_mode = VerificationMode(verification_mode or VerificationMode.BASIC_CHECKS)
    base_path = base_path if base_path is not None else self.base_path

    if file_format is not None and file_format not in ["arrow", "parquet"]:
        raise ValueError(f"Unsupported file_format: {file_format}. Expected 'arrow' or 'parquet'")
    self._file_format = file_format

    if self._fs._strip_protocol(self._output_dir) == "":
        # We don't support the root directory, because it has no dirname,
        # and we need a dirname to use a .incomplete directory
        # when the dataset is being written
        raise RuntimeError(
            f"Unable to download and prepare the dataset at the root {self._output_dir}. "
            f"Please specify a subdirectory, e.g. '{self._output_dir + self.dataset_name}'"
        )

    if dl_manager is None:
        if download_config is None:
            download_config = DownloadConfig(
                cache_dir=self._cache_downloaded_dir,
                force_download=download_mode == DownloadMode.FORCE_REDOWNLOAD,
                force_extract=download_mode == DownloadMode.FORCE_REDOWNLOAD,
                use_etag=False,
                num_proc=num_proc,
                token=self.token,
                storage_options=self.storage_options,
            )  # We don't use etag for data files to speed up the process

        dl_manager = DownloadManager(
            dataset_name=self.dataset_name,
            download_config=download_config,
            data_dir=self.config.data_dir,
            base_path=base_path,
            record_checksums=(self._record_infos or verification_mode == VerificationMode.ALL_CHECKS),
        )

    is_local = not is_remote_filesystem(self._fs)
    self.dl_manager = dl_manager

    # Prevent parallel local disk operations
    if is_local:
        # Create parent directory of the output_dir to put the lock file in there
        Path(self._output_dir).parent.mkdir(parents=True, exist_ok=True)
        lock_path = self._output_dir + "_builder.lock"

    # File locking only with local paths; no file locking on GCS or S3
    with FileLock(lock_path) if is_local else contextlib.nullcontext():
        # Check if the data already exists
        data_exists = self._fs.exists(posixpath.join(self._output_dir, config.DATASET_INFO_FILENAME))
        if data_exists and download_mode == DownloadMode.REUSE_DATASET_IF_EXISTS:
            logger.info(f"Found cached dataset {self.dataset_name} ({self._output_dir})")
            # We need to update the info in case some splits were added in the meantime
            # for example when calling load_dataset from multiple workers.
            self.info = self._load_info()
            self.download_post_processing_resources(dl_manager)
            return

        logger.info(f"Generating dataset {self.dataset_name} ({self._output_dir})")
        if is_local:  # if cache dir is local, check for available space
            if not has_sufficient_disk_space(
                self.info.size_in_bytes or 0, directory=Path(self._output_dir).parent
            ):
                raise OSError(
                    f"Not enough disk space. Needed: {size_str(self.info.size_in_bytes or 0)} (download: {size_str(self.info.download_size or 0)}, generated: {size_str(self.info.dataset_size or 0)}, post-processed: {size_str(self.info.post_processing_size or 0)})"
                )

        @contextlib.contextmanager
        def incomplete_dir(dirname):
            """Create temporary dir for dirname and rename on exit."""
            if not is_local:
                # Remote filesystem: write directly into the final directory
                self._fs.makedirs(dirname, exist_ok=True)
                yield dirname
            else:
                tmp_dir = dirname + ".incomplete"
                os.makedirs(tmp_dir, exist_ok=True)
                try:
                    yield tmp_dir
                    # Only reached when the body of the `with` completed without raising
                    if os.path.isdir(dirname):
                        shutil.rmtree(dirname)
                    # LocalFileSystem.mv does copy + rm, it is more efficient to simply rename a local directory
                    shutil.move(tmp_dir, dirname)
                finally:
                    # On failure the temporary directory is still there: remove it
                    if os.path.exists(tmp_dir):
                        shutil.rmtree(tmp_dir)

        # Log the sizes and destination right before the progress bar, so the user has the
        # information needed to cancel the download/preparation if needed.
        if self.info.size_in_bytes:
            logger.info(
                f"Downloading and preparing dataset {self.dataset_name}/{self.config.name} "
                f"(download: {size_str(self.info.download_size)}, generated: {size_str(self.info.dataset_size)}, "
                f"post-processed: {size_str(self.info.post_processing_size)}, "
                f"total: {size_str(self.info.size_in_bytes)}) to {self._output_dir}..."
            )
        else:
            _dest = self._fs._strip_protocol(self._output_dir) if is_local else self._output_dir
            logger.info(f"Downloading and preparing dataset {self.dataset_name}/{self.config.name} to {_dest}...")

        self._check_manual_download(dl_manager)

        # Create a tmp dir and rename to self._output_dir on successful exit.
        with incomplete_dir(self._output_dir) as tmp_output_dir:
            # Temporarily assign _output_dir to tmp_data_dir to avoid having to forward
            # it to every sub function.
            with temporary_assignment(self, "_output_dir", tmp_output_dir):
                # Optional arguments are only forwarded when explicitly set
                prepare_split_kwargs = {"file_format": file_format}
                if max_shard_size is not None:
                    prepare_split_kwargs["max_shard_size"] = max_shard_size
                if num_proc is not None:
                    prepare_split_kwargs["num_proc"] = num_proc
                self._download_and_prepare(
                    dl_manager=dl_manager,
                    verification_mode=verification_mode,
                    **prepare_split_kwargs,
                    **download_and_prepare_kwargs,
                )
                # Sync info
                self.info.dataset_size = sum(split.num_bytes for split in self.info.splits.values())
                self.info.download_checksums = dl_manager.get_recorded_sizes_checksums()
                if self.info.download_size is not None:
                    self.info.size_in_bytes = self.info.dataset_size + self.info.download_size
                # Save info
                self._save_info()

        # Download post processing resources
        self.download_post_processing_resources(dl_manager)

        logger.info(
            f"Dataset {self.dataset_name} downloaded and prepared to {self._output_dir}. "
            f"Subsequent calls will reuse this data."
        )
def _check_manual_download(self, dl_manager):
    """Raise if the dataset needs manually downloaded data and none was provided.

    Args:
        dl_manager: Download manager whose `manual_dir` is checked.

    Raises:
        ManualDownloadError: If `self.manual_download_instructions` is set
            and `dl_manager.manual_dir` is `None`.
    """
    if self.manual_download_instructions is not None and dl_manager.manual_dir is None:
        # NOTE(review): the whitespace inside this literal was reconstructed from
        # whitespace-collapsed text - confirm it against the original file.
        raise ManualDownloadError(
            textwrap.dedent(
                f"""\
                The dataset {self.dataset_name} with config {self.config.name} requires manual data.
                Please follow the manual download instructions:
                {self.manual_download_instructions}
                Manual data can be loaded with:
                datasets.load_dataset("{self.repo_id or self.dataset_name}", data_dir="")"""
            )
        )
def _download_and_prepare(self, dl_manager, verification_mode, **prepare_split_kwargs):
    """Downloads and prepares dataset for reading.

    This is the internal implementation to overwrite called when user calls
    `download_and_prepare`. It should download all required data and generate
    the pre-processed datasets files.

    Args:
        dl_manager ([`DownloadManager`]):
            `DownloadManager` used to download and cache data.
        verification_mode ([`VerificationMode`]):
            if `ALL_CHECKS`, perform all the verifications including checksums.
            if `BASIC_CHECKS`, do not perform checksums, only perform split tests.
            if `NO_CHECKS`, do not perform any verification.
        prepare_split_kwargs: Additional options, such as `file_format`, `max_shard_size`

    Raises:
        ValueError: If a split generator is named "all".
        OSError: If preparing a split raises an `OSError`; re-raised with the manual download instructions.
        DuplicatedKeysError: If preparing a split finds duplicated keys; re-raised with a hint.
    """
    # Generating data for all splits
    generated_splits = SplitDict(dataset_name=self.dataset_name)
    generators_kwargs = self._make_split_generators_kwargs(prepare_split_kwargs)
    split_generators = self._split_generators(dl_manager, **generators_kwargs)

    # Checksums verification
    if verification_mode == VerificationMode.ALL_CHECKS and dl_manager.record_checksums:
        recorded_checksums = dl_manager.get_recorded_sizes_checksums()
        verify_checksums(self.info.download_checksums, recorded_checksums, "dataset source files")

    # Build splits
    for split_generator in split_generators:
        if str(split_generator.split_info.name).lower() == "all":
            raise ValueError(
                "`all` is a special split keyword corresponding to the "
                "union of all splits, so cannot be used as key in "
                "._split_generator()."
            )

        logger.info(f"Generating {split_generator.split_info.name} split")
        generated_splits.add(split_generator.split_info)

        try:
            # Prepare split will record examples associated to the split
            self._prepare_split(split_generator, **prepare_split_kwargs)
        except OSError as err:
            instructions = self.manual_download_instructions or ""
            raise OSError("Cannot find data file. " + instructions + "\nOriginal error:\n" + str(err)) from None
        # If check_duplicates is set to True , then except DuplicatedKeysError
        except DuplicatedKeysError as err:
            raise DuplicatedKeysError(
                err.key,
                err.duplicate_key_indices,
                fix_msg=f"To avoid duplicate keys, please fix the dataset splits for {self.name}",
            ) from None
        dl_manager.manage_extracted_files()

    if verification_mode in (VerificationMode.BASIC_CHECKS, VerificationMode.ALL_CHECKS):
        verify_splits(self.info.splits, generated_splits)

    # Update the info object with the splits.
    self.info.splits = generated_splits
    self.info.download_size = dl_manager.downloaded_size
def download_post_processing_resources(self, dl_manager):
    """Download the post-processing resources that are missing from the output directory.

    For each split of `self.info.splits`, every resource returned by
    `self._post_processing_resources(split)` that doesn't exist yet in
    `self._output_dir` is downloaded and moved there. Nothing happens when
    there are no splits or no resources.

    Args:
        dl_manager: Download manager forwarded to `self._download_post_processing_resources`.

    Raises:
        NotImplementedError: If a resource is declared while `self._fs` is a remote filesystem.
        ValueError: If a resource file name contains a path separator.
    """
    for split in self.info.splits or []:
        for resource_name, resource_file_name in self._post_processing_resources(split).items():
            # Post processing resources are only handled on a local filesystem
            if is_remote_filesystem(self._fs):
                raise NotImplementedError(f"Post processing is not supported on filesystem {self._fs}")
            if os.sep in resource_file_name:
                raise ValueError(f"Resources shouldn't be in a sub-directory: {resource_file_name}")
            resource_path = os.path.join(self._output_dir, resource_file_name)
            if not os.path.exists(resource_path):
                downloaded_resource_path = self._download_post_processing_resources(
                    split, resource_name, dl_manager
                )
                if downloaded_resource_path:
                    logger.info(f"Downloaded post-processing resource {resource_name} as {resource_file_name}")
                    shutil.move(downloaded_resource_path, resource_path)

def _load_info(self) -> DatasetInfo:
    """Load the `DatasetInfo` stored in `self._output_dir`, using the storage options of `self._fs`."""
    return DatasetInfo.from_directory(self._output_dir, storage_options=self._fs.storage_options)
def _save_info(self):
    """Write `self.info` into the output directory.

    On a non-remote filesystem the write is guarded by a `FileLock` on
    `<output_dir>_info.lock`; on a remote filesystem no lock is taken.
    """
    if is_remote_filesystem(self._fs):
        # No file lock is used for remote filesystems: a no-op context stands in for it.
        info_lock = contextlib.nullcontext()
    else:
        info_lock = FileLock(self._output_dir + "_info.lock")

    with info_lock:
        self.info.write_to_directory(self._output_dir, storage_options=self._fs.storage_options)
def _build_single_dataset(
    self,
    split: Union[str, ReadInstruction, Split],
    run_post_process: bool,
    verification_mode: VerificationMode,
    in_memory: bool = False,
):
    """Build the dataset of one split and optionally post-process it (`as_dataset` for a single split).

    Args:
        split (`str`, `ReadInstruction` or `Split`):
            Split to build. Anything that is not a `ReadInstruction` is converted to a `Split`;
            the name `"all"` is expanded to the `+`-joined names of `self.info.splits`.
        run_post_process (`bool`):
            Whether to call `self._post_process` on the freshly built dataset.
        verification_mode ([`VerificationMode`]):
            Verification mode; only consulted by the (currently disabled) checksum verification below.
        in_memory (`bool`, defaults to `False`):
            Forwarded to `self._as_dataset`.

    Returns:
        The dataset returned by `self._as_dataset`, or the post-processed dataset when
        `self._post_process` returns one.

    Raises:
        ValueError: If a post-processing resource file name contains a path separator, or if the
            recorded post-processed features do not match the features of the post-processed dataset.
    """
    if not isinstance(split, ReadInstruction):
        split_name = str(split)
        if split_name == "all":
            split_name = "+".join(self.info.splits.keys())
        split = Split(split_name)

    # Build base dataset
    ds = self._as_dataset(split=split, in_memory=in_memory)
    if not run_post_process:
        return ds

    # Resources must live directly in the output directory.
    for resource_file_name in self._post_processing_resources(split).values():
        if os.sep in resource_file_name:
            raise ValueError(f"Resources shouldn't be in a sub-directory: {resource_file_name}")

    resources_paths = {}
    for resource_name, resource_file_name in self._post_processing_resources(split).items():
        resources_paths[resource_name] = os.path.join(self._output_dir, resource_file_name)

    post_processed = self._post_process(ds, resources_paths)
    if post_processed is None:
        # The builder has no post-processing: hand back the base dataset untouched.
        return ds
    ds = post_processed

    recorded_checksums = {
        resource_name: get_size_checksum_dict(resource_path)
        for resource_name, resource_path in resources_paths.items()
    }
    # NOTE(review): this flag is hard-coded to False, so the verification below never runs.
    record_checksums = False
    if verification_mode == VerificationMode.ALL_CHECKS and record_checksums:
        expected_checksums = None
        if self.info.post_processed is not None and self.info.post_processed.resources_checksums is not None:
            expected_checksums = self.info.post_processed.resources_checksums.get(split)
        verify_checksums(expected_checksums, recorded_checksums, "post processing resources")

    # Record the checksums of this split in the builder info, creating the containers on first use.
    if self.info.post_processed is None:
        self.info.post_processed = PostProcessedInfo()
    if self.info.post_processed.resources_checksums is None:
        self.info.post_processed.resources_checksums = {}
    self.info.post_processed.resources_checksums[str(split)] = recorded_checksums

    # Total size of every recorded resource, across all splits.
    post_processing_size = 0
    for split_checksums_dicts in self.info.post_processed.resources_checksums.values():
        for checksums_dict in split_checksums_dicts.values():
            post_processing_size += checksums_dict["num_bytes"]
    self.info.post_processing_size = post_processing_size

    if self.info.dataset_size is not None and self.info.download_size is not None:
        self.info.size_in_bytes = (
            self.info.dataset_size + self.info.download_size + self.info.post_processing_size
        )
    self._save_info()

    # Mirror the updated post-processing info onto the returned dataset.
    ds._info.post_processed = self.info.post_processed
    ds._info.post_processing_size = self.info.post_processing_size
    ds._info.size_in_bytes = self.info.size_in_bytes

    if self.info.post_processed.features is not None:
        if self.info.post_processed.features.type != ds.features.type:
            raise ValueError(
                f"Post-processed features info don't match the dataset:\nGot\n{self.info.post_processed.features}\nbut expected something like\n{ds.features}"
            )
        ds.info.features = self.info.post_processed.features

    return ds
def _as_streaming_dataset_single(
    self,
    splits_generator,
) -> IterableDataset:
    """Wrap the examples iterable of one split generator into an `IterableDataset`.

    Args:
        splits_generator: Split generator passed to `self._get_examples_iterable_for_split`;
            its `name` becomes the split of the returned dataset.

    Returns:
        `IterableDataset` carrying `self.info` and, when `self.repo_id` is set, a
        `{repo_id: token}` mapping.
    """
    ex_iterable = self._get_examples_iterable_for_split(splits_generator)
    # add auth to be able to access and decode audio/image files from private repositories.
    if self.repo_id:
        token_per_repo_id = {self.repo_id: self.token}
    else:
        token_per_repo_id = {}
    return IterableDataset(
        ex_iterable,
        info=self.info,
        split=splits_generator.name,
        token_per_repo_id=token_per_repo_id,
    )
@abc.abstractmethod
def _prepare_split(
    self,
    split_generator: SplitGenerator,
    file_format: str = "arrow",
    max_shard_size: Optional[Union[str, int]] = None,
    num_proc: Optional[int] = None,
    **kwargs,
):
    """Generate the examples of one split and record them on disk.

    Abstract hook: this base implementation only raises, subclasses provide the real work.

    Args:
        split_generator (`SplitGenerator`):
            Split generator to process.
        file_format (`str`, *optional*):
            Format of the data files in which the dataset will be written.
            Supported formats: "arrow", "parquet". Defaults to "arrow".
        max_shard_size (`Union[str, int]`, *optional*):
            Maximum number of bytes written per shard, default is "500MB".
            The size is based on uncompressed data size, so in practice the shard files may be
            smaller than `max_shard_size`, thanks to Parquet compression for example.
        num_proc (`int`, *optional*, defaults to `None`):
            Number of processes when downloading and generating the dataset locally.
            Multiprocessing is disabled by default.
        **kwargs:
            Additional kwargs forwarded from `_download_and_prepare`.

    Raises:
        NotImplementedError: Always, in this base implementation.
    """
    raise NotImplementedError
It expects subclasses to + implement generators of feature dictionaries across the dataset splits + (`_split_generators`). See the method docstrings for details. + """ + + @abc.abstractmethod + def _generate_examples(self, **kwargs): + """Default function generating examples for each `SplitGenerator`. + + This function preprocess the examples from the raw data to the preprocessed + dataset files. + This function is called once for each `SplitGenerator` defined in + `_split_generators`. The examples yielded here will be written on + disk. + + Args: + **kwargs (additional keyword arguments): + Arguments forwarded from the SplitGenerator.gen_kwargs + + Yields: + key: `str` or `int`, a unique deterministic example identification key. + * Unique: An error will be raised if two examples are yield with the + same key. + * Deterministic: When generating the dataset twice, the same example + should have the same key. + Good keys can be the image id, or line number if examples are extracted + from a text file. + The key will be hashed and sorted to shuffle examples deterministically, + such as generating the dataset multiple times keep examples in the + same order. + example: `dict`, a feature dictionary + ready to be encoded and written to disk. The example will be + encoded with `self.info.features.encode_example({...})`. 
+ """ + raise NotImplementedError() + + def _prepare_split( + self, + split_generator: SplitGenerator, + check_duplicate_keys: bool, + file_format="arrow", + num_proc: Optional[int] = None, + max_shard_size: Optional[Union[int, str]] = None, + ): + max_shard_size = convert_file_size_to_int(max_shard_size or config.MAX_SHARD_SIZE) + + if self.info.splits is not None: + split_info = self.info.splits[split_generator.name] + else: + split_info = split_generator.split_info + + SUFFIX = "-JJJJJ-SSSSS-of-NNNNN" + fname = f"{self.dataset_name}-{split_generator.name}{SUFFIX}.{file_format}" + fpath = posixpath.join(self._output_dir, fname) + + if num_proc and num_proc > 1: + num_input_shards = _number_of_shards_in_gen_kwargs(split_generator.gen_kwargs) + if num_input_shards <= 1: + logger.warning( + f"Setting num_proc from {num_proc} back to 1 for the {split_info.name} split to disable multiprocessing as it only contains one shard." + ) + num_proc = 1 + elif num_input_shards < num_proc: + logger.warning( + f"Setting num_proc from {num_proc} to {num_input_shards} for the {split_info.name} split as it only contains {num_input_shards} shards." 
+ ) + num_proc = num_input_shards + + pbar = hf_tqdm( + unit=" examples", + total=split_info.num_examples, + desc=f"Generating {split_info.name} split", + ) + + _prepare_split_args = { + "fpath": fpath, + "file_format": file_format, + "max_shard_size": max_shard_size, + "split_info": split_info, + "check_duplicate_keys": check_duplicate_keys, + } + + if num_proc is None or num_proc == 1: + result = None + gen_kwargs = split_generator.gen_kwargs + job_id = 0 + with pbar: + for job_id, done, content in self._prepare_split_single( + gen_kwargs=gen_kwargs, job_id=job_id, **_prepare_split_args + ): + if done: + result = content + else: + pbar.update(content) + # wrapping everything into lists for consistency with the multiprocessed code path + assert result is not None, "Failed to retrieve results from prepare_split" + examples_per_job, bytes_per_job, features_per_job, shards_per_job, shard_lengths_per_job = ( + [item] for item in result + ) + else: + kwargs_per_job = [ + {"gen_kwargs": gen_kwargs, "job_id": job_id, **_prepare_split_args} + for job_id, gen_kwargs in enumerate( + _split_gen_kwargs(split_generator.gen_kwargs, max_num_jobs=num_proc) + ) + ] + num_jobs = len(kwargs_per_job) + + examples_per_job = [None] * num_jobs + bytes_per_job = [None] * num_jobs + features_per_job = [None] * num_jobs + shards_per_job = [None] * num_jobs + shard_lengths_per_job = [None] * num_jobs + + with Pool(num_proc) as pool: + with pbar: + for job_id, done, content in iflatmap_unordered( + pool, self._prepare_split_single, kwargs_iterable=kwargs_per_job + ): + if done: + # the content is the result of the job + ( + examples_per_job[job_id], + bytes_per_job[job_id], + features_per_job[job_id], + shards_per_job[job_id], + shard_lengths_per_job[job_id], + ) = content + else: + # the content is the number of examples progress update + pbar.update(content) + + assert None not in examples_per_job, ( + f"Failed to retrieve results from prepare_split: result list {examples_per_job} still 
def _prepare_split_single(
    self,
    gen_kwargs: dict,
    fpath: str,
    file_format: str,
    max_shard_size: int,
    split_info: SplitInfo,
    check_duplicate_keys: bool,
    job_id: int,
) -> Iterable[tuple[int, bool, Union[int, tuple]]]:
    """Write the examples of one job to one or more shard files, yielding progress along the way.

    Args:
        gen_kwargs (`dict`):
            Keyword arguments passed to `self._generate_examples`.
        fpath (`str`):
            Shard path template; "SSSSS" and "JJJJJ" are replaced by the zero-padded
            shard id and job id.
        file_format (`str`):
            "parquet" selects `ParquetWriter` (with `embed_local_files=True`), anything
            else selects `ArrowWriter`.
        max_shard_size (`int`):
            A new shard is started once the current writer holds more bytes than this.
        split_info (`SplitInfo`):
            Its `name` is used as the writer's `hash_salt`.
        check_duplicate_keys (`bool`):
            Forwarded to the writer as `check_duplicates`.
        job_id (`int`):
            Identifier echoed back in every yielded tuple.

    Yields:
        `(job_id, False, num_examples)` progress updates, then a final
        `(job_id, True, (total_num_examples, total_num_bytes, features, num_shards, shard_lengths))`.

    Raises:
        DatasetGenerationError: If anything fails while generating or writing the examples.
    """
    generator = self._generate_examples(**gen_kwargs)
    writer_class = ParquetWriter if file_format == "parquet" else ArrowWriter
    embed_local_files = file_format == "parquet"
    shard_lengths = []
    total_num_examples, total_num_bytes = 0, 0

    shard_id = 0
    # Examples written since the last progress update was yielded.
    num_examples_progress_update = 0
    try:
        writer = writer_class(
            features=self.info.features,
            path=fpath.replace("SSSSS", f"{shard_id:05d}").replace("JJJJJ", f"{job_id:05d}"),
            writer_batch_size=self._writer_batch_size,
            hash_salt=split_info.name,
            check_duplicates=check_duplicate_keys,
            storage_options=self._fs.storage_options,
            embed_local_files=embed_local_files,
        )
        try:
            _time = time.time()
            for key, record in generator:
                # The current shard is full: finalize it and open the next one,
                # reusing the features of the writer that was just closed.
                if max_shard_size is not None and writer._num_bytes > max_shard_size:
                    num_examples, num_bytes = writer.finalize()
                    writer.close()
                    shard_lengths.append(num_examples)
                    total_num_examples += num_examples
                    total_num_bytes += num_bytes
                    shard_id += 1
                    writer = writer_class(
                        features=writer._features,
                        path=fpath.replace("SSSSS", f"{shard_id:05d}").replace("JJJJJ", f"{job_id:05d}"),
                        writer_batch_size=self._writer_batch_size,
                        hash_salt=split_info.name,
                        check_duplicates=check_duplicate_keys,
                        storage_options=self._fs.storage_options,
                        embed_local_files=embed_local_files,
                    )
                example = self.info.features.encode_example(record) if self.info.features is not None else record
                writer.write(example, key)
                num_examples_progress_update += 1
                # Throttle progress updates to one per refresh interval.
                if time.time() > _time + config.PBAR_REFRESH_TIME_INTERVAL:
                    _time = time.time()
                    yield job_id, False, num_examples_progress_update
                    num_examples_progress_update = 0
        finally:
            # Runs whether or not the loop above raised: flush the pending progress
            # count and finalize the last shard.
            yield job_id, False, num_examples_progress_update
            num_shards = shard_id + 1
            num_examples, num_bytes = writer.finalize()
            writer.close()
            shard_lengths.append(num_examples)
            total_num_examples += num_examples
            total_num_bytes += num_bytes
    except Exception as e:
        # Ignore the writer's error for no examples written to the file if this error was caused by the error in _generate_examples before the first example was yielded
        if isinstance(e, SchemaInferenceError) and e.__context__ is not None:
            e = e.__context__
        raise DatasetGenerationError("An error occurred while generating the dataset") from e

    # Final message: `done` is True and the content is the result of the job.
    yield job_id, True, (total_num_examples, total_num_bytes, writer._features, num_shards, shard_lengths)
def _prepare_split(
    self,
    split_generator: SplitGenerator,
    file_format: str = "arrow",
    num_proc: Optional[int] = None,
    max_shard_size: Optional[Union[str, int]] = None,
):
    """Generate one split from Arrow tables, write it as shards and rename the shards to their final names.

    Shards are first written under a temporary `-JJJJJ-SSSSS-of-NNNNN` name (job id, per-job
    shard id) by `_prepare_split_single`, either in this process or in a pool of `num_proc`
    workers, and are renamed once every job has reported its result.

    Args:
        split_generator (`SplitGenerator`):
            Split generator to process. Its `split_info` is updated in place with the number
            of examples and bytes, and with the shard lengths when several shards are written.
        file_format (`str`, defaults to `"arrow"`):
            Extension of the written files, also forwarded to `_prepare_split_single`.
        num_proc (`int`, *optional*):
            Number of worker processes. It is lowered when `gen_kwargs` contains fewer input
            shards than processes; `None` or `1` runs everything in the current process.
        max_shard_size (`Union[str, int]`, *optional*):
            Maximum shard size; falls back to `config.MAX_SHARD_SIZE` when not given.
    """
    max_shard_size = convert_file_size_to_int(max_shard_size or config.MAX_SHARD_SIZE)

    try:
        split_info = self.info.splits[split_generator.name]
    except Exception:
        # Deliberate best effort: whenever the recorded split info cannot be looked up,
        # use the one carried by the split generator.
        split_info = split_generator.split_info

    SUFFIX = "-JJJJJ-SSSSS-of-NNNNN"
    fname = f"{self.dataset_name}-{split_generator.name}{SUFFIX}.{file_format}"
    fpath = posixpath.join(self._output_dir, fname)

    if num_proc and num_proc > 1:
        num_input_shards = _number_of_shards_in_gen_kwargs(split_generator.gen_kwargs)
        if num_input_shards <= 1:
            logger.warning(
                f"Setting num_proc from {num_proc} back to 1 for the {split_info.name} split to disable multiprocessing as it only contains one shard."
            )
            num_proc = 1
        elif num_input_shards < num_proc:
            logger.warning(
                f"Setting num_proc from {num_proc} to {num_input_shards} for the {split_info.name} split as it only contains {num_input_shards} shards."
            )
            num_proc = num_input_shards

    pbar = hf_tqdm(
        unit=" examples",
        total=split_info.num_examples,
        desc=f"Generating {split_info.name} split",
    )

    _prepare_split_args = {
        "fpath": fpath,
        "file_format": file_format,
        "max_shard_size": max_shard_size,
    }

    if num_proc is None or num_proc == 1:
        result = None
        gen_kwargs = split_generator.gen_kwargs
        job_id = 0
        with pbar:
            for job_id, done, content in self._prepare_split_single(
                gen_kwargs=gen_kwargs, job_id=job_id, **_prepare_split_args
            ):
                if done:
                    result = content
                else:
                    pbar.update(content)
        # wrapping everything into lists for consistency with the multiprocessed code path
        assert result is not None, "Failed to retrieve results from prepare_split"
        examples_per_job, bytes_per_job, features_per_job, shards_per_job, shard_lengths_per_job = (
            [item] for item in result
        )
    else:
        kwargs_per_job = [
            {"gen_kwargs": gen_kwargs, "job_id": job_id, **_prepare_split_args}
            for job_id, gen_kwargs in enumerate(
                _split_gen_kwargs(split_generator.gen_kwargs, max_num_jobs=num_proc)
            )
        ]
        num_jobs = len(kwargs_per_job)

        examples_per_job = [None] * num_jobs
        bytes_per_job = [None] * num_jobs
        features_per_job = [None] * num_jobs
        shards_per_job = [None] * num_jobs
        shard_lengths_per_job = [None] * num_jobs

        with Pool(num_proc) as pool:
            with pbar:
                for job_id, done, content in iflatmap_unordered(
                    pool, self._prepare_split_single, kwargs_iterable=kwargs_per_job
                ):
                    if done:
                        # the content is the result of the job
                        (
                            examples_per_job[job_id],
                            bytes_per_job[job_id],
                            features_per_job[job_id],
                            shards_per_job[job_id],
                            shard_lengths_per_job[job_id],
                        ) = content
                    else:
                        # the content is the number of examples progress update
                        pbar.update(content)

        assert None not in examples_per_job, (
            f"Failed to retrieve results from prepare_split: result list {examples_per_job} still contains None - at least one worker failed to return its results"
        )

    total_shards = sum(shards_per_job)
    total_num_examples = sum(examples_per_job)
    total_num_bytes = sum(bytes_per_job)
    features = features_per_job[0]

    split_generator.split_info.num_examples = total_num_examples
    split_generator.split_info.num_bytes = total_num_bytes

    # should rename everything at the end
    logger.debug(f"Renaming {total_shards} shards.")
    if total_shards > 1:
        # use the -SSSSS-of-NNNNN pattern

        # Global id of the first shard of every job, computed once instead of
        # re-summing `shards_per_job[:job_id]` for each renamed shard.
        first_shard_id_per_job = []
        shards_so_far = 0
        for num_shards in shards_per_job:
            first_shard_id_per_job.append(shards_so_far)
            shards_so_far += num_shards

        def _rename_shard(shard_id_and_job: tuple[int, int]):
            shard_id, job_id = shard_id_and_job
            global_shard_id = first_shard_id_per_job[job_id] + shard_id
            self._rename(
                fpath.replace("SSSSS", f"{shard_id:05d}").replace("JJJJJ", f"{job_id:05d}"),
                fpath.replace("JJJJJ-SSSSS", f"{global_shard_id:05d}").replace("NNNNN", f"{total_shards:05d}"),
            )

        shard_ids_and_jobs = [
            (shard_id, job_id)
            for job_id, num_shards in enumerate(shards_per_job)
            for shard_id in range(num_shards)
        ]
        thread_map(_rename_shard, shard_ids_and_jobs, disable=True, max_workers=64)

        split_generator.split_info.shard_lengths = [
            shard_length for shard_lengths in shard_lengths_per_job for shard_length in shard_lengths
        ]
    else:
        # don't use any pattern
        shard_id, job_id = 0, 0
        self._rename(
            fpath.replace("SSSSS", f"{shard_id:05d}").replace("JJJJJ", f"{job_id:05d}"),
            fpath.replace(SUFFIX, ""),
        )

    # Only fill in the features when the builder info does not declare any.
    if self.info.features is None:
        self.info.features = features
features=self.info.features, + path=fpath.replace("SSSSS", f"{shard_id:05d}").replace("JJJJJ", f"{job_id:05d}"), + writer_batch_size=self._writer_batch_size, + storage_options=self._fs.storage_options, + embed_local_files=embed_local_files, + ) + try: + _time = time.time() + for _, table in generator: + if max_shard_size is not None and writer._num_bytes > max_shard_size: + num_examples, num_bytes = writer.finalize() + writer.close() + shard_lengths.append(num_examples) + total_num_examples += num_examples + total_num_bytes += num_bytes + shard_id += 1 + writer = writer_class( + features=writer._features, + path=fpath.replace("SSSSS", f"{shard_id:05d}").replace("JJJJJ", f"{job_id:05d}"), + writer_batch_size=self._writer_batch_size, + storage_options=self._fs.storage_options, + embed_local_files=embed_local_files, + ) + try: + writer.write_table(table) + except CastError as cast_error: + raise DatasetGenerationCastError.from_cast_error( + cast_error=cast_error, + builder_name=self.info.builder_name, + gen_kwargs=gen_kwargs, + token=self.token, + ) + num_examples_progress_update += len(table) + if time.time() > _time + config.PBAR_REFRESH_TIME_INTERVAL: + _time = time.time() + yield job_id, False, num_examples_progress_update + num_examples_progress_update = 0 + finally: + yield job_id, False, num_examples_progress_update + num_shards = shard_id + 1 + num_examples, num_bytes = writer.finalize() + writer.close() + shard_lengths.append(num_examples) + total_num_examples += num_examples + total_num_bytes += num_bytes + except Exception as e: + # Ignore the writer's error for no examples written to the file if this error was caused by the error in _generate_examples before the first example was yielded + if isinstance(e, SchemaInferenceError) and e.__context__ is not None: + e = e.__context__ + if isinstance(e, DatasetGenerationError): + raise + raise DatasetGenerationError("An error occurred while generating the dataset") from e + + yield job_id, True, 
def _get_examples_iterable_for_split(self, split_generator: SplitGenerator) -> ExamplesIterable:
    """Return the on-the-fly examples iterable of one split.

    Args:
        split_generator (`SplitGenerator`):
            Split generator whose `gen_kwargs` are handed to `self._generate_tables`.

    Returns:
        An `ArrowExamplesIterable` built from `self._generate_tables` and the split's `gen_kwargs`.
    """
    tables_fn = self._generate_tables
    tables_kwargs = split_generator.gen_kwargs
    return ArrowExamplesIterable(tables_fn, kwargs=tables_kwargs)
+ + The resulting dataset ends when one of the source datasets runs out of examples except when `oversampling` is `True`, + in which case, the resulting dataset ends when all datasets have ran out of examples at least one time. + + Note for iterable datasets: + + In a distributed setup or in PyTorch DataLoader workers, the stopping strategy is applied per process. + Therefore the "first_exhausted" strategy on an sharded iterable dataset can generate less samples in total (up to 1 missing sample per subdataset per worker). + + Args: + datasets (`List[Dataset]` or `List[IterableDataset]`): + List of datasets to interleave. + probabilities (`List[float]`, *optional*, defaults to `None`): + If specified, the new dataset is constructed by sampling + examples from one source at a time according to these probabilities. + seed (`int`, *optional*, defaults to `None`): + The random seed used to choose a source for each example. + info ([`DatasetInfo`], *optional*): + Dataset information, like description, citation, etc. + + split ([`NamedSplit`], *optional*): + Name of the dataset split. + + stopping_strategy (`str`, defaults to `first_exhausted`): + Three strategies are proposed right now, `first_exhausted`, `all_exhausted` and `all_exhausted_without_replacement`. + By default, `first_exhausted` is an undersampling strategy, i.e the dataset construction is stopped as soon as one dataset has ran out of samples. + If the strategy is `all_exhausted`, we use an oversampling strategy, i.e the dataset construction is stopped as soon as every samples of every dataset has been added at least once. + When strategy is `all_exhausted_without_replacement` we make sure that each sample in each dataset is sampled only once. + Note that if the strategy is `all_exhausted`, the interleaved dataset size can get enormous: + - with no probabilities, the resulting dataset will have `max_length_datasets*nb_dataset` samples. 
+ - with given probabilities, the resulting dataset will have more samples if some datasets have really low probability of visiting. + Returns: + [`Dataset`] or [`IterableDataset`]: Return type depends on the input `datasets` + parameter. `Dataset` if the input is a list of `Dataset`, `IterableDataset` if the input is a list of + `IterableDataset`. + + Example: + + For regular datasets (map-style): + + ```python + >>> from datasets import Dataset, interleave_datasets + >>> d1 = Dataset.from_dict({"a": [0, 1, 2]}) + >>> d2 = Dataset.from_dict({"a": [10, 11, 12]}) + >>> d3 = Dataset.from_dict({"a": [20, 21, 22]}) + >>> dataset = interleave_datasets([d1, d2, d3], probabilities=[0.7, 0.2, 0.1], seed=42, stopping_strategy="all_exhausted") + >>> dataset["a"] + [10, 0, 11, 1, 2, 20, 12, 10, 0, 1, 2, 21, 0, 11, 1, 2, 0, 1, 12, 2, 10, 0, 22] + >>> dataset = interleave_datasets([d1, d2, d3], probabilities=[0.7, 0.2, 0.1], seed=42) + >>> dataset["a"] + [10, 0, 11, 1, 2] + >>> dataset = interleave_datasets([d1, d2, d3]) + >>> dataset["a"] + [0, 10, 20, 1, 11, 21, 2, 12, 22] + >>> dataset = interleave_datasets([d1, d2, d3], stopping_strategy="all_exhausted") + >>> dataset["a"] + [0, 10, 20, 1, 11, 21, 2, 12, 22] + >>> d1 = Dataset.from_dict({"a": [0, 1, 2]}) + >>> d2 = Dataset.from_dict({"a": [10, 11, 12, 13]}) + >>> d3 = Dataset.from_dict({"a": [20, 21, 22, 23, 24]}) + >>> dataset = interleave_datasets([d1, d2, d3]) + >>> dataset["a"] + [0, 10, 20, 1, 11, 21, 2, 12, 22] + >>> dataset = interleave_datasets([d1, d2, d3], stopping_strategy="all_exhausted") + >>> dataset["a"] + [0, 10, 20, 1, 11, 21, 2, 12, 22, 0, 13, 23, 1, 10, 24] + >>> dataset = interleave_datasets([d1, d2, d3], probabilities=[0.7, 0.2, 0.1], seed=42) + >>> dataset["a"] + [10, 0, 11, 1, 2] + >>> dataset = interleave_datasets([d1, d2, d3], probabilities=[0.7, 0.2, 0.1], seed=42, stopping_strategy="all_exhausted") + >>> dataset["a"] + [10, 0, 11, 1, 2, 20, 12, 13, ..., 0, 1, 2, 0, 24] + For datasets in streaming 
mode (iterable): + + >>> from datasets import interleave_datasets + >>> d1 = load_dataset('allenai/c4', 'es', split='train', streaming=True) + >>> d2 = load_dataset('allenai/c4', 'fr', split='train', streaming=True) + >>> dataset = interleave_datasets([d1, d2]) + >>> iterator = iter(dataset) + >>> next(iterator) + {'text': 'Comprar Zapatillas para niña en chancla con goma por...'} + >>> next(iterator) + {'text': 'Le sacre de philippe ier, 23 mai 1059 - Compte Rendu...' + ``` + """ + from .arrow_dataset import Dataset + from .iterable_dataset import IterableDataset + + if not datasets: + raise ValueError("Unable to interleave an empty list of datasets.") + for i, dataset in enumerate(datasets): + if not isinstance(dataset, (Dataset, IterableDataset)): + if isinstance(dataset, (DatasetDict, IterableDatasetDict)): + if not dataset: + raise ValueError( + f"Expected a list of Dataset objects or a list of IterableDataset objects, but element at position {i} " + "is an empty dataset dictionary." + ) + raise ValueError( + f"Dataset at position {i} has at least one split: {list(dataset)}\n" + f"Please pick one to interleave with the other datasets, for example: dataset['{next(iter(dataset))}']" + ) + raise ValueError( + f"Expected a list of Dataset objects or a list of IterableDataset objects, but element at position {i} is a {type(dataset).__name__}." + ) + if i == 0: + dataset_type, other_type = ( + (Dataset, IterableDataset) if isinstance(dataset, Dataset) else (IterableDataset, Dataset) + ) + elif not isinstance(dataset, dataset_type): + raise ValueError( + f"Unable to interleave a {dataset_type.__name__} (at position 0) with a {other_type.__name__} (at position {i}). Expected a list of Dataset objects or a list of IterableDataset objects." + ) + if stopping_strategy not in ["first_exhausted", "all_exhausted", "all_exhausted_without_replacement"]: + raise ValueError(f"{stopping_strategy} is not supported. 
def concatenate_datasets(
    dsets: list[DatasetType],
    info: Optional[DatasetInfo] = None,
    split: Optional[NamedSplit] = None,
    axis: int = 0,
) -> DatasetType:
    """
    Concatenate a list of [`Dataset`] objects, or a list of [`IterableDataset`] objects,
    into a single dataset of the same kind.

    All elements of `dsets` must be of the same kind: mixing [`Dataset`] and
    [`IterableDataset`] objects is rejected.

    Args:
        dsets (`List[datasets.Dataset]` or `List[datasets.IterableDataset]`):
            List of Datasets to concatenate.
        info (`DatasetInfo`, *optional*):
            Dataset information, like description, citation, etc.
        split (`NamedSplit`, *optional*):
            Name of the dataset split.
        axis (`{0, 1}`, defaults to `0`):
            Axis to concatenate over, where `0` means over rows (vertically) and `1` means over columns
            (horizontally).

    Returns:
        [`Dataset`] or [`IterableDataset`]: Return type depends on the input `dsets`
        parameter. `Dataset` if the input is a list of `Dataset`, `IterableDataset` if the input is a list of
        `IterableDataset`.

    Raises:
        ValueError: If `dsets` is empty, if an element is a dataset dictionary or any other
            unsupported object, or if map-style and iterable datasets are mixed.

    Example:

    ```py
    >>> ds3 = concatenate_datasets([ds1, ds2])
    ```
    """

    if not dsets:
        raise ValueError("Unable to concatenate an empty list of datasets.")
    for i, dataset in enumerate(dsets):
        if not isinstance(dataset, (Dataset, IterableDataset)):
            if isinstance(dataset, (DatasetDict, IterableDatasetDict)):
                # A dataset dictionary is a common mistake: guide the user towards picking a split.
                if not dataset:
                    raise ValueError(
                        f"Expected a list of Dataset objects or a list of IterableDataset objects, but element at position {i} "
                        "is an empty dataset dictionary."
                    )
                raise ValueError(
                    f"Dataset at position {i} has at least one split: {list(dataset)}\n"
                    f"Please pick one to concatenate with the other datasets, for example: dataset['{next(iter(dataset))}']"
                )
            raise ValueError(
                f"Expected a list of Dataset objects or a list of IterableDataset objects, but element at position {i} is a {type(dataset).__name__}."
            )
        if i == 0:
            # The first element decides which kind every other element must have.
            dataset_type, other_type = (
                (Dataset, IterableDataset) if isinstance(dataset, Dataset) else (IterableDataset, Dataset)
            )
        elif not isinstance(dataset, dataset_type):
            raise ValueError(
                f"Unable to concatenate a {dataset_type.__name__} (at position 0) with a {other_type.__name__} (at position {i}). Expected a list of Dataset objects or a list of IterableDataset objects."
            )
    if dataset_type is Dataset:
        return _concatenate_map_style_datasets(dsets, info=info, split=split, axis=axis)
    else:
        return _concatenate_iterable_datasets(dsets, info=info, split=split, axis=axis)
ENV_VARS_TRUE_VALUES.union({"AUTO"}) +ENV_VARS_FALSE_AND_AUTO_VALUES = ENV_VARS_FALSE_VALUES.union({"AUTO"}) + + +# Imports +DILL_VERSION = version.parse(importlib.metadata.version("dill")) +FSSPEC_VERSION = version.parse(importlib.metadata.version("fsspec")) +PANDAS_VERSION = version.parse(importlib.metadata.version("pandas")) +PYARROW_VERSION = version.parse(importlib.metadata.version("pyarrow")) +HF_HUB_VERSION = version.parse(importlib.metadata.version("huggingface_hub")) + +USE_TF = os.environ.get("USE_TF", "AUTO").upper() +USE_TORCH = os.environ.get("USE_TORCH", "AUTO").upper() +USE_JAX = os.environ.get("USE_JAX", "AUTO").upper() + +TORCH_VERSION = "N/A" +TORCH_AVAILABLE = False + +if USE_TORCH in ENV_VARS_TRUE_AND_AUTO_VALUES and USE_TF not in ENV_VARS_TRUE_VALUES: + TORCH_AVAILABLE = importlib.util.find_spec("torch") is not None + if TORCH_AVAILABLE: + try: + TORCH_VERSION = version.parse(importlib.metadata.version("torch")) + logger.debug(f"PyTorch version {TORCH_VERSION} available.") + except importlib.metadata.PackageNotFoundError: + pass +else: + logger.info("Disabling PyTorch because USE_TF is set") + +POLARS_VERSION = "N/A" +POLARS_AVAILABLE = importlib.util.find_spec("polars") is not None + +if POLARS_AVAILABLE: + try: + POLARS_VERSION = version.parse(importlib.metadata.version("polars")) + logger.debug(f"Polars version {POLARS_VERSION} available.") + except importlib.metadata.PackageNotFoundError: + pass + + +DUCKDB_VERSION = "N/A" +DUCKDB_AVAILABLE = importlib.util.find_spec("duckdb") is not None + +if DUCKDB_AVAILABLE: + try: + DUCKDB_VERSION = version.parse(importlib.metadata.version("duckdb")) + logger.debug(f"Duckdb version {DUCKDB_VERSION} available.") + except importlib.metadata.PackageNotFoundError: + pass + +TF_VERSION = "N/A" +TF_AVAILABLE = False + +if USE_TF in ENV_VARS_TRUE_AND_AUTO_VALUES and USE_TORCH not in ENV_VARS_TRUE_VALUES: + TF_AVAILABLE = importlib.util.find_spec("tensorflow") is not None + if TF_AVAILABLE: + # For the 
metadata, we have to look for both tensorflow and tensorflow-cpu + for package in [ + "tensorflow", + "tensorflow-cpu", + "tensorflow-gpu", + "tf-nightly", + "tf-nightly-cpu", + "tf-nightly-gpu", + "intel-tensorflow", + "tensorflow-rocm", + "tensorflow-macos", + ]: + try: + TF_VERSION = version.parse(importlib.metadata.version(package)) + except importlib.metadata.PackageNotFoundError: + continue + else: + break + else: + TF_AVAILABLE = False + if TF_AVAILABLE: + if TF_VERSION.major < 2: + logger.info(f"TensorFlow found but with version {TF_VERSION}. `datasets` requires version 2 minimum.") + TF_AVAILABLE = False + else: + logger.info(f"TensorFlow version {TF_VERSION} available.") +else: + logger.info("Disabling Tensorflow because USE_TORCH is set") + + +JAX_VERSION = "N/A" +JAX_AVAILABLE = False + +if USE_JAX in ENV_VARS_TRUE_AND_AUTO_VALUES: + JAX_AVAILABLE = importlib.util.find_spec("jax") is not None and importlib.util.find_spec("jaxlib") is not None + if JAX_AVAILABLE: + try: + JAX_VERSION = version.parse(importlib.metadata.version("jax")) + logger.info(f"JAX version {JAX_VERSION} available.") + except importlib.metadata.PackageNotFoundError: + pass +else: + logger.info("Disabling JAX because USE_JAX is set to False") + + +# Optional tools for data loading +SQLALCHEMY_AVAILABLE = importlib.util.find_spec("sqlalchemy") is not None + +# Optional tools for feature decoding +PIL_AVAILABLE = importlib.util.find_spec("PIL") is not None +IS_OPUS_SUPPORTED = True +IS_MP3_SUPPORTED = True +TORCHCODEC_AVAILABLE = importlib.util.find_spec("torchcodec") is not None +TORCHVISION_AVAILABLE = importlib.util.find_spec("torchvision") is not None +PDFPLUMBER_AVAILABLE = importlib.util.find_spec("pdfplumber") is not None + +# Optional compression tools +RARFILE_AVAILABLE = importlib.util.find_spec("rarfile") is not None +ZSTANDARD_AVAILABLE = importlib.util.find_spec("zstandard") is not None +LZ4_AVAILABLE = importlib.util.find_spec("lz4") is not None +PY7ZR_AVAILABLE = 
importlib.util.find_spec("py7zr") is not None + +# Cache location +DEFAULT_XDG_CACHE_HOME = "~/.cache" +XDG_CACHE_HOME = os.getenv("XDG_CACHE_HOME", DEFAULT_XDG_CACHE_HOME) +DEFAULT_HF_CACHE_HOME = os.path.join(XDG_CACHE_HOME, "huggingface") +HF_CACHE_HOME = os.path.expanduser(os.getenv("HF_HOME", DEFAULT_HF_CACHE_HOME)) + +DEFAULT_HF_DATASETS_CACHE = os.path.join(HF_CACHE_HOME, "datasets") +HF_DATASETS_CACHE = Path(os.getenv("HF_DATASETS_CACHE", DEFAULT_HF_DATASETS_CACHE)) + +DEFAULT_HF_MODULES_CACHE = os.path.join(HF_CACHE_HOME, "modules") +HF_MODULES_CACHE = Path(os.getenv("HF_MODULES_CACHE", DEFAULT_HF_MODULES_CACHE)) + +DOWNLOADED_DATASETS_DIR = "downloads" +DEFAULT_DOWNLOADED_DATASETS_PATH = os.path.join(HF_DATASETS_CACHE, DOWNLOADED_DATASETS_DIR) +DOWNLOADED_DATASETS_PATH = Path(os.getenv("HF_DATASETS_DOWNLOADED_DATASETS_PATH", DEFAULT_DOWNLOADED_DATASETS_PATH)) + +EXTRACTED_DATASETS_DIR = "extracted" +DEFAULT_EXTRACTED_DATASETS_PATH = os.path.join(DEFAULT_DOWNLOADED_DATASETS_PATH, EXTRACTED_DATASETS_DIR) +EXTRACTED_DATASETS_PATH = Path(os.getenv("HF_DATASETS_EXTRACTED_DATASETS_PATH", DEFAULT_EXTRACTED_DATASETS_PATH)) + +# Download count for the website +HF_UPDATE_DOWNLOAD_COUNTS = ( + os.environ.get("HF_UPDATE_DOWNLOAD_COUNTS", "AUTO").upper() in ENV_VARS_TRUE_AND_AUTO_VALUES +) + +# For downloads and to check remote files metadata +HF_DATASETS_MULTITHREADING_MAX_WORKERS = 16 + +# Dataset viewer API +USE_PARQUET_EXPORT = True + +# Batch size constants. For more info, see: +# https://github.com/apache/arrow/blob/master/docs/source/cpp/arrays.rst#size-limitations-and-recommendations) +DEFAULT_MAX_BATCH_SIZE = 1000 + +DEFAULT_CDC_OPTIONS = {"min_chunk_size": 256 * 1024, "max_chunk_size": 1024 * 1024, "norm_level": 0} + +# Size of the preloaded record batch in `Dataset.__iter__` +ARROW_READER_BATCH_SIZE_IN_DATASET_ITER = 10 + +# Max uncompressed shard size in bytes (e.g. 
to shard parquet datasets in push_to_hub or download_and_prepare) +MAX_SHARD_SIZE = "500MB" + +# Max uncompressed row group size in bytes (e.g. for parquet files in push_to_hub or download_and_prepare) +MAX_ROW_GROUP_SIZE = "100MB" + +# Parquet configuration +PARQUET_ROW_GROUP_SIZE_FOR_AUDIO_DATASETS = None +PARQUET_ROW_GROUP_SIZE_FOR_IMAGE_DATASETS = None +PARQUET_ROW_GROUP_SIZE_FOR_BINARY_DATASETS = None +PARQUET_ROW_GROUP_SIZE_FOR_VIDEO_DATASETS = None + +# Arrow configuration +ARROW_RECORD_BATCH_SIZE_FOR_AUDIO_DATASETS = 100 +ARROW_RECORD_BATCH_SIZE_FOR_IMAGE_DATASETS = 100 +ARROW_RECORD_BATCH_SIZE_FOR_BINARY_DATASETS = 100 +ARROW_RECORD_BATCH_SIZE_FOR_VIDEO_DATASETS = 10 + +# Offline mode +_offline = os.environ.get("HF_DATASETS_OFFLINE") +HF_HUB_OFFLINE = constants.HF_HUB_OFFLINE if _offline is None else _offline.upper() in ENV_VARS_TRUE_VALUES +HF_DATASETS_OFFLINE = HF_HUB_OFFLINE # kept for backward-compatibility + +# Here, `True` will disable progress bars globally without possibility of enabling it +# programmatically. `False` will enable them without possibility of disabling them. +# If environment variable is not set (None), then the user is free to enable/disable +# them programmatically. 
+# TL;DR: env variable has priority over code +__HF_DATASETS_DISABLE_PROGRESS_BARS = os.environ.get("HF_DATASETS_DISABLE_PROGRESS_BARS") +HF_DATASETS_DISABLE_PROGRESS_BARS: Optional[bool] = ( + __HF_DATASETS_DISABLE_PROGRESS_BARS.upper() in ENV_VARS_TRUE_VALUES + if __HF_DATASETS_DISABLE_PROGRESS_BARS is not None + else None +) + +# In-memory +DEFAULT_IN_MEMORY_MAX_SIZE = 0 # Disabled +IN_MEMORY_MAX_SIZE = float(os.environ.get("HF_DATASETS_IN_MEMORY_MAX_SIZE", DEFAULT_IN_MEMORY_MAX_SIZE)) + +# File names +DATASET_ARROW_FILENAME = "dataset.arrow" +DATASET_INDICES_FILENAME = "indices.arrow" +DATASET_STATE_JSON_FILENAME = "state.json" +DATASET_INFO_FILENAME = "dataset_info.json" +DATASETDICT_INFOS_FILENAME = "dataset_infos.json" +LICENSE_FILENAME = "LICENSE" +DATASETDICT_JSON_FILENAME = "dataset_dict.json" +METADATA_CONFIGS_FIELD = "configs" +REPOCARD_FILENAME = "README.md" +REPOYAML_FILENAME = ".huggingface.yaml" + +MODULE_NAME_FOR_DYNAMIC_MODULES = "datasets_modules" + +MAX_DATASET_CONFIG_ID_READABLE_LENGTH = 255 + +# Temporary cache directory prefix +TEMP_CACHE_DIR_PREFIX = "hf_datasets-" + +# Streaming +STREAMING_READ_MAX_RETRIES = 20 +STREAMING_READ_RETRY_INTERVAL = 5 +STREAMING_OPEN_MAX_RETRIES = 20 +STREAMING_OPEN_RETRY_INTERVAL = 5 + +# Datasets repositories exploration +DATA_FILES_MAX_NUMBER_FOR_MODULE_INFERENCE = 200 +GLOBBED_DATA_FILES_MAX_NUMBER_FOR_MODULE_INFERENCE = 10 +ARCHIVED_DATA_FILES_MAX_NUMBER_FOR_MODULE_INFERENCE = 200 + +# Async map functions +MAX_NUM_RUNNING_ASYNC_MAP_FUNCTIONS_IN_PARALLEL = 1000 + +# Progress bars +PBAR_REFRESH_TIME_INTERVAL = 0.05 # 20 progress updates per sec + +# Maximum number of uploaded files per commit +UPLOADS_MAX_NUMBER_PER_COMMIT = 50 + +# Backward compatibility +MAX_TABLE_NBYTES_FOR_PICKLING = 4 << 30 diff --git a/datasets/data_files.py b/datasets/data_files.py new file mode 100644 index 0000000000000000000000000000000000000000..9710bc84a8ea8d6e0b6ddedd42da23d64cd85c5d --- /dev/null +++ b/datasets/data_files.py @@ 
-0,0 +1,807 @@ +import os +import re +from functools import partial +from glob import has_magic +from pathlib import Path, PurePath +from typing import Callable, Optional, Union + +import huggingface_hub +from fsspec.core import url_to_fs +from huggingface_hub import HfFileSystem +from packaging import version +from tqdm.contrib.concurrent import thread_map + +from . import config +from .download import DownloadConfig +from .naming import _split_re +from .splits import Split +from .utils import logging +from .utils import tqdm as hf_tqdm +from .utils.file_utils import _prepare_path_and_storage_options, is_local_path, is_relative_path, xbasename, xjoin +from .utils.py_utils import string_to_dict + + +SingleOriginMetadata = Union[tuple[str, str], tuple[str], tuple[()]] + + +SANITIZED_DEFAULT_SPLIT = str(Split.TRAIN) + + +logger = logging.get_logger(__name__) + + +class Url(str): + pass + + +class EmptyDatasetError(FileNotFoundError): + pass + + +SPLIT_PATTERN_SHARDED = "data/{split}-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*" + +SPLIT_KEYWORDS = { + Split.TRAIN: ["train", "training"], + Split.VALIDATION: ["validation", "valid", "dev", "val"], + Split.TEST: ["test", "testing", "eval", "evaluation"], +} +NON_WORDS_CHARS = "-._ 0-9" +if config.FSSPEC_VERSION < version.parse("2023.9.0"): + KEYWORDS_IN_FILENAME_BASE_PATTERNS = ["**[{sep}/]{keyword}[{sep}]*", "{keyword}[{sep}]*"] + KEYWORDS_IN_DIR_NAME_BASE_PATTERNS = [ + "{keyword}/**", + "{keyword}[{sep}]*/**", + "**[{sep}/]{keyword}/**", + "**[{sep}/]{keyword}[{sep}]*/**", + ] +elif config.FSSPEC_VERSION < version.parse("2023.12.0"): + KEYWORDS_IN_FILENAME_BASE_PATTERNS = ["**/*[{sep}/]{keyword}[{sep}]*", "{keyword}[{sep}]*"] + KEYWORDS_IN_DIR_NAME_BASE_PATTERNS = [ + "{keyword}/**/*", + "{keyword}[{sep}]*/**/*", + "**/*[{sep}/]{keyword}/**/*", + "**/*[{sep}/]{keyword}[{sep}]*/**/*", + ] +else: + KEYWORDS_IN_FILENAME_BASE_PATTERNS = ["**/{keyword}[{sep}]*", "**/*[{sep}]{keyword}[{sep}]*"] + 
def sanitize_patterns(patterns: Union[dict, list, str]) -> dict[str, Union[list[str], "DataFilesList"]]:
    """
    Normalize user-provided ``data_files`` patterns into a ``{split_name: [patterns]}`` dictionary.

    Accepted inputs:
        - a dict mapping split names to a pattern or a list of patterns,
        - a single pattern string (assigned to the default split, "train"),
        - a list of pattern strings (assigned to the default split),
        - a list of ``{"split": ..., "path": ...}`` dicts,
        - any other iterable, which is converted to a list and processed again.

    Returns:
        patterns: dictionary of split_name -> list of patterns

    Raises:
        ValueError: If a list mixes dicts with malformed entries, or if a split is listed twice.
    """

    def _listify(value):
        # Lists (and list subclasses) are passed through untouched; anything else is wrapped.
        return value if isinstance(value, list) else [value]

    if isinstance(patterns, dict):
        return {str(split_name): _listify(split_patterns) for split_name, split_patterns in patterns.items()}
    if isinstance(patterns, str):
        return {SANITIZED_DEFAULT_SPLIT: [patterns]}
    if not isinstance(patterns, list):
        # Tuples, sets, generators, ...: materialize and go through the list branch.
        return sanitize_patterns(list(patterns))
    if not any(isinstance(entry, dict) for entry in patterns):
        return {SANITIZED_DEFAULT_SPLIT: patterns}

    # At least one entry is a dict: every entry must then be a {"split", "path"} mapping.
    for entry in patterns:
        is_well_formed = (
            isinstance(entry, dict)
            and len(entry) == 2
            and "split" in entry
            and isinstance(entry.get("path"), (str, list))
        )
        if not is_well_formed:
            raise ValueError(
                f"Expected each split to have a 'path' key which can be a string or a list of strings, but got {entry}"
            )
    split_names = [entry["split"] for entry in patterns]
    if len(set(split_names)) != len(split_names):
        raise ValueError(f"Some splits are duplicated in data_files: {split_names}")
    return {str(entry["split"]): _listify(entry["path"]) for entry in patterns}
+ + Some examples: + + base directory: + + ./ + └── __pycache__ + └── b.txt + + >>> _is_inside_unrequested_special_dir("__pycache__/b.txt", "**") + True + >>> _is_inside_unrequested_special_dir("__pycache__/b.txt", "*/b.txt") + True + >>> _is_inside_unrequested_special_dir("__pycache__/b.txt", "__pycache__/*") + False + >>> _is_inside_unrequested_special_dir("__pycache__/b.txt", "__*/*") + False + """ + # We just need to check if every special directories from the path is present explicitly in the pattern. + # Since we assume that the path matches the pattern, it's equivalent to counting that both + # the parent path and the parent pattern have the same number of special directories. + data_dirs_to_ignore_in_path = [part for part in PurePath(matched_rel_path).parent.parts if part.startswith("__")] + data_dirs_to_ignore_in_pattern = [part for part in PurePath(pattern).parent.parts if part.startswith("__")] + return len(data_dirs_to_ignore_in_path) != len(data_dirs_to_ignore_in_pattern) + + +def _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(matched_rel_path: str, pattern: str) -> bool: + """ + When a path matches a pattern, we additionally check if it's a hidden file or if it's inside + a hidden directory we ignore by default, i.e. if the file name or a parent directory name starts with a dot. + + Users can still explicitly request a filepath that is hidden or is inside a hidden directory + if the hidden part is mentioned explicitly in the requested pattern. 
+ + Some examples: + + base directory: + + ./ + └── .hidden_file.txt + + >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_file.txt", "**") + True + >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_file.txt", ".*") + False + + base directory: + + ./ + └── .hidden_dir + └── a.txt + + >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/a.txt", "**") + True + >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/a.txt", ".*/*") + False + >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/a.txt", ".hidden_dir/*") + False + + base directory: + + ./ + └── .hidden_dir + └── .hidden_file.txt + + >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/.hidden_file.txt", "**") + True + >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/.hidden_file.txt", ".*/*") + True + >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/.hidden_file.txt", ".*/.*") + False + >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/.hidden_file.txt", ".hidden_dir/*") + True + >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/.hidden_file.txt", ".hidden_dir/.*") + False + """ + # We just need to check if every hidden part from the path is present explicitly in the pattern. + # Since we assume that the path matches the pattern, it's equivalent to counting that both + # the path and the pattern have the same number of hidden parts. 
+ hidden_directories_in_path = [ + part for part in PurePath(matched_rel_path).parts if part.startswith(".") and not set(part) == {"."} + ] + hidden_directories_in_pattern = [ + part for part in PurePath(pattern).parts if part.startswith(".") and not set(part) == {"."} + ] + return len(hidden_directories_in_path) != len(hidden_directories_in_pattern) + + +def _get_data_files_patterns(pattern_resolver: Callable[[str], list[str]]) -> dict[str, list[str]]: + """ + Get the default pattern from a directory or repository by testing all the supported patterns. + The first patterns to return a non-empty list of data files is returned. + + In order, it first tests if SPLIT_PATTERN_SHARDED works, otherwise it tests the patterns in ALL_DEFAULT_PATTERNS. + """ + # first check the split patterns like data/{split}-00000-of-00001.parquet + for split_pattern in ALL_SPLIT_PATTERNS: + pattern = split_pattern.replace("{split}", "*") + try: + data_files = pattern_resolver(pattern) + except FileNotFoundError: + continue + if len(data_files) > 0: + splits: set[str] = set() + for p in data_files: + p_parts = string_to_dict(xbasename(p), xbasename(split_pattern)) + assert p_parts is not None + splits.add(p_parts["split"]) + + if any(not re.match(_split_re, split) for split in splits): + raise ValueError(f"Split name should match '{_split_re}'' but got '{splits}'.") + sorted_splits = [str(split) for split in DEFAULT_SPLITS if split in splits] + sorted( + splits - {str(split) for split in DEFAULT_SPLITS} + ) + return {split: [split_pattern.format(split=split)] for split in sorted_splits} + # then check the default patterns based on train/valid/test splits + for patterns_dict in ALL_DEFAULT_PATTERNS: + non_empty_splits = [] + for split, patterns in patterns_dict.items(): + for pattern in patterns: + try: + data_files = pattern_resolver(pattern) + except FileNotFoundError: + continue + if len(data_files) > 0: + non_empty_splits.append(split) + break + if non_empty_splits: + return {split: 
def resolve_pattern(
    pattern: str,
    base_path: str,
    allowed_extensions: Optional[list[str]] = None,
    download_config: Optional[DownloadConfig] = None,
) -> list[str]:
    """
    Resolve the paths and URLs of the data files from the pattern passed by the user.

    You can use patterns to resolve multiple local files. Here are a few examples:
    - *.csv to match all the CSV files at the first level
    - **.csv to match all the CSV files at any level
    - data/* to match all the files inside "data"
    - data/** to match all the files inside "data" and its subdirectories

    The patterns are resolved using the fsspec glob. In fsspec>=2023.12.0 this is equivalent to
    Python's glob.glob, Path.glob, Path.match and fnmatch where ** is unsupported with a prefix/suffix
    other than a forward slash /.

    More generally:
    - '*' matches any character except a forward-slash (to match just the file or directory name)
    - '**' matches any character including a forward-slash /

    Hidden files and directories (i.e. whose names start with a dot) are ignored, unless they are explicitly requested.
    The same applies to special directories that start with a double underscore like "__pycache__".
    You can still include one if the pattern explicitly mentions it:
    - to include a hidden file: "*/.hidden.txt" or "*/.*"
    - to include a hidden directory: ".hidden/*" or ".*/*"
    - to include a special directory: "__special__/*" or "__*/*"

    Files whose base name is listed in FILES_TO_IGNORE are skipped as well, unless the base name
    of the pattern is exactly that file name.

    Example::

        >>> from datasets.data_files import resolve_pattern
        >>> base_path = "."
        >>> resolve_pattern("docs/**/*.py", base_path)
        ['/Users/mariosasko/Desktop/projects/datasets/docs/source/_config.py']

    Args:
        pattern (str): Unix pattern or paths or URLs of the data files to resolve.
            The paths can be absolute or relative to base_path.
            Remote filesystems using fsspec are supported, e.g. with the hf:// protocol.
        base_path (str): Base path to use when resolving relative paths.
        allowed_extensions (Optional[list], optional): White-list of file extensions to use. Defaults to None (all extensions).
            For example: allowed_extensions=[".csv", ".json", ".txt", ".parquet"]
        download_config ([`DownloadConfig`], *optional*): Specific download configuration parameters.
    Returns:
        List[str]: List of paths or URLs to the local or remote files that match the patterns.
    Raises:
        FileNotFoundError: If no file matches the pattern (after filtering on `allowed_extensions`, if given).
    """
    if is_relative_path(pattern):
        pattern = xjoin(base_path, pattern)
    elif is_local_path(pattern):
        # NOTE(review): base_path is reassigned in this branch and the next one, but it is
        # not read again later in this function - confirm whether these assignments are still needed.
        base_path = os.path.splitdrive(pattern)[0] + os.sep
    else:
        base_path = ""
    pattern, storage_options = _prepare_path_and_storage_options(pattern, download_config=download_config)
    fs, fs_pattern = url_to_fs(pattern, **storage_options)
    # A file from FILES_TO_IGNORE is still returned when the pattern names it explicitly.
    files_to_ignore = set(FILES_TO_IGNORE) - {xbasename(pattern)}
    # fs.protocol is either a single string or a sequence of aliases; use the first alias.
    protocol = fs.protocol if isinstance(fs.protocol, str) else fs.protocol[0]
    # Local files are returned as plain paths, everything else gets its "<protocol>://" prefix.
    protocol_prefix = protocol + "://" if protocol != "file" else ""
    glob_kwargs = {}
    if protocol == "hf":
        # 10 times faster glob with detail=True (ignores costly info like lastCommit)
        glob_kwargs["expand_info"] = False
    # Keep regular files (and symlinks resolving to files) that are neither in the ignore list,
    # nor inside an unrequested "__special__" directory, nor hidden without being requested.
    matched_paths = [
        filepath if filepath.startswith(protocol_prefix) else protocol_prefix + filepath
        for filepath, info in fs.glob(pattern, detail=True, **glob_kwargs).items()
        if (info["type"] == "file" or (info.get("islink") and os.path.isfile(os.path.realpath(filepath))))
        and (xbasename(filepath) not in files_to_ignore)
        and not _is_inside_unrequested_special_dir(filepath, fs_pattern)
        and not _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(filepath, fs_pattern)
    ]  # skips dot-prefixed names and "__"-prefixed dirs such as __pycache__, but keeps paths containing /../
    if allowed_extensions is not None:
        # Every dot-separated suffix of the base name is tested, so "a.tar.gz" is accepted
        # when either ".tar" or ".gz" is allowed.
        out = [
            filepath
            for filepath in matched_paths
            if any("." + suffix in allowed_extensions for suffix in xbasename(filepath).split(".")[1:])
        ]
        if len(out) < len(matched_paths):
            invalid_matched_files = list(set(matched_paths) - set(out))
            logger.info(
                f"Some files matched the pattern '{pattern}' but don't have valid data file extensions: {invalid_matched_files}"
            )
    else:
        out = matched_paths
    if not out:
        error_msg = f"Unable to find '{pattern}'"
        if allowed_extensions is not None:
            error_msg += f" with any supported extension {list(allowed_extensions)}"
        raise FileNotFoundError(error_msg)
    return out
def _get_single_origin_metadata(
    data_file: str,
    download_config: Optional[DownloadConfig] = None,
) -> SingleOriginMetadata:
    """Return a small tuple identifying the current version of ``data_file``.

    Args:
        data_file (`str`):
            Path or URL of a single data file.
        download_config ([`DownloadConfig`], *optional*):
            Download configuration. It is forwarded to `_prepare_path_and_storage_options`
            and its ``token`` is used for files located under ``config.HF_ENDPOINT``.

    Returns:
        ``(repo_id, revision)`` for files resolved through `HfFileSystem`; otherwise a 1-tuple
        holding the stringified ``ETag`` / ``etag`` / ``mtime`` value reported by the filesystem,
        or an empty tuple when none of those keys is present.
    """
    data_file, storage_options = _prepare_path_and_storage_options(data_file, download_config=download_config)
    fs, *_ = url_to_fs(data_file, **storage_options)
    if isinstance(fs, HfFileSystem):
        resolved_path = fs.resolve_path(data_file)
        return resolved_path.repo_id, resolved_path.revision
    if data_file.startswith(config.HF_ENDPOINT):
        # `download_config` is optional: fall back to no explicit token instead of
        # failing with AttributeError on `None.token`.
        token = download_config.token if download_config is not None else None
        hffs = HfFileSystem(endpoint=config.HF_ENDPOINT, token=token)
        # Rewrite "<endpoint>/<repo>/resolve/<rev>/<path>" as "hf://<repo>@<rev>/<path>".
        data_file = "hf://" + data_file[len(config.HF_ENDPOINT) + 1 :].replace("/resolve/", "@", 1)
        resolved_path = hffs.resolve_path(data_file)
        return resolved_path.repo_id, resolved_path.revision
    info = fs.info(data_file)
    # s3fs uses "ETag", gcsfs uses "etag", and for local we simply check mtime
    for key in ["ETag", "etag", "mtime"]:
        if key in info:
            return (str(info[key]),)
    return ()
class DataFilesList(list[str]):
    """
    List of data files (absolute local paths or URLs) with their origin metadata.

    Instances are typically built from the user's data files patterns with:
        - ``from_hf_repo``: resolve patterns inside a dataset repository
        - ``from_local_or_remote``: resolve patterns from a local path

    The additional ``origin_metadata`` attribute can store:
        - the last modified time of local files
        - ETag of remote files
        - commit sha of a dataset repository

    Thanks to this attribute, it is possible to hash the list and get a different hash
    if and only if at least one file changed. This is useful for caching Dataset objects
    that are obtained from a list of data files.
    """

    def __init__(self, data_files: list[str], origin_metadata: list[SingleOriginMetadata]) -> None:
        """Store the file paths in the list itself and keep ``origin_metadata`` as an attribute."""
        super().__init__(data_files)
        self.origin_metadata = origin_metadata

    def __add__(self, other: "DataFilesList") -> "DataFilesList":
        """Concatenate two lists of data files together with their origin metadata."""
        combined_files = list(self)
        combined_files.extend(other)
        combined_metadata = self.origin_metadata + other.origin_metadata
        return DataFilesList(combined_files, combined_metadata)

    @classmethod
    def from_hf_repo(
        cls,
        patterns: list[str],
        dataset_info: huggingface_hub.hf_api.DatasetInfo,
        base_path: Optional[str] = None,
        allowed_extensions: Optional[list[str]] = None,
        download_config: Optional[DownloadConfig] = None,
    ) -> "DataFilesList":
        """Resolve ``patterns`` relative to the ``hf://`` location of a dataset repository revision."""
        repo_location = f"hf://datasets/{dataset_info.id}@{dataset_info.sha}/{base_path or ''}"
        return cls.from_patterns(
            patterns,
            base_path=repo_location.rstrip("/"),
            allowed_extensions=allowed_extensions,
            download_config=download_config,
        )

    @classmethod
    def from_local_or_remote(
        cls,
        patterns: list[str],
        base_path: Optional[str] = None,
        allowed_extensions: Optional[list[str]] = None,
        download_config: Optional[DownloadConfig] = None,
    ) -> "DataFilesList":
        """Resolve ``patterns`` relative to ``base_path`` (the current working directory by default)."""
        if base_path is None:
            base_path = Path().resolve().as_posix()
        return cls.from_patterns(
            patterns,
            base_path=base_path,
            allowed_extensions=allowed_extensions,
            download_config=download_config,
        )

    @classmethod
    def from_patterns(
        cls,
        patterns: list[str],
        base_path: Optional[str] = None,
        allowed_extensions: Optional[list[str]] = None,
        download_config: Optional[DownloadConfig] = None,
    ) -> "DataFilesList":
        """Resolve every pattern with `resolve_pattern` and collect the matching files.

        A pattern that matches nothing is skipped when it contains glob magic;
        for a plain path the `FileNotFoundError` is propagated.
        """
        if base_path is None:
            base_path = Path().resolve().as_posix()
        resolved_files = []
        for pattern in patterns:
            try:
                matches = resolve_pattern(
                    pattern,
                    base_path=base_path,
                    allowed_extensions=allowed_extensions,
                    download_config=download_config,
                )
            except FileNotFoundError:
                if has_magic(pattern):
                    continue
                raise
            resolved_files.extend(matches)
        metadata = _get_origin_metadata(resolved_files, download_config=download_config)
        return cls(resolved_files, metadata)

    def filter(
        self, *, extensions: Optional[list[str]] = None, file_names: Optional[list[str]] = None
    ) -> "DataFilesList":
        """Return a new list keeping the files that match any of the given extensions or file names.

        The ``origin_metadata`` of the current list is passed along unchanged.
        Without any criterion, a plain copy of the list is returned.
        """
        matchers = []
        if extensions:
            ext_pattern = "|".join(map(re.escape, extensions))
            matchers.append(re.compile(f".*({ext_pattern})(\\..+)?$"))
        if file_names:
            fn_pattern = "|".join(map(re.escape, file_names))
            matchers.append(re.compile(rf".*[\/]?({fn_pattern})$"))
        if not matchers:
            return DataFilesList(list(self), origin_metadata=self.origin_metadata)
        kept_files = [data_file for data_file in self if any(matcher.match(data_file) for matcher in matchers)]
        return DataFilesList(kept_files, origin_metadata=self.origin_metadata)
+ """ + + @classmethod + def from_local_or_remote( + cls, + patterns: dict[str, Union[list[str], DataFilesList]], + base_path: Optional[str] = None, + allowed_extensions: Optional[list[str]] = None, + download_config: Optional[DownloadConfig] = None, + ) -> "DataFilesDict": + out = cls() + for key, patterns_for_key in patterns.items(): + out[key] = ( + patterns_for_key + if isinstance(patterns_for_key, DataFilesList) + else DataFilesList.from_local_or_remote( + patterns_for_key, + base_path=base_path, + allowed_extensions=allowed_extensions, + download_config=download_config, + ) + ) + return out + + @classmethod + def from_hf_repo( + cls, + patterns: dict[str, Union[list[str], DataFilesList]], + dataset_info: huggingface_hub.hf_api.DatasetInfo, + base_path: Optional[str] = None, + allowed_extensions: Optional[list[str]] = None, + download_config: Optional[DownloadConfig] = None, + ) -> "DataFilesDict": + out = cls() + for key, patterns_for_key in patterns.items(): + out[key] = ( + patterns_for_key + if isinstance(patterns_for_key, DataFilesList) + else DataFilesList.from_hf_repo( + patterns_for_key, + dataset_info=dataset_info, + base_path=base_path, + allowed_extensions=allowed_extensions, + download_config=download_config, + ) + ) + return out + + @classmethod + def from_patterns( + cls, + patterns: dict[str, Union[list[str], DataFilesList]], + base_path: Optional[str] = None, + allowed_extensions: Optional[list[str]] = None, + download_config: Optional[DownloadConfig] = None, + ) -> "DataFilesDict": + out = cls() + for key, patterns_for_key in patterns.items(): + out[key] = ( + patterns_for_key + if isinstance(patterns_for_key, DataFilesList) + else DataFilesList.from_patterns( + patterns_for_key, + base_path=base_path, + allowed_extensions=allowed_extensions, + download_config=download_config, + ) + ) + return out + + def filter( + self, *, extensions: Optional[list[str]] = None, file_names: Optional[list[str]] = None + ) -> "DataFilesDict": + out = 
class DataFilesPatternsList(list[str]):
    """
    List of data files patterns (absolute local paths or URLs).

    For each pattern there should also be a list of allowed extensions
    to keep, or None to keep all the files for the pattern.
    The two sequences are aligned by position: ``allowed_extensions[i]`` applies to ``self[i]``.
    """

    def __init__(
        self,
        patterns: list[str],
        allowed_extensions: list[Optional[list[str]]],
    ):
        """Store the patterns in the list itself and keep ``allowed_extensions`` as an attribute."""
        super().__init__(patterns)
        self.allowed_extensions = allowed_extensions

    def __add__(self, other):
        """Concatenate two pattern lists, keeping each pattern aligned with its allowed extensions."""
        # The result must stay a patterns list: building a `DataFilesList` here would
        # store the allowed extensions in its `origin_metadata` slot.
        return DataFilesPatternsList([*self, *other], self.allowed_extensions + other.allowed_extensions)

    @classmethod
    def from_patterns(
        cls, patterns: list[str], allowed_extensions: Optional[list[str]] = None
    ) -> "DataFilesPatternsList":
        """Build a patterns list in which every pattern shares the same ``allowed_extensions``."""
        return cls(patterns, [allowed_extensions] * len(patterns))

    def resolve(
        self,
        base_path: str,
        download_config: Optional[DownloadConfig] = None,
    ) -> "DataFilesList":
        """Resolve every pattern with `resolve_pattern` and return the matching files.

        Args:
            base_path (`str`):
                Base path used to resolve relative patterns. ``None`` falls back to the
                current working directory.
            download_config ([`DownloadConfig`], *optional*):
                Specific download configuration parameters.

        Returns:
            [`DataFilesList`]: The resolved files with their origin metadata.

        Raises:
            FileNotFoundError: If a pattern without glob magic matches no file.
        """
        base_path = base_path if base_path is not None else Path().resolve().as_posix()
        data_files = []
        for pattern, allowed_extensions in zip(self, self.allowed_extensions):
            try:
                data_files.extend(
                    resolve_pattern(
                        pattern,
                        base_path=base_path,
                        allowed_extensions=allowed_extensions,
                        download_config=download_config,
                    )
                )
            except FileNotFoundError:
                # A glob pattern is allowed to match nothing; a plain path is not.
                if not has_magic(pattern):
                    raise
        origin_metadata = _get_origin_metadata(data_files, download_config=download_config)
        return DataFilesList(data_files, origin_metadata)

    def filter_extensions(self, extensions: list[str]) -> "DataFilesPatternsList":
        """Return a new patterns list whose per-pattern allowed extensions have ``extensions`` appended.

        NOTE(review): an entry of ``allowed_extensions`` that is ``None`` makes the
        concatenation below raise `TypeError`; confirm whether callers can reach this with ``None``.
        """
        return DataFilesPatternsList(
            self, [allowed_extensions + extensions for allowed_extensions in self.allowed_extensions]
        )
+ """ + + @classmethod + def from_patterns( + cls, patterns: dict[str, list[str]], allowed_extensions: Optional[list[str]] = None + ) -> "DataFilesPatternsDict": + out = cls() + for key, patterns_for_key in patterns.items(): + out[key] = ( + patterns_for_key + if isinstance(patterns_for_key, DataFilesPatternsList) + else DataFilesPatternsList.from_patterns( + patterns_for_key, + allowed_extensions=allowed_extensions, + ) + ) + return out + + def resolve( + self, + base_path: str, + download_config: Optional[DownloadConfig] = None, + ) -> "DataFilesDict": + out = DataFilesDict() + for key, data_files_patterns_list in self.items(): + out[key] = data_files_patterns_list.resolve(base_path, download_config) + return out + + def filter_extensions(self, extensions: list[str]) -> "DataFilesPatternsDict": + out = type(self)() + for key, data_files_patterns_list in self.items(): + out[key] = data_files_patterns_list.filter_extensions(extensions) + return out diff --git a/datasets/dataset_dict.py b/datasets/dataset_dict.py new file mode 100644 index 0000000000000000000000000000000000000000..63a93429c4527a823ac3a5c3f5c0dab31e15ad54 --- /dev/null +++ b/datasets/dataset_dict.py @@ -0,0 +1,2852 @@ +import contextlib +import copy +import fnmatch +import itertools +import json +import math +import posixpath +import random +import re +import time +from collections.abc import Sequence +from functools import partial +from pathlib import Path +from typing import Callable, Optional, Union + +import fsspec +import numpy as np +from fsspec.core import url_to_fs +from huggingface_hub import ( + CommitInfo, + CommitOperationAdd, + CommitOperationDelete, + DatasetCard, + DatasetCardData, + HfApi, +) +from huggingface_hub.hf_api import RepoFile +from huggingface_hub.utils import HfHubHTTPError, RepositoryNotFoundError + +from . 
class bind(partial):
    """Variant of `functools.partial` that appends the bound positional arguments.

    Call-time positional arguments come first and the arguments bound at construction
    follow them, i.e. ``bind(f, x)(a)`` calls ``f(a, x)``.
    Keyword arguments bound at construction are forwarded too, and call-time keywords
    take precedence over them, as with `functools.partial`.
    """

    def __call__(self, *fn_args, **fn_kwargs):
        # Merge instead of forwarding only `fn_kwargs`, so that keywords bound at
        # construction (`self.keywords`) are not silently dropped.
        return self.func(*fn_args, *self.args, **{**self.keywords, **fn_kwargs})
def __getitem__(self, k) -> Dataset:
    """Return the dataset stored under the split ``k``.

    Keys that are neither `str` nor `NamedSplit` (for example a row index) are rejected
    with a `KeyError` whose message suggests selecting a split first, unless the
    dictionary is empty, in which case the regular `dict` lookup is used.
    """
    if len(self) == 0 or isinstance(k, (str, NamedSplit)):
        return super().__getitem__(k)
    # Prefer a conventional split name for the hint, else fall back to the first key.
    suggested_split = next(
        (split for split in (Split.TRAIN, Split.TEST, Split.VALIDATION) if split in self),
        None,
    )
    if suggested_split is None:
        suggested_split = next(iter(self))
    raise KeyError(
        f"Invalid key: {k}. Please first select a split. For example: "
        f"`my_dataset_dictionary['{suggested_split}'][{k}]`. "
        f"Available splits: {sorted(self)}"
    )
@property
def num_rows(self) -> dict[str, int]:
    """Number of rows in each split of the dataset.

    Example:

    ```py
    >>> from datasets import load_dataset
    >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes")
    >>> ds.num_rows
    {'test': 1066, 'train': 8530, 'validation': 1066}
    ```
    """
    self._check_values_type()
    rows_per_split = {}
    for split_name, dataset in self.items():
        rows_per_split[split_name] = dataset.num_rows
    return rows_per_split
def flatten(self, max_depth=16) -> "DatasetDict":
    """Flatten the Apache Arrow Table of each split (nested features are flattened).
    Each column with a struct type is flattened into one column per struct field.
    Other columns are left unchanged.

    Example:

    ```py
    >>> from datasets import load_dataset
    >>> ds = load_dataset("rajpurkar/squad")
    >>> ds["train"].features
    {'id': Value('string'),
     'title': Value('string'),
     'context': Value('string'),
     'question': Value('string'),
     'answers.text': List(Value('string')),
     'answers.answer_start': List(Value('int32'))}
    >>> ds.flatten()
    DatasetDict({
        train: Dataset({
            features: ['id', 'title', 'context', 'question', 'answers.text', 'answers.answer_start'],
            num_rows: 87599
        })
        validation: Dataset({
            features: ['id', 'title', 'context', 'question', 'answers.text', 'answers.answer_start'],
            num_rows: 10570
        })
    })
    ```
    """
    self._check_values_type()
    flattened_splits = {}
    for split_name, dataset in self.items():
        flattened_splits[split_name] = dataset.flatten(max_depth=max_depth)
    return DatasetDict(flattened_splits)
def __repr__(self):
    """Render one ``split: dataset`` entry per line, indented by four spaces inside ``DatasetDict({...})``."""
    entries = "\n".join(f"{k}: {v}" for k, v in self.items())
    # Prefix every line start (multiline mode), so nested dataset reprs get indented too.
    indented_entries = re.sub(r"^", " " * 4, entries, count=0, flags=re.M)
    return f"DatasetDict({{\n{indented_entries}\n}})"
def cast_column(self, column: str, feature) -> "DatasetDict":
    """Cast column to feature for decoding.
    The cast is applied to every dataset of the dataset dictionary.

    Args:
        column (`str`):
            Column name.
        feature ([`Feature`]):
            Target feature.

    Returns:
        [`DatasetDict`]

    Example:

    ```py
    >>> from datasets import load_dataset, ClassLabel
    >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes")
    >>> ds["train"].features
    {'label': ClassLabel(names=['neg', 'pos']),
     'text': Value('string')}
    >>> ds = ds.cast_column('label', ClassLabel(names=['bad', 'good']))
    >>> ds["train"].features
    {'label': ClassLabel(names=['bad', 'good']),
     'text': Value('string')}
    ```
    """
    self._check_values_type()
    cast_splits = {}
    for split_name, dataset in self.items():
        cast_splits[split_name] = dataset.cast_column(column=column, feature=feature)
    return DatasetDict(cast_splits)
def rename_column(self, original_column_name: str, new_column_name: str) -> "DatasetDict":
    """
    Rename a column in the dataset and move the features associated to the original column under the new column name.
    The transformation is applied to all the datasets of the dataset dictionary.

    You can also rename a column using [`~DatasetDict.map`] with `remove_columns` but the present method:
        - takes care of moving the original features under the new column name.
        - doesn't copy the data to a new dataset and is thus much faster.

    Args:
        original_column_name (`str`):
            Name of the column to rename.
        new_column_name (`str`):
            New name for the column.

    Example:

    ```py
    >>> from datasets import load_dataset
    >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes")
    >>> ds = ds.rename_column("label", "label_new")
    DatasetDict({
        train: Dataset({
            features: ['text', 'label_new'],
            num_rows: 8530
        })
        validation: Dataset({
            features: ['text', 'label_new'],
            num_rows: 1066
        })
        test: Dataset({
            features: ['text', 'label_new'],
            num_rows: 1066
        })
    })
    ```
    """
    self._check_values_type()
    renamed_splits = {}
    for split_name, dataset in self.items():
        renamed_splits[split_name] = dataset.rename_column(
            original_column_name=original_column_name,
            new_column_name=new_column_name,
        )
    return DatasetDict(renamed_splits)
def class_encode_column(self, column: str, include_nulls: bool = False) -> "DatasetDict":
    """Casts the given column as [`~datasets.features.ClassLabel`] and updates the tables.
    The transformation is applied to every dataset of the dataset dictionary.

    Args:
        column (`str`):
            The name of the column to cast.
        include_nulls (`bool`, defaults to `False`):
            Whether to include null values in the class labels. If `True`, the null values will be encoded as the `"None"` class label.

    Example:

    ```py
    >>> from datasets import load_dataset
    >>> ds = load_dataset("boolq")
    >>> ds["train"].features
    {'answer': Value('bool'),
     'passage': Value('string'),
     'question': Value('string')}
    >>> ds = ds.class_encode_column("answer")
    >>> ds["train"].features
    {'answer': ClassLabel(num_classes=2, names=['False', 'True']),
     'passage': Value('string'),
     'question': Value('string')}
    ```
    """
    self._check_values_type()
    encoded_splits = {}
    for split_name, dataset in self.items():
        encoded_splits[split_name] = dataset.class_encode_column(column=column, include_nulls=include_nulls)
    return DatasetDict(encoded_splits)
def set_format(
    self,
    type: Optional[str] = None,
    columns: Optional[list] = None,
    output_all_columns: bool = False,
    **format_kwargs,
):
    """Set `__getitem__` return format (type and columns).
    The format is set for every dataset in the dataset dictionary.

    Args:
        type (`str`, *optional*):
            Either output type selected in `[None, 'numpy', 'torch', 'tensorflow', 'jax', 'arrow', 'pandas', 'polars']`.
            `None` means `__getitem__` returns python objects (default).
        columns (`list[str]`, *optional*):
            Columns to format in the output.
            `None` means `__getitem__` returns all columns (default).
        output_all_columns (`bool`, defaults to False):
            Keep un-formatted columns as well in the output (as python objects).
        **format_kwargs (additional keyword arguments):
            Keywords arguments passed to the convert function like `np.array`, `torch.tensor` or `tensorflow.ragged.constant`.

    It is possible to call `map` after calling `set_format`. Since `map` may add new columns, then the list of formatted columns
    gets updated. In this case, if you apply `map` on a dataset to add a new column, then this column will be formatted:

        `new formatted columns = (all columns - previously unformatted columns)`

    Example:

    ```py
    >>> from datasets import load_dataset
    >>> from transformers import AutoTokenizer
    >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes")
    >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
    >>> ds = ds.map(lambda x: tokenizer(x["text"], truncation=True, padding=True), batched=True)
    >>> ds.set_format(type="numpy", columns=['input_ids', 'token_type_ids', 'attention_mask', 'label'])
    >>> ds["train"].format
    {'columns': ['input_ids', 'token_type_ids', 'attention_mask', 'label'],
     'format_kwargs': {},
     'output_all_columns': False,
     'type': 'numpy'}
    ```
    """
    self._check_values_type()
    # Every split receives exactly the same formatting options.
    shared_options = dict(
        type=type,
        columns=columns,
        output_all_columns=output_all_columns,
        **format_kwargs,
    )
    for dataset in self.values():
        dataset.set_format(**shared_options)
+ + Same as `self.set_format()` + + Example: + + ```py + >>> from datasets import load_dataset + >>> from transformers import AutoTokenizer + >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes") + >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased") + >>> ds = ds.map(lambda x: tokenizer(x["text"], truncation=True, padding=True), batched=True) + >>> ds.set_format(type="numpy", columns=['input_ids', 'token_type_ids', 'attention_mask', 'label']) + >>> ds["train"].format + {'columns': ['input_ids', 'token_type_ids', 'attention_mask', 'label'], + 'format_kwargs': {}, + 'output_all_columns': False, + 'type': 'numpy'} + >>> ds.reset_format() + >>> ds["train"].format + {'columns': ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'], + 'format_kwargs': {}, + 'output_all_columns': False, + 'type': None} + ``` + """ + self._check_values_type() + for dataset in self.values(): + dataset.set_format() + + def set_transform( + self, + transform: Optional[Callable], + columns: Optional[list] = None, + output_all_columns: bool = False, + ): + """Set ``__getitem__`` return format using this transform. The transform is applied on-the-fly on batches when ``__getitem__`` is called. + The transform is set for every dataset in the dataset dictionary + As :func:`datasets.Dataset.set_format`, this can be reset using :func:`datasets.Dataset.reset_format` + + Args: + transform (`Callable`, optional): user-defined formatting transform, replaces the format defined by :func:`datasets.Dataset.set_format` + A formatting function is a callable that takes a batch (as a dict) as input and returns a batch. + This function is applied right before returning the objects in ``__getitem__``. + columns (`list[str]`, optional): columns to format in the output + If specified, then the input batch of the transform only contains those columns. 
+ output_all_columns (`bool`, default to False): keep un-formatted columns as well in the output (as python objects) + If set to True, then the other un-formatted columns are kept with the output of the transform. + + """ + self._check_values_type() + for dataset in self.values(): + dataset.set_format( + "custom", + columns=columns, + output_all_columns=output_all_columns, + transform=transform, + ) + + def with_format( + self, + type: Optional[str] = None, + columns: Optional[list] = None, + output_all_columns: bool = False, + **format_kwargs, + ) -> "DatasetDict": + """Set `__getitem__` return format (type and columns). The data formatting is applied on-the-fly. + The format `type` (for example "numpy") is used to format batches when using `__getitem__`. + The format is set for every dataset in the dataset dictionary. + + It's also possible to use custom transforms for formatting using [`~datasets.Dataset.with_transform`]. + + Contrary to [`~datasets.DatasetDict.set_format`], `with_format` returns a new [`DatasetDict`] object with new [`Dataset`] objects. + + Args: + type (`str`, *optional*): + Either output type selected in `[None, 'numpy', 'torch', 'tensorflow', 'jax', 'arrow', 'pandas', 'polars']`. + `None` means `__getitem__` returns python objects (default). + columns (`list[str]`, *optional*): + Columns to format in the output. + `None` means `__getitem__` returns all columns (default). + output_all_columns (`bool`, defaults to `False`): + Keep un-formatted columns as well in the output (as python objects). + **format_kwargs (additional keyword arguments): + Keywords arguments passed to the convert function like `np.array`, `torch.tensor` or `tensorflow.ragged.constant`. 
+ + Example: + + ```py + >>> from datasets import load_dataset + >>> from transformers import AutoTokenizer + >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes") + >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased") + >>> ds = ds.map(lambda x: tokenizer(x['text'], truncation=True, padding=True), batched=True) + >>> ds["train"].format + {'columns': ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'], + 'format_kwargs': {}, + 'output_all_columns': False, + 'type': None} + >>> ds = ds.with_format("torch") + >>> ds["train"].format + {'columns': ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'], + 'format_kwargs': {}, + 'output_all_columns': False, + 'type': 'torch'} + >>> ds["train"][0] + {'text': 'compassionately explores the seemingly irreconcilable situation between conservative christian parents and their estranged gay and lesbian children .', + 'label': tensor(1), + 'input_ids': tensor([ 101, 18027, 16310, 16001, 1103, 9321, 178, 11604, 7235, 6617, + 1742, 2165, 2820, 1206, 6588, 22572, 12937, 1811, 2153, 1105, + 1147, 12890, 19587, 6463, 1105, 15026, 1482, 119, 102, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0]), + 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), + 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])} + ``` + """ + dataset = copy.deepcopy(self) + dataset.set_format( + type=type, + columns=columns, + 
output_all_columns=output_all_columns, + **format_kwargs, + ) + return dataset + + def with_transform( + self, + transform: Optional[Callable], + columns: Optional[list] = None, + output_all_columns: bool = False, + ) -> "DatasetDict": + """Set `__getitem__` return format using this transform. The transform is applied on-the-fly on batches when `__getitem__` is called. + The transform is set for every dataset in the dataset dictionary + + As [`~datasets.Dataset.set_format`], this can be reset using [`~datasets.Dataset.reset_format`]. + + Contrary to [`~datasets.DatasetDict.set_transform`], `with_transform` returns a new [`DatasetDict`] object with new [`Dataset`] objects. + + Args: + transform (`Callable`, *optional*): + User-defined formatting transform, replaces the format defined by [`~datasets.Dataset.set_format`]. + A formatting function is a callable that takes a batch (as a dict) as input and returns a batch. + This function is applied right before returning the objects in `__getitem__`. + columns (`list[str]`, *optional*): + Columns to format in the output. + If specified, then the input batch of the transform only contains those columns. + output_all_columns (`bool`, defaults to False): + Keep un-formatted columns as well in the output (as python objects). + If set to `True`, then the other un-formatted columns are kept with the output of the transform. + + Example: + + ```py + >>> from datasets import load_dataset + >>> from transformers import AutoTokenizer + >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes") + >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased") + >>> def encode(example): + ... 
return tokenizer(example['text'], truncation=True, padding=True, return_tensors="pt") + >>> ds = ds.with_transform(encode) + >>> ds["train"][0] + {'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1]), + 'input_ids': tensor([ 101, 1103, 2067, 1110, 17348, 1106, 1129, 1103, 6880, 1432, + 112, 188, 1207, 107, 14255, 1389, 107, 1105, 1115, 1119, + 112, 188, 1280, 1106, 1294, 170, 24194, 1256, 3407, 1190, + 170, 11791, 5253, 188, 1732, 7200, 10947, 12606, 2895, 117, + 179, 7766, 118, 172, 15554, 1181, 3498, 6961, 3263, 1137, + 188, 1566, 7912, 14516, 6997, 119, 102]), + 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0])} + ``` + """ + dataset = copy.deepcopy(self) + dataset.set_transform(transform=transform, columns=columns, output_all_columns=output_all_columns) + return dataset + + def map( + self, + function: Optional[Callable] = None, + with_indices: bool = False, + with_rank: bool = False, + with_split: bool = False, + input_columns: Optional[Union[str, list[str]]] = None, + batched: bool = False, + batch_size: Optional[int] = 1000, + drop_last_batch: bool = False, + remove_columns: Optional[Union[str, list[str]]] = None, + keep_in_memory: bool = False, + load_from_cache_file: Optional[bool] = None, + cache_file_names: Optional[dict[str, Optional[str]]] = None, + writer_batch_size: Optional[int] = 1000, + features: Optional[Features] = None, + disable_nullable: bool = False, + fn_kwargs: Optional[dict] = None, + num_proc: Optional[int] = None, + desc: Optional[str] = None, + try_original_type: Optional[bool] = True, + ) -> "DatasetDict": + """ + Apply a function to all the examples in the table (individually or in batches) and update the table. 
+ If your function returns a column that already exists, then it overwrites it. + The transformation is applied to all the datasets of the dataset dictionary. + + You can specify whether the function should be batched or not with the `batched` parameter: + + - If batched is `False`, then the function takes 1 example in and should return 1 example. + An example is a dictionary, e.g. `{"text": "Hello there !"}`. + - If batched is `True` and `batch_size` is 1, then the function takes a batch of 1 example as input and can return a batch with 1 or more examples. + A batch is a dictionary, e.g. a batch of 1 example is `{"text": ["Hello there !"]}`. + - If batched is `True` and `batch_size` is `n > 1`, then the function takes a batch of `n` examples as input and can return a batch with `n` examples, or with an arbitrary number of examples. + Note that the last batch may have less than `n` examples. + A batch is a dictionary, e.g. a batch of `n` examples is `{"text": ["Hello there !"] * n}`. + + If the function is asynchronous, then `map` will run your function in parallel, with up to one thousand simultaneous calls. + It is recommended to use a `asyncio.Semaphore` in your function if you want to set a maximum number of operations that can run at the same time. + + Args: + function (`callable`): with one of the following signature: + - `function(example: Dict[str, Any]) -> Dict[str, Any]` if `batched=False` and `with_indices=False` + - `function(example: Dict[str, Any], indices: int) -> Dict[str, Any]` if `batched=False` and `with_indices=True` + - `function(batch: Dict[str, list]) -> Dict[str, list]` if `batched=True` and `with_indices=False` + - `function(batch: Dict[str, list], indices: list[int]) -> Dict[str, list]` if `batched=True` and `with_indices=True` + + For advanced usage, the function can also return a `pyarrow.Table`. + If the function is asynchronous, then `map` will run your function in parallel. 
+ Moreover if your function returns nothing (`None`), then `map` will run your function and return the dataset unchanged. + If no function is provided, default to identity function: `lambda x: x`. + with_indices (`bool`, defaults to `False`): + Provide example indices to `function`. Note that in this case the signature of `function` should be `def function(example, idx): ...`. + with_rank (`bool`, defaults to `False`): + Provide process rank to `function`. Note that in this case the + signature of `function` should be `def function(example[, idx], rank): ...`. + with_split (`bool`, defaults to `False`): + Provide process split to `function`. Note that in this case the + signature of `function` should be `def function(example[, idx], split): ...`. + input_columns (`[Union[str, list[str]]]`, *optional*, defaults to `None`): + The columns to be passed into `function` as + positional arguments. If `None`, a dict mapping to all formatted columns is passed as one argument. + batched (`bool`, defaults to `False`): + Provide batch of examples to `function`. + batch_size (`int`, *optional*, defaults to `1000`): + Number of examples per batch provided to `function` if `batched=True`, + `batch_size <= 0` or `batch_size == None` then provide the full dataset as a single batch to `function`. + drop_last_batch (`bool`, defaults to `False`): + Whether a last batch smaller than the batch_size should be + dropped instead of being processed by the function. + remove_columns (`[Union[str, list[str]]]`, *optional*, defaults to `None`): + Remove a selection of columns while doing the mapping. + Columns will be removed before updating the examples with the output of `function`, i.e. if `function` is adding + columns with names in `remove_columns`, these columns will be kept. + keep_in_memory (`bool`, defaults to `False`): + Keep the dataset in memory instead of writing it to a cache file. 
+ load_from_cache_file (`Optional[bool]`, defaults to `True` if caching is enabled): + If a cache file storing the current computation from `function` + can be identified, use it instead of recomputing. + cache_file_names (`[Dict[str, str]]`, *optional*, defaults to `None`): + Provide the name of a path for the cache file. It is used to store the + results of the computation instead of the automatically generated cache file name. + You have to provide one `cache_file_name` per dataset in the dataset dictionary. + writer_batch_size (`int`, default `1000`): + Number of rows per write operation for the cache file writer. + This value is a good trade-off between memory usage during the processing, and processing speed. + Higher value makes the processing do fewer lookups, lower value consume less temporary memory while running `map`. + features (`[datasets.Features]`, *optional*, defaults to `None`): + Use a specific [`Features`] to store the cache file + instead of the automatically generated one. + disable_nullable (`bool`, defaults to `False`): + Disallow null values in the table. + fn_kwargs (`Dict`, *optional*, defaults to `None`): + Keyword arguments to be passed to `function` + num_proc (`int`, *optional*, defaults to `None`): + The number of processes to use for multiprocessing. + - If `None` or `0`, no multiprocessing is used and the operation runs in the main process. + - If greater than `1`, one or multiple worker processes are used to process data in parallel. + Note: The function passed to `map()` must be picklable for multiprocessing to work correctly + (i.e., prefer functions defined at the top level of a module, not inside another function or class). + desc (`str`, *optional*, defaults to `None`): + Meaningful description to be displayed alongside with the progress bar while mapping examples. + try_original_type (`Optional[bool]`, defaults to `True`): + Try to keep the types of the original columns (e.g. int32 -> int32). 
+ Set to False if you want to always infer new types. + + Example: + + ```py + >>> from datasets import load_dataset + >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes") + >>> def add_prefix(example): + ... example["text"] = "Review: " + example["text"] + ... return example + >>> ds = ds.map(add_prefix) + >>> ds["train"][0:3]["text"] + ['Review: the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .', + 'Review: the gorgeously elaborate continuation of " the lord of the rings " trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson\'s expanded vision of j . r . r . tolkien\'s middle-earth .', + 'Review: effective but too-tepid biopic'] + + # process a batch of examples + >>> ds = ds.map(lambda example: tokenizer(example["text"]), batched=True) + # set number of processors + >>> ds = ds.map(add_prefix, num_proc=4) + ``` + """ + self._check_values_type() + if cache_file_names is None: + cache_file_names = dict.fromkeys(self) + + dataset_dict = {} + for split, dataset in self.items(): + if with_split: + function = bind(function, split) + + dataset_dict[split] = dataset.map( + function=function, + with_indices=with_indices, + with_rank=with_rank, + input_columns=input_columns, + batched=batched, + batch_size=batch_size, + drop_last_batch=drop_last_batch, + remove_columns=remove_columns, + keep_in_memory=keep_in_memory, + load_from_cache_file=load_from_cache_file, + cache_file_name=cache_file_names[split], + writer_batch_size=writer_batch_size, + features=features, + disable_nullable=disable_nullable, + fn_kwargs=fn_kwargs, + num_proc=num_proc, + desc=desc, + try_original_type=try_original_type, + ) + + if with_split: + function = function.func + + return DatasetDict(dataset_dict) + + def filter( + self, + function: Optional[Callable] = None, + with_indices: bool = False, + with_rank: 
bool = False, + input_columns: Optional[Union[str, list[str]]] = None, + batched: bool = False, + batch_size: Optional[int] = 1000, + keep_in_memory: bool = False, + load_from_cache_file: Optional[bool] = None, + cache_file_names: Optional[dict[str, Optional[str]]] = None, + writer_batch_size: Optional[int] = 1000, + fn_kwargs: Optional[dict] = None, + num_proc: Optional[int] = None, + desc: Optional[str] = None, + ) -> "DatasetDict": + """Apply a filter function to all the elements in the table in batches + and update the table so that the dataset only includes examples according to the filter function. + The transformation is applied to all the datasets of the dataset dictionary. + + Args: + function (`Callable`): Callable with one of the following signatures: + + - `function(example: Dict[str, Any]) -> bool` if `batched=False` and `with_indices=False` and `with_rank=False` + - `function(example: Dict[str, Any], *extra_args) -> bool` if `batched=False` and `with_indices=True` and/or `with_rank=True` (one extra arg for each) + - `function(batch: Dict[str, list]) -> list[bool]` if `batched=True` and `with_indices=False` and `with_rank=False` + - `function(batch: Dict[str, list], *extra_args) -> list[bool]` if `batched=True` and `with_indices=True` and/or `with_rank=True` (one extra arg for each) + + If no function is provided, defaults to an always `True` function: `lambda x: True`. + with_indices (`bool`, defaults to `False`): + Provide example indices to `function`. Note that in this case the + signature of `function` should be `def function(example, idx[, rank]): ...`. + with_rank (`bool`, defaults to `False`): + Provide process rank to `function`. Note that in this case the + signature of `function` should be `def function(example[, idx], rank): ...`. + input_columns (`[Union[str, list[str]]]`, *optional*, defaults to `None`): + The columns to be passed into `function` as + positional arguments. 
If `None`, a dict mapping to all formatted columns is passed as one argument. + batched (`bool`, defaults to `False`): + Provide batch of examples to `function`. + batch_size (`int`, *optional*, defaults to `1000`): + Number of examples per batch provided to `function` if `batched=True` + `batch_size <= 0` or `batch_size == None` then provide the full dataset as a single batch to `function`. + keep_in_memory (`bool`, defaults to `False`): + Keep the dataset in memory instead of writing it to a cache file. + load_from_cache_file (`Optional[bool]`, defaults to `True` if caching is enabled): + If a cache file storing the current computation from `function` + can be identified, use it instead of recomputing. + cache_file_names (`[Dict[str, str]]`, *optional*, defaults to `None`): + Provide the name of a path for the cache file. It is used to store the + results of the computation instead of the automatically generated cache file name. + You have to provide one `cache_file_name` per dataset in the dataset dictionary. + writer_batch_size (`int`, defaults to `1000`): + Number of rows per write operation for the cache file writer. + This value is a good trade-off between memory usage during the processing, and processing speed. + Higher value makes the processing do fewer lookups, lower value consume less temporary memory while running `map`. + fn_kwargs (`Dict`, *optional*, defaults to `None`): + Keyword arguments to be passed to `function` + num_proc (`int`, *optional*, defaults to `None`): + The number of processes to use for multiprocessing. + - If `None` or `0`, no multiprocessing is used and the operation runs in the main process. + - If greater than `1`, one or multiple worker processes are used to process data in parallel. + Note: The function passed to `map()` must be picklable for multiprocessing to work correctly + (i.e., prefer functions defined at the top level of a module, not inside another function or class). 
+ desc (`str`, *optional*, defaults to `None`): + Meaningful description to be displayed alongside with the progress bar while filtering examples. + + Example: + + ```py + >>> from datasets import load_dataset + >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes") + >>> ds.filter(lambda x: x["label"] == 1) + DatasetDict({ + train: Dataset({ + features: ['text', 'label'], + num_rows: 4265 + }) + validation: Dataset({ + features: ['text', 'label'], + num_rows: 533 + }) + test: Dataset({ + features: ['text', 'label'], + num_rows: 533 + }) + }) + ``` + """ + self._check_values_type() + if cache_file_names is None: + cache_file_names = dict.fromkeys(self) + return DatasetDict( + { + k: dataset.filter( + function=function, + with_indices=with_indices, + with_rank=with_rank, + input_columns=input_columns, + batched=batched, + batch_size=batch_size, + keep_in_memory=keep_in_memory, + load_from_cache_file=load_from_cache_file, + cache_file_name=cache_file_names[k], + writer_batch_size=writer_batch_size, + fn_kwargs=fn_kwargs, + num_proc=num_proc, + desc=desc, + ) + for k, dataset in self.items() + } + ) + + def flatten_indices( + self, + keep_in_memory: bool = False, + cache_file_names: Optional[dict[str, Optional[str]]] = None, + writer_batch_size: Optional[int] = 1000, + features: Optional[Features] = None, + disable_nullable: bool = False, + num_proc: Optional[int] = None, + new_fingerprint: Optional[str] = None, + ) -> "DatasetDict": + """Create and cache a new Dataset by flattening the indices mapping. + + Args: + keep_in_memory (`bool`, defaults to `False`): + Keep the dataset in memory instead of writing it to a cache file. + cache_file_names (`Dict[str, str]`, *optional*, default `None`): + Provide the name of a path for the cache file. It is used to store the + results of the computation instead of the automatically generated cache file name. + You have to provide one `cache_file_name` per dataset in the dataset dictionary. 
+ writer_batch_size (`int`, defaults to `1000`): + Number of rows per write operation for the cache file writer. + This value is a good trade-off between memory usage during the processing, and processing speed. + Higher value makes the processing do fewer lookups, lower value consume less temporary memory while running `map`. + features (`Optional[datasets.Features]`, defaults to `None`): + Use a specific [`Features`] to store the cache file + instead of the automatically generated one. + disable_nullable (`bool`, defaults to `False`): + Disallow null values in the table. + num_proc (`int`, optional, default `None`): + Max number of processes when generating cache. Already cached shards are loaded sequentially + new_fingerprint (`str`, *optional*, defaults to `None`): + The new fingerprint of the dataset after transform. + If `None`, the new fingerprint is computed using a hash of the previous fingerprint, and the transform arguments + """ + self._check_values_type() + if cache_file_names is None: + cache_file_names = dict.fromkeys(self) + return DatasetDict( + { + k: dataset.flatten_indices( + keep_in_memory=keep_in_memory, + cache_file_name=cache_file_names[k], + writer_batch_size=writer_batch_size, + features=features, + disable_nullable=disable_nullable, + num_proc=num_proc, + new_fingerprint=new_fingerprint, + ) + for k, dataset in self.items() + } + ) + + def sort( + self, + column_names: Union[str, Sequence[str]], + reverse: Union[bool, Sequence[bool]] = False, + null_placement: str = "at_end", + keep_in_memory: bool = False, + load_from_cache_file: Optional[bool] = None, + indices_cache_file_names: Optional[dict[str, Optional[str]]] = None, + writer_batch_size: Optional[int] = 1000, + ) -> "DatasetDict": + """Create a new dataset sorted according to a single or multiple columns. + + Args: + column_names (`Union[str, Sequence[str]]`): + Column name(s) to sort by. 
+ reverse (`Union[bool, Sequence[bool]]`, defaults to `False`): + If `True`, sort by descending order rather than ascending. If a single bool is provided, + the value is applied to the sorting of all column names. Otherwise a list of bools with the + same length and order as column_names must be provided. + null_placement (`str`, defaults to `at_end`): + Put `None` values at the beginning if `at_start` or `first` or at the end if `at_end` or `last` + keep_in_memory (`bool`, defaults to `False`): + Keep the sorted indices in memory instead of writing it to a cache file. + load_from_cache_file (`Optional[bool]`, defaults to `True` if caching is enabled): + If a cache file storing the sorted indices + can be identified, use it instead of recomputing. + indices_cache_file_names (`[Dict[str, str]]`, *optional*, defaults to `None`): + Provide the name of a path for the cache file. It is used to store the + indices mapping instead of the automatically generated cache file name. + You have to provide one `cache_file_name` per dataset in the dataset dictionary. + writer_batch_size (`int`, defaults to `1000`): + Number of rows per write operation for the cache file writer. + Higher value gives smaller cache files, lower value consume less temporary memory. 
+ + Example: + + ```py + >>> from datasets import load_dataset + >>> ds = load_dataset('cornell-movie-review-data/rotten_tomatoes') + >>> ds['train']['label'][:10] + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1] + >>> sorted_ds = ds.sort('label') + >>> sorted_ds['train']['label'][:10] + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0] + >>> another_sorted_ds = ds.sort(['label', 'text'], reverse=[True, False]) + >>> another_sorted_ds['train']['label'][:10] + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1] + ``` + """ + self._check_values_type() + if indices_cache_file_names is None: + indices_cache_file_names = dict.fromkeys(self) + return DatasetDict( + { + k: dataset.sort( + column_names=column_names, + reverse=reverse, + null_placement=null_placement, + keep_in_memory=keep_in_memory, + load_from_cache_file=load_from_cache_file, + indices_cache_file_name=indices_cache_file_names[k], + writer_batch_size=writer_batch_size, + ) + for k, dataset in self.items() + } + ) + + def shuffle( + self, + seeds: Optional[Union[int, dict[str, Optional[int]]]] = None, + seed: Optional[int] = None, + generators: Optional[dict[str, np.random.Generator]] = None, + keep_in_memory: bool = False, + load_from_cache_file: Optional[bool] = None, + indices_cache_file_names: Optional[dict[str, Optional[str]]] = None, + writer_batch_size: Optional[int] = 1000, + ) -> "DatasetDict": + """Create a new Dataset where the rows are shuffled. + + The transformation is applied to all the datasets of the dataset dictionary. + + Currently shuffling uses numpy random generators. + You can either supply a NumPy BitGenerator to use, or a seed to initiate NumPy's default random generator (PCG64). + + Args: + seeds (`Dict[str, int]` or `int`, *optional*): + A seed to initialize the default BitGenerator if `generator=None`. + If `None`, then fresh, unpredictable entropy will be pulled from the OS. + If an `int` or `array_like[ints]` is passed, then it will be passed to SeedSequence to derive the initial BitGenerator state. 
+ You can provide one `seed` per dataset in the dataset dictionary. + seed (`int`, *optional*): + A seed to initialize the default BitGenerator if `generator=None`. Alias for seeds (a `ValueError` is raised if both are provided). + generators (`Dict[str, np.random.Generator]`, *optional*): + Numpy random Generator to use to compute the permutation of the dataset rows. + If `generator=None` (default), uses `np.random.default_rng` (the default BitGenerator (PCG64) of NumPy). + You have to provide one `generator` per dataset in the dataset dictionary. + keep_in_memory (`bool`, defaults to `False`): + Keep the dataset in memory instead of writing it to a cache file. + load_from_cache_file (`Optional[bool]`, defaults to `True` if caching is enabled): + If a cache file storing the current computation from `function` + can be identified, use it instead of recomputing. + indices_cache_file_names (`Dict[str, str]`, *optional*): + Provide the name of a path for the cache file. It is used to store the + indices mappings instead of the automatically generated cache file name. + You have to provide one `cache_file_name` per dataset in the dataset dictionary. + writer_batch_size (`int`, defaults to `1000`): + Number of rows per write operation for the cache file writer. + This value is a good trade-off between memory usage during the processing, and processing speed. + Higher value makes the processing do fewer lookups, lower value consume less temporary memory while running `map`. 
def save_to_disk(
    self,
    dataset_dict_path: PathLike,
    max_shard_size: Optional[Union[str, int]] = None,
    num_shards: Optional[dict[str, int]] = None,
    num_proc: Optional[int] = None,
    storage_options: Optional[dict] = None,
):
    """
    Saves a dataset dict to a filesystem using `fsspec.spec.AbstractFileSystem`.

    For [`Image`], [`Audio`] and [`Video`] data:

    All the Image(), Audio() and Video() data are stored in the arrow files.
    If you want to store paths or urls, please use the Value("string") type.

    Args:
        dataset_dict_path (`path-like`):
            Path (e.g. `dataset/train`) or remote URI (e.g. `s3://my-bucket/dataset/train`)
            of the dataset dict directory where the dataset dict will be saved to.
        max_shard_size (`int` or `str`, *optional*, defaults to `"500MB"`):
            The maximum size of the dataset shards to be saved to the filesystem.
            If expressed as a string, needs to be digits followed by a unit
            (like `"50MB"`).
        num_shards (`Dict[str, int]`, *optional*):
            Number of shards to write. By default the number of shards depends on
            `max_shard_size` and `num_proc`.
            Use a dictionary to define a different `num_shards` for each split.
            Splits missing from the dictionary are saved with `num_shards=None`.
        num_proc (`int`, *optional*, default `None`):
            Number of processes when downloading and generating the dataset locally.
            Multiprocessing is disabled by default.
        storage_options (`dict`, *optional*):
            Key/value pairs to be passed on to the file-system backend, if any.

    Raises:
        ValueError: If `num_shards` is provided but is not a dictionary.

    Example:

    ```python
    >>> dataset_dict.save_to_disk("path/to/dataset/directory")
    >>> dataset_dict.save_to_disk("path/to/dataset/directory", max_shard_size="1GB")
    >>> dataset_dict.save_to_disk("path/to/dataset/directory", num_shards={"train": 1024, "test": 8})
    ```
    """
    # Validate `num_shards` before touching the filesystem so that an invalid
    # call fails without creating any directory or file.
    if num_shards is None:
        # One `None` entry per split; `None` is forwarded as-is to each split's `save_to_disk`.
        num_shards = dict.fromkeys(self)
    elif not isinstance(num_shards, dict):
        raise ValueError(
            "Please provide one `num_shards` per dataset in the dataset dictionary, e.g. {'train': 128, 'test': 4}"
        )

    fs: fsspec.AbstractFileSystem
    fs, _ = url_to_fs(dataset_dict_path, **(storage_options or {}))

    fs.makedirs(dataset_dict_path, exist_ok=True)

    # Write the manifest listing the split names (in dictionary order) next to the split directories.
    with fs.open(
        posixpath.join(dataset_dict_path, config.DATASETDICT_JSON_FILENAME),
        "w",
        encoding="utf-8",
    ) as f:
        json.dump({"splits": list(self)}, f)

    # Each split is saved into its own sub-directory named after the split key.
    for k, dataset in self.items():
        dataset.save_to_disk(
            posixpath.join(dataset_dict_path, k),
            num_shards=num_shards.get(k),
            max_shard_size=max_shard_size,
            num_proc=num_proc,
            storage_options=storage_options,
        )
+ + + + Returns: + [`DatasetDict`] + + Example: + + ```py + >>> ds = load_from_disk('path/to/dataset/directory') + ``` + """ + fs: fsspec.AbstractFileSystem + fs, dataset_dict_path = url_to_fs(dataset_dict_path, **(storage_options or {})) + + dataset_dict_json_path = posixpath.join(dataset_dict_path, config.DATASETDICT_JSON_FILENAME) + dataset_state_json_path = posixpath.join(dataset_dict_path, config.DATASET_STATE_JSON_FILENAME) + dataset_info_path = posixpath.join(dataset_dict_path, config.DATASET_INFO_FILENAME) + if not fs.isfile(dataset_dict_json_path): + if fs.isfile(dataset_info_path) and fs.isfile(dataset_state_json_path): + raise FileNotFoundError( + f"No such file: '{dataset_dict_json_path}'. Expected to load a `DatasetDict` object, but got a `Dataset`. Please use either `datasets.load_from_disk` or `Dataset.load_from_disk` instead." + ) + raise FileNotFoundError( + f"No such file: '{dataset_dict_json_path}'. Expected to load a `DatasetDict` object, but provided path is not a `DatasetDict`." + ) + + with fs.open(dataset_dict_json_path, "r", encoding="utf-8") as f: + splits = json.load(f)["splits"] + + dataset_dict = DatasetDict() + for k in splits: + dataset_dict_split_path = posixpath.join(fs.unstrip_protocol(dataset_dict_path), k) + dataset_dict[k] = Dataset.load_from_disk( + dataset_dict_split_path, + keep_in_memory=keep_in_memory, + storage_options=storage_options, + ) + return dataset_dict + + @staticmethod + def from_csv( + path_or_paths: dict[str, PathLike], + features: Optional[Features] = None, + cache_dir: str = None, + keep_in_memory: bool = False, + **kwargs, + ) -> "DatasetDict": + """Create [`DatasetDict`] from CSV file(s). + + Args: + path_or_paths (`dict` of path-like): + Path(s) of the CSV file(s). + features ([`Features`], *optional*): + Dataset features. + cache_dir (str, *optional*, defaults to `"~/.cache/huggingface/datasets"`): + Directory to cache data. 
@staticmethod
def from_parquet(
    path_or_paths: dict[str, PathLike],
    features: Optional[Features] = None,
    cache_dir: Optional[str] = None,
    keep_in_memory: bool = False,
    columns: Optional[list[str]] = None,
    **kwargs,
) -> "DatasetDict":
    """Create [`DatasetDict`] from Parquet file(s).

    Args:
        path_or_paths (`dict` of path-like):
            Path(s) of the Parquet file(s).
        features ([`Features`], *optional*):
            Dataset features.
        cache_dir (`str`, *optional*, defaults to `"~/.cache/huggingface/datasets"`):
            Directory to cache data.
        keep_in_memory (`bool`, defaults to `False`):
            Whether to copy the data in-memory.
        columns (`list[str]`, *optional*):
            If not `None`, only these columns will be read from the file.
            A column name may be a prefix of a nested field, e.g. 'a' will select
            'a.b', 'a.c', and 'a.d.e'.
        **kwargs (additional keyword arguments):
            Keyword arguments to be passed to [`ParquetConfig`].

    Returns:
        [`DatasetDict`]

    Example:

    ```py
    >>> from datasets import DatasetDict
    >>> ds = DatasetDict.from_parquet({'train': 'path/to/dataset/parquet'})
    ```
    """
    # Dynamic import to avoid circular dependency
    from .io.parquet import ParquetDatasetReader

    return ParquetDatasetReader(
        path_or_paths,
        features=features,
        cache_dir=cache_dir,
        keep_in_memory=keep_in_memory,
        columns=columns,
        **kwargs,
    ).read()
@is_documented_by(Dataset.align_labels_with_mapping)
def align_labels_with_mapping(self, label2id: dict, label_column: str) -> "DatasetDict":
    # No docstring here on purpose: the decorator is given `Dataset.align_labels_with_mapping`,
    # presumably to reuse its documentation -- confirm against `is_documented_by`.
    self._check_values_type()

    # Apply the same label alignment to every split, keeping the split names.
    aligned_splits = {}
    for split_name, split_dataset in self.items():
        aligned_splits[split_name] = split_dataset.align_labels_with_mapping(
            label2id=label2id,
            label_column=label_column,
        )
    return DatasetDict(aligned_splits)
Also accepts ``, which will default to the namespace + of the logged-in user. + config_name (`str`): + Configuration name of a dataset. Defaults to "default". + set_default (`bool`, *optional*): + Whether to set this configuration as the default one. Otherwise, the default configuration is the one + named "default". + data_dir (`str`, *optional*): + Directory name that will contain the uploaded data files. Defaults to the `config_name` if different + from "default", else "data". + + + commit_message (`str`, *optional*): + Message to commit while pushing. Will default to `"Upload dataset"`. + commit_description (`str`, *optional*): + Description of the commit that will be created. + Additionally, description of the PR if a PR is created (`create_pr` is True). + + + private (`bool`, *optional*): + Whether to make the repo private. If `None` (default), the repo will be public unless the + organization's default is private. This value is ignored if the repo already exists. + token (`str`, *optional*): + An optional authentication token for the Hugging Face Hub. If no token is passed, will default + to the token saved locally when logging in with `huggingface-cli login`. Will raise an error + if no token is passed and the user is not logged-in. + revision (`str`, *optional*): + Branch to push the uploaded files to. Defaults to the `"main"` branch. + + + create_pr (`bool`, *optional*, defaults to `False`): + Whether to create a PR with the uploaded files or directly commit. + + + max_shard_size (`int` or `str`, *optional*, defaults to `"500MB"`): + The maximum size of the dataset shards to be uploaded to the hub. If expressed as a string, needs to be digits followed by a unit + (like `"500MB"` or `"1GB"`). + num_shards (`Dict[str, int]`, *optional*): + Number of shards to write. By default, the number of shards depends on `max_shard_size`. + Use a dictionary to define a different num_shards for each split. 
+ + + embed_external_files (`bool`, defaults to `True`): + Whether to embed file bytes in the shards. + In particular, this will do the following before the push for the fields of type: + + - [`Audio`] and [`Image`] removes local path information and embed file content in the Parquet files. + num_proc (`int`, *optional*, defaults to `None`): + Number of processes when preparing and uploading the dataset. + This is helpful if the dataset is made of many samples or media files to embed. + Multiprocessing is disabled by default. + + + + Return: + huggingface_hub.CommitInfo + + Example: + + ```python + >>> dataset_dict.push_to_hub("/") + >>> dataset_dict.push_to_hub("/", private=True) + >>> dataset_dict.push_to_hub("/", max_shard_size="1GB") + >>> dataset_dict.push_to_hub("/", num_shards={"train": 1024, "test": 8}) + ``` + + If you want to add a new configuration (or subset) to a dataset (e.g. if the dataset has multiple tasks/versions/languages): + + ```python + >>> english_dataset.push_to_hub("/", "en") + >>> french_dataset.push_to_hub("/", "fr") + >>> # later + >>> english_dataset = load_dataset("/", "en") + >>> french_dataset = load_dataset("/", "fr") + ``` + """ + if num_shards is None: + num_shards = dict.fromkeys(self) + elif not isinstance(num_shards, dict): + raise ValueError( + "Please provide one `num_shards` per dataset in the dataset dictionary, e.g. 
{{'train': 128, 'test': 4}}" + ) + + self._check_values_type() + self._check_values_features() + total_uploaded_size = 0 + total_dataset_nbytes = 0 + info_to_dump: DatasetInfo = next(iter(self.values())).info.copy() + info_to_dump.config_name = config_name + info_to_dump.splits = SplitDict() + + for split in self.keys(): + if not re.match(_split_re, split): + raise ValueError(f"Split name should match '{_split_re}' but got '{split}'.") + + api = HfApi(endpoint=config.HF_ENDPOINT, token=token) + + try: + repo_id = api.repo_info(repo_id, repo_type="dataset").id + except RepositoryNotFoundError: + repo_url = api.create_repo( + repo_id, + repo_type="dataset", + private=private, + exist_ok=True, + ) + repo_id = repo_url.repo_id + + if revision is not None and not revision.startswith("refs/pr/"): + # We do not call create_branch for a PR reference: 400 Bad Request + api.create_branch( + repo_id, + branch=revision, + token=token, + repo_type="dataset", + exist_ok=True, + ) + + if not data_dir: + data_dir = config_name if config_name != "default" else "data" # for backward compatibility + + additions = [] + for split in self.keys(): + logger.info(f"Pushing split {split} to the Hub.") + # The split=key needs to be removed before merging + split_additions, uploaded_size, dataset_nbytes = self[split]._push_parquet_shards_to_hub( + repo_id, + data_dir=data_dir, + split=split, + token=token, + revision=revision, + create_pr=create_pr, + max_shard_size=max_shard_size, + num_shards=num_shards.get(split), + embed_external_files=embed_external_files, + num_proc=num_proc, + ) + additions += split_additions + total_uploaded_size += uploaded_size + total_dataset_nbytes += dataset_nbytes + info_to_dump.splits[split] = SplitInfo(str(split), num_bytes=dataset_nbytes, num_examples=len(self[split])) + info_to_dump.download_checksums = None + info_to_dump.download_size = total_uploaded_size + info_to_dump.dataset_size = total_dataset_nbytes + info_to_dump.size_in_bytes = total_uploaded_size 
+ total_dataset_nbytes + + def get_deletions_and_dataset_card() -> tuple[str, list[CommitOperationDelete], str, Optional[str]]: + parent_commit = api.repo_info(repo_id, repo_type="dataset", revision=revision).sha + + # Check if the repo already has a README.md and/or a dataset_infos.json to update them with the new split info (size and pattern) + # and delete old split shards (if they exist) + repo_with_dataset_card, repo_with_dataset_infos = False, False + repo_splits: list[str] = [] # use a list to keep the order of the splits + deletions: list[CommitOperationDelete] = [] + repo_files_to_add = [addition.path_in_repo for addition in additions] + for repo_file in api.list_repo_tree( + repo_id=repo_id, + revision=parent_commit, + repo_type="dataset", + token=token, + recursive=True, + ): + if not isinstance(repo_file, RepoFile): + continue + if repo_file.rfilename == config.REPOCARD_FILENAME: + repo_with_dataset_card = True + elif repo_file.rfilename == config.DATASETDICT_INFOS_FILENAME: + repo_with_dataset_infos = True + elif ( + repo_file.rfilename.startswith(tuple(f"{data_dir}/{split}-" for split in self.keys())) + and repo_file.rfilename not in repo_files_to_add + ): + deletions.append(CommitOperationDelete(path_in_repo=repo_file.rfilename)) + elif fnmatch.fnmatch( + repo_file.rfilename, + PUSH_TO_HUB_WITHOUT_METADATA_CONFIGS_SPLIT_PATTERN_SHARDED.replace("{split}", "*"), + ): + pattern = glob_pattern_to_regex(PUSH_TO_HUB_WITHOUT_METADATA_CONFIGS_SPLIT_PATTERN_SHARDED) + split_pattern_fields = string_to_dict(repo_file.rfilename, pattern) + assert split_pattern_fields is not None + repo_split = split_pattern_fields["split"] + if repo_split not in repo_splits: + repo_splits.append(repo_split) + + # get the info from the README to update them + if repo_with_dataset_card: + dataset_card_path = api.hf_hub_download( + repo_id, + config.REPOCARD_FILENAME, + repo_type="dataset", + revision=parent_commit, + ) + dataset_card = DatasetCard.load(Path(dataset_card_path)) + 
dataset_card_data = dataset_card.data + metadata_configs = MetadataConfigs.from_dataset_card_data(dataset_card_data) + # get the deprecated dataset_infos.json to update them + elif repo_with_dataset_infos: + dataset_card = None + dataset_card_data = DatasetCardData() + metadata_configs = MetadataConfigs() + else: + dataset_card = None + dataset_card_data = DatasetCardData() + metadata_configs = MetadataConfigs() + # create the metadata configs if it was uploaded with push_to_hub before metadata configs existed + if not metadata_configs and repo_splits: + default_metadata_configs_to_dump = { + "data_files": [{"split": split, "path": f"data/{split}-*"} for split in repo_splits] + } + MetadataConfigs({"default": default_metadata_configs_to_dump}).to_dataset_card_data(dataset_card_data) + metadata_config_to_dump = { + "data_files": [{"split": split, "path": f"{data_dir}/{split}-*"} for split in self.keys()], + } + configs_to_dump = {config_name: metadata_config_to_dump} + if set_default and config_name != "default": + if metadata_configs: + current_default_config_name = metadata_configs.get_default_config_name() + if current_default_config_name == "default": + raise ValueError( + "There exists a configuration named 'default'. To set a different configuration as default, " + "rename the 'default' one first." 
+ ) + if current_default_config_name: + _ = metadata_configs[current_default_config_name].pop("default") + configs_to_dump[current_default_config_name] = metadata_configs[current_default_config_name] + metadata_config_to_dump["default"] = True + # push to the deprecated dataset_infos.json + if repo_with_dataset_infos: + dataset_infos_path = api.hf_hub_download( + repo_id, + config.DATASETDICT_INFOS_FILENAME, + repo_type="dataset", + revision=parent_commit, + ) + with open(dataset_infos_path, encoding="utf-8") as f: + dataset_infos: dict = json.load(f) + dataset_infos[config_name] = asdict(info_to_dump) + new_dataset_infos = json.dumps(dataset_infos, indent=4) + else: + new_dataset_infos = None + # push to README + DatasetInfosDict({config_name: info_to_dump}).to_dataset_card_data(dataset_card_data) + MetadataConfigs(configs_to_dump).to_dataset_card_data(dataset_card_data) + new_dataset_card = ( + DatasetCard(f"---\n{dataset_card_data}\n---\n") if dataset_card is None else dataset_card + ) + return parent_commit, deletions, new_dataset_card, new_dataset_infos + + commit_message = commit_message if commit_message is not None else "Upload dataset" + if len(additions) > config.UPLOADS_MAX_NUMBER_PER_COMMIT: + logger.info( + f"Number of files to upload is larger than {config.UPLOADS_MAX_NUMBER_PER_COMMIT}. Splitting the push into multiple commits." 
+ ) + num_commits = math.ceil(len(additions) / config.UPLOADS_MAX_NUMBER_PER_COMMIT) + for i in range(0, num_commits): + operations = additions[ + i * config.UPLOADS_MAX_NUMBER_PER_COMMIT : (i + 1) * config.UPLOADS_MAX_NUMBER_PER_COMMIT + ] + for retry, sleep_time in enumerate(itertools.chain(range(10), itertools.repeat(30)), start=1): + # We need to retry if another commit happens at the same time + sleep_time *= 1 + random.random() + try: + commit_info = api.create_commit( + repo_id, + operations=operations, + commit_message=commit_message + f" (part {i:05d}-of-{num_commits:05d})", + commit_description=commit_description, + repo_type="dataset", + revision=revision, + create_pr=create_pr, + ) + except HfHubHTTPError as err: + if ( + err.__context__ + and isinstance(err.__context__, HfHubHTTPError) + and err.__context__.response.status_code == 409 + ): + # 409 is Conflict (another commit is in progress) + time.sleep(sleep_time) + logger.info( + f"Retrying intermediate commit for {repo_id}, {config_name} ({retry}/n with status_code {err.__context__.response.status_code})" + ) + continue + else: + raise + break + logger.info( + f"Commit #{i + 1} completed" + + (f" (still {num_commits - i - 1} to go)" if num_commits - i - 1 else "") + + "." 
def _check_values_type(self):
    """Ensure every value stored in this dictionary is an `IterableDataset`.

    Raises:
        TypeError: If at least one value is not an `IterableDataset` instance.
    """
    for dataset in self.values():
        if not isinstance(dataset, IterableDataset):
            # Name the actual container and expected value type so the message is not
            # confused with the equivalent check on the non-iterable dataset dictionary.
            raise TypeError(
                f"Values in `IterableDatasetDict` should be of type `IterableDataset` but got type '{type(dataset)}'"
            )
@property
def num_columns(self) -> dict[str, Optional[int]]:
    """Number of columns in each split of the dataset.
    This can contain `None` values if some splits have unknown features (e.g. after a map() operation).

    Example:

    ```py
    >>> from datasets import load_dataset
    >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes")
    >>> ds.num_columns
    {'test': 2, 'train': 2, 'validation': 2}
    ```
    """
    self._check_values_type()

    # Collect the per-split column count, preserving the order of the splits.
    counts_by_split = {}
    for split_name, split_dataset in self.items():
        counts_by_split[split_name] = split_dataset.num_columns
    return counts_by_split
def map(
    self,
    function: Optional[Callable] = None,
    with_indices: bool = False,
    with_split: bool = False,
    input_columns: Optional[Union[str, list[str]]] = None,
    batched: bool = False,
    batch_size: int = 1000,
    drop_last_batch: bool = False,
    remove_columns: Optional[Union[str, list[str]]] = None,
    fn_kwargs: Optional[dict] = None,
) -> "IterableDatasetDict":
    """
    Apply a function to all the examples in the iterable dataset (individually or in batches) and update them.
    If your function returns a column that already exists, then it overwrites it.
    The function is applied on-the-fly on the examples when iterating over the dataset.
    The transformation is applied to all the datasets of the dataset dictionary.

    You can specify whether the function should be batched or not with the `batched` parameter:

    - If batched is `False`, then the function takes 1 example in and should return 1 example.
      An example is a dictionary, e.g. `{"text": "Hello there !"}`.
    - If batched is `True` and `batch_size` is 1, then the function takes a batch of 1 example as input and can return a batch with 1 or more examples.
      A batch is a dictionary, e.g. a batch of 1 example is `{"text": ["Hello there !"]}`.
    - If batched is `True` and `batch_size` is `n` > 1, then the function takes a batch of `n` examples as input and can return a batch with `n` examples, or with an arbitrary number of examples.
      Note that the last batch may have less than `n` examples.
      A batch is a dictionary, e.g. a batch of `n` examples is `{"text": ["Hello there !"] * n}`.

    If the function is asynchronous, then `map` will run your function in parallel, with up to one thousand simultaneous calls.
    It is recommended to use a `asyncio.Semaphore` in your function if you want to set a maximum number of operations that can run at the same time.

    Args:
        function (`Callable`, *optional*, defaults to `None`):
            Function applied on-the-fly on the examples when you iterate on the dataset.
            It must have one of the following signatures:

            - `function(example: Dict[str, Any]) -> Dict[str, Any]` if `batched=False` and `with_indices=False`
            - `function(example: Dict[str, Any], idx: int) -> Dict[str, Any]` if `batched=False` and `with_indices=True`
            - `function(batch: Dict[str, list]) -> Dict[str, list]` if `batched=True` and `with_indices=False`
            - `function(batch: Dict[str, list], indices: list[int]) -> Dict[str, list]` if `batched=True` and `with_indices=True`

            For advanced usage, the function can also return a `pyarrow.Table`.
            If the function is asynchronous, then `map` will run your function in parallel.
            Moreover if your function returns nothing (`None`), then `map` will run your function and return the dataset unchanged.
            If no function is provided, default to identity function: `lambda x: x`.
        with_indices (`bool`, defaults to `False`):
            Provide example indices to `function`. Note that in this case the signature of `function` should be `def function(example, idx[, rank]): ...`.
        with_split (`bool`, defaults to `False`):
            Provide the name of the split being processed to `function`. For each split, the
            function handed to that split's `map` is `bind(function, split)`.
        input_columns (`Union[str, list[str]]`, *optional*, defaults to `None`):
            The columns to be passed into `function`
            as positional arguments. If `None`, a dict mapping to all formatted columns is passed as one argument.
        batched (`bool`, defaults to `False`):
            Provide batch of examples to `function`.
        batch_size (`int`, *optional*, defaults to `1000`):
            Number of examples per batch provided to `function` if `batched=True`.
        drop_last_batch (`bool`, defaults to `False`):
            Whether a last batch smaller than the `batch_size` should be
            dropped instead of being processed by the function.
        remove_columns (`Union[str, list[str]]`, *optional*, defaults to `None`):
            Remove a selection of columns while doing the mapping.
            Columns will be removed before updating the examples with the output of `function`, i.e. if `function` is adding
            columns with names in `remove_columns`, these columns will be kept.
        fn_kwargs (`Dict`, *optional*, defaults to `None`):
            Keyword arguments to be passed to `function`

    Returns:
        [`IterableDatasetDict`]: a new dictionary with the same split names, each mapped split
        being the result of calling `map` on the corresponding dataset.

    Example:

    ```py
    >>> from datasets import load_dataset
    >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", streaming=True)
    >>> def add_prefix(example):
    ...     example["text"] = "Review: " + example["text"]
    ...     return example
    >>> ds = ds.map(add_prefix)
    >>> next(iter(ds["train"]))
    {'label': 1,
     'text': 'Review: the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .'}
    ```
    """
    dataset_dict = {}
    for split, dataset in self.items():
        # Build the per-split callable in a local name instead of rebinding the `function`
        # argument, so the caller's function never has to be recovered through `.func`.
        split_function = bind(function, split) if with_split else function

        dataset_dict[split] = dataset.map(
            function=split_function,
            with_indices=with_indices,
            input_columns=input_columns,
            batched=batched,
            batch_size=batch_size,
            drop_last_batch=drop_last_batch,
            remove_columns=remove_columns,
            fn_kwargs=fn_kwargs,
        )

    return IterableDatasetDict(dataset_dict)
+ + Args: + function (`Callable`): + Callable with one of the following signatures: + + - `function(example: Dict[str, Any]) -> bool` if `with_indices=False, batched=False` + - `function(example: Dict[str, Any], indices: int) -> bool` if `with_indices=True, batched=False` + - `function(example: Dict[str, list]) -> list[bool]` if `with_indices=False, batched=True` + - `function(example: Dict[str, list], indices: list[int]) -> list[bool]` if `with_indices=True, batched=True` + + If no function is provided, defaults to an always True function: `lambda x: True`. + with_indices (`bool`, defaults to `False`): + Provide example indices to `function`. Note that in this case the signature of `function` should be `def function(example, idx): ...`. + input_columns (`str` or `list[str]`, *optional*): + The columns to be passed into `function` as + positional arguments. If `None`, a dict mapping to all formatted columns is passed as one argument. + batched (`bool`, defaults to `False`): + Provide batch of examples to `function` + batch_size (`int`, *optional*, defaults to `1000`): + Number of examples per batch provided to `function` if `batched=True`. 
+ fn_kwargs (`Dict`, *optional*, defaults to `None`): + Keyword arguments to be passed to `function` + + Example: + + ```py + >>> from datasets import load_dataset + >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", streaming=True) + >>> ds = ds.filter(lambda x: x["label"] == 0) + >>> list(ds["train"].take(3)) + [{'label': 0, 'text': 'Review: simplistic , silly and tedious .'}, + {'label': 0, + 'text': "Review: it's so laddish and juvenile , only teenage boys could possibly find it funny ."}, + {'label': 0, + 'text': 'Review: exploitative and largely devoid of the depth or sophistication that would make watching such a graphic treatment of the crimes bearable .'}] + ``` + """ + return IterableDatasetDict( + { + k: dataset.filter( + function=function, + with_indices=with_indices, + input_columns=input_columns, + batched=batched, + batch_size=batch_size, + fn_kwargs=fn_kwargs, + ) + for k, dataset in self.items() + } + ) + + def shuffle( + self, + seed=None, + generator: Optional[np.random.Generator] = None, + buffer_size: int = 1000, + ) -> "IterableDatasetDict": + """ + Randomly shuffles the elements of this dataset. + The shuffling is applied to all the datasets of the dataset dictionary. + + This dataset fills a buffer with buffer_size elements, then randomly samples elements from this buffer, + replacing the selected elements with new elements. For perfect shuffling, a buffer size greater than or + equal to the full size of the dataset is required. + + For instance, if your dataset contains 10,000 elements but `buffer_size` is set to 1000, then `shuffle` will + initially select a random element from only the first 1000 elements in the buffer. Once an element is + selected, its space in the buffer is replaced by the next (i.e. 1,001-st) element, + maintaining the 1000 element buffer. + + If the dataset is made of several shards, it also does `shuffle` the order of the shards. 
+ However if the order has been fixed by using [`~datasets.IterableDataset.skip`] or [`~datasets.IterableDataset.take`] + then the order of the shards is kept unchanged. + + Args: + seed (`int`, *optional*, defaults to `None`): + Random seed that will be used to shuffle the dataset. + It is used to sample from the shuffle buffer and also to shuffle the data shards. + generator (`numpy.random.Generator`, *optional*): + Numpy random Generator to use to compute the permutation of the dataset rows. + If `generator=None` (default), uses `np.random.default_rng` (the default BitGenerator (PCG64) of NumPy). + buffer_size (`int`, defaults to `1000`): + Size of the buffer. + + Example: + + ```py + >>> from datasets import load_dataset + >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", streaming=True) + >>> list(ds["train"].take(3)) + [{'label': 1, + 'text': 'the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .'}, + {'label': 1, + 'text': 'the gorgeously elaborate continuation of " the lord of the rings " trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson\'s expanded vision of j . r . r . tolkien\'s middle-earth .'}, + {'label': 1, 'text': 'effective but too-tepid biopic'}] + >>> ds = ds.shuffle(seed=42) + >>> list(ds["train"].take(3)) + [{'label': 1, + 'text': "a sports movie with action that's exciting on the field and a story you care about off it ."}, + {'label': 1, + 'text': 'at its best , the good girl is a refreshingly adult take on adultery . . 
def rename_column(self, original_column_name: str, new_column_name: str) -> "IterableDatasetDict":
    """
    Rename a column in every split of the dataset dictionary.

    Each split's own `rename_column` is called, which moves the features associated to the original
    column under the new column name.

    Args:
        original_column_name (`str`):
            Name of the column to rename.
        new_column_name (`str`):
            New name for the column.

    Returns:
        [`IterableDatasetDict`]: A copy of the dataset with a renamed column.

    Example:

    ```py
    >>> from datasets import load_dataset
    >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", streaming=True)
    >>> ds = ds.rename_column("text", "movie_review")
    >>> next(iter(ds["train"]))
    {'label': 1,
     'movie_review': 'the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .'}
    ```
    """
    renamed_splits = {}
    for split_name, split_dataset in self.items():
        renamed_splits[split_name] = split_dataset.rename_column(
            original_column_name=original_column_name,
            new_column_name=new_column_name,
        )
    return IterableDatasetDict(renamed_splits)
def select_columns(self, column_names: Union[str, list[str]]) -> "IterableDatasetDict":
    """Select one or several column(s) in the dataset and the features
    associated to them. The selection is done on-the-fly on the examples
    when iterating over the dataset. The selection is applied to all the
    datasets of the dataset dictionary.

    Args:
        column_names (`Union[str, list[str]]`):
            Name of the column(s) to keep.

    Returns:
        [`IterableDatasetDict`]: A copy of the dataset object with only selected columns.

    Example:

    ```py
    >>> from datasets import load_dataset
    >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", streaming=True)
    >>> ds = ds.select_columns("text")
    >>> next(iter(ds["train"]))
    {'text': 'the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .'}
    ```
    """
    # Delegate to each split's own `select_columns` and keep the split names unchanged.
    return IterableDatasetDict({k: dataset.select_columns(column_names) for k, dataset in self.items()})
def push_to_hub(
    self,
    repo_id,
    config_name: str = "default",
    set_default: Optional[bool] = None,
    data_dir: Optional[str] = None,
    commit_message: Optional[str] = None,
    commit_description: Optional[str] = None,
    private: Optional[bool] = None,
    token: Optional[str] = None,
    revision: Optional[str] = None,
    create_pr: Optional[bool] = False,
    # max_shard_size: Optional[Union[int, str]] = None,  # TODO(QL): add arg
    num_shards: Optional[dict[str, int]] = None,
    embed_external_files: bool = True,
    num_proc: Optional[int] = None,
) -> CommitInfo:
    """Pushes the [`IterableDatasetDict`] to the hub as a Parquet dataset.
    The [`IterableDatasetDict`] is pushed using HTTP requests and requires neither git nor git-lfs to be installed.

    Each dataset split will be pushed independently. The pushed dataset will keep the original split names.

    The resulting Parquet files are self-contained by default: if your dataset contains [`Image`] or [`Audio`]
    data, the Parquet files will store the bytes of your images or audio files.
    You can disable this by setting `embed_external_files` to False.

    Args:
        repo_id (`str`):
            The ID of the repository to push to in the following format: `username/dataset_name` or
            `org/dataset_name`. Also accepts `dataset_name`, which will default to the namespace
            of the logged-in user.
        config_name (`str`):
            Configuration name of a dataset. Defaults to "default".
        set_default (`bool`, *optional*):
            Whether to set this configuration as the default one. Otherwise, the default configuration is the one
            named "default".
        data_dir (`str`, *optional*):
            Directory name that will contain the uploaded data files. Defaults to the `config_name` if different
            from "default", else "data".
        commit_message (`str`, *optional*):
            Message to commit while pushing. Will default to `"Upload dataset"`.
        commit_description (`str`, *optional*):
            Description of the commit that will be created.
            Additionally, description of the PR if a PR is created (`create_pr` is True).
        private (`bool`, *optional*):
            Whether to make the repo private. If `None` (default), the repo will be public unless the
            organization's default is private. This value is ignored if the repo already exists.
        token (`str`, *optional*):
            An optional authentication token for the Hugging Face Hub. If no token is passed, will default
            to the token saved locally when logging in with `huggingface-cli login`. Will raise an error
            if no token is passed and the user is not logged-in.
        revision (`str`, *optional*):
            Branch to push the uploaded files to. Defaults to the `"main"` branch.
        create_pr (`bool`, *optional*, defaults to `False`):
            Whether to create a PR with the uploaded files or directly commit.
        num_shards (`Dict[str, int]`, *optional*):
            Number of shards to write. Equals to this dataset's `.num_shards` by default.
            Use a dictionary to define a different num_shards for each split.
        embed_external_files (`bool`, defaults to `True`):
            Whether to embed file bytes in the shards.
            In particular, this will do the following before the push for the fields of type:

            - [`Audio`] and [`Image`] removes local path information and embed file content in the Parquet files.
        num_proc (`int`, *optional*, defaults to `None`):
            Number of processes when preparing and uploading the dataset.
            This is helpful if the dataset is made of many samples or media files to embed.
            Multiprocessing is disabled by default.

    Return:
        huggingface_hub.CommitInfo

    Raises:
        ValueError: If `num_shards` is given but is not a dictionary, if a split name does not match
            the expected split pattern, or if `set_default` is requested while the repository already
            has a default configuration named "default".

    Example:

    ```python
    >>> dataset_dict.push_to_hub("my-org/my-dataset")
    >>> dataset_dict.push_to_hub("my-org/my-dataset", private=True)
    >>> dataset_dict.push_to_hub("my-org/my-dataset", num_shards={"train": 1024, "test": 8})
    ```

    If you want to add a new configuration (or subset) to a dataset (e.g. if the dataset has multiple tasks/versions/languages):

    ```python
    >>> english_dataset.push_to_hub("my-org/my-dataset", "en")
    >>> french_dataset.push_to_hub("my-org/my-dataset", "fr")
    >>> # later
    >>> english_dataset = load_dataset("my-org/my-dataset", "en")
    >>> french_dataset = load_dataset("my-org/my-dataset", "fr")
    ```
    """
    if num_shards is None:
        # One entry per split, each mapped to None (i.e. let every split pick its own default).
        num_shards = dict.fromkeys(self)
    elif not isinstance(num_shards, dict):
        # Plain string (not an f-string): braces must NOT be doubled or they show up literally.
        raise ValueError(
            "Please provide one `num_shards` per dataset in the dataset dictionary, e.g. {'train': 128, 'test': 4}"
        )

    self._check_values_type()
    self._check_values_features()
    total_uploaded_size = 0
    total_dataset_nbytes = 0
    # Start from a copy of the first split's info; the per-split details are filled in below.
    info_to_dump: DatasetInfo = next(iter(self.values())).info.copy()
    info_to_dump.config_name = config_name
    info_to_dump.splits = SplitDict()

    for split in self.keys():
        if not re.match(_split_re, split):
            raise ValueError(f"Split name should match '{_split_re}' but got '{split}'.")

    api = HfApi(endpoint=config.HF_ENDPOINT, token=token)

    # Resolve the canonical repo id, creating the repository when it does not exist yet.
    try:
        repo_id = api.repo_info(repo_id, repo_type="dataset").id
    except RepositoryNotFoundError:
        repo_url = api.create_repo(
            repo_id,
            repo_type="dataset",
            private=private,
            exist_ok=True,
        )
        repo_id = repo_url.repo_id

    if revision is not None and not revision.startswith("refs/pr/"):
        # We do not call create_branch for a PR reference: 400 Bad Request
        api.create_branch(
            repo_id,
            branch=revision,
            token=token,
            repo_type="dataset",
            exist_ok=True,
        )

    if not data_dir:
        data_dir = config_name if config_name != "default" else "data"  # for backward compatibility

    # Upload the Parquet shards of every split and accumulate size statistics.
    additions = []
    for split in self.keys():
        logger.info(f"Pushing split {split} to the Hub.")
        # The split=key needs to be removed before merging
        split_additions, uploaded_size, dataset_nbytes, num_examples = self[split]._push_parquet_shards_to_hub(
            repo_id,
            data_dir=data_dir,
            split=split,
            token=token,
            revision=revision,
            create_pr=create_pr,
            # max_shard_size=max_shard_size,  # TODO(QL): add arg
            num_shards=num_shards.get(split),
            embed_external_files=embed_external_files,
            num_proc=num_proc,
        )
        additions += split_additions
        total_uploaded_size += uploaded_size
        total_dataset_nbytes += dataset_nbytes
        info_to_dump.splits[split] = SplitInfo(str(split), num_bytes=dataset_nbytes, num_examples=num_examples)
    info_to_dump.download_checksums = None
    info_to_dump.download_size = total_uploaded_size
    info_to_dump.dataset_size = total_dataset_nbytes
    info_to_dump.size_in_bytes = total_uploaded_size + total_dataset_nbytes

    def get_deletions_and_dataset_card() -> tuple[str, list[CommitOperationDelete], DatasetCard, Optional[str]]:
        """Compute, against the current head of `revision`, what the final commit must contain.

        Returns the parent commit sha, the stale shard deletions, the updated dataset card and,
        when the repo has the deprecated dataset_infos.json, its updated JSON content (else `None`).
        """
        parent_commit = api.repo_info(repo_id, repo_type="dataset", revision=revision).sha

        # Check if the repo already has a README.md and/or a dataset_infos.json to update them with the new split info (size and pattern)
        # and delete old split shards (if they exist)
        repo_with_dataset_card, repo_with_dataset_infos = False, False
        repo_splits: list[str] = []  # use a list to keep the order of the splits
        deletions: list[CommitOperationDelete] = []
        repo_files_to_add = [addition.path_in_repo for addition in additions]
        for repo_file in api.list_repo_tree(
            repo_id=repo_id,
            revision=parent_commit,
            repo_type="dataset",
            token=token,
            recursive=True,
        ):
            if not isinstance(repo_file, RepoFile):
                continue
            if repo_file.rfilename == config.REPOCARD_FILENAME:
                repo_with_dataset_card = True
            elif repo_file.rfilename == config.DATASETDICT_INFOS_FILENAME:
                repo_with_dataset_infos = True
            elif (
                repo_file.rfilename.startswith(tuple(f"{data_dir}/{split}-" for split in self.keys()))
                and repo_file.rfilename not in repo_files_to_add
            ):
                # Old shard of a split being pushed that is not part of the new upload.
                deletions.append(CommitOperationDelete(path_in_repo=repo_file.rfilename))
            elif fnmatch.fnmatch(
                repo_file.rfilename,
                PUSH_TO_HUB_WITHOUT_METADATA_CONFIGS_SPLIT_PATTERN_SHARDED.replace("{split}", "*"),
            ):
                pattern = glob_pattern_to_regex(PUSH_TO_HUB_WITHOUT_METADATA_CONFIGS_SPLIT_PATTERN_SHARDED)
                split_pattern_fields = string_to_dict(repo_file.rfilename, pattern)
                assert split_pattern_fields is not None
                repo_split = split_pattern_fields["split"]
                if repo_split not in repo_splits:
                    repo_splits.append(repo_split)

        # get the info from the README to update them
        if repo_with_dataset_card:
            dataset_card_path = api.hf_hub_download(
                repo_id,
                config.REPOCARD_FILENAME,
                repo_type="dataset",
                revision=parent_commit,
            )
            dataset_card = DatasetCard.load(Path(dataset_card_path))
            dataset_card_data = dataset_card.data
            metadata_configs = MetadataConfigs.from_dataset_card_data(dataset_card_data)
        else:
            # No README yet (with or without the deprecated dataset_infos.json): start from empty card data.
            dataset_card = None
            dataset_card_data = DatasetCardData()
            metadata_configs = MetadataConfigs()
        # create the metadata configs if it was uploaded with push_to_hub before metadata configs existed
        if not metadata_configs and repo_splits:
            default_metadata_configs_to_dump = {
                "data_files": [{"split": split, "path": f"data/{split}-*"} for split in repo_splits]
            }
            MetadataConfigs({"default": default_metadata_configs_to_dump}).to_dataset_card_data(dataset_card_data)
        metadata_config_to_dump = {
            "data_files": [{"split": split, "path": f"{data_dir}/{split}-*"} for split in self.keys()],
        }
        configs_to_dump = {config_name: metadata_config_to_dump}
        if set_default and config_name != "default":
            if metadata_configs:
                current_default_config_name = metadata_configs.get_default_config_name()
                if current_default_config_name == "default":
                    raise ValueError(
                        "There exists a configuration named 'default'. To set a different configuration as default, "
                        "rename the 'default' one first."
                    )
                if current_default_config_name:
                    # Un-flag the previous default so that only one configuration is marked as default.
                    _ = metadata_configs[current_default_config_name].pop("default")
                    configs_to_dump[current_default_config_name] = metadata_configs[current_default_config_name]
            metadata_config_to_dump["default"] = True
        # push to the deprecated dataset_infos.json
        if repo_with_dataset_infos:
            dataset_infos_path = api.hf_hub_download(
                repo_id,
                config.DATASETDICT_INFOS_FILENAME,
                repo_type="dataset",
                revision=parent_commit,
            )
            with open(dataset_infos_path, encoding="utf-8") as f:
                dataset_infos: dict = json.load(f)
            dataset_infos[config_name] = asdict(info_to_dump)
            new_dataset_infos = json.dumps(dataset_infos, indent=4)
        else:
            new_dataset_infos = None
        # push to README
        DatasetInfosDict({config_name: info_to_dump}).to_dataset_card_data(dataset_card_data)
        MetadataConfigs(configs_to_dump).to_dataset_card_data(dataset_card_data)
        new_dataset_card = (
            DatasetCard(f"---\n{dataset_card_data}\n---\n") if dataset_card is None else dataset_card
        )
        return parent_commit, deletions, new_dataset_card, new_dataset_infos

    commit_message = commit_message if commit_message is not None else "Upload dataset"
    if len(additions) > config.UPLOADS_MAX_NUMBER_PER_COMMIT:
        logger.info(
            f"Number of files to upload is larger than {config.UPLOADS_MAX_NUMBER_PER_COMMIT}. Splitting the push into multiple commits."
        )
        num_commits = math.ceil(len(additions) / config.UPLOADS_MAX_NUMBER_PER_COMMIT)
        for i in range(0, num_commits):
            operations = additions[
                i * config.UPLOADS_MAX_NUMBER_PER_COMMIT : (i + 1) * config.UPLOADS_MAX_NUMBER_PER_COMMIT
            ]
            # Unbounded retry: waits 0..9 (then 30) units, each scaled by a random factor in [1, 2).
            for retry, sleep_time in enumerate(itertools.chain(range(10), itertools.repeat(30)), start=1):
                # We need to retry if another commit happens at the same time
                sleep_time *= 1 + random.random()
                try:
                    commit_info = api.create_commit(
                        repo_id,
                        operations=operations,
                        commit_message=commit_message + f" (part {i:05d}-of-{num_commits:05d})",
                        commit_description=commit_description,
                        repo_type="dataset",
                        revision=revision,
                        create_pr=create_pr,
                    )
                except HfHubHTTPError as err:
                    if (
                        err.__context__
                        and isinstance(err.__context__, HfHubHTTPError)
                        and err.__context__.response.status_code == 409
                    ):
                        # 409 is Conflict (another commit is in progress)
                        time.sleep(sleep_time)
                        logger.info(
                            f"Retrying intermediate commit for {repo_id}, {config_name} ({retry}/n with status_code {err.__context__.response.status_code})"
                        )
                        continue
                    else:
                        raise
                break
            logger.info(
                f"Commit #{i + 1} completed"
                + (f" (still {num_commits - i - 1} to go)" if num_commits - i - 1 else "")
                + "."
            )
        # Every shard went out in the intermediate commits; the final commit only carries metadata.
        last_commit_additions = []
    else:
        last_commit_additions = additions

    for retry, sleep_time in enumerate(itertools.chain(range(10), itertools.repeat(30)), start=1):
        # We need to retry if there was a commit in between in case it touched the dataset card data
        sleep_time *= 1 + random.random()
        parent_commit, deletions, dataset_card, dataset_infos = get_deletions_and_dataset_card()
        dataset_card_additions = []
        if dataset_infos:
            dataset_card_additions.append(
                CommitOperationAdd(
                    path_in_repo=config.DATASETDICT_INFOS_FILENAME,
                    path_or_fileobj=dataset_infos.encode("utf-8"),
                )
            )
        dataset_card_additions.append(
            CommitOperationAdd(path_in_repo=config.REPOCARD_FILENAME, path_or_fileobj=str(dataset_card).encode())
        )
        try:
            commit_info = api.create_commit(
                repo_id,
                operations=last_commit_additions + dataset_card_additions + deletions,
                commit_message=commit_message,
                commit_description=commit_description,
                repo_type="dataset",
                revision=revision,
                create_pr=create_pr,
                parent_commit=parent_commit,
            )
        except HfHubHTTPError as err:
            if (
                err.__context__
                and isinstance(err.__context__, HfHubHTTPError)
                and err.__context__.response.status_code in (412, 409)
            ):
                # 412 is Precondition failed (parent_commit isn't satisfied)
                # 409 is Conflict (another commit is in progress)
                time.sleep(sleep_time)
                logger.info(
                    f"Retrying commit for {repo_id}, {config_name} ({retry}/n with status_code {err.__context__.response.status_code})"
                )
                continue
            else:
                raise
        break

    return commit_info
def split_dataset_by_node(dataset: DatasetType, rank: int, world_size: int) -> DatasetType:
    """
    Return the part of `dataset` assigned to the node at rank `rank` among `world_size` nodes.

    The work is delegated to a helper chosen from the type of `dataset`.

    For map-style datasets ([`Dataset`]):

    Each node is assigned a chunk of data, e.g. rank 0 is given the first chunk of the dataset.
    To maximize data loading throughput, chunks are made of contiguous data on disk if possible.

    For iterable datasets (anything that is not a [`Dataset`]):

    If the number of shards of the dataset is a multiple of `world_size`
    (i.e. if `dataset.num_shards % world_size == 0`), then the shards are evenly assigned across the nodes,
    which is the most optimized. Otherwise, each node keeps 1 example out of `world_size`,
    skipping the other examples.

    Args:
        dataset ([`Dataset`] or [`IterableDataset`]):
            The dataset to split by node.
        rank (`int`):
            Rank of the current node.
        world_size (`int`):
            Total number of nodes.

    Returns:
        [`Dataset`] or [`IterableDataset`]: The dataset to be used on the node at rank `rank`.
    """
    # Pick the splitting helper first, then make a single call with the shared arguments.
    if isinstance(dataset, Dataset):
        split_for_node = _split_by_node_map_style_dataset
    else:
        split_for_node = _split_by_node_iterable_dataset
    return split_for_node(dataset, rank=rank, world_size=world_size)
class DatasetGenerationCastError(DatasetGenerationError):
    """Dataset generation error built from a `CastError` (data files with mismatching columns)."""

    @classmethod
    def from_cast_error(
        cls,
        cast_error: CastError,
        builder_name: str,
        gen_kwargs: dict[str, Any],
        token: Optional[Union[bool, str]],
    ) -> "DatasetGenerationCastError":
        """
        Build an error whose message explains the cast failure and, when possible, where it happened.

        Args:
            cast_error (`CastError`): The underlying error; its `details()` text is embedded in the message.
            builder_name (`str`): Name of the dataset builder, quoted in the message.
            gen_kwargs (`dict`): Generation kwargs; only tracked values (`tracked_str`, `tracked_list`,
                `TrackedIterableFromGenerator`) contribute to the message.
            token (`bool` or `str`, *optional*): Token passed to `HfFileSystem` to resolve `hf://` paths.

        Returns:
            `DatasetGenerationCastError`: The error carrying the assembled message.
        """
        tracked_types = (tracked_str, tracked_list, TrackedIterableFromGenerator)
        tracked_containers = (tracked_list, TrackedIterableFromGenerator)

        message = "An error occurred while generating the dataset"
        message += f"\n\nAll the data files must have the same columns, but at some point {cast_error.details()}"

        sources: list[str] = []
        for value in gen_kwargs.values():
            if not isinstance(value, tracked_types):
                continue
            # Follow `last_item` through nested tracked containers until reaching a leaf
            # (presumably the item being processed when the error occurred - confirm in utils.track).
            while isinstance(value, tracked_containers) and value.last_item is not None:
                value = value.last_item
            if isinstance(value, tracked_str):
                value = value.get_origin()
            if isinstance(value, str) and value.startswith("hf://"):
                resolved_path = HfFileSystem(endpoint=config.HF_ENDPOINT, token=token).resolve_path(value)
                value = "hf://" + resolved_path.unresolve()
                revision_marker = "@" + resolved_path.revision
                if revision_marker in value:
                    # Move the revision out of the path and into a human-readable suffix.
                    without_revision = value.replace(revision_marker, "", 1)
                    value = without_revision + f" (at revision {resolved_path.revision})"
            sources.append(str(value))

        if sources:
            message += f"\n\nThis happened while the {builder_name} dataset builder was generating data using\n\n{', '.join(sources)}"
        message += "\n\nPlease either edit the data files to have matching columns, or separate them into different configurations (see docs at https://hf.co/docs/hub/datasets-manual-configuration#multiple-configurations)"
        return cls(message)
tempfile +import weakref +from functools import wraps +from pathlib import Path +from typing import TYPE_CHECKING, Any, Callable, Optional, Union + +import numpy as np +import xxhash + +from . import config +from .naming import INVALID_WINDOWS_CHARACTERS_IN_PATH +from .utils._dill import dumps +from .utils.logging import get_logger + + +if TYPE_CHECKING: + from .arrow_dataset import Dataset + + +logger = get_logger(__name__) + + +# Fingerprinting allows to have one deterministic fingerprint per dataset state. +# A dataset fingerprint is updated after each transform. +# Re-running the same transforms on a dataset in a different session results in the same fingerprint. +# This is possible thanks to a custom hashing function that works with most python objects. + +# Fingerprinting is the main mechanism that enables caching. +# The caching mechanism allows to reload an existing cache file if it's already been computed. + + +################# +# Caching +################# + +_CACHING_ENABLED = True +_TEMP_DIR_FOR_TEMP_CACHE_FILES: Optional["_TempCacheDir"] = None +_DATASETS_WITH_TABLE_IN_TEMP_DIR: Optional[weakref.WeakSet] = None + + +class _TempCacheDir: + """ + A temporary directory for storing cached Arrow files with a cleanup that frees references to the Arrow files + before deleting the directory itself to avoid permission errors on Windows. + """ + + def __init__(self): + self.name = tempfile.mkdtemp(prefix=config.TEMP_CACHE_DIR_PREFIX) + self._finalizer = weakref.finalize(self, self._cleanup) + + def _cleanup(self): + for dset in get_datasets_with_cache_file_in_temp_dir(): + dset.__del__() + if os.path.exists(self.name): + try: + shutil.rmtree(self.name) + except Exception as e: + raise OSError( + f"An error occurred while trying to delete temporary cache directory {self.name}. Please delete it manually." 
+ ) from e + + def cleanup(self): + if self._finalizer.detach(): + self._cleanup() + + +def maybe_register_dataset_for_temp_dir_deletion(dataset): + """ + This function registers the datasets that have cache files in _TEMP_DIR_FOR_TEMP_CACHE_FILES in order + to properly delete them before deleting the temporary directory. + The temporary directory _TEMP_DIR_FOR_TEMP_CACHE_FILES is used when caching is disabled. + """ + if _TEMP_DIR_FOR_TEMP_CACHE_FILES is None: + return + + global _DATASETS_WITH_TABLE_IN_TEMP_DIR + if _DATASETS_WITH_TABLE_IN_TEMP_DIR is None: + _DATASETS_WITH_TABLE_IN_TEMP_DIR = weakref.WeakSet() + if any( + Path(_TEMP_DIR_FOR_TEMP_CACHE_FILES.name) in Path(cache_file["filename"]).parents + for cache_file in dataset.cache_files + ): + _DATASETS_WITH_TABLE_IN_TEMP_DIR.add(dataset) + + +def get_datasets_with_cache_file_in_temp_dir(): + return list(_DATASETS_WITH_TABLE_IN_TEMP_DIR) if _DATASETS_WITH_TABLE_IN_TEMP_DIR is not None else [] + + +def enable_caching(): + """ + When applying transforms on a dataset, the data are stored in cache files. + The caching mechanism allows to reload an existing cache file if it's already been computed. + + Reloading a dataset is possible since the cache files are named using the dataset fingerprint, which is updated + after each transform. + + If disabled, the library will no longer reload cached datasets files when applying transforms to the datasets. + More precisely, if the caching is disabled: + - cache files are always recreated + - cache files are written to a temporary directory that is deleted when session closes + - cache files are named using a random hash instead of the dataset fingerprint + - use [`~datasets.Dataset.save_to_disk`] to save a transformed dataset or it will be deleted when session closes + - caching doesn't affect [`~datasets.load_dataset`]. If you want to regenerate a dataset from scratch you should use + the `download_mode` parameter in [`~datasets.load_dataset`]. 
def disable_caching():
    """
    Disable the caching mechanism by setting the module-level `_CACHING_ENABLED` flag to `False`.

    When applying transforms on a dataset, the data are stored in cache files.
    The caching mechanism allows to reload an existing cache file if it's already been computed.

    Reloading a dataset is possible since the cache files are named using the dataset fingerprint, which is updated
    after each transform.

    Once caching is disabled, the library no longer reloads cached datasets files when applying transforms to the datasets.
    More precisely, while the caching is disabled:
    - cache files are always recreated
    - cache files are written to a temporary directory that is deleted when session closes
    - cache files are named using a random hash instead of the dataset fingerprint
    - use [`~datasets.Dataset.save_to_disk`] to save a transformed dataset or it will be deleted when session closes
    - caching doesn't affect [`~datasets.load_dataset`]. If you want to regenerate a dataset from scratch you should use
    the `download_mode` parameter in [`~datasets.load_dataset`].
    """
    global _CACHING_ENABLED
    _CACHING_ENABLED = False
class Hasher:
    """Incremental hasher for python objects: objects are serialized with `dumps` and digested with xxhash (64-bit)."""

    dispatch: dict = {}

    def __init__(self):
        self.m = xxhash.xxh64()

    @classmethod
    def hash_bytes(cls, value: Union[bytes, list[bytes]]) -> str:
        """Return the hex digest of a single `bytes` value, or of several chunks fed in order."""
        chunks = [value] if isinstance(value, bytes) else value
        digest = xxhash.xxh64()
        for chunk in chunks:
            digest.update(chunk)
        return digest.hexdigest()

    @classmethod
    def hash(cls, value: Any) -> str:
        """Return the hex digest of the serialized form of `value`."""
        serialized = dumps(value)
        return cls.hash_bytes(serialized)

    def update(self, value: Any) -> None:
        """Mix `value` into the running digest: a header naming its type, then the hash of the value."""
        type_header = f"=={type(value)}=="
        # Hash first: if serialization fails, the running digest is left untouched.
        value_hash = self.hash(value)
        self.m.update(type_header.encode("utf8"))
        self.m.update(value_hash.encode("utf-8"))

    def hexdigest(self) -> str:
        """Return the hex digest of everything passed to `update` so far."""
        return self.m.hexdigest()
def _log_hash_failure(subject: str) -> None:
    """Log that `subject` couldn't be hashed; emit the detailed warning only once while caching is enabled."""
    base_message = f"{subject} couldn't be hashed properly, a random hash was used instead."
    if not _CACHING_ENABLED:
        logger.info(f"{base_message} This doesn't affect caching since it's disabled.")
        return
    warning_key = "update_fingerprint_transform_hash_failed"
    if fingerprint_warnings.get(warning_key, False):
        # The detailed warning was already shown: stay quiet at INFO level.
        logger.info(base_message)
        return
    logger.warning(
        f"{base_message} "
        "Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. "
        "If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. "
        "This warning is only shown once. Subsequent hashing failures won't be shown."
    )
    fingerprint_warnings[warning_key] = True


def update_fingerprint(fingerprint, transform, transform_args):
    """
    Compute the fingerprint obtained after applying `transform` with `transform_args` to a dataset.

    The new fingerprint hashes, in order: the current `fingerprint`, the `transform`, then every
    (key, value) pair of `transform_args` by sorted key.

    Args:
        fingerprint: Current fingerprint of the dataset.
        transform: Identifier of the transform (anything `Hasher.update` accepts).
        transform_args (`dict`): Arguments of the transform, keyed by parameter name.

    Returns:
        `str`: The hex digest of the hasher, or a random fingerprint if the transform or one of its
        arguments couldn't be hashed (the failure is logged).
    """
    hasher = Hasher()
    hasher.update(fingerprint)
    try:
        hasher.update(transform)
    except Exception:  # various errors might raise here from pickle or dill
        # `Exception` rather than a bare `except`, so KeyboardInterrupt / SystemExit still propagate.
        _log_hash_failure(f"Transform {transform}")
        return generate_random_fingerprint()
    for key in sorted(transform_args):
        hasher.update(key)
        try:
            hasher.update(transform_args[key])
        except Exception:  # various errors might raise here from pickle or dill
            _log_hash_failure(f"Parameter '{key}'={transform_args[key]} of the transform {transform}")
            return generate_random_fingerprint()
    return hasher.hexdigest()
def format_transform_for_fingerprint(func: Callable, version: Optional[str] = None) -> str:
    """
    Build the transform identifier used to update the fingerprint.

    The identifier is `<module>.<qualname>` of `func`, followed by `@<version>` when a version is given.
    """
    version_suffix = "" if version is None else f"@{version}"
    return f"{func.__module__}.{func.__qualname__}{version_suffix}"
def fingerprint_transform(
    inplace: bool,
    use_kwargs: Optional[list[str]] = None,
    ignore_kwargs: Optional[list[str]] = None,
    fingerprint_names: Optional[list[str]] = None,
    randomized_function: bool = False,
    version: Optional[str] = None,
):
    """
    Wrapper for dataset transforms to update the dataset fingerprint using ``update_fingerprint``

    Args:
        inplace (:obj:`bool`): If inplace is True, the fingerprint of the dataset is updated inplace.
            Otherwise, a parameter "new_fingerprint" is passed to the wrapped method that should take care of
            setting the fingerprint of the returned Dataset.
        use_kwargs (:obj:`List[str]`, optional): optional white list of argument names to take into account
            to update the fingerprint. By default all the arguments are used.
        ignore_kwargs (:obj:`List[str]`, optional): optional black list of argument names to leave out
            when updating the fingerprint. Note that ignore_kwargs prevails on use_kwargs.
        fingerprint_names (:obj:`List[str]`, optional, defaults to ["new_fingerprint"]):
            If the dataset transforms is not inplace and returns a DatasetDict, then it can require
            several fingerprints (one per dataset in the DatasetDict). By specifying fingerprint_names,
            one fingerprint named after each element of fingerprint_names is going to be passed.
        randomized_function (:obj:`bool`, defaults to False): If the dataset transform is random and has
            optional parameters "seed" and "generator", then you can set randomized_function to True.
            This way, even if users set "seed" and "generator" to None, then the fingerprint is
            going to be randomly generated depending on numpy's current state.
        version (:obj:`str`, optional): version of the transform. The version is taken into account when
            computing the fingerprint. If a dataset transform changes (or at least if the output data
            that are cached changes), then one should increase the version. If the version stays the
            same, then old cached data could be reused that are not compatible with the new transform.
            It should be in the format "MAJOR.MINOR.PATCH".

    Returns:
        A decorator to apply to the transform. The first argument of the decorated function is
        assumed to be the dataset.

    Raises:
        ValueError: If `use_kwargs` or `ignore_kwargs` is not a list, if `fingerprint_names` is given
            together with `inplace=True`, or (when decorating) if the function's signature lacks the
            fingerprint parameters or the "seed"/"generator" parameters it is required to have.
    """

    if use_kwargs is not None and not isinstance(use_kwargs, list):
        raise ValueError(f"use_kwargs is supposed to be a list, not {type(use_kwargs)}")

    if ignore_kwargs is not None and not isinstance(ignore_kwargs, list):
        # Report the type of the offending argument (this used to report the type of use_kwargs).
        raise ValueError(f"ignore_kwargs is supposed to be a list, not {type(ignore_kwargs)}")

    if inplace and fingerprint_names:
        raise ValueError("fingerprint_names are only used when inplace is False")

    fingerprint_names = fingerprint_names if fingerprint_names is not None else ["new_fingerprint"]

    def _fingerprint(func):
        if not inplace and not all(name in func.__code__.co_varnames for name in fingerprint_names):
            raise ValueError(f"function {func} is missing parameters {fingerprint_names} in signature")

        if randomized_function:  # randomized function have seed and generator parameters
            if "seed" not in func.__code__.co_varnames:
                raise ValueError(f"'seed' must be in {func}'s signature")
            if "generator" not in func.__code__.co_varnames:
                raise ValueError(f"'generator' must be in {func}'s signature")
        # this call has to be outside the wrapper since __qualname__ changes in multiprocessing
        transform = format_transform_for_fingerprint(func, version=version)

        @wraps(func)
        def wrapper(*args, **kwargs):
            kwargs_for_fingerprint = format_kwargs_for_fingerprint(
                func,
                args,
                kwargs,
                use_kwargs=use_kwargs,
                ignore_kwargs=ignore_kwargs,
                randomized_function=randomized_function,
            )

            # The dataset is the first positional argument, or the kwarg named after the first parameter
            if args:
                dataset: Dataset = args[0]
                args = args[1:]
            else:
                dataset: Dataset = kwargs.pop(next(iter(inspect.signature(func).parameters)))

            # compute new_fingerprint and add it to the args of not in-place transforms
            if inplace:
                new_fingerprint = update_fingerprint(dataset._fingerprint, transform, kwargs_for_fingerprint)
            else:
                for fingerprint_name in fingerprint_names:  # transforms like `train_test_split` have several hashes
                    if kwargs.get(fingerprint_name) is None:
                        # Hash the name too, so that each requested fingerprint is distinct
                        kwargs_for_fingerprint["fingerprint_name"] = fingerprint_name
                        kwargs[fingerprint_name] = update_fingerprint(
                            dataset._fingerprint, transform, kwargs_for_fingerprint
                        )
                    else:
                        # A fingerprint provided by the caller is used as is, after validation
                        validate_fingerprint(kwargs[fingerprint_name])

            # Call actual function

            out = func(dataset, *args, **kwargs)

            # Update fingerprint of in-place transforms + update in-place history of transforms

            if inplace:  # update after calling func so that the fingerprint doesn't change if the function fails
                dataset._fingerprint = new_fingerprint

            return out

        wrapper._decorator_name_ = "fingerprint"
        return wrapper

    return _fingerprint
+from datasets.utils.metadata import MetadataConfigs + + +def delete_from_hub( + repo_id: str, + config_name: str, + revision: Optional[str] = None, + token: Optional[Union[bool, str]] = None, +) -> CommitInfo: + """Delete a dataset configuration from a [data-only dataset](repository_structure) on the Hub. + + Args: + repo_id (`str`): ID of the Hub dataset repository, in the following format: `/` or + `/`. + config_name (`str`): Name of the dataset configuration. + revision (`str`, *optional*): Branch to delete the configuration from. Defaults to the `"main"` branch. + token (`bool` or `str`, *optional*): Authentication token for the Hugging Face Hub. + + Returns: + `huggingface_hub.CommitInfo` + """ + operations = [] + # data_files + fs = HfFileSystem(endpoint=datasets.config.HF_ENDPOINT, token=token) + builder = load_dataset_builder(repo_id, config_name, revision=revision, token=token) + for data_file in chain(*builder.config.data_files.values()): + data_file_resolved_path = fs.resolve_path(data_file) + if data_file_resolved_path.repo_id == repo_id: + operations.append(CommitOperationDelete(path_in_repo=data_file_resolved_path.path_in_repo)) + # README.md + dataset_card = DatasetCard.load(repo_id) + # config_names + if dataset_card.data.get("config_names", None) and config_name in dataset_card.data["config_names"]: + dataset_card.data["config_names"].remove(config_name) + # metadata_configs + metadata_configs = MetadataConfigs.from_dataset_card_data(dataset_card.data) + if metadata_configs: + _ = metadata_configs.pop(config_name, None) + dataset_card_data = DatasetCardData() + metadata_configs.to_dataset_card_data(dataset_card_data) + if datasets.config.METADATA_CONFIGS_FIELD in dataset_card_data: + dataset_card.data[datasets.config.METADATA_CONFIGS_FIELD] = dataset_card_data[ + datasets.config.METADATA_CONFIGS_FIELD + ] + else: + _ = dataset_card.data.pop(datasets.config.METADATA_CONFIGS_FIELD, None) + # dataset_info + dataset_infos: DatasetInfosDict = 
def _delete_files(dataset_id, revision=None, token=None):
    """
    Delete the files of a dataset repository on the Hub, keeping `.gitattributes` and `README.md`.

    The legacy `dataset_infos.json` (if present) is deleted first with its own commit message,
    then every remaining file is deleted with one `delete_file` call per file.

    Args:
        dataset_id: ID of the dataset repository.
        revision (optional): Revision to list the files from and to delete them on.
        token (optional): Token passed to `HfApi`.
    """
    hf_api = HfApi(endpoint=datasets.config.HF_ENDPOINT, token=token)
    # List the files at the same revision the deletions target; otherwise the listing
    # comes from the default branch and may not match what exists on `revision`.
    repo_files = hf_api.list_repo_files(
        dataset_id,
        revision=revision,
        repo_type="dataset",
    )
    legacy_infos_filename = "dataset_infos.json"
    kept_filenames = {".gitattributes", "README.md"}
    if legacy_infos_filename in repo_files:
        hf_api.delete_file(
            legacy_infos_filename,
            dataset_id,
            repo_type="dataset",
            revision=revision,
            commit_message="Delete legacy dataset_infos.json",
        )
    for filename in repo_files:
        if filename in kept_filenames or filename == legacy_infos_filename:
            continue
        hf_api.delete_file(
            filename,
            dataset_id,
            repo_type="dataset",
            revision=revision,
            commit_message="Delete data file",
        )
b/datasets/info.py @@ -0,0 +1,430 @@ +# Copyright 2020 The HuggingFace Datasets Authors and the TensorFlow Datasets Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Lint as: python3 +"""DatasetInfo records information we know about a dataset. + +This includes things that we know about the dataset statically, i.e.: + - description + - canonical location + - does it have validation and test splits + - size + - etc. + +This also includes the things that can and should be computed once we've +processed the dataset as well: + - number of examples (in each split) + - etc. +""" + +import copy +import dataclasses +import json +import os +import posixpath +from dataclasses import dataclass +from pathlib import Path +from typing import ClassVar, Optional, Union + +import fsspec +from fsspec.core import url_to_fs +from huggingface_hub import DatasetCard, DatasetCardData + +from . 
@dataclass
class PostProcessedInfo:
    """Features and resource checksums attached to a post-processed dataset."""

    features: Optional[Features] = None
    resources_checksums: Optional[dict] = None

    def __post_init__(self):
        # When reloading from a plain dict, `features` arrives un-typed: rebuild it.
        raw_features = self.features
        if raw_features is None or isinstance(raw_features, Features):
            return
        self.features = Features.from_dict(raw_features)

    @classmethod
    def from_dict(cls, post_processed_info_dict: dict) -> "PostProcessedInfo":
        """Build an instance from `post_processed_info_dict`, ignoring keys that are not dataclass fields."""
        known_fields = frozenset(field.name for field in dataclasses.fields(cls))
        init_kwargs = {}
        for key, value in post_processed_info_dict.items():
            if key in known_fields:
                init_kwargs[key] = value
        return cls(**init_kwargs)
+ post_processed (`PostProcessedInfo`, *optional*): + Information regarding the resources of a possible post-processing of a dataset. For example, it can contain the information of an index. + supervised_keys (`SupervisedKeysData`, *optional*): + Specifies the input feature and the label for supervised learning if applicable for the dataset (legacy from TFDS). + builder_name (`str`, *optional*): + The name of the `GeneratorBasedBuilder` subclass used to create the dataset. It is also the snake_case version of the dataset builder class name. + config_name (`str`, *optional*): + The name of the configuration derived from [`BuilderConfig`]. + version (`str` or [`Version`], *optional*): + The version of the dataset. + splits (`dict`, *optional*): + The mapping between split name and metadata. + download_checksums (`dict`, *optional*): + The mapping between the URL to download the dataset's checksums and corresponding metadata. + download_size (`int`, *optional*): + The size of the files to download to generate the dataset, in bytes. + post_processing_size (`int`, *optional*): + Size of the dataset in bytes after post-processing, if any. + dataset_size (`int`, *optional*): + The combined size in bytes of the Arrow tables for all splits. + size_in_bytes (`int`, *optional*): + The combined size in bytes of all files associated with the dataset (downloaded files + Arrow files). + **config_kwargs (additional keyword arguments): + Keyword arguments to be passed to the [`BuilderConfig`] and used in the [`DatasetBuilder`]. 
+ """ + + # Set in the dataset builders + description: str = dataclasses.field(default_factory=str) + citation: str = dataclasses.field(default_factory=str) + homepage: str = dataclasses.field(default_factory=str) + license: str = dataclasses.field(default_factory=str) + features: Optional[Features] = None + post_processed: Optional[PostProcessedInfo] = None + supervised_keys: Optional[SupervisedKeysData] = None + + # Set later by the builder + builder_name: Optional[str] = None + dataset_name: Optional[str] = None # for packaged builders, to be different from builder_name + config_name: Optional[str] = None + version: Optional[Union[str, Version]] = None + # Set later by `download_and_prepare` + splits: Optional[dict] = None + download_checksums: Optional[dict] = None + download_size: Optional[int] = None + post_processing_size: Optional[int] = None + dataset_size: Optional[int] = None + size_in_bytes: Optional[int] = None + + _INCLUDED_INFO_IN_YAML: ClassVar[list[str]] = [ + "config_name", + "download_size", + "dataset_size", + "features", + "splits", + ] + + def __post_init__(self): + # Convert back to the correct classes when we reload from dict + if self.features is not None and not isinstance(self.features, Features): + self.features = Features.from_dict(self.features) + if self.post_processed is not None and not isinstance(self.post_processed, PostProcessedInfo): + self.post_processed = PostProcessedInfo.from_dict(self.post_processed) + if self.version is not None and not isinstance(self.version, Version): + if isinstance(self.version, str): + self.version = Version(self.version) + else: + self.version = Version.from_dict(self.version) + if self.splits is not None and not isinstance(self.splits, SplitDict): + self.splits = SplitDict.from_split_dict(self.splits) + if self.supervised_keys is not None and not isinstance(self.supervised_keys, SupervisedKeysData): + if isinstance(self.supervised_keys, (tuple, list)): + self.supervised_keys = 
SupervisedKeysData(*self.supervised_keys) + else: + self.supervised_keys = SupervisedKeysData(**self.supervised_keys) + + def write_to_directory(self, dataset_info_dir, pretty_print=False, storage_options: Optional[dict] = None): + """Write `DatasetInfo` and license (if present) as JSON files to `dataset_info_dir`. + + Args: + dataset_info_dir (`str`): + Destination directory. + pretty_print (`bool`, defaults to `False`): + If `True`, the JSON will be pretty-printed with the indent level of 4. + storage_options (`dict`, *optional*): + Key/value pairs to be passed on to the file-system backend, if any. + + + + Example: + + ```py + >>> from datasets import load_dataset + >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", split="validation") + >>> ds.info.write_to_directory("/path/to/directory/") + ``` + """ + fs: fsspec.AbstractFileSystem + fs, *_ = url_to_fs(dataset_info_dir, **(storage_options or {})) + with fs.open(posixpath.join(dataset_info_dir, config.DATASET_INFO_FILENAME), "wb") as f: + self._dump_info(f, pretty_print=pretty_print) + if self.license: + with fs.open(posixpath.join(dataset_info_dir, config.LICENSE_FILENAME), "wb") as f: + self._dump_license(f) + + def _dump_info(self, file, pretty_print=False): + """Dump info in `file` file-like object open in bytes mode (to support remote files)""" + file.write(json.dumps(asdict(self), indent=4 if pretty_print else None).encode("utf-8")) + + def _dump_license(self, file): + """Dump license in `file` file-like object open in bytes mode (to support remote files)""" + file.write(self.license.encode("utf-8")) + + @classmethod + def from_merge(cls, dataset_infos: list["DatasetInfo"]): + dataset_infos = [dset_info.copy() for dset_info in dataset_infos if dset_info is not None] + + if len(dataset_infos) > 0 and all(dataset_infos[0] == dset_info for dset_info in dataset_infos): + # if all dataset_infos are equal we don't need to merge. Just return the first. 
+ return dataset_infos[0] + + description = "\n\n".join(unique_values(info.description for info in dataset_infos)).strip() + citation = "\n\n".join(unique_values(info.citation for info in dataset_infos)).strip() + homepage = "\n\n".join(unique_values(info.homepage for info in dataset_infos)).strip() + license = "\n\n".join(unique_values(info.license for info in dataset_infos)).strip() + features = None + supervised_keys = None + + return cls( + description=description, + citation=citation, + homepage=homepage, + license=license, + features=features, + supervised_keys=supervised_keys, + ) + + @classmethod + def from_directory(cls, dataset_info_dir: str, storage_options: Optional[dict] = None) -> "DatasetInfo": + """Create [`DatasetInfo`] from the JSON file in `dataset_info_dir`. + + This function updates all the dynamically generated fields (num_examples, + hash, time of creation,...) of the [`DatasetInfo`]. + + This will overwrite all previous metadata. + + Args: + dataset_info_dir (`str`): + The directory containing the metadata file. This + should be the root directory of a specific dataset version. + storage_options (`dict`, *optional*): + Key/value pairs to be passed on to the file-system backend, if any. 
+ + + + Example: + + ```py + >>> from datasets import DatasetInfo + >>> ds_info = DatasetInfo.from_directory("/path/to/directory/") + ``` + """ + fs: fsspec.AbstractFileSystem + fs, *_ = url_to_fs(dataset_info_dir, **(storage_options or {})) + logger.debug(f"Loading Dataset info from {dataset_info_dir}") + if not dataset_info_dir: + raise ValueError("Calling DatasetInfo.from_directory() with undefined dataset_info_dir.") + with fs.open(posixpath.join(dataset_info_dir, config.DATASET_INFO_FILENAME), "r", encoding="utf-8") as f: + dataset_info_dict = json.load(f) + return cls.from_dict(dataset_info_dict) + + @classmethod + def from_dict(cls, dataset_info_dict: dict) -> "DatasetInfo": + field_names = {f.name for f in dataclasses.fields(cls)} + return cls(**{k: v for k, v in dataset_info_dict.items() if k in field_names}) + + def update(self, other_dataset_info: "DatasetInfo", ignore_none=True): + self_dict = self.__dict__ + self_dict.update( + **{ + k: copy.deepcopy(v) + for k, v in other_dataset_info.__dict__.items() + if (v is not None or not ignore_none) + } + ) + + def copy(self) -> "DatasetInfo": + return self.__class__(**{k: copy.deepcopy(v) for k, v in self.__dict__.items()}) + + def _to_yaml_dict(self) -> dict: + yaml_dict = {} + dataset_info_dict = asdict(self) + for key in dataset_info_dict: + if key in self._INCLUDED_INFO_IN_YAML: + value = getattr(self, key) + if hasattr(value, "_to_yaml_list"): # Features, SplitDict + yaml_dict[key] = value._to_yaml_list() + elif hasattr(value, "_to_yaml_string"): # Version + yaml_dict[key] = value._to_yaml_string() + else: + yaml_dict[key] = value + return yaml_dict + + @classmethod + def _from_yaml_dict(cls, yaml_data: dict) -> "DatasetInfo": + yaml_data = copy.deepcopy(yaml_data) + if yaml_data.get("features") is not None: + yaml_data["features"] = Features._from_yaml_list(yaml_data["features"]) + if yaml_data.get("splits") is not None: + yaml_data["splits"] = SplitDict._from_yaml_list(yaml_data["splits"]) + 
class DatasetInfosDict(dict[str, DatasetInfo]):
    """Mapping from config name to [`DatasetInfo`].

    It can be stored in, and loaded from, the YAML header of a dataset card
    (`config.REPOCARD_FILENAME`) and the legacy `config.DATASETDICT_INFOS_FILENAME` JSON file.
    """

    def write_to_directory(self, dataset_infos_dir, overwrite=False, pretty_print=False) -> None:
        """Write the infos to the dataset card in `dataset_infos_dir`, and to the legacy JSON file if it exists.

        Args:
            dataset_infos_dir: Local directory holding the dataset card / legacy JSON file.
            overwrite: If `False`, the infos already stored in the directory are loaded
                first and updated with `self`. If `True`, only `self` is used as the
                set of infos to dump.
            pretty_print: If `True`, the legacy JSON is written with an indent of 4.
        """
        dataset_infos_path = os.path.join(dataset_infos_dir, config.DATASETDICT_INFOS_FILENAME)
        dataset_readme_path = os.path.join(dataset_infos_dir, config.REPOCARD_FILENAME)
        # Always start from a DatasetInfosDict (never a plain dict): `to_dataset_card_data`
        # is called on this object further down, including when `overwrite=True`.
        total_dataset_infos = type(self)() if overwrite else self.from_directory(dataset_infos_dir)
        total_dataset_infos.update(self)
        if os.path.exists(dataset_infos_path):
            # for backward compatibility, let's update the JSON file if it exists
            # Build the payload before opening in "w" mode, so that a failure while
            # converting the infos does not leave an emptied file behind.
            dataset_infos_dict = {
                config_name: asdict(dset_info) for config_name, dset_info in total_dataset_infos.items()
            }
            with open(dataset_infos_path, "w", encoding="utf-8") as f:
                json.dump(dataset_infos_dict, f, indent=4 if pretty_print else None)
        # Dump the infos in the YAML part of the README.md file
        if os.path.exists(dataset_readme_path):
            dataset_card = DatasetCard.load(dataset_readme_path)
            dataset_card_data = dataset_card.data
        else:
            dataset_card = None
            dataset_card_data = DatasetCardData()
        if total_dataset_infos:
            total_dataset_infos.to_dataset_card_data(dataset_card_data)
        dataset_card = (
            DatasetCard("---\n" + str(dataset_card_data) + "\n---\n") if dataset_card is None else dataset_card
        )
        dataset_card.save(Path(dataset_readme_path))

    @classmethod
    def from_directory(cls, dataset_infos_dir) -> "DatasetInfosDict":
        """Load the infos stored in `dataset_infos_dir`.

        The `dataset_info` entry of the dataset card is used when present; otherwise
        the legacy JSON file is read; otherwise an empty mapping is returned.
        """
        logger.debug(f"Loading Dataset Infos from {dataset_infos_dir}")
        # Load the info from the YAML part of README.md
        if os.path.exists(os.path.join(dataset_infos_dir, config.REPOCARD_FILENAME)):
            dataset_card_data = DatasetCard.load(Path(dataset_infos_dir) / config.REPOCARD_FILENAME).data
            if "dataset_info" in dataset_card_data:
                return cls.from_dataset_card_data(dataset_card_data)
        if os.path.exists(os.path.join(dataset_infos_dir, config.DATASETDICT_INFOS_FILENAME)):
            # this is just to have backward compatibility with dataset_infos.json files
            with open(os.path.join(dataset_infos_dir, config.DATASETDICT_INFOS_FILENAME), encoding="utf-8") as f:
                return cls(
                    {
                        config_name: DatasetInfo.from_dict(dataset_info_dict)
                        for config_name, dataset_info_dict in json.load(f).items()
                    }
                )
        else:
            return cls()

    @classmethod
    def from_dataset_card_data(cls, dataset_card_data: DatasetCardData) -> "DatasetInfosDict":
        """Build the mapping from the `dataset_info` entry of `dataset_card_data`.

        The entry may be a list (one item per config) or a single dict. A missing
        `config_name` is treated as `"default"`. Any other type yields an empty mapping.
        """
        if isinstance(dataset_card_data.get("dataset_info"), (list, dict)):
            if isinstance(dataset_card_data["dataset_info"], list):
                return cls(
                    {
                        dataset_info_yaml_dict.get("config_name", "default"): DatasetInfo._from_yaml_dict(
                            dataset_info_yaml_dict
                        )
                        for dataset_info_yaml_dict in dataset_card_data["dataset_info"]
                    }
                )
            else:
                dataset_info = DatasetInfo._from_yaml_dict(dataset_card_data["dataset_info"])
                dataset_info.config_name = dataset_card_data["dataset_info"].get("config_name", "default")
                return cls({dataset_info.config_name: dataset_info})
        else:
            return cls()

    def to_dataset_card_data(self, dataset_card_data: DatasetCardData) -> None:
        """Merge these infos into `dataset_card_data["dataset_info"]`, in place.

        Infos already present in the card are kept unless `self` holds an entry with
        the same config name. A single resulting config is stored as a dict (with
        `config_name` omitted when it is `"default"`); several configs are stored as
        a list sorted by config name. Nothing is written when `self` is empty.
        """
        if self:
            # first get existing metadata info
            if "dataset_info" in dataset_card_data and isinstance(dataset_card_data["dataset_info"], dict):
                dataset_metadata_infos = {
                    dataset_card_data["dataset_info"].get("config_name", "default"): dataset_card_data["dataset_info"]
                }
            elif "dataset_info" in dataset_card_data and isinstance(dataset_card_data["dataset_info"], list):
                # Entries without a `config_name` are read as "default", like in
                # `from_dataset_card_data`, instead of failing with a KeyError.
                dataset_metadata_infos = {
                    config_metadata.get("config_name", "default"): config_metadata
                    for config_metadata in dataset_card_data["dataset_info"]
                }
            else:
                dataset_metadata_infos = {}
            # update/rewrite existing metadata info with the one to dump
            total_dataset_infos = {
                **dataset_metadata_infos,
                **{config_name: dset_info._to_yaml_dict() for config_name, dset_info in self.items()},
            }
            # the config_name from the dataset_infos_dict takes over the config_name of the DatasetInfo
            for config_name, dset_info_yaml_dict in total_dataset_infos.items():
                dset_info_yaml_dict["config_name"] = config_name
            if len(total_dataset_infos) == 1:
                # use a struct instead of a list of configurations, since there's only one
                dataset_card_data["dataset_info"] = next(iter(total_dataset_infos.values()))
                config_name = dataset_card_data["dataset_info"].pop("config_name", None)
                if config_name != "default":
                    # if config_name is not "default" preserve it and put at the first position
                    dataset_card_data["dataset_info"] = {
                        "config_name": config_name,
                        **dataset_card_data["dataset_info"],
                    }
            else:
                dataset_card_data["dataset_info"] = []
                for config_name, dataset_info_yaml_dict in sorted(total_dataset_infos.items()):
                    # add the config_name field in first position
                    dataset_info_yaml_dict.pop("config_name", None)
                    dataset_info_yaml_dict = {"config_name": config_name, **dataset_info_yaml_dict}
                    dataset_card_data["dataset_info"].append(dataset_info_yaml_dict)
def get_dataset_infos(
    path: str,
    data_files: Optional[Union[dict, list, str]] = None,
    download_config: Optional[DownloadConfig] = None,
    download_mode: Optional[Union[DownloadMode, str]] = None,
    revision: Optional[Union[str, Version]] = None,
    token: Optional[Union[bool, str]] = None,
    **config_kwargs,
):
    """Get the meta information about a dataset, returned as a dict mapping config name to [`DatasetInfo`].

    The config names come from `get_dataset_config_names`; each info comes from
    `get_dataset_config_info`. `**config_kwargs` is only forwarded to the latter.

    Args:
        path (`str`): path to the dataset repository. Can be either:

            - a local path to the dataset directory containing the data files,
              e.g. `'./dataset/squad'`
            - a dataset identifier on the Hugging Face Hub (list all available datasets and ids with [`huggingface_hub.list_datasets`]),
              e.g. `'rajpurkar/squad'`, `'nyu-mll/glue'` or `'openai/webtext'`
        revision (`Union[str, datasets.Version]`, *optional*):
            If specified, the dataset module will be loaded from the datasets repository at this version.
        download_config ([`DownloadConfig`], *optional*):
            Specific download configuration parameters.
        download_mode ([`DownloadMode`] or `str`, defaults to `REUSE_DATASET_IF_EXISTS`):
            Download/generate mode.
        data_files (`Union[Dict, List, str]`, *optional*):
            Defining the data_files of the dataset configuration.
        token (`str` or `bool`, *optional*):
            Optional string or boolean to use as Bearer token for remote files on the Datasets Hub.
            If `True`, or not specified, will get token from `"~/.huggingface"`.
        **config_kwargs (additional keyword arguments):
            Optional attributes for builder class which will override the attributes if supplied.

    Example:

    ```py
    >>> from datasets import get_dataset_infos
    >>> get_dataset_infos('cornell-movie-review-data/rotten_tomatoes')
    {'default': DatasetInfo(description="Movie Review Dataset.\nThis is a dataset of containing 5,331 positive and 5,331 negative processed\nsentences from Rotten Tomatoes movie reviews...), ...}
    ```
    """
    # Arguments shared by the config-name lookup and by every per-config lookup.
    shared_kwargs = {
        "path": path,
        "data_files": data_files,
        "download_config": download_config,
        "download_mode": download_mode,
        "revision": revision,
        "token": token,
    }
    infos_by_config = {}
    for name in get_dataset_config_names(**shared_kwargs):
        infos_by_config[name] = get_dataset_config_info(config_name=name, **shared_kwargs, **config_kwargs)
    return infos_by_config
`'./dataset/squad'` + - a dataset identifier on the Hugging Face Hub (list all available datasets and ids with [`huggingface_hub.list_datasets`]), + e.g. `'rajpurkar/squad'`, `'nyu-mll/glue'` or``'openai/webtext'` + revision (`Union[str, datasets.Version]`, *optional*): + If specified, the dataset module will be loaded from the datasets repository at this version. + By default: + - it is set to the local version of the lib. + - it will also try to load it from the main branch if it's not available at the local version of the lib. + Specifying a version that is different from your local version of the lib might cause compatibility issues. + download_config ([`DownloadConfig`], *optional*): + Specific download configuration parameters. + download_mode ([`DownloadMode`] or `str`, defaults to `REUSE_DATASET_IF_EXISTS`): + Download/generate mode. + data_files (`Union[Dict, List, str]`, *optional*): + Defining the data_files of the dataset configuration. + **download_kwargs (additional keyword arguments): + Optional attributes for [`DownloadConfig`] which will override the attributes in `download_config` if supplied, + for example `token`. 
def get_dataset_default_config_name(
    path: str,
    revision: Optional[Union[str, Version]] = None,
    download_config: Optional[DownloadConfig] = None,
    download_mode: Optional[Union[DownloadMode, str]] = None,
    data_files: Optional[Union[dict, list, str]] = None,
    **download_kwargs,
) -> Optional[str]:
    """Get the default config name for a particular dataset.

    The builder class' `DEFAULT_CONFIG_NAME` wins when it is set. Otherwise the
    result is the only config name if there is exactly one, `"default"` if the
    builder declares no config, and `None` if it declares several.

    Args:
        path (`str`): path to the dataset repository. Can be either:

            - a local path to the dataset directory containing the data files,
              e.g. `'./dataset/squad'`
            - a dataset identifier on the Hugging Face Hub (list all available datasets and ids with [`huggingface_hub.list_datasets`]),
              e.g. `'rajpurkar/squad'`, `'nyu-mll/glue'` or `'openai/webtext'`
        revision (`Union[str, datasets.Version]`, *optional*):
            If specified, the dataset module will be loaded from the datasets repository at this version.
        download_config ([`DownloadConfig`], *optional*):
            Specific download configuration parameters.
        download_mode ([`DownloadMode`] or `str`, defaults to `REUSE_DATASET_IF_EXISTS`):
            Download/generate mode.
        data_files (`Union[Dict, List, str]`, *optional*):
            Defining the data_files of the dataset configuration.
        **download_kwargs (additional keyword arguments):
            Extra keyword arguments forwarded to `dataset_module_factory`, for example `token`.

    Returns:
        Optional[str]: the default config name if there is one

    Example:

    ```py
    >>> from datasets import get_dataset_default_config_name
    >>> get_dataset_default_config_name("openbookqa")
    'main'
    ```
    """
    module = dataset_module_factory(
        path,
        revision=revision,
        download_config=download_config,
        download_mode=download_mode,
        data_files=data_files,
        **download_kwargs,
    )
    builder_cls = get_dataset_builder_class(module, dataset_name=os.path.basename(path))
    declared_names = list(builder_cls.builder_configs.keys())
    if not declared_names:
        fallback_name = "default"
    elif len(declared_names) == 1:
        fallback_name = declared_names[0]
    else:
        # Several configs and nothing to pick one of them.
        fallback_name = None
    return builder_cls.DEFAULT_CONFIG_NAME or fallback_name
`'./dataset/squad'` + - a dataset identifier on the Hugging Face Hub (list all available datasets and ids with [`huggingface_hub.list_datasets`]), + e.g. `'rajpurkar/squad'`, `'nyu-mll/glue'` or``'openai/webtext'` + config_name (:obj:`str`, optional): Defining the name of the dataset configuration. + data_files (:obj:`str` or :obj:`Sequence` or :obj:`Mapping`, optional): Path(s) to source data file(s). + download_config (:class:`~download.DownloadConfig`, optional): Specific download configuration parameters. + download_mode (:class:`DownloadMode` or :obj:`str`, default ``REUSE_DATASET_IF_EXISTS``): Download/generate mode. + revision (:class:`~utils.Version` or :obj:`str`, optional): Version of the dataset to load. + As datasets have their own git repository on the Datasets Hub, the default version "main" corresponds to their "main" branch. + You can specify a different version than the default "main" by using a commit SHA or a git tag of the dataset repository. + token (``str`` or :obj:`bool`, optional): Optional string or boolean to use as Bearer token for remote files on the Datasets Hub. + If True, or not specified, will get token from `"~/.huggingface"`. + **config_kwargs (additional keyword arguments): optional attributes for builder class which will override the attributes if supplied. 
def get_dataset_split_names(
    path: str,
    config_name: Optional[str] = None,
    data_files: Optional[Union[str, Sequence[str], Mapping[str, Union[str, Sequence[str]]]]] = None,
    download_config: Optional[DownloadConfig] = None,
    download_mode: Optional[Union[DownloadMode, str]] = None,
    revision: Optional[Union[str, Version]] = None,
    token: Optional[Union[bool, str]] = None,
    **config_kwargs,
):
    """Get the list of available splits for a particular config and dataset.

    All arguments are forwarded to `get_dataset_config_info`; the result is the
    list of keys of the returned info's `splits`.

    Args:
        path (`str`): path to the dataset repository. Can be either:

            - a local path to the dataset directory containing the data files,
              e.g. `'./dataset/squad'`
            - a dataset identifier on the Hugging Face Hub (list all available datasets and ids with [`huggingface_hub.list_datasets`]),
              e.g. `'rajpurkar/squad'`, `'nyu-mll/glue'` or `'openai/webtext'`
        config_name (`str`, *optional*):
            Defining the name of the dataset configuration.
        data_files (`str` or `Sequence` or `Mapping`, *optional*):
            Path(s) to source data file(s).
        download_config ([`DownloadConfig`], *optional*):
            Specific download configuration parameters.
        download_mode ([`DownloadMode`] or `str`, defaults to `REUSE_DATASET_IF_EXISTS`):
            Download/generate mode.
        revision ([`Version`] or `str`, *optional*):
            Version of the dataset to load.
            As datasets have their own git repository on the Datasets Hub, the default version "main" corresponds to their "main" branch.
            You can specify a different version than the default "main" by using a commit SHA or a git tag of the dataset repository.
        token (`str` or `bool`, *optional*):
            Optional string or boolean to use as Bearer token for remote files on the Datasets Hub.
            If `True`, or not specified, will get token from `"~/.huggingface"`.
        **config_kwargs (additional keyword arguments):
            Optional attributes for builder class which will override the attributes if supplied.

    Example:

    ```py
    >>> from datasets import get_dataset_split_names
    >>> get_dataset_split_names('cornell-movie-review-data/rotten_tomatoes')
    ['train', 'validation', 'test']
    ```
    """
    config_info = get_dataset_config_info(
        path,
        config_name=config_name,
        data_files=data_files,
        download_config=download_config,
        download_mode=download_mode,
        revision=revision,
        token=token,
        **config_kwargs,
    )
    split_names = [*config_info.splits.keys()]
    return split_names
import cycle, islice +from pathlib import Path +from typing import TYPE_CHECKING, Any, BinaryIO, Callable, Optional, Union + +import fsspec.asyn +import numpy as np +import pandas as pd +import pyarrow as pa +import pyarrow.parquet as pq +from huggingface_hub import ( + CommitInfo, + CommitOperationAdd, + CommitOperationDelete, + DatasetCard, + DatasetCardData, + HfApi, + HfFileSystem, +) +from huggingface_hub.hf_api import RepoFile +from huggingface_hub.utils import HfHubHTTPError, RepositoryNotFoundError +from multiprocess import Pool + +from . import config +from .arrow_dataset import PUSH_TO_HUB_WITHOUT_METADATA_CONFIGS_SPLIT_PATTERN_SHARDED, Dataset, DatasetInfoMixin +from .data_files import sanitize_patterns +from .features import Features +from .features.features import ( + FeatureType, + List, + Value, + _align_features, + _check_if_features_can_be_aligned, + _fix_for_backward_compatible_features, + _visit, + cast_to_python_objects, + require_decoding, +) +from .formatting import ( + ArrowFormatter, + PythonFormatter, + TableFormatter, + TensorFormatter, + get_format_type_from_alias, + get_formatter, +) +from .info import DatasetInfo, DatasetInfosDict +from .naming import _split_re +from .splits import NamedSplit, Split, SplitDict, SplitInfo +from .table import cast_table_to_features, embed_table_storage, read_schema_from_file, table_cast +from .utils import tqdm as hf_tqdm +from .utils.logging import get_logger +from .utils.metadata import MetadataConfigs +from .utils.py_utils import Literal, asdict, glob_pattern_to_regex, iflatmap_unordered, string_to_dict +from .utils.sharding import _merge_gen_kwargs, _number_of_shards_in_gen_kwargs, _shuffle_gen_kwargs, _split_gen_kwargs +from .utils.typing import PathLike + + +if TYPE_CHECKING: + import sqlite3 + + import polars as pl + import sqlalchemy + import torch + +logger = get_logger(__name__) + +Key = Union[int, str] + + +def identity_func(x): + return x + + +def _rename_columns_fn(example: dict, 
column_mapping: dict[str, str]): + if any(col not in example for col in column_mapping): + raise ValueError( + f"Error when renaming {list(column_mapping)} to {list(column_mapping.values())}: columns {set(column_mapping) - set(example)} are not in the dataset." + ) + if any(col in example for col in column_mapping.values()): + raise ValueError( + f"Error when renaming {list(column_mapping)} to {list(column_mapping.values())}: columns {set(example) - set(column_mapping.values())} are already in the dataset." + ) + return { + new_column_name: example[original_column_name] + for original_column_name, new_column_name in column_mapping.items() + } + + +def add_column_fn(example: dict, idx: int, name: str, column: list[dict]): + if name in example: + raise ValueError(f"Error when adding {name}: column {name} is already in the dataset.") + return {name: column[idx]} + + +def _infer_features_from_batch(batch: dict[str, list], try_features: Optional[Features] = None) -> Features: + pa_table = pa.Table.from_pydict(batch) + if try_features is not None: + try: + pa_table = table_cast(pa_table, pa.schema(try_features.type)) + except (TypeError, pa.ArrowInvalid, pa.ArrowNotImplementedError): + pass + return Features.from_arrow_schema(pa_table.schema) + + +def _examples_to_batch(examples: list[dict[str, Any]]) -> dict[str, list]: + # we order the columns by order of appearance + # to do so, we use a dict as an ordered set + cols = {col: None for example in examples for col in example} + # when an example is missing a column, we set the value to None with .get() + arrays = [[example.get(col) for example in examples] for col in cols] + return dict(zip(cols, arrays)) + + +def _batch_to_examples(batch: dict[str, list]) -> Iterator[dict[str, Any]]: + """Convert a batch (dict of examples) to examples list""" + n_examples = 0 if len(batch) == 0 else len(batch[next(iter(batch))]) + for i in range(n_examples): + yield {col: array[i] for col, array in batch.items()} + + +def 
_convert_to_arrow( + iterable: Iterable[tuple[Key, dict]], + batch_size: int, + drop_last_batch: bool = False, +) -> Iterator[tuple[Key, pa.Table]]: + """Convert and group examples in Arrow tables of size `batch_size`. + + Args: + iterable (`Iterable[Tuple[Key, dict]]`): + An examples iterable containing tuples (example_key, example) of type (int/str, dict) + batch_size (`Optional[int]`): + Size of each sub-table to yield. If None or <= 0, yields the full table. + drop_last_batch (`bool`, defaults to `False`): + Drop the last batch if it is smaller than `batch_size`. + """ + if batch_size is None or batch_size <= 0: + yield ( + "all", + pa.Table.from_pylist(cast_to_python_objects([example for _, example in iterable], only_1d_for_numpy=True)), + ) + return + iterator = iter(iterable) + for key, example in iterator: + iterator_batch = islice(iterator, batch_size - 1) + key_examples_list = [(key, example)] + list(iterator_batch) + if len(key_examples_list) < batch_size and drop_last_batch: + return + keys, examples = zip(*key_examples_list) + new_key = "_".join(str(key) for key in keys) + yield new_key, pa.Table.from_pylist(cast_to_python_objects(examples, only_1d_for_numpy=True)) + + +class _BaseExamplesIterable: + """Base class for the examples iterable used by an IterableDataset""" + + def __init__(self) -> None: + self._state_dict: Optional[Union[list, dict]] = None + + def __iter__(self) -> Iterator[tuple[Key, dict]]: + """An examples iterable should yield tuples (example_key, example) of type (int/str, dict)""" + raise NotImplementedError(f"{type(self)} doesn't implement __iter__ yet") + + @property + def iter_arrow(self) -> Optional[Callable[[], Iterator[tuple[Key, pa.Table]]]]: + return None + + @property + def is_typed(self) -> bool: + return False + + @property + def features(self) -> Optional[Features]: + return None + + def shuffle_data_sources(self, generator: np.random.Generator) -> "_BaseExamplesIterable": + """ + Either shuffle the shards/sources of 
the dataset, or propagate the shuffling to the underlying iterable. + If the order of the shards must stay fixed (when using .skip or .take for example), then this method returns self. + """ + raise NotImplementedError(f"{type(self)} doesn't implement shuffle_data_sources yet") + + def shard_data_sources(self, num_shards: int, index: int, contiguous=True) -> "_BaseExamplesIterable": + """Either keep only the requested shard, or propagate the request to the underlying iterable.""" + raise NotImplementedError(f"{type(self)} doesn't implement shard_data_sources yet") + + def split_shard_indices_by_worker(self, num_shards: int, index: int, contiguous=True) -> list[int]: + if contiguous: + div = self.num_shards // num_shards + mod = self.num_shards % num_shards + start = div * index + min(index, mod) + end = start + div + (1 if index < mod else 0) + return list(range(start, end)) + else: + return list(range(index, self.num_shards, num_shards)) + + @property + def num_shards(self) -> int: + raise NotImplementedError(f"{type(self)} doesn't implement num_shards yet") + + def _init_state_dict(self) -> dict: + raise NotImplementedError(f"{type(self)} doesn't implement _init_state_dict yet") + + def load_state_dict(self, state_dict: dict) -> dict: + def _inner_load_state_dict(state, new_state): + if new_state is not None and isinstance(state, dict): + for key in new_state: + state[key] = _inner_load_state_dict(state[key], new_state[key]) + return state + elif new_state is not None and isinstance(state, list): + for i in range(len(state)): + state[i] = _inner_load_state_dict(state[i], new_state[i]) + return state + return new_state + + return _inner_load_state_dict(self._state_dict, state_dict) + + def state_dict(self) -> dict: + if self._state_dict: + return copy.deepcopy(self._state_dict) + raise RuntimeError("State dict is not initialized, please call ex_iterable._init_state_dict() first.") + + +class ExamplesIterable(_BaseExamplesIterable): + def __init__(self, 
generate_examples_fn: Callable[..., tuple[Key, dict]], kwargs: dict): + super().__init__() + self.generate_examples_fn = generate_examples_fn + self.kwargs = kwargs + + def _init_state_dict(self) -> dict: + self._state_dict = {"shard_idx": 0, "shard_example_idx": 0, "type": self.__class__.__name__} + return self._state_dict + + def __iter__(self): + shard_idx_start = self._state_dict["shard_idx"] if self._state_dict else 0 + for gen_kwags in islice(_split_gen_kwargs(self.kwargs, max_num_jobs=self.num_shards), shard_idx_start, None): + shard_example_idx_start = self._state_dict["shard_example_idx"] if self._state_dict else 0 + for key_example in islice(self.generate_examples_fn(**gen_kwags), shard_example_idx_start, None): + if self._state_dict: + self._state_dict["shard_example_idx"] += 1 + yield key_example + if self._state_dict: + self._state_dict["shard_idx"] += 1 + self._state_dict["shard_example_idx"] = 0 + + def shuffle_data_sources(self, generator: np.random.Generator) -> "ExamplesIterable": + return ShuffledDataSourcesExamplesIterable(self.generate_examples_fn, self.kwargs, generator) + + def shard_data_sources(self, num_shards: int, index: int, contiguous=True) -> "ExamplesIterable": + """Keep only the requested shard.""" + gen_kwargs_list = _split_gen_kwargs(self.kwargs, max_num_jobs=self.num_shards) + shard_indices = self.split_shard_indices_by_worker(num_shards, index, contiguous=contiguous) + requested_gen_kwargs = _merge_gen_kwargs([gen_kwargs_list[i] for i in shard_indices]) + return ExamplesIterable(self.generate_examples_fn, requested_gen_kwargs) + + @property + def num_shards(self) -> int: + return _number_of_shards_in_gen_kwargs(self.kwargs) + + +class ShuffledDataSourcesExamplesIterable(ExamplesIterable): + def __init__( + self, generate_examples_fn: Callable[..., tuple[Key, dict]], kwargs: dict, generator: np.random.Generator + ): + super().__init__(generate_examples_fn, kwargs) + self.generator = deepcopy(generator) + + def 
_init_state_dict(self) -> dict: + self._state_dict = {"shard_idx": 0, "shard_example_idx": 0, "type": self.__class__.__name__} + return self._state_dict + + def __iter__(self): + """Shuffle the kwargs order to shuffle shards""" + rng = deepcopy(self.generator) + kwargs_with_shuffled_shards = _shuffle_gen_kwargs(rng, self.kwargs) + shard_idx_start = self._state_dict["shard_idx"] if self._state_dict else 0 + for gen_kwags in islice( + _split_gen_kwargs(kwargs_with_shuffled_shards, max_num_jobs=self.num_shards), shard_idx_start, None + ): + shard_example_idx_start = self._state_dict["shard_example_idx"] if self._state_dict else 0 + for key_example in islice(self.generate_examples_fn(**gen_kwags), shard_example_idx_start, None): + if self._state_dict: + self._state_dict["shard_example_idx"] += 1 + yield key_example + if self._state_dict: + self._state_dict["shard_idx"] += 1 + self._state_dict["shard_example_idx"] = 0 + + def shard_data_sources(self, num_shards: int, index: int, contiguous=True) -> "ExamplesIterable": + """Keep only the requested shard.""" + rng = deepcopy(self.generator) + kwargs_with_shuffled_shards = _shuffle_gen_kwargs(rng, self.kwargs) + return ExamplesIterable(self.generate_examples_fn, kwargs_with_shuffled_shards).shard_data_sources( + num_shards, index, contiguous=contiguous + ) + + +class ArrowExamplesIterable(_BaseExamplesIterable): + def __init__(self, generate_tables_fn: Callable[..., tuple[Key, pa.Table]], kwargs: dict): + super().__init__() + self.generate_tables_fn = generate_tables_fn + self.kwargs = kwargs + + @property + def iter_arrow(self): + return self._iter_arrow + + def _init_state_dict(self) -> dict: + self._state_dict = {"shard_idx": 0, "shard_example_idx": 0, "type": self.__class__.__name__} + return self._state_dict + + def __iter__(self): + formatter = PythonFormatter() + shard_idx_start = self._state_dict["shard_idx"] if self._state_dict else 0 + for gen_kwags in islice(_split_gen_kwargs(self.kwargs, 
max_num_jobs=self.num_shards), shard_idx_start, None): + shard_example_idx_start = self._state_dict["shard_example_idx"] if self._state_dict else 0 + shard_example_idx = 0 + for key, pa_table in self.generate_tables_fn(**gen_kwags): + if shard_example_idx + len(pa_table) <= shard_example_idx_start: + shard_example_idx += len(pa_table) + continue + for pa_subtable in pa_table.to_reader(max_chunksize=config.ARROW_READER_BATCH_SIZE_IN_DATASET_ITER): + formatted_batch = formatter.format_batch(pa_subtable) + for example in _batch_to_examples(formatted_batch): + if shard_example_idx >= shard_example_idx_start: + if self._state_dict: + self._state_dict["shard_example_idx"] += 1 + yield key, example + shard_example_idx += 1 + if self._state_dict: + self._state_dict["shard_idx"] += 1 + self._state_dict["shard_example_idx"] = 0 + + def _iter_arrow(self): + shard_idx_start = self._state_dict["shard_idx"] if self._state_dict else 0 + for gen_kwags in islice(_split_gen_kwargs(self.kwargs, max_num_jobs=self.num_shards), shard_idx_start, None): + shard_example_idx_start = self._state_dict["shard_example_idx"] if self._state_dict else 0 + shard_example_idx = 0 + for key, pa_table in self.generate_tables_fn(**gen_kwags): + shard_example_idx += len(pa_table) + if shard_example_idx <= shard_example_idx_start: + continue + if self._state_dict: + self._state_dict["shard_example_idx"] += len(pa_table) + yield key, pa_table + if self._state_dict: + self._state_dict["shard_idx"] += 1 + self._state_dict["shard_example_idx"] = 0 + + def shuffle_data_sources(self, generator: np.random.Generator) -> "ArrowExamplesIterable": + return ShuffledDataSourcesArrowExamplesIterable(self.generate_tables_fn, self.kwargs, generator) + + def shard_data_sources(self, num_shards: int, index: int, contiguous=True) -> "ArrowExamplesIterable": + """Keep only the requested shard.""" + gen_kwargs_list = _split_gen_kwargs(self.kwargs, max_num_jobs=self.num_shards) + shard_indices = 
self.split_shard_indices_by_worker(num_shards, index, contiguous=contiguous) + requested_gen_kwargs = _merge_gen_kwargs([gen_kwargs_list[i] for i in shard_indices]) + return ArrowExamplesIterable(self.generate_tables_fn, requested_gen_kwargs) + + @property + def num_shards(self) -> int: + return _number_of_shards_in_gen_kwargs(self.kwargs) + + +class ShuffledDataSourcesArrowExamplesIterable(ArrowExamplesIterable): + def __init__( + self, + generate_tables_fn: Callable[..., tuple[Key, pa.Table]], + kwargs: dict, + generator: np.random.Generator, + ): + super().__init__(generate_tables_fn, kwargs) + self.generator = deepcopy(generator) + + def _init_state_dict(self) -> dict: + self._state_dict = {"shard_idx": 0, "shard_example_idx": 0, "type": self.__class__.__name__} + return self._state_dict + + def __iter__(self): + """Shuffle the kwargs order to shuffle shards""" + rng = deepcopy(self.generator) + kwargs_with_shuffled_shards = _shuffle_gen_kwargs(rng, self.kwargs) + formatter = PythonFormatter() + shard_idx_start = self._state_dict["shard_idx"] if self._state_dict else 0 + for gen_kwags in islice( + _split_gen_kwargs(kwargs_with_shuffled_shards, max_num_jobs=self.num_shards), shard_idx_start, None + ): + shard_example_idx_start = self._state_dict["shard_example_idx"] if self._state_dict else 0 + shard_example_idx = 0 + for key, pa_table in self.generate_tables_fn(**gen_kwags): + if shard_example_idx + len(pa_table) <= shard_example_idx_start: + shard_example_idx += len(pa_table) + continue + for pa_subtable in pa_table.to_reader(max_chunksize=config.ARROW_READER_BATCH_SIZE_IN_DATASET_ITER): + formatted_batch = formatter.format_batch(pa_subtable) + for example in _batch_to_examples(formatted_batch): + if shard_example_idx >= shard_example_idx_start: + if self._state_dict: + self._state_dict["shard_example_idx"] += 1 + yield key, example + shard_example_idx += 1 + if self._state_dict: + self._state_dict["shard_idx"] += 1 + self._state_dict["shard_example_idx"] = 0 
+ + def _iter_arrow(self): + rng = deepcopy(self.generator) + kwargs_with_shuffled_shards = _shuffle_gen_kwargs(rng, self.kwargs) + shard_idx_start = self._state_dict["shard_idx"] if self._state_dict else 0 + for gen_kwags in islice( + _split_gen_kwargs(kwargs_with_shuffled_shards, max_num_jobs=self.num_shards), shard_idx_start, None + ): + shard_example_idx_start = self._state_dict["shard_example_idx"] if self._state_dict else 0 + shard_example_idx = 0 + for key, pa_table in self.generate_tables_fn(**gen_kwags): + shard_example_idx += len(pa_table) + if shard_example_idx <= shard_example_idx_start: + continue + if self._state_dict: + self._state_dict["shard_example_idx"] += len(pa_table) + yield key, pa_table + if self._state_dict: + self._state_dict["shard_idx"] += 1 + self._state_dict["shard_example_idx"] = 0 + + def shard_data_sources(self, num_shards: int, index: int, contiguous=True) -> "ArrowExamplesIterable": + """Keep only the requested shard.""" + rng = deepcopy(self.generator) + kwargs_with_shuffled_shards = _shuffle_gen_kwargs(rng, self.kwargs) + return ArrowExamplesIterable(self.generate_tables_fn, kwargs_with_shuffled_shards).shard_data_sources( + num_shards, index, contiguous=contiguous + ) + + +class RebatchedArrowExamplesIterable(_BaseExamplesIterable): + def __init__(self, ex_iterable: _BaseExamplesIterable, batch_size: Optional[int], drop_last_batch: bool = False): + super().__init__() + self.ex_iterable = ex_iterable + self.batch_size = batch_size + self.drop_last_batch = drop_last_batch + + @property + def iter_arrow(self): + return self._iter_arrow + + @property + def is_typed(self): + return self.ex_iterable.is_typed + + @property + def features(self): + return self.ex_iterable.features + + def _init_state_dict(self) -> dict: + self._state_dict = { + "examples_iterable": self.ex_iterable._init_state_dict(), + "previous_state": None, + "batch_idx": 0, + "num_chunks_since_previous_state": 0, + "cropped_chunk_length": 0, + "type": 
self.__class__.__name__, + } + return self._state_dict + + def __iter__(self): + yield from self.ex_iterable + + def _iter_arrow(self) -> Iterator[tuple[Key, pa.Table]]: + """Iterate over sub-tables of size `batch_size`.""" + if self._state_dict and self._state_dict["previous_state"]: + self.ex_iterable.load_state_dict(self._state_dict["previous_state"]) + if self.ex_iterable.iter_arrow: + iterator = self.ex_iterable.iter_arrow() + else: + iterator = _convert_to_arrow(self.ex_iterable, batch_size=1) + if self.batch_size is None or self.batch_size <= 0: + if self._state_dict and self._state_dict["batch_idx"] > 0: + return + all_pa_table = pa.concat_tables([pa_table for _, pa_table in iterator]) + if self._state_dict: + self._state_dict["batch_idx"] = 1 + yield "all", all_pa_table + return + keys_buffer = [] + chunks_buffer = [] + chunks_buffer_size = 0 + num_chunks_to_skip = self._state_dict["num_chunks_since_previous_state"] if self._state_dict else 0 + chunk_length_to_crop = self._state_dict["cropped_chunk_length"] if self._state_dict else 0 + if self._state_dict: + previous_state = self.ex_iterable.state_dict() + self._state_dict["previous_state"] = previous_state + for key, pa_table in iterator: + for num_chunks_since_previous_state, chunk in enumerate(pa_table.to_reader(max_chunksize=self.batch_size)): + if num_chunks_to_skip > 1: + num_chunks_to_skip -= 1 + continue + elif num_chunks_to_skip == 1 and chunk_length_to_crop == 0: + num_chunks_to_skip -= 1 + continue + elif num_chunks_to_skip == 1 and chunk_length_to_crop > 0: + chunk = chunk.slice(chunk_length_to_crop, len(chunk) - chunk_length_to_crop) + num_chunks_to_skip = 0 + chunk_length_to_crop = 0 + if len(chunk) == 0: + continue + + if chunks_buffer_size + len(chunk) < self.batch_size: + keys_buffer.append(key) + chunks_buffer.append(chunk) + chunks_buffer_size += len(chunk) + continue + elif chunks_buffer_size + len(chunk) == self.batch_size: + keys_buffer.append(key) + chunks_buffer.append(chunk) + 
new_key = "_".join(str(_key) for _key in keys_buffer) + if self._state_dict: + self._state_dict["batch_idx"] += 1 + self._state_dict["num_chunks_since_previous_state"] += len(chunks_buffer) + self._state_dict["cropped_chunk_length"] = 0 + yield new_key, pa.Table.from_batches(chunks_buffer) + keys_buffer = [] + chunks_buffer = [] + chunks_buffer_size = 0 + if self._state_dict: + self._state_dict["previous_state"] = previous_state + self._state_dict["num_chunks_since_previous_state"] = num_chunks_since_previous_state + 1 + else: + cropped_chunk_length = self.batch_size - chunks_buffer_size + keys_buffer.append(f"{key}[:{cropped_chunk_length}]") + chunks_buffer.append(chunk.slice(0, cropped_chunk_length)) + new_key = "_".join(str(_key) for _key in keys_buffer) + if self._state_dict: + self._state_dict["batch_idx"] += 1 + self._state_dict["num_chunks_since_previous_state"] += len(chunks_buffer) + self._state_dict["cropped_chunk_length"] = cropped_chunk_length + yield new_key, pa.Table.from_batches(chunks_buffer) + keys_buffer = [f"{key}[{cropped_chunk_length}:]"] + chunks_buffer = [chunk.slice(cropped_chunk_length, len(chunk) - cropped_chunk_length)] + chunks_buffer_size = len(chunk) - cropped_chunk_length + if self._state_dict: + self._state_dict["previous_state"] = previous_state + self._state_dict["num_chunks_since_previous_state"] = num_chunks_since_previous_state + if self._state_dict: + previous_state = self.ex_iterable.state_dict() + if not self.drop_last_batch and chunks_buffer: + new_key = "_".join(str(_key) for _key in keys_buffer) + if self._state_dict: + self._state_dict["previous_state"] = previous_state + self._state_dict["batch_idx"] += 1 + self._state_dict["num_chunks_since_previous_state"] = 0 + self._state_dict["cropped_chunk_length"] = 0 + yield new_key, pa.Table.from_batches(chunks_buffer) + + def shuffle_data_sources(self, generator: np.random.Generator) -> "RebatchedArrowExamplesIterable": + return RebatchedArrowExamplesIterable( + 
self.ex_iterable.shuffle_data_sources(generator), self.batch_size, self.drop_last_batch + ) + + def shard_data_sources(self, num_shards: int, index: int, contiguous=True) -> "RebatchedArrowExamplesIterable": + return RebatchedArrowExamplesIterable( + self.ex_iterable.shard_data_sources(num_shards, index, contiguous=contiguous), + self.batch_size, + self.drop_last_batch, + ) + + @property + def num_shards(self) -> int: + return self.ex_iterable.num_shards + + +class SelectColumnsIterable(_BaseExamplesIterable): + def __init__(self, ex_iterable: _BaseExamplesIterable, column_names: list[str]): + super().__init__() + self.ex_iterable = ex_iterable + self.column_names = column_names + + @property + def iter_arrow(self): + if self.ex_iterable.iter_arrow: + return self._iter_arrow + + @property + def is_typed(self): + return self.ex_iterable.is_typed + + @property + def features(self): + return self.ex_iterable.features + + def _init_state_dict(self) -> dict: + self._state_dict = self.ex_iterable._init_state_dict() + return self._state_dict + + def __iter__(self): + for idx, row in self.ex_iterable: + yield idx, {c: row[c] for c in self.column_names} + + def _iter_arrow(self) -> Iterator[tuple[Key, pa.Table]]: + for idx, pa_table in self.ex_iterable.iter_arrow(): + if len(pa_table) > 0: # empty tables have no schema + yield idx, pa_table.select(self.column_names) + + def shuffle_data_sources(self, generator: np.random.Generator) -> "SelectColumnsIterable": + return SelectColumnsIterable(self.ex_iterable.shuffle_data_sources(generator), self.column_names) + + def shard_data_sources(self, num_shards: int, index: int, contiguous=True) -> "SelectColumnsIterable": + return SelectColumnsIterable( + self.ex_iterable.shard_data_sources(num_shards, index, contiguous=contiguous), self.column_names + ) + + @property + def num_shards(self) -> int: + return self.ex_iterable.num_shards + + +class StepExamplesIterable(_BaseExamplesIterable): + def __init__(self, ex_iterable: 
_BaseExamplesIterable, step: int, offset: int): + super().__init__() + self.ex_iterable = ex_iterable + self.step = step + self.offset = offset + # TODO(QL): implement iter_arrow + + @property + def is_typed(self): + return self.ex_iterable.is_typed + + @property + def features(self): + return self.ex_iterable.features + + def _init_state_dict(self) -> dict: + self._state_dict = self.ex_iterable._init_state_dict() + return self._state_dict + + def __iter__(self): + ex_iterator = iter(self.ex_iterable) + while True: + batch = list(islice(ex_iterator, self.step)) + if len(batch) > self.offset: + yield batch[self.offset] + else: + break + + def shuffle_data_sources(self, generator: np.random.Generator) -> "StepExamplesIterable": + return StepExamplesIterable( + self.ex_iterable.shuffle_data_sources(generator), step=self.step, offset=self.offset + ) + + def shard_data_sources(self, num_shards: int, index: int, contiguous=True) -> "StepExamplesIterable": + return StepExamplesIterable( + self.ex_iterable.shard_data_sources(num_shards, index, contiguous=contiguous), + step=self.step, + offset=self.offset, + ) + + @property + def num_shards(self) -> int: + return self.ex_iterable.num_shards + + +class CyclingMultiSourcesExamplesIterable(_BaseExamplesIterable): + def __init__( + self, + ex_iterables: list[_BaseExamplesIterable], + stopping_strategy: Literal[ + "first_exhausted", "all_exhausted", "all_exhausted_without_replacement" + ] = "first_exhausted", + ): + super().__init__() + self.ex_iterables = ex_iterables + self.stopping_strategy = stopping_strategy + + # if undersampling ("first_exhausted"), we stop as soon as one dataset is exhausted + # if oversampling ("all_exhausted"), we stop as soons as every dataset is exhausted, i.e as soon as every samples of every dataset has been visited at least once + # if sampling without replacement ("all_exhausted_without_replacement"), we stop once all samples of every dataset has been visited exactly once. 
+ self.bool_strategy_func = ( + np.all if (stopping_strategy in ("all_exhausted", "all_exhausted_without_replacement")) else np.any + ) + + @property + def is_typed(self): + return self.ex_iterables[0].is_typed + + @property + def features(self): + return self.ex_iterables[0].features + + @property + def iter_arrow(self): + # Can iterate on arrow tables if all ex_iterables can iterate + return self._iter_arrow if all(ex_iterable.iter_arrow for ex_iterable in self.ex_iterables) else None + + def _get_indices_iterator(self): + # this is an infinite iterator to keep track of which iterator we want to pick examples from + ex_iterable_idx = self._state_dict["ex_iterable_idx"] if self._state_dict else 0 + for next_ex_iterable_idx in islice(cycle(range(len(self.ex_iterables))), ex_iterable_idx + 1, None): + if self._state_dict: + self._state_dict["ex_iterable_idx"] = next_ex_iterable_idx + yield ex_iterable_idx + ex_iterable_idx = next_ex_iterable_idx + + def _init_state_dict(self) -> dict: + self._state_dict = { + "ex_iterable_idx": 0, + "ex_iterables": [ex_iterable._init_state_dict() for ex_iterable in self.ex_iterables], + "previous_states": [None] * len(self.ex_iterables), + "is_exhausted": [False] * len(self.ex_iterables), + "type": self.__class__.__name__, + } + return self._state_dict + + def _iter_arrow(self): + # we use this to buffer one example of each iterator to know if an iterator is exhausted + nexts = [None] * len(self.ex_iterables) + # because of that, we need to rewind 1 example when reloading the state dict + if self._state_dict: + for i in range(len(self.ex_iterables)): + if self._state_dict["previous_states"][i] is not None: + self.ex_iterables[i].load_state_dict(self._state_dict["previous_states"][i]) + iterators = [ex_iterable.iter_arrow() for ex_iterable in self.ex_iterables] + + indices_iterator = self._get_indices_iterator() + + is_exhausted = ( + np.array(self._state_dict["is_exhausted"]) if self._state_dict else np.full(len(self.ex_iterables), 
False) + ) + for i in indices_iterator: + # if the stopping criteria is met, break the main for loop + if self.bool_strategy_func(is_exhausted): + break + # Skip exhausted iterators if we sample without replacement + if is_exhausted[i] and self.stopping_strategy in ["all_exhausted_without_replacement"]: + continue + # let's pick one example from the iterator at index i + if nexts[i] is None: + nexts[i] = next(iterators[i], False) + result = nexts[i] + if self._state_dict: + self._state_dict["previous_states"][i] = deepcopy(self._state_dict["ex_iterables"][i]) + nexts[i] = next(iterators[i], False) + + # the iterator is exhausted + if nexts[i] is False: + is_exhausted[i] = True + if self._state_dict: + self._state_dict["is_exhausted"][i] = True + # we reset it in case the stopping crtieria isn't met yet and we sample with replacement + if self.stopping_strategy not in ["all_exhausted_without_replacement"]: + nexts[i] = None + if self._state_dict: + self._state_dict["ex_iterables"][i] = self.ex_iterables[i]._init_state_dict() + self._state_dict["previous_states"][i] = None + iterators[i] = self.ex_iterables[i]._iter_arrow() + + if result is not False: + yield result + + def __iter__(self): + # we use this to buffer one example of each iterator to know if an iterator is exhausted + nexts = [None] * len(self.ex_iterables) + # because of that, we need to rewind 1 example when reloading the state dict + if self._state_dict: + for i in range(len(self.ex_iterables)): + if self._state_dict["previous_states"][i] is not None: + self.ex_iterables[i].load_state_dict(self._state_dict["previous_states"][i]) + iterators = [iter(ex_iterable) for ex_iterable in self.ex_iterables] + + indices_iterator = self._get_indices_iterator() + + is_exhausted = ( + np.array(self._state_dict["is_exhausted"]) if self._state_dict else np.full(len(self.ex_iterables), False) + ) + for i in indices_iterator: + # if the stopping criteria is met, break the main for loop + if 
self.bool_strategy_func(is_exhausted): + break + # let's pick one example from the iterator at index i + if is_exhausted[i] and self.stopping_strategy in ["all_exhausted_without_replacement"]: + continue + if nexts[i] is None: + nexts[i] = next(iterators[i], False) + result = nexts[i] + if self._state_dict: + self._state_dict["previous_states"][i] = deepcopy(self._state_dict["ex_iterables"][i]) + nexts[i] = next(iterators[i], False) + + # the iterator is exhausted + if nexts[i] is False: + is_exhausted[i] = True + if self._state_dict: + self._state_dict["is_exhausted"][i] = True + # we reset it in case the stopping crtieria isn't met yet + if self.stopping_strategy not in ["all_exhausted_without_replacement"]: + nexts[i] = None + if self._state_dict: + self._state_dict["ex_iterables"][i] = self.ex_iterables[i]._init_state_dict() + self._state_dict["previous_states"][i] = None + iterators[i] = iter(self.ex_iterables[i]) + if result is not False: + yield result + + def shuffle_data_sources(self, generator: np.random.Generator) -> "CyclingMultiSourcesExamplesIterable": + """Shuffle each underlying examples iterable.""" + ex_iterables = [ex_iterable.shuffle_data_sources(generator) for ex_iterable in self.ex_iterables] + return CyclingMultiSourcesExamplesIterable(ex_iterables, self.stopping_strategy) + + @property + def num_shards(self) -> int: + return min(ex_iterable.num_shards for ex_iterable in self.ex_iterables) if self.ex_iterables else 0 + + def shard_data_sources( + self, num_shards: int, index: int, contiguous=True + ) -> "CyclingMultiSourcesExamplesIterable": + """Either keep only the requested shard, or propagate the request to the underlying iterable.""" + if num_shards < self.num_shards: + return CyclingMultiSourcesExamplesIterable( + [ + iterable.shard_data_sources(num_shards, index, contiguous=contiguous) + for iterable in self.ex_iterables + ], + stopping_strategy=self.stopping_strategy, + ) + elif index < self.num_shards: + return 
CyclingMultiSourcesExamplesIterable( + [ + iterable.shard_data_sources(self.num_shards, index, contiguous=contiguous) + for iterable in self.ex_iterables + ], + stopping_strategy=self.stopping_strategy, + ) + else: + return CyclingMultiSourcesExamplesIterable( + [], + stopping_strategy=self.stopping_strategy, + ) + + +class VerticallyConcatenatedMultiSourcesExamplesIterable(_BaseExamplesIterable): + """ + VerticallyConcatenatedMultiSourcesExamplesIterable simply chains the input iterables. + It doesn't require the examples iterables to always yield the same columns. + Instead, this is handled by the `IterableDataset` class or `FormattedExamplesIterable`. + + For information, `IterableDataset` merges the features of all the datasets to concatenate into one. + We use `IterableDataset._resolve_features` to obtain the features of all the datasets to concatenate. + + Then for each example, `IterableDataset` and `FormattedExamplesIterable` automatically fill missing columns with None. + This is done with `_apply_feature_types_on_example`. 
def shuffle_data_sources(
    self, generator: np.random.Generator
) -> "VerticallyConcatenatedMultiSourcesExamplesIterable":
    """Return a new concatenation with the sources reordered and each source shuffled.

    The order of the sources is drawn from a copy of `generator`, while every source
    receives `generator` itself for its own `shuffle_data_sources` call.
    """
    # the reordering consumes random numbers from the copy only
    order_rng = deepcopy(generator)
    reordered_sources = [*self.ex_iterables]
    order_rng.shuffle(reordered_sources)

    shuffled_sources = []
    for source in reordered_sources:
        shuffled_sources.append(source.shuffle_data_sources(generator))
    return VerticallyConcatenatedMultiSourcesExamplesIterable(shuffled_sources)
def _check_column_names(column_names: list[str]):
    """Raise a ValueError if any name appears more than once in `column_names`.

    The error message lists the duplicated names in order of first appearance.
    """
    occurrences = Counter(column_names)
    duplicated_columns = [name for name, seen in occurrences.items() if seen > 1]
    if duplicated_columns:
        raise ValueError(
            f"The examples iterables can't have duplicated columns but columns {duplicated_columns} are duplicated."
        )
def __iter__(self):
    """Yield merged examples by advancing all the wrapped iterables in lockstep.

    At each step one `(key, example)` pair is taken from every iterable that still has
    data. The examples are merged into a single dict and the keys are joined with "_".
    Column names are checked for duplicates on the first step only. Iteration stops at
    the first step after which no iterable has data left.
    """
    live_iterators = [iter(source) for source in self.ex_iterables]
    for step in itertools.count():
        keys, examples = [], []
        # loop over a snapshot, since exhausted iterators are removed from the live list
        for iterator in tuple(live_iterators):
            try:
                key, example = next(iterator)
            except StopIteration:
                live_iterators.remove(iterator)
            else:
                keys.append(key)
                examples.append(example)
        if not live_iterators:
            break
        if step == 0:
            _check_column_names([column_name for example in examples for column_name in example])
        merged_example = {}
        for example in examples:
            merged_example.update(example)
        merged_key = "_".join(str(key) for key in keys)
        yield merged_key, merged_example
def _get_indices_iterator(self):
    """Yield, forever, the index of the source to pick the next example from.

    Indices are drawn in batches of 1000 from a copy of `self.generator`, uniformly when
    `self.probabilities` is None and weighted by it otherwise. When a state dict is set,
    the generator state and the offset inside the current batch are restored from it
    first. The offset is then updated before every yield, and the generator state is
    saved each time a batch has been fully consumed.
    """
    rng = deepcopy(self.generator)
    num_sources = len(self.ex_iterables)
    random_batch_size = 1000

    if self._state_dict:
        index_offset = self._state_dict["bit_generator_index_offset"]
        rng.bit_generator.state = self._state_dict["bit_generator_state"]
    else:
        index_offset = 0

    # the sampling mode is chosen once, before the first batch is drawn
    if self.probabilities is None:

        def sample_batch():
            return rng.integers(0, num_sources, size=random_batch_size)

    else:

        def sample_batch():
            return rng.choice(num_sources, size=random_batch_size, p=self.probabilities)

    while True:
        # only the first batch can start at a non-zero offset: the offset wraps to 0 at the end of a batch
        for source_idx in islice(sample_batch(), index_offset, None):
            index_offset = (index_offset + 1) % random_batch_size
            if self._state_dict:
                self._state_dict["bit_generator_index_offset"] = index_offset
                if index_offset == 0:
                    self._state_dict["bit_generator_state"] = rng.bit_generator.state
            yield int(source_idx)
def __init__(
    self,
    ex_iterable: _BaseExamplesIterable,
    function: Callable,
    with_indices: bool = False,
    input_columns: Optional[list[str]] = None,
    batched: bool = False,
    batch_size: Optional[int] = 1000,
    drop_last_batch: bool = False,
    remove_columns: Optional[list[str]] = None,
    fn_kwargs: Optional[dict] = None,
    formatting: Optional["FormattingConfig"] = None,
    features: Optional[Features] = None,
    max_num_running_async_map_functions_in_parallel: Optional[int] = None,
):
    """Wrap `ex_iterable` so that `function` is applied to what it yields.

    Args:
        ex_iterable: the examples iterable to transform.
        function: the callable to apply.
        with_indices, input_columns, batched, batch_size, drop_last_batch, remove_columns:
            options stored on the instance and read by the iteration methods.
        fn_kwargs: extra keyword arguments for `function`; `None` is stored as an empty dict.
        formatting: formatting configuration. When it is a table format, `ex_iterable` is
            validated as described under "Raises".
        features: stored as `self._features`.
        max_num_running_async_map_functions_in_parallel: when falsy, falls back to
            `config.MAX_NUM_RUNNING_ASYNC_MAP_FUNCTIONS_IN_PARALLEL`.

    Raises:
        ValueError: if `formatting` is a table format and `ex_iterable` is not a
            `RebatchedArrowExamplesIterable`, or if its `batch_size` differs from
            `batch_size` (1 when not `batched`).
    """
    super().__init__()
    self.ex_iterable = ex_iterable
    self.function = function
    self.batched = batched
    self.batch_size = batch_size
    self.drop_last_batch = drop_last_batch
    self.remove_columns = remove_columns
    self.with_indices = with_indices
    self.input_columns = input_columns
    self.fn_kwargs = fn_kwargs or {}
    self.formatting = formatting  # required for iter_arrow
    self._features = features
    self.max_num_running_async_map_functions_in_parallel = (
        max_num_running_async_map_functions_in_parallel or config.MAX_NUM_RUNNING_ASYNC_MAP_FUNCTIONS_IN_PARALLEL
    )
    # sanity checks
    if formatting and formatting.is_table:
        # batch_size should match for iter_arrow
        if not isinstance(ex_iterable, RebatchedArrowExamplesIterable):
            # the adjacent f-strings are concatenated as-is, so the separating space must be explicit
            raise ValueError(
                f"The {formatting.format_type.capitalize()}-formatted {type(self).__name__} has underlying iterable "
                f"that is a {type(ex_iterable).__name__} instead of a RebatchedArrowExamplesIterable."
            )
        elif ex_iterable.batch_size != (batch_size if batched else 1):
            raise ValueError(
                f"The {formatting.format_type.capitalize()}-formatted {type(self).__name__} has batch_size={batch_size if batched else 1} which is "
                f"different from {ex_iterable.batch_size=} from its underlying iterable."
            )
    # to enable graceful ends
    self._owned_loops_and_tasks: list[tuple[asyncio.AbstractEventLoop, list[asyncio.Task]]] = []
+ ) + key_examples_list = [(key, example)] + list(iterator_batch) + keys, examples = zip(*key_examples_list) + # the new key is the concatenation of the examples keys from the batch + key = "_".join(str(key) for key in keys) + if ( + self.drop_last_batch + and self.batch_size is not None + and self.batch_size > 0 + and len(examples) < self.batch_size + ): # ignore last batch + return + batch = _examples_to_batch(examples) + # we need to format here in case we need to stack tensors together + batch = format_dict(batch) if format_dict else batch + indices = [current_idx + i for i in range(len(key_examples_list))] + current_idx += len(indices) + yield indices, (key, batch) + + def iter_inputs(): + nonlocal current_idx + for key, example in iterator: + # If not batched, we can apply the transform and yield the example directly + # first copy the example, since we might drop some keys + example = dict(example) + # no need to do formatting here + current_idx += 1 + yield current_idx - 1, (key, example) + + def validate_function_output(processed_inputs): + if self.batched and processed_inputs: + first_col = next(iter(processed_inputs)) + bad_cols = [ + col for col in processed_inputs if len(processed_inputs[col]) != len(processed_inputs[first_col]) + ] + if bad_cols: + raise ValueError( + f"Column lengths mismatch: columns {bad_cols} have length {[len(processed_inputs[col]) for col in bad_cols]} " + f"while {first_col} has length {len(processed_inputs[first_col])}." 
+ ) + + def prepare_inputs(key_example, indices): + key, example = key_example + fn_args = [example] if self.input_columns is None else [example[col] for col in self.input_columns] + additional_args = () + if self.with_indices: + fn_args += (indices,) + inputs = dict(example) + return inputs, fn_args, additional_args, self.fn_kwargs + + def prepare_outputs(key_example, inputs, processed_inputs): + validate_function_output(processed_inputs) + # this logic mimics the one in Dataset.map + if self.remove_columns: + for c in self.remove_columns: + if c in inputs: + del inputs[c] + if processed_inputs is key_example[1] and c in processed_inputs: + del processed_inputs[c] + transformed_inputs = {**inputs, **processed_inputs} + # no need to do features decoding here + return transformed_inputs + + def apply_function(key_example, indices): + """Utility to apply the function on a selection of columns.""" + inputs, fn_args, additional_args, fn_kwargs = prepare_inputs(key_example, indices) + processed_inputs = self.function(*fn_args, *additional_args, **fn_kwargs) + return prepare_outputs(key_example, inputs, processed_inputs) + + async def async_apply_function(key_example, indices): + """Utility to apply the function on a selection of columns. 
Same code but async""" + inputs, fn_args, additional_args, fn_kwargs = prepare_inputs(key_example, indices) + processed_inputs = await self.function(*fn_args, *additional_args, **fn_kwargs) + return prepare_outputs(key_example, inputs, processed_inputs) + + tasks: list[asyncio.Task] = [] + if inspect.iscoroutinefunction(self.function): + try: + loop = asyncio.get_running_loop() + except RuntimeError: + loop = asyncio.new_event_loop() + self._owned_loops_and_tasks.append((loop, tasks)) + else: + loop = None + + def iter_outputs(): + nonlocal tasks, loop + inputs_iterator = iter_batched_inputs() if self.batched else iter_inputs() + if inspect.iscoroutinefunction(self.function): + if self._state_dict: + previous_state = self.ex_iterable.state_dict() + self._state_dict["previous_state"] = previous_state + previous_state_task = None + previous_state_example_idx = self._state_dict["previous_state_example_idx"] + indices: Union[list[int], list[list[int]]] = [] + for i, key_example in inputs_iterator: + indices.append(i) + tasks.append(loop.create_task(async_apply_function(key_example, i))) + # keep the total active tasks under a certain number + if len(tasks) >= self.max_num_running_async_map_functions_in_parallel: + done, pending = loop.run_until_complete( + asyncio.wait(tasks, return_when=asyncio.FIRST_COMPLETED) + ) + while tasks and len(pending) >= self.max_num_running_async_map_functions_in_parallel: + done, pending = loop.run_until_complete( + asyncio.wait(tasks, return_when=asyncio.FIRST_COMPLETED) + ) + if len(tasks) >= 10 * self.max_num_running_async_map_functions_in_parallel: + loop.run_until_complete(tasks[0]) + # yield finished tasks + while tasks and tasks[0].done(): + i, task = indices.pop(0), tasks.pop(0) + yield i, task.result() + if self._state_dict and task is previous_state_task: + self._state_dict["previous_state"] = previous_state + self._state_dict["num_examples_since_previous_state"] = 0 + self._state_dict["previous_state_example_idx"] = 
def _iter_arrow(self, max_chunksize: Optional[int] = None) -> Iterator[tuple[Key, pa.Table]]:
    """Apply `self.function` to Arrow tables from the wrapped iterable and yield the results.

    Args:
        max_chunksize: when None, each output table is yielded whole under the key of its
            input table. Otherwise the output table is read back in chunks of at most
            `max_chunksize` rows, each yielded under the key `f"{key}_{i}"`.

    Yields:
        `(key, pa.Table)` pairs.

    Raises:
        TypeError: if `_table_output_to_arrow` does not turn the output of `self.function`
            into a `pa.Table`.
    """
    formatter: TableFormatter = get_formatter(self.formatting.format_type) if self.formatting else ArrowFormatter()
    # use the wrapped iterable's own arrow iterator when it has one, else go through `_convert_to_arrow`
    if self.ex_iterable.iter_arrow:
        iterator = self.ex_iterable.iter_arrow()
    else:
        iterator = _convert_to_arrow(
            self.ex_iterable,
            batch_size=self.batch_size if self.batched else 1,
            drop_last_batch=self.drop_last_batch,
        )
    # resuming: reload the wrapped iterable's saved state and note how many chunks were counted since then
    # (`num_examples_to_skip` is only consumed in the chunked branch below)
    if self._state_dict and self._state_dict["previous_state"]:
        self.ex_iterable.load_state_dict(self._state_dict["previous_state"])
        num_examples_to_skip = self._state_dict["num_examples_since_previous_state"]
    else:
        num_examples_to_skip = 0
    # in chunked mode, record the wrapped iterable's state before the first table is pulled
    if self._state_dict and max_chunksize is not None:
        self._state_dict["previous_state"] = self.ex_iterable.state_dict()
        self._state_dict["num_examples_since_previous_state"] = 0
    current_idx = self._state_dict["previous_state_example_idx"] if self._state_dict else 0
    for key, pa_table in iterator:
        # with `drop_last_batch`, stop at the first table shorter than `batch_size`
        if (
            self.batched
            and self.batch_size is not None
            and len(pa_table) < self.batch_size
            and self.drop_last_batch
        ):
            return
        # first build the batch
        function_args = (
            [formatter.format_batch(pa_table)]
            if self.input_columns is None
            else [pa_table[col] for col in self.input_columns]
        )
        if self.with_indices:
            # batched: one index per row of the table; otherwise a single index
            if self.batched:
                function_args.append([current_idx + i for i in range(len(pa_table))])
            else:
                function_args.append(current_idx)
        # then apply the transform
        output = self.function(*function_args, **self.fn_kwargs)
        output_table = _table_output_to_arrow(output)
        if not isinstance(output_table, pa.Table):
            raise TypeError(
                f"Provided `function` which is applied to {formatter.table_type} returns a variable of type "
                f"{type(output)}. Make sure provided `function` returns a {formatter.table_type} to update the dataset."
            )
        # we don't need to merge results for consistency with Dataset.map which merges iff both input and output are dicts
        # then remove the unwanted columns
        if self.remove_columns:
            for column in self.remove_columns:
                if column in output_table.column_names:
                    output_table = output_table.remove_column(output_table.column_names.index(column))
        # return output
        if max_chunksize is None:
            # whole-table mode: the example index advances by the number of input rows
            current_idx += len(pa_table)
            if self._state_dict:
                self._state_dict["previous_state_example_idx"] += len(pa_table)
            yield key, output_table
        else:
            for i, pa_subtable in enumerate(output_table.to_reader(max_chunksize=max_chunksize)):
                current_idx += 1
                if self._state_dict:
                    self._state_dict["num_examples_since_previous_state"] += 1
                # when resuming, skip the chunks that were already counted before the state was saved
                if num_examples_to_skip > 0:
                    num_examples_to_skip -= 1
                    continue
                yield f"{key}_{i}", pa_subtable
            # the table is fully consumed: record a new checkpoint
            if self._state_dict:
                self._state_dict["previous_state"] = self.ex_iterable.state_dict()
                self._state_dict["num_examples_since_previous_state"] = 0
                self._state_dict["previous_state_example_idx"] += len(pa_table)
def _add_mask(
    input: Union[dict, pa.Table],
    mask: Union[bool, list, pa.Array, pa.ChunkedArray, pa.BooleanScalar],
    mask_column_name: str,
):
    """Attach `mask` to `input` under the name `mask_column_name`.

    For a `pa.Table`, the mask is appended as a new column and the resulting table is
    returned. A mask that is not a list, `pa.Array` or `pa.ChunkedArray` is first wrapped
    in a one-element boolean array. For any other input, a new dict holding only the
    mask is returned.
    """
    if not isinstance(input, pa.Table):
        return {mask_column_name: mask}
    column_like_types = (list, pa.Array, pa.ChunkedArray)
    if isinstance(mask, column_like_types):
        mask_column = mask
    else:
        mask_column = pa.array([mask], type=pa.bool_())
    return input.append_column(mask_column_name, mask_column)
class BufferShuffledExamplesIterable(_BaseExamplesIterable):
    """Shuffle the wrapped examples iterable through a fixed-size in-memory buffer.

    The buffer is first filled with `buffer_size` items. After that, each new item replaces
    a randomly chosen buffered item, which is yielded. Once the wrapped iterable is
    exhausted, the remaining buffered items are shuffled and yielded.
    """

    def __init__(self, ex_iterable: _BaseExamplesIterable, buffer_size: int, generator: np.random.Generator):
        super().__init__()
        self.ex_iterable = ex_iterable
        self.buffer_size = buffer_size
        self.generator = generator

    @property
    def is_typed(self):
        return self.ex_iterable.is_typed

    @property
    def features(self):
        return self.ex_iterable.features

    @property
    def iter_arrow(self):
        return self._iter_arrow if self.ex_iterable.iter_arrow else None

    def _init_state_dict(self) -> dict:
        """Use the wrapped iterable's state dict as this iterable's state, and remember its initial value."""
        self._state_dict = self.ex_iterable._init_state_dict()
        self._original_state_dict = self.state_dict()
        return self._state_dict

    def load_state_dict(self, state_dict: dict) -> dict:
        """Load `state_dict`, warning when it differs from the initial state.

        The buffer content is not part of the state dict, so the buffer is refilled from
        the restored position.
        """
        if self._state_dict and state_dict != self._original_state_dict:
            # the two literals are concatenated as-is, so the separating space must be explicit
            logger.warning(
                "Loading a state dict of a shuffle buffer of a dataset without the buffer content. "
                "The shuffle buffer will be refilled before starting to yield new examples."
            )
        return super().load_state_dict(state_dict)

    @staticmethod
    def _iter_random_indices(rng: np.random.Generator, buffer_size: int, random_batch_size=1000) -> Iterator[int]:
        """Yield, forever, random positions in `[0, buffer_size)`, drawn `random_batch_size` at a time."""
        while True:
            yield from (int(i) for i in rng.integers(0, buffer_size, size=random_batch_size))

    def _iter_through_buffer(self, items):
        """Yield the elements of `items` in buffer-shuffled order, using a copy of `self.generator`."""
        buffer_size = self.buffer_size
        rng = deepcopy(self.generator)
        indices_iterator = self._iter_random_indices(rng, buffer_size)
        # this is the shuffle buffer that we keep in memory
        mem_buffer = []
        for x in items:
            if len(mem_buffer) == buffer_size:  # if the buffer is full, pick an example from it
                i = next(indices_iterator)
                yield mem_buffer[i]
                mem_buffer[i] = x  # replace the picked example by a new one
            else:  # otherwise, keep filling the buffer
                mem_buffer.append(x)
        # when we run out of examples, we shuffle the remaining examples in the buffer and yield them
        rng.shuffle(mem_buffer)
        yield from mem_buffer

    def __iter__(self):
        yield from self._iter_through_buffer(self.ex_iterable)

    def _iter_arrow(self):
        yield from self._iter_through_buffer(
            (key, pa_table) for key, pa_table in self.ex_iterable.iter_arrow()
        )

    def shuffle_data_sources(self, generator: np.random.Generator) -> "BufferShuffledExamplesIterable":
        """Shuffle the wrapped examples iterable as well as the shuffling buffer."""
        return BufferShuffledExamplesIterable(
            self.ex_iterable.shuffle_data_sources(generator), buffer_size=self.buffer_size, generator=generator
        )

    def shard_data_sources(self, num_shards: int, index: int, contiguous=True) -> "BufferShuffledExamplesIterable":
        """Keep only the requested shard."""
        return BufferShuffledExamplesIterable(
            self.ex_iterable.shard_data_sources(num_shards, index, contiguous=contiguous),
            buffer_size=self.buffer_size,
            generator=self.generator,
        )

    @property
    def num_shards(self) -> int:
        return self.ex_iterable.num_shards
class RepeatExamplesIterable(_BaseExamplesIterable):
    """
    Iterable that goes through the underlying iterable `num_times` times in a row.
    `num_times=None` repeats without end; zero or a negative value yields nothing.
    """

    def __init__(
        self,
        ex_iterable: _BaseExamplesIterable,
        num_times: Optional[int],
    ):
        super().__init__()
        self.num_times = num_times
        self.ex_iterable = ex_iterable

    def _init_state_dict(self) -> dict:
        """Create the state dict: the number of completed passes plus the wrapped iterable's state."""
        state = {"repeat_index": 0}
        state["examples_iterable"] = self.ex_iterable._init_state_dict()
        state["type"] = self.__class__.__name__
        self._state_dict = state
        return self._state_dict

    def __iter__(self):
        """Yield the examples of every pass, starting from the pass recorded in the state dict (if any)."""
        if self._state_dict:
            repeat_index = self._state_dict["repeat_index"]
        else:
            repeat_index = 0
        while self.num_times is None or repeat_index < max(self.num_times, 0):
            yield from self.ex_iterable
            repeat_index += 1
            if self._state_dict:
                # a full pass is done: record it and reset the wrapped iterable's state for the next pass
                self._state_dict["repeat_index"] = repeat_index
                self._state_dict["examples_iterable"] = self.ex_iterable._init_state_dict()

    def shuffle_data_sources(self, generator: np.random.Generator) -> "RepeatExamplesIterable":
        """Shuffle the underlying iterable, then repeat."""
        shuffled = self.ex_iterable.shuffle_data_sources(generator)
        return RepeatExamplesIterable(shuffled, num_times=self.num_times)

    def shard_data_sources(self, num_shards: int, index: int, contiguous=True) -> "RepeatExamplesIterable":
        """Shard, then repeat shards."""
        sharded = self.ex_iterable.shard_data_sources(num_shards, index, contiguous=contiguous)
        return RepeatExamplesIterable(sharded, num_times=self.num_times)

    @property
    def num_shards(self) -> int:
        return self.ex_iterable.num_shards
self.ex_iterable.features + + def _init_state_dict(self) -> dict: + self._state_dict = { + "num_taken": 0, + "examples_iterable": self.ex_iterable._init_state_dict(), + "type": self.__class__.__name__, + } + return self._state_dict + + def __iter__(self): + ex_iterable_num_taken = self._state_dict["num_taken"] if self._state_dict else 0 + for key_example in islice(self.ex_iterable, self.n - ex_iterable_num_taken): + if self._state_dict: + self._state_dict["num_taken"] += 1 + yield key_example + + @staticmethod + def split_number(num, n): + quotient = num // n + remainder = num % n + result = [quotient] * n + for i in range(remainder): + result[i] += 1 + return result + + def shuffle_data_sources(self, generator: np.random.Generator) -> "TakeExamplesIterable": + """May not shuffle the wrapped examples iterable since it would take examples from other shards instead.""" + if self.block_sources_order_when_shuffling: + return self + else: + return TakeExamplesIterable( + self.ex_iterable.shuffle_data_sources(generator), + n=self.n, + block_sources_order_when_shuffling=self.block_sources_order_when_shuffling, + split_when_sharding=self.split_when_sharding, + ) + + def shard_data_sources(self, num_shards: int, index: int, contiguous=True) -> "TakeExamplesIterable": + """Keep only the requested shard.""" + if self.split_when_sharding: + return TakeExamplesIterable( + self.ex_iterable.shard_data_sources(num_shards, index, contiguous=contiguous), + n=self.split_number(self.n, num_shards)[index], + block_sources_order_when_shuffling=self.block_sources_order_when_shuffling, + split_when_sharding=self.split_when_sharding, + ) + else: + return TakeExamplesIterable( + self.ex_iterable.shard_data_sources(num_shards, index, contiguous=contiguous), + n=self.n, + block_sources_order_when_shuffling=self.block_sources_order_when_shuffling, + split_when_sharding=self.split_when_sharding, + ) + + @property + def num_shards(self) -> int: + return self.ex_iterable.num_shards + + +def 
_apply_feature_types_on_example( + example: dict, features: Features, token_per_repo_id: dict[str, Union[str, bool, None]] +) -> dict: + example = dict(example) + # add missing columns + for column_name in features: + if column_name not in example: + example[column_name] = None + # we encode the example for ClassLabel feature types for example + encoded_example = features.encode_example(example) + # Decode example for Audio feature, e.g. + decoded_example = features.decode_example(encoded_example, token_per_repo_id=token_per_repo_id) + return decoded_example + + +def _apply_feature_types_on_batch( + batch: dict, features: Features, token_per_repo_id: dict[str, Union[str, bool, None]] +) -> dict: + batch = dict(batch) + # add missing columns + n_examples = len(batch[next(iter(batch))]) + for column_name in features: + if column_name not in batch: + batch[column_name] = [None] * n_examples + # we encode the batch for ClassLabel feature types for example + encoded_batch = features.encode_batch(batch) + # Decode batch for Audio feature, e.g. 
+ decoded_batch = features.decode_batch(encoded_batch, token_per_repo_id=token_per_repo_id) + return decoded_batch + + +@dataclass +class FormattingConfig: + format_type: Optional[str] + + @property + def is_table(self) -> bool: + return isinstance(get_formatter(self.format_type), TableFormatter) + + @property + def is_tensor(self) -> bool: + return isinstance(get_formatter(self.format_type), TensorFormatter) + + +class FormattedExamplesIterable(_BaseExamplesIterable): + def __init__( + self, + ex_iterable: _BaseExamplesIterable, + formatting: Optional[FormattingConfig], + features: Optional[Features], + token_per_repo_id: dict[str, Union[str, bool, None]], + ): + super().__init__() + self.ex_iterable = ex_iterable + self._features = features + self.formatting = formatting + self.token_per_repo_id = token_per_repo_id + + @property + def iter_arrow(self): + if self.ex_iterable.iter_arrow and (not self.formatting or self.formatting.is_table): + return self._iter_arrow + + @property + def is_typed(self): + return self.ex_iterable.is_typed or self._features is not None + + @property + def features(self): + return self._features + + def _init_state_dict(self) -> dict: + self._state_dict = self.ex_iterable._init_state_dict() + return self._state_dict + + def __iter__(self): + if not self.formatting or self.formatting.is_table: + formatter = PythonFormatter( + features=self._features if not self.ex_iterable.is_typed else None, + token_per_repo_id=self.token_per_repo_id, + ) + else: + formatter = get_formatter( + self.formatting.format_type, + features=self._features if not self.ex_iterable.is_typed else None, + token_per_repo_id=self.token_per_repo_id, + ) + if self.ex_iterable.iter_arrow: + # feature casting (inc column addition) handled within self._iter_arrow() + for key, pa_table in self._iter_arrow(): + batch = formatter.format_batch(pa_table) + for example in _batch_to_examples(batch): + yield key, example + else: + format_dict = ( + formatter.recursive_tensorize + 
if isinstance(formatter, TensorFormatter) + else None # cast in case features is None + ) + for key, example in self.ex_iterable: + # don't apply feature types if already applied by ex_iterable (e.g. in case of chained with_format) + if self.features and not self.ex_iterable.is_typed: + example = _apply_feature_types_on_example( + example, self.features, token_per_repo_id=self.token_per_repo_id + ) + if format_dict: + example = format_dict(example) + yield key, example + + def _iter_arrow(self) -> Iterator[tuple[Key, pa.Table]]: + if not self.features: + yield from self.ex_iterable._iter_arrow() + for key, pa_table in self.ex_iterable._iter_arrow(): + columns = set(pa_table.column_names) + schema = self.features.arrow_schema + # add missing columns + for column_name in self.features: + if column_name not in columns: + col = pa.NullArray.from_buffers(pa.null(), len(pa_table), [None]) + pa_table = pa_table.append_column(column_name, col) + if pa_table.schema != schema: + pa_table = cast_table_to_features(pa_table, self.features) + yield key, pa_table + + def shuffle_data_sources(self, generator: np.random.Generator) -> "FormattedExamplesIterable": + """Shuffle the wrapped examples iterable.""" + return FormattedExamplesIterable( + self.ex_iterable.shuffle_data_sources(generator), + features=self.features, + token_per_repo_id=self.token_per_repo_id, + formatting=self.formatting, + ) + + def shard_data_sources(self, num_shards: int, index: int, contiguous=True) -> "FormattedExamplesIterable": + """Keep only the requested shard.""" + return FormattedExamplesIterable( + self.ex_iterable.shard_data_sources(num_shards, index, contiguous=contiguous), + features=self.features, + token_per_repo_id=self.token_per_repo_id, + formatting=self.formatting, + ) + + @property + def num_shards(self) -> int: + return self.ex_iterable.num_shards + + +@dataclass +class ShufflingConfig: + generator: np.random.Generator + _original_seed: Optional[int] = None + + +@dataclass +class 
DistributedConfig: + rank: int + world_size: int + + +def _maybe_add_torch_iterable_dataset_parent_class(cls): + """Add torch.utils.data.IterableDataset as a parent class if 'torch' is available""" + if config.TORCH_AVAILABLE: + import torch.utils.data + + if torch.utils.data.IterableDataset not in cls.__bases__: + cls.__bases__ += (torch.utils.data.IterableDataset,) + + +def _maybe_share_with_torch_persistent_workers(value: Union[int, "torch.Tensor"]) -> Union[int, "torch.Tensor"]: + if config.TORCH_AVAILABLE: + import torch + + if isinstance(value, torch.Tensor): + return value.share_memory_() + else: + return torch.tensor(value).share_memory_() + else: + return value + + +class IterableColumn: + """ + An iterable for a specific column of an [`IterableDataset`]. + + Example: + + Iterate on the texts of the "text" column of a dataset: + + ```python + for text in dataset["text"]: + ... + ``` + + It also works with nested columns: + + ```python + for source in dataset["metadata"]["source"]: + ... 
+ ``` + """ + + def __init__(self, source: Union["IterableDataset", "IterableColumn"], column_name: str): + self.source = source + self.column_name = column_name + + def __iter__(self) -> Iterator[Any]: + for example in self.source: + yield example[self.column_name] + + def __getitem__(self, column_name: str) -> "IterableColumn": + return IterableColumn(self, column_name) + + +class IterableDataset(DatasetInfoMixin): + """A Dataset backed by an iterable.""" + + def __init__( + self, + ex_iterable: _BaseExamplesIterable, + info: Optional[DatasetInfo] = None, + split: Optional[NamedSplit] = None, + formatting: Optional[FormattingConfig] = None, + shuffling: Optional[ShufflingConfig] = None, + distributed: Optional[DistributedConfig] = None, + token_per_repo_id: Optional[dict[str, Union[str, bool, None]]] = None, + ): + if distributed and distributed.world_size > 1 and shuffling and shuffling._original_seed is None: + raise RuntimeError( + "The dataset doesn't have a fixed random seed across nodes to shuffle and split the list of dataset shards by node. " + "Please pass e.g. `seed=42` in `.shuffle()` to make all the nodes use the same seed. " + ) + + info = info.copy() if info is not None else DatasetInfo() + DatasetInfoMixin.__init__(self, info=info, split=split) + + self._ex_iterable = copy.copy(ex_iterable) + self._formatting = formatting + self._shuffling = shuffling + self._distributed = distributed + self._token_per_repo_id: dict[str, Union[str, bool, None]] = token_per_repo_id or {} + self._epoch: Union[int, "torch.Tensor"] = _maybe_share_with_torch_persistent_workers(0) + self._starting_state_dict: Optional[dict] = None + self.__hffs_cache = HfFileSystem._cache # keep the cache on pickling (e.g. 
for dataloader workers) + self._prepare_ex_iterable_for_iteration() # set state_dict + _maybe_add_torch_iterable_dataset_parent_class(self.__class__) # subclass of torch IterableDataset + + @property + def num_columns(self) -> Optional[int]: + """Number of columns in the dataset. + This can be None if the dataset has unknown features (e.g. after a map() operation). + + Example: + + ```py + >>> from datasets import load_dataset + >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", split="validation") + >>> ds.num_columns + 2 + ``` + """ + return None if self.features is None else len(self.features) + + @property + def column_names(self) -> Optional[list[str]]: + """Names of the columns in the dataset. + This can be None if the dataset has unknown features (e.g. after a map() operation). + + Example: + + ```py + >>> from datasets import load_dataset + >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", split="validation", streaming=True) + >>> ds.column_names + ['text', 'label'] + ``` + """ + return None if self.features is None else list(self.features) + + def state_dict(self) -> dict: + """Get the current state_dict of the dataset. + It corresponds to the state at the latest example it yielded. + + Resuming returns exactly where the checkpoint was saved except in two cases: + + 1. examples from shuffle buffers are lost when resuming and the buffers are refilled with new data + 2. combinations of `.with_format(arrow)` and batched `.map()` may skip one batch. + + Returns: + `dict` + + Example: + + ```py + >>> from datasets import Dataset, concatenate_datasets + >>> ds = Dataset.from_dict({"a": range(6)}).to_iterable_dataset(num_shards=3) + >>> for idx, example in enumerate(ds): + ... print(example) + ... if idx == 2: + ... state_dict = ds.state_dict() + ... print("checkpoint") + ... break + >>> ds.load_state_dict(state_dict) + >>> print(f"restart from checkpoint") + >>> for example in ds: + ... 
print(example) + ``` + + which returns: + ``` + {'a': 0} + {'a': 1} + {'a': 2} + checkpoint + restart from checkpoint + {'a': 3} + {'a': 4} + {'a': 5} + ``` + + ```py + >>> from torchdata.stateful_dataloader import StatefulDataLoader + >>> ds = load_dataset("deepmind/code_contests", streaming=True, split="train") + >>> dataloader = StatefulDataLoader(ds, batch_size=32, num_workers=4) + >>> # checkpoint + >>> state_dict = dataloader.state_dict() # uses ds.state_dict() under the hood + >>> # resume from checkpoint + >>> dataloader.load_state_dict(state_dict) # uses ds.load_state_dict() under the hood + ``` + """ + return copy.deepcopy(self._state_dict) + + def load_state_dict(self, state_dict: dict) -> None: + """Load the state_dict of the dataset. + The iteration will restart at the next example from when the state was saved. + + Resuming returns exactly where the checkpoint was saved except in two cases: + + 1. examples from shuffle buffers are lost when resuming and the buffers are refilled with new data + 2. combinations of `.with_format(arrow)` and batched `.map()` may skip one batch. + + Example: + + ```py + >>> from datasets import Dataset, concatenate_datasets + >>> ds = Dataset.from_dict({"a": range(6)}).to_iterable_dataset(num_shards=3) + >>> for idx, example in enumerate(ds): + ... print(example) + ... if idx == 2: + ... state_dict = ds.state_dict() + ... print("checkpoint") + ... break + >>> ds.load_state_dict(state_dict) + >>> print(f"restart from checkpoint") + >>> for example in ds: + ... 
print(example) + ``` + + which returns: + ``` + {'a': 0} + {'a': 1} + {'a': 2} + checkpoint + restart from checkpoint + {'a': 3} + {'a': 4} + {'a': 5} + ``` + + ```py + >>> from torchdata.stateful_dataloader import StatefulDataLoader + >>> ds = load_dataset("deepmind/code_contests", streaming=True, split="train") + >>> dataloader = StatefulDataLoader(ds, batch_size=32, num_workers=4) + >>> # checkpoint + >>> state_dict = dataloader.state_dict() # uses ds.state_dict() under the hood + >>> # resume from checkpoint + >>> dataloader.load_state_dict(state_dict) # uses ds.load_state_dict() under the hood + ``` + """ + self._starting_state_dict = state_dict + + def __repr__(self): + return f"IterableDataset({{\n features: {list(self._info.features.keys()) if self._info.features is not None else 'Unknown'},\n num_shards: {self.num_shards}\n}})" + + def __getstate__(self): + return self.__dict__ + + def __setstate__(self, d): + self.__dict__ = d + # Re-add torch shared memory, since shared memory is not always kept when pickling + self._epoch = _maybe_share_with_torch_persistent_workers(self._epoch) + # Re-add the cache to keep on pickling (e.g. 
for dataloader workers) + self.__hffs_cache = HfFileSystem._cache + # Re-add torch iterable dataset as a parent class, since dynamically added parent classes are not kept when pickling + _maybe_add_torch_iterable_dataset_parent_class(self.__class__) + + def _head(self, n=5): + return next(iter(self.iter(batch_size=n))) + + @property + def epoch(self) -> int: + return int(self._epoch) + + def _effective_generator(self): + if self._shuffling and self.epoch == 0: + return self._shuffling.generator + elif self._shuffling: + # Create effective seed using self.epoch (we subtract in order to avoir overflow in long_scalars) + effective_seed = deepcopy(self._shuffling.generator).integers(0, 1 << 63) - self.epoch + effective_seed = (1 << 63) + effective_seed if effective_seed < 0 else effective_seed + return np.random.default_rng(effective_seed) + else: + raise ValueError("This dataset is not shuffled") + + @property + def num_shards(self) -> int: + if self._distributed and self._ex_iterable.num_shards % self._distributed.world_size == 0: + return self._ex_iterable.num_shards // self._distributed.world_size + return self._ex_iterable.num_shards + + @property + def n_shards(self) -> int: # backward compatibility + return self.num_shards + + def _iter_pytorch(self): + ex_iterable = self._prepare_ex_iterable_for_iteration() + # Fix for fsspec when using multiprocess to avoid hanging in the ML training loop. (only required for fsspec >= 0.9.0) + # See https://github.com/fsspec/gcsfs/issues/379 + fsspec.asyn.reset_lock() + # check if there aren't too many workers + import torch.utils.data + + worker_info = torch.utils.data.get_worker_info() + if self._is_main_process() and ex_iterable.num_shards < worker_info.num_workers: + logger.warning( + f"Too many dataloader workers: {worker_info.num_workers} (max is dataset.num_shards={ex_iterable.num_shards}). " + f"Stopping {worker_info.num_workers - ex_iterable.num_shards} dataloader workers." 
+ ) + logger.info( + f"To parallelize data loading, we give each process some shards (or data sources) to process. " + f"Therefore it's unnecessary to have a number of workers greater than dataset.num_shards={ex_iterable.num_shards}. " + f"To enable more parallelism, please split the dataset in more files than {ex_iterable.num_shards}." + ) + # split workload + _log_prefix = f"node#{self._distributed.rank} " if self._distributed else "" + shards_indices = ex_iterable.split_shard_indices_by_worker( + num_shards=worker_info.num_workers, index=worker_info.id, contiguous=False + ) + if shards_indices: + logger.debug( + f"{_log_prefix}dataloader worker#{worker_info.id}, ': Starting to iterate over {len(shards_indices)}/{ex_iterable.num_shards} shards." + ) + ex_iterable = ex_iterable.shard_data_sources( + num_shards=worker_info.num_workers, index=worker_info.id, contiguous=False + ) + self._state_dict = { + "examples_iterable": ex_iterable._init_state_dict(), + "epoch": self.epoch, + } + if self._starting_state_dict and self.epoch == self._starting_state_dict["epoch"]: + ex_iterable.load_state_dict(self._starting_state_dict["examples_iterable"]) + + if self._formatting and (ex_iterable.iter_arrow or self._formatting.is_table): + formatter = get_formatter(self._formatting.format_type, features=self.features) + if ex_iterable.iter_arrow: + iterator = ex_iterable.iter_arrow() + else: + iterator = _convert_to_arrow(ex_iterable, batch_size=1) + for key, pa_table in iterator: + yield formatter.format_row(pa_table) + return + else: + for key, example in ex_iterable: + # no need to format thanks to FormattedExamplesIterable + yield example + logger.debug( + f"{_log_prefix}dataloader worker#{worker_info.id}, ': Finished iterating over {len(shards_indices)}/{ex_iterable.num_shards} shards." + ) + else: + logger.debug( + f"{_log_prefix}dataloader worker#{worker_info.id}, ': Stopping... Number of dataset shards < num_workers ({ex_iterable.num_shards}<{worker_info.num_workers})." 
+ ) + + def _is_main_process(self): + if self._distributed and self._distributed.rank > 0: + return False + if "torch" in sys.modules: + import torch.utils.data + + worker_info = torch.utils.data.get_worker_info() + if worker_info is not None and worker_info.id > 0: + return False + return True + + def _prepare_ex_iterable_for_iteration( + self, batch_size: int = 1, drop_last_batch: bool = False + ) -> _BaseExamplesIterable: + ex_iterable = self._ex_iterable + if ( + self._formatting + and (ex_iterable.iter_arrow or self._formatting.is_table) + or (self.features and ex_iterable.features != self.features) + ): + ex_iterable = RebatchedArrowExamplesIterable( + ex_iterable, batch_size=batch_size, drop_last_batch=drop_last_batch + ) + if self._shuffling: + ex_iterable = ex_iterable.shuffle_data_sources(self._effective_generator()) + else: + ex_iterable = ex_iterable + + if self._distributed: + rank = self._distributed.rank + world_size = self._distributed.world_size + if ex_iterable.num_shards % world_size == 0: + if self._is_main_process(): + num_shards_per_node = ex_iterable.num_shards // world_size + plural = "s" if num_shards_per_node > 1 else "" + logger.info( + f"Assigning {num_shards_per_node} shard{plural} (or data source{plural}) of the dataset to each node." + ) + ex_iterable = ex_iterable.shard_data_sources(num_shards=world_size, index=rank, contiguous=False) + else: + if self._is_main_process(): + logger.info( + f"Assigning 1 out of {world_size} examples of the dataset to each node. The others are skipped during the iteration." + ) + logger.info( + f"It is more optimized to distribute the dataset shards (or data sources) across nodes. " + f"You can do that by using a dataset with number of shards that is a factor of world_size={world_size}. 
" + f"The current dataset has {ex_iterable.num_shards} which is not a factor of {world_size}" + ) + ex_iterable = StepExamplesIterable(ex_iterable, step=world_size, offset=rank) + + if self._formatting or (self.features and ex_iterable.features != self.features): + ex_iterable = FormattedExamplesIterable( + ex_iterable, + formatting=self._formatting, + features=self.features, + token_per_repo_id=self._token_per_repo_id, + ) + + self._state_dict = { + "examples_iterable": ex_iterable._init_state_dict(), + "epoch": self.epoch, + } + if self._starting_state_dict and self.epoch == self._starting_state_dict["epoch"]: + ex_iterable.load_state_dict(self._starting_state_dict["examples_iterable"]) + return ex_iterable + + def __iter__(self): + if "torch" in sys.modules: + import torch.utils.data + + worker_info = torch.utils.data.get_worker_info() + if isinstance(self, torch.utils.data.IterableDataset) and worker_info is not None: + # We're a torch.utils.data.IterableDataset in a PyTorch worker process + yield from self._iter_pytorch() + return + + ex_iterable = self._prepare_ex_iterable_for_iteration() + if self._formatting and (ex_iterable.iter_arrow or self._formatting.is_table): + formatter = get_formatter(self._formatting.format_type, features=self.features) + if ex_iterable.iter_arrow: + iterator = ex_iterable.iter_arrow() + else: + iterator = _convert_to_arrow(ex_iterable, batch_size=1) + for key, pa_table in iterator: + yield formatter.format_row(pa_table) + return + + for key, example in ex_iterable: + # no need to format thanks to FormattedExamplesIterable + yield example + + def iter(self, batch_size: int, drop_last_batch: bool = False): + """Iterate through the batches of size `batch_size`. + + Args: + batch_size (:obj:`int`): size of each batch to yield. 
+ drop_last_batch (:obj:`bool`, default `False`): Whether a last batch smaller than the batch_size should be + dropped + """ + + if self._formatting: + formatter = get_formatter(self._formatting.format_type, features=self.features) + format_dict = formatter.recursive_tensorize if isinstance(formatter, TensorFormatter) else None + else: + format_dict = None + + ex_iterable = self._prepare_ex_iterable_for_iteration(batch_size=batch_size, drop_last_batch=drop_last_batch) + if self._formatting and (ex_iterable.iter_arrow or self._formatting.is_table): + if ex_iterable.iter_arrow: + iterator = ex_iterable.iter_arrow() + else: + iterator = _convert_to_arrow(ex_iterable, batch_size=batch_size, drop_last_batch=drop_last_batch) + for key, pa_table in iterator: + yield formatter.format_batch(pa_table) + return + + iterator = iter(ex_iterable) + for key, example in iterator: + # If batched, first build the batch + examples = [example] + [example for key, example in islice(iterator, batch_size - 1)] + if drop_last_batch and len(examples) < batch_size: # ignore last batch + return + batch = _examples_to_batch(examples) + # we need to format here in case we need to stack tensors together + yield format_dict(batch) if format_dict else batch + + def __getitem__(self, column_name: str) -> IterableColumn: + return IterableColumn(self, column_name) + + @staticmethod + def from_generator( + generator: Callable, + features: Optional[Features] = None, + gen_kwargs: Optional[dict] = None, + split: NamedSplit = Split.TRAIN, + ) -> "IterableDataset": + """Create an Iterable Dataset from a generator. + + Args: + generator (`Callable`): + A generator function that `yields` examples. + features (`Features`, *optional*): + Dataset features. + gen_kwargs(`dict`, *optional*): + Keyword arguments to be passed to the `generator` callable. + You can define a sharded iterable dataset by passing the list of shards in `gen_kwargs`. 
+ This can be used to improve shuffling and when iterating over the dataset with multiple workers. + split ([`NamedSplit`], defaults to `Split.TRAIN`): + Split name to be assigned to the dataset. + + + Returns: + `IterableDataset` + + Example: + + ```py + >>> def gen(): + ... yield {"text": "Good", "label": 0} + ... yield {"text": "Bad", "label": 1} + ... + >>> ds = IterableDataset.from_generator(gen) + ``` + + ```py + >>> def gen(shards): + ... for shard in shards: + ... with open(shard) as f: + ... for line in f: + ... yield {"line": line} + ... + >>> shards = [f"data{i}.txt" for i in range(32)] + >>> ds = IterableDataset.from_generator(gen, gen_kwargs={"shards": shards}) + >>> ds = ds.shuffle(seed=42, buffer_size=10_000) # shuffles the shards order + uses a shuffle buffer + >>> from torch.utils.data import DataLoader + >>> dataloader = DataLoader(ds.with_format("torch"), num_workers=4) # give each worker a subset of 32/4=8 shards + ``` + """ + from .io.generator import GeneratorDatasetInputStream + + return GeneratorDatasetInputStream( + generator=generator, features=features, gen_kwargs=gen_kwargs, streaming=True, split=split + ).read() + + @staticmethod + def from_spark( + df: "pyspark.sql.DataFrame", + split: Optional[NamedSplit] = None, + features: Optional[Features] = None, + **kwargs, + ) -> "IterableDataset": + """Create an IterableDataset from Spark DataFrame. The dataset is streamed to the driver in batches. + + Args: + df (`pyspark.sql.DataFrame`): + The DataFrame containing the desired data. + split (`NamedSplit`, *optional*): + Split name to be assigned to the dataset. + features (`Features`, *optional*): + Dataset features. 
+ + Returns: + [`IterableDataset`] + + Example: + + ```py + >>> df = spark.createDataFrame( + >>> data=[[1, "Elia"], [2, "Teo"], [3, "Fang"]], + >>> columns=["id", "name"], + >>> ) + >>> ds = IterableDataset.from_spark(df) + ``` + """ + from .io.spark import SparkDatasetReader + + if sys.platform == "win32": + raise OSError("IterableDataset.from_spark is not currently supported on Windows") + + return SparkDatasetReader( + df, + split=split, + features=features, + streaming=True, + **kwargs, + ).read() + + @staticmethod + def from_file(filename: str) -> "IterableDataset": + """Instantiate a IterableDataset from Arrow table at filename. + + Args: + filename (`str`): + File name of the dataset. + + Returns: + [`IterableDataset`] + """ + pa_table_schema = read_schema_from_file(filename) + inferred_features = Features.from_arrow_schema(pa_table_schema) + ex_iterable = ArrowExamplesIterable(Dataset._generate_tables_from_cache_file, kwargs={"filename": filename}) + return IterableDataset(ex_iterable=ex_iterable, info=DatasetInfo(features=inferred_features)) + + def with_format( + self, + type: Optional[str] = None, + ) -> "IterableDataset": + """ + Return a dataset with the specified format. + + Args: + + type (`str`, *optional*): + Either output type selected in `[None, 'numpy', 'torch', 'tensorflow', 'jax', 'arrow', 'pandas', 'polars']`. + `None` means it returns python objects (default). 
+ + Example: + + ```py + >>> from datasets import load_dataset + >>> from transformers import AutoTokenizer + >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", split="validation", streaming=True) + >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased") + >>> ds = ds.map(lambda x: tokenizer(x['text'], truncation=True, padding=True), batched=True) + >>> ds = ds.with_format("torch") + >>> next(iter(ds)) + {'text': 'compassionately explores the seemingly irreconcilable situation between conservative christian parents and their estranged gay and lesbian children .', + 'label': tensor(1), + 'input_ids': tensor([ 101, 18027, 16310, 16001, 1103, 9321, 178, 11604, 7235, 6617, + 1742, 2165, 2820, 1206, 6588, 22572, 12937, 1811, 2153, 1105, + 1147, 12890, 19587, 6463, 1105, 15026, 1482, 119, 102, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0]), + 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), + 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])} + ``` + """ + type = get_format_type_from_alias(type) + # TODO(QL): add format_kwargs + # TODO(QL): add format_columns and return_all_columns + # TODO(QL): add pandas format + return IterableDataset( + ex_iterable=self._ex_iterable, + info=self._info.copy(), + split=self._split, + formatting=FormattingConfig(format_type=type), + shuffling=copy.deepcopy(self._shuffling), + distributed=copy.deepcopy(self._distributed), 
def map(
    self,
    function: Optional[Callable] = None,
    with_indices: bool = False,
    input_columns: Optional[Union[str, list[str]]] = None,
    batched: bool = False,
    batch_size: Optional[int] = 1000,
    drop_last_batch: bool = False,
    remove_columns: Optional[Union[str, list[str]]] = None,
    features: Optional[Features] = None,
    fn_kwargs: Optional[dict] = None,
) -> "IterableDataset":
    """
    Apply a function to all the examples in the iterable dataset (individually or in batches) and update them.
    If your function returns a column that already exists, then it overwrites it.
    The function is applied on-the-fly on the examples when iterating over the dataset.

    You can specify whether the function should be batched or not with the `batched` parameter:

    - If batched is `False`, then the function takes 1 example in and should return 1 example.
      An example is a dictionary, e.g. `{"text": "Hello there !"}`.
    - If batched is `True` and `batch_size` is 1, then the function takes a batch of 1 example as input and can return a batch with 1 or more examples.
      A batch is a dictionary, e.g. a batch of 1 example is `{"text": ["Hello there !"]}`.
    - If batched is `True` and `batch_size` is `n` > 1, then the function takes a batch of `n` examples as input and can return a batch with `n` examples, or with an arbitrary number of examples.
      Note that the last batch may have less than `n` examples.
      A batch is a dictionary, e.g. a batch of `n` examples is `{"text": ["Hello there !"] * n}`.

    If the function is asynchronous, then `map` will run your function in parallel, with up to one thousand simultaneous calls.
    It is recommended to use an `asyncio.Semaphore` in your function if you want to set a maximum number of operations that can run at the same time.

    Args:
        function (`Callable`, *optional*, defaults to `None`):
            Function applied on-the-fly on the examples when you iterate on the dataset.
            It must have one of the following signatures:

            - `function(example: Dict[str, Any]) -> Dict[str, Any]` if `batched=False` and `with_indices=False`
            - `function(example: Dict[str, Any], idx: int) -> Dict[str, Any]` if `batched=False` and `with_indices=True`
            - `function(batch: Dict[str, List]) -> Dict[str, List]` if `batched=True` and `with_indices=False`
            - `function(batch: Dict[str, List], indices: List[int]) -> Dict[str, List]` if `batched=True` and `with_indices=True`

            For advanced usage, the function can also return a `pyarrow.Table`.
            If the function is asynchronous, then `map` will run your function in parallel.
            Moreover if your function returns nothing (`None`), then `map` will run your function and return the dataset unchanged.
            If no function is provided, default to identity function: `lambda x: x`.
        with_indices (`bool`, defaults to `False`):
            Provide example indices to `function`. Note that in this case the signature of `function` should be `def function(example, idx): ...`.
        input_columns (`Optional[Union[str, List[str]]]`, defaults to `None`):
            The columns to be passed into `function`
            as positional arguments. If `None`, a dict mapping to all formatted columns is passed as one argument.
        batched (`bool`, defaults to `False`):
            Provide batch of examples to `function`.
        batch_size (`int`, *optional*, defaults to `1000`):
            Number of examples per batch provided to `function` if `batched=True`.
            If `batch_size <= 0` or `batch_size == None`, then provide the full dataset as a single batch to `function`.
        drop_last_batch (`bool`, defaults to `False`):
            Whether a last batch smaller than the batch_size should be
            dropped instead of being processed by the function.
        remove_columns (`Union[str, List[str]]`, *optional*, defaults to `None`):
            Remove a selection of columns while doing the mapping.
            Columns will be removed before updating the examples with the output of `function`, i.e. if `function` is adding
            columns with names in `remove_columns`, these columns will be kept.
        features (`[Features]`, *optional*, defaults to `None`):
            Feature types of the resulting dataset.
        fn_kwargs (`Dict`, *optional*, default `None`):
            Keyword arguments to be passed to `function`.

    Returns:
        `IterableDataset`: a new dataset whose `info.features` is set to `features`
        (i.e. `None` when no `features` argument is given).

    Example:

    ```py
    >>> from datasets import load_dataset
    >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", split="train", streaming=True)
    >>> def add_prefix(example):
    ...     example["text"] = "Review: " + example["text"]
    ...     return example
    >>> ds = ds.map(add_prefix)
    >>> list(ds.take(3))
    [{'label': 1,
     'text': 'Review: the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .'},
     {'label': 1,
     'text': 'Review: the gorgeously elaborate continuation of " the lord of the rings " trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson\'s expanded vision of j . r . r . tolkien\'s middle-earth .'},
     {'label': 1, 'text': 'Review: effective but too-tepid biopic'}]
    ```
    """
    # Normalise single column names to one-element lists.
    if isinstance(input_columns, str):
        input_columns = [input_columns]
    if isinstance(remove_columns, str):
        remove_columns = [remove_columns]
    if function is None:
        function = identity_func
    # A fresh dict per call instead of a shared mutable default.
    if fn_kwargs is None:
        fn_kwargs = {}
    if features is not None:
        features = _fix_for_backward_compatible_features(features)

    ex_iterable = self._ex_iterable
    # no need to apply features if ex_iterable is typed and if there was no cast_column()
    input_features = (
        None
        if (ex_iterable.is_typed and (self._info.features is None or self._info.features == ex_iterable.features))
        else self._info.features
    )

    if self._formatting and self._formatting.is_table:
        # apply formatting before iter_arrow to keep map examples iterable happy
        ex_iterable = FormattedExamplesIterable(
            ex_iterable,
            formatting=copy.deepcopy(self._formatting),
            features=input_features,
            token_per_repo_id=self._token_per_repo_id,
        )
        # Non-batched mapping is fed one row at a time.
        ex_iterable = RebatchedArrowExamplesIterable(
            ex_iterable, batch_size=batch_size if batched else 1, drop_last_batch=drop_last_batch
        )
    else:
        # `ex_iterable` is still `self._ex_iterable` at this point, so both names refer to the same object.
        if self._formatting and self._ex_iterable.iter_arrow:
            ex_iterable = RebatchedArrowExamplesIterable(
                self._ex_iterable, batch_size=batch_size if batched else 1, drop_last_batch=drop_last_batch
            )
        if self._formatting or input_features:
            # apply formatting after iter_arrow to avoid re-encoding the examples
            ex_iterable = FormattedExamplesIterable(
                ex_iterable,
                formatting=copy.deepcopy(self._formatting),
                features=input_features,
                token_per_repo_id=self._token_per_repo_id,
            )

    ex_iterable = MappedExamplesIterable(
        ex_iterable,
        function=function,
        with_indices=with_indices,
        input_columns=input_columns,
        batched=batched,
        batch_size=batch_size,
        drop_last_batch=drop_last_batch,
        remove_columns=remove_columns,
        fn_kwargs=fn_kwargs,
        formatting=self._formatting,
        features=features,
    )
    info = self.info.copy()
    # The previous schema is discarded: the result only has known features if the caller supplied them.
    info.features = features
    return IterableDataset(
        ex_iterable=ex_iterable,
        info=info,
        split=self._split,
        formatting=self._formatting,
        shuffling=copy.deepcopy(self._shuffling),
        distributed=copy.deepcopy(self._distributed),
        token_per_repo_id=self._token_per_repo_id,
    )
def filter(
    self,
    function: Optional[Callable] = None,
    with_indices=False,
    input_columns: Optional[Union[str, list[str]]] = None,
    batched: bool = False,
    batch_size: Optional[int] = 1000,
    fn_kwargs: Optional[dict] = None,
) -> "IterableDataset":
    """Keep only the examples for which `function` is truthy; evaluated lazily while iterating.

    If the function is asynchronous, then `filter` will run your function in parallel, with up to one
    thousand simultaneous calls (configurable). It is recommended to use an `asyncio.Semaphore` in your
    function if you want to set a maximum number of operations that can run at the same time.

    Args:
        function (`Callable`, *optional*):
            Predicate with one of the following signatures:

            - `function(example: Dict[str, Any]) -> bool` if `with_indices=False, batched=False`
            - `function(example: Dict[str, Any], indices: int) -> bool` if `with_indices=True, batched=False`
            - `function(example: Dict[str, List]) -> List[bool]` if `with_indices=False, batched=True`
            - `function(example: Dict[str, List], indices: List[int]) -> List[bool]` if `with_indices=True, batched=True`

            The value (including `None`) is forwarded unchanged to `FilteredExamplesIterable`; the
            upstream documentation describes `None` as an always-True predicate (`lambda x: True`).
        with_indices (`bool`, defaults to `False`):
            Provide example indices to `function`, i.e. `def function(example, idx): ...`.
        input_columns (`str` or `List[str]`, *optional*):
            The columns to be passed into `function` as positional arguments.
            If `None`, a dict mapping to all formatted columns is passed as one argument.
        batched (`bool`, defaults to `False`):
            Provide batch of examples to `function`.
        batch_size (`int`, *optional*, default `1000`):
            Number of examples per batch provided to `function` if `batched=True`.
        fn_kwargs (`Dict`, *optional*, default `None`):
            Keyword arguments to be passed to `function`.

    Returns:
        `IterableDataset`: a new dataset wrapping the filtered examples iterable.

    Example:

    ```py
    >>> from datasets import load_dataset
    >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", split="train", streaming=True)
    >>> ds = ds.filter(lambda x: x["label"] == 0)
    >>> list(ds.take(3))
    [{'label': 0, 'text': 'simplistic , silly and tedious .'},
     {'label': 0,
     'text': "it's so laddish and juvenile , only teenage boys could possibly find it funny ."},
     {'label': 0,
     'text': 'exploitative and largely devoid of the depth or sophistication that would make watching such a graphic treatment of the crimes bearable .'}]
    ```
    """
    columns = [input_columns] if isinstance(input_columns, str) else input_columns

    # The predicate must see decoded, typed and formatted examples
    # (relevant for feature types such as Image or Audio).
    source = self._ex_iterable
    if self._info.features or self._formatting:
        features_to_apply = None if source.is_typed else self._info.features
        source = FormattedExamplesIterable(
            source,
            formatting=self._formatting,
            features=features_to_apply,
            token_per_repo_id=self._token_per_repo_id,
        )

    filtered = FilteredExamplesIterable(
        source,
        function=function,
        with_indices=with_indices,
        input_columns=columns,
        batched=batched,
        batch_size=batch_size,
        fn_kwargs=fn_kwargs,
        formatting=self._formatting,
    )
    return IterableDataset(
        ex_iterable=filtered,
        info=self._info,
        split=self._split,
        formatting=self._formatting,
        shuffling=copy.deepcopy(self._shuffling),
        distributed=copy.deepcopy(self._distributed),
        token_per_repo_id=self._token_per_repo_id,
    )
def shuffle(
    self, seed=None, generator: Optional[np.random.Generator] = None, buffer_size: int = 1000
) -> "IterableDataset":
    """
    Randomly shuffle the elements of this dataset through a fixed-size buffer.

    The dataset fills a buffer with `buffer_size` elements, then randomly samples elements from this
    buffer, replacing the selected elements with new elements. For perfect shuffling, a buffer size
    greater than or equal to the full size of the dataset is required.

    For instance, if your dataset contains 10,000 elements but `buffer_size` is set to 1000, then
    `shuffle` will initially select a random element from only the first 1000 elements in the buffer.
    Once an element is selected, its space in the buffer is replaced by the next (i.e. 1,001-st)
    element, maintaining the 1000 element buffer.

    If the dataset is made of several shards, the order of the shards is shuffled as well.
    However if the order has been fixed by using [`~datasets.IterableDataset.skip`] or
    [`~datasets.IterableDataset.take`] then the order of the shards is kept unchanged.

    Args:
        seed (`int`, *optional*, defaults to `None`):
            Random seed used to sample from the shuffle buffer and also to shuffle the data shards.
            Only used to build the generator when `generator` is `None`; always recorded on the
            resulting shuffling config.
        generator (`numpy.random.Generator`, *optional*):
            Numpy random Generator to use. A deep copy is taken, so the caller's generator is left
            untouched. If `generator=None` (default), uses `np.random.default_rng(seed)`.
        buffer_size (`int`, defaults to `1000`):
            Size of the buffer.

    Returns:
        `IterableDataset`: a new dataset wrapping a buffer-shuffled examples iterable.

    Example:

    ```py
    >>> from datasets import load_dataset
    >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", split="train", streaming=True)
    >>> list(ds.take(3))
    [{'label': 1,
     'text': 'the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .'},
     {'label': 1,
     'text': 'the gorgeously elaborate continuation of " the lord of the rings " trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson\'s expanded vision of j . r . r . tolkien\'s middle-earth .'},
     {'label': 1, 'text': 'effective but too-tepid biopic'}]
    >>> shuffled_ds = ds.shuffle(seed=42)
    >>> list(shuffled_ds.take(3))
    [{'label': 1,
     'text': "a sports movie with action that's exciting on the field and a story you care about off it ."},
     {'label': 1,
     'text': 'at its best , the good girl is a refreshingly adult take on adultery . . .'},
     {'label': 1,
     'text': "sam jones became a very lucky filmmaker the day wilco got dropped from their record label , proving that one man's ruin may be another's fortune ."}]
    ```
    """
    rng = np.random.default_rng(seed) if generator is None else deepcopy(generator)
    shuffling = ShufflingConfig(generator=rng, _original_seed=seed)

    # Arrow-capable sources are re-batched to single rows before entering the shuffle buffer.
    source = self._ex_iterable
    if source.iter_arrow:
        source = RebatchedArrowExamplesIterable(source, batch_size=1)
    shuffled = BufferShuffledExamplesIterable(source, buffer_size=buffer_size, generator=rng)

    return IterableDataset(
        shuffled,
        info=self._info.copy(),
        split=self._split,
        formatting=self._formatting,
        shuffling=shuffling,
        distributed=copy.deepcopy(self._distributed),
        token_per_repo_id=self._token_per_repo_id,
    )
def set_epoch(self, epoch: int):
    """Set the current epoch to `epoch` by updating `self._epoch` in place.

    Args:
        epoch (`int`): The new epoch value.
    """
    # `+=` with the difference (rather than plain assignment) keeps the update in-place;
    # per the original comment, `_epoch` is a torch value living in shared memory.
    delta = epoch - self._epoch
    self._epoch += delta


def skip(self, n: int) -> "IterableDataset":
    """
    Create a new [`IterableDataset`] that skips the first `n` elements.

    Args:
        n (`int`):
            Number of elements to skip.

    Returns:
        `IterableDataset`: a new dataset wrapping a `SkipExamplesIterable`.

    Example:

    ```py
    >>> from datasets import load_dataset
    >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", split="train", streaming=True)
    >>> list(ds.take(3))
    [{'label': 1,
     'text': 'the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .'},
     {'label': 1,
     'text': 'the gorgeously elaborate continuation of " the lord of the rings " trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson\'s expanded vision of j . r . r . tolkien\'s middle-earth .'},
     {'label': 1, 'text': 'effective but too-tepid biopic'}]
    >>> ds = ds.skip(1)
    >>> list(ds.take(3))
    [{'label': 1,
     'text': 'the gorgeously elaborate continuation of " the lord of the rings " trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson\'s expanded vision of j . r . r . tolkien\'s middle-earth .'},
     {'label': 1, 'text': 'effective but too-tepid biopic'},
     {'label': 1,
     'text': 'if you sometimes like to go to the movies to have fun , wasabi is a good place to start .'}]
    ```
    """
    # Both flags are derived purely from whether a shuffling / distributed config is set on this dataset.
    not_shuffled = self._shuffling is None
    not_distributed = self._distributed is None
    skipped = SkipExamplesIterable(
        self._ex_iterable,
        n,
        block_sources_order_when_shuffling=not_shuffled,
        split_when_sharding=not_distributed,
    )
    return IterableDataset(
        ex_iterable=skipped,
        info=self._info.copy(),
        split=self._split,
        formatting=self._formatting,
        shuffling=copy.deepcopy(self._shuffling),
        distributed=copy.deepcopy(self._distributed),
        token_per_repo_id=self._token_per_repo_id,
    )
def repeat(self, num_times: Optional[int]) -> "IterableDataset":
    """
    Create a new [`IterableDataset`] that repeats the underlying dataset `num_times` times.

    N.B. The effect of calling shuffle after repeat depends significantly on buffer size.
    With buffer_size 1, duplicate data is never seen in the same iteration, even after shuffling:
    `ds.repeat(n).shuffle(seed=42, buffer_size=1)` is equivalent to
    `ds.shuffle(seed=42, buffer_size=1).repeat(n)`, and only shuffles shard orders within each iteration.
    With buffer size >= (num samples in the dataset * num_times), we get full shuffling of the repeated
    data, i.e. we can observe duplicates in the same iteration.

    Args:
        num_times (`int`) or (`None`):
            Number of times to repeat the dataset. If `None`, the dataset will be repeated indefinitely.

    Returns:
        `IterableDataset`: a new dataset wrapping a `RepeatExamplesIterable`.

    Example:
    ```py
    >>> from datasets import load_dataset
    >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", split="train", streaming=True)
    >>> ds = ds.take(2).repeat(2)
    >>> list(ds)
    [{'label': 1,
     'text': 'the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .'},
     {'label': 1,
     'text': 'the gorgeously elaborate continuation of " the lord of the rings " trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson\'s expanded vision of j . r . r . tolkien\'s middle-earth .'},
     {'label': 1,
     'text': 'the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .'},
     {'label': 1,
     'text': 'the gorgeously elaborate continuation of " the lord of the rings " trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson\'s expanded vision of j . r . r . tolkien\'s middle-earth .'}]
    ```
    """
    repeated = RepeatExamplesIterable(self._ex_iterable, num_times=num_times)
    return IterableDataset(
        ex_iterable=repeated,
        info=self._info,
        split=self._split,
        formatting=self._formatting,
        shuffling=copy.deepcopy(self._shuffling),
        distributed=copy.deepcopy(self._distributed),
        token_per_repo_id=self._token_per_repo_id,
    )
def take(self, n: int) -> "IterableDataset":
    """
    Create a new [`IterableDataset`] with only the first `n` elements.

    Args:
        n (`int`):
            Number of elements to take.

    Returns:
        `IterableDataset`: a new dataset wrapping a `TakeExamplesIterable`.

    Example:

    ```py
    >>> from datasets import load_dataset
    >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", split="train", streaming=True)
    >>> small_ds = ds.take(2)
    >>> list(small_ds)
    [{'label': 1,
     'text': 'the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .'},
     {'label': 1,
     'text': 'the gorgeously elaborate continuation of " the lord of the rings " trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson\'s expanded vision of j . r . r . tolkien\'s middle-earth .'}]
    ```
    """
    # Both flags are derived purely from whether a shuffling / distributed config is set on this dataset.
    not_shuffled = self._shuffling is None
    not_distributed = self._distributed is None
    head = TakeExamplesIterable(
        self._ex_iterable,
        n,
        block_sources_order_when_shuffling=not_shuffled,
        split_when_sharding=not_distributed,
    )
    return IterableDataset(
        ex_iterable=head,
        info=self._info.copy(),
        split=self._split,
        formatting=self._formatting,
        shuffling=copy.deepcopy(self._shuffling),
        distributed=copy.deepcopy(self._distributed),
        token_per_repo_id=self._token_per_repo_id,
    )
def shard(
    self,
    num_shards: int,
    index: int,
    contiguous: bool = True,
) -> "IterableDataset":
    """Return the `index`-nth shard from dataset split into `num_shards` pieces.

    This shards deterministically. `dataset.shard(n, i)` splits the dataset into contiguous chunks,
    so it can be easily concatenated back together after processing. If `dataset.num_shards % n == l`,
    then the first `l` datasets each have `(dataset.num_shards // n) + 1` shards, and the remaining
    datasets have `(dataset.num_shards // n)` shards.
    `datasets.concatenate_datasets([dset.shard(n, i) for i in range(n)])` returns a dataset with the
    same order as the original.
    In particular, `dataset.shard(dataset.num_shards, i)` returns a dataset with 1 shard.

    Note: n should be less or equal to the number of shards in the dataset `dataset.num_shards`.

    On the other hand, `dataset.shard(n, i, contiguous=False)` contains all the shards of the dataset
    whose index satisfies `index % n == i`.

    Be sure to shard before using any randomizing operator (such as `shuffle`).
    It is best if the shard operator is used early in the dataset pipeline.

    Args:
        num_shards (`int`):
            How many shards to split the dataset into.
        index (`int`):
            Which shard to select and return.
        contiguous: (`bool`, defaults to `True`):
            Whether to select contiguous blocks of indices for shards.

    Returns:
        `IterableDataset`: a new dataset built on the sharded data sources.

    Example:

    ```py
    >>> from datasets import load_dataset
    >>> ds = load_dataset("amazon_polarity", split="train", streaming=True)
    >>> ds
    Dataset({
        features: ['label', 'title', 'content'],
        num_shards: 4
    })
    >>> ds.shard(num_shards=2, index=0)
    Dataset({
        features: ['label', 'title', 'content'],
        num_shards: 2
    })
    ```
    """
    # The actual selection of data sources is delegated to the underlying examples iterable.
    selected = self._ex_iterable.shard_data_sources(num_shards=num_shards, index=index, contiguous=contiguous)
    return IterableDataset(
        ex_iterable=selected,
        info=self._info.copy(),
        split=self._split,
        formatting=self._formatting,
        shuffling=copy.deepcopy(self._shuffling),
        distributed=copy.deepcopy(self._distributed),
        token_per_repo_id=self._token_per_repo_id,
    )
def add_column(self, name: str, column: Union[list, np.array]) -> "IterableDataset":
    """Add a column to the dataset.

    Implemented as an index-aware `map`: `add_column_fn` is bound to `name` and `column`
    and receives each example together with its index.

    Args:
        name (str): Column name.
        column (list or np.array): Column data to be added.

    Returns:
        `IterableDataset`
    """
    add_fn = partial(add_column_fn, name=name, column=column)
    return self.map(add_fn, with_indices=True)


def rename_column(self, original_column_name: str, new_column_name: str) -> "IterableDataset":
    """
    Rename a column in the dataset, and move the features associated to the original column under
    the new column name.

    Thin wrapper around `rename_columns` with a one-entry mapping.

    Args:
        original_column_name (`str`):
            Name of the column to rename.
        new_column_name (`str`):
            New name for the column.

    Returns:
        `IterableDataset`: A copy of the dataset with a renamed column.

    Example:

    ```py
    >>> from datasets import load_dataset
    >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", split="train", streaming=True)
    >>> next(iter(ds))
    {'label': 1,
     'text': 'the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .'}
    >>> ds = ds.rename_column("text", "movie_review")
    >>> next(iter(ds))
    {'label': 1,
     'movie_review': 'the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .'}
    ```
    """
    single_rename = {original_column_name: new_column_name}
    return self.rename_columns(single_rename)
def rename_columns(self, column_mapping: dict[str, str]) -> "IterableDataset":
    """
    Rename several columns in the dataset, and move the features associated to the original columns
    under the new column names.

    The renaming itself is done on-the-fly through `map`; the known features (if any) are re-keyed
    afterwards because `map` leaves the result without features unless they are passed explicitly.

    Args:
        column_mapping (`Dict[str, str]`): A mapping of columns to rename to their new names

    Returns:
        `IterableDataset`: A copy of the dataset with renamed columns
    """
    # Snapshot before mapping: the mapped dataset gets its own info object.
    original_features = self._info.features.copy() if self._info.features else None
    ds_iterable = self.map(
        partial(_rename_columns_fn, column_mapping=column_mapping), remove_columns=list(column_mapping)
    )
    if original_features is not None:
        ds_iterable._info.features = Features(
            {column_mapping.get(col, col): feature for col, feature in original_features.items()}
        )
    return ds_iterable


def remove_columns(self, column_names: Union[str, list[str]]) -> "IterableDataset":
    """
    Remove one or several column(s) in the dataset and the features associated to them.
    The removal is done on-the-fly on the examples when iterating over the dataset.

    Args:
        column_names (`Union[str, List[str]]`):
            Name of the column(s) to remove.

    Returns:
        `IterableDataset`: A copy of the dataset object without the columns to remove.

    Example:

    ```py
    >>> from datasets import load_dataset
    >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", split="train", streaming=True)
    >>> next(iter(ds))
    {'text': 'the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .', 'label': 1}
    >>> ds = ds.remove_columns("label")
    >>> next(iter(ds))
    {'text': 'the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .'}
    ```
    """
    # Normalise to a list first: with a bare string, `col in column_names` is a *substring* test,
    # so removing "label" would also have dropped the features of a column named e.g. "lab".
    if isinstance(column_names, str):
        column_names = [column_names]

    original_features = self._info.features.copy() if self._info.features else None
    ds_iterable = self.map(remove_columns=column_names)
    if original_features is not None:
        to_remove = set(column_names)
        remaining = original_features.copy()
        for col in original_features:
            if col in to_remove:
                del remaining[col]
        ds_iterable._info.features = remaining

    return ds_iterable
def select_columns(self, column_names: Union[str, list[str]]) -> "IterableDataset":
    """Select one or several column(s) in the dataset and the features
    associated to them. The selection is done on-the-fly on the examples
    when iterating over the dataset.

    Args:
        column_names (`Union[str, List[str]]`):
            Name of the column(s) to select.

    Returns:
        `IterableDataset`: A copy of the dataset object with selected columns.

    Raises:
        ValueError: if the dataset has known features and some of `column_names` are not among them.

    Example:

    ```py
    >>> from datasets import load_dataset
    >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", split="train", streaming=True)
    >>> next(iter(ds))
    {'text': 'the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .', 'label': 1}
    >>> ds = ds.select_columns("text")
    >>> next(iter(ds))
    {'text': 'the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .'}
    ```
    """
    if isinstance(column_names, str):
        column_names = [column_names]

    # Always bind `info`: previously it was only assigned inside `if self._info:`,
    # leaving the name undefined below whenever `self._info` was falsy.
    info = copy.deepcopy(self._info) if self._info else self._info
    if info and info.features is not None:
        missing_columns = set(column_names) - set(info.features.keys())
        if missing_columns:
            raise ValueError(
                f"Column name {list(missing_columns)} not in the "
                "dataset. Columns in the dataset: "
                f"{list(info.features.keys())}."
            )
        # Keep the features in the order the caller asked for.
        info.features = Features({c: info.features[c] for c in column_names})

    ex_iterable = SelectColumnsIterable(self._ex_iterable, column_names)
    return IterableDataset(
        ex_iterable=ex_iterable,
        info=info,
        split=self._split,
        formatting=self._formatting,
        # Deep-copied like in every sibling method, so the new dataset does not share config objects.
        shuffling=copy.deepcopy(self._shuffling),
        distributed=copy.deepcopy(self._distributed),
        token_per_repo_id=self._token_per_repo_id,
    )
def cast_column(self, column: str, feature: FeatureType) -> "IterableDataset":
    """Cast column to feature for decoding.

    Only the dataset's declared features change; the underlying examples iterable is reused as is.

    Args:
        column (`str`):
            Column name.
        feature (`Feature`):
            Target feature.

    Returns:
        `IterableDataset`

    Example:

    ```py
    >>> from datasets import load_dataset, Audio
    >>> ds = load_dataset("PolyAI/minds14", name="en-US", split="train", streaming=True)
    >>> ds.features
    {'audio': Audio(sampling_rate=8000, mono=True, decode=True, id=None),
     'english_transcription': Value('string'),
     'intent_class': ClassLabel(num_classes=14, names=['abroad', 'address', 'app_error', 'atm_limit', 'balance', 'business_loan', 'card_issues', 'cash_deposit', 'direct_debit', 'freeze', 'high_value_payment', 'joint_account', 'latest_transactions', 'pay_bill']),
     'lang_id': ClassLabel(num_classes=14, names=['cs-CZ', 'de-DE', 'en-AU', 'en-GB', 'en-US', 'es-ES', 'fr-FR', 'it-IT', 'ko-KR', 'nl-NL', 'pl-PL', 'pt-PT', 'ru-RU', 'zh-CN']),
     'path': Value('string'),
     'transcription': Value('string')}
    >>> ds = ds.cast_column("audio", Audio(sampling_rate=16000))
    >>> ds.features
    {'audio': Audio(sampling_rate=16000, mono=True, decode=True, id=None),
     'english_transcription': Value('string'),
     'intent_class': ClassLabel(num_classes=14, names=['abroad', 'address', 'app_error', 'atm_limit', 'balance', 'business_loan', 'card_issues', 'cash_deposit', 'direct_debit', 'freeze', 'high_value_payment', 'joint_account', 'latest_transactions', 'pay_bill']),
     'lang_id': ClassLabel(num_classes=14, names=['cs-CZ', 'de-DE', 'en-AU', 'en-GB', 'en-US', 'es-ES', 'fr-FR', 'it-IT', 'ko-KR', 'nl-NL', 'pl-PL', 'pt-PT', 'ru-RU', 'zh-CN']),
     'path': Value('string'),
     'transcription': Value('string')}
    ```
    """
    target_feature = _fix_for_backward_compatible_features(feature)
    updated_info = self._info.copy()
    updated_info.features[column] = target_feature
    return IterableDataset(
        ex_iterable=self._ex_iterable,
        info=updated_info,
        split=self._split,
        formatting=self._formatting,
        shuffling=copy.deepcopy(self._shuffling),
        distributed=copy.deepcopy(self._distributed),
        token_per_repo_id=self._token_per_repo_id,
    )
def cast(
    self,
    features: Features,
) -> "IterableDataset":
    """
    Cast the dataset to a new set of features.

    Only the dataset's declared features change; the underlying examples iterable is reused as is.

    Args:
        features ([`Features`]):
            New features to cast the dataset to.
            The name of the fields in the features must match the current column names.
            The type of the data must also be convertible from one type to the other.
            For non-trivial conversion, e.g. `string` <-> `ClassLabel` you should use [`~Dataset.map`] to update the Dataset.

    Returns:
        `IterableDataset`: A copy of the dataset with casted features.

    Example:

    ```py
    >>> from datasets import load_dataset, ClassLabel, Value
    >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", split="train", streaming=True)
    >>> ds.features
    {'label': ClassLabel(names=['neg', 'pos']),
     'text': Value('string')}
    >>> new_features = ds.features.copy()
    >>> new_features["label"] = ClassLabel(names=["bad", "good"])
    >>> new_features["text"] = Value("large_string")
    >>> ds = ds.cast(new_features)
    >>> ds.features
    {'label': ClassLabel(names=['bad', 'good']),
     'text': Value('large_string')}
    ```
    """
    target_features = _fix_for_backward_compatible_features(features)
    updated_info = self._info.copy()
    updated_info.features = target_features
    return IterableDataset(
        ex_iterable=self._ex_iterable,
        info=updated_info,
        split=self._split,
        formatting=self._formatting,
        shuffling=copy.deepcopy(self._shuffling),
        distributed=copy.deepcopy(self._distributed),
        token_per_repo_id=self._token_per_repo_id,
    )
def decode(self, enable: bool = True, num_threads: int = 0) -> "IterableDataset":
    """
    Enable or disable the dataset features decoding for audio, image, video.

    When enabled (default), media types are decoded:

    * audio -> dict of "array" and "sampling_rate" and "path"
    * image -> PIL.Image
    * video -> torchvision.io.VideoReader

    You can enable multithreading using `num_threads`. This is especially useful to speed up remote
    data streaming. However it can be slower than `num_threads=0` for local data on fast disks.

    Disabling decoding is useful if you want to iterate on the paths or bytes of the media files
    without actually decoding their content. To disable decoding you can use `.decode(False)`, which
    is equivalent to calling `.cast()` or `.cast_column()` with all the Audio, Image and Video types
    set to `decode=False`.

    Args:
        enable (`bool`, defaults to `True`):
            Enable or disable features decoding.
        num_threads (`int`, defaults to `0`):
            Enable multithreading for features decoding. Only used when `enable` is `True`
            and `num_threads > 0`.

    Returns:
        `IterableDataset`: A copy of the dataset with casted features.

    Raises:
        ValueError: if the dataset features are unknown (falsy).

    Examples:

    Disable decoding:

    ```py
    >>> from datasets import load_dataset
    >>> ds = load_dataset("sshh12/planet-textures", split="train", streaming=True)
    >>> next(iter(ds))
    {'image': <PIL.Image.Image ...>,
    'text': 'A distant celestial object with an icy crust, displaying a light blue shade, covered with round pits and rugged terrains.'}
    >>> ds = ds.decode(False)
    >>> ds.features
    {'image': Image(mode=None, decode=False, id=None),
    'text': Value('string')}
    >>> next(iter(ds))
    {
      'image': {
        'path': 'hf://datasets/sshh12/planet-textures@69dc4cef7a5c4b2cfe387727ec8ea73d4bff7302/train/textures/0000.png',
        'bytes': None
      },
      'text': 'A distant celestial object with an icy crust, displaying a light blue shade, covered with round pits and rugged terrains.'
    }
    ```

    Speed up streaming with multithreading:

    ```py
    >>> import os
    >>> from datasets import load_dataset
    >>> from tqdm import tqdm
    >>> ds = load_dataset("sshh12/planet-textures", split="train", streaming=True)
    >>> num_threads = min(32, (os.cpu_count() or 1) + 4)
    >>> ds = ds.decode(num_threads=num_threads)
    >>> for _ in tqdm(ds):  # 20 times faster !
    ...     ...
    ```
    """
    if not self.features:
        raise ValueError(
            "Features decoding is only available for datasets with known features, but features are Unknown. "
            "Please set the datasets features with `ds = ds.cast(features)`."
        )
    ds = self

    # Visitor callback: flips the `decode` flag on every feature that has one.
    def set_decoding(decode: bool, feature):
        if hasattr(feature, "decode"):
            feature.decode = decode

    if enable and num_threads > 0:
        # Two independent copies of the schema: one with decoding off (used for the stored features,
        # so the base pipeline yields undecoded data) and one with decoding on (used by the workers).
        disabled_decoding_features = self.features.copy()
        enabled_decoding_features = self.features.copy()

        _visit(disabled_decoding_features, partial(set_decoding, False))
        _visit(enabled_decoding_features, partial(set_decoding, True))
        ds = ds.cast(disabled_decoding_features)
        # NOTE(review): the pool is never closed here; presumably it has to outlive this call because
        # decoding only happens lazily while the returned dataset is iterated — confirm.
        pool = multiprocessing.pool.ThreadPool(num_threads)
        func = partial(_apply_async, pool, enabled_decoding_features.decode_example)
        ds = ds.map(func, features=enabled_decoding_features)
        assert isinstance(ds._ex_iterable, MappedExamplesIterable)
        # Allow twice as many in-flight calls as there are worker threads.
        ds._ex_iterable.max_num_running_async_map_functions_in_parallel = 2 * num_threads
    else:
        # Single-threaded path: just toggle the flag on a copy of the features and cast to it.
        features = ds.features.copy()
        _visit(features, partial(set_decoding, enable))
        ds = ds.cast(features)
    return ds
+ ) + ds = self + + def set_decoding(decode: bool, feature): + if hasattr(feature, "decode"): + feature.decode = decode + + if enable and num_threads > 0: + disabled_decoding_features = self.features.copy() + enabled_decoding_features = self.features.copy() + + _visit(disabled_decoding_features, partial(set_decoding, False)) + _visit(enabled_decoding_features, partial(set_decoding, True)) + ds = ds.cast(disabled_decoding_features) + pool = multiprocessing.pool.ThreadPool(num_threads) + func = partial(_apply_async, pool, enabled_decoding_features.decode_example) + ds = ds.map(func, features=enabled_decoding_features) + assert isinstance(ds._ex_iterable, MappedExamplesIterable) + ds._ex_iterable.max_num_running_async_map_functions_in_parallel = 2 * num_threads + else: + features = ds.features.copy() + _visit(features, partial(set_decoding, enable)) + ds = ds.cast(features) + return ds + + def _step(self, step: int, offset: int) -> "IterableDataset": + ex_iterable = StepExamplesIterable(self._ex_iterable, step=step, offset=offset) + return IterableDataset( + ex_iterable=ex_iterable, + info=self._info.copy(), + split=self._split, + formatting=self._formatting, + shuffling=copy.deepcopy(self._shuffling), + distributed=copy.deepcopy(self._distributed), + token_per_repo_id=self._token_per_repo_id, + ) + + def _resolve_features(self): + if self.features is not None: + return self + elif self._ex_iterable.is_typed: + features = self._ex_iterable.features + else: + features = _infer_features_from_batch(self.with_format(None)._head()) + info = self.info.copy() + info.features = features + return IterableDataset( + ex_iterable=self._ex_iterable, + info=info, + split=self._split, + formatting=self._formatting, + shuffling=copy.deepcopy(self._shuffling), + distributed=copy.deepcopy(self._distributed), + token_per_repo_id=self._token_per_repo_id, + ) + + def batch(self, batch_size: int, drop_last_batch: bool = False) -> "IterableDataset": + """ + Group samples from the dataset 
into batches. + + Args: + batch_size (`int`): The number of samples in each batch. + drop_last_batch (`bool`, defaults to `False`): Whether to drop the last incomplete batch. + + Example: + ```py + >>> ds = load_dataset("some_dataset", streaming=True) + >>> batched_ds = ds.batch(batch_size=32) + ``` + """ + + if self.features: + features = Features({col: List(feature) for col, feature in self.features.items()}) + else: + features = None + return self.map( + _batch_fn, batched=True, batch_size=batch_size, drop_last_batch=drop_last_batch, features=features + ) + + def to_dict(self, batch_size: Optional[int] = None, batched: bool = False) -> Union[dict, Iterator[dict]]: + """Returns the dataset as a Python dict. Can also return a generator for large datasets. + + Args: + batch_size (`int`, *optional*): The size (number of rows) of the batches if `batched` is `True`. + Defaults to `datasets.config.DEFAULT_MAX_BATCH_SIZE`. + + Returns: + `dict` or `Iterator[dict]` + + Example: + + ```py + >>> ds.to_dict() + ``` + """ + if batched: + for table in self.with_format("arrow").iter(batch_size=batch_size): + yield Dataset(table, fingerprint="unset").to_dict() + else: + table = pa.concat_tables(list(self.with_format("arrow").iter(batch_size=1000))) + return Dataset(table, fingerprint="unset").to_dict() + + def to_list(self) -> list: + """Returns the dataset as a Python list. + + Returns: + `list` + + Example: + + ```py + >>> ds.to_list() + ``` + """ + table = pa.concat_tables(list(self.with_format("arrow").iter(batch_size=1000))) + return Dataset(table, fingerprint="unset").to_list() + + def to_pandas( + self, batch_size: Optional[int] = None, batched: bool = False + ) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]: + """Returns the dataset as a `pandas.DataFrame`. Can also return a generator for large datasets. + + Args: + batch_size (`int`, *optional*): + The size (number of rows) of the batches if `batched` is `True`. + Defaults to `datasets.config.DEFAULT_MAX_BATCH_SIZE`. 
+ batched (`bool`): + Set to `True` to return a generator that yields the dataset as batches + of `batch_size` rows. Defaults to `False` (returns the whole datasets once). + + Returns: + `pandas.DataFrame` or `Iterator[pandas.DataFrame]` + + Example: + + ```py + >>> ds.to_pandas() + ``` + """ + if batched: + for table in self.with_format("arrow").iter(batch_size=batch_size): + yield Dataset(table, fingerprint="unset").to_pandas() + else: + table = pa.concat_tables(list(self.with_format("arrow").iter(batch_size=1000))) + return Dataset(table, fingerprint="unset").to_pandas() + + def to_polars( + self, + batch_size: Optional[int] = None, + batched: bool = False, + schema_overrides: Optional[dict] = None, + rechunk: bool = True, + ) -> Union["pl.DataFrame", Iterator["pl.DataFrame"]]: + """Returns the dataset as a `polars.DataFrame`. Can also return a generator for large datasets. + + Args: + batch_size (`int`, *optional*): + The size (number of rows) of the batches if `batched` is `True`. + Defaults to `datasets.config.DEFAULT_MAX_BATCH_SIZE`. + batched (`bool`): + Set to `True` to return a generator that yields the dataset as batches + of `batch_size` rows. Defaults to `False` (returns the whole datasets once). + schema_overrides (`dict`, *optional*): + Support type specification or override of one or more columns; note that + any dtypes inferred from the schema param will be overridden. + rechunk (`bool`): + Make sure that all data is in contiguous memory. Defaults to `True`. 
+ Returns: + `polars.DataFrame` or `Iterator[polars.DataFrame]` + + Example: + + ```py + >>> ds.to_polars() + ``` + """ + if batched: + for table in self.with_format("arrow").iter(batch_size=batch_size): + yield Dataset(table, fingerprint="unset").to_polars(schema_overrides=schema_overrides, rechunk=rechunk) + else: + table = pa.concat_tables(list(self.with_format("arrow").iter(batch_size=1000))) + return Dataset(table, fingerprint="unset").to_polars(schema_overrides=schema_overrides, rechunk=rechunk) + + def to_csv( + self, + path_or_buf: Union[PathLike, BinaryIO], + batch_size: Optional[int] = None, + storage_options: Optional[dict] = None, + **to_csv_kwargs, + ) -> int: + """Exports the dataset to csv. + + This iterates on the dataset and loads it completely in memory before writing it. + + Args: + path_or_buf (`PathLike` or `FileOrBuffer`): + Either a path to a file (e.g. `file.csv`), a remote URI (e.g. `hf://datasets/username/my_dataset_name/data.csv`), + or a BinaryIO, where the dataset will be saved to in the specified format. + batch_size (`int`, *optional*): + Size of the batch to load in memory and write at once. + Defaults to `datasets.config.DEFAULT_MAX_BATCH_SIZE`. + storage_options (`dict`, *optional*): + Key/value pairs to be passed on to the file-system backend, if any. + **to_csv_kwargs (additional keyword arguments): + Parameters to pass to pandas's [`pandas.DataFrame.to_csv`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_csv.html). + The parameter `index` defaults to `False` if not specified. + If you would like to write the index, pass `index=True` and also set a name for the index column by + passing `index_label`. + + Returns: + `int`: The number of characters or bytes written. 
+ + Example: + + ```py + >>> ds.to_csv("path/to/dataset/directory") + ``` + """ + table = pa.concat_tables(list(self.with_format("arrow").iter(batch_size=1000))) + return Dataset(table, fingerprint="unset").to_csv( + path_or_buf, + batch_size=batch_size, + storage_options=storage_options, + **to_csv_kwargs, + ) + + def to_json( + self, + path_or_buf: Union[PathLike, BinaryIO], + batch_size: Optional[int] = None, + storage_options: Optional[dict] = None, + **to_json_kwargs, + ) -> int: + """Export the dataset to JSON Lines or JSON. + + This iterates on the dataset and loads it completely in memory before writing it. + + The default output format is [JSON Lines](https://jsonlines.org/). + To export to [JSON](https://www.json.org), pass `lines=False` argument and the desired `orient`. + + Args: + path_or_buf (`PathLike` or `FileOrBuffer`): + Either a path to a file (e.g. `file.json`), a remote URI (e.g. `hf://datasets/username/my_dataset_name/data.json`), + or a BinaryIO, where the dataset will be saved to in the specified format. + batch_size (`int`, *optional*): + Size of the batch to load in memory and write at once. + Defaults to `datasets.config.DEFAULT_MAX_BATCH_SIZE`. + storage_options (`dict`, *optional*): + Key/value pairs to be passed on to the file-system backend, if any. + **to_json_kwargs (additional keyword arguments): + Parameters to pass to pandas's [`pandas.DataFrame.to_json`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_json.html). + Default arguments are `lines=True` and `orient="records". + The parameter `index` defaults to `False` if `orient` is `"split"` or `"table"`. + If you would like to write the index, pass `index=True`. + + Returns: + `int`: The number of characters or bytes written. + + Example: + + ```py + >>> ds.to_json("path/to/dataset/directory/filename.jsonl") + ``` + + ```py + >>> num_shards = dataset.num_shards + >>> for index in range(num_shards): + ... shard = dataset.shard(index, num_shards) + ... 
shard.to_json(f"path/of/my/dataset/data-{index:05d}.jsonl") + ``` + + """ + table = pa.concat_tables(list(self.with_format("arrow").iter(batch_size=1000))) + return Dataset(table, fingerprint="unset").to_json( + path_or_buf, + batch_size=batch_size, + storage_options=storage_options, + **to_json_kwargs, + ) + + def to_sql( + self, + name: str, + con: Union[str, "sqlalchemy.engine.Connection", "sqlalchemy.engine.Engine", "sqlite3.Connection"], + batch_size: Optional[int] = None, + **sql_writer_kwargs, + ) -> int: + """Exports the dataset to a SQL database. + + Args: + name (`str`): + Name of SQL table. + con (`str` or `sqlite3.Connection` or `sqlalchemy.engine.Connection` or `sqlalchemy.engine.Connection`): + A [URI string](https://docs.sqlalchemy.org/en/13/core/engines.html#database-urls) or a SQLite3/SQLAlchemy connection object used to write to a database. + batch_size (`int`, *optional*): + Size of the batch to load in memory and write at once. + Defaults to `datasets.config.DEFAULT_MAX_BATCH_SIZE`. + **sql_writer_kwargs (additional keyword arguments): + Parameters to pass to pandas's [`pandas.DataFrame.to_sql`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_sql.html). + The parameter `index` defaults to `False` if not specified. + If you would like to write the index, pass `index=True` and also set a name for the index column by + passing `index_label`. + + + Returns: + `int`: The number of records written. + + Example: + + ```py + >>> # con provided as a connection URI string + >>> ds.to_sql("data", "sqlite:///my_own_db.sql") + >>> # con provided as a sqlite3 connection object + >>> import sqlite3 + >>> con = sqlite3.connect("my_own_db.sql") + >>> with con: + ... 
ds.to_sql("data", con) + ``` + """ + table = pa.concat_tables(list(self.with_format("arrow").iter(batch_size=1000))) + return Dataset(table, fingerprint="unset").to_sql(name, con, batch_size=batch_size, **sql_writer_kwargs) + + def to_parquet( + self, + path_or_buf: Union[PathLike, BinaryIO], + batch_size: Optional[int] = None, + storage_options: Optional[dict] = None, + **parquet_writer_kwargs, + ) -> int: + """Exports the dataset to parquet + + Args: + path_or_buf (`PathLike` or `FileOrBuffer`): + Either a path to a file (e.g. `file.parquet`), a remote URI (e.g. `hf://datasets/username/my_dataset_name/data.parquet`), + or a BinaryIO, where the dataset will be saved to in the specified format. + batch_size (`int`, *optional*): + Size of the batch to load in memory and write at once. + Defaults to `datasets.config.DEFAULT_MAX_BATCH_SIZE`. + storage_options (`dict`, *optional*): + Key/value pairs to be passed on to the file-system backend, if any. + + + **parquet_writer_kwargs (additional keyword arguments): + Parameters to pass to PyArrow's `pyarrow.parquet.ParquetWriter`. + + Returns: + `int`: The number of characters or bytes written. + + Example: + + ```py + >>> ds.to_parquet("path/to/dataset/directory") + ``` + + ```py + >>> num_shards = dataset.num_shards + >>> for index in range(num_shards): + ... shard = dataset.shard(index, num_shards) + ... 
shard.to_parquet(f"path/of/my/dataset/data-{index:05d}.parquet") + ``` + + """ + from .arrow_writer import get_arrow_writer_batch_size_from_features + + batch_size = get_arrow_writer_batch_size_from_features(self.features) or config.DEFAULT_MAX_BATCH_SIZE + table = pa.concat_tables(list(self.with_format("arrow").iter(batch_size=batch_size))) + return Dataset(table, fingerprint="unset").to_parquet( + path_or_buf, storage_options=storage_options, **parquet_writer_kwargs + ) + + def _push_parquet_shards_to_hub_single( + self, + job_id: int, + num_jobs: int, + repo_id: str, + data_dir: str, + split: str, + token: Optional[str], + revision: Optional[str], + create_pr: Optional[bool], + # max_shard_size: Optional[Union[int, str]] = None, # TODO(QL): add arg + num_shards: int, + embed_external_files: bool, + ) -> Iterable[tuple[list[CommitOperationAdd], int, int]]: + """Pushes the dataset shards as Parquet files to the hub. + + Returns: + additions (`List[CommitOperation]`): list of the `CommitOperationAdd` of the uploaded shards + uploaded_size (`int`): number of uploaded bytes to the repository + dataset_nbytes (`int`): approximate size in bytes of the uploaded dataset after uncompression + """ + + div = num_shards // num_jobs + mod = num_shards % num_jobs + start = div * job_id + min(job_id, mod) + end = start + div + (1 if job_id < mod else 0) + + index_shards = ( + (start + i, self.shard(num_shards=end - start, index=i, contiguous=True)) for i in range(end - start) + ) + + api = HfApi(endpoint=config.HF_ENDPOINT, token=token) + + uploaded_size = 0 + dataset_nbytes = 0 + num_examples = 0 + additions: list[CommitOperationAdd] = [] + for index, shard in index_shards: + if embed_external_files: + from .arrow_writer import get_arrow_writer_batch_size_from_features + + shard = shard.with_format("arrow") + shard = shard.map( + partial(embed_table_storage, token_per_repo_id=self._token_per_repo_id), + batched=True, + 
batch_size=get_arrow_writer_batch_size_from_features(shard.features), + ) + shard_path_in_repo = f"{data_dir}/{split}-{index:05d}-of-{num_shards:05d}.parquet" + buffer = BytesIO() + shard.to_parquet(buffer) + parquet_metadata = pq.read_metadata(buffer) + num_examples += parquet_metadata.num_rows + dataset_nbytes += sum( + parquet_metadata.row_group(i).total_byte_size for i in range(parquet_metadata.num_row_groups) + ) + parquet_content = buffer.getvalue() + uploaded_size += len(parquet_content) + del buffer + shard_addition = CommitOperationAdd(path_in_repo=shard_path_in_repo, path_or_fileobj=parquet_content) + api.preupload_lfs_files( + repo_id=repo_id, + additions=[shard_addition], + repo_type="dataset", + revision=revision, + create_pr=create_pr, + ) + additions.append(shard_addition) + yield job_id, False, 1 + + yield job_id, True, (additions, dataset_nbytes, num_examples) + + def _push_parquet_shards_to_hub( + self, + repo_id: str, + data_dir: str, + split: str, + token: Optional[str], + revision: Optional[str], + create_pr: Optional[bool], + # max_shard_size: Optional[Union[int, str]], # TODO(QL): add arg + num_shards: Optional[int], + embed_external_files: bool, + num_proc: Optional[int], + ) -> tuple[list[CommitOperationAdd], int, int, int]: + """Pushes the dataset shards as Parquet files to the hub. 
+ + Returns: + additions (`List[CommitOperation]`): list of the `CommitOperationAdd` of the uploaded shards + uploaded_size (`int`): number of uploaded bytes to the repository + dataset_nbytes (`int`): approximate size in bytes of the uploaded dataset after uncompression + num_examples (`int`): number of examples of the uploaded dataset + """ + + # Find decodable columns, because if there are any, we need to: + # embed the bytes from the files in the shards + decodable_columns = ( + [k for k, v in self._info.features.items() if require_decoding(v, ignore_decode_attribute=True)] + if embed_external_files + else [] + ) + embed_external_files = embed_external_files and bool(decodable_columns) + + if num_shards is None: + # TODO(QL): this can depend on max_shard_size later + num_shards = self.num_shards + + additions: list[CommitOperationAdd] = [] + dataset_nbytes = num_examples = 0 + + num_jobs = num_proc or 1 + kwargs_iterable = [ + { + "self": self.shard(num_shards=num_jobs, index=job_id, contiguous=True), + "job_id": job_id, + "num_jobs": num_jobs, + "repo_id": repo_id, + "data_dir": data_dir, + "split": split, + "token": token, + "revision": revision, + "create_pr": create_pr, + "num_shards": num_shards, + "embed_external_files": embed_external_files, + } + for job_id in range(num_jobs) + ] + desc = "Uploading the dataset shards" + desc += f" (num_proc={num_proc})" if num_proc is not None and num_proc >= 1 else "" + pbar = hf_tqdm( + unit=" shards", + total=num_shards, + desc=desc, + ) + with contextlib.nullcontext() if num_proc is None or num_proc < 1 else Pool(num_proc) as pool: + update_stream = ( + IterableDataset._push_parquet_shards_to_hub_single(**kwargs_iterable[0]) + if pool is None + else iflatmap_unordered( + pool, + IterableDataset._push_parquet_shards_to_hub_single, + kwargs_iterable=kwargs_iterable, + ) + ) + for job_id, done, content in update_stream: + if not done: + pbar.update(content) + else: + additions += content[0] + dataset_nbytes += 
content[1] + num_examples += content[2] + if pool is not None: + pool.close() + pool.join() + + uploaded_size = sum(addition.upload_info.size for addition in additions) + return additions, uploaded_size, dataset_nbytes, num_examples + + def push_to_hub( + self, + repo_id: str, + config_name: str = "default", + set_default: Optional[bool] = None, + split: Optional[str] = None, + data_dir: Optional[str] = None, + commit_message: Optional[str] = None, + commit_description: Optional[str] = None, + private: Optional[bool] = None, + token: Optional[str] = None, + revision: Optional[str] = None, + create_pr: Optional[bool] = False, + # max_shard_size: Optional[Union[int, str]] = None, # TODO(QL): add arg + num_shards: Optional[int] = None, + embed_external_files: bool = True, + num_proc: Optional[int] = None, + ) -> CommitInfo: + """Pushes the dataset to the hub as a Parquet dataset. + The dataset is pushed using HTTP requests and does not need to have neither git or git-lfs installed. + + The resulting Parquet files are self-contained by default. If your dataset contains [`Image`], [`Audio`] or [`Video`] + data, the Parquet files will store the bytes of your images or audio files. + You can disable this by setting `embed_external_files` to `False`. + + Args: + repo_id (`str`): + The ID of the repository to push to in the following format: `/` or + `/`. Also accepts ``, which will default to the namespace + of the logged-in user. + config_name (`str`, defaults to "default"): + The configuration name (or subset) of a dataset. Defaults to "default". + set_default (`bool`, *optional*): + Whether to set this configuration as the default one. Otherwise, the default configuration is the one + named "default". + split (`str`, *optional*): + The name of the split that will be given to that dataset. Defaults to `self.split`. + data_dir (`str`, *optional*): + Directory name that will contain the uploaded data files. 
Defaults to the `config_name` if different + from "default", else "data". + commit_message (`str`, *optional*): + Message to commit while pushing. Will default to `"Upload dataset"`. + commit_description (`str`, *optional*): + Description of the commit that will be created. + Additionally, description of the PR if a PR is created (`create_pr` is True). + private (`bool`, *optional*): + Whether to make the repo private. If `None` (default), the repo will be public unless the + organization's default is private. This value is ignored if the repo already exists. + token (`str`, *optional*): + An optional authentication token for the Hugging Face Hub. If no token is passed, will default + to the token saved locally when logging in with `huggingface-cli login`. Will raise an error + if no token is passed and the user is not logged-in. + revision (`str`, *optional*): + Branch to push the uploaded files to. Defaults to the `"main"` branch. + create_pr (`bool`, *optional*, defaults to `False`): + Whether to create a PR with the uploaded files or directly commit. + num_shards (`int`, *optional*): + Number of shards to write. Equals to this dataset's `.num_shards` by default. + embed_external_files (`bool`, defaults to `True`): + Whether to embed file bytes in the shards. + In particular, this will do the following before the push for the fields of type: + + - [`Audio`] and [`Image`]: remove local path information and embed file content in the Parquet files. + num_proc (`int`, *optional*, defaults to `None`): + Number of processes when preparing and uploading the dataset. + This is helpful if the dataset is made of many samples and transformations. + Multiprocessing is disabled by default. + + Return: + huggingface_hub.CommitInfo + + Example: + + ```python + >>> dataset.push_to_hub("/") + >>> dataset_dict.push_to_hub("/", private=True) + >>> dataset.push_to_hub("/", num_shards=1024) + ``` + + If your dataset has multiple splits (e.g. 
train/validation/test): + + ```python + >>> train_dataset.push_to_hub("/", split="train") + >>> val_dataset.push_to_hub("/", split="validation") + >>> # later + >>> dataset = load_dataset("/") + >>> train_dataset = dataset["train"] + >>> val_dataset = dataset["validation"] + ``` + + If you want to add a new configuration (or subset) to a dataset (e.g. if the dataset has multiple tasks/versions/languages): + + ```python + >>> english_dataset.push_to_hub("/", "en") + >>> french_dataset.push_to_hub("/", "fr") + >>> # later + >>> english_dataset = load_dataset("/", "en") + >>> french_dataset = load_dataset("/", "fr") + ``` + """ + if "Video(" in str(self.features): + raise NotImplementedError( + "push_to_hub is not implemented for video datasets, instead you should upload the video files " + "using e.g. the huggingface_hub library and optionally upload a metadata.csv or metadata.jsonl " + "file containing other information like video captions, features or labels. More information " + "at https://huggingface.co/docs/datasets/main/en/video_load#videofolder" + ) + if num_proc is not None and num_proc > self.num_shards: + logger.warning( + f"Too many num_proc: {num_proc} (max is dataset.num_shards={self.num_shards}). " + f"Stopping {num_proc - self.num_shards} processes." + ) + logger.info( + f"To parallelize data loading, we give each process some shards (or data sources) to process. " + f"Therefore it's unnecessary to have a number of processes greater than dataset.num_shards={self.num_shards}. " + f"To enable more parallelism, please split the dataset in more files than {self.num_shards}." + ) + num_proc = self.num_shards + + if config_name == "data": + raise ValueError("`config_name` cannot be 'data'. Please, choose another name for configuration.") + + # if max_shard_size is not None and num_shards is not None: + # raise ValueError( + # "Failed to push_to_hub: please specify either max_shard_size or num_shards, but not both." 
+ # ) + + if split is None: + split = str(self.split) if self.split is not None else "train" + + if not re.match(_split_re, split): + raise ValueError(f"Split name should match '{_split_re}' but got '{split}'.") + + api = HfApi(endpoint=config.HF_ENDPOINT, token=token) + + try: + repo_id = api.repo_info(repo_id, repo_type="dataset").id + except RepositoryNotFoundError: + repo_url = api.create_repo( + repo_id, + repo_type="dataset", + private=private, + exist_ok=True, + ) + repo_id = repo_url.repo_id + + if revision is not None and not revision.startswith("refs/pr/"): + # We do not call create_branch for a PR reference: 400 Bad Request + api.create_branch(repo_id, branch=revision, token=token, repo_type="dataset", exist_ok=True) + + if not data_dir: + data_dir = config_name if config_name != "default" else "data" # for backward compatibility + + additions, uploaded_size, dataset_nbytes, num_examples = self._push_parquet_shards_to_hub( + repo_id=repo_id, + data_dir=data_dir, + split=split, + token=token, + revision=revision, + # max_shard_size=max_shard_size, # TODO(QL): add arg + num_shards=num_shards, + create_pr=create_pr, + embed_external_files=embed_external_files, + num_proc=num_proc, + ) + + def get_deletions_and_dataset_card() -> tuple[str, list[CommitOperationDelete], str, Optional[str]]: + parent_commit = api.repo_info(repo_id, repo_type="dataset", revision=revision).sha + + # Check if the repo already has a README.md and/or a dataset_infos.json to update them with the new split info (size and pattern) + # and delete old split shards (if they exist) + repo_with_dataset_card, repo_with_dataset_infos = False, False + deletions: list[CommitOperationDelete] = [] + deleted_size = 0 + repo_splits: list[str] = [] # use a list to keep the order of the splits + repo_files_to_add = [addition.path_in_repo for addition in additions] + for repo_file in api.list_repo_tree( + repo_id=repo_id, revision=parent_commit, repo_type="dataset", token=token, recursive=True + ): + 
if not isinstance(repo_file, RepoFile): + continue + if repo_file.rfilename == config.REPOCARD_FILENAME: + repo_with_dataset_card = True + elif repo_file.rfilename == config.DATASETDICT_INFOS_FILENAME: + repo_with_dataset_infos = True + elif ( + repo_file.rfilename.startswith(f"{data_dir}/{split}-") + and repo_file.rfilename not in repo_files_to_add + ): + deletions.append(CommitOperationDelete(path_in_repo=repo_file.rfilename)) + deleted_size += repo_file.size + elif fnmatch.fnmatch( + repo_file.rfilename, + PUSH_TO_HUB_WITHOUT_METADATA_CONFIGS_SPLIT_PATTERN_SHARDED.replace("{split}", "*"), + ): + pattern = glob_pattern_to_regex(PUSH_TO_HUB_WITHOUT_METADATA_CONFIGS_SPLIT_PATTERN_SHARDED) + split_pattern_fields = string_to_dict(repo_file.rfilename, pattern) + assert split_pattern_fields is not None + repo_split = split_pattern_fields["split"] + if repo_split not in repo_splits: + repo_splits.append(repo_split) + + organization, dataset_name = repo_id.split("/") if "/" in repo_id else (None, repo_id) + info_to_dump = self.info.copy() + info_to_dump.download_checksums = None + info_to_dump.download_size = uploaded_size + info_to_dump.dataset_size = dataset_nbytes + info_to_dump.size_in_bytes = uploaded_size + dataset_nbytes + info_to_dump.config_name = config_name + info_to_dump.splits = SplitDict( + { + split: SplitInfo( + split, num_bytes=dataset_nbytes, num_examples=num_examples, dataset_name=dataset_name + ) + } + ) + # get the info from the README to update them + if repo_with_dataset_card: + dataset_card_path = api.hf_hub_download( + repo_id, config.REPOCARD_FILENAME, repo_type="dataset", revision=parent_commit + ) + dataset_card = DatasetCard.load(Path(dataset_card_path)) + dataset_card_data = dataset_card.data + metadata_configs = MetadataConfigs.from_dataset_card_data(dataset_card_data) + dataset_infos: DatasetInfosDict = DatasetInfosDict.from_dataset_card_data(dataset_card_data) + if dataset_infos and config_name in dataset_infos: + repo_info = 
dataset_infos[config_name] + else: + repo_info = None + # get the deprecated dataset_infos.json to update them + elif repo_with_dataset_infos: + dataset_card = None + dataset_card_data = DatasetCardData() + metadata_configs = MetadataConfigs() + dataset_infos_path = api.hf_hub_download( + repo_id, config.DATASETDICT_INFOS_FILENAME, repo_type="dataset", revision=parent_commit + ) + with open(dataset_infos_path, encoding="utf-8") as f: + dataset_infos: dict = json.load(f) + dataset_info = dataset_infos.get(config_name, None) if dataset_infos else None + repo_info = DatasetInfo.from_dict(dataset_info) if dataset_info else None + else: + dataset_card = None + dataset_card_data = DatasetCardData() + metadata_configs = MetadataConfigs() + repo_info = None + # update the total info to dump from existing info + if repo_info is not None: + logger.info("Updating downloaded metadata with the new split.") + if repo_info.splits and list(repo_info.splits) != [split]: + if self._info.features != repo_info.features: + raise ValueError( + f"Features of the new split don't match the features of the existing splits on the hub: {self._info.features} != {repo_info.features}" + ) + + if split in repo_info.splits: + repo_info.download_size -= deleted_size + repo_info.dataset_size -= repo_info.splits.get(split, SplitInfo()).num_bytes or 0 + + repo_info.download_checksums = None + repo_info.download_size = (repo_info.download_size or 0) + uploaded_size + repo_info.dataset_size = (repo_info.dataset_size or 0) + dataset_nbytes + repo_info.size_in_bytes = repo_info.download_size + repo_info.dataset_size + repo_info.splits.pop(split, None) + repo_info.splits[split] = SplitInfo( + split, num_bytes=dataset_nbytes, num_examples=len(self), dataset_name=dataset_name + ) + info_to_dump = repo_info + # create the metadata configs if it was uploaded with push_to_hub before metadata configs existed + if not metadata_configs and repo_splits: + default_metadata_configs_to_dump = { + "data_files": 
[{"split": split, "path": f"data/{split}-*"} for split in repo_splits] + } + MetadataConfigs({"default": default_metadata_configs_to_dump}).to_dataset_card_data(dataset_card_data) + # update the metadata configs + if config_name in metadata_configs: + metadata_config = metadata_configs[config_name] + if "data_files" in metadata_config: + data_files_to_dump = sanitize_patterns(metadata_config["data_files"]) + else: + data_files_to_dump = {} + # add the new split + data_files_to_dump[split] = [f"{data_dir}/{split}-*"] + metadata_config_to_dump = { + "data_files": [ + { + "split": _split, + "path": _pattern[0] if len(_pattern) == 1 else _pattern, + } + for _split, _pattern in data_files_to_dump.items() + ] + } + else: + metadata_config_to_dump = {"data_files": [{"split": split, "path": f"{data_dir}/{split}-*"}]} + configs_to_dump = {config_name: metadata_config_to_dump} + if set_default and config_name != "default": + if metadata_configs: + current_default_config_name = metadata_configs.get_default_config_name() + if current_default_config_name == "default": + raise ValueError( + "There exists a configuration named 'default'. To set a different configuration as default, " + "rename the 'default' one first." 
+ ) + if current_default_config_name: + _ = metadata_configs[current_default_config_name].pop("default") + configs_to_dump[current_default_config_name] = metadata_configs[current_default_config_name] + metadata_config_to_dump["default"] = True + # push to the deprecated dataset_infos.json + if repo_with_dataset_infos: + dataset_infos_path = api.hf_hub_download( + repo_id, config.DATASETDICT_INFOS_FILENAME, repo_type="dataset", revision=parent_commit + ) + with open(dataset_infos_path, encoding="utf-8") as f: + dataset_infos: dict = json.load(f) + dataset_infos[config_name] = asdict(info_to_dump) + new_dataset_infos = json.dumps(dataset_infos, indent=4) + else: + new_dataset_infos = None + # push to README + DatasetInfosDict({config_name: info_to_dump}).to_dataset_card_data(dataset_card_data) + MetadataConfigs(configs_to_dump).to_dataset_card_data(dataset_card_data) + new_dataset_card = ( + DatasetCard(f"---\n{dataset_card_data}\n---\n") if dataset_card is None else dataset_card + ) + return parent_commit, deletions, new_dataset_card, new_dataset_infos + + commit_message = commit_message if commit_message is not None else "Upload dataset" + if len(additions) > config.UPLOADS_MAX_NUMBER_PER_COMMIT: + logger.info( + f"Number of files to upload is larger than {config.UPLOADS_MAX_NUMBER_PER_COMMIT}. Splitting the push into multiple commits." 
+ ) + num_commits = math.ceil(len(additions) / config.UPLOADS_MAX_NUMBER_PER_COMMIT) + for i in range(0, num_commits): + operations = additions[ + i * config.UPLOADS_MAX_NUMBER_PER_COMMIT : (i + 1) * config.UPLOADS_MAX_NUMBER_PER_COMMIT + ] + for retry, sleep_time in enumerate(itertools.chain(range(10), itertools.repeat(30)), start=1): + # We need to retry if another commit happens at the same time + sleep_time *= 1 + random.random() + try: + commit_info = api.create_commit( + repo_id, + operations=operations, + commit_message=commit_message + f" (part {i:05d}-of-{num_commits:05d})", + commit_description=commit_description, + repo_type="dataset", + revision=revision, + create_pr=create_pr, + ) + except HfHubHTTPError as err: + if ( + err.__context__ + and isinstance(err.__context__, HfHubHTTPError) + and err.__context__.response.status_code == 409 + ): + # 409 is Conflict (another commit is in progress) + time.sleep(sleep_time) + logger.info( + f"Retrying intermediate commit for {repo_id}, {config_name} ({retry}/n with status_code {err.__context__.response.status_code})" + ) + continue + else: + raise + break + logger.info( + f"Commit #{i + 1} completed" + + (f" (still {num_commits - i - 1} to go)" if num_commits - i - 1 else "") + + "." 
def _concatenate_iterable_datasets(
    dsets: list[IterableDataset],
    info: Optional[DatasetInfo] = None,
    split: Optional[NamedSplit] = None,
    axis: int = 0,
) -> IterableDataset:
    """
    Build a single `IterableDataset` out of several ones, stacking them by rows or by columns.

    Args:
        dsets (`List[datasets.IterableDataset]`): Datasets to concatenate.
        info (`DatasetInfo`, optional): Dataset information to attach to the result. A copy of it is used;
            when omitted, the infos of `dsets` are merged with `DatasetInfo.from_merge`.
        split (`NamedSplit`, optional): Name of the dataset split.
        axis (``{0, 1}``, default ``0``, meaning over rows):
            ``0`` concatenates over rows (vertically); any other value concatenates over columns
            (horizontally).

    Returns:
        `IterableDataset`: the concatenated dataset. It keeps a format only when every input dataset
        has one and they all share the same format type.

    Example:

    ```py
    >>> ds3 = _concatenate_iterable_datasets([ds1, ds2])
    ```
    """
    dsets = [d._resolve_features() for d in dsets]

    # Validate the schemas: rows need alignable features, columns need distinct names
    if axis == 0:
        _check_if_features_can_be_aligned([dset.features for dset in dsets])
    else:
        _check_column_names([col_name for dset in dsets for col_name in dset.features])

    # The result is formatted only if all the inputs agree on a single format type.
    # Nothing is logged when no dataset has a format at all.
    formattings = [dset._formatting for dset in dsets]
    has_format = [fmt is not None for fmt in formattings]
    formatting = None
    if any(has_format):
        format_types = {fmt.format_type for fmt in formattings} if all(has_format) else set()
        if len(format_types) == 1:
            formatting = FormattingConfig(format_type=format_types.pop())
        else:
            logger.info(
                "Some of the datasets have disparate format or format not set. Resetting the format of the concatenated dataset."
            )

    # TODO: improve this to account for a mix of ClassLabel and Value for example
    # right now it would keep the type of the first dataset in the list
    merged_features = {}
    for aligned in _align_features([dset.features for dset in dsets]):
        for column, feature in aligned.items():
            merged_features[column] = feature
    features = Features(merged_features)

    # Work on copies of the underlying examples iterables
    ex_iterables = [copy.deepcopy(d._ex_iterable) for d in dsets]
    concatenated_cls = (
        VerticallyConcatenatedMultiSourcesExamplesIterable
        if axis == 0
        else HorizontallyConcatenatedMultiSourcesExamplesIterable
    )
    ex_iterable = concatenated_cls(ex_iterables)

    # Set new info - we update the features
    # setting the features also ensures to fill missing columns with None
    info = DatasetInfo.from_merge([d.info for d in dsets]) if info is None else info.copy()
    info.features = features

    # Gather the auth tokens of every repository - the datasets may come from different private repositories
    token_per_repo_id = {}
    for dataset in dsets:
        for repo_id, token in dataset._token_per_repo_id.items():
            token_per_repo_id[repo_id] = token

    return IterableDataset(
        ex_iterable=ex_iterable,
        info=info,
        split=split,
        token_per_repo_id=token_per_repo_id,
        formatting=formatting,
    )
+ If `probabilities` is not `None, the iterable dataset will sample a random source according to the provided probabilities for each next examples in the iteration. + + + + Args: + datasets (`List[IterableDataset]`): list of datasets to interleave + probabilities (`List[float]`, optional, default None): If specified, the new iterable dataset samples + examples from one source at a time according to these probabilities. + seed (`int`, optional, default None): The random seed used to choose a source for each example. + stopping_strategy (`str`, defaults to `first_exhausted`): + Two strategies are proposed right now. + By default, `first_exhausted` is an undersampling strategy, i.e the dataset construction is stopped as soon as one dataset has ran out of samples. + If the strategy is `all_exhausted`, we use an oversampling strategy, i.e the dataset construction is stopped as soon as every samples of every dataset has been added at least once. + Note that if the strategy is `all_exhausted`, the interleaved dataset size can get enormous: + - with no probabilities, the resulting dataset will have max_length_datasets*nb_dataset samples. + - with given probabilities, the resulting dataset will have more samples if some datasets have really low probability of visiting. 
+ + Output: + `datasets.IterableDataset` + """ + datasets = [d._resolve_features() for d in datasets] + + # Perform checks + _check_if_features_can_be_aligned([dset.features for dset in datasets]) + + # TODO: improve this to account for a mix of ClassLabel and Value for example + # right now it would keep the type of the first dataset in the list + features = Features( + {k: v for features in _align_features([dset.features for dset in datasets]) for k, v in features.items()} + ) + + ex_iterables = [copy.deepcopy(d._ex_iterable) for d in datasets] + if all(ex_iterable.iter_arrow for ex_iterable in ex_iterables): + ex_iterables = [RebatchedArrowExamplesIterable(ex_iterable, batch_size=1) for ex_iterable in ex_iterables] + # Use cycling or random cycling of sources + if probabilities is None: + ex_iterable = CyclingMultiSourcesExamplesIterable(ex_iterables, stopping_strategy=stopping_strategy) + else: + generator = np.random.default_rng(seed) + ex_iterable = RandomlyCyclingMultiSourcesExamplesIterable( + ex_iterables, + generator=generator, + probabilities=probabilities, + stopping_strategy=stopping_strategy, + ) + # Set new info - we update the features + # setting the features also ensures to fill missing columns with None + if info is None: + info = DatasetInfo.from_merge([d.info for d in datasets]) + else: + info = info.copy() + info.features = features + # Get all the auth tokens per repository - in case the datasets come from different private repositories + token_per_repo_id = { + repo_id: token for dataset in datasets for repo_id, token in dataset._token_per_repo_id.items() + } + # Return new daset + return IterableDataset(ex_iterable=ex_iterable, info=info, split=split, token_per_repo_id=token_per_repo_id) + + +def _split_by_node_iterable_dataset(dataset: IterableDataset, rank: int, world_size: int) -> IterableDataset: + """ + Split an iterable dataset for the node at rank `rank` in a pool of nodes of size `world_size`. 
+ + If the dataset has a number of shards that is a factor of `world_size` (i.e. if `dataset.num_shards % world_size == 0`), + then the shards are evenly assigned across the nodes, which is the most optimized. + Otherwise, each node keeps 1 example out of `world_size`, skipping the other examples. + + Args: + dataset ([`IterableDataset`]): + The iterable dataset to split by node. + rank (`int`): + Rank of the current node. + world_size (`int`): + Total number of nodes. + + Returns: + [`IterableDataset`]: The iterable dataset to be used on the node at rank `rank`. + """ + if dataset._distributed: + rank = world_size * dataset._distributed.rank + rank + world_size = world_size * dataset._distributed.world_size + distributed = DistributedConfig(rank=rank, world_size=world_size) + return IterableDataset( + ex_iterable=dataset._ex_iterable, + info=dataset._info.copy(), + split=dataset._split, + formatting=dataset._formatting, + shuffling=copy.deepcopy(dataset._shuffling), + distributed=distributed, + token_per_repo_id=dataset._token_per_repo_id, + ) + + +async def _apply_async(pool, func, x): + future = pool.apply_async(func, (x,)) + while True: + if future.ready(): + return future.get() + else: + await asyncio.sleep(0) + + +def _batch_fn(unbatched): + return {k: [v] for k, v in unbatched.items()} diff --git a/datasets/keyhash.py b/datasets/keyhash.py new file mode 100644 index 0000000000000000000000000000000000000000..5ba2686e259868e19ee7439d21b995dfd4c8de51 --- /dev/null +++ b/datasets/keyhash.py @@ -0,0 +1,104 @@ +# Copyright 2020 The HuggingFace Datasets Authors and the TensorFlow Datasets Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Lint as: python3 + +""" +Hashing function for dataset keys using `hashlib.md5` + +Requirements for the hash function: + +- Provides a uniformly distributed hash from random space +- Adequately fast speed +- Working with multiple input types (in this case, `str`, `int` or `bytes`) +- Should be platform independent (generates same hash on different OS and systems) + +The hashing function provides a unique 128-bit integer hash of the key provided. + +The split name is being used here as the hash salt to avoid having same hashes +in different splits due to same keys +""" + +from typing import Union + +from huggingface_hub.utils import insecure_hashlib + + +def _as_bytes(hash_data: Union[str, int, bytes, bytearray]) -> bytes: + """ + Returns the input hash_data in its bytes form + + Args: + hash_data: the hash salt/key to be converted to bytes + """ + if isinstance(hash_data, (bytes, bytearray)): + # Data already in bytes, returns as it as + return hash_data + elif isinstance(hash_data, str): + # We keep the data as it as for it ot be later encoded to UTF-8 + # However replace `\\` with `/` for Windows compatibility + hash_data = hash_data.replace("\\", "/") + elif isinstance(hash_data, int): + hash_data = str(hash_data) + else: + # If data is not of the required type, raise error + raise InvalidKeyError(hash_data) + + return hash_data.encode("utf-8") + + +class InvalidKeyError(Exception): + """Raises an error when given key is of invalid datatype.""" + + def __init__(self, hash_data): + self.prefix = "\nFAILURE TO GENERATE DATASET: Invalid key type 
detected" + self.err_msg = f"\nFound Key {hash_data} of type {type(hash_data)}" + self.suffix = "\nKeys should be either str, int or bytes type" + super().__init__(f"{self.prefix}{self.err_msg}{self.suffix}") + + +class DuplicatedKeysError(Exception): + """Raise an error when duplicate key found.""" + + def __init__(self, key, duplicate_key_indices, fix_msg=""): + self.key = key + self.duplicate_key_indices = duplicate_key_indices + self.fix_msg = fix_msg + self.prefix = "Found multiple examples generated with the same key" + if len(duplicate_key_indices) <= 20: + self.err_msg = f"\nThe examples at index {', '.join(duplicate_key_indices)} have the key {key}" + else: + self.err_msg = f"\nThe examples at index {', '.join(duplicate_key_indices[:20])}... ({len(duplicate_key_indices) - 20} more) have the key {key}" + self.suffix = "\n" + fix_msg if fix_msg else "" + super().__init__(f"{self.prefix}{self.err_msg}{self.suffix}") + + +class KeyHasher: + """KeyHasher class for providing hash using md5""" + + def __init__(self, hash_salt: str): + self._split_md5 = insecure_hashlib.md5(_as_bytes(hash_salt)) + + def hash(self, key: Union[str, int, bytes]) -> int: + """Returns 128-bits unique hash of input key + + Args: + key: the input key to be hashed (should be str, int or bytes) + + Returns: 128-bit int hash key""" + md5 = self._split_md5.copy() + byte_key = _as_bytes(key) + md5.update(byte_key) + # Convert to integer with hexadecimal conversion + return int(md5.hexdigest(), 16) diff --git a/datasets/load.py b/datasets/load.py new file mode 100644 index 0000000000000000000000000000000000000000..ae3b9825970c1109dad2493e69a6141cd95858b0 --- /dev/null +++ b/datasets/load.py @@ -0,0 +1,1481 @@ +# Copyright 2020 The HuggingFace Datasets Authors and the TensorFlow Datasets Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Lint as: python3 +"""Access datasets.""" + +import glob +import importlib +import inspect +import json +import os +import posixpath +from collections import Counter +from collections.abc import Mapping, Sequence +from dataclasses import dataclass, field +from pathlib import Path +from typing import Any, Optional, Union + +import fsspec +import httpx +import requests +import yaml +from fsspec.core import url_to_fs +from huggingface_hub import DatasetCard, DatasetCardData, HfApi +from huggingface_hub.utils import ( + EntryNotFoundError, + GatedRepoError, + LocalEntryNotFoundError, + OfflineModeIsEnabled, + RepositoryNotFoundError, + RevisionNotFoundError, + get_session, +) + +from . 
import __version__, config +from .arrow_dataset import Dataset +from .builder import BuilderConfig, DatasetBuilder +from .data_files import ( + DataFilesDict, + DataFilesList, + DataFilesPatternsDict, + EmptyDatasetError, + get_data_patterns, + sanitize_patterns, +) +from .dataset_dict import DatasetDict, IterableDatasetDict +from .download.download_config import DownloadConfig +from .download.download_manager import DownloadMode +from .download.streaming_download_manager import StreamingDownloadManager, xbasename, xglob, xjoin +from .exceptions import DataFilesNotFoundError, DatasetNotFoundError +from .features import Features +from .features.features import _fix_for_backward_compatible_features +from .fingerprint import Hasher +from .info import DatasetInfo, DatasetInfosDict +from .iterable_dataset import IterableDataset +from .naming import camelcase_to_snakecase, snakecase_to_camelcase +from .packaged_modules import ( + _EXTENSION_TO_MODULE, + _MODULE_TO_EXTENSIONS, + _MODULE_TO_METADATA_FILE_NAMES, + _PACKAGED_DATASETS_MODULES, +) +from .packaged_modules.folder_based_builder.folder_based_builder import FolderBasedBuilder +from .splits import Split +from .utils import _dataset_viewer +from .utils.file_utils import ( + _raise_if_offline_mode_is_enabled, + cached_path, + get_datasets_user_agent, + is_relative_path, + relative_to_absolute_path, +) +from .utils.hub import hf_dataset_url +from .utils.info_utils import VerificationMode, is_small_dataset +from .utils.logging import get_logger +from .utils.metadata import MetadataConfigs +from .utils.typing import PathLike +from .utils.version import Version + + +logger = get_logger(__name__) + +ALL_ALLOWED_EXTENSIONS = list(_EXTENSION_TO_MODULE.keys()) + [".zip"] + + +class _InitializeConfiguredDatasetBuilder: + """ + From https://stackoverflow.com/questions/4647566/pickle-a-dynamically-parameterized-sub-class + See also ConfiguredDatasetBuilder.__reduce__ + When called with the param value as the only argument, 
def configure_builder_class(
    builder_cls: type[DatasetBuilder],
    builder_configs: list[BuilderConfig],
    default_config_name: Optional[str],
    dataset_name: str,
) -> type[DatasetBuilder]:
    """
    Derive a subclass of `builder_cls` whose `BUILDER_CONFIGS` and `DEFAULT_CONFIG_NAME` class
    variables are set to `builder_configs` and `default_config_name`.

    The subclass is named after the capitalized name of `builder_cls` followed by the camel-cased
    `dataset_name`, and reports the module of `builder_cls` as its own.
    """

    class ConfiguredDatasetBuilder(builder_cls):
        BUILDER_CONFIGS = builder_configs
        DEFAULT_CONFIG_NAME = default_config_name

        # so that the actual packaged builder can be imported
        __module__ = builder_cls.__module__

        def __reduce__(self):
            # The class is created dynamically: pickle rebuilds it by calling
            # _InitializeConfiguredDatasetBuilder with the parent class and the parameters used here
            base_builder_cls = self.__class__.__mro__[1]
            rebuild_args = (
                base_builder_cls,
                self.BUILDER_CONFIGS,
                self.DEFAULT_CONFIG_NAME,
                self.dataset_name,
            )
            return _InitializeConfiguredDatasetBuilder(), rebuild_args, self.__dict__.copy()

    configured_name = f"{builder_cls.__name__.lower().capitalize()}{snakecase_to_camelcase(dataset_name)}"
    ConfiguredDatasetBuilder.__name__ = configured_name
    ConfiguredDatasetBuilder.__qualname__ = configured_name

    return ConfiguredDatasetBuilder
def get_dataset_builder_class(
    dataset_module: "DatasetModule", dataset_name: Optional[str] = None
) -> type[DatasetBuilder]:
    """
    Import the builder class of `dataset_module`, configured with its builder configs if it has any.

    Args:
        dataset_module: module whose `module_path` is imported to find the builder class.
        dataset_name: name used for the configured class. Falls back to the `"dataset_name"` entry of
            `dataset_module.builder_kwargs`.

    Raises:
        ValueError: if the module has builder configs but no dataset name could be found.
    """
    builder_cls = import_main_class(dataset_module.module_path)
    configs_parameters = dataset_module.builder_configs_parameters
    if not configs_parameters.builder_configs:
        # Nothing to configure: use the imported class as is
        return builder_cls
    resolved_name = dataset_name or dataset_module.builder_kwargs.get("dataset_name")
    if resolved_name is None:
        raise ValueError("dataset_name should be specified but got None")
    return configure_builder_class(
        builder_cls,
        builder_configs=configs_parameters.builder_configs,
        default_config_name=configs_parameters.default_config_name,
        dataset_name=resolved_name,
    )


def increase_load_count(name: str):
    """Update the download count of a dataset."""
    if config.HF_HUB_OFFLINE or not config.HF_UPDATE_DOWNLOAD_COUNTS:
        return
    try:
        send_head = get_session().head
        send_head(
            "/".join((config.S3_DATASETS_BUCKET_PREFIX, name, name + ".py")),
            headers={"User-Agent": get_datasets_user_agent()},
            timeout=3,
        )
    except Exception:
        # Best effort only: a failure here is deliberately ignored
        pass
def infer_module_for_data_files_list_in_archives(
    data_files_list: DataFilesList, download_config: Optional[DownloadConfig] = None
) -> tuple[Optional[str], dict]:
    """Infer module (and builder kwargs) from the files found inside the ".zip" archives of a list of data files.

    The number of archives that are opened and the number of files listed per archive are both capped
    by `config` limits. The module is the one registered for the most common file extension.

    Args:
        data_files_list (DataFilesList): List of data files. Only paths ending with ".zip" are inspected.
        download_config (DownloadConfig, optional): Forwarded to `xglob` when listing the content of an archive.

    Returns:
        tuple[str, dict[str, Any]]: Tuple with
            - inferred module name
            - dict of builder kwargs
        or `(None, {})` if no module is registered for the most common extension.
    """
    inner_paths = []
    zip_count = 0
    for filepath in data_files_list:
        if not str(filepath).endswith(".zip"):
            continue
        zip_count += 1
        if zip_count > config.GLOBBED_DATA_FILES_MAX_NUMBER_FOR_MODULE_INFERENCE:
            break
        pattern = xjoin(StreamingDownloadManager().extract(filepath), "**")
        matches = xglob(pattern, recursive=True, download_config=download_config)
        for match in matches[: config.ARCHIVED_DATA_FILES_MAX_NUMBER_FOR_MODULE_INFERENCE]:
            # Keep what comes before the first "::" of the globbed path
            inner_paths.append(match.split("::")[0])

    # Count every dot-separated suffix of the file names, e.g. both ".tar" and ".gz"
    extension_counts = Counter()
    for inner_path in inner_paths:
        for suffix in xbasename(inner_path).split(".")[1:]:
            extension_counts["." + suffix.lower()] += 1

    if not extension_counts:
        return None, {}
    top_extension = extension_counts.most_common(1)[0][0]
    if top_extension in _EXTENSION_TO_MODULE:
        return _EXTENSION_TO_MODULE[top_extension]
    return None, {}
def create_builder_configs_from_metadata_configs(
    module_path: str,
    metadata_configs: MetadataConfigs,
    base_path: Optional[str] = None,
    default_builder_kwargs: Optional[dict[str, Any]] = None,
    download_config: Optional[DownloadConfig] = None,
) -> tuple[list[BuilderConfig], str]:
    """Create one builder config per entry of `metadata_configs`.

    Args:
        module_path (str): Module to import to find the builder class and its `BUILDER_CONFIG_CLASS`.
        metadata_configs (MetadataConfigs): Mapping of config name to config parameters.
        base_path (str, *optional*): Root used to resolve `data_dir` and to look for data files when a
            config doesn't define `data_files`. Defaults to an empty string.
        default_builder_kwargs (dict, *optional*): Parameters applied to every config, overridden by the
            parameters of the config itself.
        download_config ([`DownloadConfig`], *optional*): Forwarded to `get_data_patterns`.

    Returns:
        tuple: Tuple with
            - the list of builder configs, in the order of `metadata_configs`
            - the default config name given by `metadata_configs.get_default_config_name()`

    Raises:
        EmptyDatasetError: if no data files match the patterns of a config.
    """
    builder_cls = import_main_class(module_path)
    builder_config_cls = builder_cls.BUILDER_CONFIG_CLASS
    default_config_name = metadata_configs.get_default_config_name()
    builder_configs = []
    default_builder_kwargs = {} if default_builder_kwargs is None else default_builder_kwargs

    base_path = base_path if base_path is not None else ""
    for config_name, config_params in metadata_configs.items():
        config_data_files = config_params.get("data_files")
        config_data_dir = config_params.get("data_dir")
        config_base_path = xjoin(base_path, config_data_dir) if config_data_dir else base_path
        try:
            # Explicit `data_files` take precedence over the patterns found under the config directory
            config_patterns = (
                sanitize_patterns(config_data_files)
                if config_data_files is not None
                else get_data_patterns(config_base_path, download_config=download_config)
            )
            config_data_files_dict = DataFilesPatternsDict.from_patterns(
                config_patterns,
                allowed_extensions=ALL_ALLOWED_EXTENSIONS,
            )
        except EmptyDatasetError as e:
            raise EmptyDatasetError(
                f"Dataset at '{base_path}' doesn't contain data files matching the patterns for config '{config_name}',"
                f" check `data_files` and `data_dir` parameters in the `configs` YAML field in README.md."
            ) from e
        # "default" only marks the default config, it is not a builder parameter
        ignored_params = [
            param for param in config_params if not hasattr(builder_config_cls, param) and param != "default"
        ]
        if ignored_params:
            logger.warning(
                f"Some datasets params were ignored: {ignored_params}. "
                "Make sure to use only valid params for the dataset builder and to have "
                "an up-to-date version of the `datasets` library."
            )
        builder_configs.append(
            builder_config_cls(
                name=config_name,
                data_files=config_data_files_dict,
                data_dir=config_data_dir,
                # Only pass the parameters known to the config class; the ones set explicitly above are excluded
                **{
                    param: value
                    for param, value in {**default_builder_kwargs, **config_params}.items()
                    if hasattr(builder_config_cls, param) and param not in ("default", "data_files", "data_dir")
                },
            )
        )
    return builder_configs, default_config_name
+ """ + + metadata_configs: Optional[MetadataConfigs] = None + builder_configs: Optional[list[BuilderConfig]] = None + default_config_name: Optional[str] = None + + +@dataclass +class DatasetModule: + module_path: str + hash: str + builder_kwargs: dict + builder_configs_parameters: BuilderConfigsParameters = field(default_factory=BuilderConfigsParameters) + dataset_infos: Optional[DatasetInfosDict] = None + + +class _DatasetModuleFactory: + def get_module(self) -> DatasetModule: + raise NotImplementedError + + +class LocalDatasetModuleFactory(_DatasetModuleFactory): + """Get the module of a dataset loaded from the user's data files. The dataset builder module to use is inferred + from the data files extensions.""" + + def __init__( + self, + path: str, + data_dir: Optional[str] = None, + data_files: Optional[Union[str, list, dict]] = None, + download_mode: Optional[Union[DownloadMode, str]] = None, + ): + if data_dir and os.path.isabs(data_dir): + raise ValueError(f"`data_dir` must be relative to a dataset directory's root: {path}") + + self.path = Path(path).as_posix() + self.name = Path(path).stem + self.data_files = data_files + self.data_dir = data_dir + self.download_mode = download_mode + + def get_module(self) -> DatasetModule: + readme_path = os.path.join(self.path, config.REPOCARD_FILENAME) + standalone_yaml_path = os.path.join(self.path, config.REPOYAML_FILENAME) + dataset_card_data = DatasetCard.load(readme_path).data if os.path.isfile(readme_path) else DatasetCardData() + if os.path.exists(standalone_yaml_path): + with open(standalone_yaml_path, encoding="utf-8") as f: + standalone_yaml_data = yaml.safe_load(f.read()) + if standalone_yaml_data: + _dataset_card_data_dict = dataset_card_data.to_dict() + _dataset_card_data_dict.update(standalone_yaml_data) + dataset_card_data = DatasetCardData(**_dataset_card_data_dict) + metadata_configs = MetadataConfigs.from_dataset_card_data(dataset_card_data) + dataset_infos = 
DatasetInfosDict.from_dataset_card_data(dataset_card_data) + # we need a set of data files to find which dataset builder to use + # because we need to infer module name by files extensions + base_path = Path(self.path, self.data_dir or "").expanduser().resolve().as_posix() + if self.data_files is not None: + patterns = sanitize_patterns(self.data_files) + elif metadata_configs and not self.data_dir and "data_files" in next(iter(metadata_configs.values())): + patterns = sanitize_patterns(next(iter(metadata_configs.values()))["data_files"]) + else: + patterns = get_data_patterns(base_path) + data_files = DataFilesDict.from_patterns( + patterns, + base_path=base_path, + allowed_extensions=ALL_ALLOWED_EXTENSIONS, + ) + module_name, default_builder_kwargs = infer_module_for_data_files( + data_files=data_files, + path=self.path, + ) + data_files = data_files.filter( + extensions=_MODULE_TO_EXTENSIONS[module_name], file_names=_MODULE_TO_METADATA_FILE_NAMES[module_name] + ) + module_path, _ = _PACKAGED_DATASETS_MODULES[module_name] + if metadata_configs: + builder_configs, default_config_name = create_builder_configs_from_metadata_configs( + module_path, + metadata_configs, + base_path=base_path, + default_builder_kwargs=default_builder_kwargs, + ) + else: + builder_configs: list[BuilderConfig] = [ + import_main_class(module_path).BUILDER_CONFIG_CLASS( + data_files=data_files, + **default_builder_kwargs, + ) + ] + default_config_name = None + builder_kwargs = { + "base_path": self.path, + "dataset_name": camelcase_to_snakecase(Path(self.path).name), + } + if self.data_dir: + builder_kwargs["data_files"] = data_files + # this file is deprecated and was created automatically in old versions of push_to_hub + if os.path.isfile(os.path.join(self.path, config.DATASETDICT_INFOS_FILENAME)): + with open(os.path.join(self.path, config.DATASETDICT_INFOS_FILENAME), encoding="utf-8") as f: + legacy_dataset_infos = DatasetInfosDict( + { + config_name: 
DatasetInfo.from_dict(dataset_info_dict) + for config_name, dataset_info_dict in json.load(f).items() + } + ) + if len(legacy_dataset_infos) == 1: + # old config e.g. named "username--dataset_name" + legacy_config_name = next(iter(legacy_dataset_infos)) + legacy_dataset_infos["default"] = legacy_dataset_infos.pop(legacy_config_name) + legacy_dataset_infos.update(dataset_infos) + dataset_infos = legacy_dataset_infos + if default_config_name is None and len(dataset_infos) == 1: + default_config_name = next(iter(dataset_infos)) + + hash = Hasher.hash({"dataset_infos": dataset_infos, "builder_configs": builder_configs}) + return DatasetModule( + module_path, + hash, + builder_kwargs, + dataset_infos=dataset_infos, + builder_configs_parameters=BuilderConfigsParameters( + metadata_configs=metadata_configs, + builder_configs=builder_configs, + default_config_name=default_config_name, + ), + ) + + +class PackagedDatasetModuleFactory(_DatasetModuleFactory): + """Get the dataset builder module from the ones that are packaged with the library: csv, json, etc.""" + + def __init__( + self, + name: str, + data_dir: Optional[str] = None, + data_files: Optional[Union[str, list, dict]] = None, + download_config: Optional[DownloadConfig] = None, + download_mode: Optional[Union[DownloadMode, str]] = None, + ): + self.name = name + self.data_files = data_files + self.data_dir = data_dir + self.download_config = download_config + self.download_mode = download_mode + increase_load_count(name) + + def get_module(self) -> DatasetModule: + base_path = Path(self.data_dir or "").expanduser().resolve().as_posix() + patterns = ( + sanitize_patterns(self.data_files) + if self.data_files is not None + else get_data_patterns(base_path, download_config=self.download_config) + ) + data_files = DataFilesDict.from_patterns( + patterns, + download_config=self.download_config, + base_path=base_path, + ) + + module_path, hash = _PACKAGED_DATASETS_MODULES[self.name] + + builder_kwargs = { + "data_files": 
class HubDatasetModuleFactory(_DatasetModuleFactory):
    """
    Build the dataset module of a Hub dataset repository that is made of data files.

    The repository is read at the pinned `commit_hash`, and the packaged builder to use
    is inferred from the extensions of the resolved data files.
    """

    def __init__(
        self,
        name: str,
        commit_hash: str,
        data_dir: Optional[str] = None,
        data_files: Optional[Union[str, list, dict]] = None,
        download_config: Optional[DownloadConfig] = None,
        download_mode: Optional[Union[DownloadMode, str]] = None,
        use_exported_dataset_infos: bool = False,
    ):
        self.name = name
        self.commit_hash = commit_hash
        self.data_dir = data_dir
        self.data_files = data_files
        # Always hold a DownloadConfig so `.token`, `.proxies`, ... can be read unconditionally.
        self.download_config = download_config or DownloadConfig()
        self.download_mode = download_mode
        self.use_exported_dataset_infos = use_exported_dataset_infos
        increase_load_count(name)

    def _download_config_with_desc(self, desc: str) -> DownloadConfig:
        """Return a copy of the download config, using `desc` only if no description is set yet."""
        described_config = self.download_config.copy()
        if described_config.download_desc is None:
            described_config.download_desc = desc
        return described_config

    def _load_dataset_card_data(self):
        """Read the dataset card at the pinned commit, then overlay the standalone YAML file if any."""
        api = HfApi(
            endpoint=config.HF_ENDPOINT,
            token=self.download_config.token,
            library_name="datasets",
            library_version=__version__,
            user_agent=get_datasets_user_agent(self.download_config.user_agent),
        )
        try:
            readme_path = api.hf_hub_download(
                repo_id=self.name,
                filename=config.REPOCARD_FILENAME,
                repo_type="dataset",
                revision=self.commit_hash,
                proxies=self.download_config.proxies,
            )
            card_data = DatasetCard.load(readme_path).data
        except EntryNotFoundError:
            # No dataset card in the repository: start from empty card data.
            card_data = DatasetCardData()
        try:
            yaml_path = cached_path(
                hf_dataset_url(self.name, config.REPOYAML_FILENAME, revision=self.commit_hash),
                download_config=self._download_config_with_desc("Downloading standalone yaml"),
            )
            with open(yaml_path, encoding="utf-8") as yaml_file:
                yaml_overrides = yaml.safe_load(yaml_file.read())
            if yaml_overrides:
                # Keys of the standalone YAML take precedence over those of the dataset card.
                merged_card_dict = card_data.to_dict()
                merged_card_dict.update(yaml_overrides)
                card_data = DatasetCardData(**merged_card_dict)
        except FileNotFoundError:
            # The standalone YAML file is optional.
            pass
        return card_data

    def _fetch_exported_dataset_infos(self):
        """Return the dataset infos exported by the dataset viewer, or None if disabled or unavailable."""
        if not (config.USE_PARQUET_EXPORT and self.use_exported_dataset_infos):
            return None
        try:
            raw_infos = _dataset_viewer.get_exported_dataset_infos(
                dataset=self.name, commit_hash=self.commit_hash, token=self.download_config.token
            )
            return DatasetInfosDict(
                {config_name: DatasetInfo.from_dict(raw_infos[config_name]) for config_name in raw_infos}
            )
        except _dataset_viewer.DatasetViewerError:
            return None

    def _resolve_patterns(self, metadata_configs, base_path: str):
        """Pick the data file patterns: explicit `data_files`, else first metadata config, else auto-detection."""
        if self.data_files is not None:
            return sanitize_patterns(self.data_files)
        if metadata_configs and not self.data_dir:
            first_metadata_config = next(iter(metadata_configs.values()))
            if "data_files" in first_metadata_config:
                return sanitize_patterns(first_metadata_config["data_files"])
        return get_data_patterns(base_path, download_config=self.download_config)

    def _with_legacy_dataset_infos(self, dataset_infos):
        """Overlay `dataset_infos` on top of the deprecated dataset infos JSON file when the file exists."""
        try:
            # this file is deprecated and was created automatically in old versions of push_to_hub
            legacy_infos_path = cached_path(
                hf_dataset_url(self.name, config.DATASETDICT_INFOS_FILENAME, revision=self.commit_hash),
                download_config=self._download_config_with_desc("Downloading metadata"),
            )
            with open(legacy_infos_path, encoding="utf-8") as legacy_file:
                legacy_infos = DatasetInfosDict(
                    {
                        config_name: DatasetInfo.from_dict(info_dict)
                        for config_name, info_dict in json.load(legacy_file).items()
                    }
                )
            if len(legacy_infos) == 1:
                # old config e.g. named "username--dataset_name"
                only_config_name = next(iter(legacy_infos))
                legacy_infos["default"] = legacy_infos.pop(only_config_name)
            legacy_infos.update(dataset_infos)
            return legacy_infos
        except FileNotFoundError:
            return dataset_infos

    def get_module(self) -> DatasetModule:
        """Resolve card data, dataset infos, data files and builder configs into a `DatasetModule`."""
        # Get the Dataset Card at the fixed revision in case there are new commits in the meantime
        dataset_card_data = self._load_dataset_card_data()
        base_path = f"hf://datasets/{self.name}@{self.commit_hash}/{self.data_dir or ''}".rstrip("/")
        metadata_configs = MetadataConfigs.from_dataset_card_data(dataset_card_data)
        dataset_infos = DatasetInfosDict.from_dataset_card_data(dataset_card_data)

        exported_infos = self._fetch_exported_dataset_infos()
        if exported_infos:
            # Infos declared in the dataset card win over the exported ones.
            exported_infos.update(dataset_infos)
            dataset_infos = exported_infos

        # we need a set of data files to find which dataset builder to use
        # because we need to infer module name by files extensions
        data_files = DataFilesDict.from_patterns(
            self._resolve_patterns(metadata_configs, base_path),
            base_path=base_path,
            allowed_extensions=ALL_ALLOWED_EXTENSIONS,
            download_config=self.download_config,
        )
        module_name, default_builder_kwargs = infer_module_for_data_files(
            data_files=data_files,
            path=self.name,
            download_config=self.download_config,
        )
        data_files = data_files.filter(
            extensions=_MODULE_TO_EXTENSIONS[module_name], file_names=_MODULE_TO_METADATA_FILE_NAMES[module_name]
        )
        module_path, _ = _PACKAGED_DATASETS_MODULES[module_name]

        if not metadata_configs:
            default_config_name = None
            builder_configs: list[BuilderConfig] = [
                import_main_class(module_path).BUILDER_CONFIG_CLASS(
                    data_files=data_files,
                    **default_builder_kwargs,
                )
            ]
        else:
            builder_configs, default_config_name = create_builder_configs_from_metadata_configs(
                module_path,
                metadata_configs,
                base_path=base_path,
                default_builder_kwargs=default_builder_kwargs,
                download_config=self.download_config,
            )

        builder_kwargs = {
            "base_path": hf_dataset_url(self.name, "", revision=self.commit_hash).rstrip("/"),
            "repo_id": self.name,
            "dataset_name": camelcase_to_snakecase(Path(self.name).name),
        }
        if self.data_dir:
            builder_kwargs["data_files"] = data_files

        dataset_infos = self._with_legacy_dataset_infos(dataset_infos)
        if default_config_name is None and len(dataset_infos) == 1:
            default_config_name = next(iter(dataset_infos))

        configs_parameters = BuilderConfigsParameters(
            metadata_configs=metadata_configs,
            builder_configs=builder_configs,
            default_config_name=default_config_name,
        )
        return DatasetModule(
            module_path,
            self.commit_hash,
            builder_kwargs,
            dataset_infos=dataset_infos,
            builder_configs_parameters=configs_parameters,
        )
class CachedDatasetModuleFactory(_DatasetModuleFactory):
    """
    Get the module of a dataset that has been loaded once already and cached.

    Args:
        name (`str`): Dataset name, with at most one `/` (i.e. `"dataset_name"` or
            `"namespace/dataset_name"`).
        cache_dir (`str`, *optional*): Cache root to search. When not set,
            `config.HF_DATASETS_CACHE` is used.
    """

    def __init__(
        self,
        name: str,
        cache_dir: Optional[str] = None,
    ):
        self.name = name
        self.cache_dir = cache_dir
        assert self.name.count("/") <= 1, f"Expected at most one '/' in the dataset name, got '{name}'"

    def get_module(self) -> DatasetModule:
        """Return the module of the generic cache builder if the dataset has cached directories.

        Returns:
            `DatasetModule` pointing at `datasets.packaged_modules.cache.cache` with
            `"auto"` as hash and version.

        Raises:
            FileNotFoundError: If no directory is found three levels below the
                dataset's root in the cache directory.
        """
        cache_dir = os.path.expanduser(str(self.cache_dir or config.HF_DATASETS_CACHE))
        namespace_and_dataset_name = self.name.split("/")
        namespace_and_dataset_name[-1] = camelcase_to_snakecase(namespace_and_dataset_name[-1])
        cached_datasets_directory_path_root = os.path.join(cache_dir, "___".join(namespace_and_dataset_name))
        # Escape the literal part of the path so that glob metacharacters ("[", "?", "*")
        # in the cache directory are matched literally instead of being read as a pattern.
        cached_directories_pattern = os.path.join(glob.escape(cached_datasets_directory_path_root), "*", "*", "*")
        # Only the existence of one cached directory matters, so stop at the first match.
        has_cached_directory = any(
            os.path.isdir(candidate_path) for candidate_path in glob.iglob(cached_directories_pattern)
        )
        if not has_cached_directory:
            # Report the directory that was actually searched (`self.cache_dir` may be None).
            raise FileNotFoundError(f"Dataset {self.name} is not cached in {cache_dir}")

        warning_msg = f"Using the latest cached version of the dataset since {self.name} couldn't be found on the Hugging Face Hub"
        if config.HF_HUB_OFFLINE:
            warning_msg += " (offline mode is enabled)."
        logger.warning(warning_msg)
        builder_kwargs = {
            "repo_id": self.name,
            "dataset_name": self.name.split("/")[-1],
            "version": "auto",
        }
        return DatasetModule(
            "datasets.packaged_modules.cache.cache",
            "auto",
            builder_kwargs,
        )
+ Depending on ``path``, the dataset builder that is used comes from one of the generic dataset builders (JSON, CSV, Parquet, text etc.). + + For local datasets: + + - if ``path`` is a local directory (containing data files only) + -> load a generic dataset builder (csv, json, text etc.) based on the content of the directory + e.g. ``'./path/to/directory/with/my/csv/data'``. + + For datasets on the Hugging Face Hub (list all available datasets with ``huggingface_hub.list_datasets()``) + + - if ``path`` is a dataset repository on the HF hub (containing data files only) + -> load a generic dataset builder (csv, text etc.) based on the content of the repository + e.g. ``'username/dataset_name'``, a dataset repository on the HF hub containing your data files. + + revision (:class:`~utils.Version` or :obj:`str`, optional): Version of the dataset to load. + As datasets have their own git repository on the Datasets Hub, the default version "main" corresponds to their "main" branch. + You can specify a different version than the default "main" by using a commit SHA or a git tag of the dataset repository. + download_config (:class:`DownloadConfig`, optional): Specific download configuration parameters. + download_mode (:class:`DownloadMode` or :obj:`str`, default ``REUSE_DATASET_IF_EXISTS``): Download/generate mode. + data_dir (:obj:`str`, optional): Directory with the data files. Used only if `data_files` is not specified, + in which case it's equal to pass `os.path.join(data_dir, "**")` as `data_files`. + data_files (:obj:`Union[Dict, List, str]`, optional): Defining the data_files of the dataset configuration. + cache_dir (`str`, *optional*): + Directory to read/write data. Defaults to `"~/.cache/huggingface/datasets"`. + + + + **download_kwargs (additional keyword arguments): optional attributes for DownloadConfig() which will override + the attributes in download_config if supplied. 
+ + Returns: + DatasetModule + """ + if download_config is None: + download_config = DownloadConfig(**download_kwargs) + download_mode = DownloadMode(download_mode or DownloadMode.REUSE_DATASET_IF_EXISTS) + download_config.extract_compressed_file = True + download_config.force_extract = True + download_config.force_download = download_mode == DownloadMode.FORCE_REDOWNLOAD + + filename = list(filter(lambda x: x, path.replace(os.sep, "/").split("/")))[-1] + if not filename.endswith(".py"): + filename = filename + ".py" + combined_path = os.path.join(path, filename) + + # We have several ways to get a dataset builder: + # + # - if path is the name of a packaged dataset module + # -> use the packaged module (json, csv, etc.) + # + # - if os.path.join(path, name) is a local python file + # -> use the module from the python file + # - if path is a local directory (but no python file) + # -> use a packaged module (csv, text etc.) based on content of the directory + # + # - if path has one "/" and is dataset repository on the HF hub with a python file + # -> the module from the python file in the dataset repository + # - if path has one "/" and is dataset repository on the HF hub without a python file + # -> use a packaged module (csv, text etc.) 
based on content of the repository + + # Try packaged + if path in _PACKAGED_DATASETS_MODULES: + return PackagedDatasetModuleFactory( + path, + data_dir=data_dir, + data_files=data_files, + download_config=download_config, + download_mode=download_mode, + ).get_module() + # Try locally + elif path.endswith(filename): + raise RuntimeError(f"Dataset scripts are no longer supported, but found {filename}") + elif os.path.isfile(combined_path): + raise RuntimeError(f"Dataset scripts are no longer supported, but found {filename}") + elif os.path.isdir(path): + return LocalDatasetModuleFactory( + path, data_dir=data_dir, data_files=data_files, download_mode=download_mode + ).get_module() + # Try remotely + elif is_relative_path(path) and path.count("/") <= 1: + try: + # Get the Dataset Card + get the revision + check authentication all at in one call + # We fix the commit_hash in case there are new commits in the meantime + api = HfApi( + endpoint=config.HF_ENDPOINT, + token=download_config.token, + library_name="datasets", + library_version=__version__, + user_agent=get_datasets_user_agent(download_config.user_agent), + ) + try: + _raise_if_offline_mode_is_enabled() + dataset_readme_path = api.hf_hub_download( + repo_id=path, + filename=config.REPOCARD_FILENAME, + repo_type="dataset", + revision=revision, + proxies=download_config.proxies, + ) + commit_hash = os.path.basename(os.path.dirname(dataset_readme_path)) + except LocalEntryNotFoundError as e: + if isinstance( + e.__cause__, + ( + OfflineModeIsEnabled, + requests.exceptions.Timeout, + requests.exceptions.ConnectionError, + httpx.ConnectError, + httpx.TimeoutException, + ), + ): + raise ConnectionError(f"Couldn't reach '{path}' on the Hub ({e.__class__.__name__})") from e + else: + raise + except EntryNotFoundError: + commit_hash = api.dataset_info( + path, + revision=revision, + timeout=100.0, + ).sha + except ( + OfflineModeIsEnabled, + requests.exceptions.Timeout, + requests.exceptions.ConnectionError, + 
httpx.ConnectError, + httpx.TimeoutException, + ) as e: + raise ConnectionError(f"Couldn't reach '{path}' on the Hub ({e.__class__.__name__})") from e + except GatedRepoError as e: + message = f"Dataset '{path}' is a gated dataset on the Hub." + if e.response.status_code == 401: + message += " You must be authenticated to access it." + elif e.response.status_code == 403: + message += f" Visit the dataset page at https://huggingface.co/datasets/{path} to ask for access." + raise DatasetNotFoundError(message) from e + except RevisionNotFoundError as e: + raise DatasetNotFoundError( + f"Revision '{revision}' doesn't exist for dataset '{path}' on the Hub." + ) from e + except RepositoryNotFoundError as e: + raise DatasetNotFoundError(f"Dataset '{path}' doesn't exist on the Hub or cannot be accessed.") from e + try: + api.hf_hub_download( + repo_id=path, + filename=filename, + repo_type="dataset", + revision=commit_hash, + proxies=download_config.proxies, + ) + raise RuntimeError(f"Dataset scripts are no longer supported, but found {filename}") + except EntryNotFoundError: + # Use the infos from the parquet export except in some cases: + if data_dir or data_files or (revision and revision != "main"): + use_exported_dataset_infos = False + else: + use_exported_dataset_infos = True + return HubDatasetModuleFactory( + path, + commit_hash=commit_hash, + data_dir=data_dir, + data_files=data_files, + download_config=download_config, + download_mode=download_mode, + use_exported_dataset_infos=use_exported_dataset_infos, + ).get_module() + except GatedRepoError as e: + message = f"Dataset '{path}' is a gated dataset on the Hub." + if e.response.status_code == 401: + message += " You must be authenticated to access it." + elif e.response.status_code == 403: + message += f" Visit the dataset page at https://huggingface.co/datasets/{path} to ask for access." 
def load_dataset_builder(
    path: str,
    name: Optional[str] = None,
    data_dir: Optional[str] = None,
    data_files: Optional[Union[str, Sequence[str], Mapping[str, Union[str, Sequence[str]]]]] = None,
    cache_dir: Optional[str] = None,
    features: Optional[Features] = None,
    download_config: Optional[DownloadConfig] = None,
    download_mode: Optional[Union[DownloadMode, str]] = None,
    revision: Optional[Union[str, Version]] = None,
    token: Optional[Union[bool, str]] = None,
    storage_options: Optional[dict] = None,
    **config_kwargs,
) -> DatasetBuilder:
    """Create the dataset builder of a dataset, which can then be used to:

    - Inspect general information that is required to build a dataset (cache directory, config, dataset info, features, data files, etc.)
    - Download and prepare the dataset as Arrow files in the cache
    - Get a streaming dataset without downloading or caching anything

    The list of datasets is available on the [Hub](https://huggingface.co/datasets) or with [`huggingface_hub.list_datasets`].

    A dataset is a directory that contains some data files in generic formats (JSON, CSV, Parquet, etc.) and possibly
    in a generic structure (Webdataset, ImageFolder, AudioFolder, VideoFolder, etc.)

    Args:

        path (`str`):
            Path or name of the dataset.

            - if `path` is a dataset repository on the HF hub (list all available datasets with [`huggingface_hub.list_datasets`])
              -> load the dataset builder from supported files in the repository (csv, json, parquet, etc.)
              e.g. `'username/dataset_name'`, a dataset repository on the HF hub containing the data files.

            - if `path` is a local directory
              -> load the dataset builder from supported files in the directory (csv, json, parquet, etc.)
              e.g. `'./path/to/directory/with/my/csv/data'`.

            - if `path` is the name of a dataset builder and `data_files` or `data_dir` is specified
              (available builders are "json", "csv", "parquet", "arrow", "text", "xml", "webdataset", "imagefolder", "audiofolder", "videofolder")
              -> load the dataset builder from the files in `data_files` or `data_dir`
              e.g. `'parquet'`.

        name (`str`, *optional*):
            Name of the dataset configuration.
        data_dir (`str`, *optional*):
            The `data_dir` of the dataset configuration. If specified for the generic builders (csv, text etc.) or the Hub datasets and `data_files` is `None`,
            the behavior is equal to passing `os.path.join(data_dir, **)` as `data_files` to reference all the files in a directory.
        data_files (`str` or `Sequence` or `Mapping`, *optional*):
            Path(s) to source data file(s).
        cache_dir (`str`, *optional*):
            Directory to read/write data. Defaults to `"~/.cache/huggingface/datasets"`.
        features ([`Features`], *optional*):
            Features type to use for this dataset.
        download_config ([`DownloadConfig`], *optional*):
            Specific download configuration parameters.
        download_mode ([`DownloadMode`] or `str`, defaults to `REUSE_DATASET_IF_EXISTS`):
            Download/generate mode.
        revision ([`Version`] or `str`, *optional*):
            Version of the dataset to load.
            As datasets have their own git repository on the Datasets Hub, the default version "main" corresponds to their "main" branch.
            You can specify a different version than the default "main" by using a commit SHA or a git tag of the dataset repository.
        token (`str` or `bool`, *optional*):
            Optional string or boolean to use as Bearer token for remote files on the Datasets Hub.
            If `True`, or not specified, will get token from `"~/.huggingface"`.
        storage_options (`dict`, *optional*, defaults to `None`):
            **Experimental**. Key/value pairs to be passed on to the dataset file-system backend, if any.
        **config_kwargs (additional keyword arguments):
            Keyword arguments to be passed to the [`BuilderConfig`]
            and used in the [`DatasetBuilder`].

    Returns:
        [`DatasetBuilder`]

    Raises:
        ValueError: If `path` is the name of a packaged builder and no data files could be determined.

    Example:

    ```py
    >>> from datasets import load_dataset_builder
    >>> ds_builder = load_dataset_builder('cornell-movie-review-data/rotten_tomatoes')
    >>> ds_builder.info.features
    {'label': ClassLabel(names=['neg', 'pos']),
     'text': Value('string')}
    ```
    """
    download_mode = DownloadMode(download_mode or DownloadMode.REUSE_DATASET_IF_EXISTS)
    # `token` and `storage_options` overrides are applied on a copy of the given download config.
    if token is not None:
        if download_config:
            download_config = download_config.copy()
        else:
            download_config = DownloadConfig()
        download_config.token = token
    if storage_options is not None:
        if download_config:
            download_config = download_config.copy()
        else:
            download_config = DownloadConfig()
        download_config.storage_options.update(storage_options)
    if features is not None:
        features = _fix_for_backward_compatible_features(features)

    dataset_module = dataset_module_factory(
        path,
        revision=revision,
        download_config=download_config,
        download_mode=download_mode,
        data_dir=data_dir,
        data_files=data_files,
        cache_dir=cache_dir,
    )

    # Get dataset builder class
    # The pops below modify `dataset_module.builder_kwargs` in place; whatever is left
    # afterwards is forwarded to the builder constructor as extra keyword arguments.
    module_builder_kwargs = dataset_module.builder_kwargs
    data_dir = module_builder_kwargs.pop("data_dir", data_dir)
    data_files = module_builder_kwargs.pop("data_files", data_files)
    fallback_config_name = name or dataset_module.builder_configs_parameters.default_config_name
    config_name = module_builder_kwargs.pop("config_name", fallback_config_name)
    dataset_name = module_builder_kwargs.pop("dataset_name", None)
    if dataset_module.dataset_infos:
        info = dataset_module.dataset_infos.get(config_name)
    else:
        info = None

    # A packaged builder (csv, json, ...) is unusable without any data files.
    if path in _PACKAGED_DATASETS_MODULES and data_files is None:
        if dataset_module.builder_configs_parameters.builder_configs[0].data_files is None:
            error_msg = f"Please specify the data files or data directory to load for the {path} dataset builder."
            example_extension = next(
                (extension for extension in _EXTENSION_TO_MODULE if _EXTENSION_TO_MODULE[extension] == path),
                None,
            )
            if example_extension is not None:
                error_msg += f'\nFor example `data_files={{"train": "path/to/data/train/*.{example_extension}"}}`'
            raise ValueError(error_msg)

    builder_cls = get_dataset_builder_class(dataset_module, dataset_name=dataset_name)
    # Instantiate the dataset builder
    builder_instance: DatasetBuilder = builder_cls(
        cache_dir=cache_dir,
        dataset_name=dataset_name,
        config_name=config_name,
        data_dir=data_dir,
        data_files=data_files,
        hash=dataset_module.hash,
        info=info,
        features=features,
        token=token,
        storage_options=storage_options,
        **module_builder_kwargs,
        **config_kwargs,
    )
    builder_instance._use_legacy_cache_dir_if_possible(dataset_module)
    return builder_instance
+ + A dataset is a directory that contains some data files in generic formats (JSON, CSV, Parquet, etc.) and possibly + in a generic structure (Webdataset, ImageFolder, AudioFolder, VideoFolder, etc.) + + This function does the following under the hood: + + 1. Load a dataset builder: + + * Find the most common data format in the dataset and pick its associated builder (JSON, CSV, Parquet, Webdataset, ImageFolder, AudioFolder, etc.) + * Find which file goes into which split (e.g. train/test) based on file and directory names or on the YAML configuration + * It is also possible to specify `data_files` manually, and which dataset builder to use (e.g. "parquet"). + + 2. Run the dataset builder: + + In the general case: + + * Download the data files from the dataset if they are not already available locally or cached. + * Process and cache the dataset in typed Arrow tables for caching. + + Arrow table are arbitrarily long, typed tables which can store nested objects and be mapped to numpy/pandas/python generic types. + They can be directly accessed from disk, loaded in RAM or even streamed over the web. + + In the streaming case: + + * Don't download or cache anything. Instead, the dataset is lazily loaded and will be streamed on-the-fly when iterating on it. + + 3. Return a dataset built from the requested splits in `split` (default: all). + + Args: + + path (`str`): + Path or name of the dataset. + + - if `path` is a dataset repository on the HF hub (list all available datasets with [`huggingface_hub.list_datasets`]) + -> load the dataset from supported files in the repository (csv, json, parquet, etc.) + e.g. `'username/dataset_name'`, a dataset repository on the HF hub containing the data files. + + - if `path` is a local directory + -> load the dataset from supported files in the directory (csv, json, parquet, etc.) + e.g. `'./path/to/directory/with/my/csv/data'`. 
+ + - if `path` is the name of a dataset builder and `data_files` or `data_dir` is specified + (available builders are "json", "csv", "parquet", "arrow", "text", "xml", "webdataset", "imagefolder", "audiofolder", "videofolder") + -> load the dataset from the files in `data_files` or `data_dir` + e.g. `'parquet'`. + + name (`str`, *optional*): + Defining the name of the dataset configuration. + data_dir (`str`, *optional*): + Defining the `data_dir` of the dataset configuration. If specified for the generic builders (csv, text etc.) or the Hub datasets and `data_files` is `None`, + the behavior is equal to passing `os.path.join(data_dir, **)` as `data_files` to reference all the files in a directory. + data_files (`str` or `Sequence` or `Mapping`, *optional*): + Path(s) to source data file(s). + split (`Split` or `str`): + Which split of the data to load. + If `None`, will return a `dict` with all splits (typically `datasets.Split.TRAIN` and `datasets.Split.TEST`). + If given, will return a single Dataset. + Splits can be combined and specified like in tensorflow-datasets. + cache_dir (`str`, *optional*): + Directory to read/write data. Defaults to `"~/.cache/huggingface/datasets"`. + features (`Features`, *optional*): + Set the features type to use for this dataset. + download_config ([`DownloadConfig`], *optional*): + Specific download configuration parameters. + download_mode ([`DownloadMode`] or `str`, defaults to `REUSE_DATASET_IF_EXISTS`): + Download/generate mode. + verification_mode ([`VerificationMode`] or `str`, defaults to `BASIC_CHECKS`): + Verification mode determining the checks to run on the downloaded/processed dataset information (checksums/size/splits/...). + + + keep_in_memory (`bool`, defaults to `None`): + Whether to copy the dataset in-memory. If `None`, the dataset + will not be copied in-memory unless explicitly enabled by setting `datasets.config.IN_MEMORY_MAX_SIZE` to + nonzero. 
See more details in the [improve performance](../cache#improve-performance) section. + revision ([`Version`] or `str`, *optional*): + Version of the dataset to load. + As datasets have their own git repository on the Datasets Hub, the default version "main" corresponds to their "main" branch. + You can specify a different version than the default "main" by using a commit SHA or a git tag of the dataset repository. + token (`str` or `bool`, *optional*): + Optional string or boolean to use as Bearer token for remote files on the Datasets Hub. + If `True`, or not specified, will get token from `"~/.huggingface"`. + streaming (`bool`, defaults to `False`): + If set to `True`, don't download the data files. Instead, it streams the data progressively while + iterating on the dataset. An [`IterableDataset`] or [`IterableDatasetDict`] is returned instead in this case. + + Note that streaming works for datasets that use data formats that support being iterated over like txt, csv, jsonl for example. + Json files may be downloaded completely. Also streaming from remote zip or gzip files is supported but other compressed formats + like rar and xz are not yet supported. The tgz format doesn't allow streaming. + num_proc (`int`, *optional*, defaults to `None`): + Number of processes when downloading and generating the dataset locally. + Multiprocessing is disabled by default. + + + storage_options (`dict`, *optional*, defaults to `None`): + **Experimental**. Key/value pairs to be passed on to the dataset file-system backend, if any. + + + **config_kwargs (additional keyword arguments): + Keyword arguments to be passed to the `BuilderConfig` + and used in the [`DatasetBuilder`]. + + Returns: + [`Dataset`] or [`DatasetDict`]: + - if `split` is not `None`: the dataset requested, + - if `split` is `None`, a [`~datasets.DatasetDict`] with each split. 
+ + or [`IterableDataset`] or [`IterableDatasetDict`]: if `streaming=True` + + - if `split` is not `None`, the dataset is requested + - if `split` is `None`, a [`~datasets.streaming.IterableDatasetDict`] with each split. + + Example: + + Load a dataset from the Hugging Face Hub: + + ```py + >>> from datasets import load_dataset + >>> ds = load_dataset('cornell-movie-review-data/rotten_tomatoes', split='train') + + # Load a subset or dataset configuration (here 'sst2') + >>> from datasets import load_dataset + >>> ds = load_dataset('nyu-mll/glue', 'sst2', split='train') + + # Manual mapping of data files to splits + >>> data_files = {'train': 'train.csv', 'test': 'test.csv'} + >>> ds = load_dataset('namespace/your_dataset_name', data_files=data_files) + + # Manual selection of a directory to load + >>> ds = load_dataset('namespace/your_dataset_name', data_dir='folder_name') + ``` + + Load a local dataset: + + ```py + # Load a CSV file + >>> from datasets import load_dataset + >>> ds = load_dataset('csv', data_files='path/to/local/my_dataset.csv') + + # Load a JSON file + >>> from datasets import load_dataset + >>> ds = load_dataset('json', data_files='path/to/local/my_dataset.json') + ``` + + Load an [`~datasets.IterableDataset`]: + + ```py + >>> from datasets import load_dataset + >>> ds = load_dataset('cornell-movie-review-data/rotten_tomatoes', split='train', streaming=True) + ``` + + Load an image dataset with the `ImageFolder` dataset builder: + + ```py + >>> from datasets import load_dataset + >>> ds = load_dataset('imagefolder', data_dir='/path/to/images', split='train') + ``` + """ + if "trust_remote_code" in config_kwargs: + if config_kwargs.pop("trust_remote_code"): + logger.error( + "`trust_remote_code` is not supported anymore.\n" + f"Please check that the Hugging Face dataset '{path}' isn't based on a loading script and remove `trust_remote_code`.\n" + "If the dataset is based on a loading script, please ask the dataset author to remove it and convert 
def load_from_disk(
    dataset_path: PathLike, keep_in_memory: Optional[bool] = None, storage_options: Optional[dict] = None
) -> Union[Dataset, DatasetDict]:
    """
    Reload a dataset or dataset dict that was written with `save_to_disk`.

    The kind of object to load is detected from the marker files found in
    `dataset_path`: a dataset directory holds both the dataset-info file and the
    state JSON file, while a dataset-dict directory holds the dataset-dict JSON file.

    Args:
        dataset_path (`path-like`):
            Local path (e.g. `"dataset/train"`) or remote URI (e.g. `"s3://my-bucket/dataset/train"`)
            of the directory to load from. It is resolved to an fsspec filesystem via `url_to_fs`.
        keep_in_memory (`bool`, defaults to `None`):
            Whether to copy the dataset in-memory. If `None`, the dataset
            will not be copied in-memory unless explicitly enabled by setting
            `datasets.config.IN_MEMORY_MAX_SIZE` to nonzero.
        storage_options (`dict`, *optional*):
            Key/value pairs to be passed on to the file-system backend, if any.

    Returns:
        [`Dataset`] or [`DatasetDict`]:
        - the dataset, if `dataset_path` is a dataset directory,
        - a [`DatasetDict`] with each split, if `dataset_path` is a dataset dict directory.

    Raises:
        FileNotFoundError: If `dataset_path` does not exist, or exists but contains
            neither set of marker files.

    Example:

    ```py
    >>> from datasets import load_from_disk
    >>> ds = load_from_disk('path/to/dataset/directory')
    ```
    """
    filesystem, *_ = url_to_fs(dataset_path, **(storage_options or {}))
    if not filesystem.exists(dataset_path):
        raise FileNotFoundError(f"Directory {dataset_path} not found")

    def _has_file(filename):
        # Marker files live directly under the dataset directory.
        return filesystem.isfile(posixpath.join(dataset_path, filename))

    # The Dataset layout is probed first; the DatasetDict marker is only looked
    # up when the Dataset markers are not both present.
    if _has_file(config.DATASET_INFO_FILENAME) and _has_file(config.DATASET_STATE_JSON_FILENAME):
        loader = Dataset
    elif _has_file(config.DATASETDICT_JSON_FILENAME):
        loader = DatasetDict
    else:
        raise FileNotFoundError(
            f"Directory {dataset_path} is neither a `Dataset` directory nor a `DatasetDict` directory."
        )
    return loader.load_from_disk(dataset_path, keep_in_memory=keep_in_memory, storage_options=storage_options)
# Patterns used by `camelcase_to_snakecase`: the first splits an acronym from the
# word that follows it ("HTMLParser" -> "HTML_Parser"), the second splits a
# lowercase letter or digit from the capital that follows ("someName" -> "some_Name").
_uppercase_uppercase_re = re.compile(r"([A-Z]+)([A-Z][a-z])")
_lowercase_uppercase_re = re.compile(r"([a-z\d])([A-Z])")

# Patterns used by `snakecase_to_camelcase`: a lone underscore separates words,
# whereas a run of two or more underscores is captured so that it survives the split.
_single_underscore_re = re.compile(r"(?<!_)_(?!_)")
_multiple_underscores_re = re.compile(r"(_{2,})")

# A split name is one or more word-character groups joined by dots.
_split_re = r"^\w+(\.\w+)*$"

INVALID_WINDOWS_CHARACTERS_IN_PATH = r"<>:/\|?*"


def camelcase_to_snakecase(name):
    """Convert camel-case string to snake-case."""
    name = _uppercase_uppercase_re.sub(r"\1_\2", name)
    name = _lowercase_uppercase_re.sub(r"\1_\2", name)
    return name.lower()


def snakecase_to_camelcase(name):
    """Convert snake-case string to camel-case string.

    Words are separated by single underscores; runs of several underscores are
    kept verbatim in the output.
    """
    words = _single_underscore_re.split(name)
    pieces = (_multiple_underscores_re.split(word) for word in words)
    return "".join(piece.capitalize() for piece in itertools.chain.from_iterable(pieces) if piece != "")


def filename_prefix_for_name(name):
    """Return the snake-case file prefix for a dataset name.

    Raises:
        ValueError: If `name` contains a directory component.
    """
    if os.path.basename(name) != name:
        raise ValueError(f"Should be a dataset name, not a path: {name}")
    return camelcase_to_snakecase(name)


def filename_prefix_for_split(name, split):
    """Return the `<snake_case_name>-<split>` file prefix.

    Raises:
        ValueError: If `name` contains a directory component or `split` does not
            match `_split_re`.
    """
    if os.path.basename(name) != name:
        raise ValueError(f"Should be a dataset name, not a path: {name}")
    if not re.match(_split_re, split):
        raise ValueError(f"Split name should match '{_split_re}'' but got '{split}'.")
    return f"{filename_prefix_for_name(name)}-{split}"


def filepattern_for_dataset_split(dataset_name, split, data_dir, filetype_suffix=None):
    """Return a glob pattern matching the files of one split inside `data_dir`."""
    prefix = filename_prefix_for_split(dataset_name, split)
    if filetype_suffix:
        prefix += f".{filetype_suffix}"
    filepath = os.path.join(data_dir, prefix)
    return f"{filepath}*"


def filenames_for_dataset_split(path, dataset_name, split, filetype_suffix=None, shard_lengths=None):
    """Return the list of file names for one split.

    With `shard_lengths`, one name per shard is produced using the
    `-SSSSS-of-NNNNN` convention; otherwise a single un-sharded name is returned.
    The `.filetype_suffix` extension is appended when given.
    """
    prefix = os.path.join(path, filename_prefix_for_split(dataset_name, split))
    suffix = f".{filetype_suffix}" if filetype_suffix else ""

    if shard_lengths:
        num_shards = len(shard_lengths)
        return [f"{prefix}-{shard_id:05d}-of-{num_shards:05d}{suffix}" for shard_id in range(num_shards)]
    return [prefix + suffix]
class MissingIndex(Exception):
    """Raised when an index is looked up under a name that has not been attached."""


class SearchResults(NamedTuple):
    # Scores and row indices of the hits for a single query.
    scores: list[float]
    indices: list[int]


class BatchedSearchResults(NamedTuple):
    # One list of scores / indices per query.
    total_scores: list[list[float]]
    total_indices: list[list[int]]


class NearestExamplesResults(NamedTuple):
    # Scores of the hits for a single query and the matching examples.
    scores: list[float]
    examples: dict


class BatchedNearestExamplesResults(NamedTuple):
    # One list of scores / one examples dict per query.
    total_scores: list[list[float]]
    total_examples: list[dict]


class BaseIndex:
    """Base class for indexing"""

    def search(self, query, k: int = 10, **kwargs) -> SearchResults:
        """
        To implement.
        This method has to return the scores and the indices of the retrieved examples given a certain query.
        """
        raise NotImplementedError

    def search_batch(self, queries, k: int = 10, **kwargs) -> BatchedSearchResults:
        """Find the nearest examples indices to each query by calling `search` once per query.

        Args:
            queries (`Union[List[str], np.ndarray]`): The queries as a list of strings if `column` is a text index or as a numpy array if `column` is a vector index.
            k (`int`): The number of examples to retrieve per query.
            **kwargs: Extra keyword arguments, forwarded unchanged to every `search` call.

        Output:
            total_scores (`List[List[float]]`): The retrieval scores of the retrieved examples per query.
            total_indices (`List[List[int]]`): The indices of the retrieved examples per query.
        """
        total_scores, total_indices = [], []
        for query in queries:
            # Forward kwargs so the batched path behaves like repeated single searches.
            scores, indices = self.search(query, k, **kwargs)
            total_scores.append(scores)
            total_indices.append(indices)
        return BatchedSearchResults(total_scores, total_indices)

    def save(self, file: Union[str, PurePath]):
        """Serialize the index on disk"""
        raise NotImplementedError

    @classmethod
    def load(cls, file: Union[str, PurePath]) -> "BaseIndex":
        """Deserialize the index from disk"""
        raise NotImplementedError
class ElasticSearchIndex(BaseIndex):
    """
    Sparse index using Elasticsearch. It is used to index text and run queries based on BM25 similarity.
    An Elasticsearch server needs to be accessible, and a python client is declared with
    ```
    es_client = Elasticsearch([{'host': 'localhost', 'port': '9200'}])
    ```
    for example.
    """

    def __init__(
        self,
        host: Optional[str] = None,
        port: Optional[int] = None,
        es_client: Optional["Elasticsearch"] = None,
        es_index_name: Optional[str] = None,
        es_index_config: Optional[dict] = None,
    ):
        """
        Args:
            host (`str`, *optional*): Elasticsearch host, `"localhost"` when omitted.
            port (`int`, *optional*): Elasticsearch port, `9200` when omitted.
            es_client (`Elasticsearch`, *optional*): Ready-made client. Mutually exclusive with `host`/`port`.
            es_index_name (`str`, *optional*): Name of the Elasticsearch index. A unique
                `huggingface_datasets_*` name is generated when omitted.
            es_index_config (`dict`, *optional*): Index settings/mappings. A single-shard
                BM25 text configuration is used when omitted.

        Raises:
            ImportError: If the `elasticsearch` package is not installed.
            ValueError: If both `es_client` and `host`/`port` are given.
        """
        if not _has_elasticsearch:
            raise ImportError(
                "You must install ElasticSearch to use ElasticSearchIndex. To do so you can run `pip install elasticsearch==7.7.1 for example`"
            )
        if es_client is not None and (host is not None or port is not None):
            raise ValueError("Please specify either `es_client` or `(host, port)`, but not both.")
        host = host or "localhost"
        port = port or 9200

        import elasticsearch.helpers  # noqa: F401 - need this to properly load all the es features
        from elasticsearch import Elasticsearch  # noqa: F811

        self.es_client = es_client if es_client is not None else Elasticsearch([{"host": host, "port": str(port)}])

        if es_index_name is None:
            # The temporary file only lends its unique basename; close (and thereby
            # delete) it right away instead of waiting for garbage collection.
            with tempfile.NamedTemporaryFile() as tmp:
                es_index_name = "huggingface_datasets_" + os.path.basename(tmp.name)
        self.es_index_name = es_index_name

        # NOTE(review): the " stopwords" key below carries a leading space; it is kept
        # byte-for-byte because it is sent to the server as-is - confirm whether intended.
        self.es_index_config = (
            es_index_config
            if es_index_config is not None
            else {
                "settings": {
                    "number_of_shards": 1,
                    "analysis": {"analyzer": {"stop_standard": {"type": "standard", " stopwords": "_english_"}}},
                },
                "mappings": {"properties": {"text": {"type": "text", "analyzer": "standard", "similarity": "BM25"}}},
            }
        )

    def add_documents(self, documents: Union[list[str], "Dataset"], column: Optional[str] = None):
        """
        Create the Elasticsearch index and add documents to it.
        If the documents are inside a certain column, you can specify it using the `column` argument.

        Each document is stored under the `text` field with its position in
        `documents` as `_id`. A warning is logged if some documents fail to be added.
        """
        # Imported explicitly so this method does not depend on `__init__` having
        # loaded the `helpers` submodule as a side effect.
        from elasticsearch.helpers import streaming_bulk

        index_name = self.es_index_name
        index_config = self.es_index_config
        self.es_client.indices.create(index=index_name, body=index_config)
        number_of_docs = len(documents)
        successes = 0

        def passage_generator():
            if column is not None:
                for i, example in enumerate(documents):
                    yield {"text": example[column], "_id": i}
            else:
                for i, example in enumerate(documents):
                    yield {"text": example, "_id": i}

        # The context manager closes the progress bar even if the bulk upload raises.
        with hf_tqdm(unit="docs", total=number_of_docs) as progress:
            for ok, action in streaming_bulk(
                client=self.es_client,
                index=index_name,
                actions=passage_generator(),
            ):
                progress.update(1)
                successes += ok
        if successes != len(documents):
            logger.warning(
                f"Some documents failed to be added to ElasticSearch. Failures: {len(documents) - successes}/{len(documents)}"
            )
        logger.info(f"Indexed {successes:d} documents")

    def search(self, query: str, k=10, **kwargs) -> SearchResults:
        """Find the nearest examples indices to the query.

        Args:
            query (`str`): The query as a string.
            k (`int`): The number of examples to retrieve.
            **kwargs: Extra keyword arguments passed to the client's `search` call.

        Output:
            scores (`List[float]`): The retrieval scores of the retrieved examples.
            indices (`List[int]`): The indices of the retrieved examples.
        """
        response = self.es_client.search(
            index=self.es_index_name,
            body={"query": {"multi_match": {"query": query, "fields": ["text"], "type": "cross_fields"}}, "size": k},
            **kwargs,
        )
        hits = response["hits"]["hits"]
        return SearchResults([hit["_score"] for hit in hits], [int(hit["_id"]) for hit in hits])

    def search_batch(self, queries, k: int = 10, max_workers=10, **kwargs) -> BatchedSearchResults:
        """Run `search` for every query in a thread pool and return the results in query order.

        Args:
            queries: The queries, one string per search.
            k (`int`): The number of examples to retrieve per query.
            max_workers (`int`): Size of the thread pool.
            **kwargs: Extra keyword arguments forwarded to every `search` call.
        """
        import concurrent.futures

        total_scores, total_indices = [None] * len(queries), [None] * len(queries)
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
            future_to_index = {executor.submit(self.search, query, k, **kwargs): i for i, query in enumerate(queries)}
            # Futures complete in arbitrary order; the stored position restores query order.
            for future in concurrent.futures.as_completed(future_to_index):
                index = future_to_index[future]
                results: SearchResults = future.result()
                total_scores[index] = results.scores
                total_indices[index] = results.indices
        return BatchedSearchResults(total_indices=total_indices, total_scores=total_scores)
class FaissIndex(BaseIndex):
    """
    Dense index using Faiss. It is used to index vectors.
    Faiss is a library for efficient similarity search and clustering of dense vectors.
    It contains algorithms that search in sets of vectors of any size, up to ones that possibly do not fit in RAM.
    You can find more information about Faiss here:
    - For index types and the string factory: https://github.com/facebookresearch/faiss/wiki/The-index-factory
    - For GPU settings: https://github.com/facebookresearch/faiss/wiki/Faiss-on-the-GPU
    """

    def __init__(
        self,
        device: Optional[Union[int, list[int]]] = None,
        string_factory: Optional[str] = None,
        metric_type: Optional[int] = None,
        custom_index: Optional["faiss.Index"] = None,
    ):
        """
        Create a Dense index using Faiss. You can specify `device` if you want to run it on GPU (`device` must be the GPU index).
        You can find more information about Faiss here:
        - For `string factory`: https://github.com/facebookresearch/faiss/wiki/The-index-factory

        Raises:
            ValueError: If `custom_index` is combined with `string_factory` or with `device`.
            ImportError: If Faiss is not installed.
        """
        if custom_index is not None:
            # A ready-made index can neither be re-created from a factory string
            # nor moved to another device by this wrapper.
            if string_factory is not None:
                raise ValueError("Please specify either `string_factory` or `custom_index` but not both.")
            if device is not None:
                raise ValueError(
                    "Cannot pass both 'custom_index' and 'device'. "
                    "Pass 'custom_index' already transferred to the target device instead."
                )
        self.device = device
        self.string_factory = string_factory
        self.metric_type = metric_type
        self.faiss_index = custom_index
        if not _has_faiss:
            raise ImportError(
                "You must install Faiss to use FaissIndex. To do so you can run `conda install -c pytorch faiss-cpu` or `conda install -c pytorch faiss-gpu`. "
                "A community supported package is also available on pypi: `pip install faiss-cpu` or `pip install faiss-gpu`. "
                "Note that pip may not have the latest version of FAISS, and thus, some of the latest features and bug fixes may not be available."
            )

    def add_vectors(
        self,
        vectors: Union[np.array, "Dataset"],
        column: Optional[str] = None,
        batch_size: int = 1000,
        train_size: Optional[int] = None,
        faiss_verbose: Optional[bool] = None,
    ):
        """
        Add vectors to the index, creating the underlying Faiss index first if needed.
        If the arrays are inside a certain column, you can specify it using the `column` argument.

        Args:
            vectors: The vectors, or a dataset holding them in `column`.
            column (`str`, *optional*): Column of `vectors` that holds the arrays.
            batch_size (`int`): Number of vectors added per call to the Faiss index.
            train_size (`int`, *optional*): If set, the first `train_size` vectors are used to train the index.
            faiss_verbose (`bool`, *optional*): If set, verbosity flag applied to the index and its sub-indexes.

        Raises:
            ValueError: If `column` is given and its feature type is not `List`.
        """
        import faiss  # noqa: F811

        if column and not isinstance(vectors.features[column], List):
            raise ValueError(
                f"Wrong feature type for column '{column}'. Expected 1d array, got {vectors.features[column]}"
            )

        def _take(rows):
            # Slice the requested rows, then pick the column when one is given.
            chunk = vectors[rows]
            return chunk if column is None else chunk[column]

        # Create index
        if self.faiss_index is None:
            first = vectors[0]
            size = len(first) if column is None else len(first[column])
            if self.string_factory is not None:
                build, build_args = faiss.index_factory, (size, self.string_factory)
            else:
                build, build_args = faiss.IndexFlat, (size,)
            # The metric is only passed along when the caller chose one.
            if self.metric_type is not None:
                build_args += (self.metric_type,)
            self.faiss_index = self._faiss_index_to_device(build(*build_args), self.device)
            logger.info(f"Created faiss index of type {type(self.faiss_index)}")

        # Set verbosity level on the index and on whichever sub-indexes it exposes
        if faiss_verbose is not None:
            self.faiss_index.verbose = faiss_verbose
            for attr in ("index", "quantizer", "clustering_index"):
                nested = getattr(self.faiss_index, attr, None)
                if nested is not None:
                    nested.verbose = faiss_verbose

        # Train
        if train_size is None:
            logger.info("Ignored the training step of the faiss index as `train_size` is None.")
        else:
            train_vecs = _take(slice(None, train_size))
            logger.info(f"Training the index with the first {len(train_vecs)} vectors")
            self.faiss_index.train(train_vecs)

        # Add vectors
        logger.info(f"Adding {len(vectors)} vectors to the faiss index")
        for start in hf_tqdm(range(0, len(vectors), batch_size)):
            self.faiss_index.add(_take(slice(start, start + batch_size)))

    @staticmethod
    def _faiss_index_to_device(index: "faiss.Index", device: Optional[Union[int, list[int]]] = None) -> "faiss.Index":
        """
        Sends a faiss index to a device.
        A device can either be a positive integer (GPU id), a negative integer (all GPUs),
        or a list of positive integers (select GPUs to use), or `None` for CPU.

        Raises:
            TypeError: If `device` is none of the above.
        """
        # No device requested: the index stays where it is.
        if device is None:
            return index

        import faiss  # noqa: F811

        if isinstance(device, int):
            # Negative integers mean using all GPUs
            if device < 0:
                return faiss.index_cpu_to_all_gpus(index)
            # Other integers are directly mapped to GPU ids
            gpu_resources = faiss.StandardGpuResources()
            return faiss.index_cpu_to_gpu(gpu_resources, device, index)

        # Device ids given as a list mean mapping to those devices specified.
        if isinstance(device, (list, tuple)):
            return faiss.index_cpu_to_gpus_list(index, gpus=list(device))

        raise TypeError(
            f"The argument type: {type(device)} is not expected. "
            + "Please pass in either nothing, a positive int, a negative int, or a list of positive ints."
        )

    def search(self, query: np.array, k=10, **kwargs) -> SearchResults:
        """Find the nearest examples indices to the query.

        Args:
            query (`np.array`): The query as a numpy array, either 1D or 2D with a single row.
            k (`int`): The number of examples to retrieve.
            **kwargs: Extra keyword arguments passed to the Faiss index `search` call.

        Output:
            scores: The retrieval scores of the retrieved examples (first row of the Faiss result).
            indices: The indices of the retrieved examples, cast to `int`.

        Raises:
            ValueError: If `query` is neither 1D nor of shape `(1, N)`.
        """
        rank = len(query.shape)
        single_query = rank == 1 or (rank == 2 and query.shape[0] == 1)
        if not single_query:
            raise ValueError("Shape of query is incorrect, it has to be either a 1D array or 2D (1, N)")

        batch = query.reshape(1, -1)
        if not batch.flags.c_contiguous:
            batch = np.asarray(batch, order="C")
        scores, indices = self.faiss_index.search(batch, k, **kwargs)
        return SearchResults(scores[0], indices[0].astype(int))

    def search_batch(self, queries: np.array, k=10, **kwargs) -> BatchedSearchResults:
        """Find the nearest examples indices to the queries.

        Args:
            queries (`np.array`): The queries as a 2D numpy array, one query per row.
            k (`int`): The number of examples to retrieve.
            **kwargs: Extra keyword arguments passed to the Faiss index `search` call.

        Output:
            total_scores: The retrieval scores of the retrieved examples per query.
            total_indices: The indices of the retrieved examples per query, cast to `int`.

        Raises:
            ValueError: If `queries` is not 2D.
        """
        if len(queries.shape) != 2:
            raise ValueError("Shape of query must be 2D")
        contiguous = queries if queries.flags.c_contiguous else np.asarray(queries, order="C")
        scores, indices = self.faiss_index.search(contiguous, k, **kwargs)
        return BatchedSearchResults(scores, indices.astype(int))

    def save(self, file: Union[str, PurePath], storage_options: Optional[dict] = None):
        """Serialize the FaissIndex on disk

        An index that was sent to a device is converted back with
        `faiss.index_gpu_to_cpu` before being written.
        """
        import faiss  # noqa: F811

        # `None` is not an int/list/tuple, so this is true exactly when a device was set.
        on_device = isinstance(self.device, (int, list, tuple))
        serializable = faiss.index_gpu_to_cpu(self.faiss_index) if on_device else self.faiss_index

        with fsspec.open(str(file), "wb", **(storage_options or {})) as stream:
            writer = faiss.BufferedIOWriter(faiss.PyCallbackIOWriter(stream.write))
            faiss.write_index(serializable, writer)

    @classmethod
    def load(
        cls,
        file: Union[str, PurePath],
        device: Optional[Union[int, list[int]]] = None,
        storage_options: Optional[dict] = None,
    ) -> "FaissIndex":
        """Deserialize the FaissIndex from disk

        The raw Faiss index is read from `file` and then sent to `device`.
        """
        import faiss  # noqa: F811

        # A FaissIndex is essentially just a wrapper around a raw faiss index.
        wrapper = cls(device=device)
        with fsspec.open(str(file), "rb", **(storage_options or {})) as stream:
            reader = faiss.BufferedIOReader(faiss.PyCallbackIOReader(stream.read))
            raw_index = faiss.read_index(reader)
        wrapper.faiss_index = wrapper._faiss_index_to_device(raw_index, wrapper.device)
        return wrapper
+ faiss_index = cls(device=device) + with fsspec.open(str(file), "rb", **(storage_options or {})) as f: + index = faiss.read_index(faiss.BufferedIOReader(faiss.PyCallbackIOReader(f.read))) + faiss_index.faiss_index = faiss_index._faiss_index_to_device(index, faiss_index.device) + return faiss_index + + +class IndexableMixin: + """Add indexing features to `datasets.Dataset`""" + + def __init__(self): + self._indexes: dict[str, BaseIndex] = {} + + def __len__(self): + raise NotImplementedError + + def __getitem__(self, key): + raise NotImplementedError + + def is_index_initialized(self, index_name: str) -> bool: + return index_name in self._indexes + + def _check_index_is_initialized(self, index_name: str): + if not self.is_index_initialized(index_name): + raise MissingIndex( + f"Index with index_name '{index_name}' not initialized yet. Please make sure that you call `add_faiss_index` or `add_elasticsearch_index` first." + ) + + def list_indexes(self) -> list[str]: + """List the `colindex_nameumns`/identifiers of all the attached indexes.""" + return list(self._indexes) + + def get_index(self, index_name: str) -> BaseIndex: + """List the `index_name`/identifiers of all the attached indexes. + + Args: + index_name (`str`): Index name. + + Returns: + [`BaseIndex`] + """ + self._check_index_is_initialized(index_name) + return self._indexes[index_name] + + def add_faiss_index( + self, + column: str, + index_name: Optional[str] = None, + device: Optional[Union[int, list[int]]] = None, + string_factory: Optional[str] = None, + metric_type: Optional[int] = None, + custom_index: Optional["faiss.Index"] = None, + batch_size: int = 1000, + train_size: Optional[int] = None, + faiss_verbose: bool = False, + ): + """Add a dense index using Faiss for fast retrieval. + The index is created using the vectors of the specified column. + You can specify `device` if you want to run it on GPU (`device` must be the GPU index, see more below). 
+ You can find more information about Faiss here: + - For `string factory`: https://github.com/facebookresearch/faiss/wiki/The-index-factory + + Args: + column (`str`): The column of the vectors to add to the index. + index_name (Optional `str`): The index_name/identifier of the index. This is the index_name that is used to call `.get_nearest` or `.search`. + By default it corresponds to `column`. + device (Optional `Union[int, List[int]]`): If positive integer, this is the index of the GPU to use. If negative integer, use all GPUs. + If a list of positive integers is passed in, run only on those GPUs. By default it uses the CPU. + string_factory (Optional `str`): This is passed to the index factory of Faiss to create the index. Default index class is IndexFlatIP. + metric_type (Optional `int`): Type of metric. Ex: `faiss.METRIC_INNER_PRODUCT` or `faiss.METRIC_L2`. + custom_index (Optional `faiss.Index`): Custom Faiss index that you already have instantiated and configured for your needs. + batch_size (Optional `int`): Size of the batch to use while adding vectors to the FaissIndex. Default value is 1000. + + train_size (Optional `int`): If the index needs a training step, specifies how many vectors will be used to train the index. + faiss_verbose (`bool`, defaults to False): Enable the verbosity of the Faiss index. 
+ """ + index_name = index_name if index_name is not None else column + faiss_index = FaissIndex( + device=device, string_factory=string_factory, metric_type=metric_type, custom_index=custom_index + ) + faiss_index.add_vectors( + self, column=column, batch_size=batch_size, train_size=train_size, faiss_verbose=faiss_verbose + ) + self._indexes[index_name] = faiss_index + + def add_faiss_index_from_external_arrays( + self, + external_arrays: np.array, + index_name: str, + device: Optional[Union[int, list[int]]] = None, + string_factory: Optional[str] = None, + metric_type: Optional[int] = None, + custom_index: Optional["faiss.Index"] = None, + batch_size: int = 1000, + train_size: Optional[int] = None, + faiss_verbose: bool = False, + ): + """Add a dense index using Faiss for fast retrieval. + The index is created using the vectors of `external_arrays`. + You can specify `device` if you want to run it on GPU (`device` must be the GPU index). + You can find more information about Faiss here: + - For `string factory`: https://github.com/facebookresearch/faiss/wiki/The-index-factory + + Args: + external_arrays (`np.array`): If you want to use arrays from outside the lib for the index, you can set `external_arrays`. + It will use `external_arrays` to create the Faiss index instead of the arrays in the given `column`. + index_name (`str`): The index_name/identifier of the index. This is the index_name that is used to call `.get_nearest` or `.search`. + device (Optional `Union[int, List[int]]`): If positive integer, this is the index of the GPU to use. If negative integer, use all GPUs. + If a list of positive integers is passed in, run only on those GPUs. By default it uses the CPU. + string_factory (Optional `str`): This is passed to the index factory of Faiss to create the index. Default index class is IndexFlatIP. + metric_type (Optional `int`): Type of metric. Ex: `faiss.METRIC_INNER_PRODUCT` or `faiss.METRIC_L2`. 
+ custom_index (Optional `faiss.Index`): Custom Faiss index that you already have instantiated and configured for your needs. + batch_size (Optional `int`): Size of the batch to use while adding vectors to the FaissIndex. Default value is 1000. + + train_size (Optional `int`): If the index needs a training step, specifies how many vectors will be used to train the index. + faiss_verbose (`bool`, defaults to False): Enable the verbosity of the Faiss index. + """ + faiss_index = FaissIndex( + device=device, string_factory=string_factory, metric_type=metric_type, custom_index=custom_index + ) + faiss_index.add_vectors( + external_arrays, column=None, batch_size=batch_size, train_size=train_size, faiss_verbose=faiss_verbose + ) + self._indexes[index_name] = faiss_index + + def save_faiss_index(self, index_name: str, file: Union[str, PurePath], storage_options: Optional[dict] = None): + """Save a FaissIndex on disk. + + Args: + index_name (`str`): The index_name/identifier of the index. This is the index_name that is used to call `.get_nearest` or `.search`. + file (`str`): The path to the serialized faiss index on disk or remote URI (e.g. `"s3://my-bucket/index.faiss"`). + storage_options (`dict`, *optional*): + Key/value pairs to be passed on to the file-system backend, if any. + + + + """ + index = self.get_index(index_name) + if not isinstance(index, FaissIndex): + raise ValueError(f"Index '{index_name}' is not a FaissIndex but a '{type(index)}'") + index.save(file, storage_options=storage_options) + logger.info(f"Saved FaissIndex {index_name} at {file}") + + def load_faiss_index( + self, + index_name: str, + file: Union[str, PurePath], + device: Optional[Union[int, list[int]]] = None, + storage_options: Optional[dict] = None, + ): + """Load a FaissIndex from disk. + + If you want to do additional configurations, you can have access to the faiss index object by doing + `.get_index(index_name).faiss_index` to make it fit your needs. 
+ + Args: + index_name (`str`): The index_name/identifier of the index. This is the index_name that is used to + call `.get_nearest` or `.search`. + file (`str`): The path to the serialized faiss index on disk or remote URI (e.g. `"s3://my-bucket/index.faiss"`). + device (Optional `Union[int, List[int]]`): If positive integer, this is the index of the GPU to use. If negative integer, use all GPUs. + If a list of positive integers is passed in, run only on those GPUs. By default it uses the CPU. + storage_options (`dict`, *optional*): + Key/value pairs to be passed on to the file-system backend, if any. + + + + """ + index = FaissIndex.load(file, device=device, storage_options=storage_options) + if index.faiss_index.ntotal != len(self): + raise ValueError( + f"Index size should match Dataset size, but Index '{index_name}' at {file} has {index.faiss_index.ntotal} elements while the dataset has {len(self)} examples." + ) + self._indexes[index_name] = index + logger.info(f"Loaded FaissIndex {index_name} from {file}") + + def add_elasticsearch_index( + self, + column: str, + index_name: Optional[str] = None, + host: Optional[str] = None, + port: Optional[int] = None, + es_client: Optional["Elasticsearch"] = None, + es_index_name: Optional[str] = None, + es_index_config: Optional[dict] = None, + ): + """Add a text index using ElasticSearch for fast retrieval. + + Args: + column (`str`): The column of the documents to add to the index. + index_name (Optional `str`): The index_name/identifier of the index. This is the index name that is used to call `.get_nearest` or `.search`. + By default it corresponds to `column`. + host (Optional `str`, defaults to localhost): + host of where ElasticSearch is running + port (Optional `str`, defaults to 9200): + port of where ElasticSearch is running + es_client (Optional `elasticsearch.Elasticsearch`): + The elasticsearch client used to create the index if host and port are None. 
+ es_index_name (Optional `str`): The elasticsearch index name used to create the index. + es_index_config (Optional `dict`): + The configuration of the elasticsearch index. + Default config is: + + Config:: + + { + "settings": { + "number_of_shards": 1, + "analysis": {"analyzer": {"stop_standard": {"type": "standard", " stopwords": "_english_"}}}, + }, + "mappings": { + "properties": { + "text": { + "type": "text", + "analyzer": "standard", + "similarity": "BM25" + }, + } + }, + } + """ + index_name = index_name if index_name is not None else column + es_index = ElasticSearchIndex( + host=host, port=port, es_client=es_client, es_index_name=es_index_name, es_index_config=es_index_config + ) + es_index.add_documents(self, column=column) + self._indexes[index_name] = es_index + + def load_elasticsearch_index( + self, + index_name: str, + es_index_name: str, + host: Optional[str] = None, + port: Optional[int] = None, + es_client: Optional["Elasticsearch"] = None, + es_index_config: Optional[dict] = None, + ): + """Load an existing text index using ElasticSearch for fast retrieval. + + Args: + index_name (`str`): + The `index_name`/identifier of the index. This is the index name that is used to call `get_nearest` or `search`. + es_index_name (`str`): + The name of elasticsearch index to load. + host (`str`, *optional*, defaults to `localhost`): + Host of where ElasticSearch is running. + port (`str`, *optional*, defaults to `9200`): + Port of where ElasticSearch is running. + es_client (`elasticsearch.Elasticsearch`, *optional*): + The elasticsearch client used to create the index if host and port are `None`. + es_index_config (`dict`, *optional*): + The configuration of the elasticsearch index. 
def search(self, index_name: str, query: Union[str, np.array], k: int = 10, **kwargs) -> SearchResults:
    """Find the indices of the dataset examples nearest to the query.

    Args:
        index_name (`str`):
            The name/identifier of the index.
        query (`Union[str, np.ndarray]`):
            The query as a string if `index_name` is a text index or as a numpy array
            if `index_name` is a vector index.
        k (`int`):
            The number of examples to retrieve.
        **kwargs:
            Forwarded unchanged to the `search` method of the selected index.

    Returns:
        `(scores, indices)`:
            The result of the selected index's `search`:
            - **scores**: the retrieval scores from either FAISS (`IndexFlatL2` by default)
              or ElasticSearch of the retrieved examples
            - **indices**: the indices of the retrieved examples
    """
    self._check_index_is_initialized(index_name)
    selected_index = self._indexes[index_name]
    return selected_index.search(query, k, **kwargs)
def get_nearest_examples(
    self, index_name: str, query: Union[str, np.array], k: int = 10, **kwargs
) -> NearestExamplesResults:
    """Find the dataset examples nearest to the query.

    Args:
        index_name (`str`):
            The index_name/identifier of the index.
        query (`Union[str, np.ndarray]`):
            The query as a string if `index_name` is a text index or as a numpy array
            if `index_name` is a vector index.
        k (`int`):
            The number of examples to retrieve.
        **kwargs:
            Forwarded unchanged to `self.search`.

    Returns:
        `(scores, examples)`:
            A tuple of `(scores, examples)` where:
            - **scores** (`List[float]`): the retrieval scores from either FAISS
              (`IndexFlatL2` by default) or ElasticSearch of the retrieved examples
            - **examples** (`dict`): the retrieved examples
    """
    self._check_index_is_initialized(index_name)
    scores, indices = self.search(index_name, query, k, **kwargs)
    # Negative indices are dropped; the scores are truncated to the same count.
    kept_indices = []
    for idx in indices:
        if idx >= 0:
            kept_indices.append(idx)
    kept_scores = scores[: len(kept_indices)]
    examples = self[kept_indices]
    return NearestExamplesResults(kept_scores, examples)
+ k (`int`): + The number of examples to retrieve per query. + + Returns: + `(total_scores, total_examples)`: + A tuple of `(total_scores, total_examples)` where: + - **total_scores** (`List[List[float]`): the retrieval scores from either FAISS (`IndexFlatL2` by default) or ElasticSearch of the retrieved examples per query + - **total_examples** (`List[dict]`): the retrieved examples per query + """ + self._check_index_is_initialized(index_name) + total_scores, total_indices = self.search_batch(index_name, queries, k, **kwargs) + total_scores = [ + scores_i[: len([i for i in indices_i if i >= 0])] + for scores_i, indices_i in zip(total_scores, total_indices) + ] + total_samples = [self[[i for i in indices if i >= 0]] for indices in total_indices] + return BatchedNearestExamplesResults(total_scores, total_samples) diff --git a/datasets/splits.py b/datasets/splits.py new file mode 100644 index 0000000000000000000000000000000000000000..02c18eef351ec3bd5270d8c6a42ca23710dd5f49 --- /dev/null +++ b/datasets/splits.py @@ -0,0 +1,635 @@ +# Copyright 2020 The HuggingFace Datasets Authors and the TensorFlow Datasets Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
@dataclass
class SplitInfo:
    """Description of one dataset split: its name, sizes and optional shard lengths.

    Fields whose metadata sets `include_in_asdict_even_if_is_default` are flagged so
    that they are kept in serialized output even when they hold their default value.
    """

    name: str = dataclasses.field(default="", metadata={"include_in_asdict_even_if_is_default": True})
    num_bytes: int = dataclasses.field(default=0, metadata={"include_in_asdict_even_if_is_default": True})
    num_examples: int = dataclasses.field(default=0, metadata={"include_in_asdict_even_if_is_default": True})
    shard_lengths: Optional[list[int]] = None

    # Deprecated.
    # Kept for backward compatibility: this field must always appear in files such as
    # dataset_infos.json and dataset_info.json, so it is always included in the output
    # of datasets.utils.py_utils.asdict(split_info).
    dataset_name: Optional[str] = dataclasses.field(
        default=None, metadata={"include_in_asdict_even_if_is_default": True}
    )

    @property
    def file_instructions(self):
        """Returns the list of dict(filename, take, skip)."""
        # `self.dataset_name` is assigned in `SplitDict.add()`.
        return make_file_instructions(
            name=self.dataset_name,
            split_infos=[self],
            instruction=str(self.name),
        ).file_instructions
+ This class expose info on the subsplit: + ``` + ds, info = datasets.load_dataset(..., split='train[75%:]', with_info=True) + info.splits['train[75%:]'].num_examples + ``` + """ + + instructions: FileInstructions + + @property + def num_examples(self): + """Returns the number of example in the subsplit.""" + return self.instructions.num_examples + + @property + def file_instructions(self): + """Returns the list of dict(filename, take, skip).""" + return self.instructions.file_instructions + + +class SplitBase(metaclass=abc.ABCMeta): + # pylint: disable=line-too-long + """Abstract base class for Split compositionality. + + See the + [guide on splits](../loading#slice-splits) + for more information. + + There are three parts to the composition: + 1) The splits are composed (defined, merged, split,...) together before + calling the `.as_dataset()` function. This is done with the `__add__`, + `__getitem__`, which return a tree of `SplitBase` (whose leaf + are the `NamedSplit` objects) + + ``` + split = datasets.Split.TRAIN + datasets.Split.TEST.subsplit(datasets.percent[:50]) + ``` + + 2) The `SplitBase` is forwarded to the `.as_dataset()` function + to be resolved into actual read instruction. This is done by the + `.get_read_instruction()` method which takes the real dataset splits + (name, number of shards,...) and parse the tree to return a + `SplitReadInstruction()` object + + ``` + read_instruction = split.get_read_instruction(self.info.splits) + ``` + + 3) The `SplitReadInstruction` is then used in the `tf.data.Dataset` pipeline + to define which files to read and how to skip examples within file. + + """ + + # pylint: enable=line-too-long + + @abc.abstractmethod + def get_read_instruction(self, split_dict): + """Parse the descriptor tree and compile all read instructions together. 
+ + Args: + split_dict: `dict`, The `dict[split_name, SplitInfo]` of the dataset + + Returns: + split_read_instruction: `SplitReadInstruction` + """ + raise NotImplementedError("Abstract method") + + def __eq__(self, other): + """Equality: datasets.Split.TRAIN == 'train'.""" + if isinstance(other, (NamedSplit, str)): + return False + raise NotImplementedError("Equality is not implemented between merged/sub splits.") + + def __ne__(self, other): + """InEquality: datasets.Split.TRAIN != 'test'.""" + return not self.__eq__(other) + + def __add__(self, other): + """Merging: datasets.Split.TRAIN + datasets.Split.TEST.""" + return _SplitMerged(self, other) + + def subsplit(self, arg=None, k=None, percent=None, weighted=None): # pylint: disable=redefined-outer-name + """Divides this split into subsplits. + + There are 3 ways to define subsplits, which correspond to the 3 + arguments `k` (get `k` even subsplits), `percent` (get a slice of the + dataset with `datasets.percent`), and `weighted` (get subsplits with proportions + specified by `weighted`). + + Example:: + + ``` + # 50% train, 50% test + train, test = split.subsplit(k=2) + # 50% train, 25% test, 25% validation + train, test, validation = split.subsplit(weighted=[2, 1, 1]) + # Extract last 20% + subsplit = split.subsplit(datasets.percent[-20:]) + ``` + + Warning: k and weighted will be converted into percent which mean that + values below the percent will be rounded up or down. The final split may be + bigger to deal with remainders. For instance: + + ``` + train, test, valid = split.subsplit(k=3) # 33%, 33%, 34% + s1, s2, s3, s4 = split.subsplit(weighted=[2, 2, 1, 1]) # 33%, 33%, 16%, 18% + ``` + + Args: + arg: If no kwargs are given, `arg` will be interpreted as one of + `k`, `percent`, or `weighted` depending on the type. 
+ For example: + ``` + split.subsplit(10) # Equivalent to split.subsplit(k=10) + split.subsplit(datasets.percent[:-20]) # percent=datasets.percent[:-20] + split.subsplit([1, 1, 2]) # weighted=[1, 1, 2] + ``` + k: `int` If set, subdivide the split into `k` equal parts. + percent: `datasets.percent slice`, return a single subsplit corresponding to + a slice of the original split. For example: + `split.subsplit(datasets.percent[-20:]) # Last 20% of the dataset`. + weighted: `list[int]`, return a list of subsplits whose proportions match + the normalized sum of the list. For example: + `split.subsplit(weighted=[1, 1, 2]) # 25%, 25%, 50%`. + + Returns: + A subsplit or list of subsplits extracted from this split object. + """ + # Note that the percent kwargs redefine the outer name datasets.percent. This + # is done for consistency (.subsplit(percent=datasets.percent[:40])) + if sum(bool(x) for x in (arg, k, percent, weighted)) != 1: + raise ValueError("Only one argument of subsplit should be set.") + + # Auto deduce k + if isinstance(arg, int): + k = arg + elif isinstance(arg, slice): + percent = arg + elif isinstance(arg, list): + weighted = arg + + if not (k or percent or weighted): + raise ValueError( + f"Invalid split argument {arg}. Only list, slice and int supported. " + "One of k, weighted or percent should be set to a non empty value." + ) + + def assert_slices_coverage(slices): + # Ensure that the expended slices cover all percents. 
# 2 requirements:
# 1. datasets.percent be sliceable
# 2. datasets.percent be documented
#
# Instances are not documented, so we want datasets.percent to be a class, but to
# have it be sliceable, we need this metaclass.
class PercentSliceMeta(type):
    """Metaclass that lets a class be subscripted with a slice, e.g. `cls[75:-5]`."""

    def __getitem__(cls, slice_value):
        """Return `slice_value` unchanged when it is a `slice`.

        Raises:
            ValueError: if the subscript is anything other than a `slice`.
        """
        if isinstance(slice_value, slice):
            return slice_value
        raise ValueError(f"datasets.percent should only be called with slice, not {slice_value}")
class NamedSplit(SplitBase):
    """Descriptor corresponding to a named split (train, test, ...).

    Example:
        Each descriptor can be composed with other using addition or slice:

        ```py
        split = datasets.Split.TRAIN.subsplit(datasets.percent[0:25]) + datasets.Split.TEST
        ```

        The resulting split will correspond to 25% of the train split merged with
        100% of the test split.

        A split cannot be added twice, so the following will fail:

        ```py
        split = (
                datasets.Split.TRAIN.subsplit(datasets.percent[:25]) +
                datasets.Split.TRAIN.subsplit(datasets.percent[75:])
        )  # Error
        split = datasets.Split.TEST + datasets.Split.ALL  # Error
        ```

        The slices can be applied only one time. So the following are valid:

        ```py
        split = (
                datasets.Split.TRAIN.subsplit(datasets.percent[:25]) +
                datasets.Split.TEST.subsplit(datasets.percent[:50])
        )
        split = (datasets.Split.TRAIN + datasets.Split.TEST).subsplit(datasets.percent[:50])
        ```

        But this is not valid:

        ```py
        train = datasets.Split.TRAIN
        test = datasets.Split.TEST
        split = train.subsplit(datasets.percent[:25]).subsplit(datasets.percent[:25])
        split = (train.subsplit(datasets.percent[:25]) + test).subsplit(datasets.percent[:50])
        ```
    """

    def __init__(self, name):
        """Store `name` and validate every split name it mentions.

        `name` may be a `+`-joined instruction; for each part, the text before the
        first `[` must match `_split_re`.

        Raises:
            ValueError: if one of the split names does not match `_split_re`.
        """
        self._name = name
        for part in name.split("+"):
            # Drop any slicing suffix such as "[:10%]" before validating the name.
            bare_name = part.split("[")[0]
            if re.match(_split_re, bare_name):
                continue
            raise ValueError(f"Split name should match '{_split_re}' but got '{bare_name}'.")

    def __str__(self):
        return self._name

    def __repr__(self):
        return f"NamedSplit({self._name!r})"

    def __eq__(self, other):
        """Equality: datasets.Split.TRAIN == 'train'."""
        if isinstance(other, NamedSplit):
            return self._name == other._name  # pylint: disable=protected-access
        if isinstance(other, SplitBase):
            # Merged splits and sub-splits never equal a plain named split.
            return False
        if isinstance(other, str):
            return self._name == other
        return False

    def __lt__(self, other):
        return self._name < other._name  # pylint: disable=protected-access

    def __hash__(self):
        # Hash by name, like __eq__ compares by name.
        return hash(self._name)

    def get_read_instruction(self, split_dict):
        """Wrap this split's entry of `split_dict` in a `SplitReadInstruction`."""
        split_info = split_dict[self._name]
        return SplitReadInstruction(split_info)
class Split:
    # pylint: disable=line-too-long
    """`Enum` for dataset splits.

    Datasets are typically split into different subsets to be used at various
    stages of training and evaluation.

    - `TRAIN`: the training data.
    - `VALIDATION`: the validation data. If present, this is typically used as
      evaluation data while iterating on a model (e.g. changing hyperparameters,
      model architecture, etc.).
    - `TEST`: the testing data. This is the data to report metrics on. Typically
      you do not want to use this during model iteration as you may overfit to it.
    - `ALL`: the union of all defined dataset splits.

    All splits, including compositions inherit from `datasets.SplitBase`.

    See the [guide](../load_hub#splits) on splits for more information.

    Example:

    ```py
    >>> datasets.SplitGenerator(
    ...     name=datasets.Split.TRAIN,
    ...     gen_kwargs={"split_key": "train", "files": dl_manager.download_and_extract(url)},
    ... ),
    ... datasets.SplitGenerator(
    ...     name=datasets.Split.VALIDATION,
    ...     gen_kwargs={"split_key": "validation", "files": dl_manager.download_and_extract(url)},
    ... ),
    ... datasets.SplitGenerator(
    ...     name=datasets.Split.TEST,
    ...     gen_kwargs={"split_key": "test", "files": dl_manager.download_and_extract(url)},
    ... )
    ```
    """

    # pylint: enable=line-too-long
    TRAIN = NamedSplit("train")
    TEST = NamedSplit("test")
    VALIDATION = NamedSplit("validation")
    ALL = NamedSplitAll()

    def __new__(cls, name):
        """Create a custom split with datasets.Split('custom_name').

        The name "all" yields a `NamedSplitAll`; any other name yields a `NamedSplit`.
        """
        if name == "all":
            return NamedSplitAll()
        return NamedSplit(name)
class SplitDict(dict):
    """Mapping from split name to `SplitInfo`, tagged with a dataset name.

    Looking up a key that is not a stored split name does not raise `KeyError`:
    the key is treated as a read instruction (e.g. `'train[50%]'`) and resolved
    into a `SubSplitInfo`.
    """

    def __init__(self, *args, dataset_name=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.dataset_name = dataset_name

    def __getitem__(self, key: Union[SplitBase, str]):
        """Return the stored `SplitInfo`, or a `SubSplitInfo` for an instruction key."""
        split_name = str(key)
        # 1st case: The key exists: `info.splits['train']`
        if split_name in self:
            return super().__getitem__(split_name)
        # 2nd case: Uses instructions: `info.splits['train[50%]']`
        instructions = make_file_instructions(
            name=self.dataset_name,
            split_infos=self.values(),
            instruction=key,
        )
        return SubSplitInfo(instructions)

    def __setitem__(self, key: Union[SplitBase, str], value: SplitInfo):
        """Store `value` under `key`.

        Raises:
            ValueError: if `key` differs from `value.name`.
        """
        if key != value.name:
            raise ValueError(f"Cannot add elem. (key mismatch: '{key}' != '{value.name}')")
        super().__setitem__(key, value)

    def add(self, split_info: SplitInfo):
        """Add the split info, stamping it with this dict's `dataset_name`.

        Raises:
            ValueError: if a split with the same name is already present.
        """
        if split_info.name in self:
            raise ValueError(f"Split {split_info.name} already present")
        split_info.dataset_name = self.dataset_name
        super().__setitem__(split_info.name, split_info)

    @property
    def total_num_examples(self):
        """Return the total number of examples."""
        return sum(s.num_examples for s in self.values())

    @classmethod
    def from_split_dict(cls, split_infos: Union[list, dict], dataset_name: Optional[str] = None):
        """Returns a new SplitDict initialized from a Dict or List of `split_infos`.

        Args:
            split_infos: A list (or a dict, whose values are used) of entries. Each
                entry is either a `SplitInfo` or a `dict` of `SplitInfo` keyword arguments.
            dataset_name: Name for the new `SplitDict`. When `None`, it is taken from
                the `dataset_name` of the first entry, if there is one.
        """
        if isinstance(split_infos, dict):
            split_infos = list(split_infos.values())

        if dataset_name is None and split_infos:
            first = split_infos[0]
            # Entries may be plain dicts or SplitInfo objects (the loop below accepts
            # both), so do not assume the dict-only `.get` is available.
            if isinstance(first, dict):
                dataset_name = first.get("dataset_name")
            else:
                dataset_name = getattr(first, "dataset_name", None)

        split_dict = cls(dataset_name=dataset_name)

        for split_info in split_infos:
            if isinstance(split_info, dict):
                split_info = SplitInfo(**split_info)
            split_dict.add(split_info)

        return split_dict

    def to_split_dict(self):
        """Returns a list of SplitInfo protos that we have.

        Each element is a deep copy whose `name` is set to the key it is stored under.
        """
        out = []
        for split_name, split_info in self.items():
            split_info = copy.deepcopy(split_info)
            split_info.name = split_name
            out.append(split_info)
        return out

    def copy(self):
        """Return a new `SplitDict` built from deep copies of the stored split infos."""
        return SplitDict.from_split_dict(self.to_split_dict(), self.dataset_name)

    def _to_yaml_list(self) -> list:
        """Return one plain dict per split, without the fields that YAML does not need."""
        out = [asdict(s) for s in self.to_split_dict()]
        for split_info_dict in out:
            # we don't need the shard lengths in YAML, since it depends on max_shard_size and num_proc
            split_info_dict.pop("shard_lengths", None)
            # we don't need the dataset_name attribute that is deprecated
            split_info_dict.pop("dataset_name", None)
        return out

    @classmethod
    def _from_yaml_list(cls, yaml_data: list) -> "SplitDict":
        """Inverse of `_to_yaml_list`: build a `SplitDict` from a list of dicts."""
        return cls.from_split_dict(yaml_data)
def extend_module_for_streaming(module_path, download_config: Optional[DownloadConfig] = None):
    """Extend the module to support streaming.

    We patch some functions in the module to use `fsspec` to support data streaming:
    - We use `fsspec.open` to open and read remote files. We patch the module function:
      - `open`
    - We use the "::" hop separator to join paths and navigate remote compressed/archive files. We patch the module
      functions:
      - `os.path.join`
      - `pathlib.Path.joinpath` and `pathlib.Path.__truediv__` (called when using the "/" operator)

    The patched functions are replaced with custom functions defined to work with the
    :class:`~download.streaming_download_manager.StreamingDownloadManager`.

    If the module was already patched with a `DownloadConfig`, only the `token` and
    `storage_options` of that stored config are refreshed from `download_config`
    (when one is given) and nothing is patched again.

    Args:
        module_path: Path to the module to be extended.
        download_config: Mainly use `token` or `storage_options` to support different platforms and auth types.
    """

    module = importlib.import_module(module_path)

    # TODO(QL): always update the module to add subsequent new authentication without removing old ones
    previous_config = getattr(module, "_patched_for_streaming", None)
    if previous_config:
        # `download_config` defaults to None: in that case there is nothing to copy,
        # so keep the credentials the module was originally patched with.
        if isinstance(previous_config, DownloadConfig) and download_config is not None:
            previous_config.token = download_config.token
            previous_config.storage_options = download_config.storage_options
        return

    def wrap_auth(function):
        # Bind `download_config` into every call of the wrapped helper.
        @wraps(function)
        def wrapper(*args, **kwargs):
            return function(*args, download_config=download_config, **kwargs)

        wrapper._decorator_name_ = "wrap_auth"
        return wrapper

    # open files in a streaming fashion
    patch_submodule(module, "open", wrap_auth(xopen)).start()
    patch_submodule(module, "os.listdir", wrap_auth(xlistdir)).start()
    patch_submodule(module, "os.walk", wrap_auth(xwalk)).start()
    patch_submodule(module, "glob.glob", wrap_auth(xglob)).start()
    # allow to navigate in remote zip files
    patch_submodule(module, "os.path.join", xjoin).start()
    patch_submodule(module, "os.path.dirname", xdirname).start()
    patch_submodule(module, "os.path.basename", xbasename).start()
    patch_submodule(module, "os.path.relpath", xrelpath).start()
    patch_submodule(module, "os.path.split", xsplit).start()
    patch_submodule(module, "os.path.splitext", xsplitext).start()
    # allow checks on paths
    patch_submodule(module, "os.path.exists", wrap_auth(xexists)).start()
    patch_submodule(module, "os.path.isdir", wrap_auth(xisdir)).start()
    patch_submodule(module, "os.path.isfile", wrap_auth(xisfile)).start()
    patch_submodule(module, "os.path.getsize", wrap_auth(xgetsize)).start()
    patch_submodule(module, "pathlib.Path", xPath).start()
    # file readers
    patch_submodule(module, "gzip.open", wrap_auth(xgzip_open)).start()
    patch_submodule(module, "numpy.load", wrap_auth(xnumpy_load)).start()
    patch_submodule(module, "pandas.read_csv", wrap_auth(xpandas_read_csv), attrs=["__version__"]).start()
    patch_submodule(module, "pandas.read_excel", wrap_auth(xpandas_read_excel), attrs=["__version__"]).start()
    patch_submodule(module, "scipy.io.loadmat", wrap_auth(xsio_loadmat), attrs=["__version__"]).start()
    patch_submodule(module, "xml.etree.ElementTree.parse", wrap_auth(xet_parse)).start()
    patch_submodule(module, "xml.dom.minidom.parse", wrap_auth(xxml_dom_minidom_parse)).start()
    # pyarrow: do not patch pyarrow attribute in packaged modules
    if not module.__name__.startswith("datasets.packaged_modules."):
        patch_submodule(module, "pyarrow.parquet.read_table", wrap_auth(xpyarrow_parquet_read_table)).start()
    # Remember the config object the wrappers closed over; a later call updates it in place.
    module._patched_for_streaming = download_config
def inject_arrow_table_documentation(arrow_table_method):
    """
    Build a decorator that injects the documentation of `arrow_table_method` into a function.

    The decorated function's docstring becomes the docstring of `arrow_table_method`
    followed by its own docstring (if any), with every occurrence of "pyarrow.Table"
    replaced by "Table". If `arrow_table_method` has an `__annotations__` attribute,
    it is assigned to the decorated function as well.

    Args:
        arrow_table_method (`Callable`):
            The callable whose docstring and annotations are injected.

    Returns:
        `Callable`: a decorator that returns the very function it receives, updated in place.
    """

    def wrapper(fn):
        # Either docstring can be None (undocumented callable, or docstrings stripped by
        # `python -OO`): treat a missing docstring as empty instead of failing on `None + str`.
        source_doc = arrow_table_method.__doc__ or ""
        own_doc = fn.__doc__ or ""
        fn.__doc__ = (source_doc + own_doc).replace("pyarrow.Table", "Table")
        # Not every callable exposes annotations, so this is checked before assignment.
        if hasattr(arrow_table_method, "__annotations__"):
            fn.__annotations__ = arrow_table_method.__annotations__
        return fn

    return wrapper
elif arr[k] < x: + i, j = k + 1, j + else: + i, j = i, k + raise IndexError(f"Invalid query '{x}' for size {arr[-1] if len(arr) else 'none'}.") + + +class IndexedTableMixin: + def __init__(self, table: pa.Table): + self._schema: pa.Schema = table.schema + self._batches: list[pa.RecordBatch] = [ + recordbatch for recordbatch in table.to_batches() if len(recordbatch) > 0 + ] + self._offsets: np.ndarray = np.cumsum([0] + [len(b) for b in self._batches], dtype=np.int64) + + def fast_gather(self, indices: Union[list[int], np.ndarray]) -> pa.Table: + """ + Create a pa.Table by gathering the records at the records at the specified indices. Should be faster + than pa.concat_tables(table.fast_slice(int(i) % table.num_rows, 1) for i in indices) since NumPy can compute + the binary searches in parallel, highly optimized C + """ + if not len(indices): + raise ValueError("Indices must be non-empty") + batch_indices = np.searchsorted(self._offsets, indices, side="right") - 1 + return pa.Table.from_batches( + [ + self._batches[batch_idx].slice(i - self._offsets[batch_idx], 1) + for batch_idx, i in zip(batch_indices, indices) + ], + schema=self._schema, + ) + + def fast_slice(self, offset=0, length=None) -> pa.Table: + """ + Slice the Table using interpolation search. + The behavior is the same as `pyarrow.Table.slice` but it's significantly faster. + + Interpolation search is used to find the start and end indexes of the batches we want to keep. + The batches to keep are then concatenated to form the sliced Table. 
    def __deepcopy__(self, memo: dict):
        """
        Deep copy this wrapper without duplicating the underlying arrow data.

        The wrapped table and the list of record batches are registered in `memo` before
        delegating to `_deepcopy`, so `copy.deepcopy` reuses them instead of copying them.

        Args:
            memo (`dict`): the memo dictionary passed by `copy.deepcopy`.

        Returns:
            A new instance of the same class, as built by `_deepcopy`.
        """
        # arrow tables are immutable, so there's no need to copy self.table
        # moreover calling deepcopy on a pyarrow table seems to make pa.total_allocated_bytes() decrease for some reason
        # by adding it to the memo, self.table won't be copied
        memo[id(self.table)] = self.table
        # same for the recordbatches used by the index
        # (the memo entry is a new list object holding the same record batches)
        memo[id(self._batches)] = list(self._batches)
        return _deepcopy(self, memo)
+ + Args: + full (`bool`, defaults to `False`): + If `True`, run expensive checks, otherwise cheap checks only. + + Raises: + `pa.lib.ArrowInvalid`: if validation fails + """ + return self.table.validate(*args, **kwargs) + + def equals(self, *args, **kwargs): + """ + Check if contents of two tables are equal. + + Args: + other ([`~datasets.table.Table`]): + Table to compare against. + check_metadata `bool`, defaults to `False`): + Whether schema metadata equality should be checked as well. + + Returns: + `bool` + """ + args = tuple(arg.table if isinstance(arg, Table) else arg for arg in args) + kwargs = {k: v.table if isinstance(v, Table) else v for k, v in kwargs} + return self.table.equals(*args, **kwargs) + + def to_batches(self, *args, **kwargs): + """ + Convert Table to list of (contiguous) `RecordBatch` objects. + + Args: + max_chunksize (`int`, defaults to `None`): + Maximum size for `RecordBatch` chunks. Individual chunks may be + smaller depending on the chunk layout of individual columns. + + Returns: + `List[pyarrow.RecordBatch]` + """ + return self.table.to_batches(*args, **kwargs) + + def to_pydict(self, *args, **kwargs): + """ + Convert the Table to a `dict` or `OrderedDict`. + + Returns: + `dict` + """ + return self.table.to_pydict(*args, **kwargs) + + def to_pylist(self, *args, **kwargs): + """ + Convert the Table to a list + + Returns: + `list` + """ + return self.table.to_pylist(*args, **kwargs) + + def to_pandas(self, *args, **kwargs): + """ + Convert to a pandas-compatible NumPy array or DataFrame, as appropriate. + + Args: + memory_pool (`MemoryPool`, defaults to `None`): + Arrow MemoryPool to use for allocations. Uses the default memory + pool is not passed. + strings_to_categorical (`bool`, defaults to `False`): + Encode string (UTF8) and binary types to `pandas.Categorical`. + categories (`list`, defaults to `empty`): + List of fields that should be returned as `pandas.Categorical`. Only + applies to table-like data structures. 
+ zero_copy_only (`bool`, defaults to `False`): + Raise an `ArrowException` if this function call would require copying + the underlying data. + integer_object_nulls (`bool`, defaults to `False`): + Cast integers with nulls to objects. + date_as_object (`bool`, defaults to `True`): + Cast dates to objects. If `False`, convert to `datetime64[ns]` dtype. + timestamp_as_object (`bool`, defaults to `False`): + Cast non-nanosecond timestamps (`np.datetime64`) to objects. This is + useful if you have timestamps that don't fit in the normal date + range of nanosecond timestamps (1678 CE-2262 CE). + If `False`, all timestamps are converted to `datetime64[ns]` dtype. + use_threads (`bool`, defaults to `True`): + Whether to parallelize the conversion using multiple threads. + deduplicate_objects (`bool`, defaults to `False`): + Do not create multiple copies Python objects when created, to save + on memory use. Conversion will be slower. + ignore_metadata (`bool`, defaults to `False`): + If `True`, do not use the 'pandas' metadata to reconstruct the + DataFrame index, if present. + safe (`bool`, defaults to `True`): + For certain data types, a cast is needed in order to store the + data in a pandas DataFrame or Series (e.g. timestamps are always + stored as nanoseconds in pandas). This option controls whether it + is a safe cast or not. + split_blocks (`bool`, defaults to `False`): + If `True`, generate one internal "block" for each column when + creating a pandas.DataFrame from a `RecordBatch` or `Table`. While this + can temporarily reduce memory note that various pandas operations + can trigger "consolidation" which may balloon memory use. + self_destruct (`bool`, defaults to `False`): + EXPERIMENTAL: If `True`, attempt to deallocate the originating Arrow + memory while converting the Arrow object to pandas. If you use the + object after calling `to_pandas` with this option it will crash your + program. 
+ types_mapper (`function`, defaults to `None`): + A function mapping a pyarrow DataType to a pandas `ExtensionDtype`. + This can be used to override the default pandas type for conversion + of built-in pyarrow types or in absence of `pandas_metadata` in the + Table schema. The function receives a pyarrow DataType and is + expected to return a pandas `ExtensionDtype` or `None` if the + default conversion should be used for that type. If you have + a dictionary mapping, you can pass `dict.get` as function. + + Returns: + `pandas.Series` or `pandas.DataFrame`: `pandas.Series` or `pandas.DataFrame` depending on type of object + """ + return self.table.to_pandas(*args, **kwargs) + + def to_string(self, *args, **kwargs): + return self.table.to_string(*args, **kwargs) + + def to_reader(self, max_chunksize: Optional[int] = None): + """ + Convert the Table to a RecordBatchReader. + + Note that this method is zero-copy, it merely exposes the same data under a different API. + + Args: + max_chunksize (`int`, defaults to `None`) + Maximum size for RecordBatch chunks. Individual chunks may be smaller depending + on the chunk layout of individual columns. + + Returns: + `pyarrow.RecordBatchReader` + """ + return self.table.to_reader(max_chunksize=max_chunksize) + + def field(self, *args, **kwargs): + """ + Select a schema field by its column name or numeric index. + + Args: + i (`Union[int, str]`): + The index or name of the field to retrieve. + + Returns: + `pyarrow.Field` + """ + return self.table.field(*args, **kwargs) + + def column(self, *args, **kwargs): + """ + Select a column by its column name, or numeric index. + + Args: + i (`Union[int, str]`): + The index or name of the column to retrieve. + + Returns: + `pyarrow.ChunkedArray` + """ + return self.table.column(*args, **kwargs) + + def itercolumns(self, *args, **kwargs): + """ + Iterator over all columns in their numerical order. 
+ + Yields: + `pyarrow.ChunkedArray` + """ + return self.table.itercolumns(*args, **kwargs) + + @property + def schema(self): + """ + Schema of the table and its columns. + + Returns: + `pyarrow.Schema` + """ + return self.table.schema + + @property + def columns(self): + """ + List of all columns in numerical order. + + Returns: + `List[pa.ChunkedArray]` + """ + return self.table.columns + + @property + def num_columns(self): + """ + Number of columns in this table. + + Returns: + int + """ + return self.table.num_columns + + @property + def num_rows(self): + """ + Number of rows in this table. + + Due to the definition of a table, all columns have the same number of + rows. + + Returns: + int + """ + return self.table.num_rows + + @property + def shape(self): + """ + Dimensions of the table: (#rows, #columns). + + Returns: + `(int, int)`: Number of rows and number of columns. + """ + return self.table.shape + + @property + def nbytes(self): + """ + Total number of bytes consumed by the elements of the table. + """ + return self.table.nbytes + + @property + def column_names(self): + """ + Names of the table's columns. + """ + return self.table.column_names + + def __eq__(self, other): + return self.equals(other) + + def __getitem__(self, i): + return self.table[i] + + def __len__(self): + return len(self.table) + + def __repr__(self): + return self.table.__repr__().replace("pyarrow.Table", self.__class__.__name__) + + def __str__(self): + return self.table.__str__().replace("pyarrow.Table", self.__class__.__name__) + + def slice(self, *args, **kwargs): + """ + Compute zero-copy slice of this Table. + + Args: + offset (`int`, defaults to `0`): + Offset from start of table to slice. + length (`int`, defaults to `None`): + Length of slice (default is until end of table starting from + offset). + + Returns: + `datasets.table.Table` + """ + raise NotImplementedError() + + def filter(self, *args, **kwargs): + """ + Select records from a Table. 
    def replace_schema_metadata(self, *args, **kwargs):
        """
        EXPERIMENTAL: Create shallow copy of table by replacing schema
        key-value metadata with the indicated new metadata (which may be `None`,
        which deletes any existing metadata).

        This base implementation is not usable directly: it always raises.

        Args:
            metadata (`dict`, defaults to `None`):

        Returns:
            `datasets.table.Table`: shallow_copy

        Raises:
            `NotImplementedError`: always, in this base class.
        """
        raise NotImplementedError()
+ column (`Union[pyarrow.Array, List[pyarrow.Array]]`): + Column data. + + Returns: + `datasets.table.Table`: New table with the passed column added. + """ + raise NotImplementedError() + + def append_column(self, *args, **kwargs): + """ + Append column at end of columns. + + Args: + field_ (`Union[str, pyarrow.Field]`): + If a string is passed then the type is deduced from the column + data. + column (`Union[pyarrow.Array, List[pyarrow.Array]]`): + Column data. + + Returns: + `datasets.table.Table`: New table with the passed column added. + """ + raise NotImplementedError() + + def remove_column(self, *args, **kwargs): + """ + Create new Table with the indicated column removed. + + Args: + i (`int`): + Index of column to remove. + + Returns: + `datasets.table.Table`: New table without the column. + """ + raise NotImplementedError() + + def set_column(self, *args, **kwargs): + """ + Replace column in Table at position. + + Args: + i (`int`): + Index to place the column at. + field_ (`Union[str, pyarrow.Field]`): + If a string is passed then the type is deduced from the column + data. + column (`Union[pyarrow.Array, List[pyarrow.Array]]`): + Column data. + + Returns: + `datasets.table.Table`: New table with the passed column set. + """ + raise NotImplementedError() + + def rename_columns(self, *args, **kwargs): + """ + Create new table with columns renamed to provided names. + """ + raise NotImplementedError() + + def drop(self, *args, **kwargs): + """ + Drop one or more columns and return a new table. + + Args: + columns (`List[str]`): + List of field names referencing existing columns. + + Raises: + `KeyError` : if any of the passed columns name are not existing. + + Returns: + `datasets.table.Table`: New table without the columns. + """ + raise NotImplementedError() + + def select(self, *args, **kwargs): + """ + Select columns of the table. + + Returns a new table with the specified columns, and metadata preserved. 
class TableBlock(Table):
    """
    `TableBlock` is the allowed class inside a `ConcatenationTable`.
    Only `MemoryMappedTable` and `InMemoryTable` are `TableBlock`.
    This is because we don't want a `ConcatenationTable` made out of other `ConcatenationTables`.
    """

    pass
because the DataFrame is of length 0 or + the Series only contains `None/nan` objects, the type is set to + null. This behavior can be avoided by constructing an explicit schema + and passing it to this function. + + Args: + df (`pandas.DataFrame`): + schema (`pyarrow.Schema`, *optional*): + The expected schema of the Arrow Table. This can be used to + indicate the type of columns if we cannot infer it automatically. + If passed, the output will have exactly this schema. Columns + specified in the schema that are not found in the DataFrame columns + or its index will raise an error. Additional columns or index + levels in the DataFrame which are not specified in the schema will + be ignored. + preserve_index (`bool`, *optional*): + Whether to store the index as an additional column in the resulting + `Table`. The default of None will store the index as a column, + except for RangeIndex which is stored as metadata only. Use + `preserve_index=True` to force it to be stored as a column. + nthreads (`int`, defaults to `None` (may use up to system CPU count threads)) + If greater than 1, convert columns to Arrow in parallel using + indicated number of threads. + columns (`List[str]`, *optional*): + List of column to be converted. If `None`, use all columns. + safe (`bool`, defaults to `True`): + Check for overflows or other unsafe conversions, + + Returns: + `datasets.table.Table`: + + Examples: + ```python + >>> import pandas as pd + >>> import pyarrow as pa + >>> df = pd.DataFrame({ + ... 'int': [1, 2], + ... 'str': ['a', 'b'] + ... }) + >>> pa.Table.from_pandas(df) + + ``` + """ + return cls(pa.Table.from_pandas(*args, **kwargs)) + + @classmethod + def from_arrays(cls, *args, **kwargs): + """ + Construct a Table from Arrow arrays. + + Args: + arrays (`List[Union[pyarrow.Array, pyarrow.ChunkedArray]]`): + Equal-length arrays that should form the table. + names (`List[str]`, *optional*): + Names for the table columns. If not passed, schema must be passed. 
+ schema (`Schema`, defaults to `None`): + Schema for the created table. If not passed, names must be passed. + metadata (`Union[dict, Mapping]`, defaults to `None`): + Optional metadata for the schema (if inferred). + + Returns: + `datasets.table.Table` + """ + return cls(pa.Table.from_arrays(*args, **kwargs)) + + @classmethod + def from_pydict(cls, *args, **kwargs): + """ + Construct a Table from Arrow arrays or columns. + + Args: + mapping (`Union[dict, Mapping]`): + A mapping of strings to Arrays or Python lists. + schema (`Schema`, defaults to `None`): + If not passed, will be inferred from the Mapping values + metadata (`Union[dict, Mapping]`, defaults to `None`): + Optional metadata for the schema (if inferred). + + Returns: + `datasets.table.Table` + """ + return cls(pa.Table.from_pydict(*args, **kwargs)) + + @classmethod + def from_pylist(cls, mapping, *args, **kwargs): + """ + Construct a Table from list of rows / dictionaries. + + Args: + mapping (`List[dict]`): + A mapping of strings to row values. + schema (`Schema`, defaults to `None`): + If not passed, will be inferred from the Mapping values + metadata (`Union[dict, Mapping]`, defaults to `None`): + Optional metadata for the schema (if inferred). + + Returns: + `datasets.table.Table` + """ + return cls(pa.Table.from_pylist(mapping, *args, **kwargs)) + + @classmethod + def from_batches(cls, *args, **kwargs): + """ + Construct a Table from a sequence or iterator of Arrow `RecordBatches`. + + Args: + batches (`Union[Sequence[pyarrow.RecordBatch], Iterator[pyarrow.RecordBatch]]`): + Sequence of `RecordBatch` to be converted, all schemas must be equal. + schema (`Schema`, defaults to `None`): + If not passed, will be inferred from the first `RecordBatch`. + + Returns: + `datasets.table.Table`: + """ + return cls(pa.Table.from_batches(*args, **kwargs)) + + def slice(self, offset=0, length=None): + """ + Compute zero-copy slice of this Table. 
+ + Args: + offset (`int`, defaults to `0`): + Offset from start of table to slice. + length (`int`, defaults to `None`): + Length of slice (default is until end of table starting from + offset). + + Returns: + `datasets.table.Table` + """ + # Use fast slicing here + return InMemoryTable(self.fast_slice(offset=offset, length=length)) + + def filter(self, *args, **kwargs): + """ + Select records from a Table. See `pyarrow.compute.filter` for full usage. + """ + return InMemoryTable(self.table.filter(*args, **kwargs)) + + def flatten(self, *args, **kwargs): + """ + Flatten this Table. Each column with a struct type is flattened + into one column per struct field. Other columns are left unchanged. + + Args: + memory_pool (`MemoryPool`, defaults to `None`): + For memory allocations, if required, otherwise use default pool. + + Returns: + `datasets.table.Table` + """ + return InMemoryTable(table_flatten(self.table, *args, **kwargs)) + + def combine_chunks(self, *args, **kwargs): + """ + Make a new table by combining the chunks this table has. + + All the underlying chunks in the `ChunkedArray` of each column are + concatenated into zero or one chunk. + + Args: + memory_pool (`MemoryPool`, defaults to `None`): + For memory allocations, if required, otherwise use default pool. + + Returns: + `datasets.table.Table` + """ + return InMemoryTable(self.table.combine_chunks(*args, **kwargs)) + + def cast(self, *args, **kwargs): + """ + Cast table values to another schema. + + Args: + target_schema (`Schema`): + Schema to cast to, the names and order of fields must match. + safe (`bool`, defaults to `True`): + Check for overflows or other unsafe conversions. 
+ + Returns: + `datasets.table.Table` + """ + return InMemoryTable(table_cast(self.table, *args, **kwargs)) + + def replace_schema_metadata(self, *args, **kwargs): + """ + EXPERIMENTAL: Create shallow copy of table by replacing schema + key-value metadata with the indicated new metadata (which may be `None`, + which deletes any existing metadata). + + Args: + metadata (`dict`, defaults to `None`): + + Returns: + `datasets.table.Table`: shallow_copy + """ + return InMemoryTable(self.table.replace_schema_metadata(*args, **kwargs)) + + def add_column(self, *args, **kwargs): + """ + Add column to Table at position. + + A new table is returned with the column added, the original table + object is left unchanged. + + Args: + i (`int`): + Index to place the column at. + field_ (`Union[str, pyarrow.Field]`): + If a string is passed then the type is deduced from the column + data. + column (`Union[pyarrow.Array, List[pyarrow.Array]]`): + Column data. + + Returns: + `datasets.table.Table`: New table with the passed column added. + """ + return InMemoryTable(self.table.add_column(*args, **kwargs)) + + def append_column(self, *args, **kwargs): + """ + Append column at end of columns. + + Args: + field_ (`Union[str, pyarrow.Field]`): + If a string is passed then the type is deduced from the column + data. + column (`Union[pyarrow.Array, List[pyarrow.Array]]`): + Column data. + + Returns: + `datasets.table.Table`: + New table with the passed column added. + """ + return InMemoryTable(self.table.append_column(*args, **kwargs)) + + def remove_column(self, *args, **kwargs): + """ + Create new Table with the indicated column removed. + + Args: + i (`int`): + Index of column to remove. + + Returns: + `datasets.table.Table`: + New table without the column. + """ + return InMemoryTable(self.table.remove_column(*args, **kwargs)) + + def set_column(self, *args, **kwargs): + """ + Replace column in Table at position. + + Args: + i (`int`): + Index to place the column at. 
+ field_ (`Union[str, pyarrow.Field]`): + If a string is passed then the type is deduced from the column + data. + column (`Union[pyarrow.Array, List[pyarrow.Array]]`): + Column data. + + Returns: + `datasets.table.Table`: + New table with the passed column set. + """ + return InMemoryTable(self.table.set_column(*args, **kwargs)) + + def rename_columns(self, *args, **kwargs): + """ + Create new table with columns renamed to provided names. + """ + return InMemoryTable(self.table.rename_columns(*args, **kwargs)) + + def drop(self, *args, **kwargs): + """ + Drop one or more columns and return a new table. + + Args: + columns (`List[str]`): + List of field names referencing existing columns. + + Raises: + `KeyError` : if any of the passed columns name are not existing. + + Returns: + `datasets.table.Table`: + New table without the columns. + """ + return InMemoryTable(self.table.drop(*args, **kwargs)) + + def select(self, *args, **kwargs): + """ + Select columns of the table. + + Returns a new table with the specified columns, and metadata preserved. + + Args: + columns (:obj:`Union[List[str], List[int]]`): + The column names or integer indices to select. + + Returns: + :class:`datasets.table.Table`: New table with the specified columns, and metadata preserved. + """ + return InMemoryTable(self.table.select(*args, **kwargs)) + + +# The MemoryMappedTable needs replays to properly reload tables from the disk +Replay = tuple[str, tuple, dict] + + +class MemoryMappedTable(TableBlock): + """ + The table is said memory mapped when it doesn't use the user's RAM but loads the data + from the disk instead. + + Pickling it doesn't copy the data into memory. + Instead, only the path to the memory mapped arrow file is pickled, as well as the list + of transforms to "replay" when reloading the table from the disk. 
+ + Its implementation requires to store an history of all the transforms that were applied + to the underlying pyarrow Table, so that they can be "replayed" when reloading the Table + from the disk. + + This is different from the `InMemoryTable` table, for which pickling does copy all the + data in memory. + + `InMemoryTable` must be used when data fit in memory, while `MemoryMapped` are reserved for + data bigger than memory or when you want the memory footprint of your application to + stay low. + """ + + def __init__(self, table: pa.Table, path: str, replays: Optional[list[Replay]] = None): + super().__init__(table) + self.path = os.path.abspath(path) + self.replays: list[Replay] = replays if replays is not None else [] + + @classmethod + def from_file(cls, filename: str, replays=None): + table = _memory_mapped_arrow_table_from_file(filename) + table = cls._apply_replays(table, replays) + return cls(table, filename, replays) + + def __getstate__(self): + return {"path": self.path, "replays": self.replays} + + def __setstate__(self, state): + path = state["path"] + replays = state["replays"] + table = _memory_mapped_arrow_table_from_file(path) + table = self._apply_replays(table, replays) + MemoryMappedTable.__init__(self, table, path=path, replays=replays) + + @staticmethod + def _apply_replays(table: pa.Table, replays: Optional[list[Replay]] = None) -> pa.Table: + if replays is not None: + for name, args, kwargs in replays: + if name == "cast": + table = table_cast(table, *args, **kwargs) + elif name == "flatten": + table = table_flatten(table, *args, **kwargs) + else: + table = getattr(table, name)(*args, **kwargs) + return table + + def _append_replay(self, replay: Replay) -> list[Replay]: + replays = copy.deepcopy(self.replays) + replays.append(replay) + return replays + + def slice(self, offset=0, length=None): + """ + Compute zero-copy slice of this Table. + + Args: + offset (`int`, defaults to `0`): + Offset from start of table to slice. 
+ length (`int`, defaults to `None`): + Length of slice (default is until end of table starting from + offset). + + Returns: + `datasets.table.Table` + """ + replay = ("slice", (offset, length), {}) + replays = self._append_replay(replay) + # Use fast slicing here + return MemoryMappedTable(self.fast_slice(offset=offset, length=length), self.path, replays) + + def filter(self, *args, **kwargs): + """ + Select records from a Table. See `pyarrow.compute.filter` for full usage. + """ + replay = ("filter", copy.deepcopy(args), copy.deepcopy(kwargs)) + replays = self._append_replay(replay) + return MemoryMappedTable(self.table.filter(*args, **kwargs), self.path, replays) + + def flatten(self, *args, **kwargs): + """ + Flatten this Table. Each column with a struct type is flattened + into one column per struct field. Other columns are left unchanged. + + Args: + memory_pool (`MemoryPool`, defaults to `None`): + For memory allocations, if required, otherwise use default pool. + + Returns: + `datasets.table.Table` + """ + replay = ("flatten", copy.deepcopy(args), copy.deepcopy(kwargs)) + replays = self._append_replay(replay) + return MemoryMappedTable(table_flatten(self.table, *args, **kwargs), self.path, replays) + + def combine_chunks(self, *args, **kwargs): + """ + Make a new table by combining the chunks this table has. + + All the underlying chunks in the ChunkedArray of each column are + concatenated into zero or one chunk. + + Args: + memory_pool (`MemoryPool`, defaults to `None`): + For memory allocations, if required, otherwise use default pool. 
def replace_schema_metadata(self, *args, **kwargs):
    """
    EXPERIMENTAL: Create shallow copy of table by replacing schema
    key-value metadata with the indicated new metadata (which may be `None`,
    which deletes any existing metadata).

    The call is appended to a copy of the replay list, which is handed to the
    returned `MemoryMappedTable`.

    Args:
        metadata (`dict`, defaults to `None`):

    Returns:
        `datasets.table.Table`: shallow_copy
    """
    history = self._append_replay(
        ("replace_schema_metadata", copy.deepcopy(args), copy.deepcopy(kwargs))
    )
    updated = self.table.replace_schema_metadata(*args, **kwargs)
    return MemoryMappedTable(updated, self.path, history)
def remove_column(self, *args, **kwargs):
    """
    Create new Table with the indicated column removed.

    The call is appended to a copy of the replay list, which is handed to the
    returned `MemoryMappedTable`.

    Args:
        i (`int`):
            Index of column to remove.

    Returns:
        `datasets.table.Table`:
            New table without the column.
    """
    history = self._append_replay(("remove_column", copy.deepcopy(args), copy.deepcopy(kwargs)))
    reduced = self.table.remove_column(*args, **kwargs)
    return MemoryMappedTable(reduced, self.path, history)
def select(self, *args, **kwargs):
    """
    Select columns of the table.

    Returns a new table with the specified columns, and metadata preserved.
    The call is appended to a copy of the replay list, which is handed to the
    returned `MemoryMappedTable`.

    Args:
        columns (`Union[List[str], List[int]]`):
            The column names or integer indices to select.

    Returns:
        `datasets.table.Table`: New table with the specified columns, and metadata preserved.
    """
    history = self._append_replay(("select", copy.deepcopy(args), copy.deepcopy(kwargs)))
    selected = self.table.select(*args, **kwargs)
    return MemoryMappedTable(selected, self.path, history)
+ + The underlying tables are called "blocks" and can be either `InMemoryTable` + or `MemoryMappedTable` objects. + This allows to combine tables that come from memory or that are memory mapped. + When a `ConcatenationTable` is pickled, then each block is pickled: + - the `InMemoryTable` objects are pickled by copying all the data in memory. + - the MemoryMappedTable objects are pickled without copying the data into memory. + Instead, only the path to the memory mapped arrow file is pickled, as well as the list + of transforms to "replays" when reloading the table from the disk. + + Its implementation requires to store each block separately. + The `blocks` attributes stores a list of list of blocks. + The first axis concatenates the tables along the axis 0 (it appends rows), + while the second axis concatenates tables along the axis 1 (it appends columns). + + If some columns are missing when concatenating on axis 0, they are filled with null values. + This is done using `pyarrow.concat_tables(tables, promote=True)`. + + You can access the fully combined table by accessing the `ConcatenationTable.table` attribute, + and the blocks by accessing the `ConcatenationTable.blocks` attribute. + """ + + def __init__(self, table: pa.Table, blocks: list[list[TableBlock]]): + super().__init__(table) + self.blocks = blocks + # Check that all the blocks have the right type. + # Only InMemoryTable and MemoryMappedTable are allowed. + for subtables in blocks: + for subtable in subtables: + if not isinstance(subtable, TableBlock): + raise TypeError( + "The blocks of a ConcatenationTable must be InMemoryTable or MemoryMappedTable objects" + f", but got {_short_str(subtable)}." 
@staticmethod
def _concat_blocks(blocks: list[Union[TableBlock, pa.Table]], axis: int = 0) -> pa.Table:
    """Concatenate blocks into a single `pyarrow.Table` along one axis.

    Args:
        blocks (list of `TableBlock` or `pyarrow.Table`):
            Tables to concatenate. Objects exposing a `table` attribute are
            unwrapped to that attribute; anything else is used as is.
        axis (`{0, 1}`, defaults to `0`):
            `0` concatenates over rows, `1` appends the columns of each
            following table to the first one.

    Returns:
        `pyarrow.Table`: the concatenated table.

    Raises:
        `ValueError`: if `axis` is neither 0 nor 1, or if `blocks` is empty
            when `axis=1`.
    """
    pa_tables = [block.table if hasattr(block, "table") else block for block in blocks]
    if axis == 0:
        # We set promote_options="default" to fill missing columns with null values
        return pa.concat_tables(pa_tables, promote_options="default")
    elif axis == 1:
        if not pa_tables:
            # An empty input used to fall through to `return pa_table` with the
            # name never bound (UnboundLocalError); fail with a clear message instead.
            raise ValueError("Cannot concatenate an empty list of blocks on axis=1")
        pa_table, *other_tables = pa_tables
        for table in other_tables:
            for name, col in zip(table.column_names, table.columns):
                pa_table = pa_table.append_column(name, col)
        return pa_table
    else:
        raise ValueError("'axis' must be either 0 or 1")
@classmethod
def from_blocks(cls, blocks: TableBlockContainer) -> "ConcatenationTable":
    """Build a `ConcatenationTable` from a block, a list of blocks, or a list of lists of blocks.

    The input is first consolidated with `_consolidate_blocks`. A single block
    becomes a 1x1 grid, a flat list becomes one row block per element
    (concatenated on axis 0), and a nested list is used as the grid directly.

    Args:
        blocks (`TableBlock`, `list[TableBlock]` or `list[list[TableBlock]]`):
            Blocks to assemble.

    Returns:
        `ConcatenationTable`: table wrapping the combined pyarrow table and its blocks.
    """
    consolidated = cls._consolidate_blocks(blocks)
    if isinstance(consolidated, TableBlock):
        return cls(consolidated.table, [[consolidated]])
    if isinstance(consolidated[0], TableBlock):
        combined = cls._concat_blocks(consolidated, axis=0)
        return cls(combined, [[block] for block in consolidated])
    combined = cls._concat_blocks_horizontally_and_vertically(consolidated)
    return cls(combined, consolidated)
+ + + """ + + def to_blocks(table: Union[pa.Table, Table]) -> list[list[TableBlock]]: + if isinstance(table, pa.Table): + return [[InMemoryTable(table)]] + elif isinstance(table, ConcatenationTable): + return copy.deepcopy(table.blocks) + else: + return [[table]] + + def _slice_row_block(row_block: list[TableBlock], length: int) -> tuple[list[TableBlock], list[TableBlock]]: + sliced = [table.slice(0, length) for table in row_block] + remainder = [table.slice(length, len(row_block[0]) - length) for table in row_block] + return sliced, remainder + + def _split_both_like( + result: list[list[TableBlock]], blocks: list[list[TableBlock]] + ) -> tuple[list[list[TableBlock]], list[list[TableBlock]]]: + """ + Make sure each row_block contain the same num_rows to be able to concatenate them on axis=1. + + To do so, we modify both blocks sets to have the same row_blocks boundaries. + For example, if `result` has 2 row_blocks of 3 rows and `blocks` has 3 row_blocks of 2 rows, + we modify both to have 4 row_blocks of size 2, 1, 1 and 2: + + [ x x x | x x x ] + + [ y y | y y | y y ] + ----------------------------- + = [ x x | x | x | x x ] + [ y y | y | y | y y ] + + """ + result, blocks = list(result), list(blocks) + new_result, new_blocks = [], [] + while result and blocks: + # we slice the longest row block to save two row blocks of same length + # and we replace the long row block by its remainder if necessary + if len(result[0][0]) > len(blocks[0][0]): + new_blocks.append(blocks[0]) + sliced, result[0] = _slice_row_block(result[0], len(blocks.pop(0)[0])) + new_result.append(sliced) + elif len(result[0][0]) < len(blocks[0][0]): + new_result.append(result[0]) + sliced, blocks[0] = _slice_row_block(blocks[0], len(result.pop(0)[0])) + new_blocks.append(sliced) + else: + new_result.append(result.pop(0)) + new_blocks.append(blocks.pop(0)) + if result or blocks: + raise ValueError("Failed to concatenate on axis=1 because tables don't have the same number of rows") + return 
def slice(self, offset=0, length=None):
    """
    Compute zero-copy slice of this Table.

    The combined table is sliced directly. The row blocks are then walked in
    order, and only the row blocks that overlap the requested window are kept,
    each one sliced down to the overlapping rows.

    Args:
        offset (`int`, defaults to `0`):
            Offset from start of table to slice.
        length (`int`, defaults to `None`):
            Length of slice (default is until end of table starting from
            offset).

    Returns:
        `datasets.table.Table`
    """
    table = self.table.slice(offset, length=length)
    # From here on, `length` is the number of rows still to collect and `offset`
    # is relative to the start of the row block currently being examined.
    length = length if length is not None else self.num_rows - offset
    blocks = []
    for tables in self.blocks:
        # The row count of a row block is read from its first table
        n_rows = len(tables[0])
        if length == 0:
            # Nothing left to collect
            break
        elif n_rows <= offset:
            # The window starts after this row block: skip it and rebase the offset
            offset = offset - n_rows
        elif n_rows <= offset + length:
            # The window reaches (or passes) the end of this row block:
            # keep everything from `offset` on and carry the rest over
            blocks.append([t.slice(offset) for t in tables])
            length, offset = length + offset - n_rows, 0
        else:
            # The window ends inside this row block: take the remaining rows and finish
            blocks.append([t.slice(offset, length) for t in tables])
            length, offset = 0, 0
    return ConcatenationTable(table, blocks)
def flatten(self, *args, **kwargs):
    """
    Flatten this Table. Each column with a struct type is flattened
    into one column per struct field. Other columns are left unchanged.

    The combined table is flattened with `table_flatten`, and every block is
    flattened with the same arguments.

    Args:
        memory_pool (`MemoryPool`, defaults to `None`):
            For memory allocations, if required, otherwise use default pool.

    Returns:
        `datasets.table.Table`
    """
    flat_table = table_flatten(self.table, *args, **kwargs)
    flat_blocks = [[block.flatten(*args, **kwargs) for block in row_block] for row_block in self.blocks]
    return ConcatenationTable(flat_table, flat_blocks)
def replace_schema_metadata(self, *args, **kwargs):
    """
    EXPERIMENTAL: Create shallow copy of table by replacing schema
    key-value metadata with the indicated new metadata (which may be `None`,
    which deletes any existing metadata).

    The metadata is replaced on the combined table and on every block, and the
    returned table is built from the updated blocks.

    Args:
        metadata (`dict`, defaults to `None`):

    Returns:
        `datasets.table.Table`: shallow_copy
    """
    table = self.table.replace_schema_metadata(*args, **kwargs)
    blocks = [[t.replace_schema_metadata(*args, **kwargs) for t in tables] for tables in self.blocks]
    # Pass the updated blocks (not `self.blocks`), otherwise the blocks keep the
    # old metadata while the combined table carries the new one.
    return ConcatenationTable(table, blocks)
def rename_columns(self, names, *args, **kwargs):
    """
    Create new table with columns renamed to provided names.

    The combined table is renamed with `names`. Each block is renamed through a
    mapping from the current column names of the combined table to `names`.
    """
    renamed_table = self.table.rename_columns(names, *args, **kwargs)
    old_to_new = dict(zip(self.table.column_names, names))
    renamed_blocks = [
        [
            block.rename_columns([old_to_new[col] for col in block.column_names], *args, **kwargs)
            for block in row_block
        ]
        for row_block in self.blocks
    ]
    return ConcatenationTable(renamed_table, renamed_blocks)
def select(self, columns, *args, **kwargs):
    """
    Select columns of the table.

    Returns a new table with the specified columns, and metadata preserved.

    Args:
        columns (`Union[List[str], List[int]]`):
            The column names or integer indices to select. Integer indices
            refer to the column positions of the combined table.

    Returns:
        `datasets.table.Table`: New table with the specified columns, and metadata preserved.
    """
    table = self.table.select(columns, *args, **kwargs)
    # Blocks only hold a subset of the columns, so positions in the combined
    # table mean nothing to them: resolve integer indices to column names first.
    # Without this, integer indices never match a block's column names and every
    # block would be selected down to zero columns.
    all_names = self.table.column_names
    selected_names = [all_names[c] if isinstance(c, int) else c for c in columns]
    blocks = [
        [t.select([c for c in selected_names if c in t.column_names], *args, **kwargs) for t in tables]
        for tables in self.blocks
    ]
    return ConcatenationTable(table, blocks)
+ """ + if isinstance(table, ConcatenationTable): + cache_files = [] + for subtables in table.blocks: + for subtable in subtables: + cache_files += list_table_cache_files(subtable) + return cache_files + elif isinstance(table, MemoryMappedTable): + return [table.path] + else: + return [] + + +def _wrap_for_chunked_arrays(func): + """Apply the function on each chunk of a `pyarrow.ChunkedArray`, or on the array directly""" + + def wrapper(array, *args, **kwargs): + if isinstance(array, pa.ChunkedArray): + return pa.chunked_array([func(chunk, *args, **kwargs) for chunk in array.chunks]) + else: + return func(array, *args, **kwargs) + + return wrapper + + +def _are_list_values_of_length(array: pa.ListArray, length: int) -> bool: + """Check if all the sub-lists of a `pa.ListArray` have the specified length.""" + return pc.all(pc.equal(array.value_lengths(), length)).as_py() or array.null_count == len(array) + + +def _combine_list_array_offsets_with_mask(array: pa.ListArray) -> pa.Array: + """Add the null bitmap to the offsets of a `pa.ListArray`.""" + offsets = array.offsets + if array.null_count > 0: + offsets = pa.concat_arrays( + [ + pc.replace_with_mask(offsets[:-1], array.is_null(), pa.nulls(len(array), pa.int32())), + offsets[-1:], + ] + ) + return offsets + + +def _storage_type(type: pa.DataType) -> pa.DataType: + """Convert a (possibly nested) `pa.ExtensionType` to its storage type.""" + if isinstance(type, pa.ExtensionType): + return _storage_type(type.storage_type) + elif isinstance(type, pa.StructType): + return pa.struct([pa.field(field.name, _storage_type(field.type)) for field in type]) + elif isinstance(type, pa.ListType): + return pa.list_(_storage_type(type.value_type)) + elif isinstance(type, pa.FixedSizeListType): + return pa.list_(_storage_type(type.value_type), type.list_size) + return type + + +def _short_str(value: Any) -> str: + out = str(value) + if len(out) > 3000: + out = out[:1500] + "\n...\n" + out[-1500:] + return out + + 
@_wrap_for_chunked_arrays
def array_cast(
    array: pa.Array, pa_type: pa.DataType, allow_primitive_to_str: bool = True, allow_decimal_to_str: bool = True
) -> Union[pa.Array, pa.FixedSizeListArray, pa.ListArray, pa.StructArray, pa.ExtensionArray]:
    """Improved version of `pa.Array.cast`

    It supports casting `pa.StructArray` objects to re-order the fields.
    It also lets you control certain aspects of the casting, e.g. whether
    to disable casting primitives (`booleans`, `floats` or `ints`) or
    disable casting decimals to strings.

    Because of the `_wrap_for_chunked_arrays` decorator, a `pa.ChunkedArray`
    input is cast chunk by chunk.

    Args:
        array (`pa.Array`):
            PyArrow array to cast
        pa_type (`pa.DataType`):
            Target PyArrow type
        allow_primitive_to_str (`bool`, defaults to `True`):
            Whether to allow casting primitives to strings.
            Defaults to `True`.
        allow_decimal_to_str (`bool`, defaults to `True`):
            Whether to allow casting decimals to strings.
            Defaults to `True`.

    Raises:
        `pa.ArrowInvalidError`: if the arrow data casting fails
        `TypeError`: if the target type is not supported, e.g.

            - if a field is missing
            - if casting from primitives to strings and `allow_primitive_to_str` is `False`
            - if casting from decimals to strings and `allow_decimal_to_str` is `False`

    Returns:
        `pyarrow.Array`: the casted array
    """
    # Recursive calls on nested values keep the caller's casting options
    _c = partial(array_cast, allow_primitive_to_str=allow_primitive_to_str, allow_decimal_to_str=allow_decimal_to_str)
    # Extension arrays are cast through their storage array
    if isinstance(array, pa.ExtensionArray):
        array = array.storage
    if isinstance(pa_type, pa.ExtensionType):
        # Cast to the extension's storage type, then wrap the result back into the extension type
        return pa_type.wrap_array(_c(array, pa_type.storage_type))
    elif array.type == pa_type:
        return array
    elif pa.types.is_struct(array.type):
        # Struct -> struct is only handled when both have exactly the same field names;
        # otherwise control falls through to the final TypeError.
        if pa.types.is_struct(pa_type) and ({field.name for field in pa_type} == {field.name for field in array.type}):
            if array.type.num_fields == 0:
                return array
            # Fields are rebuilt in the order of the target type
            arrays = [_c(array.field(field.name), field.type) for field in pa_type]
            return pa.StructArray.from_arrays(arrays, fields=list(pa_type), mask=array.is_null())
    elif pa.types.is_list(array.type) or pa.types.is_large_list(array.type):
        if pa.types.is_fixed_size_list(pa_type):
            # Only possible when every sub-list already has the target length (or all are null)
            if _are_list_values_of_length(array, pa_type.list_size):
                if array.null_count > 0:
                    # Ensure each null value in the array translates to [null] * pa_type.list_size in the array's values array
                    array_type = array.type
                    storage_type = _storage_type(array_type)
                    if array_type != storage_type:
                        # Temporarily convert to the storage type to support extension types in the slice operation
                        array = _c(array, storage_type)
                        array = pc.list_slice(array, 0, pa_type.list_size, return_fixed_size_list=True)
                        array = _c(array, array_type)
                    else:
                        array = pc.list_slice(array, 0, pa_type.list_size, return_fixed_size_list=True)
                    array_values = array.values
                    return pa.FixedSizeListArray.from_arrays(
                        _c(array_values, pa_type.value_type), pa_type.list_size, mask=array.is_null()
                    )
                else:
                    # NOTE(review): the explicit sub-range presumably compensates for `.values`
                    # not accounting for the array's offset — confirm against the pyarrow docs
                    array_values = array.values[
                        array.offset * pa_type.list_size : (array.offset + len(array)) * pa_type.list_size
                    ]
                    return pa.FixedSizeListArray.from_arrays(_c(array_values, pa_type.value_type), pa_type.list_size)
        elif pa.types.is_list(pa_type):
            # Merge offsets with the null bitmap to avoid the "Null bitmap with offsets slice not supported" ArrowNotImplementedError
            array_offsets = _combine_list_array_offsets_with_mask(array)
            return pa.ListArray.from_arrays(array_offsets, _c(array.values, pa_type.value_type))
        elif pa.types.is_large_list(pa_type):
            # Merge offsets with the null bitmap to avoid the "Null bitmap with offsets slice not supported" ArrowNotImplementedError
            array_offsets = _combine_list_array_offsets_with_mask(array)
            return pa.LargeListArray.from_arrays(array_offsets, _c(array.values, pa_type.value_type))
    elif pa.types.is_fixed_size_list(array.type):
        if pa.types.is_fixed_size_list(pa_type):
            # A different list size is not handled here and ends in the final TypeError
            if pa_type.list_size == array.type.list_size:
                array_values = array.values[
                    array.offset * array.type.list_size : (array.offset + len(array)) * array.type.list_size
                ]
                return pa.FixedSizeListArray.from_arrays(
                    _c(array_values, pa_type.value_type), pa_type.list_size, mask=array.is_null()
                )
        elif pa.types.is_list(pa_type):
            # Offsets are regular multiples of the fixed list size, shifted by the array's offset
            array_offsets = (np.arange(len(array) + 1) + array.offset) * array.type.list_size
            return pa.ListArray.from_arrays(array_offsets, _c(array.values, pa_type.value_type), mask=array.is_null())
        elif pa.types.is_large_list(pa_type):
            # Same regular offsets as for the list target above
            array_offsets = (np.arange(len(array) + 1) + array.offset) * array.type.list_size
            return pa.LargeListArray.from_arrays(
                array_offsets, _c(array.values, pa_type.value_type), mask=array.is_null()
            )
    else:
        # Non-nested source types: apply the opt-out checks, then defer to `pa.Array.cast`
        if pa.types.is_string(pa_type):
            if not allow_primitive_to_str and pa.types.is_primitive(array.type):
                raise TypeError(
                    f"Couldn't cast array of type {_short_str(array.type)} to {_short_str(pa_type)} "
                    f"since allow_primitive_to_str is set to {allow_primitive_to_str} "
                )
            if not allow_decimal_to_str and pa.types.is_decimal(array.type):
                raise TypeError(
                    f"Couldn't cast array of type {_short_str(array.type)} to {_short_str(pa_type)} "
                    f"and allow_decimal_to_str is set to {allow_decimal_to_str}"
                )
        # Casting non-null data to the null type is refused
        if pa.types.is_null(pa_type) and not pa.types.is_null(array.type):
            raise TypeError(f"Couldn't cast array of type {_short_str(array.type)} to {_short_str(pa_type)}")
        return array.cast(pa_type)
    # Reached when a nested branch above matched the source type but not the target type
    raise TypeError(f"Couldn't cast array of type {_short_str(array.type)} to {_short_str(pa_type)}")
+ + - if a field is missing + - if casting from primitives and `allow_primitive_to_str` is `False` + - if casting from decimals and `allow_decimal_to_str` is `False` + + Returns: + array (`pyarrow.Array`): the casted array + """ + from .features.features import LargeList, List, get_nested_type + + _c = partial( + cast_array_to_feature, + allow_primitive_to_str=allow_primitive_to_str, + allow_decimal_to_str=allow_decimal_to_str, + ) + + if isinstance(array, pa.ExtensionArray): + array = array.storage + if hasattr(feature, "cast_storage"): + return feature.cast_storage(array) + + if pa.types.is_struct(array.type): + # feature must be a dict + if isinstance(feature, dict) and (array_fields := {field.name for field in array.type}) <= set(feature): + null_array = pa.array([None] * len(array)) + arrays = [ + _c(array.field(name) if name in array_fields else null_array, subfeature) + for name, subfeature in feature.items() + ] + return pa.StructArray.from_arrays(arrays, names=list(feature), mask=array.is_null()) + elif pa.types.is_list(array.type) or pa.types.is_large_list(array.type): + # feature must be either List(subfeature) or LargeList(subfeature) + if isinstance(feature, LargeList): + casted_array_values = _c(array.values, feature.feature) + if pa.types.is_large_list(array.type) and casted_array_values.type == array.values.type: + # Both array and feature have equal large_list type and values (within the list) type + return array + else: + # Merge offsets with the null bitmap to avoid the "Null bitmap with offsets slice not supported" ArrowNotImplementedError + array_offsets = _combine_list_array_offsets_with_mask(array) + return pa.LargeListArray.from_arrays(array_offsets, casted_array_values) + elif isinstance(feature, List): + if feature.length > -1: + if _are_list_values_of_length(array, feature.length): + if array.null_count > 0: + # Ensure each null value in the array translates to [null] * pa_type.list_size in the array's values array + array_type = 
array.type + storage_type = _storage_type(array_type) + if array_type != storage_type: + # Temporarily convert to the storage type to support extension types in the slice operation + array = array_cast( + array, + storage_type, + allow_primitive_to_str=allow_primitive_to_str, + allow_decimal_to_str=allow_decimal_to_str, + ) + array = pc.list_slice(array, 0, feature.length, return_fixed_size_list=True) + array = array_cast( + array, + array_type, + allow_primitive_to_str=allow_primitive_to_str, + allow_decimal_to_str=allow_decimal_to_str, + ) + else: + array = pc.list_slice(array, 0, feature.length, return_fixed_size_list=True) + array_values = array.values + casted_array_values = _c(array_values, feature.feature) + return pa.FixedSizeListArray.from_arrays( + casted_array_values, feature.length, mask=array.is_null() + ) + else: + array_values = array.values[ + array.offset * feature.length : (array.offset + len(array)) * feature.length + ] + return pa.FixedSizeListArray.from_arrays(_c(array_values, feature.feature), feature.length) + else: + casted_array_values = _c(array.values, feature.feature) + if pa.types.is_list(array.type) and casted_array_values.type == array.values.type: + # Both array and feature have equal list type and values (within the list) type + return array + else: + # Merge offsets with the null bitmap to avoid the "Null bitmap with offsets slice not supported" ArrowNotImplementedError + array_offsets = _combine_list_array_offsets_with_mask(array) + return pa.ListArray.from_arrays(array_offsets, casted_array_values) + elif pa.types.is_fixed_size_list(array.type): + # feature must be List(subfeature) + if isinstance(feature, LargeList): + array_offsets = (np.arange(len(array) + 1) + array.offset) * array.type.list_size + return pa.LargeListArray.from_arrays( + array_offsets, _c(array.values, feature.feature), mask=array.is_null() + ) + elif isinstance(feature, List): + if feature.length > -1: + if feature.length == array.type.list_size: + 
array_values = array.values[ + array.offset * array.type.list_size : (array.offset + len(array)) * array.type.list_size + ] + casted_array_values = _c(array_values, feature.feature) + return pa.FixedSizeListArray.from_arrays(casted_array_values, feature.length, mask=array.is_null()) + else: + array_offsets = (np.arange(len(array) + 1) + array.offset) * array.type.list_size + return pa.ListArray.from_arrays(array_offsets, _c(array.values, feature.feature), mask=array.is_null()) + if pa.types.is_null(array.type): + return array_cast( + array, + get_nested_type(feature), + allow_primitive_to_str=allow_primitive_to_str, + allow_decimal_to_str=allow_decimal_to_str, + ) + elif not isinstance(feature, (List, LargeList, dict)): + return array_cast( + array, + feature(), + allow_primitive_to_str=allow_primitive_to_str, + allow_decimal_to_str=allow_decimal_to_str, + ) + raise TypeError(f"Couldn't cast array of type\n{_short_str(array.type)}\nto\n{_short_str(feature)}") + + +@_wrap_for_chunked_arrays +def embed_array_storage(array: pa.Array, feature: "FeatureType", token_per_repo_id=None): + """Embed data into an arrays's storage. + For custom features like Audio or Image, it takes into account the "embed_storage" methods + they define to embed external data (e.g. an image file) into an array. + + + + Args: + array (`pa.Array`): + The PyArrow array in which to embed data. + feature (`datasets.features.FeatureType`): + Array features. + + Raises: + `TypeError`: if the target type is not supported according, e.g. 
+ + - if a field is missing + + Returns: + array (`pyarrow.Array`): the casted array + """ + from .features import LargeList, List + + _e = partial(embed_array_storage, token_per_repo_id=token_per_repo_id) + + if isinstance(array, pa.ExtensionArray): + array = array.storage + if hasattr(feature, "embed_storage"): + return feature.embed_storage(array, token_per_repo_id=token_per_repo_id) + elif pa.types.is_struct(array.type): + # feature must be a dict + if isinstance(feature, dict): + arrays = [_e(array.field(name), subfeature) for name, subfeature in feature.items()] + return pa.StructArray.from_arrays(arrays, names=list(feature), mask=array.is_null()) + elif pa.types.is_list(array.type): + # feature must be either List(subfeature) + # Merge offsets with the null bitmap to avoid the "Null bitmap with offsets slice not supported" ArrowNotImplementedError + array_offsets = _combine_list_array_offsets_with_mask(array) + if isinstance(feature, List) and feature.length == -1: + return pa.ListArray.from_arrays(array_offsets, _e(array.values, feature.feature)) + elif pa.types.is_large_list(array.type): + # feature must be LargeList(subfeature) + # Merge offsets with the null bitmap to avoid the "Null bitmap with offsets slice not supported" ArrowNotImplementedError + array_offsets = _combine_list_array_offsets_with_mask(array) + return pa.LargeListArray.from_arrays(array_offsets, _e(array.values, feature.feature)) + elif pa.types.is_fixed_size_list(array.type): + # feature must be List(subfeature) + if isinstance(feature, List) and feature.length > -1: + array_values = array.values[ + array.offset * array.type.list_size : (array.offset + len(array)) * array.type.list_size + ] + embedded_array_values = _e(array_values, feature.feature) + return pa.FixedSizeListArray.from_arrays(embedded_array_values, feature.length, mask=array.is_null()) + if not isinstance(feature, (List, LargeList, dict)): + return array + raise TypeError(f"Couldn't embed array of 
class CastError(ValueError):
    """Raised when an Arrow table cannot be cast to a specific schema or set of features.

    Args:
        *args: Positional arguments forwarded to `ValueError` (typically the error message).
        table_column_names (`list[str]`): Column names found in the table.
        requested_column_names (`list[str]`): Column names requested by the target schema/features.
    """

    def __init__(self, *args, table_column_names: list[str], requested_column_names: list[str]) -> None:
        super().__init__(*args)
        self.table_column_names = table_column_names
        self.requested_column_names = requested_column_names

    def __reduce__(self):
        # Fix unpickling: TypeError: __init__() missing 2 required keyword-only arguments: 'table_column_names' and 'requested_column_names'
        # The keyword-only arguments are bound with `partial`; `self.args` is forwarded as the
        # positional arguments so the error message survives a pickle round trip.
        return (
            partial(
                CastError,
                table_column_names=self.table_column_names,
                requested_column_names=self.requested_column_names,
            ),
            self.args,
        )

    def details(self):
        """Return a short sentence describing which columns are new and/or missing."""
        new_columns = set(self.table_column_names) - set(self.requested_column_names)
        missing_columns = set(self.requested_column_names) - set(self.table_column_names)
        if new_columns and missing_columns:
            return f"there are {len(new_columns)} new columns ({_short_str(new_columns)}) and {len(missing_columns)} missing columns ({_short_str(missing_columns)})."
        elif new_columns:
            return f"there are {len(new_columns)} new columns ({_short_str(new_columns)})"
        else:
            return f"there are {len(missing_columns)} missing columns ({_short_str(missing_columns)})"
def cast_table_to_schema(table: pa.Table, schema: pa.Schema):
    """Cast a table to the arrow schema. Different from `cast_table_to_features`, this method can preserve nullability.

    Columns that are listed in `schema` but absent from `table` are filled with nulls of the
    target field type. Columns present in `table` but not in `schema` are an error.

    Args:
        table (`pa.Table`):
            PyArrow table to cast.
        schema (`pa.Schema`):
            Target PyArrow schema.

    Raises:
        `CastError`: if the table has columns that the schema does not declare.

    Returns:
        `pa.Table`: the casted table
    """
    from .features import Features

    features = Features.from_arrow_schema(schema)
    present_columns = set(table.column_names)
    if not present_columns.issubset(schema.names):
        raise CastError(
            f"Couldn't cast\n{_short_str(table.schema)}\nto\n{_short_str(features)}\nbecause column names don't match",
            table_column_names=table.column_names,
            requested_column_names=list(features),
        )

    casted_columns = []
    for name, feature in features.items():
        if name in present_columns:
            column = table[name]
        else:
            # Column missing from the table: start from an all-null array of the target type.
            column = pa.array([None] * len(table), type=schema.field(name).type)
        casted_columns.append(cast_array_to_feature(column, feature))
    return pa.Table.from_arrays(casted_columns, schema=schema)
def table_cast(table: pa.Table, schema: pa.Schema):
    """Improved version of `pa.Table.cast`.

    It supports casting to feature types stored in the schema metadata.

    Args:
        table (`pyarrow.Table`):
            PyArrow table to cast.
        schema (`pyarrow.Schema`):
            Target PyArrow schema.

    Returns:
        table (`pyarrow.Table`): the casted table
    """
    current_schema = table.schema
    if current_schema != schema:
        # The schemas differ: do a real cast.
        return cast_table_to_schema(table, schema)
    if current_schema.metadata != schema.metadata:
        # The schemas compare equal but their metadata does not: only swap the metadata.
        return table.replace_schema_metadata(schema.metadata)
    # Nothing to change.
    return table
def table_visitor(table: pa.Table, function: Callable[[pa.Array, "FeatureType"], None]):
    """Visit all arrays in a table and apply a function to them.

    Chunked columns are visited chunk by chunk and extension arrays are replaced by their storage
    before `function` is called. The visit then recurses into struct fields (unless the feature
    defines `cast_storage`) and into the values of list arrays whose feature is a `List` or
    `LargeList`.

    Args:
        table (`pyarrow.Table`):
            PyArrow table to visit.
        function (`Callable[[pa.Array, FeatureType], None]`):
            Function called as `function(array, feature)` for each visited array and the
            feature that describes it.
    """
    from .features import Features, LargeList, List

    features = Features.from_arrow_schema(table.schema)

    def _visit(array, feature):
        if isinstance(array, pa.ChunkedArray):
            for chunk in array.chunks:
                _visit(chunk, feature)
        else:
            if isinstance(array, pa.ExtensionArray):
                array = array.storage
            function(array, feature)
            if pa.types.is_struct(array.type) and not hasattr(feature, "cast_storage"):
                # Plain struct: the feature is a mapping of field name -> subfeature.
                for name, subfeature in feature.items():
                    _visit(array.field(name), subfeature)
            elif pa.types.is_list(array.type):
                if isinstance(feature, (LargeList, List)):
                    _visit(array.values, feature.feature)

    for name, feature in features.items():
        _visit(table[name], feature)
+ """ + chunks_buffer = [] + chunks_buffer_size = 0 + for chunk in table.to_reader(max_chunksize=batch_size): + if len(chunk) == 0: + continue + elif chunks_buffer_size + len(chunk) < batch_size: + chunks_buffer.append(chunk) + chunks_buffer_size += len(chunk) + continue + elif chunks_buffer_size + len(chunk) == batch_size: + chunks_buffer.append(chunk) + yield pa.Table.from_batches(chunks_buffer) + chunks_buffer = [] + chunks_buffer_size = 0 + else: + cropped_chunk_length = batch_size - chunks_buffer_size + chunks_buffer.append(chunk.slice(0, cropped_chunk_length)) + yield pa.Table.from_batches(chunks_buffer) + chunks_buffer = [chunk.slice(cropped_chunk_length, len(chunk) - cropped_chunk_length)] + chunks_buffer_size = len(chunk) - cropped_chunk_length + if not drop_last_batch and chunks_buffer: + yield pa.Table.from_batches(chunks_buffer) diff --git a/idna/__init__.py b/idna/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..cfdc030a751b089fc7e38fc88093b791605d501d --- /dev/null +++ b/idna/__init__.py @@ -0,0 +1,45 @@ +from .core import ( + IDNABidiError, + IDNAError, + InvalidCodepoint, + InvalidCodepointContext, + alabel, + check_bidi, + check_hyphen_ok, + check_initial_combiner, + check_label, + check_nfc, + decode, + encode, + ulabel, + uts46_remap, + valid_contextj, + valid_contexto, + valid_label_length, + valid_string_length, +) +from .intranges import intranges_contain +from .package_data import __version__ + +__all__ = [ + "__version__", + "IDNABidiError", + "IDNAError", + "InvalidCodepoint", + "InvalidCodepointContext", + "alabel", + "check_bidi", + "check_hyphen_ok", + "check_initial_combiner", + "check_label", + "check_nfc", + "decode", + "encode", + "intranges_contain", + "ulabel", + "uts46_remap", + "valid_contextj", + "valid_contexto", + "valid_label_length", + "valid_string_length", +] diff --git a/idna/codec.py b/idna/codec.py new file mode 100644 index 
class Codec(codecs.Codec):
    """Stateless codec that delegates to the module-level `encode` / `decode` functions.

    Only the "strict" error handler is supported; any other value raises `IDNAError`.
    Both methods return a `(result, consumed_length)` pair, and empty input yields an
    empty result with a consumed length of 0.
    """

    def encode(self, data: str, errors: str = "strict") -> Tuple[bytes, int]:
        if errors != "strict":
            raise IDNAError('Unsupported error handling "{}"'.format(errors))

        if data:
            return encode(data), len(data)
        return b"", 0

    def decode(self, data: bytes, errors: str = "strict") -> Tuple[str, int]:
        if errors != "strict":
            raise IDNAError('Unsupported error handling "{}"'.format(errors))

        if data:
            return decode(data), len(data)
        return "", 0
class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
    """Incremental decoder that converts complete labels with `ulabel`.

    Input is split on the dot characters matched by `_unicode_dots_re`. Unless `final` is
    set, the last (possibly unfinished) label is left unconsumed for the next call.
    """

    def _buffer_decode(self, data: Any, errors: str, final: bool) -> Tuple[str, int]:
        if errors != "strict":
            raise IDNAError('Unsupported error handling "{}"'.format(errors))

        if not data:
            return ("", 0)

        if not isinstance(data, str):
            data = str(data, "ascii")

        pieces = _unicode_dots_re.split(data)
        suffix = ""
        if pieces:
            if not pieces[-1]:
                # Input ended on a dot: drop the empty tail and re-emit the dot.
                suffix = "."
                del pieces[-1]
            elif not final:
                # Keep potentially unfinished label until the next call
                del pieces[-1]
                if pieces:
                    suffix = "."

        decoded = [ulabel(piece) for piece in pieces]

        # Count consumed input: every label plus one separator between counted labels.
        consumed = 0
        for piece in pieces:
            if consumed:
                consumed += 1
            consumed += len(piece)
        consumed += len(suffix)

        return (".".join(decoded) + suffix, consumed)
return decode(label) + + +def nameprep(s: Any) -> None: + raise NotImplementedError("IDNA 2008 does not utilise nameprep protocol") diff --git a/idna/core.py b/idna/core.py new file mode 100644 index 0000000000000000000000000000000000000000..8177bf7a324f9f54a29e41e867f5d56f2dd0a924 --- /dev/null +++ b/idna/core.py @@ -0,0 +1,437 @@ +import bisect +import re +import unicodedata +from typing import Optional, Union + +from . import idnadata +from .intranges import intranges_contain + +_virama_combining_class = 9 +_alabel_prefix = b"xn--" +_unicode_dots_re = re.compile("[\u002e\u3002\uff0e\uff61]") + + +class IDNAError(UnicodeError): + """Base exception for all IDNA-encoding related problems""" + + pass + + +class IDNABidiError(IDNAError): + """Exception when bidirectional requirements are not satisfied""" + + pass + + +class InvalidCodepoint(IDNAError): + """Exception when a disallowed or unallocated codepoint is used""" + + pass + + +class InvalidCodepointContext(IDNAError): + """Exception when the codepoint is not valid in the context it is used""" + + pass + + +def _combining_class(cp: int) -> int: + v = unicodedata.combining(chr(cp)) + if v == 0: + if not unicodedata.name(chr(cp)): + raise ValueError("Unknown character in unicodedata") + return v + + +def _is_script(cp: str, script: str) -> bool: + return intranges_contain(ord(cp), idnadata.scripts[script]) + + +def _punycode(s: str) -> bytes: + return s.encode("punycode") + + +def _unot(s: int) -> str: + return "U+{:04X}".format(s) + + +def valid_label_length(label: Union[bytes, str]) -> bool: + if len(label) > 63: + return False + return True + + +def valid_string_length(label: Union[bytes, str], trailing_dot: bool) -> bool: + if len(label) > (254 if trailing_dot else 253): + return False + return True + + +def check_bidi(label: str, check_ltr: bool = False) -> bool: + # Bidi rules should only be applied if string contains RTL characters + bidi_label = False + for idx, cp in enumerate(label, 1): + direction = 
def check_initial_combiner(label: str) -> bool:
    """Reject labels whose first character is a combining mark.

    Args:
        label: The label to check; its first character is inspected.

    Returns:
        `True` when the first character's general category is not a mark ("M*").

    Raises:
        IDNAError: if the label begins with a combining character.
    """
    first_category = unicodedata.category(label[0])
    if first_category.startswith("M"):
        raise IDNAError("Label begins with an illegal combining character")
    return True
def valid_contextj(label: str, pos: int) -> bool:
    """Check the CONTEXTJ rule for the joiner at `label[pos]`.

    Only U+200C and U+200D can be valid; any other code point returns `False`.
    Either joiner is accepted when the preceding character has the virama combining class.
    U+200C is additionally accepted when, skipping characters of joining type "T", the
    nearest character before it has joining type "L" or "D" and the nearest one after it
    has joining type "R" or "D".

    Args:
        label: The label containing the joiner.
        pos: Index of the joiner inside `label`.

    Returns:
        `True` if the joiner is allowed at this position, `False` otherwise.
    """
    cp_value = ord(label[pos])
    if cp_value not in (0x200C, 0x200D):
        return False

    # Shared rule: a joiner directly after a virama is always fine.
    if pos > 0 and _combining_class(ord(label[pos - 1])) == _virama_combining_class:
        return True

    if cp_value == 0x200D:
        return False

    # U+200C: look backwards for a left- or dual-joining character.
    joins_before = False
    for idx in reversed(range(pos)):
        joining_type = idnadata.joining_types.get(ord(label[idx]))
        if joining_type == ord("T"):
            continue
        joins_before = joining_type in [ord("L"), ord("D")]
        break
    if not joins_before:
        return False

    # ... and forwards for a right- or dual-joining character.
    for idx in range(pos + 1, len(label)):
        joining_type = idnadata.joining_types.get(ord(label[idx]))
        if joining_type == ord("T"):
            continue
        return joining_type in [ord("R"), ord("D")]
    return False
def check_label(label: Union[str, bytes, bytearray]) -> None:
    """Validate a single label, raising on the first violation found.

    Bytes input is decoded as UTF-8 first. The label must be non-empty and pass the NFC,
    hyphen and initial-combiner checks; every code point must then be PVALID, or be a
    CONTEXTJ / CONTEXTO code point that is valid in its position; finally the Bidi check
    is applied.

    Args:
        label: The label to validate.

    Raises:
        IDNAError: if the label is empty, fails one of the helper checks, or a code point
            next to a joiner is unknown to `unicodedata`.
        InvalidCodepointContext: if a CONTEXTJ or CONTEXTO code point is not allowed at
            its position.
        InvalidCodepoint: if a code point belongs to none of the allowed classes.
    """
    if isinstance(label, (bytes, bytearray)):
        label = label.decode("utf-8")
    if len(label) == 0:
        raise IDNAError("Empty Label")

    check_nfc(label)
    check_hyphen_ok(label)
    check_initial_combiner(label)

    for pos, cp in enumerate(label):
        cp_value = ord(cp)
        if intranges_contain(cp_value, idnadata.codepoint_classes["PVALID"]):
            continue
        elif intranges_contain(cp_value, idnadata.codepoint_classes["CONTEXTJ"]):
            # Keep the `try` body to the lookup alone. IDNAError derives from UnicodeError,
            # which is a ValueError, so raising InvalidCodepointContext inside the `try`
            # would be caught below and reported with the wrong message and type.
            try:
                joiner_ok = valid_contextj(label, pos)
            except ValueError as err:
                raise IDNAError(
                    "Unknown codepoint adjacent to joiner {} at position {} in {}".format(
                        _unot(cp_value), pos + 1, repr(label)
                    )
                ) from err
            if not joiner_ok:
                raise InvalidCodepointContext(
                    "Joiner {} not allowed at position {} in {}".format(_unot(cp_value), pos + 1, repr(label))
                )
        elif intranges_contain(cp_value, idnadata.codepoint_classes["CONTEXTO"]):
            if not valid_contexto(label, pos):
                raise InvalidCodepointContext(
                    "Codepoint {} not allowed at position {} in {}".format(_unot(cp_value), pos + 1, repr(label))
                )
        else:
            raise InvalidCodepoint(
                "Codepoint {} at position {} of {} not allowed".format(_unot(cp_value), pos + 1, repr(label))
            )

    check_bidi(label)
def uts46_remap(domain: str, std3_rules: bool = True, transitional: bool = False) -> str:
    """Re-map the characters in the string according to UTS46 processing.

    Each character is looked up in the `uts46data` table (directly by code point below 256,
    otherwise by bisection) and, depending on the row's status, kept, replaced, dropped
    (status "I") or rejected. The result is returned in NFC form.

    Args:
        domain: The string to re-map.
        std3_rules: When `False`, rows with status "3" are kept or replaced instead of rejected.
        transitional: When `True`, rows with status "D" are replaced instead of kept.

    Raises:
        InvalidCodepoint: if a character is not allowed or cannot be looked up.
    """
    from .uts46data import uts46data

    pieces = []

    for pos, char in enumerate(domain):
        code_point = ord(char)
        try:
            if code_point < 256:
                row_index = code_point
            else:
                row_index = bisect.bisect_left(uts46data, (code_point, "Z")) - 1
            uts46row = uts46data[row_index]
            status = uts46row[1]
            replacement: Optional[str] = uts46row[2] if len(uts46row) == 3 else None

            if (
                status == "V"
                or (status == "D" and not transitional)
                or (status == "3" and not std3_rules and replacement is None)
            ):
                pieces.append(char)
            elif replacement is not None and (
                status == "M" or (status == "3" and not std3_rules) or (status == "D" and transitional)
            ):
                pieces.append(replacement)
            elif status != "I":
                # Funnel "not allowed" into the same handler as a failed table lookup.
                raise IndexError()
        except IndexError:
            raise InvalidCodepoint(
                "Codepoint {} not allowed at position {} in {}".format(_unot(code_point), pos + 1, repr(domain))
            )

    return unicodedata.normalize("NFC", "".join(pieces))
def decode(
    s: Union[str, bytes, bytearray],
    strict: bool = False,
    uts46: bool = False,
    std3_rules: bool = False,
) -> str:
    """Decode a domain name label by label using `ulabel`.

    Args:
        s: The domain to decode; non-`str` input is first decoded as ASCII.
        strict: When `True`, split labels only on "."; otherwise split on every dot
            character matched by `_unicode_dots_re`.
        uts46: When `True`, pre-process the input with `uts46_remap` (non-transitional).
        std3_rules: Forwarded to `uts46_remap`.

    Returns:
        The decoded labels joined with "."; a trailing dot in the input is preserved.

    Raises:
        IDNAError: if the input is not valid ASCII, the domain is empty, or a label
            decodes to an empty string.
    """
    if not isinstance(s, str):
        try:
            s = str(s, "ascii")
        except UnicodeDecodeError:
            raise IDNAError("Invalid ASCII in A-label")
    if uts46:
        s = uts46_remap(s, std3_rules, False)

    labels = s.split(".") if strict else _unicode_dots_re.split(s)
    if not labels or labels == [""]:
        raise IDNAError("Empty domain")

    # An empty last label means the input ended with a dot.
    has_trailing_dot = not labels[-1]
    if has_trailing_dot:
        del labels[-1]

    decoded = []
    for label in labels:
        text = ulabel(label)
        if not text:
            raise IDNAError("Empty label")
        decoded.append(text)
    if has_trailing_dot:
        decoded.append("")
    return ".".join(decoded)
0x1DBF00001DC0, + 0x1F0000001F16, + 0x1F1800001F1E, + 0x1F2000001F46, + 0x1F4800001F4E, + 0x1F5000001F58, + 0x1F5900001F5A, + 0x1F5B00001F5C, + 0x1F5D00001F5E, + 0x1F5F00001F7E, + 0x1F8000001FB5, + 0x1FB600001FC5, + 0x1FC600001FD4, + 0x1FD600001FDC, + 0x1FDD00001FF0, + 0x1FF200001FF5, + 0x1FF600001FFF, + 0x212600002127, + 0xAB650000AB66, + 0x101400001018F, + 0x101A0000101A1, + 0x1D2000001D246, + ), + "Han": ( + 0x2E8000002E9A, + 0x2E9B00002EF4, + 0x2F0000002FD6, + 0x300500003006, + 0x300700003008, + 0x30210000302A, + 0x30380000303C, + 0x340000004DC0, + 0x4E000000A000, + 0xF9000000FA6E, + 0xFA700000FADA, + 0x16FE200016FE4, + 0x16FF000016FF2, + 0x200000002A6E0, + 0x2A7000002B73A, + 0x2B7400002B81E, + 0x2B8200002CEA2, + 0x2CEB00002EBE1, + 0x2EBF00002EE5E, + 0x2F8000002FA1E, + 0x300000003134B, + 0x31350000323B0, + ), + "Hebrew": ( + 0x591000005C8, + 0x5D0000005EB, + 0x5EF000005F5, + 0xFB1D0000FB37, + 0xFB380000FB3D, + 0xFB3E0000FB3F, + 0xFB400000FB42, + 0xFB430000FB45, + 0xFB460000FB50, + ), + "Hiragana": ( + 0x304100003097, + 0x309D000030A0, + 0x1B0010001B120, + 0x1B1320001B133, + 0x1B1500001B153, + 0x1F2000001F201, + ), + "Katakana": ( + 0x30A1000030FB, + 0x30FD00003100, + 0x31F000003200, + 0x32D0000032FF, + 0x330000003358, + 0xFF660000FF70, + 0xFF710000FF9E, + 0x1AFF00001AFF4, + 0x1AFF50001AFFC, + 0x1AFFD0001AFFF, + 0x1B0000001B001, + 0x1B1200001B123, + 0x1B1550001B156, + 0x1B1640001B168, + ), +} +joining_types = { + 0xAD: 84, + 0x300: 84, + 0x301: 84, + 0x302: 84, + 0x303: 84, + 0x304: 84, + 0x305: 84, + 0x306: 84, + 0x307: 84, + 0x308: 84, + 0x309: 84, + 0x30A: 84, + 0x30B: 84, + 0x30C: 84, + 0x30D: 84, + 0x30E: 84, + 0x30F: 84, + 0x310: 84, + 0x311: 84, + 0x312: 84, + 0x313: 84, + 0x314: 84, + 0x315: 84, + 0x316: 84, + 0x317: 84, + 0x318: 84, + 0x319: 84, + 0x31A: 84, + 0x31B: 84, + 0x31C: 84, + 0x31D: 84, + 0x31E: 84, + 0x31F: 84, + 0x320: 84, + 0x321: 84, + 0x322: 84, + 0x323: 84, + 0x324: 84, + 0x325: 84, + 0x326: 84, + 0x327: 84, + 0x328: 84, + 0x329: 84, + 
0x32A: 84, + 0x32B: 84, + 0x32C: 84, + 0x32D: 84, + 0x32E: 84, + 0x32F: 84, + 0x330: 84, + 0x331: 84, + 0x332: 84, + 0x333: 84, + 0x334: 84, + 0x335: 84, + 0x336: 84, + 0x337: 84, + 0x338: 84, + 0x339: 84, + 0x33A: 84, + 0x33B: 84, + 0x33C: 84, + 0x33D: 84, + 0x33E: 84, + 0x33F: 84, + 0x340: 84, + 0x341: 84, + 0x342: 84, + 0x343: 84, + 0x344: 84, + 0x345: 84, + 0x346: 84, + 0x347: 84, + 0x348: 84, + 0x349: 84, + 0x34A: 84, + 0x34B: 84, + 0x34C: 84, + 0x34D: 84, + 0x34E: 84, + 0x34F: 84, + 0x350: 84, + 0x351: 84, + 0x352: 84, + 0x353: 84, + 0x354: 84, + 0x355: 84, + 0x356: 84, + 0x357: 84, + 0x358: 84, + 0x359: 84, + 0x35A: 84, + 0x35B: 84, + 0x35C: 84, + 0x35D: 84, + 0x35E: 84, + 0x35F: 84, + 0x360: 84, + 0x361: 84, + 0x362: 84, + 0x363: 84, + 0x364: 84, + 0x365: 84, + 0x366: 84, + 0x367: 84, + 0x368: 84, + 0x369: 84, + 0x36A: 84, + 0x36B: 84, + 0x36C: 84, + 0x36D: 84, + 0x36E: 84, + 0x36F: 84, + 0x483: 84, + 0x484: 84, + 0x485: 84, + 0x486: 84, + 0x487: 84, + 0x488: 84, + 0x489: 84, + 0x591: 84, + 0x592: 84, + 0x593: 84, + 0x594: 84, + 0x595: 84, + 0x596: 84, + 0x597: 84, + 0x598: 84, + 0x599: 84, + 0x59A: 84, + 0x59B: 84, + 0x59C: 84, + 0x59D: 84, + 0x59E: 84, + 0x59F: 84, + 0x5A0: 84, + 0x5A1: 84, + 0x5A2: 84, + 0x5A3: 84, + 0x5A4: 84, + 0x5A5: 84, + 0x5A6: 84, + 0x5A7: 84, + 0x5A8: 84, + 0x5A9: 84, + 0x5AA: 84, + 0x5AB: 84, + 0x5AC: 84, + 0x5AD: 84, + 0x5AE: 84, + 0x5AF: 84, + 0x5B0: 84, + 0x5B1: 84, + 0x5B2: 84, + 0x5B3: 84, + 0x5B4: 84, + 0x5B5: 84, + 0x5B6: 84, + 0x5B7: 84, + 0x5B8: 84, + 0x5B9: 84, + 0x5BA: 84, + 0x5BB: 84, + 0x5BC: 84, + 0x5BD: 84, + 0x5BF: 84, + 0x5C1: 84, + 0x5C2: 84, + 0x5C4: 84, + 0x5C5: 84, + 0x5C7: 84, + 0x610: 84, + 0x611: 84, + 0x612: 84, + 0x613: 84, + 0x614: 84, + 0x615: 84, + 0x616: 84, + 0x617: 84, + 0x618: 84, + 0x619: 84, + 0x61A: 84, + 0x61C: 84, + 0x620: 68, + 0x622: 82, + 0x623: 82, + 0x624: 82, + 0x625: 82, + 0x626: 68, + 0x627: 82, + 0x628: 68, + 0x629: 82, + 0x62A: 68, + 0x62B: 68, + 0x62C: 68, + 0x62D: 68, + 0x62E: 68, 
+ 0x62F: 82, + 0x630: 82, + 0x631: 82, + 0x632: 82, + 0x633: 68, + 0x634: 68, + 0x635: 68, + 0x636: 68, + 0x637: 68, + 0x638: 68, + 0x639: 68, + 0x63A: 68, + 0x63B: 68, + 0x63C: 68, + 0x63D: 68, + 0x63E: 68, + 0x63F: 68, + 0x640: 67, + 0x641: 68, + 0x642: 68, + 0x643: 68, + 0x644: 68, + 0x645: 68, + 0x646: 68, + 0x647: 68, + 0x648: 82, + 0x649: 68, + 0x64A: 68, + 0x64B: 84, + 0x64C: 84, + 0x64D: 84, + 0x64E: 84, + 0x64F: 84, + 0x650: 84, + 0x651: 84, + 0x652: 84, + 0x653: 84, + 0x654: 84, + 0x655: 84, + 0x656: 84, + 0x657: 84, + 0x658: 84, + 0x659: 84, + 0x65A: 84, + 0x65B: 84, + 0x65C: 84, + 0x65D: 84, + 0x65E: 84, + 0x65F: 84, + 0x66E: 68, + 0x66F: 68, + 0x670: 84, + 0x671: 82, + 0x672: 82, + 0x673: 82, + 0x675: 82, + 0x676: 82, + 0x677: 82, + 0x678: 68, + 0x679: 68, + 0x67A: 68, + 0x67B: 68, + 0x67C: 68, + 0x67D: 68, + 0x67E: 68, + 0x67F: 68, + 0x680: 68, + 0x681: 68, + 0x682: 68, + 0x683: 68, + 0x684: 68, + 0x685: 68, + 0x686: 68, + 0x687: 68, + 0x688: 82, + 0x689: 82, + 0x68A: 82, + 0x68B: 82, + 0x68C: 82, + 0x68D: 82, + 0x68E: 82, + 0x68F: 82, + 0x690: 82, + 0x691: 82, + 0x692: 82, + 0x693: 82, + 0x694: 82, + 0x695: 82, + 0x696: 82, + 0x697: 82, + 0x698: 82, + 0x699: 82, + 0x69A: 68, + 0x69B: 68, + 0x69C: 68, + 0x69D: 68, + 0x69E: 68, + 0x69F: 68, + 0x6A0: 68, + 0x6A1: 68, + 0x6A2: 68, + 0x6A3: 68, + 0x6A4: 68, + 0x6A5: 68, + 0x6A6: 68, + 0x6A7: 68, + 0x6A8: 68, + 0x6A9: 68, + 0x6AA: 68, + 0x6AB: 68, + 0x6AC: 68, + 0x6AD: 68, + 0x6AE: 68, + 0x6AF: 68, + 0x6B0: 68, + 0x6B1: 68, + 0x6B2: 68, + 0x6B3: 68, + 0x6B4: 68, + 0x6B5: 68, + 0x6B6: 68, + 0x6B7: 68, + 0x6B8: 68, + 0x6B9: 68, + 0x6BA: 68, + 0x6BB: 68, + 0x6BC: 68, + 0x6BD: 68, + 0x6BE: 68, + 0x6BF: 68, + 0x6C0: 82, + 0x6C1: 68, + 0x6C2: 68, + 0x6C3: 82, + 0x6C4: 82, + 0x6C5: 82, + 0x6C6: 82, + 0x6C7: 82, + 0x6C8: 82, + 0x6C9: 82, + 0x6CA: 82, + 0x6CB: 82, + 0x6CC: 68, + 0x6CD: 82, + 0x6CE: 68, + 0x6CF: 82, + 0x6D0: 68, + 0x6D1: 68, + 0x6D2: 82, + 0x6D3: 82, + 0x6D5: 82, + 0x6D6: 84, + 0x6D7: 84, + 0x6D8: 
84, + 0x6D9: 84, + 0x6DA: 84, + 0x6DB: 84, + 0x6DC: 84, + 0x6DF: 84, + 0x6E0: 84, + 0x6E1: 84, + 0x6E2: 84, + 0x6E3: 84, + 0x6E4: 84, + 0x6E7: 84, + 0x6E8: 84, + 0x6EA: 84, + 0x6EB: 84, + 0x6EC: 84, + 0x6ED: 84, + 0x6EE: 82, + 0x6EF: 82, + 0x6FA: 68, + 0x6FB: 68, + 0x6FC: 68, + 0x6FF: 68, + 0x70F: 84, + 0x710: 82, + 0x711: 84, + 0x712: 68, + 0x713: 68, + 0x714: 68, + 0x715: 82, + 0x716: 82, + 0x717: 82, + 0x718: 82, + 0x719: 82, + 0x71A: 68, + 0x71B: 68, + 0x71C: 68, + 0x71D: 68, + 0x71E: 82, + 0x71F: 68, + 0x720: 68, + 0x721: 68, + 0x722: 68, + 0x723: 68, + 0x724: 68, + 0x725: 68, + 0x726: 68, + 0x727: 68, + 0x728: 82, + 0x729: 68, + 0x72A: 82, + 0x72B: 68, + 0x72C: 82, + 0x72D: 68, + 0x72E: 68, + 0x72F: 82, + 0x730: 84, + 0x731: 84, + 0x732: 84, + 0x733: 84, + 0x734: 84, + 0x735: 84, + 0x736: 84, + 0x737: 84, + 0x738: 84, + 0x739: 84, + 0x73A: 84, + 0x73B: 84, + 0x73C: 84, + 0x73D: 84, + 0x73E: 84, + 0x73F: 84, + 0x740: 84, + 0x741: 84, + 0x742: 84, + 0x743: 84, + 0x744: 84, + 0x745: 84, + 0x746: 84, + 0x747: 84, + 0x748: 84, + 0x749: 84, + 0x74A: 84, + 0x74D: 82, + 0x74E: 68, + 0x74F: 68, + 0x750: 68, + 0x751: 68, + 0x752: 68, + 0x753: 68, + 0x754: 68, + 0x755: 68, + 0x756: 68, + 0x757: 68, + 0x758: 68, + 0x759: 82, + 0x75A: 82, + 0x75B: 82, + 0x75C: 68, + 0x75D: 68, + 0x75E: 68, + 0x75F: 68, + 0x760: 68, + 0x761: 68, + 0x762: 68, + 0x763: 68, + 0x764: 68, + 0x765: 68, + 0x766: 68, + 0x767: 68, + 0x768: 68, + 0x769: 68, + 0x76A: 68, + 0x76B: 82, + 0x76C: 82, + 0x76D: 68, + 0x76E: 68, + 0x76F: 68, + 0x770: 68, + 0x771: 82, + 0x772: 68, + 0x773: 82, + 0x774: 82, + 0x775: 68, + 0x776: 68, + 0x777: 68, + 0x778: 82, + 0x779: 82, + 0x77A: 68, + 0x77B: 68, + 0x77C: 68, + 0x77D: 68, + 0x77E: 68, + 0x77F: 68, + 0x7A6: 84, + 0x7A7: 84, + 0x7A8: 84, + 0x7A9: 84, + 0x7AA: 84, + 0x7AB: 84, + 0x7AC: 84, + 0x7AD: 84, + 0x7AE: 84, + 0x7AF: 84, + 0x7B0: 84, + 0x7CA: 68, + 0x7CB: 68, + 0x7CC: 68, + 0x7CD: 68, + 0x7CE: 68, + 0x7CF: 68, + 0x7D0: 68, + 0x7D1: 68, + 0x7D2: 68, + 
0x7D3: 68, + 0x7D4: 68, + 0x7D5: 68, + 0x7D6: 68, + 0x7D7: 68, + 0x7D8: 68, + 0x7D9: 68, + 0x7DA: 68, + 0x7DB: 68, + 0x7DC: 68, + 0x7DD: 68, + 0x7DE: 68, + 0x7DF: 68, + 0x7E0: 68, + 0x7E1: 68, + 0x7E2: 68, + 0x7E3: 68, + 0x7E4: 68, + 0x7E5: 68, + 0x7E6: 68, + 0x7E7: 68, + 0x7E8: 68, + 0x7E9: 68, + 0x7EA: 68, + 0x7EB: 84, + 0x7EC: 84, + 0x7ED: 84, + 0x7EE: 84, + 0x7EF: 84, + 0x7F0: 84, + 0x7F1: 84, + 0x7F2: 84, + 0x7F3: 84, + 0x7FA: 67, + 0x7FD: 84, + 0x816: 84, + 0x817: 84, + 0x818: 84, + 0x819: 84, + 0x81B: 84, + 0x81C: 84, + 0x81D: 84, + 0x81E: 84, + 0x81F: 84, + 0x820: 84, + 0x821: 84, + 0x822: 84, + 0x823: 84, + 0x825: 84, + 0x826: 84, + 0x827: 84, + 0x829: 84, + 0x82A: 84, + 0x82B: 84, + 0x82C: 84, + 0x82D: 84, + 0x840: 82, + 0x841: 68, + 0x842: 68, + 0x843: 68, + 0x844: 68, + 0x845: 68, + 0x846: 82, + 0x847: 82, + 0x848: 68, + 0x849: 82, + 0x84A: 68, + 0x84B: 68, + 0x84C: 68, + 0x84D: 68, + 0x84E: 68, + 0x84F: 68, + 0x850: 68, + 0x851: 68, + 0x852: 68, + 0x853: 68, + 0x854: 82, + 0x855: 68, + 0x856: 82, + 0x857: 82, + 0x858: 82, + 0x859: 84, + 0x85A: 84, + 0x85B: 84, + 0x860: 68, + 0x862: 68, + 0x863: 68, + 0x864: 68, + 0x865: 68, + 0x867: 82, + 0x868: 68, + 0x869: 82, + 0x86A: 82, + 0x870: 82, + 0x871: 82, + 0x872: 82, + 0x873: 82, + 0x874: 82, + 0x875: 82, + 0x876: 82, + 0x877: 82, + 0x878: 82, + 0x879: 82, + 0x87A: 82, + 0x87B: 82, + 0x87C: 82, + 0x87D: 82, + 0x87E: 82, + 0x87F: 82, + 0x880: 82, + 0x881: 82, + 0x882: 82, + 0x883: 67, + 0x884: 67, + 0x885: 67, + 0x886: 68, + 0x889: 68, + 0x88A: 68, + 0x88B: 68, + 0x88C: 68, + 0x88D: 68, + 0x88E: 82, + 0x897: 84, + 0x898: 84, + 0x899: 84, + 0x89A: 84, + 0x89B: 84, + 0x89C: 84, + 0x89D: 84, + 0x89E: 84, + 0x89F: 84, + 0x8A0: 68, + 0x8A1: 68, + 0x8A2: 68, + 0x8A3: 68, + 0x8A4: 68, + 0x8A5: 68, + 0x8A6: 68, + 0x8A7: 68, + 0x8A8: 68, + 0x8A9: 68, + 0x8AA: 82, + 0x8AB: 82, + 0x8AC: 82, + 0x8AE: 82, + 0x8AF: 68, + 0x8B0: 68, + 0x8B1: 82, + 0x8B2: 82, + 0x8B3: 68, + 0x8B4: 68, + 0x8B5: 68, + 0x8B6: 68, + 0x8B7: 68, 
+ 0x8B8: 68, + 0x8B9: 82, + 0x8BA: 68, + 0x8BB: 68, + 0x8BC: 68, + 0x8BD: 68, + 0x8BE: 68, + 0x8BF: 68, + 0x8C0: 68, + 0x8C1: 68, + 0x8C2: 68, + 0x8C3: 68, + 0x8C4: 68, + 0x8C5: 68, + 0x8C6: 68, + 0x8C7: 68, + 0x8C8: 68, + 0x8CA: 84, + 0x8CB: 84, + 0x8CC: 84, + 0x8CD: 84, + 0x8CE: 84, + 0x8CF: 84, + 0x8D0: 84, + 0x8D1: 84, + 0x8D2: 84, + 0x8D3: 84, + 0x8D4: 84, + 0x8D5: 84, + 0x8D6: 84, + 0x8D7: 84, + 0x8D8: 84, + 0x8D9: 84, + 0x8DA: 84, + 0x8DB: 84, + 0x8DC: 84, + 0x8DD: 84, + 0x8DE: 84, + 0x8DF: 84, + 0x8E0: 84, + 0x8E1: 84, + 0x8E3: 84, + 0x8E4: 84, + 0x8E5: 84, + 0x8E6: 84, + 0x8E7: 84, + 0x8E8: 84, + 0x8E9: 84, + 0x8EA: 84, + 0x8EB: 84, + 0x8EC: 84, + 0x8ED: 84, + 0x8EE: 84, + 0x8EF: 84, + 0x8F0: 84, + 0x8F1: 84, + 0x8F2: 84, + 0x8F3: 84, + 0x8F4: 84, + 0x8F5: 84, + 0x8F6: 84, + 0x8F7: 84, + 0x8F8: 84, + 0x8F9: 84, + 0x8FA: 84, + 0x8FB: 84, + 0x8FC: 84, + 0x8FD: 84, + 0x8FE: 84, + 0x8FF: 84, + 0x900: 84, + 0x901: 84, + 0x902: 84, + 0x93A: 84, + 0x93C: 84, + 0x941: 84, + 0x942: 84, + 0x943: 84, + 0x944: 84, + 0x945: 84, + 0x946: 84, + 0x947: 84, + 0x948: 84, + 0x94D: 84, + 0x951: 84, + 0x952: 84, + 0x953: 84, + 0x954: 84, + 0x955: 84, + 0x956: 84, + 0x957: 84, + 0x962: 84, + 0x963: 84, + 0x981: 84, + 0x9BC: 84, + 0x9C1: 84, + 0x9C2: 84, + 0x9C3: 84, + 0x9C4: 84, + 0x9CD: 84, + 0x9E2: 84, + 0x9E3: 84, + 0x9FE: 84, + 0xA01: 84, + 0xA02: 84, + 0xA3C: 84, + 0xA41: 84, + 0xA42: 84, + 0xA47: 84, + 0xA48: 84, + 0xA4B: 84, + 0xA4C: 84, + 0xA4D: 84, + 0xA51: 84, + 0xA70: 84, + 0xA71: 84, + 0xA75: 84, + 0xA81: 84, + 0xA82: 84, + 0xABC: 84, + 0xAC1: 84, + 0xAC2: 84, + 0xAC3: 84, + 0xAC4: 84, + 0xAC5: 84, + 0xAC7: 84, + 0xAC8: 84, + 0xACD: 84, + 0xAE2: 84, + 0xAE3: 84, + 0xAFA: 84, + 0xAFB: 84, + 0xAFC: 84, + 0xAFD: 84, + 0xAFE: 84, + 0xAFF: 84, + 0xB01: 84, + 0xB3C: 84, + 0xB3F: 84, + 0xB41: 84, + 0xB42: 84, + 0xB43: 84, + 0xB44: 84, + 0xB4D: 84, + 0xB55: 84, + 0xB56: 84, + 0xB62: 84, + 0xB63: 84, + 0xB82: 84, + 0xBC0: 84, + 0xBCD: 84, + 0xC00: 84, + 0xC04: 84, + 0xC3C: 
84, + 0xC3E: 84, + 0xC3F: 84, + 0xC40: 84, + 0xC46: 84, + 0xC47: 84, + 0xC48: 84, + 0xC4A: 84, + 0xC4B: 84, + 0xC4C: 84, + 0xC4D: 84, + 0xC55: 84, + 0xC56: 84, + 0xC62: 84, + 0xC63: 84, + 0xC81: 84, + 0xCBC: 84, + 0xCBF: 84, + 0xCC6: 84, + 0xCCC: 84, + 0xCCD: 84, + 0xCE2: 84, + 0xCE3: 84, + 0xD00: 84, + 0xD01: 84, + 0xD3B: 84, + 0xD3C: 84, + 0xD41: 84, + 0xD42: 84, + 0xD43: 84, + 0xD44: 84, + 0xD4D: 84, + 0xD62: 84, + 0xD63: 84, + 0xD81: 84, + 0xDCA: 84, + 0xDD2: 84, + 0xDD3: 84, + 0xDD4: 84, + 0xDD6: 84, + 0xE31: 84, + 0xE34: 84, + 0xE35: 84, + 0xE36: 84, + 0xE37: 84, + 0xE38: 84, + 0xE39: 84, + 0xE3A: 84, + 0xE47: 84, + 0xE48: 84, + 0xE49: 84, + 0xE4A: 84, + 0xE4B: 84, + 0xE4C: 84, + 0xE4D: 84, + 0xE4E: 84, + 0xEB1: 84, + 0xEB4: 84, + 0xEB5: 84, + 0xEB6: 84, + 0xEB7: 84, + 0xEB8: 84, + 0xEB9: 84, + 0xEBA: 84, + 0xEBB: 84, + 0xEBC: 84, + 0xEC8: 84, + 0xEC9: 84, + 0xECA: 84, + 0xECB: 84, + 0xECC: 84, + 0xECD: 84, + 0xECE: 84, + 0xF18: 84, + 0xF19: 84, + 0xF35: 84, + 0xF37: 84, + 0xF39: 84, + 0xF71: 84, + 0xF72: 84, + 0xF73: 84, + 0xF74: 84, + 0xF75: 84, + 0xF76: 84, + 0xF77: 84, + 0xF78: 84, + 0xF79: 84, + 0xF7A: 84, + 0xF7B: 84, + 0xF7C: 84, + 0xF7D: 84, + 0xF7E: 84, + 0xF80: 84, + 0xF81: 84, + 0xF82: 84, + 0xF83: 84, + 0xF84: 84, + 0xF86: 84, + 0xF87: 84, + 0xF8D: 84, + 0xF8E: 84, + 0xF8F: 84, + 0xF90: 84, + 0xF91: 84, + 0xF92: 84, + 0xF93: 84, + 0xF94: 84, + 0xF95: 84, + 0xF96: 84, + 0xF97: 84, + 0xF99: 84, + 0xF9A: 84, + 0xF9B: 84, + 0xF9C: 84, + 0xF9D: 84, + 0xF9E: 84, + 0xF9F: 84, + 0xFA0: 84, + 0xFA1: 84, + 0xFA2: 84, + 0xFA3: 84, + 0xFA4: 84, + 0xFA5: 84, + 0xFA6: 84, + 0xFA7: 84, + 0xFA8: 84, + 0xFA9: 84, + 0xFAA: 84, + 0xFAB: 84, + 0xFAC: 84, + 0xFAD: 84, + 0xFAE: 84, + 0xFAF: 84, + 0xFB0: 84, + 0xFB1: 84, + 0xFB2: 84, + 0xFB3: 84, + 0xFB4: 84, + 0xFB5: 84, + 0xFB6: 84, + 0xFB7: 84, + 0xFB8: 84, + 0xFB9: 84, + 0xFBA: 84, + 0xFBB: 84, + 0xFBC: 84, + 0xFC6: 84, + 0x102D: 84, + 0x102E: 84, + 0x102F: 84, + 0x1030: 84, + 0x1032: 84, + 0x1033: 84, + 0x1034: 84, 
+ 0x1035: 84, + 0x1036: 84, + 0x1037: 84, + 0x1039: 84, + 0x103A: 84, + 0x103D: 84, + 0x103E: 84, + 0x1058: 84, + 0x1059: 84, + 0x105E: 84, + 0x105F: 84, + 0x1060: 84, + 0x1071: 84, + 0x1072: 84, + 0x1073: 84, + 0x1074: 84, + 0x1082: 84, + 0x1085: 84, + 0x1086: 84, + 0x108D: 84, + 0x109D: 84, + 0x135D: 84, + 0x135E: 84, + 0x135F: 84, + 0x1712: 84, + 0x1713: 84, + 0x1714: 84, + 0x1732: 84, + 0x1733: 84, + 0x1752: 84, + 0x1753: 84, + 0x1772: 84, + 0x1773: 84, + 0x17B4: 84, + 0x17B5: 84, + 0x17B7: 84, + 0x17B8: 84, + 0x17B9: 84, + 0x17BA: 84, + 0x17BB: 84, + 0x17BC: 84, + 0x17BD: 84, + 0x17C6: 84, + 0x17C9: 84, + 0x17CA: 84, + 0x17CB: 84, + 0x17CC: 84, + 0x17CD: 84, + 0x17CE: 84, + 0x17CF: 84, + 0x17D0: 84, + 0x17D1: 84, + 0x17D2: 84, + 0x17D3: 84, + 0x17DD: 84, + 0x1807: 68, + 0x180A: 67, + 0x180B: 84, + 0x180C: 84, + 0x180D: 84, + 0x180F: 84, + 0x1820: 68, + 0x1821: 68, + 0x1822: 68, + 0x1823: 68, + 0x1824: 68, + 0x1825: 68, + 0x1826: 68, + 0x1827: 68, + 0x1828: 68, + 0x1829: 68, + 0x182A: 68, + 0x182B: 68, + 0x182C: 68, + 0x182D: 68, + 0x182E: 68, + 0x182F: 68, + 0x1830: 68, + 0x1831: 68, + 0x1832: 68, + 0x1833: 68, + 0x1834: 68, + 0x1835: 68, + 0x1836: 68, + 0x1837: 68, + 0x1838: 68, + 0x1839: 68, + 0x183A: 68, + 0x183B: 68, + 0x183C: 68, + 0x183D: 68, + 0x183E: 68, + 0x183F: 68, + 0x1840: 68, + 0x1841: 68, + 0x1842: 68, + 0x1843: 68, + 0x1844: 68, + 0x1845: 68, + 0x1846: 68, + 0x1847: 68, + 0x1848: 68, + 0x1849: 68, + 0x184A: 68, + 0x184B: 68, + 0x184C: 68, + 0x184D: 68, + 0x184E: 68, + 0x184F: 68, + 0x1850: 68, + 0x1851: 68, + 0x1852: 68, + 0x1853: 68, + 0x1854: 68, + 0x1855: 68, + 0x1856: 68, + 0x1857: 68, + 0x1858: 68, + 0x1859: 68, + 0x185A: 68, + 0x185B: 68, + 0x185C: 68, + 0x185D: 68, + 0x185E: 68, + 0x185F: 68, + 0x1860: 68, + 0x1861: 68, + 0x1862: 68, + 0x1863: 68, + 0x1864: 68, + 0x1865: 68, + 0x1866: 68, + 0x1867: 68, + 0x1868: 68, + 0x1869: 68, + 0x186A: 68, + 0x186B: 68, + 0x186C: 68, + 0x186D: 68, + 0x186E: 68, + 0x186F: 68, + 0x1870: 68, + 0x1871: 
68, + 0x1872: 68, + 0x1873: 68, + 0x1874: 68, + 0x1875: 68, + 0x1876: 68, + 0x1877: 68, + 0x1878: 68, + 0x1885: 84, + 0x1886: 84, + 0x1887: 68, + 0x1888: 68, + 0x1889: 68, + 0x188A: 68, + 0x188B: 68, + 0x188C: 68, + 0x188D: 68, + 0x188E: 68, + 0x188F: 68, + 0x1890: 68, + 0x1891: 68, + 0x1892: 68, + 0x1893: 68, + 0x1894: 68, + 0x1895: 68, + 0x1896: 68, + 0x1897: 68, + 0x1898: 68, + 0x1899: 68, + 0x189A: 68, + 0x189B: 68, + 0x189C: 68, + 0x189D: 68, + 0x189E: 68, + 0x189F: 68, + 0x18A0: 68, + 0x18A1: 68, + 0x18A2: 68, + 0x18A3: 68, + 0x18A4: 68, + 0x18A5: 68, + 0x18A6: 68, + 0x18A7: 68, + 0x18A8: 68, + 0x18A9: 84, + 0x18AA: 68, + 0x1920: 84, + 0x1921: 84, + 0x1922: 84, + 0x1927: 84, + 0x1928: 84, + 0x1932: 84, + 0x1939: 84, + 0x193A: 84, + 0x193B: 84, + 0x1A17: 84, + 0x1A18: 84, + 0x1A1B: 84, + 0x1A56: 84, + 0x1A58: 84, + 0x1A59: 84, + 0x1A5A: 84, + 0x1A5B: 84, + 0x1A5C: 84, + 0x1A5D: 84, + 0x1A5E: 84, + 0x1A60: 84, + 0x1A62: 84, + 0x1A65: 84, + 0x1A66: 84, + 0x1A67: 84, + 0x1A68: 84, + 0x1A69: 84, + 0x1A6A: 84, + 0x1A6B: 84, + 0x1A6C: 84, + 0x1A73: 84, + 0x1A74: 84, + 0x1A75: 84, + 0x1A76: 84, + 0x1A77: 84, + 0x1A78: 84, + 0x1A79: 84, + 0x1A7A: 84, + 0x1A7B: 84, + 0x1A7C: 84, + 0x1A7F: 84, + 0x1AB0: 84, + 0x1AB1: 84, + 0x1AB2: 84, + 0x1AB3: 84, + 0x1AB4: 84, + 0x1AB5: 84, + 0x1AB6: 84, + 0x1AB7: 84, + 0x1AB8: 84, + 0x1AB9: 84, + 0x1ABA: 84, + 0x1ABB: 84, + 0x1ABC: 84, + 0x1ABD: 84, + 0x1ABE: 84, + 0x1ABF: 84, + 0x1AC0: 84, + 0x1AC1: 84, + 0x1AC2: 84, + 0x1AC3: 84, + 0x1AC4: 84, + 0x1AC5: 84, + 0x1AC6: 84, + 0x1AC7: 84, + 0x1AC8: 84, + 0x1AC9: 84, + 0x1ACA: 84, + 0x1ACB: 84, + 0x1ACC: 84, + 0x1ACD: 84, + 0x1ACE: 84, + 0x1B00: 84, + 0x1B01: 84, + 0x1B02: 84, + 0x1B03: 84, + 0x1B34: 84, + 0x1B36: 84, + 0x1B37: 84, + 0x1B38: 84, + 0x1B39: 84, + 0x1B3A: 84, + 0x1B3C: 84, + 0x1B42: 84, + 0x1B6B: 84, + 0x1B6C: 84, + 0x1B6D: 84, + 0x1B6E: 84, + 0x1B6F: 84, + 0x1B70: 84, + 0x1B71: 84, + 0x1B72: 84, + 0x1B73: 84, + 0x1B80: 84, + 0x1B81: 84, + 0x1BA2: 84, + 0x1BA3: 84, + 
0x1BA4: 84, + 0x1BA5: 84, + 0x1BA8: 84, + 0x1BA9: 84, + 0x1BAB: 84, + 0x1BAC: 84, + 0x1BAD: 84, + 0x1BE6: 84, + 0x1BE8: 84, + 0x1BE9: 84, + 0x1BED: 84, + 0x1BEF: 84, + 0x1BF0: 84, + 0x1BF1: 84, + 0x1C2C: 84, + 0x1C2D: 84, + 0x1C2E: 84, + 0x1C2F: 84, + 0x1C30: 84, + 0x1C31: 84, + 0x1C32: 84, + 0x1C33: 84, + 0x1C36: 84, + 0x1C37: 84, + 0x1CD0: 84, + 0x1CD1: 84, + 0x1CD2: 84, + 0x1CD4: 84, + 0x1CD5: 84, + 0x1CD6: 84, + 0x1CD7: 84, + 0x1CD8: 84, + 0x1CD9: 84, + 0x1CDA: 84, + 0x1CDB: 84, + 0x1CDC: 84, + 0x1CDD: 84, + 0x1CDE: 84, + 0x1CDF: 84, + 0x1CE0: 84, + 0x1CE2: 84, + 0x1CE3: 84, + 0x1CE4: 84, + 0x1CE5: 84, + 0x1CE6: 84, + 0x1CE7: 84, + 0x1CE8: 84, + 0x1CED: 84, + 0x1CF4: 84, + 0x1CF8: 84, + 0x1CF9: 84, + 0x1DC0: 84, + 0x1DC1: 84, + 0x1DC2: 84, + 0x1DC3: 84, + 0x1DC4: 84, + 0x1DC5: 84, + 0x1DC6: 84, + 0x1DC7: 84, + 0x1DC8: 84, + 0x1DC9: 84, + 0x1DCA: 84, + 0x1DCB: 84, + 0x1DCC: 84, + 0x1DCD: 84, + 0x1DCE: 84, + 0x1DCF: 84, + 0x1DD0: 84, + 0x1DD1: 84, + 0x1DD2: 84, + 0x1DD3: 84, + 0x1DD4: 84, + 0x1DD5: 84, + 0x1DD6: 84, + 0x1DD7: 84, + 0x1DD8: 84, + 0x1DD9: 84, + 0x1DDA: 84, + 0x1DDB: 84, + 0x1DDC: 84, + 0x1DDD: 84, + 0x1DDE: 84, + 0x1DDF: 84, + 0x1DE0: 84, + 0x1DE1: 84, + 0x1DE2: 84, + 0x1DE3: 84, + 0x1DE4: 84, + 0x1DE5: 84, + 0x1DE6: 84, + 0x1DE7: 84, + 0x1DE8: 84, + 0x1DE9: 84, + 0x1DEA: 84, + 0x1DEB: 84, + 0x1DEC: 84, + 0x1DED: 84, + 0x1DEE: 84, + 0x1DEF: 84, + 0x1DF0: 84, + 0x1DF1: 84, + 0x1DF2: 84, + 0x1DF3: 84, + 0x1DF4: 84, + 0x1DF5: 84, + 0x1DF6: 84, + 0x1DF7: 84, + 0x1DF8: 84, + 0x1DF9: 84, + 0x1DFA: 84, + 0x1DFB: 84, + 0x1DFC: 84, + 0x1DFD: 84, + 0x1DFE: 84, + 0x1DFF: 84, + 0x200B: 84, + 0x200D: 67, + 0x200E: 84, + 0x200F: 84, + 0x202A: 84, + 0x202B: 84, + 0x202C: 84, + 0x202D: 84, + 0x202E: 84, + 0x2060: 84, + 0x2061: 84, + 0x2062: 84, + 0x2063: 84, + 0x2064: 84, + 0x206A: 84, + 0x206B: 84, + 0x206C: 84, + 0x206D: 84, + 0x206E: 84, + 0x206F: 84, + 0x20D0: 84, + 0x20D1: 84, + 0x20D2: 84, + 0x20D3: 84, + 0x20D4: 84, + 0x20D5: 84, + 0x20D6: 84, + 0x20D7: 84, 
+ 0x20D8: 84, + 0x20D9: 84, + 0x20DA: 84, + 0x20DB: 84, + 0x20DC: 84, + 0x20DD: 84, + 0x20DE: 84, + 0x20DF: 84, + 0x20E0: 84, + 0x20E1: 84, + 0x20E2: 84, + 0x20E3: 84, + 0x20E4: 84, + 0x20E5: 84, + 0x20E6: 84, + 0x20E7: 84, + 0x20E8: 84, + 0x20E9: 84, + 0x20EA: 84, + 0x20EB: 84, + 0x20EC: 84, + 0x20ED: 84, + 0x20EE: 84, + 0x20EF: 84, + 0x20F0: 84, + 0x2CEF: 84, + 0x2CF0: 84, + 0x2CF1: 84, + 0x2D7F: 84, + 0x2DE0: 84, + 0x2DE1: 84, + 0x2DE2: 84, + 0x2DE3: 84, + 0x2DE4: 84, + 0x2DE5: 84, + 0x2DE6: 84, + 0x2DE7: 84, + 0x2DE8: 84, + 0x2DE9: 84, + 0x2DEA: 84, + 0x2DEB: 84, + 0x2DEC: 84, + 0x2DED: 84, + 0x2DEE: 84, + 0x2DEF: 84, + 0x2DF0: 84, + 0x2DF1: 84, + 0x2DF2: 84, + 0x2DF3: 84, + 0x2DF4: 84, + 0x2DF5: 84, + 0x2DF6: 84, + 0x2DF7: 84, + 0x2DF8: 84, + 0x2DF9: 84, + 0x2DFA: 84, + 0x2DFB: 84, + 0x2DFC: 84, + 0x2DFD: 84, + 0x2DFE: 84, + 0x2DFF: 84, + 0x302A: 84, + 0x302B: 84, + 0x302C: 84, + 0x302D: 84, + 0x3099: 84, + 0x309A: 84, + 0xA66F: 84, + 0xA670: 84, + 0xA671: 84, + 0xA672: 84, + 0xA674: 84, + 0xA675: 84, + 0xA676: 84, + 0xA677: 84, + 0xA678: 84, + 0xA679: 84, + 0xA67A: 84, + 0xA67B: 84, + 0xA67C: 84, + 0xA67D: 84, + 0xA69E: 84, + 0xA69F: 84, + 0xA6F0: 84, + 0xA6F1: 84, + 0xA802: 84, + 0xA806: 84, + 0xA80B: 84, + 0xA825: 84, + 0xA826: 84, + 0xA82C: 84, + 0xA840: 68, + 0xA841: 68, + 0xA842: 68, + 0xA843: 68, + 0xA844: 68, + 0xA845: 68, + 0xA846: 68, + 0xA847: 68, + 0xA848: 68, + 0xA849: 68, + 0xA84A: 68, + 0xA84B: 68, + 0xA84C: 68, + 0xA84D: 68, + 0xA84E: 68, + 0xA84F: 68, + 0xA850: 68, + 0xA851: 68, + 0xA852: 68, + 0xA853: 68, + 0xA854: 68, + 0xA855: 68, + 0xA856: 68, + 0xA857: 68, + 0xA858: 68, + 0xA859: 68, + 0xA85A: 68, + 0xA85B: 68, + 0xA85C: 68, + 0xA85D: 68, + 0xA85E: 68, + 0xA85F: 68, + 0xA860: 68, + 0xA861: 68, + 0xA862: 68, + 0xA863: 68, + 0xA864: 68, + 0xA865: 68, + 0xA866: 68, + 0xA867: 68, + 0xA868: 68, + 0xA869: 68, + 0xA86A: 68, + 0xA86B: 68, + 0xA86C: 68, + 0xA86D: 68, + 0xA86E: 68, + 0xA86F: 68, + 0xA870: 68, + 0xA871: 68, + 0xA872: 76, + 0xA8C4: 
84, + 0xA8C5: 84, + 0xA8E0: 84, + 0xA8E1: 84, + 0xA8E2: 84, + 0xA8E3: 84, + 0xA8E4: 84, + 0xA8E5: 84, + 0xA8E6: 84, + 0xA8E7: 84, + 0xA8E8: 84, + 0xA8E9: 84, + 0xA8EA: 84, + 0xA8EB: 84, + 0xA8EC: 84, + 0xA8ED: 84, + 0xA8EE: 84, + 0xA8EF: 84, + 0xA8F0: 84, + 0xA8F1: 84, + 0xA8FF: 84, + 0xA926: 84, + 0xA927: 84, + 0xA928: 84, + 0xA929: 84, + 0xA92A: 84, + 0xA92B: 84, + 0xA92C: 84, + 0xA92D: 84, + 0xA947: 84, + 0xA948: 84, + 0xA949: 84, + 0xA94A: 84, + 0xA94B: 84, + 0xA94C: 84, + 0xA94D: 84, + 0xA94E: 84, + 0xA94F: 84, + 0xA950: 84, + 0xA951: 84, + 0xA980: 84, + 0xA981: 84, + 0xA982: 84, + 0xA9B3: 84, + 0xA9B6: 84, + 0xA9B7: 84, + 0xA9B8: 84, + 0xA9B9: 84, + 0xA9BC: 84, + 0xA9BD: 84, + 0xA9E5: 84, + 0xAA29: 84, + 0xAA2A: 84, + 0xAA2B: 84, + 0xAA2C: 84, + 0xAA2D: 84, + 0xAA2E: 84, + 0xAA31: 84, + 0xAA32: 84, + 0xAA35: 84, + 0xAA36: 84, + 0xAA43: 84, + 0xAA4C: 84, + 0xAA7C: 84, + 0xAAB0: 84, + 0xAAB2: 84, + 0xAAB3: 84, + 0xAAB4: 84, + 0xAAB7: 84, + 0xAAB8: 84, + 0xAABE: 84, + 0xAABF: 84, + 0xAAC1: 84, + 0xAAEC: 84, + 0xAAED: 84, + 0xAAF6: 84, + 0xABE5: 84, + 0xABE8: 84, + 0xABED: 84, + 0xFB1E: 84, + 0xFE00: 84, + 0xFE01: 84, + 0xFE02: 84, + 0xFE03: 84, + 0xFE04: 84, + 0xFE05: 84, + 0xFE06: 84, + 0xFE07: 84, + 0xFE08: 84, + 0xFE09: 84, + 0xFE0A: 84, + 0xFE0B: 84, + 0xFE0C: 84, + 0xFE0D: 84, + 0xFE0E: 84, + 0xFE0F: 84, + 0xFE20: 84, + 0xFE21: 84, + 0xFE22: 84, + 0xFE23: 84, + 0xFE24: 84, + 0xFE25: 84, + 0xFE26: 84, + 0xFE27: 84, + 0xFE28: 84, + 0xFE29: 84, + 0xFE2A: 84, + 0xFE2B: 84, + 0xFE2C: 84, + 0xFE2D: 84, + 0xFE2E: 84, + 0xFE2F: 84, + 0xFEFF: 84, + 0xFFF9: 84, + 0xFFFA: 84, + 0xFFFB: 84, + 0x101FD: 84, + 0x102E0: 84, + 0x10376: 84, + 0x10377: 84, + 0x10378: 84, + 0x10379: 84, + 0x1037A: 84, + 0x10A01: 84, + 0x10A02: 84, + 0x10A03: 84, + 0x10A05: 84, + 0x10A06: 84, + 0x10A0C: 84, + 0x10A0D: 84, + 0x10A0E: 84, + 0x10A0F: 84, + 0x10A38: 84, + 0x10A39: 84, + 0x10A3A: 84, + 0x10A3F: 84, + 0x10AC0: 68, + 0x10AC1: 68, + 0x10AC2: 68, + 0x10AC3: 68, + 0x10AC4: 68, + 0x10AC5: 
82, + 0x10AC7: 82, + 0x10AC9: 82, + 0x10ACA: 82, + 0x10ACD: 76, + 0x10ACE: 82, + 0x10ACF: 82, + 0x10AD0: 82, + 0x10AD1: 82, + 0x10AD2: 82, + 0x10AD3: 68, + 0x10AD4: 68, + 0x10AD5: 68, + 0x10AD6: 68, + 0x10AD7: 76, + 0x10AD8: 68, + 0x10AD9: 68, + 0x10ADA: 68, + 0x10ADB: 68, + 0x10ADC: 68, + 0x10ADD: 82, + 0x10ADE: 68, + 0x10ADF: 68, + 0x10AE0: 68, + 0x10AE1: 82, + 0x10AE4: 82, + 0x10AE5: 84, + 0x10AE6: 84, + 0x10AEB: 68, + 0x10AEC: 68, + 0x10AED: 68, + 0x10AEE: 68, + 0x10AEF: 82, + 0x10B80: 68, + 0x10B81: 82, + 0x10B82: 68, + 0x10B83: 82, + 0x10B84: 82, + 0x10B85: 82, + 0x10B86: 68, + 0x10B87: 68, + 0x10B88: 68, + 0x10B89: 82, + 0x10B8A: 68, + 0x10B8B: 68, + 0x10B8C: 82, + 0x10B8D: 68, + 0x10B8E: 82, + 0x10B8F: 82, + 0x10B90: 68, + 0x10B91: 82, + 0x10BA9: 82, + 0x10BAA: 82, + 0x10BAB: 82, + 0x10BAC: 82, + 0x10BAD: 68, + 0x10BAE: 68, + 0x10D00: 76, + 0x10D01: 68, + 0x10D02: 68, + 0x10D03: 68, + 0x10D04: 68, + 0x10D05: 68, + 0x10D06: 68, + 0x10D07: 68, + 0x10D08: 68, + 0x10D09: 68, + 0x10D0A: 68, + 0x10D0B: 68, + 0x10D0C: 68, + 0x10D0D: 68, + 0x10D0E: 68, + 0x10D0F: 68, + 0x10D10: 68, + 0x10D11: 68, + 0x10D12: 68, + 0x10D13: 68, + 0x10D14: 68, + 0x10D15: 68, + 0x10D16: 68, + 0x10D17: 68, + 0x10D18: 68, + 0x10D19: 68, + 0x10D1A: 68, + 0x10D1B: 68, + 0x10D1C: 68, + 0x10D1D: 68, + 0x10D1E: 68, + 0x10D1F: 68, + 0x10D20: 68, + 0x10D21: 68, + 0x10D22: 82, + 0x10D23: 68, + 0x10D24: 84, + 0x10D25: 84, + 0x10D26: 84, + 0x10D27: 84, + 0x10D69: 84, + 0x10D6A: 84, + 0x10D6B: 84, + 0x10D6C: 84, + 0x10D6D: 84, + 0x10EAB: 84, + 0x10EAC: 84, + 0x10EC2: 82, + 0x10EC3: 68, + 0x10EC4: 68, + 0x10EFC: 84, + 0x10EFD: 84, + 0x10EFE: 84, + 0x10EFF: 84, + 0x10F30: 68, + 0x10F31: 68, + 0x10F32: 68, + 0x10F33: 82, + 0x10F34: 68, + 0x10F35: 68, + 0x10F36: 68, + 0x10F37: 68, + 0x10F38: 68, + 0x10F39: 68, + 0x10F3A: 68, + 0x10F3B: 68, + 0x10F3C: 68, + 0x10F3D: 68, + 0x10F3E: 68, + 0x10F3F: 68, + 0x10F40: 68, + 0x10F41: 68, + 0x10F42: 68, + 0x10F43: 68, + 0x10F44: 68, + 0x10F46: 84, + 0x10F47: 84, 
+ 0x10F48: 84, + 0x10F49: 84, + 0x10F4A: 84, + 0x10F4B: 84, + 0x10F4C: 84, + 0x10F4D: 84, + 0x10F4E: 84, + 0x10F4F: 84, + 0x10F50: 84, + 0x10F51: 68, + 0x10F52: 68, + 0x10F53: 68, + 0x10F54: 82, + 0x10F70: 68, + 0x10F71: 68, + 0x10F72: 68, + 0x10F73: 68, + 0x10F74: 82, + 0x10F75: 82, + 0x10F76: 68, + 0x10F77: 68, + 0x10F78: 68, + 0x10F79: 68, + 0x10F7A: 68, + 0x10F7B: 68, + 0x10F7C: 68, + 0x10F7D: 68, + 0x10F7E: 68, + 0x10F7F: 68, + 0x10F80: 68, + 0x10F81: 68, + 0x10F82: 84, + 0x10F83: 84, + 0x10F84: 84, + 0x10F85: 84, + 0x10FB0: 68, + 0x10FB2: 68, + 0x10FB3: 68, + 0x10FB4: 82, + 0x10FB5: 82, + 0x10FB6: 82, + 0x10FB8: 68, + 0x10FB9: 82, + 0x10FBA: 82, + 0x10FBB: 68, + 0x10FBC: 68, + 0x10FBD: 82, + 0x10FBE: 68, + 0x10FBF: 68, + 0x10FC1: 68, + 0x10FC2: 82, + 0x10FC3: 82, + 0x10FC4: 68, + 0x10FC9: 82, + 0x10FCA: 68, + 0x10FCB: 76, + 0x11001: 84, + 0x11038: 84, + 0x11039: 84, + 0x1103A: 84, + 0x1103B: 84, + 0x1103C: 84, + 0x1103D: 84, + 0x1103E: 84, + 0x1103F: 84, + 0x11040: 84, + 0x11041: 84, + 0x11042: 84, + 0x11043: 84, + 0x11044: 84, + 0x11045: 84, + 0x11046: 84, + 0x11070: 84, + 0x11073: 84, + 0x11074: 84, + 0x1107F: 84, + 0x11080: 84, + 0x11081: 84, + 0x110B3: 84, + 0x110B4: 84, + 0x110B5: 84, + 0x110B6: 84, + 0x110B9: 84, + 0x110BA: 84, + 0x110C2: 84, + 0x11100: 84, + 0x11101: 84, + 0x11102: 84, + 0x11127: 84, + 0x11128: 84, + 0x11129: 84, + 0x1112A: 84, + 0x1112B: 84, + 0x1112D: 84, + 0x1112E: 84, + 0x1112F: 84, + 0x11130: 84, + 0x11131: 84, + 0x11132: 84, + 0x11133: 84, + 0x11134: 84, + 0x11173: 84, + 0x11180: 84, + 0x11181: 84, + 0x111B6: 84, + 0x111B7: 84, + 0x111B8: 84, + 0x111B9: 84, + 0x111BA: 84, + 0x111BB: 84, + 0x111BC: 84, + 0x111BD: 84, + 0x111BE: 84, + 0x111C9: 84, + 0x111CA: 84, + 0x111CB: 84, + 0x111CC: 84, + 0x111CF: 84, + 0x1122F: 84, + 0x11230: 84, + 0x11231: 84, + 0x11234: 84, + 0x11236: 84, + 0x11237: 84, + 0x1123E: 84, + 0x11241: 84, + 0x112DF: 84, + 0x112E3: 84, + 0x112E4: 84, + 0x112E5: 84, + 0x112E6: 84, + 0x112E7: 84, + 0x112E8: 84, + 
0x112E9: 84, + 0x112EA: 84, + 0x11300: 84, + 0x11301: 84, + 0x1133B: 84, + 0x1133C: 84, + 0x11340: 84, + 0x11366: 84, + 0x11367: 84, + 0x11368: 84, + 0x11369: 84, + 0x1136A: 84, + 0x1136B: 84, + 0x1136C: 84, + 0x11370: 84, + 0x11371: 84, + 0x11372: 84, + 0x11373: 84, + 0x11374: 84, + 0x113BB: 84, + 0x113BC: 84, + 0x113BD: 84, + 0x113BE: 84, + 0x113BF: 84, + 0x113C0: 84, + 0x113CE: 84, + 0x113D0: 84, + 0x113D2: 84, + 0x113E1: 84, + 0x113E2: 84, + 0x11438: 84, + 0x11439: 84, + 0x1143A: 84, + 0x1143B: 84, + 0x1143C: 84, + 0x1143D: 84, + 0x1143E: 84, + 0x1143F: 84, + 0x11442: 84, + 0x11443: 84, + 0x11444: 84, + 0x11446: 84, + 0x1145E: 84, + 0x114B3: 84, + 0x114B4: 84, + 0x114B5: 84, + 0x114B6: 84, + 0x114B7: 84, + 0x114B8: 84, + 0x114BA: 84, + 0x114BF: 84, + 0x114C0: 84, + 0x114C2: 84, + 0x114C3: 84, + 0x115B2: 84, + 0x115B3: 84, + 0x115B4: 84, + 0x115B5: 84, + 0x115BC: 84, + 0x115BD: 84, + 0x115BF: 84, + 0x115C0: 84, + 0x115DC: 84, + 0x115DD: 84, + 0x11633: 84, + 0x11634: 84, + 0x11635: 84, + 0x11636: 84, + 0x11637: 84, + 0x11638: 84, + 0x11639: 84, + 0x1163A: 84, + 0x1163D: 84, + 0x1163F: 84, + 0x11640: 84, + 0x116AB: 84, + 0x116AD: 84, + 0x116B0: 84, + 0x116B1: 84, + 0x116B2: 84, + 0x116B3: 84, + 0x116B4: 84, + 0x116B5: 84, + 0x116B7: 84, + 0x1171D: 84, + 0x1171F: 84, + 0x11722: 84, + 0x11723: 84, + 0x11724: 84, + 0x11725: 84, + 0x11727: 84, + 0x11728: 84, + 0x11729: 84, + 0x1172A: 84, + 0x1172B: 84, + 0x1182F: 84, + 0x11830: 84, + 0x11831: 84, + 0x11832: 84, + 0x11833: 84, + 0x11834: 84, + 0x11835: 84, + 0x11836: 84, + 0x11837: 84, + 0x11839: 84, + 0x1183A: 84, + 0x1193B: 84, + 0x1193C: 84, + 0x1193E: 84, + 0x11943: 84, + 0x119D4: 84, + 0x119D5: 84, + 0x119D6: 84, + 0x119D7: 84, + 0x119DA: 84, + 0x119DB: 84, + 0x119E0: 84, + 0x11A01: 84, + 0x11A02: 84, + 0x11A03: 84, + 0x11A04: 84, + 0x11A05: 84, + 0x11A06: 84, + 0x11A07: 84, + 0x11A08: 84, + 0x11A09: 84, + 0x11A0A: 84, + 0x11A33: 84, + 0x11A34: 84, + 0x11A35: 84, + 0x11A36: 84, + 0x11A37: 84, + 0x11A38: 84, + 
0x11A3B: 84, + 0x11A3C: 84, + 0x11A3D: 84, + 0x11A3E: 84, + 0x11A47: 84, + 0x11A51: 84, + 0x11A52: 84, + 0x11A53: 84, + 0x11A54: 84, + 0x11A55: 84, + 0x11A56: 84, + 0x11A59: 84, + 0x11A5A: 84, + 0x11A5B: 84, + 0x11A8A: 84, + 0x11A8B: 84, + 0x11A8C: 84, + 0x11A8D: 84, + 0x11A8E: 84, + 0x11A8F: 84, + 0x11A90: 84, + 0x11A91: 84, + 0x11A92: 84, + 0x11A93: 84, + 0x11A94: 84, + 0x11A95: 84, + 0x11A96: 84, + 0x11A98: 84, + 0x11A99: 84, + 0x11C30: 84, + 0x11C31: 84, + 0x11C32: 84, + 0x11C33: 84, + 0x11C34: 84, + 0x11C35: 84, + 0x11C36: 84, + 0x11C38: 84, + 0x11C39: 84, + 0x11C3A: 84, + 0x11C3B: 84, + 0x11C3C: 84, + 0x11C3D: 84, + 0x11C3F: 84, + 0x11C92: 84, + 0x11C93: 84, + 0x11C94: 84, + 0x11C95: 84, + 0x11C96: 84, + 0x11C97: 84, + 0x11C98: 84, + 0x11C99: 84, + 0x11C9A: 84, + 0x11C9B: 84, + 0x11C9C: 84, + 0x11C9D: 84, + 0x11C9E: 84, + 0x11C9F: 84, + 0x11CA0: 84, + 0x11CA1: 84, + 0x11CA2: 84, + 0x11CA3: 84, + 0x11CA4: 84, + 0x11CA5: 84, + 0x11CA6: 84, + 0x11CA7: 84, + 0x11CAA: 84, + 0x11CAB: 84, + 0x11CAC: 84, + 0x11CAD: 84, + 0x11CAE: 84, + 0x11CAF: 84, + 0x11CB0: 84, + 0x11CB2: 84, + 0x11CB3: 84, + 0x11CB5: 84, + 0x11CB6: 84, + 0x11D31: 84, + 0x11D32: 84, + 0x11D33: 84, + 0x11D34: 84, + 0x11D35: 84, + 0x11D36: 84, + 0x11D3A: 84, + 0x11D3C: 84, + 0x11D3D: 84, + 0x11D3F: 84, + 0x11D40: 84, + 0x11D41: 84, + 0x11D42: 84, + 0x11D43: 84, + 0x11D44: 84, + 0x11D45: 84, + 0x11D47: 84, + 0x11D90: 84, + 0x11D91: 84, + 0x11D95: 84, + 0x11D97: 84, + 0x11EF3: 84, + 0x11EF4: 84, + 0x11F00: 84, + 0x11F01: 84, + 0x11F36: 84, + 0x11F37: 84, + 0x11F38: 84, + 0x11F39: 84, + 0x11F3A: 84, + 0x11F40: 84, + 0x11F42: 84, + 0x11F5A: 84, + 0x13430: 84, + 0x13431: 84, + 0x13432: 84, + 0x13433: 84, + 0x13434: 84, + 0x13435: 84, + 0x13436: 84, + 0x13437: 84, + 0x13438: 84, + 0x13439: 84, + 0x1343A: 84, + 0x1343B: 84, + 0x1343C: 84, + 0x1343D: 84, + 0x1343E: 84, + 0x1343F: 84, + 0x13440: 84, + 0x13447: 84, + 0x13448: 84, + 0x13449: 84, + 0x1344A: 84, + 0x1344B: 84, + 0x1344C: 84, + 0x1344D: 84, + 
0x1344E: 84, + 0x1344F: 84, + 0x13450: 84, + 0x13451: 84, + 0x13452: 84, + 0x13453: 84, + 0x13454: 84, + 0x13455: 84, + 0x1611E: 84, + 0x1611F: 84, + 0x16120: 84, + 0x16121: 84, + 0x16122: 84, + 0x16123: 84, + 0x16124: 84, + 0x16125: 84, + 0x16126: 84, + 0x16127: 84, + 0x16128: 84, + 0x16129: 84, + 0x1612D: 84, + 0x1612E: 84, + 0x1612F: 84, + 0x16AF0: 84, + 0x16AF1: 84, + 0x16AF2: 84, + 0x16AF3: 84, + 0x16AF4: 84, + 0x16B30: 84, + 0x16B31: 84, + 0x16B32: 84, + 0x16B33: 84, + 0x16B34: 84, + 0x16B35: 84, + 0x16B36: 84, + 0x16F4F: 84, + 0x16F8F: 84, + 0x16F90: 84, + 0x16F91: 84, + 0x16F92: 84, + 0x16FE4: 84, + 0x1BC9D: 84, + 0x1BC9E: 84, + 0x1BCA0: 84, + 0x1BCA1: 84, + 0x1BCA2: 84, + 0x1BCA3: 84, + 0x1CF00: 84, + 0x1CF01: 84, + 0x1CF02: 84, + 0x1CF03: 84, + 0x1CF04: 84, + 0x1CF05: 84, + 0x1CF06: 84, + 0x1CF07: 84, + 0x1CF08: 84, + 0x1CF09: 84, + 0x1CF0A: 84, + 0x1CF0B: 84, + 0x1CF0C: 84, + 0x1CF0D: 84, + 0x1CF0E: 84, + 0x1CF0F: 84, + 0x1CF10: 84, + 0x1CF11: 84, + 0x1CF12: 84, + 0x1CF13: 84, + 0x1CF14: 84, + 0x1CF15: 84, + 0x1CF16: 84, + 0x1CF17: 84, + 0x1CF18: 84, + 0x1CF19: 84, + 0x1CF1A: 84, + 0x1CF1B: 84, + 0x1CF1C: 84, + 0x1CF1D: 84, + 0x1CF1E: 84, + 0x1CF1F: 84, + 0x1CF20: 84, + 0x1CF21: 84, + 0x1CF22: 84, + 0x1CF23: 84, + 0x1CF24: 84, + 0x1CF25: 84, + 0x1CF26: 84, + 0x1CF27: 84, + 0x1CF28: 84, + 0x1CF29: 84, + 0x1CF2A: 84, + 0x1CF2B: 84, + 0x1CF2C: 84, + 0x1CF2D: 84, + 0x1CF30: 84, + 0x1CF31: 84, + 0x1CF32: 84, + 0x1CF33: 84, + 0x1CF34: 84, + 0x1CF35: 84, + 0x1CF36: 84, + 0x1CF37: 84, + 0x1CF38: 84, + 0x1CF39: 84, + 0x1CF3A: 84, + 0x1CF3B: 84, + 0x1CF3C: 84, + 0x1CF3D: 84, + 0x1CF3E: 84, + 0x1CF3F: 84, + 0x1CF40: 84, + 0x1CF41: 84, + 0x1CF42: 84, + 0x1CF43: 84, + 0x1CF44: 84, + 0x1CF45: 84, + 0x1CF46: 84, + 0x1D167: 84, + 0x1D168: 84, + 0x1D169: 84, + 0x1D173: 84, + 0x1D174: 84, + 0x1D175: 84, + 0x1D176: 84, + 0x1D177: 84, + 0x1D178: 84, + 0x1D179: 84, + 0x1D17A: 84, + 0x1D17B: 84, + 0x1D17C: 84, + 0x1D17D: 84, + 0x1D17E: 84, + 0x1D17F: 84, + 0x1D180: 84, + 
0x1D181: 84, + 0x1D182: 84, + 0x1D185: 84, + 0x1D186: 84, + 0x1D187: 84, + 0x1D188: 84, + 0x1D189: 84, + 0x1D18A: 84, + 0x1D18B: 84, + 0x1D1AA: 84, + 0x1D1AB: 84, + 0x1D1AC: 84, + 0x1D1AD: 84, + 0x1D242: 84, + 0x1D243: 84, + 0x1D244: 84, + 0x1DA00: 84, + 0x1DA01: 84, + 0x1DA02: 84, + 0x1DA03: 84, + 0x1DA04: 84, + 0x1DA05: 84, + 0x1DA06: 84, + 0x1DA07: 84, + 0x1DA08: 84, + 0x1DA09: 84, + 0x1DA0A: 84, + 0x1DA0B: 84, + 0x1DA0C: 84, + 0x1DA0D: 84, + 0x1DA0E: 84, + 0x1DA0F: 84, + 0x1DA10: 84, + 0x1DA11: 84, + 0x1DA12: 84, + 0x1DA13: 84, + 0x1DA14: 84, + 0x1DA15: 84, + 0x1DA16: 84, + 0x1DA17: 84, + 0x1DA18: 84, + 0x1DA19: 84, + 0x1DA1A: 84, + 0x1DA1B: 84, + 0x1DA1C: 84, + 0x1DA1D: 84, + 0x1DA1E: 84, + 0x1DA1F: 84, + 0x1DA20: 84, + 0x1DA21: 84, + 0x1DA22: 84, + 0x1DA23: 84, + 0x1DA24: 84, + 0x1DA25: 84, + 0x1DA26: 84, + 0x1DA27: 84, + 0x1DA28: 84, + 0x1DA29: 84, + 0x1DA2A: 84, + 0x1DA2B: 84, + 0x1DA2C: 84, + 0x1DA2D: 84, + 0x1DA2E: 84, + 0x1DA2F: 84, + 0x1DA30: 84, + 0x1DA31: 84, + 0x1DA32: 84, + 0x1DA33: 84, + 0x1DA34: 84, + 0x1DA35: 84, + 0x1DA36: 84, + 0x1DA3B: 84, + 0x1DA3C: 84, + 0x1DA3D: 84, + 0x1DA3E: 84, + 0x1DA3F: 84, + 0x1DA40: 84, + 0x1DA41: 84, + 0x1DA42: 84, + 0x1DA43: 84, + 0x1DA44: 84, + 0x1DA45: 84, + 0x1DA46: 84, + 0x1DA47: 84, + 0x1DA48: 84, + 0x1DA49: 84, + 0x1DA4A: 84, + 0x1DA4B: 84, + 0x1DA4C: 84, + 0x1DA4D: 84, + 0x1DA4E: 84, + 0x1DA4F: 84, + 0x1DA50: 84, + 0x1DA51: 84, + 0x1DA52: 84, + 0x1DA53: 84, + 0x1DA54: 84, + 0x1DA55: 84, + 0x1DA56: 84, + 0x1DA57: 84, + 0x1DA58: 84, + 0x1DA59: 84, + 0x1DA5A: 84, + 0x1DA5B: 84, + 0x1DA5C: 84, + 0x1DA5D: 84, + 0x1DA5E: 84, + 0x1DA5F: 84, + 0x1DA60: 84, + 0x1DA61: 84, + 0x1DA62: 84, + 0x1DA63: 84, + 0x1DA64: 84, + 0x1DA65: 84, + 0x1DA66: 84, + 0x1DA67: 84, + 0x1DA68: 84, + 0x1DA69: 84, + 0x1DA6A: 84, + 0x1DA6B: 84, + 0x1DA6C: 84, + 0x1DA75: 84, + 0x1DA84: 84, + 0x1DA9B: 84, + 0x1DA9C: 84, + 0x1DA9D: 84, + 0x1DA9E: 84, + 0x1DA9F: 84, + 0x1DAA1: 84, + 0x1DAA2: 84, + 0x1DAA3: 84, + 0x1DAA4: 84, + 0x1DAA5: 84, + 
0x1DAA6: 84, + 0x1DAA7: 84, + 0x1DAA8: 84, + 0x1DAA9: 84, + 0x1DAAA: 84, + 0x1DAAB: 84, + 0x1DAAC: 84, + 0x1DAAD: 84, + 0x1DAAE: 84, + 0x1DAAF: 84, + 0x1E000: 84, + 0x1E001: 84, + 0x1E002: 84, + 0x1E003: 84, + 0x1E004: 84, + 0x1E005: 84, + 0x1E006: 84, + 0x1E008: 84, + 0x1E009: 84, + 0x1E00A: 84, + 0x1E00B: 84, + 0x1E00C: 84, + 0x1E00D: 84, + 0x1E00E: 84, + 0x1E00F: 84, + 0x1E010: 84, + 0x1E011: 84, + 0x1E012: 84, + 0x1E013: 84, + 0x1E014: 84, + 0x1E015: 84, + 0x1E016: 84, + 0x1E017: 84, + 0x1E018: 84, + 0x1E01B: 84, + 0x1E01C: 84, + 0x1E01D: 84, + 0x1E01E: 84, + 0x1E01F: 84, + 0x1E020: 84, + 0x1E021: 84, + 0x1E023: 84, + 0x1E024: 84, + 0x1E026: 84, + 0x1E027: 84, + 0x1E028: 84, + 0x1E029: 84, + 0x1E02A: 84, + 0x1E08F: 84, + 0x1E130: 84, + 0x1E131: 84, + 0x1E132: 84, + 0x1E133: 84, + 0x1E134: 84, + 0x1E135: 84, + 0x1E136: 84, + 0x1E2AE: 84, + 0x1E2EC: 84, + 0x1E2ED: 84, + 0x1E2EE: 84, + 0x1E2EF: 84, + 0x1E4EC: 84, + 0x1E4ED: 84, + 0x1E4EE: 84, + 0x1E4EF: 84, + 0x1E5EE: 84, + 0x1E5EF: 84, + 0x1E8D0: 84, + 0x1E8D1: 84, + 0x1E8D2: 84, + 0x1E8D3: 84, + 0x1E8D4: 84, + 0x1E8D5: 84, + 0x1E8D6: 84, + 0x1E900: 68, + 0x1E901: 68, + 0x1E902: 68, + 0x1E903: 68, + 0x1E904: 68, + 0x1E905: 68, + 0x1E906: 68, + 0x1E907: 68, + 0x1E908: 68, + 0x1E909: 68, + 0x1E90A: 68, + 0x1E90B: 68, + 0x1E90C: 68, + 0x1E90D: 68, + 0x1E90E: 68, + 0x1E90F: 68, + 0x1E910: 68, + 0x1E911: 68, + 0x1E912: 68, + 0x1E913: 68, + 0x1E914: 68, + 0x1E915: 68, + 0x1E916: 68, + 0x1E917: 68, + 0x1E918: 68, + 0x1E919: 68, + 0x1E91A: 68, + 0x1E91B: 68, + 0x1E91C: 68, + 0x1E91D: 68, + 0x1E91E: 68, + 0x1E91F: 68, + 0x1E920: 68, + 0x1E921: 68, + 0x1E922: 68, + 0x1E923: 68, + 0x1E924: 68, + 0x1E925: 68, + 0x1E926: 68, + 0x1E927: 68, + 0x1E928: 68, + 0x1E929: 68, + 0x1E92A: 68, + 0x1E92B: 68, + 0x1E92C: 68, + 0x1E92D: 68, + 0x1E92E: 68, + 0x1E92F: 68, + 0x1E930: 68, + 0x1E931: 68, + 0x1E932: 68, + 0x1E933: 68, + 0x1E934: 68, + 0x1E935: 68, + 0x1E936: 68, + 0x1E937: 68, + 0x1E938: 68, + 0x1E939: 68, + 0x1E93A: 68, + 
0x1E93B: 68, + 0x1E93C: 68, + 0x1E93D: 68, + 0x1E93E: 68, + 0x1E93F: 68, + 0x1E940: 68, + 0x1E941: 68, + 0x1E942: 68, + 0x1E943: 68, + 0x1E944: 84, + 0x1E945: 84, + 0x1E946: 84, + 0x1E947: 84, + 0x1E948: 84, + 0x1E949: 84, + 0x1E94A: 84, + 0x1E94B: 84, + 0xE0001: 84, + 0xE0020: 84, + 0xE0021: 84, + 0xE0022: 84, + 0xE0023: 84, + 0xE0024: 84, + 0xE0025: 84, + 0xE0026: 84, + 0xE0027: 84, + 0xE0028: 84, + 0xE0029: 84, + 0xE002A: 84, + 0xE002B: 84, + 0xE002C: 84, + 0xE002D: 84, + 0xE002E: 84, + 0xE002F: 84, + 0xE0030: 84, + 0xE0031: 84, + 0xE0032: 84, + 0xE0033: 84, + 0xE0034: 84, + 0xE0035: 84, + 0xE0036: 84, + 0xE0037: 84, + 0xE0038: 84, + 0xE0039: 84, + 0xE003A: 84, + 0xE003B: 84, + 0xE003C: 84, + 0xE003D: 84, + 0xE003E: 84, + 0xE003F: 84, + 0xE0040: 84, + 0xE0041: 84, + 0xE0042: 84, + 0xE0043: 84, + 0xE0044: 84, + 0xE0045: 84, + 0xE0046: 84, + 0xE0047: 84, + 0xE0048: 84, + 0xE0049: 84, + 0xE004A: 84, + 0xE004B: 84, + 0xE004C: 84, + 0xE004D: 84, + 0xE004E: 84, + 0xE004F: 84, + 0xE0050: 84, + 0xE0051: 84, + 0xE0052: 84, + 0xE0053: 84, + 0xE0054: 84, + 0xE0055: 84, + 0xE0056: 84, + 0xE0057: 84, + 0xE0058: 84, + 0xE0059: 84, + 0xE005A: 84, + 0xE005B: 84, + 0xE005C: 84, + 0xE005D: 84, + 0xE005E: 84, + 0xE005F: 84, + 0xE0060: 84, + 0xE0061: 84, + 0xE0062: 84, + 0xE0063: 84, + 0xE0064: 84, + 0xE0065: 84, + 0xE0066: 84, + 0xE0067: 84, + 0xE0068: 84, + 0xE0069: 84, + 0xE006A: 84, + 0xE006B: 84, + 0xE006C: 84, + 0xE006D: 84, + 0xE006E: 84, + 0xE006F: 84, + 0xE0070: 84, + 0xE0071: 84, + 0xE0072: 84, + 0xE0073: 84, + 0xE0074: 84, + 0xE0075: 84, + 0xE0076: 84, + 0xE0077: 84, + 0xE0078: 84, + 0xE0079: 84, + 0xE007A: 84, + 0xE007B: 84, + 0xE007C: 84, + 0xE007D: 84, + 0xE007E: 84, + 0xE007F: 84, + 0xE0100: 84, + 0xE0101: 84, + 0xE0102: 84, + 0xE0103: 84, + 0xE0104: 84, + 0xE0105: 84, + 0xE0106: 84, + 0xE0107: 84, + 0xE0108: 84, + 0xE0109: 84, + 0xE010A: 84, + 0xE010B: 84, + 0xE010C: 84, + 0xE010D: 84, + 0xE010E: 84, + 0xE010F: 84, + 0xE0110: 84, + 0xE0111: 84, + 0xE0112: 84, + 
0xE0113: 84, + 0xE0114: 84, + 0xE0115: 84, + 0xE0116: 84, + 0xE0117: 84, + 0xE0118: 84, + 0xE0119: 84, + 0xE011A: 84, + 0xE011B: 84, + 0xE011C: 84, + 0xE011D: 84, + 0xE011E: 84, + 0xE011F: 84, + 0xE0120: 84, + 0xE0121: 84, + 0xE0122: 84, + 0xE0123: 84, + 0xE0124: 84, + 0xE0125: 84, + 0xE0126: 84, + 0xE0127: 84, + 0xE0128: 84, + 0xE0129: 84, + 0xE012A: 84, + 0xE012B: 84, + 0xE012C: 84, + 0xE012D: 84, + 0xE012E: 84, + 0xE012F: 84, + 0xE0130: 84, + 0xE0131: 84, + 0xE0132: 84, + 0xE0133: 84, + 0xE0134: 84, + 0xE0135: 84, + 0xE0136: 84, + 0xE0137: 84, + 0xE0138: 84, + 0xE0139: 84, + 0xE013A: 84, + 0xE013B: 84, + 0xE013C: 84, + 0xE013D: 84, + 0xE013E: 84, + 0xE013F: 84, + 0xE0140: 84, + 0xE0141: 84, + 0xE0142: 84, + 0xE0143: 84, + 0xE0144: 84, + 0xE0145: 84, + 0xE0146: 84, + 0xE0147: 84, + 0xE0148: 84, + 0xE0149: 84, + 0xE014A: 84, + 0xE014B: 84, + 0xE014C: 84, + 0xE014D: 84, + 0xE014E: 84, + 0xE014F: 84, + 0xE0150: 84, + 0xE0151: 84, + 0xE0152: 84, + 0xE0153: 84, + 0xE0154: 84, + 0xE0155: 84, + 0xE0156: 84, + 0xE0157: 84, + 0xE0158: 84, + 0xE0159: 84, + 0xE015A: 84, + 0xE015B: 84, + 0xE015C: 84, + 0xE015D: 84, + 0xE015E: 84, + 0xE015F: 84, + 0xE0160: 84, + 0xE0161: 84, + 0xE0162: 84, + 0xE0163: 84, + 0xE0164: 84, + 0xE0165: 84, + 0xE0166: 84, + 0xE0167: 84, + 0xE0168: 84, + 0xE0169: 84, + 0xE016A: 84, + 0xE016B: 84, + 0xE016C: 84, + 0xE016D: 84, + 0xE016E: 84, + 0xE016F: 84, + 0xE0170: 84, + 0xE0171: 84, + 0xE0172: 84, + 0xE0173: 84, + 0xE0174: 84, + 0xE0175: 84, + 0xE0176: 84, + 0xE0177: 84, + 0xE0178: 84, + 0xE0179: 84, + 0xE017A: 84, + 0xE017B: 84, + 0xE017C: 84, + 0xE017D: 84, + 0xE017E: 84, + 0xE017F: 84, + 0xE0180: 84, + 0xE0181: 84, + 0xE0182: 84, + 0xE0183: 84, + 0xE0184: 84, + 0xE0185: 84, + 0xE0186: 84, + 0xE0187: 84, + 0xE0188: 84, + 0xE0189: 84, + 0xE018A: 84, + 0xE018B: 84, + 0xE018C: 84, + 0xE018D: 84, + 0xE018E: 84, + 0xE018F: 84, + 0xE0190: 84, + 0xE0191: 84, + 0xE0192: 84, + 0xE0193: 84, + 0xE0194: 84, + 0xE0195: 84, + 0xE0196: 84, + 0xE0197: 84, + 
0xE0198: 84, + 0xE0199: 84, + 0xE019A: 84, + 0xE019B: 84, + 0xE019C: 84, + 0xE019D: 84, + 0xE019E: 84, + 0xE019F: 84, + 0xE01A0: 84, + 0xE01A1: 84, + 0xE01A2: 84, + 0xE01A3: 84, + 0xE01A4: 84, + 0xE01A5: 84, + 0xE01A6: 84, + 0xE01A7: 84, + 0xE01A8: 84, + 0xE01A9: 84, + 0xE01AA: 84, + 0xE01AB: 84, + 0xE01AC: 84, + 0xE01AD: 84, + 0xE01AE: 84, + 0xE01AF: 84, + 0xE01B0: 84, + 0xE01B1: 84, + 0xE01B2: 84, + 0xE01B3: 84, + 0xE01B4: 84, + 0xE01B5: 84, + 0xE01B6: 84, + 0xE01B7: 84, + 0xE01B8: 84, + 0xE01B9: 84, + 0xE01BA: 84, + 0xE01BB: 84, + 0xE01BC: 84, + 0xE01BD: 84, + 0xE01BE: 84, + 0xE01BF: 84, + 0xE01C0: 84, + 0xE01C1: 84, + 0xE01C2: 84, + 0xE01C3: 84, + 0xE01C4: 84, + 0xE01C5: 84, + 0xE01C6: 84, + 0xE01C7: 84, + 0xE01C8: 84, + 0xE01C9: 84, + 0xE01CA: 84, + 0xE01CB: 84, + 0xE01CC: 84, + 0xE01CD: 84, + 0xE01CE: 84, + 0xE01CF: 84, + 0xE01D0: 84, + 0xE01D1: 84, + 0xE01D2: 84, + 0xE01D3: 84, + 0xE01D4: 84, + 0xE01D5: 84, + 0xE01D6: 84, + 0xE01D7: 84, + 0xE01D8: 84, + 0xE01D9: 84, + 0xE01DA: 84, + 0xE01DB: 84, + 0xE01DC: 84, + 0xE01DD: 84, + 0xE01DE: 84, + 0xE01DF: 84, + 0xE01E0: 84, + 0xE01E1: 84, + 0xE01E2: 84, + 0xE01E3: 84, + 0xE01E4: 84, + 0xE01E5: 84, + 0xE01E6: 84, + 0xE01E7: 84, + 0xE01E8: 84, + 0xE01E9: 84, + 0xE01EA: 84, + 0xE01EB: 84, + 0xE01EC: 84, + 0xE01ED: 84, + 0xE01EE: 84, + 0xE01EF: 84, +} +codepoint_classes = { + "PVALID": ( + 0x2D0000002E, + 0x300000003A, + 0x610000007B, + 0xDF000000F7, + 0xF800000100, + 0x10100000102, + 0x10300000104, + 0x10500000106, + 0x10700000108, + 0x1090000010A, + 0x10B0000010C, + 0x10D0000010E, + 0x10F00000110, + 0x11100000112, + 0x11300000114, + 0x11500000116, + 0x11700000118, + 0x1190000011A, + 0x11B0000011C, + 0x11D0000011E, + 0x11F00000120, + 0x12100000122, + 0x12300000124, + 0x12500000126, + 0x12700000128, + 0x1290000012A, + 0x12B0000012C, + 0x12D0000012E, + 0x12F00000130, + 0x13100000132, + 0x13500000136, + 0x13700000139, + 0x13A0000013B, + 0x13C0000013D, + 0x13E0000013F, + 0x14200000143, + 0x14400000145, + 0x14600000147, 
+ 0x14800000149, + 0x14B0000014C, + 0x14D0000014E, + 0x14F00000150, + 0x15100000152, + 0x15300000154, + 0x15500000156, + 0x15700000158, + 0x1590000015A, + 0x15B0000015C, + 0x15D0000015E, + 0x15F00000160, + 0x16100000162, + 0x16300000164, + 0x16500000166, + 0x16700000168, + 0x1690000016A, + 0x16B0000016C, + 0x16D0000016E, + 0x16F00000170, + 0x17100000172, + 0x17300000174, + 0x17500000176, + 0x17700000178, + 0x17A0000017B, + 0x17C0000017D, + 0x17E0000017F, + 0x18000000181, + 0x18300000184, + 0x18500000186, + 0x18800000189, + 0x18C0000018E, + 0x19200000193, + 0x19500000196, + 0x1990000019C, + 0x19E0000019F, + 0x1A1000001A2, + 0x1A3000001A4, + 0x1A5000001A6, + 0x1A8000001A9, + 0x1AA000001AC, + 0x1AD000001AE, + 0x1B0000001B1, + 0x1B4000001B5, + 0x1B6000001B7, + 0x1B9000001BC, + 0x1BD000001C4, + 0x1CE000001CF, + 0x1D0000001D1, + 0x1D2000001D3, + 0x1D4000001D5, + 0x1D6000001D7, + 0x1D8000001D9, + 0x1DA000001DB, + 0x1DC000001DE, + 0x1DF000001E0, + 0x1E1000001E2, + 0x1E3000001E4, + 0x1E5000001E6, + 0x1E7000001E8, + 0x1E9000001EA, + 0x1EB000001EC, + 0x1ED000001EE, + 0x1EF000001F1, + 0x1F5000001F6, + 0x1F9000001FA, + 0x1FB000001FC, + 0x1FD000001FE, + 0x1FF00000200, + 0x20100000202, + 0x20300000204, + 0x20500000206, + 0x20700000208, + 0x2090000020A, + 0x20B0000020C, + 0x20D0000020E, + 0x20F00000210, + 0x21100000212, + 0x21300000214, + 0x21500000216, + 0x21700000218, + 0x2190000021A, + 0x21B0000021C, + 0x21D0000021E, + 0x21F00000220, + 0x22100000222, + 0x22300000224, + 0x22500000226, + 0x22700000228, + 0x2290000022A, + 0x22B0000022C, + 0x22D0000022E, + 0x22F00000230, + 0x23100000232, + 0x2330000023A, + 0x23C0000023D, + 0x23F00000241, + 0x24200000243, + 0x24700000248, + 0x2490000024A, + 0x24B0000024C, + 0x24D0000024E, + 0x24F000002B0, + 0x2B9000002C2, + 0x2C6000002D2, + 0x2EC000002ED, + 0x2EE000002EF, + 0x30000000340, + 0x34200000343, + 0x3460000034F, + 0x35000000370, + 0x37100000372, + 0x37300000374, + 0x37700000378, + 0x37B0000037E, + 0x39000000391, + 0x3AC000003CF, + 
0x3D7000003D8, + 0x3D9000003DA, + 0x3DB000003DC, + 0x3DD000003DE, + 0x3DF000003E0, + 0x3E1000003E2, + 0x3E3000003E4, + 0x3E5000003E6, + 0x3E7000003E8, + 0x3E9000003EA, + 0x3EB000003EC, + 0x3ED000003EE, + 0x3EF000003F0, + 0x3F3000003F4, + 0x3F8000003F9, + 0x3FB000003FD, + 0x43000000460, + 0x46100000462, + 0x46300000464, + 0x46500000466, + 0x46700000468, + 0x4690000046A, + 0x46B0000046C, + 0x46D0000046E, + 0x46F00000470, + 0x47100000472, + 0x47300000474, + 0x47500000476, + 0x47700000478, + 0x4790000047A, + 0x47B0000047C, + 0x47D0000047E, + 0x47F00000480, + 0x48100000482, + 0x48300000488, + 0x48B0000048C, + 0x48D0000048E, + 0x48F00000490, + 0x49100000492, + 0x49300000494, + 0x49500000496, + 0x49700000498, + 0x4990000049A, + 0x49B0000049C, + 0x49D0000049E, + 0x49F000004A0, + 0x4A1000004A2, + 0x4A3000004A4, + 0x4A5000004A6, + 0x4A7000004A8, + 0x4A9000004AA, + 0x4AB000004AC, + 0x4AD000004AE, + 0x4AF000004B0, + 0x4B1000004B2, + 0x4B3000004B4, + 0x4B5000004B6, + 0x4B7000004B8, + 0x4B9000004BA, + 0x4BB000004BC, + 0x4BD000004BE, + 0x4BF000004C0, + 0x4C2000004C3, + 0x4C4000004C5, + 0x4C6000004C7, + 0x4C8000004C9, + 0x4CA000004CB, + 0x4CC000004CD, + 0x4CE000004D0, + 0x4D1000004D2, + 0x4D3000004D4, + 0x4D5000004D6, + 0x4D7000004D8, + 0x4D9000004DA, + 0x4DB000004DC, + 0x4DD000004DE, + 0x4DF000004E0, + 0x4E1000004E2, + 0x4E3000004E4, + 0x4E5000004E6, + 0x4E7000004E8, + 0x4E9000004EA, + 0x4EB000004EC, + 0x4ED000004EE, + 0x4EF000004F0, + 0x4F1000004F2, + 0x4F3000004F4, + 0x4F5000004F6, + 0x4F7000004F8, + 0x4F9000004FA, + 0x4FB000004FC, + 0x4FD000004FE, + 0x4FF00000500, + 0x50100000502, + 0x50300000504, + 0x50500000506, + 0x50700000508, + 0x5090000050A, + 0x50B0000050C, + 0x50D0000050E, + 0x50F00000510, + 0x51100000512, + 0x51300000514, + 0x51500000516, + 0x51700000518, + 0x5190000051A, + 0x51B0000051C, + 0x51D0000051E, + 0x51F00000520, + 0x52100000522, + 0x52300000524, + 0x52500000526, + 0x52700000528, + 0x5290000052A, + 0x52B0000052C, + 0x52D0000052E, + 0x52F00000530, + 
0x5590000055A, + 0x56000000587, + 0x58800000589, + 0x591000005BE, + 0x5BF000005C0, + 0x5C1000005C3, + 0x5C4000005C6, + 0x5C7000005C8, + 0x5D0000005EB, + 0x5EF000005F3, + 0x6100000061B, + 0x62000000640, + 0x64100000660, + 0x66E00000675, + 0x679000006D4, + 0x6D5000006DD, + 0x6DF000006E9, + 0x6EA000006F0, + 0x6FA00000700, + 0x7100000074B, + 0x74D000007B2, + 0x7C0000007F6, + 0x7FD000007FE, + 0x8000000082E, + 0x8400000085C, + 0x8600000086B, + 0x87000000888, + 0x8890000088F, + 0x897000008E2, + 0x8E300000958, + 0x96000000964, + 0x96600000970, + 0x97100000984, + 0x9850000098D, + 0x98F00000991, + 0x993000009A9, + 0x9AA000009B1, + 0x9B2000009B3, + 0x9B6000009BA, + 0x9BC000009C5, + 0x9C7000009C9, + 0x9CB000009CF, + 0x9D7000009D8, + 0x9E0000009E4, + 0x9E6000009F2, + 0x9FC000009FD, + 0x9FE000009FF, + 0xA0100000A04, + 0xA0500000A0B, + 0xA0F00000A11, + 0xA1300000A29, + 0xA2A00000A31, + 0xA3200000A33, + 0xA3500000A36, + 0xA3800000A3A, + 0xA3C00000A3D, + 0xA3E00000A43, + 0xA4700000A49, + 0xA4B00000A4E, + 0xA5100000A52, + 0xA5C00000A5D, + 0xA6600000A76, + 0xA8100000A84, + 0xA8500000A8E, + 0xA8F00000A92, + 0xA9300000AA9, + 0xAAA00000AB1, + 0xAB200000AB4, + 0xAB500000ABA, + 0xABC00000AC6, + 0xAC700000ACA, + 0xACB00000ACE, + 0xAD000000AD1, + 0xAE000000AE4, + 0xAE600000AF0, + 0xAF900000B00, + 0xB0100000B04, + 0xB0500000B0D, + 0xB0F00000B11, + 0xB1300000B29, + 0xB2A00000B31, + 0xB3200000B34, + 0xB3500000B3A, + 0xB3C00000B45, + 0xB4700000B49, + 0xB4B00000B4E, + 0xB5500000B58, + 0xB5F00000B64, + 0xB6600000B70, + 0xB7100000B72, + 0xB8200000B84, + 0xB8500000B8B, + 0xB8E00000B91, + 0xB9200000B96, + 0xB9900000B9B, + 0xB9C00000B9D, + 0xB9E00000BA0, + 0xBA300000BA5, + 0xBA800000BAB, + 0xBAE00000BBA, + 0xBBE00000BC3, + 0xBC600000BC9, + 0xBCA00000BCE, + 0xBD000000BD1, + 0xBD700000BD8, + 0xBE600000BF0, + 0xC0000000C0D, + 0xC0E00000C11, + 0xC1200000C29, + 0xC2A00000C3A, + 0xC3C00000C45, + 0xC4600000C49, + 0xC4A00000C4E, + 0xC5500000C57, + 0xC5800000C5B, + 0xC5D00000C5E, + 0xC6000000C64, + 
0xC6600000C70, + 0xC8000000C84, + 0xC8500000C8D, + 0xC8E00000C91, + 0xC9200000CA9, + 0xCAA00000CB4, + 0xCB500000CBA, + 0xCBC00000CC5, + 0xCC600000CC9, + 0xCCA00000CCE, + 0xCD500000CD7, + 0xCDD00000CDF, + 0xCE000000CE4, + 0xCE600000CF0, + 0xCF100000CF4, + 0xD0000000D0D, + 0xD0E00000D11, + 0xD1200000D45, + 0xD4600000D49, + 0xD4A00000D4F, + 0xD5400000D58, + 0xD5F00000D64, + 0xD6600000D70, + 0xD7A00000D80, + 0xD8100000D84, + 0xD8500000D97, + 0xD9A00000DB2, + 0xDB300000DBC, + 0xDBD00000DBE, + 0xDC000000DC7, + 0xDCA00000DCB, + 0xDCF00000DD5, + 0xDD600000DD7, + 0xDD800000DE0, + 0xDE600000DF0, + 0xDF200000DF4, + 0xE0100000E33, + 0xE3400000E3B, + 0xE4000000E4F, + 0xE5000000E5A, + 0xE8100000E83, + 0xE8400000E85, + 0xE8600000E8B, + 0xE8C00000EA4, + 0xEA500000EA6, + 0xEA700000EB3, + 0xEB400000EBE, + 0xEC000000EC5, + 0xEC600000EC7, + 0xEC800000ECF, + 0xED000000EDA, + 0xEDE00000EE0, + 0xF0000000F01, + 0xF0B00000F0C, + 0xF1800000F1A, + 0xF2000000F2A, + 0xF3500000F36, + 0xF3700000F38, + 0xF3900000F3A, + 0xF3E00000F43, + 0xF4400000F48, + 0xF4900000F4D, + 0xF4E00000F52, + 0xF5300000F57, + 0xF5800000F5C, + 0xF5D00000F69, + 0xF6A00000F6D, + 0xF7100000F73, + 0xF7400000F75, + 0xF7A00000F81, + 0xF8200000F85, + 0xF8600000F93, + 0xF9400000F98, + 0xF9900000F9D, + 0xF9E00000FA2, + 0xFA300000FA7, + 0xFA800000FAC, + 0xFAD00000FB9, + 0xFBA00000FBD, + 0xFC600000FC7, + 0x10000000104A, + 0x10500000109E, + 0x10D0000010FB, + 0x10FD00001100, + 0x120000001249, + 0x124A0000124E, + 0x125000001257, + 0x125800001259, + 0x125A0000125E, + 0x126000001289, + 0x128A0000128E, + 0x1290000012B1, + 0x12B2000012B6, + 0x12B8000012BF, + 0x12C0000012C1, + 0x12C2000012C6, + 0x12C8000012D7, + 0x12D800001311, + 0x131200001316, + 0x13180000135B, + 0x135D00001360, + 0x138000001390, + 0x13A0000013F6, + 0x14010000166D, + 0x166F00001680, + 0x16810000169B, + 0x16A0000016EB, + 0x16F1000016F9, + 0x170000001716, + 0x171F00001735, + 0x174000001754, + 0x17600000176D, + 0x176E00001771, + 0x177200001774, + 0x1780000017B4, + 
0x17B6000017D4, + 0x17D7000017D8, + 0x17DC000017DE, + 0x17E0000017EA, + 0x18100000181A, + 0x182000001879, + 0x1880000018AB, + 0x18B0000018F6, + 0x19000000191F, + 0x19200000192C, + 0x19300000193C, + 0x19460000196E, + 0x197000001975, + 0x1980000019AC, + 0x19B0000019CA, + 0x19D0000019DA, + 0x1A0000001A1C, + 0x1A2000001A5F, + 0x1A6000001A7D, + 0x1A7F00001A8A, + 0x1A9000001A9A, + 0x1AA700001AA8, + 0x1AB000001ABE, + 0x1ABF00001ACF, + 0x1B0000001B4D, + 0x1B5000001B5A, + 0x1B6B00001B74, + 0x1B8000001BF4, + 0x1C0000001C38, + 0x1C4000001C4A, + 0x1C4D00001C7E, + 0x1C8A00001C8B, + 0x1CD000001CD3, + 0x1CD400001CFB, + 0x1D0000001D2C, + 0x1D2F00001D30, + 0x1D3B00001D3C, + 0x1D4E00001D4F, + 0x1D6B00001D78, + 0x1D7900001D9B, + 0x1DC000001E00, + 0x1E0100001E02, + 0x1E0300001E04, + 0x1E0500001E06, + 0x1E0700001E08, + 0x1E0900001E0A, + 0x1E0B00001E0C, + 0x1E0D00001E0E, + 0x1E0F00001E10, + 0x1E1100001E12, + 0x1E1300001E14, + 0x1E1500001E16, + 0x1E1700001E18, + 0x1E1900001E1A, + 0x1E1B00001E1C, + 0x1E1D00001E1E, + 0x1E1F00001E20, + 0x1E2100001E22, + 0x1E2300001E24, + 0x1E2500001E26, + 0x1E2700001E28, + 0x1E2900001E2A, + 0x1E2B00001E2C, + 0x1E2D00001E2E, + 0x1E2F00001E30, + 0x1E3100001E32, + 0x1E3300001E34, + 0x1E3500001E36, + 0x1E3700001E38, + 0x1E3900001E3A, + 0x1E3B00001E3C, + 0x1E3D00001E3E, + 0x1E3F00001E40, + 0x1E4100001E42, + 0x1E4300001E44, + 0x1E4500001E46, + 0x1E4700001E48, + 0x1E4900001E4A, + 0x1E4B00001E4C, + 0x1E4D00001E4E, + 0x1E4F00001E50, + 0x1E5100001E52, + 0x1E5300001E54, + 0x1E5500001E56, + 0x1E5700001E58, + 0x1E5900001E5A, + 0x1E5B00001E5C, + 0x1E5D00001E5E, + 0x1E5F00001E60, + 0x1E6100001E62, + 0x1E6300001E64, + 0x1E6500001E66, + 0x1E6700001E68, + 0x1E6900001E6A, + 0x1E6B00001E6C, + 0x1E6D00001E6E, + 0x1E6F00001E70, + 0x1E7100001E72, + 0x1E7300001E74, + 0x1E7500001E76, + 0x1E7700001E78, + 0x1E7900001E7A, + 0x1E7B00001E7C, + 0x1E7D00001E7E, + 0x1E7F00001E80, + 0x1E8100001E82, + 0x1E8300001E84, + 0x1E8500001E86, + 0x1E8700001E88, + 0x1E8900001E8A, + 0x1E8B00001E8C, + 
0x1E8D00001E8E, + 0x1E8F00001E90, + 0x1E9100001E92, + 0x1E9300001E94, + 0x1E9500001E9A, + 0x1E9C00001E9E, + 0x1E9F00001EA0, + 0x1EA100001EA2, + 0x1EA300001EA4, + 0x1EA500001EA6, + 0x1EA700001EA8, + 0x1EA900001EAA, + 0x1EAB00001EAC, + 0x1EAD00001EAE, + 0x1EAF00001EB0, + 0x1EB100001EB2, + 0x1EB300001EB4, + 0x1EB500001EB6, + 0x1EB700001EB8, + 0x1EB900001EBA, + 0x1EBB00001EBC, + 0x1EBD00001EBE, + 0x1EBF00001EC0, + 0x1EC100001EC2, + 0x1EC300001EC4, + 0x1EC500001EC6, + 0x1EC700001EC8, + 0x1EC900001ECA, + 0x1ECB00001ECC, + 0x1ECD00001ECE, + 0x1ECF00001ED0, + 0x1ED100001ED2, + 0x1ED300001ED4, + 0x1ED500001ED6, + 0x1ED700001ED8, + 0x1ED900001EDA, + 0x1EDB00001EDC, + 0x1EDD00001EDE, + 0x1EDF00001EE0, + 0x1EE100001EE2, + 0x1EE300001EE4, + 0x1EE500001EE6, + 0x1EE700001EE8, + 0x1EE900001EEA, + 0x1EEB00001EEC, + 0x1EED00001EEE, + 0x1EEF00001EF0, + 0x1EF100001EF2, + 0x1EF300001EF4, + 0x1EF500001EF6, + 0x1EF700001EF8, + 0x1EF900001EFA, + 0x1EFB00001EFC, + 0x1EFD00001EFE, + 0x1EFF00001F08, + 0x1F1000001F16, + 0x1F2000001F28, + 0x1F3000001F38, + 0x1F4000001F46, + 0x1F5000001F58, + 0x1F6000001F68, + 0x1F7000001F71, + 0x1F7200001F73, + 0x1F7400001F75, + 0x1F7600001F77, + 0x1F7800001F79, + 0x1F7A00001F7B, + 0x1F7C00001F7D, + 0x1FB000001FB2, + 0x1FB600001FB7, + 0x1FC600001FC7, + 0x1FD000001FD3, + 0x1FD600001FD8, + 0x1FE000001FE3, + 0x1FE400001FE8, + 0x1FF600001FF7, + 0x214E0000214F, + 0x218400002185, + 0x2C3000002C60, + 0x2C6100002C62, + 0x2C6500002C67, + 0x2C6800002C69, + 0x2C6A00002C6B, + 0x2C6C00002C6D, + 0x2C7100002C72, + 0x2C7300002C75, + 0x2C7600002C7C, + 0x2C8100002C82, + 0x2C8300002C84, + 0x2C8500002C86, + 0x2C8700002C88, + 0x2C8900002C8A, + 0x2C8B00002C8C, + 0x2C8D00002C8E, + 0x2C8F00002C90, + 0x2C9100002C92, + 0x2C9300002C94, + 0x2C9500002C96, + 0x2C9700002C98, + 0x2C9900002C9A, + 0x2C9B00002C9C, + 0x2C9D00002C9E, + 0x2C9F00002CA0, + 0x2CA100002CA2, + 0x2CA300002CA4, + 0x2CA500002CA6, + 0x2CA700002CA8, + 0x2CA900002CAA, + 0x2CAB00002CAC, + 0x2CAD00002CAE, + 0x2CAF00002CB0, + 
0x2CB100002CB2, + 0x2CB300002CB4, + 0x2CB500002CB6, + 0x2CB700002CB8, + 0x2CB900002CBA, + 0x2CBB00002CBC, + 0x2CBD00002CBE, + 0x2CBF00002CC0, + 0x2CC100002CC2, + 0x2CC300002CC4, + 0x2CC500002CC6, + 0x2CC700002CC8, + 0x2CC900002CCA, + 0x2CCB00002CCC, + 0x2CCD00002CCE, + 0x2CCF00002CD0, + 0x2CD100002CD2, + 0x2CD300002CD4, + 0x2CD500002CD6, + 0x2CD700002CD8, + 0x2CD900002CDA, + 0x2CDB00002CDC, + 0x2CDD00002CDE, + 0x2CDF00002CE0, + 0x2CE100002CE2, + 0x2CE300002CE5, + 0x2CEC00002CED, + 0x2CEE00002CF2, + 0x2CF300002CF4, + 0x2D0000002D26, + 0x2D2700002D28, + 0x2D2D00002D2E, + 0x2D3000002D68, + 0x2D7F00002D97, + 0x2DA000002DA7, + 0x2DA800002DAF, + 0x2DB000002DB7, + 0x2DB800002DBF, + 0x2DC000002DC7, + 0x2DC800002DCF, + 0x2DD000002DD7, + 0x2DD800002DDF, + 0x2DE000002E00, + 0x2E2F00002E30, + 0x300500003008, + 0x302A0000302E, + 0x303C0000303D, + 0x304100003097, + 0x30990000309B, + 0x309D0000309F, + 0x30A1000030FB, + 0x30FC000030FF, + 0x310500003130, + 0x31A0000031C0, + 0x31F000003200, + 0x340000004DC0, + 0x4E000000A48D, + 0xA4D00000A4FE, + 0xA5000000A60D, + 0xA6100000A62C, + 0xA6410000A642, + 0xA6430000A644, + 0xA6450000A646, + 0xA6470000A648, + 0xA6490000A64A, + 0xA64B0000A64C, + 0xA64D0000A64E, + 0xA64F0000A650, + 0xA6510000A652, + 0xA6530000A654, + 0xA6550000A656, + 0xA6570000A658, + 0xA6590000A65A, + 0xA65B0000A65C, + 0xA65D0000A65E, + 0xA65F0000A660, + 0xA6610000A662, + 0xA6630000A664, + 0xA6650000A666, + 0xA6670000A668, + 0xA6690000A66A, + 0xA66B0000A66C, + 0xA66D0000A670, + 0xA6740000A67E, + 0xA67F0000A680, + 0xA6810000A682, + 0xA6830000A684, + 0xA6850000A686, + 0xA6870000A688, + 0xA6890000A68A, + 0xA68B0000A68C, + 0xA68D0000A68E, + 0xA68F0000A690, + 0xA6910000A692, + 0xA6930000A694, + 0xA6950000A696, + 0xA6970000A698, + 0xA6990000A69A, + 0xA69B0000A69C, + 0xA69E0000A6E6, + 0xA6F00000A6F2, + 0xA7170000A720, + 0xA7230000A724, + 0xA7250000A726, + 0xA7270000A728, + 0xA7290000A72A, + 0xA72B0000A72C, + 0xA72D0000A72E, + 0xA72F0000A732, + 0xA7330000A734, + 0xA7350000A736, + 
0xA7370000A738, + 0xA7390000A73A, + 0xA73B0000A73C, + 0xA73D0000A73E, + 0xA73F0000A740, + 0xA7410000A742, + 0xA7430000A744, + 0xA7450000A746, + 0xA7470000A748, + 0xA7490000A74A, + 0xA74B0000A74C, + 0xA74D0000A74E, + 0xA74F0000A750, + 0xA7510000A752, + 0xA7530000A754, + 0xA7550000A756, + 0xA7570000A758, + 0xA7590000A75A, + 0xA75B0000A75C, + 0xA75D0000A75E, + 0xA75F0000A760, + 0xA7610000A762, + 0xA7630000A764, + 0xA7650000A766, + 0xA7670000A768, + 0xA7690000A76A, + 0xA76B0000A76C, + 0xA76D0000A76E, + 0xA76F0000A770, + 0xA7710000A779, + 0xA77A0000A77B, + 0xA77C0000A77D, + 0xA77F0000A780, + 0xA7810000A782, + 0xA7830000A784, + 0xA7850000A786, + 0xA7870000A789, + 0xA78C0000A78D, + 0xA78E0000A790, + 0xA7910000A792, + 0xA7930000A796, + 0xA7970000A798, + 0xA7990000A79A, + 0xA79B0000A79C, + 0xA79D0000A79E, + 0xA79F0000A7A0, + 0xA7A10000A7A2, + 0xA7A30000A7A4, + 0xA7A50000A7A6, + 0xA7A70000A7A8, + 0xA7A90000A7AA, + 0xA7AF0000A7B0, + 0xA7B50000A7B6, + 0xA7B70000A7B8, + 0xA7B90000A7BA, + 0xA7BB0000A7BC, + 0xA7BD0000A7BE, + 0xA7BF0000A7C0, + 0xA7C10000A7C2, + 0xA7C30000A7C4, + 0xA7C80000A7C9, + 0xA7CA0000A7CB, + 0xA7CD0000A7CE, + 0xA7D10000A7D2, + 0xA7D30000A7D4, + 0xA7D50000A7D6, + 0xA7D70000A7D8, + 0xA7D90000A7DA, + 0xA7DB0000A7DC, + 0xA7F60000A7F8, + 0xA7FA0000A828, + 0xA82C0000A82D, + 0xA8400000A874, + 0xA8800000A8C6, + 0xA8D00000A8DA, + 0xA8E00000A8F8, + 0xA8FB0000A8FC, + 0xA8FD0000A92E, + 0xA9300000A954, + 0xA9800000A9C1, + 0xA9CF0000A9DA, + 0xA9E00000A9FF, + 0xAA000000AA37, + 0xAA400000AA4E, + 0xAA500000AA5A, + 0xAA600000AA77, + 0xAA7A0000AAC3, + 0xAADB0000AADE, + 0xAAE00000AAF0, + 0xAAF20000AAF7, + 0xAB010000AB07, + 0xAB090000AB0F, + 0xAB110000AB17, + 0xAB200000AB27, + 0xAB280000AB2F, + 0xAB300000AB5B, + 0xAB600000AB69, + 0xABC00000ABEB, + 0xABEC0000ABEE, + 0xABF00000ABFA, + 0xAC000000D7A4, + 0xFA0E0000FA10, + 0xFA110000FA12, + 0xFA130000FA15, + 0xFA1F0000FA20, + 0xFA210000FA22, + 0xFA230000FA25, + 0xFA270000FA2A, + 0xFB1E0000FB1F, + 0xFE200000FE30, + 0xFE730000FE74, + 
0x100000001000C, + 0x1000D00010027, + 0x100280001003B, + 0x1003C0001003E, + 0x1003F0001004E, + 0x100500001005E, + 0x10080000100FB, + 0x101FD000101FE, + 0x102800001029D, + 0x102A0000102D1, + 0x102E0000102E1, + 0x1030000010320, + 0x1032D00010341, + 0x103420001034A, + 0x103500001037B, + 0x103800001039E, + 0x103A0000103C4, + 0x103C8000103D0, + 0x104280001049E, + 0x104A0000104AA, + 0x104D8000104FC, + 0x1050000010528, + 0x1053000010564, + 0x10597000105A2, + 0x105A3000105B2, + 0x105B3000105BA, + 0x105BB000105BD, + 0x105C0000105F4, + 0x1060000010737, + 0x1074000010756, + 0x1076000010768, + 0x1078000010781, + 0x1080000010806, + 0x1080800010809, + 0x1080A00010836, + 0x1083700010839, + 0x1083C0001083D, + 0x1083F00010856, + 0x1086000010877, + 0x108800001089F, + 0x108E0000108F3, + 0x108F4000108F6, + 0x1090000010916, + 0x109200001093A, + 0x10980000109B8, + 0x109BE000109C0, + 0x10A0000010A04, + 0x10A0500010A07, + 0x10A0C00010A14, + 0x10A1500010A18, + 0x10A1900010A36, + 0x10A3800010A3B, + 0x10A3F00010A40, + 0x10A6000010A7D, + 0x10A8000010A9D, + 0x10AC000010AC8, + 0x10AC900010AE7, + 0x10B0000010B36, + 0x10B4000010B56, + 0x10B6000010B73, + 0x10B8000010B92, + 0x10C0000010C49, + 0x10CC000010CF3, + 0x10D0000010D28, + 0x10D3000010D3A, + 0x10D4000010D50, + 0x10D6900010D6E, + 0x10D6F00010D86, + 0x10E8000010EAA, + 0x10EAB00010EAD, + 0x10EB000010EB2, + 0x10EC200010EC5, + 0x10EFC00010F1D, + 0x10F2700010F28, + 0x10F3000010F51, + 0x10F7000010F86, + 0x10FB000010FC5, + 0x10FE000010FF7, + 0x1100000011047, + 0x1106600011076, + 0x1107F000110BB, + 0x110C2000110C3, + 0x110D0000110E9, + 0x110F0000110FA, + 0x1110000011135, + 0x1113600011140, + 0x1114400011148, + 0x1115000011174, + 0x1117600011177, + 0x11180000111C5, + 0x111C9000111CD, + 0x111CE000111DB, + 0x111DC000111DD, + 0x1120000011212, + 0x1121300011238, + 0x1123E00011242, + 0x1128000011287, + 0x1128800011289, + 0x1128A0001128E, + 0x1128F0001129E, + 0x1129F000112A9, + 0x112B0000112EB, + 0x112F0000112FA, + 0x1130000011304, + 0x113050001130D, + 
0x1130F00011311, + 0x1131300011329, + 0x1132A00011331, + 0x1133200011334, + 0x113350001133A, + 0x1133B00011345, + 0x1134700011349, + 0x1134B0001134E, + 0x1135000011351, + 0x1135700011358, + 0x1135D00011364, + 0x113660001136D, + 0x1137000011375, + 0x113800001138A, + 0x1138B0001138C, + 0x1138E0001138F, + 0x11390000113B6, + 0x113B7000113C1, + 0x113C2000113C3, + 0x113C5000113C6, + 0x113C7000113CB, + 0x113CC000113D4, + 0x113E1000113E3, + 0x114000001144B, + 0x114500001145A, + 0x1145E00011462, + 0x11480000114C6, + 0x114C7000114C8, + 0x114D0000114DA, + 0x11580000115B6, + 0x115B8000115C1, + 0x115D8000115DE, + 0x1160000011641, + 0x1164400011645, + 0x116500001165A, + 0x11680000116B9, + 0x116C0000116CA, + 0x116D0000116E4, + 0x117000001171B, + 0x1171D0001172C, + 0x117300001173A, + 0x1174000011747, + 0x118000001183B, + 0x118C0000118EA, + 0x118FF00011907, + 0x119090001190A, + 0x1190C00011914, + 0x1191500011917, + 0x1191800011936, + 0x1193700011939, + 0x1193B00011944, + 0x119500001195A, + 0x119A0000119A8, + 0x119AA000119D8, + 0x119DA000119E2, + 0x119E3000119E5, + 0x11A0000011A3F, + 0x11A4700011A48, + 0x11A5000011A9A, + 0x11A9D00011A9E, + 0x11AB000011AF9, + 0x11BC000011BE1, + 0x11BF000011BFA, + 0x11C0000011C09, + 0x11C0A00011C37, + 0x11C3800011C41, + 0x11C5000011C5A, + 0x11C7200011C90, + 0x11C9200011CA8, + 0x11CA900011CB7, + 0x11D0000011D07, + 0x11D0800011D0A, + 0x11D0B00011D37, + 0x11D3A00011D3B, + 0x11D3C00011D3E, + 0x11D3F00011D48, + 0x11D5000011D5A, + 0x11D6000011D66, + 0x11D6700011D69, + 0x11D6A00011D8F, + 0x11D9000011D92, + 0x11D9300011D99, + 0x11DA000011DAA, + 0x11EE000011EF7, + 0x11F0000011F11, + 0x11F1200011F3B, + 0x11F3E00011F43, + 0x11F5000011F5B, + 0x11FB000011FB1, + 0x120000001239A, + 0x1248000012544, + 0x12F9000012FF1, + 0x1300000013430, + 0x1344000013456, + 0x13460000143FB, + 0x1440000014647, + 0x161000001613A, + 0x1680000016A39, + 0x16A4000016A5F, + 0x16A6000016A6A, + 0x16A7000016ABF, + 0x16AC000016ACA, + 0x16AD000016AEE, + 0x16AF000016AF5, + 0x16B0000016B37, + 
0x16B4000016B44, + 0x16B5000016B5A, + 0x16B6300016B78, + 0x16B7D00016B90, + 0x16D4000016D6D, + 0x16D7000016D7A, + 0x16E6000016E80, + 0x16F0000016F4B, + 0x16F4F00016F88, + 0x16F8F00016FA0, + 0x16FE000016FE2, + 0x16FE300016FE5, + 0x16FF000016FF2, + 0x17000000187F8, + 0x1880000018CD6, + 0x18CFF00018D09, + 0x1AFF00001AFF4, + 0x1AFF50001AFFC, + 0x1AFFD0001AFFF, + 0x1B0000001B123, + 0x1B1320001B133, + 0x1B1500001B153, + 0x1B1550001B156, + 0x1B1640001B168, + 0x1B1700001B2FC, + 0x1BC000001BC6B, + 0x1BC700001BC7D, + 0x1BC800001BC89, + 0x1BC900001BC9A, + 0x1BC9D0001BC9F, + 0x1CCF00001CCFA, + 0x1CF000001CF2E, + 0x1CF300001CF47, + 0x1DA000001DA37, + 0x1DA3B0001DA6D, + 0x1DA750001DA76, + 0x1DA840001DA85, + 0x1DA9B0001DAA0, + 0x1DAA10001DAB0, + 0x1DF000001DF1F, + 0x1DF250001DF2B, + 0x1E0000001E007, + 0x1E0080001E019, + 0x1E01B0001E022, + 0x1E0230001E025, + 0x1E0260001E02B, + 0x1E08F0001E090, + 0x1E1000001E12D, + 0x1E1300001E13E, + 0x1E1400001E14A, + 0x1E14E0001E14F, + 0x1E2900001E2AF, + 0x1E2C00001E2FA, + 0x1E4D00001E4FA, + 0x1E5D00001E5FB, + 0x1E7E00001E7E7, + 0x1E7E80001E7EC, + 0x1E7ED0001E7EF, + 0x1E7F00001E7FF, + 0x1E8000001E8C5, + 0x1E8D00001E8D7, + 0x1E9220001E94C, + 0x1E9500001E95A, + 0x200000002A6E0, + 0x2A7000002B73A, + 0x2B7400002B81E, + 0x2B8200002CEA2, + 0x2CEB00002EBE1, + 0x2EBF00002EE5E, + 0x300000003134B, + 0x31350000323B0, + ), + "CONTEXTJ": (0x200C0000200E,), + "CONTEXTO": ( + 0xB7000000B8, + 0x37500000376, + 0x5F3000005F5, + 0x6600000066A, + 0x6F0000006FA, + 0x30FB000030FC, + ), +} diff --git a/idna/intranges.py b/idna/intranges.py new file mode 100644 index 0000000000000000000000000000000000000000..7bfaa8d80d7dc471d572db0f949460901126e8bd --- /dev/null +++ b/idna/intranges.py @@ -0,0 +1,57 @@ +""" +Given a list of integers, made up of (hopefully) a small number of long runs +of consecutive integers, compute a representation of the form +((start1, end1), (start2, end2) ...). Then answer the question "was x present +in the original list?" in time O(log(# runs)). 
+""" + +import bisect +from typing import List, Tuple + + +def intranges_from_list(list_: List[int]) -> Tuple[int, ...]: + """Represent a list of integers as a sequence of ranges: + ((start_0, end_0), (start_1, end_1), ...), such that the original + integers are exactly those x such that start_i <= x < end_i for some i. + + Ranges are encoded as single integers (start << 32 | end), not as tuples. + """ + + sorted_list = sorted(list_) + ranges = [] + last_write = -1 + for i in range(len(sorted_list)): + if i + 1 < len(sorted_list): + if sorted_list[i] == sorted_list[i + 1] - 1: + continue + current_range = sorted_list[last_write + 1 : i + 1] + ranges.append(_encode_range(current_range[0], current_range[-1] + 1)) + last_write = i + + return tuple(ranges) + + +def _encode_range(start: int, end: int) -> int: + return (start << 32) | end + + +def _decode_range(r: int) -> Tuple[int, int]: + return (r >> 32), (r & ((1 << 32) - 1)) + + +def intranges_contain(int_: int, ranges: Tuple[int, ...]) -> bool: + """Determine if `int_` falls into one of the ranges in `ranges`.""" + tuple_ = _encode_range(int_, 0) + pos = bisect.bisect_left(ranges, tuple_) + # we could be immediately ahead of a tuple (start, end) + # with start < int_ <= end + if pos > 0: + left, right = _decode_range(ranges[pos - 1]) + if left <= int_ < right: + return True + # or we could be immediately behind a tuple (int_, end) + if pos < len(ranges): + left, _ = _decode_range(ranges[pos]) + if left == int_: + return True + return False diff --git a/idna/package_data.py b/idna/package_data.py new file mode 100644 index 0000000000000000000000000000000000000000..7272c8d92364886c51fefd22837ed5ceab145606 --- /dev/null +++ b/idna/package_data.py @@ -0,0 +1 @@ +__version__ = "3.11" diff --git a/idna/py.typed b/idna/py.typed new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/idna/uts46data.py b/idna/uts46data.py new file mode 100644 index 
0000000000000000000000000000000000000000..4610b71dad9196838d4e1e04e76d5e7c9baf8cd9 --- /dev/null +++ b/idna/uts46data.py @@ -0,0 +1,8841 @@ +# This file is automatically generated by tools/idna-data +# vim: set fileencoding=utf-8 : + +from typing import List, Tuple, Union + +"""IDNA Mapping Table from UTS46.""" + + +__version__ = "16.0.0" + + +def _seg_0() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ + (0x0, "V"), + (0x1, "V"), + (0x2, "V"), + (0x3, "V"), + (0x4, "V"), + (0x5, "V"), + (0x6, "V"), + (0x7, "V"), + (0x8, "V"), + (0x9, "V"), + (0xA, "V"), + (0xB, "V"), + (0xC, "V"), + (0xD, "V"), + (0xE, "V"), + (0xF, "V"), + (0x10, "V"), + (0x11, "V"), + (0x12, "V"), + (0x13, "V"), + (0x14, "V"), + (0x15, "V"), + (0x16, "V"), + (0x17, "V"), + (0x18, "V"), + (0x19, "V"), + (0x1A, "V"), + (0x1B, "V"), + (0x1C, "V"), + (0x1D, "V"), + (0x1E, "V"), + (0x1F, "V"), + (0x20, "V"), + (0x21, "V"), + (0x22, "V"), + (0x23, "V"), + (0x24, "V"), + (0x25, "V"), + (0x26, "V"), + (0x27, "V"), + (0x28, "V"), + (0x29, "V"), + (0x2A, "V"), + (0x2B, "V"), + (0x2C, "V"), + (0x2D, "V"), + (0x2E, "V"), + (0x2F, "V"), + (0x30, "V"), + (0x31, "V"), + (0x32, "V"), + (0x33, "V"), + (0x34, "V"), + (0x35, "V"), + (0x36, "V"), + (0x37, "V"), + (0x38, "V"), + (0x39, "V"), + (0x3A, "V"), + (0x3B, "V"), + (0x3C, "V"), + (0x3D, "V"), + (0x3E, "V"), + (0x3F, "V"), + (0x40, "V"), + (0x41, "M", "a"), + (0x42, "M", "b"), + (0x43, "M", "c"), + (0x44, "M", "d"), + (0x45, "M", "e"), + (0x46, "M", "f"), + (0x47, "M", "g"), + (0x48, "M", "h"), + (0x49, "M", "i"), + (0x4A, "M", "j"), + (0x4B, "M", "k"), + (0x4C, "M", "l"), + (0x4D, "M", "m"), + (0x4E, "M", "n"), + (0x4F, "M", "o"), + (0x50, "M", "p"), + (0x51, "M", "q"), + (0x52, "M", "r"), + (0x53, "M", "s"), + (0x54, "M", "t"), + (0x55, "M", "u"), + (0x56, "M", "v"), + (0x57, "M", "w"), + (0x58, "M", "x"), + (0x59, "M", "y"), + (0x5A, "M", "z"), + (0x5B, "V"), + (0x5C, "V"), + (0x5D, "V"), + (0x5E, "V"), + (0x5F, "V"), + (0x60, "V"), + 
(0x61, "V"), + (0x62, "V"), + (0x63, "V"), + ] + + +def _seg_1() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ + (0x64, "V"), + (0x65, "V"), + (0x66, "V"), + (0x67, "V"), + (0x68, "V"), + (0x69, "V"), + (0x6A, "V"), + (0x6B, "V"), + (0x6C, "V"), + (0x6D, "V"), + (0x6E, "V"), + (0x6F, "V"), + (0x70, "V"), + (0x71, "V"), + (0x72, "V"), + (0x73, "V"), + (0x74, "V"), + (0x75, "V"), + (0x76, "V"), + (0x77, "V"), + (0x78, "V"), + (0x79, "V"), + (0x7A, "V"), + (0x7B, "V"), + (0x7C, "V"), + (0x7D, "V"), + (0x7E, "V"), + (0x7F, "V"), + (0x80, "X"), + (0x81, "X"), + (0x82, "X"), + (0x83, "X"), + (0x84, "X"), + (0x85, "X"), + (0x86, "X"), + (0x87, "X"), + (0x88, "X"), + (0x89, "X"), + (0x8A, "X"), + (0x8B, "X"), + (0x8C, "X"), + (0x8D, "X"), + (0x8E, "X"), + (0x8F, "X"), + (0x90, "X"), + (0x91, "X"), + (0x92, "X"), + (0x93, "X"), + (0x94, "X"), + (0x95, "X"), + (0x96, "X"), + (0x97, "X"), + (0x98, "X"), + (0x99, "X"), + (0x9A, "X"), + (0x9B, "X"), + (0x9C, "X"), + (0x9D, "X"), + (0x9E, "X"), + (0x9F, "X"), + (0xA0, "M", " "), + (0xA1, "V"), + (0xA2, "V"), + (0xA3, "V"), + (0xA4, "V"), + (0xA5, "V"), + (0xA6, "V"), + (0xA7, "V"), + (0xA8, "M", " ̈"), + (0xA9, "V"), + (0xAA, "M", "a"), + (0xAB, "V"), + (0xAC, "V"), + (0xAD, "I"), + (0xAE, "V"), + (0xAF, "M", " ̄"), + (0xB0, "V"), + (0xB1, "V"), + (0xB2, "M", "2"), + (0xB3, "M", "3"), + (0xB4, "M", " ́"), + (0xB5, "M", "μ"), + (0xB6, "V"), + (0xB7, "V"), + (0xB8, "M", " ̧"), + (0xB9, "M", "1"), + (0xBA, "M", "o"), + (0xBB, "V"), + (0xBC, "M", "1⁄4"), + (0xBD, "M", "1⁄2"), + (0xBE, "M", "3⁄4"), + (0xBF, "V"), + (0xC0, "M", "à"), + (0xC1, "M", "á"), + (0xC2, "M", "â"), + (0xC3, "M", "ã"), + (0xC4, "M", "ä"), + (0xC5, "M", "å"), + (0xC6, "M", "æ"), + (0xC7, "M", "ç"), + ] + + +def _seg_2() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ + (0xC8, "M", "è"), + (0xC9, "M", "é"), + (0xCA, "M", "ê"), + (0xCB, "M", "ë"), + (0xCC, "M", "ì"), + (0xCD, "M", "í"), + (0xCE, "M", "î"), + (0xCF, "M", "ï"), 
+ (0xD0, "M", "ð"), + (0xD1, "M", "ñ"), + (0xD2, "M", "ò"), + (0xD3, "M", "ó"), + (0xD4, "M", "ô"), + (0xD5, "M", "õ"), + (0xD6, "M", "ö"), + (0xD7, "V"), + (0xD8, "M", "ø"), + (0xD9, "M", "ù"), + (0xDA, "M", "ú"), + (0xDB, "M", "û"), + (0xDC, "M", "ü"), + (0xDD, "M", "ý"), + (0xDE, "M", "þ"), + (0xDF, "D", "ss"), + (0xE0, "V"), + (0xE1, "V"), + (0xE2, "V"), + (0xE3, "V"), + (0xE4, "V"), + (0xE5, "V"), + (0xE6, "V"), + (0xE7, "V"), + (0xE8, "V"), + (0xE9, "V"), + (0xEA, "V"), + (0xEB, "V"), + (0xEC, "V"), + (0xED, "V"), + (0xEE, "V"), + (0xEF, "V"), + (0xF0, "V"), + (0xF1, "V"), + (0xF2, "V"), + (0xF3, "V"), + (0xF4, "V"), + (0xF5, "V"), + (0xF6, "V"), + (0xF7, "V"), + (0xF8, "V"), + (0xF9, "V"), + (0xFA, "V"), + (0xFB, "V"), + (0xFC, "V"), + (0xFD, "V"), + (0xFE, "V"), + (0xFF, "V"), + (0x100, "M", "ā"), + (0x101, "V"), + (0x102, "M", "ă"), + (0x103, "V"), + (0x104, "M", "ą"), + (0x105, "V"), + (0x106, "M", "ć"), + (0x107, "V"), + (0x108, "M", "ĉ"), + (0x109, "V"), + (0x10A, "M", "ċ"), + (0x10B, "V"), + (0x10C, "M", "č"), + (0x10D, "V"), + (0x10E, "M", "ď"), + (0x10F, "V"), + (0x110, "M", "đ"), + (0x111, "V"), + (0x112, "M", "ē"), + (0x113, "V"), + (0x114, "M", "ĕ"), + (0x115, "V"), + (0x116, "M", "ė"), + (0x117, "V"), + (0x118, "M", "ę"), + (0x119, "V"), + (0x11A, "M", "ě"), + (0x11B, "V"), + (0x11C, "M", "ĝ"), + (0x11D, "V"), + (0x11E, "M", "ğ"), + (0x11F, "V"), + (0x120, "M", "ġ"), + (0x121, "V"), + (0x122, "M", "ģ"), + (0x123, "V"), + (0x124, "M", "ĥ"), + (0x125, "V"), + (0x126, "M", "ħ"), + (0x127, "V"), + (0x128, "M", "ĩ"), + (0x129, "V"), + (0x12A, "M", "ī"), + (0x12B, "V"), + ] + + +def _seg_3() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ + (0x12C, "M", "ĭ"), + (0x12D, "V"), + (0x12E, "M", "į"), + (0x12F, "V"), + (0x130, "M", "i̇"), + (0x131, "V"), + (0x132, "M", "ij"), + (0x134, "M", "ĵ"), + (0x135, "V"), + (0x136, "M", "ķ"), + (0x137, "V"), + (0x139, "M", "ĺ"), + (0x13A, "V"), + (0x13B, "M", "ļ"), + (0x13C, "V"), + (0x13D, "M", 
"ľ"), + (0x13E, "V"), + (0x13F, "M", "l·"), + (0x141, "M", "ł"), + (0x142, "V"), + (0x143, "M", "ń"), + (0x144, "V"), + (0x145, "M", "ņ"), + (0x146, "V"), + (0x147, "M", "ň"), + (0x148, "V"), + (0x149, "M", "ʼn"), + (0x14A, "M", "ŋ"), + (0x14B, "V"), + (0x14C, "M", "ō"), + (0x14D, "V"), + (0x14E, "M", "ŏ"), + (0x14F, "V"), + (0x150, "M", "ő"), + (0x151, "V"), + (0x152, "M", "œ"), + (0x153, "V"), + (0x154, "M", "ŕ"), + (0x155, "V"), + (0x156, "M", "ŗ"), + (0x157, "V"), + (0x158, "M", "ř"), + (0x159, "V"), + (0x15A, "M", "ś"), + (0x15B, "V"), + (0x15C, "M", "ŝ"), + (0x15D, "V"), + (0x15E, "M", "ş"), + (0x15F, "V"), + (0x160, "M", "š"), + (0x161, "V"), + (0x162, "M", "ţ"), + (0x163, "V"), + (0x164, "M", "ť"), + (0x165, "V"), + (0x166, "M", "ŧ"), + (0x167, "V"), + (0x168, "M", "ũ"), + (0x169, "V"), + (0x16A, "M", "ū"), + (0x16B, "V"), + (0x16C, "M", "ŭ"), + (0x16D, "V"), + (0x16E, "M", "ů"), + (0x16F, "V"), + (0x170, "M", "ű"), + (0x171, "V"), + (0x172, "M", "ų"), + (0x173, "V"), + (0x174, "M", "ŵ"), + (0x175, "V"), + (0x176, "M", "ŷ"), + (0x177, "V"), + (0x178, "M", "ÿ"), + (0x179, "M", "ź"), + (0x17A, "V"), + (0x17B, "M", "ż"), + (0x17C, "V"), + (0x17D, "M", "ž"), + (0x17E, "V"), + (0x17F, "M", "s"), + (0x180, "V"), + (0x181, "M", "ɓ"), + (0x182, "M", "ƃ"), + (0x183, "V"), + (0x184, "M", "ƅ"), + (0x185, "V"), + (0x186, "M", "ɔ"), + (0x187, "M", "ƈ"), + (0x188, "V"), + (0x189, "M", "ɖ"), + (0x18A, "M", "ɗ"), + (0x18B, "M", "ƌ"), + (0x18C, "V"), + (0x18E, "M", "ǝ"), + (0x18F, "M", "ə"), + (0x190, "M", "ɛ"), + (0x191, "M", "ƒ"), + (0x192, "V"), + (0x193, "M", "ɠ"), + ] + + +def _seg_4() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ + (0x194, "M", "ɣ"), + (0x195, "V"), + (0x196, "M", "ɩ"), + (0x197, "M", "ɨ"), + (0x198, "M", "ƙ"), + (0x199, "V"), + (0x19C, "M", "ɯ"), + (0x19D, "M", "ɲ"), + (0x19E, "V"), + (0x19F, "M", "ɵ"), + (0x1A0, "M", "ơ"), + (0x1A1, "V"), + (0x1A2, "M", "ƣ"), + (0x1A3, "V"), + (0x1A4, "M", "ƥ"), + (0x1A5, "V"), + (0x1A6, "M", 
"ʀ"), + (0x1A7, "M", "ƨ"), + (0x1A8, "V"), + (0x1A9, "M", "ʃ"), + (0x1AA, "V"), + (0x1AC, "M", "ƭ"), + (0x1AD, "V"), + (0x1AE, "M", "ʈ"), + (0x1AF, "M", "ư"), + (0x1B0, "V"), + (0x1B1, "M", "ʊ"), + (0x1B2, "M", "ʋ"), + (0x1B3, "M", "ƴ"), + (0x1B4, "V"), + (0x1B5, "M", "ƶ"), + (0x1B6, "V"), + (0x1B7, "M", "ʒ"), + (0x1B8, "M", "ƹ"), + (0x1B9, "V"), + (0x1BC, "M", "ƽ"), + (0x1BD, "V"), + (0x1C4, "M", "dž"), + (0x1C7, "M", "lj"), + (0x1CA, "M", "nj"), + (0x1CD, "M", "ǎ"), + (0x1CE, "V"), + (0x1CF, "M", "ǐ"), + (0x1D0, "V"), + (0x1D1, "M", "ǒ"), + (0x1D2, "V"), + (0x1D3, "M", "ǔ"), + (0x1D4, "V"), + (0x1D5, "M", "ǖ"), + (0x1D6, "V"), + (0x1D7, "M", "ǘ"), + (0x1D8, "V"), + (0x1D9, "M", "ǚ"), + (0x1DA, "V"), + (0x1DB, "M", "ǜ"), + (0x1DC, "V"), + (0x1DE, "M", "ǟ"), + (0x1DF, "V"), + (0x1E0, "M", "ǡ"), + (0x1E1, "V"), + (0x1E2, "M", "ǣ"), + (0x1E3, "V"), + (0x1E4, "M", "ǥ"), + (0x1E5, "V"), + (0x1E6, "M", "ǧ"), + (0x1E7, "V"), + (0x1E8, "M", "ǩ"), + (0x1E9, "V"), + (0x1EA, "M", "ǫ"), + (0x1EB, "V"), + (0x1EC, "M", "ǭ"), + (0x1ED, "V"), + (0x1EE, "M", "ǯ"), + (0x1EF, "V"), + (0x1F1, "M", "dz"), + (0x1F4, "M", "ǵ"), + (0x1F5, "V"), + (0x1F6, "M", "ƕ"), + (0x1F7, "M", "ƿ"), + (0x1F8, "M", "ǹ"), + (0x1F9, "V"), + (0x1FA, "M", "ǻ"), + (0x1FB, "V"), + (0x1FC, "M", "ǽ"), + (0x1FD, "V"), + (0x1FE, "M", "ǿ"), + (0x1FF, "V"), + (0x200, "M", "ȁ"), + (0x201, "V"), + (0x202, "M", "ȃ"), + (0x203, "V"), + (0x204, "M", "ȅ"), + (0x205, "V"), + (0x206, "M", "ȇ"), + (0x207, "V"), + (0x208, "M", "ȉ"), + (0x209, "V"), + (0x20A, "M", "ȋ"), + (0x20B, "V"), + (0x20C, "M", "ȍ"), + ] + + +def _seg_5() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ + (0x20D, "V"), + (0x20E, "M", "ȏ"), + (0x20F, "V"), + (0x210, "M", "ȑ"), + (0x211, "V"), + (0x212, "M", "ȓ"), + (0x213, "V"), + (0x214, "M", "ȕ"), + (0x215, "V"), + (0x216, "M", "ȗ"), + (0x217, "V"), + (0x218, "M", "ș"), + (0x219, "V"), + (0x21A, "M", "ț"), + (0x21B, "V"), + (0x21C, "M", "ȝ"), + (0x21D, "V"), + (0x21E, "M", "ȟ"), + 
(0x21F, "V"), + (0x220, "M", "ƞ"), + (0x221, "V"), + (0x222, "M", "ȣ"), + (0x223, "V"), + (0x224, "M", "ȥ"), + (0x225, "V"), + (0x226, "M", "ȧ"), + (0x227, "V"), + (0x228, "M", "ȩ"), + (0x229, "V"), + (0x22A, "M", "ȫ"), + (0x22B, "V"), + (0x22C, "M", "ȭ"), + (0x22D, "V"), + (0x22E, "M", "ȯ"), + (0x22F, "V"), + (0x230, "M", "ȱ"), + (0x231, "V"), + (0x232, "M", "ȳ"), + (0x233, "V"), + (0x23A, "M", "ⱥ"), + (0x23B, "M", "ȼ"), + (0x23C, "V"), + (0x23D, "M", "ƚ"), + (0x23E, "M", "ⱦ"), + (0x23F, "V"), + (0x241, "M", "ɂ"), + (0x242, "V"), + (0x243, "M", "ƀ"), + (0x244, "M", "ʉ"), + (0x245, "M", "ʌ"), + (0x246, "M", "ɇ"), + (0x247, "V"), + (0x248, "M", "ɉ"), + (0x249, "V"), + (0x24A, "M", "ɋ"), + (0x24B, "V"), + (0x24C, "M", "ɍ"), + (0x24D, "V"), + (0x24E, "M", "ɏ"), + (0x24F, "V"), + (0x2B0, "M", "h"), + (0x2B1, "M", "ɦ"), + (0x2B2, "M", "j"), + (0x2B3, "M", "r"), + (0x2B4, "M", "ɹ"), + (0x2B5, "M", "ɻ"), + (0x2B6, "M", "ʁ"), + (0x2B7, "M", "w"), + (0x2B8, "M", "y"), + (0x2B9, "V"), + (0x2D8, "M", " ̆"), + (0x2D9, "M", " ̇"), + (0x2DA, "M", " ̊"), + (0x2DB, "M", " ̨"), + (0x2DC, "M", " ̃"), + (0x2DD, "M", " ̋"), + (0x2DE, "V"), + (0x2E0, "M", "ɣ"), + (0x2E1, "M", "l"), + (0x2E2, "M", "s"), + (0x2E3, "M", "x"), + (0x2E4, "M", "ʕ"), + (0x2E5, "V"), + (0x340, "M", "̀"), + (0x341, "M", "́"), + (0x342, "V"), + (0x343, "M", "̓"), + (0x344, "M", "̈́"), + (0x345, "M", "ι"), + (0x346, "V"), + (0x34F, "I"), + (0x350, "V"), + (0x370, "M", "ͱ"), + (0x371, "V"), + (0x372, "M", "ͳ"), + (0x373, "V"), + (0x374, "M", "ʹ"), + (0x375, "V"), + (0x376, "M", "ͷ"), + (0x377, "V"), + ] + + +def _seg_6() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ + (0x378, "X"), + (0x37A, "M", " ι"), + (0x37B, "V"), + (0x37E, "M", ";"), + (0x37F, "M", "ϳ"), + (0x380, "X"), + (0x384, "M", " ́"), + (0x385, "M", " ̈́"), + (0x386, "M", "ά"), + (0x387, "M", "·"), + (0x388, "M", "έ"), + (0x389, "M", "ή"), + (0x38A, "M", "ί"), + (0x38B, "X"), + (0x38C, "M", "ό"), + (0x38D, "X"), + (0x38E, "M", 
"ύ"), + (0x38F, "M", "ώ"), + (0x390, "V"), + (0x391, "M", "α"), + (0x392, "M", "β"), + (0x393, "M", "γ"), + (0x394, "M", "δ"), + (0x395, "M", "ε"), + (0x396, "M", "ζ"), + (0x397, "M", "η"), + (0x398, "M", "θ"), + (0x399, "M", "ι"), + (0x39A, "M", "κ"), + (0x39B, "M", "λ"), + (0x39C, "M", "μ"), + (0x39D, "M", "ν"), + (0x39E, "M", "ξ"), + (0x39F, "M", "ο"), + (0x3A0, "M", "π"), + (0x3A1, "M", "ρ"), + (0x3A2, "X"), + (0x3A3, "M", "σ"), + (0x3A4, "M", "τ"), + (0x3A5, "M", "υ"), + (0x3A6, "M", "φ"), + (0x3A7, "M", "χ"), + (0x3A8, "M", "ψ"), + (0x3A9, "M", "ω"), + (0x3AA, "M", "ϊ"), + (0x3AB, "M", "ϋ"), + (0x3AC, "V"), + (0x3C2, "D", "σ"), + (0x3C3, "V"), + (0x3CF, "M", "ϗ"), + (0x3D0, "M", "β"), + (0x3D1, "M", "θ"), + (0x3D2, "M", "υ"), + (0x3D3, "M", "ύ"), + (0x3D4, "M", "ϋ"), + (0x3D5, "M", "φ"), + (0x3D6, "M", "π"), + (0x3D7, "V"), + (0x3D8, "M", "ϙ"), + (0x3D9, "V"), + (0x3DA, "M", "ϛ"), + (0x3DB, "V"), + (0x3DC, "M", "ϝ"), + (0x3DD, "V"), + (0x3DE, "M", "ϟ"), + (0x3DF, "V"), + (0x3E0, "M", "ϡ"), + (0x3E1, "V"), + (0x3E2, "M", "ϣ"), + (0x3E3, "V"), + (0x3E4, "M", "ϥ"), + (0x3E5, "V"), + (0x3E6, "M", "ϧ"), + (0x3E7, "V"), + (0x3E8, "M", "ϩ"), + (0x3E9, "V"), + (0x3EA, "M", "ϫ"), + (0x3EB, "V"), + (0x3EC, "M", "ϭ"), + (0x3ED, "V"), + (0x3EE, "M", "ϯ"), + (0x3EF, "V"), + (0x3F0, "M", "κ"), + (0x3F1, "M", "ρ"), + (0x3F2, "M", "σ"), + (0x3F3, "V"), + (0x3F4, "M", "θ"), + (0x3F5, "M", "ε"), + (0x3F6, "V"), + (0x3F7, "M", "ϸ"), + (0x3F8, "V"), + (0x3F9, "M", "σ"), + (0x3FA, "M", "ϻ"), + (0x3FB, "V"), + (0x3FD, "M", "ͻ"), + (0x3FE, "M", "ͼ"), + (0x3FF, "M", "ͽ"), + (0x400, "M", "ѐ"), + (0x401, "M", "ё"), + (0x402, "M", "ђ"), + ] + + +def _seg_7() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ + (0x403, "M", "ѓ"), + (0x404, "M", "є"), + (0x405, "M", "ѕ"), + (0x406, "M", "і"), + (0x407, "M", "ї"), + (0x408, "M", "ј"), + (0x409, "M", "љ"), + (0x40A, "M", "њ"), + (0x40B, "M", "ћ"), + (0x40C, "M", "ќ"), + (0x40D, "M", "ѝ"), + (0x40E, "M", "ў"), + (0x40F, "M", 
"џ"), + (0x410, "M", "а"), + (0x411, "M", "б"), + (0x412, "M", "в"), + (0x413, "M", "г"), + (0x414, "M", "д"), + (0x415, "M", "е"), + (0x416, "M", "ж"), + (0x417, "M", "з"), + (0x418, "M", "и"), + (0x419, "M", "й"), + (0x41A, "M", "к"), + (0x41B, "M", "л"), + (0x41C, "M", "м"), + (0x41D, "M", "н"), + (0x41E, "M", "о"), + (0x41F, "M", "п"), + (0x420, "M", "р"), + (0x421, "M", "с"), + (0x422, "M", "т"), + (0x423, "M", "у"), + (0x424, "M", "ф"), + (0x425, "M", "х"), + (0x426, "M", "ц"), + (0x427, "M", "ч"), + (0x428, "M", "ш"), + (0x429, "M", "щ"), + (0x42A, "M", "ъ"), + (0x42B, "M", "ы"), + (0x42C, "M", "ь"), + (0x42D, "M", "э"), + (0x42E, "M", "ю"), + (0x42F, "M", "я"), + (0x430, "V"), + (0x460, "M", "ѡ"), + (0x461, "V"), + (0x462, "M", "ѣ"), + (0x463, "V"), + (0x464, "M", "ѥ"), + (0x465, "V"), + (0x466, "M", "ѧ"), + (0x467, "V"), + (0x468, "M", "ѩ"), + (0x469, "V"), + (0x46A, "M", "ѫ"), + (0x46B, "V"), + (0x46C, "M", "ѭ"), + (0x46D, "V"), + (0x46E, "M", "ѯ"), + (0x46F, "V"), + (0x470, "M", "ѱ"), + (0x471, "V"), + (0x472, "M", "ѳ"), + (0x473, "V"), + (0x474, "M", "ѵ"), + (0x475, "V"), + (0x476, "M", "ѷ"), + (0x477, "V"), + (0x478, "M", "ѹ"), + (0x479, "V"), + (0x47A, "M", "ѻ"), + (0x47B, "V"), + (0x47C, "M", "ѽ"), + (0x47D, "V"), + (0x47E, "M", "ѿ"), + (0x47F, "V"), + (0x480, "M", "ҁ"), + (0x481, "V"), + (0x48A, "M", "ҋ"), + (0x48B, "V"), + (0x48C, "M", "ҍ"), + (0x48D, "V"), + (0x48E, "M", "ҏ"), + (0x48F, "V"), + (0x490, "M", "ґ"), + (0x491, "V"), + (0x492, "M", "ғ"), + (0x493, "V"), + (0x494, "M", "ҕ"), + (0x495, "V"), + (0x496, "M", "җ"), + (0x497, "V"), + (0x498, "M", "ҙ"), + (0x499, "V"), + (0x49A, "M", "қ"), + (0x49B, "V"), + (0x49C, "M", "ҝ"), + (0x49D, "V"), + ] + + +def _seg_8() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ + (0x49E, "M", "ҟ"), + (0x49F, "V"), + (0x4A0, "M", "ҡ"), + (0x4A1, "V"), + (0x4A2, "M", "ң"), + (0x4A3, "V"), + (0x4A4, "M", "ҥ"), + (0x4A5, "V"), + (0x4A6, "M", "ҧ"), + (0x4A7, "V"), + (0x4A8, "M", "ҩ"), + (0x4A9, 
"V"), + (0x4AA, "M", "ҫ"), + (0x4AB, "V"), + (0x4AC, "M", "ҭ"), + (0x4AD, "V"), + (0x4AE, "M", "ү"), + (0x4AF, "V"), + (0x4B0, "M", "ұ"), + (0x4B1, "V"), + (0x4B2, "M", "ҳ"), + (0x4B3, "V"), + (0x4B4, "M", "ҵ"), + (0x4B5, "V"), + (0x4B6, "M", "ҷ"), + (0x4B7, "V"), + (0x4B8, "M", "ҹ"), + (0x4B9, "V"), + (0x4BA, "M", "һ"), + (0x4BB, "V"), + (0x4BC, "M", "ҽ"), + (0x4BD, "V"), + (0x4BE, "M", "ҿ"), + (0x4BF, "V"), + (0x4C0, "M", "ӏ"), + (0x4C1, "M", "ӂ"), + (0x4C2, "V"), + (0x4C3, "M", "ӄ"), + (0x4C4, "V"), + (0x4C5, "M", "ӆ"), + (0x4C6, "V"), + (0x4C7, "M", "ӈ"), + (0x4C8, "V"), + (0x4C9, "M", "ӊ"), + (0x4CA, "V"), + (0x4CB, "M", "ӌ"), + (0x4CC, "V"), + (0x4CD, "M", "ӎ"), + (0x4CE, "V"), + (0x4D0, "M", "ӑ"), + (0x4D1, "V"), + (0x4D2, "M", "ӓ"), + (0x4D3, "V"), + (0x4D4, "M", "ӕ"), + (0x4D5, "V"), + (0x4D6, "M", "ӗ"), + (0x4D7, "V"), + (0x4D8, "M", "ә"), + (0x4D9, "V"), + (0x4DA, "M", "ӛ"), + (0x4DB, "V"), + (0x4DC, "M", "ӝ"), + (0x4DD, "V"), + (0x4DE, "M", "ӟ"), + (0x4DF, "V"), + (0x4E0, "M", "ӡ"), + (0x4E1, "V"), + (0x4E2, "M", "ӣ"), + (0x4E3, "V"), + (0x4E4, "M", "ӥ"), + (0x4E5, "V"), + (0x4E6, "M", "ӧ"), + (0x4E7, "V"), + (0x4E8, "M", "ө"), + (0x4E9, "V"), + (0x4EA, "M", "ӫ"), + (0x4EB, "V"), + (0x4EC, "M", "ӭ"), + (0x4ED, "V"), + (0x4EE, "M", "ӯ"), + (0x4EF, "V"), + (0x4F0, "M", "ӱ"), + (0x4F1, "V"), + (0x4F2, "M", "ӳ"), + (0x4F3, "V"), + (0x4F4, "M", "ӵ"), + (0x4F5, "V"), + (0x4F6, "M", "ӷ"), + (0x4F7, "V"), + (0x4F8, "M", "ӹ"), + (0x4F9, "V"), + (0x4FA, "M", "ӻ"), + (0x4FB, "V"), + (0x4FC, "M", "ӽ"), + (0x4FD, "V"), + (0x4FE, "M", "ӿ"), + (0x4FF, "V"), + (0x500, "M", "ԁ"), + (0x501, "V"), + (0x502, "M", "ԃ"), + ] + + +def _seg_9() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ + (0x503, "V"), + (0x504, "M", "ԅ"), + (0x505, "V"), + (0x506, "M", "ԇ"), + (0x507, "V"), + (0x508, "M", "ԉ"), + (0x509, "V"), + (0x50A, "M", "ԋ"), + (0x50B, "V"), + (0x50C, "M", "ԍ"), + (0x50D, "V"), + (0x50E, "M", "ԏ"), + (0x50F, "V"), + (0x510, "M", "ԑ"), + (0x511, 
"V"), + (0x512, "M", "ԓ"), + (0x513, "V"), + (0x514, "M", "ԕ"), + (0x515, "V"), + (0x516, "M", "ԗ"), + (0x517, "V"), + (0x518, "M", "ԙ"), + (0x519, "V"), + (0x51A, "M", "ԛ"), + (0x51B, "V"), + (0x51C, "M", "ԝ"), + (0x51D, "V"), + (0x51E, "M", "ԟ"), + (0x51F, "V"), + (0x520, "M", "ԡ"), + (0x521, "V"), + (0x522, "M", "ԣ"), + (0x523, "V"), + (0x524, "M", "ԥ"), + (0x525, "V"), + (0x526, "M", "ԧ"), + (0x527, "V"), + (0x528, "M", "ԩ"), + (0x529, "V"), + (0x52A, "M", "ԫ"), + (0x52B, "V"), + (0x52C, "M", "ԭ"), + (0x52D, "V"), + (0x52E, "M", "ԯ"), + (0x52F, "V"), + (0x530, "X"), + (0x531, "M", "ա"), + (0x532, "M", "բ"), + (0x533, "M", "գ"), + (0x534, "M", "դ"), + (0x535, "M", "ե"), + (0x536, "M", "զ"), + (0x537, "M", "է"), + (0x538, "M", "ը"), + (0x539, "M", "թ"), + (0x53A, "M", "ժ"), + (0x53B, "M", "ի"), + (0x53C, "M", "լ"), + (0x53D, "M", "խ"), + (0x53E, "M", "ծ"), + (0x53F, "M", "կ"), + (0x540, "M", "հ"), + (0x541, "M", "ձ"), + (0x542, "M", "ղ"), + (0x543, "M", "ճ"), + (0x544, "M", "մ"), + (0x545, "M", "յ"), + (0x546, "M", "ն"), + (0x547, "M", "շ"), + (0x548, "M", "ո"), + (0x549, "M", "չ"), + (0x54A, "M", "պ"), + (0x54B, "M", "ջ"), + (0x54C, "M", "ռ"), + (0x54D, "M", "ս"), + (0x54E, "M", "վ"), + (0x54F, "M", "տ"), + (0x550, "M", "ր"), + (0x551, "M", "ց"), + (0x552, "M", "ւ"), + (0x553, "M", "փ"), + (0x554, "M", "ք"), + (0x555, "M", "օ"), + (0x556, "M", "ֆ"), + (0x557, "X"), + (0x559, "V"), + (0x587, "M", "եւ"), + (0x588, "V"), + (0x58B, "X"), + (0x58D, "V"), + (0x590, "X"), + (0x591, "V"), + (0x5C8, "X"), + (0x5D0, "V"), + (0x5EB, "X"), + (0x5EF, "V"), + (0x5F5, "X"), + (0x606, "V"), + (0x61C, "X"), + (0x61D, "V"), + ] + + +def _seg_10() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ + (0x675, "M", "اٴ"), + (0x676, "M", "وٴ"), + (0x677, "M", "ۇٴ"), + (0x678, "M", "يٴ"), + (0x679, "V"), + (0x6DD, "X"), + (0x6DE, "V"), + (0x70E, "X"), + (0x710, "V"), + (0x74B, "X"), + (0x74D, "V"), + (0x7B2, "X"), + (0x7C0, "V"), + (0x7FB, "X"), + (0x7FD, "V"), + (0x82E, 
"X"), + (0x830, "V"), + (0x83F, "X"), + (0x840, "V"), + (0x85C, "X"), + (0x85E, "V"), + (0x85F, "X"), + (0x860, "V"), + (0x86B, "X"), + (0x870, "V"), + (0x88F, "X"), + (0x897, "V"), + (0x8E2, "X"), + (0x8E3, "V"), + (0x958, "M", "क़"), + (0x959, "M", "ख़"), + (0x95A, "M", "ग़"), + (0x95B, "M", "ज़"), + (0x95C, "M", "ड़"), + (0x95D, "M", "ढ़"), + (0x95E, "M", "फ़"), + (0x95F, "M", "य़"), + (0x960, "V"), + (0x984, "X"), + (0x985, "V"), + (0x98D, "X"), + (0x98F, "V"), + (0x991, "X"), + (0x993, "V"), + (0x9A9, "X"), + (0x9AA, "V"), + (0x9B1, "X"), + (0x9B2, "V"), + (0x9B3, "X"), + (0x9B6, "V"), + (0x9BA, "X"), + (0x9BC, "V"), + (0x9C5, "X"), + (0x9C7, "V"), + (0x9C9, "X"), + (0x9CB, "V"), + (0x9CF, "X"), + (0x9D7, "V"), + (0x9D8, "X"), + (0x9DC, "M", "ড়"), + (0x9DD, "M", "ঢ়"), + (0x9DE, "X"), + (0x9DF, "M", "য়"), + (0x9E0, "V"), + (0x9E4, "X"), + (0x9E6, "V"), + (0x9FF, "X"), + (0xA01, "V"), + (0xA04, "X"), + (0xA05, "V"), + (0xA0B, "X"), + (0xA0F, "V"), + (0xA11, "X"), + (0xA13, "V"), + (0xA29, "X"), + (0xA2A, "V"), + (0xA31, "X"), + (0xA32, "V"), + (0xA33, "M", "ਲ਼"), + (0xA34, "X"), + (0xA35, "V"), + (0xA36, "M", "ਸ਼"), + (0xA37, "X"), + (0xA38, "V"), + (0xA3A, "X"), + (0xA3C, "V"), + (0xA3D, "X"), + (0xA3E, "V"), + (0xA43, "X"), + (0xA47, "V"), + (0xA49, "X"), + (0xA4B, "V"), + (0xA4E, "X"), + (0xA51, "V"), + (0xA52, "X"), + (0xA59, "M", "ਖ਼"), + (0xA5A, "M", "ਗ਼"), + (0xA5B, "M", "ਜ਼"), + (0xA5C, "V"), + (0xA5D, "X"), + ] + + +def _seg_11() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ + (0xA5E, "M", "ਫ਼"), + (0xA5F, "X"), + (0xA66, "V"), + (0xA77, "X"), + (0xA81, "V"), + (0xA84, "X"), + (0xA85, "V"), + (0xA8E, "X"), + (0xA8F, "V"), + (0xA92, "X"), + (0xA93, "V"), + (0xAA9, "X"), + (0xAAA, "V"), + (0xAB1, "X"), + (0xAB2, "V"), + (0xAB4, "X"), + (0xAB5, "V"), + (0xABA, "X"), + (0xABC, "V"), + (0xAC6, "X"), + (0xAC7, "V"), + (0xACA, "X"), + (0xACB, "V"), + (0xACE, "X"), + (0xAD0, "V"), + (0xAD1, "X"), + (0xAE0, "V"), + (0xAE4, "X"), + (0xAE6, 
"V"), + (0xAF2, "X"), + (0xAF9, "V"), + (0xB00, "X"), + (0xB01, "V"), + (0xB04, "X"), + (0xB05, "V"), + (0xB0D, "X"), + (0xB0F, "V"), + (0xB11, "X"), + (0xB13, "V"), + (0xB29, "X"), + (0xB2A, "V"), + (0xB31, "X"), + (0xB32, "V"), + (0xB34, "X"), + (0xB35, "V"), + (0xB3A, "X"), + (0xB3C, "V"), + (0xB45, "X"), + (0xB47, "V"), + (0xB49, "X"), + (0xB4B, "V"), + (0xB4E, "X"), + (0xB55, "V"), + (0xB58, "X"), + (0xB5C, "M", "ଡ଼"), + (0xB5D, "M", "ଢ଼"), + (0xB5E, "X"), + (0xB5F, "V"), + (0xB64, "X"), + (0xB66, "V"), + (0xB78, "X"), + (0xB82, "V"), + (0xB84, "X"), + (0xB85, "V"), + (0xB8B, "X"), + (0xB8E, "V"), + (0xB91, "X"), + (0xB92, "V"), + (0xB96, "X"), + (0xB99, "V"), + (0xB9B, "X"), + (0xB9C, "V"), + (0xB9D, "X"), + (0xB9E, "V"), + (0xBA0, "X"), + (0xBA3, "V"), + (0xBA5, "X"), + (0xBA8, "V"), + (0xBAB, "X"), + (0xBAE, "V"), + (0xBBA, "X"), + (0xBBE, "V"), + (0xBC3, "X"), + (0xBC6, "V"), + (0xBC9, "X"), + (0xBCA, "V"), + (0xBCE, "X"), + (0xBD0, "V"), + (0xBD1, "X"), + (0xBD7, "V"), + (0xBD8, "X"), + (0xBE6, "V"), + (0xBFB, "X"), + (0xC00, "V"), + (0xC0D, "X"), + (0xC0E, "V"), + (0xC11, "X"), + (0xC12, "V"), + (0xC29, "X"), + (0xC2A, "V"), + ] + + +def _seg_12() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ + (0xC3A, "X"), + (0xC3C, "V"), + (0xC45, "X"), + (0xC46, "V"), + (0xC49, "X"), + (0xC4A, "V"), + (0xC4E, "X"), + (0xC55, "V"), + (0xC57, "X"), + (0xC58, "V"), + (0xC5B, "X"), + (0xC5D, "V"), + (0xC5E, "X"), + (0xC60, "V"), + (0xC64, "X"), + (0xC66, "V"), + (0xC70, "X"), + (0xC77, "V"), + (0xC8D, "X"), + (0xC8E, "V"), + (0xC91, "X"), + (0xC92, "V"), + (0xCA9, "X"), + (0xCAA, "V"), + (0xCB4, "X"), + (0xCB5, "V"), + (0xCBA, "X"), + (0xCBC, "V"), + (0xCC5, "X"), + (0xCC6, "V"), + (0xCC9, "X"), + (0xCCA, "V"), + (0xCCE, "X"), + (0xCD5, "V"), + (0xCD7, "X"), + (0xCDD, "V"), + (0xCDF, "X"), + (0xCE0, "V"), + (0xCE4, "X"), + (0xCE6, "V"), + (0xCF0, "X"), + (0xCF1, "V"), + (0xCF4, "X"), + (0xD00, "V"), + (0xD0D, "X"), + (0xD0E, "V"), + (0xD11, "X"), + 
(0xD12, "V"), + (0xD45, "X"), + (0xD46, "V"), + (0xD49, "X"), + (0xD4A, "V"), + (0xD50, "X"), + (0xD54, "V"), + (0xD64, "X"), + (0xD66, "V"), + (0xD80, "X"), + (0xD81, "V"), + (0xD84, "X"), + (0xD85, "V"), + (0xD97, "X"), + (0xD9A, "V"), + (0xDB2, "X"), + (0xDB3, "V"), + (0xDBC, "X"), + (0xDBD, "V"), + (0xDBE, "X"), + (0xDC0, "V"), + (0xDC7, "X"), + (0xDCA, "V"), + (0xDCB, "X"), + (0xDCF, "V"), + (0xDD5, "X"), + (0xDD6, "V"), + (0xDD7, "X"), + (0xDD8, "V"), + (0xDE0, "X"), + (0xDE6, "V"), + (0xDF0, "X"), + (0xDF2, "V"), + (0xDF5, "X"), + (0xE01, "V"), + (0xE33, "M", "ํา"), + (0xE34, "V"), + (0xE3B, "X"), + (0xE3F, "V"), + (0xE5C, "X"), + (0xE81, "V"), + (0xE83, "X"), + (0xE84, "V"), + (0xE85, "X"), + (0xE86, "V"), + (0xE8B, "X"), + (0xE8C, "V"), + (0xEA4, "X"), + (0xEA5, "V"), + (0xEA6, "X"), + (0xEA7, "V"), + (0xEB3, "M", "ໍາ"), + (0xEB4, "V"), + ] + + +def _seg_13() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ + (0xEBE, "X"), + (0xEC0, "V"), + (0xEC5, "X"), + (0xEC6, "V"), + (0xEC7, "X"), + (0xEC8, "V"), + (0xECF, "X"), + (0xED0, "V"), + (0xEDA, "X"), + (0xEDC, "M", "ຫນ"), + (0xEDD, "M", "ຫມ"), + (0xEDE, "V"), + (0xEE0, "X"), + (0xF00, "V"), + (0xF0C, "M", "་"), + (0xF0D, "V"), + (0xF43, "M", "གྷ"), + (0xF44, "V"), + (0xF48, "X"), + (0xF49, "V"), + (0xF4D, "M", "ཌྷ"), + (0xF4E, "V"), + (0xF52, "M", "དྷ"), + (0xF53, "V"), + (0xF57, "M", "བྷ"), + (0xF58, "V"), + (0xF5C, "M", "ཛྷ"), + (0xF5D, "V"), + (0xF69, "M", "ཀྵ"), + (0xF6A, "V"), + (0xF6D, "X"), + (0xF71, "V"), + (0xF73, "M", "ཱི"), + (0xF74, "V"), + (0xF75, "M", "ཱུ"), + (0xF76, "M", "ྲྀ"), + (0xF77, "M", "ྲཱྀ"), + (0xF78, "M", "ླྀ"), + (0xF79, "M", "ླཱྀ"), + (0xF7A, "V"), + (0xF81, "M", "ཱྀ"), + (0xF82, "V"), + (0xF93, "M", "ྒྷ"), + (0xF94, "V"), + (0xF98, "X"), + (0xF99, "V"), + (0xF9D, "M", "ྜྷ"), + (0xF9E, "V"), + (0xFA2, "M", "ྡྷ"), + (0xFA3, "V"), + (0xFA7, "M", "ྦྷ"), + (0xFA8, "V"), + (0xFAC, "M", "ྫྷ"), + (0xFAD, "V"), + (0xFB9, "M", "ྐྵ"), + (0xFBA, "V"), + (0xFBD, "X"), + 
(0xFBE, "V"), + (0xFCD, "X"), + (0xFCE, "V"), + (0xFDB, "X"), + (0x1000, "V"), + (0x10A0, "M", "ⴀ"), + (0x10A1, "M", "ⴁ"), + (0x10A2, "M", "ⴂ"), + (0x10A3, "M", "ⴃ"), + (0x10A4, "M", "ⴄ"), + (0x10A5, "M", "ⴅ"), + (0x10A6, "M", "ⴆ"), + (0x10A7, "M", "ⴇ"), + (0x10A8, "M", "ⴈ"), + (0x10A9, "M", "ⴉ"), + (0x10AA, "M", "ⴊ"), + (0x10AB, "M", "ⴋ"), + (0x10AC, "M", "ⴌ"), + (0x10AD, "M", "ⴍ"), + (0x10AE, "M", "ⴎ"), + (0x10AF, "M", "ⴏ"), + (0x10B0, "M", "ⴐ"), + (0x10B1, "M", "ⴑ"), + (0x10B2, "M", "ⴒ"), + (0x10B3, "M", "ⴓ"), + (0x10B4, "M", "ⴔ"), + (0x10B5, "M", "ⴕ"), + (0x10B6, "M", "ⴖ"), + (0x10B7, "M", "ⴗ"), + (0x10B8, "M", "ⴘ"), + (0x10B9, "M", "ⴙ"), + (0x10BA, "M", "ⴚ"), + (0x10BB, "M", "ⴛ"), + (0x10BC, "M", "ⴜ"), + (0x10BD, "M", "ⴝ"), + (0x10BE, "M", "ⴞ"), + (0x10BF, "M", "ⴟ"), + (0x10C0, "M", "ⴠ"), + (0x10C1, "M", "ⴡ"), + (0x10C2, "M", "ⴢ"), + (0x10C3, "M", "ⴣ"), + (0x10C4, "M", "ⴤ"), + (0x10C5, "M", "ⴥ"), + ] + + +def _seg_14() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ + (0x10C6, "X"), + (0x10C7, "M", "ⴧ"), + (0x10C8, "X"), + (0x10CD, "M", "ⴭ"), + (0x10CE, "X"), + (0x10D0, "V"), + (0x10FC, "M", "ნ"), + (0x10FD, "V"), + (0x115F, "I"), + (0x1161, "V"), + (0x1249, "X"), + (0x124A, "V"), + (0x124E, "X"), + (0x1250, "V"), + (0x1257, "X"), + (0x1258, "V"), + (0x1259, "X"), + (0x125A, "V"), + (0x125E, "X"), + (0x1260, "V"), + (0x1289, "X"), + (0x128A, "V"), + (0x128E, "X"), + (0x1290, "V"), + (0x12B1, "X"), + (0x12B2, "V"), + (0x12B6, "X"), + (0x12B8, "V"), + (0x12BF, "X"), + (0x12C0, "V"), + (0x12C1, "X"), + (0x12C2, "V"), + (0x12C6, "X"), + (0x12C8, "V"), + (0x12D7, "X"), + (0x12D8, "V"), + (0x1311, "X"), + (0x1312, "V"), + (0x1316, "X"), + (0x1318, "V"), + (0x135B, "X"), + (0x135D, "V"), + (0x137D, "X"), + (0x1380, "V"), + (0x139A, "X"), + (0x13A0, "V"), + (0x13F6, "X"), + (0x13F8, "M", "Ᏸ"), + (0x13F9, "M", "Ᏹ"), + (0x13FA, "M", "Ᏺ"), + (0x13FB, "M", "Ᏻ"), + (0x13FC, "M", "Ᏼ"), + (0x13FD, "M", "Ᏽ"), + (0x13FE, "X"), + (0x1400, "V"), + (0x1680, 
"X"), + (0x1681, "V"), + (0x169D, "X"), + (0x16A0, "V"), + (0x16F9, "X"), + (0x1700, "V"), + (0x1716, "X"), + (0x171F, "V"), + (0x1737, "X"), + (0x1740, "V"), + (0x1754, "X"), + (0x1760, "V"), + (0x176D, "X"), + (0x176E, "V"), + (0x1771, "X"), + (0x1772, "V"), + (0x1774, "X"), + (0x1780, "V"), + (0x17B4, "I"), + (0x17B6, "V"), + (0x17DE, "X"), + (0x17E0, "V"), + (0x17EA, "X"), + (0x17F0, "V"), + (0x17FA, "X"), + (0x1800, "V"), + (0x180B, "I"), + (0x1810, "V"), + (0x181A, "X"), + (0x1820, "V"), + (0x1879, "X"), + (0x1880, "V"), + (0x18AB, "X"), + (0x18B0, "V"), + (0x18F6, "X"), + (0x1900, "V"), + (0x191F, "X"), + (0x1920, "V"), + (0x192C, "X"), + (0x1930, "V"), + (0x193C, "X"), + (0x1940, "V"), + (0x1941, "X"), + (0x1944, "V"), + (0x196E, "X"), + ] + + +def _seg_15() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ + (0x1970, "V"), + (0x1975, "X"), + (0x1980, "V"), + (0x19AC, "X"), + (0x19B0, "V"), + (0x19CA, "X"), + (0x19D0, "V"), + (0x19DB, "X"), + (0x19DE, "V"), + (0x1A1C, "X"), + (0x1A1E, "V"), + (0x1A5F, "X"), + (0x1A60, "V"), + (0x1A7D, "X"), + (0x1A7F, "V"), + (0x1A8A, "X"), + (0x1A90, "V"), + (0x1A9A, "X"), + (0x1AA0, "V"), + (0x1AAE, "X"), + (0x1AB0, "V"), + (0x1ACF, "X"), + (0x1B00, "V"), + (0x1B4D, "X"), + (0x1B4E, "V"), + (0x1BF4, "X"), + (0x1BFC, "V"), + (0x1C38, "X"), + (0x1C3B, "V"), + (0x1C4A, "X"), + (0x1C4D, "V"), + (0x1C80, "M", "в"), + (0x1C81, "M", "д"), + (0x1C82, "M", "о"), + (0x1C83, "M", "с"), + (0x1C84, "M", "т"), + (0x1C86, "M", "ъ"), + (0x1C87, "M", "ѣ"), + (0x1C88, "M", "ꙋ"), + (0x1C89, "M", "ᲊ"), + (0x1C8A, "V"), + (0x1C8B, "X"), + (0x1C90, "M", "ა"), + (0x1C91, "M", "ბ"), + (0x1C92, "M", "გ"), + (0x1C93, "M", "დ"), + (0x1C94, "M", "ე"), + (0x1C95, "M", "ვ"), + (0x1C96, "M", "ზ"), + (0x1C97, "M", "თ"), + (0x1C98, "M", "ი"), + (0x1C99, "M", "კ"), + (0x1C9A, "M", "ლ"), + (0x1C9B, "M", "მ"), + (0x1C9C, "M", "ნ"), + (0x1C9D, "M", "ო"), + (0x1C9E, "M", "პ"), + (0x1C9F, "M", "ჟ"), + (0x1CA0, "M", "რ"), + (0x1CA1, "M", "ს"), + 
(0x1CA2, "M", "ტ"), + (0x1CA3, "M", "უ"), + (0x1CA4, "M", "ფ"), + (0x1CA5, "M", "ქ"), + (0x1CA6, "M", "ღ"), + (0x1CA7, "M", "ყ"), + (0x1CA8, "M", "შ"), + (0x1CA9, "M", "ჩ"), + (0x1CAA, "M", "ც"), + (0x1CAB, "M", "ძ"), + (0x1CAC, "M", "წ"), + (0x1CAD, "M", "ჭ"), + (0x1CAE, "M", "ხ"), + (0x1CAF, "M", "ჯ"), + (0x1CB0, "M", "ჰ"), + (0x1CB1, "M", "ჱ"), + (0x1CB2, "M", "ჲ"), + (0x1CB3, "M", "ჳ"), + (0x1CB4, "M", "ჴ"), + (0x1CB5, "M", "ჵ"), + (0x1CB6, "M", "ჶ"), + (0x1CB7, "M", "ჷ"), + (0x1CB8, "M", "ჸ"), + (0x1CB9, "M", "ჹ"), + (0x1CBA, "M", "ჺ"), + (0x1CBB, "X"), + (0x1CBD, "M", "ჽ"), + (0x1CBE, "M", "ჾ"), + (0x1CBF, "M", "ჿ"), + (0x1CC0, "V"), + (0x1CC8, "X"), + (0x1CD0, "V"), + (0x1CFB, "X"), + (0x1D00, "V"), + (0x1D2C, "M", "a"), + (0x1D2D, "M", "æ"), + (0x1D2E, "M", "b"), + (0x1D2F, "V"), + (0x1D30, "M", "d"), + (0x1D31, "M", "e"), + ] + + +def _seg_16() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ + (0x1D32, "M", "ǝ"), + (0x1D33, "M", "g"), + (0x1D34, "M", "h"), + (0x1D35, "M", "i"), + (0x1D36, "M", "j"), + (0x1D37, "M", "k"), + (0x1D38, "M", "l"), + (0x1D39, "M", "m"), + (0x1D3A, "M", "n"), + (0x1D3B, "V"), + (0x1D3C, "M", "o"), + (0x1D3D, "M", "ȣ"), + (0x1D3E, "M", "p"), + (0x1D3F, "M", "r"), + (0x1D40, "M", "t"), + (0x1D41, "M", "u"), + (0x1D42, "M", "w"), + (0x1D43, "M", "a"), + (0x1D44, "M", "ɐ"), + (0x1D45, "M", "ɑ"), + (0x1D46, "M", "ᴂ"), + (0x1D47, "M", "b"), + (0x1D48, "M", "d"), + (0x1D49, "M", "e"), + (0x1D4A, "M", "ə"), + (0x1D4B, "M", "ɛ"), + (0x1D4C, "M", "ɜ"), + (0x1D4D, "M", "g"), + (0x1D4E, "V"), + (0x1D4F, "M", "k"), + (0x1D50, "M", "m"), + (0x1D51, "M", "ŋ"), + (0x1D52, "M", "o"), + (0x1D53, "M", "ɔ"), + (0x1D54, "M", "ᴖ"), + (0x1D55, "M", "ᴗ"), + (0x1D56, "M", "p"), + (0x1D57, "M", "t"), + (0x1D58, "M", "u"), + (0x1D59, "M", "ᴝ"), + (0x1D5A, "M", "ɯ"), + (0x1D5B, "M", "v"), + (0x1D5C, "M", "ᴥ"), + (0x1D5D, "M", "β"), + (0x1D5E, "M", "γ"), + (0x1D5F, "M", "δ"), + (0x1D60, "M", "φ"), + (0x1D61, "M", "χ"), + (0x1D62, "M", "i"), 
+ (0x1D63, "M", "r"), + (0x1D64, "M", "u"), + (0x1D65, "M", "v"), + (0x1D66, "M", "β"), + (0x1D67, "M", "γ"), + (0x1D68, "M", "ρ"), + (0x1D69, "M", "φ"), + (0x1D6A, "M", "χ"), + (0x1D6B, "V"), + (0x1D78, "M", "н"), + (0x1D79, "V"), + (0x1D9B, "M", "ɒ"), + (0x1D9C, "M", "c"), + (0x1D9D, "M", "ɕ"), + (0x1D9E, "M", "ð"), + (0x1D9F, "M", "ɜ"), + (0x1DA0, "M", "f"), + (0x1DA1, "M", "ɟ"), + (0x1DA2, "M", "ɡ"), + (0x1DA3, "M", "ɥ"), + (0x1DA4, "M", "ɨ"), + (0x1DA5, "M", "ɩ"), + (0x1DA6, "M", "ɪ"), + (0x1DA7, "M", "ᵻ"), + (0x1DA8, "M", "ʝ"), + (0x1DA9, "M", "ɭ"), + (0x1DAA, "M", "ᶅ"), + (0x1DAB, "M", "ʟ"), + (0x1DAC, "M", "ɱ"), + (0x1DAD, "M", "ɰ"), + (0x1DAE, "M", "ɲ"), + (0x1DAF, "M", "ɳ"), + (0x1DB0, "M", "ɴ"), + (0x1DB1, "M", "ɵ"), + (0x1DB2, "M", "ɸ"), + (0x1DB3, "M", "ʂ"), + (0x1DB4, "M", "ʃ"), + (0x1DB5, "M", "ƫ"), + (0x1DB6, "M", "ʉ"), + (0x1DB7, "M", "ʊ"), + (0x1DB8, "M", "ᴜ"), + (0x1DB9, "M", "ʋ"), + (0x1DBA, "M", "ʌ"), + (0x1DBB, "M", "z"), + (0x1DBC, "M", "ʐ"), + (0x1DBD, "M", "ʑ"), + (0x1DBE, "M", "ʒ"), + (0x1DBF, "M", "θ"), + (0x1DC0, "V"), + (0x1E00, "M", "ḁ"), + (0x1E01, "V"), + ] + + +def _seg_17() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ + (0x1E02, "M", "ḃ"), + (0x1E03, "V"), + (0x1E04, "M", "ḅ"), + (0x1E05, "V"), + (0x1E06, "M", "ḇ"), + (0x1E07, "V"), + (0x1E08, "M", "ḉ"), + (0x1E09, "V"), + (0x1E0A, "M", "ḋ"), + (0x1E0B, "V"), + (0x1E0C, "M", "ḍ"), + (0x1E0D, "V"), + (0x1E0E, "M", "ḏ"), + (0x1E0F, "V"), + (0x1E10, "M", "ḑ"), + (0x1E11, "V"), + (0x1E12, "M", "ḓ"), + (0x1E13, "V"), + (0x1E14, "M", "ḕ"), + (0x1E15, "V"), + (0x1E16, "M", "ḗ"), + (0x1E17, "V"), + (0x1E18, "M", "ḙ"), + (0x1E19, "V"), + (0x1E1A, "M", "ḛ"), + (0x1E1B, "V"), + (0x1E1C, "M", "ḝ"), + (0x1E1D, "V"), + (0x1E1E, "M", "ḟ"), + (0x1E1F, "V"), + (0x1E20, "M", "ḡ"), + (0x1E21, "V"), + (0x1E22, "M", "ḣ"), + (0x1E23, "V"), + (0x1E24, "M", "ḥ"), + (0x1E25, "V"), + (0x1E26, "M", "ḧ"), + (0x1E27, "V"), + (0x1E28, "M", "ḩ"), + (0x1E29, "V"), + (0x1E2A, "M", "ḫ"), + 
(0x1E2B, "V"), + (0x1E2C, "M", "ḭ"), + (0x1E2D, "V"), + (0x1E2E, "M", "ḯ"), + (0x1E2F, "V"), + (0x1E30, "M", "ḱ"), + (0x1E31, "V"), + (0x1E32, "M", "ḳ"), + (0x1E33, "V"), + (0x1E34, "M", "ḵ"), + (0x1E35, "V"), + (0x1E36, "M", "ḷ"), + (0x1E37, "V"), + (0x1E38, "M", "ḹ"), + (0x1E39, "V"), + (0x1E3A, "M", "ḻ"), + (0x1E3B, "V"), + (0x1E3C, "M", "ḽ"), + (0x1E3D, "V"), + (0x1E3E, "M", "ḿ"), + (0x1E3F, "V"), + (0x1E40, "M", "ṁ"), + (0x1E41, "V"), + (0x1E42, "M", "ṃ"), + (0x1E43, "V"), + (0x1E44, "M", "ṅ"), + (0x1E45, "V"), + (0x1E46, "M", "ṇ"), + (0x1E47, "V"), + (0x1E48, "M", "ṉ"), + (0x1E49, "V"), + (0x1E4A, "M", "ṋ"), + (0x1E4B, "V"), + (0x1E4C, "M", "ṍ"), + (0x1E4D, "V"), + (0x1E4E, "M", "ṏ"), + (0x1E4F, "V"), + (0x1E50, "M", "ṑ"), + (0x1E51, "V"), + (0x1E52, "M", "ṓ"), + (0x1E53, "V"), + (0x1E54, "M", "ṕ"), + (0x1E55, "V"), + (0x1E56, "M", "ṗ"), + (0x1E57, "V"), + (0x1E58, "M", "ṙ"), + (0x1E59, "V"), + (0x1E5A, "M", "ṛ"), + (0x1E5B, "V"), + (0x1E5C, "M", "ṝ"), + (0x1E5D, "V"), + (0x1E5E, "M", "ṟ"), + (0x1E5F, "V"), + (0x1E60, "M", "ṡ"), + (0x1E61, "V"), + (0x1E62, "M", "ṣ"), + (0x1E63, "V"), + (0x1E64, "M", "ṥ"), + (0x1E65, "V"), + ] + + +def _seg_18() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ + (0x1E66, "M", "ṧ"), + (0x1E67, "V"), + (0x1E68, "M", "ṩ"), + (0x1E69, "V"), + (0x1E6A, "M", "ṫ"), + (0x1E6B, "V"), + (0x1E6C, "M", "ṭ"), + (0x1E6D, "V"), + (0x1E6E, "M", "ṯ"), + (0x1E6F, "V"), + (0x1E70, "M", "ṱ"), + (0x1E71, "V"), + (0x1E72, "M", "ṳ"), + (0x1E73, "V"), + (0x1E74, "M", "ṵ"), + (0x1E75, "V"), + (0x1E76, "M", "ṷ"), + (0x1E77, "V"), + (0x1E78, "M", "ṹ"), + (0x1E79, "V"), + (0x1E7A, "M", "ṻ"), + (0x1E7B, "V"), + (0x1E7C, "M", "ṽ"), + (0x1E7D, "V"), + (0x1E7E, "M", "ṿ"), + (0x1E7F, "V"), + (0x1E80, "M", "ẁ"), + (0x1E81, "V"), + (0x1E82, "M", "ẃ"), + (0x1E83, "V"), + (0x1E84, "M", "ẅ"), + (0x1E85, "V"), + (0x1E86, "M", "ẇ"), + (0x1E87, "V"), + (0x1E88, "M", "ẉ"), + (0x1E89, "V"), + (0x1E8A, "M", "ẋ"), + (0x1E8B, "V"), + (0x1E8C, "M", "ẍ"), + 
(0x1E8D, "V"), + (0x1E8E, "M", "ẏ"), + (0x1E8F, "V"), + (0x1E90, "M", "ẑ"), + (0x1E91, "V"), + (0x1E92, "M", "ẓ"), + (0x1E93, "V"), + (0x1E94, "M", "ẕ"), + (0x1E95, "V"), + (0x1E9A, "M", "aʾ"), + (0x1E9B, "M", "ṡ"), + (0x1E9C, "V"), + (0x1E9E, "M", "ß"), + (0x1E9F, "V"), + (0x1EA0, "M", "ạ"), + (0x1EA1, "V"), + (0x1EA2, "M", "ả"), + (0x1EA3, "V"), + (0x1EA4, "M", "ấ"), + (0x1EA5, "V"), + (0x1EA6, "M", "ầ"), + (0x1EA7, "V"), + (0x1EA8, "M", "ẩ"), + (0x1EA9, "V"), + (0x1EAA, "M", "ẫ"), + (0x1EAB, "V"), + (0x1EAC, "M", "ậ"), + (0x1EAD, "V"), + (0x1EAE, "M", "ắ"), + (0x1EAF, "V"), + (0x1EB0, "M", "ằ"), + (0x1EB1, "V"), + (0x1EB2, "M", "ẳ"), + (0x1EB3, "V"), + (0x1EB4, "M", "ẵ"), + (0x1EB5, "V"), + (0x1EB6, "M", "ặ"), + (0x1EB7, "V"), + (0x1EB8, "M", "ẹ"), + (0x1EB9, "V"), + (0x1EBA, "M", "ẻ"), + (0x1EBB, "V"), + (0x1EBC, "M", "ẽ"), + (0x1EBD, "V"), + (0x1EBE, "M", "ế"), + (0x1EBF, "V"), + (0x1EC0, "M", "ề"), + (0x1EC1, "V"), + (0x1EC2, "M", "ể"), + (0x1EC3, "V"), + (0x1EC4, "M", "ễ"), + (0x1EC5, "V"), + (0x1EC6, "M", "ệ"), + (0x1EC7, "V"), + (0x1EC8, "M", "ỉ"), + (0x1EC9, "V"), + (0x1ECA, "M", "ị"), + (0x1ECB, "V"), + (0x1ECC, "M", "ọ"), + (0x1ECD, "V"), + (0x1ECE, "M", "ỏ"), + ] + + +def _seg_19() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ + (0x1ECF, "V"), + (0x1ED0, "M", "ố"), + (0x1ED1, "V"), + (0x1ED2, "M", "ồ"), + (0x1ED3, "V"), + (0x1ED4, "M", "ổ"), + (0x1ED5, "V"), + (0x1ED6, "M", "ỗ"), + (0x1ED7, "V"), + (0x1ED8, "M", "ộ"), + (0x1ED9, "V"), + (0x1EDA, "M", "ớ"), + (0x1EDB, "V"), + (0x1EDC, "M", "ờ"), + (0x1EDD, "V"), + (0x1EDE, "M", "ở"), + (0x1EDF, "V"), + (0x1EE0, "M", "ỡ"), + (0x1EE1, "V"), + (0x1EE2, "M", "ợ"), + (0x1EE3, "V"), + (0x1EE4, "M", "ụ"), + (0x1EE5, "V"), + (0x1EE6, "M", "ủ"), + (0x1EE7, "V"), + (0x1EE8, "M", "ứ"), + (0x1EE9, "V"), + (0x1EEA, "M", "ừ"), + (0x1EEB, "V"), + (0x1EEC, "M", "ử"), + (0x1EED, "V"), + (0x1EEE, "M", "ữ"), + (0x1EEF, "V"), + (0x1EF0, "M", "ự"), + (0x1EF1, "V"), + (0x1EF2, "M", "ỳ"), + (0x1EF3, "V"), 
+ (0x1EF4, "M", "ỵ"), + (0x1EF5, "V"), + (0x1EF6, "M", "ỷ"), + (0x1EF7, "V"), + (0x1EF8, "M", "ỹ"), + (0x1EF9, "V"), + (0x1EFA, "M", "ỻ"), + (0x1EFB, "V"), + (0x1EFC, "M", "ỽ"), + (0x1EFD, "V"), + (0x1EFE, "M", "ỿ"), + (0x1EFF, "V"), + (0x1F08, "M", "ἀ"), + (0x1F09, "M", "ἁ"), + (0x1F0A, "M", "ἂ"), + (0x1F0B, "M", "ἃ"), + (0x1F0C, "M", "ἄ"), + (0x1F0D, "M", "ἅ"), + (0x1F0E, "M", "ἆ"), + (0x1F0F, "M", "ἇ"), + (0x1F10, "V"), + (0x1F16, "X"), + (0x1F18, "M", "ἐ"), + (0x1F19, "M", "ἑ"), + (0x1F1A, "M", "ἒ"), + (0x1F1B, "M", "ἓ"), + (0x1F1C, "M", "ἔ"), + (0x1F1D, "M", "ἕ"), + (0x1F1E, "X"), + (0x1F20, "V"), + (0x1F28, "M", "ἠ"), + (0x1F29, "M", "ἡ"), + (0x1F2A, "M", "ἢ"), + (0x1F2B, "M", "ἣ"), + (0x1F2C, "M", "ἤ"), + (0x1F2D, "M", "ἥ"), + (0x1F2E, "M", "ἦ"), + (0x1F2F, "M", "ἧ"), + (0x1F30, "V"), + (0x1F38, "M", "ἰ"), + (0x1F39, "M", "ἱ"), + (0x1F3A, "M", "ἲ"), + (0x1F3B, "M", "ἳ"), + (0x1F3C, "M", "ἴ"), + (0x1F3D, "M", "ἵ"), + (0x1F3E, "M", "ἶ"), + (0x1F3F, "M", "ἷ"), + (0x1F40, "V"), + (0x1F46, "X"), + (0x1F48, "M", "ὀ"), + (0x1F49, "M", "ὁ"), + (0x1F4A, "M", "ὂ"), + (0x1F4B, "M", "ὃ"), + (0x1F4C, "M", "ὄ"), + (0x1F4D, "M", "ὅ"), + (0x1F4E, "X"), + (0x1F50, "V"), + (0x1F58, "X"), + (0x1F59, "M", "ὑ"), + (0x1F5A, "X"), + (0x1F5B, "M", "ὓ"), + (0x1F5C, "X"), + (0x1F5D, "M", "ὕ"), + ] + + +def _seg_20() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ + (0x1F5E, "X"), + (0x1F5F, "M", "ὗ"), + (0x1F60, "V"), + (0x1F68, "M", "ὠ"), + (0x1F69, "M", "ὡ"), + (0x1F6A, "M", "ὢ"), + (0x1F6B, "M", "ὣ"), + (0x1F6C, "M", "ὤ"), + (0x1F6D, "M", "ὥ"), + (0x1F6E, "M", "ὦ"), + (0x1F6F, "M", "ὧ"), + (0x1F70, "V"), + (0x1F71, "M", "ά"), + (0x1F72, "V"), + (0x1F73, "M", "έ"), + (0x1F74, "V"), + (0x1F75, "M", "ή"), + (0x1F76, "V"), + (0x1F77, "M", "ί"), + (0x1F78, "V"), + (0x1F79, "M", "ό"), + (0x1F7A, "V"), + (0x1F7B, "M", "ύ"), + (0x1F7C, "V"), + (0x1F7D, "M", "ώ"), + (0x1F7E, "X"), + (0x1F80, "M", "ἀι"), + (0x1F81, "M", "ἁι"), + (0x1F82, "M", "ἂι"), + (0x1F83, "M", "ἃι"), 
+ (0x1F84, "M", "ἄι"), + (0x1F85, "M", "ἅι"), + (0x1F86, "M", "ἆι"), + (0x1F87, "M", "ἇι"), + (0x1F88, "M", "ἀι"), + (0x1F89, "M", "ἁι"), + (0x1F8A, "M", "ἂι"), + (0x1F8B, "M", "ἃι"), + (0x1F8C, "M", "ἄι"), + (0x1F8D, "M", "ἅι"), + (0x1F8E, "M", "ἆι"), + (0x1F8F, "M", "ἇι"), + (0x1F90, "M", "ἠι"), + (0x1F91, "M", "ἡι"), + (0x1F92, "M", "ἢι"), + (0x1F93, "M", "ἣι"), + (0x1F94, "M", "ἤι"), + (0x1F95, "M", "ἥι"), + (0x1F96, "M", "ἦι"), + (0x1F97, "M", "ἧι"), + (0x1F98, "M", "ἠι"), + (0x1F99, "M", "ἡι"), + (0x1F9A, "M", "ἢι"), + (0x1F9B, "M", "ἣι"), + (0x1F9C, "M", "ἤι"), + (0x1F9D, "M", "ἥι"), + (0x1F9E, "M", "ἦι"), + (0x1F9F, "M", "ἧι"), + (0x1FA0, "M", "ὠι"), + (0x1FA1, "M", "ὡι"), + (0x1FA2, "M", "ὢι"), + (0x1FA3, "M", "ὣι"), + (0x1FA4, "M", "ὤι"), + (0x1FA5, "M", "ὥι"), + (0x1FA6, "M", "ὦι"), + (0x1FA7, "M", "ὧι"), + (0x1FA8, "M", "ὠι"), + (0x1FA9, "M", "ὡι"), + (0x1FAA, "M", "ὢι"), + (0x1FAB, "M", "ὣι"), + (0x1FAC, "M", "ὤι"), + (0x1FAD, "M", "ὥι"), + (0x1FAE, "M", "ὦι"), + (0x1FAF, "M", "ὧι"), + (0x1FB0, "V"), + (0x1FB2, "M", "ὰι"), + (0x1FB3, "M", "αι"), + (0x1FB4, "M", "άι"), + (0x1FB5, "X"), + (0x1FB6, "V"), + (0x1FB7, "M", "ᾶι"), + (0x1FB8, "M", "ᾰ"), + (0x1FB9, "M", "ᾱ"), + (0x1FBA, "M", "ὰ"), + (0x1FBB, "M", "ά"), + (0x1FBC, "M", "αι"), + (0x1FBD, "M", " ̓"), + (0x1FBE, "M", "ι"), + (0x1FBF, "M", " ̓"), + (0x1FC0, "M", " ͂"), + (0x1FC1, "M", " ̈͂"), + (0x1FC2, "M", "ὴι"), + (0x1FC3, "M", "ηι"), + (0x1FC4, "M", "ήι"), + (0x1FC5, "X"), + (0x1FC6, "V"), + (0x1FC7, "M", "ῆι"), + (0x1FC8, "M", "ὲ"), + (0x1FC9, "M", "έ"), + (0x1FCA, "M", "ὴ"), + ] + + +def _seg_21() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ + (0x1FCB, "M", "ή"), + (0x1FCC, "M", "ηι"), + (0x1FCD, "M", " ̓̀"), + (0x1FCE, "M", " ̓́"), + (0x1FCF, "M", " ̓͂"), + (0x1FD0, "V"), + (0x1FD3, "M", "ΐ"), + (0x1FD4, "X"), + (0x1FD6, "V"), + (0x1FD8, "M", "ῐ"), + (0x1FD9, "M", "ῑ"), + (0x1FDA, "M", "ὶ"), + (0x1FDB, "M", "ί"), + (0x1FDC, "X"), + (0x1FDD, "M", " ̔̀"), + (0x1FDE, "M", " 
̔́"), + (0x1FDF, "M", " ̔͂"), + (0x1FE0, "V"), + (0x1FE3, "M", "ΰ"), + (0x1FE4, "V"), + (0x1FE8, "M", "ῠ"), + (0x1FE9, "M", "ῡ"), + (0x1FEA, "M", "ὺ"), + (0x1FEB, "M", "ύ"), + (0x1FEC, "M", "ῥ"), + (0x1FED, "M", " ̈̀"), + (0x1FEE, "M", " ̈́"), + (0x1FEF, "M", "`"), + (0x1FF0, "X"), + (0x1FF2, "M", "ὼι"), + (0x1FF3, "M", "ωι"), + (0x1FF4, "M", "ώι"), + (0x1FF5, "X"), + (0x1FF6, "V"), + (0x1FF7, "M", "ῶι"), + (0x1FF8, "M", "ὸ"), + (0x1FF9, "M", "ό"), + (0x1FFA, "M", "ὼ"), + (0x1FFB, "M", "ώ"), + (0x1FFC, "M", "ωι"), + (0x1FFD, "M", " ́"), + (0x1FFE, "M", " ̔"), + (0x1FFF, "X"), + (0x2000, "M", " "), + (0x200B, "I"), + (0x200C, "D", ""), + (0x200E, "X"), + (0x2010, "V"), + (0x2011, "M", "‐"), + (0x2012, "V"), + (0x2017, "M", " ̳"), + (0x2018, "V"), + (0x2024, "X"), + (0x2027, "V"), + (0x2028, "X"), + (0x202F, "M", " "), + (0x2030, "V"), + (0x2033, "M", "′′"), + (0x2034, "M", "′′′"), + (0x2035, "V"), + (0x2036, "M", "‵‵"), + (0x2037, "M", "‵‵‵"), + (0x2038, "V"), + (0x203C, "M", "!!"), + (0x203D, "V"), + (0x203E, "M", " ̅"), + (0x203F, "V"), + (0x2047, "M", "??"), + (0x2048, "M", "?!"), + (0x2049, "M", "!?"), + (0x204A, "V"), + (0x2057, "M", "′′′′"), + (0x2058, "V"), + (0x205F, "M", " "), + (0x2060, "I"), + (0x2065, "X"), + (0x206A, "I"), + (0x2070, "M", "0"), + (0x2071, "M", "i"), + (0x2072, "X"), + (0x2074, "M", "4"), + (0x2075, "M", "5"), + (0x2076, "M", "6"), + (0x2077, "M", "7"), + (0x2078, "M", "8"), + (0x2079, "M", "9"), + (0x207A, "M", "+"), + (0x207B, "M", "−"), + (0x207C, "M", "="), + (0x207D, "M", "("), + (0x207E, "M", ")"), + (0x207F, "M", "n"), + (0x2080, "M", "0"), + (0x2081, "M", "1"), + (0x2082, "M", "2"), + (0x2083, "M", "3"), + (0x2084, "M", "4"), + (0x2085, "M", "5"), + (0x2086, "M", "6"), + (0x2087, "M", "7"), + ] + + +def _seg_22() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ + (0x2088, "M", "8"), + (0x2089, "M", "9"), + (0x208A, "M", "+"), + (0x208B, "M", "−"), + (0x208C, "M", "="), + (0x208D, "M", "("), + (0x208E, "M", ")"), 
+ (0x208F, "X"), + (0x2090, "M", "a"), + (0x2091, "M", "e"), + (0x2092, "M", "o"), + (0x2093, "M", "x"), + (0x2094, "M", "ə"), + (0x2095, "M", "h"), + (0x2096, "M", "k"), + (0x2097, "M", "l"), + (0x2098, "M", "m"), + (0x2099, "M", "n"), + (0x209A, "M", "p"), + (0x209B, "M", "s"), + (0x209C, "M", "t"), + (0x209D, "X"), + (0x20A0, "V"), + (0x20A8, "M", "rs"), + (0x20A9, "V"), + (0x20C1, "X"), + (0x20D0, "V"), + (0x20F1, "X"), + (0x2100, "M", "a/c"), + (0x2101, "M", "a/s"), + (0x2102, "M", "c"), + (0x2103, "M", "°c"), + (0x2104, "V"), + (0x2105, "M", "c/o"), + (0x2106, "M", "c/u"), + (0x2107, "M", "ɛ"), + (0x2108, "V"), + (0x2109, "M", "°f"), + (0x210A, "M", "g"), + (0x210B, "M", "h"), + (0x210F, "M", "ħ"), + (0x2110, "M", "i"), + (0x2112, "M", "l"), + (0x2114, "V"), + (0x2115, "M", "n"), + (0x2116, "M", "no"), + (0x2117, "V"), + (0x2119, "M", "p"), + (0x211A, "M", "q"), + (0x211B, "M", "r"), + (0x211E, "V"), + (0x2120, "M", "sm"), + (0x2121, "M", "tel"), + (0x2122, "M", "tm"), + (0x2123, "V"), + (0x2124, "M", "z"), + (0x2125, "V"), + (0x2126, "M", "ω"), + (0x2127, "V"), + (0x2128, "M", "z"), + (0x2129, "V"), + (0x212A, "M", "k"), + (0x212B, "M", "å"), + (0x212C, "M", "b"), + (0x212D, "M", "c"), + (0x212E, "V"), + (0x212F, "M", "e"), + (0x2131, "M", "f"), + (0x2132, "M", "ⅎ"), + (0x2133, "M", "m"), + (0x2134, "M", "o"), + (0x2135, "M", "א"), + (0x2136, "M", "ב"), + (0x2137, "M", "ג"), + (0x2138, "M", "ד"), + (0x2139, "M", "i"), + (0x213A, "V"), + (0x213B, "M", "fax"), + (0x213C, "M", "π"), + (0x213D, "M", "γ"), + (0x213F, "M", "π"), + (0x2140, "M", "∑"), + (0x2141, "V"), + (0x2145, "M", "d"), + (0x2147, "M", "e"), + (0x2148, "M", "i"), + (0x2149, "M", "j"), + (0x214A, "V"), + (0x2150, "M", "1⁄7"), + (0x2151, "M", "1⁄9"), + (0x2152, "M", "1⁄10"), + (0x2153, "M", "1⁄3"), + (0x2154, "M", "2⁄3"), + (0x2155, "M", "1⁄5"), + (0x2156, "M", "2⁄5"), + (0x2157, "M", "3⁄5"), + (0x2158, "M", "4⁄5"), + (0x2159, "M", "1⁄6"), + (0x215A, "M", "5⁄6"), + (0x215B, "M", "1⁄8"), + ] + + 
+def _seg_23() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ + (0x215C, "M", "3⁄8"), + (0x215D, "M", "5⁄8"), + (0x215E, "M", "7⁄8"), + (0x215F, "M", "1⁄"), + (0x2160, "M", "i"), + (0x2161, "M", "ii"), + (0x2162, "M", "iii"), + (0x2163, "M", "iv"), + (0x2164, "M", "v"), + (0x2165, "M", "vi"), + (0x2166, "M", "vii"), + (0x2167, "M", "viii"), + (0x2168, "M", "ix"), + (0x2169, "M", "x"), + (0x216A, "M", "xi"), + (0x216B, "M", "xii"), + (0x216C, "M", "l"), + (0x216D, "M", "c"), + (0x216E, "M", "d"), + (0x216F, "M", "m"), + (0x2170, "M", "i"), + (0x2171, "M", "ii"), + (0x2172, "M", "iii"), + (0x2173, "M", "iv"), + (0x2174, "M", "v"), + (0x2175, "M", "vi"), + (0x2176, "M", "vii"), + (0x2177, "M", "viii"), + (0x2178, "M", "ix"), + (0x2179, "M", "x"), + (0x217A, "M", "xi"), + (0x217B, "M", "xii"), + (0x217C, "M", "l"), + (0x217D, "M", "c"), + (0x217E, "M", "d"), + (0x217F, "M", "m"), + (0x2180, "V"), + (0x2183, "M", "ↄ"), + (0x2184, "V"), + (0x2189, "M", "0⁄3"), + (0x218A, "V"), + (0x218C, "X"), + (0x2190, "V"), + (0x222C, "M", "∫∫"), + (0x222D, "M", "∫∫∫"), + (0x222E, "V"), + (0x222F, "M", "∮∮"), + (0x2230, "M", "∮∮∮"), + (0x2231, "V"), + (0x2329, "M", "〈"), + (0x232A, "M", "〉"), + (0x232B, "V"), + (0x242A, "X"), + (0x2440, "V"), + (0x244B, "X"), + (0x2460, "M", "1"), + (0x2461, "M", "2"), + (0x2462, "M", "3"), + (0x2463, "M", "4"), + (0x2464, "M", "5"), + (0x2465, "M", "6"), + (0x2466, "M", "7"), + (0x2467, "M", "8"), + (0x2468, "M", "9"), + (0x2469, "M", "10"), + (0x246A, "M", "11"), + (0x246B, "M", "12"), + (0x246C, "M", "13"), + (0x246D, "M", "14"), + (0x246E, "M", "15"), + (0x246F, "M", "16"), + (0x2470, "M", "17"), + (0x2471, "M", "18"), + (0x2472, "M", "19"), + (0x2473, "M", "20"), + (0x2474, "M", "(1)"), + (0x2475, "M", "(2)"), + (0x2476, "M", "(3)"), + (0x2477, "M", "(4)"), + (0x2478, "M", "(5)"), + (0x2479, "M", "(6)"), + (0x247A, "M", "(7)"), + (0x247B, "M", "(8)"), + (0x247C, "M", "(9)"), + (0x247D, "M", "(10)"), + (0x247E, "M", "(11)"), + 
(0x247F, "M", "(12)"), + (0x2480, "M", "(13)"), + (0x2481, "M", "(14)"), + (0x2482, "M", "(15)"), + (0x2483, "M", "(16)"), + (0x2484, "M", "(17)"), + (0x2485, "M", "(18)"), + (0x2486, "M", "(19)"), + (0x2487, "M", "(20)"), + (0x2488, "X"), + (0x249C, "M", "(a)"), + (0x249D, "M", "(b)"), + (0x249E, "M", "(c)"), + (0x249F, "M", "(d)"), + ] + + +def _seg_24() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ + (0x24A0, "M", "(e)"), + (0x24A1, "M", "(f)"), + (0x24A2, "M", "(g)"), + (0x24A3, "M", "(h)"), + (0x24A4, "M", "(i)"), + (0x24A5, "M", "(j)"), + (0x24A6, "M", "(k)"), + (0x24A7, "M", "(l)"), + (0x24A8, "M", "(m)"), + (0x24A9, "M", "(n)"), + (0x24AA, "M", "(o)"), + (0x24AB, "M", "(p)"), + (0x24AC, "M", "(q)"), + (0x24AD, "M", "(r)"), + (0x24AE, "M", "(s)"), + (0x24AF, "M", "(t)"), + (0x24B0, "M", "(u)"), + (0x24B1, "M", "(v)"), + (0x24B2, "M", "(w)"), + (0x24B3, "M", "(x)"), + (0x24B4, "M", "(y)"), + (0x24B5, "M", "(z)"), + (0x24B6, "M", "a"), + (0x24B7, "M", "b"), + (0x24B8, "M", "c"), + (0x24B9, "M", "d"), + (0x24BA, "M", "e"), + (0x24BB, "M", "f"), + (0x24BC, "M", "g"), + (0x24BD, "M", "h"), + (0x24BE, "M", "i"), + (0x24BF, "M", "j"), + (0x24C0, "M", "k"), + (0x24C1, "M", "l"), + (0x24C2, "M", "m"), + (0x24C3, "M", "n"), + (0x24C4, "M", "o"), + (0x24C5, "M", "p"), + (0x24C6, "M", "q"), + (0x24C7, "M", "r"), + (0x24C8, "M", "s"), + (0x24C9, "M", "t"), + (0x24CA, "M", "u"), + (0x24CB, "M", "v"), + (0x24CC, "M", "w"), + (0x24CD, "M", "x"), + (0x24CE, "M", "y"), + (0x24CF, "M", "z"), + (0x24D0, "M", "a"), + (0x24D1, "M", "b"), + (0x24D2, "M", "c"), + (0x24D3, "M", "d"), + (0x24D4, "M", "e"), + (0x24D5, "M", "f"), + (0x24D6, "M", "g"), + (0x24D7, "M", "h"), + (0x24D8, "M", "i"), + (0x24D9, "M", "j"), + (0x24DA, "M", "k"), + (0x24DB, "M", "l"), + (0x24DC, "M", "m"), + (0x24DD, "M", "n"), + (0x24DE, "M", "o"), + (0x24DF, "M", "p"), + (0x24E0, "M", "q"), + (0x24E1, "M", "r"), + (0x24E2, "M", "s"), + (0x24E3, "M", "t"), + (0x24E4, "M", "u"), + (0x24E5, 
"M", "v"), + (0x24E6, "M", "w"), + (0x24E7, "M", "x"), + (0x24E8, "M", "y"), + (0x24E9, "M", "z"), + (0x24EA, "M", "0"), + (0x24EB, "V"), + (0x2A0C, "M", "∫∫∫∫"), + (0x2A0D, "V"), + (0x2A74, "M", "::="), + (0x2A75, "M", "=="), + (0x2A76, "M", "==="), + (0x2A77, "V"), + (0x2ADC, "M", "⫝̸"), + (0x2ADD, "V"), + (0x2B74, "X"), + (0x2B76, "V"), + (0x2B96, "X"), + (0x2B97, "V"), + (0x2C00, "M", "ⰰ"), + (0x2C01, "M", "ⰱ"), + (0x2C02, "M", "ⰲ"), + (0x2C03, "M", "ⰳ"), + (0x2C04, "M", "ⰴ"), + (0x2C05, "M", "ⰵ"), + (0x2C06, "M", "ⰶ"), + (0x2C07, "M", "ⰷ"), + (0x2C08, "M", "ⰸ"), + (0x2C09, "M", "ⰹ"), + (0x2C0A, "M", "ⰺ"), + (0x2C0B, "M", "ⰻ"), + ] + + +def _seg_25() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ + (0x2C0C, "M", "ⰼ"), + (0x2C0D, "M", "ⰽ"), + (0x2C0E, "M", "ⰾ"), + (0x2C0F, "M", "ⰿ"), + (0x2C10, "M", "ⱀ"), + (0x2C11, "M", "ⱁ"), + (0x2C12, "M", "ⱂ"), + (0x2C13, "M", "ⱃ"), + (0x2C14, "M", "ⱄ"), + (0x2C15, "M", "ⱅ"), + (0x2C16, "M", "ⱆ"), + (0x2C17, "M", "ⱇ"), + (0x2C18, "M", "ⱈ"), + (0x2C19, "M", "ⱉ"), + (0x2C1A, "M", "ⱊ"), + (0x2C1B, "M", "ⱋ"), + (0x2C1C, "M", "ⱌ"), + (0x2C1D, "M", "ⱍ"), + (0x2C1E, "M", "ⱎ"), + (0x2C1F, "M", "ⱏ"), + (0x2C20, "M", "ⱐ"), + (0x2C21, "M", "ⱑ"), + (0x2C22, "M", "ⱒ"), + (0x2C23, "M", "ⱓ"), + (0x2C24, "M", "ⱔ"), + (0x2C25, "M", "ⱕ"), + (0x2C26, "M", "ⱖ"), + (0x2C27, "M", "ⱗ"), + (0x2C28, "M", "ⱘ"), + (0x2C29, "M", "ⱙ"), + (0x2C2A, "M", "ⱚ"), + (0x2C2B, "M", "ⱛ"), + (0x2C2C, "M", "ⱜ"), + (0x2C2D, "M", "ⱝ"), + (0x2C2E, "M", "ⱞ"), + (0x2C2F, "M", "ⱟ"), + (0x2C30, "V"), + (0x2C60, "M", "ⱡ"), + (0x2C61, "V"), + (0x2C62, "M", "ɫ"), + (0x2C63, "M", "ᵽ"), + (0x2C64, "M", "ɽ"), + (0x2C65, "V"), + (0x2C67, "M", "ⱨ"), + (0x2C68, "V"), + (0x2C69, "M", "ⱪ"), + (0x2C6A, "V"), + (0x2C6B, "M", "ⱬ"), + (0x2C6C, "V"), + (0x2C6D, "M", "ɑ"), + (0x2C6E, "M", "ɱ"), + (0x2C6F, "M", "ɐ"), + (0x2C70, "M", "ɒ"), + (0x2C71, "V"), + (0x2C72, "M", "ⱳ"), + (0x2C73, "V"), + (0x2C75, "M", "ⱶ"), + (0x2C76, "V"), + (0x2C7C, "M", "j"), + (0x2C7D, "M", 
"v"), + (0x2C7E, "M", "ȿ"), + (0x2C7F, "M", "ɀ"), + (0x2C80, "M", "ⲁ"), + (0x2C81, "V"), + (0x2C82, "M", "ⲃ"), + (0x2C83, "V"), + (0x2C84, "M", "ⲅ"), + (0x2C85, "V"), + (0x2C86, "M", "ⲇ"), + (0x2C87, "V"), + (0x2C88, "M", "ⲉ"), + (0x2C89, "V"), + (0x2C8A, "M", "ⲋ"), + (0x2C8B, "V"), + (0x2C8C, "M", "ⲍ"), + (0x2C8D, "V"), + (0x2C8E, "M", "ⲏ"), + (0x2C8F, "V"), + (0x2C90, "M", "ⲑ"), + (0x2C91, "V"), + (0x2C92, "M", "ⲓ"), + (0x2C93, "V"), + (0x2C94, "M", "ⲕ"), + (0x2C95, "V"), + (0x2C96, "M", "ⲗ"), + (0x2C97, "V"), + (0x2C98, "M", "ⲙ"), + (0x2C99, "V"), + (0x2C9A, "M", "ⲛ"), + (0x2C9B, "V"), + (0x2C9C, "M", "ⲝ"), + (0x2C9D, "V"), + (0x2C9E, "M", "ⲟ"), + (0x2C9F, "V"), + (0x2CA0, "M", "ⲡ"), + (0x2CA1, "V"), + (0x2CA2, "M", "ⲣ"), + (0x2CA3, "V"), + (0x2CA4, "M", "ⲥ"), + (0x2CA5, "V"), + ] + + +def _seg_26() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ + (0x2CA6, "M", "ⲧ"), + (0x2CA7, "V"), + (0x2CA8, "M", "ⲩ"), + (0x2CA9, "V"), + (0x2CAA, "M", "ⲫ"), + (0x2CAB, "V"), + (0x2CAC, "M", "ⲭ"), + (0x2CAD, "V"), + (0x2CAE, "M", "ⲯ"), + (0x2CAF, "V"), + (0x2CB0, "M", "ⲱ"), + (0x2CB1, "V"), + (0x2CB2, "M", "ⲳ"), + (0x2CB3, "V"), + (0x2CB4, "M", "ⲵ"), + (0x2CB5, "V"), + (0x2CB6, "M", "ⲷ"), + (0x2CB7, "V"), + (0x2CB8, "M", "ⲹ"), + (0x2CB9, "V"), + (0x2CBA, "M", "ⲻ"), + (0x2CBB, "V"), + (0x2CBC, "M", "ⲽ"), + (0x2CBD, "V"), + (0x2CBE, "M", "ⲿ"), + (0x2CBF, "V"), + (0x2CC0, "M", "ⳁ"), + (0x2CC1, "V"), + (0x2CC2, "M", "ⳃ"), + (0x2CC3, "V"), + (0x2CC4, "M", "ⳅ"), + (0x2CC5, "V"), + (0x2CC6, "M", "ⳇ"), + (0x2CC7, "V"), + (0x2CC8, "M", "ⳉ"), + (0x2CC9, "V"), + (0x2CCA, "M", "ⳋ"), + (0x2CCB, "V"), + (0x2CCC, "M", "ⳍ"), + (0x2CCD, "V"), + (0x2CCE, "M", "ⳏ"), + (0x2CCF, "V"), + (0x2CD0, "M", "ⳑ"), + (0x2CD1, "V"), + (0x2CD2, "M", "ⳓ"), + (0x2CD3, "V"), + (0x2CD4, "M", "ⳕ"), + (0x2CD5, "V"), + (0x2CD6, "M", "ⳗ"), + (0x2CD7, "V"), + (0x2CD8, "M", "ⳙ"), + (0x2CD9, "V"), + (0x2CDA, "M", "ⳛ"), + (0x2CDB, "V"), + (0x2CDC, "M", "ⳝ"), + (0x2CDD, "V"), + (0x2CDE, "M", "ⳟ"), + 
(0x2CDF, "V"), + (0x2CE0, "M", "ⳡ"), + (0x2CE1, "V"), + (0x2CE2, "M", "ⳣ"), + (0x2CE3, "V"), + (0x2CEB, "M", "ⳬ"), + (0x2CEC, "V"), + (0x2CED, "M", "ⳮ"), + (0x2CEE, "V"), + (0x2CF2, "M", "ⳳ"), + (0x2CF3, "V"), + (0x2CF4, "X"), + (0x2CF9, "V"), + (0x2D26, "X"), + (0x2D27, "V"), + (0x2D28, "X"), + (0x2D2D, "V"), + (0x2D2E, "X"), + (0x2D30, "V"), + (0x2D68, "X"), + (0x2D6F, "M", "ⵡ"), + (0x2D70, "V"), + (0x2D71, "X"), + (0x2D7F, "V"), + (0x2D97, "X"), + (0x2DA0, "V"), + (0x2DA7, "X"), + (0x2DA8, "V"), + (0x2DAF, "X"), + (0x2DB0, "V"), + (0x2DB7, "X"), + (0x2DB8, "V"), + (0x2DBF, "X"), + (0x2DC0, "V"), + (0x2DC7, "X"), + (0x2DC8, "V"), + (0x2DCF, "X"), + (0x2DD0, "V"), + (0x2DD7, "X"), + (0x2DD8, "V"), + (0x2DDF, "X"), + (0x2DE0, "V"), + (0x2E5E, "X"), + ] + + +def _seg_27() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ + (0x2E80, "V"), + (0x2E9A, "X"), + (0x2E9B, "V"), + (0x2E9F, "M", "母"), + (0x2EA0, "V"), + (0x2EF3, "M", "龟"), + (0x2EF4, "X"), + (0x2F00, "M", "一"), + (0x2F01, "M", "丨"), + (0x2F02, "M", "丶"), + (0x2F03, "M", "丿"), + (0x2F04, "M", "乙"), + (0x2F05, "M", "亅"), + (0x2F06, "M", "二"), + (0x2F07, "M", "亠"), + (0x2F08, "M", "人"), + (0x2F09, "M", "儿"), + (0x2F0A, "M", "入"), + (0x2F0B, "M", "八"), + (0x2F0C, "M", "冂"), + (0x2F0D, "M", "冖"), + (0x2F0E, "M", "冫"), + (0x2F0F, "M", "几"), + (0x2F10, "M", "凵"), + (0x2F11, "M", "刀"), + (0x2F12, "M", "力"), + (0x2F13, "M", "勹"), + (0x2F14, "M", "匕"), + (0x2F15, "M", "匚"), + (0x2F16, "M", "匸"), + (0x2F17, "M", "十"), + (0x2F18, "M", "卜"), + (0x2F19, "M", "卩"), + (0x2F1A, "M", "厂"), + (0x2F1B, "M", "厶"), + (0x2F1C, "M", "又"), + (0x2F1D, "M", "口"), + (0x2F1E, "M", "囗"), + (0x2F1F, "M", "土"), + (0x2F20, "M", "士"), + (0x2F21, "M", "夂"), + (0x2F22, "M", "夊"), + (0x2F23, "M", "夕"), + (0x2F24, "M", "大"), + (0x2F25, "M", "女"), + (0x2F26, "M", "子"), + (0x2F27, "M", "宀"), + (0x2F28, "M", "寸"), + (0x2F29, "M", "小"), + (0x2F2A, "M", "尢"), + (0x2F2B, "M", "尸"), + (0x2F2C, "M", "屮"), + (0x2F2D, "M", "山"), + (0x2F2E, 
"M", "巛"), + (0x2F2F, "M", "工"), + (0x2F30, "M", "己"), + (0x2F31, "M", "巾"), + (0x2F32, "M", "干"), + (0x2F33, "M", "幺"), + (0x2F34, "M", "广"), + (0x2F35, "M", "廴"), + (0x2F36, "M", "廾"), + (0x2F37, "M", "弋"), + (0x2F38, "M", "弓"), + (0x2F39, "M", "彐"), + (0x2F3A, "M", "彡"), + (0x2F3B, "M", "彳"), + (0x2F3C, "M", "心"), + (0x2F3D, "M", "戈"), + (0x2F3E, "M", "戶"), + (0x2F3F, "M", "手"), + (0x2F40, "M", "支"), + (0x2F41, "M", "攴"), + (0x2F42, "M", "文"), + (0x2F43, "M", "斗"), + (0x2F44, "M", "斤"), + (0x2F45, "M", "方"), + (0x2F46, "M", "无"), + (0x2F47, "M", "日"), + (0x2F48, "M", "曰"), + (0x2F49, "M", "月"), + (0x2F4A, "M", "木"), + (0x2F4B, "M", "欠"), + (0x2F4C, "M", "止"), + (0x2F4D, "M", "歹"), + (0x2F4E, "M", "殳"), + (0x2F4F, "M", "毋"), + (0x2F50, "M", "比"), + (0x2F51, "M", "毛"), + (0x2F52, "M", "氏"), + (0x2F53, "M", "气"), + (0x2F54, "M", "水"), + (0x2F55, "M", "火"), + (0x2F56, "M", "爪"), + (0x2F57, "M", "父"), + (0x2F58, "M", "爻"), + (0x2F59, "M", "爿"), + (0x2F5A, "M", "片"), + (0x2F5B, "M", "牙"), + (0x2F5C, "M", "牛"), + ] + + +def _seg_28() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ + (0x2F5D, "M", "犬"), + (0x2F5E, "M", "玄"), + (0x2F5F, "M", "玉"), + (0x2F60, "M", "瓜"), + (0x2F61, "M", "瓦"), + (0x2F62, "M", "甘"), + (0x2F63, "M", "生"), + (0x2F64, "M", "用"), + (0x2F65, "M", "田"), + (0x2F66, "M", "疋"), + (0x2F67, "M", "疒"), + (0x2F68, "M", "癶"), + (0x2F69, "M", "白"), + (0x2F6A, "M", "皮"), + (0x2F6B, "M", "皿"), + (0x2F6C, "M", "目"), + (0x2F6D, "M", "矛"), + (0x2F6E, "M", "矢"), + (0x2F6F, "M", "石"), + (0x2F70, "M", "示"), + (0x2F71, "M", "禸"), + (0x2F72, "M", "禾"), + (0x2F73, "M", "穴"), + (0x2F74, "M", "立"), + (0x2F75, "M", "竹"), + (0x2F76, "M", "米"), + (0x2F77, "M", "糸"), + (0x2F78, "M", "缶"), + (0x2F79, "M", "网"), + (0x2F7A, "M", "羊"), + (0x2F7B, "M", "羽"), + (0x2F7C, "M", "老"), + (0x2F7D, "M", "而"), + (0x2F7E, "M", "耒"), + (0x2F7F, "M", "耳"), + (0x2F80, "M", "聿"), + (0x2F81, "M", "肉"), + (0x2F82, "M", "臣"), + (0x2F83, "M", "自"), + (0x2F84, "M", "至"), + 
(0x2F85, "M", "臼"), + (0x2F86, "M", "舌"), + (0x2F87, "M", "舛"), + (0x2F88, "M", "舟"), + (0x2F89, "M", "艮"), + (0x2F8A, "M", "色"), + (0x2F8B, "M", "艸"), + (0x2F8C, "M", "虍"), + (0x2F8D, "M", "虫"), + (0x2F8E, "M", "血"), + (0x2F8F, "M", "行"), + (0x2F90, "M", "衣"), + (0x2F91, "M", "襾"), + (0x2F92, "M", "見"), + (0x2F93, "M", "角"), + (0x2F94, "M", "言"), + (0x2F95, "M", "谷"), + (0x2F96, "M", "豆"), + (0x2F97, "M", "豕"), + (0x2F98, "M", "豸"), + (0x2F99, "M", "貝"), + (0x2F9A, "M", "赤"), + (0x2F9B, "M", "走"), + (0x2F9C, "M", "足"), + (0x2F9D, "M", "身"), + (0x2F9E, "M", "車"), + (0x2F9F, "M", "辛"), + (0x2FA0, "M", "辰"), + (0x2FA1, "M", "辵"), + (0x2FA2, "M", "邑"), + (0x2FA3, "M", "酉"), + (0x2FA4, "M", "釆"), + (0x2FA5, "M", "里"), + (0x2FA6, "M", "金"), + (0x2FA7, "M", "長"), + (0x2FA8, "M", "門"), + (0x2FA9, "M", "阜"), + (0x2FAA, "M", "隶"), + (0x2FAB, "M", "隹"), + (0x2FAC, "M", "雨"), + (0x2FAD, "M", "靑"), + (0x2FAE, "M", "非"), + (0x2FAF, "M", "面"), + (0x2FB0, "M", "革"), + (0x2FB1, "M", "韋"), + (0x2FB2, "M", "韭"), + (0x2FB3, "M", "音"), + (0x2FB4, "M", "頁"), + (0x2FB5, "M", "風"), + (0x2FB6, "M", "飛"), + (0x2FB7, "M", "食"), + (0x2FB8, "M", "首"), + (0x2FB9, "M", "香"), + (0x2FBA, "M", "馬"), + (0x2FBB, "M", "骨"), + (0x2FBC, "M", "高"), + (0x2FBD, "M", "髟"), + (0x2FBE, "M", "鬥"), + (0x2FBF, "M", "鬯"), + (0x2FC0, "M", "鬲"), + ] + + +def _seg_29() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ + (0x2FC1, "M", "鬼"), + (0x2FC2, "M", "魚"), + (0x2FC3, "M", "鳥"), + (0x2FC4, "M", "鹵"), + (0x2FC5, "M", "鹿"), + (0x2FC6, "M", "麥"), + (0x2FC7, "M", "麻"), + (0x2FC8, "M", "黃"), + (0x2FC9, "M", "黍"), + (0x2FCA, "M", "黑"), + (0x2FCB, "M", "黹"), + (0x2FCC, "M", "黽"), + (0x2FCD, "M", "鼎"), + (0x2FCE, "M", "鼓"), + (0x2FCF, "M", "鼠"), + (0x2FD0, "M", "鼻"), + (0x2FD1, "M", "齊"), + (0x2FD2, "M", "齒"), + (0x2FD3, "M", "龍"), + (0x2FD4, "M", "龜"), + (0x2FD5, "M", "龠"), + (0x2FD6, "X"), + (0x3000, "M", " "), + (0x3001, "V"), + (0x3002, "M", "."), + (0x3003, "V"), + (0x3036, "M", "〒"), + (0x3037, 
"V"), + (0x3038, "M", "十"), + (0x3039, "M", "卄"), + (0x303A, "M", "卅"), + (0x303B, "V"), + (0x3040, "X"), + (0x3041, "V"), + (0x3097, "X"), + (0x3099, "V"), + (0x309B, "M", " ゙"), + (0x309C, "M", " ゚"), + (0x309D, "V"), + (0x309F, "M", "より"), + (0x30A0, "V"), + (0x30FF, "M", "コト"), + (0x3100, "X"), + (0x3105, "V"), + (0x3130, "X"), + (0x3131, "M", "ᄀ"), + (0x3132, "M", "ᄁ"), + (0x3133, "M", "ᆪ"), + (0x3134, "M", "ᄂ"), + (0x3135, "M", "ᆬ"), + (0x3136, "M", "ᆭ"), + (0x3137, "M", "ᄃ"), + (0x3138, "M", "ᄄ"), + (0x3139, "M", "ᄅ"), + (0x313A, "M", "ᆰ"), + (0x313B, "M", "ᆱ"), + (0x313C, "M", "ᆲ"), + (0x313D, "M", "ᆳ"), + (0x313E, "M", "ᆴ"), + (0x313F, "M", "ᆵ"), + (0x3140, "M", "ᄚ"), + (0x3141, "M", "ᄆ"), + (0x3142, "M", "ᄇ"), + (0x3143, "M", "ᄈ"), + (0x3144, "M", "ᄡ"), + (0x3145, "M", "ᄉ"), + (0x3146, "M", "ᄊ"), + (0x3147, "M", "ᄋ"), + (0x3148, "M", "ᄌ"), + (0x3149, "M", "ᄍ"), + (0x314A, "M", "ᄎ"), + (0x314B, "M", "ᄏ"), + (0x314C, "M", "ᄐ"), + (0x314D, "M", "ᄑ"), + (0x314E, "M", "ᄒ"), + (0x314F, "M", "ᅡ"), + (0x3150, "M", "ᅢ"), + (0x3151, "M", "ᅣ"), + (0x3152, "M", "ᅤ"), + (0x3153, "M", "ᅥ"), + (0x3154, "M", "ᅦ"), + (0x3155, "M", "ᅧ"), + (0x3156, "M", "ᅨ"), + (0x3157, "M", "ᅩ"), + (0x3158, "M", "ᅪ"), + (0x3159, "M", "ᅫ"), + (0x315A, "M", "ᅬ"), + (0x315B, "M", "ᅭ"), + (0x315C, "M", "ᅮ"), + (0x315D, "M", "ᅯ"), + (0x315E, "M", "ᅰ"), + (0x315F, "M", "ᅱ"), + (0x3160, "M", "ᅲ"), + (0x3161, "M", "ᅳ"), + (0x3162, "M", "ᅴ"), + (0x3163, "M", "ᅵ"), + (0x3164, "I"), + (0x3165, "M", "ᄔ"), + (0x3166, "M", "ᄕ"), + (0x3167, "M", "ᇇ"), + ] + + +def _seg_30() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ + (0x3168, "M", "ᇈ"), + (0x3169, "M", "ᇌ"), + (0x316A, "M", "ᇎ"), + (0x316B, "M", "ᇓ"), + (0x316C, "M", "ᇗ"), + (0x316D, "M", "ᇙ"), + (0x316E, "M", "ᄜ"), + (0x316F, "M", "ᇝ"), + (0x3170, "M", "ᇟ"), + (0x3171, "M", "ᄝ"), + (0x3172, "M", "ᄞ"), + (0x3173, "M", "ᄠ"), + (0x3174, "M", "ᄢ"), + (0x3175, "M", "ᄣ"), + (0x3176, "M", "ᄧ"), + (0x3177, "M", "ᄩ"), + (0x3178, "M", 
"ᄫ"), + (0x3179, "M", "ᄬ"), + (0x317A, "M", "ᄭ"), + (0x317B, "M", "ᄮ"), + (0x317C, "M", "ᄯ"), + (0x317D, "M", "ᄲ"), + (0x317E, "M", "ᄶ"), + (0x317F, "M", "ᅀ"), + (0x3180, "M", "ᅇ"), + (0x3181, "M", "ᅌ"), + (0x3182, "M", "ᇱ"), + (0x3183, "M", "ᇲ"), + (0x3184, "M", "ᅗ"), + (0x3185, "M", "ᅘ"), + (0x3186, "M", "ᅙ"), + (0x3187, "M", "ᆄ"), + (0x3188, "M", "ᆅ"), + (0x3189, "M", "ᆈ"), + (0x318A, "M", "ᆑ"), + (0x318B, "M", "ᆒ"), + (0x318C, "M", "ᆔ"), + (0x318D, "M", "ᆞ"), + (0x318E, "M", "ᆡ"), + (0x318F, "X"), + (0x3190, "V"), + (0x3192, "M", "一"), + (0x3193, "M", "二"), + (0x3194, "M", "三"), + (0x3195, "M", "四"), + (0x3196, "M", "上"), + (0x3197, "M", "中"), + (0x3198, "M", "下"), + (0x3199, "M", "甲"), + (0x319A, "M", "乙"), + (0x319B, "M", "丙"), + (0x319C, "M", "丁"), + (0x319D, "M", "天"), + (0x319E, "M", "地"), + (0x319F, "M", "人"), + (0x31A0, "V"), + (0x31E6, "X"), + (0x31F0, "V"), + (0x3200, "M", "(ᄀ)"), + (0x3201, "M", "(ᄂ)"), + (0x3202, "M", "(ᄃ)"), + (0x3203, "M", "(ᄅ)"), + (0x3204, "M", "(ᄆ)"), + (0x3205, "M", "(ᄇ)"), + (0x3206, "M", "(ᄉ)"), + (0x3207, "M", "(ᄋ)"), + (0x3208, "M", "(ᄌ)"), + (0x3209, "M", "(ᄎ)"), + (0x320A, "M", "(ᄏ)"), + (0x320B, "M", "(ᄐ)"), + (0x320C, "M", "(ᄑ)"), + (0x320D, "M", "(ᄒ)"), + (0x320E, "M", "(가)"), + (0x320F, "M", "(나)"), + (0x3210, "M", "(다)"), + (0x3211, "M", "(라)"), + (0x3212, "M", "(마)"), + (0x3213, "M", "(바)"), + (0x3214, "M", "(사)"), + (0x3215, "M", "(아)"), + (0x3216, "M", "(자)"), + (0x3217, "M", "(차)"), + (0x3218, "M", "(카)"), + (0x3219, "M", "(타)"), + (0x321A, "M", "(파)"), + (0x321B, "M", "(하)"), + (0x321C, "M", "(주)"), + (0x321D, "M", "(오전)"), + (0x321E, "M", "(오후)"), + (0x321F, "X"), + (0x3220, "M", "(一)"), + (0x3221, "M", "(二)"), + (0x3222, "M", "(三)"), + (0x3223, "M", "(四)"), + (0x3224, "M", "(五)"), + (0x3225, "M", "(六)"), + (0x3226, "M", "(七)"), + (0x3227, "M", "(八)"), + (0x3228, "M", "(九)"), + (0x3229, "M", "(十)"), + ] + + +def _seg_31() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ + (0x322A, "M", "(月)"), 
+ (0x322B, "M", "(火)"), + (0x322C, "M", "(水)"), + (0x322D, "M", "(木)"), + (0x322E, "M", "(金)"), + (0x322F, "M", "(土)"), + (0x3230, "M", "(日)"), + (0x3231, "M", "(株)"), + (0x3232, "M", "(有)"), + (0x3233, "M", "(社)"), + (0x3234, "M", "(名)"), + (0x3235, "M", "(特)"), + (0x3236, "M", "(財)"), + (0x3237, "M", "(祝)"), + (0x3238, "M", "(労)"), + (0x3239, "M", "(代)"), + (0x323A, "M", "(呼)"), + (0x323B, "M", "(学)"), + (0x323C, "M", "(監)"), + (0x323D, "M", "(企)"), + (0x323E, "M", "(資)"), + (0x323F, "M", "(協)"), + (0x3240, "M", "(祭)"), + (0x3241, "M", "(休)"), + (0x3242, "M", "(自)"), + (0x3243, "M", "(至)"), + (0x3244, "M", "問"), + (0x3245, "M", "幼"), + (0x3246, "M", "文"), + (0x3247, "M", "箏"), + (0x3248, "V"), + (0x3250, "M", "pte"), + (0x3251, "M", "21"), + (0x3252, "M", "22"), + (0x3253, "M", "23"), + (0x3254, "M", "24"), + (0x3255, "M", "25"), + (0x3256, "M", "26"), + (0x3257, "M", "27"), + (0x3258, "M", "28"), + (0x3259, "M", "29"), + (0x325A, "M", "30"), + (0x325B, "M", "31"), + (0x325C, "M", "32"), + (0x325D, "M", "33"), + (0x325E, "M", "34"), + (0x325F, "M", "35"), + (0x3260, "M", "ᄀ"), + (0x3261, "M", "ᄂ"), + (0x3262, "M", "ᄃ"), + (0x3263, "M", "ᄅ"), + (0x3264, "M", "ᄆ"), + (0x3265, "M", "ᄇ"), + (0x3266, "M", "ᄉ"), + (0x3267, "M", "ᄋ"), + (0x3268, "M", "ᄌ"), + (0x3269, "M", "ᄎ"), + (0x326A, "M", "ᄏ"), + (0x326B, "M", "ᄐ"), + (0x326C, "M", "ᄑ"), + (0x326D, "M", "ᄒ"), + (0x326E, "M", "가"), + (0x326F, "M", "나"), + (0x3270, "M", "다"), + (0x3271, "M", "라"), + (0x3272, "M", "마"), + (0x3273, "M", "바"), + (0x3274, "M", "사"), + (0x3275, "M", "아"), + (0x3276, "M", "자"), + (0x3277, "M", "차"), + (0x3278, "M", "카"), + (0x3279, "M", "타"), + (0x327A, "M", "파"), + (0x327B, "M", "하"), + (0x327C, "M", "참고"), + (0x327D, "M", "주의"), + (0x327E, "M", "우"), + (0x327F, "V"), + (0x3280, "M", "一"), + (0x3281, "M", "二"), + (0x3282, "M", "三"), + (0x3283, "M", "四"), + (0x3284, "M", "五"), + (0x3285, "M", "六"), + (0x3286, "M", "七"), + (0x3287, "M", "八"), + (0x3288, "M", "九"), + (0x3289, "M", "十"), + 
(0x328A, "M", "月"), + (0x328B, "M", "火"), + (0x328C, "M", "水"), + (0x328D, "M", "木"), + (0x328E, "M", "金"), + (0x328F, "M", "土"), + (0x3290, "M", "日"), + (0x3291, "M", "株"), + (0x3292, "M", "有"), + (0x3293, "M", "社"), + (0x3294, "M", "名"), + ] + + +def _seg_32() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ + (0x3295, "M", "特"), + (0x3296, "M", "財"), + (0x3297, "M", "祝"), + (0x3298, "M", "労"), + (0x3299, "M", "秘"), + (0x329A, "M", "男"), + (0x329B, "M", "女"), + (0x329C, "M", "適"), + (0x329D, "M", "優"), + (0x329E, "M", "印"), + (0x329F, "M", "注"), + (0x32A0, "M", "項"), + (0x32A1, "M", "休"), + (0x32A2, "M", "写"), + (0x32A3, "M", "正"), + (0x32A4, "M", "上"), + (0x32A5, "M", "中"), + (0x32A6, "M", "下"), + (0x32A7, "M", "左"), + (0x32A8, "M", "右"), + (0x32A9, "M", "医"), + (0x32AA, "M", "宗"), + (0x32AB, "M", "学"), + (0x32AC, "M", "監"), + (0x32AD, "M", "企"), + (0x32AE, "M", "資"), + (0x32AF, "M", "協"), + (0x32B0, "M", "夜"), + (0x32B1, "M", "36"), + (0x32B2, "M", "37"), + (0x32B3, "M", "38"), + (0x32B4, "M", "39"), + (0x32B5, "M", "40"), + (0x32B6, "M", "41"), + (0x32B7, "M", "42"), + (0x32B8, "M", "43"), + (0x32B9, "M", "44"), + (0x32BA, "M", "45"), + (0x32BB, "M", "46"), + (0x32BC, "M", "47"), + (0x32BD, "M", "48"), + (0x32BE, "M", "49"), + (0x32BF, "M", "50"), + (0x32C0, "M", "1月"), + (0x32C1, "M", "2月"), + (0x32C2, "M", "3月"), + (0x32C3, "M", "4月"), + (0x32C4, "M", "5月"), + (0x32C5, "M", "6月"), + (0x32C6, "M", "7月"), + (0x32C7, "M", "8月"), + (0x32C8, "M", "9月"), + (0x32C9, "M", "10月"), + (0x32CA, "M", "11月"), + (0x32CB, "M", "12月"), + (0x32CC, "M", "hg"), + (0x32CD, "M", "erg"), + (0x32CE, "M", "ev"), + (0x32CF, "M", "ltd"), + (0x32D0, "M", "ア"), + (0x32D1, "M", "イ"), + (0x32D2, "M", "ウ"), + (0x32D3, "M", "エ"), + (0x32D4, "M", "オ"), + (0x32D5, "M", "カ"), + (0x32D6, "M", "キ"), + (0x32D7, "M", "ク"), + (0x32D8, "M", "ケ"), + (0x32D9, "M", "コ"), + (0x32DA, "M", "サ"), + (0x32DB, "M", "シ"), + (0x32DC, "M", "ス"), + (0x32DD, "M", "セ"), + (0x32DE, "M", "ソ"), + 
(0x32DF, "M", "タ"), + (0x32E0, "M", "チ"), + (0x32E1, "M", "ツ"), + (0x32E2, "M", "テ"), + (0x32E3, "M", "ト"), + (0x32E4, "M", "ナ"), + (0x32E5, "M", "ニ"), + (0x32E6, "M", "ヌ"), + (0x32E7, "M", "ネ"), + (0x32E8, "M", "ノ"), + (0x32E9, "M", "ハ"), + (0x32EA, "M", "ヒ"), + (0x32EB, "M", "フ"), + (0x32EC, "M", "ヘ"), + (0x32ED, "M", "ホ"), + (0x32EE, "M", "マ"), + (0x32EF, "M", "ミ"), + (0x32F0, "M", "ム"), + (0x32F1, "M", "メ"), + (0x32F2, "M", "モ"), + (0x32F3, "M", "ヤ"), + (0x32F4, "M", "ユ"), + (0x32F5, "M", "ヨ"), + (0x32F6, "M", "ラ"), + (0x32F7, "M", "リ"), + (0x32F8, "M", "ル"), + ] + + +def _seg_33() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ + (0x32F9, "M", "レ"), + (0x32FA, "M", "ロ"), + (0x32FB, "M", "ワ"), + (0x32FC, "M", "ヰ"), + (0x32FD, "M", "ヱ"), + (0x32FE, "M", "ヲ"), + (0x32FF, "M", "令和"), + (0x3300, "M", "アパート"), + (0x3301, "M", "アルファ"), + (0x3302, "M", "アンペア"), + (0x3303, "M", "アール"), + (0x3304, "M", "イニング"), + (0x3305, "M", "インチ"), + (0x3306, "M", "ウォン"), + (0x3307, "M", "エスクード"), + (0x3308, "M", "エーカー"), + (0x3309, "M", "オンス"), + (0x330A, "M", "オーム"), + (0x330B, "M", "カイリ"), + (0x330C, "M", "カラット"), + (0x330D, "M", "カロリー"), + (0x330E, "M", "ガロン"), + (0x330F, "M", "ガンマ"), + (0x3310, "M", "ギガ"), + (0x3311, "M", "ギニー"), + (0x3312, "M", "キュリー"), + (0x3313, "M", "ギルダー"), + (0x3314, "M", "キロ"), + (0x3315, "M", "キログラム"), + (0x3316, "M", "キロメートル"), + (0x3317, "M", "キロワット"), + (0x3318, "M", "グラム"), + (0x3319, "M", "グラムトン"), + (0x331A, "M", "クルゼイロ"), + (0x331B, "M", "クローネ"), + (0x331C, "M", "ケース"), + (0x331D, "M", "コルナ"), + (0x331E, "M", "コーポ"), + (0x331F, "M", "サイクル"), + (0x3320, "M", "サンチーム"), + (0x3321, "M", "シリング"), + (0x3322, "M", "センチ"), + (0x3323, "M", "セント"), + (0x3324, "M", "ダース"), + (0x3325, "M", "デシ"), + (0x3326, "M", "ドル"), + (0x3327, "M", "トン"), + (0x3328, "M", "ナノ"), + (0x3329, "M", "ノット"), + (0x332A, "M", "ハイツ"), + (0x332B, "M", "パーセント"), + (0x332C, "M", "パーツ"), + (0x332D, "M", "バーレル"), + (0x332E, "M", "ピアストル"), + (0x332F, "M", "ピクル"), + 
(0x3330, "M", "ピコ"), + (0x3331, "M", "ビル"), + (0x3332, "M", "ファラッド"), + (0x3333, "M", "フィート"), + (0x3334, "M", "ブッシェル"), + (0x3335, "M", "フラン"), + (0x3336, "M", "ヘクタール"), + (0x3337, "M", "ペソ"), + (0x3338, "M", "ペニヒ"), + (0x3339, "M", "ヘルツ"), + (0x333A, "M", "ペンス"), + (0x333B, "M", "ページ"), + (0x333C, "M", "ベータ"), + (0x333D, "M", "ポイント"), + (0x333E, "M", "ボルト"), + (0x333F, "M", "ホン"), + (0x3340, "M", "ポンド"), + (0x3341, "M", "ホール"), + (0x3342, "M", "ホーン"), + (0x3343, "M", "マイクロ"), + (0x3344, "M", "マイル"), + (0x3345, "M", "マッハ"), + (0x3346, "M", "マルク"), + (0x3347, "M", "マンション"), + (0x3348, "M", "ミクロン"), + (0x3349, "M", "ミリ"), + (0x334A, "M", "ミリバール"), + (0x334B, "M", "メガ"), + (0x334C, "M", "メガトン"), + (0x334D, "M", "メートル"), + (0x334E, "M", "ヤード"), + (0x334F, "M", "ヤール"), + (0x3350, "M", "ユアン"), + (0x3351, "M", "リットル"), + (0x3352, "M", "リラ"), + (0x3353, "M", "ルピー"), + (0x3354, "M", "ルーブル"), + (0x3355, "M", "レム"), + (0x3356, "M", "レントゲン"), + (0x3357, "M", "ワット"), + (0x3358, "M", "0点"), + (0x3359, "M", "1点"), + (0x335A, "M", "2点"), + (0x335B, "M", "3点"), + (0x335C, "M", "4点"), + ] + + +def _seg_34() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ + (0x335D, "M", "5点"), + (0x335E, "M", "6点"), + (0x335F, "M", "7点"), + (0x3360, "M", "8点"), + (0x3361, "M", "9点"), + (0x3362, "M", "10点"), + (0x3363, "M", "11点"), + (0x3364, "M", "12点"), + (0x3365, "M", "13点"), + (0x3366, "M", "14点"), + (0x3367, "M", "15点"), + (0x3368, "M", "16点"), + (0x3369, "M", "17点"), + (0x336A, "M", "18点"), + (0x336B, "M", "19点"), + (0x336C, "M", "20点"), + (0x336D, "M", "21点"), + (0x336E, "M", "22点"), + (0x336F, "M", "23点"), + (0x3370, "M", "24点"), + (0x3371, "M", "hpa"), + (0x3372, "M", "da"), + (0x3373, "M", "au"), + (0x3374, "M", "bar"), + (0x3375, "M", "ov"), + (0x3376, "M", "pc"), + (0x3377, "M", "dm"), + (0x3378, "M", "dm2"), + (0x3379, "M", "dm3"), + (0x337A, "M", "iu"), + (0x337B, "M", "平成"), + (0x337C, "M", "昭和"), + (0x337D, "M", "大正"), + (0x337E, "M", "明治"), + (0x337F, "M", "株式会社"), 
+ (0x3380, "M", "pa"), + (0x3381, "M", "na"), + (0x3382, "M", "μa"), + (0x3383, "M", "ma"), + (0x3384, "M", "ka"), + (0x3385, "M", "kb"), + (0x3386, "M", "mb"), + (0x3387, "M", "gb"), + (0x3388, "M", "cal"), + (0x3389, "M", "kcal"), + (0x338A, "M", "pf"), + (0x338B, "M", "nf"), + (0x338C, "M", "μf"), + (0x338D, "M", "μg"), + (0x338E, "M", "mg"), + (0x338F, "M", "kg"), + (0x3390, "M", "hz"), + (0x3391, "M", "khz"), + (0x3392, "M", "mhz"), + (0x3393, "M", "ghz"), + (0x3394, "M", "thz"), + (0x3395, "M", "μl"), + (0x3396, "M", "ml"), + (0x3397, "M", "dl"), + (0x3398, "M", "kl"), + (0x3399, "M", "fm"), + (0x339A, "M", "nm"), + (0x339B, "M", "μm"), + (0x339C, "M", "mm"), + (0x339D, "M", "cm"), + (0x339E, "M", "km"), + (0x339F, "M", "mm2"), + (0x33A0, "M", "cm2"), + (0x33A1, "M", "m2"), + (0x33A2, "M", "km2"), + (0x33A3, "M", "mm3"), + (0x33A4, "M", "cm3"), + (0x33A5, "M", "m3"), + (0x33A6, "M", "km3"), + (0x33A7, "M", "m∕s"), + (0x33A8, "M", "m∕s2"), + (0x33A9, "M", "pa"), + (0x33AA, "M", "kpa"), + (0x33AB, "M", "mpa"), + (0x33AC, "M", "gpa"), + (0x33AD, "M", "rad"), + (0x33AE, "M", "rad∕s"), + (0x33AF, "M", "rad∕s2"), + (0x33B0, "M", "ps"), + (0x33B1, "M", "ns"), + (0x33B2, "M", "μs"), + (0x33B3, "M", "ms"), + (0x33B4, "M", "pv"), + (0x33B5, "M", "nv"), + (0x33B6, "M", "μv"), + (0x33B7, "M", "mv"), + (0x33B8, "M", "kv"), + (0x33B9, "M", "mv"), + (0x33BA, "M", "pw"), + (0x33BB, "M", "nw"), + (0x33BC, "M", "μw"), + (0x33BD, "M", "mw"), + (0x33BE, "M", "kw"), + (0x33BF, "M", "mw"), + (0x33C0, "M", "kω"), + ] + + +def _seg_35() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ + (0x33C1, "M", "mω"), + (0x33C2, "X"), + (0x33C3, "M", "bq"), + (0x33C4, "M", "cc"), + (0x33C5, "M", "cd"), + (0x33C6, "M", "c∕kg"), + (0x33C7, "X"), + (0x33C8, "M", "db"), + (0x33C9, "M", "gy"), + (0x33CA, "M", "ha"), + (0x33CB, "M", "hp"), + (0x33CC, "M", "in"), + (0x33CD, "M", "kk"), + (0x33CE, "M", "km"), + (0x33CF, "M", "kt"), + (0x33D0, "M", "lm"), + (0x33D1, "M", "ln"), + 
(0x33D2, "M", "log"), + (0x33D3, "M", "lx"), + (0x33D4, "M", "mb"), + (0x33D5, "M", "mil"), + (0x33D6, "M", "mol"), + (0x33D7, "M", "ph"), + (0x33D8, "X"), + (0x33D9, "M", "ppm"), + (0x33DA, "M", "pr"), + (0x33DB, "M", "sr"), + (0x33DC, "M", "sv"), + (0x33DD, "M", "wb"), + (0x33DE, "M", "v∕m"), + (0x33DF, "M", "a∕m"), + (0x33E0, "M", "1日"), + (0x33E1, "M", "2日"), + (0x33E2, "M", "3日"), + (0x33E3, "M", "4日"), + (0x33E4, "M", "5日"), + (0x33E5, "M", "6日"), + (0x33E6, "M", "7日"), + (0x33E7, "M", "8日"), + (0x33E8, "M", "9日"), + (0x33E9, "M", "10日"), + (0x33EA, "M", "11日"), + (0x33EB, "M", "12日"), + (0x33EC, "M", "13日"), + (0x33ED, "M", "14日"), + (0x33EE, "M", "15日"), + (0x33EF, "M", "16日"), + (0x33F0, "M", "17日"), + (0x33F1, "M", "18日"), + (0x33F2, "M", "19日"), + (0x33F3, "M", "20日"), + (0x33F4, "M", "21日"), + (0x33F5, "M", "22日"), + (0x33F6, "M", "23日"), + (0x33F7, "M", "24日"), + (0x33F8, "M", "25日"), + (0x33F9, "M", "26日"), + (0x33FA, "M", "27日"), + (0x33FB, "M", "28日"), + (0x33FC, "M", "29日"), + (0x33FD, "M", "30日"), + (0x33FE, "M", "31日"), + (0x33FF, "M", "gal"), + (0x3400, "V"), + (0xA48D, "X"), + (0xA490, "V"), + (0xA4C7, "X"), + (0xA4D0, "V"), + (0xA62C, "X"), + (0xA640, "M", "ꙁ"), + (0xA641, "V"), + (0xA642, "M", "ꙃ"), + (0xA643, "V"), + (0xA644, "M", "ꙅ"), + (0xA645, "V"), + (0xA646, "M", "ꙇ"), + (0xA647, "V"), + (0xA648, "M", "ꙉ"), + (0xA649, "V"), + (0xA64A, "M", "ꙋ"), + (0xA64B, "V"), + (0xA64C, "M", "ꙍ"), + (0xA64D, "V"), + (0xA64E, "M", "ꙏ"), + (0xA64F, "V"), + (0xA650, "M", "ꙑ"), + (0xA651, "V"), + (0xA652, "M", "ꙓ"), + (0xA653, "V"), + (0xA654, "M", "ꙕ"), + (0xA655, "V"), + (0xA656, "M", "ꙗ"), + (0xA657, "V"), + (0xA658, "M", "ꙙ"), + (0xA659, "V"), + (0xA65A, "M", "ꙛ"), + (0xA65B, "V"), + (0xA65C, "M", "ꙝ"), + (0xA65D, "V"), + (0xA65E, "M", "ꙟ"), + ] + + +def _seg_36() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ + (0xA65F, "V"), + (0xA660, "M", "ꙡ"), + (0xA661, "V"), + (0xA662, "M", "ꙣ"), + (0xA663, "V"), + (0xA664, "M", "ꙥ"), + 
(0xA665, "V"), + (0xA666, "M", "ꙧ"), + (0xA667, "V"), + (0xA668, "M", "ꙩ"), + (0xA669, "V"), + (0xA66A, "M", "ꙫ"), + (0xA66B, "V"), + (0xA66C, "M", "ꙭ"), + (0xA66D, "V"), + (0xA680, "M", "ꚁ"), + (0xA681, "V"), + (0xA682, "M", "ꚃ"), + (0xA683, "V"), + (0xA684, "M", "ꚅ"), + (0xA685, "V"), + (0xA686, "M", "ꚇ"), + (0xA687, "V"), + (0xA688, "M", "ꚉ"), + (0xA689, "V"), + (0xA68A, "M", "ꚋ"), + (0xA68B, "V"), + (0xA68C, "M", "ꚍ"), + (0xA68D, "V"), + (0xA68E, "M", "ꚏ"), + (0xA68F, "V"), + (0xA690, "M", "ꚑ"), + (0xA691, "V"), + (0xA692, "M", "ꚓ"), + (0xA693, "V"), + (0xA694, "M", "ꚕ"), + (0xA695, "V"), + (0xA696, "M", "ꚗ"), + (0xA697, "V"), + (0xA698, "M", "ꚙ"), + (0xA699, "V"), + (0xA69A, "M", "ꚛ"), + (0xA69B, "V"), + (0xA69C, "M", "ъ"), + (0xA69D, "M", "ь"), + (0xA69E, "V"), + (0xA6F8, "X"), + (0xA700, "V"), + (0xA722, "M", "ꜣ"), + (0xA723, "V"), + (0xA724, "M", "ꜥ"), + (0xA725, "V"), + (0xA726, "M", "ꜧ"), + (0xA727, "V"), + (0xA728, "M", "ꜩ"), + (0xA729, "V"), + (0xA72A, "M", "ꜫ"), + (0xA72B, "V"), + (0xA72C, "M", "ꜭ"), + (0xA72D, "V"), + (0xA72E, "M", "ꜯ"), + (0xA72F, "V"), + (0xA732, "M", "ꜳ"), + (0xA733, "V"), + (0xA734, "M", "ꜵ"), + (0xA735, "V"), + (0xA736, "M", "ꜷ"), + (0xA737, "V"), + (0xA738, "M", "ꜹ"), + (0xA739, "V"), + (0xA73A, "M", "ꜻ"), + (0xA73B, "V"), + (0xA73C, "M", "ꜽ"), + (0xA73D, "V"), + (0xA73E, "M", "ꜿ"), + (0xA73F, "V"), + (0xA740, "M", "ꝁ"), + (0xA741, "V"), + (0xA742, "M", "ꝃ"), + (0xA743, "V"), + (0xA744, "M", "ꝅ"), + (0xA745, "V"), + (0xA746, "M", "ꝇ"), + (0xA747, "V"), + (0xA748, "M", "ꝉ"), + (0xA749, "V"), + (0xA74A, "M", "ꝋ"), + (0xA74B, "V"), + (0xA74C, "M", "ꝍ"), + (0xA74D, "V"), + (0xA74E, "M", "ꝏ"), + (0xA74F, "V"), + (0xA750, "M", "ꝑ"), + (0xA751, "V"), + (0xA752, "M", "ꝓ"), + (0xA753, "V"), + (0xA754, "M", "ꝕ"), + (0xA755, "V"), + (0xA756, "M", "ꝗ"), + (0xA757, "V"), + ] + + +def _seg_37() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ + (0xA758, "M", "ꝙ"), + (0xA759, "V"), + (0xA75A, "M", "ꝛ"), + (0xA75B, "V"), + 
(0xA75C, "M", "ꝝ"), + (0xA75D, "V"), + (0xA75E, "M", "ꝟ"), + (0xA75F, "V"), + (0xA760, "M", "ꝡ"), + (0xA761, "V"), + (0xA762, "M", "ꝣ"), + (0xA763, "V"), + (0xA764, "M", "ꝥ"), + (0xA765, "V"), + (0xA766, "M", "ꝧ"), + (0xA767, "V"), + (0xA768, "M", "ꝩ"), + (0xA769, "V"), + (0xA76A, "M", "ꝫ"), + (0xA76B, "V"), + (0xA76C, "M", "ꝭ"), + (0xA76D, "V"), + (0xA76E, "M", "ꝯ"), + (0xA76F, "V"), + (0xA770, "M", "ꝯ"), + (0xA771, "V"), + (0xA779, "M", "ꝺ"), + (0xA77A, "V"), + (0xA77B, "M", "ꝼ"), + (0xA77C, "V"), + (0xA77D, "M", "ᵹ"), + (0xA77E, "M", "ꝿ"), + (0xA77F, "V"), + (0xA780, "M", "ꞁ"), + (0xA781, "V"), + (0xA782, "M", "ꞃ"), + (0xA783, "V"), + (0xA784, "M", "ꞅ"), + (0xA785, "V"), + (0xA786, "M", "ꞇ"), + (0xA787, "V"), + (0xA78B, "M", "ꞌ"), + (0xA78C, "V"), + (0xA78D, "M", "ɥ"), + (0xA78E, "V"), + (0xA790, "M", "ꞑ"), + (0xA791, "V"), + (0xA792, "M", "ꞓ"), + (0xA793, "V"), + (0xA796, "M", "ꞗ"), + (0xA797, "V"), + (0xA798, "M", "ꞙ"), + (0xA799, "V"), + (0xA79A, "M", "ꞛ"), + (0xA79B, "V"), + (0xA79C, "M", "ꞝ"), + (0xA79D, "V"), + (0xA79E, "M", "ꞟ"), + (0xA79F, "V"), + (0xA7A0, "M", "ꞡ"), + (0xA7A1, "V"), + (0xA7A2, "M", "ꞣ"), + (0xA7A3, "V"), + (0xA7A4, "M", "ꞥ"), + (0xA7A5, "V"), + (0xA7A6, "M", "ꞧ"), + (0xA7A7, "V"), + (0xA7A8, "M", "ꞩ"), + (0xA7A9, "V"), + (0xA7AA, "M", "ɦ"), + (0xA7AB, "M", "ɜ"), + (0xA7AC, "M", "ɡ"), + (0xA7AD, "M", "ɬ"), + (0xA7AE, "M", "ɪ"), + (0xA7AF, "V"), + (0xA7B0, "M", "ʞ"), + (0xA7B1, "M", "ʇ"), + (0xA7B2, "M", "ʝ"), + (0xA7B3, "M", "ꭓ"), + (0xA7B4, "M", "ꞵ"), + (0xA7B5, "V"), + (0xA7B6, "M", "ꞷ"), + (0xA7B7, "V"), + (0xA7B8, "M", "ꞹ"), + (0xA7B9, "V"), + (0xA7BA, "M", "ꞻ"), + (0xA7BB, "V"), + (0xA7BC, "M", "ꞽ"), + (0xA7BD, "V"), + (0xA7BE, "M", "ꞿ"), + (0xA7BF, "V"), + (0xA7C0, "M", "ꟁ"), + (0xA7C1, "V"), + (0xA7C2, "M", "ꟃ"), + (0xA7C3, "V"), + (0xA7C4, "M", "ꞔ"), + (0xA7C5, "M", "ʂ"), + (0xA7C6, "M", "ᶎ"), + (0xA7C7, "M", "ꟈ"), + (0xA7C8, "V"), + ] + + +def _seg_38() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ + (0xA7C9, 
"M", "ꟊ"), + (0xA7CA, "V"), + (0xA7CB, "M", "ɤ"), + (0xA7CC, "M", "ꟍ"), + (0xA7CD, "V"), + (0xA7CE, "X"), + (0xA7D0, "M", "ꟑ"), + (0xA7D1, "V"), + (0xA7D2, "X"), + (0xA7D3, "V"), + (0xA7D4, "X"), + (0xA7D5, "V"), + (0xA7D6, "M", "ꟗ"), + (0xA7D7, "V"), + (0xA7D8, "M", "ꟙ"), + (0xA7D9, "V"), + (0xA7DA, "M", "ꟛ"), + (0xA7DB, "V"), + (0xA7DC, "M", "ƛ"), + (0xA7DD, "X"), + (0xA7F2, "M", "c"), + (0xA7F3, "M", "f"), + (0xA7F4, "M", "q"), + (0xA7F5, "M", "ꟶ"), + (0xA7F6, "V"), + (0xA7F8, "M", "ħ"), + (0xA7F9, "M", "œ"), + (0xA7FA, "V"), + (0xA82D, "X"), + (0xA830, "V"), + (0xA83A, "X"), + (0xA840, "V"), + (0xA878, "X"), + (0xA880, "V"), + (0xA8C6, "X"), + (0xA8CE, "V"), + (0xA8DA, "X"), + (0xA8E0, "V"), + (0xA954, "X"), + (0xA95F, "V"), + (0xA97D, "X"), + (0xA980, "V"), + (0xA9CE, "X"), + (0xA9CF, "V"), + (0xA9DA, "X"), + (0xA9DE, "V"), + (0xA9FF, "X"), + (0xAA00, "V"), + (0xAA37, "X"), + (0xAA40, "V"), + (0xAA4E, "X"), + (0xAA50, "V"), + (0xAA5A, "X"), + (0xAA5C, "V"), + (0xAAC3, "X"), + (0xAADB, "V"), + (0xAAF7, "X"), + (0xAB01, "V"), + (0xAB07, "X"), + (0xAB09, "V"), + (0xAB0F, "X"), + (0xAB11, "V"), + (0xAB17, "X"), + (0xAB20, "V"), + (0xAB27, "X"), + (0xAB28, "V"), + (0xAB2F, "X"), + (0xAB30, "V"), + (0xAB5C, "M", "ꜧ"), + (0xAB5D, "M", "ꬷ"), + (0xAB5E, "M", "ɫ"), + (0xAB5F, "M", "ꭒ"), + (0xAB60, "V"), + (0xAB69, "M", "ʍ"), + (0xAB6A, "V"), + (0xAB6C, "X"), + (0xAB70, "M", "Ꭰ"), + (0xAB71, "M", "Ꭱ"), + (0xAB72, "M", "Ꭲ"), + (0xAB73, "M", "Ꭳ"), + (0xAB74, "M", "Ꭴ"), + (0xAB75, "M", "Ꭵ"), + (0xAB76, "M", "Ꭶ"), + (0xAB77, "M", "Ꭷ"), + (0xAB78, "M", "Ꭸ"), + (0xAB79, "M", "Ꭹ"), + (0xAB7A, "M", "Ꭺ"), + (0xAB7B, "M", "Ꭻ"), + (0xAB7C, "M", "Ꭼ"), + (0xAB7D, "M", "Ꭽ"), + (0xAB7E, "M", "Ꭾ"), + (0xAB7F, "M", "Ꭿ"), + (0xAB80, "M", "Ꮀ"), + (0xAB81, "M", "Ꮁ"), + (0xAB82, "M", "Ꮂ"), + (0xAB83, "M", "Ꮃ"), + (0xAB84, "M", "Ꮄ"), + (0xAB85, "M", "Ꮅ"), + (0xAB86, "M", "Ꮆ"), + (0xAB87, "M", "Ꮇ"), + ] + + +def _seg_39() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ + 
(0xAB88, "M", "Ꮈ"), + (0xAB89, "M", "Ꮉ"), + (0xAB8A, "M", "Ꮊ"), + (0xAB8B, "M", "Ꮋ"), + (0xAB8C, "M", "Ꮌ"), + (0xAB8D, "M", "Ꮍ"), + (0xAB8E, "M", "Ꮎ"), + (0xAB8F, "M", "Ꮏ"), + (0xAB90, "M", "Ꮐ"), + (0xAB91, "M", "Ꮑ"), + (0xAB92, "M", "Ꮒ"), + (0xAB93, "M", "Ꮓ"), + (0xAB94, "M", "Ꮔ"), + (0xAB95, "M", "Ꮕ"), + (0xAB96, "M", "Ꮖ"), + (0xAB97, "M", "Ꮗ"), + (0xAB98, "M", "Ꮘ"), + (0xAB99, "M", "Ꮙ"), + (0xAB9A, "M", "Ꮚ"), + (0xAB9B, "M", "Ꮛ"), + (0xAB9C, "M", "Ꮜ"), + (0xAB9D, "M", "Ꮝ"), + (0xAB9E, "M", "Ꮞ"), + (0xAB9F, "M", "Ꮟ"), + (0xABA0, "M", "Ꮠ"), + (0xABA1, "M", "Ꮡ"), + (0xABA2, "M", "Ꮢ"), + (0xABA3, "M", "Ꮣ"), + (0xABA4, "M", "Ꮤ"), + (0xABA5, "M", "Ꮥ"), + (0xABA6, "M", "Ꮦ"), + (0xABA7, "M", "Ꮧ"), + (0xABA8, "M", "Ꮨ"), + (0xABA9, "M", "Ꮩ"), + (0xABAA, "M", "Ꮪ"), + (0xABAB, "M", "Ꮫ"), + (0xABAC, "M", "Ꮬ"), + (0xABAD, "M", "Ꮭ"), + (0xABAE, "M", "Ꮮ"), + (0xABAF, "M", "Ꮯ"), + (0xABB0, "M", "Ꮰ"), + (0xABB1, "M", "Ꮱ"), + (0xABB2, "M", "Ꮲ"), + (0xABB3, "M", "Ꮳ"), + (0xABB4, "M", "Ꮴ"), + (0xABB5, "M", "Ꮵ"), + (0xABB6, "M", "Ꮶ"), + (0xABB7, "M", "Ꮷ"), + (0xABB8, "M", "Ꮸ"), + (0xABB9, "M", "Ꮹ"), + (0xABBA, "M", "Ꮺ"), + (0xABBB, "M", "Ꮻ"), + (0xABBC, "M", "Ꮼ"), + (0xABBD, "M", "Ꮽ"), + (0xABBE, "M", "Ꮾ"), + (0xABBF, "M", "Ꮿ"), + (0xABC0, "V"), + (0xABEE, "X"), + (0xABF0, "V"), + (0xABFA, "X"), + (0xAC00, "V"), + (0xD7A4, "X"), + (0xD7B0, "V"), + (0xD7C7, "X"), + (0xD7CB, "V"), + (0xD7FC, "X"), + (0xF900, "M", "豈"), + (0xF901, "M", "更"), + (0xF902, "M", "車"), + (0xF903, "M", "賈"), + (0xF904, "M", "滑"), + (0xF905, "M", "串"), + (0xF906, "M", "句"), + (0xF907, "M", "龜"), + (0xF909, "M", "契"), + (0xF90A, "M", "金"), + (0xF90B, "M", "喇"), + (0xF90C, "M", "奈"), + (0xF90D, "M", "懶"), + (0xF90E, "M", "癩"), + (0xF90F, "M", "羅"), + (0xF910, "M", "蘿"), + (0xF911, "M", "螺"), + (0xF912, "M", "裸"), + (0xF913, "M", "邏"), + (0xF914, "M", "樂"), + (0xF915, "M", "洛"), + (0xF916, "M", "烙"), + (0xF917, "M", "珞"), + (0xF918, "M", "落"), + (0xF919, "M", "酪"), + (0xF91A, "M", "駱"), + (0xF91B, "M", "亂"), + 
(0xF91C, "M", "卵"), + (0xF91D, "M", "欄"), + (0xF91E, "M", "爛"), + (0xF91F, "M", "蘭"), + (0xF920, "M", "鸞"), + (0xF921, "M", "嵐"), + (0xF922, "M", "濫"), + ] + + +def _seg_40() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ + (0xF923, "M", "藍"), + (0xF924, "M", "襤"), + (0xF925, "M", "拉"), + (0xF926, "M", "臘"), + (0xF927, "M", "蠟"), + (0xF928, "M", "廊"), + (0xF929, "M", "朗"), + (0xF92A, "M", "浪"), + (0xF92B, "M", "狼"), + (0xF92C, "M", "郎"), + (0xF92D, "M", "來"), + (0xF92E, "M", "冷"), + (0xF92F, "M", "勞"), + (0xF930, "M", "擄"), + (0xF931, "M", "櫓"), + (0xF932, "M", "爐"), + (0xF933, "M", "盧"), + (0xF934, "M", "老"), + (0xF935, "M", "蘆"), + (0xF936, "M", "虜"), + (0xF937, "M", "路"), + (0xF938, "M", "露"), + (0xF939, "M", "魯"), + (0xF93A, "M", "鷺"), + (0xF93B, "M", "碌"), + (0xF93C, "M", "祿"), + (0xF93D, "M", "綠"), + (0xF93E, "M", "菉"), + (0xF93F, "M", "錄"), + (0xF940, "M", "鹿"), + (0xF941, "M", "論"), + (0xF942, "M", "壟"), + (0xF943, "M", "弄"), + (0xF944, "M", "籠"), + (0xF945, "M", "聾"), + (0xF946, "M", "牢"), + (0xF947, "M", "磊"), + (0xF948, "M", "賂"), + (0xF949, "M", "雷"), + (0xF94A, "M", "壘"), + (0xF94B, "M", "屢"), + (0xF94C, "M", "樓"), + (0xF94D, "M", "淚"), + (0xF94E, "M", "漏"), + (0xF94F, "M", "累"), + (0xF950, "M", "縷"), + (0xF951, "M", "陋"), + (0xF952, "M", "勒"), + (0xF953, "M", "肋"), + (0xF954, "M", "凜"), + (0xF955, "M", "凌"), + (0xF956, "M", "稜"), + (0xF957, "M", "綾"), + (0xF958, "M", "菱"), + (0xF959, "M", "陵"), + (0xF95A, "M", "讀"), + (0xF95B, "M", "拏"), + (0xF95C, "M", "樂"), + (0xF95D, "M", "諾"), + (0xF95E, "M", "丹"), + (0xF95F, "M", "寧"), + (0xF960, "M", "怒"), + (0xF961, "M", "率"), + (0xF962, "M", "異"), + (0xF963, "M", "北"), + (0xF964, "M", "磻"), + (0xF965, "M", "便"), + (0xF966, "M", "復"), + (0xF967, "M", "不"), + (0xF968, "M", "泌"), + (0xF969, "M", "數"), + (0xF96A, "M", "索"), + (0xF96B, "M", "參"), + (0xF96C, "M", "塞"), + (0xF96D, "M", "省"), + (0xF96E, "M", "葉"), + (0xF96F, "M", "說"), + (0xF970, "M", "殺"), + (0xF971, "M", "辰"), + (0xF972, "M", 
"沈"), + (0xF973, "M", "拾"), + (0xF974, "M", "若"), + (0xF975, "M", "掠"), + (0xF976, "M", "略"), + (0xF977, "M", "亮"), + (0xF978, "M", "兩"), + (0xF979, "M", "凉"), + (0xF97A, "M", "梁"), + (0xF97B, "M", "糧"), + (0xF97C, "M", "良"), + (0xF97D, "M", "諒"), + (0xF97E, "M", "量"), + (0xF97F, "M", "勵"), + (0xF980, "M", "呂"), + (0xF981, "M", "女"), + (0xF982, "M", "廬"), + (0xF983, "M", "旅"), + (0xF984, "M", "濾"), + (0xF985, "M", "礪"), + (0xF986, "M", "閭"), + ] + + +def _seg_41() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ + (0xF987, "M", "驪"), + (0xF988, "M", "麗"), + (0xF989, "M", "黎"), + (0xF98A, "M", "力"), + (0xF98B, "M", "曆"), + (0xF98C, "M", "歷"), + (0xF98D, "M", "轢"), + (0xF98E, "M", "年"), + (0xF98F, "M", "憐"), + (0xF990, "M", "戀"), + (0xF991, "M", "撚"), + (0xF992, "M", "漣"), + (0xF993, "M", "煉"), + (0xF994, "M", "璉"), + (0xF995, "M", "秊"), + (0xF996, "M", "練"), + (0xF997, "M", "聯"), + (0xF998, "M", "輦"), + (0xF999, "M", "蓮"), + (0xF99A, "M", "連"), + (0xF99B, "M", "鍊"), + (0xF99C, "M", "列"), + (0xF99D, "M", "劣"), + (0xF99E, "M", "咽"), + (0xF99F, "M", "烈"), + (0xF9A0, "M", "裂"), + (0xF9A1, "M", "說"), + (0xF9A2, "M", "廉"), + (0xF9A3, "M", "念"), + (0xF9A4, "M", "捻"), + (0xF9A5, "M", "殮"), + (0xF9A6, "M", "簾"), + (0xF9A7, "M", "獵"), + (0xF9A8, "M", "令"), + (0xF9A9, "M", "囹"), + (0xF9AA, "M", "寧"), + (0xF9AB, "M", "嶺"), + (0xF9AC, "M", "怜"), + (0xF9AD, "M", "玲"), + (0xF9AE, "M", "瑩"), + (0xF9AF, "M", "羚"), + (0xF9B0, "M", "聆"), + (0xF9B1, "M", "鈴"), + (0xF9B2, "M", "零"), + (0xF9B3, "M", "靈"), + (0xF9B4, "M", "領"), + (0xF9B5, "M", "例"), + (0xF9B6, "M", "禮"), + (0xF9B7, "M", "醴"), + (0xF9B8, "M", "隸"), + (0xF9B9, "M", "惡"), + (0xF9BA, "M", "了"), + (0xF9BB, "M", "僚"), + (0xF9BC, "M", "寮"), + (0xF9BD, "M", "尿"), + (0xF9BE, "M", "料"), + (0xF9BF, "M", "樂"), + (0xF9C0, "M", "燎"), + (0xF9C1, "M", "療"), + (0xF9C2, "M", "蓼"), + (0xF9C3, "M", "遼"), + (0xF9C4, "M", "龍"), + (0xF9C5, "M", "暈"), + (0xF9C6, "M", "阮"), + (0xF9C7, "M", "劉"), + (0xF9C8, "M", "杻"), + (0xF9C9, 
"M", "柳"), + (0xF9CA, "M", "流"), + (0xF9CB, "M", "溜"), + (0xF9CC, "M", "琉"), + (0xF9CD, "M", "留"), + (0xF9CE, "M", "硫"), + (0xF9CF, "M", "紐"), + (0xF9D0, "M", "類"), + (0xF9D1, "M", "六"), + (0xF9D2, "M", "戮"), + (0xF9D3, "M", "陸"), + (0xF9D4, "M", "倫"), + (0xF9D5, "M", "崙"), + (0xF9D6, "M", "淪"), + (0xF9D7, "M", "輪"), + (0xF9D8, "M", "律"), + (0xF9D9, "M", "慄"), + (0xF9DA, "M", "栗"), + (0xF9DB, "M", "率"), + (0xF9DC, "M", "隆"), + (0xF9DD, "M", "利"), + (0xF9DE, "M", "吏"), + (0xF9DF, "M", "履"), + (0xF9E0, "M", "易"), + (0xF9E1, "M", "李"), + (0xF9E2, "M", "梨"), + (0xF9E3, "M", "泥"), + (0xF9E4, "M", "理"), + (0xF9E5, "M", "痢"), + (0xF9E6, "M", "罹"), + (0xF9E7, "M", "裏"), + (0xF9E8, "M", "裡"), + (0xF9E9, "M", "里"), + (0xF9EA, "M", "離"), + ] + + +def _seg_42() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ + (0xF9EB, "M", "匿"), + (0xF9EC, "M", "溺"), + (0xF9ED, "M", "吝"), + (0xF9EE, "M", "燐"), + (0xF9EF, "M", "璘"), + (0xF9F0, "M", "藺"), + (0xF9F1, "M", "隣"), + (0xF9F2, "M", "鱗"), + (0xF9F3, "M", "麟"), + (0xF9F4, "M", "林"), + (0xF9F5, "M", "淋"), + (0xF9F6, "M", "臨"), + (0xF9F7, "M", "立"), + (0xF9F8, "M", "笠"), + (0xF9F9, "M", "粒"), + (0xF9FA, "M", "狀"), + (0xF9FB, "M", "炙"), + (0xF9FC, "M", "識"), + (0xF9FD, "M", "什"), + (0xF9FE, "M", "茶"), + (0xF9FF, "M", "刺"), + (0xFA00, "M", "切"), + (0xFA01, "M", "度"), + (0xFA02, "M", "拓"), + (0xFA03, "M", "糖"), + (0xFA04, "M", "宅"), + (0xFA05, "M", "洞"), + (0xFA06, "M", "暴"), + (0xFA07, "M", "輻"), + (0xFA08, "M", "行"), + (0xFA09, "M", "降"), + (0xFA0A, "M", "見"), + (0xFA0B, "M", "廓"), + (0xFA0C, "M", "兀"), + (0xFA0D, "M", "嗀"), + (0xFA0E, "V"), + (0xFA10, "M", "塚"), + (0xFA11, "V"), + (0xFA12, "M", "晴"), + (0xFA13, "V"), + (0xFA15, "M", "凞"), + (0xFA16, "M", "猪"), + (0xFA17, "M", "益"), + (0xFA18, "M", "礼"), + (0xFA19, "M", "神"), + (0xFA1A, "M", "祥"), + (0xFA1B, "M", "福"), + (0xFA1C, "M", "靖"), + (0xFA1D, "M", "精"), + (0xFA1E, "M", "羽"), + (0xFA1F, "V"), + (0xFA20, "M", "蘒"), + (0xFA21, "V"), + (0xFA22, "M", "諸"), + (0xFA23, 
"V"), + (0xFA25, "M", "逸"), + (0xFA26, "M", "都"), + (0xFA27, "V"), + (0xFA2A, "M", "飯"), + (0xFA2B, "M", "飼"), + (0xFA2C, "M", "館"), + (0xFA2D, "M", "鶴"), + (0xFA2E, "M", "郞"), + (0xFA2F, "M", "隷"), + (0xFA30, "M", "侮"), + (0xFA31, "M", "僧"), + (0xFA32, "M", "免"), + (0xFA33, "M", "勉"), + (0xFA34, "M", "勤"), + (0xFA35, "M", "卑"), + (0xFA36, "M", "喝"), + (0xFA37, "M", "嘆"), + (0xFA38, "M", "器"), + (0xFA39, "M", "塀"), + (0xFA3A, "M", "墨"), + (0xFA3B, "M", "層"), + (0xFA3C, "M", "屮"), + (0xFA3D, "M", "悔"), + (0xFA3E, "M", "慨"), + (0xFA3F, "M", "憎"), + (0xFA40, "M", "懲"), + (0xFA41, "M", "敏"), + (0xFA42, "M", "既"), + (0xFA43, "M", "暑"), + (0xFA44, "M", "梅"), + (0xFA45, "M", "海"), + (0xFA46, "M", "渚"), + (0xFA47, "M", "漢"), + (0xFA48, "M", "煮"), + (0xFA49, "M", "爫"), + (0xFA4A, "M", "琢"), + (0xFA4B, "M", "碑"), + (0xFA4C, "M", "社"), + (0xFA4D, "M", "祉"), + (0xFA4E, "M", "祈"), + (0xFA4F, "M", "祐"), + (0xFA50, "M", "祖"), + (0xFA51, "M", "祝"), + (0xFA52, "M", "禍"), + (0xFA53, "M", "禎"), + ] + + +def _seg_43() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ + (0xFA54, "M", "穀"), + (0xFA55, "M", "突"), + (0xFA56, "M", "節"), + (0xFA57, "M", "練"), + (0xFA58, "M", "縉"), + (0xFA59, "M", "繁"), + (0xFA5A, "M", "署"), + (0xFA5B, "M", "者"), + (0xFA5C, "M", "臭"), + (0xFA5D, "M", "艹"), + (0xFA5F, "M", "著"), + (0xFA60, "M", "褐"), + (0xFA61, "M", "視"), + (0xFA62, "M", "謁"), + (0xFA63, "M", "謹"), + (0xFA64, "M", "賓"), + (0xFA65, "M", "贈"), + (0xFA66, "M", "辶"), + (0xFA67, "M", "逸"), + (0xFA68, "M", "難"), + (0xFA69, "M", "響"), + (0xFA6A, "M", "頻"), + (0xFA6B, "M", "恵"), + (0xFA6C, "M", "𤋮"), + (0xFA6D, "M", "舘"), + (0xFA6E, "X"), + (0xFA70, "M", "並"), + (0xFA71, "M", "况"), + (0xFA72, "M", "全"), + (0xFA73, "M", "侀"), + (0xFA74, "M", "充"), + (0xFA75, "M", "冀"), + (0xFA76, "M", "勇"), + (0xFA77, "M", "勺"), + (0xFA78, "M", "喝"), + (0xFA79, "M", "啕"), + (0xFA7A, "M", "喙"), + (0xFA7B, "M", "嗢"), + (0xFA7C, "M", "塚"), + (0xFA7D, "M", "墳"), + (0xFA7E, "M", "奄"), + (0xFA7F, "M", "奔"), 
+ (0xFA80, "M", "婢"), + (0xFA81, "M", "嬨"), + (0xFA82, "M", "廒"), + (0xFA83, "M", "廙"), + (0xFA84, "M", "彩"), + (0xFA85, "M", "徭"), + (0xFA86, "M", "惘"), + (0xFA87, "M", "慎"), + (0xFA88, "M", "愈"), + (0xFA89, "M", "憎"), + (0xFA8A, "M", "慠"), + (0xFA8B, "M", "懲"), + (0xFA8C, "M", "戴"), + (0xFA8D, "M", "揄"), + (0xFA8E, "M", "搜"), + (0xFA8F, "M", "摒"), + (0xFA90, "M", "敖"), + (0xFA91, "M", "晴"), + (0xFA92, "M", "朗"), + (0xFA93, "M", "望"), + (0xFA94, "M", "杖"), + (0xFA95, "M", "歹"), + (0xFA96, "M", "殺"), + (0xFA97, "M", "流"), + (0xFA98, "M", "滛"), + (0xFA99, "M", "滋"), + (0xFA9A, "M", "漢"), + (0xFA9B, "M", "瀞"), + (0xFA9C, "M", "煮"), + (0xFA9D, "M", "瞧"), + (0xFA9E, "M", "爵"), + (0xFA9F, "M", "犯"), + (0xFAA0, "M", "猪"), + (0xFAA1, "M", "瑱"), + (0xFAA2, "M", "甆"), + (0xFAA3, "M", "画"), + (0xFAA4, "M", "瘝"), + (0xFAA5, "M", "瘟"), + (0xFAA6, "M", "益"), + (0xFAA7, "M", "盛"), + (0xFAA8, "M", "直"), + (0xFAA9, "M", "睊"), + (0xFAAA, "M", "着"), + (0xFAAB, "M", "磌"), + (0xFAAC, "M", "窱"), + (0xFAAD, "M", "節"), + (0xFAAE, "M", "类"), + (0xFAAF, "M", "絛"), + (0xFAB0, "M", "練"), + (0xFAB1, "M", "缾"), + (0xFAB2, "M", "者"), + (0xFAB3, "M", "荒"), + (0xFAB4, "M", "華"), + (0xFAB5, "M", "蝹"), + (0xFAB6, "M", "襁"), + (0xFAB7, "M", "覆"), + (0xFAB8, "M", "視"), + (0xFAB9, "M", "調"), + ] + + +def _seg_44() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ + (0xFABA, "M", "諸"), + (0xFABB, "M", "請"), + (0xFABC, "M", "謁"), + (0xFABD, "M", "諾"), + (0xFABE, "M", "諭"), + (0xFABF, "M", "謹"), + (0xFAC0, "M", "變"), + (0xFAC1, "M", "贈"), + (0xFAC2, "M", "輸"), + (0xFAC3, "M", "遲"), + (0xFAC4, "M", "醙"), + (0xFAC5, "M", "鉶"), + (0xFAC6, "M", "陼"), + (0xFAC7, "M", "難"), + (0xFAC8, "M", "靖"), + (0xFAC9, "M", "韛"), + (0xFACA, "M", "響"), + (0xFACB, "M", "頋"), + (0xFACC, "M", "頻"), + (0xFACD, "M", "鬒"), + (0xFACE, "M", "龜"), + (0xFACF, "M", "𢡊"), + (0xFAD0, "M", "𢡄"), + (0xFAD1, "M", "𣏕"), + (0xFAD2, "M", "㮝"), + (0xFAD3, "M", "䀘"), + (0xFAD4, "M", "䀹"), + (0xFAD5, "M", "𥉉"), + (0xFAD6, "M", 
"𥳐"), + (0xFAD7, "M", "𧻓"), + (0xFAD8, "M", "齃"), + (0xFAD9, "M", "龎"), + (0xFADA, "X"), + (0xFB00, "M", "ff"), + (0xFB01, "M", "fi"), + (0xFB02, "M", "fl"), + (0xFB03, "M", "ffi"), + (0xFB04, "M", "ffl"), + (0xFB05, "M", "st"), + (0xFB07, "X"), + (0xFB13, "M", "մն"), + (0xFB14, "M", "մե"), + (0xFB15, "M", "մի"), + (0xFB16, "M", "վն"), + (0xFB17, "M", "մխ"), + (0xFB18, "X"), + (0xFB1D, "M", "יִ"), + (0xFB1E, "V"), + (0xFB1F, "M", "ײַ"), + (0xFB20, "M", "ע"), + (0xFB21, "M", "א"), + (0xFB22, "M", "ד"), + (0xFB23, "M", "ה"), + (0xFB24, "M", "כ"), + (0xFB25, "M", "ל"), + (0xFB26, "M", "ם"), + (0xFB27, "M", "ר"), + (0xFB28, "M", "ת"), + (0xFB29, "M", "+"), + (0xFB2A, "M", "שׁ"), + (0xFB2B, "M", "שׂ"), + (0xFB2C, "M", "שּׁ"), + (0xFB2D, "M", "שּׂ"), + (0xFB2E, "M", "אַ"), + (0xFB2F, "M", "אָ"), + (0xFB30, "M", "אּ"), + (0xFB31, "M", "בּ"), + (0xFB32, "M", "גּ"), + (0xFB33, "M", "דּ"), + (0xFB34, "M", "הּ"), + (0xFB35, "M", "וּ"), + (0xFB36, "M", "זּ"), + (0xFB37, "X"), + (0xFB38, "M", "טּ"), + (0xFB39, "M", "יּ"), + (0xFB3A, "M", "ךּ"), + (0xFB3B, "M", "כּ"), + (0xFB3C, "M", "לּ"), + (0xFB3D, "X"), + (0xFB3E, "M", "מּ"), + (0xFB3F, "X"), + (0xFB40, "M", "נּ"), + (0xFB41, "M", "סּ"), + (0xFB42, "X"), + (0xFB43, "M", "ףּ"), + (0xFB44, "M", "פּ"), + (0xFB45, "X"), + (0xFB46, "M", "צּ"), + (0xFB47, "M", "קּ"), + (0xFB48, "M", "רּ"), + (0xFB49, "M", "שּ"), + (0xFB4A, "M", "תּ"), + (0xFB4B, "M", "וֹ"), + (0xFB4C, "M", "בֿ"), + (0xFB4D, "M", "כֿ"), + (0xFB4E, "M", "פֿ"), + (0xFB4F, "M", "אל"), + (0xFB50, "M", "ٱ"), + (0xFB52, "M", "ٻ"), + (0xFB56, "M", "پ"), + ] + + +def _seg_45() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ + (0xFB5A, "M", "ڀ"), + (0xFB5E, "M", "ٺ"), + (0xFB62, "M", "ٿ"), + (0xFB66, "M", "ٹ"), + (0xFB6A, "M", "ڤ"), + (0xFB6E, "M", "ڦ"), + (0xFB72, "M", "ڄ"), + (0xFB76, "M", "ڃ"), + (0xFB7A, "M", "چ"), + (0xFB7E, "M", "ڇ"), + (0xFB82, "M", "ڍ"), + (0xFB84, "M", "ڌ"), + (0xFB86, "M", "ڎ"), + (0xFB88, "M", "ڈ"), + (0xFB8A, "M", "ژ"), + 
(0xFB8C, "M", "ڑ"), + (0xFB8E, "M", "ک"), + (0xFB92, "M", "گ"), + (0xFB96, "M", "ڳ"), + (0xFB9A, "M", "ڱ"), + (0xFB9E, "M", "ں"), + (0xFBA0, "M", "ڻ"), + (0xFBA4, "M", "ۀ"), + (0xFBA6, "M", "ہ"), + (0xFBAA, "M", "ھ"), + (0xFBAE, "M", "ے"), + (0xFBB0, "M", "ۓ"), + (0xFBB2, "V"), + (0xFBC3, "X"), + (0xFBD3, "M", "ڭ"), + (0xFBD7, "M", "ۇ"), + (0xFBD9, "M", "ۆ"), + (0xFBDB, "M", "ۈ"), + (0xFBDD, "M", "ۇٴ"), + (0xFBDE, "M", "ۋ"), + (0xFBE0, "M", "ۅ"), + (0xFBE2, "M", "ۉ"), + (0xFBE4, "M", "ې"), + (0xFBE8, "M", "ى"), + (0xFBEA, "M", "ئا"), + (0xFBEC, "M", "ئە"), + (0xFBEE, "M", "ئو"), + (0xFBF0, "M", "ئۇ"), + (0xFBF2, "M", "ئۆ"), + (0xFBF4, "M", "ئۈ"), + (0xFBF6, "M", "ئې"), + (0xFBF9, "M", "ئى"), + (0xFBFC, "M", "ی"), + (0xFC00, "M", "ئج"), + (0xFC01, "M", "ئح"), + (0xFC02, "M", "ئم"), + (0xFC03, "M", "ئى"), + (0xFC04, "M", "ئي"), + (0xFC05, "M", "بج"), + (0xFC06, "M", "بح"), + (0xFC07, "M", "بخ"), + (0xFC08, "M", "بم"), + (0xFC09, "M", "بى"), + (0xFC0A, "M", "بي"), + (0xFC0B, "M", "تج"), + (0xFC0C, "M", "تح"), + (0xFC0D, "M", "تخ"), + (0xFC0E, "M", "تم"), + (0xFC0F, "M", "تى"), + (0xFC10, "M", "تي"), + (0xFC11, "M", "ثج"), + (0xFC12, "M", "ثم"), + (0xFC13, "M", "ثى"), + (0xFC14, "M", "ثي"), + (0xFC15, "M", "جح"), + (0xFC16, "M", "جم"), + (0xFC17, "M", "حج"), + (0xFC18, "M", "حم"), + (0xFC19, "M", "خج"), + (0xFC1A, "M", "خح"), + (0xFC1B, "M", "خم"), + (0xFC1C, "M", "سج"), + (0xFC1D, "M", "سح"), + (0xFC1E, "M", "سخ"), + (0xFC1F, "M", "سم"), + (0xFC20, "M", "صح"), + (0xFC21, "M", "صم"), + (0xFC22, "M", "ضج"), + (0xFC23, "M", "ضح"), + (0xFC24, "M", "ضخ"), + (0xFC25, "M", "ضم"), + (0xFC26, "M", "طح"), + (0xFC27, "M", "طم"), + (0xFC28, "M", "ظم"), + (0xFC29, "M", "عج"), + (0xFC2A, "M", "عم"), + (0xFC2B, "M", "غج"), + (0xFC2C, "M", "غم"), + (0xFC2D, "M", "فج"), + (0xFC2E, "M", "فح"), + (0xFC2F, "M", "فخ"), + (0xFC30, "M", "فم"), + (0xFC31, "M", "فى"), + (0xFC32, "M", "في"), + (0xFC33, "M", "قح"), + ] + + +def _seg_46() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + 
return [ + (0xFC34, "M", "قم"), + (0xFC35, "M", "قى"), + (0xFC36, "M", "قي"), + (0xFC37, "M", "كا"), + (0xFC38, "M", "كج"), + (0xFC39, "M", "كح"), + (0xFC3A, "M", "كخ"), + (0xFC3B, "M", "كل"), + (0xFC3C, "M", "كم"), + (0xFC3D, "M", "كى"), + (0xFC3E, "M", "كي"), + (0xFC3F, "M", "لج"), + (0xFC40, "M", "لح"), + (0xFC41, "M", "لخ"), + (0xFC42, "M", "لم"), + (0xFC43, "M", "لى"), + (0xFC44, "M", "لي"), + (0xFC45, "M", "مج"), + (0xFC46, "M", "مح"), + (0xFC47, "M", "مخ"), + (0xFC48, "M", "مم"), + (0xFC49, "M", "مى"), + (0xFC4A, "M", "مي"), + (0xFC4B, "M", "نج"), + (0xFC4C, "M", "نح"), + (0xFC4D, "M", "نخ"), + (0xFC4E, "M", "نم"), + (0xFC4F, "M", "نى"), + (0xFC50, "M", "ني"), + (0xFC51, "M", "هج"), + (0xFC52, "M", "هم"), + (0xFC53, "M", "هى"), + (0xFC54, "M", "هي"), + (0xFC55, "M", "يج"), + (0xFC56, "M", "يح"), + (0xFC57, "M", "يخ"), + (0xFC58, "M", "يم"), + (0xFC59, "M", "يى"), + (0xFC5A, "M", "يي"), + (0xFC5B, "M", "ذٰ"), + (0xFC5C, "M", "رٰ"), + (0xFC5D, "M", "ىٰ"), + (0xFC5E, "M", " ٌّ"), + (0xFC5F, "M", " ٍّ"), + (0xFC60, "M", " َّ"), + (0xFC61, "M", " ُّ"), + (0xFC62, "M", " ِّ"), + (0xFC63, "M", " ّٰ"), + (0xFC64, "M", "ئر"), + (0xFC65, "M", "ئز"), + (0xFC66, "M", "ئم"), + (0xFC67, "M", "ئن"), + (0xFC68, "M", "ئى"), + (0xFC69, "M", "ئي"), + (0xFC6A, "M", "بر"), + (0xFC6B, "M", "بز"), + (0xFC6C, "M", "بم"), + (0xFC6D, "M", "بن"), + (0xFC6E, "M", "بى"), + (0xFC6F, "M", "بي"), + (0xFC70, "M", "تر"), + (0xFC71, "M", "تز"), + (0xFC72, "M", "تم"), + (0xFC73, "M", "تن"), + (0xFC74, "M", "تى"), + (0xFC75, "M", "تي"), + (0xFC76, "M", "ثر"), + (0xFC77, "M", "ثز"), + (0xFC78, "M", "ثم"), + (0xFC79, "M", "ثن"), + (0xFC7A, "M", "ثى"), + (0xFC7B, "M", "ثي"), + (0xFC7C, "M", "فى"), + (0xFC7D, "M", "في"), + (0xFC7E, "M", "قى"), + (0xFC7F, "M", "قي"), + (0xFC80, "M", "كا"), + (0xFC81, "M", "كل"), + (0xFC82, "M", "كم"), + (0xFC83, "M", "كى"), + (0xFC84, "M", "كي"), + (0xFC85, "M", "لم"), + (0xFC86, "M", "لى"), + (0xFC87, "M", "لي"), + (0xFC88, "M", "ما"), + (0xFC89, "M", "مم"), + 
(0xFC8A, "M", "نر"), + (0xFC8B, "M", "نز"), + (0xFC8C, "M", "نم"), + (0xFC8D, "M", "نن"), + (0xFC8E, "M", "نى"), + (0xFC8F, "M", "ني"), + (0xFC90, "M", "ىٰ"), + (0xFC91, "M", "ير"), + (0xFC92, "M", "يز"), + (0xFC93, "M", "يم"), + (0xFC94, "M", "ين"), + (0xFC95, "M", "يى"), + (0xFC96, "M", "يي"), + (0xFC97, "M", "ئج"), + ] + + +def _seg_47() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ + (0xFC98, "M", "ئح"), + (0xFC99, "M", "ئخ"), + (0xFC9A, "M", "ئم"), + (0xFC9B, "M", "ئه"), + (0xFC9C, "M", "بج"), + (0xFC9D, "M", "بح"), + (0xFC9E, "M", "بخ"), + (0xFC9F, "M", "بم"), + (0xFCA0, "M", "به"), + (0xFCA1, "M", "تج"), + (0xFCA2, "M", "تح"), + (0xFCA3, "M", "تخ"), + (0xFCA4, "M", "تم"), + (0xFCA5, "M", "ته"), + (0xFCA6, "M", "ثم"), + (0xFCA7, "M", "جح"), + (0xFCA8, "M", "جم"), + (0xFCA9, "M", "حج"), + (0xFCAA, "M", "حم"), + (0xFCAB, "M", "خج"), + (0xFCAC, "M", "خم"), + (0xFCAD, "M", "سج"), + (0xFCAE, "M", "سح"), + (0xFCAF, "M", "سخ"), + (0xFCB0, "M", "سم"), + (0xFCB1, "M", "صح"), + (0xFCB2, "M", "صخ"), + (0xFCB3, "M", "صم"), + (0xFCB4, "M", "ضج"), + (0xFCB5, "M", "ضح"), + (0xFCB6, "M", "ضخ"), + (0xFCB7, "M", "ضم"), + (0xFCB8, "M", "طح"), + (0xFCB9, "M", "ظم"), + (0xFCBA, "M", "عج"), + (0xFCBB, "M", "عم"), + (0xFCBC, "M", "غج"), + (0xFCBD, "M", "غم"), + (0xFCBE, "M", "فج"), + (0xFCBF, "M", "فح"), + (0xFCC0, "M", "فخ"), + (0xFCC1, "M", "فم"), + (0xFCC2, "M", "قح"), + (0xFCC3, "M", "قم"), + (0xFCC4, "M", "كج"), + (0xFCC5, "M", "كح"), + (0xFCC6, "M", "كخ"), + (0xFCC7, "M", "كل"), + (0xFCC8, "M", "كم"), + (0xFCC9, "M", "لج"), + (0xFCCA, "M", "لح"), + (0xFCCB, "M", "لخ"), + (0xFCCC, "M", "لم"), + (0xFCCD, "M", "له"), + (0xFCCE, "M", "مج"), + (0xFCCF, "M", "مح"), + (0xFCD0, "M", "مخ"), + (0xFCD1, "M", "مم"), + (0xFCD2, "M", "نج"), + (0xFCD3, "M", "نح"), + (0xFCD4, "M", "نخ"), + (0xFCD5, "M", "نم"), + (0xFCD6, "M", "نه"), + (0xFCD7, "M", "هج"), + (0xFCD8, "M", "هم"), + (0xFCD9, "M", "هٰ"), + (0xFCDA, "M", "يج"), + (0xFCDB, "M", "يح"), + (0xFCDC, "M", "يخ"), + 
(0xFCDD, "M", "يم"), + (0xFCDE, "M", "يه"), + (0xFCDF, "M", "ئم"), + (0xFCE0, "M", "ئه"), + (0xFCE1, "M", "بم"), + (0xFCE2, "M", "به"), + (0xFCE3, "M", "تم"), + (0xFCE4, "M", "ته"), + (0xFCE5, "M", "ثم"), + (0xFCE6, "M", "ثه"), + (0xFCE7, "M", "سم"), + (0xFCE8, "M", "سه"), + (0xFCE9, "M", "شم"), + (0xFCEA, "M", "شه"), + (0xFCEB, "M", "كل"), + (0xFCEC, "M", "كم"), + (0xFCED, "M", "لم"), + (0xFCEE, "M", "نم"), + (0xFCEF, "M", "نه"), + (0xFCF0, "M", "يم"), + (0xFCF1, "M", "يه"), + (0xFCF2, "M", "ـَّ"), + (0xFCF3, "M", "ـُّ"), + (0xFCF4, "M", "ـِّ"), + (0xFCF5, "M", "طى"), + (0xFCF6, "M", "طي"), + (0xFCF7, "M", "عى"), + (0xFCF8, "M", "عي"), + (0xFCF9, "M", "غى"), + (0xFCFA, "M", "غي"), + (0xFCFB, "M", "سى"), + ] + + +def _seg_48() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ + (0xFCFC, "M", "سي"), + (0xFCFD, "M", "شى"), + (0xFCFE, "M", "شي"), + (0xFCFF, "M", "حى"), + (0xFD00, "M", "حي"), + (0xFD01, "M", "جى"), + (0xFD02, "M", "جي"), + (0xFD03, "M", "خى"), + (0xFD04, "M", "خي"), + (0xFD05, "M", "صى"), + (0xFD06, "M", "صي"), + (0xFD07, "M", "ضى"), + (0xFD08, "M", "ضي"), + (0xFD09, "M", "شج"), + (0xFD0A, "M", "شح"), + (0xFD0B, "M", "شخ"), + (0xFD0C, "M", "شم"), + (0xFD0D, "M", "شر"), + (0xFD0E, "M", "سر"), + (0xFD0F, "M", "صر"), + (0xFD10, "M", "ضر"), + (0xFD11, "M", "طى"), + (0xFD12, "M", "طي"), + (0xFD13, "M", "عى"), + (0xFD14, "M", "عي"), + (0xFD15, "M", "غى"), + (0xFD16, "M", "غي"), + (0xFD17, "M", "سى"), + (0xFD18, "M", "سي"), + (0xFD19, "M", "شى"), + (0xFD1A, "M", "شي"), + (0xFD1B, "M", "حى"), + (0xFD1C, "M", "حي"), + (0xFD1D, "M", "جى"), + (0xFD1E, "M", "جي"), + (0xFD1F, "M", "خى"), + (0xFD20, "M", "خي"), + (0xFD21, "M", "صى"), + (0xFD22, "M", "صي"), + (0xFD23, "M", "ضى"), + (0xFD24, "M", "ضي"), + (0xFD25, "M", "شج"), + (0xFD26, "M", "شح"), + (0xFD27, "M", "شخ"), + (0xFD28, "M", "شم"), + (0xFD29, "M", "شر"), + (0xFD2A, "M", "سر"), + (0xFD2B, "M", "صر"), + (0xFD2C, "M", "ضر"), + (0xFD2D, "M", "شج"), + (0xFD2E, "M", "شح"), + (0xFD2F, "M", "شخ"), 
+ (0xFD30, "M", "شم"), + (0xFD31, "M", "سه"), + (0xFD32, "M", "شه"), + (0xFD33, "M", "طم"), + (0xFD34, "M", "سج"), + (0xFD35, "M", "سح"), + (0xFD36, "M", "سخ"), + (0xFD37, "M", "شج"), + (0xFD38, "M", "شح"), + (0xFD39, "M", "شخ"), + (0xFD3A, "M", "طم"), + (0xFD3B, "M", "ظم"), + (0xFD3C, "M", "اً"), + (0xFD3E, "V"), + (0xFD50, "M", "تجم"), + (0xFD51, "M", "تحج"), + (0xFD53, "M", "تحم"), + (0xFD54, "M", "تخم"), + (0xFD55, "M", "تمج"), + (0xFD56, "M", "تمح"), + (0xFD57, "M", "تمخ"), + (0xFD58, "M", "جمح"), + (0xFD5A, "M", "حمي"), + (0xFD5B, "M", "حمى"), + (0xFD5C, "M", "سحج"), + (0xFD5D, "M", "سجح"), + (0xFD5E, "M", "سجى"), + (0xFD5F, "M", "سمح"), + (0xFD61, "M", "سمج"), + (0xFD62, "M", "سمم"), + (0xFD64, "M", "صحح"), + (0xFD66, "M", "صمم"), + (0xFD67, "M", "شحم"), + (0xFD69, "M", "شجي"), + (0xFD6A, "M", "شمخ"), + (0xFD6C, "M", "شمم"), + (0xFD6E, "M", "ضحى"), + (0xFD6F, "M", "ضخم"), + (0xFD71, "M", "طمح"), + (0xFD73, "M", "طمم"), + (0xFD74, "M", "طمي"), + (0xFD75, "M", "عجم"), + (0xFD76, "M", "عمم"), + (0xFD78, "M", "عمى"), + (0xFD79, "M", "غمم"), + (0xFD7A, "M", "غمي"), + (0xFD7B, "M", "غمى"), + (0xFD7C, "M", "فخم"), + ] + + +def _seg_49() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ + (0xFD7E, "M", "قمح"), + (0xFD7F, "M", "قمم"), + (0xFD80, "M", "لحم"), + (0xFD81, "M", "لحي"), + (0xFD82, "M", "لحى"), + (0xFD83, "M", "لجج"), + (0xFD85, "M", "لخم"), + (0xFD87, "M", "لمح"), + (0xFD89, "M", "محج"), + (0xFD8A, "M", "محم"), + (0xFD8B, "M", "محي"), + (0xFD8C, "M", "مجح"), + (0xFD8D, "M", "مجم"), + (0xFD8E, "M", "مخج"), + (0xFD8F, "M", "مخم"), + (0xFD90, "X"), + (0xFD92, "M", "مجخ"), + (0xFD93, "M", "همج"), + (0xFD94, "M", "همم"), + (0xFD95, "M", "نحم"), + (0xFD96, "M", "نحى"), + (0xFD97, "M", "نجم"), + (0xFD99, "M", "نجى"), + (0xFD9A, "M", "نمي"), + (0xFD9B, "M", "نمى"), + (0xFD9C, "M", "يمم"), + (0xFD9E, "M", "بخي"), + (0xFD9F, "M", "تجي"), + (0xFDA0, "M", "تجى"), + (0xFDA1, "M", "تخي"), + (0xFDA2, "M", "تخى"), + (0xFDA3, "M", "تمي"), + (0xFDA4, "M", 
"تمى"), + (0xFDA5, "M", "جمي"), + (0xFDA6, "M", "جحى"), + (0xFDA7, "M", "جمى"), + (0xFDA8, "M", "سخى"), + (0xFDA9, "M", "صحي"), + (0xFDAA, "M", "شحي"), + (0xFDAB, "M", "ضحي"), + (0xFDAC, "M", "لجي"), + (0xFDAD, "M", "لمي"), + (0xFDAE, "M", "يحي"), + (0xFDAF, "M", "يجي"), + (0xFDB0, "M", "يمي"), + (0xFDB1, "M", "ممي"), + (0xFDB2, "M", "قمي"), + (0xFDB3, "M", "نحي"), + (0xFDB4, "M", "قمح"), + (0xFDB5, "M", "لحم"), + (0xFDB6, "M", "عمي"), + (0xFDB7, "M", "كمي"), + (0xFDB8, "M", "نجح"), + (0xFDB9, "M", "مخي"), + (0xFDBA, "M", "لجم"), + (0xFDBB, "M", "كمم"), + (0xFDBC, "M", "لجم"), + (0xFDBD, "M", "نجح"), + (0xFDBE, "M", "جحي"), + (0xFDBF, "M", "حجي"), + (0xFDC0, "M", "مجي"), + (0xFDC1, "M", "فمي"), + (0xFDC2, "M", "بحي"), + (0xFDC3, "M", "كمم"), + (0xFDC4, "M", "عجم"), + (0xFDC5, "M", "صمم"), + (0xFDC6, "M", "سخي"), + (0xFDC7, "M", "نجي"), + (0xFDC8, "X"), + (0xFDCF, "V"), + (0xFDD0, "X"), + (0xFDF0, "M", "صلے"), + (0xFDF1, "M", "قلے"), + (0xFDF2, "M", "الله"), + (0xFDF3, "M", "اكبر"), + (0xFDF4, "M", "محمد"), + (0xFDF5, "M", "صلعم"), + (0xFDF6, "M", "رسول"), + (0xFDF7, "M", "عليه"), + (0xFDF8, "M", "وسلم"), + (0xFDF9, "M", "صلى"), + (0xFDFA, "M", "صلى الله عليه وسلم"), + (0xFDFB, "M", "جل جلاله"), + (0xFDFC, "M", "ریال"), + (0xFDFD, "V"), + (0xFE00, "I"), + (0xFE10, "M", ","), + (0xFE11, "M", "、"), + (0xFE12, "X"), + (0xFE13, "M", ":"), + (0xFE14, "M", ";"), + (0xFE15, "M", "!"), + (0xFE16, "M", "?"), + (0xFE17, "M", "〖"), + (0xFE18, "M", "〗"), + (0xFE19, "X"), + (0xFE20, "V"), + (0xFE30, "X"), + (0xFE31, "M", "—"), + (0xFE32, "M", "–"), + ] + + +def _seg_50() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ + (0xFE33, "M", "_"), + (0xFE35, "M", "("), + (0xFE36, "M", ")"), + (0xFE37, "M", "{"), + (0xFE38, "M", "}"), + (0xFE39, "M", "〔"), + (0xFE3A, "M", "〕"), + (0xFE3B, "M", "【"), + (0xFE3C, "M", "】"), + (0xFE3D, "M", "《"), + (0xFE3E, "M", "》"), + (0xFE3F, "M", "〈"), + (0xFE40, "M", "〉"), + (0xFE41, "M", "「"), + (0xFE42, "M", "」"), + (0xFE43, "M", 
"『"), + (0xFE44, "M", "』"), + (0xFE45, "V"), + (0xFE47, "M", "["), + (0xFE48, "M", "]"), + (0xFE49, "M", " ̅"), + (0xFE4D, "M", "_"), + (0xFE50, "M", ","), + (0xFE51, "M", "、"), + (0xFE52, "X"), + (0xFE54, "M", ";"), + (0xFE55, "M", ":"), + (0xFE56, "M", "?"), + (0xFE57, "M", "!"), + (0xFE58, "M", "—"), + (0xFE59, "M", "("), + (0xFE5A, "M", ")"), + (0xFE5B, "M", "{"), + (0xFE5C, "M", "}"), + (0xFE5D, "M", "〔"), + (0xFE5E, "M", "〕"), + (0xFE5F, "M", "#"), + (0xFE60, "M", "&"), + (0xFE61, "M", "*"), + (0xFE62, "M", "+"), + (0xFE63, "M", "-"), + (0xFE64, "M", "<"), + (0xFE65, "M", ">"), + (0xFE66, "M", "="), + (0xFE67, "X"), + (0xFE68, "M", "\\"), + (0xFE69, "M", "$"), + (0xFE6A, "M", "%"), + (0xFE6B, "M", "@"), + (0xFE6C, "X"), + (0xFE70, "M", " ً"), + (0xFE71, "M", "ـً"), + (0xFE72, "M", " ٌ"), + (0xFE73, "V"), + (0xFE74, "M", " ٍ"), + (0xFE75, "X"), + (0xFE76, "M", " َ"), + (0xFE77, "M", "ـَ"), + (0xFE78, "M", " ُ"), + (0xFE79, "M", "ـُ"), + (0xFE7A, "M", " ِ"), + (0xFE7B, "M", "ـِ"), + (0xFE7C, "M", " ّ"), + (0xFE7D, "M", "ـّ"), + (0xFE7E, "M", " ْ"), + (0xFE7F, "M", "ـْ"), + (0xFE80, "M", "ء"), + (0xFE81, "M", "آ"), + (0xFE83, "M", "أ"), + (0xFE85, "M", "ؤ"), + (0xFE87, "M", "إ"), + (0xFE89, "M", "ئ"), + (0xFE8D, "M", "ا"), + (0xFE8F, "M", "ب"), + (0xFE93, "M", "ة"), + (0xFE95, "M", "ت"), + (0xFE99, "M", "ث"), + (0xFE9D, "M", "ج"), + (0xFEA1, "M", "ح"), + (0xFEA5, "M", "خ"), + (0xFEA9, "M", "د"), + (0xFEAB, "M", "ذ"), + (0xFEAD, "M", "ر"), + (0xFEAF, "M", "ز"), + (0xFEB1, "M", "س"), + (0xFEB5, "M", "ش"), + (0xFEB9, "M", "ص"), + (0xFEBD, "M", "ض"), + (0xFEC1, "M", "ط"), + (0xFEC5, "M", "ظ"), + (0xFEC9, "M", "ع"), + (0xFECD, "M", "غ"), + (0xFED1, "M", "ف"), + (0xFED5, "M", "ق"), + (0xFED9, "M", "ك"), + (0xFEDD, "M", "ل"), + (0xFEE1, "M", "م"), + (0xFEE5, "M", "ن"), + (0xFEE9, "M", "ه"), + (0xFEED, "M", "و"), + ] + + +def _seg_51() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ + (0xFEEF, "M", "ى"), + (0xFEF1, "M", "ي"), + (0xFEF5, "M", "لآ"), + 
(0xFEF7, "M", "لأ"), + (0xFEF9, "M", "لإ"), + (0xFEFB, "M", "لا"), + (0xFEFD, "X"), + (0xFEFF, "I"), + (0xFF00, "X"), + (0xFF01, "M", "!"), + (0xFF02, "M", '"'), + (0xFF03, "M", "#"), + (0xFF04, "M", "$"), + (0xFF05, "M", "%"), + (0xFF06, "M", "&"), + (0xFF07, "M", "'"), + (0xFF08, "M", "("), + (0xFF09, "M", ")"), + (0xFF0A, "M", "*"), + (0xFF0B, "M", "+"), + (0xFF0C, "M", ","), + (0xFF0D, "M", "-"), + (0xFF0E, "M", "."), + (0xFF0F, "M", "/"), + (0xFF10, "M", "0"), + (0xFF11, "M", "1"), + (0xFF12, "M", "2"), + (0xFF13, "M", "3"), + (0xFF14, "M", "4"), + (0xFF15, "M", "5"), + (0xFF16, "M", "6"), + (0xFF17, "M", "7"), + (0xFF18, "M", "8"), + (0xFF19, "M", "9"), + (0xFF1A, "M", ":"), + (0xFF1B, "M", ";"), + (0xFF1C, "M", "<"), + (0xFF1D, "M", "="), + (0xFF1E, "M", ">"), + (0xFF1F, "M", "?"), + (0xFF20, "M", "@"), + (0xFF21, "M", "a"), + (0xFF22, "M", "b"), + (0xFF23, "M", "c"), + (0xFF24, "M", "d"), + (0xFF25, "M", "e"), + (0xFF26, "M", "f"), + (0xFF27, "M", "g"), + (0xFF28, "M", "h"), + (0xFF29, "M", "i"), + (0xFF2A, "M", "j"), + (0xFF2B, "M", "k"), + (0xFF2C, "M", "l"), + (0xFF2D, "M", "m"), + (0xFF2E, "M", "n"), + (0xFF2F, "M", "o"), + (0xFF30, "M", "p"), + (0xFF31, "M", "q"), + (0xFF32, "M", "r"), + (0xFF33, "M", "s"), + (0xFF34, "M", "t"), + (0xFF35, "M", "u"), + (0xFF36, "M", "v"), + (0xFF37, "M", "w"), + (0xFF38, "M", "x"), + (0xFF39, "M", "y"), + (0xFF3A, "M", "z"), + (0xFF3B, "M", "["), + (0xFF3C, "M", "\\"), + (0xFF3D, "M", "]"), + (0xFF3E, "M", "^"), + (0xFF3F, "M", "_"), + (0xFF40, "M", "`"), + (0xFF41, "M", "a"), + (0xFF42, "M", "b"), + (0xFF43, "M", "c"), + (0xFF44, "M", "d"), + (0xFF45, "M", "e"), + (0xFF46, "M", "f"), + (0xFF47, "M", "g"), + (0xFF48, "M", "h"), + (0xFF49, "M", "i"), + (0xFF4A, "M", "j"), + (0xFF4B, "M", "k"), + (0xFF4C, "M", "l"), + (0xFF4D, "M", "m"), + (0xFF4E, "M", "n"), + (0xFF4F, "M", "o"), + (0xFF50, "M", "p"), + (0xFF51, "M", "q"), + (0xFF52, "M", "r"), + (0xFF53, "M", "s"), + (0xFF54, "M", "t"), + (0xFF55, "M", "u"), + (0xFF56, 
"M", "v"), + (0xFF57, "M", "w"), + (0xFF58, "M", "x"), + (0xFF59, "M", "y"), + (0xFF5A, "M", "z"), + (0xFF5B, "M", "{"), + ] + + +def _seg_52() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ + (0xFF5C, "M", "|"), + (0xFF5D, "M", "}"), + (0xFF5E, "M", "~"), + (0xFF5F, "M", "⦅"), + (0xFF60, "M", "⦆"), + (0xFF61, "M", "."), + (0xFF62, "M", "「"), + (0xFF63, "M", "」"), + (0xFF64, "M", "、"), + (0xFF65, "M", "・"), + (0xFF66, "M", "ヲ"), + (0xFF67, "M", "ァ"), + (0xFF68, "M", "ィ"), + (0xFF69, "M", "ゥ"), + (0xFF6A, "M", "ェ"), + (0xFF6B, "M", "ォ"), + (0xFF6C, "M", "ャ"), + (0xFF6D, "M", "ュ"), + (0xFF6E, "M", "ョ"), + (0xFF6F, "M", "ッ"), + (0xFF70, "M", "ー"), + (0xFF71, "M", "ア"), + (0xFF72, "M", "イ"), + (0xFF73, "M", "ウ"), + (0xFF74, "M", "エ"), + (0xFF75, "M", "オ"), + (0xFF76, "M", "カ"), + (0xFF77, "M", "キ"), + (0xFF78, "M", "ク"), + (0xFF79, "M", "ケ"), + (0xFF7A, "M", "コ"), + (0xFF7B, "M", "サ"), + (0xFF7C, "M", "シ"), + (0xFF7D, "M", "ス"), + (0xFF7E, "M", "セ"), + (0xFF7F, "M", "ソ"), + (0xFF80, "M", "タ"), + (0xFF81, "M", "チ"), + (0xFF82, "M", "ツ"), + (0xFF83, "M", "テ"), + (0xFF84, "M", "ト"), + (0xFF85, "M", "ナ"), + (0xFF86, "M", "ニ"), + (0xFF87, "M", "ヌ"), + (0xFF88, "M", "ネ"), + (0xFF89, "M", "ノ"), + (0xFF8A, "M", "ハ"), + (0xFF8B, "M", "ヒ"), + (0xFF8C, "M", "フ"), + (0xFF8D, "M", "ヘ"), + (0xFF8E, "M", "ホ"), + (0xFF8F, "M", "マ"), + (0xFF90, "M", "ミ"), + (0xFF91, "M", "ム"), + (0xFF92, "M", "メ"), + (0xFF93, "M", "モ"), + (0xFF94, "M", "ヤ"), + (0xFF95, "M", "ユ"), + (0xFF96, "M", "ヨ"), + (0xFF97, "M", "ラ"), + (0xFF98, "M", "リ"), + (0xFF99, "M", "ル"), + (0xFF9A, "M", "レ"), + (0xFF9B, "M", "ロ"), + (0xFF9C, "M", "ワ"), + (0xFF9D, "M", "ン"), + (0xFF9E, "M", "゙"), + (0xFF9F, "M", "゚"), + (0xFFA0, "I"), + (0xFFA1, "M", "ᄀ"), + (0xFFA2, "M", "ᄁ"), + (0xFFA3, "M", "ᆪ"), + (0xFFA4, "M", "ᄂ"), + (0xFFA5, "M", "ᆬ"), + (0xFFA6, "M", "ᆭ"), + (0xFFA7, "M", "ᄃ"), + (0xFFA8, "M", "ᄄ"), + (0xFFA9, "M", "ᄅ"), + (0xFFAA, "M", "ᆰ"), + (0xFFAB, "M", "ᆱ"), + (0xFFAC, "M", "ᆲ"), + (0xFFAD, 
"M", "ᆳ"), + (0xFFAE, "M", "ᆴ"), + (0xFFAF, "M", "ᆵ"), + (0xFFB0, "M", "ᄚ"), + (0xFFB1, "M", "ᄆ"), + (0xFFB2, "M", "ᄇ"), + (0xFFB3, "M", "ᄈ"), + (0xFFB4, "M", "ᄡ"), + (0xFFB5, "M", "ᄉ"), + (0xFFB6, "M", "ᄊ"), + (0xFFB7, "M", "ᄋ"), + (0xFFB8, "M", "ᄌ"), + (0xFFB9, "M", "ᄍ"), + (0xFFBA, "M", "ᄎ"), + (0xFFBB, "M", "ᄏ"), + (0xFFBC, "M", "ᄐ"), + (0xFFBD, "M", "ᄑ"), + (0xFFBE, "M", "ᄒ"), + (0xFFBF, "X"), + ] + + +def _seg_53() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ + (0xFFC2, "M", "ᅡ"), + (0xFFC3, "M", "ᅢ"), + (0xFFC4, "M", "ᅣ"), + (0xFFC5, "M", "ᅤ"), + (0xFFC6, "M", "ᅥ"), + (0xFFC7, "M", "ᅦ"), + (0xFFC8, "X"), + (0xFFCA, "M", "ᅧ"), + (0xFFCB, "M", "ᅨ"), + (0xFFCC, "M", "ᅩ"), + (0xFFCD, "M", "ᅪ"), + (0xFFCE, "M", "ᅫ"), + (0xFFCF, "M", "ᅬ"), + (0xFFD0, "X"), + (0xFFD2, "M", "ᅭ"), + (0xFFD3, "M", "ᅮ"), + (0xFFD4, "M", "ᅯ"), + (0xFFD5, "M", "ᅰ"), + (0xFFD6, "M", "ᅱ"), + (0xFFD7, "M", "ᅲ"), + (0xFFD8, "X"), + (0xFFDA, "M", "ᅳ"), + (0xFFDB, "M", "ᅴ"), + (0xFFDC, "M", "ᅵ"), + (0xFFDD, "X"), + (0xFFE0, "M", "¢"), + (0xFFE1, "M", "£"), + (0xFFE2, "M", "¬"), + (0xFFE3, "M", " ̄"), + (0xFFE4, "M", "¦"), + (0xFFE5, "M", "¥"), + (0xFFE6, "M", "₩"), + (0xFFE7, "X"), + (0xFFE8, "M", "│"), + (0xFFE9, "M", "←"), + (0xFFEA, "M", "↑"), + (0xFFEB, "M", "→"), + (0xFFEC, "M", "↓"), + (0xFFED, "M", "■"), + (0xFFEE, "M", "○"), + (0xFFEF, "X"), + (0x10000, "V"), + (0x1000C, "X"), + (0x1000D, "V"), + (0x10027, "X"), + (0x10028, "V"), + (0x1003B, "X"), + (0x1003C, "V"), + (0x1003E, "X"), + (0x1003F, "V"), + (0x1004E, "X"), + (0x10050, "V"), + (0x1005E, "X"), + (0x10080, "V"), + (0x100FB, "X"), + (0x10100, "V"), + (0x10103, "X"), + (0x10107, "V"), + (0x10134, "X"), + (0x10137, "V"), + (0x1018F, "X"), + (0x10190, "V"), + (0x1019D, "X"), + (0x101A0, "V"), + (0x101A1, "X"), + (0x101D0, "V"), + (0x101FE, "X"), + (0x10280, "V"), + (0x1029D, "X"), + (0x102A0, "V"), + (0x102D1, "X"), + (0x102E0, "V"), + (0x102FC, "X"), + (0x10300, "V"), + (0x10324, "X"), + (0x1032D, "V"), + 
(0x1034B, "X"), + (0x10350, "V"), + (0x1037B, "X"), + (0x10380, "V"), + (0x1039E, "X"), + (0x1039F, "V"), + (0x103C4, "X"), + (0x103C8, "V"), + (0x103D6, "X"), + (0x10400, "M", "𐐨"), + (0x10401, "M", "𐐩"), + (0x10402, "M", "𐐪"), + (0x10403, "M", "𐐫"), + (0x10404, "M", "𐐬"), + (0x10405, "M", "𐐭"), + (0x10406, "M", "𐐮"), + (0x10407, "M", "𐐯"), + (0x10408, "M", "𐐰"), + (0x10409, "M", "𐐱"), + (0x1040A, "M", "𐐲"), + (0x1040B, "M", "𐐳"), + (0x1040C, "M", "𐐴"), + (0x1040D, "M", "𐐵"), + (0x1040E, "M", "𐐶"), + ] + + +def _seg_54() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ + (0x1040F, "M", "𐐷"), + (0x10410, "M", "𐐸"), + (0x10411, "M", "𐐹"), + (0x10412, "M", "𐐺"), + (0x10413, "M", "𐐻"), + (0x10414, "M", "𐐼"), + (0x10415, "M", "𐐽"), + (0x10416, "M", "𐐾"), + (0x10417, "M", "𐐿"), + (0x10418, "M", "𐑀"), + (0x10419, "M", "𐑁"), + (0x1041A, "M", "𐑂"), + (0x1041B, "M", "𐑃"), + (0x1041C, "M", "𐑄"), + (0x1041D, "M", "𐑅"), + (0x1041E, "M", "𐑆"), + (0x1041F, "M", "𐑇"), + (0x10420, "M", "𐑈"), + (0x10421, "M", "𐑉"), + (0x10422, "M", "𐑊"), + (0x10423, "M", "𐑋"), + (0x10424, "M", "𐑌"), + (0x10425, "M", "𐑍"), + (0x10426, "M", "𐑎"), + (0x10427, "M", "𐑏"), + (0x10428, "V"), + (0x1049E, "X"), + (0x104A0, "V"), + (0x104AA, "X"), + (0x104B0, "M", "𐓘"), + (0x104B1, "M", "𐓙"), + (0x104B2, "M", "𐓚"), + (0x104B3, "M", "𐓛"), + (0x104B4, "M", "𐓜"), + (0x104B5, "M", "𐓝"), + (0x104B6, "M", "𐓞"), + (0x104B7, "M", "𐓟"), + (0x104B8, "M", "𐓠"), + (0x104B9, "M", "𐓡"), + (0x104BA, "M", "𐓢"), + (0x104BB, "M", "𐓣"), + (0x104BC, "M", "𐓤"), + (0x104BD, "M", "𐓥"), + (0x104BE, "M", "𐓦"), + (0x104BF, "M", "𐓧"), + (0x104C0, "M", "𐓨"), + (0x104C1, "M", "𐓩"), + (0x104C2, "M", "𐓪"), + (0x104C3, "M", "𐓫"), + (0x104C4, "M", "𐓬"), + (0x104C5, "M", "𐓭"), + (0x104C6, "M", "𐓮"), + (0x104C7, "M", "𐓯"), + (0x104C8, "M", "𐓰"), + (0x104C9, "M", "𐓱"), + (0x104CA, "M", "𐓲"), + (0x104CB, "M", "𐓳"), + (0x104CC, "M", "𐓴"), + (0x104CD, "M", "𐓵"), + (0x104CE, "M", "𐓶"), + (0x104CF, "M", "𐓷"), + (0x104D0, "M", "𐓸"), 
+ (0x104D1, "M", "𐓹"), + (0x104D2, "M", "𐓺"), + (0x104D3, "M", "𐓻"), + (0x104D4, "X"), + (0x104D8, "V"), + (0x104FC, "X"), + (0x10500, "V"), + (0x10528, "X"), + (0x10530, "V"), + (0x10564, "X"), + (0x1056F, "V"), + (0x10570, "M", "𐖗"), + (0x10571, "M", "𐖘"), + (0x10572, "M", "𐖙"), + (0x10573, "M", "𐖚"), + (0x10574, "M", "𐖛"), + (0x10575, "M", "𐖜"), + (0x10576, "M", "𐖝"), + (0x10577, "M", "𐖞"), + (0x10578, "M", "𐖟"), + (0x10579, "M", "𐖠"), + (0x1057A, "M", "𐖡"), + (0x1057B, "X"), + (0x1057C, "M", "𐖣"), + (0x1057D, "M", "𐖤"), + (0x1057E, "M", "𐖥"), + (0x1057F, "M", "𐖦"), + (0x10580, "M", "𐖧"), + (0x10581, "M", "𐖨"), + (0x10582, "M", "𐖩"), + (0x10583, "M", "𐖪"), + (0x10584, "M", "𐖫"), + (0x10585, "M", "𐖬"), + (0x10586, "M", "𐖭"), + (0x10587, "M", "𐖮"), + (0x10588, "M", "𐖯"), + (0x10589, "M", "𐖰"), + (0x1058A, "M", "𐖱"), + ] + + +def _seg_55() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ + (0x1058B, "X"), + (0x1058C, "M", "𐖳"), + (0x1058D, "M", "𐖴"), + (0x1058E, "M", "𐖵"), + (0x1058F, "M", "𐖶"), + (0x10590, "M", "𐖷"), + (0x10591, "M", "𐖸"), + (0x10592, "M", "𐖹"), + (0x10593, "X"), + (0x10594, "M", "𐖻"), + (0x10595, "M", "𐖼"), + (0x10596, "X"), + (0x10597, "V"), + (0x105A2, "X"), + (0x105A3, "V"), + (0x105B2, "X"), + (0x105B3, "V"), + (0x105BA, "X"), + (0x105BB, "V"), + (0x105BD, "X"), + (0x105C0, "V"), + (0x105F4, "X"), + (0x10600, "V"), + (0x10737, "X"), + (0x10740, "V"), + (0x10756, "X"), + (0x10760, "V"), + (0x10768, "X"), + (0x10780, "V"), + (0x10781, "M", "ː"), + (0x10782, "M", "ˑ"), + (0x10783, "M", "æ"), + (0x10784, "M", "ʙ"), + (0x10785, "M", "ɓ"), + (0x10786, "X"), + (0x10787, "M", "ʣ"), + (0x10788, "M", "ꭦ"), + (0x10789, "M", "ʥ"), + (0x1078A, "M", "ʤ"), + (0x1078B, "M", "ɖ"), + (0x1078C, "M", "ɗ"), + (0x1078D, "M", "ᶑ"), + (0x1078E, "M", "ɘ"), + (0x1078F, "M", "ɞ"), + (0x10790, "M", "ʩ"), + (0x10791, "M", "ɤ"), + (0x10792, "M", "ɢ"), + (0x10793, "M", "ɠ"), + (0x10794, "M", "ʛ"), + (0x10795, "M", "ħ"), + (0x10796, "M", "ʜ"), + (0x10797, 
"M", "ɧ"), + (0x10798, "M", "ʄ"), + (0x10799, "M", "ʪ"), + (0x1079A, "M", "ʫ"), + (0x1079B, "M", "ɬ"), + (0x1079C, "M", "𝼄"), + (0x1079D, "M", "ꞎ"), + (0x1079E, "M", "ɮ"), + (0x1079F, "M", "𝼅"), + (0x107A0, "M", "ʎ"), + (0x107A1, "M", "𝼆"), + (0x107A2, "M", "ø"), + (0x107A3, "M", "ɶ"), + (0x107A4, "M", "ɷ"), + (0x107A5, "M", "q"), + (0x107A6, "M", "ɺ"), + (0x107A7, "M", "𝼈"), + (0x107A8, "M", "ɽ"), + (0x107A9, "M", "ɾ"), + (0x107AA, "M", "ʀ"), + (0x107AB, "M", "ʨ"), + (0x107AC, "M", "ʦ"), + (0x107AD, "M", "ꭧ"), + (0x107AE, "M", "ʧ"), + (0x107AF, "M", "ʈ"), + (0x107B0, "M", "ⱱ"), + (0x107B1, "X"), + (0x107B2, "M", "ʏ"), + (0x107B3, "M", "ʡ"), + (0x107B4, "M", "ʢ"), + (0x107B5, "M", "ʘ"), + (0x107B6, "M", "ǀ"), + (0x107B7, "M", "ǁ"), + (0x107B8, "M", "ǂ"), + (0x107B9, "M", "𝼊"), + (0x107BA, "M", "𝼞"), + (0x107BB, "X"), + (0x10800, "V"), + (0x10806, "X"), + (0x10808, "V"), + (0x10809, "X"), + (0x1080A, "V"), + (0x10836, "X"), + (0x10837, "V"), + (0x10839, "X"), + (0x1083C, "V"), + (0x1083D, "X"), + (0x1083F, "V"), + (0x10856, "X"), + ] + + +def _seg_56() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ + (0x10857, "V"), + (0x1089F, "X"), + (0x108A7, "V"), + (0x108B0, "X"), + (0x108E0, "V"), + (0x108F3, "X"), + (0x108F4, "V"), + (0x108F6, "X"), + (0x108FB, "V"), + (0x1091C, "X"), + (0x1091F, "V"), + (0x1093A, "X"), + (0x1093F, "V"), + (0x10940, "X"), + (0x10980, "V"), + (0x109B8, "X"), + (0x109BC, "V"), + (0x109D0, "X"), + (0x109D2, "V"), + (0x10A04, "X"), + (0x10A05, "V"), + (0x10A07, "X"), + (0x10A0C, "V"), + (0x10A14, "X"), + (0x10A15, "V"), + (0x10A18, "X"), + (0x10A19, "V"), + (0x10A36, "X"), + (0x10A38, "V"), + (0x10A3B, "X"), + (0x10A3F, "V"), + (0x10A49, "X"), + (0x10A50, "V"), + (0x10A59, "X"), + (0x10A60, "V"), + (0x10AA0, "X"), + (0x10AC0, "V"), + (0x10AE7, "X"), + (0x10AEB, "V"), + (0x10AF7, "X"), + (0x10B00, "V"), + (0x10B36, "X"), + (0x10B39, "V"), + (0x10B56, "X"), + (0x10B58, "V"), + (0x10B73, "X"), + (0x10B78, "V"), + (0x10B92, "X"), + 
(0x10B99, "V"), + (0x10B9D, "X"), + (0x10BA9, "V"), + (0x10BB0, "X"), + (0x10C00, "V"), + (0x10C49, "X"), + (0x10C80, "M", "𐳀"), + (0x10C81, "M", "𐳁"), + (0x10C82, "M", "𐳂"), + (0x10C83, "M", "𐳃"), + (0x10C84, "M", "𐳄"), + (0x10C85, "M", "𐳅"), + (0x10C86, "M", "𐳆"), + (0x10C87, "M", "𐳇"), + (0x10C88, "M", "𐳈"), + (0x10C89, "M", "𐳉"), + (0x10C8A, "M", "𐳊"), + (0x10C8B, "M", "𐳋"), + (0x10C8C, "M", "𐳌"), + (0x10C8D, "M", "𐳍"), + (0x10C8E, "M", "𐳎"), + (0x10C8F, "M", "𐳏"), + (0x10C90, "M", "𐳐"), + (0x10C91, "M", "𐳑"), + (0x10C92, "M", "𐳒"), + (0x10C93, "M", "𐳓"), + (0x10C94, "M", "𐳔"), + (0x10C95, "M", "𐳕"), + (0x10C96, "M", "𐳖"), + (0x10C97, "M", "𐳗"), + (0x10C98, "M", "𐳘"), + (0x10C99, "M", "𐳙"), + (0x10C9A, "M", "𐳚"), + (0x10C9B, "M", "𐳛"), + (0x10C9C, "M", "𐳜"), + (0x10C9D, "M", "𐳝"), + (0x10C9E, "M", "𐳞"), + (0x10C9F, "M", "𐳟"), + (0x10CA0, "M", "𐳠"), + (0x10CA1, "M", "𐳡"), + (0x10CA2, "M", "𐳢"), + (0x10CA3, "M", "𐳣"), + (0x10CA4, "M", "𐳤"), + (0x10CA5, "M", "𐳥"), + (0x10CA6, "M", "𐳦"), + (0x10CA7, "M", "𐳧"), + (0x10CA8, "M", "𐳨"), + (0x10CA9, "M", "𐳩"), + (0x10CAA, "M", "𐳪"), + (0x10CAB, "M", "𐳫"), + (0x10CAC, "M", "𐳬"), + (0x10CAD, "M", "𐳭"), + ] + + +def _seg_57() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ + (0x10CAE, "M", "𐳮"), + (0x10CAF, "M", "𐳯"), + (0x10CB0, "M", "𐳰"), + (0x10CB1, "M", "𐳱"), + (0x10CB2, "M", "𐳲"), + (0x10CB3, "X"), + (0x10CC0, "V"), + (0x10CF3, "X"), + (0x10CFA, "V"), + (0x10D28, "X"), + (0x10D30, "V"), + (0x10D3A, "X"), + (0x10D40, "V"), + (0x10D50, "M", "𐵰"), + (0x10D51, "M", "𐵱"), + (0x10D52, "M", "𐵲"), + (0x10D53, "M", "𐵳"), + (0x10D54, "M", "𐵴"), + (0x10D55, "M", "𐵵"), + (0x10D56, "M", "𐵶"), + (0x10D57, "M", "𐵷"), + (0x10D58, "M", "𐵸"), + (0x10D59, "M", "𐵹"), + (0x10D5A, "M", "𐵺"), + (0x10D5B, "M", "𐵻"), + (0x10D5C, "M", "𐵼"), + (0x10D5D, "M", "𐵽"), + (0x10D5E, "M", "𐵾"), + (0x10D5F, "M", "𐵿"), + (0x10D60, "M", "𐶀"), + (0x10D61, "M", "𐶁"), + (0x10D62, "M", "𐶂"), + (0x10D63, "M", "𐶃"), + (0x10D64, "M", "𐶄"), + 
(0x10D65, "M", "𐶅"), + (0x10D66, "X"), + (0x10D69, "V"), + (0x10D86, "X"), + (0x10D8E, "V"), + (0x10D90, "X"), + (0x10E60, "V"), + (0x10E7F, "X"), + (0x10E80, "V"), + (0x10EAA, "X"), + (0x10EAB, "V"), + (0x10EAE, "X"), + (0x10EB0, "V"), + (0x10EB2, "X"), + (0x10EC2, "V"), + (0x10EC5, "X"), + (0x10EFC, "V"), + (0x10F28, "X"), + (0x10F30, "V"), + (0x10F5A, "X"), + (0x10F70, "V"), + (0x10F8A, "X"), + (0x10FB0, "V"), + (0x10FCC, "X"), + (0x10FE0, "V"), + (0x10FF7, "X"), + (0x11000, "V"), + (0x1104E, "X"), + (0x11052, "V"), + (0x11076, "X"), + (0x1107F, "V"), + (0x110BD, "X"), + (0x110BE, "V"), + (0x110C3, "X"), + (0x110D0, "V"), + (0x110E9, "X"), + (0x110F0, "V"), + (0x110FA, "X"), + (0x11100, "V"), + (0x11135, "X"), + (0x11136, "V"), + (0x11148, "X"), + (0x11150, "V"), + (0x11177, "X"), + (0x11180, "V"), + (0x111E0, "X"), + (0x111E1, "V"), + (0x111F5, "X"), + (0x11200, "V"), + (0x11212, "X"), + (0x11213, "V"), + (0x11242, "X"), + (0x11280, "V"), + (0x11287, "X"), + (0x11288, "V"), + (0x11289, "X"), + (0x1128A, "V"), + (0x1128E, "X"), + (0x1128F, "V"), + (0x1129E, "X"), + (0x1129F, "V"), + (0x112AA, "X"), + (0x112B0, "V"), + (0x112EB, "X"), + (0x112F0, "V"), + (0x112FA, "X"), + ] + + +def _seg_58() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ + (0x11300, "V"), + (0x11304, "X"), + (0x11305, "V"), + (0x1130D, "X"), + (0x1130F, "V"), + (0x11311, "X"), + (0x11313, "V"), + (0x11329, "X"), + (0x1132A, "V"), + (0x11331, "X"), + (0x11332, "V"), + (0x11334, "X"), + (0x11335, "V"), + (0x1133A, "X"), + (0x1133B, "V"), + (0x11345, "X"), + (0x11347, "V"), + (0x11349, "X"), + (0x1134B, "V"), + (0x1134E, "X"), + (0x11350, "V"), + (0x11351, "X"), + (0x11357, "V"), + (0x11358, "X"), + (0x1135D, "V"), + (0x11364, "X"), + (0x11366, "V"), + (0x1136D, "X"), + (0x11370, "V"), + (0x11375, "X"), + (0x11380, "V"), + (0x1138A, "X"), + (0x1138B, "V"), + (0x1138C, "X"), + (0x1138E, "V"), + (0x1138F, "X"), + (0x11390, "V"), + (0x113B6, "X"), + (0x113B7, "V"), + (0x113C1, "X"), 
+ (0x113C2, "V"), + (0x113C3, "X"), + (0x113C5, "V"), + (0x113C6, "X"), + (0x113C7, "V"), + (0x113CB, "X"), + (0x113CC, "V"), + (0x113D6, "X"), + (0x113D7, "V"), + (0x113D9, "X"), + (0x113E1, "V"), + (0x113E3, "X"), + (0x11400, "V"), + (0x1145C, "X"), + (0x1145D, "V"), + (0x11462, "X"), + (0x11480, "V"), + (0x114C8, "X"), + (0x114D0, "V"), + (0x114DA, "X"), + (0x11580, "V"), + (0x115B6, "X"), + (0x115B8, "V"), + (0x115DE, "X"), + (0x11600, "V"), + (0x11645, "X"), + (0x11650, "V"), + (0x1165A, "X"), + (0x11660, "V"), + (0x1166D, "X"), + (0x11680, "V"), + (0x116BA, "X"), + (0x116C0, "V"), + (0x116CA, "X"), + (0x116D0, "V"), + (0x116E4, "X"), + (0x11700, "V"), + (0x1171B, "X"), + (0x1171D, "V"), + (0x1172C, "X"), + (0x11730, "V"), + (0x11747, "X"), + (0x11800, "V"), + (0x1183C, "X"), + (0x118A0, "M", "𑣀"), + (0x118A1, "M", "𑣁"), + (0x118A2, "M", "𑣂"), + (0x118A3, "M", "𑣃"), + (0x118A4, "M", "𑣄"), + (0x118A5, "M", "𑣅"), + (0x118A6, "M", "𑣆"), + (0x118A7, "M", "𑣇"), + (0x118A8, "M", "𑣈"), + (0x118A9, "M", "𑣉"), + (0x118AA, "M", "𑣊"), + (0x118AB, "M", "𑣋"), + (0x118AC, "M", "𑣌"), + (0x118AD, "M", "𑣍"), + (0x118AE, "M", "𑣎"), + (0x118AF, "M", "𑣏"), + ] + + +def _seg_59() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ + (0x118B0, "M", "𑣐"), + (0x118B1, "M", "𑣑"), + (0x118B2, "M", "𑣒"), + (0x118B3, "M", "𑣓"), + (0x118B4, "M", "𑣔"), + (0x118B5, "M", "𑣕"), + (0x118B6, "M", "𑣖"), + (0x118B7, "M", "𑣗"), + (0x118B8, "M", "𑣘"), + (0x118B9, "M", "𑣙"), + (0x118BA, "M", "𑣚"), + (0x118BB, "M", "𑣛"), + (0x118BC, "M", "𑣜"), + (0x118BD, "M", "𑣝"), + (0x118BE, "M", "𑣞"), + (0x118BF, "M", "𑣟"), + (0x118C0, "V"), + (0x118F3, "X"), + (0x118FF, "V"), + (0x11907, "X"), + (0x11909, "V"), + (0x1190A, "X"), + (0x1190C, "V"), + (0x11914, "X"), + (0x11915, "V"), + (0x11917, "X"), + (0x11918, "V"), + (0x11936, "X"), + (0x11937, "V"), + (0x11939, "X"), + (0x1193B, "V"), + (0x11947, "X"), + (0x11950, "V"), + (0x1195A, "X"), + (0x119A0, "V"), + (0x119A8, "X"), + (0x119AA, "V"), + 
(0x119D8, "X"), + (0x119DA, "V"), + (0x119E5, "X"), + (0x11A00, "V"), + (0x11A48, "X"), + (0x11A50, "V"), + (0x11AA3, "X"), + (0x11AB0, "V"), + (0x11AF9, "X"), + (0x11B00, "V"), + (0x11B0A, "X"), + (0x11BC0, "V"), + (0x11BE2, "X"), + (0x11BF0, "V"), + (0x11BFA, "X"), + (0x11C00, "V"), + (0x11C09, "X"), + (0x11C0A, "V"), + (0x11C37, "X"), + (0x11C38, "V"), + (0x11C46, "X"), + (0x11C50, "V"), + (0x11C6D, "X"), + (0x11C70, "V"), + (0x11C90, "X"), + (0x11C92, "V"), + (0x11CA8, "X"), + (0x11CA9, "V"), + (0x11CB7, "X"), + (0x11D00, "V"), + (0x11D07, "X"), + (0x11D08, "V"), + (0x11D0A, "X"), + (0x11D0B, "V"), + (0x11D37, "X"), + (0x11D3A, "V"), + (0x11D3B, "X"), + (0x11D3C, "V"), + (0x11D3E, "X"), + (0x11D3F, "V"), + (0x11D48, "X"), + (0x11D50, "V"), + (0x11D5A, "X"), + (0x11D60, "V"), + (0x11D66, "X"), + (0x11D67, "V"), + (0x11D69, "X"), + (0x11D6A, "V"), + (0x11D8F, "X"), + (0x11D90, "V"), + (0x11D92, "X"), + (0x11D93, "V"), + (0x11D99, "X"), + (0x11DA0, "V"), + (0x11DAA, "X"), + (0x11EE0, "V"), + (0x11EF9, "X"), + (0x11F00, "V"), + (0x11F11, "X"), + (0x11F12, "V"), + (0x11F3B, "X"), + (0x11F3E, "V"), + (0x11F5B, "X"), + ] + + +def _seg_60() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ + (0x11FB0, "V"), + (0x11FB1, "X"), + (0x11FC0, "V"), + (0x11FF2, "X"), + (0x11FFF, "V"), + (0x1239A, "X"), + (0x12400, "V"), + (0x1246F, "X"), + (0x12470, "V"), + (0x12475, "X"), + (0x12480, "V"), + (0x12544, "X"), + (0x12F90, "V"), + (0x12FF3, "X"), + (0x13000, "V"), + (0x13430, "X"), + (0x13440, "V"), + (0x13456, "X"), + (0x13460, "V"), + (0x143FB, "X"), + (0x14400, "V"), + (0x14647, "X"), + (0x16100, "V"), + (0x1613A, "X"), + (0x16800, "V"), + (0x16A39, "X"), + (0x16A40, "V"), + (0x16A5F, "X"), + (0x16A60, "V"), + (0x16A6A, "X"), + (0x16A6E, "V"), + (0x16ABF, "X"), + (0x16AC0, "V"), + (0x16ACA, "X"), + (0x16AD0, "V"), + (0x16AEE, "X"), + (0x16AF0, "V"), + (0x16AF6, "X"), + (0x16B00, "V"), + (0x16B46, "X"), + (0x16B50, "V"), + (0x16B5A, "X"), + (0x16B5B, "V"), + 
(0x16B62, "X"), + (0x16B63, "V"), + (0x16B78, "X"), + (0x16B7D, "V"), + (0x16B90, "X"), + (0x16D40, "V"), + (0x16D7A, "X"), + (0x16E40, "M", "𖹠"), + (0x16E41, "M", "𖹡"), + (0x16E42, "M", "𖹢"), + (0x16E43, "M", "𖹣"), + (0x16E44, "M", "𖹤"), + (0x16E45, "M", "𖹥"), + (0x16E46, "M", "𖹦"), + (0x16E47, "M", "𖹧"), + (0x16E48, "M", "𖹨"), + (0x16E49, "M", "𖹩"), + (0x16E4A, "M", "𖹪"), + (0x16E4B, "M", "𖹫"), + (0x16E4C, "M", "𖹬"), + (0x16E4D, "M", "𖹭"), + (0x16E4E, "M", "𖹮"), + (0x16E4F, "M", "𖹯"), + (0x16E50, "M", "𖹰"), + (0x16E51, "M", "𖹱"), + (0x16E52, "M", "𖹲"), + (0x16E53, "M", "𖹳"), + (0x16E54, "M", "𖹴"), + (0x16E55, "M", "𖹵"), + (0x16E56, "M", "𖹶"), + (0x16E57, "M", "𖹷"), + (0x16E58, "M", "𖹸"), + (0x16E59, "M", "𖹹"), + (0x16E5A, "M", "𖹺"), + (0x16E5B, "M", "𖹻"), + (0x16E5C, "M", "𖹼"), + (0x16E5D, "M", "𖹽"), + (0x16E5E, "M", "𖹾"), + (0x16E5F, "M", "𖹿"), + (0x16E60, "V"), + (0x16E9B, "X"), + (0x16F00, "V"), + (0x16F4B, "X"), + (0x16F4F, "V"), + (0x16F88, "X"), + (0x16F8F, "V"), + (0x16FA0, "X"), + (0x16FE0, "V"), + (0x16FE5, "X"), + (0x16FF0, "V"), + (0x16FF2, "X"), + (0x17000, "V"), + (0x187F8, "X"), + (0x18800, "V"), + (0x18CD6, "X"), + (0x18CFF, "V"), + (0x18D09, "X"), + ] + + +def _seg_61() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ + (0x1AFF0, "V"), + (0x1AFF4, "X"), + (0x1AFF5, "V"), + (0x1AFFC, "X"), + (0x1AFFD, "V"), + (0x1AFFF, "X"), + (0x1B000, "V"), + (0x1B123, "X"), + (0x1B132, "V"), + (0x1B133, "X"), + (0x1B150, "V"), + (0x1B153, "X"), + (0x1B155, "V"), + (0x1B156, "X"), + (0x1B164, "V"), + (0x1B168, "X"), + (0x1B170, "V"), + (0x1B2FC, "X"), + (0x1BC00, "V"), + (0x1BC6B, "X"), + (0x1BC70, "V"), + (0x1BC7D, "X"), + (0x1BC80, "V"), + (0x1BC89, "X"), + (0x1BC90, "V"), + (0x1BC9A, "X"), + (0x1BC9C, "V"), + (0x1BCA0, "I"), + (0x1BCA4, "X"), + (0x1CC00, "V"), + (0x1CCD6, "M", "a"), + (0x1CCD7, "M", "b"), + (0x1CCD8, "M", "c"), + (0x1CCD9, "M", "d"), + (0x1CCDA, "M", "e"), + (0x1CCDB, "M", "f"), + (0x1CCDC, "M", "g"), + (0x1CCDD, "M", "h"), + 
(0x1CCDE, "M", "i"), + (0x1CCDF, "M", "j"), + (0x1CCE0, "M", "k"), + (0x1CCE1, "M", "l"), + (0x1CCE2, "M", "m"), + (0x1CCE3, "M", "n"), + (0x1CCE4, "M", "o"), + (0x1CCE5, "M", "p"), + (0x1CCE6, "M", "q"), + (0x1CCE7, "M", "r"), + (0x1CCE8, "M", "s"), + (0x1CCE9, "M", "t"), + (0x1CCEA, "M", "u"), + (0x1CCEB, "M", "v"), + (0x1CCEC, "M", "w"), + (0x1CCED, "M", "x"), + (0x1CCEE, "M", "y"), + (0x1CCEF, "M", "z"), + (0x1CCF0, "M", "0"), + (0x1CCF1, "M", "1"), + (0x1CCF2, "M", "2"), + (0x1CCF3, "M", "3"), + (0x1CCF4, "M", "4"), + (0x1CCF5, "M", "5"), + (0x1CCF6, "M", "6"), + (0x1CCF7, "M", "7"), + (0x1CCF8, "M", "8"), + (0x1CCF9, "M", "9"), + (0x1CCFA, "X"), + (0x1CD00, "V"), + (0x1CEB4, "X"), + (0x1CF00, "V"), + (0x1CF2E, "X"), + (0x1CF30, "V"), + (0x1CF47, "X"), + (0x1CF50, "V"), + (0x1CFC4, "X"), + (0x1D000, "V"), + (0x1D0F6, "X"), + (0x1D100, "V"), + (0x1D127, "X"), + (0x1D129, "V"), + (0x1D15E, "M", "𝅗𝅥"), + (0x1D15F, "M", "𝅘𝅥"), + (0x1D160, "M", "𝅘𝅥𝅮"), + (0x1D161, "M", "𝅘𝅥𝅯"), + (0x1D162, "M", "𝅘𝅥𝅰"), + (0x1D163, "M", "𝅘𝅥𝅱"), + (0x1D164, "M", "𝅘𝅥𝅲"), + (0x1D165, "V"), + (0x1D173, "I"), + (0x1D17B, "V"), + (0x1D1BB, "M", "𝆹𝅥"), + (0x1D1BC, "M", "𝆺𝅥"), + (0x1D1BD, "M", "𝆹𝅥𝅮"), + (0x1D1BE, "M", "𝆺𝅥𝅮"), + (0x1D1BF, "M", "𝆹𝅥𝅯"), + (0x1D1C0, "M", "𝆺𝅥𝅯"), + (0x1D1C1, "V"), + (0x1D1EB, "X"), + (0x1D200, "V"), + (0x1D246, "X"), + ] + + +def _seg_62() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ + (0x1D2C0, "V"), + (0x1D2D4, "X"), + (0x1D2E0, "V"), + (0x1D2F4, "X"), + (0x1D300, "V"), + (0x1D357, "X"), + (0x1D360, "V"), + (0x1D379, "X"), + (0x1D400, "M", "a"), + (0x1D401, "M", "b"), + (0x1D402, "M", "c"), + (0x1D403, "M", "d"), + (0x1D404, "M", "e"), + (0x1D405, "M", "f"), + (0x1D406, "M", "g"), + (0x1D407, "M", "h"), + (0x1D408, "M", "i"), + (0x1D409, "M", "j"), + (0x1D40A, "M", "k"), + (0x1D40B, "M", "l"), + (0x1D40C, "M", "m"), + (0x1D40D, "M", "n"), + (0x1D40E, "M", "o"), + (0x1D40F, "M", "p"), + (0x1D410, "M", "q"), + (0x1D411, "M", "r"), + (0x1D412, 
"M", "s"), + (0x1D413, "M", "t"), + (0x1D414, "M", "u"), + (0x1D415, "M", "v"), + (0x1D416, "M", "w"), + (0x1D417, "M", "x"), + (0x1D418, "M", "y"), + (0x1D419, "M", "z"), + (0x1D41A, "M", "a"), + (0x1D41B, "M", "b"), + (0x1D41C, "M", "c"), + (0x1D41D, "M", "d"), + (0x1D41E, "M", "e"), + (0x1D41F, "M", "f"), + (0x1D420, "M", "g"), + (0x1D421, "M", "h"), + (0x1D422, "M", "i"), + (0x1D423, "M", "j"), + (0x1D424, "M", "k"), + (0x1D425, "M", "l"), + (0x1D426, "M", "m"), + (0x1D427, "M", "n"), + (0x1D428, "M", "o"), + (0x1D429, "M", "p"), + (0x1D42A, "M", "q"), + (0x1D42B, "M", "r"), + (0x1D42C, "M", "s"), + (0x1D42D, "M", "t"), + (0x1D42E, "M", "u"), + (0x1D42F, "M", "v"), + (0x1D430, "M", "w"), + (0x1D431, "M", "x"), + (0x1D432, "M", "y"), + (0x1D433, "M", "z"), + (0x1D434, "M", "a"), + (0x1D435, "M", "b"), + (0x1D436, "M", "c"), + (0x1D437, "M", "d"), + (0x1D438, "M", "e"), + (0x1D439, "M", "f"), + (0x1D43A, "M", "g"), + (0x1D43B, "M", "h"), + (0x1D43C, "M", "i"), + (0x1D43D, "M", "j"), + (0x1D43E, "M", "k"), + (0x1D43F, "M", "l"), + (0x1D440, "M", "m"), + (0x1D441, "M", "n"), + (0x1D442, "M", "o"), + (0x1D443, "M", "p"), + (0x1D444, "M", "q"), + (0x1D445, "M", "r"), + (0x1D446, "M", "s"), + (0x1D447, "M", "t"), + (0x1D448, "M", "u"), + (0x1D449, "M", "v"), + (0x1D44A, "M", "w"), + (0x1D44B, "M", "x"), + (0x1D44C, "M", "y"), + (0x1D44D, "M", "z"), + (0x1D44E, "M", "a"), + (0x1D44F, "M", "b"), + (0x1D450, "M", "c"), + (0x1D451, "M", "d"), + (0x1D452, "M", "e"), + (0x1D453, "M", "f"), + (0x1D454, "M", "g"), + (0x1D455, "X"), + (0x1D456, "M", "i"), + (0x1D457, "M", "j"), + (0x1D458, "M", "k"), + (0x1D459, "M", "l"), + (0x1D45A, "M", "m"), + (0x1D45B, "M", "n"), + ] + + +def _seg_63() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ + (0x1D45C, "M", "o"), + (0x1D45D, "M", "p"), + (0x1D45E, "M", "q"), + (0x1D45F, "M", "r"), + (0x1D460, "M", "s"), + (0x1D461, "M", "t"), + (0x1D462, "M", "u"), + (0x1D463, "M", "v"), + (0x1D464, "M", "w"), + (0x1D465, "M", 
"x"), + (0x1D466, "M", "y"), + (0x1D467, "M", "z"), + (0x1D468, "M", "a"), + (0x1D469, "M", "b"), + (0x1D46A, "M", "c"), + (0x1D46B, "M", "d"), + (0x1D46C, "M", "e"), + (0x1D46D, "M", "f"), + (0x1D46E, "M", "g"), + (0x1D46F, "M", "h"), + (0x1D470, "M", "i"), + (0x1D471, "M", "j"), + (0x1D472, "M", "k"), + (0x1D473, "M", "l"), + (0x1D474, "M", "m"), + (0x1D475, "M", "n"), + (0x1D476, "M", "o"), + (0x1D477, "M", "p"), + (0x1D478, "M", "q"), + (0x1D479, "M", "r"), + (0x1D47A, "M", "s"), + (0x1D47B, "M", "t"), + (0x1D47C, "M", "u"), + (0x1D47D, "M", "v"), + (0x1D47E, "M", "w"), + (0x1D47F, "M", "x"), + (0x1D480, "M", "y"), + (0x1D481, "M", "z"), + (0x1D482, "M", "a"), + (0x1D483, "M", "b"), + (0x1D484, "M", "c"), + (0x1D485, "M", "d"), + (0x1D486, "M", "e"), + (0x1D487, "M", "f"), + (0x1D488, "M", "g"), + (0x1D489, "M", "h"), + (0x1D48A, "M", "i"), + (0x1D48B, "M", "j"), + (0x1D48C, "M", "k"), + (0x1D48D, "M", "l"), + (0x1D48E, "M", "m"), + (0x1D48F, "M", "n"), + (0x1D490, "M", "o"), + (0x1D491, "M", "p"), + (0x1D492, "M", "q"), + (0x1D493, "M", "r"), + (0x1D494, "M", "s"), + (0x1D495, "M", "t"), + (0x1D496, "M", "u"), + (0x1D497, "M", "v"), + (0x1D498, "M", "w"), + (0x1D499, "M", "x"), + (0x1D49A, "M", "y"), + (0x1D49B, "M", "z"), + (0x1D49C, "M", "a"), + (0x1D49D, "X"), + (0x1D49E, "M", "c"), + (0x1D49F, "M", "d"), + (0x1D4A0, "X"), + (0x1D4A2, "M", "g"), + (0x1D4A3, "X"), + (0x1D4A5, "M", "j"), + (0x1D4A6, "M", "k"), + (0x1D4A7, "X"), + (0x1D4A9, "M", "n"), + (0x1D4AA, "M", "o"), + (0x1D4AB, "M", "p"), + (0x1D4AC, "M", "q"), + (0x1D4AD, "X"), + (0x1D4AE, "M", "s"), + (0x1D4AF, "M", "t"), + (0x1D4B0, "M", "u"), + (0x1D4B1, "M", "v"), + (0x1D4B2, "M", "w"), + (0x1D4B3, "M", "x"), + (0x1D4B4, "M", "y"), + (0x1D4B5, "M", "z"), + (0x1D4B6, "M", "a"), + (0x1D4B7, "M", "b"), + (0x1D4B8, "M", "c"), + (0x1D4B9, "M", "d"), + (0x1D4BA, "X"), + (0x1D4BB, "M", "f"), + (0x1D4BC, "X"), + (0x1D4BD, "M", "h"), + (0x1D4BE, "M", "i"), + (0x1D4BF, "M", "j"), + (0x1D4C0, "M", "k"), + 
(0x1D4C1, "M", "l"), + (0x1D4C2, "M", "m"), + ] + + +def _seg_64() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ + (0x1D4C3, "M", "n"), + (0x1D4C4, "X"), + (0x1D4C5, "M", "p"), + (0x1D4C6, "M", "q"), + (0x1D4C7, "M", "r"), + (0x1D4C8, "M", "s"), + (0x1D4C9, "M", "t"), + (0x1D4CA, "M", "u"), + (0x1D4CB, "M", "v"), + (0x1D4CC, "M", "w"), + (0x1D4CD, "M", "x"), + (0x1D4CE, "M", "y"), + (0x1D4CF, "M", "z"), + (0x1D4D0, "M", "a"), + (0x1D4D1, "M", "b"), + (0x1D4D2, "M", "c"), + (0x1D4D3, "M", "d"), + (0x1D4D4, "M", "e"), + (0x1D4D5, "M", "f"), + (0x1D4D6, "M", "g"), + (0x1D4D7, "M", "h"), + (0x1D4D8, "M", "i"), + (0x1D4D9, "M", "j"), + (0x1D4DA, "M", "k"), + (0x1D4DB, "M", "l"), + (0x1D4DC, "M", "m"), + (0x1D4DD, "M", "n"), + (0x1D4DE, "M", "o"), + (0x1D4DF, "M", "p"), + (0x1D4E0, "M", "q"), + (0x1D4E1, "M", "r"), + (0x1D4E2, "M", "s"), + (0x1D4E3, "M", "t"), + (0x1D4E4, "M", "u"), + (0x1D4E5, "M", "v"), + (0x1D4E6, "M", "w"), + (0x1D4E7, "M", "x"), + (0x1D4E8, "M", "y"), + (0x1D4E9, "M", "z"), + (0x1D4EA, "M", "a"), + (0x1D4EB, "M", "b"), + (0x1D4EC, "M", "c"), + (0x1D4ED, "M", "d"), + (0x1D4EE, "M", "e"), + (0x1D4EF, "M", "f"), + (0x1D4F0, "M", "g"), + (0x1D4F1, "M", "h"), + (0x1D4F2, "M", "i"), + (0x1D4F3, "M", "j"), + (0x1D4F4, "M", "k"), + (0x1D4F5, "M", "l"), + (0x1D4F6, "M", "m"), + (0x1D4F7, "M", "n"), + (0x1D4F8, "M", "o"), + (0x1D4F9, "M", "p"), + (0x1D4FA, "M", "q"), + (0x1D4FB, "M", "r"), + (0x1D4FC, "M", "s"), + (0x1D4FD, "M", "t"), + (0x1D4FE, "M", "u"), + (0x1D4FF, "M", "v"), + (0x1D500, "M", "w"), + (0x1D501, "M", "x"), + (0x1D502, "M", "y"), + (0x1D503, "M", "z"), + (0x1D504, "M", "a"), + (0x1D505, "M", "b"), + (0x1D506, "X"), + (0x1D507, "M", "d"), + (0x1D508, "M", "e"), + (0x1D509, "M", "f"), + (0x1D50A, "M", "g"), + (0x1D50B, "X"), + (0x1D50D, "M", "j"), + (0x1D50E, "M", "k"), + (0x1D50F, "M", "l"), + (0x1D510, "M", "m"), + (0x1D511, "M", "n"), + (0x1D512, "M", "o"), + (0x1D513, "M", "p"), + (0x1D514, "M", "q"), + (0x1D515, "X"), 
+ (0x1D516, "M", "s"), + (0x1D517, "M", "t"), + (0x1D518, "M", "u"), + (0x1D519, "M", "v"), + (0x1D51A, "M", "w"), + (0x1D51B, "M", "x"), + (0x1D51C, "M", "y"), + (0x1D51D, "X"), + (0x1D51E, "M", "a"), + (0x1D51F, "M", "b"), + (0x1D520, "M", "c"), + (0x1D521, "M", "d"), + (0x1D522, "M", "e"), + (0x1D523, "M", "f"), + (0x1D524, "M", "g"), + (0x1D525, "M", "h"), + (0x1D526, "M", "i"), + (0x1D527, "M", "j"), + ] + + +def _seg_65() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ + (0x1D528, "M", "k"), + (0x1D529, "M", "l"), + (0x1D52A, "M", "m"), + (0x1D52B, "M", "n"), + (0x1D52C, "M", "o"), + (0x1D52D, "M", "p"), + (0x1D52E, "M", "q"), + (0x1D52F, "M", "r"), + (0x1D530, "M", "s"), + (0x1D531, "M", "t"), + (0x1D532, "M", "u"), + (0x1D533, "M", "v"), + (0x1D534, "M", "w"), + (0x1D535, "M", "x"), + (0x1D536, "M", "y"), + (0x1D537, "M", "z"), + (0x1D538, "M", "a"), + (0x1D539, "M", "b"), + (0x1D53A, "X"), + (0x1D53B, "M", "d"), + (0x1D53C, "M", "e"), + (0x1D53D, "M", "f"), + (0x1D53E, "M", "g"), + (0x1D53F, "X"), + (0x1D540, "M", "i"), + (0x1D541, "M", "j"), + (0x1D542, "M", "k"), + (0x1D543, "M", "l"), + (0x1D544, "M", "m"), + (0x1D545, "X"), + (0x1D546, "M", "o"), + (0x1D547, "X"), + (0x1D54A, "M", "s"), + (0x1D54B, "M", "t"), + (0x1D54C, "M", "u"), + (0x1D54D, "M", "v"), + (0x1D54E, "M", "w"), + (0x1D54F, "M", "x"), + (0x1D550, "M", "y"), + (0x1D551, "X"), + (0x1D552, "M", "a"), + (0x1D553, "M", "b"), + (0x1D554, "M", "c"), + (0x1D555, "M", "d"), + (0x1D556, "M", "e"), + (0x1D557, "M", "f"), + (0x1D558, "M", "g"), + (0x1D559, "M", "h"), + (0x1D55A, "M", "i"), + (0x1D55B, "M", "j"), + (0x1D55C, "M", "k"), + (0x1D55D, "M", "l"), + (0x1D55E, "M", "m"), + (0x1D55F, "M", "n"), + (0x1D560, "M", "o"), + (0x1D561, "M", "p"), + (0x1D562, "M", "q"), + (0x1D563, "M", "r"), + (0x1D564, "M", "s"), + (0x1D565, "M", "t"), + (0x1D566, "M", "u"), + (0x1D567, "M", "v"), + (0x1D568, "M", "w"), + (0x1D569, "M", "x"), + (0x1D56A, "M", "y"), + (0x1D56B, "M", "z"), + 
(0x1D56C, "M", "a"), + (0x1D56D, "M", "b"), + (0x1D56E, "M", "c"), + (0x1D56F, "M", "d"), + (0x1D570, "M", "e"), + (0x1D571, "M", "f"), + (0x1D572, "M", "g"), + (0x1D573, "M", "h"), + (0x1D574, "M", "i"), + (0x1D575, "M", "j"), + (0x1D576, "M", "k"), + (0x1D577, "M", "l"), + (0x1D578, "M", "m"), + (0x1D579, "M", "n"), + (0x1D57A, "M", "o"), + (0x1D57B, "M", "p"), + (0x1D57C, "M", "q"), + (0x1D57D, "M", "r"), + (0x1D57E, "M", "s"), + (0x1D57F, "M", "t"), + (0x1D580, "M", "u"), + (0x1D581, "M", "v"), + (0x1D582, "M", "w"), + (0x1D583, "M", "x"), + (0x1D584, "M", "y"), + (0x1D585, "M", "z"), + (0x1D586, "M", "a"), + (0x1D587, "M", "b"), + (0x1D588, "M", "c"), + (0x1D589, "M", "d"), + (0x1D58A, "M", "e"), + (0x1D58B, "M", "f"), + (0x1D58C, "M", "g"), + (0x1D58D, "M", "h"), + ] + + +def _seg_66() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ + (0x1D58E, "M", "i"), + (0x1D58F, "M", "j"), + (0x1D590, "M", "k"), + (0x1D591, "M", "l"), + (0x1D592, "M", "m"), + (0x1D593, "M", "n"), + (0x1D594, "M", "o"), + (0x1D595, "M", "p"), + (0x1D596, "M", "q"), + (0x1D597, "M", "r"), + (0x1D598, "M", "s"), + (0x1D599, "M", "t"), + (0x1D59A, "M", "u"), + (0x1D59B, "M", "v"), + (0x1D59C, "M", "w"), + (0x1D59D, "M", "x"), + (0x1D59E, "M", "y"), + (0x1D59F, "M", "z"), + (0x1D5A0, "M", "a"), + (0x1D5A1, "M", "b"), + (0x1D5A2, "M", "c"), + (0x1D5A3, "M", "d"), + (0x1D5A4, "M", "e"), + (0x1D5A5, "M", "f"), + (0x1D5A6, "M", "g"), + (0x1D5A7, "M", "h"), + (0x1D5A8, "M", "i"), + (0x1D5A9, "M", "j"), + (0x1D5AA, "M", "k"), + (0x1D5AB, "M", "l"), + (0x1D5AC, "M", "m"), + (0x1D5AD, "M", "n"), + (0x1D5AE, "M", "o"), + (0x1D5AF, "M", "p"), + (0x1D5B0, "M", "q"), + (0x1D5B1, "M", "r"), + (0x1D5B2, "M", "s"), + (0x1D5B3, "M", "t"), + (0x1D5B4, "M", "u"), + (0x1D5B5, "M", "v"), + (0x1D5B6, "M", "w"), + (0x1D5B7, "M", "x"), + (0x1D5B8, "M", "y"), + (0x1D5B9, "M", "z"), + (0x1D5BA, "M", "a"), + (0x1D5BB, "M", "b"), + (0x1D5BC, "M", "c"), + (0x1D5BD, "M", "d"), + (0x1D5BE, "M", "e"), + 
(0x1D5BF, "M", "f"), + (0x1D5C0, "M", "g"), + (0x1D5C1, "M", "h"), + (0x1D5C2, "M", "i"), + (0x1D5C3, "M", "j"), + (0x1D5C4, "M", "k"), + (0x1D5C5, "M", "l"), + (0x1D5C6, "M", "m"), + (0x1D5C7, "M", "n"), + (0x1D5C8, "M", "o"), + (0x1D5C9, "M", "p"), + (0x1D5CA, "M", "q"), + (0x1D5CB, "M", "r"), + (0x1D5CC, "M", "s"), + (0x1D5CD, "M", "t"), + (0x1D5CE, "M", "u"), + (0x1D5CF, "M", "v"), + (0x1D5D0, "M", "w"), + (0x1D5D1, "M", "x"), + (0x1D5D2, "M", "y"), + (0x1D5D3, "M", "z"), + (0x1D5D4, "M", "a"), + (0x1D5D5, "M", "b"), + (0x1D5D6, "M", "c"), + (0x1D5D7, "M", "d"), + (0x1D5D8, "M", "e"), + (0x1D5D9, "M", "f"), + (0x1D5DA, "M", "g"), + (0x1D5DB, "M", "h"), + (0x1D5DC, "M", "i"), + (0x1D5DD, "M", "j"), + (0x1D5DE, "M", "k"), + (0x1D5DF, "M", "l"), + (0x1D5E0, "M", "m"), + (0x1D5E1, "M", "n"), + (0x1D5E2, "M", "o"), + (0x1D5E3, "M", "p"), + (0x1D5E4, "M", "q"), + (0x1D5E5, "M", "r"), + (0x1D5E6, "M", "s"), + (0x1D5E7, "M", "t"), + (0x1D5E8, "M", "u"), + (0x1D5E9, "M", "v"), + (0x1D5EA, "M", "w"), + (0x1D5EB, "M", "x"), + (0x1D5EC, "M", "y"), + (0x1D5ED, "M", "z"), + (0x1D5EE, "M", "a"), + (0x1D5EF, "M", "b"), + (0x1D5F0, "M", "c"), + (0x1D5F1, "M", "d"), + ] + + +def _seg_67() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ + (0x1D5F2, "M", "e"), + (0x1D5F3, "M", "f"), + (0x1D5F4, "M", "g"), + (0x1D5F5, "M", "h"), + (0x1D5F6, "M", "i"), + (0x1D5F7, "M", "j"), + (0x1D5F8, "M", "k"), + (0x1D5F9, "M", "l"), + (0x1D5FA, "M", "m"), + (0x1D5FB, "M", "n"), + (0x1D5FC, "M", "o"), + (0x1D5FD, "M", "p"), + (0x1D5FE, "M", "q"), + (0x1D5FF, "M", "r"), + (0x1D600, "M", "s"), + (0x1D601, "M", "t"), + (0x1D602, "M", "u"), + (0x1D603, "M", "v"), + (0x1D604, "M", "w"), + (0x1D605, "M", "x"), + (0x1D606, "M", "y"), + (0x1D607, "M", "z"), + (0x1D608, "M", "a"), + (0x1D609, "M", "b"), + (0x1D60A, "M", "c"), + (0x1D60B, "M", "d"), + (0x1D60C, "M", "e"), + (0x1D60D, "M", "f"), + (0x1D60E, "M", "g"), + (0x1D60F, "M", "h"), + (0x1D610, "M", "i"), + (0x1D611, "M", "j"), + 
(0x1D612, "M", "k"), + (0x1D613, "M", "l"), + (0x1D614, "M", "m"), + (0x1D615, "M", "n"), + (0x1D616, "M", "o"), + (0x1D617, "M", "p"), + (0x1D618, "M", "q"), + (0x1D619, "M", "r"), + (0x1D61A, "M", "s"), + (0x1D61B, "M", "t"), + (0x1D61C, "M", "u"), + (0x1D61D, "M", "v"), + (0x1D61E, "M", "w"), + (0x1D61F, "M", "x"), + (0x1D620, "M", "y"), + (0x1D621, "M", "z"), + (0x1D622, "M", "a"), + (0x1D623, "M", "b"), + (0x1D624, "M", "c"), + (0x1D625, "M", "d"), + (0x1D626, "M", "e"), + (0x1D627, "M", "f"), + (0x1D628, "M", "g"), + (0x1D629, "M", "h"), + (0x1D62A, "M", "i"), + (0x1D62B, "M", "j"), + (0x1D62C, "M", "k"), + (0x1D62D, "M", "l"), + (0x1D62E, "M", "m"), + (0x1D62F, "M", "n"), + (0x1D630, "M", "o"), + (0x1D631, "M", "p"), + (0x1D632, "M", "q"), + (0x1D633, "M", "r"), + (0x1D634, "M", "s"), + (0x1D635, "M", "t"), + (0x1D636, "M", "u"), + (0x1D637, "M", "v"), + (0x1D638, "M", "w"), + (0x1D639, "M", "x"), + (0x1D63A, "M", "y"), + (0x1D63B, "M", "z"), + (0x1D63C, "M", "a"), + (0x1D63D, "M", "b"), + (0x1D63E, "M", "c"), + (0x1D63F, "M", "d"), + (0x1D640, "M", "e"), + (0x1D641, "M", "f"), + (0x1D642, "M", "g"), + (0x1D643, "M", "h"), + (0x1D644, "M", "i"), + (0x1D645, "M", "j"), + (0x1D646, "M", "k"), + (0x1D647, "M", "l"), + (0x1D648, "M", "m"), + (0x1D649, "M", "n"), + (0x1D64A, "M", "o"), + (0x1D64B, "M", "p"), + (0x1D64C, "M", "q"), + (0x1D64D, "M", "r"), + (0x1D64E, "M", "s"), + (0x1D64F, "M", "t"), + (0x1D650, "M", "u"), + (0x1D651, "M", "v"), + (0x1D652, "M", "w"), + (0x1D653, "M", "x"), + (0x1D654, "M", "y"), + (0x1D655, "M", "z"), + ] + + +def _seg_68() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ + (0x1D656, "M", "a"), + (0x1D657, "M", "b"), + (0x1D658, "M", "c"), + (0x1D659, "M", "d"), + (0x1D65A, "M", "e"), + (0x1D65B, "M", "f"), + (0x1D65C, "M", "g"), + (0x1D65D, "M", "h"), + (0x1D65E, "M", "i"), + (0x1D65F, "M", "j"), + (0x1D660, "M", "k"), + (0x1D661, "M", "l"), + (0x1D662, "M", "m"), + (0x1D663, "M", "n"), + (0x1D664, "M", "o"), + 
(0x1D665, "M", "p"), + (0x1D666, "M", "q"), + (0x1D667, "M", "r"), + (0x1D668, "M", "s"), + (0x1D669, "M", "t"), + (0x1D66A, "M", "u"), + (0x1D66B, "M", "v"), + (0x1D66C, "M", "w"), + (0x1D66D, "M", "x"), + (0x1D66E, "M", "y"), + (0x1D66F, "M", "z"), + (0x1D670, "M", "a"), + (0x1D671, "M", "b"), + (0x1D672, "M", "c"), + (0x1D673, "M", "d"), + (0x1D674, "M", "e"), + (0x1D675, "M", "f"), + (0x1D676, "M", "g"), + (0x1D677, "M", "h"), + (0x1D678, "M", "i"), + (0x1D679, "M", "j"), + (0x1D67A, "M", "k"), + (0x1D67B, "M", "l"), + (0x1D67C, "M", "m"), + (0x1D67D, "M", "n"), + (0x1D67E, "M", "o"), + (0x1D67F, "M", "p"), + (0x1D680, "M", "q"), + (0x1D681, "M", "r"), + (0x1D682, "M", "s"), + (0x1D683, "M", "t"), + (0x1D684, "M", "u"), + (0x1D685, "M", "v"), + (0x1D686, "M", "w"), + (0x1D687, "M", "x"), + (0x1D688, "M", "y"), + (0x1D689, "M", "z"), + (0x1D68A, "M", "a"), + (0x1D68B, "M", "b"), + (0x1D68C, "M", "c"), + (0x1D68D, "M", "d"), + (0x1D68E, "M", "e"), + (0x1D68F, "M", "f"), + (0x1D690, "M", "g"), + (0x1D691, "M", "h"), + (0x1D692, "M", "i"), + (0x1D693, "M", "j"), + (0x1D694, "M", "k"), + (0x1D695, "M", "l"), + (0x1D696, "M", "m"), + (0x1D697, "M", "n"), + (0x1D698, "M", "o"), + (0x1D699, "M", "p"), + (0x1D69A, "M", "q"), + (0x1D69B, "M", "r"), + (0x1D69C, "M", "s"), + (0x1D69D, "M", "t"), + (0x1D69E, "M", "u"), + (0x1D69F, "M", "v"), + (0x1D6A0, "M", "w"), + (0x1D6A1, "M", "x"), + (0x1D6A2, "M", "y"), + (0x1D6A3, "M", "z"), + (0x1D6A4, "M", "ı"), + (0x1D6A5, "M", "ȷ"), + (0x1D6A6, "X"), + (0x1D6A8, "M", "α"), + (0x1D6A9, "M", "β"), + (0x1D6AA, "M", "γ"), + (0x1D6AB, "M", "δ"), + (0x1D6AC, "M", "ε"), + (0x1D6AD, "M", "ζ"), + (0x1D6AE, "M", "η"), + (0x1D6AF, "M", "θ"), + (0x1D6B0, "M", "ι"), + (0x1D6B1, "M", "κ"), + (0x1D6B2, "M", "λ"), + (0x1D6B3, "M", "μ"), + (0x1D6B4, "M", "ν"), + (0x1D6B5, "M", "ξ"), + (0x1D6B6, "M", "ο"), + (0x1D6B7, "M", "π"), + (0x1D6B8, "M", "ρ"), + (0x1D6B9, "M", "θ"), + (0x1D6BA, "M", "σ"), + ] + + +def _seg_69() -> List[Union[Tuple[int, 
str], Tuple[int, str, str]]]: + return [ + (0x1D6BB, "M", "τ"), + (0x1D6BC, "M", "υ"), + (0x1D6BD, "M", "φ"), + (0x1D6BE, "M", "χ"), + (0x1D6BF, "M", "ψ"), + (0x1D6C0, "M", "ω"), + (0x1D6C1, "M", "∇"), + (0x1D6C2, "M", "α"), + (0x1D6C3, "M", "β"), + (0x1D6C4, "M", "γ"), + (0x1D6C5, "M", "δ"), + (0x1D6C6, "M", "ε"), + (0x1D6C7, "M", "ζ"), + (0x1D6C8, "M", "η"), + (0x1D6C9, "M", "θ"), + (0x1D6CA, "M", "ι"), + (0x1D6CB, "M", "κ"), + (0x1D6CC, "M", "λ"), + (0x1D6CD, "M", "μ"), + (0x1D6CE, "M", "ν"), + (0x1D6CF, "M", "ξ"), + (0x1D6D0, "M", "ο"), + (0x1D6D1, "M", "π"), + (0x1D6D2, "M", "ρ"), + (0x1D6D3, "M", "σ"), + (0x1D6D5, "M", "τ"), + (0x1D6D6, "M", "υ"), + (0x1D6D7, "M", "φ"), + (0x1D6D8, "M", "χ"), + (0x1D6D9, "M", "ψ"), + (0x1D6DA, "M", "ω"), + (0x1D6DB, "M", "∂"), + (0x1D6DC, "M", "ε"), + (0x1D6DD, "M", "θ"), + (0x1D6DE, "M", "κ"), + (0x1D6DF, "M", "φ"), + (0x1D6E0, "M", "ρ"), + (0x1D6E1, "M", "π"), + (0x1D6E2, "M", "α"), + (0x1D6E3, "M", "β"), + (0x1D6E4, "M", "γ"), + (0x1D6E5, "M", "δ"), + (0x1D6E6, "M", "ε"), + (0x1D6E7, "M", "ζ"), + (0x1D6E8, "M", "η"), + (0x1D6E9, "M", "θ"), + (0x1D6EA, "M", "ι"), + (0x1D6EB, "M", "κ"), + (0x1D6EC, "M", "λ"), + (0x1D6ED, "M", "μ"), + (0x1D6EE, "M", "ν"), + (0x1D6EF, "M", "ξ"), + (0x1D6F0, "M", "ο"), + (0x1D6F1, "M", "π"), + (0x1D6F2, "M", "ρ"), + (0x1D6F3, "M", "θ"), + (0x1D6F4, "M", "σ"), + (0x1D6F5, "M", "τ"), + (0x1D6F6, "M", "υ"), + (0x1D6F7, "M", "φ"), + (0x1D6F8, "M", "χ"), + (0x1D6F9, "M", "ψ"), + (0x1D6FA, "M", "ω"), + (0x1D6FB, "M", "∇"), + (0x1D6FC, "M", "α"), + (0x1D6FD, "M", "β"), + (0x1D6FE, "M", "γ"), + (0x1D6FF, "M", "δ"), + (0x1D700, "M", "ε"), + (0x1D701, "M", "ζ"), + (0x1D702, "M", "η"), + (0x1D703, "M", "θ"), + (0x1D704, "M", "ι"), + (0x1D705, "M", "κ"), + (0x1D706, "M", "λ"), + (0x1D707, "M", "μ"), + (0x1D708, "M", "ν"), + (0x1D709, "M", "ξ"), + (0x1D70A, "M", "ο"), + (0x1D70B, "M", "π"), + (0x1D70C, "M", "ρ"), + (0x1D70D, "M", "σ"), + (0x1D70F, "M", "τ"), + (0x1D710, "M", "υ"), + (0x1D711, "M", "φ"), + 
(0x1D712, "M", "χ"), + (0x1D713, "M", "ψ"), + (0x1D714, "M", "ω"), + (0x1D715, "M", "∂"), + (0x1D716, "M", "ε"), + (0x1D717, "M", "θ"), + (0x1D718, "M", "κ"), + (0x1D719, "M", "φ"), + (0x1D71A, "M", "ρ"), + (0x1D71B, "M", "π"), + (0x1D71C, "M", "α"), + (0x1D71D, "M", "β"), + (0x1D71E, "M", "γ"), + (0x1D71F, "M", "δ"), + (0x1D720, "M", "ε"), + ] + + +def _seg_70() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ + (0x1D721, "M", "ζ"), + (0x1D722, "M", "η"), + (0x1D723, "M", "θ"), + (0x1D724, "M", "ι"), + (0x1D725, "M", "κ"), + (0x1D726, "M", "λ"), + (0x1D727, "M", "μ"), + (0x1D728, "M", "ν"), + (0x1D729, "M", "ξ"), + (0x1D72A, "M", "ο"), + (0x1D72B, "M", "π"), + (0x1D72C, "M", "ρ"), + (0x1D72D, "M", "θ"), + (0x1D72E, "M", "σ"), + (0x1D72F, "M", "τ"), + (0x1D730, "M", "υ"), + (0x1D731, "M", "φ"), + (0x1D732, "M", "χ"), + (0x1D733, "M", "ψ"), + (0x1D734, "M", "ω"), + (0x1D735, "M", "∇"), + (0x1D736, "M", "α"), + (0x1D737, "M", "β"), + (0x1D738, "M", "γ"), + (0x1D739, "M", "δ"), + (0x1D73A, "M", "ε"), + (0x1D73B, "M", "ζ"), + (0x1D73C, "M", "η"), + (0x1D73D, "M", "θ"), + (0x1D73E, "M", "ι"), + (0x1D73F, "M", "κ"), + (0x1D740, "M", "λ"), + (0x1D741, "M", "μ"), + (0x1D742, "M", "ν"), + (0x1D743, "M", "ξ"), + (0x1D744, "M", "ο"), + (0x1D745, "M", "π"), + (0x1D746, "M", "ρ"), + (0x1D747, "M", "σ"), + (0x1D749, "M", "τ"), + (0x1D74A, "M", "υ"), + (0x1D74B, "M", "φ"), + (0x1D74C, "M", "χ"), + (0x1D74D, "M", "ψ"), + (0x1D74E, "M", "ω"), + (0x1D74F, "M", "∂"), + (0x1D750, "M", "ε"), + (0x1D751, "M", "θ"), + (0x1D752, "M", "κ"), + (0x1D753, "M", "φ"), + (0x1D754, "M", "ρ"), + (0x1D755, "M", "π"), + (0x1D756, "M", "α"), + (0x1D757, "M", "β"), + (0x1D758, "M", "γ"), + (0x1D759, "M", "δ"), + (0x1D75A, "M", "ε"), + (0x1D75B, "M", "ζ"), + (0x1D75C, "M", "η"), + (0x1D75D, "M", "θ"), + (0x1D75E, "M", "ι"), + (0x1D75F, "M", "κ"), + (0x1D760, "M", "λ"), + (0x1D761, "M", "μ"), + (0x1D762, "M", "ν"), + (0x1D763, "M", "ξ"), + (0x1D764, "M", "ο"), + (0x1D765, "M", "π"), + 
(0x1D766, "M", "ρ"), + (0x1D767, "M", "θ"), + (0x1D768, "M", "σ"), + (0x1D769, "M", "τ"), + (0x1D76A, "M", "υ"), + (0x1D76B, "M", "φ"), + (0x1D76C, "M", "χ"), + (0x1D76D, "M", "ψ"), + (0x1D76E, "M", "ω"), + (0x1D76F, "M", "∇"), + (0x1D770, "M", "α"), + (0x1D771, "M", "β"), + (0x1D772, "M", "γ"), + (0x1D773, "M", "δ"), + (0x1D774, "M", "ε"), + (0x1D775, "M", "ζ"), + (0x1D776, "M", "η"), + (0x1D777, "M", "θ"), + (0x1D778, "M", "ι"), + (0x1D779, "M", "κ"), + (0x1D77A, "M", "λ"), + (0x1D77B, "M", "μ"), + (0x1D77C, "M", "ν"), + (0x1D77D, "M", "ξ"), + (0x1D77E, "M", "ο"), + (0x1D77F, "M", "π"), + (0x1D780, "M", "ρ"), + (0x1D781, "M", "σ"), + (0x1D783, "M", "τ"), + (0x1D784, "M", "υ"), + (0x1D785, "M", "φ"), + (0x1D786, "M", "χ"), + ] + + +def _seg_71() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ + (0x1D787, "M", "ψ"), + (0x1D788, "M", "ω"), + (0x1D789, "M", "∂"), + (0x1D78A, "M", "ε"), + (0x1D78B, "M", "θ"), + (0x1D78C, "M", "κ"), + (0x1D78D, "M", "φ"), + (0x1D78E, "M", "ρ"), + (0x1D78F, "M", "π"), + (0x1D790, "M", "α"), + (0x1D791, "M", "β"), + (0x1D792, "M", "γ"), + (0x1D793, "M", "δ"), + (0x1D794, "M", "ε"), + (0x1D795, "M", "ζ"), + (0x1D796, "M", "η"), + (0x1D797, "M", "θ"), + (0x1D798, "M", "ι"), + (0x1D799, "M", "κ"), + (0x1D79A, "M", "λ"), + (0x1D79B, "M", "μ"), + (0x1D79C, "M", "ν"), + (0x1D79D, "M", "ξ"), + (0x1D79E, "M", "ο"), + (0x1D79F, "M", "π"), + (0x1D7A0, "M", "ρ"), + (0x1D7A1, "M", "θ"), + (0x1D7A2, "M", "σ"), + (0x1D7A3, "M", "τ"), + (0x1D7A4, "M", "υ"), + (0x1D7A5, "M", "φ"), + (0x1D7A6, "M", "χ"), + (0x1D7A7, "M", "ψ"), + (0x1D7A8, "M", "ω"), + (0x1D7A9, "M", "∇"), + (0x1D7AA, "M", "α"), + (0x1D7AB, "M", "β"), + (0x1D7AC, "M", "γ"), + (0x1D7AD, "M", "δ"), + (0x1D7AE, "M", "ε"), + (0x1D7AF, "M", "ζ"), + (0x1D7B0, "M", "η"), + (0x1D7B1, "M", "θ"), + (0x1D7B2, "M", "ι"), + (0x1D7B3, "M", "κ"), + (0x1D7B4, "M", "λ"), + (0x1D7B5, "M", "μ"), + (0x1D7B6, "M", "ν"), + (0x1D7B7, "M", "ξ"), + (0x1D7B8, "M", "ο"), + (0x1D7B9, "M", "π"), + 
(0x1D7BA, "M", "ρ"), + (0x1D7BB, "M", "σ"), + (0x1D7BD, "M", "τ"), + (0x1D7BE, "M", "υ"), + (0x1D7BF, "M", "φ"), + (0x1D7C0, "M", "χ"), + (0x1D7C1, "M", "ψ"), + (0x1D7C2, "M", "ω"), + (0x1D7C3, "M", "∂"), + (0x1D7C4, "M", "ε"), + (0x1D7C5, "M", "θ"), + (0x1D7C6, "M", "κ"), + (0x1D7C7, "M", "φ"), + (0x1D7C8, "M", "ρ"), + (0x1D7C9, "M", "π"), + (0x1D7CA, "M", "ϝ"), + (0x1D7CC, "X"), + (0x1D7CE, "M", "0"), + (0x1D7CF, "M", "1"), + (0x1D7D0, "M", "2"), + (0x1D7D1, "M", "3"), + (0x1D7D2, "M", "4"), + (0x1D7D3, "M", "5"), + (0x1D7D4, "M", "6"), + (0x1D7D5, "M", "7"), + (0x1D7D6, "M", "8"), + (0x1D7D7, "M", "9"), + (0x1D7D8, "M", "0"), + (0x1D7D9, "M", "1"), + (0x1D7DA, "M", "2"), + (0x1D7DB, "M", "3"), + (0x1D7DC, "M", "4"), + (0x1D7DD, "M", "5"), + (0x1D7DE, "M", "6"), + (0x1D7DF, "M", "7"), + (0x1D7E0, "M", "8"), + (0x1D7E1, "M", "9"), + (0x1D7E2, "M", "0"), + (0x1D7E3, "M", "1"), + (0x1D7E4, "M", "2"), + (0x1D7E5, "M", "3"), + (0x1D7E6, "M", "4"), + (0x1D7E7, "M", "5"), + (0x1D7E8, "M", "6"), + (0x1D7E9, "M", "7"), + (0x1D7EA, "M", "8"), + (0x1D7EB, "M", "9"), + (0x1D7EC, "M", "0"), + (0x1D7ED, "M", "1"), + ] + + +def _seg_72() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ + (0x1D7EE, "M", "2"), + (0x1D7EF, "M", "3"), + (0x1D7F0, "M", "4"), + (0x1D7F1, "M", "5"), + (0x1D7F2, "M", "6"), + (0x1D7F3, "M", "7"), + (0x1D7F4, "M", "8"), + (0x1D7F5, "M", "9"), + (0x1D7F6, "M", "0"), + (0x1D7F7, "M", "1"), + (0x1D7F8, "M", "2"), + (0x1D7F9, "M", "3"), + (0x1D7FA, "M", "4"), + (0x1D7FB, "M", "5"), + (0x1D7FC, "M", "6"), + (0x1D7FD, "M", "7"), + (0x1D7FE, "M", "8"), + (0x1D7FF, "M", "9"), + (0x1D800, "V"), + (0x1DA8C, "X"), + (0x1DA9B, "V"), + (0x1DAA0, "X"), + (0x1DAA1, "V"), + (0x1DAB0, "X"), + (0x1DF00, "V"), + (0x1DF1F, "X"), + (0x1DF25, "V"), + (0x1DF2B, "X"), + (0x1E000, "V"), + (0x1E007, "X"), + (0x1E008, "V"), + (0x1E019, "X"), + (0x1E01B, "V"), + (0x1E022, "X"), + (0x1E023, "V"), + (0x1E025, "X"), + (0x1E026, "V"), + (0x1E02B, "X"), + (0x1E030, "M", 
"а"), + (0x1E031, "M", "б"), + (0x1E032, "M", "в"), + (0x1E033, "M", "г"), + (0x1E034, "M", "д"), + (0x1E035, "M", "е"), + (0x1E036, "M", "ж"), + (0x1E037, "M", "з"), + (0x1E038, "M", "и"), + (0x1E039, "M", "к"), + (0x1E03A, "M", "л"), + (0x1E03B, "M", "м"), + (0x1E03C, "M", "о"), + (0x1E03D, "M", "п"), + (0x1E03E, "M", "р"), + (0x1E03F, "M", "с"), + (0x1E040, "M", "т"), + (0x1E041, "M", "у"), + (0x1E042, "M", "ф"), + (0x1E043, "M", "х"), + (0x1E044, "M", "ц"), + (0x1E045, "M", "ч"), + (0x1E046, "M", "ш"), + (0x1E047, "M", "ы"), + (0x1E048, "M", "э"), + (0x1E049, "M", "ю"), + (0x1E04A, "M", "ꚉ"), + (0x1E04B, "M", "ә"), + (0x1E04C, "M", "і"), + (0x1E04D, "M", "ј"), + (0x1E04E, "M", "ө"), + (0x1E04F, "M", "ү"), + (0x1E050, "M", "ӏ"), + (0x1E051, "M", "а"), + (0x1E052, "M", "б"), + (0x1E053, "M", "в"), + (0x1E054, "M", "г"), + (0x1E055, "M", "д"), + (0x1E056, "M", "е"), + (0x1E057, "M", "ж"), + (0x1E058, "M", "з"), + (0x1E059, "M", "и"), + (0x1E05A, "M", "к"), + (0x1E05B, "M", "л"), + (0x1E05C, "M", "о"), + (0x1E05D, "M", "п"), + (0x1E05E, "M", "с"), + (0x1E05F, "M", "у"), + (0x1E060, "M", "ф"), + (0x1E061, "M", "х"), + (0x1E062, "M", "ц"), + (0x1E063, "M", "ч"), + (0x1E064, "M", "ш"), + (0x1E065, "M", "ъ"), + (0x1E066, "M", "ы"), + (0x1E067, "M", "ґ"), + (0x1E068, "M", "і"), + (0x1E069, "M", "ѕ"), + (0x1E06A, "M", "џ"), + (0x1E06B, "M", "ҫ"), + (0x1E06C, "M", "ꙑ"), + (0x1E06D, "M", "ұ"), + ] + + +def _seg_73() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ + (0x1E06E, "X"), + (0x1E08F, "V"), + (0x1E090, "X"), + (0x1E100, "V"), + (0x1E12D, "X"), + (0x1E130, "V"), + (0x1E13E, "X"), + (0x1E140, "V"), + (0x1E14A, "X"), + (0x1E14E, "V"), + (0x1E150, "X"), + (0x1E290, "V"), + (0x1E2AF, "X"), + (0x1E2C0, "V"), + (0x1E2FA, "X"), + (0x1E2FF, "V"), + (0x1E300, "X"), + (0x1E4D0, "V"), + (0x1E4FA, "X"), + (0x1E5D0, "V"), + (0x1E5FB, "X"), + (0x1E5FF, "V"), + (0x1E600, "X"), + (0x1E7E0, "V"), + (0x1E7E7, "X"), + (0x1E7E8, "V"), + (0x1E7EC, "X"), + (0x1E7ED, 
"V"), + (0x1E7EF, "X"), + (0x1E7F0, "V"), + (0x1E7FF, "X"), + (0x1E800, "V"), + (0x1E8C5, "X"), + (0x1E8C7, "V"), + (0x1E8D7, "X"), + (0x1E900, "M", "𞤢"), + (0x1E901, "M", "𞤣"), + (0x1E902, "M", "𞤤"), + (0x1E903, "M", "𞤥"), + (0x1E904, "M", "𞤦"), + (0x1E905, "M", "𞤧"), + (0x1E906, "M", "𞤨"), + (0x1E907, "M", "𞤩"), + (0x1E908, "M", "𞤪"), + (0x1E909, "M", "𞤫"), + (0x1E90A, "M", "𞤬"), + (0x1E90B, "M", "𞤭"), + (0x1E90C, "M", "𞤮"), + (0x1E90D, "M", "𞤯"), + (0x1E90E, "M", "𞤰"), + (0x1E90F, "M", "𞤱"), + (0x1E910, "M", "𞤲"), + (0x1E911, "M", "𞤳"), + (0x1E912, "M", "𞤴"), + (0x1E913, "M", "𞤵"), + (0x1E914, "M", "𞤶"), + (0x1E915, "M", "𞤷"), + (0x1E916, "M", "𞤸"), + (0x1E917, "M", "𞤹"), + (0x1E918, "M", "𞤺"), + (0x1E919, "M", "𞤻"), + (0x1E91A, "M", "𞤼"), + (0x1E91B, "M", "𞤽"), + (0x1E91C, "M", "𞤾"), + (0x1E91D, "M", "𞤿"), + (0x1E91E, "M", "𞥀"), + (0x1E91F, "M", "𞥁"), + (0x1E920, "M", "𞥂"), + (0x1E921, "M", "𞥃"), + (0x1E922, "V"), + (0x1E94C, "X"), + (0x1E950, "V"), + (0x1E95A, "X"), + (0x1E95E, "V"), + (0x1E960, "X"), + (0x1EC71, "V"), + (0x1ECB5, "X"), + (0x1ED01, "V"), + (0x1ED3E, "X"), + (0x1EE00, "M", "ا"), + (0x1EE01, "M", "ب"), + (0x1EE02, "M", "ج"), + (0x1EE03, "M", "د"), + (0x1EE04, "X"), + (0x1EE05, "M", "و"), + (0x1EE06, "M", "ز"), + (0x1EE07, "M", "ح"), + (0x1EE08, "M", "ط"), + (0x1EE09, "M", "ي"), + (0x1EE0A, "M", "ك"), + (0x1EE0B, "M", "ل"), + (0x1EE0C, "M", "م"), + (0x1EE0D, "M", "ن"), + (0x1EE0E, "M", "س"), + (0x1EE0F, "M", "ع"), + (0x1EE10, "M", "ف"), + (0x1EE11, "M", "ص"), + (0x1EE12, "M", "ق"), + (0x1EE13, "M", "ر"), + (0x1EE14, "M", "ش"), + ] + + +def _seg_74() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ + (0x1EE15, "M", "ت"), + (0x1EE16, "M", "ث"), + (0x1EE17, "M", "خ"), + (0x1EE18, "M", "ذ"), + (0x1EE19, "M", "ض"), + (0x1EE1A, "M", "ظ"), + (0x1EE1B, "M", "غ"), + (0x1EE1C, "M", "ٮ"), + (0x1EE1D, "M", "ں"), + (0x1EE1E, "M", "ڡ"), + (0x1EE1F, "M", "ٯ"), + (0x1EE20, "X"), + (0x1EE21, "M", "ب"), + (0x1EE22, "M", "ج"), + (0x1EE23, "X"), + 
(0x1EE24, "M", "ه"), + (0x1EE25, "X"), + (0x1EE27, "M", "ح"), + (0x1EE28, "X"), + (0x1EE29, "M", "ي"), + (0x1EE2A, "M", "ك"), + (0x1EE2B, "M", "ل"), + (0x1EE2C, "M", "م"), + (0x1EE2D, "M", "ن"), + (0x1EE2E, "M", "س"), + (0x1EE2F, "M", "ع"), + (0x1EE30, "M", "ف"), + (0x1EE31, "M", "ص"), + (0x1EE32, "M", "ق"), + (0x1EE33, "X"), + (0x1EE34, "M", "ش"), + (0x1EE35, "M", "ت"), + (0x1EE36, "M", "ث"), + (0x1EE37, "M", "خ"), + (0x1EE38, "X"), + (0x1EE39, "M", "ض"), + (0x1EE3A, "X"), + (0x1EE3B, "M", "غ"), + (0x1EE3C, "X"), + (0x1EE42, "M", "ج"), + (0x1EE43, "X"), + (0x1EE47, "M", "ح"), + (0x1EE48, "X"), + (0x1EE49, "M", "ي"), + (0x1EE4A, "X"), + (0x1EE4B, "M", "ل"), + (0x1EE4C, "X"), + (0x1EE4D, "M", "ن"), + (0x1EE4E, "M", "س"), + (0x1EE4F, "M", "ع"), + (0x1EE50, "X"), + (0x1EE51, "M", "ص"), + (0x1EE52, "M", "ق"), + (0x1EE53, "X"), + (0x1EE54, "M", "ش"), + (0x1EE55, "X"), + (0x1EE57, "M", "خ"), + (0x1EE58, "X"), + (0x1EE59, "M", "ض"), + (0x1EE5A, "X"), + (0x1EE5B, "M", "غ"), + (0x1EE5C, "X"), + (0x1EE5D, "M", "ں"), + (0x1EE5E, "X"), + (0x1EE5F, "M", "ٯ"), + (0x1EE60, "X"), + (0x1EE61, "M", "ب"), + (0x1EE62, "M", "ج"), + (0x1EE63, "X"), + (0x1EE64, "M", "ه"), + (0x1EE65, "X"), + (0x1EE67, "M", "ح"), + (0x1EE68, "M", "ط"), + (0x1EE69, "M", "ي"), + (0x1EE6A, "M", "ك"), + (0x1EE6B, "X"), + (0x1EE6C, "M", "م"), + (0x1EE6D, "M", "ن"), + (0x1EE6E, "M", "س"), + (0x1EE6F, "M", "ع"), + (0x1EE70, "M", "ف"), + (0x1EE71, "M", "ص"), + (0x1EE72, "M", "ق"), + (0x1EE73, "X"), + (0x1EE74, "M", "ش"), + (0x1EE75, "M", "ت"), + (0x1EE76, "M", "ث"), + (0x1EE77, "M", "خ"), + (0x1EE78, "X"), + (0x1EE79, "M", "ض"), + (0x1EE7A, "M", "ظ"), + (0x1EE7B, "M", "غ"), + (0x1EE7C, "M", "ٮ"), + (0x1EE7D, "X"), + (0x1EE7E, "M", "ڡ"), + (0x1EE7F, "X"), + (0x1EE80, "M", "ا"), + (0x1EE81, "M", "ب"), + (0x1EE82, "M", "ج"), + (0x1EE83, "M", "د"), + ] + + +def _seg_75() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ + (0x1EE84, "M", "ه"), + (0x1EE85, "M", "و"), + (0x1EE86, "M", "ز"), + (0x1EE87, 
"M", "ح"), + (0x1EE88, "M", "ط"), + (0x1EE89, "M", "ي"), + (0x1EE8A, "X"), + (0x1EE8B, "M", "ل"), + (0x1EE8C, "M", "م"), + (0x1EE8D, "M", "ن"), + (0x1EE8E, "M", "س"), + (0x1EE8F, "M", "ع"), + (0x1EE90, "M", "ف"), + (0x1EE91, "M", "ص"), + (0x1EE92, "M", "ق"), + (0x1EE93, "M", "ر"), + (0x1EE94, "M", "ش"), + (0x1EE95, "M", "ت"), + (0x1EE96, "M", "ث"), + (0x1EE97, "M", "خ"), + (0x1EE98, "M", "ذ"), + (0x1EE99, "M", "ض"), + (0x1EE9A, "M", "ظ"), + (0x1EE9B, "M", "غ"), + (0x1EE9C, "X"), + (0x1EEA1, "M", "ب"), + (0x1EEA2, "M", "ج"), + (0x1EEA3, "M", "د"), + (0x1EEA4, "X"), + (0x1EEA5, "M", "و"), + (0x1EEA6, "M", "ز"), + (0x1EEA7, "M", "ح"), + (0x1EEA8, "M", "ط"), + (0x1EEA9, "M", "ي"), + (0x1EEAA, "X"), + (0x1EEAB, "M", "ل"), + (0x1EEAC, "M", "م"), + (0x1EEAD, "M", "ن"), + (0x1EEAE, "M", "س"), + (0x1EEAF, "M", "ع"), + (0x1EEB0, "M", "ف"), + (0x1EEB1, "M", "ص"), + (0x1EEB2, "M", "ق"), + (0x1EEB3, "M", "ر"), + (0x1EEB4, "M", "ش"), + (0x1EEB5, "M", "ت"), + (0x1EEB6, "M", "ث"), + (0x1EEB7, "M", "خ"), + (0x1EEB8, "M", "ذ"), + (0x1EEB9, "M", "ض"), + (0x1EEBA, "M", "ظ"), + (0x1EEBB, "M", "غ"), + (0x1EEBC, "X"), + (0x1EEF0, "V"), + (0x1EEF2, "X"), + (0x1F000, "V"), + (0x1F02C, "X"), + (0x1F030, "V"), + (0x1F094, "X"), + (0x1F0A0, "V"), + (0x1F0AF, "X"), + (0x1F0B1, "V"), + (0x1F0C0, "X"), + (0x1F0C1, "V"), + (0x1F0D0, "X"), + (0x1F0D1, "V"), + (0x1F0F6, "X"), + (0x1F101, "M", "0,"), + (0x1F102, "M", "1,"), + (0x1F103, "M", "2,"), + (0x1F104, "M", "3,"), + (0x1F105, "M", "4,"), + (0x1F106, "M", "5,"), + (0x1F107, "M", "6,"), + (0x1F108, "M", "7,"), + (0x1F109, "M", "8,"), + (0x1F10A, "M", "9,"), + (0x1F10B, "V"), + (0x1F110, "M", "(a)"), + (0x1F111, "M", "(b)"), + (0x1F112, "M", "(c)"), + (0x1F113, "M", "(d)"), + (0x1F114, "M", "(e)"), + (0x1F115, "M", "(f)"), + (0x1F116, "M", "(g)"), + (0x1F117, "M", "(h)"), + (0x1F118, "M", "(i)"), + (0x1F119, "M", "(j)"), + (0x1F11A, "M", "(k)"), + (0x1F11B, "M", "(l)"), + (0x1F11C, "M", "(m)"), + (0x1F11D, "M", "(n)"), + (0x1F11E, "M", "(o)"), + 
(0x1F11F, "M", "(p)"), + (0x1F120, "M", "(q)"), + (0x1F121, "M", "(r)"), + (0x1F122, "M", "(s)"), + (0x1F123, "M", "(t)"), + (0x1F124, "M", "(u)"), + (0x1F125, "M", "(v)"), + ] + + +def _seg_76() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ + (0x1F126, "M", "(w)"), + (0x1F127, "M", "(x)"), + (0x1F128, "M", "(y)"), + (0x1F129, "M", "(z)"), + (0x1F12A, "M", "〔s〕"), + (0x1F12B, "M", "c"), + (0x1F12C, "M", "r"), + (0x1F12D, "M", "cd"), + (0x1F12E, "M", "wz"), + (0x1F12F, "V"), + (0x1F130, "M", "a"), + (0x1F131, "M", "b"), + (0x1F132, "M", "c"), + (0x1F133, "M", "d"), + (0x1F134, "M", "e"), + (0x1F135, "M", "f"), + (0x1F136, "M", "g"), + (0x1F137, "M", "h"), + (0x1F138, "M", "i"), + (0x1F139, "M", "j"), + (0x1F13A, "M", "k"), + (0x1F13B, "M", "l"), + (0x1F13C, "M", "m"), + (0x1F13D, "M", "n"), + (0x1F13E, "M", "o"), + (0x1F13F, "M", "p"), + (0x1F140, "M", "q"), + (0x1F141, "M", "r"), + (0x1F142, "M", "s"), + (0x1F143, "M", "t"), + (0x1F144, "M", "u"), + (0x1F145, "M", "v"), + (0x1F146, "M", "w"), + (0x1F147, "M", "x"), + (0x1F148, "M", "y"), + (0x1F149, "M", "z"), + (0x1F14A, "M", "hv"), + (0x1F14B, "M", "mv"), + (0x1F14C, "M", "sd"), + (0x1F14D, "M", "ss"), + (0x1F14E, "M", "ppv"), + (0x1F14F, "M", "wc"), + (0x1F150, "V"), + (0x1F16A, "M", "mc"), + (0x1F16B, "M", "md"), + (0x1F16C, "M", "mr"), + (0x1F16D, "V"), + (0x1F190, "M", "dj"), + (0x1F191, "V"), + (0x1F1AE, "X"), + (0x1F1E6, "V"), + (0x1F200, "M", "ほか"), + (0x1F201, "M", "ココ"), + (0x1F202, "M", "サ"), + (0x1F203, "X"), + (0x1F210, "M", "手"), + (0x1F211, "M", "字"), + (0x1F212, "M", "双"), + (0x1F213, "M", "デ"), + (0x1F214, "M", "二"), + (0x1F215, "M", "多"), + (0x1F216, "M", "解"), + (0x1F217, "M", "天"), + (0x1F218, "M", "交"), + (0x1F219, "M", "映"), + (0x1F21A, "M", "無"), + (0x1F21B, "M", "料"), + (0x1F21C, "M", "前"), + (0x1F21D, "M", "後"), + (0x1F21E, "M", "再"), + (0x1F21F, "M", "新"), + (0x1F220, "M", "初"), + (0x1F221, "M", "終"), + (0x1F222, "M", "生"), + (0x1F223, "M", "販"), + (0x1F224, "M", "声"), 
+ (0x1F225, "M", "吹"), + (0x1F226, "M", "演"), + (0x1F227, "M", "投"), + (0x1F228, "M", "捕"), + (0x1F229, "M", "一"), + (0x1F22A, "M", "三"), + (0x1F22B, "M", "遊"), + (0x1F22C, "M", "左"), + (0x1F22D, "M", "中"), + (0x1F22E, "M", "右"), + (0x1F22F, "M", "指"), + (0x1F230, "M", "走"), + (0x1F231, "M", "打"), + (0x1F232, "M", "禁"), + (0x1F233, "M", "空"), + (0x1F234, "M", "合"), + (0x1F235, "M", "満"), + (0x1F236, "M", "有"), + (0x1F237, "M", "月"), + (0x1F238, "M", "申"), + (0x1F239, "M", "割"), + (0x1F23A, "M", "営"), + (0x1F23B, "M", "配"), + (0x1F23C, "X"), + ] + + +def _seg_77() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ + (0x1F240, "M", "〔本〕"), + (0x1F241, "M", "〔三〕"), + (0x1F242, "M", "〔二〕"), + (0x1F243, "M", "〔安〕"), + (0x1F244, "M", "〔点〕"), + (0x1F245, "M", "〔打〕"), + (0x1F246, "M", "〔盗〕"), + (0x1F247, "M", "〔勝〕"), + (0x1F248, "M", "〔敗〕"), + (0x1F249, "X"), + (0x1F250, "M", "得"), + (0x1F251, "M", "可"), + (0x1F252, "X"), + (0x1F260, "V"), + (0x1F266, "X"), + (0x1F300, "V"), + (0x1F6D8, "X"), + (0x1F6DC, "V"), + (0x1F6ED, "X"), + (0x1F6F0, "V"), + (0x1F6FD, "X"), + (0x1F700, "V"), + (0x1F777, "X"), + (0x1F77B, "V"), + (0x1F7DA, "X"), + (0x1F7E0, "V"), + (0x1F7EC, "X"), + (0x1F7F0, "V"), + (0x1F7F1, "X"), + (0x1F800, "V"), + (0x1F80C, "X"), + (0x1F810, "V"), + (0x1F848, "X"), + (0x1F850, "V"), + (0x1F85A, "X"), + (0x1F860, "V"), + (0x1F888, "X"), + (0x1F890, "V"), + (0x1F8AE, "X"), + (0x1F8B0, "V"), + (0x1F8BC, "X"), + (0x1F8C0, "V"), + (0x1F8C2, "X"), + (0x1F900, "V"), + (0x1FA54, "X"), + (0x1FA60, "V"), + (0x1FA6E, "X"), + (0x1FA70, "V"), + (0x1FA7D, "X"), + (0x1FA80, "V"), + (0x1FA8A, "X"), + (0x1FA8F, "V"), + (0x1FAC7, "X"), + (0x1FACE, "V"), + (0x1FADD, "X"), + (0x1FADF, "V"), + (0x1FAEA, "X"), + (0x1FAF0, "V"), + (0x1FAF9, "X"), + (0x1FB00, "V"), + (0x1FB93, "X"), + (0x1FB94, "V"), + (0x1FBF0, "M", "0"), + (0x1FBF1, "M", "1"), + (0x1FBF2, "M", "2"), + (0x1FBF3, "M", "3"), + (0x1FBF4, "M", "4"), + (0x1FBF5, "M", "5"), + (0x1FBF6, "M", "6"), + (0x1FBF7, 
"M", "7"), + (0x1FBF8, "M", "8"), + (0x1FBF9, "M", "9"), + (0x1FBFA, "X"), + (0x20000, "V"), + (0x2A6E0, "X"), + (0x2A700, "V"), + (0x2B73A, "X"), + (0x2B740, "V"), + (0x2B81E, "X"), + (0x2B820, "V"), + (0x2CEA2, "X"), + (0x2CEB0, "V"), + (0x2EBE1, "X"), + (0x2EBF0, "V"), + (0x2EE5E, "X"), + (0x2F800, "M", "丽"), + (0x2F801, "M", "丸"), + (0x2F802, "M", "乁"), + (0x2F803, "M", "𠄢"), + (0x2F804, "M", "你"), + (0x2F805, "M", "侮"), + (0x2F806, "M", "侻"), + (0x2F807, "M", "倂"), + (0x2F808, "M", "偺"), + (0x2F809, "M", "備"), + (0x2F80A, "M", "僧"), + (0x2F80B, "M", "像"), + (0x2F80C, "M", "㒞"), + (0x2F80D, "M", "𠘺"), + (0x2F80E, "M", "免"), + ] + + +def _seg_78() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ + (0x2F80F, "M", "兔"), + (0x2F810, "M", "兤"), + (0x2F811, "M", "具"), + (0x2F812, "M", "𠔜"), + (0x2F813, "M", "㒹"), + (0x2F814, "M", "內"), + (0x2F815, "M", "再"), + (0x2F816, "M", "𠕋"), + (0x2F817, "M", "冗"), + (0x2F818, "M", "冤"), + (0x2F819, "M", "仌"), + (0x2F81A, "M", "冬"), + (0x2F81B, "M", "况"), + (0x2F81C, "M", "𩇟"), + (0x2F81D, "M", "凵"), + (0x2F81E, "M", "刃"), + (0x2F81F, "M", "㓟"), + (0x2F820, "M", "刻"), + (0x2F821, "M", "剆"), + (0x2F822, "M", "割"), + (0x2F823, "M", "剷"), + (0x2F824, "M", "㔕"), + (0x2F825, "M", "勇"), + (0x2F826, "M", "勉"), + (0x2F827, "M", "勤"), + (0x2F828, "M", "勺"), + (0x2F829, "M", "包"), + (0x2F82A, "M", "匆"), + (0x2F82B, "M", "北"), + (0x2F82C, "M", "卉"), + (0x2F82D, "M", "卑"), + (0x2F82E, "M", "博"), + (0x2F82F, "M", "即"), + (0x2F830, "M", "卽"), + (0x2F831, "M", "卿"), + (0x2F834, "M", "𠨬"), + (0x2F835, "M", "灰"), + (0x2F836, "M", "及"), + (0x2F837, "M", "叟"), + (0x2F838, "M", "𠭣"), + (0x2F839, "M", "叫"), + (0x2F83A, "M", "叱"), + (0x2F83B, "M", "吆"), + (0x2F83C, "M", "咞"), + (0x2F83D, "M", "吸"), + (0x2F83E, "M", "呈"), + (0x2F83F, "M", "周"), + (0x2F840, "M", "咢"), + (0x2F841, "M", "哶"), + (0x2F842, "M", "唐"), + (0x2F843, "M", "啓"), + (0x2F844, "M", "啣"), + (0x2F845, "M", "善"), + (0x2F847, "M", "喙"), + (0x2F848, "M", "喫"), + 
(0x2F849, "M", "喳"), + (0x2F84A, "M", "嗂"), + (0x2F84B, "M", "圖"), + (0x2F84C, "M", "嘆"), + (0x2F84D, "M", "圗"), + (0x2F84E, "M", "噑"), + (0x2F84F, "M", "噴"), + (0x2F850, "M", "切"), + (0x2F851, "M", "壮"), + (0x2F852, "M", "城"), + (0x2F853, "M", "埴"), + (0x2F854, "M", "堍"), + (0x2F855, "M", "型"), + (0x2F856, "M", "堲"), + (0x2F857, "M", "報"), + (0x2F858, "M", "墬"), + (0x2F859, "M", "𡓤"), + (0x2F85A, "M", "売"), + (0x2F85B, "M", "壷"), + (0x2F85C, "M", "夆"), + (0x2F85D, "M", "多"), + (0x2F85E, "M", "夢"), + (0x2F85F, "M", "奢"), + (0x2F860, "M", "𡚨"), + (0x2F861, "M", "𡛪"), + (0x2F862, "M", "姬"), + (0x2F863, "M", "娛"), + (0x2F864, "M", "娧"), + (0x2F865, "M", "姘"), + (0x2F866, "M", "婦"), + (0x2F867, "M", "㛮"), + (0x2F868, "M", "㛼"), + (0x2F869, "M", "嬈"), + (0x2F86A, "M", "嬾"), + (0x2F86C, "M", "𡧈"), + (0x2F86D, "M", "寃"), + (0x2F86E, "M", "寘"), + (0x2F86F, "M", "寧"), + (0x2F870, "M", "寳"), + (0x2F871, "M", "𡬘"), + (0x2F872, "M", "寿"), + (0x2F873, "M", "将"), + (0x2F874, "M", "当"), + (0x2F875, "M", "尢"), + (0x2F876, "M", "㞁"), + ] + + +def _seg_79() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ + (0x2F877, "M", "屠"), + (0x2F878, "M", "屮"), + (0x2F879, "M", "峀"), + (0x2F87A, "M", "岍"), + (0x2F87B, "M", "𡷤"), + (0x2F87C, "M", "嵃"), + (0x2F87D, "M", "𡷦"), + (0x2F87E, "M", "嵮"), + (0x2F87F, "M", "嵫"), + (0x2F880, "M", "嵼"), + (0x2F881, "M", "巡"), + (0x2F882, "M", "巢"), + (0x2F883, "M", "㠯"), + (0x2F884, "M", "巽"), + (0x2F885, "M", "帨"), + (0x2F886, "M", "帽"), + (0x2F887, "M", "幩"), + (0x2F888, "M", "㡢"), + (0x2F889, "M", "𢆃"), + (0x2F88A, "M", "㡼"), + (0x2F88B, "M", "庰"), + (0x2F88C, "M", "庳"), + (0x2F88D, "M", "庶"), + (0x2F88E, "M", "廊"), + (0x2F88F, "M", "𪎒"), + (0x2F890, "M", "廾"), + (0x2F891, "M", "𢌱"), + (0x2F893, "M", "舁"), + (0x2F894, "M", "弢"), + (0x2F896, "M", "㣇"), + (0x2F897, "M", "𣊸"), + (0x2F898, "M", "𦇚"), + (0x2F899, "M", "形"), + (0x2F89A, "M", "彫"), + (0x2F89B, "M", "㣣"), + (0x2F89C, "M", "徚"), + (0x2F89D, "M", "忍"), + (0x2F89E, "M", "志"), + 
(0x2F89F, "M", "忹"), + (0x2F8A0, "M", "悁"), + (0x2F8A1, "M", "㤺"), + (0x2F8A2, "M", "㤜"), + (0x2F8A3, "M", "悔"), + (0x2F8A4, "M", "𢛔"), + (0x2F8A5, "M", "惇"), + (0x2F8A6, "M", "慈"), + (0x2F8A7, "M", "慌"), + (0x2F8A8, "M", "慎"), + (0x2F8A9, "M", "慌"), + (0x2F8AA, "M", "慺"), + (0x2F8AB, "M", "憎"), + (0x2F8AC, "M", "憲"), + (0x2F8AD, "M", "憤"), + (0x2F8AE, "M", "憯"), + (0x2F8AF, "M", "懞"), + (0x2F8B0, "M", "懲"), + (0x2F8B1, "M", "懶"), + (0x2F8B2, "M", "成"), + (0x2F8B3, "M", "戛"), + (0x2F8B4, "M", "扝"), + (0x2F8B5, "M", "抱"), + (0x2F8B6, "M", "拔"), + (0x2F8B7, "M", "捐"), + (0x2F8B8, "M", "𢬌"), + (0x2F8B9, "M", "挽"), + (0x2F8BA, "M", "拼"), + (0x2F8BB, "M", "捨"), + (0x2F8BC, "M", "掃"), + (0x2F8BD, "M", "揤"), + (0x2F8BE, "M", "𢯱"), + (0x2F8BF, "M", "搢"), + (0x2F8C0, "M", "揅"), + (0x2F8C1, "M", "掩"), + (0x2F8C2, "M", "㨮"), + (0x2F8C3, "M", "摩"), + (0x2F8C4, "M", "摾"), + (0x2F8C5, "M", "撝"), + (0x2F8C6, "M", "摷"), + (0x2F8C7, "M", "㩬"), + (0x2F8C8, "M", "敏"), + (0x2F8C9, "M", "敬"), + (0x2F8CA, "M", "𣀊"), + (0x2F8CB, "M", "旣"), + (0x2F8CC, "M", "書"), + (0x2F8CD, "M", "晉"), + (0x2F8CE, "M", "㬙"), + (0x2F8CF, "M", "暑"), + (0x2F8D0, "M", "㬈"), + (0x2F8D1, "M", "㫤"), + (0x2F8D2, "M", "冒"), + (0x2F8D3, "M", "冕"), + (0x2F8D4, "M", "最"), + (0x2F8D5, "M", "暜"), + (0x2F8D6, "M", "肭"), + (0x2F8D7, "M", "䏙"), + (0x2F8D8, "M", "朗"), + (0x2F8D9, "M", "望"), + (0x2F8DA, "M", "朡"), + (0x2F8DB, "M", "杞"), + (0x2F8DC, "M", "杓"), + ] + + +def _seg_80() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ + (0x2F8DD, "M", "𣏃"), + (0x2F8DE, "M", "㭉"), + (0x2F8DF, "M", "柺"), + (0x2F8E0, "M", "枅"), + (0x2F8E1, "M", "桒"), + (0x2F8E2, "M", "梅"), + (0x2F8E3, "M", "𣑭"), + (0x2F8E4, "M", "梎"), + (0x2F8E5, "M", "栟"), + (0x2F8E6, "M", "椔"), + (0x2F8E7, "M", "㮝"), + (0x2F8E8, "M", "楂"), + (0x2F8E9, "M", "榣"), + (0x2F8EA, "M", "槪"), + (0x2F8EB, "M", "檨"), + (0x2F8EC, "M", "𣚣"), + (0x2F8ED, "M", "櫛"), + (0x2F8EE, "M", "㰘"), + (0x2F8EF, "M", "次"), + (0x2F8F0, "M", "𣢧"), + (0x2F8F1, "M", "歔"), + 
(0x2F8F2, "M", "㱎"), + (0x2F8F3, "M", "歲"), + (0x2F8F4, "M", "殟"), + (0x2F8F5, "M", "殺"), + (0x2F8F6, "M", "殻"), + (0x2F8F7, "M", "𣪍"), + (0x2F8F8, "M", "𡴋"), + (0x2F8F9, "M", "𣫺"), + (0x2F8FA, "M", "汎"), + (0x2F8FB, "M", "𣲼"), + (0x2F8FC, "M", "沿"), + (0x2F8FD, "M", "泍"), + (0x2F8FE, "M", "汧"), + (0x2F8FF, "M", "洖"), + (0x2F900, "M", "派"), + (0x2F901, "M", "海"), + (0x2F902, "M", "流"), + (0x2F903, "M", "浩"), + (0x2F904, "M", "浸"), + (0x2F905, "M", "涅"), + (0x2F906, "M", "𣴞"), + (0x2F907, "M", "洴"), + (0x2F908, "M", "港"), + (0x2F909, "M", "湮"), + (0x2F90A, "M", "㴳"), + (0x2F90B, "M", "滋"), + (0x2F90C, "M", "滇"), + (0x2F90D, "M", "𣻑"), + (0x2F90E, "M", "淹"), + (0x2F90F, "M", "潮"), + (0x2F910, "M", "𣽞"), + (0x2F911, "M", "𣾎"), + (0x2F912, "M", "濆"), + (0x2F913, "M", "瀹"), + (0x2F914, "M", "瀞"), + (0x2F915, "M", "瀛"), + (0x2F916, "M", "㶖"), + (0x2F917, "M", "灊"), + (0x2F918, "M", "災"), + (0x2F919, "M", "灷"), + (0x2F91A, "M", "炭"), + (0x2F91B, "M", "𠔥"), + (0x2F91C, "M", "煅"), + (0x2F91D, "M", "𤉣"), + (0x2F91E, "M", "熜"), + (0x2F91F, "M", "𤎫"), + (0x2F920, "M", "爨"), + (0x2F921, "M", "爵"), + (0x2F922, "M", "牐"), + (0x2F923, "M", "𤘈"), + (0x2F924, "M", "犀"), + (0x2F925, "M", "犕"), + (0x2F926, "M", "𤜵"), + (0x2F927, "M", "𤠔"), + (0x2F928, "M", "獺"), + (0x2F929, "M", "王"), + (0x2F92A, "M", "㺬"), + (0x2F92B, "M", "玥"), + (0x2F92C, "M", "㺸"), + (0x2F92E, "M", "瑇"), + (0x2F92F, "M", "瑜"), + (0x2F930, "M", "瑱"), + (0x2F931, "M", "璅"), + (0x2F932, "M", "瓊"), + (0x2F933, "M", "㼛"), + (0x2F934, "M", "甤"), + (0x2F935, "M", "𤰶"), + (0x2F936, "M", "甾"), + (0x2F937, "M", "𤲒"), + (0x2F938, "M", "異"), + (0x2F939, "M", "𢆟"), + (0x2F93A, "M", "瘐"), + (0x2F93B, "M", "𤾡"), + (0x2F93C, "M", "𤾸"), + (0x2F93D, "M", "𥁄"), + (0x2F93E, "M", "㿼"), + (0x2F93F, "M", "䀈"), + (0x2F940, "M", "直"), + (0x2F941, "M", "𥃳"), + ] + + +def _seg_81() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ + (0x2F942, "M", "𥃲"), + (0x2F943, "M", "𥄙"), + (0x2F944, "M", "𥄳"), + (0x2F945, "M", "眞"), + 
(0x2F946, "M", "真"), + (0x2F948, "M", "睊"), + (0x2F949, "M", "䀹"), + (0x2F94A, "M", "瞋"), + (0x2F94B, "M", "䁆"), + (0x2F94C, "M", "䂖"), + (0x2F94D, "M", "𥐝"), + (0x2F94E, "M", "硎"), + (0x2F94F, "M", "碌"), + (0x2F950, "M", "磌"), + (0x2F951, "M", "䃣"), + (0x2F952, "M", "𥘦"), + (0x2F953, "M", "祖"), + (0x2F954, "M", "𥚚"), + (0x2F955, "M", "𥛅"), + (0x2F956, "M", "福"), + (0x2F957, "M", "秫"), + (0x2F958, "M", "䄯"), + (0x2F959, "M", "穀"), + (0x2F95A, "M", "穊"), + (0x2F95B, "M", "穏"), + (0x2F95C, "M", "𥥼"), + (0x2F95D, "M", "𥪧"), + (0x2F95F, "M", "竮"), + (0x2F960, "M", "䈂"), + (0x2F961, "M", "𥮫"), + (0x2F962, "M", "篆"), + (0x2F963, "M", "築"), + (0x2F964, "M", "䈧"), + (0x2F965, "M", "𥲀"), + (0x2F966, "M", "糒"), + (0x2F967, "M", "䊠"), + (0x2F968, "M", "糨"), + (0x2F969, "M", "糣"), + (0x2F96A, "M", "紀"), + (0x2F96B, "M", "𥾆"), + (0x2F96C, "M", "絣"), + (0x2F96D, "M", "䌁"), + (0x2F96E, "M", "緇"), + (0x2F96F, "M", "縂"), + (0x2F970, "M", "繅"), + (0x2F971, "M", "䌴"), + (0x2F972, "M", "𦈨"), + (0x2F973, "M", "𦉇"), + (0x2F974, "M", "䍙"), + (0x2F975, "M", "𦋙"), + (0x2F976, "M", "罺"), + (0x2F977, "M", "𦌾"), + (0x2F978, "M", "羕"), + (0x2F979, "M", "翺"), + (0x2F97A, "M", "者"), + (0x2F97B, "M", "𦓚"), + (0x2F97C, "M", "𦔣"), + (0x2F97D, "M", "聠"), + (0x2F97E, "M", "𦖨"), + (0x2F97F, "M", "聰"), + (0x2F980, "M", "𣍟"), + (0x2F981, "M", "䏕"), + (0x2F982, "M", "育"), + (0x2F983, "M", "脃"), + (0x2F984, "M", "䐋"), + (0x2F985, "M", "脾"), + (0x2F986, "M", "媵"), + (0x2F987, "M", "𦞧"), + (0x2F988, "M", "𦞵"), + (0x2F989, "M", "𣎓"), + (0x2F98A, "M", "𣎜"), + (0x2F98B, "M", "舁"), + (0x2F98C, "M", "舄"), + (0x2F98D, "M", "辞"), + (0x2F98E, "M", "䑫"), + (0x2F98F, "M", "芑"), + (0x2F990, "M", "芋"), + (0x2F991, "M", "芝"), + (0x2F992, "M", "劳"), + (0x2F993, "M", "花"), + (0x2F994, "M", "芳"), + (0x2F995, "M", "芽"), + (0x2F996, "M", "苦"), + (0x2F997, "M", "𦬼"), + (0x2F998, "M", "若"), + (0x2F999, "M", "茝"), + (0x2F99A, "M", "荣"), + (0x2F99B, "M", "莭"), + (0x2F99C, "M", "茣"), + (0x2F99D, "M", "莽"), + (0x2F99E, "M", "菧"), 
+ (0x2F99F, "M", "著"), + (0x2F9A0, "M", "荓"), + (0x2F9A1, "M", "菊"), + (0x2F9A2, "M", "菌"), + (0x2F9A3, "M", "菜"), + (0x2F9A4, "M", "𦰶"), + (0x2F9A5, "M", "𦵫"), + (0x2F9A6, "M", "𦳕"), + (0x2F9A7, "M", "䔫"), + ] + + +def _seg_82() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ + (0x2F9A8, "M", "蓱"), + (0x2F9A9, "M", "蓳"), + (0x2F9AA, "M", "蔖"), + (0x2F9AB, "M", "𧏊"), + (0x2F9AC, "M", "蕤"), + (0x2F9AD, "M", "𦼬"), + (0x2F9AE, "M", "䕝"), + (0x2F9AF, "M", "䕡"), + (0x2F9B0, "M", "𦾱"), + (0x2F9B1, "M", "𧃒"), + (0x2F9B2, "M", "䕫"), + (0x2F9B3, "M", "虐"), + (0x2F9B4, "M", "虜"), + (0x2F9B5, "M", "虧"), + (0x2F9B6, "M", "虩"), + (0x2F9B7, "M", "蚩"), + (0x2F9B8, "M", "蚈"), + (0x2F9B9, "M", "蜎"), + (0x2F9BA, "M", "蛢"), + (0x2F9BB, "M", "蝹"), + (0x2F9BC, "M", "蜨"), + (0x2F9BD, "M", "蝫"), + (0x2F9BE, "M", "螆"), + (0x2F9BF, "M", "䗗"), + (0x2F9C0, "M", "蟡"), + (0x2F9C1, "M", "蠁"), + (0x2F9C2, "M", "䗹"), + (0x2F9C3, "M", "衠"), + (0x2F9C4, "M", "衣"), + (0x2F9C5, "M", "𧙧"), + (0x2F9C6, "M", "裗"), + (0x2F9C7, "M", "裞"), + (0x2F9C8, "M", "䘵"), + (0x2F9C9, "M", "裺"), + (0x2F9CA, "M", "㒻"), + (0x2F9CB, "M", "𧢮"), + (0x2F9CC, "M", "𧥦"), + (0x2F9CD, "M", "䚾"), + (0x2F9CE, "M", "䛇"), + (0x2F9CF, "M", "誠"), + (0x2F9D0, "M", "諭"), + (0x2F9D1, "M", "變"), + (0x2F9D2, "M", "豕"), + (0x2F9D3, "M", "𧲨"), + (0x2F9D4, "M", "貫"), + (0x2F9D5, "M", "賁"), + (0x2F9D6, "M", "贛"), + (0x2F9D7, "M", "起"), + (0x2F9D8, "M", "𧼯"), + (0x2F9D9, "M", "𠠄"), + (0x2F9DA, "M", "跋"), + (0x2F9DB, "M", "趼"), + (0x2F9DC, "M", "跰"), + (0x2F9DD, "M", "𠣞"), + (0x2F9DE, "M", "軔"), + (0x2F9DF, "M", "輸"), + (0x2F9E0, "M", "𨗒"), + (0x2F9E1, "M", "𨗭"), + (0x2F9E2, "M", "邔"), + (0x2F9E3, "M", "郱"), + (0x2F9E4, "M", "鄑"), + (0x2F9E5, "M", "𨜮"), + (0x2F9E6, "M", "鄛"), + (0x2F9E7, "M", "鈸"), + (0x2F9E8, "M", "鋗"), + (0x2F9E9, "M", "鋘"), + (0x2F9EA, "M", "鉼"), + (0x2F9EB, "M", "鏹"), + (0x2F9EC, "M", "鐕"), + (0x2F9ED, "M", "𨯺"), + (0x2F9EE, "M", "開"), + (0x2F9EF, "M", "䦕"), + (0x2F9F0, "M", "閷"), + (0x2F9F1, "M", "𨵷"), + 
(0x2F9F2, "M", "䧦"), + (0x2F9F3, "M", "雃"), + (0x2F9F4, "M", "嶲"), + (0x2F9F5, "M", "霣"), + (0x2F9F6, "M", "𩅅"), + (0x2F9F7, "M", "𩈚"), + (0x2F9F8, "M", "䩮"), + (0x2F9F9, "M", "䩶"), + (0x2F9FA, "M", "韠"), + (0x2F9FB, "M", "𩐊"), + (0x2F9FC, "M", "䪲"), + (0x2F9FD, "M", "𩒖"), + (0x2F9FE, "M", "頋"), + (0x2FA00, "M", "頩"), + (0x2FA01, "M", "𩖶"), + (0x2FA02, "M", "飢"), + (0x2FA03, "M", "䬳"), + (0x2FA04, "M", "餩"), + (0x2FA05, "M", "馧"), + (0x2FA06, "M", "駂"), + (0x2FA07, "M", "駾"), + (0x2FA08, "M", "䯎"), + (0x2FA09, "M", "𩬰"), + (0x2FA0A, "M", "鬒"), + (0x2FA0B, "M", "鱀"), + (0x2FA0C, "M", "鳽"), + ] + + +def _seg_83() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]: + return [ + (0x2FA0D, "M", "䳎"), + (0x2FA0E, "M", "䳭"), + (0x2FA0F, "M", "鵧"), + (0x2FA10, "M", "𪃎"), + (0x2FA11, "M", "䳸"), + (0x2FA12, "M", "𪄅"), + (0x2FA13, "M", "𪈎"), + (0x2FA14, "M", "𪊑"), + (0x2FA15, "M", "麻"), + (0x2FA16, "M", "䵖"), + (0x2FA17, "M", "黹"), + (0x2FA18, "M", "黾"), + (0x2FA19, "M", "鼅"), + (0x2FA1A, "M", "鼏"), + (0x2FA1B, "M", "鼖"), + (0x2FA1C, "M", "鼻"), + (0x2FA1D, "M", "𪘀"), + (0x2FA1E, "X"), + (0x30000, "V"), + (0x3134B, "X"), + (0x31350, "V"), + (0x323B0, "X"), + (0xE0100, "I"), + (0xE01F0, "X"), + ] + + +uts46data = tuple( + _seg_0() + + _seg_1() + + _seg_2() + + _seg_3() + + _seg_4() + + _seg_5() + + _seg_6() + + _seg_7() + + _seg_8() + + _seg_9() + + _seg_10() + + _seg_11() + + _seg_12() + + _seg_13() + + _seg_14() + + _seg_15() + + _seg_16() + + _seg_17() + + _seg_18() + + _seg_19() + + _seg_20() + + _seg_21() + + _seg_22() + + _seg_23() + + _seg_24() + + _seg_25() + + _seg_26() + + _seg_27() + + _seg_28() + + _seg_29() + + _seg_30() + + _seg_31() + + _seg_32() + + _seg_33() + + _seg_34() + + _seg_35() + + _seg_36() + + _seg_37() + + _seg_38() + + _seg_39() + + _seg_40() + + _seg_41() + + _seg_42() + + _seg_43() + + _seg_44() + + _seg_45() + + _seg_46() + + _seg_47() + + _seg_48() + + _seg_49() + + _seg_50() + + _seg_51() + + _seg_52() + + _seg_53() + + _seg_54() + + 
_seg_55() + + _seg_56() + + _seg_57() + + _seg_58() + + _seg_59() + + _seg_60() + + _seg_61() + + _seg_62() + + _seg_63() + + _seg_64() + + _seg_65() + + _seg_66() + + _seg_67() + + _seg_68() + + _seg_69() + + _seg_70() + + _seg_71() + + _seg_72() + + _seg_73() + + _seg_74() + + _seg_75() + + _seg_76() + + _seg_77() + + _seg_78() + + _seg_79() + + _seg_80() + + _seg_81() + + _seg_82() + + _seg_83() +) # type: Tuple[Union[Tuple[int, str], Tuple[int, str, str]], ...] diff --git a/importlib_metadata/__init__.py b/importlib_metadata/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..334a0916e44a096237886ef8e1e9cebaee229821 --- /dev/null +++ b/importlib_metadata/__init__.py @@ -0,0 +1,1191 @@ +""" +APIs exposing metadata from third-party Python packages. + +This codebase is shared between importlib.metadata in the stdlib +and importlib_metadata in PyPI. See +https://github.com/python/importlib_metadata/wiki/Development-Methodology +for more detail. +""" + +from __future__ import annotations + +import abc +import collections +import email +import functools +import itertools +import operator +import os +import pathlib +import posixpath +import re +import sys +import textwrap +import types +from collections.abc import Iterable, Mapping +from contextlib import suppress +from importlib import import_module +from importlib.abc import MetaPathFinder +from itertools import starmap +from typing import Any + +from . 
import _meta +from ._collections import FreezableDefaultDict, Pair +from ._compat import ( + NullFinder, + install, +) +from ._functools import method_cache, noop, pass_none, passthrough +from ._itertools import always_iterable, bucket, unique_everseen +from ._meta import PackageMetadata, SimplePath +from ._typing import md_none +from .compat import py311 + +__all__ = [ + 'Distribution', + 'DistributionFinder', + 'PackageMetadata', + 'PackageNotFoundError', + 'SimplePath', + 'distribution', + 'distributions', + 'entry_points', + 'files', + 'metadata', + 'packages_distributions', + 'requires', + 'version', +] + + +class PackageNotFoundError(ModuleNotFoundError): + """The package was not found.""" + + def __str__(self) -> str: + return f"No package metadata was found for {self.name}" + + @property + def name(self) -> str: # type: ignore[override] # make readonly + (name,) = self.args + return name + + +class Sectioned: + """ + A simple entry point config parser for performance + + >>> for item in Sectioned.read(Sectioned._sample): + ... 
print(item) + Pair(name='sec1', value='# comments ignored') + Pair(name='sec1', value='a = 1') + Pair(name='sec1', value='b = 2') + Pair(name='sec2', value='a = 2') + + >>> res = Sectioned.section_pairs(Sectioned._sample) + >>> item = next(res) + >>> item.name + 'sec1' + >>> item.value + Pair(name='a', value='1') + >>> item = next(res) + >>> item.value + Pair(name='b', value='2') + >>> item = next(res) + >>> item.name + 'sec2' + >>> item.value + Pair(name='a', value='2') + >>> list(res) + [] + """ + + _sample = textwrap.dedent( + """ + [sec1] + # comments ignored + a = 1 + b = 2 + + [sec2] + a = 2 + """ + ).lstrip() + + @classmethod + def section_pairs(cls, text): + return ( + section._replace(value=Pair.parse(section.value)) + for section in cls.read(text, filter_=cls.valid) + if section.name is not None + ) + + @staticmethod + def read(text, filter_=None): + lines = filter(filter_, map(str.strip, text.splitlines())) + name = None + for value in lines: + section_match = value.startswith('[') and value.endswith(']') + if section_match: + name = value.strip('[]') + continue + yield Pair(name, value) + + @staticmethod + def valid(line: str): + return line and not line.startswith('#') + + +class _EntryPointMatch(types.SimpleNamespace): + module: str + attr: str + extras: str + + +class EntryPoint: + """An entry point as defined by Python packaging conventions. + + See `the packaging docs on entry points + `_ + for more information. + + >>> ep = EntryPoint( + ... name=None, group=None, value='package.module:attr [extra1, extra2]') + >>> ep.module + 'package.module' + >>> ep.attr + 'attr' + >>> ep.extras + ['extra1', 'extra2'] + + If the value package or module are not valid identifiers, a + ValueError is raised on access. + + >>> EntryPoint(name=None, group=None, value='invalid-name').module + Traceback (most recent call last): + ... + ValueError: ('Invalid object reference...invalid-name... 
+ >>> EntryPoint(name=None, group=None, value='invalid-name').attr + Traceback (most recent call last): + ... + ValueError: ('Invalid object reference...invalid-name... + >>> EntryPoint(name=None, group=None, value='invalid-name').extras + Traceback (most recent call last): + ... + ValueError: ('Invalid object reference...invalid-name... + + The same thing happens on construction. + + >>> EntryPoint(name=None, group=None, value='invalid-name') + Traceback (most recent call last): + ... + ValueError: ('Invalid object reference...invalid-name... + + """ + + pattern = re.compile( + r'(?P[\w.]+)\s*' + r'(:\s*(?P[\w.]+)\s*)?' + r'((?P\[.*\])\s*)?$' + ) + """ + A regular expression describing the syntax for an entry point, + which might look like: + + - module + - package.module + - package.module:attribute + - package.module:object.attribute + - package.module:attr [extra1, extra2] + + Other combinations are possible as well. + + The expression is lenient about whitespace around the ':', + following the attr, and following any extras. + """ + + name: str + value: str + group: str + + dist: Distribution | None = None + + def __init__(self, name: str, value: str, group: str) -> None: + vars(self).update(name=name, value=value, group=group) + self.module + + def load(self) -> Any: + """Load the entry point from its definition. If only a module + is indicated by the value, return that module. Otherwise, + return the named object. 
+ """ + module = import_module(self.module) + attrs = filter(None, (self.attr or '').split('.')) + return functools.reduce(getattr, attrs, module) + + @property + def module(self) -> str: + return self._match.module + + @property + def attr(self) -> str: + return self._match.attr + + @property + def extras(self) -> list[str]: + return re.findall(r'\w+', self._match.extras or '') + + @functools.cached_property + def _match(self) -> _EntryPointMatch: + match = self.pattern.match(self.value) + if not match: + raise ValueError( + 'Invalid object reference. ' + 'See https://packaging.python.org' + '/en/latest/specifications/entry-points/#data-model', + self.value, + ) + return _EntryPointMatch(**match.groupdict()) + + def _for(self, dist): + vars(self).update(dist=dist) + return self + + def matches(self, **params): + """ + EntryPoint matches the given parameters. + + >>> ep = EntryPoint(group='foo', name='bar', value='bing:bong [extra1, extra2]') + >>> ep.matches(group='foo') + True + >>> ep.matches(name='bar', value='bing:bong [extra1, extra2]') + True + >>> ep.matches(group='foo', name='other') + False + >>> ep.matches() + True + >>> ep.matches(extras=['extra1', 'extra2']) + True + >>> ep.matches(module='bing') + True + >>> ep.matches(attr='bong') + True + """ + self._disallow_dist(params) + attrs = (getattr(self, param) for param in params) + return all(map(operator.eq, params.values(), attrs)) + + @staticmethod + def _disallow_dist(params): + """ + Querying by dist is not allowed (dist objects are not comparable). + >>> EntryPoint(name='fan', value='fav', group='fag').matches(dist='foo') + Traceback (most recent call last): + ... + ValueError: "dist" is not suitable for matching... + """ + if "dist" in params: + raise ValueError( + '"dist" is not suitable for matching. ' + "Instead, use Distribution.entry_points.select() on a " + "located distribution." 
+ ) + + def _key(self): + return self.name, self.value, self.group + + def __lt__(self, other): + return self._key() < other._key() + + def __eq__(self, other): + return self._key() == other._key() + + def __setattr__(self, name, value): + raise AttributeError("EntryPoint objects are immutable.") + + def __repr__(self): + return ( + f'EntryPoint(name={self.name!r}, value={self.value!r}, ' + f'group={self.group!r})' + ) + + def __hash__(self) -> int: + return hash(self._key()) + + +class EntryPoints(tuple): + """ + An immutable collection of selectable EntryPoint objects. + """ + + __slots__ = () + + def __getitem__(self, name: str) -> EntryPoint: # type: ignore[override] # Work with str instead of int + """ + Get the EntryPoint in self matching name. + """ + try: + return next(iter(self.select(name=name))) + except StopIteration: + raise KeyError(name) + + def __repr__(self): + """ + Repr with classname and tuple constructor to + signal that we deviate from regular tuple behavior. + """ + return '%s(%r)' % (self.__class__.__name__, tuple(self)) + + def select(self, **params) -> EntryPoints: + """ + Select entry points from self that match the + given parameters (typically group and/or name). + """ + return EntryPoints(ep for ep in self if ep.matches(**params)) + + @property + def names(self) -> set[str]: + """ + Return the set of all names of all entry points. + """ + return {ep.name for ep in self} + + @property + def groups(self) -> set[str]: + """ + Return the set of all groups of all entry points. 
+ """ + return {ep.group for ep in self} + + @classmethod + def _from_text_for(cls, text, dist): + return cls(ep._for(dist) for ep in cls._from_text(text)) + + @staticmethod + def _from_text(text): + return ( + EntryPoint(name=item.value.name, value=item.value.value, group=item.name) + for item in Sectioned.section_pairs(text or '') + ) + + +class PackagePath(pathlib.PurePosixPath): + """A reference to a path in a package""" + + hash: FileHash | None + size: int + dist: Distribution + + def read_text(self, encoding: str = 'utf-8') -> str: + return self.locate().read_text(encoding=encoding) + + def read_binary(self) -> bytes: + return self.locate().read_bytes() + + def locate(self) -> SimplePath: + """Return a path-like object for this path""" + return self.dist.locate_file(self) + + +class FileHash: + def __init__(self, spec: str) -> None: + self.mode, _, self.value = spec.partition('=') + + def __repr__(self) -> str: + return f'' + + +class Distribution(metaclass=abc.ABCMeta): + """ + An abstract Python distribution package. + + Custom providers may derive from this class and define + the abstract methods to provide a concrete implementation + for their environment. Some providers may opt to override + the default implementation of some properties to bypass + the file-reading mechanism. + """ + + @abc.abstractmethod + def read_text(self, filename) -> str | None: + """Attempt to load metadata file given by the name. + + Python distribution metadata is organized by blobs of text + typically represented as "files" in the metadata directory + (e.g. package-1.0.dist-info). These files include things + like: + + - METADATA: The distribution metadata including fields + like Name and Version and Description. + - entry_points.txt: A series of entry points as defined in + `the entry points spec `_. + - RECORD: A record of files according to + `this recording spec `_. + + A package may provide any set of files, including those + not listed here or none at all. 
+ + :param filename: The name of the file in the distribution info. + :return: The text if found, otherwise None. + """ + + @abc.abstractmethod + def locate_file(self, path: str | os.PathLike[str]) -> SimplePath: + """ + Given a path to a file in this distribution, return a SimplePath + to it. + + This method is used by callers of ``Distribution.files()`` to + locate files within the distribution. If it's possible for a + Distribution to represent files in the distribution as + ``SimplePath`` objects, it should implement this method + to resolve such objects. + + Some Distribution providers may elect not to resolve SimplePath + objects within the distribution by raising a + NotImplementedError, but consumers of such a Distribution would + be unable to invoke ``Distribution.files()``. + """ + + @classmethod + def from_name(cls, name: str) -> Distribution: + """Return the Distribution for the given package name. + + :param name: The name of the distribution package to search for. + :return: The Distribution instance (or subclass thereof) for the named + package, if found. + :raises PackageNotFoundError: When the named package's distribution + metadata cannot be found. + :raises ValueError: When an invalid value is supplied for name. + """ + if not name: + raise ValueError("A distribution name is required.") + try: + return next(iter(cls._prefer_valid(cls.discover(name=name)))) + except StopIteration: + raise PackageNotFoundError(name) + + @classmethod + def discover( + cls, *, context: DistributionFinder.Context | None = None, **kwargs + ) -> Iterable[Distribution]: + """Return an iterable of Distribution objects for all packages. + + Pass a ``context`` or pass keyword arguments for constructing + a context. + + :context: A ``DistributionFinder.Context`` object. + :return: Iterable of Distribution objects for packages matching + the context. 
+ """ + if context and kwargs: + raise ValueError("cannot accept context and kwargs") + context = context or DistributionFinder.Context(**kwargs) + return itertools.chain.from_iterable( + resolver(context) for resolver in cls._discover_resolvers() + ) + + @staticmethod + def _prefer_valid(dists: Iterable[Distribution]) -> Iterable[Distribution]: + """ + Prefer (move to the front) distributions that have metadata. + + Ref python/importlib_resources#489. + """ + buckets = bucket(dists, lambda dist: bool(dist.metadata)) + return itertools.chain(buckets[True], buckets[False]) + + @staticmethod + def at(path: str | os.PathLike[str]) -> Distribution: + """Return a Distribution for the indicated metadata path. + + :param path: a string or path-like object + :return: a concrete Distribution instance for the path + """ + return PathDistribution(pathlib.Path(path)) + + @staticmethod + def _discover_resolvers(): + """Search the meta_path for resolvers (MetadataPathFinders).""" + declared = ( + getattr(finder, 'find_distributions', None) for finder in sys.meta_path + ) + return filter(None, declared) + + @property + def metadata(self) -> _meta.PackageMetadata | None: + """Return the parsed metadata for this Distribution. + + The returned object will have keys that name the various bits of + metadata per the + `Core metadata specifications `_. + + Custom providers may provide the METADATA file or override this + property. + """ + + text = ( + self.read_text('METADATA') + or self.read_text('PKG-INFO') + # This last clause is here to support old egg-info files. Its + # effect is to just end up using the PathDistribution's self._path + # (which points to the egg-info file) attribute unchanged. + or self.read_text('') + ) + return self._assemble_message(text) + + @staticmethod + @pass_none + def _assemble_message(text: str) -> _meta.PackageMetadata: + # deferred for performance (python/cpython#109829) + from . 
import _adapters + + return _adapters.Message(email.message_from_string(text)) + + @property + def name(self) -> str: + """Return the 'Name' metadata for the distribution package.""" + return md_none(self.metadata)['Name'] + + @property + def _normalized_name(self): + """Return a normalized version of the name.""" + return Prepared.normalize(self.name) + + @property + def version(self) -> str: + """Return the 'Version' metadata for the distribution package.""" + return md_none(self.metadata)['Version'] + + @property + def entry_points(self) -> EntryPoints: + """ + Return EntryPoints for this distribution. + + Custom providers may provide the ``entry_points.txt`` file + or override this property. + """ + return EntryPoints._from_text_for(self.read_text('entry_points.txt'), self) + + @property + def files(self) -> list[PackagePath] | None: + """Files in this distribution. + + :return: List of PackagePath for this distribution or None + + Result is `None` if the metadata file that enumerates files + (i.e. RECORD for dist-info, or installed-files.txt or + SOURCES.txt for egg-info) is missing. + Result may be empty if the metadata exists but is empty. + + Custom providers are recommended to provide a "RECORD" file (in + ``read_text``) or override this property to allow for callers to be + able to resolve filenames provided by the package. 
+ """ + + def make_file(name, hash=None, size_str=None): + result = PackagePath(name) + result.hash = FileHash(hash) if hash else None + result.size = int(size_str) if size_str else None + result.dist = self + return result + + @pass_none + def make_files(lines): + # Delay csv import, since Distribution.files is not as widely used + # as other parts of importlib.metadata + import csv + + return starmap(make_file, csv.reader(lines)) + + @pass_none + def skip_missing_files(package_paths): + return list(filter(lambda path: path.locate().exists(), package_paths)) + + return skip_missing_files( + make_files( + self._read_files_distinfo() + or self._read_files_egginfo_installed() + or self._read_files_egginfo_sources() + ) + ) + + def _read_files_distinfo(self): + """ + Read the lines of RECORD. + """ + text = self.read_text('RECORD') + return text and text.splitlines() + + def _read_files_egginfo_installed(self): + """ + Read installed-files.txt and return lines in a similar + CSV-parsable format as RECORD: each file must be placed + relative to the site-packages directory and must also be + quoted (since file names can contain literal commas). + + This file is written when the package is installed by pip, + but it might not be written for other installation methods. + Assume the file is accurate if it exists. + """ + text = self.read_text('installed-files.txt') + # Prepend the .egg-info/ subdir to the lines in this file. + # But this subdir is only available from PathDistribution's + # self._path. 
+ subdir = getattr(self, '_path', None) + if not text or not subdir: + return + + paths = ( + py311 + .relative_fix((subdir / name).resolve()) + .relative_to(self.locate_file('').resolve(), walk_up=True) + .as_posix() + for name in text.splitlines() + ) + return map('"{}"'.format, paths) + + def _read_files_egginfo_sources(self): + """ + Read SOURCES.txt and return lines in a similar CSV-parsable + format as RECORD: each file name must be quoted (since it + might contain literal commas). + + Note that SOURCES.txt is not a reliable source for what + files are installed by a package. This file is generated + for a source archive, and the files that are present + there (e.g. setup.py) may not correctly reflect the files + that are present after the package has been installed. + """ + text = self.read_text('SOURCES.txt') + return text and map('"{}"'.format, text.splitlines()) + + @property + def requires(self) -> list[str] | None: + """Generated requirements specified for this Distribution""" + reqs = self._read_dist_info_reqs() or self._read_egg_info_reqs() + return reqs and list(reqs) + + def _read_dist_info_reqs(self): + return self.metadata.get_all('Requires-Dist') + + def _read_egg_info_reqs(self): + source = self.read_text('requires.txt') + return pass_none(self._deps_from_requires_text)(source) + + @classmethod + def _deps_from_requires_text(cls, source): + return cls._convert_egg_info_reqs_to_simple_reqs(Sectioned.read(source)) + + @staticmethod + def _convert_egg_info_reqs_to_simple_reqs(sections): + """ + Historically, setuptools would solicit and store 'extra' + requirements, including those with environment markers, + in separate sections. More modern tools expect each + dependency to be defined separately, with any relevant + extras and environment markers attached directly to that + requirement. This method converts the former to the + latter. See _test_deps_from_requires_text for an example. 
+ """ + + def make_condition(name): + return name and f'extra == "{name}"' + + def quoted_marker(section): + section = section or '' + extra, sep, markers = section.partition(':') + if extra and markers: + markers = f'({markers})' + conditions = list(filter(None, [markers, make_condition(extra)])) + return '; ' + ' and '.join(conditions) if conditions else '' + + def url_req_space(req): + """ + PEP 508 requires a space between the url_spec and the quoted_marker. + Ref python/importlib_metadata#357. + """ + # '@' is uniquely indicative of a url_req. + return ' ' * ('@' in req) + + for section in sections: + space = url_req_space(section.value) + yield section.value + space + quoted_marker(section.name) + + @property + def origin(self): + return self._load_json('direct_url.json') + + def _load_json(self, filename): + # Deferred for performance (python/importlib_metadata#503) + import json + + return pass_none(json.loads)( + self.read_text(filename), + object_hook=lambda data: types.SimpleNamespace(**data), + ) + + +class DistributionFinder(MetaPathFinder): + """ + A MetaPathFinder capable of discovering installed distributions. + + Custom providers should implement this interface in order to + supply metadata. + """ + + class Context: + """ + Keyword arguments presented by the caller to + ``distributions()`` or ``Distribution.discover()`` + to narrow the scope of a search for distributions + in all DistributionFinders. + + Each DistributionFinder may expect any parameters + and should attempt to honor the canonical + parameters defined below when appropriate. + + This mechanism gives a custom provider a means to + solicit additional details from the caller beyond + "name" and "path" when searching distributions. + For example, imagine a provider that exposes suites + of packages in either a "public" or "private" ``realm``. 
+ A caller may wish to query only for distributions in + a particular realm and could call + ``distributions(realm="private")`` to signal to the + custom provider to only include distributions from that + realm. + """ + + name = None + """ + Specific name for which a distribution finder should match. + A name of ``None`` matches all distributions. + """ + + def __init__(self, **kwargs): + vars(self).update(kwargs) + + @property + def path(self) -> list[str]: + """ + The sequence of directory path that a distribution finder + should search. + + Typically refers to Python installed package paths such as + "site-packages" directories and defaults to ``sys.path``. + """ + return vars(self).get('path', sys.path) + + @abc.abstractmethod + def find_distributions(self, context=Context()) -> Iterable[Distribution]: + """ + Find distributions. + + Return an iterable of all Distribution instances capable of + loading the metadata for packages matching the ``context``, + a DistributionFinder.Context instance. + """ + + +@passthrough +def _clear_after_fork(cached): + """Ensure ``func`` clears cached state after ``fork`` when supported. + + ``FastPath`` caches zip-backed ``pathlib.Path`` objects that retain a + reference to the parent's open ``ZipFile`` handle. Re-using a cached + instance in a forked child can therefore resurrect invalid file pointers + and trigger ``BadZipFile``/``OSError`` failures (python/importlib_metadata#520). + Registering ``cache_clear`` with ``os.register_at_fork`` keeps each process + on its own cache. + """ + getattr(os, 'register_at_fork', noop)(after_in_child=cached.cache_clear) + + +class FastPath: + """ + Micro-optimized class for searching a root for children. + + Root is a path on the file system that may contain metadata + directories either as natural directories or within a zip file. + + >>> FastPath('').children() + ['...'] + + FastPath objects are cached and recycled for any given root. 
def children(self):
    """
    Return the names found directly under ``self.root``.

    The root is first listed as a directory (an empty root means the
    current directory).  If that fails for any reason, ``zip_children()``
    is tried instead.  If that fails as well, the result is an empty list.
    """
    try:
        return os.listdir(self.root or '.')
    except Exception:
        # Not a listable directory; fall through to the zip lookup.
        pass
    try:
        return self.zip_children()
    except Exception:
        # Not a readable zip either; report no children.
        pass
    return []
class Prepared:
    """
    A search query for the metadata of a package that may or may not
    be named.

    Both normalized spellings are computed once, at construction, so
    they are not recomputed for every lookup.

    >>> none = Prepared(None)
    >>> none.normalized
    >>> none.legacy_normalized
    >>> bool(none)
    False
    >>> sample = Prepared('Sample__Pkg-name.foo')
    >>> sample.normalized
    'sample_pkg_name_foo'
    >>> sample.legacy_normalized
    'sample__pkg_name.foo'
    >>> bool(sample)
    True
    """

    # Class-level defaults; they stay in effect when the query has no name.
    normalized = None
    legacy_normalized = None

    def __init__(self, name: str | None):
        self.name = name
        if name is not None:
            self.normalized = self.normalize(name)
            self.legacy_normalized = self.legacy_normalize(name)

    @staticmethod
    def normalize(name):
        """
        PEP 503 normalization plus dashes as underscores.
        """
        # Collapse every run of '-', '_' and '.' to a single dash first,
        # then lower-case and swap the dashes for underscores.
        collapsed = re.sub(r"[-_.]+", "-", name)
        lowered = collapsed.lower()
        return lowered.replace('-', '_')

    @staticmethod
    def legacy_normalize(name):
        """
        Normalize the name the way older packaging tools and specs did:
        lower-case it and turn dashes into underscores.
        """
        lowered = name.lower()
        return lowered.replace('-', '_')

    def __bool__(self):
        # A query is truthy exactly when it carries a non-empty name.
        return True if self.name else False
+ """ + stem = os.path.basename(str(self._path)) + return ( + pass_none(Prepared.normalize)(self._name_from_stem(stem)) + or super()._normalized_name + ) + + @staticmethod + def _name_from_stem(stem): + """ + >>> PathDistribution._name_from_stem('foo-3.0.egg-info') + 'foo' + >>> PathDistribution._name_from_stem('CherryPy-3.0.dist-info') + 'CherryPy' + >>> PathDistribution._name_from_stem('face.egg-info') + 'face' + >>> PathDistribution._name_from_stem('foo.bar') + """ + filename, ext = os.path.splitext(stem) + if ext not in ('.dist-info', '.egg-info'): + return + name, sep, rest = filename.partition('-') + return name + + +def distribution(distribution_name: str) -> Distribution: + """Get the ``Distribution`` instance for the named package. + + :param distribution_name: The name of the distribution package as a string. + :return: A ``Distribution`` instance (or subclass thereof). + """ + return Distribution.from_name(distribution_name) + + +def distributions(**kwargs) -> Iterable[Distribution]: + """Get all ``Distribution`` instances in the current environment. + + :return: An iterable of ``Distribution`` instances. + """ + return Distribution.discover(**kwargs) + + +def metadata(distribution_name: str) -> _meta.PackageMetadata | None: + """Get the metadata for the named package. + + :param distribution_name: The name of the distribution package to query. + :return: A PackageMetadata containing the parsed metadata. + """ + return Distribution.from_name(distribution_name).metadata + + +def version(distribution_name: str) -> str: + """Get the version string for the named package. + + :param distribution_name: The name of the distribution package to query. + :return: The version string for the package as defined in the package's + "Version" metadata key. 
def entry_points(**params) -> EntryPoints:
    """Return EntryPoint objects for all installed packages.

    Distributions are first de-duplicated with ``_unique``, then the
    entry points of each remaining distribution are gathered into one
    ``EntryPoints`` collection.

    Any selection parameters (such as group or name) are forwarded to
    ``EntryPoints.select()`` to filter the result.

    :return: EntryPoints for all installed packages.
    """
    unique_dists = _unique(distributions())
    gathered = (ep for dist in unique_dists for ep in dist.entry_points)
    return EntryPoints(gathered).select(**params)
+ + >>> import collections.abc + >>> pkgs = packages_distributions() + >>> all(isinstance(dist, collections.abc.Sequence) for dist in pkgs.values()) + True + """ + pkg_to_dist = collections.defaultdict(list) + for dist in distributions(): + for pkg in _top_level_declared(dist) or _top_level_inferred(dist): + pkg_to_dist[pkg].append(md_none(dist.metadata)['Name']) + return dict(pkg_to_dist) + + +def _top_level_declared(dist): + return (dist.read_text('top_level.txt') or '').split() + + +def _topmost(name: PackagePath) -> str | None: + """ + Return the top-most parent as long as there is a parent. + """ + top, *rest = name.parts + return top if rest else None + + +def _get_toplevel_name(name: PackagePath) -> str: + """ + Infer a possibly importable module name from a name presumed on + sys.path. + + >>> _get_toplevel_name(PackagePath('foo.py')) + 'foo' + >>> _get_toplevel_name(PackagePath('foo')) + 'foo' + >>> _get_toplevel_name(PackagePath('foo.pyc')) + 'foo' + >>> _get_toplevel_name(PackagePath('foo/__init__.py')) + 'foo' + >>> _get_toplevel_name(PackagePath('foo.pth')) + 'foo.pth' + >>> _get_toplevel_name(PackagePath('foo.dist-info')) + 'foo.dist-info' + """ + # Defer import of inspect for performance (python/cpython#118761) + import inspect + + return _topmost(name) or inspect.getmodulename(name) or str(name) + + +def _top_level_inferred(dist): + opt_names = set(map(_get_toplevel_name, always_iterable(dist.files))) + + def importable_name(name): + return '.' 
class RawPolicy(email.policy.EmailPolicy):
    """Email policy whose ``fold`` emits header values line by line."""

    def fold(self, name, value):
        """
        Render one header as ``name: value`` followed by the line separator.

        Every line of ``value`` is indented by eight spaces (blank lines
        too, since the predicate always accepts).  Leading whitespace is
        then stripped from the whole text, which leaves the first line
        flush and the following lines indented as continuations.  The
        lines are rejoined with the policy's line separator.
        """
        continuation = ' ' * 8
        indented = textwrap.indent(
            value, prefix=continuation, predicate=lambda line: True
        )
        lines = indented.lstrip().splitlines()
        folded = self.linesep.join(lines)
        return f'{name}: {folded}{self.linesep}'
+ """ + + def __new__(cls, orig: email.message.Message): + res = super().__new__(cls) + vars(res).update(vars(orig)) + return res + + def __init__(self, *args, **kwargs): + self._headers = self._repair_headers() + + # suppress spurious error from mypy + def __iter__(self): + return super().__iter__() + + def __getitem__(self, item): + """ + Override parent behavior to typical dict behavior. + + ``email.message.Message`` will emit None values for missing + keys. Typical mappings, including this ``Message``, will raise + a key error for missing keys. + + Ref python/importlib_metadata#371. + """ + res = super().__getitem__(item) + if res is None: + raise KeyError(item) + return res + + def _repair_headers(self): + def redent(value): + "Correct for RFC822 indentation" + indent = ' ' * 8 + if not value or '\n' + indent not in value: + return value + return textwrap.dedent(indent + value) + + headers = [(key, redent(value)) for key, value in vars(self)['_headers']] + if self._payload: + headers.append(('Description', self.get_payload())) + self.set_payload('') + return headers + + def as_string(self): + return super().as_string(policy=RawPolicy()) + + @property + def json(self): + """ + Convert PackageMetadata to a JSON-compatible format + per PEP 0566. 
class FreezableDefaultDict(collections.defaultdict):
    """
    A default dict that can be told to stop growing.

    After ``freeze()``, looking up a missing key still returns a fresh
    default value but no longer stores it, so the mapping cannot change
    size while it is being iterated.

    >>> dd = FreezableDefaultDict(list)
    >>> dd[0].append('1')
    >>> dd.freeze()
    >>> dd[1]
    []
    >>> len(dd)
    1
    """

    def __missing__(self, key):
        try:
            frozen = self._frozen
        except AttributeError:
            # Not frozen yet: regular defaultdict behavior (insert and return).
            return super().__missing__(key)
        return frozen(key)

    def freeze(self):
        """Make lookups of missing keys return a default without storing it."""

        def make_default(key):
            return self.default_factory()

        self._frozen = make_default
def disable_stdlib_finder():
    """
    Give the backport primacy for discovering path-based distributions
    by monkey-patching the stdlib O_O.

    Every entry of ``sys.meta_path`` whose ``__module__`` is
    ``_frozen_importlib_external`` and which still has a
    ``find_distributions`` attribute gets that attribute deleted.

    See #91 for more background for rationale on this sketchy
    behavior.
    """

    def is_stdlib_distribution_finder(candidate):
        module = getattr(candidate, '__module__', None)
        if module != '_frozen_importlib_external':
            return False
        return hasattr(candidate, 'find_distributions')

    # Test each entry just before deleting, so an entry listed twice is
    # only stripped once.
    for candidate in sys.meta_path:  # pragma: nocover
        if is_stdlib_distribution_finder(candidate):
            del candidate.find_distributions
res = a.method(x) + >>> a.calls + 75 + + Note that the apparent behavior will be exactly like that of lru_cache + except that the cache is stored on each instance, so values in one + instance will not flush values from another, and when an instance is + deleted, so are the cached values for that instance. + + >>> b = MyClass() + >>> for x in range(35): + ... res = b.method(x) + >>> b.calls + 35 + >>> a.method(0) + 0 + >>> a.calls + 75 + + Note that if method had been decorated with ``functools.lru_cache()``, + a.calls would have been 76 (due to the cached value of 0 having been + flushed by the 'b' instance). + + Clear the cache with ``.cache_clear()`` + + >>> a.method.cache_clear() + + Same for a method that hasn't yet been called. + + >>> c = MyClass() + >>> c.method.cache_clear() + + Another cache wrapper may be supplied: + + >>> cache = functools.lru_cache(maxsize=2) + >>> MyClass.method2 = method_cache(lambda self: 3, cache_wrapper=cache) + >>> a = MyClass() + >>> a.method2() + 3 + + Caution - do not subsequently wrap the method with another decorator, such + as ``@property``, which changes the semantics of the function. + + See also + http://code.activestate.com/recipes/577452-a-memoize-decorator-for-instance-methods/ + for another implementation and additional justification. + """ + cache_wrapper = cache_wrapper or functools.lru_cache() + + def wrapper(self, *args, **kwargs): + # it's the first call, replace the method with a cached, bound method + bound_method = types.MethodType(method, self) + cached_method = cache_wrapper(bound_method) + setattr(self, method.__name__, cached_method) + return cached_method(*args, **kwargs) + + # Support cache clear even before cache has been created. 
def pass_none(func):
    """
    Wrap func so that it is skipped when its first argument is None.

    The wrapper returns ``None`` in that case; otherwise it returns
    whatever ``func`` returns for the same arguments.

    >>> print_text = pass_none(print)
    >>> print_text('text')
    text
    >>> print_text(None)
    """

    @functools.wraps(func)
    def wrapper(param, *args, **kwargs):
        if param is None:
            return None
        return func(param, *args, **kwargs)

    return wrapper
def always_iterable(obj, base_type=(str, bytes)):
    """Return an iterator over *obj*, whatever *obj* is.

    An iterable is iterated as usual::

        >>> list(always_iterable((1, 2, 3)))
        [1, 2, 3]

    A non-iterable becomes a one-item iterator::

        >>> list(always_iterable(1))
        [1]

    ``None`` becomes an empty iterator::

        >>> list(always_iterable(None))
        []

    Instances of *base_type* (text and binary strings by default) are
    treated as single items rather than iterated::

        >>> list(always_iterable('foo'))
        ['foo']
        >>> obj = {'a': 1}
        >>> list(always_iterable(obj))  # Iterate over the dict's keys
        ['a']
        >>> list(always_iterable(obj, base_type=dict))  # Treat dicts as a unit
        [{'a': 1}]

    Pass ``base_type=None`` to switch that special handling off::

        >>> list(always_iterable('foo', base_type=None))
        ['f', 'o', 'o']
    """
    if obj is None:
        return iter(())

    treat_as_unit = base_type is not None and isinstance(obj, base_type)
    if not treat_as_unit:
        try:
            return iter(obj)
        except TypeError:
            # Not iterable after all; wrap it below.
            pass

    return iter((obj,))
+ + >>> iterable = ['a1', 'b1', 'c1', 'a2', 'b2', 'c2', 'b3'] + >>> s = bucket(iterable, key=lambda x: x[0]) # Bucket by 1st character + >>> sorted(list(s)) # Get the keys + ['a', 'b', 'c'] + >>> a_iterable = s['a'] + >>> next(a_iterable) + 'a1' + >>> next(a_iterable) + 'a2' + >>> list(s['b']) + ['b1', 'b2', 'b3'] + + The original iterable will be advanced and its items will be cached until + they are used by the child iterables. This may require significant storage. + + By default, attempting to select a bucket to which no items belong will + exhaust the iterable and cache all values. + If you specify a *validator* function, selected buckets will instead be + checked against it. + + >>> from itertools import count + >>> it = count(1, 2) # Infinite sequence of odd numbers + >>> key = lambda x: x % 10 # Bucket by last digit + >>> validator = lambda x: x in {1, 3, 5, 7, 9} # Odd digits only + >>> s = bucket(it, key=key, validator=validator) + >>> 2 in s + False + >>> list(s[2]) + [] + + """ + + def __init__(self, iterable, key, validator=None): + self._it = iter(iterable) + self._key = key + self._cache = defaultdict(deque) + self._validator = validator or (lambda x: True) + + def __contains__(self, value): + if not self._validator(value): + return False + + try: + item = next(self[value]) + except StopIteration: + return False + else: + self._cache[value].appendleft(item) + + return True + + def _get_values(self, value): + """ + Helper to yield items from the parent iterator that match *value*. + Items that don't match are stored in the local cache as they + are encountered. + """ + while True: + # If we've cached some items that match the target value, emit + # the first one and evict it from the cache. + if self._cache[value]: + yield self._cache[value].popleft() + # Otherwise we need to advance the parent iterator to search for + # a matching item, caching the rest. 
class PackageMetadata(Protocol):
    """
    Structural type for a distribution's metadata: sized, iterable over
    string keys, indexable by key, with ``get``/``get_all`` accessors
    and a ``json`` property.
    """

    # Basic container operations over the metadata keys.
    def __len__(self) -> int: ...  # pragma: no cover

    def __contains__(self, item: str) -> bool: ...  # pragma: no cover

    def __getitem__(self, key: str) -> str: ...  # pragma: no cover

    def __iter__(self) -> Iterator[str]: ...  # pragma: no cover

    # ``get`` without a fallback may return None; with a fallback of type
    # _T it may return that fallback instead of a string.
    @overload
    def get(
        self, name: str, failobj: None = None
    ) -> str | None: ...  # pragma: no cover

    @overload
    def get(self, name: str, failobj: _T) -> str | _T: ...  # pragma: no cover

    # overload per python/importlib_metadata#435
    @overload
    def get_all(
        self, name: str, failobj: None = None
    ) -> list[Any] | None: ...  # pragma: no cover

    @overload
    def get_all(self, name: str, failobj: _T) -> list[Any] | _T:
        """
        Return all values associated with a possibly multi-valued key.
        """

    @property
    def json(self) -> dict[str, str | list[str]]:
        """
        A JSON-compatible form of the metadata.
        """
class FoldedCase(str):
    """
    A case insensitive string class; behaves just like str
    except compares equal when the only variation is case.

    >>> s = FoldedCase('hello world')

    >>> s == 'Hello World'
    True

    >>> 'Hello World' == s
    True

    >>> s != 'Hello World'
    False

    >>> s.index('O')
    4

    >>> s.split('O')
    ['hell', ' w', 'rld']

    >>> sorted(map(FoldedCase, ['GAMMA', 'alpha', 'Beta']))
    ['alpha', 'Beta', 'GAMMA']

    Sequence membership is straightforward.

    >>> "Hello World" in [s]
    True
    >>> s in ["Hello World"]
    True

    You may test for set inclusion, but candidate and elements
    must both be folded.

    >>> FoldedCase("Hello World") in {s}
    True
    >>> s in {FoldedCase("Hello World")}
    True

    String inclusion works as long as the FoldedCase object
    is on the right.

    >>> "hello" in FoldedCase("Hello World")
    True

    But not if the FoldedCase object is on the left:

    >>> FoldedCase('hello') in 'Hello World'
    False

    In that case, use in_:

    >>> FoldedCase('hello').in_('Hello World')
    True

    >>> FoldedCase('hello') > FoldedCase('Hello')
    False

    Equality against an object that cannot be lower-cased is simply
    false rather than an error.

    >>> FoldedCase('hello') == None
    False
    >>> FoldedCase('hello') != 1
    True
    >>> FoldedCase('hello') in [None, 'HELLO']
    True
    """

    def __lt__(self, other):
        """Order by the lower-cased forms of both operands."""
        return self.lower() < other.lower()

    def __gt__(self, other):
        """Order by the lower-cased forms of both operands."""
        return self.lower() > other.lower()

    def __eq__(self, other):
        """Compare the lower-cased forms of both operands."""
        # An operand without ``lower()`` (None, an int, ...) cannot be
        # folded.  Returning NotImplemented lets Python fall back to its
        # default comparison instead of raising AttributeError.
        if not hasattr(other, 'lower'):
            return NotImplemented
        return self.lower() == other.lower()

    def __ne__(self, other):
        """Inverse of ``__eq__``, with the same handling of foreign operands."""
        if not hasattr(other, 'lower'):
            return NotImplemented
        return self.lower() != other.lower()

    def __hash__(self):
        # Hash the folded form so that values equal under ``__eq__``
        # also hash alike.
        return hash(self.lower())

    def __contains__(self, other):
        """Case-insensitive substring test (``other in self``)."""
        return super().lower().__contains__(other.lower())

    def in_(self, other):
        "Does self appear in other?"
        return self in FoldedCase(other)

    # cache lower since it's likely to be called frequently.
    @method_cache
    def lower(self):
        return super().lower()

    def index(self, sub):
        """Return the position of ``sub`` in self, ignoring case."""
        return self.lower().index(sub.lower())

    def split(self, splitter=' ', maxsplit=0):
        """
        Split on ``splitter`` regardless of its case.

        The splitter is matched literally (it is regex-escaped), and
        ``maxsplit`` is passed straight to ``re.split``.
        """
        pattern = re.compile(re.escape(splitter), re.I)
        return pattern.split(self, maxsplit)
def inspect(path):
    """
    Print the distributions discoverable under the single search ``path``.

    A line announcing the path is always printed.  When at least one
    distribution is found, a second line gives the count and the
    comma-separated distribution names.
    """
    print("Inspecting", path)
    found = list(Distribution.discover(path=[path]))
    if found:
        print("Found", len(found), "packages:", end=' ')
        names = (dist.name for dist in found)
        print(', '.join(names))
+ +COMMAND LINE OPTIONS +-------------------- + +-c CONSOLE, --console=CONSOLE + + Use the specified shell (Python or IPython) shell as the console + backend instead of the default one (IPython if present, Python + otherwise), e.g.: + + $isympy -c python + + CONSOLE must be one of 'ipython' or 'python' + +-p PRETTY, --pretty PRETTY + + Setup pretty-printing in SymPy. When pretty-printing is enabled, + expressions can be printed with Unicode or ASCII. The default is + to use pretty-printing (with Unicode if the terminal supports it). + When this option is 'no', expressions will not be pretty-printed + and ASCII will be used: + + $isympy -p no + + PRETTY must be one of 'unicode', 'ascii', or 'no' + +-t TYPES, --types=TYPES + + Setup the ground types for the polys. By default, gmpy ground types + are used if gmpy2 or gmpy is installed, otherwise it falls back to python + ground types, which are a little bit slower. You can manually + choose python ground types even if gmpy is installed (e.g., for + testing purposes): + + $isympy -t python + + TYPES must be one of 'gmpy', 'gmpy1' or 'python' + + Note that the ground type gmpy1 is primarily intended for testing; it + forces the use of gmpy version 1 even if gmpy2 is available. + + This is the same as setting the environment variable + SYMPY_GROUND_TYPES to the given ground type (e.g., + SYMPY_GROUND_TYPES='gmpy') + + The ground types can be determined interactively from the variable + sympy.polys.domains.GROUND_TYPES. + +-o ORDER, --order ORDER + + Setup the ordering of terms for printing. The default is lex, which + orders terms lexicographically (e.g., x**2 + x + 1). You can choose + other orderings, such as rev-lex, which will use reverse + lexicographic ordering (e.g., 1 + x + x**2): + + $isympy -o rev-lex + + ORDER must be one of 'lex', 'rev-lex', 'grlex', 'rev-grlex', + 'grevlex', 'rev-grevlex', 'old', or 'none'. 
+ + Note that for very large expressions, ORDER='none' may speed up + printing considerably but the terms will have no canonical order. + +-q, --quiet + + Print only Python's and SymPy's versions to stdout at startup. + +-d, --doctest + + Use the same format that should be used for doctests. This is + equivalent to -c python -p no. + +-C, --no-cache + + Disable the caching mechanism. Disabling the cache may slow certain + operations down considerably. This is useful for testing the cache, + or for benchmarking, as the cache can result in deceptive timings. + + This is equivalent to setting the environment variable + SYMPY_USE_CACHE to 'no'. + +-a, --auto-symbols (requires at least IPython 0.11) + + Automatically create missing symbols. Normally, typing a name of a + Symbol that has not been instantiated first would raise NameError, + but with this option enabled, any undefined name will be + automatically created as a Symbol. + + Note that this is intended only for interactive, calculator style + usage. In a script that uses SymPy, Symbols should be instantiated + at the top, so that it's clear what they are. + + This will not override any names that are already defined, which + includes the single character letters represented by the mnemonic + QCOSINE (see the "Gotchas and Pitfalls" document in the + documentation). You can delete existing names by executing "del + name". If a name is defined, typing "'name' in dir()" will return True. + + The Symbols that are created using this have default assumptions. + If you want to place assumptions on symbols, you should create them + using symbols() or var(). + + Finally, this only works in the top level namespace. So, for + example, if you define a function in isympy with an undefined + Symbol, it will not work. + + See also the -i and -I options. + +-i, --int-to-Integer (requires at least IPython 0.11) + + Automatically wrap int literals with Integer. 
This makes it so that + things like 1/2 will come out as Rational(1, 2), rather than 0.5. This + works by preprocessing the source and wrapping all int literals with + Integer. Note that this will not change the behavior of int literals + assigned to variables, and it also won't change the behavior of functions + that return int literals. + + If you want an int, you can wrap the literal in int(), e.g. int(3)/int(2) + gives 1.5 (with division imported from __future__). + +-I, --interactive (requires at least IPython 0.11) + + This is equivalent to --auto-symbols --int-to-Integer. Future options + designed for ease of interactive use may be added to this. + +-D, --debug + + Enable debugging output. This is the same as setting the + environment variable SYMPY_DEBUG to 'True'. The debug status is set + in the variable SYMPY_DEBUG within isympy. + +-- IPython options + + Additionally you can pass command line options directly to the IPython + interpreter (the standard Python shell is not supported). However you + need to add the '--' separator between two types of options, e.g the + startup banner option and the colors option. You need to enter the + options as required by the version of IPython that you are using, too: + + in IPython 0.11, + + $isympy -q -- --colors=NoColor + + or older versions of IPython, + + $isympy -q -- -colors NoColor + +See also isympy --help. +""" + +import os +import sys + +# DO NOT IMPORT SYMPY HERE! Or the setting of the sympy environment variables +# by the command line will break. + +def main() -> None: + from argparse import ArgumentParser, RawDescriptionHelpFormatter + + VERSION = None + if '--version' in sys.argv: + # We cannot import sympy before this is run, because flags like -C and + # -t set environment variables that must be set before SymPy is + # imported. The only thing we need to import it for is to get the + # version, which only matters with the --version flag. 
+ import sympy + VERSION = sympy.__version__ + + usage = 'isympy [options] -- [ipython options]' + parser = ArgumentParser( + usage=usage, + description=__doc__, + formatter_class=RawDescriptionHelpFormatter, + ) + + parser.add_argument('--version', action='version', version=VERSION) + + parser.add_argument( + '-c', '--console', + dest='console', + action='store', + default=None, + choices=['ipython', 'python'], + metavar='CONSOLE', + help='select type of interactive session: ipython | python; defaults ' + 'to ipython if IPython is installed, otherwise python') + + parser.add_argument( + '-p', '--pretty', + dest='pretty', + action='store', + default=None, + metavar='PRETTY', + choices=['unicode', 'ascii', 'no'], + help='setup pretty printing: unicode | ascii | no; defaults to ' + 'unicode printing if the terminal supports it, otherwise ascii') + + parser.add_argument( + '-t', '--types', + dest='types', + action='store', + default=None, + metavar='TYPES', + choices=['gmpy', 'gmpy1', 'python'], + help='setup ground types: gmpy | gmpy1 | python; defaults to gmpy if gmpy2 ' + 'or gmpy is installed, otherwise python') + + parser.add_argument( + '-o', '--order', + dest='order', + action='store', + default=None, + metavar='ORDER', + choices=['lex', 'grlex', 'grevlex', 'rev-lex', 'rev-grlex', 'rev-grevlex', 'old', 'none'], + help='setup ordering of terms: [rev-]lex | [rev-]grlex | [rev-]grevlex | old | none; defaults to lex') + + parser.add_argument( + '-q', '--quiet', + dest='quiet', + action='store_true', + default=False, + help='print only version information at startup') + + parser.add_argument( + '-d', '--doctest', + dest='doctest', + action='store_true', + default=False, + help='use the doctest format for output (you can just copy and paste it)') + + parser.add_argument( + '-C', '--no-cache', + dest='cache', + action='store_false', + default=True, + help='disable caching mechanism') + + parser.add_argument( + '-a', '--auto-symbols', + dest='auto_symbols', + 
action='store_true', + default=False, + help='automatically construct missing symbols') + + parser.add_argument( + '-i', '--int-to-Integer', + dest='auto_int_to_Integer', + action='store_true', + default=False, + help="automatically wrap int literals with Integer") + + parser.add_argument( + '-I', '--interactive', + dest='interactive', + action='store_true', + default=False, + help="equivalent to -a -i") + + parser.add_argument( + '-D', '--debug', + dest='debug', + action='store_true', + default=False, + help='enable debugging output') + + (options, ipy_args) = parser.parse_known_args() + if '--' in ipy_args: + ipy_args.remove('--') + + if not options.cache: + os.environ['SYMPY_USE_CACHE'] = 'no' + + if options.types: + os.environ['SYMPY_GROUND_TYPES'] = options.types + + if options.debug: + os.environ['SYMPY_DEBUG'] = str(options.debug) + + if options.doctest: + options.pretty = 'no' + options.console = 'python' + + session = options.console + + if session is not None: + ipython = session == 'ipython' + else: + try: + import IPython # noqa: F401 + ipython = True + except ImportError: + if not options.quiet: + from sympy.interactive.session import no_ipython + print(no_ipython) + ipython = False + + args = { + 'pretty_print': True, + 'use_unicode': None, + 'use_latex': None, + 'order': None, + 'argv': ipy_args, + } + + if options.pretty == 'unicode': + args['use_unicode'] = True + elif options.pretty == 'ascii': + args['use_unicode'] = False + elif options.pretty == 'no': + args['pretty_print'] = False + + if options.order is not None: + args['order'] = options.order + + args['quiet'] = options.quiet + args['auto_symbols'] = options.auto_symbols or options.interactive + args['auto_int_to_Integer'] = options.auto_int_to_Integer or options.interactive + + from sympy.interactive import init_session + init_session(ipython, **args) + +if __name__ == "__main__": + main() diff --git a/nest_asyncio.py b/nest_asyncio.py new file mode 100644 index 
0000000000000000000000000000000000000000..1cb5c253fa0658a0adea3516f6463904396cf573 --- /dev/null +++ b/nest_asyncio.py @@ -0,0 +1,219 @@ +"""Patch asyncio to allow nested event loops.""" + +import asyncio +import asyncio.events as events +import os +import sys +import threading +from contextlib import contextmanager, suppress +from heapq import heappop + + +def apply(loop=None): + """Patch asyncio to make its event loop reentrant.""" + _patch_asyncio() + _patch_policy() + _patch_tornado() + + loop = loop or asyncio.get_event_loop() + _patch_loop(loop) + + +def _patch_asyncio(): + """Patch asyncio module to use pure Python tasks and futures.""" + + def run(main, *, debug=False): + loop = asyncio.get_event_loop() + loop.set_debug(debug) + task = asyncio.ensure_future(main) + try: + return loop.run_until_complete(task) + finally: + if not task.done(): + task.cancel() + with suppress(asyncio.CancelledError): + loop.run_until_complete(task) + + def _get_event_loop(stacklevel=3): + loop = events._get_running_loop() + if loop is None: + loop = events.get_event_loop_policy().get_event_loop() + return loop + + # Use module level _current_tasks, all_tasks and patch run method. 
+ if hasattr(asyncio, '_nest_patched'): + return + if sys.version_info >= (3, 6, 0): + asyncio.Task = asyncio.tasks._CTask = asyncio.tasks.Task = \ + asyncio.tasks._PyTask + asyncio.Future = asyncio.futures._CFuture = asyncio.futures.Future = \ + asyncio.futures._PyFuture + if sys.version_info < (3, 7, 0): + asyncio.tasks._current_tasks = asyncio.tasks.Task._current_tasks + asyncio.all_tasks = asyncio.tasks.Task.all_tasks + if sys.version_info >= (3, 9, 0): + events._get_event_loop = events.get_event_loop = \ + asyncio.get_event_loop = _get_event_loop + asyncio.run = run + asyncio._nest_patched = True + + +def _patch_policy(): + """Patch the policy to always return a patched loop.""" + + def get_event_loop(self): + if self._local._loop is None: + loop = self.new_event_loop() + _patch_loop(loop) + self.set_event_loop(loop) + return self._local._loop + + policy = events.get_event_loop_policy() + policy.__class__.get_event_loop = get_event_loop + + +def _patch_loop(loop): + """Patch loop to make it reentrant.""" + + def run_forever(self): + with manage_run(self), manage_asyncgens(self): + while True: + self._run_once() + if self._stopping: + break + self._stopping = False + + def run_until_complete(self, future): + with manage_run(self): + f = asyncio.ensure_future(future, loop=self) + if f is not future: + f._log_destroy_pending = False + while not f.done(): + self._run_once() + if self._stopping: + break + if not f.done(): + raise RuntimeError( + 'Event loop stopped before Future completed.') + return f.result() + + def _run_once(self): + """ + Simplified re-implementation of asyncio's _run_once that + runs handles as they become ready. 
+ """ + ready = self._ready + scheduled = self._scheduled + while scheduled and scheduled[0]._cancelled: + heappop(scheduled) + + timeout = ( + 0 if ready or self._stopping + else min(max( + scheduled[0]._when - self.time(), 0), 86400) if scheduled + else None) + event_list = self._selector.select(timeout) + self._process_events(event_list) + + end_time = self.time() + self._clock_resolution + while scheduled and scheduled[0]._when < end_time: + handle = heappop(scheduled) + ready.append(handle) + + for _ in range(len(ready)): + if not ready: + break + handle = ready.popleft() + if not handle._cancelled: + # preempt the current task so that that checks in + # Task.__step do not raise + curr_task = curr_tasks.pop(self, None) + + try: + handle._run() + finally: + # restore the current task + if curr_task is not None: + curr_tasks[self] = curr_task + + handle = None + + @contextmanager + def manage_run(self): + """Set up the loop for running.""" + self._check_closed() + old_thread_id = self._thread_id + old_running_loop = events._get_running_loop() + try: + self._thread_id = threading.get_ident() + events._set_running_loop(self) + self._num_runs_pending += 1 + if self._is_proactorloop: + if self._self_reading_future is None: + self.call_soon(self._loop_self_reading) + yield + finally: + self._thread_id = old_thread_id + events._set_running_loop(old_running_loop) + self._num_runs_pending -= 1 + if self._is_proactorloop: + if (self._num_runs_pending == 0 + and self._self_reading_future is not None): + ov = self._self_reading_future._ov + self._self_reading_future.cancel() + if ov is not None: + self._proactor._unregister(ov) + self._self_reading_future = None + + @contextmanager + def manage_asyncgens(self): + if not hasattr(sys, 'get_asyncgen_hooks'): + # Python version is too old. 
+ return + old_agen_hooks = sys.get_asyncgen_hooks() + try: + self._set_coroutine_origin_tracking(self._debug) + if self._asyncgens is not None: + sys.set_asyncgen_hooks( + firstiter=self._asyncgen_firstiter_hook, + finalizer=self._asyncgen_finalizer_hook) + yield + finally: + self._set_coroutine_origin_tracking(False) + if self._asyncgens is not None: + sys.set_asyncgen_hooks(*old_agen_hooks) + + def _check_running(self): + """Do not throw exception if loop is already running.""" + pass + + if hasattr(loop, '_nest_patched'): + return + if not isinstance(loop, asyncio.BaseEventLoop): + raise ValueError('Can\'t patch loop of type %s' % type(loop)) + cls = loop.__class__ + cls.run_forever = run_forever + cls.run_until_complete = run_until_complete + cls._run_once = _run_once + cls._check_running = _check_running + cls._check_runnung = _check_running # typo in Python 3.7 source + cls._num_runs_pending = 1 if loop.is_running() else 0 + cls._is_proactorloop = ( + os.name == 'nt' and issubclass(cls, asyncio.ProactorEventLoop)) + if sys.version_info < (3, 7, 0): + cls._set_coroutine_origin_tracking = cls._set_coroutine_wrapper + curr_tasks = asyncio.tasks._current_tasks \ + if sys.version_info >= (3, 7, 0) else asyncio.Task._current_tasks + cls._nest_patched = True + + +def _patch_tornado(): + """ + If tornado is imported before nest_asyncio, make tornado aware of + the pure-Python asyncio Future. 
+ """ + if 'tornado' in sys.modules: + import tornado.concurrent as tc # type: ignore + tc.Future = asyncio.Future + if asyncio.Future not in tc.FUTURES: + tc.FUTURES += (asyncio.Future,) diff --git a/numpy-1.26.4.dist-info/RECORD b/numpy-1.26.4.dist-info/RECORD new file mode 100644 index 0000000000000000000000000000000000000000..88e4df29732e352b377b3a6a3db8521b2285923b --- /dev/null +++ b/numpy-1.26.4.dist-info/RECORD @@ -0,0 +1,792 @@ +bin/f2py,sha256=jquHsW_0DGPQfSffCB-4IZSSuujd1MFPwOxy32lz0hI,331 +numpy-1.26.4.dist-info/INSTALLER,sha256=5hhM4Q4mYTT9z6QB6PGpUAW81PGNFrYrdXMj4oM_6ak,2 +numpy-1.26.4.dist-info/LICENSE.txt,sha256=EQewyDHpGNTx28KKMxkMdyFe8njUpMQAlXIIh3DUM0o,47721 +numpy-1.26.4.dist-info/METADATA,sha256=sJc0p_7UToS0yBYZNM5TLf8ed57Ggi1BVkTRF_Y4EHA,61041 +numpy-1.26.4.dist-info/RECORD,, +numpy-1.26.4.dist-info/REQUESTED,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 +numpy-1.26.4.dist-info/WHEEL,sha256=3qIDcXCk577AXiK3pDifO-gE9U_MYWYGgtD78gLa2_U,137 +numpy-1.26.4.dist-info/entry_points.txt,sha256=zddyYJuUw9Uud7LeLfynXk62_ry0lGihDwCIgugBdZM,144 +numpy.libs/libgfortran-040039e1.so.5.0.0,sha256=FK-zEpsai1C8QKOwggx_EVLqm8EBIaqxUpQ_cFdHKIY,2686065 +numpy.libs/libopenblas64_p-r0-0cf96a72.3.23.dev.so,sha256=klTQhU3XYV4R3ijXca5AiHjKgSOnrCBPIeTMejdswuU,35123345 +numpy.libs/libquadmath-96973f99.so.0.0.0,sha256=k0wi3tDn0WnE1GeIdslgUa3z2UVF2pYvYLQWWbB12js,247609 +numpy/__config__.py,sha256=z0NFqd9D20ShQlKyPTlbfAPWIJFDEJ7aVp3TQ5_vTxU,4902 +numpy/__init__.cython-30.pxd,sha256=yk2a3etxRNlBgj5uLfIho2RYDYDzhRW8oagAG-wzbPI,36690 +numpy/__init__.pxd,sha256=Pa0VYRSeQRSFepQ6ROgZrNtGY5TzBXIddWsMHtK0OkM,35066 +numpy/__init__.py,sha256=Is0VNfoU10729FfMoUn_3ICHX0YL4xO4-JUnP3i8QC4,17005 +numpy/__init__.pyi,sha256=9kK465XL9oS_X3fJLv0Na29NEYnWvtdMhXPtrnF_cG8,154080 +numpy/_core/__init__.py,sha256=C8_7wbHqUkB35JouY_XKsas1KLpRZ7JHWuZ7VGOPVpU,136 +numpy/_core/__init__.pyi,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 
+numpy/_core/_dtype.py,sha256=vE16-yiwUSYsAIbq7FlEY1GbXZAp8wjADDxJg3eBX-U,126 +numpy/_core/_dtype_ctypes.py,sha256=i5EhoWPUhu4kla3Xu4ZvXF1lVLPiI6Zg4h6o8jaiamo,147 +numpy/_core/_internal.py,sha256=g5ugmqDgUhSlie5-onOctcm4p0gcMHSIRLHVYtFTk1M,135 +numpy/_core/_multiarray_umath.py,sha256=VPtoT2uHnyU3rKL0G27CgmNmB1WRHM0mtc7Y9L85C3U,159 +numpy/_core/multiarray.py,sha256=kZxC_7P3Jwz1RApzQU2QGmqSq4MAEvKmaJEYnAsbSOs,138 +numpy/_core/umath.py,sha256=YcV0cdbGcem6D5P3yX7cR9HGYBrT8VMoAgCBzGwPhgg,123 +numpy/_distributor_init.py,sha256=IKy2THwmu5UgBjtVbwbD9H-Ap8uaUJoPJ2btQ4Jatdo,407 +numpy/_globals.py,sha256=neEdcfLZoHLwber_1Xyrn26LcXy0MrSta03Ze7aKa6g,3094 +numpy/_pyinstaller/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 +numpy/_pyinstaller/hook-numpy.py,sha256=PUQ-mNWje6bFALB-mLVFRPkvbM4JpLXunB6sjBbTy5g,1409 +numpy/_pyinstaller/pyinstaller-smoke.py,sha256=6iL-eHMQaG3rxnS5EgcvrCqElm9aKL07Cjr1FZJSXls,1143 +numpy/_pyinstaller/test_pyinstaller.py,sha256=8K-7QxmfoXCG0NwR0bhIgCNrDjGlrTzWnrR1sR8btgU,1135 +numpy/_pytesttester.py,sha256=lQUTvKVz6kT8b4yiMV-uW-vG9KSv9UzqAmxaEMezTd8,6731 +numpy/_pytesttester.pyi,sha256=OtyXSiuSy8o_78w3QNQRjMLpvvNyEdC0aMsx6T-vRxU,489 +numpy/_typing/__init__.py,sha256=6w9E9V9VaT7vTM-veua8XcySv50Je5qSPJzK9HTocIg,7003 +numpy/_typing/_add_docstring.py,sha256=xQhQX372aN_m3XN95CneMxOST2FdPcovR-MXM-9ep58,3922 +numpy/_typing/_array_like.py,sha256=L4gnx2KWG8yYcouz5b9boJIkkFNtOJV6QjcnGCrbnRY,4298 +numpy/_typing/_callable.pyi,sha256=Mf57BwohRn9ye6ixJqjNEnK0gKqnVPE9Gy8vK-6_zxo,11121 +numpy/_typing/_char_codes.py,sha256=LR51O5AUBDbCmJvlMoxyUvsfvb1p7WHrexgtTGtuWTc,5916 +numpy/_typing/_dtype_like.py,sha256=21Uxy0UgIawGM82xjDF_ifMq-nP-Bkhn_LpiK_HvWC4,5661 +numpy/_typing/_extended_precision.py,sha256=dGios-1k-QBGew7YFzONZTzVWxz-aYAaqlccl2_h5Bo,777 +numpy/_typing/_nbit.py,sha256=-EQOShHpB3r30b4RVEcruQRTcTaFAZwtqCJ4BsvpEzA,345 +numpy/_typing/_nested_sequence.py,sha256=5eNaVZAV9tZQLFWHYOuVs336JjoiaWxyZQ7cMKb6m1I,2566 
+numpy/_typing/_scalars.py,sha256=eVP8PjlcTIlY7v0fRI3tFXPogWtpLJZ8nFvRRrLjDqs,980 +numpy/_typing/_shape.py,sha256=JPy7jJMkISGFTnkgiEifYM-4xTcjb7JMRkLIIjZLw08,211 +numpy/_typing/_ufunc.pyi,sha256=e74LtOP9e8kkRhvrIJ_RXz9Ua_L43Pd9IixwNwermnM,12638 +numpy/_typing/setup.py,sha256=SE0Q6HPqDjWUfceA4yXgkII8y3z7EiSF0Z-MNwOIyG4,337 +numpy/_utils/__init__.py,sha256=Hhetwsi3eTBe8HdWbG51zXmcrX1DiPLxkYSrslMLYcc,723 +numpy/_utils/_convertions.py,sha256=0xMxdeLOziDmHsRM_8luEh4S-kQdMoMg6GxNDDas69k,329 +numpy/_utils/_inspect.py,sha256=8Ma7QBRwfSWKeK1ShJpFNc7CDhE6fkIE_wr1FxrG1A8,7447 +numpy/_utils/_pep440.py,sha256=Vr7B3QsijR5p6h8YAz2LjNGUyzHUJ5gZ4v26NpZAKDc,14069 +numpy/array_api/__init__.py,sha256=XtttWbDf6Yh0_m4zp-L_us4HKnV3oGwdlB6n-01Q9M8,10375 +numpy/array_api/_array_object.py,sha256=rfCBzE6vUjk4HElQGTVwe6Tw2vxiUx7tmBpQEmm1iBk,43794 +numpy/array_api/_constants.py,sha256=AYayN2jf1Dp5rXZ7WPBdUhtPBo_JMCi-pD9oW5zmFkI,87 +numpy/array_api/_creation_functions.py,sha256=6SqHdzZqHOJFEyWFtqnj6KIKRivrGXxROlgnez_3Mt0,10050 +numpy/array_api/_data_type_functions.py,sha256=P57FOsNdXahNUriVtdldonbvBQrrZkVzxZbcqkR_8AA,6288 +numpy/array_api/_dtypes.py,sha256=kDU1NLvEQN-W2HPmJ2wGPx8jiNkFbrvTCD1T1RT8Pwo,4823 +numpy/array_api/_elementwise_functions.py,sha256=0kGuDX3Ur_Qp6tBMBWTO7LPUxzXNGAlA2SSJhdAp4DU,25992 +numpy/array_api/_indexing_functions.py,sha256=d-gzqzyvR45FQerRYJrbBzCWFnDsZWSI9pggA5QWRO4,715 +numpy/array_api/_manipulation_functions.py,sha256=qCoW5B5FXcFOWKPU9D9MXHdMeXIuzvnHUUvprNlwfjc,3317 +numpy/array_api/_searching_functions.py,sha256=mGZiqheYXGWiDK9rqXFiDKX0_B0mJ1OjdA-9FC2o5lA,1715 +numpy/array_api/_set_functions.py,sha256=ULpfK1zznW9joX1DXSiP0R3ahcDB_po7mZlpsRqi7Fs,2948 +numpy/array_api/_sorting_functions.py,sha256=7pszlxNN7-DNqEZlonGLFQrlXPP7evVA8jN31NShg00,2031 +numpy/array_api/_statistical_functions.py,sha256=HspfYteZWSa3InMs10KZz-sk3ZuW6teX6fNdo829T84,3584 +numpy/array_api/_typing.py,sha256=uKidRp6nYxgHnEPaqXXZsDDZ6tw1LshpbwLvy-09eeM,1347 
+numpy/array_api/_utility_functions.py,sha256=HwycylbPAgRVz4nZvjvwqN3mQnJbqKA-NRMaAvIP-CE,824 +numpy/array_api/linalg.py,sha256=QPpG2tG1pZgzjrtTjjOu2GDu3cI6UpSsLrsG_o1jXYk,18411 +numpy/array_api/setup.py,sha256=Wx6qD7GU_APiqKolYPO0OHv4eHGYrjPZmDAgjWhOEhM,341 +numpy/array_api/tests/__init__.py,sha256=t_2GZ3lKcsu4ec4GMKPUDYaeMUJyDquBlQAcPgj7kFE,282 +numpy/array_api/tests/test_array_object.py,sha256=FQoAxP4CLDiv6iih8KKUDSLuYM6dtnDcB1f0pMHw4-M,17035 +numpy/array_api/tests/test_creation_functions.py,sha256=s3A1COWmXIAJdhzd8v7VtL-jbiSspskTqwYy0BTpmpw,5023 +numpy/array_api/tests/test_data_type_functions.py,sha256=qc8ktRlVXWC3PKhxPVWI_UF9f1zZtpmzHjdCtf3e16E,1018 +numpy/array_api/tests/test_elementwise_functions.py,sha256=CTj4LLwtusI51HkpzD0JPohP1ffNxogAVFz8WLuWFzM,3800 +numpy/array_api/tests/test_indexing_functions.py,sha256=AbuBGyEufEAf24b7fy8JQhdJtGPdP9XEIxPTJAfAFFo,627 +numpy/array_api/tests/test_manipulation_functions.py,sha256=wce25dSJjubrGhFxmiatzR_IpmNYp9ICJ9PZBBnZTOQ,1087 +numpy/array_api/tests/test_set_functions.py,sha256=D016G7v3ko49bND5sVERP8IqQXZiwr-2yrKbBPJ-oqg,546 +numpy/array_api/tests/test_sorting_functions.py,sha256=INPiYnuGBcsmWtYqdTTX3ENHmM4iUx4zs9KdwDaSmdA,602 +numpy/array_api/tests/test_validation.py,sha256=QUG9yWC3QhkPxNhbQeakwBbl-0Rr0iTuZ41_0sfVIGU,676 +numpy/compat/__init__.py,sha256=iAHrmsZWzouOMSyD9bdSE0APWMlRpqW92MQgF8y6x3E,448 +numpy/compat/py3k.py,sha256=Je74CVk_7qI_qX7pLbYcuQJsxlMq1poGIfRIrH99kZQ,3833 +numpy/compat/setup.py,sha256=36X1kF0C_NVROXfJ7w3SQeBm5AIDBuJbM5qT7cvSDgU,335 +numpy/compat/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 +numpy/compat/tests/test_compat.py,sha256=YqV67pSN8nXPbXaEdjhmyaoVetNyFupVv57OMEgCwKA,579 +numpy/conftest.py,sha256=HZyWo_wJyrbgnyXxI8t05WOg_IrzNAMnEV7O8koHous,4623 +numpy/core/__init__.py,sha256=CNsO-Ab4ywM2Wz3AbqWOH3ig1q5Bno9PsUMrCv-HNS4,5780 +numpy/core/__init__.pyi,sha256=xtd9OFYza-ZG3jyEJrlzRPT-SkVoB_qYmVCe6FxRks0,126 
+numpy/core/_add_newdocs.py,sha256=39JFaeDPN2OQlSwfpY6_Jq9fO5vML8ZMF8J4ZTx_nrs,208972 +numpy/core/_add_newdocs_scalars.py,sha256=PF9v8POcSNH6ELYltkx9e07DWgMmft6NJy9zER3Jk44,12106 +numpy/core/_asarray.py,sha256=P2ddlZAsg1iGleRRfoQv_aKs2N7AGwpo5K4ZQv4Ujlk,3884 +numpy/core/_asarray.pyi,sha256=gNNxUVhToNU_F1QpgeEvUYddpUFN-AKP0QWa4gqcTGw,1086 +numpy/core/_dtype.py,sha256=SihUz41pHRB3Q2LiYYkug6LgMBKh6VV89MOpLxnXQdo,10606 +numpy/core/_dtype_ctypes.py,sha256=Vug4i7xKhznK2tdIjmn4ebclClpaCJwSZUlvEoYl0Eg,3673 +numpy/core/_exceptions.py,sha256=dZWKqfdLRvJvbAEG_fof_8ikEKxjakADMty1kLC_l_M,5379 +numpy/core/_internal.py,sha256=f9kNDuT-FGxF1EtVOVIxXWnH9gM9n-J5V2zwHMv4HEk,28348 +numpy/core/_internal.pyi,sha256=_mCTOX6Su8D4R9fV4HNeohPJx7515B-WOlv4uq6mry8,1032 +numpy/core/_machar.py,sha256=G3a3TXu8VDW_1EMxKKLnGMbvUShEIUEve3ealBlJJ3E,11565 +numpy/core/_methods.py,sha256=m31p0WjcFUGckbJiHnCpSaIQGqv-Lq5niIYkdd33YMo,8613 +numpy/core/_multiarray_tests.cpython-312-x86_64-linux-gnu.so,sha256=Cyy7dBn_wvcSmHqrr1GKOx2d6EBgk_edyx1xKjSrYFc,175912 +numpy/core/_multiarray_umath.cpython-312-x86_64-linux-gnu.so,sha256=amUIEKhzXL25iPdHKZc3QKM3ZF3RWF_vaW5z4tvGW-s,7463681 +numpy/core/_operand_flag_tests.cpython-312-x86_64-linux-gnu.so,sha256=VPbGfwOkzwWoNNVSh3jahuBTI8LrKbN_dCaMcOtDfQE,16856 +numpy/core/_rational_tests.cpython-312-x86_64-linux-gnu.so,sha256=0JmPpR0Ej5eZ4vrHN_6fvrKVCeUVuQam83AxViSkN2k,59776 +numpy/core/_simd.cpython-312-x86_64-linux-gnu.so,sha256=lAK8a8uKjaYoFqMQZBWnVvjeUm-KDsnZzyH_RThl9do,3535232 +numpy/core/_string_helpers.py,sha256=-fQM8z5s8_yX440PmgNEH3SUjEoXMPpPSysZwWZNbuo,2852 +numpy/core/_struct_ufunc_tests.cpython-312-x86_64-linux-gnu.so,sha256=PB6RqEbim2Ezi96GVTzyqi9IuqNcVGCKcgPxwHBVCAM,16960 +numpy/core/_type_aliases.py,sha256=qV6AZlsUWHMWTydmZya73xuBkKXiUKq_WXLj7q2CbZ0,7534 +numpy/core/_type_aliases.pyi,sha256=lguMSqMwvqAFHuRtm8YZSdKbikVz985BdKo_lo7GQCg,404 +numpy/core/_ufunc_config.py,sha256=-Twpe8dnd45ccXH-w-B9nvU8yCOd1E0e3Wpsts3g_bQ,13944 
+numpy/core/_ufunc_config.pyi,sha256=-615enOVQMBhVx7Pln7DY_s4H6JjSgSnBy89YkpvuLg,1066 +numpy/core/_umath_tests.cpython-312-x86_64-linux-gnu.so,sha256=kT7z3gJc2t_GgamgqAf3MNRWeVo8KrSWPZVh3mLs_t8,42272 +numpy/core/arrayprint.py,sha256=ySZj4TZFFVCa5yhMmJKFYQYhuQTabZTRBb1YoiCD-ac,63608 +numpy/core/arrayprint.pyi,sha256=21pOWjTSfJOBaKgOOPzRox1ERb3c9ydufqL0b11_P_Q,4428 +numpy/core/cversions.py,sha256=H_iNIpx9-hY1cQNxqjT2d_5SXZhJbMo_caq4_q6LB7I,347 +numpy/core/defchararray.py,sha256=G1LExk-dMeVTYRhtYgcCZEsHk5tkawk7giXcK4Q5KVM,73617 +numpy/core/defchararray.pyi,sha256=ib3aWFcM7F4KooU57mWUNi4GlosNjdfgrLKBVSIKDvU,9216 +numpy/core/einsumfunc.py,sha256=TrL6t79F0H0AQH0y5Cj7Tq0_pzk4fVFi-4q4jJmujYQ,51868 +numpy/core/einsumfunc.pyi,sha256=IJZNdHHG_soig8XvCbXZl43gMr3MMKl9dckTYWecqLs,4860 +numpy/core/fromnumeric.py,sha256=YMtxOBg51VMem39AHXFs-4_vOb1p48ei7njXdYTRJ_Q,128821 +numpy/core/fromnumeric.pyi,sha256=KATMFeFxUJ8YNRaC-jd_dTOt3opz2ng6lHgke5u5COk,23726 +numpy/core/function_base.py,sha256=tHg1qSHTz1eO_wHXNFRt3Q40uqVtPT2eyQdrWbIi4wQ,19836 +numpy/core/function_base.pyi,sha256=3ZYad3cdaGwNEyP8VwK97IYMqk2PDoVjpjQzhIYHjk0,4725 +numpy/core/getlimits.py,sha256=AopcTZDCUXMPcEKIZE1botc3mEhmLb2p1_ejlq1CLqY,25865 +numpy/core/getlimits.pyi,sha256=qeIXUEtognTHr_T-tv-VcZI7n8Z2VzAyIpIgKXzsLkc,82 +numpy/core/include/numpy/__multiarray_api.c,sha256=nPRzTez_Wy3YXy3zZNJNPMspAzxbLOdohqhXwouwMLM,12116 +numpy/core/include/numpy/__multiarray_api.h,sha256=ZM--FKMhIaSQS39cPW0hj5dx8ngNMmbcy6SbgXZBd8U,61450 +numpy/core/include/numpy/__ufunc_api.c,sha256=670Gcz-vhkF4taBDmktCpFRBrZ9CHJnPRx7ag7Z6HsI,1714 +numpy/core/include/numpy/__ufunc_api.h,sha256=0MBOl7dgO3ldqdDi-SdciEOuqGv1UNsmk7mp7tEy4AY,12456 +numpy/core/include/numpy/_dtype_api.h,sha256=4veCexGvx9KNWMIUuEUAVOfcsei9GqugohDY5ud16pA,16697 +numpy/core/include/numpy/_neighborhood_iterator_imp.h,sha256=s-Hw_l5WRwKtYvsiIghF0bg-mA_CgWnzFFOYVFJ-q4k,1857 +numpy/core/include/numpy/_numpyconfig.h,sha256=o0fV_jb-wgVtRxnVIWvUttiZafyrWYFm2ab9Uixz1Cw,855 
+numpy/core/include/numpy/arrayobject.h,sha256=-BlWQ7kfVbzCqzHn0qaeMe0_08AbwliuG98XWG57lT8,282 +numpy/core/include/numpy/arrayscalars.h,sha256=C3vDRndZTZRbppiDyV5jp8sV3dRKsrwBIZcNlh9gSTA,3944 +numpy/core/include/numpy/experimental_dtype_api.h,sha256=tlehD5r_pYhHbGzIrUea6vtOgf6IQ8Txblnhx7455h8,15532 +numpy/core/include/numpy/halffloat.h,sha256=TRZfXgipa-dFppX2uNgkrjrPli-1BfJtadWjAembJ4s,1959 +numpy/core/include/numpy/ndarrayobject.h,sha256=PhY4NjRZDoU5Zbc8MW0swPEm81hwgWZ63gAU93bLVVI,10183 +numpy/core/include/numpy/ndarraytypes.h,sha256=EjWXv-J8C5JET4AlIbJRdctycL7-dyJZcnoWgnlCPc8,68009 +numpy/core/include/numpy/noprefix.h,sha256=d83l1QpCCVqMV2k29NMkL3Ld1qNjiC6hzOPWZAivEjQ,6830 +numpy/core/include/numpy/npy_1_7_deprecated_api.h,sha256=y0MJ8Qw7Bkt4H_4VxIzHzpkw5JqAdj5ECgtn08fZFrI,4327 +numpy/core/include/numpy/npy_3kcompat.h,sha256=SvN9yRA3i02O4JFMXxZz0Uq_vJ5ZpvC-pC2sfF56A5I,15883 +numpy/core/include/numpy/npy_common.h,sha256=apWBsCJeP8P5T0exgzhFcGohbASsUF8vtFdS2jc1VfU,37746 +numpy/core/include/numpy/npy_cpu.h,sha256=pcVRtj-Y6120C5kWB1VAiAjZoxkTPDEg0gGm5IAt3jM,4629 +numpy/core/include/numpy/npy_endian.h,sha256=we7X9fPeWzNpo_YTh09MPGDwdE0Rw_WDM4c9y4nBj5I,2786 +numpy/core/include/numpy/npy_interrupt.h,sha256=DQZIxi6FycLXD8drdHn2SSmLoRhIpo6osvPv13vowUA,1948 +numpy/core/include/numpy/npy_math.h,sha256=SbKRoc7O3gVuDl7HOZjk424O049I0zn-7i9GwBwNmmk,18945 +numpy/core/include/numpy/npy_no_deprecated_api.h,sha256=0yZrJcQEJ6MCHJInQk5TP9_qZ4t7EfBuoLOJ34IlJd4,678 +numpy/core/include/numpy/npy_os.h,sha256=hlQsg_7-RkvS3s8OM8KXy99xxyJbCm-W1AYVcdnO1cw,1256 +numpy/core/include/numpy/numpyconfig.h,sha256=Nr59kE3cXmen6y0UymIBaU7F1BSIuPwgKZ4gdV5Q5JU,5308 +numpy/core/include/numpy/old_defines.h,sha256=xuYQDDlMywu0Zsqm57hkgGwLsOFx6IvxzN2eiNF-gJY,6405 +numpy/core/include/numpy/random/LICENSE.txt,sha256=-8U59H0M-DvGE3gID7hz1cFGMBJsrL_nVANcOSbapew,1018 +numpy/core/include/numpy/random/bitgen.h,sha256=49AwKOR552r-NkhuSOF1usb_URiMSRMvD22JF5pKIng,488 
+numpy/core/include/numpy/random/distributions.h,sha256=W5tOyETd0m1W0GdaZ5dJP8fKlBtsTpG23V2Zlmrlqpg,9861 +numpy/core/include/numpy/random/libdivide.h,sha256=ew9MNhPQd1LsCZiWiFmj9IZ7yOnA3HKOXffDeR9X1jw,80138 +numpy/core/include/numpy/ufuncobject.h,sha256=Xmnny_ulZo9VwxkfkXF-1HCTKDavIp9PV_H7XWhi0Z8,12070 +numpy/core/include/numpy/utils.h,sha256=wMNomSH3Dfj0q78PrjLVtFtN-FPo7UJ4o0ifCUO-6Es,1185 +numpy/core/lib/libnpymath.a,sha256=mb8EluEp8SLpEeCTQJ0VshL-CqeZfWxSbS5ItM-9POc,93960 +numpy/core/lib/npy-pkg-config/mlib.ini,sha256=_LsWV1eStNqwhdiYPa2538GL46dnfVwT4MrI1zbsoFw,147 +numpy/core/lib/npy-pkg-config/npymath.ini,sha256=kamUNrYKAmXqQa8BcNv7D5sLqHh6bnChM0_5rZCsTfY,360 +numpy/core/memmap.py,sha256=yWBJLeVClHsD8BYusnf9bdqypOMPrj3_zoO_lQ2zVMc,11771 +numpy/core/memmap.pyi,sha256=sxIQ7T5hPLG-RBNndAc8JPvrsKEX1amBSH2HGg48Obo,55 +numpy/core/multiarray.py,sha256=zXaWf_DSkFEWjUQqVRCGeevwsI6kjQ3x6_MUwA1Y8fk,56097 +numpy/core/multiarray.pyi,sha256=_0X4W90U5ZiKt2n-9OscK-pcQyV6oGK-8jwGy5k1qxA,24768 +numpy/core/numeric.py,sha256=DgajaCDXiiQR-zuW_rrx_QhApSsa5k5FONK3Uk9mfTs,77014 +numpy/core/numeric.pyi,sha256=oVQkI4ABayFl_ZzCiGH4DxfYASL-3aETi-3B93THnEQ,14315 +numpy/core/numerictypes.py,sha256=qIf9v1OpNjjVQzXnKpD-3V01y5Bj9huw5F-U5Wa4glc,18098 +numpy/core/numerictypes.pyi,sha256=dEqtq9MLrGaqqeAF1sdXBgnEwDWOzlK02A6MTg1PS5g,3267 +numpy/core/overrides.py,sha256=YUZFS8RCBvOJ27sH-jDRcyMjOCn9VigMyuQY4J21JBI,7093 +numpy/core/records.py,sha256=4mpIjUp2XtZxY5cD2S8mgfn8GCzQGGrrkqLBqAJwM-Q,37533 +numpy/core/records.pyi,sha256=uYwE6cAoGKgN6U4ryfGZx_3m-3sY006jytjWLrDRRy0,5692 +numpy/core/shape_base.py,sha256=RPMKxA7_FCAgg_CruExl0LehnczSTFaxA6hrcfrUzns,29743 +numpy/core/shape_base.pyi,sha256=Ilb4joJmbjkIZLzKww7NJeaxg2FP3AfFib3HtfOsrC0,2774 +numpy/core/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 +numpy/core/tests/_locales.py,sha256=S4x5soqF0oxpBYOE8J9Iky72O9J25IiZ8349m93pWC4,2206 +numpy/core/tests/data/astype_copy.pkl,sha256=lWSzCcvzRB_wpuRGj92spGIw-rNPFcd9hwJaRVvfWdk,716 
+numpy/core/tests/data/generate_umath_validation_data.cpp,sha256=fyhQPNhIX9hzjeXujn6mhi1MVc133zELSV_hlSQ7BQU,5842 +numpy/core/tests/data/numpy_2_0_array.pkl,sha256=Vh02tdyCypa8Nb4QzdVhnDAiXEO2WQrcwcvOdDDFF5w,718 +numpy/core/tests/data/recarray_from_file.fits,sha256=NA0kliz31FlLnYxv3ppzeruONqNYkuEvts5wzXEeIc4,8640 +numpy/core/tests/data/umath-validation-set-README.txt,sha256=pxWwOaGGahaRd-AlAidDfocLyrAiDp0whf5hC7hYwqM,967 +numpy/core/tests/data/umath-validation-set-arccos.csv,sha256=W_aL99bjzVjlVyd5omfDUORag8jHzx6uctedPVZgOHQ,61365 +numpy/core/tests/data/umath-validation-set-arccosh.csv,sha256=Uko_d0kDXr1YlN-6Ii-fQQxUvbXAhRfC7Un4gJ23GJk,61365 +numpy/core/tests/data/umath-validation-set-arcsin.csv,sha256=15Aenze4WD2a2dF2aOBXpv9B7u3wwAeUVJdEm4TjOkQ,61339 +numpy/core/tests/data/umath-validation-set-arcsinh.csv,sha256=uDwx4PStpfV21IaPF8pmzQpul6i72g7zDwlfcynWaVQ,60289 +numpy/core/tests/data/umath-validation-set-arctan.csv,sha256=mw5tYze_BMs6ugGEZfg5mcXoInGYdn7fvSCYSUi9Bqw,60305 +numpy/core/tests/data/umath-validation-set-arctanh.csv,sha256=95l4Uu5RmZajljabfqlv5U34RVrifCMhhkop6iLeNBo,61339 +numpy/core/tests/data/umath-validation-set-cbrt.csv,sha256=v855MTZih-fZp_GuEDst2qaIsxU4a7vlAbeIJy2xKpc,60846 +numpy/core/tests/data/umath-validation-set-cos.csv,sha256=0PNnDqKkokZ7ERVDgbes8KNZc-ISJrZUlVZc5LkW18E,59122 +numpy/core/tests/data/umath-validation-set-cosh.csv,sha256=FGCNeUSUTAeASsb_j18iRSsCxXLxmzF-_C7tq1elVrQ,60869 +numpy/core/tests/data/umath-validation-set-exp.csv,sha256=BKg1_cyrKD2GXYMX_EB0DnXua8DI2O1KWODXf_BRhrk,17491 +numpy/core/tests/data/umath-validation-set-exp2.csv,sha256=f1b05MRXPOXihC9M-yi52udKBzVXalhbTuIcqoDAk-g,58624 +numpy/core/tests/data/umath-validation-set-expm1.csv,sha256=_ghc1xiUECNsBGrKCFUAy2lvu01_lkpeYJN0zDtCYWk,60299 +numpy/core/tests/data/umath-validation-set-log.csv,sha256=z9ej1ykKUoMRqYMUIJENWXbYi_A_x_RKs7K_GuXZJus,11692 +numpy/core/tests/data/umath-validation-set-log10.csv,sha256=RJgpruL16FVPgUT3-3xW4eppS_tn6o5yEW79KnITn48,68922 
+numpy/core/tests/data/umath-validation-set-log1p.csv,sha256=IZZI-hi55HGCOvBat3vSBVha_8Nt-5alf2fqz6QeTG0,60303 +numpy/core/tests/data/umath-validation-set-log2.csv,sha256=HL2rOCsrEi378rNrbsXHPqlWlEGkXQq8R4e63YeTksU,68917 +numpy/core/tests/data/umath-validation-set-sin.csv,sha256=8PUjnQ_YfmxFb42XJrvpvmkeSpEOlEXSmNvIK4VgfAM,58611 +numpy/core/tests/data/umath-validation-set-sinh.csv,sha256=CYiibE8aX7MQnBatl__5k_PWc_9vHUifwS-sFZzzKk0,60293 +numpy/core/tests/data/umath-validation-set-tan.csv,sha256=Oq7gxMvblRVBrQ23kMxc8iT0bHnCWKg9EE4ZqzbJbOA,60299 +numpy/core/tests/data/umath-validation-set-tanh.csv,sha256=iolZF_MOyWRgYSa-SsD4df5mnyFK18zrICI740SWoTc,60299 +numpy/core/tests/examples/cython/checks.pyx,sha256=rKAhPSGHJ9oPK9Q_85YoUQyRTftEP1jcYOR5lSPB6oQ,662 +numpy/core/tests/examples/cython/meson.build,sha256=Qk4Q6OkpZ0xsLUkcGQVVrYkzb0ozoyL6YlSZ8_5tH1I,1088 +numpy/core/tests/examples/cython/setup.py,sha256=aAR-TvQabUabnCzuB6UdWdmRXaaPfIG7MzTIfMF-0tk,496 +numpy/core/tests/examples/limited_api/limited_api.c,sha256=mncE8TjjXmYpkwli433G0jB2zGQO_5NqWmGKdzRJZug,344 +numpy/core/tests/examples/limited_api/setup.py,sha256=p2w7F1ardi_GRXSrnNIR8W1oeH_pgmw_1P2wS0A2I6M,435 +numpy/core/tests/test__exceptions.py,sha256=QqxQSLXboPXEVwHz-TyE2JeIl_TC-rPugzfo25nbcns,2846 +numpy/core/tests/test_abc.py,sha256=FfgYA_HjYAi8XWGK_oOh6Zw86chB_KG_XoW_7ZlFp4c,2220 +numpy/core/tests/test_api.py,sha256=UMc7SvczAQ5ngHxE-NoXVvNpVzYRrn8oMwFNta1yMS0,22995 +numpy/core/tests/test_argparse.py,sha256=C0zBbwQ9xzzymXe_hHpWnnWQPwOi2ZdQB78gBAgJHvU,1969 +numpy/core/tests/test_array_coercion.py,sha256=zY4Pjlt4QZ0w71WxWGLHcrPnnhEF51yXYVLg5HMIy5c,34379 +numpy/core/tests/test_array_interface.py,sha256=8tGgj1Nzi76H_WF5GULkxqWL7Yu_Xf0lvTJZOwOBKsI,7774 +numpy/core/tests/test_arraymethod.py,sha256=VpjDYTmoMDTZcY7CsGzinBh0R_OICuwOykWCbmCRQZU,3244 +numpy/core/tests/test_arrayprint.py,sha256=cKaIoD9ZvsjJH0PHwZyOxmcRcBt1kN1WfFneqVqs0b8,40462 
+numpy/core/tests/test_casting_floatingpoint_errors.py,sha256=W3Fgk0oKtXFv684fEZ7POwj6DHTYK0Jj_oGRLZ8UdyA,5063 +numpy/core/tests/test_casting_unittests.py,sha256=9-vkR0oXczQz8ED8DxGVPmalC8IZXe2jKgOCMGr8hIg,34298 +numpy/core/tests/test_conversion_utils.py,sha256=jNhbNNI-T8qtQnsIMEax7KFN30kjh0ICntLMwTyxJ5Q,6559 +numpy/core/tests/test_cpu_dispatcher.py,sha256=v_SlhUpENuoe7QYXizzYITLGXa7WfZ7jqcqmbSBg7JU,1542 +numpy/core/tests/test_cpu_features.py,sha256=mieGx7dxXFiyTYatbcCCjIjR67Un2hVcbJx4GEf2yFo,14892 +numpy/core/tests/test_custom_dtypes.py,sha256=JogRmttDLwfQ3PTbewEnGLKco9zV2Nu3yIfrMeCsx_I,9401 +numpy/core/tests/test_cython.py,sha256=t5-h4XSIFNLyw_9BIAQDYl8_80t_pH0SCfEa1Vf_3aI,3755 +numpy/core/tests/test_datetime.py,sha256=2vAGbrCQmsrWNXCVXOMZqUGZn2c-cQT-eZ1wTprYbcM,116211 +numpy/core/tests/test_defchararray.py,sha256=F88HUkByEP4H6cJ_ITvIe0a_T1BH2JOdRysMCu1XIn0,24997 +numpy/core/tests/test_deprecations.py,sha256=w2lhHb-W8hh7RoE_0Ftg8thpG86jvbFAJgior22DY2Q,31076 +numpy/core/tests/test_dlpack.py,sha256=cDlwFmTombb2rDeB8RHEAJ4eVMUiDbw8Oz5Jo1NQwk0,3522 +numpy/core/tests/test_dtype.py,sha256=J09pJF59v7UO6iNuJFISKP2DLPgdkQ_df5OAMDRLikU,75702 +numpy/core/tests/test_einsum.py,sha256=QzQAPIC-IjTV3Dxz97hBnvLBCmF8kpsBTBckThhgRjQ,53712 +numpy/core/tests/test_errstate.py,sha256=U3GT9I058jkF725mx4GdWUr9RoceCkGDV7Go79VA4wY,2219 +numpy/core/tests/test_extint128.py,sha256=gCZfAwPOb-F1TLsEEeDI0amQYwHk-60-OXi0ccZrrZ8,5643 +numpy/core/tests/test_function_base.py,sha256=Ibs6-WXZE5hsRx4VCnX-cZOWYKU-5PFXjouwAQzgnqQ,15595 +numpy/core/tests/test_getlimits.py,sha256=apdxr0zKkxaVHIUpLrqAvO39q54JKN14sV4xSbK2Ifs,6718 +numpy/core/tests/test_half.py,sha256=VYPyap9GYOWZuphsfFofcIRl-oa5Ufrtv83OTp6azdU,24593 +numpy/core/tests/test_hashtable.py,sha256=ZV8HL8NkDnoQZfnje7BP0fyIp4fSFqjKsQc40PaTggc,1011 +numpy/core/tests/test_indexerrors.py,sha256=kN9xLl6FVTzmI7fumn_cuZ3k0omXnTetgtCnPY44cvw,5130 +numpy/core/tests/test_indexing.py,sha256=x0ojWuhOwWD5MZuiJ9Ncim3CgkwI-GldWxrSCmjmFJM,54314 
+numpy/core/tests/test_item_selection.py,sha256=kI30kiX8mIrZYPn0jw3lGGw1ruZF4PpE9zw-aai9EPA,6458 +numpy/core/tests/test_limited_api.py,sha256=5yO0nGmCKZ9b3S66QP7vY-HIgAoyOtHZmp8mvzKuOHI,1172 +numpy/core/tests/test_longdouble.py,sha256=jO8YMm_Hsz-XPKbmv6iMcOdHgTlIFkKTwAtxpy3Q1pE,13905 +numpy/core/tests/test_machar.py,sha256=_5_TDUVtAJvJI5jBfEFKpCZtAfKCsCFt7tXlWSkWzzc,1067 +numpy/core/tests/test_mem_overlap.py,sha256=QJ0unWD_LOoAGAo4ra0IvYenj56IYUtiz1fEJEmTY9Q,29086 +numpy/core/tests/test_mem_policy.py,sha256=CXa10FQw2Qj6MqJuaC8Fm4slsoipKFjCIpYF6c5IIAU,16801 +numpy/core/tests/test_memmap.py,sha256=tZ5lJs_4ZFsJmg392ZQ33fX0m8tdfZ8ZtY9Lq41LNtk,7477 +numpy/core/tests/test_multiarray.py,sha256=GPv4IJR9dijNG-icUsQsX2tBD2RdP3EhUehY4cxvVQU,380106 +numpy/core/tests/test_nditer.py,sha256=nVQ00aNxPHqf4ZcFs3e9AVDK64TCqlO0TzfocTAACZQ,130818 +numpy/core/tests/test_nep50_promotions.py,sha256=2TwtFvj1LBpYTtdR6NFe1RAAGXIJltLqwpA1vhQCVY4,8840 +numpy/core/tests/test_numeric.py,sha256=ZGNW5NKgShEjZC_TcPOtTuRaTM_GbuM21u82D205UPs,137294 +numpy/core/tests/test_numerictypes.py,sha256=f_xMjZJnyDwlc6XCrd71b6x1_6dAWOv-kZ3-NEq37hU,21687 +numpy/core/tests/test_numpy_2_0_compat.py,sha256=kVCTAXska7Xi5w_TYduWhid0nlCqI6Nvmt-gDnYsuKI,1630 +numpy/core/tests/test_overrides.py,sha256=t0gOZOzu7pevE58HA-npFYJqnInHR-LLBklnzKJWHqo,26080 +numpy/core/tests/test_print.py,sha256=ErZAWd88b0ygSEoYpd0BL2tFjkerMtn1vZ7dWvaNqTc,6837 +numpy/core/tests/test_protocols.py,sha256=fEXE9K9s22oiVWkX92BY-g00-uXCK-HxjZhZxxYAKFc,1168 +numpy/core/tests/test_records.py,sha256=pluit5x6jkWoPEIrHXM13L3xZuuSSiaxoXFsOdkakCU,20269 +numpy/core/tests/test_regression.py,sha256=SJo9cPTVr2SNjhgtW7boUMyNQlXxygsZ5g0oyqC8Eks,91595 +numpy/core/tests/test_scalar_ctors.py,sha256=qDIZV-tBukwAxNDhUmGtH3CemDXlS3xd_q3L52touuA,6115 +numpy/core/tests/test_scalar_methods.py,sha256=Uj-zU0zzzKAjMBdpkzsWZ3nSFj5gJkUlqi_euhOYdnU,7541 +numpy/core/tests/test_scalarbuffer.py,sha256=FSL94hriWX1_uV6Z33wB3ZXUrpmmX2-x87kNjIxUeBk,5580 
+numpy/core/tests/test_scalarinherit.py,sha256=fMInDGKsiH3IS_2ejZtIcmJZ0Ry8c7kVsHx7wp5XDoM,2368 +numpy/core/tests/test_scalarmath.py,sha256=XZj_m2I2TLktJdFD1SWj2XtV8hT26VIxasDz3cAFvgA,43247 +numpy/core/tests/test_scalarprint.py,sha256=1599W5X0tjGhBnSQjalXkg6AY8eHXnr6PMqs4vYZQqs,18771 +numpy/core/tests/test_shape_base.py,sha256=D9haeuUVx3x3pOLmFQ9vUz7iU4T2bFTsPoI8HgSncFU,29723 +numpy/core/tests/test_simd.py,sha256=-L1UhIn9Eu_euLwaSU7bPRfYpWWOTb43qovoJS7Ws7w,48696 +numpy/core/tests/test_simd_module.py,sha256=OSpYhH_3QDxItyQcaW6SjXW57k2m-weRwpYOnJjCqN0,3902 +numpy/core/tests/test_strings.py,sha256=A9t1B65lFrYRLXgDJSg3mMDAe_hypIPcTMVOdAYIbU0,3835 +numpy/core/tests/test_ufunc.py,sha256=5pS2x3LACHn8GogYYad8LRAjByK7Gg9xTD9ik3d0Fm0,124907 +numpy/core/tests/test_umath.py,sha256=huHpclJqkO32k7BTflRHj8nImzg3p6yyryeS9LyHKWU,186482 +numpy/core/tests/test_umath_accuracy.py,sha256=mFcVdzXhhD9mqhzLDJVZsWfCHbjbFQ6XeEl5G8l-PTc,3897 +numpy/core/tests/test_umath_complex.py,sha256=WvZZZWeijo52RiOfx-G83bxzQOp_IJ3i9fEnUDVukLQ,23247 +numpy/core/tests/test_unicode.py,sha256=hUXIwMmoq89y_KXWzuXVyQaXvRwGjfY4TvKJsCbygEI,12775 +numpy/core/umath.py,sha256=JbT_SxnZ_3MEmjOI9UtX3CcAzX5Q-4RDlnnhDAEJ5Vo,2040 +numpy/core/umath_tests.py,sha256=TIzaDfrEHHgSc2J5kxFEibq8MOPhwSuyOZOUBsZNVSM,389 +numpy/ctypeslib.py,sha256=Po4XCWfxhwFQ1Q8x8DeayGiMCJLxREaCDkVyeladxBU,17247 +numpy/ctypeslib.pyi,sha256=A9te473aRO920iDVuyKypeVIQp-ueZK6EiI-qLSwJNg,7972 +numpy/doc/__init__.py,sha256=OYmE-F6x0CD05PCDY2MiW1HLlwB6i9vhDpk-a3r4lHY,508 +numpy/doc/constants.py,sha256=PlXoj7b4A8Aa9nADbg83uzTBRJaX8dvJmEdbn4FDPPo,9155 +numpy/doc/ufuncs.py,sha256=i1alLg19mNyCFZ2LYSOZGm--RsRN1x63U_UYU-N3x60,5357 +numpy/dtypes.py,sha256=BuBztrPQRasUmVZhXr2_NgJujdUTNhNwd59pZZHk3lA,2229 +numpy/dtypes.pyi,sha256=tIHniAYP7ALg2iT7NgSXO67jvE-zRlDod3MazEmD4M8,1315 +numpy/exceptions.py,sha256=7j7tv8cwXGZYgldyMisGmnAxAl2s4YU0vexME81yYlA,7339 +numpy/exceptions.pyi,sha256=KsZqWNvyPUEXUGR9EhZCUQF2f9EVSpBRlJUlGqRT02k,600 
+numpy/f2py/__init__.py,sha256=m-ty_WiJZ4GVfV5--kJ3MFJaLXestz5Eo-4H0FPscK4,5565 +numpy/f2py/__init__.pyi,sha256=eA7uYXZr0p0aaz5rBW-EypLx9RchrvqDYtSnkEJQsYw,1087 +numpy/f2py/__main__.py,sha256=6i2jVH2fPriV1aocTY_dUFvWK18qa-zjpnISA-OpF3w,130 +numpy/f2py/__version__.py,sha256=7HHdjR82FCBmftwMRyrlhcEj-8mGQb6oCH-wlUPH4Nw,34 +numpy/f2py/_backends/__init__.py,sha256=7_bA7c_xDpLc4_8vPfH32-Lxn9fcUTgjQ25srdvwvAM,299 +numpy/f2py/_backends/_backend.py,sha256=GKb9-UaFszT045vUgVukPs1n97iyyjqahrWKxLOKNYo,1187 +numpy/f2py/_backends/_distutils.py,sha256=pxh2YURFYYSykIOvBFwVvhoNX1oSk-c30IPPhzlko-0,2383 +numpy/f2py/_backends/_meson.py,sha256=gi-nbnPFDC38sumfAjg-Q5FPu6nNkyQXTjEuVf9W9Cc,6916 +numpy/f2py/_backends/meson.build.template,sha256=oTPNMAQzS4CJ_lfEzYv-oBeJTtQuThUYVN5R6ROWpNU,1579 +numpy/f2py/_isocbind.py,sha256=zaBgpfPNRmxVG3doUIlbZIiyB990MsXiwDabrSj9HnQ,2360 +numpy/f2py/_src_pyf.py,sha256=4t6TN4ZKWciC4f1z6fwaGrpIGhHKRiwHfcrNj4FIzCg,7654 +numpy/f2py/auxfuncs.py,sha256=dNs4b2KDIcG4M1hPBvD09-Vh7CDzlPIrFscOdvL3p1o,26539 +numpy/f2py/capi_maps.py,sha256=ENjYyeZ3CCJcLwJJgmKOSYrD1KPuhpwauXqeizdV55o,30563 +numpy/f2py/cb_rules.py,sha256=5TuHbJWGjsF6yVNzKuV2tAnwdLyhcWlmdsjYlDOZOv4,24992 +numpy/f2py/cfuncs.py,sha256=KJyW7mdjmFSmxssfeegGJs5NZyF3mZMgNvOxN9-vYHQ,51913 +numpy/f2py/common_rules.py,sha256=gHB76WypbkVmhaD_RWhy8Od4zDTgj8cbDOdUdIp6PIQ,5131 +numpy/f2py/crackfortran.py,sha256=ErLdkWP8MxeyW5vVPGXwyvrxZAwymlvIBC0th2rvK74,148553 +numpy/f2py/diagnose.py,sha256=0SRXBE2hJgKJN_Rf4Zn00oKXC_Tka3efPWM47zg6BoY,5197 +numpy/f2py/f2py2e.py,sha256=5t093ZQ4xs0_0UbyaYVd2yA2EVOaOAcuU29JI-IU2Ag,27717 +numpy/f2py/f90mod_rules.py,sha256=otm3_dmVIna0eBVHLu_693s3a_82lU3pqeqDacWI37s,9594 +numpy/f2py/func2subr.py,sha256=6d2R5awuHRT4xzgfUfwS7JHTqhhAieSXcENlssD_2c4,10298 +numpy/f2py/rules.py,sha256=B4FxSYEfZ_1j_z9GulQNZ1BNrPrUvlU3ybxwTkrIxjI,62727 +numpy/f2py/setup.cfg,sha256=Fpn4sjqTl5OT5sp8haqKIRnUcTPZNM6MIvUJBU7BIhg,48 +numpy/f2py/setup.py,sha256=MmAVspT8DDTqDuL8ZJhxK62g0lcso4vqI6QNQ9CsfoQ,2422 
+numpy/f2py/src/fortranobject.c,sha256=g4BKDO1_9pCu6hithKXD2oH_Mt-HH1NTnP6leCqJrzc,46017 +numpy/f2py/src/fortranobject.h,sha256=neMKotYWbHvrhW9KXz4QzQ8fzPkiQXLHHjy82vLSeog,5835 +numpy/f2py/symbolic.py,sha256=jWBoAwECCxRdWczR9r7O6UERcYmH_GbdcAReNp7cmJY,53270 +numpy/f2py/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 +numpy/f2py/tests/src/abstract_interface/foo.f90,sha256=JFU2w98cB_XNwfrqNtI0yDTmpEdxYO_UEl2pgI_rnt8,658 +numpy/f2py/tests/src/abstract_interface/gh18403_mod.f90,sha256=gvQJIzNtvacWE0dhysxn30-iUeI65Hpq7DiE9oRauz8,105 +numpy/f2py/tests/src/array_from_pyobj/wrapmodule.c,sha256=Ff5wHYV9-OJnZuelfFWcjAibRvDkEIlbTVczTyv6TG8,7299 +numpy/f2py/tests/src/assumed_shape/.f2py_f2cmap,sha256=But9r9m4iL7EGq_haMW8IiQ4VivH0TgUozxX4pPvdpE,29 +numpy/f2py/tests/src/assumed_shape/foo_free.f90,sha256=oBwbGSlbr9MkFyhVO2aldjc01dr9GHrMrSiRQek8U64,460 +numpy/f2py/tests/src/assumed_shape/foo_mod.f90,sha256=rfzw3QdI-eaDSl-hslCgGpd5tHftJOVhXvb21Y9Gf6M,499 +numpy/f2py/tests/src/assumed_shape/foo_use.f90,sha256=rmT9k4jP9Ru1PLcGqepw9Jc6P9XNXM0axY7o4hi9lUw,269 +numpy/f2py/tests/src/assumed_shape/precision.f90,sha256=r08JeTVmTTExA-hYZ6HzaxVwBn1GMbPAuuwBhBDtJUk,130 +numpy/f2py/tests/src/block_docstring/foo.f,sha256=y7lPCPu7_Fhs_Tf2hfdpDQo1bhtvNSKRaZAOpM_l3dg,97 +numpy/f2py/tests/src/callback/foo.f,sha256=C1hjfpRCQWiOVVzIHqnsYcnLrqQcixrnHCn8hd9GhVk,1254 +numpy/f2py/tests/src/callback/gh17797.f90,sha256=_Nrl0a2HgUbtymGU0twaJ--7rMa1Uco2A3swbWvHoMo,148 +numpy/f2py/tests/src/callback/gh18335.f90,sha256=NraOyKIXyvv_Y-3xGnmTjtNjW2Znsnlk8AViI8zfovc,506 +numpy/f2py/tests/src/callback/gh25211.f,sha256=a2sxlQhtDVbYn8KOKHUYqwc-aCFt7sDPSnJsXFG35uI,179 +numpy/f2py/tests/src/callback/gh25211.pyf,sha256=FWxo0JWQlw519BpZV8PoYeI_FZ_K6C-3Wk6gLrfBPlw,447 +numpy/f2py/tests/src/cli/gh_22819.pyf,sha256=5rvOfCv-wSosB354LC9pExJmMoSHnbGZGl_rtA2fogA,142 +numpy/f2py/tests/src/cli/hi77.f,sha256=ttyI6vAP3qLnDqy82V04XmoqrXNM6uhMvvLri2p0dq0,71 
+numpy/f2py/tests/src/cli/hiworld.f90,sha256=QWOLPrTxYQu1yrEtyQMbM0fE9M2RmXe7c185KnD5x3o,51 +numpy/f2py/tests/src/common/block.f,sha256=GQ0Pd-VMX3H3a-__f2SuosSdwNXHpBqoGnQDjf8aG9g,224 +numpy/f2py/tests/src/common/gh19161.f90,sha256=BUejyhqpNVfHZHQ-QC7o7ZSo7lQ6YHyX08lSmQqs6YM,193 +numpy/f2py/tests/src/crackfortran/accesstype.f90,sha256=-5Din7YlY1TU7tUHD2p-_DSTxGBpDsWYNeT9WOwGhno,208 +numpy/f2py/tests/src/crackfortran/data_common.f,sha256=ZSUAh3uhn9CCF-cYqK5TNmosBGPfsuHBIEfudgysun4,193 +numpy/f2py/tests/src/crackfortran/data_multiplier.f,sha256=jYrJKZWF_59JF9EMOSALUjn0UupWvp1teuGpcL5s1Sc,197 +numpy/f2py/tests/src/crackfortran/data_stmts.f90,sha256=19YO7OGj0IksyBlmMLZGRBQLjoE3erfkR4tFvhznvvE,693 +numpy/f2py/tests/src/crackfortran/data_with_comments.f,sha256=hoyXw330VHh8duMVmAQZjr1lgLVF4zFCIuEaUIrupv0,175 +numpy/f2py/tests/src/crackfortran/foo_deps.f90,sha256=CaH7mnWTG7FcnJe2vXN_0zDbMadw6NCqK-JJ2HmDjK8,128 +numpy/f2py/tests/src/crackfortran/gh15035.f,sha256=jJly1AzF5L9VxbVQ0vr-sf4LaUo4eQzJguhuemFxnvg,375 +numpy/f2py/tests/src/crackfortran/gh17859.f,sha256=7K5dtOXGuBDAENPNCt-tAGJqTfNKz5OsqVSk16_e7Es,340 +numpy/f2py/tests/src/crackfortran/gh22648.pyf,sha256=qZHPRNQljIeYNwbqPLxREnOrSdVV14f3fnaHqB1M7c0,241 +numpy/f2py/tests/src/crackfortran/gh23533.f,sha256=w3tr_KcY3s7oSWGDmjfMHv5h0RYVGUpyXquNdNFOJQg,126 +numpy/f2py/tests/src/crackfortran/gh23598.f90,sha256=41W6Ire-5wjJTTg6oAo7O1WZfd1Ug9vvNtNgHS5MhEU,101 +numpy/f2py/tests/src/crackfortran/gh23598Warn.f90,sha256=1v-hMCT_K7prhhamoM20nMU9zILam84Hr-imck_dYYk,205 +numpy/f2py/tests/src/crackfortran/gh23879.f90,sha256=LWDJTYR3t9h1IsrKC8dVXZlBfWX7clLeU006X6Ow8oI,332 +numpy/f2py/tests/src/crackfortran/gh2848.f90,sha256=gPNasx98SIf7Z9ibk_DHiGKCvl7ERtsfoGXiFDT7FbM,282 +numpy/f2py/tests/src/crackfortran/operators.f90,sha256=-Fc-qjW1wBr3Dkvdd5dMTrt0hnjnV-1AYo-NFWcwFSo,1184 +numpy/f2py/tests/src/crackfortran/privatemod.f90,sha256=7bubZGMIn7iD31wDkjF1TlXCUM7naCIK69M9d0e3y-U,174 
+numpy/f2py/tests/src/crackfortran/publicmod.f90,sha256=Pnwyf56Qd6W3FUH-ZMgnXEYkb7gn18ptNTdwmGan0Jo,167 +numpy/f2py/tests/src/crackfortran/pubprivmod.f90,sha256=eYpJwBYLKGOxVbKgEqfny1znib-b7uYhxcRXIf7uwXg,165 +numpy/f2py/tests/src/crackfortran/unicode_comment.f90,sha256=aINLh6GlfTwFewxvDoqnMqwuCNb4XAqi5Nj5vXguXYs,98 +numpy/f2py/tests/src/f2cmap/.f2py_f2cmap,sha256=iUOtfHd3OuT1Rz2-yiSgt4uPKGvCt5AzQ1iygJt_yjg,82 +numpy/f2py/tests/src/f2cmap/isoFortranEnvMap.f90,sha256=iJCD8a8MUTmuPuedbcmxW54Nr4alYuLhksBe1sHS4K0,298 +numpy/f2py/tests/src/isocintrin/isoCtests.f90,sha256=jcw-fzrFh0w5U66uJYfeUW4gv94L5MnWQ_NpsV9y0oI,998 +numpy/f2py/tests/src/kind/foo.f90,sha256=zIHpw1KdkWbTzbXb73hPbCg4N2Htj3XL8DIwM7seXpo,347 +numpy/f2py/tests/src/mixed/foo.f,sha256=90zmbSHloY1XQYcPb8B5d9bv9mCZx8Z8AMTtgDwJDz8,85 +numpy/f2py/tests/src/mixed/foo_fixed.f90,sha256=pxKuPzxF3Kn5khyFq9ayCsQiolxB3SaNtcWaK5j6Rv4,179 +numpy/f2py/tests/src/mixed/foo_free.f90,sha256=fIQ71wrBc00JUAVUj_r3QF9SdeNniBiMw6Ly7CGgPWU,139 +numpy/f2py/tests/src/module_data/mod.mod,sha256=EkjrU7NTZrOH68yKrz6C_eyJMSFSxGgC2yMQT9Zscek,412 +numpy/f2py/tests/src/module_data/module_data_docstring.f90,sha256=tDZ3fUlazLL8ThJm3VwNGJ75QIlLcW70NnMFv-JA4W0,224 +numpy/f2py/tests/src/negative_bounds/issue_20853.f90,sha256=fdOPhRi7ipygwYCXcda7p_dlrws5Hd2GlpF9EZ-qnck,157 +numpy/f2py/tests/src/parameter/constant_both.f90,sha256=-bBf2eqHb-uFxgo6Q7iAtVUUQzrGFqzhHDNaxwSICfQ,1939 +numpy/f2py/tests/src/parameter/constant_compound.f90,sha256=re7pfzcuaquiOia53UT7qNNrTYu2euGKOF4IhoLmT6g,469 +numpy/f2py/tests/src/parameter/constant_integer.f90,sha256=nEmMLitKoSAG7gBBEQLWumogN-KS3DBZOAZJWcSDnFw,612 +numpy/f2py/tests/src/parameter/constant_non_compound.f90,sha256=IcxESVLKJUZ1k9uYKoSb8Hfm9-O_4rVnlkiUU2diy8Q,609 +numpy/f2py/tests/src/parameter/constant_real.f90,sha256=quNbDsM1Ts2rN4WtPO67S9Xi_8l2cXabWRO00CPQSSQ,610 +numpy/f2py/tests/src/quoted_character/foo.f,sha256=WjC9D9171fe2f7rkUAZUvik9bkIf9adByfRGzh6V0cM,482 
+numpy/f2py/tests/src/regression/gh25337/data.f90,sha256=9Uz8CHB9i3_mjC3cTOmkTgPAF5tWSwYacG3MUrU-SY0,180 +numpy/f2py/tests/src/regression/gh25337/use_data.f90,sha256=WATiDGAoCKnGgMzm_iMgmfVU0UKOQlk5Fm0iXCmPAkE,179 +numpy/f2py/tests/src/regression/inout.f90,sha256=CpHpgMrf0bqA1W3Ozo3vInDz0RP904S7LkpdAH6ODck,277 +numpy/f2py/tests/src/return_character/foo77.f,sha256=WzDNF3d_hUDSSZjtxd3DtE-bSx1ilOMEviGyYHbcFgM,980 +numpy/f2py/tests/src/return_character/foo90.f90,sha256=ULcETDEt7gXHRzmsMhPsGG4o3lGrcx-FEFaJsPGFKyA,1248 +numpy/f2py/tests/src/return_complex/foo77.f,sha256=8ECRJkfX82oFvGWKbIrCvKjf5QQQClx4sSEvsbkB6A8,973 +numpy/f2py/tests/src/return_complex/foo90.f90,sha256=c1BnrtWwL2dkrTr7wvlEqNDg59SeNMo3gyJuGdRwcDw,1238 +numpy/f2py/tests/src/return_integer/foo77.f,sha256=_8k1evlzBwvgZ047ofpdcbwKdF8Bm3eQ7VYl2Y8b5kA,1178 +numpy/f2py/tests/src/return_integer/foo90.f90,sha256=bzxbYtofivGRYH35Ang9ScnbNsVERN8-6ub5-eI-LGQ,1531 +numpy/f2py/tests/src/return_logical/foo77.f,sha256=FxiF_X0HkyXHzJM2rLyTubZJu4JB-ObLnVqfZwAQFl8,1188 +numpy/f2py/tests/src/return_logical/foo90.f90,sha256=9KmCe7yJYpi4ftkKOM3BCDnPOdBPTbUNrKxY3p37O14,1531 +numpy/f2py/tests/src/return_real/foo77.f,sha256=ZTrzb6oDrIDPlrVWP3Bmtkbz3ffHaaSQoXkfTGtCuFE,933 +numpy/f2py/tests/src/return_real/foo90.f90,sha256=gZuH5lj2lG6gqHlH766KQ3J4-Ero-G4WpOOo2MG3ohU,1194 +numpy/f2py/tests/src/size/foo.f90,sha256=IlFAQazwBRr3zyT7v36-tV0-fXtB1d7WFp6S1JVMstg,815 +numpy/f2py/tests/src/string/char.f90,sha256=ihr_BH9lY7eXcQpHHDQhFoKcbu7VMOX5QP2Tlr7xlaM,618 +numpy/f2py/tests/src/string/fixed_string.f90,sha256=5n6IkuASFKgYICXY9foCVoqndfAY0AQZFEK8L8ARBGM,695 +numpy/f2py/tests/src/string/gh24008.f,sha256=UA8Pr-_yplfOFmc6m4v9ryFQ8W9OulaglulefkFWD68,217 +numpy/f2py/tests/src/string/gh24662.f90,sha256=-Tp9Kd1avvM7AIr8ZukFA9RVr-wusziAnE8AvG9QQI4,197 +numpy/f2py/tests/src/string/gh25286.f90,sha256=2EpxvC-0_dA58MBfGQcLyHzpZgKcMf_W9c73C_Mqnok,304 +numpy/f2py/tests/src/string/gh25286.pyf,sha256=GjgWKh1fHNdPGRiX5ek60i1XSeZsfFalydWqjISPVV8,381 
+numpy/f2py/tests/src/string/gh25286_bc.pyf,sha256=6Y9zU66NfcGhTXlFOdFjCSMSwKXpq5ZfAe3FwpkAsm4,384 +numpy/f2py/tests/src/string/scalar_string.f90,sha256=ACxV2i6iPDk-a6L_Bs4jryVKYJMEGUTitEIYTjbJes4,176 +numpy/f2py/tests/src/string/string.f,sha256=shr3fLVZaa6SyUJFYIF1OZuhff8v5lCwsVNBU2B-3pk,248 +numpy/f2py/tests/src/value_attrspec/gh21665.f90,sha256=JC0FfVXsnB2lZHb-nGbySnxv_9VHAyD0mKaLDowczFU,190 +numpy/f2py/tests/test_abstract_interface.py,sha256=C8-ly0_TqkmpQNZmwPHwo2IV2MBH0jQEjAhpqHrg8Y4,832 +numpy/f2py/tests/test_array_from_pyobj.py,sha256=Txff89VUeEhWqUCRVybIqsqH4YQvpk4Uyjmh_XjyMi0,24049 +numpy/f2py/tests/test_assumed_shape.py,sha256=FeaqtrWyBf5uyArcmI0D2e_f763aSMpgU3QmdDXe-tA,1466 +numpy/f2py/tests/test_block_docstring.py,sha256=SEpuq73T9oVtHhRVilFf1xF7nb683d4-Kv7V0kfL4AA,564 +numpy/f2py/tests/test_callback.py,sha256=cReSlVjgnoT74wmtNn-oEIZiJUTfRX7ljjlqJi716IQ,6494 +numpy/f2py/tests/test_character.py,sha256=3ugjM1liymMRbY8wub1eiap-jdyNYVHxlNZBqNoRLe4,21868 +numpy/f2py/tests/test_common.py,sha256=m7TTSJt5zUZKJF-MQUeTtCyxW7YwRBSETINXGPFu8S4,896 +numpy/f2py/tests/test_compile_function.py,sha256=9d_FZ8P2wbIlQ2qPDRrsFqPb4nMH8tiWqYZN-P_shCs,4186 +numpy/f2py/tests/test_crackfortran.py,sha256=y1x3U-jlQWD5rmTXz1I2RlTz7LEfbI6qxCDkR5fzPwY,13441 +numpy/f2py/tests/test_data.py,sha256=HFcmPYbiveKa-swJ8x8XlRR9sM0ESB9FEN-txZnHTok,2876 +numpy/f2py/tests/test_docs.py,sha256=jqtuHE5ZjxP4D8Of3Fkzz36F8_0qKbeS040_m0ac4v4,1662 +numpy/f2py/tests/test_f2cmap.py,sha256=p-Sylbr3ctdKT3UQV9FzpCuYPH5U7Vyn8weXFAjiI9o,391 +numpy/f2py/tests/test_f2py2e.py,sha256=eoswH-daMEBlueoVpxXrDloahCpr0RLzHbr3zBHOsjk,25423 +numpy/f2py/tests/test_isoc.py,sha256=_nPTPxNEEagiKriZBeFNesOattIlHDzaNKmj35xxDBY,1406 +numpy/f2py/tests/test_kind.py,sha256=aOMQSBoD_dw49acKN25_abEvQBLI27DsnWIb9CNpSAE,1671 +numpy/f2py/tests/test_mixed.py,sha256=Ctuw-H7DxhPjSt7wZdJ2xffawIoEBCPWc5F7PSkY4HY,848 +numpy/f2py/tests/test_module_doc.py,sha256=sjCXWIKrqMD1NQ1DUAzgQqkjS5w9h9gvM_Lj29Rdcrg,863 
+numpy/f2py/tests/test_parameter.py,sha256=ADI7EV_CM4ztICpqHqeq8LI-WdB6cX0ttatdRdjbsUA,3941 +numpy/f2py/tests/test_pyf_src.py,sha256=eD0bZu_GWfoCq--wWqEKRf-F2h5AwoTyO6GMA9wJPr4,1135 +numpy/f2py/tests/test_quoted_character.py,sha256=cpjMdrHwimnkoJkXd_W_FSlh43oWytY5VHySW9oskO4,454 +numpy/f2py/tests/test_regression.py,sha256=v_6RDQr6IcMmbCMElfzRSLPgZhHnH5l99uztrbJAzqE,2532 +numpy/f2py/tests/test_return_character.py,sha256=18HJtiRwQ7a_2mdPUonD5forKWZJEapD-Vi1DsbTjVs,1493 +numpy/f2py/tests/test_return_complex.py,sha256=BZIIqQ1abdiPLgVmu03_q37yCtND0ijxGSMhGz2Wf-o,2397 +numpy/f2py/tests/test_return_integer.py,sha256=t--9UsdLF9flLTQv7a0KTSVoBuoDtTnmOG2QIFPINVc,1758 +numpy/f2py/tests/test_return_logical.py,sha256=XCmp8E8I6BOeNYF59HjSFAdv1hM9WaDvl8UDS10_05o,2017 +numpy/f2py/tests/test_return_real.py,sha256=ATek5AM7dCCPeIvoMOQIt5yFNFzKrFb1Kno8B4M0rn4,3235 +numpy/f2py/tests/test_semicolon_split.py,sha256=_Mdsi84lES18pPjl9J-QsbGttV4tPFFjZvJvejNcqPc,1635 +numpy/f2py/tests/test_size.py,sha256=q6YqQvcyqdXJeWbGijTiCbxyEG3EkPcvT8AlAW6RCMo,1164 +numpy/f2py/tests/test_string.py,sha256=5xZOfdReoHnId0950XfmtfduPPfBbtMkzBoXMtygvMk,2962 +numpy/f2py/tests/test_symbolic.py,sha256=28quk2kTKfWhKe56n4vINJ8G9weKBfc7HysMlE9J3_g,18341 +numpy/f2py/tests/test_value_attrspec.py,sha256=rWwJBfE2qGzqilZZurJ-7ucNoJDICye6lLetQSLFees,323 +numpy/f2py/tests/util.py,sha256=bEhG699c4bLVPR2WR8fV67avgX6kH5I74SicGb7Z7T4,11167 +numpy/f2py/use_rules.py,sha256=3pTDOPur6gbPHPtwuMJPQvpnUMw39Law1KFSH0coB_0,3527 +numpy/fft/__init__.py,sha256=HqjmF6s_dh0Ri4UZzUDtOKbNUyfAfJAWew3e3EL_KUk,8175 +numpy/fft/__init__.pyi,sha256=vD9Xzz5r13caF4AVL87Y4U9KOj9ic25Vci_wb3dmgpk,550 +numpy/fft/_pocketfft.py,sha256=Xkm8wcP4JyBNMbp0ZoHIWhNDlgliX24RzrDuo29uRks,52897 +numpy/fft/_pocketfft.pyi,sha256=S6-ylUuHbgm8vNbh7tLru6K2R5SJzE81BC_Sllm6QrQ,2371 +numpy/fft/_pocketfft_internal.cpython-312-x86_64-linux-gnu.so,sha256=ONIiSfNRsdUOkmnFloif_GOGOevBUMHnX8n1Wg8zGrU,97008 
+numpy/fft/helper.py,sha256=aNj1AcLvtfoX26RiLOwcR-k2QSMuBZkGj2Fu0CeFPJs,6154 +numpy/fft/helper.pyi,sha256=NLTEjy2Gz1aAMDZwCgssIyUne0ubjJqukfYkpsL3gXM,1176 +numpy/fft/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 +numpy/fft/tests/test_helper.py,sha256=whgeaQ8PzFf3B1wkbXobGZ5sF4WxPp4gf1UPUVZest8,6148 +numpy/fft/tests/test_pocketfft.py,sha256=RdeCCvUQmJYVvccOJwToobTKDg9yzUL06o9MkPmRfmI,12895 +numpy/lib/__init__.py,sha256=XMPNJkG_mQ__xuvbf0OcpotgMbA9owt10ZHYVnYHq8E,2713 +numpy/lib/__init__.pyi,sha256=y5ANokFm7EkrlNoHdeQm1FsUhLFxkYtLuanCbsWrGio,5596 +numpy/lib/_datasource.py,sha256=CDF3im6IxdY3Mu6fwRQmkSEBmXS3kQVInQ4plXsoX9c,22631 +numpy/lib/_iotools.py,sha256=Yg9HCfPg4tbhbdgLPcxSMiZXq1xDprvJKLebLwhDszY,30868 +numpy/lib/_version.py,sha256=6vK7czNSB_KrWx2rZJzJ1pyOc73Q07hAgfLB5ItUCnU,4855 +numpy/lib/_version.pyi,sha256=B572hyWrUWG-TAAAXrNNAT4AgyUAmJ4lvgpwMkDzunk,633 +numpy/lib/arraypad.py,sha256=bKP7ZS9NYFYzqSk8OnpFLFrMsua4m_hcqFsi7cGkrJE,31803 +numpy/lib/arraypad.pyi,sha256=ADXphtAORYl3EqvE5qs_u32B_TALKSOtF43jOLmoxRw,1728 +numpy/lib/arraysetops.py,sha256=GJ2RhkzIJmIbwyG6h3LOFTPXg62kM9tcV1a-7tdbVuU,33655 +numpy/lib/arraysetops.pyi,sha256=6X-5l5Yss_9y10LYyIsDLbGX77vt7PtVLDqxOlSRPfY,8372 +numpy/lib/arrayterator.py,sha256=BQ97S00zvfURUZfes0GZo-5hydYNRuvwX1I1bLzeRik,7063 +numpy/lib/arrayterator.pyi,sha256=f7Pwp83_6DiMYmJGUsffncM-FRAynB1iYGvhmHM_SZE,1537 +numpy/lib/format.py,sha256=T8qJMyG2DDVjjYNNpUvBgfA9tCo23IS0w9byRB6twwQ,34769 +numpy/lib/format.pyi,sha256=YWBxC3GdsZ7SKBN8I7nMwWeVuFD1aT9d-VJ8zE4-P-o,748 +numpy/lib/function_base.py,sha256=IhhgfSmYJE-dHoUOMXHPiGYXso-NdXPpLXF9y0gEA6I,189172 +numpy/lib/function_base.pyi,sha256=KWaC5UOBANU4hiIoN2eptE4HYsm4vgp_8BMFV1Y3JX4,16585 +numpy/lib/histograms.py,sha256=xsj_qpaZoI2Bv1FBpY8mIMPJrYRiuIBszn_6kO7YFRA,37778 +numpy/lib/histograms.pyi,sha256=hNwR2xYWkgJCP-nfRGxc-EgHLTD3qm4zmWXthZLt08M,995 +numpy/lib/index_tricks.py,sha256=4PEvXk6VFTkttMViYBVC4yDhyOiKIon6JpIm0d_CmNg,31346 
+numpy/lib/index_tricks.pyi,sha256=D2nkNXOB9Vea1PfMaTn94OGBGayjTaQ-bKMsjDmYpak,4251 +numpy/lib/mixins.py,sha256=y6_MzQuiNjv-1EFVROqv2y2cAJi5X4rQYzbZCyUyXgw,7071 +numpy/lib/mixins.pyi,sha256=h9N1kbZsUntF0zjOxPYeD_rCB2dMiG35TYYPl9ymkI4,3117 +numpy/lib/nanfunctions.py,sha256=6EjzydZlugIzfiENKtC4ycZ2Nckt8ZQg5v6D6tX1SiU,65775 +numpy/lib/nanfunctions.pyi,sha256=oPqAfCinmBL85Ji7ko4QlzAzLAK9nZL0t2_CllEbCEU,606 +numpy/lib/npyio.py,sha256=NUjtFvAmPdTjwJQ-ia-xbCr849M_M6NilP5IHfkKaRg,97316 +numpy/lib/npyio.pyi,sha256=SUFWJh90vWZCdd6GCSGbfYeXKlWut0XY_SHvZJc8yqY,9728 +numpy/lib/polynomial.py,sha256=6Aw3_2vdbh4urERQ6NaPhf9a_T1o1o6cjm3fb5Z3_YE,44133 +numpy/lib/polynomial.pyi,sha256=GerIpQnf5LdtFMOy9AxhOTqUyfn57k4MxqEYrfdckWE,6958 +numpy/lib/recfunctions.py,sha256=-90AbWWvVFOqVUPLh9K9NYdKUHYIgSEyg2Y35MnOVUA,59423 +numpy/lib/scimath.py,sha256=T4ITysZgqhY1J8IxyXCtioHjMTg2ci-4i3mr9TBF2UA,15037 +numpy/lib/scimath.pyi,sha256=E2roKJzMFwWSyhLu8UPUr54WOpxF8jp_pyXYBgsUSQ8,2883 +numpy/lib/setup.py,sha256=0K5NJKuvKvNEWp-EX7j0ODi3ZQQgIMHobzSFJq3G7yM,405 +numpy/lib/shape_base.py,sha256=AhCO9DEyysE-P-QJF9ryUtJ1ghU4_0mORhAJ59poObU,38947 +numpy/lib/shape_base.pyi,sha256=bGJhLA_RvUpVTiDFgCV-1rUjV8e1qCh0gK_3PLgXA_U,5341 +numpy/lib/stride_tricks.py,sha256=brY5b-0YQJuIH2CavfpIinMolyTUv5k9DUvLoZ-imis,17911 +numpy/lib/stride_tricks.pyi,sha256=0pQ4DP9l6g21q2Ajv6dJFRWMr9auPGTNV9BmZUbogPY,1747 +numpy/lib/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 +numpy/lib/tests/data/py2-objarr.npy,sha256=F4cyUC-_TB9QSFLAo2c7c44rC6NUYIgrfGx9PqWPSKk,258 +numpy/lib/tests/data/py2-objarr.npz,sha256=xo13HBT0FbFZ2qvZz0LWGDb3SuQASSaXh7rKfVcJjx4,366 +numpy/lib/tests/data/py3-objarr.npy,sha256=pTTVh8ezp-lwAK3fkgvdKU8Arp5NMKznVD-M6Ex_uA0,341 +numpy/lib/tests/data/py3-objarr.npz,sha256=qQR0gS57e9ta16d_vCQjaaKM74gPdlwCPkp55P-qrdw,449 +numpy/lib/tests/data/python3.npy,sha256=X0ad3hAaLGXig9LtSHAo-BgOvLlFfPYMnZuVIxRmj-0,96 
+numpy/lib/tests/data/win64python2.npy,sha256=agOcgHVYFJrV-nrRJDbGnUnF4ZTPYXuSeF-Mtg7GMpc,96 +numpy/lib/tests/test__datasource.py,sha256=65KXfUUvp8wXSqgQisuYlkhg-qHjBV5FXYetL8Ba-rc,10571 +numpy/lib/tests/test__iotools.py,sha256=HerCqvDE07JxjFQlWEfpZO7lC9z0Sbr3z20GSutoCPs,13743 +numpy/lib/tests/test__version.py,sha256=aO3YgkAohLsLzCNQ7vjIwdpFUMz0cPLbcuuxIkjuN74,1999 +numpy/lib/tests/test_arraypad.py,sha256=obohHbyM0gPYPUkd7iJSOSiDqyqtJsjDNtQX68NC4lM,54830 +numpy/lib/tests/test_arraysetops.py,sha256=5-T1MVhfIMivat8Z47GZw0ZaR811W_FskM1bAXnFyLU,35912 +numpy/lib/tests/test_arrayterator.py,sha256=AYs2SwV5ankgwnvKI9RSO1jZck118nu3SyZ4ngzZNso,1291 +numpy/lib/tests/test_financial_expired.py,sha256=yq5mqGMvqpkiiw9CuZhJgrYa7Squj1mXr_G-IvAFgwI,247 +numpy/lib/tests/test_format.py,sha256=xV0oi1eoRnVwAAhSOcPFQHQWF7TfsROtDYShQLPtdaA,41028 +numpy/lib/tests/test_function_base.py,sha256=DBKugIUEFTMP7g6iL1bk986E6ldCrcNdBCWOJbQla_Y,157830 +numpy/lib/tests/test_histograms.py,sha256=16_XJp-eFgsuM8B4mDQpQ4w_Ib29Hg0EPO-WFsdaFWA,32815 +numpy/lib/tests/test_index_tricks.py,sha256=Vjz25Y6H_ih0iEE2AG0kaxO9U8PwcXSrofzqnN4XBwI,20256 +numpy/lib/tests/test_io.py,sha256=3Tow1pucrQ7z7osNN4a2grBYUoBGNkQEhjmCjXT6Vag,107891 +numpy/lib/tests/test_loadtxt.py,sha256=gwcDJDJmLJRMLpg322yjQ1IzI505w9EqJoq4DmDPCdI,38560 +numpy/lib/tests/test_mixins.py,sha256=Wivwz3XBWsEozGzrzsyyvL3qAuE14t1BHk2LPm9Z9Zc,7030 +numpy/lib/tests/test_nanfunctions.py,sha256=01r_mmTCvKVdZuOGTEHNDZXrMS724us_jwZANzCd74A,47609 +numpy/lib/tests/test_packbits.py,sha256=OWGAd5g5GG0gl7WHqNfwkZ7G-2rrtLt2sI854PG4nnw,17546 +numpy/lib/tests/test_polynomial.py,sha256=URouxJpr8FQ5hiKybqhtOcLA7e-3hj4kWzjLBROByyA,11395 +numpy/lib/tests/test_recfunctions.py,sha256=6jzouPEQ7Uhtj8_-W5yTI6ymNp2nLgmdHzxdd74jVuM,44001 +numpy/lib/tests/test_regression.py,sha256=KzGFkhTcvEG97mymoOQ2hP2CEr2nPZou0Ztf4-WaXCs,8257 +numpy/lib/tests/test_shape_base.py,sha256=2iQCEFR6evVpF8woaenxUOzooHkfuMYkBaUj8ecyJ-E,26817 
+numpy/lib/tests/test_stride_tricks.py,sha256=wprpWWH5eq07DY7rzG0WDv5fMtLxzRQz6fm6TZWlScQ,22849 +numpy/lib/tests/test_twodim_base.py,sha256=ll-72RhqCItIPB97nOWhH7H292h4nVIX_w1toKTPMUg,18841 +numpy/lib/tests/test_type_check.py,sha256=lxCH5aApWVYhhSoDQSLDTCHLVHuK2c-jBbnfnZUrOaA,15114 +numpy/lib/tests/test_ufunclike.py,sha256=4hSnXGlSC8HE-_pRRMzD8-HI4hGHqsAWu1pD0o2kPI0,2982 +numpy/lib/tests/test_utils.py,sha256=RVAxrzSFu6N3C4_jIgAlTDOWF_B7wr2v1Y20dX5upYM,6218 +numpy/lib/twodim_base.py,sha256=Mvzn_PyShIb9m7nJjJ4IetdxwmLYEsCPHvJoK7n2viU,32947 +numpy/lib/twodim_base.pyi,sha256=xFRcEVJdDj4mrXW_6iVP1lTMoJx4QJjYRD3o2_9f2eY,5370 +numpy/lib/type_check.py,sha256=_EOtB296nFYlNT7ztBYoC_yK9aycIb0KTmRjvzVdZNg,19954 +numpy/lib/type_check.pyi,sha256=LPvAvIxU-p5i_Qe-ic7hEvo4OTfSrNpplxMG7OAZe8Q,5571 +numpy/lib/ufunclike.py,sha256=_ceBGbGCMOd3u_h2UVzyaRK6ZY7ryoJ0GJB7zqcJG3w,6325 +numpy/lib/ufunclike.pyi,sha256=hLxcYfQprh1tTY_UO2QscA3Hd9Zd7cVGXIINZLhMFqY,1293 +numpy/lib/user_array.py,sha256=LE958--CMkBI2r3l1SQxmCHdCSw6HY6-RhWCnduzGA4,7721 +numpy/lib/utils.py,sha256=6NdleaELZiqARdj-ECZjxtwLf1bqklOcK43m9yoZefs,37804 +numpy/lib/utils.pyi,sha256=mVHVzWuc2-M3Oz60lFsbok0v8LH_HRHMjZpXwrtzF_c,2360 +numpy/linalg/__init__.py,sha256=mpdlEXWtTvpF7In776ONLwp6RIyo4U_GLPT1L1eIJnw,1813 +numpy/linalg/__init__.pyi,sha256=XBy4ocuypsRVflw_mbSTUhR4N5Roemu6w5SfeVwbkAc,620 +numpy/linalg/_umath_linalg.cpython-312-x86_64-linux-gnu.so,sha256=iCLnctdD1AWYPxucazS3BN0pd4CJDcJFRU8Qga31Ckw,216793 +numpy/linalg/lapack_lite.cpython-312-x86_64-linux-gnu.so,sha256=UAZPuN2wY1u7YCi4990o-QwErZqxw_rd0RF8K7fcj_0,29849 +numpy/linalg/linalg.py,sha256=kDVK1GBxbUjlRgxXCoEfkRJm8yrNr1Iu7hMn2rKK8RE,90923 +numpy/linalg/linalg.pyi,sha256=zD9U5BUCB1uQggSxfZaTGX_uB2Hkp75sttGmZbCGgBI,7505 +numpy/linalg/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 +numpy/linalg/tests/test_deprecations.py,sha256=9p_SRmtxj2zc1doY9Ie3dyy5JzWy-tCQWFoajcAJUmM,640 
+numpy/linalg/tests/test_linalg.py,sha256=rgvmK6Or70u8mN04puetL3FgSxZ8fJrOlI5ptTgCU5k,78085 +numpy/linalg/tests/test_regression.py,sha256=qbugUmrENybkEaM1GhfA01RXQUy8AkzalbrfzSIgUmM,5434 +numpy/ma/API_CHANGES.txt,sha256=F_4jW8X5cYBbzpcwteymkonTmvzgKKY2kGrHF1AtnrI,3405 +numpy/ma/LICENSE,sha256=BfO4g1GYjs-tEKvpLAxQ5YdcZFLVAJoAhMwpFVH_zKY,1593 +numpy/ma/README.rst,sha256=q-gCsZ4Cw_gUGGvEjog556sJUHIm8WTAwkFK5Qnz9XA,9872 +numpy/ma/__init__.py,sha256=dgP0WdnOpph28Fd6UiqoyDKhfrct0H6QWqbCcETsk6M,1404 +numpy/ma/__init__.pyi,sha256=ppCg_TS0POutNB3moJE4kBabWURnc0WGXyYPquXZxS4,6063 +numpy/ma/core.py,sha256=4MglVRJtmQ9_iIVaQ2b-_Vmw1TjAhEsMJdtKOhyBFXQ,278213 +numpy/ma/core.pyi,sha256=YfgyuBuKxZ5v4I2JxZDvCLhnztOCRgzTeDg-JGTon_M,14305 +numpy/ma/extras.py,sha256=MC7QPS34PC4wxNbOp7pTy57dqF9B-L6L1KMI6rrfe2w,64383 +numpy/ma/extras.pyi,sha256=BBsiCZbaPpGCY506fkmqZdBkJNCXcglc3wcSBuAACNk,2646 +numpy/ma/mrecords.py,sha256=degd6dLaDEvEWNHmvSnUZXos1csIzaqjR_jAutm8JfI,27232 +numpy/ma/mrecords.pyi,sha256=r1a2I662ywnhGS6zvfcyK-9RHVvb4sHxiCx9Dhf5AE4,1934 +numpy/ma/setup.py,sha256=MqmMicr_xHkAGoG-T7NJ4YdUZIJLO4ZFp6AmEJDlyhw,418 +numpy/ma/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 +numpy/ma/tests/test_core.py,sha256=xd5S3oa0jObo8jnsJk0-o46d-KNC3RtgNRKinJeY_kE,215100 +numpy/ma/tests/test_deprecations.py,sha256=nq_wFVt2EBHcT3AHxattfKXx2JDf1K5D-QBzUU0_15A,2566 +numpy/ma/tests/test_extras.py,sha256=lX4cbdGDEXaBHzA3q8hJxve4635XCJw4AP7FO7zhOfk,74858 +numpy/ma/tests/test_mrecords.py,sha256=PsJhUlABgdpSsPUeijonfyFNqz5AfNSGQTtJUte7yts,19890 +numpy/ma/tests/test_old_ma.py,sha256=h4BncexBcBigqvZMA6RjDjpHPurWtt99A7KTag2rmOs,32690 +numpy/ma/tests/test_regression.py,sha256=foMpI0luAvwkkRpAfPDV_810h1URISXDZhmaNhxb50k,3287 +numpy/ma/tests/test_subclassing.py,sha256=HeTIE_n1I8atwzF8tpvNtGHp-0dmM8PT8AS4IDWbcso,16967 +numpy/ma/testutils.py,sha256=RQw0RyS7hOSVTk4KrCGleq0VHlnDqzwwaLtuZbRE4_I,10235 +numpy/ma/timer_comparison.py,sha256=pIGSZG-qYYYlRWSTgzPlyCAINbGKhXrZrDZBBjiM080,15658 
+numpy/matlib.py,sha256=-54vTuGIgeTMg9ZUmElRPZ4Hr-XZ-om9xLzAsSoTvnc,10465 +numpy/matrixlib/__init__.py,sha256=BHBpQKoQv4EjT0UpWBA-Ck4L5OsMqTI2IuY24p-ucXk,242 +numpy/matrixlib/__init__.pyi,sha256=-t3ZuvbzRuRwWfZOeN4xlNWdm7gQEprhUsWzu8MRvUE,252 +numpy/matrixlib/defmatrix.py,sha256=JXdJGm1LayOOXfKpp7OVZfb0pzzP4Lwh45sTJrleALc,30656 +numpy/matrixlib/defmatrix.pyi,sha256=lmBMRahKcMOl2PHDo79J67VRAZOkI54BzfDaTLpE0LI,451 +numpy/matrixlib/setup.py,sha256=1r7JRkSM4HyVorgtjoKJGWLcOcPO3wmvivpeEsVtAEg,426 +numpy/matrixlib/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 +numpy/matrixlib/tests/test_defmatrix.py,sha256=8E_-y7VD2vsq1y8CcI8km37pp5qcAtkciO16xqf2UIs,14982 +numpy/matrixlib/tests/test_interaction.py,sha256=PpjmgjEKighDXvt38labKE6L7f2jP74UEmp3JRb_iOY,11875 +numpy/matrixlib/tests/test_masked_matrix.py,sha256=7YO_LCO8DOhW3CuXJuxH93rnmttfvHnU7El-MBzxzFw,8932 +numpy/matrixlib/tests/test_matrix_linalg.py,sha256=ObbSUXU4R2pWajH__xAdizADrU2kBKDDCxkDV-oVBXc,2059 +numpy/matrixlib/tests/test_multiarray.py,sha256=jB3XCBmAtcqf-Wb9PwBW6uIykPpMPthuXLJ0giTKzZE,554 +numpy/matrixlib/tests/test_numeric.py,sha256=MP70qUwgshTtThKZaZDp7_6U-Z66NIV1geVhasGXejQ,441 +numpy/matrixlib/tests/test_regression.py,sha256=8sHDtO8Zi8p3a1eQKEWxtCmKrXmHoD3qxlIokg2AIAU,927 +numpy/polynomial/__init__.py,sha256=braLh6zP2QwuNKRKAaZGdC_qKWZ-tJlc3BN83LeuE_0,6781 +numpy/polynomial/__init__.pyi,sha256=W8szYtVUy0RUi83jmFLK58BN8CKVSoHA2CW7IcdUl1c,701 +numpy/polynomial/_polybase.py,sha256=YEnnQwlTgbn3dyD89ueraUx5nxx3x_pH6K6mmyEmhi8,39271 +numpy/polynomial/_polybase.pyi,sha256=J7yU9PPZW4W8mkqAltDfnL4ZNwljuM-bDEj4DPTJZpY,2321 +numpy/polynomial/chebyshev.py,sha256=NZCKjIblcX99foqZyp51i0_r8p0r1VKVGZFmQ1__kEk,62796 +numpy/polynomial/chebyshev.pyi,sha256=035CNdOas4dnb6lFLzRiBrYT_VnWh2T1-A3ibm_HYkI,1387 +numpy/polynomial/hermite.py,sha256=t5CFM-qE4tszYJiQZ301VcMn7IM67y2rUZPFPtnVRAc,52514 +numpy/polynomial/hermite.pyi,sha256=hdsvTULow8bIjnATudf0i6brpLHV7vbOoHzaMvbjMy0,1217 
+numpy/polynomial/hermite_e.py,sha256=jRR3f8Oth8poV2Ix8c0eLEQR3UZary-2RupOrEAEUMY,52642 +numpy/polynomial/hermite_e.pyi,sha256=zV7msb9v9rV0iv_rnD3SjP-TGyc6pd3maCqiPCj3PbA,1238 +numpy/polynomial/laguerre.py,sha256=mcVw0ckWVX-kzJ1QIhdcuuxzPjuFmA3plQLkloQMOYM,50858 +numpy/polynomial/laguerre.pyi,sha256=Gxc9SLISNKMWrKdsVJ9fKFFFwfxxZzfF-Yc-2r__z5M,1178 +numpy/polynomial/legendre.py,sha256=wjtgFajmKEbYkSUk3vWSCveMHDP6UymK28bNUk4Ov0s,51550 +numpy/polynomial/legendre.pyi,sha256=9dmANwkxf7EbOHV3XQBPoaDtc56cCkf75Wo7FG9Zfj4,1178 +numpy/polynomial/polynomial.py,sha256=XsaZPHmLGJFqpJs7rPvO5E0loWQ1L3YHLIUybVu4dU8,49112 +numpy/polynomial/polynomial.pyi,sha256=bOPRnub4xXxsUwNGeiQLTT4PCfN1ysSrf6LBZIcAN2Y,1132 +numpy/polynomial/polyutils.py,sha256=Xy5qjdrjnRaqSlClG1ROmwWccLkAPC7IcHaNJLvhCf4,23237 +numpy/polynomial/polyutils.pyi,sha256=cFAyZ9Xzuw8Huhn9FEz4bhyD00m2Dp-2DiUSyogJwSo,264 +numpy/polynomial/setup.py,sha256=dXQfzVUMP9OcB6iKv5yo1GLEwFB3gJ48phIgo4N-eM0,373 +numpy/polynomial/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 +numpy/polynomial/tests/test_chebyshev.py,sha256=6tMsFP1h7K8Zf72mNOta6Tv52_fVTlXknseuffj080c,20522 +numpy/polynomial/tests/test_classes.py,sha256=DFyY2IQBj3r2GZkvbRIeZO2EEY466xbuwc4PShAl4Sw,18331 +numpy/polynomial/tests/test_hermite.py,sha256=N9b2dx2UWPyja5v02dSoWYPnKvb6H-Ozgtrx-xjWz2k,18577 +numpy/polynomial/tests/test_hermite_e.py,sha256=_A3ohAWS4HXrQG06S8L47dImdZGTwYosCXnoyw7L45o,18911 +numpy/polynomial/tests/test_laguerre.py,sha256=BZOgs49VBXOFBepHopxuEDkIROHEvFBfWe4X73UZhn8,17511 +numpy/polynomial/tests/test_legendre.py,sha256=b_bblHs0F_BWw9ESuSq52ZsLKcQKFR5eqPf_SppWFqo,18673 +numpy/polynomial/tests/test_polynomial.py,sha256=4cuO8-5wdIxcz5CrucB5Ix7ySuMROokUF12F7ogQ_hc,20529 +numpy/polynomial/tests/test_polyutils.py,sha256=IxkbVfpcBqe5lOZluHFUPbLATLu1rwVg7ghLASpfYrY,3579 +numpy/polynomial/tests/test_printing.py,sha256=rfP4MaQbjGcO52faHmYrgsaarkm3Ndi3onwr6DDuapE,20525 
+numpy/polynomial/tests/test_symbol.py,sha256=msTPv7B1niaKujU33kuZmdxJvLYvOjfl1oykmlL0dXo,5371 +numpy/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 +numpy/random/LICENSE.md,sha256=EDFmtiuARDr7nrNIjgUuoGvgz_VmuQjxmeVh_eSa8Z8,3511 +numpy/random/__init__.pxd,sha256=9JbnX540aJNSothGs-7e23ozhilG6U8tINOUEp08M_k,431 +numpy/random/__init__.py,sha256=81Thnexg5umN5WZwD5TRyzNc2Yp-d14B6UC7NBgVKh8,7506 +numpy/random/__init__.pyi,sha256=RfW8mco48UaWDL1UC5ROv9vXiFZ9EGho62avhgEAHPc,2143 +numpy/random/_bounded_integers.cpython-312-x86_64-linux-gnu.so,sha256=s59-K0zP1pBK5g_hUX9r2ovng1tb9p1U3sDWk8Xot5M,348704 +numpy/random/_bounded_integers.pxd,sha256=hcoucPH5hkFEM2nm12zYO-5O_Rt8RujEXT5YWuAzl1Q,1669 +numpy/random/_common.cpython-312-x86_64-linux-gnu.so,sha256=q9iMqPRH8ixPUfImc000cylmuuYe3SqiX3S_7JVL7ig,258888 +numpy/random/_common.pxd,sha256=s2_IdIQ0MhNbogamulvXe-b93wbx882onmYkxqswwpo,4939 +numpy/random/_examples/cffi/extending.py,sha256=xSla3zWqxi6Hj48EvnYfD3WHfE189VvC4XsKu4_T_Iw,880 +numpy/random/_examples/cffi/parse.py,sha256=Bnb7t_6S_c5-3dZrQ-XX9EazOKhftUfcCejXXWyd1EU,1771 +numpy/random/_examples/cython/extending.pyx,sha256=4IE692pq1V53UhPZqQiQGcIHXDoNyqTx62x5a36puVg,2290 +numpy/random/_examples/cython/extending_distributions.pyx,sha256=oazFVWeemfE0eDzax7r7MMHNL1_Yofws2m-c_KT2Hbo,3870 +numpy/random/_examples/cython/meson.build,sha256=rXtugURMEo-ef4bPE1QIv4mzvWbeGjmcTdKCBvjxjtw,1443 +numpy/random/_examples/numba/extending.py,sha256=Ipyzel_h5iU_DMJ_vnXUgQC38uMDMn7adUpWSeEQLFE,1957 +numpy/random/_examples/numba/extending_distributions.py,sha256=Jnr9aWkHyIWygNbdae32GVURK-5T9BTGhuExRpvve98,2034 +numpy/random/_generator.cpython-312-x86_64-linux-gnu.so,sha256=Wz7yrIt4qoO8hptw4w4qcPvTqzc8UlPtbrqZgqVf1-I,946872 +numpy/random/_generator.pyi,sha256=zRvo_y6g0pWkE4fO1M9jLYUkxDfGdA6Enreb3U2AADM,22442 +numpy/random/_mt19937.cpython-312-x86_64-linux-gnu.so,sha256=Nhn3-Rue5xl8KQLA4Zfmmy5d1F-xHNIuVy6bC4hlFKk,119488 
+numpy/random/_mt19937.pyi,sha256=_iZKaAmuKBQ4itSggfQvYYj_KjktcN4rt-YpE6bqFAM,724 +numpy/random/_pcg64.cpython-312-x86_64-linux-gnu.so,sha256=V3wUaPT7QLsjGEND4sG2RaF9HUk2QeqSwFLyhtxutVY,125040 +numpy/random/_pcg64.pyi,sha256=uxr5CbEJetN6lv9vBG21jlRhuzOK8SQnXrwqAQBxj_c,1091 +numpy/random/_philox.cpython-312-x86_64-linux-gnu.so,sha256=LJsf5T7xGePtKstzyALPKZZQKw_VHUkm1AR1ds6ldRQ,106712 +numpy/random/_philox.pyi,sha256=OKlaiIU-hj72Bp04zjNifwusOD_3-mYxIfvyuys8c_o,978 +numpy/random/_pickle.py,sha256=4NhdT-yk7C0m3tyZWmouYAs3ZGNPdPVNGfUIyuh8HDY,2318 +numpy/random/_sfc64.cpython-312-x86_64-linux-gnu.so,sha256=WIMwLOM6_VTbZjGtv14AApe460LA7IlLvsMYteaxQmg,76224 +numpy/random/_sfc64.pyi,sha256=09afHTedVW-519493ZXtGcl-H-_zluj-B_yfEJG8MMs,709 +numpy/random/bit_generator.cpython-312-x86_64-linux-gnu.so,sha256=h8XHMIh5Q8YDsWxTzyEzmDNu5BDvRllVGAk5d6_VsMs,234016 +numpy/random/bit_generator.pxd,sha256=lArpIXSgTwVnJMYc4XX0NGxegXq3h_QsUDK6qeZKbNc,1007 +numpy/random/bit_generator.pyi,sha256=aXv7a_hwa0nkjY8P2YENslwWp89UcFRn09woXh7Uoc0,3510 +numpy/random/c_distributions.pxd,sha256=7DE-mV3H_Dihk4OK4gMHHkyD4tPX1cAi4570zi5CI30,6344 +numpy/random/lib/libnpyrandom.a,sha256=xUcvOvieju5PThPQ8q0-uGJ5fjsCd5umnjIerIc85Sg,71926 +numpy/random/mtrand.cpython-312-x86_64-linux-gnu.so,sha256=jYZrS2EHQBq5VGBEVkII4KWJnDkD2gknfRxIddmLzw8,749040 +numpy/random/mtrand.pyi,sha256=3vAGOXsvyFFv0yZl34pVVPP7Dgt22COyfn4tUoi_hEQ,19753 +numpy/random/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 +numpy/random/tests/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 +numpy/random/tests/data/mt19937-testset-1.csv,sha256=Xkef402AVB-eZgYQkVtoxERHkxffCA9Jyt_oMbtJGwY,15844 +numpy/random/tests/data/mt19937-testset-2.csv,sha256=nsBEQNnff-aFjHYK4thjvUK4xSXDSfv5aTbcE59pOkE,15825 +numpy/random/tests/data/pcg64-testset-1.csv,sha256=xB00DpknGUTTCxDr9L6aNo9Hs-sfzEMbUSS4t11TTfE,23839 +numpy/random/tests/data/pcg64-testset-2.csv,sha256=NTdzTKvG2U7_WyU_IoQUtMzU3kEvDH39CgnR6VzhTkw,23845 
+numpy/random/tests/data/pcg64dxsm-testset-1.csv,sha256=vNSUT-gXS_oEw_awR3O30ziVO4seNPUv1UIZ01SfVnI,23833 +numpy/random/tests/data/pcg64dxsm-testset-2.csv,sha256=uylS8PU2AIKZ185OC04RBr_OePweGRtvn-dE4YN0yYA,23839 +numpy/random/tests/data/philox-testset-1.csv,sha256=SedRaIy5zFadmk71nKrGxCFZ6BwKz8g1A9-OZp3IkkY,23852 +numpy/random/tests/data/philox-testset-2.csv,sha256=dWECt-sbfvaSiK8-Ygp5AqyjoN5i26VEOrXqg01rk3g,23838 +numpy/random/tests/data/sfc64-testset-1.csv,sha256=iHs6iX6KR8bxGwKk-3tedAdMPz6ZW8slDSUECkAqC8Q,23840 +numpy/random/tests/data/sfc64-testset-2.csv,sha256=FIDIDFCaPZfWUSxsJMAe58hPNmMrU27kCd9FhCEYt_k,23833 +numpy/random/tests/test_direct.py,sha256=6vLpCyeKnAWFEZei7l2YihVLQ0rSewO1hJBWt7A5fyQ,17779 +numpy/random/tests/test_extending.py,sha256=S3Wrzu3di4uBhr-Pxnx5dOPvlBY0FRdZqVX6CC1IN6s,4038 +numpy/random/tests/test_generator_mt19937.py,sha256=35LBwV6TtWPnxhefutxTQmhLzAQ5Ee4YiY8ziDXM-eQ,115477 +numpy/random/tests/test_generator_mt19937_regressions.py,sha256=xGkdz76BMX1EK0QPfabVxpNx9qQ9OC-1ZStWOs6N_M8,6387 +numpy/random/tests/test_random.py,sha256=kEkQs3i7zcpm9MozIRIz1FIx5B6fmXk0QqX0l6l-u_Y,70087 +numpy/random/tests/test_randomstate.py,sha256=DxF7rMUSxaAlL4h1qC3onHcHR7T_6rKWPbr0nJH84nE,85031 +numpy/random/tests/test_randomstate_regression.py,sha256=VucYWIjA7sAquWsalvZMnfkmYLM1O6ysyWnLl931-lA,7917 +numpy/random/tests/test_regression.py,sha256=trntK51UvajOVELiluEO85l64CKSw5nvBSc5SqYyr9w,5439 +numpy/random/tests/test_seed_sequence.py,sha256=GNRJ4jyzrtfolOND3gUWamnbvK6-b_p1bBK_RIG0sfU,3311 +numpy/random/tests/test_smoke.py,sha256=jjNz0aEGD1_oQl9a9UWt6Mz_298alG7KryLT1pgHljw,28183 +numpy/testing/__init__.py,sha256=InpVKoDAzMKO_l_HNcatziW_u1k9_JZze__t2nybrL0,595 +numpy/testing/__init__.pyi,sha256=AhK5NuOpdD-JjIzXOlssE8_iSLyFAAHzyGV_w1BT7vA,1674 +numpy/testing/_private/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 +numpy/testing/_private/extbuild.py,sha256=nG2dwP4nUmQS3e5eIRinxt0s_f4sxxA1YfohCg-navo,8017 
+numpy/testing/_private/utils.py,sha256=3FrSTMi0OdpDODBDoncgiDQzdo5NKA6YVfQ3uKRSQnc,85242 +numpy/testing/_private/utils.pyi,sha256=MMNrvwEeSTYzZFWawSSzHnTFYG-cSAIiID-1FuJ1f8U,10123 +numpy/testing/overrides.py,sha256=u6fcKSBC8HIzMPWKAbdyowU71h2Fx2ekDQxpG5NhIr8,2123 +numpy/testing/print_coercion_tables.py,sha256=ndxOsS4XfrZ4UY_9nqRTCnxhkzgdqcuUHL8nezd7Op4,6180 +numpy/testing/setup.py,sha256=GPKAtTTBRsNW4kmR7NjP6mmBR_GTdpaTvkTm10_VcLg,709 +numpy/testing/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 +numpy/testing/tests/test_utils.py,sha256=IDOr-GXuNGlrsb-XzGSYUHXEqcGYJ78p60jOpBqyPM4,55740 +numpy/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 +numpy/tests/test__all__.py,sha256=L3mCnYPTpzAgNfedVuq9g7xPWbc0c1Pot94k9jZ9NpI,221 +numpy/tests/test_ctypeslib.py,sha256=B06QKeFRgDIEbkEPBy_zYA1H5E2exuhTi7IDkzV8gfo,12257 +numpy/tests/test_lazyloading.py,sha256=YETrYiDLAqLX04K_u5_3NVxAfxDoeguxwkIRfz6qKcY,1162 +numpy/tests/test_matlib.py,sha256=gwhIXrJJo9DiecaGLCHLJBjhx2nVGl6yHq80AOUQSRM,1852 +numpy/tests/test_numpy_config.py,sha256=qHvepgi9oyAbQuZD06k7hpcCC2MYhdzcY6D1iQDPNMI,1241 +numpy/tests/test_numpy_version.py,sha256=A8cXFzp4k-p6J5zkOxlDfDvkoFMxDW2hpTFVXcaQRVo,1479 +numpy/tests/test_public_api.py,sha256=DTq7SO84uBjC2tKPoqX17xazc-SLkTAbQ2fLZwGM2jc,18170 +numpy/tests/test_reloading.py,sha256=QuVaPQulcNLg4Fl31Lw-O89L42KclYCK68n5GVy0PNQ,2354 +numpy/tests/test_scripts.py,sha256=jluCLfG94VM1cuX-5RcLFBli_yaJZpIvmVuMxRKRJrc,1645 +numpy/tests/test_warnings.py,sha256=ZEtXqHI1iyeVeLfVxDcMfN5qw67Ti2u54709hvBG4eY,2284 +numpy/typing/__init__.py,sha256=VoTILNDrUWvZx0LK9_97lBLQFKtSGmDt4QLOH8zYvlo,5234 +numpy/typing/mypy_plugin.py,sha256=24zVk4Ei3qH4Hc3SSz3v0XtIsycTo8HKoY6ilhB_7AQ,6376 +numpy/typing/setup.py,sha256=Cnz9q53w-vJNyE6vYxqYvQXx0pJbrG9quHyz9sqxfek,374 +numpy/typing/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 +numpy/typing/tests/data/fail/arithmetic.pyi,sha256=4rY_ASCERAl8WCus1RakOe0Aw-8vvjilL29mgdD4lv0,3850 
+numpy/typing/tests/data/fail/array_constructors.pyi,sha256=X9y_jUYS17WfYmXW5NwkVudyiR6ouUaAwEh0JRte42o,1089 +numpy/typing/tests/data/fail/array_like.pyi,sha256=OVAlEJZ5k8ZRKt0aGpZQwIjlUGpy0PzOOYqfI-IMqBQ,455 +numpy/typing/tests/data/fail/array_pad.pyi,sha256=57oK0Yp53rtKjjIrRFYLcxa-IfIGhtI-bEem7ggJKwI,132 +numpy/typing/tests/data/fail/arrayprint.pyi,sha256=-Fs9VnQfxyfak008Hq8kJWfB0snA6jGDXZz8ljQnwGE,549 +numpy/typing/tests/data/fail/arrayterator.pyi,sha256=FoU4ahHkJZ67dwWXer5FXLjjjesKKg-w2Jq1X1bHymA,480 +numpy/typing/tests/data/fail/bitwise_ops.pyi,sha256=GN9dVqk4_HFXn7zbRrHzJq_UGRFBccoYVUG1UuE7bXs,515 +numpy/typing/tests/data/fail/char.pyi,sha256=-vgN6EmfQ8VaA4SOZ5Ol9u4-Z7Q5I7G78LmaxZOuZ90,2615 +numpy/typing/tests/data/fail/chararray.pyi,sha256=jrNryZFpr8nxG2IHb9e0x3ranpvJpBy_RDex-WpT5rU,2296 +numpy/typing/tests/data/fail/comparisons.pyi,sha256=U4neWzwwtxG6QXsKlNGJuKXHBtwzYBQOa47_7SKF5Wg,888 +numpy/typing/tests/data/fail/constants.pyi,sha256=YSqNbXdhbdMmYbs7ntH0FCKbnm8IFeqsDlZBqcU43iw,286 +numpy/typing/tests/data/fail/datasource.pyi,sha256=PRT2hixR-mVxr2UILvHa99Dr54EF2h3snJXE-v3rWcc,395 +numpy/typing/tests/data/fail/dtype.pyi,sha256=OAGABqdXNB8gClJFEGMckoycuZcIasMaAlS2RkiKROI,334 +numpy/typing/tests/data/fail/einsumfunc.pyi,sha256=RS7GZqUCT_vEFJoyUx4gZlPO8GNFFNFWidxl-wLyRv0,539 +numpy/typing/tests/data/fail/false_positives.pyi,sha256=Q61qMsSsNCtmO0EMRxHj5Z7RYTyrELVpkzfJY5eK8Z0,366 +numpy/typing/tests/data/fail/flatiter.pyi,sha256=qLM4qm7gvJtEZ0rTHcyasUzoP5JbX4FREtqV3g1w6Lo,843 +numpy/typing/tests/data/fail/fromnumeric.pyi,sha256=FH2mjkgtCbA9soqlJRhYN7IIfRRrUL1i9mwqcbYKZSc,5591 +numpy/typing/tests/data/fail/histograms.pyi,sha256=yAPVt0rYTwtxnigoGT-u7hhKCE9iYxsXc24x2HGBrmA,367 +numpy/typing/tests/data/fail/index_tricks.pyi,sha256=moINir9iQoi6Q1ZuVg5BuSB9hSBtbg_uzv-Qm_lLYZk,509 +numpy/typing/tests/data/fail/lib_function_base.pyi,sha256=6y9T773CBLX-jUry1sCQGVuKVKM2wMuQ56Ni5V5j4Dw,2081 
+numpy/typing/tests/data/fail/lib_polynomial.pyi,sha256=Ur7Y4iZX6WmoH5SDm0ePi8C8LPsuPs2Yr7g7P5O613g,899 +numpy/typing/tests/data/fail/lib_utils.pyi,sha256=VFpE6_DisvlDByyp1PiNPJEe5IcZp8cH0FlAJyoZipo,276 +numpy/typing/tests/data/fail/lib_version.pyi,sha256=7-ZJDZwDcB-wzpMN8TeYtZAgaqc7xnQ8Dnx2ISiX2Ts,158 +numpy/typing/tests/data/fail/linalg.pyi,sha256=yDd05aK1dI37RPt3pD2eJYo4dZFaT2yB1PEu3K0y9Tg,1322 +numpy/typing/tests/data/fail/memmap.pyi,sha256=HSTCQYNuW1Y6X1Woj361pN4rusSPs4oDCXywqk20yUo,159 +numpy/typing/tests/data/fail/modules.pyi,sha256=_ek4zKcdP-sIh_f-IDY0tP-RbLORKCSWelM9AOYxsyA,670 +numpy/typing/tests/data/fail/multiarray.pyi,sha256=XCdBxufNhR8ZtG8UMzk8nt9_NC5gJTKP9-xTqKO_K9I,1693 +numpy/typing/tests/data/fail/ndarray.pyi,sha256=YnjXy16RHs_esKelMjB07865CQ7gLyQnXhnitq5Kv5c,405 +numpy/typing/tests/data/fail/ndarray_misc.pyi,sha256=w-10xTDDWoff9Lq0dBO-jBeiBR-XjCz2qmes0dLx238,1372 +numpy/typing/tests/data/fail/nditer.pyi,sha256=w7emjnOxnf3NcvLktNLlke6Cuivn2gU3sVmGCfbG6rw,325 +numpy/typing/tests/data/fail/nested_sequence.pyi,sha256=em4GZwLDFE0QSxxg081wVwhh-Dmtkn8f7wThI0DiXVs,427 +numpy/typing/tests/data/fail/npyio.pyi,sha256=56QuHo9SvVR3Uhzl6gQZncCpX575Gy5wugjMICh20m0,620 +numpy/typing/tests/data/fail/numerictypes.pyi,sha256=fevH9x80CafYkiyBJ7LMLVl6GyTvQrZ34trBu6O8TtM,276 +numpy/typing/tests/data/fail/random.pyi,sha256=p5WsUGyOL-MGIeALh9Y0dVhYSRQLaUwMdjXc3G6C_7Q,2830 +numpy/typing/tests/data/fail/rec.pyi,sha256=Ws3TyesnoQjt7Q0wwtpShRDJmZCs2jjP17buFMomVGA,704 +numpy/typing/tests/data/fail/scalars.pyi,sha256=o91BwSfzPTczYVtbXsirqQUoUoYP1C_msGjc2GYsV04,2952 +numpy/typing/tests/data/fail/shape_base.pyi,sha256=Y_f4buHtX2Q2ZA4kaDTyR8LErlPXTzCB_-jBoScGh_Q,152 +numpy/typing/tests/data/fail/stride_tricks.pyi,sha256=IjA0Xrnx0lG3m07d1Hjbhtyo1Te5cXgjgr5fLUo4LYQ,315 +numpy/typing/tests/data/fail/testing.pyi,sha256=e7b5GKTWCtKGoB8z2a8edsW0Xjl1rMheALsvzEJjlCw,1370 +numpy/typing/tests/data/fail/twodim_base.pyi,sha256=ZqbRJfy5S_pW3fFLuomy4L5SBNqj6Nklexg9KDTo65c,899 
+numpy/typing/tests/data/fail/type_check.pyi,sha256=CIyI0j0Buxv0QgCvNG2urjaKpoIZ-ZNawC2m6NzGlbo,379 +numpy/typing/tests/data/fail/ufunc_config.pyi,sha256=ukA0xwfJHLoGfoOIpWIN-91wj-DG8oaIjYbO72ymjg4,733 +numpy/typing/tests/data/fail/ufunclike.pyi,sha256=lbxjJyfARmt_QK1HxhxFxvwQTqCEZwJ9I53Wp8X3KIY,679 +numpy/typing/tests/data/fail/ufuncs.pyi,sha256=YaDTL7QLmGSUxE6JVMzpOlZTjHWrgbOo0UIlkX-6ZQk,1347 +numpy/typing/tests/data/fail/warnings_and_errors.pyi,sha256=PrbYDFI7IGN3Gf0OPBkVfefzQs4AXHwDQ495pvrX3RY,174 +numpy/typing/tests/data/misc/extended_precision.pyi,sha256=bS8bBeCFqjgtOiy-8_y39wfa7rwhdjLz2Vmo-RXAYD4,884 +numpy/typing/tests/data/mypy.ini,sha256=Ynv1VSx_kXTD2mFC3ZpgEFuCOg1F2VJXxPk0dxUnF2M,108 +numpy/typing/tests/data/pass/arithmetic.py,sha256=2z3dmuysQQmiPz8x0bg8SOOKW62mVJn97uMa9T0L7Vk,7455 +numpy/typing/tests/data/pass/array_constructors.py,sha256=3GrhfBcmWX53pJHD0NvhXjwr2-uNKREbR1I9WCcZ7rI,2419 +numpy/typing/tests/data/pass/array_like.py,sha256=ce_IVubBd7J6FkSpJmD7qMlRLuwmiidhOqhYfZb16Wo,916 +numpy/typing/tests/data/pass/arrayprint.py,sha256=y_KkuLz1uM7pv53qfq7GQOuud4LoXE3apK1wtARdVyM,766 +numpy/typing/tests/data/pass/arrayterator.py,sha256=FqcpKdUQBQ0FazHFxr9MsLEZG-jnJVGKWZX2owRr4DQ,393 +numpy/typing/tests/data/pass/bitwise_ops.py,sha256=UnmxVr9HwI8ifdrutGm_u3EZU4iOOPQhrOku7hTaH0c,970 +numpy/typing/tests/data/pass/comparisons.py,sha256=nTE-fvraLK6xTZcP4uPV02wOShzYKWDaoapx35AeDOY,2992 +numpy/typing/tests/data/pass/dtype.py,sha256=MqDKC6Ywv6jNkWsR8rdLuabzHUco5w1OylDHEdxve_I,1069 +numpy/typing/tests/data/pass/einsumfunc.py,sha256=eXj5L5MWPtQHgrHPsJ36qqrmBHqct9UoujjJCvHnF1k,1370 +numpy/typing/tests/data/pass/flatiter.py,sha256=0BnbuLMBC7MQlprNZ0QhNSscfYwPhEhXOhWoyiRACWU,174 +numpy/typing/tests/data/pass/fromnumeric.py,sha256=Xd_nJVVDoONdztUX8ddgo7EXJ2FD8AX51MO_Yujnmog,3742 +numpy/typing/tests/data/pass/index_tricks.py,sha256=oaFD9vY01_RI5OkrXt-xTk1n_dd-SpuPp-eZ58XR3c8,1492 
+numpy/typing/tests/data/pass/lib_utils.py,sha256=sDQCjHVGUwct0RQqAtH5_16y241siSY4bXKZRsuJ8xA,434 +numpy/typing/tests/data/pass/lib_version.py,sha256=HnuGOx7tQA_bcxFIJ3dRoMAR0fockxg4lGqQ4g7LGIw,299 +numpy/typing/tests/data/pass/literal.py,sha256=DLzdWHD6ttW4S0NEvGQbsH_UEJjhZyhvO4OXJjoyvZQ,1331 +numpy/typing/tests/data/pass/mod.py,sha256=HB9aK4_wGJbc44tomaoroNy0foIL5cI9KIjknvMTbkk,1578 +numpy/typing/tests/data/pass/modules.py,sha256=t0KJxYWbrWd7HbbgIDFb3LAhJBiNNb6QPjjFDAgC2mU,576 +numpy/typing/tests/data/pass/multiarray.py,sha256=MxHax6l94yqlTVZleAqG77ILEbW6wU5osPcHzxJ85ns,1331 +numpy/typing/tests/data/pass/ndarray_conversion.py,sha256=yPgzXG6paY1uF_z-QyHYrcmrZvhX7qtvTUh7ANLseCA,1626 +numpy/typing/tests/data/pass/ndarray_misc.py,sha256=z3mucbn9fLM1gxmbUhWlp2lcrOv4zFjqZFze0caE2EA,2715 +numpy/typing/tests/data/pass/ndarray_shape_manipulation.py,sha256=37eYwMNqMLwanIW9-63hrokacnSz2K_qtPUlkdpsTjo,640 +numpy/typing/tests/data/pass/numeric.py,sha256=SdnsD5zv0wm8T2hnIylyS14ig2McSz6rG9YslckbNQ4,1490 +numpy/typing/tests/data/pass/numerictypes.py,sha256=r0_s-a0-H2MdWIn4U4P6W9RQO0V1xrDusgodHNZeIYM,750 +numpy/typing/tests/data/pass/random.py,sha256=uJCnzlsOn9hr_G1TpHLdsweJI4EdhUSEQ4dxROPjqAs,61881 +numpy/typing/tests/data/pass/scalars.py,sha256=En0adCZAwEigZrzdQ0JQwDEmrS0b-DMd1vvjkFcvwo8,3479 +numpy/typing/tests/data/pass/simple.py,sha256=HmAfCOdZBWQF211YaZFrIGisMgu5FzTELApKny08n3Y,2676 +numpy/typing/tests/data/pass/simple_py3.py,sha256=HuLrc5aphThQkLjU2_19KgGFaXwKOfSzXe0p2xMm8ZI,96 +numpy/typing/tests/data/pass/ufunc_config.py,sha256=_M8v-QWAeT1-2MkfSeAbNl_ZwyPvYfPTsLl6c1X8d_w,1204 +numpy/typing/tests/data/pass/ufunclike.py,sha256=Gve6cJ2AT3TAwOjUOQQDIUnqsRCGYq70_tv_sgODiiA,1039 +numpy/typing/tests/data/pass/ufuncs.py,sha256=xGuKuqPetUTS4io5YDHaki5nbYRu-wC29SGU32tzVIg,462 +numpy/typing/tests/data/pass/warnings_and_errors.py,sha256=Pcg-QWfY4PAhTKyehae8q6LhtbUABxa2Ye63-3h1f4w,150 
+numpy/typing/tests/data/reveal/arithmetic.pyi,sha256=Ndmi_IFAl8z28RHsYTbOouf-B5FH91x_9ky-JwsdXVg,19765 +numpy/typing/tests/data/reveal/array_constructors.pyi,sha256=DcT8Z2rEpqYfjXySBejk8cGOUidUmizZGE5ZEy7r14E,10600 +numpy/typing/tests/data/reveal/arraypad.pyi,sha256=Q1pcU4B3eRsw5jsv-S0MsEfNUbp_4aMdO_o3n0rtA2A,776 +numpy/typing/tests/data/reveal/arrayprint.pyi,sha256=YyzzkL-wj4Rs-fdo3brpoaWtb5g3yk4Vn2HKu5KRo4w,876 +numpy/typing/tests/data/reveal/arraysetops.pyi,sha256=ApCFQcZzQ08zV32SJ86Xyv_7jazl3XKMmJmULtNquJ8,4155 +numpy/typing/tests/data/reveal/arrayterator.pyi,sha256=TF_1eneHoT0v9HqS9dKc5Xiv3iY3E330GR1RNcJ7s2Q,1111 +numpy/typing/tests/data/reveal/bitwise_ops.pyi,sha256=nRkyUGrBB_Es7TKyDxS_s3u2dFgBfzjocInI9Ea-J10,3919 +numpy/typing/tests/data/reveal/char.pyi,sha256=M_iTa9Pn8F7jQ1k6RN9KvbhEn00g7UYJZ5PV57ikcZM,7289 +numpy/typing/tests/data/reveal/chararray.pyi,sha256=O0EfwnKc3W1Fnx1c7Yotb1O84kVMuqJLlMBXd2duvjI,6093 +numpy/typing/tests/data/reveal/comparisons.pyi,sha256=huaf-seaF5ndTqfoaBfPtMMkOYovq7ibJl5-CRoQW7s,7468 +numpy/typing/tests/data/reveal/constants.pyi,sha256=P9vFEMkPpJ5KeUnzqPOuyHlh3zAFl9lzB4WxyB2od7A,1949 +numpy/typing/tests/data/reveal/ctypeslib.pyi,sha256=-Pk2rLEGCzz3B_y8Mu10JSVA8gPFztl5fV1dspPzqig,4727 +numpy/typing/tests/data/reveal/datasource.pyi,sha256=e8wjn60tO5EdnkBF34JrZT5XvdyW7kRWD2abtgr6qUg,671 +numpy/typing/tests/data/reveal/dtype.pyi,sha256=TKrYyxMu5IGobs0SDTIRcPuWsZ5X7zMYB4pmUlTTJxA,2872 +numpy/typing/tests/data/reveal/einsumfunc.pyi,sha256=pbtSfzIWUJRkDpe2riHBlvFlNSC3CqVM-SbYtBgX9H0,2044 +numpy/typing/tests/data/reveal/emath.pyi,sha256=-muNpWOv_niIn-zS3gUnFO4qBZAouNlVGue2x1L5Ris,2423 +numpy/typing/tests/data/reveal/false_positives.pyi,sha256=AplTmZV7TS7nivU8vegbstMN5MdMv4U0JJdZ4IeeA5M,482 +numpy/typing/tests/data/reveal/fft.pyi,sha256=ReQ9qn5frvJEy-g0RWpUGlPBntUS1cFSIu6WfPotHzE,1749 +numpy/typing/tests/data/reveal/flatiter.pyi,sha256=e1OQsVxQpgyfqMNw2puUTATl-w3swvdknlctAiWxf_E,882 
+numpy/typing/tests/data/reveal/fromnumeric.pyi,sha256=PNtGQR1VmGk_xNbd0eP7k7B2oNCMBz2XOJ17-_SdE5M,12101 +numpy/typing/tests/data/reveal/getlimits.pyi,sha256=nUGOMFpWj3pMgqLy6ZbR7A4G2q7iLIl5zEFBGf-Qcfw,1592 +numpy/typing/tests/data/reveal/histograms.pyi,sha256=MxKWoa7UoJRRLim53H6OoyYfz87P3_9YUXGYPTknGVQ,1303 +numpy/typing/tests/data/reveal/index_tricks.pyi,sha256=HpD7lU7hcyDoLdZbeqskPXnX7KYwPtll7uJKYUzrlE8,3177 +numpy/typing/tests/data/reveal/lib_function_base.pyi,sha256=eSiSZUlmPXqVPKknM7GcEv76BDgj0IJRu3FXcZXpmqc,8318 +numpy/typing/tests/data/reveal/lib_polynomial.pyi,sha256=TOzOdMPDqveDv3vDKSjtq6RRvN-j_s2J7aud2ySDAB0,5986 +numpy/typing/tests/data/reveal/lib_utils.pyi,sha256=_zj7WGYGYMFXAHLK-F11aeFfDvjRvFARUjoXhbXn8V0,1049 +numpy/typing/tests/data/reveal/lib_version.pyi,sha256=UCioUeykot8-nWL6goKxZnKZxtgB4lFEi9wdN_xyF1U,672 +numpy/typing/tests/data/reveal/linalg.pyi,sha256=LPaY-RyYL7Xt3djCgNaWEgI8beI9Eo_XnvOwi6Y7-eo,4877 +numpy/typing/tests/data/reveal/matrix.pyi,sha256=ciJXsn5v2O1IZ3VEn5Ilp8-40NTQokfrOOgVXMFsvLo,2922 +numpy/typing/tests/data/reveal/memmap.pyi,sha256=A5PovMzjRp2zslF1vw3TdTQjj4Y0dIEJ__HDBV_svGM,842 +numpy/typing/tests/data/reveal/mod.pyi,sha256=-CNWft2jQGSdrO8dYRgwbl7OhL3a78Zo60JVmiY-gQI,5666 +numpy/typing/tests/data/reveal/modules.pyi,sha256=0WPq7A-aqWkJsV-IA1_7dFNCcxBacj1AWExaXbXErG4,1958 +numpy/typing/tests/data/reveal/multiarray.pyi,sha256=6MvfNKihK-oN6QwG9HFNelgheo4lnL0FCrmIF_qxdoA,5326 +numpy/typing/tests/data/reveal/nbit_base_example.pyi,sha256=DRUMGatQvQXTuovKEMF4dzazIU6it6FU53LkOEo2vNo,657 +numpy/typing/tests/data/reveal/ndarray_conversion.pyi,sha256=BfjQD8U756l4gOfY0LD47HhDRxbq0yCFfEFKvbXs7Rs,1791 +numpy/typing/tests/data/reveal/ndarray_misc.pyi,sha256=0EN-a47Msn4pZgKVdD-GrXCCmt-oxjlov5rszchBmOI,7126 +numpy/typing/tests/data/reveal/ndarray_shape_manipulation.pyi,sha256=QDQ9g6l-e73pTJp-Dosiynb-okbqi91D4KirjhIjcv4,1233 +numpy/typing/tests/data/reveal/nditer.pyi,sha256=VFXnT75BgWSUpb-dD-q5cZkfeOqsk-x9cH626g9FWT4,2021 
+numpy/typing/tests/data/reveal/nested_sequence.pyi,sha256=IQyRlXduk-ZEakOtoliMLCqNgGbeg0mzZf-a-a3Gq_0,734 +numpy/typing/tests/data/reveal/npyio.pyi,sha256=YXagt2J-1suu5WXZ_si5NuJf7sHj_7NlaSLqQkam1Po,4209 +numpy/typing/tests/data/reveal/numeric.pyi,sha256=aJKnav-X45tjSFfgGD4iCetwEFcJXdNgU7valktjiCg,6160 +numpy/typing/tests/data/reveal/numerictypes.pyi,sha256=-YQRhwjBjsFJHjpGCRqzafNnKDdsmbBHbmPwccP0pLI,2487 +numpy/typing/tests/data/reveal/random.pyi,sha256=s6T074ZIpGAUqHnA-yAlozTLvt7PNBjCBqd-nGMqWGg,104091 +numpy/typing/tests/data/reveal/rec.pyi,sha256=DbRVk6lc7-3qPe-7Q26tUWpdaH9B4UVoQSYrRGJUo1Q,3858 +numpy/typing/tests/data/reveal/scalars.pyi,sha256=Qn3B3rsqSN397Jh25xs4odt2pfCQtWkoJe-e0-oX8d4,4790 +numpy/typing/tests/data/reveal/shape_base.pyi,sha256=YjiVukrK6OOydvopOaOmeAIIa0YQ2hn9_I_-FyYkHVU,2427 +numpy/typing/tests/data/reveal/stride_tricks.pyi,sha256=EBZR8gSP385nhotwJ3GH9DOUD2q5nUEYbXfhLo5xrPo,1542 +numpy/typing/tests/data/reveal/testing.pyi,sha256=_WOAj_t5SWYiqN0KG26Mza8RvaD3WAa7rFUlgksjLms,8611 +numpy/typing/tests/data/reveal/twodim_base.pyi,sha256=ZdNVo2HIJcx8iF9PA-z5W3Bs0hWM2nlVdbhLuAQlljM,3132 +numpy/typing/tests/data/reveal/type_check.pyi,sha256=yZSp50TtvPqv_PN7zmVcNOVUTUXMNYFGcguMNj25E9Y,3044 +numpy/typing/tests/data/reveal/ufunc_config.pyi,sha256=buwSvat3SVFAFl5k8TL6Mgpi32o6hHZYZ2Lpn6AHdEU,1327 +numpy/typing/tests/data/reveal/ufunclike.pyi,sha256=V_gLcZVrTXJ21VkUMwA0HyxUgA1r6OzjsdJegaKL2GE,1329 +numpy/typing/tests/data/reveal/ufuncs.pyi,sha256=VnwYr5KT_FLKfc0wV7dtNz7bNtaC9VIQt-oz56Hb5EE,2798 +numpy/typing/tests/data/reveal/warnings_and_errors.pyi,sha256=ImMlPt2PQBtX8Qf1EZFmLjNWm8fPE6IWQ_deaq_-85s,538 +numpy/typing/tests/test_isfile.py,sha256=BhKZs4-LrhFUfKjcG0yelySjE6ZITMxGIBYWGDHMRb8,864 +numpy/typing/tests/test_runtime.py,sha256=2qu8JEliITnZCBJ_QJpohacj_OQ08o73ixS2w2ooNXI,3275 +numpy/typing/tests/test_typing.py,sha256=Da1ZOFjtPh_Mvb5whpI-okBJdgLOAfJtJNyG6leGFoQ,8743 +numpy/version.py,sha256=OTLnSh0NGfWyL8VrnIj0Ndt_KZOTl1Z-kD9Cf-jRMmY,216 diff --git 
a/numpy-1.26.4.dist-info/WHEEL b/numpy-1.26.4.dist-info/WHEEL new file mode 100644 index 0000000000000000000000000000000000000000..d98ef534f680b37433e9ab0f8470bdbe56c303d8 --- /dev/null +++ b/numpy-1.26.4.dist-info/WHEEL @@ -0,0 +1,6 @@ +Wheel-Version: 1.0 +Generator: meson +Root-Is-Purelib: false +Tag: cp312-cp312-manylinux_2_17_x86_64 +Tag: cp312-cp312-manylinux2014_x86_64 + diff --git a/numpy-1.26.4.dist-info/entry_points.txt b/numpy-1.26.4.dist-info/entry_points.txt new file mode 100644 index 0000000000000000000000000000000000000000..450d8ef27b9d42527fde6aced068dd7e13f2c9a3 --- /dev/null +++ b/numpy-1.26.4.dist-info/entry_points.txt @@ -0,0 +1,9 @@ +[array_api] +numpy = numpy.array_api + +[pyinstaller40] +hook-dirs = numpy:_pyinstaller_hooks_dir + +[console_scripts] +f2py = numpy.f2py.f2py2e:main + diff --git a/rich-14.3.3.dist-info/INSTALLER b/rich-14.3.3.dist-info/INSTALLER new file mode 100644 index 0000000000000000000000000000000000000000..5c69047b2eb8235994febeeae1da4a82365a240a --- /dev/null +++ b/rich-14.3.3.dist-info/INSTALLER @@ -0,0 +1 @@ +uv \ No newline at end of file diff --git a/rich-14.3.3.dist-info/METADATA b/rich-14.3.3.dist-info/METADATA new file mode 100644 index 0000000000000000000000000000000000000000..9f6bc96c6dffe66668d531c1ef45a023820fdc67 --- /dev/null +++ b/rich-14.3.3.dist-info/METADATA @@ -0,0 +1,480 @@ +Metadata-Version: 2.4 +Name: rich +Version: 14.3.3 +Summary: Render rich text, tables, progress bars, syntax highlighting, markdown and more to the terminal +License: MIT +License-File: LICENSE +Author: Will McGugan +Author-email: willmcgugan@gmail.com +Requires-Python: >=3.8.0 +Classifier: Development Status :: 5 - Production/Stable +Classifier: Environment :: Console +Classifier: Framework :: IPython +Classifier: Intended Audience :: Developers +Classifier: License :: OSI Approved :: MIT License +Classifier: Operating System :: MacOS +Classifier: Operating System :: Microsoft :: Windows +Classifier: Operating System :: POSIX :: 
Linux +Classifier: Programming Language :: Python :: 3 +Classifier: Programming Language :: Python :: 3.8 +Classifier: Programming Language :: Python :: 3.9 +Classifier: Programming Language :: Python :: 3.10 +Classifier: Programming Language :: Python :: 3.11 +Classifier: Programming Language :: Python :: 3.12 +Classifier: Programming Language :: Python :: 3.13 +Classifier: Programming Language :: Python :: 3.14 +Classifier: Typing :: Typed +Provides-Extra: jupyter +Requires-Dist: ipywidgets (>=7.5.1,<9) ; extra == "jupyter" +Requires-Dist: markdown-it-py (>=2.2.0) +Requires-Dist: pygments (>=2.13.0,<3.0.0) +Project-URL: Documentation, https://rich.readthedocs.io/en/latest/ +Project-URL: Homepage, https://github.com/Textualize/rich +Description-Content-Type: text/markdown + +[![Supported Python Versions](https://img.shields.io/pypi/pyversions/rich)](https://pypi.org/project/rich/) [![PyPI version](https://badge.fury.io/py/rich.svg)](https://badge.fury.io/py/rich) + +[![Downloads](https://pepy.tech/badge/rich/month)](https://pepy.tech/project/rich) +[![codecov](https://img.shields.io/codecov/c/github/Textualize/rich?label=codecov&logo=codecov)](https://codecov.io/gh/Textualize/rich) +[![Rich blog](https://img.shields.io/badge/blog-rich%20news-yellowgreen)](https://www.willmcgugan.com/tag/rich/) +[![Twitter Follow](https://img.shields.io/twitter/follow/willmcgugan.svg?style=social)](https://twitter.com/willmcgugan) + +![Logo](https://github.com/textualize/rich/raw/master/imgs/logo.svg) + +[English readme](https://github.com/textualize/rich/blob/master/README.md) + • [简体中文 readme](https://github.com/textualize/rich/blob/master/README.cn.md) + • [正體中文 readme](https://github.com/textualize/rich/blob/master/README.zh-tw.md) + • [Lengua española readme](https://github.com/textualize/rich/blob/master/README.es.md) + • [Deutsche readme](https://github.com/textualize/rich/blob/master/README.de.md) + • [Läs på 
svenska](https://github.com/textualize/rich/blob/master/README.sv.md) + • [日本語 readme](https://github.com/textualize/rich/blob/master/README.ja.md) + • [한국어 readme](https://github.com/textualize/rich/blob/master/README.kr.md) + • [Français readme](https://github.com/textualize/rich/blob/master/README.fr.md) + • [Schwizerdütsch readme](https://github.com/textualize/rich/blob/master/README.de-ch.md) + • [हिन्दी readme](https://github.com/textualize/rich/blob/master/README.hi.md) + • [Português brasileiro readme](https://github.com/textualize/rich/blob/master/README.pt-br.md) + • [Italian readme](https://github.com/textualize/rich/blob/master/README.it.md) + • [Русский readme](https://github.com/textualize/rich/blob/master/README.ru.md) + • [Indonesian readme](https://github.com/textualize/rich/blob/master/README.id.md) + • [فارسی readme](https://github.com/textualize/rich/blob/master/README.fa.md) + • [Türkçe readme](https://github.com/textualize/rich/blob/master/README.tr.md) + • [Polskie readme](https://github.com/textualize/rich/blob/master/README.pl.md) + + +Rich is a Python library for _rich_ text and beautiful formatting in the terminal. + +The [Rich API](https://rich.readthedocs.io/en/latest/) makes it easy to add color and style to terminal output. Rich can also render pretty tables, progress bars, markdown, syntax highlighted source code, tracebacks, and more — out of the box. + +![Features](https://github.com/textualize/rich/raw/master/imgs/features.png) + +For a video introduction to Rich see [calmcode.io](https://calmcode.io/rich/introduction.html) by [@fishnets88](https://twitter.com/fishnets88). + +See what [people are saying about Rich](https://www.willmcgugan.com/blog/pages/post/rich-tweets/). + +## Compatibility + +Rich works with Linux, macOS and Windows. True color / emoji works with new Windows Terminal, classic terminal is limited to 16 colors. Rich requires Python 3.8 or later. 
+ +Rich works with [Jupyter notebooks](https://jupyter.org/) with no additional configuration required. + +## Installing + +Install with `pip` or your favorite PyPI package manager. + +```sh +python -m pip install rich +``` + +Run the following to test Rich output on your terminal: + +```sh +python -m rich +``` + +## Rich Print + +To effortlessly add rich output to your application, you can import the [rich print](https://rich.readthedocs.io/en/latest/introduction.html#quick-start) method, which has the same signature as the builtin Python function. Try this: + +```python +from rich import print + +print("Hello, [bold magenta]World[/bold magenta]!", ":vampire:", locals()) +``` + +![Hello World](https://github.com/textualize/rich/raw/master/imgs/print.png) + +## Rich REPL + +Rich can be installed in the Python REPL, so that any data structures will be pretty printed and highlighted. + +```python +>>> from rich import pretty +>>> pretty.install() +``` + +![REPL](https://github.com/textualize/rich/raw/master/imgs/repl.png) + +## Using the Console + +For more control over rich terminal content, import and construct a [Console](https://rich.readthedocs.io/en/latest/reference/console.html#rich.console.Console) object. + +```python +from rich.console import Console + +console = Console() +``` + +The Console object has a `print` method which has an intentionally similar interface to the builtin `print` function. Here's an example of use: + +```python +console.print("Hello", "World!") +``` + +As you might expect, this will print `"Hello World!"` to the terminal. Note that unlike the builtin `print` function, Rich will word-wrap your text to fit within the terminal width. + +There are a few ways of adding color and style to your output. You can set a style for the entire output by adding a `style` keyword argument. 
Here's an example: + +```python +console.print("Hello", "World!", style="bold red") +``` + +The output will be something like the following: + +![Hello World](https://github.com/textualize/rich/raw/master/imgs/hello_world.png) + +That's fine for styling a line of text at a time. For more finely grained styling, Rich renders a special markup which is similar in syntax to [bbcode](https://en.wikipedia.org/wiki/BBCode). Here's an example: + +```python +console.print("Where there is a [bold cyan]Will[/bold cyan] there [u]is[/u] a [i]way[/i].") +``` + +![Console Markup](https://github.com/textualize/rich/raw/master/imgs/where_there_is_a_will.png) + +You can use a Console object to generate sophisticated output with minimal effort. See the [Console API](https://rich.readthedocs.io/en/latest/console.html) docs for details. + +## Rich Inspect + +Rich has an [inspect](https://rich.readthedocs.io/en/latest/reference/init.html?highlight=inspect#rich.inspect) function which can produce a report on any Python object, such as class, instance, or builtin. + +```python +>>> my_list = ["foo", "bar"] +>>> from rich import inspect +>>> inspect(my_list, methods=True) +``` + +![Log](https://github.com/textualize/rich/raw/master/imgs/inspect.png) + +See the [inspect docs](https://rich.readthedocs.io/en/latest/reference/init.html#rich.inspect) for details. + +# Rich Library + +Rich contains a number of builtin _renderables_ you can use to create elegant output in your CLI and help you debug your code. + +Click the following headings for details: + +
+Log + +The Console object has a `log()` method which has a similar interface to `print()`, but also renders a column for the current time and the file and line which made the call. By default Rich will do syntax highlighting for Python structures and for repr strings. If you log a collection (i.e. a dict or a list) Rich will pretty print it so that it fits in the available space. Here's an example of some of these features. + +```python +from rich.console import Console +console = Console() + +test_data = [ + {"jsonrpc": "2.0", "method": "sum", "params": [None, 1, 2, 4, False, True], "id": "1",}, + {"jsonrpc": "2.0", "method": "notify_hello", "params": [7]}, + {"jsonrpc": "2.0", "method": "subtract", "params": [42, 23], "id": "2"}, +] + +def test_log(): + enabled = False + context = { + "foo": "bar", + } + movies = ["Deadpool", "Rise of the Skywalker"] + console.log("Hello from", console, "!") + console.log(test_data, log_locals=True) + + +test_log() +``` + +The above produces the following output: + +![Log](https://github.com/textualize/rich/raw/master/imgs/log.png) + +Note the `log_locals` argument, which outputs a table containing the local variables where the log method was called. + +The log method could be used for logging to the terminal for long running applications such as servers, but is also a very nice debugging aid. + +
+
+Logging Handler + +You can also use the builtin [Handler class](https://rich.readthedocs.io/en/latest/logging.html) to format and colorize output from Python's logging module. Here's an example of the output: + +![Logging](https://github.com/textualize/rich/raw/master/imgs/logging.png) + +
+ +
+Emoji + +To insert an emoji into console output, place the name between two colons. Here's an example: + +```python +>>> console.print(":smiley: :vampire: :pile_of_poo: :thumbs_up: :raccoon:") +😃 🧛 💩 👍 🦝 +``` + +Please use this feature wisely. + +
+ +
+Tables + +Rich can render flexible [tables](https://rich.readthedocs.io/en/latest/tables.html) with unicode box characters. There is a large variety of formatting options for borders, styles, cell alignment etc. + +![table movie](https://github.com/textualize/rich/raw/master/imgs/table_movie.gif) + +The animation above was generated with [table_movie.py](https://github.com/textualize/rich/blob/master/examples/table_movie.py) in the examples directory. + +Here's a simpler table example: + +```python +from rich.console import Console +from rich.table import Table + +console = Console() + +table = Table(show_header=True, header_style="bold magenta") +table.add_column("Date", style="dim", width=12) +table.add_column("Title") +table.add_column("Production Budget", justify="right") +table.add_column("Box Office", justify="right") +table.add_row( + "Dec 20, 2019", "Star Wars: The Rise of Skywalker", "$275,000,000", "$375,126,118" +) +table.add_row( + "May 25, 2018", + "[red]Solo[/red]: A Star Wars Story", + "$275,000,000", + "$393,151,347", +) +table.add_row( + "Dec 15, 2017", + "Star Wars Ep. VIII: The Last Jedi", + "$262,000,000", + "[bold]$1,332,539,889[/bold]", +) + +console.print(table) +``` + +This produces the following output: + +![table](https://github.com/textualize/rich/raw/master/imgs/table.png) + +Note that console markup is rendered in the same way as `print()` and `log()`. In fact, anything that is renderable by Rich may be included in the headers / rows (even other tables). + +The `Table` class is smart enough to resize columns to fit the available width of the terminal, wrapping text as required. Here's the same example, with the terminal made smaller than the table above: + +![table2](https://github.com/textualize/rich/raw/master/imgs/table2.png) + +
+ +
+Progress Bars + +Rich can render multiple flicker-free [progress](https://rich.readthedocs.io/en/latest/progress.html) bars to track long-running tasks. + +For basic usage, wrap any sequence in the `track` function and iterate over the result. Here's an example: + +```python +from rich.progress import track + +for step in track(range(100)): + do_step(step) +``` + +It's not much harder to add multiple progress bars. Here's an example taken from the docs: + +![progress](https://github.com/textualize/rich/raw/master/imgs/progress.gif) + +The columns may be configured to show any details you want. Built-in columns include percentage complete, file size, file speed, and time remaining. Here's another example showing a download in progress: + +![progress](https://github.com/textualize/rich/raw/master/imgs/downloader.gif) + +To try this out yourself, see [examples/downloader.py](https://github.com/textualize/rich/blob/master/examples/downloader.py) which can download multiple URLs simultaneously while displaying progress. + +
+ +
+Status + +For situations where it is hard to calculate progress, you can use the [status](https://rich.readthedocs.io/en/latest/reference/console.html#rich.console.Console.status) method which will display a 'spinner' animation and message. The animation won't prevent you from using the console as normal. Here's an example: + +```python +from time import sleep +from rich.console import Console + +console = Console() +tasks = [f"task {n}" for n in range(1, 11)] + +with console.status("[bold green]Working on tasks...") as status: + while tasks: + task = tasks.pop(0) + sleep(1) + console.log(f"{task} complete") +``` + +This generates the following output in the terminal. + +![status](https://github.com/textualize/rich/raw/master/imgs/status.gif) + +The spinner animations were borrowed from [cli-spinners](https://www.npmjs.com/package/cli-spinners). You can select a spinner by specifying the `spinner` parameter. Run the following command to see the available values: + +``` +python -m rich.spinner +``` + +The above command generates the following output in the terminal: + +![spinners](https://github.com/textualize/rich/raw/master/imgs/spinners.gif) + +
+ +
+Tree + +Rich can render a [tree](https://rich.readthedocs.io/en/latest/tree.html) with guide lines. A tree is ideal for displaying a file structure, or any other hierarchical data. + +The labels of the tree can be simple text or anything else Rich can render. Run the following for a demonstration: + +``` +python -m rich.tree +``` + +This generates the following output: + +![tree](https://github.com/textualize/rich/raw/master/imgs/tree.png) + +See the [tree.py](https://github.com/textualize/rich/blob/master/examples/tree.py) example for a script that displays a tree view of any directory, similar to the Linux `tree` command. + +
+ +
+Columns + +Rich can render content in neat [columns](https://rich.readthedocs.io/en/latest/columns.html) with equal or optimal width. Here's a very basic clone of the (MacOS / Linux) `ls` command which displays a directory listing in columns: + +```python +import os +import sys + +from rich import print +from rich.columns import Columns + +directory = os.listdir(sys.argv[1]) +print(Columns(directory)) +``` + +The following screenshot is the output from the [columns example](https://github.com/textualize/rich/blob/master/examples/columns.py) which displays data pulled from an API in columns: + +![columns](https://github.com/textualize/rich/raw/master/imgs/columns.png) + +
+ +
+Markdown + +Rich can render [markdown](https://rich.readthedocs.io/en/latest/markdown.html) and does a reasonable job of translating the formatting to the terminal. + +To render markdown import the `Markdown` class and construct it with a string containing markdown code. Then print it to the console. Here's an example: + +```python +from rich.console import Console +from rich.markdown import Markdown + +console = Console() +with open("README.md") as readme: + markdown = Markdown(readme.read()) +console.print(markdown) +``` + +This will produce output something like the following: + +![markdown](https://github.com/textualize/rich/raw/master/imgs/markdown.png) + +
+ +
+Syntax Highlighting + +Rich uses the [pygments](https://pygments.org/) library to implement [syntax highlighting](https://rich.readthedocs.io/en/latest/syntax.html). Usage is similar to rendering markdown; construct a `Syntax` object and print it to the console. Here's an example: + +```python +from rich.console import Console +from rich.syntax import Syntax + +my_code = ''' +def iter_first_last(values: Iterable[T]) -> Iterable[Tuple[bool, bool, T]]: + """Iterate and generate a tuple with a flag for first and last value.""" + iter_values = iter(values) + try: + previous_value = next(iter_values) + except StopIteration: + return + first = True + for value in iter_values: + yield first, False, previous_value + first = False + previous_value = value + yield first, True, previous_value +''' +syntax = Syntax(my_code, "python", theme="monokai", line_numbers=True) +console = Console() +console.print(syntax) +``` + +This will produce the following output: + +![syntax](https://github.com/textualize/rich/raw/master/imgs/syntax.png) + +
+ +
+Tracebacks + +Rich can render [beautiful tracebacks](https://rich.readthedocs.io/en/latest/traceback.html) which are easier to read and show more code than standard Python tracebacks. You can set Rich as the default traceback handler so all uncaught exceptions will be rendered by Rich. + +Here's what it looks like on OSX (similar on Linux): + +![traceback](https://github.com/textualize/rich/raw/master/imgs/traceback.png) + +
+ +All Rich renderables make use of the [Console Protocol](https://rich.readthedocs.io/en/latest/protocol.html), which you can also use to implement your own Rich content. + +# Rich CLI + + +See also [Rich CLI](https://github.com/textualize/rich-cli) for a command line application powered by Rich. Syntax highlight code, render markdown, display CSVs in tables, and more, directly from the command prompt. + + +![Rich CLI](https://raw.githubusercontent.com/Textualize/rich-cli/main/imgs/rich-cli-splash.jpg) + +# Textual + +See also Rich's sister project, [Textual](https://github.com/Textualize/textual), which you can use to build sophisticated User Interfaces in the terminal. + +![textual-splash](https://github.com/user-attachments/assets/4caeb77e-48c0-4cf7-b14d-c53ded855ffd) + +# Toad + +[Toad](https://github.com/batrachianai/toad) is a unified interface for agentic coding. Built with Rich and Textual. + +![toad](https://github.com/user-attachments/assets/6678b707-1aeb-420f-99ad-abfcd4356771) + diff --git a/rich-14.3.3.dist-info/RECORD b/rich-14.3.3.dist-info/RECORD new file mode 100644 index 0000000000000000000000000000000000000000..8151b24f206c31ba145055ce893cc49ec2d2ac2e --- /dev/null +++ b/rich-14.3.3.dist-info/RECORD @@ -0,0 +1,107 @@ +rich-14.3.3.dist-info/INSTALLER,sha256=5hhM4Q4mYTT9z6QB6PGpUAW81PGNFrYrdXMj4oM_6ak,2 +rich-14.3.3.dist-info/METADATA,sha256=9kOnJPejl6vTwavl35qTJmtKu218p9Ob-z-jDoz1ATo,18495 +rich-14.3.3.dist-info/RECORD,, +rich-14.3.3.dist-info/REQUESTED,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 +rich-14.3.3.dist-info/WHEEL,sha256=kJCRJT_g0adfAJzTx2GUMmS80rTJIVHRCfG0DQgLq3o,88 +rich-14.3.3.dist-info/licenses/LICENSE,sha256=3u18F6QxgVgZCj6iOcyHmlpQJxzruYrnAl9I--WNyhU,1056 +rich/__init__.py,sha256=j7AA9_z_PANQbmRTwTkWZc_DVdqtBBHxBqGrMsz21CI,6131 +rich/__main__.py,sha256=kcy8BtBEUkYLCogRkwhCdJs0VNx6KF1n8kFU2g-7plg,7725 +rich/_emoji_codes.py,sha256=hu1VL9nbVdppJrVoijVshRlcRRe_v3dju3Mmd2sKZdY,140235 
+rich/_emoji_replace.py,sha256=n-kcetsEUx2ZUmhQrfeMNc-teeGhpuSQ5F8VPBsyvDo,1064 +rich/_export_format.py,sha256=RI08pSrm5tBSzPMvnbTqbD9WIalaOoN5d4M1RTmLq1Y,2128 +rich/_extension.py,sha256=G66PkbH_QdTJh6jD-J228O76CmAnr2hLQv72CgPPuzE,241 +rich/_fileno.py,sha256=HWZxP5C2ajMbHryvAQZseflVfQoGzsKOHzKGsLD8ynQ,799 +rich/_inspect.py,sha256=5pUYCqveN4ekb_mgHOYCipyZCrMGEJZYZs3y5QqmCf0,9894 +rich/_log_render.py,sha256=xBKCxqiO4FZk8eG56f8crFdrmJxFrJsQE3V3F-fFekc,3213 +rich/_loop.py,sha256=hV_6CLdoPm0va22Wpw4zKqM0RYsz3TZxXj0PoS-9eDQ,1236 +rich/_null_file.py,sha256=ADGKp1yt-k70FMKV6tnqCqecB-rSJzp-WQsD7LPL-kg,1394 +rich/_palettes.py,sha256=cdev1JQKZ0JvlguV9ipHgznTdnvlIzUFDBb0It2PzjI,7063 +rich/_pick.py,sha256=evDt8QN4lF5CiwrUIXlOJCntitBCOsI3ZLPEIAVRLJU,423 +rich/_ratio.py,sha256=IOtl78sQCYZsmHyxhe45krkb68u9xVz7zFsXVJD-b2Y,5325 +rich/_spinners.py,sha256=U2r1_g_1zSjsjiUdAESc2iAMc3i4ri_S8PYP6kQ5z1I,19919 +rich/_stack.py,sha256=-C8OK7rxn3sIUdVwxZBBpeHhIzX0eI-VM3MemYfaXm0,351 +rich/_timer.py,sha256=zelxbT6oPFZnNrwWPpc1ktUeAT-Vc4fuFcRZLQGLtMI,417 +rich/_unicode_data/__init__.py,sha256=40PhAFp3b88yPnRtTG6i9g5xj-5w98CET3C4Ip1LJAI,2631 +rich/_unicode_data/_versions.py,sha256=ftILR3G5dqmlQmCJYNovuyXfFneJY2eH2RvpV19tyYE,298 +rich/_unicode_data/unicode10-0-0.py,sha256=FUstYPZwDF15BEuAERqXHqIPvC1KY1SL20qLj999Fzo,14496 +rich/_unicode_data/unicode11-0-0.py,sha256=jp8SRKFBlIaHMafxAfhLiU7FizwdcgPVvytR3kSFtZo,14874 +rich/_unicode_data/unicode12-0-0.py,sha256=0XHXSjW5EXue6Qfgl_u0bK7NstiSdmqrR_F_Wqhs5bE,15216 +rich/_unicode_data/unicode12-1-0.py,sha256=kWcRGh1UVwsmK9qflfGsO_aM8AuNhLthJcsyXAOQryI,15189 +rich/_unicode_data/unicode13-0-0.py,sha256=DokduALtgaB-WvapW7X7PmM3kZFbgnCo9ifINWZtuDs,15519 +rich/_unicode_data/unicode14-0-0.py,sha256=Ni-3JnGf5sB4kGl8ip-Iyqoum-awJjZV0RoSNuo0yu8,15884 +rich/_unicode_data/unicode15-0-0.py,sha256=DBHkV5Si9vmtv_MiW6NmPQb8yUeP4N-9hNt2juZr4ZE,16156 +rich/_unicode_data/unicode15-1-0.py,sha256=uyCZLYrg0q7wGt3-uBdsF0Ay_x8MTMvFqK6aMOkl9rA,16129 
+rich/_unicode_data/unicode16-0-0.py,sha256=5iGpbczVQ7BD2QGYb6k1h_OdILQXXuhcvI6dQFLKbMc,16480 +rich/_unicode_data/unicode17-0-0.py,sha256=NXbjFsAaTYdj6TYgk5Q9disv40LZHbedFGRl_48_9Y0,16704 +rich/_unicode_data/unicode4-1-0.py,sha256=yIApQGEG7VOFYY2FFfvOdPprE6VpDusjLlTlCy4G_Rw,9488 +rich/_unicode_data/unicode5-0-0.py,sha256=PkEI1x4RGwXSmgqHiu88jyz14VQT8bWiOtBDrECaVZg,9613 +rich/_unicode_data/unicode5-1-0.py,sha256=C0-PWziqpkVkf3PkPz2GWNwidCrjef5AHQGBpqTPkuY,9650 +rich/_unicode_data/unicode5-2-0.py,sha256=gM88qri3_9b0I37DGK1-ah-0SUed2VmvxUddB5UZLto,10390 +rich/_unicode_data/unicode6-0-0.py,sha256=hhcAipqKU81dZ4YbDXIlT7KEK9yNG_Lx-5GLgNTF75g,10604 +rich/_unicode_data/unicode6-1-0.py,sha256=PtUJYxwtU5S2A44YY_69Ka_wTkq-c7QCa-yibmkHvJs,10899 +rich/_unicode_data/unicode6-2-0.py,sha256=TdTOctGwO-OYVGif1DoRJvsaQISHw5EXHPt58fjB55g,10899 +rich/_unicode_data/unicode6-3-0.py,sha256=G9l8chu2MJTwZ_d_0tmH7PBg8KYYHaeuJCJNCHGHkDc,10924 +rich/_unicode_data/unicode7-0-0.py,sha256=po9GGYW7ilGU9MwGE6fny-OofqbDwelQrnOIm-IusjI,11630 +rich/_unicode_data/unicode8-0-0.py,sha256=4PjyI-w2Uj9xtanNlZm1d_2khcetMQlIefaoU2joGkY,11864 +rich/_unicode_data/unicode9-0-0.py,sha256=A-pQjvkTICUGXEDOiWNMqa6z6vv1E__HJffzu7zLi9c,14148 +rich/_win32_console.py,sha256=o2QN_IRx10biGP3Ap1neaqX8FBGlUKSmWM6Kw4OSg-U,22719 +rich/_windows.py,sha256=is3WpbHMj8WaTHYB11hc6lP2t4hlvt4TViTlHSmjsi0,1901 +rich/_windows_renderer.py,sha256=d799xOnxLbCCCzGu9-U7YLmIQkxtxQIBFQQ6iu4veSc,2759 +rich/_wrap.py,sha256=FlSsom5EX0LVkA3KWy34yHnCfLtqX-ZIepXKh-70rpc,3404 +rich/abc.py,sha256=dALMOGfKVNeAbvqq66IpTQxQUerxD7AE4FKwqd0eQKk,878 +rich/align.py,sha256=a8MbP-iJjoOFonxqgoP3YketKMWU3WFZaxjqAv2Qc5E,10726 +rich/ansi.py,sha256=Avs1LHbSdcyOvDOdpELZUoULcBiYewY76eNBp6uFBhs,6921 +rich/bar.py,sha256=ldbVHOzKJOnflVNuv1xS7g6dLX2E3wMnXkdPbpzJTcs,3263 +rich/box.py,sha256=SSolg8_pzHzY9QvJQo-qp0tbPsnj8O_2W4hmi1l-Zo0,10650 +rich/cells.py,sha256=fPRSnquYlCO4THgmUokOJOhNlcx8C4NZ2Xv9JM_7GbE,12335 
+rich/color.py,sha256=3HSULVDj7qQkXUdFWv78JOiSZzfy5y1nkcYhna296V0,18211 +rich/color_triplet.py,sha256=3lhQkdJbvWPoLDO-AnYImAWmJvV5dlgYNCVZ97ORaN4,1054 +rich/columns.py,sha256=HUX0KcMm9dsKNi11fTbiM_h2iDtl8ySCaVcxlalEzq8,7131 +rich/console.py,sha256=doKNi4BLSKZCt6Ra-6AXiQK-FaWAL-7fClG2ZO5w7uQ,101009 +rich/constrain.py,sha256=1VIPuC8AgtKWrcncQrjBdYqA3JVWysu6jZo1rrh7c7Q,1288 +rich/containers.py,sha256=c_56TxcedGYqDepHBMTuZdUIijitAQgnox-Qde0Z1qo,5502 +rich/control.py,sha256=HnsraFTzBaUQDzKJWXsfPv-PPmgGypSgSv7oANackqs,6475 +rich/default_styles.py,sha256=Tk1pZ9bOZ40EVX7cfBbYDEo7hrpwBvZGsiUCXdh0P0M,8340 +rich/diagnose.py,sha256=1RWnQoppPXjC_49AB4vtV048DK3ksQSq671C83Y6f-g,977 +rich/emoji.py,sha256=_bTf1Y3JqiMk6Nfn4V_YOhq1wAPAHNODhGLJj95R3uI,2343 +rich/errors.py,sha256=5pP3Kc5d4QJ_c0KFsxrfyhjiPVe7J1zOqSFbFAzcV-Y,642 +rich/file_proxy.py,sha256=Tl9THMDZ-Pk5Wm8sI1gGg_U5DhusmxD-FZ0fUbcU0W0,1683 +rich/filesize.py,sha256=_iz9lIpRgvW7MNSeCZnLg-HwzbP4GETg543WqD8SFs0,2484 +rich/highlighter.py,sha256=MIapWwjR8ahUcsDicgN4xLRbanbh5x_Qzh5kfdDrKKk,9729 +rich/json.py,sha256=omC2WHTgURxEosna1ftoSJCne2EX7MDuQtCdswS3qsk,5019 +rich/jupyter.py,sha256=G9pOJmR4ESIFYSd4MKGqmHqCtstx0oRWpyeTgv54-Xc,3228 +rich/layout.py,sha256=WR8PCSroYnteIT3zawxQ3k3ad1sQO5wGG1SZOoeBuBM,13944 +rich/live.py,sha256=UKvqLSuSzNHIBz5pxRrU7SPsnG48vJJx2TMHiaHlmnI,15317 +rich/live_render.py,sha256=jQO3X_p2wemGI2bdCAY_e5j9xXNTwDxuujbpoSRsH6c,3803 +rich/logging.py,sha256=UL6TZNlaptYKHNhQ45LREy-29Pl-tQsBh7q3HSnWIAA,12456 +rich/markdown.py,sha256=mDDJmUbWRYm7_vKenVq76K6aG43KTJJ4tiYMzxXO7ns,26177 +rich/markup.py,sha256=btpr271BLhiCR1jNglRnv2BpIzVcNefYwSMeW9teDbc,8427 +rich/measure.py,sha256=HmrIJX8sWRTHbgh8MxEay_83VkqNW_70s8aKP5ZcYI8,5305 +rich/padding.py,sha256=h8XnIivLrNtlxI3vQPKHXh4hAwjOJqZx0slM0z3g1_M,4896 +rich/pager.py,sha256=SO_ETBFKbg3n_AgOzXm41Sv36YxXAyI3_R-KOY2_uSc,828 +rich/palette.py,sha256=Ar6ZUrYHiFt6-Rr2k-k9F8V7hxgJYHNdqjk2vVXsLgc,3288 +rich/panel.py,sha256=9sQl00hPIqH5G2gALQo4NepFwpP0k9wT-s_gOms5pIc,11157 
+rich/pretty.py,sha256=lxW_bhBfDTKe-grtzP1KQhQsqPZxfZN14th4JRVoLzQ,36349 +rich/progress.py,sha256=Z6Yz6pYO26LFXx24cXKvk2bEX6FkjK2Z5e2fy0Bt78s,60393 +rich/progress_bar.py,sha256=mZTPpJUwcfcdgQCTTz3kyY-fc79ddLwtx6Ghhxfo064,8162 +rich/prompt.py,sha256=1SIuBEelovvwCG2wp22PZwskA3IkmAsOxf0ruFkOhOk,12448 +rich/protocol.py,sha256=Wt-2HZd67OYiopUkCTOz7lM38vyo5r3HEQZ9TOPDl5Q,1367 +rich/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 +rich/region.py,sha256=rNT9xZrVZTYIXZC0NYn41CJQwYNbR-KecPOxTgQvB8Y,166 +rich/repr.py,sha256=HIsurPLZK9Gray75l3_vQx7S27AzTpAj4ChXSfe1Fes,4419 +rich/rule.py,sha256=umO21Wjw0FcYAeTB3UumNLCsDWhejzxnjlf2VwiXiDI,4590 +rich/scope.py,sha256=x-jkKIE-j7ZxA9I07lya5iVAuDAyUKM58HWtNZQUzRc,3239 +rich/screen.py,sha256=rL_j2wX-4SeuIOI2oOlc418QP9EAvD59GInUmEAE6jQ,1579 +rich/segment.py,sha256=aUh9qRteTyo6guePUn8cU7S1ccXRsfzLZ-VMDXXeNug,25795 +rich/spinner.py,sha256=onIhpKlljRHppTZasxO8kXgtYyCHUkpSgKglRJ3o51g,4214 +rich/status.py,sha256=kkPph3YeAZBo-X-4wPp8gTqZyU466NLwZBA4PZTTewo,4424 +rich/style.py,sha256=W9Ccy8Py8lNICtlfcp-ryzMTuQaGxAU3av7-g5fHu0s,26990 +rich/styled.py,sha256=wljVsVTXbABMMZvkzkO43ZEk_-irzEtvUiQ-sNnikQ8,1234 +rich/syntax.py,sha256=5ZBNxjIj3C1FC92vLwBVN-C5YAdKjPHfH6SqCzFaOYE,36263 +rich/table.py,sha256=6yv7wMLXZgOsGgzEDpKXpecXQOwcLBHNOn-MYcbo1Zk,40033 +rich/terminal_theme.py,sha256=1j5-ufJfnvlAo5Qsi_ACZiXDmwMXzqgmFByObT9-yJY,3370 +rich/text.py,sha256=t2J8iMquwR0BUUggm-cVNp30Wxw5M8fIF_p0jioYRV0,47655 +rich/theme.py,sha256=oNyhXhGagtDlbDye3tVu3esWOWk0vNkuxFw-_unlaK0,3771 +rich/themes.py,sha256=0xgTLozfabebYtcJtDdC5QkX5IVUEaviqDUJJh4YVFk,102 +rich/traceback.py,sha256=TT03eLKEj5XapXFp9kgf0bBwNupIFrHquKxZtwBhz_8,37535 +rich/tree.py,sha256=QoOwg424FkdwGfR8K0tZ6Q7qtzWNAUP_m4sFaYuG6nw,9391 diff --git a/rich-14.3.3.dist-info/REQUESTED b/rich-14.3.3.dist-info/REQUESTED new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/rich-14.3.3.dist-info/WHEEL 
b/rich-14.3.3.dist-info/WHEEL new file mode 100644 index 0000000000000000000000000000000000000000..7894e88612ce5ce2c8502e9eeee7ede5b88c9b9e --- /dev/null +++ b/rich-14.3.3.dist-info/WHEEL @@ -0,0 +1,4 @@ +Wheel-Version: 1.0 +Generator: poetry-core 2.3.1 +Root-Is-Purelib: true +Tag: py3-none-any diff --git a/sentence_transformers/__init__.py b/sentence_transformers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a3bdf18700aa0d07292b0aaab03306923cc9044e --- /dev/null +++ b/sentence_transformers/__init__.py @@ -0,0 +1,72 @@ +from __future__ import annotations + +__version__ = "5.3.0" +__MODEL_HUB_ORGANIZATION__ = "sentence-transformers" + +import importlib +import os +import warnings + +from sentence_transformers.backend import ( + export_dynamic_quantized_onnx_model, + export_optimized_onnx_model, + export_static_quantized_openvino_model, +) +from sentence_transformers.cross_encoder import ( + CrossEncoder, + CrossEncoderModelCardData, + CrossEncoderTrainer, + CrossEncoderTrainingArguments, +) +from sentence_transformers.datasets import ParallelSentencesDataset, SentencesDataset +from sentence_transformers.LoggingHandler import LoggingHandler +from sentence_transformers.model_card import SentenceTransformerModelCardData +from sentence_transformers.quantization import quantize_embeddings +from sentence_transformers.readers import InputExample +from sentence_transformers.sampler import DefaultBatchSampler, MultiDatasetDefaultBatchSampler +from sentence_transformers.SentenceTransformer import SentenceTransformer +from sentence_transformers.similarity_functions import SimilarityFunction +from sentence_transformers.sparse_encoder import ( + SparseEncoder, + SparseEncoderModelCardData, + SparseEncoderTrainer, + SparseEncoderTrainingArguments, +) +from sentence_transformers.trainer import SentenceTransformerTrainer +from sentence_transformers.training_args import SentenceTransformerTrainingArguments +from sentence_transformers.util import 
mine_hard_negatives + +# If codecarbon is installed and the log level is not defined, +# automatically overwrite the default to "error" +if importlib.util.find_spec("codecarbon") and "CODECARBON_LOG_LEVEL" not in os.environ: + os.environ["CODECARBON_LOG_LEVEL"] = "error" + +# Globally silence PyTorch sparse CSR tensor beta warning +warnings.filterwarnings("ignore", message="Sparse CSR tensor support is in beta state") + +__all__ = [ + "LoggingHandler", + "SentencesDataset", + "ParallelSentencesDataset", + "SentenceTransformer", + "SimilarityFunction", + "InputExample", + "CrossEncoder", + "CrossEncoderTrainer", + "CrossEncoderTrainingArguments", + "CrossEncoderModelCardData", + "SentenceTransformerTrainer", + "SentenceTransformerTrainingArguments", + "SentenceTransformerModelCardData", + "SparseEncoder", + "SparseEncoderTrainer", + "SparseEncoderTrainingArguments", + "SparseEncoderModelCardData", + "quantize_embeddings", + "export_optimized_onnx_model", + "export_dynamic_quantized_onnx_model", + "export_static_quantized_openvino_model", + "DefaultBatchSampler", + "MultiDatasetDefaultBatchSampler", + "mine_hard_negatives", +] diff --git a/sentence_transformers/model_card_templates.py b/sentence_transformers/model_card_templates.py new file mode 100644 index 0000000000000000000000000000000000000000..9c6a6e19ef2071a50138429d015d60b0a0979273 --- /dev/null +++ b/sentence_transformers/model_card_templates.py @@ -0,0 +1,191 @@ +""" +This file contains the templating for model cards prior to the v3.0 release. It still exists to be used alongside +SentenceTransformer.old_fit for backwards compatibility, but will be removed in a future release. 
class ModelCardTemplate:
    """Legacy (pre-v3.0) model card templates and helpers.

    The class is a namespace of string templates whose ``{PLACEHOLDER}``
    markers are filled in by the caller, plus two static helpers that produce
    template fragments.
    """

    __TAGS__ = ["sentence-transformers", "feature-extraction", "sentence-similarity"]
    # Placeholder -> default replacement value.
    __DEFAULT_VARS__ = {
        "{PIPELINE_TAG}": "sentence-similarity",
        "{MODEL_DESCRIPTION}": "",
        "{TRAINING_SECTION}": "",
        "{USAGE_TRANSFORMERS_SECTION}": "",
        "{EVALUATION}": "",
        "{CITING}": "",
    }

    __MODEL_CARD__ = """
---
library_name: sentence-transformers
pipeline_tag: {PIPELINE_TAG}
tags:
{TAGS}
{DATASETS}
---

# {MODEL_NAME}

This is a [sentence-transformers](https://www.SBERT.net) model: It maps sentences & paragraphs to a {NUM_DIMENSIONS} dimensional dense vector space and can be used for tasks like clustering or semantic search.

{MODEL_DESCRIPTION}

## Usage (Sentence-Transformers)

Using this model becomes easy when you have [sentence-transformers](https://www.SBERT.net) installed:

```
pip install -U sentence-transformers
```

Then you can use the model like this:

```python
from sentence_transformers import SentenceTransformer
sentences = ["This is an example sentence", "Each sentence is converted"]

model = SentenceTransformer('{MODEL_NAME}')
embeddings = model.encode(sentences)
print(embeddings)
```

{USAGE_TRANSFORMERS_SECTION}

## Evaluation Results

{EVALUATION}

For an automated evaluation of this model, see the *Sentence Embeddings Benchmark*: [https://seb.sbert.net](https://seb.sbert.net?model_name={MODEL_NAME})

{TRAINING_SECTION}

## Full Model Architecture
```
{FULL_MODEL_STR}
```

## Citing & Authors

{CITING}

"""

    __TRAINING_SECTION__ = """
## Training
The model was trained with the parameters:

{LOSS_FUNCTIONS}

Parameters of the fit()-Method:
```
{FIT_PARAMETERS}
```
"""

    __USAGE_TRANSFORMERS__ = """\n
## Usage (HuggingFace Transformers)
Without [sentence-transformers](https://www.SBERT.net), you can use the model like this: First, you pass your input through the transformer model, then you have to apply the right pooling-operation on-top of the contextualized word embeddings.

```python
from transformers import AutoTokenizer, AutoModel
import torch

{POOLING_FUNCTION}

# Sentences we want sentence embeddings for
sentences = ['This is an example sentence', 'Each sentence is converted']

# Load model from HuggingFace Hub
tokenizer = AutoTokenizer.from_pretrained('{MODEL_NAME}')
model = AutoModel.from_pretrained('{MODEL_NAME}')

# Tokenize sentences
encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

# Compute token embeddings
with torch.no_grad():
    model_output = model(**encoded_input)

# Perform pooling. In this case, {POOLING_MODE} pooling.
sentence_embeddings = {POOLING_FUNCTION_NAME}(model_output, encoded_input['attention_mask'])

print("Sentence embeddings:")
print(sentence_embeddings)
```

"""

    @staticmethod
    def model_card_get_pooling_function(pooling_mode):
        """Return the example pooling snippet for ``pooling_mode``.

        Args:
            pooling_mode: One of ``"max"``, ``"mean"`` or ``"cls"``.

        Returns:
            A ``(function_name, function_source)`` tuple of strings for the
            three supported modes. Any other value falls through every branch,
            so the result is ``None``.
        """
        if pooling_mode == "max":
            return (
                "max_pooling",
                """
# Max Pooling - Take the max value over time for every dimension.
def max_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).to(token_embeddings.dtype)
    token_embeddings[input_mask_expanded == 0] = -1e9  # Set padding tokens to large negative value
    return torch.max(token_embeddings, 1)[0]
""",
            )
        elif pooling_mode == "mean":
            return (
                "mean_pooling",
                """
#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).to(token_embeddings.dtype)
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
""",
            )

        elif pooling_mode == "cls":
            return (
                "cls_pooling",
                """
def cls_pooling(model_output, attention_mask):
    return model_output[0][:,0]
""",
            )

    @staticmethod
    def get_train_objective_info(dataloader, loss):
        """Describe a (dataloader, loss) training objective as markdown.

        Args:
            dataloader: Object to describe. If it exposes ``get_config_dict()``
                that result is used as its parameters; otherwise
                ``batch_size``, ``sampler`` and ``batch_sampler`` are read
                when present.
            loss: Loss object. Its ``get_config_dict()`` result is included
                when the method exists.

        Returns:
            ``[dataloader_str, loss_str]`` on success. This is a best-effort
            helper: if anything raises while building the description, a
            warning is logged and ``""`` is returned instead.
        """
        try:
            if hasattr(dataloader, "get_config_dict"):
                loader_params = dataloader.get_config_dict()
            else:
                loader_params = {}
                loader_params["batch_size"] = dataloader.batch_size if hasattr(dataloader, "batch_size") else "unknown"
                if hasattr(dataloader, "sampler"):
                    loader_params["sampler"] = fullname(dataloader.sampler)
                if hasattr(dataloader, "batch_sampler"):
                    loader_params["batch_sampler"] = fullname(dataloader.batch_sampler)

            dataloader_str = f"""**DataLoader**:\n\n`{fullname(dataloader)}` of length {len(dataloader)} with parameters:
```
{loader_params}
```"""

            loss_str = "**Loss**:\n\n`{}` {}".format(
                fullname(loss),
                f"""with parameters:
  ```
  {loss.get_config_dict()}
  ```"""
                if hasattr(loss, "get_config_dict")
                else "",
            )

            return [dataloader_str, loss_str]

        except Exception as e:
            # logging.WARN is the integer level constant, not a function; calling
            # it raised TypeError from inside this handler. Use logging.warning
            # so the failure is reported and the fallback value is returned.
            logging.warning("Exception when creating get_train_objective_info: %s", e)
            return ""
If you are not familiar with adapters and PEFT methods, we + invite you to read more about them on PEFT official documentation: https://huggingface.co/docs/peft + + Requires peft as a backend to load the adapter weights and the underlying model to be compatible with PEFT. + + Args: + *args: + Positional arguments to pass to the underlying AutoModel `load_adapter` function. More information can be found in the transformers documentation + https://huggingface.co/docs/transformers/main/en/main_classes/peft#transformers.integrations.PeftAdapterMixin.load_adapter + **kwargs: + Keyword arguments to pass to the underlying AutoModel `load_adapter` function. More information can be found in the transformers documentation + https://huggingface.co/docs/transformers/main/en/main_classes/peft#transformers.integrations.PeftAdapterMixin.load_adapter + """ + ... # Implementation handled by the wrapper + + @peft_wrapper + def add_adapter(self, *args, **kwargs) -> None: + """ + Adds a fresh new adapter to the current model for training purposes. If no adapter name is passed, a default + name is assigned to the adapter to follow the convention of PEFT library (in PEFT we use "default" as the + default adapter name). + + Requires peft as a backend to load the adapter weights and the underlying model to be compatible with PEFT. + + Args: + *args: + Positional arguments to pass to the underlying AutoModel `add_adapter` function. More information can be found in the transformers documentation + https://huggingface.co/docs/transformers/main/en/main_classes/peft#transformers.integrations.PeftAdapterMixin.add_adapter + **kwargs: + Keyword arguments to pass to the underlying AutoModel `add_adapter` function. More information can be found in the transformers documentation + https://huggingface.co/docs/transformers/main/en/main_classes/peft#transformers.integrations.PeftAdapterMixin.add_adapter + + """ + ... 
# Implementation handled by the wrapper + + @peft_wrapper + def set_adapter(self, *args, **kwargs) -> None: + """ + Sets a specific adapter by forcing the model to use a that adapter and disable the other adapters. + + Args: + *args: + Positional arguments to pass to the underlying AutoModel `set_adapter` function. More information can be found in the transformers documentation + https://huggingface.co/docs/transformers/main/en/main_classes/peft#transformers.integrations.PeftAdapterMixin.set_adapter + **kwargs: + Keyword arguments to pass to the underlying AutoModel `set_adapter` function. More information can be found in the transformers documentation + https://huggingface.co/docs/transformers/main/en/main_classes/peft#transformers.integrations.PeftAdapterMixin.set_adapter + """ + ... # Implementation handled by the wrapper + + @peft_wrapper + def disable_adapters(self) -> None: + """ + Disable all adapters that are attached to the model. This leads to inferring with the base model only. + """ + ... # Implementation handled by the wrapper + + @peft_wrapper + def enable_adapters(self) -> None: + """ + Enable adapters that are attached to the model. The model will use `self.active_adapter()` + """ + ... # Implementation handled by the wrapper + + @peft_wrapper + def active_adapters(self) -> list[str]: + """ + If you are not familiar with adapters and PEFT methods, we invite you to read more about them on the PEFT + official documentation: https://huggingface.co/docs/peft + + Gets the current active adapters of the model. In case of multi-adapter inference (combining multiple adapters + for inference) returns the list of all active adapters so that users can deal with them accordingly. + + For previous PEFT versions (that does not support multi-adapter inference), `module.active_adapter` will return + a single string. + """ + ... # Implementation handled by the wrapper + + @peft_wrapper + def active_adapter(self) -> str: ... 
# Implementation handled by the wrapper + + @peft_wrapper + def get_adapter_state_dict(self, *args, **kwargs) -> dict: + """ + If you are not familiar with adapters and PEFT methods, we invite you to read more about them on the PEFT + official documentation: https://huggingface.co/docs/peft + + Gets the adapter state dict that should only contain the weights tensors of the specified adapter_name adapter. + If no adapter_name is passed, the active adapter is used. + + Args: + *args: + Positional arguments to pass to the underlying AutoModel `get_adapter_state_dict` function. More information can be found in the transformers documentation + https://huggingface.co/docs/transformers/main/en/main_classes/peft#transformers.integrations.PeftAdapterMixin.get_adapter_state_dict + **kwargs: + Keyword arguments to pass to the underlying AutoModel `get_adapter_state_dict` function. More information can be found in the transformers documentation + https://huggingface.co/docs/transformers/main/en/main_classes/peft#transformers.integrations.PeftAdapterMixin.get_adapter_state_dict + """ + ... # Implementation handled by the wrapper + + @peft_wrapper + def delete_adapter(self, *args, **kwargs) -> None: + """ + If you are not familiar with adapters and PEFT methods, we invite you to read more about them on the PEFT + official documentation: https://huggingface.co/docs/peft + + Delete an adapter's LoRA layers from the underlying model. + + Args: + *args: + Positional arguments to pass to the underlying AutoModel `delete_adapter` function. More information can be found in the transformers documentation + https://huggingface.co/docs/transformers/main/en/main_classes/peft#transformers.integrations.PeftAdapterMixin.delete_adapter + **kwargs: + Keyword arguments to pass to the underlying AutoModel `delete_adapter` function. 
More information can be found in the transformers documentation + https://huggingface.co/docs/transformers/main/en/main_classes/peft#transformers.integrations.PeftAdapterMixin.delete_adapter + """ diff --git a/sentence_transformers/similarity_functions.py b/sentence_transformers/similarity_functions.py new file mode 100644 index 0000000000000000000000000000000000000000..53b8d17aff4aac52b7693f61994014c69c375f25 --- /dev/null +++ b/sentence_transformers/similarity_functions.py @@ -0,0 +1,129 @@ +from __future__ import annotations + +from collections.abc import Callable +from enum import Enum + +from numpy import ndarray +from torch import Tensor + +from .util import ( + cos_sim, + dot_score, + euclidean_sim, + manhattan_sim, + pairwise_cos_sim, + pairwise_dot_score, + pairwise_euclidean_sim, + pairwise_manhattan_sim, +) + + +class SimilarityFunction(Enum): + """ + Enum class for supported similarity functions. The following functions are supported: + + - ``SimilarityFunction.COSINE`` (``"cosine"``): Cosine similarity + - ``SimilarityFunction.DOT_PRODUCT`` (``"dot"``, ``dot_product``): Dot product similarity + - ``SimilarityFunction.EUCLIDEAN`` (``"euclidean"``): Euclidean distance + - ``SimilarityFunction.MANHATTAN`` (``"manhattan"``): Manhattan distance + """ + + COSINE = "cosine" + DOT_PRODUCT = "dot" + DOT = "dot" # Alias for DOT_PRODUCT + EUCLIDEAN = "euclidean" + MANHATTAN = "manhattan" + + @staticmethod + def to_similarity_fn( + similarity_function: str | SimilarityFunction, + ) -> Callable[[Tensor | ndarray, Tensor | ndarray], Tensor]: + """ + Converts a similarity function name or enum value to the corresponding similarity function. + + Args: + similarity_function (Union[str, SimilarityFunction]): The name or enum value of the similarity function. + + Returns: + Callable[[Union[Tensor, ndarray], Union[Tensor, ndarray]], Tensor]: The corresponding similarity function. + + Raises: + ValueError: If the provided function is not supported. 
+ + Example: + >>> similarity_fn = SimilarityFunction.to_similarity_fn("cosine") + >>> similarity_scores = similarity_fn(embeddings1, embeddings2) + >>> similarity_scores + tensor([[0.3952, 0.0554], + [0.0992, 0.1570]]) + """ + similarity_function = SimilarityFunction(similarity_function) + + if similarity_function == SimilarityFunction.COSINE: + return cos_sim + if similarity_function == SimilarityFunction.DOT_PRODUCT: + return dot_score + if similarity_function == SimilarityFunction.MANHATTAN: + return manhattan_sim + if similarity_function == SimilarityFunction.EUCLIDEAN: + return euclidean_sim + + raise ValueError( + f"The provided function {similarity_function} is not supported. Use one of the supported values: {SimilarityFunction.possible_values()}." + ) + + @staticmethod + def to_similarity_pairwise_fn( + similarity_function: str | SimilarityFunction, + ) -> Callable[[Tensor | ndarray, Tensor | ndarray], Tensor]: + """ + Converts a similarity function into a pairwise similarity function. + + The pairwise similarity function returns the diagonal vector from the similarity matrix, i.e. it only + computes the similarity(a[i], b[i]) for each i in the range of the input tensors, rather than + computing the similarity between all pairs of a and b. + + Args: + similarity_function (Union[str, SimilarityFunction]): The name or enum value of the similarity function. + + Returns: + Callable[[Union[Tensor, ndarray], Union[Tensor, ndarray]], Tensor]: The pairwise similarity function. + + Raises: + ValueError: If the provided similarity function is not supported. 
+ + Example: + >>> pairwise_fn = SimilarityFunction.to_similarity_pairwise_fn("cosine") + >>> similarity_scores = pairwise_fn(embeddings1, embeddings2) + >>> similarity_scores + tensor([0.3952, 0.1570]) + """ + similarity_function = SimilarityFunction(similarity_function) + + if similarity_function == SimilarityFunction.COSINE: + return pairwise_cos_sim + if similarity_function == SimilarityFunction.DOT_PRODUCT: + return pairwise_dot_score + if similarity_function == SimilarityFunction.MANHATTAN: + return pairwise_manhattan_sim + if similarity_function == SimilarityFunction.EUCLIDEAN: + return pairwise_euclidean_sim + + raise ValueError( + f"The provided function {similarity_function} is not supported. Use one of the supported values: {SimilarityFunction.possible_values()}." + ) + + @staticmethod + def possible_values() -> list[str]: + """ + Returns a list of possible values for the SimilarityFunction enum. + + Returns: + list: A list of possible values for the SimilarityFunction enum. + + Example: + >>> possible_values = SimilarityFunction.possible_values() + >>> possible_values + ['cosine', 'dot', 'euclidean', 'manhattan'] + """ + return [m.value for m in SimilarityFunction] diff --git a/shellingham-1.5.4.dist-info/INSTALLER b/shellingham-1.5.4.dist-info/INSTALLER new file mode 100644 index 0000000000000000000000000000000000000000..5c69047b2eb8235994febeeae1da4a82365a240a --- /dev/null +++ b/shellingham-1.5.4.dist-info/INSTALLER @@ -0,0 +1 @@ +uv \ No newline at end of file diff --git a/shellingham-1.5.4.dist-info/LICENSE b/shellingham-1.5.4.dist-info/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..b9077766e9b9bdcae49ea5c8fced750ed13ec8f7 --- /dev/null +++ b/shellingham-1.5.4.dist-info/LICENSE @@ -0,0 +1,13 @@ +Copyright (c) 2018, Tzu-ping Chung + +Permission to use, copy, modify, and distribute this software for any +purpose with or without fee is hereby granted, provided that the above +copyright notice and this permission notice 
appear in all copies. + +THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF +OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. diff --git a/shellingham-1.5.4.dist-info/METADATA b/shellingham-1.5.4.dist-info/METADATA new file mode 100644 index 0000000000000000000000000000000000000000..52118f1e5c83bd7ef39196a749651fc87d176812 --- /dev/null +++ b/shellingham-1.5.4.dist-info/METADATA @@ -0,0 +1,106 @@ +Metadata-Version: 2.1 +Name: shellingham +Version: 1.5.4 +Summary: Tool to Detect Surrounding Shell +Home-page: https://github.com/sarugaku/shellingham +Author: Tzu-ping Chung +Author-email: uranusjr@gmail.com +License: ISC License +Keywords: shell +Classifier: Development Status :: 3 - Alpha +Classifier: Environment :: Console +Classifier: Intended Audience :: Developers +Classifier: License :: OSI Approved :: ISC License (ISCL) +Classifier: Operating System :: OS Independent +Classifier: Programming Language :: Python :: 3 :: Only +Classifier: Programming Language :: Python :: 3.7 +Classifier: Programming Language :: Python :: 3.8 +Classifier: Programming Language :: Python :: 3.9 +Classifier: Programming Language :: Python :: 3.10 +Classifier: Programming Language :: Python :: 3.11 +Classifier: Programming Language :: Python :: 3.12 +Classifier: Topic :: Software Development :: Libraries :: Python Modules +Requires-Python: >=3.7 +Description-Content-Type: text/x-rst +License-File: LICENSE + +============================================= +Shellingham: Tool to Detect Surrounding Shell +============================================= + +.. 
image:: https://img.shields.io/pypi/v/shellingham.svg + :target: https://pypi.org/project/shellingham/ + +Shellingham detects what shell the current Python executable is running in. + + +Usage +===== + +.. code-block:: python + + >>> import shellingham + >>> shellingham.detect_shell() + ('bash', '/bin/bash') + +``detect_shell`` pokes around the process's running environment to determine +what shell it is run in. It returns a 2-tuple: + +* The shell name, always lowercased. +* The command used to run the shell. + +``ShellDetectionFailure`` is raised if ``detect_shell`` fails to detect the +surrounding shell. + + +Notes +===== + +* The shell name is always lowercased. +* On Windows, the shell name is the name of the executable, minus the file + extension. + + +Notes for Application Developers +================================ + +Remember, your application's user is not necessarily using a shell. +Shellingham raises ``ShellDetectionFailure`` if there is no shell to detect, +but *your application should almost never do this to your user*. + +A practical approach to this is to wrap ``detect_shell`` in a try block, and +provide a sane default on failure + +.. code-block:: python + + try: + shell = shellingham.detect_shell() + except shellingham.ShellDetectionFailure: + shell = provide_default() + + +There are a few choices for you to choose from. + +* The POSIX standard mandates the environment variable ``SHELL`` to refer to + "the user's preferred command language interpreter". This is always available + (even if the user is not in an interactive session), and likely the correct + choice to launch an interactive sub-shell with. +* A command ``sh`` is almost guaranteed to exist, likely at ``/bin/sh``, since + several POSIX tools rely on it. This should be suitable if you want to run a + (possibly non-interactive) script. +* All versions of DOS and Windows have an environment variable ``COMSPEC``. + This can always be used to launch a usable command prompt (e.g. 
`cmd.exe` on + Windows). + +Here's a simple implementation to provide a default shell + +.. code-block:: python + + import os + + def provide_default(): + if os.name == 'posix': + return os.environ['SHELL'] + elif os.name == 'nt': + return os.environ['COMSPEC'] + raise NotImplementedError(f'OS {os.name!r} support not available') diff --git a/shellingham-1.5.4.dist-info/RECORD b/shellingham-1.5.4.dist-info/RECORD new file mode 100644 index 0000000000000000000000000000000000000000..8df44b12ec09402c0cc3c64561ec4bb489a96427 --- /dev/null +++ b/shellingham-1.5.4.dist-info/RECORD @@ -0,0 +1,15 @@ +shellingham-1.5.4.dist-info/INSTALLER,sha256=5hhM4Q4mYTT9z6QB6PGpUAW81PGNFrYrdXMj4oM_6ak,2 +shellingham-1.5.4.dist-info/LICENSE,sha256=84j9OMrRMRLB3A9mm76A5_hFQe26-3LzAw0sp2QsPJ0,751 +shellingham-1.5.4.dist-info/METADATA,sha256=GD2AIgo3STJieVc53TV8xbs_Sb05DMkZjVGA5UUaB_o,3461 +shellingham-1.5.4.dist-info/RECORD,, +shellingham-1.5.4.dist-info/REQUESTED,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 +shellingham-1.5.4.dist-info/WHEEL,sha256=iYlv5fX357PQyRT2o6tw1bN-YcKFFHKqB_LwHO5wP-g,110 +shellingham-1.5.4.dist-info/top_level.txt,sha256=uKMQL5AKxPi4O9_Rbd838QeEs4ImpGQKNbEDZYqgBgk,12 +shellingham-1.5.4.dist-info/zip-safe,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1 +shellingham/__init__.py,sha256=pAKXUPKUdwyErC0ZjS-5w-fRdSbmdcfvnpt_x1yWqtA,635 +shellingham/_core.py,sha256=v-CTr_7F7cJAtNnzpa1N_Hl8afkY5yiDA4joGmsUBu0,300 +shellingham/nt.py,sha256=m6J6SuwyqVVlxXT9Bc-9F_1x-T5u0gCFFrRAF2LIkeg,4516 +shellingham/posix/__init__.py,sha256=pB69qtvZJ_yIf48nl4-ZfS3wLwwuXuknXOZhBnC2T1o,3129 +shellingham/posix/_core.py,sha256=_v18UaXbzr4muNhr3-mH1FdSdjZ_dOXQrtUyomIbKYQ,81 +shellingham/posix/proc.py,sha256=nSUxIuQSotvaDW76i0oTQAM9aZ9PXBLFAEktWljSKCo,2659 +shellingham/posix/ps.py,sha256=NGmDKCukhNp0lahwYCaMXphBYaVbhbiR9BtE0OkT8qU,1770 diff --git a/shellingham-1.5.4.dist-info/REQUESTED b/shellingham-1.5.4.dist-info/REQUESTED new file mode 100644 index 
0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/shellingham-1.5.4.dist-info/WHEEL b/shellingham-1.5.4.dist-info/WHEEL new file mode 100644 index 0000000000000000000000000000000000000000..c34f1162ef9a50c355df1261ef6194ffc1b39975 --- /dev/null +++ b/shellingham-1.5.4.dist-info/WHEEL @@ -0,0 +1,6 @@ +Wheel-Version: 1.0 +Generator: bdist_wheel (0.41.2) +Root-Is-Purelib: true +Tag: py2-none-any +Tag: py3-none-any + diff --git a/shellingham-1.5.4.dist-info/top_level.txt b/shellingham-1.5.4.dist-info/top_level.txt new file mode 100644 index 0000000000000000000000000000000000000000..d4e44ce0299bb38463f8491ec8850910235c2709 --- /dev/null +++ b/shellingham-1.5.4.dist-info/top_level.txt @@ -0,0 +1 @@ +shellingham diff --git a/shellingham-1.5.4.dist-info/zip-safe b/shellingham-1.5.4.dist-info/zip-safe new file mode 100644 index 0000000000000000000000000000000000000000..8b137891791fe96927ad78e64b0aad7bded08bdc --- /dev/null +++ b/shellingham-1.5.4.dist-info/zip-safe @@ -0,0 +1 @@ + diff --git a/six.py b/six.py new file mode 100644 index 0000000000000000000000000000000000000000..3de5969b1ad3b973342e5e88ee1770fa7c798152 --- /dev/null +++ b/six.py @@ -0,0 +1,1003 @@ +# Copyright (c) 2010-2024 Benjamin Peterson +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. 
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +"""Utilities for writing code that runs on Python 2 and 3""" + +from __future__ import absolute_import + +import functools +import itertools +import operator +import sys +import types + +__author__ = "Benjamin Peterson " +__version__ = "1.17.0" + + +# Useful for very coarse version differentiation. +PY2 = sys.version_info[0] == 2 +PY3 = sys.version_info[0] == 3 +PY34 = sys.version_info[0:2] >= (3, 4) + +if PY3: + string_types = str, + integer_types = int, + class_types = type, + text_type = str + binary_type = bytes + + MAXSIZE = sys.maxsize +else: + string_types = basestring, + integer_types = (int, long) + class_types = (type, types.ClassType) + text_type = unicode + binary_type = str + + if sys.platform.startswith("java"): + # Jython always uses 32 bits. + MAXSIZE = int((1 << 31) - 1) + else: + # It's possible to have sizeof(long) != sizeof(Py_ssize_t). 
+ class X(object): + + def __len__(self): + return 1 << 31 + try: + len(X()) + except OverflowError: + # 32-bit + MAXSIZE = int((1 << 31) - 1) + else: + # 64-bit + MAXSIZE = int((1 << 63) - 1) + del X + +if PY34: + from importlib.util import spec_from_loader +else: + spec_from_loader = None + + +def _add_doc(func, doc): + """Add documentation to a function.""" + func.__doc__ = doc + + +def _import_module(name): + """Import module, returning the module after the last dot.""" + __import__(name) + return sys.modules[name] + + +class _LazyDescr(object): + + def __init__(self, name): + self.name = name + + def __get__(self, obj, tp): + result = self._resolve() + setattr(obj, self.name, result) # Invokes __set__. + try: + # This is a bit ugly, but it avoids running this again by + # removing this descriptor. + delattr(obj.__class__, self.name) + except AttributeError: + pass + return result + + +class MovedModule(_LazyDescr): + + def __init__(self, name, old, new=None): + super(MovedModule, self).__init__(name) + if PY3: + if new is None: + new = name + self.mod = new + else: + self.mod = old + + def _resolve(self): + return _import_module(self.mod) + + def __getattr__(self, attr): + _module = self._resolve() + value = getattr(_module, attr) + setattr(self, attr, value) + return value + + +class _LazyModule(types.ModuleType): + + def __init__(self, name): + super(_LazyModule, self).__init__(name) + self.__doc__ = self.__class__.__doc__ + + def __dir__(self): + attrs = ["__doc__", "__name__"] + attrs += [attr.name for attr in self._moved_attributes] + return attrs + + # Subclasses should override this + _moved_attributes = [] + + +class MovedAttribute(_LazyDescr): + + def __init__(self, name, old_mod, new_mod, old_attr=None, new_attr=None): + super(MovedAttribute, self).__init__(name) + if PY3: + if new_mod is None: + new_mod = name + self.mod = new_mod + if new_attr is None: + if old_attr is None: + new_attr = name + else: + new_attr = old_attr + self.attr = new_attr + 
else: + self.mod = old_mod + if old_attr is None: + old_attr = name + self.attr = old_attr + + def _resolve(self): + module = _import_module(self.mod) + return getattr(module, self.attr) + + +class _SixMetaPathImporter(object): + + """ + A meta path importer to import six.moves and its submodules. + + This class implements a PEP302 finder and loader. It should be compatible + with Python 2.5 and all existing versions of Python3 + """ + + def __init__(self, six_module_name): + self.name = six_module_name + self.known_modules = {} + + def _add_module(self, mod, *fullnames): + for fullname in fullnames: + self.known_modules[self.name + "." + fullname] = mod + + def _get_module(self, fullname): + return self.known_modules[self.name + "." + fullname] + + def find_module(self, fullname, path=None): + if fullname in self.known_modules: + return self + return None + + def find_spec(self, fullname, path, target=None): + if fullname in self.known_modules: + return spec_from_loader(fullname, self) + return None + + def __get_module(self, fullname): + try: + return self.known_modules[fullname] + except KeyError: + raise ImportError("This loader does not know module " + fullname) + + def load_module(self, fullname): + try: + # in case of a reload + return sys.modules[fullname] + except KeyError: + pass + mod = self.__get_module(fullname) + if isinstance(mod, MovedModule): + mod = mod._resolve() + else: + mod.__loader__ = self + sys.modules[fullname] = mod + return mod + + def is_package(self, fullname): + """ + Return true, if the named module is a package. 
+ + We need this method to get correct spec objects with + Python 3.4 (see PEP451) + """ + return hasattr(self.__get_module(fullname), "__path__") + + def get_code(self, fullname): + """Return None + + Required, if is_package is implemented""" + self.__get_module(fullname) # eventually raises ImportError + return None + get_source = get_code # same as get_code + + def create_module(self, spec): + return self.load_module(spec.name) + + def exec_module(self, module): + pass + +_importer = _SixMetaPathImporter(__name__) + + +class _MovedItems(_LazyModule): + + """Lazy loading of moved objects""" + __path__ = [] # mark as package + + +_moved_attributes = [ + MovedAttribute("cStringIO", "cStringIO", "io", "StringIO"), + MovedAttribute("filter", "itertools", "builtins", "ifilter", "filter"), + MovedAttribute("filterfalse", "itertools", "itertools", "ifilterfalse", "filterfalse"), + MovedAttribute("input", "__builtin__", "builtins", "raw_input", "input"), + MovedAttribute("intern", "__builtin__", "sys"), + MovedAttribute("map", "itertools", "builtins", "imap", "map"), + MovedAttribute("getcwd", "os", "os", "getcwdu", "getcwd"), + MovedAttribute("getcwdb", "os", "os", "getcwd", "getcwdb"), + MovedAttribute("getoutput", "commands", "subprocess"), + MovedAttribute("range", "__builtin__", "builtins", "xrange", "range"), + MovedAttribute("reload_module", "__builtin__", "importlib" if PY34 else "imp", "reload"), + MovedAttribute("reduce", "__builtin__", "functools"), + MovedAttribute("shlex_quote", "pipes", "shlex", "quote"), + MovedAttribute("StringIO", "StringIO", "io"), + MovedAttribute("UserDict", "UserDict", "collections", "IterableUserDict", "UserDict"), + MovedAttribute("UserList", "UserList", "collections"), + MovedAttribute("UserString", "UserString", "collections"), + MovedAttribute("xrange", "__builtin__", "builtins", "xrange", "range"), + MovedAttribute("zip", "itertools", "builtins", "izip", "zip"), + MovedAttribute("zip_longest", "itertools", "itertools", 
"izip_longest", "zip_longest"), + MovedModule("builtins", "__builtin__"), + MovedModule("configparser", "ConfigParser"), + MovedModule("collections_abc", "collections", "collections.abc" if sys.version_info >= (3, 3) else "collections"), + MovedModule("copyreg", "copy_reg"), + MovedModule("dbm_gnu", "gdbm", "dbm.gnu"), + MovedModule("dbm_ndbm", "dbm", "dbm.ndbm"), + MovedModule("_dummy_thread", "dummy_thread", "_dummy_thread" if sys.version_info < (3, 9) else "_thread"), + MovedModule("http_cookiejar", "cookielib", "http.cookiejar"), + MovedModule("http_cookies", "Cookie", "http.cookies"), + MovedModule("html_entities", "htmlentitydefs", "html.entities"), + MovedModule("html_parser", "HTMLParser", "html.parser"), + MovedModule("http_client", "httplib", "http.client"), + MovedModule("email_mime_base", "email.MIMEBase", "email.mime.base"), + MovedModule("email_mime_image", "email.MIMEImage", "email.mime.image"), + MovedModule("email_mime_multipart", "email.MIMEMultipart", "email.mime.multipart"), + MovedModule("email_mime_nonmultipart", "email.MIMENonMultipart", "email.mime.nonmultipart"), + MovedModule("email_mime_text", "email.MIMEText", "email.mime.text"), + MovedModule("BaseHTTPServer", "BaseHTTPServer", "http.server"), + MovedModule("CGIHTTPServer", "CGIHTTPServer", "http.server"), + MovedModule("SimpleHTTPServer", "SimpleHTTPServer", "http.server"), + MovedModule("cPickle", "cPickle", "pickle"), + MovedModule("queue", "Queue"), + MovedModule("reprlib", "repr"), + MovedModule("socketserver", "SocketServer"), + MovedModule("_thread", "thread", "_thread"), + MovedModule("tkinter", "Tkinter"), + MovedModule("tkinter_dialog", "Dialog", "tkinter.dialog"), + MovedModule("tkinter_filedialog", "FileDialog", "tkinter.filedialog"), + MovedModule("tkinter_scrolledtext", "ScrolledText", "tkinter.scrolledtext"), + MovedModule("tkinter_simpledialog", "SimpleDialog", "tkinter.simpledialog"), + MovedModule("tkinter_tix", "Tix", "tkinter.tix"), + MovedModule("tkinter_ttk", 
"ttk", "tkinter.ttk"), + MovedModule("tkinter_constants", "Tkconstants", "tkinter.constants"), + MovedModule("tkinter_dnd", "Tkdnd", "tkinter.dnd"), + MovedModule("tkinter_colorchooser", "tkColorChooser", + "tkinter.colorchooser"), + MovedModule("tkinter_commondialog", "tkCommonDialog", + "tkinter.commondialog"), + MovedModule("tkinter_tkfiledialog", "tkFileDialog", "tkinter.filedialog"), + MovedModule("tkinter_font", "tkFont", "tkinter.font"), + MovedModule("tkinter_messagebox", "tkMessageBox", "tkinter.messagebox"), + MovedModule("tkinter_tksimpledialog", "tkSimpleDialog", + "tkinter.simpledialog"), + MovedModule("urllib_parse", __name__ + ".moves.urllib_parse", "urllib.parse"), + MovedModule("urllib_error", __name__ + ".moves.urllib_error", "urllib.error"), + MovedModule("urllib", __name__ + ".moves.urllib", __name__ + ".moves.urllib"), + MovedModule("urllib_robotparser", "robotparser", "urllib.robotparser"), + MovedModule("xmlrpc_client", "xmlrpclib", "xmlrpc.client"), + MovedModule("xmlrpc_server", "SimpleXMLRPCServer", "xmlrpc.server"), +] +# Add windows specific modules. +if sys.platform == "win32": + _moved_attributes += [ + MovedModule("winreg", "_winreg"), + ] + +for attr in _moved_attributes: + setattr(_MovedItems, attr.name, attr) + if isinstance(attr, MovedModule): + _importer._add_module(attr, "moves." 
+ attr.name) +del attr + +_MovedItems._moved_attributes = _moved_attributes + +moves = _MovedItems(__name__ + ".moves") +_importer._add_module(moves, "moves") + + +class Module_six_moves_urllib_parse(_LazyModule): + + """Lazy loading of moved objects in six.moves.urllib_parse""" + + +_urllib_parse_moved_attributes = [ + MovedAttribute("ParseResult", "urlparse", "urllib.parse"), + MovedAttribute("SplitResult", "urlparse", "urllib.parse"), + MovedAttribute("parse_qs", "urlparse", "urllib.parse"), + MovedAttribute("parse_qsl", "urlparse", "urllib.parse"), + MovedAttribute("urldefrag", "urlparse", "urllib.parse"), + MovedAttribute("urljoin", "urlparse", "urllib.parse"), + MovedAttribute("urlparse", "urlparse", "urllib.parse"), + MovedAttribute("urlsplit", "urlparse", "urllib.parse"), + MovedAttribute("urlunparse", "urlparse", "urllib.parse"), + MovedAttribute("urlunsplit", "urlparse", "urllib.parse"), + MovedAttribute("quote", "urllib", "urllib.parse"), + MovedAttribute("quote_plus", "urllib", "urllib.parse"), + MovedAttribute("unquote", "urllib", "urllib.parse"), + MovedAttribute("unquote_plus", "urllib", "urllib.parse"), + MovedAttribute("unquote_to_bytes", "urllib", "urllib.parse", "unquote", "unquote_to_bytes"), + MovedAttribute("urlencode", "urllib", "urllib.parse"), + MovedAttribute("splitquery", "urllib", "urllib.parse"), + MovedAttribute("splittag", "urllib", "urllib.parse"), + MovedAttribute("splituser", "urllib", "urllib.parse"), + MovedAttribute("splitvalue", "urllib", "urllib.parse"), + MovedAttribute("uses_fragment", "urlparse", "urllib.parse"), + MovedAttribute("uses_netloc", "urlparse", "urllib.parse"), + MovedAttribute("uses_params", "urlparse", "urllib.parse"), + MovedAttribute("uses_query", "urlparse", "urllib.parse"), + MovedAttribute("uses_relative", "urlparse", "urllib.parse"), +] +for attr in _urllib_parse_moved_attributes: + setattr(Module_six_moves_urllib_parse, attr.name, attr) +del attr + +Module_six_moves_urllib_parse._moved_attributes = 
_urllib_parse_moved_attributes + +_importer._add_module(Module_six_moves_urllib_parse(__name__ + ".moves.urllib_parse"), + "moves.urllib_parse", "moves.urllib.parse") + + +class Module_six_moves_urllib_error(_LazyModule): + + """Lazy loading of moved objects in six.moves.urllib_error""" + + +_urllib_error_moved_attributes = [ + MovedAttribute("URLError", "urllib2", "urllib.error"), + MovedAttribute("HTTPError", "urllib2", "urllib.error"), + MovedAttribute("ContentTooShortError", "urllib", "urllib.error"), +] +for attr in _urllib_error_moved_attributes: + setattr(Module_six_moves_urllib_error, attr.name, attr) +del attr + +Module_six_moves_urllib_error._moved_attributes = _urllib_error_moved_attributes + +_importer._add_module(Module_six_moves_urllib_error(__name__ + ".moves.urllib.error"), + "moves.urllib_error", "moves.urllib.error") + + +class Module_six_moves_urllib_request(_LazyModule): + + """Lazy loading of moved objects in six.moves.urllib_request""" + + +_urllib_request_moved_attributes = [ + MovedAttribute("urlopen", "urllib2", "urllib.request"), + MovedAttribute("install_opener", "urllib2", "urllib.request"), + MovedAttribute("build_opener", "urllib2", "urllib.request"), + MovedAttribute("pathname2url", "urllib", "urllib.request"), + MovedAttribute("url2pathname", "urllib", "urllib.request"), + MovedAttribute("getproxies", "urllib", "urllib.request"), + MovedAttribute("Request", "urllib2", "urllib.request"), + MovedAttribute("OpenerDirector", "urllib2", "urllib.request"), + MovedAttribute("HTTPDefaultErrorHandler", "urllib2", "urllib.request"), + MovedAttribute("HTTPRedirectHandler", "urllib2", "urllib.request"), + MovedAttribute("HTTPCookieProcessor", "urllib2", "urllib.request"), + MovedAttribute("ProxyHandler", "urllib2", "urllib.request"), + MovedAttribute("BaseHandler", "urllib2", "urllib.request"), + MovedAttribute("HTTPPasswordMgr", "urllib2", "urllib.request"), + MovedAttribute("HTTPPasswordMgrWithDefaultRealm", "urllib2", "urllib.request"), + 
MovedAttribute("AbstractBasicAuthHandler", "urllib2", "urllib.request"), + MovedAttribute("HTTPBasicAuthHandler", "urllib2", "urllib.request"), + MovedAttribute("ProxyBasicAuthHandler", "urllib2", "urllib.request"), + MovedAttribute("AbstractDigestAuthHandler", "urllib2", "urllib.request"), + MovedAttribute("HTTPDigestAuthHandler", "urllib2", "urllib.request"), + MovedAttribute("ProxyDigestAuthHandler", "urllib2", "urllib.request"), + MovedAttribute("HTTPHandler", "urllib2", "urllib.request"), + MovedAttribute("HTTPSHandler", "urllib2", "urllib.request"), + MovedAttribute("FileHandler", "urllib2", "urllib.request"), + MovedAttribute("FTPHandler", "urllib2", "urllib.request"), + MovedAttribute("CacheFTPHandler", "urllib2", "urllib.request"), + MovedAttribute("UnknownHandler", "urllib2", "urllib.request"), + MovedAttribute("HTTPErrorProcessor", "urllib2", "urllib.request"), + MovedAttribute("urlretrieve", "urllib", "urllib.request"), + MovedAttribute("urlcleanup", "urllib", "urllib.request"), + MovedAttribute("proxy_bypass", "urllib", "urllib.request"), + MovedAttribute("parse_http_list", "urllib2", "urllib.request"), + MovedAttribute("parse_keqv_list", "urllib2", "urllib.request"), +] +if sys.version_info[:2] < (3, 14): + _urllib_request_moved_attributes.extend( + [ + MovedAttribute("URLopener", "urllib", "urllib.request"), + MovedAttribute("FancyURLopener", "urllib", "urllib.request"), + ] + ) +for attr in _urllib_request_moved_attributes: + setattr(Module_six_moves_urllib_request, attr.name, attr) +del attr + +Module_six_moves_urllib_request._moved_attributes = _urllib_request_moved_attributes + +_importer._add_module(Module_six_moves_urllib_request(__name__ + ".moves.urllib.request"), + "moves.urllib_request", "moves.urllib.request") + + +class Module_six_moves_urllib_response(_LazyModule): + + """Lazy loading of moved objects in six.moves.urllib_response""" + + +_urllib_response_moved_attributes = [ + MovedAttribute("addbase", "urllib", "urllib.response"), + 
MovedAttribute("addclosehook", "urllib", "urllib.response"), + MovedAttribute("addinfo", "urllib", "urllib.response"), + MovedAttribute("addinfourl", "urllib", "urllib.response"), +] +for attr in _urllib_response_moved_attributes: + setattr(Module_six_moves_urllib_response, attr.name, attr) +del attr + +Module_six_moves_urllib_response._moved_attributes = _urllib_response_moved_attributes + +_importer._add_module(Module_six_moves_urllib_response(__name__ + ".moves.urllib.response"), + "moves.urllib_response", "moves.urllib.response") + + +class Module_six_moves_urllib_robotparser(_LazyModule): + + """Lazy loading of moved objects in six.moves.urllib_robotparser""" + + +_urllib_robotparser_moved_attributes = [ + MovedAttribute("RobotFileParser", "robotparser", "urllib.robotparser"), +] +for attr in _urllib_robotparser_moved_attributes: + setattr(Module_six_moves_urllib_robotparser, attr.name, attr) +del attr + +Module_six_moves_urllib_robotparser._moved_attributes = _urllib_robotparser_moved_attributes + +_importer._add_module(Module_six_moves_urllib_robotparser(__name__ + ".moves.urllib.robotparser"), + "moves.urllib_robotparser", "moves.urllib.robotparser") + + +class Module_six_moves_urllib(types.ModuleType): + + """Create a six.moves.urllib namespace that resembles the Python 3 namespace""" + __path__ = [] # mark as package + parse = _importer._get_module("moves.urllib_parse") + error = _importer._get_module("moves.urllib_error") + request = _importer._get_module("moves.urllib_request") + response = _importer._get_module("moves.urllib_response") + robotparser = _importer._get_module("moves.urllib_robotparser") + + def __dir__(self): + return ['parse', 'error', 'request', 'response', 'robotparser'] + +_importer._add_module(Module_six_moves_urllib(__name__ + ".moves.urllib"), + "moves.urllib") + + +def add_move(move): + """Add an item to six.moves.""" + setattr(_MovedItems, move.name, move) + + +def remove_move(name): + """Remove item from six.moves.""" + try: + 
delattr(_MovedItems, name) + except AttributeError: + try: + del moves.__dict__[name] + except KeyError: + raise AttributeError("no such move, %r" % (name,)) + + +if PY3: + _meth_func = "__func__" + _meth_self = "__self__" + + _func_closure = "__closure__" + _func_code = "__code__" + _func_defaults = "__defaults__" + _func_globals = "__globals__" +else: + _meth_func = "im_func" + _meth_self = "im_self" + + _func_closure = "func_closure" + _func_code = "func_code" + _func_defaults = "func_defaults" + _func_globals = "func_globals" + + +try: + advance_iterator = next +except NameError: + def advance_iterator(it): + return it.next() +next = advance_iterator + + +try: + callable = callable +except NameError: + def callable(obj): + return any("__call__" in klass.__dict__ for klass in type(obj).__mro__) + + +if PY3: + def get_unbound_function(unbound): + return unbound + + create_bound_method = types.MethodType + + def create_unbound_method(func, cls): + return func + + Iterator = object +else: + def get_unbound_function(unbound): + return unbound.im_func + + def create_bound_method(func, obj): + return types.MethodType(func, obj, obj.__class__) + + def create_unbound_method(func, cls): + return types.MethodType(func, None, cls) + + class Iterator(object): + + def next(self): + return type(self).__next__(self) + + callable = callable +_add_doc(get_unbound_function, + """Get the function out of a possibly unbound function""") + + +get_method_function = operator.attrgetter(_meth_func) +get_method_self = operator.attrgetter(_meth_self) +get_function_closure = operator.attrgetter(_func_closure) +get_function_code = operator.attrgetter(_func_code) +get_function_defaults = operator.attrgetter(_func_defaults) +get_function_globals = operator.attrgetter(_func_globals) + + +if PY3: + def iterkeys(d, **kw): + return iter(d.keys(**kw)) + + def itervalues(d, **kw): + return iter(d.values(**kw)) + + def iteritems(d, **kw): + return iter(d.items(**kw)) + + def iterlists(d, **kw): + 
return iter(d.lists(**kw)) + + viewkeys = operator.methodcaller("keys") + + viewvalues = operator.methodcaller("values") + + viewitems = operator.methodcaller("items") +else: + def iterkeys(d, **kw): + return d.iterkeys(**kw) + + def itervalues(d, **kw): + return d.itervalues(**kw) + + def iteritems(d, **kw): + return d.iteritems(**kw) + + def iterlists(d, **kw): + return d.iterlists(**kw) + + viewkeys = operator.methodcaller("viewkeys") + + viewvalues = operator.methodcaller("viewvalues") + + viewitems = operator.methodcaller("viewitems") + +_add_doc(iterkeys, "Return an iterator over the keys of a dictionary.") +_add_doc(itervalues, "Return an iterator over the values of a dictionary.") +_add_doc(iteritems, + "Return an iterator over the (key, value) pairs of a dictionary.") +_add_doc(iterlists, + "Return an iterator over the (key, [values]) pairs of a dictionary.") + + +if PY3: + def b(s): + return s.encode("latin-1") + + def u(s): + return s + unichr = chr + import struct + int2byte = struct.Struct(">B").pack + del struct + byte2int = operator.itemgetter(0) + indexbytes = operator.getitem + iterbytes = iter + import io + StringIO = io.StringIO + BytesIO = io.BytesIO + del io + _assertCountEqual = "assertCountEqual" + if sys.version_info[1] <= 1: + _assertRaisesRegex = "assertRaisesRegexp" + _assertRegex = "assertRegexpMatches" + _assertNotRegex = "assertNotRegexpMatches" + else: + _assertRaisesRegex = "assertRaisesRegex" + _assertRegex = "assertRegex" + _assertNotRegex = "assertNotRegex" +else: + def b(s): + return s + # Workaround for standalone backslash + + def u(s): + return unicode(s.replace(r'\\', r'\\\\'), "unicode_escape") + unichr = unichr + int2byte = chr + + def byte2int(bs): + return ord(bs[0]) + + def indexbytes(buf, i): + return ord(buf[i]) + iterbytes = functools.partial(itertools.imap, ord) + import StringIO + StringIO = BytesIO = StringIO.StringIO + _assertCountEqual = "assertItemsEqual" + _assertRaisesRegex = "assertRaisesRegexp" + 
_assertRegex = "assertRegexpMatches" + _assertNotRegex = "assertNotRegexpMatches" +_add_doc(b, """Byte literal""") +_add_doc(u, """Text literal""") + + +def assertCountEqual(self, *args, **kwargs): + return getattr(self, _assertCountEqual)(*args, **kwargs) + + +def assertRaisesRegex(self, *args, **kwargs): + return getattr(self, _assertRaisesRegex)(*args, **kwargs) + + +def assertRegex(self, *args, **kwargs): + return getattr(self, _assertRegex)(*args, **kwargs) + + +def assertNotRegex(self, *args, **kwargs): + return getattr(self, _assertNotRegex)(*args, **kwargs) + + +if PY3: + exec_ = getattr(moves.builtins, "exec") + + def reraise(tp, value, tb=None): + try: + if value is None: + value = tp() + if value.__traceback__ is not tb: + raise value.with_traceback(tb) + raise value + finally: + value = None + tb = None + +else: + def exec_(_code_, _globs_=None, _locs_=None): + """Execute code in a namespace.""" + if _globs_ is None: + frame = sys._getframe(1) + _globs_ = frame.f_globals + if _locs_ is None: + _locs_ = frame.f_locals + del frame + elif _locs_ is None: + _locs_ = _globs_ + exec("""exec _code_ in _globs_, _locs_""") + + exec_("""def reraise(tp, value, tb=None): + try: + raise tp, value, tb + finally: + tb = None +""") + + +if sys.version_info[:2] > (3,): + exec_("""def raise_from(value, from_value): + try: + raise value from from_value + finally: + value = None +""") +else: + def raise_from(value, from_value): + raise value + + +print_ = getattr(moves.builtins, "print", None) +if print_ is None: + def print_(*args, **kwargs): + """The new-style print function for Python 2.4 and 2.5.""" + fp = kwargs.pop("file", sys.stdout) + if fp is None: + return + + def write(data): + if not isinstance(data, basestring): + data = str(data) + # If the file has an encoding, encode unicode with it. 
+ if (isinstance(fp, file) and + isinstance(data, unicode) and + fp.encoding is not None): + errors = getattr(fp, "errors", None) + if errors is None: + errors = "strict" + data = data.encode(fp.encoding, errors) + fp.write(data) + want_unicode = False + sep = kwargs.pop("sep", None) + if sep is not None: + if isinstance(sep, unicode): + want_unicode = True + elif not isinstance(sep, str): + raise TypeError("sep must be None or a string") + end = kwargs.pop("end", None) + if end is not None: + if isinstance(end, unicode): + want_unicode = True + elif not isinstance(end, str): + raise TypeError("end must be None or a string") + if kwargs: + raise TypeError("invalid keyword arguments to print()") + if not want_unicode: + for arg in args: + if isinstance(arg, unicode): + want_unicode = True + break + if want_unicode: + newline = unicode("\n") + space = unicode(" ") + else: + newline = "\n" + space = " " + if sep is None: + sep = space + if end is None: + end = newline + for i, arg in enumerate(args): + if i: + write(sep) + write(arg) + write(end) +if sys.version_info[:2] < (3, 3): + _print = print_ + + def print_(*args, **kwargs): + fp = kwargs.get("file", sys.stdout) + flush = kwargs.pop("flush", False) + _print(*args, **kwargs) + if flush and fp is not None: + fp.flush() + +_add_doc(reraise, """Reraise an exception.""") + +if sys.version_info[0:2] < (3, 4): + # This does exactly the same what the :func:`py3:functools.update_wrapper` + # function does on Python versions after 3.2. It sets the ``__wrapped__`` + # attribute on ``wrapper`` object and it doesn't raise an error if any of + # the attributes mentioned in ``assigned`` and ``updated`` are missing on + # ``wrapped`` object. 
+ def _update_wrapper(wrapper, wrapped, + assigned=functools.WRAPPER_ASSIGNMENTS, + updated=functools.WRAPPER_UPDATES): + for attr in assigned: + try: + value = getattr(wrapped, attr) + except AttributeError: + continue + else: + setattr(wrapper, attr, value) + for attr in updated: + getattr(wrapper, attr).update(getattr(wrapped, attr, {})) + wrapper.__wrapped__ = wrapped + return wrapper + _update_wrapper.__doc__ = functools.update_wrapper.__doc__ + + def wraps(wrapped, assigned=functools.WRAPPER_ASSIGNMENTS, + updated=functools.WRAPPER_UPDATES): + return functools.partial(_update_wrapper, wrapped=wrapped, + assigned=assigned, updated=updated) + wraps.__doc__ = functools.wraps.__doc__ + +else: + wraps = functools.wraps + + +def with_metaclass(meta, *bases): + """Create a base class with a metaclass.""" + # This requires a bit of explanation: the basic idea is to make a dummy + # metaclass for one level of class instantiation that replaces itself with + # the actual metaclass. + class metaclass(type): + + def __new__(cls, name, this_bases, d): + if sys.version_info[:2] >= (3, 7): + # This version introduced PEP 560 that requires a bit + # of extra care (we mimic what is done by __build_class__). 
+ resolved_bases = types.resolve_bases(bases) + if resolved_bases is not bases: + d['__orig_bases__'] = bases + else: + resolved_bases = bases + return meta(name, resolved_bases, d) + + @classmethod + def __prepare__(cls, name, this_bases): + return meta.__prepare__(name, bases) + return type.__new__(metaclass, 'temporary_class', (), {}) + + +def add_metaclass(metaclass): + """Class decorator for creating a class with a metaclass.""" + def wrapper(cls): + orig_vars = cls.__dict__.copy() + slots = orig_vars.get('__slots__') + if slots is not None: + if isinstance(slots, str): + slots = [slots] + for slots_var in slots: + orig_vars.pop(slots_var) + orig_vars.pop('__dict__', None) + orig_vars.pop('__weakref__', None) + if hasattr(cls, '__qualname__'): + orig_vars['__qualname__'] = cls.__qualname__ + return metaclass(cls.__name__, cls.__bases__, orig_vars) + return wrapper + + +def ensure_binary(s, encoding='utf-8', errors='strict'): + """Coerce **s** to six.binary_type. + + For Python 2: + - `unicode` -> encoded to `str` + - `str` -> `str` + + For Python 3: + - `str` -> encoded to `bytes` + - `bytes` -> `bytes` + """ + if isinstance(s, binary_type): + return s + if isinstance(s, text_type): + return s.encode(encoding, errors) + raise TypeError("not expecting type '%s'" % type(s)) + + +def ensure_str(s, encoding='utf-8', errors='strict'): + """Coerce *s* to `str`. + + For Python 2: + - `unicode` -> encoded to `str` + - `str` -> `str` + + For Python 3: + - `str` -> `str` + - `bytes` -> decoded to `str` + """ + # Optimization: Fast return for the common case. + if type(s) is str: + return s + if PY2 and isinstance(s, text_type): + return s.encode(encoding, errors) + elif PY3 and isinstance(s, binary_type): + return s.decode(encoding, errors) + elif not isinstance(s, (text_type, binary_type)): + raise TypeError("not expecting type '%s'" % type(s)) + return s + + +def ensure_text(s, encoding='utf-8', errors='strict'): + """Coerce *s* to six.text_type. 
+ + For Python 2: + - `unicode` -> `unicode` + - `str` -> `unicode` + + For Python 3: + - `str` -> `str` + - `bytes` -> decoded to `str` + """ + if isinstance(s, binary_type): + return s.decode(encoding, errors) + elif isinstance(s, text_type): + return s + else: + raise TypeError("not expecting type '%s'" % type(s)) + + +def python_2_unicode_compatible(klass): + """ + A class decorator that defines __unicode__ and __str__ methods under Python 2. + Under Python 3 it does nothing. + + To support Python 2 and 3 with a single code base, define a __str__ method + returning text and apply this decorator to the class. + """ + if PY2: + if '__str__' not in klass.__dict__: + raise ValueError("@python_2_unicode_compatible cannot be applied " + "to %s because it doesn't define __str__()." % + klass.__name__) + klass.__unicode__ = klass.__str__ + klass.__str__ = lambda self: self.__unicode__().encode('utf-8') + return klass + + +# Complete the moves implementation. +# This code is at the end of this module to speed up module loading. +# Turn this module into a package. +__path__ = [] # required for PEP 302 and PEP 451 +__package__ = __name__ # see PEP 366 @ReservedAssignment +if globals().get("__spec__") is not None: + __spec__.submodule_search_locations = [] # PEP 451 @UndefinedVariable +# Remove other six meta path importers, since they cause problems. This can +# happen if six is removed from sys.modules and then reloaded. (Setuptools does +# this for some reason.) +if sys.meta_path: + for i, importer in enumerate(sys.meta_path): + # Here's some real nastiness: Another "instance" of the six module might + # be floating around. Therefore, we can't use isinstance() to check for + # the six meta path importer, since the other six instance will have + # inserted an importer with different class. 
+ if (type(importer).__name__ == "_SixMetaPathImporter" and + importer.name == __name__): + del sys.meta_path[i] + break + del i, importer +# Finally, add the importer to the meta path import hook. +sys.meta_path.append(_importer)