fuuuzzy committed on
Commit
fdb0460
·
verified ·
1 Parent(s): 2d38e41

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +22 -0
  2. .venv/lib/python3.11/site-packages/datasets/__init__.py +47 -0
  3. .venv/lib/python3.11/site-packages/datasets/__pycache__/__init__.cpython-311.pyc +0 -0
  4. .venv/lib/python3.11/site-packages/datasets/__pycache__/arrow_dataset.cpython-311.pyc +3 -0
  5. .venv/lib/python3.11/site-packages/datasets/__pycache__/arrow_reader.cpython-311.pyc +0 -0
  6. .venv/lib/python3.11/site-packages/datasets/__pycache__/arrow_writer.cpython-311.pyc +0 -0
  7. .venv/lib/python3.11/site-packages/datasets/__pycache__/builder.cpython-311.pyc +0 -0
  8. .venv/lib/python3.11/site-packages/datasets/__pycache__/combine.cpython-311.pyc +0 -0
  9. .venv/lib/python3.11/site-packages/datasets/__pycache__/config.cpython-311.pyc +0 -0
  10. .venv/lib/python3.11/site-packages/datasets/__pycache__/data_files.cpython-311.pyc +0 -0
  11. .venv/lib/python3.11/site-packages/datasets/__pycache__/dataset_dict.cpython-311.pyc +3 -0
  12. .venv/lib/python3.11/site-packages/datasets/__pycache__/exceptions.cpython-311.pyc +0 -0
  13. .venv/lib/python3.11/site-packages/datasets/__pycache__/fingerprint.cpython-311.pyc +0 -0
  14. .venv/lib/python3.11/site-packages/datasets/__pycache__/info.cpython-311.pyc +0 -0
  15. .venv/lib/python3.11/site-packages/datasets/__pycache__/inspect.cpython-311.pyc +0 -0
  16. .venv/lib/python3.11/site-packages/datasets/__pycache__/iterable_dataset.cpython-311.pyc +3 -0
  17. .venv/lib/python3.11/site-packages/datasets/__pycache__/keyhash.cpython-311.pyc +0 -0
  18. .venv/lib/python3.11/site-packages/datasets/__pycache__/load.cpython-311.pyc +0 -0
  19. .venv/lib/python3.11/site-packages/datasets/__pycache__/naming.cpython-311.pyc +0 -0
  20. .venv/lib/python3.11/site-packages/datasets/__pycache__/search.cpython-311.pyc +0 -0
  21. .venv/lib/python3.11/site-packages/datasets/__pycache__/splits.cpython-311.pyc +0 -0
  22. .venv/lib/python3.11/site-packages/datasets/__pycache__/streaming.cpython-311.pyc +0 -0
  23. .venv/lib/python3.11/site-packages/datasets/__pycache__/table.cpython-311.pyc +3 -0
  24. .venv/lib/python3.11/site-packages/datasets/arrow_dataset.py +0 -0
  25. .venv/lib/python3.11/site-packages/datasets/arrow_reader.py +620 -0
  26. .venv/lib/python3.11/site-packages/datasets/arrow_writer.py +766 -0
  27. .venv/lib/python3.11/site-packages/datasets/builder.py +1866 -0
  28. .venv/lib/python3.11/site-packages/datasets/combine.py +223 -0
  29. .venv/lib/python3.11/site-packages/datasets/commands/__init__.py +13 -0
  30. .venv/lib/python3.11/site-packages/datasets/commands/datasets_cli.py +39 -0
  31. .venv/lib/python3.11/site-packages/datasets/commands/delete_from_hub.py +42 -0
  32. .venv/lib/python3.11/site-packages/datasets/commands/env.py +41 -0
  33. .venv/lib/python3.11/site-packages/datasets/commands/test.py +180 -0
  34. .venv/lib/python3.11/site-packages/datasets/config.py +271 -0
  35. .venv/lib/python3.11/site-packages/datasets/data_files.py +811 -0
  36. .venv/lib/python3.11/site-packages/datasets/dataset_dict.py +0 -0
  37. .venv/lib/python3.11/site-packages/datasets/distributed.py +39 -0
  38. .venv/lib/python3.11/site-packages/datasets/download/__init__.py +10 -0
  39. .venv/lib/python3.11/site-packages/datasets/download/__pycache__/__init__.cpython-311.pyc +0 -0
  40. .venv/lib/python3.11/site-packages/datasets/download/__pycache__/download_config.cpython-311.pyc +0 -0
  41. .venv/lib/python3.11/site-packages/datasets/download/__pycache__/download_manager.cpython-311.pyc +0 -0
  42. .venv/lib/python3.11/site-packages/datasets/download/__pycache__/streaming_download_manager.cpython-311.pyc +0 -0
  43. .venv/lib/python3.11/site-packages/datasets/download/download_config.py +81 -0
  44. .venv/lib/python3.11/site-packages/datasets/download/download_manager.py +340 -0
  45. .venv/lib/python3.11/site-packages/datasets/download/streaming_download_manager.py +219 -0
  46. .venv/lib/python3.11/site-packages/datasets/exceptions.py +119 -0
  47. .venv/lib/python3.11/site-packages/datasets/features/__init__.py +26 -0
  48. .venv/lib/python3.11/site-packages/datasets/features/__pycache__/__init__.cpython-311.pyc +0 -0
  49. .venv/lib/python3.11/site-packages/datasets/features/__pycache__/audio.cpython-311.pyc +0 -0
  50. .venv/lib/python3.11/site-packages/datasets/features/__pycache__/features.cpython-311.pyc +3 -0
.gitattributes CHANGED
@@ -64,3 +64,25 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
64
  .venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/lib/libnvrtc.so.12 filter=lfs diff=lfs merge=lfs -text
65
  .venv/lib/python3.11/site-packages/huggingface_hub/__pycache__/hf_api.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
66
  .venv/lib/python3.11/site-packages/nvidia/cudnn/lib/libcudnn_graph.so.9 filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
  .venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/lib/libnvrtc.so.12 filter=lfs diff=lfs merge=lfs -text
65
  .venv/lib/python3.11/site-packages/huggingface_hub/__pycache__/hf_api.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
66
  .venv/lib/python3.11/site-packages/nvidia/cudnn/lib/libcudnn_graph.so.9 filter=lfs diff=lfs merge=lfs -text
67
+ .venv/lib/python3.11/site-packages/nvidia/cuda_cupti/lib/libnvperf_target.so filter=lfs diff=lfs merge=lfs -text
68
+ .venv/lib/python3.11/site-packages/nvidia/cuda_cupti/lib/libcupti.so.12 filter=lfs diff=lfs merge=lfs -text
69
+ .venv/lib/python3.11/site-packages/datasets/features/__pycache__/features.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
70
+ .venv/lib/python3.11/site-packages/datasets/__pycache__/arrow_dataset.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
71
+ .venv/lib/python3.11/site-packages/datasets/__pycache__/iterable_dataset.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
72
+ .venv/lib/python3.11/site-packages/datasets/__pycache__/dataset_dict.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
73
+ .venv/lib/python3.11/site-packages/datasets/__pycache__/table.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
74
+ .venv/lib/python3.11/site-packages/yaml/_yaml.cpython-311-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
75
+ .venv/lib/python3.11/site-packages/nvidia/cublas/lib/libcublas.so.12 filter=lfs diff=lfs merge=lfs -text
76
+ .venv/lib/python3.11/site-packages/nvidia/curand/lib/libcurand.so.10 filter=lfs diff=lfs merge=lfs -text
77
+ .venv/lib/python3.11/site-packages/nvidia/nvshmem/lib/libnvshmem_device.bc filter=lfs diff=lfs merge=lfs -text
78
+ .venv/lib/python3.11/site-packages/nvidia/cudnn/lib/libcudnn_heuristic.so.9 filter=lfs diff=lfs merge=lfs -text
79
+ .venv/lib/python3.11/site-packages/nvidia/cusolver/lib/libcusolverMg.so.11 filter=lfs diff=lfs merge=lfs -text
80
+ .venv/lib/python3.11/site-packages/nvidia/cusolver/lib/libcusolver.so.11 filter=lfs diff=lfs merge=lfs -text
81
+ .venv/lib/python3.11/site-packages/nvidia/cufft/lib/libcufft.so.11 filter=lfs diff=lfs merge=lfs -text
82
+ .venv/lib/python3.11/site-packages/nvidia/cusparse/lib/libcusparse.so.12 filter=lfs diff=lfs merge=lfs -text
83
+ .venv/lib/python3.11/site-packages/nvidia/cuda_cupti/lib/libnvperf_host.so filter=lfs diff=lfs merge=lfs -text
84
+ .venv/lib/python3.11/site-packages/nvidia/cufile/lib/libcufile.so.0 filter=lfs diff=lfs merge=lfs -text
85
+ .venv/lib/python3.11/site-packages/nvidia/cuda_runtime/lib/libcudart.so.12 filter=lfs diff=lfs merge=lfs -text
86
+ .venv/lib/python3.11/site-packages/nvidia/cudnn/lib/libcudnn_ops.so.9 filter=lfs diff=lfs merge=lfs -text
87
+ .venv/lib/python3.11/site-packages/nvidia/cudnn/lib/libcudnn_adv.so.9 filter=lfs diff=lfs merge=lfs -text
88
+ .venv/lib/python3.11/site-packages/nvidia/nvshmem/lib/libnvshmem_host.so.3 filter=lfs diff=lfs merge=lfs -text
.venv/lib/python3.11/site-packages/datasets/__init__.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2020 The HuggingFace Datasets Authors and the TensorFlow Datasets Authors.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ __version__ = "4.4.1"
16
+
17
+ from .arrow_dataset import Column, Dataset
18
+ from .arrow_reader import ReadInstruction
19
+ from .builder import ArrowBasedBuilder, BuilderConfig, DatasetBuilder, GeneratorBasedBuilder
20
+ from .combine import concatenate_datasets, interleave_datasets
21
+ from .dataset_dict import DatasetDict, IterableDatasetDict
22
+ from .download import *
23
+ from .features import *
24
+ from .fingerprint import disable_caching, enable_caching, is_caching_enabled
25
+ from .info import DatasetInfo
26
+ from .inspect import (
27
+ get_dataset_config_info,
28
+ get_dataset_config_names,
29
+ get_dataset_default_config_name,
30
+ get_dataset_infos,
31
+ get_dataset_split_names,
32
+ )
33
+ from .iterable_dataset import IterableColumn, IterableDataset
34
+ from .load import load_dataset, load_dataset_builder, load_from_disk
35
+ from .splits import (
36
+ NamedSplit,
37
+ NamedSplitAll,
38
+ Split,
39
+ SplitBase,
40
+ SplitDict,
41
+ SplitGenerator,
42
+ SplitInfo,
43
+ SubSplitInfo,
44
+ percent,
45
+ )
46
+ from .utils import *
47
+ from .utils import logging
.venv/lib/python3.11/site-packages/datasets/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (1.87 kB). View file
 
.venv/lib/python3.11/site-packages/datasets/__pycache__/arrow_dataset.cpython-311.pyc ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bce41b6d6ccd430a8b99967f0b45c68048991596350316f088e4d4fe78d909f3
3
+ size 350055
.venv/lib/python3.11/site-packages/datasets/__pycache__/arrow_reader.cpython-311.pyc ADDED
Binary file (31.4 kB). View file
 
.venv/lib/python3.11/site-packages/datasets/__pycache__/arrow_writer.cpython-311.pyc ADDED
Binary file (44.5 kB). View file
 
.venv/lib/python3.11/site-packages/datasets/__pycache__/builder.cpython-311.pyc ADDED
Binary file (99.9 kB). View file
 
.venv/lib/python3.11/site-packages/datasets/__pycache__/combine.cpython-311.pyc ADDED
Binary file (11.9 kB). View file
 
.venv/lib/python3.11/site-packages/datasets/__pycache__/config.cpython-311.pyc ADDED
Binary file (12 kB). View file
 
.venv/lib/python3.11/site-packages/datasets/__pycache__/data_files.cpython-311.pyc ADDED
Binary file (44.3 kB). View file
 
.venv/lib/python3.11/site-packages/datasets/__pycache__/dataset_dict.cpython-311.pyc ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e341f4b5d4dcbcf7f0c3fafd96cb990b9e3d15edfe7b88f633a0bf65e79c3c34
3
+ size 149986
.venv/lib/python3.11/site-packages/datasets/__pycache__/exceptions.cpython-311.pyc ADDED
Binary file (7.9 kB). View file
 
.venv/lib/python3.11/site-packages/datasets/__pycache__/fingerprint.cpython-311.pyc ADDED
Binary file (24.4 kB). View file
 
.venv/lib/python3.11/site-packages/datasets/__pycache__/info.cpython-311.pyc ADDED
Binary file (29.2 kB). View file
 
.venv/lib/python3.11/site-packages/datasets/__pycache__/inspect.cpython-311.pyc ADDED
Binary file (16.8 kB). View file
 
.venv/lib/python3.11/site-packages/datasets/__pycache__/iterable_dataset.cpython-311.pyc ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3d8aa868348076f7093248ba04fc0c398cf44d39457261531e63e0ae717f7bb7
3
+ size 262991
.venv/lib/python3.11/site-packages/datasets/__pycache__/keyhash.cpython-311.pyc ADDED
Binary file (5.36 kB). View file
 
.venv/lib/python3.11/site-packages/datasets/__pycache__/load.cpython-311.pyc ADDED
Binary file (75.3 kB). View file
 
.venv/lib/python3.11/site-packages/datasets/__pycache__/naming.cpython-311.pyc ADDED
Binary file (4.76 kB). View file
 
.venv/lib/python3.11/site-packages/datasets/__pycache__/search.cpython-311.pyc ADDED
Binary file (46.9 kB). View file
 
.venv/lib/python3.11/site-packages/datasets/__pycache__/splits.cpython-311.pyc ADDED
Binary file (32.8 kB). View file
 
.venv/lib/python3.11/site-packages/datasets/__pycache__/streaming.cpython-311.pyc ADDED
Binary file (8.46 kB). View file
 
.venv/lib/python3.11/site-packages/datasets/__pycache__/table.cpython-311.pyc ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8c3c3494184b2e8541c43f547b4b6d02e41755ed903dadb22854c60d68556fba
3
+ size 121451
.venv/lib/python3.11/site-packages/datasets/arrow_dataset.py ADDED
The diff for this file is too large to render. See raw diff
 
.venv/lib/python3.11/site-packages/datasets/arrow_reader.py ADDED
@@ -0,0 +1,620 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2020 The HuggingFace Datasets Authors and the TensorFlow Datasets Authors.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ # Lint as: python3
16
+ """Arrow ArrowReader."""
17
+
18
+ import copy
19
+ import math
20
+ import os
21
+ import re
22
+ from dataclasses import dataclass
23
+ from functools import partial
24
+ from typing import TYPE_CHECKING, Optional, Union
25
+
26
+ import pyarrow as pa
27
+ import pyarrow.parquet as pq
28
+ from tqdm.contrib.concurrent import thread_map
29
+
30
+ from .download.download_config import DownloadConfig # noqa: F401
31
+ from .naming import _split_re, filenames_for_dataset_split
32
+ from .table import InMemoryTable, MemoryMappedTable, Table, concat_tables
33
+ from .utils import logging
34
+ from .utils import tqdm as hf_tqdm
35
+
36
+
37
+ if TYPE_CHECKING:
38
+ from .info import DatasetInfo # noqa: F401
39
+ from .splits import Split, SplitInfo # noqa: F401
40
+
41
+
42
+ logger = logging.get_logger(__name__)
43
+
44
+ HF_GCP_BASE_URL = "https://storage.googleapis.com/huggingface-nlp/cache/datasets"
45
+
46
+ _SUB_SPEC_RE = re.compile(
47
+ rf"""
48
+ ^
49
+ (?P<split>{_split_re[1:-1]})
50
+ (\[
51
+ ((?P<from>-?[\d_]+)
52
+ (?P<from_pct>%)?)?
53
+ :
54
+ ((?P<to>-?[\d_]+)
55
+ (?P<to_pct>%)?)?
56
+ \])?(\((?P<rounding>[^\)]*)\))?
57
+ $
58
+ """, # remove ^ and $
59
+ re.X,
60
+ )
61
+
62
+ _ADDITION_SEP_RE = re.compile(r"\s*\+\s*")
63
+
64
+
65
class DatasetNotOnHfGcsError(ConnectionError):
    """Raised when the dataset cannot be retrieved from the Hf google cloud storage."""

    pass
69
+
70
+
71
class MissingFilesOnHfGcsError(ConnectionError):
    """Raised when some dataset files are missing on the Hf Google cloud storage."""

    pass
75
+
76
+
77
@dataclass(frozen=True)
class FileInstructions:
    """File-level reading instructions derived from a split `ReadInstruction`.

    Attributes:
        num_examples: `int`, total number of examples covered by the instructions.
        file_instructions: List[dict(filename, skip, take)], one entry per file.
            Filenames are relative paths, not absolute ones.
            `skip`/`take` select which slice of the file to read: `ds.slice(skip, take)`.
    """

    num_examples: int
    file_instructions: list[dict]
90
+
91
+
92
def make_file_instructions(
    name: str,
    split_infos: list["SplitInfo"],
    instruction: Union[str, "ReadInstruction"],
    filetype_suffix: Optional[str] = None,
    prefix_path: Optional[str] = None,
) -> FileInstructions:
    """Returns instructions of the split dict.

    Args:
        name (`str`): Name of the dataset.
        split_infos (`list` of `[SplitInfo]`): Dataset splits information.
        instruction ([`ReadInstruction`] or `str`): Reading instruction for a dataset.
        filetype_suffix (`str`, *optional*): Suffix of dataset files, e.g. 'arrow' or 'parquet'.
        prefix_path (`str`, *optional*): Prefix of dataset files, e.g. directory name.

    Returns:
        [`FileInstructions`]
    """
    if not isinstance(name, str):
        raise TypeError(f"Expected str 'name', but got: {type(name).__name__}")
    elif not name:
        raise ValueError("Expected non-empty str 'name'")
    # Per-split lookup tables: total example count, shard sizes, and on-disk filenames.
    name2len = {info.name: info.num_examples for info in split_infos}
    name2shard_lengths = {info.name: info.shard_lengths for info in split_infos}
    name2filenames = {
        info.name: filenames_for_dataset_split(
            path=prefix_path,
            dataset_name=name,
            split=info.name,
            filetype_suffix=filetype_suffix,
            shard_lengths=name2shard_lengths[info.name],
        )
        for info in split_infos
    }
    # Accept a plain spec string such as "train[10:20]" as well as a ReadInstruction.
    if not isinstance(instruction, ReadInstruction):
        instruction = ReadInstruction.from_spec(instruction)
    # Create the absolute instruction (per split)
    absolute_instructions = instruction.to_absolute(name2len)

    # For each split, return the files instruction (skip/take)
    file_instructions = []
    num_examples = 0
    for abs_instr in absolute_instructions:
        split_length = name2len[abs_instr.splitname]
        filenames = name2filenames[abs_instr.splitname]
        shard_lengths = name2shard_lengths[abs_instr.splitname]
        from_ = 0 if abs_instr.from_ is None else abs_instr.from_
        to = split_length if abs_instr.to is None else abs_instr.to
        if shard_lengths is None:  # not sharded
            for filename in filenames:
                take = to - from_
                if take == 0:
                    continue
                num_examples += take
                file_instructions.append({"filename": filename, "skip": from_, "take": take})
        else:  # sharded
            # Walk shards with a moving [index_start, index_end) window over global indices
            # and intersect each shard's window with the requested [from_, to) range.
            index_start = 0  # Beginning (included) of moving window.
            index_end = 0  # End (excluded) of moving window.
            for filename, shard_length in zip(filenames, shard_lengths):
                index_end += shard_length
                if from_ < index_end and to > index_start:  # There is something to take.
                    skip = from_ - index_start if from_ > index_start else 0
                    # take == -1 is a sentinel meaning "read this shard through to its end".
                    take = to - index_start - skip if to < index_end else -1
                    if take == 0:
                        continue
                    file_instructions.append({"filename": filename, "skip": skip, "take": take})
                    num_examples += shard_length - skip if take == -1 else take
                index_start += shard_length
    return FileInstructions(
        num_examples=num_examples,
        file_instructions=file_instructions,
    )
165
+
166
+
167
class BaseReader:
    """
    Build a Dataset object out of Instruction instance(s).
    """

    def __init__(self, path: str, info: Optional["DatasetInfo"]):
        """Initializes ArrowReader.

        Args:
            path (str): path where tfrecords are stored.
            info (DatasetInfo): info about the dataset.
        """
        self._path: str = path
        self._info: Optional["DatasetInfo"] = info
        # Set by concrete subclasses (e.g. "arrow" or "parquet").
        self._filetype_suffix: Optional[str] = None

    def _get_table_from_filename(self, filename_skip_take, in_memory=False) -> Table:
        """Returns a Dataset instance from given (filename, skip, take)."""
        # Subclasses implement the actual file-format-specific read.
        raise NotImplementedError

    def _read_files(self, files, in_memory=False) -> Table:
        """Returns Dataset for given file instructions.

        Args:
            files: List[dict(filename, skip, take)], the files information.
                The filenames contain the absolute path, not relative.
                skip/take indicates which example read in the file: `ds.slice(skip, take)`
            in_memory (bool, default False): Whether to copy the data in-memory.
        """
        if len(files) == 0 or not all(isinstance(f, dict) for f in files):
            raise ValueError("please provide valid file informations")
        # Deep-copy so the caller's instruction dicts are not mutated below.
        files = copy.deepcopy(files)
        for f in files:
            f["filename"] = os.path.join(self._path, f["filename"])

        # Read shards concurrently; thread_map shows a progress bar for many shards.
        pa_tables = thread_map(
            partial(self._get_table_from_filename, in_memory=in_memory),
            files,
            tqdm_class=hf_tqdm,
            desc="Loading dataset shards",
            # set `disable=None` rather than `disable=False` by default to disable progress bar when no TTY attached
            disable=len(files) <= 16 or None,
        )
        # Drop empty shards; an all-empty result needs features to build a typed empty table.
        pa_tables = [t for t in pa_tables if len(t) > 0]
        if not pa_tables and (self._info is None or self._info.features is None):
            raise ValueError(
                "Tried to read an empty table. Please specify at least info.features to create an empty table with the right type."
            )
        pa_tables = pa_tables or [InMemoryTable.from_batches([], schema=pa.schema(self._info.features.type))]
        # Avoid a needless concat when there is a single shard.
        pa_table = concat_tables(pa_tables) if len(pa_tables) != 1 else pa_tables[0]
        return pa_table

    def get_file_instructions(self, name, instruction, split_infos):
        """Return list of dict {'filename': str, 'skip': int, 'take': int}"""
        file_instructions = make_file_instructions(
            name, split_infos, instruction, filetype_suffix=self._filetype_suffix, prefix_path=self._path
        )
        files = file_instructions.file_instructions
        return files

    def read(
        self,
        name,
        instructions,
        split_infos,
        in_memory=False,
    ):
        """Returns Dataset instance(s).

        Args:
            name (str): name of the dataset.
            instructions (ReadInstruction): instructions to read.
                Instruction can be string and will then be passed to the Instruction
                constructor as it.
            split_infos (list of SplitInfo proto): the available splits for dataset.
            in_memory (bool, default False): Whether to copy the data in-memory.

        Returns:
            kwargs to build a single Dataset instance.

        Raises:
            ValueError: if the instruction matches no data at all.
        """

        files = self.get_file_instructions(name, instructions, split_infos)
        if not files:
            msg = f'Instruction "{instructions}" corresponds to no data!'
            raise ValueError(msg)
        return self.read_files(files=files, original_instructions=instructions, in_memory=in_memory)

    def read_files(
        self,
        files: list[dict],
        original_instructions: Union[None, "ReadInstruction", "Split"] = None,
        in_memory=False,
    ):
        """Returns single Dataset instance for the set of file instructions.

        Args:
            files: List[dict(filename, skip, take)], the files information.
                The filenames contains the relative path, not absolute.
                skip/take indicates which example read in the file: `ds.skip().take()`
            original_instructions: store the original instructions used to build the dataset split in the dataset.
            in_memory (bool, default False): Whether to copy the data in-memory.

        Returns:
            kwargs to build a Dataset instance.
        """
        # Prepend path to filename
        pa_table = self._read_files(files, in_memory=in_memory)
        # If original_instructions is not None, convert it to a human-readable NamedSplit
        if original_instructions is not None:
            from .splits import Split  # noqa

            split = Split(str(original_instructions))
        else:
            split = None
        dataset_kwargs = {"arrow_table": pa_table, "info": self._info, "split": split}
        return dataset_kwargs
283
+
284
+
285
class ArrowReader(BaseReader):
    """
    Build a Dataset object out of Instruction instance(s).
    This Reader uses either memory mapping or file descriptors (in-memory) on arrow files.
    """

    def __init__(self, path: str, info: Optional["DatasetInfo"]):
        """Initializes ArrowReader.

        Args:
            path (str): path where Arrow files are stored.
            info (DatasetInfo): info about the dataset.
        """
        super().__init__(path, info)
        self._filetype_suffix = "arrow"

    def _get_table_from_filename(self, filename_skip_take, in_memory=False) -> Table:
        """Returns a Dataset instance from given (filename, skip, take)."""
        filename = filename_skip_take["filename"]
        skip = filename_skip_take.get("skip")
        take = filename_skip_take.get("take")
        table = ArrowReader.read_table(filename, in_memory=in_memory)
        # take == -1 means "through to the end of the table".
        if take == -1:
            take = len(table) - skip
        # here we don't want to slice an empty table, or it may segfault
        if skip is not None and take is not None and not (skip == 0 and take == len(table)):
            table = table.slice(skip, take)
        return table

    @staticmethod
    def read_table(filename, in_memory=False) -> Table:
        """
        Read table from file.

        Args:
            filename (str): File name of the table.
            in_memory (bool, default=False): Whether to copy the data in-memory.

        Returns:
            pyarrow.Table
        """
        reader_cls = InMemoryTable if in_memory else MemoryMappedTable
        return reader_cls.from_file(filename)
330
+
331
+
332
class ParquetReader(BaseReader):
    """
    Build a Dataset object out of Instruction instance(s).
    This Reader uses memory mapping on parquet files.
    """

    def __init__(self, path: str, info: Optional["DatasetInfo"]):
        """Initializes ParquetReader.

        Args:
            path (str): path where tfrecords are stored.
            info (DatasetInfo): info about the dataset.
        """
        super().__init__(path, info)
        self._filetype_suffix = "parquet"

    def _get_table_from_filename(self, filename_skip_take, **kwargs):
        """Returns a Dataset instance from given (filename, skip, take)."""
        filename = filename_skip_take["filename"]
        skip = filename_skip_take.get("skip")
        take = filename_skip_take.get("take")
        # Parquet read_table always loads data in memory, independently of memory_map
        pa_table = pq.read_table(filename, memory_map=True)
        # here we don't want to slice an empty table, or it may segfault
        if skip is not None and take is not None and not (skip == 0 and take == len(pa_table)):
            pa_table = pa_table.slice(skip, take)
        return pa_table
361
+
362
+
363
+ @dataclass(frozen=True)
364
+ class _AbsoluteInstruction:
365
+ """A machine friendly slice: defined absolute positive boundaries."""
366
+
367
+ splitname: str
368
+ from_: int # uint (starting index).
369
+ to: int # uint (ending index).
370
+
371
+
372
+ @dataclass(frozen=True)
373
+ class _RelativeInstruction:
374
+ """Represents a single parsed slicing instruction, can use % and negatives."""
375
+
376
+ splitname: str
377
+ from_: Optional[int] = None # int (starting index) or None if no lower boundary.
378
+ to: Optional[int] = None # int (ending index) or None if no upper boundary.
379
+ unit: Optional[str] = None
380
+ rounding: Optional[str] = None
381
+
382
+ def __post_init__(self):
383
+ if self.unit is not None and self.unit not in ["%", "abs"]:
384
+ raise ValueError("unit must be either % or abs")
385
+ if self.rounding is not None and self.rounding not in ["closest", "pct1_dropremainder"]:
386
+ raise ValueError("rounding must be either closest or pct1_dropremainder")
387
+ if self.unit != "%" and self.rounding is not None:
388
+ raise ValueError("It is forbidden to specify rounding if not using percent slicing.")
389
+ if self.unit == "%" and self.from_ is not None and abs(self.from_) > 100:
390
+ raise ValueError("Percent slice boundaries must be > -100 and < 100.")
391
+ if self.unit == "%" and self.to is not None and abs(self.to) > 100:
392
+ raise ValueError("Percent slice boundaries must be > -100 and < 100.")
393
+ # Update via __dict__ due to instance being "frozen"
394
+ self.__dict__["rounding"] = "closest" if self.rounding is None and self.unit == "%" else self.rounding
395
+
396
+
397
def _str_to_read_instruction(spec):
    """Returns ReadInstruction for given string.

    Raises:
        ValueError: if `spec` does not match the split-spec grammar.
    """
    res = _SUB_SPEC_RE.match(spec)
    if not res:
        raise ValueError(f"Unrecognized instruction format: {spec}")
    # A '%' on either boundary makes the whole instruction percent-based.
    unit = "%" if res.group("from_pct") or res.group("to_pct") else "abs"
    return ReadInstruction(
        split_name=res.group("split"),
        rounding=res.group("rounding"),
        from_=int(res.group("from")) if res.group("from") else None,
        to=int(res.group("to")) if res.group("to") else None,
        unit=unit,
    )
410
+
411
+
412
+ def _pct_to_abs_pct1(boundary, num_examples):
413
+ # Using math.trunc here, since -99.5% should give -99%, not -100%.
414
+ if num_examples < 100:
415
+ msg = (
416
+ 'Using "pct1_dropremainder" rounding on a split with less than 100 '
417
+ "elements is forbidden: it always results in an empty dataset."
418
+ )
419
+ raise ValueError(msg)
420
+ return boundary * math.trunc(num_examples / 100.0)
421
+
422
+
423
+ def _pct_to_abs_closest(boundary, num_examples):
424
+ return int(round(boundary * num_examples / 100.0))
425
+
426
+
427
def _rel_to_abs_instr(rel_instr, name2len):
    """Returns _AbsoluteInstruction instance for given RelativeInstruction.

    Args:
        rel_instr: RelativeInstruction instance.
        name2len: dict {split_name: num_examples}.

    Raises:
        ValueError: if the split name is unknown.
    """
    pct_to_abs = _pct_to_abs_closest if rel_instr.rounding == "closest" else _pct_to_abs_pct1
    split = rel_instr.splitname
    if split not in name2len:
        raise ValueError(f'Unknown split "{split}". Should be one of {list(name2len)}.')
    num_examples = name2len[split]
    from_ = rel_instr.from_
    to = rel_instr.to
    # Missing boundaries default to the full split; percent boundaries are scaled first.
    if rel_instr.unit == "%":
        from_ = 0 if from_ is None else pct_to_abs(from_, num_examples)
        to = num_examples if to is None else pct_to_abs(to, num_examples)
    else:
        from_ = 0 if from_ is None else from_
        to = num_examples if to is None else to
    # Negative boundaries count back from the end of the split, floored at 0.
    if from_ < 0:
        from_ = max(num_examples + from_, 0)
    if to < 0:
        to = max(num_examples + to, 0)
    # Clamp to the split size so out-of-range slices degrade gracefully.
    from_ = min(from_, num_examples)
    to = min(to, num_examples)
    return _AbsoluteInstruction(split, from_, to)
454
+
455
+
456
class ReadInstruction:
    """Reading instruction for a dataset.

    Examples::

        # The following lines are equivalent:
        ds = datasets.load_dataset('mnist', split='test[:33%]')
        ds = datasets.load_dataset('mnist', split=datasets.ReadInstruction.from_spec('test[:33%]'))
        ds = datasets.load_dataset('mnist', split=datasets.ReadInstruction('test', to=33, unit='%'))
        ds = datasets.load_dataset('mnist', split=datasets.ReadInstruction(
            'test', from_=0, to=33, unit='%'))

        # The following lines are equivalent:
        ds = datasets.load_dataset('mnist', split='test[:33%]+train[1:-1]')
        ds = datasets.load_dataset('mnist', split=datasets.ReadInstruction.from_spec(
            'test[:33%]+train[1:-1]'))
        ds = datasets.load_dataset('mnist', split=(
            datasets.ReadInstruction('test', to=33, unit='%') +
            datasets.ReadInstruction('train', from_=1, to=-1, unit='abs')))

        # The following lines are equivalent:
        ds = datasets.load_dataset('mnist', split='test[:33%](pct1_dropremainder)')
        ds = datasets.load_dataset('mnist', split=datasets.ReadInstruction.from_spec(
            'test[:33%](pct1_dropremainder)'))
        ds = datasets.load_dataset('mnist', split=datasets.ReadInstruction(
            'test', from_=0, to=33, unit='%', rounding="pct1_dropremainder"))

        # 10-fold validation:
        tests = datasets.load_dataset(
            'mnist',
            [datasets.ReadInstruction('train', from_=k, to=k+10, unit='%')
             for k in range(0, 100, 10)])
        trains = datasets.load_dataset(
            'mnist',
            [datasets.ReadInstruction('train', to=k, unit='%') + datasets.ReadInstruction('train', from_=k+10, unit='%')
             for k in range(0, 100, 10)])

    """

    def _init(self, relative_instructions):
        # Private initializer, shared by __init__ and the factory classmethod below.
        self._relative_instructions = relative_instructions

    @classmethod
    def _read_instruction_from_relative_instructions(cls, relative_instructions):
        """Returns ReadInstruction obj initialized with relative_instructions."""
        # Use __new__ to bypass __init__ used by public API and not convenient here.
        result = cls.__new__(cls)
        result._init(relative_instructions)  # pylint: disable=protected-access
        return result

    def __init__(self, split_name, rounding=None, from_=None, to=None, unit=None):
        """Initialize ReadInstruction.

        Args:
            split_name (str): name of the split to read. Eg: 'train'.
            rounding (str, optional): The rounding behaviour to use when percent slicing is
                used. Ignored when slicing with absolute indices.
                Possible values:
                    - 'closest' (default): The specified percentages are rounded to the
                        closest value. Use this if you want specified percents to be as
                        much exact as possible.
                    - 'pct1_dropremainder': the specified percentages are treated as
                        multiple of 1%. Use this option if you want consistency. Eg:
                            len(5%) == 5 * len(1%).
                        Using this option, one might not be able to use the full set of
                        examples, if the number of those is not a multiple of 100.
            from_ (int):
            to (int): alternative way of specifying slicing boundaries. If any of
                {from_, to, unit} argument is used, slicing cannot be specified as
                string.
            unit (str): optional, one of:
                '%': to set the slicing unit as percents of the split size.
                'abs': to set the slicing unit as absolute numbers.
        """
        # This constructor is not always called. See factory method
        # `_read_instruction_from_relative_instructions`. Common init instructions
        # MUST be placed in the _init method.
        self._init([_RelativeInstruction(split_name, from_, to, unit, rounding)])

    @classmethod
    def from_spec(cls, spec):
        """Creates a `ReadInstruction` instance out of a string spec.

        Args:
            spec (`str`):
                Split(s) + optional slice(s) to read + optional rounding
                if percents are used as the slicing unit. A slice can be specified,
                using absolute numbers (`int`) or percentages (`int`).

        Examples:

            ```
            test: test split.
            test + validation: test split + validation split.
            test[10:]: test split, minus its first 10 records.
            test[:10%]: first 10% records of test split.
            test[:20%](pct1_dropremainder): first 20% records, rounded with the pct1_dropremainder rounding.
            test[:-5%]+train[40%:60%]: first 95% of test + middle 20% of train.
            ```

        Returns:
            ReadInstruction instance.
        """
        spec = str(spec)  # Need to convert to str in case of NamedSplit instance.
        subs = _ADDITION_SEP_RE.split(spec)
        if not subs:
            raise ValueError(f"No instructions could be built out of {spec}")
        instruction = _str_to_read_instruction(subs[0])
        # '+'-separated sub-specs are combined via ReadInstruction.__add__.
        return sum((_str_to_read_instruction(sub) for sub in subs[1:]), instruction)

    def to_spec(self):
        # Inverse of `from_spec`: serialize the instruction back to string form.
        rel_instr_specs = []
        for rel_instr in self._relative_instructions:
            rel_instr_spec = rel_instr.splitname
            if rel_instr.from_ is not None or rel_instr.to is not None:
                from_ = rel_instr.from_
                to = rel_instr.to
                unit = rel_instr.unit
                rounding = rel_instr.rounding
                unit = unit if unit == "%" else ""
                from_ = str(from_) + unit if from_ is not None else ""
                to = str(to) + unit if to is not None else ""
                slice_str = f"[{from_}:{to}]"
                # Only a non-default rounding of a percent slice is part of the spec.
                rounding_str = (
                    f"({rounding})" if unit == "%" and rounding is not None and rounding != "closest" else ""
                )
                rel_instr_spec += slice_str + rounding_str
            rel_instr_specs.append(rel_instr_spec)
        return "+".join(rel_instr_specs)

    def __add__(self, other):
        """Returns a new ReadInstruction obj, result of appending other to self."""
        if not isinstance(other, ReadInstruction):
            msg = "ReadInstruction can only be added to another ReadInstruction obj."
            raise TypeError(msg)
        self_ris = self._relative_instructions
        other_ris = other._relative_instructions  # pylint: disable=protected-access
        # Mixing different percent roundings would silently change slice sizes.
        if (
            self_ris[0].unit != "abs"
            and other_ris[0].unit != "abs"
            and self._relative_instructions[0].rounding != other_ris[0].rounding
        ):
            raise ValueError("It is forbidden to sum ReadInstruction instances with different rounding values.")
        return self._read_instruction_from_relative_instructions(self_ris + other_ris)

    def __str__(self):
        return self.to_spec()

    def __repr__(self):
        return f"ReadInstruction({self._relative_instructions})"

    def to_absolute(self, name2len):
        """Translate instruction into a list of absolute instructions.

        Those absolute instructions are then to be added together.

        Args:
            name2len (`dict`):
                Associating split names to number of examples.

        Returns:
            list of _AbsoluteInstruction instances (corresponds to the + in spec).
        """
        return [_rel_to_abs_instr(rel_instr, name2len) for rel_instr in self._relative_instructions]
.venv/lib/python3.11/site-packages/datasets/arrow_writer.py ADDED
@@ -0,0 +1,766 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2020 The HuggingFace Datasets Authors and the TensorFlow Datasets Authors.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
7
+ # Unless required by applicable law or agreed to in writing, software
8
+ # distributed under the License is distributed on an "AS IS" BASIS,
9
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10
+ # See the License for the specific language governing permissions and
11
+ # limitations under the License.
12
+
13
+ # Lint as: python3
14
+ """To write records into Parquet files."""
15
+
16
+ import json
17
+ import sys
18
+ from collections.abc import Iterable
19
+ from typing import Any, Optional, Union
20
+
21
+ import fsspec
22
+ import numpy as np
23
+ import pyarrow as pa
24
+ import pyarrow.parquet as pq
25
+ from fsspec.core import url_to_fs
26
+
27
+ from . import config
28
+ from .features import Audio, Features, Image, Pdf, Value, Video
29
+ from .features.features import (
30
+ FeatureType,
31
+ List,
32
+ _ArrayXDExtensionType,
33
+ _visit,
34
+ cast_to_python_objects,
35
+ generate_from_arrow_type,
36
+ get_nested_type,
37
+ list_of_np_array_to_pyarrow_listarray,
38
+ numpy_to_pyarrow_listarray,
39
+ to_pyarrow_listarray,
40
+ )
41
+ from .filesystems import is_remote_filesystem
42
+ from .info import DatasetInfo
43
+ from .keyhash import DuplicatedKeysError, KeyHasher
44
+ from .table import array_cast, cast_array_to_feature, embed_table_storage, table_cast
45
+ from .utils import logging
46
+ from .utils.py_utils import asdict, convert_file_size_to_int, first_non_null_non_empty_value
47
+
48
+
49
+ logger = logging.get_logger(__name__)
50
+
51
+ type_ = type # keep python's type function
52
+
53
+
54
def get_arrow_writer_batch_size_from_features(features: Optional[Features]) -> Optional[int]:
    """
    Get the writer_batch_size that defines the maximum record batch size in the arrow files based on configuration values.
    The default value is 100 for image/audio datasets and 10 for videos.
    This allows to avoid overflows in arrow buffers.

    Args:
        features (`datasets.Features` or `None`):
            Dataset Features from `datasets`.
    Returns:
        writer_batch_size (`Optional[int]`):
            Writer batch size to pass to a dataset builder.
            If `None`, then it will use the `datasets` default, i.e. `datasets.config.DEFAULT_MAX_BATCH_SIZE`.
    """
    if not features:
        return None

    # Track the smallest configured cap across all nested feature types.
    smallest = [np.inf]

    def _maybe_lower(feature: FeatureType) -> None:
        if isinstance(feature, Image):
            cap = config.ARROW_RECORD_BATCH_SIZE_FOR_IMAGE_DATASETS
        elif isinstance(feature, Audio):
            cap = config.ARROW_RECORD_BATCH_SIZE_FOR_AUDIO_DATASETS
        elif isinstance(feature, Video):
            cap = config.ARROW_RECORD_BATCH_SIZE_FOR_VIDEO_DATASETS
        elif isinstance(feature, Value) and feature.dtype == "binary":
            cap = config.ARROW_RECORD_BATCH_SIZE_FOR_BINARY_DATASETS
        else:
            cap = None
        if cap is not None:
            smallest[0] = min(smallest[0], cap)

    _visit(features, _maybe_lower)

    # No media/binary feature found (or all caps unset): let the caller use the default.
    return None if smallest[0] is np.inf else smallest[0]
91
+
92
+
93
def get_writer_batch_size_from_features(features: Optional[Features]) -> Optional[int]:
    """
    Get the writer_batch_size that defines the maximum row group size in the parquet files based on configuration values.
    By default these are not set, but it can be helpful to hard set those values in some cases.
    This allows to optimize random access to parquet file, since accessing 1 row requires
    to read its entire row group.

    Args:
        features (`datasets.Features` or `None`):
            Dataset Features from `datasets`.
    Returns:
        writer_batch_size (`Optional[int]`):
            Writer batch size to pass to a parquet writer.
            If `None`, then it will use the `datasets` default, i.e. aiming for row groups of 100MB.
    """
    if not features:
        return None

    # Track the smallest configured row-group cap across all nested feature types.
    smallest = [np.inf]

    def _maybe_lower(feature: FeatureType) -> None:
        if isinstance(feature, Image):
            cap = config.PARQUET_ROW_GROUP_SIZE_FOR_IMAGE_DATASETS
        elif isinstance(feature, Audio):
            cap = config.PARQUET_ROW_GROUP_SIZE_FOR_AUDIO_DATASETS
        elif isinstance(feature, Video):
            cap = config.PARQUET_ROW_GROUP_SIZE_FOR_VIDEO_DATASETS
        elif isinstance(feature, Value) and feature.dtype == "binary":
            cap = config.PARQUET_ROW_GROUP_SIZE_FOR_BINARY_DATASETS
        else:
            cap = None
        if cap is not None:
            smallest[0] = min(smallest[0], cap)

    _visit(features, _maybe_lower)

    # No media/binary feature found (or all caps unset): let the caller use the default.
    return None if smallest[0] is np.inf else smallest[0]
131
+
132
+
133
def get_writer_batch_size_from_data_size(num_rows: int, num_bytes: int) -> int:
    """
    Get the writer_batch_size that defines the maximum row group size in the parquet files.
    The default in `datasets` is aiming for row groups of maximum 100MB uncompressed.
    This allows to optimize random access to parquet file, since accessing 1 row requires
    to read its entire row group.

    This can be improved to get optimized size for querying/iterating
    but at least it matches the dataset viewer expectations on HF.

    Args:
        num_rows (`int`):
            Number of rows in the dataset.
        num_bytes (`int`):
            Number of bytes in the dataset.
            For dataset with external files to embed (image, audio, videos), this can also be an
            estimate from `dataset._estimate_nbytes()`.
    Returns:
        writer_batch_size (`Optional[int]`):
            Writer batch size to pass to a parquet writer.
    """
    if num_bytes <= 0:
        return 1
    # Rows per group so that one group weighs ~MAX_ROW_GROUP_SIZE, never below 10 rows.
    max_group_bytes = convert_file_size_to_int(config.MAX_ROW_GROUP_SIZE)
    return max(10, num_rows * max_group_bytes // num_bytes)
155
+
156
+
157
class SchemaInferenceError(ValueError):
    """Raised when an Arrow schema cannot be inferred from the written data."""
159
+
160
+
161
class TypedSequence:
    """
    This data container generalizes the typing when instantiating pyarrow arrays, tables or batches.

    More specifically it adds several features:
    - Support extension types like ``datasets.features.Array2DExtensionType``:
        By default pyarrow arrays don't return extension arrays. One has to call
        ``pa.ExtensionArray.from_storage(type, pa.array(data, type.storage_type))``
        in order to get an extension array.
    - Support for ``try_type`` parameter that can be used instead of ``type``:
        When an array is transformed, we like to keep the same type as before if possible.
        For example when calling :func:`datasets.Dataset.map`, we don't want to change the type
        of each column by default.
    - Better error message when a pyarrow array overflows.

    Example::

        from datasets.features import Array2D, Array2DExtensionType, Value
        from datasets.arrow_writer import TypedSequence
        import pyarrow as pa

        arr = pa.array(TypedSequence([1, 2, 3], type=Value("int32")))
        assert arr.type == pa.int32()

        arr = pa.array(TypedSequence([1, 2, 3], try_type=Value("int32")))
        assert arr.type == pa.int32()

        arr = pa.array(TypedSequence(["foo", "bar"], try_type=Value("int32")))
        assert arr.type == pa.string()

        arr = pa.array(TypedSequence([[[1, 2, 3]]], type=Array2D((1, 3), "int64")))
        assert arr.type == Array2DExtensionType((1, 3), "int64")

        table = pa.Table.from_pydict({
            "image": TypedSequence([[[1, 2, 3]]], type=Array2D((1, 3), "int64"))
        })
        assert table["image"].type == Array2DExtensionType((1, 3), "int64")

    """

    def __init__(
        self,
        data: Iterable,
        type: Optional[FeatureType] = None,
        try_type: Optional[FeatureType] = None,
        optimized_int_type: Optional[FeatureType] = None,
    ):
        # `type` and `try_type` are mutually exclusive.
        if type is not None and try_type is not None:
            raise ValueError("You cannot specify both type and try_type")
        # set attributes
        self.data = data
        self.type = type
        self.try_type = try_type  # is ignored if it doesn't match the data
        self.optimized_int_type = optimized_int_type
        # when trying a type (is ignored if data is not compatible)
        self.trying_type = self.try_type is not None
        self.trying_int_optimization = optimized_int_type is not None and type is None and try_type is None
        # used to get back the inferred type after __arrow_array__() is called once
        self._inferred_type = None

    def get_inferred_type(self) -> FeatureType:
        """Return the inferred feature type.
        This is done by converting the sequence to an Arrow array, and getting the corresponding
        feature type.

        Since building the Arrow array can be expensive, the value of the inferred type is cached
        as soon as pa.array is called on the typed sequence.

        Returns:
            FeatureType: inferred feature type of the sequence.
        """
        if self._inferred_type is None:
            self._inferred_type = generate_from_arrow_type(pa.array(self).type)
        return self._inferred_type

    @staticmethod
    def _infer_custom_type_and_encode(data: Iterable) -> tuple[Iterable, Optional[FeatureType]]:
        """Implement type inference for custom objects like PIL.Image.Image -> Image type.

        This function is only used for custom python objects that can't be directly passed to build
        an Arrow array. In such cases it infers the feature type to use, and it encodes the data so
        that they can be passed to an Arrow array.

        Args:
            data (Iterable): array of data to infer the type, e.g. a list of PIL images.

        Returns:
            Tuple[Iterable, Optional[FeatureType]]: a tuple with:
                - the (possibly encoded) array, if the inferred feature type requires encoding
                - the inferred feature type if the array is made of supported custom objects like
                    PIL images, else None.
        """
        # Only attempt PIL inference when the user's process has already imported PIL.
        if config.PIL_AVAILABLE and "PIL" in sys.modules:
            import PIL.Image

            non_null_idx, non_null_value = first_non_null_non_empty_value(data)
            if isinstance(non_null_value, PIL.Image.Image):
                return [Image().encode_example(value) if value is not None else None for value in data], Image()
            if isinstance(non_null_value, list) and isinstance(non_null_value[0], PIL.Image.Image):
                return [
                    [Image().encode_example(x) for x in value] if value is not None else None for value in data
                ], List(Image())
        # Same lazy approach for PDFs via pdfplumber.
        if config.PDFPLUMBER_AVAILABLE and "pdfplumber" in sys.modules:
            import pdfplumber

            non_null_idx, non_null_value = first_non_null_non_empty_value(data)
            if isinstance(non_null_value, pdfplumber.pdf.PDF):
                return [Pdf().encode_example(value) if value is not None else None for value in data], Pdf()
            if isinstance(non_null_value, list) and isinstance(non_null_value[0], pdfplumber.pdf.PDF):
                return [
                    [Pdf().encode_example(x) for x in value] if value is not None else None for value in data
                ], List(Pdf())
        return data, None

    def __arrow_array__(self, type: Optional[pa.DataType] = None):
        """This function is called when calling pa.array(typed_sequence)"""

        if type is not None:
            raise ValueError("TypedSequence is supposed to be used with pa.array(typed_sequence, type=None)")
        del type  # make sure we don't use it
        data = self.data
        # automatic type inference for custom objects
        if self.type is None and self.try_type is None:
            data, self._inferred_type = self._infer_custom_type_and_encode(data)
        if self._inferred_type is None:
            type = self.try_type if self.trying_type else self.type
        else:
            type = self._inferred_type
        pa_type = get_nested_type(type) if type is not None else None
        optimized_int_pa_type = (
            get_nested_type(self.optimized_int_type) if self.optimized_int_type is not None else None
        )
        trying_cast_to_python_objects = False
        try:
            # custom pyarrow types
            if isinstance(pa_type, _ArrayXDExtensionType):
                storage = to_pyarrow_listarray(data, pa_type)
                return pa.ExtensionArray.from_storage(pa_type, storage)

            # efficient np array to pyarrow array
            if isinstance(data, np.ndarray):
                out = numpy_to_pyarrow_listarray(data)
            elif isinstance(data, list) and data and isinstance(first_non_null_non_empty_value(data)[1], np.ndarray):
                out = list_of_np_array_to_pyarrow_listarray(data)
            else:
                trying_cast_to_python_objects = True
                out = pa.array(cast_to_python_objects(data, only_1d_for_numpy=True))
            # use smaller integer precisions if possible
            if self.trying_int_optimization:
                if pa.types.is_int64(out.type):
                    out = out.cast(optimized_int_pa_type)
                elif pa.types.is_list(out.type):
                    if pa.types.is_int64(out.type.value_type):
                        out = array_cast(out, pa.list_(optimized_int_pa_type))
                    elif pa.types.is_list(out.type.value_type) and pa.types.is_int64(out.type.value_type.value_type):
                        out = array_cast(out, pa.list_(pa.list_(optimized_int_pa_type)))
            # otherwise we can finally use the user's type
            elif type is not None:
                # We use cast_array_to_feature to support casting to custom types like Audio and Image
                # Also, when trying type "string", we don't want to convert integers or floats to "string".
                # We only do it if trying_type is False - since this is what the user asks for.
                out = cast_array_to_feature(
                    out, type, allow_primitive_to_str=not self.trying_type, allow_decimal_to_str=not self.trying_type
                )
            return out
        except (
            TypeError,
            pa.lib.ArrowInvalid,
            pa.lib.ArrowNotImplementedError,
        ) as e:  # handle type errors and overflows
            # Ignore ArrowNotImplementedError caused by trying type, otherwise re-raise
            if not self.trying_type and isinstance(e, pa.lib.ArrowNotImplementedError):
                raise

            if self.trying_type:
                try:  # second chance: drop the tried type and build the array untyped
                    if isinstance(data, np.ndarray):
                        return numpy_to_pyarrow_listarray(data)
                    elif isinstance(data, list) and data and any(isinstance(value, np.ndarray) for value in data):
                        return list_of_np_array_to_pyarrow_listarray(data)
                    else:
                        trying_cast_to_python_objects = True
                        return pa.array(cast_to_python_objects(data, only_1d_for_numpy=True))
                except pa.lib.ArrowInvalid as e:
                    if "overflow" in str(e):
                        raise OverflowError(
                            f"There was an overflow with type {type_(data)}. Try to reduce writer_batch_size to have batches smaller than 2GB.\n({e})"
                        ) from None
                    elif self.trying_int_optimization and "not in range" in str(e):
                        optimized_int_pa_type_str = np.dtype(optimized_int_pa_type.to_pandas_dtype()).name
                        logger.info(
                            f"Failed to cast a sequence to {optimized_int_pa_type_str}. Falling back to int64."
                        )
                        return out
                    elif trying_cast_to_python_objects and "Could not convert" in str(e):
                        # Retry without the list-casting optimization, which can trip on mixed content.
                        out = pa.array(
                            cast_to_python_objects(data, only_1d_for_numpy=True, optimize_list_casting=False)
                        )
                        if type is not None:
                            out = cast_array_to_feature(
                                out, type, allow_primitive_to_str=True, allow_decimal_to_str=True
                            )
                        return out
                    else:
                        raise
            elif "overflow" in str(e):
                raise OverflowError(
                    f"There was an overflow with type {type_(data)}. Try to reduce writer_batch_size to have batches smaller than 2GB.\n({e})"
                ) from None
            elif self.trying_int_optimization and "not in range" in str(e):
                optimized_int_pa_type_str = np.dtype(optimized_int_pa_type.to_pandas_dtype()).name
                logger.info(f"Failed to cast a sequence to {optimized_int_pa_type_str}. Falling back to int64.")
                return out
            elif trying_cast_to_python_objects and "Could not convert" in str(e):
                # Retry without the list-casting optimization, which can trip on mixed content.
                out = pa.array(cast_to_python_objects(data, only_1d_for_numpy=True, optimize_list_casting=False))
                if type is not None:
                    out = cast_array_to_feature(out, type, allow_primitive_to_str=True, allow_decimal_to_str=True)
                return out
            else:
                raise
382
+
383
+
384
class OptimizedTypedSequence(TypedSequence):
    """A :class:`TypedSequence` that downcasts well-known tokenizer columns.

    When no explicit ``type``/``try_type`` is given, columns commonly produced by
    tokenizers (``input_ids`` and the various masks) are stored with a smaller
    integer precision to save space.
    """

    def __init__(
        self,
        data,
        type: Optional[FeatureType] = None,
        try_type: Optional[FeatureType] = None,
        col: Optional[str] = None,
        optimized_int_type: Optional[FeatureType] = None,
    ):
        if type is None and try_type is None:
            # Known value ranges per column:
            # - attention_mask / special_tokens_mask are binary tensors
            # - token_type_ids is a binary mask; some models (XLNetModel) also use a 2
            # - input_ids hold vocab ids (typical vocab size: 0-50k, max ~500k, never > 1M)
            downcast_by_col = {
                "attention_mask": Value("int8"),
                "special_tokens_mask": Value("int8"),
                "input_ids": Value("int32"),
                "token_type_ids": Value("int8"),
            }
            optimized_int_type = downcast_by_col.get(col, None)
        super().__init__(data, type=type, try_type=try_type, optimized_int_type=optimized_int_type)
404
+
405
+
406
+ class ArrowWriter:
407
+ """Shuffles and writes Examples to Arrow files."""
408
+
409
    def __init__(
        self,
        schema: Optional[pa.Schema] = None,
        features: Optional[Features] = None,
        path: Optional[str] = None,
        stream: Optional[pa.NativeFile] = None,
        fingerprint: Optional[str] = None,
        writer_batch_size: Optional[int] = None,
        hash_salt: Optional[str] = None,
        check_duplicates: Optional[bool] = False,
        disable_nullable: bool = False,
        update_features: bool = False,
        with_metadata: bool = True,
        unit: str = "examples",
        embed_local_files: bool = False,
        storage_options: Optional[dict] = None,
    ):
        """Create a writer that streams record batches to `path` or `stream`.

        At least one of `path`/`stream` is required. If `features` (or `schema`)
        is given it fixes the output schema; otherwise the schema is determined
        later (see `_build_writer`). `storage_options` are forwarded to fsspec
        when `path` is used.
        """
        if path is None and stream is None:
            raise ValueError("At least one of path and stream must be provided.")
        # `features` takes precedence over `schema`; either one determines the other.
        if features is not None:
            self._features = features
            self._schema = None
        elif schema is not None:
            self._schema: pa.Schema = schema
            self._features = Features.from_arrow_schema(self._schema)
        else:
            self._features = None
            self._schema = None

        if hash_salt is not None:
            # Create KeyHasher instance using split name as hash salt
            self._hasher = KeyHasher(hash_salt)
        else:
            self._hasher = KeyHasher("")

        self._check_duplicates = check_duplicates
        self._disable_nullable = disable_nullable

        # We own (and must close) the stream only if we opened it ourselves from `path`.
        if stream is None:
            fs, path = url_to_fs(path, **(storage_options or {}))
            self._fs: fsspec.AbstractFileSystem = fs
            self._path = path if not is_remote_filesystem(self._fs) else self._fs.unstrip_protocol(path)
            self.stream = self._fs.open(path, "wb")
            self._closable_stream = True
        else:
            self._fs = None
            self._path = None
            self.stream = stream
            self._closable_stream = False

        self.fingerprint = fingerprint
        self.disable_nullable = disable_nullable
        # Explicit value wins, then a features-derived cap, then the global default.
        self.writer_batch_size = (
            writer_batch_size
            or get_arrow_writer_batch_size_from_features(self._features)
            or config.DEFAULT_MAX_BATCH_SIZE
        )
        self.update_features = update_features
        self.with_metadata = with_metadata
        self.unit = unit
        self.embed_local_files = embed_local_files

        # Counters and in-memory pools flushed by write_examples_on_file / write_rows_on_file.
        self._num_examples = 0
        self._num_bytes = 0
        self.current_examples: list[tuple[dict[str, Any], str]] = []
        self.current_rows: list[pa.Table] = []
        self.pa_writer: Optional[pa.RecordBatchStreamWriter] = None
        self.hkey_record = []
477
+
478
+ def __len__(self):
479
+ """Return the number of writed and staged examples"""
480
+ return self._num_examples + len(self.current_examples) + len(self.current_rows)
481
+
482
    def __enter__(self):
        # Support `with ArrowWriter(...) as writer:`; cleanup happens in __exit__.
        return self
484
+
485
    def __exit__(self, exc_type, exc_val, exc_tb):
        # Always close the writer/stream, even if the `with` body raised.
        self.close()
487
+
488
    def close(self):
        """Close the pyarrow writer and, if this writer opened it, the output stream."""
        # Try closing if opened; if closed: pyarrow.lib.ArrowInvalid: Invalid operation on closed file
        if self.pa_writer:  # it might be None
            try:
                self.pa_writer.close()
            except Exception:  # pyarrow.lib.ArrowInvalid, OSError
                pass
        if self._closable_stream and not self.stream.closed:
            self.stream.close()  # This also closes self.pa_writer if it is opened
497
+
498
    def _build_schema(self, inferred_schema: pa.Schema):
        """Combine the writer's features/schema with a schema inferred from data.

        Returns a ``(schema, features)`` pair, with nullability and metadata
        applied according to the writer's settings.
        """
        schema = self.schema
        features = self._features
        inferred_features = Features.from_arrow_schema(inferred_schema)
        if self._features is not None:
            if self.update_features:  # keep original features if they match, or update them
                fields = {field.name: field for field in self._features.type}
                for inferred_field in inferred_features.type:
                    name = inferred_field.name
                    if name in fields:
                        if inferred_field == fields[name]:
                            inferred_features[name] = self._features[name]
                features = inferred_features
                schema: pa.Schema = inferred_schema
        else:
            features = inferred_features
            schema: pa.Schema = inferred_features.arrow_schema

        if self.disable_nullable:
            schema = pa.schema(pa.field(field.name, field.type, nullable=False) for field in schema)
        if self.with_metadata:
            schema = schema.with_metadata(self._build_metadata(DatasetInfo(features=features), self.fingerprint))
        else:
            schema = schema.with_metadata({})

        return schema, features
524
+
525
    def _build_writer(self, inferred_schema: pa.Schema):
        # Finalize schema/features from the inferred schema, then open the stream writer.
        self._schema, self._features = self._build_schema(inferred_schema)
        self.pa_writer = pa.RecordBatchStreamWriter(self.stream, self._schema)
528
+
529
    @property
    def schema(self):
        """Current pyarrow schema, or ``[]`` when no schema/features are known yet."""
        _schema = (
            self._schema
            if self._schema is not None
            else (pa.schema(self._features.type) if self._features is not None else None)
        )
        if self._disable_nullable and _schema is not None:
            _schema = pa.schema(pa.field(field.name, field.type, nullable=False) for field in _schema)
        return _schema if _schema is not None else []
539
+
540
+ @staticmethod
541
+ def _build_metadata(info: DatasetInfo, fingerprint: Optional[str] = None) -> dict[str, str]:
542
+ info_keys = ["features"] # we can add support for more DatasetInfo keys in the future
543
+ info_as_dict = asdict(info)
544
+ metadata = {}
545
+ metadata["info"] = {key: info_as_dict[key] for key in info_keys}
546
+ if fingerprint is not None:
547
+ metadata["fingerprint"] = fingerprint
548
+ return {"huggingface": json.dumps(metadata)}
549
+
550
    def write_examples_on_file(self):
        """Write stored examples from the write-pool of examples. It makes a table out of the examples and write it."""
        if not self.current_examples:
            return
        # preserve the order the columns
        if self.schema:
            schema_cols = set(self.schema.names)
            examples_cols = self.current_examples[0][0].keys()  # .keys() preserves the order (unlike set)
            common_cols = [col for col in self.schema.names if col in examples_cols]
            extra_cols = [col for col in examples_cols if col not in schema_cols]
            cols = common_cols + extra_cols
        else:
            cols = list(self.current_examples[0][0])
        batch_examples = {}
        for col in cols:
            # We use row[0][col] since current_examples contains (example, key) tuples.
            # Moreover, examples could be Arrow arrays of 1 element.
            # This can happen in `.map()` when we want to re-write the same Arrow data
            if all(isinstance(row[0][col], (pa.Array, pa.ChunkedArray)) for row in self.current_examples):
                # Fast path: every value is already Arrow data — flatten ChunkedArrays into their
                # chunks and concatenate once, avoiding a round-trip through Python objects.
                arrays = [row[0][col] for row in self.current_examples]
                arrays = [
                    chunk
                    for array in arrays
                    for chunk in (array.chunks if isinstance(array, pa.ChunkedArray) else [array])
                ]
                batch_examples[col] = pa.concat_arrays(arrays)
            else:
                # Mixed or plain-Python values: convert any stray Arrow arrays back to a single
                # Python value and collect everything as an ordinary list.
                batch_examples[col] = [
                    row[0][col].to_pylist()[0] if isinstance(row[0][col], (pa.Array, pa.ChunkedArray)) else row[0][col]
                    for row in self.current_examples
                ]
        self.write_batch(batch_examples=batch_examples)
        self.current_examples = []
583
+
584
+ def write_rows_on_file(self):
585
+ """Write stored rows from the write-pool of rows. It concatenates the single-row tables and it writes the resulting table."""
586
+ if not self.current_rows:
587
+ return
588
+ table = pa.concat_tables(self.current_rows)
589
+ self.write_table(table)
590
+ self.current_rows = []
591
+
592
+ def write(
593
+ self,
594
+ example: dict[str, Any],
595
+ key: Optional[Union[str, int, bytes]] = None,
596
+ writer_batch_size: Optional[int] = None,
597
+ ):
598
+ """Add a given (Example,Key) pair to the write-pool of examples which is written to file.
599
+
600
+ Args:
601
+ example: the Example to add.
602
+ key: Optional, a unique identifier(str, int or bytes) associated with each example
603
+ """
604
+ # Utilize the keys and duplicate checking when `self._check_duplicates` is passed True
605
+ if self._check_duplicates:
606
+ # Create unique hash from key and store as (key, example) pairs
607
+ hash = self._hasher.hash(key)
608
+ self.current_examples.append((example, hash))
609
+ # Maintain record of keys and their respective hashes for checking duplicates
610
+ self.hkey_record.append((hash, key))
611
+ else:
612
+ # Store example as a tuple so as to keep the structure of `self.current_examples` uniform
613
+ self.current_examples.append((example, ""))
614
+
615
+ if writer_batch_size is None:
616
+ writer_batch_size = self.writer_batch_size
617
+ if writer_batch_size is not None and len(self.current_examples) >= writer_batch_size:
618
+ if self._check_duplicates:
619
+ self.check_duplicate_keys()
620
+ # Re-initializing to empty list for next batch
621
+ self.hkey_record = []
622
+
623
+ self.write_examples_on_file()
624
+
625
+ def check_duplicate_keys(self):
626
+ """Raises error if duplicates found in a batch"""
627
+ tmp_record = set()
628
+ for hash, key in self.hkey_record:
629
+ if hash in tmp_record:
630
+ duplicate_key_indices = [
631
+ str(self._num_examples + index)
632
+ for index, (duplicate_hash, _) in enumerate(self.hkey_record)
633
+ if duplicate_hash == hash
634
+ ]
635
+
636
+ raise DuplicatedKeysError(key, duplicate_key_indices)
637
+ else:
638
+ tmp_record.add(hash)
639
+
640
+ def write_row(self, row: pa.Table, writer_batch_size: Optional[int] = None):
641
+ """Add a given single-row Table to the write-pool of rows which is written to file.
642
+
643
+ Args:
644
+ row: the row to add.
645
+ """
646
+ if len(row) != 1:
647
+ raise ValueError(f"Only single-row pyarrow tables are allowed but got table with {len(row)} rows.")
648
+ self.current_rows.append(row)
649
+ if writer_batch_size is None:
650
+ writer_batch_size = self.writer_batch_size
651
+ if writer_batch_size is not None and len(self.current_rows) >= writer_batch_size:
652
+ self.write_rows_on_file()
653
+
654
    def write_batch(
        self,
        batch_examples: dict[str, list],
        writer_batch_size: Optional[int] = None,
        try_original_type: Optional[bool] = True,
    ):
        """Write a batch of Example to file.
        Ignores the batch if it appears to be empty,
        preventing a potential schema update of unknown types.

        Args:
            batch_examples: the batch of examples to add.
            writer_batch_size: Optional, overrides ``self.writer_batch_size`` for this call.
            try_original_type: use `try_type` when instantiating OptimizedTypedSequence if `True`, otherwise `try_type = None`.
        """
        if batch_examples and len(next(iter(batch_examples.values()))) == 0:
            return
        # Before the first write (and only when feature updates are allowed), leave `features`
        # unset so types are inferred, and use the current features as a best-effort hint.
        features = None if self.pa_writer is None and self.update_features else self._features
        try_features = self._features if self.pa_writer is None and self.update_features else None
        arrays = []
        inferred_features = Features()
        # preserve the order the columns
        if self.schema:
            schema_cols = set(self.schema.names)
            batch_cols = batch_examples.keys()  # .keys() preserves the order (unlike set)
            common_cols = [col for col in self.schema.names if col in batch_cols]
            extra_cols = [col for col in batch_cols if col not in schema_cols]
            cols = common_cols + extra_cols
        else:
            cols = list(batch_examples)
        for col in cols:
            col_values = batch_examples[col]
            col_type = features[col] if features else None
            if isinstance(col_values, (pa.Array, pa.ChunkedArray)):
                # Already Arrow data: cast in place when a target type is known.
                array = cast_array_to_feature(col_values, col_type) if col_type is not None else col_values
                arrays.append(array)
                inferred_features[col] = generate_from_arrow_type(col_values.type)
            else:
                # Plain Python values: let OptimizedTypedSequence pick/try the type.
                col_try_type = (
                    try_features[col]
                    if try_features is not None and col in try_features and try_original_type
                    else None
                )
                typed_sequence = OptimizedTypedSequence(col_values, type=col_type, try_type=col_try_type, col=col)
                arrays.append(pa.array(typed_sequence))
                inferred_features[col] = typed_sequence.get_inferred_type()
        # Before the first write the inferred schema wins; afterwards the writer's schema is fixed.
        schema = inferred_features.arrow_schema if self.pa_writer is None else self.schema
        pa_table = pa.Table.from_arrays(arrays, schema=schema)
        self.write_table(pa_table, writer_batch_size)
702
+
703
    def write_table(self, pa_table: pa.Table, writer_batch_size: Optional[int] = None):
        """Write a Table to file.

        Args:
            pa_table: the Table to add.
            writer_batch_size: Optional, length of the written record batches; defaults to ``self.writer_batch_size``.
        """
        if writer_batch_size is None:
            writer_batch_size = self.writer_batch_size
        if self.pa_writer is None:
            # First write: build the underlying writer from this table's schema.
            self._build_writer(inferred_schema=pa_table.schema)
        pa_table = pa_table.combine_chunks()
        # Align the table with the writer's resolved schema.
        pa_table = table_cast(pa_table, self._schema)
        if self.embed_local_files:
            pa_table = embed_table_storage(pa_table)
        self._num_bytes += pa_table.nbytes
        self._num_examples += pa_table.num_rows
        self.pa_writer.write_table(pa_table, writer_batch_size)
720
+
721
    def finalize(self, close_stream=True):
        """Flush all pending rows/examples, close the writer and return (num_examples, num_bytes).

        Args:
            close_stream: whether to also close the underlying stream.

        Raises:
            SchemaInferenceError: if no schema could be determined (no features and no examples).
        """
        self.write_rows_on_file()
        # In case current_examples < writer_batch_size, but user uses finalize()
        if self._check_duplicates:
            self.check_duplicate_keys()
            # Re-initializing to empty list for next batch
            self.hkey_record = []
        self.write_examples_on_file()
        # If schema is known, infer features even if no examples were written
        if self.pa_writer is None and self.schema:
            self._build_writer(self.schema)
        if self.pa_writer is not None:
            self.pa_writer.close()
            self.pa_writer = None
            if close_stream:
                self.stream.close()
        else:
            # No schema could be built: still close the stream if asked, then fail loudly.
            if close_stream:
                self.stream.close()
            raise SchemaInferenceError("Please pass `features` or at least one example when writing data")
        logger.debug(
            f"Done writing {self._num_examples} {self.unit} in {self._num_bytes} bytes {self._path if self._path else ''}."
        )
        return self._num_examples, self._num_bytes
745
+
746
+
747
class ParquetWriter(ArrowWriter):
    """ArrowWriter variant that writes Parquet files via `pyarrow.parquet.ParquetWriter`."""

    def __init__(self, *args, use_content_defined_chunking=True, write_page_index=True, **kwargs):
        """
        Args:
            use_content_defined_chunking: `True` selects the library default CDC options
                (`config.DEFAULT_CDC_OPTIONS`); pass a dict for custom options or `False` to disable.
            write_page_index: whether to write the Parquet page index.
        """
        super().__init__(*args, **kwargs)
        if use_content_defined_chunking is True:
            # Replace the bare `True` with the concrete default options dict.
            use_content_defined_chunking = config.DEFAULT_CDC_OPTIONS
        self.use_content_defined_chunking = use_content_defined_chunking
        self.write_page_index = write_page_index

    def _build_writer(self, inferred_schema: pa.Schema):
        """Resolve schema/features and open the underlying Parquet writer."""
        self._schema, self._features = self._build_schema(inferred_schema)
        self.pa_writer = pq.ParquetWriter(
            self.stream,
            self._schema,
            use_content_defined_chunking=self.use_content_defined_chunking,
            write_page_index=self.write_page_index,
        )
        if self.use_content_defined_chunking is not False:
            # Record the CDC options in the file's key-value metadata for reproducibility.
            self.pa_writer.add_key_value_metadata(
                {"content_defined_chunking": json.dumps(self.use_content_defined_chunking)}
            )
.venv/lib/python3.11/site-packages/datasets/builder.py ADDED
@@ -0,0 +1,1866 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2020 The HuggingFace Datasets Authors and the TensorFlow Datasets Authors.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ # Lint as: python3
16
+ """DatasetBuilder base class."""
17
+
18
+ import abc
19
+ import contextlib
20
+ import copy
21
+ import inspect
22
+ import os
23
+ import posixpath
24
+ import shutil
25
+ import textwrap
26
+ import time
27
+ import urllib
28
+ from collections.abc import Iterable, Mapping
29
+ from dataclasses import dataclass
30
+ from functools import partial
31
+ from pathlib import Path
32
+ from typing import TYPE_CHECKING, Optional, Union
33
+ from unittest.mock import patch
34
+
35
+ import fsspec
36
+ from fsspec.core import url_to_fs
37
+ from multiprocess import Pool
38
+ from tqdm.contrib.concurrent import thread_map
39
+
40
+ from . import config, utils
41
+ from .arrow_dataset import Dataset
42
+ from .arrow_reader import (
43
+ ArrowReader,
44
+ ReadInstruction,
45
+ )
46
+ from .arrow_writer import ArrowWriter, ParquetWriter, SchemaInferenceError
47
+ from .data_files import DataFilesDict, DataFilesPatternsDict, sanitize_patterns
48
+ from .dataset_dict import DatasetDict, IterableDatasetDict
49
+ from .download.download_config import DownloadConfig
50
+ from .download.download_manager import DownloadManager, DownloadMode
51
+ from .download.streaming_download_manager import StreamingDownloadManager, xjoin
52
+ from .exceptions import DatasetGenerationCastError, DatasetGenerationError, FileFormatError, ManualDownloadError
53
+ from .features import Features
54
+ from .filesystems import (
55
+ is_remote_filesystem,
56
+ rename,
57
+ )
58
+ from .fingerprint import Hasher
59
+ from .info import DatasetInfo, PostProcessedInfo
60
+ from .iterable_dataset import ArrowExamplesIterable, ExamplesIterable, IterableDataset
61
+ from .keyhash import DuplicatedKeysError
62
+ from .naming import INVALID_WINDOWS_CHARACTERS_IN_PATH, camelcase_to_snakecase
63
+ from .splits import Split, SplitDict, SplitGenerator, SplitInfo
64
+ from .streaming import extend_dataset_builder_for_streaming
65
+ from .table import CastError
66
+ from .utils import logging
67
+ from .utils import tqdm as hf_tqdm
68
+ from .utils._filelock import FileLock
69
+ from .utils.file_utils import is_remote_url
70
+ from .utils.info_utils import VerificationMode, get_size_checksum_dict, verify_checksums, verify_splits
71
+ from .utils.py_utils import (
72
+ classproperty,
73
+ convert_file_size_to_int,
74
+ has_sufficient_disk_space,
75
+ iflatmap_unordered,
76
+ map_nested,
77
+ memoize,
78
+ size_str,
79
+ temporary_assignment,
80
+ )
81
+ from .utils.sharding import _number_of_shards_in_gen_kwargs, _split_gen_kwargs
82
+ from .utils.track import tracked_list
83
+
84
+
85
+ if TYPE_CHECKING:
86
+ from .load import DatasetModule
87
+
88
+
89
+ logger = logging.get_logger(__name__)
90
+
91
+
92
class InvalidConfigName(ValueError):
    """Raised when a `BuilderConfig` name contains characters that are invalid in a cache directory path."""
94
+
95
+
96
@dataclass
class BuilderConfig:
    """Base class for `DatasetBuilder` data configuration.

    `DatasetBuilder` subclasses with data configuration options should subclass
    `BuilderConfig` and add their own properties.

    Attributes:
        name (`str`, defaults to `default`):
            The name of the configuration.
        version (`Version` or `str`, defaults to `0.0.0`):
            The version of the configuration.
        data_dir (`str`, *optional*):
            Path to the directory containing the source data.
        data_files (`str` or `Sequence` or `Mapping`, *optional*):
            Path(s) to source data file(s).
        description (`str`, *optional*):
            A human description of the configuration.
    """

    name: str = "default"
    version: Optional[Union[utils.Version, str]] = utils.Version("0.0.0")
    data_dir: Optional[str] = None
    data_files: Optional[Union[DataFilesDict, DataFilesPatternsDict]] = None
    description: Optional[str] = None

    def __post_init__(self):
        # The config name is used to name the cache directory.
        for invalid_char in INVALID_WINDOWS_CHARACTERS_IN_PATH:
            if invalid_char in self.name:
                raise InvalidConfigName(
                    f"Bad characters from black list '{INVALID_WINDOWS_CHARACTERS_IN_PATH}' found in '{self.name}'. "
                    f"They could create issues when creating a directory for this config on Windows filesystem."
                )
        if self.data_files is not None and not isinstance(self.data_files, (DataFilesDict, DataFilesPatternsDict)):
            raise ValueError(f"Expected a DataFilesDict in data_files but got {self.data_files}")

    def __eq__(self, o):
        # we need to override the default dataclass __eq__ since it doesn't check for
        # other attributes than the ones of the signature.
        if set(self.__dict__.keys()) != set(o.__dict__.keys()):
            return False
        return all((k, getattr(self, k)) == (k, getattr(o, k)) for k in self.__dict__.keys())

    def create_config_id(
        self,
        config_kwargs: dict,
        custom_features: Optional[Features] = None,
    ) -> str:
        """
        The config id is used to build the cache directory.
        By default it is equal to the config name.
        However the name of a config is not sufficient to have a unique identifier for the dataset being generated
        since it doesn't take into account:
        - the config kwargs that can be used to overwrite attributes
        - the custom features used to write the dataset
        - the data_files for json/text/csv/pandas datasets

        Therefore the config id is just the config name with an optional suffix based on these.
        """
        # Possibly add a suffix to the name to handle custom features/data_files/config_kwargs
        suffix: Optional[str] = None
        config_kwargs_to_add_to_suffix = config_kwargs.copy()
        # name and version are already used to build the cache directory
        config_kwargs_to_add_to_suffix.pop("name", None)
        config_kwargs_to_add_to_suffix.pop("version", None)
        # data dir handling (when specified it points to the manually downloaded data):
        # it was previously ignored before the introduction of config id because we didn't want
        # to change the config name. Now it's fine to take it into account for the config id.
        # config_kwargs_to_add_to_suffix.pop("data_dir", None)
        if "data_dir" in config_kwargs_to_add_to_suffix:
            if config_kwargs_to_add_to_suffix["data_dir"] is None:
                config_kwargs_to_add_to_suffix.pop("data_dir", None)
            else:
                # canonicalize the data dir to avoid two paths to the same location having different
                # hashes
                data_dir = config_kwargs_to_add_to_suffix["data_dir"]
                data_dir = os.path.normpath(data_dir)
                config_kwargs_to_add_to_suffix["data_dir"] = data_dir
        if config_kwargs_to_add_to_suffix:
            # we don't care about the order of the kwargs
            config_kwargs_to_add_to_suffix = {
                k: config_kwargs_to_add_to_suffix[k] for k in sorted(config_kwargs_to_add_to_suffix)
            }
            if all(isinstance(v, (str, bool, int, float)) for v in config_kwargs_to_add_to_suffix.values()):
                # Simple scalar kwargs can be spelled out in a human-readable suffix.
                suffix = ",".join(
                    str(k) + "=" + urllib.parse.quote_plus(str(v)) for k, v in config_kwargs_to_add_to_suffix.items()
                )
                if len(suffix) > 32:  # hash if too long
                    suffix = Hasher.hash(config_kwargs_to_add_to_suffix)
            else:
                # Non-scalar kwargs (lists, dicts, DataFilesDict, ...) are always hashed.
                suffix = Hasher.hash(config_kwargs_to_add_to_suffix)

        if custom_features is not None:
            # Fold the custom features into the suffix hash so different features get different caches.
            m = Hasher()
            if suffix:
                m.update(suffix)
            m.update(custom_features)
            suffix = m.hexdigest()

        if suffix:
            config_id = self.name + "-" + suffix
            if len(config_id) > config.MAX_DATASET_CONFIG_ID_READABLE_LENGTH:
                # Keep the directory name short enough for the filesystem.
                config_id = self.name + "-" + Hasher.hash(suffix)
            return config_id
        else:
            return self.name

    def _resolve_data_files(self, base_path: str, download_config: DownloadConfig) -> None:
        # Resolve data-file patterns to concrete files in place, relative to data_dir when set.
        if isinstance(self.data_files, DataFilesPatternsDict):
            base_path = xjoin(base_path, self.data_dir) if self.data_dir else base_path
            self.data_files = self.data_files.resolve(base_path, download_config)
+
209
+
210
+ class DatasetBuilder:
211
+ """Abstract base class for all datasets.
212
+
213
+ `DatasetBuilder` has 3 key methods:
214
+
215
+ - [`DatasetBuilder.info`]: Documents the dataset, including feature
216
+ names, types, shapes, version, splits, citation, etc.
217
+ - [`DatasetBuilder.download_and_prepare`]: Downloads the source data
218
+ and writes it to disk.
219
+ - [`DatasetBuilder.as_dataset`]: Generates a [`Dataset`].
220
+
221
+ Some `DatasetBuilder`s expose multiple variants of the
222
+ dataset by defining a [`BuilderConfig`] subclass and accepting a
223
+ config object (or name) on construction. Configurable datasets expose a
224
+ pre-defined set of configurations in [`DatasetBuilder.builder_configs`].
225
+
226
+ Args:
227
+ cache_dir (`str`, *optional*):
228
+ Directory to cache data. Defaults to `"~/.cache/huggingface/datasets"`.
229
+ dataset_name (`str`, *optional*):
230
+ Name of the dataset, if different from the builder name. Useful for packaged builders
231
+ like csv, imagefolder, audiofolder, etc. to reflect the difference between datasets
232
+ that use the same packaged builder.
233
+ config_name (`str`, *optional*):
234
+ Name of the dataset configuration.
235
+ It affects the data generated on disk. Different configurations will have their own subdirectories and
236
+ versions.
237
+ If not provided, the default configuration is used (if it exists).
238
+
239
+ <Added version="2.3.0">
240
+
241
+ Parameter `name` was renamed to `config_name`.
242
+
243
+ </Added>
244
+ hash (`str`, *optional*):
245
+ Hash specific to the dataset builder code. Used to update the caching directory when the
246
+ dataset builder code is updated (to avoid reusing old data).
247
+ The typical caching directory (defined in `self._relative_data_dir`) is `name/version/hash/`.
248
+ base_path (`str`, *optional*):
249
+ Base path for relative paths that are used to download files.
250
+ This can be a remote URL.
251
+ features ([`Features`], *optional*):
252
+ Features types to use with this dataset.
253
+ It can be used to change the [`Features`] types of a dataset, for example.
254
+ token (`str` or `bool`, *optional*):
255
+ String or boolean to use as Bearer token for remote files on the
256
+ Datasets Hub. If `True`, will get token from `"~/.huggingface"`.
257
+ repo_id (`str`, *optional*):
258
+ ID of the dataset repository.
259
+ Used to distinguish builders with the same name but not coming from the same namespace, for example "rajpurkar/squad"
260
+ and "lhoestq/squad" repo IDs. In the latter, the builder name would be "lhoestq___squad".
261
+ data_files (`str` or `Sequence` or `Mapping`, *optional*):
262
+ Path(s) to source data file(s).
263
+ For builders like "csv" or "json" that need the user to specify data files. They can be either
264
+ local or remote files. For convenience, you can use a `DataFilesDict`.
265
+ data_dir (`str`, *optional*):
266
+ Path to directory containing source data file(s).
267
+ Use only if `data_files` is not passed, in which case it is equivalent to passing
268
+ `os.path.join(data_dir, "**")` as `data_files`.
269
+ For builders that require manual download, it must be the path to the local directory containing the
270
+ manually downloaded data.
271
+ storage_options (`dict`, *optional*):
272
+ Key/value pairs to be passed on to the dataset file-system backend, if any.
273
+ writer_batch_size (`int`, *optional*):
274
+ Batch size used by the ArrowWriter.
275
+ It defines the number of samples that are kept in memory before writing them
276
+ and also the length of the arrow chunks.
277
+ None means that the ArrowWriter will use its default value.
278
+ **config_kwargs (additional keyword arguments): Keyword arguments to be passed to the corresponding builder
279
+ configuration class, set on the class attribute [`DatasetBuilder.BUILDER_CONFIG_CLASS`]. The builder
280
+ configuration class is [`BuilderConfig`] or a subclass of it.
281
+ """
282
+
283
+ # Default version
284
+ VERSION = None # Default version set in BuilderConfig
285
+
286
+ # Class for the builder config.
287
+ BUILDER_CONFIG_CLASS = BuilderConfig
288
+
289
+ # Named configurations that modify the data generated by download_and_prepare.
290
+ BUILDER_CONFIGS = []
291
+
292
+ # Optional default config name to be used when name is None
293
+ DEFAULT_CONFIG_NAME = None
294
+
295
+ # Default batch size used by the ArrowWriter
296
+ # It defines the number of samples that are kept in memory before writing them
297
+ # and also the length of the arrow chunks
298
+ # None means that the ArrowWriter will use its default value
299
+ DEFAULT_WRITER_BATCH_SIZE = None
300
+
301
    def __init__(
        self,
        cache_dir: Optional[str] = None,
        dataset_name: Optional[str] = None,
        config_name: Optional[str] = None,
        hash: Optional[str] = None,
        base_path: Optional[str] = None,
        info: Optional[DatasetInfo] = None,
        features: Optional[Features] = None,
        token: Optional[Union[bool, str]] = None,
        repo_id: Optional[str] = None,
        data_files: Optional[Union[str, list, dict, DataFilesDict]] = None,
        data_dir: Optional[str] = None,
        storage_options: Optional[dict] = None,
        writer_batch_size: Optional[int] = None,
        config_id: Optional[str] = None,
        **config_kwargs,
    ):
        """Initialize the builder: resolve config, info and cache directories. See the class docstring for parameters."""
        # DatasetBuilder name
        self.name: str = camelcase_to_snakecase(self.__module__.split(".")[-1])
        self.hash: Optional[str] = hash
        self.base_path = base_path
        self.token = token
        self.repo_id = repo_id
        self.storage_options = storage_options or {}
        self.dataset_name = camelcase_to_snakecase(dataset_name) if dataset_name else self.name
        self._writer_batch_size = writer_batch_size or self.DEFAULT_WRITER_BATCH_SIZE

        if data_files is not None and not isinstance(data_files, DataFilesDict):
            # Normalize any user-provided str/list/dict of patterns into a DataFilesDict.
            data_files = DataFilesDict.from_patterns(
                sanitize_patterns(data_files),
                base_path=base_path,
                download_config=DownloadConfig(token=token, storage_options=self.storage_options),
            )

        # Prepare config: DatasetConfig contains name, version and description but can be extended by each dataset
        if "features" in inspect.signature(self.BUILDER_CONFIG_CLASS.__init__).parameters and features is not None:
            config_kwargs["features"] = features
        if data_files is not None:
            config_kwargs["data_files"] = data_files
        if data_dir is not None:
            config_kwargs["data_dir"] = data_dir
        self.config_kwargs = config_kwargs
        self.config, self.config_id = self._create_builder_config(
            config_name=config_name,
            custom_features=features,
            config_id=config_id,
            **config_kwargs,
        )

        # prepare info: DatasetInfo are a standardized dataclass across all datasets
        # Prefill datasetinfo
        if info is None:
            info = self._info()
            info.builder_name = self.name
            info.dataset_name = self.dataset_name
            info.config_name = self.config.name
            info.version = self.config.version
        self.info = info
        # update info with user specified infos
        if features is not None:
            self.info.features = features

        # Prepare data dirs:
        # cache_dir can be a remote bucket on GCS or S3
        self._cache_dir_root = str(cache_dir or config.HF_DATASETS_CACHE)
        self._cache_dir_root = (
            self._cache_dir_root if is_remote_url(self._cache_dir_root) else os.path.expanduser(self._cache_dir_root)
        )
        self._cache_downloaded_dir = (
            posixpath.join(self._cache_dir_root, config.DOWNLOADED_DATASETS_DIR)
            if cache_dir
            else str(config.DOWNLOADED_DATASETS_PATH)
        )
        self._cache_downloaded_dir = (
            self._cache_downloaded_dir
            if is_remote_url(self._cache_downloaded_dir)
            else os.path.expanduser(self._cache_downloaded_dir)
        )

        # In case there exists a legacy cache directory
        self._legacy_relative_data_dir = None

        self._cache_dir = self._build_cache_dir()
        if not is_remote_url(self._cache_dir_root):
            os.makedirs(self._cache_dir_root, exist_ok=True)
            # Guard cache-dir inspection/cleanup against concurrent builders with a file lock.
            lock_path = os.path.join(
                self._cache_dir_root, Path(self._cache_dir).as_posix().replace("/", "_") + ".lock"
            )
            with FileLock(lock_path):
                if os.path.exists(self._cache_dir):  # check if data exist
                    if len(os.listdir(self._cache_dir)) > 0:
                        if os.path.exists(os.path.join(self._cache_dir, config.DATASET_INFO_FILENAME)):
                            logger.debug("Overwrite dataset info from restored data version if exists.")
                            self.info = DatasetInfo.from_directory(self._cache_dir)
                    else:  # dir exists but no data, remove the empty dir as data aren't available anymore
                        logger.warning(
                            f"Old caching folder {self._cache_dir} for dataset {self.dataset_name} exists but no data were found. Removing it. "
                        )
                        os.rmdir(self._cache_dir)

        # Store in the cache by default unless the user specifies a custom output_dir to download_and_prepare
        self._output_dir = self._cache_dir
        self._fs: fsspec.AbstractFileSystem = fsspec.filesystem("file")

        # Set download manager
        self.dl_manager = None

        # Set to True by "datasets-cli test" to generate file checksums for (deprecated) dataset_infos.json independently of verification_mode value.
        self._record_infos = False

        # Set in `.download_and_prepare` once the format of the generated dataset is known
        self._file_format = None

        # Enable streaming (e.g. it patches "open" to work with remote files)
        extend_dataset_builder_for_streaming(self)
417
+
418
    def __getstate__(self):
        """Pickle the builder through its attribute dict (streaming patches are restored in `__setstate__`)."""
        return self.__dict__
420
+
421
    def __setstate__(self, d):
        """Restore the builder from its pickled attribute dict and re-apply streaming patches."""
        self.__dict__ = d
        # Re-enable streaming, since patched functions are not kept when pickling
        extend_dataset_builder_for_streaming(self)
425
+
426
+ # Must be set for datasets that use 'data_dir' functionality - the ones
427
+ # that require users to do additional steps to download the data
428
+ # (this is usually due to some external regulations / rules).
429
+ # This field should contain a string with user instructions, including
430
+ # the list of files that should be present. It will be
431
+ # displayed in the dataset documentation.
432
+ @property
433
+ def manual_download_instructions(self) -> Optional[str]:
434
+ return None
435
+
436
+ def _check_legacy_cache(self) -> Optional[str]:
437
+ """Check for the old cache directory template {cache_dir}/{namespace}___{builder_name} from 2.13"""
438
+ if (
439
+ self.__module__.startswith("datasets.")
440
+ and not is_remote_url(self._cache_dir_root)
441
+ and self.config.name == "default"
442
+ ):
443
+ from .packaged_modules import _PACKAGED_DATASETS_MODULES
444
+
445
+ namespace = self.repo_id.split("/")[0] if self.repo_id and self.repo_id.count("/") > 0 else None
446
+ config_name = self.repo_id.replace("/", "--") if self.repo_id is not None else self.dataset_name
447
+ config_id = config_name + self.config_id[len(self.config.name) :]
448
+ hash = _PACKAGED_DATASETS_MODULES.get(self.name, "missing")[1]
449
+ legacy_relative_data_dir = posixpath.join(
450
+ self.dataset_name if namespace is None else f"{namespace}___{self.dataset_name}",
451
+ config_id,
452
+ "0.0.0",
453
+ hash,
454
+ )
455
+ legacy_cache_dir = posixpath.join(self._cache_dir_root, legacy_relative_data_dir)
456
+ if os.path.isdir(legacy_cache_dir):
457
+ return legacy_relative_data_dir
458
+
459
+ def _check_legacy_cache2(self, dataset_module: "DatasetModule") -> Optional[str]:
460
+ """Check for the old cache directory template {cache_dir}/{namespace}___{dataset_name}/{config_name}-xxx from 2.14 and 2.15"""
461
+ if (
462
+ self.__module__.startswith("datasets.")
463
+ and not is_remote_url(self._cache_dir_root)
464
+ and not (set(self.config_kwargs) - {"data_files", "data_dir"})
465
+ ):
466
+ from .packaged_modules import _PACKAGED_DATASETS_MODULES_2_15_HASHES
467
+ from .utils._dill import Pickler
468
+
469
+ def update_hash_with_config_parameters(hash: str, config_parameters: dict) -> str:
470
+ """
471
+ Used to update hash of packaged modules which is used for creating unique cache directories to reflect
472
+ different config parameters which are passed in metadata from readme.
473
+ """
474
+ params_to_exclude = {"config_name", "version", "description"}
475
+ params_to_add_to_hash = {
476
+ param: value
477
+ for param, value in sorted(config_parameters.items())
478
+ if param not in params_to_exclude
479
+ }
480
+ m = Hasher()
481
+ m.update(hash)
482
+ m.update(params_to_add_to_hash)
483
+ return m.hexdigest()
484
+
485
+ namespace = self.repo_id.split("/")[0] if self.repo_id and self.repo_id.count("/") > 0 else None
486
+ with patch.object(Pickler, "_legacy_no_dict_keys_sorting", True):
487
+ config_id = self.config.name + "-" + Hasher.hash({"data_files": self.config.data_files})
488
+ hash = _PACKAGED_DATASETS_MODULES_2_15_HASHES.get(self.name, "missing")
489
+ if (
490
+ dataset_module.builder_configs_parameters.metadata_configs
491
+ and self.config.name in dataset_module.builder_configs_parameters.metadata_configs
492
+ ):
493
+ hash = update_hash_with_config_parameters(
494
+ hash, dataset_module.builder_configs_parameters.metadata_configs[self.config.name]
495
+ )
496
+ legacy_relative_data_dir = posixpath.join(
497
+ self.dataset_name if namespace is None else f"{namespace}___{self.dataset_name}",
498
+ config_id,
499
+ "0.0.0",
500
+ hash,
501
+ )
502
+ legacy_cache_dir = posixpath.join(self._cache_dir_root, legacy_relative_data_dir)
503
+ if os.path.isdir(legacy_cache_dir):
504
+ return legacy_relative_data_dir
505
+
506
+ def _create_builder_config(
507
+ self, config_name=None, custom_features=None, config_id=None, **config_kwargs
508
+ ) -> tuple[BuilderConfig, str]:
509
+ """Create and validate BuilderConfig object as well as a unique config id for this config.
510
+ Raises ValueError if there are multiple builder configs and config_name and DEFAULT_CONFIG_NAME are None.
511
+ config_kwargs override the defaults kwargs in config
512
+ """
513
+ builder_config = None
514
+
515
+ # try default config
516
+ if config_name is None and self.BUILDER_CONFIGS:
517
+ if self.DEFAULT_CONFIG_NAME is not None:
518
+ builder_config = self.builder_configs.get(self.DEFAULT_CONFIG_NAME)
519
+ logger.info(f"No config specified, defaulting to: {self.dataset_name}/{builder_config.name}")
520
+ else:
521
+ if len(self.BUILDER_CONFIGS) > 1:
522
+ if not config_kwargs:
523
+ example_of_usage = (
524
+ f"load_dataset('{self.repo_id or self.dataset_name}', '{self.BUILDER_CONFIGS[0].name}')"
525
+ )
526
+ raise ValueError(
527
+ "Config name is missing."
528
+ f"\nPlease pick one among the available configs: {list(self.builder_configs.keys())}"
529
+ + f"\nExample of usage:\n\t`{example_of_usage}`"
530
+ )
531
+ else:
532
+ builder_config = self.BUILDER_CONFIGS[0]
533
+ logger.info(
534
+ f"No config specified, defaulting to the single config: {self.dataset_name}/{builder_config.name}"
535
+ )
536
+
537
+ # try to get config by name
538
+ if isinstance(config_name, str):
539
+ builder_config = self.builder_configs.get(config_name)
540
+ if builder_config is None and self.BUILDER_CONFIGS:
541
+ raise ValueError(
542
+ f"BuilderConfig '{config_name}' not found. Available: {list(self.builder_configs.keys())}"
543
+ )
544
+
545
+ # if not using an existing config, then create a new config on the fly
546
+ if not builder_config:
547
+ if config_name is not None:
548
+ config_kwargs["name"] = config_name
549
+ elif self.DEFAULT_CONFIG_NAME and not config_kwargs:
550
+ # Use DEFAULT_CONFIG_NAME only if no config_kwargs are passed
551
+ config_kwargs["name"] = self.DEFAULT_CONFIG_NAME
552
+ if "version" not in config_kwargs and hasattr(self, "VERSION") and self.VERSION:
553
+ config_kwargs["version"] = self.VERSION
554
+ builder_config = self.BUILDER_CONFIG_CLASS(**config_kwargs)
555
+
556
+ # otherwise use the config_kwargs to overwrite the attributes
557
+ else:
558
+ builder_config = copy.deepcopy(builder_config) if config_kwargs else builder_config
559
+ for key, value in config_kwargs.items():
560
+ if value is not None:
561
+ if not hasattr(builder_config, key):
562
+ raise ValueError(f"BuilderConfig {builder_config} doesn't have a '{key}' key.")
563
+ setattr(builder_config, key, value)
564
+
565
+ if not builder_config.name:
566
+ raise ValueError(f"BuilderConfig must have a name, got {builder_config.name}")
567
+
568
+ # resolve data files if needed
569
+ builder_config._resolve_data_files(
570
+ base_path=self.base_path,
571
+ download_config=DownloadConfig(token=self.token, storage_options=self.storage_options),
572
+ )
573
+
574
+ # compute the config id that is going to be used for caching
575
+ if config_id is None:
576
+ config_id = builder_config.create_config_id(
577
+ config_kwargs,
578
+ custom_features=custom_features,
579
+ )
580
+ is_custom = (config_id not in self.builder_configs) and config_id != "default"
581
+ if is_custom:
582
+ logger.info(f"Using custom data configuration {config_id}")
583
+ else:
584
+ if (
585
+ builder_config.name in self.builder_configs
586
+ and builder_config != self.builder_configs[builder_config.name]
587
+ ):
588
+ raise ValueError(
589
+ "Cannot name a custom BuilderConfig the same as an available "
590
+ f"BuilderConfig. Change the name. Available BuilderConfigs: {list(self.builder_configs.keys())}"
591
+ )
592
+ if not builder_config.version:
593
+ raise ValueError(f"BuilderConfig {builder_config.name} must have a version")
594
+
595
+ return builder_config, config_id
596
+
597
+ @classproperty
598
+ @classmethod
599
+ @memoize()
600
+ def builder_configs(cls) -> dict[str, BuilderConfig]:
601
+ """Dictionary of pre-defined configurations for this builder class."""
602
+ configs = {config.name: config for config in cls.BUILDER_CONFIGS}
603
+ if len(configs) != len(cls.BUILDER_CONFIGS):
604
+ names = [config.name for config in cls.BUILDER_CONFIGS]
605
+ raise ValueError(f"Names in BUILDER_CONFIGS must not be duplicated. Got {names}")
606
+ return configs
607
+
608
+ @property
609
+ def cache_dir(self):
610
+ return self._cache_dir
611
+
612
+ def _use_legacy_cache_dir_if_possible(self, dataset_module: "DatasetModule"):
613
+ # Check for the legacy cache directory template (datasets<3.0.0)
614
+ self._legacy_relative_data_dir = (
615
+ self._check_legacy_cache2(dataset_module) or self._check_legacy_cache() or None
616
+ )
617
+ self._cache_dir = self._build_cache_dir()
618
+ self._output_dir = self._cache_dir
619
+
620
+ def _relative_data_dir(self, with_version=True, with_hash=True) -> str:
621
+ """Relative path of this dataset in cache_dir:
622
+ Will be:
623
+ self.dataset_name/self.config.version/self.hash/
624
+ or if a repo_id with a namespace has been specified:
625
+ self.namespace___self.dataset_name/self.config.version/self.hash/
626
+ If any of these element is missing or if ``with_version=False`` the corresponding subfolders are dropped.
627
+ """
628
+ if self._legacy_relative_data_dir is not None and with_version and with_hash:
629
+ return self._legacy_relative_data_dir
630
+
631
+ namespace = self.repo_id.split("/")[0] if self.repo_id and self.repo_id.count("/") > 0 else None
632
+ builder_data_dir = self.dataset_name if namespace is None else f"{namespace}___{self.dataset_name}"
633
+ builder_data_dir = posixpath.join(builder_data_dir, self.config_id)
634
+ if with_version:
635
+ builder_data_dir = posixpath.join(builder_data_dir, str(self.config.version))
636
+ if with_hash and self.hash and isinstance(self.hash, str):
637
+ builder_data_dir = posixpath.join(builder_data_dir, self.hash)
638
+ return builder_data_dir
639
+
640
+ def _build_cache_dir(self):
641
+ """Return the data directory for the current version."""
642
+ builder_data_dir = posixpath.join(self._cache_dir_root, self._relative_data_dir(with_version=False))
643
+ version_data_dir = posixpath.join(self._cache_dir_root, self._relative_data_dir(with_version=True))
644
+
645
+ def _other_versions_on_disk():
646
+ """Returns previous versions on disk."""
647
+ if not os.path.exists(builder_data_dir):
648
+ return []
649
+
650
+ version_dirnames = []
651
+ for dir_name in os.listdir(builder_data_dir):
652
+ try:
653
+ version_dirnames.append((utils.Version(dir_name), dir_name))
654
+ except ValueError: # Invalid version (ex: incomplete data dir)
655
+ pass
656
+ version_dirnames.sort(reverse=True)
657
+ return version_dirnames
658
+
659
+ # Check and warn if other versions exist
660
+ if not is_remote_url(builder_data_dir):
661
+ version_dirs = _other_versions_on_disk()
662
+ if version_dirs:
663
+ other_version = version_dirs[0][0]
664
+ if other_version != self.config.version:
665
+ warn_msg = (
666
+ f"Found a different version {str(other_version)} of dataset {self.dataset_name} in "
667
+ f"cache_dir {self._cache_dir_root}. Using currently defined version "
668
+ f"{str(self.config.version)}."
669
+ )
670
+ logger.warning(warn_msg)
671
+
672
+ return version_data_dir
673
+
674
+ @abc.abstractmethod
675
+ def _info(self) -> DatasetInfo:
676
+ """Construct the DatasetInfo object. See `DatasetInfo` for details.
677
+
678
+ Warning: This function is only called once and the result is cached for all
679
+ following .info() calls.
680
+
681
+ Returns:
682
+ info: (DatasetInfo) The dataset information
683
+ """
684
+ raise NotImplementedError
685
+
686
+ @classmethod
687
+ def get_imported_module_dir(cls):
688
+ """Return the path of the module of this class or subclass."""
689
+ return os.path.dirname(inspect.getfile(inspect.getmodule(cls)))
690
+
691
+ def _rename(self, src: str, dst: str):
692
+ rename(self._fs, src, dst)
693
+
694
+ def download_and_prepare(
695
+ self,
696
+ output_dir: Optional[str] = None,
697
+ download_config: Optional[DownloadConfig] = None,
698
+ download_mode: Optional[Union[DownloadMode, str]] = None,
699
+ verification_mode: Optional[Union[VerificationMode, str]] = None,
700
+ dl_manager: Optional[DownloadManager] = None,
701
+ base_path: Optional[str] = None,
702
+ file_format: str = "arrow",
703
+ max_shard_size: Optional[Union[int, str]] = None,
704
+ num_proc: Optional[int] = None,
705
+ storage_options: Optional[dict] = None,
706
+ **download_and_prepare_kwargs,
707
+ ):
708
+ """Downloads and prepares dataset for reading.
709
+
710
+ Args:
711
+ output_dir (`str`, *optional*):
712
+ Output directory for the dataset.
713
+ Default to this builder's `cache_dir`, which is inside `~/.cache/huggingface/datasets` by default.
714
+
715
+ <Added version="2.5.0"/>
716
+ download_config (`DownloadConfig`, *optional*):
717
+ Specific download configuration parameters.
718
+ download_mode ([`DownloadMode`] or `str`, *optional*):
719
+ Select the download/generate mode, default to `REUSE_DATASET_IF_EXISTS`.
720
+ verification_mode ([`VerificationMode`] or `str`, defaults to `BASIC_CHECKS`):
721
+ Verification mode determining the checks to run on the downloaded/processed dataset information (checksums/size/splits/...).
722
+
723
+ <Added version="2.9.1"/>
724
+ dl_manager (`DownloadManager`, *optional*):
725
+ Specific `DownloadManger` to use.
726
+ base_path (`str`, *optional*):
727
+ Base path for relative paths that are used to download files. This can be a remote url.
728
+ If not specified, the value of the `base_path` attribute (`self.base_path`) will be used instead.
729
+ file_format (`str`, *optional*):
730
+ Format of the data files in which the dataset will be written.
731
+ Supported formats: "arrow", "parquet". Default to "arrow" format.
732
+ If the format is "parquet", then image and audio data are embedded into the Parquet files instead of pointing to local files.
733
+
734
+ <Added version="2.5.0"/>
735
+ max_shard_size (`Union[str, int]`, *optional*):
736
+ Maximum number of bytes written per shard, default is "500MB".
737
+ The size is based on uncompressed data size, so in practice your shard files may be smaller than
738
+ `max_shard_size` thanks to Parquet compression for example.
739
+
740
+ <Added version="2.5.0"/>
741
+ num_proc (`int`, *optional*, defaults to `None`):
742
+ Number of processes when downloading and generating the dataset locally.
743
+ Multiprocessing is disabled by default.
744
+
745
+ <Added version="2.7.0"/>
746
+ storage_options (`dict`, *optional*):
747
+ Key/value pairs to be passed on to the caching file-system backend, if any.
748
+
749
+ <Added version="2.5.0"/>
750
+ **download_and_prepare_kwargs (additional keyword arguments): Keyword arguments.
751
+
752
+ Example:
753
+
754
+ Download and prepare the dataset as Arrow files that can be loaded as a Dataset using `builder.as_dataset()`:
755
+
756
+ ```py
757
+ >>> from datasets import load_dataset_builder
758
+ >>> builder = load_dataset_builder("cornell-movie-review-data/rotten_tomatoes")
759
+ >>> builder.download_and_prepare()
760
+ ```
761
+
762
+ Download and prepare the dataset as sharded Parquet files locally:
763
+
764
+ ```py
765
+ >>> from datasets import load_dataset_builder
766
+ >>> builder = load_dataset_builder("cornell-movie-review-data/rotten_tomatoes")
767
+ >>> builder.download_and_prepare("./output_dir", file_format="parquet")
768
+ ```
769
+
770
+ Download and prepare the dataset as sharded Parquet files in a cloud storage:
771
+
772
+ ```py
773
+ >>> from datasets import load_dataset_builder
774
+ >>> storage_options = {"key": aws_access_key_id, "secret": aws_secret_access_key}
775
+ >>> builder = load_dataset_builder("cornell-movie-review-data/rotten_tomatoes")
776
+ >>> builder.download_and_prepare("s3://my-bucket/my_rotten_tomatoes", storage_options=storage_options, file_format="parquet")
777
+ ```
778
+ """
779
+ output_dir = output_dir if output_dir is not None else self._cache_dir
780
+ # output_dir can be a remote bucket on GCS or S3
781
+ fs, output_dir = url_to_fs(output_dir, **(storage_options or {}))
782
+ self._fs = fs
783
+ self._output_dir = output_dir if not is_remote_filesystem(self._fs) else self._fs.unstrip_protocol(output_dir)
784
+
785
+ download_mode = DownloadMode(download_mode or DownloadMode.REUSE_DATASET_IF_EXISTS)
786
+ verification_mode = VerificationMode(verification_mode or VerificationMode.BASIC_CHECKS)
787
+ base_path = base_path if base_path is not None else self.base_path
788
+
789
+ if file_format is not None and file_format not in ["arrow", "parquet"]:
790
+ raise ValueError(f"Unsupported file_format: {file_format}. Expected 'arrow' or 'parquet'")
791
+ self._file_format = file_format
792
+
793
+ if self._fs._strip_protocol(self._output_dir) == "":
794
+ # We don't support the root directory, because it has no dirname,
795
+ # and we need a dirname to use a <dirname>.incomplete directory
796
+ # when the dataset is being written
797
+ raise RuntimeError(
798
+ f"Unable to download and prepare the dataset at the root {self._output_dir}. "
799
+ f"Please specify a subdirectory, e.g. '{self._output_dir + self.dataset_name}'"
800
+ )
801
+
802
+ if dl_manager is None:
803
+ if download_config is None:
804
+ download_config = DownloadConfig(
805
+ cache_dir=self._cache_downloaded_dir,
806
+ force_download=download_mode == DownloadMode.FORCE_REDOWNLOAD,
807
+ force_extract=download_mode == DownloadMode.FORCE_REDOWNLOAD,
808
+ use_etag=False,
809
+ num_proc=num_proc,
810
+ token=self.token,
811
+ storage_options=self.storage_options,
812
+ ) # We don't use etag for data files to speed up the process
813
+
814
+ dl_manager = DownloadManager(
815
+ dataset_name=self.dataset_name,
816
+ download_config=download_config,
817
+ data_dir=self.config.data_dir,
818
+ base_path=base_path,
819
+ record_checksums=(self._record_infos or verification_mode == VerificationMode.ALL_CHECKS),
820
+ )
821
+
822
+ is_local = not is_remote_filesystem(self._fs)
823
+ self.dl_manager = dl_manager
824
+
825
+ # Prevent parallel local disk operations
826
+ if is_local:
827
+ # Create parent directory of the output_dir to put the lock file in there
828
+ Path(self._output_dir).parent.mkdir(parents=True, exist_ok=True)
829
+ lock_path = self._output_dir + "_builder.lock"
830
+
831
+ # File locking only with local paths; no file locking on GCS or S3
832
+ with FileLock(lock_path) if is_local else contextlib.nullcontext():
833
+ # Check if the data already exists
834
+ data_exists = self._fs.exists(posixpath.join(self._output_dir, config.DATASET_INFO_FILENAME))
835
+ if data_exists and download_mode == DownloadMode.REUSE_DATASET_IF_EXISTS:
836
+ logger.info(f"Found cached dataset {self.dataset_name} ({self._output_dir})")
837
+ # We need to update the info in case some splits were added in the meantime
838
+ # for example when calling load_dataset from multiple workers.
839
+ self.info = self._load_info()
840
+ self.download_post_processing_resources(dl_manager)
841
+ return
842
+
843
+ logger.info(f"Generating dataset {self.dataset_name} ({self._output_dir})")
844
+ if is_local: # if cache dir is local, check for available space
845
+ if not has_sufficient_disk_space(
846
+ self.info.size_in_bytes or 0, directory=Path(self._output_dir).parent
847
+ ):
848
+ raise OSError(
849
+ f"Not enough disk space. Needed: {size_str(self.info.size_in_bytes or 0)} (download: {size_str(self.info.download_size or 0)}, generated: {size_str(self.info.dataset_size or 0)}, post-processed: {size_str(self.info.post_processing_size or 0)})"
850
+ )
851
+
852
+ @contextlib.contextmanager
853
+ def incomplete_dir(dirname):
854
+ """Create temporary dir for dirname and rename on exit."""
855
+ if not is_local:
856
+ self._fs.makedirs(dirname, exist_ok=True)
857
+ yield dirname
858
+ else:
859
+ tmp_dir = dirname + ".incomplete"
860
+ os.makedirs(tmp_dir, exist_ok=True)
861
+ try:
862
+ yield tmp_dir
863
+ if os.path.isdir(dirname):
864
+ shutil.rmtree(dirname)
865
+ # LocalFileSystem.mv does copy + rm, it is more efficient to simply rename a local directory
866
+ shutil.move(tmp_dir, dirname)
867
+ finally:
868
+ if os.path.exists(tmp_dir):
869
+ shutil.rmtree(tmp_dir)
870
+
871
+ # Print is intentional: we want this to always go to stdout so user has
872
+ # information needed to cancel download/preparation if needed.
873
+ # This comes right before the progress bar.
874
+ if self.info.size_in_bytes:
875
+ logger.info(
876
+ f"Downloading and preparing dataset {self.dataset_name}/{self.config.name} "
877
+ f"(download: {size_str(self.info.download_size)}, generated: {size_str(self.info.dataset_size)}, "
878
+ f"post-processed: {size_str(self.info.post_processing_size)}, "
879
+ f"total: {size_str(self.info.size_in_bytes)}) to {self._output_dir}..."
880
+ )
881
+ else:
882
+ _dest = self._fs._strip_protocol(self._output_dir) if is_local else self._output_dir
883
+ logger.info(f"Downloading and preparing dataset {self.dataset_name}/{self.config.name} to {_dest}...")
884
+
885
+ self._check_manual_download(dl_manager)
886
+
887
+ # Create a tmp dir and rename to self._output_dir on successful exit.
888
+ with incomplete_dir(self._output_dir) as tmp_output_dir:
889
+ # Temporarily assign _output_dir to tmp_data_dir to avoid having to forward
890
+ # it to every sub function.
891
+ with temporary_assignment(self, "_output_dir", tmp_output_dir):
892
+ prepare_split_kwargs = {"file_format": file_format}
893
+ if max_shard_size is not None:
894
+ prepare_split_kwargs["max_shard_size"] = max_shard_size
895
+ if num_proc is not None:
896
+ prepare_split_kwargs["num_proc"] = num_proc
897
+ self._download_and_prepare(
898
+ dl_manager=dl_manager,
899
+ verification_mode=verification_mode,
900
+ **prepare_split_kwargs,
901
+ **download_and_prepare_kwargs,
902
+ )
903
+ # Sync info
904
+ self.info.dataset_size = sum(split.num_bytes for split in self.info.splits.values())
905
+ self.info.download_checksums = dl_manager.get_recorded_sizes_checksums()
906
+ if self.info.download_size is not None:
907
+ self.info.size_in_bytes = self.info.dataset_size + self.info.download_size
908
+ # Save info
909
+ self._save_info()
910
+
911
+ # Download post processing resources
912
+ self.download_post_processing_resources(dl_manager)
913
+
914
+ logger.info(
915
+ f"Dataset {self.dataset_name} downloaded and prepared to {self._output_dir}. "
916
+ f"Subsequent calls will reuse this data."
917
+ )
918
+
919
+ def _check_manual_download(self, dl_manager):
920
+ if self.manual_download_instructions is not None and dl_manager.manual_dir is None:
921
+ raise ManualDownloadError(
922
+ textwrap.dedent(
923
+ f"""\
924
+ The dataset {self.dataset_name} with config {self.config.name} requires manual data.
925
+ Please follow the manual download instructions:
926
+ {self.manual_download_instructions}
927
+ Manual data can be loaded with:
928
+ datasets.load_dataset("{self.repo_id or self.dataset_name}", data_dir="<path/to/manual/data>")"""
929
+ )
930
+ )
931
+
932
+ def _download_and_prepare(self, dl_manager, verification_mode, **prepare_split_kwargs):
933
+ """Downloads and prepares dataset for reading.
934
+
935
+ This is the internal implementation to overwrite called when user calls
936
+ `download_and_prepare`. It should download all required data and generate
937
+ the pre-processed datasets files.
938
+
939
+ Args:
940
+ dl_manager ([`DownloadManager`]):
941
+ `DownloadManager` used to download and cache data.
942
+ verification_mode ([`VerificationMode`]):
943
+ if `ALL_CHECKS`, perform all the verifications including checksums.
944
+ if `BASIC_CHECKS`, do not perform checksums, only perform split tests.
945
+ if `NO_CHECKS`, do not perform any verification.
946
+ prepare_split_kwargs: Additional options, such as `file_format`, `max_shard_size`
947
+ """
948
+ # Generating data for all splits
949
+ split_dict = SplitDict(dataset_name=self.dataset_name)
950
+ split_generators_kwargs = self._make_split_generators_kwargs(prepare_split_kwargs)
951
+ split_generators = self._split_generators(dl_manager, **split_generators_kwargs)
952
+
953
+ # Checksums verification
954
+ if verification_mode == VerificationMode.ALL_CHECKS and dl_manager.record_checksums:
955
+ verify_checksums(
956
+ self.info.download_checksums, dl_manager.get_recorded_sizes_checksums(), "dataset source files"
957
+ )
958
+
959
+ # Build splits
960
+ for split_generator in split_generators:
961
+ if str(split_generator.split_info.name).lower() == "all":
962
+ raise ValueError(
963
+ "`all` is a special split keyword corresponding to the "
964
+ "union of all splits, so cannot be used as key in "
965
+ "._split_generator()."
966
+ )
967
+
968
+ logger.info(f"Generating {split_generator.split_info.name} split")
969
+ split_dict.add(split_generator.split_info)
970
+
971
+ try:
972
+ # Prepare split will record examples associated to the split
973
+ self._prepare_split(split_generator, **prepare_split_kwargs)
974
+ except OSError as e:
975
+ raise OSError(
976
+ "Cannot find data file. "
977
+ + (self.manual_download_instructions or "")
978
+ + "\nOriginal error:\n"
979
+ + str(e)
980
+ ) from None
981
+ # If check_duplicates is set to True , then except DuplicatedKeysError
982
+ except DuplicatedKeysError as e:
983
+ raise DuplicatedKeysError(
984
+ e.key,
985
+ e.duplicate_key_indices,
986
+ fix_msg=f"To avoid duplicate keys, please fix the dataset splits for {self.name}",
987
+ ) from None
988
+ dl_manager.manage_extracted_files()
989
+
990
+ if verification_mode == VerificationMode.BASIC_CHECKS or verification_mode == VerificationMode.ALL_CHECKS:
991
+ verify_splits(self.info.splits, split_dict)
992
+
993
+ # Update the info object with the splits.
994
+ self.info.splits = split_dict
995
+ self.info.download_size = dl_manager.downloaded_size
996
+
997
+ def download_post_processing_resources(self, dl_manager):
998
+ for split in self.info.splits or []:
999
+ for resource_name, resource_file_name in self._post_processing_resources(split).items():
1000
+ if not not is_remote_filesystem(self._fs):
1001
+ raise NotImplementedError(f"Post processing is not supported on filesystem {self._fs}")
1002
+ if os.sep in resource_file_name:
1003
+ raise ValueError(f"Resources shouldn't be in a sub-directory: {resource_file_name}")
1004
+ resource_path = os.path.join(self._output_dir, resource_file_name)
1005
+ if not os.path.exists(resource_path):
1006
+ downloaded_resource_path = self._download_post_processing_resources(
1007
+ split, resource_name, dl_manager
1008
+ )
1009
+ if downloaded_resource_path:
1010
+ logger.info(f"Downloaded post-processing resource {resource_name} as {resource_file_name}")
1011
+ shutil.move(downloaded_resource_path, resource_path)
1012
+
1013
+ def _load_info(self) -> DatasetInfo:
1014
+ return DatasetInfo.from_directory(self._output_dir, storage_options=self._fs.storage_options)
1015
+
1016
+ def _save_info(self):
1017
+ file_lock = (
1018
+ FileLock(self._output_dir + "_info.lock")
1019
+ if not is_remote_filesystem(self._fs)
1020
+ else contextlib.nullcontext()
1021
+ )
1022
+ with file_lock:
1023
+ self.info.write_to_directory(self._output_dir, storage_options=self._fs.storage_options)
1024
+
1025
+ def _make_split_generators_kwargs(self, prepare_split_kwargs):
1026
+ """Get kwargs for `self._split_generators()` from `prepare_split_kwargs`."""
1027
+ del prepare_split_kwargs
1028
+ return {}
1029
+
1030
+ def as_dataset(
1031
+ self,
1032
+ split: Optional[Union[str, Split, list[str], list[Split]]] = None,
1033
+ run_post_process=True,
1034
+ verification_mode: Optional[Union[VerificationMode, str]] = None,
1035
+ in_memory=False,
1036
+ ) -> Union[Dataset, DatasetDict]:
1037
+ """Return a Dataset for the specified split.
1038
+
1039
+ Args:
1040
+ split (`datasets.Split`):
1041
+ Which subset of the data to return.
1042
+ run_post_process (`bool`, defaults to `True`):
1043
+ Whether to run post-processing dataset transforms and/or add
1044
+ indexes.
1045
+ verification_mode ([`VerificationMode`] or `str`, defaults to `BASIC_CHECKS`):
1046
+ Verification mode determining the checks to run on the
1047
+ downloaded/processed dataset information (checksums/size/splits/...).
1048
+
1049
+ <Added version="2.9.1"/>
1050
+ in_memory (`bool`, defaults to `False`):
1051
+ Whether to copy the data in-memory.
1052
+
1053
+ Returns:
1054
+ datasets.Dataset
1055
+
1056
+ Example:
1057
+
1058
+ ```py
1059
+ >>> from datasets import load_dataset_builder
1060
+ >>> builder = load_dataset_builder('cornell-movie-review-data/rotten_tomatoes')
1061
+ >>> builder.download_and_prepare()
1062
+ >>> ds = builder.as_dataset(split='train')
1063
+ >>> ds
1064
+ Dataset({
1065
+ features: ['text', 'label'],
1066
+ num_rows: 8530
1067
+ })
1068
+ ```
1069
+ """
1070
+ if self._file_format is not None and self._file_format != "arrow":
1071
+ raise FileFormatError('Loading a dataset not written in the "arrow" format is not supported.')
1072
+ if is_remote_filesystem(self._fs):
1073
+ raise NotImplementedError(f"Loading a dataset cached in a {type(self._fs).__name__} is not supported.")
1074
+ if not os.path.exists(self._output_dir):
1075
+ raise FileNotFoundError(
1076
+ f"Dataset {self.dataset_name}: could not find data in {self._output_dir}. Please make sure to call "
1077
+ "builder.download_and_prepare(), or use "
1078
+ "datasets.load_dataset() before trying to access the Dataset object."
1079
+ )
1080
+
1081
+ logger.debug(f"Constructing Dataset for split {split or ', '.join(self.info.splits)}, from {self._output_dir}")
1082
+
1083
+ # By default, return all splits
1084
+ if split is None:
1085
+ split = {s: s for s in self.info.splits}
1086
+
1087
+ verification_mode = VerificationMode(verification_mode or VerificationMode.BASIC_CHECKS)
1088
+
1089
+ # Create a dataset for each of the given splits
1090
+ datasets = map_nested(
1091
+ partial(
1092
+ self._build_single_dataset,
1093
+ run_post_process=run_post_process,
1094
+ verification_mode=verification_mode,
1095
+ in_memory=in_memory,
1096
+ ),
1097
+ split,
1098
+ map_tuple=True,
1099
+ disable_tqdm=True,
1100
+ )
1101
+ if isinstance(datasets, dict):
1102
+ datasets = DatasetDict(datasets)
1103
+ return datasets
1104
+
1105
+ def _build_single_dataset(
1106
+ self,
1107
+ split: Union[str, ReadInstruction, Split],
1108
+ run_post_process: bool,
1109
+ verification_mode: VerificationMode,
1110
+ in_memory: bool = False,
1111
+ ):
1112
+ """as_dataset for a single split."""
1113
+ if not isinstance(split, ReadInstruction):
1114
+ split = str(split)
1115
+ if split == "all":
1116
+ split = "+".join(self.info.splits.keys())
1117
+ split = Split(split)
1118
+
1119
+ # Build base dataset
1120
+ ds = self._as_dataset(
1121
+ split=split,
1122
+ in_memory=in_memory,
1123
+ )
1124
+ if run_post_process:
1125
+ for resource_file_name in self._post_processing_resources(split).values():
1126
+ if os.sep in resource_file_name:
1127
+ raise ValueError(f"Resources shouldn't be in a sub-directory: {resource_file_name}")
1128
+ resources_paths = {
1129
+ resource_name: os.path.join(self._output_dir, resource_file_name)
1130
+ for resource_name, resource_file_name in self._post_processing_resources(split).items()
1131
+ }
1132
+ post_processed = self._post_process(ds, resources_paths)
1133
+ if post_processed is not None:
1134
+ ds = post_processed
1135
+ recorded_checksums = {}
1136
+ record_checksums = False
1137
+ for resource_name, resource_path in resources_paths.items():
1138
+ size_checksum = get_size_checksum_dict(resource_path)
1139
+ recorded_checksums[resource_name] = size_checksum
1140
+ if verification_mode == VerificationMode.ALL_CHECKS and record_checksums:
1141
+ if self.info.post_processed is None or self.info.post_processed.resources_checksums is None:
1142
+ expected_checksums = None
1143
+ else:
1144
+ expected_checksums = self.info.post_processed.resources_checksums.get(split)
1145
+ verify_checksums(expected_checksums, recorded_checksums, "post processing resources")
1146
+ if self.info.post_processed is None:
1147
+ self.info.post_processed = PostProcessedInfo()
1148
+ if self.info.post_processed.resources_checksums is None:
1149
+ self.info.post_processed.resources_checksums = {}
1150
+ self.info.post_processed.resources_checksums[str(split)] = recorded_checksums
1151
+ self.info.post_processing_size = sum(
1152
+ checksums_dict["num_bytes"]
1153
+ for split_checksums_dicts in self.info.post_processed.resources_checksums.values()
1154
+ for checksums_dict in split_checksums_dicts.values()
1155
+ )
1156
+ if self.info.dataset_size is not None and self.info.download_size is not None:
1157
+ self.info.size_in_bytes = (
1158
+ self.info.dataset_size + self.info.download_size + self.info.post_processing_size
1159
+ )
1160
+ self._save_info()
1161
+ ds._info.post_processed = self.info.post_processed
1162
+ ds._info.post_processing_size = self.info.post_processing_size
1163
+ ds._info.size_in_bytes = self.info.size_in_bytes
1164
+ if self.info.post_processed.features is not None:
1165
+ if self.info.post_processed.features.type != ds.features.type:
1166
+ raise ValueError(
1167
+ f"Post-processed features info don't match the dataset:\nGot\n{self.info.post_processed.features}\nbut expected something like\n{ds.features}"
1168
+ )
1169
+ else:
1170
+ ds.info.features = self.info.post_processed.features
1171
+
1172
+ return ds
1173
+
1174
def _as_dataset(self, split: Union[ReadInstruction, Split] = Split.TRAIN, in_memory: bool = False) -> Dataset:
    """Read the prepared arrow files and construct a `Dataset`.

    This is the internal implementation to overwrite called when user calls
    `as_dataset`. It reads the pre-processed dataset files and builds
    the `Dataset` object.

    Args:
        split (`datasets.Split`):
            which subset of the data to read.
        in_memory (`bool`, defaults to `False`):
            Whether to copy the data in-memory.

    Returns:
        `Dataset`
    """
    local_cache_dir = self._fs._strip_protocol(self._output_dir)
    # Older cache layouts were keyed on the builder name instead of the dataset name.
    name = self.name if self._check_legacy_cache() else self.dataset_name
    reader = ArrowReader(local_cache_dir, self.info)
    dataset_kwargs = reader.read(
        name=name,
        instructions=split,
        split_infos=self.info.splits.values(),
        in_memory=in_memory,
    )
    return Dataset(fingerprint=self._get_dataset_fingerprint(split), **dataset_kwargs)
1202
+
1203
def _get_dataset_fingerprint(self, split: Union[ReadInstruction, Split]) -> str:
    """The dataset fingerprint is the hash of the relative directory dataset_name/config_name/version/hash, as well as the split specs."""
    hasher = Hasher()
    # Feed the relative cache directory first, then the split spec
    # (e.g. train, train+test, train[:10%], test[:33%](pct1_dropremainder)).
    for component in (Path(self._relative_data_dir()).as_posix(), str(split)):
        hasher.update(component)
    return hasher.hexdigest()
1210
+
1211
def as_streaming_dataset(
    self,
    split: Optional[str] = None,
    base_path: Optional[str] = None,
) -> Union[dict[str, IterableDataset], IterableDataset]:
    """Build streaming dataset(s) for the requested split, or for all splits when `split` is None."""
    if is_remote_filesystem(self._fs):
        raise NotImplementedError(
            f"Loading a streaming dataset cached in a {type(self._fs).__name__} is not supported yet."
        )

    dl_manager = StreamingDownloadManager(
        base_path=base_path or self.base_path,
        download_config=DownloadConfig(token=self.token, storage_options=self.storage_options),
        dataset_name=self.dataset_name,
        data_dir=self.config.data_dir,
    )
    self._check_manual_download(dl_manager)
    available_splits = {sg.name: sg for sg in self._split_generators(dl_manager)}

    # No split requested -> stream every available split.
    if split is None:
        selected = available_splits
    else:
        if split not in available_splits:
            raise ValueError(f"Bad split: {split}. Available splits: {list(available_splits)}")
        selected = available_splits[split]

    # Build one IterableDataset per selected split generator.
    datasets = map_nested(
        self._as_streaming_dataset_single,
        selected,
        map_tuple=True,
    )
    return IterableDatasetDict(datasets) if isinstance(datasets, dict) else datasets
1246
+
1247
def _as_streaming_dataset_single(
    self,
    splits_generator,
) -> IterableDataset:
    """Wrap a single split generator into a streaming `IterableDataset`."""
    # Pass auth along so audio/image files hosted in private repositories can be accessed and decoded.
    token_per_repo_id = {self.repo_id: self.token} if self.repo_id else {}
    examples = self._get_examples_iterable_for_split(splits_generator)
    return IterableDataset(
        examples, info=self.info, split=splits_generator.name, token_per_repo_id=token_per_repo_id
    )
1257
+
1258
def _post_process(self, dataset: Dataset, resources_paths: Mapping[str, str]) -> Optional[Dataset]:
    """Hook for subclasses: run dataset transforms or add indexes after preparation.

    Returns the transformed dataset, or `None` (the default) when no post-processing applies.
    """
    return None
1261
+
1262
+ def _post_processing_resources(self, split: str) -> dict[str, str]:
1263
+ """Mapping resource_name -> resource_file_name"""
1264
+ return {}
1265
+
1266
def _download_post_processing_resources(
    self, split: str, resource_name: str, dl_manager: DownloadManager
) -> Optional[str]:
    """Hook for subclasses: download a post-processing resource with `dl_manager`.

    Returns the downloaded path, or `None` (the default) when there is nothing to download.
    """
    return None
1271
+
1272
@abc.abstractmethod
def _split_generators(self, dl_manager: Union[DownloadManager, StreamingDownloadManager]):
    """Declare the dataset splits and how each one is generated.

    Subclasses return a list of `SplitGenerator`s, each naming a split and
    carrying the `gen_kwargs` later forwarded to `_generate_examples`.

    Example:

    return [
        datasets.SplitGenerator(
            name=datasets.Split.TRAIN,
            gen_kwargs={'file': 'train_data.zip'},
        ),
        datasets.SplitGenerator(
            name=datasets.Split.TEST,
            gen_kwargs={'file': 'test_data.zip'},
        ),
    ]

    With the example above, `_generate_examples(file='train_data.zip')` is
    called first to write the train data, then
    `_generate_examples(file='test_data.zip')` for the test data.

    Datasets are typically split into different subsets used at various
    stages of training and evaluation. When a dataset lacks a `VALIDATION`
    split, a fraction of `TRAIN` can serve for evaluation while iterating on
    a model, to avoid overfitting to `TEST`.

    Use the provided download manager for downloads and extractions; since
    it caches downloads, each generator may safely attempt to fetch the
    source data. A good practice is to download everything here and route
    the relevant parts to each split via `gen_kwargs`.

    Args:
        dl_manager (`Union[DownloadManager, StreamingDownloadManager]`):
            Download manager to download the data

    Returns:
        `list<SplitGenerator>`.
    """
    raise NotImplementedError()
1318
+
1319
@abc.abstractmethod
def _prepare_split(
    self,
    split_generator: SplitGenerator,
    file_format: str = "arrow",
    max_shard_size: Optional[Union[str, int]] = None,
    num_proc: Optional[int] = None,
    **kwargs,
):
    """Generate the split's examples and write them to disk.

    Args:
        split_generator (`SplitGenerator`):
            Split generator to process
        file_format (`str`, *optional*):
            format of the data files in which the dataset will be written.
            Supported formats: "arrow", "parquet". Default to "arrow" format.
        max_shard_size (`Union[str, int]`, *optional*):
            Maximum number of bytes written per shard, default is "500MB".
            The size is based on uncompressed data size, so in practice your
            shard files may be smaller than `max_shard_size` thanks to
            Parquet compression for example.
        num_proc (`int`, *optional*, defaults to `None`):
            Number of processes when downloading and generating the dataset locally.
            Multiprocessing is disabled by default.

            <Added version="2.7.0"/>
        **kwargs: Additional kwargs forwarded from _download_and_prepare
    """
    raise NotImplementedError()
1348
+
1349
def _get_examples_iterable_for_split(self, split_generator: SplitGenerator) -> ExamplesIterable:
    """Produce the split's examples lazily (streaming mode); concrete builders override this.

    Args:
        split_generator (`SplitGenerator`):
            Split generator to process
    """
    raise NotImplementedError()
1357
+
1358
+
1359
class GeneratorBasedBuilder(DatasetBuilder):
    """Base class for datasets with data generation based on dict generators.

    `GeneratorBasedBuilder` is a convenience class that abstracts away much
    of the data writing and reading of `DatasetBuilder`. It expects subclasses to
    implement generators of feature dictionaries across the dataset splits
    (`_split_generators`). See the method docstrings for details.
    """

    @abc.abstractmethod
    def _generate_examples(self, **kwargs):
        """Default function generating examples for each `SplitGenerator`.

        This function preprocess the examples from the raw data to the preprocessed
        dataset files.
        This function is called once for each `SplitGenerator` defined in
        `_split_generators`. The examples yielded here will be written on
        disk.

        Args:
            **kwargs (additional keyword arguments):
                Arguments forwarded from the SplitGenerator.gen_kwargs

        Yields:
            key: `str` or `int`, a unique deterministic example identification key.
                * Unique: An error will be raised if two examples are yield with the
                    same key.
                * Deterministic: When generating the dataset twice, the same example
                    should have the same key.
                Good keys can be the image id, or line number if examples are extracted
                from a text file.
                The key will be hashed and sorted to shuffle examples deterministically,
                such as generating the dataset multiple times keep examples in the
                same order.
            example: `dict<str feature_name, feature_value>`, a feature dictionary
                ready to be encoded and written to disk. The example will be
                encoded with `self.info.features.encode_example({...})`.
        """
        raise NotImplementedError()

    def _prepare_split(
        self,
        split_generator: SplitGenerator,
        check_duplicate_keys: bool,
        file_format="arrow",
        num_proc: Optional[int] = None,
        max_shard_size: Optional[Union[int, str]] = None,
    ):
        """Generate the split's examples and write them to disk, optionally in parallel.

        Dispatches to `_prepare_split_single` either directly (single process) or
        through a `Pool` with one job per group of input shards, then renames the
        produced shard files into their final `-SSSSS-of-NNNNN` names and records
        split statistics (num_examples, num_bytes, shard_lengths).
        """
        max_shard_size = convert_file_size_to_int(max_shard_size or config.MAX_SHARD_SIZE)

        if self.info.splits is not None:
            split_info = self.info.splits[split_generator.name]
        else:
            split_info = split_generator.split_info

        # Filename placeholders: JJJJJ = job id, SSSSS = shard id within the job,
        # NNNNN = total number of shards; they are substituted during the rename step below.
        SUFFIX = "-JJJJJ-SSSSS-of-NNNNN"
        fname = f"{self.dataset_name}-{split_generator.name}{SUFFIX}.{file_format}"
        fpath = posixpath.join(self._output_dir, fname)

        # Multiprocessing can use at most one process per input shard.
        if num_proc and num_proc > 1:
            num_input_shards = _number_of_shards_in_gen_kwargs(split_generator.gen_kwargs)
            if num_input_shards <= 1:
                logger.warning(
                    f"Setting num_proc from {num_proc} back to 1 for the {split_info.name} split to disable multiprocessing as it only contains one shard."
                )
                num_proc = 1
            elif num_input_shards < num_proc:
                logger.warning(
                    f"Setting num_proc from {num_proc} to {num_input_shards} for the {split_info.name} split as it only contains {num_input_shards} shards."
                )
                num_proc = num_input_shards

        pbar = hf_tqdm(
            unit=" examples",
            total=split_info.num_examples,
            desc=f"Generating {split_info.name} split",
        )

        _prepare_split_args = {
            "fpath": fpath,
            "file_format": file_format,
            "max_shard_size": max_shard_size,
            "split_info": split_info,
            "check_duplicate_keys": check_duplicate_keys,
        }

        if num_proc is None or num_proc == 1:
            # Single-process path: one job (job_id=0) handles all gen_kwargs.
            result = None
            gen_kwargs = split_generator.gen_kwargs
            job_id = 0
            with pbar:
                for job_id, done, content in self._prepare_split_single(
                    gen_kwargs=gen_kwargs, job_id=job_id, **_prepare_split_args
                ):
                    if done:
                        result = content
                    else:
                        pbar.update(content)
            # wrapping everything into lists for consistency with the multiprocessed code path
            assert result is not None, "Failed to retrieve results from prepare_split"
            examples_per_job, bytes_per_job, features_per_job, shards_per_job, shard_lengths_per_job = (
                [item] for item in result
            )
        else:
            # Multiprocess path: split gen_kwargs into up to num_proc jobs.
            kwargs_per_job = [
                {"gen_kwargs": gen_kwargs, "job_id": job_id, **_prepare_split_args}
                for job_id, gen_kwargs in enumerate(
                    _split_gen_kwargs(split_generator.gen_kwargs, max_num_jobs=num_proc)
                )
            ]
            num_jobs = len(kwargs_per_job)

            examples_per_job = [None] * num_jobs
            bytes_per_job = [None] * num_jobs
            features_per_job = [None] * num_jobs
            shards_per_job = [None] * num_jobs
            shard_lengths_per_job = [None] * num_jobs

            with Pool(num_proc) as pool:
                with pbar:
                    # Results arrive out of order; index them by job_id.
                    for job_id, done, content in iflatmap_unordered(
                        pool, self._prepare_split_single, kwargs_iterable=kwargs_per_job
                    ):
                        if done:
                            # the content is the result of the job
                            (
                                examples_per_job[job_id],
                                bytes_per_job[job_id],
                                features_per_job[job_id],
                                shards_per_job[job_id],
                                shard_lengths_per_job[job_id],
                            ) = content
                        else:
                            # the content is the number of examples progress update
                            pbar.update(content)

            assert None not in examples_per_job, (
                f"Failed to retrieve results from prepare_split: result list {examples_per_job} still contains None - at least one worker failed to return its results"
            )

        total_shards = sum(shards_per_job)
        total_num_examples = sum(examples_per_job)
        total_num_bytes = sum(bytes_per_job)
        features = features_per_job[0]

        split_generator.split_info.num_examples = total_num_examples
        split_generator.split_info.num_bytes = total_num_bytes

        # should rename everything at the end
        logger.debug(f"Renaming {total_shards} shards.")
        if total_shards > 1:
            # use the -SSSSS-of-NNNNN pattern

            def _rename_shard(shard_and_job: tuple[int]):
                # Map each per-job shard id to a global shard id by offsetting
                # with the shard counts of the preceding jobs.
                shard_id, job_id = shard_and_job
                global_shard_id = sum(shards_per_job[:job_id]) + shard_id
                self._rename(
                    fpath.replace("SSSSS", f"{shard_id:05d}").replace("JJJJJ", f"{job_id:05d}"),
                    fpath.replace("JJJJJ-SSSSS", f"{global_shard_id:05d}").replace("NNNNN", f"{total_shards:05d}"),
                )

            shards_and_jobs = [
                (shard_id, job_id)
                for job_id, num_shards in enumerate(shards_per_job)
                for shard_id in range(num_shards)
            ]
            thread_map(_rename_shard, shards_and_jobs, disable=True, max_workers=64)

            split_generator.split_info.shard_lengths = [
                shard_length for shard_lengths in shard_lengths_per_job for shard_length in shard_lengths
            ]
        else:
            # don't use any pattern
            shard_id, job_id = 0, 0
            self._rename(
                fpath.replace("SSSSS", f"{shard_id:05d}").replace("JJJJJ", f"{job_id:05d}"),
                fpath.replace(SUFFIX, ""),
            )

        # Keep the features inferred during writing if none were declared up front.
        if self.info.features is None:
            self.info.features = features

    def _prepare_split_single(
        self,
        gen_kwargs: dict,
        fpath: str,
        file_format: str,
        max_shard_size: int,
        split_info: SplitInfo,
        check_duplicate_keys: bool,
        job_id: int,
    ) -> Iterable[tuple[int, bool, Union[int, tuple]]]:
        """Write one job's examples to shard files.

        Yields `(job_id, False, num_new_examples)` progress updates while writing,
        then a final `(job_id, True, (total_num_examples, total_num_bytes,
        features, num_shards, shard_lengths))` once the job is finished.
        """
        generator = self._generate_examples(**gen_kwargs)
        writer_class = ParquetWriter if file_format == "parquet" else ArrowWriter
        embed_local_files = file_format == "parquet"
        shard_lengths = []
        total_num_examples, total_num_bytes = 0, 0

        shard_id = 0
        num_examples_progress_update = 0
        try:
            writer = writer_class(
                features=self.info.features,
                path=fpath.replace("SSSSS", f"{shard_id:05d}").replace("JJJJJ", f"{job_id:05d}"),
                writer_batch_size=self._writer_batch_size,
                hash_salt=split_info.name,
                check_duplicates=check_duplicate_keys,
                storage_options=self._fs.storage_options,
                embed_local_files=embed_local_files,
            )
            try:
                _time = time.time()
                for key, record in generator:
                    # Current shard reached max_shard_size: finalize it and roll over to a new shard file.
                    if max_shard_size is not None and writer._num_bytes > max_shard_size:
                        num_examples, num_bytes = writer.finalize()
                        writer.close()
                        shard_lengths.append(num_examples)
                        total_num_examples += num_examples
                        total_num_bytes += num_bytes
                        shard_id += 1
                        writer = writer_class(
                            features=writer._features,
                            path=fpath.replace("SSSSS", f"{shard_id:05d}").replace("JJJJJ", f"{job_id:05d}"),
                            writer_batch_size=self._writer_batch_size,
                            hash_salt=split_info.name,
                            check_duplicates=check_duplicate_keys,
                            storage_options=self._fs.storage_options,
                            embed_local_files=embed_local_files,
                        )
                    example = self.info.features.encode_example(record) if self.info.features is not None else record
                    writer.write(example, key)
                    num_examples_progress_update += 1
                    # Throttle progress updates to the configured refresh interval.
                    if time.time() > _time + config.PBAR_REFRESH_TIME_INTERVAL:
                        _time = time.time()
                        yield job_id, False, num_examples_progress_update
                        num_examples_progress_update = 0
            finally:
                # Flush the remaining progress and finalize the last (possibly only) shard.
                yield job_id, False, num_examples_progress_update
                num_shards = shard_id + 1
                num_examples, num_bytes = writer.finalize()
                writer.close()
                shard_lengths.append(num_examples)
                total_num_examples += num_examples
                total_num_bytes += num_bytes
        except Exception as e:
            # Ignore the writer's error for no examples written to the file if this error was caused by the error in _generate_examples before the first example was yielded
            if isinstance(e, SchemaInferenceError) and e.__context__ is not None:
                e = e.__context__
            raise DatasetGenerationError("An error occurred while generating the dataset") from e

        yield job_id, True, (total_num_examples, total_num_bytes, writer._features, num_shards, shard_lengths)

    def _download_and_prepare(self, dl_manager, verification_mode, **prepare_splits_kwargs):
        """Download and prepare the dataset, enabling duplicate-key checking when
        `verification_mode` is `BASIC_CHECKS` or `ALL_CHECKS`."""
        super()._download_and_prepare(
            dl_manager,
            verification_mode,
            check_duplicate_keys=verification_mode == VerificationMode.BASIC_CHECKS
            or verification_mode == VerificationMode.ALL_CHECKS,
            **prepare_splits_kwargs,
        )

    def _get_examples_iterable_for_split(self, split_generator: SplitGenerator) -> ExamplesIterable:
        """Stream examples lazily from `_generate_examples` for this split."""
        return ExamplesIterable(self._generate_examples, split_generator.gen_kwargs)
1622
+
1623
+
1624
class ArrowBasedBuilder(DatasetBuilder):
    """Base class for datasets with data generation based on Arrow loading functions (CSV/JSON/Parquet)."""

    @abc.abstractmethod
    def _generate_tables(self, **kwargs):
        """Default function generating examples for each `SplitGenerator`.

        This function preprocess the examples from the raw data to the preprocessed
        dataset files.
        This function is called once for each `SplitGenerator` defined in
        `_split_generators`. The examples yielded here will be written on
        disk.

        Args:
            **kwargs (additional keyword arguments):
                Arguments forwarded from the SplitGenerator.gen_kwargs

        Yields:
            key: `str` or `int`, a unique deterministic example identification key.
                * Unique: An error will be raised if two examples are yield with the
                    same key.
                * Deterministic: When generating the dataset twice, the same example
                    should have the same key.
                Good keys can be the image id, or line number if examples are extracted
                from a text file.
                The key will be hashed and sorted to shuffle examples deterministically,
                such as generating the dataset multiple times keep examples in the
                same order.
            example: `pyarrow.Table`, a feature table
                ready to be encoded and written to disk.
        """
        raise NotImplementedError()

    def _prepare_split(
        self,
        split_generator: SplitGenerator,
        file_format: str = "arrow",
        num_proc: Optional[int] = None,
        max_shard_size: Optional[Union[str, int]] = None,
    ):
        """Generate the split's tables and write them to disk, optionally in parallel.

        Mirrors `GeneratorBasedBuilder._prepare_split` but delegates to the
        table-based `_prepare_split_single` and performs no duplicate-key checking.
        """
        max_shard_size = convert_file_size_to_int(max_shard_size or config.MAX_SHARD_SIZE)

        try:
            split_info = self.info.splits[split_generator.name]
        except Exception:
            # Fall back to the generator's own split info when the split is unknown.
            split_info = split_generator.split_info

        # Filename placeholders: JJJJJ = job id, SSSSS = shard id within the job,
        # NNNNN = total number of shards; they are substituted during the rename step below.
        SUFFIX = "-JJJJJ-SSSSS-of-NNNNN"
        fname = f"{self.dataset_name}-{split_generator.name}{SUFFIX}.{file_format}"
        fpath = posixpath.join(self._output_dir, fname)

        # Multiprocessing can use at most one process per input shard.
        if num_proc and num_proc > 1:
            num_input_shards = _number_of_shards_in_gen_kwargs(split_generator.gen_kwargs)
            if num_input_shards <= 1:
                logger.warning(
                    f"Setting num_proc from {num_proc} back to 1 for the {split_info.name} split to disable multiprocessing as it only contains one shard."
                )
                num_proc = 1
            elif num_input_shards < num_proc:
                logger.warning(
                    f"Setting num_proc from {num_proc} to {num_input_shards} for the {split_info.name} split as it only contains {num_input_shards} shards."
                )
                num_proc = num_input_shards

        pbar = hf_tqdm(
            unit=" examples",
            total=split_info.num_examples,
            desc=f"Generating {split_info.name} split",
        )

        _prepare_split_args = {
            "fpath": fpath,
            "file_format": file_format,
            "max_shard_size": max_shard_size,
        }

        if num_proc is None or num_proc == 1:
            # Single-process path: one job (job_id=0) handles all gen_kwargs.
            result = None
            gen_kwargs = split_generator.gen_kwargs
            job_id = 0
            with pbar:
                for job_id, done, content in self._prepare_split_single(
                    gen_kwargs=gen_kwargs, job_id=job_id, **_prepare_split_args
                ):
                    if done:
                        result = content
                    else:
                        pbar.update(content)
            # wrapping everything into lists for consistency with the multiprocessed code path
            assert result is not None, "Failed to retrieve results from prepare_split"
            examples_per_job, bytes_per_job, features_per_job, shards_per_job, shard_lengths_per_job = (
                [item] for item in result
            )
        else:
            # Multiprocess path: split gen_kwargs into up to num_proc jobs.
            kwargs_per_job = [
                {"gen_kwargs": gen_kwargs, "job_id": job_id, **_prepare_split_args}
                for job_id, gen_kwargs in enumerate(
                    _split_gen_kwargs(split_generator.gen_kwargs, max_num_jobs=num_proc)
                )
            ]
            num_jobs = len(kwargs_per_job)

            examples_per_job = [None] * num_jobs
            bytes_per_job = [None] * num_jobs
            features_per_job = [None] * num_jobs
            shards_per_job = [None] * num_jobs
            shard_lengths_per_job = [None] * num_jobs

            with Pool(num_proc) as pool:
                with pbar:
                    # Results arrive out of order; index them by job_id.
                    for job_id, done, content in iflatmap_unordered(
                        pool, self._prepare_split_single, kwargs_iterable=kwargs_per_job
                    ):
                        if done:
                            # the content is the result of the job
                            (
                                examples_per_job[job_id],
                                bytes_per_job[job_id],
                                features_per_job[job_id],
                                shards_per_job[job_id],
                                shard_lengths_per_job[job_id],
                            ) = content
                        else:
                            # the content is the number of examples progress update
                            pbar.update(content)

            assert None not in examples_per_job, (
                f"Failed to retrieve results from prepare_split: result list {examples_per_job} still contains None - at least one worker failed to return its results"
            )

        total_shards = sum(shards_per_job)
        total_num_examples = sum(examples_per_job)
        total_num_bytes = sum(bytes_per_job)
        features = features_per_job[0]

        split_generator.split_info.num_examples = total_num_examples
        split_generator.split_info.num_bytes = total_num_bytes

        # should rename everything at the end
        logger.debug(f"Renaming {total_shards} shards.")
        if total_shards > 1:
            # use the -SSSSS-of-NNNNN pattern

            def _rename_shard(shard_id_and_job: tuple[int]):
                # Map each per-job shard id to a global shard id by offsetting
                # with the shard counts of the preceding jobs.
                shard_id, job_id = shard_id_and_job
                global_shard_id = sum(shards_per_job[:job_id]) + shard_id
                self._rename(
                    fpath.replace("SSSSS", f"{shard_id:05d}").replace("JJJJJ", f"{job_id:05d}"),
                    fpath.replace("JJJJJ-SSSSS", f"{global_shard_id:05d}").replace("NNNNN", f"{total_shards:05d}"),
                )

            shard_ids_and_jobs = [
                (shard_id, job_id)
                for job_id, num_shards in enumerate(shards_per_job)
                for shard_id in range(num_shards)
            ]
            thread_map(_rename_shard, shard_ids_and_jobs, disable=True, max_workers=64)

            split_generator.split_info.shard_lengths = [
                shard_length for shard_lengths in shard_lengths_per_job for shard_length in shard_lengths
            ]
        else:
            # don't use any pattern
            shard_id, job_id = 0, 0
            self._rename(
                fpath.replace("SSSSS", f"{shard_id:05d}").replace("JJJJJ", f"{job_id:05d}"),
                fpath.replace(SUFFIX, ""),
            )

        # Keep the features inferred during writing if none were declared up front.
        if self.info.features is None:
            self.info.features = features

    def _prepare_split_single(
        self, gen_kwargs: dict, fpath: str, file_format: str, max_shard_size: int, job_id: int
    ) -> Iterable[tuple[int, bool, Union[int, tuple]]]:
        """Write one job's tables to shard files.

        Yields `(job_id, False, num_new_examples)` progress updates while writing,
        then a final `(job_id, True, (total_num_examples, total_num_bytes,
        features, num_shards, shard_lengths))` once the job is finished.
        """
        # Wrap list arguments in tracked_list — presumably so shard-level progress
        # can be reported in error messages; TODO confirm tracked_list semantics.
        gen_kwargs = {k: tracked_list(v) if isinstance(v, list) else v for k, v in gen_kwargs.items()}
        generator = self._generate_tables(**gen_kwargs)
        writer_class = ParquetWriter if file_format == "parquet" else ArrowWriter
        embed_local_files = file_format == "parquet"
        shard_lengths = []
        total_num_examples, total_num_bytes = 0, 0

        shard_id = 0
        num_examples_progress_update = 0
        try:
            writer = writer_class(
                features=self.info.features,
                path=fpath.replace("SSSSS", f"{shard_id:05d}").replace("JJJJJ", f"{job_id:05d}"),
                writer_batch_size=self._writer_batch_size,
                storage_options=self._fs.storage_options,
                embed_local_files=embed_local_files,
            )
            try:
                _time = time.time()
                for _, table in generator:
                    # Current shard reached max_shard_size: finalize it and roll over to a new shard file.
                    if max_shard_size is not None and writer._num_bytes > max_shard_size:
                        num_examples, num_bytes = writer.finalize()
                        writer.close()
                        shard_lengths.append(num_examples)
                        total_num_examples += num_examples
                        total_num_bytes += num_bytes
                        shard_id += 1
                        writer = writer_class(
                            features=writer._features,
                            path=fpath.replace("SSSSS", f"{shard_id:05d}").replace("JJJJJ", f"{job_id:05d}"),
                            writer_batch_size=self._writer_batch_size,
                            storage_options=self._fs.storage_options,
                            embed_local_files=embed_local_files,
                        )
                    try:
                        writer.write_table(table)
                    except CastError as cast_error:
                        # A table failed to cast to the expected features: re-raise with builder context.
                        raise DatasetGenerationCastError.from_cast_error(
                            cast_error=cast_error,
                            builder_name=self.info.builder_name,
                            gen_kwargs=gen_kwargs,
                            token=self.token,
                        )
                    num_examples_progress_update += len(table)
                    # Throttle progress updates to the configured refresh interval.
                    if time.time() > _time + config.PBAR_REFRESH_TIME_INTERVAL:
                        _time = time.time()
                        yield job_id, False, num_examples_progress_update
                        num_examples_progress_update = 0
            finally:
                # Flush the remaining progress and finalize the last (possibly only) shard.
                yield job_id, False, num_examples_progress_update
                num_shards = shard_id + 1
                num_examples, num_bytes = writer.finalize()
                writer.close()
                shard_lengths.append(num_examples)
                total_num_examples += num_examples
                total_num_bytes += num_bytes
        except Exception as e:
            # Ignore the writer's error for no examples written to the file if this error was caused by the error in _generate_examples before the first example was yielded
            if isinstance(e, SchemaInferenceError) and e.__context__ is not None:
                e = e.__context__
            # Don't re-wrap errors that are already DatasetGenerationErrors (e.g. cast errors above).
            if isinstance(e, DatasetGenerationError):
                raise
            raise DatasetGenerationError("An error occurred while generating the dataset") from e

        yield job_id, True, (total_num_examples, total_num_bytes, writer._features, num_shards, shard_lengths)

    def _get_examples_iterable_for_split(self, split_generator: SplitGenerator) -> ExamplesIterable:
        """Stream examples lazily from `_generate_tables` for this split."""
        return ArrowExamplesIterable(self._generate_tables, kwargs=split_generator.gen_kwargs)
.venv/lib/python3.11/site-packages/datasets/combine.py ADDED
@@ -0,0 +1,223 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Optional, TypeVar
2
+
3
+ from .arrow_dataset import Dataset, _concatenate_map_style_datasets, _interleave_map_style_datasets
4
+ from .dataset_dict import DatasetDict, IterableDatasetDict
5
+ from .info import DatasetInfo
6
+ from .iterable_dataset import IterableDataset, _concatenate_iterable_datasets, _interleave_iterable_datasets
7
+ from .splits import NamedSplit
8
+ from .utils import logging
9
+ from .utils.py_utils import Literal
10
+
11
+
12
+ logger = logging.get_logger(__name__)
13
+
14
+
15
+ DatasetType = TypeVar("DatasetType", Dataset, IterableDataset)
16
+
17
+
18
def interleave_datasets(
    datasets: list[DatasetType],
    probabilities: Optional[list[float]] = None,
    seed: Optional[int] = None,
    info: Optional[DatasetInfo] = None,
    split: Optional[NamedSplit] = None,
    stopping_strategy: Literal[
        "first_exhausted", "all_exhausted", "all_exhausted_without_replacement"
    ] = "first_exhausted",
) -> DatasetType:
    """
    Interleave several datasets (sources) into a single dataset.
    The new dataset is constructed by alternating between the sources to get the examples.

    You can use this function on a list of [`Dataset`] objects, or on a list of [`IterableDataset`] objects.

    - If `probabilities` is `None` (default) the new dataset is constructed by cycling between each source to get the examples.
    - If `probabilities` is not `None`, the new dataset is constructed by getting examples from a random source at a time according to the provided probabilities.

    The resulting dataset ends when one of the source datasets runs out of examples, except with the
    oversampling strategies, in which case it ends when all datasets have run out of examples at least once.

    Note for iterable datasets:

    In a distributed setup or in PyTorch DataLoader workers, the stopping strategy is applied per process.
    Therefore the "first_exhausted" strategy on a sharded iterable dataset can generate fewer samples in total (up to 1 missing sample per subdataset per worker).

    Args:
        datasets (`List[Dataset]` or `List[IterableDataset]`):
            List of datasets to interleave.
        probabilities (`List[float]`, *optional*, defaults to `None`):
            If specified, the new dataset is constructed by sampling
            examples from one source at a time according to these probabilities.
        seed (`int`, *optional*, defaults to `None`):
            The random seed used to choose a source for each example.
        info ([`DatasetInfo`], *optional*):
            Dataset information, like description, citation, etc.
            <Added version="2.4.0"/>
        split ([`NamedSplit`], *optional*):
            Name of the dataset split.
            <Added version="2.4.0"/>
        stopping_strategy (`str`, defaults to `first_exhausted`):
            One of `first_exhausted`, `all_exhausted` or `all_exhausted_without_replacement`.
            By default, `first_exhausted` is an undersampling strategy, i.e the dataset construction is stopped as soon as one dataset has ran out of samples.
            If the strategy is `all_exhausted`, we use an oversampling strategy, i.e the dataset construction is stopped as soon as every samples of every dataset has been added at least once.
            When strategy is `all_exhausted_without_replacement` we make sure that each sample in each dataset is sampled only once.
            Note that if the strategy is `all_exhausted`, the interleaved dataset size can get enormous:
            - with no probabilities, the resulting dataset will have `max_length_datasets*nb_dataset` samples.
            - with given probabilities, the resulting dataset will have more samples if some datasets have really low probability of visiting.

    Returns:
        [`Dataset`] or [`IterableDataset`]: Return type depends on the input `datasets`
        parameter. `Dataset` if the input is a list of `Dataset`, `IterableDataset` if the input is a list of
        `IterableDataset`.

    Example:

    ```python
    >>> from datasets import Dataset, interleave_datasets
    >>> d1 = Dataset.from_dict({"a": [0, 1, 2]})
    >>> d2 = Dataset.from_dict({"a": [10, 11, 12]})
    >>> dataset = interleave_datasets([d1, d2])
    >>> dataset["a"]
    [0, 10, 1, 11, 2, 12]
    ```
    """
    # NOTE: `Dataset` and `IterableDataset` are imported at module level; the previous
    # function-local re-imports were redundant shadows and have been removed.
    if not datasets:
        raise ValueError("Unable to interleave an empty list of datasets.")
    for i, dataset in enumerate(datasets):
        # Reject anything that is not a plain Dataset/IterableDataset, with a dedicated
        # message when the caller accidentally passed a dict of splits.
        if not isinstance(dataset, (Dataset, IterableDataset)):
            if isinstance(dataset, (DatasetDict, IterableDatasetDict)):
                if not dataset:
                    raise ValueError(
                        f"Expected a list of Dataset objects or a list of IterableDataset objects, but element at position {i} "
                        "is an empty dataset dictionary."
                    )
                raise ValueError(
                    f"Dataset at position {i} has at least one split: {list(dataset)}\n"
                    f"Please pick one to interleave with the other datasets, for example: dataset['{next(iter(dataset))}']"
                )
            raise ValueError(
                f"Expected a list of Dataset objects or a list of IterableDataset objects, but element at position {i} is a {type(dataset).__name__}."
            )
        if i == 0:
            # The first element decides whether we are in map-style or iterable mode;
            # the remaining elements must all match it.
            dataset_type, other_type = (
                (Dataset, IterableDataset) if isinstance(dataset, Dataset) else (IterableDataset, Dataset)
            )
        elif not isinstance(dataset, dataset_type):
            raise ValueError(
                f"Unable to interleave a {dataset_type.__name__} (at position 0) with a {other_type.__name__} (at position {i}). Expected a list of Dataset objects or a list of IterableDataset objects."
            )
    if stopping_strategy not in ["first_exhausted", "all_exhausted", "all_exhausted_without_replacement"]:
        raise ValueError(f"{stopping_strategy} is not supported. Please enter a valid stopping_strategy.")
    # Dispatch to the map-style or iterable implementation.
    if dataset_type is Dataset:
        return _interleave_map_style_datasets(
            datasets, probabilities, seed, info=info, split=split, stopping_strategy=stopping_strategy
        )
    else:
        return _interleave_iterable_datasets(
            datasets,
            probabilities,
            seed,
            info=info,
            split=split,
            stopping_strategy=stopping_strategy,
        )
164
+
165
+
166
def concatenate_datasets(
    dsets: list[DatasetType],
    info: Optional[DatasetInfo] = None,
    split: Optional[NamedSplit] = None,
    axis: int = 0,
) -> DatasetType:
    """
    Converts a list of [`Dataset`] with the same schema into a single [`Dataset`].

    Args:
        dsets (`List[datasets.Dataset]`):
            List of Datasets to concatenate.
        info (`DatasetInfo`, *optional*):
            Dataset information, like description, citation, etc.
        split (`NamedSplit`, *optional*):
            Name of the dataset split.
        axis (`{0, 1}`, defaults to `0`):
            Axis to concatenate over, where `0` means over rows (vertically) and `1` means over columns
            (horizontally).

            <Added version="1.6.0"/>

    Returns:
        [`Dataset`] or [`IterableDataset`], matching the type of the inputs.

    Example:

    ```py
    >>> ds3 = concatenate_datasets([ds1, ds2])
    ```
    """

    if not dsets:
        raise ValueError("Unable to concatenate an empty list of datasets.")
    for i, dataset in enumerate(dsets):
        # Reject anything that is not a plain Dataset/IterableDataset, with a dedicated
        # message when the caller accidentally passed a dict of splits.
        if not isinstance(dataset, (Dataset, IterableDataset)):
            if isinstance(dataset, (DatasetDict, IterableDatasetDict)):
                if not dataset:
                    raise ValueError(
                        f"Expected a list of Dataset objects or a list of IterableDataset objects, but element at position {i} "
                        "is an empty dataset dictionary."
                    )
                # Fixed copy-paste from `interleave_datasets`: this message used to say "interleave".
                raise ValueError(
                    f"Dataset at position {i} has at least one split: {list(dataset)}\n"
                    f"Please pick one to concatenate with the other datasets, for example: dataset['{next(iter(dataset))}']"
                )
            raise ValueError(
                f"Expected a list of Dataset objects or a list of IterableDataset objects, but element at position {i} is a {type(dataset).__name__}."
            )
        if i == 0:
            # The first element decides whether we are in map-style or iterable mode;
            # the remaining elements must all match it.
            dataset_type, other_type = (
                (Dataset, IterableDataset) if isinstance(dataset, Dataset) else (IterableDataset, Dataset)
            )
        elif not isinstance(dataset, dataset_type):
            # Fixed copy-paste from `interleave_datasets`: this message used to say "interleave".
            raise ValueError(
                f"Unable to concatenate a {dataset_type.__name__} (at position 0) with a {other_type.__name__} (at position {i}). Expected a list of Dataset objects or a list of IterableDataset objects."
            )
    # Dispatch to the map-style or iterable implementation.
    if dataset_type is Dataset:
        return _concatenate_map_style_datasets(dsets, info=info, split=split, axis=axis)
    else:
        return _concatenate_iterable_datasets(dsets, info=info, split=split, axis=axis)
.venv/lib/python3.11/site-packages/datasets/commands/__init__.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from abc import ABC, abstractmethod
2
+ from argparse import ArgumentParser
3
+
4
+
5
class BaseDatasetsCLICommand(ABC):
    """Abstract interface for `datasets-cli` subcommands.

    Concrete commands register their argparse subparser via `register_subcommand`
    (typically calling `set_defaults(func=...)` with a factory) and implement `run`
    to execute the command once constructed from parsed arguments.
    """

    @staticmethod
    @abstractmethod
    def register_subcommand(parser: ArgumentParser):
        """Attach this command's subparser and arguments to the root CLI parser."""
        raise NotImplementedError()

    @abstractmethod
    def run(self):
        """Execute the command."""
        raise NotImplementedError()
.venv/lib/python3.11/site-packages/datasets/commands/datasets_cli.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ from argparse import ArgumentParser
3
+
4
+ from datasets.commands.delete_from_hub import DeleteFromHubCommand
5
+ from datasets.commands.env import EnvironmentCommand
6
+ from datasets.commands.test import TestCommand
7
+ from datasets.utils.logging import set_verbosity_info
8
+
9
+
10
def parse_unknown_args(unknown_args):
    """Turn a flat `[--flag, value, --flag, value, ...]` list into a dict.

    Leading dashes are stripped from flag names. A trailing flag without a
    value is silently dropped, mirroring pairwise consumption.
    """
    parsed = {}
    # Walk the list two items at a time; stop before a dangling final flag.
    for idx in range(0, len(unknown_args) - 1, 2):
        parsed[unknown_args[idx].lstrip("-")] = unknown_args[idx + 1]
    return parsed
12
+
13
+
14
def main():
    """Entry point of the `datasets-cli` tool.

    Registers the available subcommands, parses the command line, builds the
    selected command via its `func` factory (forwarding any unknown `--key value`
    pairs as keyword arguments) and runs it.
    """
    parser = ArgumentParser(
        "HuggingFace Datasets CLI tool", usage="datasets-cli <command> [<args>]", allow_abbrev=False
    )
    commands_parser = parser.add_subparsers(help="datasets-cli command helpers")
    set_verbosity_info()

    # Register commands
    EnvironmentCommand.register_subcommand(commands_parser)
    TestCommand.register_subcommand(commands_parser)
    DeleteFromHubCommand.register_subcommand(commands_parser)

    # Parse args
    args, unknown_args = parser.parse_known_args()
    # `func` is set by the chosen subcommand's `set_defaults`; absent means no subcommand given.
    if not hasattr(args, "func"):
        parser.print_help()
        exit(1)
    kwargs = parse_unknown_args(unknown_args)

    # Run
    service = args.func(args, **kwargs)
    service.run()


if __name__ == "__main__":
    main()
.venv/lib/python3.11/site-packages/datasets/commands/delete_from_hub.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from argparse import ArgumentParser
2
+ from typing import Optional
3
+
4
+ from datasets.commands import BaseDatasetsCLICommand
5
+ from datasets.hub import delete_from_hub
6
+
7
+
8
def _command_factory(args):
    """Build a `DeleteFromHubCommand` from parsed CLI arguments."""
    return DeleteFromHubCommand(args.dataset_id, args.config_name, args.token, args.revision)
15
+
16
+
17
class DeleteFromHubCommand(BaseDatasetsCLICommand):
    """CLI command that deletes one dataset config from a repository on the Hub."""

    @staticmethod
    def register_subcommand(parser):
        """Attach the `delete_from_hub` subcommand and its arguments to the root parser."""
        subparser: ArgumentParser = parser.add_parser("delete_from_hub", help="Delete dataset config from the Hub")
        subparser.add_argument(
            "dataset_id", help="source dataset ID, e.g. USERNAME/DATASET_NAME or ORGANIZATION/DATASET_NAME"
        )
        subparser.add_argument("config_name", help="config name to delete")
        subparser.add_argument("--token", help="access token to the Hugging Face Hub")
        subparser.add_argument("--revision", help="source revision")
        subparser.set_defaults(func=_command_factory)

    def __init__(
        self,
        dataset_id: str,
        config_name: str,
        token: Optional[str],
        revision: Optional[str],
    ):
        # Store everything needed to perform the deletion in `run`.
        self._dataset_id = dataset_id
        self._config_name = config_name
        self._token = token
        self._revision = revision

    def run(self) -> None:
        """Delegate the deletion to `datasets.hub.delete_from_hub`; the result is discarded."""
        _ = delete_from_hub(self._dataset_id, self._config_name, revision=self._revision, token=self._token)
.venv/lib/python3.11/site-packages/datasets/commands/env.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import platform
2
+ from argparse import ArgumentParser
3
+
4
+ import fsspec
5
+ import huggingface_hub
6
+ import pandas
7
+ import pyarrow
8
+
9
+ from datasets import __version__ as version
10
+ from datasets.commands import BaseDatasetsCLICommand
11
+
12
+
13
def info_command_factory(_):
    """Build an `EnvironmentCommand`; the parsed args are ignored (the command takes none)."""
    return EnvironmentCommand()
15
+
16
+
17
class EnvironmentCommand(BaseDatasetsCLICommand):
    """CLI command that prints environment/version info for GitHub bug reports."""

    @staticmethod
    def register_subcommand(parser: ArgumentParser):
        """Attach the `env` subcommand to the root parser."""
        env_parser = parser.add_parser("env", help="Print relevant system environment info.")
        env_parser.set_defaults(func=info_command_factory)

    def run(self):
        """Collect version info, print it in copy-pasteable form, and return the raw dict."""
        info = {
            "`datasets` version": version,
            "Platform": platform.platform(),
            "Python version": platform.python_version(),
            "`huggingface_hub` version": huggingface_hub.__version__,
            "PyArrow version": pyarrow.__version__,
            "Pandas version": pandas.__version__,
            "`fsspec` version": fsspec.__version__,
        }

        print("\nCopy-and-paste the text below in your GitHub issue.\n")
        print(self.format_dict(info))

        return info

    @staticmethod
    def format_dict(d):
        """Render a dict as newline-separated `- key: value` bullets with a trailing newline."""
        bullet_lines = [f"- {prop}: {val}" for prop, val in d.items()]
        return "\n".join(bullet_lines) + "\n"
.venv/lib/python3.11/site-packages/datasets/commands/test.py ADDED
@@ -0,0 +1,180 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import os
3
+ from argparse import ArgumentParser
4
+ from collections.abc import Generator
5
+ from shutil import rmtree
6
+
7
+ import datasets.config
8
+ from datasets.builder import DatasetBuilder
9
+ from datasets.commands import BaseDatasetsCLICommand
10
+ from datasets.download.download_manager import DownloadMode
11
+ from datasets.info import DatasetInfosDict
12
+ from datasets.load import dataset_module_factory, get_dataset_builder_class
13
+ from datasets.utils.info_utils import VerificationMode
14
+ from datasets.utils.logging import ERROR, get_logger
15
+
16
+
17
+ logger = get_logger(__name__)
18
+
19
+
20
def _test_command_factory(args):
    """Build a `TestCommand` from parsed CLI arguments, merging the `--save_info`/`--save_infos` aliases."""
    save_infos = args.save_info or args.save_infos
    return TestCommand(
        args.dataset,
        args.name,
        args.cache_dir,
        args.data_dir,
        args.all_configs,
        save_infos,
        args.ignore_verifications,
        args.force_redownload,
        args.clear_cache,
        args.num_proc,
    )
33
+
34
+
35
class TestCommand(BaseDatasetsCLICommand):
    """CLI command that builds a dataset end-to-end to test that it loads correctly.

    Optionally iterates over all configs, saves the computed dataset infos into the
    dataset card (README.md), and clears the download/builder caches between configs.
    """

    __test__ = False  # to tell pytest it's not a test class

    @staticmethod
    def register_subcommand(parser: ArgumentParser):
        """Attach the `test` subcommand and its arguments to the root parser."""
        test_parser = parser.add_parser("test", help="Test dataset loading.")
        test_parser.add_argument("--name", type=str, default=None, help="Dataset processing name")
        test_parser.add_argument(
            "--cache_dir",
            type=str,
            default=None,
            help="Cache directory where the datasets are stored.",
        )
        test_parser.add_argument(
            "--data_dir",
            type=str,
            default=None,
            help="Can be used to specify a manual directory to get the files from.",
        )
        test_parser.add_argument("--all_configs", action="store_true", help="Test all dataset configurations")
        test_parser.add_argument(
            "--save_info", action="store_true", help="Save the dataset infos in the dataset card (README.md)"
        )
        test_parser.add_argument(
            "--ignore_verifications",
            action="store_true",
            help="Run the test without checksums and splits checks.",
        )
        test_parser.add_argument("--force_redownload", action="store_true", help="Force dataset redownload")
        test_parser.add_argument(
            "--clear_cache",
            action="store_true",
            help="Remove downloaded files and cached datasets after each config test",
        )
        test_parser.add_argument("--num_proc", type=int, default=None, help="Number of processes")
        # aliases
        test_parser.add_argument("--save_infos", action="store_true", help="alias to save_info")
        test_parser.add_argument("dataset", type=str, help="Name of the dataset to download")
        test_parser.set_defaults(func=_test_command_factory)

    def __init__(
        self,
        dataset: str,
        name: str,
        cache_dir: str,
        data_dir: str,
        all_configs: bool,
        save_infos: bool,
        ignore_verifications: bool,
        force_redownload: bool,
        clear_cache: bool,
        num_proc: int,
    ):
        """Store the CLI options; exits early if --clear_cache is used without --cache_dir."""
        self._dataset = dataset
        self._name = name
        self._cache_dir = cache_dir
        self._data_dir = data_dir
        self._all_configs = all_configs
        self._save_infos = save_infos
        self._ignore_verifications = ignore_verifications
        self._force_redownload = force_redownload
        self._clear_cache = clear_cache
        self._num_proc = num_proc
        # Refusing to clear an implicit (shared) cache directory protects the user's
        # global datasets cache from deletion.
        if clear_cache and not cache_dir:
            print(
                "When --clear_cache is used, specifying a cache directory is mandatory.\n"
                "The 'download' folder of the cache directory and the dataset builder cache will be deleted after each configuration test.\n"
                "Please provide a --cache_dir that will be used to test the dataset."
            )
            exit(1)
        # Saving infos implies recomputing them, so verification checks are skipped.
        if save_infos:
            self._ignore_verifications = True

    def run(self):
        """Download and prepare the dataset (each selected config), optionally saving infos and clearing caches."""
        logging.getLogger("filelock").setLevel(ERROR)
        if self._name is not None and self._all_configs:
            print("Both parameters `config` and `all_configs` can't be used at once.")
            exit(1)
        path, config_name = self._dataset, self._name
        module = dataset_module_factory(path)
        builder_cls = get_dataset_builder_class(module)
        n_builders = len(builder_cls.BUILDER_CONFIGS) if self._all_configs and builder_cls.BUILDER_CONFIGS else 1

        def get_builders() -> Generator[DatasetBuilder, None, None]:
            """Yield one builder per config to test (or a single builder for --name / the default config).

            When the module's own builder_kwargs already pin a config_name, it takes
            precedence and is not overridden.
            """
            if self._all_configs and builder_cls.BUILDER_CONFIGS:
                for i, config in enumerate(builder_cls.BUILDER_CONFIGS):
                    if "config_name" in module.builder_kwargs:
                        yield builder_cls(
                            cache_dir=self._cache_dir,
                            data_dir=self._data_dir,
                            **module.builder_kwargs,
                        )
                    else:
                        yield builder_cls(
                            config_name=config.name,
                            cache_dir=self._cache_dir,
                            data_dir=self._data_dir,
                            **module.builder_kwargs,
                        )
            else:
                if "config_name" in module.builder_kwargs:
                    yield builder_cls(cache_dir=self._cache_dir, data_dir=self._data_dir, **module.builder_kwargs)
                else:
                    yield builder_cls(
                        config_name=config_name,
                        cache_dir=self._cache_dir,
                        data_dir=self._data_dir,
                        **module.builder_kwargs,
                    )

        for j, builder in enumerate(get_builders()):
            print(f"Testing builder '{builder.config.name}' ({j + 1}/{n_builders})")
            builder._record_infos = os.path.exists(
                os.path.join(builder.get_imported_module_dir(), datasets.config.DATASETDICT_INFOS_FILENAME)
            )  # record checksums only if we need to update a (deprecated) dataset_infos.json
            builder.download_and_prepare(
                download_mode=DownloadMode.REUSE_CACHE_IF_EXISTS
                if not self._force_redownload
                else DownloadMode.FORCE_REDOWNLOAD,
                verification_mode=VerificationMode.NO_CHECKS
                if self._ignore_verifications
                else VerificationMode.ALL_CHECKS,
                num_proc=self._num_proc,
            )
            builder.as_dataset()

            # If save_infos=True, we create the dataset card (README.md)
            # The dataset_infos are saved in the YAML part of the README.md
            # This is to allow the user to upload them on HF afterwards.
            if self._save_infos:
                save_infos_dir = os.path.basename(path) if not os.path.isdir(path) else path
                os.makedirs(save_infos_dir, exist_ok=True)
                DatasetInfosDict(**{builder.config.name: builder.info}).write_to_directory(save_infos_dir)
                print(f"Dataset card saved at {os.path.join(save_infos_dir, datasets.config.REPOCARD_FILENAME)}")

            # If clear_cache=True, the download folder and the dataset builder cache directory are deleted
            if self._clear_cache:
                if os.path.isdir(builder._cache_dir):
                    logger.warning(f"Clearing cache at {builder._cache_dir}")
                    rmtree(builder._cache_dir)
                download_dir = os.path.join(self._cache_dir, datasets.config.DOWNLOADED_DATASETS_DIR)
                if os.path.isdir(download_dir):
                    logger.warning(f"Clearing cache at {download_dir}")
                    rmtree(download_dir)

        print("Test successful.")
.venv/lib/python3.11/site-packages/datasets/config.py ADDED
@@ -0,0 +1,271 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import importlib
2
+ import importlib.metadata
3
+ import logging
4
+ import os
5
+ import platform
6
+ from pathlib import Path
7
+ from typing import Optional
8
+
9
+ from huggingface_hub import constants
10
+ from packaging import version
11
+
12
+
13
# Module-level configuration for `datasets`: URLs, optional-dependency detection,
# cache locations, and tuning constants. Everything here is computed once at import time.
logger = logging.getLogger(__name__.split(".", 1)[0])  # to avoid circular import from .utils.logging

# Datasets
S3_DATASETS_BUCKET_PREFIX = "https://s3.amazonaws.com/datasets.huggingface.co/datasets/datasets"
CLOUDFRONT_DATASETS_DISTRIB_PREFIX = "https://cdn-datasets.huggingface.co/datasets/datasets"
REPO_DATASETS_URL = "https://raw.githubusercontent.com/huggingface/datasets/{revision}/datasets/{path}/{name}"

# Hub
HF_ENDPOINT = os.environ.get("HF_ENDPOINT", "https://huggingface.co")
HUB_DATASETS_URL = HF_ENDPOINT + "/datasets/{repo_id}/resolve/{revision}/{path}"
HUB_DATASETS_HFFS_URL = "hf://datasets/{repo_id}@{revision}/{path}"
HUB_DEFAULT_VERSION = "main"

PY_VERSION = version.parse(platform.python_version())

# General environment variables accepted values for booleans
ENV_VARS_TRUE_VALUES = {"1", "ON", "YES", "TRUE"}
ENV_VARS_FALSE_VALUES = {"0", "OFF", "NO", "FALSE"}
ENV_VARS_TRUE_AND_AUTO_VALUES = ENV_VARS_TRUE_VALUES.union({"AUTO"})
ENV_VARS_FALSE_AND_AUTO_VALUES = ENV_VARS_FALSE_VALUES.union({"AUTO"})


# Imports
# Hard dependencies: a missing package here raises PackageNotFoundError at import time.
DILL_VERSION = version.parse(importlib.metadata.version("dill"))
FSSPEC_VERSION = version.parse(importlib.metadata.version("fsspec"))
PANDAS_VERSION = version.parse(importlib.metadata.version("pandas"))
PYARROW_VERSION = version.parse(importlib.metadata.version("pyarrow"))
HF_HUB_VERSION = version.parse(importlib.metadata.version("huggingface_hub"))

# USE_TF / USE_TORCH / USE_JAX let the user force-enable or force-disable a framework.
USE_TF = os.environ.get("USE_TF", "AUTO").upper()
USE_TORCH = os.environ.get("USE_TORCH", "AUTO").upper()
USE_JAX = os.environ.get("USE_JAX", "AUTO").upper()

TORCH_VERSION = "N/A"
TORCH_AVAILABLE = False

# Torch is probed unless the user explicitly opted into TF instead.
if USE_TORCH in ENV_VARS_TRUE_AND_AUTO_VALUES and USE_TF not in ENV_VARS_TRUE_VALUES:
    TORCH_AVAILABLE = importlib.util.find_spec("torch") is not None
    if TORCH_AVAILABLE:
        try:
            TORCH_VERSION = version.parse(importlib.metadata.version("torch"))
            logger.debug(f"PyTorch version {TORCH_VERSION} available.")
        except importlib.metadata.PackageNotFoundError:
            pass
else:
    logger.info("Disabling PyTorch because USE_TF is set")

POLARS_VERSION = "N/A"
POLARS_AVAILABLE = importlib.util.find_spec("polars") is not None

if POLARS_AVAILABLE:
    try:
        POLARS_VERSION = version.parse(importlib.metadata.version("polars"))
        logger.debug(f"Polars version {POLARS_VERSION} available.")
    except importlib.metadata.PackageNotFoundError:
        pass


DUCKDB_VERSION = "N/A"
DUCKDB_AVAILABLE = importlib.util.find_spec("duckdb") is not None

if DUCKDB_AVAILABLE:
    try:
        DUCKDB_VERSION = version.parse(importlib.metadata.version("duckdb"))
        logger.debug(f"Duckdb version {DUCKDB_VERSION} available.")
    except importlib.metadata.PackageNotFoundError:
        pass

TF_VERSION = "N/A"
TF_AVAILABLE = False

# TensorFlow is probed unless the user explicitly opted into Torch instead.
if USE_TF in ENV_VARS_TRUE_AND_AUTO_VALUES and USE_TORCH not in ENV_VARS_TRUE_VALUES:
    TF_AVAILABLE = importlib.util.find_spec("tensorflow") is not None
    if TF_AVAILABLE:
        # For the metadata, we have to look for both tensorflow and tensorflow-cpu
        for package in [
            "tensorflow",
            "tensorflow-cpu",
            "tensorflow-gpu",
            "tf-nightly",
            "tf-nightly-cpu",
            "tf-nightly-gpu",
            "intel-tensorflow",
            "tensorflow-rocm",
            "tensorflow-macos",
        ]:
            try:
                TF_VERSION = version.parse(importlib.metadata.version(package))
            except importlib.metadata.PackageNotFoundError:
                continue
            else:
                break
        else:
            # for/else: no distribution provided metadata, so treat TF as unavailable.
            TF_AVAILABLE = False
    if TF_AVAILABLE:
        if TF_VERSION.major < 2:
            logger.info(f"TensorFlow found but with version {TF_VERSION}. `datasets` requires version 2 minimum.")
            TF_AVAILABLE = False
        else:
            logger.info(f"TensorFlow version {TF_VERSION} available.")
else:
    logger.info("Disabling Tensorflow because USE_TORCH is set")


JAX_VERSION = "N/A"
JAX_AVAILABLE = False

if USE_JAX in ENV_VARS_TRUE_AND_AUTO_VALUES:
    # Both jax and jaxlib must be installed for JAX support.
    JAX_AVAILABLE = importlib.util.find_spec("jax") is not None and importlib.util.find_spec("jaxlib") is not None
    if JAX_AVAILABLE:
        try:
            JAX_VERSION = version.parse(importlib.metadata.version("jax"))
            logger.info(f"JAX version {JAX_VERSION} available.")
        except importlib.metadata.PackageNotFoundError:
            pass
else:
    logger.info("Disabling JAX because USE_JAX is set to False")


# Optional tools for data loading
SQLALCHEMY_AVAILABLE = importlib.util.find_spec("sqlalchemy") is not None

# Optional tools for feature decoding
PIL_AVAILABLE = importlib.util.find_spec("PIL") is not None
IS_OPUS_SUPPORTED = True
IS_MP3_SUPPORTED = True
TORCHCODEC_AVAILABLE = importlib.util.find_spec("torchcodec") is not None
TORCHVISION_AVAILABLE = importlib.util.find_spec("torchvision") is not None
PDFPLUMBER_AVAILABLE = importlib.util.find_spec("pdfplumber") is not None
NIBABEL_AVAILABLE = importlib.util.find_spec("nibabel") is not None

# Optional compression tools
RARFILE_AVAILABLE = importlib.util.find_spec("rarfile") is not None
ZSTANDARD_AVAILABLE = importlib.util.find_spec("zstandard") is not None
LZ4_AVAILABLE = importlib.util.find_spec("lz4") is not None
PY7ZR_AVAILABLE = importlib.util.find_spec("py7zr") is not None

# Cache location
DEFAULT_XDG_CACHE_HOME = "~/.cache"
XDG_CACHE_HOME = os.getenv("XDG_CACHE_HOME", DEFAULT_XDG_CACHE_HOME)
DEFAULT_HF_CACHE_HOME = os.path.join(XDG_CACHE_HOME, "huggingface")
HF_CACHE_HOME = os.path.expanduser(os.getenv("HF_HOME", DEFAULT_HF_CACHE_HOME))

DEFAULT_HF_DATASETS_CACHE = os.path.join(HF_CACHE_HOME, "datasets")
HF_DATASETS_CACHE = Path(os.getenv("HF_DATASETS_CACHE", DEFAULT_HF_DATASETS_CACHE))

DEFAULT_HF_MODULES_CACHE = os.path.join(HF_CACHE_HOME, "modules")
HF_MODULES_CACHE = Path(os.getenv("HF_MODULES_CACHE", DEFAULT_HF_MODULES_CACHE))

DOWNLOADED_DATASETS_DIR = "downloads"
DEFAULT_DOWNLOADED_DATASETS_PATH = os.path.join(HF_DATASETS_CACHE, DOWNLOADED_DATASETS_DIR)
DOWNLOADED_DATASETS_PATH = Path(os.getenv("HF_DATASETS_DOWNLOADED_DATASETS_PATH", DEFAULT_DOWNLOADED_DATASETS_PATH))

EXTRACTED_DATASETS_DIR = "extracted"
DEFAULT_EXTRACTED_DATASETS_PATH = os.path.join(DEFAULT_DOWNLOADED_DATASETS_PATH, EXTRACTED_DATASETS_DIR)
EXTRACTED_DATASETS_PATH = Path(os.getenv("HF_DATASETS_EXTRACTED_DATASETS_PATH", DEFAULT_EXTRACTED_DATASETS_PATH))

# Download count for the website
HF_UPDATE_DOWNLOAD_COUNTS = (
    os.environ.get("HF_UPDATE_DOWNLOAD_COUNTS", "AUTO").upper() in ENV_VARS_TRUE_AND_AUTO_VALUES
)

# For downloads and to check remote files metadata
HF_DATASETS_MULTITHREADING_MAX_WORKERS = 16

# Dataset viewer API
USE_PARQUET_EXPORT = True

# Batch size constants. For more info, see:
# https://github.com/apache/arrow/blob/master/docs/source/cpp/arrays.rst#size-limitations-and-recommendations)
DEFAULT_MAX_BATCH_SIZE = 1000

# Content-defined-chunking options; sizes are in bytes.
DEFAULT_CDC_OPTIONS = {"min_chunk_size": 256 * 1024, "max_chunk_size": 1024 * 1024, "norm_level": 0}

# Size of the preloaded record batch in `Dataset.__iter__`
ARROW_READER_BATCH_SIZE_IN_DATASET_ITER = 10

# Max uncompressed shard size in bytes (e.g. to shard parquet datasets in push_to_hub or download_and_prepare)
MAX_SHARD_SIZE = "500MB"

# Max uncompressed row group size in bytes (e.g. for parquet files in push_to_hub or download_and_prepare)
MAX_ROW_GROUP_SIZE = "100MB"

# Parquet configuration
PARQUET_ROW_GROUP_SIZE_FOR_AUDIO_DATASETS = None
PARQUET_ROW_GROUP_SIZE_FOR_IMAGE_DATASETS = None
PARQUET_ROW_GROUP_SIZE_FOR_BINARY_DATASETS = None
PARQUET_ROW_GROUP_SIZE_FOR_VIDEO_DATASETS = None

# Arrow configuration
ARROW_RECORD_BATCH_SIZE_FOR_AUDIO_DATASETS = 100
ARROW_RECORD_BATCH_SIZE_FOR_IMAGE_DATASETS = 100
ARROW_RECORD_BATCH_SIZE_FOR_BINARY_DATASETS = 100
ARROW_RECORD_BATCH_SIZE_FOR_VIDEO_DATASETS = 10

# Offline mode
# HF_DATASETS_OFFLINE, when set, overrides huggingface_hub's HF_HUB_OFFLINE constant.
_offline = os.environ.get("HF_DATASETS_OFFLINE")
HF_HUB_OFFLINE = constants.HF_HUB_OFFLINE if _offline is None else _offline.upper() in ENV_VARS_TRUE_VALUES
HF_DATASETS_OFFLINE = HF_HUB_OFFLINE  # kept for backward-compatibility

# Here, `True` will disable progress bars globally without possibility of enabling it
# programmatically. `False` will enable them without possibility of disabling them.
# If environment variable is not set (None), then the user is free to enable/disable
# them programmatically.
# TL;DR: env variable has priority over code
__HF_DATASETS_DISABLE_PROGRESS_BARS = os.environ.get("HF_DATASETS_DISABLE_PROGRESS_BARS")
HF_DATASETS_DISABLE_PROGRESS_BARS: Optional[bool] = (
    __HF_DATASETS_DISABLE_PROGRESS_BARS.upper() in ENV_VARS_TRUE_VALUES
    if __HF_DATASETS_DISABLE_PROGRESS_BARS is not None
    else None
)

# In-memory
DEFAULT_IN_MEMORY_MAX_SIZE = 0  # Disabled
IN_MEMORY_MAX_SIZE = float(os.environ.get("HF_DATASETS_IN_MEMORY_MAX_SIZE", DEFAULT_IN_MEMORY_MAX_SIZE))

# File names
DATASET_ARROW_FILENAME = "dataset.arrow"
DATASET_INDICES_FILENAME = "indices.arrow"
DATASET_STATE_JSON_FILENAME = "state.json"
DATASET_INFO_FILENAME = "dataset_info.json"
DATASETDICT_INFOS_FILENAME = "dataset_infos.json"
LICENSE_FILENAME = "LICENSE"
DATASETDICT_JSON_FILENAME = "dataset_dict.json"
METADATA_CONFIGS_FIELD = "configs"
REPOCARD_FILENAME = "README.md"
REPOYAML_FILENAME = ".huggingface.yaml"

MODULE_NAME_FOR_DYNAMIC_MODULES = "datasets_modules"

MAX_DATASET_CONFIG_ID_READABLE_LENGTH = 255

# Temporary cache directory prefix
TEMP_CACHE_DIR_PREFIX = "hf_datasets-"

# Streaming
# Retry intervals are presumably in seconds — TODO confirm against the streaming readers.
STREAMING_READ_MAX_RETRIES = 20
STREAMING_READ_RETRY_INTERVAL = 5
STREAMING_READ_SERVER_UNAVAILABLE_RETRY_INTERVAL = 20
STREAMING_READ_RATE_LIMIT_RETRY_INTERVAL = 60
STREAMING_OPEN_MAX_RETRIES = 20
STREAMING_OPEN_RETRY_INTERVAL = 5

# Datasets repositories exploration
DATA_FILES_MAX_NUMBER_FOR_MODULE_INFERENCE = 200
GLOBBED_DATA_FILES_MAX_NUMBER_FOR_MODULE_INFERENCE = 10
ARCHIVED_DATA_FILES_MAX_NUMBER_FOR_MODULE_INFERENCE = 200

# Async map functions
MAX_NUM_RUNNING_ASYNC_MAP_FUNCTIONS_IN_PARALLEL = 1000

# Progress bars
PBAR_REFRESH_TIME_INTERVAL = 0.05  # 20 progress updates per sec

# Maximum number of uploaded files per commit
UPLOADS_MAX_NUMBER_PER_COMMIT = 50

# Backward compatibility
MAX_TABLE_NBYTES_FOR_PICKLING = 4 << 30
.venv/lib/python3.11/site-packages/datasets/data_files.py ADDED
@@ -0,0 +1,811 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ from functools import partial
4
+ from glob import has_magic
5
+ from pathlib import Path, PurePath
6
+ from typing import Callable, Optional, Union
7
+
8
+ import huggingface_hub
9
+ from fsspec.core import url_to_fs
10
+ from huggingface_hub import HfFileSystem
11
+ from packaging import version
12
+ from tqdm.contrib.concurrent import thread_map
13
+
14
+ from . import config
15
+ from .download import DownloadConfig
16
+ from .naming import _split_re
17
+ from .splits import Split
18
+ from .utils import logging
19
+ from .utils import tqdm as hf_tqdm
20
+ from .utils.file_utils import _prepare_path_and_storage_options, is_local_path, is_relative_path, xbasename, xjoin
21
+ from .utils.py_utils import string_to_dict
22
+
23
+
24
+ SingleOriginMetadata = Union[tuple[str, str], tuple[str], tuple[()]]
25
+
26
+
27
+ SANITIZED_DEFAULT_SPLIT = str(Split.TRAIN)
28
+
29
+
30
+ logger = logging.get_logger(__name__)
31
+
32
+
33
+ class Url(str):
34
+ pass
35
+
36
+
37
+ class EmptyDatasetError(FileNotFoundError):
38
+ pass
39
+
40
+
41
+ SPLIT_PATTERN_SHARDED = "data/{split}-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*"
42
+
43
+ SPLIT_KEYWORDS = {
44
+ Split.TRAIN: ["train", "training"],
45
+ Split.VALIDATION: ["validation", "valid", "dev", "val"],
46
+ Split.TEST: ["test", "testing", "eval", "evaluation"],
47
+ }
48
+ NON_WORDS_CHARS = "-._ 0-9"
49
+ if config.FSSPEC_VERSION < version.parse("2023.9.0"):
50
+ KEYWORDS_IN_FILENAME_BASE_PATTERNS = ["**[{sep}/]{keyword}[{sep}]*", "{keyword}[{sep}]*"]
51
+ KEYWORDS_IN_DIR_NAME_BASE_PATTERNS = [
52
+ "{keyword}/**",
53
+ "{keyword}[{sep}]*/**",
54
+ "**[{sep}/]{keyword}/**",
55
+ "**[{sep}/]{keyword}[{sep}]*/**",
56
+ ]
57
+ elif config.FSSPEC_VERSION < version.parse("2023.12.0"):
58
+ KEYWORDS_IN_FILENAME_BASE_PATTERNS = ["**/*[{sep}/]{keyword}[{sep}]*", "{keyword}[{sep}]*"]
59
+ KEYWORDS_IN_DIR_NAME_BASE_PATTERNS = [
60
+ "{keyword}/**/*",
61
+ "{keyword}[{sep}]*/**/*",
62
+ "**/*[{sep}/]{keyword}/**/*",
63
+ "**/*[{sep}/]{keyword}[{sep}]*/**/*",
64
+ ]
65
+ else:
66
+ KEYWORDS_IN_FILENAME_BASE_PATTERNS = ["**/{keyword}[{sep}]*", "**/*[{sep}]{keyword}[{sep}]*"]
67
+ KEYWORDS_IN_DIR_NAME_BASE_PATTERNS = [
68
+ "**/{keyword}/**",
69
+ "**/{keyword}[{sep}]*/**",
70
+ "**/*[{sep}]{keyword}/**",
71
+ "**/*[{sep}]{keyword}[{sep}]*/**",
72
+ ]
73
+
74
+ DEFAULT_SPLITS = [Split.TRAIN, Split.VALIDATION, Split.TEST]
75
+ DEFAULT_PATTERNS_SPLIT_IN_FILENAME = {
76
+ split: [
77
+ pattern.format(keyword=keyword, sep=NON_WORDS_CHARS)
78
+ for keyword in SPLIT_KEYWORDS[split]
79
+ for pattern in KEYWORDS_IN_FILENAME_BASE_PATTERNS
80
+ ]
81
+ for split in DEFAULT_SPLITS
82
+ }
83
+ DEFAULT_PATTERNS_SPLIT_IN_DIR_NAME = {
84
+ split: [
85
+ pattern.format(keyword=keyword, sep=NON_WORDS_CHARS)
86
+ for keyword in SPLIT_KEYWORDS[split]
87
+ for pattern in KEYWORDS_IN_DIR_NAME_BASE_PATTERNS
88
+ ]
89
+ for split in DEFAULT_SPLITS
90
+ }
91
+
92
+
93
+ DEFAULT_PATTERNS_ALL = {
94
+ Split.TRAIN: ["**"],
95
+ }
96
+
97
+ ALL_SPLIT_PATTERNS = [SPLIT_PATTERN_SHARDED]
98
+ ALL_DEFAULT_PATTERNS = [
99
+ DEFAULT_PATTERNS_SPLIT_IN_DIR_NAME,
100
+ DEFAULT_PATTERNS_SPLIT_IN_FILENAME,
101
+ DEFAULT_PATTERNS_ALL,
102
+ ]
103
+ WILDCARD_CHARACTERS = "*[]"
104
+ FILES_TO_IGNORE = [
105
+ "README.md",
106
+ "config.json",
107
+ "dataset_info.json",
108
+ "dataset_infos.json",
109
+ "dummy_data.zip",
110
+ "dataset_dict.json",
111
+ ]
112
+
113
+
114
+ def contains_wildcards(pattern: str) -> bool:
115
+ return any(wildcard_character in pattern for wildcard_character in WILDCARD_CHARACTERS)
116
+
117
+
118
+ def sanitize_patterns(patterns: Union[dict, list, str]) -> dict[str, Union[list[str], "DataFilesList"]]:
119
+ """
120
+ Take the data_files patterns from the user, and format them into a dictionary.
121
+ Each key is the name of the split, and each value is a list of data files patterns (paths or urls).
122
+ The default split is "train".
123
+
124
+ Returns:
125
+ patterns: dictionary of split_name -> list of patterns
126
+ """
127
+ if isinstance(patterns, dict):
128
+ return {str(key): value if isinstance(value, list) else [value] for key, value in patterns.items()}
129
+ elif isinstance(patterns, str):
130
+ return {SANITIZED_DEFAULT_SPLIT: [patterns]}
131
+ elif isinstance(patterns, list):
132
+ if any(isinstance(pattern, dict) for pattern in patterns):
133
+ for pattern in patterns:
134
+ if not (
135
+ isinstance(pattern, dict)
136
+ and len(pattern) == 2
137
+ and "split" in pattern
138
+ and isinstance(pattern.get("path"), (str, list))
139
+ ):
140
+ raise ValueError(
141
+ f"Expected each split to have a 'path' key which can be a string or a list of strings, but got {pattern}"
142
+ )
143
+ splits = [pattern["split"] for pattern in patterns]
144
+ if len(set(splits)) != len(splits):
145
+ raise ValueError(f"Some splits are duplicated in data_files: {splits}")
146
+ return {
147
+ str(pattern["split"]): pattern["path"] if isinstance(pattern["path"], list) else [pattern["path"]]
148
+ for pattern in patterns
149
+ }
150
+ else:
151
+ return {SANITIZED_DEFAULT_SPLIT: patterns}
152
+ else:
153
+ return sanitize_patterns(list(patterns))
154
+
155
+
156
+ def _is_inside_unrequested_special_dir(matched_rel_path: str, pattern: str) -> bool:
157
+ """
158
+ When a path matches a pattern, we additionally check if it's inside a special directory
159
+ we ignore by default (if it starts with a double underscore).
160
+
161
+ Users can still explicitly request a filepath inside such a directory if "__pycache__" is
162
+ mentioned explicitly in the requested pattern.
163
+
164
+ Some examples:
165
+
166
+ base directory:
167
+
168
+ ./
169
+ └── __pycache__
170
+ └── b.txt
171
+
172
+ >>> _is_inside_unrequested_special_dir("__pycache__/b.txt", "**")
173
+ True
174
+ >>> _is_inside_unrequested_special_dir("__pycache__/b.txt", "*/b.txt")
175
+ True
176
+ >>> _is_inside_unrequested_special_dir("__pycache__/b.txt", "__pycache__/*")
177
+ False
178
+ >>> _is_inside_unrequested_special_dir("__pycache__/b.txt", "__*/*")
179
+ False
180
+ """
181
+ # We just need to check if every special directories from the path is present explicitly in the pattern.
182
+ # Since we assume that the path matches the pattern, it's equivalent to counting that both
183
+ # the parent path and the parent pattern have the same number of special directories.
184
+ data_dirs_to_ignore_in_path = [part for part in PurePath(matched_rel_path).parent.parts if part.startswith("__")]
185
+ data_dirs_to_ignore_in_pattern = [part for part in PurePath(pattern).parent.parts if part.startswith("__")]
186
+ return len(data_dirs_to_ignore_in_path) != len(data_dirs_to_ignore_in_pattern)
187
+
188
+
189
+ def _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(matched_rel_path: str, pattern: str) -> bool:
190
+ """
191
+ When a path matches a pattern, we additionally check if it's a hidden file or if it's inside
192
+ a hidden directory we ignore by default, i.e. if the file name or a parent directory name starts with a dot.
193
+
194
+ Users can still explicitly request a filepath that is hidden or is inside a hidden directory
195
+ if the hidden part is mentioned explicitly in the requested pattern.
196
+
197
+ Some examples:
198
+
199
+ base directory:
200
+
201
+ ./
202
+ └── .hidden_file.txt
203
+
204
+ >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_file.txt", "**")
205
+ True
206
+ >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_file.txt", ".*")
207
+ False
208
+
209
+ base directory:
210
+
211
+ ./
212
+ └── .hidden_dir
213
+ └── a.txt
214
+
215
+ >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/a.txt", "**")
216
+ True
217
+ >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/a.txt", ".*/*")
218
+ False
219
+ >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/a.txt", ".hidden_dir/*")
220
+ False
221
+
222
+ base directory:
223
+
224
+ ./
225
+ └── .hidden_dir
226
+ └── .hidden_file.txt
227
+
228
+ >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/.hidden_file.txt", "**")
229
+ True
230
+ >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/.hidden_file.txt", ".*/*")
231
+ True
232
+ >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/.hidden_file.txt", ".*/.*")
233
+ False
234
+ >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/.hidden_file.txt", ".hidden_dir/*")
235
+ True
236
+ >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/.hidden_file.txt", ".hidden_dir/.*")
237
+ False
238
+ """
239
+ # We just need to check if every hidden part from the path is present explicitly in the pattern.
240
+ # Since we assume that the path matches the pattern, it's equivalent to counting that both
241
+ # the path and the pattern have the same number of hidden parts.
242
+ hidden_directories_in_path = [
243
+ part for part in PurePath(matched_rel_path).parts if part.startswith(".") and not set(part) == {"."}
244
+ ]
245
+ hidden_directories_in_pattern = [
246
+ part for part in PurePath(pattern).parts if part.startswith(".") and not set(part) == {"."}
247
+ ]
248
+ return len(hidden_directories_in_path) != len(hidden_directories_in_pattern)
249
+
250
+
251
+ def _get_data_files_patterns(pattern_resolver: Callable[[str], list[str]]) -> dict[str, list[str]]:
252
+ """
253
+ Get the default pattern from a directory or repository by testing all the supported patterns.
254
+ The first patterns to return a non-empty list of data files is returned.
255
+
256
+ In order, it first tests if SPLIT_PATTERN_SHARDED works, otherwise it tests the patterns in ALL_DEFAULT_PATTERNS.
257
+ """
258
+ # first check the split patterns like data/{split}-00000-of-00001.parquet
259
+ for split_pattern in ALL_SPLIT_PATTERNS:
260
+ pattern = split_pattern.replace("{split}", "*")
261
+ try:
262
+ data_files = pattern_resolver(pattern)
263
+ except FileNotFoundError:
264
+ continue
265
+ if len(data_files) > 0:
266
+ splits: set[str] = set()
267
+ for p in data_files:
268
+ p_parts = string_to_dict(xbasename(p), xbasename(split_pattern))
269
+ assert p_parts is not None
270
+ splits.add(p_parts["split"])
271
+
272
+ if any(not re.match(_split_re, split) for split in splits):
273
+ raise ValueError(f"Split name should match '{_split_re}'' but got '{splits}'.")
274
+ sorted_splits = [str(split) for split in DEFAULT_SPLITS if split in splits] + sorted(
275
+ splits - {str(split) for split in DEFAULT_SPLITS}
276
+ )
277
+ return {split: [split_pattern.format(split=split)] for split in sorted_splits}
278
+ # then check the default patterns based on train/valid/test splits
279
+ for patterns_dict in ALL_DEFAULT_PATTERNS:
280
+ non_empty_splits = []
281
+ for split, patterns in patterns_dict.items():
282
+ for pattern in patterns:
283
+ try:
284
+ data_files = pattern_resolver(pattern)
285
+ except FileNotFoundError:
286
+ continue
287
+ if len(data_files) > 0:
288
+ non_empty_splits.append(split)
289
+ break
290
+ if non_empty_splits:
291
+ return {split: patterns_dict[split] for split in non_empty_splits}
292
+ raise FileNotFoundError(f"Couldn't resolve pattern {pattern} with resolver {pattern_resolver}")
293
+
294
+
295
+ def resolve_pattern(
296
+ pattern: str,
297
+ base_path: str,
298
+ allowed_extensions: Optional[list[str]] = None,
299
+ download_config: Optional[DownloadConfig] = None,
300
+ ) -> list[str]:
301
+ """
302
+ Resolve the paths and URLs of the data files from the pattern passed by the user.
303
+
304
+ You can use patterns to resolve multiple local files. Here are a few examples:
305
+ - *.csv to match all the CSV files at the first level
306
+ - **.csv to match all the CSV files at any level
307
+ - data/* to match all the files inside "data"
308
+ - data/** to match all the files inside "data" and its subdirectories
309
+
310
+ The patterns are resolved using the fsspec glob. In fsspec>=2023.12.0 this is equivalent to
311
+ Python's glob.glob, Path.glob, Path.match and fnmatch where ** is unsupported with a prefix/suffix
312
+ other than a forward slash /.
313
+
314
+ More generally:
315
+ - '*' matches any character except a forward-slash (to match just the file or directory name)
316
+ - '**' matches any character including a forward-slash /
317
+
318
+ Hidden files and directories (i.e. whose names start with a dot) are ignored, unless they are explicitly requested.
319
+ The same applies to special directories that start with a double underscore like "__pycache__".
320
+ You can still include one if the pattern explicitly mentions it:
321
+ - to include a hidden file: "*/.hidden.txt" or "*/.*"
322
+ - to include a hidden directory: ".hidden/*" or ".*/*"
323
+ - to include a special directory: "__special__/*" or "__*/*"
324
+
325
+ Example::
326
+
327
+ >>> from datasets.data_files import resolve_pattern
328
+ >>> base_path = "."
329
+ >>> resolve_pattern("docs/**/*.py", base_path)
330
+ [/Users/mariosasko/Desktop/projects/datasets/docs/source/_config.py']
331
+
332
+ Args:
333
+ pattern (str): Unix pattern or paths or URLs of the data files to resolve.
334
+ The paths can be absolute or relative to base_path.
335
+ Remote filesystems using fsspec are supported, e.g. with the hf:// protocol.
336
+ base_path (str): Base path to use when resolving relative paths.
337
+ allowed_extensions (Optional[list], optional): White-list of file extensions to use. Defaults to None (all extensions).
338
+ For example: allowed_extensions=[".csv", ".json", ".txt", ".parquet"]
339
+ download_config ([`DownloadConfig`], *optional*): Specific download configuration parameters.
340
+ Returns:
341
+ List[str]: List of paths or URLs to the local or remote files that match the patterns.
342
+ """
343
+ if is_relative_path(pattern):
344
+ pattern = xjoin(base_path, pattern)
345
+ elif is_local_path(pattern):
346
+ base_path = os.path.splitdrive(pattern)[0] + os.sep
347
+ else:
348
+ base_path = ""
349
+ pattern, storage_options = _prepare_path_and_storage_options(pattern, download_config=download_config)
350
+ fs, fs_pattern = url_to_fs(pattern, **storage_options)
351
+ files_to_ignore = set(FILES_TO_IGNORE) - {xbasename(pattern)}
352
+ protocol = (
353
+ pattern.split("://")[0]
354
+ if "://" in pattern
355
+ else (fs.protocol if isinstance(fs.protocol, str) else fs.protocol[0])
356
+ )
357
+ protocol_prefix = protocol + "://" if protocol != "file" else ""
358
+ glob_kwargs = {}
359
+ if protocol == "hf":
360
+ # 10 times faster glob with detail=True (ignores costly info like lastCommit)
361
+ glob_kwargs["expand_info"] = False
362
+ matched_paths = [
363
+ filepath if "://" in filepath else protocol_prefix + filepath
364
+ for filepath, info in fs.glob(pattern, detail=True, **glob_kwargs).items()
365
+ if (info["type"] == "file" or (info.get("islink") and os.path.isfile(os.path.realpath(filepath))))
366
+ and (xbasename(filepath) not in files_to_ignore)
367
+ and not _is_inside_unrequested_special_dir(filepath, fs_pattern)
368
+ and not _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(filepath, fs_pattern)
369
+ ] # ignore .ipynb and __pycache__, but keep /../
370
+ if allowed_extensions is not None:
371
+ out = [
372
+ filepath
373
+ for filepath in matched_paths
374
+ if any("." + suffix in allowed_extensions for suffix in xbasename(filepath).split(".")[1:])
375
+ ]
376
+ if len(out) < len(matched_paths):
377
+ invalid_matched_files = list(set(matched_paths) - set(out))
378
+ logger.info(
379
+ f"Some files matched the pattern '{pattern}' but don't have valid data file extensions: {invalid_matched_files}"
380
+ )
381
+ else:
382
+ out = matched_paths
383
+ if not out:
384
+ error_msg = f"Unable to find '{pattern}'"
385
+ if allowed_extensions is not None:
386
+ error_msg += f" with any supported extension {list(allowed_extensions)}"
387
+ raise FileNotFoundError(error_msg)
388
+ return out
389
+
390
+
391
+ def get_data_patterns(base_path: str, download_config: Optional[DownloadConfig] = None) -> dict[str, list[str]]:
392
+ """
393
+ Get the default pattern from a directory testing all the supported patterns.
394
+ The first patterns to return a non-empty list of data files is returned.
395
+
396
+ Some examples of supported patterns:
397
+
398
+ Input:
399
+
400
+ my_dataset_repository/
401
+ ├── README.md
402
+ └── dataset.csv
403
+
404
+ Output:
405
+
406
+ {'train': ['**']}
407
+
408
+ Input:
409
+
410
+ my_dataset_repository/
411
+ ├── README.md
412
+ ├── train.csv
413
+ └── test.csv
414
+
415
+ my_dataset_repository/
416
+ ├── README.md
417
+ └── data/
418
+ ├── train.csv
419
+ └── test.csv
420
+
421
+ my_dataset_repository/
422
+ ├── README.md
423
+ ├── train_0.csv
424
+ ├── train_1.csv
425
+ ├── train_2.csv
426
+ ├── train_3.csv
427
+ ├── test_0.csv
428
+ └── test_1.csv
429
+
430
+ Output:
431
+
432
+ {'train': ['**/train[-._ 0-9]*', '**/*[-._ 0-9]train[-._ 0-9]*', '**/training[-._ 0-9]*', '**/*[-._ 0-9]training[-._ 0-9]*'],
433
+ 'test': ['**/test[-._ 0-9]*', '**/*[-._ 0-9]test[-._ 0-9]*', '**/testing[-._ 0-9]*', '**/*[-._ 0-9]testing[-._ 0-9]*', ...]}
434
+
435
+ Input:
436
+
437
+ my_dataset_repository/
438
+ ├── README.md
439
+ └── data/
440
+ ├── train/
441
+ │ ├── shard_0.csv
442
+ │ ├── shard_1.csv
443
+ │ ├── shard_2.csv
444
+ │ └── shard_3.csv
445
+ └── test/
446
+ ├── shard_0.csv
447
+ └── shard_1.csv
448
+
449
+ Output:
450
+
451
+ {'train': ['**/train/**', '**/train[-._ 0-9]*/**', '**/*[-._ 0-9]train/**', '**/*[-._ 0-9]train[-._ 0-9]*/**', ...],
452
+ 'test': ['**/test/**', '**/test[-._ 0-9]*/**', '**/*[-._ 0-9]test/**', '**/*[-._ 0-9]test[-._ 0-9]*/**', ...]}
453
+
454
+ Input:
455
+
456
+ my_dataset_repository/
457
+ ├── README.md
458
+ └── data/
459
+ ├── train-00000-of-00003.csv
460
+ ├── train-00001-of-00003.csv
461
+ ├── train-00002-of-00003.csv
462
+ ├── test-00000-of-00001.csv
463
+ ├── random-00000-of-00003.csv
464
+ ├── random-00001-of-00003.csv
465
+ └── random-00002-of-00003.csv
466
+
467
+ Output:
468
+
469
+ {'train': ['data/train-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*'],
470
+ 'test': ['data/test-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*'],
471
+ 'random': ['data/random-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*']}
472
+
473
+ In order, it first tests if SPLIT_PATTERN_SHARDED works, otherwise it tests the patterns in ALL_DEFAULT_PATTERNS.
474
+ """
475
+ resolver = partial(resolve_pattern, base_path=base_path, download_config=download_config)
476
+ try:
477
+ return _get_data_files_patterns(resolver)
478
+ except FileNotFoundError:
479
+ raise EmptyDatasetError(f"The directory at {base_path} doesn't contain any data files") from None
480
+
481
+
482
+ def _get_single_origin_metadata(
483
+ data_file: str,
484
+ download_config: Optional[DownloadConfig] = None,
485
+ ) -> SingleOriginMetadata:
486
+ data_file, storage_options = _prepare_path_and_storage_options(data_file, download_config=download_config)
487
+ fs, *_ = url_to_fs(data_file, **storage_options)
488
+ if isinstance(fs, HfFileSystem):
489
+ resolved_path = fs.resolve_path(data_file)
490
+ return resolved_path.repo_id, resolved_path.revision
491
+ elif data_file.startswith(config.HF_ENDPOINT):
492
+ hffs = HfFileSystem(endpoint=config.HF_ENDPOINT, token=download_config.token)
493
+ data_file = "hf://" + data_file[len(config.HF_ENDPOINT) + 1 :].replace("/resolve/", "@", 1)
494
+ resolved_path = hffs.resolve_path(data_file)
495
+ return resolved_path.repo_id, resolved_path.revision
496
+ info = fs.info(data_file)
497
+ # s3fs uses "ETag", gcsfs uses "etag", and for local we simply check mtime
498
+ for key in ["ETag", "etag", "mtime"]:
499
+ if key in info:
500
+ return (str(info[key]),)
501
+ return ()
502
+
503
+
504
+ def _get_origin_metadata(
505
+ data_files: list[str],
506
+ download_config: Optional[DownloadConfig] = None,
507
+ max_workers: Optional[int] = None,
508
+ ) -> list[SingleOriginMetadata]:
509
+ max_workers = max_workers if max_workers is not None else config.HF_DATASETS_MULTITHREADING_MAX_WORKERS
510
+ if all("hf://" in data_file for data_file in data_files):
511
+ # No need for multithreading here since the origin metadata of HF files
512
+ # is (repo_id, revision) and is cached after first .info() call.
513
+ return [
514
+ _get_single_origin_metadata(data_file, download_config=download_config)
515
+ for data_file in hf_tqdm(
516
+ data_files,
517
+ desc="Resolving data files",
518
+ # set `disable=None` rather than `disable=False` by default to disable progress bar when no TTY attached
519
+ disable=len(data_files) <= 16 or None,
520
+ )
521
+ ]
522
+ return thread_map(
523
+ partial(_get_single_origin_metadata, download_config=download_config),
524
+ data_files,
525
+ max_workers=max_workers,
526
+ tqdm_class=hf_tqdm,
527
+ desc="Resolving data files",
528
+ # set `disable=None` rather than `disable=False` by default to disable progress bar when no TTY attached
529
+ disable=len(data_files) <= 16 or None,
530
+ )
531
+
532
+
533
+ class DataFilesList(list[str]):
534
+ """
535
+ List of data files (absolute local paths or URLs).
536
+ It has two construction methods given the user's data files patterns:
537
+ - ``from_hf_repo``: resolve patterns inside a dataset repository
538
+ - ``from_local_or_remote``: resolve patterns from a local path
539
+
540
+ Moreover, DataFilesList has an additional attribute ``origin_metadata``.
541
+ It can store:
542
+ - the last modified time of local files
543
+ - ETag of remote files
544
+ - commit sha of a dataset repository
545
+
546
+ Thanks to this additional attribute, it is possible to hash the list
547
+ and get a different hash if and only if at least one file changed.
548
+ This is useful for caching Dataset objects that are obtained from a list of data files.
549
+ """
550
+
551
+ def __init__(self, data_files: list[str], origin_metadata: list[SingleOriginMetadata]) -> None:
552
+ super().__init__(data_files)
553
+ self.origin_metadata = origin_metadata
554
+
555
+ def __add__(self, other: "DataFilesList") -> "DataFilesList":
556
+ return DataFilesList([*self, *other], self.origin_metadata + other.origin_metadata)
557
+
558
+ @classmethod
559
+ def from_hf_repo(
560
+ cls,
561
+ patterns: list[str],
562
+ dataset_info: huggingface_hub.hf_api.DatasetInfo,
563
+ base_path: Optional[str] = None,
564
+ allowed_extensions: Optional[list[str]] = None,
565
+ download_config: Optional[DownloadConfig] = None,
566
+ ) -> "DataFilesList":
567
+ base_path = f"hf://datasets/{dataset_info.id}@{dataset_info.sha}/{base_path or ''}".rstrip("/")
568
+ return cls.from_patterns(
569
+ patterns, base_path=base_path, allowed_extensions=allowed_extensions, download_config=download_config
570
+ )
571
+
572
+ @classmethod
573
+ def from_local_or_remote(
574
+ cls,
575
+ patterns: list[str],
576
+ base_path: Optional[str] = None,
577
+ allowed_extensions: Optional[list[str]] = None,
578
+ download_config: Optional[DownloadConfig] = None,
579
+ ) -> "DataFilesList":
580
+ base_path = base_path if base_path is not None else Path().resolve().as_posix()
581
+ return cls.from_patterns(
582
+ patterns, base_path=base_path, allowed_extensions=allowed_extensions, download_config=download_config
583
+ )
584
+
585
+ @classmethod
586
+ def from_patterns(
587
+ cls,
588
+ patterns: list[str],
589
+ base_path: Optional[str] = None,
590
+ allowed_extensions: Optional[list[str]] = None,
591
+ download_config: Optional[DownloadConfig] = None,
592
+ ) -> "DataFilesList":
593
+ base_path = base_path if base_path is not None else Path().resolve().as_posix()
594
+ data_files = []
595
+ for pattern in patterns:
596
+ try:
597
+ data_files.extend(
598
+ resolve_pattern(
599
+ pattern,
600
+ base_path=base_path,
601
+ allowed_extensions=allowed_extensions,
602
+ download_config=download_config,
603
+ )
604
+ )
605
+ except FileNotFoundError:
606
+ if not has_magic(pattern):
607
+ raise
608
+ origin_metadata = _get_origin_metadata(data_files, download_config=download_config)
609
+ return cls(data_files, origin_metadata)
610
+
611
+ def filter(
612
+ self, *, extensions: Optional[list[str]] = None, file_names: Optional[list[str]] = None
613
+ ) -> "DataFilesList":
614
+ patterns = []
615
+ if extensions:
616
+ ext_pattern = "|".join(re.escape(ext) for ext in extensions)
617
+ patterns.append(re.compile(f".*({ext_pattern})(\\..+)?$"))
618
+ if file_names:
619
+ fn_pattern = "|".join(re.escape(fn) for fn in file_names)
620
+ patterns.append(re.compile(rf".*[\/]?({fn_pattern})$"))
621
+ if patterns:
622
+ return DataFilesList(
623
+ [data_file for data_file in self if any(pattern.match(data_file) for pattern in patterns)],
624
+ origin_metadata=self.origin_metadata,
625
+ )
626
+ else:
627
+ return DataFilesList(list(self), origin_metadata=self.origin_metadata)
628
+
629
+
630
+ class DataFilesDict(dict[str, DataFilesList]):
631
+ """
632
+ Dict of split_name -> list of data files (absolute local paths or URLs).
633
+ It has two construction methods given the user's data files patterns :
634
+ - ``from_hf_repo``: resolve patterns inside a dataset repository
635
+ - ``from_local_or_remote``: resolve patterns from a local path
636
+
637
+ Moreover, each list is a DataFilesList. It is possible to hash the dictionary
638
+ and get a different hash if and only if at least one file changed.
639
+ For more info, see [`DataFilesList`].
640
+
641
+ This is useful for caching Dataset objects that are obtained from a list of data files.
642
+
643
+ Changing the order of the keys of this dictionary also doesn't change its hash.
644
+ """
645
+
646
+ @classmethod
647
+ def from_local_or_remote(
648
+ cls,
649
+ patterns: dict[str, Union[list[str], DataFilesList]],
650
+ base_path: Optional[str] = None,
651
+ allowed_extensions: Optional[list[str]] = None,
652
+ download_config: Optional[DownloadConfig] = None,
653
+ ) -> "DataFilesDict":
654
+ out = cls()
655
+ for key, patterns_for_key in patterns.items():
656
+ out[key] = (
657
+ patterns_for_key
658
+ if isinstance(patterns_for_key, DataFilesList)
659
+ else DataFilesList.from_local_or_remote(
660
+ patterns_for_key,
661
+ base_path=base_path,
662
+ allowed_extensions=allowed_extensions,
663
+ download_config=download_config,
664
+ )
665
+ )
666
+ return out
667
+
668
+ @classmethod
669
+ def from_hf_repo(
670
+ cls,
671
+ patterns: dict[str, Union[list[str], DataFilesList]],
672
+ dataset_info: huggingface_hub.hf_api.DatasetInfo,
673
+ base_path: Optional[str] = None,
674
+ allowed_extensions: Optional[list[str]] = None,
675
+ download_config: Optional[DownloadConfig] = None,
676
+ ) -> "DataFilesDict":
677
+ out = cls()
678
+ for key, patterns_for_key in patterns.items():
679
+ out[key] = (
680
+ patterns_for_key
681
+ if isinstance(patterns_for_key, DataFilesList)
682
+ else DataFilesList.from_hf_repo(
683
+ patterns_for_key,
684
+ dataset_info=dataset_info,
685
+ base_path=base_path,
686
+ allowed_extensions=allowed_extensions,
687
+ download_config=download_config,
688
+ )
689
+ )
690
+ return out
691
+
692
+ @classmethod
693
+ def from_patterns(
694
+ cls,
695
+ patterns: dict[str, Union[list[str], DataFilesList]],
696
+ base_path: Optional[str] = None,
697
+ allowed_extensions: Optional[list[str]] = None,
698
+ download_config: Optional[DownloadConfig] = None,
699
+ ) -> "DataFilesDict":
700
+ out = cls()
701
+ for key, patterns_for_key in patterns.items():
702
+ out[key] = (
703
+ patterns_for_key
704
+ if isinstance(patterns_for_key, DataFilesList)
705
+ else DataFilesList.from_patterns(
706
+ patterns_for_key,
707
+ base_path=base_path,
708
+ allowed_extensions=allowed_extensions,
709
+ download_config=download_config,
710
+ )
711
+ )
712
+ return out
713
+
714
+ def filter(
715
+ self, *, extensions: Optional[list[str]] = None, file_names: Optional[list[str]] = None
716
+ ) -> "DataFilesDict":
717
+ out = type(self)()
718
+ for key, data_files_list in self.items():
719
+ out[key] = data_files_list.filter(extensions=extensions, file_names=file_names)
720
+ return out
721
+
722
+
723
+ class DataFilesPatternsList(list[str]):
724
+ """
725
+ List of data files patterns (absolute local paths or URLs).
726
+ For each pattern there should also be a list of allowed extensions
727
+ to keep, or a None ot keep all the files for the pattern.
728
+ """
729
+
730
+ def __init__(
731
+ self,
732
+ patterns: list[str],
733
+ allowed_extensions: list[Optional[list[str]]],
734
+ ):
735
+ super().__init__(patterns)
736
+ self.allowed_extensions = allowed_extensions
737
+
738
+ def __add__(self, other):
739
+ return DataFilesList([*self, *other], self.allowed_extensions + other.allowed_extensions)
740
+
741
+ @classmethod
742
+ def from_patterns(
743
+ cls, patterns: list[str], allowed_extensions: Optional[list[str]] = None
744
+ ) -> "DataFilesPatternsList":
745
+ return cls(patterns, [allowed_extensions] * len(patterns))
746
+
747
+ def resolve(
748
+ self,
749
+ base_path: str,
750
+ download_config: Optional[DownloadConfig] = None,
751
+ ) -> "DataFilesList":
752
+ base_path = base_path if base_path is not None else Path().resolve().as_posix()
753
+ data_files = []
754
+ for pattern, allowed_extensions in zip(self, self.allowed_extensions):
755
+ try:
756
+ data_files.extend(
757
+ resolve_pattern(
758
+ pattern,
759
+ base_path=base_path,
760
+ allowed_extensions=allowed_extensions,
761
+ download_config=download_config,
762
+ )
763
+ )
764
+ except FileNotFoundError:
765
+ if not has_magic(pattern):
766
+ raise
767
+ origin_metadata = _get_origin_metadata(data_files, download_config=download_config)
768
+ return DataFilesList(data_files, origin_metadata)
769
+
770
+ def filter_extensions(self, extensions: list[str]) -> "DataFilesPatternsList":
771
+ return DataFilesPatternsList(
772
+ self, [allowed_extensions + extensions for allowed_extensions in self.allowed_extensions]
773
+ )
774
+
775
+
776
+ class DataFilesPatternsDict(dict[str, DataFilesPatternsList]):
777
+ """
778
+ Dict of split_name -> list of data files patterns (absolute local paths or URLs).
779
+ """
780
+
781
+ @classmethod
782
+ def from_patterns(
783
+ cls, patterns: dict[str, list[str]], allowed_extensions: Optional[list[str]] = None
784
+ ) -> "DataFilesPatternsDict":
785
+ out = cls()
786
+ for key, patterns_for_key in patterns.items():
787
+ out[key] = (
788
+ patterns_for_key
789
+ if isinstance(patterns_for_key, DataFilesPatternsList)
790
+ else DataFilesPatternsList.from_patterns(
791
+ patterns_for_key,
792
+ allowed_extensions=allowed_extensions,
793
+ )
794
+ )
795
+ return out
796
+
797
+ def resolve(
798
+ self,
799
+ base_path: str,
800
+ download_config: Optional[DownloadConfig] = None,
801
+ ) -> "DataFilesDict":
802
+ out = DataFilesDict()
803
+ for key, data_files_patterns_list in self.items():
804
+ out[key] = data_files_patterns_list.resolve(base_path, download_config)
805
+ return out
806
+
807
+ def filter_extensions(self, extensions: list[str]) -> "DataFilesPatternsDict":
808
+ out = type(self)()
809
+ for key, data_files_patterns_list in self.items():
810
+ out[key] = data_files_patterns_list.filter_extensions(extensions)
811
+ return out
.venv/lib/python3.11/site-packages/datasets/dataset_dict.py ADDED
The diff for this file is too large to render. See raw diff
 
.venv/lib/python3.11/site-packages/datasets/distributed.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import TypeVar
2
+
3
+ from .arrow_dataset import Dataset, _split_by_node_map_style_dataset
4
+ from .iterable_dataset import IterableDataset, _split_by_node_iterable_dataset
5
+
6
+
7
+ DatasetType = TypeVar("DatasetType", Dataset, IterableDataset)
8
+
9
+
10
def split_dataset_by_node(dataset: DatasetType, rank: int, world_size: int) -> DatasetType:
    """Split a dataset for the node at rank `rank` in a pool of nodes of size `world_size`.

    For map-style datasets:

    Each node is assigned a chunk of data, e.g. rank 0 is given the first chunk of the dataset.
    To maximize data loading throughput, chunks are made of contiguous data on disk if possible.

    For iterable datasets:

    If the dataset has a number of shards that is a factor of `world_size` (i.e. if `dataset.num_shards % world_size == 0`),
    then the shards are evenly assigned across the nodes, which is the most optimized.
    Otherwise, each node keeps 1 example out of `world_size`, skipping the other examples.

    Args:
        dataset ([`Dataset`] or [`IterableDataset`]):
            The dataset to split by node.
        rank (`int`):
            Rank of the current node.
        world_size (`int`):
            Total number of nodes.

    Returns:
        [`Dataset`] or [`IterableDataset`]: The dataset to be used on the node at rank `rank`.
    """
    # Map-style and iterable datasets use different sharding strategies: dispatch on the concrete type.
    if isinstance(dataset, Dataset):
        return _split_by_node_map_style_dataset(dataset, rank=rank, world_size=world_size)
    return _split_by_node_iterable_dataset(dataset, rank=rank, world_size=world_size)
.venv/lib/python3.11/site-packages/datasets/download/__init__.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ __all__ = [
2
+ "DownloadConfig",
3
+ "DownloadManager",
4
+ "DownloadMode",
5
+ "StreamingDownloadManager",
6
+ ]
7
+
8
+ from .download_config import DownloadConfig
9
+ from .download_manager import DownloadManager, DownloadMode
10
+ from .streaming_download_manager import StreamingDownloadManager
.venv/lib/python3.11/site-packages/datasets/download/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (515 Bytes). View file
 
.venv/lib/python3.11/site-packages/datasets/download/__pycache__/download_config.cpython-311.pyc ADDED
Binary file (5.63 kB). View file
 
.venv/lib/python3.11/site-packages/datasets/download/__pycache__/download_manager.cpython-311.pyc ADDED
Binary file (16.6 kB). View file
 
.venv/lib/python3.11/site-packages/datasets/download/__pycache__/streaming_download_manager.cpython-311.pyc ADDED
Binary file (10.1 kB). View file
 
.venv/lib/python3.11/site-packages/datasets/download/download_config.py ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ from dataclasses import dataclass, field
3
+ from pathlib import Path
4
+ from typing import Any, Optional, Union
5
+
6
+ from .. import config
7
+
8
+
9
@dataclass
class DownloadConfig:
    """Configuration for our cached path manager.

    Attributes:
        cache_dir (`str` or `Path`, *optional*):
            Specify a cache directory to save the file to (overwrite the
            default cache dir).
        force_download (`bool`, defaults to `False`):
            If `True`, re-download the file even if it's already cached in
            the cache dir.
        resume_download (`bool`, defaults to `False`):
            If `True`, resume the download if an incompletely received file is
            found.
        local_files_only (`bool`, defaults to `False`):
            If `True`, only look at cached files and never attempt a remote download.
        proxies (`dict`, *optional*):
            Proxy servers passed on to the underlying HTTP requests.
        user_agent (`str`, *optional*):
            Optional string or dict that will be appended to the user-agent on remote
            requests.
        extract_compressed_file (`bool`, defaults to `False`):
            If `True` and the path point to a zip or tar file,
            extract the compressed file in a folder along the archive.
        force_extract (`bool`, defaults to `False`):
            If `True` when `extract_compressed_file` is `True` and the archive
            was already extracted, re-extract the archive and override the folder where it was extracted.
        delete_extracted (`bool`, defaults to `False`):
            Whether to delete (or keep) the extracted files.
        extract_on_the_fly (`bool`, defaults to `False`):
            If `True`, extract compressed files while they are being read.
        use_etag (`bool`, defaults to `True`):
            Whether to use the ETag HTTP response header to validate the cached files.
        num_proc (`int`, *optional*):
            The number of processes to launch to download the files in parallel.
        max_retries (`int`, default to `1`):
            The number of times to retry an HTTP request if it fails.
        token (`str` or `bool`, *optional*):
            Optional string or boolean to use as Bearer token
            for remote files on the Datasets Hub. If `True`, or not specified, will get token from `~/.huggingface`.
        storage_options (`dict`, *optional*):
            Key/value pairs to be passed on to the dataset file-system backend, if any.
        download_desc (`str`, *optional*):
            A description to be displayed alongside with the progress bar while downloading the files.
        disable_tqdm (`bool`, defaults to `False`):
            Whether to disable the individual files download progress bar
    """

    cache_dir: Optional[Union[str, Path]] = None
    force_download: bool = False
    resume_download: bool = False
    local_files_only: bool = False
    proxies: Optional[dict] = None
    user_agent: Optional[str] = None
    extract_compressed_file: bool = False
    force_extract: bool = False
    delete_extracted: bool = False
    extract_on_the_fly: bool = False
    use_etag: bool = True
    num_proc: Optional[int] = None
    max_retries: int = 1
    token: Optional[Union[str, bool]] = None
    storage_options: dict[str, Any] = field(default_factory=dict)
    download_desc: Optional[str] = None
    disable_tqdm: bool = False

    def copy(self) -> "DownloadConfig":
        """Return a deep copy of this config (nested mutable values are not shared)."""
        return self.__class__(**{k: copy.deepcopy(v) for k, v in self.__dict__.items()})

    def __setattr__(self, name, value):
        # Keep `storage_options["hf"]` in sync whenever the Hub token is set:
        # the "hf" fsspec filesystem reads its credentials from there.
        if name == "token" and getattr(self, "storage_options", None) is not None:
            if "hf" not in self.storage_options:
                self.storage_options["hf"] = {"endpoint": config.HF_ENDPOINT, "token": value}
            elif self.storage_options["hf"].get("token") is None:
                # Bug fix: `storage_options["hf"]` is a plain dict, so the previous
                # `getattr(..., "token", None)` always returned None and silently
                # overwrote an explicitly-provided token on every assignment.
                # Dict lookup preserves an existing token and only fills in a
                # missing one.
                self.storage_options["hf"]["token"] = value
        super().__setattr__(name, value)
.venv/lib/python3.11/site-packages/datasets/download/download_manager.py ADDED
@@ -0,0 +1,340 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2020 The TensorFlow Datasets Authors.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ # Lint as: python3
16
+ """Download manager interface."""
17
+
18
+ import enum
19
+ import io
20
+ import multiprocessing
21
+ import os
22
+ from datetime import datetime
23
+ from functools import partial
24
+ from typing import Optional, Union
25
+
26
+ import fsspec
27
+ from fsspec.core import url_to_fs
28
+ from tqdm.contrib.concurrent import thread_map
29
+
30
+ from .. import config
31
+ from ..utils import tqdm as hf_tqdm
32
+ from ..utils.file_utils import (
33
+ ArchiveIterable,
34
+ FilesIterable,
35
+ cached_path,
36
+ is_relative_path,
37
+ stack_multiprocessing_download_progress_bars,
38
+ url_or_path_join,
39
+ )
40
+ from ..utils.info_utils import get_size_checksum_dict
41
+ from ..utils.logging import get_logger, tqdm
42
+ from ..utils.py_utils import NestedDataStructure, map_nested
43
+ from ..utils.track import tracked_str
44
+ from .download_config import DownloadConfig
45
+
46
+
47
+ logger = get_logger(__name__)
48
+
49
+
50
class DownloadMode(enum.Enum):
    """`Enum` describing how pre-existing downloads and prepared data are treated.

    The default mode, `REUSE_DATASET_IF_EXISTS`, reuses both the raw downloads
    and the prepared dataset when they already exist.

    The generations modes:

    |                                     | Downloads | Dataset |
    |-------------------------------------|-----------|---------|
    | `REUSE_DATASET_IF_EXISTS` (default) | Reuse     | Reuse   |
    | `REUSE_CACHE_IF_EXISTS`             | Reuse     | Fresh   |
    | `FORCE_REDOWNLOAD`                  | Fresh     | Fresh   |

    """

    REUSE_DATASET_IF_EXISTS = "reuse_dataset_if_exists"
    REUSE_CACHE_IF_EXISTS = "reuse_cache_if_exists"
    FORCE_REDOWNLOAD = "force_redownload"
69
+
70
+
71
class DownloadManager:
    """Downloads (and optionally extracts) dataset files, recording their sizes and checksums."""

    is_streaming = False

    def __init__(
        self,
        dataset_name: Optional[str] = None,
        data_dir: Optional[str] = None,
        download_config: Optional[DownloadConfig] = None,
        base_path: Optional[str] = None,
        record_checksums=True,
    ):
        """Download manager constructor.

        Args:
            data_dir:
                can be used to specify a manual directory to get the files from.
            dataset_name (`str`):
                name of the dataset this instance will be used for. If provided,
                downloads will contain which datasets they were used for.
            download_config (`DownloadConfig`):
                to specify the cache directory and other download options.
            base_path (`str`):
                base path used when relative paths are used to download files.
                This can be a remote url.
            record_checksums (`bool`, defaults to `True`):
                Whether to record the checksums of the downloaded files. If None, the value is inferred from the builder.
        """
        self._dataset_name = dataset_name
        self._data_dir = data_dir
        self._base_path = base_path or os.path.abspath(".")
        # Bookkeeping of what was fetched: {url: {num_bytes: int, checksum: str}}
        self._recorded_sizes_checksums: dict[str, dict[str, Optional[Union[int, str]]]] = {}
        self.record_checksums = record_checksums
        self.download_config = download_config or DownloadConfig()
        self.downloaded_paths = {}
        self.extracted_paths = {}

    @property
    def manual_dir(self):
        # Directory for manually-downloaded data, if one was provided.
        return self._data_dir

    @property
    def downloaded_size(self):
        """Returns the total size of downloaded files."""
        return sum(entry["num_bytes"] for entry in self._recorded_sizes_checksums.values())

    def _record_sizes_checksums(self, url_or_urls: NestedDataStructure, downloaded_path_or_paths: NestedDataStructure):
        """Record size/checksum of downloaded files."""
        pairs = list(zip(url_or_urls.flatten(), downloaded_path_or_paths.flatten()))
        # delay=5: only show the progress bar if this takes more than a few seconds
        for url, path in hf_tqdm(pairs, delay=5, desc="Computing checksums"):
            # call str to support PathLike objects
            self._recorded_sizes_checksums[str(url)] = get_size_checksum_dict(
                path, record_checksum=self.record_checksums
            )

    def download(self, url_or_urls):
        """Download given URL(s).

        By default, only one process is used for download. Pass customized `download_config.num_proc` to change this behavior.

        Args:
            url_or_urls (`str` or `list` or `dict`):
                URL or `list` or `dict` of URLs to download. Each URL is a `str`.

        Returns:
            `str` or `list` or `dict`:
                The downloaded paths matching the given input `url_or_urls`.

        Example:

        ```py
        >>> downloaded_files = dl_manager.download('https://storage.googleapis.com/seldon-datasets/sentence_polarity_v1/rt-polaritydata.tar.gz')
        ```
        """
        download_config = self.download_config.copy()
        download_config.extract_compressed_file = False
        if download_config.download_desc is None:
            download_config.download_desc = "Downloading data"

        batched_download = partial(self._download_batched, download_config=download_config)

        start_time = datetime.now()
        with stack_multiprocessing_download_progress_bars():
            downloaded_path_or_paths = map_nested(
                batched_download,
                url_or_urls,
                map_tuple=True,
                num_proc=download_config.num_proc,
                desc="Downloading data files",
                batched=True,
                batch_size=-1,
            )
        duration = datetime.now() - start_time
        logger.info(f"Downloading took {duration.total_seconds() // 60} min")

        url_or_urls = NestedDataStructure(url_or_urls)
        downloaded_path_or_paths = NestedDataStructure(downloaded_path_or_paths)
        self.downloaded_paths.update(dict(zip(url_or_urls.flatten(), downloaded_path_or_paths.flatten())))

        start_time = datetime.now()
        self._record_sizes_checksums(url_or_urls, downloaded_path_or_paths)
        duration = datetime.now() - start_time
        logger.info(f"Checksum Computation took {duration.total_seconds() // 60} min")

        return downloaded_path_or_paths.data

    def _download_batched(
        self,
        url_or_filenames: list[str],
        download_config: DownloadConfig,
    ) -> list[str]:
        if len(url_or_filenames) < 16:
            # Small batch: download sequentially, keeping per-file progress bars.
            return [
                self._download_single(url_or_filename, download_config=download_config)
                for url_or_filename in url_or_filenames
            ]

        download_config = download_config.copy()
        download_config.disable_tqdm = True
        single_download = partial(self._download_single, download_config=download_config)

        # Sample the first file's size to decide whether multithreading pays off.
        fs: fsspec.AbstractFileSystem
        path = str(url_or_filenames[0])
        if is_relative_path(path):
            # append the relative path to the base_path
            path = url_or_path_join(self._base_path, path)
        fs, path = url_to_fs(path, **download_config.storage_options)
        try:
            size = fs.info(path).get("size", 0)
        except Exception:
            size = 0
        # enable multithreading if files are small
        max_workers = config.HF_DATASETS_MULTITHREADING_MAX_WORKERS if size < (20 << 20) else 1

        # _identity contains the ranks of subprocesses; used to stack progress bars
        stack_bars = os.environ.get("HF_DATASETS_STACK_MULTIPROCESSING_DOWNLOAD_PROGRESS_BARS") == "1"
        process_ranks = multiprocessing.current_process()._identity
        position = process_ranks[-1] if stack_bars and process_ranks else None

        return thread_map(
            single_download,
            url_or_filenames,
            desc=download_config.download_desc or "Downloading",
            unit="files",
            position=position,
            max_workers=max_workers,
            tqdm_class=tqdm,
        )

    def _download_single(self, url_or_filename: str, download_config: DownloadConfig) -> str:
        url_or_filename = str(url_or_filename)
        if is_relative_path(url_or_filename):
            # append the relative path to the base_path
            url_or_filename = url_or_path_join(self._base_path, url_or_filename)
        downloaded = tracked_str(cached_path(url_or_filename, download_config=download_config))
        downloaded.set_origin(url_or_filename)
        return downloaded

    def iter_archive(self, path_or_buf: Union[str, io.BufferedReader]):
        """Iterate over files within an archive.

        Args:
            path_or_buf (`str` or `io.BufferedReader`):
                Archive path or archive binary file object.

        Yields:
            `tuple[str, io.BufferedReader]`:
                2-tuple (path_within_archive, file_object).
                File object is opened in binary mode.

        Example:

        ```py
        >>> archive = dl_manager.download('https://storage.googleapis.com/seldon-datasets/sentence_polarity_v1/rt-polaritydata.tar.gz')
        >>> files = dl_manager.iter_archive(archive)
        ```
        """
        if hasattr(path_or_buf, "read"):
            return ArchiveIterable.from_buf(path_or_buf)
        return ArchiveIterable.from_urlpath(path_or_buf)

    def iter_files(self, paths: Union[str, list[str]]):
        """Iterate over file paths.

        Args:
            paths (`str` or `list` of `str`):
                Root paths.

        Yields:
            `str`: File path.

        Example:

        ```py
        >>> files = dl_manager.download_and_extract('https://huggingface.co/datasets/beans/resolve/main/data/train.zip')
        >>> files = dl_manager.iter_files(files)
        ```
        """
        return FilesIterable.from_urlpaths(paths)

    def extract(self, path_or_paths):
        """Extract given path(s).

        Args:
            path_or_paths (path or `list` or `dict`):
                Path of file to extract. Each path is a `str`.

        Returns:
            extracted_path(s): `str`, The extracted paths matching the given input
            path_or_paths.

        Example:

        ```py
        >>> downloaded_files = dl_manager.download('https://storage.googleapis.com/seldon-datasets/sentence_polarity_v1/rt-polaritydata.tar.gz')
        >>> extracted_files = dl_manager.extract(downloaded_files)
        ```
        """
        download_config = self.download_config.copy()
        download_config.extract_compressed_file = True
        extracted_paths = map_nested(
            partial(self._download_single, download_config=download_config),
            path_or_paths,
            num_proc=download_config.num_proc,
            desc="Extracting data files",
        )
        path_or_paths = NestedDataStructure(path_or_paths)
        extracted_paths = NestedDataStructure(extracted_paths)
        self.extracted_paths.update(dict(zip(path_or_paths.flatten(), extracted_paths.flatten())))
        return extracted_paths.data

    def download_and_extract(self, url_or_urls):
        """Download and extract given `url_or_urls`.

        Is roughly equivalent to:

        ```
        extracted_paths = dl_manager.extract(dl_manager.download(url_or_urls))
        ```

        Args:
            url_or_urls (`str` or `list` or `dict`):
                URL or `list` or `dict` of URLs to download and extract. Each URL is a `str`.

        Returns:
            extracted_path(s): `str`, extracted paths of given URL(s).
        """
        return self.extract(self.download(url_or_urls))

    def get_recorded_sizes_checksums(self):
        # Shallow copy so callers can't mutate the internal record.
        return self._recorded_sizes_checksums.copy()

    def delete_extracted_files(self):
        # Only delete extracted files that are not also raw downloaded files.
        extracted_only = set(self.extracted_paths.values()) - set(self.downloaded_paths.values())
        for key, path in list(self.extracted_paths.items()):
            if path in extracted_only and os.path.isfile(path):
                os.remove(path)
                del self.extracted_paths[key]

    def manage_extracted_files(self):
        if self.download_config.delete_extracted:
            self.delete_extracted_files()
.venv/lib/python3.11/site-packages/datasets/download/streaming_download_manager.py ADDED
@@ -0,0 +1,219 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import io
2
+ import os
3
+ from collections.abc import Iterable
4
+ from typing import Optional, Union
5
+
6
+ from ..utils.file_utils import ( # noqa: F401 # backward compatibility
7
+ SINGLE_FILE_COMPRESSION_PROTOCOLS,
8
+ ArchiveIterable,
9
+ FilesIterable,
10
+ _get_extraction_protocol,
11
+ _get_path_extension,
12
+ _prepare_path_and_storage_options,
13
+ is_relative_path,
14
+ url_or_path_join,
15
+ xbasename,
16
+ xdirname,
17
+ xet_parse,
18
+ xexists,
19
+ xgetsize,
20
+ xglob,
21
+ xgzip_open,
22
+ xisdir,
23
+ xisfile,
24
+ xjoin,
25
+ xlistdir,
26
+ xnumpy_load,
27
+ xopen,
28
+ xpandas_read_csv,
29
+ xpandas_read_excel,
30
+ xPath,
31
+ xpyarrow_parquet_read_table,
32
+ xrelpath,
33
+ xsio_loadmat,
34
+ xsplit,
35
+ xsplitext,
36
+ xwalk,
37
+ xxml_dom_minidom_parse,
38
+ )
39
+ from ..utils.logging import get_logger
40
+ from ..utils.py_utils import map_nested
41
+ from .download_config import DownloadConfig
42
+
43
+
44
+ logger = get_logger(__name__)
45
+
46
+
47
class StreamingDownloadManager:
    """
    Download manager that uses the "::" separator to navigate through (possibly remote) compressed archives.
    Unlike the regular `DownloadManager`, its `download` and `extract` methods don't actually download nor extract
    anything: they return paths or urls that can later be opened with the `xopen` function, which extends the
    built-in `open` function to stream data from remote files.
    """

    is_streaming = True

    def __init__(
        self,
        dataset_name: Optional[str] = None,
        data_dir: Optional[str] = None,
        download_config: Optional[DownloadConfig] = None,
        base_path: Optional[str] = None,
    ):
        self._dataset_name = dataset_name
        self._data_dir = data_dir
        self._base_path = base_path or os.path.abspath(".")
        self.download_config = download_config or DownloadConfig()
        # Streaming never materializes files locally, so nothing is measured or checksummed.
        self.downloaded_size = None
        self.record_checksums = False

    @property
    def manual_dir(self):
        # Directory for manually-downloaded data, if one was provided.
        return self._data_dir

    def download(self, url_or_urls):
        """Normalize URL(s) of files to stream data from.
        This is the lazy version of `DownloadManager.download` for streaming.

        Args:
            url_or_urls (`str` or `list` or `dict`):
                URL(s) of files to stream data from. Each url is a `str`.

        Returns:
            url(s): (`str` or `list` or `dict`), URL(s) to stream data from matching the given input url_or_urls.

        Example:

        ```py
        >>> downloaded_files = dl_manager.download('https://storage.googleapis.com/seldon-datasets/sentence_polarity_v1/rt-polaritydata.tar.gz')
        ```
        """
        return map_nested(self._download_single, url_or_urls, map_tuple=True)

    def _download_single(self, urlpath: str) -> str:
        urlpath = str(urlpath)
        if is_relative_path(urlpath):
            # append the relative path to the base_path
            urlpath = url_or_path_join(self._base_path, urlpath)
        return urlpath

    def extract(self, url_or_urls):
        """Add extraction protocol for given url(s) for streaming.

        This is the lazy version of `DownloadManager.extract` for streaming.

        Args:
            url_or_urls (`str` or `list` or `dict`):
                URL(s) of files to stream data from. Each url is a `str`.

        Returns:
            url(s): (`str` or `list` or `dict`), URL(s) to stream data from matching the given input `url_or_urls`.

        Example:

        ```py
        >>> downloaded_files = dl_manager.download('https://storage.googleapis.com/seldon-datasets/sentence_polarity_v1/rt-polaritydata.tar.gz')
        >>> extracted_files = dl_manager.extract(downloaded_files)
        ```
        """
        return map_nested(self._extract, url_or_urls, map_tuple=True)

    def _extract(self, urlpath: str) -> str:
        urlpath = str(urlpath)
        protocol = _get_extraction_protocol(urlpath, download_config=self.download_config)
        # get inner file: zip://train-00000.json.gz::https://foo.bar/data.zip -> zip://train-00000.json.gz
        path = urlpath.split("::")[0]
        extension = _get_path_extension(path)
        if extension in ["tgz", "tar"] or path.endswith((".tar.gz", ".tar.bz2", ".tar.xz")):
            raise NotImplementedError(
                f"Extraction protocol for TAR archives like '{urlpath}' is not implemented in streaming mode. "
                f"Please use `dl_manager.iter_archive` instead.\n\n"
                f"Example usage:\n\n"
                f"\turl = dl_manager.download(url)\n"
                f"\ttar_archive_iterator = dl_manager.iter_archive(url)\n\n"
                f"\tfor filename, file in tar_archive_iterator:\n"
                f"\t\t..."
            )
        if protocol is None:
            # no extraction
            return urlpath
        if protocol in SINGLE_FILE_COMPRESSION_PROTOCOLS:
            # one single uncompressed file: its name is the archive name minus the last extension
            inner_file = os.path.basename(path)
            if "." in inner_file:
                inner_file = inner_file[: inner_file.rindex(".")]
            return f"{protocol}://{inner_file}::{urlpath}"
        return f"{protocol}://::{urlpath}"

    def download_and_extract(self, url_or_urls):
        """Prepare given `url_or_urls` for streaming (add extraction protocol).

        This is the lazy version of `DownloadManager.download_and_extract` for streaming.

        Is equivalent to:

        ```
        urls = dl_manager.extract(dl_manager.download(url_or_urls))
        ```

        Args:
            url_or_urls (`str` or `list` or `dict`):
                URL(s) to stream from data from. Each url is a `str`.

        Returns:
            url(s): (`str` or `list` or `dict`), URL(s) to stream data from matching the given input `url_or_urls`.
        """
        return self.extract(self.download(url_or_urls))

    def iter_archive(self, urlpath_or_buf: Union[str, io.BufferedReader]) -> Iterable[tuple]:
        """Iterate over files within an archive.

        Args:
            urlpath_or_buf (`str` or `io.BufferedReader`):
                Archive path or archive binary file object.

        Yields:
            `tuple[str, io.BufferedReader]`:
                2-tuple (path_within_archive, file_object).
                File object is opened in binary mode.

        Example:

        ```py
        >>> archive = dl_manager.download('https://storage.googleapis.com/seldon-datasets/sentence_polarity_v1/rt-polaritydata.tar.gz')
        >>> files = dl_manager.iter_archive(archive)
        ```
        """
        if hasattr(urlpath_or_buf, "read"):
            return ArchiveIterable.from_buf(urlpath_or_buf)
        return ArchiveIterable.from_urlpath(urlpath_or_buf, download_config=self.download_config)

    def iter_files(self, urlpaths: Union[str, list[str]]) -> Iterable[str]:
        """Iterate over files.

        Args:
            urlpaths (`str` or `list` of `str`):
                Root paths.

        Yields:
            str: File URL path.

        Example:

        ```py
        >>> files = dl_manager.download_and_extract('https://huggingface.co/datasets/beans/resolve/main/data/train.zip')
        >>> files = dl_manager.iter_files(files)
        ```
        """
        return FilesIterable.from_urlpaths(urlpaths, download_config=self.download_config)

    def manage_extracted_files(self):
        # Nothing is extracted on disk in streaming mode, so there is nothing to clean up.
        pass

    def get_recorded_sizes_checksums(self):
        # No sizes/checksums are recorded in streaming mode.
        pass
.venv/lib/python3.11/site-packages/datasets/exceptions.py ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ # Copyright 2023 The HuggingFace Authors.
3
+ from typing import Any, Optional, Union
4
+
5
+ from huggingface_hub import HfFileSystem
6
+
7
+ from . import config
8
+ from .table import CastError
9
+ from .utils.track import TrackedIterableFromGenerator, tracked_list, tracked_str
10
+
11
+
12
class DatasetsError(Exception):
    """Root of the exception hierarchy raised by this library."""
14
+
15
+
16
class DefunctDatasetError(DatasetsError):
    """Raised when the dataset has been defunct and is no longer usable."""
18
+
19
+
20
class FileNotFoundDatasetsError(DatasetsError, FileNotFoundError):
    """`FileNotFoundError` variant raised by this library (also catchable as `DatasetsError`)."""
22
+
23
+
24
class DataFilesNotFoundError(FileNotFoundDatasetsError):
    """Raised when no (supported) data files are found."""
26
+
27
+
28
class DatasetNotFoundError(FileNotFoundDatasetsError):
    """Dataset not found.

    Raised when trying to access:
    - a missing dataset, or
    - a private/gated dataset and the user is not authenticated.
    """
35
+
36
+
37
class DatasetBuildError(DatasetsError):
    """Base class for errors that occur while building a dataset."""
39
+
40
+
41
class ManualDownloadError(DatasetBuildError):
    """Raised when a dataset requires its files to be downloaded manually by the user."""
43
+
44
+
45
class FileFormatError(DatasetBuildError):
    """Raised when a data file has an invalid or unexpected format."""
47
+
48
+
49
class DatasetGenerationError(DatasetBuildError):
    """Raised when an error occurs while generating the dataset examples."""
51
+
52
+
53
class DatasetGenerationCastError(DatasetGenerationError):
    """Raised when generated data cannot be cast to the expected features (mismatched columns)."""

    @classmethod
    def from_cast_error(
        cls,
        cast_error: CastError,
        builder_name: str,
        gen_kwargs: dict[str, Any],
        token: Optional[Union[bool, str]],
    ) -> "DatasetGenerationCastError":
        """Build a user-friendly error from a `CastError`, pointing at the data file being read if trackable."""
        explanation_message = (
            f"\n\nAll the data files must have the same columns, but at some point {cast_error.details()}"
        )
        tracked_sources: list[str] = []
        for gen_kwarg in gen_kwargs.values():
            if not isinstance(gen_kwarg, (tracked_str, tracked_list, TrackedIterableFromGenerator)):
                continue
            # Drill down into tracked containers to the last item that was consumed.
            while (
                isinstance(gen_kwarg, (tracked_list, TrackedIterableFromGenerator)) and gen_kwarg.last_item is not None
            ):
                gen_kwarg = gen_kwarg.last_item
            if isinstance(gen_kwarg, tracked_str):
                gen_kwarg = gen_kwarg.get_origin()
            if isinstance(gen_kwarg, str) and gen_kwarg.startswith("hf://"):
                # Re-resolve Hub paths into a readable repo path, pulling the revision out of the url.
                resolved_path = HfFileSystem(endpoint=config.HF_ENDPOINT, token=token).resolve_path(gen_kwarg)
                gen_kwarg = "hf://" + resolved_path.unresolve()
                if "@" + resolved_path.revision in gen_kwarg:
                    gen_kwarg = (
                        gen_kwarg.replace("@" + resolved_path.revision, "", 1)
                        + f" (at revision {resolved_path.revision})"
                    )
            tracked_sources.append(str(gen_kwarg))
        if tracked_sources:
            explanation_message += f"\n\nThis happened while the {builder_name} dataset builder was generating data using\n\n{', '.join(tracked_sources)}"
        help_message = "\n\nPlease either edit the data files to have matching columns, or separate them into different configurations (see docs at https://hf.co/docs/hub/datasets-manual-configuration#multiple-configurations)"
        return cls("An error occurred while generating the dataset" + explanation_message + help_message)
88
+
89
+
90
class ChecksumVerificationError(DatasetsError):
    """Base class for errors raised while verifying checksums of downloaded files."""
92
+
93
+
94
class UnexpectedDownloadedFileError(ChecksumVerificationError):
    """Raised when some downloaded files were not expected."""
96
+
97
+
98
class ExpectedMoreDownloadedFilesError(ChecksumVerificationError):
    """Raised when some files were supposed to be downloaded but were not."""
100
+
101
+
102
class NonMatchingChecksumError(ChecksumVerificationError):
    """Raised when a downloaded file's checksum doesn't match the expected checksum."""
104
+
105
+
106
class SplitsVerificationError(DatasetsError):
    """Base class for errors raised while verifying dataset splits."""
108
+
109
+
110
class UnexpectedSplitsError(SplitsVerificationError):
    """Raised when the downloaded data contains splits that were not expected."""
112
+
113
+
114
class ExpectedMoreSplitsError(SplitsVerificationError):
    """Raised when some recorded splits are missing from the generated dataset."""
116
+
117
+
118
class NonMatchingSplitsSizesError(SplitsVerificationError):
    """Raised when the generated splits' sizes don't match the expected splits' sizes."""
.venv/lib/python3.11/site-packages/datasets/features/__init__.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ __all__ = [
2
+ "Audio",
3
+ "Array2D",
4
+ "Array3D",
5
+ "Array4D",
6
+ "Array5D",
7
+ "ClassLabel",
8
+ "Features",
9
+ "LargeList",
10
+ "List",
11
+ "Sequence",
12
+ "Value",
13
+ "Image",
14
+ "Translation",
15
+ "TranslationVariableLanguages",
16
+ "Video",
17
+ "Pdf",
18
+ "Nifti",
19
+ ]
20
+ from .audio import Audio
21
+ from .features import Array2D, Array3D, Array4D, Array5D, ClassLabel, Features, LargeList, List, Sequence, Value
22
+ from .image import Image
23
+ from .nifti import Nifti
24
+ from .pdf import Pdf
25
+ from .translation import Translation, TranslationVariableLanguages
26
+ from .video import Video
.venv/lib/python3.11/site-packages/datasets/features/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (951 Bytes). View file
 
.venv/lib/python3.11/site-packages/datasets/features/__pycache__/audio.cpython-311.pyc ADDED
Binary file (19.8 kB). View file
 
.venv/lib/python3.11/site-packages/datasets/features/__pycache__/features.cpython-311.pyc ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:11f718da0d59a989bb790b038f4b9473b5bed9942e92a1d9b2bc9e25b3e954a3
3
+ size 137118