diff --git "a/.venv/lib/python3.11/site-packages/ray/data/dataset.py" "b/.venv/lib/python3.11/site-packages/ray/data/dataset.py" new file mode 100644--- /dev/null +++ "b/.venv/lib/python3.11/site-packages/ray/data/dataset.py" @@ -0,0 +1,5621 @@ +import collections +import copy +import html +import itertools +import logging +import time +import warnings +from typing import ( + TYPE_CHECKING, + Any, + Callable, + Dict, + Generic, + Iterable, + Iterator, + List, + Literal, + Mapping, + Optional, + Tuple, + TypeVar, + Union, +) + +import numpy as np + +import ray +import ray.cloudpickle as pickle +from ray._private.thirdparty.tabulate.tabulate import tabulate +from ray._private.usage import usage_lib +from ray.air.util.tensor_extensions.arrow import ( + ArrowTensorTypeV2, + get_arrow_extension_fixed_shape_tensor_types, +) +from ray.air.util.tensor_extensions.utils import _create_possibly_ragged_ndarray +from ray.data._internal.aggregate import Max, Mean, Min, Std, Sum, Unique +from ray.data._internal.compute import ComputeStrategy +from ray.data._internal.datasource.bigquery_datasink import BigQueryDatasink +from ray.data._internal.datasource.csv_datasink import CSVDatasink +from ray.data._internal.datasource.image_datasink import ImageDatasink +from ray.data._internal.datasource.json_datasink import JSONDatasink +from ray.data._internal.datasource.mongo_datasink import MongoDatasink +from ray.data._internal.datasource.numpy_datasink import NumpyDatasink +from ray.data._internal.datasource.parquet_datasink import ParquetDatasink +from ray.data._internal.datasource.sql_datasink import SQLDatasink +from ray.data._internal.datasource.tfrecords_datasink import TFRecordDatasink +from ray.data._internal.datasource.webdataset_datasink import WebDatasetDatasink +from ray.data._internal.equalize import _equalize +from ray.data._internal.execution.interfaces import RefBundle +from ray.data._internal.execution.interfaces.ref_bundle import ( + _ref_bundles_iterator_to_block_refs_list, +) +from ray.data._internal.execution.util import memory_string +from ray.data._internal.iterator.iterator_impl import DataIteratorImpl +from ray.data._internal.iterator.stream_split_iterator import StreamSplitDataIterator +from ray.data._internal.logical.operators.all_to_all_operator import ( + RandomizeBlocks, + RandomShuffle, + Repartition, + Sort, +) +from ray.data._internal.logical.operators.count_operator import Count +from ray.data._internal.logical.operators.input_data_operator import InputData +from ray.data._internal.logical.operators.map_operator import ( + Filter, + FlatMap, + MapBatches, + MapRows, + Project, +) +from ray.data._internal.logical.operators.n_ary_operator import ( + Union as UnionLogicalOperator, +) +from ray.data._internal.logical.operators.n_ary_operator import Zip +from ray.data._internal.logical.operators.one_to_one_operator import Limit +from ray.data._internal.logical.operators.write_operator import Write +from ray.data._internal.logical.optimizers import LogicalPlan +from ray.data._internal.pandas_block import PandasBlockBuilder, PandasBlockSchema +from ray.data._internal.plan import ExecutionPlan +from ray.data._internal.planner.exchange.sort_task_spec import SortKey +from ray.data._internal.planner.plan_write_op import gen_datasink_write_result +from ray.data._internal.remote_fn import cached_remote_fn +from ray.data._internal.split import _get_num_rows, _split_at_indices +from ray.data._internal.stats import DatasetStats, DatasetStatsSummary, StatsManager +from ray.data._internal.util 
import ( + AllToAllAPI, + ConsumptionAPI, + _validate_rows_per_file_args, + get_compute_strategy, +) +from ray.data.aggregate import AggregateFn +from ray.data.block import ( + VALID_BATCH_FORMATS, + Block, + BlockAccessor, + DataBatch, + DataBatchColumn, + T, + U, + UserDefinedFunction, + _apply_batch_format, + _apply_batch_size, +) +from ray.data.context import DataContext +from ray.data.datasource import Connection, Datasink, FilenameProvider +from ray.data.iterator import DataIterator +from ray.data.random_access_dataset import RandomAccessDataset +from ray.types import ObjectRef +from ray.util.annotations import Deprecated, DeveloperAPI, PublicAPI +from ray.util.scheduling_strategies import NodeAffinitySchedulingStrategy +from ray.widgets import Template +from ray.widgets.util import repr_with_fallback + +if TYPE_CHECKING: + import dask + import mars + import modin + import pandas + import pyarrow + import pyspark + import tensorflow as tf + import torch + import torch.utils.data + from tensorflow_metadata.proto.v0 import schema_pb2 + + from ray.data._internal.execution.interfaces import Executor, NodeIdStr + from ray.data.grouped_data import GroupedData + + +logger = logging.getLogger(__name__) + +TensorflowFeatureTypeSpec = Union[ + "tf.TypeSpec", List["tf.TypeSpec"], Dict[str, "tf.TypeSpec"] +] + +TensorFlowTensorBatchType = Union["tf.Tensor", Dict[str, "tf.Tensor"]] + +CollatedData = TypeVar("CollatedData") +TorchBatchType = Union[Dict[str, "torch.Tensor"], CollatedData] + +BT_API_GROUP = "Basic Transformations" +SSR_API_GROUP = "Sorting, Shuffling and Repartitioning" +SMD_API_GROUP = "Splitting and Merging datasets" +GGA_API_GROUP = "Grouped and Global aggregations" +CD_API_GROUP = "Consuming Data" +IOC_API_GROUP = "I/O and Conversion" +IM_API_GROUP = "Inspecting Metadata" +E_API_GROUP = "Execution" + + +@PublicAPI +class Dataset: + """A Dataset is a distributed data collection for data loading and processing. + + Datasets are distributed pipelines that produce ``ObjectRef[Block]`` outputs, + where each block holds data in Arrow format, representing a shard of the overall + data collection. The block also determines the unit of parallelism. For more + details, see :ref:`Ray Data Internals `. + + Datasets can be created in multiple ways: from synthetic data via ``range_*()`` + APIs, from existing memory data via ``from_*()`` APIs (this creates a subclass + of Dataset called ``MaterializedDataset``), or from external storage + systems such as local disk, S3, HDFS etc. via the ``read_*()`` APIs. The + (potentially processed) Dataset can be saved back to external storage systems + via the ``write_*()`` APIs. + + Examples: + .. testcode:: + :skipif: True + + import ray + # Create dataset from synthetic data. + ds = ray.data.range(1000) + # Create dataset from in-memory data. + ds = ray.data.from_items( + [{"col1": i, "col2": i * 2} for i in range(1000)] + ) + # Create dataset from external storage system. + ds = ray.data.read_parquet("s3://bucket/path") + # Save dataset back to external storage system. + ds.write_csv("s3://bucket/output") + + Dataset has two kinds of operations: transformation, which takes in Dataset + and outputs a new Dataset (e.g. :py:meth:`.map_batches()`); and consumption, + which produces values (not a data stream) as output + (e.g. :meth:`.iter_batches()`). + + Dataset transformations are lazy, with execution of the transformations being + triggered by downstream consumption. 
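+
+    For example (a minimal sketch; the S3 paths are illustrative), no work runs
+    until the dataset is consumed or materialized:
+
+    .. testcode::
+        :skipif: True
+
+        import ray
+
+        # Building the pipeline is lazy; no data is read or transformed yet.
+        ds = ray.data.read_parquet("s3://bucket/path").map_batches(lambda batch: batch)
+        # Consumption triggers streaming execution of the transformations.
+        ds.show(5)
+        # Or materialize the whole result into Ray's object store.
+        ds = ds.materialize()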
+ + Dataset supports parallel processing at scale: transformations such as + :py:meth:`.map_batches()`, aggregations such as + :py:meth:`.min()`/:py:meth:`.max()`/:py:meth:`.mean()`, grouping via + :py:meth:`.groupby()`, shuffling operations such as :py:meth:`.sort()`, + :py:meth:`.random_shuffle()`, and :py:meth:`.repartition()`. + + Examples: + >>> import ray + >>> ds = ray.data.range(1000) + >>> # Transform batches (Dict[str, np.ndarray]) with map_batches(). + >>> ds.map_batches(lambda batch: {"id": batch["id"] * 2}) # doctest: +ELLIPSIS + MapBatches() + +- Dataset(num_rows=1000, schema={id: int64}) + >>> # Compute the maximum. + >>> ds.max("id") + 999 + >>> # Shuffle this dataset randomly. + >>> ds.random_shuffle() # doctest: +ELLIPSIS + RandomShuffle + +- Dataset(num_rows=1000, schema={id: int64}) + >>> # Sort it back in order. + >>> ds.sort("id") # doctest: +ELLIPSIS + Sort + +- Dataset(num_rows=1000, schema={id: int64}) + + Both unexecuted and materialized Datasets can be passed between Ray tasks and + actors without incurring a copy. Dataset supports conversion to/from several + more featureful dataframe libraries (e.g., Spark, Dask, Modin, MARS), and are also + compatible with distributed TensorFlow / PyTorch. + """ + + def __init__( + self, + plan: ExecutionPlan, + logical_plan: LogicalPlan, + ): + """Construct a Dataset (internal API). + + The constructor is not part of the Dataset API. Use the ``ray.data.*`` + read methods to construct a dataset. + """ + assert isinstance(plan, ExecutionPlan), type(plan) + usage_lib.record_library_usage("dataset") # Legacy telemetry name. + + self._plan = plan + self._logical_plan = logical_plan + self._plan.link_logical_plan(logical_plan) + + # Handle to currently running executor for this dataset. + self._current_executor: Optional["Executor"] = None + self._write_ds = None + + self._set_uuid(StatsManager.get_dataset_id_from_stats_actor()) + + @staticmethod + def copy( + ds: "Dataset", _deep_copy: bool = False, _as: Optional[type] = None + ) -> "Dataset": + if not _as: + _as = type(ds) + if _deep_copy: + return _as(ds._plan.deep_copy(), ds._logical_plan) + else: + return _as(ds._plan.copy(), ds._logical_plan) + + @PublicAPI(api_group=BT_API_GROUP) + def map( + self, + fn: UserDefinedFunction[Dict[str, Any], Dict[str, Any]], + *, + compute: Optional[ComputeStrategy] = None, + fn_args: Optional[Iterable[Any]] = None, + fn_kwargs: Optional[Dict[str, Any]] = None, + fn_constructor_args: Optional[Iterable[Any]] = None, + fn_constructor_kwargs: Optional[Dict[str, Any]] = None, + num_cpus: Optional[float] = None, + num_gpus: Optional[float] = None, + concurrency: Optional[Union[int, Tuple[int, int]]] = None, + ray_remote_args_fn: Optional[Callable[[], Dict[str, Any]]] = None, + **ray_remote_args, + ) -> "Dataset": + """Apply the given function to each row of this dataset. + + Use this method to transform your data. To learn more, see + :ref:`Transforming rows `. + + You can use either a function or a callable class to perform the transformation. + For functions, Ray Data uses stateless Ray tasks. For classes, Ray Data uses + stateful Ray actors. For more information, see + :ref:`Stateful Transforms `. + + .. tip:: + + If your transformation is vectorized like most NumPy or pandas operations, + :meth:`~Dataset.map_batches` might be faster. + + .. warning:: + Specifying both ``num_cpus`` and ``num_gpus`` for map tasks is experimental, + and may result in scheduling or stability issues. Please + `report any issues `_ + to the Ray team. 
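+
+        In addition to the example below, extra arguments can be forwarded to
+        ``fn`` through ``fn_args`` and ``fn_kwargs``. A minimal sketch (the
+        ``factor`` argument is purely illustrative):
+
+        .. testcode::
+
+            import ray
+
+            def scale_id(row, factor):
+                row["id"] = row["id"] * factor
+                return row
+
+            # `factor` is passed positionally after the row via `fn_args`.
+            ds = ray.data.range(5).map(scale_id, fn_args=(2,))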
+ + Examples: + + .. testcode:: + + import os + from typing import Any, Dict + import ray + + def parse_filename(row: Dict[str, Any]) -> Dict[str, Any]: + row["filename"] = os.path.basename(row["path"]) + return row + + ds = ( + ray.data.read_images("s3://anonymous@ray-example-data/image-datasets/simple", include_paths=True) + .map(parse_filename) + ) + print(ds.schema()) + + .. testoutput:: + + Column Type + ------ ---- + image numpy.ndarray(shape=(32, 32, 3), dtype=uint8) + path string + filename string + + Time complexity: O(dataset size / parallelism) + + Args: + fn: The function to apply to each row, or a class type + that can be instantiated to create such a callable. + compute: This argument is deprecated. Use ``concurrency`` argument. + fn_args: Positional arguments to pass to ``fn`` after the first argument. + These arguments are top-level arguments to the underlying Ray task. + fn_kwargs: Keyword arguments to pass to ``fn``. These arguments are + top-level arguments to the underlying Ray task. + fn_constructor_args: Positional arguments to pass to ``fn``'s constructor. + You can only provide this if ``fn`` is a callable class. These arguments + are top-level arguments in the underlying Ray actor construction task. + fn_constructor_kwargs: Keyword arguments to pass to ``fn``'s constructor. + This can only be provided if ``fn`` is a callable class. These arguments + are top-level arguments in the underlying Ray actor construction task. + num_cpus: The number of CPUs to reserve for each parallel map worker. + num_gpus: The number of GPUs to reserve for each parallel map worker. For + example, specify `num_gpus=1` to request 1 GPU for each parallel map + worker. + concurrency: The number of Ray workers to use concurrently. For a fixed-sized + worker pool of size ``n``, specify ``concurrency=n``. For an autoscaling + worker pool from ``m`` to ``n`` workers, specify ``concurrency=(m, n)``. + ray_remote_args_fn: A function that returns a dictionary of remote args + passed to each map worker. The purpose of this argument is to generate + dynamic arguments for each actor/task, and will be called each time prior + to initializing the worker. Args returned from this dict will always + override the args in ``ray_remote_args``. Note: this is an advanced, + experimental feature. + ray_remote_args: Additional resource requirements to request from + Ray for each map worker. See :func:`ray.remote` for details. + + .. seealso:: + + :meth:`~Dataset.flat_map` + Call this method to create new rows from existing ones. Unlike + :meth:`~Dataset.map`, a function passed to + :meth:`~Dataset.flat_map` can return multiple rows. + + :meth:`~Dataset.map_batches` + Call this method to transform batches of data. + """ # noqa: E501 + compute = get_compute_strategy( + fn, + fn_constructor_args=fn_constructor_args, + compute=compute, + concurrency=concurrency, + ) + + if num_cpus is not None: + ray_remote_args["num_cpus"] = num_cpus + + if num_gpus is not None: + ray_remote_args["num_gpus"] = num_gpus + + plan = self._plan.copy() + map_op = MapRows( + self._logical_plan.dag, + fn, + fn_args=fn_args, + fn_kwargs=fn_kwargs, + fn_constructor_args=fn_constructor_args, + fn_constructor_kwargs=fn_constructor_kwargs, + compute=compute, + ray_remote_args_fn=ray_remote_args_fn, + ray_remote_args=ray_remote_args, + ) + logical_plan = LogicalPlan(map_op, self.context) + return Dataset(plan, logical_plan) + + def _set_name(self, name: Optional[str]): + """Set the name of the dataset. + + Used as a prefix for metrics tags. 
+ """ + self._plan._dataset_name = name + + @property + def _name(self) -> Optional[str]: + """Returns the dataset name""" + return self._plan._dataset_name + + @PublicAPI(api_group=BT_API_GROUP) + def map_batches( + self, + fn: UserDefinedFunction[DataBatch, DataBatch], + *, + batch_size: Union[int, None, Literal["default"]] = "default", + compute: Optional[ComputeStrategy] = None, + batch_format: Optional[str] = "default", + zero_copy_batch: bool = False, + fn_args: Optional[Iterable[Any]] = None, + fn_kwargs: Optional[Dict[str, Any]] = None, + fn_constructor_args: Optional[Iterable[Any]] = None, + fn_constructor_kwargs: Optional[Dict[str, Any]] = None, + num_cpus: Optional[float] = None, + num_gpus: Optional[float] = None, + concurrency: Optional[Union[int, Tuple[int, int]]] = None, + ray_remote_args_fn: Optional[Callable[[], Dict[str, Any]]] = None, + **ray_remote_args, + ) -> "Dataset": + """Apply the given function to batches of data. + + This method is useful for preprocessing data and performing inference. To learn + more, see :ref:`Transforming batches `. + + You can use either a function or a callable class to perform the transformation. + For functions, Ray Data uses stateless Ray tasks. For classes, Ray Data uses + stateful Ray actors. For more information, see + :ref:`Stateful Transforms `. + + .. tip:: + To understand the format of the input to ``fn``, call :meth:`~Dataset.take_batch` + on the dataset to get a batch in the same format as will be passed to ``fn``. + + .. tip:: + If ``fn`` doesn't mutate its input, set ``zero_copy_batch=True`` to improve + performance and decrease memory utilization. + + .. warning:: + Specifying both ``num_cpus`` and ``num_gpus`` for map tasks is experimental, + and may result in scheduling or stability issues. Please + `report any issues `_ + to the Ray team. + + Examples: + + Call :meth:`~Dataset.map_batches` to transform your data. + + .. testcode:: + + from typing import Dict + import numpy as np + import ray + + def add_dog_years(batch: Dict[str, np.ndarray]) -> Dict[str, np.ndarray]: + batch["age_in_dog_years"] = 7 * batch["age"] + return batch + + ds = ( + ray.data.from_items([ + {"name": "Luna", "age": 4}, + {"name": "Rory", "age": 14}, + {"name": "Scout", "age": 9}, + ]) + .map_batches(add_dog_years) + ) + ds.show() + + .. testoutput:: + + {'name': 'Luna', 'age': 4, 'age_in_dog_years': 28} + {'name': 'Rory', 'age': 14, 'age_in_dog_years': 98} + {'name': 'Scout', 'age': 9, 'age_in_dog_years': 63} + + If your function returns large objects, yield outputs in chunks. + + .. testcode:: + + from typing import Dict + import ray + import numpy as np + + def map_fn_with_large_output(batch: Dict[str, np.ndarray]) -> Dict[str, np.ndarray]: + for i in range(3): + yield {"large_output": np.ones((100, 1000))} + + ds = ( + ray.data.from_items([1]) + .map_batches(map_fn_with_large_output) + ) + + If you require stateful transfomation, + use Python callable class. Here is an example showing how to use stateful transforms to create model inference workers, without having to reload the model on each call. + + .. 
testcode:: + + from typing import Dict + import numpy as np + import torch + import ray + + class TorchPredictor: + + def __init__(self): + self.model = torch.nn.Identity().cuda() + self.model.eval() + + def __call__(self, batch: Dict[str, np.ndarray]) -> Dict[str, np.ndarray]: + inputs = torch.as_tensor(batch["data"], dtype=torch.float32).cuda() + with torch.inference_mode(): + batch["output"] = self.model(inputs).detach().cpu().numpy() + return batch + + ds = ( + ray.data.from_numpy(np.ones((32, 100))) + .map_batches( + TorchPredictor, + # Two workers with one GPU each + concurrency=2, + # Batch size is required if you're using GPUs. + batch_size=4, + num_gpus=1 + ) + ) + + To learn more, see + :ref:`End-to-end: Offline Batch Inference `. + + Args: + fn: The function or generator to apply to a record batch, or a class type + that can be instantiated to create such a callable. Note ``fn`` must be + pickle-able. + batch_size: The desired number of rows in each batch, or ``None`` to use + entire blocks as batches (blocks may contain different numbers of rows). + The actual size of the batch provided to ``fn`` may be smaller than + ``batch_size`` if ``batch_size`` doesn't evenly divide the block(s) sent + to a given map task. Default batch_size is 1024 with "default". + compute: This argument is deprecated. Use ``concurrency`` argument. + batch_format: If ``"default"`` or ``"numpy"``, batches are + ``Dict[str, numpy.ndarray]``. If ``"pandas"``, batches are + ``pandas.DataFrame``. If ``"pyarrow"``, batches are + ``pyarrow.Table``. + zero_copy_batch: Whether ``fn`` should be provided zero-copy, read-only + batches. If this is ``True`` and no copy is required for the + ``batch_format`` conversion, the batch is a zero-copy, read-only + view on data in Ray's object store, which can decrease memory + utilization and improve performance. If this is ``False``, the batch + is writable, which requires an extra copy to guarantee. + If ``fn`` mutates its input, this needs to be ``False`` in order to + avoid "assignment destination is read-only" or "buffer source array is + read-only" errors. Default is ``False``. + fn_args: Positional arguments to pass to ``fn`` after the first argument. + These arguments are top-level arguments to the underlying Ray task. + fn_kwargs: Keyword arguments to pass to ``fn``. These arguments are + top-level arguments to the underlying Ray task. + fn_constructor_args: Positional arguments to pass to ``fn``'s constructor. + You can only provide this if ``fn`` is a callable class. These arguments + are top-level arguments in the underlying Ray actor construction task. + fn_constructor_kwargs: Keyword arguments to pass to ``fn``'s constructor. + This can only be provided if ``fn`` is a callable class. These arguments + are top-level arguments in the underlying Ray actor construction task. + num_cpus: The number of CPUs to reserve for each parallel map worker. + num_gpus: The number of GPUs to reserve for each parallel map worker. For + example, specify `num_gpus=1` to request 1 GPU for each parallel map worker. + concurrency: The number of Ray workers to use concurrently. For a fixed-sized + worker pool of size ``n``, specify ``concurrency=n``. For an autoscaling + worker pool from ``m`` to ``n`` workers, specify ``concurrency=(m, n)``. + ray_remote_args_fn: A function that returns a dictionary of remote args + passed to each map worker. 
The purpose of this argument is to generate + dynamic arguments for each actor/task, and will be called each time prior + to initializing the worker. Args returned from this dict will always + override the args in ``ray_remote_args``. Note: this is an advanced, + experimental feature. + ray_remote_args: Additional resource requirements to request from + Ray for each map worker. See :func:`ray.remote` for details. + + .. note:: + + The size of the batches provided to ``fn`` might be smaller than the + specified ``batch_size`` if ``batch_size`` doesn't evenly divide the + block(s) sent to a given map task. + + If ``batch_size`` is set and each input block is smaller than the + ``batch_size``, Ray Data will bundle up many blocks as the input for one + task, until their total size is equal to or greater than the given + ``batch_size``. + If ``batch_size`` is not set, the bundling will not be performed. Each task + will receive only one input block. + + .. seealso:: + + :meth:`~Dataset.iter_batches` + Call this function to iterate over batches of data. + + :meth:`~Dataset.take_batch` + Call this function to get a batch of data from the dataset + in the same format as will be passed to the `fn` function of + :meth:`~Dataset.map_batches`. + + :meth:`~Dataset.flat_map` + Call this method to create new records from existing ones. Unlike + :meth:`~Dataset.map`, a function passed to :meth:`~Dataset.flat_map` + can return multiple records. + + :meth:`~Dataset.map` + Call this method to transform one record at time. + + """ # noqa: E501 + use_gpus = num_gpus is not None and num_gpus > 0 + if use_gpus and (batch_size is None or batch_size == "default"): + raise ValueError( + "You must provide `batch_size` to `map_batches` when requesting GPUs. " + "The optimal batch size depends on the model, data, and GPU used. " + "We recommend using the largest batch size that doesn't result " + "in your GPU device running out of memory. You can view the GPU memory " + "usage via the Ray dashboard." + ) + + if isinstance(batch_size, int) and batch_size < 1: + raise ValueError("Batch size can't be negative or 0") + + return self._map_batches_without_batch_size_validation( + fn, + batch_size=batch_size, + compute=compute, + batch_format=batch_format, + zero_copy_batch=zero_copy_batch, + fn_args=fn_args, + fn_kwargs=fn_kwargs, + fn_constructor_args=fn_constructor_args, + fn_constructor_kwargs=fn_constructor_kwargs, + num_cpus=num_cpus, + num_gpus=num_gpus, + concurrency=concurrency, + ray_remote_args_fn=ray_remote_args_fn, + **ray_remote_args, + ) + + def _map_batches_without_batch_size_validation( + self, + fn: UserDefinedFunction[DataBatch, DataBatch], + *, + batch_size: Union[int, None, Literal["default"]], + compute: Optional[ComputeStrategy], + batch_format: Optional[str], + zero_copy_batch: bool, + fn_args: Optional[Iterable[Any]], + fn_kwargs: Optional[Dict[str, Any]], + fn_constructor_args: Optional[Iterable[Any]], + fn_constructor_kwargs: Optional[Dict[str, Any]], + num_cpus: Optional[float], + num_gpus: Optional[float], + concurrency: Optional[Union[int, Tuple[int, int]]], + ray_remote_args_fn: Optional[Callable[[], Dict[str, Any]]], + **ray_remote_args, + ): + # NOTE: The `map_groups` implementation calls `map_batches` with + # `batch_size=None`. The issue is that if you request GPUs with + # `batch_size=None`, then `map_batches` raises a value error. So, to allow users + # to call `map_groups` with GPUs, we need a separate method that doesn't + # perform batch size validation. 
+ + compute = get_compute_strategy( + fn, + fn_constructor_args=fn_constructor_args, + compute=compute, + concurrency=concurrency, + ) + + if num_cpus is not None: + ray_remote_args["num_cpus"] = num_cpus + + if num_gpus is not None: + ray_remote_args["num_gpus"] = num_gpus + + batch_format = _apply_batch_format(batch_format) + + min_rows_per_bundled_input = None + if batch_size is not None and batch_size != "default": + # Enable blocks bundling when batch_size is specified by caller. + min_rows_per_bundled_input = batch_size + batch_size = _apply_batch_size(batch_size) + + if batch_format not in VALID_BATCH_FORMATS: + raise ValueError( + f"The batch format must be one of {VALID_BATCH_FORMATS}, got: " + f"{batch_format}" + ) + + plan = self._plan.copy() + map_batches_op = MapBatches( + self._logical_plan.dag, + fn, + batch_size=batch_size, + batch_format=batch_format, + zero_copy_batch=zero_copy_batch, + min_rows_per_bundled_input=min_rows_per_bundled_input, + fn_args=fn_args, + fn_kwargs=fn_kwargs, + fn_constructor_args=fn_constructor_args, + fn_constructor_kwargs=fn_constructor_kwargs, + compute=compute, + ray_remote_args_fn=ray_remote_args_fn, + ray_remote_args=ray_remote_args, + ) + logical_plan = LogicalPlan(map_batches_op, self.context) + return Dataset(plan, logical_plan) + + @PublicAPI(api_group=BT_API_GROUP) + def add_column( + self, + col: str, + fn: Callable[ + [DataBatch], + DataBatchColumn, + ], + *, + batch_format: Optional[str] = "pandas", + compute: Optional[str] = None, + concurrency: Optional[Union[int, Tuple[int, int]]] = None, + **ray_remote_args, + ) -> "Dataset": + """Add the given column to the dataset. + + A function generating the new column values given the batch in pyarrow or pandas + format must be specified. This function must operate on batches of + `batch_format`. + + Examples: + + + >>> import ray + >>> ds = ray.data.range(100) + >>> ds.schema() + Column Type + ------ ---- + id int64 + + Add a new column equal to ``id * 2``. + + >>> ds.add_column("new_id", lambda df: df["id"] * 2).schema() + Column Type + ------ ---- + id int64 + new_id int64 + + Time complexity: O(dataset size / parallelism) + + Args: + col: Name of the column to add. If the name already exists, the + column is overwritten. + fn: Map function generating the column values given a batch of + records in pandas format. + batch_format: If ``"default"`` or ``"numpy"``, batches are + ``Dict[str, numpy.ndarray]``. If ``"pandas"``, batches are + ``pandas.DataFrame``. If ``"pyarrow"``, batches are + ``pyarrow.Table``. If ``"numpy"``, batches are + ``Dict[str, numpy.ndarray]``. + compute: This argument is deprecated. Use ``concurrency`` argument. + concurrency: The number of Ray workers to use concurrently. For a + fixed-sized worker pool of size ``n``, specify ``concurrency=n``. For + an autoscaling worker pool from ``m`` to ``n`` workers, specify + ``concurrency=(m, n)``. + ray_remote_args: Additional resource requirements to request from + Ray (e.g., num_gpus=1 to request GPUs for the map tasks). See + :func:`ray.remote` for details. 
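+
+        With ``batch_format="pyarrow"``, the function must return a
+        ``pyarrow.Array`` or ``pyarrow.ChunkedArray``. A minimal sketch of that
+        variant (``new_id`` is an illustrative column name):
+
+        .. testcode::
+
+            import pyarrow.compute as pc
+            import ray
+
+            ds = ray.data.range(100)
+            # The function receives a pyarrow.Table and returns an Arrow array.
+            ds = ds.add_column(
+                "new_id",
+                lambda table: pc.multiply(table["id"], 2),
+                batch_format="pyarrow",
+            )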
+ """ + # Check that batch_format + accepted_batch_formats = ["pandas", "pyarrow", "numpy"] + if batch_format not in accepted_batch_formats: + raise ValueError( + f"batch_format argument must be on of {accepted_batch_formats}, " + f"got: {batch_format}" + ) + + def add_column(batch: DataBatch) -> DataBatch: + column = fn(batch) + if batch_format == "pandas": + batch.loc[:, col] = column + return batch + elif batch_format == "pyarrow": + import pyarrow as pa + + assert isinstance(column, (pa.Array, pa.ChunkedArray)), ( + f"For pyarrow batch format, the function must return a pyarrow " + f"Array, got: {type(column)}" + ) + # Historically, this method was written for pandas batch format. + # To resolve https://github.com/ray-project/ray/issues/48090, + # we also allow pyarrow batch format which is preferred but would be + # a breaking change to enforce. + + # For pyarrow, the index of the column will be -1 if it is missing in + # which case we'll want to append it + column_idx = batch.schema.get_field_index(col) + if column_idx == -1: + return batch.append_column(col, column) + else: + return batch.set_column(column_idx, col, column) + + else: + # batch format is assumed to be numpy since we checked at the + # beginning of the add_column function + assert isinstance(column, np.ndarray), ( + f"For numpy batch format, the function must return a " + f"numpy.ndarray, got: {type(column)}" + ) + batch[col] = column + return batch + + if not callable(fn): + raise ValueError("`fn` must be callable, got {}".format(fn)) + + return self.map_batches( + add_column, + batch_format=batch_format, + compute=compute, + concurrency=concurrency, + zero_copy_batch=False, + **ray_remote_args, + ) + + @PublicAPI(api_group=BT_API_GROUP) + def drop_columns( + self, + cols: List[str], + *, + compute: Optional[str] = None, + concurrency: Optional[Union[int, Tuple[int, int]]] = None, + **ray_remote_args, + ) -> "Dataset": + """Drop one or more columns from the dataset. + + Examples: + + >>> import ray + >>> ds = ray.data.read_parquet("s3://anonymous@ray-example-data/iris.parquet") + >>> ds.schema() + Column Type + ------ ---- + sepal.length double + sepal.width double + petal.length double + petal.width double + variety string + >>> ds.drop_columns(["variety"]).schema() + Column Type + ------ ---- + sepal.length double + sepal.width double + petal.length double + petal.width double + + Time complexity: O(dataset size / parallelism) + + Args: + cols: Names of the columns to drop. If any name does not exist, + an exception is raised. Column names must be unique. + compute: This argument is deprecated. Use ``concurrency`` argument. + concurrency: The number of Ray workers to use concurrently. For a fixed-sized + worker pool of size ``n``, specify ``concurrency=n``. For an autoscaling + worker pool from ``m`` to ``n`` workers, specify ``concurrency=(m, n)``. + ray_remote_args: Additional resource requirements to request from + Ray (e.g., num_gpus=1 to request GPUs for the map tasks). See + :func:`ray.remote` for details. 
+ """ # noqa: E501 + + if len(cols) != len(set(cols)): + raise ValueError(f"drop_columns expects unique column names, got: {cols}") + + def drop_columns(batch): + return batch.drop(cols) + + return self.map_batches( + drop_columns, + batch_format="pyarrow", + zero_copy_batch=True, + compute=compute, + concurrency=concurrency, + **ray_remote_args, + ) + + @PublicAPI(api_group=BT_API_GROUP) + def select_columns( + self, + cols: Union[str, List[str]], + *, + compute: Union[str, ComputeStrategy] = None, + concurrency: Optional[Union[int, Tuple[int, int]]] = None, + **ray_remote_args, + ) -> "Dataset": + """Select one or more columns from the dataset. + + Specified columns must be in the dataset schema. + + .. tip:: + If you're reading parquet files with :meth:`ray.data.read_parquet`, + you might be able to speed it up by using projection pushdown; see + :ref:`Parquet column pruning ` for details. + + Examples: + + >>> import ray + >>> ds = ray.data.read_parquet("s3://anonymous@ray-example-data/iris.parquet") + >>> ds.schema() + Column Type + ------ ---- + sepal.length double + sepal.width double + petal.length double + petal.width double + variety string + >>> ds.select_columns(["sepal.length", "sepal.width"]).schema() + Column Type + ------ ---- + sepal.length double + sepal.width double + + Time complexity: O(dataset size / parallelism) + + Args: + cols: Names of the columns to select. If a name isn't in the + dataset schema, an exception is raised. Columns also should be unique. + compute: This argument is deprecated. Use ``concurrency`` argument. + concurrency: The number of Ray workers to use concurrently. For a fixed-sized + worker pool of size ``n``, specify ``concurrency=n``. For an autoscaling + worker pool from ``m`` to ``n`` workers, specify ``concurrency=(m, n)``. + ray_remote_args: Additional resource requirements to request from + Ray (e.g., num_gpus=1 to request GPUs for the map tasks). See + :func:`ray.remote` for details. + """ # noqa: E501 + if isinstance(cols, str): + cols = [cols] + elif isinstance(cols, list): + if not all(isinstance(col, str) for col in cols): + raise ValueError( + "select_columns requires all elements of 'cols' to be strings." + ) + else: + raise TypeError( + "select_columns requires 'cols' to be a string or a list of strings." + ) + + if not cols: + raise ValueError("select_columns requires at least one column to select.") + + if len(cols) != len(set(cols)): + raise ValueError( + "select_columns expected unique column names, " + f"got duplicate column names: {cols}" + ) + + # Don't feel like we really need this + from ray.data._internal.compute import TaskPoolStrategy + + compute = TaskPoolStrategy(size=concurrency) + + plan = self._plan.copy() + select_op = Project( + self._logical_plan.dag, + cols=cols, + cols_rename=None, + compute=compute, + ray_remote_args=ray_remote_args, + ) + logical_plan = LogicalPlan(select_op, self.context) + return Dataset(plan, logical_plan) + + @PublicAPI(api_group=BT_API_GROUP) + def rename_columns( + self, + names: Union[List[str], Dict[str, str]], + *, + concurrency: Optional[Union[int, Tuple[int, int]]] = None, + **ray_remote_args, + ): + """Rename columns in the dataset. + + Examples: + + >>> import ray + >>> ds = ray.data.read_parquet("s3://anonymous@ray-example-data/iris.parquet") + >>> ds.schema() + Column Type + ------ ---- + sepal.length double + sepal.width double + petal.length double + petal.width double + variety string + + You can pass a dictionary mapping old column names to new column names. 
+ + >>> ds.rename_columns({"variety": "category"}).schema() + Column Type + ------ ---- + sepal.length double + sepal.width double + petal.length double + petal.width double + category string + + Or you can pass a list of new column names. + + >>> ds.rename_columns( + ... ["sepal_length", "sepal_width", "petal_length", "petal_width", "variety"] + ... ).schema() + Column Type + ------ ---- + sepal_length double + sepal_width double + petal_length double + petal_width double + variety string + + Args: + names: A dictionary that maps old column names to new column names, or a + list of new column names. + concurrency: The maximum number of Ray workers to use concurrently. + ray_remote_args: Additional resource requirements to request from + Ray (e.g., num_gpus=1 to request GPUs for the map tasks). See + :func:`ray.remote` for details. + """ # noqa: E501 + + if isinstance(names, dict): + if not names: + raise ValueError("rename_columns received 'names' with no entries.") + + if len(names.values()) != len(set(names.values())): + raise ValueError( + f"rename_columns received duplicate values in the 'names': " + f"{names}" + ) + + if not all( + isinstance(k, str) and isinstance(v, str) for k, v in names.items() + ): + raise ValueError( + "rename_columns requires both keys and values in the 'names' " + "to be strings." + ) + + cols_rename = names + elif isinstance(names, list): + if not names: + raise ValueError( + "rename_columns requires 'names' with at least one column name." + ) + + if len(names) != len(set(names)): + raise ValueError( + f"rename_columns received duplicate values in the 'names': {names}" + ) + + if not all(isinstance(col, str) for col in names): + raise ValueError( + "rename_columns requires all elements in the 'names' to be strings." + ) + + current_names = self.schema().names + if len(current_names) != len(names): + raise ValueError( + f"rename_columns requires 'names': {names} length match current " + f"schema names: {current_names}." + ) + + cols_rename = dict(zip(current_names, names)) + else: + raise TypeError( + f"rename_columns expected names to be either List[str] or " + f"Dict[str, str], got {type(names)}." + ) + + if concurrency is not None and not isinstance(concurrency, int): + raise ValueError( + f"Expected `concurrency` to be an integer or `None`, but " + f"got {concurrency}." + ) + + # Construct the plan and project operation + from ray.data._internal.compute import TaskPoolStrategy + + compute = TaskPoolStrategy(size=concurrency) + + plan = self._plan.copy() + select_op = Project( + self._logical_plan.dag, + cols=None, + cols_rename=cols_rename, + compute=compute, + ray_remote_args=ray_remote_args, + ) + logical_plan = LogicalPlan(select_op, self.context) + return Dataset(plan, logical_plan) + + @PublicAPI(api_group=BT_API_GROUP) + def flat_map( + self, + fn: UserDefinedFunction[Dict[str, Any], List[Dict[str, Any]]], + *, + compute: Optional[ComputeStrategy] = None, + fn_args: Optional[Iterable[Any]] = None, + fn_kwargs: Optional[Dict[str, Any]] = None, + fn_constructor_args: Optional[Iterable[Any]] = None, + fn_constructor_kwargs: Optional[Dict[str, Any]] = None, + num_cpus: Optional[float] = None, + num_gpus: Optional[float] = None, + concurrency: Optional[Union[int, Tuple[int, int]]] = None, + ray_remote_args_fn: Optional[Callable[[], Dict[str, Any]]] = None, + **ray_remote_args, + ) -> "Dataset": + """Apply the given function to each row and then flatten results. + + Use this method if your transformation returns multiple rows for each input + row. 
+ + You can use either a function or a callable class to perform the transformation. + For functions, Ray Data uses stateless Ray tasks. For classes, Ray Data uses + stateful Ray actors. For more information, see + :ref:`Stateful Transforms `. + + .. tip:: + :meth:`~Dataset.map_batches` can also modify the number of rows. If your + transformation is vectorized like most NumPy and pandas operations, + it might be faster. + + .. warning:: + Specifying both ``num_cpus`` and ``num_gpus`` for map tasks is experimental, + and may result in scheduling or stability issues. Please + `report any issues `_ + to the Ray team. + + Examples: + + .. testcode:: + + from typing import Any, Dict, List + import ray + + def duplicate_row(row: Dict[str, Any]) -> List[Dict[str, Any]]: + return [row] * 2 + + print( + ray.data.range(3) + .flat_map(duplicate_row) + .take_all() + ) + + .. testoutput:: + + [{'id': 0}, {'id': 0}, {'id': 1}, {'id': 1}, {'id': 2}, {'id': 2}] + + Time complexity: O(dataset size / parallelism) + + Args: + fn: The function or generator to apply to each record, or a class type + that can be instantiated to create such a callable. + compute: This argument is deprecated. Use ``concurrency`` argument. + fn_args: Positional arguments to pass to ``fn`` after the first argument. + These arguments are top-level arguments to the underlying Ray task. + fn_kwargs: Keyword arguments to pass to ``fn``. These arguments are + top-level arguments to the underlying Ray task. + fn_constructor_args: Positional arguments to pass to ``fn``'s constructor. + You can only provide this if ``fn`` is a callable class. These arguments + are top-level arguments in the underlying Ray actor construction task. + fn_constructor_kwargs: Keyword arguments to pass to ``fn``'s constructor. + This can only be provided if ``fn`` is a callable class. These arguments + are top-level arguments in the underlying Ray actor construction task. + num_cpus: The number of CPUs to reserve for each parallel map worker. + num_gpus: The number of GPUs to reserve for each parallel map worker. For + example, specify `num_gpus=1` to request 1 GPU for each parallel map + worker. + concurrency: The number of Ray workers to use concurrently. For a + fixed-sized worker pool of size ``n``, specify ``concurrency=n``. + For an autoscaling worker pool from ``m`` to ``n`` workers, specify + ``concurrency=(m, n)``. + ray_remote_args_fn: A function that returns a dictionary of remote args + passed to each map worker. The purpose of this argument is to generate + dynamic arguments for each actor/task, and will be called each time + prior to initializing the worker. Args returned from this dict will + always override the args in ``ray_remote_args``. Note: this is an + advanced, experimental feature. + ray_remote_args: Additional resource requirements to request from + Ray for each map worker. See :func:`ray.remote` for details. + + .. seealso:: + + :meth:`~Dataset.map_batches` + Call this method to transform batches of data. + + :meth:`~Dataset.map` + Call this method to transform one row at time. 
+ """ + compute = get_compute_strategy( + fn, + fn_constructor_args=fn_constructor_args, + compute=compute, + concurrency=concurrency, + ) + + if num_cpus is not None: + ray_remote_args["num_cpus"] = num_cpus + + if num_gpus is not None: + ray_remote_args["num_gpus"] = num_gpus + + plan = self._plan.copy() + op = FlatMap( + input_op=self._logical_plan.dag, + fn=fn, + fn_args=fn_args, + fn_kwargs=fn_kwargs, + fn_constructor_args=fn_constructor_args, + fn_constructor_kwargs=fn_constructor_kwargs, + compute=compute, + ray_remote_args_fn=ray_remote_args_fn, + ray_remote_args=ray_remote_args, + ) + logical_plan = LogicalPlan(op, self.context) + return Dataset(plan, logical_plan) + + @PublicAPI(api_group=BT_API_GROUP) + def filter( + self, + fn: Optional[UserDefinedFunction[Dict[str, Any], bool]] = None, + expr: Optional[str] = None, + *, + compute: Union[str, ComputeStrategy] = None, + concurrency: Optional[Union[int, Tuple[int, int]]] = None, + ray_remote_args_fn: Optional[Callable[[], Dict[str, Any]]] = None, + **ray_remote_args, + ) -> "Dataset": + """Filter out rows that don't satisfy the given predicate. + + You can use either a function or a callable class or an expression string to + perform the transformation. + For functions, Ray Data uses stateless Ray tasks. For classes, Ray Data uses + stateful Ray actors. For more information, see + :ref:`Stateful Transforms `. + + .. tip:: + If you use the `expr` parameter with a Python expression string, Ray Data + optimizes your filter with native Arrow interfaces. + + Examples: + + >>> import ray + >>> ds = ray.data.range(100) + >>> ds.filter(expr="id <= 4").take_all() + [{'id': 0}, {'id': 1}, {'id': 2}, {'id': 3}, {'id': 4}] + + Time complexity: O(dataset size / parallelism) + + Args: + fn: The predicate to apply to each row, or a class type + that can be instantiated to create such a callable. + expr: An expression string needs to be a valid Python expression that + will be converted to ``pyarrow.dataset.Expression`` type. + compute: This argument is deprecated. Use ``concurrency`` argument. + concurrency: The number of Ray workers to use concurrently. For a + fixed-sized worker pool of size ``n``, specify ``concurrency=n``. + For an autoscaling worker pool from ``m`` to ``n`` workers, specify + ``concurrency=(m, n)``. + ray_remote_args_fn: A function that returns a dictionary of remote args + passed to each map worker. The purpose of this argument is to generate + dynamic arguments for each actor/task, and will be called each time + prior to initializing the worker. Args returned from this dict will + always override the args in ``ray_remote_args``. Note: this is an + advanced, experimental feature. + ray_remote_args: Additional resource requirements to request from + Ray (e.g., num_gpus=1 to request GPUs for the map tasks). See + :func:`ray.remote` for details. + """ + # Ensure exactly one of fn or expr is provided + resolved_expr = None + if not ((fn is None) ^ (expr is None)): + raise ValueError("Exactly one of 'fn' or 'expr' must be provided.") + elif expr is not None: + from ray.data._internal.compute import TaskPoolStrategy + from ray.data._internal.planner.plan_expression.expression_evaluator import ( # noqa: E501 + ExpressionEvaluator, + ) + + # TODO: (srinathk) bind the expression to the actual schema. 
+ # If fn is a string, convert it to a pyarrow.dataset.Expression + # Initialize ExpressionEvaluator with valid columns, if available + evaluator = ExpressionEvaluator() + resolved_expr = evaluator.get_filters(expression=expr) + + compute = TaskPoolStrategy(size=concurrency) + else: + warnings.warn( + "Use 'expr' instead of 'fn' when possible for performant filters." + ) + + if callable(fn): + compute = get_compute_strategy( + fn=fn, + compute=compute, + concurrency=concurrency, + ) + else: + raise ValueError( + f"fn must be a UserDefinedFunction, but got " + f"{type(fn).__name__} instead." + ) + + plan = self._plan.copy() + op = Filter( + input_op=self._logical_plan.dag, + fn=fn, + filter_expr=resolved_expr, + compute=compute, + ray_remote_args_fn=ray_remote_args_fn, + ray_remote_args=ray_remote_args, + ) + logical_plan = LogicalPlan(op, self.context) + return Dataset(plan, logical_plan) + + @AllToAllAPI + @PublicAPI(api_group=SSR_API_GROUP) + def repartition( + self, + num_blocks: int, + *, + shuffle: bool = False, + ) -> "Dataset": + """Repartition the :class:`Dataset` into exactly this number of :ref:`blocks `. + + This method can be useful to tune the performance of your pipeline. To learn + more, see :ref:`Advanced: Performance Tips and Tuning `. + + If you're writing data to files, you can also use this method to change the + number of output files. To learn more, see + :ref:`Changing the number of output files `. + + .. note:: + + Repartition has two modes. If ``shuffle=False``, Ray Data performs the + minimal data movement needed to equalize block sizes. Otherwise, Ray Data + performs a full distributed shuffle. + + .. image:: /data/images/dataset-shuffle.svg + :align: center + + .. + https://docs.google.com/drawings/d/132jhE3KXZsf29ho1yUdPrCHB9uheHBWHJhDQMXqIVPA/edit + + Examples: + >>> import ray + >>> ds = ray.data.range(100).repartition(10).materialize() + >>> ds.num_blocks() + 10 + + Time complexity: O(dataset size / parallelism) + + Args: + num_blocks: The number of blocks. + shuffle: Whether to perform a distributed shuffle during the + repartition. When shuffle is enabled, each output block + contains a subset of data rows from each input block, which + requires all-to-all data movement. When shuffle is disabled, + output blocks are created from adjacent input blocks, + minimizing data movement. + + Returns: + The repartitioned :class:`Dataset`. + """ # noqa: E501 + plan = self._plan.copy() + op = Repartition( + self._logical_plan.dag, + num_outputs=num_blocks, + shuffle=shuffle, + ) + logical_plan = LogicalPlan(op, self.context) + return Dataset(plan, logical_plan) + + @AllToAllAPI + @PublicAPI(api_group=SSR_API_GROUP) + def random_shuffle( + self, + *, + seed: Optional[int] = None, + num_blocks: Optional[int] = None, + **ray_remote_args, + ) -> "Dataset": + """Randomly shuffle the rows of this :class:`Dataset`. + + .. tip:: + + This method can be slow. For better performance, try + :ref:`Iterating over batches with shuffling `. + Also, see :ref:`Optimizing shuffles `. + + Examples: + >>> import ray + >>> ds = ray.data.range(100) + >>> ds.random_shuffle().take(3) # doctest: +SKIP + {'id': 41}, {'id': 21}, {'id': 92}] + >>> ds.random_shuffle(seed=42).take(3) # doctest: +SKIP + {'id': 77}, {'id': 21}, {'id': 63}] + + Time complexity: O(dataset size / parallelism) + + Args: + seed: Fix the random seed to use, otherwise one is chosen + based on system randomness. + + Returns: + The shuffled :class:`Dataset`. 
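+
+        .. note::
+            As a lighter-weight alternative to a full shuffle (see the tip
+            above), rows can be shuffled locally while iterating. A minimal
+            sketch; the buffer and batch sizes are illustrative:
+
+            .. testcode::
+
+                import ray
+
+                ds = ray.data.range(100)
+                # Shuffles rows within a bounded in-memory buffer instead of
+                # performing a full distributed shuffle.
+                for batch in ds.iter_batches(
+                    batch_size=16, local_shuffle_buffer_size=64
+                ):
+                    pass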
+ """ # noqa: E501 + + if num_blocks is not None: + raise DeprecationWarning( + "`num_blocks` parameter is deprecated in Ray 2.9. random_shuffle() " + "does not support to change the number of output blocks. Use " + "repartition() instead.", # noqa: E501 + ) + plan = self._plan.copy() + op = RandomShuffle( + self._logical_plan.dag, + seed=seed, + ray_remote_args=ray_remote_args, + ) + logical_plan = LogicalPlan(op, self.context) + return Dataset(plan, logical_plan) + + @AllToAllAPI + @PublicAPI(api_group=SSR_API_GROUP) + def randomize_block_order( + self, + *, + seed: Optional[int] = None, + ) -> "Dataset": + """Randomly shuffle the :ref:`blocks ` of this :class:`Dataset`. + + This method is useful if you :meth:`~Dataset.split` your dataset into shards and + want to randomize the data in each shard without performing a full + :meth:`~Dataset.random_shuffle`. + + Examples: + >>> import ray + >>> ds = ray.data.range(100) + >>> ds.take(5) + [{'id': 0}, {'id': 1}, {'id': 2}, {'id': 3}, {'id': 4}] + >>> ds.randomize_block_order().take(5) # doctest: +SKIP + {'id': 15}, {'id': 16}, {'id': 17}, {'id': 18}, {'id': 19}] + + Args: + seed: Fix the random seed to use, otherwise one is chosen + based on system randomness. + + Returns: + The block-shuffled :class:`Dataset`. + """ # noqa: E501 + + plan = self._plan.copy() + op = RandomizeBlocks( + self._logical_plan.dag, + seed=seed, + ) + logical_plan = LogicalPlan(op, self.context) + return Dataset(plan, logical_plan) + + @PublicAPI(api_group=BT_API_GROUP) + def random_sample( + self, fraction: float, *, seed: Optional[int] = None + ) -> "Dataset": + """Returns a new :class:`Dataset` containing a random fraction of the rows. + + .. note:: + + This method returns roughly ``fraction * total_rows`` rows. An exact number + of rows isn't guaranteed. + + Examples: + >>> import ray + >>> ds = ray.data.range(100) + >>> ds.random_sample(0.1).count() # doctest: +SKIP + 10 + + Args: + fraction: The fraction of elements to sample. + seed: Seeds the python random pRNG generator. + + Returns: + Returns a :class:`Dataset` containing the sampled rows. + """ + import random + + import pandas as pd + import pyarrow as pa + + if self._plan.initial_num_blocks() == 0: + raise ValueError("Cannot sample from an empty Dataset.") + + if fraction < 0 or fraction > 1: + raise ValueError("Fraction must be between 0 and 1.") + + if seed is not None: + random.seed(seed) + + def random_sample(batch): + if isinstance(batch, list): + return [row for row in batch if random.random() <= fraction] + if isinstance(batch, pa.Table): + # Lets the item pass if weight generated for that item <= fraction + return batch.filter( + pa.array(random.random() <= fraction for _ in range(len(batch))) + ) + if isinstance(batch, pd.DataFrame): + return batch.sample(frac=fraction) + if isinstance(batch, np.ndarray): + return _create_possibly_ragged_ndarray( + [row for row in batch if random.random() <= fraction] + ) + raise ValueError(f"Unsupported batch type: {type(batch)}") + + return self.map_batches(random_sample, batch_format=None) + + @ConsumptionAPI + @PublicAPI(api_group=SMD_API_GROUP) + def streaming_split( + self, + n: int, + *, + equal: bool = False, + locality_hints: Optional[List["NodeIdStr"]] = None, + ) -> List[DataIterator]: + """Returns ``n`` :class:`DataIterators ` that can + be used to read disjoint subsets of the dataset in parallel. + + This method is the recommended way to consume :class:`Datasets ` for + distributed training. 
+ + Streaming split works by delegating the execution of this :class:`Dataset` to a + coordinator actor. The coordinator pulls block references from the executed + stream, and divides those blocks among ``n`` output iterators. Iterators pull + blocks from the coordinator actor to return to their caller on ``next``. + + The returned iterators are also repeatable; each iteration will trigger a + new execution of the Dataset. There is an implicit barrier at the start of + each iteration, which means that `next` must be called on all iterators before + the iteration starts. + + .. warning:: + + Because iterators are pulling blocks from the same :class:`Dataset` + execution, if one iterator falls behind, other iterators may be stalled. + + Examples: + + .. testcode:: + + import ray + + ds = ray.data.range(100) + it1, it2 = ds.streaming_split(2, equal=True) + + Consume data from iterators in parallel. + + .. testcode:: + + @ray.remote + def consume(it): + for batch in it.iter_batches(): + pass + + ray.get([consume.remote(it1), consume.remote(it2)]) + + You can loop over the iterators multiple times (multiple epochs). + + .. testcode:: + + @ray.remote + def train(it): + NUM_EPOCHS = 2 + for _ in range(NUM_EPOCHS): + for batch in it.iter_batches(): + pass + + ray.get([train.remote(it1), train.remote(it2)]) + + The following remote function call blocks waiting for a read on ``it2`` to + start. + + .. testcode:: + :skipif: True + + ray.get(train.remote(it1)) + + Args: + n: Number of output iterators to return. + equal: If ``True``, each output iterator sees an exactly equal number + of rows, dropping data if necessary. If ``False``, some iterators may + see slightly more or less rows than others, but no data is dropped. + locality_hints: Specify the node ids corresponding to each iterator + location. Dataset will try to minimize data movement based on the + iterator output locations. This list must have length ``n``. You can + get the current node id of a task or actor by calling + ``ray.get_runtime_context().get_node_id()``. + + Returns: + The output iterator splits. These iterators are Ray-serializable and can + be freely passed to any Ray task or actor. + + .. seealso:: + + :meth:`Dataset.split` + Unlike :meth:`~Dataset.streaming_split`, :meth:`~Dataset.split` + materializes the dataset in memory. + """ + return StreamSplitDataIterator.create(self, n, equal, locality_hints) + + @ConsumptionAPI + @PublicAPI(api_group=SMD_API_GROUP) + def split( + self, n: int, *, equal: bool = False, locality_hints: Optional[List[Any]] = None + ) -> List["MaterializedDataset"]: + """Materialize and split the dataset into ``n`` disjoint pieces. + + This method returns a list of ``MaterializedDataset`` that can be passed to Ray + Tasks and Actors and used to read the dataset rows in parallel. + + Examples: + + .. testcode:: + + @ray.remote + class Worker: + + def train(self, data_iterator): + for batch in data_iterator.iter_batches(batch_size=8): + pass + + workers = [Worker.remote() for _ in range(4)] + shards = ray.data.range(100).split(n=4, equal=True) + ray.get([w.train.remote(s) for w, s in zip(workers, shards)]) + + Time complexity: O(1) + + Args: + n: Number of child datasets to return. + equal: Whether to guarantee each split has an equal + number of records. This might drop records if the rows can't be + divided equally among the splits. + locality_hints: [Experimental] A list of Ray actor handles of size ``n``. 
+ The system tries to co-locate the blocks of the i-th dataset + with the i-th actor to maximize data locality. + + Returns: + A list of ``n`` disjoint dataset splits. + + .. seealso:: + + :meth:`Dataset.split_at_indices` + Unlike :meth:`~Dataset.split`, which splits a dataset into approximately + equal splits, :meth:`Dataset.split_proportionately` lets you split a + dataset into different sizes. + + :meth:`Dataset.split_proportionately` + This method is equivalent to :meth:`Dataset.split_at_indices` if + you compute indices manually. + + :meth:`Dataset.streaming_split`. + Unlike :meth:`~Dataset.split`, :meth:`~Dataset.streaming_split` + doesn't materialize the dataset in memory. + """ + if n <= 0: + raise ValueError(f"The number of splits {n} is not positive.") + + # fallback to split_at_indices for equal split without locality hints. + # simple benchmarks shows spilit_at_indices yields more stable performance. + # https://github.com/ray-project/ray/pull/26641 for more context. + if equal and locality_hints is None: + count = self.count() + split_index = count // n + # we are creating n split_indices which will generate + # n + 1 splits; the last split will at most contains (n - 1) + # rows, which could be safely dropped. + split_indices = [split_index * i for i in range(1, n + 1)] + shards = self.split_at_indices(split_indices) + return shards[:n] + + if locality_hints and len(locality_hints) != n: + raise ValueError( + f"The length of locality_hints {len(locality_hints)} " + f"doesn't equal the number of splits {n}." + ) + + bundle = self._plan.execute() + # We should not free blocks since we will materialize the Datasets. + owned_by_consumer = False + stats = self._plan.stats() + block_refs, metadata = zip(*bundle.blocks) + + if locality_hints is None: + block_refs_splits = np.array_split(block_refs, n) + metadata_splits = np.array_split(metadata, n) + + split_datasets = [] + for block_refs_split, metadata_split in zip( + block_refs_splits, metadata_splits + ): + ref_bundles = [ + RefBundle([(b, m)], owns_blocks=owned_by_consumer) + for b, m in zip(block_refs_split, metadata_split) + ] + logical_plan = LogicalPlan( + InputData(input_data=ref_bundles), self.context + ) + split_datasets.append( + MaterializedDataset( + ExecutionPlan(stats), + logical_plan, + ) + ) + return split_datasets + + metadata_mapping = dict(zip(block_refs, metadata)) + + # If the locality_hints is set, we use a two-round greedy algorithm + # to co-locate the blocks with the actors based on block + # and actor's location (node_id). + # + # The split algorithm tries to allocate equally-sized blocks regardless + # of locality. Thus we first calculate the expected number of blocks + # for each split. + # + # In the first round, for each actor, we look for all blocks that + # match the actor's node_id, then allocate those matched blocks to + # this actor until we reach the limit(expected number). + # + # In the second round: fill each actor's allocation with + # remaining unallocated blocks until we reach the limit. + + def build_allocation_size_map( + num_blocks: int, actors: List[Any] + ) -> Dict[Any, int]: + """Given the total number of blocks and a list of actors, calcuate + the expected number of blocks to allocate for each actor. 
+ """ + num_actors = len(actors) + num_blocks_per_actor = num_blocks // num_actors + num_blocks_left = num_blocks - num_blocks_per_actor * n + num_blocks_by_actor = {} + for i, actor in enumerate(actors): + num_blocks_by_actor[actor] = num_blocks_per_actor + if i < num_blocks_left: + num_blocks_by_actor[actor] += 1 + return num_blocks_by_actor + + def build_block_refs_by_node_id( + blocks: List[ObjectRef[Block]], + ) -> Dict[str, List[ObjectRef[Block]]]: + """Build the reverse index from node_id to block_refs. For + simplicity, if the block is stored on multiple nodes we + only pick the first one. + """ + block_ref_locations = ray.experimental.get_object_locations(blocks) + block_refs_by_node_id = collections.defaultdict(list) + for block_ref in blocks: + node_ids = block_ref_locations.get(block_ref, {}).get("node_ids", []) + node_id = node_ids[0] if node_ids else None + block_refs_by_node_id[node_id].append(block_ref) + return block_refs_by_node_id + + def build_node_id_by_actor(actors: List[Any]) -> Dict[Any, str]: + """Build a map from a actor to its node_id.""" + actors_state = ray._private.state.actors() + return { + actor: actors_state.get(actor._actor_id.hex(), {}) + .get("Address", {}) + .get("NodeID") + for actor in actors + } + + # expected number of blocks to be allocated for each actor + expected_block_count_by_actor = build_allocation_size_map( + len(block_refs), locality_hints + ) + # the reverse index from node_id to block_refs + block_refs_by_node_id = build_block_refs_by_node_id(block_refs) + # the map from actor to its node_id + node_id_by_actor = build_node_id_by_actor(locality_hints) + + allocation_per_actor = collections.defaultdict(list) + + # In the first round, for each actor, we look for all blocks that + # match the actor's node_id, then allocate those matched blocks to + # this actor until we reach the limit(expected number) + for actor in locality_hints: + node_id = node_id_by_actor[actor] + matching_blocks = block_refs_by_node_id[node_id] + expected_block_count = expected_block_count_by_actor[actor] + allocation = [] + while matching_blocks and len(allocation) < expected_block_count: + allocation.append(matching_blocks.pop()) + allocation_per_actor[actor] = allocation + + # In the second round: fill each actor's allocation with + # remaining unallocated blocks until we reach the limit + remaining_block_refs = list( + itertools.chain.from_iterable(block_refs_by_node_id.values()) + ) + for actor in locality_hints: + while ( + len(allocation_per_actor[actor]) < expected_block_count_by_actor[actor] + ): + allocation_per_actor[actor].append(remaining_block_refs.pop()) + + assert len(remaining_block_refs) == 0, len(remaining_block_refs) + + per_split_bundles = [] + for actor in locality_hints: + blocks = allocation_per_actor[actor] + metadata = [metadata_mapping[b] for b in blocks] + bundle = RefBundle( + tuple(zip(blocks, metadata)), owns_blocks=owned_by_consumer + ) + per_split_bundles.append(bundle) + + if equal: + # equalize the splits + per_split_bundles = _equalize(per_split_bundles, owned_by_consumer) + + split_datasets = [] + for bundle in per_split_bundles: + logical_plan = LogicalPlan(InputData(input_data=[bundle]), self.context) + split_datasets.append( + MaterializedDataset( + ExecutionPlan(stats), + logical_plan, + ) + ) + return split_datasets + + @ConsumptionAPI + @PublicAPI(api_group=SMD_API_GROUP) + def split_at_indices(self, indices: List[int]) -> List["MaterializedDataset"]: + """Materialize and split the dataset at the given indices (like 
``np.split``).
+
+ Examples:
+ >>> import ray
+ >>> ds = ray.data.range(10)
+ >>> d1, d2, d3 = ds.split_at_indices([2, 5])
+ >>> d1.take_batch()
+ {'id': array([0, 1])}
+ >>> d2.take_batch()
+ {'id': array([2, 3, 4])}
+ >>> d3.take_batch()
+ {'id': array([5, 6, 7, 8, 9])}
+
+ Time complexity: O(num splits)
+
+ Args:
+ indices: List of sorted integers that indicate where the dataset
+ is split. If an index exceeds the length of the dataset,
+ an empty dataset is returned.
+
+ Returns:
+ The dataset splits.
+
+ .. seealso::
+
+ :meth:`Dataset.split`
+ Unlike :meth:`~Dataset.split_at_indices`, which lets you split a
+ dataset into different sizes, :meth:`Dataset.split` splits a dataset
+ into approximately equal splits.
+
+ :meth:`Dataset.split_proportionately`
+ This method is equivalent to :meth:`Dataset.split_at_indices` if
+ you compute indices manually.
+
+ :meth:`Dataset.streaming_split`
+ Unlike :meth:`~Dataset.split`, :meth:`~Dataset.streaming_split`
+ doesn't materialize the dataset in memory.
+ """
+
+ if len(indices) < 1:
+ raise ValueError("indices must be at least of length 1")
+ if sorted(indices) != indices:
+ raise ValueError("indices must be sorted")
+ if indices[0] < 0:
+ raise ValueError("indices must be non-negative")
+ start_time = time.perf_counter()
+ bundle = self._plan.execute()
+ blocks, metadata = _split_at_indices(
+ bundle.blocks,
+ indices,
+ False,
+ )
+ split_duration = time.perf_counter() - start_time
+ parent_stats = self._plan.stats()
+ splits = []
+
+ for bs, ms in zip(blocks, metadata):
+ stats = DatasetStats(metadata={"Split": ms}, parent=parent_stats)
+ stats.time_total_s = split_duration
+ ref_bundles = [
+ RefBundle([(b, m)], owns_blocks=False) for b, m in zip(bs, ms)
+ ]
+ logical_plan = LogicalPlan(InputData(input_data=ref_bundles), self.context)
+
+ splits.append(
+ MaterializedDataset(
+ ExecutionPlan(stats),
+ logical_plan,
+ )
+ )
+ return splits
+
+ @ConsumptionAPI
+ @PublicAPI(api_group=SMD_API_GROUP)
+ def split_proportionately(
+ self, proportions: List[float]
+ ) -> List["MaterializedDataset"]:
+ """Materialize and split the dataset using proportions.
+
+ A common use case for this is splitting the dataset into train
+ and test sets (equivalent to e.g., scikit-learn's ``train_test_split``).
+ For a higher level abstraction, see :meth:`Dataset.train_test_split`.
+
+ This method splits datasets so that all splits
+ always contain at least one row. If that isn't possible,
+ an exception is raised.
+
+ This is equivalent to calculating the indices manually and calling
+ :meth:`Dataset.split_at_indices`.
+
+ Examples:
+ >>> import ray
+ >>> ds = ray.data.range(10)
+ >>> d1, d2, d3 = ds.split_proportionately([0.2, 0.5])
+ >>> d1.take_batch()
+ {'id': array([0, 1])}
+ >>> d2.take_batch()
+ {'id': array([2, 3, 4, 5, 6])}
+ >>> d3.take_batch()
+ {'id': array([7, 8, 9])}
+
+ Time complexity: O(num splits)
+
+ Args:
+ proportions: List of proportions to split the dataset according to.
+ Must sum up to less than 1, and each proportion must be bigger
+ than 0.
+
+ Returns:
+ The dataset splits.
+
+ .. seealso::
+
+ :meth:`Dataset.split`
+ Unlike :meth:`~Dataset.split_proportionately`, which lets you split a
+ dataset into different sizes, :meth:`Dataset.split` splits a dataset
+ into approximately equal splits.
+
+ :meth:`Dataset.split_at_indices`
+ :meth:`Dataset.split_proportionately` uses this method under the hood.
+
+ :meth:`Dataset.streaming_split`
+ Unlike :meth:`~Dataset.split`, :meth:`~Dataset.streaming_split` + doesn't materialize the dataset in memory. + """ + + if len(proportions) < 1: + raise ValueError("proportions must be at least of length 1") + if sum(proportions) >= 1: + raise ValueError("proportions must sum to less than 1") + if any(p <= 0 for p in proportions): + raise ValueError("proportions must be bigger than 0") + + dataset_length = self.count() + cumulative_proportions = np.cumsum(proportions) + split_indices = [ + int(dataset_length * proportion) for proportion in cumulative_proportions + ] + + # Ensure each split has at least one element + subtract = 0 + for i in range(len(split_indices) - 2, -1, -1): + split_indices[i] -= subtract + if split_indices[i] == split_indices[i + 1]: + subtract += 1 + split_indices[i] -= 1 + if any(i <= 0 for i in split_indices): + raise ValueError( + "Couldn't create non-empty splits with the given proportions." + ) + + return self.split_at_indices(split_indices) + + @ConsumptionAPI + @PublicAPI(api_group=SMD_API_GROUP) + def train_test_split( + self, + test_size: Union[int, float], + *, + shuffle: bool = False, + seed: Optional[int] = None, + ) -> Tuple["MaterializedDataset", "MaterializedDataset"]: + """Materialize and split the dataset into train and test subsets. + + Examples: + + >>> import ray + >>> ds = ray.data.range(8) + >>> train, test = ds.train_test_split(test_size=0.25) + >>> train.take_batch() + {'id': array([0, 1, 2, 3, 4, 5])} + >>> test.take_batch() + {'id': array([6, 7])} + + Args: + test_size: If float, should be between 0.0 and 1.0 and represent the + proportion of the dataset to include in the test split. If int, + represents the absolute number of test samples. The train split + always complements the test split. + shuffle: Whether or not to globally shuffle the dataset before splitting. + Defaults to ``False``. This may be a very expensive operation with a + large dataset. + seed: Fix the random seed to use for shuffle, otherwise one is chosen + based on system randomness. Ignored if ``shuffle=False``. + + Returns: + Train and test subsets as two ``MaterializedDatasets``. + + .. seealso:: + + :meth:`Dataset.split_proportionately` + """ + ds = self + + if shuffle: + ds = ds.random_shuffle(seed=seed) + + if not isinstance(test_size, (int, float)): + raise TypeError(f"`test_size` must be int or float got {type(test_size)}.") + if isinstance(test_size, float): + if test_size <= 0 or test_size >= 1: + raise ValueError( + "If `test_size` is a float, it must be bigger than 0 and smaller " + f"than 1. Got {test_size}." + ) + return ds.split_proportionately([1 - test_size]) + else: + ds_length = ds.count() + if test_size <= 0 or test_size >= ds_length: + raise ValueError( + "If `test_size` is an int, it must be bigger than 0 and smaller " + f"than the size of the dataset ({ds_length}). " + f"Got {test_size}." + ) + return ds.split_at_indices([ds_length - test_size]) + + @PublicAPI(api_group=SMD_API_GROUP) + def union(self, *other: List["Dataset"]) -> "Dataset": + """Concatenate :class:`Datasets ` across rows. + + The order of the blocks in the datasets is preserved, as is the + relative ordering between the datasets passed in the argument list. + + .. caution:: + Unioned datasets aren't lineage-serializable. As a result, they can't be + used as a tunable hyperparameter in Ray Tune. 
+
+ Examples:
+
+ >>> import ray
+ >>> ds1 = ray.data.range(2)
+ >>> ds2 = ray.data.range(3)
+ >>> ds1.union(ds2).take_all()
+ [{'id': 0}, {'id': 1}, {'id': 0}, {'id': 1}, {'id': 2}]
+
+ Args:
+ other: List of datasets to combine with this one. The datasets
+ must have the same schema as this dataset, otherwise the
+ behavior is undefined.
+
+ Returns:
+ A new dataset holding the rows of the input datasets.
+ """
+ start_time = time.perf_counter()
+
+ datasets = [self] + list(other)
+ logical_plans = [union_ds._plan._logical_plan for union_ds in datasets]
+ op = UnionLogicalOperator(
+ *[plan.dag for plan in logical_plans],
+ )
+ logical_plan = LogicalPlan(op, self.context)
+
+ stats = DatasetStats(
+ metadata={"Union": []},
+ parent=[d._plan.stats() for d in datasets],
+ )
+ stats.time_total_s = time.perf_counter() - start_time
+ return Dataset(
+ ExecutionPlan(stats),
+ logical_plan,
+ )
+
+ @AllToAllAPI
+ @PublicAPI(api_group=GGA_API_GROUP)
+ def groupby(
+ self,
+ key: Union[str, List[str], None],
+ ) -> "GroupedData":
+ """Group rows of a :class:`Dataset` according to a column.
+
+ Use this method to transform data based on a
+ categorical variable.
+
+ Examples:
+
+ .. testcode::
+
+ import pandas as pd
+ import ray
+
+ def normalize_variety(group: pd.DataFrame) -> pd.DataFrame:
+ for feature in group.drop(columns="variety").columns:
+ group[feature] = group[feature] / group[feature].abs().max()
+ return group
+
+ ds = (
+ ray.data.read_parquet("s3://anonymous@ray-example-data/iris.parquet")
+ .groupby("variety")
+ .map_groups(normalize_variety, batch_format="pandas")
+ )
+
+ Time complexity: O(dataset size * log(dataset size / parallelism))
+
+ Args:
+ key: A column name or list of column names.
+ If this is ``None``, place all rows in a single group.
+
+ Returns:
+ A lazy :class:`~ray.data.grouped_data.GroupedData`.
+
+ .. seealso::
+
+ :meth:`~ray.data.grouped_data.GroupedData.map_groups`
+ Call this method to transform groups of data.
+ """
+ from ray.data.grouped_data import GroupedData
+
+ # Always allow None since groupby interprets that as grouping all
+ # records into a single global group.
+ if key is not None:
+ # Fetching the schema can trigger execution, so don't fetch it for
+ # input validation.
+ SortKey(key).validate_schema(self.schema(fetch_if_missing=False))
+
+ return GroupedData(self, key)
+
+ @AllToAllAPI
+ @ConsumptionAPI
+ @PublicAPI(api_group=GGA_API_GROUP)
+ def unique(self, column: str) -> List[Any]:
+ """List the unique elements in a given column.
+
+ Examples:
+
+ >>> import ray
+ >>> ds = ray.data.from_items([1, 2, 3, 2, 3])
+ >>> ds.unique("item")
+ [1, 2, 3]
+
+ This function is very useful for computing labels
+ in a machine learning dataset:
+
+ >>> import ray
+ >>> ds = ray.data.read_csv("s3://anonymous@ray-example-data/iris.csv")
+ >>> ds.unique("target")
+ [0, 1, 2]
+
+ One common use case is to map the integer class labels back to
+ their class names for training and inference:
+
+ >>> classes = {0: 'Setosa', 1: 'Versicolor', 2: 'Virginica'}
+ >>> def preprocessor(df, classes):
+ ... df["variety"] = df["target"].map(classes)
+ ... return df
+ >>> train_ds = ds.map_batches(
+ ... preprocessor, fn_kwargs={"classes": classes}, batch_format="pandas")
+ >>> train_ds.sort("sepal length (cm)").take(1) # Sort to make it deterministic
+ [{'sepal length (cm)': 4.3, ..., 'variety': 'Setosa'}]
+
+ Time complexity: O(dataset size / parallelism)
+
+ Args:
+ column: The column to collect unique elements over.
+
+ Returns:
+ A list with unique elements in the given column.
+ """ # noqa: E501 + ret = self._aggregate_on(Unique, column) + return self._aggregate_result(ret) + + @AllToAllAPI + @ConsumptionAPI + @PublicAPI(api_group=GGA_API_GROUP) + def aggregate(self, *aggs: AggregateFn) -> Union[Any, Dict[str, Any]]: + """Aggregate values using one or more functions. + + Use this method to compute metrics like the product of a column. + + Examples: + + .. testcode:: + + import ray + from ray.data.aggregate import AggregateFn + + ds = ray.data.from_items([{"number": i} for i in range(1, 10)]) + aggregation = AggregateFn( + init=lambda column: 1, + # Apply this to each row to produce a partial aggregate result + accumulate_row=lambda a, row: a * row["number"], + # Apply this to merge partial aggregate results into a final result + merge=lambda a1, a2: a1 * a2, + name="prod" + ) + print(ds.aggregate(aggregation)) + + .. testoutput:: + + {'prod': 362880} + + Time complexity: O(dataset size / parallelism) + + Args: + *aggs: :class:`Aggregations ` to perform. + + Returns: + A ``dict`` where each each value is an aggregation for a given column. + """ + ret = self.groupby(None).aggregate(*aggs).take(1) + return ret[0] if len(ret) > 0 else None + + @AllToAllAPI + @ConsumptionAPI + @PublicAPI(api_group=GGA_API_GROUP) + def sum( + self, on: Optional[Union[str, List[str]]] = None, ignore_nulls: bool = True + ) -> Union[Any, Dict[str, Any]]: + """Compute the sum of one or more columns. + + Examples: + >>> import ray + >>> ray.data.range(100).sum("id") + 4950 + >>> ray.data.from_items([ + ... {"A": i, "B": i**2} + ... for i in range(100) + ... ]).sum(["A", "B"]) + {'sum(A)': 4950, 'sum(B)': 328350} + + Args: + on: a column name or a list of column names to aggregate. + ignore_nulls: Whether to ignore null values. If ``True``, null + values are ignored when computing the sum. If ``False``, + when a null value is encountered, the output is ``None``. + Ray Data considers ``np.nan``, ``None``, and ``pd.NaT`` to be null + values. Default is ``True``. + + Returns: + The sum result. + + For different values of ``on``, the return varies: + + - ``on=None``: a dict containing the column-wise sum of all + columns, + - ``on="col"``: a scalar representing the sum of all items in + column ``"col"``, + - ``on=["col_1", ..., "col_n"]``: an n-column ``dict`` + containing the column-wise sum of the provided columns. + + If the dataset is empty, all values are null. If ``ignore_nulls`` is + ``False`` and any value is null, then the output is ``None``. + """ + ret = self._aggregate_on(Sum, on, ignore_nulls=ignore_nulls) + return self._aggregate_result(ret) + + @AllToAllAPI + @ConsumptionAPI + @PublicAPI(api_group=GGA_API_GROUP) + def min( + self, on: Optional[Union[str, List[str]]] = None, ignore_nulls: bool = True + ) -> Union[Any, Dict[str, Any]]: + """Return the minimum of one or more columns. + + Examples: + >>> import ray + >>> ray.data.range(100).min("id") + 0 + >>> ray.data.from_items([ + ... {"A": i, "B": i**2} + ... for i in range(100) + ... ]).min(["A", "B"]) + {'min(A)': 0, 'min(B)': 0} + + Args: + on: a column name or a list of column names to aggregate. + ignore_nulls: Whether to ignore null values. If ``True``, null + values are ignored when computing the min; if ``False``, + when a null value is encountered, the output is ``None``. + This method considers ``np.nan``, ``None``, and ``pd.NaT`` to be null + values. Default is ``True``. + + Returns: + The min result. 
+
+ For different values of ``on``, the return varies:
+
+ - ``on=None``: a dict containing the column-wise min of
+ all columns,
+ - ``on="col"``: a scalar representing the min of all items in
+ column ``"col"``,
+ - ``on=["col_1", ..., "col_n"]``: an n-column dict
+ containing the column-wise min of the provided columns.
+
+ If the dataset is empty, all values are null. If ``ignore_nulls`` is
+ ``False`` and any value is null, then the output is ``None``.
+ """
+ ret = self._aggregate_on(Min, on, ignore_nulls=ignore_nulls)
+ return self._aggregate_result(ret)
+
+ @AllToAllAPI
+ @ConsumptionAPI
+ @PublicAPI(api_group=GGA_API_GROUP)
+ def max(
+ self, on: Optional[Union[str, List[str]]] = None, ignore_nulls: bool = True
+ ) -> Union[Any, Dict[str, Any]]:
+ """Return the maximum of one or more columns.
+
+ Examples:
+ >>> import ray
+ >>> ray.data.range(100).max("id")
+ 99
+ >>> ray.data.from_items([
+ ... {"A": i, "B": i**2}
+ ... for i in range(100)
+ ... ]).max(["A", "B"])
+ {'max(A)': 99, 'max(B)': 9801}
+
+ Args:
+ on: a column name or a list of column names to aggregate.
+ ignore_nulls: Whether to ignore null values. If ``True``, null
+ values are ignored when computing the max; if ``False``,
+ when a null value is encountered, the output is ``None``.
+ This method considers ``np.nan``, ``None``, and ``pd.NaT`` to be null
+ values. Default is ``True``.
+
+ Returns:
+ The max result.
+
+ For different values of ``on``, the return varies:
+
+ - ``on=None``: a dict containing the column-wise max of
+ all columns,
+ - ``on="col"``: a scalar representing the max of all items in
+ column ``"col"``,
+ - ``on=["col_1", ..., "col_n"]``: an n-column dict
+ containing the column-wise max of the provided columns.
+
+ If the dataset is empty, all values are null. If ``ignore_nulls`` is
+ ``False`` and any value is null, then the output is ``None``.
+ """
+ ret = self._aggregate_on(Max, on, ignore_nulls=ignore_nulls)
+ return self._aggregate_result(ret)
+
+ @AllToAllAPI
+ @ConsumptionAPI
+ @PublicAPI(api_group=GGA_API_GROUP)
+ def mean(
+ self, on: Optional[Union[str, List[str]]] = None, ignore_nulls: bool = True
+ ) -> Union[Any, Dict[str, Any]]:
+ """Compute the mean of one or more columns.
+
+ Examples:
+ >>> import ray
+ >>> ray.data.range(100).mean("id")
+ 49.5
+ >>> ray.data.from_items([
+ ... {"A": i, "B": i**2}
+ ... for i in range(100)
+ ... ]).mean(["A", "B"])
+ {'mean(A)': 49.5, 'mean(B)': 3283.5}
+
+ Args:
+ on: a column name or a list of column names to aggregate.
+ ignore_nulls: Whether to ignore null values. If ``True``, null
+ values are ignored when computing the mean; if ``False``,
+ when a null value is encountered, the output is ``None``.
+ This method considers ``np.nan``, ``None``, and ``pd.NaT`` to be null
+ values. Default is ``True``.
+
+ Returns:
+ The mean result.
+
+ For different values of ``on``, the return varies:
+
+ - ``on=None``: a dict containing the column-wise mean of
+ all columns,
+ - ``on="col"``: a scalar representing the mean of all items in
+ column ``"col"``,
+ - ``on=["col_1", ..., "col_n"]``: an n-column dict
+ containing the column-wise mean of the provided columns.
+
+ If the dataset is empty, all values are null. If ``ignore_nulls`` is
+ ``False`` and any value is null, then the output is ``None``.
+ """ + ret = self._aggregate_on(Mean, on, ignore_nulls=ignore_nulls) + return self._aggregate_result(ret) + + @AllToAllAPI + @ConsumptionAPI + @PublicAPI(api_group=GGA_API_GROUP) + def std( + self, + on: Optional[Union[str, List[str]]] = None, + ddof: int = 1, + ignore_nulls: bool = True, + ) -> Union[Any, Dict[str, Any]]: + """Compute the standard deviation of one or more columns. + + .. note:: + This method uses Welford's online method for an accumulator-style + computation of the standard deviation. This method has + numerical stability, and is computable in a single pass. This may give + different (but more accurate) results than NumPy, Pandas, and sklearn, which + use a less numerically stable two-pass algorithm. + To learn more, see + `the Wikapedia article `_. + + Examples: + >>> import ray + >>> round(ray.data.range(100).std("id", ddof=0), 5) + 28.86607 + >>> ray.data.from_items([ + ... {"A": i, "B": i**2} + ... for i in range(100) + ... ]).std(["A", "B"]) + {'std(A)': 29.011491975882016, 'std(B)': 2968.1748039269296} + + Args: + on: a column name or a list of column names to aggregate. + ddof: Delta Degrees of Freedom. The divisor used in calculations + is ``N - ddof``, where ``N`` represents the number of elements. + ignore_nulls: Whether to ignore null values. If ``True``, null + values are ignored when computing the std; if ``False``, + when a null value is encountered, the output is ``None``. + This method considers ``np.nan``, ``None``, and ``pd.NaT`` to be null + values. Default is ``True``. + + Returns: + The standard deviation result. + + For different values of ``on``, the return varies: + + - ``on=None``: an dict containing the column-wise std of + all columns, + - ``on="col"``: a scalar representing the std of all items in + column ``"col"``, + - ``on=["col_1", ..., "col_n"]``: an n-column dict + containing the column-wise std of the provided columns. + + If the dataset is empty, all values are null. If ``ignore_nulls`` is + ``False`` and any value is null, then the output is ``None``. + """ # noqa: E501 + ret = self._aggregate_on(Std, on, ignore_nulls=ignore_nulls, ddof=ddof) + return self._aggregate_result(ret) + + @AllToAllAPI + @PublicAPI(api_group=SSR_API_GROUP) + def sort( + self, + key: Union[str, List[str]], + descending: Union[bool, List[bool]] = False, + boundaries: List[Union[int, float]] = None, + ) -> "Dataset": + """Sort the dataset by the specified key column or key function. + The `key` parameter must be specified (i.e., it cannot be `None`). + + .. note:: + If provided, the `boundaries` parameter can only be used to partition + the first sort key. + + Examples: + >>> import ray + >>> ds = ray.data.range(15) + >>> ds = ds.sort("id", descending=False, boundaries=[5, 10]) + >>> for df in ray.get(ds.to_pandas_refs()): + ... print(df) + id + 0 0 + 1 1 + 2 2 + 3 3 + 4 4 + id + 0 5 + 1 6 + 2 7 + 3 8 + 4 9 + id + 0 10 + 1 11 + 2 12 + 3 13 + 4 14 + + Time complexity: O(dataset size * log(dataset size / parallelism)) + + Args: + key: The column or a list of columns to sort by. + descending: Whether to sort in descending order. Must be a boolean or a list + of booleans matching the number of the columns. + boundaries: The list of values based on which to repartition the dataset. 
+ For example, if the input boundary is [10, 20], rows with values less
+ than 10 will be divided into the first block, rows with values greater
+ than or equal to 10 and less than 20 will be divided into the
+ second block, and rows with values greater than or equal to 20
+ will be divided into the third block. If not provided, the
+ boundaries will be sampled from the input blocks. Currently, this
+ feature only supports numeric columns.
+
+ Returns:
+ A new, sorted :class:`Dataset`.
+
+ Raises:
+ ``ValueError``: if the sort key is None.
+ """
+ if key is None:
+ raise ValueError("The 'key' parameter cannot be None for sorting.")
+ sort_key = SortKey(key, descending, boundaries)
+ plan = self._plan.copy()
+ op = Sort(
+ self._logical_plan.dag,
+ sort_key=sort_key,
+ )
+ logical_plan = LogicalPlan(op, self.context)
+ return Dataset(plan, logical_plan)
+
+ @PublicAPI(api_group=SMD_API_GROUP)
+ def zip(self, other: "Dataset") -> "Dataset":
+ """Zip the columns of this dataset with the columns of another.
+
+ The datasets must have the same number of rows. Their column sets are
+ merged, and any duplicate column names are disambiguated with suffixes like
+ ``"_1"``.
+
+ .. note::
+ The smaller of the two datasets is repartitioned to align the number
+ of rows per block with the larger dataset.
+
+ .. note::
+ Zipped datasets aren't lineage-serializable. As a result, they can't be used
+ as a tunable hyperparameter in Ray Tune.
+
+ Examples:
+ >>> import ray
+ >>> ds1 = ray.data.range(5)
+ >>> ds2 = ray.data.range(5)
+ >>> ds1.zip(ds2).take_batch()
+ {'id': array([0, 1, 2, 3, 4]), 'id_1': array([0, 1, 2, 3, 4])}
+
+ Args:
+ other: The dataset to zip with on the right hand side.
+
+ Returns:
+ A :class:`Dataset` containing the columns of the second dataset
+ concatenated horizontally with the columns of the first dataset,
+ with duplicate column names disambiguated with suffixes like ``"_1"``.
+ """
+ plan = self._plan.copy()
+ op = Zip(self._logical_plan.dag, other._logical_plan.dag)
+ logical_plan = LogicalPlan(op, self.context)
+ return Dataset(plan, logical_plan)
+
+ @PublicAPI(api_group=BT_API_GROUP)
+ def limit(self, limit: int) -> "Dataset":
+ """Truncate the dataset to the first ``limit`` rows.
+
+ Unlike :meth:`~Dataset.take`, this method doesn't move data to the caller's
+ machine. Instead, it returns a new :class:`Dataset` pointing to the truncated
+ distributed data.
+
+ Examples:
+ >>> import ray
+ >>> ds = ray.data.range(1000)
+ >>> ds.limit(5).count()
+ 5
+
+ Time complexity: O(limit specified)
+
+ Args:
+ limit: The size of the dataset to truncate to.
+
+ Returns:
+ The truncated dataset.
+ """
+ plan = self._plan.copy()
+ op = Limit(self._logical_plan.dag, limit=limit)
+ logical_plan = LogicalPlan(op, self.context)
+ return Dataset(plan, logical_plan)
+
+ @ConsumptionAPI
+ @PublicAPI(api_group=CD_API_GROUP)
+ def take_batch(
+ self, batch_size: int = 20, *, batch_format: Optional[str] = "default"
+ ) -> DataBatch:
+ """Return up to ``batch_size`` rows from the :class:`Dataset` in a batch.
+
+ Ray Data represents batches as NumPy arrays or pandas DataFrames. You can
+ configure the batch type by specifying ``batch_format``.
+
+ This method is useful for inspecting inputs to :meth:`~Dataset.map_batches`.
+
+ .. warning::
+
+ :meth:`~Dataset.take_batch` moves up to ``batch_size`` rows to the caller's
+ machine. If ``batch_size`` is large, this method can cause an
+ ``OutOfMemory`` error on the caller.
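+
+ .. note::
+
+ To inspect rows as a pandas DataFrame instead of the default dict of
+ NumPy arrays, pass ``batch_format="pandas"``. For example (an
+ illustrative sketch; the printed form depends on your pandas version):
+
+ >>> import ray
+ >>> ray.data.range(3).take_batch(3, batch_format="pandas") # doctest: +SKIP
+ id
+ 0 0
+ 1 1
+ 2 2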
+ + Examples: + + >>> import ray + >>> ds = ray.data.range(100) + >>> ds.take_batch(5) + {'id': array([0, 1, 2, 3, 4])} + + Time complexity: O(batch_size specified) + + Args: + batch_size: The maximum number of rows to return. + batch_format: If ``"default"`` or ``"numpy"``, batches are + ``Dict[str, numpy.ndarray]``. If ``"pandas"``, batches are + ``pandas.DataFrame``. + + Returns: + A batch of up to ``batch_size`` rows from the dataset. + + Raises: + ``ValueError``: if the dataset is empty. + """ + batch_format = _apply_batch_format(batch_format) + limited_ds = self.limit(batch_size) + + try: + res = next( + iter( + limited_ds.iter_batches( + batch_size=batch_size, + prefetch_batches=0, + batch_format=batch_format, + ) + ) + ) + except StopIteration: + raise ValueError("The dataset is empty.") + self._synchronize_progress_bar() + + # Save the computed stats to the original dataset. + self._plan._snapshot_stats = limited_ds._plan.stats() + return res + + @ConsumptionAPI + @PublicAPI(api_group=CD_API_GROUP) + def take(self, limit: int = 20) -> List[Dict[str, Any]]: + """Return up to ``limit`` rows from the :class:`Dataset`. + + This method is useful for inspecting data. + + .. warning:: + + :meth:`~Dataset.take` moves up to ``limit`` rows to the caller's machine. If + ``limit`` is large, this method can cause an ``OutOfMemory`` error on the + caller. + + Examples: + + >>> import ray + >>> ds = ray.data.range(100) + >>> ds.take(3) + [{'id': 0}, {'id': 1}, {'id': 2}] + + Time complexity: O(limit specified) + + Args: + limit: The maximum number of rows to return. + + Returns: + A list of up to ``limit`` rows from the dataset. + + .. seealso:: + + :meth:`~Dataset.take_all` + Call this method to return all rows. + """ + if ray.util.log_once("dataset_take"): + logger.info( + "Tip: Use `take_batch()` instead of `take() / show()` to return " + "records in pandas or numpy batch format." + ) + output = [] + + limited_ds = self.limit(limit) + for row in limited_ds.iter_rows(): + output.append(row) + if len(output) >= limit: + break + self._synchronize_progress_bar() + + # Save the computed stats to the original dataset. + self._plan._snapshot_stats = limited_ds._plan.stats() + return output + + @ConsumptionAPI + @PublicAPI(api_group=CD_API_GROUP) + def take_all(self, limit: Optional[int] = None) -> List[Dict[str, Any]]: + """Return all of the rows in this :class:`Dataset`. + + This method is useful for inspecting small datasets. + + .. warning:: + + :meth:`~Dataset.take_all` moves the entire dataset to the caller's + machine. If the dataset is large, this method can cause an + ``OutOfMemory`` error on the caller. + + Examples: + >>> import ray + >>> ds = ray.data.range(5) + >>> ds.take_all() + [{'id': 0}, {'id': 1}, {'id': 2}, {'id': 3}, {'id': 4}] + + Time complexity: O(dataset size) + + Args: + limit: Raise an error if the size exceeds the specified limit. + + Returns: + A list of all the rows in the dataset. + + .. seealso:: + + :meth:`~Dataset.take` + Call this method to return a specific number of rows. + """ + output = [] + for row in self.iter_rows(): + output.append(row) + if limit is not None and len(output) > limit: + raise ValueError( + f"The dataset has more than the given limit of {limit} records." + ) + self._synchronize_progress_bar() + return output + + @ConsumptionAPI + @PublicAPI(api_group=CD_API_GROUP) + def show(self, limit: int = 20) -> None: + """Print up to the given number of rows from the :class:`Dataset`. + + This method is useful for inspecting data. 
+
+ Examples:
+
+ >>> import ray
+ >>> ds = ray.data.range(100)
+ >>> ds.show(3)
+ {'id': 0}
+ {'id': 1}
+ {'id': 2}
+
+ Time complexity: O(limit specified)
+
+ Args:
+ limit: The maximum number of rows to print.
+
+ .. seealso::
+
+ :meth:`~Dataset.take`
+ Call this method to get (not print) a given number of rows.
+ """
+ for row in self.take(limit):
+ print(row)
+
+ @ConsumptionAPI(
+ if_more_than_read=True,
+ datasource_metadata="row count",
+ pattern="Examples:",
+ )
+ @PublicAPI(api_group=IM_API_GROUP)
+ def count(self) -> int:
+ """Count the number of rows in the dataset.
+
+ For Datasets that only read Parquet files (created with
+ :meth:`~ray.data.read_parquet`), this method reads the file metadata to
+ efficiently count the number of rows without reading the entire data.
+
+ Examples:
+ >>> import ray
+ >>> ds = ray.data.range(10)
+ >>> ds.count()
+ 10
+
+ Returns:
+ The number of records in the dataset.
+ """
+ # Handle empty dataset.
+ if self._plan.initial_num_blocks() == 0:
+ return 0
+
+ # For parquet, we can return the count directly from metadata.
+ meta_count = self._meta_count()
+ if meta_count is not None:
+ return meta_count
+
+ plan = self._plan.copy()
+ count_op = Count([self._logical_plan.dag])
+ logical_plan = LogicalPlan(count_op, self.context)
+ count_ds = Dataset(plan, logical_plan)
+
+ count = 0
+ for batch in count_ds.iter_batches(batch_size=None):
+ assert Count.COLUMN_NAME in batch, (
+ "Outputs from the 'Count' logical operator should contain a column "
+ f"named '{Count.COLUMN_NAME}'"
+ )
+ count += batch[Count.COLUMN_NAME].sum()
+ # Explicitly cast to int to avoid returning `np.int64`, which is the result
+ # from calculating `sum()` from numpy batches.
+ return int(count)
+
+ @ConsumptionAPI(
+ if_more_than_read=True,
+ datasource_metadata="schema",
+ extra_condition="or if ``fetch_if_missing=True`` (the default)",
+ pattern="Time complexity:",
+ )
+ @PublicAPI(api_group=IM_API_GROUP)
+ def schema(self, fetch_if_missing: bool = True) -> Optional["Schema"]:
+ """Return the schema of the dataset.
+
+ Examples:
+ >>> import ray
+ >>> ds = ray.data.range(10)
+ >>> ds.schema()
+ Column Type
+ ------ ----
+ id int64
+
+ Time complexity: O(1)
+
+ Args:
+ fetch_if_missing: If True, synchronously fetch the schema if it's
+ not known. If False, None is returned if the schema is not known.
+ Default is True.
+
+ Returns:
+ The :class:`ray.data.Schema` class of the records, or None if the
+ schema is not known and fetch_if_missing is False.
+ """
+
+ context = self._plan._context
+
+ # First check if the schema is already known from materialized blocks.
+ base_schema = self._plan.schema(fetch_if_missing=False)
+ if base_schema is not None:
+ return Schema(base_schema, data_context=context)
+
+ # Lazily execute only the first block to minimize computation. We achieve this
+ # by appending a Limit[1] operation to a copy of this Dataset, which we then
+ # execute to get its schema.
+ base_schema = self.limit(1)._plan.schema(fetch_if_missing=fetch_if_missing)
+ if base_schema is not None:
+ self._plan.cache_schema(base_schema)
+ return Schema(base_schema, data_context=context)
+ else:
+ return None
+
+ @ConsumptionAPI(
+ if_more_than_read=True,
+ datasource_metadata="schema",
+ extra_condition="or if ``fetch_if_missing=True`` (the default)",
+ pattern="Time complexity:",
+ )
+ @PublicAPI(api_group=IM_API_GROUP)
+ def columns(self, fetch_if_missing: bool = True) -> Optional[List[str]]:
+ """Returns the columns of this Dataset.
+
+ Time complexity: O(1)
+
+ Example:
+ >>> import ray
+ >>> # Create dataset from synthetic data.
+ >>> ds = ray.data.range(1000)
+ >>> ds.columns()
+ ['id']
+
+ Args:
+ fetch_if_missing: If True, synchronously fetch the column names from the
+ schema if it's not known. If False, None is returned if the schema is
+ not known. Default is True.
+
+ Returns:
+ A list of the column names for this Dataset or None if the schema is not
+ known and ``fetch_if_missing`` is False.
+
+ """
+ schema = self.schema(fetch_if_missing=fetch_if_missing)
+ if schema is not None:
+ return schema.names
+ return None
+
+ @PublicAPI(api_group=IM_API_GROUP)
+ def num_blocks(self) -> int:
+ """Return the number of blocks of this :class:`Dataset`.
+
+ This method is only implemented for :class:`~ray.data.MaterializedDataset`,
+ since the number of blocks may dynamically change during execution.
+ For instance, during read and transform operations, Ray Data may dynamically
+ adjust the number of blocks to respect memory limits, increasing the
+ number of blocks at runtime.
+
+ Returns:
+ The number of blocks of this :class:`Dataset`.
+ """
+ raise NotImplementedError(
+ "Number of blocks is only available for `MaterializedDataset`, "
+ "because the number of blocks may dynamically change during execution. "
+ "Call `ds.materialize()` to get a `MaterializedDataset`."
+ )
+
+ @ConsumptionAPI
+ @PublicAPI(api_group=IM_API_GROUP)
+ def size_bytes(self) -> Optional[int]:
+ """Return the in-memory size of the dataset.
+
+ Examples:
+ >>> import ray
+ >>> ds = ray.data.range(10)
+ >>> ds.size_bytes()
+ 80
+
+ Returns:
+ The in-memory size of the dataset in bytes, or None if the
+ in-memory size is not known.
+ """
+ # If the size is known from metadata, return it.
+ size_bytes = self._logical_plan.dag.aggregate_output_metadata().size_bytes
+ if size_bytes is not None:
+ return size_bytes
+
+ metadata = self._plan.execute().metadata
+ if not metadata or metadata[0].size_bytes is None:
+ return None
+ return sum(m.size_bytes for m in metadata)
+
+ @ConsumptionAPI
+ @PublicAPI(api_group=IM_API_GROUP)
+ def input_files(self) -> List[str]:
+ """Return the list of input files for the dataset.
+
+ Examples:
+ >>> import ray
+ >>> ds = ray.data.read_csv("s3://anonymous@ray-example-data/iris.csv")
+ >>> ds.input_files()
+ ['ray-example-data/iris.csv']
+
+ Returns:
+ The list of input files used to create the dataset, or an empty
+ list if the input files are not known.
+ """
+ return list(set(self._plan.input_files()))
+
+ @ConsumptionAPI
+ @PublicAPI(api_group=IOC_API_GROUP)
+ def write_parquet(
+ self,
+ path: str,
+ *,
+ partition_cols: Optional[List[str]] = None,
+ filesystem: Optional["pyarrow.fs.FileSystem"] = None,
+ try_create_dir: bool = True,
+ arrow_open_stream_args: Optional[Dict[str, Any]] = None,
+ filename_provider: Optional[FilenameProvider] = None,
+ arrow_parquet_args_fn: Optional[Callable[[], Dict[str, Any]]] = None,
+ min_rows_per_file: Optional[int] = None,
+ ray_remote_args: Dict[str, Any] = None,
+ concurrency: Optional[int] = None,
+ num_rows_per_file: Optional[int] = None,
+ **arrow_parquet_args,
+ ) -> None:
+ """Writes the :class:`~ray.data.Dataset` to parquet files under the provided ``path``.
+
+ The number of files is determined by the number of blocks in the dataset.
+ To control the number of blocks, call
+ :meth:`~ray.data.Dataset.repartition`.
+
+ If pyarrow can't represent your data, this method errors.
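+
+ For example, you can write Hive-style partitioned files and forward
+ write options such as ``compression`` to ``pyarrow.parquet.write_table()``
+ (an illustrative sketch; any keyword that ``write_table()`` accepts can
+ be passed through ``arrow_parquet_args``):
+
+ >>> import ray
+ >>> ds = ray.data.from_items(
+ ... [{"group": i % 2, "value": i} for i in range(100)])
+ >>> ds.write_parquet( # doctest: +SKIP
+ ... "local:///tmp/partitioned",
+ ... partition_cols=["group"],
+ ... compression="zstd")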
+
+ By default, the format of the output files is ``{uuid}_{block_idx}.parquet``,
+ where ``uuid`` is a unique id for the dataset. To modify this behavior,
+ implement a custom :class:`~ray.data.datasource.FilenameProvider` and pass it in
+ as the ``filename_provider`` argument.
+
+ Examples:
+ >>> import ray
+ >>> ds = ray.data.range(100)
+ >>> ds.write_parquet("local:///tmp/data/")
+
+ Time complexity: O(dataset size / parallelism)
+
+ Args:
+ path: The path to the destination root directory, where
+ parquet files are written to.
+ partition_cols: Column names by which to partition the dataset.
+ Files are written in Hive partition style.
+ filesystem: The pyarrow filesystem implementation to write to.
+ These filesystems are specified in the
+ `pyarrow docs `_.
+ Specify this if you need to provide specific configurations to the
+ filesystem. By default, the filesystem is automatically selected based
+ on the scheme of the paths. For example, if the path begins with
+ ``s3://``, the ``S3FileSystem`` is used.
+ try_create_dir: If ``True``, attempts to create all directories in the
+ destination path. Does nothing if all directories already
+ exist. Defaults to ``True``.
+ arrow_open_stream_args: kwargs passed to
+ `pyarrow.fs.FileSystem.open_output_stream `_, which is used when
+ opening the file to write to.
+ filename_provider: A :class:`~ray.data.datasource.FilenameProvider`
+ implementation. Use this parameter to customize what your filenames
+ look like.
+ arrow_parquet_args_fn: Callable that returns a dictionary of write
+ arguments that are provided to `pyarrow.parquet.write_table() `_
+ when writing each block to a file. Overrides
+ any duplicate keys from ``arrow_parquet_args``. Use this argument
+ instead of ``arrow_parquet_args`` if any of your write arguments
+ can't be pickled, or if you'd like to lazily resolve the write
+ arguments for each dataset block.
+ min_rows_per_file: [Experimental] The target minimum number of rows to write
+ to each file. If ``None``, Ray Data writes a system-chosen number of
+ rows to each file. If the number of rows per block is larger than the
+ specified value, Ray Data writes the number of rows per block to each file.
+ The specified value is a hint, not a strict limit. Ray Data
+ might write more or fewer rows to each file.
+ ray_remote_args: Kwargs passed to :func:`ray.remote` in the write tasks.
+ concurrency: The maximum number of Ray tasks to run concurrently. Set this
+ to control number of tasks to run concurrently. This doesn't change the
+ total number of tasks run. By default, concurrency is dynamically
+ decided based on the available resources.
+ num_rows_per_file: [Deprecated] Use min_rows_per_file instead.
+ arrow_parquet_args: Options to pass to
+ `pyarrow.parquet.write_table() `_, which is used to write out each
+ block to a file.
+ """ # noqa: E501 + if arrow_parquet_args_fn is None: + arrow_parquet_args_fn = lambda: {} # noqa: E731 + + if partition_cols and (num_rows_per_file or min_rows_per_file): + raise ValueError( + "Cannot pass num_rows_per_file or min_rows_per_file when partition_cols " + "argument is specified" + ) + + effective_min_rows = _validate_rows_per_file_args( + num_rows_per_file=num_rows_per_file, min_rows_per_file=min_rows_per_file + ) + + datasink = ParquetDatasink( + path, + partition_cols=partition_cols, + arrow_parquet_args_fn=arrow_parquet_args_fn, + arrow_parquet_args=arrow_parquet_args, + min_rows_per_file=effective_min_rows, # Pass through to datasink + filesystem=filesystem, + try_create_dir=try_create_dir, + open_stream_args=arrow_open_stream_args, + filename_provider=filename_provider, + dataset_uuid=self._uuid, + ) + self.write_datasink( + datasink, + ray_remote_args=ray_remote_args, + concurrency=concurrency, + ) + + @ConsumptionAPI + @PublicAPI(api_group=IOC_API_GROUP) + def write_json( + self, + path: str, + *, + filesystem: Optional["pyarrow.fs.FileSystem"] = None, + try_create_dir: bool = True, + arrow_open_stream_args: Optional[Dict[str, Any]] = None, + filename_provider: Optional[FilenameProvider] = None, + pandas_json_args_fn: Optional[Callable[[], Dict[str, Any]]] = None, + min_rows_per_file: Optional[int] = None, + ray_remote_args: Dict[str, Any] = None, + concurrency: Optional[int] = None, + num_rows_per_file: Optional[int] = None, + **pandas_json_args, + ) -> None: + """Writes the :class:`~ray.data.Dataset` to JSON and JSONL files. + + The number of files is determined by the number of blocks in the dataset. + To control the number of number of blocks, call + :meth:`~ray.data.Dataset.repartition`. + + This method is only supported for datasets with records that are convertible to + pandas dataframes. + + By default, the format of the output files is ``{uuid}_{block_idx}.json``, + where ``uuid`` is a unique id for the dataset. To modify this behavior, + implement a custom :class:`~ray.data.datasource.FilenameProvider` and pass it in + as the ``filename_provider`` argument. + + Examples: + Write the dataset as JSON file to a local directory. + + >>> import ray + >>> import pandas as pd + >>> ds = ray.data.from_pandas([pd.DataFrame({"one": [1], "two": ["a"]})]) + >>> ds.write_json("local:///tmp/data") + + Write the dataset as JSONL files to a local directory. + + >>> ds = ray.data.read_json("s3://anonymous@ray-example-data/train.jsonl") + >>> ds.write_json("local:///tmp/data") + + Time complexity: O(dataset size / parallelism) + + Args: + path: The path to the destination root directory, where + the JSON files are written to. + filesystem: The pyarrow filesystem implementation to write to. + These filesystems are specified in the + `pyarrow docs `_. + Specify this if you need to provide specific configurations to the + filesystem. By default, the filesystem is automatically selected based + on the scheme of the paths. For example, if the path begins with + ``s3://``, the ``S3FileSystem`` is used. + try_create_dir: If ``True``, attempts to create all directories in the + destination path. Does nothing if all directories already + exist. Defaults to ``True``. + arrow_open_stream_args: kwargs passed to + `pyarrow.fs.FileSystem.open_output_stream `_, which is used when + opening the file to write to. + filename_provider: A :class:`~ray.data.datasource.FilenameProvider` + implementation. Use this parameter to customize what your filenames + look like. 
+ pandas_json_args_fn: Callable that returns a dictionary of write + arguments that are provided to + `pandas.DataFrame.to_json() `_ + when writing each block to a file. Overrides + any duplicate keys from ``pandas_json_args``. Use this parameter + instead of ``pandas_json_args`` if any of your write arguments + can't be pickled, or if you'd like to lazily resolve the write + arguments for each dataset block. + min_rows_per_file: [Experimental] The target minimum number of rows to write + to each file. If ``None``, Ray Data writes a system-chosen number of + rows to each file. If the number of rows per block is larger than the + specified value, Ray Data writes the number of rows per block to each file. + The specified value is a hint, not a strict limit. Ray Data + might write more or fewer rows to each file. + ray_remote_args: kwargs passed to :func:`ray.remote` in the write tasks. + concurrency: The maximum number of Ray tasks to run concurrently. Set this + to control number of tasks to run concurrently. This doesn't change the + total number of tasks run. By default, concurrency is dynamically + decided based on the available resources. + num_rows_per_file: Deprecated. Use ``min_rows_per_file`` instead. + pandas_json_args: These args are passed to + `pandas.DataFrame.to_json() `_, + which is used under the hood to write out each + :class:`~ray.data.Dataset` block. These + are dict(orient="records", lines=True) by default. + """ + if pandas_json_args_fn is None: + pandas_json_args_fn = lambda: {} # noqa: E731 + + effective_min_rows = _validate_rows_per_file_args( + num_rows_per_file=num_rows_per_file, min_rows_per_file=min_rows_per_file + ) + + datasink = JSONDatasink( + path, + pandas_json_args_fn=pandas_json_args_fn, + pandas_json_args=pandas_json_args, + min_rows_per_file=effective_min_rows, + filesystem=filesystem, + try_create_dir=try_create_dir, + open_stream_args=arrow_open_stream_args, + filename_provider=filename_provider, + dataset_uuid=self._uuid, + ) + self.write_datasink( + datasink, + ray_remote_args=ray_remote_args, + concurrency=concurrency, + ) + + @PublicAPI(stability="alpha", api_group=IOC_API_GROUP) + @ConsumptionAPI + def write_images( + self, + path: str, + column: str, + file_format: str = "png", + *, + filesystem: Optional["pyarrow.fs.FileSystem"] = None, + try_create_dir: bool = True, + arrow_open_stream_args: Optional[Dict[str, Any]] = None, + filename_provider: Optional[FilenameProvider] = None, + ray_remote_args: Dict[str, Any] = None, + concurrency: Optional[int] = None, + ) -> None: + """Writes the :class:`~ray.data.Dataset` to images. + + Examples: + >>> import ray + >>> ds = ray.data.read_images("s3://anonymous@ray-example-data/image-datasets/simple") + >>> ds.write_images("local:///tmp/images", column="image") + + Time complexity: O(dataset size / parallelism) + + Args: + path: The path to the destination root directory, where + the images are written to. + column: The column containing the data you want to write to images. + file_format: The image file format to write with. For available options, + see `Image file formats `_. + filesystem: The pyarrow filesystem implementation to write to. + These filesystems are specified in the + `pyarrow docs `_. + Specify this if you need to provide specific configurations to the + filesystem. By default, the filesystem is automatically selected based + on the scheme of the paths. For example, if the path begins with + ``s3://``, the ``S3FileSystem`` is used. 
+ try_create_dir: If ``True``, attempts to create all directories in the
+ destination path. Does nothing if all directories already
+ exist. Defaults to ``True``.
+ arrow_open_stream_args: kwargs passed to
+ `pyarrow.fs.FileSystem.open_output_stream `_, which is used when
+ opening the file to write to.
+ filename_provider: A :class:`~ray.data.datasource.FilenameProvider`
+ implementation. Use this parameter to customize what your filenames
+ look like.
+ ray_remote_args: kwargs passed to :func:`ray.remote` in the write tasks.
+ concurrency: The maximum number of Ray tasks to run concurrently. Set this
+ to control number of tasks to run concurrently. This doesn't change the
+ total number of tasks run. By default, concurrency is dynamically
+ decided based on the available resources.
+ """ # noqa: E501
+ datasink = ImageDatasink(
+ path,
+ column,
+ file_format,
+ filesystem=filesystem,
+ try_create_dir=try_create_dir,
+ open_stream_args=arrow_open_stream_args,
+ filename_provider=filename_provider,
+ dataset_uuid=self._uuid,
+ )
+ self.write_datasink(
+ datasink,
+ ray_remote_args=ray_remote_args,
+ concurrency=concurrency,
+ )
+
+ @ConsumptionAPI
+ @PublicAPI(api_group=IOC_API_GROUP)
+ def write_csv(
+ self,
+ path: str,
+ *,
+ filesystem: Optional["pyarrow.fs.FileSystem"] = None,
+ try_create_dir: bool = True,
+ arrow_open_stream_args: Optional[Dict[str, Any]] = None,
+ filename_provider: Optional[FilenameProvider] = None,
+ arrow_csv_args_fn: Optional[Callable[[], Dict[str, Any]]] = None,
+ min_rows_per_file: Optional[int] = None,
+ ray_remote_args: Dict[str, Any] = None,
+ concurrency: Optional[int] = None,
+ num_rows_per_file: Optional[int] = None,
+ **arrow_csv_args,
+ ) -> None:
+ """Writes the :class:`~ray.data.Dataset` to CSV files.
+
+ The number of files is determined by the number of blocks in the dataset.
+ To control the number of blocks, call
+ :meth:`~ray.data.Dataset.repartition`.
+
+ This method is only supported for datasets with records that are convertible to
+ pyarrow tables.
+
+ By default, the format of the output files is ``{uuid}_{block_idx}.csv``,
+ where ``uuid`` is a unique id for the dataset. To modify this behavior,
+ implement a custom :class:`~ray.data.datasource.FilenameProvider`
+ and pass it in as the ``filename_provider`` argument.
+
+ Examples:
+ Write the dataset as CSV files to a local directory.
+
+ >>> import ray
+ >>> ds = ray.data.range(100)
+ >>> ds.write_csv("local:///tmp/data")
+
+ Write the dataset as CSV files to S3.
+
+ >>> import ray
+ >>> ds = ray.data.range(100)
+ >>> ds.write_csv("s3://bucket/folder/") # doctest: +SKIP
+
+ Time complexity: O(dataset size / parallelism)
+
+ Args:
+ path: The path to the destination root directory, where
+ the CSV files are written to.
+ filesystem: The pyarrow filesystem implementation to write to.
+ These filesystems are specified in the
+ `pyarrow docs `_.
+ Specify this if you need to provide specific configurations to the
+ filesystem. By default, the filesystem is automatically selected based
+ on the scheme of the paths. For example, if the path begins with
+ ``s3://``, the ``S3FileSystem`` is used.
+ try_create_dir: If ``True``, attempts to create all directories in the
+ destination path. Does nothing if all directories already
+ exist. Defaults to ``True``.
+ arrow_open_stream_args: kwargs passed to
+ `pyarrow.fs.FileSystem.open_output_stream `_, which is used when
+ opening the file to write to.
+ filename_provider: A :class:`~ray.data.datasource.FilenameProvider`
+ implementation. Use this parameter to customize what your filenames
+ look like.
+ arrow_csv_args_fn: Callable that returns a dictionary of write
+ arguments that are provided to `pyarrow.csv.write_csv `_ when writing each
+ block to a file. Overrides any duplicate keys from ``arrow_csv_args``.
+ Use this argument instead of ``arrow_csv_args`` if any of your write
+ arguments cannot be pickled, or if you'd like to lazily resolve the
+ write arguments for each dataset block.
+ min_rows_per_file: [Experimental] The target minimum number of rows to write
+ to each file. If ``None``, Ray Data writes a system-chosen number of
+ rows to each file. If the number of rows per block is larger than the
+ specified value, Ray Data writes the number of rows per block to each file.
+ The specified value is a hint, not a strict limit. Ray Data
+ might write more or fewer rows to each file.
+ ray_remote_args: kwargs passed to :func:`ray.remote` in the write tasks.
+ concurrency: The maximum number of Ray tasks to run concurrently. Set this
+ to control number of tasks to run concurrently. This doesn't change the
+ total number of tasks run. By default, concurrency is dynamically
+ decided based on the available resources.
+ num_rows_per_file: [Deprecated] Use min_rows_per_file instead.
+ arrow_csv_args: Options to pass to `pyarrow.csv.write_csv `_
+ when writing each block to a file.
+ """
+ if arrow_csv_args_fn is None:
+ arrow_csv_args_fn = lambda: {} # noqa: E731
+
+ effective_min_rows = _validate_rows_per_file_args(
+ num_rows_per_file=num_rows_per_file, min_rows_per_file=min_rows_per_file
+ )
+
+ datasink = CSVDatasink(
+ path,
+ arrow_csv_args_fn=arrow_csv_args_fn,
+ arrow_csv_args=arrow_csv_args,
+ min_rows_per_file=effective_min_rows,
+ filesystem=filesystem,
+ try_create_dir=try_create_dir,
+ open_stream_args=arrow_open_stream_args,
+ filename_provider=filename_provider,
+ dataset_uuid=self._uuid,
+ )
+ self.write_datasink(
+ datasink,
+ ray_remote_args=ray_remote_args,
+ concurrency=concurrency,
+ )
+
+ @ConsumptionAPI
+ @PublicAPI(api_group=IOC_API_GROUP)
+ def write_tfrecords(
+ self,
+ path: str,
+ *,
+ tf_schema: Optional["schema_pb2.Schema"] = None,
+ filesystem: Optional["pyarrow.fs.FileSystem"] = None,
+ try_create_dir: bool = True,
+ arrow_open_stream_args: Optional[Dict[str, Any]] = None,
+ filename_provider: Optional[FilenameProvider] = None,
+ min_rows_per_file: Optional[int] = None,
+ ray_remote_args: Dict[str, Any] = None,
+ concurrency: Optional[int] = None,
+ num_rows_per_file: Optional[int] = None,
+ ) -> None:
+ """Write the :class:`~ray.data.Dataset` to TFRecord files.
+
+ The `TFRecord `_
+ files contain
+ `tf.train.Example `_
+ records, with one Example record for each row in the dataset.
+
+ .. warning::
+ tf.train.Feature only natively stores ints, floats, and bytes,
+ so this function only supports datasets with these data types,
+ and will error if the dataset contains unsupported types.
+
+ The number of files is determined by the number of blocks in the dataset.
+ To control the number of blocks, call
+ :meth:`~ray.data.Dataset.repartition`.
+
+ This method is only supported for datasets with records that are convertible to
+ pyarrow tables.
+
+ By default, the format of the output files is ``{uuid}_{block_idx}.tfrecords``,
+ where ``uuid`` is a unique id for the dataset.
To modify this behavior,
+ implement a custom :class:`~ray.data.datasource.FilenameProvider`
+ and pass it in as the ``filename_provider`` argument.
+
+ Examples:
+ >>> import ray
+ >>> ds = ray.data.range(100)
+ >>> ds.write_tfrecords("local:///tmp/data/")
+
+ Time complexity: O(dataset size / parallelism)
+
+ Args:
+ path: The path to the destination root directory, where tfrecords
+ files are written to.
+ filesystem: The pyarrow filesystem implementation to write to.
+ These filesystems are specified in the
+ `pyarrow docs `_.
+ Specify this if you need to provide specific configurations to the
+ filesystem. By default, the filesystem is automatically selected based
+ on the scheme of the paths. For example, if the path begins with
+ ``s3://``, the ``S3FileSystem`` is used.
+ try_create_dir: If ``True``, attempts to create all directories in the
+ destination path. Does nothing if all directories already
+ exist. Defaults to ``True``.
+ arrow_open_stream_args: kwargs passed to
+ `pyarrow.fs.FileSystem.open_output_stream `_, which is used when
+ opening the file to write to.
+ filename_provider: A :class:`~ray.data.datasource.FilenameProvider`
+ implementation. Use this parameter to customize what your filenames
+ look like.
+ min_rows_per_file: [Experimental] The target minimum number of rows to write
+ to each file. If ``None``, Ray Data writes a system-chosen number of
+ rows to each file. If the number of rows per block is larger than the
+ specified value, Ray Data writes the number of rows per block to each file.
+ The specified value is a hint, not a strict limit. Ray Data
+ might write more or fewer rows to each file.
+ ray_remote_args: kwargs passed to :func:`ray.remote` in the write tasks.
+ concurrency: The maximum number of Ray tasks to run concurrently. Set this
+ to control number of tasks to run concurrently. This doesn't change the
+ total number of tasks run. By default, concurrency is dynamically
+ decided based on the available resources.
+ num_rows_per_file: [Deprecated] Use min_rows_per_file instead.
+ """
+ effective_min_rows = _validate_rows_per_file_args(
+ num_rows_per_file=num_rows_per_file, min_rows_per_file=min_rows_per_file
+ )
+
+ datasink = TFRecordDatasink(
+ path=path,
+ tf_schema=tf_schema,
+ min_rows_per_file=effective_min_rows,
+ filesystem=filesystem,
+ try_create_dir=try_create_dir,
+ open_stream_args=arrow_open_stream_args,
+ filename_provider=filename_provider,
+ dataset_uuid=self._uuid,
+ )
+ self.write_datasink(
+ datasink,
+ ray_remote_args=ray_remote_args,
+ concurrency=concurrency,
+ )
+
+ @ConsumptionAPI
+ @PublicAPI(stability="alpha", api_group=IOC_API_GROUP)
+ def write_webdataset(
+ self,
+ path: str,
+ *,
+ filesystem: Optional["pyarrow.fs.FileSystem"] = None,
+ try_create_dir: bool = True,
+ arrow_open_stream_args: Optional[Dict[str, Any]] = None,
+ filename_provider: Optional[FilenameProvider] = None,
+ min_rows_per_file: Optional[int] = None,
+ ray_remote_args: Dict[str, Any] = None,
+ encoder: Optional[Union[bool, str, callable, list]] = True,
+ concurrency: Optional[int] = None,
+ num_rows_per_file: Optional[int] = None,
+ ) -> None:
+ """Writes the dataset to `WebDataset `_ files.
+
+ The WebDataset files are tar archives in which each row of the dataset
+ is stored as a group of entries that share a common key.
+
+ .. warning::
+ Rows must be encodable as WebDataset tar entries (for example,
+ bytes or strings, or types handled by the configured ``encoder``),
+ and this function will error if the dataset contains unsupported
+ types.
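+
+ For example, WebDataset-style rows conventionally pair a ``"__key__"``
+ column naming each sample with one column per file extension (a sketch
+ following the WebDataset convention; adapt the columns to your encoder):
+
+ .. testcode::
+ :skipif: True
+
+ import ray
+
+ ds = ray.data.from_items([
+ {"__key__": f"sample{i:06d}", "txt": f"hello {i}"}
+ for i in range(10)
+ ])
+ ds.write_webdataset("local:///tmp/wds/")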
+
+ This is only supported for datasets convertible to Arrow records.
+ To control the number of files, use :meth:`Dataset.repartition`.
+
+ Unless a custom filename provider is given, the format of the output
+ files is ``{uuid}_{block_idx}.tar``, where ``uuid`` is a unique id
+ for the dataset.
+
+ Examples:
+
+ .. testcode::
+ :skipif: True
+
+ import ray
+
+ ds = ray.data.range(100)
+ ds.write_webdataset("s3://bucket/folder/")
+
+ Time complexity: O(dataset size / parallelism)
+
+ Args:
+ path: The path to the destination root directory, where the tar
+ files are written to.
+ filesystem: The filesystem implementation to write to.
+ try_create_dir: If ``True``, attempts to create all
+ directories in the destination path. Does nothing if all directories
+ already exist. Defaults to ``True``.
+ arrow_open_stream_args: kwargs passed to
+ ``pyarrow.fs.FileSystem.open_output_stream``
+ filename_provider: A :class:`~ray.data.datasource.FilenameProvider`
+ implementation. Use this parameter to customize what your filenames
+ look like.
+ min_rows_per_file: [Experimental] The target minimum number of rows to write
+ to each file. If ``None``, Ray Data writes a system-chosen number of
+ rows to each file. If the number of rows per block is larger than the
+ specified value, Ray Data writes the number of rows per block to each file.
+ The specified value is a hint, not a strict limit. Ray Data
+ might write more or fewer rows to each file.
+ ray_remote_args: Kwargs passed to :func:`ray.remote` in the write tasks.
+ concurrency: The maximum number of Ray tasks to run concurrently. Set this
+ to control number of tasks to run concurrently. This doesn't change the
+ total number of tasks run. By default, concurrency is dynamically
+ decided based on the available resources.
+ num_rows_per_file: [Deprecated] Use min_rows_per_file instead.
+ """
+ effective_min_rows = _validate_rows_per_file_args(
+ num_rows_per_file=num_rows_per_file, min_rows_per_file=min_rows_per_file
+ )
+
+ datasink = WebDatasetDatasink(
+ path,
+ encoder=encoder,
+ min_rows_per_file=effective_min_rows,
+ filesystem=filesystem,
+ try_create_dir=try_create_dir,
+ open_stream_args=arrow_open_stream_args,
+ filename_provider=filename_provider,
+ dataset_uuid=self._uuid,
+ )
+ self.write_datasink(
+ datasink,
+ ray_remote_args=ray_remote_args,
+ concurrency=concurrency,
+ )
+
+ @ConsumptionAPI
+ @PublicAPI(api_group=IOC_API_GROUP)
+ def write_numpy(
+ self,
+ path: str,
+ *,
+ column: str,
+ filesystem: Optional["pyarrow.fs.FileSystem"] = None,
+ try_create_dir: bool = True,
+ arrow_open_stream_args: Optional[Dict[str, Any]] = None,
+ filename_provider: Optional[FilenameProvider] = None,
+ min_rows_per_file: Optional[int] = None,
+ ray_remote_args: Dict[str, Any] = None,
+ concurrency: Optional[int] = None,
+ num_rows_per_file: Optional[int] = None,
+ ) -> None:
+ """Writes a column of the :class:`~ray.data.Dataset` to .npy files.
+
+ This is only supported for columns in the datasets that can be converted to
+ NumPy arrays.
+
+ The number of files is determined by the number of blocks in the dataset.
+ To control the number of blocks, call
+ :meth:`~ray.data.Dataset.repartition`.
+
+ By default, the format of the output files is ``{uuid}_{block_idx}.npy``,
+ where ``uuid`` is a unique id for the dataset. To modify this behavior,
+ implement a custom :class:`~ray.data.datasource.FilenameProvider`
+ and pass it in as the ``filename_provider`` argument.
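+
+ For example, a minimal sketch of a custom filename provider. This assumes
+ the ``get_filename_for_block`` hook; the exact signature varies across Ray
+ versions (newer versions also pass a ``write_uuid``), so check the
+ :class:`~ray.data.datasource.FilenameProvider` interface for your version.
+
+ .. testcode::
+     :skipif: True
+
+     import ray
+     from ray.data.datasource import FilenameProvider
+
+     class NumberedFilenameProvider(FilenameProvider):
+         # Assumed hook; return a unique filename per (task, block) pair.
+         def get_filename_for_block(self, block, task_index, block_index):
+             return f"shard-{task_index:06d}-{block_index:06d}.npy"
+
+     ds = ray.data.range(100)
+     ds.write_numpy(
+         "local:///tmp/data/",
+         column="id",
+         filename_provider=NumberedFilenameProvider(),
+     )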
+ + Examples: + >>> import ray + >>> ds = ray.data.range(100) + >>> ds.write_numpy("local:///tmp/data/", column="id") + + Time complexity: O(dataset size / parallelism) + + Args: + path: The path to the destination root directory, where + the npy files are written to. + column: The name of the column that contains the data to + be written. + filesystem: The pyarrow filesystem implementation to write to. + These filesystems are specified in the + `pyarrow docs `_. + Specify this if you need to provide specific configurations to the + filesystem. By default, the filesystem is automatically selected based + on the scheme of the paths. For example, if the path begins with + ``s3://``, the ``S3FileSystem`` is used. + try_create_dir: If ``True``, attempts to create all directories in + destination path. Does nothing if all directories already + exist. Defaults to ``True``. + arrow_open_stream_args: kwargs passed to + `pyarrow.fs.FileSystem.open_output_stream `_, which is used when + opening the file to write to. + filename_provider: A :class:`~ray.data.datasource.FilenameProvider` + implementation. Use this parameter to customize what your filenames + look like. + min_rows_per_file: [Experimental] The target minimum number of rows to write + to each file. If ``None``, Ray Data writes a system-chosen number of + rows to each file. If the number of rows per block is larger than the + specified value, Ray Data writes the number of rows per block to each file. + The specified value is a hint, not a strict limit. Ray Data + might write more or fewer rows to each file. + ray_remote_args: kwargs passed to :func:`ray.remote` in the write tasks. + concurrency: The maximum number of Ray tasks to run concurrently. Set this + to control number of tasks to run concurrently. This doesn't change the + total number of tasks run. By default, concurrency is dynamically + decided based on the available resources. + num_rows_per_file: [Deprecated] Use min_rows_per_file instead. + """ + effective_min_rows = _validate_rows_per_file_args( + num_rows_per_file=num_rows_per_file, min_rows_per_file=min_rows_per_file + ) + + datasink = NumpyDatasink( + path, + column, + min_rows_per_file=effective_min_rows, + filesystem=filesystem, + try_create_dir=try_create_dir, + open_stream_args=arrow_open_stream_args, + filename_provider=filename_provider, + dataset_uuid=self._uuid, + ) + self.write_datasink( + datasink, + ray_remote_args=ray_remote_args, + concurrency=concurrency, + ) + + @ConsumptionAPI + def write_sql( + self, + sql: str, + connection_factory: Callable[[], Connection], + ray_remote_args: Optional[Dict[str, Any]] = None, + concurrency: Optional[int] = None, + ) -> None: + """Write to a database that provides a + `Python DB API2-compliant `_ connector. + + .. note:: + + This method writes data in parallel using the DB API2 ``executemany`` + method. To learn more about this method, see + `PEP 249 `_. + + Examples: + + .. testcode:: + + import sqlite3 + import ray + + connection = sqlite3.connect("example.db") + connection.cursor().execute("CREATE TABLE movie(title, year, score)") + dataset = ray.data.from_items([ + {"title": "Monty Python and the Holy Grail", "year": 1975, "score": 8.2}, + {"title": "And Now for Something Completely Different", "year": 1971, "score": 7.5} + ]) + + dataset.write_sql( + "INSERT INTO movie VALUES(?, ?, ?)", lambda: sqlite3.connect("example.db") + ) + + result = connection.cursor().execute("SELECT * FROM movie ORDER BY year") + print(result.fetchall()) + + .. 
testoutput::
+
+ [('And Now for Something Completely Different', 1971, 7.5), ('Monty Python and the Holy Grail', 1975, 8.2)]
+
+ .. testcode::
+ :hide:
+
+ import os
+ os.remove("example.db")
+
+ Args:
+ sql: An ``INSERT INTO`` statement that specifies the table to write to. The
+ number of parameters must match the number of columns in the table.
+ connection_factory: A function that takes no arguments and returns a
+ Python DB API2
+ `Connection object `_.
+ ray_remote_args: Keyword arguments passed to :func:`ray.remote` in the
+ write tasks.
+ concurrency: The maximum number of Ray tasks to run concurrently. Set this
+ to control number of tasks to run concurrently. This doesn't change the
+ total number of tasks run. By default, concurrency is dynamically
+ decided based on the available resources.
+ """ # noqa: E501
+ datasink = SQLDatasink(sql=sql, connection_factory=connection_factory)
+ self.write_datasink(
+ datasink,
+ ray_remote_args=ray_remote_args,
+ concurrency=concurrency,
+ )
+
+ @PublicAPI(stability="alpha", api_group=IOC_API_GROUP)
+ @ConsumptionAPI
+ def write_mongo(
+ self,
+ uri: str,
+ database: str,
+ collection: str,
+ ray_remote_args: Dict[str, Any] = None,
+ concurrency: Optional[int] = None,
+ ) -> None:
+ """Writes the :class:`~ray.data.Dataset` to a MongoDB database.
+
+ This method is only supported for datasets convertible to pyarrow tables.
+
+ The number of parallel writes is determined by the number of blocks in the
+ dataset. To control the number of blocks, call
+ :meth:`~ray.data.Dataset.repartition`.
+
+ .. warning::
+ This method supports only a subset of PyArrow's types, due to
+ limitations of pymongoarrow, which is used underneath. Writing unsupported
+ types fails on type checking. See all the supported types at:
+ https://mongo-arrow.readthedocs.io/en/latest/data_types.html.
+
+ .. note::
+ The records are inserted into MongoDB as new documents. If a record has
+ an ``_id`` field, this ``_id`` must not already exist in MongoDB; otherwise
+ the write is rejected and fails (hence preexisting documents are protected
+ from being mutated). It's fine for a record to omit the ``_id`` field, in
+ which case MongoDB auto-generates one at insertion.
+
+ Examples:
+
+ .. testcode::
+ :skipif: True
+
+ import ray
+
+ ds = ray.data.range(100)
+ ds.write_mongo(
+ uri="mongodb://username:password@mongodb0.example.com:27017/?authSource=admin",
+ database="my_db",
+ collection="my_collection"
+ )
+
+ Args:
+ uri: The URI to the destination MongoDB where the dataset is
+ written to. For the URI format, see details in the
+ `MongoDB docs `_.
+ database: The name of the database. This database must exist; otherwise
+ a ValueError is raised.
+ collection: The name of the collection in the database. This collection
+ must exist; otherwise a ValueError is raised.
+ ray_remote_args: kwargs passed to :func:`ray.remote` in the write tasks.
+ concurrency: The maximum number of Ray tasks to run concurrently. Set this
+ to control number of tasks to run concurrently. This doesn't change the
+ total number of tasks run. By default, concurrency is dynamically
+ decided based on the available resources.
+
+ Raises:
+ ValueError: if ``database`` doesn't exist.
+ ValueError: if ``collection`` doesn't exist.
+ """ + datasink = MongoDatasink( + uri=uri, + database=database, + collection=collection, + ) + self.write_datasink( + datasink, + ray_remote_args=ray_remote_args, + concurrency=concurrency, + ) + + @ConsumptionAPI + def write_bigquery( + self, + project_id: str, + dataset: str, + max_retry_cnt: int = 10, + overwrite_table: Optional[bool] = True, + ray_remote_args: Dict[str, Any] = None, + concurrency: Optional[int] = None, + ) -> None: + """Write the dataset to a BigQuery dataset table. + + To control the number of parallel write tasks, use ``.repartition()`` + before calling this method. + + Examples: + .. testcode:: + :skipif: True + + import ray + import pandas as pd + + docs = [{"title": "BigQuery Datasource test"} for key in range(4)] + ds = ray.data.from_pandas(pd.DataFrame(docs)) + ds.write_bigquery( + project_id="my_project_id", + dataset="my_dataset_table", + overwrite_table=True + ) + + Args: + project_id: The name of the associated Google Cloud Project that hosts + the dataset to read. For more information, see details in + `Creating and managing projects `_. + dataset: The name of the dataset in the format of ``dataset_id.table_id``. + The dataset is created if it doesn't already exist. + max_retry_cnt: The maximum number of retries that an individual block write + is retried due to BigQuery rate limiting errors. This isn't + related to Ray fault tolerance retries. The default number of retries + is 10. + overwrite_table: Whether the write will overwrite the table if it already + exists. The default behavior is to overwrite the table. + ``overwrite_table=False`` will append to the table if it exists. + ray_remote_args: Kwargs passed to :func:`ray.remote` in the write tasks. + concurrency: The maximum number of Ray tasks to run concurrently. Set this + to control number of tasks to run concurrently. This doesn't change the + total number of tasks run. By default, concurrency is dynamically + decided based on the available resources. + """ # noqa: E501 + if ray_remote_args is None: + ray_remote_args = {} + + # Each write task will launch individual remote tasks to write each block + # To avoid duplicate block writes, the write task should not be retried + if ray_remote_args.get("max_retries", 0) != 0: + warnings.warn( + "The max_retries of a BigQuery Write Task should be set to 0" + " to avoid duplicate writes." + ) + else: + ray_remote_args["max_retries"] = 0 + + datasink = BigQueryDatasink( + project_id=project_id, + dataset=dataset, + max_retry_cnt=max_retry_cnt, + overwrite_table=overwrite_table, + ) + self.write_datasink( + datasink, + ray_remote_args=ray_remote_args, + concurrency=concurrency, + ) + + @ConsumptionAPI(pattern="Time complexity:") + def write_datasink( + self, + datasink: Datasink, + *, + ray_remote_args: Dict[str, Any] = None, + concurrency: Optional[int] = None, + ) -> None: + """Writes the dataset to a custom :class:`~ray.data.Datasink`. + + Time complexity: O(dataset size / parallelism) + + Args: + datasink: The :class:`~ray.data.Datasink` to write to. + ray_remote_args: Kwargs passed to :func:`ray.remote` in the write tasks. + concurrency: The maximum number of Ray tasks to run concurrently. Set this + to control number of tasks to run concurrently. This doesn't change the + total number of tasks run. By default, concurrency is dynamically + decided based on the available resources. 
+ """ # noqa: E501 + if ray_remote_args is None: + ray_remote_args = {} + + if not datasink.supports_distributed_writes: + if ray.util.client.ray.is_connected(): + raise ValueError( + "If you're using Ray Client, Ray Data won't schedule write tasks " + "on the driver's node." + ) + ray_remote_args["scheduling_strategy"] = NodeAffinitySchedulingStrategy( + ray.get_runtime_context().get_node_id(), + soft=False, + ) + + plan = self._plan.copy() + write_op = Write( + self._logical_plan.dag, + datasink, + ray_remote_args=ray_remote_args, + concurrency=concurrency, + ) + logical_plan = LogicalPlan(write_op, self.context) + + try: + + datasink.on_write_start() + + self._write_ds = Dataset(plan, logical_plan).materialize() + # TODO: Get and handle the blocks with an iterator instead of getting + # everything in a blocking way, so some blocks can be freed earlier. + raw_write_results = ray.get(self._write_ds._plan.execute().block_refs) + write_result = gen_datasink_write_result(raw_write_results) + logger.info( + "Data sink %s finished. %d rows and %s data written.", + datasink.get_name(), + write_result.num_rows, + memory_string(write_result.size_bytes), + ) + datasink.on_write_complete(write_result) + + except Exception as e: + datasink.on_write_failed(e) + raise + + @ConsumptionAPI( + delegate=( + "Calling any of the consumption methods on the returned ``DataIterator``" + ), + pattern="Returns:", + ) + @PublicAPI(api_group=CD_API_GROUP) + def iterator(self) -> DataIterator: + """Return a :class:`~ray.data.DataIterator` over this dataset. + + Don't call this method directly. Use it internally. + + Returns: + A :class:`~ray.data.DataIterator` over this dataset. + """ + return DataIteratorImpl(self) + + @ConsumptionAPI + @PublicAPI(api_group=CD_API_GROUP) + def iter_rows(self) -> Iterable[Dict[str, Any]]: + """Return an iterable over the rows in this dataset. + + Examples: + >>> import ray + >>> for row in ray.data.range(3).iter_rows(): + ... print(row) + {'id': 0} + {'id': 1} + {'id': 2} + + Time complexity: O(1) + + Returns: + An iterable over the rows in this dataset. + """ + return self.iterator().iter_rows() + + @ConsumptionAPI + @PublicAPI(api_group=CD_API_GROUP) + def iter_batches( + self, + *, + prefetch_batches: int = 1, + batch_size: Optional[int] = 256, + batch_format: Optional[str] = "default", + drop_last: bool = False, + local_shuffle_buffer_size: Optional[int] = None, + local_shuffle_seed: Optional[int] = None, + _collate_fn: Optional[Callable[[DataBatch], CollatedData]] = None, + ) -> Iterable[DataBatch]: + """Return an iterable over batches of data. + + This method is useful for model training. + + Examples: + + .. testcode:: + + import ray + + ds = ray.data.read_images("example://image-datasets/simple") + + for batch in ds.iter_batches(batch_size=2, batch_format="numpy"): + print(batch) + + .. testoutput:: + :options: +MOCK + + {'image': array([[[[...]]]], dtype=uint8)} + ... + {'image': array([[[[...]]]], dtype=uint8)} + + Time complexity: O(1) + + Args: + prefetch_batches: The number of batches to fetch ahead of the current batch + to fetch. If set to greater than 0, a separate threadpool is used + to fetch the objects to the local node and format the batches. Defaults + to 1. + batch_size: The number of rows in each batch, or ``None`` to use entire + blocks as batches (blocks may contain different numbers of rows). + The final batch may include fewer than ``batch_size`` rows if + ``drop_last`` is ``False``. Defaults to 256. 
+ batch_format: If ``"default"`` or ``"numpy"``, batches are + ``Dict[str, numpy.ndarray]``. If ``"pandas"``, batches are + ``pandas.DataFrame``. + drop_last: Whether to drop the last batch if it's incomplete. + local_shuffle_buffer_size: If not ``None``, the data is randomly shuffled + using a local in-memory shuffle buffer, and this value serves as the + minimum number of rows that must be in the local in-memory shuffle + buffer in order to yield a batch. When there are no more rows to add to + the buffer, the remaining rows in the buffer are drained. + local_shuffle_seed: The seed to use for the local random shuffle. + + Returns: + An iterable over batches of data. + """ + batch_format = _apply_batch_format(batch_format) + return self.iterator().iter_batches( + prefetch_batches=prefetch_batches, + batch_size=batch_size, + batch_format=batch_format, + drop_last=drop_last, + local_shuffle_buffer_size=local_shuffle_buffer_size, + local_shuffle_seed=local_shuffle_seed, + _collate_fn=_collate_fn, + ) + + @ConsumptionAPI + @PublicAPI(api_group=CD_API_GROUP) + def iter_torch_batches( + self, + *, + prefetch_batches: int = 1, + batch_size: Optional[int] = 256, + dtypes: Optional[Union["torch.dtype", Dict[str, "torch.dtype"]]] = None, + device: str = "auto", + collate_fn: Optional[Callable[[Dict[str, np.ndarray]], CollatedData]] = None, + drop_last: bool = False, + local_shuffle_buffer_size: Optional[int] = None, + local_shuffle_seed: Optional[int] = None, + ) -> Iterable[TorchBatchType]: + """Return an iterable over batches of data represented as Torch tensors. + + This iterable yields batches of type ``Dict[str, torch.Tensor]``. + For more flexibility, call :meth:`~Dataset.iter_batches` and manually convert + your data to Torch tensors. + + Examples: + >>> import ray + >>> for batch in ray.data.range( + ... 12, + ... ).iter_torch_batches(batch_size=4): + ... print(batch) + {'id': tensor([0, 1, 2, 3])} + {'id': tensor([4, 5, 6, 7])} + {'id': tensor([ 8, 9, 10, 11])} + + Use the ``collate_fn`` to customize how the tensor batch is created. + + >>> from typing import Any, Dict + >>> import torch + >>> import numpy as np + >>> import ray + >>> def collate_fn(batch: Dict[str, np.ndarray]) -> Any: + ... return torch.stack( + ... [torch.as_tensor(array) for array in batch.values()], + ... axis=1 + ... ) + >>> dataset = ray.data.from_items([ + ... {"col_1": 1, "col_2": 2}, + ... {"col_1": 3, "col_2": 4}]) + >>> for batch in dataset.iter_torch_batches(collate_fn=collate_fn): + ... print(batch) + tensor([[1, 2], + [3, 4]]) + + + Time complexity: O(1) + + Args: + prefetch_batches: The number of batches to fetch ahead of the current batch + to fetch. If set to greater than 0, a separate threadpool is used + to fetch the objects to the local node, format the batches, and apply + the ``collate_fn``. Defaults to 1. + batch_size: The number of rows in each batch, or ``None`` to use entire + blocks as batches (blocks may contain different number of rows). + The final batch may include fewer than ``batch_size`` rows if + ``drop_last`` is ``False``. Defaults to 256. + dtypes: The Torch dtype(s) for the created tensor(s); if ``None``, the dtype + is inferred from the tensor data. You can't use this parameter with + ``collate_fn``. + device: The device on which the tensor should be placed. Defaults to + "auto" which moves the tensors to the appropriate device when the + Dataset is passed to Ray Train and ``collate_fn`` is not provided. + Otherwise, defaults to CPU. 
You can't use this parameter with + ``collate_fn``. + collate_fn: A function to convert a Numpy batch to a PyTorch tensor batch. + When this parameter is specified, the user should manually handle the + host to device data transfer outside of collate_fn. + This is useful for further processing the data after it has been + batched. Potential use cases include collating along a dimension other + than the first, padding sequences of various lengths, or generally + handling batches of different length tensors. If not provided, the + default collate function is used which simply converts the batch of + numpy arrays to a batch of PyTorch tensors. This API is still + experimental and is subject to change. You can't use this parameter in + conjunction with ``dtypes`` or ``device``. + drop_last: Whether to drop the last batch if it's incomplete. + local_shuffle_buffer_size: If not ``None``, the data is randomly shuffled + using a local in-memory shuffle buffer, and this value serves as the + minimum number of rows that must be in the local in-memory shuffle + buffer in order to yield a batch. When there are no more rows to add to + the buffer, the remaining rows in the buffer are drained. + ``batch_size`` must also be specified when using local shuffling. + local_shuffle_seed: The seed to use for the local random shuffle. + + Returns: + An iterable over Torch Tensor batches. + + .. seealso:: + :meth:`Dataset.iter_batches` + Call this method to manually convert your data to Torch tensors. + """ # noqa: E501 + return self.iterator().iter_torch_batches( + prefetch_batches=prefetch_batches, + batch_size=batch_size, + dtypes=dtypes, + device=device, + collate_fn=collate_fn, + drop_last=drop_last, + local_shuffle_buffer_size=local_shuffle_buffer_size, + local_shuffle_seed=local_shuffle_seed, + ) + + @ConsumptionAPI + @Deprecated + def iter_tf_batches( + self, + *, + prefetch_batches: int = 1, + batch_size: Optional[int] = 256, + dtypes: Optional[Union["tf.dtypes.DType", Dict[str, "tf.dtypes.DType"]]] = None, + drop_last: bool = False, + local_shuffle_buffer_size: Optional[int] = None, + local_shuffle_seed: Optional[int] = None, + ) -> Iterable[TensorFlowTensorBatchType]: + """Return an iterable over batches of data represented as TensorFlow tensors. + + This iterable yields batches of type ``Dict[str, tf.Tensor]``. + For more flexibility, call :meth:`~Dataset.iter_batches` and manually convert + your data to TensorFlow tensors. + + .. tip:: + If you don't need the additional flexibility provided by this method, + consider using :meth:`~ray.data.Dataset.to_tf` instead. It's easier + to use. + + Examples: + + .. testcode:: + + import ray + + ds = ray.data.read_csv("s3://anonymous@air-example-data/iris.csv") + + tf_dataset = ds.to_tf( + feature_columns="sepal length (cm)", + label_columns="target", + batch_size=2 + ) + for features, labels in tf_dataset: + print(features, labels) + + .. testoutput:: + + tf.Tensor([5.1 4.9], shape=(2,), dtype=float64) tf.Tensor([0 0], shape=(2,), dtype=int64) + ... + tf.Tensor([6.2 5.9], shape=(2,), dtype=float64) tf.Tensor([2 2], shape=(2,), dtype=int64) + + Time complexity: O(1) + + Args: + prefetch_batches: The number of batches to fetch ahead of the current batch + to fetch. If set to greater than 0, a separate threadpool is used + to fetch the objects to the local node, format the batches, and apply + the ``collate_fn``. Defaults to 1. 
+ batch_size: The number of rows in each batch, or ``None`` to use entire
+ blocks as batches (blocks may contain different numbers of rows).
+ The final batch may include fewer than ``batch_size`` rows if
+ ``drop_last`` is ``False``. Defaults to 256.
+ dtypes: The TensorFlow dtype(s) for the created tensor(s); if ``None``, the
+ dtype is inferred from the tensor data.
+ drop_last: Whether to drop the last batch if it's incomplete.
+ local_shuffle_buffer_size: If not ``None``, the data is randomly shuffled
+ using a local in-memory shuffle buffer, and this value serves as the
+ minimum number of rows that must be in the local in-memory shuffle
+ buffer in order to yield a batch. When there are no more rows to add to
+ the buffer, the remaining rows in the buffer are drained.
+ ``batch_size`` must also be specified when using local shuffling.
+ local_shuffle_seed: The seed to use for the local random shuffle.
+
+ Returns:
+ An iterable over TensorFlow Tensor batches.
+
+ .. seealso::
+ :meth:`Dataset.iter_batches`
+ Call this method to manually convert your data to TensorFlow tensors.
+ """ # noqa: E501
+ warnings.warn(
+ "`iter_tf_batches` is deprecated and will be removed after May 2025. Use "
+ "`to_tf` instead.",
+ DeprecationWarning,
+ )
+ return self.iterator().iter_tf_batches(
+ prefetch_batches=prefetch_batches,
+ batch_size=batch_size,
+ dtypes=dtypes,
+ drop_last=drop_last,
+ local_shuffle_buffer_size=local_shuffle_buffer_size,
+ local_shuffle_seed=local_shuffle_seed,
+ )
+
+ @ConsumptionAPI(pattern="Time complexity:")
+ @Deprecated
+ def to_torch(
+ self,
+ *,
+ label_column: Optional[str] = None,
+ feature_columns: Optional[
+ Union[List[str], List[List[str]], Dict[str, List[str]]]
+ ] = None,
+ label_column_dtype: Optional["torch.dtype"] = None,
+ feature_column_dtypes: Optional[
+ Union["torch.dtype", List["torch.dtype"], Dict[str, "torch.dtype"]]
+ ] = None,
+ batch_size: int = 1,
+ prefetch_batches: int = 1,
+ drop_last: bool = False,
+ local_shuffle_buffer_size: Optional[int] = None,
+ local_shuffle_seed: Optional[int] = None,
+ unsqueeze_label_tensor: bool = True,
+ unsqueeze_feature_tensors: bool = True,
+ ) -> "torch.utils.data.IterableDataset":
+ """Return a
+ `Torch IterableDataset `_
+ over this :class:`~ray.data.Dataset`.
+
+ This is only supported for datasets convertible to Arrow records.
+
+ It is recommended to use the returned ``IterableDataset`` directly
+ instead of passing it into a torch ``DataLoader``.
+
+ Each element in ``IterableDataset`` is a tuple consisting of 2
+ elements. The first item contains the feature tensor(s), and the
+ second item is the label tensor. Those can take on different
+ forms, depending on the specified arguments.
+
+ For the features tensor (N is the ``batch_size`` and n, m, k
+ are the number of features per tensor):
+
+ * If ``feature_columns`` is a ``List[str]``, the features are
+ a tensor of shape (N, n), with columns corresponding to
+ ``feature_columns``
+
+ * If ``feature_columns`` is a ``List[List[str]]``, the features are
+ a list of tensors of shape [(N, m),...,(N, k)], with columns of each
+ tensor corresponding to the elements of ``feature_columns``
+
+ * If ``feature_columns`` is a ``Dict[str, List[str]]``, the features
+ are a dict of key-tensor pairs of shape
+ {key1: (N, m),..., keyN: (N, k)}, with columns of each
+ tensor corresponding to the value of ``feature_columns`` under the
+ key.
+
+ If ``unsqueeze_label_tensor=True`` (default), the label tensor is
+ of shape (N, 1). Otherwise, it is of shape (N,).
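+
+ For example, a minimal sketch with the defaults above (``to_torch`` is
+ deprecated; prefer :meth:`Dataset.iter_torch_batches` for new code):
+
+ .. testcode::
+     :skipif: True
+
+     import ray
+
+     ds = ray.data.from_items([{"x": i, "y": 2 * i} for i in range(8)])
+     # One feature column ("x") and label column "y": each element is a
+     # ((N, 1) features tensor, (N, 1) label tensor) pair with N=4.
+     for features, label in ds.to_torch(label_column="y", batch_size=4):
+         print(features.shape, label.shape)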
+ If ``label_column`` is specified as ``None``, then no column from the
+ ``Dataset`` is treated as the label, and the output label tensor
+ is ``None``.
+
+ Note that you probably want to call :meth:`Dataset.split` on this dataset if
+ there are to be multiple Torch workers consuming the data.
+
+ Time complexity: O(1)
+
+ Args:
+ label_column: The name of the column used as the
+ label (second element of the output list). Can be None for
+ prediction, in which case the second element of the returned
+ tuple will also be None.
+ feature_columns: The names of the columns
+ to use as the features. Can be a list of lists or
+ a dict of string-list pairs for multi-tensor output.
+ If ``None``, then use all columns except the label column as
+ the features.
+ label_column_dtype: The torch dtype to
+ use for the label column. If ``None``, then automatically infer
+ the dtype.
+ feature_column_dtypes: The dtypes to use for the feature
+ tensors. This should match the format of ``feature_columns``,
+ or be a single dtype, in which case it is applied to
+ all tensors. If ``None``, then automatically infer the dtype.
+ batch_size: How many samples per batch to yield at a time.
+ Defaults to 1.
+ prefetch_batches: The number of batches to prefetch ahead of the
+ current batch. If set to greater than 0, a separate threadpool is used
+ to fetch the objects to the local node, format the batches, and apply
+ the collate_fn. Defaults to 1.
+ drop_last: Set to True to drop the last incomplete batch,
+ if the dataset size is not divisible by the batch size. If
+ False and the size of the stream is not divisible by the batch
+ size, then the last batch is smaller. Defaults to False.
+ local_shuffle_buffer_size: If non-None, the data is randomly shuffled
+ using a local in-memory shuffle buffer, and this value will serve as the
+ minimum number of rows that must be in the local in-memory shuffle
+ buffer in order to yield a batch. When there are no more rows to add to
+ the buffer, the remaining rows in the buffer are drained. This
+ buffer size must be greater than or equal to ``batch_size``, and
+ therefore ``batch_size`` must also be specified when using local
+ shuffling.
+ local_shuffle_seed: The seed to use for the local random shuffle.
+ unsqueeze_label_tensor: If set to True, the label tensor
+ is unsqueezed (reshaped to (N, 1)). Otherwise, it will
+ be left as is, that is (N, ). In general, regression loss
+ functions expect an unsqueezed tensor, while classification
+ loss functions expect a squeezed one. Defaults to True.
+ unsqueeze_feature_tensors: If set to True, the features tensors
+ are unsqueezed (reshaped to (N, 1)) before being concatenated into
+ the final features tensor. Otherwise, they are left as is, that is
+ (N, ). Defaults to True.
+
+ Returns:
+ A `Torch IterableDataset`_.
+ """ # noqa: E501
+ warnings.warn(
+ "`to_torch` is deprecated and will be removed after May 2025. 
Use " + "`iter_torch_batches` instead.", + DeprecationWarning, + ) + return self.iterator().to_torch( + label_column=label_column, + feature_columns=feature_columns, + label_column_dtype=label_column_dtype, + feature_column_dtypes=feature_column_dtypes, + batch_size=batch_size, + prefetch_batches=prefetch_batches, + drop_last=drop_last, + local_shuffle_buffer_size=local_shuffle_buffer_size, + local_shuffle_seed=local_shuffle_seed, + unsqueeze_label_tensor=unsqueeze_label_tensor, + unsqueeze_feature_tensors=unsqueeze_feature_tensors, + ) + + @ConsumptionAPI + @PublicAPI(api_group=IOC_API_GROUP) + def to_tf( + self, + feature_columns: Union[str, List[str]], + label_columns: Union[str, List[str]], + *, + additional_columns: Union[str, List[str]] = None, + prefetch_batches: int = 1, + batch_size: int = 1, + drop_last: bool = False, + local_shuffle_buffer_size: Optional[int] = None, + local_shuffle_seed: Optional[int] = None, + feature_type_spec: Union["tf.TypeSpec", Dict[str, "tf.TypeSpec"]] = None, + label_type_spec: Union["tf.TypeSpec", Dict[str, "tf.TypeSpec"]] = None, + additional_type_spec: Union["tf.TypeSpec", Dict[str, "tf.TypeSpec"]] = None, + ) -> "tf.data.Dataset": + """Return a `TensorFlow Dataset `_ + over this :class:`~ray.data.Dataset`. + + .. warning:: + If your :class:`~ray.data.Dataset` contains ragged tensors, this method errors. + To prevent errors, :ref:`resize your tensors `. + + Examples: + >>> import ray + >>> ds = ray.data.read_csv("s3://anonymous@air-example-data/iris.csv") + >>> ds + Dataset( + num_rows=?, + schema={ + sepal length (cm): double, + sepal width (cm): double, + petal length (cm): double, + petal width (cm): double, + target: int64 + } + ) + + If your model accepts a single tensor as input, specify a single feature column. + + >>> ds.to_tf(feature_columns="sepal length (cm)", label_columns="target") + <_OptionsDataset element_spec=(TensorSpec(shape=(None,), dtype=tf.float64, name='sepal length (cm)'), TensorSpec(shape=(None,), dtype=tf.int64, name='target'))> + + If your model accepts a dictionary as input, specify a list of feature columns. + + >>> ds.to_tf(["sepal length (cm)", "sepal width (cm)"], "target") + <_OptionsDataset element_spec=({'sepal length (cm)': TensorSpec(shape=(None,), dtype=tf.float64, name='sepal length (cm)'), 'sepal width (cm)': TensorSpec(shape=(None,), dtype=tf.float64, name='sepal width (cm)')}, TensorSpec(shape=(None,), dtype=tf.int64, name='target'))> + + If your dataset contains multiple features but your model accepts a single + tensor as input, combine features with + :class:`~ray.data.preprocessors.Concatenator`. + + >>> from ray.data.preprocessors import Concatenator + >>> columns_to_concat = ["sepal length (cm)", "sepal width (cm)", "petal length (cm)", "petal width (cm)"] + >>> preprocessor = Concatenator(columns=columns_to_concat, output_column_name="features") + >>> ds = preprocessor.transform(ds) + >>> ds + Concatenator + +- Dataset( + num_rows=?, + schema={ + sepal length (cm): double, + sepal width (cm): double, + petal length (cm): double, + petal width (cm): double, + target: int64 + } + ) + >>> ds.to_tf("features", "target") + <_OptionsDataset element_spec=(TensorSpec(shape=(None, 4), dtype=tf.float64, name='features'), TensorSpec(shape=(None,), dtype=tf.int64, name='target'))> + + If your model accepts different types, shapes, or names of tensors as input, specify the type spec. + If type specs are not specified, they are automatically inferred from the schema of the dataset. 
+
+ >>> import tensorflow as tf
+ >>> ds.to_tf(
+ ... feature_columns="features",
+ ... label_columns="target",
+ ... feature_type_spec=tf.TensorSpec(shape=(None, 4), dtype=tf.float32, name="features"),
+ ... label_type_spec=tf.TensorSpec(shape=(None,), dtype=tf.float32, name="label")
+ ... )
+ <_OptionsDataset element_spec=(TensorSpec(shape=(None, 4), dtype=tf.float32, name='features'), TensorSpec(shape=(None,), dtype=tf.float32, name='label'))>
+
+ If your model accepts additional metadata aside from features and label, specify a single additional column or a list of additional columns.
+ A common use case is to include sample weights in the data samples and train a ``tf.keras.Model`` with ``tf.keras.Model.fit``.
+
+ >>> import pandas as pd
+ >>> ds = ds.add_column("sample weights", lambda df: pd.Series([1] * len(df)))
+ >>> ds.to_tf(feature_columns="features", label_columns="target", additional_columns="sample weights")
+ <_OptionsDataset element_spec=(TensorSpec(shape=(None, 4), dtype=tf.float64, name='features'), TensorSpec(shape=(None,), dtype=tf.int64, name='target'), TensorSpec(shape=(None,), dtype=tf.int64, name='sample weights'))>
+
+ If your model accepts different types, shapes, or names for the additional metadata, specify the type spec of the additional column.
+
+ >>> ds.to_tf(
+ ... feature_columns="features",
+ ... label_columns="target",
+ ... additional_columns="sample weights",
+ ... additional_type_spec=tf.TensorSpec(shape=(None,), dtype=tf.float32, name="weight")
+ ... )
+ <_OptionsDataset element_spec=(TensorSpec(shape=(None, 4), dtype=tf.float64, name='features'), TensorSpec(shape=(None,), dtype=tf.int64, name='target'), TensorSpec(shape=(None,), dtype=tf.float32, name='weight'))>
+
+ Args:
+ feature_columns: Columns that correspond to model inputs. If this is a
+ string, the input data is a tensor. If this is a list, the input data
+ is a ``dict`` that maps column names to their tensor representation.
+ label_columns: Columns that correspond to model targets. If this is a
+ string, the target data is a tensor. If this is a list, the target data
+ is a ``dict`` that maps column names to their tensor representation.
+ additional_columns: Columns that correspond to sample weights or other metadata.
+ If this is a string, the weight data is a tensor. If this is a list, the
+ weight data is a ``dict`` that maps column names to their tensor representation.
+ prefetch_batches: The number of batches to prefetch ahead of the
+ current batch. If set to greater than 0, a separate threadpool is used
+ to fetch the objects to the local node, format the batches, and apply
+ the collate_fn. Defaults to 1.
+ batch_size: Record batch size. Defaults to 1.
+ drop_last: Set to True to drop the last incomplete batch,
+ if the dataset size is not divisible by the batch size. If
+ False and the size of the stream is not divisible by the batch
+ size, then the last batch is smaller. Defaults to False.
+ local_shuffle_buffer_size: If non-None, the data is randomly shuffled
+ using a local in-memory shuffle buffer, and this value will serve as the
+ minimum number of rows that must be in the local in-memory shuffle
+ buffer in order to yield a batch. When there are no more rows to add to
+ the buffer, the remaining rows in the buffer are drained. This
+ buffer size must be greater than or equal to ``batch_size``, and
+ therefore ``batch_size`` must also be specified when using local
+ shuffling.
+ local_shuffle_seed: The seed to use for the local random shuffle.
+ feature_type_spec: The `tf.TypeSpec` of `feature_columns`. If there is + only one column, specify a `tf.TypeSpec`. If there are multiple columns, + specify a ``dict`` that maps column names to their `tf.TypeSpec`. + Default is `None` to automatically infer the type of each column. + label_type_spec: The `tf.TypeSpec` of `label_columns`. If there is + only one column, specify a `tf.TypeSpec`. If there are multiple columns, + specify a ``dict`` that maps column names to their `tf.TypeSpec`. + Default is `None` to automatically infer the type of each column. + additional_type_spec: The `tf.TypeSpec` of `additional_columns`. If there + is only one column, specify a `tf.TypeSpec`. If there are multiple + columns, specify a ``dict`` that maps column names to their `tf.TypeSpec`. + Default is `None` to automatically infer the type of each column. + + Returns: + A `TensorFlow Dataset`_ that yields inputs and targets. + + .. seealso:: + + :meth:`~ray.data.Dataset.iter_tf_batches` + Call this method if you need more flexibility. + """ # noqa: E501 + + return self.iterator().to_tf( + feature_columns=feature_columns, + label_columns=label_columns, + additional_columns=additional_columns, + prefetch_batches=prefetch_batches, + drop_last=drop_last, + batch_size=batch_size, + local_shuffle_buffer_size=local_shuffle_buffer_size, + local_shuffle_seed=local_shuffle_seed, + feature_type_spec=feature_type_spec, + label_type_spec=label_type_spec, + additional_type_spec=additional_type_spec, + ) + + @ConsumptionAPI(pattern="Time complexity:") + @PublicAPI(api_group=IOC_API_GROUP) + def to_dask( + self, + meta: Union[ + "pandas.DataFrame", + "pandas.Series", + Dict[str, Any], + Iterable[Any], + Tuple[Any], + None, + ] = None, + verify_meta: bool = True, + ) -> "dask.dataframe.DataFrame": + """Convert this :class:`~ray.data.Dataset` into a + `Dask DataFrame `_. + + This is only supported for datasets convertible to Arrow records. + + Note that this function will set the Dask scheduler to Dask-on-Ray + globally, via the config. + + Time complexity: O(dataset size / parallelism) + + Args: + meta: An empty `pandas DataFrame`_ or `Series`_ that matches the dtypes and column + names of the stream. This metadata is necessary for many algorithms in + dask dataframe to work. For ease of use, some alternative inputs are + also available. Instead of a DataFrame, a dict of ``{name: dtype}`` or + iterable of ``(name, dtype)`` can be provided (note that the order of + the names should match the order of the columns). Instead of a series, a + tuple of ``(name, dtype)`` can be used. + By default, this is inferred from the underlying Dataset schema, + with this argument supplying an optional override. + verify_meta: If True, Dask will check that the partitions have consistent + metadata. Defaults to True. + + Returns: + A `Dask DataFrame`_ created from this dataset. + + .. _pandas DataFrame: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html + .. 
_Series: https://pandas.pydata.org/docs/reference/api/pandas.Series.html + """ # noqa: E501 + import dask + import dask.dataframe as dd + import pandas as pd + + try: + import pyarrow as pa + except Exception: + pa = None + + from ray.data._internal.pandas_block import PandasBlockSchema + from ray.util.client.common import ClientObjectRef + from ray.util.dask import ray_dask_get + + dask.config.set(scheduler=ray_dask_get) + + @dask.delayed + def block_to_df(block_ref: ObjectRef[Block]) -> pd.DataFrame: + if isinstance(block_ref, (ray.ObjectRef, ClientObjectRef)): + raise ValueError( + "Dataset.to_dask() must be used with Dask-on-Ray, please " + "set the Dask scheduler to ray_dask_get (located in " + "ray.util.dask)." + ) + return _block_to_df(block_ref) + + if meta is None: + from ray.data.extensions import TensorDtype + + # Infer Dask metadata from Dataset schema. + schema = self.schema(fetch_if_missing=True) + if isinstance(schema, PandasBlockSchema): + meta = pd.DataFrame( + { + col: pd.Series( + dtype=( + dtype + if not isinstance(dtype, TensorDtype) + else np.object_ + ) + ) + for col, dtype in zip(schema.names, schema.types) + } + ) + elif pa is not None and isinstance(schema, pa.Schema): + arrow_tensor_ext_types = get_arrow_extension_fixed_shape_tensor_types() + + if any( + isinstance(type_, arrow_tensor_ext_types) for type_ in schema.types + ): + meta = pd.DataFrame( + { + col: pd.Series( + dtype=( + dtype.to_pandas_dtype() + if not isinstance(dtype, arrow_tensor_ext_types) + else np.object_ + ) + ) + for col, dtype in zip(schema.names, schema.types) + } + ) + else: + meta = schema.empty_table().to_pandas() + + dfs = [] + for ref_bundle in self.iter_internal_ref_bundles(): + for block_ref in ref_bundle.block_refs: + dfs.append(block_to_df(block_ref)) + + ddf = dd.from_delayed( + dfs, + meta=meta, + verify_meta=verify_meta, + ) + return ddf + + @ConsumptionAPI(pattern="Time complexity:") + @PublicAPI(api_group=IOC_API_GROUP) + def to_mars(self) -> "mars.dataframe.DataFrame": + """Convert this :class:`~ray.data.Dataset` into a + `Mars DataFrame `_. + + Time complexity: O(dataset size / parallelism) + + Returns: + A `Mars DataFrame`_ created from this dataset. + """ # noqa: E501 + import pandas as pd + import pyarrow as pa + from mars.dataframe.datasource.read_raydataset import DataFrameReadRayDataset + from mars.dataframe.utils import parse_index + + from ray.data._internal.pandas_block import PandasBlockSchema + + refs = self.to_pandas_refs() + # remove this when https://github.com/mars-project/mars/issues/2945 got fixed + schema = self.schema() + if isinstance(schema, Schema): + schema = schema.base_schema + if isinstance(schema, PandasBlockSchema): + dtypes = pd.Series(schema.types, index=schema.names) + elif isinstance(schema, pa.Schema): + dtypes = schema.empty_table().to_pandas().dtypes + else: + raise NotImplementedError(f"Unsupported format of schema {schema}") + index_value = parse_index(pd.RangeIndex(-1)) + columns_value = parse_index(dtypes.index, store_data=True) + op = DataFrameReadRayDataset(refs=refs) + return op(index_value=index_value, columns_value=columns_value, dtypes=dtypes) + + @ConsumptionAPI(pattern="Time complexity:") + @PublicAPI(api_group=IOC_API_GROUP) + def to_modin(self) -> "modin.pandas.dataframe.DataFrame": + """Convert this :class:`~ray.data.Dataset` into a + `Modin DataFrame `_. + + This works by first converting this dataset into a distributed set of + Pandas DataFrames (using :meth:`Dataset.to_pandas_refs`). + See caveats there. 
Then the individual DataFrames are used to + create the Modin DataFrame using + ``modin.distributed.dataframe.pandas.partitions.from_partitions()``. + + This is only supported for datasets convertible to Arrow records. + This function induces a copy of the data. For zero-copy access to the + underlying data, consider using :meth:`.to_arrow_refs` or + :meth:`.iter_internal_ref_bundles`. + + Time complexity: O(dataset size / parallelism) + + Returns: + A `Modin DataFrame`_ created from this dataset. + """ # noqa: E501 + + from modin.distributed.dataframe.pandas.partitions import from_partitions + + pd_objs = self.to_pandas_refs() + return from_partitions(pd_objs, axis=0) + + @ConsumptionAPI(pattern="Time complexity:") + @PublicAPI(api_group=IOC_API_GROUP) + def to_spark(self, spark: "pyspark.sql.SparkSession") -> "pyspark.sql.DataFrame": + """Convert this :class:`~ray.data.Dataset` into a + `Spark DataFrame `_. + + Time complexity: O(dataset size / parallelism) + + Args: + spark: A `SparkSession`_, which must be created by RayDP (Spark-on-Ray). + + Returns: + A `Spark DataFrame`_ created from this dataset. + + .. _SparkSession: https://spark.apache.org/docs/3.1.1/api/python/reference/api/pyspark.sql.SparkSession.html + """ # noqa: E501 + import raydp + + schema = self.schema() + if isinstance(schema, Schema): + schema = schema.base_schema + + ref_bundles = self.iter_internal_ref_bundles() + block_refs = _ref_bundles_iterator_to_block_refs_list(ref_bundles) + return raydp.spark.ray_dataset_to_spark_dataframe(spark, schema, block_refs) + + @ConsumptionAPI(pattern="Time complexity:") + @PublicAPI(api_group=IOC_API_GROUP) + def to_pandas(self, limit: int = None) -> "pandas.DataFrame": + """Convert this :class:`~ray.data.Dataset` to a single pandas DataFrame. + + This method errors if the number of rows exceeds the provided ``limit``. + To truncate the dataset beforehand, call :meth:`.limit`. + + Examples: + >>> import ray + >>> ds = ray.data.from_items([{"a": i} for i in range(3)]) + >>> ds.to_pandas() + a + 0 0 + 1 1 + 2 2 + + Time complexity: O(dataset size) + + Args: + limit: The maximum number of rows to return. An error is + raised if the dataset has more rows than this limit. Defaults to + ``None``, which means no limit. + + Returns: + A pandas DataFrame created from this dataset, containing a limited + number of rows. + + Raises: + ValueError: if the number of rows in the :class:`~ray.data.Dataset` exceeds + ``limit``. + """ + if limit is not None: + count = self.count() + if count > limit: + raise ValueError( + f"the dataset has more than the given limit of {limit} " + f"rows: {count}. If you are sure that a DataFrame with " + f"{count} rows will fit in local memory, set " + "ds.to_pandas(limit=None) to disable limits." + ) + + builder = PandasBlockBuilder() + for batch in self.iter_batches(batch_format="pandas", batch_size=None): + builder.add_block(batch) + block = builder.build() + + # `PandasBlockBuilder` creates a dataframe with internal extension types like + # 'TensorDtype'. We use the `to_pandas` method to convert these extension + # types to regular types. + return BlockAccessor.for_block(block).to_pandas() + + @ConsumptionAPI(pattern="Time complexity:") + @DeveloperAPI + def to_pandas_refs(self) -> List[ObjectRef["pandas.DataFrame"]]: + """Converts this :class:`~ray.data.Dataset` into a distributed set of Pandas + dataframes. + + One DataFrame is created for each block in this Dataset. + + This function induces a copy of the data. 
For zero-copy access to the
+ underlying data, consider using :meth:`Dataset.to_arrow_refs` or
+ :meth:`Dataset.iter_internal_ref_bundles`.
+
+ Examples:
+ >>> import ray
+ >>> ds = ray.data.range(10, override_num_blocks=2)
+ >>> refs = ds.to_pandas_refs()
+ >>> len(refs)
+ 2
+
+ Time complexity: O(dataset size / parallelism)
+
+ Returns:
+ A list of remote pandas DataFrames created from this dataset.
+ """
+
+ block_to_df = cached_remote_fn(_block_to_df)
+ pandas_refs = []
+ for bundle in self.iter_internal_ref_bundles():
+ for block_ref in bundle.block_refs:
+ pandas_refs.append(block_to_df.remote(block_ref))
+ return pandas_refs
+
+ @DeveloperAPI
+ def to_numpy_refs(
+ self, *, column: Optional[str] = None
+ ) -> List[ObjectRef[np.ndarray]]:
+ """Converts this :class:`~ray.data.Dataset` into a distributed set of NumPy
+ ndarrays or dictionary of NumPy ndarrays.
+
+ This is only supported for datasets convertible to NumPy ndarrays.
+ This function induces a copy of the data. For zero-copy access to the
+ underlying data, consider using :meth:`Dataset.to_arrow_refs` or
+ :meth:`Dataset.iter_internal_ref_bundles`.
+
+ Examples:
+ >>> import ray
+ >>> ds = ray.data.range(10, override_num_blocks=2)
+ >>> refs = ds.to_numpy_refs()
+ >>> len(refs)
+ 2
+
+ Time complexity: O(dataset size / parallelism)
+
+ Args:
+ column: The name of the column to convert to numpy. If ``None``, all
+ columns are used, and each returned future represents a dict that
+ maps column names to ndarrays. Defaults to None.
+
+ Returns:
+ A list of remote NumPy ndarrays created from this dataset.
+ """
+ block_to_ndarray = cached_remote_fn(_block_to_ndarray)
+ numpy_refs = []
+ for bundle in self.iter_internal_ref_bundles():
+ for block_ref in bundle.block_refs:
+ numpy_refs.append(block_to_ndarray.remote(block_ref, column=column))
+ return numpy_refs
+
+ @ConsumptionAPI(pattern="Time complexity:")
+ @DeveloperAPI
+ def to_arrow_refs(self) -> List[ObjectRef["pyarrow.Table"]]:
+ """Convert this :class:`~ray.data.Dataset` into a distributed set of PyArrow
+ tables.
+
+ One PyArrow table is created for each block in this Dataset.
+
+ This method is only supported for datasets convertible to PyArrow tables.
+ This function is zero-copy if the existing data is already in PyArrow
+ format. Otherwise, the data is converted to PyArrow format.
+
+ Examples:
+ >>> import ray
+ >>> ds = ray.data.range(10, override_num_blocks=2)
+ >>> refs = ds.to_arrow_refs()
+ >>> len(refs)
+ 2
+
+ Time complexity: O(1) unless conversion is required.
+
+ Returns:
+ A list of remote PyArrow tables created from this dataset.
+ """
+ import pyarrow as pa
+
+ ref_bundles: Iterator[RefBundle] = self.iter_internal_ref_bundles()
+ block_refs: List[
+ ObjectRef["pyarrow.Table"]
+ ] = _ref_bundles_iterator_to_block_refs_list(ref_bundles)
+ # Schema is safe to call since we have already triggered execution with
+ # iter_internal_ref_bundles.
+ schema = self.schema(fetch_if_missing=True)
+ if isinstance(schema, Schema):
+ schema = schema.base_schema
+ if isinstance(schema, pa.Schema):
+ # Zero-copy path.
+ return block_refs
+
+ block_to_arrow = cached_remote_fn(_block_to_arrow)
+ return [block_to_arrow.remote(block) for block in block_refs]
+
+ @ConsumptionAPI(pattern="Args:")
+ def to_random_access_dataset(
+ self,
+ key: str,
+ num_workers: Optional[int] = None,
+ ) -> RandomAccessDataset:
+ """Convert this dataset into a distributed RandomAccessDataset (EXPERIMENTAL).
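+
+ For example, a sketch assuming a dataset with a unique ``id`` column:
+
+ .. testcode::
+     :skipif: True
+
+     import ray
+
+     ds = ray.data.range(100)
+     rad = ds.to_random_access_dataset(key="id")
+     # Point lookup by key; get_async() returns an ObjectRef.
+     print(ray.get(rad.get_async(42)))
+     # Batched lookup of several keys at once.
+     print(rad.multiget([1, 2, 3]))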
+
+ RandomAccessDataset partitions the dataset across the cluster by the given
+ sort key, providing efficient random access to records via binary search. A
+ number of worker actors are created, each of which has zero-copy access to the
+ underlying sorted data blocks of the dataset.
+
+ Note that the key must be unique in the dataset. If there are duplicate keys,
+ an arbitrary value is returned.
+
+ This is only supported for Arrow-format datasets.
+
+ Args:
+ key: The key column over which records can be queried.
+ num_workers: The number of actors to use to serve random access queries.
+ By default, this is determined by multiplying the number of Ray nodes
+ in the cluster by four. As a rule of thumb, you can expect each worker
+ to provide ~3000 records / second via ``get_async()``, and
+ ~10000 records / second via ``multiget()``.
+ """
+ if num_workers is None:
+ num_workers = 4 * len(ray.nodes())
+ return RandomAccessDataset(self, key, num_workers=num_workers)
+
+ @ConsumptionAPI(pattern="store memory.", insert_after=True)
+ @PublicAPI(api_group=E_API_GROUP)
+ def materialize(self) -> "MaterializedDataset":
+ """Execute and materialize this dataset into object store memory.
+
+ This can be used to read all blocks into memory. By default, Dataset
+ doesn't read blocks from the datasource until the first transform.
+
+ Note that this does not mutate the original Dataset. Only the blocks of the
+ returned MaterializedDataset class are pinned in memory.
+
+ Examples:
+ >>> import ray
+ >>> ds = ray.data.range(10)
+ >>> materialized_ds = ds.materialize()
+ >>> materialized_ds
+ MaterializedDataset(num_blocks=..., num_rows=10, schema={id: int64})
+
+ Returns:
+ A MaterializedDataset holding the materialized data blocks.
+ """
+ copy = Dataset.copy(self, _deep_copy=True, _as=MaterializedDataset)
+ copy._plan.execute()
+
+ bundle = copy._plan._snapshot_bundle
+ blocks_with_metadata = bundle.blocks
+ # TODO(hchen): Here we generate the same number of blocks as
+ # the original Dataset, because the old code path does this, and
+ # some unit tests implicitly depend on this behavior.
+ # After we remove the old code path, we should consider merging
+ # some blocks for better perf.
+ ref_bundles = [
+ RefBundle(
+ blocks=[block_with_metadata],
+ owns_blocks=False,
+ )
+ for block_with_metadata in blocks_with_metadata
+ ]
+ logical_plan = LogicalPlan(InputData(input_data=ref_bundles), self.context)
+ output = MaterializedDataset(
+ ExecutionPlan(copy._plan.stats()),
+ logical_plan,
+ )
+ # Metrics are tagged with `copy`'s uuid; update the output uuid with
+ # it so the user can access the metrics label.
+ output._set_name(copy._name)
+ output._set_uuid(copy._get_uuid())
+ output._plan.execute() # No-op that marks the plan as fully executed.
+ return output
+
+ @PublicAPI(api_group=IM_API_GROUP)
+ def stats(self) -> str:
+ """Returns a string containing execution timing information.
+
+ Note that this does not trigger execution, so if the dataset has not yet
+ executed, an empty string is returned.
+
+ Examples:
+
+ .. testcode::
+
+ import ray
+
+ ds = ray.data.range(10)
+ assert ds.stats() == ""
+
+ ds = ds.materialize()
+ print(ds.stats())
+
+ ..
testoutput:: + :options: +MOCK + + Operator 0 Read: 1 tasks executed, 5 blocks produced in 0s + * Remote wall time: 16.29us min, 7.29ms max, 1.21ms mean, 24.17ms total + * Remote cpu time: 16.0us min, 2.54ms max, 810.45us mean, 16.21ms total + * Peak heap memory usage (MiB): 137968.75 min, 142734.38 max, 139846 mean + * Output num rows: 0 min, 1 max, 0 mean, 10 total + * Output size bytes: 0 min, 8 max, 4 mean, 80 total + * Tasks per node: 20 min, 20 max, 20 mean; 1 nodes used + + """ + if self._current_executor: + return self._current_executor.get_stats().to_summary().to_string() + elif self._write_ds is not None and self._write_ds._plan.has_computed_output(): + return self._write_ds.stats() + return self._get_stats_summary().to_string() + + def _get_stats_summary(self) -> DatasetStatsSummary: + return self._plan.stats().to_summary() + + @ConsumptionAPI(pattern="Examples:") + @DeveloperAPI + def iter_internal_ref_bundles(self) -> Iterator[RefBundle]: + """Get an iterator over ``RefBundles`` + belonging to this Dataset. Calling this function doesn't keep + the data materialized in-memory. + + Examples: + >>> import ray + >>> ds = ray.data.range(1) + >>> for ref_bundle in ds.iter_internal_ref_bundles(): + ... for block_ref, block_md in ref_bundle.blocks: + ... block = ray.get(block_ref) + + Returns: + An iterator over this Dataset's ``RefBundles``. + """ + + iter_ref_bundles, _, _ = self._plan.execute_to_iterator() + self._synchronize_progress_bar() + return iter_ref_bundles + + @Deprecated + @ConsumptionAPI(pattern="Examples:") + def get_internal_block_refs(self) -> List[ObjectRef[Block]]: + """Get a list of references to the underlying blocks of this dataset. + + This function can be used for zero-copy access to the data. It blocks + until the underlying blocks are computed. + + Examples: + >>> import ray + >>> ds = ray.data.range(1) + >>> ds.get_internal_block_refs() + [ObjectRef(...)] + + Returns: + A list of references to this dataset's blocks. + """ + logger.warning( + "`Dataset.get_internal_block_refs()` is deprecated. Use " + "`Dataset.iter_internal_ref_bundles()` instead.", + ) + block_refs = self._plan.execute().block_refs + self._synchronize_progress_bar() + return block_refs + + @DeveloperAPI + def has_serializable_lineage(self) -> bool: + """Whether this dataset's lineage is able to be serialized for storage and + later deserialized, possibly on a different cluster. + + Only datasets that are created from data that we know will still exist at + deserialization time, e.g. data external to this Ray cluster such as persistent + cloud object stores, support lineage-based serialization. All of the + ray.data.read_*() APIs support lineage-based serialization. + + Examples: + + >>> import ray + >>> ray.data.from_items(list(range(10))).has_serializable_lineage() + False + >>> ray.data.read_csv("s3://anonymous@ray-example-data/iris.csv").has_serializable_lineage() + True + """ # noqa: E501 + return all( + op.is_lineage_serializable() + for op in self._logical_plan.dag.post_order_iter() + ) + + @DeveloperAPI + def serialize_lineage(self) -> bytes: + """ + Serialize this dataset's lineage, not the actual data or the existing data + futures, to bytes that can be stored and later deserialized, possibly on a + different cluster. + + Note that this uses pickle and will drop all computed data, and that everything + is recomputed from scratch after deserialization. + + Use :py:meth:`Dataset.deserialize_lineage` to deserialize the serialized + bytes returned from this method into a Dataset. 
+ + .. note:: + Unioned and zipped datasets, produced by :py:meth:`Dataset.union` and + :py:meth:`Dataset.zip`, are not lineage-serializable. + + Examples: + + .. testcode:: + + import ray + + ds = ray.data.read_csv("s3://anonymous@ray-example-data/iris.csv") + serialized_ds = ds.serialize_lineage() + ds = ray.data.Dataset.deserialize_lineage(serialized_ds) + print(ds) + + .. testoutput:: + + Dataset( + num_rows=?, + schema={ + sepal length (cm): double, + sepal width (cm): double, + petal length (cm): double, + petal width (cm): double, + target: int64 + } + ) + + + Returns: + Serialized bytes containing the lineage of this dataset. + """ + if not self.has_serializable_lineage(): + raise ValueError( + "Lineage-based serialization is not supported for this stream, which " + "means that it cannot be used as a tunable hyperparameter. " + "Lineage-based serialization is explicitly NOT supported for unioned " + "or zipped datasets (see docstrings for those methods), and is only " + "supported for datasets created from data that we know will still " + "exist at deserialization time, e.g. external data in persistent cloud " + "object stores or in-memory data from long-lived clusters. Concretely, " + "all ray.data.read_*() APIs should support lineage-based " + "serialization, while all of the ray.data.from_*() APIs do not. To " + "allow this stream to be serialized to storage, write the data to an " + "external store (such as AWS S3, GCS, or Azure Blob Storage) using the " + "Dataset.write_*() APIs, and serialize a new dataset reading " + "from the external store using the ray.data.read_*() APIs." + ) + # Copy Dataset and clear the blocks from the execution plan so only the + # Dataset's lineage is serialized. + plan_copy = self._plan.deep_copy() + logical_plan_copy = copy.copy(self._plan._logical_plan) + ds = Dataset(plan_copy, logical_plan_copy) + ds._plan.clear_snapshot() + ds._set_uuid(self._get_uuid()) + + def _reduce_remote_fn(rf: ray.remote_function.RemoteFunction): + # Custom reducer for Ray remote function handles that allows for + # cross-cluster serialization. + # This manually unsets the last export session and job to force re-exporting + # of the function when the handle is deserialized on a new cluster. + # TODO(Clark): Fix this in core Ray, see issue: + # https://github.com/ray-project/ray/issues/24152. + reconstructor, args, state = rf.__reduce__() + state["_last_export_session_and_job"] = None + return reconstructor, args, state + + context = ray._private.worker.global_worker.get_serialization_context() + try: + context._register_cloudpickle_reducer( + ray.remote_function.RemoteFunction, _reduce_remote_fn + ) + serialized = pickle.dumps(ds) + finally: + context._unregister_cloudpickle_reducer(ray.remote_function.RemoteFunction) + return serialized + + @staticmethod + @DeveloperAPI + def deserialize_lineage(serialized_ds: bytes) -> "Dataset": + """ + Deserialize the provided lineage-serialized Dataset. + + This uses pickle, and assumes that the provided serialized bytes were + serialized using :py:meth:`Dataset.serialize_lineage`. + + Examples: + + .. testcode:: + + import ray + + ds = ray.data.read_csv("s3://anonymous@ray-example-data/iris.csv") + serialized_ds = ds.serialize_lineage() + ds = ray.data.Dataset.deserialize_lineage(serialized_ds) + print(ds) + + ..
testoutput:: + + Dataset( + num_rows=?, + schema={ + sepal length (cm): double, + sepal width (cm): double, + petal length (cm): double, + petal width (cm): double, + target: int64 + } + ) + + Args: + serialized_ds: The serialized Dataset that we wish to deserialize. + + Returns: + A deserialized ``Dataset`` instance. + """ + return pickle.loads(serialized_ds) + + @property + @DeveloperAPI + def context(self) -> DataContext: + """Return the DataContext used to create this Dataset.""" + return self._plan._context + + def _aggregate_on( + self, agg_cls: type, on: Optional[Union[str, List[str]]], *args, **kwargs + ): + """Helper for aggregating on a particular subset of the dataset. + + This validates the `on` argument, and converts a list of column names + or lambdas to a multi-aggregation. A null `on` results in a + multi-aggregation on all columns for an Arrow Dataset, and a single + aggregation on the entire row for a simple Dataset. + """ + aggs = self._build_multicolumn_aggs(agg_cls, on, *args, **kwargs) + return self.aggregate(*aggs) + + def _build_multicolumn_aggs( + self, + agg_cls: type, + on: Optional[Union[str, List[str]]], + *args, + skip_cols: Optional[List[str]] = None, + **kwargs, + ): + """Build set of aggregations for applying a single aggregation to + multiple columns. + """ + # Expand None into an aggregation for each column. + if on is None: + schema = self.schema(fetch_if_missing=True) + if schema is not None and not isinstance(schema, type): + if not skip_cols: + skip_cols = [] + if len(schema.names) > 0: + on = [col for col in schema.names if col not in skip_cols] + + if not isinstance(on, list): + on = [on] + return [agg_cls(on_, *args, **kwargs) for on_ in on] + + def _aggregate_result(self, result: Union[Tuple, Mapping]) -> U: + if result is not None and len(result) == 1: + if isinstance(result, tuple): + return result[0] + else: + # NOTE (kfstorm): We cannot call `result[0]` directly on + # `PandasRow` because indexing a column with position is not + # supported by pandas. + return list(result.values())[0] + else: + return result + + @repr_with_fallback(["ipywidgets", "8"]) + def _repr_mimebundle_(self, **kwargs): + """Return a mimebundle with an ipywidget repr and a simple text repr. + + Depending on the frontend where the data is being displayed, + different mimetypes are used from this bundle. + See https://ipython.readthedocs.io/en/stable/config/integrating.html + for information about this method, and + https://ipywidgets.readthedocs.io/en/latest/embedding.html + for more information about the jupyter widget mimetype. + + Returns: + A mimebundle containing an ipywidget repr and a simple text repr. + """ + import ipywidgets + + title = ipywidgets.HTML(f"
<h2>{self.__class__.__name__}</h2>") + tab = self._tab_repr_() + widget = ipywidgets.VBox([title, tab], layout=ipywidgets.Layout(width="100%")) + + # Get the widget mime bundle, but replace the plaintext + # with the Datastream repr + bundle = widget._repr_mimebundle_(**kwargs) + bundle.update( + { + "text/plain": repr(self), + } + ) + return bundle + + def _tab_repr_(self): + from ipywidgets import HTML, Tab + + metadata = { + "num_blocks": self._plan.initial_num_blocks(), + "num_rows": self._meta_count(), + } + # Show metadata if available, but don't trigger execution. + schema = self.schema(fetch_if_missing=False) + if schema is None: + schema_repr = Template("rendered_html_common.html.j2").render( + content="<h5>Unknown schema</h5>" ) + elif isinstance(schema, type): + schema_repr = Template("rendered_html_common.html.j2").render( + content=f"<h5>Data type: {html.escape(str(schema))}</h5>
" + ) + else: + schema_data = {} + for sname, stype in zip(schema.names, schema.types): + schema_data[sname] = getattr(stype, "__name__", str(stype)) + + schema_repr = Template("scrollableTable.html.j2").render( + table=tabulate( + tabular_data=schema_data.items(), + tablefmt="html", + showindex=False, + headers=["Name", "Type"], + ), + max_height="300px", + ) + + children = [] + children.append( + HTML( + Template("scrollableTable.html.j2").render( + table=tabulate( + tabular_data=metadata.items(), + tablefmt="html", + showindex=False, + headers=["Field", "Value"], + ), + max_height="300px", + ) + ) + ) + children.append(HTML(schema_repr)) + return Tab(children, titles=["Metadata", "Schema"]) + + def __repr__(self) -> str: + return self._plan.get_plan_as_string(self.__class__) + + def __str__(self) -> str: + return repr(self) + + def __bool__(self) -> bool: + # Prevents `__len__` from being called to check if it is None + # see: issue #25152 + return True + + def __len__(self) -> int: + raise AttributeError( + "Use `ds.count()` to compute the length of a distributed Dataset. " + "This may be an expensive operation." + ) + + def __iter__(self): + raise TypeError( + "`Dataset` objects aren't iterable. To iterate records, call " + "`ds.iter_rows()` or `ds.iter_batches()`. For more information, read " + "https://docs.ray.io/en/latest/data/iterating-over-data.html." + ) + + def _block_num_rows(self) -> List[int]: + get_num_rows = cached_remote_fn(_get_num_rows) + num_rows = [] + for ref_bundle in self.iter_internal_ref_bundles(): + for block_ref in ref_bundle.block_refs: + num_rows.append(get_num_rows.remote(block_ref)) + return ray.get(num_rows) + + def _meta_count(self) -> Optional[int]: + return self._plan.meta_count() + + def _get_uuid(self) -> str: + return self._uuid + + def _set_uuid(self, uuid: str) -> None: + self._uuid = uuid + self._plan._dataset_uuid = uuid + self._plan._in_stats.dataset_uuid = uuid + + def _synchronize_progress_bar(self): + """Flush progress bar output by shutting down the current executor. + + This should be called at the end of all blocking APIs (e.g., `take`), but not + async APIs (e.g., `iter_batches`). + + The streaming executor runs in a separate generator / thread, so it is + possible the shutdown logic runs even after a call to retrieve rows from the + stream has finished. Explicit shutdown avoids this, which can clobber console + output (https://github.com/ray-project/ray/issues/32414). + """ + if self._current_executor: + self._current_executor.shutdown() + self._current_executor = None + + def __getstate__(self): + # Note: excludes _current_executor which is not serializable. + return { + "plan": self._plan, + "uuid": self._uuid, + "logical_plan": self._logical_plan, + } + + def __setstate__(self, state): + self._plan = state["plan"] + self._uuid = state["uuid"] + self._logical_plan = state["logical_plan"] + self._current_executor = None + + def __del__(self): + if not self._current_executor: + return + + # When Python shuts down, `ray` might evaluate to ``. + # This value is truthy and not `None`, so we use a try-catch in addition to + # `if ray is not None`. For more information, see #42382. + try: + if ray is not None and ray.is_initialized(): + self._current_executor.shutdown() + except TypeError: + pass + + +@PublicAPI +class MaterializedDataset(Dataset, Generic[T]): + """A Dataset materialized in Ray memory, e.g., via `.materialize()`. 
+ + The blocks of a MaterializedDataset object are materialized into Ray object store + memory, which means that this class can be shared or iterated over by multiple Ray + tasks without re-executing the underlying computations for producing the stream. + """ + + def num_blocks(self) -> int: + """Return the number of blocks of this :class:`MaterializedDataset`. + + Examples: + >>> import ray + >>> ds = ray.data.range(100).repartition(10).materialize() + >>> ds.num_blocks() + 10 + + Time complexity: O(1) + + Returns: + The number of blocks of this :class:`Dataset`. + """ + return self._plan.initial_num_blocks() + + +@PublicAPI(stability="beta") +class Schema: + """Dataset schema. + + Attributes: + base_schema: The underlying Arrow or Pandas schema. + """ + + def __init__( + self, + base_schema: Union["pyarrow.lib.Schema", "PandasBlockSchema"], + *, + data_context: Optional[DataContext] = None, + ): + self.base_schema = base_schema + + # Snapshot the current context, so that the config of Datasets is always + # determined by the config at the time it was created. + self._context = data_context or copy.deepcopy(DataContext.get_current()) + + @property + def names(self) -> List[str]: + """Lists the columns of this Dataset.""" + return self.base_schema.names + + @property + def types(self) -> List[Union[type[object], "pyarrow.lib.DataType"]]: + """Lists the types of this Dataset in Arrow format + + For non-Arrow compatible types, we return "object". + """ + import pyarrow as pa + + from ray.data.extensions import ArrowTensorType, TensorDtype + + if isinstance(self.base_schema, pa.lib.Schema): + return list(self.base_schema.types) + + arrow_types = [] + for dtype in self.base_schema.types: + if isinstance(dtype, TensorDtype): + + if self._context.use_arrow_tensor_v2: + pa_tensor_type_class = ArrowTensorTypeV2 + else: + pa_tensor_type_class = ArrowTensorType + + # Manually convert our Pandas tensor extension type to Arrow. + arrow_types.append( + pa_tensor_type_class( + shape=dtype._shape, dtype=pa.from_numpy_dtype(dtype._dtype) + ) + ) + + else: + try: + arrow_types.append(pa.from_numpy_dtype(dtype)) + except pa.ArrowNotImplementedError: + arrow_types.append(object) + except Exception: + logger.exception(f"Error converting dtype {dtype} to Arrow.") + arrow_types.append(None) + return arrow_types + + def __eq__(self, other): + return ( + isinstance(other, Schema) + and other.types == self.types + and other.names == self.names + ) + + def __repr__(self): + column_width = max([len(name) for name in self.names] + [len("Column")]) + padding = 2 + + output = "Column" + output += " " * ((column_width + padding) - len("Column")) + output += "Type\n" + + output += "-" * len("Column") + output += " " * ((column_width + padding) - len("Column")) + output += "-" * len("Type") + "\n" + + for name, type in zip(self.names, self.types): + output += name + output += " " * ((column_width + padding) - len(name)) + output += f"{type}\n" + + output = output.rstrip() + return output + + +def _block_to_df(block: Block) -> "pandas.DataFrame": + block = BlockAccessor.for_block(block) + return block.to_pandas() + + +def _block_to_ndarray(block: Block, column: Optional[str]): + block = BlockAccessor.for_block(block) + return block.to_numpy(column) + + +def _block_to_arrow(block: Block): + block = BlockAccessor.for_block(block) + return block.to_arrow()
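
Taken together, the methods in this part of the file cover materialization, stats and schema inspection, and lineage round-tripping. Below is a minimal usage sketch (an editor's illustration, not part of the module); it assumes `ray` is installed, a local Ray runtime can start, and it reuses the public S3 example path that already appears in the docstrings above.

    import ray

    # Build and materialize a small dataset; the blocks are pinned in object
    # store memory, so stats() reports timings instead of an empty string.
    ds = ray.data.range(100)
    materialized = ds.materialize()
    print(materialized.num_blocks())
    print(materialized.stats())

    # Schema exposes column names and their (Arrow) types.
    schema = materialized.schema()
    print(schema.names, schema.types)

    # Lineage serialization only works for datasets read from external
    # storage (read_*() APIs); from_*() datasets report False here.
    csv_ds = ray.data.read_csv("s3://anonymous@ray-example-data/iris.csv")
    if csv_ds.has_serializable_lineage():
        payload = csv_ds.serialize_lineage()
        restored = ray.data.Dataset.deserialize_lineage(payload)
        print(restored.schema())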