koichi12 commited on
Commit
110275e
·
verified ·
1 Parent(s): 98ca408

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because the commit contains too many changes. See the raw diff for the full change set.
Files changed (50) hide show
  1. .gitattributes +1 -0
  2. .venv/lib/python3.11/site-packages/mistral_common/data/tekken_240911.json +3 -0
  3. .venv/lib/python3.11/site-packages/ray/data/_internal/__init__.py +0 -0
  4. .venv/lib/python3.11/site-packages/ray/data/_internal/aggregate.py +411 -0
  5. .venv/lib/python3.11/site-packages/ray/data/_internal/arrow_block.py +649 -0
  6. .venv/lib/python3.11/site-packages/ray/data/_internal/batcher.py +325 -0
  7. .venv/lib/python3.11/site-packages/ray/data/_internal/block_batching/block_batching.py +60 -0
  8. .venv/lib/python3.11/site-packages/ray/data/_internal/block_batching/interfaces.py +47 -0
  9. .venv/lib/python3.11/site-packages/ray/data/_internal/block_builder.py +39 -0
  10. .venv/lib/python3.11/site-packages/ray/data/_internal/block_list.py +98 -0
  11. .venv/lib/python3.11/site-packages/ray/data/_internal/compute.py +143 -0
  12. .venv/lib/python3.11/site-packages/ray/data/_internal/delegating_block_builder.py +76 -0
  13. .venv/lib/python3.11/site-packages/ray/data/_internal/equalize.py +142 -0
  14. .venv/lib/python3.11/site-packages/ray/data/_internal/logging.py +208 -0
  15. .venv/lib/python3.11/site-packages/ray/data/_internal/memory_tracing.py +147 -0
  16. .venv/lib/python3.11/site-packages/ray/data/_internal/null_aggregate.py +276 -0
  17. .venv/lib/python3.11/site-packages/ray/data/_internal/numpy_support.py +233 -0
  18. .venv/lib/python3.11/site-packages/ray/data/_internal/output_buffer.py +109 -0
  19. .venv/lib/python3.11/site-packages/ray/data/_internal/pandas_block.py +728 -0
  20. .venv/lib/python3.11/site-packages/ray/data/_internal/plan.py +602 -0
  21. .venv/lib/python3.11/site-packages/ray/data/_internal/progress_bar.py +217 -0
  22. .venv/lib/python3.11/site-packages/ray/data/_internal/remote_fn.py +80 -0
  23. .venv/lib/python3.11/site-packages/ray/data/_internal/row.py +42 -0
  24. .venv/lib/python3.11/site-packages/ray/data/_internal/size_estimator.py +92 -0
  25. .venv/lib/python3.11/site-packages/ray/data/_internal/split.py +297 -0
  26. .venv/lib/python3.11/site-packages/ray/data/_internal/stats.py +1495 -0
  27. .venv/lib/python3.11/site-packages/ray/data/_internal/table_block.py +310 -0
  28. .venv/lib/python3.11/site-packages/ray/data/_internal/torch_iterable_dataset.py +10 -0
  29. .venv/lib/python3.11/site-packages/ray/data/_internal/util.py +1262 -0
  30. .venv/lib/python3.11/site-packages/ray/data/datasource/__init__.py +67 -0
  31. .venv/lib/python3.11/site-packages/ray/data/datasource/__pycache__/__init__.cpython-311.pyc +0 -0
  32. .venv/lib/python3.11/site-packages/ray/data/datasource/__pycache__/datasink.cpython-311.pyc +0 -0
  33. .venv/lib/python3.11/site-packages/ray/data/datasource/__pycache__/datasource.cpython-311.pyc +0 -0
  34. .venv/lib/python3.11/site-packages/ray/data/datasource/__pycache__/file_based_datasource.cpython-311.pyc +0 -0
  35. .venv/lib/python3.11/site-packages/ray/data/datasource/__pycache__/file_datasink.cpython-311.pyc +0 -0
  36. .venv/lib/python3.11/site-packages/ray/data/datasource/__pycache__/file_meta_provider.cpython-311.pyc +0 -0
  37. .venv/lib/python3.11/site-packages/ray/data/datasource/__pycache__/filename_provider.cpython-311.pyc +0 -0
  38. .venv/lib/python3.11/site-packages/ray/data/datasource/__pycache__/parquet_meta_provider.cpython-311.pyc +0 -0
  39. .venv/lib/python3.11/site-packages/ray/data/datasource/__pycache__/partitioning.cpython-311.pyc +0 -0
  40. .venv/lib/python3.11/site-packages/ray/data/datasource/__pycache__/path_util.cpython-311.pyc +0 -0
  41. .venv/lib/python3.11/site-packages/ray/data/datasource/file_datasink.py +266 -0
  42. .venv/lib/python3.11/site-packages/ray/data/datasource/partitioning.py +456 -0
  43. .venv/lib/python3.11/site-packages/ray/data/datasource/path_util.py +206 -0
  44. .venv/lib/python3.11/site-packages/ray/data/extensions/__init__.py +45 -0
  45. .venv/lib/python3.11/site-packages/ray/data/extensions/__pycache__/__init__.cpython-311.pyc +0 -0
  46. .venv/lib/python3.11/site-packages/ray/data/extensions/__pycache__/object_extension.cpython-311.pyc +0 -0
  47. .venv/lib/python3.11/site-packages/ray/data/extensions/__pycache__/tensor_extension.cpython-311.pyc +0 -0
  48. .venv/lib/python3.11/site-packages/ray/data/extensions/object_extension.py +10 -0
  49. .venv/lib/python3.11/site-packages/ray/data/extensions/tensor_extension.py +15 -0
  50. .venv/lib/python3.11/site-packages/ray/data/preprocessors/__init__.py +50 -0
.gitattributes CHANGED
@@ -151,3 +151,4 @@ tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/_
151
  .venv/lib/python3.11/site-packages/nvidia/cusparse/lib/libcusparse.so.12 filter=lfs diff=lfs merge=lfs -text
152
  .venv/lib/python3.11/site-packages/nvidia/cufft/lib/libcufft.so.11 filter=lfs diff=lfs merge=lfs -text
153
  .venv/lib/python3.11/site-packages/torchgen/__pycache__/model.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
 
 
151
  .venv/lib/python3.11/site-packages/nvidia/cusparse/lib/libcusparse.so.12 filter=lfs diff=lfs merge=lfs -text
152
  .venv/lib/python3.11/site-packages/nvidia/cufft/lib/libcufft.so.11 filter=lfs diff=lfs merge=lfs -text
153
  .venv/lib/python3.11/site-packages/torchgen/__pycache__/model.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
154
+ .venv/lib/python3.11/site-packages/mistral_common/data/tekken_240911.json filter=lfs diff=lfs merge=lfs -text
.venv/lib/python3.11/site-packages/mistral_common/data/tekken_240911.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:386b1f98fba69b38c3de512a4eb602dc69a95dae0e54e6ce048ea3e29a2627a8
3
+ size 19280967
.venv/lib/python3.11/site-packages/ray/data/_internal/__init__.py ADDED
File without changes
.venv/lib/python3.11/site-packages/ray/data/_internal/aggregate.py ADDED
@@ -0,0 +1,411 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ from typing import TYPE_CHECKING, Any, Callable, List, Optional, Union
3
+
4
+ from ray.data._internal.null_aggregate import (
5
+ _null_wrap_accumulate_block,
6
+ _null_wrap_accumulate_row,
7
+ _null_wrap_finalize,
8
+ _null_wrap_init,
9
+ _null_wrap_merge,
10
+ )
11
+ from ray.data._internal.planner.exchange.sort_task_spec import SortKey
12
+ from ray.data.aggregate import AggregateFn
13
+ from ray.data.block import AggType, Block, BlockAccessor
14
+
15
+ if TYPE_CHECKING:
16
+ import pyarrow as pa
17
+
18
+
19
class _AggregateOnKeyBase(AggregateFn):
    """Base class for aggregations that target a single key column."""

    def _set_key_fn(self, on: str):
        # Remember the target column so later schema validation can use it.
        self._key_fn = on

    def _validate(self, schema: Optional[Union[type, "pa.lib.Schema"]]) -> None:
        """Verify that the configured key column is present in ``schema``."""
        SortKey(self._key_fn).validate_schema(schema)
25
+
26
+
27
class Count(AggregateFn):
    """Defines count aggregation (total number of rows)."""

    def __init__(self):
        def _accumulate(acc, block):
            # Add this block's row count to the running total.
            return acc + BlockAccessor.for_block(block).num_rows()

        super().__init__(
            init=lambda k: 0,
            accumulate_block=_accumulate,
            merge=lambda a1, a2: a1 + a2,
            name="count()",
        )
39
+
40
+
41
class Sum(_AggregateOnKeyBase):
    """Defines sum aggregation."""

    def __init__(
        self,
        on: Optional[str] = None,
        ignore_nulls: bool = True,
        alias_name: Optional[str] = None,
    ):
        self._set_key_fn(on)
        # Result column name: explicit alias when given, else "sum(<col>)".
        self._rs_name = alias_name if alias_name else f"sum({str(on)})"

        null_merge = _null_wrap_merge(ignore_nulls, lambda a1, a2: a1 + a2)

        def _block_sum(block: Block):
            # Delegate to the block accessor's vectorized sum.
            return BlockAccessor.for_block(block).sum(on, ignore_nulls)

        super().__init__(
            init=_null_wrap_init(lambda k: 0),
            merge=null_merge,
            accumulate_block=_null_wrap_accumulate_block(
                ignore_nulls, _block_sum, null_merge
            ),
            finalize=_null_wrap_finalize(lambda a: a),
            name=self._rs_name,
        )
69
+
70
+
71
class Min(_AggregateOnKeyBase):
    """Defines min aggregation."""

    def __init__(
        self,
        on: Optional[str] = None,
        ignore_nulls: bool = True,
        alias_name: Optional[str] = None,
    ):
        self._set_key_fn(on)
        self._rs_name = alias_name if alias_name else f"min({str(on)})"

        null_merge = _null_wrap_merge(ignore_nulls, min)

        def _block_min(block: Block):
            # Delegate to the block accessor's vectorized min.
            return BlockAccessor.for_block(block).min(on, ignore_nulls)

        super().__init__(
            # +inf is the identity element for min.
            init=_null_wrap_init(lambda k: float("inf")),
            merge=null_merge,
            accumulate_block=_null_wrap_accumulate_block(
                ignore_nulls, _block_min, null_merge
            ),
            finalize=_null_wrap_finalize(lambda a: a),
            name=self._rs_name,
        )
99
+
100
+
101
class Max(_AggregateOnKeyBase):
    """Defines max aggregation."""

    def __init__(
        self,
        on: Optional[str] = None,
        ignore_nulls: bool = True,
        alias_name: Optional[str] = None,
    ):
        self._set_key_fn(on)
        self._rs_name = alias_name if alias_name else f"max({str(on)})"

        null_merge = _null_wrap_merge(ignore_nulls, max)

        def _block_max(block: Block):
            # Delegate to the block accessor's vectorized max.
            return BlockAccessor.for_block(block).max(on, ignore_nulls)

        super().__init__(
            # -inf is the identity element for max.
            init=_null_wrap_init(lambda k: float("-inf")),
            merge=null_merge,
            accumulate_block=_null_wrap_accumulate_block(
                ignore_nulls, _block_max, null_merge
            ),
            finalize=_null_wrap_finalize(lambda a: a),
            name=self._rs_name,
        )
129
+
130
+
131
class Mean(_AggregateOnKeyBase):
    """Defines mean aggregation."""

    def __init__(
        self,
        on: Optional[str] = None,
        ignore_nulls: bool = True,
        alias_name: Optional[str] = None,
    ):
        self._set_key_fn(on)
        self._rs_name = alias_name if alias_name else f"mean({str(on)})"

        # The accumulator is a [sum, count] pair; merging adds element-wise.
        null_merge = _null_wrap_merge(
            ignore_nulls, lambda a1, a2: [a1[0] + a2[0], a1[1] + a2[1]]
        )

        def vectorized_mean(block: Block) -> AggType:
            block_acc = BlockAccessor.for_block(block)
            count = block_acc.count(on)
            if count == 0 or count is None:
                # Empty or all null.
                return None
            sum_ = block_acc.sum(on, ignore_nulls)
            if sum_ is None:
                # ignore_nulls=False and at least one null.
                return None
            return [sum_, count]

        super().__init__(
            init=_null_wrap_init(lambda k: [0, 0]),
            merge=null_merge,
            accumulate_block=_null_wrap_accumulate_block(
                ignore_nulls, vectorized_mean, null_merge
            ),
            # Final mean = total sum / total count.
            finalize=_null_wrap_finalize(lambda a: a[0] / a[1]),
            name=self._rs_name,
        )
173
+
174
+
175
class Std(_AggregateOnKeyBase):
    """Defines standard deviation aggregation.

    Uses Welford's online method for an accumulator-style computation of the
    standard deviation. This method was chosen due to its numerical
    stability, and it being computable in a single pass.
    This may give different (but more accurate) results than NumPy, Pandas,
    and sklearn, which use a less numerically stable two-pass algorithm.
    See
    https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Welford's_online_algorithm
    """

    def __init__(
        self,
        on: Optional[str] = None,
        ddof: int = 1,
        ignore_nulls: bool = True,
        alias_name: Optional[str] = None,
    ):
        self._set_key_fn(on)
        if alias_name:
            self._rs_name = alias_name
        else:
            self._rs_name = f"std({str(on)})"

        # Accumulator layout throughout: [M2, mean, count], where M2 is the
        # running sum of squared differences from the current mean.
        def merge(a: List[float], b: List[float]):
            # Merges two accumulations into one.
            # See
            # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm
            M2_a, mean_a, count_a = a
            M2_b, mean_b, count_b = b
            delta = mean_b - mean_a
            count = count_a + count_b
            # NOTE: We use this mean calculation since it's more numerically
            # stable than mean_a + delta * count_b / count, which actually
            # deviates from Pandas in the ~15th decimal place and causes our
            # exact comparison tests to fail.
            mean = (mean_a * count_a + mean_b * count_b) / count
            # Update the sum of squared differences.
            M2 = M2_a + M2_b + (delta**2) * count_a * count_b / count
            return [M2, mean, count]

        null_merge = _null_wrap_merge(ignore_nulls, merge)

        def vectorized_std(block: Block) -> AggType:
            # Compute a whole-block partial accumulation in one pass using the
            # block accessor's vectorized aggregations.
            block_acc = BlockAccessor.for_block(block)
            count = block_acc.count(on)
            if count == 0 or count is None:
                # Empty or all null.
                return None
            sum_ = block_acc.sum(on, ignore_nulls)
            if sum_ is None:
                # ignore_nulls=False and at least one null.
                return None
            mean = sum_ / count
            M2 = block_acc.sum_of_squared_diffs_from_mean(on, ignore_nulls, mean)
            return [M2, mean, count]

        def finalize(a: List[float]):
            # Compute the final standard deviation from the accumulated
            # sum of squared differences from current mean and the count.
            # ddof is the delta degrees of freedom used in the divisor.
            M2, mean, count = a
            if count < 2:
                return 0.0
            return math.sqrt(M2 / (count - ddof))

        super().__init__(
            init=_null_wrap_init(lambda k: [0, 0, 0]),
            merge=null_merge,
            accumulate_block=_null_wrap_accumulate_block(
                ignore_nulls,
                vectorized_std,
                null_merge,
            ),
            finalize=_null_wrap_finalize(finalize),
            name=(self._rs_name),
        )
252
+
253
+
254
class AbsMax(_AggregateOnKeyBase):
    """Defines absolute max aggregation."""

    def __init__(
        self,
        on: Optional[str] = None,
        ignore_nulls: bool = True,
        alias_name: Optional[str] = None,
    ):
        self._set_key_fn(on)
        on_fn = _to_on_fn(on)
        self._rs_name = alias_name if alias_name else f"abs_max({str(on)})"

        def _acc_row(acc, row_val):
            # Track the largest magnitude seen so far.
            return max(acc, abs(row_val))

        super().__init__(
            init=_null_wrap_init(lambda k: 0),
            merge=_null_wrap_merge(ignore_nulls, max),
            accumulate_row=_null_wrap_accumulate_row(ignore_nulls, on_fn, _acc_row),
            finalize=_null_wrap_finalize(lambda a: a),
            name=self._rs_name,
        )
279
+
280
+
281
+ def _to_on_fn(on: Optional[str]):
282
+ if on is None:
283
+ return lambda r: r
284
+ elif isinstance(on, str):
285
+ return lambda r: r[on]
286
+ else:
287
+ return on
288
+
289
+
290
class Quantile(_AggregateOnKeyBase):
    """Defines Quantile aggregation.

    Collects all values of the target column into a list during accumulation
    and computes the q-th quantile (with linear interpolation, rounded to 5
    decimal places) at finalization time.
    """

    def __init__(
        self,
        on: Optional[str] = None,
        q: float = 0.5,
        ignore_nulls: bool = True,
        alias_name: Optional[str] = None,
    ):
        self._set_key_fn(on)
        self._q = q
        if alias_name:
            self._rs_name = alias_name
        else:
            self._rs_name = f"quantile({str(on)})"

        def merge(a: List[Any], b: List[Any]):
            # Merge two partial accumulations. Either side may be a list of
            # collected values or a bare scalar; normalize to one list while
            # dropping None/"" scalars.
            # FIX: use the builtin ``list`` in isinstance checks; the previous
            # code tested against ``typing.List``, which is meant only for
            # annotations and is deprecated for runtime isinstance use.
            if isinstance(a, list) and isinstance(b, list):
                a.extend(b)
                return a
            if isinstance(a, list) and (not isinstance(b, list)):
                if b is not None and b != "":
                    a.append(b)
                return a
            if isinstance(b, list) and (not isinstance(a, list)):
                if a is not None and a != "":
                    b.append(a)
                return b

            ls = []
            if a is not None and a != "":
                ls.append(a)
            if b is not None and b != "":
                ls.append(b)
            return ls

        null_merge = _null_wrap_merge(ignore_nulls, merge)

        def block_row_ls(block: Block) -> AggType:
            # Collect the raw column values from one block.
            block_acc = BlockAccessor.for_block(block)
            ls = []
            for row in block_acc.iter_rows(public_row_format=False):
                ls.append(row.get(on))
            return ls

        # FIX: removed a redundant nested ``import math`` here; ``math`` is
        # already imported at module scope.
        def percentile(input_values, key: Optional[Callable[[Any], Any]] = None):
            # Linear-interpolated quantile of the collected values.
            if not input_values:
                return None

            if key is None:
                key = lambda x: x  # noqa: E731

            input_values = sorted(input_values)
            k = (len(input_values) - 1) * self._q
            f = math.floor(k)
            c = math.ceil(k)
            if f == c:
                # k is an exact index; no interpolation needed.
                return key(input_values[int(k)])
            d0 = key(input_values[int(f)]) * (c - k)
            d1 = key(input_values[int(c)]) * (k - f)
            return round(d0 + d1, 5)

        super().__init__(
            init=_null_wrap_init(lambda k: [0]),
            merge=null_merge,
            accumulate_block=_null_wrap_accumulate_block(
                ignore_nulls,
                block_row_ls,
                null_merge,
            ),
            finalize=_null_wrap_finalize(percentile),
            name=(self._rs_name),
        )
366
+
367
+
368
class Unique(_AggregateOnKeyBase):
    """Defines unique aggregation."""

    def __init__(
        self,
        on: Optional[str] = None,
        alias_name: Optional[str] = None,
    ):
        self._set_key_fn(on)
        self._rs_name = alias_name if alias_name else f"unique({str(on)})"

        def to_set(x):
            # Coerce a scalar, list, or set into a set.
            if isinstance(x, set):
                return x
            if isinstance(x, list):
                return set(x)
            return {x}

        def block_row_unique(block: Block) -> AggType:
            # Vectorized distinct over the column via Arrow compute.
            import pyarrow.compute as pac

            col = BlockAccessor.for_block(block).to_arrow().column(on)
            return pac.unique(col).to_pylist()

        def merge(a, b):
            return to_set(a) | to_set(b)

        # Note: ignore_nulls is hard-coded to False for this aggregation.
        null_merge = _null_wrap_merge(False, merge)

        super().__init__(
            init=_null_wrap_init(lambda x: set()),
            merge=null_merge,
            accumulate_block=_null_wrap_accumulate_block(
                False,
                block_row_unique,
                null_merge,
            ),
            name=self._rs_name,
            finalize=_null_wrap_finalize(lambda x: x),
        )
.venv/lib/python3.11/site-packages/ray/data/_internal/arrow_block.py ADDED
@@ -0,0 +1,649 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import collections
2
+ import heapq
3
+ import logging
4
+ import random
5
+ from typing import (
6
+ TYPE_CHECKING,
7
+ Any,
8
+ Callable,
9
+ Dict,
10
+ Iterator,
11
+ List,
12
+ Optional,
13
+ Sequence,
14
+ Tuple,
15
+ TypeVar,
16
+ Union,
17
+ )
18
+
19
+ import numpy as np
20
+
21
+ from ray._private.utils import _get_pyarrow_version
22
+ from ray.air.constants import TENSOR_COLUMN_NAME
23
+ from ray.air.util.tensor_extensions.arrow import (
24
+ convert_to_pyarrow_array,
25
+ pyarrow_table_from_pydict,
26
+ )
27
+ from ray.data._internal.arrow_ops import transform_polars, transform_pyarrow
28
+ from ray.data._internal.numpy_support import convert_to_numpy
29
+ from ray.data._internal.row import TableRow
30
+ from ray.data._internal.table_block import TableBlockAccessor, TableBlockBuilder
31
+ from ray.data._internal.util import NULL_SENTINEL, find_partitions, keys_equal
32
+ from ray.data.block import (
33
+ Block,
34
+ BlockAccessor,
35
+ BlockExecStats,
36
+ BlockMetadata,
37
+ BlockType,
38
+ KeyType,
39
+ U,
40
+ )
41
+ from ray.data.context import DataContext
42
+
43
+ try:
44
+ import pyarrow
45
+ except ImportError:
46
+ pyarrow = None
47
+
48
+
49
+ if TYPE_CHECKING:
50
+ import pandas
51
+
52
+ from ray.data._internal.planner.exchange.sort_task_spec import SortKey
53
+ from ray.data.aggregate import AggregateFn
54
+
55
+
56
+ T = TypeVar("T")
57
+ logger = logging.getLogger(__name__)
58
+
59
+
60
+ # We offload some transformations to polars for performance.
61
def get_sort_transform(context: DataContext) -> Callable:
    """Return the sort implementation: polars when enabled, else pyarrow."""
    return transform_polars.sort if context.use_polars else transform_pyarrow.sort
66
+
67
+
68
def get_concat_and_sort_transform(context: DataContext) -> Callable:
    """Return the concat-and-sort implementation, honoring the polars flag."""
    if context.use_polars:
        return transform_polars.concat_and_sort
    return transform_pyarrow.concat_and_sort
73
+
74
+
75
class ArrowRow(TableRow):
    """
    Row of a tabular Dataset backed by a Arrow Table block.
    """

    def __getitem__(self, key: Union[str, List[str]]) -> Any:
        """Look up one column value (str key) or several (list of keys).

        Returns None when the selection is empty; tensor-extension columns are
        materialized via ArrowBlockAccessor._build_tensor_row.
        """
        from ray.data.extensions import get_arrow_extension_tensor_types

        tensor_arrow_extension_types = get_arrow_extension_tensor_types()

        def get_item(keys: List[str]) -> Any:
            schema = self._row.schema
            # Only the first key's type is inspected: if it is a tensor
            # extension type, all requested keys are treated as tensors.
            if isinstance(schema.field(keys[0]).type, tensor_arrow_extension_types):
                # Build a tensor row.
                return tuple(
                    [
                        ArrowBlockAccessor._build_tensor_row(self._row, col_name=key)
                        for key in keys
                    ]
                )

            table = self._row.select(keys)
            if len(table) == 0:
                return None

            # First (only) element of each selected column.
            items = [col[0] for col in table.columns]
            try:
                # Try to interpret this as a pyarrow.Scalar value.
                return tuple([item.as_py() for item in items])

            except AttributeError:
                # Assume that this row is an element of an extension array, and
                # that it is bypassing pyarrow's scalar model for Arrow < 8.0.0.
                return items

        is_single_item = isinstance(key, str)
        keys = [key] if is_single_item else key

        items = get_item(keys)

        if items is None:
            return None
        elif is_single_item:
            # Unwrap the 1-tuple for a scalar lookup.
            return items[0]
        else:
            return items

    def __iter__(self) -> Iterator:
        """Iterate over the row's column names (dict-like behavior)."""
        for k in self._row.column_names:
            yield k

    def __len__(self):
        """Number of columns in the row."""
        return self._row.num_columns
128
+
129
+
130
class ArrowBlockBuilder(TableBlockBuilder):
    """Block builder that produces pyarrow.Table blocks."""

    def __init__(self):
        if pyarrow is None:
            raise ImportError("Run `pip install pyarrow` for Arrow support")
        super().__init__((pyarrow.Table, bytes))

    @staticmethod
    def _table_from_pydict(columns: Dict[str, List[Any]]) -> Block:
        # Convert each column to numpy first, then to an Arrow array.
        arrays: Dict[str, pyarrow.Array] = {
            name: convert_to_pyarrow_array(convert_to_numpy(vals), name)
            for name, vals in columns.items()
        }
        return pyarrow_table_from_pydict(arrays)

    @staticmethod
    def _concat_tables(tables: List[Block]) -> Block:
        return transform_pyarrow.concat(tables)

    @staticmethod
    def _concat_would_copy() -> bool:
        return False

    @staticmethod
    def _empty_table() -> "pyarrow.Table":
        return pyarrow_table_from_pydict({})

    def block_type(self) -> BlockType:
        return BlockType.ARROW
161
+
162
+
163
+ class ArrowBlockAccessor(TableBlockAccessor):
164
+ ROW_TYPE = ArrowRow
165
+
166
+ def __init__(self, table: "pyarrow.Table"):
167
+ if pyarrow is None:
168
+ raise ImportError("Run `pip install pyarrow` for Arrow support")
169
+ super().__init__(table)
170
+
171
+ def column_names(self) -> List[str]:
172
+ return self._table.column_names
173
+
174
+ def append_column(self, name: str, data: Any) -> Block:
175
+ assert name not in self._table.column_names
176
+
177
+ if any(isinstance(item, np.ndarray) for item in data):
178
+ raise NotImplementedError(
179
+ f"`{self.__class__.__name__}.append_column()` doesn't support "
180
+ "array-like data."
181
+ )
182
+
183
+ return self._table.append_column(name, [data])
184
+
185
+ @classmethod
186
+ def from_bytes(cls, data: bytes) -> "ArrowBlockAccessor":
187
+ reader = pyarrow.ipc.open_stream(data)
188
+ return cls(reader.read_all())
189
+
190
    @staticmethod
    def _build_tensor_row(
        row: ArrowRow, col_name: str = TENSOR_COLUMN_NAME
    ) -> np.ndarray:
        """Extract the tensor element of ``row``'s ``col_name`` as an ndarray.

        Handles the differing element representations across pyarrow
        versions (< 8.0.0, 8.x, and >= 9.0.0).
        """
        from packaging.version import parse as parse_version

        element = row[col_name][0]
        # TODO(Clark): Reduce this to np.asarray(element) once we only support Arrow
        # 9.0.0+.
        pyarrow_version = _get_pyarrow_version()
        if pyarrow_version is not None:
            pyarrow_version = parse_version(pyarrow_version)
        # A None version means the version could not be determined; treat it
        # like a recent Arrow.
        if pyarrow_version is None or pyarrow_version >= parse_version("8.0.0"):
            assert isinstance(element, pyarrow.ExtensionScalar)
            if pyarrow_version is None or pyarrow_version >= parse_version("9.0.0"):
                # For Arrow 9.0.0+, accessing an element in a chunked tensor array
                # produces an ArrowTensorScalar, which we convert to an ndarray using
                # .as_py().
                element = element.as_py()
            else:
                # For Arrow 8.*, accessing an element in a chunked tensor array produces
                # an ExtensionScalar, which we convert to an ndarray using our custom
                # method.
                element = element.type._extension_scalar_to_ndarray(element)
        # For Arrow < 8.0.0, accessing an element in a chunked tensor array produces an
        # ndarray, which we return directly.
        assert isinstance(element, np.ndarray), type(element)
        return element
218
+
219
+ def slice(self, start: int, end: int, copy: bool = False) -> "pyarrow.Table":
220
+ view = self._table.slice(start, end - start)
221
+ if copy:
222
+ view = transform_pyarrow.combine_chunks(view)
223
+ return view
224
+
225
+ def random_shuffle(self, random_seed: Optional[int]) -> "pyarrow.Table":
226
+ # TODO(swang): Creating this np.array index can add a lot of memory
227
+ # pressure when there are a large number of small rows. Investigate
228
+ # random shuffling in place to reduce memory pressure.
229
+ # See https://github.com/ray-project/ray/issues/42146.
230
+ random = np.random.RandomState(random_seed)
231
+ return self.take(random.permutation(self.num_rows()))
232
+
233
+ def schema(self) -> "pyarrow.lib.Schema":
234
+ return self._table.schema
235
+
236
+ def to_pandas(self) -> "pandas.DataFrame":
237
+ from ray.air.util.data_batch_conversion import _cast_tensor_columns_to_ndarrays
238
+
239
+ df = self._table.to_pandas()
240
+ ctx = DataContext.get_current()
241
+ if ctx.enable_tensor_extension_casting:
242
+ df = _cast_tensor_columns_to_ndarrays(df)
243
+ return df
244
+
245
+ def to_numpy(
246
+ self, columns: Optional[Union[str, List[str]]] = None
247
+ ) -> Union[np.ndarray, Dict[str, np.ndarray]]:
248
+ if columns is None:
249
+ columns = self._table.column_names
250
+ should_be_single_ndarray = False
251
+ elif isinstance(columns, list):
252
+ should_be_single_ndarray = False
253
+ else:
254
+ columns = [columns]
255
+ should_be_single_ndarray = True
256
+
257
+ column_names_set = set(self._table.column_names)
258
+ for column in columns:
259
+ if column not in column_names_set:
260
+ raise ValueError(
261
+ f"Cannot find column {column}, available columns: "
262
+ f"{column_names_set}"
263
+ )
264
+
265
+ column_values_ndarrays = []
266
+
267
+ for col_name in columns:
268
+ col = self._table[col_name]
269
+
270
+ # Combine columnar values arrays to make these contiguous
271
+ # (making them compatible with numpy format)
272
+ combined_array = transform_pyarrow.combine_chunked_array(col)
273
+
274
+ column_values_ndarrays.append(
275
+ transform_pyarrow.to_numpy(combined_array, zero_copy_only=False)
276
+ )
277
+
278
+ if should_be_single_ndarray:
279
+ assert len(columns) == 1
280
+ return column_values_ndarrays[0]
281
+ else:
282
+ return dict(zip(columns, column_values_ndarrays))
283
+
284
+ def to_arrow(self) -> "pyarrow.Table":
285
+ return self._table
286
+
287
+ def num_rows(self) -> int:
288
+ # Arrow may represent an empty table via an N > 0 row, 0-column table, e.g. when
289
+ # slicing an empty table, so we return 0 if num_columns == 0.
290
+ return self._table.num_rows if self._table.num_columns > 0 else 0
291
+
292
+ def size_bytes(self) -> int:
293
+ return self._table.nbytes
294
+
295
+ def _zip(self, acc: BlockAccessor) -> "Block":
296
+ r = self.to_arrow()
297
+ s = acc.to_arrow()
298
+ for col_name in s.column_names:
299
+ col = s.column(col_name)
300
+ # Ensure the column names are unique after zip.
301
+ if col_name in r.column_names:
302
+ i = 1
303
+ new_name = col_name
304
+ while new_name in r.column_names:
305
+ new_name = "{}_{}".format(col_name, i)
306
+ i += 1
307
+ col_name = new_name
308
+ r = r.append_column(col_name, col)
309
+ return r
310
+
311
+ @staticmethod
312
+ def builder() -> ArrowBlockBuilder:
313
+ return ArrowBlockBuilder()
314
+
315
+ @staticmethod
316
+ def _empty_table() -> "pyarrow.Table":
317
+ return ArrowBlockBuilder._empty_table()
318
+
319
+ def take(
320
+ self,
321
+ indices: Union[List[int], "pyarrow.Array", "pyarrow.ChunkedArray"],
322
+ ) -> "pyarrow.Table":
323
+ """Select rows from the underlying table.
324
+
325
+ This method is an alternative to pyarrow.Table.take(), which breaks for
326
+ extension arrays.
327
+ """
328
+ return transform_pyarrow.take_table(self._table, indices)
329
+
330
+ def select(self, columns: List[str]) -> "pyarrow.Table":
331
+ if not all(isinstance(col, str) for col in columns):
332
+ raise ValueError(
333
+ "Columns must be a list of column name strings when aggregating on "
334
+ f"Arrow blocks, but got: {columns}."
335
+ )
336
+ return self._table.select(columns)
337
+
338
+ def rename_columns(self, columns_rename: Dict[str, str]) -> "pyarrow.Table":
339
+ return self._table.rename_columns(columns_rename)
340
+
341
+ def _sample(self, n_samples: int, sort_key: "SortKey") -> "pyarrow.Table":
342
+ indices = random.sample(range(self._table.num_rows), n_samples)
343
+ table = self._table.select(sort_key.get_columns())
344
+ return transform_pyarrow.take_table(table, indices)
345
+
346
+ def count(self, on: str) -> Optional[U]:
347
+ """Count the number of non-null values in the provided column."""
348
+ import pyarrow.compute as pac
349
+
350
+ if not isinstance(on, str):
351
+ raise ValueError(
352
+ "on must be a string when aggregating on Arrow blocks, but got:"
353
+ f"{type(on)}."
354
+ )
355
+
356
+ if self.num_rows() == 0:
357
+ return None
358
+
359
+ col = self._table[on]
360
+ return pac.count(col).as_py()
361
+
362
+ def _apply_arrow_compute(
363
+ self, compute_fn: Callable, on: str, ignore_nulls: bool
364
+ ) -> Optional[U]:
365
+ """Helper providing null handling around applying an aggregation to a column."""
366
+ import pyarrow as pa
367
+
368
+ if not isinstance(on, str):
369
+ raise ValueError(
370
+ "on must be a string when aggregating on Arrow blocks, but got:"
371
+ f"{type(on)}."
372
+ )
373
+
374
+ if self.num_rows() == 0:
375
+ return None
376
+
377
+ col = self._table[on]
378
+ if pa.types.is_null(col.type):
379
+ return None
380
+ else:
381
+ return compute_fn(col, skip_nulls=ignore_nulls).as_py()
382
+
383
+ def sum(self, on: str, ignore_nulls: bool) -> Optional[U]:
384
+ import pyarrow.compute as pac
385
+
386
+ return self._apply_arrow_compute(pac.sum, on, ignore_nulls)
387
+
388
+ def min(self, on: str, ignore_nulls: bool) -> Optional[U]:
389
+ import pyarrow.compute as pac
390
+
391
+ return self._apply_arrow_compute(pac.min, on, ignore_nulls)
392
+
393
+ def max(self, on: str, ignore_nulls: bool) -> Optional[U]:
394
+ import pyarrow.compute as pac
395
+
396
+ return self._apply_arrow_compute(pac.max, on, ignore_nulls)
397
+
398
+ def mean(self, on: str, ignore_nulls: bool) -> Optional[U]:
399
+ import pyarrow.compute as pac
400
+
401
+ return self._apply_arrow_compute(pac.mean, on, ignore_nulls)
402
+
403
+ def sum_of_squared_diffs_from_mean(
404
+ self,
405
+ on: str,
406
+ ignore_nulls: bool,
407
+ mean: Optional[U] = None,
408
+ ) -> Optional[U]:
409
+ import pyarrow.compute as pac
410
+
411
+ if mean is None:
412
+ # If precomputed mean not given, we compute it ourselves.
413
+ mean = self.mean(on, ignore_nulls)
414
+ if mean is None:
415
+ return None
416
+ return self._apply_arrow_compute(
417
+ lambda col, skip_nulls: pac.sum(
418
+ pac.power(pac.subtract(col, mean), 2),
419
+ skip_nulls=skip_nulls,
420
+ ),
421
+ on,
422
+ ignore_nulls,
423
+ )
424
+
425
+ def sort_and_partition(
426
+ self, boundaries: List[T], sort_key: "SortKey"
427
+ ) -> List["Block"]:
428
+ if self._table.num_rows == 0:
429
+ # If the pyarrow table is empty we may not have schema
430
+ # so calling sort_indices() will raise an error.
431
+ return [self._empty_table() for _ in range(len(boundaries) + 1)]
432
+
433
+ context = DataContext.get_current()
434
+ sort = get_sort_transform(context)
435
+
436
+ table = sort(self._table, sort_key)
437
+ if len(boundaries) == 0:
438
+ return [table]
439
+ return find_partitions(table, boundaries, sort_key)
440
+
441
+ def combine(self, sort_key: "SortKey", aggs: Tuple["AggregateFn"]) -> Block:
442
+ """Combine rows with the same key into an accumulator.
443
+
444
+ This assumes the block is already sorted by key in ascending order.
445
+
446
+ Args:
447
+ sort_key: A column name or list of column names.
448
+ If this is ``None``, place all rows in a single group.
449
+
450
+ aggs: The aggregations to do.
451
+
452
+ Returns:
453
+ A sorted block of [k, v_1, ..., v_n] columns where k is the groupby
454
+ key and v_i is the partially combined accumulator for the ith given
455
+ aggregation.
456
+ If key is None then the k column is omitted.
457
+ """
458
+ keys: List[str] = sort_key.get_columns()
459
+
460
+ def iter_groups() -> Iterator[Tuple[Sequence[KeyType], Block]]:
461
+ """Creates an iterator over zero-copy group views."""
462
+ if not keys:
463
+ # Global aggregation consists of a single "group", so we short-circuit.
464
+ yield tuple(), self.to_block()
465
+ return
466
+
467
+ start = end = 0
468
+ iter = self.iter_rows(public_row_format=False)
469
+ next_row = None
470
+ while True:
471
+ try:
472
+ if next_row is None:
473
+ next_row = next(iter)
474
+ next_keys = next_row[keys]
475
+ while keys_equal(next_row[keys], next_keys):
476
+ end += 1
477
+ try:
478
+ next_row = next(iter)
479
+ except StopIteration:
480
+ next_row = None
481
+ break
482
+ yield next_keys, self.slice(start, end)
483
+ start = end
484
+ except StopIteration:
485
+ break
486
+
487
+ builder = ArrowBlockBuilder()
488
+ for group_keys, group_view in iter_groups():
489
+ # Aggregate.
490
+ init_vals = group_keys
491
+ if len(group_keys) == 1:
492
+ init_vals = group_keys[0]
493
+
494
+ accumulators = [agg.init(init_vals) for agg in aggs]
495
+ for i in range(len(aggs)):
496
+ accumulators[i] = aggs[i].accumulate_block(accumulators[i], group_view)
497
+
498
+ # Build the row.
499
+ row = {}
500
+ if keys:
501
+ for k, gk in zip(keys, group_keys):
502
+ row[k] = gk
503
+
504
+ count = collections.defaultdict(int)
505
+ for agg, accumulator in zip(aggs, accumulators):
506
+ name = agg.name
507
+ # Check for conflicts with existing aggregation name.
508
+ if count[name] > 0:
509
+ name = self._munge_conflict(name, count[name])
510
+ count[name] += 1
511
+ row[name] = accumulator
512
+
513
+ builder.add(row)
514
+
515
+ return builder.build()
516
+
517
+ @staticmethod
518
+ def merge_sorted_blocks(
519
+ blocks: List[Block], sort_key: "SortKey"
520
+ ) -> Tuple[Block, BlockMetadata]:
521
+ stats = BlockExecStats.builder()
522
+ blocks = [b for b in blocks if b.num_rows > 0]
523
+ if len(blocks) == 0:
524
+ ret = ArrowBlockAccessor._empty_table()
525
+ else:
526
+ # Handle blocks of different types.
527
+ blocks = TableBlockAccessor.normalize_block_types(blocks, "arrow")
528
+ concat_and_sort = get_concat_and_sort_transform(DataContext.get_current())
529
+ ret = concat_and_sort(blocks, sort_key)
530
+ return ret, ArrowBlockAccessor(ret).get_metadata(exec_stats=stats.build())
531
+
532
    @staticmethod
    def aggregate_combined_blocks(
        blocks: List[Block],
        sort_key: "SortKey",
        aggs: Tuple["AggregateFn"],
        finalize: bool,
    ) -> Tuple[Block, BlockMetadata]:
        """Aggregate sorted, partially combined blocks with the same key range.

        This assumes blocks are already sorted by key in ascending order,
        so we can do merge sort to get all the rows with the same key.

        Args:
            blocks: A list of partially combined and sorted blocks.
            sort_key: The column name of key or None for global aggregation.
            aggs: The aggregations to do.
            finalize: Whether to finalize the aggregation. This is used as an
                optimization for cases where we repeatedly combine partially
                aggregated groups.

        Returns:
            A block of [k, v_1, ..., v_n] columns and its metadata where k is
            the groupby key and v_i is the corresponding aggregation result for
            the ith given aggregation.
            If key is None then the k column is omitted.
        """

        stats = BlockExecStats.builder()
        keys = sort_key.get_columns()

        def key_fn(r):
            # Global aggregation (no keys) maps every row to one shared group.
            if keys:
                return tuple(r[keys])
            else:
                return (0,)

        # Replace Nones with NULL_SENTINEL to ensure safe sorting.
        def key_fn_with_null_sentinel(r):
            values = key_fn(r)
            return [NULL_SENTINEL if v is None else v for v in values]

        # Handle blocks of different types.
        blocks = TableBlockAccessor.normalize_block_types(blocks, "arrow")

        # Merge-sort the per-block row iterators so that rows with equal keys
        # become adjacent across all input blocks.
        # NOTE(review): the local name `iter` shadows the builtin `iter`.
        iter = heapq.merge(
            *[
                ArrowBlockAccessor(block).iter_rows(public_row_format=False)
                for block in blocks
            ],
            key=key_fn_with_null_sentinel,
        )
        next_row = None
        builder = ArrowBlockBuilder()
        while True:
            try:
                if next_row is None:
                    next_row = next(iter)
                next_keys = key_fn(next_row)
                next_key_columns = keys

                def gen():
                    # Yields the run of rows sharing `next_keys`; on exit,
                    # `next_row` holds the first row of the following group
                    # (or None when the merged iterator is exhausted).
                    nonlocal iter
                    nonlocal next_row
                    while keys_equal(key_fn(next_row), next_keys):
                        yield next_row
                        try:
                            next_row = next(iter)
                        except StopIteration:
                            next_row = None
                            break

                # Merge.
                first = True
                accumulators = [None] * len(aggs)
                resolved_agg_names = [None] * len(aggs)
                for r in gen():
                    if first:
                        # First row of the group: resolve (possibly
                        # conflicting) aggregation column names and seed the
                        # accumulators from this row's values.
                        count = collections.defaultdict(int)
                        for i in range(len(aggs)):
                            name = aggs[i].name
                            # Check for conflicts with existing aggregation
                            # name.
                            if count[name] > 0:
                                name = ArrowBlockAccessor._munge_conflict(
                                    name, count[name]
                                )
                            count[name] += 1
                            resolved_agg_names[i] = name
                            accumulators[i] = r[name]
                        first = False
                    else:
                        # Subsequent rows: merge their partial accumulators in.
                        for i in range(len(aggs)):
                            accumulators[i] = aggs[i].merge(
                                accumulators[i], r[resolved_agg_names[i]]
                            )
                # Build the row.
                row = {}
                if keys:
                    for col_name, next_key in zip(next_key_columns, next_keys):
                        row[col_name] = next_key

                for agg, agg_name, accumulator in zip(
                    aggs, resolved_agg_names, accumulators
                ):
                    if finalize:
                        row[agg_name] = agg.finalize(accumulator)
                    else:
                        row[agg_name] = accumulator

                builder.add(row)
            except StopIteration:
                break

        ret = builder.build()
        return ret, ArrowBlockAccessor(ret).get_metadata(exec_stats=stats.build())
647
+
648
    def block_type(self) -> BlockType:
        """Return the block type of this accessor (always ``BlockType.ARROW``)."""
        return BlockType.ARROW
.venv/lib/python3.11/site-packages/ray/data/_internal/batcher.py ADDED
@@ -0,0 +1,325 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Optional
2
+
3
+ from ray.data._internal.arrow_block import ArrowBlockAccessor
4
+ from ray.data._internal.arrow_ops import transform_pyarrow
5
+ from ray.data._internal.delegating_block_builder import DelegatingBlockBuilder
6
+ from ray.data.block import Block, BlockAccessor
7
+
8
# pyarrow.Table.slice is slow when the table has many chunks
# so we combine chunks into a single one to make slice faster
# with the cost of an extra copy.
# See https://github.com/ray-project/ray/issues/31108 for more details.
# TODO(jjyao): remove this once
# https://github.com/apache/arrow/issues/35126 is resolved.
MIN_NUM_CHUNKS_TO_TRIGGER_COMBINE_CHUNKS = 10

# Delay compaction until the shuffle buffer has reached this ratio over the min
# shuffle buffer size. Setting this to 1 minimizes memory usage, at the cost of
# frequent compactions. Setting this to higher values increases memory usage but
# reduces compaction frequency.
SHUFFLE_BUFFER_COMPACTION_RATIO = 1.5
21
+
22
+
23
class BatcherInterface:
    """Interface for batchers that buffer blocks and emit them as batches."""

    def add(self, block: Block):
        """Add a block to the block buffer.

        Args:
            block: Block to add to the block buffer.
        """
        raise NotImplementedError()

    def done_adding(self) -> None:
        """Indicate to the batcher that no more blocks will be added to the buffer."""
        # NOTE(review): return annotation corrected from `-> bool`;
        # implementations of this method return nothing.
        raise NotImplementedError()

    def has_batch(self) -> bool:
        """Whether this Batcher has any full batches."""
        raise NotImplementedError()

    def has_any(self) -> bool:
        """Whether this Batcher has any data."""
        raise NotImplementedError()

    def next_batch(self) -> Block:
        """Get the next batch from the block buffer.

        Returns:
            A batch represented as a Block.
        """
        raise NotImplementedError()
51
+
52
+
53
class Batcher(BatcherInterface):
    """Chunks blocks into batches."""

    # Implementation Note: When there are multiple batches per block, this batcher will
    # slice off and return each batch and add the remaining block back to the buffer
    # instead of optimally slicing and returning all batches from the block at once.
    # This will result in extra (and nested) block slicing. However, since slices are
    # zero-copy views, we sacrifice what should be a small performance hit for better
    # readability.

    def __init__(self, batch_size: Optional[int], ensure_copy: bool = False):
        """
        Construct a batcher that yields batches of batch_sizes rows.

        Args:
            batch_size: The size of batches to yield.
            ensure_copy: Whether batches are always copied from the underlying base
                blocks (not zero-copy views).
        """
        self._batch_size = batch_size
        # Buffered blocks, in arrival order.
        self._buffer = []
        # Total number of rows currently held across all buffered blocks.
        self._buffer_size = 0
        self._done_adding = False
        self._ensure_copy = ensure_copy

    def add(self, block: Block):
        """Add a block to the block buffer.

        Note empty block is not added to buffer.

        Args:
            block: Block to add to the block buffer.
        """
        # Hoisted: compute num_rows once instead of constructing the
        # BlockAccessor twice as the original did.
        num_rows = BlockAccessor.for_block(block).num_rows()
        if num_rows > 0:
            self._buffer.append(block)
            self._buffer_size += num_rows

    def done_adding(self) -> None:
        """Indicate to the batcher that no more blocks will be added to the batcher."""
        # NOTE: return annotation fixed from `-> bool`; this returns nothing.
        self._done_adding = True

    def has_batch(self) -> bool:
        """Whether this Batcher has any full batches."""
        return self.has_any() and (
            self._batch_size is None or self._buffer_size >= self._batch_size
        )

    def has_any(self) -> bool:
        """Whether this Batcher has any data."""
        return self._buffer_size > 0

    def next_batch(self) -> Block:
        """Get the next batch from the block buffer.

        Returns:
            A batch represented as a Block.
        """
        # A full batch must be available, unless this is the final (possibly
        # short) batch after done_adding().
        assert self.has_batch() or (self._done_adding and self.has_any())
        needs_copy = self._ensure_copy
        # If no batch size, short-circuit.
        if self._batch_size is None:
            assert len(self._buffer) == 1
            block = self._buffer[0]
            if needs_copy:
                # Copy block if needing to ensure fresh batch copy.
                block = BlockAccessor.for_block(block)
                block = block.slice(0, block.num_rows(), copy=True)
            self._buffer = []
            self._buffer_size = 0
            return block
        output = DelegatingBlockBuilder()
        leftover = []
        needed = self._batch_size
        for block in self._buffer:
            accessor = BlockAccessor.for_block(block)
            if needed <= 0:
                # We already have a full batch, so add this block to
                # the leftovers.
                leftover.append(block)
            elif accessor.num_rows() <= needed:
                output.add_block(accessor.to_block())
                needed -= accessor.num_rows()
            else:
                if (
                    isinstance(accessor, ArrowBlockAccessor)
                    and block.num_columns > 0
                    and block.column(0).num_chunks
                    >= MIN_NUM_CHUNKS_TO_TRIGGER_COMBINE_CHUNKS
                ):
                    # Slicing a heavily-chunked Arrow table is slow; combine
                    # chunks first (one extra copy, faster slicing).
                    accessor = BlockAccessor.for_block(
                        transform_pyarrow.combine_chunks(block)
                    )
                # We only need part of the block to fill out a batch.
                output.add_block(accessor.slice(0, needed, copy=False))
                # Add the rest of the block to the leftovers.
                leftover.append(accessor.slice(needed, accessor.num_rows(), copy=False))
                needed = 0

        # Move the leftovers into the block buffer so they're the first
        # blocks consumed on the next batch extraction.
        self._buffer = leftover
        # FIX: clamp at zero. The final batch after done_adding() may hold
        # fewer than batch_size rows; the original unclamped subtraction left
        # this counter negative in that case, corrupting the row count if the
        # batcher were (incorrectly) reused.
        self._buffer_size = max(0, self._buffer_size - self._batch_size)
        needs_copy = needs_copy and not output.will_build_yield_copy()
        batch = output.build()
        if needs_copy:
            # Need to ensure that the batch is a fresh copy.
            batch = BlockAccessor.for_block(batch)
            batch = batch.slice(0, batch.num_rows(), copy=True)
        return batch
162
+
163
+
164
class ShufflingBatcher(BatcherInterface):
    """Chunks blocks into shuffled batches, using a local in-memory shuffle buffer."""

    # Implementation Note:
    #
    # This shuffling batcher lazily builds a shuffle buffer from added blocks, and once
    # a batch is requested via .next_batch(), it concatenates the blocks into a concrete
    # shuffle buffer and randomly shuffles the entire buffer.
    #
    # Adding of more blocks can be intermixed with retrieving batches, but it should be
    # noted that we can end up performing two expensive operations on each retrieval:
    # 1. Build added blocks into a concrete shuffle buffer.
    # 2. Shuffling the entire buffer.
    # To amortize the overhead of this process, we only shuffle the blocks after a
    # delay designated by SHUFFLE_BUFFER_COMPACTION_RATIO.
    #
    # Similarly, adding blocks is very cheap. Each added block will be appended to a
    # list, with concatenation of the underlying data delayed until the next batch
    # compaction.

    def __init__(
        self,
        batch_size: Optional[int],
        shuffle_buffer_min_size: int,
        shuffle_seed: Optional[int] = None,
    ):
        """Constructs a random-shuffling block batcher.

        Args:
            batch_size: Record batch size.
            shuffle_buffer_min_size: Minimum number of rows that must be in the local
                in-memory shuffle buffer in order to yield a batch. When there are no
                more rows to be added to the buffer, the number of rows in the buffer
                *will* decrease below this value while yielding the remaining batches,
                and the final batch may have less than ``batch_size`` rows. Increasing
                this will improve the randomness of the shuffle but may increase the
                latency to the first batch.
            shuffle_seed: The seed to use for the local random shuffle.

        Raises:
            ValueError: If ``batch_size`` is None.
        """
        if batch_size is None:
            raise ValueError("Must specify a batch_size if using a local shuffle.")
        self._batch_size = batch_size
        self._shuffle_seed = shuffle_seed
        if shuffle_buffer_min_size < batch_size:
            # Round it up internally to `batch_size` since our algorithm requires it.
            # This is harmless since it only offers extra randomization.
            shuffle_buffer_min_size = batch_size
        self._buffer_min_size = shuffle_buffer_min_size
        # Holds newly added (not yet shuffled) blocks.
        self._builder = DelegatingBlockBuilder()
        # The materialized, already-shuffled buffer (None until first compaction).
        self._shuffle_buffer: Block = None
        # Read cursor into the materialized buffer; rows before it were yielded.
        self._batch_head = 0
        self._done_adding = False

    def add(self, block: Block):
        """Add a block to the shuffle buffer.

        Note empty block is not added to buffer.

        Args:
            block: Block to add to the shuffle buffer.
        """
        if BlockAccessor.for_block(block).num_rows() > 0:
            self._builder.add_block(block)

    def done_adding(self) -> None:
        """Indicate to the batcher that no more blocks will be added to the batcher.

        No more blocks should be added to the batcher after calling this.
        """
        # NOTE(review): return annotation corrected from `-> bool`.
        self._done_adding = True

    def has_any(self) -> bool:
        """Whether this batcher has any data."""
        return self._buffer_size() > 0

    def has_batch(self) -> bool:
        """Whether this batcher has any batches."""
        buffer_size = self._buffer_size()

        if not self._done_adding:
            # Delay pulling of batches until the buffer is large enough in order to
            # amortize compaction overhead.
            return self._materialized_buffer_size() >= self._buffer_min_size or (
                buffer_size - self._batch_size
                >= self._buffer_min_size * SHUFFLE_BUFFER_COMPACTION_RATIO
            )
        else:
            # Input is exhausted: drain whatever remains, batch by batch.
            return buffer_size >= self._batch_size

    def _buffer_size(self) -> int:
        """Return shuffle buffer size (pending builder rows + materialized rows)."""
        buffer_size = self._builder.num_rows()
        buffer_size += self._materialized_buffer_size()
        return buffer_size

    def _materialized_buffer_size(self) -> int:
        """Return materialized (compacted portion of) shuffle buffer size."""
        if self._shuffle_buffer is None:
            return 0
        # The size of the concrete (materialized) shuffle buffer, adjusting
        # for the batch head position, which also serves as a counter of the number
        # of already-yielded rows from the current concrete shuffle buffer.
        return max(
            0,
            BlockAccessor.for_block(self._shuffle_buffer).num_rows() - self._batch_head,
        )

    def next_batch(self) -> Block:
        """Get the next shuffled batch from the shuffle buffer.

        Returns:
            A batch represented as a Block.
        """
        assert self.has_batch() or (self._done_adding and self.has_any())
        # Add rows in the builder to the shuffle buffer. Note that we delay compaction
        # as much as possible to amortize the concatenation overhead. Compaction is
        # only necessary when the materialized buffer size falls below the min size.
        if self._builder.num_rows() > 0 and (
            self._done_adding
            or self._materialized_buffer_size() <= self._buffer_min_size
        ):
            if self._shuffle_buffer is not None:
                if self._batch_head > 0:
                    # Compact the materialized shuffle buffer.
                    block = BlockAccessor.for_block(self._shuffle_buffer)
                    self._shuffle_buffer = block.slice(
                        self._batch_head, block.num_rows()
                    )
                # Add the unyielded rows from the existing shuffle buffer.
                self._builder.add_block(self._shuffle_buffer)
            # Build the new shuffle buffer.
            self._shuffle_buffer = self._builder.build()
            self._shuffle_buffer = BlockAccessor.for_block(
                self._shuffle_buffer
            ).random_shuffle(self._shuffle_seed)
            if self._shuffle_seed is not None:
                # Advance the seed so subsequent shuffles differ deterministically.
                self._shuffle_seed += 1
            if (
                isinstance(
                    BlockAccessor.for_block(self._shuffle_buffer), ArrowBlockAccessor
                )
                and self._shuffle_buffer.num_columns > 0
                and self._shuffle_buffer.column(0).num_chunks
                >= MIN_NUM_CHUNKS_TO_TRIGGER_COMBINE_CHUNKS
            ):
                # Combine chunks so subsequent slice() calls stay fast.
                self._shuffle_buffer = transform_pyarrow.combine_chunks(
                    self._shuffle_buffer
                )
            # Reset the builder.
            self._builder = DelegatingBlockBuilder()
            self._batch_head = 0

        assert self._shuffle_buffer is not None
        buffer_size = BlockAccessor.for_block(self._shuffle_buffer).num_rows()
        # Truncate the batch to the buffer size, if necessary.
        batch_size = min(self._batch_size, buffer_size)
        slice_start = self._batch_head
        self._batch_head += batch_size
        # Yield the shuffled batch.
        return BlockAccessor.for_block(self._shuffle_buffer).slice(
            slice_start, self._batch_head
        )
.venv/lib/python3.11/site-packages/ray/data/_internal/block_batching/block_batching.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from contextlib import nullcontext
2
+ from typing import Callable, Iterator, Optional, TypeVar
3
+
4
+ from ray.data._internal.block_batching.util import (
5
+ blocks_to_batches,
6
+ collate,
7
+ extract_data_from_batch,
8
+ format_batches,
9
+ )
10
+ from ray.data._internal.stats import DatasetStats
11
+ from ray.data.block import Block, DataBatch
12
+
13
+ T = TypeVar("T")
14
+
15
+
16
+ def batch_blocks(
17
+ blocks: Iterator[Block],
18
+ *,
19
+ stats: Optional[DatasetStats] = None,
20
+ batch_size: Optional[int] = None,
21
+ batch_format: str = "default",
22
+ drop_last: bool = False,
23
+ collate_fn: Optional[Callable[[DataBatch], DataBatch]] = None,
24
+ shuffle_buffer_min_size: Optional[int] = None,
25
+ shuffle_seed: Optional[int] = None,
26
+ ensure_copy: bool = False,
27
+ ) -> Iterator[DataBatch]:
28
+ """Create formatted batches of data from 1 or more blocks.
29
+
30
+ This function takes in an iterator of already fetched blocks. Consequently, this
31
+ function doesn't support block prefetching.
32
+ """
33
+
34
+ def _iterator_fn(base_iterator: Iterator[Block]) -> Iterator[DataBatch]:
35
+ batch_iter = format_batches(
36
+ blocks_to_batches(
37
+ block_iter=base_iterator,
38
+ stats=stats,
39
+ batch_size=batch_size,
40
+ drop_last=drop_last,
41
+ shuffle_buffer_min_size=shuffle_buffer_min_size,
42
+ shuffle_seed=shuffle_seed,
43
+ ensure_copy=ensure_copy,
44
+ ),
45
+ batch_format=batch_format,
46
+ stats=stats,
47
+ )
48
+
49
+ if collate_fn is not None:
50
+ batch_iter = collate(batch_iter, collate_fn=collate_fn, stats=stats)
51
+
52
+ batch_iter = extract_data_from_batch(batch_iter)
53
+ yield from batch_iter
54
+
55
+ batch_iter = _iterator_fn(blocks)
56
+
57
+ for formatted_batch in batch_iter:
58
+ user_timer = stats.iter_user_s.timer() if stats else nullcontext()
59
+ with user_timer:
60
+ yield formatted_batch
.venv/lib/python3.11/site-packages/ray/data/_internal/block_batching/interfaces.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import abc
2
+ from dataclasses import dataclass
3
+ from typing import Any, List
4
+
5
+ from ray.data.block import Block, DataBatch
6
+ from ray.types import ObjectRef
7
+
8
+
9
@dataclass
class Batch:
    """A batch of data with a corresponding index.

    Attributes:
        batch_idx: The global index of this batch so that downstream operations can
            maintain ordering.
        data: The batch of data.
    """

    # Global, order-preserving index of this batch.
    batch_idx: int
    # The batch payload itself.
    data: DataBatch
21
+
22
+
23
class CollatedBatch(Batch):
    """A batch of collated data with a corresponding index.

    Attributes:
        batch_idx: The global index of this batch so that downstream operations can
            maintain ordering.
        data: The batch of data which is the output of a user provided collate_fn
            Therefore, the type of this data can be Any.
    """

    # Global, order-preserving index of this batch.
    batch_idx: int
    # Output of the user collate_fn; may be any type.
    data: Any
35
+
36
+
37
class BlockPrefetcher(metaclass=abc.ABCMeta):
    """Interface for prefetching blocks."""

    @abc.abstractmethod
    def prefetch_blocks(self, blocks: List[ObjectRef[Block]]):
        """Prefetch the provided blocks to this node.

        Args:
            blocks: Object references of the blocks to prefetch.
        """
        pass

    def stop(self):
        """Stop prefetching and release resources.

        Default implementation is a no-op; subclasses may override.
        """
        pass
.venv/lib/python3.11/site-packages/ray/data/_internal/block_builder.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Generic
2
+
3
+ from ray.data.block import Block, BlockAccessor, BlockType, T
4
+
5
+
6
class BlockBuilder(Generic[T]):
    """A builder class for blocks.

    Abstract interface; every method other than ``for_block`` raises
    NotImplementedError and must be provided by a concrete builder.
    """

    @staticmethod
    def for_block(block: Block) -> "BlockBuilder":
        """Return a builder matching the concrete type of ``block``."""
        return BlockAccessor.for_block(block).builder()

    def add(self, item: T) -> None:
        """Append a single row to the block being built."""
        raise NotImplementedError

    def add_block(self, block: Block) -> None:
        """Append an entire block to the block being built."""
        raise NotImplementedError

    def will_build_yield_copy(self) -> bool:
        """Whether building this block will yield a new block copy."""
        raise NotImplementedError

    def build(self) -> Block:
        """Build the block."""
        raise NotImplementedError

    def num_rows(self) -> int:
        """Return the number of rows added in the block."""
        raise NotImplementedError

    def get_estimated_memory_usage(self) -> int:
        """Return the estimated memory usage so far in bytes."""
        raise NotImplementedError

    def block_type(self) -> BlockType:
        """Return the block type."""
        raise NotImplementedError
.venv/lib/python3.11/site-packages/ray/data/_internal/block_list.py ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Iterator, List, Tuple
2
+
3
+ from ray.data._internal.memory_tracing import trace_allocation
4
+ from ray.data.block import Block, BlockMetadata
5
+ from ray.types import ObjectRef
6
+
7
+
8
class BlockList:
    """A list of blocks that may be computed or pending computation.

    All blocks are known ahead of time
    """

    def __init__(
        self,
        blocks: List[ObjectRef[Block]],
        metadata: List[BlockMetadata],
        *,
        owned_by_consumer: bool,
    ):
        assert len(blocks) == len(metadata), (blocks, metadata)
        for block_ref in blocks:
            trace_allocation(block_ref, "BlockList.__init__")
        self._blocks: List[ObjectRef[Block]] = blocks
        self._num_blocks = len(blocks)
        self._metadata: List[BlockMetadata] = metadata
        # Whether the block list is owned by consuming APIs, and if so it can be
        # eagerly deleted after read by the consumer.
        self._owned_by_consumer = owned_by_consumer
        # This field can be set to indicate the number of estimated output blocks,
        # since each read task may produce multiple output blocks after splitting.
        self._estimated_num_blocks = None

    def __repr__(self):
        return f"BlockList(owned_by_consumer={self._owned_by_consumer})"

    def get_metadata(self, fetch_if_missing: bool = False) -> List[BlockMetadata]:
        """Get the metadata for all blocks."""
        return list(self._metadata)

    def copy(self) -> "BlockList":
        """Perform a shallow copy of this BlockList."""
        return BlockList(
            self._blocks,
            self._metadata,
            owned_by_consumer=self._owned_by_consumer,
        )

    def clear(self) -> None:
        """Erase references to the tasks tracked by the BlockList."""
        self._blocks = None

    def is_cleared(self) -> bool:
        """Whether this BlockList has been cleared."""
        return self._blocks is None

    def _check_if_cleared(self) -> None:
        """Raise an error if this BlockList has been previously cleared."""
        if not self.is_cleared():
            return
        raise ValueError(
            "This Dataset's blocks have been moved, which means that you "
            "can no longer use this Dataset."
        )

    def get_blocks(self) -> List[ObjectRef[Block]]:
        """Get list of the blocks of this block list.

        This blocks on the execution of the tasks generating block outputs.
        The length of this iterator is not known until execution.
        """
        self._check_if_cleared()
        return list(self._blocks)

    def get_blocks_with_metadata(self) -> List[Tuple[ObjectRef[Block], BlockMetadata]]:
        """Bulk version of iter_blocks_with_metadata().

        Prefer calling this instead of the iter form for performance if you
        don't need lazy evaluation.
        """
        self.get_blocks()
        return list(self.iter_blocks_with_metadata())

    def iter_blocks_with_metadata(
        self,
    ) -> Iterator[Tuple[ObjectRef[Block], BlockMetadata]]:
        """Iterate over the blocks along with their runtime metadata.

        This blocks on the execution of the tasks generating block outputs.
        The length of this iterator is not known until execution.
        """
        self._check_if_cleared()
        return zip(self._blocks, self._metadata)

    def initial_num_blocks(self) -> int:
        """Returns the number of blocks of this BlockList."""
        return self._num_blocks

    def estimated_num_blocks(self) -> int:
        """Estimate of number of output blocks, without triggering actual execution."""
        return self._estimated_num_blocks or self._num_blocks
.venv/lib/python3.11/site-packages/ray/data/_internal/compute.py ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import logging
from typing import Any, Callable, Iterable, Optional, TypeVar, Union

from ray.data._internal.execution.interfaces import TaskContext
from ray.data.block import Block, UserDefinedFunction
from ray.util.annotations import DeveloperAPI

logger = logging.getLogger(__name__)

# Generic input/output element types used by transform-related signatures.
T = TypeVar("T")
U = TypeVar("U")


# Block transform function applied by task and actor pools.
BlockTransform = Union[
    # TODO(Clark): Once Ray only supports Python 3.8+, use protocol to constrain block
    # transform type.
    # Callable[[Block, ...], Iterable[Block]]
    # Callable[[Block, UserDefinedFunction, ...], Iterable[Block]],
    Callable[[Iterable[Block], TaskContext], Iterable[Block]],
    Callable[[Iterable[Block], TaskContext, UserDefinedFunction], Iterable[Block]],
    # Catch-all for transforms taking extra positional/keyword arguments.
    Callable[..., Iterable[Block]],
]
24
+
25
+
26
@DeveloperAPI
class ComputeStrategy:
    """Base marker class for Dataset transform compute strategies."""

    pass
29
+
30
+
31
@DeveloperAPI
class TaskPoolStrategy(ComputeStrategy):
    def __init__(
        self,
        size: Optional[int] = None,
    ):
        """Construct TaskPoolStrategy for a Dataset transform.

        Args:
            size: Specify the maximum size of the task pool.

        Raises:
            ValueError: If ``size`` is given and is less than 1.
        """
        if size is not None and size < 1:
            raise ValueError("`size` must be >= 1", size)
        self.size = size

    def __eq__(self, other: Any) -> bool:
        # Equal to another TaskPoolStrategy with the same size, or to the
        # legacy string spec "tasks" when no explicit size was given.
        if isinstance(other, TaskPoolStrategy):
            return self.size == other.size
        return other == "tasks" and self.size is None
51
+
52
+
53
class ActorPoolStrategy(ComputeStrategy):
    """Specify the compute strategy for a Dataset transform.

    ActorPoolStrategy specifies that an autoscaling pool of actors should be used
    for a given Dataset transform. This is useful for stateful setup of callable
    classes.

    For a fixed-sized pool of size ``n``, specify ``compute=ActorPoolStrategy(size=n)``.
    To autoscale from ``m`` to ``n`` actors, specify
    ``ActorPoolStrategy(min_size=m, max_size=n)``.

    To increase opportunities for pipelining task dependency prefetching with
    computation and avoiding actor startup delays, set max_tasks_in_flight_per_actor
    to 2 or greater; to try to decrease the delay due to queueing of tasks on the worker
    actors, set max_tasks_in_flight_per_actor to 1.
    """

    def __init__(
        self,
        *,
        size: Optional[int] = None,
        min_size: Optional[int] = None,
        max_size: Optional[int] = None,
        max_tasks_in_flight_per_actor: Optional[int] = None,
    ):
        """Construct ActorPoolStrategy for a Dataset transform.

        Args:
            size: Specify a fixed size actor pool of this size. It is an error to
                specify both `size` and `min_size` or `max_size`.
            min_size: The minimum size of the actor pool.
            max_size: The maximum size of the actor pool.
            max_tasks_in_flight_per_actor: The maximum number of tasks to concurrently
                send to a single actor worker. Increasing this will increase
                opportunities for pipelining task dependency prefetching with
                computation and avoiding actor startup delays, but will also increase
                queueing delay.

        Raises:
            ValueError: On invalid or conflicting sizing arguments.
        """
        # A fixed `size` is shorthand for min_size == max_size == size.
        if size is not None:
            if size < 1:
                raise ValueError("size must be >= 1", size)
            if max_size is not None or min_size is not None:
                raise ValueError(
                    "min_size and max_size cannot be set at the same time as `size`"
                )
            min_size = size
            max_size = size
        if min_size is not None and min_size < 1:
            raise ValueError("min_size must be >= 1", min_size)
        if max_size is not None:
            if min_size is None:
                min_size = 1  # Legacy default.
            if min_size > max_size:
                raise ValueError("min_size must be <= max_size", min_size, max_size)
        if (
            max_tasks_in_flight_per_actor is not None
            and max_tasks_in_flight_per_actor < 1
        ):
            raise ValueError(
                "max_tasks_in_flight_per_actor must be >= 1, got: ",
                max_tasks_in_flight_per_actor,
            )
        # Unset bounds default to [1, inf).
        self.min_size = min_size if min_size else 1
        self.max_size = max_size if max_size else float("inf")
        self.max_tasks_in_flight_per_actor = max_tasks_in_flight_per_actor
        self.num_workers = 0
        self.ready_to_total_workers_ratio = 0.8

    def __eq__(self, other: Any) -> bool:
        if not isinstance(other, ActorPoolStrategy):
            return False
        return (
            self.min_size == other.min_size
            and self.max_size == other.max_size
            and self.max_tasks_in_flight_per_actor
            == other.max_tasks_in_flight_per_actor
        )
128
+
129
+
130
def get_compute(compute_spec: Union[str, ComputeStrategy]) -> ComputeStrategy:
    """Validate and canonicalize a compute spec into a ``ComputeStrategy``.

    Args:
        compute_spec: A ``TaskPoolStrategy`` or ``ActorPoolStrategy`` instance.
            String specs ("tasks"/"actors") were removed in Ray 2.5 and now
            raise.

    Returns:
        The validated compute strategy.

    Raises:
        ValueError: If ``compute_spec`` is not a supported strategy instance.
    """
    if not isinstance(compute_spec, (TaskPoolStrategy, ActorPoolStrategy)):
        raise ValueError(
            "In Ray 2.5, the compute spec must be either "
            f"TaskPoolStrategy or ActorPoolStrategy, was: {compute_spec}."
        )
    # NOTE: TaskPoolStrategy(size=None) compares equal to "tasks"; preserve the
    # historical behavior of returning a fresh default strategy in that case.
    # (The original "actors"-string and trailing else branches were unreachable
    # after the isinstance() guard above and have been removed.)
    if not compute_spec or compute_spec == "tasks":
        return TaskPoolStrategy()
    return compute_spec
.venv/lib/python3.11/site-packages/ray/data/_internal/delegating_block_builder.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import collections
2
+ from typing import Any, Mapping, Optional
3
+
4
+ from ray.data._internal.arrow_block import ArrowBlockBuilder
5
+ from ray.data._internal.block_builder import BlockBuilder
6
+ from ray.data.block import Block, BlockAccessor, BlockType, DataBatch
7
+
8
+
9
+ class DelegatingBlockBuilder(BlockBuilder):
10
+ def __init__(self):
11
+ self._builder = None
12
+ self._empty_block = None
13
+
14
+ @property
15
+ def _inferred_block_type(self) -> Optional[BlockType]:
16
+ """The block type inferred from the first item added to the builder."""
17
+ if self._builder is not None:
18
+ return self._builder.block_type()
19
+ return None
20
+
21
+ def add(self, item: Mapping[str, Any]) -> None:
22
+ assert isinstance(item, collections.abc.Mapping), item
23
+
24
+ if self._builder is None:
25
+ self._builder = ArrowBlockBuilder()
26
+
27
+ self._builder.add(item)
28
+
29
+ def add_batch(self, batch: DataBatch):
30
+ """Add a user-facing data batch to the builder.
31
+
32
+ This data batch will be converted to an internal block and then added to the
33
+ underlying builder.
34
+ """
35
+ block = BlockAccessor.batch_to_block(batch, self._inferred_block_type)
36
+ return self.add_block(block)
37
+
38
+ def add_block(self, block: Block):
39
+ accessor = BlockAccessor.for_block(block)
40
+ if accessor.num_rows() == 0:
41
+ # Don't infer types of empty lists. Store the block and use it if no
42
+ # other data is added. https://github.com/ray-project/ray/issues/20290
43
+ self._empty_block = block
44
+ return
45
+ if self._builder is None:
46
+ self._builder = accessor.builder()
47
+ else:
48
+ block_type = accessor.block_type()
49
+ assert block_type == self._inferred_block_type, (
50
+ block_type,
51
+ self._inferred_block_type,
52
+ )
53
+
54
+ self._builder.add_block(accessor.to_block())
55
+
56
+ def will_build_yield_copy(self) -> bool:
57
+ if self._builder is None:
58
+ return True
59
+ return self._builder.will_build_yield_copy()
60
+
61
+ def build(self) -> Block:
62
+ if self._builder is None:
63
+ if self._empty_block is not None:
64
+ self._builder = BlockAccessor.for_block(self._empty_block).builder()
65
+ self._builder.add_block(self._empty_block)
66
+ else:
67
+ self._builder = ArrowBlockBuilder()
68
+ return self._builder.build()
69
+
70
+ def num_rows(self) -> int:
71
+ return self._builder.num_rows() if self._builder is not None else 0
72
+
73
+ def get_estimated_memory_usage(self) -> int:
74
+ if self._builder is None:
75
+ return 0
76
+ return self._builder.get_estimated_memory_usage()
.venv/lib/python3.11/site-packages/ray/data/_internal/equalize.py ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Tuple
2
+
3
+ from ray.data._internal.execution.interfaces import RefBundle
4
+ from ray.data._internal.split import _calculate_blocks_rows, _split_at_indices
5
+ from ray.data.block import Block, BlockMetadata, BlockPartition
6
+ from ray.types import ObjectRef
7
+
8
+
9
def _equalize(
    per_split_bundles: List[RefBundle],
    owned_by_consumer: bool,
) -> List[RefBundle]:
    """Equalize split ref bundles into equal number of rows.

    Args:
        per_split_bundles: ref bundles to equalize.
        owned_by_consumer: forwarded as ``owns_blocks`` on every produced
            ``RefBundle`` (presumably whether the consumer may destroy the
            blocks -- TODO confirm against callers).
    Returns:
        the equalized ref bundles.
    """
    if len(per_split_bundles) == 0:
        return per_split_bundles
    per_split_blocks_with_metadata = [bundle.blocks for bundle in per_split_bundles]
    per_split_num_rows: List[List[int]] = [
        _calculate_blocks_rows(split) for split in per_split_blocks_with_metadata
    ]
    total_rows = sum([sum(blocks_rows) for blocks_rows in per_split_num_rows])
    # Every output split gets exactly this many rows (floor division, so up to
    # len(splits) - 1 surplus rows end up unassigned).
    target_split_size = total_rows // len(per_split_blocks_with_metadata)

    # phase 1: shave the current splits by dropping blocks (into leftovers)
    # and calculate num rows needed to meet the target.
    shaved_splits, per_split_needed_rows, leftovers = _shave_all_splits(
        per_split_blocks_with_metadata, per_split_num_rows, target_split_size
    )

    # validate invariants
    for shaved_split, split_needed_row in zip(shaved_splits, per_split_needed_rows):
        num_shaved_rows = sum([meta.num_rows for _, meta in shaved_split])
        assert num_shaved_rows <= target_split_size
        assert num_shaved_rows + split_needed_row == target_split_size

    # phase 2: based on the num rows needed for each shaved split, split the leftovers
    # in the shape that exactly matches the rows needed.
    leftover_bundle = RefBundle(leftovers, owns_blocks=owned_by_consumer)
    leftover_splits = _split_leftovers(leftover_bundle, per_split_needed_rows)

    # phase 3: merge the shaved_splits and leftover splits and return.
    for i, leftover_split in enumerate(leftover_splits):
        shaved_splits[i].extend(leftover_split)

        # validate invariants: each merged split now has exactly the target rows.
        num_shaved_rows = sum([meta.num_rows for _, meta in shaved_splits[i]])
        assert num_shaved_rows == target_split_size

    # Compose the result back to RefBundle
    equalized_ref_bundles: List[RefBundle] = []
    for split in shaved_splits:
        equalized_ref_bundles.append(RefBundle(split, owns_blocks=owned_by_consumer))
    return equalized_ref_bundles
59
+
60
+
61
def _shave_one_split(
    split: BlockPartition, num_rows_per_block: List[int], target_size: int
) -> Tuple[BlockPartition, int, BlockPartition]:
    """Shave a block list down to (at most) the target size.

    Args:
        split: the block list to shave.
        num_rows_per_block: num rows for each block in the list.
        target_size: the upper bound target size of the shaved list.
    Returns:
        A tuple of:
            - shaved block list.
            - num of rows needed for the block list to meet the target size.
            - leftover blocks.
    """
    kept = []
    spilled = []
    kept_rows = 0
    # Keep each block whose rows still fit under the target; note this is not
    # a prefix scan -- a later, smaller block may still be kept after a larger
    # one was spilled.
    for block_with_meta, block_rows in zip(split, num_rows_per_block):
        if kept_rows + block_rows > target_size:
            spilled.append(block_with_meta)
        else:
            kept.append(block_with_meta)
            kept_rows += block_rows
    return kept, target_size - kept_rows, spilled
89
+
90
+
91
def _shave_all_splits(
    input_splits: List[BlockPartition],
    per_split_num_rows: List[List[int]],
    target_size: int,
) -> Tuple[List[BlockPartition], List[int], BlockPartition]:
    """Shave every block list down to the target size.

    Args:
        input_splits: all block lists to shave.
        per_split_num_rows: num rows (per block) for each block list.
        target_size: the upper bound target size of the shaved lists.
    Returns:
        A tuple of:
            - all shaved block lists.
            - num of rows needed for each block list to meet the target size.
            - leftover blocks pooled from every split.
    """
    shaved_splits = []
    per_split_needed_rows = []
    leftovers = []
    for split, split_rows in zip(input_splits, per_split_num_rows):
        shaved, needed, spilled = _shave_one_split(split, split_rows, target_size)
        shaved_splits.append(shaved)
        per_split_needed_rows.append(needed)
        leftovers.extend(spilled)
    return shaved_splits, per_split_needed_rows, leftovers
121
+
122
+
123
def _split_leftovers(
    leftovers: RefBundle, per_split_needed_rows: List[int]
) -> List[BlockPartition]:
    """Split leftover blocks by the num of rows needed per split."""
    num_splits = len(per_split_needed_rows)
    # Turn per-split row counts into cumulative split indices.
    split_indices = []
    cumulative = 0
    for num_rows_needed in per_split_needed_rows:
        cumulative += num_rows_needed
        split_indices.append(cumulative)
    split_result: Tuple[
        List[List[ObjectRef[Block]]], List[List[BlockMetadata]]
    ] = _split_at_indices(
        leftovers.blocks,
        split_indices,
        leftovers.owns_blocks,
    )
    # Pair each split's block refs with its metadata; drop the trailing
    # remainder split produced beyond the requested count.
    paired = [list(zip(refs, metas)) for refs, metas in zip(*split_result)]
    return paired[:num_splits]
.venv/lib/python3.11/site-packages/ray/data/_internal/logging.py ADDED
@@ -0,0 +1,208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import logging.config
3
+ import os
4
+ from typing import Optional
5
+
6
+ import yaml
7
+
8
+ import ray
9
+
10
+ DEFAULT_CONFIG = {
11
+ "version": 1,
12
+ "disable_existing_loggers": False,
13
+ "formatters": {
14
+ "ray": {
15
+ "format": "%(asctime)s\t%(levelname)s %(filename)s:%(lineno)s -- %(message)s" # noqa: E501
16
+ },
17
+ "ray_json": {"class": "ray._private.ray_logging.formatters.JSONFormatter"},
18
+ },
19
+ "filters": {
20
+ "console_filter": {"()": "ray.data._internal.logging.HiddenRecordFilter"},
21
+ "core_context_filter": {
22
+ "()": "ray._private.ray_logging.filters.CoreContextFilter"
23
+ },
24
+ },
25
+ "handlers": {
26
+ "file": {
27
+ "class": "ray.data._internal.logging.SessionFileHandler",
28
+ "formatter": "ray",
29
+ "filename": "ray-data.log",
30
+ },
31
+ "file_json": {
32
+ "class": "ray.data._internal.logging.SessionFileHandler",
33
+ "formatter": "ray_json",
34
+ "filename": "ray-data.log",
35
+ "filters": ["core_context_filter"],
36
+ },
37
+ "console": {
38
+ "class": "ray._private.log.PlainRayHandler",
39
+ "formatter": "ray",
40
+ "level": "INFO",
41
+ "filters": ["console_filter"],
42
+ },
43
+ },
44
+ "loggers": {
45
+ "ray.data": {
46
+ "level": "DEBUG",
47
+ "handlers": ["file", "console"],
48
+ "propagate": False,
49
+ },
50
+ "ray.air.util.tensor_extensions": {
51
+ "level": "DEBUG",
52
+ "handlers": ["file", "console"],
53
+ "propagate": False,
54
+ },
55
+ },
56
+ }
57
+
58
+ # Dictionary of substitutions to be performed when using JSON mode. Handlers with names
59
+ # corresponding to keys will be replaced by those corresponding to values.
60
+ RAY_DATA_LOG_HANDLER_JSON_SUBSTITUTIONS = {"file": "file_json"}
61
+
62
+ # Env. variable to specify the encoding of the file logs when using the default config.
63
+ RAY_DATA_LOG_ENCODING_ENV_VAR_NAME = "RAY_DATA_LOG_ENCODING"
64
+
65
+ # Env. variable to specify the logging config path use defaults if not set
66
+ RAY_DATA_LOGGING_CONFIG_ENV_VAR_NAME = "RAY_DATA_LOGGING_CONFIG"
67
+
68
+ # To facilitate debugging, Ray Data writes debug logs to a file. However, if Ray Data
69
+ # logs every scheduler loop, logging might impact performance. So, we add a "TRACE"
70
+ # level where logs aren't written by default.
71
+ #
72
+ # Use the following code to log a message at the "TRACE" level:
73
+ # ```
74
+ # logger.log(logging.getLevelName("TRACE"), "Your message here.")
75
+ # ````
76
+ logging.addLevelName(logging.DEBUG - 1, "TRACE")
77
+
78
+
79
class HiddenRecordFilter:
    """Filters out log records with the "hide" attribute set to True.

    This filter allows you to override default logging behavior. For example, if errors
    are printed by default, and you don't want to print a specific error, you can set
    the "hide" attribute to avoid printing the message.

    .. testcode::

        import logging
        logger = logging.getLogger("ray.data.spam")

        # This warning won't be printed to the console.
        logger.warning("ham", extra={"hide": True})
    """

    def filter(self, record):
        # Records without the attribute are shown by default.
        hidden = getattr(record, "hide", False)
        return not hidden
97
+
98
+
99
class SessionFileHandler(logging.Handler):
    """A handler that writes to a log file in the Ray session directory.

    The Ray session directory isn't available until Ray is initialized, so this
    handler lazily creates the underlying file handler on the first emit.

    Args:
        filename: The name of the log file. The file is created in the 'logs'
            directory of the Ray session directory.
    """

    def __init__(self, filename: str):
        super().__init__()
        self._filename = filename
        self._handler = None
        self._formatter = None
        self._path = None

    def emit(self, record):
        if self._handler is None:
            self._try_create_handler()
        if self._handler is None:
            # Ray isn't initialized yet, so there's nowhere to write; drop it.
            return
        self._handler.emit(record)

    def setFormatter(self, fmt: logging.Formatter) -> None:
        # Remember the formatter so it can be applied once the delegate exists.
        self._formatter = fmt
        if self._handler is not None:
            self._handler.setFormatter(fmt)

    def _try_create_handler(self):
        assert self._handler is None
        log_directory = get_log_directory()
        if log_directory is None:
            # Ray hasn't been initialized; retry on a later emit.
            return
        os.makedirs(log_directory, exist_ok=True)
        self._path = os.path.join(log_directory, self._filename)
        self._handler = logging.FileHandler(self._path)
        if self._formatter is not None:
            self._handler.setFormatter(self._formatter)
141
+
142
+
143
def configure_logging() -> None:
    """Configure the Python logger named 'ray.data'.

    This function loads the configration YAML specified by "RAY_DATA_LOGGING_CONFIG"
    environment variable. If the variable isn't set, this function loads the default
    config defined in this module.

    If "RAY_DATA_LOG_ENCODING" is specified as "JSON" we will enable JSON logging mode
    if using the default logging config.
    """
    import copy

    def _load_logging_config(config_path: str):
        with open(config_path) as file:
            return yaml.safe_load(file)

    # Dynamically load env vars
    config_path = os.environ.get(RAY_DATA_LOGGING_CONFIG_ENV_VAR_NAME)
    log_encoding = os.environ.get(RAY_DATA_LOG_ENCODING_ENV_VAR_NAME)

    if config_path is not None:
        config = _load_logging_config(config_path)
    else:
        # BUGFIX: deep-copy before the JSON substitutions below. The original
        # code mutated the handler lists of the shared DEFAULT_CONFIG constant
        # in place, so a second call raised ValueError on list.remove() and
        # corrupted the defaults for every other caller.
        config = copy.deepcopy(DEFAULT_CONFIG)
        if log_encoding is not None and log_encoding.upper() == "JSON":
            for logger_config in config["loggers"].values():
                for (
                    old_handler_name,
                    new_handler_name,
                ) in RAY_DATA_LOG_HANDLER_JSON_SUBSTITUTIONS.items():
                    logger_config["handlers"].remove(old_handler_name)
                    logger_config["handlers"].append(new_handler_name)

    logging.config.dictConfig(config)

    # After configuring logger, warn if RAY_DATA_LOGGING_CONFIG is used with
    # RAY_DATA_LOG_ENCODING, because they are not both supported together.
    if config_path is not None and log_encoding is not None:
        logger = logging.getLogger(__name__)
        logger.warning(
            "Using `RAY_DATA_LOG_ENCODING` is not supported with "
            + "`RAY_DATA_LOGGING_CONFIG`"
        )
186
+
187
+
188
def reset_logging() -> None:
    """Reset the logger named 'ray.data' to its initial state.

    Used for testing.
    """
    data_logger = logging.getLogger("ray.data")
    data_logger.handlers.clear()
    data_logger.setLevel(logging.NOTSET)
196
+
197
+
198
def get_log_directory() -> Optional[str]:
    """Return the directory where Ray Data writes log files.

    If Ray isn't initialized, this function returns ``None``.
    """
    global_node = ray._private.worker._global_node
    if global_node is not None:
        session_dir = global_node.get_session_dir_path()
        return os.path.join(session_dir, "logs", "ray-data")
    return None
.venv/lib/python3.11/site-packages/ray/data/_internal/memory_tracing.py ADDED
@@ -0,0 +1,147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Utility for debugging object store memory eager deletion in Datasets.
2
+
3
+ NOTE: the performance overhead of tracing object allocation is fairly substantial.
4
+ This is meant to use in unit test for debugging. Please do not enable in production,
5
+ without performance optimization.
6
+
7
+ Enable with RAY_DATA_TRACE_ALLOCATIONS=1.
8
+
9
+ Basic usage is to call `trace_allocation` each time a new object is created, and call
10
+ `trace_deallocation` when an object should be disposed of. When the workload is
11
+ complete, call `leak_report` to view possibly leaked objects.
12
+
13
+ Note that so called "leaked" objects will be reclaimed eventually by reference counting
14
+ in Ray. This is just to debug the eager deletion protocol which is more efficient.
15
+ """
16
+
17
+ from io import StringIO
18
+ from typing import Dict, List
19
+
20
+ import ray
21
+ from ray.data.context import DataContext
22
+
23
+
24
def trace_allocation(ref: ray.ObjectRef, loc: str) -> None:
    """Record that an object has been created.

    Args:
        ref: The object created.
        loc: A human-readable string identifying the call site.
    """
    if not DataContext.get_current().trace_allocations:
        # Tracing disabled: this is a no-op to keep the hot path cheap.
        return
    tracer = _get_mem_actor()
    # TODO: it would be nice to determine loc automatically based on the stack.
    ray.get(tracer.trace_alloc.remote([ref], loc))
36
+
37
+
38
def trace_deallocation(ref: ray.ObjectRef, loc: str, free: bool = True) -> None:
    """Record that an object has been deleted (and delete if free=True).

    Args:
        ref: The object we no longer need.
        loc: A human-readable string identifying the call site.
        free: Whether to eagerly destroy the object instead of waiting for Ray
            reference counting to kick in.
    """
    if free:
        # Eagerly reclaim the object instead of waiting on reference counting.
        ray._private.internal_api.free(ref, local_only=False)
    if DataContext.get_current().trace_allocations:
        tracer = _get_mem_actor()
        ray.get(tracer.trace_dealloc.remote([ref], loc, free))
53
+
54
+
55
def leak_report() -> str:
    """Fetch the tracer actor's formatted report of leaked and freed objects."""
    tracer = _get_mem_actor()
    report_ref = tracer.leak_report.remote()
    return ray.get(report_ref)
58
+
59
+
60
@ray.remote(num_cpus=0)
class _MemActor:
    """Singleton actor that records object (de)allocations for debugging."""

    def __init__(self):
        # Live objects: ref -> {"size_bytes": int, "loc": str}.
        self.allocated: Dict[ray.ObjectRef, dict] = {}
        # Freed objects: same entries, plus a "dealloc_loc" key.
        self.deallocated: Dict[ray.ObjectRef, dict] = {}
        # Objects whose eager free was skipped: ref -> call site.
        self.skip_dealloc: Dict[ray.ObjectRef, str] = {}
        self.peak_mem = 0
        self.cur_mem = 0

    def trace_alloc(self, ref: List[ray.ObjectRef], loc: str):
        """Record an allocation observed at call site ``loc``."""
        ref = ref[0]  # Avoid Ray materializing the ref.
        if ref in self.allocated:
            # Already traced; keep the first record.
            return
        meta = ray.experimental.get_object_locations([ref])
        # BUGFIX: get_object_locations() returns {ref: info}; the size must be
        # read from the per-ref info dict. The original `meta.get("object_size")`
        # looked the key up on the outer mapping, always missed, and forced the
        # expensive pickle fallback below.
        size_bytes = meta.get(ref, {}).get("object_size", 0)
        if not size_bytes:
            size_bytes = -1
            from ray import cloudpickle as pickle

            try:
                # Fall back to measuring the pickled object (slow; debug-only).
                obj = ray.get(ref, timeout=5.0)
                size_bytes = len(pickle.dumps(obj))
            except Exception:
                print("[mem_tracing] ERROR getting size")
                size_bytes = -1
        print(f"[mem_tracing] Allocated {size_bytes} bytes at {loc}: {ref}")
        entry = {
            "size_bytes": size_bytes,
            "loc": loc,
        }
        self.allocated[ref] = entry
        self.cur_mem += size_bytes
        self.peak_mem = max(self.cur_mem, self.peak_mem)

    def trace_dealloc(self, ref: List[ray.ObjectRef], loc: str, freed: bool):
        """Record a deallocation (or a skipped eager free) at ``loc``."""
        ref = ref[0]  # Avoid Ray materializing the ref.
        size_bytes = self.allocated.get(ref, {}).get("size_bytes", 0)
        if not freed:
            print(f"[mem_tracing] Skipped freeing {size_bytes} bytes at {loc}: {ref}")
            self.skip_dealloc[ref] = loc
            return
        print(f"[mem_tracing] Freed {size_bytes} bytes at {loc}: {ref}")
        if ref in self.allocated:
            # Move the entry from the live table to the freed table.
            self.cur_mem -= size_bytes
            entry = self.allocated.pop(ref)
            entry["dealloc_loc"] = loc
            self.deallocated[ref] = entry
        elif ref not in self.deallocated:
            # Never seen before: its allocation was not traced.
            print(f"[mem_tracing] WARNING: allocation of {ref} was not traced!")

    def leak_report(self) -> str:
        """Format a report of still-allocated (leaked) and freed objects."""
        output = StringIO()
        output.write("[mem_tracing] ===== Leaked objects =====\n")
        for ref, entry in self.allocated.items():
            size_bytes = entry.get("size_bytes")
            loc = entry.get("loc")
            if ref in self.skip_dealloc:
                dealloc_loc = self.skip_dealloc[ref]
                output.write(
                    f"[mem_tracing] Leaked object, created at {loc}, size "
                    f"{size_bytes}, skipped dealloc at {dealloc_loc}: {ref}\n"
                )
            else:
                output.write(
                    f"[mem_tracing] Leaked object, created at {loc}, "
                    f"size {size_bytes}: {ref}\n"
                )
        output.write("[mem_tracing] ===== End leaked objects =====\n")
        output.write("[mem_tracing] ===== Freed objects =====\n")
        for ref, entry in self.deallocated.items():
            size_bytes = entry.get("size_bytes")
            loc = entry.get("loc")
            dealloc_loc = entry.get("dealloc_loc")
            output.write(
                f"[mem_tracing] Freed object from {loc} at {dealloc_loc}, "
                f"size {size_bytes}: {ref}\n"
            )
        output.write("[mem_tracing] ===== End freed objects =====\n")
        output.write(f"[mem_tracing] Peak size bytes {self.peak_mem}\n")
        output.write(f"[mem_tracing] Current size bytes {self.cur_mem}\n")
        return output.getvalue()
142
+
143
+
144
def _get_mem_actor():
    """Get (or lazily create) the named, detached memory-tracing actor."""
    handle = _MemActor.options(
        name="mem_tracing_actor", get_if_exists=True, lifetime="detached"
    )
    return handle.remote()
.venv/lib/python3.11/site-packages/ray/data/_internal/null_aggregate.py ADDED
@@ -0,0 +1,276 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from types import ModuleType
2
+ from typing import Any, Callable, Tuple, Union
3
+
4
+ import numpy as np
5
+
6
+ from ray.data.block import AggType, Block, KeyType, T, U
7
+
8
# An accumulation paired with a 0/1 flag marking whether it holds real data.
WrappedAggType = Tuple[AggType, int]
9
+
10
+
11
+ # This module contains aggregation helpers for handling nulls.
12
+ # The null handling policy is:
13
+ # 1. Mix of values and nulls - ignore_nulls=True: Ignore the nulls, return
14
+ # aggregation of non-null values.
15
+ # 2. Mix of values and nulls - ignore_nulls=False: Return None.
16
+ # 3. All nulls: Return None.
17
+ # 4. Empty dataset: Return None.
18
+ #
19
+ # This is accomplished by checking rows for null values and by propagating nulls
20
+ # if found AND if we're not ignoring them. If not ignoring nulls, in order to delineate
21
+ # between found null rows and an empty block accumulation when merging (the latter of
22
+ # which we want to propagate; the former of which we do not), we attach a boolean flag
23
+ # indicating whether or not an accumulation contains valid data to intermediate block
24
+ # accumulations via _wrap_acc() and _unwrap_acc(). This allows us to properly merge
25
+ # intermediate block accumulations under a streaming constraint.
26
+
27
+
28
def _wrap_acc(a: AggType, has_data: bool) -> WrappedAggType:
    """
    Wrap accumulation with a numeric boolean flag indicating whether or not
    this accumulation contains real data; if it doesn't, we consider it to be
    empty.

    Args:
        a: The accumulation value.
        has_data: Whether the accumulation contains real data.

    Returns:
        An AggType list with the last element being a numeric boolean flag indicating
        whether or not this accumulation contains real data. If the input a has length
        n, the returned AggType has length n + 1.
    """
    values = a if isinstance(a, list) else [a]
    flag = 1 if has_data else 0
    # Concatenation builds a new list, so the caller's value is never mutated.
    return values + [flag]
46
+
47
+
48
def _unwrap_acc(a: WrappedAggType) -> Tuple[AggType, bool]:
    """
    Unwrap the accumulation, which we assume has been wrapped (via _wrap_acc) with a
    numeric boolean flag indicating whether or not this accumulation contains real data.

    Args:
        a: The wrapped accumulation value that we wish to unwrap.

    Returns:
        A tuple containing the unwrapped accumulation value and a boolean indicating
        whether the accumulation contains real data.
    """
    *values, flag = a
    has_data = flag == 1
    # A single-element accumulation is unwrapped to a scalar value.
    if len(values) == 1:
        return values[0], has_data
    return values, has_data
65
+
66
+
67
def _null_wrap_init(
    init: Callable[[KeyType], AggType]
) -> Callable[[KeyType], WrappedAggType]:
    """
    Wraps an accumulation initializer with null handling.

    The returned initializer function adds on a has_data field that the accumulator
    uses to track whether an aggregation is empty.

    Args:
        init: The core init function to wrap.

    Returns:
        A new accumulation initializer function that can handle nulls.
    """

    def _init(k: KeyType) -> AggType:
        # A fresh accumulation holds no real data yet, hence has_data=False.
        return _wrap_acc(init(k), has_data=False)

    return _init
90
+
91
+
92
def _null_wrap_merge(
    ignore_nulls: bool,
    merge: Callable[[AggType, AggType], AggType],
) -> Callable[[WrappedAggType, WrappedAggType], WrappedAggType]:
    """
    Wrap merge function with null handling.

    The returned merge function expects a1 and a2 to be either None or of the form:
    a = [acc_data_1, ..., acc_data_2, has_data].

    This merges two accumulations subject to the following null rules:
    1. If a1 is empty and a2 is empty, return empty accumulation.
    2. If a1 (a2) is empty and a2 (a1) is None, return None.
    3. If a1 (a2) is empty and a2 (a1) is non-None, return a2 (a1).
    4. If a1 (a2) is None, return a2 (a1) if ignoring nulls, None otherwise.
    5. If a1 and a2 are both non-null, return merge(a1, a2).

    Args:
        ignore_nulls: Whether nulls should be ignored or cause a None result.
        merge: The core merge function to wrap.

    Returns:
        A new merge function that handles nulls.
    """

    def _merge(a1: WrappedAggType, a2: WrappedAggType) -> WrappedAggType:
        # NOTE: the order of these checks matters -- the None check on a1 must
        # precede the unwrap, and an empty a1 propagates a2 unconditionally
        # (even a None a2, regardless of ignore_nulls), per rule 2/3 above.
        if a1 is None:
            # If we're ignoring nulls, propagate a2; otherwise, propagate None.
            return a2 if ignore_nulls else None
        unwrapped_a1, a1_has_data = _unwrap_acc(a1)
        if not a1_has_data:
            # If a1 is empty, propagate a2.
            # No matter whether a2 is a real value, empty, or None,
            # propagating each of these is correct if a1 is empty.
            return a2
        if a2 is None:
            # If we're ignoring nulls, propagate a1; otherwise, propagate None.
            return a1 if ignore_nulls else None
        unwrapped_a2, a2_has_data = _unwrap_acc(a2)
        if not a2_has_data:
            # If a2 is empty, propagate a1.
            return a1
        # Both sides hold real data: delegate to the core merge and re-wrap.
        a = merge(unwrapped_a1, unwrapped_a2)
        return _wrap_acc(a, has_data=True)

    return _merge
138
+
139
+
140
+ def _null_wrap_accumulate_row(
141
+ ignore_nulls: bool,
142
+ on_fn: Callable[[T], T],
143
+ accum: Callable[[AggType, T], AggType],
144
+ ) -> Callable[[WrappedAggType, T], WrappedAggType]:
145
+ """
146
+ Wrap accumulator function with null handling.
147
+
148
+ The returned accumulate function expects a to be either None or of the form:
149
+ a = [acc_data_1, ..., acc_data_n, has_data].
150
+
151
+ This performs an accumulation subject to the following null rules:
152
+ 1. If r is null and ignore_nulls=False, return None.
153
+ 2. If r is null and ignore_nulls=True, return a.
154
+ 3. If r is non-null and a is None, return None.
155
+ 4. If r is non-null and a is non-None, return accum(a[:-1], r).
156
+
157
+ Args:
158
+ ignore_nulls: Whether nulls should be ignored or cause a None result.
159
+ on_fn: Function selecting a subset of the row to apply the aggregation.
160
+ accum: The core accumulator function to wrap.
161
+
162
+ Returns:
163
+ A new accumulator function that handles nulls.
164
+ """
165
+
166
+ def _accum(a: WrappedAggType, r: T) -> WrappedAggType:
167
+ r = on_fn(r)
168
+ if _is_null(r):
169
+ if ignore_nulls:
170
+ # Ignoring nulls, return the current accumulation, ignoring r.
171
+ return a
172
+ else:
173
+ # Not ignoring nulls, so propagate the null.
174
+ return None
175
+ else:
176
+ if a is None:
177
+ # Accumulation is None so (1) a previous row must have been null, and
178
+ # (2) we must be propagating nulls, so continue to pragate this null.
179
+ return None
180
+ else:
181
+ # Row is non-null and accumulation is non-null, so we now apply the core
182
+ # accumulation.
183
+ a, _ = _unwrap_acc(a)
184
+ a = accum(a, r)
185
+ return _wrap_acc(a, has_data=True)
186
+
187
+ return _accum
188
+
189
+
190
+ def _null_wrap_accumulate_block(
191
+ ignore_nulls: bool,
192
+ accum_block: Callable[[Block], AggType],
193
+ null_merge: Callable[[WrappedAggType, WrappedAggType], WrappedAggType],
194
+ ) -> Callable[[WrappedAggType, Block], WrappedAggType]:
195
+ """
196
+ Wrap vectorized aggregate function with null handling.
197
+
198
+ This performs a block accumulation subject to the following null rules:
199
+ 1. If any row is null and ignore_nulls=False, return None.
200
+ 2. If at least one row is not null and ignore_nulls=True, return the block
201
+ accumulation.
202
+ 3. If all rows are null and ignore_nulls=True, return the base accumulation.
203
+ 4. If all rows non-null, return the block accumulation.
204
+
205
+ Args:
206
+ ignore_nulls: Whether nulls should be ignored or cause a None result.
207
+ accum_block: The core vectorized aggregate function to wrap.
208
+ null_merge: A null-handling merge, as returned from _null_wrap_merge().
209
+
210
+ Returns:
211
+ A new vectorized aggregate function that handles nulls.
212
+ """
213
+
214
+ def _accum_block_null(a: WrappedAggType, block: Block) -> WrappedAggType:
215
+ ret = accum_block(block)
216
+ if ret is not None:
217
+ ret = _wrap_acc(ret, has_data=True)
218
+ elif ignore_nulls:
219
+ # This can happen if we're ignoring nulls but the entire block only consists
220
+ # of nulls. We treat the block as if it were empty in this case.
221
+ ret = a
222
+ return null_merge(a, ret)
223
+
224
+ return _accum_block_null
225
+
226
+
227
+ def _null_wrap_finalize(
228
+ finalize: Callable[[AggType], AggType]
229
+ ) -> Callable[[WrappedAggType], U]:
230
+ """
231
+ Wrap finalizer with null handling.
232
+
233
+ If the accumulation is empty or None, the returned finalizer returns None.
234
+
235
+ Args:
236
+ finalize: The core finalizing function to wrap.
237
+
238
+ Returns:
239
+ A new finalizing function that handles nulls.
240
+ """
241
+
242
+ def _finalize(a: AggType) -> U:
243
+ if a is None:
244
+ return None
245
+ a, has_data = _unwrap_acc(a)
246
+ if not has_data:
247
+ return None
248
+ return finalize(a)
249
+
250
+ return _finalize
251
+
252
+
253
+ LazyModule = Union[None, bool, ModuleType]
254
+ _pandas: LazyModule = None
255
+
256
+
257
+ def _lazy_import_pandas() -> LazyModule:
258
+ global _pandas
259
+ if _pandas is None:
260
+ try:
261
+ import pandas as _pandas
262
+ except ModuleNotFoundError:
263
+ # If module is not found, set _pandas to False so we won't
264
+ # keep trying to import it on every _lazy_import_pandas() call.
265
+ _pandas = False
266
+ return _pandas
267
+
268
+
269
+ def _is_null(r: Any):
270
+ pd = _lazy_import_pandas()
271
+ if pd:
272
+ return pd.isnull(r)
273
+ try:
274
+ return np.isnan(r)
275
+ except TypeError:
276
+ return r is None
.venv/lib/python3.11/site-packages/ray/data/_internal/numpy_support.py ADDED
@@ -0,0 +1,233 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import collections
2
+ import logging
3
+ from datetime import datetime
4
+ from typing import Any, Dict, List, Union
5
+
6
+ import numpy as np
7
+
8
+ from ray.air.util.tensor_extensions.utils import create_ragged_ndarray
9
+ from ray.data._internal.util import _truncated_repr
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
+ def is_array_like(value: Any) -> bool:
15
+ """Checks whether objects are array-like, excluding numpy scalars."""
16
+
17
+ return hasattr(value, "__array__") and hasattr(value, "__len__")
18
+
19
+
20
+ def is_valid_udf_return(udf_return_col: Any) -> bool:
21
+ """Check whether a UDF column is valid.
22
+
23
+ Valid columns must either be a list of elements, or an array-like object.
24
+ """
25
+
26
+ return isinstance(udf_return_col, list) or is_array_like(udf_return_col)
27
+
28
+
29
+ def is_nested_list(udf_return_col: List[Any]) -> bool:
30
+ for e in udf_return_col:
31
+ if isinstance(e, list):
32
+ return True
33
+ return False
34
+
35
+
36
+ def validate_numpy_batch(batch: Union[Dict[str, np.ndarray], Dict[str, list]]) -> None:
37
+ if not isinstance(batch, collections.abc.Mapping) or any(
38
+ not is_valid_udf_return(col) for col in batch.values()
39
+ ):
40
+ raise ValueError(
41
+ "Batch must be an ndarray or dictionary of ndarrays when converting "
42
+ f"a numpy batch to a block, got: {type(batch)} "
43
+ f"({_truncated_repr(batch)})"
44
+ )
45
+
46
+
47
+ def _detect_highest_datetime_precision(datetime_list: List[datetime]) -> str:
48
+ """Detect the highest precision for a list of datetime objects.
49
+
50
+ Args:
51
+ datetime_list: List of datetime objects.
52
+
53
+ Returns:
54
+ A string representing the highest precision among the datetime objects
55
+ ('D', 's', 'ms', 'us', 'ns').
56
+ """
57
+ # Define precision hierarchy
58
+ precision_hierarchy = ["D", "s", "ms", "us", "ns"]
59
+ highest_precision_index = 0 # Start with the lowest precision ("D")
60
+
61
+ for dt in datetime_list:
62
+ # Safely get the nanosecond value using getattr for backward compatibility
63
+ nanosecond = getattr(dt, "nanosecond", 0)
64
+ if nanosecond != 0:
65
+ current_precision = "ns"
66
+ elif dt.microsecond != 0:
67
+ # Check if the microsecond precision is exactly millisecond
68
+ if dt.microsecond % 1000 == 0:
69
+ current_precision = "ms"
70
+ else:
71
+ current_precision = "us"
72
+ elif dt.second != 0 or dt.minute != 0 or dt.hour != 0:
73
+ # pyarrow does not support h or m, use s for those cases to
74
+ current_precision = "s"
75
+ else:
76
+ current_precision = "D"
77
+
78
+ # Update highest_precision_index based on the hierarchy
79
+ current_index = precision_hierarchy.index(current_precision)
80
+ highest_precision_index = max(highest_precision_index, current_index)
81
+
82
+ # Stop early if highest possible precision is reached
83
+ if highest_precision_index == len(precision_hierarchy) - 1:
84
+ break
85
+
86
+ return precision_hierarchy[highest_precision_index]
87
+
88
+
89
+ def _convert_to_datetime64(dt: datetime, precision: str) -> np.datetime64:
90
+ """
91
+ Converts a datetime object to a numpy datetime64 object with the specified
92
+ precision.
93
+
94
+ Args:
95
+ dt: A datetime object to be converted.
96
+ precision: The desired precision for the datetime64 conversion. Possible
97
+ values are 'D', 's', 'ms', 'us', 'ns'.
98
+
99
+ Returns:
100
+ np.datetime64: A numpy datetime64 object with the specified precision.
101
+ """
102
+ if precision == "ns":
103
+ # Calculate nanoseconds from microsecond and nanosecond
104
+ microseconds_as_ns = dt.microsecond * 1000
105
+ # Use getattr for backward compatibility where nanosecond attribute may not
106
+ # exist
107
+ nanoseconds = getattr(dt, "nanosecond", 0)
108
+ total_nanoseconds = microseconds_as_ns + nanoseconds
109
+ # Create datetime64 from base datetime with microsecond precision
110
+ base_dt = np.datetime64(dt, "us")
111
+ # Add remaining nanoseconds as timedelta
112
+ return base_dt + np.timedelta64(total_nanoseconds - microseconds_as_ns, "ns")
113
+ else:
114
+ return np.datetime64(dt).astype(f"datetime64[{precision}]")
115
+
116
+
117
+ def _convert_datetime_list_to_array(datetime_list: List[datetime]) -> np.ndarray:
118
+ """Convert a list of datetime objects to a NumPy array of datetime64 with proper
119
+ precision.
120
+
121
+ Args:
122
+ datetime_list (List[datetime]): A list of `datetime` objects to be converted.
123
+ Each `datetime` object represents a specific point in time.
124
+
125
+ Returns:
126
+ np.ndarray: A NumPy array containing the `datetime64` values of the datetime
127
+ objects from the input list, with the appropriate precision (e.g., nanoseconds,
128
+ microseconds, milliseconds, etc.).
129
+ """
130
+ # Detect the highest precision for the datetime objects
131
+ precision = _detect_highest_datetime_precision(datetime_list)
132
+
133
+ # Convert each datetime to the corresponding numpy datetime64 with the appropriate
134
+ # precision
135
+ return np.array([_convert_to_datetime64(dt, precision) for dt in datetime_list])
136
+
137
+
138
+ def convert_to_numpy(column_values: Any) -> np.ndarray:
139
+ """Convert UDF columns (output of map_batches) to numpy, if possible.
140
+
141
+ This includes lists of scalars, objects supporting the array protocol, and lists
142
+ of objects supporting the array protocol, such as `[1, 2, 3]`, `Tensor([1, 2, 3])`,
143
+ and `[array(1), array(2), array(3)]`.
144
+
145
+ Returns:
146
+ The input as an np.ndarray if possible, otherwise the original input.
147
+
148
+ Raises:
149
+ ValueError if an input was array-like but we failed to convert it to an array.
150
+ """
151
+
152
+ if isinstance(column_values, np.ndarray):
153
+ # No copy/conversion needed, just keep it verbatim.
154
+ return column_values
155
+
156
+ elif isinstance(column_values, list):
157
+ if len(column_values) == 1 and isinstance(column_values[0], np.ndarray):
158
+ # Optimization to avoid conversion overhead from list to np.array.
159
+ return np.expand_dims(column_values[0], axis=0)
160
+
161
+ if all(isinstance(elem, datetime) for elem in column_values):
162
+ return _convert_datetime_list_to_array(column_values)
163
+
164
+ # Try to convert list values into an numpy array via
165
+ # np.array(), so users don't need to manually cast.
166
+ # NOTE: we don't cast generic iterables, since types like
167
+ # `str` are also Iterable.
168
+ try:
169
+ # Convert array-like objects (like torch.Tensor) to `np.ndarray`s
170
+ if all(is_array_like(e) for e in column_values):
171
+ # Use np.asarray() instead of np.array() to avoid copying if possible.
172
+ column_values = [np.asarray(e) for e in column_values]
173
+
174
+ shapes = set()
175
+ has_object = False
176
+ for e in column_values:
177
+ if isinstance(e, np.ndarray):
178
+ shapes.add((e.dtype, e.shape))
179
+ elif isinstance(e, bytes):
180
+ # Don't convert variable length binary data to Numpy arrays as it
181
+ # treats zero encoding as termination by default.
182
+ # Per recommendation from
183
+ # https://github.com/apache/arrow/issues/26470,
184
+ # we use object dtype.
185
+ # https://github.com/ray-project/ray/issues/35586#issuecomment-1558148261
186
+ has_object = True
187
+ elif not np.isscalar(e):
188
+ has_object = True
189
+
190
+ # When column values are
191
+ # - Arrays of heterogeneous shapes
192
+ # - Byte-strings (viewed as arrays of heterogeneous shapes)
193
+ # - Non-scalar objects (tuples, lists, arbitrary object types)
194
+ #
195
+ # Custom "ragged ndarray" is created, represented as an array of
196
+ # references (ie ndarray with dtype=object)
197
+ if has_object or len(shapes) > 1:
198
+ # This util works around some limitations of np.array(dtype=object).
199
+ return create_ragged_ndarray(column_values)
200
+ else:
201
+ return np.array(column_values)
202
+
203
+ except Exception as e:
204
+ logger.error(
205
+ f"Failed to convert column values to numpy array: "
206
+ f"{_truncated_repr(column_values)}",
207
+ exc_info=e,
208
+ )
209
+
210
+ raise ValueError(
211
+ "Failed to convert column values to numpy array: "
212
+ f"({_truncated_repr(column_values)}): {e}."
213
+ ) from e
214
+
215
+ elif is_array_like(column_values):
216
+ # Converts other array-like objects such as torch.Tensor.
217
+ try:
218
+ # Use np.asarray() instead of np.array() to avoid copying if possible.
219
+ return np.asarray(column_values)
220
+ except Exception as e:
221
+ logger.error(
222
+ f"Failed to convert column values to numpy array: "
223
+ f"{_truncated_repr(column_values)}",
224
+ exc_info=e,
225
+ )
226
+
227
+ raise ValueError(
228
+ "Failed to convert column values to numpy array: "
229
+ f"({_truncated_repr(column_values)}): {e}."
230
+ ) from e
231
+
232
+ else:
233
+ return column_values
.venv/lib/python3.11/site-packages/ray/data/_internal/output_buffer.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Any
2
+
3
+ from ray.data._internal.delegating_block_builder import DelegatingBlockBuilder
4
+ from ray.data.block import Block, BlockAccessor, DataBatch
5
+ from ray.data.context import MAX_SAFE_BLOCK_SIZE_FACTOR
6
+
7
+
8
+ class BlockOutputBuffer:
9
+ """Generates output blocks of a given size given a stream of inputs.
10
+
11
+ This class is used to turn a stream of items / blocks of arbitrary size
12
+ into a stream of blocks of ``target_max_block_size``. The caller should
13
+ check ``has_next()`` after each ``add()`` call, and call ``next()`` to get
14
+ the next block when ``has_next()`` returns True.
15
+
16
+ When all items have been added, the caller must call ``finalize()`` and
17
+ then check ``has_next()`` one last time.
18
+
19
+ Examples:
20
+ >>> from ray.data._internal.output_buffer import BlockOutputBuffer
21
+ >>> udf = ... # doctest: +SKIP
22
+ >>> generator = ... # doctest: +SKIP
23
+ >>> # Yield a stream of output blocks.
24
+ >>> output = BlockOutputBuffer(udf, 500 * 1024 * 1024) # doctest: +SKIP
25
+ >>> for item in generator(): # doctest: +SKIP
26
+ ... output.add(item) # doctest: +SKIP
27
+ ... if output.has_next(): # doctest: +SKIP
28
+ ... yield output.next() # doctest: +SKIP
29
+ >>> output.finalize() # doctest: +SKIP
30
+ >>> if output.has_next() # doctest: +SKIP
31
+ ... yield output.next() # doctest: +SKIP
32
+ """
33
+
34
+ def __init__(self, target_max_block_size: int):
35
+ self._target_max_block_size = target_max_block_size
36
+ self._buffer = DelegatingBlockBuilder()
37
+ self._returned_at_least_one_block = False
38
+ self._finalized = False
39
+
40
+ def add(self, item: Any) -> None:
41
+ """Add a single item to this output buffer."""
42
+ assert not self._finalized
43
+ self._buffer.add(item)
44
+
45
+ def add_batch(self, batch: DataBatch) -> None:
46
+ """Add a data batch to this output buffer."""
47
+ assert not self._finalized
48
+ self._buffer.add_batch(batch)
49
+
50
+ def add_block(self, block: Block) -> None:
51
+ """Add a data block to this output buffer."""
52
+ assert not self._finalized
53
+ self._buffer.add_block(block)
54
+
55
+ def finalize(self) -> None:
56
+ """Must be called once all items have been added."""
57
+ assert not self._finalized
58
+ self._finalized = True
59
+
60
+ def has_next(self) -> bool:
61
+ """Returns true when a complete output block is produced."""
62
+ if self._finalized:
63
+ return not self._returned_at_least_one_block or self._buffer.num_rows() > 0
64
+ else:
65
+ return (
66
+ self._buffer.get_estimated_memory_usage() > self._target_max_block_size
67
+ )
68
+
69
+ def next(self) -> Block:
70
+ """Returns the next complete output block."""
71
+ assert self.has_next()
72
+
73
+ block_to_yield = self._buffer.build()
74
+ block_remainder = None
75
+ block = BlockAccessor.for_block(block_to_yield)
76
+ if (
77
+ block.size_bytes()
78
+ >= MAX_SAFE_BLOCK_SIZE_FACTOR * self._target_max_block_size
79
+ ):
80
+ # Slice a block to respect the target max block size. We only do
81
+ # this if we are more than 50% above the target block size, because
82
+ # this ensures that the last block produced will be at least half
83
+ # the block size.
84
+ num_bytes_per_row = block.size_bytes() // block.num_rows()
85
+ target_num_rows = max(1, self._target_max_block_size // num_bytes_per_row)
86
+
87
+ if target_num_rows < block.num_rows():
88
+ # NOTE: We're maintaining following protocol of slicing underlying block
89
+ # into appropriately sized ones:
90
+ #
91
+ # - (Finalized) Target blocks sliced from the original one
92
+ # and are *copied* to avoid referencing original blocks
93
+ # - Temporary remainder of the block should *NOT* be copied
94
+ # such as to avoid repeatedly copying the remainder bytes
95
+ # of the block, resulting in O(M * N) total bytes being
96
+ # copied, where N is the total number of bytes in the original
97
+ # block and M is the number of blocks that will be produced by
98
+ # this iterator
99
+ block_to_yield = block.slice(0, target_num_rows, copy=True)
100
+ block_remainder = block.slice(
101
+ target_num_rows, block.num_rows(), copy=False
102
+ )
103
+
104
+ self._buffer = DelegatingBlockBuilder()
105
+ if block_remainder is not None:
106
+ self._buffer.add_block(block_remainder)
107
+
108
+ self._returned_at_least_one_block = True
109
+ return block_to_yield
.venv/lib/python3.11/site-packages/ray/data/_internal/pandas_block.py ADDED
@@ -0,0 +1,728 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import collections
2
+ import heapq
3
+ import logging
4
+ import sys
5
+ from typing import (
6
+ TYPE_CHECKING,
7
+ Any,
8
+ Callable,
9
+ Dict,
10
+ Iterator,
11
+ List,
12
+ Optional,
13
+ Sequence,
14
+ Tuple,
15
+ TypeVar,
16
+ Union,
17
+ )
18
+
19
+ import numpy as np
20
+
21
+ from ray.air.constants import TENSOR_COLUMN_NAME
22
+ from ray.air.util.tensor_extensions.utils import _is_ndarray_tensor
23
+ from ray.data._internal.numpy_support import convert_to_numpy, validate_numpy_batch
24
+ from ray.data._internal.row import TableRow
25
+ from ray.data._internal.table_block import TableBlockAccessor, TableBlockBuilder
26
+ from ray.data._internal.util import find_partitions, keys_equal
27
+ from ray.data.block import (
28
+ Block,
29
+ BlockAccessor,
30
+ BlockExecStats,
31
+ BlockMetadata,
32
+ BlockType,
33
+ KeyType,
34
+ U,
35
+ )
36
+ from ray.data.context import DataContext
37
+
38
+ if TYPE_CHECKING:
39
+ import pandas
40
+ import pyarrow
41
+
42
+ from ray.data._internal.planner.exchange.sort_task_spec import SortKey
43
+ from ray.data.aggregate import AggregateFn
44
+
45
+ T = TypeVar("T")
46
+ # Max number of samples used to estimate the Pandas block size.
47
+ _PANDAS_SIZE_BYTES_MAX_SAMPLE_COUNT = 50
48
+
49
+ logger = logging.getLogger(__name__)
50
+
51
+ _pandas = None
52
+
53
+
54
+ def lazy_import_pandas():
55
+ global _pandas
56
+ if _pandas is None:
57
+ import pandas
58
+
59
+ _pandas = pandas
60
+ return _pandas
61
+
62
+
63
+ class PandasRow(TableRow):
64
+ """
65
+ Row of a tabular Dataset backed by a Pandas DataFrame block.
66
+ """
67
+
68
+ def __getitem__(self, key: Union[str, List[str]]) -> Any:
69
+ from ray.data.extensions import TensorArrayElement
70
+
71
+ pd = lazy_import_pandas()
72
+
73
+ def get_item(keys: List[str]) -> Any:
74
+ col = self._row[keys]
75
+ if len(col) == 0:
76
+ return None
77
+
78
+ items = col.iloc[0]
79
+ if isinstance(items.iloc[0], TensorArrayElement):
80
+ # Getting an item in a Pandas tensor column may return
81
+ # a TensorArrayElement, which we have to convert to an ndarray.
82
+ return pd.Series(item.to_numpy() for item in items)
83
+
84
+ try:
85
+ # Try to interpret this as a numpy-type value.
86
+ # See https://stackoverflow.com/questions/9452775/converting-numpy-dtypes-to-native-python-types. # noqa: E501
87
+ return pd.Series(item.as_py() for item in items)
88
+
89
+ except (AttributeError, ValueError):
90
+ # Fallback to the original form.
91
+ return items
92
+
93
+ is_single_item = isinstance(key, str)
94
+ keys = [key] if is_single_item else key
95
+
96
+ items = get_item(keys)
97
+
98
+ if items is None:
99
+ return None
100
+ elif is_single_item:
101
+ return items.iloc[0]
102
+ else:
103
+ return items
104
+
105
+ def __iter__(self) -> Iterator:
106
+ for k in self._row.columns:
107
+ yield k
108
+
109
+ def __len__(self):
110
+ return self._row.shape[1]
111
+
112
+
113
+ class PandasBlockBuilder(TableBlockBuilder):
114
+ def __init__(self):
115
+ pandas = lazy_import_pandas()
116
+ super().__init__(pandas.DataFrame)
117
+
118
+ @staticmethod
119
+ def _table_from_pydict(columns: Dict[str, List[Any]]) -> "pandas.DataFrame":
120
+ pandas = lazy_import_pandas()
121
+
122
+ pd_columns: Dict[str, Any] = {}
123
+
124
+ for col_name, col_vals in columns.items():
125
+ np_col_vals = convert_to_numpy(col_vals)
126
+
127
+ if col_name == TENSOR_COLUMN_NAME or _is_ndarray_tensor(np_col_vals):
128
+ from ray.data.extensions.tensor_extension import TensorArray
129
+
130
+ pd_columns[col_name] = TensorArray(np_col_vals)
131
+ else:
132
+ pd_columns[col_name] = np_col_vals
133
+
134
+ return pandas.DataFrame(pd_columns)
135
+
136
+ @staticmethod
137
+ def _concat_tables(tables: List["pandas.DataFrame"]) -> "pandas.DataFrame":
138
+ pandas = lazy_import_pandas()
139
+ from ray.air.util.data_batch_conversion import (
140
+ _cast_ndarray_columns_to_tensor_extension,
141
+ )
142
+
143
+ if len(tables) > 1:
144
+ df = pandas.concat(tables, ignore_index=True)
145
+ df.reset_index(drop=True, inplace=True)
146
+ else:
147
+ df = tables[0]
148
+ ctx = DataContext.get_current()
149
+ if ctx.enable_tensor_extension_casting:
150
+ df = _cast_ndarray_columns_to_tensor_extension(df)
151
+ return df
152
+
153
+ @staticmethod
154
+ def _concat_would_copy() -> bool:
155
+ return True
156
+
157
+ @staticmethod
158
+ def _empty_table() -> "pandas.DataFrame":
159
+ pandas = lazy_import_pandas()
160
+ return pandas.DataFrame()
161
+
162
+ def block_type(self) -> BlockType:
163
+ return BlockType.PANDAS
164
+
165
+
166
+ # This is to be compatible with pyarrow.lib.schema
167
+ # TODO (kfstorm): We need a format-independent way to represent schema.
168
+ PandasBlockSchema = collections.namedtuple("PandasBlockSchema", ["names", "types"])
169
+
170
+
171
+ class PandasBlockAccessor(TableBlockAccessor):
172
+ ROW_TYPE = PandasRow
173
+
174
+ def __init__(self, table: "pandas.DataFrame"):
175
+ super().__init__(table)
176
+
177
+ def column_names(self) -> List[str]:
178
+ return self._table.columns.tolist()
179
+
180
+ def append_column(self, name: str, data: Any) -> Block:
181
+ assert name not in self._table.columns
182
+
183
+ if any(isinstance(item, np.ndarray) for item in data):
184
+ raise NotImplementedError(
185
+ f"`{self.__class__.__name__}.append_column()` doesn't support "
186
+ "array-like data."
187
+ )
188
+
189
+ table = self._table.copy()
190
+ table[name] = data
191
+ return table
192
+
193
+ @staticmethod
194
+ def _build_tensor_row(row: PandasRow) -> np.ndarray:
195
+ from ray.data.extensions import TensorArrayElement
196
+
197
+ tensor = row[TENSOR_COLUMN_NAME].iloc[0]
198
+ if isinstance(tensor, TensorArrayElement):
199
+ # Getting an item in a Pandas tensor column may return a TensorArrayElement,
200
+ # which we have to convert to an ndarray.
201
+ tensor = tensor.to_numpy()
202
+ return tensor
203
+
204
+ def slice(self, start: int, end: int, copy: bool = False) -> "pandas.DataFrame":
205
+ view = self._table[start:end]
206
+ view.reset_index(drop=True, inplace=True)
207
+ if copy:
208
+ view = view.copy(deep=True)
209
+ return view
210
+
211
+ def take(self, indices: List[int]) -> "pandas.DataFrame":
212
+ table = self._table.take(indices)
213
+ table.reset_index(drop=True, inplace=True)
214
+ return table
215
+
216
+ def select(self, columns: List[str]) -> "pandas.DataFrame":
217
+ if not all(isinstance(col, str) for col in columns):
218
+ raise ValueError(
219
+ "Columns must be a list of column name strings when aggregating on "
220
+ f"Pandas blocks, but got: {columns}."
221
+ )
222
+ return self._table[columns]
223
+
224
+ def rename_columns(self, columns_rename: Dict[str, str]) -> "pandas.DataFrame":
225
+ return self._table.rename(columns=columns_rename, inplace=False, copy=False)
226
+
227
+ def random_shuffle(self, random_seed: Optional[int]) -> "pandas.DataFrame":
228
+ table = self._table.sample(frac=1, random_state=random_seed)
229
+ table.reset_index(drop=True, inplace=True)
230
+ return table
231
+
232
+ def schema(self) -> PandasBlockSchema:
233
+ dtypes = self._table.dtypes
234
+ schema = PandasBlockSchema(
235
+ names=dtypes.index.tolist(), types=dtypes.values.tolist()
236
+ )
237
+ # Column names with non-str types of a pandas DataFrame is not
238
+ # supported by Ray Dataset.
239
+ if any(not isinstance(name, str) for name in schema.names):
240
+ raise ValueError(
241
+ "A Pandas DataFrame with column names of non-str types"
242
+ " is not supported by Ray Dataset. Column names of this"
243
+ f" DataFrame: {schema.names!r}."
244
+ )
245
+ return schema
246
+
247
+ def to_pandas(self) -> "pandas.DataFrame":
248
+ from ray.air.util.data_batch_conversion import _cast_tensor_columns_to_ndarrays
249
+
250
+ ctx = DataContext.get_current()
251
+ table = self._table
252
+ if ctx.enable_tensor_extension_casting:
253
+ table = _cast_tensor_columns_to_ndarrays(table)
254
+ return table
255
+
256
+ def to_numpy(
257
+ self, columns: Optional[Union[str, List[str]]] = None
258
+ ) -> Union[np.ndarray, Dict[str, np.ndarray]]:
259
+ if columns is None:
260
+ columns = self._table.columns.tolist()
261
+ should_be_single_ndarray = False
262
+ elif isinstance(columns, list):
263
+ should_be_single_ndarray = False
264
+ else:
265
+ columns = [columns]
266
+ should_be_single_ndarray = True
267
+
268
+ column_names_set = set(self._table.columns)
269
+ for column in columns:
270
+ if column not in column_names_set:
271
+ raise ValueError(
272
+ f"Cannot find column {column}, available columns: "
273
+ f"{self._table.columns.tolist()}"
274
+ )
275
+
276
+ arrays = []
277
+ for column in columns:
278
+ arrays.append(self._table[column].to_numpy())
279
+
280
+ if should_be_single_ndarray:
281
+ arrays = arrays[0]
282
+ else:
283
+ arrays = dict(zip(columns, arrays))
284
+ return arrays
285
+
286
+ def to_arrow(self) -> "pyarrow.Table":
287
+ import pyarrow
288
+
289
+ # Set `preserve_index=False` so that Arrow doesn't add a '__index_level_0__'
290
+ # column to the resulting table.
291
+ return pyarrow.Table.from_pandas(self._table, preserve_index=False)
292
+
293
+ @staticmethod
294
+ def numpy_to_block(
295
+ batch: Union[Dict[str, np.ndarray], Dict[str, list]],
296
+ ) -> "pandas.DataFrame":
297
+ validate_numpy_batch(batch)
298
+
299
+ block = PandasBlockBuilder._table_from_pydict(batch)
300
+ return block
301
+
302
+ def num_rows(self) -> int:
303
+ return self._table.shape[0]
304
+
305
def size_bytes(self) -> int:
    """Estimate the in-memory size of this block in bytes.

    ``memory_usage(deep=True)`` under-counts object-dtype columns holding
    nested containers, so those columns (and tensor-extension columns) are
    re-estimated by deep-sizing a bounded random sample of values and
    scaling up to the full column length.
    """
    from pandas.api.types import is_object_dtype

    from ray.air.util.tensor_extensions.pandas import TensorArray
    from ray.data.extensions import TensorArrayElement, TensorDtype

    pd = lazy_import_pandas()

    def get_deep_size(obj):
        """Calculates the memory size of objects,
        including nested objects using an iterative approach."""
        seen = set()
        total_size = 0
        objects = collections.deque([obj])
        while objects:
            current = objects.pop()

            # Skip interning-eligible immutable objects
            if isinstance(current, (str, bytes, int, float)):
                size = sys.getsizeof(current)
                total_size += size
                continue

            # Check if the object has been seen before
            # i.e. a = np.ndarray([1,2,3]), b = [a,a]
            # The pattern above will have only one memory copy
            if id(current) in seen:
                continue
            seen.add(id(current))

            try:
                size = sys.getsizeof(current)
            except TypeError:
                size = 0
            total_size += size

            # Handle specific cases
            if isinstance(current, np.ndarray):
                total_size += current.nbytes - size  # Avoid double counting
            elif isinstance(current, pd.DataFrame):
                total_size += (
                    current.memory_usage(index=True, deep=True).sum() - size
                )
            elif isinstance(current, (list, tuple, set)):
                objects.extend(current)
            elif isinstance(current, dict):
                objects.extend(current.keys())
                objects.extend(current.values())
            elif isinstance(current, TensorArrayElement):
                objects.extend(current.to_numpy())
        return total_size

    # Get initial memory usage including deep introspection
    memory_usage = self._table.memory_usage(index=True, deep=True)

    # TensorDtype for ray.air.util.tensor_extensions.pandas.TensorDtype
    object_need_check = (TensorDtype,)
    max_sample_count = _PANDAS_SIZE_BYTES_MAX_SAMPLE_COUNT

    # Handle object columns separately
    for column in self._table.columns:
        # Check pandas object dtype and the extension dtype
        if is_object_dtype(self._table[column].dtype) or isinstance(
            self._table[column].dtype, object_need_check
        ):
            total_size = len(self._table[column])

            # Determine the sample size based on max_sample_count
            sample_size = min(total_size, max_sample_count)
            # Following code also handles the case sample_size == total_size
            # (sampling is done without replacement).
            sampled_data = self._table[column].sample(n=sample_size).values

            try:
                # Numeric tensor columns expose nbytes directly; skip the
                # (much slower) per-element deep sizing for them.
                if isinstance(sampled_data, TensorArray) and np.issubdtype(
                    sampled_data[0].numpy_dtype, np.number
                ):
                    column_memory_sample = sampled_data.nbytes
                else:
                    vectorized_size_calc = np.vectorize(lambda x: get_deep_size(x))
                    column_memory_sample = np.sum(
                        vectorized_size_calc(sampled_data)
                    )
                # Scale back to the full column size if we sampled
                column_memory = column_memory_sample * (total_size / sample_size)
                memory_usage[column] = int(column_memory)
            except Exception as e:
                # Best-effort: fall back to the shallow estimate for this
                # column rather than failing the whole size computation.
                logger.warning(f"Error calculating size for column '{column}': {e}")

    # Sum up total memory usage
    total_memory_usage = memory_usage.sum()

    return int(total_memory_usage)
398
+
399
def _zip(self, acc: BlockAccessor) -> "pandas.DataFrame":
    """Horizontally concatenate this block with ``acc``'s block.

    Columns from ``acc`` whose names collide with existing columns are
    renamed to ``<name>_<i>`` with the smallest ``i`` that is unique.
    """
    left = self.to_pandas().copy(deep=False)
    right = acc.to_pandas()
    for name in right.columns:
        series = right[name]
        existing = list(left.columns)
        # Ensure the column names are unique after zip.
        unique_name = name
        suffix = 1
        while unique_name in existing:
            unique_name = "{}_{}".format(name, suffix)
            suffix += 1
        left[unique_name] = series
    return left
415
+
416
@staticmethod
def builder() -> PandasBlockBuilder:
    """Create a new, empty builder for pandas blocks."""
    block_builder = PandasBlockBuilder()
    return block_builder
419
+
420
@staticmethod
def _empty_table() -> "pandas.DataFrame":
    """Return an empty pandas block (delegates to the builder)."""
    empty_df = PandasBlockBuilder._empty_table()
    return empty_df
423
+
424
def _sample(self, n_samples: int, sort_key: "SortKey") -> "pandas.DataFrame":
    """Randomly sample ``n_samples`` rows of the sort-key columns."""
    key_columns = sort_key.get_columns()
    return self._table[key_columns].sample(n_samples, ignore_index=True)
426
+
427
def _apply_agg(
    self, agg_fn: Callable[["pandas.Series", bool], U], on: str
) -> Optional[U]:
    """Helper providing null handling around applying an aggregation to a column."""
    pd = lazy_import_pandas()
    if on is not None and not isinstance(on, str):
        raise ValueError(
            "on must be a string or None when aggregating on Pandas blocks, but "
            f"got: {type(on)}."
        )

    if self.num_rows() == 0:
        return None

    column = self._table[on]
    try:
        result = agg_fn(column)
    except TypeError as e:
        # An all-null object column (e.g. produced by converting an all-null
        # Arrow column to pandas) raises TypeError on most binary operations;
        # detect that case and propagate a null instead of erroring.
        if np.issubdtype(column.dtype, np.object_) and column.isnull().all():
            return None
        raise e from None
    return None if pd.isnull(result) else result
455
+
456
def count(self, on: str) -> Optional[U]:
    """Return the number of non-null entries in column ``on``."""

    def _count_non_null(col):
        return col.count()

    return self._apply_agg(_count_non_null, on)
458
+
459
def sum(self, on: str, ignore_nulls: bool) -> Optional[U]:
    """Return the sum of column ``on``, or ``None`` for empty/all-null columns.

    Args:
        on: Column name to aggregate; must be a string (or None, in which
            case ``_apply_agg`` raises/returns consistently with the other
            aggregations).
        ignore_nulls: Whether nulls are skipped when summing.

    Returns:
        The column sum, or ``None`` when the block is empty or the column is
        entirely null.
    """

    def _sum_with_all_null_check(col):
        if col.isnull().all():
            # Short-circuit on an all-null column, returning None. This is
            # required for sum() since it would otherwise return 0 when
            # summing an all-null column, which is not what we want.
            return None
        return col.sum(skipna=ignore_nulls)

    # Delegate the "on" type validation, empty-block handling, and null
    # propagation to the shared helper for consistency with min/max/mean.
    return self._apply_agg(_sum_with_all_null_check, on)
480
+
481
def min(self, on: str, ignore_nulls: bool) -> Optional[U]:
    """Return the minimum of column ``on`` (``None`` if empty/all-null)."""

    def _column_min(col):
        return col.min(skipna=ignore_nulls)

    return self._apply_agg(_column_min, on)
483
+
484
def max(self, on: str, ignore_nulls: bool) -> Optional[U]:
    """Return the maximum of column ``on`` (``None`` if empty/all-null)."""

    def _column_max(col):
        return col.max(skipna=ignore_nulls)

    return self._apply_agg(_column_max, on)
486
+
487
def mean(self, on: str, ignore_nulls: bool) -> Optional[U]:
    """Return the mean of column ``on`` (``None`` if empty/all-null)."""

    def _column_mean(col):
        return col.mean(skipna=ignore_nulls)

    return self._apply_agg(_column_mean, on)
489
+
490
def sum_of_squared_diffs_from_mean(
    self,
    on: str,
    ignore_nulls: bool,
    mean: Optional[U] = None,
) -> Optional[U]:
    """Sum of squared deviations of column ``on`` from ``mean``.

    When ``mean`` is not provided, it is first computed from the column.
    """
    if mean is None:
        mean = self.mean(on, ignore_nulls)

    def _squared_deviation_sum(col):
        deviations = col - mean
        return (deviations**2).sum(skipna=ignore_nulls)

    return self._apply_agg(_squared_deviation_sum, on)
502
+
503
def sort_and_partition(
    self, boundaries: List[T], sort_key: "SortKey"
) -> List[Block]:
    """Sort this block by ``sort_key`` and split it at ``boundaries``.

    Returns ``len(boundaries) + 1`` blocks; with no boundaries, a single
    sorted block is returned.
    """
    if self._table.shape[0] == 0:
        # An empty table may have no schema, in which case sorting would
        # raise; return the right number of empty partitions instead.
        return [self._empty_table() for _ in range(len(boundaries) + 1)]

    sort_columns, sort_ascending = sort_key.to_pandas_sort_args()
    sorted_table = self._table.sort_values(by=sort_columns, ascending=sort_ascending)
    if not boundaries:
        return [sorted_table]

    return find_partitions(sorted_table, boundaries, sort_key)
517
+
518
# TODO (srinathk) Needs to handle None types correctly.
def combine(
    self, sort_key: "SortKey", aggs: Tuple["AggregateFn"]
) -> "pandas.DataFrame":
    """Combine rows with the same key into an accumulator.

    This assumes the block is already sorted by key in ascending order.

    Args:
        sort_key: A SortKey object which holds column names/keys.
            If this is ``None``, place all rows in a single group.
        aggs: The aggregations to do.

    Returns:
        A sorted block of [k, v_1, ..., v_n] columns where k is the groupby
        key and v_i is the partially combined accumulator for the ith given
        aggregation.
        If key is None then the k column is omitted.
    """
    keys: List[str] = sort_key.get_columns()
    pd = lazy_import_pandas()

    def iter_groups() -> Iterator[Tuple[Sequence[KeyType], Block]]:
        """Creates an iterator over zero-copy group views."""
        if not keys:
            # Global aggregation consists of a single "group", so we
            # short-circuit.
            yield tuple(), self.to_block()
            return

        start = end = 0
        # Renamed from `iter` so the builtin isn't shadowed.
        row_iter = self.iter_rows(public_row_format=False)
        next_row = None
        while True:
            try:
                if next_row is None:
                    next_row = next(row_iter)
                next_keys = next_row[keys]
                # Advance until the key changes to find the group boundary.
                while keys_equal(next_row[keys], next_keys):
                    end += 1
                    try:
                        next_row = next(row_iter)
                    except StopIteration:
                        next_row = None
                        break
                if isinstance(next_keys, pd.Series):
                    next_keys = next_keys.values
                # Zero-copy view of the rows belonging to this group.
                yield next_keys, self.slice(start, end, copy=False)
                start = end
            except StopIteration:
                break

    builder = PandasBlockBuilder()
    for group_keys, group_view in iter_groups():
        # Aggregate.
        init_vals = group_keys
        if len(group_keys) == 1:
            init_vals = group_keys[0]
        accumulators = [agg.init(init_vals) for agg in aggs]
        for i in range(len(aggs)):
            accumulators[i] = aggs[i].accumulate_block(accumulators[i], group_view)

        # Build the row.
        row = {}
        if keys:
            for k, gk in zip(keys, group_keys):
                row[k] = gk

        count = collections.defaultdict(int)
        for agg, accumulator in zip(aggs, accumulators):
            name = agg.name
            # Check for conflicts with existing aggregation name.
            if count[name] > 0:
                name = self._munge_conflict(name, count[name])
            count[name] += 1
            row[name] = accumulator

        builder.add(row)

    return builder.build()
598
+
599
@staticmethod
def merge_sorted_blocks(
    blocks: List[Block], sort_key: "SortKey"
) -> Tuple["pandas.DataFrame", BlockMetadata]:
    """Concatenate pre-sorted blocks and re-sort the combined result.

    Empty blocks are dropped first; if nothing remains, an empty table is
    returned. Returns the merged block together with its metadata.
    """
    pd = lazy_import_pandas()
    exec_stats = BlockExecStats.builder()
    non_empty = [block for block in blocks if block.shape[0] > 0]
    if not non_empty:
        merged = PandasBlockAccessor._empty_table()
    else:
        # Handle blocks of different types.
        normalized = TableBlockAccessor.normalize_block_types(non_empty, "pandas")
        merged = pd.concat(normalized, ignore_index=True)
        sort_columns, sort_ascending = sort_key.to_pandas_sort_args()
        merged = merged.sort_values(by=sort_columns, ascending=sort_ascending)
    accessor = PandasBlockAccessor(merged)
    return merged, accessor.get_metadata(exec_stats=exec_stats.build())
615
+
616
@staticmethod
def aggregate_combined_blocks(
    blocks: List["pandas.DataFrame"],
    sort_key: "SortKey",
    aggs: Tuple["AggregateFn"],
    finalize: bool,
) -> Tuple["pandas.DataFrame", BlockMetadata]:
    """Aggregate sorted, partially combined blocks with the same key range.

    This assumes blocks are already sorted by key in ascending order,
    so we can do merge sort to get all the rows with the same key.

    Args:
        blocks: A list of partially combined and sorted blocks.
        sort_key: The column name of key or None for global aggregation.
        aggs: The aggregations to do.
        finalize: Whether to finalize the aggregation. This is used as an
            optimization for cases where we repeatedly combine partially
            aggregated groups.

    Returns:
        A block of [k, v_1, ..., v_n] columns and its metadata where k is
        the groupby key and v_i is the corresponding aggregation result for
        the ith given aggregation.
        If key is None then the k column is omitted.
    """

    stats = BlockExecStats.builder()
    keys = sort_key.get_columns()

    def key_fn(r):
        # Global aggregation uses a constant key so all rows form one group.
        if keys:
            return tuple(r[keys])
        else:
            return (0,)

    # Handle blocks of different types.
    blocks = TableBlockAccessor.normalize_block_types(blocks, "pandas")

    # Merge-sort the per-block row iterators (renamed from `iter` so the
    # builtin isn't shadowed).
    merged_iter = heapq.merge(
        *[
            PandasBlockAccessor(block).iter_rows(public_row_format=False)
            for block in blocks
        ],
        key=key_fn,
    )
    next_row = None
    builder = PandasBlockBuilder()
    while True:
        try:
            if next_row is None:
                next_row = next(merged_iter)
            next_keys = key_fn(next_row)
            next_key_columns = keys

            def gen():
                # Yield all rows sharing the current key, leaving `next_row`
                # pointing at the first row of the following group.
                nonlocal merged_iter
                nonlocal next_row
                while keys_equal(key_fn(next_row), next_keys):
                    yield next_row
                    try:
                        next_row = next(merged_iter)
                    except StopIteration:
                        next_row = None
                        break

            # Merge.
            first = True
            accumulators = [None] * len(aggs)
            resolved_agg_names = [None] * len(aggs)
            for r in gen():
                if first:
                    count = collections.defaultdict(int)
                    for i in range(len(aggs)):
                        name = aggs[i].name
                        # Check for conflicts with existing aggregation
                        # name.
                        if count[name] > 0:
                            name = PandasBlockAccessor._munge_conflict(
                                name, count[name]
                            )
                        count[name] += 1
                        resolved_agg_names[i] = name
                        accumulators[i] = r[name]
                    first = False
                else:
                    for i in range(len(aggs)):
                        accumulators[i] = aggs[i].merge(
                            accumulators[i], r[resolved_agg_names[i]]
                        )
            # Build the row.
            row = {}
            if keys:
                for col_name, next_key in zip(next_key_columns, next_keys):
                    row[col_name] = next_key

            for agg, agg_name, accumulator in zip(
                aggs, resolved_agg_names, accumulators
            ):
                if finalize:
                    row[agg_name] = agg.finalize(accumulator)
                else:
                    row[agg_name] = accumulator

            builder.add(row)
        except StopIteration:
            break

    ret = builder.build()
    return ret, PandasBlockAccessor(ret).get_metadata(exec_stats=stats.build())
726
+
727
def block_type(self) -> BlockType:
    """Return ``BlockType.PANDAS``, identifying this accessor's block format."""
    return BlockType.PANDAS
.venv/lib/python3.11/site-packages/ray/data/_internal/plan.py ADDED
@@ -0,0 +1,602 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ import itertools
3
+ import logging
4
+ from typing import TYPE_CHECKING, Iterator, List, Optional, Tuple, Type, Union
5
+
6
+ import pyarrow
7
+
8
+ import ray
9
+ from ray._private.internal_api import get_memory_info_reply, get_state_from_address
10
+ from ray.data._internal.execution.interfaces import RefBundle
11
+ from ray.data._internal.logical.interfaces.logical_operator import LogicalOperator
12
+ from ray.data._internal.logical.interfaces.logical_plan import LogicalPlan
13
+ from ray.data._internal.logical.operators.from_operators import AbstractFrom
14
+ from ray.data._internal.logical.operators.input_data_operator import InputData
15
+ from ray.data._internal.logical.operators.read_operator import Read
16
+ from ray.data._internal.stats import DatasetStats
17
+ from ray.data._internal.util import create_dataset_tag, unify_block_metadata_schema
18
+ from ray.data.block import BlockMetadata
19
+ from ray.data.context import DataContext
20
+ from ray.data.exceptions import omit_traceback_stdout
21
+ from ray.util.debug import log_once
22
+
23
+ if TYPE_CHECKING:
24
+
25
+ from ray.data._internal.execution.interfaces import Executor
26
+ from ray.data.dataset import Dataset
27
+
28
+
29
+ # Scheduling strategy can be inherited from prev operator if not specified.
30
+ INHERITABLE_REMOTE_ARGS = ["scheduling_strategy"]
31
+
32
+
33
+ logger = logging.getLogger(__name__)
34
+
35
+
36
+ class ExecutionPlan:
37
+ """A lazy execution plan for a Dataset.
38
+
39
+ This lazy execution plan builds up a chain of ``List[RefBundle]`` -->
40
+ ``List[RefBundle]`` operators. Prior to execution, we apply a set of logical
41
+ plan optimizations, such as operator fusion, in order to reduce Ray task
42
+ overhead and data copies.
43
+
44
+ Internally, the execution plan holds a snapshot of a computed list of
45
+ blocks and their associated metadata under ``self._snapshot_bundle``,
46
+ where this snapshot is the cached output of executing the operator chain."""
47
+
48
def __init__(
    self,
    stats: DatasetStats,
    *,
    data_context: Optional[DataContext] = None,
):
    """Create a plan with no transformation operators.

    Args:
        stats: Stats for the base blocks.
        data_context: :class:`~ray.data.context.DataContext`
            object to use for execution.
    """
    self._in_stats = stats
    # A computed snapshot of some prefix of operators and their corresponding
    # output blocks and stats.
    self._snapshot_operator: Optional[LogicalOperator] = None
    self._snapshot_stats = None
    self._snapshot_bundle = None
    # Snapshot of only metadata corresponding to the final operator's
    # output bundles, used as the source of truth for the Dataset's schema
    # and count. This is calculated and cached when the plan is executed as an
    # iterator (`execute_to_iterator()`), and avoids caching
    # all of the output blocks in memory like in `self.snapshot_bundle`.
    # TODO(scottjlee): To keep the caching logic consistent, update `execute()`
    # to also store the metadata in `_snapshot_metadata` instead of
    # `_snapshot_bundle`. For example, we could store the blocks in
    # `self._snapshot_blocks` and the metadata in `self._snapshot_metadata`.
    self._snapshot_metadata: Optional[BlockMetadata] = None

    # Cached schema.
    self._schema = None
    # Set when a Dataset is constructed with this plan
    self._dataset_uuid = None
    # Optional human-readable dataset name; used in repr and metrics tags.
    self._dataset_name = None
    # True once execute()/execute_to_iterator() has been entered.
    self._has_started_execution = False

    if data_context is None:
        # Snapshot the current context, so that the config of Datasets is always
        # determined by the config at the time it was created.
        self._context = copy.deepcopy(DataContext.get_current())
    else:
        self._context = data_context
93
+
94
def __repr__(self) -> str:
    """Terse debug representation showing the dataset uuid and snapshot operator."""
    fields = [
        f"dataset_uuid={self._dataset_uuid}",
        f"snapshot_operator={self._snapshot_operator}",
    ]
    return "ExecutionPlan({})".format(", ".join(fields))
101
+
102
def get_plan_as_string(self, dataset_cls: Type["Dataset"]) -> str:
    """Create a cosmetic string representation of this execution plan.

    Returns:
        The string representation of this execution plan.
    """
    # NOTE: this is used for Dataset.__repr__ to give a user-facing string
    # representation. Ideally ExecutionPlan.__repr__ should be replaced with this
    # method as well.

    from ray.data.dataset import MaterializedDataset

    # Do not force execution for schema, as this method is expected to be very
    # cheap.
    plan_str = ""
    plan_max_depth = 0
    if not self.has_computed_output():

        def generate_logical_plan_string(
            op: LogicalOperator,
            curr_str: str = "",
            depth: int = 0,
        ):
            """Traverse (DFS) the LogicalPlan DAG and
            return a string representation of the operators."""
            # Source operators terminate the printed chain.
            if isinstance(op, (Read, InputData, AbstractFrom)):
                return curr_str, depth

            curr_max_depth = depth
            op_name = op.name
            if depth == 0:
                curr_str += f"{op_name}\n"
            else:
                trailing_space = " " * ((depth - 1) * 3)
                curr_str += f"{trailing_space}+- {op_name}\n"

            for input in op.input_dependencies:
                curr_str, input_max_depth = generate_logical_plan_string(
                    input, curr_str, depth + 1
                )
                curr_max_depth = max(curr_max_depth, input_max_depth)
            return curr_str, curr_max_depth

        plan_str, plan_max_depth = generate_logical_plan_string(
            self._logical_plan.dag
        )

        if self._snapshot_bundle is not None:
            # This plan has executed some but not all operators.
            schema = unify_block_metadata_schema(self._snapshot_bundle.metadata)
            count = self._snapshot_bundle.num_rows()
        elif self._snapshot_metadata is not None:
            # Metadata-only snapshot from a previous iterator execution.
            schema = self._snapshot_metadata.schema
            count = self._snapshot_metadata.num_rows
        else:
            # This plan hasn't executed any operators.
            sources = self._logical_plan.sources()
            # TODO(@bveeramani): Handle schemas for n-ary operators like `Union`.
            if len(sources) > 1:
                # Multiple sources, cannot determine schema.
                schema = None
                count = None
            else:
                assert len(sources) == 1
                # Build a throwaway plan over just the source to get its
                # schema/count without executing this plan.
                plan = ExecutionPlan(DatasetStats(metadata={}, parent=None))
                plan.link_logical_plan(LogicalPlan(sources[0], plan._context))
                schema = plan.schema()
                count = plan.meta_count()
    else:
        # Get schema of output blocks.
        schema = self.schema(fetch_if_missing=False)
        count = self._snapshot_bundle.num_rows()

    if schema is None:
        schema_str = "Unknown schema"
    elif isinstance(schema, type):
        schema_str = str(schema)
    else:
        # Render as "{name: type, ...}" using short type names where possible.
        schema_str = []
        for n, t in zip(schema.names, schema.types):
            if hasattr(t, "__name__"):
                t = t.__name__
            schema_str.append(f"{n}: {t}")
        schema_str = ", ".join(schema_str)
        schema_str = "{" + schema_str + "}"

    if count is None:
        count = "?"

    num_blocks = None
    if dataset_cls == MaterializedDataset:
        num_blocks = self.initial_num_blocks()
        assert num_blocks is not None

    name_str = (
        "name={}, ".format(self._dataset_name)
        if self._dataset_name is not None
        else ""
    )
    num_blocks_str = f"num_blocks={num_blocks}, " if num_blocks else ""

    dataset_str = "{}({}{}num_rows={}, schema={})".format(
        dataset_cls.__name__,
        name_str,
        num_blocks_str,
        count,
        schema_str,
    )

    # If the resulting string representation fits in one line, use it directly.
    SCHEMA_LINE_CHAR_LIMIT = 80
    MIN_FIELD_LENGTH = 10
    INDENT_STR = " " * 3
    trailing_space = INDENT_STR * plan_max_depth

    if len(dataset_str) > SCHEMA_LINE_CHAR_LIMIT:
        # If the resulting string representation exceeds the line char limit,
        # first try breaking up each `Dataset` parameter into its own line
        # and check if each line fits within the line limit. We check the
        # `schema` param's length, since this is likely the longest string.
        schema_str_on_new_line = f"{trailing_space}{INDENT_STR}schema={schema_str}"
        if len(schema_str_on_new_line) > SCHEMA_LINE_CHAR_LIMIT:
            # If the schema cannot fit on a single line, break up each field
            # into its own line.
            schema_str = []
            for n, t in zip(schema.names, schema.types):
                if hasattr(t, "__name__"):
                    t = t.__name__
                col_str = f"{trailing_space}{INDENT_STR * 2}{n}: {t}"
                # If the field line exceeds the char limit, abbreviate
                # the field name to fit while maintaining the full type
                if len(col_str) > SCHEMA_LINE_CHAR_LIMIT:
                    shortened_suffix = f"...: {str(t)}"
                    # Show at least 10 characters of the field name, even if
                    # we have already hit the line limit with the type.
                    chars_left_for_col_name = max(
                        SCHEMA_LINE_CHAR_LIMIT - len(shortened_suffix),
                        MIN_FIELD_LENGTH,
                    )
                    col_str = (
                        f"{col_str[:chars_left_for_col_name]}{shortened_suffix}"
                    )
                schema_str.append(col_str)
            schema_str = ",\n".join(schema_str)
            schema_str = (
                "{\n" + schema_str + f"\n{trailing_space}{INDENT_STR}" + "}"
            )
        name_str = (
            f"\n{trailing_space}{INDENT_STR}name={self._dataset_name},"
            if self._dataset_name is not None
            else ""
        )
        num_blocks_str = (
            f"\n{trailing_space}{INDENT_STR}num_blocks={num_blocks},"
            if num_blocks
            else ""
        )
        dataset_str = (
            f"{dataset_cls.__name__}("
            f"{name_str}"
            f"{num_blocks_str}"
            f"\n{trailing_space}{INDENT_STR}num_rows={count},"
            f"\n{trailing_space}{INDENT_STR}schema={schema_str}"
            f"\n{trailing_space})"
        )

    # Attach the dataset summary at the bottom of the operator chain.
    if plan_max_depth == 0:
        plan_str += dataset_str
    else:
        plan_str += f"{INDENT_STR * (plan_max_depth - 1)}+- {dataset_str}"
    return plan_str
274
+
275
def link_logical_plan(self, logical_plan: "LogicalPlan"):
    """Link the logical plan into this execution plan.

    This is used for triggering execution for optimizer code path in this legacy
    execution plan.
    """
    self._logical_plan = logical_plan
    # Propagate this plan's snapshotted DataContext onto the logical plan.
    logical_plan._context = self._context
283
+
284
def copy(self) -> "ExecutionPlan":
    """Create a shallow copy of this execution plan.

    This copy can be executed without mutating the original, but clearing the copy
    will also clear the original.

    Returns:
        A shallow copy of this execution plan.
    """
    shallow = ExecutionPlan(self._in_stats, data_context=self._context)
    if self._snapshot_bundle is not None:
        # Share (don't duplicate) the cached snapshot with the copy.
        shallow._snapshot_bundle = self._snapshot_bundle
        shallow._snapshot_operator = self._snapshot_operator
        shallow._snapshot_stats = self._snapshot_stats
    shallow._dataset_name = self._dataset_name
    return shallow
304
+
305
def deep_copy(self) -> "ExecutionPlan":
    """Create a deep copy of this execution plan.

    This copy can be executed AND cleared without mutating the original.

    Returns:
        A deep copy of this execution plan.
    """
    # NOTE(review): unlike `copy()`, no `data_context` is forwarded here, so
    # the new plan re-snapshots `DataContext.get_current()` — confirm this is
    # intentional.
    duplicate = ExecutionPlan(copy.copy(self._in_stats))
    if self._snapshot_bundle:
        # Shallow-copy each snapshot piece so clearing one plan doesn't
        # clear the other.
        duplicate._snapshot_bundle = copy.copy(self._snapshot_bundle)
        duplicate._snapshot_operator = copy.copy(self._snapshot_operator)
        duplicate._snapshot_stats = copy.copy(self._snapshot_stats)
    duplicate._dataset_name = self._dataset_name
    return duplicate
321
+
322
def initial_num_blocks(self) -> Optional[int]:
    """Get the estimated number of blocks from the logical plan
    after applying execution plan optimizations, but prior to
    fully executing the dataset."""
    dag = self._logical_plan.dag
    return dag.estimated_num_outputs()
327
+
328
def schema(
    self, fetch_if_missing: bool = False
) -> Union[type, "pyarrow.lib.Schema"]:
    """Get the schema after applying all execution plan optimizations,
    but prior to fully executing the dataset
    (unless `fetch_if_missing` is set to True).

    Args:
        fetch_if_missing: Whether to execute the plan to fetch the schema.

    Returns:
        The schema of the output dataset.
    """
    if self._schema is not None:
        return self._schema

    schema = None
    if self.has_computed_output():
        schema = unify_block_metadata_schema(self._snapshot_bundle.metadata)
    # Bind the aggregated schema with the walrus operator so
    # `aggregate_output_metadata()` (which walks the operator DAG) is
    # computed once instead of twice.
    elif (
        metadata_schema := self._logical_plan.dag.aggregate_output_metadata().schema
    ) is not None:
        schema = metadata_schema
    elif fetch_if_missing:
        iter_ref_bundles, _, _ = self.execute_to_iterator()
        for ref_bundle in iter_ref_bundles:
            for metadata in ref_bundle.metadata:
                # Prefer the schema of a non-empty (or unknown-size) block.
                if metadata.schema is not None and (
                    metadata.num_rows is None or metadata.num_rows > 0
                ):
                    schema = metadata.schema
                    break
            # NOTE(review): only the inner loop breaks above; iteration over
            # the remaining bundles continues — confirm whether an outer
            # break was intended.
    elif self.is_read_only():
        # For consistency with the previous implementation, we fetch the schema if
        # the plan is read-only even if `fetch_if_missing` is False.
        iter_ref_bundles, _, _ = self.execute_to_iterator()
        try:
            ref_bundle = next(iter(iter_ref_bundles))
            for metadata in ref_bundle.metadata:
                if metadata.schema is not None:
                    schema = metadata.schema
                    break
        except StopIteration:  # Empty dataset.
            schema = None

    # Cache the result so subsequent calls are free.
    self._schema = schema
    return self._schema
373
+
374
def cache_schema(self, schema: Union[type, "pyarrow.lib.Schema"]):
    """Store ``schema`` so later ``schema()`` calls return it without any work."""
    self._schema = schema
376
+
377
def input_files(self) -> Optional[List[str]]:
    """Get the input files of the dataset, if available."""
    # Sourced from the logical DAG's aggregated output metadata; may be None.
    return self._logical_plan.dag.aggregate_output_metadata().input_files
380
+
381
def meta_count(self) -> Optional[int]:
    """Get the number of rows after applying all plan optimizations, if possible.

    This method will never trigger any computation.

    Returns:
        The number of records of the result Dataset, or None.
    """
    if self.has_computed_output():
        num_rows = sum(m.num_rows for m in self._snapshot_bundle.metadata)
    # Bind the aggregated row count with the walrus operator so
    # `aggregate_output_metadata()` (which walks the operator DAG) is
    # computed once instead of twice.
    elif (
        metadata_num_rows := self._logical_plan.dag.aggregate_output_metadata().num_rows
    ) is not None:
        num_rows = metadata_num_rows
    else:
        num_rows = None
    return num_rows
396
+
397
@omit_traceback_stdout
def execute_to_iterator(
    self,
) -> Tuple[Iterator[RefBundle], DatasetStats, Optional["Executor"]]:
    """Execute this plan, returning an iterator.

    This will use streaming execution to generate outputs.

    Returns:
        Tuple of iterator over output RefBundles, DatasetStats, and the executor.
    """
    self._has_started_execution = True

    # Always use the saved context for execution.
    ctx = self._context

    if self.has_computed_output():
        # Cached result: return it directly; no executor is involved, hence
        # the None third element.
        bundle = self.execute()
        return iter([bundle]), self._snapshot_stats, None

    from ray.data._internal.execution.legacy_compat import (
        execute_to_legacy_bundle_iterator,
    )
    from ray.data._internal.execution.streaming_executor import StreamingExecutor

    metrics_tag = create_dataset_tag(self._dataset_name, self._dataset_uuid)
    executor = StreamingExecutor(ctx, metrics_tag)
    bundle_iter = execute_to_legacy_bundle_iterator(executor, self)
    # Since the generator doesn't run any code until we try to fetch the first
    # value, force execution of one bundle before we call get_stats().
    gen = iter(bundle_iter)
    try:
        # Re-prepend the eagerly-fetched bundle so callers still see it.
        bundle_iter = itertools.chain([next(gen)], gen)
    except StopIteration:
        pass
    self._snapshot_stats = executor.get_stats()
    return bundle_iter, self._snapshot_stats, executor
434
+
435
@omit_traceback_stdout
def execute(
    self,
    preserve_order: bool = False,
) -> RefBundle:
    """Execute this plan.

    Args:
        preserve_order: Whether to preserve order in execution.

    Returns:
        The blocks of the output dataset.
    """
    self._has_started_execution = True

    # Always used the saved context for execution.
    context = self._context

    # Warn (once per process) when the cluster has no free CPUs, since
    # execution would otherwise hang without any indication of why.
    if not ray.available_resources().get("CPU"):
        if log_once("cpu_warning"):
            logger.warning(
                "Warning: The Ray cluster currently does not have "
                "any available CPUs. The Dataset job will hang unless more CPUs "
                "are freed up. A common reason is that cluster resources are "
                "used by Actors or Tune trials; see the following link "
                "for more details: "
                "https://docs.ray.io/en/latest/data/data-internals.html#ray-data-and-tune"  # noqa: E501
            )
    if not self.has_computed_output():
        from ray.data._internal.execution.legacy_compat import (
            _get_initial_stats_from_plan,
            execute_to_legacy_block_list,
        )

        if self._logical_plan.dag.output_data() is not None:
            # If the data is already materialized (e.g., `from_pandas`), we can
            # skip execution and directly return the output data. This avoids
            # recording unnecessary metrics for an empty plan execution.
            stats = _get_initial_stats_from_plan(self)

            # TODO(@bveeramani): Make `ExecutionPlan.execute()` return
            # `List[RefBundle]` instead of `RefBundle`. Among other reasons, it'd
            # allow us to remove the unwrapping logic below.
            output_bundles = self._logical_plan.dag.output_data()
            # The flattened bundle only owns its blocks if every source
            # bundle owned its own.
            owns_blocks = all(bundle.owns_blocks for bundle in output_bundles)
            bundle = RefBundle(
                [
                    (block, metadata)
                    for bundle in output_bundles
                    for block, metadata in bundle.blocks
                ],
                owns_blocks=owns_blocks,
            )
        else:
            from ray.data._internal.execution.streaming_executor import (
                StreamingExecutor,
            )

            metrics_tag = create_dataset_tag(self._dataset_name, self._dataset_uuid)
            executor = StreamingExecutor(
                context,
                metrics_tag,
            )
            blocks = execute_to_legacy_block_list(
                executor,
                self,
                dataset_uuid=self._dataset_uuid,
                preserve_order=preserve_order,
            )
            bundle = RefBundle(
                tuple(blocks.iter_blocks_with_metadata()),
                owns_blocks=blocks._owned_by_consumer,
            )
            stats = executor.get_stats()
            stats_summary_string = stats.to_summary().to_string(
                include_parent=False
            )
            if context.enable_auto_log_stats:
                logger.info(stats_summary_string)

        # Retrieve memory-related stats from ray.
        # Best-effort: stats collection must never fail the execution itself.
        try:
            reply = get_memory_info_reply(
                get_state_from_address(ray.get_runtime_context().gcs_address)
            )
            if reply.store_stats.spill_time_total_s > 0:
                stats.global_bytes_spilled = int(
                    reply.store_stats.spilled_bytes_total
                )
            if reply.store_stats.restore_time_total_s > 0:
                stats.global_bytes_restored = int(
                    reply.store_stats.restored_bytes_total
                )
        except Exception as e:
            logger.debug(
                "Skipping recording memory spilled and restored statistics due to "
                f"exception: {e}"
            )

        stats.dataset_bytes_spilled = 0

        # Sum `obj_store_mem_spilled` over this stats object and all of its
        # ancestors (recursive walk up the `parents` chain).
        def collect_stats(cur_stats):
            stats.dataset_bytes_spilled += cur_stats.extra_metrics.get(
                "obj_store_mem_spilled", 0
            )
            for parent in cur_stats.parents:
                collect_stats(parent)

        collect_stats(stats)

        # Set the snapshot to the output of the final operator.
        self._snapshot_bundle = bundle
        self._snapshot_operator = self._logical_plan.dag
        self._snapshot_stats = stats
        self._snapshot_stats.dataset_uuid = self._dataset_uuid

    return self._snapshot_bundle
552
+
553
@property
def has_started_execution(self) -> bool:
    """``True`` once this plan has been partially or fully executed."""
    return self._has_started_execution
557
+
558
def clear_snapshot(self) -> None:
    """Reset the cached snapshot (stats, operator, and bundle) to its
    initial empty state."""
    self._snapshot_stats = None
    self._snapshot_operator = None
    self._snapshot_bundle = None
563
+
564
def stats(self) -> DatasetStats:
    """Return stats for this plan.

    If the plan isn't executed, an empty stats object will be returned.
    """
    # Fall back to a fresh, empty stats object when no snapshot exists.
    return self._snapshot_stats or DatasetStats(metadata={}, parent=None)
572
+
573
def has_lazy_input(self) -> bool:
    """Return whether this plan has lazy input blocks."""
    for source_op in self._logical_plan.sources():
        if not isinstance(source_op, Read):
            return False
    return True
576
+
577
def is_read_only(self, root_op: Optional[LogicalOperator] = None) -> bool:
    """Return whether the LogicalPlan corresponding to `root_op`
    contains only a Read op. By default, the last operator of
    the LogicalPlan is used."""
    op = root_op if root_op is not None else self._logical_plan.dag
    return isinstance(op, Read) and len(op.input_dependencies) == 0
584
+
585
def has_computed_output(self) -> bool:
    """Whether this plan has a computed snapshot for the final operator, i.e. for
    the output of this plan.
    """
    if self._snapshot_bundle is None:
        return False
    # The snapshot is only valid if it was taken at the plan's final operator.
    return self._snapshot_operator == self._logical_plan.dag
593
+
594
def require_preserve_order(self) -> bool:
    """Whether this plan requires to preserve order."""
    from ray.data._internal.logical.operators.all_to_all_operator import Sort
    from ray.data._internal.logical.operators.n_ary_operator import Zip

    # Zip and Sort are the only operators whose output ordering matters.
    return any(
        isinstance(op, (Zip, Sort))
        for op in self._logical_plan.dag.post_order_iter()
    )
.venv/lib/python3.11/site-packages/ray/data/_internal/progress_bar.py ADDED
@@ -0,0 +1,217 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import threading
3
+ from typing import Any, List, Optional
4
+
5
+ import ray
6
+ from ray.experimental import tqdm_ray
7
+ from ray.types import ObjectRef
8
+ from ray.util.debug import log_once
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
try:
    import tqdm

    # tqdm is available; no warning needed.
    needs_warning = False
except ImportError:
    # Soft dependency: disable bars and warn once on first use.
    tqdm = None
    needs_warning = True

# Used as a signal to cancel execution: threads registered here stop waiting.
_canceled_threads = set()
_canceled_threads_lock = threading.Lock()
23
+
24
+
25
def extract_num_rows(result: Any) -> int:
    """Extract the number of rows from a result object.

    Args:
        result: The result object from which to extract the number of rows.

    Returns:
        The number of rows, defaulting to 1 if it cannot be determined.
    """
    if hasattr(result, "num_rows"):
        return result.num_rows
    if hasattr(result, "__len__"):
        # e.g. DataFrame outputs such as those produced by sort_sample.
        return len(result)
    # Opaque result: count it as a single row.
    return 1
41
+
42
+
43
class ProgressBar:
    """Thin wrapper around tqdm to handle soft imports.

    If `total` is `None` (for example, because no tasks have finished
    yet), doesn't display the full progress bar. Still displays basic
    progress stats from tqdm."""

    # If the name/description of the progress bar exceeds this length,
    # it will be truncated.
    MAX_NAME_LENGTH = 100

    def __init__(
        self,
        name: str,
        total: Optional[int],
        unit: str,
        position: int = 0,
        enabled: Optional[bool] = None,
    ):
        """Create a progress bar.

        Args:
            name: Description shown next to the bar (possibly truncated).
            total: Expected total progress, or ``None`` if unknown.
            unit: Unit label for the progress counter.
            position: tqdm position for stacked bars.
            enabled: Whether to display the bar; defaults to
                ``DataContext.enable_progress_bars``.
        """
        self._desc = self._truncate_name(name)
        self._progress = 0
        # Prepend a space to the unit for better formatting. Use startswith so
        # an empty unit string doesn't raise IndexError (the previous
        # `unit[0]` indexing crashed on "").
        if not unit.startswith(" "):
            unit = " " + unit

        if enabled is None:
            from ray.data import DataContext

            enabled = DataContext.get_current().enable_progress_bars
        if not enabled:
            self._bar = None
        elif tqdm:
            ctx = ray.data.context.DataContext.get_current()
            if ctx.use_ray_tqdm:
                # Distributed-friendly tqdm variant.
                self._bar = tqdm_ray.tqdm(total=total, unit=unit, position=position)
            else:
                self._bar = tqdm.tqdm(
                    total=total or 0,
                    position=position,
                    dynamic_ncols=True,
                    unit=unit,
                    unit_scale=True,
                )
            self._bar.set_description(self._desc)
        else:
            global needs_warning
            if needs_warning:
                print("[dataset]: Run `pip install tqdm` to enable progress reporting.")
                needs_warning = False
            self._bar = None

    def _truncate_name(self, name: str) -> str:
        """Truncate an operator-chain name to ~``MAX_NAME_LENGTH`` characters.

        Keeps the first and last operator names and replaces the middle with
        "..." once the limit would be exceeded.
        """
        ctx = ray.data.context.DataContext.get_current()
        if (
            not ctx.enable_progress_bar_name_truncation
            or len(name) <= self.MAX_NAME_LENGTH
        ):
            return name

        op_names = name.split("->")
        if len(op_names) == 1:
            return op_names[0]

        # Include as many operators as possible without approximately
        # exceeding `MAX_NAME_LENGTH`. Always include the first and
        # last operator names so it is easy to identify the DAG.
        truncated_op_names = [op_names[0]]
        for op_name in op_names[1:-1]:
            if (
                len("->".join(truncated_op_names))
                + len("->")
                + len(op_name)
                + len("->")
                + len(op_names[-1])
            ) > self.MAX_NAME_LENGTH:
                truncated_op_names.append("...")
                if log_once("ray_data_truncate_operator_name"):
                    # NOTE(review): the attribute named in this message differs
                    # from `enable_progress_bar_name_truncation` checked above;
                    # the user-facing text is preserved as-is.
                    logger.warning(
                        f"Truncating long operator name to {self.MAX_NAME_LENGTH} "
                        "characters. To disable this behavior, set "
                        "`ray.data.DataContext.get_current()."
                        "DEFAULT_ENABLE_PROGRESS_BAR_NAME_TRUNCATION = False`."
                    )
                break
            truncated_op_names.append(op_name)
        truncated_op_names.append(op_names[-1])
        return "->".join(truncated_op_names)

    def block_until_complete(self, remaining: List[ObjectRef]) -> None:
        """Wait for all refs to finish, updating progress as they complete.

        Exits early if this thread has been registered in
        ``_canceled_threads``.
        """
        t = threading.current_thread()
        while remaining:
            done, remaining = ray.wait(
                remaining, num_returns=len(remaining), fetch_local=False, timeout=0.1
            )
            total_rows_processed = 0
            for _, result in zip(done, ray.get(done)):
                num_rows = extract_num_rows(result)
                total_rows_processed += num_rows
            self.update(total_rows_processed)

            with _canceled_threads_lock:
                if t in _canceled_threads:
                    break

    def fetch_until_complete(self, refs: List[ObjectRef]) -> List[Any]:
        """Fetch all refs, updating progress; return results in input order."""
        ref_to_result = {}
        remaining = refs
        t = threading.current_thread()
        # Triggering fetch_local redundantly for the same object is slower.
        # We only need to trigger the fetch_local once for each object,
        # raylet will persist these fetch requests even after ray.wait returns.
        # See https://github.com/ray-project/ray/issues/30375.
        fetch_local = True
        while remaining:
            done, remaining = ray.wait(
                remaining,
                num_returns=len(remaining),
                fetch_local=fetch_local,
                timeout=0.1,
            )
            if fetch_local:
                fetch_local = False
            total_rows_processed = 0
            for ref, result in zip(done, ray.get(done)):
                ref_to_result[ref] = result
                num_rows = extract_num_rows(result)
                total_rows_processed += num_rows
            self.update(total_rows_processed)

            with _canceled_threads_lock:
                if t in _canceled_threads:
                    break

        return [ref_to_result[ref] for ref in refs]

    def set_description(self, name: str) -> None:
        """Update the bar's description (truncated) if it changed."""
        name = self._truncate_name(name)
        if self._bar and name != self._desc:
            self._desc = name
            self._bar.set_description(self._desc)

    def get_description(self) -> str:
        """Return the current (possibly truncated) description."""
        return self._desc

    def refresh(self):
        """Force a redraw of the underlying bar, if any."""
        if self._bar:
            self._bar.refresh()

    def update(self, i: int = 0, total: Optional[int] = None) -> None:
        """Advance progress by ``i`` and optionally reset the total."""
        if self._bar and (i != 0 or self._bar.total != total):
            self._progress += i
            if total is not None:
                self._bar.total = total
            if self._bar.total is not None and self._progress > self._bar.total:
                # If the progress goes over 100%, update the total.
                self._bar.total = self._progress
            self._bar.update(i)

    def close(self):
        """Finalize and release the underlying bar (idempotent)."""
        if self._bar:
            if self._bar.total is not None and self._progress != self._bar.total:
                # If the progress is not complete, update the total.
                self._bar.total = self._progress
                self._bar.refresh()
            self._bar.close()
            self._bar = None

    def __del__(self):
        self.close()

    def __getstate__(self):
        # Serialize nothing: bars are process-local.
        return {}

    def __setstate__(self, state):
        self._bar = None  # Progress bar is disabled on remote nodes.
.venv/lib/python3.11/site-packages/ray/data/_internal/remote_fn.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Any, Dict, Hashable, List
2
+
3
+ import ray
4
+
5
+ CACHED_FUNCTIONS = {}
6
+
7
+
8
def cached_remote_fn(fn: Any, **ray_remote_args) -> Any:
    """Lazily defines a ray.remote function.

    This is used in Datasets to avoid circular import issues with ray.remote.
    (ray imports ray.data in order to allow ``ray.data.read_foo()`` to work,
    which means ray.remote cannot be used top-level in ray.data).

    NOTE: Dynamic arguments should not be passed in directly,
    and should be set with ``options`` instead:
    ``cached_remote_fn(fn, **static_args).options(**dynamic_args)``.
    """

    # Cache key: the function plus a deterministic hash of the remote args.
    # `_make_hashable` sorts dict items so equal arg dicts hash identically,
    # guaranteeing we cache the complete instantiation of the remote method.
    key = (fn, hash(_make_hashable(ray_remote_args)))

    if key not in CACHED_FUNCTIONS:
        resolved_args = {
            # Use the default scheduling strategy for all tasks so that we will
            # not inherit a placement group from the caller, if there is one.
            # The caller of this function may override the scheduling strategy
            # as needed.
            "scheduling_strategy": "DEFAULT",
            "max_retries": -1,
        }
        resolved_args.update(ray_remote_args)
        _add_system_error_to_retry_exceptions(resolved_args)

        CACHED_FUNCTIONS[key] = ray.remote(**resolved_args)(fn)

    return CACHED_FUNCTIONS[key]
45
+
46
+
47
+ def _make_hashable(obj):
48
+ if isinstance(obj, (List, tuple)):
49
+ return tuple([_make_hashable(o) for o in obj])
50
+ elif isinstance(obj, Dict):
51
+ converted = [(_make_hashable(k), _make_hashable(v)) for k, v in obj.items()]
52
+ return tuple(sorted(converted, key=lambda t: t[0]))
53
+ elif isinstance(obj, Hashable):
54
+ return obj
55
+ else:
56
+ raise ValueError(f"Type {type(obj)} is not hashable")
57
+
58
+
59
def _add_system_error_to_retry_exceptions(ray_remote_args) -> None:
    """Modify the remote args so that Ray retries `RaySystemError`s.

    Ray typically automatically retries system errors. However, in some cases, Ray won't
    retry system errors if they're raised from task code. To ensure that Ray Data is
    fault tolerant to those errors, we need to add `RaySystemError` to the
    `retry_exceptions` list.

    TODO: Fix this in Ray Core. See https://github.com/ray-project/ray/pull/45079.
    """
    retry_exceptions = ray_remote_args.get("retry_exceptions", False)
    assert isinstance(retry_exceptions, (list, bool))

    if isinstance(retry_exceptions, bool):
        # False means "no retries"; upgrade to retrying system errors only.
        # True ("retry everything") already covers RaySystemError.
        if not retry_exceptions:
            retry_exceptions = [ray.exceptions.RaySystemError]
    elif ray.exceptions.RaySystemError not in retry_exceptions:
        # Extend the caller's explicit exception list in place.
        retry_exceptions.append(ray.exceptions.RaySystemError)

    ray_remote_args["retry_exceptions"] = retry_exceptions
.venv/lib/python3.11/site-packages/ray/data/_internal/row.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from collections.abc import Mapping
2
+ from typing import Any
3
+
4
+
5
class TableRow(Mapping):
    """
    A dict-like row of a tabular ``Dataset``.

    This implements the dictionary mapping interface, but provides more
    efficient access with less data copying than converting Arrow Tables
    or Pandas DataFrames into per-row dicts. This class must be subclassed,
    with subclasses implementing ``__getitem__``, ``__iter__``, and ``__len__``.

    Concrete subclasses include ``ray.data._internal.arrow_block.ArrowRow`` and
    ``ray.data._internal.pandas_block.PandasRow``.
    """

    def __init__(self, row: Any):
        """
        Construct a ``TableRow`` (internal API).

        Args:
            row: The tabular row that backs this row mapping.
        """
        self._row = row

    def as_pydict(self) -> dict:
        """
        Convert to a normal Python dict. This will create a new copy of the row."""
        return {key: value for key, value in self.items()}

    def __repr__(self):
        # Delegate to __str__ so both render identically.
        return str(self)

    def __str__(self):
        return str(self.as_pydict())

    def _repr_pretty_(self, p, cycle):
        # IPython rich display: render like a plain dict.
        from IPython.lib.pretty import _dict_pprinter_factory

        pretty_print = _dict_pprinter_factory("{", "}")
        return pretty_print(self, p, cycle)
.venv/lib/python3.11/site-packages/ray/data/_internal/size_estimator.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Any, List
2
+
3
+ import ray
4
+ from ray import cloudpickle
5
+
6
+ _ray_initialized = False
7
+
8
+
9
class SizeEstimator:
    """Efficiently estimates the Ray serialized size of a stream of items.

    For efficiency, this only samples a fraction of the added items for real
    Ray-serialization.
    """

    def __init__(self):
        # Weighted running mean of the sampled per-item serialized sizes.
        self._running_mean = RunningMean()
        # Total number of items observed (sampled or not).
        self._count = 0

    def add(self, item: Any) -> None:
        """Record one item, serializing it only at a decaying sample rate.

        The first 10 items are all measured (weight 1); through item 100
        every 10th is measured (weight 10); afterwards every 100th
        (weight 100).
        """
        self._count += 1
        if self._count <= 10:
            self._running_mean.add(self._real_size(item), weight=1)
        elif self._count <= 100:
            if self._count % 10 == 0:
                self._running_mean.add(self._real_size(item), weight=10)
        elif self._count % 100 == 0:
            self._running_mean.add(self._real_size(item), weight=100)

    def add_block(self, block: List[Any]) -> None:
        """Record a whole list of items at once.

        Mirrors the sampling schedule of ``add`` but picks the sampled
        indices arithmetically rather than looping item by item.
        """
        if self._count < 10:
            # Fill out the first 10 fully-sampled items.
            for i in range(min(10 - self._count, len(block))):
                self._running_mean.add(self._real_size(block[i]), weight=1)
        if self._count < 100:
            # Sample every 10th item up to the 100th overall.
            for i in range(
                10 - (self._count % 10), min(100 - self._count, len(block)), 10
            ):
                self._running_mean.add(self._real_size(block[i]), weight=10)
        if (len(block) + (self._count % 100)) // 100 > 1:
            # Sample every 100th item beyond the 100th overall.
            for i in range(100 - (self._count % 100), len(block), 100):
                self._running_mean.add(self._real_size(block[i]), weight=100)
        self._count += len(block)

    def size_bytes(self) -> int:
        """Return the estimated total serialized size of all items seen."""
        return int(self._running_mean.mean * self._count)

    def _real_size(self, item: Any) -> int:
        """Return the actual Ray-serialized size of ``item`` in bytes."""
        is_client = ray.util.client.ray.is_connected()
        # In client mode, fallback to using Ray cloudpickle instead of the
        # real serializer.
        if is_client:
            return len(cloudpickle.dumps(item))

        # We're using an internal Ray API, and have to ensure it's
        # initialized # by calling a public API.
        global _ray_initialized
        if not _ray_initialized:
            _ray_initialized = True
            ray.put(None)
        return (
            ray._private.worker.global_worker.get_serialization_context()
            .serialize(item)
            .total_bytes
        )
65
+
66
+
67
+ # Adapted from the RLlib MeanStdFilter.
68
class RunningMean:
    """Incrementally maintained weighted mean.

    Adapted from the RLlib MeanStdFilter.
    """

    def __init__(self):
        self._weight = 0
        self._mean = 0

    def add(self, x: int, weight: int = 1) -> None:
        """Fold ``x`` (carrying the given weight) into the running mean."""
        if weight == 0:
            return
        total_weight = self._weight + weight
        # Weighted average of the previous mean and the new sample.
        updated_mean = (self._weight * self._mean + weight * x) / total_weight
        self._weight = total_weight
        self._mean = updated_mean

    @property
    def n(self) -> int:
        """Total weight accumulated so far."""
        return self._weight

    @property
    def mean(self) -> float:
        """Current weighted mean."""
        return self._mean

    def __repr__(self):
        return "(n={}, mean={})".format(self.n, self.mean)
.venv/lib/python3.11/site-packages/ray/data/_internal/split.py ADDED
@@ -0,0 +1,297 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import itertools
2
+ import logging
3
+ from typing import Iterable, List, Tuple, Union
4
+
5
+ import ray
6
+ from ray.data._internal.memory_tracing import trace_deallocation
7
+ from ray.data._internal.remote_fn import cached_remote_fn
8
+ from ray.data.block import (
9
+ Block,
10
+ BlockAccessor,
11
+ BlockExecStats,
12
+ BlockMetadata,
13
+ BlockPartition,
14
+ )
15
+ from ray.types import ObjectRef
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+
20
def _calculate_blocks_rows(
    blocks_with_metadata: BlockPartition,
) -> List[int]:
    """Calculate the number of rows for a list of blocks with metadata."""
    get_num_rows = cached_remote_fn(_get_num_rows)
    row_counts = []
    for block_ref, metadata in blocks_with_metadata:
        if metadata.num_rows is None:
            # Row count unknown: fetch it remotely and cache it on the
            # metadata so later passes don't refetch.
            metadata.num_rows = ray.get(get_num_rows.remote(block_ref))
        row_counts.append(metadata.num_rows)
    return row_counts
35
+
36
+
37
+ def _generate_valid_indices(
38
+ num_rows_per_block: List[int],
39
+ split_indices: List[int],
40
+ ) -> List[int]:
41
+ """Generate valid split indices by apply min(index, total_num_rows)
42
+ to every index."""
43
+ total_rows = sum(num_rows_per_block)
44
+ return [min(index, total_rows) for index in split_indices]
45
+
46
+
47
+ def _generate_per_block_split_indices(
48
+ num_rows_per_block: List[int],
49
+ split_indices: List[int],
50
+ ) -> List[List[int]]:
51
+ """Given num rows per block and valid split indices, generate per block split indices.
52
+
53
+ Args:
54
+ num_rows_per_block: num of rows per block.
55
+ split_indices: The (global) indices at which to split the blocks.
56
+ Returns:
57
+ Per block split indices indicates each input block's split point(s).
58
+ """
59
+ # for each split index, we iterate though the currnet input block
60
+ # to see if the index falls into this block. if the index
61
+ # falls into this block, we push it back to the current block's
62
+ # split indices. Otherwise, we move on to the next block.
63
+ per_block_split_indices = []
64
+ current_input_block_id = 0
65
+ current_block_split_indices = []
66
+ current_block_global_offset = 0
67
+ current_index_id = 0
68
+
69
+ while current_index_id < len(split_indices):
70
+ split_index = split_indices[current_index_id]
71
+ current_block_row = num_rows_per_block[current_input_block_id]
72
+ if split_index - current_block_global_offset <= current_block_row:
73
+ current_block_split_indices.append(
74
+ split_index - current_block_global_offset
75
+ )
76
+ current_index_id += 1
77
+ continue
78
+ per_block_split_indices.append(current_block_split_indices)
79
+ current_block_split_indices = []
80
+ current_block_global_offset += num_rows_per_block[current_input_block_id]
81
+ current_input_block_id += 1
82
+
83
+ # we might finished all the indices but there are still blocks left, also
84
+ # current_block_split_indices might not be added yet.
85
+ while len(per_block_split_indices) < len(num_rows_per_block):
86
+ per_block_split_indices.append(current_block_split_indices)
87
+ current_block_split_indices = []
88
+ return per_block_split_indices
89
+
90
+
91
def _split_single_block(
    block_id: int,
    block: Block,
    meta: BlockMetadata,
    split_indices: List[int],
) -> Tuple[Union[Tuple[int, List[BlockMetadata]], Block], ...]:
    """Split the provided block at the given indices.

    Args:
        block_id: the id of this block in the block list.
        block: block to be split.
        meta: metadata of the block; ``meta.num_rows`` must be valid.
        split_indices: the indices where the block should be split.
    Returns:
        returns block_id, split blocks metadata, and a list of blocks
        in the following form. We return blocks in this way
        so that the owner of blocks could be the caller(driver)
        instead of worker itself.
        Tuple(block_id, split_blocks_meta), block0, block1 ...
    """
    split_meta = []
    split_blocks = []
    block_accessor = BlockAccessor.for_block(block)
    prev_index = 0
    # Add the block's row count as a final boundary so the tail slice needs
    # no special case. Build a new list rather than appending to the
    # caller-provided `split_indices` (the original mutated its argument).
    boundaries = list(split_indices) + [meta.num_rows]
    for index in boundaries:
        logger.debug(f"slicing block {prev_index}:{index}")
        stats = BlockExecStats.builder()
        split_block = block_accessor.slice(prev_index, index)
        accessor = BlockAccessor.for_block(split_block)
        _meta = BlockMetadata(
            num_rows=accessor.num_rows(),
            size_bytes=accessor.size_bytes(),
            schema=meta.schema,
            input_files=meta.input_files,
            exec_stats=stats.build(),
        )
        split_meta.append(_meta)
        split_blocks.append(split_block)
        prev_index = index
    # First element carries the id + metadata; the split blocks follow so
    # the caller (driver) owns the returned block objects.
    results = [(block_id, split_meta)]
    results.extend(split_blocks)
    return tuple(results)
136
+
137
+
138
+ def _drop_empty_block_split(block_split_indices: List[int], num_rows: int) -> List[int]:
139
+ """drop split indices that creates empty block split. This could happen when there
140
+ are duplicated indices, or index equal to 0 (start of the block) or num_block_rows
141
+ (end of the block).
142
+ """
143
+ prev_index = -1
144
+ optimized_indices = []
145
+ for index in block_split_indices:
146
+ if index == 0 or index == num_rows:
147
+ continue
148
+ if index == prev_index:
149
+ continue
150
+ optimized_indices.append(index)
151
+ prev_index = index
152
+ return optimized_indices
153
+
154
+
155
def _split_all_blocks(
    blocks_with_metadata: List[Tuple[ObjectRef[Block], BlockMetadata]],
    per_block_split_indices: List[List[int]],
    owned_by_consumer: bool,
) -> Iterable[Tuple[ObjectRef[Block], BlockMetadata]]:
    """Split all the input blocks based on the split indices.

    Blocks with no effective split point are passed through untouched;
    the rest are split by remote `_split_single_block` tasks.
    """
    split_single_block = cached_remote_fn(_split_single_block)

    # One result slot per input block, filled with either the original
    # (block_ref, meta) pair or the remote split results.
    all_blocks_split_results: List[BlockPartition] = [None] * len(blocks_with_metadata)

    per_block_split_metadata_futures = []
    per_block_split_block_refs = []

    # tracking splitted blocks for gc.
    blocks_splitted = []
    for block_id, block_split_indices in enumerate(per_block_split_indices):
        (block_ref, meta) = blocks_with_metadata[block_id]
        block_row = meta.num_rows
        block_split_indices = _drop_empty_block_split(block_split_indices, block_row)
        if len(block_split_indices) == 0:
            # optimization: if no split is needed, we just need to add it to the
            # result
            all_blocks_split_results[block_id] = [(block_ref, meta)]
        else:
            # otherwise call split remote function.
            # num_returns: 1 metadata tuple + (len(indices) + 1) split blocks.
            object_refs = split_single_block.options(
                scheduling_strategy="SPREAD", num_returns=2 + len(block_split_indices)
            ).remote(
                block_id,
                block_ref,
                meta,
                block_split_indices,
            )
            per_block_split_metadata_futures.append(object_refs[0])
            per_block_split_block_refs.append(object_refs[1:])

            blocks_splitted.append(block_ref)

    if per_block_split_metadata_futures:
        # only get metadata.
        per_block_split_metadata = ray.get(per_block_split_metadata_futures)
        for (block_id, meta), block_refs in zip(
            per_block_split_metadata, per_block_split_block_refs
        ):
            assert len(meta) == len(block_refs)
            # NOTE: a lazy zip iterator is stored here; it is consumed exactly
            # once by the chain() at the end of this function.
            all_blocks_split_results[block_id] = zip(block_refs, meta)

    # We make a copy for the blocks that have been splitted, so the input blocks
    # can be cleared if they are owned by consumer (consumer-owned blocks will
    # only be consumed by the owner).
    if owned_by_consumer:
        for b in blocks_splitted:
            trace_deallocation(b, "split._split_all_blocks")
    else:
        for b in blocks_splitted:
            trace_deallocation(b, "split._split_all_blocks", free=False)

    return itertools.chain.from_iterable(all_blocks_split_results)
213
+
214
+
215
+ def _generate_global_split_results(
216
+ all_blocks_split_results: Iterable[Tuple[ObjectRef[Block], BlockMetadata]],
217
+ global_split_sizes: List[int],
218
+ ) -> Tuple[List[List[ObjectRef[Block]]], List[List[BlockMetadata]]]:
219
+ """Reassemble per block's split result into final split result."""
220
+ result_blocks = []
221
+ result_metas = []
222
+
223
+ current_blocks = []
224
+ current_meta = []
225
+ current_split_size = 0
226
+ current_split_id = 0
227
+
228
+ while current_split_id < len(global_split_sizes):
229
+ if current_split_size >= global_split_sizes[current_split_id]:
230
+ assert current_split_size == global_split_sizes[current_split_id]
231
+ result_blocks.append(current_blocks)
232
+ result_metas.append(current_meta)
233
+
234
+ current_blocks = []
235
+ current_meta = []
236
+ current_split_size = 0
237
+ current_split_id += 1
238
+ else:
239
+ (block_ref, meta) = next(all_blocks_split_results)
240
+ current_blocks.append(block_ref)
241
+ current_meta.append(meta)
242
+ current_split_size += meta.num_rows
243
+
244
+ return result_blocks, result_metas
245
+
246
+
247
def _split_at_indices(
    blocks_with_metadata: List[Tuple[ObjectRef[Block], BlockMetadata]],
    indices: List[int],
    owned_by_consumer: bool = True,
    block_rows: List[int] = None,
) -> Tuple[List[List[ObjectRef[Block]]], List[List[BlockMetadata]]]:
    """Split blocks at the provided indices.

    Args:
        blocks_with_metadata: Block futures to split, including the associated metadata.
        indices: The (global) indices at which to split the blocks.
        owned_by_consumer: Whether the provided blocks are owned by the consumer.
        block_rows: The number of rows for each block, in case it has already been
            computed.

    Returns:
        The block split futures and their metadata. If an index split is empty, the
        corresponding block split will be empty .
    """

    # We implement the split in 3 phases.
    # phase 1: calculate the per block split indices.
    blocks_with_metadata = list(blocks_with_metadata)
    if len(blocks_with_metadata) == 0:
        # No input blocks: every one of the (len(indices) + 1) splits is empty.
        return ([[]] * (len(indices) + 1), [[]] * (len(indices) + 1))
    if block_rows is None:
        block_rows = _calculate_blocks_rows(blocks_with_metadata)
    # Clamp out-of-range indices to the total row count before assigning
    # them to individual blocks.
    valid_indices = _generate_valid_indices(block_rows, indices)
    per_block_split_indices: List[List[int]] = _generate_per_block_split_indices(
        block_rows, valid_indices
    )

    # phase 2: split each block based on the indices from previous step.
    all_blocks_split_results: Iterable[
        Tuple[ObjectRef[Block], BlockMetadata]
    ] = _split_all_blocks(
        blocks_with_metadata, per_block_split_indices, owned_by_consumer
    )

    # phase 3: generate the final split.

    # first calculate the size for each split.
    # Consecutive differences over [0, *valid_indices, total_rows] give the
    # row count of each output split.
    helper = [0] + valid_indices + [sum(block_rows)]
    split_sizes = [helper[i] - helper[i - 1] for i in range(1, len(helper))]

    return _generate_global_split_results(all_blocks_split_results, split_sizes)
293
+
294
+
295
def _get_num_rows(block: Block) -> int:
    """Get the number of rows contained in the provided block."""
    accessor = BlockAccessor.for_block(block)
    return accessor.num_rows()
.venv/lib/python3.11/site-packages/ray/data/_internal/stats.py ADDED
@@ -0,0 +1,1495 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import collections
2
+ import logging
3
+ import threading
4
+ import time
5
+ from contextlib import contextmanager
6
+ from dataclasses import dataclass
7
+ from typing import Any, Dict, List, Optional, Set, Tuple, Union
8
+ from uuid import uuid4
9
+
10
+ import numpy as np
11
+
12
+ import ray
13
+ from ray.actor import ActorHandle
14
+ from ray.data._internal.block_list import BlockList
15
+ from ray.data._internal.execution.interfaces.op_runtime_metrics import (
16
+ MetricsGroup,
17
+ OpRuntimeMetrics,
18
+ )
19
+ from ray.data._internal.util import capfirst
20
+ from ray.data.block import BlockMetadata
21
+ from ray.data.context import DataContext
22
+ from ray.util.annotations import DeveloperAPI
23
+ from ray.util.metrics import Gauge
24
+ from ray.util.scheduling_strategies import NodeAffinitySchedulingStrategy
25
+
26
+ logger = logging.getLogger(__name__)
27
+
28
+ STATS_ACTOR_NAME = "datasets_stats_actor"
29
+ STATS_ACTOR_NAMESPACE = "_dataset_stats_actor"
30
+
31
+
32
+ StatsDict = Dict[str, List[BlockMetadata]]
33
+
34
+
35
def fmt(seconds: float) -> str:
    """Format a duration in seconds as a human-readable string.

    Uses seconds above 1s, milliseconds above 1ms, and microseconds below.
    Values are rounded to two decimal places.
    """
    if seconds > 1:
        value, unit = seconds, "s"
    elif seconds > 0.001:
        value, unit = seconds * 1000, "ms"
    else:
        value, unit = seconds * 1000 * 1000, "us"
    return str(round(value, 2)) + unit
42
+
43
+
44
def leveled_indent(lvl: int = 0, spaces_per_indent: int = 3) -> str:
    """Return a string of spaces containing ``lvl`` indents.

    Each indent is ``spaces_per_indent`` spaces wide, so e.g.
    ``leveled_indent(2, 3)`` returns six spaces.
    """
    return " " * (spaces_per_indent * lvl)
51
+
52
+
53
class Timer:
    """Tracks accumulated time (in seconds) plus min/max/average samples."""

    def __init__(self):
        self._value: float = 0
        self._min: float = float("inf")
        self._max: float = 0
        self._total_count: float = 0

    @contextmanager
    def timer(self) -> None:
        """Context manager that records the elapsed wall-clock time on exit."""
        start = time.perf_counter()
        try:
            yield
        finally:
            self.add(time.perf_counter() - start)

    def add(self, value: float) -> None:
        """Record a single measurement, updating total, min, max and count."""
        self._value += value
        self._min = min(self._min, value)
        self._max = max(self._max, value)
        self._total_count += 1

    def get(self) -> float:
        """Return the accumulated total time."""
        return self._value

    def min(self) -> float:
        """Return the smallest recorded measurement (inf if none)."""
        return self._min

    def max(self) -> float:
        """Return the largest recorded measurement (0 if none)."""
        return self._max

    def avg(self) -> float:
        """Return the average measurement, or inf if nothing was recorded."""
        return self._value / self._total_count if self._total_count else float("inf")
89
+
90
+
91
class _DatasetStatsBuilder:
    """Helper class for building dataset stats.

    The start time is recorded at construction; when ``build()`` (or
    ``build_multioperator()``) is called with the final blocks of the new
    dataset, the elapsed time is stored on the resulting stats object.
    """

    def __init__(
        self,
        operator_name: str,
        parent: "DatasetStats",
        override_start_time: Optional[float],
    ):
        self.operator_name = operator_name
        self.parent = parent
        self.start_time = override_start_time or time.perf_counter()

    def build_multioperator(self, metadata: StatsDict) -> "DatasetStats":
        """Build stats for a (possibly fused) multi-operator stage.

        With multiple metadata entries, the first entry is prefixed with the
        full operator name and later entries with only the last "->" segment;
        with a single entry the operator name is used unmodified.
        """
        last_segment = self.operator_name.split("->")[-1]
        op_metadata = {}
        for i, (k, v) in enumerate(metadata.items()):
            if len(metadata) == 1:
                op_metadata[self.operator_name] = v
            else:
                prefix = self.operator_name if i == 0 else last_segment
                op_metadata[prefix + capfirst(k)] = v
        stats = DatasetStats(
            metadata=op_metadata,
            parent=self.parent,
            base_name=self.operator_name,
        )
        stats.time_total_s = time.perf_counter() - self.start_time
        return stats

    def build(self, final_blocks: BlockList) -> "DatasetStats":
        """Build stats for a single-operator stage from its final blocks."""
        stats = DatasetStats(
            metadata={self.operator_name: final_blocks.get_metadata()},
            parent=self.parent,
        )
        stats.time_total_s = time.perf_counter() - self.start_time
        return stats
134
+
135
+
136
@ray.remote(num_cpus=0)
class _StatsActor:
    """Actor holding stats for blocks created by LazyBlockList.

    This actor is shared across all datasets created in the same cluster.
    In order to cap memory usage, we set a max number of stats to keep
    in the actor. When this limit is exceeded, the stats will be garbage
    collected in FIFO order.

    TODO(ekl) we should consider refactoring LazyBlockList so stats can be
    extracted without using an out-of-band actor."""

    def __init__(self, max_stats=1000):
        # Mapping from uuid -> (task_id -> list of blocks statistics).
        self.metadata = collections.defaultdict(dict)
        # uuid -> timestamp of the most recent record_task() call.
        self.last_time = {}
        # uuid -> timestamp of the record_start() call.
        self.start_time = {}
        # Max number of datasets whose stats are retained (FIFO eviction).
        self.max_stats = max_stats
        # Insertion-ordered uuids, used for FIFO eviction in record_start().
        self.fifo_queue = []

        # Assign dataset uuids with a global counter.
        self.next_dataset_id = 0
        # Dataset metadata to be queried directly by DashboardHead api.
        self.datasets: Dict[str, Any] = {}

        # Ray Data dashboard metrics
        # Everything is a gauge because we need to reset all of
        # a dataset's metrics to 0 after each finishes execution.
        op_tags_keys = ("dataset", "operator")

        # TODO(scottjlee): move these overview metrics as fields in a
        # separate dataclass, similar to OpRuntimeMetrics.
        self.spilled_bytes = Gauge(
            "data_spilled_bytes",
            description="""Bytes spilled by dataset operators.
            DataContext.enable_get_object_locations_for_metrics
            must be set to True to report this metric""",
            tag_keys=op_tags_keys,
        )
        # NOTE(review): allocated_bytes is created here but is not set in
        # update_execution_metrics below — presumably set elsewhere; confirm.
        self.allocated_bytes = Gauge(
            "data_allocated_bytes",
            description="Bytes allocated by dataset operators",
            tag_keys=op_tags_keys,
        )
        self.freed_bytes = Gauge(
            "data_freed_bytes",
            description="Bytes freed by dataset operators",
            tag_keys=op_tags_keys,
        )
        self.current_bytes = Gauge(
            "data_current_bytes",
            description="Bytes currently in memory store used by dataset operators",
            tag_keys=op_tags_keys,
        )
        self.cpu_usage_cores = Gauge(
            "data_cpu_usage_cores",
            description="CPUs allocated to dataset operators",
            tag_keys=op_tags_keys,
        )
        self.gpu_usage_cores = Gauge(
            "data_gpu_usage_cores",
            description="GPUs allocated to dataset operators",
            tag_keys=op_tags_keys,
        )
        self.output_bytes = Gauge(
            "data_output_bytes",
            description="Bytes outputted by dataset operators",
            tag_keys=op_tags_keys,
        )
        self.output_rows = Gauge(
            "data_output_rows",
            description="Rows outputted by dataset operators",
            tag_keys=op_tags_keys,
        )

        # === Metrics from OpRuntimeMetrics ===
        # Each group below maps OpRuntimeMetrics field name -> Gauge.
        # Inputs-related metrics
        self.execution_metrics_inputs = (
            self._create_prometheus_metrics_for_execution_metrics(
                metrics_group=MetricsGroup.INPUTS,
                tag_keys=op_tags_keys,
            )
        )

        # Outputs-related metrics
        self.execution_metrics_outputs = (
            self._create_prometheus_metrics_for_execution_metrics(
                metrics_group=MetricsGroup.OUTPUTS,
                tag_keys=op_tags_keys,
            )
        )

        # Task-related metrics
        self.execution_metrics_tasks = (
            self._create_prometheus_metrics_for_execution_metrics(
                metrics_group=MetricsGroup.TASKS,
                tag_keys=op_tags_keys,
            )
        )

        # Object store memory-related metrics
        self.execution_metrics_obj_store_memory = (
            self._create_prometheus_metrics_for_execution_metrics(
                metrics_group=MetricsGroup.OBJECT_STORE_MEMORY,
                tag_keys=op_tags_keys,
            )
        )

        # Miscellaneous metrics
        self.execution_metrics_misc = (
            self._create_prometheus_metrics_for_execution_metrics(
                metrics_group=MetricsGroup.MISC,
                tag_keys=op_tags_keys,
            )
        )

        # Iteration metrics are tagged only by dataset (no operator).
        iter_tag_keys = ("dataset",)
        self.iter_total_blocked_s = Gauge(
            "data_iter_total_blocked_seconds",
            description="Seconds user thread is blocked by iter_batches()",
            tag_keys=iter_tag_keys,
        )
        self.iter_user_s = Gauge(
            "data_iter_user_seconds",
            description="Seconds spent in user code",
            tag_keys=iter_tag_keys,
        )
        self.iter_initialize_s = Gauge(
            "data_iter_initialize_seconds",
            description="Seconds spent in iterator initialization code",
            tag_keys=iter_tag_keys,
        )

    def _create_prometheus_metrics_for_execution_metrics(
        self, metrics_group: MetricsGroup, tag_keys: Tuple[str, ...]
    ) -> Dict[str, Gauge]:
        """Create one Gauge per OpRuntimeMetrics field in ``metrics_group``.

        Returns a mapping from the metric's field name to its Gauge; the
        exported metric name is the field name prefixed with "data_".
        """
        metrics = {}
        for metric in OpRuntimeMetrics.get_metrics():
            if not metric.metrics_group == metrics_group:
                continue
            metric_name = f"data_{metric.name}"
            metric_description = metric.description
            metrics[metric.name] = Gauge(
                metric_name,
                description=metric_description,
                tag_keys=tag_keys,
            )
        return metrics

    def record_start(self, stats_uuid):
        """Mark the start of stats collection for ``stats_uuid``.

        Also evicts the oldest tracked dataset's stats (FIFO order) once
        ``max_stats`` is exceeded.
        """
        self.start_time[stats_uuid] = time.perf_counter()
        self.fifo_queue.append(stats_uuid)
        # Purge the oldest stats if the limit is exceeded.
        if len(self.fifo_queue) > self.max_stats:
            uuid = self.fifo_queue.pop(0)
            if uuid in self.start_time:
                del self.start_time[uuid]
            if uuid in self.last_time:
                del self.last_time[uuid]
            if uuid in self.metadata:
                del self.metadata[uuid]

    def record_task(
        self, stats_uuid: str, task_idx: int, blocks_metadata: List[BlockMetadata]
    ):
        """Record per-task block metadata for a started stats collection.

        Silently ignored if record_start() was never called (or the uuid was
        already evicted).
        """
        # Null out the schema to keep the stats size small.
        # TODO(chengsu): ideally schema should be null out on caller side.
        for metadata in blocks_metadata:
            metadata.schema = None
        if stats_uuid in self.start_time:
            self.metadata[stats_uuid][task_idx] = blocks_metadata
            self.last_time[stats_uuid] = time.perf_counter()

    def get(self, stats_uuid):
        """Return (task_idx -> metadata, elapsed seconds) for ``stats_uuid``.

        Returns ({}, 0.0) if the uuid is unknown or already evicted.
        """
        if stats_uuid not in self.metadata:
            return {}, 0.0
        return (
            self.metadata[stats_uuid],
            self.last_time[stats_uuid] - self.start_time[stats_uuid],
        )

    def _get_stats_dict_size(self):
        """Return sizes of internal dicts (used for testing/debugging)."""
        return len(self.start_time), len(self.last_time), len(self.metadata)

    def get_dataset_id(self):
        """Return a new globally-unique (per actor) dataset id string."""
        dataset_id = str(self.next_dataset_id)
        self.next_dataset_id += 1
        return dataset_id

    def update_metrics(self, execution_metrics, iteration_metrics):
        """Apply batched execution and iteration metric updates."""
        for metrics in execution_metrics:
            self.update_execution_metrics(*metrics)
        for metrics in iteration_metrics:
            self.update_iteration_metrics(*metrics)

    def update_execution_metrics(
        self,
        dataset_tag: str,
        op_metrics: List[Dict[str, Union[int, float]]],
        operator_tags: List[str],
        state: Dict[str, Any],
    ):
        """Set per-operator gauges from operator metric dicts.

        ``op_metrics`` and ``operator_tags`` are zipped pairwise; missing
        metric keys default to 0.
        """
        for stats, operator_tag in zip(op_metrics, operator_tags):
            tags = self._create_tags(dataset_tag, operator_tag)

            self.spilled_bytes.set(stats.get("obj_store_mem_spilled", 0), tags)
            self.freed_bytes.set(stats.get("obj_store_mem_freed", 0), tags)
            self.current_bytes.set(stats.get("obj_store_mem_used", 0), tags)
            self.output_bytes.set(stats.get("bytes_task_outputs_generated", 0), tags)
            self.output_rows.set(stats.get("rows_task_outputs_generated", 0), tags)
            self.cpu_usage_cores.set(stats.get("cpu_usage", 0), tags)
            self.gpu_usage_cores.set(stats.get("gpu_usage", 0), tags)

            for field_name, prom_metric in self.execution_metrics_inputs.items():
                prom_metric.set(stats.get(field_name, 0), tags)

            for field_name, prom_metric in self.execution_metrics_outputs.items():
                prom_metric.set(stats.get(field_name, 0), tags)

            for field_name, prom_metric in self.execution_metrics_tasks.items():
                prom_metric.set(stats.get(field_name, 0), tags)

            for (
                field_name,
                prom_metric,
            ) in self.execution_metrics_obj_store_memory.items():
                prom_metric.set(stats.get(field_name, 0), tags)

            for field_name, prom_metric in self.execution_metrics_misc.items():
                prom_metric.set(stats.get(field_name, 0), tags)

        # This update is called from a dataset's executor,
        # so all tags should contain the same dataset
        self.update_dataset(dataset_tag, state)

    def update_iteration_metrics(
        self,
        stats: "DatasetStats",
        dataset_tag,
    ):
        """Set iteration gauges from a DatasetStats snapshot."""
        tags = self._create_tags(dataset_tag)
        self.iter_total_blocked_s.set(stats.iter_total_blocked_s.get(), tags)
        self.iter_user_s.set(stats.iter_user_s.get(), tags)
        self.iter_initialize_s.set(stats.iter_initialize_s.get(), tags)

    def register_dataset(self, job_id: str, dataset_tag: str, operator_tags: List[str]):
        """Register a dataset (and its operators) as RUNNING for the dashboard."""
        self.datasets[dataset_tag] = {
            "job_id": job_id,
            "state": "RUNNING",
            "progress": 0,
            "total": 0,
            "start_time": time.time(),
            "end_time": None,
            "operators": {
                operator: {
                    "state": "RUNNING",
                    "progress": 0,
                    "total": 0,
                }
                for operator in operator_tags
            },
        }

    def update_dataset(self, dataset_tag, state):
        """Merge ``state`` into the registered dataset's dashboard entry.

        NOTE(review): raises KeyError if the dataset was never registered —
        presumably register_dataset() always runs first; confirm.
        """
        self.datasets[dataset_tag].update(state)

    def get_datasets(self, job_id: Optional[str] = None):
        """Return dashboard entries, optionally filtered by job id."""
        if not job_id:
            return self.datasets
        return {k: v for k, v in self.datasets.items() if v["job_id"] == job_id}

    def _create_tags(self, dataset_tag: str, operator_tag: Optional[str] = None):
        """Build the gauge tag dict for a dataset (and optional operator)."""
        tags = {"dataset": dataset_tag}
        if operator_tag is not None:
            tags["operator"] = operator_tag
        return tags
412
+
413
+
414
# Creating/getting an actor from multiple threads is not safe.
# https://github.com/ray-project/ray/issues/41324
# Guards _get_or_create_stats_actor() below.
_stats_actor_lock: threading.RLock = threading.RLock()
417
+
418
+
419
def _get_or_create_stats_actor():
    """Return a handle to the shared, detached _StatsActor.

    Creates the actor on first use. Outside of Ray client mode the actor is
    pinned to the local node so that it fate-shares with the driver.
    """
    ctx = DataContext.get_current()
    if ray.util.client.ray.is_connected():
        scheduling_strategy = ctx.scheduling_strategy
    else:
        # Pin the stats actor to the local node
        # so it fate-shares with the driver.
        scheduling_strategy = NodeAffinitySchedulingStrategy(
            ray.get_runtime_context().get_node_id(),
            soft=False,
        )
    with _stats_actor_lock:
        return _StatsActor.options(
            name=STATS_ACTOR_NAME,
            namespace=STATS_ACTOR_NAMESPACE,
            get_if_exists=True,
            lifetime="detached",
            scheduling_strategy=scheduling_strategy,
        ).remote()
437
+
438
+
439
class _StatsManager:
    """A Class containing util functions that manage remote calls to _StatsActor.

    This class collects stats from execution and iteration codepaths and keeps
    track of the latest snapshot.

    An instance of this class runs a single background thread that periodically
    forwards the latest execution/iteration stats to the _StatsActor.

    This thread will terminate itself after being inactive (meaning that there are
    no active executors or iterators) for STATS_ACTOR_UPDATE_THREAD_INACTIVITY_LIMIT
    iterations. After terminating, a new thread will start if more calls are made
    to this class.
    """

    # Interval for making remote calls to the _StatsActor.
    STATS_ACTOR_UPDATE_INTERVAL_SECONDS = 5

    # After this many iterations of inactivity,
    # _StatsManager._update_thread will close itself.
    UPDATE_THREAD_INACTIVITY_LIMIT = 5

    def __init__(self):
        # Lazily get stats actor handle to avoid circular import.
        self._stats_actor_handle: Optional[ActorHandle] = None
        # Cluster id the cached handle belongs to; invalidates the handle
        # when a new cluster is started.
        self._stats_actor_cluster_id = None

        # Last execution stats snapshots for all executing datasets
        self._last_execution_stats = {}
        # Last iteration stats snapshots for all running iterators
        # NOTE(review): values stored by update_iteration_metrics() are
        # (DatasetStats, dataset_tag) tuples — the annotation below looks
        # out of date; confirm against callers.
        self._last_iteration_stats: Dict[
            str, Tuple[Dict[str, str], "DatasetStats"]
        ] = {}
        # Lock for updating stats snapshots
        self._stats_lock: threading.Lock = threading.Lock()

        # Background thread to make remote calls to _StatsActor
        self._update_thread: Optional[threading.Thread] = None
        self._update_thread_lock: threading.Lock = threading.Lock()

    def _stats_actor(self, create_if_not_exists=True) -> Optional[ActorHandle]:
        """Return the (possibly cached) _StatsActor handle.

        With create_if_not_exists=False, returns None when the actor does not
        exist instead of creating it. Raises RuntimeError if Ray's global
        node is not initialized.
        """
        if ray._private.worker._global_node is None:
            raise RuntimeError("Global node is not initialized.")
        current_cluster_id = ray._private.worker._global_node.cluster_id
        # Refresh the handle if we have none or the cluster changed.
        if (
            self._stats_actor_handle is None
            or self._stats_actor_cluster_id != current_cluster_id
        ):
            if create_if_not_exists:
                self._stats_actor_handle = _get_or_create_stats_actor()
            else:
                try:
                    self._stats_actor_handle = ray.get_actor(
                        name=STATS_ACTOR_NAME, namespace=STATS_ACTOR_NAMESPACE
                    )
                except ValueError:
                    # Actor does not exist; caller opted out of creation.
                    return None
            self._stats_actor_cluster_id = current_cluster_id
        return self._stats_actor_handle

    def _start_thread_if_not_running(self):
        # Start background update thread if not running.
        with self._update_thread_lock:
            if self._update_thread is None or not self._update_thread.is_alive():

                def _run_update_loop():
                    # Periodically push the latest snapshots to the actor;
                    # exits after UPDATE_THREAD_INACTIVITY_LIMIT idle loops
                    # or on any remote-call error.
                    iter_stats_inactivity = 0
                    while True:
                        if self._last_iteration_stats or self._last_execution_stats:
                            try:
                                # Do not create _StatsActor if it doesn't exist because
                                # this thread can be running even after the cluster is
                                # shutdown. Creating an actor will automatically start
                                # a new cluster.
                                stats_actor = self._stats_actor(
                                    create_if_not_exists=False
                                )
                                if stats_actor is None:
                                    # NOTE(review): `continue` skips the sleep
                                    # at the bottom of the loop, so this spins
                                    # without delay while the actor is absent
                                    # — confirm whether this is intentional.
                                    continue
                                stats_actor.update_metrics.remote(
                                    execution_metrics=list(
                                        self._last_execution_stats.values()
                                    ),
                                    iteration_metrics=list(
                                        self._last_iteration_stats.values()
                                    ),
                                )
                                iter_stats_inactivity = 0
                            except Exception:
                                logger.debug(
                                    "Error occurred during remote call to _StatsActor.",
                                    exc_info=True,
                                )
                                return
                        else:
                            iter_stats_inactivity += 1
                            if (
                                iter_stats_inactivity
                                >= _StatsManager.UPDATE_THREAD_INACTIVITY_LIMIT
                            ):
                                logger.debug(
                                    "Terminating StatsManager thread due to inactivity."
                                )
                                return
                        # NOTE: accesses the class attribute through the
                        # module-level `StatsManager` singleton instance.
                        time.sleep(StatsManager.STATS_ACTOR_UPDATE_INTERVAL_SECONDS)

                self._update_thread = threading.Thread(
                    target=_run_update_loop, daemon=True
                )
                self._update_thread.start()

    # Execution methods

    def update_execution_metrics(
        self,
        dataset_tag: str,
        op_metrics: List[OpRuntimeMetrics],
        operator_tags: List[str],
        state: Dict[str, Any],
        force_update: bool = False,
    ):
        """Snapshot execution metrics for a dataset.

        With force_update=True the metrics are pushed to the actor
        synchronously; otherwise they are cached for the background thread.
        """
        op_metrics_dicts = [metric.as_dict() for metric in op_metrics]
        args = (dataset_tag, op_metrics_dicts, operator_tags, state)
        if force_update:
            self._stats_actor().update_execution_metrics.remote(*args)
        else:
            with self._stats_lock:
                self._last_execution_stats[dataset_tag] = args
            self._start_thread_if_not_running()

    def clear_last_execution_stats(self, dataset_tag: str):
        # After dataset completes execution, remove cached execution stats.
        # Marks the dataset as finished on job page's Ray Data Overview.
        with self._stats_lock:
            if dataset_tag in self._last_execution_stats:
                del self._last_execution_stats[dataset_tag]

    # Iteration methods

    def update_iteration_metrics(self, stats: "DatasetStats", dataset_tag: str):
        """Snapshot iteration metrics; pushed later by the background thread."""
        with self._stats_lock:
            self._last_iteration_stats[dataset_tag] = (stats, dataset_tag)
        self._start_thread_if_not_running()

    def clear_iteration_metrics(self, dataset_tag: str):
        # Delete the last iteration stats so that update thread will have
        # a chance to terminate.
        # Note we don't reset the actual metric values through the StatsActor
        # since the value is essentially a counter value. See
        # https://github.com/ray-project/ray/pull/48618 for more context.
        with self._stats_lock:
            if dataset_tag in self._last_iteration_stats:
                del self._last_iteration_stats[dataset_tag]

    # Other methods

    def register_dataset_to_stats_actor(self, dataset_tag, operator_tags):
        """Register a dataset with the stats actor for dashboard display."""
        self._stats_actor().register_dataset.remote(
            ray.get_runtime_context().get_job_id(),
            dataset_tag,
            operator_tags,
        )

    def get_dataset_id_from_stats_actor(self) -> str:
        """Return a unique dataset id, falling back to uuid4 on failure."""
        try:
            return ray.get(self._stats_actor().get_dataset_id.remote())
        except Exception:
            # Getting dataset id from _StatsActor may fail, in this case
            # fall back to uuid4
            return uuid4().hex
609
+
610
+
611
+ StatsManager = _StatsManager()
612
+
613
+
614
class DatasetStats:
    """Holds the execution times for a given Dataset.

    This object contains a reference to the parent Dataset's stats as well,
    but not the Dataset object itself, to allow its blocks to be dropped from
    memory."""

    def __init__(
        self,
        *,
        metadata: StatsDict,
        parent: Union[Optional["DatasetStats"], List["DatasetStats"]],
        needs_stats_actor: bool = False,
        stats_uuid: str = None,
        base_name: str = None,
    ):
        """Create dataset stats.

        Args:
            metadata: Dict of operators used to create this Dataset from the
                previous one. Typically one entry, e.g., {"map": [...]}.
            parent: Reference to parent Dataset's stats, or a list of parents
                if there are multiple.
            needs_stats_actor: Whether this Dataset's stats needs a stats actor for
                stats collection. This is currently only used for Datasets using a
                lazy datasource (i.e. a LazyBlockList).
            stats_uuid: The uuid for the stats, used to fetch the right stats
                from the stats actor.
            base_name: The name of the base operation for a multi-operator operation.
        """

        self.metadata: StatsDict = metadata
        # Normalize a single parent into a one-element list.
        if parent is not None and not isinstance(parent, list):
            parent = [parent]
        self.parents: List["DatasetStats"] = parent or []
        # Position of this stage in the lineage: one past the deepest parent.
        self.number: int = (
            0 if not self.parents else max(p.number for p in self.parents) + 1
        )
        self.base_name = base_name
        # TODO(ekl) deprecate and remove the notion of dataset UUID once we move
        # fully to streaming execution.
        self.dataset_uuid: str = "unknown_uuid"
        self.time_total_s: float = 0
        self.needs_stats_actor = needs_stats_actor
        self.stats_uuid = stats_uuid

        # Streaming executor stats
        self.streaming_exec_schedule_s: Timer = Timer()

        # Iteration stats, filled out if the user iterates over the dataset.
        self.iter_wait_s: Timer = Timer()
        self.iter_get_s: Timer = Timer()
        self.iter_next_batch_s: Timer = Timer()
        self.iter_format_batch_s: Timer = Timer()
        self.iter_collate_batch_s: Timer = Timer()
        self.iter_finalize_batch_s: Timer = Timer()
        self.iter_total_blocked_s: Timer = Timer()
        self.iter_user_s: Timer = Timer()
        self.iter_initialize_s: Timer = Timer()
        self.iter_total_s: Timer = Timer()
        self.extra_metrics = {}

        # Block fetch stats during iteration.
        # These are stats about locations of blocks when the iterator is trying to
        # consume them. The iteration performance will be affected depending on
        # whether the block is in the local object store of the node where the
        # iterator is running.
        # This serves as an indicator of block prefetching effectiveness.
        self.iter_blocks_local: int = 0
        self.iter_blocks_remote: int = 0
        self.iter_unknown_location: int = 0

        # Memory usage stats
        self.global_bytes_spilled: int = 0
        self.global_bytes_restored: int = 0
        self.dataset_bytes_spilled: int = 0

        # Streaming split coordinator stats (dataset level)
        self.streaming_split_coordinator_s: Timer = Timer()

    @property
    def stats_actor(self):
        # Lazily resolves (and creates if needed) the shared stats actor.
        return _get_or_create_stats_actor()

    def child_builder(
        self, name: str, override_start_time: Optional[float] = None
    ) -> _DatasetStatsBuilder:
        """Start recording stats for an op of the given name (e.g., map)."""
        return _DatasetStatsBuilder(name, self, override_start_time)

    def to_summary(self) -> "DatasetStatsSummary":
        """Generate a `DatasetStatsSummary` object from the given `DatasetStats`
        object, which can be used to generate a summary string."""
        if self.needs_stats_actor:
            ac = self.stats_actor
            # TODO(chengsu): this is a super hack, clean it up.
            # Mutates self.time_total_s with the actor-reported elapsed time.
            stats_map, self.time_total_s = ray.get(ac.get.remote(self.stats_uuid))
            # Only populate stats when stats from all read tasks are ready at
            # stats actor.
            if len(stats_map.items()) == len(self.metadata["Read"]):
                self.metadata["Read"] = []
                # Flatten per-task block metadata in task-index order.
                for _, blocks_metadata in sorted(stats_map.items()):
                    self.metadata["Read"] += blocks_metadata

        operators_stats = []
        # More than one metadata entry means this stage has sub-operators.
        is_sub_operator = len(self.metadata) > 1
        for name, meta in self.metadata.items():
            operators_stats.append(
                OperatorStatsSummary.from_block_metadata(
                    name,
                    meta,
                    is_sub_operator=is_sub_operator,
                )
            )

        iter_stats = IterStatsSummary(
            self.iter_wait_s,
            self.iter_get_s,
            self.iter_next_batch_s,
            self.iter_format_batch_s,
            self.iter_collate_batch_s,
            self.iter_finalize_batch_s,
            self.iter_total_blocked_s,
            self.iter_user_s,
            self.iter_initialize_s,
            self.iter_total_s,
            self.streaming_split_coordinator_s,
            self.iter_blocks_local,
            self.iter_blocks_remote,
            self.iter_unknown_location,
        )
        # Recursively summarize the parent lineage.
        stats_summary_parents = []
        if self.parents is not None:
            stats_summary_parents = [p.to_summary() for p in self.parents]
        streaming_exec_schedule_s = (
            self.streaming_exec_schedule_s.get()
            if self.streaming_exec_schedule_s
            else 0
        )
        return DatasetStatsSummary(
            operators_stats,
            iter_stats,
            stats_summary_parents,
            self.number,
            self.dataset_uuid,
            self.time_total_s,
            self.base_name,
            self.extra_metrics,
            self.global_bytes_spilled,
            self.global_bytes_restored,
            self.dataset_bytes_spilled,
            streaming_exec_schedule_s,
        )

    def runtime_metrics(self) -> str:
        """Generate a string representing the runtime metrics of a Dataset. This is
        a high level summary of the time spent in Ray Data code broken down by operator.
        It also includes the time spent in the scheduler. Times are shown as the total
        time for each operator and percentages of time are shown as a fraction of the
        total time for the whole dataset."""
        return self.to_summary().runtime_metrics()
775
+
776
+
777
@DeveloperAPI
@dataclass
class DatasetStatsSummary:
    """Pre-computed, renderable summary of a Dataset's execution statistics.

    Aggregates per-operator stats, iteration stats, memory-spill counters,
    and (recursively, via ``parents``) the summaries of upstream datasets.
    """

    # Per-operator (or sub-operator) execution summaries for this stage.
    operators_stats: List["OperatorStatsSummary"]
    # Stats collected while iterating over the dataset's batches/rows.
    iter_stats: "IterStatsSummary"
    # Summaries of upstream datasets; enables recursive reporting.
    parents: List["DatasetStatsSummary"]
    # Position of this stage within the overall plan.
    number: int
    dataset_uuid: str
    # Total wall-clock time for this stage, in seconds.
    time_total_s: float
    base_name: str
    # Additional operator metrics; only rendered with verbose stats logging.
    extra_metrics: Dict[str, Any]
    # Cluster-wide object-store bytes spilled/restored during execution.
    global_bytes_spilled: int
    global_bytes_restored: int
    # Bytes spilled that are attributable to this dataset specifically.
    dataset_bytes_spilled: int
    # Time spent in the streaming executor's scheduling loop, in seconds.
    streaming_exec_schedule_s: float

    def to_string(
        self,
        already_printed: Optional[Set[str]] = None,
        include_parent: bool = True,
        add_global_stats=True,
    ) -> str:
        """Return a human-readable summary of this Dataset's stats.

        Args:
            already_printed: Set of operator IDs that have already had its stats
                printed out.
            include_parent: If true, also include parent stats summary; otherwise,
                only log stats of the latest operator.
            add_global_stats: If true, includes global stats to this summary.
        Returns:
            String with summary statistics for executing the Dataset.
        """
        if already_printed is None:
            already_printed = set()

        out = ""
        if self.parents and include_parent:
            # Render ancestors first so the report reads in execution order.
            for p in self.parents:
                parent_sum = p.to_string(already_printed, add_global_stats=False)
                if parent_sum:
                    out += parent_sum
                    out += "\n"
        operators_stats_summary = None
        if len(self.operators_stats) == 1:
            operators_stats_summary = self.operators_stats[0]
            operator_name = operators_stats_summary.operator_name
            operator_uuid = self.dataset_uuid + operator_name
            out += "Operator {} {}: ".format(self.number, operator_name)
            if operator_uuid in already_printed:
                # Stats for this operator were already emitted by an earlier call.
                out += "[execution cached]\n"
            else:
                already_printed.add(operator_uuid)
                out += str(operators_stats_summary)
        elif len(self.operators_stats) > 1:
            rounded_total = round(self.time_total_s, 2)
            if rounded_total <= 0:
                # Handle -0.0 case.
                rounded_total = 0
            out += "Operator {} {}: executed in {}s\n".format(
                self.number, self.base_name, rounded_total
            )
            for n, operators_stats_summary in enumerate(self.operators_stats):
                operator_name = operators_stats_summary.operator_name
                operator_uuid = self.dataset_uuid + operator_name
                out += "\n"
                out += "\tSuboperator {} {}: ".format(n, operator_name)
                if operator_uuid in already_printed:
                    out += "\t[execution cached]\n"
                else:
                    already_printed.add(operator_uuid)
                    out += str(operators_stats_summary)
        verbose_stats_logs = DataContext.get_current().verbose_stats_logs
        if verbose_stats_logs and self.extra_metrics:
            indent = (
                "\t"
                if operators_stats_summary and operators_stats_summary.is_sub_operator
                else ""
            )
            out += indent
            out += "* Extra metrics: " + str(self.extra_metrics) + "\n"
        out += str(self.iter_stats)

        if len(self.operators_stats) > 0 and add_global_stats:
            mb_spilled = round(self.global_bytes_spilled / 1e6)
            mb_restored = round(self.global_bytes_restored / 1e6)
            if mb_spilled or mb_restored:
                out += "\nCluster memory:\n"
                out += "* Spilled to disk: {}MB\n".format(mb_spilled)
                out += "* Restored from disk: {}MB\n".format(mb_restored)

            dataset_mb_spilled = round(self.dataset_bytes_spilled / 1e6)
            if dataset_mb_spilled:
                out += "\nDataset memory:\n"
                out += "* Spilled to disk: {}MB\n".format(dataset_mb_spilled)

            # For throughput, we compute both an observed Ray Data dataset throughput
            # and an estimated single node dataset throughput.

            # The observed dataset throughput is computed by dividing the total number
            # of rows produced by the total wall time of the dataset (i.e. from start to
            # finish how long did the dataset take to be processed). With the recursive
            # nature of the DatasetStatsSummary, we use get_total_wall_time to determine
            # the total wall time (this finds the difference between the earliest start
            # and latest end for any block in any operator).

            # The estimated single node dataset throughput is computed by dividing the
            # total number of rows produced the sum of the wall times across all blocks
            # of all operators. This assumes that on a single node the work done would
            # be equivalent, with no concurrency.
            output_num_rows = self.operators_stats[-1].output_num_rows
            total_num_out_rows = output_num_rows["sum"] if output_num_rows else 0
            wall_time = self.get_total_wall_time()
            total_time_all_blocks = self.get_total_time_all_blocks()
            if total_num_out_rows and wall_time and total_time_all_blocks:
                out += "\n"
                out += "Dataset throughput:\n"
                out += (
                    "\t* Ray Data throughput:"
                    f" {total_num_out_rows / wall_time} "
                    "rows/s\n"
                )
                out += (
                    "\t* Estimated single node throughput:"
                    f" {total_num_out_rows / total_time_all_blocks} "
                    "rows/s\n"
                )
        if verbose_stats_logs and add_global_stats:
            out += "\n" + self.runtime_metrics()

        return out

    @staticmethod
    def _collect_dataset_stats_summaries(
        curr: "DatasetStatsSummary",
    ) -> List["DatasetStatsSummary"]:
        """Flatten ``curr`` and its ancestry into a list, ancestors first.

        NOTE(review): parents that have no parents of their own are skipped
        entirely (only ``curr`` and ancestors reached through non-leaf parents
        are collected) — confirm this pruning of leaf parents is intended.
        """
        summs = []
        # TODO: Do operators ever have multiple parents? Do we need to deduplicate?
        for p in curr.parents:
            if p and p.parents:
                summs.extend(DatasetStatsSummary._collect_dataset_stats_summaries(p))
        return summs + [curr]

    @staticmethod
    def _find_start_and_end(summ: "DatasetStatsSummary") -> Tuple[float, float]:
        """Return (earliest block start, latest block end) across all operators
        of ``summ``. Assumes ``summ.operators_stats`` is non-empty."""
        earliest_start = min(ops.earliest_start_time for ops in summ.operators_stats)
        latest_end = max(ops.latest_end_time for ops in summ.operators_stats)
        return earliest_start, latest_end

    def runtime_metrics(self) -> str:
        """Render per-operator runtime, scheduler time and total wall time,
        each also expressed as a percentage of the total wall time."""
        total_wall_time = self.get_total_wall_time()

        def fmt_line(name: str, time: float) -> str:
            # NOTE(review): divides by total_wall_time; a zero total would raise
            # ZeroDivisionError — confirm callers only reach this after execution.
            return f"* {name}: {fmt(time)} ({time / total_wall_time * 100:.3f}%)\n"

        summaries = DatasetStatsSummary._collect_dataset_stats_summaries(self)
        out = "Runtime Metrics:\n"
        for summ in summaries:
            if len(summ.operators_stats) > 0:
                earliest_start, latest_end = DatasetStatsSummary._find_start_and_end(
                    summ
                )
                op_total_time = latest_end - earliest_start
                out += fmt_line(summ.base_name, op_total_time)
        out += fmt_line("Scheduling", self.streaming_exec_schedule_s)
        out += fmt_line("Total", total_wall_time)
        return out

    def __repr__(self, level=0) -> str:
        """Nested, indented debug representation; ``level`` controls the
        indentation depth for recursive child/parent rendering."""
        indent = leveled_indent(level)
        operators_stats = "\n".join(
            [ss.__repr__(level + 2) for ss in self.operators_stats]
        )
        parent_stats = "\n".join([ps.__repr__(level + 2) for ps in self.parents])
        extra_metrics = "\n".join(
            f"{leveled_indent(level + 2)}{k}: {v},"
            for k, v in self.extra_metrics.items()
        )

        # Handle formatting case for empty outputs.
        operators_stats = (
            f"\n{operators_stats},\n{indent} " if operators_stats else ""
        )
        parent_stats = f"\n{parent_stats},\n{indent} " if parent_stats else ""
        extra_metrics = f"\n{extra_metrics}\n{indent} " if extra_metrics else ""
        return (
            f"{indent}DatasetStatsSummary(\n"
            f"{indent} dataset_uuid={self.dataset_uuid},\n"
            f"{indent} base_name={self.base_name},\n"
            f"{indent} number={self.number},\n"
            f"{indent} extra_metrics={{{extra_metrics}}},\n"
            f"{indent} operators_stats=[{operators_stats}],\n"
            f"{indent} iter_stats={self.iter_stats.__repr__(level+1)},\n"
            f"{indent} global_bytes_spilled={self.global_bytes_spilled / 1e6}MB,\n"
            f"{indent} global_bytes_restored={self.global_bytes_restored / 1e6}MB,\n"
            f"{indent} dataset_bytes_spilled={self.dataset_bytes_spilled / 1e6}MB,\n"
            f"{indent} parents=[{parent_stats}],\n"
            f"{indent})"
        )

    def get_total_wall_time(self) -> float:
        """Calculate the total wall time for the dataset, this is done by finding
        the earliest start time and latest end time for any block in any operator.
        The wall time is the difference of these two times.
        """
        start_ends = [
            DatasetStatsSummary._find_start_and_end(summ)
            for summ in DatasetStatsSummary._collect_dataset_stats_summaries(self)
            if len(summ.operators_stats) > 0
        ]
        if len(start_ends) == 0:
            return 0
        else:
            earliest_start = min(start_end[0] for start_end in start_ends)
            latest_end = max(start_end[1] for start_end in start_ends)
            return latest_end - earliest_start

    def get_total_time_all_blocks(self) -> float:
        """Calculate the sum of the wall times across all blocks of all operators."""
        summaries = DatasetStatsSummary._collect_dataset_stats_summaries(self)
        return sum(
            (
                sum(
                    ops.wall_time.get("sum", 0) if ops.wall_time else 0
                    for ops in summ.operators_stats
                )
            )
            for summ in summaries
        )

    def get_total_cpu_time(self) -> float:
        """Sum of per-block CPU time across this summary and all ancestors.

        NOTE(review): assumes ``cpu_time`` is non-None for every operator here
        (unlike wall_time above, there is no None guard) — confirm.
        """
        parent_sum = sum(p.get_total_cpu_time() for p in self.parents)
        return parent_sum + sum(
            ss.cpu_time.get("sum", 0) for ss in self.operators_stats
        )

    def get_max_heap_memory(self) -> float:
        """Maximum per-block heap memory observed across this summary and all
        ancestors (units follow the ``memory`` stat dicts)."""
        parent_memory = [p.get_max_heap_memory() for p in self.parents]
        parent_max = max(parent_memory) if parent_memory else 0
        if not self.operators_stats:
            return parent_max

        return max(
            parent_max,
            *[ss.memory.get("max", 0) for ss in self.operators_stats],
        )
1023
+
1024
+
1025
@dataclass
class OperatorStatsSummary:
    """Pre-computed execution statistics for a single operator (or
    sub-operator), aggregated over the block metadata it produced."""

    operator_name: str
    # Whether the operator associated with this OperatorStatsSummary object
    # is a suboperator
    is_sub_operator: bool
    # This is the total walltime of the entire operator, typically obtained from
    # `DatasetStats.time_total_s`. An important distinction is that this is the
    # overall runtime of the operator, pulled from the stats actor, whereas the
    # computed walltimes in `self.wall_time` are calculated on a operator level.
    time_total_s: float
    earliest_start_time: float
    latest_end_time: float
    # String summarizing high-level statistics from executing the operator
    block_execution_summary_str: str
    # The fields below are dicts with stats aggregated across blocks
    # processed in this operator. For example:
    # {"min": ..., "max": ..., "mean": ..., "sum": ...}
    wall_time: Optional[Dict[str, float]] = None
    cpu_time: Optional[Dict[str, float]] = None
    udf_time: Optional[Dict[str, float]] = None
    # memory: no "sum" stat
    memory: Optional[Dict[str, float]] = None
    output_num_rows: Optional[Dict[str, float]] = None
    output_size_bytes: Optional[Dict[str, float]] = None
    # node_count: "count" stat instead of "sum"
    node_count: Optional[Dict[str, float]] = None
    task_rows: Optional[Dict[str, float]] = None

    @classmethod
    def from_block_metadata(
        cls,
        operator_name: str,
        block_metas: List[BlockMetadata],
        is_sub_operator: bool,
    ) -> "OperatorStatsSummary":
        """Calculate the stats for a operator from a given list of blocks,
        and generates a `OperatorStatsSummary` object with the results.

        Args:
            block_metas: List of `BlockMetadata` to calculate stats of
            operator_name: Name of operator associated with `blocks`
            is_sub_operator: Whether this set of blocks belongs to a sub operator.
        Returns:
            A `OperatorStatsSummary` object initialized with the calculated statistics
        """
        exec_stats = [m.exec_stats for m in block_metas if m.exec_stats is not None]
        rounded_total = 0
        time_total_s = 0
        earliest_start_time, latest_end_time = 0, 0

        if exec_stats:
            # Calculate the total execution time of operator as
            # the difference between the latest end time and
            # the earliest start time of all blocks in the operator.
            earliest_start_time = min(s.start_time_s for s in exec_stats)
            latest_end_time = max(s.end_time_s for s in exec_stats)
            time_total_s = latest_end_time - earliest_start_time

        if is_sub_operator:
            exec_summary_str = "{} blocks produced\n".format(len(exec_stats))
        else:
            if exec_stats:
                rounded_total = round(time_total_s, 2)
                if rounded_total <= 0:
                    # Handle -0.0 case.
                    rounded_total = 0
                exec_summary_str = "{} blocks produced in {}s".format(
                    len(exec_stats), rounded_total
                )
            else:
                exec_summary_str = ""
            exec_summary_str += "\n"

        # Aggregate output rows per task: a single task may emit several blocks,
        # so sum row counts keyed by the producing task's index.
        task_rows = collections.defaultdict(int)
        for meta in block_metas:
            if meta.num_rows is not None and meta.exec_stats is not None:
                task_rows[meta.exec_stats.task_idx] += meta.num_rows
        task_rows_stats = None
        if len(task_rows) > 0:
            task_rows_stats = {
                "min": min(task_rows.values()),
                "max": max(task_rows.values()),
                "mean": int(np.mean(list(task_rows.values()))),
                "count": len(task_rows),
            }
            exec_summary_str = "{} tasks executed, {}".format(
                len(task_rows), exec_summary_str
            )

        wall_time_stats, cpu_stats, memory_stats, udf_stats = None, None, None, None
        if exec_stats:
            wall_time_stats = {
                "min": min([e.wall_time_s for e in exec_stats]),
                "max": max([e.wall_time_s for e in exec_stats]),
                "mean": np.mean([e.wall_time_s for e in exec_stats]),
                "sum": sum([e.wall_time_s for e in exec_stats]),
            }
            cpu_stats = {
                "min": min([e.cpu_time_s for e in exec_stats]),
                "max": max([e.cpu_time_s for e in exec_stats]),
                "mean": np.mean([e.cpu_time_s for e in exec_stats]),
                "sum": sum([e.cpu_time_s for e in exec_stats]),
            }

            # Peak RSS per block, converted to MiB for reporting.
            memory_stats_mb = [
                round(e.max_rss_bytes / (1024 * 1024), 2) for e in exec_stats
            ]
            memory_stats = {
                "min": min(memory_stats_mb),
                "max": max(memory_stats_mb),
                "mean": int(np.mean(memory_stats_mb)),
            }

            udf_stats = {
                "min": min([e.udf_time_s for e in exec_stats]),
                "max": max([e.udf_time_s for e in exec_stats]),
                "mean": np.mean([e.udf_time_s for e in exec_stats]),
                "sum": sum([e.udf_time_s for e in exec_stats]),
            }

        output_num_rows_stats = None
        output_num_rows = [m.num_rows for m in block_metas if m.num_rows is not None]
        if output_num_rows:
            output_num_rows_stats = {
                "min": min(output_num_rows),
                "max": max(output_num_rows),
                "mean": int(np.mean(output_num_rows)),
                "sum": sum(output_num_rows),
            }

        output_size_bytes_stats = None
        output_size_bytes = [
            m.size_bytes for m in block_metas if m.size_bytes is not None
        ]
        if output_size_bytes:
            output_size_bytes_stats = {
                "min": min(output_size_bytes),
                "max": max(output_size_bytes),
                "mean": int(np.mean(output_size_bytes)),
                "sum": sum(output_size_bytes),
            }

        node_counts_stats = None
        if exec_stats:
            # Count distinct tasks per node to show how work spread across
            # the cluster.
            node_tasks = collections.defaultdict(set)
            for s in exec_stats:
                node_tasks[s.node_id].add(s.task_idx)

            node_counts = {node: len(tasks) for node, tasks in node_tasks.items()}
            node_counts_stats = {
                "min": min(node_counts.values()),
                "max": max(node_counts.values()),
                "mean": int(np.mean(list(node_counts.values()))),
                "count": len(node_counts),
            }

        return OperatorStatsSummary(
            operator_name=operator_name,
            is_sub_operator=is_sub_operator,
            time_total_s=time_total_s,
            earliest_start_time=earliest_start_time,
            latest_end_time=latest_end_time,
            block_execution_summary_str=exec_summary_str,
            wall_time=wall_time_stats,
            cpu_time=cpu_stats,
            udf_time=udf_stats,
            memory=memory_stats,
            output_num_rows=output_num_rows_stats,
            output_size_bytes=output_size_bytes_stats,
            node_count=node_counts_stats,
            task_rows=task_rows_stats,
        )

    def __str__(self) -> str:
        """For a given (pre-calculated) `OperatorStatsSummary` object (e.g. generated from
        `OperatorStatsSummary.from_block_metadata()`), returns a human-friendly string
        that summarizes operator execution statistics.

        Returns:
            String with summary statistics for executing the given operator.
        """
        indent = "\t" if self.is_sub_operator else ""
        out = self.block_execution_summary_str

        wall_time_stats = self.wall_time
        if wall_time_stats:
            out += indent
            out += "* Remote wall time: {} min, {} max, {} mean, {} total\n".format(
                fmt(wall_time_stats["min"]),
                fmt(wall_time_stats["max"]),
                fmt(wall_time_stats["mean"]),
                fmt(wall_time_stats["sum"]),
            )

        cpu_stats = self.cpu_time
        if cpu_stats:
            out += indent
            out += "* Remote cpu time: {} min, {} max, {} mean, {} total\n".format(
                fmt(cpu_stats["min"]),
                fmt(cpu_stats["max"]),
                fmt(cpu_stats["mean"]),
                fmt(cpu_stats["sum"]),
            )

        udf_stats = self.udf_time
        if udf_stats:
            out += indent
            out += "* UDF time: {} min, {} max, {} mean, {} total\n".format(
                fmt(udf_stats["min"]),
                fmt(udf_stats["max"]),
                fmt(udf_stats["mean"]),
                fmt(udf_stats["sum"]),
            )

        memory_stats = self.memory
        if memory_stats:
            out += indent
            out += "* Peak heap memory usage (MiB): {} min, {} max, {} mean\n".format(
                memory_stats["min"],
                memory_stats["max"],
                memory_stats["mean"],
            )

        output_num_rows_stats = self.output_num_rows
        if output_num_rows_stats:
            out += indent
            out += (
                "* Output num rows per block: {} min, {} max, {} mean, {} total\n"
            ).format(
                output_num_rows_stats["min"],
                output_num_rows_stats["max"],
                output_num_rows_stats["mean"],
                output_num_rows_stats["sum"],
            )

        output_size_bytes_stats = self.output_size_bytes
        if output_size_bytes_stats:
            out += indent
            out += (
                "* Output size bytes per block: {} min, {} max, {} mean, {} total\n"
            ).format(
                output_size_bytes_stats["min"],
                output_size_bytes_stats["max"],
                output_size_bytes_stats["mean"],
                output_size_bytes_stats["sum"],
            )

        task_rows = self.task_rows
        if task_rows:
            out += indent
            out += (
                "* Output rows per task: {} min, {} max, {} mean, {} tasks used\n"
            ).format(
                task_rows["min"],
                task_rows["max"],
                task_rows["mean"],
                task_rows["count"],
            )

        node_count_stats = self.node_count
        if node_count_stats:
            out += indent
            out += "* Tasks per node: {} min, {} max, {} mean; {} nodes used\n".format(
                node_count_stats["min"],
                node_count_stats["max"],
                node_count_stats["mean"],
                node_count_stats["count"],
            )
        if output_num_rows_stats and self.time_total_s and wall_time_stats:
            # For throughput, we compute both an observed Ray Data operator throughput
            # and an estimated single node operator throughput.

            # The observed Ray Data operator throughput is computed by dividing the
            # total number of rows produced by the wall time of the operator,
            # time_total_s.

            # The estimated single node operator throughput is computed by dividing the
            # total number of rows produced by the the sum of the wall times across all
            # blocks of the operator. This assumes that on a single node the work done
            # would be equivalent, with no concurrency.
            total_num_out_rows = output_num_rows_stats["sum"]
            out += indent
            out += "* Operator throughput:\n"
            out += (
                indent + "\t* Ray Data throughput:"
                f" {total_num_out_rows / self.time_total_s} "
                "rows/s\n"
            )
            out += (
                indent + "\t* Estimated single node throughput:"
                f" {total_num_out_rows / wall_time_stats['sum']} "
                "rows/s\n"
            )
        return out

    def __repr__(self, level=0) -> str:
        """For a given (pre-calculated) `OperatorStatsSummary` object (e.g. generated from
        `OperatorStatsSummary.from_block_metadata()`), returns a human-friendly string
        that summarizes operator execution statistics.

        Returns:
            String with summary statistics for executing the given operator.
        """
        indent = leveled_indent(level)
        indent += leveled_indent(1) if self.is_sub_operator else ""

        wall_time_stats = {k: fmt(v) for k, v in (self.wall_time or {}).items()}
        cpu_stats = {k: fmt(v) for k, v in (self.cpu_time or {}).items()}
        memory_stats = {k: fmt(v) for k, v in (self.memory or {}).items()}
        output_num_rows_stats = {
            k: fmt(v) for k, v in (self.output_num_rows or {}).items()
        }
        output_size_bytes_stats = {
            k: fmt(v) for k, v in (self.output_size_bytes or {}).items()
        }
        # NOTE(review): "node_conut_stats" is a misspelling of "node_count_stats";
        # harmless (local name only) but worth renaming in a follow-up.
        node_conut_stats = {k: fmt(v) for k, v in (self.node_count or {}).items()}
        out = (
            f"{indent}OperatorStatsSummary(\n"
            f"{indent} operator_name='{self.operator_name}',\n"
            f"{indent} is_suboperator={self.is_sub_operator},\n"
            f"{indent} time_total_s={fmt(self.time_total_s)},\n"
            # block_execution_summary_str already ends with \n
            f"{indent} block_execution_summary_str={self.block_execution_summary_str}"
            f"{indent} wall_time={wall_time_stats or None},\n"
            f"{indent} cpu_time={cpu_stats or None},\n"
            f"{indent} memory={memory_stats or None},\n"
            f"{indent} output_num_rows={output_num_rows_stats or None},\n"
            f"{indent} output_size_bytes={output_size_bytes_stats or None},\n"
            f"{indent} node_count={node_conut_stats or None},\n"
            f"{indent})"
        )
        return out
1358
+
1359
+
1360
@dataclass
class IterStatsSummary:
    """Timing and locality statistics gathered while iterating a Dataset
    (batch prefetching, formatting, collation, and user-code time)."""

    # Time spent in actor based prefetching, in seconds.
    wait_time: Timer
    # Time spent in `ray.get()`, in seconds
    get_time: Timer
    # Time spent in batch building, in seconds
    next_time: Timer
    # Time spent in `_format_batch_()`, in seconds
    format_time: Timer
    # Time spent in collate fn, in seconds
    collate_time: Timer
    # Time spent in finalize_fn, in seconds
    finalize_batch_time: Timer
    # Total time user thread is blocked by iter_batches
    block_time: Timer
    # Time spent in user code, in seconds
    user_time: Timer
    # Time spent initializing the iterator, in seconds.
    initialize_time: Timer
    # Total time taken by Dataset iterator, in seconds
    total_time: Timer
    # Time spent in streaming split coordinator
    streaming_split_coord_time: Timer
    # Num of blocks that are in local object store
    iter_blocks_local: int
    # Num of blocks that are in remote node and have to fetch locally
    iter_blocks_remote: int
    # Num of blocks with unknown locations
    iter_unknown_location: int

    def __str__(self) -> str:
        return self.to_string()

    def to_string(self) -> str:
        """Render the iterator timing breakdown; returns "" when no iteration
        time has been recorded at all."""
        out = ""
        if (
            self.block_time.get()
            or self.total_time.get()
            or self.get_time.get()
            or self.next_time.get()
            or self.format_time.get()
            or self.collate_time.get()
            or self.finalize_batch_time.get()
        ):
            out += "\nDataset iterator time breakdown:\n"
            if self.total_time.get():
                out += "* Total time overall: {}\n".format(fmt(self.total_time.get()))
                if self.initialize_time.get():
                    out += (
                        "    * Total time in Ray Data iterator initialization code: "
                        "{}\n".format(fmt(self.initialize_time.get()))
                    )
                if self.block_time.get():
                    out += (
                        "    * Total time user thread is blocked by Ray Data iter_batches: "
                        "{}\n".format(fmt(self.block_time.get()))
                    )
                if self.user_time.get():
                    out += "    * Total execution time for user thread: {}\n".format(
                        fmt(self.user_time.get())
                    )
            out += (
                "* Batch iteration time breakdown (summed across prefetch threads):\n"
            )
            if self.get_time.get():
                out += "    * In ray.get(): {} min, {} max, {} avg, {} total\n".format(
                    fmt(self.get_time.min()),
                    fmt(self.get_time.max()),
                    fmt(self.get_time.avg()),
                    fmt(self.get_time.get()),
                )
            if self.next_time.get():
                batch_creation_str = (
                    "    * In batch creation: {} min, {} max, " "{} avg, {} total\n"
                )
                out += batch_creation_str.format(
                    fmt(self.next_time.min()),
                    fmt(self.next_time.max()),
                    fmt(self.next_time.avg()),
                    fmt(self.next_time.get()),
                )
            if self.format_time.get():
                format_str = (
                    "    * In batch formatting: {} min, {} max, " "{} avg, {} total\n"
                )
                out += format_str.format(
                    fmt(self.format_time.min()),
                    fmt(self.format_time.max()),
                    fmt(self.format_time.avg()),
                    fmt(self.format_time.get()),
                )
            if self.collate_time.get():
                out += "    * In collate_fn: {} min, {} max, {} avg, {} total\n".format(
                    fmt(self.collate_time.min()),
                    fmt(self.collate_time.max()),
                    fmt(self.collate_time.avg()),
                    fmt(self.collate_time.get()),
                )
            if self.finalize_batch_time.get():
                format_str = (
                    "    * In host->device transfer: {} min, {} max, {} avg, {} total\n"
                )
                out += format_str.format(
                    fmt(self.finalize_batch_time.min()),
                    fmt(self.finalize_batch_time.max()),
                    fmt(self.finalize_batch_time.avg()),
                    fmt(self.finalize_batch_time.get()),
                )
            if DataContext.get_current().enable_get_object_locations_for_metrics:
                out += "Block locations:\n"
                out += "    * Num blocks local: {}\n".format(self.iter_blocks_local)
                out += "    * Num blocks remote: {}\n".format(self.iter_blocks_remote)
                out += "    * Num blocks unknown location: {}\n".format(
                    self.iter_unknown_location
                )
            if self.streaming_split_coord_time.get() != 0:
                out += "Streaming split coordinator overhead time: "
                out += f"{fmt(self.streaming_split_coord_time.get())}\n"

        return out

    def __repr__(self, level=0) -> str:
        """Compact debug representation; ``level`` controls nesting indent."""
        indent = leveled_indent(level)
        return (
            f"IterStatsSummary(\n"
            f"{indent} wait_time={fmt(self.wait_time.get()) or None},\n"
            f"{indent} get_time={fmt(self.get_time.get()) or None},\n"
            f"{indent} iter_blocks_local={self.iter_blocks_local or None},\n"
            f"{indent} iter_blocks_remote={self.iter_blocks_remote or None},\n"
            f"{indent} iter_unknown_location={self.iter_unknown_location or None},\n"
            f"{indent} next_time={fmt(self.next_time.get()) or None},\n"
            f"{indent} format_time={fmt(self.format_time.get()) or None},\n"
            f"{indent} user_time={fmt(self.user_time.get()) or None},\n"
            f"{indent} total_time={fmt(self.total_time.get()) or None},\n"
            f"{indent})"
        )
.venv/lib/python3.11/site-packages/ray/data/_internal/table_block.py ADDED
@@ -0,0 +1,310 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import collections
2
+ from typing import (
3
+ TYPE_CHECKING,
4
+ Any,
5
+ Dict,
6
+ Iterator,
7
+ List,
8
+ Mapping,
9
+ Optional,
10
+ TypeVar,
11
+ Union,
12
+ )
13
+
14
+ import numpy as np
15
+
16
+ from ray.air.constants import TENSOR_COLUMN_NAME
17
+ from ray.data._internal.block_builder import BlockBuilder
18
+ from ray.data._internal.numpy_support import is_array_like
19
+ from ray.data._internal.row import TableRow
20
+ from ray.data._internal.size_estimator import SizeEstimator
21
+ from ray.data._internal.util import MiB
22
+ from ray.data.block import Block, BlockAccessor
23
+
24
+ if TYPE_CHECKING:
25
+ from ray.data._internal.planner.exchange.sort_task_spec import SortKey
26
+
27
+
28
+ T = TypeVar("T")
29
+
30
+ # The max size of Python tuples to buffer before compacting them into a
31
+ # table in the BlockBuilder.
32
+ MAX_UNCOMPACTED_SIZE_BYTES = 50 * MiB
33
+
34
+
35
class TableBlockBuilder(BlockBuilder):
    """Shared builder for table-backed blocks.

    Rows added one at a time are buffered as Python column lists and
    periodically compacted into tables once their estimated size exceeds
    ``MAX_UNCOMPACTED_SIZE_BYTES``; whole tables can also be appended
    directly. ``build()`` concatenates everything into a single block.
    """

    def __init__(self, block_type):
        # Buffered (not yet compacted) Python values, keyed by column name.
        self._columns = collections.defaultdict(list)
        # Column names of the first buffered row; later rows must match.
        self._column_names = None
        # Compacted/added tables accumulated so far.
        self._tables: List[Any] = []
        # Index of the first table whose size hasn't been folded into
        # _tables_size_bytes yet; size computation is deferred because it
        # can be expensive (e.g. for pandas DataFrames).
        self._tables_size_cursor = 0
        # Accumulated sizes of tables before _tables_size_cursor.
        self._tables_size_bytes = 0
        # Estimator for the in-memory footprint of the uncompacted rows.
        self._uncompacted_size = SizeEstimator()
        self._num_rows = 0
        self._num_compactions = 0
        self._block_type = block_type

    def add(self, item: Union[dict, TableRow, np.ndarray]) -> None:
        """Buffer a single row, normalizing it to a plain mapping first."""
        if isinstance(item, TableRow):
            item = item.as_pydict()
        elif isinstance(item, np.ndarray):
            # Bare ndarrays become single-column tensor rows.
            item = {TENSOR_COLUMN_NAME: item}
        if not isinstance(item, collections.abc.Mapping):
            raise ValueError(
                "Returned elements of an TableBlock must be of type `dict`, "
                "got {} (type {}).".format(item, type(item))
            )

        item_column_names = item.keys()
        if self._column_names is None:
            # First row seen defines the schema for all subsequent rows.
            self._column_names = item_column_names
        elif item_column_names != self._column_names:
            raise ValueError(
                "Current row has different columns compared to previous rows. "
                f"Columns of current row: {sorted(item_column_names)}, "
                f"Columns of previous rows: {sorted(self._column_names)}."
            )

        for name, value in item.items():
            if is_array_like(value) and not isinstance(value, np.ndarray):
                value = np.array(value)
            self._columns[name].append(value)
        self._num_rows += 1
        self._compact_if_needed()
        self._uncompacted_size.add(item)

    def add_block(self, block: Any) -> None:
        """Append an already-built table of the expected block type."""
        if not isinstance(block, self._block_type):
            raise TypeError(
                f"Got a block of type {type(block)}, expected {self._block_type}."
                "If you are mapping a function, ensure it returns an "
                "object with the expected type. Block:\n"
                f"{block}"
            )
        self._num_rows += BlockAccessor.for_block(block).num_rows()
        self._tables.append(block)

    @staticmethod
    def _table_from_pydict(columns: Dict[str, List[Any]]) -> Block:
        """Build a table from column lists (subclass responsibility)."""
        raise NotImplementedError

    @staticmethod
    def _concat_tables(tables: List[Block]) -> Block:
        """Concatenate tables into one block (subclass responsibility)."""
        raise NotImplementedError

    @staticmethod
    def _empty_table() -> Any:
        """Return an empty table (subclass responsibility)."""
        raise NotImplementedError

    @staticmethod
    def _concat_would_copy() -> bool:
        """Whether concatenation copies data (subclass responsibility)."""
        raise NotImplementedError

    def will_build_yield_copy(self) -> bool:
        """Return True if build() will produce a copy of the input data."""
        if self._columns:
            # Compacting buffered dict-of-list columns always copies.
            return True
        return len(self._tables) > 1 and self._concat_would_copy()

    def build(self) -> Block:
        """Materialize all buffered rows and appended tables as one block."""
        pending = (
            [self._table_from_pydict(self._columns)] if self._columns else []
        )
        pending += self._tables
        if not pending:
            return self._empty_table()
        return self._concat_tables(pending)

    def num_rows(self) -> int:
        """Total number of rows added so far (buffered + compacted)."""
        return self._num_rows

    def get_estimated_memory_usage(self) -> int:
        """Estimated bytes held by this builder (tables + uncompacted rows)."""
        if self._num_rows == 0:
            return 0
        # Lazily fold in the sizes of tables not yet accounted for, then
        # advance the cursor so each table's size is computed only once.
        self._tables_size_bytes += sum(
            BlockAccessor.for_block(table).size_bytes()
            for table in self._tables[self._tables_size_cursor :]
        )
        self._tables_size_cursor = len(self._tables)
        return self._tables_size_bytes + self._uncompacted_size.size_bytes()

    def _compact_if_needed(self) -> None:
        """Compact buffered columns into a table once they grow large enough."""
        assert self._columns
        if self._uncompacted_size.size_bytes() < MAX_UNCOMPACTED_SIZE_BYTES:
            return
        self.add_block(self._table_from_pydict(self._columns))
        self._uncompacted_size = SizeEstimator()
        self._columns.clear()
        self._num_compactions += 1
+
158
+
159
class TableBlockAccessor(BlockAccessor):
    """Base accessor for tabular (Arrow / pandas) blocks."""

    # Row wrapper type; subclasses override with their format-specific row.
    ROW_TYPE: TableRow = TableRow

    def __init__(self, table: Any):
        self._table = table

    def _get_row(self, index: int, copy: bool = False) -> Union[TableRow, np.ndarray]:
        """Return row ``index`` as a single-row slice wrapped in ROW_TYPE."""
        single_row_block = self.slice(index, index + 1, copy=copy)
        return self.ROW_TYPE(single_row_block)

    @staticmethod
    def _munge_conflict(name, count):
        # Disambiguate a duplicated column name, e.g. ("col", 1) -> "col_2".
        return f"{name}_{count+1}"

    @staticmethod
    def _build_tensor_row(row: TableRow) -> np.ndarray:
        raise NotImplementedError

    def to_default(self) -> Block:
        # Always promote Arrow blocks to pandas for consistency, since
        # we lazily convert pandas->Arrow internally for efficiency.
        return self.to_pandas()

    def column_names(self) -> List[str]:
        raise NotImplementedError

    def append_column(self, name: str, data: Any) -> Block:
        raise NotImplementedError

    def to_block(self) -> Block:
        return self._table

    def iter_rows(
        self, public_row_format: bool
    ) -> Iterator[Union[Mapping, np.ndarray]]:
        """Iterate over rows, optionally converting each to a plain dict."""

        def row_generator():
            idx = 0
            # Re-check num_rows() each step, mirroring cursor-style iteration.
            while idx < self.num_rows():
                row = self._get_row(idx)
                if public_row_format and isinstance(row, TableRow):
                    yield row.as_pydict()
                else:
                    yield row
                idx += 1

        return row_generator()

    def _zip(self, acc: BlockAccessor) -> "Block":
        raise NotImplementedError

    def zip(self, other: "Block") -> "Block":
        """Horizontally combine this block with ``other`` of equal length."""
        other_acc = BlockAccessor.for_block(other)
        if not isinstance(other_acc, type(self)):
            if isinstance(self, TableBlockAccessor) and isinstance(
                other_acc, TableBlockAccessor
            ):
                # If block types are different, but still both of TableBlock type, try
                # converting both to default block type before zipping.
                left, right = TableBlockAccessor.normalize_block_types(
                    [self._table, other],
                )
                return BlockAccessor.for_block(left).zip(right)
            raise ValueError(
                "Cannot zip {} with block of type {}".format(type(self), type(other))
            )
        if other_acc.num_rows() != self.num_rows():
            raise ValueError(
                "Cannot zip self (length {}) with block of length {}".format(
                    self.num_rows(), other_acc.num_rows()
                )
            )
        return self._zip(other_acc)

    @staticmethod
    def _empty_table() -> Any:
        raise NotImplementedError

    def _sample(self, n_samples: int, sort_key: "SortKey") -> Any:
        raise NotImplementedError

    def sample(self, n_samples: int, sort_key: "SortKey") -> Any:
        """Sample up to ``n_samples`` values of the sort-key column."""
        if sort_key is None or callable(sort_key):
            raise NotImplementedError(
                f"Table sort key must be a column name, was: {sort_key}"
            )
        if self.num_rows() == 0:
            # If the pyarrow table is empty we may not have schema
            # so calling table.select() will raise an error.
            return self._empty_table()
        return self._sample(min(n_samples, self.num_rows()), sort_key)

    @classmethod
    def normalize_block_types(
        cls,
        blocks: List[Block],
        normalize_type: Optional[str] = None,
    ) -> List[Block]:
        """Normalize input blocks to the specified `normalize_type`. If the blocks
        are already all of the same type, returns the original blocks.

        Args:
            blocks: A list of TableBlocks to be normalized.
            normalize_type: The type to normalize the blocks to. If None,
                the default block type (Arrow) is used.

        Returns:
            A list of blocks of the same type.
        """
        observed_types = set()
        for blk in blocks:
            if not isinstance(BlockAccessor.for_block(blk), TableBlockAccessor):
                raise ValueError(
                    "Block type normalization is only supported for TableBlock, "
                    f"but received block of type: {type(blk)}."
                )
            observed_types.add(type(blk))

        # Return original blocks if they are all of the same type.
        if len(observed_types) <= 1:
            return blocks

        if normalize_type == "arrow":
            converted = [BlockAccessor.for_block(b).to_arrow() for b in blocks]
        elif normalize_type == "pandas":
            converted = [BlockAccessor.for_block(b).to_pandas() for b in blocks]
        else:
            converted = [BlockAccessor.for_block(b).to_default() for b in blocks]

        if any(not isinstance(b, type(converted[0])) for b in converted):
            raise ValueError(
                "Expected all blocks to be of the same type after normalization, but "
                f"got different types: {[type(b) for b in converted]}. "
                "Try using blocks of the same type to avoid the issue "
                "with block normalization."
            )
        return converted
.venv/lib/python3.11/site-packages/ray/data/_internal/torch_iterable_dataset.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ from torch.utils.data import IterableDataset
2
+
3
+
4
class TorchIterableDataset(IterableDataset):
    """An ``IterableDataset`` backed by a user-supplied generator factory.

    A fresh iterator is requested from ``generator_func`` on every
    ``__iter__`` call, so the dataset can be consumed repeatedly
    (e.g. once per training epoch).
    """

    def __init__(self, generator_func):
        self.generator_func = generator_func

    def __iter__(self):
        yield from self.generator_func()
.venv/lib/python3.11/site-packages/ray/data/_internal/util.py ADDED
@@ -0,0 +1,1262 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import importlib
2
+ import logging
3
+ import os
4
+ import pathlib
5
+ import random
6
+ import sys
7
+ import threading
8
+ import time
9
+ import urllib.parse
10
+ from queue import Empty, Full, Queue
11
+ from types import ModuleType
12
+ from typing import (
13
+ TYPE_CHECKING,
14
+ Any,
15
+ Callable,
16
+ Generator,
17
+ Iterable,
18
+ Iterator,
19
+ List,
20
+ Optional,
21
+ Tuple,
22
+ TypeVar,
23
+ Union,
24
+ )
25
+
26
+ import numpy as np
27
+
28
+ import ray
29
+ from ray._private.utils import _get_pyarrow_version
30
+ from ray.data.context import DEFAULT_READ_OP_MIN_NUM_BLOCKS, WARN_PREFIX, DataContext
31
+
32
+ if TYPE_CHECKING:
33
+ import pandas
34
+ import pyarrow
35
+
36
+ from ray.data._internal.compute import ComputeStrategy
37
+ from ray.data._internal.planner.exchange.sort_task_spec import SortKey
38
+ from ray.data.block import Block, BlockMetadata, UserDefinedFunction
39
+ from ray.data.datasource import Datasource, Reader
40
+ from ray.util.placement_group import PlacementGroup
41
+
42
+ logger = logging.getLogger(__name__)
43
+
44
+
45
+ KiB = 1024 # bytes
46
+ MiB = 1024 * KiB
47
+ GiB = 1024 * MiB
48
+
49
+
50
+ SENTINEL = object()
51
+
52
+
53
+ # NOTE: Make sure that these lower and upper bounds stay in sync with version
54
+ # constraints given in python/setup.py.
55
+ # Inclusive minimum pyarrow version.
56
+ MIN_PYARROW_VERSION = "6.0.1"
57
+ RAY_DISABLE_PYARROW_VERSION_CHECK = "RAY_DISABLE_PYARROW_VERSION_CHECK"
58
+ _VERSION_VALIDATED = False
59
+ _LOCAL_SCHEME = "local"
60
+ _EXAMPLE_SCHEME = "example"
61
+
62
+
63
+ LazyModule = Union[None, bool, ModuleType]
64
+ _pyarrow_dataset: LazyModule = None
65
+
66
+
67
+ class _NullSentinel:
68
+ """Sentinel value that sorts greater than any other value."""
69
+
70
+ def __eq__(self, other):
71
+ return isinstance(other, _NullSentinel)
72
+
73
+ def __lt__(self, other):
74
+ return False
75
+
76
+ def __le__(self, other):
77
+ return isinstance(other, _NullSentinel)
78
+
79
+ def __gt__(self, other):
80
+ return True
81
+
82
+ def __ge__(self, other):
83
+ return True
84
+
85
+ def __hash__(self):
86
+ return id(self)
87
+
88
+
89
+ NULL_SENTINEL = _NullSentinel()
90
+
91
+
92
def _lazy_import_pyarrow_dataset() -> "LazyModule":
    """Import ``pyarrow.dataset`` once, caching the result (or the failure).

    Returns the module on success, or ``False`` when pyarrow isn't installed,
    so the failed import is never retried on subsequent calls.
    """
    global _pyarrow_dataset
    if _pyarrow_dataset is not None:
        return _pyarrow_dataset
    try:
        from pyarrow import dataset as _pyarrow_dataset
    except ModuleNotFoundError:
        # Cache the failure so we don't pay the import cost on every call.
        _pyarrow_dataset = False
    return _pyarrow_dataset
102
+
103
+
104
def _check_pyarrow_version():
    """Check that pyarrow's version is within the supported bounds.

    Runs at most once per process (tracked via ``_VERSION_VALIDATED``).
    Raises ImportError for a too-old pyarrow and logs a warning when the
    installed version cannot be determined. Setting the env var named by
    ``RAY_DISABLE_PYARROW_VERSION_CHECK`` to "1" disables the check.
    """
    global _VERSION_VALIDATED

    if _VERSION_VALIDATED:
        return

    if os.environ.get(RAY_DISABLE_PYARROW_VERSION_CHECK, "0") == "1":
        _VERSION_VALIDATED = True
        return

    version = _get_pyarrow_version()
    if version is None:
        # pyarrow may be vendored by another package; we can't verify it.
        logger.warning(
            "You are using the 'pyarrow' module, but the exact version is unknown "
            "(possibly carried as an internal component by another module). Please "
            f"make sure you are using pyarrow >= {MIN_PYARROW_VERSION} to ensure "
            "compatibility with Ray Dataset. "
            "If you want to disable this pyarrow version check, set the "
            f"environment variable {RAY_DISABLE_PYARROW_VERSION_CHECK}=1."
        )
    else:
        from packaging.version import parse as parse_version

        if parse_version(version) < parse_version(MIN_PYARROW_VERSION):
            # Note: raising here intentionally leaves _VERSION_VALIDATED
            # False, so the check (and error) repeats on the next call.
            raise ImportError(
                f"Dataset requires pyarrow >= {MIN_PYARROW_VERSION}, but "
                f"{version} is installed. Reinstall with "
                f'`pip install -U "pyarrow"`. '
                "If you want to disable this pyarrow version check, set the "
                f"environment variable {RAY_DISABLE_PYARROW_VERSION_CHECK}=1."
            )
    _VERSION_VALIDATED = True
135
+
136
+
137
def _autodetect_parallelism(
    parallelism: int,
    target_max_block_size: int,
    ctx: DataContext,
    datasource_or_legacy_reader: Optional[Union["Datasource", "Reader"]] = None,
    mem_size: Optional[int] = None,
    placement_group: Optional["PlacementGroup"] = None,
    avail_cpus: Optional[int] = None,
) -> Tuple[int, str, Optional[int]]:
    """Returns parallelism to use and the min safe parallelism to avoid OOMs.

    Heuristics, applied in order:

    1) Start from ``DataContext.read_op_min_num_blocks`` (default 200).
    2) Min block size: reduce parallelism that would produce tiny blocks.
    3) Max block size: increase parallelism that would produce oversized
       blocks (OOM risk during processing).
    4) Available CPUs: increase parallelism until all cluster CPUs can be
       used.

    Args:
        parallelism: The user-requested parallelism, or -1 for auto-detection.
        target_max_block_size: The target max block size to produce. Passed
            separately from the DatasetContext because it may be set per-op
            instead of per-Dataset.
        ctx: The current Dataset context to use for configs.
        datasource_or_legacy_reader: The datasource or legacy reader, used
            for data size estimation.
        mem_size: If passed, used to compute the parallelism according to
            target_max_block_size.
        placement_group: The placement group that this Dataset will execute
            inside, if any.
        avail_cpus: Override avail cpus detection (for testing only).

    Returns:
        Tuple of detected parallelism (only if -1 was specified), the reason
        for the detected parallelism (only if -1 was specified), and the
        estimated in-memory size of the dataset.
    """
    min_safe_parallelism = 1
    max_reasonable_parallelism = sys.maxsize

    if mem_size is None and datasource_or_legacy_reader:
        mem_size = datasource_or_legacy_reader.estimate_inmemory_data_size()
    if mem_size is not None and not np.isnan(mem_size):
        # Floor: keep blocks no larger than target_max_block_size (OOM safety).
        min_safe_parallelism = max(1, int(mem_size / target_max_block_size))
        # Ceiling: keep blocks no smaller than target_min_block_size (overhead).
        max_reasonable_parallelism = max(1, int(mem_size / ctx.target_min_block_size))

    reason = ""
    if parallelism < 0:
        if parallelism != -1:
            raise ValueError("`parallelism` must either be -1 or a positive integer.")

        # Honor the deprecated ``min_parallelism`` only while the new
        # ``read_op_min_num_blocks`` is still at its default.
        if (
            ctx.min_parallelism is not None
            and ctx.min_parallelism != DEFAULT_READ_OP_MIN_NUM_BLOCKS
            and ctx.read_op_min_num_blocks == DEFAULT_READ_OP_MIN_NUM_BLOCKS
        ):
            logger.warning(
                "``DataContext.min_parallelism`` is deprecated in Ray 2.10. "
                "Please specify ``DataContext.read_op_min_num_blocks`` instead."
            )
            ctx.read_op_min_num_blocks = ctx.min_parallelism

        # Start with 2x the number of cores as a baseline, with a min floor.
        if placement_group is None:
            placement_group = ray.util.get_current_placement_group()
        avail_cpus = avail_cpus or _estimate_avail_cpus(placement_group)
        parallelism = max(
            min(ctx.read_op_min_num_blocks, max_reasonable_parallelism),
            min_safe_parallelism,
            avail_cpus * 2,
        )

        # Report which constraint ended up binding.
        if parallelism == ctx.read_op_min_num_blocks:
            reason = (
                "DataContext.get_current().read_op_min_num_blocks="
                f"{ctx.read_op_min_num_blocks}"
            )
        elif parallelism == max_reasonable_parallelism:
            reason = (
                "output blocks of size at least "
                "DataContext.get_current().target_min_block_size="
                f"{ctx.target_min_block_size / (1024 * 1024)}MiB"
            )
        elif parallelism == min_safe_parallelism:
            reason = (
                "output blocks of size at most "
                "DataContext.get_current().target_max_block_size="
                f"{ctx.target_max_block_size / (1024 * 1024)}MiB"
            )
        else:
            reason = (
                "parallelism at least twice the available number "
                f"of CPUs ({avail_cpus})"
            )

        logger.debug(
            f"Autodetected parallelism={parallelism} based on "
            f"estimated_available_cpus={avail_cpus} and "
            f"estimated_data_size={mem_size}."
        )

    return parallelism, reason, mem_size
244
+
245
+
246
def _estimate_avail_cpus(cur_pg: Optional["PlacementGroup"]) -> int:
    """Estimates the available CPU parallelism for this Dataset in the cluster.

    If we aren't in a placement group, this is trivially the number of CPUs in
    the cluster. Otherwise, we try to calculate how large the placement group
    is relative to the size of the cluster.

    Args:
        cur_pg: The current placement group, if any.
    """
    cluster_cpus = int(ray.cluster_resources().get("CPU", 1))
    cluster_gpus = int(ray.cluster_resources().get("GPU", 0))

    if not cur_pg:
        return cluster_cpus

    # If we're in a placement group, we shouldn't assume the entire cluster's
    # resources are available for us to use. Estimate an upper bound on what's
    # reasonable to assume is available for datasets to use.
    pg_cpus = 0
    for bundle in cur_pg.bundle_specs:
        # Calculate the proportion of the cluster this placement group "takes
        # up", then scale cluster_cpus proportionally to avoid
        # over-parallelizing if there are many parallel Tune trials using the
        # cluster.
        cpu_fraction = bundle.get("CPU", 0) / max(1, cluster_cpus)
        gpu_fraction = bundle.get("GPU", 0) / max(1, cluster_gpus)
        dominant_fraction = max(cpu_fraction, gpu_fraction)
        # Over-parallelize by up to a factor of 2, but no more than that.
        # It's preferable to over-estimate than under-estimate.
        pg_cpus += 2 * int(dominant_fraction * cluster_cpus)

    return min(cluster_cpus, pg_cpus)
278
+
279
+
280
def _estimate_available_parallelism() -> int:
    """Estimates the available CPU parallelism for this Dataset in the cluster.

    If we are currently in a placement group, take that into account.
    """
    return _estimate_avail_cpus(ray.util.get_current_placement_group())
285
+
286
+
287
def _warn_on_high_parallelism(requested_parallelism, num_read_tasks):
    """Warn when the requested read parallelism far exceeds cluster CPU slots.

    Only fires for explicitly-requested parallelism that yields at least 5000
    read tasks AND more than 4x the available CPU slots.
    """
    available_cpu_slots = ray.available_resources().get("CPU", 1)
    excessive = (
        requested_parallelism
        and num_read_tasks > available_cpu_slots * 4
        and num_read_tasks >= 5000
    )
    if excessive:
        logger.warning(
            f"{WARN_PREFIX} The requested parallelism of {requested_parallelism} "
            "is more than 4x the number of available CPU slots in the cluster of "
            f"{available_cpu_slots}. This can "
            "lead to slowdowns during the data reading phase due to excessive "
            "task creation. Reduce the parallelism to match with the available "
            "CPU slots in the cluster, or set parallelism to -1 for Ray Data "
            "to automatically determine the parallelism. "
            "You can ignore this message if the cluster is expected to autoscale."
        )
304
+
305
+
306
+ def _check_import(obj, *, module: str, package: str) -> None:
307
+ """Check if a required dependency is installed.
308
+
309
+ If `module` can't be imported, this function raises an `ImportError` instructing
310
+ the user to install `package` from PyPI.
311
+
312
+ Args:
313
+ obj: The object that has a dependency.
314
+ module: The name of the module to import.
315
+ package: The name of the package on PyPI.
316
+ """
317
+ try:
318
+ importlib.import_module(module)
319
+ except ImportError:
320
+ raise ImportError(
321
+ f"`{obj.__class__.__name__}` depends on '{package}', but '{package}' "
322
+ f"couldn't be imported. You can install '{package}' by running `pip "
323
+ f"install {package}`."
324
+ )
325
+
326
+
327
def _resolve_custom_scheme(path: str) -> str:
    """Returns the resolved path if the given path follows a Ray-specific
    custom scheme. Otherwise, returns the path unchanged.

    The supported custom schemes are: "local", "example".
    """
    parsed = urllib.parse.urlparse(path)
    if parsed.scheme == _LOCAL_SCHEME:
        # "local://" paths simply drop the scheme prefix.
        return parsed.netloc + parsed.path
    if parsed.scheme == _EXAMPLE_SCHEME:
        # "example://" paths resolve relative to the bundled example data dir.
        example_data_path = pathlib.Path(__file__).parent.parent / "examples" / "data"
        resolved = example_data_path / (parsed.netloc + parsed.path)
        return str(resolved.resolve())
    return path
341
+
342
+
343
def _is_local_scheme(paths: Union[str, List[str]]) -> bool:
    """Returns True if the given paths are in local scheme.

    Note: The paths must all share one scheme; mixing local and non-local
    schemes raises a ValueError.
    """
    if isinstance(paths, str):
        paths = [paths]
    if isinstance(paths, pathlib.Path):
        paths = [str(paths)]
    elif not isinstance(paths, list) or any(not isinstance(p, str) for p in paths):
        raise ValueError("paths must be a path string or a list of path strings.")
    elif len(paths) == 0:
        raise ValueError("Must provide at least one path.")

    local_count = sum(
        urllib.parse.urlparse(p).scheme == _LOCAL_SCHEME for p in paths
    )
    if 0 < local_count < len(paths):
        raise ValueError(
            "The paths must all be local-scheme or not local-scheme, "
            f"but found mixed {paths}"
        )
    return local_count == len(paths)
363
+
364
+
365
+ def _truncated_repr(obj: Any) -> str:
366
+ """Utility to return a truncated object representation for error messages."""
367
+ msg = str(obj)
368
+ if len(msg) > 200:
369
+ msg = msg[:200] + "..."
370
+ return msg
371
+
372
+
373
+ def _insert_doc_at_pattern(
374
+ obj,
375
+ *,
376
+ message: str,
377
+ pattern: str,
378
+ insert_after: bool = True,
379
+ directive: Optional[str] = None,
380
+ skip_matches: int = 0,
381
+ ) -> str:
382
+ if "\n" in message:
383
+ raise ValueError(
384
+ "message shouldn't contain any newlines, since this function will insert "
385
+ f"its own linebreaks when text wrapping: {message}"
386
+ )
387
+
388
+ doc = obj.__doc__.strip()
389
+ if not doc:
390
+ doc = ""
391
+
392
+ if pattern == "" and insert_after:
393
+ # Empty pattern + insert_after means that we want to append the message to the
394
+ # end of the docstring.
395
+ head = doc
396
+ tail = ""
397
+ else:
398
+ tail = doc
399
+ i = tail.find(pattern)
400
+ skip_matches_left = skip_matches
401
+ while i != -1:
402
+ if insert_after:
403
+ # Set offset to the first character after the pattern.
404
+ offset = i + len(pattern)
405
+ else:
406
+ # Set offset to the first character in the matched line.
407
+ offset = tail[:i].rfind("\n") + 1
408
+ head = tail[:offset]
409
+ tail = tail[offset:]
410
+ skip_matches_left -= 1
411
+ if skip_matches_left <= 0:
412
+ break
413
+ elif not insert_after:
414
+ # Move past the found pattern, since we're skipping it.
415
+ tail = tail[i - offset + len(pattern) :]
416
+ i = tail.find(pattern)
417
+ else:
418
+ raise ValueError(
419
+ f"Pattern {pattern} not found after {skip_matches} skips in docstring "
420
+ f"{doc}"
421
+ )
422
+ # Get indentation of the to-be-inserted text.
423
+ after_lines = list(filter(bool, tail.splitlines()))
424
+ if len(after_lines) > 0:
425
+ lines = after_lines
426
+ else:
427
+ lines = list(filter(bool, reversed(head.splitlines())))
428
+ # Should always have at least one non-empty line in the docstring.
429
+ assert len(lines) > 0
430
+ indent = " " * (len(lines[0]) - len(lines[0].lstrip()))
431
+ # Handle directive.
432
+ message = message.strip("\n")
433
+ if directive is not None:
434
+ base = f"{indent}.. {directive}::\n"
435
+ message = message.replace("\n", "\n" + indent + " " * 4)
436
+ message = base + indent + " " * 4 + message
437
+ else:
438
+ message = indent + message.replace("\n", "\n" + indent)
439
+ # Add two blank lines before/after message, if necessary.
440
+ if insert_after ^ (pattern == "\n\n"):
441
+ # Only two blank lines before message if:
442
+ # 1. Inserting message after pattern and pattern is not two blank lines.
443
+ # 2. Inserting message before pattern and pattern is two blank lines.
444
+ message = "\n\n" + message
445
+ if (not insert_after) ^ (pattern == "\n\n"):
446
+ # Only two blank lines after message if:
447
+ # 1. Inserting message before pattern and pattern is not two blank lines.
448
+ # 2. Inserting message after pattern and pattern is two blank lines.
449
+ message = message + "\n\n"
450
+
451
+ # Insert message before/after pattern.
452
+ parts = [head, message, tail]
453
+ # Build new docstring.
454
+ obj.__doc__ = "".join(parts)
455
+
456
+
457
def _consumption_api(
    if_more_than_read: bool = False,
    datasource_metadata: Optional[str] = None,
    extra_condition: Optional[str] = None,
    delegate: Optional[str] = None,
    pattern="Examples:",
    insert_after=False,
):
    """Annotate the function with an indication that it's a consumption API,
    and that it will trigger Dataset execution.

    Builds the note text from the given conditions and splices it into the
    decorated function's docstring at ``pattern``.
    """
    base = (
        " will trigger execution of the lazy transformations performed on "
        "this dataset."
    )
    if delegate:
        message = delegate + base
    elif not if_more_than_read:
        message = "This operation" + base
    else:
        # Build the conditional preamble piece by piece.
        condition = "If this dataset consists of more than a read, "
        if datasource_metadata is not None:
            condition += (
                f"or if the {datasource_metadata} can't be determined from the "
                "metadata provided by the datasource, "
            )
        if extra_condition is not None:
            condition += extra_condition + ", "
        message = condition + "then this operation" + base

    def decorator(obj):
        _insert_doc_at_pattern(
            obj,
            message=message,
            pattern=pattern,
            insert_after=insert_after,
            directive="note",
        )
        return obj

    return decorator
498
+
499
+
500
def ConsumptionAPI(*args, **kwargs):
    """Annotate the function with an indication that it's a consumption API,
    and that it will trigger Dataset execution.
    """
    # Bare-decorator usage (@ConsumptionAPI with no arguments).
    if len(args) == 1 and not kwargs and callable(args[0]):
        return _consumption_api()(args[0])
    # Parameterized usage (@ConsumptionAPI(...)).
    return _consumption_api(*args, **kwargs)
507
+
508
+
509
def _all_to_all_api(*args, **kwargs):
    """Annotate the function with an indication that it's an all-to-all API,
    and that it is an operation that requires all inputs to be materialized
    in-memory to execute.
    """
    note_text = (
        "This operation requires all inputs to be "
        "materialized in object store for it to execute."
    )

    def decorator(obj):
        _insert_doc_at_pattern(
            obj,
            message=note_text,
            pattern="Examples:",
            insert_after=False,
            directive="note",
        )
        return obj

    return decorator
528
+
529
+
530
def AllToAllAPI(*args, **kwargs):
    """Annotate the function with an indication that it's an all-to-all API,
    and that it is an operation that requires all inputs to be materialized
    in-memory to execute.
    """
    # This should only be used as a bare decorator for dataset methods.
    assert len(args) == 1 and not kwargs and callable(args[0])
    return _all_to_all_api()(args[0])
537
+
538
+
539
def get_compute_strategy(
    fn: "UserDefinedFunction",
    fn_constructor_args: Optional[Iterable[Any]] = None,
    compute: Optional[Union[str, "ComputeStrategy"]] = None,
    concurrency: Optional[Union[int, Tuple[int, int]]] = None,
) -> "ComputeStrategy":
    """Get `ComputeStrategy` based on the function or class, and concurrency
    information.

    Args:
        fn: The function or generator to apply to a record batch, or a class
            type that can be instantiated to create such a callable.
        fn_constructor_args: Positional arguments to pass to ``fn``'s
            constructor.
        compute: Either "tasks" (default) to use Ray Tasks or an
            :class:`~ray.data.ActorPoolStrategy` to use an autoscaling actor
            pool.
        concurrency: The number of Ray workers to use concurrently.

    Returns:
        The `ComputeStrategy` for execution.

    Raises:
        ValueError: On invalid combinations of ``fn`` kind, ``compute``, and
            ``concurrency``.
    """
    # Lazily import these objects to avoid circular imports.
    from ray.data._internal.compute import ActorPoolStrategy, TaskPoolStrategy
    from ray.data.block import CallableClass

    # TODO(chengsu): disallow object that is not a function. For example,
    # an object instance of a class often indicates a bug in user code.
    is_callable_class = isinstance(fn, CallableClass)
    if not is_callable_class and fn_constructor_args is not None:
        raise ValueError(
            "``fn_constructor_args`` can only be specified if providing a "
            f"callable class instance for ``fn``, but got: {fn}."
        )

    if compute is not None:
        # Legacy code path to support `compute` argument.
        logger.warning(
            "The argument ``compute`` is deprecated in Ray 2.9. Please specify "
            "argument ``concurrency`` instead. For more information, see "
            "https://docs.ray.io/en/master/data/transforming-data.html#"
            "stateful-transforms."
        )
        wants_tasks = compute == "tasks" or isinstance(compute, TaskPoolStrategy)
        wants_actors = compute == "actors" or isinstance(compute, ActorPoolStrategy)
        if is_callable_class and wants_tasks:
            raise ValueError(
                "``compute`` must specify an actor compute strategy when using a "
                f"callable class, but got: {compute}. For example, use "
                "``compute=ray.data.ActorPoolStrategy(size=n)``."
            )
        if not is_callable_class and wants_actors:
            raise ValueError(
                f"``compute`` is specified as the actor compute strategy: {compute}, "
                f"but ``fn`` is not a callable class: {fn}. Pass a callable class or "
                "use the default ``compute`` strategy."
            )
        return compute

    if concurrency is None:
        if is_callable_class:
            raise ValueError(
                "``concurrency`` must be specified when using a callable class. "
                "For example, use ``concurrency=n`` for a pool of ``n`` workers."
            )
        return TaskPoolStrategy()

    if isinstance(concurrency, tuple):
        is_int_pair = (
            len(concurrency) == 2
            and isinstance(concurrency[0], int)
            and isinstance(concurrency[1], int)
        )
        if not is_int_pair:
            raise ValueError(
                "``concurrency`` is expected to be set as a tuple of "
                f"integers, but got: {concurrency}."
            )
        if not is_callable_class:
            raise ValueError(
                "``concurrency`` is set as a tuple of integers, but ``fn`` "
                f"is not a callable class: {fn}. Use ``concurrency=n`` to "
                "control maximum number of workers to use."
            )
        return ActorPoolStrategy(min_size=concurrency[0], max_size=concurrency[1])

    if isinstance(concurrency, int):
        if is_callable_class:
            return ActorPoolStrategy(size=concurrency)
        return TaskPoolStrategy(size=concurrency)

    raise ValueError(
        "``concurrency`` is expected to be set as an integer or a "
        f"tuple of integers, but got: {concurrency}."
    )
640
+
641
+
642
def capfirst(s: str):
    """Capitalize the first letter of a string.

    Unlike ``str.capitalize``, the remainder of the string is left untouched.

    Args:
        s: String to capitalize

    Returns:
        Capitalized string; the empty string is returned unchanged.
    """
    # s[:1] is safe on the empty string, where the previous s[0] raised
    # IndexError (e.g. via capitalize() on inputs containing "__").
    return s[:1].upper() + s[1:]
652
+
653
+
654
def capitalize(s: str):
    """Capitalize a string, removing '_' and keeping camelcase.

    Args:
        s: String to capitalize

    Returns:
        Capitalized string with no underscores.
    """
    # Split on underscores and capitalize each piece, e.g.
    # "read_parquet" -> "ReadParquet".
    return "".join(map(capfirst, s.split("_")))
664
+
665
+
666
def pandas_df_to_arrow_block(df: "pandas.DataFrame") -> "Block":
    """Convert a pandas DataFrame into an Arrow block plus its metadata.

    Returns a ``(block, metadata)`` tuple; the metadata carries fresh exec
    stats covering the metadata computation.
    """
    from ray.data.block import BlockAccessor, BlockExecStats

    arrow_block = BlockAccessor.for_block(df).to_arrow()
    stats = BlockExecStats.builder()
    metadata = BlockAccessor.for_block(arrow_block).get_metadata(
        exec_stats=stats.build()
    )
    return arrow_block, metadata
675
+
676
+
677
def ndarray_to_block(ndarray: np.ndarray, ctx: DataContext) -> "Block":
    """Wrap a numpy array into a single-column ("data") block plus metadata."""
    from ray.data.block import BlockAccessor, BlockExecStats

    # Propagate the caller's DataContext into this (possibly remote) process.
    DataContext._set_current(ctx)

    timer = BlockExecStats.builder()
    new_block = BlockAccessor.batch_to_block({"data": ndarray})
    meta = BlockAccessor.for_block(new_block).get_metadata(exec_stats=timer.build())
    return new_block, meta
686
+
687
+
688
def get_table_block_metadata(
    table: Union["pyarrow.Table", "pandas.DataFrame"]
) -> "BlockMetadata":
    """Compute the BlockMetadata of an in-memory Arrow table or pandas frame."""
    from ray.data.block import BlockAccessor, BlockExecStats

    timer = BlockExecStats.builder()
    accessor = BlockAccessor.for_block(table)
    return accessor.get_metadata(exec_stats=timer.build())
695
+
696
+
697
def unify_block_metadata_schema(
    metadata: List["BlockMetadata"],
) -> Optional[Union[type, "pyarrow.lib.Schema"]]:
    """For the input list of BlockMetadata, return a unified schema of the
    corresponding blocks. If the metadata have no valid schema, returns None.
    """
    # Some blocks could be empty, in which case we cannot get their schema.
    # TODO(ekl) validate schema is the same across different blocks.
    from ray.data._internal.arrow_ops.transform_pyarrow import unify_schemas

    # Collect schemas from blocks that have one, skipping blocks known to be
    # empty (num_rows == 0).
    schemas_to_unify = [
        m.schema
        for m in metadata
        if m.schema is not None and (m.num_rows is None or m.num_rows > 0)
    ]
    if not schemas_to_unify:
        return None

    # Check valid pyarrow installation before attempting schema unification.
    try:
        import pyarrow as pa
    except ImportError:
        pa = None

    # If the result contains PyArrow schemas, unify them.
    if pa is not None and all(isinstance(s, pa.Schema) for s in schemas_to_unify):
        return unify_schemas(schemas_to_unify)
    # Otherwise, if the resulting schemas are simple types (e.g. int),
    # return the first schema.
    return schemas_to_unify[0]
726
+
727
+
728
def find_partition_index(
    table: Union["pyarrow.Table", "pandas.DataFrame"],
    desired: Tuple[Union[int, float]],
    sort_key: "SortKey",
) -> int:
    """For the given block, find the index where the desired value should be
    added, to maintain sorted order.

    We do this by iterating over each column, starting with the primary sort key,
    and binary searching for the desired value in the column. Each binary search
    shortens the "range" of indices (represented by ``left`` and ``right``, which
    are indices of rows) where the desired value could be inserted.

    Args:
        table: The block to search in.
        desired: A single tuple representing the boundary to partition at.
            ``len(desired)`` must be less than or equal to the number of columns
            being sorted.
        sort_key: The sort key to use for sorting, providing the columns to be
            sorted and their directions.

    Returns:
        The index where the desired value should be inserted to maintain sorted
        order.
    """
    columns = sort_key.get_columns()
    descending = sort_key.get_descending()

    # ``left``/``right`` bracket the candidate insertion range; each sort-key
    # column narrows the range further.
    left, right = 0, len(table)
    for i in range(len(desired)):
        if left == right:
            # Range collapsed to a single position; later columns cannot
            # change the answer.
            return right
        col_name = columns[i]
        # Only the still-candidate row range needs to be materialized.
        col_vals = table[col_name].to_numpy()[left:right]
        desired_val = desired[i]

        # Handle null values - replace them with sentinel values.
        # NOTE(review): presumably NULL_SENTINEL compares after every non-null
        # value so nulls sort last — confirm against its definition.
        if desired_val is None:
            desired_val = NULL_SENTINEL

        # Replace None/NaN values in col_vals with sentinel.
        # NOTE: ``== None`` is an *element-wise* numpy comparison here (hence
        # the E711 suppression), producing a boolean mask.
        null_mask = col_vals == None  # noqa: E711
        if null_mask.any():
            col_vals = col_vals.copy()  # Make a copy to avoid modifying original
            col_vals[null_mask] = NULL_SENTINEL

        prevleft = left
        if descending[i] is True:
            # ``np.searchsorted`` expects the array to be sorted in ascending
            # order, so we pass ``sorter``, which is an array of integer indices
            # that sort ``col_vals`` into ascending order. The returned index
            # is an index into the ascending order of ``col_vals``, so we need
            # to subtract it from ``len(col_vals)`` to get the index in the
            # original descending order of ``col_vals``.
            # (The reversed-range sorter assumes the slice is sorted
            # descending, so reading it backwards yields ascending order.)
            left = prevleft + (
                len(col_vals)
                - np.searchsorted(
                    col_vals,
                    desired_val,
                    side="right",
                    sorter=np.arange(len(col_vals) - 1, -1, -1),
                )
            )
            right = prevleft + (
                len(col_vals)
                - np.searchsorted(
                    col_vals,
                    desired_val,
                    side="left",
                    sorter=np.arange(len(col_vals) - 1, -1, -1),
                )
            )
        else:
            left = prevleft + np.searchsorted(col_vals, desired_val, side="left")
            right = prevleft + np.searchsorted(col_vals, desired_val, side="right")
    # All sort columns tied: insert after the run of equal rows when the
    # primary key is descending, before it when ascending.
    return right if descending[0] is True else left
804
+
805
+
806
def find_partitions(
    table: Union["pyarrow.Table", "pandas.DataFrame"],
    boundaries: List[Tuple[Union[int, float]]],
    sort_key: "SortKey",
):
    """Split a sorted block into ``len(boundaries) + 1`` contiguous slices.

    For each boundary value, count the number of items that are less
    than it. Since the block is sorted, these counts partition the items
    such that boundaries[i] <= x < boundaries[i + 1] for each x in
    partition[i]. If `descending` is true, `boundaries` would also be
    in descending order and we only need to count the number of items
    *greater than* the boundary value instead.
    """
    cut_points = [
        find_partition_index(table, boundary, sort_key) for boundary in boundaries
    ]

    slices = []
    start = 0
    for stop in cut_points:
        slices.append(table[start:stop])
        start = stop
    slices.append(table[start:])
    return slices
829
+
830
+
831
def get_attribute_from_class_name(class_name: str) -> Any:
    """Get Python attribute from the provided class name.

    The caller needs to make sure the provided class name includes
    full module name, and can be imported successfully.
    """
    from importlib import import_module

    parts = class_name.split(".")
    if len(parts) < 2:
        raise ValueError(f"Cannot create object from {class_name}.")

    module_path, attribute = ".".join(parts[:-1]), parts[-1]
    return getattr(import_module(module_path), attribute)
846
+
847
+
848
+ T = TypeVar("T")
849
+ U = TypeVar("U")
850
+
851
+
852
+ class _InterruptibleQueue(Queue):
853
+ """Extension of Python's `queue.Queue` providing ability to get interrupt its
854
+ method callers in other threads"""
855
+
856
+ INTERRUPTION_CHECK_FREQUENCY_SEC = 0.5
857
+
858
+ def __init__(
859
+ self, max_size: int, interrupted_event: Optional[threading.Event] = None
860
+ ):
861
+ super().__init__(maxsize=max_size)
862
+ self._interrupted_event = interrupted_event or threading.Event()
863
+
864
+ def get(self, block=True, timeout=None):
865
+ if not block or timeout is not None:
866
+ return super().get(block, timeout)
867
+
868
+ # In case when the call is blocking and no timeout is specified (ie blocking
869
+ # indefinitely) we apply the following protocol to make it interruptible:
870
+ #
871
+ # 1. `Queue.get` is invoked w/ 500ms timeout
872
+ # 2. `Empty` exception is intercepted (will be raised upon timeout elapsing)
873
+ # 3. If interrupted flag is set `InterruptedError` is raised
874
+ # 4. Otherwise, protocol retried (until interrupted or queue
875
+ # becoming non-empty)
876
+ while True:
877
+ if self._interrupted_event.is_set():
878
+ raise InterruptedError()
879
+
880
+ try:
881
+ return super().get(
882
+ block=True, timeout=self.INTERRUPTION_CHECK_FREQUENCY_SEC
883
+ )
884
+ except Empty:
885
+ pass
886
+
887
+ def put(self, item, block=True, timeout=None):
888
+ if not block or timeout is not None:
889
+ super().put(item, block, timeout)
890
+ return
891
+
892
+ # In case when the call is blocking and no timeout is specified (ie blocking
893
+ # indefinitely) we apply the following protocol to make it interruptible:
894
+ #
895
+ # 1. `Queue.pet` is invoked w/ 500ms timeout
896
+ # 2. `Full` exception is intercepted (will be raised upon timeout elapsing)
897
+ # 3. If interrupted flag is set `InterruptedError` is raised
898
+ # 4. Otherwise, protocol retried (until interrupted or queue
899
+ # becomes non-full)
900
+ while True:
901
+ if self._interrupted_event.is_set():
902
+ raise InterruptedError()
903
+
904
+ try:
905
+ super().put(
906
+ item, block=True, timeout=self.INTERRUPTION_CHECK_FREQUENCY_SEC
907
+ )
908
+ return
909
+ except Full:
910
+ pass
911
+
912
+
913
def make_async_gen(
    base_iterator: Iterator[T],
    fn: Callable[[Iterator[T]], Iterator[U]],
    num_workers: int = 1,
    queue_buffer_size: int = 2,
) -> Generator[U, None, None]:
    """Returns a generator (iterator) mapping items from the
    provided iterator applying provided transformation in parallel (using a
    thread-pool).

    NOTE: Even though the mapping is performed in parallel across N
    threads, this method provides crucial guarantee of preserving the
    ordering of the source iterator, ie that

        iterator = [A1, A2, ... An]
        mapped iterator = [map(A1), map(A2), ..., map(An)]

    Preserving ordering is crucial to eliminate non-determinism in producing
    content of the blocks.

    Args:
        base_iterator: Iterator yielding elements to map
        fn: Transformation to apply to each element
        num_workers: The number of threads to use in the threadpool (defaults to 1)
        queue_buffer_size: Number of objects to be buffered in its input/output
            queues (per queue; defaults to 2). Total number of objects held
            in memory could be calculated as:

                num_workers * queue_buffer_size * 2 (input and output)

    Returns:
        A generator (iterator) of the elements corresponding to the source
        elements mapped by provided transformation (while *preserving the ordering*)
    """
    # NOTE: The docstring above must be the function's *first* statement; it
    # was previously placed after the ``gen_id`` assignment, which left
    # ``make_async_gen.__doc__`` unset.

    # Random id used to label this generator's worker threads.
    gen_id = random.randint(0, 2**31 - 1)

    if num_workers < 1:
        raise ValueError("Size of threadpool must be at least 1.")

    # To apply transformations to elements in parallel *and* preserve the ordering
    # following invariants are established:
    #   - Every worker is handled by standalone thread
    #   - Every worker is assigned an input and an output queue
    #
    # And following protocol is implemented:
    #   - Filling worker traverses input iterator round-robin'ing elements across
    #     the input queues (in order!)
    #   - Transforming workers traverse respective input queue in-order: de-queueing
    #     element, applying transformation and enqueuing the result into the output
    #     queue
    #   - Generator (returned from this method) traverses output queues (in the same
    #     order as input queues) dequeues 1 mapped element at a time from each output
    #     queue and yields it
    #
    # Signal handler used to interrupt workers when terminating
    interrupted_event = threading.Event()

    input_queues = [
        _InterruptibleQueue(queue_buffer_size, interrupted_event)
        for _ in range(num_workers)
    ]
    output_queues = [
        _InterruptibleQueue(queue_buffer_size, interrupted_event)
        for _ in range(num_workers)
    ]

    # Filling worker
    def _run_filling_worker():
        try:
            # First, round-robin elements from the iterator into
            # corresponding input queues (one by one)
            for idx, item in enumerate(base_iterator):
                input_queues[idx % num_workers].put(item)

            # Enqueue sentinel objects to signal end of the line
            for idx in range(num_workers):
                input_queues[idx].put(SENTINEL)

        except InterruptedError:
            pass

        except Exception as e:
            logger.warning("Caught exception in filling worker!", exc_info=e)
            # In case of filling worker encountering an exception we have to propagate
            # it back to the (main) iterating thread. To achieve that we're traversing
            # output queues *backwards* relative to the order of iterator-thread such
            # that they are more likely to meet w/in a single iteration.
            for output_queue in reversed(output_queues):
                output_queue.put(e)

    # Transforming worker
    def _run_transforming_worker(worker_id: int):
        input_queue = input_queues[worker_id]
        output_queue = output_queues[worker_id]

        try:
            # Create iterator draining the queue, until it receives sentinel
            #
            # NOTE: `queue.get` is blocking!
            input_queue_iter = iter(input_queue.get, SENTINEL)

            mapped_iter = fn(input_queue_iter)
            for result in mapped_iter:
                # Enqueue result of the transformation
                output_queue.put(result)

            # Enqueue sentinel (to signal that transformations are completed)
            output_queue.put(SENTINEL)

        except InterruptedError:
            pass

        except Exception as e:
            logger.warning("Caught exception in transforming worker!", exc_info=e)
            # NOTE: In this case we simply enqueue the exception rather than
            # interrupting
            output_queue.put(e)

    # Start workers threads
    filling_worker_thread = threading.Thread(
        target=_run_filling_worker,
        name=f"map_tp_filling_worker-{gen_id}",
        daemon=True,
    )
    filling_worker_thread.start()

    transforming_worker_threads = [
        threading.Thread(
            target=_run_transforming_worker,
            name=f"map_tp_transforming_worker-{gen_id}-{worker_idx}",
            args=(worker_idx,),
            daemon=True,
        )
        for worker_idx in range(num_workers)
    ]

    for t in transforming_worker_threads:
        t.start()

    # Use main thread to yield output batches
    try:
        # Keep track of remaining non-empty output queues
        remaining_output_queues = output_queues

        while len(remaining_output_queues) > 0:
            # To provide deterministic ordering of the produced iterator we rely
            # on the following invariants:
            #
            #   - Elements from the original iterator are round-robin'd into
            #     input queues (in order)
            #   - Individual workers drain their respective input queues populating
            #     output queues with the results of applying transformation to the
            #     original item (and hence preserving original ordering of the input
            #     queue)
            #   - To yield from the generator output queues are traversed in the same
            #     order and one single element is dequeued (in a blocking way!) at a
            #     time from every individual output queue
            #
            non_empty_queues = []
            empty_queues = []

            # At every iteration only remaining non-empty queues
            # are traversed (to prevent blocking on exhausted queue)
            for output_queue in remaining_output_queues:
                # NOTE: This is blocking!
                item = output_queue.get()

                if isinstance(item, Exception):
                    raise item

                if item is SENTINEL:
                    empty_queues.append(output_queue)
                else:
                    non_empty_queues.append(output_queue)
                    yield item

            # Queues must only be exhausted from the tail of the round-robin
            # order; anything else indicates an ordering violation.
            assert (
                non_empty_queues + empty_queues == remaining_output_queues
            ), "Exhausted non-trailing queue!"

            remaining_output_queues = non_empty_queues

    finally:
        # Set flag to interrupt workers (to make sure no dangling
        # threads holding the objects are left behind)
        #
        # NOTE: Interrupted event is set to interrupt the running threads
        # that might be blocked otherwise waiting on inputs from respective
        # queues. However, even though we're interrupting the threads we can't
        # guarantee that threads will be interrupted in time (as this is
        # dependent on Python's GC finalizer to close the generator by raising
        # `GeneratorExit`) and hence we can't join on either filling or
        # transforming workers.
        interrupted_event.set()
1109
+
1110
+
1111
def call_with_retry(
    f: Callable[[], Any],
    description: str,
    *,
    match: Optional[List[str]] = None,
    max_attempts: int = 10,
    max_backoff_s: int = 32,
) -> Any:
    """Retry a function with exponential backoff.

    Args:
        f: The function to retry.
        description: An imperative description of the function being retried. For
            example, "open the file".
        match: A list of strings to match in the exception message. If ``None``, any
            error is retried.
        max_attempts: The maximum number of attempts to retry.
        max_backoff_s: The maximum number of seconds to backoff.

    Returns:
        The return value of the first successful ``f()`` call.

    Raises:
        Exception: The last exception raised by ``f`` once attempts are
            exhausted, or immediately if the error doesn't match ``match``.
    """
    assert max_attempts >= 1, f"`max_attempts` must be positive. Got {max_attempts}."

    for i in range(max_attempts):
        try:
            return f()
        except Exception as e:
            # An error is retryable when no filter is given, or when its
            # message contains any of the provided substrings.
            is_retryable = match is None or any(pattern in str(e) for pattern in match)
            if is_retryable and i + 1 < max_attempts:
                # Retry with binary exponential backoff with random jitter.
                backoff = min((2 ** (i + 1)), max_backoff_s) * random.random()
                logger.debug(
                    f"Retrying {i+1} attempts to {description} after {backoff} seconds."
                )
                time.sleep(backoff)
            else:
                raise e from None
1148
+
1149
+
1150
def iterate_with_retry(
    iterable_factory: Callable[[], Iterable],
    description: str,
    *,
    match: Optional[List[str]] = None,
    max_attempts: int = 10,
    max_backoff_s: int = 32,
) -> Any:
    """Iterate through an iterable with retries.

    If the iterable raises an exception, this function recreates and re-iterates
    through the iterable, while skipping the items that have already been yielded.

    Args:
        iterable_factory: A no-argument function that creates the iterable.
        description: An imperative description of the function being retried. For
            example, "open the file".
        match: A list of strings to match in the exception message. If ``None``, any
            error is retried.
        max_attempts: The maximum number of attempts to retry.
        max_backoff_s: The maximum number of seconds to backoff.
    """
    assert max_attempts >= 1, f"`max_attempts` must be positive. Got {max_attempts}."

    num_items_yielded = 0
    for attempt in range(max_attempts):
        try:
            iterable = iterable_factory()
            for item_index, item in enumerate(iterable):
                if item_index < num_items_yielded:
                    # Skip items that have already been yielded by a previous
                    # (failed) attempt.
                    continue

                num_items_yielded += 1
                yield item
            return
        except Exception as e:
            # An error is retryable when no filter is given, or when its
            # message contains any of the provided substrings.
            is_retryable = match is None or any(pattern in str(e) for pattern in match)
            if is_retryable and attempt + 1 < max_attempts:
                # Retry with binary exponential backoff with random jitter.
                backoff = min((2 ** (attempt + 1)), max_backoff_s) * random.random()
                logger.debug(
                    f"Retrying {attempt+1} attempts to {description} "
                    f"after {backoff} seconds."
                )
                time.sleep(backoff)
            else:
                raise e from None
1200
+
1201
+
1202
def create_dataset_tag(dataset_name: Optional[str], *args):
    """Build a tag: the dataset name (or ``"dataset"`` when ``None``) with
    each extra argument appended as an underscore-separated suffix."""
    base = dataset_name or "dataset"
    return base + "".join(f"_{arg}" for arg in args)
1207
+
1208
+
1209
def convert_bytes_to_human_readable_str(num_bytes: int) -> str:
    """Render a byte count as a rounded decimal GB/MB/KB string.

    Counts below 1 MB are always expressed in KB, so sub-KB values render
    as "0KB" (matching the original behavior).
    """
    if num_bytes >= 1e9:
        return f"{round(num_bytes / 1e9)}GB"
    if num_bytes >= 1e6:
        return f"{round(num_bytes / 1e6)}MB"
    return f"{round(num_bytes / 1e3)}KB"
1217
+
1218
+
1219
+ def _validate_rows_per_file_args(
1220
+ *, num_rows_per_file: Optional[int] = None, min_rows_per_file: Optional[int] = None
1221
+ ) -> Optional[int]:
1222
+ """Helper method to validate and handle rows per file arguments.
1223
+
1224
+ Args:
1225
+ num_rows_per_file: Deprecated parameter for number of rows per file
1226
+ min_rows_per_file: New parameter for minimum rows per file
1227
+
1228
+ Returns:
1229
+ The effective min_rows_per_file value to use
1230
+ """
1231
+ if num_rows_per_file is not None:
1232
+ import warnings
1233
+
1234
+ warnings.warn(
1235
+ "`num_rows_per_file` is deprecated and will be removed in a future release. "
1236
+ "Use `min_rows_per_file` instead.",
1237
+ DeprecationWarning,
1238
+ stacklevel=3,
1239
+ )
1240
+ if min_rows_per_file is not None:
1241
+ raise ValueError(
1242
+ "Cannot specify both `num_rows_per_file` and `min_rows_per_file`. "
1243
+ "Use `min_rows_per_file` as `num_rows_per_file` is deprecated."
1244
+ )
1245
+ return num_rows_per_file
1246
+ return min_rows_per_file
1247
+
1248
+
1249
def is_nan(value):
    """Return whether ``value`` is a float NaN.

    Non-float inputs (strings, None, ...) are never considered NaN; a
    TypeError from the numpy check is treated the same way.
    """
    if not isinstance(value, float):
        return False
    try:
        return np.isnan(value)
    except TypeError:
        return False
1254
+
1255
+
1256
def keys_equal(keys1, keys2):
    """Compare two key sequences element-wise, treating NaN as equal to NaN
    (unlike the ``==`` operator, for which NaN != NaN)."""
    if len(keys1) != len(keys2):
        return False
    return all(
        (is_nan(lhs) and is_nan(rhs)) or lhs == rhs
        for lhs, rhs in zip(keys1, keys2)
    )
.venv/lib/python3.11/site-packages/ray/data/datasource/__init__.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from ray.data._internal.datasource.sql_datasource import Connection
2
+ from ray.data.datasource.datasink import (
3
+ Datasink,
4
+ DummyOutputDatasink,
5
+ WriteResult,
6
+ WriteReturnType,
7
+ )
8
+ from ray.data.datasource.datasource import (
9
+ Datasource,
10
+ RandomIntRowDatasource,
11
+ Reader,
12
+ ReadTask,
13
+ )
14
+ from ray.data.datasource.file_based_datasource import (
15
+ FileBasedDatasource,
16
+ FileShuffleConfig,
17
+ _S3FileSystemWrapper,
18
+ )
19
+ from ray.data.datasource.file_datasink import (
20
+ BlockBasedFileDatasink,
21
+ RowBasedFileDatasink,
22
+ )
23
+ from ray.data.datasource.file_meta_provider import (
24
+ BaseFileMetadataProvider,
25
+ DefaultFileMetadataProvider,
26
+ FastFileMetadataProvider,
27
+ FileMetadataProvider,
28
+ )
29
+ from ray.data.datasource.filename_provider import FilenameProvider
30
+ from ray.data.datasource.parquet_meta_provider import ParquetMetadataProvider
31
+ from ray.data.datasource.partitioning import (
32
+ Partitioning,
33
+ PartitionStyle,
34
+ PathPartitionFilter,
35
+ PathPartitionParser,
36
+ )
37
+
38
# Note: HuggingFaceDatasource should NOT be imported here, because
# we want to only import the Hugging Face datasets library when we use
# ray.data.from_huggingface() or HuggingFaceDatasource() directly.
# NOTE(review): "DeltaSharingDatasource" is listed in __all__ below but is not
# imported anywhere in this module, so `from ray.data.datasource import *`
# would raise AttributeError on it — verify against the upstream package.
__all__ = [
    "BaseFileMetadataProvider",
    "BlockBasedFileDatasink",
    "Connection",
    "Datasink",
    "Datasource",
    "DeltaSharingDatasource",
    "DefaultFileMetadataProvider",
    "DummyOutputDatasink",
    "FastFileMetadataProvider",
    "FileBasedDatasource",
    "FileShuffleConfig",
    "FileMetadataProvider",
    "FilenameProvider",
    "ParquetMetadataProvider",
    "PartitionStyle",
    "PathPartitionFilter",
    "PathPartitionParser",
    "Partitioning",
    "RandomIntRowDatasource",
    "ReadTask",
    "Reader",
    "RowBasedFileDatasink",
    "_S3FileSystemWrapper",
    "WriteResult",
    "WriteReturnType",
]
.venv/lib/python3.11/site-packages/ray/data/datasource/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (1.86 kB). View file
 
.venv/lib/python3.11/site-packages/ray/data/datasource/__pycache__/datasink.cpython-311.pyc ADDED
Binary file (8.5 kB). View file
 
.venv/lib/python3.11/site-packages/ray/data/datasource/__pycache__/datasource.cpython-311.pyc ADDED
Binary file (13.3 kB). View file
 
.venv/lib/python3.11/site-packages/ray/data/datasource/__pycache__/file_based_datasource.cpython-311.pyc ADDED
Binary file (26.3 kB). View file
 
.venv/lib/python3.11/site-packages/ray/data/datasource/__pycache__/file_datasink.cpython-311.pyc ADDED
Binary file (14.1 kB). View file
 
.venv/lib/python3.11/site-packages/ray/data/datasource/__pycache__/file_meta_provider.cpython-311.pyc ADDED
Binary file (21.5 kB). View file
 
.venv/lib/python3.11/site-packages/ray/data/datasource/__pycache__/filename_provider.cpython-311.pyc ADDED
Binary file (6.3 kB). View file
 
.venv/lib/python3.11/site-packages/ray/data/datasource/__pycache__/parquet_meta_provider.cpython-311.pyc ADDED
Binary file (11.6 kB). View file
 
.venv/lib/python3.11/site-packages/ray/data/datasource/__pycache__/partitioning.cpython-311.pyc ADDED
Binary file (23.6 kB). View file
 
.venv/lib/python3.11/site-packages/ray/data/datasource/__pycache__/path_util.cpython-311.pyc ADDED
Binary file (9.25 kB). View file
 
.venv/lib/python3.11/site-packages/ray/data/datasource/file_datasink.py ADDED
@@ -0,0 +1,266 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import posixpath
3
+ from typing import TYPE_CHECKING, Any, Dict, Iterable, Optional
4
+ from urllib.parse import urlparse
5
+
6
+ from ray._private.utils import _add_creatable_buckets_param_if_s3_uri
7
+ from ray.data._internal.delegating_block_builder import DelegatingBlockBuilder
8
+ from ray.data._internal.execution.interfaces import TaskContext
9
+ from ray.data._internal.util import _is_local_scheme, call_with_retry
10
+ from ray.data.block import Block, BlockAccessor
11
+ from ray.data.context import DataContext
12
+ from ray.data.datasource.datasink import Datasink, WriteResult
13
+ from ray.data.datasource.filename_provider import (
14
+ FilenameProvider,
15
+ _DefaultFilenameProvider,
16
+ )
17
+ from ray.data.datasource.path_util import _resolve_paths_and_filesystem
18
+ from ray.util.annotations import DeveloperAPI
19
+
20
+ if TYPE_CHECKING:
21
+ import pyarrow
22
+
23
+ logger = logging.getLogger(__name__)
24
+
25
+
26
+ WRITE_FILE_MAX_ATTEMPTS = 10
27
+ WRITE_FILE_RETRY_MAX_BACKOFF_SECONDS = 32
28
+
29
+
30
class _FileDatasink(Datasink[None]):
    """Base datasink that writes blocks as files under a directory path.

    Handles path/filesystem resolution, optional directory creation, and
    cleanup of an empty output directory; subclasses implement
    ``write_block``.
    """

    def __init__(
        self,
        path: str,
        *,
        filesystem: Optional["pyarrow.fs.FileSystem"] = None,
        try_create_dir: bool = True,
        open_stream_args: Optional[Dict[str, Any]] = None,
        filename_provider: Optional[FilenameProvider] = None,
        dataset_uuid: Optional[str] = None,
        file_format: Optional[str] = None,
    ):
        """Initialize this datasink.

        Args:
            path: The folder to write files to.
            filesystem: The filesystem to write files to. If not provided, the
                filesystem is inferred from the path.
            try_create_dir: Whether to create the directory to write files to.
            open_stream_args: Arguments to pass to ``filesystem.open_output_stream``.
            filename_provider: A :class:`ray.data.datasource.FilenameProvider` that
                generates filenames for each row or block.
            dataset_uuid: The UUID of the dataset being written. If specified, it's
                included in the filename.
            file_format: The file extension. If specified, files are written with this
                extension.
        """
        if open_stream_args is None:
            open_stream_args = {}

        if filename_provider is None:
            filename_provider = _DefaultFilenameProvider(
                dataset_uuid=dataset_uuid, file_format=file_format
            )

        # Keep the original user-supplied path: it is needed below to decide
        # whether writes can be distributed (local-scheme paths cannot).
        self.unresolved_path = path
        paths, self.filesystem = _resolve_paths_and_filesystem(path, filesystem)
        assert len(paths) == 1, len(paths)
        self.path = paths[0]

        self.try_create_dir = try_create_dir
        self.open_stream_args = open_stream_args
        self.filename_provider = filename_provider
        self.dataset_uuid = dataset_uuid
        self.file_format = file_format

        # Set by on_write_start; used by on_write_complete to decide whether
        # an empty output directory should be removed again.
        self.has_created_dir = False

    def open_output_stream(self, path: str) -> "pyarrow.NativeFile":
        # Central place for opening output files so subclasses inherit
        # ``open_stream_args`` (e.g. compression settings).
        return self.filesystem.open_output_stream(path, **self.open_stream_args)

    def on_write_start(self) -> None:
        self.has_created_dir = self._create_dir(self.path)

    def _create_dir(self, dest) -> bool:
        """Create a directory to write files to.

        If ``try_create_dir`` is ``False``, this method is a no-op.

        Returns:
            True iff a directory was actually created by this call.
        """
        from pyarrow.fs import FileType

        # We should skip creating directories in s3 unless the user specifically
        # overrides this behavior. PyArrow's s3fs implementation for create_dir
        # will attempt to check if the parent directory exists before trying to
        # create the directory (with recursive=True it will try to do this to
        # all of the directories until the root of the bucket). An IAM Policy that
        # restricts access to a subset of prefixes within the bucket might cause
        # the creation of the directory to fail even if the permissions should
        # allow the data can be written to the specified path. For example if a
        # a policy only allows users to write blobs prefixed with s3://bucket/foo
        # a call to create_dir for s3://bucket/foo/bar will fail even though it
        # should not.
        parsed_uri = urlparse(dest)
        is_s3_uri = parsed_uri.scheme == "s3"
        skip_create_dir_for_s3 = (
            is_s3_uri and not DataContext.get_current().s3_try_create_dir
        )

        if self.try_create_dir and not skip_create_dir_for_s3:
            if self.filesystem.get_file_info(dest).type is FileType.NotFound:
                # Arrow's S3FileSystem doesn't allow creating buckets by default, so we
                # add a query arg enabling bucket creation if an S3 URI is provided.
                tmp = _add_creatable_buckets_param_if_s3_uri(dest)
                self.filesystem.create_dir(tmp, recursive=True)
                return True

        return False

    def write(
        self,
        blocks: Iterable[Block],
        ctx: TaskContext,
    ) -> None:
        # Coalesce all input blocks of this task into a single block before
        # delegating to the subclass' write_block.
        builder = DelegatingBlockBuilder()
        for block in blocks:
            builder.add_block(block)
        block = builder.build()
        block_accessor = BlockAccessor.for_block(block)

        if block_accessor.num_rows() == 0:
            # Nothing to write; avoid creating an empty output file.
            logger.warning(f"Skipped writing empty block to {self.path}")
            return

        self.write_block(block_accessor, 0, ctx)

    def write_block(self, block: BlockAccessor, block_index: int, ctx: TaskContext):
        # Subclass hook: write one (coalesced) block to the destination.
        raise NotImplementedError

    def on_write_complete(self, write_result: WriteResult[None]):
        # If no rows were written, we can delete the directory.
        if self.has_created_dir and write_result.num_rows == 0:
            self.filesystem.delete_dir(self.path)

    @property
    def supports_distributed_writes(self) -> bool:
        # Writes to a node-local path can't be distributed across the cluster.
        return not _is_local_scheme(self.unresolved_path)
146
+
147
+
148
@DeveloperAPI
class RowBasedFileDatasink(_FileDatasink):
    """A datasink that writes one row to each file.

    Subclasses must implement ``write_row_to_file`` and call the superclass constructor.

    Examples:
        .. testcode::

            import io
            from typing import Any, Dict

            import pyarrow
            from PIL import Image

            from ray.data.datasource import RowBasedFileDatasink

            class ImageDatasink(RowBasedFileDatasink):
                def __init__(self, path: str, *, column: str, file_format: str = "png"):
                    super().__init__(path, file_format=file_format)
                    self._file_format = file_format
                    self._column = column

                def write_row_to_file(self, row: Dict[str, Any], file: "pyarrow.NativeFile"):
                    image = Image.fromarray(row[self._column])
                    buffer = io.BytesIO()
                    image.save(buffer, format=self._file_format)
                    file.write(buffer.getvalue())
    """  # noqa: E501

    def write_row_to_file(self, row: Dict[str, Any], file: "pyarrow.NativeFile"):
        """Write a row to a file.

        Args:
            row: The row to write.
            file: The file to write the row to.
        """
        raise NotImplementedError

    def write_block(self, block: BlockAccessor, block_index: int, ctx: TaskContext):
        # One output file per row; the FilenameProvider makes names unique per
        # (task, block, row).
        for row_index, row in enumerate(block.iter_rows(public_row_format=False)):
            filename = self.filename_provider.get_filename_for_row(
                row, ctx.task_idx, block_index, row_index
            )
            write_path = posixpath.join(self.path, filename)

            def write_row_to_path(row, write_path):
                with self.open_output_stream(write_path) as file:
                    self.write_row_to_file(row, file)

            logger.debug(f"Writing {write_path} file.")
            # NOTE: ``row``/``write_path`` are bound through lambda default
            # arguments so each retry callable captures *this* iteration's
            # values rather than the loop variables.
            call_with_retry(
                lambda row=row, write_path=write_path: write_row_to_path(
                    row, write_path
                ),
                description=f"write '{write_path}'",
                match=DataContext.get_current().retried_io_errors,
                max_attempts=WRITE_FILE_MAX_ATTEMPTS,
                max_backoff_s=WRITE_FILE_RETRY_MAX_BACKOFF_SECONDS,
            )
208
+
209
+
210
@DeveloperAPI
class BlockBasedFileDatasink(_FileDatasink):
    """A datasink that writes multiple rows to each file.

    Subclasses must implement ``write_block_to_file`` and call the superclass
    constructor.

    Examples:
        .. testcode::

            class CSVDatasink(BlockBasedFileDatasink):
                def __init__(self, path: str):
                    super().__init__(path, file_format="csv")

                def write_block_to_file(self, block: BlockAccessor, file: "pyarrow.NativeFile"):
                    from pyarrow import csv
                    csv.write_csv(block.to_arrow(), file)
    """  # noqa: E501

    def __init__(
        self, path, *, min_rows_per_file: Optional[int] = None, **file_datasink_kwargs
    ):
        super().__init__(path, **file_datasink_kwargs)

        # Exposed through the ``min_rows_per_write`` property below.
        self._min_rows_per_file = min_rows_per_file

    def write_block_to_file(self, block: BlockAccessor, file: "pyarrow.NativeFile"):
        """Write a block of data to a file.

        Args:
            block: The block to write.
            file: The file to write the block to.
        """
        raise NotImplementedError

    def write_block(self, block: BlockAccessor, block_index: int, ctx: TaskContext):
        filename = self.filename_provider.get_filename_for_block(
            block, ctx.task_idx, block_index
        )
        write_path = posixpath.join(self.path, filename)

        def write_block_to_path():
            with self.open_output_stream(write_path) as file:
                self.write_block_to_file(block, file)

        logger.debug(f"Writing {write_path} file.")
        # Writes are retried on I/O errors matching
        # DataContext.retried_io_errors.
        call_with_retry(
            write_block_to_path,
            description=f"write '{write_path}'",
            match=DataContext.get_current().retried_io_errors,
            max_attempts=WRITE_FILE_MAX_ATTEMPTS,
            max_backoff_s=WRITE_FILE_RETRY_MAX_BACKOFF_SECONDS,
        )

    @property
    def min_rows_per_write(self) -> Optional[int]:
        # NOTE(review): presumably consumed by the write planner to coalesce
        # blocks up to this row count before writing — confirm at call sites.
        return self._min_rows_per_file
.venv/lib/python3.11/site-packages/ray/data/datasource/partitioning.py ADDED
@@ -0,0 +1,456 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import posixpath
2
+ from dataclasses import dataclass
3
+ from enum import Enum
4
+ from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Type, Union
5
+
6
+ from ray.util.annotations import DeveloperAPI, PublicAPI
7
+
8
+ if TYPE_CHECKING:
9
+ import pyarrow
10
+
11
+
12
+ PartitionDataType = Type[Union[int, float, str, bool]]
13
+
14
+
15
@DeveloperAPI
class PartitionStyle(str, Enum):
    """Supported dataset partition styles.

    Inherits from `str` to simplify plain text serialization/deserialization.

    Examples:
        >>> # Serialize to JSON text.
        >>> json.dumps(PartitionStyle.HIVE)  # doctest: +SKIP
        '"hive"'

        >>> # Deserialize from JSON text.
        >>> PartitionStyle(json.loads('"hive"'))  # doctest: +SKIP
        <PartitionStyle.HIVE: 'hive'>
    """

    # Key/value directory names, e.g. "year=2022/month=01".
    HIVE = "hive"
    # Value-only directory names, e.g. "2022/01"; requires explicit field names.
    DIRECTORY = "dir"
33
+
34
+
35
@DeveloperAPI
@dataclass
class Partitioning:
    """Partition scheme used to describe path-based partitions.

    Path-based partition formats embed all partition keys and values directly in
    their dataset file paths.

    For example, to read a dataset with
    `Hive-style partitions <https://athena.guide/articles/hive-style-partitioning>`_:

    >>> import ray
    >>> from ray.data.datasource.partitioning import Partitioning
    >>> ds = ray.data.read_csv(
    ...     "s3://anonymous@ray-example-data/iris.csv",
    ...     partitioning=Partitioning("hive"),
    ... )

    Instead, if your files are arranged in a directory structure such as:

    .. code::

        root/dog/dog_0.jpeg
        root/dog/dog_1.jpeg
        ...

        root/cat/cat_0.jpeg
        root/cat/cat_1.jpeg
        ...

    Then you can use directory-based partitioning:

    >>> import ray
    >>> from ray.data.datasource.partitioning import Partitioning
    >>> root = "s3://anonymous@air-example-data/cifar-10/images"
    >>> partitioning = Partitioning("dir", field_names=["class"], base_dir=root)
    >>> ds = ray.data.read_images(root, partitioning=partitioning)
    """

    #: The partition style - may be either HIVE or DIRECTORY.
    style: PartitionStyle
    #: "/"-delimited base directory that all partitioned paths should
    #: exist under (exclusive). File paths either outside of, or at the first
    #: level of, this directory will be considered unpartitioned. Specify
    #: `None` or an empty string to search for partitions in all file path
    #: directories.
    base_dir: Optional[str] = None
    #: The partition key field names (i.e. column names for tabular
    #: datasets). When non-empty, the order and length of partition key
    #: field names must match the order and length of partition values.
    #: Required when parsing DIRECTORY partitioned paths or generating
    #: HIVE partitioned paths.
    field_names: Optional[List[str]] = None
    #: A dictionary that maps partition key names to their desired data type. If not
    #: provided, the data type defaults to string.
    field_types: Optional[Dict[str, PartitionDataType]] = None
    #: Filesystem that will be used for partition path file I/O.
    filesystem: Optional["pyarrow.fs.FileSystem"] = None

    def __post_init__(self):
        # Normalize `None` defaults to empty containers so downstream code can
        # iterate/index without null checks.
        if self.base_dir is None:
            self.base_dir = ""

        if self.field_types is None:
            self.field_types = {}

        # Computed lazily by `_normalize_base_dir` on first property access;
        # resolving a filesystem may perform I/O, so defer it until needed.
        self._normalized_base_dir = None
        self._resolved_filesystem = None

    @property
    def normalized_base_dir(self) -> str:
        """Returns the base directory normalized for compatibility with a filesystem."""
        if self._normalized_base_dir is None:
            self._normalize_base_dir()
        return self._normalized_base_dir

    @property
    def resolved_filesystem(self) -> "pyarrow.fs.FileSystem":
        """Returns the filesystem resolved for compatibility with a base directory."""
        if self._resolved_filesystem is None:
            self._normalize_base_dir()
        return self._resolved_filesystem

    def _normalize_base_dir(self):
        """Normalizes the partition base directory for compatibility with the
        given filesystem.

        This should be called once a filesystem has been resolved to ensure that this
        base directory is correctly discovered at the root of all partitioned file
        paths.
        """
        from ray.data.datasource.path_util import _resolve_paths_and_filesystem

        paths, self._resolved_filesystem = _resolve_paths_and_filesystem(
            self.base_dir,
            self.filesystem,
        )
        assert (
            len(paths) == 1
        ), f"Expected 1 normalized base directory, but found {len(paths)}"
        normalized_base_dir = paths[0]
        # Ensure a trailing "/" so prefix matching against this base dir can't
        # accidentally match a sibling directory sharing the same name prefix.
        if len(normalized_base_dir) and not normalized_base_dir.endswith("/"):
            normalized_base_dir += "/"
        self._normalized_base_dir = normalized_base_dir
139
+
140
+
141
@DeveloperAPI
class PathPartitionParser:
    """Partition parser for path-based partition formats.

    Path-based partition formats embed all partition keys and values directly in
    their dataset file paths.

    Two path partition formats are currently supported - `HIVE` and `DIRECTORY`.

    For `HIVE` Partitioning, all partition directories under the base directory
    will be discovered based on `{key1}={value1}/{key2}={value2}` naming
    conventions. Key/value pairs do not need to be presented in the same
    order across all paths. Directory names nested under the base directory that
    don't follow this naming condition will be considered unpartitioned. If a
    partition filter is defined, then it will be called with an empty input
    dictionary for each unpartitioned file.

    For `DIRECTORY` Partitioning, all directories under the base directory will
    be interpreted as partition values of the form `{value1}/{value2}`. An
    accompanying ordered list of partition field names must also be provided,
    where the order and length of all partition values must match the order and
    length of field names. Files stored directly in the base directory will
    be considered unpartitioned. If a partition filter is defined, then it will
    be called with an empty input dictionary for each unpartitioned file. For
    example, if the base directory is `"foo"`, then `"foo.csv"` and `"foo/bar.csv"`
    would be considered unpartitioned files but `"foo/bar/baz.csv"` would be associated
    with partition `"bar"`. If the base directory is undefined, then `"foo.csv"` would
    be unpartitioned, `"foo/bar.csv"` would be associated with partition `"foo"`, and
    "foo/bar/baz.csv" would be associated with partition `("foo", "bar")`.
    """

    @staticmethod
    def of(
        style: PartitionStyle = PartitionStyle.HIVE,
        base_dir: Optional[str] = None,
        field_names: Optional[List[str]] = None,
        field_types: Optional[Dict[str, PartitionDataType]] = None,
        filesystem: Optional["pyarrow.fs.FileSystem"] = None,
    ) -> "PathPartitionParser":
        """Creates a path-based partition parser using a flattened argument list.

        Args:
            style: The partition style - may be either HIVE or DIRECTORY.
            base_dir: "/"-delimited base directory to start searching for partitions
                (exclusive). File paths outside of this directory will be considered
                unpartitioned. Specify `None` or an empty string to search for
                partitions in all file path directories.
            field_names: The partition key names. Required for DIRECTORY partitioning.
                Optional for HIVE partitioning. When non-empty, the order and length of
                partition key field names must match the order and length of partition
                directories discovered. Partition key field names are not required to
                exist in the dataset schema.
            field_types: A dictionary that maps partition key names to their desired
                data type. If not provided, the data type default to string.
            filesystem: Filesystem that will be used for partition path file I/O.

        Returns:
            The new path-based partition parser.
        """
        scheme = Partitioning(style, base_dir, field_names, field_types, filesystem)
        return PathPartitionParser(scheme)

    def __init__(self, partitioning: Partitioning):
        """Creates a path-based partition parser.

        Args:
            partitioning: The path-based partition scheme. The parser starts
                searching for partitions from this scheme's base directory. File paths
                outside the base directory will be considered unpartitioned. If the
                base directory is `None` or an empty string then this will search for
                partitions in all file path directories. Field names are required for
                DIRECTORY partitioning, and optional for HIVE partitioning. When
                non-empty, the order and length of partition key field names must match
                the order and length of partition directories discovered.

        Raises:
            ValueError: If DIRECTORY partitioning is requested without field
                names, or if the partition style is unsupported.
        """
        style = partitioning.style
        field_names = partitioning.field_names
        # DIRECTORY paths carry values only, so field names are mandatory to
        # know which key each value belongs to.
        if style == PartitionStyle.DIRECTORY and not field_names:
            raise ValueError(
                "Directory partitioning requires a corresponding list of "
                "partition key field names. Please retry your request with one "
                "or more field names specified."
            )
        # Dispatch table from style to the bound parsing method.
        parsers = {
            PartitionStyle.HIVE: self._parse_hive_path,
            PartitionStyle.DIRECTORY: self._parse_dir_path,
        }
        self._parser_fn: Callable[[str], Dict[str, str]] = parsers.get(style)
        if self._parser_fn is None:
            raise ValueError(
                f"Unsupported partition style: {style}. "
                f"Supported styles: {parsers.keys()}"
            )
        self._scheme = partitioning

    def __call__(self, path: str) -> Dict[str, str]:
        """Parses partition keys and values from a single file path.

        Args:
            path: Input file path to parse.

        Returns:
            Dictionary mapping directory partition keys to values from the input file
            path. Returns an empty dictionary for unpartitioned files.
        """
        dir_path = self._dir_path_trim_base(path)
        if dir_path is None:
            # Path lies outside the base directory: treat as unpartitioned.
            return {}
        partitions: Dict[str, str] = self._parser_fn(dir_path)

        # Cast string partition values to their configured data types.
        # NOTE(review): this raises KeyError if `field_types` names a key the
        # parser did not find in the path (e.g. an unpartitioned file under the
        # base dir) — confirm callers guarantee field_types keys are present.
        for field, data_type in self._scheme.field_types.items():
            partitions[field] = _cast_value(partitions[field], data_type)

        return partitions

    @property
    def scheme(self) -> Partitioning:
        """Returns the partitioning for this parser."""
        return self._scheme

    def _dir_path_trim_base(self, path: str) -> Optional[str]:
        """Trims the normalized base directory and returns the directory path.

        Returns None if the path does not start with the normalized base directory.
        Simply returns the directory path if the base directory is undefined.
        """
        if not path.startswith(self._scheme.normalized_base_dir):
            return None
        # Drop the base-dir prefix, then the filename component.
        path = path[len(self._scheme.normalized_base_dir) :]
        return posixpath.dirname(path)

    def _parse_hive_path(self, dir_path: str) -> Dict[str, str]:
        """Hive partition path parser.

        Returns a dictionary mapping partition keys to values given a hive-style
        partition path of the form "{key1}={value1}/{key2}={value2}/..." or an empty
        dictionary for unpartitioned files.
        """
        # Only directory components with exactly one "=" count as partition
        # dirs; all other components are silently ignored.
        dirs = [d for d in dir_path.split("/") if d and (d.count("=") == 1)]
        kv_pairs = [d.split("=") for d in dirs] if dirs else []
        field_names = self._scheme.field_names
        if field_names and kv_pairs:
            # When field names were configured, enforce an exact match in both
            # count and order against the keys discovered in the path.
            if len(kv_pairs) != len(field_names):
                raise ValueError(
                    f"Expected {len(field_names)} partition value(s) but found "
                    f"{len(kv_pairs)}: {kv_pairs}."
                )
            for i, field_name in enumerate(field_names):
                if kv_pairs[i][0] != field_name:
                    raise ValueError(
                        f"Expected partition key {field_name} but found "
                        f"{kv_pairs[i][0]}"
                    )
        return dict(kv_pairs)

    def _parse_dir_path(self, dir_path: str) -> Dict[str, str]:
        """Directory partition path parser.

        Returns a dictionary mapping directory partition keys to values from a
        partition path of the form "{value1}/{value2}/..." or an empty dictionary for
        unpartitioned files.

        Requires a corresponding ordered list of partition key field names to map the
        correct key to each value.
        """
        dirs = [d for d in dir_path.split("/") if d]
        field_names = self._scheme.field_names

        # Partitioned paths must supply exactly one value per configured field.
        if dirs and len(dirs) != len(field_names):
            raise ValueError(
                f"Expected {len(field_names)} partition value(s) but found "
                f"{len(dirs)}: {dirs}."
            )

        if not dirs:
            return {}
        # Positional zip: the i-th directory value maps to the i-th field name.
        # A `None` field name skips (drops) that directory level.
        return {
            field: directory
            for field, directory in zip(field_names, dirs)
            if field is not None
        }
+ }
322
+
323
+
324
@PublicAPI(stability="beta")
class PathPartitionFilter:
    """Partition filter for path-based partition formats.

    Used to explicitly keep or reject files based on a custom filter function that
    takes partition keys and values parsed from the file's path as input.
    """

    @staticmethod
    def of(
        filter_fn: Callable[[Dict[str, str]], bool],
        style: PartitionStyle = PartitionStyle.HIVE,
        base_dir: Optional[str] = None,
        field_names: Optional[List[str]] = None,
        field_types: Optional[Dict[str, PartitionDataType]] = None,
        filesystem: Optional["pyarrow.fs.FileSystem"] = None,
    ) -> "PathPartitionFilter":
        """Creates a path-based partition filter using a flattened argument list.

        Args:
            filter_fn: Callback used to filter partitions. Takes a dictionary mapping
                partition keys to values as input. Unpartitioned files are denoted with
                an empty input dictionary. Returns `True` to read a file for that
                partition or `False` to skip it. Partition keys and values are always
                strings read from the filesystem path. For example, this removes all
                unpartitioned files:

                .. code:: python

                    lambda d: True if d else False

                This raises an assertion error for any unpartitioned file found:

                .. code:: python

                    def do_assert(val, msg):
                        assert val, msg

                    lambda d: do_assert(d, "Expected all files to be partitioned!")

                And this only reads files from January, 2022 partitions:

                .. code:: python

                    lambda d: d["month"] == "January" and d["year"] == "2022"

            style: The partition style - may be either HIVE or DIRECTORY.
            base_dir: "/"-delimited base directory to start searching for partitions
                (exclusive). File paths outside of this directory will be considered
                unpartitioned. Specify `None` or an empty string to search for
                partitions in all file path directories.
            field_names: The partition key names. Required for DIRECTORY partitioning.
                Optional for HIVE partitioning. When non-empty, the order and length of
                partition key field names must match the order and length of partition
                directories discovered. Partition key field names are not required to
                exist in the dataset schema.
            field_types: A dictionary that maps partition key names to their desired
                data type. If not provided, the data type defaults to string.
            filesystem: Filesystem that will be used for partition path file I/O.

        Returns:
            The new path-based partition filter.
        """
        # Build the parser from the flattened scheme arguments, then wrap it.
        scheme = Partitioning(style, base_dir, field_names, field_types, filesystem)
        path_partition_parser = PathPartitionParser(scheme)
        return PathPartitionFilter(path_partition_parser, filter_fn)

    def __init__(
        self,
        path_partition_parser: PathPartitionParser,
        filter_fn: Callable[[Dict[str, str]], bool],
    ):
        """Creates a new path-based partition filter based on a parser.

        Args:
            path_partition_parser: The path-based partition parser.
            filter_fn: Callback used to filter partitions. Takes a dictionary mapping
                partition keys to values as input. Unpartitioned files are denoted with
                an empty input dictionary. Returns `True` to read a file for that
                partition or `False` to skip it. Partition keys and values are always
                strings read from the filesystem path. For example, this removes all
                unpartitioned files:
                ``lambda d: True if d else False``
                This raises an assertion error for any unpartitioned file found:
                ``lambda d: assert d, "Expected all files to be partitioned!"``
                And this only reads files from January, 2022 partitions:
                ``lambda d: d["month"] == "January" and d["year"] == "2022"``
        """
        self._parser = path_partition_parser
        self._filter_fn = filter_fn

    def __call__(self, paths: List[str]) -> List[str]:
        """Returns all paths that pass this partition scheme's partition filter.

        If no partition filter is set, then returns all input paths. If a base
        directory is set, then only paths under this base directory will be parsed
        for partitions. All paths outside of this base directory will automatically
        be considered unpartitioned, and passed into the filter function as empty
        dictionaries.

        Also normalizes the partition base directory for compatibility with the
        given filesystem before applying the filter.

        Args:
            paths: Paths to pass through the partition filter function. All
                paths should be normalized for compatibility with the given
                filesystem.
        Returns:
            List of paths that pass the partition filter, or all paths if no
            partition filter is defined.
        """
        filtered_paths = paths
        # Guard: a `None` filter function passes every path through unchanged.
        if self._filter_fn is not None:
            filtered_paths = [
                path for path in paths if self._filter_fn(self._parser(path))
            ]
        return filtered_paths

    @property
    def parser(self) -> PathPartitionParser:
        """Returns the path partition parser for this filter."""
        return self._parser
+ return self._parser
446
+
447
+
448
+ def _cast_value(value: str, data_type: PartitionDataType) -> Any:
449
+ if data_type is int:
450
+ return int(value)
451
+ elif data_type is float:
452
+ return float(value)
453
+ elif data_type is bool:
454
+ return value.lower() == "true"
455
+ else:
456
+ return value
.venv/lib/python3.11/site-packages/ray/data/datasource/path_util.py ADDED
@@ -0,0 +1,206 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pathlib
2
+ import sys
3
+ import urllib
4
+ from typing import TYPE_CHECKING, List, Optional, Tuple, Union
5
+
6
+ from ray.data._internal.util import _resolve_custom_scheme
7
+
8
+ if TYPE_CHECKING:
9
+ import pyarrow
10
+
11
+
12
+ def _has_file_extension(path: str, extensions: Optional[List[str]]) -> bool:
13
+ """Check if a path has a file extension in the provided list.
14
+
15
+ Examples:
16
+ >>> _has_file_extension("foo.csv", ["csv"])
17
+ True
18
+ >>> _has_file_extension("foo.CSV", ["csv"])
19
+ True
20
+ >>> _has_file_extension("foo.csv", ["json", "jsonl"])
21
+ False
22
+ >>> _has_file_extension("foo.csv", None)
23
+ True
24
+
25
+ Args:
26
+ path: The path to check.
27
+ extensions: A list of extensions to check against. If `None`, any extension is
28
+ considered valid.
29
+ """
30
+ assert extensions is None or isinstance(extensions, list), type(extensions)
31
+
32
+ if extensions is None:
33
+ return True
34
+
35
+ # The user-specified extensions don't contain a leading dot, so we add it here.
36
+ extensions = [f".{ext.lower()}" for ext in extensions]
37
+ return any(path.lower().endswith(ext) for ext in extensions)
38
+
39
+
40
def _resolve_paths_and_filesystem(
    paths: Union[str, List[str]],
    filesystem: "pyarrow.fs.FileSystem" = None,
) -> Tuple[List[str], "pyarrow.fs.FileSystem"]:
    """
    Resolves and normalizes all provided paths, infers a filesystem from the
    paths and ensures that all paths use the same filesystem.

    Args:
        paths: A single file/directory path or a list of file/directory paths.
            A list of paths can contain both files and directories.
        filesystem: The filesystem implementation that should be used for
            reading these files. If None, a filesystem will be inferred. If not
            None, the provided filesystem will still be validated against all
            filesystems inferred from the provided paths to ensure
            compatibility.

    Returns:
        Tuple of the resolved/normalized path list and the (possibly inferred)
        pyarrow filesystem.
    """
    import pyarrow as pa
    from pyarrow.fs import (
        FileSystem,
        FSSpecHandler,
        PyFileSystem,
        _resolve_filesystem_and_path,
    )

    # Normalize the `paths` argument to a non-empty list of strings.
    if isinstance(paths, str):
        paths = [paths]
    if isinstance(paths, pathlib.Path):
        paths = [str(paths)]
    elif not isinstance(paths, list) or any(not isinstance(p, str) for p in paths):
        raise ValueError(
            "Expected `paths` to be a `str`, `pathlib.Path`, or `list[str]`, but got "
            f"`{paths}`."
        )
    elif len(paths) == 0:
        raise ValueError("Must provide at least one path.")

    need_unwrap_path_protocol = True
    if filesystem and not isinstance(filesystem, FileSystem):
        # Not a pyarrow filesystem: accept only fsspec filesystems, wrapped
        # into the pyarrow interface below.
        err_msg = (
            f"The filesystem passed must either conform to "
            f"pyarrow.fs.FileSystem, or "
            f"fsspec.spec.AbstractFileSystem. The provided "
            f"filesystem was: {filesystem}"
        )
        try:
            import fsspec
            from fsspec.implementations.http import HTTPFileSystem
        except ModuleNotFoundError:
            # If filesystem is not a pyarrow filesystem and fsspec isn't
            # installed, then filesystem is neither a pyarrow filesystem nor
            # an fsspec filesystem, so we raise a TypeError.
            raise TypeError(err_msg) from None
        if not isinstance(filesystem, fsspec.spec.AbstractFileSystem):
            raise TypeError(err_msg) from None
        if isinstance(filesystem, HTTPFileSystem):
            # If filesystem is fsspec HTTPFileSystem, the protocol/scheme of paths
            # should not be unwrapped/removed, because HTTPFileSystem expects full file
            # paths including protocol/scheme. This is different behavior compared to
            # file systems implementation in pyarrow.fs.FileSystem.
            need_unwrap_path_protocol = False

        # Wrap the fsspec filesystem so the rest of this function can treat it
        # uniformly as a pyarrow FileSystem.
        filesystem = PyFileSystem(FSSpecHandler(filesystem))

    resolved_paths = []
    for path in paths:
        path = _resolve_custom_scheme(path)
        try:
            resolved_filesystem, resolved_path = _resolve_filesystem_and_path(
                path, filesystem
            )
        except pa.lib.ArrowInvalid as e:
            if "Cannot parse URI" in str(e):
                # Retry with a percent-encoded URI (e.g. for paths containing
                # spaces), then decode the resolved path back afterwards.
                resolved_filesystem, resolved_path = _resolve_filesystem_and_path(
                    _encode_url(path), filesystem
                )
                resolved_path = _decode_url(resolved_path)
            elif "Unrecognized filesystem type in URI" in str(e):
                scheme = urllib.parse.urlparse(path, allow_fragments=False).scheme
                if scheme in ["http", "https"]:
                    # If scheme of path is HTTP and filesystem is not resolved,
                    # try to use fsspec HTTPFileSystem. This expects fsspec is
                    # installed.
                    try:
                        from fsspec.implementations.http import HTTPFileSystem
                    except ModuleNotFoundError:
                        raise ImportError(
                            "Please install fsspec to read files from HTTP."
                        ) from None

                    resolved_filesystem = PyFileSystem(FSSpecHandler(HTTPFileSystem()))
                    resolved_path = path
                    # HTTP paths keep their scheme (see HTTPFileSystem note above).
                    need_unwrap_path_protocol = False
                else:
                    raise
            else:
                raise
        if filesystem is None:
            # The first successfully-resolved path fixes the filesystem used
            # for the remainder of the batch.
            # NOTE(review): the protocol is not unwrapped for this first path;
            # presumably `normalize_path` below handles it — confirm upstream.
            filesystem = resolved_filesystem
        elif need_unwrap_path_protocol:
            resolved_path = _unwrap_protocol(resolved_path)
        resolved_path = filesystem.normalize_path(resolved_path)
        resolved_paths.append(resolved_path)

    return resolved_paths, filesystem
145
+
146
+
147
+ def _unwrap_protocol(path):
148
+ """
149
+ Slice off any protocol prefixes on path.
150
+ """
151
+ if sys.platform == "win32" and _is_local_windows_path(path):
152
+ # Represent as posix path such that downstream functions properly handle it.
153
+ # This is executed when 'file://' is NOT included in the path.
154
+ return pathlib.Path(path).as_posix()
155
+
156
+ parsed = urllib.parse.urlparse(path, allow_fragments=False) # support '#' in path
157
+ query = "?" + parsed.query if parsed.query else "" # support '?' in path
158
+ netloc = parsed.netloc
159
+ if parsed.scheme == "s3" and "@" in parsed.netloc:
160
+ # If the path contains an @, it is assumed to be an anonymous
161
+ # credentialed path, and we need to strip off the credentials.
162
+ netloc = parsed.netloc.split("@")[-1]
163
+
164
+ parsed_path = parsed.path
165
+ # urlparse prepends the path with a '/'. This does not work on Windows
166
+ # so if this is the case strip the leading slash.
167
+ if (
168
+ sys.platform == "win32"
169
+ and not netloc
170
+ and len(parsed_path) >= 3
171
+ and parsed_path[0] == "/" # The problematic leading slash
172
+ and parsed_path[1].isalpha() # Ensure it is a drive letter.
173
+ and parsed_path[2:4] in (":", ":/")
174
+ ):
175
+ parsed_path = parsed_path[1:]
176
+
177
+ return netloc + parsed_path + query
178
+
179
+
180
+ def _is_url(path) -> bool:
181
+ return urllib.parse.urlparse(path).scheme != ""
182
+
183
+
184
+ def _is_local_windows_path(path: str) -> bool:
185
+ """Determines if path is a Windows file-system location."""
186
+ if sys.platform != "win32":
187
+ return False
188
+
189
+ if len(path) >= 1 and path[0] == "\\":
190
+ return True
191
+ if (
192
+ len(path) >= 3
193
+ and path[1] == ":"
194
+ and (path[2] == "/" or path[2] == "\\")
195
+ and path[0].isalpha()
196
+ ):
197
+ return True
198
+ return False
199
+
200
+
201
def _encode_url(path):
    # Percent-encode the path so pyarrow can parse it as a URI; "/" and ":"
    # are kept verbatim so the scheme and path separators survive intact.
    return urllib.parse.quote(path, safe="/:")
203
+
204
+
205
def _decode_url(path):
    # Inverse of _encode_url: restore percent-encoded characters.
    return urllib.parse.unquote(path)
.venv/lib/python3.11/site-packages/ray/data/extensions/__init__.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from ray.air.util.tensor_extensions.arrow import (
2
+ ArrowTensorTypeV2,
3
+ get_arrow_extension_tensor_types,
4
+ )
5
+ from ray.data.extensions.object_extension import (
6
+ ArrowPythonObjectArray,
7
+ ArrowPythonObjectScalar,
8
+ ArrowPythonObjectType,
9
+ PythonObjectArray,
10
+ PythonObjectDtype,
11
+ _object_extension_type_allowed,
12
+ )
13
+ from ray.data.extensions.tensor_extension import (
14
+ ArrowConversionError,
15
+ ArrowTensorArray,
16
+ ArrowTensorType,
17
+ ArrowVariableShapedTensorArray,
18
+ ArrowVariableShapedTensorType,
19
+ TensorArray,
20
+ TensorArrayElement,
21
+ TensorDtype,
22
+ column_needs_tensor_extension,
23
+ )
24
+
25
+ __all__ = [
26
+ # Tensor array extension.
27
+ "TensorDtype",
28
+ "TensorArray",
29
+ "TensorArrayElement",
30
+ "ArrowTensorType",
31
+ "ArrowTensorTypeV2",
32
+ "ArrowTensorArray",
33
+ "ArrowVariableShapedTensorType",
34
+ "ArrowVariableShapedTensorArray",
35
+ "column_needs_tensor_extension",
36
+ "ArrowConversionError",
37
+ # Object array extension
38
+ "ArrowPythonObjectArray",
39
+ "ArrowPythonObjectType",
40
+ "ArrowPythonObjectScalar",
41
+ "PythonObjectArray",
42
+ "PythonObjectDtype",
43
+ "_object_extension_type_allowed",
44
+ "get_arrow_extension_tensor_types",
45
+ ]
.venv/lib/python3.11/site-packages/ray/data/extensions/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (1.23 kB). View file
 
.venv/lib/python3.11/site-packages/ray/data/extensions/__pycache__/object_extension.cpython-311.pyc ADDED
Binary file (595 Bytes). View file
 
.venv/lib/python3.11/site-packages/ray/data/extensions/__pycache__/tensor_extension.cpython-311.pyc ADDED
Binary file (842 Bytes). View file
 
.venv/lib/python3.11/site-packages/ray/data/extensions/object_extension.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ from ray.air.util.object_extensions.arrow import ( # noqa: F401
2
+ ArrowPythonObjectArray,
3
+ ArrowPythonObjectScalar,
4
+ ArrowPythonObjectType,
5
+ _object_extension_type_allowed,
6
+ )
7
+ from ray.air.util.object_extensions.pandas import ( # noqa: F401
8
+ PythonObjectArray,
9
+ PythonObjectDtype,
10
+ )
.venv/lib/python3.11/site-packages/ray/data/extensions/tensor_extension.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from ray.air.util.tensor_extensions.arrow import ( # noqa: F401
2
+ ArrowConversionError,
3
+ ArrowTensorArray,
4
+ ArrowTensorType,
5
+ ArrowTensorTypeV2,
6
+ ArrowVariableShapedTensorArray,
7
+ ArrowVariableShapedTensorType,
8
+ )
9
+ from ray.air.util.tensor_extensions.pandas import ( # noqa: F401
10
+ TensorArray,
11
+ TensorArrayElement,
12
+ TensorDtype,
13
+ column_needs_tensor_extension,
14
+ )
15
+ from ray.air.util.tensor_extensions.utils import create_ragged_ndarray # noqa: F401
.venv/lib/python3.11/site-packages/ray/data/preprocessors/__init__.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from ray.data.preprocessors.chain import Chain
2
+ from ray.data.preprocessors.concatenator import Concatenator
3
+ from ray.data.preprocessors.discretizer import (
4
+ CustomKBinsDiscretizer,
5
+ UniformKBinsDiscretizer,
6
+ )
7
+ from ray.data.preprocessors.encoder import (
8
+ Categorizer,
9
+ LabelEncoder,
10
+ MultiHotEncoder,
11
+ OneHotEncoder,
12
+ OrdinalEncoder,
13
+ )
14
+ from ray.data.preprocessors.hasher import FeatureHasher
15
+ from ray.data.preprocessors.imputer import SimpleImputer
16
+ from ray.data.preprocessors.normalizer import Normalizer
17
+ from ray.data.preprocessors.scaler import (
18
+ MaxAbsScaler,
19
+ MinMaxScaler,
20
+ RobustScaler,
21
+ StandardScaler,
22
+ )
23
+ from ray.data.preprocessors.tokenizer import Tokenizer
24
+ from ray.data.preprocessors.torch import TorchVisionPreprocessor
25
+ from ray.data.preprocessors.transformer import PowerTransformer
26
+ from ray.data.preprocessors.vectorizer import CountVectorizer, HashingVectorizer
27
+
28
+ __all__ = [
29
+ "Categorizer",
30
+ "CountVectorizer",
31
+ "Chain",
32
+ "FeatureHasher",
33
+ "HashingVectorizer",
34
+ "LabelEncoder",
35
+ "MaxAbsScaler",
36
+ "MinMaxScaler",
37
+ "MultiHotEncoder",
38
+ "Normalizer",
39
+ "OneHotEncoder",
40
+ "OrdinalEncoder",
41
+ "PowerTransformer",
42
+ "RobustScaler",
43
+ "SimpleImputer",
44
+ "StandardScaler",
45
+ "Concatenator",
46
+ "Tokenizer",
47
+ "TorchVisionPreprocessor",
48
+ "CustomKBinsDiscretizer",
49
+ "UniformKBinsDiscretizer",
50
+ ]