Add files using upload-large-folder tool
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +2 -0
- .venv/lib/python3.11/site-packages/ray/data/__pycache__/dataset.cpython-311.pyc +3 -0
- .venv/lib/python3.11/site-packages/ray/data/__pycache__/read_api.cpython-311.pyc +3 -0
- .venv/lib/python3.11/site-packages/ray/data/_internal/block_batching/__init__.py +3 -0
- .venv/lib/python3.11/site-packages/ray/data/_internal/block_batching/__pycache__/__init__.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/data/_internal/block_batching/__pycache__/block_batching.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/data/_internal/block_batching/__pycache__/interfaces.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/data/_internal/block_batching/__pycache__/iter_batches.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/data/_internal/block_batching/__pycache__/util.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/data/_internal/block_batching/iter_batches.py +322 -0
- .venv/lib/python3.11/site-packages/ray/data/_internal/block_batching/util.py +293 -0
- .venv/lib/python3.11/site-packages/ray/data/_internal/execution/__init__.py +0 -0
- .venv/lib/python3.11/site-packages/ray/data/_internal/execution/__pycache__/__init__.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/data/_internal/execution/autoscaler/__init__.py +15 -0
- .venv/lib/python3.11/site-packages/ray/data/_internal/execution/autoscaler/__pycache__/autoscaling_actor_pool.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/data/_internal/execution/autoscaler/autoscaler.py +44 -0
- .venv/lib/python3.11/site-packages/ray/data/_internal/execution/autoscaler/autoscaling_actor_pool.py +94 -0
- .venv/lib/python3.11/site-packages/ray/data/_internal/execution/autoscaler/default_autoscaler.py +188 -0
- .venv/lib/python3.11/site-packages/ray/data/_internal/execution/autoscaling_requester.py +131 -0
- .venv/lib/python3.11/site-packages/ray/data/_internal/execution/backpressure_policy/__init__.py +32 -0
- .venv/lib/python3.11/site-packages/ray/data/_internal/execution/backpressure_policy/__pycache__/__init__.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/data/_internal/execution/backpressure_policy/__pycache__/backpressure_policy.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/data/_internal/execution/backpressure_policy/__pycache__/concurrency_cap_backpressure_policy.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/data/_internal/execution/backpressure_policy/backpressure_policy.py +28 -0
- .venv/lib/python3.11/site-packages/ray/data/_internal/execution/backpressure_policy/concurrency_cap_backpressure_policy.py +43 -0
- .venv/lib/python3.11/site-packages/ray/data/_internal/execution/bundle_queue/__init__.py +9 -0
- .venv/lib/python3.11/site-packages/ray/data/_internal/execution/bundle_queue/__pycache__/__init__.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/data/_internal/execution/bundle_queue/__pycache__/bundle_queue.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/data/_internal/execution/bundle_queue/__pycache__/fifo_bundle_queue.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/data/_internal/execution/bundle_queue/bundle_queue.py +62 -0
- .venv/lib/python3.11/site-packages/ray/data/_internal/execution/execution_callback.py +40 -0
- .venv/lib/python3.11/site-packages/ray/data/_internal/execution/interfaces/__init__.py +19 -0
- .venv/lib/python3.11/site-packages/ray/data/_internal/execution/interfaces/common.py +2 -0
- .venv/lib/python3.11/site-packages/ray/data/_internal/execution/interfaces/execution_options.py +300 -0
- .venv/lib/python3.11/site-packages/ray/data/_internal/execution/interfaces/executor.py +77 -0
- .venv/lib/python3.11/site-packages/ray/data/_internal/execution/interfaces/op_runtime_metrics.py +574 -0
- .venv/lib/python3.11/site-packages/ray/data/_internal/execution/interfaces/physical_operator.py +535 -0
- .venv/lib/python3.11/site-packages/ray/data/_internal/execution/interfaces/ref_bundle.py +136 -0
- .venv/lib/python3.11/site-packages/ray/data/_internal/execution/interfaces/task_context.py +44 -0
- .venv/lib/python3.11/site-packages/ray/data/_internal/execution/interfaces/transform_fn.py +10 -0
- .venv/lib/python3.11/site-packages/ray/data/_internal/execution/legacy_compat.py +181 -0
- .venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__init__.py +0 -0
- .venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__pycache__/__init__.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__pycache__/actor_pool_map_operator.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__pycache__/aggregate_num_rows.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__pycache__/base_physical_operator.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__pycache__/input_data_buffer.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__pycache__/limit_operator.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__pycache__/map_operator.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__pycache__/map_transformer.cpython-311.pyc +0 -0
.gitattributes
CHANGED
|
@@ -152,3 +152,5 @@ tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/_
|
|
| 152 |
.venv/lib/python3.11/site-packages/nvidia/cufft/lib/libcufft.so.11 filter=lfs diff=lfs merge=lfs -text
|
| 153 |
.venv/lib/python3.11/site-packages/torchgen/__pycache__/model.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
|
| 154 |
.venv/lib/python3.11/site-packages/mistral_common/data/tekken_240911.json filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
| 152 |
.venv/lib/python3.11/site-packages/nvidia/cufft/lib/libcufft.so.11 filter=lfs diff=lfs merge=lfs -text
|
| 153 |
.venv/lib/python3.11/site-packages/torchgen/__pycache__/model.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
|
| 154 |
.venv/lib/python3.11/site-packages/mistral_common/data/tekken_240911.json filter=lfs diff=lfs merge=lfs -text
|
| 155 |
+
.venv/lib/python3.11/site-packages/ray/data/__pycache__/dataset.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
|
| 156 |
+
.venv/lib/python3.11/site-packages/ray/data/__pycache__/read_api.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
|
.venv/lib/python3.11/site-packages/ray/data/__pycache__/dataset.cpython-311.pyc
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c0b1a74e1674205ec83807b353da73daa79d781531cd64ecbd818fd5438ec680
|
| 3 |
+
size 255996
|
.venv/lib/python3.11/site-packages/ray/data/__pycache__/read_api.cpython-311.pyc
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a217bcdb2fd53d64e0014e4fd153627ade902228eadc09fe7df65ee93c07bc05
|
| 3 |
+
size 160644
|
.venv/lib/python3.11/site-packages/ray/data/_internal/block_batching/__init__.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from ray.data._internal.block_batching.block_batching import batch_blocks
|
| 2 |
+
|
| 3 |
+
__all__ = ["batch_blocks"]
|
.venv/lib/python3.11/site-packages/ray/data/_internal/block_batching/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (331 Bytes). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/data/_internal/block_batching/__pycache__/block_batching.cpython-311.pyc
ADDED
|
Binary file (3 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/data/_internal/block_batching/__pycache__/interfaces.cpython-311.pyc
ADDED
|
Binary file (2.53 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/data/_internal/block_batching/__pycache__/iter_batches.cpython-311.pyc
ADDED
|
Binary file (15.1 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/data/_internal/block_batching/__pycache__/util.cpython-311.pyc
ADDED
|
Binary file (17.6 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/data/_internal/block_batching/iter_batches.py
ADDED
|
@@ -0,0 +1,322 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import collections
|
| 2 |
+
from contextlib import nullcontext
|
| 3 |
+
from typing import Any, Callable, Dict, Iterator, Optional
|
| 4 |
+
|
| 5 |
+
import ray
|
| 6 |
+
from ray.data._internal.block_batching.interfaces import Batch, BlockPrefetcher
|
| 7 |
+
from ray.data._internal.block_batching.util import (
|
| 8 |
+
ActorBlockPrefetcher,
|
| 9 |
+
WaitBlockPrefetcher,
|
| 10 |
+
blocks_to_batches,
|
| 11 |
+
collate,
|
| 12 |
+
extract_data_from_batch,
|
| 13 |
+
finalize_batches,
|
| 14 |
+
format_batches,
|
| 15 |
+
resolve_block_refs,
|
| 16 |
+
)
|
| 17 |
+
from ray.data._internal.execution.interfaces.ref_bundle import RefBundle
|
| 18 |
+
from ray.data._internal.memory_tracing import trace_deallocation
|
| 19 |
+
from ray.data._internal.stats import DatasetStats
|
| 20 |
+
from ray.data._internal.util import make_async_gen
|
| 21 |
+
from ray.data.block import Block, DataBatch
|
| 22 |
+
from ray.data.context import DataContext
|
| 23 |
+
from ray.types import ObjectRef
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def iter_batches(
|
| 27 |
+
ref_bundles: Iterator[RefBundle],
|
| 28 |
+
*,
|
| 29 |
+
stats: Optional[DatasetStats] = None,
|
| 30 |
+
clear_block_after_read: bool = False,
|
| 31 |
+
batch_size: Optional[int] = None,
|
| 32 |
+
batch_format: Optional[str] = "default",
|
| 33 |
+
drop_last: bool = False,
|
| 34 |
+
collate_fn: Optional[Callable[[DataBatch], Any]] = None,
|
| 35 |
+
finalize_fn: Optional[Callable[[Any], Any]] = None,
|
| 36 |
+
shuffle_buffer_min_size: Optional[int] = None,
|
| 37 |
+
shuffle_seed: Optional[int] = None,
|
| 38 |
+
ensure_copy: bool = False,
|
| 39 |
+
prefetch_batches: int = 1,
|
| 40 |
+
) -> Iterator[DataBatch]:
|
| 41 |
+
"""Create formatted batches of data from an iterator of block object references and
|
| 42 |
+
corresponding metadata.
|
| 43 |
+
|
| 44 |
+
This takes a block iterator and creates batch_size batches, slicing,
|
| 45 |
+
unioning, shuffling, prefetching, and formatting blocks as needed.
|
| 46 |
+
|
| 47 |
+
The algorithm uses both pipeline parallelism and data parallelism:
|
| 48 |
+
|
| 49 |
+
If prefetch_batches=2, these are all the batches in flight:
|
| 50 |
+
|
| 51 |
+
[User thread] trains on Batch 0
|
| 52 |
+
- [Fetch thread] Batch 1 finalization + move to output queue
|
| 53 |
+
- [Worker thread 1] Batch 2 formatting + collating
|
| 54 |
+
- [Worker thread 2] Batch 3 formatting + collating
|
| 55 |
+
- [Raylet] Batches 4 + 5 fetched to local object store memory
|
| 56 |
+
|
| 57 |
+
At any point in time there are prefetch_batches+1 batches in local heap memory.
|
| 58 |
+
And the next set of prefetch_batches in local object store memory.
|
| 59 |
+
|
| 60 |
+
The actual steps are as follows:
|
| 61 |
+
|
| 62 |
+
In a single async thread, do the following:
|
| 63 |
+
1. Trigger Ray local prefetching of `prefetch_batches` worth of block object
|
| 64 |
+
references.
|
| 65 |
+
2. Resolve (i.e. call `ray.get()`) on the block references.
|
| 66 |
+
3. Perform the necessary batch slicing to construct full batches, possibly
|
| 67 |
+
shuffling if necessary.
|
| 68 |
+
4. Then, in a threadpool consisting of `prefetch_batches` threads:
|
| 69 |
+
a. Format the batches to the provided batch format.
|
| 70 |
+
b. Apply the collate function.
|
| 71 |
+
5. Finalize each of the collated batches
|
| 72 |
+
6. Fetch outputs from the threadpool, maintaining order of the batches.
|
| 73 |
+
|
| 74 |
+
Args:
|
| 75 |
+
ref_bundles: An iterator over RefBundles.
|
| 76 |
+
stats: DatasetStats object to record timing and other statistics.
|
| 77 |
+
clear_block_after_read: Whether to clear the block from object store
|
| 78 |
+
manually (i.e. without waiting for Python's automatic GC) after it
|
| 79 |
+
is read. Doing so will reclaim memory faster and hence reduce the
|
| 80 |
+
memory footprint. However, the caller has to ensure the safety, i.e.
|
| 81 |
+
the block will never be accessed again.
|
| 82 |
+
batch_size: Record batch size, or None to let the system pick.
|
| 83 |
+
batch_format: The format in which to return each batch.
|
| 84 |
+
Specify "default" to use the current block format (promoting
|
| 85 |
+
Arrow to pandas automatically), "pandas" to
|
| 86 |
+
select ``pandas.DataFrame`` or "pyarrow" to select
|
| 87 |
+
``pyarrow.Table``, or None to use entire blocks
|
| 88 |
+
as batches. Default is "default".
|
| 89 |
+
drop_last: Whether to drop the last batch if it's incomplete.
|
| 90 |
+
collate_fn: A function to apply to each data batch before returning it.
|
| 91 |
+
finalize_fn: A function to apply to each data batch after it has been collated.
|
| 92 |
+
This function is not run in a threadpool so it can be used for
|
| 93 |
+
memory-intensive operations such as GPU preloading.
|
| 94 |
+
shuffle_buffer_min_size: If non-None, the data will be randomly shuffled using a
|
| 95 |
+
local in-memory shuffle buffer, and this value will serve as the minimum
|
| 96 |
+
number of rows that must be in the local in-memory shuffle buffer in order
|
| 97 |
+
to yield a batch.
|
| 98 |
+
shuffle_seed: The seed to use for the local random shuffle.
|
| 99 |
+
ensure_copy: Whether batches are always copied from the underlying base
|
| 100 |
+
blocks (not zero-copy views).
|
| 101 |
+
prefetch_batches: The number of batches to fetch ahead of the current batch to
|
| 102 |
+
process. If set to greater than 0, a separate thread will be used to fetch
|
| 103 |
+
the specified amount of formatted batches from blocks. This improves
|
| 104 |
+
performance for non-CPU bound UDFs, allowing batch fetching compute and
|
| 105 |
+
formatting to be overlapped with the UDF. Defaults to 1.
|
| 106 |
+
|
| 107 |
+
Returns:
|
| 108 |
+
An iterator over record batches.
|
| 109 |
+
"""
|
| 110 |
+
context = DataContext.get_current()
|
| 111 |
+
|
| 112 |
+
if (
|
| 113 |
+
prefetch_batches > 0
|
| 114 |
+
and context.actor_prefetcher_enabled
|
| 115 |
+
and not ray.util.client.ray.is_connected()
|
| 116 |
+
):
|
| 117 |
+
prefetcher = ActorBlockPrefetcher()
|
| 118 |
+
else:
|
| 119 |
+
prefetcher = WaitBlockPrefetcher()
|
| 120 |
+
|
| 121 |
+
eager_free = clear_block_after_read and DataContext.get_current().eager_free
|
| 122 |
+
|
| 123 |
+
def _async_iter_batches(
|
| 124 |
+
ref_bundles: Iterator[RefBundle],
|
| 125 |
+
) -> Iterator[DataBatch]:
|
| 126 |
+
# Step 1: Prefetch logical batches locally.
|
| 127 |
+
block_iter = prefetch_batches_locally(
|
| 128 |
+
ref_bundles=ref_bundles,
|
| 129 |
+
prefetcher=prefetcher,
|
| 130 |
+
num_batches_to_prefetch=prefetch_batches,
|
| 131 |
+
batch_size=batch_size,
|
| 132 |
+
eager_free=eager_free,
|
| 133 |
+
)
|
| 134 |
+
|
| 135 |
+
# Step 2: Resolve the blocks.
|
| 136 |
+
block_iter = resolve_block_refs(block_ref_iter=block_iter, stats=stats)
|
| 137 |
+
|
| 138 |
+
# Step 3: Batch and shuffle the resolved blocks.
|
| 139 |
+
batch_iter = blocks_to_batches(
|
| 140 |
+
block_iter=block_iter,
|
| 141 |
+
stats=stats,
|
| 142 |
+
batch_size=batch_size,
|
| 143 |
+
drop_last=drop_last,
|
| 144 |
+
shuffle_buffer_min_size=shuffle_buffer_min_size,
|
| 145 |
+
shuffle_seed=shuffle_seed,
|
| 146 |
+
ensure_copy=ensure_copy,
|
| 147 |
+
)
|
| 148 |
+
|
| 149 |
+
# Step 4: Use a threadpool for formatting and collation.
|
| 150 |
+
batch_iter = _format_in_threadpool(
|
| 151 |
+
batch_iter,
|
| 152 |
+
stats=stats,
|
| 153 |
+
batch_format=batch_format,
|
| 154 |
+
collate_fn=collate_fn,
|
| 155 |
+
num_threadpool_workers=prefetch_batches,
|
| 156 |
+
)
|
| 157 |
+
|
| 158 |
+
# Step 5: Finalize each batch.
|
| 159 |
+
if finalize_fn is not None:
|
| 160 |
+
batch_iter = finalize_batches(
|
| 161 |
+
batch_iter, finalize_fn=finalize_fn, stats=stats
|
| 162 |
+
)
|
| 163 |
+
|
| 164 |
+
# Step 6: Restore original order.
|
| 165 |
+
batch_iter: Iterator[Batch] = restore_original_order(batch_iter)
|
| 166 |
+
|
| 167 |
+
yield from extract_data_from_batch(batch_iter)
|
| 168 |
+
|
| 169 |
+
# Run everything in a separate thread to not block the main thread when waiting
|
| 170 |
+
# for streaming results.
|
| 171 |
+
async_batch_iter = make_async_gen(
|
| 172 |
+
ref_bundles, fn=_async_iter_batches, num_workers=1
|
| 173 |
+
)
|
| 174 |
+
|
| 175 |
+
while True:
|
| 176 |
+
with stats.iter_total_blocked_s.timer() if stats else nullcontext():
|
| 177 |
+
try:
|
| 178 |
+
next_batch = next(async_batch_iter)
|
| 179 |
+
except StopIteration:
|
| 180 |
+
break
|
| 181 |
+
with stats.iter_user_s.timer() if stats else nullcontext():
|
| 182 |
+
yield next_batch
|
| 183 |
+
|
| 184 |
+
|
| 185 |
+
def _format_in_threadpool(
|
| 186 |
+
batch_iter: Iterator[Batch],
|
| 187 |
+
stats: DatasetStats,
|
| 188 |
+
batch_format: Optional[str],
|
| 189 |
+
collate_fn: Optional[Callable[[DataBatch], Any]],
|
| 190 |
+
num_threadpool_workers: int,
|
| 191 |
+
) -> Iterator[Batch]:
|
| 192 |
+
"""Executes the batching, formatting, and collation logic in a threadpool.
|
| 193 |
+
|
| 194 |
+
Args:
|
| 195 |
+
logical_batch_iterator: An iterator over logical batches.
|
| 196 |
+
stats: DatasetStats object to record timing and other statistics.
|
| 197 |
+
batch_format: The format in which to return each batch.
|
| 198 |
+
Specify "default" to use the current block format (promoting
|
| 199 |
+
Arrow to pandas automatically), "pandas" to
|
| 200 |
+
select ``pandas.DataFrame`` or "pyarrow" to select
|
| 201 |
+
``pyarrow.Table``, or None to use entire blocks
|
| 202 |
+
as batches.
|
| 203 |
+
collate_fn: A function to apply to each data batch before returning it.
|
| 204 |
+
num_threadpool_workers: The number of threads to use in the threadpool.
|
| 205 |
+
"""
|
| 206 |
+
|
| 207 |
+
def threadpool_computations_format_collate(
|
| 208 |
+
batch_iter: Iterator[Batch],
|
| 209 |
+
) -> Iterator[Batch]:
|
| 210 |
+
# Step 4a: Format the batches.
|
| 211 |
+
formatted_batch_iter = format_batches(
|
| 212 |
+
batch_iter, batch_format=batch_format, stats=stats
|
| 213 |
+
)
|
| 214 |
+
|
| 215 |
+
# Step 4b: Apply the collate function if applicable.
|
| 216 |
+
if collate_fn is not None:
|
| 217 |
+
formatted_batch_iter = collate(
|
| 218 |
+
formatted_batch_iter, collate_fn=collate_fn, stats=stats
|
| 219 |
+
)
|
| 220 |
+
yield from formatted_batch_iter
|
| 221 |
+
|
| 222 |
+
if num_threadpool_workers > 0:
|
| 223 |
+
collated_iter = make_async_gen(
|
| 224 |
+
base_iterator=batch_iter,
|
| 225 |
+
fn=threadpool_computations_format_collate,
|
| 226 |
+
num_workers=num_threadpool_workers,
|
| 227 |
+
)
|
| 228 |
+
else:
|
| 229 |
+
collated_iter = threadpool_computations_format_collate(batch_iter)
|
| 230 |
+
return collated_iter
|
| 231 |
+
|
| 232 |
+
|
| 233 |
+
def prefetch_batches_locally(
|
| 234 |
+
ref_bundles: Iterator[RefBundle],
|
| 235 |
+
prefetcher: BlockPrefetcher,
|
| 236 |
+
num_batches_to_prefetch: int,
|
| 237 |
+
batch_size: Optional[int],
|
| 238 |
+
eager_free: bool = False,
|
| 239 |
+
) -> Iterator[ObjectRef[Block]]:
|
| 240 |
+
"""Given an iterator of batched RefBundles, returns an iterator over the
|
| 241 |
+
corresponding block references while prefetching `num_batches_to_prefetch`
|
| 242 |
+
batches in advance.
|
| 243 |
+
|
| 244 |
+
Args:
|
| 245 |
+
ref_bundles: An iterator over batched RefBundles.
|
| 246 |
+
prefetcher: The prefetcher to use.
|
| 247 |
+
num_batches_to_prefetch: The number of batches to prefetch ahead of the
|
| 248 |
+
current batch during the scan.
|
| 249 |
+
batch_size: User specified batch size, or None to let the system pick.
|
| 250 |
+
eager_free: Whether to eagerly free the object reference from the object store.
|
| 251 |
+
"""
|
| 252 |
+
|
| 253 |
+
sliding_window = collections.deque()
|
| 254 |
+
current_window_size = 0
|
| 255 |
+
|
| 256 |
+
if num_batches_to_prefetch <= 0:
|
| 257 |
+
for ref_bundle in ref_bundles:
|
| 258 |
+
for block_ref in ref_bundle.block_refs:
|
| 259 |
+
yield block_ref
|
| 260 |
+
return
|
| 261 |
+
|
| 262 |
+
if batch_size is not None:
|
| 263 |
+
num_rows_to_prefetch = num_batches_to_prefetch * batch_size
|
| 264 |
+
else:
|
| 265 |
+
num_rows_to_prefetch = None
|
| 266 |
+
|
| 267 |
+
# Create and fetch the initial window.
|
| 268 |
+
# Stop adding if the number of rows in this window is greater than requested
|
| 269 |
+
# batch size, or if the batch size is None and the number of blocks in this window
|
| 270 |
+
# is greater than requested batches to prefetch.
|
| 271 |
+
while (batch_size is not None and current_window_size < num_rows_to_prefetch) or (
|
| 272 |
+
batch_size is None and len(sliding_window) < num_batches_to_prefetch
|
| 273 |
+
):
|
| 274 |
+
try:
|
| 275 |
+
next_ref_bundle = next(ref_bundles)
|
| 276 |
+
sliding_window.extend(next_ref_bundle.blocks)
|
| 277 |
+
current_window_size += next_ref_bundle.num_rows()
|
| 278 |
+
except StopIteration:
|
| 279 |
+
break
|
| 280 |
+
|
| 281 |
+
prefetcher.prefetch_blocks([block_ref for block_ref, _ in list(sliding_window)])
|
| 282 |
+
|
| 283 |
+
while sliding_window:
|
| 284 |
+
block_ref, metadata = sliding_window.popleft()
|
| 285 |
+
current_window_size -= metadata.num_rows
|
| 286 |
+
if batch_size is None or current_window_size < num_rows_to_prefetch:
|
| 287 |
+
try:
|
| 288 |
+
next_ref_bundle = next(ref_bundles)
|
| 289 |
+
for block_ref_and_md in next_ref_bundle.blocks:
|
| 290 |
+
sliding_window.append(block_ref_and_md)
|
| 291 |
+
current_window_size += block_ref_and_md[1].num_rows
|
| 292 |
+
prefetcher.prefetch_blocks(
|
| 293 |
+
[block_ref for block_ref, _ in list(sliding_window)]
|
| 294 |
+
)
|
| 295 |
+
except StopIteration:
|
| 296 |
+
pass
|
| 297 |
+
yield block_ref
|
| 298 |
+
trace_deallocation(block_ref, loc="iter_batches", free=eager_free)
|
| 299 |
+
prefetcher.stop()
|
| 300 |
+
|
| 301 |
+
|
| 302 |
+
def restore_original_order(batch_iter: Iterator[Batch]) -> Iterator[Batch]:
|
| 303 |
+
"""Restores the original order of the provided `batch_iter`
|
| 304 |
+
|
| 305 |
+
This function will yield items from `base_iterator` in the correct order based on
|
| 306 |
+
each batch's batch_idx. All indexes are expected to be unique.
|
| 307 |
+
|
| 308 |
+
`batch_iter` is expected to not have any missing indexes. All indexes from 0 to len
|
| 309 |
+
(base_iterator) must be present.
|
| 310 |
+
"""
|
| 311 |
+
next_index_required = 0
|
| 312 |
+
buffer: Dict[int, Batch] = {}
|
| 313 |
+
for batch in batch_iter:
|
| 314 |
+
assert batch.batch_idx not in buffer
|
| 315 |
+
buffer[batch.batch_idx] = batch
|
| 316 |
+
while next_index_required in buffer:
|
| 317 |
+
yield buffer.pop(next_index_required)
|
| 318 |
+
next_index_required += 1
|
| 319 |
+
|
| 320 |
+
while next_index_required in buffer:
|
| 321 |
+
yield buffer.pop(next_index_required)
|
| 322 |
+
next_index_required += 1
|
.venv/lib/python3.11/site-packages/ray/data/_internal/block_batching/util.py
ADDED
|
@@ -0,0 +1,293 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
import threading
|
| 3 |
+
from contextlib import nullcontext
|
| 4 |
+
from typing import Any, Callable, Iterator, List, Optional, Tuple
|
| 5 |
+
|
| 6 |
+
import ray
|
| 7 |
+
from ray.actor import ActorHandle
|
| 8 |
+
from ray.data._internal.batcher import Batcher, ShufflingBatcher
|
| 9 |
+
from ray.data._internal.block_batching.interfaces import (
|
| 10 |
+
Batch,
|
| 11 |
+
BlockPrefetcher,
|
| 12 |
+
CollatedBatch,
|
| 13 |
+
)
|
| 14 |
+
from ray.data._internal.stats import DatasetStats
|
| 15 |
+
from ray.data.block import Block, BlockAccessor, DataBatch
|
| 16 |
+
from ray.types import ObjectRef
|
| 17 |
+
from ray.util.scheduling_strategies import NodeAffinitySchedulingStrategy
|
| 18 |
+
|
| 19 |
+
logger = logging.getLogger(__name__)
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def _calculate_ref_hits(refs: List[ObjectRef[Any]]) -> Tuple[int, int, int]:
|
| 23 |
+
"""Given a list of object references, returns how many are already on the local
|
| 24 |
+
node, how many require fetching from another node, and how many have unknown
|
| 25 |
+
locations. If `DataContext.get_current().enable_get_object_locations_for_metrics` is
|
| 26 |
+
False, this will return `(-1, -1, -1)` as getting object locations is disabled."""
|
| 27 |
+
current_node_id = ray.get_runtime_context().get_node_id()
|
| 28 |
+
|
| 29 |
+
ctx = ray.data.context.DataContext.get_current()
|
| 30 |
+
if ctx.enable_get_object_locations_for_metrics:
|
| 31 |
+
locs = ray.experimental.get_object_locations(refs)
|
| 32 |
+
nodes: List[List[str]] = [loc["node_ids"] for loc in locs.values()]
|
| 33 |
+
hits = sum(current_node_id in node_ids for node_ids in nodes)
|
| 34 |
+
unknowns = sum(1 for node_ids in nodes if not node_ids)
|
| 35 |
+
misses = len(nodes) - hits - unknowns
|
| 36 |
+
return hits, misses, unknowns
|
| 37 |
+
|
| 38 |
+
return -1, -1, -1
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def resolve_block_refs(
|
| 42 |
+
block_ref_iter: Iterator[ObjectRef[Block]],
|
| 43 |
+
stats: Optional[DatasetStats] = None,
|
| 44 |
+
) -> Iterator[Block]:
|
| 45 |
+
"""Resolves the block references for each logical batch.
|
| 46 |
+
|
| 47 |
+
Args:
|
| 48 |
+
block_ref_iter: An iterator over block object references.
|
| 49 |
+
stats: An optional stats object to recording block hits and misses.
|
| 50 |
+
"""
|
| 51 |
+
hits = 0
|
| 52 |
+
misses = 0
|
| 53 |
+
unknowns = 0
|
| 54 |
+
|
| 55 |
+
for block_ref in block_ref_iter:
|
| 56 |
+
current_hit, current_miss, current_unknown = _calculate_ref_hits([block_ref])
|
| 57 |
+
hits += current_hit
|
| 58 |
+
misses += current_miss
|
| 59 |
+
unknowns += current_unknown
|
| 60 |
+
|
| 61 |
+
# TODO(amogkam): Optimized further by batching multiple references in a single
|
| 62 |
+
# `ray.get()` call.
|
| 63 |
+
with stats.iter_get_s.timer() if stats else nullcontext():
|
| 64 |
+
block = ray.get(block_ref)
|
| 65 |
+
yield block
|
| 66 |
+
|
| 67 |
+
if stats:
|
| 68 |
+
stats.iter_blocks_local = hits
|
| 69 |
+
stats.iter_blocks_remote = misses
|
| 70 |
+
stats.iter_unknown_location = unknowns
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
def blocks_to_batches(
    block_iter: Iterator[Block],
    stats: Optional[DatasetStats] = None,
    batch_size: Optional[int] = None,
    drop_last: bool = False,
    shuffle_buffer_min_size: Optional[int] = None,
    shuffle_seed: Optional[int] = None,
    ensure_copy: bool = False,
) -> Iterator[Batch]:
    """Given an iterator over blocks, returns an iterator over blocks
    of the appropriate batch size.

    If the shuffling configurations are specified, then the
    output blocks contain shuffled data.

    Args:
        block_iter: An iterator over blocks.
        stats: Dataset stats object used to store block batching time.
        batch_size: Record batch size, or None to let the system pick.
        drop_last: Whether to drop the last batch if it's incomplete.
        shuffle_buffer_min_size: If non-None, the data will be randomly shuffled
            using a local in-memory shuffle buffer, and this value will serve as the
            minimum number of rows that must be in the local in-memory shuffle buffer in
            order to yield a batch.
        shuffle_seed: The seed to use for the local random shuffle.
        ensure_copy: Whether batches are always copied from the underlying base
            blocks (not zero-copy views).

    Returns:
        An iterator over blocks of the given size that are potentially shuffled.
    """
    # A non-None shuffle buffer size selects the shuffling batcher; otherwise
    # use the plain FIFO batcher.
    if shuffle_buffer_min_size is not None:
        batcher = ShufflingBatcher(
            batch_size=batch_size,
            shuffle_buffer_min_size=shuffle_buffer_min_size,
            shuffle_seed=shuffle_seed,
        )
    else:
        batcher = Batcher(batch_size=batch_size, ensure_copy=ensure_copy)

    def get_iter_next_batch_s_timer():
        # Only time batch creation when a stats object was provided.
        return stats.iter_next_batch_s.timer() if stats else nullcontext()

    # Monotonically increasing index attached to every yielded Batch.
    global_counter = 0

    for block in block_iter:
        batcher.add(block)
        while batcher.has_batch():
            with get_iter_next_batch_s_timer():
                batch = batcher.next_batch()
            yield Batch(global_counter, batch)
            global_counter += 1

    # Signal to the batcher that there are no more blocks to add.
    batcher.done_adding()

    # Get any leftover batches in ShufflingBatcher.
    while batcher.has_batch():
        with get_iter_next_batch_s_timer():
            batch = batcher.next_batch()
        yield Batch(global_counter, batch)
        global_counter += 1

    # Get any remaining data (a final, possibly incomplete batch) unless the
    # caller asked to drop it.
    if not drop_last and batcher.has_any():
        with get_iter_next_batch_s_timer():
            batch = batcher.next_batch()
        yield Batch(global_counter, batch)
        global_counter += 1
|
| 142 |
+
|
| 143 |
+
|
| 144 |
+
def format_batches(
    block_iter: Iterator[Batch],
    batch_format: Optional[str],
    stats: Optional[DatasetStats] = None,
) -> Iterator[Batch]:
    """Given an iterator of blocks, returns an iterator of formatted batches.

    Args:
        block_iter: An iterator over blocks.
        batch_format: The batch format to use.
        stats: An optional stats object to record formatting times.

    Returns:
        An iterator over batch index and the formatted batch.
    """
    for batch in block_iter:
        # Time the conversion only when stats tracking is enabled.
        with stats.iter_format_batch_s.timer() if stats else nullcontext():
            formatted_batch = BlockAccessor.for_block(batch.data).to_batch_format(
                batch_format
            )
        # Preserve the original batch index so downstream stages can reorder.
        yield Batch(batch.batch_idx, formatted_batch)
|
| 165 |
+
|
| 166 |
+
|
| 167 |
+
def collate(
    batch_iter: Iterator[Batch],
    collate_fn: Optional[Callable[[DataBatch], Any]],
    stats: Optional[DatasetStats] = None,
) -> Iterator[CollatedBatch]:
    """Returns an iterator with the provided collate_fn applied to items of the batch
    iterator.

    Args:
        batch_iter: An iterator over formatted batches.
        collate_fn: A function to apply to each batch.
        stats: An optional stats object to record formatting times.

    Returns:
        An iterator over batch index and the collated batch.
    """
    for batch in batch_iter:
        # Time the user-provided collate_fn only when stats tracking is enabled.
        with stats.iter_collate_batch_s.timer() if stats else nullcontext():
            collated_batch = collate_fn(batch.data)
        yield CollatedBatch(batch.batch_idx, collated_batch)
|
| 184 |
+
|
| 185 |
+
|
| 186 |
+
def finalize_batches(
    batch_iter: Iterator[CollatedBatch],
    finalize_fn: Callable[[Any], Any],
    stats: Optional[DatasetStats] = None,
) -> Iterator[CollatedBatch]:
    """Returns an iterator with the provided finalize_fn applied to items of the batch
    iterator.

    This is the same as `collate` except the input batches can be of type Any.

    Args:
        batch_iter: An iterator over processed batches.
        finalize_fn: A function to apply to each batch.
        stats: An optional stats object to record formatting times.

    Returns:
        An iterator over batch index and the finalized batch.
    """
    for batch in batch_iter:
        # Time the user-provided finalize_fn only when stats tracking is enabled.
        with stats.iter_finalize_batch_s.timer() if stats else nullcontext():
            finalized_batch = finalize_fn(batch.data)
        yield CollatedBatch(batch.batch_idx, finalized_batch)
|
| 208 |
+
|
| 209 |
+
|
| 210 |
+
def extract_data_from_batch(batch_iter: Iterator[Batch]) -> Iterator[Any]:
    """Strip the batch-index wrapper, yielding only each batch's data payload."""
    return (wrapped.data for wrapped in batch_iter)
|
| 213 |
+
|
| 214 |
+
|
| 215 |
+
# Ray actor namespace under which named block-prefetcher actors are created
# and looked up (see ActorBlockPrefetcher._get_or_create_actor_prefetcher).
PREFETCHER_ACTOR_NAMESPACE = "ray.dataset"
|
| 216 |
+
|
| 217 |
+
|
| 218 |
+
class WaitBlockPrefetcher(BlockPrefetcher):
    """Block prefetcher using ray.wait.

    A background daemon thread repeatedly calls ``ray.wait(...,
    fetch_local=True)`` on the most recently submitted list of block refs,
    which pulls the objects to the local node ahead of consumption.
    """

    def __init__(self):
        # Pending block refs to prefetch; replaced (not appended to) by each
        # prefetch_blocks() call.
        self._blocks = []
        # Set once stop() is called; the worker thread exits when it sees this
        # with no pending blocks.
        self._stopped = False
        # Guards _blocks and _stopped, and wakes the worker thread.
        self._condition = threading.Condition()
        # Daemon thread so it never blocks interpreter shutdown.
        self._thread = threading.Thread(
            target=self._run,
            name="Prefetcher",
            daemon=True,
        )
        self._thread.start()

    def _run(self):
        """Worker loop: wait for submitted blocks, then fetch them locally."""
        while True:
            try:
                blocks_to_wait = []
                with self._condition:
                    if len(self._blocks) > 0:
                        # Take ownership of the pending list under the lock.
                        blocks_to_wait, self._blocks = self._blocks[:], []
                    else:
                        if self._stopped:
                            return
                        blocks_to_wait = []
                        # Sleep until prefetch_blocks() or stop() notifies.
                        self._condition.wait()
                if len(blocks_to_wait) > 0:
                    # fetch_local=True triggers the actual data transfer.
                    ray.wait(blocks_to_wait, num_returns=1, fetch_local=True)
            except Exception:
                # Prefetching is best-effort; log and keep the thread alive.
                logger.exception("Error in prefetcher thread.")

    def prefetch_blocks(self, blocks: List[ObjectRef[Block]]):
        """Submit a new batch of block refs to prefetch.

        Raises:
            RuntimeError: If the prefetcher has already been stopped.
        """
        with self._condition:
            if self._stopped:
                raise RuntimeError("Prefetcher is stopped.")
            # NOTE: overwrites (does not extend) any not-yet-waited blocks.
            self._blocks = blocks
            self._condition.notify()

    def stop(self):
        """Signal the worker thread to exit; idempotent."""
        with self._condition:
            if self._stopped:
                return
            self._stopped = True
            self._condition.notify()

    def __del__(self):
        # Best-effort cleanup if the caller never called stop().
        self.stop()
|
| 265 |
+
|
| 266 |
+
|
| 267 |
+
class ActorBlockPrefetcher(BlockPrefetcher):
    """Block prefetcher using a local actor.

    Prefetching works by passing the block refs as task arguments to a no-op
    actor method: Ray resolves the arguments, pulling the blocks to the
    actor's node.
    """

    def __init__(self):
        self.prefetch_actor = self._get_or_create_actor_prefetcher()

    @staticmethod
    def _get_or_create_actor_prefetcher() -> "ActorHandle":
        """Get or create the per-node named prefetcher actor.

        The actor is pinned to the current node (soft=False) so that blocks
        are fetched to where this iterator runs.
        """
        node_id = ray.get_runtime_context().get_node_id()
        # One named actor per node, shared across datasets via get_if_exists.
        actor_name = f"dataset-block-prefetcher-{node_id}"
        return _BlockPretcher.options(
            scheduling_strategy=NodeAffinitySchedulingStrategy(node_id, soft=False),
            name=actor_name,
            namespace=PREFETCHER_ACTOR_NAMESPACE,
            get_if_exists=True,
        ).remote()

    def prefetch_blocks(self, blocks: List[ObjectRef[Block]]):
        # Fire-and-forget: the remote call's argument resolution performs the
        # actual data transfer.
        self.prefetch_actor.prefetch.remote(*blocks)
|
| 286 |
+
|
| 287 |
+
|
| 288 |
+
@ray.remote(num_cpus=0)
class _BlockPretcher:
    """Helper actor that prefetches blocks asynchronously.

    NOTE(review): the class name is missing an "fe" ("Pretcher" vs
    "Prefetcher"); it is referenced by ActorBlockPrefetcher, so renaming it
    would require updating that call site too.
    """

    def prefetch(self, *blocks) -> None:
        # Intentionally a no-op: Ray resolves the block arguments before
        # invoking this method, which is what pulls them to this node.
        pass
|
.venv/lib/python3.11/site-packages/ray/data/_internal/execution/__init__.py
ADDED
|
File without changes
|
.venv/lib/python3.11/site-packages/ray/data/_internal/execution/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (201 Bytes). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/data/_internal/execution/autoscaler/__init__.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .autoscaler import Autoscaler
|
| 2 |
+
from .autoscaling_actor_pool import AutoscalingActorPool
|
| 3 |
+
from .default_autoscaler import DefaultAutoscaler
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
def create_autoscaler(topology, resource_manager, execution_id):
    """Create the default autoscaler for a streaming execution.

    Args:
        topology: The streaming executor's operator topology.
        resource_manager: The execution's resource manager.
        execution_id: Unique id of this execution.

    Returns:
        A DefaultAutoscaler instance for the given execution.
    """
    return DefaultAutoscaler(topology, resource_manager, execution_id)
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
__all__ = [
|
| 11 |
+
"Autoscaler",
|
| 12 |
+
"DefaultAutoscaler",
|
| 13 |
+
"create_autoscaler",
|
| 14 |
+
"AutoscalingActorPool",
|
| 15 |
+
]
|
.venv/lib/python3.11/site-packages/ray/data/_internal/execution/autoscaler/__pycache__/autoscaling_actor_pool.cpython-311.pyc
ADDED
|
Binary file (4.8 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/data/_internal/execution/autoscaler/autoscaler.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from abc import ABC, abstractmethod
|
| 2 |
+
from typing import TYPE_CHECKING
|
| 3 |
+
|
| 4 |
+
from ray.data._internal.execution.interfaces.execution_options import ExecutionResources
|
| 5 |
+
from ray.util.annotations import DeveloperAPI
|
| 6 |
+
|
| 7 |
+
if TYPE_CHECKING:
|
| 8 |
+
from ray.data._internal.execution.resource_manager import ResourceManager
|
| 9 |
+
from ray.data._internal.execution.streaming_executor_state import Topology
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
@DeveloperAPI
class Autoscaler(ABC):
    """Abstract interface for Ray Data autoscaler."""

    def __init__(
        self,
        topology: "Topology",
        resource_manager: "ResourceManager",
        execution_id: str,
    ):
        # Operator topology of the streaming execution being autoscaled.
        self._topology = topology
        # Resource manager for the execution.
        self._resource_manager = resource_manager
        # Unique id of this execution, used to key autoscaling requests.
        self._execution_id = execution_id

    @abstractmethod
    def try_trigger_scaling(self):
        """Try trigger autoscaling.

        This method will be called each time when StreamingExecutor makes
        a scheduling decision. A subclass should override this method to
        handle the autoscaling of both the cluster and `AutoscalingActorPool`s.
        """
        ...

    @abstractmethod
    def on_executor_shutdown(self):
        """Callback when the StreamingExecutor is shutting down."""
        ...

    @abstractmethod
    def get_total_resources(self) -> ExecutionResources:
        """Get the total resources that are available to this data execution."""
        ...
|
.venv/lib/python3.11/site-packages/ray/data/_internal/execution/autoscaler/autoscaling_actor_pool.py
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from abc import ABC, abstractmethod
|
| 2 |
+
|
| 3 |
+
from ray.data._internal.execution.interfaces.execution_options import ExecutionResources
|
| 4 |
+
from ray.util.annotations import DeveloperAPI
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
@DeveloperAPI
class AutoscalingActorPool(ABC):
    """Abstract interface of an autoscaling actor pool.

    A `PhysicalOperator` can manage one or more `AutoscalingActorPool`s.
    `Autoscaler` is responsible for deciding autoscaling of these actor
    pools.
    """

    @abstractmethod
    def min_size(self) -> int:
        """Min size of the actor pool."""
        ...

    @abstractmethod
    def max_size(self) -> int:
        """Max size of the actor pool."""
        ...

    @abstractmethod
    def current_size(self) -> int:
        """Current size of the actor pool."""
        ...

    @abstractmethod
    def num_running_actors(self) -> int:
        """Number of running actors."""
        ...

    @abstractmethod
    def num_active_actors(self) -> int:
        """Number of actors with at least one active task."""
        ...

    @abstractmethod
    def num_pending_actors(self) -> int:
        """Number of actors pending creation."""
        ...

    @abstractmethod
    def max_tasks_in_flight_per_actor(self) -> int:
        """Max number of in-flight tasks per actor."""
        ...

    @abstractmethod
    def current_in_flight_tasks(self) -> int:
        """Number of current in-flight tasks."""
        ...

    def num_total_task_slots(self) -> int:
        """Total number of task slots."""
        return self.max_tasks_in_flight_per_actor() * self.current_size()

    def num_free_task_slots(self) -> int:
        """Number of free slots to run tasks.

        Delegates to `num_total_task_slots` instead of recomputing the
        product, so the two stay consistent (also for subclasses that
        override `num_total_task_slots`).
        """
        return self.num_total_task_slots() - self.current_in_flight_tasks()

    @abstractmethod
    def scale_up(self, num_actors: int) -> int:
        """Request the actor pool to scale up by the given number of actors.

        The number of actually added actors may be less than the requested
        number.

        Returns:
            The number of actors actually added.
        """
        ...

    @abstractmethod
    def scale_down(self, num_actors: int) -> int:
        """Request actor pool to scale down by the given number of actors.

        The number of actually removed actors may be less than the requested
        number.

        Returns:
            The number of actors actually removed.
        """
        ...

    @abstractmethod
    def per_actor_resource_usage(self) -> ExecutionResources:
        """Per actor resource usage."""
        ...
|
.venv/lib/python3.11/site-packages/ray/data/_internal/execution/autoscaler/default_autoscaler.py
ADDED
|
@@ -0,0 +1,188 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import math
|
| 2 |
+
import time
|
| 3 |
+
from typing import TYPE_CHECKING, Dict
|
| 4 |
+
|
| 5 |
+
import ray
|
| 6 |
+
from .autoscaler import Autoscaler
|
| 7 |
+
from .autoscaling_actor_pool import AutoscalingActorPool
|
| 8 |
+
from ray.data._internal.execution.autoscaling_requester import (
|
| 9 |
+
get_or_create_autoscaling_requester_actor,
|
| 10 |
+
)
|
| 11 |
+
from ray.data._internal.execution.interfaces.execution_options import ExecutionResources
|
| 12 |
+
|
| 13 |
+
if TYPE_CHECKING:
|
| 14 |
+
from ray.data._internal.execution.interfaces import PhysicalOperator
|
| 15 |
+
from ray.data._internal.execution.resource_manager import ResourceManager
|
| 16 |
+
from ray.data._internal.execution.streaming_executor_state import OpState, Topology
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
class DefaultAutoscaler(Autoscaler):
    """Default autoscaler: scales actor pools by utilization thresholds and
    requests cluster scale-up via Ray's autoscaler when no op is runnable."""

    # Default threshold of actor pool utilization to trigger scaling up.
    DEFAULT_ACTOR_POOL_SCALING_UP_THRESHOLD: float = 0.8
    # Default threshold of actor pool utilization to trigger scaling down.
    DEFAULT_ACTOR_POOL_SCALING_DOWN_THRESHOLD: float = 0.5

    # Min number of seconds between two autoscaling requests.
    MIN_GAP_BETWEEN_AUTOSCALING_REQUESTS = 20

    def __init__(
        self,
        topology: "Topology",
        resource_manager: "ResourceManager",
        execution_id: str,
        actor_pool_scaling_up_threshold: float = DEFAULT_ACTOR_POOL_SCALING_UP_THRESHOLD,  # noqa: E501
        actor_pool_scaling_down_threshold: float = DEFAULT_ACTOR_POOL_SCALING_DOWN_THRESHOLD,  # noqa: E501
    ):
        self._actor_pool_scaling_up_threshold = actor_pool_scaling_up_threshold
        self._actor_pool_scaling_down_threshold = actor_pool_scaling_down_threshold
        # Last time when a request was sent to Ray's autoscaler.
        self._last_request_time = 0
        super().__init__(topology, resource_manager, execution_id)

    def try_trigger_scaling(self):
        self._try_scale_up_cluster()
        self._try_scale_up_or_down_actor_pool()

    def _calculate_actor_pool_util(self, actor_pool: AutoscalingActorPool):
        """Calculate the utilization of the given actor pool."""
        # Guard against division by zero for an empty pool.
        if actor_pool.current_size() == 0:
            return 0
        else:
            return actor_pool.num_active_actors() / actor_pool.current_size()

    def _actor_pool_should_scale_up(
        self,
        actor_pool: AutoscalingActorPool,
        op: "PhysicalOperator",
        op_state: "OpState",
    ):
        # Do not scale up, if the op is completed or no more inputs are coming.
        if op.completed() or (op._inputs_complete and op.internal_queue_size() == 0):
            return False
        if actor_pool.current_size() < actor_pool.min_size():
            # Scale up, if the actor pool is below min size.
            return True
        elif actor_pool.current_size() >= actor_pool.max_size():
            # Do not scale up, if the actor pool is already at max size.
            return False
        # Do not scale up, if the op does not have more resources.
        if not op_state._scheduling_status.under_resource_limits:
            return False
        # Do not scale up, if the op has enough free slots for the existing inputs.
        if op_state.num_queued() <= actor_pool.num_free_task_slots():
            return False
        # Determine whether to scale up based on the actor pool utilization.
        util = self._calculate_actor_pool_util(actor_pool)
        return util > self._actor_pool_scaling_up_threshold

    def _actor_pool_should_scale_down(
        self,
        actor_pool: AutoscalingActorPool,
        op: "PhysicalOperator",
    ):
        # Scale down, if the op is completed or no more inputs are coming.
        if op.completed() or (op._inputs_complete and op.internal_queue_size() == 0):
            return True
        if actor_pool.current_size() > actor_pool.max_size():
            # Scale down, if the actor pool is above max size.
            return True
        elif actor_pool.current_size() <= actor_pool.min_size():
            # Do not scale down, if the actor pool is already at min size.
            return False
        # Determine whether to scale down based on the actor pool utilization.
        util = self._calculate_actor_pool_util(actor_pool)
        return util < self._actor_pool_scaling_down_threshold

    def _try_scale_up_or_down_actor_pool(self):
        for op, state in self._topology.items():
            actor_pools = op.get_autoscaling_actor_pools()
            for actor_pool in actor_pools:
                # Repeatedly scale one actor at a time until the pool settles
                # (neither direction applies, or the pool refuses the change).
                while True:
                    # Try to scale up or down the actor pool.
                    should_scale_up = self._actor_pool_should_scale_up(
                        actor_pool,
                        op,
                        state,
                    )
                    should_scale_down = self._actor_pool_should_scale_down(
                        actor_pool, op
                    )
                    if should_scale_up and not should_scale_down:
                        if actor_pool.scale_up(1) == 0:
                            break
                    elif should_scale_down and not should_scale_up:
                        if actor_pool.scale_down(1) == 0:
                            break
                    else:
                        break

    def _try_scale_up_cluster(self):
        """Try to scale up the cluster to accommodate the provided in-progress workload.

        This makes a resource request to Ray's autoscaler consisting of the current,
        aggregate usage of all operators in the DAG + the incremental usage of all
        operators that are ready for dispatch (i.e. that have inputs queued). If the
        autoscaler were to grant this resource request, it would allow us to dispatch
        one task for every ready operator.

        Note that this resource request does not take the global resource limits or the
        liveness policy into account; it only tries to make the existing resource usage
        + one more task per ready operator feasible in the cluster.
        """
        # Limit the frequency of autoscaling requests.
        now = time.time()
        if now - self._last_request_time < self.MIN_GAP_BETWEEN_AUTOSCALING_REQUESTS:
            return

        # Scale up the cluster, if no ops are allowed to run, but there are still data
        # in the input queues.
        no_runnable_op = all(
            op_state._scheduling_status.runnable is False
            for _, op_state in self._topology.items()
        )
        any_has_input = any(
            op_state.num_queued() > 0 for _, op_state in self._topology.items()
        )
        if not (no_runnable_op and any_has_input):
            return

        self._last_request_time = now

        # Get resource usage for all ops + additional resources needed to launch one
        # more task for each ready op.
        resource_request = []

        def to_bundle(resource: ExecutionResources) -> Dict:
            # Convert an ExecutionResources into an autoscaler bundle dict,
            # rounding fractional CPU/GPU up to whole units.
            req = {}
            if resource.cpu:
                req["CPU"] = math.ceil(resource.cpu)
            if resource.gpu:
                req["GPU"] = math.ceil(resource.gpu)
            return req

        for op, state in self._topology.items():
            per_task_resource = op.incremental_resource_usage()
            task_bundle = to_bundle(per_task_resource)
            resource_request.extend([task_bundle] * op.num_active_tasks())
            # Only include incremental resource usage for ops that are ready for
            # dispatch.
            if state.num_queued() > 0:
                # TODO(Clark): Scale up more aggressively by adding incremental resource
                # usage for more than one bundle in the queue for this op?
                resource_request.append(task_bundle)

        self._send_resource_request(resource_request)

    def _send_resource_request(self, resource_request):
        # Make autoscaler resource request.
        actor = get_or_create_autoscaling_requester_actor()
        actor.request_resources.remote(resource_request, self._execution_id)

    def on_executor_shutdown(self):
        # Make request for zero resources to autoscaler for this execution.
        actor = get_or_create_autoscaling_requester_actor()
        actor.request_resources.remote({}, self._execution_id)

    def get_total_resources(self) -> ExecutionResources:
        return ExecutionResources.from_resource_dict(ray.cluster_resources())
|
.venv/lib/python3.11/site-packages/ray/data/_internal/execution/autoscaling_requester.py
ADDED
|
@@ -0,0 +1,131 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import math
|
| 2 |
+
import threading
|
| 3 |
+
import time
|
| 4 |
+
from typing import Dict, List
|
| 5 |
+
|
| 6 |
+
import ray
|
| 7 |
+
from ray.data.context import DataContext
|
| 8 |
+
from ray.util.scheduling_strategies import NodeAffinitySchedulingStrategy
|
| 9 |
+
|
| 10 |
+
# Resource requests are considered stale after this number of seconds, and
|
| 11 |
+
# will be purged.
|
| 12 |
+
RESOURCE_REQUEST_TIMEOUT = 60
|
| 13 |
+
PURGE_INTERVAL = RESOURCE_REQUEST_TIMEOUT * 2
|
| 14 |
+
|
| 15 |
+
# When the autoscaling is driven by memory pressure and there are abundant
|
| 16 |
+
# CPUs to support incremental CPUs needed to launch more tasks, we'll translate
|
| 17 |
+
# memory pressure into an artificial request of CPUs. The amount of CPUs we'll
|
| 18 |
+
# request is ARTIFICIAL_CPU_SCALING_FACTOR * ray.cluster_resources()["CPU"].
|
| 19 |
+
ARTIFICIAL_CPU_SCALING_FACTOR = 1.2
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
@ray.remote(num_cpus=0, max_restarts=-1, max_task_retries=-1)
class AutoscalingRequester:
    """Actor to make resource requests to autoscaler for the datasets.

    The resource requests are set to timeout after RESOURCE_REQUEST_TIMEOUT seconds.
    For those live requests, we keep track of the last request made for each execution,
    which overrides all previous requests it made; then sum the requested amounts
    across all executions as the final request to the autoscaler.
    """

    def __init__(self):
        # execution_id -> (List[Dict], expiration timestamp)
        self._resource_requests = {}
        # TTL for requests.
        self._timeout = RESOURCE_REQUEST_TIMEOUT

        # Handle to this actor itself, used by the purge thread below.
        self._self_handle = ray.get_runtime_context().current_actor

        # Start a thread to purge expired requests periodically.
        def purge_thread_run():
            while True:
                time.sleep(PURGE_INTERVAL)
                # Call purge_expired_requests() as an actor task,
                # so we don't need to handle multi-threading.
                ray.get(self._self_handle.purge_expired_requests.remote())

        self._purge_thread = threading.Thread(target=purge_thread_run, daemon=True)
        self._purge_thread.start()

    def purge_expired_requests(self):
        """Drop stale requests and re-submit the reduced aggregate request."""
        self._purge()
        ray.autoscaler.sdk.request_resources(bundles=self._aggregate_requests())

    def request_resources(self, req: List[Dict], execution_id: str):
        """Record the latest resource request for an execution and forward the
        aggregate of all live requests to Ray's autoscaler."""
        # Purge expired requests before making request to autoscaler.
        self._purge()
        # For the same execution_id, we track the latest resource request and
        # its expiration timestamp.
        self._resource_requests[execution_id] = (
            req,
            time.time() + self._timeout,
        )
        # We aggregate the resource requests across all execution_id's to Ray
        # autoscaler.
        ray.autoscaler.sdk.request_resources(bundles=self._aggregate_requests())

    def _purge(self):
        # Purge requests that are stale.
        now = time.time()
        for k, (_, t) in list(self._resource_requests.items()):
            if t < now:
                self._resource_requests.pop(k)

    def _aggregate_requests(self) -> List[Dict]:
        req = []
        for _, (r, _) in self._resource_requests.items():
            req.extend(r)

        def get_cpus(req):
            # Sum CPU amounts across all bundles.
            num_cpus = 0
            for r in req:
                if "CPU" in r:
                    num_cpus += r["CPU"]
            return num_cpus

        # Round up CPUs to exceed total cluster CPUs so it can actually upscale.
        # This is to handle the issue where the autoscaling is driven by memory
        # pressure (rather than CPUs) from streaming executor. In such case, simply
        # asking for incremental CPUs (e.g. 1 CPU for each ready operator) may not
        # actually be able to trigger autoscaling if existing CPUs in cluster can
        # already satisfy the incremental CPUs request.
        num_cpus = get_cpus(req)
        if num_cpus > 0:
            total = ray.cluster_resources()
            if "CPU" in total and num_cpus <= total["CPU"]:
                delta = (
                    math.ceil(ARTIFICIAL_CPU_SCALING_FACTOR * total["CPU"]) - num_cpus
                )
                req.extend([{"CPU": 1}] * delta)

        return req

    def _test_set_timeout(self, ttl):
        """Set the timeout. This is for test only"""
        self._timeout = ttl
|
| 107 |
+
|
| 108 |
+
|
| 109 |
+
# Creating/getting an actor from multiple threads is not safe.
|
| 110 |
+
# https://github.com/ray-project/ray/issues/41324
|
| 111 |
+
_autoscaling_requester_lock: threading.RLock = threading.RLock()
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
def get_or_create_autoscaling_requester_actor():
    """Get or create the singleton, detached AutoscalingRequester actor.

    Returns:
        An actor handle to the named AutoscalingRequester.
    """
    ctx = DataContext.get_current()
    # NOTE(review): this assignment is immediately overwritten below and `ctx`
    # is otherwise unused — looks like dead code; confirm before removing in
    # case DataContext.get_current() is relied on for its side effects.
    scheduling_strategy = ctx.scheduling_strategy
    # Pin the stats actor to the local node so it fate-shares with the driver.
    # Note: for Ray Client, the ray.get_runtime_context().get_node_id() should
    # point to the head node.
    scheduling_strategy = NodeAffinitySchedulingStrategy(
        ray.get_runtime_context().get_node_id(),
        soft=False,
    )
    # Serialize creation/lookup: getting an actor from multiple threads is
    # not safe (see the lock's definition above this function).
    with _autoscaling_requester_lock:
        return AutoscalingRequester.options(
            name="AutoscalingRequester",
            namespace="AutoscalingRequester",
            get_if_exists=True,
            lifetime="detached",
            scheduling_strategy=scheduling_strategy,
        ).remote()
|
.venv/lib/python3.11/site-packages/ray/data/_internal/execution/backpressure_policy/__init__.py
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import TYPE_CHECKING
|
| 2 |
+
|
| 3 |
+
import ray
|
| 4 |
+
from .backpressure_policy import BackpressurePolicy
|
| 5 |
+
from .concurrency_cap_backpressure_policy import ConcurrencyCapBackpressurePolicy
|
| 6 |
+
|
| 7 |
+
if TYPE_CHECKING:
|
| 8 |
+
from ray.data._internal.execution.streaming_executor_state import Topology
|
| 9 |
+
|
| 10 |
+
# Default enabled backpressure policies and its config key.
|
| 11 |
+
# Use `DataContext.set_config` to config it.
|
| 12 |
+
ENABLED_BACKPRESSURE_POLICIES = [
|
| 13 |
+
ConcurrencyCapBackpressurePolicy,
|
| 14 |
+
]
|
| 15 |
+
ENABLED_BACKPRESSURE_POLICIES_CONFIG_KEY = "backpressure_policies.enabled"
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def get_backpressure_policies(topology: "Topology"):
|
| 19 |
+
data_context = ray.data.DataContext.get_current()
|
| 20 |
+
policies = data_context.get_config(
|
| 21 |
+
ENABLED_BACKPRESSURE_POLICIES_CONFIG_KEY, ENABLED_BACKPRESSURE_POLICIES
|
| 22 |
+
)
|
| 23 |
+
|
| 24 |
+
return [policy(topology) for policy in policies]
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
__all__ = [
|
| 28 |
+
"BackpressurePolicy",
|
| 29 |
+
"ConcurrencyCapBackpressurePolicy",
|
| 30 |
+
"ENABLED_BACKPRESSURE_POLICIES_CONFIG_KEY",
|
| 31 |
+
"get_backpressure_policies",
|
| 32 |
+
]
|
.venv/lib/python3.11/site-packages/ray/data/_internal/execution/backpressure_policy/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (1.5 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/data/_internal/execution/backpressure_policy/__pycache__/backpressure_policy.cpython-311.pyc
ADDED
|
Binary file (1.81 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/data/_internal/execution/backpressure_policy/__pycache__/concurrency_cap_backpressure_policy.cpython-311.pyc
ADDED
|
Binary file (2.68 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/data/_internal/execution/backpressure_policy/backpressure_policy.py
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from abc import ABC, abstractmethod
|
| 2 |
+
from typing import TYPE_CHECKING
|
| 3 |
+
|
| 4 |
+
if TYPE_CHECKING:
|
| 5 |
+
from ray.data._internal.execution.interfaces.physical_operator import (
|
| 6 |
+
PhysicalOperator,
|
| 7 |
+
)
|
| 8 |
+
from ray.data._internal.execution.streaming_executor_state import Topology
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class BackpressurePolicy(ABC):
|
| 12 |
+
"""Interface for back pressure policies."""
|
| 13 |
+
|
| 14 |
+
@abstractmethod
|
| 15 |
+
def __init__(self, topology: "Topology"):
|
| 16 |
+
...
|
| 17 |
+
|
| 18 |
+
def can_add_input(self, op: "PhysicalOperator") -> bool:
|
| 19 |
+
"""Determine if we can add a new input to the operator. If returns False, the
|
| 20 |
+
operator will be backpressured and will not be able to run new tasks.
|
| 21 |
+
Used in `streaming_executor_state.py::select_operator_to_run()`.
|
| 22 |
+
|
| 23 |
+
Returns: True if we can add a new input to the operator, False otherwise.
|
| 24 |
+
|
| 25 |
+
Note, if multiple backpressure policies are enabled, the operator will be
|
| 26 |
+
backpressured if any of the policies returns False.
|
| 27 |
+
"""
|
| 28 |
+
return True
|
.venv/lib/python3.11/site-packages/ray/data/_internal/execution/backpressure_policy/concurrency_cap_backpressure_policy.py
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
from typing import TYPE_CHECKING
|
| 3 |
+
|
| 4 |
+
from .backpressure_policy import BackpressurePolicy
|
| 5 |
+
from ray.data._internal.execution.operators.task_pool_map_operator import (
|
| 6 |
+
TaskPoolMapOperator,
|
| 7 |
+
)
|
| 8 |
+
|
| 9 |
+
if TYPE_CHECKING:
|
| 10 |
+
from ray.data._internal.execution.interfaces.physical_operator import (
|
| 11 |
+
PhysicalOperator,
|
| 12 |
+
)
|
| 13 |
+
from ray.data._internal.execution.streaming_executor_state import Topology
|
| 14 |
+
|
| 15 |
+
logger = logging.getLogger(__name__)
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
class ConcurrencyCapBackpressurePolicy(BackpressurePolicy):
|
| 19 |
+
"""A backpressure policy that caps the concurrency of each operator.
|
| 20 |
+
|
| 21 |
+
The policy will limit the number of concurrently running tasks based on its
|
| 22 |
+
concurrency cap parameter.
|
| 23 |
+
|
| 24 |
+
NOTE: Only support setting concurrency cap for `TaskPoolMapOperator` for now.
|
| 25 |
+
TODO(chengsu): Consolidate with actor scaling logic of `ActorPoolMapOperator`.
|
| 26 |
+
"""
|
| 27 |
+
|
| 28 |
+
def __init__(self, topology: "Topology"):
|
| 29 |
+
self._concurrency_caps: dict["PhysicalOperator", float] = {}
|
| 30 |
+
|
| 31 |
+
for op, _ in topology.items():
|
| 32 |
+
if isinstance(op, TaskPoolMapOperator) and op.get_concurrency() is not None:
|
| 33 |
+
self._concurrency_caps[op] = op.get_concurrency()
|
| 34 |
+
else:
|
| 35 |
+
self._concurrency_caps[op] = float("inf")
|
| 36 |
+
|
| 37 |
+
logger.debug(
|
| 38 |
+
"ConcurrencyCapBackpressurePolicy initialized with: "
|
| 39 |
+
f"{self._concurrency_caps}"
|
| 40 |
+
)
|
| 41 |
+
|
| 42 |
+
def can_add_input(self, op: "PhysicalOperator") -> bool:
|
| 43 |
+
return op.metrics.num_tasks_running < self._concurrency_caps[op]
|
.venv/lib/python3.11/site-packages/ray/data/_internal/execution/bundle_queue/__init__.py
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .bundle_queue import BundleQueue
|
| 2 |
+
from .fifo_bundle_queue import FIFOBundleQueue
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
def create_bundle_queue() -> BundleQueue:
|
| 6 |
+
return FIFOBundleQueue()
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
__all__ = ["BundleQueue", "create_bundle_queue"]
|
.venv/lib/python3.11/site-packages/ray/data/_internal/execution/bundle_queue/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (572 Bytes). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/data/_internal/execution/bundle_queue/__pycache__/bundle_queue.cpython-311.pyc
ADDED
|
Binary file (3.06 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/data/_internal/execution/bundle_queue/__pycache__/fifo_bundle_queue.cpython-311.pyc
ADDED
|
Binary file (5.87 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/data/_internal/execution/bundle_queue/bundle_queue.py
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import abc
|
| 2 |
+
from typing import TYPE_CHECKING, Optional
|
| 3 |
+
|
| 4 |
+
if TYPE_CHECKING:
|
| 5 |
+
from ray.data._internal.execution.interfaces import RefBundle
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class BundleQueue(abc.ABC):
|
| 9 |
+
@abc.abstractmethod
|
| 10 |
+
def __len__(self) -> int:
|
| 11 |
+
"""Return the number of bundles in the queue."""
|
| 12 |
+
...
|
| 13 |
+
|
| 14 |
+
@abc.abstractmethod
|
| 15 |
+
def __contains__(self, bundle: "RefBundle") -> bool:
|
| 16 |
+
"""Return whether the bundle is in the queue."""
|
| 17 |
+
...
|
| 18 |
+
|
| 19 |
+
@abc.abstractmethod
|
| 20 |
+
def add(self, bundle: "RefBundle") -> None:
|
| 21 |
+
"""Add a bundle to the queue."""
|
| 22 |
+
...
|
| 23 |
+
|
| 24 |
+
@abc.abstractmethod
|
| 25 |
+
def pop(self) -> "RefBundle":
|
| 26 |
+
"""Remove and return the head of the queue.
|
| 27 |
+
|
| 28 |
+
Raises:
|
| 29 |
+
IndexError: If the queue is empty.
|
| 30 |
+
"""
|
| 31 |
+
...
|
| 32 |
+
|
| 33 |
+
@abc.abstractmethod
|
| 34 |
+
def peek(self) -> Optional["RefBundle"]:
|
| 35 |
+
"""Return the head of the queue without removing it.
|
| 36 |
+
|
| 37 |
+
If the queue is empty, return `None`.
|
| 38 |
+
"""
|
| 39 |
+
...
|
| 40 |
+
|
| 41 |
+
@abc.abstractmethod
|
| 42 |
+
def remove(self, bundle: "RefBundle"):
|
| 43 |
+
"""Remove a bundle from the queue."""
|
| 44 |
+
...
|
| 45 |
+
|
| 46 |
+
@abc.abstractmethod
|
| 47 |
+
def clear(self):
|
| 48 |
+
"""Remove all bundles from the queue."""
|
| 49 |
+
...
|
| 50 |
+
|
| 51 |
+
@abc.abstractmethod
|
| 52 |
+
def estimate_size_bytes(self) -> int:
|
| 53 |
+
"""Return an estimate of the total size of objects in the queue."""
|
| 54 |
+
...
|
| 55 |
+
|
| 56 |
+
@abc.abstractmethod
|
| 57 |
+
def is_empty(self):
|
| 58 |
+
"""Return whether this queue and all of its internal data structures are empty.
|
| 59 |
+
|
| 60 |
+
This method is used for testing.
|
| 61 |
+
"""
|
| 62 |
+
...
|
.venv/lib/python3.11/site-packages/ray/data/_internal/execution/execution_callback.py
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import List
|
| 2 |
+
|
| 3 |
+
from ray.data.context import DataContext
|
| 4 |
+
|
| 5 |
+
EXECUTION_CALLBACKS_CONFIG_KEY = "execution_callbacks"
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class ExecutionCallback:
|
| 9 |
+
"""Callback interface for execution events."""
|
| 10 |
+
|
| 11 |
+
def before_execution_starts(self):
|
| 12 |
+
"""Called before the Dataset execution starts."""
|
| 13 |
+
...
|
| 14 |
+
|
| 15 |
+
def after_execution_succeeds(self):
|
| 16 |
+
"""Called after the Dataset execution succeeds."""
|
| 17 |
+
...
|
| 18 |
+
|
| 19 |
+
def after_execution_fails(self, error: Exception):
|
| 20 |
+
"""Called after the Dataset execution fails."""
|
| 21 |
+
...
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def get_execution_callbacks(context: DataContext) -> List[ExecutionCallback]:
|
| 25 |
+
"""Get all ExecutionCallbacks from the DataContext."""
|
| 26 |
+
return context.get_config(EXECUTION_CALLBACKS_CONFIG_KEY, [])
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def add_execution_callback(callback: ExecutionCallback, context: DataContext):
|
| 30 |
+
"""Add an ExecutionCallback to the DataContext."""
|
| 31 |
+
execution_callbacks = context.get_config(EXECUTION_CALLBACKS_CONFIG_KEY, [])
|
| 32 |
+
execution_callbacks.append(callback)
|
| 33 |
+
context.set_config(EXECUTION_CALLBACKS_CONFIG_KEY, execution_callbacks)
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def remove_execution_callback(callback: ExecutionCallback, context: DataContext):
|
| 37 |
+
"""Remove an ExecutionCallback from the DataContext."""
|
| 38 |
+
execution_callbacks = context.get_config(EXECUTION_CALLBACKS_CONFIG_KEY, [])
|
| 39 |
+
execution_callbacks.remove(callback)
|
| 40 |
+
context.set_config(EXECUTION_CALLBACKS_CONFIG_KEY, execution_callbacks)
|
.venv/lib/python3.11/site-packages/ray/data/_internal/execution/interfaces/__init__.py
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .common import NodeIdStr
|
| 2 |
+
from .execution_options import ExecutionOptions, ExecutionResources
|
| 3 |
+
from .executor import Executor, OutputIterator
|
| 4 |
+
from .physical_operator import PhysicalOperator
|
| 5 |
+
from .ref_bundle import RefBundle
|
| 6 |
+
from .task_context import TaskContext
|
| 7 |
+
from .transform_fn import AllToAllTransformFn
|
| 8 |
+
|
| 9 |
+
__all__ = [
|
| 10 |
+
"AllToAllTransformFn",
|
| 11 |
+
"ExecutionOptions",
|
| 12 |
+
"ExecutionResources",
|
| 13 |
+
"Executor",
|
| 14 |
+
"NodeIdStr",
|
| 15 |
+
"OutputIterator",
|
| 16 |
+
"PhysicalOperator",
|
| 17 |
+
"RefBundle",
|
| 18 |
+
"TaskContext",
|
| 19 |
+
]
|
.venv/lib/python3.11/site-packages/ray/data/_internal/execution/interfaces/common.py
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Node id string returned by `ray.get_runtime_context().get_node_id()`.
|
| 2 |
+
NodeIdStr = str
|
.venv/lib/python3.11/site-packages/ray/data/_internal/execution/interfaces/execution_options.py
ADDED
|
@@ -0,0 +1,300 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from typing import Dict, List, Optional, Union
|
| 3 |
+
|
| 4 |
+
from .common import NodeIdStr
|
| 5 |
+
from ray.data._internal.execution.util import memory_string
|
| 6 |
+
from ray.util.annotations import DeveloperAPI
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class ExecutionResources:
|
| 10 |
+
"""Specifies resources usage or resource limits for execution.
|
| 11 |
+
|
| 12 |
+
By default this class represents resource usage. Use `for_limits` or
|
| 13 |
+
set `default_to_inf` to True to create an object that represents resource limits.
|
| 14 |
+
"""
|
| 15 |
+
|
| 16 |
+
def __init__(
|
| 17 |
+
self,
|
| 18 |
+
cpu: Optional[float] = None,
|
| 19 |
+
gpu: Optional[float] = None,
|
| 20 |
+
object_store_memory: Optional[float] = None,
|
| 21 |
+
default_to_inf: bool = False,
|
| 22 |
+
):
|
| 23 |
+
"""Initializes ExecutionResources.
|
| 24 |
+
Args:
|
| 25 |
+
cpu: Amount of logical CPU slots.
|
| 26 |
+
gpu: Amount of logical GPU slots.
|
| 27 |
+
object_store_memory: Amount of object store memory.
|
| 28 |
+
default_to_inf: When the object represents resource usage, this flag
|
| 29 |
+
should be set to False. And missing values will default to 0.
|
| 30 |
+
When the object represents resource limits, this flag should be
|
| 31 |
+
set to True. And missing values will default to infinity.
|
| 32 |
+
"""
|
| 33 |
+
self._cpu = cpu
|
| 34 |
+
self._gpu = gpu
|
| 35 |
+
self._object_store_memory = object_store_memory
|
| 36 |
+
self._default_to_inf = default_to_inf
|
| 37 |
+
|
| 38 |
+
@classmethod
|
| 39 |
+
def from_resource_dict(
|
| 40 |
+
cls,
|
| 41 |
+
resource_dict: Dict[str, float],
|
| 42 |
+
default_to_inf: bool = False,
|
| 43 |
+
):
|
| 44 |
+
"""Create an ExecutionResources object from a resource dict."""
|
| 45 |
+
return ExecutionResources(
|
| 46 |
+
cpu=resource_dict.get("CPU", None),
|
| 47 |
+
gpu=resource_dict.get("GPU", None),
|
| 48 |
+
object_store_memory=resource_dict.get("object_store_memory", None),
|
| 49 |
+
default_to_inf=default_to_inf,
|
| 50 |
+
)
|
| 51 |
+
|
| 52 |
+
@classmethod
|
| 53 |
+
def for_limits(
|
| 54 |
+
cls,
|
| 55 |
+
cpu: Optional[float] = None,
|
| 56 |
+
gpu: Optional[float] = None,
|
| 57 |
+
object_store_memory: Optional[float] = None,
|
| 58 |
+
) -> "ExecutionResources":
|
| 59 |
+
"""Create an ExecutionResources object that represents resource limits.
|
| 60 |
+
Args:
|
| 61 |
+
cpu: Amount of logical CPU slots.
|
| 62 |
+
gpu: Amount of logical GPU slots.
|
| 63 |
+
object_store_memory: Amount of object store memory.
|
| 64 |
+
"""
|
| 65 |
+
return ExecutionResources(
|
| 66 |
+
cpu=cpu,
|
| 67 |
+
gpu=gpu,
|
| 68 |
+
object_store_memory=object_store_memory,
|
| 69 |
+
default_to_inf=True,
|
| 70 |
+
)
|
| 71 |
+
|
| 72 |
+
@property
|
| 73 |
+
def cpu(self) -> float:
|
| 74 |
+
if self._cpu is not None:
|
| 75 |
+
return self._cpu
|
| 76 |
+
return 0.0 if not self._default_to_inf else float("inf")
|
| 77 |
+
|
| 78 |
+
@cpu.setter
|
| 79 |
+
def cpu(self, value: float):
|
| 80 |
+
self._cpu = value
|
| 81 |
+
|
| 82 |
+
@property
|
| 83 |
+
def gpu(self) -> float:
|
| 84 |
+
if self._gpu is not None:
|
| 85 |
+
return self._gpu
|
| 86 |
+
return 0.0 if not self._default_to_inf else float("inf")
|
| 87 |
+
|
| 88 |
+
@gpu.setter
|
| 89 |
+
def gpu(self, value: float):
|
| 90 |
+
self._gpu = value
|
| 91 |
+
|
| 92 |
+
@property
|
| 93 |
+
def object_store_memory(self) -> float:
|
| 94 |
+
if self._object_store_memory is not None:
|
| 95 |
+
return self._object_store_memory
|
| 96 |
+
return 0.0 if not self._default_to_inf else float("inf")
|
| 97 |
+
|
| 98 |
+
@object_store_memory.setter
|
| 99 |
+
def object_store_memory(self, value: float):
|
| 100 |
+
self._object_store_memory = value
|
| 101 |
+
|
| 102 |
+
def __repr__(self):
|
| 103 |
+
return (
|
| 104 |
+
f"ExecutionResources(cpu={self.cpu:.1f}, gpu={self.gpu:.1f}, "
|
| 105 |
+
f"object_store_memory={self.object_store_memory_str()})"
|
| 106 |
+
)
|
| 107 |
+
|
| 108 |
+
def __eq__(self, other: "ExecutionResources") -> bool:
|
| 109 |
+
return (
|
| 110 |
+
self.cpu == other.cpu
|
| 111 |
+
and self.gpu == other.gpu
|
| 112 |
+
and self.object_store_memory == other.object_store_memory
|
| 113 |
+
)
|
| 114 |
+
|
| 115 |
+
@classmethod
|
| 116 |
+
def zero(cls) -> "ExecutionResources":
|
| 117 |
+
"""Returns an ExecutionResources object with zero resources."""
|
| 118 |
+
return ExecutionResources(0.0, 0.0, 0.0)
|
| 119 |
+
|
| 120 |
+
def is_zero(self) -> bool:
|
| 121 |
+
"""Returns True if all resources are zero."""
|
| 122 |
+
return self.cpu == 0.0 and self.gpu == 0.0 and self.object_store_memory == 0.0
|
| 123 |
+
|
| 124 |
+
def is_non_negative(self) -> bool:
|
| 125 |
+
"""Returns True if all resources are non-negative."""
|
| 126 |
+
return self.cpu >= 0 and self.gpu >= 0 and self.object_store_memory >= 0
|
| 127 |
+
|
| 128 |
+
def object_store_memory_str(self) -> str:
|
| 129 |
+
"""Returns a human-readable string for the object store memory field."""
|
| 130 |
+
if self.object_store_memory == float("inf"):
|
| 131 |
+
return "inf"
|
| 132 |
+
return memory_string(self.object_store_memory)
|
| 133 |
+
|
| 134 |
+
def copy(self) -> "ExecutionResources":
|
| 135 |
+
"""Returns a copy of this ExecutionResources object."""
|
| 136 |
+
return ExecutionResources(
|
| 137 |
+
self._cpu, self._gpu, self._object_store_memory, self._default_to_inf
|
| 138 |
+
)
|
| 139 |
+
|
| 140 |
+
def add(self, other: "ExecutionResources") -> "ExecutionResources":
|
| 141 |
+
"""Adds execution resources.
|
| 142 |
+
|
| 143 |
+
Returns:
|
| 144 |
+
A new ExecutionResource object with summed resources.
|
| 145 |
+
"""
|
| 146 |
+
return ExecutionResources(
|
| 147 |
+
self.cpu + other.cpu,
|
| 148 |
+
self.gpu + other.gpu,
|
| 149 |
+
self.object_store_memory + other.object_store_memory,
|
| 150 |
+
)
|
| 151 |
+
|
| 152 |
+
def subtract(self, other: "ExecutionResources") -> "ExecutionResources":
|
| 153 |
+
"""Subtracts execution resources.
|
| 154 |
+
|
| 155 |
+
Returns:
|
| 156 |
+
A new ExecutionResource object with subtracted resources.
|
| 157 |
+
"""
|
| 158 |
+
return ExecutionResources(
|
| 159 |
+
self.cpu - other.cpu,
|
| 160 |
+
self.gpu - other.gpu,
|
| 161 |
+
self.object_store_memory - other.object_store_memory,
|
| 162 |
+
)
|
| 163 |
+
|
| 164 |
+
def max(self, other: "ExecutionResources") -> "ExecutionResources":
|
| 165 |
+
"""Returns the maximum for each resource type."""
|
| 166 |
+
return ExecutionResources(
|
| 167 |
+
cpu=max(self.cpu, other.cpu),
|
| 168 |
+
gpu=max(self.gpu, other.gpu),
|
| 169 |
+
object_store_memory=max(
|
| 170 |
+
self.object_store_memory, other.object_store_memory
|
| 171 |
+
),
|
| 172 |
+
)
|
| 173 |
+
|
| 174 |
+
def min(self, other: "ExecutionResources") -> "ExecutionResources":
|
| 175 |
+
"""Returns the minimum for each resource type."""
|
| 176 |
+
return ExecutionResources(
|
| 177 |
+
cpu=min(self.cpu, other.cpu),
|
| 178 |
+
gpu=min(self.gpu, other.gpu),
|
| 179 |
+
object_store_memory=min(
|
| 180 |
+
self.object_store_memory, other.object_store_memory
|
| 181 |
+
),
|
| 182 |
+
)
|
| 183 |
+
|
| 184 |
+
def satisfies_limit(self, limit: "ExecutionResources") -> bool:
|
| 185 |
+
"""Return if this resource struct meets the specified limits.
|
| 186 |
+
|
| 187 |
+
Note that None for a field means no limit.
|
| 188 |
+
"""
|
| 189 |
+
return (
|
| 190 |
+
self.cpu <= limit.cpu
|
| 191 |
+
and self.gpu <= limit.gpu
|
| 192 |
+
and self.object_store_memory <= limit.object_store_memory
|
| 193 |
+
)
|
| 194 |
+
|
| 195 |
+
def scale(self, f: float) -> "ExecutionResources":
|
| 196 |
+
"""Return copy with all set values scaled by `f`."""
|
| 197 |
+
if f < 0:
|
| 198 |
+
raise ValueError("Scaling factor must be non-negative.")
|
| 199 |
+
if f == 0:
|
| 200 |
+
# Explicitly handle the zero case, because `0 * inf` is undefined.
|
| 201 |
+
return ExecutionResources.zero()
|
| 202 |
+
return ExecutionResources(
|
| 203 |
+
cpu=self.cpu * f,
|
| 204 |
+
gpu=self.gpu * f,
|
| 205 |
+
object_store_memory=self.object_store_memory * f,
|
| 206 |
+
)
|
| 207 |
+
|
| 208 |
+
|
| 209 |
+
@DeveloperAPI
|
| 210 |
+
class ExecutionOptions:
|
| 211 |
+
"""Common options for execution.
|
| 212 |
+
|
| 213 |
+
Some options may not be supported on all executors (e.g., resource limits).
|
| 214 |
+
|
| 215 |
+
Attributes:
|
| 216 |
+
resource_limits: Set a soft limit on the resource usage during execution.
|
| 217 |
+
Autodetected by default.
|
| 218 |
+
exclude_resources: Amount of resources to exclude from Ray Data.
|
| 219 |
+
Set this if you have other workloads running on the same cluster.
|
| 220 |
+
Note,
|
| 221 |
+
- If using Ray Data with Ray Train, training resources will be
|
| 222 |
+
automatically excluded.
|
| 223 |
+
- For each resource type, resource_limits and exclude_resources can
|
| 224 |
+
not be both set.
|
| 225 |
+
locality_with_output: Set this to prefer running tasks on the same node as the
|
| 226 |
+
output node (node driving the execution). It can also be set to a list of
|
| 227 |
+
node ids to spread the outputs across those nodes. Off by default.
|
| 228 |
+
preserve_order: Set this to preserve the ordering between blocks processed by
|
| 229 |
+
operators. Off by default.
|
| 230 |
+
actor_locality_enabled: Whether to enable locality-aware task dispatch to
|
| 231 |
+
actors (off by default). This parameter applies to both stateful map and
|
| 232 |
+
streaming_split operations.
|
| 233 |
+
verbose_progress: Whether to report progress individually per operator. By
|
| 234 |
+
default, only AllToAll operators and global progress is reported. This
|
| 235 |
+
option is useful for performance debugging. On by default.
|
| 236 |
+
"""
|
| 237 |
+
|
| 238 |
+
def __init__(
|
| 239 |
+
self,
|
| 240 |
+
resource_limits: Optional[ExecutionResources] = None,
|
| 241 |
+
exclude_resources: Optional[ExecutionResources] = None,
|
| 242 |
+
locality_with_output: Union[bool, List[NodeIdStr]] = False,
|
| 243 |
+
preserve_order: bool = False,
|
| 244 |
+
# TODO(hchen): Re-enable `actor_locality_enabled` by default after fixing
|
| 245 |
+
# https://github.com/ray-project/ray/issues/43466
|
| 246 |
+
actor_locality_enabled: bool = False,
|
| 247 |
+
verbose_progress: Optional[bool] = None,
|
| 248 |
+
):
|
| 249 |
+
if resource_limits is None:
|
| 250 |
+
resource_limits = ExecutionResources.for_limits()
|
| 251 |
+
self.resource_limits = resource_limits
|
| 252 |
+
if exclude_resources is None:
|
| 253 |
+
exclude_resources = ExecutionResources.zero()
|
| 254 |
+
self.exclude_resources = exclude_resources
|
| 255 |
+
self.locality_with_output = locality_with_output
|
| 256 |
+
self.preserve_order = preserve_order
|
| 257 |
+
self.actor_locality_enabled = actor_locality_enabled
|
| 258 |
+
if verbose_progress is None:
|
| 259 |
+
verbose_progress = bool(
|
| 260 |
+
int(os.environ.get("RAY_DATA_VERBOSE_PROGRESS", "1"))
|
| 261 |
+
)
|
| 262 |
+
self.verbose_progress = verbose_progress
|
| 263 |
+
|
| 264 |
+
def __repr__(self) -> str:
|
| 265 |
+
return (
|
| 266 |
+
f"ExecutionOptions(resource_limits={self.resource_limits}, "
|
| 267 |
+
f"exclude_resources={self.exclude_resources}, "
|
| 268 |
+
f"locality_with_output={self.locality_with_output}, "
|
| 269 |
+
f"preserve_order={self.preserve_order}, "
|
| 270 |
+
f"actor_locality_enabled={self.actor_locality_enabled}, "
|
| 271 |
+
f"verbose_progress={self.verbose_progress})"
|
| 272 |
+
)
|
| 273 |
+
|
| 274 |
+
@property
|
| 275 |
+
def resource_limits(self) -> ExecutionResources:
|
| 276 |
+
return self._resource_limits
|
| 277 |
+
|
| 278 |
+
@resource_limits.setter
|
| 279 |
+
def resource_limits(self, value: ExecutionResources) -> None:
|
| 280 |
+
self._resource_limits = ExecutionResources.for_limits(
|
| 281 |
+
cpu=value._cpu,
|
| 282 |
+
gpu=value._gpu,
|
| 283 |
+
object_store_memory=value._object_store_memory,
|
| 284 |
+
)
|
| 285 |
+
|
| 286 |
+
def is_resource_limits_default(self):
|
| 287 |
+
"""Returns True if resource_limits is the default value."""
|
| 288 |
+
return self._resource_limits == ExecutionResources.for_limits()
|
| 289 |
+
|
| 290 |
+
def validate(self) -> None:
|
| 291 |
+
"""Validate the options."""
|
| 292 |
+
for attr in ["cpu", "gpu", "object_store_memory"]:
|
| 293 |
+
if (
|
| 294 |
+
getattr(self.resource_limits, attr) != float("inf")
|
| 295 |
+
and getattr(self.exclude_resources, attr, 0) > 0
|
| 296 |
+
):
|
| 297 |
+
raise ValueError(
|
| 298 |
+
"resource_limits and exclude_resources cannot "
|
| 299 |
+
f" both be set for {attr} resource."
|
| 300 |
+
)
|
.venv/lib/python3.11/site-packages/ray/data/_internal/execution/interfaces/executor.py
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Iterable, Iterator, Optional
|
| 2 |
+
|
| 3 |
+
from .execution_options import ExecutionOptions
|
| 4 |
+
from .physical_operator import PhysicalOperator
|
| 5 |
+
from .ref_bundle import RefBundle
|
| 6 |
+
from ray.data._internal.stats import DatasetStats
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class OutputIterator(Iterator[RefBundle]):
|
| 10 |
+
"""Iterator used to access the output of an Executor execution.
|
| 11 |
+
|
| 12 |
+
This is a blocking iterator. Datasets guarantees that all its iterators are
|
| 13 |
+
thread-safe (i.e., multiple threads can block on them at the same time).
|
| 14 |
+
"""
|
| 15 |
+
|
| 16 |
+
def __init__(self, base: Iterable[RefBundle]):
|
| 17 |
+
self._it = iter(base)
|
| 18 |
+
|
| 19 |
+
def get_next(self, output_split_idx: Optional[int] = None) -> RefBundle:
|
| 20 |
+
"""Can be used to pull outputs by a specified output index.
|
| 21 |
+
|
| 22 |
+
This is used to support the streaming_split() API, where the output of a
|
| 23 |
+
streaming execution is to be consumed by multiple processes.
|
| 24 |
+
|
| 25 |
+
Args:
|
| 26 |
+
output_split_idx: The output split index to get results for. This arg is
|
| 27 |
+
only allowed for iterators created by `Dataset.streaming_split()`.
|
| 28 |
+
|
| 29 |
+
Raises:
|
| 30 |
+
StopIteration if there are no more outputs to return.
|
| 31 |
+
"""
|
| 32 |
+
if output_split_idx is not None:
|
| 33 |
+
raise NotImplementedError()
|
| 34 |
+
return next(self._it)
|
| 35 |
+
|
| 36 |
+
def __next__(self) -> RefBundle:
|
| 37 |
+
return self.get_next()
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
class Executor:
|
| 41 |
+
"""Abstract class for executors, which implement physical operator execution.
|
| 42 |
+
|
| 43 |
+
Subclasses:
|
| 44 |
+
StreamingExecutor
|
| 45 |
+
"""
|
| 46 |
+
|
| 47 |
+
def __init__(self, options: ExecutionOptions):
|
| 48 |
+
"""Create the executor."""
|
| 49 |
+
options.validate()
|
| 50 |
+
self._options = options
|
| 51 |
+
|
| 52 |
+
def execute(
|
| 53 |
+
self, dag: PhysicalOperator, initial_stats: Optional[DatasetStats] = None
|
| 54 |
+
) -> OutputIterator:
|
| 55 |
+
"""Start execution.
|
| 56 |
+
|
| 57 |
+
Args:
|
| 58 |
+
dag: The operator graph to execute.
|
| 59 |
+
initial_stats: The DatasetStats to prepend to the stats returned by the
|
| 60 |
+
executor. These stats represent actions done to compute inputs.
|
| 61 |
+
"""
|
| 62 |
+
raise NotImplementedError
|
| 63 |
+
|
| 64 |
+
def shutdown(self):
|
| 65 |
+
"""Shutdown an executor, which may still be running.
|
| 66 |
+
|
| 67 |
+
This should interrupt execution and clean up any used resources.
|
| 68 |
+
"""
|
| 69 |
+
pass
|
| 70 |
+
|
| 71 |
+
def get_stats(self) -> DatasetStats:
|
| 72 |
+
"""Return stats for the execution so far.
|
| 73 |
+
|
| 74 |
+
This is generally called after `execute` has completed, but may be called
|
| 75 |
+
while iterating over `execute` results for streaming execution.
|
| 76 |
+
"""
|
| 77 |
+
raise NotImplementedError
|
.venv/lib/python3.11/site-packages/ray/data/_internal/execution/interfaces/op_runtime_metrics.py
ADDED
|
@@ -0,0 +1,574 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import time
|
| 2 |
+
from dataclasses import Field, dataclass, field
|
| 3 |
+
from enum import Enum
|
| 4 |
+
from typing import TYPE_CHECKING, Any, Dict, List, Optional
|
| 5 |
+
|
| 6 |
+
import ray
|
| 7 |
+
from ray.data._internal.execution.bundle_queue import create_bundle_queue
|
| 8 |
+
from ray.data._internal.execution.interfaces.ref_bundle import RefBundle
|
| 9 |
+
from ray.data._internal.memory_tracing import trace_allocation
|
| 10 |
+
|
| 11 |
+
if TYPE_CHECKING:
|
| 12 |
+
from ray.data._internal.execution.interfaces.physical_operator import (
|
| 13 |
+
PhysicalOperator,
|
| 14 |
+
)
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
# A metadata key used to mark a dataclass field as a metric.
_IS_FIELD_METRIC_KEY = "__is_metric"
# Metadata keys used to store information about a metric.
_METRIC_FIELD_DESCRIPTION_KEY = "__metric_description"
_METRIC_FIELD_METRICS_GROUP_KEY = "__metric_metrics_group"
_METRIC_FIELD_IS_MAP_ONLY_KEY = "__metric_is_map_only"

# Global registry of every metric definition. Populated by `metric_property`
# (at decoration time) and by `OpRuntimesMetricsMeta` (when the
# `OpRuntimeMetrics` dataclass is created from its `metric_field`s).
_METRICS: List["MetricDefinition"] = []
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
class MetricsGroup(Enum):
    """Categories used to group related metrics.

    The group determines how metrics are organized in 'StatsActor' and on the
    Ray Data dashboard.
    """

    INPUTS = "inputs"
    OUTPUTS = "outputs"
    TASKS = "tasks"
    OBJECT_STORE_MEMORY = "object_store_memory"
    MISC = "misc"
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
@dataclass(frozen=True)
class MetricDefinition:
    """Immutable metadata describing a single runtime metric.

    Attributes:
        name: The name of the metric.
        description: A human-readable description of the metric, also used as
            the chart description on the Ray Data dashboard.
        metrics_group: The group of the metric, used to organize metrics into
            groups in 'StatsActor' and on the Ray Data dashboard.
        map_only: Whether the metric is only measured for 'MapOperators'.
    """

    name: str
    description: str
    metrics_group: str
    # TODO: Let's refactor this parameter so it isn't tightly coupled with a specific
    # operator type (MapOperator).
    map_only: bool = False
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
def metric_field(
    *,
    description: str,
    metrics_group: str,
    map_only: bool = False,
    **field_kwargs,
):
    """A dataclass field that represents a metric.

    Args:
        description: Human-readable description of the metric.
        metrics_group: Group the metric belongs to (see `MetricsGroup`).
        map_only: Whether the metric is only measured for 'MapOperators'.
        **field_kwargs: Extra keyword arguments forwarded to
            `dataclasses.field` (e.g. `default` or `default_factory`).

    Returns:
        A `dataclasses.field` whose metadata marks it as a metric so that
        `OpRuntimesMetricsMeta` can discover and register it.
    """
    # Pop (and copy) any caller-supplied metadata. Popping prevents `metadata`
    # from being passed twice to `field()` below (which would raise
    # `TypeError: multiple values for keyword argument 'metadata'`), and
    # copying avoids mutating the caller's dict.
    metadata = dict(field_kwargs.pop("metadata", {}))

    # Mark this field as a metric for discovery by the metaclass.
    metadata[_IS_FIELD_METRIC_KEY] = True

    metadata[_METRIC_FIELD_DESCRIPTION_KEY] = description
    metadata[_METRIC_FIELD_METRICS_GROUP_KEY] = metrics_group
    metadata[_METRIC_FIELD_IS_MAP_ONLY_KEY] = map_only

    return field(metadata=metadata, **field_kwargs)
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
def metric_property(
    *,
    description: str,
    metrics_group: str,
    map_only: bool = False,
):
    """Decorator that exposes a method as a metric-backed property.

    Registers a `MetricDefinition` (named after the wrapped function) in the
    global metric registry, then returns the function wrapped in `property`.

    Args:
        description: Human-readable description of the metric.
        metrics_group: Group the metric belongs to (see `MetricsGroup`).
        map_only: Whether the metric is only measured for 'MapOperators'.
    """

    def wrap(func):
        _METRICS.append(
            MetricDefinition(
                name=func.__name__,
                description=description,
                metrics_group=metrics_group,
                map_only=map_only,
            )
        )
        return property(func)

    return wrap
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
@dataclass
class RunningTaskInfo:
    """Per-task bookkeeping kept while a task is running on the operator."""

    # The input bundle the task was launched with.
    inputs: RefBundle
    # Number of output blocks the task has generated so far.
    num_outputs: int
    # Total byte size of outputs the task has generated so far.
    bytes_outputs: int
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
class OpRuntimesMetricsMeta(type):
    """Metaclass that registers `metric_field` dataclass fields as metrics.

    NOTE: `Field.name` isn't set until the dataclass is created, so we can't
    create the metrics in `metric_field` directly.
    """

    def __init__(cls, name, bases, namespace):
        # Renamed from `dict` to avoid shadowing the builtin.
        super().__init__(name, bases, namespace)

        # Iterate over the attributes and methods of 'OpRuntimeMetrics'.
        for attr_name, attr_value in namespace.items():
            # If an attribute is a dataclass field and has _IS_FIELD_METRIC_KEY in
            # its metadata, then create a metric from the field metadata and add it
            # to the list of metrics. See also the 'metric_field' function.
            if isinstance(attr_value, Field) and attr_value.metadata.get(
                _IS_FIELD_METRIC_KEY
            ):
                metric = MetricDefinition(
                    name=attr_name,
                    description=attr_value.metadata[_METRIC_FIELD_DESCRIPTION_KEY],
                    metrics_group=attr_value.metadata[_METRIC_FIELD_METRICS_GROUP_KEY],
                    map_only=attr_value.metadata[_METRIC_FIELD_IS_MAP_ONLY_KEY],
                )
                _METRICS.append(metric)
|
| 124 |
+
|
| 125 |
+
|
| 126 |
+
@dataclass
class OpRuntimeMetrics(metaclass=OpRuntimesMetricsMeta):
    """Runtime metrics for a 'PhysicalOperator'.

    Metrics are updated dynamically during the execution of the Dataset.
    This class can be used for either observability or scheduling purposes.

    DO NOT modify the fields of this class directly. Instead, use the provided
    callback methods.
    """

    # TODO(hchen): Fields tagged with "map_only" currently only work for MapOperator.
    # We should make them work for all operators by unifying the task execution code.

    # === Inputs-related metrics ===
    num_inputs_received: int = metric_field(
        default=0,
        description="Number of input blocks received by operator.",
        metrics_group=MetricsGroup.INPUTS,
    )
    bytes_inputs_received: int = metric_field(
        default=0,
        description="Byte size of input blocks received by operator.",
        metrics_group=MetricsGroup.INPUTS,
    )
    num_task_inputs_processed: int = metric_field(
        default=0,
        description=(
            "Number of input blocks that operator's tasks have finished processing."
        ),
        metrics_group=MetricsGroup.INPUTS,
        map_only=True,
    )
    bytes_task_inputs_processed: int = metric_field(
        default=0,
        description=(
            "Byte size of input blocks that operator's tasks have finished processing."
        ),
        metrics_group=MetricsGroup.INPUTS,
        map_only=True,
    )
    bytes_inputs_of_submitted_tasks: int = metric_field(
        default=0,
        description="Byte size of input blocks passed to submitted tasks.",
        metrics_group=MetricsGroup.INPUTS,
        map_only=True,
    )

    # === Outputs-related metrics ===
    num_task_outputs_generated: int = metric_field(
        default=0,
        description="Number of output blocks generated by tasks.",
        metrics_group=MetricsGroup.OUTPUTS,
        map_only=True,
    )
    bytes_task_outputs_generated: int = metric_field(
        default=0,
        description="Byte size of output blocks generated by tasks.",
        metrics_group=MetricsGroup.OUTPUTS,
        map_only=True,
    )
    rows_task_outputs_generated: int = metric_field(
        default=0,
        description="Number of output rows generated by tasks.",
        metrics_group=MetricsGroup.OUTPUTS,
        map_only=True,
    )
    num_outputs_taken: int = metric_field(
        default=0,
        description=(
            "Number of output blocks that are already taken by downstream operators."
        ),
        metrics_group=MetricsGroup.OUTPUTS,
    )
    bytes_outputs_taken: int = metric_field(
        default=0,
        description=(
            "Byte size of output blocks that are already taken by downstream operators."
        ),
        metrics_group=MetricsGroup.OUTPUTS,
    )
    num_outputs_of_finished_tasks: int = metric_field(
        default=0,
        description="Number of generated output blocks that are from finished tasks.",
        metrics_group=MetricsGroup.OUTPUTS,
        map_only=True,
    )
    bytes_outputs_of_finished_tasks: int = metric_field(
        default=0,
        description=(
            "Byte size of generated output blocks that are from finished tasks."
        ),
        metrics_group=MetricsGroup.OUTPUTS,
        map_only=True,
    )

    # === Tasks-related metrics ===
    num_tasks_submitted: int = metric_field(
        default=0,
        description="Number of submitted tasks.",
        metrics_group=MetricsGroup.TASKS,
        map_only=True,
    )
    num_tasks_running: int = metric_field(
        default=0,
        description="Number of running tasks.",
        metrics_group=MetricsGroup.TASKS,
        map_only=True,
    )
    num_tasks_have_outputs: int = metric_field(
        default=0,
        description="Number of tasks that already have output.",
        metrics_group=MetricsGroup.TASKS,
        map_only=True,
    )
    num_tasks_finished: int = metric_field(
        default=0,
        description="Number of finished tasks.",
        metrics_group=MetricsGroup.TASKS,
        map_only=True,
    )
    num_tasks_failed: int = metric_field(
        default=0,
        description="Number of failed tasks.",
        metrics_group=MetricsGroup.TASKS,
        map_only=True,
    )
    block_generation_time: float = metric_field(
        default=0,
        description="Time spent generating blocks in tasks.",
        metrics_group=MetricsGroup.TASKS,
        map_only=True,
    )
    task_submission_backpressure_time: float = metric_field(
        default=0,
        description="Time spent in task submission backpressure.",
        metrics_group=MetricsGroup.TASKS,
    )

    # === Object store memory metrics ===
    obj_store_mem_internal_inqueue_blocks: int = metric_field(
        default=0,
        description="Number of blocks in operator's internal input queue.",
        metrics_group=MetricsGroup.OBJECT_STORE_MEMORY,
    )
    obj_store_mem_internal_outqueue_blocks: int = metric_field(
        default=0,
        description="Number of blocks in the operator's internal output queue.",
        metrics_group=MetricsGroup.OBJECT_STORE_MEMORY,
    )
    obj_store_mem_freed: int = metric_field(
        default=0,
        description="Byte size of freed memory in object store.",
        metrics_group=MetricsGroup.OBJECT_STORE_MEMORY,
        map_only=True,
    )
    obj_store_mem_spilled: int = metric_field(
        default=0,
        description="Byte size of spilled memory in object store.",
        metrics_group=MetricsGroup.OBJECT_STORE_MEMORY,
        map_only=True,
    )
    obj_store_mem_used: int = metric_field(
        default=0,
        description="Byte size of used memory in object store.",
        metrics_group=MetricsGroup.OBJECT_STORE_MEMORY,
    )

    # === Miscellaneous metrics ===
    # Use "metrics_group: "misc" in the metadata for new metrics in this section.

    def __init__(self, op: "PhysicalOperator"):
        from ray.data._internal.execution.operators.map_operator import MapOperator

        self._op = op
        # Metrics tagged `map_only` are only meaningful for MapOperators.
        self._is_map = isinstance(op, MapOperator)
        self._running_tasks: Dict[int, RunningTaskInfo] = {}
        self._extra_metrics: Dict[str, Any] = {}
        # Start time of current pause due to task submission backpressure.
        # -1 means "not currently in backpressure".
        self._task_submission_backpressure_start_time = -1

        self._internal_inqueue = create_bundle_queue()
        self._internal_outqueue = create_bundle_queue()
        self._pending_task_inputs = create_bundle_queue()

    @property
    def extra_metrics(self) -> Dict[str, Any]:
        """Return a dict of extra metrics."""
        return self._extra_metrics

    @classmethod
    def get_metrics(cls) -> List[MetricDefinition]:
        """Return the definitions of all registered metrics."""
        return list(_METRICS)

    def as_dict(self):
        """Return a dict representation of the metrics."""
        result = []
        for metric in self.get_metrics():
            # Skip map-only metrics for non-map operators.
            if not self._is_map and metric.map_only:
                continue
            value = getattr(self, metric.name)
            result.append((metric.name, value))

        # TODO: record resource usage in OpRuntimeMetrics,
        # avoid calling self._op.current_processor_usage()
        resource_usage = self._op.current_processor_usage()
        result.extend(
            [
                ("cpu_usage", resource_usage.cpu or 0),
                ("gpu_usage", resource_usage.gpu or 0),
            ]
        )
        result.extend(self._extra_metrics.items())
        return dict(result)

    @metric_property(
        description="Average number of blocks generated per task.",
        metrics_group=MetricsGroup.OUTPUTS,
        map_only=True,
    )
    def average_num_outputs_per_task(self) -> Optional[float]:
        """Average number of output blocks per task, or None if no task has finished."""
        if self.num_tasks_finished == 0:
            return None
        else:
            return self.num_outputs_of_finished_tasks / self.num_tasks_finished

    @metric_property(
        description="Average size of task output in bytes.",
        metrics_group=MetricsGroup.OUTPUTS,
        map_only=True,
    )
    def average_bytes_per_output(self) -> Optional[float]:
        """Average size in bytes of output blocks, or None if no output generated."""
        if self.num_task_outputs_generated == 0:
            return None
        else:
            return self.bytes_task_outputs_generated / self.num_task_outputs_generated

    @metric_property(
        description="Byte size of input blocks in the operator's internal input queue.",
        metrics_group=MetricsGroup.OBJECT_STORE_MEMORY,
    )
    def obj_store_mem_internal_inqueue(self) -> int:
        return self._internal_inqueue.estimate_size_bytes()

    @metric_property(
        description=(
            "Byte size of output blocks in the operator's internal output queue."
        ),
        metrics_group=MetricsGroup.OBJECT_STORE_MEMORY,
    )
    def obj_store_mem_internal_outqueue(self) -> int:
        return self._internal_outqueue.estimate_size_bytes()

    @metric_property(
        description="Byte size of input blocks used by pending tasks.",
        metrics_group=MetricsGroup.OBJECT_STORE_MEMORY,
        map_only=True,
    )
    def obj_store_mem_pending_task_inputs(self) -> int:
        return self._pending_task_inputs.estimate_size_bytes()

    @property
    def obj_store_mem_pending_task_outputs(self) -> Optional[float]:
        """Estimated size in bytes of output blocks in Ray generator buffers.

        If an estimate isn't available, this property returns ``None``.
        """
        per_task_output = self.obj_store_mem_max_pending_output_per_task
        if per_task_output is None:
            return None

        # Ray Data launches multiple tasks per actor, but only one task runs at a
        # time per actor. So, the number of actually running tasks is capped by the
        # number of active actors.
        from ray.data._internal.execution.operators.actor_pool_map_operator import (
            ActorPoolMapOperator,
        )

        num_tasks_running = self.num_tasks_running
        if isinstance(self._op, ActorPoolMapOperator):
            num_tasks_running = min(
                num_tasks_running, self._op._actor_pool.num_active_actors()
            )

        return num_tasks_running * per_task_output

    @property
    def obj_store_mem_max_pending_output_per_task(self) -> Optional[float]:
        """Estimated size in bytes of output blocks in a task's generator buffer."""
        context = self._op.data_context
        if context._max_num_blocks_in_streaming_gen_buffer is None:
            return None

        # Fall back to the target block size until we've observed real outputs.
        bytes_per_output = self.average_bytes_per_output
        if bytes_per_output is None:
            bytes_per_output = context.target_max_block_size

        num_pending_outputs = context._max_num_blocks_in_streaming_gen_buffer
        if self.average_num_outputs_per_task is not None:
            num_pending_outputs = min(
                num_pending_outputs, self.average_num_outputs_per_task
            )
        return bytes_per_output * num_pending_outputs

    @metric_property(
        description="Average size of task inputs in bytes.",
        metrics_group=MetricsGroup.INPUTS,
        map_only=True,
    )
    def average_bytes_inputs_per_task(self) -> Optional[float]:
        """Average size in bytes of ref bundles passed to tasks, or ``None`` if no
        tasks have been submitted."""
        if self.num_tasks_submitted == 0:
            return None
        else:
            return self.bytes_inputs_of_submitted_tasks / self.num_tasks_submitted

    @metric_property(
        description="Average total output size of task in bytes.",
        metrics_group=MetricsGroup.OUTPUTS,
        map_only=True,
    )
    def average_bytes_outputs_per_task(self) -> Optional[float]:
        """Average size in bytes of output blocks per task,
        or None if no task has finished."""
        if self.num_tasks_finished == 0:
            return None
        else:
            return self.bytes_outputs_of_finished_tasks / self.num_tasks_finished

    def on_input_received(self, input: RefBundle):
        """Callback when the operator receives a new input."""
        self.num_inputs_received += 1
        self.bytes_inputs_received += input.size_bytes()

    def on_input_queued(self, input: RefBundle):
        """Callback when the operator queues an input."""
        self.obj_store_mem_internal_inqueue_blocks += len(input.blocks)
        self._internal_inqueue.add(input)

    def on_input_dequeued(self, input: RefBundle):
        """Callback when the operator dequeues an input."""
        self.obj_store_mem_internal_inqueue_blocks -= len(input.blocks)
        input_size = input.size_bytes()
        self._internal_inqueue.remove(input)
        # Sanity check: the queue's size estimate must never go negative.
        assert self.obj_store_mem_internal_inqueue >= 0, (
            self._op,
            self.obj_store_mem_internal_inqueue,
            input_size,
        )

    def on_output_queued(self, output: RefBundle):
        """Callback when an output is queued by the operator."""
        self.obj_store_mem_internal_outqueue_blocks += len(output.blocks)
        self._internal_outqueue.add(output)

    def on_output_dequeued(self, output: RefBundle):
        """Callback when an output is dequeued by the operator."""
        self.obj_store_mem_internal_outqueue_blocks -= len(output.blocks)
        output_size = output.size_bytes()
        self._internal_outqueue.remove(output)
        # Sanity check: the queue's size estimate must never go negative.
        assert self.obj_store_mem_internal_outqueue >= 0, (
            self._op,
            self.obj_store_mem_internal_outqueue,
            output_size,
        )

    def on_toggle_task_submission_backpressure(self, in_backpressure):
        """Callback when the operator enters or leaves task submission backpressure.

        Accumulates the total paused time into `task_submission_backpressure_time`.
        """
        if in_backpressure and self._task_submission_backpressure_start_time == -1:
            # backpressure starting, start timer
            self._task_submission_backpressure_start_time = time.perf_counter()
        elif self._task_submission_backpressure_start_time != -1:
            # backpressure stopping, stop timer
            self.task_submission_backpressure_time += (
                time.perf_counter() - self._task_submission_backpressure_start_time
            )
            self._task_submission_backpressure_start_time = -1

    def on_output_taken(self, output: RefBundle):
        """Callback when an output is taken from the operator."""
        self.num_outputs_taken += 1
        self.bytes_outputs_taken += output.size_bytes()

    def on_task_submitted(self, task_index: int, inputs: RefBundle):
        """Callback when the operator submits a task."""
        self.num_tasks_submitted += 1
        self.num_tasks_running += 1
        self.bytes_inputs_of_submitted_tasks += inputs.size_bytes()
        self._pending_task_inputs.add(inputs)
        self._running_tasks[task_index] = RunningTaskInfo(inputs, 0, 0)

    def on_task_output_generated(self, task_index: int, output: RefBundle):
        """Callback when a new task generates an output."""
        num_outputs = len(output)
        output_bytes = output.size_bytes()

        self.num_task_outputs_generated += num_outputs
        self.bytes_task_outputs_generated += output_bytes

        task_info = self._running_tasks[task_index]
        if task_info.num_outputs == 0:
            # First output from this task.
            self.num_tasks_have_outputs += 1
        task_info.num_outputs += num_outputs
        task_info.bytes_outputs += output_bytes

        for block_ref, meta in output.blocks:
            assert meta.exec_stats and meta.exec_stats.wall_time_s
            self.block_generation_time += meta.exec_stats.wall_time_s
            assert meta.num_rows is not None
            self.rows_task_outputs_generated += meta.num_rows
            trace_allocation(block_ref, "operator_output")

    def on_task_finished(self, task_index: int, exception: Optional[Exception]):
        """Callback when a task is finished.

        Args:
            task_index: Index of the finished task.
            exception: The error the task failed with, or None on success.
        """
        self.num_tasks_running -= 1
        self.num_tasks_finished += 1
        if exception is not None:
            self.num_tasks_failed += 1

        task_info = self._running_tasks[task_index]
        self.num_outputs_of_finished_tasks += task_info.num_outputs
        self.bytes_outputs_of_finished_tasks += task_info.bytes_outputs

        inputs = task_info.inputs
        self.num_task_inputs_processed += len(inputs)
        # Compute the input size once; it's used both for accounting and for the
        # sanity-check assertion below.
        total_input_size = inputs.size_bytes()
        self.bytes_task_inputs_processed += total_input_size
        self._pending_task_inputs.remove(inputs)
        assert self.obj_store_mem_pending_task_inputs >= 0, (
            self._op,
            self.obj_store_mem_pending_task_inputs,
            total_input_size,
        )

        ctx = self._op.data_context
        if ctx.enable_get_object_locations_for_metrics:
            locations = ray.experimental.get_object_locations(inputs.block_refs)
            for block, meta in inputs.blocks:
                if locations[block].get("did_spill", False):
                    assert meta.size_bytes is not None
                    self.obj_store_mem_spilled += meta.size_bytes

        self.obj_store_mem_freed += total_input_size

        inputs.destroy_if_owned()
        del self._running_tasks[task_index]
|
.venv/lib/python3.11/site-packages/ray/data/_internal/execution/interfaces/physical_operator.py
ADDED
|
@@ -0,0 +1,535 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from abc import ABC, abstractmethod
|
| 2 |
+
from typing import Any, Callable, Dict, Iterator, List, Optional, Union
|
| 3 |
+
|
| 4 |
+
import ray
|
| 5 |
+
from .ref_bundle import RefBundle
|
| 6 |
+
from ray._raylet import ObjectRefGenerator
|
| 7 |
+
from ray.data._internal.execution.autoscaler.autoscaling_actor_pool import (
|
| 8 |
+
AutoscalingActorPool,
|
| 9 |
+
)
|
| 10 |
+
from ray.data._internal.execution.interfaces.execution_options import (
|
| 11 |
+
ExecutionOptions,
|
| 12 |
+
ExecutionResources,
|
| 13 |
+
)
|
| 14 |
+
from ray.data._internal.execution.interfaces.op_runtime_metrics import OpRuntimeMetrics
|
| 15 |
+
from ray.data._internal.logical.interfaces import LogicalOperator, Operator
|
| 16 |
+
from ray.data._internal.stats import StatsDict
|
| 17 |
+
from ray.data.context import DataContext
|
| 18 |
+
|
| 19 |
+
# TODO(hchen): Ray Core should have a common interface for these two types.
# A "waitable" is anything an operator task hands back for the scheduler to
# wait on: a plain ObjectRef or a streaming ObjectRefGenerator.
Waitable = Union[ray.ObjectRef, ObjectRefGenerator]
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
class OpTask(ABC):
    """Base class for a unit of work created by a PhysicalOperator.

    A task may be backed by either a regular Ray task or an actor task.
    """

    def __init__(self, task_index: int):
        self._task_index = task_index

    def task_index(self) -> int:
        """Return the index identifying this task within its operator."""
        return self._task_index

    @abstractmethod
    def get_waitable(self) -> Waitable:
        """Return the ObjectRef or ObjectRefGenerator to wait on."""
        ...
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
class DataOpTask(OpTask):
    """Represents an OpTask that handles Block data."""

    def __init__(
        self,
        task_index: int,
        streaming_gen: ObjectRefGenerator,
        output_ready_callback: Callable[[RefBundle], None],
        task_done_callback: Callable[[Optional[Exception]], None],
    ):
        """
        Args:
            streaming_gen: The streaming generator of this task. It should yield blocks.
            output_ready_callback: The callback to call when a new RefBundle is output
                from the generator.
            task_done_callback: The callback to call when the task is done.
        """
        super().__init__(task_index)
        # TODO(hchen): Right now, the streaming generator is required to yield a Block
        # and a BlockMetadata each time. We should unify task submission with an unified
        # interface. So each individual operator don't need to take care of the
        # BlockMetadata.
        self._streaming_gen = streaming_gen
        self._output_ready_callback = output_ready_callback
        self._task_done_callback = task_done_callback

    def get_waitable(self) -> ObjectRefGenerator:
        # Callers wait on the streaming generator itself.
        return self._streaming_gen

    def on_data_ready(self, max_bytes_to_read: Optional[int]) -> int:
        """Callback when data is ready to be read from the streaming generator.

        Reads (block_ref, metadata) pairs off the generator until either no more
        output is immediately available or the byte budget is exhausted.

        Args:
            max_bytes_to_read: Max bytes of blocks to read. If None, all available
                will be read.
        Returns: The number of bytes read.
        """
        bytes_read = 0
        while max_bytes_to_read is None or bytes_read < max_bytes_to_read:
            try:
                # Non-blocking poll (timeout 0): yields a nil ref when the
                # generator is still running but has no new output yet.
                block_ref = self._streaming_gen._next_sync(0)
                if block_ref.is_nil():
                    # The generator currently doesn't have new output.
                    # And it's not stopped yet.
                    break
            except StopIteration:
                # Generator exhausted: the task completed successfully.
                self._task_done_callback(None)
                break

            try:
                meta = ray.get(next(self._streaming_gen))
            except StopIteration:
                # The generator should always yield 2 values (block and metadata)
                # each time. If we get a StopIteration here, it means an error
                # happened in the task.
                # And in this case, the block_ref is the exception object.
                # TODO(hchen): Ray Core should have a better interface for
                # detecting and obtaining the exception.
                try:
                    ray.get(block_ref)
                    assert False, "Above ray.get should raise an exception."
                except Exception as ex:
                    # Report the failure to the operator before propagating.
                    self._task_done_callback(ex)
                    raise ex from None
            self._output_ready_callback(
                RefBundle([(block_ref, meta)], owns_blocks=True)
            )
            bytes_read += meta.size_bytes
        return bytes_read
|
| 111 |
+
|
| 112 |
+
|
| 113 |
+
class MetadataOpTask(OpTask):
|
| 114 |
+
"""Represents an OpTask that only handles metadata, instead of Block data."""
|
| 115 |
+
|
| 116 |
+
def __init__(
|
| 117 |
+
self,
|
| 118 |
+
task_index: int,
|
| 119 |
+
object_ref: ray.ObjectRef,
|
| 120 |
+
task_done_callback: Callable[[], None],
|
| 121 |
+
):
|
| 122 |
+
"""
|
| 123 |
+
Args:
|
| 124 |
+
object_ref: The ObjectRef of the task.
|
| 125 |
+
task_done_callback: The callback to call when the task is done.
|
| 126 |
+
"""
|
| 127 |
+
super().__init__(task_index)
|
| 128 |
+
self._object_ref = object_ref
|
| 129 |
+
self._task_done_callback = task_done_callback
|
| 130 |
+
|
| 131 |
+
def get_waitable(self) -> ray.ObjectRef:
|
| 132 |
+
return self._object_ref
|
| 133 |
+
|
| 134 |
+
def on_task_finished(self):
|
| 135 |
+
"""Callback when the task is finished."""
|
| 136 |
+
self._task_done_callback()
|
| 137 |
+
|
| 138 |
+
|
| 139 |
+
class PhysicalOperator(Operator):
|
| 140 |
+
"""Abstract class for physical operators.
|
| 141 |
+
|
| 142 |
+
An operator transforms one or more input streams of RefBundles into a single
|
| 143 |
+
output stream of RefBundles.
|
| 144 |
+
|
| 145 |
+
Physical operators are stateful and non-serializable; they live on the driver side
|
| 146 |
+
of the Dataset only.
|
| 147 |
+
|
| 148 |
+
Here's a simple example of implementing a basic "Map" operator:
|
| 149 |
+
|
| 150 |
+
class MapOperator(PhysicalOperator):
|
| 151 |
+
def __init__(self):
|
| 152 |
+
self.active_tasks = []
|
| 153 |
+
|
| 154 |
+
def add_input(self, refs, _):
|
| 155 |
+
self.active_tasks.append(map_task.remote(refs))
|
| 156 |
+
|
| 157 |
+
def has_next(self):
|
| 158 |
+
ready, _ = ray.wait(self.active_tasks, timeout=0)
|
| 159 |
+
return len(ready) > 0
|
| 160 |
+
|
| 161 |
+
def get_next(self):
|
| 162 |
+
ready, remaining = ray.wait(self.active_tasks, num_returns=1)
|
| 163 |
+
self.active_tasks = remaining
|
| 164 |
+
return ready[0]
|
| 165 |
+
|
| 166 |
+
Note that the above operator fully supports both bulk and streaming execution,
|
| 167 |
+
since `add_input` and `get_next` can be called in any order. In bulk execution
|
| 168 |
+
(now deprecated), all inputs would be added up-front, but in streaming
|
| 169 |
+
execution (now the default execution mode) the calls could be interleaved.
|
| 170 |
+
"""
|
| 171 |
+
|
| 172 |
+
def __init__(
|
| 173 |
+
self,
|
| 174 |
+
name: str,
|
| 175 |
+
input_dependencies: List["PhysicalOperator"],
|
| 176 |
+
data_context: DataContext,
|
| 177 |
+
target_max_block_size: Optional[int],
|
| 178 |
+
):
|
| 179 |
+
super().__init__(name, input_dependencies)
|
| 180 |
+
|
| 181 |
+
for x in input_dependencies:
|
| 182 |
+
assert isinstance(x, PhysicalOperator), x
|
| 183 |
+
self._inputs_complete = not input_dependencies
|
| 184 |
+
self._target_max_block_size = target_max_block_size
|
| 185 |
+
self._started = False
|
| 186 |
+
self._in_task_submission_backpressure = False
|
| 187 |
+
self._in_task_output_backpressure = False
|
| 188 |
+
self._metrics = OpRuntimeMetrics(self)
|
| 189 |
+
self._estimated_num_output_bundles = None
|
| 190 |
+
self._estimated_output_num_rows = None
|
| 191 |
+
self._execution_completed = False
|
| 192 |
+
# The LogicalOperator(s) which were translated to create this PhysicalOperator.
|
| 193 |
+
# Set via `PhysicalOperator.set_logical_operators()`.
|
| 194 |
+
self._logical_operators: List[LogicalOperator] = []
|
| 195 |
+
self._data_context = data_context
|
| 196 |
+
|
| 197 |
+
def __reduce__(self):
|
| 198 |
+
raise ValueError("Operator is not serializable.")
|
| 199 |
+
|
| 200 |
+
@property
|
| 201 |
+
def data_context(self) -> DataContext:
|
| 202 |
+
return self._data_context
|
| 203 |
+
|
| 204 |
+
# Override the following 3 methods to correct type hints.
|
| 205 |
+
|
| 206 |
+
@property
|
| 207 |
+
def input_dependencies(self) -> List["PhysicalOperator"]:
|
| 208 |
+
return super().input_dependencies # type: ignore
|
| 209 |
+
|
| 210 |
+
@property
|
| 211 |
+
def output_dependencies(self) -> List["PhysicalOperator"]:
|
| 212 |
+
return super().output_dependencies # type: ignore
|
| 213 |
+
|
| 214 |
+
def post_order_iter(self) -> Iterator["PhysicalOperator"]:
|
| 215 |
+
return super().post_order_iter() # type: ignore
|
| 216 |
+
|
| 217 |
+
def set_logical_operators(
|
| 218 |
+
self,
|
| 219 |
+
*logical_ops: LogicalOperator,
|
| 220 |
+
):
|
| 221 |
+
self._logical_operators = list(logical_ops)
|
| 222 |
+
|
| 223 |
+
@property
|
| 224 |
+
def target_max_block_size(self) -> Optional[int]:
|
| 225 |
+
"""
|
| 226 |
+
Target max block size output by this operator. If this returns None,
|
| 227 |
+
then the default from DataContext should be used.
|
| 228 |
+
"""
|
| 229 |
+
return self._target_max_block_size
|
| 230 |
+
|
| 231 |
+
@property
|
| 232 |
+
def actual_target_max_block_size(self) -> int:
|
| 233 |
+
"""
|
| 234 |
+
The actual target max block size output by this operator.
|
| 235 |
+
"""
|
| 236 |
+
target_max_block_size = self._target_max_block_size
|
| 237 |
+
if target_max_block_size is None:
|
| 238 |
+
target_max_block_size = self.data_context.target_max_block_size
|
| 239 |
+
return target_max_block_size
|
| 240 |
+
|
| 241 |
+
def set_target_max_block_size(self, target_max_block_size: Optional[int]):
|
| 242 |
+
self._target_max_block_size = target_max_block_size
|
| 243 |
+
|
| 244 |
+
def mark_execution_completed(self):
|
| 245 |
+
"""Manually mark this operator has completed execution."""
|
| 246 |
+
self._execution_completed = True
|
| 247 |
+
|
| 248 |
+
def completed(self) -> bool:
|
| 249 |
+
"""Return True when this operator is completed.
|
| 250 |
+
|
| 251 |
+
An operator is completed the operator has stopped execution and all
|
| 252 |
+
outputs are taken.
|
| 253 |
+
"""
|
| 254 |
+
if not self._execution_completed:
|
| 255 |
+
if self._inputs_complete and self.num_active_tasks() == 0:
|
| 256 |
+
# If all inputs are complete and there are no active tasks,
|
| 257 |
+
# then the operator has completed execution.
|
| 258 |
+
self._execution_completed = True
|
| 259 |
+
return self._execution_completed and not self.has_next()
|
| 260 |
+
|
| 261 |
+
def get_stats(self) -> StatsDict:
|
| 262 |
+
"""Return recorded execution stats for use with DatasetStats."""
|
| 263 |
+
raise NotImplementedError
|
| 264 |
+
|
| 265 |
+
@property
|
| 266 |
+
def metrics(self) -> OpRuntimeMetrics:
|
| 267 |
+
"""Returns the runtime metrics of this operator."""
|
| 268 |
+
self._metrics._extra_metrics = self._extra_metrics()
|
| 269 |
+
return self._metrics
|
| 270 |
+
|
| 271 |
+
def _extra_metrics(self) -> Dict[str, Any]:
|
| 272 |
+
"""Subclasses should override this method to report extra metrics
|
| 273 |
+
that are specific to them."""
|
| 274 |
+
return {}
|
| 275 |
+
|
| 276 |
+
def progress_str(self) -> str:
|
| 277 |
+
"""Return any extra status to be displayed in the operator progress bar.
|
| 278 |
+
|
| 279 |
+
For example, `<N> actors` to show current number of actors in an actor pool.
|
| 280 |
+
"""
|
| 281 |
+
return ""
|
| 282 |
+
|
| 283 |
+
def num_outputs_total(self) -> Optional[int]:
|
| 284 |
+
"""Returns the total number of output bundles of this operator,
|
| 285 |
+
or ``None`` if unable to provide a reasonable estimate (for example,
|
| 286 |
+
if no tasks have finished yet).
|
| 287 |
+
|
| 288 |
+
The value returned may be an estimate based off the consumption so far.
|
| 289 |
+
This is useful for reporting progress.
|
| 290 |
+
|
| 291 |
+
Subclasses should either override this method, or update
|
| 292 |
+
``self._estimated_num_output_bundles`` appropriately.
|
| 293 |
+
"""
|
| 294 |
+
return self._estimated_num_output_bundles
|
| 295 |
+
|
| 296 |
+
def num_output_rows_total(self) -> Optional[int]:
|
| 297 |
+
"""Returns the total number of output rows of this operator,
|
| 298 |
+
or ``None`` if unable to provide a reasonable estimate (for example,
|
| 299 |
+
if no tasks have finished yet).
|
| 300 |
+
|
| 301 |
+
The value returned may be an estimate based off the consumption so far.
|
| 302 |
+
This is useful for reporting progress.
|
| 303 |
+
|
| 304 |
+
Subclasses should either override this method, or update
|
| 305 |
+
``self._estimated_output_num_rows`` appropriately.
|
| 306 |
+
"""
|
| 307 |
+
return self._estimated_output_num_rows
|
| 308 |
+
|
| 309 |
+
def start(self, options: ExecutionOptions) -> None:
|
| 310 |
+
"""Called by the executor when execution starts for an operator.
|
| 311 |
+
|
| 312 |
+
Args:
|
| 313 |
+
options: The global options used for the overall execution.
|
| 314 |
+
"""
|
| 315 |
+
self._started = True
|
| 316 |
+
|
| 317 |
+
def should_add_input(self) -> bool:
|
| 318 |
+
"""Return whether it is desirable to add input to this operator right now.
|
| 319 |
+
|
| 320 |
+
Operators can customize the implementation of this method to apply additional
|
| 321 |
+
backpressure (e.g., waiting for internal actors to be created).
|
| 322 |
+
"""
|
| 323 |
+
return True
|
| 324 |
+
|
| 325 |
+
def add_input(self, refs: RefBundle, input_index: int) -> None:
|
| 326 |
+
"""Called when an upstream result is available.
|
| 327 |
+
|
| 328 |
+
Inputs may be added in any order, and calls to `add_input` may be interleaved
|
| 329 |
+
with calls to `get_next` / `has_next` to implement streaming execution.
|
| 330 |
+
|
| 331 |
+
Subclasses should override `_add_input_inner` instead of this method.
|
| 332 |
+
|
| 333 |
+
Args:
|
| 334 |
+
refs: The ref bundle that should be added as input.
|
| 335 |
+
input_index: The index identifying the input dependency producing the
|
| 336 |
+
input. For most operators, this is always `0` since there is only
|
| 337 |
+
one upstream input operator.
|
| 338 |
+
"""
|
| 339 |
+
self._metrics.on_input_received(refs)
|
| 340 |
+
self._add_input_inner(refs, input_index)
|
| 341 |
+
|
| 342 |
+
def _add_input_inner(self, refs: RefBundle, input_index: int) -> None:
|
| 343 |
+
"""Subclasses should override this method to implement `add_input`."""
|
| 344 |
+
raise NotImplementedError
|
| 345 |
+
|
| 346 |
+
def input_done(self, input_index: int) -> None:
|
| 347 |
+
"""Called when the upstream operator at index `input_index` has completed().
|
| 348 |
+
|
| 349 |
+
After this is called, the executor guarantees that no more inputs will be added
|
| 350 |
+
via `add_input` for the given input index.
|
| 351 |
+
"""
|
| 352 |
+
pass
|
| 353 |
+
|
| 354 |
+
def all_inputs_done(self) -> None:
|
| 355 |
+
"""Called when all upstream operators have completed().
|
| 356 |
+
|
| 357 |
+
After this is called, the executor guarantees that no more inputs will be added
|
| 358 |
+
via `add_input` for any input index.
|
| 359 |
+
"""
|
| 360 |
+
self._inputs_complete = True
|
| 361 |
+
|
| 362 |
+
def has_next(self) -> bool:
|
| 363 |
+
"""Returns when a downstream output is available.
|
| 364 |
+
|
| 365 |
+
When this returns true, it is safe to call `get_next()`.
|
| 366 |
+
"""
|
| 367 |
+
raise NotImplementedError
|
| 368 |
+
|
| 369 |
+
def get_next(self) -> RefBundle:
|
| 370 |
+
"""Get the next downstream output.
|
| 371 |
+
|
| 372 |
+
It is only allowed to call this if `has_next()` has returned True.
|
| 373 |
+
|
| 374 |
+
Subclasses should override `_get_next_inner` instead of this method.
|
| 375 |
+
"""
|
| 376 |
+
output = self._get_next_inner()
|
| 377 |
+
self._metrics.on_output_taken(output)
|
| 378 |
+
return output
|
| 379 |
+
|
| 380 |
+
def _get_next_inner(self) -> RefBundle:
|
| 381 |
+
"""Subclasses should override this method to implement `get_next`."""
|
| 382 |
+
raise NotImplementedError
|
| 383 |
+
|
| 384 |
+
def get_active_tasks(self) -> List[OpTask]:
|
| 385 |
+
"""Get a list of the active tasks of this operator.
|
| 386 |
+
|
| 387 |
+
Subclasses should return *all* running normal/actor tasks. The
|
| 388 |
+
StreamingExecutor will wait on these tasks and trigger callbacks.
|
| 389 |
+
"""
|
| 390 |
+
return []
|
| 391 |
+
|
| 392 |
+
def num_active_tasks(self) -> int:
|
| 393 |
+
"""Return the number of active tasks.
|
| 394 |
+
|
| 395 |
+
This method is used for 2 purposes:
|
| 396 |
+
* Determine if this operator is completed.
|
| 397 |
+
* Displaying active task info in the progress bar.
|
| 398 |
+
Thus, the return value can be less than `len(get_active_tasks())`,
|
| 399 |
+
if some tasks are not needed for the above purposes. E.g., for the
|
| 400 |
+
actor pool map operator, readiness checking tasks can be excluded
|
| 401 |
+
from `num_active_tasks`, but they should be included in
|
| 402 |
+
`get_active_tasks`.
|
| 403 |
+
|
| 404 |
+
Subclasses can override this as a performance optimization.
|
| 405 |
+
"""
|
| 406 |
+
return len(self.get_active_tasks())
|
| 407 |
+
|
| 408 |
+
def throttling_disabled(self) -> bool:
|
| 409 |
+
"""Whether to disable resource throttling for this operator.
|
| 410 |
+
|
| 411 |
+
This should return True for operators that only manipulate bundle metadata
|
| 412 |
+
(e.g., the OutputSplitter operator). This hints to the execution engine that
|
| 413 |
+
these operators should not be throttled based on resource usage.
|
| 414 |
+
"""
|
| 415 |
+
return False
|
| 416 |
+
|
| 417 |
+
def internal_queue_size(self) -> int:
|
| 418 |
+
"""If the operator has an internal input queue, return its size.
|
| 419 |
+
|
| 420 |
+
This is used to report tasks pending submission to actor pools.
|
| 421 |
+
"""
|
| 422 |
+
return 0
|
| 423 |
+
|
| 424 |
+
def shutdown(self) -> None:
|
| 425 |
+
"""Abort execution and release all resources used by this operator.
|
| 426 |
+
|
| 427 |
+
This release any Ray resources acquired by this operator such as active
|
| 428 |
+
tasks, actors, and objects.
|
| 429 |
+
"""
|
| 430 |
+
if not self._started:
|
| 431 |
+
raise ValueError("Operator must be started before being shutdown.")
|
| 432 |
+
|
| 433 |
+
def current_processor_usage(self) -> ExecutionResources:
|
| 434 |
+
"""Returns the current estimated CPU and GPU usage of this operator, excluding
|
| 435 |
+
object store memory.
|
| 436 |
+
|
| 437 |
+
This method is called by the executor to decide how to allocate processors
|
| 438 |
+
between different operators.
|
| 439 |
+
"""
|
| 440 |
+
return ExecutionResources(0, 0, 0)
|
| 441 |
+
|
| 442 |
+
def running_processor_usage(self) -> ExecutionResources:
|
| 443 |
+
"""Returns the estimated running CPU and GPU usage of this operator, excluding
|
| 444 |
+
object store memory.
|
| 445 |
+
|
| 446 |
+
This method is called by the resource manager and the streaming
|
| 447 |
+
executor to display the number of currently running CPUs and GPUs in the
|
| 448 |
+
progress bar.
|
| 449 |
+
|
| 450 |
+
Note, this method returns `current_processor_usage() -
|
| 451 |
+
pending_processor_usage()` by default. Subclasses should only override
|
| 452 |
+
`pending_processor_usage()` if needed.
|
| 453 |
+
"""
|
| 454 |
+
usage = self.current_processor_usage()
|
| 455 |
+
usage = usage.subtract(self.pending_processor_usage())
|
| 456 |
+
return usage
|
| 457 |
+
|
| 458 |
+
def pending_processor_usage(self) -> ExecutionResources:
|
| 459 |
+
"""Returns the estimated pending CPU and GPU usage of this operator, excluding
|
| 460 |
+
object store memory.
|
| 461 |
+
|
| 462 |
+
This method is called by the resource manager and the streaming
|
| 463 |
+
executor to display the number of currently pending actors in the
|
| 464 |
+
progress bar.
|
| 465 |
+
"""
|
| 466 |
+
return ExecutionResources(0, 0, 0)
|
| 467 |
+
|
| 468 |
+
def base_resource_usage(self) -> ExecutionResources:
|
| 469 |
+
"""Returns the minimum amount of resources required for execution.
|
| 470 |
+
|
| 471 |
+
For example, an operator that creates an actor pool requiring 8 GPUs could
|
| 472 |
+
return ExecutionResources(gpu=8) as its base usage.
|
| 473 |
+
"""
|
| 474 |
+
return ExecutionResources()
|
| 475 |
+
|
| 476 |
+
def incremental_resource_usage(self) -> ExecutionResources:
|
| 477 |
+
"""Returns the incremental resources required for processing another input.
|
| 478 |
+
|
| 479 |
+
For example, an operator that launches a task per input could return
|
| 480 |
+
ExecutionResources(cpu=1) as its incremental usage.
|
| 481 |
+
"""
|
| 482 |
+
return ExecutionResources()
|
| 483 |
+
|
| 484 |
+
def notify_in_task_submission_backpressure(self, in_backpressure: bool) -> None:
|
| 485 |
+
"""Called periodically from the executor to update internal in backpressure
|
| 486 |
+
status for stats collection purposes.
|
| 487 |
+
|
| 488 |
+
Args:
|
| 489 |
+
in_backpressure: Value this operator's in_backpressure should be set to.
|
| 490 |
+
"""
|
| 491 |
+
# only update on change to in_backpressure
|
| 492 |
+
if self._in_task_submission_backpressure != in_backpressure:
|
| 493 |
+
self._metrics.on_toggle_task_submission_backpressure(in_backpressure)
|
| 494 |
+
self._in_task_submission_backpressure = in_backpressure
|
| 495 |
+
|
| 496 |
+
def get_autoscaling_actor_pools(self) -> List[AutoscalingActorPool]:
|
| 497 |
+
"""Return a list of `AutoscalingActorPool`s managed by this operator."""
|
| 498 |
+
return []
|
| 499 |
+
|
| 500 |
+
def implements_accurate_memory_accounting(self) -> bool:
|
| 501 |
+
"""Return whether this operator implements accurate memory accounting.
|
| 502 |
+
|
| 503 |
+
An operator that implements accurate memory accounting should should properly
|
| 504 |
+
report its memory usage via the following APIs:
|
| 505 |
+
- `self._metrics.on_input_queued`.
|
| 506 |
+
- `self._metrics.on_input_dequeued`.
|
| 507 |
+
- `self._metrics.on_output_queued`.
|
| 508 |
+
- `self._metrics.on_output_dequeued`.
|
| 509 |
+
"""
|
| 510 |
+
# TODO(hchen): Currently we only enable `ReservationOpResourceAllocator` when
|
| 511 |
+
# all operators in the dataset have implemented accurate memory accounting.
|
| 512 |
+
# Eventually all operators should implement accurate memory accounting.
|
| 513 |
+
return False
|
| 514 |
+
|
| 515 |
+
def supports_fusion(self) -> bool:
|
| 516 |
+
"""Returns ```True``` if this operator can be fused with other operators."""
|
| 517 |
+
return False
|
| 518 |
+
|
| 519 |
+
def update_resource_usage(self) -> None:
|
| 520 |
+
"""Updates resource usage of this operator at runtime.
|
| 521 |
+
|
| 522 |
+
This method will be called at runtime in each StreamingExecutor iteration.
|
| 523 |
+
Subclasses can override it to account for dynamic resource usage updates due to
|
| 524 |
+
restarting actors, retrying tasks, lost objects, etc.
|
| 525 |
+
"""
|
| 526 |
+
pass
|
| 527 |
+
|
| 528 |
+
def actor_info_progress_str(self) -> str:
|
| 529 |
+
"""Returns Actor progress strings for Alive, Restarting and Pending Actors.
|
| 530 |
+
|
| 531 |
+
This method will be called in summary_str API in OpState. Subcallses can
|
| 532 |
+
override it to return Actor progress strings for Alive, Restarting and Pending
|
| 533 |
+
Actors.
|
| 534 |
+
"""
|
| 535 |
+
return ""
|
.venv/lib/python3.11/site-packages/ray/data/_internal/execution/interfaces/ref_bundle.py
ADDED
|
@@ -0,0 +1,136 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from dataclasses import dataclass
|
| 2 |
+
from typing import Iterator, List, Optional, Tuple
|
| 3 |
+
|
| 4 |
+
import ray
|
| 5 |
+
from .common import NodeIdStr
|
| 6 |
+
from ray.data._internal.memory_tracing import trace_deallocation
|
| 7 |
+
from ray.data.block import Block, BlockMetadata
|
| 8 |
+
from ray.data.context import DataContext
|
| 9 |
+
from ray.types import ObjectRef
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
@dataclass
|
| 13 |
+
class RefBundle:
|
| 14 |
+
"""A group of data block references and their metadata.
|
| 15 |
+
|
| 16 |
+
Operators take in and produce streams of RefBundles.
|
| 17 |
+
|
| 18 |
+
Most commonly a RefBundle consists of a single block object reference.
|
| 19 |
+
In some cases, e.g., due to block splitting, or for a reduce task, there may
|
| 20 |
+
be more than one block.
|
| 21 |
+
|
| 22 |
+
Block bundles have ownership semantics, i.e., shared ownership (similar to C++
|
| 23 |
+
shared_ptr, multiple operators share the same block bundle), or unique ownership
|
| 24 |
+
(similar to C++ unique_ptr, only one operator owns the block bundle). This
|
| 25 |
+
allows operators to know whether they can destroy blocks when they don't need
|
| 26 |
+
them. Destroying blocks eagerly is more efficient than waiting for Python GC /
|
| 27 |
+
Ray reference counting to kick in.
|
| 28 |
+
"""
|
| 29 |
+
|
| 30 |
+
# The size_bytes must be known in the metadata, num_rows is optional.
|
| 31 |
+
blocks: Tuple[Tuple[ObjectRef[Block], BlockMetadata]]
|
| 32 |
+
|
| 33 |
+
# Whether we own the blocks (can safely destroy them).
|
| 34 |
+
owns_blocks: bool
|
| 35 |
+
|
| 36 |
+
# This attribute is used by the split() operator to assign bundles to logical
|
| 37 |
+
# output splits. It is otherwise None.
|
| 38 |
+
output_split_idx: Optional[int] = None
|
| 39 |
+
|
| 40 |
+
# Cached location, used for get_cached_location().
|
| 41 |
+
_cached_location: Optional[NodeIdStr] = None
|
| 42 |
+
|
| 43 |
+
def __post_init__(self):
|
| 44 |
+
if not isinstance(self.blocks, tuple):
|
| 45 |
+
object.__setattr__(self, "blocks", tuple(self.blocks))
|
| 46 |
+
for b in self.blocks:
|
| 47 |
+
assert isinstance(b, tuple), b
|
| 48 |
+
assert len(b) == 2, b
|
| 49 |
+
assert isinstance(b[0], ray.ObjectRef), b
|
| 50 |
+
assert isinstance(b[1], BlockMetadata), b
|
| 51 |
+
if b[1].size_bytes is None:
|
| 52 |
+
raise ValueError(
|
| 53 |
+
"The size in bytes of the block must be known: {}".format(b)
|
| 54 |
+
)
|
| 55 |
+
|
| 56 |
+
def __setattr__(self, key, value):
|
| 57 |
+
if hasattr(self, key) and key in ["blocks", "owns_blocks"]:
|
| 58 |
+
raise ValueError(f"The `{key}` field of RefBundle cannot be updated.")
|
| 59 |
+
object.__setattr__(self, key, value)
|
| 60 |
+
|
| 61 |
+
@property
|
| 62 |
+
def block_refs(self) -> List[ObjectRef[Block]]:
|
| 63 |
+
"""List of block references in this bundle."""
|
| 64 |
+
return [block_ref for block_ref, _ in self.blocks]
|
| 65 |
+
|
| 66 |
+
@property
|
| 67 |
+
def metadata(self) -> List[BlockMetadata]:
|
| 68 |
+
"""List of block metadata in this bundle."""
|
| 69 |
+
return [metadata for _, metadata in self.blocks]
|
| 70 |
+
|
| 71 |
+
def num_rows(self) -> Optional[int]:
|
| 72 |
+
"""Number of rows present in this bundle, if known."""
|
| 73 |
+
total = 0
|
| 74 |
+
for m in self.metadata:
|
| 75 |
+
if m.num_rows is None:
|
| 76 |
+
return None
|
| 77 |
+
else:
|
| 78 |
+
total += m.num_rows
|
| 79 |
+
return total
|
| 80 |
+
|
| 81 |
+
def size_bytes(self) -> int:
|
| 82 |
+
"""Size of the blocks of this bundle in bytes."""
|
| 83 |
+
return sum(m.size_bytes for m in self.metadata)
|
| 84 |
+
|
| 85 |
+
def destroy_if_owned(self) -> int:
|
| 86 |
+
"""Clears the object store memory for these blocks if owned.
|
| 87 |
+
|
| 88 |
+
Returns:
|
| 89 |
+
The number of bytes freed.
|
| 90 |
+
"""
|
| 91 |
+
should_free = self.owns_blocks and DataContext.get_current().eager_free
|
| 92 |
+
for block_ref in self.block_refs:
|
| 93 |
+
trace_deallocation(
|
| 94 |
+
block_ref, "RefBundle.destroy_if_owned", free=should_free
|
| 95 |
+
)
|
| 96 |
+
return self.size_bytes() if should_free else 0
|
| 97 |
+
|
| 98 |
+
def get_cached_location(self) -> Optional[NodeIdStr]:
|
| 99 |
+
"""Return a location for this bundle's data, if possible.
|
| 100 |
+
|
| 101 |
+
Caches the resolved location so multiple calls to this are efficient.
|
| 102 |
+
"""
|
| 103 |
+
if self._cached_location is None:
|
| 104 |
+
# Only consider the first block in the bundle for now. TODO(ekl) consider
|
| 105 |
+
# taking into account other blocks.
|
| 106 |
+
ref = self.block_refs[0]
|
| 107 |
+
# This call is pretty fast for owned objects (~5k/s), so we don't need to
|
| 108 |
+
# batch it for now.
|
| 109 |
+
locs = ray.experimental.get_object_locations([ref])
|
| 110 |
+
nodes = locs[ref]["node_ids"]
|
| 111 |
+
if nodes:
|
| 112 |
+
self._cached_location = nodes[0]
|
| 113 |
+
else:
|
| 114 |
+
self._cached_location = ""
|
| 115 |
+
if self._cached_location:
|
| 116 |
+
return self._cached_location
|
| 117 |
+
else:
|
| 118 |
+
return None # Return None if cached location is "".
|
| 119 |
+
|
| 120 |
+
def __eq__(self, other) -> bool:
|
| 121 |
+
return self is other
|
| 122 |
+
|
| 123 |
+
def __hash__(self) -> int:
|
| 124 |
+
return id(self)
|
| 125 |
+
|
| 126 |
+
def __len__(self) -> int:
|
| 127 |
+
return len(self.blocks)
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
def _ref_bundles_iterator_to_block_refs_list(
|
| 131 |
+
ref_bundles: Iterator[RefBundle],
|
| 132 |
+
) -> List[ObjectRef[Block]]:
|
| 133 |
+
"""Convert an iterator of RefBundles to a list of Block object references."""
|
| 134 |
+
return [
|
| 135 |
+
block_ref for ref_bundle in ref_bundles for block_ref in ref_bundle.block_refs
|
| 136 |
+
]
|
.venv/lib/python3.11/site-packages/ray/data/_internal/execution/interfaces/task_context.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from dataclasses import dataclass, field
|
| 2 |
+
from typing import TYPE_CHECKING, Any, Dict, Optional
|
| 3 |
+
|
| 4 |
+
from ray.data._internal.progress_bar import ProgressBar
|
| 5 |
+
|
| 6 |
+
if TYPE_CHECKING:
|
| 7 |
+
from ray.data._internal.execution.operators.map_transformer import MapTransformer
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
@dataclass
|
| 11 |
+
class TaskContext:
|
| 12 |
+
"""This describes the information of a task running block transform."""
|
| 13 |
+
|
| 14 |
+
# The index of task. Each task has a unique task index within the same
|
| 15 |
+
# operator.
|
| 16 |
+
task_idx: int
|
| 17 |
+
|
| 18 |
+
# The dictionary of sub progress bar to update. The key is name of sub progress
|
| 19 |
+
# bar. Note this is only used on driver side.
|
| 20 |
+
# TODO(chengsu): clean it up from TaskContext with new optimizer framework.
|
| 21 |
+
sub_progress_bar_dict: Optional[Dict[str, ProgressBar]] = None
|
| 22 |
+
|
| 23 |
+
# NOTE(hchen): `upstream_map_transformer` and `upstream_map_ray_remote_args`
|
| 24 |
+
# are only used for `RandomShuffle`. DO NOT use them for other operators.
|
| 25 |
+
# Ideally, they should be handled by the optimizer, and should be transparent
|
| 26 |
+
# to the specific operators.
|
| 27 |
+
# But for `RandomShuffle`, the AllToAllOperator doesn't do the shuffle itself.
|
| 28 |
+
# It uses `ExchangeTaskScheduler` to launch new tasks to do the shuffle.
|
| 29 |
+
# That's why we need to pass them to `ExchangeTaskScheduler`.
|
| 30 |
+
# TODO(hchen): Use a physical operator to do the shuffle directly.
|
| 31 |
+
|
| 32 |
+
# The underlying function called in a MapOperator; this is used when fusing
|
| 33 |
+
# an AllToAllOperator with an upstream MapOperator.
|
| 34 |
+
upstream_map_transformer: Optional["MapTransformer"] = None
|
| 35 |
+
|
| 36 |
+
# The Ray remote arguments of the fused upstream MapOperator.
|
| 37 |
+
# This should be set if upstream_map_transformer is set.
|
| 38 |
+
upstream_map_ray_remote_args: Optional[Dict[str, Any]] = None
|
| 39 |
+
|
| 40 |
+
# The target maximum number of bytes to include in the task's output block.
|
| 41 |
+
target_max_block_size: Optional[int] = None
|
| 42 |
+
|
| 43 |
+
# Additional keyword arguments passed to the task.
|
| 44 |
+
kwargs: Dict[str, Any] = field(default_factory=dict)
|
.venv/lib/python3.11/site-packages/ray/data/_internal/execution/interfaces/transform_fn.py
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Callable, List, Tuple
|
| 2 |
+
|
| 3 |
+
from .ref_bundle import RefBundle
|
| 4 |
+
from .task_context import TaskContext
|
| 5 |
+
from ray.data._internal.stats import StatsDict
|
| 6 |
+
|
| 7 |
+
# Block transform function applied in AllToAllOperator.
|
| 8 |
+
AllToAllTransformFn = Callable[
|
| 9 |
+
[List[RefBundle], TaskContext], Tuple[List[RefBundle], StatsDict]
|
| 10 |
+
]
|
.venv/lib/python3.11/site-packages/ray/data/_internal/execution/legacy_compat.py
ADDED
|
@@ -0,0 +1,181 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""This file contains temporary helper functions for legacy plan/executor interaction.
|
| 2 |
+
|
| 3 |
+
It should be deleted once we fully move to the new executor backend.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from typing import Iterator, Optional, Tuple
|
| 7 |
+
|
| 8 |
+
from ray.data._internal.block_list import BlockList
|
| 9 |
+
from ray.data._internal.execution.interfaces import (
|
| 10 |
+
Executor,
|
| 11 |
+
PhysicalOperator,
|
| 12 |
+
RefBundle,
|
| 13 |
+
)
|
| 14 |
+
from ray.data._internal.execution.interfaces.executor import OutputIterator
|
| 15 |
+
from ray.data._internal.logical.optimizers import get_execution_plan
|
| 16 |
+
from ray.data._internal.logical.util import record_operators_usage
|
| 17 |
+
from ray.data._internal.plan import ExecutionPlan
|
| 18 |
+
from ray.data._internal.stats import DatasetStats
|
| 19 |
+
from ray.data._internal.util import unify_block_metadata_schema
|
| 20 |
+
from ray.data.block import BlockMetadata
|
| 21 |
+
|
| 22 |
+
# Warn about tasks larger than this.
|
| 23 |
+
TASK_SIZE_WARN_THRESHOLD_BYTES = 100000
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def execute_to_legacy_bundle_iterator(
    executor: Executor,
    plan: ExecutionPlan,
    dag_rewrite=None,
) -> Iterator[RefBundle]:
    """Execute a plan with the new executor and return a bundle iterator.

    Args:
        executor: The executor to use.
        plan: The legacy plan to execute.
        dag_rewrite: Callback that can be used to mutate the DAG prior to execution.
            This is currently used as a legacy hack to inject the OutputSplit operator
            for `Dataset.streaming_split()`.

    Returns:
        The output as a bundle iterator.
    """
    dag, stats = _get_execution_dag(executor, plan, preserve_order=False)
    if dag_rewrite:
        dag = dag_rewrite(dag)

    raw_iter = executor.execute(dag, initial_stats=stats)

    class CacheMetadataIterator(OutputIterator):
        """Pass-through wrapper around the executor's output iterator.

        Yields each output RefBundle unchanged while folding its metadata into
        a running total; only once the wrapped iterator is fully exhausted is
        the accumulated metadata cached onto the execution plan.
        """

        def __init__(self, base_iterator: OutputIterator):
            # The wrapped iterator is expected to be the StreamIterator defined
            # inside `StreamingExecutor.execute()`; it must support `get_next()`.
            self._base_iterator = base_iterator
            self._collected_metadata = BlockMetadata(
                num_rows=0,
                size_bytes=0,
                schema=None,
                input_files=None,
                exec_stats=None,
            )

        def get_next(self, output_split_idx: Optional[int] = None) -> RefBundle:
            try:
                bundle = self._base_iterator.get_next(output_split_idx)
                self._collect_metadata(bundle)
                return bundle
            except StopIteration:
                # The iterator is completely exhausted, so metadata collection
                # is done; stash the accumulated result on the plan.
                plan._snapshot_metadata = self._collected_metadata
                raise

        def _collect_metadata(self, bundle: RefBundle) -> RefBundle:
            """Accumulate one bundle's metadata (row count, byte size, schema)
            so it remains queryable after iteration completes."""
            acc = self._collected_metadata
            acc.num_rows += bundle.num_rows()
            acc.size_bytes += bundle.size_bytes()
            acc.schema = unify_block_metadata_schema([acc, *bundle.metadata])
            return bundle

    return CacheMetadataIterator(raw_iter)
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
def execute_to_legacy_block_list(
    executor: Executor,
    plan: ExecutionPlan,
    dataset_uuid: str,
    preserve_order: bool,
) -> BlockList:
    """Execute a plan with the new executor and translate it into a legacy block list.

    Args:
        executor: The executor to use.
        plan: The legacy plan to execute.
        dataset_uuid: UUID of the dataset for this execution.
        preserve_order: Whether to preserve order in execution.

    Returns:
        The output as a legacy block list.
    """
    dag, initial_stats = _get_execution_dag(executor, plan, preserve_order)
    ref_bundles = executor.execute(dag, initial_stats=initial_stats)
    result = _bundles_to_block_list(ref_bundles)
    # Execution has finished at this point, so tag the stats tree with the
    # dataset's UUID before handing the blocks back.
    _set_stats_uuid_recursive(executor.get_stats(), dataset_uuid)
    return result
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
def _get_execution_dag(
    executor: Executor,
    plan: ExecutionPlan,
    preserve_order: bool,
) -> Tuple[PhysicalOperator, DatasetStats]:
    """Get the physical operators DAG from a plan.

    Also records logical-operator usage telemetry and flips the executor's
    `preserve_order` option when ordering must be maintained.
    """
    # Record usage of logical operators if available.
    logical_plan = getattr(plan, "_logical_plan", None)
    if logical_plan is not None:
        record_operators_usage(logical_plan.dag)

    # Translate the logical plan into physical operators and pick up the
    # input statistics to seed execution with.
    dag = get_execution_plan(plan._logical_plan).dag
    stats = _get_initial_stats_from_plan(plan)

    # Enforce ordering either when the caller asked for it or when the plan
    # contains operators (e.g. Zip, Sort) that require it.
    if preserve_order or plan.require_preserve_order():
        executor._options.preserve_order = True

    return dag, stats
|
| 149 |
+
|
| 150 |
+
|
| 151 |
+
def _get_initial_stats_from_plan(plan: ExecutionPlan) -> DatasetStats:
    """Pick the initial stats object for executing `plan`."""
    # A snapshot takes precedence: reuse its stats directly.
    if plan._snapshot_bundle is not None:
        return plan._snapshot_stats
    # For Datasets created from "read_xxx", `plan._in_stats` contains useless data.
    # For Datasets created from "from_xxx", we need to use `plan._in_stats` as
    # the initial stats. Because the `FromXxx` logical operators will be translated to
    # "InputDataBuffer" physical operators, which will be ignored when generating
    # stats, see `StreamingExecutor._generate_stats`.
    # TODO(hchen): Unify the logic by saving the initial stats in `InputDataBuffer
    if plan.has_lazy_input():
        return DatasetStats(metadata={}, parent=None)
    return plan._in_stats
|
| 164 |
+
|
| 165 |
+
|
| 166 |
+
def _bundles_to_block_list(bundles: Iterator[RefBundle]) -> BlockList:
    """Flatten an iterator of RefBundles into a single legacy BlockList.

    The result is marked consumer-owned only if every bundle owned its blocks.
    """
    block_refs = []
    block_meta = []
    consumer_owned = True
    for bundle in bundles:
        # One non-owning bundle makes the whole list non-owning.
        consumer_owned = consumer_owned and bundle.owns_blocks
        block_refs.extend(bundle.block_refs)
        block_meta.extend(bundle.metadata)
    return BlockList(block_refs, block_meta, owned_by_consumer=consumer_owned)
|
| 175 |
+
|
| 176 |
+
|
| 177 |
+
def _set_stats_uuid_recursive(stats: DatasetStats, dataset_uuid: str) -> None:
    """Assign `dataset_uuid` to `stats` and every ancestor in its parent chain,
    skipping nodes that already have a UUID set."""
    # Walk the stats tree iteratively with an explicit stack; assignment is
    # idempotent so traversal order does not matter.
    pending = [stats]
    while pending:
        node = pending.pop()
        if not node.dataset_uuid:
            node.dataset_uuid = dataset_uuid
        pending.extend(node.parents or [])
|
.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__init__.py
ADDED
|
File without changes
|
.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (211 Bytes). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__pycache__/actor_pool_map_operator.cpython-311.pyc
ADDED
|
Binary file (40.5 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__pycache__/aggregate_num_rows.cpython-311.pyc
ADDED
|
Binary file (3.69 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__pycache__/base_physical_operator.cpython-311.pyc
ADDED
|
Binary file (10.1 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__pycache__/input_data_buffer.cpython-311.pyc
ADDED
|
Binary file (4.95 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__pycache__/limit_operator.cpython-311.pyc
ADDED
|
Binary file (7.58 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__pycache__/map_operator.cpython-311.pyc
ADDED
|
Binary file (36.8 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__pycache__/map_transformer.cpython-311.pyc
ADDED
|
Binary file (24.7 kB). View file
|
|
|