diff --git a/.gitattributes b/.gitattributes index 14fd1daeccb758c4fa9beb076130b4364e760d1b..3506abb69cbfd2afbb4fac14af8926e75c846185 100644 --- a/.gitattributes +++ b/.gitattributes @@ -152,3 +152,5 @@ tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/_ .venv/lib/python3.11/site-packages/nvidia/cufft/lib/libcufft.so.11 filter=lfs diff=lfs merge=lfs -text .venv/lib/python3.11/site-packages/torchgen/__pycache__/model.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text .venv/lib/python3.11/site-packages/mistral_common/data/tekken_240911.json filter=lfs diff=lfs merge=lfs -text +.venv/lib/python3.11/site-packages/ray/data/__pycache__/dataset.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text +.venv/lib/python3.11/site-packages/ray/data/__pycache__/read_api.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text diff --git a/.venv/lib/python3.11/site-packages/ray/data/__pycache__/dataset.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/data/__pycache__/dataset.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..870a0747579312d2f3312ac1b311026efe32c129 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/__pycache__/dataset.cpython-311.pyc @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c0b1a74e1674205ec83807b353da73daa79d781531cd64ecbd818fd5438ec680 +size 255996 diff --git a/.venv/lib/python3.11/site-packages/ray/data/__pycache__/read_api.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/data/__pycache__/read_api.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..910f857895c13a479c5b69215eebf2b30290bcf9 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/__pycache__/read_api.cpython-311.pyc @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a217bcdb2fd53d64e0014e4fd153627ade902228eadc09fe7df65ee93c07bc05 +size 160644 diff --git 
a/.venv/lib/python3.11/site-packages/ray/data/_internal/block_batching/__init__.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/block_batching/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..29c77f6219805d9ca6f73284a3e2a0f9dfeed269 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/block_batching/__init__.py @@ -0,0 +1,3 @@ +from ray.data._internal.block_batching.block_batching import batch_blocks + +__all__ = ["batch_blocks"] diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/block_batching/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/data/_internal/block_batching/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5542241a2c1020046e1542188f0c393b406b69cb Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/data/_internal/block_batching/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/block_batching/__pycache__/block_batching.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/data/_internal/block_batching/__pycache__/block_batching.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0b136d6e1093a7340af878ab930c65c9df0be8c6 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/data/_internal/block_batching/__pycache__/block_batching.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/block_batching/__pycache__/interfaces.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/data/_internal/block_batching/__pycache__/interfaces.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..327b27fda5d9394bb4502b7e36eb4b0b97539d53 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/data/_internal/block_batching/__pycache__/interfaces.cpython-311.pyc differ diff --git 
a/.venv/lib/python3.11/site-packages/ray/data/_internal/block_batching/__pycache__/iter_batches.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/data/_internal/block_batching/__pycache__/iter_batches.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dc9f929b87e420e6a37ce321cca7a4c1d1b8b2c7 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/data/_internal/block_batching/__pycache__/iter_batches.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/block_batching/__pycache__/util.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/data/_internal/block_batching/__pycache__/util.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1f52e0e35c42ff0db2225ed997008f3de0105f4b Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/data/_internal/block_batching/__pycache__/util.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/block_batching/iter_batches.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/block_batching/iter_batches.py new file mode 100644 index 0000000000000000000000000000000000000000..39bd5f4ad2dabf9f9f0c81649854e790ab0ae81a --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/block_batching/iter_batches.py @@ -0,0 +1,322 @@ +import collections +from contextlib import nullcontext +from typing import Any, Callable, Dict, Iterator, Optional + +import ray +from ray.data._internal.block_batching.interfaces import Batch, BlockPrefetcher +from ray.data._internal.block_batching.util import ( + ActorBlockPrefetcher, + WaitBlockPrefetcher, + blocks_to_batches, + collate, + extract_data_from_batch, + finalize_batches, + format_batches, + resolve_block_refs, +) +from ray.data._internal.execution.interfaces.ref_bundle import RefBundle +from ray.data._internal.memory_tracing import trace_deallocation +from ray.data._internal.stats import DatasetStats +from 
ray.data._internal.util import make_async_gen +from ray.data.block import Block, DataBatch +from ray.data.context import DataContext +from ray.types import ObjectRef + + +def iter_batches( + ref_bundles: Iterator[RefBundle], + *, + stats: Optional[DatasetStats] = None, + clear_block_after_read: bool = False, + batch_size: Optional[int] = None, + batch_format: Optional[str] = "default", + drop_last: bool = False, + collate_fn: Optional[Callable[[DataBatch], Any]] = None, + finalize_fn: Optional[Callable[[Any], Any]] = None, + shuffle_buffer_min_size: Optional[int] = None, + shuffle_seed: Optional[int] = None, + ensure_copy: bool = False, + prefetch_batches: int = 1, +) -> Iterator[DataBatch]: + """Create formatted batches of data from an iterator of block object references and + corresponding metadata. + + This takes a block iterator and creates batch_size batches, slicing, + unioning, shuffling, prefetching, and formatting blocks as needed. + + The algorithm uses both pipeline parallelism and data parallelism: + + If prefetch_batches=2, these are all the batches in flight: + + [User thread] trains on Batch 0 + - [Fetch thread] Batch 1 finalization + move to output queue + - [Worker thread 1] Batch 2 formatting + collating + - [Worker thread 2] Batch 3 formatting + collating + - [Raylet] Batches 4 + 5 fetched to local object store memory + + At any point in time there are prefetch_batches+1 batches in local heap memory. + And the next set of prefetch_batches in local object store memory. + + The actual steps are as follows: + + In a single async thread, do the following: + 1. Trigger Ray local prefetching of `prefetch_batches` worth of block object + references. + 2. Resolve (i.e. call `ray.get()`) on the block references. + 3. Perform the necessary batch slicing to construct full batches, possibly + shuffling if necessary. + 4. Then, in a threadpool consisting of `prefetch_batches` threads: + a. Format the batches to the provided batch format. + b. 
Apply the collate function. + 5. Finalize each of the collated batches + 6. Fetch outputs from the threadpool, maintaining order of the batches. + + Args: + ref_bundles: An iterator over RefBundles. + stats: DatasetStats object to record timing and other statistics. + clear_block_after_read: Whether to clear the block from object store + manually (i.e. without waiting for Python's automatic GC) after it + is read. Doing so will reclaim memory faster and hence reduce the + memory footprint. However, the caller has to ensure the safety, i.e. + the block will never be accessed again. + batch_size: Record batch size, or None to let the system pick. + batch_format: The format in which to return each batch. + Specify "default" to use the current block format (promoting + Arrow to pandas automatically), "pandas" to + select ``pandas.DataFrame`` or "pyarrow" to select + ``pyarrow.Table``, or None to use entire blocks + as batches. Default is "default". + drop_last: Whether to drop the last batch if it's incomplete. + collate_fn: A function to apply to each data batch before returning it. + finalize_fn: A function to apply to each data batch after it has been collated. + This function is not run in a threadpool so it can be used for + memory-intensive operations such as GPU preloading. + shuffle_buffer_min_size: If non-None, the data will be randomly shuffled using a + local in-memory shuffle buffer, and this value will serve as the minimum + number of rows that must be in the local in-memory shuffle buffer in order + to yield a batch. + shuffle_seed: The seed to use for the local random shuffle. + ensure_copy: Whether batches are always copied from the underlying base + blocks (not zero-copy views). + prefetch_batches: The number of batches to fetch ahead of the current batch to + process. If set to greater than 0, a separate thread will be used to fetch + the specified amount of formatted batches from blocks. 
This improves + performance for non-CPU bound UDFs, allowing batch fetching compute and + formatting to be overlapped with the UDF. Defaults to 1. + + Returns: + An iterator over record batches. + """ + context = DataContext.get_current() + + if ( + prefetch_batches > 0 + and context.actor_prefetcher_enabled + and not ray.util.client.ray.is_connected() + ): + prefetcher = ActorBlockPrefetcher() + else: + prefetcher = WaitBlockPrefetcher() + + eager_free = clear_block_after_read and DataContext.get_current().eager_free + + def _async_iter_batches( + ref_bundles: Iterator[RefBundle], + ) -> Iterator[DataBatch]: + # Step 1: Prefetch logical batches locally. + block_iter = prefetch_batches_locally( + ref_bundles=ref_bundles, + prefetcher=prefetcher, + num_batches_to_prefetch=prefetch_batches, + batch_size=batch_size, + eager_free=eager_free, + ) + + # Step 2: Resolve the blocks. + block_iter = resolve_block_refs(block_ref_iter=block_iter, stats=stats) + + # Step 3: Batch and shuffle the resolved blocks. + batch_iter = blocks_to_batches( + block_iter=block_iter, + stats=stats, + batch_size=batch_size, + drop_last=drop_last, + shuffle_buffer_min_size=shuffle_buffer_min_size, + shuffle_seed=shuffle_seed, + ensure_copy=ensure_copy, + ) + + # Step 4: Use a threadpool for formatting and collation. + batch_iter = _format_in_threadpool( + batch_iter, + stats=stats, + batch_format=batch_format, + collate_fn=collate_fn, + num_threadpool_workers=prefetch_batches, + ) + + # Step 5: Finalize each batch. + if finalize_fn is not None: + batch_iter = finalize_batches( + batch_iter, finalize_fn=finalize_fn, stats=stats + ) + + # Step 6: Restore original order. + batch_iter: Iterator[Batch] = restore_original_order(batch_iter) + + yield from extract_data_from_batch(batch_iter) + + # Run everything in a separate thread to not block the main thread when waiting + # for streaming results. 
+ async_batch_iter = make_async_gen( + ref_bundles, fn=_async_iter_batches, num_workers=1 + ) + + while True: + with stats.iter_total_blocked_s.timer() if stats else nullcontext(): + try: + next_batch = next(async_batch_iter) + except StopIteration: + break + with stats.iter_user_s.timer() if stats else nullcontext(): + yield next_batch + + +def _format_in_threadpool( + batch_iter: Iterator[Batch], + stats: DatasetStats, + batch_format: Optional[str], + collate_fn: Optional[Callable[[DataBatch], Any]], + num_threadpool_workers: int, +) -> Iterator[Batch]: + """Executes the batching, formatting, and collation logic in a threadpool. + + Args: + logical_batch_iterator: An iterator over logical batches. + stats: DatasetStats object to record timing and other statistics. + batch_format: The format in which to return each batch. + Specify "default" to use the current block format (promoting + Arrow to pandas automatically), "pandas" to + select ``pandas.DataFrame`` or "pyarrow" to select + ``pyarrow.Table``, or None to use entire blocks + as batches. + collate_fn: A function to apply to each data batch before returning it. + num_threadpool_workers: The number of threads to use in the threadpool. + """ + + def threadpool_computations_format_collate( + batch_iter: Iterator[Batch], + ) -> Iterator[Batch]: + # Step 4a: Format the batches. + formatted_batch_iter = format_batches( + batch_iter, batch_format=batch_format, stats=stats + ) + + # Step 4b: Apply the collate function if applicable. 
+ if collate_fn is not None: + formatted_batch_iter = collate( + formatted_batch_iter, collate_fn=collate_fn, stats=stats + ) + yield from formatted_batch_iter + + if num_threadpool_workers > 0: + collated_iter = make_async_gen( + base_iterator=batch_iter, + fn=threadpool_computations_format_collate, + num_workers=num_threadpool_workers, + ) + else: + collated_iter = threadpool_computations_format_collate(batch_iter) + return collated_iter + + +def prefetch_batches_locally( + ref_bundles: Iterator[RefBundle], + prefetcher: BlockPrefetcher, + num_batches_to_prefetch: int, + batch_size: Optional[int], + eager_free: bool = False, +) -> Iterator[ObjectRef[Block]]: + """Given an iterator of batched RefBundles, returns an iterator over the + corresponding block references while prefetching `num_batches_to_prefetch` + batches in advance. + + Args: + ref_bundles: An iterator over batched RefBundles. + prefetcher: The prefetcher to use. + num_batches_to_prefetch: The number of batches to prefetch ahead of the + current batch during the scan. + batch_size: User specified batch size, or None to let the system pick. + eager_free: Whether to eagerly free the object reference from the object store. + """ + + sliding_window = collections.deque() + current_window_size = 0 + + if num_batches_to_prefetch <= 0: + for ref_bundle in ref_bundles: + for block_ref in ref_bundle.block_refs: + yield block_ref + return + + if batch_size is not None: + num_rows_to_prefetch = num_batches_to_prefetch * batch_size + else: + num_rows_to_prefetch = None + + # Create and fetch the initial window. + # Stop adding if the number of rows in this window is greater than requested + # batch size, or if the batch size is None and the number of blocks in this window + # is greater than requested batches to prefetch. 
+ while (batch_size is not None and current_window_size < num_rows_to_prefetch) or ( + batch_size is None and len(sliding_window) < num_batches_to_prefetch + ): + try: + next_ref_bundle = next(ref_bundles) + sliding_window.extend(next_ref_bundle.blocks) + current_window_size += next_ref_bundle.num_rows() + except StopIteration: + break + + prefetcher.prefetch_blocks([block_ref for block_ref, _ in list(sliding_window)]) + + while sliding_window: + block_ref, metadata = sliding_window.popleft() + current_window_size -= metadata.num_rows + if batch_size is None or current_window_size < num_rows_to_prefetch: + try: + next_ref_bundle = next(ref_bundles) + for block_ref_and_md in next_ref_bundle.blocks: + sliding_window.append(block_ref_and_md) + current_window_size += block_ref_and_md[1].num_rows + prefetcher.prefetch_blocks( + [block_ref for block_ref, _ in list(sliding_window)] + ) + except StopIteration: + pass + yield block_ref + trace_deallocation(block_ref, loc="iter_batches", free=eager_free) + prefetcher.stop() + + +def restore_original_order(batch_iter: Iterator[Batch]) -> Iterator[Batch]: + """Restores the original order of the provided `batch_iter` + + This function will yield items from `base_iterator` in the correct order based on + each batch's batch_idx. All indexes are expected to be unique. + + `batch_iter` is expected to not have any missing indexes. All indexes from 0 to len + (base_iterator) must be present. 
+ """ + next_index_required = 0 + buffer: Dict[int, Batch] = {} + for batch in batch_iter: + assert batch.batch_idx not in buffer + buffer[batch.batch_idx] = batch + while next_index_required in buffer: + yield buffer.pop(next_index_required) + next_index_required += 1 + + while next_index_required in buffer: + yield buffer.pop(next_index_required) + next_index_required += 1 diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/block_batching/util.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/block_batching/util.py new file mode 100644 index 0000000000000000000000000000000000000000..4cea60abca8011ade373d0ed5ba54e060aa57b7e --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/block_batching/util.py @@ -0,0 +1,293 @@ +import logging +import threading +from contextlib import nullcontext +from typing import Any, Callable, Iterator, List, Optional, Tuple + +import ray +from ray.actor import ActorHandle +from ray.data._internal.batcher import Batcher, ShufflingBatcher +from ray.data._internal.block_batching.interfaces import ( + Batch, + BlockPrefetcher, + CollatedBatch, +) +from ray.data._internal.stats import DatasetStats +from ray.data.block import Block, BlockAccessor, DataBatch +from ray.types import ObjectRef +from ray.util.scheduling_strategies import NodeAffinitySchedulingStrategy + +logger = logging.getLogger(__name__) + + +def _calculate_ref_hits(refs: List[ObjectRef[Any]]) -> Tuple[int, int, int]: + """Given a list of object references, returns how many are already on the local + node, how many require fetching from another node, and how many have unknown + locations. 
If `DataContext.get_current().enable_get_object_locations_for_metrics` is + False, this will return `(-1, -1, -1)` as getting object locations is disabled.""" + current_node_id = ray.get_runtime_context().get_node_id() + + ctx = ray.data.context.DataContext.get_current() + if ctx.enable_get_object_locations_for_metrics: + locs = ray.experimental.get_object_locations(refs) + nodes: List[List[str]] = [loc["node_ids"] for loc in locs.values()] + hits = sum(current_node_id in node_ids for node_ids in nodes) + unknowns = sum(1 for node_ids in nodes if not node_ids) + misses = len(nodes) - hits - unknowns + return hits, misses, unknowns + + return -1, -1, -1 + + +def resolve_block_refs( + block_ref_iter: Iterator[ObjectRef[Block]], + stats: Optional[DatasetStats] = None, +) -> Iterator[Block]: + """Resolves the block references for each logical batch. + + Args: + block_ref_iter: An iterator over block object references. + stats: An optional stats object to recording block hits and misses. + """ + hits = 0 + misses = 0 + unknowns = 0 + + for block_ref in block_ref_iter: + current_hit, current_miss, current_unknown = _calculate_ref_hits([block_ref]) + hits += current_hit + misses += current_miss + unknowns += current_unknown + + # TODO(amogkam): Optimized further by batching multiple references in a single + # `ray.get()` call. + with stats.iter_get_s.timer() if stats else nullcontext(): + block = ray.get(block_ref) + yield block + + if stats: + stats.iter_blocks_local = hits + stats.iter_blocks_remote = misses + stats.iter_unknown_location = unknowns + + +def blocks_to_batches( + block_iter: Iterator[Block], + stats: Optional[DatasetStats] = None, + batch_size: Optional[int] = None, + drop_last: bool = False, + shuffle_buffer_min_size: Optional[int] = None, + shuffle_seed: Optional[int] = None, + ensure_copy: bool = False, +) -> Iterator[Batch]: + """Given an iterator over blocks, returns an iterator over blocks + of the appropriate bacth size. 
+ + If the shuffling configurations are specified, then the + output blocks contain shuffled data. + + Args: + block_iter: An iterator over blocks. + stats: Dataset stats object used to store block batching time. + batch_size: Record batch size, or None to let the system pick. + drop_last: Whether to drop the last batch if it's incomplete. + shuffle_buffer_min_size: If non-None, the data will be randomly shuffled + using a local in-memory shuffle buffer, and this value will serve as the + minimum number of rows that must be in the local in-memory shuffle buffer in + order to yield a batch. + shuffle_seed: The seed to use for the local random shuffle. + ensure_copy: Whether batches are always copied from the underlying base + blocks (not zero-copy views). + + Returns: + An iterator over blocks of the given size that are potentially shuffled. + """ + if shuffle_buffer_min_size is not None: + batcher = ShufflingBatcher( + batch_size=batch_size, + shuffle_buffer_min_size=shuffle_buffer_min_size, + shuffle_seed=shuffle_seed, + ) + else: + batcher = Batcher(batch_size=batch_size, ensure_copy=ensure_copy) + + def get_iter_next_batch_s_timer(): + return stats.iter_next_batch_s.timer() if stats else nullcontext() + + global_counter = 0 + + for block in block_iter: + batcher.add(block) + while batcher.has_batch(): + with get_iter_next_batch_s_timer(): + batch = batcher.next_batch() + yield Batch(global_counter, batch) + global_counter += 1 + + # Signal to the batcher that there are no more blocks to add. + batcher.done_adding() + + # Get any leftover batches in ShufflingBatcher. + while batcher.has_batch(): + with get_iter_next_batch_s_timer(): + batch = batcher.next_batch() + yield Batch(global_counter, batch) + global_counter += 1 + + # Get any remaining data. 
+ if not drop_last and batcher.has_any(): + with get_iter_next_batch_s_timer(): + batch = batcher.next_batch() + yield Batch(global_counter, batch) + global_counter += 1 + + +def format_batches( + block_iter: Iterator[Batch], + batch_format: Optional[str], + stats: Optional[DatasetStats] = None, +) -> Iterator[Batch]: + """Given an iterator of blocks, returns an iterator of formatted batches. + + Args: + block_iter: An iterator over blocks. + batch_format: The batch format to use. + stats: An optional stats object to record formatting times. + + Returns: + An iterator over batch index and the formatted batch. + """ + for batch in block_iter: + with stats.iter_format_batch_s.timer() if stats else nullcontext(): + formatted_batch = BlockAccessor.for_block(batch.data).to_batch_format( + batch_format + ) + yield Batch(batch.batch_idx, formatted_batch) + + +def collate( + batch_iter: Iterator[Batch], + collate_fn: Optional[Callable[[DataBatch], Any]], + stats: Optional[DatasetStats] = None, +) -> Iterator[CollatedBatch]: + """Returns an iterator with the provided collate_fn applied to items of the batch + iterator. + + Args: + batch_iter: An iterator over formatted batches. + collate_fn: A function to apply to each batch. + stats: An optional stats object to record formatting times. + """ + for batch in batch_iter: + with stats.iter_collate_batch_s.timer() if stats else nullcontext(): + collated_batch = collate_fn(batch.data) + yield CollatedBatch(batch.batch_idx, collated_batch) + + +def finalize_batches( + batch_iter: Iterator[CollatedBatch], + finalize_fn: Callable[[Any], Any], + stats: Optional[DatasetStats] = None, +) -> Iterator[CollatedBatch]: + """Returns an iterator with the provided finalize_fn applied to items of the batch + iterator. + + This is the same as `collate` except the input batches can be of type Any. + + Args: + batch_iter: An iterator over processed batches. + finalize_fn: A function to apply to each batch. 
+ stats: An optional stats object to record formatting times. + + Returns: + An iterator over batch index and the finalized batch. + """ + for batch in batch_iter: + with stats.iter_finalize_batch_s.timer() if stats else nullcontext(): + finalized_batch = finalize_fn(batch.data) + yield CollatedBatch(batch.batch_idx, finalized_batch) + + +def extract_data_from_batch(batch_iter: Iterator[Batch]) -> Iterator[Any]: + for batch in batch_iter: + yield batch.data + + +PREFETCHER_ACTOR_NAMESPACE = "ray.dataset" + + +class WaitBlockPrefetcher(BlockPrefetcher): + """Block prefetcher using ray.wait.""" + + def __init__(self): + self._blocks = [] + self._stopped = False + self._condition = threading.Condition() + self._thread = threading.Thread( + target=self._run, + name="Prefetcher", + daemon=True, + ) + self._thread.start() + + def _run(self): + while True: + try: + blocks_to_wait = [] + with self._condition: + if len(self._blocks) > 0: + blocks_to_wait, self._blocks = self._blocks[:], [] + else: + if self._stopped: + return + blocks_to_wait = [] + self._condition.wait() + if len(blocks_to_wait) > 0: + ray.wait(blocks_to_wait, num_returns=1, fetch_local=True) + except Exception: + logger.exception("Error in prefetcher thread.") + + def prefetch_blocks(self, blocks: List[ObjectRef[Block]]): + with self._condition: + if self._stopped: + raise RuntimeError("Prefetcher is stopped.") + self._blocks = blocks + self._condition.notify() + + def stop(self): + with self._condition: + if self._stopped: + return + self._stopped = True + self._condition.notify() + + def __del__(self): + self.stop() + + +class ActorBlockPrefetcher(BlockPrefetcher): + """Block prefetcher using a local actor.""" + + def __init__(self): + self.prefetch_actor = self._get_or_create_actor_prefetcher() + + @staticmethod + def _get_or_create_actor_prefetcher() -> "ActorHandle": + node_id = ray.get_runtime_context().get_node_id() + actor_name = f"dataset-block-prefetcher-{node_id}" + return 
_BlockPretcher.options( + scheduling_strategy=NodeAffinitySchedulingStrategy(node_id, soft=False), + name=actor_name, + namespace=PREFETCHER_ACTOR_NAMESPACE, + get_if_exists=True, + ).remote() + + def prefetch_blocks(self, blocks: List[ObjectRef[Block]]): + self.prefetch_actor.prefetch.remote(*blocks) + + +@ray.remote(num_cpus=0) +class _BlockPretcher: + """Helper actor that prefetches blocks asynchronously.""" + + def prefetch(self, *blocks) -> None: + pass diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/__init__.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..651554ab677a4fb0323e1649b0facf916a43f2f0 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/autoscaler/__init__.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/autoscaler/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..c167c14fa1f33b5aeaa59c1b0b4aa447fa7de114 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/autoscaler/__init__.py @@ -0,0 +1,15 @@ +from .autoscaler import Autoscaler +from .autoscaling_actor_pool import AutoscalingActorPool +from .default_autoscaler import DefaultAutoscaler + + +def create_autoscaler(topology, resource_manager, execution_id): + return DefaultAutoscaler(topology, resource_manager, execution_id) + + +__all__ = [ + "Autoscaler", + "DefaultAutoscaler", + "create_autoscaler", + 
"AutoscalingActorPool", +] diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/autoscaler/__pycache__/autoscaling_actor_pool.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/autoscaler/__pycache__/autoscaling_actor_pool.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..456fdcb5e97bd007f40e213fe96e67f649ffc6cb Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/autoscaler/__pycache__/autoscaling_actor_pool.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/autoscaler/autoscaler.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/autoscaler/autoscaler.py new file mode 100644 index 0000000000000000000000000000000000000000..c4f54584d6c534592948a287d0461bd9294afbb4 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/autoscaler/autoscaler.py @@ -0,0 +1,44 @@ +from abc import ABC, abstractmethod +from typing import TYPE_CHECKING + +from ray.data._internal.execution.interfaces.execution_options import ExecutionResources +from ray.util.annotations import DeveloperAPI + +if TYPE_CHECKING: + from ray.data._internal.execution.resource_manager import ResourceManager + from ray.data._internal.execution.streaming_executor_state import Topology + + +@DeveloperAPI +class Autoscaler(ABC): + """Abstract interface for Ray Data autoscaler.""" + + def __init__( + self, + topology: "Topology", + resource_manager: "ResourceManager", + execution_id: str, + ): + self._topology = topology + self._resource_manager = resource_manager + self._execution_id = execution_id + + @abstractmethod + def try_trigger_scaling(self): + """Try trigger autoscaling. + + This method will be called each time when StreamingExecutor makes + a scheduling decision. A subclass should override this method to + handle the autoscaling of both the cluster and `AutoscalingActorPool`s. 
+ """ + ... + + @abstractmethod + def on_executor_shutdown(self): + """Callback when the StreamingExecutor is shutting down.""" + ... + + @abstractmethod + def get_total_resources(self) -> ExecutionResources: + """Get the total resources that are available to this data execution.""" + ... diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/autoscaler/autoscaling_actor_pool.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/autoscaler/autoscaling_actor_pool.py new file mode 100644 index 0000000000000000000000000000000000000000..2d8e1bd40b5718c6594b9c47a0538da8e06693dc --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/autoscaler/autoscaling_actor_pool.py @@ -0,0 +1,94 @@ +from abc import ABC, abstractmethod + +from ray.data._internal.execution.interfaces.execution_options import ExecutionResources +from ray.util.annotations import DeveloperAPI + + +@DeveloperAPI +class AutoscalingActorPool(ABC): + """Abstract interface of an autoscaling actor pool. + + A `PhysicalOperator` can manage one or more `AutoscalingActorPool`s. + `Autoscaler` is responsible for deciding autoscaling of these actor + pools. + """ + + @abstractmethod + def min_size(self) -> int: + """Min size of the actor pool.""" + ... + + @abstractmethod + def max_size(self) -> int: + """Max size of the actor pool.""" + ... + + @abstractmethod + def current_size(self) -> int: + """Current size of the actor pool.""" + ... + + @abstractmethod + def num_running_actors(self) -> int: + """Number of running actors.""" + ... + + @abstractmethod + def num_active_actors(self) -> int: + """Number of actors with at least one active task.""" + ... + + @abstractmethod + def num_pending_actors(self) -> int: + """Number of actors pending creation.""" + ... + + @abstractmethod + def max_tasks_in_flight_per_actor(self) -> int: + """Max number of in-flight tasks per actor.""" + ... 
+ + @abstractmethod + def current_in_flight_tasks(self) -> int: + """Number of current in-flight tasks.""" + ... + + def num_total_task_slots(self) -> int: + """Total number of task slots.""" + return self.max_tasks_in_flight_per_actor() * self.current_size() + + def num_free_task_slots(self) -> int: + """Number of free slots to run tasks.""" + return ( + self.max_tasks_in_flight_per_actor() * self.current_size() + - self.current_in_flight_tasks() + ) + + @abstractmethod + def scale_up(self, num_actors: int) -> int: + """Request the actor pool to scale up by the given number of actors. + + The number of actually added actors may be less than the requested + number. + + Returns: + The number of actors actually added. + """ + ... + + @abstractmethod + def scale_down(self, num_actors: int) -> int: + """Request actor pool to scale down by the given number of actors. + + The number of actually removed actors may be less than the requested + number. + + Returns: + The number of actors actually removed. + """ + ... + + @abstractmethod + def per_actor_resource_usage(self) -> ExecutionResources: + """Per actor resource usage.""" + ... 
diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/autoscaler/default_autoscaler.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/autoscaler/default_autoscaler.py new file mode 100644 index 0000000000000000000000000000000000000000..778505f9fc905123f1ee11af26e1c31cc89f385e --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/autoscaler/default_autoscaler.py @@ -0,0 +1,188 @@ +import math +import time +from typing import TYPE_CHECKING, Dict + +import ray +from .autoscaler import Autoscaler +from .autoscaling_actor_pool import AutoscalingActorPool +from ray.data._internal.execution.autoscaling_requester import ( + get_or_create_autoscaling_requester_actor, +) +from ray.data._internal.execution.interfaces.execution_options import ExecutionResources + +if TYPE_CHECKING: + from ray.data._internal.execution.interfaces import PhysicalOperator + from ray.data._internal.execution.resource_manager import ResourceManager + from ray.data._internal.execution.streaming_executor_state import OpState, Topology + + +class DefaultAutoscaler(Autoscaler): + + # Default threshold of actor pool utilization to trigger scaling up. + DEFAULT_ACTOR_POOL_SCALING_UP_THRESHOLD: float = 0.8 + # Default threshold of actor pool utilization to trigger scaling down. + DEFAULT_ACTOR_POOL_SCALING_DOWN_THRESHOLD: float = 0.5 + + # Min number of seconds between two autoscaling requests. 
+ MIN_GAP_BETWEEN_AUTOSCALING_REQUESTS = 20 + + def __init__( + self, + topology: "Topology", + resource_manager: "ResourceManager", + execution_id: str, + actor_pool_scaling_up_threshold: float = DEFAULT_ACTOR_POOL_SCALING_UP_THRESHOLD, # noqa: E501 + actor_pool_scaling_down_threshold: float = DEFAULT_ACTOR_POOL_SCALING_DOWN_THRESHOLD, # noqa: E501 + ): + self._actor_pool_scaling_up_threshold = actor_pool_scaling_up_threshold + self._actor_pool_scaling_down_threshold = actor_pool_scaling_down_threshold + # Last time when a request was sent to Ray's autoscaler. + self._last_request_time = 0 + super().__init__(topology, resource_manager, execution_id) + + def try_trigger_scaling(self): + self._try_scale_up_cluster() + self._try_scale_up_or_down_actor_pool() + + def _calculate_actor_pool_util(self, actor_pool: AutoscalingActorPool): + """Calculate the utilization of the given actor pool.""" + if actor_pool.current_size() == 0: + return 0 + else: + return actor_pool.num_active_actors() / actor_pool.current_size() + + def _actor_pool_should_scale_up( + self, + actor_pool: AutoscalingActorPool, + op: "PhysicalOperator", + op_state: "OpState", + ): + # Do not scale up, if the op is completed or no more inputs are coming. + if op.completed() or (op._inputs_complete and op.internal_queue_size() == 0): + return False + if actor_pool.current_size() < actor_pool.min_size(): + # Scale up, if the actor pool is below min size. + return True + elif actor_pool.current_size() >= actor_pool.max_size(): + # Do not scale up, if the actor pool is already at max size. + return False + # Do not scale up, if the op does not have more resources. + if not op_state._scheduling_status.under_resource_limits: + return False + # Do not scale up, if the op has enough free slots for the existing inputs. + if op_state.num_queued() <= actor_pool.num_free_task_slots(): + return False + # Determine whether to scale up based on the actor pool utilization. 
+ util = self._calculate_actor_pool_util(actor_pool) + return util > self._actor_pool_scaling_up_threshold + + def _actor_pool_should_scale_down( + self, + actor_pool: AutoscalingActorPool, + op: "PhysicalOperator", + ): + # Scale down, if the op is completed or no more inputs are coming. + if op.completed() or (op._inputs_complete and op.internal_queue_size() == 0): + return True + if actor_pool.current_size() > actor_pool.max_size(): + # Scale down, if the actor pool is above max size. + return True + elif actor_pool.current_size() <= actor_pool.min_size(): + # Do not scale down, if the actor pool is already at min size. + return False + # Determine whether to scale down based on the actor pool utilization. + util = self._calculate_actor_pool_util(actor_pool) + return util < self._actor_pool_scaling_down_threshold + + def _try_scale_up_or_down_actor_pool(self): + for op, state in self._topology.items(): + actor_pools = op.get_autoscaling_actor_pools() + for actor_pool in actor_pools: + while True: + # Try to scale up or down the actor pool. + should_scale_up = self._actor_pool_should_scale_up( + actor_pool, + op, + state, + ) + should_scale_down = self._actor_pool_should_scale_down( + actor_pool, op + ) + if should_scale_up and not should_scale_down: + if actor_pool.scale_up(1) == 0: + break + elif should_scale_down and not should_scale_up: + if actor_pool.scale_down(1) == 0: + break + else: + break + + def _try_scale_up_cluster(self): + """Try to scale up the cluster to accomodate the provided in-progress workload. + + This makes a resource request to Ray's autoscaler consisting of the current, + aggregate usage of all operators in the DAG + the incremental usage of all + operators that are ready for dispatch (i.e. that have inputs queued). If the + autoscaler were to grant this resource request, it would allow us to dispatch + one task for every ready operator. 
+ + Note that this resource request does not take the global resource limits or the + liveness policy into account; it only tries to make the existing resource usage + + one more task per ready operator feasible in the cluster. + """ + # Limit the frequency of autoscaling requests. + now = time.time() + if now - self._last_request_time < self.MIN_GAP_BETWEEN_AUTOSCALING_REQUESTS: + return + + # Scale up the cluster, if no ops are allowed to run, but there are still data + # in the input queues. + no_runnable_op = all( + op_state._scheduling_status.runnable is False + for _, op_state in self._topology.items() + ) + any_has_input = any( + op_state.num_queued() > 0 for _, op_state in self._topology.items() + ) + if not (no_runnable_op and any_has_input): + return + + self._last_request_time = now + + # Get resource usage for all ops + additional resources needed to launch one + # more task for each ready op. + resource_request = [] + + def to_bundle(resource: ExecutionResources) -> Dict: + req = {} + if resource.cpu: + req["CPU"] = math.ceil(resource.cpu) + if resource.gpu: + req["GPU"] = math.ceil(resource.gpu) + return req + + for op, state in self._topology.items(): + per_task_resource = op.incremental_resource_usage() + task_bundle = to_bundle(per_task_resource) + resource_request.extend([task_bundle] * op.num_active_tasks()) + # Only include incremental resource usage for ops that are ready for + # dispatch. + if state.num_queued() > 0: + # TODO(Clark): Scale up more aggressively by adding incremental resource + # usage for more than one bundle in the queue for this op? + resource_request.append(task_bundle) + + self._send_resource_request(resource_request) + + def _send_resource_request(self, resource_request): + # Make autoscaler resource request. 
+ actor = get_or_create_autoscaling_requester_actor() + actor.request_resources.remote(resource_request, self._execution_id) + + def on_executor_shutdown(self): + # Make request for zero resources to autoscaler for this execution. + actor = get_or_create_autoscaling_requester_actor() + actor.request_resources.remote({}, self._execution_id) + + def get_total_resources(self) -> ExecutionResources: + return ExecutionResources.from_resource_dict(ray.cluster_resources()) diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/autoscaling_requester.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/autoscaling_requester.py new file mode 100644 index 0000000000000000000000000000000000000000..512c3c16f488130f32ac3993bc4b24a84e5ac4fd --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/autoscaling_requester.py @@ -0,0 +1,131 @@ +import math +import threading +import time +from typing import Dict, List + +import ray +from ray.data.context import DataContext +from ray.util.scheduling_strategies import NodeAffinitySchedulingStrategy + +# Resource requests are considered stale after this number of seconds, and +# will be purged. +RESOURCE_REQUEST_TIMEOUT = 60 +PURGE_INTERVAL = RESOURCE_REQUEST_TIMEOUT * 2 + +# When the autoscaling is driven by memory pressure and there are abundant +# CPUs to support incremental CPUs needed to launch more tasks, we'll translate +# memory pressure into an artificial request of CPUs. The amount of CPUs we'll +# request is ARTIFICIAL_CPU_SCALING_FACTOR * ray.cluster_resources()["CPU"]. +ARTIFICIAL_CPU_SCALING_FACTOR = 1.2 + + +@ray.remote(num_cpus=0, max_restarts=-1, max_task_retries=-1) +class AutoscalingRequester: + """Actor to make resource requests to autoscaler for the datasets. + + The resource requests are set to timeout after RESOURCE_REQUEST_TIMEOUT seconds. 
+ For those live requests, we keep track of the last request made for each execution, + which overrides all previous requests it made; then sum the requested amounts + across all executions as the final request to the autoscaler. + """ + + def __init__(self): + # execution_id -> (List[Dict], expiration timestamp) + self._resource_requests = {} + # TTL for requests. + self._timeout = RESOURCE_REQUEST_TIMEOUT + + self._self_handle = ray.get_runtime_context().current_actor + + # Start a thread to purge expired requests periodically. + def purge_thread_run(): + while True: + time.sleep(PURGE_INTERVAL) + # Call purge_expired_requests() as an actor task, + # so we don't need to handle multi-threading. + ray.get(self._self_handle.purge_expired_requests.remote()) + + self._purge_thread = threading.Thread(target=purge_thread_run, daemon=True) + self._purge_thread.start() + + def purge_expired_requests(self): + self._purge() + ray.autoscaler.sdk.request_resources(bundles=self._aggregate_requests()) + + def request_resources(self, req: List[Dict], execution_id: str): + # Purge expired requests before making request to autoscaler. + self._purge() + # For the same execution_id, we track the latest resource request and + # the its expiration timestamp. + self._resource_requests[execution_id] = ( + req, + time.time() + self._timeout, + ) + # We aggregate the resource requests across all execution_id's to Ray + # autoscaler. + ray.autoscaler.sdk.request_resources(bundles=self._aggregate_requests()) + + def _purge(self): + # Purge requests that are stale. 
+ now = time.time() + for k, (_, t) in list(self._resource_requests.items()): + if t < now: + self._resource_requests.pop(k) + + def _aggregate_requests(self) -> List[Dict]: + req = [] + for _, (r, _) in self._resource_requests.items(): + req.extend(r) + + def get_cpus(req): + num_cpus = 0 + for r in req: + if "CPU" in r: + num_cpus += r["CPU"] + return num_cpus + + # Round up CPUs to exceed total cluster CPUs so it can actually upscale. + # This is to handle the issue where the autoscaling is driven by memory + # pressure (rather than CPUs) from streaming executor. In such case, simply + # asking for incremental CPUs (e.g. 1 CPU for each ready operator) may not + # actually be able to trigger autoscaling if existing CPUs in cluster can + # already satisfy the incremental CPUs request. + num_cpus = get_cpus(req) + if num_cpus > 0: + total = ray.cluster_resources() + if "CPU" in total and num_cpus <= total["CPU"]: + delta = ( + math.ceil(ARTIFICIAL_CPU_SCALING_FACTOR * total["CPU"]) - num_cpus + ) + req.extend([{"CPU": 1}] * delta) + + return req + + def _test_set_timeout(self, ttl): + """Set the timeout. This is for test only""" + self._timeout = ttl + + +# Creating/getting an actor from multiple threads is not safe. +# https://github.com/ray-project/ray/issues/41324 +_autoscaling_requester_lock: threading.RLock = threading.RLock() + + +def get_or_create_autoscaling_requester_actor(): + ctx = DataContext.get_current() + scheduling_strategy = ctx.scheduling_strategy + # Pin the stats actor to the local node so it fate-shares with the driver. + # Note: for Ray Client, the ray.get_runtime_context().get_node_id() should + # point to the head node. 
+ scheduling_strategy = NodeAffinitySchedulingStrategy( + ray.get_runtime_context().get_node_id(), + soft=False, + ) + with _autoscaling_requester_lock: + return AutoscalingRequester.options( + name="AutoscalingRequester", + namespace="AutoscalingRequester", + get_if_exists=True, + lifetime="detached", + scheduling_strategy=scheduling_strategy, + ).remote() diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/backpressure_policy/__init__.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/backpressure_policy/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a9d6ac177e97dda39e4f3b59c30d51efa30ddfc4 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/backpressure_policy/__init__.py @@ -0,0 +1,32 @@ +from typing import TYPE_CHECKING + +import ray +from .backpressure_policy import BackpressurePolicy +from .concurrency_cap_backpressure_policy import ConcurrencyCapBackpressurePolicy + +if TYPE_CHECKING: + from ray.data._internal.execution.streaming_executor_state import Topology + +# Default enabled backpressure policies and its config key. +# Use `DataContext.set_config` to config it. 
+ENABLED_BACKPRESSURE_POLICIES = [ + ConcurrencyCapBackpressurePolicy, +] +ENABLED_BACKPRESSURE_POLICIES_CONFIG_KEY = "backpressure_policies.enabled" + + +def get_backpressure_policies(topology: "Topology"): + data_context = ray.data.DataContext.get_current() + policies = data_context.get_config( + ENABLED_BACKPRESSURE_POLICIES_CONFIG_KEY, ENABLED_BACKPRESSURE_POLICIES + ) + + return [policy(topology) for policy in policies] + + +__all__ = [ + "BackpressurePolicy", + "ConcurrencyCapBackpressurePolicy", + "ENABLED_BACKPRESSURE_POLICIES_CONFIG_KEY", + "get_backpressure_policies", +] diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/backpressure_policy/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/backpressure_policy/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..30c092a4600a4b589df948302c0a037c941e6863 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/backpressure_policy/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/backpressure_policy/__pycache__/backpressure_policy.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/backpressure_policy/__pycache__/backpressure_policy.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b40ae77cf6fa4afe02f1a4da94ff252bdf1e97f7 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/backpressure_policy/__pycache__/backpressure_policy.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/backpressure_policy/__pycache__/concurrency_cap_backpressure_policy.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/backpressure_policy/__pycache__/concurrency_cap_backpressure_policy.cpython-311.pyc new file mode 
100644 index 0000000000000000000000000000000000000000..f6104096d5e45c84be9ca741b6d85d020f60ced4 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/backpressure_policy/__pycache__/concurrency_cap_backpressure_policy.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/backpressure_policy/backpressure_policy.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/backpressure_policy/backpressure_policy.py new file mode 100644 index 0000000000000000000000000000000000000000..6577936e1dd610f22ab038ab9aa15b8c181467b8 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/backpressure_policy/backpressure_policy.py @@ -0,0 +1,28 @@ +from abc import ABC, abstractmethod +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from ray.data._internal.execution.interfaces.physical_operator import ( + PhysicalOperator, + ) + from ray.data._internal.execution.streaming_executor_state import Topology + + +class BackpressurePolicy(ABC): + """Interface for back pressure policies.""" + + @abstractmethod + def __init__(self, topology: "Topology"): + ... + + def can_add_input(self, op: "PhysicalOperator") -> bool: + """Determine if we can add a new input to the operator. If returns False, the + operator will be backpressured and will not be able to run new tasks. + Used in `streaming_executor_state.py::select_operator_to_run()`. + + Returns: True if we can add a new input to the operator, False otherwise. + + Note, if multiple backpressure policies are enabled, the operator will be + backpressured if any of the policies returns False. 
+ """ + return True diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/backpressure_policy/concurrency_cap_backpressure_policy.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/backpressure_policy/concurrency_cap_backpressure_policy.py new file mode 100644 index 0000000000000000000000000000000000000000..a52bd1f6ab9f7a6e544bb07a0cacd008eb219e35 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/backpressure_policy/concurrency_cap_backpressure_policy.py @@ -0,0 +1,43 @@ +import logging +from typing import TYPE_CHECKING + +from .backpressure_policy import BackpressurePolicy +from ray.data._internal.execution.operators.task_pool_map_operator import ( + TaskPoolMapOperator, +) + +if TYPE_CHECKING: + from ray.data._internal.execution.interfaces.physical_operator import ( + PhysicalOperator, + ) + from ray.data._internal.execution.streaming_executor_state import Topology + +logger = logging.getLogger(__name__) + + +class ConcurrencyCapBackpressurePolicy(BackpressurePolicy): + """A backpressure policy that caps the concurrency of each operator. + + The policy will limit the number of concurrently running tasks based on its + concurrency cap parameter. + + NOTE: Only support setting concurrency cap for `TaskPoolMapOperator` for now. + TODO(chengsu): Consolidate with actor scaling logic of `ActorPoolMapOperator`. 
+ """ + + def __init__(self, topology: "Topology"): + self._concurrency_caps: dict["PhysicalOperator", float] = {} + + for op, _ in topology.items(): + if isinstance(op, TaskPoolMapOperator) and op.get_concurrency() is not None: + self._concurrency_caps[op] = op.get_concurrency() + else: + self._concurrency_caps[op] = float("inf") + + logger.debug( + "ConcurrencyCapBackpressurePolicy initialized with: " + f"{self._concurrency_caps}" + ) + + def can_add_input(self, op: "PhysicalOperator") -> bool: + return op.metrics.num_tasks_running < self._concurrency_caps[op] diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/bundle_queue/__init__.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/bundle_queue/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..aa51465258e2838a6b55b928ad3c0d0361a90d38 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/bundle_queue/__init__.py @@ -0,0 +1,9 @@ +from .bundle_queue import BundleQueue +from .fifo_bundle_queue import FIFOBundleQueue + + +def create_bundle_queue() -> BundleQueue: + return FIFOBundleQueue() + + +__all__ = ["BundleQueue", "create_bundle_queue"] diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/bundle_queue/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/bundle_queue/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..942c347ecff5056e5e1f0a1e7e5ba5e6c85a8584 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/bundle_queue/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/bundle_queue/__pycache__/bundle_queue.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/bundle_queue/__pycache__/bundle_queue.cpython-311.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..d8b3baff6045d1ffe584b578f53d04227346cc41 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/bundle_queue/__pycache__/bundle_queue.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/bundle_queue/__pycache__/fifo_bundle_queue.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/bundle_queue/__pycache__/fifo_bundle_queue.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..37b97ee9b061fe816f8af56abb5fe43ed6d4438f Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/bundle_queue/__pycache__/fifo_bundle_queue.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/bundle_queue/bundle_queue.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/bundle_queue/bundle_queue.py new file mode 100644 index 0000000000000000000000000000000000000000..f11bacf14c333febd11d840b3f4f369491a74f96 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/bundle_queue/bundle_queue.py @@ -0,0 +1,62 @@ +import abc +from typing import TYPE_CHECKING, Optional + +if TYPE_CHECKING: + from ray.data._internal.execution.interfaces import RefBundle + + +class BundleQueue(abc.ABC): + @abc.abstractmethod + def __len__(self) -> int: + """Return the number of bundles in the queue.""" + ... + + @abc.abstractmethod + def __contains__(self, bundle: "RefBundle") -> bool: + """Return whether the bundle is in the queue.""" + ... + + @abc.abstractmethod + def add(self, bundle: "RefBundle") -> None: + """Add a bundle to the queue.""" + ... + + @abc.abstractmethod + def pop(self) -> "RefBundle": + """Remove and return the head of the queue. + + Raises: + IndexError: If the queue is empty. + """ + ... 
+ + @abc.abstractmethod + def peek(self) -> Optional["RefBundle"]: + """Return the head of the queue without removing it. + + If the queue is empty, return `None`. + """ + ... + + @abc.abstractmethod + def remove(self, bundle: "RefBundle"): + """Remove a bundle from the queue.""" + ... + + @abc.abstractmethod + def clear(self): + """Remove all bundles from the queue.""" + ... + + @abc.abstractmethod + def estimate_size_bytes(self) -> int: + """Return an estimate of the total size of objects in the queue.""" + ... + + @abc.abstractmethod + def is_empty(self): + """Return whether this queue and all of its internal data structures are empty. + + This method is used for testing. + """ + ... diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/execution_callback.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/execution_callback.py new file mode 100644 index 0000000000000000000000000000000000000000..8cc61a581088aafed4a36a4a94d16d9fd6cab1f4 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/execution_callback.py @@ -0,0 +1,40 @@ +from typing import List + +from ray.data.context import DataContext + +EXECUTION_CALLBACKS_CONFIG_KEY = "execution_callbacks" + + +class ExecutionCallback: + """Callback interface for execution events.""" + + def before_execution_starts(self): + """Called before the Dataset execution starts.""" + ... + + def after_execution_succeeds(self): + """Called after the Dataset execution succeeds.""" + ... + + def after_execution_fails(self, error: Exception): + """Called after the Dataset execution fails.""" + ... 
+ + +def get_execution_callbacks(context: DataContext) -> List[ExecutionCallback]: + """Get all ExecutionCallbacks from the DataContext.""" + return context.get_config(EXECUTION_CALLBACKS_CONFIG_KEY, []) + + +def add_execution_callback(callback: ExecutionCallback, context: DataContext): + """Add an ExecutionCallback to the DataContext.""" + execution_callbacks = context.get_config(EXECUTION_CALLBACKS_CONFIG_KEY, []) + execution_callbacks.append(callback) + context.set_config(EXECUTION_CALLBACKS_CONFIG_KEY, execution_callbacks) + + +def remove_execution_callback(callback: ExecutionCallback, context: DataContext): + """Remove an ExecutionCallback from the DataContext.""" + execution_callbacks = context.get_config(EXECUTION_CALLBACKS_CONFIG_KEY, []) + execution_callbacks.remove(callback) + context.set_config(EXECUTION_CALLBACKS_CONFIG_KEY, execution_callbacks) diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/interfaces/__init__.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/interfaces/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..32d84b64abddc45fe96e54e370ab189d57567d5e --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/interfaces/__init__.py @@ -0,0 +1,19 @@ +from .common import NodeIdStr +from .execution_options import ExecutionOptions, ExecutionResources +from .executor import Executor, OutputIterator +from .physical_operator import PhysicalOperator +from .ref_bundle import RefBundle +from .task_context import TaskContext +from .transform_fn import AllToAllTransformFn + +__all__ = [ + "AllToAllTransformFn", + "ExecutionOptions", + "ExecutionResources", + "Executor", + "NodeIdStr", + "OutputIterator", + "PhysicalOperator", + "RefBundle", + "TaskContext", +] diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/interfaces/common.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/interfaces/common.py new file 
mode 100644 index 0000000000000000000000000000000000000000..a337c90e7dcce2c62f39bb917e31114a814f195a --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/interfaces/common.py @@ -0,0 +1,2 @@ +# Node id string returned by `ray.get_runtime_context().get_node_id()`. +NodeIdStr = str diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/interfaces/execution_options.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/interfaces/execution_options.py new file mode 100644 index 0000000000000000000000000000000000000000..8000901992bc1ba672dc893222975c95aad095ef --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/interfaces/execution_options.py @@ -0,0 +1,300 @@ +import os +from typing import Dict, List, Optional, Union + +from .common import NodeIdStr +from ray.data._internal.execution.util import memory_string +from ray.util.annotations import DeveloperAPI + + +class ExecutionResources: + """Specifies resources usage or resource limits for execution. + + By default this class represents resource usage. Use `for_limits` or + set `default_to_inf` to True to create an object that represents resource limits. + """ + + def __init__( + self, + cpu: Optional[float] = None, + gpu: Optional[float] = None, + object_store_memory: Optional[float] = None, + default_to_inf: bool = False, + ): + """Initializes ExecutionResources. + Args: + cpu: Amount of logical CPU slots. + gpu: Amount of logical GPU slots. + object_store_memory: Amount of object store memory. + default_to_inf: When the object represents resource usage, this flag + should be set to False. And missing values will default to 0. + When the object represents resource limits, this flag should be + set to True. And missing values will default to infinity. 
+ """ + self._cpu = cpu + self._gpu = gpu + self._object_store_memory = object_store_memory + self._default_to_inf = default_to_inf + + @classmethod + def from_resource_dict( + cls, + resource_dict: Dict[str, float], + default_to_inf: bool = False, + ): + """Create an ExecutionResources object from a resource dict.""" + return ExecutionResources( + cpu=resource_dict.get("CPU", None), + gpu=resource_dict.get("GPU", None), + object_store_memory=resource_dict.get("object_store_memory", None), + default_to_inf=default_to_inf, + ) + + @classmethod + def for_limits( + cls, + cpu: Optional[float] = None, + gpu: Optional[float] = None, + object_store_memory: Optional[float] = None, + ) -> "ExecutionResources": + """Create an ExecutionResources object that represents resource limits. + Args: + cpu: Amount of logical CPU slots. + gpu: Amount of logical GPU slots. + object_store_memory: Amount of object store memory. + """ + return ExecutionResources( + cpu=cpu, + gpu=gpu, + object_store_memory=object_store_memory, + default_to_inf=True, + ) + + @property + def cpu(self) -> float: + if self._cpu is not None: + return self._cpu + return 0.0 if not self._default_to_inf else float("inf") + + @cpu.setter + def cpu(self, value: float): + self._cpu = value + + @property + def gpu(self) -> float: + if self._gpu is not None: + return self._gpu + return 0.0 if not self._default_to_inf else float("inf") + + @gpu.setter + def gpu(self, value: float): + self._gpu = value + + @property + def object_store_memory(self) -> float: + if self._object_store_memory is not None: + return self._object_store_memory + return 0.0 if not self._default_to_inf else float("inf") + + @object_store_memory.setter + def object_store_memory(self, value: float): + self._object_store_memory = value + + def __repr__(self): + return ( + f"ExecutionResources(cpu={self.cpu:.1f}, gpu={self.gpu:.1f}, " + f"object_store_memory={self.object_store_memory_str()})" + ) + + def __eq__(self, other: "ExecutionResources") -> 
bool: + return ( + self.cpu == other.cpu + and self.gpu == other.gpu + and self.object_store_memory == other.object_store_memory + ) + + @classmethod + def zero(cls) -> "ExecutionResources": + """Returns an ExecutionResources object with zero resources.""" + return ExecutionResources(0.0, 0.0, 0.0) + + def is_zero(self) -> bool: + """Returns True if all resources are zero.""" + return self.cpu == 0.0 and self.gpu == 0.0 and self.object_store_memory == 0.0 + + def is_non_negative(self) -> bool: + """Returns True if all resources are non-negative.""" + return self.cpu >= 0 and self.gpu >= 0 and self.object_store_memory >= 0 + + def object_store_memory_str(self) -> str: + """Returns a human-readable string for the object store memory field.""" + if self.object_store_memory == float("inf"): + return "inf" + return memory_string(self.object_store_memory) + + def copy(self) -> "ExecutionResources": + """Returns a copy of this ExecutionResources object.""" + return ExecutionResources( + self._cpu, self._gpu, self._object_store_memory, self._default_to_inf + ) + + def add(self, other: "ExecutionResources") -> "ExecutionResources": + """Adds execution resources. + + Returns: + A new ExecutionResource object with summed resources. + """ + return ExecutionResources( + self.cpu + other.cpu, + self.gpu + other.gpu, + self.object_store_memory + other.object_store_memory, + ) + + def subtract(self, other: "ExecutionResources") -> "ExecutionResources": + """Subtracts execution resources. + + Returns: + A new ExecutionResource object with subtracted resources. 
+ """ + return ExecutionResources( + self.cpu - other.cpu, + self.gpu - other.gpu, + self.object_store_memory - other.object_store_memory, + ) + + def max(self, other: "ExecutionResources") -> "ExecutionResources": + """Returns the maximum for each resource type.""" + return ExecutionResources( + cpu=max(self.cpu, other.cpu), + gpu=max(self.gpu, other.gpu), + object_store_memory=max( + self.object_store_memory, other.object_store_memory + ), + ) + + def min(self, other: "ExecutionResources") -> "ExecutionResources": + """Returns the minimum for each resource type.""" + return ExecutionResources( + cpu=min(self.cpu, other.cpu), + gpu=min(self.gpu, other.gpu), + object_store_memory=min( + self.object_store_memory, other.object_store_memory + ), + ) + + def satisfies_limit(self, limit: "ExecutionResources") -> bool: + """Return if this resource struct meets the specified limits. + + Note that None for a field means no limit. + """ + return ( + self.cpu <= limit.cpu + and self.gpu <= limit.gpu + and self.object_store_memory <= limit.object_store_memory + ) + + def scale(self, f: float) -> "ExecutionResources": + """Return copy with all set values scaled by `f`.""" + if f < 0: + raise ValueError("Scaling factor must be non-negative.") + if f == 0: + # Explicitly handle the zero case, because `0 * inf` is undefined. + return ExecutionResources.zero() + return ExecutionResources( + cpu=self.cpu * f, + gpu=self.gpu * f, + object_store_memory=self.object_store_memory * f, + ) + + +@DeveloperAPI +class ExecutionOptions: + """Common options for execution. + + Some options may not be supported on all executors (e.g., resource limits). + + Attributes: + resource_limits: Set a soft limit on the resource usage during execution. + Autodetected by default. + exclude_resources: Amount of resources to exclude from Ray Data. + Set this if you have other workloads running on the same cluster. 
+ Note, + - If using Ray Data with Ray Train, training resources will be + automatically excluded. + - For each resource type, resource_limits and exclude_resources can + not be both set. + locality_with_output: Set this to prefer running tasks on the same node as the + output node (node driving the execution). It can also be set to a list of + node ids to spread the outputs across those nodes. Off by default. + preserve_order: Set this to preserve the ordering between blocks processed by + operators. Off by default. + actor_locality_enabled: Whether to enable locality-aware task dispatch to + actors (off by default). This parameter applies to both stateful map and + streaming_split operations. + verbose_progress: Whether to report progress individually per operator. By + default, only AllToAll operators and global progress is reported. This + option is useful for performance debugging. On by default. + """ + + def __init__( + self, + resource_limits: Optional[ExecutionResources] = None, + exclude_resources: Optional[ExecutionResources] = None, + locality_with_output: Union[bool, List[NodeIdStr]] = False, + preserve_order: bool = False, + # TODO(hchen): Re-enable `actor_locality_enabled` by default after fixing + # https://github.com/ray-project/ray/issues/43466 + actor_locality_enabled: bool = False, + verbose_progress: Optional[bool] = None, + ): + if resource_limits is None: + resource_limits = ExecutionResources.for_limits() + self.resource_limits = resource_limits + if exclude_resources is None: + exclude_resources = ExecutionResources.zero() + self.exclude_resources = exclude_resources + self.locality_with_output = locality_with_output + self.preserve_order = preserve_order + self.actor_locality_enabled = actor_locality_enabled + if verbose_progress is None: + verbose_progress = bool( + int(os.environ.get("RAY_DATA_VERBOSE_PROGRESS", "1")) + ) + self.verbose_progress = verbose_progress + + def __repr__(self) -> str: + return ( + 
f"ExecutionOptions(resource_limits={self.resource_limits}, " + f"exclude_resources={self.exclude_resources}, " + f"locality_with_output={self.locality_with_output}, " + f"preserve_order={self.preserve_order}, " + f"actor_locality_enabled={self.actor_locality_enabled}, " + f"verbose_progress={self.verbose_progress})" + ) + + @property + def resource_limits(self) -> ExecutionResources: + return self._resource_limits + + @resource_limits.setter + def resource_limits(self, value: ExecutionResources) -> None: + self._resource_limits = ExecutionResources.for_limits( + cpu=value._cpu, + gpu=value._gpu, + object_store_memory=value._object_store_memory, + ) + + def is_resource_limits_default(self): + """Returns True if resource_limits is the default value.""" + return self._resource_limits == ExecutionResources.for_limits() + + def validate(self) -> None: + """Validate the options.""" + for attr in ["cpu", "gpu", "object_store_memory"]: + if ( + getattr(self.resource_limits, attr) != float("inf") + and getattr(self.exclude_resources, attr, 0) > 0 + ): + raise ValueError( + "resource_limits and exclude_resources cannot " + f" both be set for {attr} resource." + ) diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/interfaces/executor.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/interfaces/executor.py new file mode 100644 index 0000000000000000000000000000000000000000..007346b60f29473252b888e717d67149e58343c1 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/interfaces/executor.py @@ -0,0 +1,77 @@ +from typing import Iterable, Iterator, Optional + +from .execution_options import ExecutionOptions +from .physical_operator import PhysicalOperator +from .ref_bundle import RefBundle +from ray.data._internal.stats import DatasetStats + + +class OutputIterator(Iterator[RefBundle]): + """Iterator used to access the output of an Executor execution. + + This is a blocking iterator. 
Datasets guarantees that all its iterators are + thread-safe (i.e., multiple threads can block on them at the same time). + """ + + def __init__(self, base: Iterable[RefBundle]): + self._it = iter(base) + + def get_next(self, output_split_idx: Optional[int] = None) -> RefBundle: + """Can be used to pull outputs by a specified output index. + + This is used to support the streaming_split() API, where the output of a + streaming execution is to be consumed by multiple processes. + + Args: + output_split_idx: The output split index to get results for. This arg is + only allowed for iterators created by `Dataset.streaming_split()`. + + Raises: + StopIteration if there are no more outputs to return. + """ + if output_split_idx is not None: + raise NotImplementedError() + return next(self._it) + + def __next__(self) -> RefBundle: + return self.get_next() + + +class Executor: + """Abstract class for executors, which implement physical operator execution. + + Subclasses: + StreamingExecutor + """ + + def __init__(self, options: ExecutionOptions): + """Create the executor.""" + options.validate() + self._options = options + + def execute( + self, dag: PhysicalOperator, initial_stats: Optional[DatasetStats] = None + ) -> OutputIterator: + """Start execution. + + Args: + dag: The operator graph to execute. + initial_stats: The DatasetStats to prepend to the stats returned by the + executor. These stats represent actions done to compute inputs. + """ + raise NotImplementedError + + def shutdown(self): + """Shutdown an executor, which may still be running. + + This should interrupt execution and clean up any used resources. + """ + pass + + def get_stats(self) -> DatasetStats: + """Return stats for the execution so far. + + This is generally called after `execute` has completed, but may be called + while iterating over `execute` results for streaming execution. 
+ """ + raise NotImplementedError diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/interfaces/op_runtime_metrics.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/interfaces/op_runtime_metrics.py new file mode 100644 index 0000000000000000000000000000000000000000..c70c6fbecd60fd22fb65d81214e025d291ca3c20 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/interfaces/op_runtime_metrics.py @@ -0,0 +1,574 @@ +import time +from dataclasses import Field, dataclass, field +from enum import Enum +from typing import TYPE_CHECKING, Any, Dict, List, Optional + +import ray +from ray.data._internal.execution.bundle_queue import create_bundle_queue +from ray.data._internal.execution.interfaces.ref_bundle import RefBundle +from ray.data._internal.memory_tracing import trace_allocation + +if TYPE_CHECKING: + from ray.data._internal.execution.interfaces.physical_operator import ( + PhysicalOperator, + ) + + +# A metadata key used to mark a dataclass field as a metric. +_IS_FIELD_METRIC_KEY = "__is_metric" +# Metadata keys used to store information about a metric. +_METRIC_FIELD_DESCRIPTION_KEY = "__metric_description" +_METRIC_FIELD_METRICS_GROUP_KEY = "__metric_metrics_group" +_METRIC_FIELD_IS_MAP_ONLY_KEY = "__metric_is_map_only" + +_METRICS: List["MetricDefinition"] = [] + + +class MetricsGroup(Enum): + INPUTS = "inputs" + OUTPUTS = "outputs" + TASKS = "tasks" + OBJECT_STORE_MEMORY = "object_store_memory" + MISC = "misc" + + +@dataclass(frozen=True) +class MetricDefinition: + """Metadata for a metric. + + Args: + name: The name of the metric. + description: A human-readable description of the metric, also used as the chart + description on the Ray Data dashboard. + metrics_group: The group of the metric, used to organize metrics into groups in + 'StatsActor' and on the Ray Data dashboard. + map_only: Whether the metric is only measured for 'MapOperators'. 
+ """ + + name: str + description: str + metrics_group: str + # TODO: Let's refactor this parameter so it isn't tightly coupled with a specific + # operator type (MapOperator). + map_only: bool = False + + +def metric_field( + *, + description: str, + metrics_group: str, + map_only: bool = False, + **field_kwargs, +): + """A dataclass field that represents a metric.""" + metadata = field_kwargs.get("metadata", {}) + + metadata[_IS_FIELD_METRIC_KEY] = True + + metadata[_METRIC_FIELD_DESCRIPTION_KEY] = description + metadata[_METRIC_FIELD_METRICS_GROUP_KEY] = metrics_group + metadata[_METRIC_FIELD_IS_MAP_ONLY_KEY] = map_only + + return field(metadata=metadata, **field_kwargs) + + +def metric_property( + *, + description: str, + metrics_group: str, + map_only: bool = False, +): + """A property that represents a metric.""" + + def wrap(func): + metric = MetricDefinition( + name=func.__name__, + description=description, + metrics_group=metrics_group, + map_only=map_only, + ) + + _METRICS.append(metric) + + return property(func) + + return wrap + + +@dataclass +class RunningTaskInfo: + inputs: RefBundle + num_outputs: int + bytes_outputs: int + + +class OpRuntimesMetricsMeta(type): + def __init__(cls, name, bases, dict): + # NOTE: `Field.name` isn't set until the dataclass is created, so we can't + # create the metrics in `metric_field` directly. + super().__init__(name, bases, dict) + + # Iterate over the attributes and methods of 'OpRuntimeMetrics'. + for name, value in dict.items(): + # If an attribute is a dataclass field and has _IS_FIELD_METRIC_KEY in its + # metadata, then create a metric from the field metadata and add it to the + # list of metrics. See also the 'metric_field' function. 
+ if isinstance(value, Field) and value.metadata.get(_IS_FIELD_METRIC_KEY): + metric = MetricDefinition( + name=name, + description=value.metadata[_METRIC_FIELD_DESCRIPTION_KEY], + metrics_group=value.metadata[_METRIC_FIELD_METRICS_GROUP_KEY], + map_only=value.metadata[_METRIC_FIELD_IS_MAP_ONLY_KEY], + ) + _METRICS.append(metric) + + +@dataclass +class OpRuntimeMetrics(metaclass=OpRuntimesMetricsMeta): + """Runtime metrics for a 'PhysicalOperator'. + + Metrics are updated dynamically during the execution of the Dataset. + This class can be used for either observablity or scheduling purposes. + + DO NOT modify the fields of this class directly. Instead, use the provided + callback methods. + """ + + # TODO(hchen): Fields tagged with "map_only" currently only work for MapOperator. + # We should make them work for all operators by unifying the task execution code. + + # === Inputs-related metrics === + num_inputs_received: int = metric_field( + default=0, + description="Number of input blocks received by operator.", + metrics_group=MetricsGroup.INPUTS, + ) + bytes_inputs_received: int = metric_field( + default=0, + description="Byte size of input blocks received by operator.", + metrics_group=MetricsGroup.INPUTS, + ) + num_task_inputs_processed: int = metric_field( + default=0, + description=( + "Number of input blocks that operator's tasks have finished processing." + ), + metrics_group=MetricsGroup.INPUTS, + map_only=True, + ) + bytes_task_inputs_processed: int = metric_field( + default=0, + description=( + "Byte size of input blocks that operator's tasks have finished processing." 
+ ), + metrics_group=MetricsGroup.INPUTS, + map_only=True, + ) + bytes_inputs_of_submitted_tasks: int = metric_field( + default=0, + description="Byte size of input blocks passed to submitted tasks.", + metrics_group=MetricsGroup.INPUTS, + map_only=True, + ) + + # === Outputs-related metrics === + num_task_outputs_generated: int = metric_field( + default=0, + description="Number of output blocks generated by tasks.", + metrics_group=MetricsGroup.OUTPUTS, + map_only=True, + ) + bytes_task_outputs_generated: int = metric_field( + default=0, + description="Byte size of output blocks generated by tasks.", + metrics_group=MetricsGroup.OUTPUTS, + map_only=True, + ) + rows_task_outputs_generated: int = metric_field( + default=0, + description="Number of output rows generated by tasks.", + metrics_group=MetricsGroup.OUTPUTS, + map_only=True, + ) + num_outputs_taken: int = metric_field( + default=0, + description=( + "Number of output blocks that are already taken by downstream operators." + ), + metrics_group=MetricsGroup.OUTPUTS, + ) + bytes_outputs_taken: int = metric_field( + default=0, + description=( + "Byte size of output blocks that are already taken by downstream operators." + ), + metrics_group=MetricsGroup.OUTPUTS, + ) + num_outputs_of_finished_tasks: int = metric_field( + default=0, + description="Number of generated output blocks that are from finished tasks.", + metrics_group=MetricsGroup.OUTPUTS, + map_only=True, + ) + bytes_outputs_of_finished_tasks: int = metric_field( + default=0, + description=( + "Byte size of generated output blocks that are from finished tasks." 
+ ), + metrics_group=MetricsGroup.OUTPUTS, + map_only=True, + ) + + # === Tasks-related metrics === + num_tasks_submitted: int = metric_field( + default=0, + description="Number of submitted tasks.", + metrics_group=MetricsGroup.TASKS, + map_only=True, + ) + num_tasks_running: int = metric_field( + default=0, + description="Number of running tasks.", + metrics_group=MetricsGroup.TASKS, + map_only=True, + ) + num_tasks_have_outputs: int = metric_field( + default=0, + description="Number of tasks that already have output.", + metrics_group=MetricsGroup.TASKS, + map_only=True, + ) + num_tasks_finished: int = metric_field( + default=0, + description="Number of finished tasks.", + metrics_group=MetricsGroup.TASKS, + map_only=True, + ) + num_tasks_failed: int = metric_field( + default=0, + description="Number of failed tasks.", + metrics_group=MetricsGroup.TASKS, + map_only=True, + ) + block_generation_time: float = metric_field( + default=0, + description="Time spent generating blocks in tasks.", + metrics_group=MetricsGroup.TASKS, + map_only=True, + ) + task_submission_backpressure_time: float = metric_field( + default=0, + description="Time spent in task submission backpressure.", + metrics_group=MetricsGroup.TASKS, + ) + + # === Object store memory metrics === + obj_store_mem_internal_inqueue_blocks: int = metric_field( + default=0, + description="Number of blocks in operator's internal input queue.", + metrics_group=MetricsGroup.OBJECT_STORE_MEMORY, + ) + obj_store_mem_internal_outqueue_blocks: int = metric_field( + default=0, + description="Number of blocks in the operator's internal output queue.", + metrics_group=MetricsGroup.OBJECT_STORE_MEMORY, + ) + obj_store_mem_freed: int = metric_field( + default=0, + description="Byte size of freed memory in object store.", + metrics_group=MetricsGroup.OBJECT_STORE_MEMORY, + map_only=True, + ) + obj_store_mem_spilled: int = metric_field( + default=0, + description="Byte size of spilled memory in object store.", + 
metrics_group=MetricsGroup.OBJECT_STORE_MEMORY, + map_only=True, + ) + obj_store_mem_used: int = metric_field( + default=0, + description="Byte size of used memory in object store.", + metrics_group=MetricsGroup.OBJECT_STORE_MEMORY, + ) + + # === Miscellaneous metrics === + # Use "metrics_group: "misc" in the metadata for new metrics in this section. + + def __init__(self, op: "PhysicalOperator"): + from ray.data._internal.execution.operators.map_operator import MapOperator + + self._op = op + self._is_map = isinstance(op, MapOperator) + self._running_tasks: Dict[int, RunningTaskInfo] = {} + self._extra_metrics: Dict[str, Any] = {} + # Start time of current pause due to task submission backpressure + self._task_submission_backpressure_start_time = -1 + + self._internal_inqueue = create_bundle_queue() + self._internal_outqueue = create_bundle_queue() + self._pending_task_inputs = create_bundle_queue() + + @property + def extra_metrics(self) -> Dict[str, Any]: + """Return a dict of extra metrics.""" + return self._extra_metrics + + @classmethod + def get_metrics(self) -> List[MetricDefinition]: + return list(_METRICS) + + def as_dict(self): + """Return a dict representation of the metrics.""" + result = [] + for metric in self.get_metrics(): + if not self._is_map and metric.map_only: + continue + value = getattr(self, metric.name) + result.append((metric.name, value)) + + # TODO: record resource usage in OpRuntimeMetrics, + # avoid calling self._op.current_processor_usage() + resource_usage = self._op.current_processor_usage() + result.extend( + [ + ("cpu_usage", resource_usage.cpu or 0), + ("gpu_usage", resource_usage.gpu or 0), + ] + ) + result.extend(self._extra_metrics.items()) + return dict(result) + + @metric_property( + description="Average number of blocks generated per task.", + metrics_group=MetricsGroup.OUTPUTS, + map_only=True, + ) + def average_num_outputs_per_task(self) -> Optional[float]: + """Average number of output blocks per task, or None if no 
task has finished.""" + if self.num_tasks_finished == 0: + return None + else: + return self.num_outputs_of_finished_tasks / self.num_tasks_finished + + @metric_property( + description="Average size of task output in bytes.", + metrics_group=MetricsGroup.OUTPUTS, + map_only=True, + ) + def average_bytes_per_output(self) -> Optional[float]: + """Average size in bytes of output blocks.""" + if self.num_task_outputs_generated == 0: + return None + else: + return self.bytes_task_outputs_generated / self.num_task_outputs_generated + + @metric_property( + description="Byte size of input blocks in the operator's internal input queue.", + metrics_group=MetricsGroup.OBJECT_STORE_MEMORY, + ) + def obj_store_mem_internal_inqueue(self) -> int: + return self._internal_inqueue.estimate_size_bytes() + + @metric_property( + description=( + "Byte size of output blocks in the operator's internal output queue." + ), + metrics_group=MetricsGroup.OBJECT_STORE_MEMORY, + ) + def obj_store_mem_internal_outqueue(self) -> int: + return self._internal_outqueue.estimate_size_bytes() + + @metric_property( + description="Byte size of input blocks used by pending tasks.", + metrics_group=MetricsGroup.OBJECT_STORE_MEMORY, + map_only=True, + ) + def obj_store_mem_pending_task_inputs(self) -> int: + return self._pending_task_inputs.estimate_size_bytes() + + @property + def obj_store_mem_pending_task_outputs(self) -> Optional[float]: + """Estimated size in bytes of output blocks in Ray generator buffers. + + If an estimate isn't available, this property returns ``None``. + """ + per_task_output = self.obj_store_mem_max_pending_output_per_task + if per_task_output is None: + return None + + # Ray Data launches multiple tasks per actor, but only one task runs at a + # time per actor. So, the number of actually running tasks is capped by the + # number of active actors. 
+ from ray.data._internal.execution.operators.actor_pool_map_operator import ( + ActorPoolMapOperator, + ) + + num_tasks_running = self.num_tasks_running + if isinstance(self._op, ActorPoolMapOperator): + num_tasks_running = min( + num_tasks_running, self._op._actor_pool.num_active_actors() + ) + + return num_tasks_running * per_task_output + + @property + def obj_store_mem_max_pending_output_per_task(self) -> Optional[float]: + """Estimated size in bytes of output blocks in a task's generator buffer.""" + context = self._op.data_context + if context._max_num_blocks_in_streaming_gen_buffer is None: + return None + + bytes_per_output = self.average_bytes_per_output + if bytes_per_output is None: + bytes_per_output = context.target_max_block_size + + num_pending_outputs = context._max_num_blocks_in_streaming_gen_buffer + if self.average_num_outputs_per_task is not None: + num_pending_outputs = min( + num_pending_outputs, self.average_num_outputs_per_task + ) + return bytes_per_output * num_pending_outputs + + @metric_property( + description="Average size of task inputs in bytes.", + metrics_group=MetricsGroup.INPUTS, + map_only=True, + ) + def average_bytes_inputs_per_task(self) -> Optional[float]: + """Average size in bytes of ref bundles passed to tasks, or ``None`` if no + tasks have been submitted.""" + if self.num_tasks_submitted == 0: + return None + else: + return self.bytes_inputs_of_submitted_tasks / self.num_tasks_submitted + + @metric_property( + description="Average total output size of task in bytes.", + metrics_group=MetricsGroup.OUTPUTS, + map_only=True, + ) + def average_bytes_outputs_per_task(self) -> Optional[float]: + """Average size in bytes of output blocks per task, + or None if no task has finished.""" + if self.num_tasks_finished == 0: + return None + else: + return self.bytes_outputs_of_finished_tasks / self.num_tasks_finished + + def on_input_received(self, input: RefBundle): + """Callback when the operator receives a new input.""" + 
self.num_inputs_received += 1 + self.bytes_inputs_received += input.size_bytes() + + def on_input_queued(self, input: RefBundle): + """Callback when the operator queues an input.""" + self.obj_store_mem_internal_inqueue_blocks += len(input.blocks) + self._internal_inqueue.add(input) + + def on_input_dequeued(self, input: RefBundle): + """Callback when the operator dequeues an input.""" + self.obj_store_mem_internal_inqueue_blocks -= len(input.blocks) + input_size = input.size_bytes() + self._internal_inqueue.remove(input) + assert self.obj_store_mem_internal_inqueue >= 0, ( + self._op, + self.obj_store_mem_internal_inqueue, + input_size, + ) + + def on_output_queued(self, output: RefBundle): + """Callback when an output is queued by the operator.""" + self.obj_store_mem_internal_outqueue_blocks += len(output.blocks) + self._internal_outqueue.add(output) + + def on_output_dequeued(self, output: RefBundle): + """Callback when an output is dequeued by the operator.""" + self.obj_store_mem_internal_outqueue_blocks -= len(output.blocks) + output_size = output.size_bytes() + self._internal_outqueue.remove(output) + assert self.obj_store_mem_internal_outqueue >= 0, ( + self._op, + self.obj_store_mem_internal_outqueue, + output_size, + ) + + def on_toggle_task_submission_backpressure(self, in_backpressure): + if in_backpressure and self._task_submission_backpressure_start_time == -1: + # backpressure starting, start timer + self._task_submission_backpressure_start_time = time.perf_counter() + elif self._task_submission_backpressure_start_time != -1: + # backpressure stopping, stop timer + self.task_submission_backpressure_time += ( + time.perf_counter() - self._task_submission_backpressure_start_time + ) + self._task_submission_backpressure_start_time = -1 + + def on_output_taken(self, output: RefBundle): + """Callback when an output is taken from the operator.""" + self.num_outputs_taken += 1 + self.bytes_outputs_taken += output.size_bytes() + + def 
on_task_submitted(self, task_index: int, inputs: RefBundle): + """Callback when the operator submits a task.""" + self.num_tasks_submitted += 1 + self.num_tasks_running += 1 + self.bytes_inputs_of_submitted_tasks += inputs.size_bytes() + self._pending_task_inputs.add(inputs) + self._running_tasks[task_index] = RunningTaskInfo(inputs, 0, 0) + + def on_task_output_generated(self, task_index: int, output: RefBundle): + """Callback when a new task generates an output.""" + num_outputs = len(output) + output_bytes = output.size_bytes() + + self.num_task_outputs_generated += num_outputs + self.bytes_task_outputs_generated += output_bytes + + task_info = self._running_tasks[task_index] + if task_info.num_outputs == 0: + self.num_tasks_have_outputs += 1 + task_info.num_outputs += num_outputs + task_info.bytes_outputs += output_bytes + + for block_ref, meta in output.blocks: + assert meta.exec_stats and meta.exec_stats.wall_time_s + self.block_generation_time += meta.exec_stats.wall_time_s + assert meta.num_rows is not None + self.rows_task_outputs_generated += meta.num_rows + trace_allocation(block_ref, "operator_output") + + def on_task_finished(self, task_index: int, exception: Optional[Exception]): + """Callback when a task is finished.""" + self.num_tasks_running -= 1 + self.num_tasks_finished += 1 + if exception is not None: + self.num_tasks_failed += 1 + + task_info = self._running_tasks[task_index] + self.num_outputs_of_finished_tasks += task_info.num_outputs + self.bytes_outputs_of_finished_tasks += task_info.bytes_outputs + + inputs = self._running_tasks[task_index].inputs + self.num_task_inputs_processed += len(inputs) + total_input_size = inputs.size_bytes() + self.bytes_task_inputs_processed += total_input_size + input_size = inputs.size_bytes() + self._pending_task_inputs.remove(inputs) + assert self.obj_store_mem_pending_task_inputs >= 0, ( + self._op, + self.obj_store_mem_pending_task_inputs, + input_size, + ) + + ctx = self._op.data_context + if 
ctx.enable_get_object_locations_for_metrics: + locations = ray.experimental.get_object_locations(inputs.block_refs) + for block, meta in inputs.blocks: + if locations[block].get("did_spill", False): + assert meta.size_bytes is not None + self.obj_store_mem_spilled += meta.size_bytes + + self.obj_store_mem_freed += total_input_size + + inputs.destroy_if_owned() + del self._running_tasks[task_index] diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/interfaces/physical_operator.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/interfaces/physical_operator.py new file mode 100644 index 0000000000000000000000000000000000000000..dff5ac476b5127567a2993928ce1aa73fb2f1d10 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/interfaces/physical_operator.py @@ -0,0 +1,535 @@ +from abc import ABC, abstractmethod +from typing import Any, Callable, Dict, Iterator, List, Optional, Union + +import ray +from .ref_bundle import RefBundle +from ray._raylet import ObjectRefGenerator +from ray.data._internal.execution.autoscaler.autoscaling_actor_pool import ( + AutoscalingActorPool, +) +from ray.data._internal.execution.interfaces.execution_options import ( + ExecutionOptions, + ExecutionResources, +) +from ray.data._internal.execution.interfaces.op_runtime_metrics import OpRuntimeMetrics +from ray.data._internal.logical.interfaces import LogicalOperator, Operator +from ray.data._internal.stats import StatsDict +from ray.data.context import DataContext + +# TODO(hchen): Ray Core should have a common interface for these two types. +Waitable = Union[ray.ObjectRef, ObjectRefGenerator] + + +class OpTask(ABC): + """Abstract class that represents a task that is created by an PhysicalOperator. + + The task can be either a regular task or an actor task. 
+ """ + + def __init__(self, task_index: int): + self._task_index = task_index + + def task_index(self) -> int: + """Return the index of the task.""" + return self._task_index + + @abstractmethod + def get_waitable(self) -> Waitable: + """Return the ObjectRef or ObjectRefGenerator to wait on.""" + pass + + +class DataOpTask(OpTask): + """Represents an OpTask that handles Block data.""" + + def __init__( + self, + task_index: int, + streaming_gen: ObjectRefGenerator, + output_ready_callback: Callable[[RefBundle], None], + task_done_callback: Callable[[Optional[Exception]], None], + ): + """ + Args: + streaming_gen: The streaming generator of this task. It should yield blocks. + output_ready_callback: The callback to call when a new RefBundle is output + from the generator. + task_done_callback: The callback to call when the task is done. + """ + super().__init__(task_index) + # TODO(hchen): Right now, the streaming generator is required to yield a Block + # and a BlockMetadata each time. We should unify task submission with an unified + # interface. So each individual operator don't need to take care of the + # BlockMetadata. + self._streaming_gen = streaming_gen + self._output_ready_callback = output_ready_callback + self._task_done_callback = task_done_callback + + def get_waitable(self) -> ObjectRefGenerator: + return self._streaming_gen + + def on_data_ready(self, max_bytes_to_read: Optional[int]) -> int: + """Callback when data is ready to be read from the streaming generator. + + Args: + max_bytes_to_read: Max bytes of blocks to read. If None, all available + will be read. + Returns: The number of blocks read. + """ + bytes_read = 0 + while max_bytes_to_read is None or bytes_read < max_bytes_to_read: + try: + block_ref = self._streaming_gen._next_sync(0) + if block_ref.is_nil(): + # The generator currently doesn't have new output. + # And it's not stopped yet. 
+ break + except StopIteration: + self._task_done_callback(None) + break + + try: + meta = ray.get(next(self._streaming_gen)) + except StopIteration: + # The generator should always yield 2 values (block and metadata) + # each time. If we get a StopIteration here, it means an error + # happened in the task. + # And in this case, the block_ref is the exception object. + # TODO(hchen): Ray Core should have a better interface for + # detecting and obtaining the exception. + try: + ray.get(block_ref) + assert False, "Above ray.get should raise an exception." + except Exception as ex: + self._task_done_callback(ex) + raise ex from None + self._output_ready_callback( + RefBundle([(block_ref, meta)], owns_blocks=True) + ) + bytes_read += meta.size_bytes + return bytes_read + + +class MetadataOpTask(OpTask): + """Represents an OpTask that only handles metadata, instead of Block data.""" + + def __init__( + self, + task_index: int, + object_ref: ray.ObjectRef, + task_done_callback: Callable[[], None], + ): + """ + Args: + object_ref: The ObjectRef of the task. + task_done_callback: The callback to call when the task is done. + """ + super().__init__(task_index) + self._object_ref = object_ref + self._task_done_callback = task_done_callback + + def get_waitable(self) -> ray.ObjectRef: + return self._object_ref + + def on_task_finished(self): + """Callback when the task is finished.""" + self._task_done_callback() + + +class PhysicalOperator(Operator): + """Abstract class for physical operators. + + An operator transforms one or more input streams of RefBundles into a single + output stream of RefBundles. + + Physical operators are stateful and non-serializable; they live on the driver side + of the Dataset only. 
+ + Here's a simple example of implementing a basic "Map" operator: + + class MapOperator(PhysicalOperator): + def __init__(self): + self.active_tasks = [] + + def add_input(self, refs, _): + self.active_tasks.append(map_task.remote(refs)) + + def has_next(self): + ready, _ = ray.wait(self.active_tasks, timeout=0) + return len(ready) > 0 + + def get_next(self): + ready, remaining = ray.wait(self.active_tasks, num_returns=1) + self.active_tasks = remaining + return ready[0] + + Note that the above operator fully supports both bulk and streaming execution, + since `add_input` and `get_next` can be called in any order. In bulk execution + (now deprecated), all inputs would be added up-front, but in streaming + execution (now the default execution mode) the calls could be interleaved. + """ + + def __init__( + self, + name: str, + input_dependencies: List["PhysicalOperator"], + data_context: DataContext, + target_max_block_size: Optional[int], + ): + super().__init__(name, input_dependencies) + + for x in input_dependencies: + assert isinstance(x, PhysicalOperator), x + self._inputs_complete = not input_dependencies + self._target_max_block_size = target_max_block_size + self._started = False + self._in_task_submission_backpressure = False + self._in_task_output_backpressure = False + self._metrics = OpRuntimeMetrics(self) + self._estimated_num_output_bundles = None + self._estimated_output_num_rows = None + self._execution_completed = False + # The LogicalOperator(s) which were translated to create this PhysicalOperator. + # Set via `PhysicalOperator.set_logical_operators()`. + self._logical_operators: List[LogicalOperator] = [] + self._data_context = data_context + + def __reduce__(self): + raise ValueError("Operator is not serializable.") + + @property + def data_context(self) -> DataContext: + return self._data_context + + # Override the following 3 methods to correct type hints. 
+ + @property + def input_dependencies(self) -> List["PhysicalOperator"]: + return super().input_dependencies # type: ignore + + @property + def output_dependencies(self) -> List["PhysicalOperator"]: + return super().output_dependencies # type: ignore + + def post_order_iter(self) -> Iterator["PhysicalOperator"]: + return super().post_order_iter() # type: ignore + + def set_logical_operators( + self, + *logical_ops: LogicalOperator, + ): + self._logical_operators = list(logical_ops) + + @property + def target_max_block_size(self) -> Optional[int]: + """ + Target max block size output by this operator. If this returns None, + then the default from DataContext should be used. + """ + return self._target_max_block_size + + @property + def actual_target_max_block_size(self) -> int: + """ + The actual target max block size output by this operator. + """ + target_max_block_size = self._target_max_block_size + if target_max_block_size is None: + target_max_block_size = self.data_context.target_max_block_size + return target_max_block_size + + def set_target_max_block_size(self, target_max_block_size: Optional[int]): + self._target_max_block_size = target_max_block_size + + def mark_execution_completed(self): + """Manually mark this operator has completed execution.""" + self._execution_completed = True + + def completed(self) -> bool: + """Return True when this operator is completed. + + An operator is completed the operator has stopped execution and all + outputs are taken. + """ + if not self._execution_completed: + if self._inputs_complete and self.num_active_tasks() == 0: + # If all inputs are complete and there are no active tasks, + # then the operator has completed execution. 
+ self._execution_completed = True + return self._execution_completed and not self.has_next() + + def get_stats(self) -> StatsDict: + """Return recorded execution stats for use with DatasetStats.""" + raise NotImplementedError + + @property + def metrics(self) -> OpRuntimeMetrics: + """Returns the runtime metrics of this operator.""" + self._metrics._extra_metrics = self._extra_metrics() + return self._metrics + + def _extra_metrics(self) -> Dict[str, Any]: + """Subclasses should override this method to report extra metrics + that are specific to them.""" + return {} + + def progress_str(self) -> str: + """Return any extra status to be displayed in the operator progress bar. + + For example, ` actors` to show current number of actors in an actor pool. + """ + return "" + + def num_outputs_total(self) -> Optional[int]: + """Returns the total number of output bundles of this operator, + or ``None`` if unable to provide a reasonable estimate (for example, + if no tasks have finished yet). + + The value returned may be an estimate based off the consumption so far. + This is useful for reporting progress. + + Subclasses should either override this method, or update + ``self._estimated_num_output_bundles`` appropriately. + """ + return self._estimated_num_output_bundles + + def num_output_rows_total(self) -> Optional[int]: + """Returns the total number of output rows of this operator, + or ``None`` if unable to provide a reasonable estimate (for example, + if no tasks have finished yet). + + The value returned may be an estimate based off the consumption so far. + This is useful for reporting progress. + + Subclasses should either override this method, or update + ``self._estimated_output_num_rows`` appropriately. + """ + return self._estimated_output_num_rows + + def start(self, options: ExecutionOptions) -> None: + """Called by the executor when execution starts for an operator. + + Args: + options: The global options used for the overall execution. 
+ """ + self._started = True + + def should_add_input(self) -> bool: + """Return whether it is desirable to add input to this operator right now. + + Operators can customize the implementation of this method to apply additional + backpressure (e.g., waiting for internal actors to be created). + """ + return True + + def add_input(self, refs: RefBundle, input_index: int) -> None: + """Called when an upstream result is available. + + Inputs may be added in any order, and calls to `add_input` may be interleaved + with calls to `get_next` / `has_next` to implement streaming execution. + + Subclasses should override `_add_input_inner` instead of this method. + + Args: + refs: The ref bundle that should be added as input. + input_index: The index identifying the input dependency producing the + input. For most operators, this is always `0` since there is only + one upstream input operator. + """ + self._metrics.on_input_received(refs) + self._add_input_inner(refs, input_index) + + def _add_input_inner(self, refs: RefBundle, input_index: int) -> None: + """Subclasses should override this method to implement `add_input`.""" + raise NotImplementedError + + def input_done(self, input_index: int) -> None: + """Called when the upstream operator at index `input_index` has completed(). + + After this is called, the executor guarantees that no more inputs will be added + via `add_input` for the given input index. + """ + pass + + def all_inputs_done(self) -> None: + """Called when all upstream operators have completed(). + + After this is called, the executor guarantees that no more inputs will be added + via `add_input` for any input index. + """ + self._inputs_complete = True + + def has_next(self) -> bool: + """Returns when a downstream output is available. + + When this returns true, it is safe to call `get_next()`. + """ + raise NotImplementedError + + def get_next(self) -> RefBundle: + """Get the next downstream output. 
+ + It is only allowed to call this if `has_next()` has returned True. + + Subclasses should override `_get_next_inner` instead of this method. + """ + output = self._get_next_inner() + self._metrics.on_output_taken(output) + return output + + def _get_next_inner(self) -> RefBundle: + """Subclasses should override this method to implement `get_next`.""" + raise NotImplementedError + + def get_active_tasks(self) -> List[OpTask]: + """Get a list of the active tasks of this operator. + + Subclasses should return *all* running normal/actor tasks. The + StreamingExecutor will wait on these tasks and trigger callbacks. + """ + return [] + + def num_active_tasks(self) -> int: + """Return the number of active tasks. + + This method is used for 2 purposes: + * Determine if this operator is completed. + * Displaying active task info in the progress bar. + Thus, the return value can be less than `len(get_active_tasks())`, + if some tasks are not needed for the above purposes. E.g., for the + actor pool map operator, readiness checking tasks can be excluded + from `num_active_tasks`, but they should be included in + `get_active_tasks`. + + Subclasses can override this as a performance optimization. + """ + return len(self.get_active_tasks()) + + def throttling_disabled(self) -> bool: + """Whether to disable resource throttling for this operator. + + This should return True for operators that only manipulate bundle metadata + (e.g., the OutputSplitter operator). This hints to the execution engine that + these operators should not be throttled based on resource usage. + """ + return False + + def internal_queue_size(self) -> int: + """If the operator has an internal input queue, return its size. + + This is used to report tasks pending submission to actor pools. + """ + return 0 + + def shutdown(self) -> None: + """Abort execution and release all resources used by this operator. 
+ + This release any Ray resources acquired by this operator such as active + tasks, actors, and objects. + """ + if not self._started: + raise ValueError("Operator must be started before being shutdown.") + + def current_processor_usage(self) -> ExecutionResources: + """Returns the current estimated CPU and GPU usage of this operator, excluding + object store memory. + + This method is called by the executor to decide how to allocate processors + between different operators. + """ + return ExecutionResources(0, 0, 0) + + def running_processor_usage(self) -> ExecutionResources: + """Returns the estimated running CPU and GPU usage of this operator, excluding + object store memory. + + This method is called by the resource manager and the streaming + executor to display the number of currently running CPUs and GPUs in the + progress bar. + + Note, this method returns `current_processor_usage() - + pending_processor_usage()` by default. Subclasses should only override + `pending_processor_usage()` if needed. + """ + usage = self.current_processor_usage() + usage = usage.subtract(self.pending_processor_usage()) + return usage + + def pending_processor_usage(self) -> ExecutionResources: + """Returns the estimated pending CPU and GPU usage of this operator, excluding + object store memory. + + This method is called by the resource manager and the streaming + executor to display the number of currently pending actors in the + progress bar. + """ + return ExecutionResources(0, 0, 0) + + def base_resource_usage(self) -> ExecutionResources: + """Returns the minimum amount of resources required for execution. + + For example, an operator that creates an actor pool requiring 8 GPUs could + return ExecutionResources(gpu=8) as its base usage. + """ + return ExecutionResources() + + def incremental_resource_usage(self) -> ExecutionResources: + """Returns the incremental resources required for processing another input. 
+ + For example, an operator that launches a task per input could return + ExecutionResources(cpu=1) as its incremental usage. + """ + return ExecutionResources() + + def notify_in_task_submission_backpressure(self, in_backpressure: bool) -> None: + """Called periodically from the executor to update internal in backpressure + status for stats collection purposes. + + Args: + in_backpressure: Value this operator's in_backpressure should be set to. + """ + # only update on change to in_backpressure + if self._in_task_submission_backpressure != in_backpressure: + self._metrics.on_toggle_task_submission_backpressure(in_backpressure) + self._in_task_submission_backpressure = in_backpressure + + def get_autoscaling_actor_pools(self) -> List[AutoscalingActorPool]: + """Return a list of `AutoscalingActorPool`s managed by this operator.""" + return [] + + def implements_accurate_memory_accounting(self) -> bool: + """Return whether this operator implements accurate memory accounting. + + An operator that implements accurate memory accounting should should properly + report its memory usage via the following APIs: + - `self._metrics.on_input_queued`. + - `self._metrics.on_input_dequeued`. + - `self._metrics.on_output_queued`. + - `self._metrics.on_output_dequeued`. + """ + # TODO(hchen): Currently we only enable `ReservationOpResourceAllocator` when + # all operators in the dataset have implemented accurate memory accounting. + # Eventually all operators should implement accurate memory accounting. + return False + + def supports_fusion(self) -> bool: + """Returns ```True``` if this operator can be fused with other operators.""" + return False + + def update_resource_usage(self) -> None: + """Updates resource usage of this operator at runtime. + + This method will be called at runtime in each StreamingExecutor iteration. + Subclasses can override it to account for dynamic resource usage updates due to + restarting actors, retrying tasks, lost objects, etc. 
+ """ + pass + + def actor_info_progress_str(self) -> str: + """Returns Actor progress strings for Alive, Restarting and Pending Actors. + + This method will be called in summary_str API in OpState. Subcallses can + override it to return Actor progress strings for Alive, Restarting and Pending + Actors. + """ + return "" diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/interfaces/ref_bundle.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/interfaces/ref_bundle.py new file mode 100644 index 0000000000000000000000000000000000000000..758b22215051e62035cc06dd594d29a32169253a --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/interfaces/ref_bundle.py @@ -0,0 +1,136 @@ +from dataclasses import dataclass +from typing import Iterator, List, Optional, Tuple + +import ray +from .common import NodeIdStr +from ray.data._internal.memory_tracing import trace_deallocation +from ray.data.block import Block, BlockMetadata +from ray.data.context import DataContext +from ray.types import ObjectRef + + +@dataclass +class RefBundle: + """A group of data block references and their metadata. + + Operators take in and produce streams of RefBundles. + + Most commonly a RefBundle consists of a single block object reference. + In some cases, e.g., due to block splitting, or for a reduce task, there may + be more than one block. + + Block bundles have ownership semantics, i.e., shared ownership (similar to C++ + shared_ptr, multiple operators share the same block bundle), or unique ownership + (similar to C++ unique_ptr, only one operator owns the block bundle). This + allows operators to know whether they can destroy blocks when they don't need + them. Destroying blocks eagerly is more efficient than waiting for Python GC / + Ray reference counting to kick in. + """ + + # The size_bytes must be known in the metadata, num_rows is optional. 
+ blocks: Tuple[Tuple[ObjectRef[Block], BlockMetadata]] + + # Whether we own the blocks (can safely destroy them). + owns_blocks: bool + + # This attribute is used by the split() operator to assign bundles to logical + # output splits. It is otherwise None. + output_split_idx: Optional[int] = None + + # Cached location, used for get_cached_location(). + _cached_location: Optional[NodeIdStr] = None + + def __post_init__(self): + if not isinstance(self.blocks, tuple): + object.__setattr__(self, "blocks", tuple(self.blocks)) + for b in self.blocks: + assert isinstance(b, tuple), b + assert len(b) == 2, b + assert isinstance(b[0], ray.ObjectRef), b + assert isinstance(b[1], BlockMetadata), b + if b[1].size_bytes is None: + raise ValueError( + "The size in bytes of the block must be known: {}".format(b) + ) + + def __setattr__(self, key, value): + if hasattr(self, key) and key in ["blocks", "owns_blocks"]: + raise ValueError(f"The `{key}` field of RefBundle cannot be updated.") + object.__setattr__(self, key, value) + + @property + def block_refs(self) -> List[ObjectRef[Block]]: + """List of block references in this bundle.""" + return [block_ref for block_ref, _ in self.blocks] + + @property + def metadata(self) -> List[BlockMetadata]: + """List of block metadata in this bundle.""" + return [metadata for _, metadata in self.blocks] + + def num_rows(self) -> Optional[int]: + """Number of rows present in this bundle, if known.""" + total = 0 + for m in self.metadata: + if m.num_rows is None: + return None + else: + total += m.num_rows + return total + + def size_bytes(self) -> int: + """Size of the blocks of this bundle in bytes.""" + return sum(m.size_bytes for m in self.metadata) + + def destroy_if_owned(self) -> int: + """Clears the object store memory for these blocks if owned. + + Returns: + The number of bytes freed. 
+ """ + should_free = self.owns_blocks and DataContext.get_current().eager_free + for block_ref in self.block_refs: + trace_deallocation( + block_ref, "RefBundle.destroy_if_owned", free=should_free + ) + return self.size_bytes() if should_free else 0 + + def get_cached_location(self) -> Optional[NodeIdStr]: + """Return a location for this bundle's data, if possible. + + Caches the resolved location so multiple calls to this are efficient. + """ + if self._cached_location is None: + # Only consider the first block in the bundle for now. TODO(ekl) consider + # taking into account other blocks. + ref = self.block_refs[0] + # This call is pretty fast for owned objects (~5k/s), so we don't need to + # batch it for now. + locs = ray.experimental.get_object_locations([ref]) + nodes = locs[ref]["node_ids"] + if nodes: + self._cached_location = nodes[0] + else: + self._cached_location = "" + if self._cached_location: + return self._cached_location + else: + return None # Return None if cached location is "". 
+ + def __eq__(self, other) -> bool: + return self is other + + def __hash__(self) -> int: + return id(self) + + def __len__(self) -> int: + return len(self.blocks) + + +def _ref_bundles_iterator_to_block_refs_list( + ref_bundles: Iterator[RefBundle], +) -> List[ObjectRef[Block]]: + """Convert an iterator of RefBundles to a list of Block object references.""" + return [ + block_ref for ref_bundle in ref_bundles for block_ref in ref_bundle.block_refs + ] diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/interfaces/task_context.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/interfaces/task_context.py new file mode 100644 index 0000000000000000000000000000000000000000..094faf2440e01a1f92917f2d4ecb690fbb482257 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/interfaces/task_context.py @@ -0,0 +1,44 @@ +from dataclasses import dataclass, field +from typing import TYPE_CHECKING, Any, Dict, Optional + +from ray.data._internal.progress_bar import ProgressBar + +if TYPE_CHECKING: + from ray.data._internal.execution.operators.map_transformer import MapTransformer + + +@dataclass +class TaskContext: + """This describes the information of a task running block transform.""" + + # The index of task. Each task has a unique task index within the same + # operator. + task_idx: int + + # The dictionary of sub progress bar to update. The key is name of sub progress + # bar. Note this is only used on driver side. + # TODO(chengsu): clean it up from TaskContext with new optimizer framework. + sub_progress_bar_dict: Optional[Dict[str, ProgressBar]] = None + + # NOTE(hchen): `upstream_map_transformer` and `upstream_map_ray_remote_args` + # are only used for `RandomShuffle`. DO NOT use them for other operators. + # Ideally, they should be handled by the optimizer, and should be transparent + # to the specific operators. + # But for `RandomShuffle`, the AllToAllOperator doesn't do the shuffle itself. 
+ # It uses `ExchangeTaskScheduler` to launch new tasks to do the shuffle. + # That's why we need to pass them to `ExchangeTaskScheduler`. + # TODO(hchen): Use a physical operator to do the shuffle directly. + + # The underlying function called in a MapOperator; this is used when fusing + # an AllToAllOperator with an upstream MapOperator. + upstream_map_transformer: Optional["MapTransformer"] = None + + # The Ray remote arguments of the fused upstream MapOperator. + # This should be set if upstream_map_transformer is set. + upstream_map_ray_remote_args: Optional[Dict[str, Any]] = None + + # The target maximum number of bytes to include in the task's output block. + target_max_block_size: Optional[int] = None + + # Additional keyword arguments passed to the task. + kwargs: Dict[str, Any] = field(default_factory=dict) diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/interfaces/transform_fn.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/interfaces/transform_fn.py new file mode 100644 index 0000000000000000000000000000000000000000..6a4e13d8a08cc2cb4db008b8128b60c90716cae6 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/interfaces/transform_fn.py @@ -0,0 +1,10 @@ +from typing import Callable, List, Tuple + +from .ref_bundle import RefBundle +from .task_context import TaskContext +from ray.data._internal.stats import StatsDict + +# Block transform function applied in AllToAllOperator. 
+AllToAllTransformFn = Callable[ + [List[RefBundle], TaskContext], Tuple[List[RefBundle], StatsDict] +] diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/legacy_compat.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/legacy_compat.py new file mode 100644 index 0000000000000000000000000000000000000000..3ef7ca0d08979f8e3ca8735f6557d7afba5142e5 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/legacy_compat.py @@ -0,0 +1,181 @@ +"""This file contains temporary helper functions for legacy plan/executor interaction. + +It should be deleted once we fully move to the new executor backend. +""" + +from typing import Iterator, Optional, Tuple + +from ray.data._internal.block_list import BlockList +from ray.data._internal.execution.interfaces import ( + Executor, + PhysicalOperator, + RefBundle, +) +from ray.data._internal.execution.interfaces.executor import OutputIterator +from ray.data._internal.logical.optimizers import get_execution_plan +from ray.data._internal.logical.util import record_operators_usage +from ray.data._internal.plan import ExecutionPlan +from ray.data._internal.stats import DatasetStats +from ray.data._internal.util import unify_block_metadata_schema +from ray.data.block import BlockMetadata + +# Warn about tasks larger than this. +TASK_SIZE_WARN_THRESHOLD_BYTES = 100000 + + +def execute_to_legacy_bundle_iterator( + executor: Executor, + plan: ExecutionPlan, + dag_rewrite=None, +) -> Iterator[RefBundle]: + """Execute a plan with the new executor and return a bundle iterator. + + Args: + executor: The executor to use. + plan: The legacy plan to execute. + dag_rewrite: Callback that can be used to mutate the DAG prior to execution. + This is currently used as a legacy hack to inject the OutputSplit operator + for `Dataset.streaming_split()`. + + Returns: + The output as a bundle iterator. 
+ """ + dag, stats = _get_execution_dag( + executor, + plan, + preserve_order=False, + ) + if dag_rewrite: + dag = dag_rewrite(dag) + + bundle_iter = executor.execute(dag, initial_stats=stats) + + class CacheMetadataIterator(OutputIterator): + """Wrapper for `bundle_iterator` above. + + For a given iterator which yields output RefBundles, + collect the metadata from each output bundle, and yield the + original RefBundle. Only after the entire iterator is exhausted, + we cache the resulting metadata to the execution plan.""" + + def __init__(self, base_iterator: OutputIterator): + # Note: the base_iterator should be of type StreamIterator, + # defined within `StreamingExecutor.execute()`. It must + # support the `get_next()` method. + self._base_iterator = base_iterator + self._collected_metadata = BlockMetadata( + num_rows=0, + size_bytes=0, + schema=None, + input_files=None, + exec_stats=None, + ) + + def get_next(self, output_split_idx: Optional[int] = None) -> RefBundle: + try: + bundle = self._base_iterator.get_next(output_split_idx) + self._collect_metadata(bundle) + return bundle + except StopIteration: + # Once the iterator is completely exhausted, we are done + # collecting metadata. We can add this cached metadata to the plan. 
+ plan._snapshot_metadata = self._collected_metadata + raise + + def _collect_metadata(self, bundle: RefBundle) -> RefBundle: + """Collect the metadata from each output bundle and accumulate + results, so we can access important information, such as + row count, schema, etc., after iteration completes.""" + self._collected_metadata.num_rows += bundle.num_rows() + self._collected_metadata.size_bytes += bundle.size_bytes() + self._collected_metadata.schema = unify_block_metadata_schema( + [self._collected_metadata, *bundle.metadata] + ) + return bundle + + bundle_iter = CacheMetadataIterator(bundle_iter) + return bundle_iter + + +def execute_to_legacy_block_list( + executor: Executor, + plan: ExecutionPlan, + dataset_uuid: str, + preserve_order: bool, +) -> BlockList: + """Execute a plan with the new executor and translate it into a legacy block list. + + Args: + executor: The executor to use. + plan: The legacy plan to execute. + dataset_uuid: UUID of the dataset for this execution. + preserve_order: Whether to preserve order in execution. + + Returns: + The output as a legacy block list. + """ + dag, stats = _get_execution_dag( + executor, + plan, + preserve_order, + ) + bundles = executor.execute(dag, initial_stats=stats) + block_list = _bundles_to_block_list(bundles) + # Set the stats UUID after execution finishes. + _set_stats_uuid_recursive(executor.get_stats(), dataset_uuid) + return block_list + + +def _get_execution_dag( + executor: Executor, + plan: ExecutionPlan, + preserve_order: bool, +) -> Tuple[PhysicalOperator, DatasetStats]: + """Get the physical operators DAG from a plan.""" + # Record usage of logical operators if available. + if hasattr(plan, "_logical_plan") and plan._logical_plan is not None: + record_operators_usage(plan._logical_plan.dag) + + # Get DAG of physical operators and input statistics. 
+ dag = get_execution_plan(plan._logical_plan).dag + stats = _get_initial_stats_from_plan(plan) + + # Enforce to preserve ordering if the plan has operators + # required to do so, such as Zip and Sort. + if preserve_order or plan.require_preserve_order(): + executor._options.preserve_order = True + + return dag, stats + + +def _get_initial_stats_from_plan(plan: ExecutionPlan) -> DatasetStats: + if plan._snapshot_bundle is not None: + return plan._snapshot_stats + # For Datasets created from "read_xxx", `plan._in_stats` contains useless data. + # For Datasets created from "from_xxx", we need to use `plan._in_stats` as + # the initial stats. Because the `FromXxx` logical operators will be translated to + # "InputDataBuffer" physical operators, which will be ignored when generating + # stats, see `StreamingExecutor._generate_stats`. + # TODO(hchen): Unify the logic by saving the initial stats in `InputDataBuffer + if plan.has_lazy_input(): + return DatasetStats(metadata={}, parent=None) + else: + return plan._in_stats + + +def _bundles_to_block_list(bundles: Iterator[RefBundle]) -> BlockList: + blocks, metadata = [], [] + owns_blocks = True + for ref_bundle in bundles: + if not ref_bundle.owns_blocks: + owns_blocks = False + blocks.extend(ref_bundle.block_refs) + metadata.extend(ref_bundle.metadata) + return BlockList(blocks, metadata, owned_by_consumer=owns_blocks) + + +def _set_stats_uuid_recursive(stats: DatasetStats, dataset_uuid: str) -> None: + if not stats.dataset_uuid: + stats.dataset_uuid = dataset_uuid + for parent in stats.parents or []: + _set_stats_uuid_recursive(parent, dataset_uuid) diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__init__.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git 
a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..558d623705ebbc26715cd78ef3d3f1c4c1714631 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__pycache__/actor_pool_map_operator.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__pycache__/actor_pool_map_operator.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5fffcf8a3ad8fd23c15fcc06594527e62b932b17 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__pycache__/actor_pool_map_operator.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__pycache__/aggregate_num_rows.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__pycache__/aggregate_num_rows.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f7d2c7919976c09ec84ca8ee48f8121222a72189 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__pycache__/aggregate_num_rows.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__pycache__/base_physical_operator.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__pycache__/base_physical_operator.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5d30669a7129fd1d8d3791abe4942dfbb24cffc5 Binary files /dev/null and 
b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__pycache__/base_physical_operator.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__pycache__/input_data_buffer.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__pycache__/input_data_buffer.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..298292165d941b16b07c7621bfa92676fcf0bd73 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__pycache__/input_data_buffer.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__pycache__/limit_operator.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__pycache__/limit_operator.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6a512dcf0a22387c3319206c19c22bb5204fcfa8 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__pycache__/limit_operator.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__pycache__/map_operator.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__pycache__/map_operator.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4dc5236fda229f1bf6423312ba18b99ef4a86f94 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__pycache__/map_operator.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__pycache__/map_transformer.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__pycache__/map_transformer.cpython-311.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..8607ed8ce80000981ab0cf72fba95905717a3141 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__pycache__/map_transformer.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__pycache__/output_splitter.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__pycache__/output_splitter.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8aebec64b074ec15ab5d052d4db8f0b5defbe9e2 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__pycache__/output_splitter.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__pycache__/task_pool_map_operator.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__pycache__/task_pool_map_operator.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..93f5f21b75a0dd48e235f6b152e5060c2af02ac7 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__pycache__/task_pool_map_operator.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__pycache__/union_operator.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__pycache__/union_operator.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..818cc09371bcfd3f36fba4b6e648fcf4fc832827 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__pycache__/union_operator.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__pycache__/zip_operator.cpython-311.pyc 
b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__pycache__/zip_operator.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d37831a9d92bb64bfc4d0f265d60c413237fe46e Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__pycache__/zip_operator.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/actor_pool_map_operator.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/actor_pool_map_operator.py new file mode 100644 index 0000000000000000000000000000000000000000..678ff6c0d5bbd4ddc5004c0f69cc88d3bc318815 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/actor_pool_map_operator.py @@ -0,0 +1,777 @@ +import logging +from dataclasses import dataclass +from typing import Any, Callable, Dict, Iterator, List, Optional, Tuple, Union + +import ray +from ray.actor import ActorHandle +from ray.core.generated import gcs_pb2 +from ray.data._internal.compute import ActorPoolStrategy +from ray.data._internal.execution.autoscaler import AutoscalingActorPool +from ray.data._internal.execution.bundle_queue import create_bundle_queue +from ray.data._internal.execution.interfaces import ( + ExecutionOptions, + ExecutionResources, + NodeIdStr, + PhysicalOperator, + RefBundle, + TaskContext, +) +from ray.data._internal.execution.operators.map_operator import MapOperator, _map_task +from ray.data._internal.execution.operators.map_transformer import MapTransformer +from ray.data._internal.execution.util import locality_string +from ray.data._internal.remote_fn import _add_system_error_to_retry_exceptions +from ray.data.block import Block, BlockMetadata +from ray.data.context import DataContext +from ray.types import ObjectRef + +logger = logging.getLogger(__name__) + +# Higher values here are better for prefetching and locality. 
It's ok for this to be +# fairly high since streaming backpressure prevents us from overloading actors. +DEFAULT_MAX_TASKS_IN_FLIGHT = 4 + + +class ActorPoolMapOperator(MapOperator): + """A MapOperator implementation that executes tasks on an actor pool. + + This class manages the state of a pool of actors used for task execution, as well + as dispatch of tasks to those actors. + + It operates in two modes. In bulk mode, tasks are queued internally and executed + when the operator has free actor slots. In streaming mode, the streaming executor + only adds input when `should_add_input() = True` (i.e., there are free slots). + This allows for better control of backpressure (e.g., suppose we go over memory + limits after adding put, then there isn't any way to "take back" the inputs prior + to actual execution). + """ + + def __init__( + self, + map_transformer: MapTransformer, + input_op: PhysicalOperator, + data_context: DataContext, + target_max_block_size: Optional[int], + compute_strategy: ActorPoolStrategy, + name: str = "ActorPoolMap", + min_rows_per_bundle: Optional[int] = None, + supports_fusion: bool = True, + ray_remote_args_fn: Optional[Callable[[], Dict[str, Any]]] = None, + ray_remote_args: Optional[Dict[str, Any]] = None, + ): + """Create an ActorPoolMapOperator instance. + + Args: + transform_fn: The function to apply to each ref bundle input. + init_fn: The callable class to instantiate on each actor. + input_op: Operator generating input data for this op. + compute_strategy: ComputeStrategy used for this operator. + name: The name of this operator. + target_max_block_size: The target maximum number of bytes to + include in an output block. + min_rows_per_bundle: The number of rows to gather per batch passed to the + transform_fn, or None to use the block size. Setting the batch size is + important for the performance of GPU-accelerated transform functions. + The actual rows passed may be less if the dataset is small. 
+ supports_fusion: Whether this operator supports fusion with other operators. + ray_remote_args_fn: A function that returns a dictionary of remote args + passed to each map worker. The purpose of this argument is to generate + dynamic arguments for each actor/task, and will be called each time + prior to initializing the worker. Args returned from this dict will + always override the args in ``ray_remote_args``. Note: this is an + advanced, experimental feature. + ray_remote_args: Customize the ray remote args for this op's tasks. + See :func:`ray.remote` for details. + """ + super().__init__( + map_transformer, + input_op, + data_context, + name, + target_max_block_size, + min_rows_per_bundle, + supports_fusion, + ray_remote_args_fn, + ray_remote_args, + ) + self._ray_actor_task_remote_args = {} + actor_task_errors = self.data_context.actor_task_retry_on_errors + if actor_task_errors: + self._ray_actor_task_remote_args["retry_exceptions"] = actor_task_errors + _add_system_error_to_retry_exceptions(self._ray_actor_task_remote_args) + data_context = self.data_context + if data_context._max_num_blocks_in_streaming_gen_buffer is not None: + # The `_generator_backpressure_num_objects` parameter should be + # `2 * _max_num_blocks_in_streaming_gen_buffer` because we yield + # 2 objects for each block: the block and the block metadata. + self._ray_actor_task_remote_args["_generator_backpressure_num_objects"] = ( + 2 * data_context._max_num_blocks_in_streaming_gen_buffer + ) + self._min_rows_per_bundle = min_rows_per_bundle + self._ray_remote_args_fn = ray_remote_args_fn + self._ray_remote_args = self._apply_default_remote_args( + self._ray_remote_args, data_context + ) + + per_actor_resource_usage = ExecutionResources( + cpu=self._ray_remote_args.get("num_cpus", 0), + gpu=self._ray_remote_args.get("num_gpus", 0), + ) + self._actor_pool = _ActorPool( + compute_strategy, self._start_actor, per_actor_resource_usage + ) + # A queue of bundles awaiting dispatch to actors. 
+ self._bundle_queue = create_bundle_queue() + # Cached actor class. + self._cls = None + # Whether no more submittable bundles will be added. + self._inputs_done = False + + def internal_queue_size(self) -> int: + return len(self._bundle_queue) + + def start(self, options: ExecutionOptions): + self._actor_locality_enabled = options.actor_locality_enabled + super().start(options) + + # Create the actor workers and add them to the pool. + self._cls = ray.remote(**self._ray_remote_args)(_MapWorker) + self._actor_pool.scale_up(self._actor_pool.min_size()) + refs = self._actor_pool.get_pending_actor_refs() + + # We synchronously wait for the initial number of actors to start. This avoids + # situations where the scheduler is unable to schedule downstream operators + # due to lack of available actors, causing an initial "pileup" of objects on + # upstream operators, leading to a spike in memory usage prior to steady state. + logger.debug(f"{self._name}: Waiting for {len(refs)} pool actors to start...") + try: + timeout = self.data_context.wait_for_min_actors_s + ray.get(refs, timeout=timeout) + except ray.exceptions.GetTimeoutError: + raise ray.exceptions.GetTimeoutError( + "Timed out while starting actors. " + "This may mean that the cluster does not have " + "enough resources for the requested actor pool." + ) + + def should_add_input(self) -> bool: + return self._actor_pool.num_free_slots() > 0 + + def _start_actor(self): + """Start a new actor and add it to the actor pool as a pending actor.""" + assert self._cls is not None + ctx = self.data_context + if self._ray_remote_args_fn: + self._refresh_actor_cls() + actor = self._cls.remote( + ctx, + src_fn_name=self.name, + map_transformer=self._map_transformer, + ) + res_ref = actor.get_location.remote() + + def _task_done_callback(res_ref): + # res_ref is a future for a now-ready actor; move actor from pending to the + # active actor pool. 
+ has_actor = self._actor_pool.pending_to_running(res_ref) + if not has_actor: + # Actor has already been killed. + return + # A new actor has started, we try to dispatch queued tasks. + self._dispatch_tasks() + + self._submit_metadata_task( + res_ref, + lambda: _task_done_callback(res_ref), + ) + return actor, res_ref + + def _add_bundled_input(self, bundle: RefBundle): + self._bundle_queue.add(bundle) + self._metrics.on_input_queued(bundle) + # Try to dispatch all bundles in the queue, including this new bundle. + self._dispatch_tasks() + + def _dispatch_tasks(self): + """Try to dispatch tasks from the bundle buffer to the actor pool. + + This is called when: + * a new input bundle is added, + * a task finishes, + * a new worker has been created. + """ + while self._bundle_queue: + # Pick an actor from the pool. + if self._actor_locality_enabled: + actor = self._actor_pool.pick_actor(self._bundle_queue.peek()) + else: + actor = self._actor_pool.pick_actor() + if actor is None: + # No actors available for executing the next task. + break + # Submit the map task. + bundle = self._bundle_queue.pop() + self._metrics.on_input_dequeued(bundle) + input_blocks = [block for block, _ in bundle.blocks] + ctx = TaskContext( + task_idx=self._next_data_task_idx, + target_max_block_size=self.actual_target_max_block_size, + ) + gen = actor.submit.options( + num_returns="streaming", + name=self.name, + **self._ray_actor_task_remote_args, + ).remote( + self.data_context, + ctx, + *input_blocks, + **self.get_map_task_kwargs(), + ) + + def _task_done_callback(actor_to_return): + # Return the actor that was running the task to the pool. + self._actor_pool.return_actor(actor_to_return) + # Dipsatch more tasks. 
+ self._dispatch_tasks() + + from functools import partial + + self._submit_data_task( + gen, bundle, partial(_task_done_callback, actor_to_return=actor) + ) + + def _refresh_actor_cls(self): + """When `self._ray_remote_args_fn` is specified, this method should + be called prior to initializing the new worker in order to get new + remote args passed to the worker. It updates `self.cls` with the same + `_MapWorker` class, but with the new remote args from + `self._ray_remote_args_fn`.""" + assert self._ray_remote_args_fn, "_ray_remote_args_fn must be provided" + remote_args = self._ray_remote_args.copy() + new_remote_args = self._ray_remote_args_fn() + + # Override args from user-defined remote args function. + new_and_overriden_remote_args = {} + for k, v in new_remote_args.items(): + remote_args[k] = v + new_and_overriden_remote_args[k] = v + self._cls = ray.remote(**remote_args)(_MapWorker) + return new_and_overriden_remote_args + + def all_inputs_done(self): + # Call base implementation to handle any leftover bundles. This may or may not + # trigger task dispatch. + super().all_inputs_done() + + # Mark inputs as done so future task dispatch will kill all inactive workers + # once the bundle queue is exhausted. + self._inputs_done = True + + def shutdown(self): + # We kill all actors in the pool on shutdown, even if they are busy doing work. + self._actor_pool.kill_all_actors() + super().shutdown() + + # Warn if the user specified a batch or block size that prevents full + # parallelization across the actor pool. We only know this information after + # execution has completed. + min_workers = self._actor_pool.min_size() + if len(self._output_metadata) < min_workers: + # The user created a stream that has too few blocks to begin with. + logger.warning( + "To ensure full parallelization across an actor pool of size " + f"{min_workers}, the Dataset should consist of at least " + f"{min_workers} distinct blocks. 
Consider increasing " + "the parallelism when creating the Dataset." + ) + + def progress_str(self) -> str: + if self._actor_locality_enabled: + return locality_string( + self._actor_pool._locality_hits, + self._actor_pool._locality_misses, + ) + return "[locality off]" + + def base_resource_usage(self) -> ExecutionResources: + min_workers = self._actor_pool.min_size() + return ExecutionResources( + cpu=self._ray_remote_args.get("num_cpus", 0) * min_workers, + gpu=self._ray_remote_args.get("num_gpus", 0) * min_workers, + ) + + def current_processor_usage(self) -> ExecutionResources: + # Both pending and running actors count towards our current resource usage. + num_active_workers = self._actor_pool.current_size() + return ExecutionResources( + cpu=self._ray_remote_args.get("num_cpus", 0) * num_active_workers, + gpu=self._ray_remote_args.get("num_gpus", 0) * num_active_workers, + ) + + def pending_processor_usage(self) -> ExecutionResources: + # Both pending and restarting actors count towards pending processor usage + num_pending_workers = ( + self._actor_pool.num_pending_actors() + + self._actor_pool.num_restarting_actors() + ) + return ExecutionResources( + cpu=self._ray_remote_args.get("num_cpus", 0) * num_pending_workers, + gpu=self._ray_remote_args.get("num_gpus", 0) * num_pending_workers, + ) + + def incremental_resource_usage(self) -> ExecutionResources: + # Submitting tasks to existing actors doesn't require additional + # CPU/GPU resources. 
+ return ExecutionResources( + cpu=0, + gpu=0, + object_store_memory=self._metrics.obj_store_mem_max_pending_output_per_task + or 0, + ) + + def _extra_metrics(self) -> Dict[str, Any]: + res = {} + if self._actor_locality_enabled: + res["locality_hits"] = self._actor_pool._locality_hits + res["locality_misses"] = self._actor_pool._locality_misses + res["pending_actors"] = self._actor_pool.num_pending_actors() + res["restarting_actors"] = self._actor_pool.num_restarting_actors() + return res + + @staticmethod + def _apply_default_remote_args( + ray_remote_args: Dict[str, Any], data_context: DataContext + ) -> Dict[str, Any]: + """Apply defaults to the actor creation remote args.""" + ray_remote_args = ray_remote_args.copy() + if "scheduling_strategy" not in ray_remote_args: + ray_remote_args["scheduling_strategy"] = data_context.scheduling_strategy + # Enable actor fault tolerance by default, with infinite actor recreations and + # up to N retries per task. The user can customize this in map_batches via + # extra kwargs (e.g., map_batches(..., max_restarts=0) to disable). + if "max_restarts" not in ray_remote_args: + ray_remote_args["max_restarts"] = -1 + if ( + "max_task_retries" not in ray_remote_args + and ray_remote_args.get("max_restarts") != 0 + ): + ray_remote_args["max_task_retries"] = -1 + return ray_remote_args + + def get_autoscaling_actor_pools(self) -> List[AutoscalingActorPool]: + return [self._actor_pool] + + def update_resource_usage(self) -> None: + """Updates resources usage.""" + for actor in self._actor_pool.get_running_actor_refs(): + actor_state = actor._get_local_state() + if actor_state is None: + # actor._get_local_state can return None if the state is Unknown + continue + elif actor_state != gcs_pb2.ActorTableData.ActorState.ALIVE: + # The actors can be either ALIVE or RESTARTING here because they will + # be restarted indefinitely until execution finishes. 
+ assert actor_state == gcs_pb2.ActorTableData.ActorState.RESTARTING + self._actor_pool.update_running_actor_state(actor, True) + else: + self._actor_pool.update_running_actor_state(actor, False) + + def actor_info_progress_str(self) -> str: + """Returns Actor progress strings for Alive, Restarting and Pending Actors.""" + return self._actor_pool.actor_info_progress_str() + + +class _MapWorker: + """An actor worker for MapOperator.""" + + def __init__( + self, + ctx: DataContext, + src_fn_name: str, + map_transformer: MapTransformer, + ): + DataContext._set_current(ctx) + self.src_fn_name: str = src_fn_name + self._map_transformer = map_transformer + # Initialize state for this actor. + self._map_transformer.init() + + def get_location(self) -> NodeIdStr: + return ray.get_runtime_context().get_node_id() + + def submit( + self, + data_context: DataContext, + ctx: TaskContext, + *blocks: Block, + **kwargs: Dict[str, Any], + ) -> Iterator[Union[Block, List[BlockMetadata]]]: + yield from _map_task( + self._map_transformer, + data_context, + ctx, + *blocks, + **kwargs, + ) + + def __repr__(self): + return f"MapWorker({self.src_fn_name})" + + +@dataclass +class _ActorState: + """Actor state""" + + # Number of tasks in flight per actor + num_tasks_in_flight: int + + # Node id of each ready actor + actor_location: str + + # Is Actor state restarting or alive + is_restarting: bool + + +class _ActorPool(AutoscalingActorPool): + """A pool of actors for map task execution. + + This class is in charge of tracking the number of in-flight tasks per actor, + providing the least heavily loaded actor to the operator, and killing idle + actors when the operator is done submitting work to the pool. 
+ """ + + def __init__( + self, + compute_strategy: ActorPoolStrategy, + create_actor_fn: Callable[[], Tuple[ActorHandle, ObjectRef[Any]]], + per_actor_resource_usage: ExecutionResources, + ): + self._min_size: int = compute_strategy.min_size + self._max_size: int = compute_strategy.max_size + self._max_tasks_in_flight: int = ( + compute_strategy.max_tasks_in_flight_per_actor + or DEFAULT_MAX_TASKS_IN_FLIGHT + ) + self._create_actor_fn = create_actor_fn + self._per_actor_resource_usage = per_actor_resource_usage + assert self._min_size >= 1 + assert self._max_size >= self._min_size + assert self._max_tasks_in_flight >= 1 + assert self._create_actor_fn is not None + + # Actors that have started running, including alive and restarting actors. + self._running_actors: Dict[ray.actor.ActorHandle, _ActorState] = {} + # Actors that are not yet ready (still pending creation). + self._pending_actors: Dict[ObjectRef, ray.actor.ActorHandle] = {} + # Whether actors that become idle should be eagerly killed. This is False until + # the first call to kill_idle_actors(). + self._should_kill_idle_actors = False + # Track locality matching stats. 
+ self._locality_hits: int = 0 + self._locality_misses: int = 0 + + # === Overriding methods of AutoscalingActorPool === + + def min_size(self) -> int: + return self._min_size + + def max_size(self) -> int: + return self._max_size + + def current_size(self) -> int: + return self.num_pending_actors() + self.num_running_actors() + + def num_running_actors(self) -> int: + return len(self._running_actors) + + def num_restarting_actors(self) -> int: + """Restarting actors are all the running actors not in ALIVE state.""" + return sum( + actor_state.is_restarting for actor_state in self._running_actors.values() + ) + + def num_active_actors(self) -> int: + """Active actors are all the running actors with inflight tasks.""" + return sum( + 1 if actor_state.num_tasks_in_flight > 0 else 0 + for actor_state in self._running_actors.values() + ) + + def num_alive_actors(self) -> int: + """Alive actors are all the running actors in ALIVE state.""" + return sum( + not actor_state.is_restarting + for actor_state in self._running_actors.values() + ) + + def num_pending_actors(self) -> int: + return len(self._pending_actors) + + def max_tasks_in_flight_per_actor(self) -> int: + return self._max_tasks_in_flight + + def current_in_flight_tasks(self) -> int: + return sum( + actor_state.num_tasks_in_flight + for actor_state in self._running_actors.values() + ) + + def scale_up(self, num_actors: int) -> int: + for _ in range(num_actors): + actor, ready_ref = self._create_actor_fn() + self.add_pending_actor(actor, ready_ref) + return num_actors + + def scale_down(self, num_actors: int) -> int: + num_killed = 0 + for _ in range(num_actors): + if self.kill_inactive_actor(): + num_killed += 1 + return num_killed + + # === End of overriding methods of AutoscalingActorPool === + + def update_running_actor_state( + self, actor: ray.actor.ActorHandle, is_restarting: bool + ): + """Update running actor state. + + Args: + actor: The running actor that needs state update. 
+ is_restarting: Whether running actor is restarting or alive. + """ + assert actor in self._running_actors + self._running_actors[actor].is_restarting = is_restarting + + def add_pending_actor(self, actor: ray.actor.ActorHandle, ready_ref: ray.ObjectRef): + """Adds a pending actor to the pool. + + This actor won't be pickable until it is marked as running via a + pending_to_running() call. + + Args: + actor: The not-yet-ready actor to add as pending to the pool. + ready_ref: The ready future for the actor. + """ + # The caller shouldn't add new actors to the pool after invoking + # kill_inactive_actors(). + assert not self._should_kill_idle_actors + self._pending_actors[ready_ref] = actor + + def pending_to_running(self, ready_ref: ray.ObjectRef) -> bool: + """Mark the actor corresponding to the provided ready future as running, making + the actor pickable. + + Args: + ready_ref: The ready future for the actor that we wish to mark as running. + + Returns: + Whether the actor was still pending. This can return False if the actor had + already been killed. + """ + if ready_ref not in self._pending_actors: + # The actor has been removed from the pool before becoming running. + return False + actor = self._pending_actors.pop(ready_ref) + self._running_actors[actor] = _ActorState( + num_tasks_in_flight=0, + actor_location=ray.get(ready_ref), + is_restarting=False, + ) + return True + + def pick_actor( + self, locality_hint: Optional[RefBundle] = None + ) -> Optional[ray.actor.ActorHandle]: + """Picks an actor for task submission based on busyness and locality. + + None will be returned if all actors are either at capacity (according to + max_tasks_in_flight) or are still pending. + + Args: + locality_hint: Try to pick an actor that is local for this bundle. + """ + if not self._running_actors: + # Actor pool is empty or all actors are still pending. 
+ return None + + if locality_hint: + preferred_loc = self._get_location(locality_hint) + else: + preferred_loc = None + + # Filter out actors that are invalid, i.e. actors with number of tasks in + # flight >= _max_tasks_in_flight or actor_state is not ALIVE. + valid_actors = [ + actor + for actor in self._running_actors + if self._running_actors[actor].num_tasks_in_flight + < self._max_tasks_in_flight + and not self._running_actors[actor].is_restarting + ] + + if not valid_actors: + # All actors are at capacity or actor state is not ALIVE. + return None + + def penalty_key(actor): + """Returns the key that should be minimized for the best actor. + + We prioritize actors with argument locality, and those that are not busy, + in that order. + """ + busyness = self._running_actors[actor].num_tasks_in_flight + requires_remote_fetch = ( + self._running_actors[actor].actor_location != preferred_loc + ) + return requires_remote_fetch, busyness + + # Pick the best valid actor based on the penalty key + actor = min(valid_actors, key=penalty_key) + + if locality_hint: + if self._running_actors[actor].actor_location == preferred_loc: + self._locality_hits += 1 + else: + self._locality_misses += 1 + self._running_actors[actor].num_tasks_in_flight += 1 + return actor + + def return_actor(self, actor: ray.actor.ActorHandle): + """Returns the provided actor to the pool.""" + assert actor in self._running_actors + assert self._running_actors[actor].num_tasks_in_flight > 0 + self._running_actors[actor].num_tasks_in_flight -= 1 + if ( + self._should_kill_idle_actors + and self._running_actors[actor].num_tasks_in_flight == 0 + ): + self._remove_actor(actor) + + def get_pending_actor_refs(self) -> List[ray.ObjectRef]: + return list(self._pending_actors.keys()) + + def get_running_actor_refs(self) -> List[ray.ObjectRef]: + return list(self._running_actors.keys()) + + def num_idle_actors(self) -> int: + """Return the number of idle actors in the pool.""" + return sum( + 1 if 
running_actor.num_tasks_in_flight == 0 else 0 + for running_actor in self._running_actors.values() + ) + + def num_free_slots(self) -> int: + """Return the number of free slots for task execution.""" + if not self._running_actors: + return 0 + return sum( + max(0, self._max_tasks_in_flight - running_actor.num_tasks_in_flight) + for running_actor in self._running_actors.values() + ) + + def kill_inactive_actor(self) -> bool: + """Kills a single pending or idle actor, if any actors are pending/idle. + + Returns whether an inactive actor was actually killed. + """ + # We prioritize killing pending actors over idle actors to reduce actor starting + # churn. + killed = self._maybe_kill_pending_actor() + if not killed: + # If no pending actor was killed, so kill actor. + killed = self._maybe_kill_idle_actor() + return killed + + def _maybe_kill_pending_actor(self) -> bool: + if self._pending_actors: + # At least one pending actor, so kill first one. + ready_ref = next(iter(self._pending_actors.keys())) + self._remove_actor(self._pending_actors[ready_ref]) + del self._pending_actors[ready_ref] + return True + # No pending actors, so indicate to the caller that no actors were killed. + return False + + def _maybe_kill_idle_actor(self) -> bool: + for actor, running_actor in self._running_actors.items(): + if running_actor.num_tasks_in_flight == 0: + # At least one idle actor, so kill first one found. + self._remove_actor(actor) + return True + # No idle actors, so indicate to the caller that no actors were killed. + return False + + def kill_all_inactive_actors(self): + """Kills all currently inactive actors and ensures that all actors that become + idle in the future will be eagerly killed. + + This is called once the operator is done submitting work to the pool, and this + function is idempotent. Adding new pending actors after calling this function + will raise an error. 
+ """ + self._kill_all_pending_actors() + self._kill_all_idle_actors() + + def kill_all_actors(self): + """Kills all actors, including running/active actors. + + This is called once the operator is shutting down. + """ + self._kill_all_pending_actors() + self._kill_all_running_actors() + + def _kill_all_pending_actors(self): + for _, actor in self._pending_actors.items(): + self._remove_actor(actor) + self._pending_actors.clear() + + def _kill_all_idle_actors(self): + idle_actors = [ + actor + for actor, running_actor in self._running_actors.items() + if running_actor.num_tasks_in_flight == 0 + ] + for actor in idle_actors: + self._remove_actor(actor) + self._should_kill_idle_actors = True + + def _kill_all_running_actors(self): + actors = list(self._running_actors.keys()) + for actor in actors: + self._remove_actor(actor) + + def _remove_actor(self, actor: ray.actor.ActorHandle): + """Remove the given actor from the pool.""" + # NOTE: we remove references to the actor and let ref counting + # garbage collect the actor, instead of using ray.kill. + # Because otherwise the actor cannot be restarted upon lineage reconstruction. + if actor in self._running_actors: + del self._running_actors[actor] + + def _get_location(self, bundle: RefBundle) -> Optional[NodeIdStr]: + """Ask Ray for the node id of the given bundle. + + This method may be overriden for testing. + + Returns: + A node id associated with the bundle, or None if unknown. 
+ """ + return bundle.get_cached_location() + + def actor_info_progress_str(self) -> str: + """Returns Actor progress strings for Alive, Restarting and Pending Actors.""" + alive = self.num_alive_actors() + pending = self.num_pending_actors() + restarting = self.num_restarting_actors() + total = alive + pending + restarting + if total == alive: + return f"; Actors: {total}" + else: + return ( + f"; Actors: {total} (alive {alive}, restarting {restarting}, " + f"pending {pending})" + ) + + def per_actor_resource_usage(self) -> ExecutionResources: + """Per actor resource usage.""" + return self._per_actor_resource_usage diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/aggregate_num_rows.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/aggregate_num_rows.py new file mode 100644 index 0000000000000000000000000000000000000000..59b64d1a40208b0a66de8d8af762703a113f5ebc --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/aggregate_num_rows.py @@ -0,0 +1,62 @@ +import ray +from ray.data._internal.delegating_block_builder import DelegatingBlockBuilder +from ray.data._internal.execution.interfaces import PhysicalOperator, RefBundle +from ray.data._internal.stats import StatsDict +from ray.data.block import BlockAccessor +from ray.data.context import DataContext + + +class AggregateNumRows(PhysicalOperator): + """Count number of rows in input bundles. + + This operator aggregates the number of rows in input bundles using the bundles' + block metadata. It outputs a single row with the specified column name. 
+ """ + + def __init__( + self, + input_dependencies, + data_context: DataContext, + column_name: str, + ): + super().__init__( + "AggregateNumRows", + input_dependencies, + data_context, + target_max_block_size=None, + ) + + self._column_name = column_name + + self._num_rows = 0 + self._has_outputted = False + + def has_next(self) -> bool: + return self._inputs_complete and not self._has_outputted + + def _get_next_inner(self) -> RefBundle: + assert self._inputs_complete + + builder = DelegatingBlockBuilder() + builder.add({self._column_name: self._num_rows}) + block = builder.build() + block_ref = ray.put(block) + + metadata = BlockAccessor.for_block(block).get_metadata() + bundle = RefBundle([(block_ref, metadata)], owns_blocks=True) + + self._has_outputted = True + return bundle + + def get_stats(self) -> StatsDict: + return {} + + def _add_input_inner(self, refs, input_index) -> None: + assert refs.num_rows() is not None + self._num_rows += refs.num_rows() + + def throttling_disabled(self) -> bool: + return True + + def implements_accurate_memory_accounting(self) -> bool: + return True diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/base_physical_operator.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/base_physical_operator.py new file mode 100644 index 0000000000000000000000000000000000000000..1ad9b069e87be3fe2acd9b47cbf245dd9f44db0d --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/base_physical_operator.py @@ -0,0 +1,176 @@ +from typing import List, Optional + +from ray.data._internal.execution.interfaces import ( + AllToAllTransformFn, + PhysicalOperator, + RefBundle, + TaskContext, +) +from ray.data._internal.logical.interfaces import LogicalOperator +from ray.data._internal.progress_bar import ProgressBar +from ray.data._internal.stats import StatsDict +from ray.data.context import DataContext + + +class OneToOneOperator(PhysicalOperator): + 
"""An operator that has one input and one output dependency. + + This operator serves as the base for map, filter, limit, etc. + """ + + def __init__( + self, + name: str, + input_op: PhysicalOperator, + data_context: DataContext, + target_max_block_size: Optional[int], + ): + """Create a OneToOneOperator. + Args: + input_op: Operator generating input data for this op. + name: The name of this operator. + target_max_block_size: The target maximum number of bytes to + include in an output block. + """ + super().__init__(name, [input_op], data_context, target_max_block_size) + + @property + def input_dependency(self) -> PhysicalOperator: + return self.input_dependencies[0] + + +class AllToAllOperator(PhysicalOperator): + """A blocking operator that executes once its inputs are complete. + + This operator implements distributed sort / shuffle operations, etc. + """ + + def __init__( + self, + bulk_fn: AllToAllTransformFn, + input_op: PhysicalOperator, + data_context: DataContext, + target_max_block_size: Optional[int], + num_outputs: Optional[int] = None, + sub_progress_bar_names: Optional[List[str]] = None, + name: str = "AllToAll", + ): + """Create an AllToAllOperator. + Args: + bulk_fn: The blocking transformation function to run. The inputs are the + list of input ref bundles, and the outputs are the output ref bundles + and a stats dict. + input_op: Operator generating input data for this op. + num_outputs: The number of expected output bundles for progress bar. + sub_progress_bar_names: The names of internal sub progress bars. + name: The name of this operator. 
+ """ + self._bulk_fn = bulk_fn + self._next_task_index = 0 + self._num_outputs = num_outputs + self._output_rows = 0 + self._sub_progress_bar_names = sub_progress_bar_names + self._sub_progress_bar_dict = None + self._input_buffer: List[RefBundle] = [] + self._output_buffer: List[RefBundle] = [] + self._stats: StatsDict = {} + super().__init__(name, [input_op], data_context, target_max_block_size) + + def num_outputs_total(self) -> Optional[int]: + return ( + self._num_outputs + if self._num_outputs + else self.input_dependencies[0].num_outputs_total() + ) + + def num_output_rows_total(self) -> Optional[int]: + return ( + self._output_rows + if self._output_rows + else self.input_dependencies[0].num_output_rows_total() + ) + + def _add_input_inner(self, refs: RefBundle, input_index: int) -> None: + assert not self.completed() + assert input_index == 0, input_index + self._input_buffer.append(refs) + + def all_inputs_done(self) -> None: + ctx = TaskContext( + task_idx=self._next_task_index, + sub_progress_bar_dict=self._sub_progress_bar_dict, + target_max_block_size=self.actual_target_max_block_size, + ) + self._output_buffer, self._stats = self._bulk_fn(self._input_buffer, ctx) + self._next_task_index += 1 + self._input_buffer.clear() + super().all_inputs_done() + + def has_next(self) -> bool: + return len(self._output_buffer) > 0 + + def _get_next_inner(self) -> RefBundle: + bundle = self._output_buffer.pop(0) + self._output_rows += bundle.num_rows() + return bundle + + def get_stats(self) -> StatsDict: + return self._stats + + def get_transformation_fn(self) -> AllToAllTransformFn: + return self._bulk_fn + + def progress_str(self) -> str: + return f"{self.num_output_rows_total() or 0} rows output" + + def initialize_sub_progress_bars(self, position: int) -> int: + """Initialize all internal sub progress bars, and return the number of bars.""" + if self._sub_progress_bar_names is not None: + self._sub_progress_bar_dict = {} + for name in 
self._sub_progress_bar_names: + bar = ProgressBar( + name, + self.num_output_rows_total() or 1, + unit="row", + position=position, + ) + # NOTE: call `set_description` to trigger the initial print of progress + # bar on console. + bar.set_description(f" *- {name}") + self._sub_progress_bar_dict[name] = bar + position += 1 + return len(self._sub_progress_bar_dict) + else: + return 0 + + def close_sub_progress_bars(self): + """Close all internal sub progress bars.""" + if self._sub_progress_bar_dict is not None: + for sub_bar in self._sub_progress_bar_dict.values(): + sub_bar.close() + + def supports_fusion(self): + return True + + +class NAryOperator(PhysicalOperator): + """An operator that has multiple input dependencies and one output. + + This operator serves as the base for union, zip, etc. + """ + + def __init__( + self, + data_context: DataContext, + *input_ops: LogicalOperator, + ): + """Create a OneToOneOperator. + Args: + input_op: Operator generating input data for this op. + name: The name of this operator. 
+ """ + input_names = ", ".join([op._name for op in input_ops]) + op_name = f"{self.__class__.__name__}({input_names})" + super().__init__( + op_name, list(input_ops), data_context, target_max_block_size=None + ) diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/input_data_buffer.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/input_data_buffer.py new file mode 100644 index 0000000000000000000000000000000000000000..195711e35cf1abbc720c8ca8923452a3c86d8a1d --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/input_data_buffer.py @@ -0,0 +1,89 @@ +from typing import Callable, List, Optional + +from ray.data._internal.execution.interfaces import ( + ExecutionOptions, + PhysicalOperator, + RefBundle, +) +from ray.data._internal.stats import StatsDict +from ray.data.context import DataContext + + +class InputDataBuffer(PhysicalOperator): + """Defines the input data for the operator DAG. + + For example, this may hold cached blocks from a previous Dataset execution, or + the arguments for read tasks. + """ + + def __init__( + self, + data_context: DataContext, + input_data: Optional[List[RefBundle]] = None, + input_data_factory: Optional[Callable[[int], List[RefBundle]]] = None, + num_output_blocks: Optional[int] = None, + ): + """Create an InputDataBuffer. + + Args: + input_data: The list of bundles to output from this operator. + input_data_factory: The factory to get input data, if input_data is None. + num_output_blocks: The number of output blocks. If not specified, progress + bars total will be set based on num output bundles instead. + """ + super().__init__("Input", [], data_context, target_max_block_size=None) + if input_data is not None: + assert input_data_factory is None + # Copy the input data to avoid mutating the original list. 
+ self._input_data = input_data[:] + self._is_input_initialized = True + self._initialize_metadata() + else: + # Initialize input lazily when execution is started. + assert input_data_factory is not None + self._input_data_factory = input_data_factory + self._is_input_initialized = False + self._input_data_index = 0 + + def start(self, options: ExecutionOptions) -> None: + if not self._is_input_initialized: + self._input_data = self._input_data_factory( + self.actual_target_max_block_size + ) + self._is_input_initialized = True + self._initialize_metadata() + # InputDataBuffer does not take inputs from other operators, + # so we record input metrics here + for bundle in self._input_data: + self._metrics.on_input_received(bundle) + super().start(options) + + def has_next(self) -> bool: + return self._input_data_index < len(self._input_data) + + def _get_next_inner(self) -> RefBundle: + # We can't pop the input data. If we do, Ray might garbage collect the block + # references, and Ray won't be able to reconstruct downstream objects. 
+ bundle = self._input_data[self._input_data_index] + self._input_data_index += 1 + return bundle + + def get_stats(self) -> StatsDict: + return {} + + def _add_input_inner(self, refs, input_index) -> None: + raise ValueError("Inputs are not allowed for this operator.") + + def _initialize_metadata(self): + assert self._input_data is not None and self._is_input_initialized + self._estimated_num_output_bundles = len(self._input_data) + + block_metadata = [] + for bundle in self._input_data: + block_metadata.extend(bundle.metadata) + self._stats = { + "input": block_metadata, + } + + def implements_accurate_memory_accounting(self) -> bool: + return True diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/limit_operator.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/limit_operator.py new file mode 100644 index 0000000000000000000000000000000000000000..47e93ae3f22c40966507490eeaaedddd466ec2b8 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/limit_operator.py @@ -0,0 +1,133 @@ +import copy +from collections import deque +from typing import Deque, List, Optional, Tuple + +import ray +from ray.data._internal.execution.interfaces import PhysicalOperator, RefBundle +from ray.data._internal.execution.operators.base_physical_operator import ( + OneToOneOperator, +) +from ray.data._internal.remote_fn import cached_remote_fn +from ray.data._internal.stats import StatsDict +from ray.data.block import Block, BlockAccessor, BlockMetadata +from ray.data.context import DataContext +from ray.types import ObjectRef + + +class LimitOperator(OneToOneOperator): + """Physical operator for limit.""" + + def __init__( + self, + limit: int, + input_op: PhysicalOperator, + data_context: DataContext, + ): + self._limit = limit + self._consumed_rows = 0 + self._buffer: Deque[RefBundle] = deque() + self._name = f"limit={limit}" + self._output_metadata: List[BlockMetadata] = [] + 
self._cur_output_bundles = 0 + super().__init__(self._name, input_op, data_context, target_max_block_size=None) + if self._limit <= 0: + self.mark_execution_completed() + + def _limit_reached(self) -> bool: + return self._consumed_rows >= self._limit + + def _add_input_inner(self, refs: RefBundle, input_index: int) -> None: + assert not self.completed() + assert input_index == 0, input_index + if self._limit_reached(): + return + out_blocks: List[ObjectRef[Block]] = [] + out_metadata: List[BlockMetadata] = [] + for block, metadata in refs.blocks: + num_rows = metadata.num_rows + assert num_rows is not None + if self._consumed_rows + num_rows <= self._limit: + out_blocks.append(block) + out_metadata.append(metadata) + self._output_metadata.append(metadata) + self._consumed_rows += num_rows + else: + # Slice the last block. + def slice_fn(block, metadata, num_rows) -> Tuple[Block, BlockMetadata]: + block = BlockAccessor.for_block(block).slice(0, num_rows, copy=True) + metadata = copy.deepcopy(metadata) + metadata.num_rows = num_rows + metadata.size_bytes = BlockAccessor.for_block(block).size_bytes() + return block, metadata + + block, metadata_ref = cached_remote_fn( + slice_fn, num_cpus=0, num_returns=2 + ).remote( + block, + metadata, + self._limit - self._consumed_rows, + ) + out_blocks.append(block) + metadata = ray.get(metadata_ref) + out_metadata.append(metadata) + self._output_metadata.append(metadata) + self._consumed_rows = self._limit + break + self._cur_output_bundles += 1 + out_refs = RefBundle( + list(zip(out_blocks, out_metadata)), + owns_blocks=refs.owns_blocks, + ) + self._buffer.append(out_refs) + self._metrics.on_output_queued(out_refs) + if self._limit_reached(): + self.mark_execution_completed() + + # We cannot estimate if we have only consumed empty blocks, + # or if the input dependency's total number of output bundles is unknown. 
+ num_inputs = self.input_dependencies[0].num_outputs_total() + if self._consumed_rows > 0 and num_inputs is not None: + # Estimate number of output bundles + # Check the case where _limit > # of input rows + estimated_total_output_rows = min( + self._limit, self._consumed_rows / self._cur_output_bundles * num_inputs + ) + # _consumed_rows / _limit is roughly equal to + # _cur_output_bundles / total output blocks + self._estimated_num_output_bundles = round( + estimated_total_output_rows + / self._consumed_rows + * self._cur_output_bundles + ) + + def has_next(self) -> bool: + return len(self._buffer) > 0 + + def _get_next_inner(self) -> RefBundle: + output = self._buffer.popleft() + self._metrics.on_output_dequeued(output) + return output + + def get_stats(self) -> StatsDict: + return {self._name: self._output_metadata} + + def num_outputs_total(self) -> Optional[int]: + # Before execution is completed, we don't know how many output + # bundles we will have. We estimate based off the consumption so far. 
+ if self._execution_completed: + return self._cur_output_bundles + return self._estimated_num_output_bundles + + def num_output_rows_total(self) -> Optional[int]: + # The total number of rows is simply the limit or the number + # of input rows, whichever is smaller + input_num_rows = self.input_dependencies[0].num_output_rows_total() + if input_num_rows is None: + return None + return min(self._limit, input_num_rows) + + def throttling_disabled(self) -> bool: + return True + + def implements_accurate_memory_accounting(self) -> bool: + return True diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/map_operator.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/map_operator.py new file mode 100644 index 0000000000000000000000000000000000000000..49169ca750ca7db9e84952b283fe738b1a53e8e1 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/map_operator.py @@ -0,0 +1,711 @@ +import copy +import functools +import itertools +import logging +from abc import ABC, abstractmethod +from collections import defaultdict, deque +from typing import ( + Any, + Callable, + Deque, + Dict, + Iterator, + List, + Optional, + Set, + Tuple, + Union, +) + +import ray +from ray import ObjectRef +from ray._raylet import ObjectRefGenerator +from ray.data._internal.compute import ( + ActorPoolStrategy, + ComputeStrategy, + TaskPoolStrategy, +) +from ray.data._internal.execution.interfaces import ( + ExecutionOptions, + ExecutionResources, + PhysicalOperator, + RefBundle, + TaskContext, +) +from ray.data._internal.execution.interfaces.physical_operator import ( + DataOpTask, + MetadataOpTask, + OpTask, +) +from ray.data._internal.execution.operators.base_physical_operator import ( + OneToOneOperator, +) +from ray.data._internal.execution.operators.map_transformer import ( + ApplyAdditionalSplitToOutputBlocks, + MapTransformer, +) +from ray.data._internal.stats import StatsDict +from 
ray.data.block import Block, BlockAccessor, BlockExecStats, BlockMetadata +from ray.data.context import DataContext +from ray.util.scheduling_strategies import NodeAffinitySchedulingStrategy + +logger = logging.getLogger(__name__) + + +class MapOperator(OneToOneOperator, ABC): + """A streaming operator that maps input bundles 1:1 to output bundles. + + This operator implements the distributed map operation, supporting both task + and actor compute strategies. + """ + + def __init__( + self, + map_transformer: MapTransformer, + input_op: PhysicalOperator, + data_context: DataContext, + name: str, + target_max_block_size: Optional[int], + min_rows_per_bundle: Optional[int], + supports_fusion: bool, + ray_remote_args_fn: Optional[Callable[[], Dict[str, Any]]], + ray_remote_args: Optional[Dict[str, Any]], + ): + # NOTE: This constructor should not be called directly; use MapOperator.create() + # instead. + # NOTE: This constructor must be called by subclasses. + + self._map_transformer = map_transformer + self._supports_fusion = supports_fusion + self._ray_remote_args = _canonicalize_ray_remote_args(ray_remote_args or {}) + self._ray_remote_args_fn = ray_remote_args_fn + self._ray_remote_args_factory_actor_locality = None + self._remote_args_for_metrics = copy.deepcopy(self._ray_remote_args) + + # Bundles block references up to the min_rows_per_bundle target. + self._block_ref_bundler = _BlockRefBundler(min_rows_per_bundle) + + # Queue for task outputs, either ordered or unordered (this is set by start()). + self._output_queue: _OutputQueue = None + # Output metadata, added to on get_next(). + self._output_metadata: List[BlockMetadata] = [] + # All active `DataOpTask`s. + self._data_tasks: Dict[int, DataOpTask] = {} + self._next_data_task_idx = 0 + # All active `MetadataOpTask`s. + self._metadata_tasks: Dict[int, MetadataOpTask] = {} + self._next_metadata_task_idx = 0 + # Keep track of all finished streaming generators. 
+ super().__init__(name, input_op, data_context, target_max_block_size) + + # If set, then all output blocks will be split into + # this many sub-blocks. This is to avoid having + # too-large blocks, which may reduce parallelism for + # the subsequent operator. + self._additional_split_factor = None + # Callback functions that generate additional task kwargs + # for the map task. + self._map_task_kwargs_fns: List[Callable[[], Dict[str, Any]]] = [] + + def add_map_task_kwargs_fn(self, map_task_kwargs_fn: Callable[[], Dict[str, Any]]): + """Add a callback function that generates additional kwargs for the map tasks. + In the map tasks, the kwargs can be accessible via `TaskContext.kwargs`. + """ + self._map_task_kwargs_fns.append(map_task_kwargs_fn) + + def get_map_task_kwargs(self) -> Dict[str, Any]: + """Get the kwargs for the map task. + Subclasses should pass the returned kwargs to the map tasks. + In the map tasks, the kwargs can be accessible via `TaskContext.kwargs`. + """ + kwargs = {} + for fn in self._map_task_kwargs_fns: + kwargs.update(fn()) + return kwargs + + def get_additional_split_factor(self) -> int: + if self._additional_split_factor is None: + return 1 + return self._additional_split_factor + + def set_additional_split_factor(self, k: int): + self._additional_split_factor = k + + @property + def name(self) -> str: + name = super().name + if self._additional_split_factor is not None: + name += f"->SplitBlocks({self._additional_split_factor})" + return name + + @classmethod + def create( + cls, + map_transformer: MapTransformer, + input_op: PhysicalOperator, + data_context: DataContext, + target_max_block_size: Optional[int] = None, + name: str = "Map", + # TODO(ekl): slim down ComputeStrategy to only specify the compute + # config and not contain implementation code. 
+ compute_strategy: Optional[ComputeStrategy] = None, + min_rows_per_bundle: Optional[int] = None, + supports_fusion: bool = True, + ray_remote_args_fn: Optional[Callable[[], Dict[str, Any]]] = None, + ray_remote_args: Optional[Dict[str, Any]] = None, + ) -> "MapOperator": + """Create a MapOperator. + + This factory creates the MapOperator pool implementation that corresponds to the + compute argument: + - If None or TaskPoolStrategy -> TaskPoolMapOperator + - If ActorPoolStrategy -> ActorPoolMapOperator + + Args: + transform_fn: The function to apply to each ref bundle input. + input_op: Operator generating input data for this op. + init_fn: The callable class to instantiate if using ActorPoolMapOperator. + name: The name of this operator. + compute_strategy: Customize the compute strategy for this op. + target_max_block_size: The target maximum number of bytes to + include in an output block. + min_rows_per_bundle: The number of rows to gather per batch passed to the + transform_fn, or None to use the block size. Setting the batch size is + important for the performance of GPU-accelerated transform functions. + The actual rows passed may be less if the dataset is small. + supports_fusion: Whether this operator supports fusion with other operators. + ray_remote_args_fn: A function that returns a dictionary of remote args + passed to each map worker. The purpose of this argument is to generate + dynamic arguments for each actor/task, and will be called each time + prior to initializing the worker. Args returned from this dict will + always override the args in ``ray_remote_args``. Note: this is an + advanced, experimental feature. + ray_remote_args: Customize the :func:`ray.remote` args for this op's tasks. 
+ """ + if compute_strategy is None: + compute_strategy = TaskPoolStrategy() + + if isinstance(compute_strategy, TaskPoolStrategy): + from ray.data._internal.execution.operators.task_pool_map_operator import ( + TaskPoolMapOperator, + ) + + return TaskPoolMapOperator( + map_transformer, + input_op, + data_context, + name=name, + target_max_block_size=target_max_block_size, + min_rows_per_bundle=min_rows_per_bundle, + concurrency=compute_strategy.size, + supports_fusion=supports_fusion, + ray_remote_args_fn=ray_remote_args_fn, + ray_remote_args=ray_remote_args, + ) + elif isinstance(compute_strategy, ActorPoolStrategy): + from ray.data._internal.execution.operators.actor_pool_map_operator import ( + ActorPoolMapOperator, + ) + + return ActorPoolMapOperator( + map_transformer, + input_op, + data_context, + target_max_block_size=target_max_block_size, + compute_strategy=compute_strategy, + name=name, + min_rows_per_bundle=min_rows_per_bundle, + supports_fusion=supports_fusion, + ray_remote_args_fn=ray_remote_args_fn, + ray_remote_args=ray_remote_args, + ) + else: + raise ValueError(f"Unsupported execution strategy {compute_strategy}") + + def start(self, options: "ExecutionOptions"): + super().start(options) + # Create output queue with desired ordering semantics. 
+ if options.preserve_order: + self._output_queue = _OrderedOutputQueue() + else: + self._output_queue = _UnorderedOutputQueue() + + if options.locality_with_output: + if isinstance(options.locality_with_output, list): + locs = options.locality_with_output + else: + locs = [ray.get_runtime_context().get_node_id()] + + class RoundRobinAssign: + def __init__(self, locs): + self.locs = locs + self.i = 0 + + def __call__(self, args): + args = copy.deepcopy(args) + args["scheduling_strategy"] = NodeAffinitySchedulingStrategy( + self.locs[self.i], + soft=True, + _spill_on_unavailable=True, + ) + self.i += 1 + self.i %= len(self.locs) + return args + + self._ray_remote_args_factory_actor_locality = RoundRobinAssign(locs) + + map_transformer = self._map_transformer + # Apply additional block split if needed. + if self.get_additional_split_factor() > 1: + split_transformer = MapTransformer( + [ApplyAdditionalSplitToOutputBlocks(self.get_additional_split_factor())] + ) + map_transformer = map_transformer.fuse(split_transformer) + # Put the function def in the object store to avoid repeated serialization + # in case it's large (i.e., closure captures large objects). + self._map_transformer_ref = ray.put(map_transformer) + + def _add_input_inner(self, refs: RefBundle, input_index: int): + assert input_index == 0, input_index + + # Add RefBundle to the bundler. + self._block_ref_bundler.add_bundle(refs) + self._metrics.on_input_queued(refs) + + if self._block_ref_bundler.has_bundle(): + # The ref bundler combines one or more RefBundles into a new larger + # RefBundle. Rather than dequeuing the new RefBundle, which was never + # enqueued in the first place, we dequeue the original RefBundles. 
+ input_refs, bundled_input = self._block_ref_bundler.get_next_bundle() + for bundle in input_refs: + self._metrics.on_input_dequeued(bundle) + + # If the bundler has a full bundle, add it to the operator's task submission + # queue + self._add_bundled_input(bundled_input) + + def _get_runtime_ray_remote_args( + self, input_bundle: Optional[RefBundle] = None + ) -> Dict[str, Any]: + ray_remote_args = copy.deepcopy(self._ray_remote_args) + + # Override parameters from user provided remote args function. + if self._ray_remote_args_fn: + new_remote_args = self._ray_remote_args_fn() + for k, v in new_remote_args.items(): + ray_remote_args[k] = v + # For tasks with small args, we will use SPREAD by default to optimize for + # compute load-balancing. For tasks with large args, we will use DEFAULT to + # allow the Ray locality scheduler a chance to optimize task placement. + if "scheduling_strategy" not in ray_remote_args: + ctx = self.data_context + if input_bundle and input_bundle.size_bytes() > ctx.large_args_threshold: + ray_remote_args[ + "scheduling_strategy" + ] = ctx.scheduling_strategy_large_args + # Takes precedence over small args case. This is to let users know + # when the large args case is being triggered. + self._remote_args_for_metrics = copy.deepcopy(ray_remote_args) + else: + ray_remote_args["scheduling_strategy"] = ctx.scheduling_strategy + # Only save to metrics if we haven't already done so. + if "scheduling_strategy" not in self._remote_args_for_metrics: + self._remote_args_for_metrics = copy.deepcopy(ray_remote_args) + # This should take precedence over previously set scheduling strategy, as it + # implements actor-based locality overrides. + if self._ray_remote_args_factory_actor_locality: + return self._ray_remote_args_factory_actor_locality(ray_remote_args) + return ray_remote_args + + @abstractmethod + def _add_bundled_input(self, refs: RefBundle): + """Add a pre-bundled upstream output to this operator. 
+ + Unlike the add_input() arg, this RefBundle has already been further bundled by + _block_ref_bundler up to the target size, meaning that this bundle is ready for + task submission. + + This must be implemented by subclasses. + + Args: + refs: The fully-bundled ref bundle that should be added as input. + """ + raise NotImplementedError + + def _submit_data_task( + self, + gen: ObjectRefGenerator, + inputs: RefBundle, + task_done_callback: Optional[Callable[[], None]] = None, + ): + """Submit a new data-handling task.""" + # TODO(hchen): + # 1. Move this to the base PhyscialOperator class. + # 2. This method should only take a block-processing function as input, + # instead of a streaming generator. The logic of submitting ray tasks + # can also be capsulated in the base class. + task_index = self._next_data_task_idx + self._next_data_task_idx += 1 + self._metrics.on_task_submitted(task_index, inputs) + + def _output_ready_callback(task_index, output: RefBundle): + # Since output is streamed, it should only contain one block. + assert len(output) == 1 + self._metrics.on_task_output_generated(task_index, output) + + # Notify output queue that the task has produced an new output. 
+ self._output_queue.notify_task_output_ready(task_index, output) + self._metrics.on_output_queued(output) + + def _task_done_callback(task_index: int, exception: Optional[Exception]): + self._metrics.on_task_finished(task_index, exception) + + # Estimate number of tasks and rows from inputs received and tasks + # submitted so far + upstream_op_num_outputs = self.input_dependencies[0].num_outputs_total() + if upstream_op_num_outputs: + estimated_num_tasks = ( + upstream_op_num_outputs + / self._metrics.num_inputs_received + * self._next_data_task_idx + ) + self._estimated_num_output_bundles = round( + estimated_num_tasks + * self._metrics.num_outputs_of_finished_tasks + / self._metrics.num_tasks_finished + ) + self._estimated_output_num_rows = round( + estimated_num_tasks + * self._metrics.rows_task_outputs_generated + / self._metrics.num_tasks_finished + ) + + self._data_tasks.pop(task_index) + # Notify output queue that this task is complete. + self._output_queue.notify_task_completed(task_index) + if task_done_callback: + task_done_callback() + + self._data_tasks[task_index] = DataOpTask( + task_index, + gen, + lambda output: _output_ready_callback(task_index, output), + functools.partial(_task_done_callback, task_index), + ) + + def _submit_metadata_task( + self, result_ref: ObjectRef, task_done_callback: Callable[[], None] + ): + """Submit a new metadata-handling task.""" + # TODO(hchen): Move this to the base PhyscialOperator class. 
+ task_index = self._next_metadata_task_idx + self._next_metadata_task_idx += 1 + + def _task_done_callback(): + self._metadata_tasks.pop(task_index) + task_done_callback() + + self._metadata_tasks[task_index] = MetadataOpTask( + task_index, result_ref, _task_done_callback + ) + + def get_active_tasks(self) -> List[OpTask]: + return list(self._metadata_tasks.values()) + list(self._data_tasks.values()) + + def all_inputs_done(self): + self._block_ref_bundler.done_adding_bundles() + if self._block_ref_bundler.has_bundle(): + # Handle any leftover bundles in the bundler. + _, bundled_input = self._block_ref_bundler.get_next_bundle() + self._add_bundled_input(bundled_input) + super().all_inputs_done() + + def has_next(self) -> bool: + assert self._started + return self._output_queue.has_next() + + def _get_next_inner(self) -> RefBundle: + assert self._started + bundle = self._output_queue.get_next() + self._metrics.on_output_dequeued(bundle) + self._output_metadata.extend(bundle.metadata) + return bundle + + @abstractmethod + def progress_str(self) -> str: + raise NotImplementedError + + def _extra_metrics(self) -> Dict[str, Any]: + return {"ray_remote_args": dict(sorted(self._remote_args_for_metrics.items()))} + + def get_stats(self) -> StatsDict: + return {self._name: self._output_metadata} + + def get_map_transformer(self) -> MapTransformer: + return self._map_transformer + + def shutdown(self): + self._data_tasks.clear() + self._metadata_tasks.clear() + + @abstractmethod + def current_processor_usage(self) -> ExecutionResources: + raise NotImplementedError + + @abstractmethod + def pending_processor_usage(self) -> ExecutionResources: + raise NotImplementedError + + @abstractmethod + def base_resource_usage(self) -> ExecutionResources: + raise NotImplementedError + + @abstractmethod + def incremental_resource_usage(self) -> ExecutionResources: + raise NotImplementedError + + def implements_accurate_memory_accounting(self) -> bool: + return True + + def 
supports_fusion(self) -> bool: + return self._supports_fusion + + def num_active_tasks(self) -> int: + # Override `num_active_tasks` to only include data tasks and exclude + # metadata tasks, which are used by the actor-pool map operator to + # check if a newly created actor is ready. + # The reasons are because: + # 1. `PhysicalOperator.completed` checks `num_active_tasks`. The operator + # should be considered completed if there are still pending actors. + # 2. The number of active tasks in the progress bar will be more accurate + # to reflect the actual data processing tasks. + return len(self._data_tasks) + + +def _map_task( + map_transformer: MapTransformer, + data_context: DataContext, + ctx: TaskContext, + *blocks: Block, + **kwargs: Dict[str, Any], +) -> Iterator[Union[Block, List[BlockMetadata]]]: + """Remote function for a single operator task. + + Args: + fn: The callable that takes Iterator[Block] as input and returns + Iterator[Block] as output. + blocks: The concrete block values from the task ref bundle. + + Returns: + A generator of blocks, followed by the list of BlockMetadata for the blocks + as the last generator return. + """ + DataContext._set_current(data_context) + ctx.kwargs.update(kwargs) + stats = BlockExecStats.builder() + map_transformer.set_target_max_block_size(ctx.target_max_block_size) + for b_out in map_transformer.apply_transform(iter(blocks), ctx): + # TODO(Clark): Add input file propagation from input blocks. + m_out = BlockAccessor.for_block(b_out).get_metadata() + m_out.exec_stats = stats.build() + m_out.exec_stats.udf_time_s = map_transformer.udf_time() + m_out.exec_stats.task_idx = ctx.task_idx + yield b_out + yield m_out + stats = BlockExecStats.builder() + + +class _BlockRefBundler: + """Rebundles RefBundles to get them close to a particular number of rows.""" + + def __init__(self, min_rows_per_bundle: Optional[int]): + """Creates a BlockRefBundler. + + Args: + min_rows_per_bundle: The target number of rows per bundle. 
Note that we + bundle up to this target, but only exceed it if not doing so would + result in an empty bundle. + """ + self._min_rows_per_bundle = min_rows_per_bundle + self._bundle_buffer: List[RefBundle] = [] + self._bundle_buffer_size = 0 + self._finalized = False + + def add_bundle(self, bundle: RefBundle): + """Add a bundle to the bundler.""" + self._bundle_buffer.append(bundle) + self._bundle_buffer_size += self._get_bundle_size(bundle) + + def has_bundle(self) -> bool: + """Returns whether the bundler has a bundle.""" + return self._bundle_buffer and ( + self._min_rows_per_bundle is None + or self._bundle_buffer_size >= self._min_rows_per_bundle + or (self._finalized and self._bundle_buffer_size > 0) + ) + + def get_next_bundle(self) -> Tuple[List[RefBundle], RefBundle]: + """Gets the next bundle. + + Returns: + A two-tuple. The first element is a list of bundles that were combined into + the output bundle. The second element is the output bundle. + """ + assert self.has_bundle() + if self._min_rows_per_bundle is None: + # Short-circuit if no bundle row target was defined. + assert len(self._bundle_buffer) == 1 + bundle = self._bundle_buffer[0] + self._bundle_buffer = [] + self._bundle_buffer_size = 0 + return [bundle], bundle + leftover = [] + output_buffer = [] + output_buffer_size = 0 + buffer_filled = False + for bundle in self._bundle_buffer: + bundle_size = self._get_bundle_size(bundle) + if buffer_filled: + # Buffer has been filled, save it in the leftovers. + leftover.append(bundle) + elif ( + output_buffer_size + bundle_size <= self._min_rows_per_bundle + or output_buffer_size == 0 + ): + # Bundle fits in buffer, or bundle doesn't fit but the buffer still + # needs a non-empty bundle. + output_buffer.append(bundle) + output_buffer_size += bundle_size + else: + # Bundle doesn't fit in a buffer that already has at least one non-empty + # bundle, so we add it to the leftovers. + leftover.append(bundle) + # Add all remaining bundles to the leftovers. 
+ buffer_filled = True + self._bundle_buffer = leftover + self._bundle_buffer_size = sum( + self._get_bundle_size(bundle) for bundle in leftover + ) + return list(output_buffer), _merge_ref_bundles(*output_buffer) + + def done_adding_bundles(self): + """Indicate that no more RefBundles will be added to this bundler.""" + self._finalized = True + + @staticmethod + def _get_bundle_size(bundle: RefBundle): + return bundle.num_rows() if bundle.num_rows() is not None else float("inf") + + +def _merge_ref_bundles(*bundles: RefBundle) -> RefBundle: + """Merge N ref bundles into a single bundle of multiple blocks.""" + # Check that at least one bundle is non-null. + assert any(bundle is not None for bundle in bundles) + blocks = list( + itertools.chain( + block for bundle in bundles if bundle is not None for block in bundle.blocks + ) + ) + owns_blocks = all(bundle.owns_blocks for bundle in bundles if bundle is not None) + return RefBundle(blocks, owns_blocks) + + +class _OutputQueue(ABC): + """Interface for swapping between different output order modes.""" + + @abstractmethod + def notify_task_output_ready(self, task_index: int, output: RefBundle): + """Called when a task's output is ready.""" + pass + + def notify_task_completed(self, task_index: int): + """Called when a previously pending task completes.""" + pass + + @abstractmethod + def has_next(self) -> bool: + pass + + @abstractmethod + def get_next(self) -> RefBundle: + pass + + +class _OrderedOutputQueue(_OutputQueue): + """An queue that returns finished tasks in submission order.""" + + def __init__(self): + self._task_outputs: Dict[int, Deque[RefBundle]] = defaultdict(lambda: deque()) + self._current_output_index: int = 0 + self._completed_tasks: Set[int] = set() + + def notify_task_output_ready(self, task_index: int, output: RefBundle): + self._task_outputs[task_index].append(output) + + def _move_to_next_task(self): + """Move the outut index to the next task. 
+ + This method should only be called when the current task is complete and all + outputs have been taken. + """ + assert len(self._task_outputs[self._current_output_index]) == 0 + assert self._current_output_index in self._completed_tasks + del self._task_outputs[self._current_output_index] + self._completed_tasks.remove(self._current_output_index) + self._current_output_index += 1 + + def notify_task_completed(self, task_index: int): + assert task_index >= self._current_output_index + self._completed_tasks.add(task_index) + if task_index == self._current_output_index: + if len(self._task_outputs[task_index]) == 0: + self._move_to_next_task() + + def has_next(self) -> bool: + return len(self._task_outputs[self._current_output_index]) > 0 + + def get_next(self) -> RefBundle: + next_bundle = self._task_outputs[self._current_output_index].popleft() + if len(self._task_outputs[self._current_output_index]) == 0: + if self._current_output_index in self._completed_tasks: + self._move_to_next_task() + return next_bundle + + +class _UnorderedOutputQueue(_OutputQueue): + """An queue that does not guarantee output order of finished tasks.""" + + def __init__(self): + self._queue: Deque[RefBundle] = deque() + + def notify_task_output_ready(self, _: int, output: RefBundle): + self._queue.append(output) + + def has_next(self) -> bool: + return len(self._queue) > 0 + + def get_next(self) -> RefBundle: + return self._queue.popleft() + + +def _canonicalize_ray_remote_args(ray_remote_args: Dict[str, Any]) -> Dict[str, Any]: + """Enforce rules on ray remote args for map tasks. + + Namely, args must explicitly specify either CPU or GPU, not both. Disallowing + mixed resources avoids potential starvation and deadlock issues during scheduling, + and should not be a serious limitation for users. 
+ """ + ray_remote_args = ray_remote_args.copy() + + if ray_remote_args.get("num_cpus") and ray_remote_args.get("num_gpus"): + logger.warning( + "Specifying both num_cpus and num_gpus for map tasks is experimental, " + "and may result in scheduling or stability issues. " + "Please report any issues to the Ray team: " + "https://github.com/ray-project/ray/issues/new/choose" + ) + + if "num_cpus" not in ray_remote_args and "num_gpus" not in ray_remote_args: + ray_remote_args["num_cpus"] = 1 + + return ray_remote_args diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/map_transformer.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/map_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..d3135fce59a4785f0066544f3130750263205256 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/map_transformer.py @@ -0,0 +1,460 @@ +import itertools +import time +from abc import abstractmethod +from enum import Enum +from typing import Any, Callable, Dict, Iterable, List, Optional, TypeVar, Union + +from ray.data._internal.block_batching.block_batching import batch_blocks +from ray.data._internal.execution.interfaces.task_context import TaskContext +from ray.data._internal.output_buffer import BlockOutputBuffer +from ray.data.block import Block, BlockAccessor, DataBatch + +# Allowed input/output data types for a MapTransformFn. +Row = Dict[str, Any] +MapTransformFnData = Union[Block, Row, DataBatch] + +# Function signature of a MapTransformFn. 
class MapTransformFn:
    """Represents a single transform function in a MapTransformer.

    Subclasses implement `__call__` to map an iterable of input data items
    (blocks, rows, or batches, per `input_type`) to an iterable of output
    data items (per `output_type`).
    """

    def __init__(
        self,
        input_type: MapTransformFnDataType,
        output_type: MapTransformFnDataType,
        is_udf: bool = False,
    ):
        """
        Args:
            input_type: the type of the input data.
            output_type: the type of the output data.
            is_udf: whether this transform wraps a user-defined function
                (UDF steps are timed separately by the MapTransformer).
        """
        # NOTE(review): this binds the *builtin* `callable` — there is no
        # `callable` parameter in this signature. Looks like a leftover from
        # an older signature; confirm before relying on `self._callable`.
        self._callable = callable
        self._input_type = input_type
        self._output_type = output_type
        # Target output block size in bytes; None until
        # `set_target_max_block_size()` is called.
        self._target_max_block_size = None
        self._is_udf = is_udf

    @abstractmethod
    def __call__(
        self,
        input: Iterable[MapTransformFnData],
        ctx: TaskContext,
    ) -> Iterable[MapTransformFnData]:
        ...

    @property
    def input_type(self) -> MapTransformFnDataType:
        return self._input_type

    @property
    def output_type(self) -> MapTransformFnDataType:
        return self._output_type

    def set_target_max_block_size(self, target_max_block_size: int):
        # Used by block-building transforms to cap output block sizes.
        self._target_max_block_size = target_max_block_size
+ """ + self.set_transform_fns(transform_fns) + self._init_fn = init_fn if init_fn is not None else lambda: None + self._target_max_block_size = None + self._udf_time = 0 + + def set_transform_fns(self, transform_fns: List[MapTransformFn]) -> None: + """Set the transform functions.""" + assert len(transform_fns) > 0 + assert ( + transform_fns[0].input_type == MapTransformFnDataType.Block + ), "The first transform function must take blocks as input." + assert ( + transform_fns[-1].output_type == MapTransformFnDataType.Block + ), "The last transform function must output blocks." + + for i in range(len(transform_fns) - 1): + assert transform_fns[i].output_type == transform_fns[i + 1].input_type, ( + "The output type of the previous transform function must match " + "the input type of the next transform function." + ) + self._transform_fns = transform_fns + + def get_transform_fns(self) -> List[MapTransformFn]: + """Get the transform functions.""" + return self._transform_fns + + def set_target_max_block_size(self, target_max_block_size: int): + self._target_max_block_size = target_max_block_size + + def init(self) -> None: + """Initialize the transformer. + + Should be called before applying the transform. 
+ """ + self._init_fn() + + def _udf_timed_iter( + self, input: Iterable[MapTransformFnData] + ) -> Iterable[MapTransformFnData]: + while True: + try: + start = time.perf_counter() + output = next(input) + self._udf_time += time.perf_counter() - start + yield output + except StopIteration: + break + + def apply_transform( + self, + input_blocks: Iterable[Block], + ctx: TaskContext, + ) -> Iterable[Block]: + """Apply the transform functions to the input blocks.""" + assert ( + self._target_max_block_size is not None + ), "target_max_block_size must be set before running" + for transform_fn in self._transform_fns: + transform_fn.set_target_max_block_size(self._target_max_block_size) + + iter = input_blocks + # Apply the transform functions sequentially to the input iterable. + for transform_fn in self._transform_fns: + iter = transform_fn(iter, ctx) + if transform_fn._is_udf: + iter = self._udf_timed_iter(iter) + return iter + + def fuse(self, other: "MapTransformer") -> "MapTransformer": + """Fuse two `MapTransformer`s together.""" + assert self._target_max_block_size == other._target_max_block_size or ( + self._target_max_block_size is None or other._target_max_block_size is None + ) + target_max_block_size = ( + self._target_max_block_size or other._target_max_block_size + ) + + # Define them as standalone variables to avoid fused_init_fn capturing the + # entire `MapTransformer` object. 
class RowMapTransformFn(MapTransformFn):
    """A MapTransformFn that maps an iterable of rows to an iterable of rows."""

    def __init__(self, row_fn: MapTransformCallable[Row, Row], is_udf: bool = False):
        self._row_fn = row_fn
        super().__init__(
            MapTransformFnDataType.Row,
            MapTransformFnDataType.Row,
            is_udf=is_udf,
        )

    def __call__(self, input: Iterable[Row], ctx: TaskContext) -> Iterable[Row]:
        # Delegate to the wrapped row-level function, streaming rows through.
        for transformed_row in self._row_fn(input, ctx):
            yield transformed_row

    def __repr__(self) -> str:
        return f"RowMapTransformFn({self._row_fn})"
class BlocksToRowsMapTransformFn(MapTransformFn):
    """A MapTransformFn that converts input blocks to rows."""

    def __init__(self):
        super().__init__(
            MapTransformFnDataType.Block,
            MapTransformFnDataType.Row,
        )

    def __call__(self, blocks: Iterable[Block], _: TaskContext) -> Iterable[Row]:
        # Stream rows out of each block via its accessor, in block order.
        for block in blocks:
            accessor = BlockAccessor.for_block(block)
            yield from accessor.iter_rows(public_row_format=True)

    @classmethod
    def instance(cls) -> "BlocksToRowsMapTransformFn":
        """Returns the singleton instance."""
        cached = getattr(cls, "_instance", None)
        if cached is None:
            cached = cls()
            cls._instance = cached
        return cached

    def __repr__(self) -> str:
        return "BlocksToRowsMapTransformFn()"
+ first = None + + # Ensure that zero-copy batch views are copied so mutating UDFs don't error. + formatted_batch_iter = batch_blocks( + blocks=blocks, + stats=None, + batch_size=self._batch_size, + batch_format=self._batch_format, + ensure_copy=self._ensure_copy, + ) + + first = next(formatted_batch_iter, None) + if first is None: + # If the input blocks are all empty, then yield an empty block with same + # format as the input blocks. + return [empty_block] + else: + return itertools.chain([first], formatted_batch_iter) + + @property + def batch_size(self) -> Optional[int]: + return self._batch_size + + @property + def batch_format(self) -> str: + return self._batch_format + + @property + def zero_copy_batch(self) -> bool: + return not self._ensure_copy + + def __repr__(self) -> str: + return ( + f"BlocksToBatchesMapTransformFn(" + f"batch_size={self._batch_size}, " + f"batch_format={self._batch_format}, " + f"zero_copy_batch={self.zero_copy_batch}" + f")" + ) + + +class BuildOutputBlocksMapTransformFn(MapTransformFn): + """A MapTransformFn that converts UDF-returned data to output blocks.""" + + def __init__(self, input_type: MapTransformFnDataType): + """ + Args: + input_type: the type of input data. + """ + self._input_type = input_type + super().__init__( + input_type, + MapTransformFnDataType.Block, + ) + + def __call__( + self, + iter: Iterable[MapTransformFnData], + _: TaskContext, + ) -> Iterable[Block]: + """Convert UDF-returned data to output blocks. + + Args: + iter: the iterable of UDF-returned data, whose type + must match self._input_type. 
+ """ + assert ( + self._target_max_block_size is not None + ), "target_max_block_size must be set before running" + output_buffer = BlockOutputBuffer(self._target_max_block_size) + if self._input_type == MapTransformFnDataType.Block: + add_fn = output_buffer.add_block + elif self._input_type == MapTransformFnDataType.Batch: + add_fn = output_buffer.add_batch + else: + assert self._input_type == MapTransformFnDataType.Row + add_fn = output_buffer.add + for data in iter: + add_fn(data) + while output_buffer.has_next(): + yield output_buffer.next() + output_buffer.finalize() + while output_buffer.has_next(): + yield output_buffer.next() + + @classmethod + def for_rows(cls) -> "BuildOutputBlocksMapTransformFn": + """Return a BuildOutputBlocksMapTransformFn for row input.""" + return cls(MapTransformFnDataType.Row) + + @classmethod + def for_batches(cls) -> "BuildOutputBlocksMapTransformFn": + """Return a BuildOutputBlocksMapTransformFn for batch input.""" + return cls(MapTransformFnDataType.Batch) + + @classmethod + def for_blocks(cls) -> "BuildOutputBlocksMapTransformFn": + """Return a BuildOutputBlocksMapTransformFn for block input.""" + return cls(MapTransformFnDataType.Block) + + def __repr__(self) -> str: + return f"BuildOutputBlocksMapTransformFn(input_type={self._input_type})" + + +def _splitrange(n, k): + """Calculates array lens of np.array_split(). + + This is the equivalent of + `[len(x) for x in np.array_split(range(n), k)]`. + """ + base = n // k + output = [base] * k + rem = n - sum(output) + for i in range(len(output)): + if rem > 0: + output[i] += 1 + rem -= 1 + assert rem == 0, (rem, output, n, k) + assert sum(output) == n, (output, n, k) + return output + + +class ApplyAdditionalSplitToOutputBlocks(MapTransformFn): + """Do additional splits on output blocks.""" + + def __init__(self, additional_split_factor: int): + """ + Args: + additional_output_splits: The number of additional splits, must be + greater than 1. 
+ """ + assert additional_split_factor > 1 + self._additional_split_factor = additional_split_factor + super().__init__(MapTransformFnDataType.Block, MapTransformFnDataType.Block) + + def __call__(self, blocks: Iterable[Block], ctx: TaskContext) -> Iterable[Block]: + for block in blocks: + block = BlockAccessor.for_block(block) + offset = 0 + split_sizes = _splitrange(block.num_rows(), self._additional_split_factor) + for size in split_sizes: + # NOTE: copy=True is needed because this is an output block. If + # a block slice is put into the object store, the entire block + # will get serialized. + yield block.slice(offset, offset + size, copy=True) + offset += size diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/output_splitter.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/output_splitter.py new file mode 100644 index 0000000000000000000000000000000000000000..d571c92a4d164fc380c58b06937d411952394b7c --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/output_splitter.py @@ -0,0 +1,330 @@ +import math +import time +from collections import deque +from typing import Any, Dict, List, Optional, Tuple + +from ray.data._internal.execution.interfaces import ( + ExecutionOptions, + NodeIdStr, + PhysicalOperator, + RefBundle, +) +from ray.data._internal.execution.util import locality_string +from ray.data._internal.remote_fn import cached_remote_fn +from ray.data._internal.stats import StatsDict +from ray.data.block import Block, BlockAccessor, BlockMetadata +from ray.data.context import DataContext +from ray.types import ObjectRef + + +class OutputSplitter(PhysicalOperator): + """An operator that splits the given data into `n` output splits. + + The output bundles of this operator will have a `bundle.output_split_idx` attr + set to an integer from [0..n-1]. This operator tries to divide the rows evenly + across output splits. 
If the `equal` option is set, the operator will furthermore + guarantee an exact split of rows across outputs, truncating the Dataset. + + Implementation wise, this operator keeps an internal buffer of bundles. The buffer + has a minimum size calculated to enable a good locality hit rate, as well as ensure + we can satisfy the `equal` requirement. + + OutputSplitter does not provide any ordering guarantees. + """ + + def __init__( + self, + input_op: PhysicalOperator, + n: int, + equal: bool, + data_context: DataContext, + locality_hints: Optional[List[NodeIdStr]] = None, + ): + super().__init__( + f"split({n}, equal={equal})", + [input_op], + data_context, + target_max_block_size=None, + ) + self._equal = equal + # Buffer of bundles not yet assigned to output splits. + self._buffer: List[RefBundle] = [] + # The outputted bundles with output_split attribute set. + self._output_queue: deque[RefBundle] = deque() + # The number of rows output to each output split so far. + self._num_output: List[int] = [0 for _ in range(n)] + # The time of the overhead for the output splitter (operator level) + self._output_splitter_overhead_time = 0 + + if locality_hints is not None: + if n != len(locality_hints): + raise ValueError( + "Locality hints list must have length `n`: " + f"len({locality_hints}) != {n}" + ) + self._locality_hints = locality_hints + if locality_hints: + # To optimize locality, we should buffer a certain number of elements + # internally before dispatch to allow the locality algorithm a good chance + # of selecting a preferred location. We use a small multiple of `n` since + # it's reasonable to buffer a couple blocks per consumer. + self._min_buffer_size = 2 * n + else: + self._min_buffer_size = 0 + self._locality_hits = 0 + self._locality_misses = 0 + + def num_outputs_total(self) -> Optional[int]: + # OutputSplitter does not change the number of blocks, + # so we can return the number of blocks from the input op. 
+ return self.input_dependencies[0].num_outputs_total() + + def num_output_rows_total(self) -> Optional[int]: + # The total number of rows is the same as the number of input rows. + return self.input_dependencies[0].num_output_rows_total() + + def start(self, options: ExecutionOptions) -> None: + super().start(options) + # Force disable locality optimization. + if not options.actor_locality_enabled: + self._locality_hints = None + self._min_buffer_size = 0 + + def throttling_disabled(self) -> bool: + """Disables resource-based throttling. + + It doesn't make sense to throttle the inputs to this operator, since all that + would do is lower the buffer size and prevent us from emitting outputs / + reduce the locality hit rate. + """ + return True + + def has_next(self) -> bool: + return len(self._output_queue) > 0 + + def _get_next_inner(self) -> RefBundle: + output = self._output_queue.popleft() + self._metrics.on_output_dequeued(output) + return output + + def get_stats(self) -> StatsDict: + return {"split": []} # TODO(ekl) add split metrics? + + def _extra_metrics(self) -> Dict[str, Any]: + stats = {} + for i, num in enumerate(self._num_output): + stats[f"num_output_{i}"] = num + stats["output_splitter_overhead_time"] = self._output_splitter_overhead_time + return stats + + def _add_input_inner(self, bundle, input_index) -> None: + if bundle.num_rows() is None: + raise ValueError("OutputSplitter requires bundles with known row count") + self._buffer.append(bundle) + self._metrics.on_input_queued(bundle) + self._dispatch_bundles() + + def all_inputs_done(self) -> None: + super().all_inputs_done() + if not self._equal: + self._dispatch_bundles(dispatch_all=True) + assert not self._buffer, "Should have dispatched all bundles." + return + + # Otherwise: + # Need to finalize distribution of buffered data to output splits. 
+ buffer_size = sum(b.num_rows() for b in self._buffer) + max_n = max(self._num_output) + + # First calculate the min rows to add per output to equalize them. + allocation = [max_n - n for n in self._num_output] + remainder = buffer_size - sum(allocation) + # Invariant: buffer should always be large enough to equalize. + assert remainder >= 0, (remainder, buffer_size, allocation) + + # Equally distribute remaining rows in buffer to outputs. + x = remainder // len(allocation) + allocation = [a + x for a in allocation] + + # Execute the split. + for i, count in enumerate(allocation): + bundles = self._split_from_buffer(count) + for b in bundles: + b.output_split_idx = i + self._output_queue.append(b) + self._metrics.on_output_queued(b) + self._buffer = [] + + def internal_queue_size(self) -> int: + return len(self._buffer) + + def progress_str(self) -> str: + if self._locality_hints: + return locality_string(self._locality_hits, self._locality_misses) + else: + return "[locality disabled]" + + def _dispatch_bundles(self, dispatch_all: bool = False) -> None: + start_time = time.perf_counter() + # Dispatch all dispatchable bundles from the internal buffer. + # This may not dispatch all bundles when equal=True. + while self._buffer and ( + dispatch_all or len(self._buffer) >= self._min_buffer_size + ): + target_index = self._select_output_index() + target_bundle = self._pop_bundle_to_dispatch(target_index) + if self._can_safely_dispatch(target_index, target_bundle.num_rows()): + target_bundle.output_split_idx = target_index + self._num_output[target_index] += target_bundle.num_rows() + self._output_queue.append(target_bundle) + self._metrics.on_output_queued(target_bundle) + if self._locality_hints: + preferred_loc = self._locality_hints[target_index] + if self._get_location(target_bundle) == preferred_loc: + self._locality_hits += 1 + else: + self._locality_misses += 1 + else: + # Put it back and abort. 
+ self._buffer.insert(0, target_bundle) + self._metrics.on_input_queued(target_bundle) + break + self._output_splitter_overhead_time += time.perf_counter() - start_time + + def _select_output_index(self) -> int: + # Greedily dispatch to the consumer with the least data so far. + i, _ = min(enumerate(self._num_output), key=lambda t: t[1]) + return i + + def _pop_bundle_to_dispatch(self, target_index: int) -> RefBundle: + if self._locality_hints: + preferred_loc = self._locality_hints[target_index] + for bundle in self._buffer: + if self._get_location(bundle) == preferred_loc: + self._buffer.remove(bundle) + self._metrics.on_input_dequeued(bundle) + return bundle + + bundle = self._buffer.pop(0) + self._metrics.on_input_dequeued(bundle) + return bundle + + def _can_safely_dispatch(self, target_index: int, nrow: int) -> bool: + if not self._equal: + # If not in equals mode, dispatch away with no buffer requirements. + return True + output_distribution = self._num_output.copy() + output_distribution[target_index] += nrow + buffer_requirement = self._calculate_buffer_requirement(output_distribution) + buffer_size = sum(b.num_rows() for b in self._buffer) + return buffer_size >= buffer_requirement + + def _calculate_buffer_requirement(self, output_distribution: List[int]) -> int: + # Calculate the new number of rows that we'd need to equalize the row + # distribution after the bundle dispatch. 
+ max_n = max(output_distribution) + return sum([max_n - n for n in output_distribution]) + + def _split_from_buffer(self, nrow: int) -> List[RefBundle]: + output = [] + acc = 0 + while acc < nrow: + b = self._buffer.pop() + self._metrics.on_input_dequeued(b) + if acc + b.num_rows() <= nrow: + output.append(b) + acc += b.num_rows() + else: + left, right = _split(b, nrow - acc) + output.append(left) + acc += left.num_rows() + self._buffer.append(right) + self._metrics.on_input_queued(right) + assert acc == nrow, (acc, nrow) + + assert sum(b.num_rows() for b in output) == nrow, (acc, nrow) + return output + + def _get_location(self, bundle: RefBundle) -> Optional[NodeIdStr]: + """Ask Ray for the node id of the given bundle. + + This method may be overriden for testing. + + Returns: + A node id associated with the bundle, or None if unknown. + """ + return bundle.get_cached_location() + + def implements_accurate_memory_accounting(self) -> bool: + return True + + +def _split(bundle: RefBundle, left_size: int) -> Tuple[RefBundle, RefBundle]: + left_blocks, left_meta = [], [] + right_blocks, right_meta = [], [] + acc = 0 + for b, m in bundle.blocks: + if acc >= left_size: + right_blocks.append(b) + right_meta.append(m) + elif acc + m.num_rows <= left_size: + left_blocks.append(b) + left_meta.append(m) + acc += m.num_rows + else: + # Trouble case: split it up. 
def _split_meta(
    m: BlockMetadata, left_size: int
) -> Tuple[BlockMetadata, BlockMetadata]:
    """Split block metadata into metadata for a left/right pair of blocks.

    The left half gets `left_size` rows and a proportional (rounded-down)
    share of the byte size; the right half gets the remainder, so the two
    halves always sum to the original row and byte counts.
    """
    right_size = m.num_rows - left_size
    # Apportion bytes by row fraction, rounding the left share down.
    left_bytes = int(math.floor(m.size_bytes * (left_size / m.num_rows)))
    right_bytes = m.size_bytes - left_bytes
    left = BlockMetadata(
        num_rows=left_size,
        size_bytes=left_bytes,
        schema=m.schema,
        input_files=m.input_files,
        exec_stats=None,
    )
    right = BlockMetadata(
        num_rows=right_size,
        size_bytes=right_bytes,
        schema=m.schema,
        input_files=m.input_files,
        exec_stats=None,
    )
    return left, right
class TaskPoolMapOperator(MapOperator):
    """A MapOperator implementation that executes tasks on a task pool.

    Each bundled input is submitted as an independent Ray task (as opposed to
    being dispatched to a long-lived actor pool).
    """

    def __init__(
        self,
        map_transformer: MapTransformer,
        input_op: PhysicalOperator,
        data_context: DataContext,
        target_max_block_size: Optional[int],
        name: str = "TaskPoolMap",
        min_rows_per_bundle: Optional[int] = None,
        concurrency: Optional[int] = None,
        supports_fusion: bool = True,
        ray_remote_args_fn: Optional[Callable[[], Dict[str, Any]]] = None,
        ray_remote_args: Optional[Dict[str, Any]] = None,
    ):
        """Create a TaskPoolMapOperator instance.

        Args:
            map_transformer: The transformation to apply to each ref bundle input.
            input_op: Operator generating input data for this op.
            name: The name of this operator.
            target_max_block_size: The target maximum number of bytes to
                include in an output block.
            min_rows_per_bundle: The number of rows to gather per batch passed to the
                transform_fn, or None to use the block size. Setting the batch size is
                important for the performance of GPU-accelerated transform functions.
                The actual rows passed may be less if the dataset is small.
            concurrency: The maximum number of Ray tasks to use concurrently,
                or None to use as many tasks as possible.
            supports_fusion: Whether this operator supports fusion with other operators.
            ray_remote_args_fn: A function that returns a dictionary of remote args
                passed to each map worker. The purpose of this argument is to generate
                dynamic arguments for each actor/task, and will be called each time
                prior to initializing the worker. Args returned from this dict will
                always override the args in ``ray_remote_args``. Note: this is an
                advanced, experimental feature.
            ray_remote_args: Customize the :func:`ray.remote` args for this op's tasks.
        """
        super().__init__(
            map_transformer,
            input_op,
            data_context,
            name,
            target_max_block_size,
            min_rows_per_bundle,
            supports_fusion,
            ray_remote_args_fn,
            ray_remote_args,
        )
        self._concurrency = concurrency

        # NOTE: Unlike static Ray remote args, dynamic arguments extracted from the
        # blocks themselves are going to be passed inside `fn.options(...)`
        # invocation
        ray_remote_static_args = {
            **(self._ray_remote_args or {}),
            "num_returns": "streaming",
        }

        # Cached remote wrapper around the shared `_map_task` entry point.
        self._map_task = cached_remote_fn(_map_task, **ray_remote_static_args)

    def _add_bundled_input(self, bundle: RefBundle):
        # Submit the task as a normal Ray task.
        ctx = TaskContext(
            task_idx=self._next_data_task_idx,
            target_max_block_size=self.actual_target_max_block_size,
        )

        # Per-task args (e.g. locality) are computed from the input bundle.
        dynamic_ray_remote_args = self._get_runtime_ray_remote_args(input_bundle=bundle)
        dynamic_ray_remote_args["name"] = self.name

        data_context = self.data_context
        if data_context._max_num_blocks_in_streaming_gen_buffer is not None:
            # The `_generator_backpressure_num_objects` parameter should be
            # `2 * _max_num_blocks_in_streaming_gen_buffer` because we yield
            # 2 objects for each block: the block and the block metadata.
            dynamic_ray_remote_args["_generator_backpressure_num_objects"] = (
                2 * data_context._max_num_blocks_in_streaming_gen_buffer
            )

        gen = self._map_task.options(**dynamic_ray_remote_args).remote(
            self._map_transformer_ref,
            data_context,
            ctx,
            *bundle.block_refs,
            **self.get_map_task_kwargs(),
        )
        self._submit_data_task(gen, bundle)

    def shutdown(self):
        # Cancel all active tasks.
        for _, task in self._data_tasks.items():
            ray.cancel(task.get_waitable())
        # Wait until all tasks have failed or been cancelled.
        for _, task in self._data_tasks.items():
            try:
                ray.get(task.get_waitable())
            except ray.exceptions.RayError:
                # Cancellation either succeeded, or the task had already failed with
                # a different error, or cancellation failed. In all cases, we
                # swallow the exception.
                pass
        super().shutdown()

    def progress_str(self) -> str:
        # Task-pool operators have no extra progress detail to report.
        return ""

    def base_resource_usage(self) -> ExecutionResources:
        # No resources are held while idle; tasks are launched on demand.
        return ExecutionResources()

    def current_processor_usage(self) -> ExecutionResources:
        # Each active task consumes the per-task CPU/GPU request.
        num_active_workers = self.num_active_tasks()
        return ExecutionResources(
            cpu=self._ray_remote_args.get("num_cpus", 0) * num_active_workers,
            gpu=self._ray_remote_args.get("num_gpus", 0) * num_active_workers,
        )

    def pending_processor_usage(self) -> ExecutionResources:
        return ExecutionResources()

    def incremental_resource_usage(self) -> ExecutionResources:
        # Cost of launching one more task: its CPU/GPU request plus the
        # object-store memory its pending output may occupy.
        return ExecutionResources(
            cpu=self._ray_remote_args.get("num_cpus", 0),
            gpu=self._ray_remote_args.get("num_gpus", 0),
            object_store_memory=self._metrics.obj_store_mem_max_pending_output_per_task
            or 0,
        )

    def get_concurrency(self) -> Optional[int]:
        return self._concurrency
class UnionOperator(NAryOperator):
    """An operator that combines output blocks from
    two or more input operators into a single output."""

    def __init__(
        self,
        data_context: DataContext,
        *input_ops: PhysicalOperator,
    ):
        """Create a UnionOperator.

        Args:
            input_ops: Operators generating input data for this operator to union.
        """

        # By default, union does not preserve the order of output blocks.
        # To preserve the order, configure ExecutionOptions accordingly.
        self._preserve_order = False

        # Intermediary buffers used to store blocks from each input dependency.
        # Only used when `self._preserve_order` is True.
        self._input_buffers: List[List[RefBundle]] = [[] for _ in range(len(input_ops))]

        # The index of the input dependency that is currently the source of
        # the output buffer. New inputs from this input dependency will be added
        # directly to the output buffer. Only used when `self._preserve_order` is True.
        self._input_idx_to_output = 0

        # Bundles ready to be emitted downstream, in emission order.
        self._output_buffer: List[RefBundle] = []
        self._stats: StatsDict = {"Union": []}
        super().__init__(data_context, *input_ops)

    def start(self, options: ExecutionOptions):
        # Whether to preserve the order of the input data (both the
        # order of the input operators and the order of the blocks within).
        self._preserve_order = options.preserve_order
        super().start(options)

    def num_outputs_total(self) -> Optional[int]:
        # Sum of the inputs' block counts; unknown (None) if any input's is.
        num_outputs = 0
        for input_op in self.input_dependencies:
            input_num_outputs = input_op.num_outputs_total()
            if input_num_outputs is None:
                return None
            num_outputs += input_num_outputs
        return num_outputs

    def num_output_rows_total(self) -> Optional[int]:
        # Sum of the inputs' row counts; unknown (None) if any input's is.
        total_rows = 0
        for input_op in self.input_dependencies:
            input_num_rows = input_op.num_output_rows_total()
            if input_num_rows is None:
                return None
            total_rows += input_num_rows
        return total_rows

    def _add_input_inner(self, refs: RefBundle, input_index: int) -> None:
        assert not self.completed()
        assert 0 <= input_index <= len(self._input_dependencies), input_index

        if not self._preserve_order:
            # Unordered mode: emit bundles as soon as they arrive.
            self._output_buffer.append(refs)
        else:
            if input_index == self._input_idx_to_output:
                # This input is the current output source; emit directly.
                self._output_buffer.append(refs)
            else:
                # Hold bundles from later inputs until their turn comes.
                self._input_buffers[input_index].append(refs)

    def input_done(self, input_index: int) -> None:
        """When `self._preserve_order` is True, change the
        output buffer source to the next input dependency
        once the current input dependency calls `input_done()`."""
        if not self._preserve_order:
            return
        if not input_index == self._input_idx_to_output:
            return
        next_input_idx = self._input_idx_to_output + 1
        if next_input_idx < len(self._input_buffers):
            # Flush everything buffered for the next input, then make it
            # the live output source.
            self._output_buffer.extend(self._input_buffers[next_input_idx])
            self._input_buffers[next_input_idx].clear()
            self._input_idx_to_output = next_input_idx
        super().input_done(input_index)

    def all_inputs_done(self) -> None:
        # Note that in the case where order is not preserved, all inputs
        # are directly added to the output buffer as soon as they are received,
        # so there is no need to check any intermediary buffers.
        if self._preserve_order:
            for idx, input_buffer in enumerate(self._input_buffers):
                assert len(input_buffer) == 0, (
                    f"Input at index {idx} still has "
                    f"{len(input_buffer)} blocks remaining."
                )
        super().all_inputs_done()

    def has_next(self) -> bool:
        # Check if the output buffer still contains at least one block.
        return len(self._output_buffer) > 0

    def _get_next_inner(self) -> RefBundle:
        return self._output_buffer.pop(0)

    def get_stats(self) -> StatsDict:
        return self._stats
+ + Args: + left_input_ops: The input operator at left hand side. + right_input_op: The input operator at right hand side. + """ + self._left_buffer: List[RefBundle] = [] + self._right_buffer: List[RefBundle] = [] + self._output_buffer: List[RefBundle] = [] + self._stats: StatsDict = {} + super().__init__( + "Zip", + [left_input_op, right_input_op], + data_context, + target_max_block_size=None, + ) + + def num_outputs_total(self) -> Optional[int]: + left_num_outputs = self.input_dependencies[0].num_outputs_total() + right_num_outputs = self.input_dependencies[1].num_outputs_total() + if left_num_outputs is not None and right_num_outputs is not None: + return max(left_num_outputs, right_num_outputs) + elif left_num_outputs is not None: + return left_num_outputs + else: + return right_num_outputs + + def num_output_rows_total(self) -> Optional[int]: + left_num_rows = self.input_dependencies[0].num_output_rows_total() + right_num_rows = self.input_dependencies[1].num_output_rows_total() + if left_num_rows is not None and right_num_rows is not None: + return max(left_num_rows, right_num_rows) + elif left_num_rows is not None: + return left_num_rows + else: + return right_num_rows + + def _add_input_inner(self, refs: RefBundle, input_index: int) -> None: + assert not self.completed() + assert input_index == 0 or input_index == 1, input_index + if input_index == 0: + self._left_buffer.append(refs) + else: + self._right_buffer.append(refs) + + def all_inputs_done(self) -> None: + self._output_buffer, self._stats = self._zip( + self._left_buffer, self._right_buffer + ) + self._left_buffer.clear() + self._right_buffer.clear() + super().all_inputs_done() + + def has_next(self) -> bool: + return len(self._output_buffer) > 0 + + def _get_next_inner(self) -> RefBundle: + return self._output_buffer.pop(0) + + def get_stats(self) -> StatsDict: + return self._stats + + def _zip( + self, left_input: List[RefBundle], right_input: List[RefBundle] + ) -> Tuple[List[RefBundle], 
StatsDict]: + """Zip the RefBundles from `left_input` and `right_input` together. + + Zip is done in 2 steps: aligning blocks, and zipping blocks from + both sides. + + Aligning blocks (optional): check the blocks from `left_input` and + `right_input` are aligned or not, i.e. if having different number of blocks, or + having different number of rows in some blocks. If not aligned, repartition the + smaller input with `_split_at_indices` to align with larger input. + + Zipping blocks: after blocks from both sides are aligned, zip + blocks from both sides together in parallel. + """ + left_blocks_with_metadata = [] + for bundle in left_input: + for block, meta in bundle.blocks: + left_blocks_with_metadata.append((block, meta)) + right_blocks_with_metadata = [] + for bundle in right_input: + for block, meta in bundle.blocks: + right_blocks_with_metadata.append((block, meta)) + + left_block_rows, left_block_bytes = self._calculate_blocks_rows_and_bytes( + left_blocks_with_metadata + ) + right_block_rows, right_block_bytes = self._calculate_blocks_rows_and_bytes( + right_blocks_with_metadata + ) + + # Check that both sides have the same number of rows. + # TODO(Clark): Support different number of rows via user-directed + # dropping/padding. + total_left_rows = sum(left_block_rows) + total_right_rows = sum(right_block_rows) + if total_left_rows != total_right_rows: + raise ValueError( + "Cannot zip datasets of different number of rows: " + f"{total_left_rows}, {total_right_rows}" + ) + + # Whether the left and right input sides are inverted + input_side_inverted = False + if sum(right_block_bytes) > sum(left_block_bytes): + # Make sure that right side is smaller, so we minimize splitting + # work when aligning both sides. + # TODO(Clark): Improve this heuristic for minimizing splitting work, + # e.g. by generating the splitting plans for each route (via + # _generate_per_block_split_indices) and choosing the plan that splits + # the least cumulative bytes. 
+ left_blocks_with_metadata, right_blocks_with_metadata = ( + right_blocks_with_metadata, + left_blocks_with_metadata, + ) + left_block_rows, right_block_rows = right_block_rows, left_block_rows + input_side_inverted = True + + # Get the split indices that will align both sides. + indices = list(itertools.accumulate(left_block_rows)) + indices.pop(-1) + + # Split other at the alignment indices, such that for every block from + # left side, we have a list of blocks from right side that have the same + # cumulative number of rows as that left block. + # NOTE: _split_at_indices has a no-op fastpath if the blocks are already + # aligned. + aligned_right_blocks_with_metadata = _split_at_indices( + right_blocks_with_metadata, + indices, + block_rows=right_block_rows, + ) + del right_blocks_with_metadata + + left_blocks = [b for b, _ in left_blocks_with_metadata] + right_blocks_list = aligned_right_blocks_with_metadata[0] + del left_blocks_with_metadata, aligned_right_blocks_with_metadata + + zip_one_block = cached_remote_fn(_zip_one_block, num_returns=2) + + output_blocks = [] + output_metadata = [] + for left_block, right_blocks in zip(left_blocks, right_blocks_list): + # For each block from left side, zip it together with 1 or more blocks from + # right side. We're guaranteed to have that left_block has the same number + # of rows as right_blocks has cumulatively. + res, meta = zip_one_block.remote( + left_block, *right_blocks, inverted=input_side_inverted + ) + output_blocks.append(res) + output_metadata.append(meta) + + # Early release memory. + del left_blocks, right_blocks_list + + # TODO(ekl) it might be nice to have a progress bar here. 
+ output_metadata = ray.get(output_metadata) + output_refs = [] + input_owned = all(b.owns_blocks for b in left_input) + for block, meta in zip(output_blocks, output_metadata): + output_refs.append( + RefBundle( + [ + ( + block, + meta, + ) + ], + owns_blocks=input_owned, + ) + ) + stats = {self._name: output_metadata} + + # Clean up inputs. + for ref in left_input: + ref.destroy_if_owned() + for ref in right_input: + ref.destroy_if_owned() + + return output_refs, stats + + def _calculate_blocks_rows_and_bytes( + self, + blocks_with_metadata: BlockPartition, + ) -> Tuple[List[int], List[int]]: + """Calculate the number of rows and size in bytes for a list of blocks with + metadata. + """ + get_num_rows_and_bytes = cached_remote_fn(_get_num_rows_and_bytes) + block_rows = [] + block_bytes = [] + for block, metadata in blocks_with_metadata: + if metadata.num_rows is None or metadata.size_bytes is None: + # Need to fetch number of rows or size in bytes, so just fetch both. + num_rows, size_bytes = ray.get(get_num_rows_and_bytes.remote(block)) + # Cache on the block metadata. + metadata.num_rows = num_rows + metadata.size_bytes = size_bytes + block_rows.append(metadata.num_rows) + block_bytes.append(metadata.size_bytes) + return block_rows, block_bytes + + +def _zip_one_block( + block: Block, *other_blocks: Block, inverted: bool = False +) -> Tuple[Block, BlockMetadata]: + """Zip together `block` with `other_blocks`.""" + stats = BlockExecStats.builder() + # Concatenate other blocks. + # TODO(Clark): Extend BlockAccessor.zip() to work with N other blocks, + # so we don't need to do this concatenation. + builder = DelegatingBlockBuilder() + for other_block in other_blocks: + builder.add_block(other_block) + other_block = builder.build() + if inverted: + # Swap blocks if ordering was inverted during block alignment splitting. + block, other_block = other_block, block + # Zip block and other blocks. 
+ result = BlockAccessor.for_block(block).zip(other_block) + br = BlockAccessor.for_block(result) + return result, br.get_metadata(exec_stats=stats.build()) + + +def _get_num_rows_and_bytes(block: Block) -> Tuple[int, int]: + block = BlockAccessor.for_block(block) + return block.num_rows(), block.size_bytes() diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/resource_manager.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/resource_manager.py new file mode 100644 index 0000000000000000000000000000000000000000..914869380ec6c6fbec0d2bae8b52cf6b9b307dd4 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/resource_manager.py @@ -0,0 +1,651 @@ +import logging +import os +import time +from abc import ABC, abstractmethod +from collections import defaultdict +from typing import TYPE_CHECKING, Callable, Dict, Iterable, List, Optional + +from ray.data._internal.execution.interfaces.execution_options import ( + ExecutionOptions, + ExecutionResources, +) +from ray.data._internal.execution.interfaces.physical_operator import PhysicalOperator +from ray.data._internal.execution.operators.input_data_buffer import InputDataBuffer +from ray.data._internal.execution.util import memory_string +from ray.data.context import DataContext + +if TYPE_CHECKING: + from ray.data._internal.execution.streaming_executor_state import Topology + + +logger = logging.getLogger(__name__) +DEBUG_RESOURCE_MANAGER = os.environ.get("RAY_DATA_DEBUG_RESOURCE_MANAGER", "0") == "1" + + +class ResourceManager: + """A class that manages the resource usage of a streaming executor.""" + + # The interval in seconds at which the global resource limits are refreshed. + GLOBAL_LIMITS_UPDATE_INTERVAL_S = 10 + + # The fraction of the object store capacity that will be used as the default object + # store memory limit for the streaming executor, + # when `ReservationOpResourceAllocator` is enabled. 
+ DEFAULT_OBJECT_STORE_MEMORY_LIMIT_FRACTION = 0.5 + + # The fraction of the object store capacity that will be used as the default object + # store memory limit for the streaming executor, + # when `ReservationOpResourceAllocator` is not enabled. + DEFAULT_OBJECT_STORE_MEMORY_LIMIT_FRACTION_NO_RESERVATION = 0.25 + + def __init__( + self, + topology: "Topology", + options: ExecutionOptions, + get_total_resources: Callable[[], ExecutionResources], + data_context: DataContext, + ): + self._topology = topology + self._options = options + self._get_total_resources = get_total_resources + self._global_limits = ExecutionResources.zero() + self._global_limits_last_update_time = 0 + self._global_usage = ExecutionResources.zero() + self._global_running_usage = ExecutionResources.zero() + self._global_pending_usage = ExecutionResources.zero() + self._op_usages: Dict[PhysicalOperator, ExecutionResources] = {} + self._op_running_usages: Dict[PhysicalOperator, ExecutionResources] = {} + self._op_pending_usages: Dict[PhysicalOperator, ExecutionResources] = {} + # Object store memory usage internal to the operator, including the + # pending task outputs and op's internal output buffers. + self._mem_op_internal: Dict[PhysicalOperator, int] = defaultdict(int) + # Object store memory usage of the blocks that have been taken out of + # the operator, including the external output buffer in OpState, and the + # input buffers of the downstream operators. + self._mem_op_outputs: Dict[PhysicalOperator, int] = defaultdict(int) + # Whether to print debug information. + self._debug = DEBUG_RESOURCE_MANAGER + + self._downstream_fraction: Dict[PhysicalOperator, float] = {} + self._downstream_object_store_memory: Dict[PhysicalOperator, float] = {} + + self._op_resource_allocator: Optional["OpResourceAllocator"] = None + + if data_context.op_resource_reservation_enabled: + # We'll enable memory reservation if all operators have + # implemented accurate memory accounting. 
+ should_enable = all( + op.implements_accurate_memory_accounting() for op in topology + ) + if should_enable: + self._op_resource_allocator = ReservationOpResourceAllocator( + self, data_context.op_resource_reservation_ratio + ) + + self._object_store_memory_limit_fraction = ( + data_context.override_object_store_memory_limit_fraction + if data_context.override_object_store_memory_limit_fraction is not None + else ( + self.DEFAULT_OBJECT_STORE_MEMORY_LIMIT_FRACTION + if self.op_resource_allocator_enabled() + else self.DEFAULT_OBJECT_STORE_MEMORY_LIMIT_FRACTION_NO_RESERVATION + ) + ) + + def _estimate_object_store_memory(self, op, state) -> int: + # Don't count input refs towards dynamic memory usage, as they have been + # pre-created already outside this execution. + if isinstance(op, InputDataBuffer): + return 0 + + # Pending task outputs. + mem_op_internal = op.metrics.obj_store_mem_pending_task_outputs or 0 + # Op's internal output buffers. + mem_op_internal += op.metrics.obj_store_mem_internal_outqueue + + # Op's external output buffer. + mem_op_outputs = state.outqueue_memory_usage() + # Input buffers of the downstream operators. + for next_op in op.output_dependencies: + mem_op_outputs += ( + next_op.metrics.obj_store_mem_internal_inqueue + + next_op.metrics.obj_store_mem_pending_task_inputs + ) + + self._mem_op_internal[op] = mem_op_internal + self._mem_op_outputs[op] = mem_op_outputs + + return mem_op_internal + mem_op_outputs + + def update_usages(self): + """Recalculate resource usages.""" + # TODO(hchen): This method will be called frequently during the execution loop. + # And some computations are redundant. We should either remove redundant + # computations or remove this method entirely and compute usages on demand. 
+ self._global_usage = ExecutionResources(0, 0, 0) + self._global_running_usage = ExecutionResources(0, 0, 0) + self._global_pending_usage = ExecutionResources(0, 0, 0) + self._op_usages.clear() + self._op_running_usages.clear() + self._op_pending_usages.clear() + self._downstream_fraction.clear() + self._downstream_object_store_memory.clear() + + # Iterate from last to first operator. + num_ops_so_far = 0 + num_ops_total = len(self._topology) + for op, state in reversed(self._topology.items()): + # Update `self._op_usages`, `self._op_running_usages`, + # and `self._op_pending_usages`. + op.update_resource_usage() + op_usage = op.current_processor_usage() + op_running_usage = op.running_processor_usage() + op_pending_usage = op.pending_processor_usage() + + assert not op_usage.object_store_memory + assert not op_running_usage.object_store_memory + assert not op_pending_usage.object_store_memory + op_usage.object_store_memory = self._estimate_object_store_memory(op, state) + op_running_usage.object_store_memory = self._estimate_object_store_memory( + op, state + ) + self._op_usages[op] = op_usage + self._op_running_usages[op] = op_running_usage + self._op_pending_usages[op] = op_pending_usage + + # Update `self._global_usage`, `self._global_running_usage`, + # and `self._global_pending_usage`. + self._global_usage = self._global_usage.add(op_usage) + self._global_running_usage = self._global_running_usage.add( + op_running_usage + ) + self._global_pending_usage = self._global_pending_usage.add( + op_pending_usage + ) + + # Update `self._downstream_fraction` and `_downstream_object_store_memory`. + # Subtract one from denom to account for input buffer. 
+ f = (1.0 + num_ops_so_far) / max(1.0, num_ops_total - 1.0) + num_ops_so_far += 1 + self._downstream_fraction[op] = min(1.0, f) + self._downstream_object_store_memory[ + op + ] = self._global_usage.object_store_memory + + # Update operator's object store usage, which is used by + # DatasetStats and updated on the Ray Data dashboard. + op._metrics.obj_store_mem_used = op_usage.object_store_memory + + if self._op_resource_allocator is not None: + self._op_resource_allocator.update_usages() + + def get_global_usage(self) -> ExecutionResources: + """Return the global resource usage at the current time.""" + return self._global_usage + + def get_global_running_usage(self) -> ExecutionResources: + """Return the global running resource usage at the current time.""" + return self._global_running_usage + + def get_global_pending_usage(self) -> ExecutionResources: + """Return the global pending resource usage at the current time.""" + return self._global_pending_usage + + def get_global_limits(self) -> ExecutionResources: + """Return the global resource limits at the current time. + + This method autodetects any unspecified execution resource limits based on the + current cluster size, refreshing these values periodically to support cluster + autoscaling. 
+ """ + if ( + time.time() - self._global_limits_last_update_time + < self.GLOBAL_LIMITS_UPDATE_INTERVAL_S + ): + return self._global_limits + + self._global_limits_last_update_time = time.time() + default_limits = self._options.resource_limits + exclude = self._options.exclude_resources + total_resources = self._get_total_resources() + default_mem_fraction = self._object_store_memory_limit_fraction + total_resources.object_store_memory *= default_mem_fraction + self._global_limits = default_limits.min(total_resources).subtract(exclude) + return self._global_limits + + def get_op_usage(self, op: PhysicalOperator) -> ExecutionResources: + """Return the resource usage of the given operator at the current time.""" + return self._op_usages[op] + + def get_op_usage_str(self, op: PhysicalOperator) -> str: + """Return a human-readable string representation of the resource usage of + the given operator.""" + usage_str = f"{self._op_running_usages[op].cpu:.1f} CPU" + if self._op_running_usages[op].gpu: + usage_str += f", {self._op_running_usages[op].gpu:.1f} GPU" + usage_str += ( + f", {self._op_running_usages[op].object_store_memory_str()} object store" + ) + if self._debug: + usage_str += ( + f" (in={memory_string(self._mem_op_internal[op])}," + f"out={memory_string(self._mem_op_outputs[op])})" + ) + if ( + isinstance(self._op_resource_allocator, ReservationOpResourceAllocator) + and op in self._op_resource_allocator._op_budgets + ): + budget = self._op_resource_allocator._op_budgets[op] + usage_str += f", budget=(cpu={budget.cpu:.1f}" + usage_str += f",gpu={budget.gpu:.1f}" + usage_str += f",object store={budget.object_store_memory_str()})" + return usage_str + + def get_downstream_fraction(self, op: PhysicalOperator) -> float: + """Return the downstream fraction of the given operator.""" + return self._downstream_fraction[op] + + def get_downstream_object_store_memory(self, op: PhysicalOperator) -> float: + """Return the downstream object store memory usage of the given 
operator.""" + return self._downstream_object_store_memory[op] + + def op_resource_allocator_enabled(self) -> bool: + """Return whether OpResourceAllocator is enabled.""" + return self._op_resource_allocator is not None + + @property + def op_resource_allocator(self) -> "OpResourceAllocator": + """Return the OpResourceAllocator.""" + assert self._op_resource_allocator is not None + return self._op_resource_allocator + + +class OpResourceAllocator(ABC): + """An interface for dynamic operator resource allocation. + + This interface allows dynamically allocating available resources to each operator, + limiting how many tasks each operator can submit, and how much data each operator + can read from its running tasks. + """ + + def __init__(self, resource_manager: ResourceManager): + self._resource_manager = resource_manager + + @abstractmethod + def update_usages(self): + """Callback to update resource usages.""" + ... + + @abstractmethod + def can_submit_new_task(self, op: PhysicalOperator) -> bool: + """Return whether the given operator can submit a new task.""" + ... + + @abstractmethod + def max_task_output_bytes_to_read(self, op: PhysicalOperator) -> Optional[int]: + """Return the maximum bytes of pending task outputs can be read for + the given operator. None means no limit.""" + ... + + @abstractmethod + def get_budget(self, op: PhysicalOperator) -> ExecutionResources: + """Return the budget for the given operator.""" + ... + + +class ReservationOpResourceAllocator(OpResourceAllocator): + """An OpResourceAllocator implementation that reserves resources for each operator. + + This class reserves memory and CPU resources for eligible operators, and considers + runtime resource usages to limit the resources that each operator can use. + + It works in the following way: + 1. An operator is eligible for resource reservation, if it has enabled throttling + and hasn't completed. 
Ineligible operators are not throttled, but + their usage will be accounted for their upstream eligible operators. E.g., for + such a dataset "map1->limit->map2->streaming_split", we'll treat "map1->limit" as + a group and "map2->streaming_split" as another group. + 2. For each eligible operator, we reserve `reservation_ratio * global_resources / + num_eligible_ops` resources, half of which is reserved only for the operator + outputs, excluding pending task outputs. + 3. Non-reserved resources are shared among all operators. + 4. In each scheduling iteration, each eligible operator will get "remaining of their + own reserved resources" + "remaining of shared resources / num_eligible_ops" + resources. + + The `reservation_ratio` is set to 50% by default. Users can tune this value to + adjust how aggressive or conservative the resource allocation is. A higher value + will make the resource allocation more even, but may lead to underutilization and + worse performance. And vice versa. + """ + + class IdleDetector: + """Utility class for detecting idle operators. + + Note, stalling can happen when there are less resources than Data executor + expects. E.g., when some resources are preempted by non-Data code, see + `test_no_deadlock_on_resource_contention` as an example. + + This class is used to detect potential stalling and allow the execution + to make progress. + """ + + # The interval to detect idle operators. + # When downstream is idle, we'll allow reading at least one task output + # per this interval, + DETECTION_INTERVAL_S = 10.0 + # Print a warning if an operator is idle for this time. + WARN_ON_IDLE_TIME_S = 60.0 + # Whether a warning has been printed. 
+ _warn_printed = False + + def __init__(self): + # per-op fields + self.last_num_outputs = defaultdict(int) + self.last_output_time = defaultdict(lambda: time.time()) + self.last_detection_time = defaultdict(lambda: time.time()) + + def detect_idle(self, op: PhysicalOperator): + cur_time = time.time() + if cur_time - self.last_detection_time[op] > self.DETECTION_INTERVAL_S: + cur_num_outputs = op.metrics.num_task_outputs_generated + if cur_num_outputs > self.last_num_outputs[op]: + self.last_num_outputs[op] = cur_num_outputs + self.last_output_time[op] = cur_time + self.last_detection_time[op] = cur_time + else: + self.last_detection_time[op] = cur_time + self.print_warning_if_idle_for_too_long( + op, cur_time - self.last_output_time[op] + ) + return True + return False + + @classmethod + def print_warning_if_idle_for_too_long( + cls, op: PhysicalOperator, idle_time: float + ): + """Print a warning if an operator is idle for too long.""" + if idle_time < cls.WARN_ON_IDLE_TIME_S or cls._warn_printed: + return + cls._warn_printed = True + msg = ( + f"Operator {op} is running but has no outputs for {idle_time} seconds." + " Execution may be slower than expected.\n" + "Ignore this warning if your UDF is expected to be slow." + " Otherwise, this can happen when there are fewer cluster resources" + " available to Ray Data than expected." + " If you have non-Data tasks or actors running in the cluster, exclude" + " their resources from Ray Data with" + " `DataContext.get_current().execution_options.exclude_resources`." + " This message will only print once." + ) + logger.warning(msg) + + def __init__(self, resource_manager: ResourceManager, reservation_ratio: float): + super().__init__(resource_manager) + self._reservation_ratio = reservation_ratio + assert 0.0 <= self._reservation_ratio <= 1.0 + # Per-op reserved resources, excluding `_reserved_for_op_outputs`. 
+ self._op_reserved: Dict[PhysicalOperator, ExecutionResources] = {} + # Memory reserved exclusively for the outputs of each operator. + # "Op outputs" refer to blocks that have been taken out of an operator, + # i.e., `RessourceManager._mem_op_outputs`. + # + # Note, if we don't reserve memory for op outputs, all the budget may be used by + # the pending task outputs, and/or op's internal output buffers (the latter can + # happen when `preserve_order=True`). + # Then we'll have no budget to pull blocks from the op. + self._reserved_for_op_outputs: Dict[PhysicalOperator, float] = {} + # Total shared resources. + self._total_shared = ExecutionResources.zero() + # Resource budgets for each operator, excluding `_reserved_for_op_outputs`. + self._op_budgets: Dict[PhysicalOperator, ExecutionResources] = {} + # Whether each operator has reserved the minimum resources to run + # at least one task. + # This is used to avoid edge cases where the entire resource limits are not + # enough to run one task of each op. + # See `test_no_deadlock_on_small_cluster_resources` as an example. 
+ self._reserved_min_resources: Dict[PhysicalOperator, bool] = {} + + self._cached_global_limits = ExecutionResources.zero() + self._cached_num_eligible_ops = 0 + + self._idle_detector = self.IdleDetector() + + def _is_op_eligible(self, op: PhysicalOperator) -> bool: + """Whether the op is eligible for memory reservation.""" + return not op.throttling_disabled() and not op.completed() + + def _get_eligible_ops(self) -> List[PhysicalOperator]: + return [ + op for op in self._resource_manager._topology if self._is_op_eligible(op) + ] + + def _update_reservation(self): + global_limits = self._resource_manager.get_global_limits() + eligible_ops = self._get_eligible_ops() + + if ( + global_limits == self._cached_global_limits + and len(eligible_ops) == self._cached_num_eligible_ops + ): + return + self._cached_global_limits = global_limits + self._cached_num_eligible_ops = len(eligible_ops) + + self._op_reserved.clear() + self._reserved_for_op_outputs.clear() + self._reserved_min_resources.clear() + self._total_shared = global_limits.copy() + + if len(eligible_ops) == 0: + return + + # Reserve `reservation_ratio * global_limits / num_ops` resources for each + # operator. + default_reserved = global_limits.scale( + self._reservation_ratio / (len(eligible_ops)) + ) + for op in eligible_ops: + # Reserve at least half of the default reserved resources for the outputs. + # This makes sure that we will have enough budget to pull blocks from the + # op. + self._reserved_for_op_outputs[op] = max( + default_reserved.object_store_memory / 2, 1.0 + ) + # Calculate the minimum amount of resources to reserve. + # 1. Make sure the reserved resources are at least to allow one task. + min_reserved = op.incremental_resource_usage().copy() + # 2. To ensure that all GPUs are utilized, reserve enough resource budget + # to launch one task for each worker. 
+ if op.base_resource_usage().gpu > 0: + min_workers = sum( + pool.min_size() for pool in op.get_autoscaling_actor_pools() + ) + min_reserved.object_store_memory *= min_workers + # Also include `reserved_for_op_outputs`. + min_reserved.object_store_memory += self._reserved_for_op_outputs[op] + # Total resources we want to reserve for this operator. + op_total_reserved = default_reserved.max(min_reserved) + if op_total_reserved.satisfies_limit(self._total_shared): + # If the remaining resources are enough to reserve `op_total_reserved`, + # subtract it from `self._total_shared` and reserve it for this op. + self._reserved_min_resources[op] = True + self._total_shared = self._total_shared.subtract(op_total_reserved) + self._op_reserved[op] = op_total_reserved + self._op_reserved[ + op + ].object_store_memory -= self._reserved_for_op_outputs[op] + else: + # If the remaining resources are not enough to reserve the minimum + # resources for this operator, we'll only reserve the minimum object + # store memory, but not the CPU and GPU resources. + # Because Ray Core doesn't allow CPU/GPU resources to be oversubscribed. + # Note, we reserve minimum resources first for the upstream + # ops. Downstream ops need to wait for upstream ops to finish + # and release resources. 
+ self._reserved_min_resources[op] = False + self._op_reserved[op] = ExecutionResources( + 0, + 0, + min_reserved.object_store_memory + - self._reserved_for_op_outputs[op], + ) + self._total_shared = self._total_shared.subtract( + ExecutionResources(0, 0, min_reserved.object_store_memory) + ) + + self._total_shared = self._total_shared.max(ExecutionResources.zero()) + + def can_submit_new_task(self, op: PhysicalOperator) -> bool: + if op not in self._op_budgets: + return True + budget = self._op_budgets[op] + res = op.incremental_resource_usage().satisfies_limit(budget) + return res + + def get_budget(self, op: PhysicalOperator) -> ExecutionResources: + return self._op_budgets[op] + + def _should_unblock_streaming_output_backpressure( + self, op: PhysicalOperator + ) -> bool: + # In some edge cases, the downstream operators may have no enough resources to + # launch tasks. Then we should temporarily unblock the streaming output + # backpressure by allowing reading at least 1 block. So the current operator + # can finish at least one task and yield resources to the downstream operators. + for next_op in self._get_downstream_eligible_ops(op): + if not self._reserved_min_resources[next_op]: + # Case 1: the downstream operator hasn't reserved the minimum resources + # to run at least one task. + return True + # Case 2: the downstream operator has reserved the minimum resources, but + # the resources are preempted by non-Data tasks or actors. + # We don't have a good way to detect this case, so we'll unblock + # backpressure when the downstream operator has been idle for a while. + if self._idle_detector.detect_idle(next_op): + return True + return False + + def _get_op_outputs_usage_with_downstream(self, op: PhysicalOperator) -> float: + """Get the outputs memory usage of the given operator, including the downstream + ineligible operators. + """ + # Outputs usage of the current operator. 
+ op_outputs_usage = self._resource_manager._mem_op_outputs[op] + # Also account the downstream ineligible operators' memory usage. + op_outputs_usage += sum( + self._resource_manager.get_op_usage(next_op).object_store_memory + for next_op in self._get_downstream_ineligible_ops(op) + ) + return op_outputs_usage + + def max_task_output_bytes_to_read(self, op: PhysicalOperator) -> Optional[int]: + if op not in self._op_budgets: + return None + res = self._op_budgets[op].object_store_memory + # Add the remaining of `_reserved_for_op_outputs`. + op_outputs_usage = self._get_op_outputs_usage_with_downstream(op) + res += max(self._reserved_for_op_outputs[op] - op_outputs_usage, 0) + res = int(res) + assert res >= 0 + if res == 0 and self._should_unblock_streaming_output_backpressure(op): + res = 1 + return res + + def _get_downstream_ineligible_ops( + self, op: PhysicalOperator + ) -> Iterable[PhysicalOperator]: + """Get the downstream ineligible operators of the given operator. + + E.g., + - "cur_map->downstream_map" will return an empty list. + - "cur_map->limit1->limit2->downstream_map" will return [limit1, limit2]. + """ + for next_op in op.output_dependencies: + if not self._is_op_eligible(next_op): + yield next_op + yield from self._get_downstream_ineligible_ops(next_op) + + def _get_downstream_eligible_ops( + self, op: PhysicalOperator + ) -> Iterable[PhysicalOperator]: + """Get the downstream eligible operators of the given operator, ignoring + intermediate ineligible operators. + + E.g., + - "cur_map->downstream_map" will return [downstream_map]. + - "cur_map->limit1->limit2->downstream_map" will return [downstream_map]. 
+ """ + for next_op in op.output_dependencies: + if self._is_op_eligible(next_op): + yield next_op + else: + yield from self._get_downstream_eligible_ops(next_op) + + def update_usages(self): + self._update_reservation() + + self._op_budgets.clear() + eligible_ops = self._get_eligible_ops() + if len(eligible_ops) == 0: + return + + # Remaining of shared resources. + remaining_shared = self._total_shared + for op in eligible_ops: + # Calculate the memory usage of the operator. + op_mem_usage = 0 + # Add the memory usage of the operator itself, + # excluding `_reserved_for_op_outputs`. + op_mem_usage += self._resource_manager._mem_op_internal[op] + # Add the portion of op outputs usage that has + # exceeded `_reserved_for_op_outputs`. + op_outputs_usage = self._get_op_outputs_usage_with_downstream(op) + op_mem_usage += max(op_outputs_usage - self._reserved_for_op_outputs[op], 0) + op_usage = self._resource_manager.get_op_usage(op).copy() + op_usage.object_store_memory = op_mem_usage + op_reserved = self._op_reserved[op] + # How much of the reserved resources are remaining. + op_reserved_remaining = op_reserved.subtract(op_usage).max( + ExecutionResources.zero() + ) + self._op_budgets[op] = op_reserved_remaining + # How much of the reserved resources are exceeded. + # If exceeded, we need to subtract from the remaining shared resources. + op_reserved_exceeded = op_usage.subtract(op_reserved).max( + ExecutionResources.zero() + ) + remaining_shared = remaining_shared.subtract(op_reserved_exceeded) + + remaining_shared = remaining_shared.max(ExecutionResources.zero()) + + # Allocate the remaining shared resources to each operator. + for i, op in enumerate(reversed(eligible_ops)): + # By default, divide the remaining shared resources equally. + op_shared = remaining_shared.scale(1.0 / (len(eligible_ops) - i)) + # But if the op's budget is less than `incremental_resource_usage`, + # it will be useless. 
So we'll let the downstream operator + # borrow some resources from the upstream operator, if remaining_shared + # is still enough. + to_borrow = ( + op.incremental_resource_usage() + .subtract(self._op_budgets[op].add(op_shared)) + .max(ExecutionResources.zero()) + ) + if not to_borrow.is_zero() and op_shared.add(to_borrow).satisfies_limit( + remaining_shared + ): + op_shared = op_shared.add(to_borrow) + remaining_shared = remaining_shared.subtract(op_shared) + assert remaining_shared.is_non_negative(), ( + remaining_shared, + op, + op_shared, + to_borrow, + ) + self._op_budgets[op] = self._op_budgets[op].add(op_shared) + # We don't limit GPU resources, as not all operators + # use GPU resources. + self._op_budgets[op].gpu = float("inf") diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/streaming_executor.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/streaming_executor.py new file mode 100644 index 0000000000000000000000000000000000000000..9342bebc098c86c72c9116a3f3d8d17ea4fb7ef3 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/streaming_executor.py @@ -0,0 +1,502 @@ +import logging +import threading +import time +import uuid +from typing import Dict, Iterator, List, Optional + +from ray.data._internal.execution.autoscaler import create_autoscaler +from ray.data._internal.execution.backpressure_policy import ( + BackpressurePolicy, + get_backpressure_policies, +) +from ray.data._internal.execution.execution_callback import get_execution_callbacks +from ray.data._internal.execution.interfaces import ( + ExecutionResources, + Executor, + OutputIterator, + PhysicalOperator, + RefBundle, +) +from ray.data._internal.execution.operators.input_data_buffer import InputDataBuffer +from ray.data._internal.execution.resource_manager import ResourceManager +from ray.data._internal.execution.streaming_executor_state import ( + OpState, + Topology, + build_streaming_topology, + 
process_completed_tasks, + select_operator_to_run, + update_operator_states, +) +from ray.data._internal.logging import get_log_directory +from ray.data._internal.progress_bar import ProgressBar +from ray.data._internal.stats import DatasetStats, StatsManager +from ray.data.context import OK_PREFIX, WARN_PREFIX, DataContext + +logger = logging.getLogger(__name__) + +# Force a progress bar update after this many events processed . This avoids the +# progress bar seeming to stall for very large scale workloads. +PROGRESS_BAR_UPDATE_INTERVAL = 50 + +# Interval for logging execution progress updates and operator metrics. +DEBUG_LOG_INTERVAL_SECONDS = 5 + +# Visible for testing. +_num_shutdown = 0 + + +class StreamingExecutor(Executor, threading.Thread): + """A streaming Dataset executor. + + This implementation executes Dataset DAGs in a fully streamed way. It runs + by setting up the operator topology, and then routing blocks through operators in + a way that maximizes throughput under resource constraints. + """ + + def __init__(self, data_context: DataContext, dataset_tag: str = "unknown_dataset"): + self._data_context = data_context + self._start_time: Optional[float] = None + self._initial_stats: Optional[DatasetStats] = None + self._final_stats: Optional[DatasetStats] = None + self._global_info: Optional[ProgressBar] = None + + self._execution_id = uuid.uuid4().hex + + # The executor can be shutdown while still running. + self._shutdown_lock = threading.RLock() + self._execution_started = False + self._shutdown = False + + # Internal execution state shared across thread boundaries. We run the control + # loop on a separate thread so that it doesn't become stalled between + # generator `yield`s. 
+ self._topology: Optional[Topology] = None + self._output_node: Optional[OpState] = None + self._backpressure_policies: List[BackpressurePolicy] = [] + + self._dataset_tag = dataset_tag + # Stores if an operator is completed, + # used for marking when an op has just completed. + self._has_op_completed: Optional[Dict[PhysicalOperator, bool]] = None + self._max_errored_blocks = self._data_context.max_errored_blocks + self._num_errored_blocks = 0 + + self._last_debug_log_time = 0 + + Executor.__init__(self, self._data_context.execution_options) + thread_name = f"StreamingExecutor-{self._execution_id}" + threading.Thread.__init__(self, daemon=True, name=thread_name) + + def execute( + self, dag: PhysicalOperator, initial_stats: Optional[DatasetStats] = None + ) -> Iterator[RefBundle]: + """Executes the DAG using a streaming execution strategy. + + We take an event-loop approach to scheduling. We block on the next scheduling + event using `ray.wait`, updating operator state and dispatching new tasks. + """ + + self._initial_stats = initial_stats + self._start_time = time.perf_counter() + + if not isinstance(dag, InputDataBuffer): + if self._data_context.print_on_execution_start: + message = "Starting execution of Dataset." + log_path = get_log_directory() + if log_path is not None: + message += f" Full logs are in {log_path}" + logger.info(message) + logger.info(f"Execution plan of Dataset: {dag}") + + logger.debug("Execution config: %s", self._options) + + # Note: DAG must be initialized in order to query num_outputs_total. + # Note: Initialize global progress bar before building the streaming + # topology so bars are created in the same order as they should be + # displayed. This is done to ensure correct ordering within notebooks. 
+ # TODO(zhilong): Implement num_output_rows_total for all + # AllToAllOperators + self._global_info = ProgressBar( + "Running", dag.num_output_rows_total(), unit="row" + ) + + # Setup the streaming DAG topology and start the runner thread. + self._topology, _ = build_streaming_topology(dag, self._options) + self._resource_manager = ResourceManager( + self._topology, + self._options, + lambda: self._autoscaler.get_total_resources(), + self._data_context, + ) + self._backpressure_policies = get_backpressure_policies(self._topology) + self._autoscaler = create_autoscaler( + self._topology, + self._resource_manager, + self._execution_id, + ) + + self._has_op_completed = {op: False for op in self._topology} + + self._output_node: OpState = self._topology[dag] + StatsManager.register_dataset_to_stats_actor( + self._dataset_tag, + self._get_operator_tags(), + ) + for callback in get_execution_callbacks(self._data_context): + callback.before_execution_starts() + + self.start() + self._execution_started = True + + class StreamIterator(OutputIterator): + def __init__(self, outer: Executor): + self._outer = outer + + def get_next(self, output_split_idx: Optional[int] = None) -> RefBundle: + try: + item = self._outer._output_node.get_output_blocking( + output_split_idx + ) + if self._outer._global_info: + self._outer._global_info.update( + item.num_rows(), dag.num_output_rows_total() + ) + return item + # Needs to be BaseException to catch KeyboardInterrupt. Otherwise we + # can leave dangling progress bars by skipping shutdown. 
+ except BaseException as e: + self._outer.shutdown(isinstance(e, StopIteration)) + raise + + def __del__(self): + self._outer.shutdown() + + return StreamIterator(self) + + def __del__(self): + self.shutdown() + + def shutdown(self, execution_completed: bool = True): + global _num_shutdown + + with self._shutdown_lock: + if not self._execution_started or self._shutdown: + return + logger.debug(f"Shutting down {self}.") + _num_shutdown += 1 + self._shutdown = True + # Give the scheduling loop some time to finish processing. + self.join(timeout=2.0) + self._update_stats_metrics( + state="FINISHED" if execution_completed else "FAILED", + force_update=True, + ) + # Once Dataset execution completes, mark it as complete + # and remove last cached execution stats. + StatsManager.clear_last_execution_stats(self._dataset_tag) + # Freeze the stats and save it. + self._final_stats = self._generate_stats() + stats_summary_string = self._final_stats.to_summary().to_string( + include_parent=False + ) + if self._data_context.enable_auto_log_stats: + logger.info(stats_summary_string) + # Close the progress bars from top to bottom to avoid them jumping + # around in the console after completion. + if self._global_info: + # Set the appropriate description that summarizes + # the result of dataset execution. + if execution_completed: + prog_bar_msg = ( + f"{OK_PREFIX} Dataset execution finished in " + f"{self._final_stats.time_total_s:.2f} seconds" + ) + else: + prog_bar_msg = f"{WARN_PREFIX} Dataset execution failed" + self._global_info.set_description(prog_bar_msg) + self._global_info.close() + for op, state in self._topology.items(): + op.shutdown() + state.close_progress_bars() + self._autoscaler.on_executor_shutdown() + + def run(self): + """Run the control loop in a helper thread. + + Results are returned via the output node's outqueue. + """ + try: + # Run scheduling loop until complete. 
+ while True: + t_start = time.process_time() + # use process_time to avoid timing ray.wait in _scheduling_loop_step + continue_sched = self._scheduling_loop_step(self._topology) + if self._initial_stats: + self._initial_stats.streaming_exec_schedule_s.add( + time.process_time() - t_start + ) + if not continue_sched or self._shutdown: + break + for callback in get_execution_callbacks(self._data_context): + callback.after_execution_succeeds() + except Exception as e: + # Propagate it to the result iterator. + self._output_node.mark_finished(e) + for callback in get_execution_callbacks(self._data_context): + callback.after_execution_fails(e) + finally: + # Signal end of results. + self._output_node.mark_finished() + + def get_stats(self): + """Return the stats object for the streaming execution. + + The stats object will be updated as streaming execution progresses. + """ + if self._final_stats: + return self._final_stats + else: + return self._generate_stats() + + def _generate_stats(self) -> DatasetStats: + """Create a new stats object reflecting execution status so far.""" + stats = self._initial_stats or DatasetStats(metadata={}, parent=None) + for op in self._topology: + if isinstance(op, InputDataBuffer): + continue + builder = stats.child_builder(op.name, override_start_time=self._start_time) + stats = builder.build_multioperator(op.get_stats()) + stats.extra_metrics = op.metrics.as_dict() + stats.streaming_exec_schedule_s = ( + self._initial_stats.streaming_exec_schedule_s + if self._initial_stats + else None + ) + return stats + + def _scheduling_loop_step(self, topology: Topology) -> bool: + """Run one step of the scheduling loop. + + This runs a few general phases: + 1. Waiting for the next task completion using `ray.wait()`. + 2. Pulling completed refs into operator outqueues. + 3. Selecting and dispatching new inputs to operators. + + Returns: + True if we should continue running the scheduling loop. 
+ """ + self._resource_manager.update_usages() + # Note: calling process_completed_tasks() is expensive since it incurs + # ray.wait() overhead, so make sure to allow multiple dispatch per call for + # greater parallelism. + num_errored_blocks = process_completed_tasks( + topology, + self._resource_manager, + self._max_errored_blocks, + ) + if self._max_errored_blocks > 0: + self._max_errored_blocks -= num_errored_blocks + self._num_errored_blocks += num_errored_blocks + + self._resource_manager.update_usages() + # Dispatch as many operators as we can for completed tasks. + self._report_current_usage() + op = select_operator_to_run( + topology, + self._resource_manager, + self._backpressure_policies, + self._autoscaler, + ensure_at_least_one_running=self._consumer_idling(), + ) + + i = 0 + while op is not None: + i += 1 + if i % PROGRESS_BAR_UPDATE_INTERVAL == 0: + self._refresh_progress_bars(topology) + topology[op].dispatch_next_task() + self._resource_manager.update_usages() + op = select_operator_to_run( + topology, + self._resource_manager, + self._backpressure_policies, + self._autoscaler, + ensure_at_least_one_running=self._consumer_idling(), + ) + + update_operator_states(topology) + self._refresh_progress_bars(topology) + + self._update_stats_metrics(state="RUNNING") + if time.time() - self._last_debug_log_time >= DEBUG_LOG_INTERVAL_SECONDS: + _log_op_metrics(topology) + _debug_dump_topology(topology, self._resource_manager) + self._last_debug_log_time = time.time() + + # Log metrics of newly completed operators. + for op in topology: + if op.completed() and not self._has_op_completed[op]: + log_str = ( + f"Operator {op} completed. " + f"Operator Metrics:\n{op._metrics.as_dict()}" + ) + logger.debug(log_str) + self._has_op_completed[op] = True + + # Keep going until all operators run to completion. 
+ return not all(op.completed() for op in topology) + + def _refresh_progress_bars(self, topology: Topology): + # Update the progress bar to reflect scheduling decisions. + for op_state in topology.values(): + op_state.refresh_progress_bar(self._resource_manager) + # Refresh the global progress bar to update elapsed time progress. + if self._global_info: + self._global_info.refresh() + + def _consumer_idling(self) -> bool: + """Returns whether the user thread is blocked on topology execution.""" + return len(self._output_node.outqueue) == 0 + + def _report_current_usage(self) -> None: + # running_usage is the amount of resources that have been requested but + # not necessarily available + # TODO(sofian) https://github.com/ray-project/ray/issues/47520 + # We need to split the reported resources into running, pending-scheduling, + # pending-node-assignment. + running_usage = self._resource_manager.get_global_running_usage() + pending_usage = self._resource_manager.get_global_pending_usage() + limits = self._resource_manager.get_global_limits() + resources_status = ( + # TODO(scottjlee): Add dataset name/ID to progress bar output. + "Running Dataset. Active & requested resources: " + f"{running_usage.cpu:.4g}/{limits.cpu:.4g} CPU, " + ) + if running_usage.gpu > 0: + resources_status += f"{running_usage.gpu:.4g}/{limits.gpu:.4g} GPU, " + resources_status += ( + f"{running_usage.object_store_memory_str()}/" + f"{limits.object_store_memory_str()} object store" + ) + + # Only include pending section when there are pending resources. 
+ if pending_usage.cpu or pending_usage.gpu: + if pending_usage.cpu and pending_usage.gpu: + pending_str = ( + f"{pending_usage.cpu:.4g} CPU, {pending_usage.gpu:.4g} GPU" + ) + elif pending_usage.cpu: + pending_str = f"{pending_usage.cpu:.4g} CPU" + else: + pending_str = f"{pending_usage.gpu:.4g} GPU" + resources_status += f" (pending: {pending_str})" + if self._global_info: + self._global_info.set_description(resources_status) + + def _get_operator_tags(self): + """Returns a list of operator tags.""" + return [f"{op.name}{i}" for i, op in enumerate(self._topology)] + + def _get_state_dict(self, state): + last_op, last_state = list(self._topology.items())[-1] + return { + "state": state, + "progress": last_state.num_completed_tasks, + "total": last_op.num_outputs_total(), + "end_time": time.time() if state != "RUNNING" else None, + "operators": { + f"{op.name}{i}": { + "name": op.name, + "progress": op_state.num_completed_tasks, + "total": op.num_outputs_total(), + "state": state, + } + for i, (op, op_state) in enumerate(self._topology.items()) + }, + } + + def _update_stats_metrics(self, state: str, force_update: bool = False): + StatsManager.update_execution_metrics( + self._dataset_tag, + [op.metrics for op in self._topology], + self._get_operator_tags(), + self._get_state_dict(state=state), + force_update=force_update, + ) + + +def _validate_dag(dag: PhysicalOperator, limits: ExecutionResources) -> None: + """Raises an exception on invalid DAGs. + + It checks if the the sum of min actor pool sizes are larger than the resource + limit, as well as other unsupported resource configurations. + + This should be called prior to creating the topology from the DAG. + + Args: + dag: The DAG to validate. + limits: The limits to validate against. 
+ """ + + seen = set() + + def walk(op): + seen.add(op) + for parent in op.input_dependencies: + if parent not in seen: + yield from walk(parent) + yield op + + base_usage = ExecutionResources(cpu=1) + for op in walk(dag): + base_usage = base_usage.add(op.base_resource_usage()) + + if not base_usage.satisfies_limit(limits): + error_message = ( + "The current cluster doesn't have the required resources to execute your " + "Dataset pipeline:\n" + ) + if base_usage.cpu > limits.cpu: + error_message += ( + f"- Your application needs {base_usage.cpu} CPU(s), but your cluster " + f"only has {limits.cpu}.\n" + ) + if base_usage.gpu > limits.gpu: + error_message += ( + f"- Your application needs {base_usage.gpu} GPU(s), but your cluster " + f"only has {limits.gpu}.\n" + ) + if base_usage.object_store_memory > limits.object_store_memory: + error_message += ( + f"- Your application needs {base_usage.object_store_memory}B object " + f"store memory, but your cluster only has " + f"{limits.object_store_memory}B.\n" + ) + raise ValueError(error_message.strip()) + + +def _debug_dump_topology(topology: Topology, resource_manager: ResourceManager) -> None: + """Log current execution state for the topology for debugging. + + Args: + topology: The topology to debug. + resource_manager: The resource manager for this topology. + """ + logger.debug("Execution Progress:") + for i, (op, state) in enumerate(topology.items()): + logger.debug( + f"{i}: {state.summary_str(resource_manager)}, " + f"Blocks Outputted: {state.num_completed_tasks}/{op.num_outputs_total()}" + ) + + +def _log_op_metrics(topology: Topology) -> None: + """Logs the metrics of each operator. + + Args: + topology: The topology to debug. 
+ """ + log_str = "Operator Metrics:\n" + for op in topology: + log_str += f"{op.name}: {op.metrics.as_dict()}\n" + logger.debug(log_str) diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/streaming_executor_state.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/streaming_executor_state.py new file mode 100644 index 0000000000000000000000000000000000000000..f91c4f536da6a5a62d5d2d4ab3c5feab2ac8edd6 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/streaming_executor_state.py @@ -0,0 +1,681 @@ +"""Contains classes that encapsulate streaming executor state. + +This is split out from streaming_executor.py to facilitate better unit testing. +""" + +import logging +import math +import threading +import time +from collections import defaultdict +from dataclasses import dataclass +from typing import Dict, List, Optional, Tuple + +import ray +from ray.data._internal.execution.autoscaler import Autoscaler +from ray.data._internal.execution.backpressure_policy import BackpressurePolicy +from ray.data._internal.execution.bundle_queue import create_bundle_queue +from ray.data._internal.execution.interfaces import ( + ExecutionOptions, + ExecutionResources, + PhysicalOperator, + RefBundle, +) +from ray.data._internal.execution.interfaces.physical_operator import ( + DataOpTask, + MetadataOpTask, + OpTask, + Waitable, +) +from ray.data._internal.execution.operators.base_physical_operator import ( + AllToAllOperator, +) +from ray.data._internal.execution.operators.input_data_buffer import InputDataBuffer +from ray.data._internal.execution.resource_manager import ResourceManager +from ray.data._internal.progress_bar import ProgressBar +from ray.data.context import DataContext + +logger = logging.getLogger(__name__) + +# Holds the full execution state of the streaming topology. It's a dict mapping each +# operator to tracked streaming exec state. 
+Topology = Dict[PhysicalOperator, "OpState"] + + +class OpBufferQueue: + """A FIFO queue to buffer RefBundles between upstream and downstream operators. + This class is thread-safe. + """ + + def __init__(self): + self._num_blocks = 0 + self._queue = create_bundle_queue() + self._num_per_split = defaultdict(int) + self._lock = threading.Lock() + # Used to buffer output RefBundles indexed by output splits. + self._outputs_by_split = defaultdict(create_bundle_queue) + super().__init__() + + @property + def memory_usage(self) -> int: + """The total memory usage of the queue in bytes.""" + with self._lock: + # The split queues contain bundles popped from the main queue. So, a bundle + # will either be in the main queue or in one of the split queues, and we + # don't need to worry about double counting. + return self._queue.estimate_size_bytes() + sum( + split_queue.estimate_size_bytes() + for split_queue in self._outputs_by_split.values() + ) + + @property + def num_blocks(self) -> int: + """The total number of blocks in the queue.""" + with self._lock: + return self._num_blocks + + def __len__(self): + with self._lock: + return len(self._queue) + + def has_next(self, output_split_idx: Optional[int] = None) -> bool: + """Whether next RefBundle is available. + + Args: + output_split_idx: If specified, only check ref bundles with the + given output split. + """ + if output_split_idx is None: + with self._lock: + return len(self._queue) > 0 + else: + with self._lock: + return self._num_per_split[output_split_idx] > 0 + + def append(self, ref: RefBundle): + """Append a RefBundle to the queue.""" + with self._lock: + self._queue.add(ref) + self._num_blocks += len(ref.blocks) + if ref.output_split_idx is not None: + self._num_per_split[ref.output_split_idx] += 1 + + def pop(self, output_split_idx: Optional[int] = None) -> Optional[RefBundle]: + """Pop a RefBundle from the queue. + Args: + output_split_idx: If specified, only pop a RefBundle + with the given output split. 
+ Returns: + A RefBundle if available, otherwise None. + """ + ret = None + if output_split_idx is None: + try: + with self._lock: + ret = self._queue.pop() + except IndexError: + pass + else: + with self._lock: + split_queue = self._outputs_by_split[output_split_idx] + if len(split_queue) == 0: + # Move all ref bundles to their indexed queues + # Note, the reason why we do indexing here instead of in the append + # is because only the last `OpBufferQueue` in the DAG, which will call + # pop with output_split_idx, needs indexing. + # If we also index the `OpBufferQueue`s in the middle, we cannot + # preserve the order of ref bundles with different output splits. + with self._lock: + while len(self._queue) > 0: + ref = self._queue.pop() + self._outputs_by_split[ref.output_split_idx].add(ref) + try: + ret = split_queue.pop() + except IndexError: + pass + if ret is None: + return None + with self._lock: + self._num_blocks -= len(ret.blocks) + if ret.output_split_idx is not None: + self._num_per_split[ret.output_split_idx] -= 1 + return ret + + def clear(self): + with self._lock: + self._queue.clear() + self._num_blocks = 0 + self._num_per_split.clear() + + +@dataclass +class OpSchedulingStatus: + """The scheduling status of an operator. + + This will be updated each time when StreamingExecutor makes + a scheduling decision, i.e., in each `select_operator_to_run` + call. + """ + + # Whether the op was selected to run in the last scheduling + # decision. + selected: bool = False + # Whether the op was considered runnable in the last scheduling + # decision. + runnable: bool = False + # Whether the resources were sufficient for the operator to run + # in the last scheduling decision. + under_resource_limits: bool = False + + +class OpState: + """The execution state tracked for each PhysicalOperator. + + This tracks state to manage input and output buffering for StreamingExecutor and + progress bars, which is separate from execution state internal to the operators. 
+ + Note: we use the `deque` data structure here because it is thread-safe, enabling + operator queues to be shared across threads. + """ + + def __init__(self, op: PhysicalOperator, inqueues: List[OpBufferQueue]): + # Each inqueue is connected to another operator's outqueue. + assert len(inqueues) == len(op.input_dependencies), (op, inqueues) + self.inqueues: List[OpBufferQueue] = inqueues + # The outqueue is connected to another operator's inqueue (they physically + # share the same Python list reference). + # + # Note: this queue is also accessed concurrently from the consumer thread. + # (in addition to the streaming executor thread). Hence, it must be a + # thread-safe type such as `deque`. + self.outqueue: OpBufferQueue = OpBufferQueue() + self.op = op + self.progress_bar = None + self.num_completed_tasks = 0 + self.inputs_done_called = False + # Tracks whether `input_done` is called for each input op. + self.input_done_called = [False] * len(op.input_dependencies) + # Used for StreamingExecutor to signal exception or end of execution + self._finished: bool = False + self._exception: Optional[Exception] = None + self._scheduling_status = OpSchedulingStatus() + + def __repr__(self): + return f"OpState({self.op.name})" + + def initialize_progress_bars(self, index: int, verbose_progress: bool) -> int: + """Create progress bars at the given index (line offset in console). + + For AllToAllOperator, zero or more sub progress bar would be created. + Return the number of enabled progress bars created for this operator. + """ + is_all_to_all = isinstance(self.op, AllToAllOperator) + # Only show 1:1 ops when in verbose progress mode. 
+ ctx = DataContext.get_current() + progress_bar_enabled = ( + ctx.enable_progress_bars + and ctx.enable_operator_progress_bars + and (is_all_to_all or verbose_progress) + ) + self.progress_bar = ProgressBar( + "- " + self.op.name, + self.op.num_output_rows_total(), + unit="row", + position=index, + enabled=progress_bar_enabled, + ) + num_progress_bars = 1 + if is_all_to_all: + # Initialize must be called for sub progress bars, even the + # bars are not enabled via the DataContext. + num_progress_bars += self.op.initialize_sub_progress_bars(index + 1) + return num_progress_bars if progress_bar_enabled else 0 + + def close_progress_bars(self): + """Close all progress bars for this operator.""" + if self.progress_bar: + self.progress_bar.close() + if isinstance(self.op, AllToAllOperator): + self.op.close_sub_progress_bars() + + def num_queued(self) -> int: + """Return the number of queued bundles across all inqueues.""" + return sum(len(q) for q in self.inqueues) + + def num_processing(self): + """Return the number of bundles currently in processing for this operator.""" + return self.op.num_active_tasks() + self.op.internal_queue_size() + + def add_output(self, ref: RefBundle) -> None: + """Move a bundle produced by the operator to its outqueue.""" + self.outqueue.append(ref) + self.num_completed_tasks += 1 + if self.progress_bar: + assert ( + ref.num_rows() is not None + ), "RefBundle must have a valid number of rows" + self.progress_bar.update(ref.num_rows(), self.op.num_output_rows_total()) + + def refresh_progress_bar(self, resource_manager: ResourceManager) -> None: + """Update the console with the latest operator progress.""" + if self.progress_bar: + self.progress_bar.set_description(self.summary_str(resource_manager)) + self.progress_bar.refresh() + + def summary_str(self, resource_manager: ResourceManager) -> str: + # Active tasks + active = self.op.num_active_tasks() + desc = f"- {self.op.name}: Tasks: {active}" + if ( + 
self.op._in_task_submission_backpressure + or self.op._in_task_output_backpressure + ): + desc += " [backpressured]" + + # Actors info + desc += self.op.actor_info_progress_str() + + # Queued blocks + queued = self.num_queued() + self.op.internal_queue_size() + desc += f"; Queued blocks: {queued}" + desc += f"; Resources: {resource_manager.get_op_usage_str(self.op)}" + + # Any additional operator specific information. + suffix = self.op.progress_str() + if suffix: + desc += f"; {suffix}" + + return desc + + def dispatch_next_task(self) -> None: + """Move a bundle from the operator inqueue to the operator itself.""" + for i, inqueue in enumerate(self.inqueues): + ref = inqueue.pop() + if ref is not None: + self.op.add_input(ref, input_index=i) + return + assert False, "Nothing to dispatch" + + def get_output_blocking(self, output_split_idx: Optional[int]) -> RefBundle: + """Get an item from this node's output queue, blocking as needed. + + Returns: + The RefBundle from the output queue, or an error / end of stream indicator. + + Raises: + StopIteration: If all outputs are already consumed. + Exception: If there was an exception raised during execution. + """ + while True: + # Check if StreamingExecutor has caught an exception or is done execution. + if self._exception is not None: + raise self._exception + elif self._finished and not self.outqueue.has_next(output_split_idx): + raise StopIteration() + ref = self.outqueue.pop(output_split_idx) + if ref is not None: + return ref + time.sleep(0.01) + + def inqueue_memory_usage(self) -> int: + """Return the object store memory of this operator's inqueue.""" + total = 0 + for op, inq in zip(self.op.input_dependencies, self.inqueues): + # Exclude existing input data items from dynamic memory usage. 
+ if not isinstance(op, InputDataBuffer): + total += inq.memory_usage + return total + + def outqueue_memory_usage(self) -> int: + """Return the object store memory of this operator's outqueue.""" + return self.outqueue.memory_usage + + def outqueue_num_blocks(self) -> int: + """Return the number of blocks in this operator's outqueue.""" + return self.outqueue.num_blocks + + def mark_finished(self, exception: Optional[Exception] = None): + """Marks this operator as finished. Used for exiting get_output_blocking.""" + if exception is None: + self._finished = True + else: + self._exception = exception + + +def build_streaming_topology( + dag: PhysicalOperator, options: ExecutionOptions +) -> Tuple[Topology, int]: + """Instantiate the streaming operator state topology for the given DAG. + + This involves creating the operator state for each operator in the DAG, + registering it with this class, and wiring up the inqueues/outqueues of + dependent operator states. + + Args: + dag: The operator DAG to instantiate. + options: The execution options to use to start operators. + + Returns: + The topology dict holding the streaming execution state. + The number of progress bars initialized so far. + """ + + topology: Topology = {} + + # DFS walk to wire up operator states. + def setup_state(op: PhysicalOperator) -> OpState: + if op in topology: + raise ValueError("An operator can only be present in a topology once.") + + # Wire up the input outqueues to this op's inqueues. + inqueues = [] + for i, parent in enumerate(op.input_dependencies): + parent_state = setup_state(parent) + inqueues.append(parent_state.outqueue) + + # Create state. + op_state = OpState(op, inqueues) + topology[op] = op_state + op.start(options) + return op_state + + setup_state(dag) + + # Create the progress bars starting from the first operator to run. + # Note that the topology dict is in topological sort order. Index zero is reserved + # for global progress information. 
+ i = 1 + for op_state in list(topology.values()): + if not isinstance(op_state.op, InputDataBuffer): + i += op_state.initialize_progress_bars(i, options.verbose_progress) + + return (topology, i) + + +def process_completed_tasks( + topology: Topology, + resource_manager: ResourceManager, + max_errored_blocks: int, +) -> int: + """Process any newly completed tasks. To update operator + states, call `update_operator_states()` afterwards. + + Args: + topology: The toplogy of operators. + backpressure_policies: The backpressure policies to use. + max_errored_blocks: Max number of errored blocks to allow, + unlimited if negative. + Returns: + The number of errored blocks. + """ + + # All active tasks, keyed by their waitables. + active_tasks: Dict[Waitable, Tuple[OpState, OpTask]] = {} + for op, state in topology.items(): + for task in op.get_active_tasks(): + active_tasks[task.get_waitable()] = (state, task) + + max_bytes_to_read_per_op: Dict[OpState, int] = {} + if resource_manager.op_resource_allocator_enabled(): + for op, state in topology.items(): + max_bytes_to_read = ( + resource_manager.op_resource_allocator.max_task_output_bytes_to_read(op) + ) + op._in_task_output_backpressure = max_bytes_to_read == 0 + if max_bytes_to_read is not None: + max_bytes_to_read_per_op[state] = max_bytes_to_read + + # Process completed Ray tasks and notify operators. + num_errored_blocks = 0 + if active_tasks: + ready, _ = ray.wait( + list(active_tasks.keys()), + num_returns=len(active_tasks), + fetch_local=False, + timeout=0.1, + ) + + # Organize tasks by the operator they belong to, and sort them by task index. + # So that we'll process them in a deterministic order. + # This is because OpResourceAllocator may limit the number of blocks to read + # per operator. In this case, we want to have fewer tasks finish quickly and + # yield resources, instead of having all tasks output blocks together. 
+ ready_tasks_by_op = defaultdict(list) + for ref in ready: + state, task = active_tasks[ref] + ready_tasks_by_op[state].append(task) + + for state, ready_tasks in ready_tasks_by_op.items(): + ready_tasks = sorted(ready_tasks, key=lambda t: t.task_index()) + for task in ready_tasks: + if isinstance(task, DataOpTask): + try: + bytes_read = task.on_data_ready( + max_bytes_to_read_per_op.get(state, None) + ) + if state in max_bytes_to_read_per_op: + max_bytes_to_read_per_op[state] -= bytes_read + except Exception as e: + num_errored_blocks += 1 + should_ignore = ( + max_errored_blocks < 0 + or max_errored_blocks >= num_errored_blocks + ) + error_message = ( + "An exception was raised from a task of " + f'operator "{state.op.name}".' + ) + if should_ignore: + remaining = ( + max_errored_blocks - num_errored_blocks + if max_errored_blocks >= 0 + else "unlimited" + ) + error_message += ( + " Ignoring this exception with remaining" + f" max_errored_blocks={remaining}." + ) + logger.error(error_message, exc_info=e) + else: + error_message += ( + " Dataset execution will now abort." + " To ignore this exception and continue, set" + " DataContext.max_errored_blocks." + ) + logger.error(error_message) + raise e from None + else: + assert isinstance(task, MetadataOpTask) + task.on_task_finished() + + # Pull any operator outputs into the streaming op state. + for op, op_state in topology.items(): + while op.has_next(): + op_state.add_output(op.get_next()) + + return num_errored_blocks + + +def update_operator_states(topology: Topology) -> None: + """Update operator states accordingly for newly completed tasks. + Should be called after `process_completed_tasks()`.""" + + # Call inputs_done() on ops where no more inputs are coming. 
+ for op, op_state in topology.items(): + if op_state.inputs_done_called: + continue + all_inputs_done = True + for idx, dep in enumerate(op.input_dependencies): + if dep.completed() and not topology[dep].outqueue: + if not op_state.input_done_called[idx]: + op.input_done(idx) + op_state.input_done_called[idx] = True + else: + all_inputs_done = False + + if all_inputs_done: + op.all_inputs_done() + op_state.inputs_done_called = True + + # Traverse the topology in reverse topological order. + # For each op, if all of its downstream operators have completed. + # call mark_execution_completed() to also complete this op. + for op, op_state in reversed(list(topology.items())): + if op.completed(): + continue + dependents_completed = len(op.output_dependencies) > 0 and all( + dep.completed() for dep in op.output_dependencies + ) + if dependents_completed: + op.mark_execution_completed() + + +def select_operator_to_run( + topology: Topology, + resource_manager: ResourceManager, + backpressure_policies: List[BackpressurePolicy], + autoscaler: Autoscaler, + ensure_at_least_one_running: bool, +) -> Optional[PhysicalOperator]: + """Select an operator to run, if possible. + + The objective of this function is to maximize the throughput of the overall + pipeline, subject to defined memory and parallelism limits. + + This is currently implemented by applying backpressure on operators that are + producing outputs faster than they are consuming them `len(outqueue)`, as well as + operators with a large number of running tasks `num_processing()`. + + Note that memory limits also apply to the outqueue of the output operator. This + provides backpressure if the consumer is slow. However, once a bundle is returned + to the user, it is no longer tracked. + """ + # Filter to ops that are eligible for execution. 
+ ops = [] + for op, state in topology.items(): + if resource_manager.op_resource_allocator_enabled(): + under_resource_limits = ( + resource_manager.op_resource_allocator.can_submit_new_task(op) + ) + else: + under_resource_limits = _execution_allowed(op, resource_manager) + in_backpressure = not under_resource_limits or any( + not p.can_add_input(op) for p in backpressure_policies + ) + op_runnable = False + if ( + not in_backpressure + and not op.completed() + and state.num_queued() > 0 + and op.should_add_input() + ): + ops.append(op) + op_runnable = True + # Update scheduling status + state._scheduling_status = OpSchedulingStatus( + selected=False, + runnable=op_runnable, + under_resource_limits=under_resource_limits, + ) + + # Signal whether op in backpressure for stats collections + op.notify_in_task_submission_backpressure(in_backpressure) + + # To ensure liveness, allow at least 1 op to run regardless of limits. This is + # gated on `ensure_at_least_one_running`, which is set if the consumer is blocked. + if ( + ensure_at_least_one_running + and not ops + and all(op.num_active_tasks() == 0 for op in topology) + ): + # The topology is entirely idle, so choose from all ready ops ignoring limits. + ops = [ + op + for op, state in topology.items() + if state.num_queued() > 0 and not op.completed() + ] + + selected_op = None + if ops: + # Run metadata-only operators first. After that, choose the operator with the + # least memory usage. + selected_op = min( + ops, + key=lambda op: ( + not op.throttling_disabled(), + resource_manager.get_op_usage(op).object_store_memory, + ), + ) + topology[selected_op]._scheduling_status.selected = True + autoscaler.try_trigger_scaling() + return selected_op + + +def _execution_allowed(op: PhysicalOperator, resource_manager: ResourceManager) -> bool: + """Return whether an operator is allowed to execute given resource usage. + + Operators are throttled globally based on CPU and GPU limits for the stream. 
+ + For an N operator DAG, we only throttle the kth operator (in the source-to-sink + ordering) on object store utilization if the cumulative object store utilization + for the kth operator and every operator downstream from it is greater than + k/N * global_limit; i.e., the N - k operator sub-DAG is using more object store + memory than it's share. + + Args: + op: The operator to check. + resource_manager: The ResourceManager of the current dataset. + + Returns: + Whether the op is allowed to run. + """ + if op.throttling_disabled(): + return True + + global_usage = resource_manager.get_global_usage() + global_limits = resource_manager.get_global_limits() + + # To avoid starvation problems when dealing with fractional resource types, + # convert all quantities to integer (0 or 1) for deciding admissibility. This + # allows operators with non-integral requests to slightly overshoot the limit. + global_floored = ExecutionResources( + cpu=math.floor(global_usage.cpu or 0), + gpu=math.floor(global_usage.gpu or 0), + object_store_memory=global_usage.object_store_memory, + ) + inc = op.incremental_resource_usage() + if inc.cpu and inc.gpu: + raise NotImplementedError( + "Operator incremental resource usage cannot specify both CPU " + "and GPU at the same time, since it may cause deadlock." + ) + + # Ignore the scale of CPU and GPU requests, i.e., treating them as either 1 or 0. + # This ensures operators don't get starved due to the shape of their resource + # requests. + inc_indicator = ExecutionResources( + cpu=1 if inc.cpu else 0, + gpu=1 if inc.gpu else 0, + object_store_memory=0, + ) + + # Under global limits; always allow. + new_usage = global_floored.add(inc_indicator) + if new_usage.satisfies_limit(global_limits): + return True + + # We're over global limits, but execution may still be allowed if memory is the + # only bottleneck and this wouldn't impact downstream memory limits. This avoids + # stalling the execution for memory bottlenecks that occur upstream. 
+ # See for more context: https://github.com/ray-project/ray/pull/32673 + global_limits_sans_memory = ExecutionResources.for_limits( + cpu=global_limits.cpu, gpu=global_limits.gpu + ) + global_ok_sans_memory = new_usage.satisfies_limit(global_limits_sans_memory) + downstream_memory = resource_manager.get_downstream_object_store_memory(op) + downstream_limit = global_limits.scale(resource_manager.get_downstream_fraction(op)) + downstream_memory_ok = ExecutionResources( + object_store_memory=downstream_memory + ).satisfies_limit(downstream_limit) + + return global_ok_sans_memory and downstream_memory_ok diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/util.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/util.py new file mode 100644 index 0000000000000000000000000000000000000000..d3bf3d9f1f54d0f9da6f7b55b2160e1228d43764 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/util.py @@ -0,0 +1,80 @@ +from concurrent.futures import ThreadPoolExecutor +from typing import TYPE_CHECKING, Any, List + +import ray +from ray.data.block import BlockAccessor, CallableClass + +if TYPE_CHECKING: + from ray.data._internal.execution.interfaces import RefBundle + + +def make_ref_bundles(simple_data: List[List[Any]]) -> List["RefBundle"]: + """Create ref bundles from a list of block data. + + One bundle is created for each input block. 
+ """ + import pandas as pd + + from ray.data._internal.execution.interfaces import RefBundle + + output = [] + for block in simple_data: + block = pd.DataFrame({"id": block}) + output.append( + RefBundle( + [ + ( + ray.put(block), + BlockAccessor.for_block(block).get_metadata(), + ) + ], + owns_blocks=True, + ) + ) + return output + + +memory_units = ["B", "KB", "MB", "GB", "TB", "PB"] + + +def memory_string(num_bytes: float) -> str: + """Return a human-readable memory string for the given amount of bytes.""" + k = 0 + while num_bytes >= 1024 and k < len(memory_units) - 1: + num_bytes /= 1024 + k += 1 + return f"{num_bytes:.1f}{memory_units[k]}" + + +def locality_string(locality_hits: int, locality_misses) -> str: + """Return a human-readable string for object locality stats.""" + if not locality_misses: + return "[all objects local]" + return f"[{locality_hits}/{locality_hits + locality_misses} objects local]" + + +def make_callable_class_concurrent(callable_cls: CallableClass) -> CallableClass: + """Returns a thread-safe CallableClass with the same logic as the provided + `callable_cls`. + + This function allows the usage of concurrent actors by safeguarding user logic + behind a separate thread. + + This allows batch slicing and formatting to occur concurrently, to overlap with the + user provided UDF. + """ + + class _Wrapper(callable_cls): + def __init__(self, *args, **kwargs): + self.thread_pool_executor = ThreadPoolExecutor(max_workers=1) + super().__init__(*args, **kwargs) + + def __repr__(self): + return super().__repr__() + + def __call__(self, *args, **kwargs): + # ThreadPoolExecutor will reuse the same thread for every submit call. 
+ future = self.thread_pool_executor.submit(super().__call__, *args, **kwargs) + return future.result() + + return _Wrapper diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/planner/exchange/pull_based_shuffle_task_scheduler.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/planner/exchange/pull_based_shuffle_task_scheduler.py new file mode 100644 index 0000000000000000000000000000000000000000..b2cf448f030d2165b56b14ebfc0539ddeecb778a --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/planner/exchange/pull_based_shuffle_task_scheduler.py @@ -0,0 +1,149 @@ +import logging +from typing import Any, Dict, List, Optional, Tuple + +from ray._private.ray_constants import CALLER_MEMORY_USAGE_PER_OBJECT_REF +from ray.data._internal.execution.interfaces import RefBundle, TaskContext +from ray.data._internal.planner.exchange.interfaces import ( + ExchangeTaskScheduler, + ExchangeTaskSpec, +) +from ray.data._internal.remote_fn import cached_remote_fn +from ray.data._internal.stats import StatsDict +from ray.data._internal.util import convert_bytes_to_human_readable_str + +logger = logging.getLogger(__name__) + + +class PullBasedShuffleTaskScheduler(ExchangeTaskScheduler): + """ + The pull-based map-reduce shuffle scheduler. + + Map tasks are first scheduled to generate map output blocks. After all map output + are generated, then reduce tasks are scheduled to combine map output blocks + together. + + The concept here is similar to + "MapReduce: Simplified Data Processing on Large Clusters" + (https://dl.acm.org/doi/10.1145/1327452.1327492). 
+ """ + + def execute( + self, + refs: List[RefBundle], + output_num_blocks: int, + task_ctx: TaskContext, + map_ray_remote_args: Optional[Dict[str, Any]] = None, + reduce_ray_remote_args: Optional[Dict[str, Any]] = None, + _debug_limit_execution_to_num_blocks: Optional[int] = None, + ) -> Tuple[List[RefBundle], StatsDict]: + + # TODO: eagerly delete the input and map output block references in order to + # eagerly release the blocks' memory. + input_blocks_list = [] + for ref_bundle in refs: + input_blocks_list.extend(ref_bundle.block_refs) + input_num_blocks = len(input_blocks_list) + input_owned = all(b.owns_blocks for b in refs) + + caller_memory_usage = ( + input_num_blocks * output_num_blocks * CALLER_MEMORY_USAGE_PER_OBJECT_REF + ) + self.warn_on_driver_memory_usage( + caller_memory_usage, + "Execution is estimated to use at least " + f"{convert_bytes_to_human_readable_str(caller_memory_usage)} " + "of driver memory. Ensure that the driver machine has at least " + "this much memory to ensure job completion.\n\n" + "To reduce the " + "amount of driver memory needed, enable push-based shuffle using " + "RAY_DATA_PUSH_BASED_SHUFFLE=1 " + "(https://docs.ray.io/en/latest/data/performance-tips.html" + ").", + ) + + if map_ray_remote_args is None: + map_ray_remote_args = {} + if reduce_ray_remote_args is None: + reduce_ray_remote_args = {} + if "scheduling_strategy" not in reduce_ray_remote_args: + reduce_ray_remote_args = reduce_ray_remote_args.copy() + reduce_ray_remote_args["scheduling_strategy"] = "SPREAD" + + shuffle_map = cached_remote_fn(self._exchange_spec.map) + shuffle_reduce = cached_remote_fn(self._exchange_spec.reduce) + + sub_progress_bar_dict = task_ctx.sub_progress_bar_dict + bar_name = ExchangeTaskSpec.MAP_SUB_PROGRESS_BAR_NAME + assert bar_name in sub_progress_bar_dict, sub_progress_bar_dict + map_bar = sub_progress_bar_dict[bar_name] + + if _debug_limit_execution_to_num_blocks is not None: + input_blocks_list = 
input_blocks_list[:_debug_limit_execution_to_num_blocks] + logger.debug(f"Limiting execution to {len(input_blocks_list)} map tasks") + shuffle_map_out = [ + shuffle_map.options( + **map_ray_remote_args, + num_returns=1 + output_num_blocks, + ).remote(i, block, output_num_blocks, *self._exchange_spec._map_args) + for i, block in enumerate(input_blocks_list) + ] + + # The first item returned is the BlockMetadata. + shuffle_map_metadata = [] + for i, refs in enumerate(shuffle_map_out): + shuffle_map_metadata.append(refs[-1]) + shuffle_map_out[i] = refs[:-1] + + if _debug_limit_execution_to_num_blocks is not None: + while len(shuffle_map_out) < output_num_blocks: + # Repeat the first map task's results. + shuffle_map_out.append(shuffle_map_out[0][:]) + + shuffle_map_metadata = map_bar.fetch_until_complete(shuffle_map_metadata) + + self.warn_on_high_local_memory_store_usage() + + bar_name = ExchangeTaskSpec.REDUCE_SUB_PROGRESS_BAR_NAME + assert bar_name in sub_progress_bar_dict, sub_progress_bar_dict + reduce_bar = sub_progress_bar_dict[bar_name] + + if _debug_limit_execution_to_num_blocks is not None: + output_num_blocks = _debug_limit_execution_to_num_blocks + logger.debug(f"Limiting execution to {output_num_blocks} reduce tasks") + shuffle_reduce_out = [ + shuffle_reduce.options(**reduce_ray_remote_args, num_returns=2).remote( + *self._exchange_spec._reduce_args, + *[shuffle_map_out[i][j] for i in range(input_num_blocks)], + ) + for j in range(output_num_blocks) + ] + + # Release map task outputs from the Ray object store. 
+ del shuffle_map_out + + new_blocks, new_metadata = [], [] + if shuffle_reduce_out: + new_blocks, new_metadata = zip(*shuffle_reduce_out) + new_metadata = reduce_bar.fetch_until_complete(list(new_metadata)) + + self.warn_on_high_local_memory_store_usage() + + output = [] + for block, meta in zip(new_blocks, new_metadata): + output.append( + RefBundle( + [ + ( + block, + meta, + ) + ], + owns_blocks=input_owned, + ) + ) + stats = { + "map": shuffle_map_metadata, + "reduce": new_metadata, + } + + return (output, stats) diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/planner/exchange/shuffle_task_spec.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/planner/exchange/shuffle_task_spec.py new file mode 100644 index 0000000000000000000000000000000000000000..51601ebbce04bd4501df5f369e811549e48ada28 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/planner/exchange/shuffle_task_spec.py @@ -0,0 +1,132 @@ +import logging +import math +from typing import Callable, Iterable, List, Optional, Tuple, Union + +import numpy as np + +from ray.data._internal.delegating_block_builder import DelegatingBlockBuilder +from ray.data._internal.planner.exchange.interfaces import ExchangeTaskSpec +from ray.data.block import Block, BlockAccessor, BlockExecStats, BlockMetadata +from ray.data.context import MAX_SAFE_BLOCK_SIZE_FACTOR + +logger = logging.getLogger(__name__) + + +class ShuffleTaskSpec(ExchangeTaskSpec): + """ + The implementation for shuffle tasks. + + This is used by random_shuffle() and repartition(). 
+ """ + + SPLIT_REPARTITION_SUB_PROGRESS_BAR_NAME = "Split Repartition" + + def __init__( + self, + target_shuffle_max_block_size: int, + random_shuffle: bool = False, + random_seed: Optional[int] = None, + upstream_map_fn: Optional[Callable[[Iterable[Block]], Iterable[Block]]] = None, + ): + super().__init__( + map_args=[ + target_shuffle_max_block_size, + upstream_map_fn, + random_shuffle, + random_seed, + ], + reduce_args=[random_shuffle, random_seed], + ) + + @staticmethod + def map( + idx: int, + block: Block, + output_num_blocks: int, + target_shuffle_max_block_size: int, + upstream_map_fn: Optional[Callable[[Iterable[Block]], Iterable[Block]]], + random_shuffle: bool, + random_seed: Optional[int], + ) -> List[Union[BlockMetadata, Block]]: + stats = BlockExecStats.builder() + if upstream_map_fn: + # TODO: Support dynamic block splitting in + # all-to-all ops, to avoid having to re-fuse + # upstream blocks together. + upstream_map_iter = upstream_map_fn([block]) + mapped_block = next(upstream_map_iter) + builder = BlockAccessor.for_block(mapped_block).builder() + builder.add_block(mapped_block) + for mapped_block in upstream_map_iter: + builder.add_block(mapped_block) + # Drop the upstream inputs to reduce memory usage. + del mapped_block + block = builder.build() + block = BlockAccessor.for_block(block) + if ( + block.size_bytes() + > MAX_SAFE_BLOCK_SIZE_FACTOR * target_shuffle_max_block_size + ): + logger.warning( + "Input block to map task has size " + f"{block.size_bytes() // (1024 * 1024)}MiB, which exceeds " + "DataContext.get_current().target_shuffle_max_block_size=" + f"{target_shuffle_max_block_size // (1024 * 1024)}MiB. " + "This can lead to out-of-memory errors and can happen " + "when map tasks are fused to the shuffle operation. " + "To prevent fusion, call Dataset.materialize() on the " + "dataset before shuffling." + ) + + # Randomize the distribution of records to blocks. 
+ if random_shuffle: + seed_i = random_seed + idx if random_seed is not None else None + block = block.random_shuffle(seed_i) + block = BlockAccessor.for_block(block) + + # Build a list of slices to return. It's okay to put the results in a + # list instead of yielding them as a generator because slicing the + # ArrowBlock is zero-copy. + slice_sz = max(1, math.ceil(block.num_rows() / output_num_blocks)) + slices = [] + for i in range(output_num_blocks): + slices.append(block.slice(i * slice_sz, (i + 1) * slice_sz)) + + # Randomize the distribution order of the blocks (this prevents empty + # outputs when input blocks are very small). + if random_shuffle: + random = np.random.RandomState(seed_i) + random.shuffle(slices) + + num_rows = sum(BlockAccessor.for_block(s).num_rows() for s in slices) + assert num_rows == block.num_rows(), (num_rows, block.num_rows()) + metadata = block.get_metadata(input_files=None, exec_stats=stats.build()) + return slices + [metadata] + + @staticmethod + def reduce( + random_shuffle: bool, + random_seed: Optional[int], + *mapper_outputs: List[Block], + partial_reduce: bool = False, + ) -> Tuple[Block, BlockMetadata]: + # TODO: Support fusion with other downstream operators. 
+ stats = BlockExecStats.builder() + builder = DelegatingBlockBuilder() + for block in mapper_outputs: + builder.add_block(block) + new_block = builder.build() + accessor = BlockAccessor.for_block(new_block) + if random_shuffle: + new_block = accessor.random_shuffle( + random_seed if random_seed is not None else None + ) + accessor = BlockAccessor.for_block(new_block) + new_metadata = BlockMetadata( + num_rows=accessor.num_rows(), + size_bytes=accessor.size_bytes(), + schema=accessor.schema(), + input_files=None, + exec_stats=stats.build(), + ) + return new_block, new_metadata diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/planner/exchange/sort_task_spec.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/planner/exchange/sort_task_spec.py new file mode 100644 index 0000000000000000000000000000000000000000..5f79a7885cbf254299774c92d160122625c25a7a --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/planner/exchange/sort_task_spec.py @@ -0,0 +1,230 @@ +from typing import TYPE_CHECKING, List, Optional, Tuple, TypeVar, Union + +import numpy as np + +from ray.data._internal.delegating_block_builder import DelegatingBlockBuilder +from ray.data._internal.planner.exchange.interfaces import ExchangeTaskSpec +from ray.data._internal.progress_bar import ProgressBar +from ray.data._internal.remote_fn import cached_remote_fn +from ray.data._internal.table_block import TableBlockAccessor +from ray.data._internal.util import NULL_SENTINEL +from ray.data.block import Block, BlockAccessor, BlockExecStats, BlockMetadata +from ray.types import ObjectRef + +T = TypeVar("T") + +if TYPE_CHECKING: + import pyarrow + + +class SortKey: + """SortKey class to convert between different sort args formats.""" + + def __init__( + self, + key: Optional[Union[str, List[str]]] = None, + descending: Union[bool, List[bool]] = False, + boundaries: Optional[List[T]] = None, + ): + if key is None: + key = [] + if isinstance(key, str): + key = [key] + if 
not (isinstance(key, list) and all(isinstance(k, str) for k in key)): + raise ValueError( + f"Key must be a string or a list of strings, but got {key}." + ) + if isinstance(descending, bool): + descending = [descending for _ in key] + elif isinstance(descending, list): + if len(descending) != len(key): + raise ValueError( + "Length of `descending` does not match the length of the key." + ) + self._columns = key + self._descending = descending + if boundaries: + for item in boundaries: + if not isinstance(item, (int, float)): + raise ValueError( + "The type of items in boundaries must be int or float." + ) + boundaries = list(set(boundaries)) + boundaries.sort() + self._boundaries = boundaries + + def get_columns(self) -> List[str]: + return self._columns + + def get_descending(self) -> List[bool]: + return self._descending + + def to_arrow_sort_args(self) -> List[Tuple[str, str]]: + return [ + (key, "descending" if desc else "ascending") + for key, desc in zip(self._columns, self._descending) + ] + + def to_pandas_sort_args(self) -> Tuple[List[str], List[bool]]: + return self._columns, [not desc for desc in self._descending] + + def validate_schema(self, schema: Optional[Union[type, "pyarrow.lib.Schema"]]): + """Check the key function is valid on the given schema.""" + if schema is None: + # Dataset is empty/cleared, validation not possible. + return + + if self._columns and len(schema.names) > 0: + schema_names_set = set(schema.names) + for column in self._columns: + if column not in schema_names_set: + raise ValueError( + f"You specified the column '{column}', but there's no such " + "column in the dataset. The dataset has columns: " + f"{schema_names_set}" + ) + + @property + def boundaries(self): + return self._boundaries + + +class SortTaskSpec(ExchangeTaskSpec): + """ + The implementation for distributed sort tasks. + + The algorithm is similar to [External Merge Sort] + (https://en.wikipedia.org/wiki/External_sorting). 
+ Sorting is done in 3 steps: sampling, sorting individual blocks, and + merging sorted blocks. + + Sampling (`sample_boundaries`): we get a number of sample items from each block, + sort them, and use them to compute boundaries that would partition all items into + approximately equal ranges. + + Sorting (`map`): each block is sorted locally, then partitioned into smaller + blocks according to the boundaries. Each partitioned block is passed to a merge + task. + + Merging (`reduce`): a merge task would receive a block from every worker that + consists of items in a certain range. It then merges the sorted blocks into one + sorted block and becomes part of the new, sorted block. + """ + + SORT_SAMPLE_SUB_PROGRESS_BAR_NAME = "Sort Sample" + + def __init__( + self, + boundaries: List[T], + sort_key: SortKey, + batch_format: str, + ): + super().__init__( + map_args=[boundaries, sort_key], + reduce_args=[sort_key, batch_format], + ) + + @staticmethod + def map( + idx: int, + block: Block, + output_num_blocks: int, + boundaries: List[T], + sort_key: SortKey, + ) -> List[Union[BlockMetadata, Block]]: + stats = BlockExecStats.builder() + out = BlockAccessor.for_block(block).sort_and_partition(boundaries, sort_key) + meta = BlockAccessor.for_block(block).get_metadata(exec_stats=stats.build()) + return out + [meta] + + @staticmethod + def reduce( + sort_key: SortKey, + batch_format: str, + *mapper_outputs: List[Block], + partial_reduce: bool = False, + ) -> Tuple[Block, BlockMetadata]: + normalized_blocks = TableBlockAccessor.normalize_block_types( + mapper_outputs, normalize_type=batch_format + ) + return BlockAccessor.for_block(normalized_blocks[0]).merge_sorted_blocks( + normalized_blocks, sort_key + ) + + @staticmethod + def sample_boundaries( + blocks: List[ObjectRef[Block]], + sort_key: SortKey, + num_reducers: int, + sample_bar: Optional[ProgressBar] = None, + ) -> List[T]: + """ + Return (num_reducers - 1) items in ascending order from the blocks that + partition 
the domain into ranges with approximately equally many elements. + Each boundary item is a tuple of a form (col1_value, col2_value, ...). + """ + columns = sort_key.get_columns() + n_samples = int(num_reducers * 10 / len(blocks)) + + sample_block = cached_remote_fn(_sample_block) + + sample_results = [ + sample_block.remote(block, n_samples, sort_key) for block in blocks + ] + if sample_bar is None: + sample_bar = ProgressBar( + SortTaskSpec.SORT_SAMPLE_SUB_PROGRESS_BAR_NAME, + len(blocks) * n_samples, + unit="rows", + ) + # TODO(zhilong): Update sort sample bar before finished. + samples = sample_bar.fetch_until_complete(sample_results) + del sample_results + samples: List[Block] = [s for s in samples if len(s) > 0] + # The dataset is empty + if len(samples) == 0: + return [None] * (num_reducers - 1) + + # Convert samples to a sorted list[tuple[...]] where each tuple represents a + # sample. + # TODO: Once we deprecate pandas blocks, we can avoid this conversion and + # directly sort the samples. + builder = DelegatingBlockBuilder() + for sample in samples: + builder.add_block(sample) + samples_table = builder.build() + samples_dict = BlockAccessor.for_block(samples_table).to_numpy(columns=columns) + # This zip does the transposition from list of column values to list of tuples. + samples_list = list(zip(*samples_dict.values())) + + def is_na(x): + # Check if x is None or NaN. Type casting to np.array first to avoid + # isnan failing on strings and other types. + if x is None: + return True + x = np.asarray(x) + if np.issubdtype(x.dtype, np.number): + return np.isnan(x) + return False + + # To allow multi-directional sort, we utilize Python's stable sort: we + # sort several times with different directions. We do this in reverse, so + # that the last key we sort by is the primary sort key passed by the user. + for i, desc in list(enumerate(sort_key.get_descending()))[::-1]: + # Sort the list, but Nones should be NULL_SENTINEL to ensure safe sorting. 
+ samples_list.sort( + key=lambda sample: NULL_SENTINEL if is_na(sample[i]) else sample[i], + reverse=desc, + ) + + # Each boundary corresponds to a quantile of the data. + quantile_indices = [ + int(q * (len(samples_list) - 1)) + for q in np.linspace(0, 1, num_reducers + 1) + ] + # Exclude the first and last quantiles because they're 0 and 1. + return [samples_list[i] for i in quantile_indices[1:-1]] + + +def _sample_block(block: Block, n_samples: int, sort_key: SortKey) -> Block: + return BlockAccessor.for_block(block).sample(n_samples, sort_key) diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/planner/exchange/split_repartition_task_scheduler.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/planner/exchange/split_repartition_task_scheduler.py new file mode 100644 index 0000000000000000000000000000000000000000..cc21f699667b48f64ef6d08da493b37f1b3d0583 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/planner/exchange/split_repartition_task_scheduler.py @@ -0,0 +1,138 @@ +from typing import Any, Dict, List, Optional, Tuple + +import ray +from ray.data._internal.execution.interfaces import RefBundle, TaskContext +from ray.data._internal.planner.exchange.interfaces import ExchangeTaskScheduler +from ray.data._internal.planner.exchange.shuffle_task_spec import ShuffleTaskSpec +from ray.data._internal.remote_fn import cached_remote_fn +from ray.data._internal.split import _split_at_indices +from ray.data._internal.stats import StatsDict +from ray.data.block import Block, BlockAccessor, BlockMetadata +from ray.types import ObjectRef + + +class SplitRepartitionTaskScheduler(ExchangeTaskScheduler): + """ + The split (non-shuffle) repartition scheduler. + + First, we calculate global splits needed to produce `output_num_blocks` blocks. + After the split blocks are generated accordingly, reduce tasks are scheduled + to combine split blocks together. 
+ """ + + def execute( + self, + refs: List[RefBundle], + output_num_blocks: int, + ctx: TaskContext, + map_ray_remote_args: Optional[Dict[str, Any]] = None, + reduce_ray_remote_args: Optional[Dict[str, Any]] = None, + ) -> Tuple[List[RefBundle], StatsDict]: + input_num_rows = 0 + input_owned_by_consumer = True + for ref_bundle in refs: + block_num_rows = ref_bundle.num_rows() + if block_num_rows is None: + raise ValueError( + "Cannot split partition on blocks with unknown number of rows." + ) + input_num_rows += block_num_rows + if not ref_bundle.owns_blocks: + input_owned_by_consumer = False + + # Compute the (output_num_blocks) indices needed for an equal split of the + # input blocks. When output_num_blocks=1, the total number of + # input rows is used as the end index during the split calculation, + # so that we can combine all input blocks into a single output block. + indices = [] + if output_num_blocks == 1: + indices = [input_num_rows] + else: + cur_idx = 0 + for _ in range(output_num_blocks - 1): + cur_idx += input_num_rows / output_num_blocks + indices.append(int(cur_idx)) + assert len(indices) <= output_num_blocks, (indices, output_num_blocks) + + if map_ray_remote_args is None: + map_ray_remote_args = {} + if reduce_ray_remote_args is None: + reduce_ray_remote_args = {} + if "scheduling_strategy" not in reduce_ray_remote_args: + reduce_ray_remote_args = reduce_ray_remote_args.copy() + reduce_ray_remote_args["scheduling_strategy"] = "SPREAD" + + blocks_with_metadata: List[Tuple[ObjectRef[Block], BlockMetadata]] = [] + for ref_bundle in refs: + blocks_with_metadata.extend(ref_bundle.blocks) + split_return = _split_at_indices( + blocks_with_metadata, indices, input_owned_by_consumer + ) + split_block_refs, split_metadata = [], [] + for b, m in zip(*split_return): + split_block_refs.append(b) + split_metadata.extend(m) + + sub_progress_bar_dict = ctx.sub_progress_bar_dict + bar_name = ShuffleTaskSpec.SPLIT_REPARTITION_SUB_PROGRESS_BAR_NAME + assert 
bar_name in sub_progress_bar_dict, sub_progress_bar_dict + reduce_bar = sub_progress_bar_dict[bar_name] + + reduce_task = cached_remote_fn(self._exchange_spec.reduce) + reduce_return = [ + reduce_task.options(**reduce_ray_remote_args, num_returns=2).remote( + *self._exchange_spec._reduce_args, + *split_block_refs[j], + ) + for j in range(output_num_blocks) + # Only process splits which contain blocks. + if len(split_block_refs[j]) > 0 + ] + + reduce_block_refs, reduce_metadata = zip(*reduce_return) + reduce_metadata = reduce_bar.fetch_until_complete(list(reduce_metadata)) + reduce_block_refs, reduce_metadata = list(reduce_block_refs), list( + reduce_metadata + ) + + # Handle empty blocks. + if len(reduce_block_refs) < output_num_blocks: + import pyarrow as pa + + from ray.data._internal.arrow_block import ArrowBlockBuilder + from ray.data._internal.pandas_block import ( + PandasBlockBuilder, + PandasBlockSchema, + ) + + num_empty_blocks = output_num_blocks - len(reduce_block_refs) + first_block_schema = reduce_metadata[0].schema + if first_block_schema is None: + raise ValueError( + "Cannot split partition on blocks with unknown block format." + ) + elif isinstance(first_block_schema, pa.Schema): + builder = ArrowBlockBuilder() + elif isinstance(first_block_schema, PandasBlockSchema): + builder = PandasBlockBuilder() + empty_block = builder.build() + empty_meta = BlockAccessor.for_block(empty_block).get_metadata( + exec_stats=None + ) # No stats for empty block. 
+ empty_block_refs, empty_metadata = zip( + *[(ray.put(empty_block), empty_meta) for _ in range(num_empty_blocks)] + ) + reduce_block_refs.extend(empty_block_refs) + reduce_metadata.extend(empty_metadata) + + output = [] + for block, meta in zip(reduce_block_refs, reduce_metadata): + output.append( + RefBundle([(block, meta)], owns_blocks=input_owned_by_consumer) + ) + stats = { + "split": split_metadata, + "reduce": reduce_metadata, + } + + return (output, stats) diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/planner/plan_all_to_all_op.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/planner/plan_all_to_all_op.py new file mode 100644 index 0000000000000000000000000000000000000000..7677ab6a0fc00d90aea7cae22a9ff882c093b140 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/planner/plan_all_to_all_op.py @@ -0,0 +1,94 @@ +from typing import List + +from ray.data._internal.execution.interfaces import PhysicalOperator +from ray.data._internal.execution.operators.base_physical_operator import ( + AllToAllOperator, +) +from ray.data._internal.logical.operators.all_to_all_operator import ( + AbstractAllToAll, + Aggregate, + RandomizeBlocks, + RandomShuffle, + Repartition, + Sort, +) +from ray.data._internal.planner.aggregate import generate_aggregate_fn +from ray.data._internal.planner.random_shuffle import generate_random_shuffle_fn +from ray.data._internal.planner.randomize_blocks import generate_randomize_blocks_fn +from ray.data._internal.planner.repartition import generate_repartition_fn +from ray.data._internal.planner.sort import generate_sort_fn +from ray.data.context import DataContext + + +def plan_all_to_all_op( + op: AbstractAllToAll, + physical_children: List[PhysicalOperator], + data_context: DataContext, +) -> AllToAllOperator: + """Get the corresponding physical operators DAG for AbstractAllToAll operators. + + Note this method only converts the given `op`, but not its input dependencies. 
+ See Planner.plan() for more details. + """ + assert len(physical_children) == 1 + input_physical_dag = physical_children[0] + + target_max_block_size = None + if isinstance(op, RandomizeBlocks): + fn = generate_randomize_blocks_fn(op) + # Randomize block order does not actually compute anything, so we + # want to inherit the upstream op's target max block size. + elif isinstance(op, RandomShuffle): + debug_limit_shuffle_execution_to_num_blocks = data_context.get_config( + "debug_limit_shuffle_execution_to_num_blocks", None + ) + fn = generate_random_shuffle_fn( + op._seed, + op._num_outputs, + op._ray_remote_args, + debug_limit_shuffle_execution_to_num_blocks, + ) + target_max_block_size = data_context.target_shuffle_max_block_size + elif isinstance(op, Repartition): + debug_limit_shuffle_execution_to_num_blocks = None + if op._shuffle: + target_max_block_size = data_context.target_shuffle_max_block_size + debug_limit_shuffle_execution_to_num_blocks = data_context.get_config( + "debug_limit_shuffle_execution_to_num_blocks", None + ) + fn = generate_repartition_fn( + op._num_outputs, + op._shuffle, + debug_limit_shuffle_execution_to_num_blocks, + ) + elif isinstance(op, Sort): + debug_limit_shuffle_execution_to_num_blocks = data_context.get_config( + "debug_limit_shuffle_execution_to_num_blocks", None + ) + fn = generate_sort_fn( + op._sort_key, op._batch_format, debug_limit_shuffle_execution_to_num_blocks + ) + target_max_block_size = data_context.target_shuffle_max_block_size + elif isinstance(op, Aggregate): + debug_limit_shuffle_execution_to_num_blocks = data_context.get_config( + "debug_limit_shuffle_execution_to_num_blocks", None + ) + fn = generate_aggregate_fn( + op._key, + op._aggs, + op._batch_format, + debug_limit_shuffle_execution_to_num_blocks, + ) + target_max_block_size = data_context.target_shuffle_max_block_size + else: + raise ValueError(f"Found unknown logical operator during planning: {op}") + + return AllToAllOperator( + fn, + 
input_physical_dag, + data_context, + target_max_block_size=target_max_block_size, + num_outputs=op._num_outputs, + sub_progress_bar_names=op._sub_progress_bar_names, + name=op.name, + ) diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/planner/plan_from_arrow_op.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/planner/plan_from_arrow_op.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/planner/plan_from_numpy_op.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/planner/plan_from_numpy_op.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/planner/plan_udf_map_op.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/planner/plan_udf_map_op.py new file mode 100644 index 0000000000000000000000000000000000000000..feea97b52ab99f604aaa8de34dad1fb667d12f99 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/planner/plan_udf_map_op.py @@ -0,0 +1,648 @@ +import asyncio +import collections +import inspect +import queue +from threading import Thread +from types import GeneratorType +from typing import Any, Callable, Iterable, Iterator, List, Optional + +import numpy as np +import pandas as pd +import pyarrow as pa + +import ray +from ray._private.utils import get_or_create_event_loop +from ray.data._internal.compute import get_compute +from ray.data._internal.execution.interfaces import PhysicalOperator +from ray.data._internal.execution.interfaces.task_context import TaskContext +from ray.data._internal.execution.operators.map_operator import MapOperator +from ray.data._internal.execution.operators.map_transformer import ( + BatchMapTransformFn, + BlockMapTransformFn, + BlocksToBatchesMapTransformFn, + BlocksToRowsMapTransformFn, + 
BuildOutputBlocksMapTransformFn, + MapTransformCallable, + MapTransformer, + Row, + RowMapTransformFn, +) +from ray.data._internal.execution.util import make_callable_class_concurrent +from ray.data._internal.logical.operators.map_operator import ( + AbstractUDFMap, + Filter, + FlatMap, + MapBatches, + MapRows, + Project, +) +from ray.data._internal.numpy_support import is_valid_udf_return +from ray.data._internal.util import _truncated_repr +from ray.data.block import ( + Block, + BlockAccessor, + BlockType, + CallableClass, + DataBatch, + UserDefinedFunction, +) +from ray.data.context import DataContext +from ray.data.exceptions import UserCodeException +from ray.util.rpdb import _is_ray_debugger_post_mortem_enabled + + +class _MapActorContext: + def __init__( + self, + udf_map_cls: UserDefinedFunction, + udf_map_fn: Callable[[Any], Any], + is_async: bool, + ): + self.udf_map_cls = udf_map_cls + self.udf_map_fn = udf_map_fn + self.is_async = is_async + self.udf_map_asyncio_loop = None + self.udf_map_asyncio_thread = None + + if is_async: + self._init_async() + + def _init_async(self): + # Only used for callable class with async generator `__call__` method. + loop = get_or_create_event_loop() + + def run_loop(): + asyncio.set_event_loop(loop) + loop.run_forever() + + thread = Thread(target=run_loop) + thread.start() + self.udf_map_asyncio_loop = loop + self.udf_map_asyncio_thread = thread + + +def plan_project_op( + op: Project, + physical_children: List[PhysicalOperator], + data_context: DataContext, +) -> MapOperator: + assert len(physical_children) == 1 + input_physical_dag = physical_children[0] + + columns = op.cols + columns_rename = op.cols_rename + + def fn(block: Block) -> Block: + try: + if BlockAccessor.for_block(block).block_type() == BlockType.PANDAS: + # TODO (srinathk) PandasBlockAccessor combine method needs to handle + # None types correctly. Until then, convert to Arrow Table. 
+ block = BlockAccessor.for_block(block).to_arrow() + if not BlockAccessor.for_block(block).num_rows(): + return block + if columns: + block = BlockAccessor.for_block(block).select(columns) + if columns_rename: + block = block.rename_columns( + [columns_rename.get(col, col) for col in block.schema.names] + ) + return block + except Exception as e: + _handle_debugger_exception(e) + + compute = get_compute(op._compute) + transform_fn = _generate_transform_fn_for_map_block(fn) + map_transformer = _create_map_transformer_for_block_based_map_op( + transform_fn, + ) + + return MapOperator.create( + map_transformer, + input_physical_dag, + data_context, + name=op.name, + compute_strategy=compute, + ray_remote_args=op._ray_remote_args, + ray_remote_args_fn=op._ray_remote_args_fn, + ) + + +def plan_filter_op( + op: Filter, + physical_children: List[PhysicalOperator], + data_context: DataContext, +) -> MapOperator: + assert len(physical_children) == 1 + input_physical_dag = physical_children[0] + + expression = op._filter_expr + compute = get_compute(op._compute) + if expression is not None: + + def filter_batch_fn(block: "pa.Table") -> "pa.Table": + try: + return block.filter(expression) + except Exception as e: + _handle_debugger_exception(e) + + transform_fn = _generate_transform_fn_for_map_batches(filter_batch_fn) + map_transformer = _create_map_transformer_for_map_batches_op( + transform_fn, + batch_size=None, + batch_format="pyarrow", + zero_copy_batch=True, + ) + else: + filter_fn, init_fn = _parse_op_fn(op) + transform_fn = _generate_transform_fn_for_filter(filter_fn) + map_transformer = _create_map_transformer_for_row_based_map_op( + transform_fn, init_fn + ) + + return MapOperator.create( + map_transformer, + input_physical_dag, + data_context, + name=op.name, + compute_strategy=compute, + ray_remote_args=op._ray_remote_args, + ray_remote_args_fn=op._ray_remote_args_fn, + ) + + +def plan_udf_map_op( + op: AbstractUDFMap, + physical_children: List[PhysicalOperator], 
+ data_context: DataContext, +) -> MapOperator: + """Get the corresponding physical operators DAG for AbstractUDFMap operators. + + Note this method only converts the given `op`, but not its input dependencies. + See Planner.plan() for more details. + """ + assert len(physical_children) == 1 + input_physical_dag = physical_children[0] + + compute = get_compute(op._compute) + fn, init_fn = _parse_op_fn(op) + + if isinstance(op, MapBatches): + transform_fn = _generate_transform_fn_for_map_batches(fn) + map_transformer = _create_map_transformer_for_map_batches_op( + transform_fn, + op._batch_size, + op._batch_format, + op._zero_copy_batch, + init_fn, + ) + else: + if isinstance(op, MapRows): + transform_fn = _generate_transform_fn_for_map_rows(fn) + elif isinstance(op, FlatMap): + transform_fn = _generate_transform_fn_for_flat_map(fn) + else: + raise ValueError(f"Found unknown logical operator during planning: {op}") + + map_transformer = _create_map_transformer_for_row_based_map_op( + transform_fn, init_fn + ) + + return MapOperator.create( + map_transformer, + input_physical_dag, + data_context, + name=op.name, + target_max_block_size=None, + compute_strategy=compute, + min_rows_per_bundle=op._min_rows_per_bundled_input, + ray_remote_args_fn=op._ray_remote_args_fn, + ray_remote_args=op._ray_remote_args, + ) + + +def _parse_op_fn(op: AbstractUDFMap): + # Note, it's important to define these standalone variables. + # So the parsed functions won't need to caputure the entire operator, which may not + # be serializable. 
+ op_fn = op._fn + fn_args = op._fn_args or () + fn_kwargs = op._fn_kwargs or {} + + if isinstance(op._fn, CallableClass): + fn_constructor_args = op._fn_constructor_args or () + fn_constructor_kwargs = op._fn_constructor_kwargs or {} + + is_async_gen = inspect.isasyncgenfunction(op._fn.__call__) + + # TODO(scottjlee): (1) support non-generator async functions + # (2) make the map actor async + if not is_async_gen: + op_fn = make_callable_class_concurrent(op_fn) + + def init_fn(): + if ray.data._map_actor_context is None: + ray.data._map_actor_context = _MapActorContext( + udf_map_cls=op_fn, + udf_map_fn=op_fn( + *fn_constructor_args, + **fn_constructor_kwargs, + ), + is_async=is_async_gen, + ) + + if is_async_gen: + + async def fn(item: Any) -> Any: + assert ray.data._map_actor_context is not None + assert ray.data._map_actor_context.is_async + + try: + return ray.data._map_actor_context.udf_map_fn( + item, + *fn_args, + **fn_kwargs, + ) + except Exception as e: + _handle_debugger_exception(e) + + else: + + def fn(item: Any) -> Any: + assert ray.data._map_actor_context is not None + assert not ray.data._map_actor_context.is_async + try: + return ray.data._map_actor_context.udf_map_fn( + item, + *fn_args, + **fn_kwargs, + ) + except Exception as e: + _handle_debugger_exception(e) + + else: + + def fn(item: Any) -> Any: + try: + return op_fn(item, *fn_args, **fn_kwargs) + except Exception as e: + _handle_debugger_exception(e) + + def init_fn(): + pass + + return fn, init_fn + + +def _handle_debugger_exception(e: Exception): + """If the Ray Debugger is enabled, keep the full stack trace unmodified + so that the debugger can stop at the initial unhandled exception. 
+ Otherwise, clear the stack trace to omit noisy internal code path.""" + ctx = ray.data.DataContext.get_current() + if _is_ray_debugger_post_mortem_enabled() or ctx.raise_original_map_exception: + raise e + else: + raise UserCodeException() from e + + +# Following are util functions for converting UDFs to `MapTransformCallable`s. + + +def _validate_batch_output(batch: Block) -> None: + if not isinstance( + batch, + ( + list, + pa.Table, + np.ndarray, + collections.abc.Mapping, + pd.core.frame.DataFrame, + dict, + ), + ): + raise ValueError( + "The `fn` you passed to `map_batches` returned a value of type " + f"{type(batch)}. This isn't allowed -- `map_batches` expects " + "`fn` to return a `pandas.DataFrame`, `pyarrow.Table`, " + "`numpy.ndarray`, `list`, or `dict[str, numpy.ndarray]`." + ) + + if isinstance(batch, list): + raise ValueError( + f"Error validating {_truncated_repr(batch)}: " + "Returning a list of objects from `map_batches` is not " + "allowed in Ray 2.5. To return Python objects, " + "wrap them in a named dict field, e.g., " + "return `{'results': objects}` instead of just `objects`." + ) + + if isinstance(batch, collections.abc.Mapping): + for key, value in list(batch.items()): + if not is_valid_udf_return(value): + raise ValueError( + f"Error validating {_truncated_repr(batch)}: " + "The `fn` you passed to `map_batches` returned a " + f"`dict`. `map_batches` expects all `dict` values " + f"to be `list` or `np.ndarray` type, but the value " + f"corresponding to key {key!r} is of type " + f"{type(value)}. To fix this issue, convert " + f"the {type(value)} to a `np.ndarray`." + ) + + +def _generate_transform_fn_for_map_batches( + fn: UserDefinedFunction, +) -> MapTransformCallable[DataBatch, DataBatch]: + if inspect.iscoroutinefunction(fn): + # UDF is a callable class with async generator `__call__` method. 
+ transform_fn = _generate_transform_fn_for_async_map_batches(fn) + + else: + + def transform_fn( + batches: Iterable[DataBatch], _: TaskContext + ) -> Iterable[DataBatch]: + for batch in batches: + try: + if ( + not isinstance(batch, collections.abc.Mapping) + and BlockAccessor.for_block(batch).num_rows() == 0 + ): + # For empty input blocks, we directly ouptut them without + # calling the UDF. + # TODO(hchen): This workaround is because some all-to-all + # operators output empty blocks with no schema. + res = [batch] + else: + res = fn(batch) + if not isinstance(res, GeneratorType): + res = [res] + except ValueError as e: + read_only_msgs = [ + "assignment destination is read-only", + "buffer source array is read-only", + ] + err_msg = str(e) + if any(msg in err_msg for msg in read_only_msgs): + raise ValueError( + f"Batch mapper function {fn.__name__} tried to mutate a " + "zero-copy read-only batch. To be able to mutate the " + "batch, pass zero_copy_batch=False to map_batches(); " + "this will create a writable copy of the batch before " + "giving it to fn. To elide this copy, modify your mapper " + "function so it doesn't try to mutate its input." + ) from e + else: + raise e from None + else: + for out_batch in res: + _validate_batch_output(out_batch) + yield out_batch + + return transform_fn + + +def _generate_transform_fn_for_async_map_batches( + fn: UserDefinedFunction, +) -> MapTransformCallable[DataBatch, DataBatch]: + def transform_fn( + input_iterable: Iterable[DataBatch], _: TaskContext + ) -> Iterable[DataBatch]: + # Use a queue to store outputs from async generator calls. + # We will put output batches into this queue from async + # generators, and in the main event loop, yield them from + # the queue as they become available. + output_batch_queue = queue.Queue() + # Sentinel object to signal the end of the async generator. 
+ sentinel = object() + + async def process_batch(batch: DataBatch): + try: + output_batch_iterator = await fn(batch) + # As soon as results become available from the async generator, + # put them into the result queue so they can be yielded. + async for output_batch in output_batch_iterator: + output_batch_queue.put(output_batch) + except Exception as e: + output_batch_queue.put( + e + ) # Put the exception into the queue to signal an error + + async def process_all_batches(): + try: + loop = ray.data._map_actor_context.udf_map_asyncio_loop + tasks = [loop.create_task(process_batch(x)) for x in input_iterable] + + ctx = ray.data.DataContext.get_current() + if ctx.execution_options.preserve_order: + for task in tasks: + await task() + else: + for task in asyncio.as_completed(tasks): + await task + finally: + output_batch_queue.put(sentinel) + + # Use the existing event loop to create and run Tasks to process each batch + loop = ray.data._map_actor_context.udf_map_asyncio_loop + asyncio.run_coroutine_threadsafe(process_all_batches(), loop) + + # Yield results as they become available. + while True: + # Here, `out_batch` is a one-row output batch + # from the async generator, corresponding to a + # single row from the input batch. + out_batch = output_batch_queue.get() + if out_batch is sentinel: + # Break out of the loop when the sentinel is received. + break + if isinstance(out_batch, Exception): + raise out_batch + _validate_batch_output(out_batch) + yield out_batch + + return transform_fn + + +def _validate_row_output(item): + if not isinstance(item, collections.abc.Mapping): + raise ValueError( + f"Error validating {_truncated_repr(item)}: " + "Standalone Python objects are not " + "allowed in Ray 2.5. To return Python objects from map(), " + "wrap them in a dict, e.g., " + "return `{'item': item}` instead of just `item`." 
+ ) + + +def _generate_transform_fn_for_map_rows( + fn: UserDefinedFunction, +) -> MapTransformCallable[Row, Row]: + def transform_fn(rows: Iterable[Row], _: TaskContext) -> Iterable[Row]: + for row in rows: + out_row = fn(row) + _validate_row_output(out_row) + yield out_row + + return transform_fn + + +def _generate_transform_fn_for_flat_map( + fn: UserDefinedFunction, +) -> MapTransformCallable[Row, Row]: + def transform_fn(rows: Iterable[Row], _: TaskContext) -> Iterable[Row]: + for row in rows: + for out_row in fn(row): + _validate_row_output(out_row) + yield out_row + + return transform_fn + + +def _generate_transform_fn_for_filter( + fn: UserDefinedFunction, +) -> MapTransformCallable[Row, Row]: + def transform_fn(rows: Iterable[Row], _: TaskContext) -> Iterable[Row]: + for row in rows: + if fn(row): + yield row + + return transform_fn + + +def _generate_transform_fn_for_map_block( + fn: UserDefinedFunction, +) -> MapTransformCallable[Block, Block]: + def transform_fn(blocks: Iterable[Block], _: TaskContext) -> Iterable[Block]: + for block in blocks: + out_block = fn(block) + yield out_block + + return transform_fn + + +# Following are util functions for creating `MapTransformer`s. + + +def _create_map_transformer_for_map_batches_op( + batch_fn: MapTransformCallable[DataBatch, DataBatch], + batch_size: Optional[int] = None, + batch_format: str = "default", + zero_copy_batch: bool = False, + init_fn: Optional[Callable[[], None]] = None, +) -> MapTransformer: + """Create a MapTransformer for a map_batches operator.""" + transform_fns = [ + # Convert input blocks to batches. + BlocksToBatchesMapTransformFn( + batch_size=batch_size, + batch_format=batch_format, + zero_copy_batch=zero_copy_batch, + ), + # Apply the UDF. + BatchMapTransformFn(batch_fn, is_udf=True), + # Convert output batches to blocks. 
+ BuildOutputBlocksMapTransformFn.for_batches(), + ] + return MapTransformer(transform_fns, init_fn) + + +def _create_map_transformer_for_row_based_map_op( + row_fn: MapTransformCallable[Row, Row], + init_fn: Optional[Callable[[], None]] = None, +) -> MapTransformer: + """Create a MapTransformer for a row-based map operator + (e.g. map, flat_map, filter).""" + transform_fns = [ + # Convert input blocks to rows. + BlocksToRowsMapTransformFn.instance(), + # Apply the UDF. + RowMapTransformFn(row_fn, is_udf=True), + # Convert output rows to blocks. + BuildOutputBlocksMapTransformFn.for_rows(), + ] + return MapTransformer(transform_fns, init_fn=init_fn) + + +def _create_map_transformer_for_block_based_map_op( + block_fn: MapTransformCallable[Block, Block], + init_fn: Optional[Callable[[], None]] = None, +) -> MapTransformer: + """Create a MapTransformer for a block-based map operator.""" + transform_fns = [ + # Apply the UDF. + BlockMapTransformFn(block_fn), + BuildOutputBlocksMapTransformFn.for_blocks(), + ] + return MapTransformer(transform_fns, init_fn=init_fn) + + +# Following are util functions for the legacy code path. + + +def generate_map_rows_fn( + target_max_block_size: int, +) -> Callable[[Iterator[Block], TaskContext, UserDefinedFunction], Iterator[Block]]: + """Generate function to apply the UDF to each record of blocks.""" + + def fn( + blocks: Iterator[Block], + ctx: TaskContext, + row_fn: UserDefinedFunction, + ) -> Iterator[Block]: + transform_fn = _generate_transform_fn_for_map_rows(row_fn) + map_transformer = _create_map_transformer_for_row_based_map_op(transform_fn) + map_transformer.set_target_max_block_size(target_max_block_size) + yield from map_transformer.apply_transform(blocks, ctx) + + return fn + + +def generate_flat_map_fn( + target_max_block_size: int, +) -> Callable[[Iterator[Block], TaskContext, UserDefinedFunction], Iterator[Block]]: + """Generate function to apply the UDF to each record of blocks, + and then flatten results. 
+ """ + + def fn( + blocks: Iterator[Block], + ctx: TaskContext, + row_fn: UserDefinedFunction, + ) -> Iterator[Block]: + transform_fn = _generate_transform_fn_for_flat_map(row_fn) + map_transformer = _create_map_transformer_for_row_based_map_op(transform_fn) + map_transformer.set_target_max_block_size(target_max_block_size) + yield from map_transformer.apply_transform(blocks, ctx) + + return fn + + +def generate_map_batches_fn( + target_max_block_size: int, + batch_size: Optional[int] = None, + batch_format: str = "default", + zero_copy_batch: bool = False, +) -> Callable[[Iterator[Block], TaskContext, UserDefinedFunction], Iterator[Block]]: + """Generate function to apply the batch UDF to blocks.""" + + def fn( + blocks: Iterable[Block], + ctx: TaskContext, + batch_fn: UserDefinedFunction, + *fn_args, + **fn_kwargs, + ) -> Iterator[Block]: + def _batch_fn(batch): + return batch_fn(batch, *fn_args, **fn_kwargs) + + transform_fn = _generate_transform_fn_for_map_batches(_batch_fn) + map_transformer = _create_map_transformer_for_map_batches_op( + transform_fn, + batch_size, + batch_format, + zero_copy_batch, + ) + map_transformer.set_target_max_block_size(target_max_block_size) + yield from map_transformer.apply_transform(blocks, ctx) + + return fn diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/planner/plan_write_op.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/planner/plan_write_op.py new file mode 100644 index 0000000000000000000000000000000000000000..8efeda39973f23f1f46a0d47d3f3e706fcbf9020 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/planner/plan_write_op.py @@ -0,0 +1,110 @@ +import itertools +from typing import Callable, Iterator, List, Union + +from pandas import DataFrame + +from ray.data._internal.compute import TaskPoolStrategy +from ray.data._internal.execution.interfaces import PhysicalOperator +from ray.data._internal.execution.interfaces.task_context import TaskContext +from 
ray.data._internal.execution.operators.map_operator import MapOperator +from ray.data._internal.execution.operators.map_transformer import ( + BlockMapTransformFn, + MapTransformer, +) +from ray.data._internal.logical.operators.write_operator import Write +from ray.data.block import Block, BlockAccessor +from ray.data.context import DataContext +from ray.data.datasource.datasink import Datasink, WriteResult +from ray.data.datasource.datasource import Datasource + + +def gen_datasink_write_result( + write_result_blocks: List[Block], +) -> WriteResult: + assert all( + isinstance(block, DataFrame) and len(block) == 1 + for block in write_result_blocks + ) + total_num_rows = sum(result["num_rows"].sum() for result in write_result_blocks) + total_size_bytes = sum(result["size_bytes"].sum() for result in write_result_blocks) + + write_returns = [result["write_return"][0] for result in write_result_blocks] + return WriteResult(total_num_rows, total_size_bytes, write_returns) + + +def generate_write_fn( + datasink_or_legacy_datasource: Union[Datasink, Datasource], **write_args +) -> Callable[[Iterator[Block], TaskContext], Iterator[Block]]: + def fn(blocks: Iterator[Block], ctx: TaskContext) -> Iterator[Block]: + """Writes the blocks to the given datasink or legacy datasource. + + Outputs the original blocks to be written.""" + # Create a copy of the iterator, so we can return the original blocks. + it1, it2 = itertools.tee(blocks, 2) + if isinstance(datasink_or_legacy_datasource, Datasink): + ctx.kwargs["_datasink_write_return"] = datasink_or_legacy_datasource.write( + it1, ctx + ) + else: + datasink_or_legacy_datasource.write(it1, ctx, **write_args) + + return it2 + + return fn + + +def generate_collect_write_stats_fn() -> ( + Callable[[Iterator[Block], TaskContext], Iterator[Block]] +): + # If the write op succeeds, the resulting Dataset is a list of + # one Block which contain stats/metrics about the write. + # Otherwise, an error will be raised. 
The Datasource can handle + # execution outcomes with `on_write_complete()`` and `on_write_failed()``. + def fn(blocks: Iterator[Block], ctx: TaskContext) -> Iterator[Block]: + """Handles stats collection for block writes.""" + block_accessors = [BlockAccessor.for_block(block) for block in blocks] + total_num_rows = sum(ba.num_rows() for ba in block_accessors) + total_size_bytes = sum(ba.size_bytes() for ba in block_accessors) + + # NOTE: Write tasks can return anything, so we need to wrap it in a valid block + # type. + import pandas as pd + + block = pd.DataFrame( + { + "num_rows": [total_num_rows], + "size_bytes": [total_size_bytes], + "write_return": [ctx.kwargs.get("_datasink_write_return", None)], + } + ) + return iter([block]) + + return fn + + +def plan_write_op( + op: Write, + physical_children: List[PhysicalOperator], + data_context: DataContext, +) -> PhysicalOperator: + assert len(physical_children) == 1 + input_physical_dag = physical_children[0] + + write_fn = generate_write_fn(op._datasink_or_legacy_datasource, **op._write_args) + collect_stats_fn = generate_collect_write_stats_fn() + # Create a MapTransformer for a write operator + transform_fns = [ + BlockMapTransformFn(write_fn), + BlockMapTransformFn(collect_stats_fn), + ] + map_transformer = MapTransformer(transform_fns) + return MapOperator.create( + map_transformer, + input_physical_dag, + data_context, + name="Write", + target_max_block_size=None, + ray_remote_args=op._ray_remote_args, + min_rows_per_bundle=op._min_rows_per_bundled_input, + compute_strategy=TaskPoolStrategy(op._concurrency), + )